Q&A 22 How do you visualize ROC curves to compare classification models?
22.1 Explanation
A Receiver Operating Characteristic (ROC) curve plots the true positive rate against the false positive rate as the classification threshold is swept from strict to lenient. The Area Under the Curve (AUC) summarizes performance in a single number: 0.5 corresponds to random guessing, and values closer to 1.0 indicate better discrimination.
Overlaying the ROC curves of several models (e.g., Random Forest, Logistic Regression, XGBoost) on the same axes shows which model discriminates best overall and whether their relative performance shifts at particular thresholds.
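To see what the threshold sweep looks like in isolation, here is a minimal sketch on made-up labels and scores (not the microbiome data): scikit-learn's roc_curve returns one (false positive rate, true positive rate) pair per candidate threshold, and auc integrates the resulting curve.

import numpy as np
from sklearn.metrics import roc_curve, auc

# Toy data: true labels and the scores a hypothetical classifier assigned to them
y_true = np.array([0, 0, 0, 0, 1, 1, 1, 1])
y_score = np.array([0.10, 0.30, 0.35, 0.80, 0.40, 0.60, 0.70, 0.90])

fpr, tpr, thresholds = roc_curve(y_true, y_score)
print(thresholds)        # one candidate threshold per point on the curve
print(fpr, tpr)          # the coordinates traced out as the threshold drops
print(auc(fpr, tpr))     # area under the curve (1.0 = perfect, 0.5 = random)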
This Q&A demonstrates ROC curve comparison in Python (scikit-learn and XGBoost) and in R (caret and pROC).
22.2 Python Code
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# Load and prepare data
otu_df = pd.read_csv("data/otu_table_filtered.tsv", sep="\t", index_col=0).T
meta_df = pd.read_csv("data/sample_metadata.tsv", sep="\t")
data = pd.merge(otu_df, meta_df, left_index=True, right_on="sample_id")
X = data[otu_df.columns]
y = data["group"].map({"Control": 0, "Treatment": 1})
# Stratified split keeps the Control/Treatment balance in both sets (mirrors createDataPartition in the R code)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# Define models
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}
# Plot ROC curves
plt.figure(figsize=(8, 6))
for name, model in models.items():
    # Fit each model, then plot its ROC curve with the AUC in the legend label
    model.fit(X_train, y_train)
    probas = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, probas)
    auc_score = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"{name} (AUC = {auc_score:.2f})")
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend()
plt.tight_layout()
plt.show()
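As a more concise alternative, recent versions of scikit-learn (1.0 or later) provide RocCurveDisplay.from_estimator, which computes and draws each curve, AUC included, in a single call. The sketch below assumes the models dictionary has already been fitted by the loop above.

from sklearn.metrics import RocCurveDisplay

fig, ax = plt.subplots(figsize=(8, 6))
for name, model in models.items():
    # Each call computes the ROC curve and AUC for one fitted model and draws it on the shared axes
    RocCurveDisplay.from_estimator(model, X_test, y_test, name=name, ax=ax)
ax.plot([0, 1], [0, 1], "k--")
ax.set_title("ROC Curve Comparison (RocCurveDisplay)")
plt.tight_layout()
plt.show()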
22.3 R Code (caret + pROC)
library(tidyverse)
library(caret)
library(pROC)
otu_df <- read.delim("data/otu_table_filtered.tsv", row.names = 1)
meta_df <- read.delim("data/sample_metadata.tsv")
# Keep samples in the metadata order, then transpose to samples x OTUs
otu_df <- otu_df[, meta_df$sample_id]
otu_df <- t(otu_df)
data <- cbind(as.data.frame(otu_df), group = as.factor(meta_df$group))
# Train/test split
set.seed(42)
trainIndex <- createDataPartition(data$group, p = .7, list = FALSE)
train <- data[trainIndex, ]
test <- data[-trainIndex, ]
# Define fixed tuning for models that require it
rf_grid <- data.frame(mtry = floor(sqrt(ncol(train) - 1)))
svm_grid <- data.frame(C = 1)
xgb_grid <- data.frame(
nrounds = 50,
max_depth = 3,
eta = 0.3,
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1
)
# Train models using fixed tuneGrid
models <- list(
rf = train(group ~ ., data = train, method = "rf", trControl = trainControl(method = "none"), tuneGrid = rf_grid),
glm = train(group ~ ., data = train, method = "glm", family = "binomial", trControl = trainControl(method = "none")),
svm = train(group ~ ., data = train, method = "svmLinear", trControl = trainControl(method = "none"), tuneGrid = svm_grid),
xgb = train(group ~ ., data = train, method = "xgbTree", trControl = trainControl(method = "none"), tuneGrid = xgb_grid)
)
# ROC analysis
roc_list <- lapply(models, function(model) {
probs <- predict(model, newdata = test, type = "prob")
# Safely extract numeric probabilities for the "Treatment" class
class_label <- "Treatment"
if (!(class_label %in% colnames(probs))) {
stop(paste("Class label", class_label, "not found in predicted probabilities"))
}
prob_values <- as.numeric(probs[, class_label])
  # Make the control/case levels and direction explicit so the AUC is never computed with the classes flipped
  roc(response = test$group, predictor = prob_values, levels = c("Control", "Treatment"), direction = "<")
})
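# Optional sanity check (assumes each element of roc_list is a pROC "roc" object):
# print the AUC of every model before plotting
sapply(roc_list, auc)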
# Plot ROC curves, one colour per model
cols <- c("blue", "green", "red", "purple")
plot(roc_list[[1]], col = cols[1], legacy.axes = TRUE, main = "ROC Curves - Microbiome Classification")
for (i in 2:length(roc_list)) {
  plot(roc_list[[i]], col = cols[i], add = TRUE)
}
legend("bottomright", legend = names(models), col = cols, lwd = 2)