import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
confusion_matrix, accuracy_score, precision_score, recall_score,
f1_score, roc_curve, roc_auc_score, classification_report,
precision_recall_curve, average_precision_score
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
# Set random seed for reproducibility
np.random.seed(42)

# Plotting style: whitegrid background and a 10x6-inch default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

# NOTE(review): the exported source had this emoji mangled and the string
# split across two lines (an unterminated literal); restored to one line.
print("✅ Setup complete")
Part 1: Confusion Matrix BasicsΒΆ
What is a Confusion Matrix?ΒΆ
A confusion matrix shows actual vs predicted classifications:
Predicted
Neg Pos
Actual Neg TN FP
Pos FN TP
Where:
TN (True Negative): Correctly predicted negative
FP (False Positive): Wrongly predicted positive (Type I error)
FN (False Negative): Wrongly predicted negative (Type II error)
TP (True Positive): Correctly predicted positive
Real-World Example: Email Spam DetectionΒΆ
Predicted
Ham Spam
Actual Ham 950 50 ← 50 good emails wrongly marked spam!
Spam 20 80 ← 20 spam emails reached inbox!
# Toy example: ten samples of ground-truth labels and model predictions
y_true = np.array([0, 0, 1, 1, 0, 1, 0, 1, 1, 0])
y_pred = np.array([0, 0, 1, 0, 0, 1, 1, 1, 1, 0])

# Rows of the matrix are actual classes, columns are predicted classes
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(cm)

# Binary case: ravel() flattens the 2x2 matrix in TN, FP, FN, TP order
tn, fp, fn, tp = cm.ravel()
print("\nBreakdown:")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"True Positives (TP): {tp}")
# Visualize confusion matrix
def plot_confusion_matrix(y_true, y_pred, labels=('Negative', 'Positive')):
    """Plot an annotated confusion-matrix heatmap.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth binary labels (0/1).
    y_pred : array-like of shape (n_samples,)
        Predicted binary labels (0/1).
    labels : sequence of str, default ('Negative', 'Positive')
        Axis tick labels, in (negative, positive) order. A tuple is used
        instead of the original list to avoid the mutable-default-argument
        pitfall; callers passing a list still work.
    """
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels,
                cbar_kws={'label': 'Count'})
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix')
    # Label each quadrant (TN/FP/FN/TP) on top of the heatmap cells
    plt.text(0.5, 0.15, f'TN = {cm[0,0]}', ha='center', fontsize=12, color='white')
    plt.text(1.5, 0.15, f'FP = {cm[0,1]}', ha='center', fontsize=12, color='white')
    plt.text(0.5, 1.15, f'FN = {cm[1,0]}', ha='center', fontsize=12, color='white')
    plt.text(1.5, 1.15, f'TP = {cm[1,1]}', ha='center', fontsize=12, color='white')
    plt.tight_layout()
    plt.show()

plot_confusion_matrix(y_true, y_pred)
Part 2: Core MetricsΒΆ
1. AccuracyΒΆ
Formula: (TP + TN) / (TP + TN + FP + FN)
What it means: Percentage of correct predictions
When to use: Balanced datasets where all errors cost the same
When NOT to use: Imbalanced data (e.g., 99% negative, 1% positive)
2. Precision (Positive Predictive Value)ΒΆ
Formula: TP / (TP + FP)
What it means: Of all positive predictions, how many were correct?
Question it answers: "When the model says positive, how often is it right?"
When to use: False positives are costly
Spam detection (don't mark good email as spam)
Product recommendations (don't show irrelevant items)
3. Recall (Sensitivity, True Positive Rate)ΒΆ
Formula: TP / (TP + FN)
What it means: Of all actual positives, how many did we catch?
Question it answers: "Are we missing positive cases?"
When to use: False negatives are costly
Disease diagnosis (don't miss sick patients)
Fraud detection (catch all fraudulent transactions)
4. F1-ScoreΒΆ
Formula: 2 * (Precision * Recall) / (Precision + Recall)
What it means: Harmonic mean of precision and recall
When to use: Need balance between precision and recall
# Compute the four headline classification metrics for the toy example
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print("Classification Metrics:")
print("=" * 40)
# Accuracy is also shown as a percentage; the rest as plain ratios
print(f"Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")
for label, value in (("Precision", precision),
                     ("Recall", recall),
                     ("F1-Score", f1)):
    print(f"{label}: {value:.3f}")

# Full per-class report from scikit-learn
print("\nDetailed Report:")
print(classification_report(y_true, y_pred, target_names=['Negative', 'Positive']))
Precision-Recall Trade-offΒΆ
The Dilemma:
Increase precision → Decrease recall
Increase recall → Decrease precision
Example: Airport Security
High Precision (Low False Positives):
Only flag obvious threats
Fewer innocent people stopped
But: Might miss some real threats ❌
High Recall (Low False Negatives):
Flag anything suspicious
Catch all threats ✅
But: Many innocent people stopped ❌
# Demonstrate the precision-recall trade-off on a synthetic 70/30 dataset
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=15, n_redundant=5,
                           weights=[0.7, 0.3], random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Probability of the positive class for each test sample
y_scores = model.predict_proba(X_test)[:, 1]

# Sweep the decision threshold and record precision/recall at each setting
thresholds = [0.3, 0.5, 0.7, 0.9]
results = []
for threshold in thresholds:
    y_pred_thresh = (y_scores >= threshold).astype(int)
    prec = precision_score(y_test, y_pred_thresh)
    rec = recall_score(y_test, y_pred_thresh)
    results.append({
        'Threshold': threshold,
        'Precision': prec,
        'Recall': rec,
        # Guard the harmonic mean against a zero denominator
        'F1-Score': 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0
    })

results_df = pd.DataFrame(results)
print("Precision-Recall Trade-off:")
print(results_df.to_string(index=False))
# The mojibake in these messages ("π‘", "β") is restored to the
# intended emoji/arrows.
print("\n💡 Notice:")
print("Lower threshold → Higher recall (catch more), Lower precision (more false alarms)")
print("Higher threshold → Higher precision (fewer false alarms), Lower recall (miss some)")
Part 3: ROC Curves & AUCΒΆ
ROC Curve (Receiver Operating Characteristic)ΒΆ
What it shows: Model performance across all classification thresholds
Axes:
X-axis: False Positive Rate (FPR) = FP / (FP + TN)
Y-axis: True Positive Rate (TPR) = TP / (TP + FN) = Recall
AUC (Area Under Curve):
1.0: Perfect classifier
0.9-1.0: Excellent
0.8-0.9: Good
0.7-0.8: Fair
0.5-0.7: Poor
0.5: Random guessing
< 0.5: Worse than random (something's wrong!)
# ROC points (FPR/TPR at every threshold) and the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = roc_auc_score(y_test, y_scores)

# Plot the ROC curve against the random-classifier diagonal
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2,
         label=f'ROC curve (AUC = {roc_auc:.3f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
         label='Random Classifier (AUC = 0.500)')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate (Recall)', fontsize=12)
plt.title('ROC Curve', fontsize=14, fontweight='bold')
plt.legend(loc="lower right", fontsize=11)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print(f"\n📊 AUC Score: {roc_auc:.3f}")
# Qualitative interpretation of the AUC value.
# NOTE(review): the export had the emoji mangled and two of these strings
# split across lines (unterminated literals); restored to single lines.
if roc_auc >= 0.9:
    print("✅ Excellent model performance!")
elif roc_auc >= 0.8:
    print("✅ Good model performance")
elif roc_auc >= 0.7:
    print("⚠️ Fair model - room for improvement")
else:
    print("❌ Poor model - needs significant improvement")
Precision-Recall CurveΒΆ
When to use PR curve instead of ROC:
Imbalanced datasets (few positives)
Care more about positive class
Why? ROC can be overly optimistic on imbalanced data because the False Positive Rate denominator (FP + TN) is dominated by the large number of true negatives. A model can have a low FPR even with many false positives, making the ROC curve look deceptively good. The PR curve, by contrast, focuses entirely on the positive class predictions, where errors are most costly. In domains like fraud detection or rare disease screening, where positive cases may be fewer than 1% of all data, the PR curve gives a far more honest picture of model utility.
Average Precision (AP) summarizes the PR curve as the weighted mean of precisions at each threshold, providing a single number analogous to AUC but tailored for imbalanced settings.
# Precision/recall pairs at every threshold, plus average precision
# (the weighted area under the PR curve)
precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_test, y_scores)
avg_precision = average_precision_score(y_test, y_scores)

# Plot PR curve
plt.figure(figsize=(10, 8))
plt.plot(recall_curve, precision_curve, color='blue', lw=2,
         label=f'PR curve (AP = {avg_precision:.3f})')
plt.xlabel('Recall', fontsize=12)
plt.ylabel('Precision', fontsize=12)
plt.title('Precision-Recall Curve', fontsize=14, fontweight='bold')
plt.legend(loc='best', fontsize=11)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Mojibake emoji in the message restored
print(f"\n📊 Average Precision: {avg_precision:.3f}")
Part 4: Handling Imbalanced DataΒΆ
The Accuracy ParadoxΒΆ
Problem: With 99% negative, 1% positive data:
A model that always predicts "negative" achieves 99% accuracy!
But it's completely useless
Solution: Use other metrics!
# Create a heavily imbalanced dataset (roughly 99% negative / 1% positive)
X_imb, y_imb = make_classification(
    n_samples=1000,
    n_features=20,
    weights=[0.99, 0.01],  # 99% negative, 1% positive
    random_state=42
)

print("Class Distribution:")
unique, counts = np.unique(y_imb, return_counts=True)
for cls, count in zip(unique, counts):
    print(f"Class {cls}: {count} ({count/len(y_imb)*100:.1f}%)")

# Dummy model: always predict the majority (negative) class
y_dummy = np.zeros_like(y_imb)

print("\n🤖 Dummy Model (always predicts negative):")
print(f"Accuracy: {accuracy_score(y_imb, y_dummy):.3f} ← Looks great!")
# The dummy never predicts positive, so precision (TP / predicted-positives)
# and therefore F1 have a zero denominator; zero_division=0 returns 0.0
# instead of emitting an UndefinedMetricWarning. (Added to f1_score too —
# the original only guarded precision.)
print(f"Precision: {precision_score(y_imb, y_dummy, zero_division=0):.3f}")
print(f"Recall: {recall_score(y_imb, y_dummy):.3f} ← Actually terrible!")
print(f"F1-Score: {f1_score(y_imb, y_dummy, zero_division=0):.3f}")
print("\n💡 Key Insight: High accuracy means nothing on imbalanced data!")
Metrics for Imbalanced DataΒΆ
When the class distribution is heavily skewed, standard accuracy becomes meaningless — a model that always predicts the majority class can score 99%+ while being entirely useless. The metrics below are specifically designed to surface real performance on the minority class.
F1-Score — Harmonic mean of precision and recall; drops sharply if either is low
Cohen's Kappa — Measures agreement between predicted and actual labels, adjusted for chance agreement. A kappa of 0 means your model is no better than random guessing given the class distribution.
Matthews Correlation Coefficient (MCC) — Uses all four quadrants of the confusion matrix (TP, TN, FP, FN) and returns a value between \(-1\) and \(+1\). It is considered one of the most balanced metrics for binary classification because it penalizes all types of errors proportionally.
Area under PR curve — Better than ROC-AUC for imbalanced data because it avoids the optimistic bias that comes from a large number of true negatives.
from sklearn.metrics import cohen_kappa_score, matthews_corrcoef

# Stratified split preserves the rare positive-class proportion in both folds
X_train_imb, X_test_imb, y_train_imb, y_test_imb = train_test_split(
    X_imb, y_imb, test_size=0.3, random_state=42, stratify=y_imb
)

# class_weight='balanced' re-weights samples inversely to class frequency,
# so the minority class is not drowned out during fitting
model_balanced = LogisticRegression(class_weight='balanced', random_state=42)
model_balanced.fit(X_train_imb, y_train_imb)
y_pred_balanced = model_balanced.predict(X_test_imb)

# Report the full metric suite, including the chance-corrected ones
print("Balanced Model Metrics:")
print("=" * 40)
metric_rows = [
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
    ("Cohen's Kappa", cohen_kappa_score),
    ("MCC", matthews_corrcoef),
]
for label, metric_fn in metric_rows:
    print(f"{label}: {metric_fn(y_test_imb, y_pred_balanced):.3f}")
Part 5: Multi-Class MetricsΒΆ
Averaging StrategiesΒΆ
For multi-class problems (>2 classes):
Macro Average: Average of per-class metrics (treats all classes equally)
Weighted Average: Average weighted by class frequency
Micro Average: Calculate globally across all classes
# Create a 3-class synthetic dataset
X_multi, y_multi = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_classes=3,
    n_clusters_per_class=1,
    random_state=42
)
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42
)

# Train model. The explicit multi_class='multinomial' argument was dropped:
# it is deprecated since scikit-learn 1.5 (FutureWarning) and multinomial is
# already the behaviour of the default lbfgs solver, so results are unchanged.
model_multi = LogisticRegression(random_state=42)
model_multi.fit(X_train_m, y_train_m)
y_pred_m = model_multi.predict(X_test_m)

# Compare the three averaging strategies for multi-class metrics
print("Multi-Class Metrics:")
print("=" * 50)
print(f"Accuracy: {accuracy_score(y_test_m, y_pred_m):.3f}\n")
for avg in ['macro', 'weighted', 'micro']:
    prec = precision_score(y_test_m, y_pred_m, average=avg)
    rec = recall_score(y_test_m, y_pred_m, average=avg)
    f1 = f1_score(y_test_m, y_pred_m, average=avg)
    print(f"{avg.capitalize()} Average:")
    print(f" Precision: {prec:.3f}")
    print(f" Recall: {rec:.3f}")
    print(f" F1-Score: {f1:.3f}\n")

# Per-class breakdown
print("\nPer-Class Report:")
print(classification_report(y_test_m, y_pred_m,
                            target_names=['Class 0', 'Class 1', 'Class 2']))
Part 6: Choosing the Right MetricΒΆ
Decision GuideΒΆ
Question 1: Is your data balanced?
✅ Yes → Accuracy is fine
❌ No → Use F1, PR-AUC, or MCC
Question 2: What's more costly?
False Positives (wrong alarm) → Optimize Precision
False Negatives (missed case) → Optimize Recall
Both equally → Optimize F1-Score
Question 3: Multiple classes?
Use weighted average for imbalanced classes
Use macro average if all classes equally important
Real-World ScenariosΒΆ
# Summary table: which metric to prioritize in common real-world applications
scenarios = [
    {
        "Use Case": "Cancer Detection",
        "Primary Metric": "Recall",
        "Why": "Missing cancer (FN) is worse than false alarm (FP)",
        "Acceptable": "Lower precision OK if recall high"
    },
    {
        "Use Case": "Spam Filter",
        "Primary Metric": "Precision",
        "Why": "Blocking real email (FP) very bad for user",
        "Acceptable": "Some spam in inbox OK"
    },
    {
        "Use Case": "Fraud Detection",
        "Primary Metric": "F1-Score",
        "Why": "Balance catching fraud vs annoying customers",
        "Acceptable": "Need both precision and recall"
    },
    {
        "Use Case": "Credit Approval",
        "Primary Metric": "Precision + Fairness",
        "Why": "False approvals costly, must be fair",
        "Acceptable": "Some valid applicants rejected"
    }
]

df_scenarios = pd.DataFrame(scenarios)
# Mojibake emoji in the heading restored
print("\n📋 Metric Selection Guide:")
print("=" * 80)
for idx, row in df_scenarios.iterrows():
    print(f"\n{row['Use Case']}:")
    print(f" Primary Metric: {row['Primary Metric']}")
    print(f" Reason: {row['Why']}")
    print(f" Trade-off: {row['Acceptable']}")
🎯 Knowledge Check
Q1: When is high accuracy misleading?
Q2: For airport security, optimize precision or recall?
Q3: What does AUC = 0.5 mean?
Q4: Why use F1-score instead of accuracy?
Click for answers
A1: When data is imbalanced (e.g., 99:1 ratio)
A2: Recall! Can't miss threats, false alarms acceptable
A3: Model is no better than random guessing
A4: F1 balances precision & recall, better for imbalanced data
📚 Next Steps
Complete Classification Metrics Challenge
Read Notebook 2: Regression Metrics
Practice with imbalanced datasets
Try different classification thresholds
Excellent work! You now understand how to properly evaluate classification models! 🎉