import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score, recall_score, 
    f1_score, roc_curve, roc_auc_score, classification_report,
    precision_recall_curve, average_precision_score
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification

# Plotting defaults: seaborn whitegrid theme, 10x6-inch default figures
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

# Fix the NumPy RNG seed so every run produces identical random draws
np.random.seed(42)

print("βœ… Setup complete")

Part 1: Confusion Matrix Basics

What is a Confusion Matrix?

A confusion matrix shows actual vs predicted classifications:

                 Predicted
                Neg    Pos
Actual  Neg     TN     FP
        Pos     FN     TP

Where:

  • TN (True Negative): Correctly predicted negative

  • FP (False Positive): Wrongly predicted positive (Type I error)

  • FN (False Negative): Wrongly predicted negative (Type II error)

  • TP (True Positive): Correctly predicted positive

Real-World Example: Email Spam Detection

                 Predicted
                Ham    Spam
Actual  Ham     950    50     ← 50 good emails wrongly marked spam!
        Spam    20     80     ← 20 spam emails reached inbox!
# Hand-crafted example: 10 ground-truth labels and the model's predictions
y_true = np.array([0, 0, 1, 1, 0, 1, 0, 1, 1, 0])
y_pred = np.array([0, 0, 1, 0, 0, 1, 1, 1, 1, 0])

# 2x2 confusion matrix: rows = actual class, columns = predicted class
cm = confusion_matrix(y_true, y_pred)

print("Confusion Matrix:")
print(cm)

# ravel() flattens the matrix row-major, giving TN, FP, FN, TP in order
tn, fp, fn, tp = cm.ravel()
print("\nBreakdown:")
print(f"True Negatives (TN):  {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"True Positives (TP):  {tp}")
# Visualize confusion matrix
def plot_confusion_matrix(y_true, y_pred, labels=('Negative', 'Positive')):
    """Plot a labelled confusion-matrix heatmap and show it.

    Parameters
    ----------
    y_true : array-like of int
        Ground-truth class labels.
    y_pred : array-like of int
        Predicted class labels.
    labels : sequence of str, default ('Negative', 'Positive')
        Axis tick labels, in label-value order. A tuple default replaces
        the original mutable-list default (Python anti-pattern).
    """
    cm = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels,
                cbar_kws={'label': 'Count'})
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix')

    # Overlay the TN/FP/FN/TP names. These hard-coded cell positions are
    # only meaningful for a binary (2x2) matrix, so guard against larger
    # matrices where the overlay would be wrong or incomplete.
    if cm.shape == (2, 2):
        plt.text(0.5, 0.15, f'TN = {cm[0,0]}', ha='center', fontsize=12, color='white')
        plt.text(1.5, 0.15, f'FP = {cm[0,1]}', ha='center', fontsize=12, color='white')
        plt.text(0.5, 1.15, f'FN = {cm[1,0]}', ha='center', fontsize=12, color='white')
        plt.text(1.5, 1.15, f'TP = {cm[1,1]}', ha='center', fontsize=12, color='white')

    plt.tight_layout()
    plt.show()

plot_confusion_matrix(y_true, y_pred)

Part 2: Core Metrics

1. Accuracy

Formula: (TP + TN) / (TP + TN + FP + FN)

What it means: Percentage of correct predictions

When to use: Balanced datasets where all errors cost the same

When NOT to use: Imbalanced data (e.g., 99% negative, 1% positive)

2. Precision (Positive Predictive Value)

Formula: TP / (TP + FP)

What it means: Of all positive predictions, how many were correct?

Question it answers: "When model says positive, how often is it right?"

When to use: False positives are costly

  • Spam detection (don't mark good email as spam)

  • Product recommendations (don't show irrelevant items)

3. Recall (Sensitivity, True Positive Rate)

Formula: TP / (TP + FN)

What it means: Of all actual positives, how many did we catch?

Question it answers: "Are we missing positive cases?"

When to use: False negatives are costly

  • Disease diagnosis (don't miss sick patients)

  • Fraud detection (catch all fraudulent transactions)

4. F1-Score

Formula: 2 * (Precision * Recall) / (Precision + Recall)

What it means: Harmonic mean of precision and recall

When to use: Need balance between precision and recall

# Headline metrics for the toy y_true / y_pred example above
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print("Classification Metrics:")
print("=" * 40)
# Accuracy also shown as a percentage; the remaining three as bare ratios
print(f"Accuracy:  {accuracy:.3f} ({accuracy*100:.1f}%)")
for label, value in (('Precision:', precision),
                     ('Recall:   ', recall),
                     ('F1-Score: ', f1)):
    print(f"{label} {value:.3f}")

# sklearn's combined per-class precision/recall/F1/support table
print("\nDetailed Report:")
print(classification_report(y_true, y_pred, target_names=['Negative', 'Positive']))

Precision-Recall Trade-off

The Dilemma:

  • Increase precision → Decrease recall

  • Increase recall → Decrease precision

Example: Airport Security

High Precision (Low False Positives):

  • Only flag obvious threats

  • Fewer innocent people stopped

  • But: Might miss some real threats ❌

High Recall (Low False Negatives):

  • Flag anything suspicious

  • Catch all threats βœ…

  • But: Many innocent people stopped ❌

# Demonstrate precision-recall trade-off on a synthetic, mildly imbalanced set
X, y = make_classification(n_samples=1000, n_features=20, 
                          n_informative=15, n_redundant=5,
                          weights=[0.7, 0.3], random_state=42)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Probability of the positive class for each test sample
y_scores = model.predict_proba(X_test)[:, 1]

# Sweep decision thresholds and record the resulting metrics
thresholds = [0.3, 0.5, 0.7, 0.9]
results = []

for threshold in thresholds:
    # Classify as positive when the score clears the threshold
    y_pred_thresh = (y_scores >= threshold).astype(int)
    
    results.append({
        'Threshold': threshold,
        'Precision': precision_score(y_test, y_pred_thresh),
        'Recall': recall_score(y_test, y_pred_thresh),
        # f1_score (already imported above) computes the same harmonic mean
        # as the previous hand-rolled formula and handles the
        # precision + recall == 0 edge case internally.
        'F1-Score': f1_score(y_test, y_pred_thresh),
    })

results_df = pd.DataFrame(results)
print("Precision-Recall Trade-off:")
print(results_df.to_string(index=False))

print("\nπŸ’‘ Notice:")
print("Lower threshold β†’ Higher recall (catch more), Lower precision (more false alarms)")
print("Higher threshold β†’ Higher precision (fewer false alarms), Lower recall (miss some)")

Part 3: ROC Curves & AUC

ROC Curve (Receiver Operating Characteristic)

What it shows: Model performance across all classification thresholds

Axes:

  • X-axis: False Positive Rate (FPR) = FP / (FP + TN)

  • Y-axis: True Positive Rate (TPR) = TP / (TP + FN) = Recall

AUC (Area Under Curve):

  • 1.0: Perfect classifier

  • 0.9-1.0: Excellent

  • 0.8-0.9: Good

  • 0.7-0.8: Fair

  • 0.5-0.7: Poor

  • 0.5: Random guessing

  • < 0.5: Worse than random (something's wrong!)

# ROC points (one FPR/TPR pair per threshold) plus the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = roc_auc_score(y_test, y_scores)

# Plot the model's ROC curve against the random-guessing diagonal
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, 
         label=f'ROC curve (AUC = {roc_auc:.3f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
         label='Random Classifier (AUC = 0.500)')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate (Recall)', fontsize=12)
plt.title('ROC Curve', fontsize=14, fontweight='bold')
plt.legend(loc="lower right", fontsize=11)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print(f"\nπŸ“Š AUC Score: {roc_auc:.3f}")
# Walk the quality bands from best to worst; the first band the score
# clears wins, and the for/else fallback covers everything below 0.7.
for cutoff, verdict in (
    (0.9, "βœ… Excellent model performance!"),
    (0.8, "βœ… Good model performance"),
    (0.7, "⚠️ Fair model - room for improvement"),
):
    if roc_auc >= cutoff:
        print(verdict)
        break
else:
    print("❌ Poor model - needs significant improvement")

Precision-Recall Curve

When to use PR curve instead of ROC:

  • Imbalanced datasets (few positives)

  • Care more about positive class

Why? ROC can be overly optimistic on imbalanced data because the False Positive Rate denominator (FP + TN) is dominated by the large number of true negatives. A model can have a low FPR even with many false positives, making the ROC curve look deceptively good. The PR curve, by contrast, focuses entirely on the positive class predictions, where errors are most costly. In domains like fraud detection or rare disease screening, where positive cases may be fewer than 1% of all data, the PR curve gives a far more honest picture of model utility.

Average Precision (AP) summarizes the PR curve as the weighted mean of precisions at each threshold, providing a single number analogous to AUC but tailored for imbalanced settings.

# Precision/recall pairs across all thresholds, plus the single-number
# Average Precision summary of the curve
precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_test, y_scores)
avg_precision = average_precision_score(y_test, y_scores)

# Plot PR curve: x = recall, y = precision
plt.figure(figsize=(10, 8))
pr_label = f'PR curve (AP = {avg_precision:.3f})'
plt.plot(recall_curve, precision_curve, color='blue', lw=2, label=pr_label)
plt.xlabel('Recall', fontsize=12)
plt.ylabel('Precision', fontsize=12)
plt.title('Precision-Recall Curve', fontsize=14, fontweight='bold')
plt.legend(loc='best', fontsize=11)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print(f"\nπŸ“Š Average Precision: {avg_precision:.3f}")

Part 4: Handling Imbalanced Data

The Accuracy Paradox

Problem: With 99% negative, 1% positive data:

  • Model that always predicts "negative" = 99% accuracy!

  • But it's completely useless

Solution: Use other metrics!

# Create highly imbalanced dataset
X_imb, y_imb = make_classification(
    n_samples=1000,
    n_features=20,
    weights=[0.99, 0.01],  # 99% negative, 1% positive
    random_state=42
)

print("Class Distribution:")
unique, counts = np.unique(y_imb, return_counts=True)
for cls, count in zip(unique, counts):
    print(f"Class {cls}: {count} ({count/len(y_imb)*100:.1f}%)")

# Dummy model: always predict negative
y_dummy = np.zeros_like(y_imb)

print("\nπŸ€– Dummy Model (always predicts negative):")
print(f"Accuracy: {accuracy_score(y_imb, y_dummy):.3f} ← Looks great!")
# zero_division=0 on both precision and F1: with no positive predictions
# their denominators are zero, and without it f1_score emits an
# UndefinedMetricWarning (the original only silenced precision_score).
# The reported values (0.000) are unchanged.
print(f"Precision: {precision_score(y_imb, y_dummy, zero_division=0):.3f}")
print(f"Recall: {recall_score(y_imb, y_dummy):.3f} ← Actually terrible!")
print(f"F1-Score: {f1_score(y_imb, y_dummy, zero_division=0):.3f}")

print("\nπŸ’‘ Key Insight: High accuracy means nothing on imbalanced data!")

Metrics for Imbalanced Data

When the class distribution is heavily skewed, standard accuracy becomes meaningless – a model that always predicts the majority class can score 99%+ while being entirely useless. The metrics below are specifically designed to surface real performance on the minority class.

  1. F1-Score – Harmonic mean of precision and recall; drops sharply if either is low

  2. Cohen's Kappa – Measures agreement between predicted and actual labels, adjusted for chance agreement. A kappa of 0 means your model is no better than random guessing given the class distribution.

  3. Matthews Correlation Coefficient (MCC) – Uses all four quadrants of the confusion matrix (TP, TN, FP, FN) and returns a value between -1 and +1. It is considered one of the most balanced metrics for binary classification because it penalizes all types of errors proportionally.

  4. Area under PR curve – Better than ROC-AUC for imbalanced data because it avoids the optimistic bias that comes from a large number of true negatives.

from sklearn.metrics import cohen_kappa_score, matthews_corrcoef

# Stratified split keeps the rare positive class represented in both halves
X_train_imb, X_test_imb, y_train_imb, y_test_imb = train_test_split(
    X_imb, y_imb, test_size=0.3, random_state=42, stratify=y_imb
)

# class_weight='balanced' reweights samples inversely to class frequency,
# so the minority class is not drowned out during fitting
model_balanced = LogisticRegression(class_weight='balanced', random_state=42)
model_balanced.fit(X_train_imb, y_train_imb)
y_pred_balanced = model_balanced.predict(X_test_imb)

print("Balanced Model Metrics:")
print("=" * 40)
# (label, scorer) pairs; labels are padded so values line up in a column
for label, scorer in (
    ('Accuracy:  ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall:    ', recall_score),
    ('F1-Score:  ', f1_score),
    ("Cohen's Kappa: ", cohen_kappa_score),
    ('MCC: ', matthews_corrcoef),
):
    print(f"{label}{scorer(y_test_imb, y_pred_balanced):.3f}")

Part 5: Multi-Class Metrics

Averaging Strategies

For multi-class problems (>2 classes):

  1. Macro Average: Average of per-class metrics (treats all classes equally)

  2. Weighted Average: Average weighted by class frequency

  3. Micro Average: Calculate globally across all classes

# Create multi-class dataset
X_multi, y_multi = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_classes=3,
    n_clusters_per_class=1,
    random_state=42
)

X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42
)

# Train model. The explicit multi_class='multinomial' argument was removed:
# it is deprecated (FutureWarning in sklearn >= 1.5, removed later), and
# multinomial is already the behavior for the default lbfgs solver, so the
# fitted model is unchanged.
model_multi = LogisticRegression(random_state=42)
model_multi.fit(X_train_m, y_train_m)
y_pred_m = model_multi.predict(X_test_m)

# Calculate metrics with different averaging
print("Multi-Class Metrics:")
print("=" * 50)
print(f"Accuracy: {accuracy_score(y_test_m, y_pred_m):.3f}\n")

for avg in ['macro', 'weighted', 'micro']:
    prec = precision_score(y_test_m, y_pred_m, average=avg)
    rec = recall_score(y_test_m, y_pred_m, average=avg)
    # Renamed from `f1` to avoid shadowing the module-level `f1` computed
    # earlier in the notebook.
    f1_avg = f1_score(y_test_m, y_pred_m, average=avg)
    
    print(f"{avg.capitalize()} Average:")
    print(f"  Precision: {prec:.3f}")
    print(f"  Recall:    {rec:.3f}")
    print(f"  F1-Score:  {f1_avg:.3f}\n")

# Per-class breakdown
print("\nPer-Class Report:")
print(classification_report(y_test_m, y_pred_m, 
                          target_names=['Class 0', 'Class 1', 'Class 2']))

Part 6: Choosing the Right Metric

Decision Guide

Question 1: Is your data balanced?

  • βœ… Yes → Accuracy is fine

  • ❌ No → Use F1, PR-AUC, or MCC

Question 2: What's more costly?

  • False Positives (wrong alarm) → Optimize Precision

  • False Negatives (missed case) → Optimize Recall

  • Both equally → Optimize F1-Score

Question 3: Multiple classes?

  • Use weighted average for imbalanced classes

  • Use macro average if all classes equally important

Real-World Scenarios

# Metric-selection cheat sheet: one entry per use case, with the
# recommended primary metric, the rationale, and the accepted trade-off.
scenarios = [
    {
        "Use Case": "Cancer Detection",
        "Primary Metric": "Recall",
        "Why": "Missing cancer (FN) is worse than false alarm (FP)",
        "Acceptable": "Lower precision OK if recall high"
    },
    {
        "Use Case": "Spam Filter",
        "Primary Metric": "Precision",
        "Why": "Blocking real email (FP) very bad for user",
        "Acceptable": "Some spam in inbox OK"
    },
    {
        "Use Case": "Fraud Detection",
        "Primary Metric": "F1-Score",
        "Why": "Balance catching fraud vs annoying customers",
        "Acceptable": "Need both precision and recall"
    },
    {
        "Use Case": "Credit Approval",
        "Primary Metric": "Precision + Fairness",
        "Why": "False approvals costly, must be fair",
        "Acceptable": "Some valid applicants rejected"
    }
]

df_scenarios = pd.DataFrame(scenarios)
print("\nπŸ“‹ Metric Selection Guide:")
print("=" * 80)
# Iterate the source list directly; no need for DataFrame row access here.
for entry in scenarios:
    print(f"\n{entry['Use Case']}:")
    print(f"  Primary Metric: {entry['Primary Metric']}")
    print(f"  Reason: {entry['Why']}")
    print(f"  Trade-off: {entry['Acceptable']}")

🎯 Knowledge Check

Q1: When is high accuracy misleading?
Q2: For airport security, optimize precision or recall?
Q3: What does AUC = 0.5 mean?
Q4: Why use F1-score instead of accuracy?

Click for answers

A1: When data is imbalanced (e.g., 99:1 ratio)
A2: Recall! Can't miss threats, false alarms acceptable
A3: Model is no better than random guessing
A4: F1 balances precision & recall, better for imbalanced data

πŸš€ Next Steps

  1. Complete Classification Metrics Challenge

  2. Read Notebook 2: Regression Metrics

  3. Practice with imbalanced datasets

  4. Try different classification thresholds

Excellent work! You now understand how to properly evaluate classification models! πŸ“Š