import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
cross_val_score, cross_validate, KFold, StratifiedKFold,
train_test_split, learning_curve
)
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score, mean_squared_error
)
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from scipy import stats
import time
# Reproducibility and shared plotting defaults for the whole notebook.
np.random.seed(42)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
print("✅ Setup complete")
Part 1: Cross-Validation
Why Cross-Validation?
Problem with single train/test split:
Results depend on random split
May get lucky or unlucky split
Unreliable performance estimate
Solution: K-Fold Cross-Validation
Split data into K folds
Train on K-1 folds, test on 1
Repeat K times
Average results
Types of Cross-Validation
K-Fold: Standard, K=5 or K=10
Stratified K-Fold: Maintains class distribution
Leave-One-Out (LOO): K = n (expensive!)
Time Series Split: Respects temporal order
# Generate a synthetic binary-classification dataset for all Part 1 demos.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=42
)

# Single train/test split — the baseline whose variance motivates CV.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)
single_score = model.score(X_test, y_test)

print("Single Train/Test Split:")
print("=" * 60)
print(f"Accuracy: {single_score:.4f}")
print("⚠️ This is just ONE data split!")
# K-Fold Cross-Validation: five accuracy estimates instead of one.
cv_scores = cross_val_score(
    LogisticRegression(random_state=42),
    X, y,
    cv=5,  # 5-fold
    scoring='accuracy'
)

print("\n5-Fold Cross-Validation:")
print("=" * 60)
print(f"Fold scores: {cv_scores}")
print(f"Mean: {cv_scores.mean():.4f}")
print(f"Std: {cv_scores.std():.4f}")
# NOTE(review): a normal-approximation interval from the raw fold std is a
# rough heuristic — with only 5 folds a t-based interval would be sounder.
print(f"95% CI: [{cv_scores.mean() - 1.96*cv_scores.std():.4f}, "
      f"{cv_scores.mean() + 1.96*cv_scores.std():.4f}]")
print("✅ Much more reliable estimate!")
Stratified K-Fold for Imbalanced Data
Standard K-Fold cross-validation splits data randomly, which can produce folds where the minority class is severely underrepresented or even absent. In a dataset with a 90/10 class split, a random fold might end up with 95/5, making the validation score unreliable. StratifiedKFold preserves the original class proportions in every fold, ensuring each validation set is a representative microcosm of the full dataset. This is essential for imbalanced classification problems — without stratification, variance between folds inflates artificially, and your cross-validation estimate becomes unreliable.
# Create imbalanced dataset (roughly 90/10 class split).
X_imb, y_imb = make_classification(
    n_samples=1000,
    n_features=20,
    weights=[0.9, 0.1],  # 90-10 split
    random_state=42
)

print("Class Distribution:")
print(f"Class 0: {(y_imb == 0).sum()} ({(y_imb == 0).sum()/len(y_imb)*100:.1f}%)")
print(f"Class 1: {(y_imb == 1).sum()} ({(y_imb == 1).sum()/len(y_imb)*100:.1f}%)")

# Regular K-Fold: folds may not preserve the 90/10 class ratio.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
regular_scores = cross_val_score(
    LogisticRegression(random_state=42), X_imb, y_imb, cv=kf
)

# Stratified K-Fold: every fold keeps the original class proportions.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stratified_scores = cross_val_score(
    LogisticRegression(random_state=42), X_imb, y_imb, cv=skf
)

print("\nComparison:")
print("=" * 60)
print(f"Regular K-Fold: {regular_scores.mean():.4f} ± {regular_scores.std():.4f}")
print(f"Stratified K-Fold: {stratified_scores.mean():.4f} ± {stratified_scores.std():.4f}")
print("\n💡 Stratified K-Fold maintains class balance in each fold!")
Part 2: Comparing Multiple Models
Model Comparison Framework
Comparing models requires more than just looking at accuracy on a single test set. A rigorous comparison uses cross-validated scores across multiple metrics (accuracy, precision, recall, F1, ROC-AUC), accounts for training and inference time (critical for production deployment), and reports confidence intervals to distinguish genuine performance differences from random variation. The cross_validate function from scikit-learn computes multiple metrics simultaneously across K folds, providing a comprehensive picture of each model's strengths and weaknesses.
# Define models to compare
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100),
    'SVM': SVC(random_state=42, probability=True)
}

# Evaluate each model: cross-validated metrics plus wall-clock timings.
results = []
for name, model in models.items():
    print(f"Evaluating {name}...")

    # Cross-validation with multiple metrics on the full dataset.
    cv_results = cross_validate(
        model, X, y,
        cv=5,
        scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
        return_train_score=True
    )

    # Training time: one fit on the held-out split from Part 1.
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time

    # Prediction time on the test split.
    start_time = time.time()
    _ = model.predict(X_test)
    pred_time = time.time() - start_time

    results.append({
        'Model': name,
        'Accuracy': cv_results['test_accuracy'].mean(),
        'Precision': cv_results['test_precision'].mean(),
        'Recall': cv_results['test_recall'].mean(),
        'F1': cv_results['test_f1'].mean(),
        'ROC-AUC': cv_results['test_roc_auc'].mean(),
        'Train Time (s)': train_time,
        'Pred Time (s)': pred_time,
        # Fold-to-fold spread of accuracy, used for error bars later.
        'Std Dev': cv_results['test_accuracy'].std()
    })

# Create comparison DataFrame
df_results = pd.DataFrame(results)
print("\nModel Comparison Results:")
print("=" * 100)
print(df_results.round(4).to_string(index=False))
# Visualize comparison
# Four-panel dashboard built from df_results (created above):
# metrics, train time, predict time, and accuracy with CV error bars.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# Plot 1: Performance metrics
metrics = ['Accuracy', 'Precision', 'Recall', 'F1', 'ROC-AUC']
df_results.set_index('Model')[metrics].plot(kind='bar', ax=axes[0, 0])
axes[0, 0].set_title('Performance Metrics', fontsize=14, fontweight='bold')
axes[0, 0].set_ylabel('Score', fontsize=12)
# Legend placed outside the axes so the grouped bars stay readable.
axes[0, 0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
axes[0, 0].set_xticklabels(axes[0, 0].get_xticklabels(), rotation=45, ha='right')
axes[0, 0].grid(alpha=0.3, axis='y')
# Plot 2: Training time
df_results.plot(x='Model', y='Train Time (s)', kind='bar', ax=axes[0, 1], legend=False)
axes[0, 1].set_title('Training Time', fontsize=14, fontweight='bold')
axes[0, 1].set_ylabel('Seconds', fontsize=12)
axes[0, 1].set_xlabel('')
axes[0, 1].set_xticklabels(axes[0, 1].get_xticklabels(), rotation=45, ha='right')
axes[0, 1].grid(alpha=0.3, axis='y')
# Plot 3: Prediction time
df_results.plot(x='Model', y='Pred Time (s)', kind='bar', ax=axes[1, 0],
legend=False, color='orange')
axes[1, 0].set_title('Prediction Time', fontsize=14, fontweight='bold')
axes[1, 0].set_ylabel('Seconds', fontsize=12)
axes[1, 0].set_xticklabels(axes[1, 0].get_xticklabels(), rotation=45, ha='right')
axes[1, 0].grid(alpha=0.3, axis='y')
# Plot 4: Accuracy with error bars
# yerr uses the cross-validation fold std stored in 'Std Dev'.
axes[1, 1].bar(df_results['Model'], df_results['Accuracy'],
yerr=df_results['Std Dev'], capsize=5, color='green', alpha=0.7)
axes[1, 1].set_title('Accuracy with Standard Deviation', fontsize=14, fontweight='bold')
axes[1, 1].set_ylabel('Accuracy', fontsize=12)
axes[1, 1].set_xticklabels(df_results['Model'], rotation=45, ha='right')
axes[1, 1].grid(alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
Part 3: Statistical Significance Testing
Why Statistical Testing?
Question: Is Model A really better than Model B, or just got lucky?
Solution: Statistical hypothesis testing
Paired T-Test
Use: Compare two models on same data
H0 (Null Hypothesis): Models perform equally
Ha (Alternative): One model performs better
# Compare two models with statistical test
def compare_models_statistically(model1, model2, X, y, cv=5):
    """Compare two estimators with a paired t-test on K-fold CV accuracy.

    Args:
        model1, model2: unfitted scikit-learn style estimators.
        X, y: features and labels forwarded to cross_val_score.
        cv: fold count (default 5). Pairing is fold-wise, so both models
            are scored with the same cv argument.

    Returns:
        (scores1, scores2, p_value): per-fold accuracy arrays for each
        model and the two-sided p-value of the paired t-test.
    """
    # Get cross-validation scores for both models
    scores1 = cross_val_score(model1, X, y, cv=cv, scoring='accuracy')
    scores2 = cross_val_score(model2, X, y, cv=cv, scoring='accuracy')

    # Paired test: fold i of model 1 is matched with fold i of model 2.
    t_stat, p_value = stats.ttest_rel(scores1, scores2)

    print(f"Model 1 scores: {scores1}")
    print(f"Model 1 mean: {scores1.mean():.4f} ± {scores1.std():.4f}")
    print(f"\nModel 2 scores: {scores2}")
    print(f"Model 2 mean: {scores2.mean():.4f} ± {scores2.std():.4f}")
    print(f"\nPaired t-test:")
    print(f" t-statistic: {t_stat:.4f}")
    print(f" p-value: {p_value:.4f}")

    alpha = 0.05
    if p_value < alpha:
        better = "Model 1" if scores1.mean() > scores2.mean() else "Model 2"
        print(f"\n✅ {better} is SIGNIFICANTLY better (p < {alpha})")
    else:
        print(f"\n❌ No significant difference (p >= {alpha})")
    return scores1, scores2, p_value
# Compare Logistic Regression vs Random Forest
# cv=10 gives the paired t-test more paired observations than the default 5.
print("Comparing Logistic Regression vs Random Forest:")
print("=" * 70)
scores_lr, scores_rf, p_val = compare_models_statistically(
LogisticRegression(random_state=42, max_iter=1000),
RandomForestClassifier(random_state=42, n_estimators=100),
X, y, cv=10
)
McNemar's Test for Classification
McNemar's test is a non-parametric test specifically designed for comparing two classifiers on the same test set. Unlike the paired t-test (which compares scores across folds), McNemar's focuses on the disagreements between two models: cases where Model A is correct but Model B is wrong, and vice versa. If the number of these disagreements is roughly balanced, the models perform equivalently. The test statistic follows a chi-squared distribution with 1 degree of freedom, and a significant result (p < 0.05) indicates the models make genuinely different types of errors.
from statsmodels.stats.contingency_tables import mcnemar

# Train two models on the same split so their test-set errors are comparable.
model1 = LogisticRegression(random_state=42, max_iter=1000)
model2 = RandomForestClassifier(random_state=42, n_estimators=100)
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)

pred1 = model1.predict(X_test)
pred2 = model2.predict(X_test)

# Build the 2x2 table in the layout mcnemar() expects: the DISCORDANT
# counts (one model right, the other wrong) must sit on the off-diagonal
# (table[0,1] and table[1,0]), because those are the only cells the test
# statistic uses. The previous layout placed the concordant counts there,
# so the statistic was computed from the wrong cells.
contingency_table = pd.DataFrame(
    [[np.sum((pred1 == y_test) & (pred2 == y_test)),   # both right
      np.sum((pred1 == y_test) & (pred2 != y_test))],  # model1 right, model2 wrong
     [np.sum((pred1 != y_test) & (pred2 == y_test)),   # model1 wrong, model2 right
      np.sum((pred1 != y_test) & (pred2 != y_test))]], # both wrong
    columns=['Model2 Right', 'Model2 Wrong'],
    index=['Model1 Right', 'Model1 Wrong']
)

print("McNemar's Test:")
print("=" * 60)
print("\nContingency Table:")
print(contingency_table)

# Chi-squared approximation with continuity correction.
result = mcnemar(contingency_table.values, exact=False, correction=True)
print(f"\nStatistic: {result.statistic:.4f}")
print(f"P-value: {result.pvalue:.4f}")

if result.pvalue < 0.05:
    print("\n✅ Models perform SIGNIFICANTLY differently")
else:
    print("\n❌ No significant difference in model performance")
Part 4: A/B Testing
What is A/B Testing?
Production setting: Test new model (B) against current model (A)
Process:
Route traffic: 50% to model A, 50% to model B
Collect metrics
Statistical comparison
Make deployment decision
Key Metrics for A/B Tests
Primary Metric: Core business goal (accuracy, CTR, revenue)
Secondary Metrics: User experience, latency, cost
Guardrail Metrics: Safety checks (error rate, bias)
# Simulate A/B test
def simulate_ab_test(model_a, model_b, X_test, y_test, metric='accuracy'):
    """Simulate an A/B test between two fitted models.

    Each test example is randomly routed to control (A) or treatment (B),
    and the two group accuracies are compared with a pooled two-proportion
    z-test.

    Args:
        model_a: control model; must expose .predict.
        model_b: treatment model; must expose .predict.
        X_test: feature array (only its length and rows are used).
        y_test: label numpy array aligned with X_test.
        metric: reserved for future use — accuracy is always reported.

    Returns:
        (score_a, score_b, p_value): group accuracies and the two-tailed
        p-value of the z-test.
    """
    # Deterministic 50/50 traffic split.
    np.random.seed(42)
    assignment = np.random.choice(['A', 'B'], size=len(X_test), p=[0.5, 0.5])

    pred_a = model_a.predict(X_test)
    pred_b = model_b.predict(X_test)

    mask_a = (assignment == 'A')
    mask_b = (assignment == 'B')
    n_a = mask_a.sum()
    n_b = mask_b.sum()

    # Count correct predictions once and derive the group accuracies from
    # the counts (the original recomputed accuracy with a separate call).
    correct_a = (pred_a[mask_a] == y_test[mask_a]).sum()
    correct_b = (pred_b[mask_b] == y_test[mask_b]).sum()
    score_a = correct_a / n_a
    score_b = correct_b / n_b

    # Two-proportion z-test with pooled variance.
    p_pool = (correct_a + correct_b) / (n_a + n_b)
    se = np.sqrt(p_pool * (1 - p_pool) * (1/n_a + 1/n_b))
    z_stat = (score_b - score_a) / se if se > 0 else 0
    # Two-tailed p-value from the standard normal.
    p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))

    print("A/B Test Results:")
    print("=" * 70)
    print(f"Model A (Control):")
    print(f" Sample size: {n_a}")
    print(f" Accuracy: {score_a:.4f}")
    print(f"\nModel B (Treatment):")
    print(f" Sample size: {n_b}")
    print(f" Accuracy: {score_b:.4f}")
    print(f"\nDifference: {score_b - score_a:.4f} ({(score_b - score_a)/score_a * 100:+.2f}%)")
    print(f"\nStatistical Test:")
    print(f" Z-statistic: {z_stat:.4f}")
    print(f" P-value: {p_value:.4f}")

    if p_value < 0.05:
        if score_b > score_a:
            print("\n🎉 Model B is SIGNIFICANTLY better! Deploy it!")
        else:
            print("\n⚠️ Model B is SIGNIFICANTLY worse! Keep Model A!")
    else:
        print("\n❌ No significant difference. Need more data or keep Model A.")
    return score_a, score_b, p_value
# Run A/B test
# model1/model2 were fitted on X_train in the McNemar's section above.
score_a, score_b, p_val = simulate_ab_test(
model1, # Current model (Logistic Regression)
model2, # New model (Random Forest)
X_test, y_test
)
Sample Size Calculation
Question: How much data do I need for my A/B test?
Running an A/B test with too few samples risks a Type II error — failing to detect a real improvement. The required sample size depends on three factors: the baseline metric (current model performance), the minimum detectable effect (MDE — the smallest improvement worth deploying for), and the desired statistical power (typically 80%, meaning an 80% chance of detecting a true effect). Smaller effects require exponentially more data to detect reliably, which is why teams must decide upfront what improvement threshold justifies the engineering cost of deploying a new model.
def calculate_sample_size(baseline_rate, mde, alpha=0.05, power=0.8):
    """Required per-group sample size for a two-proportion A/B test.

    Args:
        baseline_rate: Current metric value (e.g., 0.80 for 80% accuracy)
        mde: Minimum Detectable Effect (e.g., 0.02 for 2% improvement)
        alpha: Significance level (default: 0.05)
        power: Statistical power (default: 0.8)

    Returns:
        int: samples needed per group, rounded up.
    """
    # Critical values for a two-sided test at `alpha` and the chosen power.
    z_a = stats.norm.ppf(1 - alpha/2)
    z_b = stats.norm.ppf(power)

    # Pooled variance is driven by the midpoint of baseline and target rates.
    rate_mid = (baseline_rate + (baseline_rate + mde)) / 2

    per_group = (2 * rate_mid * (1 - rate_mid) * (z_a + z_b)**2) / (mde**2)
    return int(np.ceil(per_group))
# Calculate required sample sizes for a range of detectable effects.
baseline = 0.80 # Current model: 80% accuracy

print("Sample Size Requirements (per group):")
print("=" * 70)
print(f"Baseline accuracy: {baseline*100:.0f}%\n")

for mde in [0.01, 0.02, 0.05, 0.10]:
    n = calculate_sample_size(baseline, mde)
    print(f"To detect {mde*100:.0f}% improvement: {n:,} samples per group ({n*2:,} total)")

print("\n💡 Smaller improvements require more data!")
Part 5: Multi-Objective Selection
The Trade-off Dilemma
Real world: Multiple objectives matter!
Accuracy
Fairness
Speed (latency)
Cost
Interpretability
Challenge: They often conflict!
# Multi-objective comparison
def normalize_score(scores, higher_better=True):
    """Min-max scale `scores` onto the [0, 1] range.

    With higher_better=False the scale is inverted so the smallest raw
    value maps to 1. NOTE(review): a constant series gives a zero range
    and hence division by zero — callers should avoid that case.
    """
    lo = scores.min()
    hi = scores.max()
    span = hi - lo
    if higher_better:
        return (scores - lo) / span
    return (hi - scores) / span
# Add normalized (0-1) versions of each objective so they can be combined.
df_results['Norm_Accuracy'] = normalize_score(df_results['Accuracy'])
df_results['Norm_Speed'] = normalize_score(df_results['Pred Time (s)'], higher_better=False)
df_results['Norm_F1'] = normalize_score(df_results['F1'])

# Calculate weighted composite score
# Example weights: 50% accuracy, 30% speed, 20% F1
weights = {'accuracy': 0.5, 'speed': 0.3, 'f1': 0.2}
df_results['Composite_Score'] = (
    weights['accuracy'] * df_results['Norm_Accuracy'] +
    weights['speed'] * df_results['Norm_Speed'] +
    weights['f1'] * df_results['Norm_F1']
)

print("Multi-Objective Model Comparison:")
print("=" * 80)
print(f"Weights: Accuracy={weights['accuracy']}, Speed={weights['speed']}, F1={weights['f1']}\n")
print(df_results[['Model', 'Accuracy', 'Pred Time (s)', 'F1', 'Composite_Score']]
      .sort_values('Composite_Score', ascending=False)
      .round(4)
      .to_string(index=False))

best_model = df_results.loc[df_results['Composite_Score'].idxmax(), 'Model']
print(f"\n🏆 Best model (weighted): {best_model}")
# Visualize trade-offs
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Plot 1: Accuracy vs Speed scatter, one labelled point per model.
axes[0].scatter(df_results['Pred Time (s)'], df_results['Accuracy'], s=200, alpha=0.6)
for idx, row in df_results.iterrows():
    axes[0].annotate(row['Model'],
                     (row['Pred Time (s)'], row['Accuracy']),
                     fontsize=9, ha='center')
axes[0].set_xlabel('Prediction Time (seconds)', fontsize=12)
axes[0].set_ylabel('Accuracy', fontsize=12)
axes[0].set_title('Accuracy vs Speed Trade-off', fontsize=14, fontweight='bold')
axes[0].grid(alpha=0.3)

# Plot 2: Composite scores, sorted so the best model is on top.
df_sorted = df_results.sort_values('Composite_Score')
axes[1].barh(df_sorted['Model'], df_sorted['Composite_Score'], color='teal', alpha=0.7)
axes[1].set_xlabel('Composite Score', fontsize=12)
axes[1].set_title('Overall Ranking (Weighted)', fontsize=14, fontweight='bold')
axes[1].grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

print("\n💡 Different weights yield different 'best' models!")
print(" Choose weights based on your priorities.")
Part 6: Model Selection Framework
Decision Checklist
Choosing the "best" model is rarely as simple as picking the highest accuracy. Production model selection requires weighing multiple competing objectives — accuracy vs. latency, fairness vs. overall performance, interpretability vs. predictive power. The decision framework below provides a structured sequence of questions that moves from performance to statistical significance to practical significance to deployment readiness, ensuring that the final choice is defensible to stakeholders at every level.
# Create decision framework
# Each row is (question, action if yes, action if no).
_FRAMEWORK_ROWS = [
    ("Is accuracy the ONLY metric?",
     "Pick highest accuracy", "Use multi-objective approach"),
    ("Is there a performance difference?",
     "Do statistical test", "Choose simpler/faster model"),
    ("Is difference statistically significant?",
     "Consider practical significance", "Keep current model or choose simpler"),
    ("Is difference practically significant?",
     "Deploy better model", "Consider cost-benefit trade-off"),
    ("Are there fairness concerns?",
     "Evaluate bias metrics", "Proceed with deployment"),
    ("Is interpretability required?",
     "Favor simpler models", "Any model OK"),
]
decision_framework = pd.DataFrame(_FRAMEWORK_ROWS,
                                  columns=["Question", "If Yes", "If No"])

print("Model Selection Decision Framework:")
print("=" * 90)
print(decision_framework.to_string(index=False))
Final Recommendation Template
A model selection recommendation should communicate not just which model to deploy, but why — including the primary metric, the performance gap, the speed trade-off, and caveats. The function below generates a structured recommendation that balances the best-performing model against the fastest model, applying a configurable threshold to determine whether the performance difference justifies the additional computational cost. This template is designed to be shared directly with engineering and product stakeholders.
def generate_recommendation(df_results, primary_metric='Accuracy', threshold=0.01):
    """Print a model-selection recommendation balancing score vs. speed.

    Compares the best-scoring model against the fastest one: if the gap on
    `primary_metric` exceeds `threshold` the better model is recommended,
    otherwise the faster one wins.

    Args:
        df_results: DataFrame with at least 'Model', 'Pred Time (s)' and
            the `primary_metric` column.
        primary_metric: column name used to rank performance.
        threshold: minimum metric gap that justifies the slower model.
    """
    # Row of the best-performing model on the chosen metric.
    best_idx = df_results[primary_metric].idxmax()
    best_model = df_results.loc[best_idx]

    # Row of the fastest model at prediction time.
    fastest_idx = df_results['Pred Time (s)'].idxmin()
    fastest_model = df_results.loc[fastest_idx]

    # Metric gap between the two candidates.
    diff = best_model[primary_metric] - fastest_model[primary_metric]

    print("🎯 MODEL SELECTION RECOMMENDATION")
    print("=" * 80)
    print(f"\nPrimary Metric: {primary_metric}")
    print(f"Threshold for meaningful difference: {threshold*100:.1f}%\n")
    print(f"Best Performing Model: {best_model['Model']}")
    print(f" {primary_metric}: {best_model[primary_metric]:.4f}")
    print(f" Prediction Time: {best_model['Pred Time (s)']:.4f}s")
    print(f"\nFastest Model: {fastest_model['Model']}")
    print(f" {primary_metric}: {fastest_model[primary_metric]:.4f}")
    print(f" Prediction Time: {fastest_model['Pred Time (s)']:.4f}s")
    print(f"\nPerformance Difference: {diff:.4f} ({diff/fastest_model[primary_metric]*100:.2f}%)")
    print("\n" + "=" * 80)

    if diff > threshold:
        print(f"✅ RECOMMENDATION: Deploy '{best_model['Model']}'")
        print(f" Reason: Significantly better {primary_metric} ({diff*100:.2f}% improvement)")
    else:
        print(f"✅ RECOMMENDATION: Deploy '{fastest_model['Model']}'")
        print(f" Reason: Similar performance but {best_model['Pred Time (s)']/fastest_model['Pred Time (s)']:.1f}x faster")

    print("\n⚠️ Additional Considerations:")
    print(" • Monitor performance in production")
    print(" • Check for bias across user groups")
    print(" • Set up A/B test for validation")
    print(" • Plan for model retraining")
generate_recommendation(df_results, primary_metric='F1', threshold=0.02)
🎯 Knowledge Check
Q1: Why is cross-validation better than a single train/test split?
Q2: What does a p-value < 0.05 mean in model comparison?
Q3: When should you use stratified k-fold?
Q4: What's the difference between statistical and practical significance?
Click for answers
A1: More reliable estimate, reduces variance from single split, uses all data
A2: Less than 5% chance the difference is due to random chance
A3: When classes are imbalanced - maintains class distribution in each fold
A4: Statistical: mathematically significant. Practical: meaningful in real world (worth the cost/effort)
📊 Summary
Model Comparison Best Practices
Use Cross-Validation
K=5 or K=10 for most cases
Stratified for imbalanced data
Report mean ± std
Statistical Testing
Paired t-test for cross-validation
McNemar's test for single test set
Report p-values
Consider Multiple Objectives
Accuracy
Speed/Latency
Fairness
Interpretability
Cost
Production Validation
A/B testing
Monitor continuously
Track business metrics
📚 Next Steps
Complete Model Comparison Challenge
Start Phase 15 Assignment
Practice with your own datasets
Set up A/B testing framework
Congratulations! You've completed Phase 15: Model Evaluation & Metrics! 🎉