import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
cross_val_score, cross_validate, KFold, StratifiedKFold,
train_test_split, learning_curve
)
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score, mean_squared_error
)
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from scipy import stats
import time
# Reproducibility and shared plotting defaults for the whole notebook.
np.random.seed(42)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
print("✅ Setup complete")
Part 1: Cross-Validation
Why Cross-Validation?
Problem with single train/test split:
Results depend on random split
May get lucky or unlucky split
Unreliable performance estimate
Solution: K-Fold Cross-Validation
Split data into K folds
Train on K-1 folds, test on 1
Repeat K times
Average results
Types of Cross-Validation
K-Fold: Standard, K=5 or K=10
Stratified K-Fold: Maintains class distribution
Leave-One-Out (LOO): K = n (expensive!)
Time Series Split: Respects temporal order
# Generate a synthetic binary-classification dataset for all Part 1 demos.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=42
)

# Single train/test split — the baseline whose variance motivates CV.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)
single_score = model.score(X_test, y_test)

print("Single Train/Test Split:")
print("=" * 60)
print(f"Accuracy: {single_score:.4f}")
print("⚠️ This is just ONE data split!")
# K-Fold Cross-Validation: five accuracy estimates instead of one.
cv_scores = cross_val_score(
    LogisticRegression(random_state=42),
    X, y,
    cv=5,  # 5-fold
    scoring='accuracy'
)

print("\n5-Fold Cross-Validation:")
print("=" * 60)
print(f"Fold scores: {cv_scores}")
print(f"Mean: {cv_scores.mean():.4f}")
print(f"Std: {cv_scores.std():.4f}")
# NOTE(review): a normal-approximation interval from the raw fold std is a
# rough heuristic — with only 5 folds a t-based interval would be sounder.
print(f"95% CI: [{cv_scores.mean() - 1.96*cv_scores.std():.4f}, "
      f"{cv_scores.mean() + 1.96*cv_scores.std():.4f}]")
print("✅ Much more reliable estimate!")
Stratified K-Fold for Imbalanced Data
Standard K-Fold cross-validation splits data randomly, which can produce folds where the minority class is severely underrepresented or even absent. In a dataset with a 90/10 class split, a random fold might end up with 95/5, making the validation score unreliable. StratifiedKFold preserves the original class proportions in every fold, ensuring each validation set is a representative microcosm of the full dataset. This is essential for imbalanced classification problems — without stratification, variance between folds inflates artificially, and your cross-validation estimate becomes unreliable.
# Create imbalanced dataset (roughly 90/10 class split).
X_imb, y_imb = make_classification(
    n_samples=1000,
    n_features=20,
    weights=[0.9, 0.1],  # 90-10 split
    random_state=42
)

print("Class Distribution:")
print(f"Class 0: {(y_imb == 0).sum()} ({(y_imb == 0).sum()/len(y_imb)*100:.1f}%)")
print(f"Class 1: {(y_imb == 1).sum()} ({(y_imb == 1).sum()/len(y_imb)*100:.1f}%)")

# Regular K-Fold: folds may not preserve the 90/10 class ratio.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
regular_scores = cross_val_score(
    LogisticRegression(random_state=42), X_imb, y_imb, cv=kf
)

# Stratified K-Fold: every fold keeps the original class proportions.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stratified_scores = cross_val_score(
    LogisticRegression(random_state=42), X_imb, y_imb, cv=skf
)

print("\nComparison:")
print("=" * 60)
print(f"Regular K-Fold: {regular_scores.mean():.4f} ± {regular_scores.std():.4f}")
print(f"Stratified K-Fold: {stratified_scores.mean():.4f} ± {stratified_scores.std():.4f}")
print("\n💡 Stratified K-Fold maintains class balance in each fold!")
Part 2: Comparing Multiple Models
Model Comparison Framework
Comparing models requires more than just looking at accuracy on a single test set. A rigorous comparison uses cross-validated scores across multiple metrics (accuracy, precision, recall, F1, ROC-AUC), accounts for training and inference time (critical for production deployment), and reports confidence intervals to distinguish genuine performance differences from random variation. The cross_validate function from scikit-learn computes multiple metrics simultaneously across K folds, providing a comprehensive picture of each model's strengths and weaknesses.
# Define models to compare
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100),
    'SVM': SVC(random_state=42, probability=True)
}

# Evaluate each model: cross-validated metrics plus wall-clock timings.
results = []
for name, model in models.items():
    print(f"Evaluating {name}...")

    # Cross-validation with multiple metrics on the full dataset.
    cv_results = cross_validate(
        model, X, y,
        cv=5,
        scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
        return_train_score=True
    )

    # Training time: one fit on the held-out split from Part 1.
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time

    # Prediction time on the test split.
    start_time = time.time()
    _ = model.predict(X_test)
    pred_time = time.time() - start_time

    results.append({
        'Model': name,
        'Accuracy': cv_results['test_accuracy'].mean(),
        'Precision': cv_results['test_precision'].mean(),
        'Recall': cv_results['test_recall'].mean(),
        'F1': cv_results['test_f1'].mean(),
        'ROC-AUC': cv_results['test_roc_auc'].mean(),
        'Train Time (s)': train_time,
        'Pred Time (s)': pred_time,
        # Fold-to-fold spread of accuracy, used for error bars later.
        'Std Dev': cv_results['test_accuracy'].std()
    })

# Create comparison DataFrame
df_results = pd.DataFrame(results)
print("\nModel Comparison Results:")
print("=" * 100)
print(df_results.round(4).to_string(index=False))
# Visualize comparison
# Four-panel dashboard built from df_results (created above):
# metrics, train time, predict time, and accuracy with CV error bars.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# Plot 1: Performance metrics
metrics = ['Accuracy', 'Precision', 'Recall', 'F1', 'ROC-AUC']
df_results.set_index('Model')[metrics].plot(kind='bar', ax=axes[0, 0])
axes[0, 0].set_title('Performance Metrics', fontsize=14, fontweight='bold')
axes[0, 0].set_ylabel('Score', fontsize=12)
# Legend placed outside the axes so the grouped bars stay readable.
axes[0, 0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
axes[0, 0].set_xticklabels(axes[0, 0].get_xticklabels(), rotation=45, ha='right')
axes[0, 0].grid(alpha=0.3, axis='y')
# Plot 2: Training time
df_results.plot(x='Model', y='Train Time (s)', kind='bar', ax=axes[0, 1], legend=False)
axes[0, 1].set_title('Training Time', fontsize=14, fontweight='bold')
axes[0, 1].set_ylabel('Seconds', fontsize=12)
axes[0, 1].set_xlabel('')
axes[0, 1].set_xticklabels(axes[0, 1].get_xticklabels(), rotation=45, ha='right')
axes[0, 1].grid(alpha=0.3, axis='y')
# Plot 3: Prediction time
df_results.plot(x='Model', y='Pred Time (s)', kind='bar', ax=axes[1, 0],
legend=False, color='orange')
axes[1, 0].set_title('Prediction Time', fontsize=14, fontweight='bold')
axes[1, 0].set_ylabel('Seconds', fontsize=12)
axes[1, 0].set_xticklabels(axes[1, 0].get_xticklabels(), rotation=45, ha='right')
axes[1, 0].grid(alpha=0.3, axis='y')
# Plot 4: Accuracy with error bars
# yerr uses the cross-validation fold std stored in 'Std Dev'.
axes[1, 1].bar(df_results['Model'], df_results['Accuracy'],
yerr=df_results['Std Dev'], capsize=5, color='green', alpha=0.7)
axes[1, 1].set_title('Accuracy with Standard Deviation', fontsize=14, fontweight='bold')
axes[1, 1].set_ylabel('Accuracy', fontsize=12)
axes[1, 1].set_xticklabels(df_results['Model'], rotation=45, ha='right')
axes[1, 1].grid(alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
Part 3: Statistical Significance Testing
Why Statistical Testing?
Question: Is Model A really better than Model B, or just got lucky?
Solution: Statistical hypothesis testing
Paired T-Test
Use: Compare two models on same data
H0 (Null Hypothesis): Models perform equally
Ha (Alternative): One model performs better
# Compare two models with statistical test
def compare_models_statistically(model1, model2, X, y, cv=5):
    """Compare two estimators with a paired t-test on K-fold CV accuracy.

    Args:
        model1, model2: unfitted scikit-learn style estimators.
        X, y: features and labels forwarded to cross_val_score.
        cv: fold count (default 5). Pairing is fold-wise, so both models
            are scored with the same cv argument.

    Returns:
        (scores1, scores2, p_value): per-fold accuracy arrays for each
        model and the two-sided p-value of the paired t-test.
    """
    # Get cross-validation scores for both models
    scores1 = cross_val_score(model1, X, y, cv=cv, scoring='accuracy')
    scores2 = cross_val_score(model2, X, y, cv=cv, scoring='accuracy')

    # Paired test: fold i of model 1 is matched with fold i of model 2.
    t_stat, p_value = stats.ttest_rel(scores1, scores2)

    print(f"Model 1 scores: {scores1}")
    print(f"Model 1 mean: {scores1.mean():.4f} ± {scores1.std():.4f}")
    print(f"\nModel 2 scores: {scores2}")
    print(f"Model 2 mean: {scores2.mean():.4f} ± {scores2.std():.4f}")
    print(f"\nPaired t-test:")
    print(f" t-statistic: {t_stat:.4f}")
    print(f" p-value: {p_value:.4f}")

    alpha = 0.05
    if p_value < alpha:
        better = "Model 1" if scores1.mean() > scores2.mean() else "Model 2"
        print(f"\n✅ {better} is SIGNIFICANTLY better (p < {alpha})")
    else:
        print(f"\n❌ No significant difference (p >= {alpha})")
    return scores1, scores2, p_value
# Compare Logistic Regression vs Random Forest
# cv=10 gives the paired t-test more paired observations than the default 5.
print("Comparing Logistic Regression vs Random Forest:")
print("=" * 70)
scores_lr, scores_rf, p_val = compare_models_statistically(
LogisticRegression(random_state=42, max_iter=1000),
RandomForestClassifier(random_state=42, n_estimators=100),
X, y, cv=10
)
McNemar's Test for Classification
McNemar's test is a non-parametric test specifically designed for comparing two classifiers on the same test set. Unlike the paired t-test (which compares scores across folds), McNemar's focuses on the disagreements between two models: cases where Model A is correct but Model B is wrong, and vice versa. If the number of these disagreements is roughly balanced, the models perform equivalently. The test statistic follows a chi-squared distribution with 1 degree of freedom, and a significant result (p < 0.05) indicates the models make genuinely different types of errors.
from statsmodels.stats.contingency_tables import mcnemar

# Train two models on the same split so their test-set errors are comparable.
model1 = LogisticRegression(random_state=42, max_iter=1000)
model2 = RandomForestClassifier(random_state=42, n_estimators=100)
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)

pred1 = model1.predict(X_test)
pred2 = model2.predict(X_test)

# Build the 2x2 table in the layout mcnemar() expects: the DISCORDANT
# counts (one model right, the other wrong) must sit on the off-diagonal
# (table[0,1] and table[1,0]), because those are the only cells the test
# statistic uses. The previous layout placed the concordant counts there,
# so the statistic was computed from the wrong cells.
contingency_table = pd.DataFrame(
    [[np.sum((pred1 == y_test) & (pred2 == y_test)),   # both right
      np.sum((pred1 == y_test) & (pred2 != y_test))],  # model1 right, model2 wrong
     [np.sum((pred1 != y_test) & (pred2 == y_test)),   # model1 wrong, model2 right
      np.sum((pred1 != y_test) & (pred2 != y_test))]], # both wrong
    columns=['Model2 Right', 'Model2 Wrong'],
    index=['Model1 Right', 'Model1 Wrong']
)

print("McNemar's Test:")
print("=" * 60)
print("\nContingency Table:")
print(contingency_table)

# Chi-squared approximation with continuity correction.
result = mcnemar(contingency_table.values, exact=False, correction=True)
print(f"\nStatistic: {result.statistic:.4f}")
print(f"P-value: {result.pvalue:.4f}")

if result.pvalue < 0.05:
    print("\n✅ Models perform SIGNIFICANTLY differently")
else:
    print("\n❌ No significant difference in model performance")
Part 4: A/B Testing
What is A/B Testing?
Production setting: Test new model (B) against current model (A)
Process:
Route traffic: 50% to model A, 50% to model B
Collect metrics
Statistical comparison
Make deployment decision
Key Metrics for A/B Tests
Primary Metric: Core business goal (accuracy, CTR, revenue)
Secondary Metrics: User experience, latency, cost
Guardrail Metrics: Safety checks (error rate, bias)
# Simulate A/B test
def simulate_ab_test(model_a, model_b, X_test, y_test, metric='accuracy'):
    """Simulate an A/B test between two fitted models.

    Each test example is randomly routed to control (A) or treatment (B),
    and the two group accuracies are compared with a pooled two-proportion
    z-test.

    Args:
        model_a: control model; must expose .predict.
        model_b: treatment model; must expose .predict.
        X_test: feature array (only its length and rows are used).
        y_test: label numpy array aligned with X_test.
        metric: reserved for future use — accuracy is always reported.

    Returns:
        (score_a, score_b, p_value): group accuracies and the two-tailed
        p-value of the z-test.
    """
    # Deterministic 50/50 traffic split.
    np.random.seed(42)
    assignment = np.random.choice(['A', 'B'], size=len(X_test), p=[0.5, 0.5])

    pred_a = model_a.predict(X_test)
    pred_b = model_b.predict(X_test)

    mask_a = (assignment == 'A')
    mask_b = (assignment == 'B')
    n_a = mask_a.sum()
    n_b = mask_b.sum()

    # Count correct predictions once and derive the group accuracies from
    # the counts (the original recomputed accuracy with a separate call).
    correct_a = (pred_a[mask_a] == y_test[mask_a]).sum()
    correct_b = (pred_b[mask_b] == y_test[mask_b]).sum()
    score_a = correct_a / n_a
    score_b = correct_b / n_b

    # Two-proportion z-test with pooled variance.
    p_pool = (correct_a + correct_b) / (n_a + n_b)
    se = np.sqrt(p_pool * (1 - p_pool) * (1/n_a + 1/n_b))
    z_stat = (score_b - score_a) / se if se > 0 else 0
    # Two-tailed p-value from the standard normal.
    p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))

    print("A/B Test Results:")
    print("=" * 70)
    print(f"Model A (Control):")
    print(f" Sample size: {n_a}")
    print(f" Accuracy: {score_a:.4f}")
    print(f"\nModel B (Treatment):")
    print(f" Sample size: {n_b}")
    print(f" Accuracy: {score_b:.4f}")
    print(f"\nDifference: {score_b - score_a:.4f} ({(score_b - score_a)/score_a * 100:+.2f}%)")
    print(f"\nStatistical Test:")
    print(f" Z-statistic: {z_stat:.4f}")
    print(f" P-value: {p_value:.4f}")

    if p_value < 0.05:
        if score_b > score_a:
            print("\n🎉 Model B is SIGNIFICANTLY better! Deploy it!")
        else:
            print("\n⚠️ Model B is SIGNIFICANTLY worse! Keep Model A!")
    else:
        print("\n❌ No significant difference. Need more data or keep Model A.")
    return score_a, score_b, p_value
# Run A/B test
# model1/model2 were fitted on X_train in the McNemar's section above.
score_a, score_b, p_val = simulate_ab_test(
model1, # Current model (Logistic Regression)
model2, # New model (Random Forest)
X_test, y_test
)
Sample Size Calculation
Question: How much data do I need for my A/B test?
Running an A/B test with too few samples risks a Type II error — failing to detect a real improvement. The required sample size depends on three factors: the baseline metric (current model performance), the minimum detectable effect (MDE — the smallest improvement worth deploying for), and the desired statistical power (typically 80%, meaning an 80% chance of detecting a true effect). Smaller effects require exponentially more data to detect reliably, which is why teams must decide upfront what improvement threshold justifies the engineering cost of deploying a new model.
def calculate_sample_size(baseline_rate, mde, alpha=0.05, power=0.8):
    """Required per-group sample size for a two-proportion A/B test.

    Args:
        baseline_rate: Current metric value (e.g., 0.80 for 80% accuracy)
        mde: Minimum Detectable Effect (e.g., 0.02 for 2% improvement)
        alpha: Significance level (default: 0.05)
        power: Statistical power (default: 0.8)

    Returns:
        int: samples needed per group, rounded up.
    """
    # Critical values for a two-sided test at `alpha` and the chosen power.
    z_a = stats.norm.ppf(1 - alpha/2)
    z_b = stats.norm.ppf(power)

    # Pooled variance is driven by the midpoint of baseline and target rates.
    rate_mid = (baseline_rate + (baseline_rate + mde)) / 2

    per_group = (2 * rate_mid * (1 - rate_mid) * (z_a + z_b)**2) / (mde**2)
    return int(np.ceil(per_group))
# Calculate required sample sizes for a range of detectable effects.
baseline = 0.80 # Current model: 80% accuracy

print("Sample Size Requirements (per group):")
print("=" * 70)
print(f"Baseline accuracy: {baseline*100:.0f}%\n")

for mde in [0.01, 0.02, 0.05, 0.10]:
    n = calculate_sample_size(baseline, mde)
    print(f"To detect {mde*100:.0f}% improvement: {n:,} samples per group ({n*2:,} total)")

print("\n💡 Smaller improvements require more data!")
Part 5: Multi-Objective Selection
The Trade-off Dilemma
Real world: Multiple objectives matter!
Accuracy
Fairness
Speed (latency)
Cost
Interpretability
Challenge: They often conflict!
# Multi-objective comparison
def normalize_score(scores, higher_better=True):
    """Min-max scale `scores` onto the [0, 1] range.

    With higher_better=False the scale is inverted so the smallest raw
    value maps to 1. NOTE(review): a constant series gives a zero range
    and hence division by zero — callers should avoid that case.
    """
    lo = scores.min()
    hi = scores.max()
    span = hi - lo
    if higher_better:
        return (scores - lo) / span
    return (hi - scores) / span
# Add normalized (0-1) versions of each objective so they can be combined.
df_results['Norm_Accuracy'] = normalize_score(df_results['Accuracy'])
df_results['Norm_Speed'] = normalize_score(df_results['Pred Time (s)'], higher_better=False)
df_results['Norm_F1'] = normalize_score(df_results['F1'])

# Calculate weighted composite score
# Example weights: 50% accuracy, 30% speed, 20% F1
weights = {'accuracy': 0.5, 'speed': 0.3, 'f1': 0.2}
df_results['Composite_Score'] = (
    weights['accuracy'] * df_results['Norm_Accuracy'] +
    weights['speed'] * df_results['Norm_Speed'] +
    weights['f1'] * df_results['Norm_F1']
)

print("Multi-Objective Model Comparison:")
print("=" * 80)
print(f"Weights: Accuracy={weights['accuracy']}, Speed={weights['speed']}, F1={weights['f1']}\n")
print(df_results[['Model', 'Accuracy', 'Pred Time (s)', 'F1', 'Composite_Score']]
      .sort_values('Composite_Score', ascending=False)
      .round(4)
      .to_string(index=False))

best_model = df_results.loc[df_results['Composite_Score'].idxmax(), 'Model']
print(f"\n🏆 Best model (weighted): {best_model}")
# Visualize trade-offs
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Plot 1: Accuracy vs Speed scatter, one labelled point per model.
axes[0].scatter(df_results['Pred Time (s)'], df_results['Accuracy'], s=200, alpha=0.6)
for idx, row in df_results.iterrows():
    axes[0].annotate(row['Model'],
                     (row['Pred Time (s)'], row['Accuracy']),
                     fontsize=9, ha='center')
axes[0].set_xlabel('Prediction Time (seconds)', fontsize=12)
axes[0].set_ylabel('Accuracy', fontsize=12)
axes[0].set_title('Accuracy vs Speed Trade-off', fontsize=14, fontweight='bold')
axes[0].grid(alpha=0.3)

# Plot 2: Composite scores, sorted so the best model is on top.
df_sorted = df_results.sort_values('Composite_Score')
axes[1].barh(df_sorted['Model'], df_sorted['Composite_Score'], color='teal', alpha=0.7)
axes[1].set_xlabel('Composite Score', fontsize=12)
axes[1].set_title('Overall Ranking (Weighted)', fontsize=14, fontweight='bold')
axes[1].grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

print("\n💡 Different weights yield different 'best' models!")
print(" Choose weights based on your priorities.")
Part 6: Model Selection Framework
Decision Checklist
Choosing the "best" model is rarely as simple as picking the highest accuracy. Production model selection requires weighing multiple competing objectives — accuracy vs. latency, fairness vs. overall performance, interpretability vs. predictive power. The decision framework below provides a structured sequence of questions that moves from performance to statistical significance to practical significance to deployment readiness, ensuring that the final choice is defensible to stakeholders at every level.
# Create decision framework
# Each row is (question, action if yes, action if no).
_FRAMEWORK_ROWS = [
    ("Is accuracy the ONLY metric?",
     "Pick highest accuracy", "Use multi-objective approach"),
    ("Is there a performance difference?",
     "Do statistical test", "Choose simpler/faster model"),
    ("Is difference statistically significant?",
     "Consider practical significance", "Keep current model or choose simpler"),
    ("Is difference practically significant?",
     "Deploy better model", "Consider cost-benefit trade-off"),
    ("Are there fairness concerns?",
     "Evaluate bias metrics", "Proceed with deployment"),
    ("Is interpretability required?",
     "Favor simpler models", "Any model OK"),
]
decision_framework = pd.DataFrame(_FRAMEWORK_ROWS,
                                  columns=["Question", "If Yes", "If No"])

print("Model Selection Decision Framework:")
print("=" * 90)
print(decision_framework.to_string(index=False))
Final Recommendation Template
A model selection recommendation should communicate not just which model to deploy, but why — including the primary metric, the performance gap, the speed trade-off, and caveats. The function below generates a structured recommendation that balances the best-performing model against the fastest model, applying a configurable threshold to determine whether the performance difference justifies the additional computational cost. This template is designed to be shared directly with engineering and product stakeholders.
def generate_recommendation(df_results, primary_metric='Accuracy', threshold=0.01):
    """Print a model-selection recommendation balancing score vs. speed.

    Compares the best-scoring model against the fastest one: if the gap on
    `primary_metric` exceeds `threshold` the better model is recommended,
    otherwise the faster one wins.

    Args:
        df_results: DataFrame with at least 'Model', 'Pred Time (s)' and
            the `primary_metric` column.
        primary_metric: column name used to rank performance.
        threshold: minimum metric gap that justifies the slower model.
    """
    # Row of the best-performing model on the chosen metric.
    best_idx = df_results[primary_metric].idxmax()
    best_model = df_results.loc[best_idx]

    # Row of the fastest model at prediction time.
    fastest_idx = df_results['Pred Time (s)'].idxmin()
    fastest_model = df_results.loc[fastest_idx]

    # Metric gap between the two candidates.
    diff = best_model[primary_metric] - fastest_model[primary_metric]

    print("🎯 MODEL SELECTION RECOMMENDATION")
    print("=" * 80)
    print(f"\nPrimary Metric: {primary_metric}")
    print(f"Threshold for meaningful difference: {threshold*100:.1f}%\n")
    print(f"Best Performing Model: {best_model['Model']}")
    print(f" {primary_metric}: {best_model[primary_metric]:.4f}")
    print(f" Prediction Time: {best_model['Pred Time (s)']:.4f}s")
    print(f"\nFastest Model: {fastest_model['Model']}")
    print(f" {primary_metric}: {fastest_model[primary_metric]:.4f}")
    print(f" Prediction Time: {fastest_model['Pred Time (s)']:.4f}s")
    print(f"\nPerformance Difference: {diff:.4f} ({diff/fastest_model[primary_metric]*100:.2f}%)")
    print("\n" + "=" * 80)

    if diff > threshold:
        print(f"✅ RECOMMENDATION: Deploy '{best_model['Model']}'")
        print(f" Reason: Significantly better {primary_metric} ({diff*100:.2f}% improvement)")
    else:
        print(f"✅ RECOMMENDATION: Deploy '{fastest_model['Model']}'")
        print(f" Reason: Similar performance but {best_model['Pred Time (s)']/fastest_model['Pred Time (s)']:.1f}x faster")

    print("\n⚠️ Additional Considerations:")
    print(" • Monitor performance in production")
    print(" • Check for bias across user groups")
    print(" • Set up A/B test for validation")
    print(" • Plan for model retraining")
generate_recommendation(df_results, primary_metric='F1', threshold=0.02)
🎯 Knowledge Check
Q1: Why is cross-validation better than a single train/test split?
Q2: What does a p-value < 0.05 mean in model comparison?
Q3: When should you use stratified k-fold?
Q4: What's the difference between statistical and practical significance?
Click for answers
A1: More reliable estimate, reduces variance from single split, uses all data
A2: Less than 5% chance the difference is due to random chance
A3: When classes are imbalanced - maintains class distribution in each fold
A4: Statistical: mathematically significant. Practical: meaningful in real world (worth the cost/effort)
📊 Summary
Model Comparison Best Practices
Use Cross-Validation
K=5 or K=10 for most cases
Stratified for imbalanced data
Report mean ± std
Statistical Testing
Paired t-test for cross-validation
McNemar's test for single test set
Report p-values
Consider Multiple Objectives
Accuracy
Speed/Latency
Fairness
Interpretability
Cost
Production Validation
A/B testing
Monitor continuously
Track business metrics
📚 Next Steps
Complete Model Comparison Challenge
Start Phase 15 Assignment
Practice with your own datasets
Set up A/B testing framework
Congratulations! You've completed Phase 15: Model Evaluation & Metrics! 🎉