Model Selection: Cross-Validation, Learning Curves & Bias-Variance¶

Choosing the right model and tuning it properly, without accidentally overfitting to your validation set.

1. The Train / Validation / Test Split: Why You Need Three Sets¶

All Data
├── Training Set (60-70%)   → Model learns patterns
├── Validation Set (10-20%) → Tune hyperparameters (model selection)
└── Test Set (10-20%)       → Final, unbiased performance estimate
                               (use ONCE, at the very end)

The cardinal rule: once you look at test set performance, you've "used it up." Any further tuning based on test results leads to optimistic estimates; your model is now indirectly trained on the test set.

Cross-validation solves the validation problem: it reuses training data by rotating which fold is held out, giving a more reliable estimate than a single val split.
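The three-way split itself can be built with two chained train_test_split calls. A minimal sketch on toy data (the 60/20/20 ratios and the tiny arrays here are illustrative):

```python
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(100).reshape(50, 2)
y = np.array([0, 1] * 25)

# First carve off the test set (20% of all data)...
X_tmp, X_test, y_tmp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)
# ...then split the remainder: 0.25 of the remaining 80% = 20% of the original.
X_train, X_val, y_train, y_val = train_test_split(
    X_tmp, y_tmp, test_size=0.25, random_state=0, stratify=y_tmp
)

print(len(X_train), len(X_val), len(X_test))  # 30 10 10
```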

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% as sacred test set
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Total samples:      {len(X)}")
print(f"Train+val samples:  {len(X_trainval)}")
print(f"Test samples:       {len(X_test)}  (locked away!)")
print(f"\nClass distribution in test: {np.bincount(y_test)}")

2. k-Fold Cross-Validation¶

Instead of a single train/val split, k-fold runs the model k times, each time using a different fold as validation.

from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Build a pipeline (always use pipelines with CV!)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

# Standard k-fold
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores_kfold = cross_val_score(pipe, X_trainval, y_trainval, cv=kfold, scoring='accuracy')

# Stratified k-fold (RECOMMENDED for classification: preserves class ratio in each fold)
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores_skfold = cross_val_score(pipe, X_trainval, y_trainval, cv=skfold, scoring='accuracy')

print("Standard KFold:")
print(f"  Fold scores: {scores_kfold.round(4)}")
print(f"  Mean ± Std:  {scores_kfold.mean():.4f} ± {scores_kfold.std():.4f}")

print("\nStratified KFold (preferred for classification):")
print(f"  Fold scores: {scores_skfold.round(4)}")
print(f"  Mean ± Std:  {scores_skfold.mean():.4f} ± {scores_skfold.std():.4f}")
# Visualize fold-by-fold variation
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(range(1, 6), scores_skfold, color='steelblue', alpha=0.8)
ax.axhline(scores_skfold.mean(), color='red', linestyle='--', label=f'Mean = {scores_skfold.mean():.4f}')
ax.fill_between(range(0, 7),
                scores_skfold.mean() - scores_skfold.std(),
                scores_skfold.mean() + scores_skfold.std(),
                alpha=0.2, color='red', label=f'±1 std = {scores_skfold.std():.4f}')
ax.set_xlabel('Fold')
ax.set_ylabel('Accuracy')
ax.set_title('5-Fold Cross-Validation Scores (Breast Cancer)')
ax.legend()
ax.set_ylim(0.9, 1.0)
ax.set_xticks(range(1, 6))
plt.tight_layout()
plt.show()
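Under the hood, cross_val_score refits the estimator once per fold. A hand-rolled sketch of the same loop (rebuilding the pipeline above) should reproduce its scores exactly, which makes the fold rotation explicit:

```python
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = load_breast_cancer(return_X_y=True)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Manual loop: refit on each fold's training rows, score the held-out fold.
# Note the scaler is refit inside each fold -- no leakage from the val fold.
manual_scores = []
for train_idx, val_idx in skf.split(X, y):
    pipe.fit(X[train_idx], y[train_idx])
    manual_scores.append(pipe.score(X[val_idx], y[val_idx]))

auto_scores = cross_val_score(pipe, X, y, cv=skf)
print(np.allclose(manual_scores, auto_scores))  # True
```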

3. GridSearchCV vs RandomizedSearchCV¶

  • GridSearchCV: Exhaustive search over a discrete grid. Guaranteed to find the best grid point, but slow.

  • RandomizedSearchCV: Samples n_iter random combinations from the given distributions. With n_iter well below the grid size it is proportionally faster, and in practice often finds parameters as good as or better than the grid.
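loguniform (used below for C and gamma) samples evenly across orders of magnitude rather than linearly, which suits scale parameters like C. A quick sketch of what the sampler produces:

```python
import numpy as np
from scipy.stats import loguniform

# Five candidate C values drawn log-uniformly from [0.01, 100].
# Roughly as many samples land in [0.01, 0.1] as in [10, 100].
samples = loguniform(1e-2, 1e2).rvs(size=5, random_state=42)
print(np.round(samples, 4))
```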

import time
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from scipy.stats import loguniform

pipe_svc = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', SVC(random_state=42))
])

# GridSearchCV: 3 x 3 x 2 = 18 combinations × 5 folds = 90 fits
param_grid = {
    'clf__C':      [0.1, 1.0, 10.0],
    'clf__gamma':  [0.001, 0.01, 0.1],
    'clf__kernel': ['rbf', 'linear']
}

t0 = time.time()
grid = GridSearchCV(pipe_svc, param_grid, cv=5, n_jobs=-1)
grid.fit(X_trainval, y_trainval)
t_grid = time.time() - t0

# RandomizedSearchCV: sample 10 combinations × 5 folds = 50 fits
param_dist = {
    'clf__C':      loguniform(1e-2, 1e2),
    'clf__gamma':  loguniform(1e-4, 1e-1),
    'clf__kernel': ['rbf', 'linear']
}

t0 = time.time()
rand = RandomizedSearchCV(pipe_svc, param_dist, n_iter=10, cv=5, n_jobs=-1, random_state=42)
rand.fit(X_trainval, y_trainval)
t_rand = time.time() - t0

print(f"GridSearchCV  - Best CV: {grid.best_score_:.4f}, time: {t_grid:.2f}s, fits: {len(param_grid['clf__C'])*len(param_grid['clf__gamma'])*len(param_grid['clf__kernel'])*5}")
print(f"RandomizedCV  - Best CV: {rand.best_score_:.4f}, time: {t_rand:.2f}s, fits: {10*5}")
print(f"\nGrid best params:  {grid.best_params_}")
print(f"Random best params: {rand.best_params_}")
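Both searchers also expose a cv_results_ dict with per-candidate scores; loading it into a DataFrame makes it easy to inspect more than just best_params_. A small sketch with a deliberately tiny one-parameter grid (illustrative, not the grid above):

```python
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = load_breast_cancer(return_X_y=True)
pipe = Pipeline([('scaler', StandardScaler()), ('clf', SVC(random_state=42))])
grid = GridSearchCV(pipe, {'clf__C': [0.1, 1.0, 10.0]}, cv=5)
grid.fit(X, y)

# cv_results_ is a dict of parallel arrays -- one entry per candidate.
results = pd.DataFrame(grid.cv_results_)
cols = ['param_clf__C', 'mean_test_score', 'std_test_score', 'rank_test_score']
print(results[cols].sort_values('rank_test_score').to_string(index=False))
```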

4. Learning Curves: Diagnosing Underfitting vs Overfitting¶

Learning curves plot training and validation score as a function of training set size. They diagnose:

  • High bias (underfitting): Both curves plateau at low accuracy

  • High variance (overfitting): Large gap between train and val curves

from sklearn.model_selection import learning_curve
from sklearn.tree import DecisionTreeClassifier

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

models = [
    ('Underfitting (depth=1)', DecisionTreeClassifier(max_depth=1, random_state=42)),
    ('Overfitting (depth=None)', DecisionTreeClassifier(max_depth=None, random_state=42)),
]

for ax, (title, model) in zip(axes, models):
    train_sizes, train_scores, val_scores = learning_curve(
        model, X_trainval, y_trainval,
        cv=5,
        train_sizes=np.linspace(0.1, 1.0, 10),
        scoring='accuracy',
        n_jobs=-1
    )
    
    train_mean = train_scores.mean(axis=1)
    train_std  = train_scores.std(axis=1)
    val_mean   = val_scores.mean(axis=1)
    val_std    = val_scores.std(axis=1)
    
    ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2, color='blue')
    ax.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.2, color='orange')
    ax.plot(train_sizes, train_mean, 'o-', color='blue', label='Training score')
    ax.plot(train_sizes, val_mean, 'o-', color='orange', label='Validation score')
    ax.set_title(title)
    ax.set_xlabel('Training examples')
    ax.set_ylabel('Accuracy')
    ax.legend(loc='lower right')
    ax.set_ylim(0.5, 1.05)
    ax.grid(True, alpha=0.3)

plt.suptitle('Learning Curves: Diagnosing Bias vs Variance', fontsize=14)
plt.tight_layout()
plt.show()

print("Left:  Both curves plateau low → HIGH BIAS → need more complex model")
print("Right: Large gap between curves → HIGH VARIANCE → need regularization or more data")
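The same diagnosis can be made numerically: cross_validate with return_train_score=True gives both curves' right-hand endpoints, and the train-validation gap quantifies variance. A quick sketch comparing the two trees from the plot:

```python
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier

X, y = load_breast_cancer(return_X_y=True)

gaps = {}
for name, depth in [('depth=1', 1), ('depth=None', None)]:
    cv_res = cross_validate(DecisionTreeClassifier(max_depth=depth, random_state=42),
                            X, y, cv=5, return_train_score=True)
    tr, va = cv_res['train_score'].mean(), cv_res['test_score'].mean()
    gaps[name] = tr - va  # large gap -> high variance
    print(f"{name}: train={tr:.3f}  val={va:.3f}  gap={tr - va:.3f}")
```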

5. Validation Curves: Effect of a Single Hyperparameter¶

from sklearn.model_selection import validation_curve

# How does regularization strength C affect Logistic Regression?
C_range = np.logspace(-4, 4, 20)

pipe_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=5000, random_state=42))
])

train_scores_vc, val_scores_vc = validation_curve(
    pipe_lr, X_trainval, y_trainval,
    param_name='clf__C',
    param_range=C_range,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

fig, ax = plt.subplots(figsize=(9, 5))
ax.semilogx(C_range, train_scores_vc.mean(axis=1), 'o-', color='blue', label='Training score')
ax.fill_between(C_range, train_scores_vc.mean(axis=1) - train_scores_vc.std(axis=1),
                         train_scores_vc.mean(axis=1) + train_scores_vc.std(axis=1), alpha=0.15, color='blue')
ax.semilogx(C_range, val_scores_vc.mean(axis=1), 'o-', color='orange', label='Validation score')
ax.fill_between(C_range, val_scores_vc.mean(axis=1) - val_scores_vc.std(axis=1),
                         val_scores_vc.mean(axis=1) + val_scores_vc.std(axis=1), alpha=0.15, color='orange')

best_C_idx = val_scores_vc.mean(axis=1).argmax()
ax.axvline(C_range[best_C_idx], color='red', linestyle='--', label=f'Best C = {C_range[best_C_idx]:.3f}')
ax.set_xlabel('C (inverse regularization strength: higher = less regularized)')
ax.set_ylabel('Accuracy')
ax.set_title('Validation Curve: LogisticRegression C parameter')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

6. Nested Cross-Validation: The Honest Estimate¶

When you report GridSearchCV's best_score_ as your performance estimate, you're "peeking": the same validation folds were used both to pick the hyperparameters and to score them. Nested CV solves this:

  • Inner loop: Finds the best hyperparameters (model selection)

  • Outer loop: Evaluates the selected model on held-out data (performance estimation)

Outer CV fold 1: [val fold 1] | [train folds 2-5]
                                    └──> Inner CV: GridSearch picks best params
                                    └──> Refit on all inner train data
                                    └──> Evaluate on outer val fold 1
Outer CV fold 2: [val fold 2] | [train folds 1, 3-5]
  ... and so on

from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

# Inner CV: hyperparameter tuning
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# Outer CV: performance estimation
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

pipe_nested = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', SVC(random_state=42))
])

param_grid_nested = {
    'clf__C':     [0.1, 1.0, 10.0],
    'clf__gamma': [0.01, 0.1]
}

# GridSearchCV becomes the "estimator" in outer cross_val_score
gs = GridSearchCV(pipe_nested, param_grid_nested, cv=inner_cv, n_jobs=-1)

nested_scores = cross_val_score(gs, X_trainval, y_trainval, cv=outer_cv, n_jobs=-1)

# Compare with non-nested (optimistic) estimate
non_nested_gs = GridSearchCV(pipe_nested, param_grid_nested, cv=5, n_jobs=-1)
non_nested_gs.fit(X_trainval, y_trainval)

print(f"Nested CV score:     {nested_scores.mean():.4f} ± {nested_scores.std():.4f}  ← honest estimate")
print(f"Non-nested CV score: {non_nested_gs.best_score_:.4f}                       ← slightly optimistic")
print(f"Score inflation:     {non_nested_gs.best_score_ - nested_scores.mean():.4f}")

Exercises¶

  1. Compare CV strategies: On the breast cancer dataset, compare KFold, StratifiedKFold, and RepeatedStratifiedKFold(n_splits=5, n_repeats=10). Which gives the lowest variance in scores?

  2. Learning curve diagnosis: Plot learning curves for (a) LogisticRegression, (b) DecisionTreeClassifier(max_depth=5), (c) SVC(C=100). Diagnose each as high-bias, high-variance, or well-fit.

  3. RandomizedSearchCV tuning: Use RandomizedSearchCV with n_iter=50 to tune a RandomForestClassifier. Compare against GridSearchCV on the same dataset. Which finds a better score? How long does each take?

  4. Nested vs non-nested gap: Generate 20 different random synthetic datasets (use make_classification with varying random_state). For each, compute the nested and non-nested CV score gap. What is the average inflation?

  5. Validation curve for depth: Plot a validation curve for DecisionTreeClassifier with max_depth ranging from 1 to 20. At what depth does overfitting start? Does this change with dataset size?