# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer, make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report
import logging
import warnings

# Set random seed for reproducibility
# (Seeds NumPy's legacy global RNG, which np.random.shuffle below draws from.)
np.random.seed(42)

# Configure plotting
# NOTE: order matters -- plt.style.use('default') resets rcParams, so the
# seaborn palette must be applied *after* it or it would be clobbered.
plt.style.use('default')
sns.set_palette('husl')

print("βœ… Libraries imported successfully!")

Part 1: The Debugging WorkflowΒΆ

The 5-Step ML Debugging ProcessΒΆ

1. REPRODUCE β†’ Make the bug happen consistently
2. GATHER β†’ Collect data, logs, metrics
3. HYPOTHESIZE β†’ Form testable theories about the cause
4. TEST β†’ Make targeted changes and re-run
5. VERIFY β†’ Confirm fix works across scenarios

Example Scenario: Model Not LearningΒΆ

A model that trains to chance-level accuracy (~50% on binary classification) is one of the most common and frustrating bugs in ML. The root causes range from shuffled labels to data leakage to incorrect loss functions. Below, we simulate a subtle but devastating bug – independently shuffling features and labels, which destroys the correspondence between inputs and outputs. Walking through the 5-step debugging process on this concrete example builds the diagnostic muscle memory you need for real-world issues.

# Step 1: REPRODUCE - Create buggy code
# NOTE: the shuffle bug below is INTENTIONAL -- it is the teaching example
# this notebook debugs. Do not "fix" it here; the fix comes in Step 4.

# Load data
# Synthetic binary task: 20 features (15 informative + 5 redundant).
X, y = make_classification(
    n_samples=1000, 
    n_features=20, 
    n_informative=15,
    n_redundant=5,
    random_state=42
)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# BUG: Features and labels are shuffled independently!
# Each np.random.shuffle call works in place and draws its OWN permutation,
# so row i of X_train no longer corresponds to y_train[i].
np.random.shuffle(X_train)  # ❌ This breaks the X-y correspondence
np.random.shuffle(y_train)

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate
# Both scores land near 0.5 because the labels are now effectively random
# with respect to the features.
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)

print(f"❌ Train Accuracy: {train_acc:.3f}")
print(f"❌ Test Accuracy: {test_acc:.3f}")
print("\n⚠️ Problem: Model performs at chance level (~50%)!")

Step 2: GATHER DataΒΆ

Before forming hypotheses, collect all available diagnostic information. Python’s logging module provides structured, timestamped output that is far more useful than scattered print statements – especially when debugging in production or reviewing logs after the fact. Key data points to gather include dataset shapes, class distributions, feature ranges, presence of NaN/Inf values, and any preprocessing steps that have been applied. The more context you capture upfront, the faster you can narrow down the root cause.

# Configure logging
# NOTE: basicConfig() configures the root logger only on its first call in
# a process; subsequent calls are no-ops (unless force=True is passed).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Gather diagnostic information
# At this point X_train/y_train are the independently-shuffled arrays from
# Step 1: shapes and class counts still look perfectly normal, which is
# exactly why this class of bug is so easy to miss.
logger.info(f"Dataset shape: X={X.shape}, y={y.shape}")
logger.info(f"Train set: X_train={X_train.shape}, y_train={y_train.shape}")
logger.info(f"Test set: X_test={X_test.shape}, y_test={y_test.shape}")
logger.info(f"Class distribution: {np.bincount(y)}")
logger.info(f"Feature range: [{X.min():.2f}, {X.max():.2f}]")

# Check for common issues
logger.warning(f"Any NaN values? {np.isnan(X).any()}")
logger.warning(f"Any Inf values? {np.isinf(X).any()}")

Step 3: HYPOTHESIZE Possible CausesΒΆ

Hypothesis 1: Data leakage or preprocessing issue
Hypothesis 2: Model too simple for the problem
Hypothesis 3: Labels shuffled incorrectly βœ… (Correct!)

# Step 4: TEST - Create baseline model

# Reload clean data
# FIX: regenerate with the SAME parameters as the buggy run in Step 1,
# including n_redundant=5. The original reload omitted n_redundant (which
# defaults to 2), so the "fixed" accuracies were measured on a different
# dataset and were not directly comparable to the buggy run.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Test baseline model: always predicts the majority class.
# Any useful model must beat this floor.
baseline = DummyClassifier(strategy='most_frequent')
baseline.fit(X_train, y_train)
baseline_acc = baseline.score(X_test, y_test)

print(f"πŸ“Š Baseline (most frequent): {baseline_acc:.3f}")
print("\nπŸ’‘ If our model can't beat this, something is wrong!")
# Fix: Don't shuffle features and labels independently
# Use train_test_split's shuffle parameter instead -- it permutes X and y
# TOGETHER, preserving row correspondence. (shuffle=True is the default;
# it is spelled out here for emphasis.)

# Correct approach:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate on both splits to check for over-/underfitting
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)

print(f"βœ… Train Accuracy: {train_acc:.3f}")
print(f"βœ… Test Accuracy: {test_acc:.3f}")
print("\nπŸŽ‰ Fixed! Model now performs well!")

Step 5: VERIFY the FixΒΆ

A fix that works on one random seed might fail on another. Verification means testing across multiple random seeds, different data subsets, and edge cases to confirm the fix is robust. If accuracy is consistent (low standard deviation) across runs, you can be confident the bug is genuinely resolved rather than masked by a lucky split. In production systems, this step often includes running the full regression test suite and monitoring metrics after deployment.

# Re-run the fixed pipeline under several different seeds: a robust fix
# should yield similar accuracy no matter how the data happens to split.
results = []

for seed in [42, 123, 456, 789, 999]:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=seed
    )
    # fit() returns the estimator, so train-and-score chains naturally
    model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    results.append(model.score(X_test, y_test))

print("Verification Results (5 different seeds):")
print(f"Mean accuracy: {np.mean(results):.3f} Β± {np.std(results):.3f}")
print(f"Min: {np.min(results):.3f}, Max: {np.max(results):.3f}")
print("\nβœ… Consistent performance across runs!")

Part 2: Sanity Checks ChecklistΒΆ

Essential Sanity Checks Before TrainingΒΆ

Sanity checks are the ML equivalent of a pilot’s pre-flight checklist. Skipping them is the single most common source of wasted debugging time. A comprehensive check covers data alignment (do X and y correspond?), missing values, class balance, feature scale, duplicates, and train/test leakage. Running these checks takes seconds but can save hours of downstream confusion. The function below packages all essential checks into a single reusable diagnostic tool.

def perform_sanity_checks(X_train, X_test, y_train, y_test):
    """
    Run a battery of pre-training sanity checks and print a report.

    Checks, in order: shape alignment, missing values, class
    distribution/imbalance, feature scale statistics, and duplicate
    training rows.

    Args:
        X_train: 2-D numeric feature array for training.
        X_test: 2-D numeric feature array for testing.
        y_train: 1-D label array. Integer class labels are expected by
            the class-distribution check (np.bincount).
        y_test: 1-D label array for the test split.

    Returns:
        None. All results are printed.

    Raises:
        AssertionError: if train/test shapes are misaligned.
            (NOTE: asserts are stripped under ``python -O``; kept here
            because this is an interactive diagnostic tool.)
    """
    print("=" * 60)
    print("SANITY CHECKS")
    print("=" * 60)
    
    # 1. Shape checks
    print("\n1. SHAPE CHECKS")
    print(f"   X_train shape: {X_train.shape}")
    print(f"   X_test shape: {X_test.shape}")
    print(f"   y_train shape: {y_train.shape}")
    print(f"   y_test shape: {y_test.shape}")
    
    # Check alignment: rows of X must pair 1:1 with labels, and both
    # splits must share the same feature dimension.
    assert X_train.shape[0] == y_train.shape[0], "❌ X_train and y_train size mismatch!"
    assert X_test.shape[0] == y_test.shape[0], "❌ X_test and y_test size mismatch!"
    assert X_train.shape[1] == X_test.shape[1], "❌ Feature dimension mismatch!"
    print("   βœ… All shapes aligned")
    
    # 2. Missing values
    print("\n2. MISSING VALUES")
    X_train_missing = np.isnan(X_train).sum()
    X_test_missing = np.isnan(X_test).sum()
    # FIX: the original tested ``y_train.dtype == float``, which is only
    # True for float64 and silently skipped the NaN check for float32 /
    # float16 label arrays. np.issubdtype covers every floating dtype
    # (np.isnan rejects integer arrays, hence the guard).
    if np.issubdtype(y_train.dtype, np.floating):
        y_train_missing = np.isnan(y_train).sum()
    else:
        y_train_missing = 0
    
    print(f"   X_train: {X_train_missing} NaN values")
    print(f"   X_test: {X_test_missing} NaN values")
    print(f"   y_train: {y_train_missing} NaN values")
    
    if X_train_missing > 0 or X_test_missing > 0:
        print("   ⚠️ Found missing values!")
    else:
        print("   βœ… No missing values")
    
    # 3. Class distribution
    # NOTE: np.bincount requires non-negative integer labels; float labels
    # will raise a TypeError here.
    print("\n3. CLASS DISTRIBUTION")
    train_dist = np.bincount(y_train)
    test_dist = np.bincount(y_test)
    
    print(f"   Train: {train_dist}")
    print(f"   Test: {test_dist}")
    
    # Check for severe imbalance: minority/majority count ratio below 0.1
    # (i.e. worse than 1:10) is flagged.
    train_ratio = train_dist.min() / train_dist.max()
    if train_ratio < 0.1:
        print(f"   ⚠️ Severe class imbalance (ratio: {train_ratio:.2f})")
    else:
        print(f"   βœ… Balanced classes (ratio: {train_ratio:.2f})")
    
    # 4. Feature statistics
    # Aggregates over the whole matrix -- a quick scale check, not a
    # per-feature report.
    print("\n4. FEATURE STATISTICS")
    print(f"   Train - Mean: {X_train.mean():.3f}, Std: {X_train.std():.3f}")
    print(f"   Train - Range: [{X_train.min():.3f}, {X_train.max():.3f}]")
    print(f"   Test - Mean: {X_test.mean():.3f}, Std: {X_test.std():.3f}")
    print(f"   Test - Range: [{X_test.min():.3f}, {X_test.max():.3f}]")
    
    # Check for scale issues (heuristic threshold of +/-1000)
    if X_train.max() > 1000 or X_train.min() < -1000:
        print("   ⚠️ Large feature values - consider scaling")
    else:
        print("   βœ… Feature values in reasonable range")
    
    # 5. Duplicates
    # Exact duplicate feature rows in the training split can inflate
    # apparent performance and hint at data-collection issues.
    print("\n5. DUPLICATE CHECK")
    train_df = pd.DataFrame(X_train)
    duplicates = train_df.duplicated().sum()
    print(f"   Duplicate rows in train: {duplicates}")
    
    if duplicates > 0:
        print(f"   ⚠️ Found {duplicates} duplicates")
    else:
        print("   βœ… No duplicates")
    
    print("\n" + "=" * 60)
    print("SANITY CHECKS COMPLETE")
    print("=" * 60)

# Run sanity checks
# Uses the X_train/X_test/y_train/y_test left over from the verification
# loop above (the split produced by the last seed, 999).
perform_sanity_checks(X_train, X_test, y_train, y_test)

Part 3: Baseline ModelsΒΆ

Why Baselines MatterΒΆ

Always establish a baseline before investing in complex models. A DummyClassifier that always predicts the most frequent class tells you the floor – any useful model must beat this. If your Random Forest scores only marginally above the baseline, the problem might be in your features rather than your model choice. Baselines also catch data leakage: if a simple model achieves suspiciously perfect accuracy, your test data is probably contaminated with training information. The code below compares dummy strategies against real models on the Breast Cancer Wisconsin dataset.

# Load real dataset
data = load_breast_cancer()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features
# Fit the scaler on the training split only, then apply the SAME
# transform to the test split -- fitting on test data would leak
# test-set statistics into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Dataset: Breast Cancer Wisconsin")
print(f"Features: {X.shape[1]}, Samples: {X.shape[0]}")
print(f"Classes: {np.unique(y)}")
# Compare several trivial baseline strategies. 'most_frequent' always
# predicts the majority class; 'stratified' and 'uniform' predict at
# random (matching or ignoring the class distribution, respectively).
baselines = {
    name: DummyClassifier(strategy=strategy)
    for name, strategy in [
        ('Most Frequent', 'most_frequent'),
        ('Stratified', 'stratified'),
        ('Uniform', 'uniform'),
    ]
}

print("Baseline Model Performance:\n")
baseline_results = {}

for name, model in baselines.items():
    # fit() returns the estimator, so fit-and-score chains in one line
    acc = model.fit(X_train_scaled, y_train).score(X_test_scaled, y_test)
    baseline_results[name] = acc
    print(f"{name:15s}: {acc:.3f}")

best_baseline = max(baseline_results.values())
print(f"\nπŸ“Š Best Baseline: {best_baseline:.3f}")
print("\nπŸ’‘ Any model should beat this!")
# Simple model: linear decision boundary, fast to train
simple_model = LogisticRegression(max_iter=1000)
simple_model.fit(X_train_scaled, y_train)
simple_acc = simple_model.score(X_test_scaled, y_test)

# Complex model: 100-tree ensemble, nonlinear
complex_model = RandomForestClassifier(n_estimators=100, random_state=42)
complex_model.fit(X_train_scaled, y_train)
complex_acc = complex_model.score(X_test_scaled, y_test)

# Visualize comparison: bar chart of baseline vs. the two real models,
# with a dashed horizontal line marking the baseline threshold.
fig, ax = plt.subplots(figsize=(10, 6))

models = ['Baseline\n(Most Frequent)', 'Logistic\nRegression', 'Random\nForest']
accuracies = [best_baseline, simple_acc, complex_acc]
colors = ['red', 'orange', 'green']

bars = ax.bar(models, accuracies, color=colors, alpha=0.7, edgecolor='black')
ax.axhline(y=best_baseline, color='red', linestyle='--', label='Baseline Threshold')
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_ylim([0, 1])
ax.legend()

# Add value labels centered above each bar
for bar, acc in zip(bars, accuracies):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{acc:.3f}', ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

# Report each model's margin over the best baseline
print(f"\nβœ… Logistic Regression: {simple_acc:.3f} (+{simple_acc-best_baseline:.3f})")
print(f"βœ… Random Forest: {complex_acc:.3f} (+{complex_acc-best_baseline:.3f})")

Part 4: Debugging ChecklistΒΆ

When Your Model Isn’t WorkingΒΆ

Systematic checklists prevent you from chasing exotic explanations while ignoring mundane root causes. In practice, the majority of ML bugs fall into four buckets: data issues (misaligned arrays, leakage, missing values), model issues (wrong loss function, learning rate), code issues (shape mismatches, forgetting to zero gradients), and validation issues (evaluating on training data). Going through each category methodically eliminates possibilities and converges on the true cause much faster than ad-hoc investigation.

def debugging_checklist():
    """
    Print a categorized checklist of common ML bug sources.

    Covers four categories -- data, model, code, and validation issues --
    intended to be walked through top to bottom whenever a model
    misbehaves. Output goes to stdout; returns None.
    """
    sections = [
        ("Data Issues", [
            "βœ“ Features and labels aligned?",
            "βœ“ No data leakage (test data in preprocessing)?",
            "βœ“ Proper train/val/test split?",
            "βœ“ No missing values or handled properly?",
            "βœ“ Outliers checked and handled?",
            "βœ“ Features scaled/normalized if needed?",
            "βœ“ Class distribution reasonable?",
        ]),
        ("Model Issues", [
            "βœ“ Using correct loss function?",
            "βœ“ Learning rate appropriate?",
            "βœ“ Model capacity sufficient (not too simple)?",
            "βœ“ Model not too complex (overfitting)?",
            "βœ“ Regularization applied if needed?",
            "βœ“ Gradients flowing (for neural nets)?",
            "βœ“ Weights initialized properly?",
        ]),
        ("Code Issues", [
            "βœ“ Random seed set for reproducibility?",
            "βœ“ Model in correct mode (train vs eval)?",
            "βœ“ Gradients zeroed between steps?",
            "βœ“ Batching implemented correctly?",
            "βœ“ No shape mismatches?",
            "βœ“ Evaluation metrics computed correctly?",
        ]),
        ("Validation Issues", [
            "βœ“ Separate validation set used?",
            "βœ“ Not evaluating on training data?",
            "βœ“ Cross-validation used if dataset small?",
            "βœ“ Stratified split for imbalanced data?",
        ]),
    ]

    # Assemble the whole report first, then emit it in a single print;
    # the rendered output is identical to printing line by line.
    bar = "=" * 70
    lines = [bar, "ML DEBUGGING CHECKLIST", bar]
    for category, items in sections:
        lines.append("")
        lines.append(f"{category}:")
        lines.extend(f"  {item}" for item in items)
    lines += ["", bar, "Go through this list systematically when debugging!", bar]
    print("\n".join(lines))

debugging_checklist()

Part 5: Logging and InstrumentationΒΆ

Building Observable ML PipelinesΒΆ

Production ML systems need structured logging far beyond print statements. A good ML logger captures dataset metadata, training parameters, timing information, and metric snapshots at each stage – enabling post-hoc debugging without re-running experiments. The MLLogger class below demonstrates a reusable pattern that tracks experiment lifecycle from data loading through training to evaluation, automatically flagging common issues like overfitting (large train-test gap) and recording everything with timestamps for reproducible analysis.

import time
from datetime import datetime

class MLLogger:
    """
    Structured logger for ML experiment lifecycle events.

    Tracks wall-clock time from construction, records metric snapshots
    in ``self.metrics``, and emits timestamped records through a named
    ``logging`` logger (one console handler per logger name).
    """
    def __init__(self, experiment_name):
        """
        Args:
            experiment_name: Label used in the summary and as the
                ``logging`` logger name.
        """
        self.experiment_name = experiment_name
        self.start_time = time.time()
        # BUG FIX: initialize so log_training_end() no longer raises
        # AttributeError if log_training_start() was never called.
        self.training_start = self.start_time
        self.metrics = {}
        
        # Set up console logging
        self.logger = logging.getLogger(experiment_name)
        self.logger.setLevel(logging.INFO)
        # BUG FIX: stop records from also reaching the root logger --
        # with logging.basicConfig() active (as earlier in this file),
        # every message was otherwise emitted twice.
        self.logger.propagate = False
        
        # BUG FIX: loggers are process-wide singletons keyed by name, so
        # re-instantiating MLLogger with the same name used to stack an
        # additional StreamHandler each time, duplicating every line of
        # output. Only attach a handler if none exists yet.
        if not self.logger.handlers:
            console = logging.StreamHandler()
            console.setLevel(logging.INFO)
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            )
            console.setFormatter(formatter)
            self.logger.addHandler(console)
    
    def log_data_info(self, X_train, y_train, X_test, y_test):
        """Log dataset sizes, feature count, and class distribution."""
        self.logger.info("="*50)
        self.logger.info("DATA INFO")
        self.logger.info(f"Train samples: {X_train.shape[0]}")
        self.logger.info(f"Test samples: {X_test.shape[0]}")
        self.logger.info(f"Features: {X_train.shape[1]}")
        self.logger.info(f"Classes: {np.unique(y_train)}")
        self.logger.info(f"Class distribution: {np.bincount(y_train)}")
    
    def log_training_start(self, model_name, params):
        """Log training start and begin the training timer.

        Args:
            model_name: Human-readable model identifier.
            params: Dict of hyperparameters to record.
        """
        self.logger.info("="*50)
        self.logger.info(f"TRAINING: {model_name}")
        self.logger.info(f"Parameters: {params}")
        self.training_start = time.time()
    
    def log_training_end(self, train_score, test_score):
        """Log training completion and store scores/duration in metrics."""
        duration = time.time() - self.training_start
        self.logger.info(f"Training completed in {duration:.2f}s")
        self.logger.info(f"Train score: {train_score:.4f}")
        self.logger.info(f"Test score: {test_score:.4f}")
        
        self.metrics['train_score'] = train_score
        self.metrics['test_score'] = test_score
        self.metrics['training_time'] = duration
    
    def log_warning(self, message):
        """Log a warning-level message with a ⚠️ prefix."""
        self.logger.warning(f"⚠️ {message}")
    
    def log_error(self, message):
        """Log an error-level message with a ❌ prefix."""
        self.logger.error(f"❌ {message}")
    
    def summary(self):
        """Log total experiment time and all recorded metrics."""
        total_time = time.time() - self.start_time
        self.logger.info("="*50)
        self.logger.info("EXPERIMENT SUMMARY")
        self.logger.info(f"Experiment: {self.experiment_name}")
        self.logger.info(f"Total time: {total_time:.2f}s")
        for key, value in self.metrics.items():
            self.logger.info(f"{key}: {value}")
        self.logger.info("="*50)

# Example usage
# Instantiating the logger attaches a console handler under this
# experiment name.
ml_logger = MLLogger("breast_cancer_classification")

# Log data info (scaled breast-cancer splits from Part 3)
ml_logger.log_data_info(X_train_scaled, y_train, X_test_scaled, y_test)

# Log training parameters and start the training timer
ml_logger.log_training_start(
    "LogisticRegression",
    {"max_iter": 1000, "random_state": 42}
)

model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)

train_score = model.score(X_train_scaled, y_train)
test_score = model.score(X_test_scaled, y_test)

ml_logger.log_training_end(train_score, test_score)

# Check for issues
# Heuristic: a train-test accuracy gap above 0.1 suggests overfitting.
if train_score - test_score > 0.1:
    ml_logger.log_warning("Possible overfitting detected!")

# Summary
ml_logger.summary()

🎯 Key Takeaways¢

  1. Follow a systematic workflow: Reproduce β†’ Gather β†’ Hypothesize β†’ Test β†’ Verify

  2. Always use sanity checks before training complex models

  3. Establish baselines to know if your model is actually learning

  4. Use checklists to systematically eliminate common issues

  5. Instrument your code with logging for better debugging

  6. Start simple - test with small data, simple models first

  7. Be reproducible - set random seeds, document changes

πŸ“ Practice ExerciseΒΆ

Task: Debug the following intentionally buggy code:

# Buggy code - find and fix 3 bugs!
# NOTE: intentionally broken for the exercise -- do not "fix" this cell;
# the corrected version follows in the solution cell below.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

data = load_iris()
X, y = data.data, data.target

# Bug 1: Wrong test size
# (holding out 90% leaves only a tiny fraction of the data for training)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9)

# Bug 2: Using test data for preprocessing
# (the scaler's mean/std leak information from the test set)
scaler = StandardScaler()
scaler.fit(X_test)  # Should fit on X_train!
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Bug 3: Wrong labels
# (X_train and y_test come from opposite sides of the split, so their
# lengths differ and fit() raises a shape error)
model = DecisionTreeClassifier()
model.fit(X_train, y_test)  # Should be y_train!

Solution in next cell ↓

# Solution: Fixed code
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X, y = iris.data, iris.target

# Fix 1: hold out a conventional 20% for testing (0.2-0.3 is typical)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fix 2: learn scaling statistics from the training split only, then
# apply the same transform to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fix 3: train on the TRAINING labels
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# Evaluate on both splits
print(f"βœ… Train Accuracy: {model.score(X_train_scaled, y_train):.3f}")
print(f"βœ… Test Accuracy: {model.score(X_test_scaled, y_test):.3f}")

πŸš€ Next StepsΒΆ

  • Notebook 2: Data Issues Diagnosis

  • Notebook 3: Performance Profiling

  • Notebook 4: Model-Specific Debugging

  • Notebook 5: Error Analysis Framework

Great job! You’ve learned the fundamentals of ML debugging! πŸŽ‰