import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error, median_absolute_error
)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from scipy import stats

# Reproducibility: seed NumPy's global RNG so sampled data is identical across runs.
np.random.seed(42)
# Plot defaults for the whole notebook: seaborn whitegrid theme, 10x6-inch figures.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

print("βœ… Setup complete")

Part 1: Core Regression MetricsΒΆ

1. Mean Absolute Error (MAE)ΒΆ

Formula: MAE = mean(|actual - predicted|)

What it means: Average magnitude of errors

Units: Same as target variable

Pros:

  • Easy to interpret

  • Robust to outliers

  • All errors weighted equally

Cons:

  • Doesn’t penalize large errors more

2. Mean Squared Error (MSE)ΒΆ

Formula: MSE = mean((actual - predicted)Β²)

What it means: Average of squared errors

Units: Squared units of target

Pros:

  • Penalizes large errors more (squared)

  • Mathematically convenient

Cons:

  • Hard to interpret (squared units)

  • Very sensitive to outliers

3. Root Mean Squared Error (RMSE)ΒΆ

Formula: RMSE = sqrt(MSE)

What it means: Square root of MSE

Units: Same as target variable

Pros:

  • Interpretable (same units as target)

  • Penalizes large errors

Cons:

  • Sensitive to outliers

4. RΒ² Score (Coefficient of Determination)ΒΆ

Formula: RΒ² = 1 - (SS_residual / SS_total)

Range: -∞ to 1.0

Interpretation:

  • 1.0: Perfect predictions

  • 0.8-1.0: Strong model

  • 0.6-0.8: Moderate model

  • 0.4-0.6: Weak model

  • 0.0: No better than predicting the mean

  • < 0.0: Worse than predicting the mean!

# Create sample regression data: 100 samples, 1 feature, Gaussian noise (sd = 10)
X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)
# Hold out 30% for metric evaluation; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train model (ordinary least squares) and predict on the held-out set
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate all metrics on the test set only
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE = sqrt(MSE): back in the target's own units
r2 = r2_score(y_test, y_pred)

print("Regression Metrics:")
print("=" * 50)
print(f"MAE:  {mae:.3f} (average error magnitude)")
print(f"MSE:  {mse:.3f} (squared errors)")
print(f"RMSE: {rmse:.3f} (typical error size)")
print(f"RΒ²:   {r2:.3f} (variance explained)")

# Visualize predictions in two side-by-side panels
plt.figure(figsize=(12, 5))

# Left: actual and predicted targets plotted against the single feature
plt.subplot(1, 2, 1)
plt.scatter(X_test, y_test, alpha=0.6, label='Actual')
plt.scatter(X_test, y_pred, alpha=0.6, label='Predicted')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(alpha=0.3)

# Right: predicted vs actual; points on the dashed y = x line are perfect predictions
plt.subplot(1, 2, 2)
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 
         'r--', lw=2, label='Perfect Predictions')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title(f'Prediction Plot (RΒ² = {r2:.3f})')
plt.legend()
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

MAE vs RMSE: When Does the Distinction Matter?ΒΆ

Both MAE and RMSE measure prediction error in the same units as the target variable, but they respond very differently to outliers. MAE treats every error equally – a 1-unit miss and a 100-unit miss both contribute linearly. RMSE, because it squares errors before averaging, amplifies large mistakes: a single large outlier can dominate the entire metric.

Rule of Thumb:

  • RMSE \(\approx\) MAE: Errors are uniformly distributed; no single prediction is wildly off.

  • RMSE \(\gg\) MAE: A few predictions have very large errors. The ratio \(\text{RMSE}/\text{MAE}\) quantifies how β€œheavy-tailed” the error distribution is. If this ratio exceeds roughly 1.4, you likely have significant outlier predictions worth investigating individually.

In production ML, monitoring both metrics together is common practice – MAE for interpretable average performance, RMSE as a sentinel for catastrophic misses.

# Contrast MAE and RMSE on three synthetic error profiles: the more
# outlier-heavy the errors, the further RMSE pulls away from MAE.
scenarios = [
    {"name": "Consistent Errors", "errors": [1, 2, 1, 2, 1, 2, 1, 2]},
    {"name": "One Outlier", "errors": [1, 1, 1, 1, 1, 1, 1, 100]},
    {"name": "Many Outliers", "errors": [1, 1, 50, 1, 75, 1, 100, 1]},
]

print("MAE vs RMSE Comparison:")
print("=" * 60)

for scenario in scenarios:
    errors = np.array(scenario['errors'])
    # MAE averages |error|; RMSE averages error^2 then takes the root,
    # so large misses weigh disproportionately on RMSE.
    mae_val = np.abs(errors).mean()
    rmse_val = np.sqrt((errors ** 2).mean())
    ratio = rmse_val / mae_val

    print(f"\n{scenario['name']}:")
    print(f"  Errors: {errors.tolist()}")
    print(f"  MAE:  {mae_val:.2f}")
    print(f"  RMSE: {rmse_val:.2f}")
    print(f"  RMSE/MAE Ratio: {ratio:.2f}x")

    # Map the ratio onto a qualitative verdict before printing.
    if ratio < 1.2:
        verdict = "  β†’ Errors are consistent"
    elif ratio < 2.0:
        verdict = "  β†’ Some larger errors present"
    else:
        verdict = "  β†’ ⚠️ Significant outliers detected!"
    print(verdict)

Part 2: Understanding ResidualsΒΆ

What are Residuals?ΒΆ

Residual = Actual - Predicted

Good Model Characteristics:

  • βœ… Residuals randomly scattered around 0

  • βœ… No patterns in residual plot

  • βœ… Normally distributed residuals

  • βœ… Constant variance (homoscedasticity)

Bad Signs:

  • ❌ Patterns in residuals (non-linear relationship)

  • ❌ Increasing variance (heteroscedasticity)

  • ❌ Non-normal distribution

# Calculate residuals (actual - predicted); a well-specified model leaves
# residuals that look like zero-mean random noise
residuals = y_test - y_pred

# Create comprehensive residual analysis: six diagnostic panels
fig = plt.figure(figsize=(15, 10))

# 1. Residuals vs Predicted: curvature signals a missed non-linearity,
#    a funnel shape signals heteroscedasticity
plt.subplot(2, 3, 1)
plt.scatter(y_pred, residuals, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.grid(alpha=0.3)

# 2. Histogram of Residuals: should look roughly bell-shaped around 0
plt.subplot(2, 3, 2)
plt.hist(residuals, bins=20, edgecolor='black', alpha=0.7)
plt.xlabel('Residual Value')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
plt.grid(alpha=0.3)

# 3. Q-Q Plot: points tracking the diagonal indicate normally distributed residuals
plt.subplot(2, 3, 3)
stats.probplot(residuals, dist="norm", plot=plt)
plt.title('Q-Q Plot')
plt.grid(alpha=0.3)

# 4. Residuals vs Actual: same diagnostics as panel 1, seen from the target side
plt.subplot(2, 3, 4)
plt.scatter(y_test, residuals, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
plt.xlabel('Actual Values')
plt.ylabel('Residuals')
plt.title('Residuals vs Actual')
plt.grid(alpha=0.3)

# 5. Absolute Residuals vs predicted (scale-location): an upward trend means
#    error variance grows with the prediction magnitude
plt.subplot(2, 3, 5)
plt.scatter(y_pred, np.abs(residuals), alpha=0.6)
plt.xlabel('Predicted Values')
plt.ylabel('Absolute Residuals')
plt.title('Scale-Location Plot')
plt.grid(alpha=0.3)

# 6. Summary Statistics rendered as monospace text in the final panel
plt.subplot(2, 3, 6)
plt.axis('off')
summary_text = f"""
Residual Statistics:
{'='*30}
Mean:     {np.mean(residuals):>10.3f}
Std Dev:  {np.std(residuals):>10.3f}
Min:      {np.min(residuals):>10.3f}
Max:      {np.max(residuals):>10.3f}
Median:   {np.median(residuals):>10.3f}

Skewness: {stats.skew(residuals):>10.3f}
Kurtosis: {stats.kurtosis(residuals):>10.3f}
"""
plt.text(0.1, 0.5, summary_text, fontsize=11, family='monospace',
         verticalalignment='center')

plt.tight_layout()
plt.show()

# Normality test: Shapiro-Wilk's null hypothesis is "data are normal",
# so a LARGE p-value (> 0.05) is the good outcome here
shapiro_stat, shapiro_p = stats.shapiro(residuals)
print(f"\nShapiro-Wilk Test for Normality:")
print(f"  Statistic: {shapiro_stat:.4f}")
print(f"  P-value: {shapiro_p:.4f}")
if shapiro_p > 0.05:
    print("  βœ… Residuals appear normally distributed")
else:
    print("  ⚠️ Residuals may not be normally distributed")

Part 3: R-Squared ExplainedΒΆ

What Does RΒ² Really Mean?ΒΆ

RΒ² = 0.75 means:

  • Model explains 75% of the variance in the data

  • 25% of variance is unexplained (residual)

Components of RΒ²ΒΆ

# Breakdown RΒ² calculation step by step (should reproduce sklearn's r2_score)
y_mean = np.mean(y_test)

# Total sum of squares (variance in data around its own mean; the baseline
# a mean-only predictor would leave)
ss_total = np.sum((y_test - y_mean) ** 2)

# Residual sum of squares (variance the model leaves unexplained)
ss_residual = np.sum((y_test - y_pred) ** 2)

# RΒ² calculation: fraction of total variance the model explains
r2_manual = 1 - (ss_residual / ss_total)

print("RΒ² Breakdown:")
print("=" * 50)
print(f"Total Variance (SS_total):     {ss_total:.2f}")
print(f"Unexplained Variance (SS_res): {ss_residual:.2f}")
print(f"Explained Variance:            {ss_total - ss_residual:.2f}")
print(f"\nRΒ² = 1 - ({ss_residual:.2f} / {ss_total:.2f})")
print(f"RΒ² = {r2_manual:.3f}")
print(f"\nInterpretation: Model explains {r2_manual*100:.1f}% of variance")

Adjusted R-SquaredΒΆ

Problem with R-Squared: \(R^2\) always increases (or stays the same) when you add more features to a model, even if those features are pure random noise. A model with 100 irrelevant features will have a higher \(R^2\) than a model with 5 meaningful ones, simply because extra parameters give the model more degrees of freedom to fit the training data.

Solution: Adjusted \(R^2\) adds a penalty proportional to the number of features, discouraging unnecessary complexity.

Formula: \(R^2_{\text{adj}} = 1 - \frac{(1 - R^2)(n - 1)}{n - p - 1}\)

Where \(n\) is the number of samples and \(p\) is the number of features. When a new feature genuinely improves prediction, the numerator drops enough to offset the shrinking denominator. When it does not, the penalty term dominates and Adjusted \(R^2\) decreases. This makes it a far more reliable guide during feature selection than raw \(R^2\).

def adjusted_r2(r2, n_samples, n_features):
    """Compute the adjusted RΒ² statistic.

    Adjusted RΒ² penalizes RΒ² for model complexity so that adding
    uninformative features lowers (rather than inflates) the score.

    Args:
        r2: Ordinary RΒ² of the fitted model.
        n_samples: Number of observations (n).
        n_features: Number of predictors (p).

    Returns:
        Adjusted RΒ² = 1 - (1 - RΒ²) * (n - 1) / (n - p - 1).

    Raises:
        ValueError: If n_samples <= n_features + 1, where the formula's
            denominator is zero or negative and the statistic is undefined.
    """
    # Guard the degenerate case: previously this raised a cryptic
    # ZeroDivisionError (p == n - 1) or returned a sign-flipped,
    # meaningless value (p > n - 1).
    if n_samples <= n_features + 1:
        raise ValueError(
            "adjusted R^2 requires n_samples > n_features + 1 "
            f"(got n_samples={n_samples}, n_features={n_features})"
        )
    return 1 - ((1 - r2) * (n_samples - 1) / (n_samples - n_features - 1))

n_samples = len(y_test)       # n: number of held-out observations
n_features = X_test.shape[1]  # p: number of predictors used by the model

adj_r2 = adjusted_r2(r2, n_samples, n_features)

print(f"RΒ²:          {r2:.4f}")
print(f"Adjusted RΒ²: {adj_r2:.4f}")
print(f"\nWith {n_features} feature(s) and {n_samples} samples")

# Show impact of adding features while holding RΒ² (0.75) and n (100) fixed
print("\nπŸ“Š Impact of Adding Features:")
print("Features | RΒ² (example) | Adj RΒ²")
print("-" * 40)
for n_feat in [1, 5, 10, 20]:
    # Assuming RΒ² stays constant (unrealistic but illustrative)
    adj = adjusted_r2(0.75, 100, n_feat)
    print(f"{n_feat:8d} | {0.75:12.4f} | {adj:.4f}")

print("\nπŸ’‘ Notice: Adjusted RΒ² decreases as features increase!")

Part 4: Choosing the Right MetricΒΆ

Decision GuideΒΆ

Situation

Best Metric

Why

Outliers present

MAE or Median AE

Robust to outliers

Large errors bad

RMSE or MSE

Penalizes large errors

Interpretability

MAE or RMSE

Same units as target

Percentage errors

MAPE

Relative error, scale-independent

Model comparison

RΒ²

Normalized, easy to compare

Feature selection

Adjusted RΒ²

Accounts for model complexity

# Real-world example: House Price Prediction
np.random.seed(42)

# Generate synthetic house prices (in $1000s); the final prediction
# (1000 vs actual 600) is a deliberate large miss to expose metric sensitivity
actual_prices = np.array([200, 300, 250, 400, 350, 500, 280, 320, 450, 600])
predicted_prices = np.array([210, 290, 260, 380, 370, 520, 275, 310, 430, 1000])  # Note: large error on last

# Calculate metrics (sklearn's MAPE returns a fraction; scale to percent)
mae_house = mean_absolute_error(actual_prices, predicted_prices)
rmse_house = np.sqrt(mean_squared_error(actual_prices, predicted_prices))
mape_house = mean_absolute_percentage_error(actual_prices, predicted_prices) * 100
medae_house = median_absolute_error(actual_prices, predicted_prices)

print("House Price Prediction ($1000s):")
print("=" * 50)
print(f"MAE:        ${mae_house:.1f}k (typical error)")
print(f"RMSE:       ${rmse_house:.1f}k (penalizes large errors)")
print(f"Median AE:  ${medae_house:.1f}k (robust to outliers)")
print(f"MAPE:       {mape_house:.1f}% (percentage error)")

print("\n🏠 Interpretation for Stakeholders:")
print(f"  β€’ Typical prediction error: Β±${mae_house:.1f}k")
print(f"  β€’ Half of predictions within: Β±${medae_house:.1f}k")
print(f"  β€’ Average percentage error: {mape_house:.1f}%")

# Heuristic outlier alarm: RMSE well above MAE means a few large misses dominate
if rmse_house > mae_house * 1.5:
    print("\n⚠️ Warning: Large outlier errors detected!")
    print(f"   RMSE (${rmse_house:.1f}k) >> MAE (${mae_house:.1f}k)")

Part 5: Outlier HandlingΒΆ

Impact of Outliers on MetricsΒΆ

Outliers in predictions can arise from data entry errors, rare edge cases, or systematic model failures on certain subgroups. Their effect on evaluation metrics is dramatic and non-uniform – some metrics barely flinch while others are thrown off entirely. Understanding this sensitivity is critical when choosing which metric to report to stakeholders and which to use for model selection. The code below injects a single outlier prediction and measures how each metric responds, illustrating why robust metrics like Median Absolute Error are essential when your data has long tails.

# Create predictions with and without outliers: eight well-behaved pairs
# where every prediction misses by exactly 1
y_clean = np.array([10, 20, 15, 25, 18, 22, 16, 24])
pred_clean = np.array([11, 19, 16, 24, 17, 23, 15, 25])

# Add one outlier pair: actual 10 predicted as 100 (a single error of 90)
y_outlier = np.append(y_clean, 10)
pred_outlier = np.append(pred_clean, 100)  # Huge error!

# Compare metrics side by side, with and without the injected outlier
metrics_comparison = pd.DataFrame({
    'Metric': ['MAE', 'RMSE', 'Median AE'],
    'Without Outlier': [
        mean_absolute_error(y_clean, pred_clean),
        np.sqrt(mean_squared_error(y_clean, pred_clean)),
        median_absolute_error(y_clean, pred_clean)
    ],
    'With Outlier': [
        mean_absolute_error(y_outlier, pred_outlier),
        np.sqrt(mean_squared_error(y_outlier, pred_outlier)),
        median_absolute_error(y_outlier, pred_outlier)
    ]
})

# Percentage increase of each metric caused by the single outlier
metrics_comparison['% Change'] = (
    (metrics_comparison['With Outlier'] - metrics_comparison['Without Outlier']) / 
    metrics_comparison['Without Outlier'] * 100
)

print("Impact of One Outlier:")
print("=" * 60)
print(metrics_comparison.to_string(index=False))

print("\nπŸ“Š Key Insights:")
print("  β€’ RMSE is HIGHLY sensitive to outliers")
print("  β€’ MAE is moderately affected")
print("  β€’ Median AE is ROBUST to outliers")
print("\nπŸ’‘ Use Median AE when outliers are expected!")

Part 6: Advanced MetricsΒΆ

Mean Absolute Percentage Error (MAPE)ΒΆ

Formula: MAPE = mean(|actual - predicted| / |actual|) Γ— 100%

Pros:

  • Scale-independent (compare across datasets)

  • Easy to interpret (percentage)

Cons:

  • Undefined when actual = 0

  • Asymmetric (swapping actual and predicted changes the score; errors can exceed 100% when over-forecasting but are capped at 100% when under-forecasting a positive value)

# MAPE example with different scales: two datasets whose values differ by 100x
small_actual = np.array([10, 20, 15, 25])
small_pred = np.array([12, 18, 17, 23])

large_actual = np.array([1000, 2000, 1500, 2500])
large_pred = np.array([1200, 1800, 1700, 2300])

# Same RELATIVE errors, different scales: absolute errors are 2 vs 200,
# so MAE scales with the data while MAPE does not
mae_small = mean_absolute_error(small_actual, small_pred)
mae_large = mean_absolute_error(large_actual, large_pred)

# sklearn's MAPE returns a fraction; multiply by 100 to report a percentage
mape_small = mean_absolute_percentage_error(small_actual, small_pred) * 100
mape_large = mean_absolute_percentage_error(large_actual, large_pred) * 100

print("Comparing Different Scales:")
print("=" * 50)
print(f"Small Values (10-25):")
print(f"  MAE:  {mae_small:.2f}")
print(f"  MAPE: {mape_small:.1f}%")
print(f"\nLarge Values (1000-2500):")
print(f"  MAE:  {mae_large:.2f}")
print(f"  MAPE: {mape_large:.1f}%")

print("\nπŸ’‘ Notice: MAPE is the SAME! It's scale-independent.")

Symmetric MAPE (sMAPE)ΒΆ

Standard MAPE has an asymmetry problem: because its denominator is the actual value, swapping the roles of actual and predicted changes the score even when the absolute error is identical. If the actual value is 100 and you predict 150, MAPE is 50%. But if the actual value is 150 and you predict 100, MAPE is only 33.3% - even though both predictions miss by the same absolute amount of 50. sMAPE corrects this by using the average of the actual and predicted values in the denominator, making the score symmetric in its two arguments.

Formula: \(\text{sMAPE} = \frac{1}{n}\sum_{i=1}^{n} \frac{2|y_i - \hat{y}_i|}{|y_i| + |\hat{y}_i|} \times 100\%\)

sMAPE is bounded between 0% and 200%, and treats over- and under-predictions symmetrically. It is commonly used in forecasting competitions like the M-competitions where this symmetry property matters for fair model comparison.

def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error (sMAPE), in percent.

    Uses |actual| + |predicted| in the denominator, so the score is
    symmetric under swapping y_true and y_pred and bounded in [0, 200].

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values (same shape as y_true).

    Returns:
        sMAPE as a float percentage in [0, 200].
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.abs(y_true) + np.abs(y_pred)
    # Convention: when actual and predicted are both 0 the prediction is
    # perfect, so count that term as 0 instead of dividing by zero (the
    # original returned NaN with a runtime warning in that case).
    terms = np.divide(
        2.0 * np.abs(y_pred - y_true), denom,
        out=np.zeros_like(denom), where=denom != 0
    )
    return np.mean(terms) * 100

# Demonstrate MAPE's asymmetry correctly. The original compared an over- and
# an under-prediction against the SAME actual value (100 vs 150 and 100 vs 50),
# which yields identical MAPE scores of 50% (|error|/actual is 0.5 both times)
# while sMAPE differed -- the printed output contradicted the conclusion.
# MAPE's real asymmetry appears when the ROLES of actual and predicted swap:
# its denominator is the actual value, so the same absolute error scores
# differently, whereas sMAPE's averaged denominator is swap-invariant.
case_a_actual = np.array([100])
case_a_pred = np.array([150])   # actual 100, predicted 150 (|error| = 50)
case_b_actual = np.array([150])
case_b_pred = np.array([100])   # actual 150, predicted 100 (|error| = 50)

mape_a = mean_absolute_percentage_error(case_a_actual, case_a_pred) * 100
mape_b = mean_absolute_percentage_error(case_b_actual, case_b_pred) * 100

smape_a = smape(case_a_actual, case_a_pred)
smape_b = smape(case_b_actual, case_b_pred)

print("MAPE vs sMAPE:")
print("=" * 50)
print(f"Case A - actual: 100, predicted: 150 (|error| = 50):")
print(f"  MAPE:  {mape_a:.1f}%")
print(f"  sMAPE: {smape_a:.1f}%")
print(f"\nCase B - actual: 150, predicted: 100 (|error| = 50):")
print(f"  MAPE:  {mape_b:.1f}%")
print(f"  sMAPE: {smape_b:.1f}%")

print("\nπŸ’‘ Same absolute error, but MAPE differs (50.0% vs 33.3%),")
print("   while sMAPE is identical (40.0%) - it is symmetric in its arguments!")

🎯 Knowledge Check¢

Q1: When should you use MAE instead of RMSE?
Q2: What does RΒ² = 0.0 mean?
Q3: Why is MAPE problematic when actual values are near zero?
Q4: What pattern in residuals indicates model problems?

Click for answers

A1: When outliers are present or you want metric robust to large errors
A2: Model is no better than predicting the mean
A3: Division by zero or near-zero causes undefined/huge errors
A4: Any systematic pattern (U-shape, increasing variance, trend)

πŸ“š SummaryΒΆ

Quick Reference TableΒΆ

Metric

Best For

Avoid When

MAE

Interpretability, outliers

Need to penalize large errors

RMSE

Penalizing large errors

Many outliers present

RΒ²

Model comparison

Different scales/datasets

MAPE

Different scales

Actual values near zero

Median AE

Robust evaluation

Need sensitivity to all errors

πŸš€ Next StepsΒΆ

  1. Complete Regression Metrics Challenge

  2. Read Notebook 3: LLM Evaluation Metrics

  3. Practice residual analysis

  4. Try different metrics on your data

Great work! You can now properly evaluate regression models! πŸ“ˆ