# Imports and global setup for the regression-metrics notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error, median_absolute_error
)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from scipy import stats

# Fix the RNG seed so the sampled datasets (and every metric derived from them)
# are reproducible across runs.
np.random.seed(42)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)  # default size for all figures below
# FIX: the original line was mojibake-garbled ("β" split the string literal
# across two lines — a SyntaxError in a plain .py file). Restored to one line.
print("✅ Setup complete")
Part 1: Core Regression Metrics
1. Mean Absolute Error (MAE)
Formula: MAE = mean(|actual - predicted|)
What it means: Average magnitude of errors
Units: Same as target variable
Pros:
Easy to interpret
Robust to outliers
All errors weighted equally
Cons:
Doesn't penalize large errors more
2. Mean Squared Error (MSE)
Formula: MSE = mean((actual - predicted)²)
What it means: Average of squared errors
Units: Squared units of target
Pros:
Penalizes large errors more (squared)
Mathematically convenient
Cons:
Hard to interpret (squared units)
Very sensitive to outliers
3. Root Mean Squared Error (RMSE)
Formula: RMSE = sqrt(MSE)
What it means: Square root of MSE
Units: Same as target variable
Pros:
Interpretable (same units as target)
Penalizes large errors
Cons:
Sensitive to outliers
4. R² Score (Coefficient of Determination)
Formula: R² = 1 - (SS_residual / SS_total)
Range: -∞ to 1.0
Interpretation:
1.0: Perfect predictions
0.8-1.0: Strong model
0.6-0.8: Moderate model
0.4-0.6: Weak model
0.0: No better than predicting the mean
< 0.0: Worse than predicting the mean!
# Create sample regression data
# Single-feature linear data with Gaussian noise (std dev 10); the fixed seed
# makes the split, fit, and every metric below reproducible.
X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train model
model = LinearRegression()
model.fit(X_train, y_train)
# Held-out predictions; all metrics are computed on the test split only.
y_pred = model.predict(X_test)
# Calculate all metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is derived from MSE so both can be reported from one computation.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print("Regression Metrics:")
print("=" * 50)
print(f"MAE: {mae:.3f} (average error magnitude)")
print(f"MSE: {mse:.3f} (squared errors)")
print(f"RMSE: {rmse:.3f} (typical error size)")
print(f"RΒ²: {r2:.3f} (variance explained)")
# Visualize predictions
plt.figure(figsize=(12, 5))
# Left panel: actual and predicted targets plotted against the lone feature.
plt.subplot(1, 2, 1)
plt.scatter(X_test, y_test, alpha=0.6, label='Actual')
plt.scatter(X_test, y_pred, alpha=0.6, label='Predicted')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(alpha=0.3)
# Right panel: predicted vs actual; points on the dashed diagonal are perfect.
plt.subplot(1, 2, 2)
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
'r--', lw=2, label='Perfect Predictions')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title(f'Prediction Plot (RΒ² = {r2:.3f})')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
MAE vs RMSE: When Does the Distinction Matter?
Both MAE and RMSE measure prediction error in the same units as the target variable, but they respond very differently to outliers. MAE treats every error equally — a 1-unit miss and a 100-unit miss both contribute linearly. RMSE, because it squares errors before averaging, amplifies large mistakes: a single large outlier can dominate the entire metric.
Rule of Thumb:
RMSE \(\approx\) MAE: Errors are uniformly distributed; no single prediction is wildly off.
RMSE \(\gg\) MAE: A few predictions have very large errors. The ratio \(\text{RMSE}/\text{MAE}\) quantifies how "heavy-tailed" the error distribution is. If this ratio exceeds roughly 1.4, you likely have significant outlier predictions worth investigating individually.
In production ML, monitoring both metrics together is common practice — MAE for interpretable average performance, RMSE as a sentinel for catastrophic misses.
# Show how MAE and RMSE diverge as outliers enter the error distribution.
scenarios = [
    {"name": "Consistent Errors",
     "errors": [1, 2, 1, 2, 1, 2, 1, 2]},
    {"name": "One Outlier",
     "errors": [1, 1, 1, 1, 1, 1, 1, 100]},
    {"name": "Many Outliers",
     "errors": [1, 1, 50, 1, 75, 1, 100, 1]}
]
print("MAE vs RMSE Comparison:")
print("=" * 60)
for case in scenarios:
    errs = np.asarray(case["errors"])
    # MAE averages magnitudes; RMSE averages squares, then takes the root.
    mae_val = np.abs(errs).mean()
    rmse_val = np.sqrt((errs ** 2).mean())
    ratio = rmse_val / mae_val
    print(f"\n{case['name']}:")
    print(f" Errors: {errs.tolist()}")
    print(f" MAE: {mae_val:.2f}")
    print(f" RMSE: {rmse_val:.2f}")
    print(f" RMSE/MAE Ratio: {ratio:.2f}x")
    # Pick the verdict message from the ratio band.
    if ratio < 1.2:
        verdict = " β Errors are consistent"
    elif ratio < 2.0:
        verdict = " β Some larger errors present"
    else:
        verdict = " β β οΈ Significant outliers detected!"
    print(verdict)
Part 2: Understanding Residuals
What are Residuals?
Residual = Actual - Predicted
Good Model Characteristics:
✅ Residuals randomly scattered around 0
✅ No patterns in residual plot
✅ Normally distributed residuals
✅ Constant variance (homoscedasticity)
Bad Signs:
❌ Patterns in residuals (non-linear relationship)
❌ Increasing variance (heteroscedasticity)
❌ Non-normal distribution
# Residual diagnostics for the linear model fit above.
# Residual = actual - predicted; a well-specified model leaves residuals that
# are centered on zero, patternless, and roughly normal.
residuals = y_test - y_pred

# Six diagnostic panels in one figure. NOTE: pyplot is stateful, so the
# subplot() calls must precede the plotting calls for each panel in order.
fig = plt.figure(figsize=(15, 10))

# 1. Residuals vs Predicted — should be a patternless band around 0.
plt.subplot(2, 3, 1)
plt.scatter(y_pred, residuals, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.grid(alpha=0.3)

# 2. Histogram — roughly bell-shaped if errors are normal.
plt.subplot(2, 3, 2)
plt.hist(residuals, bins=20, edgecolor='black', alpha=0.7)
plt.xlabel('Residual Value')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
plt.grid(alpha=0.3)

# 3. Q-Q plot — points on the reference line indicate normality.
plt.subplot(2, 3, 3)
stats.probplot(residuals, dist="norm", plot=plt)
plt.title('Q-Q Plot')
plt.grid(alpha=0.3)

# 4. Residuals vs Actual — a trend here suggests systematic bias.
plt.subplot(2, 3, 4)
plt.scatter(y_test, residuals, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
plt.xlabel('Actual Values')
plt.ylabel('Residuals')
plt.title('Residuals vs Actual')
plt.grid(alpha=0.3)

# 5. Scale-location — a fan shape indicates heteroscedasticity.
plt.subplot(2, 3, 5)
plt.scatter(y_pred, np.abs(residuals), alpha=0.6)
plt.xlabel('Predicted Values')
plt.ylabel('Absolute Residuals')
plt.title('Scale-Location Plot')
plt.grid(alpha=0.3)

# 6. Text panel with summary statistics (axes turned off).
plt.subplot(2, 3, 6)
plt.axis('off')
summary_text = f"""
Residual Statistics:
{'='*30}
Mean: {np.mean(residuals):>10.3f}
Std Dev: {np.std(residuals):>10.3f}
Min: {np.min(residuals):>10.3f}
Max: {np.max(residuals):>10.3f}
Median: {np.median(residuals):>10.3f}
Skewness: {stats.skew(residuals):>10.3f}
Kurtosis: {stats.kurtosis(residuals):>10.3f}
"""
plt.text(0.1, 0.5, summary_text, fontsize=11, family='monospace',
verticalalignment='center')
plt.tight_layout()
plt.show()

# Shapiro-Wilk normality test: null hypothesis = residuals are normal, so a
# large p-value (> 0.05) means we fail to reject normality.
shapiro_stat, shapiro_p = stats.shapiro(residuals)
print(f"\nShapiro-Wilk Test for Normality:")
print(f" Statistic: {shapiro_stat:.4f}")
print(f" P-value: {shapiro_p:.4f}")
if shapiro_p > 0.05:
    # FIX: the original string was mojibake-garbled ("β" split the literal
    # across two lines — a SyntaxError); restored to a single line.
    print(" ✅ Residuals appear normally distributed")
else:
    print(" β οΈ Residuals may not be normally distributed")
Part 3: R-Squared Explained
What Does R² Really Mean?
R² = 0.75 means:
Model explains 75% of the variance in the data
25% of variance is unexplained (residual)
Components of R²
# Recompute R² by hand to show the variance decomposition behind it.
y_mean = np.mean(y_test)
# Total sum of squares: spread of the data around its own mean.
ss_total = np.sum(np.square(y_test - y_mean))
# Residual sum of squares: spread left unexplained by the model.
ss_residual = np.sum(np.square(y_test - y_pred))
# R² is the fraction of total variance the model accounts for.
r2_manual = 1.0 - ss_residual / ss_total

print("RΒ² Breakdown:")
print("=" * 50)
print(f"Total Variance (SS_total): {ss_total:.2f}")
print(f"Unexplained Variance (SS_res): {ss_residual:.2f}")
print(f"Explained Variance: {ss_total - ss_residual:.2f}")
print(f"\nRΒ² = 1 - ({ss_residual:.2f} / {ss_total:.2f})")
print(f"RΒ² = {r2_manual:.3f}")
print(f"\nInterpretation: Model explains {r2_manual*100:.1f}% of variance")
Adjusted R-Squared
Problem with R-Squared: \(R^2\) always increases (or stays the same) when you add more features to a model, even if those features are pure random noise. A model with 100 irrelevant features will have a higher \(R^2\) than a model with 5 meaningful ones, simply because extra parameters give the model more degrees of freedom to fit the training data.
Solution: Adjusted \(R^2\) adds a penalty proportional to the number of features, discouraging unnecessary complexity.
Formula: \[R^2_{\text{adj}} = 1 - \frac{(1 - R^2)(n - 1)}{n - p - 1}\]
Where \(n\) is the number of samples and \(p\) is the number of features. When a new feature genuinely improves prediction, the numerator drops enough to offset the shrinking denominator. When it does not, the penalty term dominates and Adjusted \(R^2\) decreases. This makes it a far more reliable guide during feature selection than raw \(R^2\).
def adjusted_r2(r2, n_samples, n_features):
    """Return the adjusted R², which penalizes R² for model complexity.

    Scales the unexplained fraction (1 - R²) by (n - 1) / (n - p - 1), so
    adding a feature only helps if it improves fit enough to offset the
    shrinking denominator.
    """
    unexplained = (1 - r2) * (n_samples - 1)
    return 1 - unexplained / (n_samples - n_features - 1)
# Compare raw and adjusted R² on the fitted model.
n_samples = len(y_test)
n_features = X_test.shape[1]
adj_r2 = adjusted_r2(r2, n_samples, n_features)

print(f"RΒ²: {r2:.4f}")
print(f"Adjusted RΒ²: {adj_r2:.4f}")
print(f"\nWith {n_features} feature(s) and {n_samples} samples")

# Illustrate the complexity penalty: hold R² fixed (unrealistic, but it
# isolates the penalty term) and vary only the feature count.
print("\nπ Impact of Adding Features:")
print("Features | RΒ² (example) | Adj RΒ²")
print("-" * 40)
demo_r2, demo_n = 0.75, 100
for n_feat in (1, 5, 10, 20):
    print(f"{n_feat:8d} | {demo_r2:12.4f} | {adjusted_r2(demo_r2, demo_n, n_feat):.4f}")
print("\nπ‘ Notice: Adjusted RΒ² decreases as features increase!")
Part 4: Choosing the Right Metric
Decision Guide
| Situation | Best Metric | Why |
|---|---|---|
| Outliers present | MAE or Median AE | Robust to outliers |
| Large errors bad | RMSE or MSE | Penalizes large errors |
| Interpretability | MAE or RMSE | Same units as target |
| Percentage errors | MAPE | Relative error, scale-independent |
| Model comparison | R² | Normalized, easy to compare |
| Feature selection | Adjusted R² | Accounts for model complexity |
# Real-world example: house-price prediction, prices in $1000s.
np.random.seed(42)

# Synthetic sale prices; the final prediction is deliberately a huge miss
# (1000 vs 600) to show how each metric reacts to one catastrophic error.
actual_prices = np.array([200, 300, 250, 400, 350, 500, 280, 320, 450, 600])
predicted_prices = np.array([210, 290, 260, 380, 370, 520, 275, 310, 430, 1000])

mae_house = mean_absolute_error(actual_prices, predicted_prices)
rmse_house = np.sqrt(mean_squared_error(actual_prices, predicted_prices))
mape_house = 100 * mean_absolute_percentage_error(actual_prices, predicted_prices)
medae_house = median_absolute_error(actual_prices, predicted_prices)

print("House Price Prediction ($1000s):")
print("=" * 50)
print(f"MAE: ${mae_house:.1f}k (typical error)")
print(f"RMSE: ${rmse_house:.1f}k (penalizes large errors)")
print(f"Median AE: ${medae_house:.1f}k (robust to outliers)")
print(f"MAPE: {mape_house:.1f}% (percentage error)")

print("\nπ Interpretation for Stakeholders:")
print(f" β’ Typical prediction error: Β±${mae_house:.1f}k")
print(f" β’ Half of predictions within: Β±${medae_house:.1f}k")
print(f" β’ Average percentage error: {mape_house:.1f}%")

# RMSE far above MAE signals at least one catastrophic miss.
if rmse_house > 1.5 * mae_house:
    print("\nβ οΈ Warning: Large outlier errors detected!")
    print(f" RMSE (${rmse_house:.1f}k) >> MAE (${mae_house:.1f}k)")
Part 5: Outlier Handling
Impact of Outliers on Metrics
Outliers in predictions can arise from data entry errors, rare edge cases, or systematic model failures on certain subgroups. Their effect on evaluation metrics is dramatic and non-uniform — some metrics barely flinch while others are thrown off entirely. Understanding this sensitivity is critical when choosing which metric to report to stakeholders and which to use for model selection. The code below injects a single outlier prediction and measures how each metric responds, illustrating why robust metrics like Median Absolute Error are essential when your data has long tails.
# Measure how one huge miss shifts each metric.
y_clean = np.array([10, 20, 15, 25, 18, 22, 16, 24])
pred_clean = np.array([11, 19, 16, 24, 17, 23, 15, 25])

# Append a single observation whose prediction is wildly wrong (100 vs 10).
y_outlier = np.append(y_clean, 10)
pred_outlier = np.append(pred_clean, 100)

# Evaluate each metric on both datasets via a name->function table.
_metric_fns = [
    ('MAE', mean_absolute_error),
    ('RMSE', lambda yt, yp: np.sqrt(mean_squared_error(yt, yp))),
    ('Median AE', median_absolute_error),
]
metrics_comparison = pd.DataFrame({
    'Metric': [name for name, _ in _metric_fns],
    'Without Outlier': [fn(y_clean, pred_clean) for _, fn in _metric_fns],
    'With Outlier': [fn(y_outlier, pred_outlier) for _, fn in _metric_fns],
})
metrics_comparison['% Change'] = (
    (metrics_comparison['With Outlier'] - metrics_comparison['Without Outlier']) /
    metrics_comparison['Without Outlier'] * 100
)

print("Impact of One Outlier:")
print("=" * 60)
print(metrics_comparison.to_string(index=False))
print("\nπ Key Insights:")
print(" β’ RMSE is HIGHLY sensitive to outliers")
print(" β’ MAE is moderately affected")
print(" β’ Median AE is ROBUST to outliers")
print("\nπ‘ Use Median AE when outliers are expected!")
Part 6: Advanced Metrics
Mean Absolute Percentage Error (MAPE)
Formula: MAPE = mean(|actual - predicted| / |actual|) × 100%
Pros:
Scale-independent (compare across datasets)
Easy to interpret (percentage)
Cons:
Undefined when actual = 0
Asymmetric (penalizes over-predictions more)
# Identical relative errors at two very different scales:
# MAE changes with scale, MAPE does not.
small_actual = np.array([10, 20, 15, 25])
small_pred = np.array([12, 18, 17, 23])
large_actual = np.array([1000, 2000, 1500, 2500])
large_pred = np.array([1200, 1800, 1700, 2300])

# Metrics for the small-valued dataset.
mae_small = mean_absolute_error(small_actual, small_pred)
mape_small = 100 * mean_absolute_percentage_error(small_actual, small_pred)
# Metrics for the large-valued dataset (same percentage misses).
mae_large = mean_absolute_error(large_actual, large_pred)
mape_large = 100 * mean_absolute_percentage_error(large_actual, large_pred)

print("Comparing Different Scales:")
print("=" * 50)
print(f"Small Values (10-25):")
print(f" MAE: {mae_small:.2f}")
print(f" MAPE: {mape_small:.1f}%")
print(f"\nLarge Values (1000-2500):")
print(f" MAE: {mae_large:.2f}")
print(f" MAPE: {mape_large:.1f}%")
print("\nπ‘ Notice: MAPE is the SAME! It's scale-independent.")
Symmetric MAPE (sMAPE)
Standard MAPE is asymmetric because its denominator is the actual value alone: an under-prediction can contribute at most 100% error (predicting 0 for an actual of 100 gives 100%), while an over-prediction is unbounded (predicting 300 for an actual of 100 gives 200%). sMAPE addresses this by using the average of the actual and predicted values in the denominator.
Formula: \[\text{sMAPE} = \frac{1}{n}\sum_{i=1}^{n} \frac{2|y_i - \hat{y}_i|}{|y_i| + |\hat{y}_i|} \times 100\%\]
sMAPE is bounded between 0% and 200%, which prevents any single over-prediction from blowing up the metric. It is commonly used in forecasting competitions like the M-competitions. Note, however, that sMAPE is not perfectly symmetric either: for a fixed actual value, an over- and under-prediction of the same absolute size yield different sMAPE values, as the code below demonstrates.
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, as a percentage in [0, 200].

    Each term divides twice the absolute error by the sum of the absolute
    actual and predicted values, then the terms are averaged.
    """
    abs_err = np.abs(y_pred - y_true)
    denom = np.abs(y_true) + np.abs(y_pred)
    return 100 * np.mean(2.0 * abs_err / denom)
# Compare MAPE and sMAPE on a single observation missed high vs missed low.
actual = np.array([100])
over_pred = np.array([150])   # misses high by 50
under_pred = np.array([50])   # misses low by 50

mape_over = 100 * mean_absolute_percentage_error(actual, over_pred)
mape_under = 100 * mean_absolute_percentage_error(actual, under_pred)
smape_over = smape(actual, over_pred)
smape_under = smape(actual, under_pred)

# NOTE(review): with these inputs MAPE prints 50.0% for both cases while
# sMAPE prints 40.0% vs 66.7%, so the closing claim of sMAPE symmetry does
# not hold numerically here — worth revisiting the example.
print("MAPE vs sMAPE:")
print("=" * 50)
print(f"Actual: 100")
print(f"\nOver-prediction (150):")
print(f" MAPE: {mape_over:.1f}%")
print(f" sMAPE: {smape_over:.1f}%")
print(f"\nUnder-prediction (50):")
print(f" MAPE: {mape_under:.1f}%")
print(f" sMAPE: {smape_under:.1f}%")
print("\nπ‘ sMAPE treats over/under predictions equally!")
🎯 Knowledge Check
Q1: When should you use MAE instead of RMSE?
Q2: What does R² = 0.0 mean?
Q3: Why is MAPE problematic when actual values are near zero?
Q4: What pattern in residuals indicates model problems?
Click for answers
A1: When outliers are present or you want a metric robust to large errors
A2: The model is no better than predicting the mean
A3: Division by zero or near-zero causes undefined/huge errors
A4: Any systematic pattern (U-shape, increasing variance, trend)
📝 Summary
Quick Reference Table
| Metric | Best For | Avoid When |
|---|---|---|
| MAE | Interpretability, outliers | Need to penalize large errors |
| RMSE | Penalizing large errors | Many outliers present |
| R² | Model comparison | Different scales/datasets |
| MAPE | Different scales | Actual values near zero |
| Median AE | Robust evaluation | Need sensitivity to all errors |
🚀 Next Steps
Complete Regression Metrics Challenge
Read Notebook 3: LLM Evaluation Metrics
Practice residual analysis
Try different metrics on your data
Great work! You can now properly evaluate regression models! 🎉