# Setup
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
sns.set_style('whitegrid')  # seaborn theme applied to all figures below
np.random.seed(42)  # fix the global RNG so every run reproduces the same data
np.set_printoptions(precision=4, suppress=True)  # compact, non-scientific array printing
## 9.1 Problem Formulation

### Linear Model

\(y = \mathbf{w}^T \mathbf{x} + \varepsilon\)

where:
- x: input features (d-dimensional)
- w: weights/parameters
- ε: noise (typically Gaussian)
- y: target output

### Matrix Form

For N data points:

\(\mathbf{y} = \mathbf{X}\mathbf{w} + \boldsymbol{\varepsilon}\)

where:
- X: N×d design matrix (each row is one data point)
- y: N×1 target vector
- w: d×1 weight vector
# Generate synthetic data
np.random.seed(42)


def generate_linear_data(n_samples=100, n_features=1, noise=0.1):
    """Draw a random linear-regression dataset y = X w_true + noise * eps.

    Parameters
    ----------
    n_samples : int, number of data points (rows of X).
    n_features : int, dimensionality of the inputs.
    noise : float, standard-deviation scale of the additive Gaussian noise.

    Returns
    -------
    (X, y, w_true) with shapes (n_samples, n_features), (n_samples, 1),
    (n_features, 1).  RNG draws happen in the order X, w_true, noise.
    """
    X = np.random.randn(n_samples, n_features)
    # Ground-truth weights, scaled up so the signal dominates the noise.
    w_true = np.random.randn(n_features, 1) * 2
    # Targets: linear signal plus isotropic Gaussian noise.
    y = X @ w_true + noise * np.random.randn(n_samples, 1)
    return X, y, w_true
# 1D example for visualization: a single feature so the fit can be drawn in 2D.
X, y, w_true = generate_linear_data(n_samples=50, n_features=1, noise=0.5)
print("Linear Regression Data")
print(f"\nData shape: X={X.shape}, y={y.shape}")
print(f"True weight w = {w_true[0, 0]:.4f}")
print(f"\nFirst 5 samples:")
print("X y")
print("-" * 20)
for i in range(5):
    # Show a few (x, y) pairs in fixed-width columns.
    print(f"{X[i, 0]:6.3f} {y[i, 0]:6.3f}")
# Visualize the raw data together with the noise-free true line.
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.6, s=50, label='Data points')
# Dense grid spanning the observed x-range for drawing the line.
x_line = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
y_line = x_line @ w_true
plt.plot(x_line, y_line, 'r-', linewidth=2, label=f'True: y = {w_true[0,0]:.2f}x')
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Linear Regression: Data Generation', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()
## 9.2 Maximum Likelihood Estimation (MLE)

### Likelihood Function

Assuming Gaussian noise ε ~ N(0, σ²):

\(p(\mathbf{y} \mid \mathbf{X}, \mathbf{w}) = \prod_{i=1}^N \mathcal{N}(y_i \mid \mathbf{w}^T\mathbf{x}_i, \sigma^2)\)

### Log-Likelihood

\(\log p(\mathbf{y} \mid \mathbf{X}, \mathbf{w}) = -\frac{1}{2\sigma^2}\sum_{i=1}^N (y_i - \mathbf{w}^T\mathbf{x}_i)^2 + \text{const}\)

### MLE Solution (Normal Equations)

\(\mathbf{w}_{MLE} = (\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\mathbf{y}\)

Key: MLE minimizes the sum of squared errors!
# Maximum Likelihood Estimation
def fit_mle(X, y):
    """
    Fit linear regression by maximum likelihood (least squares).

    Solves the normal equations  (X^T X) w = X^T y  for the bias-augmented
    design matrix.  Uses np.linalg.solve rather than forming the explicit
    inverse (X^T X)^{-1}: solving the linear system is cheaper and
    numerically more stable when X^T X is ill-conditioned, and gives the
    same result.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Input features (without a bias column; one is added here).
    y : ndarray of shape (n_samples, 1)
        Target values.

    Returns
    -------
    ndarray of shape (n_features + 1, 1)
        Column vector [intercept, w_1, ..., w_d].
    """
    # Prepend a column of ones so the intercept is learned jointly.
    X_bias = np.hstack([np.ones((X.shape[0], 1)), X])
    # Normal equations: (X^T X) w = X^T y
    return np.linalg.solve(X_bias.T @ X_bias, X_bias.T @ y)
# Fit the model via the normal equations and report the estimates.
w_mle = fit_mle(X, y)
print("Maximum Likelihood Estimation")
print(f"\nEstimated parameters:")
# NOTE: subscripts below were mojibake ('wβ') in the export; restored to w₀/w₁.
print(f" Intercept (w₀): {w_mle[0, 0]:.4f}")
print(f" Slope (w₁): {w_mle[1, 0]:.4f}")
print(f"\nTrue slope: {w_true[0, 0]:.4f}")
# Predictions on the training inputs (bias column prepended to match fit_mle).
X_bias = np.hstack([np.ones((X.shape[0], 1)), X])
y_pred = X_bias @ w_mle
# Training mean squared error.
mse = np.mean((y - y_pred)**2)
print(f"\nMean Squared Error: {mse:.4f}")
# Visualize the MLE fit and its residuals side by side.
plt.figure(figsize=(12, 5))
# Left panel: data, true generating line, and the MLE fit.
plt.subplot(1, 2, 1)
plt.scatter(X, y, alpha=0.6, s=50, label='Data')
x_line = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
X_line_bias = np.hstack([np.ones((x_line.shape[0], 1)), x_line])
y_line_pred = X_line_bias @ w_mle
y_line_true = x_line @ w_true
plt.plot(x_line, y_line_true, 'r--', linewidth=2, label='True model')
plt.plot(x_line, y_line_pred, 'b-', linewidth=2, label='MLE fit')
plt.xlabel('x')
plt.ylabel('y')
plt.title('MLE Linear Regression')
plt.legend()
plt.grid(True, alpha=0.3)
# Right panel: residuals vs predictions — a standard diagnostic; structure
# here would indicate model misspecification.
plt.subplot(1, 2, 2)
residuals = y - y_pred
plt.scatter(y_pred, residuals, alpha=0.6, s=50)
plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
plt.xlabel('Predicted y')
plt.ylabel('Residuals (y - ŷ)')  # label was mojibake ('Ε·'); restored to 'ŷ'
plt.title('Residual Plot')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("\nResiduals should be randomly scattered around zero!")
## 9.3 Regularization: Ridge and Lasso

### Ridge Regression (L2 regularization)

\(\min_{\mathbf{w}} \; \|\mathbf{y} - \mathbf{X}\mathbf{w}\|_2^2 + \lambda\|\mathbf{w}\|_2^2\)

Closed form: \(\mathbf{w}_{Ridge} = (\mathbf{X}^T\mathbf{X} + \lambda\mathbf{I})^{-1}\mathbf{X}^T\mathbf{y}\)

### Lasso Regression (L1 regularization)

\(\min_{\mathbf{w}} \; \|\mathbf{y} - \mathbf{X}\mathbf{w}\|_2^2 + \lambda\|\mathbf{w}\|_1\)

Effect: L1 produces sparse solutions (feature selection)!
# Demonstrate overfitting and regularization
# Generate data from a nonlinear (sinusoidal) ground truth.
np.random.seed(42)
n = 30
X_poly = np.linspace(-3, 3, n).reshape(-1, 1)
y_poly = np.sin(X_poly) + 0.3 * np.random.randn(n, 1)
# Fit polynomials of increasing degree to visualize under/overfitting.
degrees = [1, 3, 9, 15]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()
for idx, degree in enumerate(degrees):
    # Expand inputs into polynomial basis features (includes the bias column).
    poly = PolynomialFeatures(degree=degree, include_bias=True)
    X_poly_features = poly.fit_transform(X_poly)
    # Least-squares fit; lstsq is robust to the near-rank-deficient
    # feature matrices produced by high degrees.
    w = np.linalg.lstsq(X_poly_features, y_poly, rcond=None)[0]
    # Predict on a fine grid for a smooth curve.
    X_test = np.linspace(-3, 3, 200).reshape(-1, 1)
    X_test_features = poly.transform(X_test)
    y_test_pred = X_test_features @ w
    # Plot data, fitted polynomial, and the true sin function.
    axes[idx].scatter(X_poly, y_poly, alpha=0.6, s=50, label='Data')
    axes[idx].plot(X_test, y_test_pred, 'r-', linewidth=2, label=f'Degree {degree}')
    axes[idx].plot(X_test, np.sin(X_test), 'g--', linewidth=2, alpha=0.5, label='True function')
    axes[idx].set_xlabel('x')
    axes[idx].set_ylabel('y')
    axes[idx].set_title(f'Polynomial Degree {degree}')
    axes[idx].legend()
    axes[idx].grid(True, alpha=0.3)
    axes[idx].set_ylim([-2, 2])
plt.tight_layout()
plt.show()
print("Observation:")
print(" Degree 1: Underfitting (too simple)")
print(" Degree 3: Good fit")
print(" Degree 9: Starting to overfit")
print(" Degree 15: Severe overfitting (wild oscillations)")
# Ridge vs Lasso regularization
# High-dimensional example: many features, but only a few truly active,
# the setting where Lasso's sparsity shines.
np.random.seed(42)
n_samples = 100
n_features = 50
X_high = np.random.randn(n_samples, n_features)
# True model uses only 5 of the 50 features.
w_true_sparse = np.zeros((n_features, 1))
active_features = [5, 10, 15, 20, 25]
w_true_sparse[active_features] = np.random.randn(5, 1) * 2
y_high = X_high @ w_true_sparse + 0.5 * np.random.randn(n_samples, 1)
# Hold out 30% for measuring generalization error.
X_train, X_test, y_train, y_test = train_test_split(
    X_high, y_high, test_size=0.3, random_state=42
)
# Sweep the regularization strength for both estimators.
lambdas = [0.001, 0.01, 0.1, 1.0, 10.0]
results = {'lambda': [], 'ridge_mse': [], 'lasso_mse': [], 'lasso_sparsity': []}
for lam in lambdas:
    # Ridge (L2)
    ridge = Ridge(alpha=lam)
    ridge.fit(X_train, y_train)
    y_pred_ridge = ridge.predict(X_test)
    mse_ridge = mean_squared_error(y_test, y_pred_ridge)
    # Lasso (L1); extra iterations help convergence at small alpha.
    lasso = Lasso(alpha=lam, max_iter=10000)
    lasso.fit(X_train, y_train)
    y_pred_lasso = lasso.predict(X_test)
    mse_lasso = mean_squared_error(y_test, y_pred_lasso)
    # Sparsity: count coefficients that are effectively non-zero.
    sparsity = np.sum(np.abs(lasso.coef_) > 1e-5)
    results['lambda'].append(lam)
    results['ridge_mse'].append(mse_ridge)
    results['lasso_mse'].append(mse_lasso)
    results['lasso_sparsity'].append(sparsity)
# Visualize test error and Lasso sparsity across the sweep.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Test MSE comparison (log-scaled lambda axis).
axes[0].semilogx(results['lambda'], results['ridge_mse'], 'bo-', linewidth=2,
                 markersize=8, label='Ridge')
axes[0].semilogx(results['lambda'], results['lasso_mse'], 'rs-', linewidth=2,
                 markersize=8, label='Lasso')
axes[0].set_xlabel('λ (regularization strength)', fontsize=12)  # was mojibake 'Ξ»'
axes[0].set_ylabel('Test MSE', fontsize=12)
axes[0].set_title('Ridge vs Lasso: Test Error')
axes[0].legend(fontsize=11)
axes[0].grid(True, alpha=0.3)
# Number of features Lasso keeps vs the true count (5).
axes[1].semilogx(results['lambda'], results['lasso_sparsity'], 'gs-',
                 linewidth=2, markersize=8)
axes[1].axhline(y=5, color='r', linestyle='--', linewidth=2, label='True # features')
axes[1].set_xlabel('λ (regularization strength)', fontsize=12)
axes[1].set_ylabel('# Non-zero coefficients', fontsize=12)
axes[1].set_title('Lasso Sparsity (Feature Selection)')
axes[1].legend(fontsize=11)
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Ridge vs Lasso:")
print("\nRidge (L2):")
print(" - Shrinks all coefficients")
print(" - Never exactly zero")
print(" - Good when all features are relevant")
print("\nLasso (L1):")
print(" - Sets some coefficients to exactly zero")
print(" - Automatic feature selection")
print(" - Good for high-dimensional sparse problems")
print(f"\nTrue model uses {len(active_features)} features")
print(f"Lasso with λ=0.1 selects {results['lasso_sparsity'][2]} features")
## 9.4 Bayesian Linear Regression

### Prior on Weights

\(p(\mathbf{w}) = \mathcal{N}(\mathbf{w} \mid \mathbf{0}, \alpha^{-1}\mathbf{I})\)

### Posterior (after seeing data)

\(p(\mathbf{w} \mid \mathbf{X}, \mathbf{y}) = \mathcal{N}(\mathbf{w} \mid \mathbf{m}_N, \mathbf{S}_N)\)

where:
- \(\mathbf{m}_N = \beta \mathbf{S}_N \mathbf{X}^T \mathbf{y}\) (posterior mean)
- \(\mathbf{S}_N = (\alpha\mathbf{I} + \beta\mathbf{X}^T\mathbf{X})^{-1}\) (posterior covariance)

### Predictive Distribution

\(p(y_* \mid \mathbf{x}_*, \mathbf{X}, \mathbf{y}) = \mathcal{N}\!\left(y_* \mid \mathbf{m}_N^T\mathbf{x}_*, \; \sigma_N^2(\mathbf{x}_*)\right)\) with \(\sigma_N^2(\mathbf{x}_*) = \frac{1}{\beta} + \mathbf{x}_*^T \mathbf{S}_N \mathbf{x}_*\)

Benefit: Quantifies uncertainty in predictions!
# Bayesian Linear Regression
class BayesianLinearRegression:
    """Bayesian linear regression with a zero-mean isotropic Gaussian prior.

    Prior: w ~ N(0, alpha^{-1} I).  Observation noise has precision beta.
    After fit(), the posterior over weights is N(m_N, S_N).
    """

    def __init__(self, alpha=1.0, beta=1.0):
        """
        Parameters:
        -----------
        alpha : float
            Precision of prior on weights (higher = stronger regularization)
        beta : float
            Precision of noise (higher = less noise)
        """
        self.alpha = alpha
        self.beta = beta
        self.m_N = None  # posterior mean, set by fit()
        self.S_N = None  # posterior covariance, set by fit()

    def fit(self, X, y):
        """Compute the Gaussian posterior over weights from data (X, y)."""
        # Prepend a bias column so the intercept is inferred too.
        design = np.hstack([np.ones((X.shape[0], 1)), X])
        n_params = design.shape[1]
        # Posterior precision: αI + β Φᵀ Φ; invert to get covariance S_N.
        posterior_precision = self.alpha * np.eye(n_params) + self.beta * design.T @ design
        self.S_N = np.linalg.inv(posterior_precision)
        # Posterior mean: m_N = β S_N Φᵀ y.
        self.m_N = self.beta * (self.S_N @ (design.T @ y))
        return self

    def predict(self, X, return_std=False):
        """
        Predictive mean for inputs X; with return_std=True also the
        per-point predictive standard deviation.
        """
        design = np.hstack([np.ones((X.shape[0], 1)), X])
        mean = design @ self.m_N
        if not return_std:
            return mean
        # Predictive variance: σ²(x) = 1/β (noise) + xᵀ S_N x (weight uncertainty).
        variance = 1.0 / self.beta + np.sum((design @ self.S_N) * design,
                                            axis=1, keepdims=True)
        return mean, np.sqrt(variance)
# Fit Bayesian model (beta=25 corresponds to noise std of 1/5 = 0.2)
bayes_reg = BayesianLinearRegression(alpha=1.0, beta=25.0)
bayes_reg.fit(X, y)
# Predictions with uncertainty; the grid extends 1 unit past the data so
# the widening confidence band outside the observed range is visible.
X_test = np.linspace(X.min() - 1, X.max() + 1, 200).reshape(-1, 1)
y_mean, y_std = bayes_reg.predict(X_test, return_std=True)
print("Bayesian Linear Regression")
print(f"\nPosterior mean (weights):")
print(f" Intercept: {bayes_reg.m_N[0, 0]:.4f}")
print(f" Slope: {bayes_reg.m_N[1, 0]:.4f}")
print(f"\nPosterior covariance:")
print(bayes_reg.S_N)
# Visualize posterior mean with a ±2·std band (~95% for a Gaussian).
plt.figure(figsize=(12, 6))
plt.scatter(X, y, alpha=0.6, s=50, label='Data', zorder=3)
plt.plot(X_test, y_mean, 'b-', linewidth=2, label='Posterior mean')
plt.fill_between(X_test.ravel(),
                 (y_mean - 2*y_std).ravel(),
                 (y_mean + 2*y_std).ravel(),
                 alpha=0.3, color='blue', label='95% confidence')
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Bayesian Linear Regression with Uncertainty', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()
print("\nKey insight: Uncertainty is higher where we have less data!")
print("This is automatic with Bayesian approach.")
# Show effect of more data on uncertainty: refit the same model on
# increasingly large datasets and watch the confidence band shrink.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
n_samples_list = [5, 20, 100]
for idx, n in enumerate(n_samples_list):
    # Fresh dataset of size n from the same generative process.
    X_n, y_n, _ = generate_linear_data(n_samples=n, n_features=1, noise=0.5)
    # Fit a Bayesian model with the same hyperparameters each time.
    bayes = BayesianLinearRegression(alpha=1.0, beta=25.0)
    bayes.fit(X_n, y_n)
    # Predict on the shared test grid (X_test from the previous cell).
    y_mean, y_std = bayes.predict(X_test, return_std=True)
    # Plot data, posterior mean, and the ±2·std band.
    axes[idx].scatter(X_n, y_n, alpha=0.6, s=50, color='red', zorder=3)
    axes[idx].plot(X_test, y_mean, 'b-', linewidth=2)
    axes[idx].fill_between(X_test.ravel(),
                           (y_mean - 2*y_std).ravel(),
                           (y_mean + 2*y_std).ravel(),
                           alpha=0.3, color='blue')
    axes[idx].set_xlabel('x', fontsize=12)
    axes[idx].set_ylabel('y', fontsize=12)
    axes[idx].set_title(f'N = {n} samples', fontsize=13)
    axes[idx].set_ylim([-4, 4])
    axes[idx].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("As we get more data:")
print(" - Uncertainty decreases (confidence band narrows)")
print(" - Posterior converges to true parameters")
print(" - Predictions become more confident")
## 9.5 Model Selection and Evaluation

### Metrics

- Mean Squared Error (MSE): \(MSE = \frac{1}{N}\sum_{i=1}^N (y_i - \hat{y}_i)^2\)
- R² Score: \(R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}\)
- Cross-Validation: split the data into k folds; train on k−1 folds, test on the held-out one

### Bias-Variance Tradeoff

- Underfitting (high bias): model too simple
- Overfitting (high variance): model too complex
- Sweet spot: just the right complexity
# Demonstrate bias-variance tradeoff
def generate_nonlinear_data(n=100):
    """Sample n points from y = sin(x) + Gaussian noise, x uniform on [-3, 3].

    Returns (X, y), each of shape (n, 1).  RNG draws happen in the order
    inputs first, then noise.
    """
    inputs = np.random.uniform(-3, 3, n).reshape(-1, 1)
    targets = np.sin(inputs) + 0.3 * np.random.randn(n, 1)
    return inputs, targets
# Generate train and test data; the large test set gives a low-variance
# estimate of generalization error.
np.random.seed(42)
X_train, y_train = generate_nonlinear_data(n=50)
X_test, y_test = generate_nonlinear_data(n=1000)
# Sweep polynomial degree and record train/test error at each degree.
degrees = range(1, 16)
train_errors = []
test_errors = []
for degree in degrees:
    # Polynomial basis expansion (bias column included).
    poly = PolynomialFeatures(degree=degree, include_bias=True)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    # Least-squares fit; lstsq tolerates the ill-conditioned high degrees.
    w = np.linalg.lstsq(X_train_poly, y_train, rcond=None)[0]
    # MSE on both splits.
    y_train_pred = X_train_poly @ w
    y_test_pred = X_test_poly @ w
    train_mse = np.mean((y_train - y_train_pred)**2)
    test_mse = np.mean((y_test - y_test_pred)**2)
    train_errors.append(train_mse)
    test_errors.append(test_mse)
# Visualize bias-variance tradeoff: train error falls monotonically while
# test error is U-shaped; the minimum marks the best complexity.
plt.figure(figsize=(12, 6))
plt.plot(degrees, train_errors, 'bo-', linewidth=2, markersize=8, label='Training error')
plt.plot(degrees, test_errors, 'rs-', linewidth=2, markersize=8, label='Test error')
plt.axvline(x=degrees[np.argmin(test_errors)], color='g', linestyle='--',
            linewidth=2, label=f'Optimal degree = {degrees[np.argmin(test_errors)]}')
plt.xlabel('Polynomial Degree', fontsize=12)
plt.ylabel('Mean Squared Error', fontsize=12)
plt.title('Bias-Variance Tradeoff', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.yscale('log')
plt.show()
print("Bias-Variance Tradeoff:")
print(f"\nOptimal degree: {degrees[np.argmin(test_errors)]}")
print(f" Training MSE: {train_errors[np.argmin(test_errors)]:.4f}")
print(f" Test MSE: {test_errors[np.argmin(test_errors)]:.4f}")
print("\nObservations:")
print(" - Low degree: High bias (underfitting), train & test errors both high")
print(" - Optimal degree: Good balance")
print(" - High degree: High variance (overfitting), train error low but test error high")
## Summary

### Key Concepts from Chapter 9

- Linear Model: y = w^T x + ε
- MLE: w_MLE = (X^T X)^{-1} X^T y
- Ridge: L2 regularization, shrinks coefficients
- Lasso: L1 regularization, produces sparsity
- Bayesian: provides uncertainty quantification
- Model Selection: bias-variance tradeoff
Methods Comparison:ΒΆ
Method |
Advantages |
Disadvantages |
|---|---|---|
MLE |
Simple, closed form |
No regularization, can overfit |
Ridge |
Stable, handles collinearity |
Doesnβt select features |
Lasso |
Feature selection |
Can be unstable |
Bayesian |
Uncertainty quantification |
Computationally expensive |
### When to Use

- Few features, lots of data: MLE or Ridge
- Many features, some irrelevant: Lasso
- Need uncertainty: Bayesian
- Collinear features: Ridge
### Connection to Other Chapters

| Concept | Chapter |
|---|---|
| Linear algebra (X^T X)^{-1} | Chapter 2 |
| Least squares (minimize \(\|\mathbf{y} - \mathbf{X}\mathbf{w}\|^2\)) | |
| Gradients (optimization) | Chapter 5, 7 |
| Gaussian distribution | Chapter 6 |
| Maximum likelihood | Chapter 6 |
Next Steps:ΒΆ
Chapter 10: PCA (dimensionality reduction)
Chapter 11: GMM (clustering)
Chapter 12: SVM (classification)
Practice: Implement cross-validation for model selection!