# Setup
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Use seaborn's whitegrid theme for every figure in this chapter
sns.set_style('whitegrid')
# Fix the global RNG seed so data generation and figures are reproducible
np.random.seed(42)
# Compact numpy printing: 4 decimals, no scientific notation
np.set_printoptions(precision=4, suppress=True)

9.1 Problem Formulation

Linear Model:

\[y = \mathbf{w}^T \mathbf{x} + \epsilon\]

where:

  • x: input features (d-dimensional)

  • w: weights/parameters

  • ε: noise (typically Gaussian)

  • y: target output

Matrix Form:

For N data points:

\[\mathbf{y} = \mathbf{X}\mathbf{w} + \boldsymbol{\epsilon}\]

where:

  • X: N×d design matrix (each row is one data point)

  • y: N×1 target vector

  • w: d×1 weight vector

# Synthetic data for the linear regression examples
np.random.seed(42)

def generate_linear_data(n_samples=100, n_features=1, noise=0.1):
    """Sample (X, y, w_true) from the model y = X w + noise.

    Parameters
    ----------
    n_samples : int
        Number of data points N.
    n_features : int
        Input dimensionality d.
    noise : float
        Standard deviation of the additive Gaussian noise.

    Returns
    -------
    (X, y, w_true) with shapes (N, d), (N, 1), (d, 1).
    """
    inputs = np.random.randn(n_samples, n_features)
    # Ground-truth weights, drawn once per call
    true_weights = 2 * np.random.randn(n_features, 1)
    # Noisy targets: linear response plus Gaussian perturbation
    targets = inputs @ true_weights + noise * np.random.randn(n_samples, 1)
    return inputs, targets, true_weights

# 1D example for visualization
X, y, w_true = generate_linear_data(n_samples=50, n_features=1, noise=0.5)

print("Linear Regression Data")
print(f"\nData shape: X={X.shape}, y={y.shape}")
print(f"True weight w = {w_true[0, 0]:.4f}")
# Preview the first few (x, y) pairs as a small text table
print(f"\nFirst 5 samples:")
print("X         y")
print("-" * 20)
for i in range(5):
    print(f"{X[i, 0]:6.3f}   {y[i, 0]:6.3f}")

# Visualize: scatter the noisy samples and overlay the noise-free true line
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.6, s=50, label='Data points')
# Evaluate the true (no-noise, no-intercept) model on a dense grid
x_line = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
y_line = x_line @ w_true
plt.plot(x_line, y_line, 'r-', linewidth=2, label=f'True: y = {w_true[0,0]:.2f}x')
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Linear Regression: Data Generation', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()

9.2 Maximum Likelihood Estimation (MLE)

Likelihood Function:

Assuming Gaussian noise: ε ~ N(0, σ²)

\[p(y | \mathbf{x}, \mathbf{w}, \sigma^2) = \mathcal{N}(y | \mathbf{w}^T\mathbf{x}, \sigma^2)\]

Log-Likelihood:

\[\log p(\mathbf{y} | \mathbf{X}, \mathbf{w}, \sigma^2) = -\frac{N}{2}\log(2\pi\sigma^2) - \frac{1}{2\sigma^2}||\mathbf{y} - \mathbf{X}\mathbf{w}||^2\]

MLE Solution (Normal Equations):

\[\mathbf{w}_{MLE} = (\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\mathbf{y}\]

Key: MLE minimizes the sum of squared errors!

# Maximum Likelihood Estimation

def fit_mle(X, y):
    """
    Fit linear regression by maximum likelihood (ordinary least squares).

    Solves the normal equations  (X^T X) w = X^T y, where X is first
    augmented with a leading column of ones so that w[0] is the intercept.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Input features (without a bias column).
    y : ndarray of shape (n_samples, 1)
        Targets.

    Returns
    -------
    ndarray of shape (n_features + 1, 1)
        [intercept, slope_1, ..., slope_d].
    """
    # Add bias term (intercept)
    X_bias = np.hstack([np.ones((X.shape[0], 1)), X])
    
    # Solve the normal equations directly rather than forming the explicit
    # inverse: np.linalg.solve is both faster and numerically more stable
    # than inv() when X^T X is ill-conditioned.
    w_mle = np.linalg.solve(X_bias.T @ X_bias, X_bias.T @ y)
    
    return w_mle

# Fit the model via the normal equations
w_mle = fit_mle(X, y)

print("Maximum Likelihood Estimation")
print(f"\nEstimated parameters:")
# NOTE: labels previously contained mojibake ("wβ‚€") — fixed to w₀
print(f"  Intercept (w₀): {w_mle[0, 0]:.4f}")
print(f"  Slope (w₁):     {w_mle[1, 0]:.4f}")
print(f"\nTrue slope:       {w_true[0, 0]:.4f}")

# Predictions on the training inputs (bias column prepended, matching fit_mle)
X_bias = np.hstack([np.ones((X.shape[0], 1)), X])
y_pred = X_bias @ w_mle

# Training mean squared error
mse = np.mean((y - y_pred)**2)
print(f"\nMean Squared Error: {mse:.4f}")

# Visualize
plt.figure(figsize=(12, 5))

# Left panel: data, the true line, and the MLE fit
plt.subplot(1, 2, 1)
plt.scatter(X, y, alpha=0.6, s=50, label='Data')
x_line = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
X_line_bias = np.hstack([np.ones((x_line.shape[0], 1)), x_line])
y_line_pred = X_line_bias @ w_mle
y_line_true = x_line @ w_true
plt.plot(x_line, y_line_true, 'r--', linewidth=2, label='True model')
plt.plot(x_line, y_line_pred, 'b-', linewidth=2, label='MLE fit')
plt.xlabel('x')
plt.ylabel('y')
plt.title('MLE Linear Regression')
plt.legend()
plt.grid(True, alpha=0.3)

# Right panel: residuals vs predictions — a standard misfit diagnostic
plt.subplot(1, 2, 2)
residuals = y - y_pred
plt.scatter(y_pred, residuals, alpha=0.6, s=50)
plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
plt.xlabel('Predicted y')
plt.ylabel('Residuals (y - ŷ)')
plt.title('Residual Plot')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nResiduals should be randomly scattered around zero!")

9.3 Regularization: Ridge and Lasso

Ridge Regression (L2 regularization):

\[\mathbf{w}_{Ridge} = \arg\min_{\mathbf{w}} ||\mathbf{y} - \mathbf{X}\mathbf{w}||^2 + \lambda||\mathbf{w}||^2\]

Closed form: \(\mathbf{w}_{Ridge} = (\mathbf{X}^T\mathbf{X} + \lambda\mathbf{I})^{-1}\mathbf{X}^T\mathbf{y}\)

Lasso Regression (L1 regularization):

\[\mathbf{w}_{Lasso} = \arg\min_{\mathbf{w}} ||\mathbf{y} - \mathbf{X}\mathbf{w}||^2 + \lambda||\mathbf{w}||_1\]

Effect: L1 produces sparse solutions (feature selection)!

# Demonstrate overfitting and regularization

# Noisy samples from y = sin(x) on an evenly spaced grid
np.random.seed(42)
n = 30
X_poly = np.linspace(-3, 3, n).reshape(-1, 1)
y_poly = np.sin(X_poly) + 0.3 * np.random.randn(n, 1)

# Polynomial degrees to compare (under- to over-parameterized)
degrees = [1, 3, 9, 15]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, degree in enumerate(degrees):
    # Expand x into polynomial features [1, x, x^2, ..., x^degree]
    poly = PolynomialFeatures(degree=degree, include_bias=True)
    X_poly_features = poly.fit_transform(X_poly)
    
    # Least-squares fit; lstsq copes with the near-singular design
    # matrices that high degrees produce
    w = np.linalg.lstsq(X_poly_features, y_poly, rcond=None)[0]
    
    # Predict on a fine grid to draw a smooth fitted curve
    X_test = np.linspace(-3, 3, 200).reshape(-1, 1)
    X_test_features = poly.transform(X_test)
    y_test_pred = X_test_features @ w
    
    # Plot fit vs data vs the true sin(x)
    axes[idx].scatter(X_poly, y_poly, alpha=0.6, s=50, label='Data')
    axes[idx].plot(X_test, y_test_pred, 'r-', linewidth=2, label=f'Degree {degree}')
    axes[idx].plot(X_test, np.sin(X_test), 'g--', linewidth=2, alpha=0.5, label='True function')
    axes[idx].set_xlabel('x')
    axes[idx].set_ylabel('y')
    axes[idx].set_title(f'Polynomial Degree {degree}')
    axes[idx].legend()
    axes[idx].grid(True, alpha=0.3)
    # Clip the y-axis so high-degree oscillations do not dominate the view
    axes[idx].set_ylim([-2, 2])

plt.tight_layout()
plt.show()

print("Observation:")
print("  Degree 1: Underfitting (too simple)")
print("  Degree 3: Good fit")
print("  Degree 9: Starting to overfit")
print("  Degree 15: Severe overfitting (wild oscillations)")
# Ridge vs Lasso regularization

# High-dimensional example with many features
np.random.seed(42)
n_samples = 100
n_features = 50

X_high = np.random.randn(n_samples, n_features)
# True model uses only 5 of the 50 features; the rest have zero weight
w_true_sparse = np.zeros((n_features, 1))
active_features = [5, 10, 15, 20, 25]
w_true_sparse[active_features] = np.random.randn(5, 1) * 2

y_high = X_high @ w_true_sparse + 0.5 * np.random.randn(n_samples, 1)

# Split data: 70% train / 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X_high, y_high, test_size=0.3, random_state=42
)

# Sweep the regularization strength for both penalties
lambdas = [0.001, 0.01, 0.1, 1.0, 10.0]

results = {'lambda': [], 'ridge_mse': [], 'lasso_mse': [], 'lasso_sparsity': []}

for lam in lambdas:
    # Ridge (L2 penalty): closed-form, shrinks all coefficients
    ridge = Ridge(alpha=lam)
    ridge.fit(X_train, y_train)
    y_pred_ridge = ridge.predict(X_test)
    mse_ridge = mean_squared_error(y_test, y_pred_ridge)
    
    # Lasso (L1 penalty): iterative coordinate descent, so allow extra iterations
    lasso = Lasso(alpha=lam, max_iter=10000)
    lasso.fit(X_train, y_train)
    y_pred_lasso = lasso.predict(X_test)
    mse_lasso = mean_squared_error(y_test, y_pred_lasso)
    
    # Sparsity: number of coefficients that are numerically non-zero
    sparsity = np.sum(np.abs(lasso.coef_) > 1e-5)
    
    results['lambda'].append(lam)
    results['ridge_mse'].append(mse_ridge)
    results['lasso_mse'].append(mse_lasso)
    results['lasso_sparsity'].append(sparsity)

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Test MSE vs lambda (log-scaled x-axis); label text fixed from mojibake
axes[0].semilogx(results['lambda'], results['ridge_mse'], 'bo-', linewidth=2, 
                 markersize=8, label='Ridge')
axes[0].semilogx(results['lambda'], results['lasso_mse'], 'rs-', linewidth=2, 
                 markersize=8, label='Lasso')
axes[0].set_xlabel('λ (regularization strength)', fontsize=12)
axes[0].set_ylabel('Test MSE', fontsize=12)
axes[0].set_title('Ridge vs Lasso: Test Error')
axes[0].legend(fontsize=11)
axes[0].grid(True, alpha=0.3)

# Number of features Lasso keeps at each lambda
axes[1].semilogx(results['lambda'], results['lasso_sparsity'], 'gs-', 
                 linewidth=2, markersize=8)
axes[1].axhline(y=5, color='r', linestyle='--', linewidth=2, label='True # features')
axes[1].set_xlabel('λ (regularization strength)', fontsize=12)
axes[1].set_ylabel('# Non-zero coefficients', fontsize=12)
axes[1].set_title('Lasso Sparsity (Feature Selection)')
axes[1].legend(fontsize=11)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Ridge vs Lasso:")
print("\nRidge (L2):")
print("  - Shrinks all coefficients")
print("  - Never exactly zero")
print("  - Good when all features are relevant")

print("\nLasso (L1):")
print("  - Sets some coefficients to exactly zero")
print("  - Automatic feature selection")
print("  - Good for high-dimensional sparse problems")

print(f"\nTrue model uses {len(active_features)} features")
# Look up the λ=0.1 entry by value rather than a hard-coded index so the
# statement stays correct if the lambda grid above ever changes.
print(f"Lasso with λ=0.1 selects {results['lasso_sparsity'][lambdas.index(0.1)]} features")

9.4 Bayesian Linear Regression

Prior on Weights:

\[p(\mathbf{w}) = \mathcal{N}(\mathbf{w} | \mathbf{0}, \alpha^{-1}\mathbf{I})\]

Posterior (after seeing data):

\[p(\mathbf{w} | \mathbf{X}, \mathbf{y}) = \mathcal{N}(\mathbf{w} | \mathbf{m}_N, \mathbf{S}_N)\]

where:

  • \(\mathbf{m}_N = \beta \mathbf{S}_N \mathbf{X}^T \mathbf{y}\) (posterior mean)

  • \(\mathbf{S}_N = (\alpha\mathbf{I} + \beta\mathbf{X}^T\mathbf{X})^{-1}\) (posterior covariance)

Predictive Distribution:

\[p(y^* | \mathbf{x}^*, \mathbf{X}, \mathbf{y}) = \mathcal{N}(y^* | \mathbf{m}_N^T\mathbf{x}^*, \sigma_N^2(\mathbf{x}^*))\]

Benefit: Quantifies uncertainty in predictions!

# Bayesian Linear Regression

class BayesianLinearRegression:
    """Bayesian linear regression with a zero-mean isotropic Gaussian prior.

    Model (x is augmented with a bias term internally):
        prior       p(w)        = N(w | 0, alpha^{-1} I)
        likelihood  p(y | x, w) = N(y | w^T x, beta^{-1})
        posterior   p(w | X, y) = N(w | m_N, S_N), with
                    S_N = (alpha I + beta X^T X)^{-1}
                    m_N = beta S_N X^T y
    """
    
    def __init__(self, alpha=1.0, beta=1.0):
        """
        Parameters:
        -----------
        alpha : float
            Precision of prior on weights (higher = stronger regularization)
        beta : float  
            Precision of noise (higher = less noise)
        """
        self.alpha = alpha
        self.beta = beta
        self.m_N = None  # posterior mean, shape (d+1, 1); set by fit()
        self.S_N = None  # posterior covariance, shape (d+1, d+1); set by fit()
        
    def fit(self, X, y):
        """Compute the posterior distribution over weights.

        X has shape (n_samples, n_features), y has shape (n_samples, 1).
        Returns self so calls can be chained.
        """
        # Prepend a column of ones so the first weight acts as the intercept
        X_bias = np.hstack([np.ones((X.shape[0], 1)), X])
        
        # Posterior covariance: S_N = (alpha*I + beta*X^T X)^{-1}.
        # The explicit inverse is needed because predict() evaluates
        # x^T S_N x for every query point.
        d = X_bias.shape[1]
        self.S_N = np.linalg.inv(
            self.alpha * np.eye(d) + self.beta * X_bias.T @ X_bias
        )
        
        # Posterior mean: m_N = beta * S_N X^T y
        self.m_N = self.beta * self.S_N @ X_bias.T @ y
        
        return self
    
    def predict(self, X, return_std=False):
        """
        Predict with uncertainty.
        
        Returns the predictive mean (n_samples, 1) and, if return_std is
        True, also the predictive standard deviation of the same shape.
        
        Raises
        ------
        ValueError
            If called before fit().
        """
        # Fail fast with a clear message instead of a cryptic TypeError
        # from `X_bias @ None` when the model has not been fitted.
        if self.m_N is None:
            raise ValueError("Model is not fitted yet; call fit(X, y) first.")
        
        X_bias = np.hstack([np.ones((X.shape[0], 1)), X])
        
        # Predictive mean: m_N^T x for each row x
        y_mean = X_bias @ self.m_N
        
        if return_std:
            # Predictive variance: sigma^2(x) = 1/beta + x^T S_N x
            # (noise variance plus weight-uncertainty contribution)
            y_var = 1.0/self.beta + np.sum(X_bias @ self.S_N * X_bias, axis=1, keepdims=True)
            y_std = np.sqrt(y_var)
            return y_mean, y_std
        else:
            return y_mean

# Fit Bayesian model (prior precision alpha=1, noise precision beta=25)
bayes_reg = BayesianLinearRegression(alpha=1.0, beta=25.0)
bayes_reg.fit(X, y)

# Predictions with uncertainty on a grid extending one unit past the data,
# so the growth of predictive variance away from the samples is visible
X_test = np.linspace(X.min() - 1, X.max() + 1, 200).reshape(-1, 1)
y_mean, y_std = bayes_reg.predict(X_test, return_std=True)

print("Bayesian Linear Regression")
print(f"\nPosterior mean (weights):")
print(f"  Intercept: {bayes_reg.m_N[0, 0]:.4f}")
print(f"  Slope:     {bayes_reg.m_N[1, 0]:.4f}")

print(f"\nPosterior covariance:")
print(bayes_reg.S_N)

# Visualize the posterior mean with a mean ± 2 std (~95%) predictive band
plt.figure(figsize=(12, 6))
plt.scatter(X, y, alpha=0.6, s=50, label='Data', zorder=3)
plt.plot(X_test, y_mean, 'b-', linewidth=2, label='Posterior mean')
plt.fill_between(X_test.ravel(), 
                 (y_mean - 2*y_std).ravel(), 
                 (y_mean + 2*y_std).ravel(),
                 alpha=0.3, color='blue', label='95% confidence')
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Bayesian Linear Regression with Uncertainty', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()

print("\nKey insight: Uncertainty is higher where we have less data!")
print("This is automatic with Bayesian approach.")
# Show effect of more data on uncertainty

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# One panel per training-set size
n_samples_list = [5, 20, 100]

for idx, n in enumerate(n_samples_list):
    # Fresh dataset of size n from the same generating process
    X_n, y_n, _ = generate_linear_data(n_samples=n, n_features=1, noise=0.5)
    
    # Fit Bayesian model with the same prior/noise precisions in every panel
    bayes = BayesianLinearRegression(alpha=1.0, beta=25.0)
    bayes.fit(X_n, y_n)
    
    # Predict on the shared grid (X_test was built in the previous cell)
    y_mean, y_std = bayes.predict(X_test, return_std=True)
    
    # Plot posterior mean with a ±2 std (~95%) predictive band
    axes[idx].scatter(X_n, y_n, alpha=0.6, s=50, color='red', zorder=3)
    axes[idx].plot(X_test, y_mean, 'b-', linewidth=2)
    axes[idx].fill_between(X_test.ravel(), 
                          (y_mean - 2*y_std).ravel(), 
                          (y_mean + 2*y_std).ravel(),
                          alpha=0.3, color='blue')
    axes[idx].set_xlabel('x', fontsize=12)
    axes[idx].set_ylabel('y', fontsize=12)
    axes[idx].set_title(f'N = {n} samples', fontsize=13)
    # Shared y-limits so the band widths are directly comparable
    axes[idx].set_ylim([-4, 4])
    axes[idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("As we get more data:")
print("  - Uncertainty decreases (confidence band narrows)")
print("  - Posterior converges to true parameters")
print("  - Predictions become more confident")

9.5 Model Selection and Evaluation

Metrics:

Mean Squared Error (MSE): \(MSE = \frac{1}{N}\sum_{i=1}^N (y_i - \hat{y}_i)^2\)

R² Score: \(R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}\)

Cross-Validation: Split the data into k folds; train on k−1 folds, test on the remaining one, and average over the folds.

Bias-Variance Tradeoff:

  • Underfitting (high bias): Model too simple

  • Overfitting (high variance): Model too complex

  • Sweet spot: Just right complexity

# Demonstrate bias-variance tradeoff

def generate_nonlinear_data(n=100):
    """Sample n noisy points from y = sin(x) with x drawn uniformly on [-3, 3]."""
    inputs = np.random.uniform(-3, 3, n).reshape(-1, 1)
    noisy_targets = np.sin(inputs) + 0.3 * np.random.randn(n, 1)
    return inputs, noisy_targets

# Generate train and test data
# NOTE(review): this re-binds X_train/X_test/y_train/y_test, shadowing the
# ridge/lasso split created in the earlier cell.
np.random.seed(42)
X_train, y_train = generate_nonlinear_data(n=50)
X_test, y_test = generate_nonlinear_data(n=1000)

# Test different polynomial degrees
degrees = range(1, 16)
train_errors = []
test_errors = []

for degree in degrees:
    # Expand x into polynomial features [1, x, ..., x^degree]
    poly = PolynomialFeatures(degree=degree, include_bias=True)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    
    # Least-squares fit; lstsq handles the poorly conditioned
    # high-degree design matrices
    w = np.linalg.lstsq(X_train_poly, y_train, rcond=None)[0]
    
    # Training and held-out errors for this degree
    y_train_pred = X_train_poly @ w
    y_test_pred = X_test_poly @ w
    
    train_mse = np.mean((y_train - y_train_pred)**2)
    test_mse = np.mean((y_test - y_test_pred)**2)
    
    train_errors.append(train_mse)
    test_errors.append(test_mse)

# Visualize: training error falls with degree while test error is U-shaped;
# the test-error minimum marks the best model complexity
plt.figure(figsize=(12, 6))
plt.plot(degrees, train_errors, 'bo-', linewidth=2, markersize=8, label='Training error')
plt.plot(degrees, test_errors, 'rs-', linewidth=2, markersize=8, label='Test error')
plt.axvline(x=degrees[np.argmin(test_errors)], color='g', linestyle='--', 
            linewidth=2, label=f'Optimal degree = {degrees[np.argmin(test_errors)]}')
plt.xlabel('Polynomial Degree', fontsize=12)
plt.ylabel('Mean Squared Error', fontsize=12)
plt.title('Bias-Variance Tradeoff', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
# Log scale: overfitting blows test error up by orders of magnitude
plt.yscale('log')
plt.show()

print("Bias-Variance Tradeoff:")
print(f"\nOptimal degree: {degrees[np.argmin(test_errors)]}")
print(f"  Training MSE: {train_errors[np.argmin(test_errors)]:.4f}")
print(f"  Test MSE:     {test_errors[np.argmin(test_errors)]:.4f}")

print("\nObservations:")
print("  - Low degree: High bias (underfitting), train & test errors both high")
print("  - Optimal degree: Good balance")
print("  - High degree: High variance (overfitting), train error low but test error high")

Summary

Key Concepts from Chapter 9:

  1. Linear Model: y = w^T x + ε

  2. MLE: w_MLE = (X^T X)^{-1} X^T y

  3. Ridge: L2 regularization, shrinks coefficients

  4. Lasso: L1 regularization, produces sparsity

  5. Bayesian: Provides uncertainty quantification

  6. Model Selection: Bias-variance tradeoff

Methods Comparison:

Method

Advantages

Disadvantages

MLE

Simple, closed form

No regularization, can overfit

Ridge

Stable, handles collinearity

Doesn't select features

Lasso

Feature selection

Can be unstable

Bayesian

Uncertainty quantification

Computationally expensive

When to Use:

  • Few features, lots of data: MLE or Ridge

  • Many features, some irrelevant: Lasso

  • Need uncertainty: Bayesian

  • Collinear features: Ridge

Connection to Other Chapters:

Concept

Chapter

Linear algebra (X^T X)^{-1}

Chapter 2

Least squares (minimize ||y - Xw||^2)

Chapter 3

Gradients (optimization)

Chapter 5, 7

Gaussian distribution

Chapter 6

Maximum likelihood

Chapter 6

Next Steps:

  • Chapter 10: PCA (dimensionality reduction)

  • Chapter 11: GMM (clustering)

  • Chapter 12: SVM (classification)

Practice: Implement cross-validation for model selection!