import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.optimize import minimize
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix
from sklearn.datasets import make_classification, load_iris
import pandas as pd

# Global notebook configuration: plot style, default figure size, and a
# fixed RNG seed so every run produces the same data and figures.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
np.random.seed(42)

1. Linear Regression

Model

\[y = \mathbf{w}^T \mathbf{x} + \epsilon, \quad \epsilon \sim \mathcal{N}(0, \sigma^2)\]

Maximum Likelihood Estimation (MLE)

Objective: Minimize squared error

\[\mathbf{w}_{MLE} = \arg\min_{\mathbf{w}} \|\mathbf{y} - \mathbf{X}\mathbf{w}\|^2\]

Solution (Normal equations):

\[\mathbf{w}_{MLE} = (\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\mathbf{y}\]
# Generate synthetic data from the ground-truth line y = 2 + 3x with
# Gaussian noise (std = 2), so estimates can be compared to the truth.
np.random.seed(42)
n_samples = 100
X_simple = np.linspace(0, 10, n_samples)
true_w = [2, 3]  # intercept, slope
y_true = true_w[0] + true_w[1] * X_simple
noise = np.random.normal(0, 2, n_samples)
y_simple = y_true + noise

# Add intercept term: prepend a column of ones so the bias is learned as w[0]
X_design = np.column_stack([np.ones(n_samples), X_simple])

class LinearRegressionMLE:
    """Ordinary least squares fitted via the normal equations.

    Solves (X^T X) w = X^T y, which is the maximum-likelihood solution
    under Gaussian noise, i.e. the minimizer of ||y - Xw||^2.
    """

    def fit(self, X, y):
        """Estimate weights from design matrix X and targets y."""
        gram = X.T @ X      # X^T X
        moment = X.T @ y    # X^T y
        # Solving the linear system is more stable than forming the inverse.
        self.w = np.linalg.solve(gram, moment)
        return self

    def predict(self, X):
        """Return fitted values X @ w."""
        return X @ self.w

    def mse(self, X, y):
        """Mean squared error of the current fit on (X, y)."""
        residuals = y - self.predict(X)
        return np.mean(residuals**2)

# Fit model: custom normal-equations solver on the design matrix (with
# explicit intercept column).
lr_mle = LinearRegressionMLE()
lr_mle.fit(X_design, y_simple)
y_pred = lr_mle.predict(X_design)

# Compare with sklearn (sklearn adds the intercept internally, so it gets
# the raw 1-column feature matrix instead of the design matrix).
lr_sklearn = LinearRegression()
lr_sklearn.fit(X_simple.reshape(-1, 1), y_simple)

print("Linear Regression Results:")
print(f"True parameters: intercept={true_w[0]}, slope={true_w[1]}")
print(f"\nMLE (custom): intercept={lr_mle.w[0]:.4f}, slope={lr_mle.w[1]:.4f}")
print(f"MSE (custom): {lr_mle.mse(X_design, y_simple):.4f}")
print(f"\nSklearn: intercept={lr_sklearn.intercept_:.4f}, slope={lr_sklearn.coef_[0]:.4f}")
print(f"MSE (sklearn): {mean_squared_error(y_simple, lr_sklearn.predict(X_simple.reshape(-1, 1))):.4f}")

# Visualize: left panel data + fitted line, right panel residual diagnostics.
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.scatter(X_simple, y_simple, alpha=0.6, label='Data')
plt.plot(X_simple, y_true, 'g--', linewidth=2, label='True line')
plt.plot(X_simple, y_pred, 'r-', linewidth=2, label='MLE fit')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Linear Regression: MLE')
plt.legend()
plt.grid(True, alpha=0.3)

# Residuals vs fitted values: should scatter symmetrically around zero if
# the linear model and constant-variance noise assumptions hold.
plt.subplot(1, 2, 2)
residuals = y_simple - y_pred
plt.scatter(y_pred, residuals, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Polynomial Regression and Overfitting

What and Why: Polynomial regression extends linear regression by fitting \(y = w_0 + w_1 x + w_2 x^2 + \cdots + w_M x^M\), demonstrating the fundamental bias-variance tradeoff. Low-degree polynomials underfit (high bias, low variance), while high-degree polynomials overfit by fitting noise in the training data (low bias, high variance). Understanding this tradeoff is essential for choosing model complexity in any ML system.

How: The code generates data from a sinusoidal function with added noise, then fits polynomials of increasing degree using PolynomialFeatures for the basis expansion together with a least-squares fit. As degree \(M\) increases, training error monotonically decreases, but test error eventually increases when the model starts memorizing noise rather than learning the underlying signal.

Connection: This is the same phenomenon behind choosing the number of layers/neurons in neural networks, the depth of decision trees, or the number of features in any model. Regularization techniques (Ridge, Lasso, dropout, early stopping) all address overfitting by constraining model capacity.

# Generate nonlinear data: noisy samples from sin(2*pi*x) on [0, 1].
np.random.seed(42)
n = 30
X_poly = np.linspace(0, 1, n)
y_poly = np.sin(2 * np.pi * X_poly) + np.random.normal(0, 0.2, n)

# Try different polynomial degrees, from clear underfit (1) to clear overfit (15)
degrees = [1, 3, 9, 15]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Dense grid for evaluating the fit against the noise-free true function
X_test_range = np.linspace(0, 1, 200).reshape(-1, 1)
y_true_test = np.sin(2 * np.pi * X_test_range.flatten())

for idx, degree in enumerate(degrees):
    ax = axes[idx // 2, idx % 2]
    
    # Create polynomial features (include_bias=True adds the x^0 column)
    poly = PolynomialFeatures(degree=degree, include_bias=True)
    X_poly_train = poly.fit_transform(X_poly.reshape(-1, 1))
    X_poly_test = poly.transform(X_test_range)
    
    # Fit model via the unregularized normal equations
    lr = LinearRegressionMLE()
    lr.fit(X_poly_train, y_poly)
    y_pred_test = lr.predict(X_poly_test)
    
    # Calculate errors: train MSE on the noisy data, test MSE against the
    # noise-free true function on the dense grid
    train_mse = lr.mse(X_poly_train, y_poly)
    test_mse = np.mean((y_true_test - y_pred_test)**2)
    
    # Plot
    ax.scatter(X_poly, y_poly, alpha=0.6, s=50, label='Training data')
    ax.plot(X_test_range, y_true_test, 'g--', linewidth=2, alpha=0.7, label='True function')
    ax.plot(X_test_range, y_pred_test, 'r-', linewidth=2, label=f'Degree {degree} fit')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title(f'Degree {degree}\nTrain MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}')
    ax.set_ylim(-2, 2)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Observation: Higher degree polynomials overfit!")
print("- Degree 1: Underfits (high bias)")
print("- Degree 3: Good balance")
print("- Degree 9+: Overfits (high variance)")

2. Ridge Regression (L2 Regularization)

Objective

\[\mathbf{w}_{Ridge} = \arg\min_{\mathbf{w}} \|\mathbf{y} - \mathbf{X}\mathbf{w}\|^2 + \lambda \|\mathbf{w}\|^2\]

Solution

\[\mathbf{w}_{Ridge} = (\mathbf{X}^T\mathbf{X} + \lambda\mathbf{I})^{-1}\mathbf{X}^T\mathbf{y}\]

Ridge regression shrinks coefficients towards zero, preventing overfitting.

class RidgeRegression:
    """Linear regression with an L2 penalty on the weights.

    Minimizes ||y - Xw||^2 + lambda * ||w||^2, whose closed-form solution
    is w = (X^T X + lambda * I)^{-1} X^T y. lambda_=0 recovers OLS.
    """

    def __init__(self, lambda_=1.0):
        # Regularization strength; larger values shrink weights harder.
        self.lambda_ = lambda_

    def fit(self, X, y):
        """Fit via the regularized normal equations."""
        d = X.shape[1]
        penalized_gram = X.T @ X + self.lambda_ * np.eye(d)
        self.w = np.linalg.solve(penalized_gram, X.T @ y)
        return self

    def predict(self, X):
        """Return fitted values X @ w."""
        return X @ self.w

    def mse(self, X, y):
        """Mean squared error of the current fit on (X, y)."""
        return np.mean((y - self.predict(X))**2)

# Test Ridge with a deliberately over-parameterized polynomial (degree 15 on
# 30 points): unregularized least squares overfits badly in this regime.
degree = 15
poly = PolynomialFeatures(degree=degree, include_bias=True)
X_poly_train = poly.fit_transform(X_poly.reshape(-1, 1))
X_poly_test = poly.transform(X_test_range)

# Different regularization strengths, from none to fairly strong
lambdas = [0, 0.001, 0.01, 0.1]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for idx, lam in enumerate(lambdas):
    ax = axes[idx // 2, idx % 2]
    
    ridge = RidgeRegression(lambda_=lam)
    ridge.fit(X_poly_train, y_poly)
    y_pred_test = ridge.predict(X_poly_test)
    
    # Train MSE on noisy data; test MSE against the noise-free true function
    train_mse = ridge.mse(X_poly_train, y_poly)
    test_mse = np.mean((y_true_test - y_pred_test)**2)
    
    ax.scatter(X_poly, y_poly, alpha=0.6, s=50, label='Training data')
    ax.plot(X_test_range, y_true_test, 'g--', linewidth=2, alpha=0.7, label='True function')
    ax.plot(X_test_range, y_pred_test, 'r-', linewidth=2, label='Ridge fit')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    # Fixed mis-encoded lambda character in the title text.
    ax.set_title(f'Ridge λ={lam}\nTrain MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}')
    ax.set_ylim(-2, 2)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Weight magnitude comparison: trace ||w|| as lambda sweeps over six decades
lambdas_range = np.logspace(-4, 2, 50)
weights_norm = []

for lam in lambdas_range:
    ridge = RidgeRegression(lambda_=lam)
    ridge.fit(X_poly_train, y_poly)
    weights_norm.append(np.linalg.norm(ridge.w))

plt.figure(figsize=(10, 5))
plt.semilogx(lambdas_range, weights_norm, 'b-', linewidth=2)
plt.xlabel('λ (regularization strength)')
plt.ylabel('||w|| (weight norm)')
plt.title('Ridge Regularization Path')
plt.grid(True, alpha=0.3)
plt.show()

print("Ridge regularization shrinks weights as λ increases")

3. Bayesian Linear Regression

Prior

\[p(\mathbf{w}) = \mathcal{N}(\mathbf{0}, \alpha^{-1}\mathbf{I})\]

Likelihood

\[p(\mathbf{y}|\mathbf{X}, \mathbf{w}, \sigma^2) = \mathcal{N}(\mathbf{X}\mathbf{w}, \sigma^2\mathbf{I})\]

Posterior

\[p(\mathbf{w}|\mathbf{X}, \mathbf{y}) = \mathcal{N}(\mathbf{w}_N, \mathbf{S}_N)\]

Where:

  • \(\mathbf{S}_N = (\alpha\mathbf{I} + \sigma^{-2}\mathbf{X}^T\mathbf{X})^{-1}\)

  • \(\mathbf{w}_N = \sigma^{-2}\mathbf{S}_N\mathbf{X}^T\mathbf{y}\)

Predictive Distribution

\[p(y_{new}|x_{new}, \mathcal{D}) = \mathcal{N}(\mathbf{w}_N^T\mathbf{x}_{new}, \sigma^2_{new})\]
class BayesianLinearRegression:
    """Bayesian linear regression with an isotropic Gaussian prior.

    Prior:      w ~ N(0, alpha^{-1} I)
    Likelihood: y | X, w ~ N(Xw, sigma2 I)
    The posterior over w is Gaussian with mean w_N and covariance S_N,
    which gives closed-form predictive uncertainty.
    """

    def __init__(self, alpha=1.0, sigma2=1.0):
        self.alpha = alpha    # prior precision (larger = stronger shrinkage)
        self.sigma2 = sigma2  # observation noise variance

    def fit(self, X, y):
        """Compute the posterior mean and covariance of the weights."""
        d = X.shape[1]
        # Posterior covariance: S_N = (alpha*I + sigma^-2 X^T X)^-1
        precision = self.alpha * np.eye(d) + (1/self.sigma2) * X.T @ X
        self.S_N = np.linalg.inv(precision)
        # Posterior mean: w_N = sigma^-2 S_N X^T y
        self.w_N = (1/self.sigma2) * self.S_N @ X.T @ y
        return self

    def predict(self, X, return_std=False):
        """Posterior-predictive mean (and optionally std) at the rows of X."""
        y_mean = X @ self.w_N
        if not return_std:
            return y_mean
        # Per-point predictive variance: sigma2 + x^T S_N x
        y_var = self.sigma2 + np.sum(X @ self.S_N * X, axis=1)
        return y_mean, np.sqrt(y_var)

# Generate data: a small noisy sample of sin(2*pi*x) on [0, 1]
np.random.seed(42)
n_train = 20
X_bayes = np.linspace(0, 1, n_train)
y_bayes = np.sin(2 * np.pi * X_bayes) + np.random.normal(0, 0.2, n_train)

# Use polynomial features (degree-9 basis, bias column included)
degree = 9
poly = PolynomialFeatures(degree=degree, include_bias=True)
X_bayes_train = poly.fit_transform(X_bayes.reshape(-1, 1))

# Test points extend beyond the training range [0, 1] so the growth of
# predictive uncertainty outside the data becomes visible
X_test_fine = np.linspace(-0.2, 1.2, 200)
X_bayes_test = poly.transform(X_test_fine.reshape(-1, 1))
y_true_fine = np.sin(2 * np.pi * X_test_fine)

# Fit Bayesian model
alpha = 2.0  # Regularization (stronger prior)
sigma2 = 0.2**2  # Noise variance (matches the data-generating noise std)
blr = BayesianLinearRegression(alpha=alpha, sigma2=sigma2)
blr.fit(X_bayes_train, y_bayes)

# Predictions with uncertainty (posterior-predictive mean and std)
y_pred_mean, y_pred_std = blr.predict(X_bayes_test, return_std=True)

# Compare with MLE on the same degree-9 basis (no prior, no uncertainty)
lr_mle = LinearRegressionMLE()
lr_mle.fit(X_bayes_train, y_bayes)
y_pred_mle = lr_mle.predict(X_bayes_test)

# Visualize: Bayesian fit with a +/- 2 std band vs the MLE point estimate
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.scatter(X_bayes, y_bayes, s=80, alpha=0.8, label='Training data', zorder=3)
plt.plot(X_test_fine, y_true_fine, 'g--', linewidth=2, alpha=0.7, label='True function')
plt.plot(X_test_fine, y_pred_mean, 'r-', linewidth=2, label='Bayesian mean')
plt.fill_between(X_test_fine, 
                 y_pred_mean - 2*y_pred_std, 
                 y_pred_mean + 2*y_pred_std,
                 alpha=0.3, color='red', label='95% credible interval')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Bayesian Linear Regression with Uncertainty')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xlim(-0.2, 1.2)

plt.subplot(1, 2, 2)
plt.scatter(X_bayes, y_bayes, s=80, alpha=0.8, label='Training data', zorder=3)
plt.plot(X_test_fine, y_true_fine, 'g--', linewidth=2, alpha=0.7, label='True function')
plt.plot(X_test_fine, y_pred_mle, 'b-', linewidth=2, label='MLE (no uncertainty)')
plt.xlabel('x')
plt.ylabel('y')
plt.title('MLE Linear Regression (Point Estimate)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xlim(-0.2, 1.2)

plt.tight_layout()
plt.show()

print("Key observation:")
print("- Bayesian method provides uncertainty estimates")
print("- Uncertainty is larger far from training data")
print("- MLE gives only point estimates (no uncertainty)")

4. Logistic Regression for Binary Classification

Model

\[p(y=1|\mathbf{x}, \mathbf{w}) = \sigma(\mathbf{w}^T\mathbf{x}) = \frac{1}{1 + e^{-\mathbf{w}^T\mathbf{x}}}\]

Negative Log-Likelihood (Cross-Entropy Loss)

\[\mathcal{L}(\mathbf{w}) = -\sum_{i=1}^N \left[y_i \log \mu_i + (1-y_i)\log(1-\mu_i)\right]\]

Where \(\mu_i = \sigma(\mathbf{w}^T\mathbf{x}_i)\)

No closed-form solution → use gradient descent or Newton's method

class LogisticRegressionGD:
    """Binary logistic regression trained by full-batch gradient descent.

    Minimizes the cross-entropy loss, optionally with an L2 penalty on all
    weights except the bias, which is assumed to be the first column of X.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000, lambda_=0):
        self.lr = learning_rate
        self.n_iterations = n_iterations
        self.lambda_ = lambda_  # L2 regularization strength
        self.losses = []        # loss sampled every 100 iterations

    def sigmoid(self, z):
        """Logistic function; inputs are clipped so exp() cannot overflow."""
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def compute_loss(self, X, y):
        """Regularized cross-entropy of the current weights on (X, y)."""
        m = len(y)
        h = self.sigmoid(X @ self.w)
        epsilon = 1e-10  # guards against log(0)
        ce_loss = -np.mean(y * np.log(h + epsilon) + (1 - y) * np.log(1 - h + epsilon))
        # The bias weight w[0] is excluded from the penalty.
        reg_loss = (self.lambda_ / (2 * m)) * np.sum(self.w[1:]**2)
        return ce_loss + reg_loss

    def fit(self, X, y):
        """Run n_iterations of gradient descent from a zero initialization."""
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features)

        for step in range(self.n_iterations):
            h = self.sigmoid(X @ self.w)
            # Gradient of the mean cross-entropy: X^T (h - y) / n
            gradient = (1 / n_samples) * X.T @ (h - y)
            # L2 term; the bias component is left unregularized.
            gradient[1:] += (self.lambda_ / n_samples) * self.w[1:]
            self.w -= self.lr * gradient
            # Record the loss every 100 steps for convergence plots.
            if step % 100 == 0:
                self.losses.append(self.compute_loss(X, y))

        return self

    def predict_proba(self, X):
        """P(y=1 | x) for each row of X."""
        return self.sigmoid(X @ self.w)

    def predict(self, X, threshold=0.5):
        """Hard 0/1 labels by thresholding the predicted probability."""
        return (self.predict_proba(X) >= threshold).astype(int)

# Generate binary classification data: two well-separated 2-D clusters
np.random.seed(42)
X_cls, y_cls = make_classification(n_samples=200, n_features=2, n_informative=2,
                                   n_redundant=0, n_clusters_per_class=1,
                                   class_sep=2, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X_cls, y_cls, test_size=0.3, random_state=42)

# Add intercept column for the custom implementation (sklearn adds its own)
X_train_int = np.column_stack([np.ones(len(X_train)), X_train])
X_test_int = np.column_stack([np.ones(len(X_test)), X_test])

# Train custom logistic regression (gradient descent, L2 penalty 0.1)
log_reg_custom = LogisticRegressionGD(learning_rate=0.1, n_iterations=1000, lambda_=0.1)
log_reg_custom.fit(X_train_int, y_train)
y_pred_custom = log_reg_custom.predict(X_test_int)

# Train sklearn logistic regression (C is the inverse penalty strength)
log_reg_sklearn = LogisticRegression(C=10, max_iter=1000)
log_reg_sklearn.fit(X_train, y_train)
y_pred_sklearn = log_reg_sklearn.predict(X_test)

print("Logistic Regression Results:")
print(f"Custom implementation accuracy: {accuracy_score(y_test, y_pred_custom):.4f}")
print(f"Sklearn implementation accuracy: {accuracy_score(y_test, y_pred_sklearn):.4f}")

# Visualize decision boundary
def plot_decision_boundary_logistic(model, X, y, title, is_sklearn=False):
    """Plot P(y=1) over a 2-D grid plus the 0.5 decision boundary.

    model      -- fitted classifier exposing predict_proba
    X, y       -- 2-feature points and labels to scatter on top
    title      -- axes title
    is_sklearn -- sklearn models get raw features; the custom model
                  expects a leading intercept column
    """
    h = 0.02  # grid step
    # Grid covering the data with a 1-unit margin on every side.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    
    if is_sklearn:
        # sklearn returns both class columns; take P(class 1).
        Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    else:
        # Custom model needs the explicit column of ones for the bias.
        X_grid = np.column_stack([np.ones(len(xx.ravel())), xx.ravel(), yy.ravel()])
        Z = model.predict_proba(X_grid)
    
    Z = Z.reshape(xx.shape)
    
    # Filled probability surface, 0.5 contour, and the labeled points.
    plt.contourf(xx, yy, Z, alpha=0.3, levels=20, cmap='RdYlBu')
    plt.contour(xx, yy, Z, levels=[0.5], colors='black', linewidths=2)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlBu', edgecolors='black', s=50)
    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.colorbar(label='P(y=1)')

# Three panels: custom boundary, sklearn boundary, and the loss curve.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

plt.subplot(1, 3, 1)
plot_decision_boundary_logistic(log_reg_custom, X_test, y_test, 'Custom Logistic Regression')

plt.subplot(1, 3, 2)
plot_decision_boundary_logistic(log_reg_sklearn, X_test, y_test, 'Sklearn Logistic Regression', is_sklearn=True)

# Loss curve: losses were sampled every 100 iterations during fit
plt.subplot(1, 3, 3)
plt.plot(range(0, len(log_reg_custom.losses)*100, 100), log_reg_custom.losses, 'b-', linewidth=2)
plt.xlabel('Iteration')
plt.ylabel('Loss (Cross-Entropy)')
plt.title('Training Loss Convergence')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

5. Multinomial Logistic Regression (Softmax)

Model (K classes)

\[p(y=k|\mathbf{x}, \mathbf{W}) = \frac{\exp(\mathbf{w}_k^T\mathbf{x})}{\sum_{j=1}^K \exp(\mathbf{w}_j^T\mathbf{x})}\]

This is called the softmax function.

Cross-Entropy Loss

\[\mathcal{L}(\mathbf{W}) = -\sum_{i=1}^N \sum_{k=1}^K \mathbb{I}(y_i=k) \log p(y=k|\mathbf{x}_i, \mathbf{W})\]
class SoftmaxRegression:
    """Multinomial logistic regression trained by full-batch gradient descent.

    Learns one weight column per class; class probabilities are the softmax
    of the linear scores X @ W. The first feature column is assumed to be
    the bias and is excluded from the L2 penalty.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000, lambda_=0.01):
        self.lr = learning_rate
        self.n_iterations = n_iterations
        self.lambda_ = lambda_  # L2 regularization strength
        self.losses = []        # loss sampled every 100 iterations

    def softmax(self, Z):
        """Row-wise softmax, shifted by each row's max for numerical stability."""
        shifted = np.exp(Z - np.max(Z, axis=1, keepdims=True))
        return shifted / np.sum(shifted, axis=1, keepdims=True)

    def one_hot(self, y, n_classes):
        """Return an (n_samples, n_classes) one-hot matrix for integer labels y."""
        encoded = np.zeros((len(y), n_classes))
        encoded[np.arange(len(y)), y] = 1
        return encoded

    def compute_loss(self, X, y):
        """Regularized cross-entropy of the current weights on (X, y)."""
        m = len(y)
        probs = self.softmax(X @ self.W)
        targets = self.one_hot(y, self.n_classes)
        # 1e-10 guards against log(0).
        ce_loss = -np.mean(np.sum(targets * np.log(probs + 1e-10), axis=1))
        # The bias row W[0, :] is excluded from the penalty.
        reg_loss = (self.lambda_ / (2 * m)) * np.sum(self.W[1:, :]**2)
        return ce_loss + reg_loss

    def fit(self, X, y):
        """Run n_iterations of gradient descent from a zero initialization."""
        n_samples, n_features = X.shape
        self.n_classes = len(np.unique(y))

        # One weight column per class, all zeros to start.
        self.W = np.zeros((n_features, self.n_classes))
        targets = self.one_hot(y, self.n_classes)

        for step in range(self.n_iterations):
            probs = self.softmax(X @ self.W)
            # Gradient of the mean cross-entropy: X^T (probs - targets) / n
            gradient = (1 / n_samples) * X.T @ (probs - targets)
            # L2 term; the bias row stays unregularized.
            gradient[1:, :] += (self.lambda_ / n_samples) * self.W[1:, :]
            self.W -= self.lr * gradient
            # Record the loss every 100 steps for convergence plots.
            if step % 100 == 0:
                self.losses.append(self.compute_loss(X, y))

        return self

    def predict_proba(self, X):
        """Class-probability matrix, one row per sample."""
        return self.softmax(X @ self.W)

    def predict(self, X):
        """Most probable class index for each row of X."""
        return np.argmax(self.predict_proba(X), axis=1)

# Load iris dataset (3 classes)
iris = load_iris()
X_iris = iris.data[:, :2]  # Use only first 2 features for visualization
y_iris = iris.target

X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris, y_iris, test_size=0.3, random_state=42)

# Add intercept column for the custom implementation (sklearn adds its own)
X_train_iris_int = np.column_stack([np.ones(len(X_train_iris)), X_train_iris])
X_test_iris_int = np.column_stack([np.ones(len(X_test_iris)), X_test_iris])

# Train custom softmax regression (gradient descent, L2 penalty 0.1)
softmax_custom = SoftmaxRegression(learning_rate=0.1, n_iterations=2000, lambda_=0.1)
softmax_custom.fit(X_train_iris_int, y_train_iris)
y_pred_softmax = softmax_custom.predict(X_test_iris_int)

# Train sklearn logistic regression (multinomial) as the reference
log_reg_multi = LogisticRegression(multi_class='multinomial', max_iter=2000)
log_reg_multi.fit(X_train_iris, y_train_iris)
y_pred_sklearn_multi = log_reg_multi.predict(X_test_iris)

print("Softmax Regression Results (Iris 3-class):")
print(f"Custom implementation accuracy: {accuracy_score(y_test_iris, y_pred_softmax):.4f}")
print(f"Sklearn implementation accuracy: {accuracy_score(y_test_iris, y_pred_sklearn_multi):.4f}")

# Confusion matrices for both models, plus the custom model's loss curve
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Custom softmax confusion matrix
plt.subplot(1, 3, 1)
cm_custom = confusion_matrix(y_test_iris, y_pred_softmax)
sns.heatmap(cm_custom, annot=True, fmt='d', cmap='Blues', 
            xticklabels=iris.target_names, yticklabels=iris.target_names)
plt.title('Custom Softmax Regression')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

# Sklearn confusion matrix
plt.subplot(1, 3, 2)
cm_sklearn = confusion_matrix(y_test_iris, y_pred_sklearn_multi)
sns.heatmap(cm_sklearn, annot=True, fmt='d', cmap='Blues',
            xticklabels=iris.target_names, yticklabels=iris.target_names)
plt.title('Sklearn Multinomial Logistic')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

# Loss curve: losses were sampled every 100 iterations during fit
plt.subplot(1, 3, 3)
plt.plot(range(0, len(softmax_custom.losses)*100, 100), softmax_custom.losses, 'b-', linewidth=2)
plt.xlabel('Iteration')
plt.ylabel('Loss (Cross-Entropy)')
plt.title('Training Loss Convergence')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Decision boundaries
def plot_multiclass_decision_boundary(model, X, y, title, is_sklearn=False):
    """Plot predicted class regions over a 2-D grid with the points on top.

    model      -- fitted classifier exposing predict
    X, y       -- 2-feature points and integer labels to scatter
    title      -- axes title
    is_sklearn -- sklearn models get raw features; the custom model
                  expects a leading intercept column
    """
    h = 0.02  # grid step
    # Grid covering the data with a half-unit margin on every side.
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    
    if is_sklearn:
        Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    else:
        # Custom model needs the explicit column of ones for the bias.
        X_grid = np.column_stack([np.ones(len(xx.ravel())), xx.ravel(), yy.ravel()])
        Z = model.predict(X_grid)
    
    Z = Z.reshape(xx.shape)
    
    # Filled class regions plus the labeled points.
    plt.contourf(xx, yy, Z, alpha=0.3, cmap='viridis')
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolors='black', s=50)
    plt.title(title)
    plt.xlabel(iris.feature_names[0])
    plt.ylabel(iris.feature_names[1])

# Side-by-side decision regions: custom softmax vs sklearn multinomial.
plt.figure(figsize=(14, 5))

plt.subplot(1, 2, 1)
plot_multiclass_decision_boundary(softmax_custom, X_test_iris, y_test_iris, 
                                  'Custom Softmax Regression')

plt.subplot(1, 2, 2)
plot_multiclass_decision_boundary(log_reg_multi, X_test_iris, y_test_iris,
                                  'Sklearn Multinomial Logistic', is_sklearn=True)

plt.tight_layout()
plt.show()

6. Model Selection and Regularization Comparison

What and Why: With multiple regression techniques available (OLS, Ridge, Lasso, Elastic Net), model selection determines which approach best balances fit and generalization for a given dataset. This comparison reveals how different regularizers impose different structural assumptions on the solution: Ridge (\(L_2\)) shrinks all coefficients uniformly, Lasso (\(L_1\)) drives some to exactly zero (performing feature selection), and Elastic Net combines both effects.

How: The code fits each regularized model across a range of regularization strengths \(\lambda\) and evaluates on a held-out test set. Cross-validation identifies the optimal \(\lambda\) for each method. The comparison plots show coefficient paths (how weights change with \(\lambda\)) and test error curves, revealing which regularizer best matches the data's underlying sparsity structure.

Connection: Model selection and regularization tuning are daily tasks in applied ML. Scikit-learn's RidgeCV, LassoCV, and ElasticNetCV automate this process, but understanding the underlying principles helps diagnose when models underperform.

# Compare different regularization strengths on a high-dimensional problem.
# (A redundant `from sklearn.model_selection import cross_val_score` was
# removed here; it is already imported at the top of the file.)

# Generate high-dimensional data: 50 features, only 10 informative.
np.random.seed(42)
X_high, y_high = make_classification(n_samples=100, n_features=50, 
                                     n_informative=10, n_redundant=40,
                                     random_state=42)

# Test different C values (C is the INVERSE of regularization strength)
C_values = np.logspace(-3, 3, 20)
train_scores = []
val_scores = []

for C in C_values:
    lr = LogisticRegression(C=C, max_iter=1000, random_state=42)
    
    # Accuracy on the full training set (optimistic estimate)
    lr.fit(X_high, y_high)
    train_scores.append(lr.score(X_high, y_high))
    
    # 5-fold cross-validation accuracy (generalization estimate)
    cv_scores = cross_val_score(lr, X_high, y_high, cv=5)
    val_scores.append(np.mean(cv_scores))

# Plot train vs CV accuracy and mark the best C
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.semilogx(C_values, train_scores, 'b-', linewidth=2, label='Training accuracy')
plt.semilogx(C_values, val_scores, 'r-', linewidth=2, label='CV accuracy')
optimal_idx = np.argmax(val_scores)
plt.axvline(C_values[optimal_idx], color='g', linestyle='--', 
           label=f'Optimal C={C_values[optimal_idx]:.3f}')
plt.xlabel('C (inverse regularization strength)')
plt.ylabel('Accuracy')
plt.title('Regularization Strength vs Performance')
plt.legend()
plt.grid(True, alpha=0.3)

# Weight magnitude: stronger regularization (smaller C) shrinks ||w||
plt.subplot(1, 2, 2)
weight_norms = []
for C in C_values:
    lr = LogisticRegression(C=C, max_iter=1000, random_state=42)
    lr.fit(X_high, y_high)
    weight_norms.append(np.linalg.norm(lr.coef_))

plt.semilogx(C_values, weight_norms, 'b-', linewidth=2)
plt.xlabel('C (inverse regularization strength)')
plt.ylabel('||w|| (weight magnitude)')
plt.title('Regularization Effect on Weights')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Optimal C value: {C_values[optimal_idx]:.4f}")
print(f"Best CV accuracy: {val_scores[optimal_idx]:.4f}")
print("\nKey insights:")
print("- Small C (strong regularization): underfitting")
print("- Large C (weak regularization): overfitting")
print("- Optimal C balances bias-variance tradeoff")

Summary

In this notebook, we covered:

  1. Linear Regression: MLE solution via normal equations

  2. Ridge Regression: L2 regularization prevents overfitting

  3. Bayesian Linear Regression: Provides uncertainty quantification

  4. Logistic Regression: Binary classification with sigmoid function

  5. Softmax Regression: Multiclass classification

  6. Model Selection: Choosing optimal regularization strength

Key Takeaways

  • Linear regression: MLE ≡ minimizing squared error, has closed-form solution

  • Regularization: Essential for high-dimensional or limited data problems

  • Bayesian approach: Provides uncertainty estimates, useful for decision-making

  • Logistic regression: No closed-form solution, requires iterative optimization

  • Softmax: Natural extension of logistic regression to multiple classes

  • Cross-validation: Critical for selecting hyperparameters

Exercises

  1. Implement Lasso regression (L1 regularization) using coordinate descent

  2. Derive the gradient for logistic regression

  3. Implement Newton's method for logistic regression

  4. Compare Ridge, Lasso, and Elastic Net on a high-dimensional dataset

  5. Implement Bayesian logistic regression using Laplace approximation