import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.optimize import minimize
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix
from sklearn.datasets import make_classification, load_iris
import pandas as pd
# Global plotting defaults and a fixed RNG seed for reproducibility.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
np.random.seed(42)
## 1. Linear Regression

### Model

\(y = \mathbf{w}^T\mathbf{x} + \epsilon\), with Gaussian noise \(\epsilon \sim \mathcal{N}(0, \sigma^2)\).

### Maximum Likelihood Estimation (MLE)

Objective: minimize the squared error \(\sum_i (y_i - \mathbf{w}^T\mathbf{x}_i)^2\).

Solution (normal equations): \(\mathbf{w} = (\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\mathbf{y}\)
# Generate synthetic data
# Noisy samples of the line y = 2 + 3x on [0, 10].
np.random.seed(42)
n_samples = 100
X_simple = np.linspace(0, 10, n_samples)
true_w = [2, 3] # intercept, slope
y_true = true_w[0] + true_w[1] * X_simple
noise = np.random.normal(0, 2, n_samples)  # Gaussian noise, sigma = 2
y_simple = y_true + noise

# Add intercept term
# Design matrix with a leading column of ones so w[0] acts as the bias.
X_design = np.column_stack([np.ones(n_samples), X_simple])
class LinearRegressionMLE:
    """Ordinary least-squares linear regression (MLE under Gaussian noise).

    The weights satisfy the normal equations w = (X^T X)^{-1} X^T y.
    Fix: the fit uses SVD-based `np.linalg.lstsq` instead of
    `np.linalg.solve` on X^T X, because this class is later applied to
    degree-9/15 polynomial feature matrices whose normal matrix is
    numerically singular; `solve` can fail or return garbage there,
    while `lstsq` returns the minimum-norm least-squares solution.
    """

    def fit(self, X, y):
        """Fit weights by least squares.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Design matrix; include a column of ones for an intercept.
        y : ndarray of shape (n_samples,)
            Target values.

        Returns
        -------
        self
        """
        # lstsq minimizes ||Xw - y||^2 directly, avoiding the squared
        # condition number incurred by forming X^T X explicitly.
        self.w, *_ = np.linalg.lstsq(X, y, rcond=None)
        return self

    def predict(self, X):
        """Return predictions X @ w."""
        return X @ self.w

    def mse(self, X, y):
        """Mean squared error of predictions on (X, y)."""
        predictions = self.predict(X)
        return np.mean((y - predictions) ** 2)
# Fit model
lr_mle = LinearRegressionMLE()
lr_mle.fit(X_design, y_simple)
y_pred = lr_mle.predict(X_design)

# Compare with sklearn
# sklearn fits its own intercept, so it receives the raw feature column.
lr_sklearn = LinearRegression()
lr_sklearn.fit(X_simple.reshape(-1, 1), y_simple)

print("Linear Regression Results:")
print(f"True parameters: intercept={true_w[0]}, slope={true_w[1]}")
print(f"\nMLE (custom): intercept={lr_mle.w[0]:.4f}, slope={lr_mle.w[1]:.4f}")
print(f"MSE (custom): {lr_mle.mse(X_design, y_simple):.4f}")
print(f"\nSklearn: intercept={lr_sklearn.intercept_:.4f}, slope={lr_sklearn.coef_[0]:.4f}")
print(f"MSE (sklearn): {mean_squared_error(y_simple, lr_sklearn.predict(X_simple.reshape(-1, 1))):.4f}")

# Visualize
plt.figure(figsize=(12, 5))

# Left panel: data, true line, and the MLE fit.
plt.subplot(1, 2, 1)
plt.scatter(X_simple, y_simple, alpha=0.6, label='Data')
plt.plot(X_simple, y_true, 'g--', linewidth=2, label='True line')
plt.plot(X_simple, y_pred, 'r-', linewidth=2, label='MLE fit')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Linear Regression: MLE')
plt.legend()
plt.grid(True, alpha=0.3)

# Residuals
# Right panel: residuals vs fitted values; a structureless cloud around 0
# supports the linear/Gaussian-noise modeling assumptions.
plt.subplot(1, 2, 2)
residuals = y_simple - y_pred
plt.scatter(y_pred, residuals, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
### Polynomial Regression and Overfitting
What and Why: Polynomial regression extends linear regression by fitting \(y = w_0 + w_1 x + w_2 x^2 + \cdots + w_M x^M\), demonstrating the fundamental bias-variance tradeoff. Low-degree polynomials underfit (high bias, low variance), while high-degree polynomials overfit by fitting noise in the training data (low bias, high variance). Understanding this tradeoff is essential for choosing model complexity in any ML system.
How: The code generates data from a sinusoidal function with added noise, then fits polynomials of increasing degree by expanding the input with PolynomialFeatures and solving the resulting linear least-squares problem. As degree \(M\) increases, training error monotonically decreases, but test error eventually increases when the model starts memorizing noise rather than learning the underlying signal.
Connection: This is the same phenomenon behind choosing the number of layers/neurons in neural networks, the depth of decision trees, or the number of features in any model. Regularization techniques (Ridge, Lasso, dropout, early stopping) all address overfitting by constraining model capacity.
# Generate nonlinear data
# Noisy samples of sin(2*pi*x) on [0, 1]; only 30 points, so high-degree
# polynomials can visibly overfit.
np.random.seed(42)
n = 30
X_poly = np.linspace(0, 1, n)
y_poly = np.sin(2 * np.pi * X_poly) + np.random.normal(0, 0.2, n)

# Try different polynomial degrees
degrees = [1, 3, 9, 15]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# Dense grid for evaluating each fit against the noiseless true function.
X_test_range = np.linspace(0, 1, 200).reshape(-1, 1)
y_true_test = np.sin(2 * np.pi * X_test_range.flatten())

for idx, degree in enumerate(degrees):
    ax = axes[idx // 2, idx % 2]
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree, include_bias=True)
    X_poly_train = poly.fit_transform(X_poly.reshape(-1, 1))
    X_poly_test = poly.transform(X_test_range)
    # Fit model
    lr = LinearRegressionMLE()
    lr.fit(X_poly_train, y_poly)
    y_pred_test = lr.predict(X_poly_test)
    # Calculate errors
    train_mse = lr.mse(X_poly_train, y_poly)
    # Test MSE is measured against the noiseless function, not noisy samples.
    test_mse = np.mean((y_true_test - y_pred_test)**2)
    # Plot
    ax.scatter(X_poly, y_poly, alpha=0.6, s=50, label='Training data')
    ax.plot(X_test_range, y_true_test, 'g--', linewidth=2, alpha=0.7, label='True function')
    ax.plot(X_test_range, y_pred_test, 'r-', linewidth=2, label=f'Degree {degree} fit')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title(f'Degree {degree}\nTrain MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}')
    ax.set_ylim(-2, 2)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Observation: Higher degree polynomials overfit!")
print("- Degree 1: Underfits (high bias)")
print("- Degree 3: Good balance")
print("- Degree 9+: Overfits (high variance)")
## 2. Ridge Regression (L2 Regularization)

### Objective

\(J(\mathbf{w}) = \|\mathbf{y} - \mathbf{X}\mathbf{w}\|^2 + \lambda\|\mathbf{w}\|^2\)

### Solution

\(\mathbf{w} = (\mathbf{X}^T\mathbf{X} + \lambda\mathbf{I})^{-1}\mathbf{X}^T\mathbf{y}\)

Ridge regression shrinks coefficients towards zero, preventing overfitting.
class RidgeRegression:
    """Ridge regression (L2-regularized least squares).

    Minimizes ||Xw - y||^2 + lambda * ||w||^2, whose closed form is
    w = (X^T X + lambda I)^{-1} X^T y.
    Fix: instead of `np.linalg.solve` on the normal matrix — which can
    fail or be wildly inaccurate when lambda_=0 and X^T X is singular
    (this class is used with lambda_=0 on degree-15 features) — fit via
    the algebraically equivalent augmented least-squares system
    lstsq([X; sqrt(lambda) I], [y; 0]), which is stable for any lambda_ >= 0.

    NOTE(review): as in the original, the penalty applies to *all*
    coefficients, including a bias column if X contains one (sklearn's
    Ridge excludes the intercept).
    """

    def __init__(self, lambda_=1.0):
        # Regularization strength; lambda_=0 reduces to ordinary least squares.
        self.lambda_ = lambda_

    def fit(self, X, y):
        """Fit weights on (X, y); returns self."""
        n_features = X.shape[1]
        # Stack sqrt(lambda) * I under X and zeros under y: the residual of
        # the augmented system is exactly ||Xw - y||^2 + lambda ||w||^2.
        X_aug = np.vstack([X, np.sqrt(self.lambda_) * np.eye(n_features)])
        y_aug = np.concatenate([y, np.zeros(n_features)])
        self.w, *_ = np.linalg.lstsq(X_aug, y_aug, rcond=None)
        return self

    def predict(self, X):
        """Return predictions X @ w."""
        return X @ self.w

    def mse(self, X, y):
        """Mean squared error of predictions on (X, y)."""
        return np.mean((y - self.predict(X)) ** 2)
# Test Ridge with high-degree polynomial
# Degree 15 on only 30 points overfits badly without regularization.
degree = 15
poly = PolynomialFeatures(degree=degree, include_bias=True)
X_poly_train = poly.fit_transform(X_poly.reshape(-1, 1))
X_poly_test = poly.transform(X_test_range)

# Different regularization strengths (0 reproduces plain least squares).
lambdas = [0, 0.001, 0.01, 0.1]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for idx, lam in enumerate(lambdas):
    ax = axes[idx // 2, idx % 2]
    ridge = RidgeRegression(lambda_=lam)
    ridge.fit(X_poly_train, y_poly)
    y_pred_test = ridge.predict(X_poly_test)
    train_mse = ridge.mse(X_poly_train, y_poly)
    # Test MSE is measured against the noiseless true function.
    test_mse = np.mean((y_true_test - y_pred_test)**2)
    ax.scatter(X_poly, y_poly, alpha=0.6, s=50, label='Training data')
    ax.plot(X_test_range, y_true_test, 'g--', linewidth=2, alpha=0.7, label='True function')
    ax.plot(X_test_range, y_pred_test, 'r-', linewidth=2, label='Ridge fit')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    # Fix: mojibake 'Ξ»' in the title replaced by the intended Greek lambda.
    ax.set_title(f'Ridge λ={lam}\nTrain MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}')
    ax.set_ylim(-2, 2)
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Weight magnitude comparison
# Track ||w|| across a log-spaced sweep of lambda (the regularization path).
lambdas_range = np.logspace(-4, 2, 50)
weights_norm = []
for lam in lambdas_range:
    ridge = RidgeRegression(lambda_=lam)
    ridge.fit(X_poly_train, y_poly)
    weights_norm.append(np.linalg.norm(ridge.w))
plt.figure(figsize=(10, 5))
plt.semilogx(lambdas_range, weights_norm, 'b-', linewidth=2)
# Fix: mojibake 'Ξ»' in the axis label and print message replaced by lambda.
plt.xlabel('λ (regularization strength)')
plt.ylabel('||w|| (weight norm)')
plt.title('Ridge Regularization Path')
plt.grid(True, alpha=0.3)
plt.show()
print("Ridge regularization shrinks weights as λ increases")
## 3. Bayesian Linear Regression

### Prior

\(p(\mathbf{w}) = \mathcal{N}(\mathbf{w} \mid \mathbf{0}, \alpha^{-1}\mathbf{I})\)

### Likelihood

\(p(\mathbf{y} \mid \mathbf{X}, \mathbf{w}) = \mathcal{N}(\mathbf{y} \mid \mathbf{X}\mathbf{w}, \sigma^2\mathbf{I})\)

### Posterior

\(p(\mathbf{w} \mid \mathbf{X}, \mathbf{y}) = \mathcal{N}(\mathbf{w} \mid \mathbf{w}_N, \mathbf{S}_N)\)

Where:
\(\mathbf{S}_N = (\alpha\mathbf{I} + \sigma^{-2}\mathbf{X}^T\mathbf{X})^{-1}\)
\(\mathbf{w}_N = \sigma^{-2}\mathbf{S}_N\mathbf{X}^T\mathbf{y}\)
### Predictive Distribution

\(p(y_* \mid \mathbf{x}_*, \mathbf{X}, \mathbf{y}) = \mathcal{N}\!\big(y_* \mid \mathbf{w}_N^T\mathbf{x}_*,\; \sigma^2 + \mathbf{x}_*^T\mathbf{S}_N\mathbf{x}_*\big)\)
class BayesianLinearRegression:
    """Bayesian linear regression with an isotropic Gaussian prior.

    Prior: w ~ N(0, alpha^{-1} I); likelihood noise variance sigma2.
    The weight posterior is Gaussian with covariance S_N and mean w_N,
    computed in closed form in `fit`.
    """

    def __init__(self, alpha=1.0, sigma2=1.0):
        self.alpha = alpha    # prior precision
        self.sigma2 = sigma2  # observation noise variance

    def fit(self, X, y):
        """Compute the exact Gaussian posterior N(w_N, S_N); returns self."""
        d = X.shape[1]
        noise_precision = 1.0 / self.sigma2
        # Posterior covariance: S_N = (alpha I + sigma^{-2} X^T X)^{-1}
        posterior_precision = self.alpha * np.eye(d) + noise_precision * (X.T @ X)
        self.S_N = np.linalg.inv(posterior_precision)
        # Posterior mean: w_N = sigma^{-2} S_N X^T y
        self.w_N = noise_precision * self.S_N @ X.T @ y
        return self

    def predict(self, X, return_std=False):
        """Posterior-predictive mean, optionally with its standard deviation.

        The predictive variance sigma^2 + x^T S_N x combines irreducible
        observation noise with weight uncertainty, so it grows for inputs
        far from the training data.
        """
        mean = X @ self.w_N
        if not return_std:
            return mean
        variance = self.sigma2 + np.sum((X @ self.S_N) * X, axis=1)
        return mean, np.sqrt(variance)
# Generate data
np.random.seed(42)
n_train = 20
X_bayes = np.linspace(0, 1, n_train)
y_bayes = np.sin(2 * np.pi * X_bayes) + np.random.normal(0, 0.2, n_train)

# Use polynomial features
degree = 9
poly = PolynomialFeatures(degree=degree, include_bias=True)
X_bayes_train = poly.fit_transform(X_bayes.reshape(-1, 1))

# Test points
# Extend slightly beyond [0, 1] to show uncertainty growth outside the data.
X_test_fine = np.linspace(-0.2, 1.2, 200)
X_bayes_test = poly.transform(X_test_fine.reshape(-1, 1))
y_true_fine = np.sin(2 * np.pi * X_test_fine)

# Fit Bayesian model
alpha = 2.0  # Regularization (stronger prior)
sigma2 = 0.2**2  # Noise variance; matches the true noise sigma used above
blr = BayesianLinearRegression(alpha=alpha, sigma2=sigma2)
blr.fit(X_bayes_train, y_bayes)

# Predictions with uncertainty
y_pred_mean, y_pred_std = blr.predict(X_bayes_test, return_std=True)

# Compare with MLE
# Unregularized degree-9 fit on the same features, for contrast.
lr_mle = LinearRegressionMLE()
lr_mle.fit(X_bayes_train, y_bayes)
y_pred_mle = lr_mle.predict(X_bayes_test)

# Visualize
plt.figure(figsize=(14, 6))

# Left panel: Bayesian posterior-predictive mean with a +/- 2 std band.
plt.subplot(1, 2, 1)
plt.scatter(X_bayes, y_bayes, s=80, alpha=0.8, label='Training data', zorder=3)
plt.plot(X_test_fine, y_true_fine, 'g--', linewidth=2, alpha=0.7, label='True function')
plt.plot(X_test_fine, y_pred_mean, 'r-', linewidth=2, label='Bayesian mean')
plt.fill_between(X_test_fine,
                 y_pred_mean - 2*y_pred_std,
                 y_pred_mean + 2*y_pred_std,
                 alpha=0.3, color='red', label='95% credible interval')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Bayesian Linear Regression with Uncertainty')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xlim(-0.2, 1.2)

# Right panel: MLE point estimate — no uncertainty band is available.
plt.subplot(1, 2, 2)
plt.scatter(X_bayes, y_bayes, s=80, alpha=0.8, label='Training data', zorder=3)
plt.plot(X_test_fine, y_true_fine, 'g--', linewidth=2, alpha=0.7, label='True function')
plt.plot(X_test_fine, y_pred_mle, 'b-', linewidth=2, label='MLE (no uncertainty)')
plt.xlabel('x')
plt.ylabel('y')
plt.title('MLE Linear Regression (Point Estimate)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xlim(-0.2, 1.2)
plt.tight_layout()
plt.show()

print("Key observation:")
print("- Bayesian method provides uncertainty estimates")
print("- Uncertainty is larger far from training data")
print("- MLE gives only point estimates (no uncertainty)")
## 4. Logistic Regression for Binary Classification

### Model

\(p(y=1 \mid \mathbf{x}) = \sigma(\mathbf{w}^T\mathbf{x}) = \dfrac{1}{1 + e^{-\mathbf{w}^T\mathbf{x}}}\)

### Negative Log-Likelihood (Cross-Entropy Loss)

\(\mathrm{NLL}(\mathbf{w}) = -\sum_i \big[y_i \log \mu_i + (1 - y_i)\log(1 - \mu_i)\big]\)

Where \(\mu_i = \sigma(\mathbf{w}^T\mathbf{x}_i)\)

No closed-form solution — use gradient descent or Newton's method.
class LogisticRegressionGD:
    """Binary logistic regression trained by batch gradient descent.

    Minimizes the cross-entropy loss with optional L2 regularization.
    The first column of X is assumed to be the intercept column and is
    excluded from the penalty.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000, lambda_=0):
        self.lr = learning_rate
        self.n_iterations = n_iterations
        self.lambda_ = lambda_  # L2 regularization strength
        self.losses = []  # loss recorded every 100 iterations during fit

    def sigmoid(self, z):
        # Clip logits so np.exp never overflows.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def compute_loss(self, X, y):
        """Cross-entropy loss plus L2 penalty (bias term excluded)."""
        m = len(y)
        h = self.sigmoid(X @ self.w)
        epsilon = 1e-10  # keeps log() finite when h saturates at 0 or 1
        ce_loss = -np.mean(y * np.log(h + epsilon) + (1 - y) * np.log(1 - h + epsilon))
        reg_loss = (self.lambda_ / (2 * m)) * np.sum(self.w[1:] ** 2)
        return ce_loss + reg_loss

    def fit(self, X, y):
        """Run batch gradient descent; expects an intercept column in X.

        Returns self.
        Fix: `self.losses` is reset here, so refitting the same estimator
        no longer appends to the previous run's loss history (which also
        corrupted downstream loss-curve plots).
        """
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features)
        self.losses = []
        for i in range(self.n_iterations):
            # Forward pass
            h = self.sigmoid(X @ self.w)
            # Gradient of the mean cross-entropy: X^T (h - y) / n
            gradient = (1 / n_samples) * X.T @ (h - y)
            # L2 regularization gradient (bias excluded)
            gradient[1:] += (self.lambda_ / n_samples) * self.w[1:]
            self.w -= self.lr * gradient
            # Track loss every 100 iterations
            if i % 100 == 0:
                self.losses.append(self.compute_loss(X, y))
        return self

    def predict_proba(self, X):
        """Return P(y=1 | x) for each row of X."""
        return self.sigmoid(X @ self.w)

    def predict(self, X, threshold=0.5):
        """Return hard 0/1 labels at the given probability threshold."""
        return (self.predict_proba(X) >= threshold).astype(int)
# Generate binary classification data
# Two well-separated 2-D clusters (class_sep=2) for an easy visual demo.
np.random.seed(42)
X_cls, y_cls = make_classification(n_samples=200, n_features=2, n_informative=2,
                                   n_redundant=0, n_clusters_per_class=1,
                                   class_sep=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_cls, y_cls, test_size=0.3, random_state=42)

# Add intercept
# The custom implementation expects an explicit ones column for the bias.
X_train_int = np.column_stack([np.ones(len(X_train)), X_train])
X_test_int = np.column_stack([np.ones(len(X_test)), X_test])

# Train custom logistic regression
log_reg_custom = LogisticRegressionGD(learning_rate=0.1, n_iterations=1000, lambda_=0.1)
log_reg_custom.fit(X_train_int, y_train)
y_pred_custom = log_reg_custom.predict(X_test_int)

# Train sklearn logistic regression
# NOTE(review): sklearn's C is the *inverse* regularization strength; C=10
# is comparable to, but not numerically identical with, lambda_=0.1 above.
log_reg_sklearn = LogisticRegression(C=10, max_iter=1000)
log_reg_sklearn.fit(X_train, y_train)
y_pred_sklearn = log_reg_sklearn.predict(X_test)

print("Logistic Regression Results:")
print(f"Custom implementation accuracy: {accuracy_score(y_test, y_pred_custom):.4f}")
print(f"Sklearn implementation accuracy: {accuracy_score(y_test, y_pred_sklearn):.4f}")
# Visualize decision boundary
def plot_decision_boundary_logistic(model, X, y, title, is_sklearn=False):
    """Plot P(y=1) over a dense 2-D grid plus the 0.5 decision contour.

    Parameters
    ----------
    model : fitted binary classifier exposing predict_proba
    X : (n, 2) feature array used to size the grid and scatter the points
    y : labels used to color the scatter
    title : plot title
    is_sklearn : if True, use sklearn's predict_proba convention (no
        intercept column; class-1 probability in column 1); otherwise
        prepend a ones column as the custom model expects.

    Draws onto the current matplotlib axes; returns None.
    """
    h = 0.02  # grid resolution
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    if is_sklearn:
        Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    else:
        # Custom model was trained with an explicit intercept column.
        X_grid = np.column_stack([np.ones(len(xx.ravel())), xx.ravel(), yy.ravel()])
        Z = model.predict_proba(X_grid)
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.3, levels=20, cmap='RdYlBu')
    # Heavy black line marks the 0.5 decision boundary.
    plt.contour(xx, yy, Z, levels=[0.5], colors='black', linewidths=2)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlBu', edgecolors='black', s=50)
    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.colorbar(label='P(y=1)')
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
# NOTE(review): the plt.subplot() calls below create fresh axes on the
# current figure rather than reusing `axes`; it works, but `axes` is unused.
plt.subplot(1, 3, 1)
plot_decision_boundary_logistic(log_reg_custom, X_test, y_test, 'Custom Logistic Regression')
plt.subplot(1, 3, 2)
plot_decision_boundary_logistic(log_reg_sklearn, X_test, y_test, 'Sklearn Logistic Regression', is_sklearn=True)

# Loss curve
# Losses were recorded every 100 iterations during training, hence the
# x-axis step of 100.
plt.subplot(1, 3, 3)
plt.plot(range(0, len(log_reg_custom.losses)*100, 100), log_reg_custom.losses, 'b-', linewidth=2)
plt.xlabel('Iteration')
plt.ylabel('Loss (Cross-Entropy)')
plt.title('Training Loss Convergence')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
## 5. Multinomial Logistic Regression (Softmax)

### Model (K classes)

\(p(y=k \mid \mathbf{x}) = \dfrac{\exp(\mathbf{w}_k^T\mathbf{x})}{\sum_{j=1}^{K}\exp(\mathbf{w}_j^T\mathbf{x})}\)

This is called the softmax function.

### Cross-Entropy Loss

\(\mathcal{L}(\mathbf{W}) = -\sum_i \sum_{k=1}^{K} y_{ik}\,\log p(y=k \mid \mathbf{x}_i)\), where \(y_{ik}\) is the one-hot encoding of label \(y_i\).
class SoftmaxRegression:
    """Multinomial logistic regression (softmax) via batch gradient descent.

    Weight matrix W has one column per class. The first row of W (paired
    with an explicit intercept column in X) is the bias and is excluded
    from the L2 penalty.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000, lambda_=0.01):
        self.lr = learning_rate
        self.n_iterations = n_iterations
        self.lambda_ = lambda_  # L2 regularization strength
        self.losses = []  # loss recorded every 100 iterations during fit

    def softmax(self, Z):
        """Numerically stable softmax (shift by row max before exp)."""
        exp_Z = np.exp(Z - np.max(Z, axis=1, keepdims=True))
        return exp_Z / np.sum(exp_Z, axis=1, keepdims=True)

    def one_hot(self, y, n_classes):
        """Convert integer labels to a (n_samples, n_classes) one-hot matrix."""
        one_hot = np.zeros((len(y), n_classes))
        one_hot[np.arange(len(y)), y] = 1
        return one_hot

    def compute_loss(self, X, y):
        """Cross-entropy loss plus L2 penalty (bias row excluded)."""
        m = len(y)
        logits = X @ self.W
        probs = self.softmax(logits)
        y_one_hot = self.one_hot(y, self.n_classes)
        # 1e-10 keeps log() finite when a probability underflows to 0.
        ce_loss = -np.mean(np.sum(y_one_hot * np.log(probs + 1e-10), axis=1))
        reg_loss = (self.lambda_ / (2 * m)) * np.sum(self.W[1:, :] ** 2)
        return ce_loss + reg_loss

    def fit(self, X, y):
        """Run batch gradient descent; expects an intercept column in X.

        Returns self.
        Fix: `self.losses` is reset here, so refitting the same estimator
        no longer appends to the previous run's loss history.
        """
        n_samples, n_features = X.shape
        self.n_classes = len(np.unique(y))
        # One weight vector (column) per class.
        self.W = np.zeros((n_features, self.n_classes))
        self.losses = []
        y_one_hot = self.one_hot(y, self.n_classes)
        for i in range(self.n_iterations):
            # Forward pass
            logits = X @ self.W
            probs = self.softmax(logits)
            # Gradient of mean cross-entropy: X^T (probs - Y) / n
            gradient = (1 / n_samples) * X.T @ (probs - y_one_hot)
            # L2 regularization gradient (bias row excluded)
            gradient[1:, :] += (self.lambda_ / n_samples) * self.W[1:, :]
            self.W -= self.lr * gradient
            # Track loss every 100 iterations
            if i % 100 == 0:
                self.losses.append(self.compute_loss(X, y))
        return self

    def predict_proba(self, X):
        """Return the (n_samples, n_classes) class-probability matrix."""
        logits = X @ self.W
        return self.softmax(logits)

    def predict(self, X):
        """Return the most probable class index for each row of X."""
        probs = self.predict_proba(X)
        return np.argmax(probs, axis=1)
# Load iris dataset (3 classes)
iris = load_iris()
X_iris = iris.data[:, :2] # Use only first 2 features for visualization
y_iris = iris.target
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris, y_iris, test_size=0.3, random_state=42)

# Add intercept
# The custom implementation expects an explicit ones column for the bias.
X_train_iris_int = np.column_stack([np.ones(len(X_train_iris)), X_train_iris])
X_test_iris_int = np.column_stack([np.ones(len(X_test_iris)), X_test_iris])

# Train custom softmax regression
softmax_custom = SoftmaxRegression(learning_rate=0.1, n_iterations=2000, lambda_=0.1)
softmax_custom.fit(X_train_iris_int, y_train_iris)
y_pred_softmax = softmax_custom.predict(X_test_iris_int)

# Train sklearn logistic regression (multinomial)
# NOTE(review): the `multi_class` parameter is deprecated in recent
# scikit-learn releases (multinomial is the default there) — confirm the
# pinned sklearn version.
log_reg_multi = LogisticRegression(multi_class='multinomial', max_iter=2000)
log_reg_multi.fit(X_train_iris, y_train_iris)
y_pred_sklearn_multi = log_reg_multi.predict(X_test_iris)

print("Softmax Regression Results (Iris 3-class):")
print(f"Custom implementation accuracy: {accuracy_score(y_test_iris, y_pred_softmax):.4f}")
print(f"Sklearn implementation accuracy: {accuracy_score(y_test_iris, y_pred_sklearn_multi):.4f}")

# Confusion matrices
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Custom softmax confusion matrix
plt.subplot(1, 3, 1)
cm_custom = confusion_matrix(y_test_iris, y_pred_softmax)
sns.heatmap(cm_custom, annot=True, fmt='d', cmap='Blues',
            xticklabels=iris.target_names, yticklabels=iris.target_names)
plt.title('Custom Softmax Regression')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

# Sklearn confusion matrix
plt.subplot(1, 3, 2)
cm_sklearn = confusion_matrix(y_test_iris, y_pred_sklearn_multi)
sns.heatmap(cm_sklearn, annot=True, fmt='d', cmap='Blues',
            xticklabels=iris.target_names, yticklabels=iris.target_names)
plt.title('Sklearn Multinomial Logistic')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

# Loss curve
# Losses were recorded every 100 iterations during training.
plt.subplot(1, 3, 3)
plt.plot(range(0, len(softmax_custom.losses)*100, 100), softmax_custom.losses, 'b-', linewidth=2)
plt.xlabel('Iteration')
plt.ylabel('Loss (Cross-Entropy)')
plt.title('Training Loss Convergence')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Decision boundaries
def plot_multiclass_decision_boundary(model, X, y, title, is_sklearn=False):
    """Plot hard-label decision regions of a multiclass model over a 2-D grid.

    Parameters
    ----------
    model : fitted classifier exposing predict
    X : (n, 2) feature array used to size the grid and scatter the points
    y : labels used to color the scatter
    title : plot title
    is_sklearn : if True, call predict on raw grid coordinates; otherwise
        prepend a ones (intercept) column as the custom model expects.

    Draws onto the current matplotlib axes; returns None.
    """
    h = 0.02  # grid resolution
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    if is_sklearn:
        Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    else:
        # Custom model was trained with an explicit intercept column.
        X_grid = np.column_stack([np.ones(len(xx.ravel())), xx.ravel(), yy.ravel()])
        Z = model.predict(X_grid)
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.3, cmap='viridis')
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolors='black', s=50)
    plt.title(title)
    plt.xlabel(iris.feature_names[0])
    plt.ylabel(iris.feature_names[1])
# Side-by-side decision regions for the custom and sklearn multiclass models.
plt.figure(figsize=(14, 5))
plt.subplot(1, 2, 1)
plot_multiclass_decision_boundary(softmax_custom, X_test_iris, y_test_iris,
                                  'Custom Softmax Regression')
plt.subplot(1, 2, 2)
plot_multiclass_decision_boundary(log_reg_multi, X_test_iris, y_test_iris,
                                  'Sklearn Multinomial Logistic', is_sklearn=True)
plt.tight_layout()
plt.show()
## 6. Model Selection and Regularization Comparison

What and Why: With regularization in the toolbox, model selection determines the regularization strength that best balances fit and generalization for a given dataset. Different regularizers impose different structural assumptions on the solution: Ridge (\(L_2\)) shrinks all coefficients uniformly, Lasso (\(L_1\)) drives some to exactly zero (performing feature selection), and Elastic Net combines both effects.

How: The code below sweeps the regularization strength of a logistic regression classifier (sklearn's \(C\), the *inverse* strength) on high-dimensional data, comparing training accuracy against 5-fold cross-validation accuracy. The resulting curves, together with the weight-norm plot, show where the model transitions from underfitting (strong regularization) to overfitting (weak regularization), and cross-validation identifies the optimal \(C\).

Connection: Model selection and regularization tuning are daily tasks in applied ML. Scikit-learn's RidgeCV, LassoCV, ElasticNetCV, and LogisticRegressionCV automate this process, but understanding the underlying principles helps diagnose when models underperform.
# Compare different regularization strengths
from sklearn.model_selection import cross_val_score  # NOTE(review): redundant — already imported at the top of the file

# Generate high-dimensional data
# 50 features but only 10 informative: regularization matters here.
np.random.seed(42)
X_high, y_high = make_classification(n_samples=100, n_features=50,
                                     n_informative=10, n_redundant=40,
                                     random_state=42)

# Test different C values (inverse of regularization strength)
C_values = np.logspace(-3, 3, 20)
train_scores = []
val_scores = []
for C in C_values:
    lr = LogisticRegression(C=C, max_iter=1000, random_state=42)
    # Train score
    lr.fit(X_high, y_high)
    train_scores.append(lr.score(X_high, y_high))
    # Cross-validation score (5-fold)
    cv_scores = cross_val_score(lr, X_high, y_high, cv=5)
    val_scores.append(np.mean(cv_scores))

# Plot
plt.figure(figsize=(12, 5))

# Left panel: train vs CV accuracy across the C sweep; the gap between
# them widens where the model overfits.
plt.subplot(1, 2, 1)
plt.semilogx(C_values, train_scores, 'b-', linewidth=2, label='Training accuracy')
plt.semilogx(C_values, val_scores, 'r-', linewidth=2, label='CV accuracy')
optimal_idx = np.argmax(val_scores)
plt.axvline(C_values[optimal_idx], color='g', linestyle='--',
            label=f'Optimal C={C_values[optimal_idx]:.3f}')
plt.xlabel('C (inverse regularization strength)')
plt.ylabel('Accuracy')
plt.title('Regularization Strength vs Performance')
plt.legend()
plt.grid(True, alpha=0.3)

# Weight magnitude
# Right panel: ||w|| grows as regularization weakens (larger C).
plt.subplot(1, 2, 2)
weight_norms = []
for C in C_values:
    lr = LogisticRegression(C=C, max_iter=1000, random_state=42)
    lr.fit(X_high, y_high)
    weight_norms.append(np.linalg.norm(lr.coef_))
plt.semilogx(C_values, weight_norms, 'b-', linewidth=2)
plt.xlabel('C (inverse regularization strength)')
plt.ylabel('||w|| (weight magnitude)')
plt.title('Regularization Effect on Weights')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"Optimal C value: {C_values[optimal_idx]:.4f}")
print(f"Best CV accuracy: {val_scores[optimal_idx]:.4f}")
print("\nKey insights:")
print("- Small C (strong regularization): underfitting")
print("- Large C (weak regularization): overfitting")
print("- Optimal C balances bias-variance tradeoff")
## Summary
In this notebook, we covered:
Linear Regression: MLE solution via normal equations
Ridge Regression: L2 regularization prevents overfitting
Bayesian Linear Regression: Provides uncertainty quantification
Logistic Regression: Binary classification with sigmoid function
Softmax Regression: Multiclass classification
Model Selection: Choosing optimal regularization strength
### Key Takeaways
Linear regression: MLE is equivalent to minimizing squared error, and has a closed-form solution
Regularization: Essential for high-dimensional or limited data problems
Bayesian approach: Provides uncertainty estimates, useful for decision-making
Logistic regression: No closed-form solution, requires iterative optimization
Softmax: Natural extension of logistic regression to multiple classes
Cross-validation: Critical for selecting hyperparameters
## Exercises
Implement Lasso regression (L1 regularization) using coordinate descent
Derive the gradient for logistic regression
Implement Newton's method for logistic regression
Compare Ridge, Lasso, and Elastic Net on a high-dimensional dataset
Implement Bayesian logistic regression using Laplace approximation