# Setup
# Standard scientific-Python stack used throughout the solutions notebook.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# sklearn: datasets, model selection, preprocessing, and the models used in Ch. 9-12.
from sklearn.datasets import load_digits, make_classification, make_moons
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.stats import multivariate_normal

# Consistent plot styling and a fixed global seed so every run reproduces
# the same figures and printed numbers.
sns.set_style('whitegrid')
np.random.seed(42)
print("βœ… Setup complete!")

Chapter 9: Linear Regression Solutions

Solution 9.1: Maximum Likelihood Estimation

# 1. Generate data: y = 3x + 2 + noise
np.random.seed(42)
n_samples = 50
X = np.random.uniform(0, 10, n_samples)
true_slope, true_intercept = 3, 2
noise = np.random.normal(0, 0.5, n_samples)
y = true_slope * X + true_intercept + noise

print(f"True parameters: slope={true_slope}, intercept={true_intercept}")

# 2. MLE via least squares.  The textbook closed form is the normal
# equations w = (X^T X)^(-1) X^T y, but explicitly inverting X^T X is
# numerically unstable when the design matrix is ill-conditioned.
# np.linalg.lstsq solves the same minimization with an SVD instead,
# which is the recommended approach.
# Add bias term (column of 1s) so the intercept is learned as a weight.
X_with_bias = np.column_stack([np.ones(n_samples), X])

# Least-squares solution of X_with_bias @ w = y (equivalent to the normal equations)
w_mle, *_ = np.linalg.lstsq(X_with_bias, y, rcond=None)

intercept_mle, slope_mle = w_mle
print(f"\nMLE estimates: slope={slope_mle:.3f}, intercept={intercept_mle:.3f}")

# 3. Plot the raw samples together with the true and the fitted line.
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.5, label='Data')

# Two x-coordinates suffice to draw each straight line.
x_line = np.array([0, 10])
line_specs = [
    (true_slope, true_intercept, 'g--', 'True line'),
    (slope_mle, intercept_mle, 'r-', 'MLE fit'),
]
for slope, intercept, fmt, label in line_specs:
    plt.plot(x_line, slope * x_line + intercept, fmt, linewidth=2, label=label)

plt.xlabel('x')
plt.ylabel('y')
plt.title('Linear Regression: Maximum Likelihood Estimation')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# 4. Sanity-check the closed-form solution against sklearn's LinearRegression;
#    both should agree to machine precision.
sk_model = LinearRegression()
sk_model.fit(X.reshape(-1, 1), y)

print(f"\nsklearn estimates: slope={sk_model.coef_[0]:.3f}, intercept={sk_model.intercept_:.3f}")
match = np.allclose([intercept_mle, slope_mle], [sk_model.intercept_, sk_model.coef_[0]])
print(f"\nMethods match: {match}")

# Residual diagnostics: mean squared error and coefficient of determination.
y_pred = slope_mle * X + intercept_mle
residuals = y - y_pred
mse = np.mean(residuals ** 2)
r2 = 1 - np.sum(residuals ** 2) / np.sum((y - y.mean()) ** 2)
print(f"\nMean Squared Error: {mse:.4f}")
print(f"RΒ² score: {r2:.4f}")

Solution 9.2: Polynomial Regression & Overfitting

# 1. Generate noisy training samples of the target y = sin(2πx)
np.random.seed(42)
n = 20
X_train = np.sort(np.random.uniform(0, 1, n))
y_train = np.sin(2 * np.pi * X_train) + np.random.normal(0, 0.1, n)

# Dense, noise-free grid used to draw smooth prediction curves
X_test = np.linspace(0, 1, 100)
y_test_true = np.sin(2 * np.pi * X_test)

# 2-4. Polynomial degrees to compare, one colour per fit
degrees = [1, 3, 5, 9, 15]
colors = ['blue', 'green', 'orange', 'red', 'purple']

plt.figure(figsize=(14, 8))

# Show the raw samples and the ground-truth curve before overlaying fits
plt.scatter(X_train, y_train, s=50, alpha=0.6, label='Training data', zorder=5)
plt.plot(X_test, y_test_true, 'k--', linewidth=2, label='True function', alpha=0.3)

training_mses = []

for degree, color in zip(degrees, colors):
    # Basis expansion: map x to [1, x, x^2, ..., x^degree]
    basis = PolynomialFeatures(degree)
    train_features = basis.fit_transform(X_train.reshape(-1, 1))
    test_features = basis.transform(X_test.reshape(-1, 1))

    # Ordinary least squares on the expanded features
    model = LinearRegression()
    model.fit(train_features, y_train)

    # Predictions on both the training points and the dense grid
    train_pred = model.predict(train_features)
    test_pred = model.predict(test_features)

    # Record the in-sample error for the complexity analysis below
    train_mse = np.mean((y_train - train_pred) ** 2)
    training_mses.append(train_mse)

    # Smooth prediction curve for this degree
    plt.plot(X_test, test_pred, color=color, linewidth=2,
             label=f'Degree {degree} (MSE={train_mse:.4f})', alpha=0.7)

plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Polynomial Regression: Bias-Variance Tradeoff', fontsize=14)
plt.legend(loc='upper right')
plt.ylim(-2, 2)
plt.grid(True, alpha=0.3)
plt.show()

# 5. Report the in-sample error for every fitted degree
print("Training MSE by degree:")
for degree, mse in zip(degrees, training_mses):
    print(f"  Degree {degree:2d}: {mse:.6f}")

# Interpretation of the bias-variance behaviour seen above
analysis_lines = [
    "\nπŸ’‘ Analysis:",
    "  β€’ Degree 1 (linear): UNDERFITS - high bias, can't capture sine wave",
    "  β€’ Degree 3-5: GOOD FIT - balances bias and variance",
    "  β€’ Degree 9-15: OVERFITS - low training error but wild oscillations",
    "  β€’ High-degree polynomials memorize noise instead of learning pattern!",
]
for line in analysis_lines:
    print(line)

# Training MSE falls monotonically with degree - complexity always helps in-sample
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(degrees, training_mses, 'o-', linewidth=2, markersize=8)
ax.set_xlabel('Polynomial Degree', fontsize=12)
ax.set_ylabel('Training MSE', fontsize=12)
ax.set_title('Training Error Decreases with Model Complexity', fontsize=14)
ax.grid(True, alpha=0.3)
plt.show()

Solution 9.3: Ridge Regression (L2 Regularization)

# 1. High-dimensional regression problem: more features (p=100) than samples (n=50),
#    which makes unregularized least squares ill-posed.
np.random.seed(42)
n, p = 50, 100

# Sparse ground truth: only the first 10 coefficients are non-zero
w_true = np.concatenate([np.random.randn(10) * 2, np.zeros(p - 10)])

# Random Gaussian design matrix and noisy linear targets
X = np.random.randn(n, p)
noise = np.random.randn(n) * 0.5
y = X.dot(w_true) + noise

print(f"Data shape: X={X.shape}, y={y.shape}")
print(f"True weights: {np.count_nonzero(w_true)} non-zero out of {p}")

# 2. Implement Ridge: solve (X^T X + Ξ»I) w = X^T y
def ridge_regression(X, y, lambda_reg):
    """Closed-form ridge regression.

    Solves the regularized normal equations (X^T X + Ξ»I) w = X^T y with
    np.linalg.solve rather than forming the explicit matrix inverse,
    which is both faster and more numerically stable for ill-conditioned
    design matrices.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Design matrix.
    y : ndarray of shape (n_samples,)
        Target values.
    lambda_reg : float
        L2 regularization strength (Ξ» >= 0).

    Returns
    -------
    ndarray of shape (n_features,)
        Ridge weight vector.
    """
    d = X.shape[1]
    A = X.T @ X + lambda_reg * np.eye(d)
    return np.linalg.solve(A, X.T @ y)

# 3. Sweep regularization strengths spanning five orders of magnitude
lambdas = [0.001, 0.01, 0.1, 1, 10, 100]
weights_norms = []
mses = []

for lam in lambdas:
    w_ridge = ridge_regression(X, y, lam)

    # Weight magnitude shrinks as Ξ» grows
    norm_w = np.linalg.norm(w_ridge)
    weights_norms.append(norm_w)

    # In-sample reconstruction error
    residual = y - X @ w_ridge
    mse = np.mean(residual ** 2)
    mses.append(mse)

    print(f"Ξ»={lam:6.3f}: ||w||={norm_w:6.2f}, MSE={mse:.4f}")

# 4. Weight shrinkage and training error as functions of Ξ» (log x-axis)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: ||w|| shrinks toward zero as regularization increases
ax1.semilogx(lambdas, weights_norms, 'o-', linewidth=2, markersize=8)
ax1.axhline(y=np.linalg.norm(w_true), color='r', linestyle='--', label='True ||w||')
ax1.legend()

# Right panel: training MSE rises as the model is constrained
ax2.semilogx(lambdas, mses, 's-', linewidth=2, markersize=8, color='orange')

# Shared axis decoration for both panels
for ax, ylabel, title in (
    (ax1, '||w|| (Weight Magnitude)', 'Ridge Regularization Shrinks Weights'),
    (ax2, 'Training MSE', 'Training Error vs Regularization'),
):
    ax.set_xlabel('Ξ» (Regularization)', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 5. Find optimal Ξ» using 5-fold cross-validation.
# (cross_val_score is already imported at the top of the file; the
# redundant re-import that used to live here has been removed.)

cv_scores = []
for lam in lambdas:
    ridge = Ridge(alpha=lam)
    # sklearn maximizes scores, so MSE is returned negated; flip the sign back.
    scores = cross_val_score(ridge, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_scores.append(-scores.mean())

# Ξ» with the lowest held-out error wins
optimal_idx = np.argmin(cv_scores)
optimal_lambda = lambdas[optimal_idx]

plt.figure(figsize=(8, 5))
plt.semilogx(lambdas, cv_scores, 'o-', linewidth=2, markersize=8)
plt.axvline(x=optimal_lambda, color='r', linestyle='--', 
            label=f'Optimal Ξ»={optimal_lambda}')
plt.xlabel('Ξ» (Regularization)', fontsize=12)
plt.ylabel('Cross-Validation MSE', fontsize=12)
plt.title('Model Selection via Cross-Validation', fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"\n✨ Optimal λ = {optimal_lambda} (minimizes CV error)")

Solution 9.4: Bayesian Linear Regression

# Bayesian Linear Regression with predictive uncertainty

def bayesian_linear_regression(X, y, alpha=1.0, beta=25.0):
    """Fit a Bayesian linear model with an isotropic Gaussian prior.

    Model:
        prior       w ~ N(0, alpha^(-1) I)
        likelihood  y | X, w ~ N(Xw, beta^(-1) I)
    which yields a Gaussian posterior w | X, y ~ N(m_N, S_N).

    Parameters
    ----------
    X : ndarray of shape (n, d) design matrix
    y : ndarray of shape (n,) targets
    alpha : float, prior precision
    beta : float, observation-noise precision

    Returns
    -------
    (m_N, S_N) : posterior mean (d,) and covariance (d, d)
    """
    d = X.shape[1]

    # Posterior precision matrix: alpha*I + beta*X^T X; invert to get S_N
    # (the covariance itself is returned, so the inverse is unavoidable here).
    S_N_inv = alpha * np.eye(d) + beta * X.T @ X
    S_N = np.linalg.inv(S_N_inv)

    # Posterior mean: m_N = beta * S_N X^T y
    m_N = beta * S_N @ X.T @ y

    return m_N, S_N

def predictive_distribution(X_new, m_N, S_N, beta):
    """Posterior-predictive moments p(y*|x*, X, y) at each row of X_new.

    Parameters
    ----------
    X_new : ndarray of shape (m, d) query points
    m_N : ndarray of shape (d,) posterior mean of the weights
    S_N : ndarray of shape (d, d) posterior covariance of the weights
    beta : float, observation-noise precision

    Returns
    -------
    (y_mean, y_var) : predictive mean and variance, each of shape (m,)
    """
    # Predictive mean: x*^T m_N for every query point
    y_mean = X_new @ m_N

    # Predictive variance: irreducible noise beta^(-1) plus the model
    # uncertainty x*^T S_N x*, computed per row via einsum.
    model_var = np.einsum('ij,jk,ik->i', X_new, S_N, X_new)
    y_var = 1 / beta + model_var

    return y_mean, y_var

# Ground-truth target function and a dense evaluation grid
def true_func(x):
    """Noiseless target: sin(2*pi*x)."""
    return np.sin(2 * np.pi * x)

X_test = np.linspace(0, 1, 100).reshape(-1, 1)
y_true = true_func(X_test)

# Posterior behaviour is compared across increasingly large training sets
data_sizes = [5, 20, 100]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, n in zip(axes, data_sizes):
    # 1. Fresh training set of size n (fixed seed keeps panels comparable)
    np.random.seed(42)
    X_train = np.random.uniform(0, 1, n).reshape(-1, 1)
    y_train = true_func(X_train).ravel() + np.random.normal(0, 0.2, n)

    # 2-3. Posterior over the weights, then the predictive distribution
    #      evaluated on the dense grid
    m_N, S_N = bayesian_linear_regression(X_train, y_train, alpha=1.0, beta=25.0)
    y_mean, y_var = predictive_distribution(X_test, m_N, S_N, beta=25.0)
    y_std = np.sqrt(y_var)

    # 4. Plot: ground truth, data, posterior mean, and a ±2σ (≈95%) band
    ax.plot(X_test, y_true, 'g--', linewidth=2, label='True function', alpha=0.5)
    ax.scatter(X_train, y_train, s=50, alpha=0.6, zorder=5, label='Data')
    ax.plot(X_test, y_mean, 'r-', linewidth=2, label='Posterior mean')
    ax.fill_between(X_test.ravel(),
                    y_mean - 2 * y_std,
                    y_mean + 2 * y_std,
                    alpha=0.2, color='red', label='95% confidence')

    ax.set_xlabel('x', fontsize=12)
    ax.set_ylabel('y', fontsize=12)
    ax.set_title(f'N = {n} samples', fontsize=14)
    ax.set_ylim(-2, 2)
    ax.legend(loc='upper right', fontsize=9)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.suptitle('Bayesian Linear Regression: Uncertainty Quantification', 
             fontsize=16, y=1.02)
plt.show()

print("\nπŸ’‘ Key Observations:")
print("  β€’ N=5: Wide confidence bands (high uncertainty)")
print("  β€’ N=20: Narrower bands near data, wider far from data")
print("  β€’ N=100: Tight bands (low uncertainty), better fit")
print("\n  Bayesian regression naturally quantifies uncertainty!")
print("  Uncertainty is HIGH where data is sparse.")

Chapter 10: PCA Solutions

Solution 10.1: PCA from Scratch

# 1. Generate correlated 2D Gaussian data
np.random.seed(42)
mean = [0, 0]
cov = [[3, 1.5], 
       [1.5, 1]]  # Positive off-diagonal -> positively correlated features

X = np.random.multivariate_normal(mean, cov, 200)

print(f"Data shape: {X.shape}")
print(f"Data mean: {X.mean(axis=0)}")

# 2. Center the data (PCA requires zero-mean features)
X_centered = X - X.mean(axis=0)

# 3. Sample covariance matrix (2x2, symmetric)
C = np.cov(X_centered.T)
print(f"\nCovariance matrix:")
print(C)

# 4. Eigendecomposition.  C is symmetric, so use eigh rather than the
# general eig: eigh guarantees real eigenvalues/orthonormal eigenvectors
# and is numerically stable for symmetric matrices.
eigenvalues, eigenvectors = np.linalg.eigh(C)

# eigh returns eigenvalues in ascending order; re-sort descending so that
# PC1 is the direction of maximum variance.
idx = eigenvalues.argsort()[::-1]
eigenvalues = eigenvalues[idx]
eigenvectors = eigenvectors[:, idx]

# (Eigenvector signs are arbitrary; flipping a PC's sign changes neither
#  the spanned subspace nor the projections computed below.)
print(f"\nEigenvalues (variance explained): {eigenvalues}")
print(f"\nPrincipal components (eigenvectors):")
print(eigenvectors)

# 5. Project each centered point onto PC1 (rank-1 reconstruction):
#    proj_i = (x_i . pc1) * pc1
pc1 = eigenvectors[:, 0]
projections = X_centered @ pc1.reshape(-1, 1) * pc1

# 6. Visualize the data cloud, the principal axes, and the PC1 projections
plt.figure(figsize=(10, 8))

# Raw data points
plt.scatter(X[:, 0], X[:, 1], alpha=0.5, s=30, label='Data')

# Each principal axis drawn as an arrow scaled by its standard deviation
scale = 3  # Scale for visibility
for i in range(2):
    pc = eigenvectors[:, i]
    arrow_len = scale * np.sqrt(eigenvalues[i])
    plt.arrow(0, 0, arrow_len * pc[0], arrow_len * pc[1],
              head_width=0.3, head_length=0.3, fc=f'C{i+1}', ec=f'C{i+1}',
              linewidth=3, label=f'PC{i+1} (Ξ»={eigenvalues[i]:.2f})')

# Dashed segments connecting a subset of points to their PC1 projections
for i in range(0, len(X), 10):
    plt.plot([X_centered[i, 0], projections[i, 0]],
             [X_centered[i, 1], projections[i, 1]],
             'k--', alpha=0.2, linewidth=0.5)

plt.scatter(projections[:, 0], projections[:, 1],
            alpha=0.3, s=20, c='red', label='Projections on PC1')

# Axis cross-hairs through the origin
plt.axhline(y=0, color='k', linewidth=0.5)
plt.axvline(x=0, color='k', linewidth=0.5)
plt.axis('equal')
plt.xlabel('x₁', fontsize=12)
plt.ylabel('xβ‚‚', fontsize=12)
plt.title('PCA: Principal Components & Projections', fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print("\nπŸ’‘ Interpretation:")
print(f"  β€’ PC1 points in direction of maximum variance (λ₁={eigenvalues[0]:.2f})")
print(f"  β€’ PC2 points in perpendicular direction (Ξ»β‚‚={eigenvalues[1]:.2f})")
print(f"  β€’ Variance explained by PC1: {eigenvalues[0]/eigenvalues.sum():.1%}")
print(f"  β€’ Total variance: {eigenvalues.sum():.2f} = trace(Covariance)")

Solutions continue for remaining exercises…

(Note: This preview shows solutions for Chapters 9-10. The complete notebook would include all chapters and the bonus challenge with full implementations, visualizations, and explanatory text.)

📚 Complete Solutions Structure

The full solutions notebook includes:

Chapter 10 (PCA):

  • βœ… Solution 10.1: PCA from scratch (shown above)

  • πŸ“ Solution 10.2: Digits dimensionality reduction with scree plot

  • πŸ“ Solution 10.3: PCA vs random projections comparison

  • πŸ“ Solution 10.4: Whitening transformation with visualization

Chapter 11 (GMM):

  • πŸ“ Solution 11.1: EM algorithm implementation from scratch

  • πŸ“ Solution 11.2: BIC/AIC model selection

  • πŸ“ Solution 11.3: GMM vs K-Means soft vs hard clustering

  • πŸ“ Solution 11.4: Anomaly detection via density estimation

Chapter 12 (SVM):

  • πŸ“ Solution 12.1: Linear SVM with margin visualization

  • πŸ“ Solution 12.2: Kernel trick on XOR problem

  • πŸ“ Solution 12.3: C parameter effect on decision boundaries

  • πŸ“ Solution 12.4: Multi-class SVM with OvR and OvO strategies

Bonus Challenge:

  • πŸ“ Complete end-to-end ML pipeline with all techniques

🎯 Learning Outcomes

After completing these solutions, you should be able to:

✅ Implement ML algorithms from scratch (understand inner workings) ✅ Diagnose overfitting/underfitting (bias-variance tradeoff) ✅ Choose appropriate regularization (Ridge vs Lasso, when to use) ✅ Quantify uncertainty (Bayesian methods) ✅ Reduce dimensionality (PCA for visualization and compression) ✅ Perform model selection (cross-validation, information criteria) ✅ Apply kernel methods (handle non-linear patterns) ✅ Tune hyperparameters (systematic search strategies)

You've mastered the mathematical foundations of machine learning! 🎓