import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.spatial.distance import cdist
from scipy.optimize import minimize
from sklearn.svm import SVR, SVC
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel, Matern
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.datasets import make_classification, make_moons
from sklearn.preprocessing import StandardScaler

# Global plot style and a fixed RNG seed so notebook output is reproducible
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
np.random.seed(42)

1. Kernel Functions and the Kernel Trick

The Kernel Trick

Instead of explicitly computing features \(\phi(\mathbf{x})\) in high-dimensional space, we compute:

\[k(\mathbf{x}, \mathbf{x}') = \langle \phi(\mathbf{x}), \phi(\mathbf{x}') \rangle\]

This allows us to work in infinite-dimensional spaces without ever computing \(\phi(\mathbf{x})\) explicitly!

Common Kernel Functions

  1. Linear: \(k(\mathbf{x}, \mathbf{x}') = \mathbf{x}^T\mathbf{x}'\)

  2. Polynomial: \(k(\mathbf{x}, \mathbf{x}') = (\mathbf{x}^T\mathbf{x}' + c)^d\)

  3. RBF (Gaussian): \(k(\mathbf{x}, \mathbf{x}') = \exp\left(-\frac{\|\mathbf{x} - \mathbf{x}'\|^2}{2\sigma^2}\right)\)

  4. Matérn: \(k(\mathbf{x}, \mathbf{x}') = \frac{2^{1-\nu}}{\Gamma(\nu)}\left(\frac{\sqrt{2\nu}r}{\ell}\right)^\nu K_\nu\left(\frac{\sqrt{2\nu}r}{\ell}\right)\)

class KernelFunctions:
    """Collection of standard kernel (similarity) functions.

    Every method takes two 2-D arrays X of shape (n, d) and Y of shape
    (m, d) and returns the (n, m) kernel matrix K, where
    K[i, j] = k(X[i], Y[j]).
    """

    @staticmethod
    def linear(X, Y):
        """Linear kernel: k(x, x') = x^T x'."""
        gram = X @ Y.T
        return gram

    @staticmethod
    def polynomial(X, Y, degree=3, coef0=1):
        """Polynomial kernel: k(x, x') = (x^T x' + c)^d."""
        shifted_gram = X @ Y.T + coef0
        return shifted_gram ** degree

    @staticmethod
    def rbf(X, Y, sigma=1.0):
        """RBF (Gaussian) kernel: k(x, x') = exp(-||x - x'||^2 / (2 sigma^2))."""
        # Pairwise squared Euclidean distances between rows of X and Y
        sq_dists = cdist(X, Y, metric='sqeuclidean')
        bandwidth = 2 * sigma ** 2
        return np.exp(-sq_dists / bandwidth)

    @staticmethod
    def laplacian(X, Y, sigma=1.0):
        """Laplacian kernel: k(x, x') = exp(-||x - x'|| / sigma)."""
        l2_dists = cdist(X, Y, metric='euclidean')
        return np.exp(-l2_dists / sigma)

# Visualize different kernels: each panel plots k(x, x0) for the fixed
# reference point x0 = 0, showing how similarity varies with distance.
x = np.linspace(-3, 3, 100).reshape(-1, 1)
x0 = np.array([[0]])  # Reference point

kf = KernelFunctions()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Linear kernel (inner product with x0 = 0 gives a flat zero line)
K_linear = kf.linear(x, x0).flatten()
axes[0, 0].plot(x, K_linear, 'b-', linewidth=2)
axes[0, 0].set_xlabel('x')
axes[0, 0].set_ylabel('k(x, 0)')
axes[0, 0].set_title('Linear Kernel')
axes[0, 0].grid(True, alpha=0.3)

# Polynomial kernels of increasing degree (coef0 defaults to 1)
for degree in [2, 3, 5]:
    K_poly = kf.polynomial(x, x0, degree=degree).flatten()
    axes[0, 1].plot(x, K_poly, linewidth=2, label=f'degree={degree}')
axes[0, 1].set_xlabel('x')
axes[0, 1].set_ylabel('k(x, 0)')
axes[0, 1].set_title('Polynomial Kernels')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# RBF kernels with different sigma (bandwidth): larger sigma -> wider bump
for sigma in [0.5, 1.0, 2.0]:
    K_rbf = kf.rbf(x, x0, sigma=sigma).flatten()
    axes[1, 0].plot(x, K_rbf, linewidth=2, label=f'Οƒ={sigma}')
axes[1, 0].set_xlabel('x')
axes[1, 0].set_ylabel('k(x, 0)')
axes[1, 0].set_title('RBF (Gaussian) Kernels')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Kernel (Gram) matrix on a sorted 1-D grid; the banded structure shows
# that RBF similarity is local: only nearby points are strongly correlated
x_grid = np.linspace(-2, 2, 50).reshape(-1, 1)
K_matrix = kf.rbf(x_grid, x_grid, sigma=1.0)
im = axes[1, 1].imshow(K_matrix, cmap='viridis', aspect='auto')
axes[1, 1].set_xlabel('Sample index')
axes[1, 1].set_ylabel('Sample index')
axes[1, 1].set_title('RBF Kernel Matrix (Gram Matrix)')
plt.colorbar(im, ax=axes[1, 1])

plt.tight_layout()
plt.show()

print("Kernel properties:")
print("- Linear: Captures linear relationships")
print("- Polynomial: Captures polynomial interactions")
print("- RBF: Universal approximator, measures local similarity")
print("- Οƒ controls the 'width' of the RBF kernel")

2. Kernel Ridge Regression

Dual Formulation

Instead of solving in weight space: \(\mathbf{w} = (\mathbf{X}^T\mathbf{X} + \lambda\mathbf{I})^{-1}\mathbf{X}^T\mathbf{y}\)

We can solve in dual space: \(\boldsymbol{\alpha} = (\mathbf{K} + \lambda\mathbf{I})^{-1}\mathbf{y}\)

Where \(\mathbf{K}_{ij} = k(\mathbf{x}_i, \mathbf{x}_j)\) is the kernel matrix.

Prediction

\[f(\mathbf{x}_{new}) = \sum_{i=1}^n \alpha_i k(\mathbf{x}_i, \mathbf{x}_{new})\]
class KernelRidgeRegression:
    """Kernel Ridge Regression via the dual formulation.

    Fits alpha = (K + lambda * I)^-1 y, where K_ij = k(x_i, x_j), and
    predicts f(x) = sum_i alpha_i * k(x_i, x).

    Parameters
    ----------
    kernel : str
        One of 'linear', 'polynomial', 'rbf', or 'laplacian'.
    sigma : float
        Bandwidth for the 'rbf' and 'laplacian' kernels.
    lambda_ : float
        Ridge regularization strength (larger -> smoother fit).
    degree : int
        Degree for the 'polynomial' kernel.
    """

    def __init__(self, kernel='rbf', sigma=1.0, lambda_=1.0, degree=3):
        self.kernel_type = kernel
        self.sigma = sigma
        self.lambda_ = lambda_
        self.degree = degree
        self.kf = KernelFunctions()

    def compute_kernel(self, X, Y):
        """Compute the kernel matrix K(X, Y) for the configured kernel.

        Raises
        ------
        ValueError
            If ``self.kernel_type`` is not a recognized kernel name.
        """
        if self.kernel_type == 'linear':
            return self.kf.linear(X, Y)
        elif self.kernel_type == 'polynomial':
            return self.kf.polynomial(X, Y, degree=self.degree)
        elif self.kernel_type == 'rbf':
            return self.kf.rbf(X, Y, sigma=self.sigma)
        elif self.kernel_type == 'laplacian':
            # Generalization: KernelFunctions already provides the Laplacian
            # kernel, but it was previously not selectable from this class.
            return self.kf.laplacian(X, Y, sigma=self.sigma)
        else:
            raise ValueError(f"Unknown kernel: {self.kernel_type}")

    def fit(self, X, y):
        """Fit dual coefficients by solving the regularized linear system.

        Stores the training inputs (needed at prediction time) and
        alpha = (K + lambda * I)^-1 y, computed with a linear solve
        rather than an explicit matrix inverse.
        """
        self.X_train = X
        self.y_train = y

        # Kernel (Gram) matrix of the training data
        K = self.compute_kernel(X, X)

        # Solve: alpha = (K + lambda * I)^-1 y
        n = len(y)
        self.alpha = np.linalg.solve(K + self.lambda_ * np.eye(n), y)

        return self

    def predict(self, X):
        """Predict f(X) = K(X, X_train) @ alpha for new inputs X."""
        # Cross-kernel between test points and stored training points
        K_test = self.compute_kernel(X, self.X_train)
        return K_test @ self.alpha

# Generate noisy samples of y = sin(x) on [0, 10] for the KRR demo
np.random.seed(42)
n_train = 30
X_train_kr = np.sort(np.random.uniform(0, 10, n_train)).reshape(-1, 1)
y_train_kr = np.sin(X_train_kr).flatten() + 0.2 * np.random.randn(n_train)

# Test grid extends slightly past the training range to show extrapolation
X_test_kr = np.linspace(-1, 11, 200).reshape(-1, 1)
y_test_true = np.sin(X_test_kr).flatten()

# Compare different kernels: (panel title, kernel name, extra kwargs)
kernels = [
    ('Linear', 'linear', {}),
    ('Polynomial (deg=3)', 'polynomial', {'degree': 3}),
    ('RBF (Οƒ=1)', 'rbf', {'sigma': 1.0}),
    ('RBF (Οƒ=0.5)', 'rbf', {'sigma': 0.5}),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

# Fit one KernelRidgeRegression per kernel and plot its fit in its own panel
for idx, (name, kernel, params) in enumerate(kernels):
    krr = KernelRidgeRegression(kernel=kernel, lambda_=0.1, **params)
    krr.fit(X_train_kr, y_train_kr)
    y_pred = krr.predict(X_test_kr)
    
    # Plot training points, the true function, and the KRR prediction
    axes[idx].scatter(X_train_kr, y_train_kr, s=80, alpha=0.8, label='Training data', zorder=3)
    axes[idx].plot(X_test_kr, y_test_true, 'g--', linewidth=2, alpha=0.7, label='True function')
    axes[idx].plot(X_test_kr, y_pred, 'r-', linewidth=2, label='KRR prediction')
    axes[idx].set_xlabel('x')
    axes[idx].set_ylabel('y')
    axes[idx].set_title(f'{name} Kernel')
    axes[idx].legend()
    axes[idx].grid(True, alpha=0.3)
    axes[idx].set_xlim(-1, 11)

plt.tight_layout()
plt.show()

print("Observations:")
print("- Linear kernel: Cannot capture nonlinearity")
print("- Polynomial kernel: Captures some nonlinearity but unstable at boundaries")
print("- RBF kernel: Smooth interpolation, best for this problem")
print("- Smaller Οƒ β†’ more wiggly fit (overfitting risk)")

3. Support Vector Machines (SVM)

SVM for Classification

Find the maximum margin hyperplane:

\[\min_{\mathbf{w}, b} \frac{1}{2}\|\mathbf{w}\|^2 + C\sum_{i=1}^n \xi_i\]

Subject to: \(y_i(\mathbf{w}^T\phi(\mathbf{x}_i) + b) \geq 1 - \xi_i, \quad \xi_i \geq 0\)

Dual Form

\[\max_{\boldsymbol{\alpha}} \sum_{i=1}^n \alpha_i - \frac{1}{2}\sum_{i,j} \alpha_i\alpha_j y_i y_j k(\mathbf{x}_i, \mathbf{x}_j)\]

Subject to: \(0 \leq \alpha_i \leq C\) and \(\sum_i \alpha_i y_i = 0\)

# Generate nonlinearly separable data (two interleaving half-moons)
np.random.seed(42)
X_svm, y_svm = make_moons(n_samples=200, noise=0.15, random_state=42)
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(
    X_svm, y_svm, test_size=0.3, random_state=42)

# Compare a linear-kernel SVM against an RBF-kernel SVM at the same C
svm_linear = SVC(kernel='linear', C=1.0)
svm_rbf = SVC(kernel='rbf', C=1.0, gamma='scale')

svm_linear.fit(X_train_svm, y_train_svm)
svm_rbf.fit(X_train_svm, y_train_svm)

# Report accuracy plus support-vector count (a proxy for model complexity)
print("SVM Classification Results:")
print(f"\nLinear SVM:")
print(f"  Train accuracy: {svm_linear.score(X_train_svm, y_train_svm):.4f}")
print(f"  Test accuracy: {svm_linear.score(X_test_svm, y_test_svm):.4f}")
print(f"  Support vectors: {len(svm_linear.support_vectors_)}")

print(f"\nRBF SVM:")
print(f"  Train accuracy: {svm_rbf.score(X_train_svm, y_train_svm):.4f}")
print(f"  Test accuracy: {svm_rbf.score(X_test_svm, y_test_svm):.4f}")
print(f"  Support vectors: {len(svm_rbf.support_vectors_)}")

# Visualize decision boundaries
def plot_svm_decision_boundary(model, X, y, title):
    """Draw a fitted SVM's decision regions, data points, and support vectors.

    Evaluates `model.predict` on a dense mesh over the data range, shades
    the predicted class regions, overlays the samples colored by label,
    and circles the model's support vectors.
    """
    step = 0.02
    x_lo, x_hi = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_lo, y_hi = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_lo, x_hi, step),
                         np.arange(y_lo, y_hi, step))

    # Classify every mesh point, then reshape back onto the grid
    mesh_points = np.c_[xx.ravel(), yy.ravel()]
    Z = model.predict(mesh_points).reshape(xx.shape)

    # Shaded class regions plus an explicit black boundary contour
    plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    plt.contour(xx, yy, Z, levels=[0.5], colors='black', linewidths=2)

    # Raw samples colored by class label
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlBu', edgecolors='black', s=50)

    # Circle the support vectors — the points that define the margin
    sv = model.support_vectors_
    plt.scatter(sv[:, 0], sv[:, 1], s=200, facecolors='none',
                edgecolors='green', linewidths=2, label='Support Vectors')

    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title(title)
    plt.legend()

# Side-by-side decision boundaries: linear vs RBF SVM on the moons data
plt.figure(figsize=(14, 5))

plt.subplot(1, 2, 1)
plot_svm_decision_boundary(svm_linear, X_train_svm, y_train_svm, 
                           'Linear SVM (Cannot separate nonlinear data)')

plt.subplot(1, 2, 2)
plot_svm_decision_boundary(svm_rbf, X_train_svm, y_train_svm,
                           'RBF SVM (Nonlinear decision boundary)')

plt.tight_layout()
plt.show()

SVM Hyperparameter: Effect of C

What: This code visualizes how the SVM regularization parameter \(C\) controls the trade-off between margin width and training error. For \(C \in \{0.1, 1, 10, 100\}\), it plots the RBF SVM decision boundary, highlights support vectors, and reports train/test accuracy along with support vector count.

Why: The parameter \(C\) acts as the inverse of regularization strength. Small \(C\) permits more margin violations (soft margin), yielding smoother boundaries with more support vectors and better generalization. Large \(C\) penalizes violations heavily, producing a tight boundary that fits training data closely but risks overfitting. The number of support vectors is a useful diagnostic: fewer support vectors indicate a simpler model, while many support vectors suggest the model is relying heavily on the training data.

Connection: Tuning \(C\) (along with the kernel bandwidth \(\gamma\)) is typically done via grid search with cross-validation in production SVM pipelines. In applications like handwriting recognition, medical image classification, and text categorization, the optimal \(C\) balances noise tolerance against classification accuracy.

# Effect of C parameter (regularization strength): sweep C over four decades
C_values = [0.1, 1, 10, 100]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, C in enumerate(C_values):
    # Refit an RBF SVM for each C value
    svm = SVC(kernel='rbf', C=C, gamma='scale')
    svm.fit(X_train_svm, y_train_svm)
    
    # Decision boundary evaluated on a dense mesh over the training range
    h = 0.02
    x_min, x_max = X_train_svm[:, 0].min() - 0.5, X_train_svm[:, 0].max() + 0.5
    y_min, y_max = X_train_svm[:, 1].min() - 0.5, X_train_svm[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    
    Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    # Shaded class regions, training points, and circled support vectors
    axes[idx].contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    axes[idx].scatter(X_train_svm[:, 0], X_train_svm[:, 1], c=y_train_svm, 
                     cmap='RdYlBu', edgecolors='black', s=50)
    axes[idx].scatter(svm.support_vectors_[:, 0], svm.support_vectors_[:, 1],
                     s=200, facecolors='none', edgecolors='green', linewidths=2)
    
    # Summarize accuracy and support-vector count in the panel title
    train_acc = svm.score(X_train_svm, y_train_svm)
    test_acc = svm.score(X_test_svm, y_test_svm)
    n_sv = len(svm.support_vectors_)
    
    axes[idx].set_xlabel('Feature 1')
    axes[idx].set_ylabel('Feature 2')
    axes[idx].set_title(f'C={C}\nTrain: {train_acc:.3f}, Test: {test_acc:.3f}, SV: {n_sv}')

plt.tight_layout()
plt.show()

print("Effect of C:")
print("- Small C: Soft margin, more regularization, more support vectors")
print("- Large C: Hard margin, less regularization, fewer support vectors")
print("- Too large C can lead to overfitting")

4. Gaussian Processes for Regression

Gaussian Process Definition

A GP is a collection of random variables, any finite number of which have a joint Gaussian distribution.

\[f(\mathbf{x}) \sim \mathcal{GP}(m(\mathbf{x}), k(\mathbf{x}, \mathbf{x}'))\]

Where:

  • \(m(\mathbf{x})\) is the mean function (often 0)

  • \(k(\mathbf{x}, \mathbf{x}')\) is the covariance (kernel) function

Posterior Predictive Distribution

Given training data \(\mathcal{D} = \{(\mathbf{x}_i, y_i)\}_{i=1}^n\):

Mean: \(\mu_{*} = \mathbf{k}_*^T (\mathbf{K} + \sigma_n^2\mathbf{I})^{-1}\mathbf{y}\)

Variance: \(\sigma_*^2 = k_{**} - \mathbf{k}_*^T (\mathbf{K} + \sigma_n^2\mathbf{I})^{-1}\mathbf{k}_*\)

Where:

  • \(\mathbf{k}_* = [k(\mathbf{x}_*, \mathbf{x}_1), \ldots, k(\mathbf{x}_*, \mathbf{x}_n)]^T\)

  • \(k_{**} = k(\mathbf{x}_*, \mathbf{x}_*)\)

class GaussianProcessRegression:
    """Gaussian Process regression from scratch (RBF kernel only).

    Posterior mean:     mu_*  = k_*^T (K + sigma_n^2 I)^-1 y
    Posterior variance: s_*^2 = k_** - k_*^T (K + sigma_n^2 I)^-1 k_*

    Fix vs. the original version: the noisy kernel matrix is never
    explicitly inverted (np.linalg.inv); instead `fit` precomputes
    alpha via a linear solve and `predict` solves for the variance
    term, which is faster and numerically more stable.

    Parameters
    ----------
    kernel : str
        Kernel name; only 'rbf' is supported.
    sigma : float
        RBF length scale.
    noise : float
        Observation-noise standard deviation (sigma_n).
    """

    def __init__(self, kernel='rbf', sigma=1.0, noise=0.1):
        self.kernel_type = kernel
        self.sigma = sigma
        self.noise = noise
        self.kf = KernelFunctions()

    def kernel(self, X, Y):
        """Evaluate the configured kernel; raise ValueError if unknown."""
        if self.kernel_type == 'rbf':
            return self.kf.rbf(X, Y, sigma=self.sigma)
        else:
            raise ValueError(f"Unknown kernel: {self.kernel_type}")

    def fit(self, X, y):
        """Store training data and precompute the posterior-mean weights."""
        self.X_train = X
        self.y_train = y

        # Noisy training covariance: K + sigma_n^2 I
        K = self.kernel(X, X)
        self.K_noisy = K + self.noise**2 * np.eye(len(y))

        # alpha = (K + sigma_n^2 I)^-1 y, via a solve instead of inv()
        self.alpha = np.linalg.solve(self.K_noisy, y)

        return self

    def predict(self, X, return_std=False):
        """Posterior mean (and optionally std) at test inputs X.

        Returns `mean` or, when ``return_std=True``, a ``(mean, std)``
        tuple of 1-D arrays with one entry per test point.
        """
        # k_*: cross-covariance between test and training points
        k_star = self.kernel(X, self.X_train)

        # Mean: k_*^T (K + sigma_n^2 I)^-1 y
        mean = k_star @ self.alpha

        if return_std:
            # Variance: k_** - k_*^T (K + sigma_n^2 I)^-1 k_*
            k_star_star = self.kernel(X, X)
            solved = np.linalg.solve(self.K_noisy, k_star.T)  # (n_train, n_test)
            var = np.diag(k_star_star) - np.sum(k_star.T * solved, axis=0)
            std = np.sqrt(np.maximum(var, 0))  # clip tiny negatives from round-off
            return mean, std

        return mean

# Sparse, irregularly spaced training points highlight how GP uncertainty
# grows away from the data
np.random.seed(42)
X_train_gp = np.array([1, 3, 5, 6, 8]).reshape(-1, 1)
y_train_gp = np.sin(X_train_gp).flatten() + 0.1 * np.random.randn(len(X_train_gp))

X_test_gp = np.linspace(0, 10, 200).reshape(-1, 1)
y_true_gp = np.sin(X_test_gp).flatten()

# Fit custom GP (the from-scratch implementation above)
gp_custom = GaussianProcessRegression(sigma=1.0, noise=0.1)
gp_custom.fit(X_train_gp, y_train_gp)
y_pred_mean, y_pred_std = gp_custom.predict(X_test_gp, return_std=True)

# Fit sklearn GP with an equivalent kernel; observation noise is carried by
# WhiteKernel, so alpha=0 avoids double-counting it
kernel_sklearn = ConstantKernel(1.0) * RBF(length_scale=1.0) + WhiteKernel(noise_level=0.1**2)
gp_sklearn = GaussianProcessRegressor(kernel=kernel_sklearn, alpha=0)
gp_sklearn.fit(X_train_gp, y_train_gp)
y_pred_sklearn, y_std_sklearn = gp_sklearn.predict(X_test_gp, return_std=True)

# Visualize both fits with +/- 2 std (~95%) bands
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.scatter(X_train_gp, y_train_gp, s=100, c='red', zorder=10, 
           edgecolors='black', label='Training data')
plt.plot(X_test_gp, y_true_gp, 'g--', linewidth=2, alpha=0.7, label='True function')
plt.plot(X_test_gp, y_pred_mean, 'b-', linewidth=2, label='GP mean prediction')
plt.fill_between(X_test_gp.flatten(),
                 y_pred_mean - 2*y_pred_std,
                 y_pred_mean + 2*y_pred_std,
                 alpha=0.3, color='blue', label='95% confidence')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Custom Gaussian Process Regression')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.scatter(X_train_gp, y_train_gp, s=100, c='red', zorder=10,
           edgecolors='black', label='Training data')
plt.plot(X_test_gp, y_true_gp, 'g--', linewidth=2, alpha=0.7, label='True function')
plt.plot(X_test_gp, y_pred_sklearn, 'b-', linewidth=2, label='GP mean prediction')
plt.fill_between(X_test_gp.flatten(),
                 y_pred_sklearn - 2*y_std_sklearn,
                 y_pred_sklearn + 2*y_std_sklearn,
                 alpha=0.3, color='blue', label='95% confidence')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Sklearn Gaussian Process Regression')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("GP Regression Key Features:")
print("- Provides uncertainty estimates (crucial for decision-making)")
print("- Uncertainty increases far from training data")
print("- Uncertainty is low near training points")
print("- Naturally interpolates through data points (with low noise)")

GP Prior Samples

What: This code draws random functions from the Gaussian process prior \(f \sim \mathcal{GP}(0, k)\) with an RBF kernel at four different length scales \(\sigma \in \{0.5, 1.0, 2.0, 5.0\}\). Sampling is performed via the Cholesky decomposition \(\mathbf{L}\) of the kernel matrix: \(\mathbf{f} = \mathbf{L} \boldsymbol{\epsilon}\) where \(\boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})\).

Why: Visualizing prior samples builds intuition for what the GP "believes" before seeing any data. The length scale \(\sigma\) (also called \(\ell\)) controls the characteristic distance over which function values are correlated: small \(\sigma\) produces rapidly oscillating functions, while large \(\sigma\) produces smooth, slowly varying functions. Choosing an appropriate prior (kernel and its hyperparameters) is the most important modeling decision in GP regression, as it encodes assumptions about the function class.

Connection: Prior samples are a standard diagnostic in Bayesian modeling workflows. In Bayesian optimization (used for hyperparameter tuning), the GP prior over the objective function determines exploration behavior. In spatial statistics and geostatistics, the kernel length scale directly corresponds to the physical correlation range of the phenomenon being modeled.

# Evaluation grid on which GP prior functions will be sampled
X_prior = np.linspace(0, 10, 100).reshape(-1, 1)

def sample_gp_prior(X, kernel_func, n_samples=5):
    """Draw `n_samples` function realizations from a zero-mean GP prior.

    The covariance is kernel_func(X, X); samples are generated as
    L @ eps where L is the Cholesky factor and eps ~ N(0, I).
    Returns an array of shape (len(X), n_samples), one column per draw.
    """
    cov = kernel_func(X, X)
    n = len(X)
    # Jitter the diagonal so the Cholesky factorization succeeds
    cov += 1e-8 * np.eye(n)
    chol = np.linalg.cholesky(cov)
    eps = np.random.randn(n, n_samples)
    return chol @ eps

kf = KernelFunctions()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Different length scales: one panel of prior draws per value
length_scales = [0.5, 1.0, 2.0, 5.0]

for idx, ls in enumerate(length_scales):
    ax = axes[idx // 2, idx % 2]
    
    # Bind the current length scale into an RBF kernel closure
    kernel_func = lambda X, Y: kf.rbf(X, Y, sigma=ls)
    samples = sample_gp_prior(X_prior, kernel_func, n_samples=5)
    
    # One curve per prior draw
    for i in range(5):
        ax.plot(X_prior, samples[:, i], alpha=0.7, linewidth=2)
    
    ax.set_xlabel('x')
    ax.set_ylabel('f(x)')
    ax.set_title(f'GP Prior Samples (Οƒ={ls})')
    ax.grid(True, alpha=0.3)
    ax.set_ylim(-3, 3)

plt.tight_layout()
plt.show()

print("Observations:")
print("- Small Οƒ (length scale): Wiggly, rapid variations")
print("- Large Οƒ: Smooth, slow variations")
print("- Length scale controls function smoothness")

5. GP Hyperparameter Optimization

Marginal Likelihood

\[\log p(\mathbf{y}|\mathbf{X}, \boldsymbol{\theta}) = -\frac{1}{2}\mathbf{y}^T(\mathbf{K} + \sigma_n^2\mathbf{I})^{-1}\mathbf{y} - \frac{1}{2}\log|\mathbf{K} + \sigma_n^2\mathbf{I}| - \frac{n}{2}\log(2\pi)\]

We optimize hyperparameters \(\boldsymbol{\theta}\) (e.g., length scale, noise) by maximizing the marginal likelihood.

# Generate more complex data: 20 noisy sin(x) samples at random locations
np.random.seed(42)
X_train_opt = np.random.uniform(0, 10, 20).reshape(-1, 1)
y_train_opt = np.sin(X_train_opt).flatten() + 0.2 * np.random.randn(20)

# GP with hyperparameter optimization: the bounds define the search space
# for signal variance, length scale, and noise level
kernel_opt = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) + \
             WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-5, 1e1))

# n_restarts_optimizer re-runs the marginal-likelihood maximization from
# random starting points to avoid poor local optima
gp_opt = GaussianProcessRegressor(kernel=kernel_opt, n_restarts_optimizer=10)
gp_opt.fit(X_train_opt, y_train_opt)

print("Optimized Hyperparameters:")
print(f"Kernel: {gp_opt.kernel_}")
print(f"Log-marginal-likelihood: {gp_opt.log_marginal_likelihood_value_:.2f}")

# Predictions on a grid extending slightly beyond the training range
X_test_opt = np.linspace(-1, 11, 200).reshape(-1, 1)
y_pred_opt, y_std_opt = gp_opt.predict(X_test_opt, return_std=True)

# Visualize the optimized fit (left) and posterior function draws (right)
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.scatter(X_train_opt, y_train_opt, s=100, c='red', zorder=10,
           edgecolors='black', label='Training data')
plt.plot(X_test_opt, np.sin(X_test_opt), 'g--', linewidth=2, 
        alpha=0.7, label='True function')
plt.plot(X_test_opt, y_pred_opt, 'b-', linewidth=2, label='GP prediction')
plt.fill_between(X_test_opt.flatten(),
                 y_pred_opt - 2*y_std_opt,
                 y_pred_opt + 2*y_std_opt,
                 alpha=0.3, color='blue', label='95% confidence')
plt.xlabel('x')
plt.ylabel('y')
plt.title('GP with Optimized Hyperparameters')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xlim(-1, 11)

# Sample posterior functions: each curve is one plausible f given the data
plt.subplot(1, 2, 2)
y_samples = gp_opt.sample_y(X_test_opt, n_samples=10)
for i in range(10):
    plt.plot(X_test_opt, y_samples[:, i], alpha=0.5, linewidth=1)
plt.scatter(X_train_opt, y_train_opt, s=100, c='red', zorder=10,
           edgecolors='black', label='Training data')
plt.plot(X_test_opt, np.sin(X_test_opt), 'g--', linewidth=2,
        alpha=0.7, label='True function')
plt.xlabel('x')
plt.ylabel('y')
plt.title('GP Posterior Samples')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xlim(-1, 11)

plt.tight_layout()
plt.show()

Summary

In this notebook, we covered:

  1. Kernel Functions: Linear, Polynomial, RBF, and their properties

  2. Kernel Trick: Implicit mapping to high-dimensional spaces

  3. Kernel Ridge Regression: Dual formulation using kernels

  4. Support Vector Machines: Maximum margin classification with kernels

  5. Gaussian Processes: Probabilistic, non-parametric regression

  6. GP Hyperparameters: Optimization via marginal likelihood

Key Takeaways

  • Kernel Trick: Allows working in infinite-dimensional spaces efficiently

  • RBF Kernel: Most commonly used, universal approximator

  • SVMs: Effective for classification, especially with RBF kernel

  • Gaussian Processes:

    • Provide uncertainty quantification (unlike most methods)

    • Flexible, non-parametric

    • Computationally expensive: O(n³) for training, O(n²) for prediction

    • Great for small-to-medium datasets

  • Hyperparameters: Critical for performance (length scale, noise)

When to Use

  • Kernel Ridge Regression: When you need nonlinear regression, medium-sized data

  • SVM: Classification with clear margin of separation, high-dimensional data

  • Gaussian Processes: When uncertainty quantification is crucial, small-medium data, Bayesian optimization

Exercises

  1. Implement the Sequential Minimal Optimization (SMO) algorithm for SVM

  2. Derive the dual formulation of kernel ridge regression

  3. Implement sparse GP using inducing points

  4. Create a custom kernel and verify it's positive semi-definite

  5. Implement GP classification using Laplace approximation