import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.spatial.distance import cdist
from scipy.optimize import minimize
from sklearn.svm import SVR, SVC
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel, Matern
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.datasets import make_classification, make_moons
from sklearn.preprocessing import StandardScaler

# Global plot style and a fixed RNG seed so notebook output is reproducible
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
np.random.seed(42)

1. Kernel Functions and the Kernel Trick

The Kernel Trick

Instead of explicitly computing features \(\phi(\mathbf{x})\) in high-dimensional space, we compute:

\[k(\mathbf{x}, \mathbf{x}') = \langle \phi(\mathbf{x}), \phi(\mathbf{x}') \rangle\]

This allows us to work in infinite-dimensional spaces without ever computing \(\phi(\mathbf{x})\) explicitly!

Common Kernel Functions

  1. Linear: \(k(\mathbf{x}, \mathbf{x}') = \mathbf{x}^T\mathbf{x}'\)

  2. Polynomial: \(k(\mathbf{x}, \mathbf{x}') = (\mathbf{x}^T\mathbf{x}' + c)^d\)

  3. RBF (Gaussian): \(k(\mathbf{x}, \mathbf{x}') = \exp\left(-\frac{\|\mathbf{x} - \mathbf{x}'\|^2}{2\sigma^2}\right)\)

  4. Matérn: \(k(\mathbf{x}, \mathbf{x}') = \frac{2^{1-\nu}}{\Gamma(\nu)}\left(\frac{\sqrt{2\nu}r}{\ell}\right)^\nu K_\nu\left(\frac{\sqrt{2\nu}r}{\ell}\right)\)

class KernelFunctions:
    """Collection of standard kernel (similarity) functions.

    Every method takes two 2-D arrays X of shape (n, d) and Y of shape
    (m, d) and returns the (n, m) kernel matrix K, where
    K[i, j] = k(X[i], Y[j]).
    """

    @staticmethod
    def linear(X, Y):
        """Linear kernel: k(x, x') = x^T x'."""
        gram = X @ Y.T
        return gram

    @staticmethod
    def polynomial(X, Y, degree=3, coef0=1):
        """Polynomial kernel: k(x, x') = (x^T x' + c)^d."""
        shifted_gram = X @ Y.T + coef0
        return shifted_gram ** degree

    @staticmethod
    def rbf(X, Y, sigma=1.0):
        """RBF (Gaussian) kernel: k(x, x') = exp(-||x - x'||^2 / (2 sigma^2))."""
        # Pairwise squared Euclidean distances between rows of X and Y
        sq_dists = cdist(X, Y, metric='sqeuclidean')
        bandwidth = 2 * sigma ** 2
        return np.exp(-sq_dists / bandwidth)

    @staticmethod
    def laplacian(X, Y, sigma=1.0):
        """Laplacian kernel: k(x, x') = exp(-||x - x'|| / sigma)."""
        l2_dists = cdist(X, Y, metric='euclidean')
        return np.exp(-l2_dists / sigma)

# Visualize different kernels: each panel plots k(x, x0) for the fixed
# reference point x0 = 0, showing how similarity varies with distance.
x = np.linspace(-3, 3, 100).reshape(-1, 1)
x0 = np.array([[0]])  # Reference point

kf = KernelFunctions()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Linear kernel (inner product with x0 = 0 gives a flat zero line)
K_linear = kf.linear(x, x0).flatten()
axes[0, 0].plot(x, K_linear, 'b-', linewidth=2)
axes[0, 0].set_xlabel('x')
axes[0, 0].set_ylabel('k(x, 0)')
axes[0, 0].set_title('Linear Kernel')
axes[0, 0].grid(True, alpha=0.3)

# Polynomial kernels of increasing degree (coef0 defaults to 1)
for degree in [2, 3, 5]:
    K_poly = kf.polynomial(x, x0, degree=degree).flatten()
    axes[0, 1].plot(x, K_poly, linewidth=2, label=f'degree={degree}')
axes[0, 1].set_xlabel('x')
axes[0, 1].set_ylabel('k(x, 0)')
axes[0, 1].set_title('Polynomial Kernels')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# RBF kernels with different sigma (bandwidth): larger sigma -> wider bump
for sigma in [0.5, 1.0, 2.0]:
    K_rbf = kf.rbf(x, x0, sigma=sigma).flatten()
    axes[1, 0].plot(x, K_rbf, linewidth=2, label=f'Οƒ={sigma}')
axes[1, 0].set_xlabel('x')
axes[1, 0].set_ylabel('k(x, 0)')
axes[1, 0].set_title('RBF (Gaussian) Kernels')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Kernel (Gram) matrix on a sorted 1-D grid; the banded structure shows
# that RBF similarity is local: only nearby points are strongly correlated
x_grid = np.linspace(-2, 2, 50).reshape(-1, 1)
K_matrix = kf.rbf(x_grid, x_grid, sigma=1.0)
im = axes[1, 1].imshow(K_matrix, cmap='viridis', aspect='auto')
axes[1, 1].set_xlabel('Sample index')
axes[1, 1].set_ylabel('Sample index')
axes[1, 1].set_title('RBF Kernel Matrix (Gram Matrix)')
plt.colorbar(im, ax=axes[1, 1])

plt.tight_layout()
plt.show()

print("Kernel properties:")
print("- Linear: Captures linear relationships")
print("- Polynomial: Captures polynomial interactions")
print("- RBF: Universal approximator, measures local similarity")
print("- Οƒ controls the 'width' of the RBF kernel")

2. Kernel Ridge Regression

Dual Formulation

Instead of solving in weight space: \(\mathbf{w} = (\mathbf{X}^T\mathbf{X} + \lambda\mathbf{I})^{-1}\mathbf{X}^T\mathbf{y}\)

We can solve in dual space: \(\boldsymbol{\alpha} = (\mathbf{K} + \lambda\mathbf{I})^{-1}\mathbf{y}\)

Where \(\mathbf{K}_{ij} = k(\mathbf{x}_i, \mathbf{x}_j)\) is the kernel matrix.

Prediction

\[f(\mathbf{x}_{new}) = \sum_{i=1}^n \alpha_i k(\mathbf{x}_i, \mathbf{x}_{new})\]
class KernelRidgeRegression:
    """Kernel Ridge Regression via the dual formulation.

    Fits alpha = (K + lambda * I)^-1 y, where K_ij = k(x_i, x_j), and
    predicts f(x) = sum_i alpha_i * k(x_i, x).

    Parameters
    ----------
    kernel : str
        One of 'linear', 'polynomial', 'rbf', or 'laplacian'.
    sigma : float
        Bandwidth for the 'rbf' and 'laplacian' kernels.
    lambda_ : float
        Ridge regularization strength (larger -> smoother fit).
    degree : int
        Degree for the 'polynomial' kernel.
    """

    def __init__(self, kernel='rbf', sigma=1.0, lambda_=1.0, degree=3):
        self.kernel_type = kernel
        self.sigma = sigma
        self.lambda_ = lambda_
        self.degree = degree
        self.kf = KernelFunctions()

    def compute_kernel(self, X, Y):
        """Compute the kernel matrix K(X, Y) for the configured kernel.

        Raises
        ------
        ValueError
            If ``self.kernel_type`` is not a recognized kernel name.
        """
        if self.kernel_type == 'linear':
            return self.kf.linear(X, Y)
        elif self.kernel_type == 'polynomial':
            return self.kf.polynomial(X, Y, degree=self.degree)
        elif self.kernel_type == 'rbf':
            return self.kf.rbf(X, Y, sigma=self.sigma)
        elif self.kernel_type == 'laplacian':
            # Generalization: KernelFunctions already provides the Laplacian
            # kernel, but it was previously not selectable from this class.
            return self.kf.laplacian(X, Y, sigma=self.sigma)
        else:
            raise ValueError(f"Unknown kernel: {self.kernel_type}")

    def fit(self, X, y):
        """Fit dual coefficients by solving the regularized linear system.

        Stores the training inputs (needed at prediction time) and
        alpha = (K + lambda * I)^-1 y, computed with a linear solve
        rather than an explicit matrix inverse.
        """
        self.X_train = X
        self.y_train = y

        # Kernel (Gram) matrix of the training data
        K = self.compute_kernel(X, X)

        # Solve: alpha = (K + lambda * I)^-1 y
        n = len(y)
        self.alpha = np.linalg.solve(K + self.lambda_ * np.eye(n), y)

        return self

    def predict(self, X):
        """Predict f(X) = K(X, X_train) @ alpha for new inputs X."""
        # Cross-kernel between test points and stored training points
        K_test = self.compute_kernel(X, self.X_train)
        return K_test @ self.alpha

# Generate noisy samples of y = sin(x) on [0, 10] for the KRR demo
np.random.seed(42)
n_train = 30
X_train_kr = np.sort(np.random.uniform(0, 10, n_train)).reshape(-1, 1)
y_train_kr = np.sin(X_train_kr).flatten() + 0.2 * np.random.randn(n_train)

# Test grid extends slightly past the training range to show extrapolation
X_test_kr = np.linspace(-1, 11, 200).reshape(-1, 1)
y_test_true = np.sin(X_test_kr).flatten()

# Compare different kernels: (panel title, kernel name, extra kwargs)
kernels = [
    ('Linear', 'linear', {}),
    ('Polynomial (deg=3)', 'polynomial', {'degree': 3}),
    ('RBF (Οƒ=1)', 'rbf', {'sigma': 1.0}),
    ('RBF (Οƒ=0.5)', 'rbf', {'sigma': 0.5}),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

# Fit one KernelRidgeRegression per kernel and plot its fit in its own panel
for idx, (name, kernel, params) in enumerate(kernels):
    krr = KernelRidgeRegression(kernel=kernel, lambda_=0.1, **params)
    krr.fit(X_train_kr, y_train_kr)
    y_pred = krr.predict(X_test_kr)
    
    # Plot training points, the true function, and the KRR prediction
    axes[idx].scatter(X_train_kr, y_train_kr, s=80, alpha=0.8, label='Training data', zorder=3)
    axes[idx].plot(X_test_kr, y_test_true, 'g--', linewidth=2, alpha=0.7, label='True function')
    axes[idx].plot(X_test_kr, y_pred, 'r-', linewidth=2, label='KRR prediction')
    axes[idx].set_xlabel('x')
    axes[idx].set_ylabel('y')
    axes[idx].set_title(f'{name} Kernel')
    axes[idx].legend()
    axes[idx].grid(True, alpha=0.3)
    axes[idx].set_xlim(-1, 11)

plt.tight_layout()
plt.show()

print("Observations:")
print("- Linear kernel: Cannot capture nonlinearity")
print("- Polynomial kernel: Captures some nonlinearity but unstable at boundaries")
print("- RBF kernel: Smooth interpolation, best for this problem")
print("- Smaller Οƒ β†’ more wiggly fit (overfitting risk)")

3. Support Vector Machines (SVM)

SVM for Classification

Find the maximum margin hyperplane:

\[\min_{\mathbf{w}, b} \frac{1}{2}\|\mathbf{w}\|^2 + C\sum_{i=1}^n \xi_i\]

Subject to: \(y_i(\mathbf{w}^T\phi(\mathbf{x}_i) + b) \geq 1 - \xi_i, \quad \xi_i \geq 0\)

Dual Form

\[\max_{\boldsymbol{\alpha}} \sum_{i=1}^n \alpha_i - \frac{1}{2}\sum_{i,j} \alpha_i\alpha_j y_i y_j k(\mathbf{x}_i, \mathbf{x}_j)\]

Subject to: \(0 \leq \alpha_i \leq C\) and \(\sum_i \alpha_i y_i = 0\)

# Generate nonlinearly separable data (two interleaving half-moons)
np.random.seed(42)
X_svm, y_svm = make_moons(n_samples=200, noise=0.15, random_state=42)
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(
    X_svm, y_svm, test_size=0.3, random_state=42)

# Compare a linear-kernel SVM against an RBF-kernel SVM at the same C
svm_linear = SVC(kernel='linear', C=1.0)
svm_rbf = SVC(kernel='rbf', C=1.0, gamma='scale')

svm_linear.fit(X_train_svm, y_train_svm)
svm_rbf.fit(X_train_svm, y_train_svm)

# Report accuracy plus support-vector count (a proxy for model complexity)
print("SVM Classification Results:")
print(f"\nLinear SVM:")
print(f"  Train accuracy: {svm_linear.score(X_train_svm, y_train_svm):.4f}")
print(f"  Test accuracy: {svm_linear.score(X_test_svm, y_test_svm):.4f}")
print(f"  Support vectors: {len(svm_linear.support_vectors_)}")

print(f"\nRBF SVM:")
print(f"  Train accuracy: {svm_rbf.score(X_train_svm, y_train_svm):.4f}")
print(f"  Test accuracy: {svm_rbf.score(X_test_svm, y_test_svm):.4f}")
print(f"  Support vectors: {len(svm_rbf.support_vectors_)}")

# Visualize decision boundaries
def plot_svm_decision_boundary(model, X, y, title):
    """Draw a fitted SVM's decision regions, data points, and support vectors.

    Evaluates `model.predict` on a dense mesh over the data range, shades
    the predicted class regions, overlays the samples colored by label,
    and circles the model's support vectors.
    """
    step = 0.02
    x_lo, x_hi = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_lo, y_hi = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_lo, x_hi, step),
                         np.arange(y_lo, y_hi, step))

    # Classify every mesh point, then reshape back onto the grid
    mesh_points = np.c_[xx.ravel(), yy.ravel()]
    Z = model.predict(mesh_points).reshape(xx.shape)

    # Shaded class regions plus an explicit black boundary contour
    plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    plt.contour(xx, yy, Z, levels=[0.5], colors='black', linewidths=2)

    # Raw samples colored by class label
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlBu', edgecolors='black', s=50)

    # Circle the support vectors — the points that define the margin
    sv = model.support_vectors_
    plt.scatter(sv[:, 0], sv[:, 1], s=200, facecolors='none',
                edgecolors='green', linewidths=2, label='Support Vectors')

    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title(title)
    plt.legend()

# Side-by-side decision boundaries: linear vs RBF SVM on the moons data
plt.figure(figsize=(14, 5))

plt.subplot(1, 2, 1)
plot_svm_decision_boundary(svm_linear, X_train_svm, y_train_svm, 
                           'Linear SVM (Cannot separate nonlinear data)')

plt.subplot(1, 2, 2)
plot_svm_decision_boundary(svm_rbf, X_train_svm, y_train_svm,
                           'RBF SVM (Nonlinear decision boundary)')

plt.tight_layout()
plt.show()

SVM Hyperparameter: Effect of C

What: This code visualizes how the SVM regularization parameter \(C\) controls the trade-off between margin width and training error. For \(C \in \{0.1, 1, 10, 100\}\), it plots the RBF SVM decision boundary, highlights support vectors, and reports train/test accuracy along with support vector count.

Why: The parameter \(C\) acts as the inverse of regularization strength. Small \(C\) permits more margin violations (soft margin), yielding smoother boundaries with more support vectors and better generalization. Large \(C\) penalizes violations heavily, producing a tight boundary that fits training data closely but risks overfitting. The number of support vectors is a useful diagnostic: fewer support vectors indicate a simpler model, while many support vectors suggest the model is relying heavily on the training data.

Connection: Tuning \(C\) (along with the kernel bandwidth \(\gamma\)) is typically done via grid search with cross-validation in production SVM pipelines. In applications like handwriting recognition, medical image classification, and text categorization, the optimal \(C\) balances noise tolerance against classification accuracy.

# Effect of C parameter (regularization strength): sweep C over four decades
C_values = [0.1, 1, 10, 100]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, C in enumerate(C_values):
    # Refit an RBF SVM for each C value
    svm = SVC(kernel='rbf', C=C, gamma='scale')
    svm.fit(X_train_svm, y_train_svm)
    
    # Decision boundary evaluated on a dense mesh over the training range
    h = 0.02
    x_min, x_max = X_train_svm[:, 0].min() - 0.5, X_train_svm[:, 0].max() + 0.5
    y_min, y_max = X_train_svm[:, 1].min() - 0.5, X_train_svm[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    
    Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    # Shaded class regions, training points, and circled support vectors
    axes[idx].contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    axes[idx].scatter(X_train_svm[:, 0], X_train_svm[:, 1], c=y_train_svm, 
                     cmap='RdYlBu', edgecolors='black', s=50)
    axes[idx].scatter(svm.support_vectors_[:, 0], svm.support_vectors_[:, 1],
                     s=200, facecolors='none', edgecolors='green', linewidths=2)
    
    # Summarize accuracy and support-vector count in the panel title
    train_acc = svm.score(X_train_svm, y_train_svm)
    test_acc = svm.score(X_test_svm, y_test_svm)
    n_sv = len(svm.support_vectors_)
    
    axes[idx].set_xlabel('Feature 1')
    axes[idx].set_ylabel('Feature 2')
    axes[idx].set_title(f'C={C}\nTrain: {train_acc:.3f}, Test: {test_acc:.3f}, SV: {n_sv}')

plt.tight_layout()
plt.show()

print("Effect of C:")
print("- Small C: Soft margin, more regularization, more support vectors")
print("- Large C: Hard margin, less regularization, fewer support vectors")
print("- Too large C can lead to overfitting")

4. Gaussian Processes for Regression

Gaussian Process Definition

A GP is a collection of random variables, any finite number of which have a joint Gaussian distribution.

\[f(\mathbf{x}) \sim \mathcal{GP}(m(\mathbf{x}), k(\mathbf{x}, \mathbf{x}'))\]

Where:

  • \(m(\mathbf{x})\) is the mean function (often 0)

  • \(k(\mathbf{x}, \mathbf{x}')\) is the covariance (kernel) function

Posterior Predictive Distribution

Given training data \(\mathcal{D} = \{(\mathbf{x}_i, y_i)\}_{i=1}^n\):

Mean: \(\mu_{*} = \mathbf{k}_*^T (\mathbf{K} + \sigma_n^2\mathbf{I})^{-1}\mathbf{y}\)

Variance: \(\sigma_*^2 = k_{**} - \mathbf{k}_*^T (\mathbf{K} + \sigma_n^2\mathbf{I})^{-1}\mathbf{k}_*\)

Where:

  • \(\mathbf{k}_* = [k(\mathbf{x}_*, \mathbf{x}_1), \ldots, k(\mathbf{x}_*, \mathbf{x}_n)]^T\)

  • \(k_{**} = k(\mathbf{x}_*, \mathbf{x}_*)\)

class GaussianProcessRegression:
    """Gaussian Process regression from scratch (RBF kernel only).

    Posterior mean:     mu_*  = k_*^T (K + sigma_n^2 I)^-1 y
    Posterior variance: s_*^2 = k_** - k_*^T (K + sigma_n^2 I)^-1 k_*

    Fix vs. the original version: the noisy kernel matrix is never
    explicitly inverted (np.linalg.inv); instead `fit` precomputes
    alpha via a linear solve and `predict` solves for the variance
    term, which is faster and numerically more stable.

    Parameters
    ----------
    kernel : str
        Kernel name; only 'rbf' is supported.
    sigma : float
        RBF length scale.
    noise : float
        Observation-noise standard deviation (sigma_n).
    """

    def __init__(self, kernel='rbf', sigma=1.0, noise=0.1):
        self.kernel_type = kernel
        self.sigma = sigma
        self.noise = noise
        self.kf = KernelFunctions()

    def kernel(self, X, Y):
        """Evaluate the configured kernel; raise ValueError if unknown."""
        if self.kernel_type == 'rbf':
            return self.kf.rbf(X, Y, sigma=self.sigma)
        else:
            raise ValueError(f"Unknown kernel: {self.kernel_type}")

    def fit(self, X, y):
        """Store training data and precompute the posterior-mean weights."""
        self.X_train = X
        self.y_train = y

        # Noisy training covariance: K + sigma_n^2 I
        K = self.kernel(X, X)
        self.K_noisy = K + self.noise**2 * np.eye(len(y))

        # alpha = (K + sigma_n^2 I)^-1 y, via a solve instead of inv()
        self.alpha = np.linalg.solve(self.K_noisy, y)

        return self

    def predict(self, X, return_std=False):
        """Posterior mean (and optionally std) at test inputs X.

        Returns `mean` or, when ``return_std=True``, a ``(mean, std)``
        tuple of 1-D arrays with one entry per test point.
        """
        # k_*: cross-covariance between test and training points
        k_star = self.kernel(X, self.X_train)

        # Mean: k_*^T (K + sigma_n^2 I)^-1 y
        mean = k_star @ self.alpha

        if return_std:
            # Variance: k_** - k_*^T (K + sigma_n^2 I)^-1 k_*
            k_star_star = self.kernel(X, X)
            solved = np.linalg.solve(self.K_noisy, k_star.T)  # (n_train, n_test)
            var = np.diag(k_star_star) - np.sum(k_star.T * solved, axis=0)
            std = np.sqrt(np.maximum(var, 0))  # clip tiny negatives from round-off
            return mean, std

        return mean

# Sparse, irregularly spaced training points highlight how GP uncertainty
# grows away from the data
np.random.seed(42)
X_train_gp = np.array([1, 3, 5, 6, 8]).reshape(-1, 1)
y_train_gp = np.sin(X_train_gp).flatten() + 0.1 * np.random.randn(len(X_train_gp))

X_test_gp = np.linspace(0, 10, 200).reshape(-1, 1)
y_true_gp = np.sin(X_test_gp).flatten()

# Fit custom GP (the from-scratch implementation above)
gp_custom = GaussianProcessRegression(sigma=1.0, noise=0.1)
gp_custom.fit(X_train_gp, y_train_gp)
y_pred_mean, y_pred_std = gp_custom.predict(X_test_gp, return_std=True)

# Fit sklearn GP with an equivalent kernel; observation noise is carried by
# WhiteKernel, so alpha=0 avoids double-counting it
kernel_sklearn = ConstantKernel(1.0) * RBF(length_scale=1.0) + WhiteKernel(noise_level=0.1**2)
gp_sklearn = GaussianProcessRegressor(kernel=kernel_sklearn, alpha=0)
gp_sklearn.fit(X_train_gp, y_train_gp)
y_pred_sklearn, y_std_sklearn = gp_sklearn.predict(X_test_gp, return_std=True)

# Visualize both fits with +/- 2 std (~95%) bands
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.scatter(X_train_gp, y_train_gp, s=100, c='red', zorder=10, 
           edgecolors='black', label='Training data')
plt.plot(X_test_gp, y_true_gp, 'g--', linewidth=2, alpha=0.7, label='True function')
plt.plot(X_test_gp, y_pred_mean, 'b-', linewidth=2, label='GP mean prediction')
plt.fill_between(X_test_gp.flatten(),
                 y_pred_mean - 2*y_pred_std,
                 y_pred_mean + 2*y_pred_std,
                 alpha=0.3, color='blue', label='95% confidence')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Custom Gaussian Process Regression')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.scatter(X_train_gp, y_train_gp, s=100, c='red', zorder=10,
           edgecolors='black', label='Training data')
plt.plot(X_test_gp, y_true_gp, 'g--', linewidth=2, alpha=0.7, label='True function')
plt.plot(X_test_gp, y_pred_sklearn, 'b-', linewidth=2, label='GP mean prediction')
plt.fill_between(X_test_gp.flatten(),
                 y_pred_sklearn - 2*y_std_sklearn,
                 y_pred_sklearn + 2*y_std_sklearn,
                 alpha=0.3, color='blue', label='95% confidence')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Sklearn Gaussian Process Regression')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("GP Regression Key Features:")
print("- Provides uncertainty estimates (crucial for decision-making)")
print("- Uncertainty increases far from training data")
print("- Uncertainty is low near training points")
print("- Naturally interpolates through data points (with low noise)")

GP Prior Samples

What: This code draws random functions from the Gaussian process prior \(f \sim \mathcal{GP}(0, k)\) with an RBF kernel at four different length scales \(\sigma \in \{0.5, 1.0, 2.0, 5.0\}\). Sampling is performed via the Cholesky decomposition \(\mathbf{L}\) of the kernel matrix: \(\mathbf{f} = \mathbf{L} \boldsymbol{\epsilon}\) where \(\boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})\).

Why: Visualizing prior samples builds intuition for what the GP "believes" before seeing any data. The length scale \(\sigma\) (also called \(\ell\)) controls the characteristic distance over which function values are correlated: small \(\sigma\) produces rapidly oscillating functions, while large \(\sigma\) produces smooth, slowly varying functions. Choosing an appropriate prior (kernel and its hyperparameters) is the most important modeling decision in GP regression, as it encodes assumptions about the function class.

Connection: Prior samples are a standard diagnostic in Bayesian modeling workflows. In Bayesian optimization (used for hyperparameter tuning), the GP prior over the objective function determines exploration behavior. In spatial statistics and geostatistics, the kernel length scale directly corresponds to the physical correlation range of the phenomenon being modeled.

# Evaluation grid on which GP prior functions will be sampled
X_prior = np.linspace(0, 10, 100).reshape(-1, 1)

def sample_gp_prior(X, kernel_func, n_samples=5):
    """Draw `n_samples` function realizations from a zero-mean GP prior.

    The covariance is kernel_func(X, X); samples are generated as
    L @ eps where L is the Cholesky factor and eps ~ N(0, I).
    Returns an array of shape (len(X), n_samples), one column per draw.
    """
    cov = kernel_func(X, X)
    n = len(X)
    # Jitter the diagonal so the Cholesky factorization succeeds
    cov += 1e-8 * np.eye(n)
    chol = np.linalg.cholesky(cov)
    eps = np.random.randn(n, n_samples)
    return chol @ eps

kf = KernelFunctions()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Different length scales: one panel of prior draws per value
length_scales = [0.5, 1.0, 2.0, 5.0]

for idx, ls in enumerate(length_scales):
    ax = axes[idx // 2, idx % 2]
    
    # Bind the current length scale into an RBF kernel closure
    kernel_func = lambda X, Y: kf.rbf(X, Y, sigma=ls)
    samples = sample_gp_prior(X_prior, kernel_func, n_samples=5)
    
    # One curve per prior draw
    for i in range(5):
        ax.plot(X_prior, samples[:, i], alpha=0.7, linewidth=2)
    
    ax.set_xlabel('x')
    ax.set_ylabel('f(x)')
    ax.set_title(f'GP Prior Samples (Οƒ={ls})')
    ax.grid(True, alpha=0.3)
    ax.set_ylim(-3, 3)

plt.tight_layout()
plt.show()

print("Observations:")
print("- Small Οƒ (length scale): Wiggly, rapid variations")
print("- Large Οƒ: Smooth, slow variations")
print("- Length scale controls function smoothness")

5. GP Hyperparameter Optimization

Marginal Likelihood

\[\log p(\mathbf{y}|\mathbf{X}, \boldsymbol{\theta}) = -\frac{1}{2}\mathbf{y}^T(\mathbf{K} + \sigma_n^2\mathbf{I})^{-1}\mathbf{y} - \frac{1}{2}\log|\mathbf{K} + \sigma_n^2\mathbf{I}| - \frac{n}{2}\log(2\pi)\]

We optimize hyperparameters \(\boldsymbol{\theta}\) (e.g., length scale, noise) by maximizing the marginal likelihood.

# Generate more complex data: 20 noisy sin(x) samples at random locations
np.random.seed(42)
X_train_opt = np.random.uniform(0, 10, 20).reshape(-1, 1)
y_train_opt = np.sin(X_train_opt).flatten() + 0.2 * np.random.randn(20)

# GP with hyperparameter optimization: the bounds define the search space
# for signal variance, length scale, and noise level
kernel_opt = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) + \
             WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-5, 1e1))

# n_restarts_optimizer re-runs the marginal-likelihood maximization from
# random starting points to avoid poor local optima
gp_opt = GaussianProcessRegressor(kernel=kernel_opt, n_restarts_optimizer=10)
gp_opt.fit(X_train_opt, y_train_opt)

print("Optimized Hyperparameters:")
print(f"Kernel: {gp_opt.kernel_}")
print(f"Log-marginal-likelihood: {gp_opt.log_marginal_likelihood_value_:.2f}")

# Predictions on a grid extending slightly beyond the training range
X_test_opt = np.linspace(-1, 11, 200).reshape(-1, 1)
y_pred_opt, y_std_opt = gp_opt.predict(X_test_opt, return_std=True)

# Visualize the optimized fit (left) and posterior function draws (right)
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.scatter(X_train_opt, y_train_opt, s=100, c='red', zorder=10,
           edgecolors='black', label='Training data')
plt.plot(X_test_opt, np.sin(X_test_opt), 'g--', linewidth=2, 
        alpha=0.7, label='True function')
plt.plot(X_test_opt, y_pred_opt, 'b-', linewidth=2, label='GP prediction')
plt.fill_between(X_test_opt.flatten(),
                 y_pred_opt - 2*y_std_opt,
                 y_pred_opt + 2*y_std_opt,
                 alpha=0.3, color='blue', label='95% confidence')
plt.xlabel('x')
plt.ylabel('y')
plt.title('GP with Optimized Hyperparameters')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xlim(-1, 11)

# Sample posterior functions: each curve is one plausible f given the data
plt.subplot(1, 2, 2)
y_samples = gp_opt.sample_y(X_test_opt, n_samples=10)
for i in range(10):
    plt.plot(X_test_opt, y_samples[:, i], alpha=0.5, linewidth=1)
plt.scatter(X_train_opt, y_train_opt, s=100, c='red', zorder=10,
           edgecolors='black', label='Training data')
plt.plot(X_test_opt, np.sin(X_test_opt), 'g--', linewidth=2,
        alpha=0.7, label='True function')
plt.xlabel('x')
plt.ylabel('y')
plt.title('GP Posterior Samples')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xlim(-1, 11)

plt.tight_layout()
plt.show()

Summary

In this notebook, we covered:

  1. Kernel Functions: Linear, Polynomial, RBF, and their properties

  2. Kernel Trick: Implicit mapping to high-dimensional spaces

  3. Kernel Ridge Regression: Dual formulation using kernels

  4. Support Vector Machines: Maximum margin classification with kernels

  5. Gaussian Processes: Probabilistic, non-parametric regression

  6. GP Hyperparameters: Optimization via marginal likelihood

Key Takeaways

  • Kernel Trick: Allows working in infinite-dimensional spaces efficiently

  • RBF Kernel: Most commonly used, universal approximator

  • SVMs: Effective for classification, especially with RBF kernel

  • Gaussian Processes:

    • Provide uncertainty quantification (unlike most methods)

    • Flexible, non-parametric

    • Computationally expensive: O(n³) for training, O(n²) for prediction

    • Great for small-to-medium datasets

  • Hyperparameters: Critical for performance (length scale, noise)

When to Use

  • Kernel Ridge Regression: When you need nonlinear regression, medium-sized data

  • SVM: Classification with clear margin of separation, high-dimensional data

  • Gaussian Processes: When uncertainty quantification is crucial, small-medium data, Bayesian optimization

Exercises

  1. Implement the Sequential Minimal Optimization (SMO) algorithm for SVM

  2. Derive the dual formulation of kernel ridge regression

  3. Implement sparse GP using inducing points

  4. Create a custom kernel and verify it's positive semi-definite

  5. Implement GP classification using Laplace approximation