import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.linalg import cholesky, cho_solve
from scipy.optimize import minimize
from scipy.spatial.distance import cdist
from sklearn.datasets import make_classification, make_moons
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel, Matern
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR, SVC
# Global plotting defaults: seaborn grid style and a wide default figure.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
# Fix the RNG seed so all random draws in this notebook are reproducible.
np.random.seed(42)
1. Kernel Functions and the Kernel Trick
The Kernel Trick
Instead of explicitly computing features \(\phi(\mathbf{x})\) in high-dimensional space, we compute the kernel directly:
\(k(\mathbf{x}, \mathbf{x}') = \phi(\mathbf{x})^T \phi(\mathbf{x}')\)
This allows us to work in infinite-dimensional spaces without ever computing \(\phi(\mathbf{x})\) explicitly!
Common Kernel Functions
Linear: \(k(\mathbf{x}, \mathbf{x}') = \mathbf{x}^T\mathbf{x}'\)
Polynomial: \(k(\mathbf{x}, \mathbf{x}') = (\mathbf{x}^T\mathbf{x}' + c)^d\)
RBF (Gaussian): \(k(\mathbf{x}, \mathbf{x}') = \exp\left(-\frac{\|\mathbf{x} - \mathbf{x}'\|^2}{2\sigma^2}\right)\)
Matérn: \(k(\mathbf{x}, \mathbf{x}') = \frac{2^{1-\nu}}{\Gamma(\nu)}\left(\frac{\sqrt{2\nu}r}{\ell}\right)^\nu K_\nu\left(\frac{\sqrt{2\nu}r}{\ell}\right)\)
class KernelFunctions:
    """A small library of common positive-definite kernel functions.

    Every method maps two sample matrices X (n, d) and Y (m, d) to the
    (n, m) Gram matrix of pairwise kernel evaluations.
    """

    @staticmethod
    def linear(X, Y):
        """Linear kernel: k(x, x') = x^T x'."""
        return np.dot(X, Y.T)

    @staticmethod
    def polynomial(X, Y, degree=3, coef0=1):
        """Polynomial kernel: k(x, x') = (x^T x' + c)^d."""
        gram = np.dot(X, Y.T) + coef0
        return gram ** degree

    @staticmethod
    def rbf(X, Y, sigma=1.0):
        """RBF (Gaussian) kernel: k(x, x') = exp(-||x - x'||^2 / (2 sigma^2))."""
        # Pairwise squared Euclidean distances drive the exponential decay.
        sq_dists = cdist(X, Y, metric='sqeuclidean')
        return np.exp(-sq_dists / (2.0 * sigma ** 2))

    @staticmethod
    def laplacian(X, Y, sigma=1.0):
        """Laplacian kernel: k(x, x') = exp(-||x - x'|| / sigma)."""
        abs_dists = cdist(X, Y, metric='euclidean')
        return np.exp(-abs_dists / sigma)
# Visualize different kernels as functions of x, with the second argument
# fixed at the reference point x' = 0.
x = np.linspace(-3, 3, 100).reshape(-1, 1)
x0 = np.array([[0]])  # Reference point
kf = KernelFunctions()
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# Linear kernel: k(x, 0) = x^T 0 = 0, so the curve is identically zero.
K_linear = kf.linear(x, x0).flatten()
axes[0, 0].plot(x, K_linear, 'b-', linewidth=2)
axes[0, 0].set_xlabel('x')
axes[0, 0].set_ylabel('k(x, 0)')
axes[0, 0].set_title('Linear Kernel')
axes[0, 0].grid(True, alpha=0.3)
# Polynomial kernels of increasing degree (coef0 defaults to 1).
for degree in [2, 3, 5]:
    K_poly = kf.polynomial(x, x0, degree=degree).flatten()
    axes[0, 1].plot(x, K_poly, linewidth=2, label=f'degree={degree}')
axes[0, 1].set_xlabel('x')
axes[0, 1].set_ylabel('k(x, 0)')
axes[0, 1].set_title('Polynomial Kernels')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)
# RBF kernels with different σ (legend label was mojibake; fixed to σ).
for sigma in [0.5, 1.0, 2.0]:
    K_rbf = kf.rbf(x, x0, sigma=sigma).flatten()
    axes[1, 0].plot(x, K_rbf, linewidth=2, label=f'σ={sigma}')
axes[1, 0].set_xlabel('x')
axes[1, 0].set_ylabel('k(x, 0)')
axes[1, 0].set_title('RBF (Gaussian) Kernels')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)
# Kernel (Gram) matrix heatmap: bright band near the diagonal shows that
# the RBF kernel measures local similarity.
x_grid = np.linspace(-2, 2, 50).reshape(-1, 1)
K_matrix = kf.rbf(x_grid, x_grid, sigma=1.0)
im = axes[1, 1].imshow(K_matrix, cmap='viridis', aspect='auto')
axes[1, 1].set_xlabel('Sample index')
axes[1, 1].set_ylabel('Sample index')
axes[1, 1].set_title('RBF Kernel Matrix (Gram Matrix)')
plt.colorbar(im, ax=axes[1, 1])
plt.tight_layout()
plt.show()
print("Kernel properties:")
print("- Linear: Captures linear relationships")
print("- Polynomial: Captures polynomial interactions")
print("- RBF: Universal approximator, measures local similarity")
print("- σ controls the 'width' of the RBF kernel")
2. Kernel Ridge Regression
Dual Formulation
Instead of solving in weight space: \(\mathbf{w} = (\mathbf{X}^T\mathbf{X} + \lambda\mathbf{I})^{-1}\mathbf{X}^T\mathbf{y}\)
We can solve in dual space: \(\boldsymbol{\alpha} = (\mathbf{K} + \lambda\mathbf{I})^{-1}\mathbf{y}\)
Where \(\mathbf{K}_{ij} = k(\mathbf{x}_i, \mathbf{x}_j)\) is the kernel matrix.
Prediction: \(f(\mathbf{x}_*) = \sum_{i=1}^{n} \alpha_i \, k(\mathbf{x}_*, \mathbf{x}_i)\)
class KernelRidgeRegression:
    """Kernel Ridge Regression in its dual formulation.

    Fitting solves alpha = (K + lambda I)^{-1} y on the training Gram
    matrix; prediction evaluates f(x) = sum_i alpha_i k(x, x_i).
    """

    def __init__(self, kernel='rbf', sigma=1.0, lambda_=1.0, degree=3):
        # kernel: one of 'linear', 'polynomial', 'rbf'
        # sigma: RBF bandwidth; lambda_: ridge penalty; degree: polynomial degree
        self.kernel_type = kernel
        self.sigma = sigma
        self.lambda_ = lambda_
        self.degree = degree
        self.kf = KernelFunctions()

    def compute_kernel(self, X, Y):
        """Return the Gram matrix K(X, Y) for the configured kernel type."""
        dispatch = {
            'linear': lambda: self.kf.linear(X, Y),
            'polynomial': lambda: self.kf.polynomial(X, Y, degree=self.degree),
            'rbf': lambda: self.kf.rbf(X, Y, sigma=self.sigma),
        }
        if self.kernel_type not in dispatch:
            raise ValueError(f"Unknown kernel: {self.kernel_type}")
        return dispatch[self.kernel_type]()

    def fit(self, X, y):
        """Solve the regularized dual system and retain the training data."""
        self.X_train = X
        self.y_train = y
        gram = self.compute_kernel(X, X)
        ridge = self.lambda_ * np.eye(len(y))
        # alpha = (K + lambda I)^{-1} y, via a linear solve (no explicit inverse).
        self.alpha = np.linalg.solve(gram + ridge, y)
        return self

    def predict(self, X):
        """Evaluate the fitted model: K(X, X_train) @ alpha."""
        cross_gram = self.compute_kernel(X, self.X_train)
        return cross_gram @ self.alpha
# Generate nonlinear data: noisy samples of sin(x) on [0, 10].
np.random.seed(42)
n_train = 30
X_train_kr = np.sort(np.random.uniform(0, 10, n_train)).reshape(-1, 1)
y_train_kr = np.sin(X_train_kr).flatten() + 0.2 * np.random.randn(n_train)
# Test grid extends past the training range to expose extrapolation behavior.
X_test_kr = np.linspace(-1, 11, 200).reshape(-1, 1)
y_test_true = np.sin(X_test_kr).flatten()
# Compare different kernels: (display name, kernel id, extra hyperparameters).
# Labels were mojibake-corrupted ("Ο"); fixed to σ.
kernels = [
    ('Linear', 'linear', {}),
    ('Polynomial (deg=3)', 'polynomial', {'degree': 3}),
    ('RBF (σ=1)', 'rbf', {'sigma': 1.0}),
    ('RBF (σ=0.5)', 'rbf', {'sigma': 0.5}),
]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()
for idx, (name, kernel, params) in enumerate(kernels):
    krr = KernelRidgeRegression(kernel=kernel, lambda_=0.1, **params)
    krr.fit(X_train_kr, y_train_kr)
    y_pred = krr.predict(X_test_kr)
    # Plot
    axes[idx].scatter(X_train_kr, y_train_kr, s=80, alpha=0.8, label='Training data', zorder=3)
    axes[idx].plot(X_test_kr, y_test_true, 'g--', linewidth=2, alpha=0.7, label='True function')
    axes[idx].plot(X_test_kr, y_pred, 'r-', linewidth=2, label='KRR prediction')
    axes[idx].set_xlabel('x')
    axes[idx].set_ylabel('y')
    axes[idx].set_title(f'{name} Kernel')
    axes[idx].legend()
    axes[idx].grid(True, alpha=0.3)
    axes[idx].set_xlim(-1, 11)
plt.tight_layout()
plt.show()
print("Observations:")
print("- Linear kernel: Cannot capture nonlinearity")
print("- Polynomial kernel: Captures some nonlinearity but unstable at boundaries")
print("- RBF kernel: Smooth interpolation, best for this problem")
print("- Smaller σ → more wiggly fit (overfitting risk)")
3. Support Vector Machines (SVM)
SVM for Classification
Find the maximum margin hyperplane by solving: \(\min_{\mathbf{w}, b, \boldsymbol{\xi}} \frac{1}{2}\|\mathbf{w}\|^2 + C\sum_i \xi_i\)
Subject to: \(y_i(\mathbf{w}^T\phi(\mathbf{x}_i) + b) \geq 1 - \xi_i, \quad \xi_i \geq 0\)
Dual Form: \(\max_{\boldsymbol{\alpha}} \sum_i \alpha_i - \frac{1}{2}\sum_{i,j} \alpha_i \alpha_j y_i y_j \, k(\mathbf{x}_i, \mathbf{x}_j)\)
Subject to: \(0 \leq \alpha_i \leq C\) and \(\sum_i \alpha_i y_i = 0\)
# Generate nonlinearly separable data
np.random.seed(42)
# Two interleaving half-circles: linearly inseparable by construction.
X_svm, y_svm = make_moons(n_samples=200, noise=0.15, random_state=42)
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(
    X_svm, y_svm, test_size=0.3, random_state=42)
# Compare linear and RBF SVM at the same regularization strength C=1.
svm_linear = SVC(kernel='linear', C=1.0)
svm_rbf = SVC(kernel='rbf', C=1.0, gamma='scale')
svm_linear.fit(X_train_svm, y_train_svm)
svm_rbf.fit(X_train_svm, y_train_svm)
# Report train/test accuracy and support-vector counts for both models.
print("SVM Classification Results:")
print(f"\nLinear SVM:")
print(f" Train accuracy: {svm_linear.score(X_train_svm, y_train_svm):.4f}")
print(f" Test accuracy: {svm_linear.score(X_test_svm, y_test_svm):.4f}")
print(f" Support vectors: {len(svm_linear.support_vectors_)}")
print(f"\nRBF SVM:")
print(f" Train accuracy: {svm_rbf.score(X_train_svm, y_train_svm):.4f}")
print(f" Test accuracy: {svm_rbf.score(X_test_svm, y_test_svm):.4f}")
print(f" Support vectors: {len(svm_rbf.support_vectors_)}")
# Visualize decision boundaries
def plot_svm_decision_boundary(model, X, y, title):
    """Plot a fitted SVC's decision regions, data points, and support vectors.

    Evaluates the model on a dense grid spanning the data (padded by 0.5 in
    each direction), shades the predicted class regions, and circles the
    support vectors in green. Draws onto the current matplotlib axes.
    """
    h = 0.02  # grid resolution for the decision-surface evaluation
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Predict the class for every grid cell, then reshape back to the grid.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    # Class boundary: the 0.5 level sits between the 0/1 predicted labels.
    plt.contour(xx, yy, Z, levels=[0.5], colors='black', linewidths=2)
    # Plot data points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlBu', edgecolors='black', s=50)
    # Highlight support vectors
    plt.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1],
                s=200, facecolors='none', edgecolors='green', linewidths=2,
                label='Support Vectors')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title(title)
    plt.legend()
# Side-by-side comparison: linear SVM fails on the moons data, RBF succeeds.
plt.figure(figsize=(14, 5))
plt.subplot(1, 2, 1)
plot_svm_decision_boundary(svm_linear, X_train_svm, y_train_svm,
                           'Linear SVM (Cannot separate nonlinear data)')
plt.subplot(1, 2, 2)
plot_svm_decision_boundary(svm_rbf, X_train_svm, y_train_svm,
                           'RBF SVM (Nonlinear decision boundary)')
plt.tight_layout()
plt.show()
SVM Hyperparameter: Effect of C
What: This code visualizes how the SVM regularization parameter \(C\) controls the trade-off between margin width and training error. For \(C \in \{0.1, 1, 10, 100\}\), it plots the RBF SVM decision boundary, highlights support vectors, and reports train/test accuracy along with support vector count.
Why: The parameter \(C\) acts as the inverse of regularization strength. Small \(C\) permits more margin violations (soft margin), yielding smoother boundaries with more support vectors and better generalization. Large \(C\) penalizes violations heavily, producing a tight boundary that fits training data closely but risks overfitting. The number of support vectors is a useful diagnostic: fewer support vectors indicate a simpler model, while many support vectors suggest the model is relying heavily on the training data.
Connection: Tuning \(C\) (along with the kernel bandwidth \(\gamma\)) is typically done via grid search with cross-validation in production SVM pipelines. In applications like handwriting recognition, medical image classification, and text categorization, the optimal \(C\) balances noise tolerance against classification accuracy.
# Effect of C parameter (regularization strength)
C_values = [0.1, 1, 10, 100]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()
for idx, C in enumerate(C_values):
    # Refit an RBF SVM at each C; gamma='scale' adapts to feature variance.
    svm = SVC(kernel='rbf', C=C, gamma='scale')
    svm.fit(X_train_svm, y_train_svm)
    # Decision boundary
    h = 0.02  # grid step for the decision-surface evaluation
    x_min, x_max = X_train_svm[:, 0].min() - 0.5, X_train_svm[:, 0].max() + 0.5
    y_min, y_max = X_train_svm[:, 1].min() - 0.5, X_train_svm[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    axes[idx].contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    axes[idx].scatter(X_train_svm[:, 0], X_train_svm[:, 1], c=y_train_svm,
                      cmap='RdYlBu', edgecolors='black', s=50)
    # Circle the support vectors (green rings).
    axes[idx].scatter(svm.support_vectors_[:, 0], svm.support_vectors_[:, 1],
                      s=200, facecolors='none', edgecolors='green', linewidths=2)
    # Summarize each fit in the panel title: accuracy and SV count.
    train_acc = svm.score(X_train_svm, y_train_svm)
    test_acc = svm.score(X_test_svm, y_test_svm)
    n_sv = len(svm.support_vectors_)
    axes[idx].set_xlabel('Feature 1')
    axes[idx].set_ylabel('Feature 2')
    axes[idx].set_title(f'C={C}\nTrain: {train_acc:.3f}, Test: {test_acc:.3f}, SV: {n_sv}')
plt.tight_layout()
plt.show()
print("Effect of C:")
print("- Small C: Soft margin, more regularization, more support vectors")
print("- Large C: Hard margin, less regularization, fewer support vectors")
print("- Too large C can lead to overfitting")
4. Gaussian Processes for Regression
Gaussian Process Definition
A GP is a collection of random variables, any finite number of which have a joint Gaussian distribution:
\(f(\mathbf{x}) \sim \mathcal{GP}(m(\mathbf{x}), k(\mathbf{x}, \mathbf{x}'))\)
Where:
\(m(\mathbf{x})\) is the mean function (often 0)
\(k(\mathbf{x}, \mathbf{x}')\) is the covariance (kernel) function
Posterior Predictive Distribution
Given training data \(\mathcal{D} = \{(\mathbf{x}_i, y_i)\}_{i=1}^n\):
Mean: \(\mu_{*} = \mathbf{k}_*^T (\mathbf{K} + \sigma_n^2\mathbf{I})^{-1}\mathbf{y}\)
Variance: \(\sigma_*^2 = k_{**} - \mathbf{k}_*^T (\mathbf{K} + \sigma_n^2\mathbf{I})^{-1}\mathbf{k}_*\)
Where:
\(\mathbf{k}_* = [k(\mathbf{x}_*, \mathbf{x}_1), \ldots, k(\mathbf{x}_*, \mathbf{x}_n)]^T\)
\(k_{**} = k(\mathbf{x}_*, \mathbf{x}_*)\)
class GaussianProcessRegression:
    """Gaussian Process regression with an RBF kernel, implemented from scratch.

    Posterior mean:     mu_* = k_*^T (K + sigma_n^2 I)^{-1} y
    Posterior variance: var_* = k_** - k_*^T (K + sigma_n^2 I)^{-1} k_*

    Improvements over the original: the explicit matrix inverse
    (np.linalg.inv) is replaced by a Cholesky factorization + triangular
    solves, which is cheaper and numerically more stable; and predict()
    no longer materializes the full (n_test, n_test) test Gram matrix
    just to read its diagonal.
    """

    def __init__(self, kernel='rbf', sigma=1.0, noise=0.1):
        self.kernel_type = kernel  # only 'rbf' is supported
        self.sigma = sigma         # RBF length scale
        self.noise = noise         # observation noise std sigma_n

    def kernel(self, X, Y):
        """RBF kernel matrix: K_ij = exp(-||x_i - y_j||^2 / (2 sigma^2))."""
        if self.kernel_type == 'rbf':
            dists_sq = cdist(X, Y, metric='sqeuclidean')
            return np.exp(-dists_sq / (2 * self.sigma**2))
        raise ValueError(f"Unknown kernel: {self.kernel_type}")

    def fit(self, X, y):
        """Factorize K + sigma_n^2 I and precompute alpha = (K + sigma_n^2 I)^{-1} y."""
        self.X_train = X
        self.y_train = y
        K = self.kernel(X, X) + self.noise**2 * np.eye(len(y))
        # Cholesky solve replaces the original explicit np.linalg.inv.
        self.L_ = cholesky(K, lower=True)
        self.alpha_ = cho_solve((self.L_, True), y)
        return self

    def predict(self, X, return_std=False):
        """Posterior mean (and optionally std) at the test inputs X."""
        # k_*: kernel between test and training points, shape (n_test, n_train).
        k_star = self.kernel(X, self.X_train)
        # Mean: k_*^T (K + sigma_n^2 I)^{-1} y
        mean = k_star @ self.alpha_
        if return_std:
            # Solve (K + sigma_n^2 I)^{-1} k_*^T via the stored Cholesky factor.
            v = cho_solve((self.L_, True), k_star.T)
            # Prior variance k(x, x) of an RBF kernel is exactly 1 for every x,
            # so the full test Gram matrix is never needed.
            var = 1.0 - np.sum(k_star * v.T, axis=1)
            std = np.sqrt(np.maximum(var, 0))  # clamp tiny negatives from round-off
            return mean, std
        return mean
# Generate sparse training data
np.random.seed(42)
# Only five observations, so the posterior uncertainty between and beyond
# them is clearly visible in the plots.
X_train_gp = np.array([1, 3, 5, 6, 8]).reshape(-1, 1)
y_train_gp = np.sin(X_train_gp).flatten() + 0.1 * np.random.randn(len(X_train_gp))
X_test_gp = np.linspace(0, 10, 200).reshape(-1, 1)
y_true_gp = np.sin(X_test_gp).flatten()
# Fit custom GP
gp_custom = GaussianProcessRegression(sigma=1.0, noise=0.1)
gp_custom.fit(X_train_gp, y_train_gp)
y_pred_mean, y_pred_std = gp_custom.predict(X_test_gp, return_std=True)
# Fit sklearn GP with an equivalent kernel; the WhiteKernel carries the
# observation noise, so alpha=0 avoids counting the noise twice.
kernel_sklearn = ConstantKernel(1.0) * RBF(length_scale=1.0) + WhiteKernel(noise_level=0.1**2)
gp_sklearn = GaussianProcessRegressor(kernel=kernel_sklearn, alpha=0)
gp_sklearn.fit(X_train_gp, y_train_gp)
y_pred_sklearn, y_std_sklearn = gp_sklearn.predict(X_test_gp, return_std=True)
# Visualize both fits side by side.
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.scatter(X_train_gp, y_train_gp, s=100, c='red', zorder=10,
            edgecolors='black', label='Training data')
plt.plot(X_test_gp, y_true_gp, 'g--', linewidth=2, alpha=0.7, label='True function')
plt.plot(X_test_gp, y_pred_mean, 'b-', linewidth=2, label='GP mean prediction')
# Mean +/- 2 standard deviations approximates a 95% credible band.
plt.fill_between(X_test_gp.flatten(),
                 y_pred_mean - 2*y_pred_std,
                 y_pred_mean + 2*y_pred_std,
                 alpha=0.3, color='blue', label='95% confidence')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Custom Gaussian Process Regression')
plt.legend()
plt.grid(True, alpha=0.3)
plt.subplot(1, 2, 2)
plt.scatter(X_train_gp, y_train_gp, s=100, c='red', zorder=10,
            edgecolors='black', label='Training data')
plt.plot(X_test_gp, y_true_gp, 'g--', linewidth=2, alpha=0.7, label='True function')
plt.plot(X_test_gp, y_pred_sklearn, 'b-', linewidth=2, label='GP mean prediction')
plt.fill_between(X_test_gp.flatten(),
                 y_pred_sklearn - 2*y_std_sklearn,
                 y_pred_sklearn + 2*y_std_sklearn,
                 alpha=0.3, color='blue', label='95% confidence')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Sklearn Gaussian Process Regression')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("GP Regression Key Features:")
print("- Provides uncertainty estimates (crucial for decision-making)")
print("- Uncertainty increases far from training data")
print("- Uncertainty is low near training points")
print("- Naturally interpolates through data points (with low noise)")
GP Prior Samples
What: This code draws random functions from the Gaussian process prior \(f \sim \mathcal{GP}(0, k)\) with an RBF kernel at four different length scales \(\sigma \in \{0.5, 1.0, 2.0, 5.0\}\). Sampling is performed via the Cholesky decomposition \(\mathbf{L}\) of the kernel matrix: \(\mathbf{f} = \mathbf{L} \boldsymbol{\epsilon}\) where \(\boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})\).
Why: Visualizing prior samples builds intuition for what the GP "believes" before seeing any data. The length scale \(\sigma\) (also called \(\ell\)) controls the characteristic distance over which function values are correlated: small \(\sigma\) produces rapidly oscillating functions, while large \(\sigma\) produces smooth, slowly varying functions. Choosing an appropriate prior (kernel and its hyperparameters) is the most important modeling decision in GP regression, as it encodes assumptions about the function class.
Connection: Prior samples are a standard diagnostic in Bayesian modeling workflows. In Bayesian optimization (used for hyperparameter tuning), the GP prior over the objective function determines exploration behavior. In spatial statistics and geostatistics, the kernel length scale directly corresponds to the physical correlation range of the phenomenon being modeled.
# Sample from GP prior
# Dense grid of inputs at which the prior function draws are evaluated.
X_prior = np.linspace(0, 10, 100).reshape(-1, 1)
def sample_gp_prior(X, kernel_func, n_samples=5):
    """Draw n_samples function realizations from a zero-mean GP prior.

    Uses the Cholesky factor L of the Gram matrix (K = L L^T), so that
    f = L @ eps with eps ~ N(0, I) has covariance exactly K.
    """
    gram = kernel_func(X, X)
    # Jitter the diagonal so the Cholesky factorization succeeds numerically.
    gram = gram + 1e-8 * np.eye(len(X))
    chol = np.linalg.cholesky(gram)
    draws = np.random.randn(len(X), n_samples)
    return chol @ draws
# Draw random functions from the GP prior at several RBF length scales.
kf = KernelFunctions()
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# Different length scales
length_scales = [0.5, 1.0, 2.0, 5.0]
for idx, ls in enumerate(length_scales):
    ax = axes[idx // 2, idx % 2]
    # The lambda is consumed immediately within this iteration, so the usual
    # late-binding pitfall of closures in loops does not apply here.
    kernel_func = lambda X, Y: kf.rbf(X, Y, sigma=ls)
    samples = sample_gp_prior(X_prior, kernel_func, n_samples=5)
    for i in range(5):
        ax.plot(X_prior, samples[:, i], alpha=0.7, linewidth=2)
    ax.set_xlabel('x')
    ax.set_ylabel('f(x)')
    # Title string was mojibake-corrupted ("Ο"); fixed to σ.
    ax.set_title(f'GP Prior Samples (σ={ls})')
    ax.grid(True, alpha=0.3)
    ax.set_ylim(-3, 3)
plt.tight_layout()
plt.show()
print("Observations:")
print("- Small σ (length scale): Wiggly, rapid variations")
print("- Large σ: Smooth, slow variations")
print("- Length scale controls function smoothness")
5. GP Hyperparameter Optimization
Marginal Likelihood
We optimize hyperparameters \(\boldsymbol{\theta}\) (e.g., length scale, noise) by maximizing the log marginal likelihood:
\(\log p(\mathbf{y} \mid \mathbf{X}, \boldsymbol{\theta}) = -\frac{1}{2}\mathbf{y}^T \mathbf{K}_\theta^{-1}\mathbf{y} - \frac{1}{2}\log|\mathbf{K}_\theta| - \frac{n}{2}\log 2\pi\)
# Generate more complex data
np.random.seed(42)
X_train_opt = np.random.uniform(0, 10, 20).reshape(-1, 1)
y_train_opt = np.sin(X_train_opt).flatten() + 0.2 * np.random.randn(20)
# GP with hyperparameter optimization
# Each kernel component carries (lower, upper) bounds for its hyperparameter
# search; the optimizer maximizes the log marginal likelihood within them.
kernel_opt = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) + \
    WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-5, 1e1))
# n_restarts_optimizer=10 re-runs the optimizer from random starting points
# to reduce the chance of landing in a poor local optimum.
gp_opt = GaussianProcessRegressor(kernel=kernel_opt, n_restarts_optimizer=10)
gp_opt.fit(X_train_opt, y_train_opt)
print("Optimized Hyperparameters:")
print(f"Kernel: {gp_opt.kernel_}")
print(f"Log-marginal-likelihood: {gp_opt.log_marginal_likelihood_value_:.2f}")
# Predictions
X_test_opt = np.linspace(-1, 11, 200).reshape(-1, 1)
y_pred_opt, y_std_opt = gp_opt.predict(X_test_opt, return_std=True)
# Visualize
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.scatter(X_train_opt, y_train_opt, s=100, c='red', zorder=10,
            edgecolors='black', label='Training data')
plt.plot(X_test_opt, np.sin(X_test_opt), 'g--', linewidth=2,
         alpha=0.7, label='True function')
plt.plot(X_test_opt, y_pred_opt, 'b-', linewidth=2, label='GP prediction')
# Mean +/- 2 standard deviations approximates a 95% credible band.
plt.fill_between(X_test_opt.flatten(),
                 y_pred_opt - 2*y_std_opt,
                 y_pred_opt + 2*y_std_opt,
                 alpha=0.3, color='blue', label='95% confidence')
plt.xlabel('x')
plt.ylabel('y')
plt.title('GP with Optimized Hyperparameters')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xlim(-1, 11)
# Sample posterior functions
plt.subplot(1, 2, 2)
# Individual posterior draws visualize the remaining uncertainty directly.
y_samples = gp_opt.sample_y(X_test_opt, n_samples=10)
for i in range(10):
    plt.plot(X_test_opt, y_samples[:, i], alpha=0.5, linewidth=1)
plt.scatter(X_train_opt, y_train_opt, s=100, c='red', zorder=10,
            edgecolors='black', label='Training data')
plt.plot(X_test_opt, np.sin(X_test_opt), 'g--', linewidth=2,
         alpha=0.7, label='True function')
plt.xlabel('x')
plt.ylabel('y')
plt.title('GP Posterior Samples')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xlim(-1, 11)
plt.tight_layout()
plt.show()
Summary
In this notebook, we covered:
Kernel Functions: Linear, Polynomial, RBF, and their properties
Kernel Trick: Implicit mapping to high-dimensional spaces
Kernel Ridge Regression: Dual formulation using kernels
Support Vector Machines: Maximum margin classification with kernels
Gaussian Processes: Probabilistic, non-parametric regression
GP Hyperparameters: Optimization via marginal likelihood
Key Takeaways
Kernel Trick: Allows working in infinite-dimensional spaces efficiently
RBF Kernel: Most commonly used, universal approximator
SVMs: Effective for classification, especially with RBF kernel
Gaussian Processes:
Provide uncertainty quantification (unlike most methods)
Flexible, non-parametric
Computationally expensive: O(nΒ³) for training, O(nΒ²) for prediction
Great for small-to-medium datasets
Hyperparameters: Critical for performance (length scale, noise)
When to Use
Kernel Ridge Regression: When you need nonlinear regression, medium-sized data
SVM: Classification with clear margin of separation, high-dimensional data
Gaussian Processes: When uncertainty quantification is crucial, small-medium data, Bayesian optimization
Exercises
Implement the Sequential Minimal Optimization (SMO) algorithm for SVM
Derive the dual formulation of kernel ridge regression
Implement sparse GP using inducing points
Create a custom kernel and verify it's positive semi-definite
Implement GP classification using Laplace approximation