import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
sns.set_style('whitegrid')
# Fix RNG seeds so sampled weights and train/test splits are reproducible
np.random.seed(42)
torch.manual_seed(42)
1. PAC-Bayes Framework
Key Idea
Instead of bounding a single hypothesis \(h\), bound the expected error over a posterior distribution \(Q\) over hypotheses.
Setup
Prior \(P\): Distribution over hypotheses (chosen before seeing data)
Posterior \(Q\): Distribution over hypotheses (learned from data)
Stochastic classifier: Draw \(h \sim Q\), predict with \(h\)
PAC-Bayes Theorem (McAllester, 1999)
For any \(\delta > 0\), with probability \(\geq 1 - \delta\) over training set \(S\), simultaneously for all posteriors \(Q\):
\[\mathbb{E}_{h \sim Q}[R(h)] \leq \mathbb{E}_{h \sim Q}[\hat{R}(h)] + \sqrt{\frac{KL(Q \| P) + \ln(2\sqrt{n}/\delta)}{2n}}\]
where:
\(R(h)\) = true risk
\(\hat{R}(h)\) = empirical risk
\(KL(Q \| P)\) = KL divergence between posterior and prior
Intuition: Generalization depends on:
Empirical performance
Complexity = how much posterior differs from prior
2. Proof Sketch
Step 1: Change of Measure
For any random variable \(Z(h)\) and distributions \(P, Q\):
\[\mathbb{E}_{h \sim Q}[Z(h)] \leq KL(Q \| P) + \log \mathbb{E}_{h \sim P}[e^{Z(h)}]\]
Step 2: Donsker-Varadhan Variational Formula
\[KL(Q \| P) = \sup_{Z} \left\{ \mathbb{E}_{h \sim Q}[Z(h)] - \log \mathbb{E}_{h \sim P}[e^{Z(h)}] \right\}\]
Step 3: Apply Concentration
Use Hoeffding's inequality on \(\mathbb{E}_{h \sim P}[e^{\lambda(R(h) - \hat{R}(h))}]\) and optimize \(\lambda\).
Result: Bound that holds uniformly over all posteriors \(Q\).
def pac_bayes_bound(emp_risk, kl_div, n, delta=0.05):
    """Return the McAllester PAC-Bayes upper bound on the true risk.

    Parameters
    ----------
    emp_risk : float
        Empirical risk of the (stochastic) classifier.
    kl_div : float
        KL(Q || P) between posterior and prior over hypotheses.
    n : int
        Number of training samples.
    delta : float
        Confidence parameter; the bound holds with probability >= 1 - delta.
    """
    complexity = kl_div + np.log(2.0 * np.sqrt(n) / delta)
    return emp_risk + np.sqrt(complexity / (2.0 * n))
def visualize_pac_bayes_bound():
    """Plot the PAC-Bayes bound as a function of n and of KL(Q || P)."""
    sample_sizes = np.logspace(2, 4, 50)
    emp_risk = 0.1

    plt.figure(figsize=(12, 5))

    # Left panel: bound vs. sample size, one curve per KL value
    plt.subplot(1, 2, 1)
    for kl in (0.1, 1.0, 5.0, 10.0):
        curve = [pac_bayes_bound(emp_risk, kl, n) for n in sample_sizes]
        plt.plot(sample_sizes, curve, label=f'KL={kl}', linewidth=2)
    plt.axhline(emp_risk, color='black', linestyle='--', label='Empirical Risk')
    plt.xscale('log')
    plt.xlabel('Sample Size (n)', fontsize=12)
    plt.ylabel('Risk Bound', fontsize=12)
    plt.title('PAC-Bayes Bound vs Sample Size', fontsize=13)
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Right panel: bound vs. KL divergence, one curve per sample size
    plt.subplot(1, 2, 2)
    kl_grid = np.linspace(0, 20, 100)
    for n in (100, 500, 1000, 5000):
        curve = [pac_bayes_bound(emp_risk, kl, n) for kl in kl_grid]
        plt.plot(kl_grid, curve, label=f'n={n}', linewidth=2)
    plt.axhline(emp_risk, color='black', linestyle='--', label='Empirical Risk')
    plt.xlabel('KL(Q || P)', fontsize=12)
    plt.ylabel('Risk Bound', fontsize=12)
    plt.title('PAC-Bayes Bound vs KL Divergence', fontsize=13)
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
# Render the bound plots and state the qualitative takeaway.
visualize_pac_bayes_bound()
print("Key insight: Larger KL -> worse bound, more data -> tighter bound")
3. Application: Gaussian Posterior over Weights
Common Choice for Neural Networks
Prior: \(P = \mathcal{N}(0, \sigma_P^2 I)\)
Posterior: \(Q = \mathcal{N}(\mu, \sigma_Q^2 I)\)
KL Divergence (Closed Form)
\[KL(Q \| P) = \frac{1}{2}\left[\frac{d\,\sigma_Q^2 + \|\mu\|^2}{\sigma_P^2} - d + d \log\frac{\sigma_P^2}{\sigma_Q^2}\right]\]
where \(d\) = dimension of weight space.
PAC-Bayes Regularization
Minimize: \(\mathcal{L} = \mathbb{E}_{h \sim Q}[\hat{R}(h)] + \lambda \cdot KL(Q \| P)\)
This is exactly variational inference with data-dependent regularization!
class PACBayesLinear(nn.Module):
    """Linear layer with a factorized Gaussian posterior over its weights.

    Prior:     P = N(0, prior_std^2 I)   (fixed, chosen before training)
    Posterior: Q = N(mu, sigma^2 I)      (mu and log sigma are learnable)

    The forward pass samples weights via the reparameterization trick
    (so gradients flow through the sampling step), and kl_divergence()
    returns KL(Q || P) in closed form for use as a PAC-Bayes regularizer.
    """

    def __init__(self, in_features, out_features, prior_std=1.0):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.prior_std = prior_std
        # Posterior mean and log-std, both learnable.  Small initial means
        # and log sigma = -3 (sigma ~ 0.05) keep the initial KL moderate.
        self.weight_mu = nn.Parameter(torch.randn(out_features, in_features) * 0.01)
        self.weight_logsigma = nn.Parameter(torch.full((out_features, in_features), -3.0))
        self.bias_mu = nn.Parameter(torch.zeros(out_features))
        self.bias_logsigma = nn.Parameter(torch.full((out_features,), -3.0))

    def forward(self, x, sample=True):
        """Apply the layer: sample weights from Q if `sample`, else use means.

        `sample=False` gives the deterministic "mean classifier", used for
        evaluation.
        """
        if sample:
            # Reparameterization trick: w = mu + sigma * eps, eps ~ N(0, I)
            weight = self.weight_mu + torch.exp(self.weight_logsigma) * torch.randn_like(self.weight_mu)
            bias = self.bias_mu + torch.exp(self.bias_logsigma) * torch.randn_like(self.bias_mu)
        else:
            weight = self.weight_mu
            bias = self.bias_mu
        return F.linear(x, weight, bias)

    def _kl_term(self, mu, logsigma):
        """Closed-form KL(N(mu, sigma^2) || N(0, prior_std^2)), summed elementwise.

        Per element: 0.5 * (sigma^2/sp^2 + mu^2/sp^2 - 1 - 2 log sigma + 2 log sp).
        """
        sigma_sq = torch.exp(2 * logsigma)
        return 0.5 * torch.sum(
            (sigma_sq + mu**2) / self.prior_std**2
            - 1
            - 2 * logsigma
            + 2 * np.log(self.prior_std)
        )

    def kl_divergence(self):
        """Total KL(Q || P) over weight and bias parameters."""
        return (self._kl_term(self.weight_mu, self.weight_logsigma)
                + self._kl_term(self.bias_mu, self.bias_logsigma))
# Quick smoke test: one stochastic forward pass plus the KL of a fresh layer.
layer = PACBayesLinear(in_features=10, out_features=5, prior_std=1.0)
x = torch.randn(32, 10)
out = layer(x)
kl = layer.kl_divergence()
print(f"Output shape: {out.shape}")
print(f"KL divergence: {kl.item():.4f}")
4. Training with PAC-Bayes Bound
# Build a small two-moons binary classification problem with a 70/30 split.
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=200, noise=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Convert numpy arrays to torch tensors: float features, integer class labels.
X_train, X_test = torch.FloatTensor(X_train), torch.FloatTensor(X_test)
y_train, y_test = torch.LongTensor(y_train), torch.LongTensor(y_test)
print(f"Train: {len(X_train)}, Test: {len(X_test)}")
class PACBayesNet(nn.Module):
    """Two-layer MLP built from PACBayesLinear layers with a ReLU hidden layer."""

    def __init__(self, input_dim, hidden_dim, output_dim, prior_std=1.0):
        super().__init__()
        self.fc1 = PACBayesLinear(input_dim, hidden_dim, prior_std)
        self.fc2 = PACBayesLinear(hidden_dim, output_dim, prior_std)

    def forward(self, x, sample=True):
        """Forward pass; `sample` toggles stochastic vs. posterior-mean weights."""
        hidden = self.fc1(x, sample)
        return self.fc2(torch.relu(hidden), sample)

    def kl_divergence(self):
        """Sum of the per-layer KL(Q || P) terms."""
        total = self.fc1.kl_divergence()
        total = total + self.fc2.kl_divergence()
        return total
# Train with the PAC-Bayes objective:  E_{h~Q}[NLL] + (lambda / n) * KL(Q || P).
# Note: F.cross_entropy requires `import torch.nn.functional as F` at the top
# of the file (the original import block was missing it).
model = PACBayesNet(2, 20, 2, prior_std=1.0)
optimizer = optim.Adam(model.parameters(), lr=0.01)
n_epochs = 200
kl_weight = 0.01  # lambda: strength of the KL regularizer in the objective
n_train = len(X_train)
history = {'loss': [], 'nll': [], 'kl': [], 'train_acc': [], 'test_acc': []}

for epoch in range(n_epochs):
    model.train()
    optimizer.zero_grad()
    # Stochastic forward pass: weights sampled from the posterior Q
    logits = model(X_train, sample=True)
    nll = F.cross_entropy(logits, y_train)
    kl = model.kl_divergence()
    # PAC-Bayes surrogate objective: empirical risk + scaled KL complexity
    loss = nll + (kl_weight / n_train) * kl
    loss.backward()
    optimizer.step()

    # Evaluate deterministically with the posterior-mean weights
    model.eval()
    with torch.no_grad():
        train_pred = model(X_train, sample=False).argmax(1)
        test_pred = model(X_test, sample=False).argmax(1)
        train_acc = (train_pred == y_train).float().mean()
        test_acc = (test_pred == y_test).float().mean()

    history['loss'].append(loss.item())
    history['nll'].append(nll.item())
    history['kl'].append(kl.item())
    history['train_acc'].append(train_acc.item())
    history['test_acc'].append(test_acc.item())

    if (epoch + 1) % 50 == 0:
        print(f"Epoch {epoch+1}: Loss={loss:.4f}, NLL={nll:.4f}, KL={kl:.4f}, "
              f"Train Acc={train_acc:.3f}, Test Acc={test_acc:.3f}")

print("\nTraining complete!")
# Summary figure: loss components, accuracy curves, decision boundary,
# and the PAC-Bayes bound tracked over training.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# --- Loss components: NLL vs. the weighted KL term ---
ax_loss = axes[0, 0]
ax_loss.plot(history['nll'], label='NLL', linewidth=2)
weighted_kl = np.array(history['kl']) * kl_weight / n_train
ax_loss.plot(weighted_kl, label='KL (weighted)', linewidth=2)
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.set_title('Loss Components')
ax_loss.legend()
ax_loss.grid(True, alpha=0.3)

# --- Train / test accuracy over epochs ---
ax_acc = axes[0, 1]
ax_acc.plot(history['train_acc'], label='Train', linewidth=2)
ax_acc.plot(history['test_acc'], label='Test', linewidth=2)
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_title('Classification Accuracy')
ax_acc.legend()
ax_acc.grid(True, alpha=0.3)

# --- Decision boundary of the mean classifier on a dense grid ---
ax_bnd = axes[1, 0]
step = 0.02
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step))
grid = torch.FloatTensor(np.c_[xx.ravel(), yy.ravel()])
with torch.no_grad():
    Z = model(grid, sample=False).argmax(1).numpy().reshape(xx.shape)
ax_bnd.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
ax_bnd.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='k', cmap='RdYlBu', s=50)
ax_bnd.set_title('Decision Boundary')
ax_bnd.set_xlabel('Feature 1')
ax_bnd.set_ylabel('Feature 2')

# --- PAC-Bayes bound vs. observed train/test error ---
ax_pb = axes[1, 1]
emp_errors = 1 - np.array(history['train_acc'])
test_errors = 1 - np.array(history['test_acc'])
kl_divs = np.array(history['kl'])
bounds = [pac_bayes_bound(e, k, n_train) for e, k in zip(emp_errors, kl_divs)]
ax_pb.plot(emp_errors, label='Train Error', linewidth=2)
ax_pb.plot(test_errors, label='Test Error', linewidth=2)
ax_pb.plot(bounds, label='PAC-Bayes Bound', linewidth=2, linestyle='--')
ax_pb.set_xlabel('Epoch')
ax_pb.set_ylabel('Error Rate')
ax_pb.set_title('Generalization Bound')
ax_pb.legend()
ax_pb.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Final test error: {test_errors[-1]:.4f}")
print(f"PAC-Bayes bound: {bounds[-1]:.4f}")
print(f"Bound holds: {test_errors[-1] <= bounds[-1]}")
5. Advantages of PAC-Bayes
Compared to Standard PAC Learning:
Data-dependent: Bound adapts to learned posterior
Tighter: Often much better than worst-case VC bounds
Bayesian flavor: Incorporates prior knowledge naturally
Stochastic predictions: Can calibrate uncertainty
Applications:
Neural network generalization (understanding deep learning)
Model compression (quantization, pruning)
Transfer learning (prior from pre-training)
Federated learning (privacy-preserving bounds)
Modern Extensions:
PAC-Bayes for deep learning (Dziugaite & Roy, 2017)
Non-vacuous bounds for ImageNet (Zhou et al., 2019)
PAC-Bayes compression (Letarte et al., 2019)
Summary
Key Takeaways:
PAC-Bayes bounds expected risk over posterior distribution
Generalization depends on KL divergence from prior
Practical objective: Empirical risk + KL regularization
Connection to Bayesian learning: Variational inference with data-dependent prior weight
Modern relevance: Understanding deep learning generalization
When to Use PAC-Bayes:
Need data-dependent bounds
Have prior knowledge to encode in \(P\)
Want stochastic predictions with calibrated uncertainty
Studying neural network generalization
Next Steps:
05_neural_tangent_kernel.ipynb - Kernel view of neural networks
03_rademacher_complexity.ipynb - Alternative complexity measure
09_expectation_maximization.ipynb - Related optimization technique