import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal

# Global plotting style and RNG seeds so figures and sampled weights reproduce.
sns.set_style('whitegrid')
np.random.seed(42)
torch.manual_seed(42)

1. PAC-Bayes Framework

Key Idea

Instead of bounding a single hypothesis \(h\), bound the expected error over a posterior distribution \(Q\) over hypotheses.

Setup

  • Prior \(P\): Distribution over hypotheses (chosen before seeing data)

  • Posterior \(Q\): Distribution over hypotheses (learned from data)

  • Stochastic classifier: Draw \(h \sim Q\), predict with \(h\)

PAC-Bayes Theorem (McAllester, 1999)

For any \(\delta > 0\), with probability \(\geq 1 - \delta\) over training set \(S\):

\[\forall Q: \quad \mathbb{E}_{h \sim Q}[R(h)] \leq \mathbb{E}_{h \sim Q}[\hat{R}(h)] + \sqrt{\frac{KL(Q \| P) + \log(2\sqrt{n}/\delta)}{2n}}\]

where:

  • \(R(h)\) = true risk

  • \(\hat{R}(h)\) = empirical risk

  • \(KL(Q \| P)\) = KL divergence between posterior and prior

Intuition: Generalization depends on:

  1. Empirical performance

  2. Complexity = how much posterior differs from prior

2. Proof Sketch

Step 1: Change of Measure

For any random variable \(Z(h)\) and distributions \(P, Q\):

\[\mathbb{E}_{h \sim Q}[Z(h)] = \mathbb{E}_{h \sim P}\left[\frac{dQ}{dP}(h) Z(h)\right]\]

Step 2: Donsker-Varadhan Variational Formula

\[\log \mathbb{E}_{h \sim P}[e^{Z(h)}] \geq \mathbb{E}_{h \sim Q}[Z(h)] - KL(Q \| P)\]

Step 3: Apply Concentration

Use Hoeffding’s inequality on \(\mathbb{E}_{h \sim P}[e^{\lambda(R(h) - \hat{R}(h))}]\) and optimize \(\lambda\).

Result: Bound that holds uniformly over all posteriors \(Q\).

def pac_bayes_bound(emp_risk, kl_div, n, delta=0.05):
    """Evaluate McAllester's PAC-Bayes upper bound on the expected true risk.

    Args:
        emp_risk: empirical risk E_{h~Q}[R_hat(h)].
        kl_div: KL(Q || P) between posterior and prior.
        n: number of training samples.
        delta: confidence level; the bound holds with probability >= 1 - delta.

    Returns:
        Upper bound on E_{h~Q}[R(h)].
    """
    complexity = kl_div + np.log(2.0 * np.sqrt(n) / delta)
    slack = np.sqrt(complexity / (2.0 * n))
    return emp_risk + slack

def visualize_pac_bayes_bound():
    """Plot how the PAC-Bayes bound varies with sample size and KL divergence."""
    sample_sizes = np.logspace(2, 4, 50)
    kl_choices = [0.1, 1.0, 5.0, 10.0]
    empirical = 0.1

    plt.figure(figsize=(12, 5))

    # Left panel: bound vs n, one curve per KL value.
    plt.subplot(1, 2, 1)
    for kl in kl_choices:
        curve = [pac_bayes_bound(empirical, kl, m) for m in sample_sizes]
        plt.plot(sample_sizes, curve, label=f'KL={kl}', linewidth=2)
    plt.axhline(empirical, color='black', linestyle='--', label='Empirical Risk')
    plt.xscale('log')
    plt.xlabel('Sample Size (n)', fontsize=12)
    plt.ylabel('Risk Bound', fontsize=12)
    plt.title('PAC-Bayes Bound vs Sample Size', fontsize=13)
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Right panel: bound vs KL, one curve per sample size.
    plt.subplot(1, 2, 2)
    kl_axis = np.linspace(0, 20, 100)
    for m in [100, 500, 1000, 5000]:
        curve = [pac_bayes_bound(empirical, kl, m) for kl in kl_axis]
        plt.plot(kl_axis, curve, label=f'n={m}', linewidth=2)
    plt.axhline(empirical, color='black', linestyle='--', label='Empirical Risk')
    plt.xlabel('KL(Q || P)', fontsize=12)
    plt.ylabel('Risk Bound', fontsize=12)
    plt.title('PAC-Bayes Bound vs KL Divergence', fontsize=13)
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Render the bound curves and print the qualitative takeaway.
# Fix: the printed string contained mojibake ("β†’") instead of arrows.
visualize_pac_bayes_bound()
print("Key insight: Larger KL → worse bound, more data → tighter bound")

3. Application: Gaussian Posterior over Weights

Common Choice for Neural Networks

  • Prior: \(P = \mathcal{N}(0, \sigma_P^2 I)\)

  • Posterior: \(Q = \mathcal{N}(\mu, \sigma_Q^2 I)\)

KL Divergence (Closed Form)

\[KL(\mathcal{N}(\mu, \sigma_Q^2) \| \mathcal{N}(0, \sigma_P^2)) = \frac{d}{2}\left[\frac{\sigma_Q^2}{\sigma_P^2} + \frac{\|\mu\|^2}{\sigma_P^2} - 1 - \log\frac{\sigma_Q^2}{\sigma_P^2}\right]\]

where \(d\) = dimension of weight space.

PAC-Bayes Regularization

Minimize: \[\mathcal{L} = \mathbb{E}_{h \sim Q}[\hat{R}(h)] + \lambda \cdot KL(Q \| P)\]

This is exactly variational inference with data-dependent regularization!

class PACBayesLinear(nn.Module):
    """Linear layer with a factorized Gaussian posterior Q = N(mu, sigma^2 I).

    The prior is P = N(0, prior_std^2 I); kl_divergence() returns the
    closed-form KL(Q || P) used as the complexity term in the PAC-Bayes
    training objective.
    """

    def __init__(self, in_features, out_features, prior_std=1.0):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.prior_std = prior_std

        # Posterior parameters (learnable). Sigma is stored as log(sigma)
        # so it remains positive under unconstrained gradient updates.
        self.weight_mu = nn.Parameter(torch.randn(out_features, in_features) * 0.01)
        self.weight_logsigma = nn.Parameter(torch.ones(out_features, in_features) * -3)
        self.bias_mu = nn.Parameter(torch.zeros(out_features))
        self.bias_logsigma = nn.Parameter(torch.ones(out_features) * -3)

    def forward(self, x, sample=True):
        """Forward pass.

        If ``sample``, weights are drawn from the posterior via the
        reparameterization trick (mu + sigma * eps); otherwise the posterior
        mean is used for a deterministic forward (evaluation).
        """
        if sample:
            weight_sigma = torch.exp(self.weight_logsigma)
            weight = self.weight_mu + weight_sigma * torch.randn_like(self.weight_mu)

            bias_sigma = torch.exp(self.bias_logsigma)
            bias = self.bias_mu + bias_sigma * torch.randn_like(self.bias_mu)
        else:
            weight = self.weight_mu
            bias = self.bias_mu

        # Fix: the original called `F.linear`, but `F` (torch.nn.functional)
        # was never imported in this file; use the submodule via `nn` instead.
        return nn.functional.linear(x, weight, bias)

    def kl_divergence(self):
        """Closed-form KL(Q || P) for diagonal Gaussians, summed over weights and bias.

        Per dimension: 0.5 * [sigma^2/sp^2 + mu^2/sp^2 - 1 - log(sigma^2/sp^2)],
        where sp = prior_std. The log term is expanded as
        -2*logsigma + 2*log(prior_std).
        """
        weight_sigma = torch.exp(self.weight_logsigma)
        bias_sigma = torch.exp(self.bias_logsigma)

        # KL contribution of the weight matrix
        kl_weight = 0.5 * torch.sum(
            weight_sigma**2 / self.prior_std**2 +
            self.weight_mu**2 / self.prior_std**2 -
            1 - 2 * self.weight_logsigma + 2 * np.log(self.prior_std)
        )

        # KL contribution of the bias vector
        kl_bias = 0.5 * torch.sum(
            bias_sigma**2 / self.prior_std**2 +
            self.bias_mu**2 / self.prior_std**2 -
            1 - 2 * self.bias_logsigma + 2 * np.log(self.prior_std)
        )

        return kl_weight + kl_bias

# Smoke test: one stochastic forward pass and the KL term of a single layer.
layer = PACBayesLinear(10, 5, prior_std=1.0)
x = torch.randn(32, 10)
out = layer(x)  # sample=True by default: weights drawn from the posterior
kl = layer.kl_divergence()
print(f"Output shape: {out.shape}")
print(f"KL divergence: {kl.item():.4f}")

4. Training with PAC-Bayes Bound

# Generate synthetic binary classification data
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Two interleaving half-moons: not linearly separable, a good fit for a small MLP.
X, y = make_moons(n_samples=200, noise=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Convert to torch tensors: float32 features, int64 class labels (for cross_entropy).
X_train = torch.FloatTensor(X_train)
y_train = torch.LongTensor(y_train)
X_test = torch.FloatTensor(X_test)
y_test = torch.LongTensor(y_test)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")
class PACBayesNet(nn.Module):
    """Two-layer ReLU MLP whose linear layers carry Gaussian posteriors."""

    def __init__(self, input_dim, hidden_dim, output_dim, prior_std=1.0):
        super().__init__()
        self.fc1 = PACBayesLinear(input_dim, hidden_dim, prior_std)
        self.fc2 = PACBayesLinear(hidden_dim, output_dim, prior_std)

    def forward(self, x, sample=True):
        """Forward pass; `sample` toggles stochastic weights vs posterior means."""
        hidden = torch.relu(self.fc1(x, sample))
        return self.fc2(hidden, sample)

    def kl_divergence(self):
        """Total KL(Q || P): sum of the per-layer KL terms."""
        return self.fc1.kl_divergence() + self.fc2.kl_divergence()

# Train with PAC-Bayes objective
model = PACBayesNet(2, 20, 2, prior_std=1.0)
optimizer = optim.Adam(model.parameters(), lr=0.01)

n_epochs = 200
kl_weight = 0.01  # Ξ» in objective
n_train = len(X_train)

history = {'loss': [], 'nll': [], 'kl': [], 'train_acc': [], 'test_acc': []}

for epoch in range(n_epochs):
    model.train()
    optimizer.zero_grad()
    
    # Forward pass (sample from posterior)
    logits = model(X_train, sample=True)
    nll = F.cross_entropy(logits, y_train)
    kl = model.kl_divergence()
    
    # PAC-Bayes objective
    loss = nll + (kl_weight / n_train) * kl
    
    loss.backward()
    optimizer.step()
    
    # Evaluate
    model.eval()
    with torch.no_grad():
        train_pred = model(X_train, sample=False).argmax(1)
        test_pred = model(X_test, sample=False).argmax(1)
        train_acc = (train_pred == y_train).float().mean()
        test_acc = (test_pred == y_test).float().mean()
    
    history['loss'].append(loss.item())
    history['nll'].append(nll.item())
    history['kl'].append(kl.item())
    history['train_acc'].append(train_acc.item())
    history['test_acc'].append(test_acc.item())
    
    if (epoch + 1) % 50 == 0:
        print(f"Epoch {epoch+1}: Loss={loss:.4f}, NLL={nll:.4f}, KL={kl:.4f}, "
              f"Train Acc={train_acc:.3f}, Test Acc={test_acc:.3f}")

print("\nTraining complete!")
# Visualize results: loss components, accuracy curves, decision boundary,
# and the PAC-Bayes bound compared against the actual test error.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Loss components: NLL vs the KL term scaled by the weight used in the objective.
ax1 = axes[0, 0]
ax1.plot(history['nll'], label='NLL', linewidth=2)
ax1.plot(np.array(history['kl']) * kl_weight / n_train, label='KL (weighted)', linewidth=2)
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.set_title('Loss Components')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Train vs test accuracy over epochs.
ax2 = axes[0, 1]
ax2.plot(history['train_acc'], label='Train', linewidth=2)
ax2.plot(history['test_acc'], label='Test', linewidth=2)
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.set_title('Classification Accuracy')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Decision boundary: evaluate the posterior-mean network on a dense 2D grid.
ax3 = axes[1, 0]
h = 0.02  # grid step size
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
grid = torch.FloatTensor(np.c_[xx.ravel(), yy.ravel()])

with torch.no_grad():
    Z = model(grid, sample=False).argmax(1).numpy()
Z = Z.reshape(xx.shape)

ax3.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
ax3.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='k', cmap='RdYlBu', s=50)
ax3.set_title('Decision Boundary')
ax3.set_xlabel('Feature 1')
ax3.set_ylabel('Feature 2')

# PAC-Bayes bound per epoch, computed from the recorded train error and KL.
ax4 = axes[1, 1]
emp_errors = 1 - np.array(history['train_acc'])
kl_divs = np.array(history['kl'])
bounds = [pac_bayes_bound(emp, kl, n_train) for emp, kl in zip(emp_errors, kl_divs)]
test_errors = 1 - np.array(history['test_acc'])

ax4.plot(emp_errors, label='Train Error', linewidth=2)
ax4.plot(test_errors, label='Test Error', linewidth=2)
ax4.plot(bounds, label='PAC-Bayes Bound', linewidth=2, linestyle='--')
ax4.set_xlabel('Epoch')
ax4.set_ylabel('Error Rate')
ax4.set_title('Generalization Bound')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Sanity check: the bound should upper-bound the observed test error.
print(f"Final test error: {test_errors[-1]:.4f}")
print(f"PAC-Bayes bound: {bounds[-1]:.4f}")
print(f"Bound holds: {test_errors[-1] <= bounds[-1]}")

5. Advantages of PAC-Bayes

Compared to Standard PAC Learning:

  1. Data-dependent: Bound adapts to learned posterior

  2. Tighter: Often much better than worst-case VC bounds

  3. Bayesian flavor: Incorporates prior knowledge naturally

  4. Stochastic predictions: Can calibrate uncertainty

Applications:

  • Neural network generalization (understanding deep learning)

  • Model compression (quantization, pruning)

  • Transfer learning (prior from pre-training)

  • Federated learning (privacy-preserving bounds)

Modern Extensions:

  • PAC-Bayes for deep learning (Dziugaite & Roy, 2017)

  • Non-vacuous bounds for ImageNet (Zhou et al., 2019)

  • PAC-Bayes compression (Letarte et al., 2019)

Summary

Key Takeaways:

  1. PAC-Bayes bounds expected risk over posterior distribution

  2. Generalization depends on KL divergence from prior

  3. Practical objective: Empirical risk + KL regularization

  4. Connection to Bayesian learning: Variational inference with data-dependent prior weight

  5. Modern relevance: Understanding deep learning generalization

When to Use PAC-Bayes:

  • Need data-dependent bounds

  • Have prior knowledge to encode in \(P\)

  • Want stochastic predictions with calibrated uncertainty

  • Studying neural network generalization

Next Steps:

  • 05_neural_tangent_kernel.ipynb - Kernel view of neural networks

  • 03_rademacher_complexity.ipynb - Alternative complexity measure

  • 09_expectation_maximization.ipynb - Related optimization technique