import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns

# Global plot style plus a fixed seed so the Monte Carlo experiments
# below are reproducible run-to-run.
sns.set_style('whitegrid')
np.random.seed(42)
1. Rademacher Complexity Definition
Empirical Rademacher Complexity
For a hypothesis class \(\mathcal{H}\) and sample \(S = \{x_1, \ldots, x_n\}\):
\[\hat{\mathfrak{R}}_S(\mathcal{H}) = \mathbb{E}_{\sigma}\left[\sup_{h \in \mathcal{H}} \frac{1}{n} \sum_{i=1}^{n} \sigma_i h(x_i)\right]\]
where \(\sigma_i\) are independent Rademacher random variables: \(P(\sigma_i = +1) = P(\sigma_i = -1) = 1/2\).
Rademacher ComplexityΒΆ
Intuition: Measures how well the hypothesis class can fit random noise. Higher complexity means the class can better correlate with random labels.
2. Generalization Bound via Rademacher ComplexityΒΆ
Theorem (Rademacher Generalization Bound):
For any \(\delta > 0\), with probability at least \(1-\delta\) over the draw of \(S \sim \mathcal{D}^n\):
where:
\(R(h)\) = true risk (expected error)
\(\hat{R}(h)\) = empirical risk (training error)
\(\mathfrak{R}_n(\mathcal{H})\) = Rademacher complexity
Key insight: Generalization gap bounded by complexity of hypothesis class.
def compute_empirical_rademacher(X, hypothesis_class, n_trials=1000):
    """Estimate the empirical Rademacher complexity via Monte Carlo.

    Approximates E_sigma[ sup_h (1/n) sum_i sigma_i h(x_i) ] by averaging
    the supremum of sign-correlations over randomly drawn sign vectors.

    Args:
        X: Data points, shape (n_samples, n_features).
        hypothesis_class: Callable mapping X to a (n_hypotheses, n_samples)
            array of predictions, one row per hypothesis.
        n_trials: Number of random sign vectors to average over.

    Returns:
        Monte Carlo estimate of the empirical Rademacher complexity.
    """
    n = len(X)
    trial_sups = np.empty(n_trials)
    for trial in range(n_trials):
        # One vector of uniform random signs (Rademacher variables).
        signs = np.random.choice([-1, 1], size=n)
        # Evaluate every hypothesis in the class on the sample.
        preds = hypothesis_class(X)
        # Average correlation of each hypothesis with the random signs,
        # then take the supremum over the class.
        trial_sups[trial] = (preds @ signs / n).max()
    return trial_sups.mean()
# Example: Linear hypotheses on 1D data
def linear_hypothesis_class_1d(X, weights=None):
"""Generate predictions from linear hypotheses h(x) = wx."""
if weights is None:
weights = np.linspace(-2, 2, 100)
X_flat = X.flatten()
# predictions[i, j] = weight[i] * X[j]
predictions = np.outer(weights, X_flat)
return predictions
# Test with varying sample sizes: the Rademacher complexity of the linear
# class should decay at the O(1/sqrt(n)) rate predicted by theory.
sample_sizes = [10, 20, 50, 100, 200, 500]
complexities = []
for n in sample_sizes:
    X = np.random.randn(n, 1)
    rad_comp = compute_empirical_rademacher(X, linear_hypothesis_class_1d)
    complexities.append(rad_comp)
# Theoretical bound for linear class: R_n ~ 1/sqrt(n)
theoretical = 1.0 / np.sqrt(sample_sizes)
plt.figure(figsize=(10, 6))
plt.plot(sample_sizes, complexities, 'o-', label='Empirical', linewidth=2, markersize=8)
# NOTE: label previously rendered as 'O(1/βn)' due to mojibake; fixed to √.
plt.plot(sample_sizes, theoretical, 's--', label='Theoretical O(1/√n)', linewidth=2, markersize=8)
plt.xlabel('Sample Size (n)', fontsize=12)
plt.ylabel('Rademacher Complexity', fontsize=12)
plt.title('Rademacher Complexity vs Sample Size (Linear Class)', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Empirical complexities: {complexities}")
print(f"Decay rate: approximately 1/√n")
3. Rademacher Complexity for Different Hypothesis Classes
Known Results:
Linear functions in \(\mathbb{R}^d\) with bounded \(\|x\| \leq X\), \(\|w\| \leq W\): \(\mathfrak{R}_n(\mathcal{H}) = O\left(\frac{WX}{\sqrt{n}}\right)\)
Neural networks (depth \(L\), width \(W\), spectral norm bound \(M\)): \(\mathfrak{R}_n(\mathcal{H}) = O\left(\frac{M^L \sqrt{W L \log(W)}}{\sqrt{n}}\right)\)
Polynomial features of degree \(d\) in \(\mathbb{R}^p\): \(\mathfrak{R}_n(\mathcal{H}) = O\left(\sqrt{\frac{\binom{p+d}{d}}{n}}\right)\)
# Compare Rademacher complexity across polynomial degrees
def polynomial_hypothesis_class(X, degree, n_hypotheses=50):
    """Predictions from random unit-norm polynomial hypotheses.

    Expands X into polynomial features of the given degree, then draws
    n_hypotheses random weight vectors (normalized to unit Euclidean norm)
    and evaluates each resulting hypothesis on the sample.

    Args:
        X: Data points, shape (n_samples, n_features).
        degree: Maximum polynomial degree of the feature expansion.
        n_hypotheses: Number of random hypotheses to sample.

    Returns:
        Array of shape (n_hypotheses, n_samples).
    """
    features = PolynomialFeatures(degree=degree).fit_transform(X)
    outputs = []
    for _ in range(n_hypotheses):
        # Random direction on the unit sphere of the feature space.
        direction = np.random.randn(features.shape[1])
        direction = direction / np.linalg.norm(direction)
        outputs.append(features @ direction)
    return np.array(outputs)
# Sweep over polynomial degrees: complexity should grow with capacity.
n_samples = 100
X = np.random.randn(n_samples, 1)
degrees = range(1, 11)
rad_complexities = []
for d in degrees:
    # Freeze the current degree into a single-argument hypothesis class.
    def hyp_class(X_input, _degree=d):
        return polynomial_hypothesis_class(X_input, degree=_degree)
    rad_complexities.append(
        compute_empirical_rademacher(X, hyp_class, n_trials=200))
plt.figure(figsize=(10, 6))
plt.plot(degrees, rad_complexities, 'o-', linewidth=2, markersize=8, color='darkblue')
plt.xlabel('Polynomial Degree', fontsize=12)
plt.ylabel('Empirical Rademacher Complexity', fontsize=12)
plt.title(f'Complexity Growth with Model Capacity (n={n_samples})', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Rademacher complexity increases with model capacity (polynomial degree)")
4. Connection to Generalization
Experiment: Rademacher Complexity vs Generalization Gap
We'll verify empirically that models with higher Rademacher complexity exhibit larger generalization gaps.
# Generate synthetic regression data
def generate_data(n, noise=0.3):
    """Draw n noisy samples of y = sin(x) with x uniform on [-3, 3].

    Args:
        n: Number of samples to generate.
        noise: Standard deviation of the additive Gaussian noise.

    Returns:
        Tuple (X, y) with X of shape (n, 1) and y of shape (n,).
    """
    inputs = np.random.uniform(-3, 3, (n, 1))
    targets = np.sin(inputs).flatten() + noise * np.random.randn(n)
    return inputs, targets
n_train = 50
n_test = 500
X_train, y_train = generate_data(n_train)
X_test, y_test = generate_data(n_test)
degrees = [1, 3, 5, 7, 9, 12, 15]
results = []
for degree in degrees:
    # Fit a ridge-regularized polynomial model of this degree.
    expander = PolynomialFeatures(degree=degree)
    train_features = expander.fit_transform(X_train)
    test_features = expander.transform(X_test)
    model = Ridge(alpha=0.01)
    model.fit(train_features, y_train)
    # Mean-squared error on both splits; their difference is the gap.
    train_error = np.mean((model.predict(train_features) - y_train) ** 2)
    test_error = np.mean((model.predict(test_features) - y_test) ** 2)
    # Estimate the Rademacher complexity of this degree's class; freeze
    # the loop variable as a default so the closure is self-contained.
    def hyp_class(X_input, _degree=degree):
        return polynomial_hypothesis_class(X_input, degree=_degree, n_hypotheses=30)
    rad_comp = compute_empirical_rademacher(X_train, hyp_class, n_trials=100)
    results.append({
        'degree': degree,
        'train_error': train_error,
        'test_error': test_error,
        'gen_gap': test_error - train_error,
        'rad_complexity': rad_comp,
    })
# Visualize the experiment: error curves on the left, complexity-vs-gap
# scatter on the right.
fig, (err_ax, gap_ax) = plt.subplots(1, 2, figsize=(14, 5))
degree_vals = [r['degree'] for r in results]
# Left panel: training vs test error across degrees.
err_ax.plot(degree_vals, [r['train_error'] for r in results],
            'o-', label='Train Error', linewidth=2, markersize=8)
err_ax.plot(degree_vals, [r['test_error'] for r in results],
            's-', label='Test Error', linewidth=2, markersize=8)
err_ax.set_xlabel('Polynomial Degree', fontsize=12)
err_ax.set_ylabel('MSE', fontsize=12)
err_ax.set_title('Training vs Test Error', fontsize=13)
err_ax.legend(fontsize=11)
err_ax.grid(True, alpha=0.3)
# Right panel: Rademacher complexity against generalization gap.
rad_vals = [r['rad_complexity'] for r in results]
gap_vals = [r['gen_gap'] for r in results]
gap_ax.scatter(rad_vals, gap_vals, s=100, alpha=0.6)
for rad, gap, r in zip(rad_vals, gap_vals, results):
    gap_ax.annotate(f"d={r['degree']}", (rad, gap), fontsize=9, ha='right')
gap_ax.set_xlabel('Rademacher Complexity', fontsize=12)
gap_ax.set_ylabel('Generalization Gap', fontsize=12)
gap_ax.set_title('Rademacher Complexity vs Gen. Gap', fontsize=13)
gap_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Observation: Higher Rademacher complexity correlates with larger generalization gap")
5. Comparison: Rademacher Complexity vs VC Dimension
Relationship:
For binary classification with \(0\)-\(1\) loss and VC dimension \(d\):
\[\mathfrak{R}_n(\mathcal{H}) \leq \sqrt{\frac{2 d \log(en/d)}{n}}\]
Key Differences:
| Aspect | VC Dimension | Rademacher Complexity |
|---|---|---|
| Type | Combinatorial | Probabilistic |
| Distribution | Independent | Depends on \(\mathcal{D}\) |
| Tightness | Often loose | Often tighter |
| Applicability | Binary classification | General losses |
| Computation | Hard | Can estimate empirically |
Advantage of Rademacher: Can incorporate data distribution, leading to tighter bounds.
6. Advanced: Gaussian Complexity
Gaussian Complexity (similar to Rademacher):
\[\mathfrak{G}_S(\mathcal{H}) = \mathbb{E}_{g}\left[\sup_{h \in \mathcal{H}} \frac{1}{n} \sum_{i=1}^{n} g_i h(x_i)\right]\]
where \(g_i \sim \mathcal{N}(0, 1)\) are independent Gaussians.
Key Result: the two complexities agree up to constants: \(\mathfrak{R}_S(\mathcal{H}) \leq \sqrt{\frac{\pi}{2}}\, \mathfrak{G}_S(\mathcal{H})\) and \(\mathfrak{G}_S(\mathcal{H}) \leq 2\sqrt{\log n}\, \mathfrak{R}_S(\mathcal{H})\).
Both give similar bounds; Gaussian complexity is sometimes easier to analyze theoretically.
def compute_gaussian_complexity(X, hypothesis_class, n_trials=1000):
    """Monte Carlo estimate of the Gaussian complexity.

    Identical in structure to the Rademacher estimate, except the random
    signs are replaced with independent standard-normal draws g_i ~ N(0, 1).

    Args:
        X: Data points, shape (n_samples, n_features).
        hypothesis_class: Callable mapping X to a (n_hypotheses, n_samples)
            prediction array.
        n_trials: Number of Gaussian noise vectors to average over.

    Returns:
        Estimated Gaussian complexity.
    """
    n = len(X)
    trial_sups = []
    for _ in range(n_trials):
        # One standard-normal noise vector per trial.
        noise = np.random.randn(n)
        preds = hypothesis_class(X)
        # Supremum over the class of the average noise correlation.
        trial_sups.append(np.max(preds @ noise / n))
    return np.mean(trial_sups)
# Compare the two complexity estimates on the same data and class.
n = 100
X = np.random.randn(n, 1)
rad_comp = compute_empirical_rademacher(X, linear_hypothesis_class_1d, n_trials=500)
gauss_comp = compute_gaussian_complexity(X, linear_hypothesis_class_1d, n_trials=500)
# The standard relation is R <= sqrt(pi/2) * G, i.e. the ratio R/G is
# bounded above by sqrt(pi/2); the original print had mojibake symbols
# and stated the inequality in the wrong direction.
ratio = gauss_comp / rad_comp
theoretical_ratio = np.sqrt(np.pi / 2)
print(f"Rademacher Complexity: {rad_comp:.4f}")
print(f"Gaussian Complexity: {gauss_comp:.4f}")
print(f"Ratio (Gaussian/Rademacher): {ratio:.4f}")
print(f"Theoretical constant √(π/2): {theoretical_ratio:.4f}")
print(f"\nStandard relation: R ≤ √(π/2) × G (complexities agree up to constants)")
SummaryΒΆ
Key Takeaways:ΒΆ
Rademacher complexity measures how well a hypothesis class can fit random noise
Provides data-dependent generalization bounds (often tighter than VC dimension)
Decreases as \(O(1/\sqrt{n})\) for common hypothesis classes
Higher complexity β larger generalization gap (empirically verified)
Can be estimated empirically via Monte Carlo sampling
Gaussian complexity is closely related and sometimes easier to analyze
Practical Implications:ΒΆ
Use simpler models (lower complexity) when data is limited
Regularization reduces effective complexity
Model selection should balance fit and complexity
Next Steps:ΒΆ
04_pac_bayes_theory.ipynb - Bayesian approach to PAC learning
05_neural_tangent_kernel.ipynb - Kernel perspective on neural networks
Return to 01_introduction_learning_theory.ipynb for PAC framework review