import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns

# Consistent plot styling for all figures in this notebook.
sns.set_style('whitegrid')
# Seed the global NumPy RNG so every run is reproducible.
np.random.seed(42)

1. Rademacher Complexity Definition

Empirical Rademacher Complexity

For a hypothesis class \(\mathcal{H}\) and sample \(S = \{x_1, \ldots, x_n\}\):

\[\hat{\mathfrak{R}}_S(\mathcal{H}) = \mathbb{E}_{\sigma} \left[ \sup_{h \in \mathcal{H}} \frac{1}{n} \sum_{i=1}^n \sigma_i h(x_i) \right]\]

where \(\sigma_i\) are independent Rademacher random variables: \(P(\sigma_i = +1) = P(\sigma_i = -1) = 1/2\).

Rademacher Complexity

\[\mathfrak{R}_n(\mathcal{H}) = \mathbb{E}_{S \sim \mathcal{D}^n} [\hat{\mathfrak{R}}_S(\mathcal{H})]\]

Intuition: Measures how well the hypothesis class can fit random noise. Higher complexity means the class can better correlate with random labels.

2. Generalization Bound via Rademacher Complexity

Theorem (Rademacher Generalization Bound):

For any \(\delta > 0\), with probability at least \(1-\delta\) over the draw of \(S \sim \mathcal{D}^n\):

\[\forall h \in \mathcal{H}: \quad R(h) \leq \hat{R}(h) + 2\mathfrak{R}_n(\mathcal{H}) + \sqrt{\frac{\log(1/\delta)}{2n}}\]

where:

  • \(R(h)\) = true risk (expected error)

  • \(\hat{R}(h)\) = empirical risk (training error)

  • \(\mathfrak{R}_n(\mathcal{H})\) = Rademacher complexity

Key insight: Generalization gap bounded by complexity of hypothesis class.

def compute_empirical_rademacher(X, hypothesis_class, n_trials=1000):
    """
    Monte Carlo estimate of the empirical Rademacher complexity.

    For each trial a random sign vector sigma in {-1, +1}^n is drawn and the
    best correlation (1/n) * max_h sum_i sigma_i h(x_i) over the hypothesis
    class is recorded; the estimate is the average of these per-trial suprema.

    Args:
        X: Data points (n_samples, n_features)
        hypothesis_class: Callable mapping X to an array of shape
            (n_hypotheses, n_samples), one row of predictions per hypothesis
        n_trials: Number of random sign patterns to sample

    Returns:
        Empirical Rademacher complexity estimate
    """
    n_points = len(X)
    trial_suprema = np.empty(n_trials)

    for t in range(n_trials):
        # Rademacher signs: each coordinate is +1 or -1 with probability 1/2.
        signs = np.random.choice([-1, 1], size=n_points)

        # Re-query the class every trial; this also supports stochastic
        # hypothesis generators that resample their hypotheses on each call.
        preds = hypothesis_class(X)

        # Correlation of every hypothesis with the sign pattern, then the
        # supremum over the (finite) set of hypotheses returned.
        trial_suprema[t] = np.max(preds @ signs / n_points)

    return np.mean(trial_suprema)

# Example: Linear hypotheses on 1D data
def linear_hypothesis_class_1d(X, weights=None):
    """Predictions of the 1-D linear class {h_w(x) = w * x}.

    Args:
        X: Input points; flattened to 1-D before use.
        weights: Optional 1-D array of slopes w. Defaults to 100 values
            evenly spaced on [-2, 2].

    Returns:
        Array of shape (n_weights, n_points) whose (i, j) entry is
        weights[i] * X[j].
    """
    if weights is None:
        weights = np.linspace(-2, 2, 100)

    # Outer product applies every slope to every input point at once.
    return np.outer(weights, np.ravel(X))

# Test with varying sample sizes
# Estimate the empirical Rademacher complexity of the 1-D linear class on
# standard-normal samples of increasing size, then compare the decay against
# the theoretical O(1/sqrt(n)) rate for norm-bounded linear predictors.
sample_sizes = [10, 20, 50, 100, 200, 500]
complexities = []

for n in sample_sizes:
    # Fresh Gaussian sample of n points in R^1 for each sample size.
    X = np.random.randn(n, 1)
    rad_comp = compute_empirical_rademacher(X, linear_hypothesis_class_1d)
    complexities.append(rad_comp)

# Theoretical bound for linear class: R_n ~ 1/sqrt(n)
# (constant factor taken as 1; only the decay rate is being compared).
theoretical = 1.0 / np.sqrt(sample_sizes)

plt.figure(figsize=(10, 6))
plt.plot(sample_sizes, complexities, 'o-', label='Empirical', linewidth=2, markersize=8)
plt.plot(sample_sizes, theoretical, 's--', label='Theoretical O(1/√n)', linewidth=2, markersize=8)
plt.xlabel('Sample Size (n)', fontsize=12)
plt.ylabel('Rademacher Complexity', fontsize=12)
plt.title('Rademacher Complexity vs Sample Size (Linear Class)', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"Empirical complexities: {complexities}")
print(f"Decay rate: approximately 1/√n")

3. Rademacher Complexity for Different Hypothesis Classes

Known Results:

  1. Linear functions in \(\mathbb{R}^d\) with bounded \(\|x\| \leq X\), \(\|w\| \leq W\): \[\mathfrak{R}_n(\mathcal{H}) = O\left(\frac{WX}{\sqrt{n}}\right)\]

  2. Neural networks (depth \(L\), width \(W\), spectral norm bound \(M\)): \[\mathfrak{R}_n(\mathcal{H}) = O\left(\frac{M^L \sqrt{W L \log(W)}}{\sqrt{n}}\right)\]

  3. Polynomial features of degree \(d\) in \(\mathbb{R}^p\): \[\mathfrak{R}_n(\mathcal{H}) = O\left(\sqrt{\frac{\binom{p+d}{d}}{n}}\right)\]

# Compare Rademacher complexity across polynomial degrees
def polynomial_hypothesis_class(X, degree, n_hypotheses=50):
    """Sample random unit-norm linear hypotheses over polynomial features.

    Each hypothesis is h(x) = <w, phi(x)>, where phi expands x with all
    polynomial terms up to `degree` and w is a Gaussian draw normalised to
    unit length.

    Args:
        X: Input data of shape (n_samples, n_features).
        degree: Maximum polynomial degree of the feature expansion.
        n_hypotheses: Number of random hypotheses to sample.

    Returns:
        Array of shape (n_hypotheses, n_samples) of predictions.
    """
    features = PolynomialFeatures(degree=degree).fit_transform(X)

    rows = []
    for _ in range(n_hypotheses):
        # Gaussian direction scaled to unit length.
        direction = np.random.randn(features.shape[1])
        direction = direction / np.linalg.norm(direction)
        rows.append(features @ direction)

    return np.array(rows)

# Test different polynomial degrees
# Fix one sample of 100 points and measure how the empirical Rademacher
# complexity grows as the polynomial degree (model capacity) increases.
n_samples = 100
X = np.random.randn(n_samples, 1)

degrees = range(1, 11)
rad_complexities = []

for d in degrees:
    # NOTE: hyp_class closes over the loop variable d, but it is only called
    # within this same iteration, so Python's late binding is harmless here.
    def hyp_class(X_input):
        return polynomial_hypothesis_class(X_input, degree=d)
    
    # Fewer trials than the default (200 vs 1000) to keep runtime modest.
    rad_comp = compute_empirical_rademacher(X, hyp_class, n_trials=200)
    rad_complexities.append(rad_comp)

plt.figure(figsize=(10, 6))
plt.plot(degrees, rad_complexities, 'o-', linewidth=2, markersize=8, color='darkblue')
plt.xlabel('Polynomial Degree', fontsize=12)
plt.ylabel('Empirical Rademacher Complexity', fontsize=12)
plt.title(f'Complexity Growth with Model Capacity (n={n_samples})', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("Rademacher complexity increases with model capacity (polynomial degree)")

4. Connection to Generalization

Experiment: Rademacher Complexity vs Generalization Gap

We’ll verify empirically that models with higher Rademacher complexity exhibit larger generalization gaps.

# Generate synthetic regression data
def generate_data(n, noise=0.3):
    """Draw a noisy sine-wave regression sample.

    Args:
        n: Number of points to draw.
        noise: Standard deviation of the additive Gaussian target noise.

    Returns:
        Tuple (X, y): X has shape (n, 1), uniform on [-3, 3];
        y has shape (n,) with y = sin(x) + noise * eps, eps ~ N(0, 1).
    """
    # Inputs first, then noise, so the RNG call order is deterministic.
    X = np.random.uniform(-3, 3, (n, 1))
    targets = np.sin(X).ravel() + noise * np.random.randn(n)
    return X, targets

n_train = 50
n_test = 500
X_train, y_train = generate_data(n_train)
X_test, y_test = generate_data(n_test)

degrees = [1, 3, 5, 7, 9, 12, 15]
results = []

for degree in degrees:
    # Fit polynomial model
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    # Reuse the fitted expansion so train and test share one feature map.
    X_test_poly = poly.transform(X_test)
    
    # Tiny ridge penalty: near-unregularized least squares, but keeps the
    # high-degree fits numerically stable.
    model = Ridge(alpha=0.01)
    model.fit(X_train_poly, y_train)
    
    # Compute errors (mean squared error on train and held-out test sets).
    train_error = np.mean((model.predict(X_train_poly) - y_train)**2)
    test_error = np.mean((model.predict(X_test_poly) - y_test)**2)
    gen_gap = test_error - train_error
    
    # Estimate Rademacher complexity on the training inputs only
    # (labels play no role in the complexity estimate).
    def hyp_class(X_input):
        return polynomial_hypothesis_class(X_input, degree=degree, n_hypotheses=30)
    rad_comp = compute_empirical_rademacher(X_train, hyp_class, n_trials=100)
    
    results.append({
        'degree': degree,
        'train_error': train_error,
        'test_error': test_error,
        'gen_gap': gen_gap,
        'rad_complexity': rad_comp
    })

# Plot results
# Left panel: classic U-shaped test-error curve vs polynomial degree.
# Right panel: estimated Rademacher complexity against the observed
# generalization gap, one point per degree.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Train vs Test Error
ax1 = axes[0]
ax1.plot([r['degree'] for r in results], [r['train_error'] for r in results], 
         'o-', label='Train Error', linewidth=2, markersize=8)
ax1.plot([r['degree'] for r in results], [r['test_error'] for r in results], 
         's-', label='Test Error', linewidth=2, markersize=8)
ax1.set_xlabel('Polynomial Degree', fontsize=12)
ax1.set_ylabel('MSE', fontsize=12)
ax1.set_title('Training vs Test Error', fontsize=13)
ax1.legend(fontsize=11)
ax1.grid(True, alpha=0.3)

# Plot 2: Rademacher Complexity vs Generalization Gap
ax2 = axes[1]
rad_vals = [r['rad_complexity'] for r in results]
gap_vals = [r['gen_gap'] for r in results]
ax2.scatter(rad_vals, gap_vals, s=100, alpha=0.6)
# Label each scatter point with its polynomial degree.
for i, r in enumerate(results):
    ax2.annotate(f"d={r['degree']}", (rad_vals[i], gap_vals[i]), 
                fontsize=9, ha='right')
ax2.set_xlabel('Rademacher Complexity', fontsize=12)
ax2.set_ylabel('Generalization Gap', fontsize=12)
ax2.set_title('Rademacher Complexity vs Gen. Gap', fontsize=13)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Observation: Higher Rademacher complexity correlates with larger generalization gap")

5. Comparison: Rademacher Complexity vs VC Dimension

Relationship:

For binary classification with \(0\)-\(1\) loss and VC dimension \(d\):

\[\mathfrak{R}_n(\mathcal{H}) \leq \sqrt{\frac{2d \log(en/d)}{n}}\]

Key Differences:

| Aspect        | VC Dimension          | Rademacher Complexity      |
|---------------|-----------------------|----------------------------|
| Type          | Combinatorial         | Probabilistic              |
| Distribution  | Independent           | Depends on \(\mathcal{D}\) |
| Tightness     | Often loose           | Often tighter              |
| Applicability | Binary classification | General losses             |
| Computation   | Hard                  | Can estimate empirically   |

Advantage of Rademacher: Can incorporate data distribution, leading to tighter bounds.

6. Advanced: Gaussian Complexity

Gaussian Complexity (similar to Rademacher):

\[\mathfrak{G}_S(\mathcal{H}) = \mathbb{E}_{g} \left[ \sup_{h \in \mathcal{H}} \frac{1}{n} \sum_{i=1}^n g_i h(x_i) \right]\]

where \(g_i \sim \mathcal{N}(0, 1)\) are independent Gaussians.

Key Result: \[\mathfrak{R}_S(\mathcal{H}) \leq \mathfrak{G}_S(\mathcal{H}) \leq \sqrt{\frac{\pi}{2}} \mathfrak{R}_S(\mathcal{H})\]

Both give similar bounds, Gaussian complexity sometimes easier to analyze theoretically.

def compute_gaussian_complexity(X, hypothesis_class, n_trials=1000):
    """Monte Carlo estimate of Gaussian complexity.

    Identical to the Rademacher estimator except that the random signs are
    replaced by independent standard-normal multipliers g_i ~ N(0, 1).

    Args:
        X: Data points (n_samples, n_features).
        hypothesis_class: Callable mapping X to an array of shape
            (n_hypotheses, n_samples) of predictions.
        n_trials: Number of Gaussian noise vectors to average over.

    Returns:
        Gaussian complexity estimate.
    """
    n_points = len(X)
    trial_suprema = np.empty(n_trials)

    for t in range(n_trials):
        # Standard-normal multipliers instead of random +/-1 signs.
        noise = np.random.randn(n_points)

        preds = hypothesis_class(X)
        trial_suprema[t] = np.max(preds @ noise / n_points)

    return np.mean(trial_suprema)

# Compare Rademacher and Gaussian complexities on the same linear class and
# the same sample, and check the ratio against the sqrt(pi/2) bound above.
n = 100
X = np.random.randn(n, 1)

rad_comp = compute_empirical_rademacher(X, linear_hypothesis_class_1d, n_trials=500)
gauss_comp = compute_gaussian_complexity(X, linear_hypothesis_class_1d, n_trials=500)

ratio = gauss_comp / rad_comp
theoretical_ratio = np.sqrt(np.pi / 2)

print(f"Rademacher Complexity: {rad_comp:.4f}")
print(f"Gaussian Complexity: {gauss_comp:.4f}")
print(f"Ratio (Gaussian/Rademacher): {ratio:.4f}")
print(f"Theoretical upper bound: {theoretical_ratio:.4f}")
# Fixed mojibake in the original output string ("≀" -> "≤", "Ο€" -> "π",
# "Γ—" -> "×"), restoring the intended inequality.
print(f"\nVerifies: R ≤ G ≤ √(π/2) × R")

Summary

Key Takeaways:

  1. Rademacher complexity measures how well a hypothesis class can fit random noise

  2. Provides data-dependent generalization bounds (often tighter than VC dimension)

  3. Decreases as \(O(1/\sqrt{n})\) for common hypothesis classes

  4. Higher complexity β†’ larger generalization gap (empirically verified)

  5. Can be estimated empirically via Monte Carlo sampling

  6. Gaussian complexity is closely related and sometimes easier to analyze

Practical Implications:

  • Use simpler models (lower complexity) when data is limited

  • Regularization reduces effective complexity

  • Model selection should balance fit and complexity

Next Steps:

  • 04_pac_bayes_theory.ipynb - Bayesian approach to PAC learning

  • 05_neural_tangent_kernel.ipynb - Kernel perspective on neural networks

  • Return to 01_introduction_learning_theory.ipynb for PAC framework review