import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns

# Global plot style plus a fixed seed so the Monte Carlo experiments
# below are reproducible run-to-run.
sns.set_style('whitegrid')
np.random.seed(42)
1. Rademacher Complexity Definition
Empirical Rademacher Complexity
For a hypothesis class \(\mathcal{H}\) and sample \(S = \{x_1, \ldots, x_n\}\):
\[\hat{\mathfrak{R}}_S(\mathcal{H}) = \mathbb{E}_{\sigma}\left[\sup_{h \in \mathcal{H}} \frac{1}{n} \sum_{i=1}^{n} \sigma_i h(x_i)\right]\]
where \(\sigma_i\) are independent Rademacher random variables: \(P(\sigma_i = +1) = P(\sigma_i = -1) = 1/2\).
Rademacher ComplexityΒΆ
Intuition: Measures how well the hypothesis class can fit random noise. Higher complexity means the class can better correlate with random labels.
2. Generalization Bound via Rademacher ComplexityΒΆ
Theorem (Rademacher Generalization Bound):
For any \(\delta > 0\), with probability at least \(1-\delta\) over the draw of \(S \sim \mathcal{D}^n\):
where:
\(R(h)\) = true risk (expected error)
\(\hat{R}(h)\) = empirical risk (training error)
\(\mathfrak{R}_n(\mathcal{H})\) = Rademacher complexity
Key insight: Generalization gap bounded by complexity of hypothesis class.
def compute_empirical_rademacher(X, hypothesis_class, n_trials=1000):
    """Estimate the empirical Rademacher complexity via Monte Carlo.

    Approximates E_sigma[ sup_h (1/n) sum_i sigma_i h(x_i) ] by averaging
    the supremum of sign-correlations over randomly drawn sign vectors.

    Args:
        X: Data points, shape (n_samples, n_features).
        hypothesis_class: Callable mapping X to a (n_hypotheses, n_samples)
            array of predictions, one row per hypothesis.
        n_trials: Number of random sign vectors to average over.

    Returns:
        Monte Carlo estimate of the empirical Rademacher complexity.
    """
    n = len(X)
    trial_sups = np.empty(n_trials)
    for trial in range(n_trials):
        # One vector of uniform random signs (Rademacher variables).
        signs = np.random.choice([-1, 1], size=n)
        # Evaluate every hypothesis in the class on the sample.
        preds = hypothesis_class(X)
        # Average correlation of each hypothesis with the random signs,
        # then take the supremum over the class.
        trial_sups[trial] = (preds @ signs / n).max()
    return trial_sups.mean()
# Example: Linear hypotheses on 1D data
def linear_hypothesis_class_1d(X, weights=None):
"""Generate predictions from linear hypotheses h(x) = wx."""
if weights is None:
weights = np.linspace(-2, 2, 100)
X_flat = X.flatten()
# predictions[i, j] = weight[i] * X[j]
predictions = np.outer(weights, X_flat)
return predictions
# Test with varying sample sizes: the Rademacher complexity of the linear
# class should decay at the O(1/sqrt(n)) rate predicted by theory.
sample_sizes = [10, 20, 50, 100, 200, 500]
complexities = []
for n in sample_sizes:
    X = np.random.randn(n, 1)
    rad_comp = compute_empirical_rademacher(X, linear_hypothesis_class_1d)
    complexities.append(rad_comp)
# Theoretical bound for linear class: R_n ~ 1/sqrt(n)
theoretical = 1.0 / np.sqrt(sample_sizes)
plt.figure(figsize=(10, 6))
plt.plot(sample_sizes, complexities, 'o-', label='Empirical', linewidth=2, markersize=8)
# NOTE: label previously rendered as 'O(1/βn)' due to mojibake; fixed to √.
plt.plot(sample_sizes, theoretical, 's--', label='Theoretical O(1/√n)', linewidth=2, markersize=8)
plt.xlabel('Sample Size (n)', fontsize=12)
plt.ylabel('Rademacher Complexity', fontsize=12)
plt.title('Rademacher Complexity vs Sample Size (Linear Class)', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Empirical complexities: {complexities}")
print(f"Decay rate: approximately 1/√n")
3. Rademacher Complexity for Different Hypothesis Classes
Known Results:
Linear functions in \(\mathbb{R}^d\) with bounded \(\|x\| \leq X\), \(\|w\| \leq W\): \(\mathfrak{R}_n(\mathcal{H}) = O\left(\frac{WX}{\sqrt{n}}\right)\)
Neural networks (depth \(L\), width \(W\), spectral norm bound \(M\)): \(\mathfrak{R}_n(\mathcal{H}) = O\left(\frac{M^L \sqrt{W L \log(W)}}{\sqrt{n}}\right)\)
Polynomial features of degree \(d\) in \(\mathbb{R}^p\): \(\mathfrak{R}_n(\mathcal{H}) = O\left(\sqrt{\frac{\binom{p+d}{d}}{n}}\right)\)
# Compare Rademacher complexity across polynomial degrees
def polynomial_hypothesis_class(X, degree, n_hypotheses=50):
    """Predictions from random unit-norm polynomial hypotheses.

    Expands X into polynomial features of the given degree, then draws
    n_hypotheses random weight vectors (normalized to unit Euclidean norm)
    and evaluates each resulting hypothesis on the sample.

    Args:
        X: Data points, shape (n_samples, n_features).
        degree: Maximum polynomial degree of the feature expansion.
        n_hypotheses: Number of random hypotheses to sample.

    Returns:
        Array of shape (n_hypotheses, n_samples).
    """
    features = PolynomialFeatures(degree=degree).fit_transform(X)
    outputs = []
    for _ in range(n_hypotheses):
        # Random direction on the unit sphere of the feature space.
        direction = np.random.randn(features.shape[1])
        direction = direction / np.linalg.norm(direction)
        outputs.append(features @ direction)
    return np.array(outputs)
# Sweep over polynomial degrees: complexity should grow with capacity.
n_samples = 100
X = np.random.randn(n_samples, 1)
degrees = range(1, 11)
rad_complexities = []
for d in degrees:
    # Freeze the current degree into a single-argument hypothesis class.
    def hyp_class(X_input, _degree=d):
        return polynomial_hypothesis_class(X_input, degree=_degree)
    rad_complexities.append(
        compute_empirical_rademacher(X, hyp_class, n_trials=200))
plt.figure(figsize=(10, 6))
plt.plot(degrees, rad_complexities, 'o-', linewidth=2, markersize=8, color='darkblue')
plt.xlabel('Polynomial Degree', fontsize=12)
plt.ylabel('Empirical Rademacher Complexity', fontsize=12)
plt.title(f'Complexity Growth with Model Capacity (n={n_samples})', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Rademacher complexity increases with model capacity (polynomial degree)")
4. Connection to Generalization
Experiment: Rademacher Complexity vs Generalization Gap
We'll verify empirically that models with higher Rademacher complexity exhibit larger generalization gaps.
# Generate synthetic regression data
def generate_data(n, noise=0.3):
    """Draw n noisy samples of y = sin(x) with x uniform on [-3, 3].

    Args:
        n: Number of samples to generate.
        noise: Standard deviation of the additive Gaussian noise.

    Returns:
        Tuple (X, y) with X of shape (n, 1) and y of shape (n,).
    """
    inputs = np.random.uniform(-3, 3, (n, 1))
    targets = np.sin(inputs).flatten() + noise * np.random.randn(n)
    return inputs, targets
n_train = 50
n_test = 500
X_train, y_train = generate_data(n_train)
X_test, y_test = generate_data(n_test)
degrees = [1, 3, 5, 7, 9, 12, 15]
results = []
for degree in degrees:
    # Fit a ridge-regularized polynomial model of this degree.
    expander = PolynomialFeatures(degree=degree)
    train_features = expander.fit_transform(X_train)
    test_features = expander.transform(X_test)
    model = Ridge(alpha=0.01)
    model.fit(train_features, y_train)
    # Mean-squared error on both splits; their difference is the gap.
    train_error = np.mean((model.predict(train_features) - y_train) ** 2)
    test_error = np.mean((model.predict(test_features) - y_test) ** 2)
    # Estimate the Rademacher complexity of this degree's class; freeze
    # the loop variable as a default so the closure is self-contained.
    def hyp_class(X_input, _degree=degree):
        return polynomial_hypothesis_class(X_input, degree=_degree, n_hypotheses=30)
    rad_comp = compute_empirical_rademacher(X_train, hyp_class, n_trials=100)
    results.append({
        'degree': degree,
        'train_error': train_error,
        'test_error': test_error,
        'gen_gap': test_error - train_error,
        'rad_complexity': rad_comp,
    })
# Visualize the experiment: error curves on the left, complexity-vs-gap
# scatter on the right.
fig, (err_ax, gap_ax) = plt.subplots(1, 2, figsize=(14, 5))
degree_vals = [r['degree'] for r in results]
# Left panel: training vs test error across degrees.
err_ax.plot(degree_vals, [r['train_error'] for r in results],
            'o-', label='Train Error', linewidth=2, markersize=8)
err_ax.plot(degree_vals, [r['test_error'] for r in results],
            's-', label='Test Error', linewidth=2, markersize=8)
err_ax.set_xlabel('Polynomial Degree', fontsize=12)
err_ax.set_ylabel('MSE', fontsize=12)
err_ax.set_title('Training vs Test Error', fontsize=13)
err_ax.legend(fontsize=11)
err_ax.grid(True, alpha=0.3)
# Right panel: Rademacher complexity against generalization gap.
rad_vals = [r['rad_complexity'] for r in results]
gap_vals = [r['gen_gap'] for r in results]
gap_ax.scatter(rad_vals, gap_vals, s=100, alpha=0.6)
for rad, gap, r in zip(rad_vals, gap_vals, results):
    gap_ax.annotate(f"d={r['degree']}", (rad, gap), fontsize=9, ha='right')
gap_ax.set_xlabel('Rademacher Complexity', fontsize=12)
gap_ax.set_ylabel('Generalization Gap', fontsize=12)
gap_ax.set_title('Rademacher Complexity vs Gen. Gap', fontsize=13)
gap_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Observation: Higher Rademacher complexity correlates with larger generalization gap")
5. Comparison: Rademacher Complexity vs VC Dimension
Relationship:
For binary classification with \(0\)-\(1\) loss and VC dimension \(d\):
\[\mathfrak{R}_n(\mathcal{H}) \leq \sqrt{\frac{2 d \log(en/d)}{n}}\]
Key Differences:
| Aspect | VC Dimension | Rademacher Complexity |
|---|---|---|
| Type | Combinatorial | Probabilistic |
| Distribution | Independent | Depends on \(\mathcal{D}\) |
| Tightness | Often loose | Often tighter |
| Applicability | Binary classification | General losses |
| Computation | Hard | Can estimate empirically |
Advantage of Rademacher: Can incorporate data distribution, leading to tighter bounds.
6. Advanced: Gaussian Complexity
Gaussian Complexity (similar to Rademacher):
\[\mathfrak{G}_S(\mathcal{H}) = \mathbb{E}_{g}\left[\sup_{h \in \mathcal{H}} \frac{1}{n} \sum_{i=1}^{n} g_i h(x_i)\right]\]
where \(g_i \sim \mathcal{N}(0, 1)\) are independent Gaussians.
Key Result: the two complexities agree up to constants: \(\mathfrak{R}_S(\mathcal{H}) \leq \sqrt{\frac{\pi}{2}}\, \mathfrak{G}_S(\mathcal{H})\) and \(\mathfrak{G}_S(\mathcal{H}) \leq 2\sqrt{\log n}\, \mathfrak{R}_S(\mathcal{H})\).
Both give similar bounds; Gaussian complexity is sometimes easier to analyze theoretically.
def compute_gaussian_complexity(X, hypothesis_class, n_trials=1000):
    """Monte Carlo estimate of the Gaussian complexity.

    Identical in structure to the Rademacher estimate, except the random
    signs are replaced with independent standard-normal draws g_i ~ N(0, 1).

    Args:
        X: Data points, shape (n_samples, n_features).
        hypothesis_class: Callable mapping X to a (n_hypotheses, n_samples)
            prediction array.
        n_trials: Number of Gaussian noise vectors to average over.

    Returns:
        Estimated Gaussian complexity.
    """
    n = len(X)
    trial_sups = []
    for _ in range(n_trials):
        # One standard-normal noise vector per trial.
        noise = np.random.randn(n)
        preds = hypothesis_class(X)
        # Supremum over the class of the average noise correlation.
        trial_sups.append(np.max(preds @ noise / n))
    return np.mean(trial_sups)
# Compare the two complexity estimates on the same data and class.
n = 100
X = np.random.randn(n, 1)
rad_comp = compute_empirical_rademacher(X, linear_hypothesis_class_1d, n_trials=500)
gauss_comp = compute_gaussian_complexity(X, linear_hypothesis_class_1d, n_trials=500)
# The standard relation is R <= sqrt(pi/2) * G, i.e. the ratio R/G is
# bounded above by sqrt(pi/2); the original print had mojibake symbols
# and stated the inequality in the wrong direction.
ratio = gauss_comp / rad_comp
theoretical_ratio = np.sqrt(np.pi / 2)
print(f"Rademacher Complexity: {rad_comp:.4f}")
print(f"Gaussian Complexity: {gauss_comp:.4f}")
print(f"Ratio (Gaussian/Rademacher): {ratio:.4f}")
print(f"Theoretical constant √(π/2): {theoretical_ratio:.4f}")
print(f"\nStandard relation: R ≤ √(π/2) × G (complexities agree up to constants)")
SummaryΒΆ
Key Takeaways:ΒΆ
Rademacher complexity measures how well a hypothesis class can fit random noise
Provides data-dependent generalization bounds (often tighter than VC dimension)
Decreases as \(O(1/\sqrt{n})\) for common hypothesis classes
Higher complexity β larger generalization gap (empirically verified)
Can be estimated empirically via Monte Carlo sampling
Gaussian complexity is closely related and sometimes easier to analyze
Practical Implications:ΒΆ
Use simpler models (lower complexity) when data is limited
Regularization reduces effective complexity
Model selection should balance fit and complexity
Next Steps:ΒΆ
04_pac_bayes_theory.ipynb - Bayesian approach to PAC learning
05_neural_tangent_kernel.ipynb - Kernel perspective on neural networks
Return to 01_introduction_learning_theory.ipynb for PAC framework review