import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
sns.set_style('whitegrid')  # consistent seaborn styling for every plot below
np.random.seed(42)  # fix the global RNG so the Monte Carlo experiments are reproducible
1. Why Concentration Inequalities?¶
The Central Question¶
Given i.i.d. random variables \(X_1, \ldots, X_n\): $\(S_n = \frac{1}{n} \sum_{i=1}^n X_i\)$
How close is \(S_n\) to \(\mathbb{E}[X]\)?
Classical Results¶
Law of Large Numbers: \(S_n \to \mathbb{E}[X]\) as \(n \to \infty\)
Central Limit Theorem: \(\sqrt{n}(S_n - \mathbb{E}[X]) \sim \mathcal{N}(0, \sigma^2)\)
Problem: These are asymptotic! Need finite-sample guarantees.
Concentration Inequalities Provide:¶
Non-asymptotic tail bounds of the form \(P(|S_n - \mathbb{E}[X]| \geq \epsilon) \leq f(n, \epsilon)\), where \(f\) decays exponentially in \(n\)!
Applications¶
Machine learning: Generalization bounds
Statistics: Confidence intervals
Algorithms: Randomized analysis
High-dimensional probability: Blessing of dimensionality
2. Markov and Chebyshev Inequalities¶
Markov's Inequality¶
For \(X \geq 0\) and \(a > 0\): $\(P(X \geq a) \leq \frac{\mathbb{E}[X]}{a}\)$
Proof: $\(\mathbb{E}[X] = \int_0^\infty x f(x) dx \geq \int_a^\infty x f(x) dx \geq a \int_a^\infty f(x) dx = a \cdot P(X \geq a)\)$
Chebyshev's Inequality¶
For any random variable \(X\) with variance \(\sigma^2\): $\(P(|X - \mathbb{E}[X]| \geq t) \leq \frac{\sigma^2}{t^2}\)$
Proof: Apply Markov to \((X - \mathbb{E}[X])^2\).
Limitations¶
Polynomial decay in \(t\) (not exponential)
Loose bounds in practice
Don't exploit independence
# Empirical verification of Chebyshev's inequality for the sample mean of
# Bernoulli(0.5) data: P(|X̄ - μ| ≥ ε) ≤ σ²/ε².
# (Removed an unused `n_samples = 10000` that was never read.)
n_trials = 100
sample_size = 50

# Draw n_trials independent sample means.
means = np.array([
    np.random.binomial(1, 0.5, sample_size).mean()
    for _ in range(n_trials)
])

# Theoretical parameters of the sample mean.
mu = 0.5
sigma2 = 0.25 / sample_size  # Var(X̄) = p(1-p)/n for Bernoulli(p=0.5)

# Empirical tail probability vs. the Chebyshev bound over a grid of ε.
epsilons = np.linspace(0.05, 0.3, 10)
empirical_prob = [np.mean(np.abs(means - mu) >= eps) for eps in epsilons]
chebyshev_bound = [sigma2 / eps**2 for eps in epsilons]

plt.figure(figsize=(10, 6))
plt.plot(epsilons, empirical_prob, 'o-', linewidth=2, markersize=8,
         label='Empirical P(|mean - μ| ≥ ε)')
plt.plot(epsilons, chebyshev_bound, 's-', linewidth=2, markersize=8,
         label='Chebyshev bound: σ²/ε²')
plt.xlabel('ε', fontsize=12)
plt.ylabel('Probability', fontsize=12)
plt.title('Chebyshev Inequality Verification', fontsize=13)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()
print("Chebyshev bound is valid but loose!")
3. Hoeffding's Inequality¶
Hoeffding's Lemma¶
If \(X \in [a, b]\) with \(\mathbb{E}[X] = 0\), then: $\(\mathbb{E}[e^{\lambda X}] \leq e^{\lambda^2 (b-a)^2 / 8}\)$
Proof sketch: Use convexity and Taylor expansion.
Hoeffding's Inequality (1963)¶
Let \(X_1, \ldots, X_n\) be independent with \(X_i \in [a_i, b_i]\). Then: $\(P\left(\sum_{i=1}^n (X_i - \mathbb{E}[X_i]) \geq t\right) \leq \exp\left(-\frac{2t^2}{\sum_{i=1}^n (b_i - a_i)^2}\right)\)$
Bounded Case¶
If \(X_i \in [0, 1]\) and \(\bar{X} = \frac{1}{n}\sum X_i\): $\(P(|\bar{X} - \mathbb{E}[\bar{X}]| \geq \epsilon) \leq 2 \exp(-2n\epsilon^2)\)$
Exponential decay in \(n\)!
def hoeffding_bound(n, epsilon):
    """Two-sided Hoeffding bound for the mean of n i.i.d. [0, 1] variables.

    Returns the upper bound 2 * exp(-2 * n * epsilon**2) on
    P(|sample mean - expected mean| >= epsilon).
    """
    exponent = -2.0 * n * epsilon ** 2
    return 2.0 * np.exp(exponent)
def empirical_deviation_prob(n, epsilon, p=0.5, n_trials=10000):
    """Monte Carlo estimate of P(|sample mean - p| >= epsilon).

    Draws n_trials independent samples of n Bernoulli(p) variables and
    returns the fraction of trials whose sample mean deviates from p by
    at least epsilon.
    """
    abs_devs = np.array([
        abs(np.random.binomial(1, p, n).mean() - p)
        for _ in range(n_trials)
    ])
    return np.mean(abs_devs >= epsilon)
# Compare the empirical deviation probability with the Hoeffding bound as
# the sample size grows, at a fixed deviation epsilon = 0.1.
sample_sizes = [10, 50, 100, 500, 1000]
epsilon = 0.1

empirical_probs = []
hoeffding_bounds = []
for n in sample_sizes:
    emp = empirical_deviation_prob(n, epsilon)
    bound = hoeffding_bound(n, epsilon)
    empirical_probs.append(emp)
    hoeffding_bounds.append(bound)
    print(f"n={n:4d}: Empirical={emp:.6f}, Hoeffding={bound:.6f}")

# Log-scale y-axis makes the exponential decay in n appear linear.
plt.figure(figsize=(10, 6))
plt.semilogy(sample_sizes, empirical_probs, 'o-', linewidth=2, markersize=8, label='Empirical')
plt.semilogy(sample_sizes, hoeffding_bounds, 's-', linewidth=2, markersize=8, label='Hoeffding bound')
plt.xlabel('Sample size n', fontsize=12)
plt.ylabel('P(|mean - μ| ≥ 0.1)', fontsize=12)
plt.title('Hoeffding Inequality: Exponential Decay', fontsize=13)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()
4. Bernstein's Inequality¶
Motivation¶
Hoeffding only uses range \([a, b]\).
Bernstein uses variance for tighter bounds when variance is small!
Bernstein's Inequality (1924)¶
Let \(X_1, \ldots, X_n\) be independent with:
\(\mathbb{E}[X_i] = 0\)
\(|X_i| \leq M\) almost surely
\(\sum_{i=1}^n \mathbb{E}[X_i^2] \leq n\sigma^2\)
Then: $\(P\left(\sum_{i=1}^n X_i \geq t\right) \leq \exp\left(-\frac{t^2/2}{n\sigma^2 + Mt/3}\right)\)$
Simplified Form¶
For bounded \(X_i \in [0, 1]\) with variance \(\sigma^2\): $\(P(|\bar{X} - \mu| \geq \epsilon) \leq 2\exp\left(-\frac{n\epsilon^2}{2(\sigma^2 + \epsilon/3)}\right)\)$
Comparison¶
Small \(\sigma^2\): Bernstein ≈ Hoeffding with better constant
Large \(\sigma^2\): Bernstein much better (adapts to variance)
def bernstein_bound(n, epsilon, sigma2):
    """Two-sided Bernstein bound for the mean of n i.i.d. [0, 1] variables.

    Returns 2 * exp(-n * epsilon**2 / (2 * (sigma2 + epsilon / 3))); the
    bound tightens as the variance sigma2 shrinks.
    """
    denom = 2 * (sigma2 + epsilon / 3)
    return 2 * np.exp(-n * epsilon ** 2 / denom)
# Compare Hoeffding vs Bernstein bounds over a range of deviations for two
# variance levels: Bernstein adapts to the variance, Hoeffding only uses
# the range and so stays the same in both panels.
n = 100
epsilons = np.linspace(0.01, 0.3, 50)

# Moderate variance: Bernoulli(p=0.5) has σ² = p(1-p) = 0.25.
sigma2_low = 0.25
hoeff_low = [hoeffding_bound(n, eps) for eps in epsilons]
bern_low = [bernstein_bound(n, eps, sigma2_low) for eps in epsilons]

# Low variance: Bernoulli(p=0.9) has σ² = 0.9 * 0.1 = 0.09.
sigma2_verylow = 0.09
hoeff_verylow = [hoeffding_bound(n, eps) for eps in epsilons]
bern_verylow = [bernstein_bound(n, eps, sigma2_verylow) for eps in epsilons]

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
axes[0].semilogy(epsilons, hoeff_low, linewidth=2, label='Hoeffding')
axes[0].semilogy(epsilons, bern_low, linewidth=2, label='Bernstein')
axes[0].set_xlabel('ε', fontsize=12)
axes[0].set_ylabel('Bound', fontsize=12)
axes[0].set_title(f'Moderate Variance (σ²={sigma2_low})', fontsize=13)
axes[0].legend(fontsize=11)
axes[0].grid(True, alpha=0.3)
axes[1].semilogy(epsilons, hoeff_verylow, linewidth=2, label='Hoeffding')
axes[1].semilogy(epsilons, bern_verylow, linewidth=2, label='Bernstein')
axes[1].set_xlabel('ε', fontsize=12)
axes[1].set_ylabel('Bound', fontsize=12)
axes[1].set_title(f'Low Variance (σ²={sigma2_verylow})', fontsize=13)
axes[1].legend(fontsize=11)
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Bernstein is tighter when variance is small!")
5. Application to Learning Theory¶
Generalization Bound¶
Training error: \(\hat{R}_n(h) = \frac{1}{n}\sum_{i=1}^n \mathbb{1}[h(x_i) \neq y_i]\)
True error: \(R(h) = \mathbb{E}[\mathbb{1}[h(x) \neq y]]\)
Single Hypothesis¶
By Hoeffding: $\(P(|\hat{R}_n(h) - R(h)| > \epsilon) \leq 2e^{-2n\epsilon^2}\)$
With confidence \(1 - \delta\): $\(R(h) \leq \hat{R}_n(h) + \sqrt{\frac{\log(2/\delta)}{2n}}\)$
Finite Hypothesis Class¶
For \(|\mathcal{H}| = m\) hypotheses, by union bound: $\(P\left(\max_{h \in \mathcal{H}} |\hat{R}_n(h) - R(h)| > \epsilon\right) \leq 2me^{-2n\epsilon^2}\)$
Bound: $\(R(h) \leq \hat{R}_n(h) + \sqrt{\frac{\log(2m/\delta)}{2n}}\)$
# Simulate learning with a finite-hypothesis-class generalization bound.
def true_error(h_id, p_error=0.2):
    """Deterministic stand-in for the true risk of hypothesis h_id.

    Each hypothesis index is assigned a slightly larger error so the
    hypotheses are distinguishable in the simulation.
    """
    return p_error + 0.05 * h_id
def empirical_error(h_id, n, p_error):
    """Monte Carlo estimate of the empirical risk on n samples.

    Each of the n examples is misclassified independently with probability
    p_error; returns the observed misclassification rate. h_id is unused
    and kept only for signature symmetry with true_error.
    """
    mistakes = np.random.binomial(1, p_error, n)
    return mistakes.mean()
# Simulate the finite-hypothesis-class generalization bound:
# with probability >= 1 - delta, R(h) <= R_hat(h) + sqrt(log(2m/delta)/(2n)).
n_hypotheses = 10
n_samples = 100
delta = 0.05

# Uniform-convergence slack from Hoeffding + union bound over m hypotheses.
epsilon_bound = np.sqrt(np.log(2 * n_hypotheses / delta) / (2 * n_samples))

# Simulate true and empirical errors for each hypothesis.
true_errors = [true_error(h, 0.2) for h in range(n_hypotheses)]
emp_errors = [empirical_error(h, n_samples, true_errors[h]) for h in range(n_hypotheses)]
upper_bounds = [emp + epsilon_bound for emp in emp_errors]

# Plot true error, empirical error, and the high-confidence upper bound.
x = np.arange(n_hypotheses)
plt.figure(figsize=(12, 6))
plt.plot(x, true_errors, 'go-', linewidth=2, markersize=8, label='True error')
plt.plot(x, emp_errors, 'bs-', linewidth=2, markersize=8, label='Empirical error')
plt.plot(x, upper_bounds, 'r^-', linewidth=2, markersize=8, label='Upper bound (95% conf.)')
plt.fill_between(x, emp_errors, upper_bounds, alpha=0.2, color='red')
plt.xlabel('Hypothesis index', fontsize=12)
plt.ylabel('Error', fontsize=12)
plt.title(f'Generalization Bound (n={n_samples}, |H|={n_hypotheses})', fontsize=13)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()

print(f"\nBound ε = {epsilon_bound:.4f}")
print(f"All true errors within bounds: {all(t <= u for t, u in zip(true_errors, upper_bounds))}")
Summary¶
Key Inequalities:¶
| Inequality | Bound | Decay | Uses |
|---|---|---|---|
| Markov | \(P(X \geq a) \leq \mathbb{E}[X]/a\) | \(O(1/t)\) | Mean |
| Chebyshev | \(P(|X - \mu| \geq t) \leq \sigma^2/t^2\) | \(O(1/t^2)\) | Variance |
| Hoeffding | \(\leq 2e^{-2n\epsilon^2}\) | Exponential | Range |
| Bernstein | \(\leq 2e^{-n\epsilon^2/(2\sigma^2 + O(\epsilon))}\) | Exponential | Variance |
When to Use:¶
Hoeffding: Bounded variables, unknown variance
Bernstein: Small variance known, want tighter bounds
Chebyshev: Only variance known, need quick estimate
Learning Theory Applications:¶
Advanced Extensions:¶
McDiarmid's inequality: Functions of independent variables
Azuma-Hoeffding: Martingale concentration
Bennett's inequality: Better constants than Bernstein
Sub-Gaussian/sub-exponential: Broader classes
Further Reading:¶
Boucheron et al. (2013) - "Concentration Inequalities"
Vershynin (2018) - "High-Dimensional Probability"
Shalev-Shwartz & Ben-David (2014) - "Understanding Machine Learning"
Next Steps:¶
03_rademacher_complexity.ipynb - Data-dependent bounds
01_introduction_learning_theory.ipynb - PAC learning framework
04_pac_bayes_theory.ipynb - Bayesian generalization