import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Consistent plot styling for all figures, and a fixed seed so every
# Monte Carlo experiment below is reproducible.
sns.set_style('whitegrid')
np.random.seed(42)

1. Why Concentration Inequalities?¶

The Central Question¶

Given i.i.d. random variables \(X_1, \ldots, X_n\), define the sample mean \[S_n = \frac{1}{n} \sum_{i=1}^n X_i\]

How close is \(S_n\) to \(\mathbb{E}[X]\)?

Classical Results¶

Law of Large Numbers: \(S_n \to \mathbb{E}[X]\) as \(n \to \infty\)

Central Limit Theorem: \(\sqrt{n}(S_n - \mathbb{E}[X]) \sim \mathcal{N}(0, \sigma^2)\)

Problem: These are asymptotic! Need finite-sample guarantees.

Concentration Inequalities Provide:¶

\[P(|S_n - \mathbb{E}[X]| > \epsilon) \leq f(n, \epsilon)\]

where \(f\) decays exponentially in \(n\)!

Applications¶

  • Machine learning: Generalization bounds

  • Statistics: Confidence intervals

  • Algorithms: Randomized analysis

  • High-dimensional probability: Blessing of dimensionality

2. Markov and Chebyshev Inequalities¶

Markov’s Inequality¶

For \(X \geq 0\) and \(a > 0\): $\(P(X \geq a) \leq \frac{\mathbb{E}[X]}{a}\)$

Proof: $\(\mathbb{E}[X] = \int_0^\infty x f(x) dx \geq \int_a^\infty x f(x) dx \geq a \int_a^\infty f(x) dx = a \cdot P(X \geq a)\)$

Chebyshev’s Inequality¶

For any random variable \(X\) with variance \(\sigma^2\): $\(P(|X - \mathbb{E}[X]| \geq t) \leq \frac{\sigma^2}{t^2}\)$

Proof: Apply Markov to \((X - \mathbb{E}[X])^2\).

Limitations¶

  • Polynomial decay in \(t\) (not exponential)

  • Loose bounds in practice

  • Don’t exploit independence

# Empirical verification of Chebyshev's inequality for the sample mean
# of Bernoulli(0.5) variables: compare the observed tail probability
# P(|mean - mu| >= eps) with the bound sigma^2 / eps^2.
n_trials = 100
sample_size = 50

# Draw all trials at once: one row of `sample_size` coin flips per trial,
# then reduce each row to its mean (vectorized, no Python loop).
means = np.random.binomial(1, 0.5, (n_trials, sample_size)).mean(axis=1)

# Theoretical parameters: E[mean] = 0.5 and Var(mean) = p(1-p)/n = 0.25/n.
mu = 0.5
sigma2 = 0.25 / sample_size  # Variance of sample mean

# Empirical tail probability vs. the Chebyshev bound over a grid of epsilons.
epsilons = np.linspace(0.05, 0.3, 10)
empirical_prob = [np.mean(np.abs(means - mu) >= eps) for eps in epsilons]
chebyshev_bound = [sigma2 / eps**2 for eps in epsilons]

plt.figure(figsize=(10, 6))
plt.plot(epsilons, empirical_prob, 'o-', linewidth=2, markersize=8, label='Empirical P(|mean - μ| ≥ ε)')
plt.plot(epsilons, chebyshev_bound, 's-', linewidth=2, markersize=8, label='Chebyshev bound: σ²/ε²')
plt.xlabel('ε', fontsize=12)
plt.ylabel('Probability', fontsize=12)
plt.title('Chebyshev Inequality Verification', fontsize=13)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()

print("Chebyshev bound is valid but loose!")

3. Hoeffding’s Inequality¶

Hoeffding’s Lemma¶

If \(X \in [a, b]\) with \(\mathbb{E}[X] = 0\), then: $\(\mathbb{E}[e^{\lambda X}] \leq e^{\lambda^2 (b-a)^2 / 8}\)$

Proof sketch: Use convexity and Taylor expansion.

Hoeffding’s Inequality (1963)¶

Let \(X_1, \ldots, X_n\) be independent with \(X_i \in [a_i, b_i]\). Then: $\(P\left(\sum_{i=1}^n (X_i - \mathbb{E}[X_i]) \geq t\right) \leq \exp\left(-\frac{2t^2}{\sum_{i=1}^n (b_i - a_i)^2}\right)\)$

Bounded Case¶

If \(X_i \in [0, 1]\) and \(\bar{X} = \frac{1}{n}\sum X_i\): $\(P(|\bar{X} - \mathbb{E}[\bar{X}]| \geq \epsilon) \leq 2 \exp(-2n\epsilon^2)\)$

Exponential decay in \(n\)!

def hoeffding_bound(n, epsilon):
    """Two-sided Hoeffding tail bound for i.i.d. variables in [0, 1].

    Returns 2 * exp(-2 n epsilon^2), which upper-bounds
    P(|sample mean - true mean| >= epsilon).
    """
    exponent = -2.0 * n * epsilon ** 2
    return 2.0 * np.exp(exponent)

def empirical_deviation_prob(n, epsilon, p=0.5, n_trials=10000):
    """Monte Carlo estimate of P(|sample mean - p| >= epsilon).

    Each trial draws n Bernoulli(p) variables; the returned value is the
    fraction of `n_trials` trials whose sample mean deviates from p by at
    least epsilon.

    Parameters
    ----------
    n : int           sample size per trial
    epsilon : float   deviation threshold
    p : float         Bernoulli success probability
    n_trials : int    number of Monte Carlo repetitions

    All trials are drawn in a single (n_trials, n) array and reduced along
    axis 1 — vectorized instead of a Python loop over trials.
    """
    samples = np.random.binomial(1, p, (n_trials, n))
    deviations = np.abs(samples.mean(axis=1) - p)
    return np.mean(deviations >= epsilon)

# Compare the empirical deviation probability with the Hoeffding bound as
# the sample size grows; both should decay exponentially in n.
sample_sizes = [10, 50, 100, 500, 1000]
epsilon = 0.1

empirical_probs = []
hoeffding_bounds = []

for n in sample_sizes:
    emp = empirical_deviation_prob(n, epsilon)
    bound = hoeffding_bound(n, epsilon)
    empirical_probs.append(emp)
    hoeffding_bounds.append(bound)
    print(f"n={n:4d}: Empirical={emp:.6f}, Hoeffding={bound:.6f}")

plt.figure(figsize=(10, 6))
# Log scale on the y-axis turns exponential decay into a straight line.
plt.semilogy(sample_sizes, empirical_probs, 'o-', linewidth=2, markersize=8, label='Empirical')
plt.semilogy(sample_sizes, hoeffding_bounds, 's-', linewidth=2, markersize=8, label='Hoeffding bound')
plt.xlabel('Sample size n', fontsize=12)
plt.ylabel(f'P(|mean - μ| ≥ {epsilon})', fontsize=12)
plt.title('Hoeffding Inequality: Exponential Decay', fontsize=13)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()

4. Bernstein’s Inequality¶

Motivation¶

Hoeffding only uses range \([a, b]\).

Bernstein uses variance for tighter bounds when variance is small!

Bernstein’s Inequality (1924)¶

Let \(X_1, \ldots, X_n\) be independent with:

  • \(\mathbb{E}[X_i] = 0\)

  • \(|X_i| \leq M\) almost surely

  • \(\sum_{i=1}^n \mathbb{E}[X_i^2] \leq n\sigma^2\)

Then: $\(P\left(\sum_{i=1}^n X_i \geq t\right) \leq \exp\left(-\frac{t^2/2}{n\sigma^2 + Mt/3}\right)\)$

Simplified Form¶

For bounded \(X_i \in [0, 1]\) with variance \(\sigma^2\): $\(P(|\bar{X} - \mu| \geq \epsilon) \leq 2\exp\left(-\frac{n\epsilon^2}{2(\sigma^2 + \epsilon/3)}\right)\)$

Comparison¶

  • Small \(\sigma^2\): Bernstein ≈ Hoeffding with better constant

  • Large \(\sigma^2\): Bernstein much better (adapts to variance)

def bernstein_bound(n, epsilon, sigma2):
    """Two-sided Bernstein tail bound for i.i.d. variables in [0, 1].

    Returns 2 * exp(-n epsilon^2 / (2 (sigma2 + epsilon/3))); tighter than
    Hoeffding when the variance sigma2 is small.
    """
    denominator = 2 * (sigma2 + epsilon / 3)
    return 2 * np.exp(-n * epsilon**2 / denominator)

# Compare Hoeffding vs Bernstein bounds at two variance levels.
n = 100
epsilons = np.linspace(0.01, 0.3, 50)

# Moderate variance case: Bernoulli worst case, p=0.5 → σ² = 0.25
sigma2_low = 0.25
hoeff_low = [hoeffding_bound(n, eps) for eps in epsilons]
bern_low = [bernstein_bound(n, eps, sigma2_low) for eps in epsilons]

# Low variance case: p=0.9 → σ² = 0.09
sigma2_verylow = 0.09
# Hoeffding depends only on the range [0, 1], not the variance,
# so its curve is identical in both panels — no need to recompute.
hoeff_verylow = hoeff_low
bern_verylow = [bernstein_bound(n, eps, sigma2_verylow) for eps in epsilons]

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

axes[0].semilogy(epsilons, hoeff_low, linewidth=2, label='Hoeffding')
axes[0].semilogy(epsilons, bern_low, linewidth=2, label='Bernstein')
axes[0].set_xlabel('ε', fontsize=12)
axes[0].set_ylabel('Bound', fontsize=12)
axes[0].set_title(f'Moderate Variance (σ²={sigma2_low})', fontsize=13)
axes[0].legend(fontsize=11)
axes[0].grid(True, alpha=0.3)

axes[1].semilogy(epsilons, hoeff_verylow, linewidth=2, label='Hoeffding')
axes[1].semilogy(epsilons, bern_verylow, linewidth=2, label='Bernstein')
axes[1].set_xlabel('ε', fontsize=12)
axes[1].set_ylabel('Bound', fontsize=12)
axes[1].set_title(f'Low Variance (σ²={sigma2_verylow})', fontsize=13)
axes[1].legend(fontsize=11)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Bernstein is tighter when variance is small!")

5. Application to Learning Theory¶

Generalization Bound¶

Training error: \(\hat{R}_n(h) = \frac{1}{n}\sum_{i=1}^n \mathbb{1}[h(x_i) \neq y_i]\)

True error: \(R(h) = \mathbb{E}[\mathbb{1}[h(x) \neq y]]\)

Single Hypothesis¶

By Hoeffding: $\(P(|\hat{R}_n(h) - R(h)| > \epsilon) \leq 2e^{-2n\epsilon^2}\)$

With confidence \(1 - \delta\): $\(R(h) \leq \hat{R}_n(h) + \sqrt{\frac{\log(2/\delta)}{2n}}\)$

Finite Hypothesis Class¶

For \(|\mathcal{H}| = m\) hypotheses, by union bound: $\(P\left(\max_{h \in \mathcal{H}} |\hat{R}_n(h) - R(h)| > \epsilon\right) \leq 2me^{-2n\epsilon^2}\)$

Bound: $\(R(h) \leq \hat{R}_n(h) + \sqrt{\frac{\log(2m/\delta)}{2n}}\)$

# Simulate learning with generalization bound
def true_error(h_id, p_error=0.2):
    """Deterministic 'population' error assigned to hypothesis h_id.

    Each hypothesis gets the base rate plus a 0.05 increment per index,
    so hypotheses with larger indices are strictly worse.
    """
    increment = 0.05 * h_id
    return p_error + increment

def empirical_error(h_id, n, p_error):
    """Draw an empirical error rate: the mean of n Bernoulli(p_error) mistakes.

    NOTE(review): h_id is accepted for signature symmetry with true_error
    but is not used — the draw depends only on n and p_error.
    """
    mistakes = np.random.binomial(1, p_error, n)
    return mistakes.mean()

# Parameters for the finite-hypothesis-class experiment.
n_hypotheses = 10
n_samples = 100
delta = 0.05

# Uniform deviation bound from Hoeffding + union bound:
# ε = sqrt(log(2m/δ) / (2n)) holds simultaneously for all m hypotheses
# with probability at least 1 - δ.
epsilon_bound = np.sqrt(np.log(2 * n_hypotheses / delta) / (2 * n_samples))

# Simulate: each hypothesis has a known true error; its empirical error is
# drawn from n_samples Bernoulli trials, then the uniform bound is attached.
true_errors = [true_error(h, 0.2) for h in range(n_hypotheses)]
emp_errors = [empirical_error(h, n_samples, true_errors[h]) for h in range(n_hypotheses)]
upper_bounds = [emp + epsilon_bound for emp in emp_errors]

# Plot true error, empirical error, and the high-confidence upper bound.
x = np.arange(n_hypotheses)
plt.figure(figsize=(12, 6))
plt.plot(x, true_errors, 'go-', linewidth=2, markersize=8, label='True error')
plt.plot(x, emp_errors, 'bs-', linewidth=2, markersize=8, label='Empirical error')
plt.plot(x, upper_bounds, 'r^-', linewidth=2, markersize=8, label='Upper bound (95% conf.)')
plt.fill_between(x, emp_errors, upper_bounds, alpha=0.2, color='red')
plt.xlabel('Hypothesis index', fontsize=12)
plt.ylabel('Error', fontsize=12)
plt.title(f'Generalization Bound (n={n_samples}, |H|={n_hypotheses})', fontsize=13)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()

print(f"\nBound ε = {epsilon_bound:.4f}")
print(f"All true errors within bounds: {all(t <= u for t, u in zip(true_errors, upper_bounds))}")

Summary¶

Key Inequalities:¶

Inequality

Bound

Decay

Uses

Markov

\(P(X \geq a) \leq \mathbb{E}[X]/a\)

\(O(1/t)\)

Mean

Chebyshev

\(P(|X - \mu| \geq t) \leq \sigma^2/t^2\)

\(O(1/t^2)\)

Variance

Hoeffding

\(\leq 2e^{-2n\epsilon^2}\)

Exponential

Range

Bernstein

\(\leq 2e^{-n\epsilon^2/(2\sigma^2+O(\epsilon))}\)

Exponential

Variance

When to Use:¶

  • Hoeffding: Bounded variables, unknown variance

  • Bernstein: Small variance known, want tighter bounds

  • Chebyshev: Only variance known, need quick estimate

Learning Theory Applications:¶

\[R(h) \leq \hat{R}_n(h) + \underbrace{\sqrt{\frac{\log(|\mathcal{H}|/\delta)}{2n}}}_{\text{complexity penalty}}\]

Advanced Extensions:¶

  • McDiarmid’s inequality: Functions of independent variables

  • Azuma-Hoeffding: Martingale concentration

  • Bennett’s inequality: Better constants than Bernstein

  • Sub-Gaussian/sub-exponential: Broader classes

Further Reading:¶

  • Boucheron et al. (2013) - “Concentration Inequalities”

  • Vershynin (2018) - “High-Dimensional Probability”

  • Shalev-Shwartz & Ben-David (2014) - “Understanding Machine Learning”

Next Steps:¶

  • 03_rademacher_complexity.ipynb - Data-dependent bounds

  • 01_introduction_learning_theory.ipynb - PAC learning framework

  • 04_pac_bayes_theory.ipynb - Bayesian generalization