import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import linalg

sns.set_style('whitegrid')
np.random.seed(42)

1. Matrix Bernstein Inequality

Setup

Independent random matrices \(X_1, \ldots, X_n \in \mathbb{R}^{d \times d}\):

  • \(\mathbb{E}[X_i] = 0\)

  • \(\|X_i\| \leq L\) almost surely

Variance

\[\sigma^2 = \left\|\sum_{i=1}^n \mathbb{E}[X_i^2]\right\|\]

Theorem (Matrix Bernstein)

\[P\left(\left\|\sum_{i=1}^n X_i\right\| \geq t\right) \leq 2d \cdot \exp\left(-\frac{t^2/2}{\sigma^2 + Lt/3}\right)\]
def matrix_bernstein_experiment(d=10, n=100, n_trials=1000):
    """Empirically verify the Matrix Bernstein inequality.

    Each trial draws n i.i.d. zero-mean symmetric random matrices, sums
    them, and records the spectral norm of the sum. The parameters L
    (almost-sure bound on ||X_i||) and sigma (matrix variance proxy) are
    estimated from the generated samples.

    Parameters
    ----------
    d : int
        Matrix dimension.
    n : int
        Number of matrices summed per trial.
    n_trials : int
        Number of independent trials.

    Returns
    -------
    norms : np.ndarray, shape (n_trials,)
        Spectral norm of the matrix sum for each trial.
    L : float
        Largest observed spectral norm of any single summand, across all
        trials (estimate of the a.s. bound L).
    sigma : float
        Estimate of sigma = ||sum_i E[X_i^2]||^(1/2), with E[X_i^2]
        estimated by averaging over all trials.
    """
    norms = []
    # BUGFIX: the original computed L and sigma from the loop variable
    # `matrices` AFTER the trial loop, i.e. from the last trial only
    # (loop-variable leakage). Accumulate over all trials instead.
    L = 0.0
    variance_accum = np.zeros((d, d))  # running sum of X^2 over all draws

    for _ in range(n_trials):
        # Generate zero-mean random symmetric matrices.
        matrices = []
        for _ in range(n):
            A = np.random.randn(d, d) / np.sqrt(d)
            A = (A + A.T) / 2  # symmetrize
            A = A - A.trace() / d * np.eye(d)  # remove trace so E[X] = 0
            matrices.append(A)
            L = max(L, np.linalg.norm(A, ord=2))
            variance_accum += A @ A

        # Spectral norm of the sum for this trial.
        S = np.sum(matrices, axis=0)
        norms.append(np.linalg.norm(S, ord=2))

    norms = np.array(norms)

    # sigma^2 = ||sum_{i=1}^n E[X_i^2]||, estimated by averaging the
    # accumulated X^2 over the n_trials trials (leaving a sum over n terms).
    sigma_sq = np.linalg.norm(variance_accum / n_trials, ord=2)

    return norms, L, np.sqrt(sigma_sq)

# Run the experiment and compare the empirical tail of ||sum X_i|| against
# the Matrix Bernstein bound 2 d exp(-(t^2/2) / (sigma^2 + L t / 3)).
norms, L, sigma = matrix_bernstein_experiment(d=10, n=100, n_trials=2000)

# Theoretical tail bound on a grid of thresholds t.
t_vals = np.linspace(0, norms.max(), 100)
d = 10  # matrix dimension used in the experiment above
prob_bound = 2 * d * np.exp(-t_vals**2 / (2 * (sigma**2 + L * t_vals / 3)))

# Empirical tail probabilities P(||S|| >= t).
empirical_prob = np.array([np.mean(norms >= t) for t in t_vals])

plt.figure(figsize=(10, 6))
plt.semilogy(t_vals, empirical_prob, 'b-', linewidth=2, label='Empirical')
plt.semilogy(t_vals, prob_bound, 'r--', linewidth=2, label='Bound')
plt.xlabel('t', fontsize=11)
plt.ylabel('P(||S|| ≥ t)', fontsize=11)  # fixed mojibake in axis label
plt.title('Matrix Bernstein Inequality', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"L = {L:.3f}, σ = {sigma:.3f}")  # fixed mojibake in output string

2. Matrix Chernoff Bound

For PSD matrices

\(A_1, \ldots, A_n \succeq 0\) independent with \(A_i \preceq R \cdot I\).

\[\mu_{\min} I \preceq \mathbb{E}\left[\sum_{i=1}^n A_i\right] \preceq \mu_{\max} I\]

Upper Tail

\[P\left(\lambda_{\max}\left(\sum_{i=1}^n A_i\right) \geq (1+\delta)\mu_{\max}\right) \leq d \cdot \left(\frac{e^\delta}{(1+\delta)^{1+\delta}}\right)^{\mu_{\max}/R}\]
def matrix_chernoff_experiment(d=5, n=50, R=1.0, n_trials=1000):
    """Empirically study the Matrix Chernoff bound for sums of PSD matrices.

    Each summand is a random PSD matrix Q diag(ev) Q^T with eigenvalues
    ev ~ Uniform(0, R) (so 0 <= A_i <= R*I) and Q the orthogonal factor of
    a QR decomposition of a Gaussian matrix.

    Returns
    -------
    max_eigvals : np.ndarray, shape (n_trials,)
        Largest eigenvalue of sum_i A_i for each trial.
    mu_max : float
        n * R / 2, the top eigenvalue of E[sum_i A_i] for this construction.
    R : float
        The per-matrix eigenvalue bound, echoed back for convenience.
    """
    top_eigs = []

    for _ in range(n_trials):
        # Draw n independent PSD summands with spectrum bounded by R.
        summands = []
        for _ in range(n):
            spectrum = np.random.uniform(0, R, d)
            basis = linalg.qr(np.random.randn(d, d))[0]
            summands.append(basis @ np.diag(spectrum) @ basis.T)

        total = np.sum(summands, axis=0)
        # eigvalsh returns eigenvalues in ascending order; take the largest.
        top_eigs.append(np.linalg.eigvalsh(total)[-1])

    # E[A_i] = (R/2) I because the eigenbasis is isotropic and the spectrum
    # is Uniform(0, R); hence mu_max = n * R / 2.
    mu_max = n * R / 2

    return np.array(top_eigs), mu_max, R

# Run the Chernoff experiment and visualize the top-eigenvalue distribution.
max_eigs, mu_max, R = matrix_chernoff_experiment(d=5, n=50, R=1.0, n_trials=1500)

# Histogram of lambda_max of the PSD sum across trials.
plt.figure(figsize=(10, 6))
plt.hist(max_eigs, bins=30, density=True, alpha=0.7, edgecolor='black')
plt.axvline(mu_max, color='r', linestyle='--', linewidth=2, label=f'E[λ_max] ≈ {mu_max:.2f}')
plt.xlabel('λ_max(Σ A_i)', fontsize=11)  # fixed mojibake in label
plt.ylabel('Density', fontsize=11)
plt.title('Matrix Chernoff: Max Eigenvalue Distribution', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Tail probabilities: empirical vs. the Matrix Chernoff upper bound.
d_chernoff = 5  # dimension used in the experiment above (was a magic "5")
delta_vals = np.linspace(0.1, 1, 20)
empirical = []
theoretical = []

for delta in delta_vals:
    threshold = (1 + delta) * mu_max
    empirical.append(np.mean(max_eigs >= threshold))

    # Bound: d * (e^delta / (1+delta)^(1+delta))^(mu_max / R)
    bound = d_chernoff * (np.exp(delta) / (1 + delta)**(1 + delta))**(mu_max / R)
    theoretical.append(bound)

plt.figure(figsize=(10, 6))
plt.semilogy(delta_vals, empirical, 'bo-', label='Empirical', markersize=6)
plt.semilogy(delta_vals, theoretical, 'r--', linewidth=2, label='Chernoff Bound')
plt.xlabel('δ', fontsize=11)  # fixed mojibake in label
plt.ylabel('P(λ_max ≥ (1+δ)μ)', fontsize=11)  # fixed mojibake in label
plt.title('Matrix Chernoff Tail Bound', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

3. Random Matrix Theory

Wigner Semicircle Law

For GOE matrix \(W \in \mathbb{R}^{n \times n}\), eigenvalue density:

\[\rho(\lambda) = \frac{1}{2\pi} \sqrt{4 - \lambda^2}, \quad |\lambda| \leq 2\]
def wigner_semicircle(n=1000):
    """Verify the Wigner semicircle law for a GOE-like random matrix.

    Builds a symmetric random matrix whose off-diagonal entries have
    variance 1/n, so the empirical spectrum converges to the semicircle
    density on [-2, 2], and plots the eigenvalue histogram against it.

    Parameters
    ----------
    n : int
        Matrix dimension.
    """
    # BUGFIX: the original symmetrized with (W + W.T) / 2, giving entry
    # variance 1/(2n) and eigenvalue support [-sqrt(2), sqrt(2)] -- which
    # does NOT match the semicircle law on [-2, 2] plotted below. Dividing
    # by sqrt(2) yields off-diagonal entry variance exactly 1/n.
    W = np.random.randn(n, n) / np.sqrt(n)
    W = (W + W.T) / np.sqrt(2)

    eigvals = np.linalg.eigvalsh(W)

    # Limiting density: rho(x) = sqrt(4 - x^2) / (2 pi) on [-2, 2].
    x = np.linspace(-2, 2, 200)
    rho = np.sqrt(np.maximum(4 - x**2, 0)) / (2 * np.pi)

    plt.figure(figsize=(10, 6))
    plt.hist(eigvals, bins=50, density=True, alpha=0.6, edgecolor='black', label='Empirical')
    plt.plot(x, rho, 'r-', linewidth=2, label='Semicircle Law')
    plt.xlabel('λ', fontsize=11)  # fixed mojibake in axis label
    plt.ylabel('Density', fontsize=11)
    plt.title(f'Wigner Semicircle Law (n={n})', fontsize=12)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

wigner_semicircle(n=1000)

4. Marchenko-Pastur Law

What and Why

The Marchenko-Pastur law describes the limiting eigenvalue distribution of sample covariance matrices when both the number of samples \(n\) and the dimension \(p\) grow together with a fixed ratio \(\gamma = p/n\). This is the foundational result of high-dimensional statistics – in modern ML, we routinely work with datasets where \(p\) is comparable to or larger than \(n\), making classical fixed-\(p\) asymptotics inadequate.

Mathematical Statement

For a data matrix \(X \in \mathbb{R}^{n \times p}\) with i.i.d. standard entries, the empirical eigenvalue distribution of \(\frac{1}{n}X^T X\) converges to the MP density:

\[\rho(\lambda) = \frac{1}{2\pi \gamma \lambda}\sqrt{(\lambda_+ - \lambda)(\lambda - \lambda_-)}\]

supported on \([\lambda_-, \lambda_+]\) where \(\lambda_\pm = (1 \pm \sqrt{\gamma})^2\). When \(\gamma > 1\), there is also a point mass \((1 - 1/\gamma)\) at zero, reflecting the null space of the sample covariance.

Connection to ML

Understanding the MP law is critical for PCA in high dimensions – eigenvalues of the sample covariance matrix can be far from their population counterparts when \(p/n\) is not negligible. This informs when principal components are spurious (driven by noise rather than signal) and motivates techniques like shrinkage estimation and spiked covariance models used in random matrix-based denoising of neural network weight matrices.

def marchenko_pastur(n=500, p=1000, n_trials=20):
    """Verify the Marchenko-Pastur law for sample covariance matrices.

    Pools the eigenvalues of C = X^T X over n_trials draws, where X has
    i.i.d. N(0, 1/n) entries (so C is the 1/n-scaled Gram matrix), and
    overlays the MP density with aspect ratio gamma = p / n.

    Note: for gamma > 1 the spectrum carries an extra point mass of
    1 - 1/gamma at zero (the null space of C), which shows up as a spike
    at the left edge of the histogram.

    Parameters
    ----------
    n : int
        Number of samples (rows of X).
    p : int
        Dimension (columns of X).
    n_trials : int
        Number of independent data matrices whose spectra are pooled.
    """
    gamma = p / n

    all_eigvals = []
    for _ in range(n_trials):
        X = np.random.randn(n, p) / np.sqrt(n)
        C = X.T @ X  # sample covariance (the 1/n is folded into X)
        all_eigvals.extend(np.linalg.eigvalsh(C))

    # Support edges of the MP density: lambda_pm = (1 +/- sqrt(gamma))^2.
    lambda_minus = (1 - np.sqrt(gamma))**2
    lambda_plus = (1 + np.sqrt(gamma))**2

    x = np.linspace(lambda_minus, lambda_plus, 200)
    rho = np.sqrt(np.maximum((lambda_plus - x) * (x - lambda_minus), 0)) / (2 * np.pi * gamma * x)

    plt.figure(figsize=(10, 6))
    plt.hist(all_eigvals, bins=50, density=True, alpha=0.6, edgecolor='black', label='Empirical')
    plt.plot(x, rho, 'r-', linewidth=2, label='MP Law')
    plt.axvline(lambda_minus, color='g', linestyle='--', alpha=0.7)
    plt.axvline(lambda_plus, color='g', linestyle='--', alpha=0.7)
    plt.xlabel('λ', fontsize=11)  # fixed mojibake in axis label
    plt.ylabel('Density', fontsize=11)
    plt.title(f'Marchenko-Pastur Law (p/n={gamma:.2f})', fontsize=12)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.xlim(0, lambda_plus * 1.2)
    plt.show()

marchenko_pastur(n=500, p=1000)

5. Application: Compressed Sensing

What and Why

Compressed sensing demonstrates the practical power of matrix concentration inequalities by showing that sparse signals can be exactly recovered from far fewer measurements than the ambient dimension. The key theoretical ingredient is the Restricted Isometry Property (RIP): a measurement matrix \(A \in \mathbb{R}^{m \times n}\) satisfies RIP of order \(s\) if for all \(s\)-sparse vectors \(x\),

\[(1 - \delta_s)\|x\|^2 \leq \|Ax\|^2 \leq (1 + \delta_s)\|x\|^2\]

Matrix concentration bounds (especially Matrix Bernstein) prove that random Gaussian matrices satisfy RIP with \(m = O(s \log(n/s))\) measurements – a dramatic reduction from the full dimension \(n\).

How It Works

The recovery algorithm solves the basis pursuit problem: minimize \(\|z\|_1\) subject to \(Az = y\), where \(y = Ax\) are the compressed measurements. The \(\ell_1\) norm promotes sparsity, and under the RIP guarantee, this convex relaxation recovers the original sparse signal exactly. The code below uses scipy.optimize.minimize with SLSQP to solve this constrained optimization.

Connection to ML

Compressed sensing underpins sparse signal recovery in MRI imaging, radar, and sensor networks. In ML, the same principles appear in sparse model selection (Lasso), random feature approximations for kernel methods, and sketching algorithms that compress large datasets or gradients for distributed training.

def compressed_sensing_demo(n=200, m=100, s=10):
    """Recover an s-sparse signal from m < n random linear measurements.

    Draws a Gaussian measurement matrix A (which satisfies RIP with high
    probability), measures y = A x for a random s-sparse signal x, then
    solves the basis pursuit problem  min ||z||_1  s.t.  A z = y  with
    SLSQP and plots the original vs. recovered signal.

    Parameters
    ----------
    n : int
        Ambient signal dimension.
    m : int
        Number of measurements.
    s : int
        Sparsity level (number of nonzero entries of x).
    """
    # Random Gaussian measurement matrix, scaled so columns have unit
    # expected norm.
    A = np.random.randn(m, n) / np.sqrt(m)

    # Random s-sparse signal on a uniformly chosen support.
    x = np.zeros(n)
    idx = np.random.choice(n, s, replace=False)
    x[idx] = np.random.randn(s)

    # Compressed measurements.
    y = A @ x

    # L1 recovery (basis pursuit).
    from scipy.optimize import minimize

    def objective(z):
        return np.linalg.norm(z, ord=1)

    # BUGFIX: the original used the scalar equality constraint
    # ||Az - y||^2 == 0, whose gradient vanishes at every feasible point,
    # making SLSQP's constraint linearization degenerate. Use the
    # vector-valued linear constraint A z - y == 0 instead (SLSQP accepts
    # vector 'fun' for equality constraints).
    cons = {'type': 'eq', 'fun': lambda z: A @ z - y}
    result = minimize(objective, np.zeros(n), constraints=cons, method='SLSQP')
    x_recovered = result.x

    # Plot original vs. recovered signal.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    axes[0].stem(x, basefmt=' ', linefmt='b-', markerfmt='bo', label='Original')
    axes[0].set_xlabel('Index', fontsize=11)
    axes[0].set_ylabel('Value', fontsize=11)
    axes[0].set_title(f'Original (s={s})', fontsize=12)
    axes[0].grid(True, alpha=0.3)

    axes[1].stem(x_recovered, basefmt=' ', linefmt='r-', markerfmt='ro', label='Recovered')
    axes[1].set_xlabel('Index', fontsize=11)
    axes[1].set_ylabel('Value', fontsize=11)
    axes[1].set_title(f'Recovered (m={m} measurements)', fontsize=12)
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Relative L2 recovery error; near zero indicates exact recovery.
    error = np.linalg.norm(x - x_recovered) / np.linalg.norm(x)
    print(f"Recovery error: {error:.4f}")

compressed_sensing_demo(n=200, m=100, s=10)

Summary

Matrix Bernstein:

\[P\left(\left\|\sum X_i\right\| \geq t\right) \leq 2d \exp\left(-\frac{t^2/2}{\sigma^2 + Lt/3}\right)\]

Matrix Chernoff:

Concentration for sums of PSD matrices.

Random Matrix Theory:

  • Wigner semicircle (GOE)

  • Marchenko-Pastur (covariance)

Applications:

  • Compressed sensing (RIP)

  • Dimensionality reduction

  • Random features

  • Spectral clustering

Next Steps:

  • 07_concentration_inequalities.ipynb - Scalar version

  • Study free probability

  • Explore matrix completion