import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import linalg
# Global plot style and a fixed RNG seed so every experiment below is reproducible.
sns.set_style('whitegrid')
np.random.seed(42)
1. Matrix Bernstein Inequality
Setup
Independent random matrices \(X_1, \ldots, X_n \in \mathbb{R}^{d \times d}\):
\(\mathbb{E}[X_i] = 0\)
\(\|X_i\| \leq L\) almost surely
Variance: \(\sigma^2 = \left\|\sum_{i=1}^n \mathbb{E}[X_i^2]\right\|\)
Theorem (Matrix Bernstein): \(\mathbb{P}\left(\left\|\sum_{i=1}^n X_i\right\| \geq t\right) \leq 2d \exp\left(\frac{-t^2/2}{\sigma^2 + Lt/3}\right)\)
def matrix_bernstein_experiment(d=10, n=100, n_trials=1000):
    """Empirically verify the Matrix Bernstein inequality.

    Each trial draws ``n`` independent symmetric, zero-trace random matrices
    (so E[X_i] = 0), and records the spectral norm of their sum. The bound's
    parameters are estimated over ALL trials: ``L`` as the largest observed
    summand spectral norm (proxy for the a.s. bound ||X_i|| <= L) and
    ``sigma`` via sigma^2 = || sum_i E[X_i^2] ||, averaging X_i @ X_i across
    trials.

    Parameters
    ----------
    d : int
        Matrix dimension.
    n : int
        Number of summands per trial.
    n_trials : int
        Number of independent trials.

    Returns
    -------
    (norms, L, sigma) : (np.ndarray of shape (n_trials,), float, float)
    """
    norms = []
    L = 0.0                          # running max of ||X_i|| over all trials
    variance_sum = np.zeros((d, d))  # running sum of X_i @ X_i (per-trial sums)
    for _ in range(n_trials):
        # Generate random symmetric, zero-mean matrices.
        matrices = []
        for _ in range(n):
            A = np.random.randn(d, d) / np.sqrt(d)
            A = (A + A.T) / 2                     # symmetric
            A = A - A.trace() / d * np.eye(d)     # zero trace => E[X] = 0
            matrices.append(A)
        S = np.sum(matrices, axis=0)
        norms.append(np.linalg.norm(S, ord=2))
        # BUG FIX: the original computed L and the variance proxy from the
        # matrices of the LAST trial only (loop-variable leakage after the
        # loop); accumulate them over every trial instead.
        L = max(L, max(np.linalg.norm(M, ord=2) for M in matrices))
        variance_sum += sum(M @ M for M in matrices)
    norms = np.array(norms)
    # sigma^2 = || sum_i E[X_i^2] ||, with the expectation estimated by the
    # average over trials.
    sigma_sq = np.linalg.norm(variance_sum / n_trials, ord=2)
    return norms, L, np.sqrt(sigma_sq)
# Run the Bernstein experiment and compare the empirical tail of ||S|| against
# the theoretical bound  P(||S|| >= t) <= 2d * exp(-t^2/2 / (sigma^2 + L*t/3)).
d = 10  # dimension, shared by the experiment call and the bound's 2d factor
norms, L, sigma = matrix_bernstein_experiment(d=d, n=100, n_trials=2000)
t_vals = np.linspace(0, norms.max(), 100)
prob_bound = 2 * d * np.exp(-t_vals**2 / (2 * (sigma**2 + L * t_vals / 3)))
# Empirical exceedance probability P(||S|| >= t) at each threshold.
empirical_prob = np.array([np.mean(norms >= t) for t in t_vals])
plt.figure(figsize=(10, 6))
plt.semilogy(t_vals, empirical_prob, 'b-', linewidth=2, label='Empirical')
plt.semilogy(t_vals, prob_bound, 'r--', linewidth=2, label='Bound')
plt.xlabel('t', fontsize=11)
plt.ylabel('P(||S|| ≥ t)', fontsize=11)  # repaired mojibake 'β₯' -> '≥'
plt.title('Matrix Bernstein Inequality', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print(f"L = {L:.3f}, σ = {sigma:.3f}")  # repaired mojibake 'Ο' -> 'σ'
2. Matrix Chernoff Bound
For PSD matrices
\(A_1, \ldots, A_n \succeq 0\) independent with \(A_i \preceq R \cdot I\).
Upper Tail: \(\mathbb{P}\left(\lambda_{\max}\left(\sum_i A_i\right) \geq (1+\delta)\mu_{\max}\right) \leq d \left(\frac{e^{\delta}}{(1+\delta)^{1+\delta}}\right)^{\mu_{\max}/R}\), where \(\mu_{\max} = \lambda_{\max}\left(\mathbb{E}\left[\sum_i A_i\right]\right)\).
def matrix_chernoff_experiment(d=5, n=50, R=1.0, n_trials=1000):
    """Empirically probe the Matrix Chernoff bound for PSD sums.

    Each trial draws ``n`` independent random PSD matrices whose eigenvalues
    are uniform on [0, R] (so every summand satisfies A_i <= R*I) in a random
    orthogonal eigenbasis, and records the top eigenvalue of their sum.

    Returns
    -------
    (max_eigvals, mu_max, R) : (np.ndarray of shape (n_trials,), float, float)
        ``mu_max = n*R/2`` approximates lambda_max(E[sum A_i]), since the
        mean eigenvalue of each uniform-spectrum summand is R/2.
    """
    def _random_psd():
        # Draw order matters for reproducibility: spectrum first, then basis.
        spectrum = np.random.uniform(0, R, d)
        basis = linalg.qr(np.random.randn(d, d))[0]
        return basis @ np.diag(spectrum) @ basis.T

    top_eigs = []
    for _ in range(n_trials):
        summands = [_random_psd() for _ in range(n)]
        total = np.sum(summands, axis=0)
        top_eigs.append(np.linalg.eigvalsh(total)[-1])
    # Approximate expected top eigenvalue of the sum (uniform spectra).
    mu_max = n * R / 2
    return np.array(top_eigs), mu_max, R
# Run the Chernoff experiment and plot the top-eigenvalue distribution.
d = 5  # dimension, reused as the prefactor in the Chernoff bound below
max_eigs, mu_max, R = matrix_chernoff_experiment(d=d, n=50, R=1.0, n_trials=1500)
plt.figure(figsize=(10, 6))
plt.hist(max_eigs, bins=30, density=True, alpha=0.7, edgecolor='black')
# Repaired mojibake in labels: 'Ξ»' -> 'λ', 'β' -> '≈', 'Ξ£' -> 'Σ'.
plt.axvline(mu_max, color='r', linestyle='--', linewidth=2, label=f'E[λ_max] ≈ {mu_max:.2f}')
plt.xlabel('λ_max(Σ A_i)', fontsize=11)
plt.ylabel('Density', fontsize=11)
plt.title('Matrix Chernoff: Max Eigenvalue Distribution', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Tail concentration: empirical exceedance probability versus the Chernoff
# upper-tail bound  d * (e^delta / (1+delta)^(1+delta))^(mu_max / R).
delta_vals = np.linspace(0.1, 1, 20)
empirical = []
theoretical = []
for delta in delta_vals:
    threshold = (1 + delta) * mu_max
    empirical.append(np.mean(max_eigs >= threshold))
    # BUG FIX: the dimension prefactor was hard-coded as 5; use d so the
    # bound stays correct if the experiment dimension changes.
    theoretical.append(d * (np.exp(delta) / (1 + delta)**(1 + delta))**(mu_max / R))
plt.figure(figsize=(10, 6))
plt.semilogy(delta_vals, empirical, 'bo-', label='Empirical', markersize=6)
plt.semilogy(delta_vals, theoretical, 'r--', linewidth=2, label='Chernoff Bound')
plt.xlabel('δ', fontsize=11)  # repaired mojibake 'Ξ΄' -> 'δ'
plt.ylabel('P(λ_max ≥ (1+δ)μ)', fontsize=11)  # repaired mojibake
plt.title('Matrix Chernoff Tail Bound', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
3. Random Matrix Theory
Wigner Semicircle Law
For a GOE matrix \(W \in \mathbb{R}^{n \times n}\), the eigenvalue density converges to the semicircle law \(\rho(x) = \frac{1}{2\pi}\sqrt{4 - x^2}\) supported on \([-2, 2]\):
def wigner_semicircle(n=1000):
    """Compare the spectrum of a GOE matrix with the Wigner semicircle law.

    Builds W = (A + A.T)/sqrt(2) with A_ij ~ N(0, 1/n), i.e. off-diagonal
    variance 1/n and diagonal variance 2/n (standard GOE normalization), so
    the empirical spectrum converges to the semicircle density on [-2, 2].

    Parameters
    ----------
    n : int
        Matrix dimension.
    """
    A = np.random.randn(n, n) / np.sqrt(n)
    # BUG FIX: the original symmetrized with (A + A.T)/2, which gives
    # off-diagonal variance 1/(2n) and a spectrum supported on
    # [-sqrt(2), sqrt(2)] — mismatching the plotted density on [-2, 2].
    # Dividing by sqrt(2) restores the GOE scaling.
    W = (A + A.T) / np.sqrt(2)
    eigvals = np.linalg.eigvalsh(W)
    # Theoretical semicircle density rho(x) = sqrt(4 - x^2) / (2*pi).
    x = np.linspace(-2, 2, 200)
    rho = np.sqrt(np.maximum(4 - x**2, 0)) / (2 * np.pi)
    plt.figure(figsize=(10, 6))
    plt.hist(eigvals, bins=50, density=True, alpha=0.6, edgecolor='black', label='Empirical')
    plt.plot(x, rho, 'r-', linewidth=2, label='Semicircle Law')
    plt.xlabel('λ', fontsize=11)  # repaired mojibake 'Ξ»' -> 'λ'
    plt.ylabel('Density', fontsize=11)
    plt.title(f'Wigner Semicircle Law (n={n})', fontsize=12)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
wigner_semicircle(n=1000)
4. Marchenko-Pastur Law
What and Why
The Marchenko-Pastur law describes the limiting eigenvalue distribution of sample covariance matrices when both the number of samples \(n\) and the dimension \(p\) grow together with a fixed ratio \(\gamma = p/n\). This is the foundational result of high-dimensional statistics — in modern ML, we routinely work with datasets where \(p\) is comparable to or larger than \(n\), making classical fixed-\(p\) asymptotics inadequate.
Mathematical Statement
For a data matrix \(X \in \mathbb{R}^{n \times p}\) with i.i.d. standard entries, the empirical eigenvalue distribution of \(\frac{1}{n}X^T X\) converges to the MP density
\(\rho(x) = \frac{\sqrt{(\lambda_+ - x)(x - \lambda_-)}}{2\pi \gamma x}\),
supported on \([\lambda_-, \lambda_+]\) where \(\lambda_\pm = (1 \pm \sqrt{\gamma})^2\). When \(\gamma > 1\), there is also a point mass \((1 - 1/\gamma)\) at zero, reflecting the null space of the sample covariance.
Connection to ML
Understanding the MP law is critical for PCA in high dimensions — eigenvalues of the sample covariance matrix can be far from their population counterparts when \(p/n\) is not negligible. This informs when principal components are spurious (driven by noise rather than signal) and motivates techniques like shrinkage estimation and spiked covariance models used in random matrix-based denoising of neural network weight matrices.
def marchenko_pastur(n=500, p=1000, n_trials=20):
    """Verify the Marchenko-Pastur law for sample covariance spectra.

    Draws X (n x p) with entries N(0, 1/n), pools the eigenvalues of
    C = X^T X over trials, and overlays the MP density for gamma = p/n.
    Note: with the defaults gamma = 2 > 1, so p - n eigenvalues per trial
    are exactly zero (the point mass 1 - 1/gamma at the origin); the
    continuous MP density then integrates to 1/gamma, consistent with the
    density-normalized histogram that includes the zero spike.

    Parameters
    ----------
    n : int
        Number of samples.
    p : int
        Dimension.
    n_trials : int
        Number of independent data matrices pooled into the histogram.
    """
    gamma = p / n
    all_eigvals = []
    for _ in range(n_trials):
        X = np.random.randn(n, p) / np.sqrt(n)
        C = X.T @ X  # scaled sample covariance (1/n) X^T X
        all_eigvals.extend(np.linalg.eigvalsh(C))
    # MP density on [lambda_-, lambda_+] with lambda_± = (1 ± sqrt(gamma))^2.
    lambda_minus = (1 - np.sqrt(gamma))**2
    lambda_plus = (1 + np.sqrt(gamma))**2
    x = np.linspace(lambda_minus, lambda_plus, 200)
    rho = np.sqrt(np.maximum((lambda_plus - x) * (x - lambda_minus), 0)) / (2 * np.pi * gamma * x)
    plt.figure(figsize=(10, 6))
    plt.hist(all_eigvals, bins=50, density=True, alpha=0.6, edgecolor='black', label='Empirical')
    plt.plot(x, rho, 'r-', linewidth=2, label='MP Law')
    plt.axvline(lambda_minus, color='g', linestyle='--', alpha=0.7)
    plt.axvline(lambda_plus, color='g', linestyle='--', alpha=0.7)
    plt.xlabel('λ', fontsize=11)  # repaired mojibake 'Ξ»' -> 'λ'
    plt.ylabel('Density', fontsize=11)
    plt.title(f'Marchenko-Pastur Law (p/n={gamma:.2f})', fontsize=12)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.xlim(0, lambda_plus * 1.2)
    plt.show()
marchenko_pastur(n=500, p=1000)
5. Application: Compressed Sensing
What and Why
Compressed sensing demonstrates the practical power of matrix concentration inequalities by showing that sparse signals can be exactly recovered from far fewer measurements than the ambient dimension. The key theoretical ingredient is the Restricted Isometry Property (RIP): a measurement matrix \(A \in \mathbb{R}^{m \times n}\) satisfies RIP of order \(s\) if for all \(s\)-sparse vectors \(x\), \((1 - \delta_s)\|x\|_2^2 \leq \|Ax\|_2^2 \leq (1 + \delta_s)\|x\|_2^2\).
Matrix concentration bounds (especially Matrix Bernstein) prove that random Gaussian matrices satisfy RIP with \(m = O(s \log(n/s))\) measurements — a dramatic reduction from the full dimension \(n\).
How It Works
The recovery algorithm solves the basis pursuit problem: minimize \(\|z\|_1\) subject to \(Az = y\), where \(y = Ax\) are the compressed measurements. The \(\ell_1\) norm promotes sparsity, and under the RIP guarantee, this convex relaxation recovers the original sparse signal exactly. The code below uses scipy.optimize.minimize with SLSQP to solve this constrained optimization.
Connection to ML
Compressed sensing underpins sparse signal recovery in MRI imaging, radar, and sensor networks. In ML, the same principles appear in sparse model selection (Lasso), random feature approximations for kernel methods, and sketching algorithms that compress large datasets or gradients for distributed training.
def compressed_sensing_demo(n=200, m=100, s=10):
    """Recover an s-sparse signal from m < n random measurements (RIP demo).

    Solves the basis pursuit problem  min ||z||_1  s.t.  A z = y  with SLSQP,
    plots the original vs. recovered signal, and prints the relative error.

    Parameters
    ----------
    n : int
        Ambient signal dimension.
    m : int
        Number of measurements (m < n).
    s : int
        Sparsity of the true signal.
    """
    # Gaussian measurement matrix; 1/sqrt(m) scaling gives unit-norm columns
    # in expectation, the standard RIP normalization.
    A = np.random.randn(m, n) / np.sqrt(m)
    # Random s-sparse ground-truth signal.
    x = np.zeros(n)
    idx = np.random.choice(n, s, replace=False)
    x[idx] = np.random.randn(s)
    # Compressed measurements.
    y = A @ x
    # L1 recovery via basis pursuit.
    from scipy.optimize import minimize
    def objective(z):
        return np.linalg.norm(z, ord=1)
    # BUG FIX: the original imposed the scalar equality ||Az - y||^2 = 0,
    # whose gradient vanishes at any feasible point and stalls SLSQP.
    # SLSQP accepts vector-valued equality constraints, so impose
    # A z - y = 0 componentwise instead.
    def constraint(z):
        return A @ z - y
    cons = {'type': 'eq', 'fun': constraint}
    result = minimize(objective, np.zeros(n), constraints=cons, method='SLSQP')
    x_recovered = result.x
    # Side-by-side stem plots of the true and recovered signals.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    axes[0].stem(x, basefmt=' ', linefmt='b-', markerfmt='bo', label='Original')
    axes[0].set_xlabel('Index', fontsize=11)
    axes[0].set_ylabel('Value', fontsize=11)
    axes[0].set_title(f'Original (s={s})', fontsize=12)
    axes[0].grid(True, alpha=0.3)
    axes[1].stem(x_recovered, basefmt=' ', linefmt='r-', markerfmt='ro', label='Recovered')
    axes[1].set_xlabel('Index', fontsize=11)
    axes[1].set_ylabel('Value', fontsize=11)
    axes[1].set_title(f'Recovered (m={m} measurements)', fontsize=12)
    axes[1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    # Relative l2 error; a value near 0 indicates exact recovery.
    error = np.linalg.norm(x - x_recovered) / np.linalg.norm(x)
    print(f"Recovery error: {error:.4f}")
compressed_sensing_demo(n=200, m=100, s=10)
Summary
Matrix Bernstein: \(\mathbb{P}\left(\left\|\sum_i X_i\right\| \geq t\right) \leq 2d\, \exp\left(\frac{-t^2/2}{\sigma^2 + Lt/3}\right)\) for sums of bounded, zero-mean random matrices.
Matrix Chernoff:
Concentration for sums of PSD matrices.
Random Matrix Theory:
Wigner semicircle (GOE)
Marchenko-Pastur (covariance)
Applications:
Compressed sensing (RIP)
Dimensionality reduction
Random features
Spectral clustering
Next Steps:
07_concentration_inequalities.ipynb - Scalar version
Study free probability
Explore matrix completion