import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import pdist, squareform
# Use seaborn's whitegrid theme for every plot in this notebook.
sns.set_style('whitegrid')
# Seed the global NumPy RNG so the demos below are reproducible.
np.random.seed(42)
1. The Johnson-Lindenstrauss Lemma¶
Statement¶
For any \(0 < \epsilon < 1\) and integer \(n\), let \(k \geq \frac{8\ln n}{\epsilon^2}\).
For any set \(X\) of \(n\) points in \(\mathbb{R}^d\), there exists \(f: \mathbb{R}^d \to \mathbb{R}^k\) such that:
\[(1-\epsilon)\,\|u - v\|^2 \;\leq\; \|f(u) - f(v)\|^2 \;\leq\; (1+\epsilon)\,\|u - v\|^2\]
for all \(u, v \in X\).
Key Insight¶
Dimension \(k\) depends on \(n\) (number of points), not \(d\) (original dimension)!
2. Random Projection¶
Construction¶
Sample \(R \in \mathbb{R}^{k \times d}\) with i.i.d. entries \(R_{ij} \sim \mathcal{N}(0, 1)\).
Define: \(f(x) = \frac{1}{\sqrt{k}} R x\)
Proof Sketch¶
For \(u - v = w\): \(\mathbb{E}\left[\|f(w)\|^2\right] = \|w\|^2\), since each coordinate of \(Rw\) has variance \(\|w\|^2\).
Concentration inequalities give high probability bounds.
def jl_dimension(n, epsilon):
    """Return the JL target dimension: the smallest integer k >= 8 ln(n) / eps^2."""
    lower_bound = 8.0 * np.log(n) / (epsilon * epsilon)
    return int(np.ceil(lower_bound))
# Example: tabulate the required dimension k for several (n, ε) pairs.
n_values = [100, 1000, 10000]
eps_values = [0.1, 0.2, 0.5]
print("Required dimension k for JL lemma:\n")
# Header row: one column per ε value (was mojibake "Ξ΅" for ε).
print("n\\ε\t" + "\t".join(f"{e:.1f}" for e in eps_values))
for n in n_values:
    dims = [jl_dimension(n, e) for e in eps_values]
    print(f"{n}\t" + "\t".join(str(d) for d in dims))
3. Implementation¶
class RandomProjection:
    """Johnson-Lindenstrauss random projection.

    Projects d-dimensional data down to ``n_components`` dimensions with a
    dense Gaussian matrix scaled by 1/sqrt(n_components), which preserves
    pairwise Euclidean distances up to (1 ± ε) with high probability.

    Parameters
    ----------
    n_components : int
        Target dimension k.
    random_state : int or None, optional
        Seed for the Gaussian matrix; pass an int for reproducibility.
    """

    def __init__(self, n_components, random_state=None):
        self.n_components = n_components
        self.random_state = random_state
        self.R = None  # (k, d) projection matrix, populated by fit()

    def fit(self, X):
        """Draw the (k, d) Gaussian projection matrix for data shaped like X."""
        n, d = X.shape
        rng = np.random.RandomState(self.random_state)
        # Entries N(0, 1/k) so that E[||Rx||^2] = ||x||^2.
        self.R = rng.randn(self.n_components, d) / np.sqrt(self.n_components)
        return self

    def transform(self, X):
        """Project X of shape (n, d) to shape (n, n_components)."""
        # Fail loudly instead of the opaque AttributeError on None.T.
        if self.R is None:
            raise RuntimeError("RandomProjection must be fitted before transform().")
        return X @ self.R.T

    def fit_transform(self, X):
        """Fit on X and return the projected data."""
        return self.fit(X).transform(X)
# Test on synthetic data: project Gaussian points and measure distance distortion.
n = 100
d = 1000
X = np.random.randn(n, d)

# Pairwise Euclidean distances in the original space.
dist_orig = pdist(X, metric='euclidean')

# Project down to the JL dimension for the chosen distortion ε.
epsilon = 0.3
k = jl_dimension(n, epsilon)
print(f"Projecting from d={d} to k={k}")
rp = RandomProjection(n_components=k, random_state=42)
X_proj = rp.fit_transform(X)
dist_proj = pdist(X_proj, metric='euclidean')

# Distortion: ratio of projected to original distances, ideally within [1-ε, 1+ε].
ratios = dist_proj / dist_orig
plt.figure(figsize=(10, 5))
plt.hist(ratios, bins=50, alpha=0.7, edgecolor='black')
# Labels previously contained mojibake ("Ξ΅" for ε) — fixed.
plt.axvline(1-epsilon, color='red', linestyle='--', label=f'1-ε={1-epsilon:.2f}')
plt.axvline(1+epsilon, color='red', linestyle='--', label=f'1+ε={1+epsilon:.2f}')
plt.axvline(1, color='black', linestyle='-', alpha=0.5, label='1.0')
plt.xlabel('Distance Ratio (projected/original)', fontsize=11)
plt.ylabel('Frequency', fontsize=11)
plt.title(f'JL Distortion (n={n}, d={d}→k={k}, ε={epsilon})', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Count pairs whose distance ratio falls outside the (1 ± ε) band.
violations = np.sum((ratios < 1-epsilon) | (ratios > 1+epsilon))
print(f"\nViolations: {violations}/{len(ratios)} ({100*violations/len(ratios):.2f}%)")
print(f"Mean ratio: {np.mean(ratios):.4f}")
print(f"Std ratio: {np.std(ratios):.4f}")
4. Sparse Random Projections¶
def sparse_random_projection(X, k, sparsity=0.1, random_state=None):
    """Faster JL projection using a sparse random matrix.

    Each of the k rows has only ``int(d * sparsity)`` nonzero Gaussian
    entries, so the matrix is cheaper to build and multiply than a dense one.

    Parameters
    ----------
    X : ndarray of shape (n, d)
        Input data.
    k : int
        Target dimension.
    sparsity : float, optional
        Fraction of nonzero entries per row (default 0.1).
    random_state : int or None, optional
        Seed for reproducibility; None draws a fresh random seed.

    Returns
    -------
    ndarray of shape (n, k)
        Projected data, scaled so squared norms are preserved in expectation.
    """
    n, d = X.shape
    rng = np.random.RandomState(random_state)
    nnz = int(d * sparsity)  # nonzero entries per row
    R = np.zeros((k, d))
    for i in range(k):
        indices = rng.choice(d, size=nnz, replace=False)
        R[i, indices] = rng.randn(nnz)
    # Each row touches a fraction nnz/d of the coordinates, so
    # E[||Rw||^2] = k * (nnz/d) * ||w||^2.  Normalizing by sqrt(k * nnz / d)
    # makes E[||f(w)||^2] = ||w||^2; the previous 1/sqrt(k) scaling shrank
    # squared norms by the sparsity factor.
    R /= np.sqrt(k * nnz / d)
    return X @ R.T
# Compare dense vs sparse projection wall-clock time.
import time

n, d = 500, 5000
X = np.random.randn(n, d)
k = jl_dimension(n, 0.3)

# Dense Gaussian projection.  perf_counter() is monotonic and high-resolution,
# so it is the right clock for measuring intervals (time.time() can jump).
start = time.perf_counter()
rp_dense = RandomProjection(k).fit_transform(X)
time_dense = time.perf_counter() - start

# Sparse projection with 10% nonzero entries per row.
start = time.perf_counter()
rp_sparse = sparse_random_projection(X, k, sparsity=0.1)
time_sparse = time.perf_counter() - start

print(f"Dense: {time_dense:.4f}s")
print(f"Sparse: {time_sparse:.4f}s")
print(f"Speedup: {time_dense/time_sparse:.2f}x")
5. Application: Nearest Neighbors¶
# Nearest neighbor preservation under random projection.
from sklearn.neighbors import NearestNeighbors

n, d = 200, 500
X = np.random.randn(n, d)

# Ground-truth 10-NN in the original space.
knn_orig = NearestNeighbors(n_neighbors=10)
knn_orig.fit(X)
neighbors_orig = knn_orig.kneighbors(X, return_distance=False)

# 10-NN after projecting to the JL dimension for ε = 0.2.
k = jl_dimension(n, 0.2)
X_proj = RandomProjection(k).fit_transform(X)
knn_proj = NearestNeighbors(n_neighbors=10)
knn_proj.fit(X_proj)
neighbors_proj = knn_proj.kneighbors(X_proj, return_distance=False)

# recall@10: fraction of the true 10 neighbors recovered after projection.
recalls = []
for i in range(n):
    intersection = len(set(neighbors_orig[i]) & set(neighbors_proj[i]))
    recalls.append(intersection / 10)

plt.figure(figsize=(8, 5))
plt.hist(recalls, bins=20, alpha=0.7, edgecolor='black')
plt.xlabel('Recall@10', fontsize=11)
plt.ylabel('Frequency', fontsize=11)
# Title previously contained mojibake ("β" for →) — fixed.
plt.title(f'NN Preservation (d={d}→k={k})', fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()

print(f"Mean recall@10: {np.mean(recalls):.3f}")
print(f"Dimension reduction: {100*(1-k/d):.1f}%")
Summary¶
JL Lemma:¶
Preserves pairwise distances up to multiplicative distortion \(\epsilon\)
Random Projection:¶
A Gaussian matrix scaled by \(1/\sqrt{k}\) realizes the JL map with high probability
Key Properties:¶
Target dimension independent of \(d\)
Data-independent (no training)
Fast computation
Applications:¶
Nearest neighbor search
Clustering speedup
Compressed sensing
Privacy-preserving ML
Next Steps:¶
07_concentration_inequalities.ipynb - Theory
Explore sketching algorithms
Apply to large-scale data