import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import pdist, squareform
# Use seaborn's whitegrid theme for every plot in this notebook.
sns.set_style('whitegrid')
# Seed the global NumPy RNG so the demos below are reproducible.
np.random.seed(42)
1. The Johnson-Lindenstrauss Lemma¶
Statement¶
For any \(0 < \epsilon < 1\) and integer \(n\), let \(k \geq \frac{8\ln n}{\epsilon^2}\).
For any set \(X\) of \(n\) points in \(\mathbb{R}^d\), there exists \(f: \mathbb{R}^d \to \mathbb{R}^k\) such that:
\[(1-\epsilon)\,\|u - v\|^2 \;\leq\; \|f(u) - f(v)\|^2 \;\leq\; (1+\epsilon)\,\|u - v\|^2\]
for all \(u, v \in X\).
Key Insight¶
Dimension \(k\) depends on \(n\) (number of points), not \(d\) (original dimension)!
2. Random Projection¶
Construction¶
Sample \(R \in \mathbb{R}^{k \times d}\) with i.i.d. entries \(R_{ij} \sim \mathcal{N}(0, 1)\).
Define: \(f(x) = \frac{1}{\sqrt{k}} R x\)
Proof Sketch¶
For \(u - v = w\): \(\mathbb{E}\left[\|f(w)\|^2\right] = \|w\|^2\), since each coordinate of \(Rw\) has variance \(\|w\|^2\).
Concentration inequalities give high probability bounds.
def jl_dimension(n, epsilon):
    """Return the JL target dimension: the smallest integer k >= 8 ln(n) / eps^2."""
    lower_bound = 8.0 * np.log(n) / (epsilon * epsilon)
    return int(np.ceil(lower_bound))
# Example: tabulate the required dimension k for several (n, ε) pairs.
n_values = [100, 1000, 10000]
eps_values = [0.1, 0.2, 0.5]
print("Required dimension k for JL lemma:\n")
# Header row: one column per ε value (was mojibake "Ξ΅" for ε).
print("n\\ε\t" + "\t".join(f"{e:.1f}" for e in eps_values))
for n in n_values:
    dims = [jl_dimension(n, e) for e in eps_values]
    print(f"{n}\t" + "\t".join(str(d) for d in dims))
3. Implementation¶
class RandomProjection:
    """Johnson-Lindenstrauss random projection.

    Projects d-dimensional data down to ``n_components`` dimensions with a
    dense Gaussian matrix scaled by 1/sqrt(n_components), which preserves
    pairwise Euclidean distances up to (1 ± ε) with high probability.

    Parameters
    ----------
    n_components : int
        Target dimension k.
    random_state : int or None, optional
        Seed for the Gaussian matrix; pass an int for reproducibility.
    """

    def __init__(self, n_components, random_state=None):
        self.n_components = n_components
        self.random_state = random_state
        self.R = None  # (k, d) projection matrix, populated by fit()

    def fit(self, X):
        """Draw the (k, d) Gaussian projection matrix for data shaped like X."""
        n, d = X.shape
        rng = np.random.RandomState(self.random_state)
        # Entries N(0, 1/k) so that E[||Rx||^2] = ||x||^2.
        self.R = rng.randn(self.n_components, d) / np.sqrt(self.n_components)
        return self

    def transform(self, X):
        """Project X of shape (n, d) to shape (n, n_components)."""
        # Fail loudly instead of the opaque AttributeError on None.T.
        if self.R is None:
            raise RuntimeError("RandomProjection must be fitted before transform().")
        return X @ self.R.T

    def fit_transform(self, X):
        """Fit on X and return the projected data."""
        return self.fit(X).transform(X)
# Test on synthetic data: project Gaussian points and measure distance distortion.
n = 100
d = 1000
X = np.random.randn(n, d)

# Pairwise Euclidean distances in the original space.
dist_orig = pdist(X, metric='euclidean')

# Project down to the JL dimension for the chosen distortion ε.
epsilon = 0.3
k = jl_dimension(n, epsilon)
print(f"Projecting from d={d} to k={k}")
rp = RandomProjection(n_components=k, random_state=42)
X_proj = rp.fit_transform(X)
dist_proj = pdist(X_proj, metric='euclidean')

# Distortion: ratio of projected to original distances, ideally within [1-ε, 1+ε].
ratios = dist_proj / dist_orig
plt.figure(figsize=(10, 5))
plt.hist(ratios, bins=50, alpha=0.7, edgecolor='black')
# Labels previously contained mojibake ("Ξ΅" for ε) — fixed.
plt.axvline(1-epsilon, color='red', linestyle='--', label=f'1-ε={1-epsilon:.2f}')
plt.axvline(1+epsilon, color='red', linestyle='--', label=f'1+ε={1+epsilon:.2f}')
plt.axvline(1, color='black', linestyle='-', alpha=0.5, label='1.0')
plt.xlabel('Distance Ratio (projected/original)', fontsize=11)
plt.ylabel('Frequency', fontsize=11)
plt.title(f'JL Distortion (n={n}, d={d}→k={k}, ε={epsilon})', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Count pairs whose distance ratio falls outside the (1 ± ε) band.
violations = np.sum((ratios < 1-epsilon) | (ratios > 1+epsilon))
print(f"\nViolations: {violations}/{len(ratios)} ({100*violations/len(ratios):.2f}%)")
print(f"Mean ratio: {np.mean(ratios):.4f}")
print(f"Std ratio: {np.std(ratios):.4f}")
4. Sparse Random Projections¶
def sparse_random_projection(X, k, sparsity=0.1, random_state=None):
    """Faster JL projection using a sparse random matrix.

    Each of the k rows has only ``int(d * sparsity)`` nonzero Gaussian
    entries, so the matrix is cheaper to build and multiply than a dense one.

    Parameters
    ----------
    X : ndarray of shape (n, d)
        Input data.
    k : int
        Target dimension.
    sparsity : float, optional
        Fraction of nonzero entries per row (default 0.1).
    random_state : int or None, optional
        Seed for reproducibility; None draws a fresh random seed.

    Returns
    -------
    ndarray of shape (n, k)
        Projected data, scaled so squared norms are preserved in expectation.
    """
    n, d = X.shape
    rng = np.random.RandomState(random_state)
    nnz = int(d * sparsity)  # nonzero entries per row
    R = np.zeros((k, d))
    for i in range(k):
        indices = rng.choice(d, size=nnz, replace=False)
        R[i, indices] = rng.randn(nnz)
    # Each row touches a fraction nnz/d of the coordinates, so
    # E[||Rw||^2] = k * (nnz/d) * ||w||^2.  Normalizing by sqrt(k * nnz / d)
    # makes E[||f(w)||^2] = ||w||^2; the previous 1/sqrt(k) scaling shrank
    # squared norms by the sparsity factor.
    R /= np.sqrt(k * nnz / d)
    return X @ R.T
# Compare dense vs sparse projection wall-clock time.
import time

n, d = 500, 5000
X = np.random.randn(n, d)
k = jl_dimension(n, 0.3)

# Dense Gaussian projection.  perf_counter() is monotonic and high-resolution,
# so it is the right clock for measuring intervals (time.time() can jump).
start = time.perf_counter()
rp_dense = RandomProjection(k).fit_transform(X)
time_dense = time.perf_counter() - start

# Sparse projection with 10% nonzero entries per row.
start = time.perf_counter()
rp_sparse = sparse_random_projection(X, k, sparsity=0.1)
time_sparse = time.perf_counter() - start

print(f"Dense: {time_dense:.4f}s")
print(f"Sparse: {time_sparse:.4f}s")
print(f"Speedup: {time_dense/time_sparse:.2f}x")
5. Application: Nearest Neighbors¶
# Nearest neighbor preservation under random projection.
from sklearn.neighbors import NearestNeighbors

n, d = 200, 500
X = np.random.randn(n, d)

# Ground-truth 10-NN in the original space.
knn_orig = NearestNeighbors(n_neighbors=10)
knn_orig.fit(X)
neighbors_orig = knn_orig.kneighbors(X, return_distance=False)

# 10-NN after projecting to the JL dimension for ε = 0.2.
k = jl_dimension(n, 0.2)
X_proj = RandomProjection(k).fit_transform(X)
knn_proj = NearestNeighbors(n_neighbors=10)
knn_proj.fit(X_proj)
neighbors_proj = knn_proj.kneighbors(X_proj, return_distance=False)

# recall@10: fraction of the true 10 neighbors recovered after projection.
recalls = []
for i in range(n):
    intersection = len(set(neighbors_orig[i]) & set(neighbors_proj[i]))
    recalls.append(intersection / 10)

plt.figure(figsize=(8, 5))
plt.hist(recalls, bins=20, alpha=0.7, edgecolor='black')
plt.xlabel('Recall@10', fontsize=11)
plt.ylabel('Frequency', fontsize=11)
# Title previously contained mojibake ("β" for →) — fixed.
plt.title(f'NN Preservation (d={d}→k={k})', fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()

print(f"Mean recall@10: {np.mean(recalls):.3f}")
print(f"Dimension reduction: {100*(1-k/d):.1f}%")
Summary¶
JL Lemma:¶
Preserves pairwise distances up to multiplicative distortion \(\epsilon\)
Random Projection:¶
A Gaussian matrix scaled by \(1/\sqrt{k}\) realizes the JL map with high probability
Key Properties:¶
Target dimension independent of \(d\)
Data-independent (no training)
Fast computation
Applications:¶
Nearest neighbor search
Clustering speedup
Compressed sensing
Privacy-preserving ML
Next Steps:¶
07_concentration_inequalities.ipynb - Theory
Explore sketching algorithms
Apply to large-scale data