import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import pdist, squareform

# Use a consistent seaborn aesthetic for every figure below.
sns.set_style('whitegrid')
# Seed NumPy's global RNG so all synthetic data and projections are reproducible.
np.random.seed(42)

1. The Johnson-Lindenstrauss Lemma¶

Statement¶

For any \(0 < \epsilon < 1\) and integer \(n\), let \(k \geq \frac{8\ln n}{\epsilon^2}\).

For any set \(X\) of \(n\) points in \(\mathbb{R}^d\), there exists \(f: \mathbb{R}^d \to \mathbb{R}^k\) such that:

\[(1-\epsilon)\|u-v\|^2 \leq \|f(u) - f(v)\|^2 \leq (1+\epsilon)\|u-v\|^2\]

for all \(u, v \in X\).

Key InsightΒΆ

Dimension \(k\) depends on \(n\) (number of points), not \(d\) (original dimension)!

2. Random Projection¶

Construction¶

Sample \(R \in \mathbb{R}^{k \times d}\) with i.i.d. entries:

\[R_{ij} \sim \mathcal{N}(0, 1/k)\]

Define: \(f(x) = Rx\)

Proof SketchΒΆ

For \(u - v = w\):

\[\mathbb{E}[\|Rw\|^2] = \|w\|^2\]

Concentration inequalities give high probability bounds.

def jl_dimension(n, epsilon):
    """Return the minimum target dimension k from the JL lemma.

    Uses the bound k >= 8 ln(n) / epsilon^2, rounded up to an integer,
    which guarantees (1 +/- epsilon) distortion for n points.
    """
    bound = 8.0 * np.log(n) / (epsilon * epsilon)
    return int(np.ceil(bound))

# Example: tabulate the required target dimension k for several (n, epsilon) pairs.
n_values = [100, 1000, 10000]
eps_values = [0.1, 0.2, 0.5]

print("Required dimension k for JL lemma:\n")
# Header row of epsilon values. The "ε" glyph was mojibake ("Ξ΅") in the original.
print("n\\ε\t" + "\t".join(f"{e:.1f}" for e in eps_values))
for n in n_values:
    # One row per point count: required k for each epsilon.
    dims = [jl_dimension(n, e) for e in eps_values]
    print(f"{n}\t" + "\t".join(str(d) for d in dims))

3. Implementation¶

class RandomProjection:
    """Johnson-Lindenstrauss random projection with a dense Gaussian matrix.

    Entries of the k x d matrix R are i.i.d. N(0, 1/k), so the map
    x -> Rx preserves squared Euclidean distances in expectation.
    """

    def __init__(self, n_components, random_state=None):
        self.n_components = n_components  # target dimension k
        self.random_state = random_state  # seed for reproducible draws
        self.R = None                     # projection matrix, created by fit()

    def fit(self, X):
        """Draw the Gaussian projection matrix sized for X's feature count."""
        d = X.shape[1]
        rng = np.random.RandomState(self.random_state)
        # randn yields N(0, 1); dividing by sqrt(k) gives variance-1/k entries.
        self.R = rng.randn(self.n_components, d) / np.sqrt(self.n_components)
        return self

    def transform(self, X):
        """Project rows of X into k dimensions: X R^T."""
        return X @ self.R.T

    def fit_transform(self, X):
        """Fit the projection on X and immediately apply it."""
        return self.fit(X).transform(X)

# Empirically verify the JL guarantee on synthetic Gaussian data.
n = 100
d = 1000
X = np.random.randn(n, d)

# All pairwise Euclidean distances in the original d-dimensional space.
dist_orig = pdist(X, metric='euclidean')

# Project down to the JL-prescribed dimension for this epsilon.
epsilon = 0.3
k = jl_dimension(n, epsilon)
print(f"Projecting from d={d} to k={k}")

rp = RandomProjection(n_components=k, random_state=42)
X_proj = rp.fit_transform(X)
dist_proj = pdist(X_proj, metric='euclidean')

# Distortion per pair: projected distance over original distance.
ratios = dist_proj / dist_orig

plt.figure(figsize=(10, 5))
plt.hist(ratios, bins=50, alpha=0.7, edgecolor='black')
# The epsilon/arrow glyphs below were mojibake ("Ξ΅", "β†’") in the original.
plt.axvline(1-epsilon, color='red', linestyle='--', label=f'1-ε={1-epsilon:.2f}')
plt.axvline(1+epsilon, color='red', linestyle='--', label=f'1+ε={1+epsilon:.2f}')
plt.axvline(1, color='black', linestyle='-', alpha=0.5, label='1.0')
plt.xlabel('Distance Ratio (projected/original)', fontsize=11)
plt.ylabel('Frequency', fontsize=11)
plt.title(f'JL Distortion (n={n}, d={d}→k={k}, ε={epsilon})', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Count pairs whose distortion falls outside the (1 +/- epsilon) band.
violations = np.sum((ratios < 1-epsilon) | (ratios > 1+epsilon))
print(f"\nViolations: {violations}/{len(ratios)} ({100*violations/len(ratios):.2f}%)")
print(f"Mean ratio: {np.mean(ratios):.4f}")
print(f"Std ratio: {np.std(ratios):.4f}")

4. Sparse Random Projections¶

def sparse_random_projection(X, k, sparsity=0.1):
    """Project X to k dimensions using a sparse Gaussian random matrix.

    Each of the k rows of R has only int(d * sparsity) nonzero N(0, 1)
    entries at uniformly random column positions, making the matrix
    multiply cheaper than a dense projection.

    Parameters
    ----------
    X : (n, d) array of points to project.
    k : int, target dimension.
    sparsity : float, fraction of nonzero entries per row of R.

    Returns
    -------
    (n, k) array of projected points.
    """
    n, d = X.shape

    # Build the sparse random matrix row by row.
    R = np.zeros((k, d))
    nnz = int(d * sparsity)
    for i in range(k):
        indices = np.random.choice(d, size=nnz, replace=False)
        R[i, indices] = np.random.randn(nnz)

    # BUG FIX: scale by sqrt(k * sparsity), not sqrt(k). With density
    # `sparsity`, E[(R_i . w)^2] = sparsity * ||w||^2 before scaling, so
    # dividing by sqrt(k) alone shrinks expected squared norms by the
    # factor `sparsity`; the extra sqrt(sparsity) restores
    # E[||Rw||^2] = ||w||^2 as required by the JL construction.
    R /= np.sqrt(k * sparsity)
    return X @ R.T

# Compare dense vs sparse projection wall-clock time on a larger problem.
# NOTE: statement order matters here — each timer deliberately brackets a
# single projection call, so the code is left exactly as-is.
import time

n, d = 500, 5000
X = np.random.randn(n, d)
k = jl_dimension(n, 0.3)

# Dense Gaussian projection (matrix is fully populated).
start = time.time()
rp_dense = RandomProjection(k).fit_transform(X)
time_dense = time.time() - start

# Sparse projection: only 10% of each row of R is nonzero.
start = time.time()
rp_sparse = sparse_random_projection(X, k, sparsity=0.1)
time_sparse = time.time() - start

print(f"Dense: {time_dense:.4f}s")
print(f"Sparse: {time_sparse:.4f}s")
print(f"Speedup: {time_dense/time_sparse:.2f}x")

5. Application: Nearest Neighbors¶

# Measure how well nearest-neighbor sets survive random projection.
from sklearn.neighbors import NearestNeighbors

n, d = 200, 500
X = np.random.randn(n, d)

# 10-NN in the original space (each point's own index is among its neighbors).
knn_orig = NearestNeighbors(n_neighbors=10)
knn_orig.fit(X)
neighbors_orig = knn_orig.kneighbors(X, return_distance=False)

# 10-NN after projecting to the JL dimension for epsilon = 0.2.
k = jl_dimension(n, 0.2)
X_proj = RandomProjection(k).fit_transform(X)
knn_proj = NearestNeighbors(n_neighbors=10)
knn_proj.fit(X_proj)
neighbors_proj = knn_proj.kneighbors(X_proj, return_distance=False)

# Recall@10 per point: fraction of original neighbors retained after projection.
recalls = []
for i in range(n):
    intersection = len(set(neighbors_orig[i]) & set(neighbors_proj[i]))
    recalls.append(intersection / 10)

plt.figure(figsize=(8, 5))
plt.hist(recalls, bins=20, alpha=0.7, edgecolor='black')
plt.xlabel('Recall@10', fontsize=11)
plt.ylabel('Frequency', fontsize=11)
# The arrow glyph was mojibake ("β†’") in the original title string.
plt.title(f'NN Preservation (d={d}→k={k})', fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()

print(f"Mean recall@10: {np.mean(recalls):.3f}")
print(f"Dimension reduction: {100*(1-k/d):.1f}%")

SummaryΒΆ

JL Lemma:ΒΆ

\[k = O\left(\frac{\log n}{\epsilon^2}\right)\]

Preserves distances with distortion \(\epsilon\)

Random Projection:¶

\[f(x) = Rx, \quad R_{ij} \sim \mathcal{N}(0, 1/k)\]

Key Properties:¶

  • Target dimension \(k\) is independent of the ambient dimension \(d\)

  • Data-independent (no training)

  • Fast computation

Applications:ΒΆ

  • Nearest neighbor search

  • Clustering speedup

  • Compressed sensing

  • Privacy-preserving ML

Next Steps:ΒΆ

  • 07_concentration_inequalities.ipynb - Theory

  • Explore sketching algorithms

  • Apply to large-scale data