# Setup
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_digits, fetch_olivetti_faces
from mpl_toolkits.mplot3d import Axes3D
sns.set_style('whitegrid')
np.random.seed(42)
np.set_printoptions(precision=4, suppress=True)
10.1 Problem Setting
High-Dimensional Data:
Given N data points in D dimensions: \(\mathbf{X} \in \mathbb{R}^{N \times D}\)
Goal: Find an M-dimensional representation (M < D) that:
- Captures maximum variance
- Minimizes reconstruction error
PCA Solution:
Project the data onto M orthogonal directions (principal components) that capture the most variance:
\(\mathbf{z}_n = \mathbf{B}^T(\mathbf{x}_n - \boldsymbol{\mu})\)
where:
- B: D×M matrix of principal components (orthonormal columns)
- z_n: M-dimensional compressed representation
- μ: mean of the data
Reconstruction:
\(\mathbf{\tilde{x}}_n = \mathbf{B}\mathbf{z}_n + \boldsymbol{\mu}\)
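Before deriving the optimal B, a minimal shape-check of the compress/reconstruct pipeline with an arbitrary orthonormal basis (B_toy from a QR factorization is a hypothetical stand-in, not the PCA solution):
# Shape check for the compress/reconstruct pipeline with a stand-in basis
rng = np.random.default_rng(0)
D, M, N = 5, 2, 100
X_toy = rng.normal(size=(N, D))
mu_toy = X_toy.mean(axis=0)
B_toy, _ = np.linalg.qr(rng.normal(size=(D, M)))   # D x M with orthonormal columns
Z_toy = (X_toy - mu_toy) @ B_toy                   # compress: z_n = B^T (x_n - mu)
X_back = Z_toy @ B_toy.T + mu_toy                  # reconstruct: x~_n = B z_n + mu
print(Z_toy.shape, X_back.shape)                   # (100, 2) (100, 5)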
# Generate correlated 2D data for visualization
np.random.seed(42)
# Generate data with strong correlation
n_samples = 200
mean = [0, 0]
cov = [[3, 2.5],  # Strong positive correlation
       [2.5, 3]]
X = np.random.multivariate_normal(mean, cov, n_samples)
print("Original Data")
print(f"Shape: {X.shape}")
print(f"Mean: {X.mean(axis=0)}")
print(f"\nCovariance matrix:")
print(np.cov(X.T))
# Visualize
plt.figure(figsize=(10, 6))
plt.scatter(X[:, 0], X[:, 1], alpha=0.6, s=50)
plt.xlabel('x₁', fontsize=12)
plt.ylabel('x₂', fontsize=12)
plt.title('Correlated 2D Data', fontsize=14)
plt.axis('equal')
plt.grid(True, alpha=0.3)
plt.show()
print("\nData is correlated β can be compressed to 1D!")
10.2 Maximum Variance Perspective
First Principal Component:
Find the direction b₁ that maximizes the variance of the projections:
\(\max_{\mathbf{b}_1} \mathbf{b}_1^T \mathbf{S} \mathbf{b}_1 \quad \text{subject to } \|\mathbf{b}_1\| = 1\)
Variance of Projections:
\(V = \frac{1}{N}\sum_{n=1}^{N} \left(\mathbf{b}_1^T(\mathbf{x}_n - \boldsymbol{\mu})\right)^2 = \mathbf{b}_1^T \mathbf{S} \mathbf{b}_1\)
where S is the data covariance matrix.
Solution:
b₁ is the eigenvector of S with the largest eigenvalue!
Eigenvalue λᵢ = variance along principal component bᵢ
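A quick numeric check of this result on the 2D data X above (a sketch; b_rand is just an arbitrary unit direction for contrast):
# Verify: Var(b^T x) = b^T S b for any unit b, and the top eigenvector maximizes it
Xc_chk = X - X.mean(axis=0)
S_chk = (Xc_chk.T @ Xc_chk) / len(X)
rng = np.random.default_rng(1)
b_rand = rng.normal(size=2)
b_rand /= np.linalg.norm(b_rand)                    # arbitrary unit direction
print("random b:  proj var =", np.var(Xc_chk @ b_rand),
      " b^T S b =", b_rand @ S_chk @ b_rand)
vals_chk, vecs_chk = np.linalg.eigh(S_chk)          # eigh returns ascending order
print("top eigvec: proj var =", np.var(Xc_chk @ vecs_chk[:, -1]),
      " λ_max =", vals_chk[-1])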
# Compute PCA manually: Maximum Variance Perspective
def compute_pca_manual(X, n_components=2):
    """
    Compute PCA from scratch.
    Steps:
    1. Center the data
    2. Compute covariance matrix
    3. Compute eigenvectors and eigenvalues
    4. Sort by eigenvalue
    5. Select top n_components
    """
    # Step 1: Center data
    mu = X.mean(axis=0)
    X_centered = X - mu
    # Step 2: Covariance matrix
    # S = (1/N) X^T X
    S = (X_centered.T @ X_centered) / X.shape[0]
    # Step 3: Eigendecomposition (eigh: S is symmetric, guarantees real output)
    eigenvalues, eigenvectors = np.linalg.eigh(S)
    # Step 4: Sort by eigenvalue (descending)
    idx = eigenvalues.argsort()[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    # Step 5: Select top components
    principal_components = eigenvectors[:, :n_components]
    explained_variance = eigenvalues[:n_components]
    return principal_components, explained_variance, mu
# Compute PCA
B, eigenvalues, mu = compute_pca_manual(X, n_components=2)
print("PCA Results (Manual Computation)")
print(f"\nMean (ΞΌ):")
print(mu)
print(f"\nPrincipal Components (eigenvectors):")
print("PC1:", B[:, 0])
print("PC2:", B[:, 1])
print(f"\nEigenvalues (variance along each PC):")
print("Ξ»β:", eigenvalues[0])
print("Ξ»β:", eigenvalues[1])
print(f"\nExplained variance ratio:")
total_var = eigenvalues.sum()
print(f"PC1: {eigenvalues[0]/total_var*100:.1f}%")
print(f"PC2: {eigenvalues[1]/total_var*100:.1f}%")
# Visualize principal components
plt.figure(figsize=(10, 6))
plt.scatter(X[:, 0], X[:, 1], alpha=0.6, s=50, label='Data')
plt.scatter(mu[0], mu[1], color='red', s=200, marker='x', linewidth=3, label='Mean')
# Draw principal components
scale = 3 # Scale for visualization
for i in range(2):
    direction = B[:, i] * np.sqrt(eigenvalues[i]) * scale
    plt.arrow(mu[0], mu[1], direction[0], direction[1],
              head_width=0.3, head_length=0.2, fc=f'C{i+1}', ec=f'C{i+1}',
              linewidth=3, label=f'PC{i+1}')
plt.xlabel('x₁', fontsize=12)
plt.ylabel('x₂', fontsize=12)
plt.title('Principal Components (Maximum Variance Directions)', fontsize=14)
plt.axis('equal')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()
print("\nPC1 points in direction of maximum variance!")
print("PC2 is orthogonal to PC1 (maximum remaining variance)")
10.3 Projection Perspective
Reconstruction Error:
PCA also minimizes the average squared reconstruction error:
\(J = \frac{1}{N}\sum_{n=1}^{N} \|\mathbf{x}_n - \mathbf{\tilde{x}}_n\|^2\)
where \(\mathbf{\tilde{x}}_n = \mathbf{B}\mathbf{B}^T(\mathbf{x}_n - \boldsymbol{\mu}) + \boldsymbol{\mu}\)
Key Result:
Minimizing reconstruction error ≡ maximizing variance captured
Both perspectives give the same solution!
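One way to see the equivalence: \(\mathbf{B}\mathbf{B}^T\) is the orthogonal projection onto the principal subspace, so the reconstruction is the closest point in that subspace. A small sketch reusing B from compute_pca_manual checks the defining properties:
# B B^T is an orthogonal projection matrix onto the principal subspace
P = B[:, :1] @ B[:, :1].T                          # projector onto the 1D subspace
print("rank:", np.linalg.matrix_rank(P))           # 1
print("idempotent (P @ P == P):", np.allclose(P @ P, P))
print("symmetric  (P == P^T):  ", np.allclose(P, P.T))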
# Project data onto principal components
# Transform to PCA space (project onto principal components)
X_centered = X - mu
Z = X_centered @ B # N x M matrix of PCA coefficients
print("PCA Transformation")
print(f"\nOriginal data shape: {X.shape}")
print(f"PCA coefficients shape: {Z.shape}")
print(f"\nFirst 5 samples in original space:")
print(X[:5])
print(f"\nFirst 5 samples in PCA space:")
print(Z[:5])
# Reconstruct using M components
def reconstruct(Z, B, mu, n_components):
    """Reconstruct data from PCA representation."""
    return Z[:, :n_components] @ B[:, :n_components].T + mu
# Reconstruct with 1 PC vs 2 PCs
X_recon_1 = reconstruct(Z, B, mu, n_components=1)
X_recon_2 = reconstruct(Z, B, mu, n_components=2)
# Reconstruction error
error_1 = np.mean(np.sum((X - X_recon_1)**2, axis=1))
error_2 = np.mean(np.sum((X - X_recon_2)**2, axis=1))
print(f"\nReconstruction Error:")
print(f" 1 PC: {error_1:.4f}")
print(f" 2 PCs: {error_2:.4f}")
# Visualize
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
# Original
axes[0].scatter(X[:, 0], X[:, 1], alpha=0.6, s=50)
axes[0].set_xlabel('x₁')
axes[0].set_ylabel('x₂')
axes[0].set_title('Original Data')
axes[0].axis('equal')
axes[0].grid(True, alpha=0.3)
# Reconstructed with 1 PC
axes[1].scatter(X[:, 0], X[:, 1], alpha=0.3, s=30, label='Original')
axes[1].scatter(X_recon_1[:, 0], X_recon_1[:, 1], alpha=0.6, s=50, label='Reconstructed')
# Draw projection lines
for i in range(0, len(X), 10):  # Show every 10th for clarity
    axes[1].plot([X[i, 0], X_recon_1[i, 0]],
                 [X[i, 1], X_recon_1[i, 1]], 'k-', alpha=0.2, linewidth=0.5)
axes[1].set_xlabel('x₁')
axes[1].set_ylabel('x₂')
axes[1].set_title(f'1 PC (Error: {error_1:.3f})')
axes[1].axis('equal')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
# Reconstructed with 2 PCs
axes[2].scatter(X[:, 0], X[:, 1], alpha=0.3, s=30, label='Original')
axes[2].scatter(X_recon_2[:, 0], X_recon_2[:, 1], alpha=0.6, s=50, label='Reconstructed')
axes[2].set_xlabel('x₁')
axes[2].set_ylabel('x₂')
axes[2].set_title(f'2 PCs (Error: {error_2:.3f})')
axes[2].axis('equal')
axes[2].legend()
axes[2].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("\nWith 1 PC: Data projected onto line (information loss)")
print("With 2 PCs: Perfect reconstruction (no information loss)")
# Demonstrate dimensionality reduction in 3D
# Generate 3D data lying on a 2D plane (with noise)
np.random.seed(42)
n = 300
# Create data on a tilted plane
t1 = np.random.uniform(-2, 2, n)
t2 = np.random.uniform(-2, 2, n)
X_3d = np.column_stack([
    t1 + 0.3*t2,
    0.5*t1 + t2,
    -0.7*t1 + 0.2*t2
])
# Add small noise
X_3d += 0.1 * np.random.randn(n, 3)
# Compute PCA
B_3d, eigenvalues_3d, mu_3d = compute_pca_manual(X_3d, n_components=3)
# Visualize
fig = plt.figure(figsize=(16, 5))
# Original 3D data
ax1 = fig.add_subplot(131, projection='3d')
ax1.scatter(X_3d[:, 0], X_3d[:, 1], X_3d[:, 2], alpha=0.6, s=30)
ax1.set_xlabel('x₁')
ax1.set_ylabel('x₂')
ax1.set_zlabel('x₃')
ax1.set_title('Original 3D Data')
# PCA space (first 2 PCs)
ax2 = fig.add_subplot(132)
Z_3d = (X_3d - mu_3d) @ B_3d
ax2.scatter(Z_3d[:, 0], Z_3d[:, 1], alpha=0.6, s=30)
ax2.set_xlabel('PC1')
ax2.set_ylabel('PC2')
ax2.set_title('PCA Space (2D)')
ax2.axis('equal')
ax2.grid(True, alpha=0.3)
# Explained variance
ax3 = fig.add_subplot(133)
variance_ratio = eigenvalues_3d / eigenvalues_3d.sum()
cumulative_variance = np.cumsum(variance_ratio)
bars = ax3.bar(range(1, 4), variance_ratio, alpha=0.6)
ax3.plot(range(1, 4), cumulative_variance, 'ro-', linewidth=2, markersize=8)
ax3.set_xlabel('Principal Component')
ax3.set_ylabel('Explained Variance Ratio')
ax3.set_title('Scree Plot')
ax3.set_xticks([1, 2, 3])
ax3.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("3D β 2D Dimensionality Reduction")
print(f"\nExplained variance:")
for i, (ev, ratio) in enumerate(zip(eigenvalues_3d, variance_ratio)):
    print(f"  PC{i+1}: {ratio*100:.1f}% (λ={ev:.4f})")
print(f"\nCumulative variance with 2 PCs: {cumulative_variance[1]*100:.1f}%")
print("β Can reduce from 3D to 2D with minimal information loss!")
10.4 PCA Algorithm
Standard PCA Algorithm:
Input: Data matrix X (N×D), number of components M
Steps:
1. Center the data: \(\mathbf{X}_c = \mathbf{X} - \boldsymbol{\mu}\)
2. Compute covariance: \(\mathbf{S} = \frac{1}{N}\mathbf{X}_c^T\mathbf{X}_c\)
3. Eigendecomposition: \(\mathbf{S}\mathbf{b}_i = \lambda_i\mathbf{b}_i\)
4. Sort eigenvalues: \(\lambda_1 \geq \lambda_2 \geq \dots \geq \lambda_D\)
5. Select top M: principal components matrix B = [b₁, …, b_M]
6. Transform: \(\mathbf{Z} = \mathbf{X}_c\mathbf{B}\)
7. Reconstruct: \(\mathbf{\tilde{X}} = \mathbf{Z}\mathbf{B}^T + \boldsymbol{\mu}\)
Alternative: SVD
For high-dimensional data (D >> N), use the SVD of the centered data instead:
\(\mathbf{X}_c = \mathbf{U}\boldsymbol{\Sigma}\mathbf{V}^T, \qquad \mathbf{S} = \frac{1}{N}\mathbf{V}\boldsymbol{\Sigma}^2\mathbf{V}^T\)
Principal components = columns of V!
# Compare eigendecomposition vs SVD for PCA
def pca_via_eig(X, n_components):
    """PCA using eigendecomposition of the covariance matrix."""
    mu = X.mean(axis=0)
    X_c = X - mu
    S = (X_c.T @ X_c) / X.shape[0]
    eigenvalues, eigenvectors = np.linalg.eigh(S)  # eigh: S is symmetric
    idx = eigenvalues.argsort()[::-1]
    B = eigenvectors[:, idx[:n_components]]
    return B, mu

def pca_via_svd(X, n_components):
    """PCA using SVD (more efficient for high dimensions)."""
    mu = X.mean(axis=0)
    X_c = X - mu
    U, S, Vt = np.linalg.svd(X_c, full_matrices=False)
    B = Vt.T[:, :n_components]  # V = Vt.T
    return B, mu
# Compare both methods
B_eig, mu_eig = pca_via_eig(X, n_components=2)
B_svd, mu_svd = pca_via_svd(X, n_components=2)
print("PCA via Eigendecomposition:")
print(B_eig)
print("\nPCA via SVD:")
print(B_svd)
print("\nBoth methods give same principal components!")
print("(signs may differ, but span same subspace)")
# Compare with sklearn
from sklearn.decomposition import PCA
pca_sklearn = PCA(n_components=2)
pca_sklearn.fit(X)
print("\nSklearn PCA components:")
print(pca_sklearn.components_.T)
print("\nExplained variance ratio:")
print(pca_sklearn.explained_variance_ratio_)
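The two routes are linked through the singular values: with the 1/N convention used by compute_pca_manual, λᵢ = σᵢ²/N (sklearn's explained_variance_ divides by N-1 instead). A quick check on X:
# λ_i = σ_i² / N relates SVD singular values to covariance eigenvalues
X_c_chk = X - X.mean(axis=0)
_, sing_vals, _ = np.linalg.svd(X_c_chk, full_matrices=False)
print("σ² / N:           ", sing_vals**2 / len(X))
print("eigenvalues of S: ", eigenvalues)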
10.5 PCA on Real Data: Handwritten Digits
Let's apply PCA to the scikit-learn handwritten digits dataset (8×8 images).
Original: 64 dimensions (one per pixel). Compressed: M << 64 dimensions.
Can we reconstruct digits from the compressed representation?
# Load digits dataset
digits = load_digits()
X_digits = digits.data # 1797 samples, 64 features
y_digits = digits.target
print("Handwritten Digits Dataset")
print(f"Shape: {X_digits.shape}")
print(f"Classes: {np.unique(y_digits)}")
# Visualize some digits
fig, axes = plt.subplots(2, 10, figsize=(14, 3))
for i, ax in enumerate(axes.flat):
    ax.imshow(X_digits[i].reshape(8, 8), cmap='gray')
    ax.set_title(f'{y_digits[i]}')
    ax.axis('off')
plt.suptitle('Original Digits (64 dimensions)', fontsize=14)
plt.tight_layout()
plt.show()
# Apply PCA
pca = PCA()
pca.fit(X_digits)
# Scree plot
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(range(1, 65), pca.explained_variance_ratio_, 'bo-')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Scree Plot')
plt.grid(True, alpha=0.3)
plt.subplot(1, 2, 2)
cumulative = np.cumsum(pca.explained_variance_ratio_)
plt.plot(range(1, 65), cumulative, 'ro-')
plt.axhline(y=0.9, color='g', linestyle='--', label='90% variance')
plt.axhline(y=0.95, color='b', linestyle='--', label='95% variance')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Cumulative Variance')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Find components needed for 90% and 95% variance
n_90 = np.argmax(cumulative >= 0.90) + 1
n_95 = np.argmax(cumulative >= 0.95) + 1
print(f"\nDimensionality Reduction:")
print(f" Components for 90% variance: {n_90}/64 ({n_90/64*100:.1f}%)")
print(f" Components for 95% variance: {n_95}/64 ({n_95/64*100:.1f}%)")
print(f"\nCompression ratio (95%): {64/n_95:.1f}x")
# Reconstruct digits with different numbers of components
n_components_list = [2, 5, 10, 20, 64]
fig, axes = plt.subplots(len(n_components_list), 10, figsize=(14, 2*len(n_components_list)))
for row, n_comp in enumerate(n_components_list):
    # PCA with n_comp components
    pca_n = PCA(n_components=n_comp)
    X_transformed = pca_n.fit_transform(X_digits)
    X_reconstructed = pca_n.inverse_transform(X_transformed)
    # Reconstruction error
    mse = np.mean((X_digits - X_reconstructed)**2)
    var_explained = pca_n.explained_variance_ratio_.sum()
    # Visualize
    for col in range(10):
        axes[row, col].imshow(X_reconstructed[col].reshape(8, 8), cmap='gray')
        axes[row, col].axis('off')
    # Row label: set_ylabel is hidden once axis('off') is set, so draw text instead
    axes[row, 0].text(-0.35, 0.5, f'{n_comp} PCs\n{var_explained*100:.1f}%',
                      transform=axes[row, 0].transAxes,
                      ha='center', va='center', fontsize=10)
plt.suptitle('Digit Reconstruction with Different Numbers of Principal Components', fontsize=14)
plt.tight_layout()
plt.show()
print("Observation:")
print(" 2 PCs: Very blurry, barely recognizable")
print(" 5 PCs: Can recognize digits, but details lost")
print(" 10 PCs: Good reconstruction")
print(" 20 PCs: Very good reconstruction")
print(" 64 PCs: Perfect reconstruction (original)")
# Visualize principal components (eigenfaces for digits)
fig, axes = plt.subplots(2, 8, figsize=(14, 4))
for i, ax in enumerate(axes.flat[:16]):
    # Each principal component is a 64-dim vector
    component = pca.components_[i].reshape(8, 8)
    ax.imshow(component, cmap='RdBu_r')
    ax.set_title(f'PC{i+1}\n{pca.explained_variance_ratio_[i]*100:.1f}%')
    ax.axis('off')
plt.suptitle('First 16 Principal Components (Eigendigits)', fontsize=14)
plt.tight_layout()
plt.show()
print("Principal Components:")
print(" - Each PC is a 'typical pattern' in the data")
print(" - First PCs capture most variance (e.g., overall brightness)")
print(" - Later PCs capture finer details")
print(" - Any digit β weighted sum of these patterns!")
# Visualize data in 2D PCA space
# Transform to 2D
pca_2d = PCA(n_components=2)
X_2d = pca_2d.fit_transform(X_digits)
# Plot
plt.figure(figsize=(12, 10))
scatter = plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y_digits,
                      cmap='tab10', alpha=0.6, s=30)
plt.colorbar(scatter, label='Digit')
plt.xlabel(f'PC1 ({pca_2d.explained_variance_ratio_[0]*100:.1f}% variance)')
plt.ylabel(f'PC2 ({pca_2d.explained_variance_ratio_[1]*100:.1f}% variance)')
plt.title('Digits in 2D PCA Space')
plt.grid(True, alpha=0.3)
# Add digit labels
for digit in range(10):
    mask = y_digits == digit
    center = X_2d[mask].mean(axis=0)
    plt.text(center[0], center[1], str(digit),
             fontsize=20, fontweight='bold',
             bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
plt.show()
print("2D PCA Visualization:")
print(" - Each point is a digit (64D β 2D)")
print(" - Colors show digit classes")
print(" - Some separation visible even in 2D!")
print(f" - Total variance explained: {pca_2d.explained_variance_ratio_.sum()*100:.1f}%")
10.6 PCA for Data Preprocessing
Common Uses:
1. Noise Reduction: project away noise in low-variance directions
2. Visualization: project high-D data to 2D/3D
3. Feature Extraction: use PCA features for ML models
4. Compression: store/transmit data more efficiently
5. Whitening: transform to uncorrelated, unit-variance features
Whitening Transformation:
\(\mathbf{z} = \boldsymbol{\Lambda}^{-1/2}\mathbf{B}^T(\mathbf{x} - \boldsymbol{\mu})\)
where Λ is the diagonal matrix of eigenvalues.
Result: uncorrelated features with unit variance
Important Notes:
⚠️ Always standardize if features have different scales
⚠️ PCA assumes linearity and may miss nonlinear structure
⚠️ PCA is sensitive to outliers; consider robust PCA variants
# Demonstrate whitening
# Generate correlated data
np.random.seed(42)
mean = [0, 0]
cov = [[3, 2.5], [2.5, 3]]
X_corr = np.random.multivariate_normal(mean, cov, 300)
# Compute PCA
pca_white = PCA(whiten=False)
X_pca = pca_white.fit_transform(X_corr)
# Whitened PCA
pca_whiten = PCA(whiten=True)
X_whitened = pca_whiten.fit_transform(X_corr)
# Visualize
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
# Original
axes[0].scatter(X_corr[:, 0], X_corr[:, 1], alpha=0.6, s=30)
axes[0].set_xlabel('x₁')
axes[0].set_ylabel('x₂')
axes[0].set_title('Original Data\n(Correlated, Different Variances)')
axes[0].axis('equal')
axes[0].grid(True, alpha=0.3)
axes[0].set_xlim([-6, 6])
axes[0].set_ylim([-6, 6])
# PCA (rotated, uncorrelated)
axes[1].scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.6, s=30)
axes[1].set_xlabel('PC1')
axes[1].set_ylabel('PC2')
axes[1].set_title('PCA Space\n(Uncorrelated, Different Variances)')
axes[1].axis('equal')
axes[1].grid(True, alpha=0.3)
axes[1].set_xlim([-6, 6])
axes[1].set_ylim([-6, 6])
# Whitened (rotated, uncorrelated, unit variance)
axes[2].scatter(X_whitened[:, 0], X_whitened[:, 1], alpha=0.6, s=30)
axes[2].set_xlabel('Whitened PC1')
axes[2].set_ylabel('Whitened PC2')
axes[2].set_title('Whitened PCA\n(Uncorrelated, Unit Variance)')
axes[2].axis('equal')
axes[2].grid(True, alpha=0.3)
axes[2].set_xlim([-6, 6])
axes[2].set_ylim([-6, 6])
plt.tight_layout()
plt.show()
print("Covariance Matrices:\n")
print("Original:")
print(np.cov(X_corr.T))
print("\nPCA (uncorrelated):")
print(np.cov(X_pca.T))
print("\nWhitened (uncorrelated + unit variance):")
print(np.cov(X_whitened.T))
print("\nWhitening makes features:")
print(" 1. Uncorrelated (off-diagonal = 0)")
print(" 2. Unit variance (diagonal = 1)")
Summary
Key Concepts from Chapter 10:
- Goal: Find a low-dimensional representation preserving maximum variance
- Solution: Project onto the top eigenvectors of the covariance matrix
- Two Perspectives: Maximum variance = minimum reconstruction error
- Algorithm: Center → Covariance → Eigendecomposition → Project
- Alternative: Use SVD for efficiency (especially when D >> N)
PCA Mathematics:ΒΆ
Concept |
Formula |
|---|---|
Covariance |
S = (1/N) X^T X |
Eigenvalue problem |
Sb = Ξ»b |
Transform |
z = B^T (x - ΞΌ) |
Reconstruct |
xΜ = Bz + ΞΌ |
Variance explained |
λᡒ / Σλⱼ |
When to Use PCA:
✓ Good for:
- High-dimensional data with correlations
- Visualization (project to 2D/3D)
- Noise reduction
- Compression
- Preprocessing for ML
✗ Not good for:
- Nonlinear structure (use kernel PCA, autoencoders)
- When interpretability is crucial
- When all features are important
Connection to Other Chapters:

| Concept | Chapter |
|---|---|
| Eigenvalues/eigenvectors | Chapter 4 |
| SVD | Chapter 4 |
| Covariance matrix | Chapter 6 |
| Gaussian distribution | Chapter 6 |
| Orthogonal projections | Chapter 3 |
| Variance maximization | Chapter 7 |
Practical Tips:
1. Standardize if features have different scales (see the sketch below)
2. Use a scree plot to choose the number of components
3. Check cumulative variance (90-95% is common)
4. Whitening helps some ML algorithms
5. Consider robust PCA for outliers
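A sketch of tip 1, reusing the StandardScaler imported in the setup cell (the toy two-scale data and make_pipeline usage are illustrative assumptions):
from sklearn.pipeline import make_pipeline
# Without scaling, the large-scale feature dominates PC1 entirely
rng = np.random.default_rng(0)
X_scales = np.column_stack([rng.normal(0, 1, 200),      # unit-scale feature
                            rng.normal(0, 1000, 200)])  # large-scale feature
pca_raw = PCA(n_components=1).fit(X_scales)
pca_std = make_pipeline(StandardScaler(), PCA(n_components=1)).fit(X_scales)
print("PC1 without scaling:", pca_raw.components_[0])   # ≈ [0, ±1]: scale wins
print("PC1 with scaling:   ", pca_std.named_steps['pca'].components_[0])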
Next Steps:
- Chapter 11: GMM (clustering with Gaussians)
- Chapter 12: SVM (classification)
- Advanced: Kernel PCA, Sparse PCA, Probabilistic PCA
- Practice: Apply PCA to a dataset and interpret the principal components!