import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.linalg import svd, eigh
import pandas as pd
from sklearn.decomposition import PCA, FactorAnalysis, FastICA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_digits
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
np.random.seed(42)
1. Principal Component Analysis (PCA)
Goal
Find a low-dimensional representation of the data that preserves maximum variance.
Formulation
Given data \(\mathbf{X} \in \mathbb{R}^{N \times D}\), find a projection \(\mathbf{W} \in \mathbb{R}^{D \times K}\) (where \(K < D\)) that maximizes the variance of the projected data:
\(\max_{\mathbf{W}} \operatorname{tr}(\mathbf{W}^T \mathbf{C} \mathbf{W}) \quad \text{s.t.} \quad \mathbf{W}^T\mathbf{W} = \mathbf{I}_K\)
where \(\mathbf{C}\) is the sample covariance matrix.
Solution
\(\mathbf{W}\) contains the top \(K\) eigenvectors of the covariance matrix:
\(\mathbf{C} = \frac{1}{N-1}\mathbf{X}_c^T\mathbf{X}_c, \qquad \mathbf{C}\mathbf{w}_k = \lambda_k \mathbf{w}_k\)
SVD Approach
Decompose the centered data as \(\mathbf{X}_c = \mathbf{U}\boldsymbol{\Sigma}\mathbf{V}^T\). The principal components are the columns of \(\mathbf{V}\), and the explained variances are \(\sigma_k^2/(N-1)\) for singular values \(\sigma_k\).
# Generate correlated 2D data
np.random.seed(42)
n_samples = 200
# Generate data along a line with noise
t = np.linspace(0, 10, n_samples)
X_2d = np.zeros((n_samples, 2))
X_2d[:, 0] = t + np.random.randn(n_samples) * 0.5
X_2d[:, 1] = 2 * t + np.random.randn(n_samples) * 0.5
# Center the data
X_centered = X_2d - X_2d.mean(axis=0)
# Compute PCA manually using SVD
def pca_svd(X, n_components=2):
    """PCA via SVD of the centered data matrix."""
    # Center data
    X_centered = X - X.mean(axis=0)
    # SVD of the centered data
    U, S, Vt = svd(X_centered, full_matrices=False)
    # Principal components (columns of V)
    components = Vt.T[:, :n_components]
    # Explained variance from the singular values
    explained_variance = (S ** 2) / (len(X) - 1)
    # Project data onto the components
    X_pca = X_centered @ components
    return X_pca, components, explained_variance
X_pca, components, explained_var = pca_svd(X_2d, n_components=2)
# Visualize
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
# Original data with principal components
axes[0].scatter(X_centered[:, 0], X_centered[:, 1], alpha=0.6, s=30)
axes[0].arrow(0, 0, components[0, 0]*3, components[1, 0]*3,
              color='red', width=0.1, head_width=0.3, label='PC1')
axes[0].arrow(0, 0, components[0, 1]*2, components[1, 1]*2,
              color='blue', width=0.1, head_width=0.3, label='PC2')
axes[0].set_xlabel('$x_1$')
axes[0].set_ylabel('$x_2$')
axes[0].set_title('Original Data with Principal Components')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].axis('equal')
# Data in PC space
axes[1].scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.6, s=30)
axes[1].axhline(y=0, color='k', linestyle='-', linewidth=0.5)
axes[1].axvline(x=0, color='k', linestyle='-', linewidth=0.5)
axes[1].set_xlabel('PC1')
axes[1].set_ylabel('PC2')
axes[1].set_title('Data in Principal Component Space')
axes[1].grid(True, alpha=0.3)
axes[1].axis('equal')
# Explained variance
explained_var_ratio = explained_var / explained_var.sum()
axes[2].bar(['PC1', 'PC2'], explained_var_ratio, color=['red', 'blue'], alpha=0.7)
axes[2].set_ylabel('Explained Variance Ratio')
axes[2].set_title('Variance Explained by Each PC')
axes[2].grid(True, alpha=0.3, axis='y')
# Add values on bars
for i, v in enumerate(explained_var_ratio):
    axes[2].text(i, v + 0.02, f'{v:.1%}', ha='center', fontweight='bold')
plt.tight_layout()
plt.show()
print("PCA Results:")
print(f"Principal components (columns of W):\n{components}")
print(f"\nExplained variance: {explained_var}")
print(f"Explained variance ratio: {explained_var_ratio}")
print(f"Total variance explained by PC1: {explained_var_ratio[0]:.1%}")
# PCA on high-dimensional data (digits)
digits = load_digits()
X_digits = digits.data # 1797 samples, 64 features (8x8 images)
y_digits = digits.target
# Standardize
scaler = StandardScaler()
X_digits_scaled = scaler.fit_transform(X_digits)
# Apply PCA
pca = PCA()
X_pca_digits = pca.fit_transform(X_digits_scaled)
# Visualize
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
# Scree plot
axes[0, 0].plot(np.arange(1, 65), pca.explained_variance_ratio_, 'b-o', markersize=4)
axes[0, 0].set_xlabel('Principal Component')
axes[0, 0].set_ylabel('Explained Variance Ratio')
axes[0, 0].set_title('Scree Plot')
axes[0, 0].grid(True, alpha=0.3)
# Cumulative explained variance
cumsum = np.cumsum(pca.explained_variance_ratio_)
axes[0, 1].plot(np.arange(1, 65), cumsum, 'g-', linewidth=2)
axes[0, 1].axhline(y=0.9, color='r', linestyle='--', label='90% variance')
axes[0, 1].axhline(y=0.95, color='orange', linestyle='--', label='95% variance')
axes[0, 1].set_xlabel('Number of Components')
axes[0, 1].set_ylabel('Cumulative Explained Variance')
axes[0, 1].set_title('Cumulative Variance Explained')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)
# First two PCs
scatter = axes[1, 0].scatter(X_pca_digits[:, 0], X_pca_digits[:, 1],
                             c=y_digits, cmap='tab10', alpha=0.6, s=20)
axes[1, 0].set_xlabel('PC1')
axes[1, 0].set_ylabel('PC2')
axes[1, 0].set_title('Digits in First Two Principal Components')
axes[1, 0].grid(True, alpha=0.3)
plt.colorbar(scatter, ax=axes[1, 0], label='Digit')
# Top 10 principal components as images
axes[1, 1].axis('off')
n_components_to_show = 10
fig2, axes2 = plt.subplots(2, 5, figsize=(12, 5))
for i, ax in enumerate(axes2.flatten()):
    if i < n_components_to_show:
        component_img = pca.components_[i].reshape(8, 8)
        ax.imshow(component_img, cmap='RdBu_r')
        var_explained = pca.explained_variance_ratio_[i]
        ax.set_title(f'PC{i+1}\n({var_explained:.1%})', fontsize=10)
    ax.axis('off')
fig2.suptitle('Top 10 Principal Components (Eigendigits)', fontsize=14)
fig.tight_layout()
fig2.tight_layout()
plt.show()
# Find number of components for 90% variance
n_components_90 = np.argmax(cumsum >= 0.9) + 1
print(f"\nDigits Dataset (64 dimensions):")
print(f"Components for 90% variance: {n_components_90}")
print(f"Compression ratio: {64/n_components_90:.1f}x")
2. Probabilistic PCA
Model
Latent variable model:
\(\mathbf{z}_i \sim \mathcal{N}(\mathbf{0}, \mathbf{I})\)
\(\mathbf{x}_i \mid \mathbf{z}_i \sim \mathcal{N}(\mathbf{W}\mathbf{z}_i + \boldsymbol{\mu}, \sigma^2\mathbf{I})\)
where:
\(\mathbf{z}_i \in \mathbb{R}^K\) is the latent variable
\(\mathbf{W} \in \mathbb{R}^{D \times K}\) is the loading matrix
\(\sigma^2\) is the noise variance
Properties
Marginal: \(p(\mathbf{x}) = \mathcal{N}(\boldsymbol{\mu}, \mathbf{WW}^T + \sigma^2\mathbf{I})\)
Maximum likelihood solution: \(\mathbf{W}_{\text{ML}} = \mathbf{V}_K(\boldsymbol{\Lambda}_K - \sigma^2\mathbf{I})^{1/2}\), where \(\mathbf{V}_K\) holds the top \(K\) eigenvectors of the sample covariance, \(\boldsymbol{\Lambda}_K\) the corresponding eigenvalues, and the ML noise \(\sigma^2\) is the average of the discarded eigenvalues
As \(\sigma^2 \to 0\), PPCA recovers standard PCA
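The ML solution above has a closed form. Here is a minimal sketch (not sklearn's implementation; names are illustrative) that fits PPCA from the eigendecomposition of the sample covariance, setting the noise variance to the mean of the discarded eigenvalues:
def ppca_ml(X, K):
    """Closed-form ML fit for probabilistic PCA (Tipping & Bishop)."""
    Xc = X - X.mean(axis=0)
    C = np.cov(Xc, rowvar=False)                 # D x D sample covariance
    eigvals, eigvecs = eigh(C)                   # eigh returns ascending order
    eigvals, eigvecs = eigvals[::-1], eigvecs[:, ::-1]
    sigma2 = eigvals[K:].mean()                  # ML noise: mean of discarded eigenvalues
    W_ml = eigvecs[:, :K] @ np.diag(np.sqrt(eigvals[:K] - sigma2))
    return W_ml, sigma2

W_ml, sigma2_ml = ppca_ml(X_2d, K=1)
print(f"PPCA ML noise variance on the 2D data: {sigma2_ml:.4f}")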
# Reconstruction comparison
def reconstruct_pca(X, n_components):
    """Fit PCA, project the data, and map it back to the original space."""
    pca = PCA(n_components=n_components)
    X_transformed = pca.fit_transform(X)
    X_reconstructed = pca.inverse_transform(X_transformed)
    return X_reconstructed, pca
# Visualize reconstruction quality
components_list = [2, 5, 10, 20, 30, 40]
reconstruction_errors = []
for n_comp in components_list:
    X_recon, _ = reconstruct_pca(X_digits_scaled, n_comp)
    mse = np.mean((X_digits_scaled - X_recon) ** 2)
    reconstruction_errors.append(mse)
# Plot reconstruction error
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
axes[0, 0].plot(components_list, reconstruction_errors, 'b-o', linewidth=2, markersize=8)
axes[0, 0].set_xlabel('Number of Components')
axes[0, 0].set_ylabel('Mean Squared Error')
axes[0, 0].set_title('Reconstruction Error vs Components')
axes[0, 0].grid(True, alpha=0.3)
# Show reconstruction examples
idx = 0
original_img = X_digits[idx].reshape(8, 8)
axes[0, 1].imshow(original_img, cmap='gray')
axes[0, 1].set_title(f'Original Digit: {y_digits[idx]}')
axes[0, 1].axis('off')
# Reconstruct with different numbers of components
for i, n_comp in enumerate([5, 20]):
    X_recon, pca_model = reconstruct_pca(X_digits_scaled, n_comp)
    recon_img = scaler.inverse_transform(X_recon[idx:idx+1]).reshape(8, 8)
    axes[1, i].imshow(recon_img, cmap='gray')
    var_explained = pca_model.explained_variance_ratio_.sum()
    axes[1, i].set_title(f'{n_comp} components\n({var_explained:.1%} variance)')
    axes[1, i].axis('off')
plt.tight_layout()
plt.show()
print("Reconstruction Quality:")
for n_comp, error in zip(components_list, reconstruction_errors):
    print(f"{n_comp:2d} components: MSE = {error:.4f}")
3. Factor Analysis (FA)
Difference from PCA
Factor Analysis allows a different noise variance for each dimension:
\(\mathbf{x}_i \mid \mathbf{z}_i \sim \mathcal{N}(\mathbf{W}\mathbf{z}_i + \boldsymbol{\mu}, \boldsymbol{\Psi})\)
where \(\boldsymbol{\Psi} = \text{diag}(\psi_1, \ldots, \psi_D)\)
Marginal
\(p(\mathbf{x}) = \mathcal{N}(\boldsymbol{\mu}, \mathbf{WW}^T + \boldsymbol{\Psi})\)
Interpretation
\(\mathbf{WW}^T\): shared variance (explained by the factors)
\(\boldsymbol{\Psi}\): unique variance (noise specific to each feature)
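This covariance decomposition can be verified directly against sklearn, which exposes the fitted marginal covariance via get_covariance(). A minimal sketch on synthetic data (names are illustrative):
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(500, 6))
fa_demo = FactorAnalysis(n_components=2, random_state=0).fit(X_demo)
W_fa = fa_demo.components_                       # (K, D) loading matrix
Sigma = W_fa.T @ W_fa + np.diag(fa_demo.noise_variance_)
assert np.allclose(Sigma, fa_demo.get_covariance())
print("FA covariance = W W^T + Psi (verified)")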
# Compare PCA and FA on digits
n_components = 10
# PCA
pca_model = PCA(n_components=n_components)
X_pca = pca_model.fit_transform(X_digits_scaled)
# Factor Analysis
fa_model = FactorAnalysis(n_components=n_components, random_state=42)
X_fa = fa_model.fit_transform(X_digits_scaled)
# Visualize
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
# First two components
axes[0, 0].scatter(X_pca[:, 0], X_pca[:, 1], c=y_digits, cmap='tab10', alpha=0.6, s=20)
axes[0, 0].set_xlabel('Component 1')
axes[0, 0].set_ylabel('Component 2')
axes[0, 0].set_title('PCA (First 2 Components)')
axes[0, 0].grid(True, alpha=0.3)
axes[0, 1].scatter(X_fa[:, 0], X_fa[:, 1], c=y_digits, cmap='tab10', alpha=0.6, s=20)
axes[0, 1].set_xlabel('Factor 1')
axes[0, 1].set_ylabel('Factor 2')
axes[0, 1].set_title('Factor Analysis (First 2 Factors)')
axes[0, 1].grid(True, alpha=0.3)
# Noise variances (FA only)
noise_var = fa_model.noise_variance_
axes[0, 2].bar(range(len(noise_var)), noise_var, alpha=0.7)
axes[0, 2].set_xlabel('Feature Dimension')
axes[0, 2].set_ylabel('Noise Variance (Ψ)')
axes[0, 2].set_title('FA: Per-dimension Noise Variance')
axes[0, 2].grid(True, alpha=0.3, axis='y')
# Reconstruction
X_pca_recon = pca_model.inverse_transform(X_pca)
# sklearn's FactorAnalysis has no inverse_transform; reconstruct via the
# loading matrix: X_hat = Z @ W + mean
X_fa_recon = X_fa @ fa_model.components_ + fa_model.mean_
idx = 0
axes[1, 0].imshow(X_digits_scaled[idx].reshape(8, 8), cmap='gray')
axes[1, 0].set_title('Original (scaled)')
axes[1, 0].axis('off')
axes[1, 1].imshow(X_pca_recon[idx].reshape(8, 8), cmap='gray')
pca_error = np.mean((X_digits_scaled[idx] - X_pca_recon[idx])**2)
axes[1, 1].set_title(f'PCA Reconstruction\nMSE: {pca_error:.4f}')
axes[1, 1].axis('off')
axes[1, 2].imshow(X_fa_recon[idx].reshape(8, 8), cmap='gray')
fa_error = np.mean((X_digits_scaled[idx] - X_fa_recon[idx])**2)
axes[1, 2].set_title(f'FA Reconstruction\nMSE: {fa_error:.4f}')
axes[1, 2].axis('off')
plt.tight_layout()
plt.show()
print("\nPCA vs Factor Analysis:")
print(f"\nPCA:")
print(f" Explained variance: {pca_model.explained_variance_ratio_.sum():.1%}")
print(f" Assumes: Equal noise across all dimensions")
print(f"\nFactor Analysis:")
print(f" Mean noise variance: {noise_var.mean():.4f}")
print(f" Noise variance range: [{noise_var.min():.4f}, {noise_var.max():.4f}]")
print(f" Assumes: Different noise per dimension")
4. Independent Component Analysis (ICA)
Goal
Find statistically independent components (not just uncorrelated, as in PCA).
Model
\(\mathbf{x} = \mathbf{A}\mathbf{s}\)
where:
\(\mathbf{s}\): independent sources
\(\mathbf{A}\): mixing matrix
Recovery
Recover \(\mathbf{s}\) given \(\mathbf{x}\): \(\mathbf{s} = \mathbf{W}\mathbf{x}\)
where \(\mathbf{W} = \mathbf{A}^{-1}\)
Application
Blind source separation (the cocktail party problem)
# Generate synthetic mixed signals
np.random.seed(42)
n_samples = 2000
time = np.linspace(0, 8, n_samples)
# True independent sources
s1 = np.sin(2 * time) # Sinusoidal
s2 = np.sign(np.sin(3 * time)) # Square wave
s3 = stats.laplace.rvs(size=n_samples) # Laplacian-distributed noise
S = np.c_[s1, s2, s3]
S = StandardScaler().fit_transform(S) # Standardize
# Mix signals with a known mixing matrix
A = np.array([[1.0, 1.0, 1.0],
              [0.5, 2.0, 1.0],
              [1.5, 1.0, 2.0]])
X_mixed = S @ A.T  # Mixed observations: x = A s
# Apply ICA
ica = FastICA(n_components=3, random_state=42)
S_ica = ica.fit_transform(X_mixed)
# Apply PCA for comparison
pca_ica = PCA(n_components=3)
S_pca = pca_ica.fit_transform(X_mixed)
# Visualize
fig, axes = plt.subplots(4, 3, figsize=(15, 12))
# True sources
for i in range(3):
    axes[0, i].plot(time[:500], S[:500, i], linewidth=1)
    axes[0, i].set_title(f'True Source {i+1}')
    axes[0, i].set_ylim(-3, 3)
    axes[0, i].grid(True, alpha=0.3)
    if i == 0:
        axes[0, i].set_ylabel('Amplitude')

# Mixed signals
for i in range(3):
    axes[1, i].plot(time[:500], X_mixed[:500, i], linewidth=1, color='orange')
    axes[1, i].set_title(f'Mixed Signal {i+1}')
    axes[1, i].set_ylim(-6, 6)
    axes[1, i].grid(True, alpha=0.3)
    if i == 0:
        axes[1, i].set_ylabel('Amplitude')

# ICA recovered
for i in range(3):
    axes[2, i].plot(time[:500], S_ica[:500, i], linewidth=1, color='green')
    axes[2, i].set_title(f'ICA Component {i+1}')
    axes[2, i].set_ylim(-3, 3)
    axes[2, i].grid(True, alpha=0.3)
    if i == 0:
        axes[2, i].set_ylabel('Amplitude')

# PCA components
for i in range(3):
    axes[3, i].plot(time[:500], S_pca[:500, i], linewidth=1, color='red')
    axes[3, i].set_title(f'PCA Component {i+1}')
    axes[3, i].set_ylim(-6, 6)
    axes[3, i].grid(True, alpha=0.3)
    axes[3, i].set_xlabel('Time')
    if i == 0:
        axes[3, i].set_ylabel('Amplitude')
plt.tight_layout()
plt.show()
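Beyond eyeballing the traces, we can check that FastICA's estimated mixing matrix matches the true \(\mathbf{A}\) up to permutation and scale, the two ambiguities inherent to ICA. A minimal sketch (names are illustrative) comparing unit-normalized columns via cosine similarity:
A_est = ica.mixing_                              # estimated mixing matrix
A_est_n = A_est / np.linalg.norm(A_est, axis=0)
A_n = A / np.linalg.norm(A, axis=0)
# Each row/column should contain one entry near 1 (a scaled permutation)
print(np.round(np.abs(A_n.T @ A_est_n), 2))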
print("ICA vs PCA for Source Separation:")
print("\nICA:")
print(" - Finds independent components")
print(" - Good for: Signal separation, cocktail party problem")
print(" - Assumes: Non-Gaussian sources")
print("\nPCA:")
print(" - Finds uncorrelated components")
print(" - Good for: Dimensionality reduction, noise reduction")
print(" - Assumes: Gaussian noise, maximize variance")
# Measure independence vs decorrelation
def measure_independence(S):
    """Dependence proxy: mean absolute off-diagonal correlation.

    Zero correlation is necessary (but not sufficient) for independence.
    """
    corr = np.corrcoef(S.T)
    # Off-diagonal correlations (zero for uncorrelated signals)
    off_diag = corr[np.triu_indices_from(corr, k=1)]
    return np.abs(off_diag).mean()
print("\nMeasure of Dependence (lower is more independent):")
print(f"True sources: {measure_independence(S):.4f}")
print(f"Mixed signals: {measure_independence(X_mixed):.4f}")
print(f"ICA recovered: {measure_independence(S_ica):.4f}")
print(f"PCA components: {measure_independence(S_pca):.4f}")
# Visualize scatter plots
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
# True sources
axes[0, 0].scatter(S[:, 0], S[:, 1], alpha=0.3, s=1)
axes[0, 0].set_xlabel('Source 1')
axes[0, 0].set_ylabel('Source 2')
axes[0, 0].set_title('True Sources (Independent)')
axes[0, 0].grid(True, alpha=0.3)
axes[0, 1].scatter(X_mixed[:, 0], X_mixed[:, 1], alpha=0.3, s=1, color='orange')
axes[0, 1].set_xlabel('Mixed 1')
axes[0, 1].set_ylabel('Mixed 2')
axes[0, 1].set_title('Mixed Signals (Dependent)')
axes[0, 1].grid(True, alpha=0.3)
axes[0, 2].scatter(S_ica[:, 0], S_ica[:, 1], alpha=0.3, s=1, color='green')
axes[0, 2].set_xlabel('ICA 1')
axes[0, 2].set_ylabel('ICA 2')
axes[0, 2].set_title('ICA (Recovered Independence)')
axes[0, 2].grid(True, alpha=0.3)
# Histograms showing non-Gaussianity
axes[1, 0].hist(S[:, 1], bins=50, density=True, alpha=0.7)
axes[1, 0].set_title('True Source 2 (Square wave)\nHighly non-Gaussian')
axes[1, 0].grid(True, alpha=0.3, axis='y')
axes[1, 1].hist(X_mixed[:, 0], bins=50, density=True, alpha=0.7, color='orange')
axes[1, 1].set_title('Mixed Signal 1\nMore Gaussian (CLT)')
axes[1, 1].grid(True, alpha=0.3, axis='y')
axes[1, 2].hist(S_ica[:, 1], bins=50, density=True, alpha=0.7, color='green')
axes[1, 2].set_title('ICA Component\nRecovered non-Gaussianity')
axes[1, 2].grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
Summary
In this notebook, we covered:
PCA: Maximize variance, orthogonal components
Probabilistic PCA: Generative model with isotropic noise
Factor Analysis: Different noise per dimension
ICA: Statistical independence, non-Gaussian sources
Comparison Table

| Method | Objective | Noise Model | Use Case |
|---|---|---|---|
| PCA | Max variance | Isotropic (\(\sigma^2\mathbf{I}\)) | Dimensionality reduction, visualization |
| Probabilistic PCA | Likelihood | Isotropic | Missing data, Bayesian inference |
| Factor Analysis | Likelihood | Diagonal (\(\boldsymbol{\Psi}\)) | Latent variable discovery |
| ICA | Independence | None | Signal separation, feature extraction |
Key Insights
PCA
Pros: Fast, optimal for Gaussian data, good for compression
Cons: Assumes linear structure, sensitive to outliers
When to use: Dimensionality reduction, data visualization, noise reduction
Factor Analysis
Pros: Models per-feature noise, probabilistic interpretation
Cons: Slower than PCA, requires the EM algorithm
When to use: Latent factor discovery, psychometrics
ICA
Pros: Recovers independent sources
Cons: Requires non-Gaussian sources; cannot determine the scale or ordering of components
When to use: Blind source separation, cocktail party problem, fMRI analysis
Practical Tips
Always standardize data before PCA/FA/ICA
Use a scree plot (or a target variance fraction, as in the sketch below) to choose the number of components
PCA for compression, ICA for source separation
FA when noise varies across features
Check assumptions: Gaussianity for PCA, non-Gaussianity for ICA
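As referenced above, sklearn's PCA accepts a float n_components to retain just enough components for a target variance fraction; a minimal sketch on the digits data:
pca_90 = PCA(n_components=0.90)                  # keep components for 90% variance
X_90 = pca_90.fit_transform(X_digits_scaled)
print(f"Components retained for 90% variance: {pca_90.n_components_}")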
Exercises
Implement kernel PCA for nonlinear dimensionality reduction
Compare PCA and t-SNE for visualization
Apply ICA to image denoising
Implement sparse PCA (L1 penalty)
Use FA for collaborative filtering