import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.linalg import svd, eigh
import pandas as pd
from sklearn.decomposition import PCA, FactorAnalysis, FastICA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_digits

# Global plotting defaults: seaborn grid style and a wide default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
# Seed NumPy's global RNG so data generation below is reproducible.
np.random.seed(42)

1. Principal Component Analysis (PCA)

GoalΒΆ

Find a low-dimensional representation of data that preserves maximum variance.

FormulationΒΆ

Given data \(\mathbf{X} \in \mathbb{R}^{N \times D}\), find projection \(\mathbf{W} \in \mathbb{R}^{D \times K}\) (where \(K < D\)):

\[\mathbf{Z} = \mathbf{X}\mathbf{W}\]

SolutionΒΆ

\(\mathbf{W}\) contains the top \(K\) eigenvectors of the covariance matrix:

\[\mathbf{C} = \frac{1}{N}\mathbf{X}^T\mathbf{X}\]

SVD ApproachΒΆ

\[\mathbf{X} = \mathbf{U}\mathbf{S}\mathbf{V}^T\]

Principal components are columns of \(\mathbf{V}\)

# Build a synthetic 2-D dataset whose points lie near the line y = 2x,
# so almost all of the variance falls along a single direction.
np.random.seed(42)
n_samples = 200

# Parameter along the line, plus independent Gaussian noise per coordinate.
# (The two randn calls are evaluated left-to-right, matching the draw order.)
t = np.linspace(0, 10, n_samples)
X_2d = np.column_stack((
    t + 0.5 * np.random.randn(n_samples),      # first coordinate: ~t
    2 * t + 0.5 * np.random.randn(n_samples),  # second coordinate: ~2t
))

# Subtract the column means so the cloud is centered at the origin.
X_centered = X_2d - X_2d.mean(axis=0)

# Compute PCA manually using SVD
def pca_svd(X, n_components=2):
    """Principal component analysis via the thin SVD.

    Parameters
    ----------
    X : (N, D) array
        Data matrix; rows are samples.
    n_components : int
        Number of leading principal axes to keep for the projection.

    Returns
    -------
    X_pca : (N, n_components) array
        Data projected onto the leading principal axes.
    components : (D, n_components) array
        Principal axes stored as columns.
    explained_variance : (D,) array
        Sample variance along *every* axis (not truncated to n_components).
    """
    # Center the data; PCA directions are defined for zero-mean data.
    centered = X - X.mean(axis=0)

    # Thin SVD: centered = U @ diag(S) @ Vt.
    U, S, Vt = svd(centered, full_matrices=False)

    # Rows of Vt are the principal axes; keep the leading ones as columns.
    components = Vt[:n_components].T

    # Singular values relate to sample variance via s^2 / (N - 1).
    explained_variance = S ** 2 / (X.shape[0] - 1)

    # Project onto the retained axes.
    return centered @ components, components, explained_variance

# Run the manual SVD-based PCA on the 2-D toy data (it centers internally).
X_pca, components, explained_var = pca_svd(X_2d, n_components=2)

# Three panels: data + principal axes, data in PC coordinates, variance bars.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Original centered data with principal component directions overlaid.
# The *3 and *2 factors only scale the arrows for visibility; the
# components themselves are unit vectors.
axes[0].scatter(X_centered[:, 0], X_centered[:, 1], alpha=0.6, s=30)
axes[0].arrow(0, 0, components[0, 0]*3, components[1, 0]*3, 
             color='red', width=0.1, head_width=0.3, label='PC1')
axes[0].arrow(0, 0, components[0, 1]*2, components[1, 1]*2, 
             color='blue', width=0.1, head_width=0.3, label='PC2')
axes[0].set_xlabel('x₁')
axes[0].set_ylabel('x₂')  # fixed mojibake: was 'xβ‚‚'
axes[0].set_title('Original Data with Principal Components')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].axis('equal')

# Same data expressed in principal-component coordinates.
axes[1].scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.6, s=30)
axes[1].axhline(y=0, color='k', linestyle='-', linewidth=0.5)
axes[1].axvline(x=0, color='k', linestyle='-', linewidth=0.5)
axes[1].set_xlabel('PC1')
axes[1].set_ylabel('PC2')
axes[1].set_title('Data in Principal Component Space')
axes[1].grid(True, alpha=0.3)
axes[1].axis('equal')

# Fraction of total variance carried by each component.
explained_var_ratio = explained_var / explained_var.sum()
axes[2].bar(['PC1', 'PC2'], explained_var_ratio, color=['red', 'blue'], alpha=0.7)
axes[2].set_ylabel('Explained Variance Ratio')
axes[2].set_title('Variance Explained by Each PC')
axes[2].grid(True, alpha=0.3, axis='y')

# Annotate each bar with its percentage.
for i, v in enumerate(explained_var_ratio):
    axes[2].text(i, v + 0.02, f'{v:.1%}', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

print("PCA Results:")
print(f"Principal components (columns of W):\n{components}")
print(f"\nExplained variance: {explained_var}")
print(f"Explained variance ratio: {explained_var_ratio}")
print(f"Total variance explained by PC1: {explained_var_ratio[0]:.1%}")
# PCA on high-dimensional data (digits)
digits = load_digits()
X_digits = digits.data  # 1797 samples, 64 features (8x8 images)
y_digits = digits.target

# Standardize
# Zero-mean / unit-variance per pixel; the fitted scaler is reused later
# to map reconstructions back to the original pixel scale.
scaler = StandardScaler()
X_digits_scaled = scaler.fit_transform(X_digits)

# Apply PCA
# No n_components given, so all 64 components are kept.
pca = PCA()
X_pca_digits = pca.fit_transform(X_digits_scaled)

# Visualize
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Scree plot
# Per-component explained-variance ratio against component index 1..64.
axes[0, 0].plot(np.arange(1, 65), pca.explained_variance_ratio_, 'b-o', markersize=4)
axes[0, 0].set_xlabel('Principal Component')
axes[0, 0].set_ylabel('Explained Variance Ratio')
axes[0, 0].set_title('Scree Plot')
axes[0, 0].grid(True, alpha=0.3)

# Cumulative explained variance
# Running sum of the ratios; reference lines mark 90% and 95% thresholds.
cumsum = np.cumsum(pca.explained_variance_ratio_)
axes[0, 1].plot(np.arange(1, 65), cumsum, 'g-', linewidth=2)
axes[0, 1].axhline(y=0.9, color='r', linestyle='--', label='90% variance')
axes[0, 1].axhline(y=0.95, color='orange', linestyle='--', label='95% variance')
axes[0, 1].set_xlabel('Number of Components')
axes[0, 1].set_ylabel('Cumulative Explained Variance')
axes[0, 1].set_title('Cumulative Variance Explained')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# First two PCs
# 2-D embedding of all digits, colored by class label.
scatter = axes[1, 0].scatter(X_pca_digits[:, 0], X_pca_digits[:, 1], 
                            c=y_digits, cmap='tab10', alpha=0.6, s=20)
axes[1, 0].set_xlabel('PC1')
axes[1, 0].set_ylabel('PC2')
axes[1, 0].set_title('Digits in First Two Principal Components')
axes[1, 0].grid(True, alpha=0.3)
plt.colorbar(scatter, ax=axes[1, 0], label='Digit')

# Top 10 principal components as images
# The fourth panel of the first figure is intentionally blank; the
# "eigendigits" get their own separate figure (fig2).
axes[1, 1].axis('off')
n_components_to_show = 10
fig2, axes2 = plt.subplots(2, 5, figsize=(12, 5))
for i, ax in enumerate(axes2.flatten()):
    if i < n_components_to_show:
        # Each component is a 64-vector; reshape back to the 8x8 image grid.
        component_img = pca.components_[i].reshape(8, 8)
        ax.imshow(component_img, cmap='RdBu_r')
        var_explained = pca.explained_variance_ratio_[i]
        ax.set_title(f'PC{i+1}\n({var_explained:.1%})', fontsize=10)
    ax.axis('off')
plt.suptitle('Top 10 Principal Components (Eigendigits)', fontsize=14)
plt.tight_layout()

# Switch back to the first figure to lay it out and show it, then show fig2.
plt.figure(fig.number)
plt.tight_layout()
plt.show()
plt.figure(fig2.number)
plt.show()

# Find number of components for 90% variance
# argmax returns the first index where the cumulative sum crosses 0.9.
n_components_90 = np.argmax(cumsum >= 0.9) + 1
print(f"\nDigits Dataset (64 dimensions):")
print(f"Components for 90% variance: {n_components_90}")
print(f"Compression ratio: {64/n_components_90:.1f}x")

2. Probabilistic PCA

ModelΒΆ

Latent variable model:

\[\mathbf{z}_i \sim \mathcal{N}(\mathbf{0}, \mathbf{I})\]

\[\mathbf{x}_i \mid \mathbf{z}_i \sim \mathcal{N}(\mathbf{W}\mathbf{z}_i + \boldsymbol{\mu}, \sigma^2\mathbf{I})\]

where:

  • \(\mathbf{z}_i \in \mathbb{R}^K\) is latent variable

  • \(\mathbf{W} \in \mathbb{R}^{D \times K}\) is loading matrix

  • \(\sigma^2\) is noise variance

PropertiesΒΆ

  • Marginal: \(p(\mathbf{x}) = \mathcal{N}(\boldsymbol{\mu}, \mathbf{WW}^T + \sigma^2\mathbf{I})\)

  • Maximum likelihood solution: \(\mathbf{W}_{\text{ML}} = \mathbf{V}_K(\boldsymbol{\Lambda}_K - \sigma^2\mathbf{I})^{1/2}\)

  • As \(\sigma^2 \to 0\), recovers standard PCA

# Reconstruction comparison
def reconstruct_pca(X, n_components):
    """Fit a PCA with `n_components` and map X into and back out of the subspace.

    Returns the reconstruction (same shape as X) and the fitted PCA model.
    """
    model = PCA(n_components=n_components)
    reduced = model.fit_transform(X)
    return model.inverse_transform(reduced), model

# Visualize reconstruction quality
# Sweep over component counts and record mean squared reconstruction error
# in the *standardized* feature space.
components_list = [2, 5, 10, 20, 30, 40]
reconstruction_errors = []

for n_comp in components_list:
    X_recon, _ = reconstruct_pca(X_digits_scaled, n_comp)
    mse = np.mean((X_digits_scaled - X_recon) ** 2)
    reconstruction_errors.append(mse)

# Plot reconstruction error
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Error decreases monotonically as more components are retained.
axes[0, 0].plot(components_list, reconstruction_errors, 'b-o', linewidth=2, markersize=8)
axes[0, 0].set_xlabel('Number of Components')
axes[0, 0].set_ylabel('Mean Squared Error')
axes[0, 0].set_title('Reconstruction Error vs Components')
axes[0, 0].grid(True, alpha=0.3)

# Show reconstruction examples
# Use the first digit in the dataset as the running example.
idx = 0
original_img = X_digits[idx].reshape(8, 8)

axes[0, 1].imshow(original_img, cmap='gray')
axes[0, 1].set_title(f'Original Digit: {y_digits[idx]}')
axes[0, 1].axis('off')

# Reconstruct with different numbers of components
# scaler.inverse_transform maps from standardized back to raw pixel values
# so the reconstructions are visually comparable to the original image.
for i, n_comp in enumerate([5, 20]):
    X_recon, pca_model = reconstruct_pca(X_digits_scaled, n_comp)
    recon_img = scaler.inverse_transform(X_recon[idx:idx+1]).reshape(8, 8)

    axes[1, i].imshow(recon_img, cmap='gray')
    var_explained = pca_model.explained_variance_ratio_.sum()
    axes[1, i].set_title(f'{n_comp} components\n({var_explained:.1%} variance)')
    axes[1, i].axis('off')

plt.tight_layout()
plt.show()

print("Reconstruction Quality:")
for n_comp, error in zip(components_list, reconstruction_errors):
    print(f"{n_comp:2d} components: MSE = {error:.4f}")

3. Factor Analysis (FA)

Difference from PCAΒΆ

Factor Analysis allows different noise variances for each dimension:

\[\mathbf{x}_i | \mathbf{z}_i \sim \mathcal{N}(\mathbf{W}\mathbf{z}_i + \boldsymbol{\mu}, \boldsymbol{\Psi})\]

where \(\boldsymbol{\Psi} = \text{diag}(\psi_1, \ldots, \psi_D)\)

MarginalΒΆ

\[p(\mathbf{x}) = \mathcal{N}(\boldsymbol{\mu}, \mathbf{WW}^T + \boldsymbol{\Psi})\]

InterpretationΒΆ

  • \(\mathbf{WW}^T\): shared variance (explained by factors)

  • \(\boldsymbol{\Psi}\): unique variance (noise specific to each feature)

# Compare PCA and FA on digits
n_components = 10

# PCA: orthogonal directions of maximal variance, isotropic noise model.
pca_model = PCA(n_components=n_components)
X_pca = pca_model.fit_transform(X_digits_scaled)

# Factor Analysis: same latent-linear model but with a per-feature
# (diagonal) noise variance Psi instead of PCA's single sigma^2.
fa_model = FactorAnalysis(n_components=n_components, random_state=42)
X_fa = fa_model.fit_transform(X_digits_scaled)

# Visualize
fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# First two components of each embedding, colored by digit class.
axes[0, 0].scatter(X_pca[:, 0], X_pca[:, 1], c=y_digits, cmap='tab10', alpha=0.6, s=20)
axes[0, 0].set_xlabel('Component 1')
axes[0, 0].set_ylabel('Component 2')
axes[0, 0].set_title('PCA (First 2 Components)')
axes[0, 0].grid(True, alpha=0.3)

axes[0, 1].scatter(X_fa[:, 0], X_fa[:, 1], c=y_digits, cmap='tab10', alpha=0.6, s=20)
axes[0, 1].set_xlabel('Factor 1')
axes[0, 1].set_ylabel('Factor 2')
axes[0, 1].set_title('Factor Analysis (First 2 Factors)')
axes[0, 1].grid(True, alpha=0.3)

# Noise variances (FA only): one psi per feature dimension.
noise_var = fa_model.noise_variance_
axes[0, 2].bar(range(len(noise_var)), noise_var, alpha=0.7)
axes[0, 2].set_xlabel('Feature Dimension')
axes[0, 2].set_ylabel('Noise Variance (ψ)')
axes[0, 2].set_title('FA: Per-dimension Noise Variance')
axes[0, 2].grid(True, alpha=0.3, axis='y')

# Reconstruction
X_pca_recon = pca_model.inverse_transform(X_pca)
# BUG FIX: scikit-learn's FactorAnalysis has no inverse_transform method,
# so the original call raised AttributeError.  Reconstruct manually from
# the generative model x ≈ z @ W + mean, where the rows of components_
# are the factor loadings W and mean_ is the per-feature empirical mean.
X_fa_recon = X_fa @ fa_model.components_ + fa_model.mean_

idx = 0
axes[1, 0].imshow(X_digits_scaled[idx].reshape(8, 8), cmap='gray')
axes[1, 0].set_title('Original (scaled)')
axes[1, 0].axis('off')

axes[1, 1].imshow(X_pca_recon[idx].reshape(8, 8), cmap='gray')
pca_error = np.mean((X_digits_scaled[idx] - X_pca_recon[idx])**2)
axes[1, 1].set_title(f'PCA Reconstruction\nMSE: {pca_error:.4f}')
axes[1, 1].axis('off')

axes[1, 2].imshow(X_fa_recon[idx].reshape(8, 8), cmap='gray')
fa_error = np.mean((X_digits_scaled[idx] - X_fa_recon[idx])**2)
axes[1, 2].set_title(f'FA Reconstruction\nMSE: {fa_error:.4f}')
axes[1, 2].axis('off')

plt.tight_layout()
plt.show()

print("\nPCA vs Factor Analysis:")
print(f"\nPCA:")
print(f"  Explained variance: {pca_model.explained_variance_ratio_.sum():.1%}")
print(f"  Assumes: Equal noise across all dimensions")
print(f"\nFactor Analysis:")
print(f"  Mean noise variance: {noise_var.mean():.4f}")
print(f"  Noise variance range: [{noise_var.min():.4f}, {noise_var.max():.4f}]")
print(f"  Assumes: Different noise per dimension")

4. Independent Component Analysis (ICA)

GoalΒΆ

Find statistically independent components (not just uncorrelated like PCA).

ModelΒΆ

\[\mathbf{x} = \mathbf{As}\]

where:

  • \(\mathbf{s}\): independent sources

  • \(\mathbf{A}\): mixing matrix

Recovery

Recover \(\mathbf{s}\) given \(\mathbf{x}\):

\[\mathbf{s} = \mathbf{W}\mathbf{x}\]

where \(\mathbf{W} = \mathbf{A}^{-1}\)

ApplicationΒΆ

Blind source separation (cocktail party problem)

# Generate synthetic mixed signals
np.random.seed(42)
n_samples = 2000
# NOTE(review): this shadows any `time` module import at file scope.
time = np.linspace(0, 8, n_samples)

# True independent sources — deliberately non-Gaussian, which is what
# ICA needs to separate them.
s1 = np.sin(2 * time)  # Sinusoidal
s2 = np.sign(np.sin(3 * time))  # Square wave
s3 = stats.laplace.rvs(size=n_samples)  # Random laplacian

# Stack sources as columns and standardize each column.
S = np.c_[s1, s2, s3]
S = StandardScaler().fit_transform(S)  # Standardize

# Mix signals
# Each observed channel is a fixed linear combination of the sources.
A = np.array([[1, 1, 1],
              [0.5, 2, 1.0],
              [1.5, 1.0, 2.0]])
X_mixed = S @ A.T  # Mixed observations

# Apply ICA
# FastICA estimates the unmixing matrix from the mixtures alone.
ica = FastICA(n_components=3, random_state=42)
S_ica = ica.fit_transform(X_mixed)

# Apply PCA for comparison
# PCA only decorrelates; it will not recover the original sources.
pca_ica = PCA(n_components=3)
S_pca = pca_ica.fit_transform(X_mixed)

# Visualize
# Four rows: true sources, mixtures, ICA estimates, PCA components.
# Only the first 500 samples are plotted to keep the traces readable.
fig, axes = plt.subplots(4, 3, figsize=(15, 12))

# True sources
for i in range(3):
    axes[0, i].plot(time[:500], S[:500, i], linewidth=1)
    axes[0, i].set_title(f'True Source {i+1}')
    axes[0, i].set_ylim(-3, 3)
    axes[0, i].grid(True, alpha=0.3)
    if i == 0:
        axes[0, i].set_ylabel('Amplitude')

# Mixed signals
for i in range(3):
    axes[1, i].plot(time[:500], X_mixed[:500, i], linewidth=1, color='orange')
    axes[1, i].set_title(f'Mixed Signal {i+1}')
    axes[1, i].set_ylim(-6, 6)
    axes[1, i].grid(True, alpha=0.3)
    if i == 0:
        axes[1, i].set_ylabel('Amplitude')

# ICA recovered
# Note: ICA recovers sources only up to permutation and sign/scale.
for i in range(3):
    axes[2, i].plot(time[:500], S_ica[:500, i], linewidth=1, color='green')
    axes[2, i].set_title(f'ICA Component {i+1}')
    axes[2, i].set_ylim(-3, 3)
    axes[2, i].grid(True, alpha=0.3)
    if i == 0:
        axes[2, i].set_ylabel('Amplitude')

# PCA
for i in range(3):
    axes[3, i].plot(time[:500], S_pca[:500, i], linewidth=1, color='red')
    axes[3, i].set_title(f'PCA Component {i+1}')
    axes[3, i].set_ylim(-6, 6)
    axes[3, i].grid(True, alpha=0.3)
    axes[3, i].set_xlabel('Time')
    if i == 0:
        axes[3, i].set_ylabel('Amplitude')

plt.tight_layout()
plt.show()

print("ICA vs PCA for Source Separation:")
print("\nICA:")
print("  - Finds independent components")
print("  - Good for: Signal separation, cocktail party problem")
print("  - Assumes: Non-Gaussian sources")
print("\nPCA:")
print("  - Finds uncorrelated components")
print("  - Good for: Dimensionality reduction, noise reduction")
print("  - Assumes: Gaussian noise, maximize variance")
# Measure independence vs decorrelation
def measure_independence(S):
    """Proxy for statistical dependence: mean absolute pairwise correlation.

    Builds the correlation matrix of the columns of S and averages the
    absolute values of its strictly upper-triangular entries.  Zero for
    perfectly uncorrelated columns; larger values mean stronger linear
    dependence.  (This captures only linear dependence, not full
    statistical independence.)
    """
    corr_matrix = np.corrcoef(S, rowvar=False)
    upper = np.triu_indices_from(corr_matrix, k=1)
    return np.mean(np.abs(corr_matrix[upper]))

# Report the dependence proxy for each signal set: mixing raises it,
# ICA should drive it back toward the true sources' level.
print("\nMeasure of Dependence (lower is more independent):")
print(f"True sources: {measure_independence(S):.4f}")
print(f"Mixed signals: {measure_independence(X_mixed):.4f}")
print(f"ICA recovered: {measure_independence(S_ica):.4f}")
print(f"PCA components: {measure_independence(S_pca):.4f}")

# Visualize scatter plots
# Top row: pairwise scatter of two channels; bottom row: marginal histograms.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# True sources
axes[0, 0].scatter(S[:, 0], S[:, 1], alpha=0.3, s=1)
axes[0, 0].set_xlabel('Source 1')
axes[0, 0].set_ylabel('Source 2')
axes[0, 0].set_title('True Sources (Independent)')
axes[0, 0].grid(True, alpha=0.3)

axes[0, 1].scatter(X_mixed[:, 0], X_mixed[:, 1], alpha=0.3, s=1, color='orange')
axes[0, 1].set_xlabel('Mixed 1')
axes[0, 1].set_ylabel('Mixed 2')
axes[0, 1].set_title('Mixed Signals (Dependent)')
axes[0, 1].grid(True, alpha=0.3)

axes[0, 2].scatter(S_ica[:, 0], S_ica[:, 1], alpha=0.3, s=1, color='green')
axes[0, 2].set_xlabel('ICA 1')
axes[0, 2].set_ylabel('ICA 2')
axes[0, 2].set_title('ICA (Recovered Independence)')
axes[0, 2].grid(True, alpha=0.3)

# Histograms showing non-Gaussianity
# Mixtures of non-Gaussian sources look more Gaussian (central limit
# theorem); ICA components should look non-Gaussian again.
axes[1, 0].hist(S[:, 1], bins=50, density=True, alpha=0.7)
axes[1, 0].set_title('True Source 2 (Square wave)\nHighly non-Gaussian')
axes[1, 0].grid(True, alpha=0.3, axis='y')

axes[1, 1].hist(X_mixed[:, 0], bins=50, density=True, alpha=0.7, color='orange')
axes[1, 1].set_title('Mixed Signal 1\nMore Gaussian (CLT)')
axes[1, 1].grid(True, alpha=0.3, axis='y')

axes[1, 2].hist(S_ica[:, 1], bins=50, density=True, alpha=0.7, color='green')
axes[1, 2].set_title('ICA Component\nRecovered non-Gaussianity')
axes[1, 2].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

SummaryΒΆ

In this notebook, we covered:

  1. PCA: Maximize variance, orthogonal components

  2. Probabilistic PCA: Generative model with isotropic noise

  3. Factor Analysis: Different noise per dimension

  4. ICA: Statistical independence, non-Gaussian sources

Comparison TableΒΆ

Method

Objective

Noise Model

Use Case

PCA

Max variance

Isotropic (\(\sigma^2\mathbf{I}\))

Dimensionality reduction, visualization

Probabilistic PCA

Likelihood

Isotropic

Missing data, Bayesian inference

Factor Analysis

Likelihood

Diagonal (\(\boldsymbol{\Psi}\))

Latent variable discovery

ICA

Independence

None

Signal separation, feature extraction

Key InsightsΒΆ

PCAΒΆ

  • Pros: Fast, optimal for Gaussian data, good for compression

  • Cons: Assumes linear structure, sensitive to outliers

  • When to use: Dimensionality reduction, data visualization, noise reduction

Factor AnalysisΒΆ

  • Pros: Models per-feature noise, probabilistic interpretation

  • Cons: Slower than PCA, requires EM algorithm

  • When to use: Latent factor discovery, psychometrics

ICAΒΆ

  • Pros: Recovers independent sources

  • Cons: Requires non-Gaussian sources, cannot determine scale/order

  • When to use: Blind source separation, cocktail party problem, fMRI

Practical TipsΒΆ

  1. Always standardize data before PCA/FA/ICA

  2. Use scree plot to choose number of components

  3. PCA for compression, ICA for source separation

  4. FA when noise varies across features

  5. Check assumptions: Gaussianity for PCA, non-Gaussianity for ICA

ExercisesΒΆ

  1. Implement kernel PCA for nonlinear dimensionality reduction

  2. Compare PCA and t-SNE for visualization

  3. Apply ICA to image denoising

  4. Implement sparse PCA (L1 penalty)

  5. Use FA for collaborative filtering