# Setup
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection with matplotlib
import seaborn as sns
from scipy import linalg
sns.set_style('whitegrid')  # light grid background for every figure below
np.set_printoptions(precision=3, suppress=True)  # 3 decimals, no scientific notation
4.1 Determinant and Trace¶
Determinant¶
The determinant det(A) or |A| of a square matrix:
Measures the volume scaling of the linear transformation
det(A) = 0 ⟺ A is singular (not invertible)
det(AB) = det(A) det(B)
Trace¶
The trace tr(A) is the sum of diagonal elements: tr(A) = Σᵢ aᵢᵢ
Properties:
tr(A + B) = tr(A) + tr(B)
tr(AB) = tr(BA) (cyclic property)
tr(A) = sum of eigenvalues
# Determinant and trace of a small 2×2 example
A = np.array([[4, 2],
              [1, 3]])
print("Matrix A:")
print(A)

# Determinant via LAPACK
det_A = np.linalg.det(A)
print(f"\nDeterminant: det(A) = {det_A:.3f}")

# Same value from the closed-form 2×2 rule ad − bc
det_manual = A[0, 0] * A[1, 1] - A[0, 1] * A[1, 0]
print(f"Manual (ad-bc): {A[0,0]}×{A[1,1]} - {A[0,1]}×{A[1,0]} = {det_manual}")

# Trace: just the sum of the diagonal entries
trace_A = np.trace(A)
print(f"\nTrace: tr(A) = {trace_A}")
print(f"Manual (sum of diagonal): {A[0,0]} + {A[1,1]} = {A[0,0] + A[1,1]}")

# Geometric reading: |det(A)| is the factor by which areas are scaled
print("\n" + "=" * 50)
print("\nGeometric interpretation:")
print(f"det(A) = {det_A:.3f} → area scaling factor")
print("Unit square area = 1")
print(f"Transformed area = {det_A:.3f}")
# Visualize the determinant as an area-scaling factor
A = np.array([[2, 1],
              [0.5, 1.5]])
det_A = np.linalg.det(A)

# Closed path around the unit square: row 0 holds x coords, row 1 holds y
unit_square = np.array([[0, 1, 1, 0, 0],
                        [0, 0, 1, 1, 0]])
image_square = A @ unit_square  # each corner mapped by A

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (points, fill color, line style, panel title) for the two panels
panels = [(unit_square, 'blue', 'b-', 'Original\nArea = 1'),
          (image_square, 'red', 'r-', f'Transformed\nArea = det(A) = {det_A:.2f}')]
for ax, (pts, fill_color, line_style, title) in zip(axes, panels):
    ax.fill(pts[0], pts[1], alpha=0.3, color=fill_color)
    ax.plot(pts[0], pts[1], line_style, linewidth=2)
    ax.set_xlim(-0.5, 3)
    ax.set_ylim(-0.5, 3)
    ax.set_aspect('equal')
    ax.grid(True, alpha=0.3)
    ax.set_title(title, fontsize=13)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
plt.tight_layout()
plt.show()

print(f"Matrix A:")
print(A)
print(f"\ndet(A) = {det_A:.3f}")
print(f"The transformation scales area by a factor of {det_A:.3f}")
4.2 Eigenvalues and Eigenvectors¶
Definition 4.3 (Eigenvalue/Eigenvector)¶
For a square matrix A, λ is an eigenvalue with (nonzero) eigenvector v if: Av = λv
Interpretation: v is a direction that A only stretches (by factor λ), doesn’t rotate.
Characteristic Polynomial: p(λ) = det(A − λI); the eigenvalues are its roots.¶
Properties:¶
tr(A) = Σλᵢ (sum of eigenvalues)
det(A) = Πλᵢ (product of eigenvalues)
# Compute eigenvalues and eigenvectors
A = np.array([[4, 2],
              [1, 3]])
print("Matrix A:")
print(A)

# np.linalg.eig returns (eigenvalues, matrix whose COLUMNS are eigenvectors)
eigenvalues, eigenvectors = np.linalg.eig(A)

print("\nEigenvalues:")
for i, lam in enumerate(eigenvalues):
    print(f"λ_{i+1} = {lam:.3f}")
print("\nEigenvectors (as columns):")
print(eigenvectors)

# Check the defining property Av = λv on the first eigenpair
v1, lam1 = eigenvectors[:, 0], eigenvalues[0]
Av1 = A @ v1
lam_v1 = lam1 * v1

print("\n" + "=" * 50)
print("\nVerification for first eigenpair:")
print(f"v₁ = {v1}")
print(f"λ₁ = {lam1:.3f}")
print(f"\nAv₁ = {Av1}")
print(f"λ₁v₁ = {lam_v1}")
print(f"\nAv₁ = λ₁v₁? {np.allclose(Av1, lam_v1)}")

# Two classical identities: tr(A) = Σλᵢ and det(A) = Πλᵢ
print("\n" + "=" * 50)
print("\nProperties:")
print(f"tr(A) = {np.trace(A)}")
print(f"λ₁ + λ₂ = {np.sum(eigenvalues):.3f}")
print(f"\ndet(A) = {np.linalg.det(A):.3f}")
print(f"λ₁ × λ₂ = {np.prod(eigenvalues):.3f}")
# Visualize eigenvectors: A maps them onto the same line through the origin
A = np.array([[3, 1],
              [0, 2]])
eigenvalues, eigenvectors = np.linalg.eig(A)

plt.figure(figsize=(12, 6))

# One color per eigenpair: solid arrow = v, faded dashed arrow = Av = λv
for i in range(2):
    v, lam = eigenvectors[:, i], eigenvalues[i]
    Av = A @ v
    # The eigenvector itself
    plt.quiver(0, 0, v[0], v[1], angles='xy', scale_units='xy', scale=1,
               color=f'C{i}', width=0.012, label=f'v_{i+1}', zorder=3)
    # Its image under A — same direction, stretched by λ
    plt.quiver(0, 0, Av[0], Av[1], angles='xy', scale_units='xy', scale=1,
               color=f'C{i}', width=0.008, alpha=0.5, linestyle='--',
               label=f'Av_{i+1} = {lam:.1f}v_{i+1}', zorder=2)
    # Faint line marking the invariant direction
    t = np.linspace(-2, 2, 100)
    line = np.outer(t, v)
    plt.plot(line[:, 0], line[:, 1], f'C{i}--', alpha=0.2, linewidth=1)

# A non-eigenvector for contrast: A both stretches AND turns it
test_v = np.array([1, 1])
test_Av = A @ test_v
plt.quiver(0, 0, test_v[0], test_v[1], angles='xy', scale_units='xy', scale=1,
           color='gray', width=0.01, label='test vector', zorder=3)
plt.quiver(0, 0, test_Av[0], test_Av[1], angles='xy', scale_units='xy', scale=1,
           color='gray', width=0.006, alpha=0.5, linestyle='--',
           label='A(test) (rotated!)', zorder=2)

plt.xlim(-2, 5)
plt.ylim(-2, 3)
plt.grid(True, alpha=0.3)
plt.legend(loc='upper left', fontsize=10)
plt.title('Eigenvectors: Directions Preserved by A\n(only scaled, not rotated)', fontsize=13)
plt.xlabel('x')
plt.ylabel('y')
plt.axhline(0, color='black', linewidth=0.5)
plt.axvline(0, color='black', linewidth=0.5)
plt.tight_layout()
plt.show()

print("Key observation: Eigenvectors maintain their direction!")
print("Other vectors get both scaled AND rotated.")
4.3 Cholesky Decomposition¶
Definition 4.5 (Cholesky Decomposition)¶
For a symmetric positive definite matrix A: A = LLᵀ
where L is lower triangular with positive diagonal entries.
Uses:
Efficiently solve Ax = b
Simulate correlated random variables
Numerically stable
Gaussian processes, multivariate normal
# Cholesky decomposition
# M @ M.T is always symmetric and, for full-rank M, positive definite —
# exactly the structure of a covariance matrix
np.random.seed(42)
M = np.random.randn(3, 3)
A = M @ M.T  # Guaranteed to be SPD

print("Symmetric positive definite matrix A:")
print(A)

# Symmetry check: A must equal its own transpose
print(f"\nSymmetric? {np.allclose(A, A.T)}")

# Positive-definiteness check: every eigenvalue strictly positive
eigs = np.linalg.eigvalsh(A)
print(f"Eigenvalues: {eigs}")
print(f"Positive definite? {np.all(eigs > 0)}")

# Factor A = L L^T with L lower triangular, positive diagonal
L = np.linalg.cholesky(A)
print("\n" + "=" * 50)
print("\nCholesky factor L (lower triangular):")
print(L)

# Multiplying the factor by its transpose recovers A exactly
A_reconstructed = L @ L.T
print("\nReconstruction LL^T:")
print(A_reconstructed)
print(f"\nA = LL^T? {np.allclose(A, A_reconstructed)}")
# Application: Solving linear systems with Cholesky
# Create SPD matrix and RHS
np.random.seed(42)
n = 100
M = np.random.randn(n, n)
A = M @ M.T + np.eye(n)  # Add identity so A is comfortably positive definite
b = np.random.randn(n)

import time

# Method 1: Direct solve (general-purpose LU factorization)
start = time.perf_counter()  # perf_counter: monotonic, highest-resolution timer
x1 = np.linalg.solve(A, b)
time_direct = time.perf_counter() - start

# Method 2: Cholesky factor once, then two triangular solves.
# FIX: np.linalg.solve performs a full LU even on a triangular matrix,
# so the original version never exploited the factorization. scipy's
# solve_triangular does plain forward/back substitution (O(n²) per solve).
start = time.perf_counter()
L = np.linalg.cholesky(A)
y = linalg.solve_triangular(L, b, lower=True)      # Forward substitution: Ly = b
x2 = linalg.solve_triangular(L.T, y, lower=False)  # Back substitution: L^T x = y
time_cholesky = time.perf_counter() - start

print(f"Matrix size: {n}×{n}")
print(f"\nDirect solve time: {time_direct*1000:.2f} ms")
print(f"Cholesky solve time: {time_cholesky*1000:.2f} ms")
if time_cholesky > 0:  # guard: a coarse timer could report 0 on a tiny problem
    print(f"Speedup: {time_direct/time_cholesky:.2f}x")
print(f"\nSolutions match? {np.allclose(x1, x2)}")
print(f"Residual (direct): {np.linalg.norm(A@x1 - b):.2e}")
print(f"Residual (Cholesky): {np.linalg.norm(A@x2 - b):.2e}")
4.4 Eigendecomposition and Diagonalization¶
Eigendecomposition:¶
If A has n linearly independent eigenvectors: A = PDP⁻¹
where:
P = [v₁ … vₙ] (eigenvectors as columns)
D = diag(λ₁, …, λₙ) (eigenvalues on diagonal)
Spectral Theorem:¶
For symmetric matrices A = Aᵀ: A = QΛQᵀ
where Q is orthogonal (Q^T Q = I) and Λ is diagonal.
Benefits: Computing A^k, matrix exponential, understanding dynamics
# Eigendecomposition
# A symmetric matrix always has real eigenvalues and an orthonormal
# eigenbasis (spectral theorem), so we use the symmetric-specific routine
A = np.array([[4, 2],
              [2, 3]], dtype=float)
print("Symmetric matrix A:")
print(A)

# eigh is specialized for symmetric/Hermitian input: guaranteed real output
eigenvalues, Q = np.linalg.eigh(A)
Lambda = np.diag(eigenvalues)

print("\nEigenvalues (Λ):")
print(Lambda)
print("\nEigenvectors (Q):")
print(Q)

# Rebuild A from its spectral factors: A = Q Λ Q^T
A_reconstructed = Q @ Lambda @ Q.T
print("\nReconstruction A = QΛQ^T:")
print(A_reconstructed)
print(f"\nCorrect? {np.allclose(A, A_reconstructed)}")

# Q orthogonal means its transpose is its inverse
print("\n" + "=" * 50)
print("\nQ^T Q (should be identity):")
print(Q.T @ Q)
print(f"Q is orthogonal? {np.allclose(Q.T @ Q, np.eye(2))}")
# Application: Computing matrix powers efficiently
A = np.array([[3, 1],
              [1, 3]], dtype=float)
print("Matrix A:")
print(A)

# Spectral factorization A = Q Λ Q^T
eigenvalues, Q = np.linalg.eigh(A)
Lambda = np.diag(eigenvalues)

k = 10

# Method 1: numpy's built-in repeated-squaring matrix power
A_k_direct = np.linalg.matrix_power(A, k)

# Method 2: A^k = Q Λ^k Q^T — only the eigenvalues get raised to k,
# the orthogonal factors are untouched
Lambda_k = np.diag(eigenvalues**k)
A_k_eigen = Q @ Lambda_k @ Q.T

print(f"\nA^{k} (direct):")
print(A_k_direct)
print(f"\nA^{k} (eigendecomposition):")
print(A_k_eigen)
print(f"\nMatch? {np.allclose(A_k_direct, A_k_eigen)}")

# The largest eigenvalue increasingly dominates A^k as k grows
print("\n" + "=" * 50)
print("\nKey insight:")
print(f"λ₁^{k} = {eigenvalues[0]**k:.2e}")
print(f"λ₂^{k} = {eigenvalues[1]**k:.2e}")
print("\nLarger eigenvalue dominates for large k!")
print("This is the basis of PageRank, Markov chains, etc.")
4.5 Singular Value Decomposition (SVD)¶
Theorem 4.13 (SVD)¶
Every matrix A ∈ ℝ^(m×n) can be factorized as: A = UΣVᵀ
where:
U ∈ ℝ^(m×m): orthogonal (left singular vectors)
Σ ∈ ℝ^(m×n): diagonal (singular values σ₁ ≥ σ₂ ≥ … ≥ 0)
V ∈ ℝ^(n×n): orthogonal (right singular vectors)
Key Properties:
Works for any matrix (not just square or symmetric)
Singular values = √eigenvalues of A^T A
Reveals rank, nullspace, range
# SVD
# Create rectangular matrix (tall: more rows than columns)
A = np.array([[3, 1, 1],
              [-1, 3, 1],
              [1, 1, 3],
              [1, -1, 1]], dtype=float)
m, n = A.shape
print(f"Matrix A ({m}×{n}):")
print(A)

# Full SVD: U is m×m, VT is n×n, s holds the min(m, n) singular values
U, s, VT = np.linalg.svd(A, full_matrices=True)
print(f"\nU ({U.shape}):")
print(U)
print(f"\nSingular values:")
print(s)
print(f"\nV^T ({VT.shape}):")
print(VT)

# Embed the singular values into an m×n zero matrix.
# FIX: pad with r = min(m, n) instead of n — the original Sigma[:n, :n]
# only works for tall matrices (m >= n) and raises for wide ones (m < n),
# where s has length m, not n.
Sigma = np.zeros((m, n))
r = min(m, n)
Sigma[:r, :r] = np.diag(s)
print("\nΣ (with padding):")
print(Sigma)

# The three factors multiply back to A
A_reconstructed = U @ Sigma @ VT
print("\nReconstruction A = UΣV^T:")
print(A_reconstructed)
print(f"\nCorrect? {np.allclose(A, A_reconstructed)}")

# Both singular-vector matrices are orthogonal
print("\n" + "="*50)
print("\nOrthogonality checks:")
print(f"U^T U = I? {np.allclose(U.T @ U, np.eye(m))}")
print(f"V^T V = I? {np.allclose(VT.T @ VT, np.eye(n))}")
# Visualize SVD geometrically (2D example):
# A = UΣV^T acts on the unit circle as rotate → scale → rotate
A = np.array([[3, 1],
              [1, 2]])
U, s, VT = np.linalg.svd(A)

# Unit circle sampled as a 2×100 array of column points
theta = np.linspace(0, 2*np.pi, 100)
circle = np.array([np.cos(theta), np.sin(theta)])

# Apply the three factors one at a time
step1 = VT @ circle          # first rotation/reflection
step2 = np.diag(s) @ step1   # axis-aligned scaling by the singular values
step3 = U @ step2            # final rotation/reflection
ellipse = step3              # identical to A @ circle

fig, axes = plt.subplots(2, 2, figsize=(12, 12))

# (axis, points, plot style, panel title), one entry per stage
stages = [
    (axes[0, 0], circle, 'b-', '1. Unit Circle'),
    (axes[0, 1], step1, 'g-', '2. After V^T (rotate)'),
    (axes[1, 0], step2, 'orange', f'3. After Σ (scale by σ={s})'),
    (axes[1, 1], ellipse, 'r-', '4. After U (rotate) = A×circle'),
]
for ax, pts, style, title in stages:
    ax.plot(pts[0], pts[1], style, linewidth=2)
    ax.set_aspect('equal')
    ax.grid(True, alpha=0.3)
    ax.set_title(title, fontsize=12)
    ax.set_xlim(-4, 4)
    ax.set_ylim(-4, 4)
plt.tight_layout()
plt.show()

print("SVD decomposes A into: Rotate → Scale → Rotate")
print(f"Singular values: {s}")
print("These are the semi-axes of the ellipse!")
4.6 Matrix Approximation¶
Eckart-Young Theorem:¶
The best rank-k approximation to A (in Frobenius norm) is: Âₖ = Σᵢ₌₁ᵏ σᵢ uᵢ vᵢᵀ
where σᵢ are singular values (sorted), uᵢ and vᵢ are left/right singular vectors.
Applications:
PCA (Chapter 10)
Image compression
Noise reduction
Recommender systems
# Low-rank approximation
# Build a matrix that is rank-3 signal plus small dense noise
np.random.seed(42)
m, n = 50, 50
r_true = 3  # True rank

# Signal: product of two thin random factors → rank exactly r_true
U_true = np.random.randn(m, r_true)
V_true = np.random.randn(n, r_true)
A_lowrank = U_true @ V_true.T

# Perturbation: full-rank but small in magnitude
noise = 0.1 * np.random.randn(m, n)
A = A_lowrank + noise

print(f"Matrix A: {A.shape}")
print(f"True rank: {r_true}")

# Thin SVD — singular values come back sorted in descending order
U, s, VT = np.linalg.svd(A, full_matrices=False)
print(f"\nTop 10 singular values:")
print(s[:10])

# Truncate the SVD at several ranks and measure the Frobenius-norm error
print("\n" + "=" * 50)
print("\nApproximation errors:")
for k in [1, 2, 3, 5, 10]:
    A_k = U[:, :k] @ np.diag(s[:k]) @ VT[:k, :]
    error = np.linalg.norm(A - A_k, 'fro')
    relative_error = error / np.linalg.norm(A, 'fro')
    print(f"Rank {k:2d}: error = {error:6.3f}, relative = {relative_error:.4f}")
# Image compression with SVD
# Synthesize a 200×200 grayscale test pattern from smooth trig functions
x = np.linspace(-5, 5, 200)
y = np.linspace(-5, 5, 200)
X, Y = np.meshgrid(x, y)
img = np.sin(X) * np.cos(Y) + np.sin(X**2 + Y**2) / 2

print(f"Image size: {img.shape}")
print(f"Original storage: {img.size} values")

# Thin SVD of the image matrix
U, s, VT = np.linalg.svd(img, full_matrices=False)

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Top-left panel: the uncompressed image for reference
axes[0,0].imshow(img, cmap='gray')
axes[0,0].set_title(f'Original\n({img.size} values)', fontsize=11)
axes[0,0].axis('off')

# Remaining panels: rank-k reconstructions of increasing fidelity
ranks = [5, 10, 20, 50, 100]
for idx, k in enumerate(ranks, start=1):
    img_k = U[:, :k] @ np.diag(s[:k]) @ VT[:k, :]
    # Storing the truncated factors costs k·(rows + cols + 1) numbers
    storage = k * (img.shape[0] + img.shape[1] + 1)
    compression_ratio = img.size / storage
    # Relative Frobenius-norm reconstruction error
    error = np.linalg.norm(img - img_k, 'fro') / np.linalg.norm(img, 'fro')
    ax = axes[idx // 3, idx % 3]
    ax.imshow(img_k, cmap='gray')
    ax.set_title(f'Rank {k}\n{compression_ratio:.1f}× compression\nError: {error:.3f}',
                 fontsize=11)
    ax.axis('off')

plt.tight_layout()
plt.show()
# Plot the singular-value spectrum on a log scale —
# a sharp drop marks the effective rank of the image
fig = plt.figure(figsize=(10, 6))
plt.semilogy(s, 'b-', linewidth=2)
plt.title('Singular Value Spectrum', fontsize=14)
plt.xlabel('Index', fontsize=12)
plt.ylabel('Singular Value (log scale)', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nKey insight: A few singular values capture most of the information!")
Summary¶
Key Concepts from Chapter 4:¶
Determinant: Volume scaling, invertibility test
Trace: Sum of eigenvalues, useful invariant
Eigenvalues/Eigenvectors: Principal directions, dynamics
Cholesky: Efficient solving for SPD matrices
Eigendecomposition: Diagonalization, matrix powers
SVD: Universal decomposition, reveals structure
Low-Rank Approximation: Compression, denoising
ML Applications:¶
Decomposition |
ML Application |
|---|---|
Eigendecomposition |
PCA, spectral clustering |
SVD |
Recommender systems, LSA, PCA |
Cholesky |
Gaussian processes, optimization |
Low-rank |
Matrix completion, compression |
Connections:¶
Chapter 10 (PCA): Uses SVD for dimensionality reduction
Chapter 9 (Linear Regression): Uses Cholesky for normal equations
Chapter 11 (GMM): Uses eigendecomposition for covariance
Next Steps:¶
Chapter 5: Vector Calculus (gradients, optimization)
Chapter 6: Probability (statistical foundations)
Chapter 10: PCA (direct application of SVD)
Practice: Implement PCA using SVD from scratch!