# Setup
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # side-effect import: enables the '3d' projection in older matplotlib — TODO confirm it is still needed
import seaborn as sns
from scipy import linalg

# Global presentation settings: seaborn grid style for all plots,
# and compact float printing (3 decimals, no scientific notation).
sns.set_style('whitegrid')
np.set_printoptions(precision=3, suppress=True)

4.1 Determinant and Trace

Determinant

The determinant det(A) or |A| of a square matrix:

  • Measures the volume scaling of the linear transformation

  • det(A) = 0 ⟺ A is singular (not invertible)

  • det(AB) = det(A) det(B)

Trace

The trace tr(A) is the sum of diagonal elements:

\[\text{tr}(A) = \sum_{i=1}^n a_{ii}\]

Properties:

  • tr(A + B) = tr(A) + tr(B)

  • tr(AB) = tr(BA) (cyclic property)

  • tr(A) = sum of eigenvalues

# Determinant and trace

A = np.array([[4, 2],
              [1, 3]])

print("Matrix A:")
print(A)

# Determinant computed by NumPy (LAPACK under the hood)
det_A = np.linalg.det(A)
print(f"\nDeterminant: det(A) = {det_A:.3f}")

# Hand computation for the 2×2 case: det = ad − bc
a, b = A[0]
c, d = A[1]
det_manual = a * d - b * c
print(f"Manual (ad-bc): {a}×{d} - {b}×{c} = {det_manual}")

# Trace: sum of the diagonal entries
trace_A = A.diagonal().sum()
print(f"\nTrace: tr(A) = {trace_A}")
print(f"Manual (sum of diagonal): {a} + {d} = {a + d}")

# det(A) is the factor by which the map x ↦ Ax scales areas
print("\n" + "="*50)
print("\nGeometric interpretation:")
print(f"det(A) = {det_A:.3f} → area scaling factor")
print("Unit square area = 1")
print(f"Transformed area = {det_A:.3f}")
# Visualize determinant as area scaling

A = np.array([[2, 1],
              [0.5, 1.5]])

det_A = np.linalg.det(A)

# Unit square as a closed polyline (last corner repeats the first)
unit_square = np.array([[0, 1, 1, 0, 0],
                        [0, 0, 1, 1, 0]])

# Image of the square under the linear map x ↦ Ax
warped = A @ unit_square

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# (axis, polygon, fill colour, line format, panel title)
panels = [
    (ax1, unit_square, 'blue', 'b-', 'Original\nArea = 1'),
    (ax2, warped, 'red', 'r-', f'Transformed\nArea = det(A) = {det_A:.2f}'),
]
for ax, poly, fill, fmt, title in panels:
    ax.fill(poly[0], poly[1], alpha=0.3, color=fill)
    ax.plot(poly[0], poly[1], fmt, linewidth=2)
    ax.set_xlim(-0.5, 3)
    ax.set_ylim(-0.5, 3)
    ax.set_aspect('equal')
    ax.grid(True, alpha=0.3)
    ax.set_title(title, fontsize=13)
    ax.set_xlabel('x')
    ax.set_ylabel('y')

plt.tight_layout()
plt.show()

print(f"Matrix A:")
print(A)
print(f"\ndet(A) = {det_A:.3f}")
print(f"The transformation scales area by a factor of {det_A:.3f}")

4.2 Eigenvalues and Eigenvectors

Definition 4.3 (Eigenvalue/Eigenvector)

For a square matrix A, λ is an eigenvalue with eigenvector v if:

\[Av = \lambda v\]

Interpretation: v is a direction that A only stretches (by factor λ), doesn’t rotate.

Characteristic Polynomial:

\[\det(A - \lambda I) = 0\]

Properties:

  • tr(A) = Σλᵢ (sum of eigenvalues)

  • det(A) = Πλᵢ (product of eigenvalues)

# Compute eigenvalues and eigenvectors

A = np.array([[4, 2],
              [1, 3]])

print("Matrix A:")
print(A)

# np.linalg.eig returns the eigenvalues and a matrix whose COLUMNS
# are the corresponding (unit-norm) eigenvectors.
eigenvalues, eigenvectors = np.linalg.eig(A)

print("\nEigenvalues:")
for idx, eigval in enumerate(eigenvalues, start=1):
    print(f"λ_{idx} = {eigval:.3f}")

print("\nEigenvectors (as columns):")
print(eigenvectors)

# Check the defining relation Av = λv on the first eigenpair
first_val = eigenvalues[0]
first_vec = eigenvectors[:, 0]

print("\n" + "="*50)
print("\nVerification for first eigenpair:")
print(f"v₁ = {first_vec}")
print(f"λ₁ = {first_val:.3f}")

lhs = A @ first_vec          # A applied to the eigenvector
rhs = first_val * first_vec  # plain scaling by λ

print(f"\nAv₁ = {lhs}")
print(f"λ₁v₁ = {rhs}")
print(f"\nAv₁ = λ₁v₁? {np.allclose(lhs, rhs)}")

# tr(A) equals the eigenvalue sum; det(A) equals their product
print("\n" + "="*50)
print("\nProperties:")
print(f"tr(A) = {np.trace(A)}")
print(f"λ₁ + λ₂ = {np.sum(eigenvalues):.3f}")
print(f"\ndet(A) = {np.linalg.det(A):.3f}")
print(f"λ₁ × λ₂ = {np.prod(eigenvalues):.3f}")
# Visualize eigenvectors

A = np.array([[3, 1],
              [0, 2]])

eigenvalues, eigenvectors = np.linalg.eig(A)

plt.figure(figsize=(12, 6))

# Draw each eigenvector next to its image under A: both arrows
# point along the same line, because eigenvectors are only scaled.
for idx in range(2):
    vec = eigenvectors[:, idx]
    eigval = eigenvalues[idx]
    color = f'C{idx}'

    # The eigenvector itself
    plt.quiver(0, 0, vec[0], vec[1], angles='xy', scale_units='xy', scale=1,
               color=color, width=0.012, label=f'v_{idx+1}', zorder=3)

    # Its image under A (= λ·v)
    image = A @ vec
    plt.quiver(0, 0, image[0], image[1], angles='xy', scale_units='xy', scale=1,
               color=color, width=0.008, alpha=0.5, linestyle='--',
               label=f'Av_{idx+1} = {eigval:.1f}v_{idx+1}', zorder=2)

    # Faint line through the origin marking the invariant direction
    ts = np.linspace(-2, 2, 100)
    span = np.outer(ts, vec)
    plt.plot(span[:, 0], span[:, 1], f'C{idx}--', alpha=0.2, linewidth=1)

# A generic (non-eigen) vector for contrast: A rotates it as well
test_v = np.array([1, 1])
test_Av = A @ test_v
plt.quiver(0, 0, test_v[0], test_v[1], angles='xy', scale_units='xy', scale=1,
           color='gray', width=0.01, label='test vector', zorder=3)
plt.quiver(0, 0, test_Av[0], test_Av[1], angles='xy', scale_units='xy', scale=1,
           color='gray', width=0.006, alpha=0.5, linestyle='--',
           label='A(test) (rotated!)', zorder=2)

plt.xlim(-2, 5)
plt.ylim(-2, 3)
plt.axhline(0, color='black', linewidth=0.5)
plt.axvline(0, color='black', linewidth=0.5)
plt.grid(True, alpha=0.3)
plt.legend(loc='upper left', fontsize=10)
plt.xlabel('x')
plt.ylabel('y')
plt.title('Eigenvectors: Directions Preserved by A\n(only scaled, not rotated)', fontsize=13)
plt.tight_layout()
plt.show()

print("Key observation: Eigenvectors maintain their direction!")
print("Other vectors get both scaled AND rotated.")

4.3 Cholesky Decomposition

Definition 4.5 (Cholesky Decomposition)

For a symmetric positive definite matrix A:

\[A = LL^T\]

where L is lower triangular with positive diagonal entries.

Uses:

  • Efficiently solve Ax = b

  • Simulate correlated random variables

  • Numerically stable

  • Gaussian processes, multivariate normal

# Cholesky decomposition

# Build a symmetric positive definite (SPD) test matrix.
# M @ M.T is symmetric and, for a random M, positive definite —
# the same structure covariance matrices have.
np.random.seed(42)
M = np.random.randn(3, 3)
A = M @ M.T  # Guaranteed to be SPD

print("Symmetric positive definite matrix A:")
print(A)

# Symmetry check: A must equal its transpose
print(f"\nSymmetric? {np.allclose(A, A.T)}")

# Positive definiteness check: every eigenvalue strictly positive
eigs = np.linalg.eigvalsh(A)
print(f"Eigenvalues: {eigs}")
print(f"Positive definite? {np.all(eigs > 0)}")

# Factor A into L @ L.T with L lower triangular
L = np.linalg.cholesky(A)

print("\n" + "="*50)
print("\nCholesky factor L (lower triangular):")
print(L)

# Multiply the factor back together to confirm the decomposition
recon = L @ L.T
print("\nReconstruction LL^T:")
print(recon)

print(f"\nA = LL^T? {np.allclose(A, recon)}")
# Application: Solving linear systems with Cholesky

# Create SPD matrix and RHS
np.random.seed(42)
n = 100
M = np.random.randn(n, n)
A = M @ M.T + np.eye(n)  # Add identity for numerical stability
b = np.random.randn(n)

# Method 1: Direct solve (general LU factorization)
import time
start = time.perf_counter()  # perf_counter: high-resolution monotonic clock, better for ms timings than time.time()
x1 = np.linalg.solve(A, b)
time_direct = time.perf_counter() - start

# Method 2: Cholesky factor + two triangular solves.
# NOTE: np.linalg.solve would ignore the triangular structure of L
# (it always does a full LU factorization), so we use scipy's
# solve_triangular, which performs true O(n²) substitution.
start = time.perf_counter()
L = np.linalg.cholesky(A)
y = linalg.solve_triangular(L, b, lower=True)       # Forward substitution: Ly = b
x2 = linalg.solve_triangular(L.T, y, lower=False)   # Back substitution: L^T x = y
time_cholesky = time.perf_counter() - start

print(f"Matrix size: {n}×{n}")
print(f"\nDirect solve time: {time_direct*1000:.2f} ms")
print(f"Cholesky solve time: {time_cholesky*1000:.2f} ms")
print(f"Speedup: {time_direct/time_cholesky:.2f}x")

print(f"\nSolutions match? {np.allclose(x1, x2)}")
print(f"Residual (direct): {np.linalg.norm(A@x1 - b):.2e}")
print(f"Residual (Cholesky): {np.linalg.norm(A@x2 - b):.2e}")

4.4 Eigendecomposition and Diagonalization

Eigendecomposition:

If A has n linearly independent eigenvectors:

\[A = PDP^{-1}\]

where:

  • P = [v₁ … vₙ] (eigenvectors as columns)

  • D = diag(λ₁, …, λₙ) (eigenvalues on diagonal)

Spectral Theorem:

For symmetric matrices A = A^T:

\[A = Q \Lambda Q^T\]

where Q is orthogonal (Q^T Q = I) and Λ is diagonal.

Benefits: Computing A^k, matrix exponential, understanding dynamics

# Eigendecomposition

# A symmetric matrix: real eigenvalues, orthonormal eigenvectors
A = np.array([[4, 2],
              [2, 3]], dtype=float)

print("Symmetric matrix A:")
print(A)

# eigh is specialized for symmetric/Hermitian matrices; it returns
# eigenvalues in ascending order and an orthogonal matrix Q.
eigenvalues, Q = np.linalg.eigh(A)
Lambda = np.diag(eigenvalues)

print("\nEigenvalues (Λ):")
print(Lambda)

print("\nEigenvectors (Q):")
print(Q)

# Spectral theorem check: A should equal Q Λ Qᵀ
recon = Q @ Lambda @ Q.T
print("\nReconstruction A = QΛQ^T:")
print(recon)

print(f"\nCorrect? {np.allclose(A, recon)}")

# Orthogonality check: QᵀQ must be the identity
print("\n" + "="*50)
print("\nQ^T Q (should be identity):")
print(Q.T @ Q)
print(f"Q is orthogonal? {np.allclose(Q.T @ Q, np.eye(2))}")
# Application: Computing matrix powers efficiently

A = np.array([[3, 1],
              [1, 3]], dtype=float)

print("Matrix A:")
print(A)

# Diagonalize once; powers then act only on the eigenvalues
eigenvalues, Q = np.linalg.eigh(A)

k = 10  # exponent for A^k

# Method 1: repeated multiplication
A_k_direct = np.linalg.matrix_power(A, k)

# Method 2: A^k = Q Λ^k Qᵀ — raise each eigenvalue to the k-th power.
# (Q * eigenvalues**k) scales Q's columns, i.e. it equals Q @ diag(λ^k).
A_k_eigen = (Q * eigenvalues**k) @ Q.T

print(f"\nA^{k} (direct):")
print(A_k_direct)

print(f"\nA^{k} (eigendecomposition):")
print(A_k_eigen)

print(f"\nMatch? {np.allclose(A_k_direct, A_k_eigen)}")

# The largest eigenvalue dominates A^k as k grows
print("\n" + "="*50)
print("\nKey insight:")
print(f"λ₁^{k} = {eigenvalues[0]**k:.2e}")
print(f"λ₂^{k} = {eigenvalues[1]**k:.2e}")
print("\nLarger eigenvalue dominates for large k!")
print("This is the basis of PageRank, Markov chains, etc.")

4.5 Singular Value Decomposition (SVD)

Theorem 4.13 (SVD)

Every matrix A ∈ ℝ^(m×n) can be factorized as:

\[A = U \Sigma V^T\]

where:

  • U ∈ ℝ^(m×m): orthogonal (left singular vectors)

  • Σ ∈ ℝ^(m×n): diagonal (singular values σ₁ ≥ σ₂ ≥ … ≥ 0)

  • V ∈ ℝ^(n×n): orthogonal (right singular vectors)

Key Properties:

  • Works for any matrix (not just square or symmetric)

  • Singular values are the square roots of the eigenvalues of A^T A

  • Reveals rank, nullspace, range

# SVD

# Create rectangular matrix (tall: m > n)
A = np.array([[3, 1, 1],
              [-1, 3, 1],
              [1, 1, 3],
              [1, -1, 1]], dtype=float)

m, n = A.shape
print(f"Matrix A ({m}×{n}):")
print(A)

# Full SVD: U is m×m, VT is n×n, s holds the min(m, n) singular
# values sorted in descending order.
U, s, VT = np.linalg.svd(A, full_matrices=True)

print(f"\nU ({U.shape}):")
print(U)

print(f"\nSingular values:")
print(s)

print(f"\nV^T ({VT.shape}):")
print(VT)

# Reconstruct A: embed the singular values into an m×n Σ.
# Using r = min(m, n) handles both tall (m > n) and wide (n > m)
# matrices — indexing with n alone would fail whenever n > m.
r = min(m, n)
Sigma = np.zeros((m, n))
Sigma[:r, :r] = np.diag(s)

print("\nΣ (with padding):")
print(Sigma)

A_reconstructed = U @ Sigma @ VT
print("\nReconstruction A = UΣV^T:")
print(A_reconstructed)

print(f"\nCorrect? {np.allclose(A, A_reconstructed)}")

# Both U and V must be orthogonal
print("\n" + "="*50)
print("\nOrthogonality checks:")
print(f"U^T U = I? {np.allclose(U.T @ U, np.eye(m))}")
print(f"V^T V = I? {np.allclose(VT.T @ VT, np.eye(n))}")
# Visualize SVD geometrically (2D example)

A = np.array([[3, 1],
              [1, 2]])

U, s, VT = np.linalg.svd(A)

# Unit circle sampled as a 2×100 array of points
theta = np.linspace(0, 2*np.pi, 100)
circle = np.array([np.cos(theta), np.sin(theta)])

# Apply the three SVD stages one at a time
step1 = VT @ circle          # Rotate by V^T
step2 = np.diag(s) @ step1   # Scale by Σ
step3 = U @ step2            # Rotate by U
ellipse = step3              # = A @ circle

fig, axes = plt.subplots(2, 2, figsize=(12, 12))

# One panel per stage: (points, plot format/colour, title)
stages = [
    (circle, 'b-', '1. Unit Circle'),
    (step1, 'g-', '2. After V^T (rotate)'),
    (step2, 'orange', f'3. After Σ (scale by σ={s})'),
    (ellipse, 'r-', '4. After U (rotate) = A×circle'),
]
for ax, (points, fmt, title) in zip(axes.flat, stages):
    ax.plot(points[0], points[1], fmt, linewidth=2)
    ax.set_aspect('equal')
    ax.grid(True, alpha=0.3)
    ax.set_title(title, fontsize=12)
    ax.set_xlim(-4, 4)
    ax.set_ylim(-4, 4)

plt.tight_layout()
plt.show()

print("SVD decomposes A into: Rotate → Scale → Rotate")
print(f"Singular values: {s}")
print("These are the semi-axes of the ellipse!")

4.6 Matrix Approximation

Eckart-Young Theorem:

The best rank-k approximation to A (in Frobenius norm) is:

\[A_k = \sum_{i=1}^k \sigma_i u_i v_i^T\]

where σᵢ are singular values (sorted), uᵢ and vᵢ are left/right singular vectors.

Applications:

  • PCA (Chapter 10)

  • Image compression

  • Noise reduction

  • Recommender systems

# Low-rank approximation

# Build a noisy matrix whose signal component has rank 3
np.random.seed(42)
m, n = 50, 50
r_true = 3  # True rank

# Signal: product of two thin random factors → exactly rank r_true
U_true = np.random.randn(m, r_true)
V_true = np.random.randn(n, r_true)
A_lowrank = U_true @ V_true.T

# Perturb with small Gaussian noise (makes A full rank numerically)
noise = 0.1 * np.random.randn(m, n)
A = A_lowrank + noise

print(f"Matrix A: {A.shape}")
print(f"True rank: {r_true}")

# Thin SVD is sufficient for truncation
U, s, VT = np.linalg.svd(A, full_matrices=False)

print(f"\nTop 10 singular values:")
print(s[:10])

# Truncate at several ranks and report the Frobenius-norm error
print("\n" + "="*50)
print("\nApproximation errors:")

norm_A = np.linalg.norm(A, 'fro')
for k in [1, 2, 3, 5, 10]:
    # Keep only the k leading singular triplets (Eckart–Young optimum)
    approx = U[:, :k] @ np.diag(s[:k]) @ VT[:k, :]

    abs_err = np.linalg.norm(A - approx, 'fro')
    rel_err = abs_err / norm_A

    print(f"Rank {k:2d}: error = {abs_err:6.3f}, relative = {rel_err:.4f}")
# Image compression with SVD

# Build a synthetic grayscale "image" from smooth trig patterns
xs = np.linspace(-5, 5, 200)
ys = np.linspace(-5, 5, 200)
X, Y = np.meshgrid(xs, ys)

# Separable waves plus a radial ripple
img = np.sin(X) * np.cos(Y) + np.sin(X**2 + Y**2) / 2

print(f"Image size: {img.shape}")
print(f"Original storage: {img.size} values")

# Thin SVD of the image matrix
U, s, VT = np.linalg.svd(img, full_matrices=False)

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Slot 0 shows the uncompressed image
axes[0, 0].imshow(img, cmap='gray')
axes[0, 0].set_title(f'Original\n({img.size} values)', fontsize=11)
axes[0, 0].axis('off')

# Remaining five slots show rank-k reconstructions
ranks = [5, 10, 20, 50, 100]
for slot, k in enumerate(ranks, start=1):
    # Truncated reconstruction from the k leading singular triplets
    img_k = U[:, :k] @ np.diag(s[:k]) @ VT[:k, :]

    # Storing U[:, :k], s[:k] and VT[:k, :] costs k(m + n + 1) values
    storage = k * (img.shape[0] + img.shape[1] + 1)
    compression_ratio = img.size / storage

    # Relative Frobenius error of the reconstruction
    error = np.linalg.norm(img - img_k, 'fro') / np.linalg.norm(img, 'fro')

    ax = axes.flat[slot]
    ax.imshow(img_k, cmap='gray')
    ax.set_title(f'Rank {k}\n{compression_ratio:.1f}× compression\nError: {error:.3f}',
                 fontsize=11)
    ax.axis('off')

plt.tight_layout()
plt.show()

# The spectrum decays fast: most energy sits in the first few values
plt.figure(figsize=(10, 6))
plt.semilogy(s, 'b-', linewidth=2)
plt.xlabel('Index', fontsize=12)
plt.ylabel('Singular Value (log scale)', fontsize=12)
plt.title('Singular Value Spectrum', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nKey insight: A few singular values capture most of the information!")

Summary

Key Concepts from Chapter 4:

  1. Determinant: Volume scaling, invertibility test

  2. Trace: Sum of eigenvalues, useful invariant

  3. Eigenvalues/Eigenvectors: Principal directions, dynamics

  4. Cholesky: Efficient solving for SPD matrices

  5. Eigendecomposition: Diagonalization, matrix powers

  6. SVD: Universal decomposition, reveals structure

  7. Low-Rank Approximation: Compression, denoising

ML Applications:

Decomposition

ML Application

Eigendecomposition

PCA, spectral clustering

SVD

Recommender systems, LSA, PCA

Cholesky

Gaussian processes, optimization

Low-rank

Matrix completion, compression

Connections:

  • Chapter 10 (PCA): Uses SVD for dimensionality reduction

  • Chapter 9 (Linear Regression): Uses Cholesky for normal equations

  • Chapter 11 (GMM): Uses eigendecomposition for covariance

Next Steps:

  • Chapter 5: Vector Calculus (gradients, optimization)

  • Chapter 6: Probability (statistical foundations)

  • Chapter 10: PCA (direct application of SVD)

Practice: Implement PCA using SVD from scratch!