import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling and a fixed RNG seed so the examples are reproducible.
sns.set_style('whitegrid')
np.random.seed(42)
1. Problem: Solve \(Ax = b\)
Where \(A \in \mathbb{R}^{n \times n}\) is symmetric positive definite.
Equivalent: Minimize
\[f(x) = \frac{1}{2}x^TAx - b^Tx\]
Gradient Descent Issues
Zigzagging for ill-conditioned \(A\)
Slow convergence: \(O(\kappa \log(1/\epsilon))\) iterations
CG Solution
Converges in \(\leq n\) iterations (in exact arithmetic)!
2. Conjugate Directions
A-Orthogonality
Directions \(\{p_0, \ldots, p_{n-1}\}\) are A-conjugate if:
\[p_i^T A p_j = 0, \quad \forall i \neq j\]
Line Search
\[x_{k+1} = x_k + \alpha_k p_k\]
\[\alpha_k = \frac{r_k^T p_k}{p_k^T A p_k}\]
where \(r_k = b - Ax_k\) is the residual.
def conjugate_gradient(A, b, x0=None, tol=1e-6, max_iter=None):
    """Solve Ax = b for symmetric positive definite A via Conjugate Gradient.

    Parameters
    ----------
    A : (n, n) ndarray
        Symmetric positive definite matrix.
    b : (n,) ndarray
        Right-hand side.
    x0 : (n,) ndarray, optional
        Initial guess; defaults to the zero vector.
    tol : float
        Stop once the residual 2-norm drops below this value.
    max_iter : int, optional
        Iteration cap; defaults to n (CG converges in at most n
        iterations in exact arithmetic).

    Returns
    -------
    x : (n,) ndarray
        Approximate solution.
    history : list of ndarray
        Iterates, including the starting point.
    residuals : list of float
        Residual norms, including the initial one.
    """
    n = len(b)
    x = np.zeros(n) if x0 is None else x0.copy()
    if max_iter is None:
        max_iter = n
    r = b - A @ x
    p = r.copy()
    rs_old = r @ r
    history = [x.copy()]
    residuals = [np.sqrt(rs_old)]
    # Guard: if the starting point already satisfies the tolerance
    # (e.g. x0 is exact, or b is the zero vector), the loop below would
    # compute alpha = 0/0 = NaN. Return immediately instead.
    if residuals[0] < tol:
        return x, history, residuals
    for _ in range(max_iter):
        Ap = A @ p
        alpha = rs_old / (p @ Ap)  # exact line search along p
        x = x + alpha * p
        r = r - alpha * Ap         # residual recurrence: avoids recomputing b - A@x
        rs_new = r @ r
        residuals.append(np.sqrt(rs_new))
        history.append(x.copy())
        if np.sqrt(rs_new) < tol:
            break
        beta = rs_new / rs_old     # makes the new direction A-conjugate to p
        p = r + beta * p
        rs_old = rs_new
    return x, history, residuals
# Test on a small 2D problem so the iterate trajectories can be plotted.
A = np.array([[3, 0.5], [0.5, 1]])  # symmetric positive definite 2x2 matrix
b = np.array([1, 2])
# True solution via direct solve, used as a reference point in the plots
x_true = np.linalg.solve(A, b)
# Run CG from the origin; keep the iterate history and residual norms
x_cg, hist_cg, res_cg = conjugate_gradient(A, b, x0=np.zeros(2))
# Gradient descent for comparison
def gradient_descent(A, b, x0, lr=0.1, n_iter=20):
    """Fixed-step gradient descent on f(x) = 0.5 x^T A x - b^T x.

    Runs exactly n_iter steps with constant step size lr and returns
    the final iterate together with the full trajectory (x0 included).
    """
    iterate = x0.copy()
    trajectory = [iterate.copy()]
    for _ in range(n_iter):
        # Gradient of the quadratic objective: grad f(x) = Ax - b
        iterate = iterate - lr * (A @ iterate - b)
        trajectory.append(iterate.copy())
    return iterate, trajectory
# Run gradient descent from the origin for comparison against CG
x_gd, hist_gd = gradient_descent(A, b, np.zeros(2), lr=0.4, n_iter=20)
# Visualize: evaluate f(x) = 0.5 x^T A x - b^T x on a grid for contour plots.
x1 = np.linspace(-1, 3, 100)
x2 = np.linspace(-1, 3, 100)
X1, X2 = np.meshgrid(x1, x2)
# Quadratic expanded elementwise; the cross term appears twice, hence 2*A[0,1]
Z = 0.5 * (A[0,0]*X1**2 + 2*A[0,1]*X1*X2 + A[1,1]*X2**2) - b[0]*X1 - b[1]*X2
# Side-by-side contour plots comparing the CG and GD iterate trajectories.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Left panel: CG path over the objective's contours
axes[0].contour(X1, X2, Z, levels=20, cmap='viridis', alpha=0.6)
hist_cg = np.array(hist_cg)
axes[0].plot(hist_cg[:, 0], hist_cg[:, 1], 'ro-', markersize=8, label='CG')
axes[0].plot(x_true[0], x_true[1], 'g*', markersize=15, label='Solution')
axes[0].set_xlabel('x₁', fontsize=11)  # fixed mojibake subscript labels
axes[0].set_ylabel('x₂', fontsize=11)
axes[0].set_title(f'CG ({len(hist_cg)} steps)', fontsize=12)
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Right panel: GD path — note the characteristic zigzag toward the minimum
axes[1].contour(X1, X2, Z, levels=20, cmap='viridis', alpha=0.6)
hist_gd = np.array(hist_gd)
axes[1].plot(hist_gd[:, 0], hist_gd[:, 1], 'bo-', markersize=6, label='GD')
axes[1].plot(x_true[0], x_true[1], 'g*', markersize=15, label='Solution')
axes[1].set_xlabel('x₁', fontsize=11)
axes[1].set_ylabel('x₂', fontsize=11)
axes[1].set_title(f'GD ({len(hist_gd)} steps)', fontsize=12)
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"CG iterations: {len(hist_cg)-1}")
print(f"CG error: {np.linalg.norm(x_cg - x_true):.6f}")
3. Convergence Analysis
# Test on a larger (100x100) system to study the convergence rate
n = 100
np.random.seed(42)
Q = np.random.randn(n, n)
A = Q.T @ Q + np.eye(n)  # SPD: Q^T Q is positive semidefinite; + I makes it definite
x_true = np.random.randn(n)
b = A @ x_true  # build b from a known solution so the error is measurable
# Run CG from the origin, allowing the full n iterations
x_cg, _, res_cg = conjugate_gradient(A, b, x0=np.zeros(n), max_iter=n)
# Condition number kappa = lambda_max / lambda_min
eigvals = np.linalg.eigvalsh(A)  # eigvalsh returns eigenvalues in ascending order
kappa = eigvals[-1] / eigvals[0]
# Classical CG bound: residual shrinks at least by (sqrt(k)-1)/(sqrt(k)+1) per step
iters = np.arange(len(res_cg))
bound = res_cg[0] * ((np.sqrt(kappa) - 1) / (np.sqrt(kappa) + 1))**iters
# Plot measured residuals against the theoretical convergence bound.
plt.figure(figsize=(10, 6))
plt.semilogy(res_cg, 'b-', linewidth=2, label='CG Residual')
plt.semilogy(bound, 'r--', label=f'Bound (κ={kappa:.1f})')  # fixed mojibake κ
plt.xlabel('Iteration', fontsize=11)
plt.ylabel('Residual Norm', fontsize=11)
plt.title('CG Convergence', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print(f"κ(A) = {kappa:.2f}")  # fixed mojibake κ
print(f"Iterations to convergence: {len(res_cg)-1}")
4. Preconditioned CG
def preconditioned_cg(A, b, M_inv=None, x0=None, tol=1e-6, max_iter=None):
    """Preconditioned Conjugate Gradient for SPD A.

    Solves Ax = b using preconditioner M (supplied as its inverse M_inv),
    which should approximate A^{-1} well enough to reduce the effective
    condition number.

    Parameters
    ----------
    A : (n, n) ndarray
        Symmetric positive definite matrix.
    b : (n,) ndarray
        Right-hand side.
    M_inv : (n, n) ndarray, optional
        Inverse of the preconditioner; defaults to the identity
        (plain CG).
    x0 : (n,) ndarray, optional
        Initial guess; defaults to the zero vector.
    tol : float
        Stop once the residual 2-norm drops below this value.
    max_iter : int, optional
        Iteration cap; defaults to n.

    Returns
    -------
    x : (n,) ndarray
        Approximate solution.
    residuals : list of float
        Residual norms, including the initial one.
    """
    n = len(b)
    x = np.zeros(n) if x0 is None else x0.copy()
    if M_inv is None:
        M_inv = np.eye(n)
    if max_iter is None:
        max_iter = n
    r = b - A @ x
    z = M_inv @ r              # preconditioned residual
    p = z.copy()
    rz_old = r @ z
    res_norm = np.linalg.norm(r)
    residuals = [res_norm]
    # Guard against 0/0 in alpha when the start is already converged.
    if res_norm < tol:
        return x, residuals
    for _ in range(max_iter):
        Ap = A @ p
        alpha = rz_old / (p @ Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        res_norm = np.linalg.norm(r)  # compute once; reused for log and test
        residuals.append(res_norm)
        if res_norm < tol:
            break
        z = M_inv @ r
        rz_new = r @ z
        beta = rz_new / rz_old
        p = z + beta * p
        rz_old = rz_new
    return x, residuals
# Jacobi preconditioner: M = diag(A), so M^{-1} is trivial to form and apply
M_inv = np.diag(1.0 / np.diag(A))
x_pcg, res_pcg = preconditioned_cg(A, b, M_inv=M_inv, max_iter=n)
# Compare residual histories of plain vs preconditioned CG on a log scale
plt.figure(figsize=(10, 6))
plt.semilogy(res_cg, 'b-', label='CG')
plt.semilogy(res_pcg, 'r-', label='Preconditioned CG')
plt.xlabel('Iteration', fontsize=11)
plt.ylabel('Residual', fontsize=11)
plt.title('CG vs Preconditioned CG', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print(f"CG: {len(res_cg)-1} iterations")
print(f"PCG: {len(res_pcg)-1} iterations")
Summary
CG Algorithm:
Solves \(Ax = b\) for SPD \(A\) in \(\leq n\) iterations.
Convergence:
\[\|x_k - x^*\|_A \leq 2\left(\frac{\sqrt{\kappa}-1}{\sqrt{\kappa}+1}\right)^k \|x_0 - x^*\|_A\]
Key Properties:
A-conjugate directions
Krylov subspace method
Optimal in the Krylov subspace at each step
Preconditioning:
Solve \(M^{-1}Ax = M^{-1}b\) to reduce the effective \(\kappa\).
Applications:
Large sparse systems
PDEs (finite elements)
Deep learning (second-order methods)
Computer graphics
Next Steps:
09_gradient_descent_convergence.ipynb - First-order methods
Study GMRES for non-symmetric systems
Explore incomplete Cholesky preconditioners