import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling and a fixed RNG seed so the examples are reproducible.
sns.set_style('whitegrid')
np.random.seed(42)
1. Problem: Solve \(Ax = b\)
Where \(A \in \mathbb{R}^{n \times n}\) is symmetric positive definite.
Equivalent: Minimize
\[f(x) = \frac{1}{2}x^TAx - b^Tx\]
Gradient Descent Issues
Zigzagging for ill-conditioned \(A\)
Slow convergence: \(O(\kappa \log(1/\epsilon))\) iterations
CG Solution
Converges in \(\leq n\) iterations (in exact arithmetic)!
2. Conjugate Directions
A-Orthogonality
Directions \(\{p_0, \ldots, p_{n-1}\}\) are A-conjugate if:
\[p_i^T A p_j = 0, \quad \forall i \neq j\]
Line Search
\[x_{k+1} = x_k + \alpha_k p_k\]
\[\alpha_k = \frac{r_k^T p_k}{p_k^T A p_k}\]
where \(r_k = b - Ax_k\) is the residual.
def conjugate_gradient(A, b, x0=None, tol=1e-6, max_iter=None):
    """Solve Ax = b for symmetric positive definite A via Conjugate Gradient.

    Parameters
    ----------
    A : (n, n) ndarray
        Symmetric positive definite matrix.
    b : (n,) ndarray
        Right-hand side.
    x0 : (n,) ndarray, optional
        Initial guess; defaults to the zero vector.
    tol : float
        Stop once the residual 2-norm drops below this value.
    max_iter : int, optional
        Iteration cap; defaults to n (CG converges in at most n
        iterations in exact arithmetic).

    Returns
    -------
    x : (n,) ndarray
        Approximate solution.
    history : list of ndarray
        Iterates, including the starting point.
    residuals : list of float
        Residual norms, including the initial one.
    """
    n = len(b)
    x = np.zeros(n) if x0 is None else x0.copy()
    if max_iter is None:
        max_iter = n
    r = b - A @ x
    p = r.copy()
    rs_old = r @ r
    history = [x.copy()]
    residuals = [np.sqrt(rs_old)]
    # Guard: if the starting point already satisfies the tolerance
    # (e.g. x0 is exact, or b is the zero vector), the loop below would
    # compute alpha = 0/0 = NaN. Return immediately instead.
    if residuals[0] < tol:
        return x, history, residuals
    for _ in range(max_iter):
        Ap = A @ p
        alpha = rs_old / (p @ Ap)  # exact line search along p
        x = x + alpha * p
        r = r - alpha * Ap         # residual recurrence: avoids recomputing b - A@x
        rs_new = r @ r
        residuals.append(np.sqrt(rs_new))
        history.append(x.copy())
        if np.sqrt(rs_new) < tol:
            break
        beta = rs_new / rs_old     # makes the new direction A-conjugate to p
        p = r + beta * p
        rs_old = rs_new
    return x, history, residuals
# Test on a small 2D problem so the iterate trajectories can be plotted.
A = np.array([[3, 0.5], [0.5, 1]])  # symmetric positive definite 2x2 matrix
b = np.array([1, 2])
# True solution via direct solve, used as a reference point in the plots
x_true = np.linalg.solve(A, b)
# Run CG from the origin; keep the iterate history and residual norms
x_cg, hist_cg, res_cg = conjugate_gradient(A, b, x0=np.zeros(2))
# Gradient descent for comparison
def gradient_descent(A, b, x0, lr=0.1, n_iter=20):
    """Fixed-step gradient descent on f(x) = 0.5 x^T A x - b^T x.

    Runs exactly n_iter steps with constant step size lr and returns
    the final iterate together with the full trajectory (x0 included).
    """
    iterate = x0.copy()
    trajectory = [iterate.copy()]
    for _ in range(n_iter):
        # Gradient of the quadratic objective: grad f(x) = Ax - b
        iterate = iterate - lr * (A @ iterate - b)
        trajectory.append(iterate.copy())
    return iterate, trajectory
# Run gradient descent from the origin for comparison against CG
x_gd, hist_gd = gradient_descent(A, b, np.zeros(2), lr=0.4, n_iter=20)
# Visualize: evaluate f(x) = 0.5 x^T A x - b^T x on a grid for contour plots.
x1 = np.linspace(-1, 3, 100)
x2 = np.linspace(-1, 3, 100)
X1, X2 = np.meshgrid(x1, x2)
# Quadratic expanded elementwise; the cross term appears twice, hence 2*A[0,1]
Z = 0.5 * (A[0,0]*X1**2 + 2*A[0,1]*X1*X2 + A[1,1]*X2**2) - b[0]*X1 - b[1]*X2
# Side-by-side contour plots comparing the CG and GD iterate trajectories.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Left panel: CG path over the objective's contours
axes[0].contour(X1, X2, Z, levels=20, cmap='viridis', alpha=0.6)
hist_cg = np.array(hist_cg)
axes[0].plot(hist_cg[:, 0], hist_cg[:, 1], 'ro-', markersize=8, label='CG')
axes[0].plot(x_true[0], x_true[1], 'g*', markersize=15, label='Solution')
axes[0].set_xlabel('x₁', fontsize=11)  # fixed mojibake subscript labels
axes[0].set_ylabel('x₂', fontsize=11)
axes[0].set_title(f'CG ({len(hist_cg)} steps)', fontsize=12)
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Right panel: GD path — note the characteristic zigzag toward the minimum
axes[1].contour(X1, X2, Z, levels=20, cmap='viridis', alpha=0.6)
hist_gd = np.array(hist_gd)
axes[1].plot(hist_gd[:, 0], hist_gd[:, 1], 'bo-', markersize=6, label='GD')
axes[1].plot(x_true[0], x_true[1], 'g*', markersize=15, label='Solution')
axes[1].set_xlabel('x₁', fontsize=11)
axes[1].set_ylabel('x₂', fontsize=11)
axes[1].set_title(f'GD ({len(hist_gd)} steps)', fontsize=12)
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"CG iterations: {len(hist_cg)-1}")
print(f"CG error: {np.linalg.norm(x_cg - x_true):.6f}")
3. Convergence Analysis
# Test on a larger (100x100) system to study the convergence rate
n = 100
np.random.seed(42)
Q = np.random.randn(n, n)
A = Q.T @ Q + np.eye(n)  # SPD: Q^T Q is positive semidefinite; + I makes it definite
x_true = np.random.randn(n)
b = A @ x_true  # build b from a known solution so the error is measurable
# Run CG from the origin, allowing the full n iterations
x_cg, _, res_cg = conjugate_gradient(A, b, x0=np.zeros(n), max_iter=n)
# Condition number kappa = lambda_max / lambda_min
eigvals = np.linalg.eigvalsh(A)  # eigvalsh returns eigenvalues in ascending order
kappa = eigvals[-1] / eigvals[0]
# Classical CG bound: residual shrinks at least by (sqrt(k)-1)/(sqrt(k)+1) per step
iters = np.arange(len(res_cg))
bound = res_cg[0] * ((np.sqrt(kappa) - 1) / (np.sqrt(kappa) + 1))**iters
# Plot measured residuals against the theoretical convergence bound.
plt.figure(figsize=(10, 6))
plt.semilogy(res_cg, 'b-', linewidth=2, label='CG Residual')
plt.semilogy(bound, 'r--', label=f'Bound (κ={kappa:.1f})')  # fixed mojibake κ
plt.xlabel('Iteration', fontsize=11)
plt.ylabel('Residual Norm', fontsize=11)
plt.title('CG Convergence', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print(f"κ(A) = {kappa:.2f}")  # fixed mojibake κ
print(f"Iterations to convergence: {len(res_cg)-1}")
4. Preconditioned CG
def preconditioned_cg(A, b, M_inv=None, x0=None, tol=1e-6, max_iter=None):
    """Preconditioned Conjugate Gradient for SPD A.

    Solves Ax = b using preconditioner M (supplied as its inverse M_inv),
    which should approximate A^{-1} well enough to reduce the effective
    condition number.

    Parameters
    ----------
    A : (n, n) ndarray
        Symmetric positive definite matrix.
    b : (n,) ndarray
        Right-hand side.
    M_inv : (n, n) ndarray, optional
        Inverse of the preconditioner; defaults to the identity
        (plain CG).
    x0 : (n,) ndarray, optional
        Initial guess; defaults to the zero vector.
    tol : float
        Stop once the residual 2-norm drops below this value.
    max_iter : int, optional
        Iteration cap; defaults to n.

    Returns
    -------
    x : (n,) ndarray
        Approximate solution.
    residuals : list of float
        Residual norms, including the initial one.
    """
    n = len(b)
    x = np.zeros(n) if x0 is None else x0.copy()
    if M_inv is None:
        M_inv = np.eye(n)
    if max_iter is None:
        max_iter = n
    r = b - A @ x
    z = M_inv @ r              # preconditioned residual
    p = z.copy()
    rz_old = r @ z
    res_norm = np.linalg.norm(r)
    residuals = [res_norm]
    # Guard against 0/0 in alpha when the start is already converged.
    if res_norm < tol:
        return x, residuals
    for _ in range(max_iter):
        Ap = A @ p
        alpha = rz_old / (p @ Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        res_norm = np.linalg.norm(r)  # compute once; reused for log and test
        residuals.append(res_norm)
        if res_norm < tol:
            break
        z = M_inv @ r
        rz_new = r @ z
        beta = rz_new / rz_old
        p = z + beta * p
        rz_old = rz_new
    return x, residuals
# Jacobi preconditioner: M = diag(A), so M^{-1} is trivial to form and apply
M_inv = np.diag(1.0 / np.diag(A))
x_pcg, res_pcg = preconditioned_cg(A, b, M_inv=M_inv, max_iter=n)
# Compare residual histories of plain vs preconditioned CG on a log scale
plt.figure(figsize=(10, 6))
plt.semilogy(res_cg, 'b-', label='CG')
plt.semilogy(res_pcg, 'r-', label='Preconditioned CG')
plt.xlabel('Iteration', fontsize=11)
plt.ylabel('Residual', fontsize=11)
plt.title('CG vs Preconditioned CG', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print(f"CG: {len(res_cg)-1} iterations")
print(f"PCG: {len(res_pcg)-1} iterations")
Summary
CG Algorithm:
Solves \(Ax = b\) for SPD \(A\) in \(\leq n\) iterations.
Convergence:
\[\|x_k - x^*\|_A \leq 2\left(\frac{\sqrt{\kappa}-1}{\sqrt{\kappa}+1}\right)^k \|x_0 - x^*\|_A\]
Key Properties:
A-conjugate directions
Krylov subspace method
Optimal in the Krylov subspace at each step
Preconditioning:
Solve \(M^{-1}Ax = M^{-1}b\) to reduce the effective \(\kappa\).
Applications:
Large sparse systems
PDEs (finite elements)
Deep learning (second-order methods)
Computer graphics
Next Steps:
09_gradient_descent_convergence.ipynb - First-order methods
Study GMRES for non-symmetric systems
Explore incomplete Cholesky preconditioners