import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import animation
from IPython.display import HTML
# Notebook-wide plot style; fixed seed so every run reproduces the same figures.
sns.set_style('whitegrid')
np.random.seed(42)
1. Function Properties¶
L-Smooth Functions¶
Definition: \(f\) is \(L\)-smooth if its gradient is \(L\)-Lipschitz: $$\|\nabla f(x) - \nabla f(y)\| \leq L \|x - y\|$$
Equivalently (quadratic upper bound): $$f(y) \leq f(x) + \nabla f(x)^T(y-x) + \frac{L}{2}\|y-x\|^2$$
Interpretation: Gradient doesn’t change too fast.
Convex Functions¶
Definition: For all \(x, y\) and \(\lambda \in [0,1]\): $$f(\lambda x + (1-\lambda)y) \leq \lambda f(x) + (1-\lambda)f(y)$$
First-order condition (for differentiable \(f\)): $$f(y) \geq f(x) + \nabla f(x)^T(y-x)$$
μ-Strongly Convex¶
Definition: \(f\) is \(\mu\)-strongly convex if: $$f(y) \geq f(x) + \nabla f(x)^T(y-x) + \frac{\mu}{2}\|y-x\|^2$$
Interpretation: Function grows at least quadratically from any point.
# Illustrate the three function classes on simple 1-D examples:
# smoothness, convexity, and strong convexity, one panel each.
xs = np.linspace(-2, 2, 100)

# (panel title, [(curve values, legend label), ...])
panels = [
    ('L-Smoothness',
     [(xs**2, 'x² (smooth)'), (np.abs(xs), '|x| (non-smooth at 0)')]),
    ('Convexity',
     [(xs**2, 'x² (convex)'), (xs**3, 'x³ (non-convex)')]),
    ('Strong Convexity',
     [(2*xs**2, '2x² (μ=4)'), (0.1*xs**2, '0.1x² (μ=0.2)')]),
]

fig, axes = plt.subplots(1, 3, figsize=(16, 4))
for ax, (title, curves) in zip(axes, panels):
    for values, label in curves:
        ax.plot(xs, values, linewidth=2, label=label)
    ax.set_xlabel('x', fontsize=11)
    ax.set_ylabel('f(x)', fontsize=11)
    ax.set_title(title, fontsize=12)
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
2. Gradient Descent Algorithm¶
Update Rule¶
$$x_{t+1} = x_t - \eta \nabla f(x_t)$$ where \(\eta\) is the learning rate (step size).
Key Questions¶
Does it converge? ✓
How fast? Depends on function class!
What step size? Also depends!
3. Convergence for L-Smooth Functions¶
Theorem (Descent Lemma)¶
If \(f\) is \(L\)-smooth, then gradient descent with step size \(\eta \leq \frac{1}{L}\) satisfies: $$f(x_{t+1}) \leq f(x_t) - \frac{\eta}{2}\|\nabla f(x_t)\|^2$$
Proof:¶
By \(L\)-smoothness: $$f(x_{t+1}) \leq f(x_t) + \nabla f(x_t)^T(x_{t+1}-x_t) + \frac{L}{2}\|x_{t+1}-x_t\|^2$$
Substitute \(x_{t+1} = x_t - \eta \nabla f(x_t)\): $$f(x_{t+1}) \leq f(x_t) - \eta\|\nabla f(x_t)\|^2 + \frac{L\eta^2}{2}\|\nabla f(x_t)\|^2$$
Grouping terms gives \(f(x_{t+1}) \leq f(x_t) - \eta\left(1 - \frac{L\eta}{2}\right)\|\nabla f(x_t)\|^2\). With \(\eta \leq \frac{1}{L}\), we have \(1 - \frac{L\eta}{2} \geq \frac{1}{2}\), which yields the claimed bound. □
Convergence Rate¶
For convex and \(L\)-smooth functions with \(\eta = 1/L\): $$f(x_T) - f(x^*) \leq \frac{2L\|x_0 - x^*\|^2}{T}$$
Rate: \(O(1/T)\) (sublinear)
def gradient_descent(grad_f, x0, learning_rate, n_iterations):
    """Run vanilla gradient descent and record every iterate.

    Parameters
    ----------
    grad_f : callable
        Maps a point (ndarray) to the gradient of the objective there.
    x0 : ndarray
        Starting point; left unmodified.
    learning_rate : float
        Constant step size eta used at every update.
    n_iterations : int
        Number of update steps to perform.

    Returns
    -------
    ndarray
        Array of shape (n_iterations + 1, ...) holding x_0, x_1, ..., x_T.
    """
    current = x0.copy()
    iterates = [current.copy()]
    for _ in range(n_iterations):
        current = current - learning_rate * grad_f(current)
        iterates.append(current.copy())
    return np.array(iterates)
# Test problem: quadratic f(x) = 0.5 * x^T A x with a fixed SPD matrix.
A = np.array([[2.0, 0.5], [0.5, 1.0]])
# For a quadratic, the smoothness constant L equals the largest eigenvalue of A.
L = np.linalg.eigvalsh(A).max()

def f(x):
    """Evaluate the quadratic objective 0.5 * x^T A x."""
    return 0.5 * np.dot(x, A @ x)

def grad_f(x):
    """Gradient of f, which is A x (A is symmetric)."""
    return A.dot(x)
x0 = np.array([3.0, 2.0])

# Try step sizes around the critical threshold 1/L: safely below, exactly at,
# and above (where divergence is possible).
step_sizes = [0.9/L, 1.0/L, 1.5/L]
trajectories = [gradient_descent(grad_f, x0, eta, 20) for eta in step_sizes]

# Contour grid. Evaluate f on the whole grid at once instead of a 100x100
# Python double loop: for symmetric A,
#   f([u, v]) = 0.5 * (A00*u^2 + 2*A01*u*v + A11*v^2).
x_range = np.linspace(-4, 4, 100)
y_range = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x_range, y_range)
Z = 0.5 * (A[0, 0] * X**2 + 2 * A[0, 1] * X * Y + A[1, 1] * Y**2)

fig, axes = plt.subplots(1, 3, figsize=(16, 5))
for idx, (eta, traj) in enumerate(zip(step_sizes, trajectories)):
    axes[idx].contour(X, Y, Z, levels=20, alpha=0.6)
    axes[idx].plot(traj[:, 0], traj[:, 1], 'ro-', linewidth=2, markersize=4)
    axes[idx].plot(0, 0, 'g*', markersize=15, label='Optimum')
    axes[idx].set_xlabel('x₁', fontsize=11)
    axes[idx].set_ylabel('x₂', fontsize=11)
    axes[idx].set_title(f'η = {eta:.3f} ({eta*L:.2f}/L)', fontsize=12)
    axes[idx].legend()
    axes[idx].grid(True, alpha=0.3)
    axes[idx].set_aspect('equal')
plt.tight_layout()
plt.show()

print(f"L-smoothness constant L = {L:.3f}")
print(f"Step size > 1/L may diverge!")
4. Strong Convexity: Linear Convergence¶
Theorem¶
If \(f\) is \(\mu\)-strongly convex and \(L\)-smooth, gradient descent with \(\eta = \frac{1}{L}\) satisfies: $$\|x_t - x^*\|^2 \leq \left(1 - \frac{\mu}{L}\right)^t \|x_0 - x^*\|^2$$
Proof Sketch:¶
From the descent lemma with \(\eta = 1/L\): $$f(x_{t+1}) \leq f(x_t) - \frac{1}{2L}\|\nabla f(x_t)\|^2$$
By strong convexity (the Polyak–Łojasiewicz inequality): $$\|\nabla f(x_t)\|^2 \geq 2\mu(f(x_t) - f(x^*))$$
Combining the two: $$f(x_{t+1}) - f(x^*) \leq \left(1 - \frac{\mu}{L}\right)(f(x_t) - f(x^*))$$
This gives linear (exponential) convergence! □
Condition Number¶
\(\kappa\) close to 1: Fast convergence
\(\kappa\) large: Slow convergence (ill-conditioned)
# Compare different condition numbers
def quadratic_gd(A, x0, n_iterations):
    """Gradient descent on f(x) = 0.5 x^T A x with the optimal step 1/L.

    The optimum is x* = 0, so the squared error is simply ||x||^2.

    Parameters
    ----------
    A : ndarray
        Symmetric positive-definite matrix defining the quadratic.
    x0 : ndarray
        Starting point; left unmodified.
    n_iterations : int
        Number of gradient steps.

    Returns
    -------
    ndarray
        ||x_t - x*||^2 for t = 0, ..., n_iterations.
    """
    step = 1.0 / np.linalg.eigvalsh(A).max()  # eta = 1/L, L = lambda_max(A)
    point = x0.copy()
    history = [np.linalg.norm(point)**2]
    for _ in range(n_iterations):
        point = point - step * A.dot(point)  # gradient of f is A x
        history.append(np.linalg.norm(point)**2)
    return np.array(history)
# Effect of conditioning: run GD on three diagonal quadratics of increasing
# condition number and compare measured decay with the rate (1 - 1/κ)^t.
x0 = np.array([1.0, 1.0])
n_iter = 50

matrices = [
    np.array([[1.0, 0.0], [0.0, 1.0]]),    # well-conditioned: κ ≈ 1
    np.array([[10.0, 0.0], [0.0, 1.0]]),   # moderate: κ ≈ 10
    np.array([[100.0, 0.0], [0.0, 1.0]]),  # ill-conditioned: κ ≈ 100
]
error_curves = [quadratic_gd(M, x0, n_iter) for M in matrices]
kappas = [np.linalg.cond(M) for M in matrices]

# Theoretical prediction, anchored at the initial error.
t = np.arange(n_iter + 1)
theory_curves = [errs[0] * (1 - 1/kappa)**t
                 for errs, kappa in zip(error_curves, kappas)]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: empirical convergence for all three condition numbers (log scale).
for errs, kappa, marker in zip(error_curves, kappas, ['o-', 's-', '^-']):
    axes[0].semilogy(errs, marker, linewidth=2, markersize=4,
                     label=f'κ={kappa:.1f}')
axes[0].set_xlabel('Iteration', fontsize=12)
axes[0].set_ylabel('||x - x*||²', fontsize=12)
axes[0].set_title('Convergence vs Condition Number', fontsize=13)
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Right: the moderately conditioned case against its theoretical rate.
axes[1].semilogy(error_curves[1], 'o-', linewidth=2, label='Actual', markersize=4)
axes[1].semilogy(theory_curves[1], '--', linewidth=2, label='Theory: (1-μ/L)ᵗ')
axes[1].set_xlabel('Iteration', fontsize=12)
axes[1].set_ylabel('||x - x*||²', fontsize=12)
axes[1].set_title(f'Theory vs Practice (κ={kappas[1]:.1f})', fontsize=13)
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Linear convergence: error × (1-1/κ) per iteration")
print(f"Larger κ → slower convergence")
5. Summary of Convergence Rates¶
| Function Class | Step Size | Rate | Iterations to ε-accuracy |
|---|---|---|---|
| L-smooth only | \(\eta = 1/L\) | — | No guarantee |
| Convex + L-smooth | \(\eta = 1/L\) | \(O(1/T)\) | \(O(1/\varepsilon)\) |
| μ-strongly convex + L-smooth | \(\eta = 1/L\) | \(O((1-\mu/L)^T)\) | \(O(\kappa \log(1/\varepsilon))\) |
where \(\kappa = L/\mu\) is the condition number.
Summary¶
Key Concepts:¶
L-smoothness: Bounded gradient Lipschitz constant
Convexity: Bowl-shaped functions
Strong convexity: Lower bounded curvature
Condition number: \(\kappa = L/\mu\) controls convergence
Convergence Guarantees:¶
Descent Lemma: $$f(x_{t+1}) \leq f(x_t) - \frac{\eta}{2}\|\nabla f(x_t)\|^2$$
Convex + Smooth: $$f(x_T) - f^* \leq O(1/T)$$
Strongly Convex + Smooth: $$\|x_T - x^*\|^2 \leq (1 - \mu/L)^T \|x_0 - x^*\|^2$$
Step Size Selection:¶
Too large: Divergence
Optimal: \(\eta = 1/L\)
Too small: Slow but stable
Extensions:¶
Momentum: Accelerated convergence
Adam: Adaptive learning rates
Nesterov: Optimal \(O(1/T^2)\) for convex
Stochastic GD: Noisy gradients
Applications:¶
Deep learning optimization
Convex programming
Statistical learning
Next Steps:¶
07_concentration_inequalities.ipynb - Stochastic analysis
06_variational_inference.ipynb - Optimization in Bayesian inference
Phase 6 neural networks for practical optimization