import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import animation
from IPython.display import HTML

sns.set_style('whitegrid')
np.random.seed(42)

1. Function Properties

L-Smooth Functions

Definition: \(f\) is \(L\)-smooth if: \[\|\nabla f(x) - \nabla f(y)\| \leq L \|x - y\|\]

Equivalently: \[f(y) \leq f(x) + \nabla f(x)^T(y-x) + \frac{L}{2}\|y-x\|^2\]

Interpretation: The gradient changes at a bounded rate, so the curvature is at most \(L\).
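
As a quick numerical illustration (a minimal sketch, not part of the original notebook): for a quadratic \(f(x) = \tfrac{1}{2}x^T A x\), the smallest valid \(L\) is the largest eigenvalue of \(A\), and the ratio \(\|\nabla f(x)-\nabla f(y)\|/\|x-y\|\) never exceeds it.

# Estimate the gradient-Lipschitz constant of f(x) = 0.5 x^T A x numerically
A = np.array([[2.0, 0.5], [0.5, 1.0]])
grad = lambda x: A @ x                     # gradient of 0.5 x^T A x
L_true = np.linalg.eigvalsh(A).max()       # smoothness constant = largest eigenvalue

rng = np.random.default_rng(0)
ratios = []
for _ in range(1000):
    x, y = rng.normal(size=2), rng.normal(size=2)
    ratios.append(np.linalg.norm(grad(x) - grad(y)) / np.linalg.norm(x - y))

print(f"max gradient ratio over samples: {max(ratios):.3f} <= L = {L_true:.3f}")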

Convex Functions

Definition: For all \(x, y\) and \(\lambda \in [0,1]\): \[f(\lambda x + (1-\lambda)y) \leq \lambda f(x) + (1-\lambda)f(y)\]

First-order condition: \[f(y) \geq f(x) + \nabla f(x)^T(y-x)\]
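
As an illustrative sketch (not from the original notebook), the first-order condition can be spot-checked numerically for \(f(x) = x^2\): the tangent line at any point stays below the function.

# Check f(y) >= f(x) + f'(x)(y - x) for f(x) = x^2 at random pairs of points
f_cvx = lambda x: x**2
df_cvx = lambda x: 2 * x

rng = np.random.default_rng(1)
xs, ys = rng.uniform(-5, 5, 1000), rng.uniform(-5, 5, 1000)
violations = np.sum(f_cvx(ys) < f_cvx(xs) + df_cvx(xs) * (ys - xs) - 1e-12)
print(f"violations of the first-order condition: {violations}")  # expect 0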

μ-Strongly Convex

Definition: \(f\) is \(\mu\)-strongly convex if: \[f(y) \geq f(x) + \nabla f(x)^T(y-x) + \frac{\mu}{2}\|y-x\|^2\]

Interpretation: Function grows at least quadratically from any point.
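
For a quadratic \(f(x) = \tfrac{1}{2}x^T A x\), the strong-convexity constant is the smallest eigenvalue of \(A\); here is a minimal sketch (illustrative, names local to this snippet) verifying the defining inequality.

# Verify f(y) >= f(x) + grad(x)^T (y - x) + (mu/2) ||y - x||^2 on random pairs
A = np.array([[2.0, 0.5], [0.5, 1.0]])
f_sc = lambda x: 0.5 * x @ A @ x
grad_sc = lambda x: A @ x
mu = np.linalg.eigvalsh(A).min()           # strong-convexity constant = smallest eigenvalue

rng = np.random.default_rng(2)
holds = True
for _ in range(1000):
    x, y = rng.normal(size=2), rng.normal(size=2)
    rhs = f_sc(x) + grad_sc(x) @ (y - x) + 0.5 * mu * np.sum((y - x)**2)
    holds &= f_sc(y) >= rhs - 1e-10
print(f"mu = {mu:.3f}, inequality holds on all samples: {holds}")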

# Visualize function properties
x = np.linspace(-2, 2, 100)

fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# L-smooth (bounded curvature)
f_smooth = x**2
f_nonsmooth = np.abs(x)
axes[0].plot(x, f_smooth, linewidth=2, label='x² (smooth)')
axes[0].plot(x, f_nonsmooth, linewidth=2, label='|x| (non-smooth at 0)')
axes[0].set_xlabel('x', fontsize=11)
axes[0].set_ylabel('f(x)', fontsize=11)
axes[0].set_title('L-Smoothness', fontsize=12)
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Convex
f_convex = x**2
f_nonconvex = x**3
axes[1].plot(x, f_convex, linewidth=2, label='x² (convex)')
axes[1].plot(x, f_nonconvex, linewidth=2, label='x³ (non-convex)')
axes[1].set_xlabel('x', fontsize=11)
axes[1].set_ylabel('f(x)', fontsize=11)
axes[1].set_title('Convexity', fontsize=12)
axes[1].legend()
axes[1].grid(True, alpha=0.3)

# Strong convexity
f_strong = 2*x**2
f_weak = 0.1*x**2
axes[2].plot(x, f_strong, linewidth=2, label='2x² (μ=4)')
axes[2].plot(x, f_weak, linewidth=2, label='0.1x² (μ=0.2)')
axes[2].set_xlabel('x', fontsize=11)
axes[2].set_ylabel('f(x)', fontsize=11)
axes[2].set_title('Strong Convexity', fontsize=12)
axes[2].legend()
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

2. Gradient Descent Algorithm

Update Rule

\[x_{t+1} = x_t - \eta \nabla f(x_t)\]

where \(\eta\) is the learning rate (step size).

Key Questions

  1. Does it converge? Yes, provided the step size is small enough.

  2. How fast? It depends on the function class.

  3. What step size should we use? That also depends on the function class.

3. Convergence for L-Smooth Functions

Theorem (Descent Lemma)

If \(f\) is \(L\)-smooth, then with step size \(\eta \leq \frac{1}{L}\): \[f(x_{t+1}) \leq f(x_t) - \frac{\eta}{2}\|\nabla f(x_t)\|^2\]

Proof:

By \(L\)-smoothness: \[f(x_{t+1}) \leq f(x_t) + \nabla f(x_t)^T(x_{t+1}-x_t) + \frac{L}{2}\|x_{t+1}-x_t\|^2\]

Substitute \(x_{t+1} = x_t - \eta \nabla f(x_t)\): \[f(x_{t+1}) \leq f(x_t) - \eta\|\nabla f(x_t)\|^2 + \frac{L\eta^2}{2}\|\nabla f(x_t)\|^2\]

\[= f(x_t) - \eta(1 - \frac{L\eta}{2})\|\nabla f(x_t)\|^2\]

With \(\eta \leq \frac{1}{L}\), we have \(1 - \frac{L\eta}{2} \geq \frac{1}{2}\). □
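
A quick numerical check of the descent lemma on a quadratic (an illustrative sketch; the matrix and step size mirror the experiment below, and the names are local to this snippet):

# Verify f(x_{t+1}) <= f(x_t) - (eta/2) ||grad f(x_t)||^2 along a GD run
A_chk = np.array([[2.0, 0.5], [0.5, 1.0]])
f_chk = lambda x: 0.5 * x @ A_chk @ x
grad_chk = lambda x: A_chk @ x
L_chk = np.linalg.eigvalsh(A_chk).max()
eta_chk = 1.0 / L_chk

xt = np.array([3.0, 2.0])
for t in range(5):
    g = grad_chk(xt)
    x_next = xt - eta_chk * g
    bound = f_chk(xt) - 0.5 * eta_chk * np.sum(g**2)
    print(f"t={t}: f(x_next) = {f_chk(x_next):.4f} <= bound = {bound:.4f}")
    xt = x_next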

Convergence Rate

For convex and \(L\)-smooth \(f\), with \(\eta = 1/L\): \[f(x_T) - f(x^*) \leq \frac{2L\|x_0 - x^*\|^2}{T}\]

Rate: \(O(1/T)\) (sublinear)

def gradient_descent(grad_f, x0, learning_rate, n_iterations):
    """Standard gradient descent."""
    x = x0.copy()
    trajectory = [x.copy()]
    
    for _ in range(n_iterations):
        grad = grad_f(x)
        x = x - learning_rate * grad
        trajectory.append(x.copy())
    
    return np.array(trajectory)

# Test on quadratic: f(x) = 0.5 * x^T A x
A = np.array([[2.0, 0.5], [0.5, 1.0]])
L = np.linalg.eigvalsh(A).max()  # Largest eigenvalue = smoothness constant

def f(x):
    return 0.5 * x @ A @ x

def grad_f(x):
    return A @ x

x0 = np.array([3.0, 2.0])

# Try different step sizes
step_sizes = [0.9/L, 1.0/L, 1.5/L]
trajectories = []

for eta in step_sizes:
    traj = gradient_descent(grad_f, x0, eta, 20)
    trajectories.append(traj)

# Visualize
x_range = np.linspace(-4, 4, 100)
y_range = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x_range, y_range)
Z = np.zeros_like(X)
for i in range(len(x_range)):
    for j in range(len(y_range)):
        Z[j, i] = f(np.array([X[j, i], Y[j, i]]))

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for idx, (eta, traj) in enumerate(zip(step_sizes, trajectories)):
    axes[idx].contour(X, Y, Z, levels=20, alpha=0.6)
    axes[idx].plot(traj[:, 0], traj[:, 1], 'ro-', linewidth=2, markersize=4)
    axes[idx].plot(0, 0, 'g*', markersize=15, label='Optimum')
    axes[idx].set_xlabel('x₁', fontsize=11)
    axes[idx].set_ylabel('x₂', fontsize=11)
    axes[idx].set_title(f'η = {eta:.3f} ({eta*L:.2f}/L)', fontsize=12)
    axes[idx].legend()
    axes[idx].grid(True, alpha=0.3)
    axes[idx].set_aspect('equal')

plt.tight_layout()
plt.show()

print(f"L-smoothness constant L = {L:.3f}")
print(f"Step size > 1/L may diverge!")

4. Strong Convexity: Linear Convergence

Theorem

If \(f\) is \(\mu\)-strongly convex and \(L\)-smooth, then with \(\eta = \frac{1}{L}\): \[\|x_t - x^*\|^2 \leq \left(1 - \frac{\mu}{L}\right)^t \|x_0 - x^*\|^2\]

Proof Sketch:

From the descent lemma with \(\eta = 1/L\): \[f(x_{t+1}) \leq f(x_t) - \frac{1}{2L}\|\nabla f(x_t)\|^2\]

By strong convexity: \[\|\nabla f(x_t)\|^2 \geq 2\mu(f(x_t) - f(x^*))\]

Combining: \[f(x_{t+1}) - f(x^*) \leq \left(1 - \frac{\mu}{L}\right)(f(x_t) - f(x^*))\]

This gives linear (exponential) convergence! □
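
The key inequality \(\|\nabla f(x)\|^2 \geq 2\mu(f(x) - f(x^*))\) can be spot-checked numerically; a minimal sketch on a quadratic (illustrative, names local to this snippet):

# For f(x) = 0.5 x^T A x with minimum 0 at x* = 0, check ||grad f||^2 >= 2 mu (f - f*)
A_pl = np.array([[10.0, 0.0], [0.0, 1.0]])
f_pl = lambda x: 0.5 * x @ A_pl @ x
grad_pl = lambda x: A_pl @ x
mu_pl = np.linalg.eigvalsh(A_pl).min()

rng = np.random.default_rng(3)
ratios = [np.sum(grad_pl(x)**2) / (2 * mu_pl * f_pl(x))
          for x in rng.normal(size=(1000, 2))]
print(f"min of ||grad f||^2 / (2 mu (f - f*)): {min(ratios):.3f} (should be >= 1)")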

Condition Number

\[\kappa = \frac{L}{\mu}\]
  • \(\kappa\) close to 1: Fast convergence

  • \(\kappa\) large: Slow convergence (ill-conditioned)

# Compare different condition numbers
def quadratic_gd(A, x0, n_iterations):
    """GD on f(x) = 0.5 x^T A x with step size eta = 1/L."""
    L = np.linalg.eigvalsh(A).max()
    eta = 1.0 / L
    
    x = x0.copy()
    errors = [np.linalg.norm(x)**2]  # squared distance to the optimum x* = 0
    
    for _ in range(n_iterations):
        x = x - eta * (A @ x)
        errors.append(np.linalg.norm(x)**2)
    
    return np.array(errors)

# Different condition numbers
x0 = np.array([1.0, 1.0])
n_iter = 50

# Well-conditioned: κ ≈ 1
A1 = np.array([[1.0, 0.0], [0.0, 1.0]])
errors1 = quadratic_gd(A1, x0, n_iter)

# Moderate: κ ≈ 10
A2 = np.array([[10.0, 0.0], [0.0, 1.0]])
errors2 = quadratic_gd(A2, x0, n_iter)

# Ill-conditioned: κ ≈ 100
A3 = np.array([[100.0, 0.0], [0.0, 1.0]])
errors3 = quadratic_gd(A3, x0, n_iter)

# Theoretical rates
kappa1 = np.linalg.cond(A1)
kappa2 = np.linalg.cond(A2)
kappa3 = np.linalg.cond(A3)

t = np.arange(n_iter + 1)
theory1 = errors1[0] * (1 - 1/kappa1)**t
theory2 = errors2[0] * (1 - 1/kappa2)**t
theory3 = errors3[0] * (1 - 1/kappa3)**t

# Plot
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Log scale
axes[0].semilogy(errors1, 'o-', linewidth=2, markersize=4, label=f'κ={kappa1:.1f}')
axes[0].semilogy(errors2, 's-', linewidth=2, markersize=4, label=f'κ={kappa2:.1f}')
axes[0].semilogy(errors3, '^-', linewidth=2, markersize=4, label=f'κ={kappa3:.1f}')
axes[0].set_xlabel('Iteration', fontsize=12)
axes[0].set_ylabel('||x - x*||²', fontsize=12)
axes[0].set_title('Convergence vs Condition Number', fontsize=13)
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Compare with theory
axes[1].semilogy(errors2, 'o-', linewidth=2, label='Actual', markersize=4)
axes[1].semilogy(theory2, '--', linewidth=2, label='Theory: (1-μ/L)ᵗ')
axes[1].set_xlabel('Iteration', fontsize=12)
axes[1].set_ylabel('||x - x*||²', fontsize=12)
axes[1].set_title(f'Theory vs Practice (κ={kappa2:.1f})', fontsize=13)
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Linear convergence: error × (1-1/κ) per iteration")
print(f"Larger κ → slower convergence")

5. Summary of Convergence Rates

| Function Class | Step Size | Rate | Iterations to \(\epsilon\)-accuracy |
|---|---|---|---|
| \(L\)-smooth only | \(\eta = 1/L\) | - | No guarantee |
| Convex + \(L\)-smooth | \(\eta = 1/L\) | \(O(1/T)\) | \(O(1/\epsilon)\) |
| \(\mu\)-strongly convex + \(L\)-smooth | \(\eta = 1/L\) | \(O((1-\mu/L)^T)\) | \(O(\kappa \log(1/\epsilon))\) |

where \(\kappa = L/\mu\) is the condition number.
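
As a rough worked example (illustrative numbers only), the last column translates into concrete iteration counts:

# Iterations to reach eps-accuracy, ignoring constants, for eps = 1e-6 and kappa = 100
eps = 1e-6
kappa = 100
iters_convex = 1 / eps                         # convex + L-smooth: O(1/eps)
iters_strongly_convex = kappa * np.log(1/eps)  # strongly convex + L-smooth: O(kappa log(1/eps))
print(f"convex + smooth:          ~{iters_convex:.0e} iterations")
print(f"strongly convex + smooth: ~{iters_strongly_convex:.0f} iterations")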

Summary

Key Concepts:

  1. L-smoothness: The gradient is Lipschitz continuous with constant \(L\) (curvature bounded above)

  2. Convexity: Bowl-shaped functions; every chord lies above the graph

  3. Strong convexity: Curvature bounded below by \(\mu\)

  4. Condition number: \(\kappa = L/\mu\) controls convergence

Convergence Guarantees:

Descent Lemma: \[f(x_{t+1}) \leq f(x_t) - \frac{\eta}{2}\|\nabla f(x_t)\|^2\]

Convex + Smooth: \[f(x_T) - f^* \leq O(1/T)\]

Strongly Convex + Smooth: \[\|x_T - x^*\|^2 \leq (1 - \mu/L)^T \|x_0 - x^*\|^2\]

Step Size Selection:

  • Too large: Divergence

  • Optimal: \(\eta = 1/L\)

  • Too small: Slow but stable

Extensions:

  • Momentum: Accelerated convergence (see the sketch after this list)

  • Adam: Adaptive learning rates

  • Nesterov: Optimal \(O(1/T^2)\) for convex

  • Stochastic GD: Noisy gradients
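
A brief illustration of the momentum idea referenced above (a minimal heavy-ball sketch under standard assumptions, not any particular library's implementation):

def momentum_gd(grad_f, x0, learning_rate, beta, n_iterations):
    """Heavy-ball gradient descent: a velocity term accumulates past gradients."""
    x = x0.copy()
    v = np.zeros_like(x)
    trajectory = [x.copy()]
    for _ in range(n_iterations):
        v = beta * v - learning_rate * grad_f(x)   # momentum update
        x = x + v
        trajectory.append(x.copy())
    return np.array(trajectory)

# Example on the ill-conditioned quadratic from Section 4 (kappa = 100)
A_mom = np.array([[100.0, 0.0], [0.0, 1.0]])
grad_mom = lambda x: A_mom @ x
traj_mom = momentum_gd(grad_mom, np.array([1.0, 1.0]), learning_rate=0.01, beta=0.9,
                       n_iterations=100)
print(f"||x_100 - x*|| with momentum: {np.linalg.norm(traj_mom[-1]):.2e}")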

Applications:

  • Deep learning optimization

  • Convex programming

  • Statistical learning

Next Steps:

  • 07_concentration_inequalities.ipynb - Stochastic analysis

  • 06_variational_inference.ipynb - Optimization in Bayesian inference

  • Phase 6 neural networks for practical optimization