Gradient Descent
Gradient descent variants (SGD, mini-batch, Adam, momentum), learning rates, and convergence analysis.
# Import required libraries
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import seaborn as sns
# Global plot styling and a fixed RNG seed so all figures are reproducible.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
np.random.seed(42)
1. The Gradient Descent Algorithm
Goal: Find the minimum of a function \(f(\theta)\)
Algorithm:
Initialize parameters: \(\theta_0\)
Repeat until convergence:
Compute gradient: \(g = \nabla f(\theta)\)
Update: \(\theta_{new} = \theta_{old} - \alpha \cdot g\)
Where:
\(\alpha\) is the learning rate (step size)
\(\nabla f(\theta)\) is the gradient (direction of steepest ascent)
We move in the negative gradient direction (to descend)
2. Simple 1D Example
Starting with the function \(f(x) = x^2 + 5\) strips gradient descent down to its essence. The gradient is simply \(f'(x) = 2x\), so at any point the update rule becomes \(x_{\text{new}} = x - \alpha \cdot 2x = x(1 - 2\alpha)\). With \(\alpha = 0.1\), each step multiplies \(x\) by \(0.8\), geometrically shrinking the distance to the minimum at \(x = 0\). Watching this process on a plot builds the intuition that gradient descent is just "roll downhill" — the steeper the slope, the bigger the step.
def f(x):
    """Objective to minimize: a parabola shifted up by 5, f(x) = x**2 + 5.

    Works on scalars and NumPy arrays alike; the minimum is f(0) = 5.
    """
    return 5 + x**2
def grad_f(x):
    """Derivative of f: f'(x) = 2x (zero exactly at the minimum x = 0)."""
    return x * 2
def gradient_descent_1d(initial_x, learning_rate, n_iterations):
    """Run vanilla gradient descent on the 1D objective f.

    Starting from initial_x, repeatedly steps against the gradient:
    x <- x - learning_rate * f'(x).

    Returns a pair of arrays (iterates, objective values), each of
    length n_iterations + 1 — the starting point is included.
    """
    current = initial_x
    xs = [current]
    fs = [f(current)]
    for _ in range(n_iterations):
        current = current - learning_rate * grad_f(current)
        xs.append(current)
        fs.append(f(current))
    return np.array(xs), np.array(fs)
# Run gradient descent on the 1D parabola from x = 8 with a moderate step size.
initial_x = 8.0
learning_rate = 0.1
n_iterations = 20
history_x, history_f = gradient_descent_1d(initial_x, learning_rate, n_iterations)
# Visualize the objective curve and the path taken by the iterates.
x_range = np.linspace(-10, 10, 200)
y_range = f(x_range)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# Plot 1: descent path overlaid on the objective function
ax1.plot(x_range, y_range, 'b-', linewidth=2, label='f(x) = xΒ² + 5')
ax1.plot(history_x, history_f, 'ro-', markersize=8, linewidth=1.5,
         alpha=0.7, label='Gradient descent path')
ax1.plot(history_x[0], history_f[0], 'g*', markersize=20, label='Start')
ax1.plot(history_x[-1], history_f[-1], 'r*', markersize=20, label='End')
# Add arrows to show direction (every second step, to avoid clutter)
for i in range(0, len(history_x)-1, 2):
    ax1.annotate('', xy=(history_x[i+1], history_f[i+1]),
                 xytext=(history_x[i], history_f[i]),
                 arrowprops=dict(arrowstyle='->', color='red', lw=1.5, alpha=0.5))
ax1.set_xlabel('x', fontsize=12)
ax1.set_ylabel('f(x)', fontsize=12)
ax1.set_title(f'Gradient Descent Path (Ξ±={learning_rate})', fontsize=14)
ax1.legend(fontsize=11)
ax1.grid(True, alpha=0.3)
# Plot 2: objective value per iteration (convergence curve)
ax2.plot(history_f, 'o-', linewidth=2, markersize=6)
ax2.axhline(y=5, color='r', linestyle='--', linewidth=2, label='True minimum')
ax2.set_xlabel('Iteration', fontsize=12)
ax2.set_ylabel('f(x)', fontsize=12)
ax2.set_title('Convergence Plot', fontsize=14)
ax2.legend(fontsize=11)
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Report start/end iterates against the known analytic minimum (x = 0, f = 5).
print("=== Gradient Descent Results ===")
print(f"Initial x: {history_x[0]:.6f}")
print(f"Final x: {history_x[-1]:.6f}")
print(f"True minimum: x = 0")
print(f"\nInitial f(x): {history_f[0]:.6f}")
print(f"Final f(x): {history_f[-1]:.6f}")
print(f"True minimum: f(x) = 5")
3. Effect of Learning Rate
The learning rate is crucial:
Too small: Slow convergence
Too large: Overshooting, divergence
Just right: Fast, stable convergence
# Compare different learning rates on the same 1D problem and start point.
learning_rates = [0.01, 0.1, 0.5, 0.9]
colors = ['blue', 'green', 'orange', 'red']
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()  # flatten the 2x2 grid for simple linear indexing
for idx, (lr, color) in enumerate(zip(learning_rates, colors)):
    history_x, history_f = gradient_descent_1d(initial_x=8.0,
                                               learning_rate=lr,
                                               n_iterations=30)
    # Plot function (faint, for context)
    x_range = np.linspace(-10, 10, 200)
    axes[idx].plot(x_range, f(x_range), 'b-', linewidth=2, alpha=0.3)
    # Plot the descent path for this learning rate
    axes[idx].plot(history_x, history_f, 'o-', color=color,
                   linewidth=2, markersize=6, alpha=0.8)
    axes[idx].plot(history_x[0], history_f[0], 'g*', markersize=15)
    # Labels
    axes[idx].set_xlabel('x', fontsize=11)
    axes[idx].set_ylabel('f(x)', fontsize=11)
    axes[idx].set_title(f'Learning Rate Ξ± = {lr}\nFinal x = {history_x[-1]:.4f}',
                        fontsize=12)
    axes[idx].grid(True, alpha=0.3)
    # Set y-limits for better visualization (the largest rate oscillates far up the bowl)
    if lr < 0.9:
        axes[idx].set_ylim([0, 100])
    else:
        axes[idx].set_ylim([0, 500])
plt.suptitle('Effect of Learning Rate on Convergence', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
print("Observations:")
print("- Ξ±=0.01: Very slow, steady progress")
print("- Ξ±=0.1: Good balance, smooth convergence")
print("- Ξ±=0.5: Faster but some oscillation")
print("- Ξ±=0.9: Too large! Oscillates wildly, may diverge")
4. 2D Gradient Descent
Let's visualize gradient descent on a 2D surface:
This is a simple paraboloid (bowl shape).
def f_2d(x, y):
    """2D paraboloid (a bowl): f(x, y) = x**2 + y**2, minimized at the origin."""
    return y**2 + x**2
def grad_f_2d(x, y):
    """Gradient of the paraboloid: [df/dx, df/dy] = [2x, 2y]."""
    return np.array([x * 2, y * 2])
def gradient_descent_2d(initial_pos, learning_rate, n_iterations):
    """Vanilla gradient descent on f_2d.

    Returns an (n_iterations + 1, 2) array of visited positions,
    with the starting position as the first row.
    """
    point = np.array(initial_pos, dtype=float)
    trajectory = [point.copy()]
    for _ in range(n_iterations):
        direction = grad_f_2d(point[0], point[1])
        point = point - learning_rate * direction
        trajectory.append(point.copy())
    return np.array(trajectory)
# Run gradient descent from a corner of the bowl.
initial_pos = [5, 5]
learning_rate = 0.1
history = gradient_descent_2d(initial_pos, learning_rate, n_iterations=30)
# Create meshgrid for visualization
x = np.linspace(-6, 6, 100)
y = np.linspace(-6, 6, 100)
X, Y = np.meshgrid(x, y)
Z = f_2d(X, Y)
# Visualize: surface, contour, and convergence side by side.
fig = plt.figure(figsize=(16, 6))
# 3D surface plot with the descent path drawn on top
ax1 = fig.add_subplot(131, projection='3d')
surf = ax1.plot_surface(X, Y, Z, cmap=cm.viridis, alpha=0.6)
ax1.plot(history[:, 0], history[:, 1],
         [f_2d(p[0], p[1]) for p in history],
         'ro-', linewidth=2, markersize=5, label='GD path')
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_zlabel('f(x,y)')
ax1.set_title('3D Surface with GD Path')
# Contour plot with the same path seen from above
ax2 = fig.add_subplot(132)
contour = ax2.contour(X, Y, Z, levels=30, cmap=cm.viridis)
ax2.clabel(contour, inline=True, fontsize=8)
ax2.plot(history[:, 0], history[:, 1], 'ro-', linewidth=2, markersize=6)
ax2.plot(history[0, 0], history[0, 1], 'g*', markersize=20, label='Start')
ax2.plot(history[-1, 0], history[-1, 1], 'r*', markersize=20, label='End')
ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('Contour Plot with GD Path')
ax2.legend()
ax2.grid(True, alpha=0.3)
# Convergence plot: objective value at each iterate
ax3 = fig.add_subplot(133)
f_values = [f_2d(p[0], p[1]) for p in history]
ax3.plot(f_values, 'o-', linewidth=2, markersize=6)
ax3.axhline(y=0, color='r', linestyle='--', linewidth=2, label='True minimum')
ax3.set_xlabel('Iteration')
ax3.set_ylabel('f(x,y)')
ax3.set_title('Convergence')
ax3.legend()
ax3.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Initial position: {history[0]}")
print(f"Final position: {history[-1]}")
print(f"Distance from minimum: {np.linalg.norm(history[-1]):.6f}")
5. Non-Convex Functions: Local Minima
Real-world loss surfaces in deep learning are highly non-convex — they contain many local minima, saddle points, and flat plateaus. Gradient descent is a greedy algorithm: it always moves downhill from its current position, so it can get trapped in a local minimum that is far from the global best. The code below demonstrates this by running gradient descent from four different starting points on a polynomial with multiple valleys. Each run converges to a different minimum, illustrating why initialization strategy matters and why techniques like random restarts, momentum, and stochastic noise (SGD) help escape poor local optima in practice.
def f_nonconvex(x):
    """Non-convex quartic with multiple local minima:
    f(x) = x^4 - 5x^3 + 4x^2 + 3x.

    Vectorized: accepts scalars or NumPy arrays.
    """
    return x**4 - 5*x**3 + 4*x**2 + 3*x
def grad_f_nonconvex(x):
    """Derivative of f_nonconvex: f'(x) = 4x^3 - 15x^2 + 8x + 3."""
    return 4*x**3 - 15*x**2 + 8*x + 3
# Try gradient descent from several different starting points on the
# non-convex quartic; each basin of attraction yields a different minimum.
starting_points = [-1, 0, 2, 4]
learning_rate = 0.01
n_iterations = 100
x_range = np.linspace(-2, 5, 300)
y_range = f_nonconvex(x_range)
plt.figure(figsize=(14, 6))
plt.plot(x_range, y_range, 'b-', linewidth=2, label='f(x)')
colors = ['red', 'green', 'purple', 'orange']
for start_x, color in zip(starting_points, colors):
    # Inline gradient-descent loop for this starting point.
    x = start_x
    history_x = [x]
    for _ in range(n_iterations):
        gradient = grad_f_nonconvex(x)
        x = x - learning_rate * gradient
        history_x.append(x)
    history_x = np.array(history_x)
    history_y = f_nonconvex(history_x)
    plt.plot(history_x, history_y, 'o-', color=color, alpha=0.6,
             linewidth=1.5, markersize=3,
             label=f'Start: x={start_x:.1f}, End: x={history_x[-1]:.2f}')
    plt.plot(start_x, f_nonconvex(start_x), '*', color=color, markersize=15)
plt.xlabel('x', fontsize=12)
plt.ylabel('f(x)', fontsize=12)
plt.title('Non-Convex Function: Different Starting Points Lead to Different Minima',
          fontsize=14)
plt.legend(fontsize=10)
plt.grid(True, alpha=0.3)
plt.show()
print("Notice: Different starting points converge to different local minima!")
print("This is a key challenge in deep learning.")
6. Stochastic Gradient Descent (SGD)
In ML, we often minimize loss over a dataset:
Three variants:
Batch GD: Use all data for each update (slow for large datasets)
Stochastic GD (SGD): Use one random sample per update (noisy but fast)
Mini-batch GD: Use small batch of samples (best of both worlds)
# Simulate linear regression data: y = 2x + 3 + Gaussian noise.
np.random.seed(42)  # reseed so this section is reproducible on its own
n_samples = 100
X_data = np.random.randn(n_samples)
y_data = 2 * X_data + 3 + np.random.randn(n_samples) * 0.5
def loss(w, b, X, y):
    """Mean squared error of the linear model w*X + b against targets y."""
    residuals = w * X + b - y
    return np.mean(residuals**2)
def grad_loss(w, b, X, y):
    """Analytic gradients of the MSE loss for the 1D linear model.

    Returns (d_loss/d_w, d_loss/d_b), each averaged over the samples.
    """
    residual = w * X + b - y
    grad_w = 2 * np.mean(residual * X)
    grad_b = 2 * np.mean(residual)
    return grad_w, grad_b
# Batch Gradient Descent
def batch_gd(X, y, learning_rate=0.01, n_iterations=100):
    """Full-batch gradient descent for 1D linear regression y ~ w*x + b.

    Every parameter update uses the gradient computed over the ENTIRE
    dataset. Returns (w, b, per-iteration loss history).
    """
    w = 0.0
    b = 0.0
    losses = []
    for _ in range(n_iterations):
        # One exact gradient step over all samples.
        gw, gb = grad_loss(w, b, X, y)
        w = w - learning_rate * gw
        b = b - learning_rate * gb
        losses.append(loss(w, b, X, y))
    return w, b, losses
# Stochastic Gradient Descent
def stochastic_gd(X, y, learning_rate=0.01, n_epochs=10):
    """Stochastic gradient descent for 1D linear regression.

    Each update uses ONE uniformly sampled data point (with replacement),
    performing len(X) such updates per epoch. Returns (w, b, loss history),
    where the full-dataset loss is recorded after every single update.
    """
    w = 0.0
    b = 0.0
    losses = []
    n = len(X)
    for _ in range(n_epochs):
        for _ in range(n):
            # Draw one random sample and step on its gradient alone.
            idx = np.random.randint(n)
            gw, gb = grad_loss(w, b, X[idx:idx+1], y[idx:idx+1])
            w = w - learning_rate * gw
            b = b - learning_rate * gb
            losses.append(loss(w, b, X, y))
    return w, b, losses
# Mini-batch Gradient Descent
def minibatch_gd(X, y, learning_rate=0.01, batch_size=10, n_epochs=10):
    """Mini-batch gradient descent for 1D linear regression.

    Each epoch shuffles the sample order, then steps once per batch of
    `batch_size` samples. Returns (w, b, loss history) with the
    full-dataset loss recorded after every batch update.
    """
    w = 0.0
    b = 0.0
    losses = []
    n = len(X)
    for _ in range(n_epochs):
        order = np.random.permutation(n)  # fresh shuffle each epoch
        for start in range(0, n, batch_size):
            batch = order[start:start + batch_size]
            gw, gb = grad_loss(w, b, X[batch], y[batch])
            w = w - learning_rate * gw
            b = b - learning_rate * gb
            losses.append(loss(w, b, X, y))
    return w, b, losses
# Run all three methods on the same data (step sizes tuned per method).
w_batch, b_batch, loss_batch = batch_gd(X_data, y_data, learning_rate=0.1, n_iterations=100)
w_sgd, b_sgd, loss_sgd = stochastic_gd(X_data, y_data, learning_rate=0.01, n_epochs=10)
w_mb, b_mb, loss_mb = minibatch_gd(X_data, y_data, learning_rate=0.05, batch_size=10, n_epochs=10)
# Visualize loss curves (log scale makes late-stage progress visible)
plt.figure(figsize=(14, 6))
plt.plot(loss_batch, label='Batch GD', linewidth=2)
plt.plot(loss_sgd, label='Stochastic GD', linewidth=1, alpha=0.7)
plt.plot(loss_mb, label='Mini-batch GD', linewidth=2)
plt.xlabel('Iteration', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Comparison of Gradient Descent Variants', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.yscale('log')
plt.show()
# Compare the fitted parameters against the true generating values.
print("=== Results ===")
print(f"True parameters: w=2, b=3")
print(f"\nBatch GD: w={w_batch:.4f}, b={b_batch:.4f}")
print(f"Stochastic GD: w={w_sgd:.4f}, b={b_sgd:.4f}")
print(f"Mini-batch GD: w={w_mb:.4f}, b={b_mb:.4f}")
print("\nObservations:")
print("- Batch GD: Smooth convergence, but slow per iteration")
print("- SGD: Noisy but can escape local minima, fast updates")
print("- Mini-batch: Good balance, most commonly used in practice")
7. Momentum: Accelerating Convergence
Idea: Add "momentum" to smooth out oscillations and accelerate in consistent directions.
Update rule: \(v_t = \beta v_{t-1} + \nabla f(\theta_t)\) and \(\theta_{t+1} = \theta_t - \alpha v_t\)
Where \(\beta\) (typically 0.9) controls how much past gradients influence the current update.
def gradient_descent_with_momentum(initial_pos, learning_rate, momentum, n_iterations):
    """Heavy-ball gradient descent on f_2d.

    Maintains a velocity v <- momentum * v + grad and steps
    pos <- pos - learning_rate * v. Returns the (n_iterations + 1, 2)
    trajectory including the starting position.
    """
    point = np.array(initial_pos, dtype=float)
    vel = np.zeros_like(point)
    path = [point.copy()]
    for _ in range(n_iterations):
        # Accumulate the gradient into the running velocity.
        vel = momentum * vel + grad_f_2d(point[0], point[1])
        point = point - learning_rate * vel
        path.append(point.copy())
    return np.array(path)
# Compare plain gradient descent with momentum GD on the 2D bowl.
initial_pos = [5, 5]
learning_rate = 0.05
n_iterations = 50
history_no_momentum = gradient_descent_2d(initial_pos, learning_rate, n_iterations)
history_with_momentum = gradient_descent_with_momentum(initial_pos, learning_rate,
                                                       momentum=0.9, n_iterations=n_iterations)
# Visualize both trajectories on the same contour landscape.
x = np.linspace(-6, 6, 100)
y = np.linspace(-6, 6, 100)
X, Y = np.meshgrid(x, y)
Z = f_2d(X, Y)
plt.figure(figsize=(14, 6))
# Left panel: without momentum
plt.subplot(1, 2, 1)
contour = plt.contour(X, Y, Z, levels=20, cmap=cm.viridis)
plt.clabel(contour, inline=True, fontsize=8)
plt.plot(history_no_momentum[:, 0], history_no_momentum[:, 1],
         'ro-', linewidth=2, markersize=5, label='Without momentum')
plt.plot(history_no_momentum[0, 0], history_no_momentum[0, 1], 'g*', markersize=20)
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Standard Gradient Descent', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
# Right panel: with momentum
plt.subplot(1, 2, 2)
contour = plt.contour(X, Y, Z, levels=20, cmap=cm.viridis)
plt.clabel(contour, inline=True, fontsize=8)
plt.plot(history_with_momentum[:, 0], history_with_momentum[:, 1],
         'bo-', linewidth=2, markersize=5, label='With momentum (Ξ²=0.9)')
plt.plot(history_with_momentum[0, 0], history_with_momentum[0, 1], 'g*', markersize=20)
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Gradient Descent with Momentum', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Without momentum - Final position: {history_no_momentum[-1]}")
print(f"With momentum - Final position: {history_with_momentum[-1]}")
print("\nMomentum often converges faster and more smoothly!")
8. Advanced Optimizers Overview
Modern deep learning uses sophisticated optimizers:
Optimizer |
Key Feature |
|---|---|
SGD |
Simple, proven, requires tuning |
Momentum |
Accelerates in consistent directions |
AdaGrad |
Adapts learning rate per parameter |
RMSprop |
Fixes AdaGrad's aggressive decay |
Adam |
Combines momentum + adaptive learning rates (most popular!) |
AdamW |
Adam with better weight decay |
Rule of thumb: Start with Adam, it works well for most problems.
# Simple Adam optimizer implementation
def adam_optimizer(initial_pos, learning_rate=0.1, beta1=0.9, beta2=0.999,
epsilon=1e-8, n_iterations=50):
"""
Adam optimizer
beta1: exponential decay rate for first moment (momentum)
beta2: exponential decay rate for second moment (adaptive learning rate)
"""
pos = np.array(initial_pos, dtype=float)
m = np.zeros_like(pos) # First moment
v = np.zeros_like(pos) # Second moment
history = [pos.copy()]
for t in range(1, n_iterations + 1):
gradient = grad_f_2d(pos[0], pos[1])
# Update biased first moment estimate
m = beta1 * m + (1 - beta1) * gradient
# Update biased second moment estimate
v = beta2 * v + (1 - beta2) * gradient**2
# Compute bias-corrected moment estimates
m_hat = m / (1 - beta1**t)
v_hat = v / (1 - beta2**t)
# Update parameters
pos = pos - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
history.append(pos.copy())
return np.array(history)
# Compare optimizers: plain GD, momentum, and Adam from the same start.
initial_pos = [5, 5]
n_iterations = 30
history_sgd = gradient_descent_2d(initial_pos, learning_rate=0.1, n_iterations=n_iterations)
history_momentum = gradient_descent_with_momentum(initial_pos, learning_rate=0.1,
                                                  momentum=0.9, n_iterations=n_iterations)
history_adam = adam_optimizer(initial_pos, learning_rate=0.5, n_iterations=n_iterations)
# Visualize all three trajectories on one contour plot.
x = np.linspace(-6, 6, 100)
y = np.linspace(-6, 6, 100)
X, Y = np.meshgrid(x, y)
Z = f_2d(X, Y)
plt.figure(figsize=(14, 6))
contour = plt.contour(X, Y, Z, levels=20, cmap=cm.viridis, alpha=0.3)
plt.clabel(contour, inline=True, fontsize=8)
plt.plot(history_sgd[:, 0], history_sgd[:, 1],
         'r.-', linewidth=2, markersize=6, label='SGD', alpha=0.7)
plt.plot(history_momentum[:, 0], history_momentum[:, 1],
         'g.-', linewidth=2, markersize=6, label='Momentum', alpha=0.7)
plt.plot(history_adam[:, 0], history_adam[:, 1],
         'b.-', linewidth=2, markersize=6, label='Adam', alpha=0.7)
plt.plot(initial_pos[0], initial_pos[1], 'k*', markersize=20, label='Start')
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Comparison of Different Optimizers', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()
print("Final positions after 30 iterations:")
print(f"SGD: {history_sgd[-1]}")
print(f"Momentum: {history_momentum[-1]}")
print(f"Adam: {history_adam[-1]}")
print("\nAdam typically converges fastest with less tuning needed!")
Summary
You've learned:
✓ Gradient descent algorithm: Follow negative gradient to minimize loss ✓ Learning rate: Critical hyperparameter - too small (slow) vs too large (diverge) ✓ 1D and 2D visualization: How GD navigates loss landscapes ✓ Local minima: Non-convex functions can trap optimization ✓ Batch vs SGD vs Mini-batch: Tradeoffs between accuracy and speed ✓ Momentum: Accelerates convergence by building velocity ✓ Advanced optimizers: Adam combines best features, most popular choice
Key Insights:
Gradient descent is the foundation of training neural networks
Learning rate is crucial - use learning rate schedules in practice
Mini-batch + Adam is the standard combination for deep learning
Visualization helps understand optimizer behavior
Next Steps:
Study learning rate schedules (decay, warmup)
Explore second-order methods (Newton's method, L-BFGS)
Learn about batch normalization and its interaction with optimizers
Practice implementing backpropagation + GD for neural networks