Gradient Descent

Gradient descent variants (SGD, mini-batch, Adam, momentum), learning rates, and convergence analysis.

# Import required libraries
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import seaborn as sns

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
np.random.seed(42)

1. The Gradient Descent Algorithm

Goal: Find the minimum of a function \(f(\theta)\)

Algorithm:

  1. Initialize parameters: \(\theta_0\)

  2. Repeat until convergence:

    • Compute gradient: \(g = \nabla f(\theta)\)

    • Update: \(\theta_{new} = \theta_{old} - \alpha \cdot g\)

Where:

  • \(\alpha\) is the learning rate (step size)

  • \(\nabla f(\theta)\) is the gradient (direction of steepest ascent)

  • We move in the negative gradient direction (to descend)

2. Simple 1D Example

Starting with the function \(f(x) = x^2 + 5\) strips gradient descent down to its essence. The gradient is simply \(f'(x) = 2x\), so at any point the update rule becomes \(x_{\text{new}} = x - \alpha \cdot 2x = x(1 - 2\alpha)\). With \(\alpha = 0.1\), each step multiplies \(x\) by \(0.8\), geometrically shrinking the distance to the minimum at \(x = 0\). Watching this process on a plot builds the intuition that gradient descent is just "roll downhill" — the steeper the slope, the bigger the step.

def f(x):
    """Quadratic objective f(x) = x**2 + 5; global minimum f(0) = 5 at x = 0."""
    return x * x + 5

def grad_f(x):
    """First derivative of the quadratic objective: f'(x) = 2x."""
    return x * 2

def gradient_descent_1d(initial_x, learning_rate, n_iterations):
    """Run vanilla gradient descent on the 1D objective `f`.

    Args:
        initial_x: starting point.
        learning_rate: step size multiplied into each gradient.
        n_iterations: number of update steps to take.

    Returns:
        (xs, fs) as numpy arrays of length n_iterations + 1 — the visited
        points and their objective values, including the starting point.
    """
    current = initial_x
    xs = [current]
    fs = [f(current)]

    for _ in range(n_iterations):
        # Step against the gradient: x <- x - alpha * f'(x)
        current = current - learning_rate * grad_f(current)
        xs.append(current)
        fs.append(f(current))

    return np.array(xs), np.array(fs)

# Run gradient descent
# With alpha = 0.1 on f(x) = x^2 + 5, each step scales x by 0.8,
# so 20 iterations bring x = 8 very close to the minimum at 0.
initial_x = 8.0
learning_rate = 0.1
n_iterations = 20

history_x, history_f = gradient_descent_1d(initial_x, learning_rate, n_iterations)

# Visualize
# Dense grid used only to draw the smooth curve of f.
x_range = np.linspace(-10, 10, 200)
y_range = f(x_range)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Plot 1: Descent path on function
ax1.plot(x_range, y_range, 'b-', linewidth=2, label='f(x) = xΒ² + 5')
ax1.plot(history_x, history_f, 'ro-', markersize=8, linewidth=1.5, 
         alpha=0.7, label='Gradient descent path')
ax1.plot(history_x[0], history_f[0], 'g*', markersize=20, label='Start')
ax1.plot(history_x[-1], history_f[-1], 'r*', markersize=20, label='End')

# Add arrows to show direction (every second step, to avoid clutter)
for i in range(0, len(history_x)-1, 2):
    ax1.annotate('', xy=(history_x[i+1], history_f[i+1]), 
                xytext=(history_x[i], history_f[i]),
                arrowprops=dict(arrowstyle='->', color='red', lw=1.5, alpha=0.5))

ax1.set_xlabel('x', fontsize=12)
ax1.set_ylabel('f(x)', fontsize=12)
ax1.set_title(f'Gradient Descent Path (Ξ±={learning_rate})', fontsize=14)
ax1.legend(fontsize=11)
ax1.grid(True, alpha=0.3)

# Plot 2: Convergence over iterations
ax2.plot(history_f, 'o-', linewidth=2, markersize=6)
# Reference line: the true minimum value f(0) = 5.
ax2.axhline(y=5, color='r', linestyle='--', linewidth=2, label='True minimum')
ax2.set_xlabel('Iteration', fontsize=12)
ax2.set_ylabel('f(x)', fontsize=12)
ax2.set_title('Convergence Plot', fontsize=14)
ax2.legend(fontsize=11)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Report start/end against the known analytic optimum.
print("=== Gradient Descent Results ===")
print(f"Initial x: {history_x[0]:.6f}")
print(f"Final x: {history_x[-1]:.6f}")
print(f"True minimum: x = 0")
print(f"\nInitial f(x): {history_f[0]:.6f}")
print(f"Final f(x): {history_f[-1]:.6f}")
print(f"True minimum: f(x) = 5")

3. Effect of Learning Rate

The learning rate is crucial:

  • Too small: slow convergence

  • Too large: overshooting, divergence

  • Just right: fast, stable convergence

# Compare different learning rates
# For f(x) = x^2, steps contract x by (1 - 2*alpha): alpha < 0.5 converges
# monotonically, alpha near 1 oscillates around the minimum.
learning_rates = [0.01, 0.1, 0.5, 0.9]
colors = ['blue', 'green', 'orange', 'red']

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()

# One subplot per learning rate, all starting from the same x0 = 8.
for idx, (lr, color) in enumerate(zip(learning_rates, colors)):
    history_x, history_f = gradient_descent_1d(initial_x=8.0, 
                                                learning_rate=lr, 
                                                n_iterations=30)
    
    # Plot function
    x_range = np.linspace(-10, 10, 200)
    axes[idx].plot(x_range, f(x_range), 'b-', linewidth=2, alpha=0.3)
    
    # Plot path
    axes[idx].plot(history_x, history_f, 'o-', color=color, 
                   linewidth=2, markersize=6, alpha=0.8)
    axes[idx].plot(history_x[0], history_f[0], 'g*', markersize=15)
    
    # Labels
    axes[idx].set_xlabel('x', fontsize=11)
    axes[idx].set_ylabel('f(x)', fontsize=11)
    axes[idx].set_title(f'Learning Rate Ξ± = {lr}\nFinal x = {history_x[-1]:.4f}', 
                       fontsize=12)
    axes[idx].grid(True, alpha=0.3)
    
    # Set y-limits for better visualization
    # (the largest rate overshoots badly, so it needs a taller axis)
    if lr < 0.9:
        axes[idx].set_ylim([0, 100])
    else:
        axes[idx].set_ylim([0, 500])

plt.suptitle('Effect of Learning Rate on Convergence', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

print("Observations:")
print("- Ξ±=0.01: Very slow, steady progress")
print("- Ξ±=0.1: Good balance, smooth convergence")
print("- Ξ±=0.5: Faster but some oscillation")
print("- Ξ±=0.9: Too large! Oscillates wildly, may diverge")

4. 2D Gradient Descent

Let's visualize gradient descent on a 2D surface:

\[f(x, y) = x^2 + y^2\]

This is a simple paraboloid (bowl shape).

def f_2d(x, y):
    """Paraboloid f(x, y) = x^2 + y^2; global minimum 0 at the origin."""
    return x * x + y * y

def grad_f_2d(x, y):
    """Gradient of the paraboloid: [df/dx, df/dy] = [2x, 2y]."""
    return np.array((x * 2, y * 2))

def gradient_descent_2d(initial_pos, learning_rate, n_iterations):
    """Run vanilla gradient descent on the 2D objective `f_2d`.

    Args:
        initial_pos: starting (x, y), any sequence of two numbers.
        learning_rate: step size.
        n_iterations: number of update steps.

    Returns:
        numpy array of shape (n_iterations + 1, 2): all visited positions,
        starting point included.
    """
    point = np.asarray(initial_pos, dtype=float).copy()
    trajectory = [point.copy()]

    for _ in range(n_iterations):
        point = point - learning_rate * grad_f_2d(point[0], point[1])
        trajectory.append(point.copy())

    return np.array(trajectory)

# Run gradient descent
initial_pos = [5, 5]
learning_rate = 0.1
history = gradient_descent_2d(initial_pos, learning_rate, n_iterations=30)

# Create meshgrid for visualization
# 100x100 grid covering the region the descent path traverses.
x = np.linspace(-6, 6, 100)
y = np.linspace(-6, 6, 100)
X, Y = np.meshgrid(x, y)
Z = f_2d(X, Y)

# Visualize
fig = plt.figure(figsize=(16, 6))

# 3D surface plot
# The descent path is lifted onto the surface by evaluating f_2d at each step.
ax1 = fig.add_subplot(131, projection='3d')
surf = ax1.plot_surface(X, Y, Z, cmap=cm.viridis, alpha=0.6)
ax1.plot(history[:, 0], history[:, 1], 
         [f_2d(p[0], p[1]) for p in history],
         'ro-', linewidth=2, markersize=5, label='GD path')
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_zlabel('f(x,y)')
ax1.set_title('3D Surface with GD Path')

# Contour plot with path
ax2 = fig.add_subplot(132)
contour = ax2.contour(X, Y, Z, levels=30, cmap=cm.viridis)
ax2.clabel(contour, inline=True, fontsize=8)
ax2.plot(history[:, 0], history[:, 1], 'ro-', linewidth=2, markersize=6)
ax2.plot(history[0, 0], history[0, 1], 'g*', markersize=20, label='Start')
ax2.plot(history[-1, 0], history[-1, 1], 'r*', markersize=20, label='End')
ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('Contour Plot with GD Path')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Convergence plot
ax3 = fig.add_subplot(133)
f_values = [f_2d(p[0], p[1]) for p in history]
ax3.plot(f_values, 'o-', linewidth=2, markersize=6)
# f_2d's true minimum value is 0 at the origin.
ax3.axhline(y=0, color='r', linestyle='--', linewidth=2, label='True minimum')
ax3.set_xlabel('Iteration')
ax3.set_ylabel('f(x,y)')
ax3.set_title('Convergence')
ax3.legend()
ax3.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Initial position: {history[0]}")
print(f"Final position: {history[-1]}")
# Euclidean distance of the final iterate from the minimizer (0, 0).
print(f"Distance from minimum: {np.linalg.norm(history[-1]):.6f}")

5. Non-Convex Functions: Local Minima

Real-world loss surfaces in deep learning are highly non-convex — they contain many local minima, saddle points, and flat plateaus. Gradient descent is a greedy algorithm: it always moves downhill from its current position, so it can get trapped in a local minimum that is far from the global best. The code below demonstrates this by running gradient descent from four different starting points on a polynomial with multiple valleys. Each run converges to a different minimum, illustrating why initialization strategy matters and why techniques like random restarts, momentum, and stochastic noise (SGD) help escape poor local optima in practice.

def f_nonconvex(x):
    """Quartic with multiple local minima: x^4 - 5x^3 + 4x^2 + 3x (Horner form)."""
    return ((x - 5) * x + 4) * x * x + 3 * x

def grad_f_nonconvex(x):
    """Derivative of the quartic: 4x^3 - 15x^2 + 8x + 3 (Horner form)."""
    return ((4 * x - 15) * x + 8) * x + 3

# Try different starting points
# Each start sits in (or rolls into) a different basin of the quartic.
starting_points = [-1, 0, 2, 4]
learning_rate = 0.01
n_iterations = 100

x_range = np.linspace(-2, 5, 300)
y_range = f_nonconvex(x_range)

plt.figure(figsize=(14, 6))
plt.plot(x_range, y_range, 'b-', linewidth=2, label='f(x)')

colors = ['red', 'green', 'purple', 'orange']

# Run plain gradient descent independently from each starting point.
for start_x, color in zip(starting_points, colors):
    x = start_x
    history_x = [x]
    
    for _ in range(n_iterations):
        gradient = grad_f_nonconvex(x)
        x = x - learning_rate * gradient
        history_x.append(x)
    
    history_x = np.array(history_x)
    history_y = f_nonconvex(history_x)
    
    plt.plot(history_x, history_y, 'o-', color=color, alpha=0.6,
            linewidth=1.5, markersize=3, 
            label=f'Start: x={start_x:.1f}, End: x={history_x[-1]:.2f}')
    plt.plot(start_x, f_nonconvex(start_x), '*', color=color, markersize=15)

plt.xlabel('x', fontsize=12)
plt.ylabel('f(x)', fontsize=12)
plt.title('Non-Convex Function: Different Starting Points Lead to Different Minima', 
         fontsize=14)
plt.legend(fontsize=10)
plt.grid(True, alpha=0.3)
plt.show()

print("Notice: Different starting points converge to different local minima!")
print("This is a key challenge in deep learning.")

6. Stochastic Gradient Descent (SGD)

In ML, we often minimize loss over a dataset:

\[L(\theta) = \frac{1}{n} \sum_{i=1}^{n} \ell(\theta, x_i, y_i)\]

Three variants:

  1. Batch GD: Use all data for each update (slow for large datasets)

  2. Stochastic GD (SGD): Use one random sample per update (noisy but fast)

  3. Mini-batch GD: Use small batch of samples (best of both worlds)

# Simulate linear regression: y = 2x + 3 + noise
# Reseed so the synthetic dataset is reproducible regardless of earlier cells.
np.random.seed(42)
n_samples = 100
X_data = np.random.randn(n_samples)
# Ground-truth parameters w=2, b=3 plus Gaussian noise with std 0.5.
y_data = 2 * X_data + 3 + np.random.randn(n_samples) * 0.5

def loss(w, b, X, y):
    """Mean squared error of the linear model y_hat = w*X + b on (X, y)."""
    residuals = (w * X + b) - y
    return np.mean(residuals ** 2)

def grad_loss(w, b, X, y):
    """Analytic gradient of the MSE loss with respect to (w, b).

    Returns (grad_w, grad_b) where grad_w = 2*mean(residual*X) and
    grad_b = 2*mean(residual), with residual = (w*X + b) - y.
    """
    residuals = (w * X + b) - y
    grad_w = 2 * np.mean(residuals * X)
    grad_b = 2 * np.mean(residuals)
    return grad_w, grad_b

# Batch Gradient Descent
def batch_gd(X, y, learning_rate=0.01, n_iterations=100):
    w, b = 0.0, 0.0
    history_loss = []
    
    for i in range(n_iterations):
        # Use ALL data
        grad_w, grad_b = grad_loss(w, b, X, y)
        w -= learning_rate * grad_w
        b -= learning_rate * grad_b
        history_loss.append(loss(w, b, X, y))
    
    return w, b, history_loss

# Stochastic Gradient Descent
def stochastic_gd(X, y, learning_rate=0.01, n_epochs=10):
    w, b = 0.0, 0.0
    history_loss = []
    
    for epoch in range(n_epochs):
        for i in range(len(X)):
            # Use ONE random sample
            idx = np.random.randint(len(X))
            grad_w, grad_b = grad_loss(w, b, X[idx:idx+1], y[idx:idx+1])
            w -= learning_rate * grad_w
            b -= learning_rate * grad_b
            history_loss.append(loss(w, b, X, y))
    
    return w, b, history_loss

# Mini-batch Gradient Descent
def minibatch_gd(X, y, learning_rate=0.01, batch_size=10, n_epochs=10):
    w, b = 0.0, 0.0
    history_loss = []
    
    for epoch in range(n_epochs):
        indices = np.random.permutation(len(X))
        for i in range(0, len(X), batch_size):
            # Use a BATCH of samples
            batch_indices = indices[i:i+batch_size]
            grad_w, grad_b = grad_loss(w, b, X[batch_indices], y[batch_indices])
            w -= learning_rate * grad_w
            b -= learning_rate * grad_b
            history_loss.append(loss(w, b, X, y))
    
    return w, b, history_loss

# Run all three methods
# Learning rates differ per method: per-sample updates need a smaller step.
w_batch, b_batch, loss_batch = batch_gd(X_data, y_data, learning_rate=0.1, n_iterations=100)
w_sgd, b_sgd, loss_sgd = stochastic_gd(X_data, y_data, learning_rate=0.01, n_epochs=10)
w_mb, b_mb, loss_mb = minibatch_gd(X_data, y_data, learning_rate=0.05, batch_size=10, n_epochs=10)

# Visualize
# NOTE: the x-axis counts parameter UPDATES, not epochs, so the three
# curves have different lengths (100 vs 1000 vs 100 updates).
plt.figure(figsize=(14, 6))
plt.plot(loss_batch, label='Batch GD', linewidth=2)
plt.plot(loss_sgd, label='Stochastic GD', linewidth=1, alpha=0.7)
plt.plot(loss_mb, label='Mini-batch GD', linewidth=2)
plt.xlabel('Iteration', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Comparison of Gradient Descent Variants', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
# Log scale makes the small late-stage loss differences visible.
plt.yscale('log')
plt.show()

print("=== Results ===")
print(f"True parameters: w=2, b=3")
print(f"\nBatch GD:      w={w_batch:.4f}, b={b_batch:.4f}")
print(f"Stochastic GD: w={w_sgd:.4f}, b={b_sgd:.4f}")
print(f"Mini-batch GD: w={w_mb:.4f}, b={b_mb:.4f}")
print("\nObservations:")
print("- Batch GD: Smooth convergence, but slow per iteration")
print("- SGD: Noisy but can escape local minima, fast updates")
print("- Mini-batch: Good balance, most commonly used in practice")

7. Momentum: Accelerating Convergence

Idea: Add "momentum" to smooth out oscillations and accelerate in consistent directions.

Update rule:

\[v_t = \beta v_{t-1} + \nabla f(\theta_t)\]

\[\theta_{t+1} = \theta_t - \alpha v_t\]

Where \(\beta\) (typically 0.9) controls how much past gradients influence the current update.

def gradient_descent_with_momentum(initial_pos, learning_rate, momentum, n_iterations):
    """2D gradient descent with classic (heavy-ball) momentum on `f_2d`.

    Update rule per step:
        v <- momentum * v + grad
        pos <- pos - learning_rate * v

    Returns:
        numpy array of shape (n_iterations + 1, 2): visited positions,
        starting point included.
    """
    point = np.asarray(initial_pos, dtype=float).copy()
    vel = np.zeros_like(point)
    trail = [point.copy()]

    for _ in range(n_iterations):
        vel = momentum * vel + grad_f_2d(point[0], point[1])
        point = point - learning_rate * vel
        trail.append(point.copy())

    return np.array(trail)

# Compare with and without momentum
# Same start, same step size — only the velocity term differs.
initial_pos = [5, 5]
learning_rate = 0.05
n_iterations = 50

history_no_momentum = gradient_descent_2d(initial_pos, learning_rate, n_iterations)
history_with_momentum = gradient_descent_with_momentum(initial_pos, learning_rate, 
                                                        momentum=0.9, n_iterations=n_iterations)

# Visualize
x = np.linspace(-6, 6, 100)
y = np.linspace(-6, 6, 100)
X, Y = np.meshgrid(x, y)
Z = f_2d(X, Y)

plt.figure(figsize=(14, 6))

# Without momentum
plt.subplot(1, 2, 1)
contour = plt.contour(X, Y, Z, levels=20, cmap=cm.viridis)
plt.clabel(contour, inline=True, fontsize=8)
plt.plot(history_no_momentum[:, 0], history_no_momentum[:, 1], 
         'ro-', linewidth=2, markersize=5, label='Without momentum')
plt.plot(history_no_momentum[0, 0], history_no_momentum[0, 1], 'g*', markersize=20)
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Standard Gradient Descent', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)

# With momentum
plt.subplot(1, 2, 2)
contour = plt.contour(X, Y, Z, levels=20, cmap=cm.viridis)
plt.clabel(contour, inline=True, fontsize=8)
plt.plot(history_with_momentum[:, 0], history_with_momentum[:, 1], 
         'bo-', linewidth=2, markersize=5, label='With momentum (Ξ²=0.9)')
plt.plot(history_with_momentum[0, 0], history_with_momentum[0, 1], 'g*', markersize=20)
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Gradient Descent with Momentum', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Without momentum - Final position: {history_no_momentum[-1]}")
print(f"With momentum - Final position: {history_with_momentum[-1]}")
print("\nMomentum often converges faster and more smoothly!")

8. Advanced Optimizers Overview

Modern deep learning uses sophisticated optimizers:

| Optimizer | Key Feature |
| --------- | ----------- |
| SGD       | Simple, proven, requires tuning |
| Momentum  | Accelerates in consistent directions |
| AdaGrad   | Adapts learning rate per parameter |
| RMSprop   | Fixes AdaGrad's aggressive decay |
| Adam      | Combines momentum + adaptive learning rates (most popular!) |
| AdamW     | Adam with better weight decay |

Rule of thumb: Start with Adam, it works well for most problems.

# Simple Adam optimizer implementation
def adam_optimizer(initial_pos, learning_rate=0.1, beta1=0.9, beta2=0.999, 
                   epsilon=1e-8, n_iterations=50):
    """Adam optimizer run on the 2D objective `f_2d`.

    Args:
        initial_pos: starting (x, y).
        learning_rate: base step size.
        beta1: exponential decay rate for the first moment (momentum term).
        beta2: exponential decay rate for the second moment (adaptive scaling).
        epsilon: small constant preventing division by zero.
        n_iterations: number of update steps.

    Returns:
        numpy array of shape (n_iterations + 1, 2): visited positions.
    """
    theta = np.asarray(initial_pos, dtype=float).copy()
    first = np.zeros_like(theta)   # running mean of gradients
    second = np.zeros_like(theta)  # running mean of squared gradients
    trail = [theta.copy()]

    # t starts at 1 so the bias-correction denominators are nonzero.
    for t in range(1, n_iterations + 1):
        g = grad_f_2d(theta[0], theta[1])

        first = beta1 * first + (1 - beta1) * g
        second = beta2 * second + (1 - beta2) * g**2

        # Bias correction compensates for the zero-initialized moments.
        m_hat = first / (1 - beta1**t)
        v_hat = second / (1 - beta2**t)

        theta = theta - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
        trail.append(theta.copy())

    return np.array(trail)

# Compare optimizers
# All three start from the same corner of the bowl; Adam gets a larger
# base rate because its adaptive scaling shrinks the effective step.
initial_pos = [5, 5]
n_iterations = 30

history_sgd = gradient_descent_2d(initial_pos, learning_rate=0.1, n_iterations=n_iterations)
history_momentum = gradient_descent_with_momentum(initial_pos, learning_rate=0.1, 
                                                   momentum=0.9, n_iterations=n_iterations)
history_adam = adam_optimizer(initial_pos, learning_rate=0.5, n_iterations=n_iterations)

# Visualize
x = np.linspace(-6, 6, 100)
y = np.linspace(-6, 6, 100)
X, Y = np.meshgrid(x, y)
Z = f_2d(X, Y)

plt.figure(figsize=(14, 6))
contour = plt.contour(X, Y, Z, levels=20, cmap=cm.viridis, alpha=0.3)
plt.clabel(contour, inline=True, fontsize=8)

plt.plot(history_sgd[:, 0], history_sgd[:, 1], 
         'r.-', linewidth=2, markersize=6, label='SGD', alpha=0.7)
plt.plot(history_momentum[:, 0], history_momentum[:, 1], 
         'g.-', linewidth=2, markersize=6, label='Momentum', alpha=0.7)
plt.plot(history_adam[:, 0], history_adam[:, 1], 
         'b.-', linewidth=2, markersize=6, label='Adam', alpha=0.7)

plt.plot(initial_pos[0], initial_pos[1], 'k*', markersize=20, label='Start')
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Comparison of Different Optimizers', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()

print("Final positions after 30 iterations:")
print(f"SGD:      {history_sgd[-1]}")
print(f"Momentum: {history_momentum[-1]}")
print(f"Adam:     {history_adam[-1]}")
print("\nAdam typically converges fastest with less tuning needed!")

Summary

You've learned:

✅ Gradient descent algorithm: Follow negative gradient to minimize loss
✅ Learning rate: Critical hyperparameter - too small (slow) vs too large (diverge)
✅ 1D and 2D visualization: How GD navigates loss landscapes
✅ Local minima: Non-convex functions can trap optimization
✅ Batch vs SGD vs Mini-batch: Tradeoffs between accuracy and speed
✅ Momentum: Accelerates convergence by building velocity
✅ Advanced optimizers: Adam combines best features, most popular choice

Key Insights:

  1. Gradient descent is the foundation of training neural networks

  2. Learning rate is crucial - use learning rate schedules in practice

  3. Mini-batch + Adam is the standard combination for deep learning

  4. Visualization helps understand optimizer behavior

Next Steps:

  • Study learning rate schedules (decay, warmup)

  • Explore second-order methods (Newton's method, L-BFGS)

  • Learn about batch normalization and its interaction with optimizers

  • Practice implementing backpropagation + GD for neural networks