import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
# Fix a reproducible seed for all numpy randomness in this notebook.
np.random.seed(42)
# The original print's checkmark emoji was mojibake-corrupted, splitting the
# string literal across two lines (a syntax error); restored here.
print("✅ Imports successful!")
The Problem
Neural networks have millions of parameters (weights and biases). To train them, we need:
Loss function \(L\) that measures error
Gradients \(\frac{\partial L}{\partial w}\) for each weight
Update rule: \(w \leftarrow w - \alpha \frac{\partial L}{\partial w}\)
Challenge: How do we compute gradients efficiently?
Solution: Backpropagation - an application of the chain rule that computes all gradients in one backward pass!
2. The Chain Rule - Foundation of Backpropagation
Single Variable Chain Rule
If \(y = f(u)\) and \(u = g(x)\), then: \(\frac{dy}{dx} = \frac{dy}{du} \cdot \frac{du}{dx}\)
Example
# Worked example: y = (3x + 2)^2, decomposed as u = g(x) = 3x + 2, y = f(u) = u^2.
def forward(x):
    """Evaluate y = (3x + 2)^2 and also return the intermediate value.

    Returns:
        (y, u): the final output f(g(x)) and the inner value u = g(x),
        which the backward pass needs for the chain rule.
    """
    inner = 3 * x + 2        # u = g(x)
    return inner ** 2, inner # y = f(u) = u^2
def backward(x, u, y):
    """Compute dy/dx for y = (3x + 2)^2 via the chain rule.

    Args:
        x, u, y: values cached by ``forward``; only ``u`` is actually
            needed, but all three are accepted for a uniform signature.

    Returns:
        dy/dx = f'(u) * g'(x) evaluated at the cached point.
    """
    outer_grad = 2 * u  # f'(u) for f(u) = u^2
    inner_grad = 3      # g'(x) for g(x) = 3x + 2
    return outer_grad * inner_grad  # chain rule
# Evaluate the analytic gradient at x = 1, then sanity-check it with a
# central-difference numerical gradient.
x = 1
y, u = forward(x)
gradient = backward(x, u, y)
for report_line in (f"At x = {x}:",
                    f" u = 3x + 2 = {u}",
                    f" y = u^2 = {y}",
                    f" dy/dx = {gradient}"):
    print(report_line)

# Central difference: (f(x + h) - f(x - h)) / (2h)
epsilon = 1e-5
y_hi, _ = forward(x + epsilon)
y_lo, _ = forward(x - epsilon)
numerical_gradient = (y_hi - y_lo) / (2 * epsilon)
print(f"\nNumerical gradient: {numerical_gradient:.6f}")
print(f"Analytical gradient: {gradient}")
print(f"Match: {np.isclose(gradient, numerical_gradient)}")
Visualization of the Chain Rule
The chain rule becomes far more intuitive when viewed as a computational graph. Each node in the graph represents an elementary operation (addition, multiplication, exponentiation), and the edges carry values forward during the forward pass. During the backward pass, local derivatives flow backward along the same edges and are multiplied together at each node — this is exactly what backpropagation does at scale. Understanding this graph-based perspective is critical because frameworks like PyTorch and TensorFlow build these graphs automatically and use them to compute gradients for millions of parameters in a single backward sweep.
# ASCII sketch of the computational graph. The original string had
# mojibake-corrupted arrows/multiplication signs (restored below) and
# mislabeled the local derivative of u^2 as "du/dy"; it is d(u^2)/du.
print("""
Computational Graph:
    x → [×3, +2] → u → [^2] → y

Forward pass:  x=1 → u=5 → y=25

Backward pass (gradients):
    dy/dy = 1  (starting point)
    dy/du = dy/dy × d(u^2)/du = 1 × 2u = 10
    dy/dx = dy/du × du/dx = 10 × 3 = 30

The chain rule lets us propagate gradients backward!
""")
3. Backpropagation in a Simple Network
Let's implement backprop for a tiny network:
x → [w1, b1] → z1 → [ReLU] → a1 → [w2, b2] → z2 → [Sigmoid] → ŷ → [Loss] → L
Forward Pass Equations
Backward Pass - Computing Gradients
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + e^(-z)).

    Inputs are clipped to [-500, 500] before exponentiation so that
    np.exp never overflows for extreme arguments.
    """
    safe_z = np.clip(z, -500, 500)
    return 1.0 / (1.0 + np.exp(-safe_z))
def relu(z):
    """Rectified linear unit: elementwise max(z, 0)."""
    return np.where(z > 0, z, 0)
def relu_derivative(z):
    """Derivative of ReLU: 1.0 where z > 0, else 0.0 (0 at z = 0 by convention)."""
    return np.where(z > 0, 1.0, 0.0)
def binary_cross_entropy(y, y_pred):
    """Mean binary cross-entropy loss.

    Predictions are clipped to [1e-7, 1 - 1e-7] so both logarithms
    stay finite even for saturated outputs.
    """
    p = np.clip(y_pred, 1e-7, 1 - 1e-7)
    log_likelihood = y * np.log(p) + (1 - y) * np.log(1 - p)
    return -np.mean(log_likelihood)
class TinyNetwork:
    """A minimal 1 → 1 → 1 network used to demonstrate backpropagation.

    Forward graph:  x → w1·x + b1 → ReLU → w2·a1 + b2 → sigmoid → ŷ
    The forward pass caches every intermediate value on ``self`` so
    ``backward`` can apply the chain rule without recomputation.
    """

    def __init__(self):
        # Small random weights, zero biases. The draw order (w1 first,
        # then w2) is kept stable so a fixed numpy seed reproduces runs.
        self.w1 = np.random.randn() * 0.1
        self.b1 = 0.0
        self.w2 = np.random.randn() * 0.1
        self.b2 = 0.0

    def forward(self, x):
        """Run the forward pass, caching x, z1, a1, z2 and the prediction."""
        self.x = x
        self.z1 = self.w1 * x + self.b1        # layer-1 pre-activation
        self.a1 = relu(self.z1)                # hidden activation
        self.z2 = self.w2 * self.a1 + self.b2  # layer-2 pre-activation
        self.y_pred = sigmoid(self.z2)         # output probability
        return self.y_pred

    def backward(self, y):
        """Chain-rule sweep from the loss back to every parameter.

        Uses the BCE-with-sigmoid shortcut dL/dz2 = ŷ - y, then walks
        backward: z2 → (w2, b2, a1) → z1 → (w1, b1).

        Returns:
            dict of gradients keyed 'w1', 'b1', 'w2', 'b2' (also stored
            on ``self.gradients`` for ``update``).
        """
        delta_out = self.y_pred - y                  # dL/dz2 (BCE+sigmoid)
        grad_w2 = delta_out * self.a1                # dz2/dw2 = a1
        grad_b2 = delta_out                          # dz2/db2 = 1
        delta_hidden = delta_out * self.w2           # dL/da1 = dL/dz2 · w2
        delta_hidden = delta_hidden * relu_derivative(self.z1)  # dL/dz1
        grad_w1 = delta_hidden * self.x              # dz1/dw1 = x
        grad_b1 = delta_hidden                       # dz1/db1 = 1
        self.gradients = {'w1': grad_w1, 'b1': grad_b1,
                          'w2': grad_w2, 'b2': grad_b2}
        return self.gradients

    def update(self, learning_rate):
        """One step of vanilla gradient descent on every parameter."""
        for name in ('w1', 'b1', 'w2', 'b2'):
            stepped = getattr(self, name) - learning_rate * self.gradients[name]
            setattr(self, name, stepped)
# Smoke-test TinyNetwork: inspect initial parameters, run one forward pass,
# compute the loss, then run one backward pass and print all gradients.
# (The 'ŷ' below was mojibake-corrupted to 'Ε·' in the original output string.)
net = TinyNetwork()
print("Initial parameters:")
print(f" w1 = {net.w1:.4f}, b1 = {net.b1:.4f}")
print(f" w2 = {net.w2:.4f}, b2 = {net.b2:.4f}")

# Forward pass on a single (x, y) example
x = 2.0
y = 1.0
y_pred = net.forward(x)
loss = binary_cross_entropy(y, y_pred)
print(f"\nForward pass:")
print(f" Input: x = {x}")
print(f" Prediction: ŷ = {y_pred:.4f}")
print(f" Target: y = {y}")
print(f" Loss: {loss:.4f}")

# Backward pass: gradients of the loss w.r.t. every parameter
grads = net.backward(y)
print(f"\nGradients (backpropagation):")
for param, grad in grads.items():
    print(f" dL/d{param} = {grad:.4f}")
Gradient Checking - Verify Backpropagation
We can verify our analytical gradients using numerical gradients: \(\frac{\partial L}{\partial w} \approx \frac{L(w + \epsilon) - L(w - \epsilon)}{2\epsilon}\)
def gradient_check(net, x, y, epsilon=1e-5):
    """Verify backpropagation by comparing analytical gradients against
    central-difference numerical gradients.

    Args:
        net: a TinyNetwork-like object exposing forward/backward and
            scalar parameters w1, b1, w2, b2.
        x, y: a single training example (input and binary target).
        epsilon: perturbation size for the central difference.
    """
    # Analytical gradients from one forward/backward pass
    net.forward(x)
    analytical = net.backward(y)

    numerical = {}
    for param in ['w1', 'b1', 'w2', 'b2']:
        original = getattr(net, param)
        # Loss at (w + epsilon)
        setattr(net, param, original + epsilon)
        y_pred_plus = net.forward(x)
        loss_plus = binary_cross_entropy(y, y_pred_plus)
        # Loss at (w - epsilon)
        setattr(net, param, original - epsilon)
        y_pred_minus = net.forward(x)
        loss_minus = binary_cross_entropy(y, y_pred_minus)
        # Central difference approximates dL/dparam
        numerical[param] = (loss_plus - loss_minus) / (2 * epsilon)
        # Restore the parameter before perturbing the next one
        setattr(net, param, original)

    # Report comparison table
    print("Gradient Checking Results:")
    print("="*60)
    print(f"{'Parameter':<10} {'Analytical':<15} {'Numerical':<15} {'Difference':<15}")
    print("="*60)
    for param in ['w1', 'b1', 'w2', 'b2']:
        anal = analytical[param]
        numer = numerical[param]
        diff = abs(anal - numer)
        print(f"{param:<10} {anal:>14.8f} {numer:>14.8f} {diff:>14.8e}")
    print("="*60)

    # The original source had an unterminated string literal here: the ✅
    # emoji was mojibake-corrupted and split the line. Restored as a plain
    # if/else with intact messages.
    if all(abs(analytical[p] - numerical[p]) < 1e-5 for p in analytical):
        print("\n✅ Gradients match! Backpropagation is implemented correctly.")
    else:
        print("\n❌ Gradients don't match! Check implementation.")
# Run the gradient check on a freshly initialized TinyNetwork: confirms that
# the analytical backprop gradients match numerical central differences.
net = TinyNetwork()
gradient_check(net, x=2.0, y=1.0)
4. Training with Backpropagation
Putting the Pieces Together
Now we combine the forward pass, loss computation, backward pass, and parameter update into a complete training loop. For each training example the network predicts an output (forward), measures the error (loss), computes how each weight contributed to that error (backward via chain rule), and nudges the weights to reduce the error (gradient descent update). Repeating this cycle over many epochs causes the loss to decrease and the network to learn the target function. The example below trains our TinyNetwork on a small pattern, letting you watch the loss curve converge in real time.
# Training data: a 1-D non-monotonic pattern (0→0, 1→1, 2→1, 3→0).
# NOTE: the original comment called this "XOR", but true XOR takes two
# binary inputs; this is a 1-D stand-in with an XOR-like 0/1/1/0 shape.
X_train = np.array([0.0, 1.0, 2.0, 3.0])
y_train = np.array([0.0, 1.0, 1.0, 0.0])

# Per-sample (stochastic) training: forward → loss → backward → update.
net = TinyNetwork()
losses = []
learning_rate = 0.1
epochs = 1000

for epoch in range(epochs):
    epoch_loss = 0
    for x, y in zip(X_train, y_train):
        # Forward pass and loss for this sample
        y_pred = net.forward(x)
        loss = binary_cross_entropy(y, y_pred)
        epoch_loss += loss
        # Backward pass and immediate parameter update
        net.backward(y)
        net.update(learning_rate)
    losses.append(epoch_loss / len(X_train))  # mean loss this epoch
    if (epoch + 1) % 200 == 0:
        print(f"Epoch {epoch+1}/{epochs} - Loss: {losses[-1]:.4f}")

# Learning curve
plt.figure(figsize=(10, 6))
plt.plot(losses, linewidth=2)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Over Time')
plt.grid(True, alpha=0.3)
plt.show()

# Final predictions on the training set.
# (The 'ŷ' header below was mojibake-corrupted to 'Ε·' in the original.)
print("\nFinal Predictions:")
print(f"{'x':<10} {'True y':<10} {'Predicted ŷ':<15} {'Rounded':<10}")
print("="*50)
for x, y in zip(X_train, y_train):
    y_pred = net.forward(x)
    print(f"{x:<10.1f} {y:<10.1f} {y_pred:<15.4f} {round(y_pred):<10}")
5. Multi-Layer Network with Matrix Operations
Scaling Up to Real Problems
The tiny single-sample network above was great for understanding the math, but real neural networks process batches of samples simultaneously using matrix operations. Instead of looping over individual inputs, we pack all samples into a matrix \(X\) of shape \((n_{\text{features}}, m)\) and compute the forward and backward passes with matrix multiplications. This vectorized approach is both mathematically elegant and computationally efficient — it is the reason GPUs (which excel at matrix math) accelerate deep learning by orders of magnitude. The NeuralNetwork class below implements full-batch backpropagation with He weight initialization, which keeps gradient magnitudes stable in networks that use ReLU activations.
class NeuralNetwork:
    """Fully-connected feed-forward network trained with backpropagation.

    Hidden layers use ReLU; the output layer uses sigmoid. Data is laid
    out column-wise: inputs have shape (n_features, n_samples).
    """

    def __init__(self, layer_sizes):
        """
        Args:
            layer_sizes: [input_size, hidden1, hidden2, ..., output_size]
        """
        self.params = {}
        self.cache = {}
        self.gradients = {}
        self.num_layers = len(layer_sizes) - 1
        # He initialization: weights scaled by sqrt(2 / fan_in) so ReLU
        # activations keep a stable variance across depth. The randn call
        # order (layer 1 first) matches reproducible seeded runs.
        for layer, (fan_in, fan_out) in enumerate(
                zip(layer_sizes[:-1], layer_sizes[1:]), start=1):
            scale = np.sqrt(2.0 / fan_in)
            self.params[f'W{layer}'] = np.random.randn(fan_out, fan_in) * scale
            self.params[f'b{layer}'] = np.zeros((fan_out, 1))

    def forward(self, X):
        """Forward pass through every layer.

        Args:
            X: input data of shape (n_features, n_samples).
        Returns:
            Output activations of shape (n_output, n_samples).
        Side effects:
            Caches A0..AL and Z1..ZL on self.cache for backward().
        """
        last = self.num_layers
        activation = X
        self.cache['A0'] = X
        for layer in range(1, last + 1):
            W = self.params[f'W{layer}']
            b = self.params[f'b{layer}']
            pre_act = W @ activation + b
            # Sigmoid on the output layer, ReLU everywhere else
            activation = sigmoid(pre_act) if layer == last else relu(pre_act)
            self.cache[f'Z{layer}'] = pre_act
            self.cache[f'A{layer}'] = activation
        return activation

    def backward(self, y):
        """Backward pass: fill self.gradients with dL/dW and dL/db.

        Args:
            y: true labels of shape (n_output, n_samples).
        """
        m = y.shape[1]  # number of samples
        # BCE + sigmoid shortcut at the output: dL/dZ_L = A_L - y
        dZ = self.cache[f'A{self.num_layers}'] - y
        for layer in range(self.num_layers, 0, -1):
            A_prev = self.cache[f'A{layer-1}']
            self.gradients[f'W{layer}'] = (dZ @ A_prev.T) / m
            self.gradients[f'b{layer}'] = np.mean(dZ, axis=1, keepdims=True)
            if layer > 1:
                # Push the gradient through W, then through the ReLU mask
                dA_prev = self.params[f'W{layer}'].T @ dZ
                dZ = dA_prev * relu_derivative(self.cache[f'Z{layer-1}'])

    def update(self, learning_rate):
        """One gradient-descent step on every weight matrix and bias."""
        for layer in range(1, self.num_layers + 1):
            for kind in ('W', 'b'):
                key = f'{kind}{layer}'
                self.params[key] -= learning_rate * self.gradients[key]
# Build a 2 → 8 → 8 → 1 network and summarize its parameter count.
# (The arrows in the architecture string were mojibake-corrupted; restored.)
nn = NeuralNetwork([2, 8, 8, 1])
print("Network created!")
print(f"Architecture: 2 → 8 → 8 → 1")
print(f"\nParameters:")
total_params = 0
for name, param in nn.params.items():
    print(f" {name}: {param.shape}")
    total_params += param.size
print(f"\nTotal parameters: {total_params}")
Train on Real Data
With our matrix-based backpropagation engine ready, we can tackle realistic datasets. The make_moons dataset from scikit-learn generates two interleaving half-circles that are not linearly separable — a good stress test for nonlinear classifiers. We split the data into training and test sets, train the network for 2000 epochs, and then evaluate generalization on held-out data. Watching the loss and accuracy curves converge confirms that gradient-based optimization with backpropagation is working correctly across all layers of the network.
# Two-moons dataset: two interleaving half-circles, not linearly separable.
X, y = make_moons(n_samples=400, noise=0.15, random_state=42)
X = X.T              # switch to the (n_features, n_samples) layout
y = y.reshape(1, -1)

# train_test_split expects samples-first, so transpose in and back out.
X_train, X_test, y_train, y_test = train_test_split(
    X.T, y.T, test_size=0.2, random_state=42
)
X_train, X_test = X_train.T, X_test.T
y_train, y_test = y_train.T, y_test.T
print(f"Training samples: {X_train.shape[1]}")
print(f"Test samples: {X_test.shape[1]}")

# Full-batch gradient descent
losses = []
accuracies = []
learning_rate = 0.5
epochs = 2000

for epoch in range(epochs):
    y_pred = nn.forward(X_train)
    loss = binary_cross_entropy(y_train, y_pred)
    # Classify by thresholding the sigmoid output at 0.5
    predictions = (y_pred > 0.5).astype(float)
    accuracy = np.mean(predictions == y_train)
    losses.append(loss)
    accuracies.append(accuracy)
    nn.backward(y_train)
    nn.update(learning_rate)
    if (epoch + 1) % 400 == 0:
        print(f"Epoch {epoch+1}/{epochs} - Loss: {loss:.4f}, Accuracy: {accuracy*100:.2f}%")

# Evaluate generalization on the held-out test split.
# (The emoji in the header below was mojibake-corrupted; restored best-guess.)
y_pred_test = nn.forward(X_test)
test_loss = binary_cross_entropy(y_test, y_pred_test)
test_predictions = (y_pred_test > 0.5).astype(float)
test_accuracy = np.mean(test_predictions == y_test)
print(f"\n🎯 Test Performance:")
print(f" Loss: {test_loss:.4f}")
print(f" Accuracy: {test_accuracy*100:.2f}%")
# Side-by-side learning curves: loss on the left, accuracy on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
curve_specs = [
    (losses, 'b-', 'Loss', 'Training Loss'),
    (accuracies, 'g-', 'Accuracy', 'Training Accuracy'),
]
for ax, (series, fmt, ylabel, title) in zip(axes, curve_specs):
    ax.plot(series, fmt, linewidth=2)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
axes[1].set_ylim([0, 1])  # accuracy is a proportion
plt.tight_layout()
plt.show()
6. Vanishing and Exploding Gradients
The Problem
In deep networks, gradients can become very small (vanishing) or very large (exploding) as they propagate backward.
Why? The chain rule multiplies many derivatives together:
If each derivative is < 1, the product → 0 (vanishing)
If each derivative is > 1, the product → ∞ (exploding)
def visualize_gradient_flow(layer_sizes, activation='sigmoid'):
    """Plot per-layer gradient norms after one backward pass on random data.

    Args:
        layer_sizes: [input, hidden..., output] sizes for a fresh network.
        activation: kept for interface compatibility but currently UNUSED —
            NeuralNetwork hard-codes ReLU hidden layers and a sigmoid output.
    Returns:
        List of weight-gradient norms, ordered from the output layer back
        to layer 1.
    """
    nn_deep = NeuralNetwork(layer_sizes)

    # One forward/backward pass on random inputs with random binary labels
    X_sample = np.random.randn(layer_sizes[0], 10)
    y_sample = np.random.randint(0, 2, (1, 10))
    nn_deep.forward(X_sample)
    nn_deep.backward(y_sample)

    # Frobenius norm of each weight gradient, output layer first
    gradient_norms = []
    layer_names = []
    for i in range(len(layer_sizes) - 1, 0, -1):
        grad_norm = np.linalg.norm(nn_deep.gradients[f'W{i}'])
        gradient_norms.append(grad_norm)
        layer_names.append(f'Layer {i}')

    # Log-scale bar chart so vanishing gradients remain visible.
    # (The arrows in the title string were mojibake-corrupted; restored.)
    plt.figure(figsize=(10, 6))
    plt.bar(range(len(gradient_norms)), gradient_norms, color='steelblue')
    plt.xlabel('Layer (backward order)')
    plt.ylabel('Gradient Norm')
    plt.title(f'Gradient Flow in Deep Network\nArchitecture: {" → ".join(map(str, layer_sizes))}')
    plt.xticks(range(len(gradient_norms)), layer_names)
    plt.grid(axis='y', alpha=0.3)
    plt.yscale('log')
    plt.tight_layout()
    plt.show()
    return gradient_norms
# Inspect gradient magnitudes through a 6-hidden-layer network.
print("Testing gradient flow in deep network...\n")
grads = visualize_gradient_flow([10, 8, 8, 8, 8, 8, 1])
print("\nGradient magnitudes (from output to input):")
for i, norm in enumerate(grads):
    print(f" Layer {len(grads)-i}: {norm:.6f}")

# grads[-1] is the EARLIEST layer (layer 1), where vanishing appears first.
# The original final branch had an unterminated string literal (the ✅ emoji
# was mojibake-corrupted and split the line); all three messages restored.
if grads[-1] < 1e-4:
    print("\n⚠️ Vanishing gradient detected in early layers!")
elif grads[-1] > 100:
    print("\n⚠️ Exploding gradient detected in early layers!")
else:
    print("\n✅ Gradients are well-behaved!")
Solutions to Gradient Problems
Better activation functions: ReLU instead of Sigmoid/Tanh
Better initialization: He initialization for ReLU, Xavier for Tanh
Batch normalization: Normalize activations (covered later)
Residual connections: Skip connections (covered in transformers)
Gradient clipping: Cap gradient magnitude
Better optimizers: Adam, RMSprop (next notebook)
Summary
✅ What You Learned
Chain Rule: Foundation of backpropagation
Forward Pass: Compute predictions and cache intermediate values
Backward Pass: Compute gradients using chain rule
Gradient Descent: Update weights using gradients
Gradient Checking: Verify implementation with numerical gradients
Gradient Problems: Vanishing and exploding gradients
🔑 Key Equations
Forward:
Z = W @ A_prev + b
A = activation(Z)
Backward:
dZ = dA * activation_derivative(Z)
dW = (1/m) * dZ @ A_prev.T
db = (1/m) * sum(dZ)
dA_prev = W.T @ dZ
Update:
W = W - learning_rate * dW
b = b - learning_rate * db
🎯 What's Next?
Next notebook: 03_pytorch_fundamentals.ipynb
You'll learn:
PyTorch tensors and automatic differentiation
Building networks with nn.Module
Modern optimizers (Adam, RMSprop)
GPU acceleration
Real-world datasets (MNIST, CIFAR)
📚 Additional Resources
Excellent work! You now understand the mathematics behind how neural networks learn. This is the foundation of all modern deep learning! 🎉