Calculus & Derivatives
Derivatives, partial derivatives, chain rule, gradient descent, and multivariable calculus for ML.
# Import required libraries
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection used later
from matplotlib import cm
import seaborn as sns
# Global plot styling applied to every figure in this notebook
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
1. What is a Derivative?
A derivative measures how a function changes as its input changes.
Formal definition: $$f'(x) = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h}$$
Interpretation: The slope of the tangent line at a point.
ML relevance: Tells us how to adjust model parameters to reduce error.
# Visualize derivative as slope
def f(x):
    """Quadratic curve used throughout this section: f(x) = x^2."""
    return x * x
def f_derivative(x):
    """Closed-form derivative of f(x) = x^2, namely 2x."""
    return x + x
# Sample the curve f(x) = x^2 and draw the tangent line at x0, showing that
# the derivative f'(x0) is exactly the slope of that tangent.
# Fix: label string 'xΒ²' was mojibake for 'x²'.
x = np.linspace(-3, 3, 100)
y = f(x)
# Point of interest
x0 = 1.5
y0 = f(x0)
slope = f_derivative(x0)
# Tangent line through (x0, y0): y = y0 + slope*(x - x0)
tangent_y = y0 + slope * (x - x0)
# Plot curve, tangent line, and point of tangency
plt.figure(figsize=(10, 6))
plt.plot(x, y, 'b-', linewidth=2, label='f(x) = x²')
plt.plot(x, tangent_y, 'r--', linewidth=2, label=f'Tangent at x={x0} (slope={slope})')
plt.plot(x0, y0, 'ro', markersize=10, label=f'Point ({x0}, {y0})')
plt.axhline(y=0, color='k', linewidth=0.5)  # x-axis
plt.axvline(x=0, color='k', linewidth=0.5)  # y-axis
plt.grid(True, alpha=0.3)
plt.legend(fontsize=12)
plt.title('Derivative as Slope of Tangent Line', fontsize=14)
plt.xlabel('x')
plt.ylabel('f(x)')
plt.show()
Numerical Derivative Approximation
When an analytical derivative is difficult or impossible to derive by hand, we can approximate it using the finite difference method: \(f'(x) \approx \frac{f(x+h) - f(x)}{h}\) for a very small \(h\). This is exactly how automatic differentiation frameworks like PyTorch and TensorFlow verify their gradient computations during testing — a technique called gradient checking. By comparing the numerical approximation against the analytical result, you can catch bugs in your backpropagation implementation before they silently corrupt training.
def numerical_derivative(f, x, h=1e-5):
    """
    Approximate f'(x) with a forward finite difference:
        f'(x) ≈ [f(x + h) - f(x)] / h
    for a small step h.
    """
    shifted = f(x + h)
    return (shifted - f(x)) / h
# Test on f(x) = x^2 at x = 2: compare closed-form vs finite-difference results
x_test = 2
analytical = f_derivative(x_test)  # 2*x = 4
numerical = numerical_derivative(f, x_test)
print(f"At x = {x_test}:")
print(f"Analytical derivative: {analytical}")
print(f"Numerical derivative: {numerical:.10f}")
print(f"Difference: {abs(analytical - numerical):.2e}")
2. Common Derivative Rules
Essential rules for ML:
| Function | Derivative |
|---|---|
| \(c\) (constant) | \(0\) |
| \(x^n\) | \(nx^{n-1}\) |
| \(e^x\) | \(e^x\) |
| \(\ln(x)\) | \(\frac{1}{x}\) |
| \(\sin(x)\) | \(\cos(x)\) |
| \(\cos(x)\) | \(-\sin(x)\) |
Sum rule: \((f + g)' = f' + g'\)
Product rule: \((fg)' = f'g + fg'\)
Chain rule: \((f(g(x)))' = f'(g(x)) \cdot g'(x)\) — CRITICAL FOR ML!
# Plot six common functions (blue, solid) alongside their derivatives
# (red, dashed) on a 2x3 grid.
# Fix: label strings 'xΒ²' and 'eΛ£' were mojibake for 'x²' and 'eˣ'.
x = np.linspace(-2, 2, 100)
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
# Power function: x^2
axes[0, 0].plot(x, x**2, 'b-', label='f(x) = x²', linewidth=2)
axes[0, 0].plot(x, 2*x, 'r--', label="f'(x) = 2x", linewidth=2)
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)
axes[0, 0].set_title('Power Function')
# Exponential: e^x is its own derivative
x_pos = np.linspace(0.1, 2, 100)  # positive domain, shared with ln(x) below
axes[0, 1].plot(x_pos, np.exp(x_pos), 'b-', label='f(x) = eˣ', linewidth=2)
axes[0, 1].plot(x_pos, np.exp(x_pos), 'r--', label="f'(x) = eˣ", linewidth=2)
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)
axes[0, 1].set_title('Exponential Function')
# Logarithm: only defined for x > 0
axes[0, 2].plot(x_pos, np.log(x_pos), 'b-', label='f(x) = ln(x)', linewidth=2)
axes[0, 2].plot(x_pos, 1/x_pos, 'r--', label="f'(x) = 1/x", linewidth=2)
axes[0, 2].legend()
axes[0, 2].grid(True, alpha=0.3)
axes[0, 2].set_title('Logarithm Function')
# Sine
axes[1, 0].plot(x, np.sin(x), 'b-', label='f(x) = sin(x)', linewidth=2)
axes[1, 0].plot(x, np.cos(x), 'r--', label="f'(x) = cos(x)", linewidth=2)
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)
axes[1, 0].set_title('Sine Function')
# ReLU (common activation function); its derivative is a step function
axes[1, 1].plot(x, np.maximum(0, x), 'b-', label='f(x) = max(0,x)', linewidth=2)
relu_derivative = np.where(x > 0, 1, 0)  # undefined at 0; conventionally 0 here
axes[1, 1].plot(x, relu_derivative, 'r--', label="f'(x) = 0 or 1", linewidth=2)
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)
axes[1, 1].set_title('ReLU Activation')
# Sigmoid (common activation function)
def sigmoid(x):
    """Logistic function: squashes any real x into the interval (0, 1)."""
    return 1 / (np.exp(-x) + 1)

def sigmoid_derivative(x):
    """Derivative of the logistic function: sigma(x) * (1 - sigma(x))."""
    sig = sigmoid(x)
    return sig * (1 - sig)
# Sigmoid curve and its bell-shaped derivative (which peaks at 0.25 at x = 0).
# Fix: label string 'Ο' was mojibake for the Greek letter 'σ'.
axes[1, 2].plot(x, sigmoid(x), 'b-', label='f(x) = σ(x)', linewidth=2)
axes[1, 2].plot(x, sigmoid_derivative(x), 'r--', label="f'(x) = σ(x)(1-σ(x))", linewidth=2)
axes[1, 2].legend()
axes[1, 2].grid(True, alpha=0.3)
axes[1, 2].set_title('Sigmoid Activation')
plt.tight_layout()
plt.show()
3. Partial Derivatives
For functions with multiple variables: \(f(x, y)\)
Partial derivative with respect to x: $$\frac{\partial f}{\partial x} = \text{derivative treating } y \text{ as constant}$$
Example: \(f(x, y) = x^2 + 3xy + y^2\)
\(\frac{\partial f}{\partial x} = 2x + 3y\)
\(\frac{\partial f}{\partial y} = 3x + 2y\)
# Define a 2D function
def f_2d(x, y):
    """Two-variable example function: f(x, y) = x^2 + 3xy + y^2."""
    cross_term = 3 * x * y
    return x**2 + cross_term + y**2
def df_dx(x, y):
    """Partial derivative of f_2d with respect to x: 2x + 3y."""
    return 3 * y + 2 * x
def df_dy(x, y):
    """Partial derivative of f_2d with respect to y: 3x + 2y."""
    return 2 * y + 3 * x
# Create a grid for visualization and show the same function as a 3D surface
# (left) and a 2D contour map (right).
# Fix: title string 'xΒ² + 3xy + yΒ²' was mojibake for 'x² + 3xy + y²'.
x_range = np.linspace(-2, 2, 50)
y_range = np.linspace(-2, 2, 50)
X, Y = np.meshgrid(x_range, y_range)
Z = f_2d(X, Y)
# 3D surface plot
fig = plt.figure(figsize=(14, 5))
# Surface plot
ax1 = fig.add_subplot(121, projection='3d')
surf = ax1.plot_surface(X, Y, Z, cmap=cm.viridis, alpha=0.8)
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_zlabel('f(x,y)')
ax1.set_title('Surface: f(x,y) = x² + 3xy + y²')
fig.colorbar(surf, ax=ax1, shrink=0.5)
# Contour plot
ax2 = fig.add_subplot(122)
contour = ax2.contour(X, Y, Z, levels=20, cmap=cm.viridis)
ax2.clabel(contour, inline=True, fontsize=8)
ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('Contour Plot of f(x,y)')
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Evaluate the function and both partial derivatives at a sample point.
# Fix: print strings 'βf/βx' and 'βf/βy' were mojibake for '∂f/∂x' and '∂f/∂y'.
x0, y0 = 1, 1
print(f"At point ({x0}, {y0}):")
print(f"f({x0}, {y0}) = {f_2d(x0, y0)}")
print(f"∂f/∂x = {df_dx(x0, y0)}")  # 2x + 3y = 5 at (1, 1)
print(f"∂f/∂y = {df_dy(x0, y0)}")  # 3x + 2y = 5 at (1, 1)
4. The Gradient Vector
The gradient is a vector of all partial derivatives:
Key property: The gradient points in the direction of steepest ascent.
ML relevance: We follow the negative gradient to minimize loss (gradient descent)!
def gradient(x, y):
    """Gradient of f_2d at (x, y): the vector [df/dx, df/dy]."""
    partials = [df_dx(x, y), df_dy(x, y)]
    return np.array(partials)
# Visualize gradient vectors on contour plot
fig, ax = plt.subplots(figsize=(10, 8))
# Contour plot of f_2d (reuses X, Y, Z from the previous cell)
contour = ax.contour(X, Y, Z, levels=20, cmap=cm.viridis)
ax.clabel(contour, inline=True, fontsize=8)
# Sample points for gradient vectors
sample_x = np.linspace(-2, 2, 8)
sample_y = np.linspace(-2, 2, 8)
# Plot gradient vectors: one arrow at each point of the 8x8 sample grid
for xi in sample_x:
    for yi in sample_y:
        grad = gradient(xi, yi)
        # Normalize for better visualization (epsilon avoids division by zero
        # where the gradient vanishes)
        grad_norm = grad / (np.linalg.norm(grad) + 1e-8)
        ax.arrow(xi, yi, grad_norm[0]*0.2, grad_norm[1]*0.2,
                 head_width=0.1, head_length=0.1, fc='red', ec='red', alpha=0.6)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Gradient Vectors (Red arrows point uphill)')
ax.grid(True, alpha=0.3)
plt.show()
5. Chain Rule: The Heart of Backpropagation
For composite functions: \(h(x) = f(g(x))\)
Example: \(h(x) = \sin(x^2)\)
Let \(g(x) = x^2\) and \(f(u) = \sin(u)\)
\(g'(x) = 2x\) and \(f'(u) = \cos(u)\)
\(h'(x) = \cos(x^2) \cdot 2x\)
Neural network perspective: Chain rule allows us to compute gradients through multiple layers!
# Chain rule example: h(x) = sin(x^2)
def h(x):
    """Composite function h(x) = sin(x^2)."""
    inner = x ** 2
    return np.sin(inner)
def h_derivative(x):
    """Chain-rule derivative of sin(x^2): cos(x^2) * 2x."""
    outer_grad = np.cos(x ** 2)
    inner_grad = 2 * x
    return outer_grad * inner_grad
# Plot the composite function (with its inner function for reference) and,
# below it, the derivative obtained via the chain rule.
# Fix: label strings 'xΒ²' and 'cos(xΒ²)Β·2x' were mojibake for 'x²' and 'cos(x²)·2x'.
x = np.linspace(-3, 3, 200)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
# Top panel: composite function
ax1.plot(x, h(x), 'b-', linewidth=2, label='h(x) = sin(x²)')
ax1.plot(x, x**2, 'g--', alpha=0.5, label='g(x) = x² (inner function)')
ax1.grid(True, alpha=0.3)
ax1.legend(fontsize=12)
ax1.set_title('Composite Function', fontsize=14)
ax1.set_xlabel('x')
ax1.set_ylabel('y')
# Bottom panel: derivative
ax2.plot(x, h_derivative(x), 'r-', linewidth=2, label="h'(x) = cos(x²)·2x")
ax2.axhline(y=0, color='k', linewidth=0.5)
ax2.grid(True, alpha=0.3)
ax2.legend(fontsize=12)
ax2.set_title('Derivative (using Chain Rule)', fontsize=14)
ax2.set_xlabel('x')
ax2.set_ylabel("h'(x)")
plt.tight_layout()
plt.show()
# Verify numerically: finite difference should match the chain-rule result
x_test = 1.5
analytical = h_derivative(x_test)
numerical = numerical_derivative(h, x_test)
print(f"\nAt x = {x_test}:")
print(f"Analytical (chain rule): {analytical:.10f}")
print(f"Numerical derivative: {numerical:.10f}")
print(f"Difference: {abs(analytical - numerical):.2e}")
Chain Rule in Neural Networks
Consider a simple 2-layer network:
To compute \(\frac{\partial L}{\partial W_1}\), we use chain rule:
# Simple example: Computing gradient through a 2-layer network
# Network: x -> W1 -> sigmoid -> W2 -> output -> MSE loss
def sigmoid(z):
    """Logistic activation: sigma(z) = 1 / (1 + e^(-z))."""
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_derivative(z):
    """Derivative of the logistic activation: sigma(z) * (1 - sigma(z))."""
    s_val = sigmoid(z)
    return s_val * (1 - s_val)
# Sample data for a scalar 2-layer network: x -> W1 -> sigmoid -> W2 -> output
x = 2.0  # Input
y_true = 1.0  # Target
W1 = 0.5  # Weight 1
W2 = 0.8  # Weight 2
# Forward pass: compute each intermediate value in order
z1 = W1 * x  # Linear transformation 1
a1 = sigmoid(z1)  # Activation
z2 = W2 * a1  # Linear transformation 2
y_pred = z2  # Output (no activation)
loss = 0.5 * (y_pred - y_true)**2  # MSE loss (0.5 factor simplifies the gradient)
print("=== Forward Pass ===")
print(f"x = {x}")
print(f"z1 = W1 * x = {W1} * {x} = {z1}")
print(f"a1 = sigmoid(z1) = {a1:.4f}")
print(f"z2 = W2 * a1 = {W2} * {a1:.4f} = {z2:.4f}")
print(f"y_pred = {y_pred:.4f}")
print(f"Loss = {loss:.4f}")
# Backward pass (chain rule!): multiply local derivatives along each path
print("\n=== Backward Pass (Chain Rule) ===")
# dL/dy_pred: derivative of 0.5*(y_pred - y_true)^2 with respect to y_pred
dL_dy = y_pred - y_true
print(f"dL/dy_pred = {dL_dy:.4f}")
# dL/dW2 = dL/dy * dy/dz2 * dz2/dW2
dy_dz2 = 1  # No activation on output
dz2_dW2 = a1
dL_dW2 = dL_dy * dy_dz2 * dz2_dW2
print(f"dL/dW2 = {dL_dy:.4f} * {dy_dz2} * {dz2_dW2:.4f} = {dL_dW2:.4f}")
# dL/dW1 = dL/dy * dy/dz2 * dz2/da1 * da1/dz1 * dz1/dW1 (one more layer deep)
dz2_da1 = W2
da1_dz1 = sigmoid_derivative(z1)
dz1_dW1 = x
dL_dW1 = dL_dy * dy_dz2 * dz2_da1 * da1_dz1 * dz1_dW1
print(f"dL/dW1 = {dL_dy:.4f} * {dy_dz2} * {dz2_da1} * {da1_dz1:.4f} * {dz1_dW1} = {dL_dW1:.4f}")
print("\nThis is backpropagation using the chain rule!")
6. Optimization: Finding Minima
Goal in ML: Find parameters that minimize loss function
Critical points: Where derivative = 0
Minimum: \(f'(x) = 0\) and \(f''(x) > 0\) (second derivative positive)
Maximum: \(f'(x) = 0\) and \(f''(x) < 0\) (second derivative negative)
Inconclusive case: \(f'(x) = 0\) and \(f''(x) = 0\) — the second-derivative test fails; the point may be an inflection point, a minimum, or a maximum, and higher-order derivatives decide.
# Function with multiple critical points
def f_complex(x):
    """Quartic x^4 - 4x^3 + 4x^2 + 1 with minima at x=0, x=2 and a local maximum at x=1."""
    quartic = x**4
    return quartic - 4*x**3 + 4*x**2 + 1
def f_complex_derivative(x):
    """First derivative of the quartic above: 4x^3 - 12x^2 + 8x = 4x(x-1)(x-2)."""
    leading = 4*x**3
    return leading - 12*x**2 + 8*x
def f_complex_second_derivative(x):
    """Second derivative of the quartic above: 12x^2 - 24x + 8."""
    curvature = 12*x**2 - 24*x
    return curvature + 8
# Plot the quartic and its derivative, marking critical points classified by
# the second-derivative test.
x = np.linspace(-0.5, 3, 300)
y = f_complex(x)
y_prime = f_complex_derivative(x)
# Find critical points (where derivative = 0), seeding fsolve near each root
from scipy.optimize import fsolve
critical_points = fsolve(f_complex_derivative, [0.5, 1.0, 2.0])
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
# Plot function
ax1.plot(x, y, 'b-', linewidth=2, label='f(x)')
ax1.axhline(y=0, color='k', linewidth=0.5)
ax1.grid(True, alpha=0.3)
# Mark critical points, colored by type (second-derivative test)
for cp in critical_points:
    second_deriv = f_complex_second_derivative(cp)
    if second_deriv > 0:
        label = 'Minimum'
        color = 'green'
    elif second_deriv < 0:
        label = 'Maximum'
        color = 'red'
    else:
        # f'' == 0: second-derivative test is inconclusive
        label = 'Inflection'
        color = 'orange'
    ax1.plot(cp, f_complex(cp), 'o', color=color, markersize=10,
             label=f'{label}: x={cp:.2f}')
ax1.legend(fontsize=10)
ax1.set_title('Function with Critical Points', fontsize=14)
ax1.set_xlabel('x')
ax1.set_ylabel('f(x)')
# Plot derivative
ax2.plot(x, y_prime, 'r-', linewidth=2, label="f'(x)")
ax2.axhline(y=0, color='k', linewidth=1, linestyle='--', alpha=0.5)
ax2.grid(True, alpha=0.3)
# Mark where derivative = 0
for cp in critical_points:
    ax2.plot(cp, 0, 'go', markersize=10)
ax2.legend(fontsize=10)
ax2.set_title('Derivative (zeros indicate critical points)', fontsize=14)
ax2.set_xlabel('x')
ax2.set_ylabel("f'(x)")
plt.tight_layout()
plt.show()
# Print a one-line classification for each critical point found above.
# Fix: the character 'β' in the print string was mojibake for the arrow '→'.
print("Critical Points Analysis:")
for i, cp in enumerate(critical_points, 1):
    second_deriv = f_complex_second_derivative(cp)
    # Second-derivative test: positive => minimum, negative => maximum
    point_type = "Minimum" if second_deriv > 0 else ("Maximum" if second_deriv < 0 else "Inflection")
    print(f"Point {i}: x = {cp:.4f}, f(x) = {f_complex(cp):.4f}, f''(x) = {second_deriv:.4f} → {point_type}")
7. Gradient Descent Preview
Core ML optimization algorithm:
Start with initial parameters \(\theta\)
Compute gradient: \(\nabla L(\theta)\)
Update: \(\theta_{new} = \theta_{old} - \alpha \nabla L(\theta)\)
Repeat until convergence
Where \(\alpha\) is the learning rate.
# Simple gradient descent on f(x) = x^2
def simple_gradient_descent(learning_rate=0.1, initial_x=5.0, iterations=20):
"""
Minimize f(x) = x^2 using gradient descent
"""
x = initial_x
history = [x]
for i in range(iterations):
# Compute gradient (derivative)
gradient = 2 * x # f'(x) = 2x for f(x) = x^2
# Update x
x = x - learning_rate * gradient
history.append(x)
if i < 5 or i == iterations - 1:
print(f"Iteration {i+1}: x = {x:.6f}, f(x) = {x**2:.6f}, gradient = {gradient:.6f}")
return np.array(history)
# Run gradient descent and overlay the optimization path on the loss curve.
# Fix: strings 'xΒ²' were mojibake for 'x²'.
print("=== Gradient Descent on f(x) = x² ===")
history = simple_gradient_descent(learning_rate=0.1, initial_x=5.0, iterations=20)
# Visualize the descent
x_range = np.linspace(-6, 6, 100)
y_range = x_range**2
plt.figure(figsize=(12, 6))
plt.plot(x_range, y_range, 'b-', linewidth=2, label='f(x) = x²')
plt.plot(history, history**2, 'ro-', markersize=8, linewidth=1.5,
         alpha=0.7, label='Gradient descent path')
plt.plot(history[0], history[0]**2, 'g*', markersize=20, label='Start')
plt.plot(history[-1], history[-1]**2, 'r*', markersize=20, label='End')
plt.xlabel('x', fontsize=12)
plt.ylabel('f(x)', fontsize=12)
plt.title('Gradient Descent Visualization', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()
8. Practice Exercises
These exercises reinforce the three core skills from this notebook: computing derivatives analytically using standard rules, applying partial derivatives to multivariable functions (the bread and butter of multi-parameter models), and using the chain rule to differentiate composite functions (the mechanism behind backpropagation). Try solving each one on paper first, then verify your answer by uncommenting the numerical check.
# Exercise 1: Compute the derivative of f(x) = 3x^3 - 2x^2 + x - 5
# Both analytically and numerically
def exercise1(x):
    """Cubic polynomial for Exercise 1: f(x) = 3x^3 - 2x^2 + x - 5."""
    cubic_part = 3*x**3 - 2*x**2
    return cubic_part + x - 5
# Your analytical derivative here:
def exercise1_derivative(x):
    """Exercise stub: return f'(x) for f(x) = 3x^3 - 2x^2 + x - 5."""
    # Fill in the correct derivative
    # Hint: apply the power rule to each term separately.
    pass
# Test at x = 2
# x_test = 2
# analytical = exercise1_derivative(x_test)
# numerical = numerical_derivative(exercise1, x_test)
# print(f"Analytical: {analytical}")
# print(f"Numerical: {numerical}")
# Exercise 2: Compute partial derivatives of f(x,y) = x^3*y^2 + 2*x*y - y^3
def exercise2(x, y):
    """Two-variable function for Exercise 2: f(x, y) = x^3*y^2 + 2xy - y^3."""
    leading = x**3 * y**2
    return leading + 2*x*y - y**3
# Your partial derivatives here:
def exercise2_dx(x, y):
    """Exercise stub: return ∂f/∂x for f(x, y) = x^3*y^2 + 2xy - y^3."""
    # ∂f/∂x = ?  (treat y as a constant)
    pass
def exercise2_dy(x, y):
    """Exercise stub: return ∂f/∂y for f(x, y) = x^3*y^2 + 2xy - y^3."""
    # ∂f/∂y = ?  (treat x as a constant)
    pass
# Exercise 3: Use chain rule to find derivative of h(x) = e^(x^2)
def exercise3(x):
    """Composite function for Exercise 3: h(x) = e^(x^2)."""
    exponent = x**2
    return np.exp(exponent)
# Your derivative using chain rule:
def exercise3_derivative(x):
    """Exercise stub: return h'(x) for h(x) = e^(x^2) via the chain rule."""
    # h'(x) = ?
    # Hint: Let u = x^2, then h = e^u
    pass
Summary
You've learned:
✓ Derivatives: Measure rate of change, slope of tangent line ✓ Common derivative rules: Power, exponential, logarithm, trig functions ✓ Partial derivatives: Derivatives for multivariable functions ✓ Gradient vector: Collection of all partial derivatives ✓ Chain rule: Essential for backpropagation in neural networks ✓ Optimization: Finding minima using derivatives ✓ Gradient descent preview: Core ML optimization algorithm
Key Takeaways for ML:
Gradients tell us how to update parameters to reduce loss
Chain rule enables backpropagation through deep networks
Partial derivatives handle multi-parameter models
Second derivatives help understand convergence and learning rate selection
Next Steps:
Study gradient descent in detail (see notebook 04)
Learn about backpropagation algorithm
Explore advanced optimizers (Adam, RMSprop)
Practice with neural network implementations