# Setup: numerical arrays (numpy), plotting (matplotlib/seaborn),
# optimization (scipy), and symbolic math (sympy) used throughout the chapter.
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection used for surface plots below
import seaborn as sns
from scipy.optimize import minimize  # NOTE(review): unused in this chunk — possibly used elsewhere in the notebook
import sympy as sp
sns.set_style('whitegrid')  # light grid background for all figures
np.set_printoptions(precision=4, suppress=True)  # compact float printing, no scientific notation
5.1 Differentiation of Univariate Functions¶
Definition 5.1 (Derivative)¶
The derivative of f at x is:

$$f'(x) = \lim_{h \to 0} \frac{f(x + h) - f(x)}{h}$$
Interpretation:
Instantaneous rate of change
Slope of tangent line
Common Derivatives:¶
| Function | Derivative |
|---|---|
| \(x^n\) | \(nx^{n-1}\) |
| \(e^x\) | \(e^x\) |
| \(\ln(x)\) | \(1/x\) |
| \(\sin(x)\) | \(\cos(x)\) |
| \(\cos(x)\) | \(-\sin(x)\) |
# Numerical derivative approximation
def numerical_derivative(f, x, h=1e-5):
    """Estimate f'(x) with the symmetric (central) difference quotient.

    The central difference has O(h²) truncation error, noticeably more
    accurate than the one-sided forward difference for the same h.
    """
    ahead = f(x + h)
    behind = f(x - h)
    return (ahead - behind) / (2 * h)
# Example: f(x) = x² — compare the finite-difference estimate against the
# exact derivative f'(x) = 2x, then draw the tangent-line interpretation.
f = lambda x: x**2
f_prime_exact = lambda x: 2*x # Analytical derivative
x_test = 3.0
numerical = numerical_derivative(f, x_test)
analytical = f_prime_exact(x_test)
print(f"f(x) = x²")
print(f"\nAt x = {x_test}:")
print(f"Numerical derivative: {numerical:.6f}")
print(f"Analytical derivative: {analytical:.6f}")
print(f"Error: {abs(numerical - analytical):.2e}")
# Visualize the derivative geometrically as the slope of the tangent
x = np.linspace(0, 5, 100)
y = f(x)
# Tangent line at x_test: y = f(x0) + f'(x0)·(x − x0)
slope = f_prime_exact(x_test)
tangent_y = f(x_test) + slope * (x - x_test)
plt.figure(figsize=(10, 6))
plt.plot(x, y, 'b-', linewidth=2, label='f(x) = x²')
plt.plot(x, tangent_y, 'r--', linewidth=2, label=f"Tangent at x={x_test}")
plt.plot(x_test, f(x_test), 'go', markersize=10, label=f'Point ({x_test}, {f(x_test)})')
plt.xlabel('x', fontsize=12)
plt.ylabel('f(x)', fontsize=12)
plt.title('Derivative as Slope of Tangent Line', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()
print(f"\nSlope of tangent = f'({x_test}) = {slope}")
# Chain rule, product rule, quotient rule — demonstrated symbolically.
# Symbolic differentiation with sympy: sp.diff applies the rules exactly.
x = sp.Symbol('x')
# Chain rule: d/dx[f(g(x))] = f'(g(x)) * g'(x)
f_chain = sp.sin(x**2)
print("Chain Rule Example:")
print(f"f(x) = sin(x²)")
print(f"f'(x) = {sp.diff(f_chain, x)}")
print(f"Simplified: {sp.simplify(sp.diff(f_chain, x))}")
# Product rule: d/dx[f(x)g(x)] = f'(x)g(x) + f(x)g'(x)
f_product = x**2 * sp.exp(x)
print("\nProduct Rule Example:")
print(f"f(x) = x² * e^x")
print(f"f'(x) = {sp.diff(f_product, x)}")
print(f"Simplified: {sp.simplify(sp.diff(f_product, x))}")
# Quotient rule: d/dx[f(x)/g(x)] = [f'(x)g(x) - f(x)g'(x)] / g(x)²
f_quotient = sp.sin(x) / x
print("\nQuotient Rule Example:")
print(f"f(x) = sin(x) / x")
print(f"f'(x) = {sp.diff(f_quotient, x)}")
print(f"Simplified: {sp.simplify(sp.diff(f_quotient, x))}")
5.2 Partial Differentiation and Gradients¶
Definition 5.3 (Partial Derivative)¶
For f: ℝⁿ → ℝ, the partial derivative with respect to xᵢ is:

$$\frac{\partial f}{\partial x_i} = \lim_{h \to 0} \frac{f(x_1, \ldots, x_i + h, \ldots, x_n) - f(x_1, \ldots, x_i, \ldots, x_n)}{h}$$
Definition 5.4 (Gradient)¶
The gradient is the vector of all partial derivatives:

$$\nabla f = \left[\frac{\partial f}{\partial x_1}, \frac{\partial f}{\partial x_2}, \ldots, \frac{\partial f}{\partial x_n}\right]^\top$$
Key Property: Gradient points in direction of steepest ascent!
# Partial derivatives and gradients, computed symbolically with sympy.
# Example: f(x, y) = x² + xy + y²
x, y = sp.symbols('x y')
f = x**2 + x*y + y**2
print("Function: f(x, y) = x² + xy + y²")
# Partial derivatives: differentiate w.r.t. one symbol, holding the other fixed
df_dx = sp.diff(f, x)
df_dy = sp.diff(f, y)
print(f"\n∂f/∂x = {df_dx}")
print(f"∂f/∂y = {df_dy}")
# Gradient = vector of the partials
print(f"\n∇f = [{df_dx}, {df_dy}]ᵀ")
# Evaluate at a point by substituting numeric values into the symbolic expressions
point = {x: 2, y: 3}
grad_at_point = np.array([float(df_dx.subs(point)), float(df_dy.subs(point))])
print(f"\nAt (2, 3):")
print(f"∇f(2, 3) = {grad_at_point}")
# Numerical gradient
def f_numpy(xy):
    """Evaluate f(x, y) = x² + xy + y² on a length-2 indexable point."""
    first, second = xy[0], xy[1]
    return first**2 + first * second + second**2
def numerical_gradient(f, x, h=1e-5):
    """Central-difference gradient of a scalar function f at point x.

    Parameters
    ----------
    f : callable taking a 1-D array and returning a scalar.
    x : array-like evaluation point. Cast to float internally — an integer
        array would otherwise silently round the ±h perturbation away
        (x_plus[i] += 1e-5 on an int array leaves it unchanged, giving a
        zero gradient).
    h : finite-difference step size.

    Returns
    -------
    numpy array of the same length as x with grad[i] ≈ ∂f/∂xᵢ.
    """
    x = np.asarray(x, dtype=float)  # guard against int-dtype truncation of ±h
    n = len(x)
    grad = np.zeros(n)
    for i in range(n):
        x_plus = x.copy()
        x_minus = x.copy()
        x_plus[i] += h
        x_minus[i] -= h
        grad[i] = (f(x_plus) - f(x_minus)) / (2 * h)
    return grad
# Cross-check: the finite-difference gradient should match the symbolic
# gradient grad_at_point computed above at the same point (2, 3).
x_point = np.array([2.0, 3.0])
grad_numerical = numerical_gradient(f_numpy, x_point)
print(f"\nNumerical gradient: {grad_numerical}")
print(f"Match? {np.allclose(grad_at_point, grad_numerical)}")
# Visualize gradient as direction of steepest ascent
def f_2d(x, y):
    """Quadratic bowl f(x, y) = x² + xy + y²; broadcasts elementwise over arrays."""
    cross_term = x * y
    return x**2 + cross_term + y**2
def gradient_2d(x, y):
    """Analytic gradient of f_2d: ∇f = [2x + y, x + 2y]."""
    d_dx = 2*x + y
    d_dy = x + 2*y
    return np.array([d_dx, d_dy])
# Create grid over [-3, 3]² for the surface and contour plots
x_range = np.linspace(-3, 3, 100)
y_range = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x_range, y_range)
Z = f_2d(X, Y)
fig = plt.figure(figsize=(14, 6))
# Left panel: 3D surface of the quadratic bowl
ax1 = fig.add_subplot(121, projection='3d')
ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.8)
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_zlabel('f(x, y)')
ax1.set_title('f(x, y) = x² + xy + y²', fontsize=12)
# Right panel: contour plot with gradient vectors overlaid
ax2 = fig.add_subplot(122)
contour = ax2.contour(X, Y, Z, levels=15, cmap='viridis')
ax2.clabel(contour, inline=True, fontsize=8)
# Plot gradient vectors at several points
points = [(-2, -2), (-1, 1), (1, -1), (2, 2)]
for px, py in points:
    grad = gradient_2d(px, py)
    # Normalize to length 0.5 for display; the 1e-10 epsilon avoids a
    # divide-by-zero if a point with zero gradient is ever added.
    grad_norm = grad / (np.linalg.norm(grad) + 1e-10) * 0.5
    ax2.quiver(px, py, grad_norm[0], grad_norm[1],
               color='red', width=0.006, scale=1, scale_units='xy')
    ax2.plot(px, py, 'ro', markersize=6)
ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('Contours + Gradient Vectors\n(red arrows point uphill)', fontsize=12)
ax2.set_aspect('equal')
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Gradients (red arrows) point perpendicular to contours,")
print("in the direction of steepest ascent!")
5.3 Gradients of Vector-Valued Functions¶
Definition 5.6 (Jacobian)¶
For f: ℝⁿ → ℝᵐ, the Jacobian is the m×n matrix:

$$J_{ij} = \frac{\partial f_i}{\partial x_j}, \qquad J = \begin{bmatrix} \nabla f_1^\top \\ \vdots \\ \nabla f_m^\top \end{bmatrix}$$
Each row is the gradient of one output component.
# Jacobian matrix, computed symbolically.
# Example: f: R² → R³
# f(x, y) = [x² + y, xy, sin(x) + cos(y)]
x, y = sp.symbols('x y')
f1 = x**2 + y
f2 = x * y
f3 = sp.sin(x) + sp.cos(y)
print("Vector function f: R² → R³")
print(f"f₁(x,y) = {f1}")
print(f"f₂(x,y) = {f2}")
print(f"f₃(x,y) = {f3}")
# Compute Jacobian: row i is the gradient of output component fᵢ
J = sp.Matrix([
    [sp.diff(f1, x), sp.diff(f1, y)],
    [sp.diff(f2, x), sp.diff(f2, y)],
    [sp.diff(f3, x), sp.diff(f3, y)]
])
print("\nJacobian J (3×2):")
sp.pprint(J)
# Evaluate at a point and convert the sympy Matrix to a float ndarray
point = {x: 1, y: 2}
J_at_point = np.array(J.subs(point)).astype(float)
print(f"\nAt (1, 2):")
print(J_at_point)
# Numerical Jacobian
def f_vector(xy):
    """Vector field f: R² → R³ with f(x, y) = (x² + y, xy, sin x + cos y)."""
    first, second = xy
    component_1 = first**2 + second
    component_2 = first * second
    component_3 = np.sin(first) + np.cos(second)
    return np.array([component_1, component_2, component_3])
def numerical_jacobian(f, x, h=1e-5):
    """Central-difference Jacobian of a vector-valued f at point x.

    Parameters
    ----------
    f : callable mapping a length-n 1-D array to a length-m array.
    x : array-like evaluation point. Cast to float internally — an integer
        array would otherwise silently round the ±h perturbation away and
        produce a zero Jacobian.
    h : finite-difference step size.

    Returns
    -------
    (m, n) numpy array with J[i, j] ≈ ∂fᵢ/∂xⱼ.
    """
    x = np.asarray(x, dtype=float)  # guard against int-dtype truncation of ±h
    n = len(x)
    f0 = f(x)  # one evaluation just to learn the output dimension m
    m = len(f0)
    J = np.zeros((m, n))
    for i in range(n):
        x_plus = x.copy()
        x_minus = x.copy()
        x_plus[i] += h
        x_minus[i] -= h
        # Column i: sensitivity of every output to input coordinate i
        J[:, i] = (f(x_plus) - f(x_minus)) / (2 * h)
    return J
# Cross-check: finite-difference Jacobian vs the symbolic J_at_point above,
# evaluated at the same point (1, 2).
x_point = np.array([1.0, 2.0])
J_numerical = numerical_jacobian(f_vector, x_point)
print("\nNumerical Jacobian:")
print(J_numerical)
print(f"\nMatch? {np.allclose(J_at_point, J_numerical)}")
5.6 Backpropagation and Automatic Differentiation¶
Chain Rule for Composition:¶
For f = f₄ ∘ f₃ ∘ f₂ ∘ f₁:

$$\frac{df}{dx} = \frac{\partial f_4}{\partial f_3} \cdot \frac{\partial f_3}{\partial f_2} \cdot \frac{\partial f_2}{\partial f_1} \cdot \frac{\partial f_1}{\partial x}$$
Backpropagation: Compute gradients efficiently by reusing intermediate results.
Key Idea: Store forward pass values, compute gradients backward.
# Simple backpropagation example
# Compute gradient of f(x) = (exp(x² + 1))²
def forward_pass(x):
    """
    Evaluate f(x) = (exp(x² + 1))² while recording every intermediate value.

    The computation is decomposed into elementary steps so the backward
    pass can apply the chain rule one link at a time:
        a = x²
        b = a + 1
        c = exp(b)
        f = c²

    Returns the output f and a cache dict holding x and all intermediates.
    """
    a = x**2
    b = a + 1
    c = np.exp(b)
    f = c**2
    return f, dict(x=x, a=a, b=b, c=c, f=f)
def backward_pass(cache):
    """
    Chain-rule sweep from output back to input using the cached forward values.

    df/dx = df/dc * dc/db * db/da * da/dx
    """
    x = cache['x']
    b = cache['b']
    c = cache['c']
    # Local derivatives of each elementary step, listed output-to-input:
    #   f = c²   →  df/dc = 2c
    #   c = e^b  →  dc/db = e^b (equals c)
    #   b = a+1  →  db/da = 1
    #   a = x²   →  da/dx = 2x
    local_grads = (2 * c, np.exp(b), 1, 2 * x)
    total = local_grads[0]
    for link in local_grads[1:]:
        total = total * link
    return total
# Test manual backprop at x = 2 and verify against a finite difference
x_test = 2.0
f_val, cache = forward_pass(x_test)
gradient = backward_pass(cache)
print("Backpropagation Example")
print(f"f(x) = (exp(x² + 1))²")
print(f"\nAt x = {x_test}:")
print(f"f({x_test}) = {f_val:.4f}")
print(f"df/dx = {gradient:.4f}")
# Verify with numerical gradient (central difference on the forward pass)
h = 1e-5
numerical_grad = (forward_pass(x_test + h)[0] - forward_pass(x_test - h)[0]) / (2 * h)
print(f"\nNumerical gradient: {numerical_grad:.4f}")
print(f"Match? {np.isclose(gradient, numerical_grad)}")
# Show intermediate values cached by the forward pass
print("\n" + "="*50)
print("\nForward pass (stored for backprop):")
for key, val in cache.items():
    print(f"{key} = {val:.4f}")
# Automatic differentiation with PyTorch (optional dependency: falls back
# gracefully when torch is not installed).
try:
    import torch
    # Enable autograd: requires_grad=True makes torch record ops on x
    x = torch.tensor([2.0], requires_grad=True)
    # Define computation — same decomposition as the manual example above
    a = x**2
    b = a + 1
    c = torch.exp(b)
    f = c**2
    print("PyTorch Autograd Example")
    print(f"f(x) = (exp(x² + 1))²")
    print(f"\nAt x = {x.item()}:")
    print(f"f(x) = {f.item():.4f}")
    # Backpropagation: populates x.grad with df/dx
    f.backward()
    print(f"\nAutomatic gradient: {x.grad.item():.4f}")
    print(f"Manual gradient: {gradient:.4f}")  # 'gradient' from the manual backprop cell above
    print(f"\nMatch? {np.isclose(x.grad.item(), gradient)}")
except ImportError:
    print("PyTorch not installed. Run: pip install torch")
    print("\nThe manual backpropagation above shows the same principle!")
5.7 Higher-Order Derivatives¶
Definition 5.9 (Hessian)¶
The Hessian matrix contains all second-order partial derivatives:

$$H_{ij} = \frac{\partial^2 f}{\partial x_i \, \partial x_j}$$
Properties:
Symmetric (if f is smooth)
Describes local curvature
Used in Newton’s method
# Hessian matrix, computed symbolically.
# Example: f(x, y) = x² + xy + y²
x, y = sp.symbols('x y')
f = x**2 + x*y + y**2
print("Function: f(x, y) = x² + xy + y²")
# First derivatives (gradient)
df_dx = sp.diff(f, x)
df_dy = sp.diff(f, y)
print(f"\n∇f = [{df_dx}, {df_dy}]ᵀ")
# Second derivatives (Hessian): differentiate each partial once more
d2f_dx2 = sp.diff(df_dx, x)
d2f_dxdy = sp.diff(df_dx, y)
d2f_dydx = sp.diff(df_dy, x)
d2f_dy2 = sp.diff(df_dy, y)
H = sp.Matrix([
    [d2f_dx2, d2f_dxdy],
    [d2f_dydx, d2f_dy2]
])
print("\nHessian H:")
sp.pprint(H)
# Evaluate at a point (constant here: f is quadratic, so H is point-independent)
point = {x: 1, y: 2}
H_at_point = np.array(H.subs(point)).astype(float)
print(f"\nAt (1, 2):")
print(H_at_point)
# Check symmetry: mixed partials agree for smooth f (Schwarz's theorem)
print(f"\nSymmetric? {np.allclose(H_at_point, H_at_point.T)}")
# Eigenvalues determine curvature; eigvalsh exploits the symmetry
eigenvalues = np.linalg.eigvalsh(H_at_point)
print(f"\nEigenvalues: {eigenvalues}")
if np.all(eigenvalues > 0):
    print("All positive → local minimum (convex)")
elif np.all(eigenvalues < 0):
    print("All negative → local maximum (concave)")
else:
    print("Mixed signs → saddle point")
# Numerical Hessian
def numerical_hessian(f, x, h=1e-5):
    """
    Central-difference Hessian of a scalar function f at point x.

    Uses the four-point stencil
        H[i, j] ≈ [f(x+heᵢ+heⱼ) - f(x+heᵢ-heⱼ)
                   - f(x-heᵢ+heⱼ) + f(x-heᵢ-heⱼ)] / (4h²).

    x is cast to float internally — an integer array would otherwise
    silently round the ±h perturbation away and produce a zero Hessian.
    With the 4h² denominator the estimate is sensitive to floating-point
    round-off for very small h, so pass a larger h if results look noisy.
    """
    x = np.asarray(x, dtype=float)  # guard against int-dtype truncation of ±h
    n = len(x)
    H = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            # Four symmetric evaluations around x for ∂²f/∂xᵢ∂xⱼ
            x_pp = x.copy()
            x_pm = x.copy()
            x_mp = x.copy()
            x_mm = x.copy()
            x_pp[i] += h; x_pp[j] += h
            x_pm[i] += h; x_pm[j] -= h
            x_mp[i] -= h; x_mp[j] += h
            x_mm[i] -= h; x_mm[j] -= h
            H[i, j] = (f(x_pp) - f(x_pm) - f(x_mp) + f(x_mm)) / (4 * h**2)
    return H
# Test: finite-difference Hessian vs the symbolic H_at_point computed above
f_numpy = lambda xy: xy[0]**2 + xy[0]*xy[1] + xy[1]**2
x_point = np.array([1.0, 2.0])
H_numerical = numerical_hessian(f_numpy, x_point)
print("Numerical Hessian:")
print(H_numerical)
print("\nAnalytical Hessian:")
print(H_at_point)
# Loose tolerance: the 4h² denominator amplifies floating-point round-off
print(f"\nMatch? {np.allclose(H_numerical, H_at_point, atol=1e-4)}")
5.8 Linearization and Multivariate Taylor Series¶
First-Order Taylor Approximation (Linearization):¶

$$f(\mathbf{x}) \approx f(\mathbf{x}_0) + \nabla f(\mathbf{x}_0)^\top (\mathbf{x} - \mathbf{x}_0)$$

Second-Order Taylor Approximation:¶

$$f(\mathbf{x}) \approx f(\mathbf{x}_0) + \nabla f(\mathbf{x}_0)^\top \boldsymbol{\delta} + \tfrac{1}{2}\, \boldsymbol{\delta}^\top \mathbf{H}(\mathbf{x}_0)\, \boldsymbol{\delta}, \qquad \boldsymbol{\delta} = \mathbf{x} - \mathbf{x}_0$$
Uses:
Local approximation
Newton’s method
Optimization algorithms
# Taylor approximation
def f_2d(xy):
    """f(x, y) = x² + xy + y², taking the point as a length-2 sequence."""
    first, second = xy
    return first**2 + first*second + second**2
def gradient_2d(xy):
    """Analytic gradient ∇f = [2x + y, x + 2y] of the quadratic at point xy."""
    first, second = xy
    return np.array([2*first + second, first + 2*second])
def hessian_2d(xy):
    """Constant Hessian [[2, 1], [1, 2]] of the quadratic.

    The argument is unused: second derivatives of a quadratic form do not
    depend on the evaluation point.
    """
    entries = [[2, 1],
               [1, 2]]
    return np.array(entries)
# Expansion point for the Taylor series
x0 = np.array([1.0, 1.0])
f0 = f_2d(x0)
grad0 = gradient_2d(x0)
H0 = hessian_2d(x0)
# Test point near the expansion point
x = np.array([1.5, 1.2])
delta_x = x - x0
# True value of the function at the test point
f_true = f_2d(x)
# Zero-order (constant) approximation: just f(x0)
f_order0 = f0
# First-order (linear): f0 + ∇f·δx
f_order1 = f0 + grad0 @ delta_x
# Second-order (quadratic): adds the ½ δxᵀ H δx curvature term; exact
# for this quadratic f, so its error below should be ~0
f_order2 = f0 + grad0 @ delta_x + 0.5 * delta_x @ H0 @ delta_x
print(f"Function: f(x, y) = x² + xy + y²")
print(f"\nExpansion point: x₀ = {x0}")
print(f"Test point: x = {x}")
print(f"δx = {delta_x}")
print("\n" + "="*50)
print("\nTaylor Approximations:")
print(f"True value: f(x) = {f_true:.6f}")
print(f"0th order (const): = {f_order0:.6f} (error: {abs(f_true - f_order0):.6f})")
print(f"1st order (linear): = {f_order1:.6f} (error: {abs(f_true - f_order1):.6f})")
print(f"2nd order (quadratic): = {f_order2:.6f} (error: {abs(f_true - f_order2):.6f})")
print("\nHigher-order approximations are more accurate!")
# Visualize Taylor approximation
# 1D example for easier visualization
def f_1d(x):
    """Test function f(x) = sin(x) + 0.1·x² (works on scalars and arrays)."""
    sinusoid = np.sin(x)
    bowl = 0.1 * x**2
    return sinusoid + bowl
def f_prime(x):
    """First derivative f'(x) = cos(x) + 0.2·x of f_1d."""
    slope_term = np.cos(x)
    return slope_term + 0.2 * x
def f_double_prime(x):
    """Second derivative f''(x) = -sin(x) + 0.2 of f_1d."""
    curvature = -np.sin(x)
    return curvature + 0.2
# Expansion point for the 1-D Taylor series
x0 = 1.0
f0 = f_1d(x0)
fp0 = f_prime(x0)
fpp0 = f_double_prime(x0)
# Range of x values centered on the expansion point
x = np.linspace(x0 - 1.5, x0 + 1.5, 100)
# True curve plus its 1st- and 2nd-order Taylor approximations about x0
f_true = f_1d(x)
f_linear = f0 + fp0 * (x - x0)
f_quadratic = f0 + fp0 * (x - x0) + 0.5 * fpp0 * (x - x0)**2
plt.figure(figsize=(12, 7))
plt.plot(x, f_true, 'b-', linewidth=3, label='True f(x)')
plt.plot(x, f_linear, 'r--', linewidth=2, label='1st order (linear)')
plt.plot(x, f_quadratic, 'g--', linewidth=2, label='2nd order (quadratic)')
plt.plot(x0, f0, 'ko', markersize=10, label=f'Expansion point x₀={x0}')
plt.xlabel('x', fontsize=12)
plt.ylabel('f(x)', fontsize=12)
plt.title('Taylor Series Approximation', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.xlim(x0 - 1.5, x0 + 1.5)
plt.tight_layout()
plt.show()
print("Observation: Approximations are better near the expansion point!")
print("2nd order captures curvature, 1st order only captures slope.")
Summary¶
Key Concepts from Chapter 5:¶
Derivatives: Instantaneous rate of change, slopes
Partial Derivatives: Change along one coordinate
Gradients: Direction of steepest ascent
Jacobians: Generalization for vector functions
Chain Rule: Composition of functions
Backpropagation: Efficient gradient computation
Hessians: Second-order derivatives, curvature
Taylor Series: Local approximations
ML Applications:¶
| Concept | ML Application |
|---|---|
| Gradients | Gradient descent, optimization |
| Backpropagation | Training neural networks |
| Hessians | Newton's method, second-order optimization |
| Chain rule | Deep learning, automatic differentiation |
| Taylor series | Trust region methods |
Gradient Descent Preview:¶
$$\mathbf{x}_{t+1} = \mathbf{x}_t - \alpha \, \nabla f(\mathbf{x}_t)$$

where α is the learning rate.
Next Steps:¶
Chapter 6: Probability and Distributions
Chapter 7: Continuous Optimization (applies gradients)
Chapter 9: Linear Regression (uses gradients for fitting)
Practice: Implement gradient descent to minimize f(x, y) = x² + xy + y²!