# Setup
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from scipy.optimize import minimize
import sympy as sp

# Notebook-wide display configuration: seaborn grid theme for all plots,
# and compact numpy printing (4 decimals, no scientific notation).
sns.set_style('whitegrid')
np.set_printoptions(precision=4, suppress=True)

5.1 Differentiation of Univariate Functions

Definition 5.1 (Derivative)

The derivative of f at x is:

\[f'(x) = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h}\]

Interpretation:

  • Instantaneous rate of change

  • Slope of tangent line

Common Derivatives:

| Function     | Derivative     |
|--------------|----------------|
| \(x^n\)      | \(nx^{n-1}\)   |
| \(e^x\)      | \(e^x\)        |
| \(\ln(x)\)   | \(1/x\)        |
| \(\sin(x)\)  | \(\cos(x)\)    |
| \(\cos(x)\)  | \(-\sin(x)\)   |

# Numerical derivative approximation

def numerical_derivative(f, x, h=1e-5):
    """Estimate f'(x) with the symmetric (central) difference quotient.

    Evaluates (f(x+h) - f(x-h)) / (2h), whose truncation error is O(h²),
    one order better than the one-sided forward difference.
    """
    ahead = f(x + h)
    behind = f(x - h)
    return (ahead - behind) / (2 * h)

# Example: f(x) = x^2
# Compare the numerical central-difference derivative against the known
# analytical derivative f'(x) = 2x, then visualize f and its tangent line.
f = lambda x: x**2
f_prime_exact = lambda x: 2*x  # Analytical derivative

x_test = 3.0
numerical = numerical_derivative(f, x_test)
analytical = f_prime_exact(x_test)

print(f"f(x) = x²")
print(f"\nAt x = {x_test}:")
print(f"Numerical derivative: {numerical:.6f}")
print(f"Analytical derivative: {analytical:.6f}")
print(f"Error: {abs(numerical - analytical):.2e}")

# Visualize
x = np.linspace(0, 5, 100)
y = f(x)

# Tangent line at x_test
# The tangent is the first-order Taylor expansion: f(x0) + f'(x0)(x - x0).
slope = f_prime_exact(x_test)
tangent_y = f(x_test) + slope * (x - x_test)

plt.figure(figsize=(10, 6))
plt.plot(x, y, 'b-', linewidth=2, label='f(x) = x²')
plt.plot(x, tangent_y, 'r--', linewidth=2, label=f"Tangent at x={x_test}")
plt.plot(x_test, f(x_test), 'go', markersize=10, label=f'Point ({x_test}, {f(x_test)})')
plt.xlabel('x', fontsize=12)
plt.ylabel('f(x)', fontsize=12)
plt.title('Derivative as Slope of Tangent Line', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()

print(f"\nSlope of tangent = f'({x_test}) = {slope}")
# Chain rule, product rule, quotient rule

# Symbolic differentiation with sympy
# NOTE: this rebinds `x` (previously a numpy array) to a sympy Symbol —
# notebook cells below that use sympy rely on this binding.
x = sp.Symbol('x')

# Chain rule: d/dx[f(g(x))] = f'(g(x)) * g'(x)
f_chain = sp.sin(x**2)
print("Chain Rule Example:")
print(f"f(x) = sin(x²)")
print(f"f'(x) = {sp.diff(f_chain, x)}")
print(f"Simplified: {sp.simplify(sp.diff(f_chain, x))}")

# Product rule: d/dx[f(x)g(x)] = f'(x)g(x) + f(x)g'(x)
f_product = x**2 * sp.exp(x)
print("\nProduct Rule Example:")
print(f"f(x) = x² * e^x")
print(f"f'(x) = {sp.diff(f_product, x)}")
print(f"Simplified: {sp.simplify(sp.diff(f_product, x))}")

# Quotient rule: d/dx[f(x)/g(x)] = [f'(x)g(x) - f(x)g'(x)] / g(x)²
f_quotient = sp.sin(x) / x
print("\nQuotient Rule Example:")
print(f"f(x) = sin(x) / x")
print(f"f'(x) = {sp.diff(f_quotient, x)}")
print(f"Simplified: {sp.simplify(sp.diff(f_quotient, x))}")

5.2 Partial Differentiation and Gradients

Definition 5.3 (Partial Derivative)

For f: ℝⁿ → ℝ, the partial derivative with respect to xᵢ is:

\[\frac{\partial f}{\partial x_i} = \lim_{h \to 0} \frac{f(x_1, \ldots, x_i + h, \ldots, x_n) - f(x_1, \ldots, x_n)}{h}\]

Definition 5.4 (Gradient)

The gradient is the vector of all partial derivatives:

\[\begin{split}\nabla f(x) = \begin{bmatrix} \frac{\partial f}{\partial x_1} \\ \vdots \\ \frac{\partial f}{\partial x_n} \end{bmatrix}\end{split}\]

Key Property: Gradient points in direction of steepest ascent!

# Partial derivatives and gradients

# Example: f(x, y) = x² + xy + y²
# Symbols `x` and `y` shadow any earlier numeric bindings of those names.
x, y = sp.symbols('x y')
f = x**2 + x*y + y**2

print("Function: f(x, y) = x² + xy + y²")

# Partial derivatives: differentiate w.r.t. one variable, holding the other fixed.
df_dx = sp.diff(f, x)
df_dy = sp.diff(f, y)

print(f"\n∂f/∂x = {df_dx}")
print(f"∂f/∂y = {df_dy}")

# Gradient = column vector of all partial derivatives.
print(f"\n∇f = [{df_dx}, {df_dy}]ᵀ")

# Evaluate at a point by substituting concrete values into the sympy expressions.
point = {x: 2, y: 3}
grad_at_point = np.array([float(df_dx.subs(point)), float(df_dy.subs(point))])

print(f"\nAt (2, 3):")
print(f"∇f(2, 3) = {grad_at_point}")

# Numerical gradient
def f_numpy(xy):
    """Evaluate f(x, y) = x² + xy + y² at a length-2 point ``xy``."""
    a, b = xy[0], xy[1]
    return a * a + a * b + b * b

def numerical_gradient(f, x, h=1e-5):
    """Approximate the gradient of a scalar function with central differences.

    Parameters
    ----------
    f : callable
        Maps a 1-D array to a scalar.
    x : array-like
        Point at which to differentiate. Cast to float so that integer
        inputs do not silently truncate the ±h perturbation to zero
        (the original in-place ``x_plus[i] += h`` on an int array would
        produce a wrong, typically zero, gradient).
    h : float
        Step size of the symmetric difference quotient (O(h²) error).

    Returns
    -------
    np.ndarray
        Array of shape (n,) with ∂f/∂xᵢ in entry i.
    """
    x = np.asarray(x, dtype=float)
    grad = np.zeros_like(x)
    for i in range(x.size):
        x_plus = x.copy()
        x_minus = x.copy()
        x_plus[i] += h
        x_minus[i] -= h
        grad[i] = (f(x_plus) - f(x_minus)) / (2 * h)
    return grad

# Sanity check: the finite-difference gradient should agree with the
# symbolic gradient evaluated at the same point above.
x_point = np.array([2.0, 3.0])
grad_numerical = numerical_gradient(f_numpy, x_point)

print(f"\nNumerical gradient: {grad_numerical}")
print(f"Match? {np.allclose(grad_at_point, grad_numerical)}")
# Visualize gradient as direction of steepest ascent

def f_2d(x, y):
    """Quadratic f(x, y) = x² + xy + y²; works elementwise on arrays."""
    return x * x + x * y + y * y

def gradient_2d(x, y):
    """Analytic gradient of f(x, y) = x² + xy + y², i.e. (2x + y, x + 2y)."""
    dfdx = 2 * x + y
    dfdy = x + 2 * y
    return np.array([dfdx, dfdy])

# Create grid
# Evaluate f on a 100×100 grid over [-3, 3]² for the surface/contour plots.
x_range = np.linspace(-3, 3, 100)
y_range = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x_range, y_range)
Z = f_2d(X, Y)

fig = plt.figure(figsize=(14, 6))

# 3D surface
ax1 = fig.add_subplot(121, projection='3d')
ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.8)
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_zlabel('f(x, y)')
ax1.set_title('f(x, y) = x² + xy + y²', fontsize=12)

# Contour plot with gradient vectors
ax2 = fig.add_subplot(122)
contour = ax2.contour(X, Y, Z, levels=15, cmap='viridis')
ax2.clabel(contour, inline=True, fontsize=8)

# Plot gradient vectors at several points
points = [(-2, -2), (-1, 1), (1, -1), (2, 2)]
for px, py in points:
    grad = gradient_2d(px, py)
    # Normalize for visualization
    # Arrows are scaled to length 0.5; the 1e-10 guards against a zero
    # gradient (which would divide by zero at the minimum).
    grad_norm = grad / (np.linalg.norm(grad) + 1e-10) * 0.5
    ax2.quiver(px, py, grad_norm[0], grad_norm[1], 
              color='red', width=0.006, scale=1, scale_units='xy')
    ax2.plot(px, py, 'ro', markersize=6)

ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('Contours + Gradient Vectors\n(red arrows point uphill)', fontsize=12)
ax2.set_aspect('equal')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Gradients (red arrows) point perpendicular to contours,")
print("in the direction of steepest ascent!")

5.3 Gradients of Vector-Valued Functions

Definition 5.6 (Jacobian)

For f: ℝⁿ → ℝᵐ, the Jacobian is the m×n matrix:

\[\begin{split}J = \begin{bmatrix} \frac{\partial f_1}{\partial x_1} & \cdots & \frac{\partial f_1}{\partial x_n} \\ \vdots & \ddots & \vdots \\ \frac{\partial f_m}{\partial x_1} & \cdots & \frac{\partial f_m}{\partial x_n} \end{bmatrix}\end{split}\]

Each row is the gradient of one output component.

# Jacobian matrix

# Example: f: R² → R³
# f(x, y) = [x² + y, xy, sin(x) + cos(y)]

x, y = sp.symbols('x y')
f1 = x**2 + y
f2 = x * y
f3 = sp.sin(x) + sp.cos(y)

print("Vector function f: R² → R³")
print(f"f₁(x,y) = {f1}")
print(f"f₂(x,y) = {f2}")
print(f"f₃(x,y) = {f3}")

# Compute Jacobian
# Row i holds the gradient of output component fᵢ: J[i, j] = ∂fᵢ/∂xⱼ.
J = sp.Matrix([
    [sp.diff(f1, x), sp.diff(f1, y)],
    [sp.diff(f2, x), sp.diff(f2, y)],
    [sp.diff(f3, x), sp.diff(f3, y)]
])

print("\nJacobian J (3×2):")
sp.pprint(J)

# Evaluate at a point
# subs() yields a sympy Matrix of exact values; astype(float) converts
# the entries to a plain numeric numpy array.
point = {x: 1, y: 2}
J_at_point = np.array(J.subs(point)).astype(float)

print(f"\nAt (1, 2):")
print(J_at_point)

# Numerical Jacobian
def f_vector(xy):
    """Vector-valued map f: R² → R³, f(x, y) = (x² + y, xy, sin x + cos y)."""
    u, v = xy
    components = [u**2 + v, u * v, np.sin(u) + np.cos(v)]
    return np.array(components)

def numerical_jacobian(f, x, h=1e-5):
    """Approximate the Jacobian of a vector-valued function at x.

    Parameters
    ----------
    f : callable
        Maps a 1-D array of length n to a 1-D array of length m.
    x : array-like
        Point at which to differentiate. Cast to float so that integer
        inputs do not silently truncate the ±h perturbation (in-place
        ``+= h`` on an int array would round h away and give a wrong
        Jacobian).
    h : float
        Central-difference step size (O(h²) error).

    Returns
    -------
    np.ndarray
        (m, n) matrix with J[i, j] ≈ ∂fᵢ/∂xⱼ; column j is the
        central-difference derivative of f along coordinate j.
    """
    x = np.asarray(x, dtype=float)
    f0 = np.asarray(f(x))
    J = np.zeros((f0.size, x.size))

    for i in range(x.size):
        x_plus = x.copy()
        x_minus = x.copy()
        x_plus[i] += h
        x_minus[i] -= h
        # np.asarray tolerates f returning a list/tuple as well as an array.
        J[:, i] = (np.asarray(f(x_plus)) - np.asarray(f(x_minus))) / (2 * h)

    return J

# Cross-check the finite-difference Jacobian against the symbolic one
# evaluated at the same point above.
x_point = np.array([1.0, 2.0])
J_numerical = numerical_jacobian(f_vector, x_point)

print("\nNumerical Jacobian:")
print(J_numerical)

print(f"\nMatch? {np.allclose(J_at_point, J_numerical)}")

5.6 Backpropagation and Automatic Differentiation

Chain Rule for Composition:

For f = f₄ ∘ f₃ ∘ f₂ ∘ f₁:

\[\frac{\partial f}{\partial x} = \frac{\partial f_4}{\partial f_3} \cdot \frac{\partial f_3}{\partial f_2} \cdot \frac{\partial f_2}{\partial f_1} \cdot \frac{\partial f_1}{\partial x}\]

Backpropagation: Compute gradients efficiently by reusing intermediate results.

Key Idea: Store forward pass values, compute gradients backward.

# Simple backpropagation example
# Compute gradient of f(x) = (exp(x² + 1))²

def forward_pass(x):
    """Evaluate f(x) = (exp(x² + 1))² while recording intermediates.

    The computation is decomposed into elementary steps
        a = x²,  b = a + 1,  c = exp(b),  f = c²,
    and every intermediate is stored in a cache dict so that
    backward_pass can reuse them when applying the chain rule.

    Returns the pair (f_value, cache).
    """
    a = x * x
    b = a + 1
    c = np.exp(b)
    out = c * c

    cache = {'x': x, 'a': a, 'b': b, 'c': c, 'f': out}
    return out, cache

def backward_pass(cache):
    """Chain-rule backward sweep for f(x) = (exp(x² + 1))².

    Multiplies the local derivatives outermost-first:
        df/dx = df/dc · dc/db · db/da · da/dx = 2c · e^b · 1 · 2x,
    reusing the intermediates stored by forward_pass.
    """
    x_val = cache['x']
    b_val = cache['b']
    c_val = cache['c']

    # Accumulate the product of local derivatives step by step.
    grad = 2 * c_val            # df/dc  (derivative of c²)
    grad = grad * np.exp(b_val)  # × dc/db (= c again)
    grad = grad * 1              # × db/da (b = a + 1)
    grad = grad * (2 * x_val)    # × da/dx (a = x²)

    return grad

# Test
# Run the manual forward/backward passes and verify the gradient against
# a central finite difference of the forward pass.
x_test = 2.0
f_val, cache = forward_pass(x_test)
gradient = backward_pass(cache)

print("Backpropagation Example")
print(f"f(x) = (exp(x² + 1))²")
print(f"\nAt x = {x_test}:")
print(f"f({x_test}) = {f_val:.4f}")
print(f"df/dx = {gradient:.4f}")

# Verify with numerical gradient
h = 1e-5
numerical_grad = (forward_pass(x_test + h)[0] - forward_pass(x_test - h)[0]) / (2 * h)
print(f"\nNumerical gradient: {numerical_grad:.4f}")
print(f"Match? {np.isclose(gradient, numerical_grad)}")

# Show intermediate values
print("\n" + "="*50)
print("\nForward pass (stored for backprop):")
for key, val in cache.items():
    print(f"{key} = {val:.4f}")
# Automatic differentiation with PyTorch

try:
    # torch is optional; the except branch explains what to do if missing.
    import torch
    
    # Enable autograd
    # requires_grad=True tells torch to build the computation graph so
    # gradients w.r.t. x can be computed by backward().
    x = torch.tensor([2.0], requires_grad=True)
    
    # Define computation (same decomposition as the manual example above)
    a = x**2
    b = a + 1
    c = torch.exp(b)
    f = c**2
    
    print("PyTorch Autograd Example")
    print(f"f(x) = (exp(x² + 1))²")
    print(f"\nAt x = {x.item()}:")
    print(f"f(x) = {f.item():.4f}")
    
    # Backpropagation
    # backward() traverses the recorded graph and populates x.grad.
    f.backward()
    
    # `gradient` comes from the manual backward_pass cell run earlier.
    print(f"\nAutomatic gradient: {x.grad.item():.4f}")
    print(f"Manual gradient: {gradient:.4f}")
    print(f"\nMatch? {np.isclose(x.grad.item(), gradient)}")
    
except ImportError:
    print("PyTorch not installed. Run: pip install torch")
    print("\nThe manual backpropagation above shows the same principle!")

5.7 Higher-Order Derivatives

Definition 5.9 (Hessian)

The Hessian matrix contains all second-order partial derivatives:

\[\begin{split}H = \begin{bmatrix} \frac{\partial^2 f}{\partial x_1^2} & \frac{\partial^2 f}{\partial x_1 \partial x_2} & \cdots \\ \frac{\partial^2 f}{\partial x_2 \partial x_1} & \frac{\partial^2 f}{\partial x_2^2} & \cdots \\ \vdots & \vdots & \ddots \end{bmatrix}\end{split}\]

Properties:

  • Symmetric (if f is smooth)

  • Describes local curvature

  • Used in Newton’s method

# Hessian matrix

# Example: f(x, y) = x² + xy + y²
x, y = sp.symbols('x y')
f = x**2 + x*y + y**2

print("Function: f(x, y) = x² + xy + y²")

# First derivatives (gradient)
df_dx = sp.diff(f, x)
df_dy = sp.diff(f, y)

print(f"\n∇f = [{df_dx}, {df_dy}]ᵀ")

# Second derivatives (Hessian)
# Differentiate each first derivative again; the mixed partials
# d2f_dxdy and d2f_dydx agree because f is smooth (Schwarz's theorem).
d2f_dx2 = sp.diff(df_dx, x)
d2f_dxdy = sp.diff(df_dx, y)
d2f_dydx = sp.diff(df_dy, x)
d2f_dy2 = sp.diff(df_dy, y)

H = sp.Matrix([
    [d2f_dx2, d2f_dxdy],
    [d2f_dydx, d2f_dy2]
])

print("\nHessian H:")
sp.pprint(H)

# Evaluate at a point
# (constant for this quadratic, so the point does not matter)
point = {x: 1, y: 2}
H_at_point = np.array(H.subs(point)).astype(float)

print(f"\nAt (1, 2):")
print(H_at_point)

# Check symmetry
print(f"\nSymmetric? {np.allclose(H_at_point, H_at_point.T)}")

# Eigenvalues determine curvature
# eigvalsh exploits symmetry and returns real eigenvalues in ascending order.
eigenvalues = np.linalg.eigvalsh(H_at_point)
print(f"\nEigenvalues: {eigenvalues}")
if np.all(eigenvalues > 0):
    print("All positive → local minimum (convex)")
elif np.all(eigenvalues < 0):
    print("All negative → local maximum (concave)")
else:
    print("Mixed signs → saddle point")
# Numerical Hessian

def numerical_hessian(f, x, h=1e-5):
    """Approximate the Hessian of a scalar function at x.

    Uses the four-point central-difference stencil
        ∂²f/∂xᵢ∂xⱼ ≈ [f(++) - f(+-) - f(-+) + f(--)] / (4h²).

    The stencil is exactly symmetric under swapping i and j (the ++ and
    -- evaluations are unchanged and the +- / -+ evaluations swap), so
    only the upper triangle is computed and mirrored — half the function
    evaluations of the naive double loop, with bit-identical results.

    Parameters
    ----------
    f : callable
        Maps a 1-D array to a scalar.
    x : array-like
        Point at which to differentiate. Cast to float so integer inputs
        do not silently truncate the ±h perturbations.
    h : float
        Finite-difference step size.

    Returns
    -------
    np.ndarray
        Symmetric (n, n) Hessian approximation.
    """
    x = np.asarray(x, dtype=float)
    n = x.size
    H = np.zeros((n, n))

    for i in range(n):
        for j in range(i, n):
            x_pp = x.copy()
            x_pm = x.copy()
            x_mp = x.copy()
            x_mm = x.copy()

            x_pp[i] += h; x_pp[j] += h
            x_pm[i] += h; x_pm[j] -= h
            x_mp[i] -= h; x_mp[j] += h
            x_mm[i] -= h; x_mm[j] -= h

            H[i, j] = (f(x_pp) - f(x_pm) - f(x_mp) + f(x_mm)) / (4 * h**2)
            H[j, i] = H[i, j]  # mirror: stencil is symmetric in (i, j)

    return H

# Test
# Compare the finite-difference Hessian to the symbolic one from above.
f_numpy = lambda xy: xy[0]**2 + xy[0]*xy[1] + xy[1]**2
x_point = np.array([1.0, 2.0])

H_numerical = numerical_hessian(f_numpy, x_point)

print("Numerical Hessian:")
print(H_numerical)
print("\nAnalytical Hessian:")
print(H_at_point)
# atol=1e-4 absorbs the O(h²) finite-difference error.
print(f"\nMatch? {np.allclose(H_numerical, H_at_point, atol=1e-4)}")

5.8 Linearization and Multivariate Taylor Series

First-Order Taylor Approximation (Linearization):

\[f(x + \delta x) \approx f(x) + (\nabla f(x))^T \delta x\]

Second-Order Taylor Approximation:

\[f(x + \delta x) \approx f(x) + (\nabla f(x))^T \delta x + \frac{1}{2} \delta x^T H(x) \delta x\]

Uses:

  • Local approximation

  • Newton’s method

  • Optimization algorithms

# Taylor approximation

def f_2d(xy):
    """f(x, y) = x² + xy + y², taking a length-2 point ``xy``."""
    a, b = xy
    return a * a + a * b + b * b

def gradient_2d(xy):
    """Analytic gradient (2x + y, x + 2y) of f at the point ``xy``."""
    a, b = xy
    dfda = 2 * a + b
    dfdb = a + 2 * b
    return np.array([dfda, dfdb])

def hessian_2d(xy):
    """Hessian of f(x, y) = x² + xy + y².

    The function is quadratic, so the Hessian is the constant matrix
    [[2, 1], [1, 2]]; the point ``xy`` is accepted for interface
    uniformity but unused.
    """
    return np.array([[2, 1],
                     [1, 2]])

# Expansion point
# Precompute f, gradient and Hessian at x0 — these are the Taylor coefficients.
x0 = np.array([1.0, 1.0])
f0 = f_2d(x0)
grad0 = gradient_2d(x0)
H0 = hessian_2d(x0)

# Test point
x = np.array([1.5, 1.2])
delta_x = x - x0

# True value
f_true = f_2d(x)

# Zero-order (constant)
f_order0 = f0

# First-order (linear): f(x0) + ∇f(x0)ᵀ δx
f_order1 = f0 + grad0 @ delta_x

# Second-order (quadratic): adds ½ δxᵀ H δx — exact here, since f is quadratic.
f_order2 = f0 + grad0 @ delta_x + 0.5 * delta_x @ H0 @ delta_x

print(f"Function: f(x, y) = x² + xy + y²")
print(f"\nExpansion point: x₀ = {x0}")
print(f"Test point: x = {x}")
print(f"δx = {delta_x}")

print("\n" + "="*50)
print("\nTaylor Approximations:")
print(f"True value:        f(x) = {f_true:.6f}")
print(f"0th order (const):      = {f_order0:.6f}  (error: {abs(f_true - f_order0):.6f})")
print(f"1st order (linear):     = {f_order1:.6f}  (error: {abs(f_true - f_order1):.6f})")
print(f"2nd order (quadratic):  = {f_order2:.6f}  (error: {abs(f_true - f_order2):.6f})")

print("\nHigher-order approximations are more accurate!")
# Visualize Taylor approximation

# 1D example for easier visualization
def f_1d(x):
    """Demo function f(x) = sin(x) + 0.1x² for the 1-D Taylor plot."""
    return 0.1 * x**2 + np.sin(x)

def f_prime(x):
    """First derivative of f_1d: f'(x) = cos(x) + 0.2x."""
    return 0.2 * x + np.cos(x)

def f_double_prime(x):
    """Second derivative of f_1d: f''(x) = -sin(x) + 0.2."""
    return 0.2 - np.sin(x)

# Expansion point
# Taylor coefficients at x0: value, slope, and curvature.
x0 = 1.0
f0 = f_1d(x0)
fp0 = f_prime(x0)
fpp0 = f_double_prime(x0)

# Range
x = np.linspace(x0 - 1.5, x0 + 1.5, 100)

# Approximations
# 1st order is the tangent line; 2nd order adds the ½ f''(x0)(x - x0)² term.
f_true = f_1d(x)
f_linear = f0 + fp0 * (x - x0)
f_quadratic = f0 + fp0 * (x - x0) + 0.5 * fpp0 * (x - x0)**2

plt.figure(figsize=(12, 7))
plt.plot(x, f_true, 'b-', linewidth=3, label='True f(x)')
plt.plot(x, f_linear, 'r--', linewidth=2, label='1st order (linear)')
plt.plot(x, f_quadratic, 'g--', linewidth=2, label='2nd order (quadratic)')
plt.plot(x0, f0, 'ko', markersize=10, label=f'Expansion point x₀={x0}')

plt.xlabel('x', fontsize=12)
plt.ylabel('f(x)', fontsize=12)
plt.title('Taylor Series Approximation', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.xlim(x0 - 1.5, x0 + 1.5)
plt.tight_layout()
plt.show()

print("Observation: Approximations are better near the expansion point!")
print("2nd order captures curvature, 1st order only captures slope.")

Summary

Key Concepts from Chapter 5:

  1. Derivatives: Instantaneous rate of change, slopes

  2. Partial Derivatives: Change along one coordinate

  3. Gradients: Direction of steepest ascent

  4. Jacobians: Generalization for vector functions

  5. Chain Rule: Composition of functions

  6. Backpropagation: Efficient gradient computation

  7. Hessians: Second-order derivatives, curvature

  8. Taylor Series: Local approximations

ML Applications:

| Concept         | ML Application                             |
|-----------------|--------------------------------------------|
| Gradients       | Gradient descent, optimization             |
| Backpropagation | Training neural networks                   |
| Hessians        | Newton’s method, second-order optimization |
| Chain rule      | Deep learning, automatic differentiation   |
| Taylor series   | Trust region methods                       |

Gradient Descent Preview:

\[x_{t+1} = x_t - \alpha \nabla f(x_t)\]

where α is the learning rate.

Next Steps:

  • Chapter 6: Probability and Distributions

  • Chapter 7: Continuous Optimization (applies gradients)

  • Chapter 9: Linear Regression (uses gradients for fitting)

Practice: Implement gradient descent to minimize f(x, y) = x² + xy + y²!