# Setup: third-party imports and global notebook configuration.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.special import gamma, factorial
import pandas as pd
sns.set_style('whitegrid')  # light grid background for every plot below
np.random.seed(42)  # fix the legacy global RNG so all simulations are reproducible
np.set_printoptions(precision=4, suppress=True)  # compact array printing, no scientific notation
6.1 Probability Space¶
Definition 6.1 (Probability Space)¶
A probability space consists of:
Sample space Ω: Set of all possible outcomes
Event space 𝓐: Set of events (subsets of Ω)
Probability measure P: Function assigning probabilities
Axioms of Probability:¶
Non-negativity: P(A) ≥ 0 for all events A
Normalization: P(Ω) = 1
Additivity: For disjoint events A and B: P(A ∪ B) = P(A) + P(B)
# Example: Coin flips
# Sample space of a single toss: heads or tails.
sample_space = ['H', 'T']  # Heads, Tails
# Probability measure of a fair coin: each outcome gets mass 1/2.
measure = {'H': 0.5, 'T': 0.5}

print("Fair Coin Example")
print(f"Sample space Ω = {sample_space}")
print(f"\nProbability measure:")
for event, p_event in measure.items():
    print(f" P({event}) = {p_event}")

# Check the two axioms we can verify numerically.
print("\nVerifying Axioms:")
nonnegative = min(measure.values()) >= 0
print(f"1. Non-negativity: {nonnegative}")
print(f"2. Normalization: P(Ω) = {sum(measure.values())} = 1")

# Monte-Carlo simulation of repeated flips.
n_flips = 10000
flips = np.random.choice(sample_space, size=n_flips, p=list(measure.values()))

# Empirical frequencies of each outcome.
unique, counts = np.unique(flips, return_counts=True)
empirical_probs = counts / n_flips
print(f"\nSimulation ({n_flips} flips):")
for outcome, count, freq in zip(unique, counts, empirical_probs):
    print(f" {outcome}: {count} flips ({freq:.4f})")

# Side-by-side comparison: theory vs simulation.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
axes[0].bar(measure.keys(), measure.values(), color='skyblue', edgecolor='black')
axes[0].set_ylabel('Probability')
axes[0].set_title('Theoretical Probabilities')
axes[0].set_ylim([0, 0.6])
axes[1].bar(unique, empirical_probs, color='lightcoral', edgecolor='black')
axes[1].set_ylabel('Relative Frequency')
axes[1].set_title(f'Empirical Frequencies ({n_flips} flips)')
axes[1].set_ylim([0, 0.6])
plt.tight_layout()
plt.show()
print("\nLaw of Large Numbers: Empirical frequencies → true probabilities!")
6.2 Discrete and Continuous Probabilities¶
Discrete Random Variables¶
Probability Mass Function (PMF): P(X = x)
Continuous Random Variables¶
Probability Density Function (PDF): p(x)
# Discrete: Binomial distribution
# X ~ Binomial(n, p): number of successes in n independent trials.
n_trials = 10
p_success = 0.3

# Evaluate the PMF on the full support 0..n.
support = np.arange(0, n_trials + 1)
binom_pmf = stats.binom.pmf(support, n_trials, p_success)

print(f"Binomial Distribution: n={n_trials}, p={p_success}")
print(f"\nPMF (Probability Mass Function):")
for k, pk in zip(support, binom_pmf):
    print(f" P(X={k}) = {pk:.4f}")

# Normalization check: the PMF must sum to one.
print(f"\nSum of probabilities: {binom_pmf.sum():.6f}")

# Draw samples for the empirical comparison.
draws = np.random.binomial(n_trials, p_success, size=10000)

# Left: exact PMF. Right: histogram of the draws overlaid with the PMF.
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.bar(support, binom_pmf, color='skyblue', edgecolor='black', alpha=0.7)
plt.xlabel('x (number of successes)')
plt.ylabel('P(X = x)')
plt.title(f'Binomial PMF (n={n_trials}, p={p_success})')
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.hist(draws, bins=np.arange(0, n_trials + 2) - 0.5,
         density=True, color='lightcoral', edgecolor='black', alpha=0.7, label='Empirical')
plt.bar(support, binom_pmf, color='skyblue', edgecolor='black', alpha=0.5, label='Theoretical')
plt.xlabel('x')
plt.ylabel('Probability')
plt.title('10,000 Samples vs Theory')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
# Continuous: Normal (Gaussian) distribution
# X ~ N(μ, σ²) with standard parameters.
loc, scale = 0, 1

# PDF: p(x) = (1/√(2πσ²)) exp(-(x-μ)²/(2σ²)), evaluated on a fine grid.
grid = np.linspace(-4, 4, 1000)
density = stats.norm.pdf(grid, loc, scale)

print(f"Normal Distribution: μ={loc}, σ={scale}")

# Riemann-sum check that the density integrates to (approximately) one
# over the truncated range [-4, 4].
step = grid[1] - grid[0]
area = np.sum(density) * step
print(f"\n∫ p(x) dx = {area:.6f}")

# Probability of an interval via the CDF difference.
a, b = -1, 1
prob_interval = stats.norm.cdf(b, loc, scale) - stats.norm.cdf(a, loc, scale)
print(f"\nP({a} ≤ X ≤ {b}) = {prob_interval:.4f}")

# Draws for the empirical comparison.
obs = np.random.normal(loc, scale, size=10000)

# Left: PDF with the interval shaded. Right: histogram vs PDF.
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(grid, density, 'b-', linewidth=2, label='PDF p(x)')
plt.fill_between(grid, density, where=(grid >= a) & (grid <= b),
                 alpha=0.3, color='red', label=f'P({a} ≤ X ≤ {b})')
plt.xlabel('x')
plt.ylabel('Density p(x)')
plt.title(f'Normal PDF (μ={loc}, σ={scale})')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.hist(obs, bins=50, density=True, color='lightcoral',
         edgecolor='black', alpha=0.7, label='Empirical')
plt.plot(grid, density, 'b-', linewidth=2, label='Theoretical')
plt.xlabel('x')
plt.ylabel('Density')
plt.title('10,000 Samples vs Theory')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nNote: For continuous distributions, P(X = x) = 0!")
print("We can only compute P(a ≤ X ≤ b) for intervals.")
6.3 Sum Rule, Product Rule, and Bayes’ Theorem¶
Sum Rule (Marginalization): \(p(x) = \sum_y p(x, y)\)¶
Product Rule: \(p(x, y) = p(x \mid y)\, p(y)\)¶
Bayes’ Theorem: \(p(y \mid x) = \dfrac{p(x \mid y)\, p(y)}{p(x)}\)¶
p(y): Prior
p(x|y): Likelihood
p(y|x): Posterior
p(x): Evidence (marginal likelihood)
# Bayes' Theorem: Medical test example
# Prior: how common the disease is before seeing any test result.
prior_sick = 0.01 # 1% of population has disease
prior_well = 1 - prior_sick

# Likelihood: how the test behaves given each health state.
sensitivity = 0.95 # Sensitivity: 95% true positive rate
false_pos_rate = 0.05 # False positive rate: 5%

print("Medical Test Example")
print("\nPrior:")
print(f" P(Disease) = {prior_sick}")
print(f" P(Healthy) = {prior_well}")
print("\nLikelihood (test accuracy):")
print(f" P(Positive | Disease) = {sensitivity}")
print(f" P(Positive | Healthy) = {false_pos_rate}")

# Evidence via the sum rule: marginal probability of a positive test.
evidence = (sensitivity * prior_sick +
            false_pos_rate * prior_well)
print(f"\nEvidence (marginal):")
print(f" P(Positive) = {evidence:.4f}")

# Bayes' theorem turns the prior into the posterior.
posterior_sick = (sensitivity * prior_sick) / evidence
print("\nPosterior (what we care about!):")
print(f" P(Disease | Positive) = {posterior_sick:.4f}")
print(f"\nInterpretation: Even with a positive test, only {posterior_sick*100:.1f}%")
print("chance of actually having the disease (due to low prevalence).")

# Visualize the belief update: prior on the left, posterior on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_panels = [
    (axes[0], [prior_sick, prior_well], 'Prior: P(Disease)'),
    (axes[1], [posterior_sick, 1 - posterior_sick], 'Posterior: P(Disease | Positive Test)'),
]
for ax, heights, title in bar_panels:
    ax.bar(['Disease', 'Healthy'], heights,
           color=['red', 'green'], alpha=0.6, edgecolor='black')
    ax.set_ylabel('Probability')
    ax.set_title(title)
    ax.set_ylim([0, 1])
plt.tight_layout()
plt.show()
print("\nBayes' theorem updates our belief from prior to posterior!")
# Joint, marginal, and conditional distributions
# Example: two fair dice; joint distribution P(X, Y) over 36 outcomes.
die1_faces = np.arange(1, 7) # Die 1
die2_faces = np.arange(1, 7) # Die 2

# Fair, independent dice → every outcome pair has probability 1/36.
joint_pmf = np.full((6, 6), 1 / 36)

# Marginals via the sum rule.
p_x = joint_pmf.sum(axis=1) # Sum over y
p_y = joint_pmf.sum(axis=0) # Sum over x

print("Two Fair Dice Example")
print("\nMarginal P(X):")
print(p_x)
print(f"Sum: {p_x.sum()}")
print("\nMarginal P(Y):")
print(p_y)
print(f"Sum: {p_y.sum()}")

# Conditional P(Y | X=3): slice the joint and renormalize by the marginal.
cond_idx = 2 # Index for X=3
p_y_given_x = joint_pmf[cond_idx, :] / p_x[cond_idx]
print(f"\nConditional P(Y | X={cond_idx+1}):")
print(p_y_given_x)
print(f"Sum: {p_y_given_x.sum():.6f}")

# Four panels: joint heatmap, two marginals, one conditional.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

im = axes[0, 0].imshow(joint_pmf, cmap='Blues', origin='lower')
axes[0, 0].set_xlabel('Y (Die 2)')
axes[0, 0].set_ylabel('X (Die 1)')
axes[0, 0].set_title('Joint P(X, Y)')
axes[0, 0].set_xticks(range(6))
axes[0, 0].set_yticks(range(6))
axes[0, 0].set_xticklabels(die2_faces)
axes[0, 0].set_yticklabels(die1_faces)
plt.colorbar(im, ax=axes[0, 0])

bar_panels = [
    (axes[0, 1], die1_faces, p_x, 'skyblue', 'X (Die 1)', 'P(X)', 'Marginal P(X)'),
    (axes[1, 0], die2_faces, p_y, 'lightcoral', 'Y (Die 2)', 'P(Y)', 'Marginal P(Y)'),
    (axes[1, 1], die2_faces, p_y_given_x, 'lightgreen', 'Y (Die 2)',
     f'P(Y | X={cond_idx+1})', f'Conditional P(Y | X={cond_idx+1})'),
]
for ax, faces, heights, bar_color, xlab, ylab, title in bar_panels:
    ax.bar(faces, heights, color=bar_color, edgecolor='black')
    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
print("\nFor independent events: P(X, Y) = P(X)P(Y)")
print("Here, dice are independent, so P(Y|X) = P(Y)!")
6.4 Summary Statistics and Independence¶
Expectation (Mean):¶
Discrete: \(E[X] = \sum_x x \cdot P(X = x)\)
Continuous: \(E[X] = \int x \cdot p(x) dx\)
Variance:¶
Covariance:¶
Independence:¶
X and Y are independent if: \(p(x, y) = p(x)p(y)\)
Implies: Cov[X, Y] = 0 (but not vice versa!)
# Compute mean and variance
# Example: roll of one fair die.
faces = np.arange(1, 7)
weights = np.ones(6) / 6 # Fair die

# Expectation: E[X] = Σ x · P(X = x).
ex = np.sum(faces * weights)
print("Fair Die (X ~ Uniform{1,2,3,4,5,6})")
print(f"\nE[X] = {ex:.4f}")

# Second moment E[X²].
ex2 = np.sum(faces**2 * weights)
print(f"E[X²] = {ex2:.4f}")

# Variance via the shortcut formula, and its square root.
var = ex2 - ex**2
sd = np.sqrt(var)
print(f"\nVar[X] = E[X²] - (E[X])² = {var:.4f}")
print(f"Std[X] = √Var[X] = {sd:.4f}")

# Simulate rolls and compare the empirical moments to theory.
rolls = np.random.choice(faces, size=100000, p=weights)
print(f"\nEmpirical (100k samples):")
print(f" Mean: {rolls.mean():.4f}")
print(f" Variance: {rolls.var():.4f}")

# PMF with the mean and ±1 standard deviation marked.
plt.figure(figsize=(10, 6))
plt.bar(faces, weights, color='skyblue', edgecolor='black', alpha=0.7, label='PMF')
plt.axvline(ex, color='red', linestyle='--', linewidth=2, label=f'E[X] = {ex:.2f}')
plt.axvline(ex - sd, color='orange', linestyle=':', linewidth=2, label='±1 std')
plt.axvline(ex + sd, color='orange', linestyle=':', linewidth=2)
plt.xlabel('x')
plt.ylabel('P(X = x)')
plt.title('Fair Die: PMF and Summary Statistics')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Covariance and correlation
# Generate correlated data
# NOTE: the RNG calls below are order-sensitive — reseeding with 42 makes the
# three datasets reproducible, but inserting or reordering any draw would
# change every array that follows.
np.random.seed(42)
n = 1000
# Positive correlation: y1 is a noisy linear function of x1 (slope +0.8).
x1 = np.random.randn(n)
y1 = 0.8 * x1 + 0.6 * np.random.randn(n)
# Negative correlation: same construction with slope -0.8.
x2 = np.random.randn(n)
y2 = -0.8 * x2 + 0.6 * np.random.randn(n)
# No correlation (independent): two unrelated standard-normal samples.
x3 = np.random.randn(n)
y3 = np.random.randn(n)
def compute_stats(x, y):
    """Return (covariance, Pearson correlation) of the paired samples x and y.

    Uses the sample covariance (ddof=1) off-diagonal entry of np.cov and the
    corresponding entry of the correlation matrix.
    """
    cov_matrix = np.cov(x, y)
    corr_matrix = np.corrcoef(x, y)
    return cov_matrix[0, 1], corr_matrix[0, 1]
# Summary statistics for each of the three paired datasets.
cov1, corr1 = compute_stats(x1, y1)
cov2, corr2 = compute_stats(x2, y2)
cov3, corr3 = compute_stats(x3, y3)

print("Covariance and Correlation\n")
print(f"Positive correlation: Cov = {cov1:.4f}, Corr = {corr1:.4f}")
print(f"Negative correlation: Cov = {cov2:.4f}, Corr = {corr2:.4f}")
print(f"Independent: Cov = {cov3:.4f}, Corr = {corr3:.4f}")

# One scatter panel per dataset (first panel keeps the default color).
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
scatter_panels = [
    (x1, y1, corr1, 'Positive Correlation', {}),
    (x2, y2, corr2, 'Negative Correlation', {'color': 'red'}),
    (x3, y3, corr3, 'Independent', {'color': 'green'}),
]
for ax, (sx, sy, rho, label, style) in zip(axes, scatter_panels):
    ax.scatter(sx, sy, alpha=0.5, s=10, **style)
    ax.set_title(f'{label}\nρ = {rho:.2f}')
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nCorrelation ρ ∈ [-1, 1]:")
print(" ρ = 1 → perfect positive linear relationship")
print(" ρ = 0 → no linear relationship")
print(" ρ = -1 → perfect negative linear relationship")
6.5 Gaussian Distribution¶
Univariate Gaussian: \(p(x) = \dfrac{1}{\sqrt{2\pi\sigma^2}} \exp\!\left(-\dfrac{(x-\mu)^2}{2\sigma^2}\right)\)¶
Multivariate Gaussian: \(p(\mathbf{x}) = \dfrac{1}{(2\pi)^{d/2}\,|\Sigma|^{1/2}} \exp\!\left(-\tfrac{1}{2}(\mathbf{x}-\boldsymbol{\mu})^\top \Sigma^{-1} (\mathbf{x}-\boldsymbol{\mu})\right)\)¶
where:
μ: mean vector
Σ: covariance matrix
d: dimensionality
Why important?
Central Limit Theorem
Maximum entropy distribution
Analytically tractable
# Multivariate Gaussian
# 2D Gaussian with strong positive correlation between coordinates.
mean_vec = np.array([0, 0])
cov_mat = np.array([[1, 0.8],
                    [0.8, 1]])

print("2D Gaussian Distribution")
print(f"\nMean μ:\n{mean_vec}")
print(f"\nCovariance Σ:\n{cov_mat}")

# Draw samples from the distribution.
draws = np.random.multivariate_normal(mean_vec, cov_mat, size=1000)

# Evaluation grid for the density surface and contours.
gx = np.linspace(-3, 3, 100)
gy = np.linspace(-3, 3, 100)
GX, GY = np.meshgrid(gx, gy)
grid_pts = np.dstack((GX, GY))

# Exact PDF values on the grid.
mvn = stats.multivariate_normal(mean_vec, cov_mat)
Z = mvn.pdf(grid_pts)

# Left: 3D surface of the density. Right: contours with the samples.
fig = plt.figure(figsize=(14, 6))

ax1 = fig.add_subplot(121, projection='3d')
ax1.plot_surface(GX, GY, Z, cmap='viridis', alpha=0.8)
ax1.set_xlabel('x₁')
ax1.set_ylabel('x₂')
ax1.set_zlabel('p(x)')
ax1.set_title('2D Gaussian PDF')

ax2 = fig.add_subplot(122)
ax2.contour(GX, GY, Z, levels=10, cmap='viridis')
ax2.scatter(draws[:, 0], draws[:, 1], alpha=0.3, s=10, color='red')
ax2.set_xlabel('x₁')
ax2.set_ylabel('x₂')
ax2.set_title('Contours + 1000 Samples')
ax2.set_aspect('equal')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Sample statistics should approximate the true parameters.
emp_mean = draws.mean(axis=0)
emp_cov = np.cov(draws.T)
print("\nEmpirical (from samples):")
print(f"Mean:\n{emp_mean}")
print(f"\nCovariance:\n{emp_cov}")
print(f"\nClose to true values? {np.allclose(emp_mean, mean_vec, atol=0.1)}")
# Effect of covariance matrix
# Same mean, three covariance structures: spherical, diagonal, correlated.
mu = np.array([0, 0])

cov_options = [
    # (covariance, panel title, extra scatter kwargs)
    (np.array([[1, 0],
               [0, 1]]),        # Independent, equal variance
     'Spherical\nΣ = [[1, 0], [0, 1]]', {}),
    (np.array([[2, 0],
               [0, 0.5]]),      # Independent, different variance
     'Diagonal\nΣ = [[2, 0], [0, 0.5]]', {'color': 'red'}),
    (np.array([[1, 0.9],
               [0.9, 1]]),      # Positive correlation
     'Correlated\nΣ = [[1, 0.9], [0.9, 1]]', {'color': 'green'}),
]

# Sample and plot one panel per covariance structure.
n = 500
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, (cov, title, style) in zip(axes, cov_options):
    pts = np.random.multivariate_normal(mu, cov, n)
    ax.scatter(pts[:, 0], pts[:, 1], alpha=0.5, s=10, **style)
    ax.set_title(title)
    ax.set_xlabel('x₁')
    ax.set_ylabel('x₂')
    ax.set_xlim([-4, 4])
    ax.set_ylim([-4, 4])
    ax.set_aspect('equal')
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("Covariance matrix Σ controls the shape and orientation!")
print("- Diagonal Σ → axes aligned with coordinates")
print("- Off-diagonal elements → correlation, rotation")
Summary¶
Key Concepts from Chapter 6:¶
Probability Space: (Ω, 𝓐, P) - sample space, events, probability
PMF vs PDF: Discrete vs continuous distributions
Sum Rule: Marginalization p(x) = Σ p(x,y)
Product Rule: p(x,y) = p(x|y)p(y)
Bayes’ Theorem: p(y|x) = p(x|y)p(y)/p(x)
Expectation: E[X], Var[X], Cov[X,Y]
Gaussian Distribution: Most important in ML!
ML Applications:¶
| Concept | ML Application |
|---|---|
| Bayes’ theorem | Bayesian inference, spam filters |
| Gaussian | Linear regression, GMMs, Gaussian processes |
| Expectation | Loss functions, risk |
| Covariance | PCA, Mahalanobis distance |
| Independence | Naive Bayes classifier |
Important Distributions:¶
Discrete:
Bernoulli: p(x) = p^x (1-p)^(1-x)
Binomial: Number of successes in n trials
Poisson: Count data
Continuous:
Uniform: p(x) = 1/(b-a) for x ∈ [a,b]
Gaussian: Normal distribution
Exponential: Waiting times
Next Steps:¶
Chapter 7: Continuous Optimization
Chapter 9: Linear Regression (uses Gaussians)
Chapter 11: Gaussian Mixture Models
Practice: Implement Naive Bayes classifier using Bayes’ theorem!