# Setup
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.special import gamma, factorial
import pandas as pd

# Use seaborn's whitegrid theme for every figure in this chapter.
sns.set_style('whitegrid')
# Fix the global RNG seed so all simulations below are reproducible.
np.random.seed(42)
# Print arrays with 4 decimals and suppress scientific notation.
np.set_printoptions(precision=4, suppress=True)

6.1 Probability Space

Definition 6.1 (Probability Space)

A probability space consists of:

  1. Sample space Ω: Set of all possible outcomes

  2. Event space 𝓐: Set of events (subsets of Ω)

  3. Probability measure P: Function assigning probabilities

Axioms of Probability:

  1. Non-negativity: P(A) ≥ 0 for all events A

  2. Normalization: P(Ω) = 1

  3. Additivity: For pairwise disjoint events A₁, A₂, …: P(A₁ ∪ A₂ ∪ …) = P(A₁) + P(A₂) + …; in particular, P(A ∪ B) = P(A) + P(B) for disjoint A and B

# Example: Coin flips

# Sample space: the two outcomes of a single toss
omega = ['H', 'T']  # Heads, Tails

# Probability measure for a fair coin
P = {'H': 0.5, 'T': 0.5}

print("Fair Coin Example")
print(f"Sample space Ω = {omega}")
print(f"\nProbability measure:")
for o in omega:
    print(f"  P({o}) = {P[o]}")

# Check that this measure satisfies the probability axioms
print("\nVerifying Axioms:")
nonneg = all(p >= 0 for p in P.values())
total = sum(P.values())
print(f"1. Non-negativity: {nonneg}")
print(f"2. Normalization: P(Ω) = {total} = 1")

# Simulate repeated flips and compare empirical frequencies with P
n_flips = 10000
flips = np.random.choice(omega, size=n_flips, p=list(P.values()))

# Tally each outcome and convert counts to relative frequencies
outcomes, tallies = np.unique(flips, return_counts=True)
rel_freqs = tallies / n_flips

print(f"\nSimulation ({n_flips} flips):")
for outcome, count, prob in zip(outcomes, tallies, rel_freqs):
    print(f"  {outcome}: {count} flips ({prob:.4f})")

# Side-by-side bars: theory on the left, simulation on the right
fig, (ax_theory, ax_sim) = plt.subplots(1, 2, figsize=(12, 5))

ax_theory.bar(P.keys(), P.values(), color='skyblue', edgecolor='black')
ax_theory.set_ylabel('Probability')
ax_theory.set_title('Theoretical Probabilities')
ax_theory.set_ylim([0, 0.6])

ax_sim.bar(outcomes, rel_freqs, color='lightcoral', edgecolor='black')
ax_sim.set_ylabel('Relative Frequency')
ax_sim.set_title(f'Empirical Frequencies ({n_flips} flips)')
ax_sim.set_ylim([0, 0.6])

plt.tight_layout()
plt.show()

print("\nLaw of Large Numbers: Empirical frequencies → true probabilities!")

6.2 Discrete and Continuous Probabilities

Discrete Random Variables

Probability Mass Function (PMF): P(X = x)

\[\sum_x P(X = x) = 1\]

Continuous Random Variables

Probability Density Function (PDF): p(x)

\[\int_{-\infty}^{\infty} p(x) dx = 1\]
\[P(a \leq X \leq b) = \int_a^b p(x) dx\]
# Discrete: Binomial distribution
# X ~ Binomial(n, p): number of successes in n trials

n_trials = 10
p_success = 0.3

# PMF evaluated at every possible success count 0..n
x_vals = np.arange(0, n_trials + 1)
pmf = stats.binom.pmf(x_vals, n_trials, p_success)

print(f"Binomial Distribution: n={n_trials}, p={p_success}")
print(f"\nPMF (Probability Mass Function):")
for x, prob in zip(x_vals, pmf):
    print(f"  P(X={x}) = {prob:.4f}")

# Verify normalization: the PMF values must sum to 1
print(f"\nSum of probabilities: {pmf.sum():.6f}")

# Draw 10,000 binomial samples to compare with the exact PMF
samples = np.random.binomial(n_trials, p_success, size=10000)

# Plot: exact PMF (left) and empirical histogram vs theory (right)
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.bar(x_vals, pmf, color='skyblue', edgecolor='black', alpha=0.7)
plt.xlabel('x (number of successes)')
plt.ylabel('P(X = x)')
plt.title(f'Binomial PMF (n={n_trials}, p={p_success})')
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
# Bin edges at k - 0.5 center each integer outcome in its own bin
plt.hist(samples, bins=np.arange(0, n_trials + 2) - 0.5, 
         density=True, color='lightcoral', edgecolor='black', alpha=0.7, label='Empirical')
plt.bar(x_vals, pmf, color='skyblue', edgecolor='black', alpha=0.5, label='Theoretical')
plt.xlabel('x')
plt.ylabel('Probability')
plt.title('10,000 Samples vs Theory')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
# Continuous: Normal (Gaussian) distribution
# X ~ N(μ, σ²)

# Standard normal: zero mean, unit variance
mu, sigma = 0, 1

# Dense grid over [-4, 4] and the density evaluated on it
# PDF: p(x) = (1/√(2πσ²)) exp(-(x-μ)²/(2σ²))
x = np.linspace(-4, 4, 1000)
pdf = stats.norm.pdf(x, mu, sigma)

print(f"Normal Distribution: μ={mu}, σ={sigma}")

# Riemann-sum check that the density integrates to ≈ 1 over the grid
dx = x[1] - x[0]
integral = np.sum(pdf) * dx
print(f"\n∫ p(x) dx = {integral:.6f}")

# Exact interval probability from the CDF: P(a ≤ X ≤ b) = Φ(b) − Φ(a)
a, b = -1, 1
prob_interval = stats.norm.cdf(b, mu, sigma) - stats.norm.cdf(a, mu, sigma)
print(f"\nP({a} ≤ X ≤ {b}) = {prob_interval:.4f}")

# Monte Carlo sample for the empirical comparison
samples = np.random.normal(mu, sigma, size=10000)

# Left: PDF with the interval shaded; right: histogram vs theory
plt.figure(figsize=(12, 5))

ax_pdf = plt.subplot(1, 2, 1)
ax_pdf.plot(x, pdf, 'b-', linewidth=2, label='PDF p(x)')
ax_pdf.fill_between(x, pdf, where=(x >= a) & (x <= b),
                    alpha=0.3, color='red', label=f'P({a} ≤ X ≤ {b})')
ax_pdf.set_xlabel('x')
ax_pdf.set_ylabel('Density p(x)')
ax_pdf.set_title(f'Normal PDF (μ={mu}, σ={sigma})')
ax_pdf.legend()
ax_pdf.grid(True, alpha=0.3)

ax_hist = plt.subplot(1, 2, 2)
ax_hist.hist(samples, bins=50, density=True, color='lightcoral',
             edgecolor='black', alpha=0.7, label='Empirical')
ax_hist.plot(x, pdf, 'b-', linewidth=2, label='Theoretical')
ax_hist.set_xlabel('x')
ax_hist.set_ylabel('Density')
ax_hist.set_title('10,000 Samples vs Theory')
ax_hist.legend()
ax_hist.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nNote: For continuous distributions, P(X = x) = 0!")
print("We can only compute P(a ≤ X ≤ b) for intervals.")

6.3 Sum Rule, Product Rule, and Bayes’ Theorem

Sum Rule (Marginalization):

\[p(x) = \sum_y p(x, y) = \sum_y p(x|y)p(y)\]

Product Rule:

\[p(x, y) = p(x|y)p(y) = p(y|x)p(x)\]

Bayes’ Theorem:

\[p(y|x) = \frac{p(x|y)p(y)}{p(x)} = \frac{p(x|y)p(y)}{\sum_y p(x|y)p(y)}\]
  • p(y): Prior

  • p(x|y): Likelihood

  • p(y|x): Posterior

  • p(x): Evidence (marginal likelihood)

# Bayes' Theorem: Medical test example

# Prior: disease prevalence in the population
p_disease = 0.01  # 1% of population has disease
p_healthy = 1 - p_disease

# Likelihood: how the test behaves given the true state
p_pos_given_disease = 0.95  # Sensitivity: 95% true positive rate
p_pos_given_healthy = 0.05  # False positive rate: 5%

print("Medical Test Example")
print("\nPrior:")
print(f"  P(Disease) = {p_disease}")
print(f"  P(Healthy) = {p_healthy}")

print("\nLikelihood (test accuracy):")
print(f"  P(Positive | Disease) = {p_pos_given_disease}")
print(f"  P(Positive | Healthy) = {p_pos_given_healthy}")

# Evidence via the sum rule: P(+) = Σ_y P(+|y) P(y)
p_positive = p_pos_given_disease * p_disease + p_pos_given_healthy * p_healthy

print(f"\nEvidence (marginal):")
print(f"  P(Positive) = {p_positive:.4f}")

# Posterior via Bayes' theorem: P(D|+) = P(+|D) P(D) / P(+)
p_disease_given_pos = p_pos_given_disease * p_disease / p_positive

print("\nPosterior (what we care about!):")
print(f"  P(Disease | Positive) = {p_disease_given_pos:.4f}")
print(f"\nInterpretation: Even with a positive test, only {p_disease_given_pos*100:.1f}%")
print("chance of actually having the disease (due to low prevalence).")

# Bar charts of the belief before and after observing a positive test
fig, (ax_prior, ax_post) = plt.subplots(1, 2, figsize=(14, 5))

ax_prior.bar(['Disease', 'Healthy'], [p_disease, p_healthy],
             color=['red', 'green'], alpha=0.6, edgecolor='black')
ax_prior.set_ylabel('Probability')
ax_prior.set_title('Prior: P(Disease)')
ax_prior.set_ylim([0, 1])

p_healthy_given_pos = 1 - p_disease_given_pos
ax_post.bar(['Disease', 'Healthy'], [p_disease_given_pos, p_healthy_given_pos],
            color=['red', 'green'], alpha=0.6, edgecolor='black')
ax_post.set_ylabel('Probability')
ax_post.set_title('Posterior: P(Disease | Positive Test)')
ax_post.set_ylim([0, 1])

plt.tight_layout()
plt.show()

print("\nBayes' theorem updates our belief from prior to posterior!")
# Joint, marginal, and conditional distributions

# Joint distribution P(X, Y)
# Example: Two dice
x_vals = np.arange(1, 7)  # Die 1
y_vals = np.arange(1, 7)  # Die 2

# Joint PMF (uniform for fair dice): each of the 36 outcomes has prob 1/36
joint = np.ones((6, 6)) / 36

# Marginal distributions (sum rule): collapse one axis of the joint
marginal_x = joint.sum(axis=1)  # Sum over y
marginal_y = joint.sum(axis=0)  # Sum over x

print("Two Fair Dice Example")
print("\nMarginal P(X):")
print(marginal_x)
print(f"Sum: {marginal_x.sum()}")

print("\nMarginal P(Y):")
print(marginal_y)
print(f"Sum: {marginal_y.sum()}")

# Conditional distribution P(Y|X=3) = P(X=3, Y) / P(X=3)  (product rule)
x_condition = 2  # Index for X=3
conditional_y_given_x = joint[x_condition, :] / marginal_x[x_condition]

print(f"\nConditional P(Y | X={x_condition+1}):")
print(conditional_y_given_x)
print(f"Sum: {conditional_y_given_x.sum():.6f}")

# Visualize the joint, both marginals, and the conditional in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Joint PMF as a heatmap; origin='lower' puts (X=1, Y=1) at the bottom-left
im = axes[0, 0].imshow(joint, cmap='Blues', origin='lower')
axes[0, 0].set_xlabel('Y (Die 2)')
axes[0, 0].set_ylabel('X (Die 1)')
axes[0, 0].set_title('Joint P(X, Y)')
axes[0, 0].set_xticks(range(6))
axes[0, 0].set_yticks(range(6))
axes[0, 0].set_xticklabels(y_vals)
axes[0, 0].set_yticklabels(x_vals)
plt.colorbar(im, ax=axes[0, 0])

# Marginal X
axes[0, 1].bar(x_vals, marginal_x, color='skyblue', edgecolor='black')
axes[0, 1].set_xlabel('X (Die 1)')
axes[0, 1].set_ylabel('P(X)')
axes[0, 1].set_title('Marginal P(X)')
axes[0, 1].grid(True, alpha=0.3)

# Marginal Y
axes[1, 0].bar(y_vals, marginal_y, color='lightcoral', edgecolor='black')
axes[1, 0].set_xlabel('Y (Die 2)')
axes[1, 0].set_ylabel('P(Y)')
axes[1, 0].set_title('Marginal P(Y)')
axes[1, 0].grid(True, alpha=0.3)

# Conditional P(Y | X=3) — equals marginal P(Y) because the dice are independent
axes[1, 1].bar(y_vals, conditional_y_given_x, color='lightgreen', edgecolor='black')
axes[1, 1].set_xlabel('Y (Die 2)')
axes[1, 1].set_ylabel(f'P(Y | X={x_condition+1})')
axes[1, 1].set_title(f'Conditional P(Y | X={x_condition+1})')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nFor independent events: P(X, Y) = P(X)P(Y)")
print("Here, dice are independent, so P(Y|X) = P(Y)!")

6.4 Summary Statistics and Independence

Expectation (Mean):

Discrete: \(E[X] = \sum_x x \cdot P(X = x)\)

Continuous: \(E[X] = \int x \cdot p(x) dx\)

Variance:

\[\text{Var}[X] = E[(X - E[X])^2] = E[X^2] - (E[X])^2\]

Covariance:

\[\text{Cov}[X, Y] = E[(X - E[X])(Y - E[Y])] = E[XY] - E[X]E[Y]\]

Independence:

X and Y are independent if: \(p(x, y) = p(x)p(y)\)

Independence implies Cov[X, Y] = 0, but zero covariance does not imply independence — covariance only measures linear dependence.

# Compute mean and variance

# Example: roll of a fair six-sided die
faces = np.arange(1, 7)
pmf_die = np.ones(6) / 6  # Fair die: uniform over the six faces

# First moment: E[X] = Σ x·P(X=x)
ex = (faces * pmf_die).sum()
print("Fair Die (X ~ Uniform{1,2,3,4,5,6})")
print(f"\nE[X] = {ex:.4f}")

# Second moment: E[X²]
ex2 = (faces**2 * pmf_die).sum()
print(f"E[X²] = {ex2:.4f}")

# Variance via the shortcut formula, then the standard deviation
var_x = ex2 - ex**2
sd = np.sqrt(var_x)
print(f"\nVar[X] = E[X²] - (E[X])² = {var_x:.4f}")
print(f"Std[X] = √Var[X] = {sd:.4f}")

# Verify against a large Monte Carlo sample
samples = np.random.choice(faces, size=100000, p=pmf_die)
print(f"\nEmpirical (100k samples):")
print(f"  Mean: {samples.mean():.4f}")
print(f"  Variance: {samples.var():.4f}")

# PMF with the mean and a ±1-std band marked
plt.figure(figsize=(10, 6))
plt.bar(faces, pmf_die, color='skyblue', edgecolor='black', alpha=0.7, label='PMF')
plt.axvline(ex, color='red', linestyle='--', linewidth=2, label=f'E[X] = {ex:.2f}')
plt.axvline(ex - sd, color='orange', linestyle=':', linewidth=2, label='±1 std')
plt.axvline(ex + sd, color='orange', linestyle=':', linewidth=2)
plt.xlabel('x')
plt.ylabel('P(X = x)')
plt.title('Fair Die: PMF and Summary Statistics')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Covariance and correlation

# Generate three synthetic datasets with known dependence structure.
# Re-seed here so this section is reproducible on its own.
np.random.seed(42)
n = 1000

# Positive correlation: y1 is a noisy increasing function of x1
x1 = np.random.randn(n)
y1 = 0.8 * x1 + 0.6 * np.random.randn(n)

# Negative correlation: y2 decreases with x2 (plus noise)
x2 = np.random.randn(n)
y2 = -0.8 * x2 + 0.6 * np.random.randn(n)

# No correlation: x3 and y3 are independent draws
x3 = np.random.randn(n)
y3 = np.random.randn(n)
# Compute statistics
def compute_stats(x, y):
    """Return (covariance, Pearson correlation) of two 1-D samples.

    Both values are the off-diagonal entries of the 2x2 matrices
    produced by NumPy's sample covariance / correlation estimators.
    """
    cov_matrix = np.cov(x, y)
    corr_matrix = np.corrcoef(x, y)
    return cov_matrix[0, 1], corr_matrix[0, 1]

# Covariance and correlation for each of the three datasets
cov1, corr1 = compute_stats(x1, y1)
cov2, corr2 = compute_stats(x2, y2)
cov3, corr3 = compute_stats(x3, y3)

print("Covariance and Correlation\n")
print(f"Positive correlation: Cov = {cov1:.4f}, Corr = {corr1:.4f}")
print(f"Negative correlation: Cov = {cov2:.4f}, Corr = {corr2:.4f}")
print(f"Independent:          Cov = {cov3:.4f}, Corr = {corr3:.4f}")

# Scatter plots of the three datasets, titled with their correlations
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

axes[0].scatter(x1, y1, alpha=0.5, s=10)
axes[0].set_title(f'Positive Correlation\nρ = {corr1:.2f}')
axes[0].set_xlabel('X')
axes[0].set_ylabel('Y')
axes[0].grid(True, alpha=0.3)

axes[1].scatter(x2, y2, alpha=0.5, s=10, color='red')
axes[1].set_title(f'Negative Correlation\nρ = {corr2:.2f}')
axes[1].set_xlabel('X')
axes[1].set_ylabel('Y')
axes[1].grid(True, alpha=0.3)

axes[2].scatter(x3, y3, alpha=0.5, s=10, color='green')
axes[2].set_title(f'Independent\nρ = {corr3:.2f}')
axes[2].set_xlabel('X')
axes[2].set_ylabel('Y')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nCorrelation ρ ∈ [-1, 1]:")
print("  ρ = 1  → perfect positive linear relationship")
print("  ρ = 0  → no linear relationship")
print("  ρ = -1 → perfect negative linear relationship")

6.5 Gaussian Distribution

Univariate Gaussian:

\[p(x | \mu, \sigma^2) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)\]

Multivariate Gaussian:

\[p(\mathbf{x} | \boldsymbol{\mu}, \boldsymbol{\Sigma}) = \frac{1}{(2\pi)^{d/2}|\boldsymbol{\Sigma}|^{1/2}} \exp\left(-\frac{1}{2}(\mathbf{x}-\boldsymbol{\mu})^T\boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu})\right)\]

where:

  • μ: mean vector

  • Σ: covariance matrix

  • d: dimensionality

Why important?

  • Central Limit Theorem

  • Maximum entropy distribution among all distributions with a given mean and variance

  • Analytically tractable

# Multivariate Gaussian

# 2D Gaussian with strongly correlated components (off-diagonal 0.8)
mu = np.array([0, 0])
Sigma = np.array([[1, 0.8],
                  [0.8, 1]])

print("2D Gaussian Distribution")
print(f"\nMean μ:\n{mu}")
print(f"\nCovariance Σ:\n{Sigma}")

# Draw samples from N(μ, Σ)
samples = np.random.multivariate_normal(mu, Sigma, size=1000)

# Create grid for contour plot
x = np.linspace(-3, 3, 100)
y = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x, y)
pos = np.dstack((X, Y))  # shape (100, 100, 2): an (x₁, x₂) pair per grid point

# Evaluate the joint PDF on the grid
rv = stats.multivariate_normal(mu, Sigma)
Z = rv.pdf(pos)

# Visualize: 3D surface of the density, and contours with samples overlaid
fig = plt.figure(figsize=(14, 6))

# 3D surface
ax1 = fig.add_subplot(121, projection='3d')
ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.8)
ax1.set_xlabel('x₁')
ax1.set_ylabel('x₂')
ax1.set_zlabel('p(x)')
ax1.set_title('2D Gaussian PDF')

# Contours + samples
ax2 = fig.add_subplot(122)
ax2.contour(X, Y, Z, levels=10, cmap='viridis')
ax2.scatter(samples[:, 0], samples[:, 1], alpha=0.3, s=10, color='red')
ax2.set_xlabel('x₁')
ax2.set_ylabel('x₂')
ax2.set_title('Contours + 1000 Samples')
ax2.set_aspect('equal')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Sample mean and covariance should be close to the true μ and Σ
emp_mu = samples.mean(axis=0)
emp_Sigma = np.cov(samples.T)

print("\nEmpirical (from samples):")
print(f"Mean:\n{emp_mu}")
print(f"\nCovariance:\n{emp_Sigma}")
print(f"\nClose to true values? {np.allclose(emp_mu, mu, atol=0.1)}")
# Effect of covariance matrix

mu = np.array([0, 0])

# Three qualitatively different covariance structures
Sigma_spherical = np.array([[1, 0],
                             [0, 1]])  # Independent, equal variance

Sigma_diagonal = np.array([[2, 0],
                            [0, 0.5]])  # Independent, different variance

Sigma_correlated = np.array([[1, 0.9],
                              [0.9, 1]])  # Positive correlation

# Sample 500 points from each Gaussian
n = 500
samples1 = np.random.multivariate_normal(mu, Sigma_spherical, n)
samples2 = np.random.multivariate_normal(mu, Sigma_diagonal, n)
samples3 = np.random.multivariate_normal(mu, Sigma_correlated, n)

# Scatter plots on a common square viewport so the shapes are comparable
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

axes[0].scatter(samples1[:, 0], samples1[:, 1], alpha=0.5, s=10)
axes[0].set_title('Spherical\nΣ = [[1, 0], [0, 1]]')
axes[0].set_xlabel('x₁')
axes[0].set_ylabel('x₂')
axes[0].set_xlim([-4, 4])
axes[0].set_ylim([-4, 4])
axes[0].set_aspect('equal')
axes[0].grid(True, alpha=0.3)

axes[1].scatter(samples2[:, 0], samples2[:, 1], alpha=0.5, s=10, color='red')
axes[1].set_title('Diagonal\nΣ = [[2, 0], [0, 0.5]]')
axes[1].set_xlabel('x₁')
axes[1].set_ylabel('x₂')
axes[1].set_xlim([-4, 4])
axes[1].set_ylim([-4, 4])
axes[1].set_aspect('equal')
axes[1].grid(True, alpha=0.3)

axes[2].scatter(samples3[:, 0], samples3[:, 1], alpha=0.5, s=10, color='green')
axes[2].set_title('Correlated\nΣ = [[1, 0.9], [0.9, 1]]')
axes[2].set_xlabel('x₁')
axes[2].set_ylabel('x₂')
axes[2].set_xlim([-4, 4])
axes[2].set_ylim([-4, 4])
axes[2].set_aspect('equal')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Covariance matrix Σ controls the shape and orientation!")
print("- Diagonal Σ → axes aligned with coordinates")
print("- Off-diagonal elements → correlation, rotation")

Summary

Key Concepts from Chapter 6:

  1. Probability Space: (Ω, 𝓐, P) - sample space, events, probability

  2. PMF vs PDF: Discrete vs continuous distributions

  3. Sum Rule: Marginalization p(x) = Σ p(x,y)

  4. Product Rule: p(x,y) = p(x|y)p(y)

  5. Bayes’ Theorem: p(y|x) = p(x|y)p(y)/p(x)

  6. Expectation: E[X], Var[X], Cov[X,Y]

  7. Gaussian Distribution: Most important in ML!

ML Applications:

Concept

ML Application

Bayes’ theorem

Bayesian inference, spam filters

Gaussian

Linear regression, GMMs, Gaussian processes

Expectation

Loss functions, risk

Covariance

PCA, Mahalanobis distance

Independence

Naive Bayes classifier

Important Distributions:

Discrete:

  • Bernoulli: p(x) = p^x (1-p)^(1-x)

  • Binomial: Number of successes in n trials

  • Poisson: Count data

Continuous:

  • Uniform: p(x) = 1/(b-a) for x ∈ [a,b]

  • Gaussian: Normal distribution

  • Exponential: Waiting times

Next Steps:

  • Chapter 7: Continuous Optimization

  • Chapter 9: Linear Regression (uses Gaussians)

  • Chapter 11: Gaussian Mixture Models

Practice: Implement Naive Bayes classifier using Bayes’ theorem!