import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.special import comb
import pandas as pd

# Notebook-wide setup: seaborn 'whitegrid' theme for all figures, a
# 10x6-inch default figure size, and a fixed NumPy RNG seed so every
# sampling experiment below is reproducible across runs.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
np.random.seed(42)

1. Basic Probability Rules

1.1 Sum Rule and Product Rule

Sum Rule (Marginalization): \(p(X=x) = \sum_{y} p(X=x, Y=y)\)

Product Rule: \(p(X=x, Y=y) = p(X=x|Y=y)p(Y=y)\)

# Worked example: the joint distribution of two fair six-sided dice.
dice1 = np.arange(1, 7)
dice2 = np.arange(1, 7)

# All 36 (dice1, dice2) outcomes are equally likely under the joint.
joint_prob = np.full((6, 6), 1.0 / 36)

# Sum rule: marginalize the joint over the other die's outcomes.
marginal_dice1 = joint_prob.sum(axis=1)  # P(Dice1 = i)
marginal_dice2 = joint_prob.sum(axis=0)  # P(Dice2 = j)

print("Joint Probability Table:")
print(pd.DataFrame(joint_prob, index=dice1, columns=dice2))
print(f"\nMarginal P(Dice1): {marginal_dice1}")
print(f"Marginal P(Dice2): {marginal_dice2}")
print(f"Sum of marginals: {marginal_dice1.sum():.4f}")

1.2 Conditional Probability

What and Why: Conditional probability \(p(Y=y|X=x) = \frac{p(X=x, Y=y)}{p(X=x)}\) quantifies how our belief about \(Y\) changes when we observe \(X=x\). This is the mechanism by which data informs inference – every supervised learning prediction is fundamentally a conditional probability estimate \(p(y|x)\).

How: The code below applies conditional probability to medical diagnosis. Given a disease with 1% prevalence and a test with 99% sensitivity and 95% specificity, we compute \(P(\text{positive test})\) using the sum rule over the joint: \(P(+) = P(+|\text{disease})P(\text{disease}) + P(+|\text{no disease})P(\text{no disease})\). The result sets up the next section where Bayes' rule inverts this conditioning to find \(P(\text{disease}|+)\).

Connection: Conditional probability underpins classifier outputs (logistic regression, softmax), conditional generative models (VAEs, diffusion models), and the chain rule decomposition used in autoregressive language models where \(p(x_1, \ldots, x_T) = \prod_t p(x_t | x_{<t})\).

# Worked example: screening test for a rare disease.
# Population base rates.
p_disease = 0.01
p_no_disease = 0.99

# Test operating characteristics.
p_positive_given_disease = 0.99  # sensitivity: P(+ | disease)
p_negative_given_no_disease = 0.95  # specificity: P(- | no disease)
p_positive_given_no_disease = 1 - p_negative_given_no_disease  # false positive rate

# Product rule: the two joint events that can yield a positive test.
p_disease_and_positive = p_positive_given_disease * p_disease
p_no_disease_and_positive = p_positive_given_no_disease * p_no_disease

# Sum rule (law of total probability) over the disease states.
p_positive = p_disease_and_positive + p_no_disease_and_positive

print(f"P(Disease) = {p_disease}")
print(f"P(+|Disease) = {p_positive_given_disease}")
print(f"P(+|No Disease) = {p_positive_given_no_disease}")
print(f"P(Positive Test) = {p_positive:.4f}")

2. Bayes' Rule

\[p(H|D) = \frac{p(D|H)p(H)}{p(D)}\]

Where:

  • \(p(H|D)\) is the posterior (what we want to know)

  • \(p(D|H)\) is the likelihood (how likely is the data given hypothesis)

  • \(p(H)\) is the prior (what we believed before seeing data)

  • \(p(D)\) is the evidence (normalizing constant)

# Continuing the medical-diagnosis example: invert the conditioning with
# Bayes' rule to get P(Disease | positive test).
posterior_numerator = p_positive_given_disease * p_disease  # likelihood x prior
p_disease_given_positive = posterior_numerator / p_positive

print("\n=== Bayes' Rule Application ===")
print(f"Prior: P(Disease) = {p_disease}")
print(f"Likelihood: P(+|Disease) = {p_positive_given_disease}")
print(f"Evidence: P(+) = {p_positive:.4f}")
print(f"\nPosterior: P(Disease|+) = {p_disease_given_positive:.4f}")
print(f"\nEven with a positive test, only {p_disease_given_positive*100:.1f}% chance of disease!")
print("This illustrates the base rate fallacy.")

2.1 Bayesian Updating with Multiple Observations

What: This code demonstrates sequential Bayesian updating, where each new data point refines our belief about an unknown parameter – here, the bias \(\theta\) of a coin. Starting from a Beta(2,2) prior, we observe coin flips one at a time and watch the posterior evolve.

Why: Sequential updating is a cornerstone of Bayesian inference in ML. Unlike batch MLE which computes a single point estimate from all data, Bayesian updating via conjugate priors lets us maintain a full distribution over parameters. The Beta-Binomial conjugate pair makes the posterior update analytic: after observing \(h\) heads and \(t\) tails, the posterior is \(\text{Beta}(\alpha + h, \beta + t)\).

How: The left panel contrasts the prior \(\text{Beta}(2,2)\) against the posterior \(\text{Beta}(\alpha + n_h, \beta + n_t)\), showing how observed data shifts probability mass. The right panel tracks the posterior mean \(\mathbb{E}[\theta] = \frac{\alpha}{\alpha + \beta}\) after each flip, illustrating convergence. Notice the posterior mean lies between the prior mean and the MLE \(\hat{\theta} = \frac{n_h}{n_h + n_t}\) – the prior acts as a regularizer, which matters most when data is scarce.

Connection: Bayesian updating underpins online learning systems, Thompson sampling in bandits, and adaptive A/B testing. In deep learning, Bayesian neural networks maintain weight distributions updated via similar principles, providing uncertainty estimates critical for safety-sensitive applications.

# Example: Coin flip - is the coin fair?
# Prior: Beta(2, 2) - slightly biased towards fair
alpha_prior = 2
beta_prior = 2

# Data: 7 heads out of 10 flips
n_heads = 7
n_tails = 3

# Beta is conjugate to the Bernoulli likelihood, so the posterior is again
# Beta with the observed counts added to the prior pseudo-counts:
# Posterior: Beta(alpha_prior + n_heads, beta_prior + n_tails)
alpha_post = alpha_prior + n_heads
beta_post = beta_prior + n_tails

# Evaluate prior and posterior densities on a dense grid over [0, 1].
theta = np.linspace(0, 1, 1000)
prior = stats.beta(alpha_prior, beta_prior).pdf(theta)
posterior = stats.beta(alpha_post, beta_post).pdf(theta)

plt.figure(figsize=(12, 5))

# Left panel: prior vs posterior density (fixed garbled label text).
plt.subplot(1, 2, 1)
plt.plot(theta, prior, 'b-', label='Prior Beta(2,2)', linewidth=2)
plt.plot(theta, posterior, 'r-', label=f'Posterior Beta({alpha_post},{beta_post})', linewidth=2)
plt.axvline(0.5, color='k', linestyle='--', alpha=0.3, label='Fair coin')
plt.xlabel('θ (probability of heads)')
plt.ylabel('Density')
plt.title('Bayesian Update for Coin Fairness')
plt.legend()
plt.grid(True, alpha=0.3)

# Right panel: observe the flips one at a time and track the posterior
# mean after each observation (sequential Bayesian updating).
plt.subplot(1, 2, 2)
outcomes = [1]*n_heads + [0]*n_tails
np.random.shuffle(outcomes)  # random observation order; totals unchanged

alphas = [alpha_prior]
betas = [beta_prior]
means = [alpha_prior / (alpha_prior + beta_prior)]

for outcome in outcomes:
    # Each head increments alpha, each tail increments beta.
    if outcome == 1:
        alphas.append(alphas[-1] + 1)
        betas.append(betas[-1])
    else:
        alphas.append(alphas[-1])
        betas.append(betas[-1] + 1)
    # Posterior mean of Beta(a, b) is a / (a + b).
    means.append(alphas[-1] / (alphas[-1] + betas[-1]))

plt.plot(means, 'o-', linewidth=2)
plt.axhline(0.5, color='k', linestyle='--', alpha=0.3, label='True fair value')
plt.xlabel('Number of flips observed')
plt.ylabel('Posterior mean of θ')
plt.title('Sequential Bayesian Updating')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"Prior mean: {alpha_prior/(alpha_prior+beta_prior):.3f}")
print(f"Posterior mean: {alpha_post/(alpha_post+beta_post):.3f}")
print(f"MLE estimate: {n_heads/(n_heads+n_tails):.3f}")

3. Common Probability Distributions

3.1 Discrete and Continuous Distributions

What and Why: Probability distributions are the building blocks of every probabilistic model in ML. Discrete distributions – Bernoulli (single binary outcome), Binomial (count of successes in \(n\) trials), and Poisson (count of rare events per unit time) – model categorical and count data. Continuous distributions – Gaussian (bell curve), Exponential (waiting times), and Beta (probabilities of probabilities) – model real-valued quantities.

How: Each distribution is parameterized by quantities that control its shape: the Bernoulli by \(p\), the Binomial by \((n, p)\), and the Gaussian by \((\mu, \sigma^2)\). The code below visualizes PMFs (probability mass functions) for discrete distributions as bar charts and PDFs (probability density functions) for continuous distributions as smooth curves. The Beta distribution is especially important as the conjugate prior for Bernoulli/Binomial likelihoods, meaning Bayesian updates remain in the same distribution family.

Connection: Classification losses derive from Bernoulli/Categorical distributions (cross-entropy), Poisson regression models count data in NLP and ecology, Gaussian assumptions underlie linear regression and VAE latent spaces, and Beta distributions parameterize uncertainty in Thompson sampling for bandit problems.

# One figure: PMFs of three discrete distributions (top row) and PDFs of
# three continuous families (bottom row). Garbled Greek letters in the
# titles/labels are fixed (λ, μ, α, β).
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Bernoulli: single binary trial with P(X=1) = p.
ax = axes[0, 0]
p = 0.7
x = [0, 1]
pmf = [1-p, p]
ax.bar(x, pmf, color='skyblue', edgecolor='black')
ax.set_title(f'Bernoulli(p={p})')
ax.set_xlabel('x')
ax.set_ylabel('P(X=x)')
ax.set_xticks([0, 1])

# Binomial: number of successes in n independent Bernoulli(p) trials.
ax = axes[0, 1]
n, p = 10, 0.7
x = np.arange(0, n+1)
pmf = stats.binom(n, p).pmf(x)
ax.bar(x, pmf, color='lightcoral', edgecolor='black')
ax.set_title(f'Binomial(n={n}, p={p})')
ax.set_xlabel('x (number of successes)')
ax.set_ylabel('P(X=x)')

# Poisson: count of events per interval at rate lambda.
ax = axes[0, 2]
lam = 3
x = np.arange(0, 15)
pmf = stats.poisson(lam).pmf(x)
ax.bar(x, pmf, color='lightgreen', edgecolor='black')
ax.set_title(f'Poisson(λ={lam})')
ax.set_xlabel('x (number of events)')
ax.set_ylabel('P(X=x)')

# Gaussian/Normal: standard normal density.
ax = axes[1, 0]
mu, sigma = 0, 1
x = np.linspace(-4, 4, 1000)
pdf = stats.norm(mu, sigma).pdf(x)
ax.plot(x, pdf, 'b-', linewidth=2)
ax.fill_between(x, pdf, alpha=0.3)
ax.set_title(f'Normal(μ={mu}, σ²={sigma**2})')
ax.set_xlabel('x')
ax.set_ylabel('p(x)')
ax.grid(True, alpha=0.3)

# Exponential: waiting-time density; scipy parameterizes by scale = 1/lambda.
ax = axes[1, 1]
lam = 1.5
x = np.linspace(0, 5, 1000)
pdf = stats.expon(scale=1/lam).pdf(x)
ax.plot(x, pdf, 'r-', linewidth=2)
ax.fill_between(x, pdf, alpha=0.3, color='red')
ax.set_title(f'Exponential(λ={lam})')
ax.set_xlabel('x')
ax.set_ylabel('p(x)')
ax.grid(True, alpha=0.3)

# Beta: densities on [0, 1]; several (alpha, beta) shapes overlaid.
ax = axes[1, 2]
alphas_betas = [(0.5, 0.5), (2, 2), (2, 5), (8, 4)]
x = np.linspace(0, 1, 1000)
for a, b in alphas_betas:
    pdf = stats.beta(a, b).pdf(x)
    ax.plot(x, pdf, linewidth=2, label=f'α={a}, β={b}')
ax.set_title('Beta Distributions')
ax.set_xlabel('x')
ax.set_ylabel('p(x)')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

3.2 Multivariate Gaussian Distribution

\[\mathcal{N}(\mathbf{x}|\boldsymbol{\mu}, \boldsymbol{\Sigma}) = \frac{1}{(2\pi)^{D/2}|\boldsymbol{\Sigma}|^{1/2}} \exp\left(-\frac{1}{2}(\mathbf{x}-\boldsymbol{\mu})^T\boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu})\right)\]
# 2D Gaussian with different covariances.
# Each panel shows density contours of N(0, Σ) for one covariance matrix,
# with 200 random draws overlaid to show how Σ shapes the point cloud.
# (Garbled 'Σ', 'x₁', 'x₂' label text is fixed.)
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Different covariance matrices
covariances = [
    ("Identity", np.array([[1, 0], [0, 1]])),
    ("Diagonal", np.array([[2, 0], [0, 0.5]])),
    ("Positive correlation", np.array([[1, 0.8], [0.8, 1]])),
    ("Negative correlation", np.array([[1, -0.8], [-0.8, 1]])),
    ("High variance X", np.array([[3, 0], [0, 0.3]])),
    ("Rotated", np.array([[1.5, 1], [1, 1.5]]))
]

mu = np.array([0, 0])

for idx, (name, cov) in enumerate(covariances):
    ax = axes[idx // 3, idx % 3]

    # Evaluation grid over [-3, 3] x [-3, 3].
    x = np.linspace(-3, 3, 100)
    y = np.linspace(-3, 3, 100)
    X, Y = np.meshgrid(x, y)
    pos = np.dstack((X, Y))

    # Density of this multivariate normal on the grid.
    rv = stats.multivariate_normal(mu, cov)
    Z = rv.pdf(pos)

    # Contour lines plus a filled density map.
    contour = ax.contour(X, Y, Z, levels=5, colors='black', alpha=0.3)
    ax.contourf(X, Y, Z, levels=20, cmap='viridis')

    # Random samples from the same distribution.
    samples = rv.rvs(size=200)
    ax.scatter(samples[:, 0], samples[:, 1], alpha=0.5, s=10, color='red')

    ax.set_title(f'{name}\nΣ={cov.tolist()}')
    ax.set_xlabel('x₁')
    ax.set_ylabel('x₂')
    ax.axis('equal')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

4. Monte Carlo Sampling

4.1 Estimating Expectations via Sampling

\[\mathbb{E}[f(X)] \approx \frac{1}{S}\sum_{s=1}^{S} f(x^{(s)})\]
# Example: estimate π with Monte Carlo integration.
def estimate_pi(n_samples):
    """Monte Carlo estimate of π.

    Draws `n_samples` points uniformly from the square [-1, 1] x [-1, 1];
    the fraction that lands inside the unit circle approximates the area
    ratio π/4, so four times that fraction approximates π.

    Returns a tuple (pi_estimate, x, y, inside_circle).
    """
    xs = np.random.uniform(-1, 1, n_samples)
    ys = np.random.uniform(-1, 1, n_samples)

    # Boolean mask: points at distance <= 1 from the origin.
    hits = xs**2 + ys**2 <= 1
    estimate = 4 * np.sum(hits) / n_samples

    return estimate, xs, ys, hits

# Run the estimator at increasing sample sizes to show the error shrinking
# (Monte Carlo error decays like 1/sqrt(n)). Garbled 'π'/'≈' text is fixed.
sample_sizes = [100, 1000, 10000, 100000]
estimates = []

fig, axes = plt.subplots(2, 2, figsize=(12, 12))

for idx, n in enumerate(sample_sizes):
    ax = axes[idx // 2, idx % 2]

    pi_est, x, y, inside = estimate_pi(n)
    estimates.append(pi_est)

    # Plot at most the first 1000 points; scattering 100k points is slow.
    plot_n = min(1000, n)
    ax.scatter(x[:plot_n][inside[:plot_n]], y[:plot_n][inside[:plot_n]],
               c='red', s=1, alpha=0.5, label='Inside circle')
    ax.scatter(x[:plot_n][~inside[:plot_n]], y[:plot_n][~inside[:plot_n]],
               c='blue', s=1, alpha=0.5, label='Outside circle')

    # Draw the unit-circle boundary for reference.
    circle = plt.Circle((0, 0), 1, fill=False, color='black', linewidth=2)
    ax.add_patch(circle)

    ax.set_xlim(-1.1, 1.1)
    ax.set_ylim(-1.1, 1.1)
    ax.set_aspect('equal')
    ax.set_title(f'n={n:,}\nπ ≈ {pi_est:.6f}\nError: {abs(pi_est - np.pi):.6f}')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nEstimates of π:")
for n, est in zip(sample_sizes, estimates):
    print(f"n={n:7,}: π ≈ {est:.6f}, error = {abs(est - np.pi):.6f}")
print(f"\nTrue π = {np.pi:.6f}")

4.2 Rejection Sampling

What: This code implements rejection sampling to draw samples from a mixture of two Gaussians – a distribution we cannot directly sample from. A wider Gaussian proposal distribution \(q(x)\) is scaled by a constant \(M\) so that \(M \cdot q(x) \geq p(x)\) everywhere, then candidate samples are accepted or rejected based on the ratio \(\frac{p(x)}{M \cdot q(x)}\).

Why: Many posterior distributions in Bayesian ML lack closed-form sampling procedures. Rejection sampling provides an exact sampling method: accepted samples are guaranteed to follow the target distribution \(p(x)\), regardless of its complexity. The acceptance rate equals \(\frac{1}{M}\), so the tightness of the envelope \(M \cdot q(x)\) directly determines computational efficiency.

How: For each proposal sample \(x \sim q(x)\), we draw \(u \sim \text{Uniform}(0, M \cdot q(x))\) and accept if \(u \leq p(x)\). The left plot visualizes the envelope \(M \cdot q(x)\) covering the target, while the right plot overlays the histogram of accepted samples against the true density to confirm correctness. The acceptance rate printed at the end quantifies how well the proposal matches the target.

Connection: While rejection sampling becomes inefficient in high dimensions (acceptance rates drop exponentially), it is the conceptual foundation for more advanced MCMC methods like Metropolis-Hastings and slice sampling used in Bayesian deep learning, probabilistic programming frameworks (Stan, PyMC), and variational inference proposals.

# Sample from a mixture of Gaussians using rejection sampling.
def target_distribution(x):
    """Target density p(x): a two-component Gaussian mixture.

    Components: N(-2, 0.5²) with weight 0.3 and N(2, 1²) with weight 0.7.
    """
    left = stats.norm(-2, 0.5).pdf(x)
    right = stats.norm(2, 1).pdf(x)
    return 0.3 * left + 0.7 * right

def proposal_distribution(x):
    """Proposal density q(x): a single broad zero-mean Gaussian (std 3)."""
    q = stats.norm(loc=0, scale=3)
    return q.pdf(x)

# Find scaling constant M such that M*q(x) >= p(x) on the plotting range.
# The 1.1 factor adds 10% headroom so the envelope strictly dominates the
# target even between grid points. (Garbled '×' in the legend is fixed.)
x_range = np.linspace(-6, 6, 1000)
M = np.max(target_distribution(x_range) / proposal_distribution(x_range)) * 1.1

# Rejection sampling loop: keep drawing until 10,000 accepted samples.
samples = []
n_accepted = 0
n_rejected = 0

np.random.seed(42)
while len(samples) < 10000:
    # Draw a candidate from q and a uniform height under the envelope M*q(x).
    x = np.random.normal(0, 3)
    u = np.random.uniform(0, M * proposal_distribution(x))

    # Accept iff the uniform draw falls under the target density p(x).
    if u <= target_distribution(x):
        samples.append(x)
        n_accepted += 1
    else:
        n_rejected += 1

samples = np.array(samples)

# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left: target vs proposal — the envelope M*q must cover p everywhere.
ax1.plot(x_range, target_distribution(x_range), 'b-', linewidth=2, label='Target p(x)')
ax1.plot(x_range, M * proposal_distribution(x_range), 'r--', linewidth=2,
         label=f'M×q(x), M={M:.2f}')
ax1.plot(x_range, proposal_distribution(x_range), 'g:', linewidth=2, label='Proposal q(x)')
ax1.fill_between(x_range, target_distribution(x_range), alpha=0.3)
ax1.set_xlabel('x')
ax1.set_ylabel('Density')
ax1.set_title('Rejection Sampling')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right: histogram of accepted samples should match the target density.
ax2.hist(samples, bins=50, density=True, alpha=0.6, label='Samples', edgecolor='black')
ax2.plot(x_range, target_distribution(x_range), 'r-', linewidth=2, label='Target p(x)')
ax2.set_xlabel('x')
ax2.set_ylabel('Density')
ax2.set_title(f'Samples from Target Distribution\nAcceptance rate: {n_accepted/(n_accepted+n_rejected):.2%}')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Acceptance rate: {n_accepted/(n_accepted+n_rejected):.2%}")
print(f"Samples generated: {len(samples)}")

5. Information Theory Basics

5.1 Entropy

Shannon Entropy: \(H(X) = -\sum_{x} p(x) \log_2 p(x)\)

Measures uncertainty or information content.

def entropy(probs):
    """Shannon entropy H(X) = -sum p(x) log2 p(x), in bits.

    Zero-probability outcomes are dropped, following the convention
    0 * log(0) = 0.
    """
    p = np.asarray(probs)
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

# Binary entropy function: H(p) = -p*log2(p) - (1-p)*log2(1-p).
# The grid avoids exact 0 and 1, where log2 is undefined.
p_range = np.linspace(0.001, 0.999, 1000)
binary_entropy = [-p*np.log2(p) - (1-p)*np.log2(1-p) for p in p_range]

plt.figure(figsize=(12, 5))

# Binary entropy
# Left panel: the curve peaks at p = 0.5 (maximum uncertainty, 1 bit).
plt.subplot(1, 2, 1)
plt.plot(p_range, binary_entropy, 'b-', linewidth=2)
plt.axvline(0.5, color='r', linestyle='--', label='Maximum at p=0.5')
plt.xlabel('p (probability of event)')
plt.ylabel('H(X) bits')
plt.title('Binary Entropy Function')
plt.grid(True, alpha=0.3)
plt.legend()

# Entropy for different distributions
# Right panel: entropy of several 4-outcome distributions, from uniform
# (maximum entropy, 2 bits) down to deterministic (0 bits).
plt.subplot(1, 2, 2)
distributions = {
    'Uniform (4 outcomes)': [0.25, 0.25, 0.25, 0.25],
    'Skewed': [0.7, 0.2, 0.08, 0.02],
    'Very skewed': [0.95, 0.03, 0.015, 0.005],
    'Two outcomes': [0.5, 0.5, 0, 0],
    'Deterministic': [1.0, 0, 0, 0]
}

names = list(distributions.keys())
entropies = [entropy(distributions[name]) for name in names]

plt.barh(names, entropies, color='skyblue', edgecolor='black')
plt.xlabel('Entropy (bits)')
plt.title('Entropy of Different Distributions')
plt.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

print("Entropy values:")
for name, H in zip(names, entropies):
    print(f"{name:25s}: {H:.4f} bits")

5.2 KL Divergence

\[D_{KL}(p||q) = \sum_{x} p(x) \log \frac{p(x)}{q(x)}\]

Measures how one probability distribution differs from another.

def kl_divergence(p, q):
    """KL divergence D_KL(p || q) between two discrete distributions.

    A small epsilon is added to both numerator and denominator of the
    log ratio to avoid log(0) and division by zero when either
    distribution assigns zero probability to an outcome.
    """
    eps = 1e-10
    p_arr = np.array(p)
    q_arr = np.array(q)
    ratio = (p_arr + eps) / (q_arr + eps)
    return np.sum(p_arr * np.log(ratio))

# Example: KL divergence between Gaussians.
# Compare one "true" Gaussian against several approximations and show how
# KL grows as the approximation gets worse. (Garbled '²' labels are fixed.)
x = np.linspace(-5, 10, 1000)

# True distribution
mu_true, sigma_true = 2, 1
p_true = stats.norm(mu_true, sigma_true).pdf(x)

# Candidate approximations: (label, mean, std).
approximations = [
    ('Good approximation', 2.1, 1.1),
    ('Wrong mean', 0, 1),
    ('Wrong variance', 2, 2),
    ('Both wrong', 4, 0.5)
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for idx, (name, mu_approx, sigma_approx) in enumerate(approximations):
    ax = axes[idx // 2, idx % 2]

    q_approx = stats.norm(mu_approx, sigma_approx).pdf(x)

    # Approximate the continuous KL by normalizing the sampled densities
    # into discrete distributions on the grid.
    p_discrete = p_true / np.sum(p_true)
    q_discrete = q_approx / np.sum(q_approx)
    kl_div = kl_divergence(p_discrete, q_discrete)

    ax.plot(x, p_true, 'b-', linewidth=2, label=f'True: N({mu_true},{sigma_true}²)')
    ax.plot(x, q_approx, 'r--', linewidth=2, label=f'Approx: N({mu_approx},{sigma_approx}²)')
    ax.fill_between(x, p_true, alpha=0.2)
    ax.fill_between(x, q_approx, alpha=0.2, color='red')
    ax.set_title(f'{name}\nKL(p||q) = {kl_div:.4f}')
    ax.set_xlabel('x')
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nKL Divergence (lower is better):")
for name, mu, sigma in approximations:
    q = stats.norm(mu, sigma).pdf(x)
    p_d = p_true / np.sum(p_true)
    q_d = q / np.sum(q)
    kl = kl_divergence(p_d, q_d)
    print(f"{name:25s}: {kl:.6f}")

5.3 Mutual Information

\[I(X;Y) = \sum_{x,y} p(x,y) \log \frac{p(x,y)}{p(x)p(y)}\]

Measures the mutual dependence between two variables.

def mutual_information(joint_prob):
    """Mutual information I(X;Y) in nats from a joint probability table.

    Compares the joint p(x, y) against the product of its marginals
    p(x)p(y); the result is ~0 when X and Y are independent. A small
    epsilon guards against log(0) in zero-probability cells.
    """
    # Keep the marginals as a column vector and a row vector so their
    # matrix product reproduces the full p(x)p(y) table.
    p_x = joint_prob.sum(axis=1, keepdims=True)
    p_y = joint_prob.sum(axis=0, keepdims=True)
    independent = p_x @ p_y

    eps = 1e-10
    ratio = (joint_prob + eps) / (independent + eps)
    return np.sum(joint_prob * np.log(ratio))

# Example: Correlated vs independent variables
# Three 2x2 joint tables: the further the joint is from the product of its
# marginals, the larger the mutual information I(X;Y).
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Independent
# Joint built as the outer product of the marginals, so I(X;Y) is ~0.
p_x = np.array([0.5, 0.5])
p_y = np.array([0.3, 0.7])
joint_indep = np.outer(p_x, p_y)
mi_indep = mutual_information(joint_indep)

# Weakly correlated
joint_weak = np.array([[0.20, 0.30],
                       [0.10, 0.40]])
mi_weak = mutual_information(joint_weak)

# Strongly correlated
# Most mass on the diagonal: knowing X nearly determines Y.
joint_strong = np.array([[0.45, 0.05],
                         [0.05, 0.45]])
mi_strong = mutual_information(joint_strong)

joints = [joint_indep, joint_weak, joint_strong]
mis = [mi_indep, mi_weak, mi_strong]
titles = ['Independent', 'Weakly Correlated', 'Strongly Correlated']

# One heatmap per joint table, annotated with cell probabilities and I(X;Y).
for ax, joint, mi, title in zip(axes, joints, mis, titles):
    im = ax.imshow(joint, cmap='YlOrRd', vmin=0, vmax=0.5)
    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xlabel('Y')
    ax.set_ylabel('X')
    ax.set_title(f'{title}\nI(X;Y) = {mi:.4f} nats')

    # Annotate cells
    for i in range(2):
        for j in range(2):
            ax.text(j, i, f'{joint[i,j]:.2f}', ha='center', va='center', 
                   color='white' if joint[i,j] > 0.25 else 'black', fontsize=14)

    plt.colorbar(im, ax=ax)

plt.tight_layout()
plt.show()

print("Mutual Information:")
for title, mi in zip(titles, mis):
    print(f"{title:25s}: {mi:.6f} nats")

Summary

In this notebook, we covered:

  1. Basic Probability Rules: Sum rule, product rule, conditional probability

  2. Bayes' Rule: Bayesian inference and sequential updating

  3. Probability Distributions: Discrete and continuous distributions

  4. Monte Carlo Methods: Sampling-based estimation and rejection sampling

  5. Information Theory: Entropy, KL divergence, and mutual information

These concepts form the foundation for understanding machine learning from a probabilistic perspective.

Exercises

  1. Implement importance sampling and compare it with rejection sampling

  2. Calculate the mutual information between correlated Gaussian variables

  3. Derive and implement the Beta-Binomial conjugate prior update

  4. Visualize the relationship between correlation and mutual information

  5. Implement a simple Gibbs sampler for a 2D Gaussian