import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.special import gammaln, logsumexp
import pandas as pd
# Notebook-wide plotting style and a fixed RNG seed so all simulated
# data and figures below are reproducible run-to-run.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
np.random.seed(42)
1. Bayesian Inference Fundamentals
Bayes' Theorem for Parameters
\[ p(\theta|\mathcal{D}) = \frac{p(\mathcal{D}|\theta)\,p(\theta)}{p(\mathcal{D})} \]
Where:
\(p(\theta|\mathcal{D})\) is the posterior: our belief after seeing data
\(p(\mathcal{D}|\theta)\) is the likelihood: probability of data given parameters
\(p(\theta)\) is the prior: our belief before seeing data
\(p(\mathcal{D})\) is the evidence (marginal likelihood)
# Coin flipping with Bayesian inference.
# The Beta distribution is the conjugate prior for the Bernoulli likelihood,
# so the posterior is available in closed form.
def plot_beta_binomial(alpha_prior, beta_prior, n_heads, n_tails, title=""):
    """Visualize Bayesian updating for the Beta-Binomial model.

    Draws the Beta prior, the (rescaled) Binomial likelihood, and the
    resulting Beta posterior on a single axis, marking the posterior mean
    and, when any flips were observed, the maximum-likelihood estimate.

    Returns the posterior parameters ``(alpha_post, beta_post)``.
    """
    grid = np.linspace(0, 1, 1000)

    # Conjugate update: Beta(a, b) prior + (h heads, t tails) data
    # gives a Beta(a + h, b + t) posterior.
    alpha_post = alpha_prior + n_heads
    beta_post = beta_prior + n_tails

    prior_pdf = stats.beta(alpha_prior, beta_prior).pdf(grid)
    posterior_pdf = stats.beta(alpha_post, beta_post).pdf(grid)

    # The likelihood is only defined up to a constant; scale its peak to 1
    # so it is visible on the same axis as the densities.
    lik = grid**n_heads * (1 - grid)**n_tails
    lik = lik / np.max(lik)

    plt.figure(figsize=(12, 5))
    plt.plot(grid, prior_pdf, 'b-', linewidth=2,
             label=f'Prior: Beta({alpha_prior},{beta_prior})')
    plt.plot(grid, lik, 'g--', linewidth=2,
             label=f'Likelihood: {n_heads}H, {n_tails}T (scaled)')
    plt.plot(grid, posterior_pdf, 'r-', linewidth=2,
             label=f'Posterior: Beta({alpha_post},{beta_post})')

    post_mean = alpha_post / (alpha_post + beta_post)
    plt.axvline(post_mean, color='r', linestyle=':',
                label=f'Posterior mean: {post_mean:.3f}')
    if n_heads + n_tails > 0:
        mle = n_heads / (n_heads + n_tails)
        plt.axvline(mle, color='g', linestyle=':', label=f'MLE: {mle:.3f}')

    plt.xlabel('ΞΈ (probability of heads)')
    plt.ylabel('Density')
    plt.title(title if title else 'Bayesian Inference for Coin Flipping')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    return alpha_post, beta_post
# Run the same data (7 heads, 3 tails) through three different prior beliefs
# to show how prior strength shifts the posterior.
scenarios = [
    ("Scenario 1: Uniform prior (no prior knowledge)", 1, 1, "Uniform Prior: Beta(1,1)"),
    ("Scenario 2: Informative prior (believe coin is fair)", 10, 10, "Informative Prior: Beta(10,10)"),
    ("Scenario 3: Skeptical prior (believe coin favors tails)", 3, 7, "Skeptical Prior: Beta(3,7)"),
]
for k, (heading, a0, b0, fig_title) in enumerate(scenarios):
    banner = "=" * 60
    # The first banner has no leading blank line; later ones do.
    print(banner if k == 0 else "\n" + banner)
    print(heading)
    print(banner)
    plot_beta_binomial(a0, b0, 7, 3, fig_title)
Sequential Bayesian UpdateΒΆ
The posterior from one update becomes the prior for the next:
# Sequential updating demonstration: process coin flips one at a time,
# using each posterior as the prior for the next observation.
# Starting prior Beta(2, 2): weakly centered on a fair coin.
alpha = 2
beta = 2
# Simulate coin flips from a biased coin.
true_theta = 0.7
flips = np.random.binomial(1, true_theta, 100)
# Track the full posterior trajectory (one entry per observation).
alphas = [alpha]
betas = [beta]
theta_range = np.linspace(0, 1, 1000)
snapshots = [0, 5, 20, 100]  # Save posterior densities at these flip counts
posteriors = {}
for i, flip in enumerate(flips):
    # Conjugate update: a head increments alpha, a tail increments beta.
    if flip == 1:
        alpha += 1
    else:
        beta += 1
    alphas.append(alpha)
    betas.append(beta)
    if i+1 in snapshots:
        posteriors[i+1] = stats.beta(alpha, beta).pdf(theta_range)
# Plot the posterior at each snapshot (the 0-flip panel is the prior itself).
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for idx, n_flips in enumerate(snapshots):
    ax = axes[idx // 2, idx % 2]
    if n_flips == 0:
        # Before any data the "posterior" is just the prior density.
        posterior = stats.beta(alphas[0], betas[0]).pdf(theta_range)
        n_heads = 0
        n_tails = 0
    else:
        posterior = posteriors[n_flips]
        n_heads = np.sum(flips[:n_flips])
        n_tails = n_flips - n_heads
    ax.plot(theta_range, posterior, 'b-', linewidth=2)
    ax.fill_between(theta_range, posterior, alpha=0.3)
    ax.axvline(true_theta, color='r', linestyle='--', linewidth=2, label=f'True ΞΈ = {true_theta}')
    # Posterior mean of a Beta(a, b) is a / (a + b).
    ax.axvline(alphas[n_flips]/(alphas[n_flips]+betas[n_flips]), color='g',
               linestyle=':', linewidth=2, label=f'Posterior mean')
    ax.set_xlabel('ΞΈ')
    ax.set_ylabel('Density')
    ax.set_title(f'After {n_flips} flips: {n_heads}H, {n_tails}T\nBeta({alphas[n_flips]},{betas[n_flips]})')
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Plot how the posterior mean converges toward the true theta as data accrue.
posterior_means = [a/(a+b) for a, b in zip(alphas, betas)]
plt.figure(figsize=(12, 5))
plt.plot(posterior_means, 'b-', linewidth=2, label='Posterior mean')
plt.axhline(true_theta, color='r', linestyle='--', linewidth=2, label=f'True ΞΈ = {true_theta}')
plt.xlabel('Number of observations')
plt.ylabel('Posterior mean of ΞΈ')
plt.title('Sequential Bayesian Learning')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print(f"Final posterior: Beta({alphas[-1]}, {betas[-1]})")
print(f"Posterior mean: {posterior_means[-1]:.4f}")
print(f"True value: {true_theta}")
print(f"MLE: {np.sum(flips)/len(flips):.4f}")
2. Conjugate PriorsΒΆ
A prior is conjugate to a likelihood if the posterior has the same functional form as the prior.
Common Conjugate PairsΒΆ
Likelihood |
Conjugate Prior |
Posterior |
|---|---|---|
Bernoulli/Binomial |
Beta |
Beta |
Multinomial |
Dirichlet |
Dirichlet |
Poisson |
Gamma |
Gamma |
Gaussian (known σ²) |
Gaussian |
Gaussian |
Gaussian (known μ) |
Inverse-Gamma |
Inverse-Gamma |
# Example 1: Poisson-Gamma conjugacy.
# Model event counts with a Poisson likelihood; the Gamma prior on the rate
# is conjugate, so the posterior is again Gamma with simple updated parameters.
# True rate parameter used to simulate the data.
lambda_true = 5
# Generate 50 observed counts.
data = np.random.poisson(lambda_true, 50)
# Prior: Gamma(alpha, beta) in the *rate* parameterization (mean = alpha/beta).
alpha_prior = 2
beta_prior = 0.5
# Conjugate update: posterior is Gamma(alpha + sum(x_i), beta + n).
alpha_post = alpha_prior + np.sum(data)
beta_post = beta_prior + len(data)
# Plot prior vs posterior over a grid of candidate rates.
# NOTE: scipy's gamma takes a *scale* parameter, hence scale = 1/rate.
lambda_range = np.linspace(0, 15, 1000)
prior = stats.gamma(alpha_prior, scale=1/beta_prior).pdf(lambda_range)
posterior = stats.gamma(alpha_post, scale=1/beta_post).pdf(lambda_range)
plt.figure(figsize=(12, 5))
plt.plot(lambda_range, prior, 'b-', linewidth=2,
         label=f'Prior: Gamma({alpha_prior}, {beta_prior})')
plt.plot(lambda_range, posterior, 'r-', linewidth=2,
         label=f'Posterior: Gamma({alpha_post:.0f}, {beta_post:.0f})')
plt.axvline(lambda_true, color='g', linestyle='--', linewidth=2, label=f'True Ξ» = {lambda_true}')
plt.axvline(alpha_post/beta_post, color='r', linestyle=':', linewidth=2,
            label=f'Posterior mean = {alpha_post/beta_post:.3f}')
plt.axvline(np.mean(data), color='orange', linestyle=':', linewidth=2,
            label=f'MLE = {np.mean(data):.3f}')
plt.xlabel('Ξ» (rate parameter)')
plt.ylabel('Density')
plt.title(f'Poisson-Gamma Conjugacy (n={len(data)} observations)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Summaries: a Gamma(a, b) (rate form) has mean a/b and std sqrt(a)/b.
print(f"Data summary: mean={np.mean(data):.2f}, std={np.std(data):.2f}")
print(f"Prior mean: {alpha_prior/beta_prior:.2f}")
print(f"Posterior mean: {alpha_post/beta_post:.2f}")
print(f"Posterior std: {np.sqrt(alpha_post)/(beta_post):.2f}")
# Example 2: Gaussian-Gaussian conjugacy (known variance).
# Infer the mean of a Gaussian whose standard deviation is known; a Gaussian
# prior on the mean is conjugate, and the posterior mean is a
# precision-weighted average of prior mean and sample mean.
# True parameters of the data-generating process.
mu_true = 5
sigma_known = 2
# Generate 30 observations.
data_gaussian = np.random.normal(mu_true, sigma_known, 30)
# Prior on the mean: N(mu0, sigma0^2) — wide, weakly informative.
mu0 = 0
sigma0 = 5
# Posterior on the mean: precisions (inverse variances) simply add.
n = len(data_gaussian)
precision0 = 1 / sigma0**2            # prior precision
precision_data = n / sigma_known**2   # total precision contributed by the data
precision_post = precision0 + precision_data
# Posterior mean = precision-weighted average of prior mean and sample mean.
mu_post = (precision0 * mu0 + precision_data * np.mean(data_gaussian)) / precision_post
sigma_post = np.sqrt(1 / precision_post)
# Plot prior, likelihood (as a function of mu), and posterior.
mu_range = np.linspace(-5, 15, 1000)
prior_gauss = stats.norm(mu0, sigma0).pdf(mu_range)
# As a function of mu, the likelihood of the sample mean has std sigma/sqrt(n).
likelihood_gauss = stats.norm(np.mean(data_gaussian), sigma_known/np.sqrt(n)).pdf(mu_range)
posterior_gauss = stats.norm(mu_post, sigma_post).pdf(mu_range)
plt.figure(figsize=(12, 5))
plt.plot(mu_range, prior_gauss, 'b-', linewidth=2, label=f'Prior: N({mu0}, {sigma0}Β²)')
plt.plot(mu_range, likelihood_gauss, 'g--', linewidth=2, label='Likelihood (scaled)')
plt.plot(mu_range, posterior_gauss, 'r-', linewidth=2,
         label=f'Posterior: N({mu_post:.2f}, {sigma_post:.2f}Β²)')
plt.axvline(mu_true, color='black', linestyle='--', linewidth=2, label=f'True ΞΌ = {mu_true}')
plt.xlabel('ΞΌ')
plt.ylabel('Density')
plt.title(f'Gaussian-Gaussian Conjugacy (Ο={sigma_known} known, n={n})')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# 95% credible interval for a Gaussian posterior: mean +/- 1.96 * sd.
ci_lower = mu_post - 1.96 * sigma_post
ci_upper = mu_post + 1.96 * sigma_post
print(f"\nPosterior 95% credible interval: [{ci_lower:.3f}, {ci_upper:.3f}]")
print(f"True ΞΌ in credible interval: {ci_lower <= mu_true <= ci_upper}")
3. Posterior Predictive Distribution
Prediction for new data, marginalizing over parameter uncertainty:
\[ p(\tilde{x}|\mathcal{D}) = \int p(\tilde{x}|\theta)\, p(\theta|\mathcal{D})\, d\theta \]
This accounts for both data uncertainty and parameter uncertainty.
# Beta-Binomial posterior predictive: probability that the next flip is heads.
def beta_binomial_predictive(alpha, beta):
    """Posterior predictive probability of heads.

    For a Beta(alpha, beta) posterior over the heads probability, the
    predictive probability that the next flip lands heads equals the
    posterior mean, alpha / (alpha + beta).
    """
    total = alpha + beta
    return alpha / total
# Scenario: observed 7 heads, 3 tails.
n_heads = 7
n_tails = 3
# Candidate prior beliefs: (label, prior alpha, prior beta).
priors = [
    ("Uniform", 1, 1),
    ("Weak", 2, 2),
    ("Strong Fair", 10, 10),
    ("Strong Biased", 3, 7)
]
# Prior-free maximum-likelihood estimate for comparison.
mle = n_heads / (n_heads + n_tails)
# Tabulate the posterior predictive P(next = heads) under each prior.
rule = "=" * 70
print("Posterior Predictive Probabilities:")
print(rule)
print(f"{'Prior':<20} {'P(next=H|data)':<20} {'MLE':<20}")
print(rule)
for label, a0, b0 in priors:
    prob_heads = beta_binomial_predictive(a0 + n_heads, b0 + n_tails)
    print(f"{label:<20} {prob_heads:<20.4f} {mle:<20.4f}")
# Visualize the posterior over theta under each prior, one panel per prior.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for idx, (name, alpha_pr, beta_pr) in enumerate(priors):
    ax = axes[idx // 2, idx % 2]
    # Conjugate posterior parameters for this prior.
    alpha_po = alpha_pr + n_heads
    beta_po = beta_pr + n_tails
    # Sample parameter values from the posterior.
    theta_samples = np.random.beta(alpha_po, beta_po, 10000)
    # Simulate one predictive flip per sampled theta.
    # NOTE(review): `predictions` is computed but never used below — the
    # panel titles report the analytic predictive mean instead; confirm intent.
    predictions = np.random.binomial(1, theta_samples)
    # Plot the posterior density with the MLE and posterior mean marked.
    theta_range = np.linspace(0, 1, 1000)
    posterior = stats.beta(alpha_po, beta_po).pdf(theta_range)
    ax.plot(theta_range, posterior, 'b-', linewidth=2)
    ax.fill_between(theta_range, posterior, alpha=0.3)
    ax.axvline(mle, color='g', linestyle='--', linewidth=2, label=f'MLE = {mle:.3f}')
    ax.axvline(alpha_po/(alpha_po+beta_po), color='r', linestyle=':', linewidth=2,
               label=f'Posterior mean = {alpha_po/(alpha_po+beta_po):.3f}')
    ax.set_xlabel('ΞΈ')
    ax.set_ylabel('Density')
    ax.set_title(f'{name} Prior\nP(next=H|data) = {alpha_po/(alpha_po+beta_po):.3f}')
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
4. Bayesian Decision TheoryΒΆ
Loss Functions and RiskΒΆ
Expected loss (risk) for action \(a\) given data:
Optimal Bayes action: \(a^* = \arg\min_a R(a|\mathcal{D})\)
Common Loss FunctionsΒΆ
0-1 loss: \(L(\theta, a) = \mathbb{I}(\theta \neq a)\) β Optimal: posterior mode
Squared loss: \(L(\theta, a) = (\theta - a)^2\) β Optimal: posterior mean
Absolute loss: \(L(\theta, a) = |\theta - a|\) β Optimal: posterior median
# Bayesian point estimation under different loss functions.
# Build a skewed posterior as a two-component Beta mixture so that the
# mode, mean, and median visibly differ.
alpha1, beta1 = 5, 2
alpha2, beta2 = 3, 5
weight1 = 0.7  # mixture weight of the first component
# Sample from the mixture by drawing proportionally from each component.
samples1 = np.random.beta(alpha1, beta1, int(10000*weight1))
samples2 = np.random.beta(alpha2, beta2, int(10000*(1-weight1)))
posterior_samples = np.concatenate([samples1, samples2])
# Optimal point estimates under the three classical losses.
# NOTE(review): the mode is hard-coded as an eyeballed approximation rather
# than estimated from the samples (e.g. via the KDE argmax) — confirm intent.
posterior_mode = 0.7 # Approximate mode
posterior_mean = np.mean(posterior_samples)      # minimizes squared loss
posterior_median = np.median(posterior_samples)  # minimizes absolute loss
# Plot the posterior density with all three estimates marked.
theta_range = np.linspace(0, 1, 1000)
# Estimate a smooth PDF from the samples with a kernel density estimate.
from scipy.stats import gaussian_kde
kde = gaussian_kde(posterior_samples)
posterior_pdf = kde(theta_range)
plt.figure(figsize=(12, 6))
plt.plot(theta_range, posterior_pdf, 'b-', linewidth=2, label='Posterior')
plt.fill_between(theta_range, posterior_pdf, alpha=0.3)
plt.axvline(posterior_mode, color='r', linestyle='--', linewidth=2,
            label=f'Mode (0-1 loss) β {posterior_mode:.3f}')
plt.axvline(posterior_mean, color='g', linestyle='--', linewidth=2,
            label=f'Mean (squared loss) = {posterior_mean:.3f}')
plt.axvline(posterior_median, color='orange', linestyle='--', linewidth=2,
            label=f'Median (absolute loss) = {posterior_median:.3f}')
plt.xlabel('ΞΈ')
plt.ylabel('Density')
plt.title('Optimal Estimates Under Different Loss Functions')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print("Optimal point estimates:")
print(f" 0-1 loss (minimize misclassification): Mode β {posterior_mode:.4f}")
print(f" Squared loss (minimize MSE): Mean = {posterior_mean:.4f}")
print(f" Absolute loss (minimize MAE): Median = {posterior_median:.4f}")
Classification with Asymmetric Costs
What: This code applies Bayesian decision theory to medical diagnosis, where the cost of a false negative (missing a disease) vastly exceeds a false positive (unnecessary treatment). It computes expected losses under each action using the posterior \(p(\text{disease}|+)\) and a user-defined loss matrix, then selects the action minimizing expected risk.
Why: Real-world classification rarely has symmetric costs. In medical screening, fraud detection, and safety systems, the consequences of different error types differ by orders of magnitude. The Bayes-optimal decision threshold is not 0.5 but rather \(\tau^* = \frac{C_{FP}}{C_{FP} + C_{FN}}\), shifting toward the cheaper error. Even though \(p(\text{disease}|+) \approx 0.17\) is below 0.5, the high cost ratio (\(C_{FN}/C_{FP} = 10\)) can still make treatment the optimal action.
Connection: Asymmetric cost-sensitive classification is critical in production ML systems: autonomous vehicles weight pedestrian detection errors heavily, credit scoring penalizes wrongful denials differently from defaults, and cybersecurity intrusion detection systems tune thresholds based on operational costs of missed attacks versus false alarms.
# Medical diagnosis with asymmetric costs: a false negative (missed disease)
# is far more expensive than a false positive (unnecessary treatment).

# Test characteristics and disease prevalence.
p_disease = 0.01     # Base rate
sensitivity = 0.99   # P(+|disease)
specificity = 0.95   # P(-|no disease)

# Patient tests positive.
p_positive_given_disease = sensitivity
p_positive_given_no_disease = 1 - specificity

# Bayes' rule: P(disease|+) = P(+|disease) P(disease) / P(+),
# with P(+) obtained by marginalizing over both disease states.
p_positive = (p_positive_given_disease * p_disease +
              p_positive_given_no_disease * (1 - p_disease))
p_disease_given_positive = (p_positive_given_disease * p_disease) / p_positive
p_no_disease_given_positive = 1 - p_disease_given_positive

print(f"Posterior probabilities given positive test:")
print(f" P(disease|+) = {p_disease_given_positive:.4f}")
print(f" P(no disease|+) = {p_no_disease_given_positive:.4f}")

# Loss matrix (rows: action, columns: true state):
#                Disease   No Disease
#   Treat           0          10    (false positive cost)
#   No Treat       100          0    (false negative cost)
cost_FP = 10   # Cost of unnecessary treatment
cost_FN = 100  # Cost of missing disease

# Posterior expected loss of each action.
risk_treat = 0 * p_disease_given_positive + cost_FP * p_no_disease_given_positive
risk_no_treat = cost_FN * p_disease_given_positive + 0 * p_no_disease_given_positive

print(f"\nExpected losses:")
print(f" Treat: {risk_treat:.4f}")
print(f" Don't treat: {risk_no_treat:.4f}")
print(f"\nOptimal action: {'Treat' if risk_treat < risk_no_treat else 'Do not treat'}")

# Sweep the FN/FP cost ratio and plot where the optimal treat/no-treat
# threshold on P(disease|+) lands. Losses are equal when
#   cost_FP * (1 - p) = cost_FN * p  =>  p* = cost_FP / (cost_FP + cost_FN).
cost_ratios = np.logspace(-2, 2, 100)  # FN/FP cost ratio
decision_thresholds = [cost_FP / (cost_FP + cost_FP * ratio) for ratio in cost_ratios]

plt.figure(figsize=(12, 5))
plt.semilogx(cost_ratios, decision_thresholds, 'b-', linewidth=2)
plt.axhline(p_disease_given_positive, color='r', linestyle='--', linewidth=2,
            label=f'P(disease|+) = {p_disease_given_positive:.4f}')
plt.xlabel('Cost Ratio (False Negative / False Positive)')
plt.ylabel('Decision Threshold')
plt.title('Optimal Decision Threshold vs Cost Ratio')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"\nWith cost ratio of {cost_FN/cost_FP:.1f}:")
print(f"Decision threshold: {cost_FP/(cost_FP+cost_FN):.4f}")
print(f"Since P(disease|+) = {p_disease_given_positive:.4f} > threshold, we should treat.")
5. Empirical BayesΒΆ
Idea: Use data to estimate hyperparameters of the prior
Also called Type-II Maximum Likelihood or Evidence Maximization
# Empirical Bayes for many parallel Binomial experiments
# (e.g. batting averages for baseball players): estimate the Beta prior's
# hyperparameters from the pooled data, then shrink each player's estimate
# toward the estimated prior mean.
# Ground-truth hierarchical model: each ability theta ~ Beta(20, 70).
alpha_true = 20
beta_true = 70
n_players = 50
n_trials = 100 # At-bats per player
# Generate true abilities and observed success counts.
true_abilities = np.random.beta(alpha_true, beta_true, n_players)
successes = np.random.binomial(n_trials, true_abilities)
# MLE estimates (no shrinkage): each player's raw success rate.
mle_estimates = successes / n_trials
# Empirical Bayes: fit the Beta hyperparameters to the observed rates
# by the method of moments.
sample_mean = np.mean(successes / n_trials)
sample_var = np.var(successes / n_trials)
# Method-of-moments estimators, inverting
#   E[theta]   = alpha / (alpha + beta)
#   Var[theta] = alpha * beta / ((alpha + beta)^2 (alpha + beta + 1))
# NOTE(review): sample_var also contains Binomial sampling noise, so this
# slightly overstates the prior variance (understates alpha + beta).
alpha_eb = sample_mean * (sample_mean * (1 - sample_mean) / sample_var - 1)
beta_eb = (1 - sample_mean) * (sample_mean * (1 - sample_mean) / sample_var - 1)
# Empirical Bayes posterior means: shrunk toward the estimated prior mean.
eb_estimates = (alpha_eb + successes) / (alpha_eb + beta_eb + n_trials)
print(f"True hyperparameters: Ξ±={alpha_true}, Ξ²={beta_true}")
print(f"Estimated hyperparameters: Ξ±={alpha_eb:.2f}, Ξ²={beta_eb:.2f}")
print(f"True prior mean: {alpha_true/(alpha_true+beta_true):.4f}")
print(f"Estimated prior mean: {alpha_eb/(alpha_eb+beta_eb):.4f}")
# Compare estimation error against the simulated ground truth.
mle_error = np.mean((mle_estimates - true_abilities)**2)
eb_error = np.mean((eb_estimates - true_abilities)**2)
print(f"\nMean Squared Error:")
print(f" MLE: {mle_error:.6f}")
print(f" Empirical Bayes: {eb_error:.6f}")
print(f" Improvement: {100*(mle_error-eb_error)/mle_error:.1f}%")
# Visualize shrinkage.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Left panel: both estimators against the true abilities.
axes[0].scatter(mle_estimates, true_abilities, alpha=0.6, s=50, label='MLE')
axes[0].scatter(eb_estimates, true_abilities, alpha=0.6, s=50, label='Empirical Bayes')
axes[0].plot([0, 0.5], [0, 0.5], 'k--', alpha=0.5, label='Perfect prediction')
axes[0].set_xlabel('Estimated Ability')
axes[0].set_ylabel('True Ability')
axes[0].set_title('Estimates vs True Abilities')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].axis('equal')
# Right panel: EB vs MLE shows compression toward the prior mean.
axes[1].scatter(mle_estimates, eb_estimates, alpha=0.6, s=50)
axes[1].plot([0, 0.5], [0, 0.5], 'k--', alpha=0.5, label='No shrinkage')
axes[1].axhline(alpha_eb/(alpha_eb+beta_eb), color='r', linestyle=':',
                label=f'Prior mean = {alpha_eb/(alpha_eb+beta_eb):.3f}')
axes[1].set_xlabel('MLE Estimate')
axes[1].set_ylabel('Empirical Bayes Estimate')
axes[1].set_title('Shrinkage Effect')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
axes[1].axis('equal')
plt.tight_layout()
plt.show()
# List the five players whose raw rates sit farthest from the pooled mean —
# these experience the largest shrinkage.
extreme_idx = np.argsort(np.abs(mle_estimates - sample_mean))[-5:]
print("\nExamples of shrinkage (most extreme):")
print(f"{'True':<8} {'Successes':<10} {'MLE':<8} {'EB':<8} {'Shrinkage':<10}")
for idx in extreme_idx:
    shrinkage = mle_estimates[idx] - eb_estimates[idx]
    print(f"{true_abilities[idx]:<8.3f} {successes[idx]:<10} {mle_estimates[idx]:<8.3f} "
          f"{eb_estimates[idx]:<8.3f} {shrinkage:<10.3f}")
SummaryΒΆ
In this notebook, we covered:
Bayesian Inference: Sequential updating and parameter learning
Conjugate Priors: Convenient analytical solutions for common models
Posterior Predictive: Accounting for parameter uncertainty in predictions
Bayesian Decision Theory: Optimal actions under different loss functions
Empirical Bayes: Data-driven prior specification
Key TakeawaysΒΆ
Bayesian methods provide a principled framework for incorporating prior knowledge
Conjugate priors enable analytical posterior computation
The posterior predictive distribution accounts for parameter uncertainty
Different loss functions lead to different optimal point estimates
Empirical Bayes offers a compromise between full Bayes and frequentist methods
Shrinkage towards the prior mean can improve estimates, especially with limited data
ExercisesΒΆ
Derive the posterior for Gaussian with unknown mean and variance
Implement grid approximation for non-conjugate posteriors
Compare Bayesian and frequentist confidence/credible intervals
Implement hierarchical Bayesian model for grouped data
Explore the effect of prior strength on posterior inference