import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.special import gammaln, logsumexp
import pandas as pd
# Notebook-wide plotting style and a fixed RNG seed so all simulated
# data and figures below are reproducible run-to-run.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
np.random.seed(42)
1. Bayesian Inference Fundamentals
Bayes' Theorem for Parameters
\[ p(\theta|\mathcal{D}) = \frac{p(\mathcal{D}|\theta)\,p(\theta)}{p(\mathcal{D})} \]
Where:
\(p(\theta|\mathcal{D})\) is the posterior: our belief after seeing data
\(p(\mathcal{D}|\theta)\) is the likelihood: probability of data given parameters
\(p(\theta)\) is the prior: our belief before seeing data
\(p(\mathcal{D})\) is the evidence (marginal likelihood)
# Coin flipping with Bayesian inference.
# The Beta distribution is the conjugate prior for the Bernoulli likelihood,
# so the posterior is available in closed form.
def plot_beta_binomial(alpha_prior, beta_prior, n_heads, n_tails, title=""):
    """Visualize Bayesian updating for the Beta-Binomial model.

    Draws the Beta prior, the (rescaled) Binomial likelihood, and the
    resulting Beta posterior on a single axis, marking the posterior mean
    and, when any flips were observed, the maximum-likelihood estimate.

    Returns the posterior parameters ``(alpha_post, beta_post)``.
    """
    grid = np.linspace(0, 1, 1000)

    # Conjugate update: Beta(a, b) prior + (h heads, t tails) data
    # gives a Beta(a + h, b + t) posterior.
    alpha_post = alpha_prior + n_heads
    beta_post = beta_prior + n_tails

    prior_pdf = stats.beta(alpha_prior, beta_prior).pdf(grid)
    posterior_pdf = stats.beta(alpha_post, beta_post).pdf(grid)

    # The likelihood is only defined up to a constant; scale its peak to 1
    # so it is visible on the same axis as the densities.
    lik = grid**n_heads * (1 - grid)**n_tails
    lik = lik / np.max(lik)

    plt.figure(figsize=(12, 5))
    plt.plot(grid, prior_pdf, 'b-', linewidth=2,
             label=f'Prior: Beta({alpha_prior},{beta_prior})')
    plt.plot(grid, lik, 'g--', linewidth=2,
             label=f'Likelihood: {n_heads}H, {n_tails}T (scaled)')
    plt.plot(grid, posterior_pdf, 'r-', linewidth=2,
             label=f'Posterior: Beta({alpha_post},{beta_post})')

    post_mean = alpha_post / (alpha_post + beta_post)
    plt.axvline(post_mean, color='r', linestyle=':',
                label=f'Posterior mean: {post_mean:.3f}')
    if n_heads + n_tails > 0:
        mle = n_heads / (n_heads + n_tails)
        plt.axvline(mle, color='g', linestyle=':', label=f'MLE: {mle:.3f}')

    plt.xlabel('ΞΈ (probability of heads)')
    plt.ylabel('Density')
    plt.title(title if title else 'Bayesian Inference for Coin Flipping')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    return alpha_post, beta_post
# Run the same data (7 heads, 3 tails) through three different prior beliefs
# to show how prior strength shifts the posterior.
scenarios = [
    ("Scenario 1: Uniform prior (no prior knowledge)", 1, 1, "Uniform Prior: Beta(1,1)"),
    ("Scenario 2: Informative prior (believe coin is fair)", 10, 10, "Informative Prior: Beta(10,10)"),
    ("Scenario 3: Skeptical prior (believe coin favors tails)", 3, 7, "Skeptical Prior: Beta(3,7)"),
]
for k, (heading, a0, b0, fig_title) in enumerate(scenarios):
    banner = "=" * 60
    # The first banner has no leading blank line; later ones do.
    print(banner if k == 0 else "\n" + banner)
    print(heading)
    print(banner)
    plot_beta_binomial(a0, b0, 7, 3, fig_title)
Sequential Bayesian UpdateΒΆ
The posterior from one update becomes the prior for the next:
# Sequential updating demonstration: process coin flips one at a time,
# using each posterior as the prior for the next observation.
# Starting prior Beta(2, 2): weakly centered on a fair coin.
alpha = 2
beta = 2
# Simulate coin flips from a biased coin.
true_theta = 0.7
flips = np.random.binomial(1, true_theta, 100)
# Track the full posterior trajectory (one entry per observation).
alphas = [alpha]
betas = [beta]
theta_range = np.linspace(0, 1, 1000)
snapshots = [0, 5, 20, 100]  # Save posterior densities at these flip counts
posteriors = {}
for i, flip in enumerate(flips):
    # Conjugate update: a head increments alpha, a tail increments beta.
    if flip == 1:
        alpha += 1
    else:
        beta += 1
    alphas.append(alpha)
    betas.append(beta)
    if i+1 in snapshots:
        posteriors[i+1] = stats.beta(alpha, beta).pdf(theta_range)
# Plot the posterior at each snapshot (the 0-flip panel is the prior itself).
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for idx, n_flips in enumerate(snapshots):
    ax = axes[idx // 2, idx % 2]
    if n_flips == 0:
        # Before any data the "posterior" is just the prior density.
        posterior = stats.beta(alphas[0], betas[0]).pdf(theta_range)
        n_heads = 0
        n_tails = 0
    else:
        posterior = posteriors[n_flips]
        n_heads = np.sum(flips[:n_flips])
        n_tails = n_flips - n_heads
    ax.plot(theta_range, posterior, 'b-', linewidth=2)
    ax.fill_between(theta_range, posterior, alpha=0.3)
    ax.axvline(true_theta, color='r', linestyle='--', linewidth=2, label=f'True ΞΈ = {true_theta}')
    # Posterior mean of a Beta(a, b) is a / (a + b).
    ax.axvline(alphas[n_flips]/(alphas[n_flips]+betas[n_flips]), color='g',
               linestyle=':', linewidth=2, label=f'Posterior mean')
    ax.set_xlabel('ΞΈ')
    ax.set_ylabel('Density')
    ax.set_title(f'After {n_flips} flips: {n_heads}H, {n_tails}T\nBeta({alphas[n_flips]},{betas[n_flips]})')
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Plot how the posterior mean converges toward the true theta as data accrue.
posterior_means = [a/(a+b) for a, b in zip(alphas, betas)]
plt.figure(figsize=(12, 5))
plt.plot(posterior_means, 'b-', linewidth=2, label='Posterior mean')
plt.axhline(true_theta, color='r', linestyle='--', linewidth=2, label=f'True ΞΈ = {true_theta}')
plt.xlabel('Number of observations')
plt.ylabel('Posterior mean of ΞΈ')
plt.title('Sequential Bayesian Learning')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print(f"Final posterior: Beta({alphas[-1]}, {betas[-1]})")
print(f"Posterior mean: {posterior_means[-1]:.4f}")
print(f"True value: {true_theta}")
print(f"MLE: {np.sum(flips)/len(flips):.4f}")
2. Conjugate PriorsΒΆ
A prior is conjugate to a likelihood if the posterior has the same functional form as the prior.
Common Conjugate PairsΒΆ
Likelihood |
Conjugate Prior |
Posterior |
|---|---|---|
Bernoulli/Binomial |
Beta |
Beta |
Multinomial |
Dirichlet |
Dirichlet |
Poisson |
Gamma |
Gamma |
Gaussian (known σ²) |
Gaussian |
Gaussian |
Gaussian (known μ) |
Inverse-Gamma |
Inverse-Gamma |
# Example 1: Poisson-Gamma conjugacy.
# Model event counts with a Poisson likelihood; the Gamma prior on the rate
# is conjugate, so the posterior is again Gamma with simple updated parameters.
# True rate parameter used to simulate the data.
lambda_true = 5
# Generate 50 observed counts.
data = np.random.poisson(lambda_true, 50)
# Prior: Gamma(alpha, beta) in the *rate* parameterization (mean = alpha/beta).
alpha_prior = 2
beta_prior = 0.5
# Conjugate update: posterior is Gamma(alpha + sum(x_i), beta + n).
alpha_post = alpha_prior + np.sum(data)
beta_post = beta_prior + len(data)
# Plot prior vs posterior over a grid of candidate rates.
# NOTE: scipy's gamma takes a *scale* parameter, hence scale = 1/rate.
lambda_range = np.linspace(0, 15, 1000)
prior = stats.gamma(alpha_prior, scale=1/beta_prior).pdf(lambda_range)
posterior = stats.gamma(alpha_post, scale=1/beta_post).pdf(lambda_range)
plt.figure(figsize=(12, 5))
plt.plot(lambda_range, prior, 'b-', linewidth=2,
         label=f'Prior: Gamma({alpha_prior}, {beta_prior})')
plt.plot(lambda_range, posterior, 'r-', linewidth=2,
         label=f'Posterior: Gamma({alpha_post:.0f}, {beta_post:.0f})')
plt.axvline(lambda_true, color='g', linestyle='--', linewidth=2, label=f'True Ξ» = {lambda_true}')
plt.axvline(alpha_post/beta_post, color='r', linestyle=':', linewidth=2,
            label=f'Posterior mean = {alpha_post/beta_post:.3f}')
plt.axvline(np.mean(data), color='orange', linestyle=':', linewidth=2,
            label=f'MLE = {np.mean(data):.3f}')
plt.xlabel('Ξ» (rate parameter)')
plt.ylabel('Density')
plt.title(f'Poisson-Gamma Conjugacy (n={len(data)} observations)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Summaries: a Gamma(a, b) (rate form) has mean a/b and std sqrt(a)/b.
print(f"Data summary: mean={np.mean(data):.2f}, std={np.std(data):.2f}")
print(f"Prior mean: {alpha_prior/beta_prior:.2f}")
print(f"Posterior mean: {alpha_post/beta_post:.2f}")
print(f"Posterior std: {np.sqrt(alpha_post)/(beta_post):.2f}")
# Example 2: Gaussian-Gaussian conjugacy (known variance).
# Infer the mean of a Gaussian whose standard deviation is known; a Gaussian
# prior on the mean is conjugate, and the posterior mean is a
# precision-weighted average of prior mean and sample mean.
# True parameters of the data-generating process.
mu_true = 5
sigma_known = 2
# Generate 30 observations.
data_gaussian = np.random.normal(mu_true, sigma_known, 30)
# Prior on the mean: N(mu0, sigma0^2) — wide, weakly informative.
mu0 = 0
sigma0 = 5
# Posterior on the mean: precisions (inverse variances) simply add.
n = len(data_gaussian)
precision0 = 1 / sigma0**2            # prior precision
precision_data = n / sigma_known**2   # total precision contributed by the data
precision_post = precision0 + precision_data
# Posterior mean = precision-weighted average of prior mean and sample mean.
mu_post = (precision0 * mu0 + precision_data * np.mean(data_gaussian)) / precision_post
sigma_post = np.sqrt(1 / precision_post)
# Plot prior, likelihood (as a function of mu), and posterior.
mu_range = np.linspace(-5, 15, 1000)
prior_gauss = stats.norm(mu0, sigma0).pdf(mu_range)
# As a function of mu, the likelihood of the sample mean has std sigma/sqrt(n).
likelihood_gauss = stats.norm(np.mean(data_gaussian), sigma_known/np.sqrt(n)).pdf(mu_range)
posterior_gauss = stats.norm(mu_post, sigma_post).pdf(mu_range)
plt.figure(figsize=(12, 5))
plt.plot(mu_range, prior_gauss, 'b-', linewidth=2, label=f'Prior: N({mu0}, {sigma0}Β²)')
plt.plot(mu_range, likelihood_gauss, 'g--', linewidth=2, label='Likelihood (scaled)')
plt.plot(mu_range, posterior_gauss, 'r-', linewidth=2,
         label=f'Posterior: N({mu_post:.2f}, {sigma_post:.2f}Β²)')
plt.axvline(mu_true, color='black', linestyle='--', linewidth=2, label=f'True ΞΌ = {mu_true}')
plt.xlabel('ΞΌ')
plt.ylabel('Density')
plt.title(f'Gaussian-Gaussian Conjugacy (Ο={sigma_known} known, n={n})')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# 95% credible interval for a Gaussian posterior: mean +/- 1.96 * sd.
ci_lower = mu_post - 1.96 * sigma_post
ci_upper = mu_post + 1.96 * sigma_post
print(f"\nPosterior 95% credible interval: [{ci_lower:.3f}, {ci_upper:.3f}]")
print(f"True ΞΌ in credible interval: {ci_lower <= mu_true <= ci_upper}")
3. Posterior Predictive Distribution
Prediction for new data, marginalizing over parameter uncertainty:
\[ p(\tilde{x}|\mathcal{D}) = \int p(\tilde{x}|\theta)\, p(\theta|\mathcal{D})\, d\theta \]
This accounts for both data uncertainty and parameter uncertainty.
# Beta-Binomial posterior predictive: probability that the next flip is heads.
def beta_binomial_predictive(alpha, beta):
    """Posterior predictive probability of heads.

    For a Beta(alpha, beta) posterior over the heads probability, the
    predictive probability that the next flip lands heads equals the
    posterior mean, alpha / (alpha + beta).
    """
    total = alpha + beta
    return alpha / total
# Scenario: observed 7 heads, 3 tails.
n_heads = 7
n_tails = 3
# Candidate prior beliefs: (label, prior alpha, prior beta).
priors = [
    ("Uniform", 1, 1),
    ("Weak", 2, 2),
    ("Strong Fair", 10, 10),
    ("Strong Biased", 3, 7)
]
# Prior-free maximum-likelihood estimate for comparison.
mle = n_heads / (n_heads + n_tails)
# Tabulate the posterior predictive P(next = heads) under each prior.
rule = "=" * 70
print("Posterior Predictive Probabilities:")
print(rule)
print(f"{'Prior':<20} {'P(next=H|data)':<20} {'MLE':<20}")
print(rule)
for label, a0, b0 in priors:
    prob_heads = beta_binomial_predictive(a0 + n_heads, b0 + n_tails)
    print(f"{label:<20} {prob_heads:<20.4f} {mle:<20.4f}")
# Visualize the posterior over theta under each prior, one panel per prior.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for idx, (name, alpha_pr, beta_pr) in enumerate(priors):
    ax = axes[idx // 2, idx % 2]
    # Conjugate posterior parameters for this prior.
    alpha_po = alpha_pr + n_heads
    beta_po = beta_pr + n_tails
    # Sample parameter values from the posterior.
    theta_samples = np.random.beta(alpha_po, beta_po, 10000)
    # Simulate one predictive flip per sampled theta.
    # NOTE(review): `predictions` is computed but never used below — the
    # panel titles report the analytic predictive mean instead; confirm intent.
    predictions = np.random.binomial(1, theta_samples)
    # Plot the posterior density with the MLE and posterior mean marked.
    theta_range = np.linspace(0, 1, 1000)
    posterior = stats.beta(alpha_po, beta_po).pdf(theta_range)
    ax.plot(theta_range, posterior, 'b-', linewidth=2)
    ax.fill_between(theta_range, posterior, alpha=0.3)
    ax.axvline(mle, color='g', linestyle='--', linewidth=2, label=f'MLE = {mle:.3f}')
    ax.axvline(alpha_po/(alpha_po+beta_po), color='r', linestyle=':', linewidth=2,
               label=f'Posterior mean = {alpha_po/(alpha_po+beta_po):.3f}')
    ax.set_xlabel('ΞΈ')
    ax.set_ylabel('Density')
    ax.set_title(f'{name} Prior\nP(next=H|data) = {alpha_po/(alpha_po+beta_po):.3f}')
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
4. Bayesian Decision TheoryΒΆ
Loss Functions and RiskΒΆ
Expected loss (risk) for action \(a\) given data:
Optimal Bayes action: \(a^* = \arg\min_a R(a|\mathcal{D})\)
Common Loss FunctionsΒΆ
0-1 loss: \(L(\theta, a) = \mathbb{I}(\theta \neq a)\) β Optimal: posterior mode
Squared loss: \(L(\theta, a) = (\theta - a)^2\) β Optimal: posterior mean
Absolute loss: \(L(\theta, a) = |\theta - a|\) β Optimal: posterior median
# Bayesian point estimation under different loss functions.
# Build a skewed posterior as a two-component Beta mixture so that the
# mode, mean, and median visibly differ.
alpha1, beta1 = 5, 2
alpha2, beta2 = 3, 5
weight1 = 0.7  # mixture weight of the first component
# Sample from the mixture by drawing proportionally from each component.
samples1 = np.random.beta(alpha1, beta1, int(10000*weight1))
samples2 = np.random.beta(alpha2, beta2, int(10000*(1-weight1)))
posterior_samples = np.concatenate([samples1, samples2])
# Optimal point estimates under the three classical losses.
# NOTE(review): the mode is hard-coded as an eyeballed approximation rather
# than estimated from the samples (e.g. via the KDE argmax) — confirm intent.
posterior_mode = 0.7 # Approximate mode
posterior_mean = np.mean(posterior_samples)      # minimizes squared loss
posterior_median = np.median(posterior_samples)  # minimizes absolute loss
# Plot the posterior density with all three estimates marked.
theta_range = np.linspace(0, 1, 1000)
# Estimate a smooth PDF from the samples with a kernel density estimate.
from scipy.stats import gaussian_kde
kde = gaussian_kde(posterior_samples)
posterior_pdf = kde(theta_range)
plt.figure(figsize=(12, 6))
plt.plot(theta_range, posterior_pdf, 'b-', linewidth=2, label='Posterior')
plt.fill_between(theta_range, posterior_pdf, alpha=0.3)
plt.axvline(posterior_mode, color='r', linestyle='--', linewidth=2,
            label=f'Mode (0-1 loss) β {posterior_mode:.3f}')
plt.axvline(posterior_mean, color='g', linestyle='--', linewidth=2,
            label=f'Mean (squared loss) = {posterior_mean:.3f}')
plt.axvline(posterior_median, color='orange', linestyle='--', linewidth=2,
            label=f'Median (absolute loss) = {posterior_median:.3f}')
plt.xlabel('ΞΈ')
plt.ylabel('Density')
plt.title('Optimal Estimates Under Different Loss Functions')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print("Optimal point estimates:")
print(f" 0-1 loss (minimize misclassification): Mode β {posterior_mode:.4f}")
print(f" Squared loss (minimize MSE): Mean = {posterior_mean:.4f}")
print(f" Absolute loss (minimize MAE): Median = {posterior_median:.4f}")
Classification with Asymmetric Costs
What: This code applies Bayesian decision theory to medical diagnosis, where the cost of a false negative (missing a disease) vastly exceeds a false positive (unnecessary treatment). It computes expected losses under each action using the posterior \(p(\text{disease}|+)\) and a user-defined loss matrix, then selects the action minimizing expected risk.
Why: Real-world classification rarely has symmetric costs. In medical screening, fraud detection, and safety systems, the consequences of different error types differ by orders of magnitude. The Bayes-optimal decision threshold is not 0.5 but rather \(\tau^* = \frac{C_{FP}}{C_{FP} + C_{FN}}\), shifting toward the cheaper error. Even though \(p(\text{disease}|+) \approx 0.17\) is below 0.5, the high cost ratio (\(C_{FN}/C_{FP} = 10\)) can still make treatment the optimal action.
Connection: Asymmetric cost-sensitive classification is critical in production ML systems: autonomous vehicles weight pedestrian detection errors heavily, credit scoring penalizes wrongful denials differently from defaults, and cybersecurity intrusion detection systems tune thresholds based on operational costs of missed attacks versus false alarms.
# Medical diagnosis with asymmetric costs: a false negative (missed disease)
# is far more expensive than a false positive (unnecessary treatment).

# Test characteristics and disease prevalence.
p_disease = 0.01     # Base rate
sensitivity = 0.99   # P(+|disease)
specificity = 0.95   # P(-|no disease)

# Patient tests positive.
p_positive_given_disease = sensitivity
p_positive_given_no_disease = 1 - specificity

# Bayes' rule: P(disease|+) = P(+|disease) P(disease) / P(+),
# with P(+) obtained by marginalizing over both disease states.
p_positive = (p_positive_given_disease * p_disease +
              p_positive_given_no_disease * (1 - p_disease))
p_disease_given_positive = (p_positive_given_disease * p_disease) / p_positive
p_no_disease_given_positive = 1 - p_disease_given_positive

print(f"Posterior probabilities given positive test:")
print(f" P(disease|+) = {p_disease_given_positive:.4f}")
print(f" P(no disease|+) = {p_no_disease_given_positive:.4f}")

# Loss matrix (rows: action, columns: true state):
#                Disease   No Disease
#   Treat           0          10    (false positive cost)
#   No Treat       100          0    (false negative cost)
cost_FP = 10   # Cost of unnecessary treatment
cost_FN = 100  # Cost of missing disease

# Posterior expected loss of each action.
risk_treat = 0 * p_disease_given_positive + cost_FP * p_no_disease_given_positive
risk_no_treat = cost_FN * p_disease_given_positive + 0 * p_no_disease_given_positive

print(f"\nExpected losses:")
print(f" Treat: {risk_treat:.4f}")
print(f" Don't treat: {risk_no_treat:.4f}")
print(f"\nOptimal action: {'Treat' if risk_treat < risk_no_treat else 'Do not treat'}")

# Sweep the FN/FP cost ratio and plot where the optimal treat/no-treat
# threshold on P(disease|+) lands. Losses are equal when
#   cost_FP * (1 - p) = cost_FN * p  =>  p* = cost_FP / (cost_FP + cost_FN).
cost_ratios = np.logspace(-2, 2, 100)  # FN/FP cost ratio
decision_thresholds = [cost_FP / (cost_FP + cost_FP * ratio) for ratio in cost_ratios]

plt.figure(figsize=(12, 5))
plt.semilogx(cost_ratios, decision_thresholds, 'b-', linewidth=2)
plt.axhline(p_disease_given_positive, color='r', linestyle='--', linewidth=2,
            label=f'P(disease|+) = {p_disease_given_positive:.4f}')
plt.xlabel('Cost Ratio (False Negative / False Positive)')
plt.ylabel('Decision Threshold')
plt.title('Optimal Decision Threshold vs Cost Ratio')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"\nWith cost ratio of {cost_FN/cost_FP:.1f}:")
print(f"Decision threshold: {cost_FP/(cost_FP+cost_FN):.4f}")
print(f"Since P(disease|+) = {p_disease_given_positive:.4f} > threshold, we should treat.")
5. Empirical BayesΒΆ
Idea: Use data to estimate hyperparameters of the prior
Also called Type-II Maximum Likelihood or Evidence Maximization
# Empirical Bayes for many parallel Binomial experiments
# (e.g. batting averages for baseball players): estimate the Beta prior's
# hyperparameters from the pooled data, then shrink each player's estimate
# toward the estimated prior mean.
# Ground-truth hierarchical model: each ability theta ~ Beta(20, 70).
alpha_true = 20
beta_true = 70
n_players = 50
n_trials = 100 # At-bats per player
# Generate true abilities and observed success counts.
true_abilities = np.random.beta(alpha_true, beta_true, n_players)
successes = np.random.binomial(n_trials, true_abilities)
# MLE estimates (no shrinkage): each player's raw success rate.
mle_estimates = successes / n_trials
# Empirical Bayes: fit the Beta hyperparameters to the observed rates
# by the method of moments.
sample_mean = np.mean(successes / n_trials)
sample_var = np.var(successes / n_trials)
# Method-of-moments estimators, inverting
#   E[theta]   = alpha / (alpha + beta)
#   Var[theta] = alpha * beta / ((alpha + beta)^2 (alpha + beta + 1))
# NOTE(review): sample_var also contains Binomial sampling noise, so this
# slightly overstates the prior variance (understates alpha + beta).
alpha_eb = sample_mean * (sample_mean * (1 - sample_mean) / sample_var - 1)
beta_eb = (1 - sample_mean) * (sample_mean * (1 - sample_mean) / sample_var - 1)
# Empirical Bayes posterior means: shrunk toward the estimated prior mean.
eb_estimates = (alpha_eb + successes) / (alpha_eb + beta_eb + n_trials)
print(f"True hyperparameters: Ξ±={alpha_true}, Ξ²={beta_true}")
print(f"Estimated hyperparameters: Ξ±={alpha_eb:.2f}, Ξ²={beta_eb:.2f}")
print(f"True prior mean: {alpha_true/(alpha_true+beta_true):.4f}")
print(f"Estimated prior mean: {alpha_eb/(alpha_eb+beta_eb):.4f}")
# Compare estimation error against the simulated ground truth.
mle_error = np.mean((mle_estimates - true_abilities)**2)
eb_error = np.mean((eb_estimates - true_abilities)**2)
print(f"\nMean Squared Error:")
print(f" MLE: {mle_error:.6f}")
print(f" Empirical Bayes: {eb_error:.6f}")
print(f" Improvement: {100*(mle_error-eb_error)/mle_error:.1f}%")
# Visualize shrinkage.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Left panel: both estimators against the true abilities.
axes[0].scatter(mle_estimates, true_abilities, alpha=0.6, s=50, label='MLE')
axes[0].scatter(eb_estimates, true_abilities, alpha=0.6, s=50, label='Empirical Bayes')
axes[0].plot([0, 0.5], [0, 0.5], 'k--', alpha=0.5, label='Perfect prediction')
axes[0].set_xlabel('Estimated Ability')
axes[0].set_ylabel('True Ability')
axes[0].set_title('Estimates vs True Abilities')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].axis('equal')
# Right panel: EB vs MLE shows compression toward the prior mean.
axes[1].scatter(mle_estimates, eb_estimates, alpha=0.6, s=50)
axes[1].plot([0, 0.5], [0, 0.5], 'k--', alpha=0.5, label='No shrinkage')
axes[1].axhline(alpha_eb/(alpha_eb+beta_eb), color='r', linestyle=':',
                label=f'Prior mean = {alpha_eb/(alpha_eb+beta_eb):.3f}')
axes[1].set_xlabel('MLE Estimate')
axes[1].set_ylabel('Empirical Bayes Estimate')
axes[1].set_title('Shrinkage Effect')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
axes[1].axis('equal')
plt.tight_layout()
plt.show()
# List the five players whose raw rates sit farthest from the pooled mean —
# these experience the largest shrinkage.
extreme_idx = np.argsort(np.abs(mle_estimates - sample_mean))[-5:]
print("\nExamples of shrinkage (most extreme):")
print(f"{'True':<8} {'Successes':<10} {'MLE':<8} {'EB':<8} {'Shrinkage':<10}")
for idx in extreme_idx:
    shrinkage = mle_estimates[idx] - eb_estimates[idx]
    print(f"{true_abilities[idx]:<8.3f} {successes[idx]:<10} {mle_estimates[idx]:<8.3f} "
          f"{eb_estimates[idx]:<8.3f} {shrinkage:<10.3f}")
SummaryΒΆ
In this notebook, we covered:
Bayesian Inference: Sequential updating and parameter learning
Conjugate Priors: Convenient analytical solutions for common models
Posterior Predictive: Accounting for parameter uncertainty in predictions
Bayesian Decision Theory: Optimal actions under different loss functions
Empirical Bayes: Data-driven prior specification
Key TakeawaysΒΆ
Bayesian methods provide a principled framework for incorporating prior knowledge
Conjugate priors enable analytical posterior computation
The posterior predictive distribution accounts for parameter uncertainty
Different loss functions lead to different optimal point estimates
Empirical Bayes offers a compromise between full Bayes and frequentist methods
Shrinkage towards the prior mean can improve estimates, especially with limited data
ExercisesΒΆ
Derive the posterior for Gaussian with unknown mean and variance
Implement grid approximation for non-conjugate posteriors
Compare Bayesian and frequentist confidence/credible intervals
Implement hierarchical Bayesian model for grouped data
Explore the effect of prior strength on posterior inference