Bayesian Thinking: Updating Beliefs with Data

Bayesian inference is how rational agents update their beliefs. This notebook builds intuition from Bayes’ theorem → Beta-Binomial conjugates → Markov Chain Monte Carlo, with practical applications in A/B testing and parameter estimation.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy import stats
from scipy.special import betaln
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

# Optional-dependency probe: use PyMC/ArviZ for real MCMC sampling when they
# are installed, otherwise fall back to scipy-based demonstrations later on.
try:
    import pymc as pm
    import arviz as az
except ImportError:
    HAS_PYMC = False
    print('PyMC not available β€” showing patterns with scipy distributions')
else:
    HAS_PYMC = True
    print('PyMC available for MCMC sampling')

1. Bayes’ Theorem — The Core Idea

# Bayes' theorem:  P(H | E) = P(E | H) * P(H) / P(E)
# i.e. posterior = likelihood × prior / evidence.
#
# Worked example: screening for a rare disease.
#   prevalence (prior)              : 1 in 1000
#   sensitivity, P(+ | disease)     : 99%
#   specificity, P(- | no disease)  : 95%, so the false-positive rate is 5%

p_disease   = 0.001
sensitivity = 0.99
specificity = 0.95
fpr = 1 - specificity

# Evidence term via the law of total probability:
# P(+) = P(+ | disease) P(disease) + P(+ | no disease) P(no disease)
p_positive_test = sensitivity * p_disease + fpr * (1 - p_disease)

# Bayes' rule: posterior probability of disease after a positive result.
p_disease_given_positive = sensitivity * p_disease / p_positive_test

print(f"""Medical Test Example (Bayes' Theorem)
{'=' * 45}
Disease prevalence:    {p_disease:.1%}
Test sensitivity:      {sensitivity:.0%}
False positive rate:   {fpr:.0%}

P(disease | positive test) = {p_disease_given_positive:.1%}

Interpretation: Even with a 99% accurate test,
a positive result only means ~2% chance of disease
because the disease is so rare!

This is why mass screening for rare conditions is tricky.""")

# Same conclusion from expected counts in a concrete population.
n_people = 100_000
have_disease   = int(n_people * p_disease)
no_disease     = n_people - have_disease
tp = int(have_disease * sensitivity)   # true positives
fp = int(no_disease * fpr)             # false positives
print(f'''
Frequency approach (out of {n_people:,} people):
  Truly have disease: {have_disease}
  True positives (disease + positive test): {tp}
  False positives (no disease + positive test): {fp}
  PPV = {tp}/{tp + fp} = {tp / (tp + fp):.1%}''')

2. The Beta Distribution — A Prior for Probabilities

# Beta(α, β) is the conjugate prior for binomial likelihoods
# α = prior successes + 1, β = prior failures + 1
# After observing k successes in n trials:
#   Posterior = Beta(α + k, β + n - k)

# Beta-Binomial demo: one figure with three panels — prior choices,
# sequential-looking belief updates, and a posterior credible interval.
x = np.linspace(0, 1, 1000)  # grid of candidate conversion rates in [0, 1]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# 1. Different priors
# Each tuple is (alpha, beta, legend label); larger alpha + beta means a
# more concentrated (stronger) prior belief.
priors = [
    (1, 1, 'Uniform: Beta(1,1) β€” no prior knowledge'),
    (3, 7, 'Skeptical: Beta(3,7) β€” expect ~30% conversion'),
    (10, 10, 'Informed: Beta(10,10) β€” expect ~50% conversion'),
]

for alpha, beta, label in priors:
    axes[0].plot(x, stats.beta.pdf(x, alpha, beta), label=label, linewidth=2)
axes[0].set_title('Different Prior Beliefs')
axes[0].set_xlabel('Conversion rate ΞΈ')
axes[0].set_ylabel('Probability density')
axes[0].legend(fontsize=7)

# 2. Bayesian updating step by step
# True conversion rate: 0.14
true_p = 0.14
# Start with uniform prior
alpha_prior, beta_prior = 1, 1

# Sample sizes at which to snapshot the posterior, one color per snapshot.
observations = [3, 10, 30, 100]
colors = ['blue', 'orange', 'green', 'red']

axes[1].plot(x, stats.beta.pdf(x, alpha_prior, beta_prior),
             'k--', label='Prior Beta(1,1)', linewidth=1.5)

# NOTE(review): these running values are immediately overwritten inside the
# loop below — each posterior is recomputed from the prior for its own n
# (not chained from the previous iteration), so this init is effectively dead.
alpha_curr, beta_curr = alpha_prior, beta_prior
for n_obs, color in zip(observations, colors):
    # Deterministic "observed" successes: k = floor(n * true rate).
    k = int(n_obs * true_p)
    # Conjugate update: Beta(a, b) + k successes of n -> Beta(a + k, b + n - k).
    alpha_curr = alpha_prior + k
    beta_curr  = beta_prior + (n_obs - k)
    axes[1].plot(x, stats.beta.pdf(x, alpha_curr, beta_curr),
                 color=color, linewidth=2, label=f'n={n_obs}: Beta({alpha_curr},{beta_curr})')

axes[1].axvline(true_p, color='black', linestyle=':', alpha=0.5, label=f'True ΞΈ={true_p}')
axes[1].set_title('Belief Update as Data Accumulates')
axes[1].set_xlabel('Conversion rate ΞΈ')
axes[1].legend(fontsize=7)

# 3. Credible interval vs confidence interval
alpha_post = alpha_prior + 14  # 14 successes
beta_post  = beta_prior + 86   # 86 failures (100 trials)
posterior = stats.beta(alpha_post, beta_post)

# Shade the central (equal-tailed) 95% of posterior mass via the ppf.
axes[2].fill_between(x, posterior.pdf(x),
                     where=(x >= posterior.ppf(0.025)) & (x <= posterior.ppf(0.975)),
                     alpha=0.4, color='blue', label='95% Credible Interval')
axes[2].plot(x, posterior.pdf(x), 'b-', linewidth=2)
axes[2].axvline(posterior.mean(), color='red', linestyle='--', label=f'Posterior mean: {posterior.mean():.3f}')
axes[2].set_title('Posterior with 95% Credible Interval\n(after 100 trials, 14 successes)')
axes[2].set_xlabel('Conversion rate ΞΈ')
axes[2].legend(fontsize=9)

plt.tight_layout()
plt.show()

# Equal-tailed 95% credible interval endpoints from the posterior quantiles.
ci_low, ci_high = posterior.ppf(0.025), posterior.ppf(0.975)
print(f'Posterior mean: {posterior.mean():.3f}')
print(f'95% Credible interval: [{ci_low:.3f}, {ci_high:.3f}]')
print(f'Interpretation: There is a 95% probability that the true rate is between {ci_low:.1%} and {ci_high:.1%}')

3. Bayesian A/B Testing — Probability of Being Best

# Bayesian A/B: instead of p-values, compute P(B > A)

def bayesian_ab_test(n_a, k_a, n_b, k_b, n_samples=100_000, random_state=None):
    """
    Compare two proportions using the Beta-Binomial conjugate model.

    Each arm gets a uniform Beta(1, 1) prior, so the posterior for an arm
    with k successes in n trials is Beta(1 + k, 1 + n - k).  Monte Carlo
    draws from both posteriors estimate P(B > A), the expected lift, and
    the expected loss of shipping B.

    Parameters
    ----------
    n_a, k_a : int
        Trials and successes for variant A (control).
    n_b, k_b : int
        Trials and successes for variant B (treatment).
    n_samples : int, optional
        Number of Monte Carlo draws per posterior (default 100,000).
    random_state : None, int, or numpy.random.Generator, optional
        Seed for reproducible draws.  None (default) preserves the original
        behavior of sampling from NumPy's global RNG.

    Returns
    -------
    dict
        'prob_b_better'             : estimated P(theta_B > theta_A)
        'expected_lift'             : mean of (theta_B - theta_A)
        'lift_95ci'                 : central 95% interval of the lift
        'expected_loss_if_choose_b' : mean shortfall when A is actually better
        'posterior_a','posterior_b' : frozen scipy Beta posteriors
    """
    # Posteriors under a uniform prior (alpha = beta = 1).
    posterior_a = stats.beta(1 + k_a, 1 + n_a - k_a)
    posterior_b = stats.beta(1 + k_b, 1 + n_b - k_b)

    # One Generator shared by both calls keeps the two sample streams
    # independent even when a seed is supplied; None keeps legacy behavior.
    rng = np.random.default_rng(random_state) if random_state is not None else None
    samples_a = posterior_a.rvs(size=n_samples, random_state=rng)
    samples_b = posterior_b.rvs(size=n_samples, random_state=rng)

    lift = samples_b - samples_a  # compute the pairwise difference once
    prob_b_better = lift > 0
    prob_b_better = prob_b_better.mean()
    expected_lift = lift.mean()
    lift_ci = np.percentile(lift, [2.5, 97.5])

    # Expected loss: average conversion-rate shortfall if we ship B in the
    # worlds where A is actually better (0 where B wins).
    loss_b = np.maximum(-lift, 0).mean()

    return {
        'prob_b_better': prob_b_better,
        'expected_lift': expected_lift,
        'lift_95ci': lift_ci,
        'expected_loss_if_choose_b': loss_b,
        'posterior_a': posterior_a,
        'posterior_b': posterior_b,
    }

# Worked example: original checkout flow (A) vs redesigned checkout (B).
result = bayesian_ab_test(
    n_a=1000, k_a=120,  # Control: 12.0%
    n_b=1000, k_b=143,  # Treatment: 14.3%
)

# Report every decision-relevant quantity in one block of output.
lift_lo, lift_hi = result['lift_95ci']
print(f'''=== Bayesian A/B Test Results ===
P(B > A):                    {result['prob_b_better']:.1%}
Expected lift:               {result['expected_lift']:+.3f} ({result['expected_lift'] / 0.12:+.0%} relative)
95% Credible interval:       [{lift_lo:+.3f}, {lift_hi:+.3f}]
Expected loss (if choose B): {result['expected_loss_if_choose_b']:.4f}

Bayesian Decision Rule (typical):
  P(B>A) > 95% β†’ Ship B
  P(B>A) > 90% β†’ Ship B with monitoring
  P(B>A) < 50% β†’ Consider stopping early''')

# Visualize posteriors
# Overlay the two posterior densities on a zoomed-in conversion-rate range.
fig, ax = plt.subplots(figsize=(10, 5))
x = np.linspace(0.05, 0.25, 1000)  # NOTE: rebinds the earlier [0, 1] grid `x`
# Filled areas show each posterior's mass; solid lines trace the densities.
ax.fill_between(x, result['posterior_a'].pdf(x), alpha=0.4, color='blue', label='Control A')
ax.fill_between(x, result['posterior_b'].pdf(x), alpha=0.4, color='red', label='Treatment B')
ax.plot(x, result['posterior_a'].pdf(x), 'b-', linewidth=2)
ax.plot(x, result['posterior_b'].pdf(x), 'r-', linewidth=2)
ax.set_xlabel('Conversion rate ΞΈ')
ax.set_ylabel('Probability density')
ax.set_title(f'Posterior Distributions β€” P(B>A) = {result["prob_b_better"]:.1%}')
ax.legend()
plt.tight_layout()
plt.show()

4. MCMC with PyMC — Complex Models

# When no conjugate prior exists, sample the posterior with MCMC.
# Demo problem: Bayesian linear regression with uncertainty on every parameter.

np.random.seed(42)
n = 50
x_data = np.linspace(0, 10, n)
# Generating process: y = 2.5 * x + 5 plus Gaussian noise (sd = 3).
noise = np.random.normal(0, 3, n)
y_data = 2.5 * x_data + 5 + noise

if HAS_PYMC:
    with pm.Model() as linear_model:
        # Weakly informative priors.
        alpha = pm.Normal('alpha', mu=0, sigma=10)   # intercept
        beta  = pm.Normal('beta',  mu=0, sigma=10)   # slope
        sigma = pm.HalfNormal('sigma', sigma=5)      # noise scale (> 0 by construction)

        # Likelihood: y ~ Normal(alpha + beta * x, sigma).
        mu = alpha + beta * x_data
        likelihood = pm.Normal('y', mu=mu, sigma=sigma, observed=y_data)

        # Draw 1000 kept samples after 1000 tuning steps per chain.
        trace = pm.sample(1000, tune=1000, progressbar=False, return_inferencedata=True)

    az.plot_posterior(trace, var_names=['alpha', 'beta', 'sigma'])
    plt.tight_layout()
    plt.show()

    summary = az.summary(trace, var_names=['alpha', 'beta', 'sigma'])
    print(summary)
else:
    # Hand-rolled Metropolis-Hastings to build intuition when PyMC is absent.
    def log_posterior(alpha, beta, sigma, x, y):
        """Unnormalized log posterior of y = alpha + beta*x + Normal(0, sigma) noise."""
        if sigma <= 0:
            return -np.inf  # outside HalfNormal support: reject immediately
        # Priors (match the PyMC model above).
        lp = stats.norm.logpdf(alpha, 0, 10)
        lp += stats.norm.logpdf(beta, 0, 10)
        lp += stats.halfnorm.logpdf(sigma, scale=5)
        # Likelihood
        predicted = alpha + beta * x
        lp += stats.norm.logpdf(y, predicted, sigma).sum()
        return lp

    # Metropolis-Hastings sampler with a Gaussian random-walk proposal.
    n_samples = 5000
    samples = np.zeros((n_samples, 3))  # columns: [alpha, beta, sigma]
    current = np.array([3.0, 2.0, 2.0])
    step_sizes = np.array([0.5, 0.1, 0.2])

    # Cache the current log-posterior so each iteration evaluates it once
    # instead of twice (same accept/reject decisions, half the work).
    current_lp = log_posterior(*current, x_data, y_data)
    for i in range(n_samples):
        proposal = current + np.random.normal(0, step_sizes)
        proposal_lp = log_posterior(*proposal, x_data, y_data)
        # Accept with probability min(1, posterior ratio), in log space.
        if np.log(np.random.uniform()) < proposal_lp - current_lp:
            current, current_lp = proposal, proposal_lp
        samples[i] = current

    # Discard burn-in draws taken before the chain settled.
    samples_burned = samples[1000:]

    fig, axes = plt.subplots(1, 3, figsize=(14, 4))
    # Fixed: this list previously closed with ')' instead of ']' (SyntaxError).
    names = ['alpha (intercept)', 'beta (slope)', 'sigma (noise)']
    true_vals = [5, 2.5, 3]

    for ax, samp, name, true_val in zip(axes, samples_burned.T, names, true_vals):
        ax.hist(samp, bins=50, density=True, alpha=0.7, color='steelblue')
        ax.axvline(samp.mean(), color='red', linestyle='--', label=f'Mean: {samp.mean():.2f}')
        ax.axvline(true_val, color='green', linestyle=':', label=f'True: {true_val}')
        ax.set_title(name)
        ax.legend(fontsize=8)

    plt.suptitle('MCMC Posterior Distributions (Metropolis-Hastings)', y=1.02)
    plt.tight_layout()
    plt.show()

    print('Posterior summaries (mean ± std):')
    print(f'  alpha: {samples_burned[:,0].mean():.2f} ± {samples_burned[:,0].std():.2f} (true: 5.0)')
    print(f'  beta:  {samples_burned[:,1].mean():.2f} ± {samples_burned[:,1].std():.2f} (true: 2.5)')
    print(f'  sigma: {samples_burned[:,2].mean():.2f} ± {samples_burned[:,2].std():.2f} (true: 3.0)')

Bayesian Thinking Cheat Sheet

Concept                Key Insight
────────────────────────────────────────────────────────────────
Prior                  Your belief BEFORE seeing data
Likelihood             How probable is this data given a hypothesis?
Posterior              Updated belief AFTER seeing data
Conjugate prior        Prior that yields same-family posterior
                         → Beta + Binomial = Beta posterior
                         → Normal + Normal = Normal posterior
Credible interval      P(θ in CI) = 95% — direct probability statement
Confidence interval    If repeated, 95% of CIs would contain true value
MCMC                   Sample from posterior when no closed form exists

Frequentist vs Bayesian A/B:
  Frequentist: p-value < 0.05 → reject null
    Pro: Controlled error rates  Con: Hard to interpret, no early stopping
  Bayesian: P(B>A) > 95% → choose B
    Pro: Intuitive, safe early stopping  Con: Prior choice matters

Exercises

  1. A coin comes up heads 7 times in 10 flips. Starting from a Beta(1,1) prior, compute the posterior and 95% credible interval for the true probability.

  2. Compare Bayesian A/B test results with a frequentist z-test on the same data — when do they disagree?

  3. Implement a Multi-Armed Bandit using Thompson Sampling (sample from Beta posterior, pick arm with highest sample).

  4. Use PyMC to fit a Poisson regression model for count data (e.g., predicting support ticket volume).

  5. Show how the choice of prior affects posterior when n=5 vs n=500 β€” verify that with enough data, the prior washes out.