Hypothesis Testing: The Statistics Behind A/B Tests and Decisions
Every product decision backed by data needs a test. This notebook builds intuition for p-values, statistical power, and multiple testing, with practical A/B test analysis patterns you can apply immediately.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from itertools import combinations
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
# Synthetic A/B test data: website checkout conversion
# Control: original checkout flow | Treatment: simplified 1-page checkout
n_control = 1000
n_treatment = 1000
p_control = 0.12 # 12% baseline conversion
p_treatment = 0.14 # 14% target (absolute lift of +2pp)
control_conversions = np.random.binomial(1, p_control, n_control)
treatment_conversions = np.random.binomial(1, p_treatment, n_treatment)
print(f'Control: {control_conversions.sum()}/{n_control} = {control_conversions.mean():.2%} conversion')
print(f'Treatment: {treatment_conversions.sum()}/{n_treatment} = {treatment_conversions.mean():.2%} conversion')
print(f'Observed lift: {treatment_conversions.mean() - control_conversions.mean():+.2%}')
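Before any formal test, it helps to eyeball the sampling noise. A quick back-of-the-envelope check, using the true design parameters here (in practice you would plug in pilot estimates):
# SE of a single proportion is sqrt(p(1-p)/n); at n=1000 that is about 1pp,
# so a +2pp true lift carries only ~1-2 standard errors of signal.
se_control = np.sqrt(p_control * (1 - p_control) / n_control)
se_diff = np.sqrt(p_control * (1 - p_control) / n_control
                  + p_treatment * (1 - p_treatment) / n_treatment)
print(f'SE of one group rate: {se_control:.2%}')
print(f'SE of the difference: {se_diff:.2%}')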
1. Building Intuition: What Is a p-value?
# p-value = P(observing this result, or more extreme, IF null hypothesis is true)
# Null hypothesis (H0): p_control == p_treatment (no effect)
# Alt hypothesis (H1): p_treatment > p_control (treatment is better)
# Permutation test: shuffle group labels to simulate the null distribution
combined = np.concatenate([control_conversions, treatment_conversions])
observed_diff = treatment_conversions.mean() - control_conversions.mean()
n_simulations = 10000
null_diffs = []
n_c = len(control_conversions)
for _ in range(n_simulations):
    shuffled = np.random.permutation(combined)
    null_diff = shuffled[n_c:].mean() - shuffled[:n_c].mean()
    null_diffs.append(null_diff)
null_diffs = np.array(null_diffs)
permutation_pval = (null_diffs >= observed_diff).mean()
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(null_diffs, bins=60, alpha=0.7, color='steelblue', label='Null distribution (shuffled)')
ax.axvline(observed_diff, color='red', linewidth=2, label=f'Observed diff = {observed_diff:+.3f}')
ax.axvspan(observed_diff, null_diffs.max(),
           alpha=0.3, color='red', label=f'p-value area = {permutation_pval:.4f}')
ax.set_xlabel('Difference in conversion rates (treatment - control)')
ax.set_ylabel('Count')
ax.set_title('Permutation Test: Null Distribution')
ax.legend()
plt.tight_layout()
plt.show()
print(f'Permutation p-value: {permutation_pval:.4f}')
print(f'Interpretation: if the null were true, we would see a difference at least this large {permutation_pval:.1%} of the time.')
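As a cross-check, Fisher's exact test on the 2x2 conversion table should land close to the permutation p-value. A minimal sketch (scipy's fisher_exact conditions on the table margins):
table = [
    [treatment_conversions.sum(), n_treatment - treatment_conversions.sum()],
    [control_conversions.sum(), n_control - control_conversions.sum()],
]
odds_ratio, fisher_pval = stats.fisher_exact(table, alternative='greater')
print(f"Fisher's exact p-value: {fisher_pval:.4f} (permutation: {permutation_pval:.4f})")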
2. Statistical Tests: Choosing the Right One
# Test selection guide
print('Statistical Test Selection Guide')
print('=' * 50)
print()
print('Comparing two groups:')
print(' Metric type  | Groups    | Test')
print(' ' + '-' * 60)
print(' Proportions  | 2 groups  | z-test / chi-square')
print(' Continuous   | 2 groups  | t-test (normal) / Mann-Whitney U (non-normal)')
print(' Continuous   | 3+ groups | ANOVA + post-hoc')
print(' Paired data  | 2 groups  | paired t-test')
print()
# --- Test 1: Z-test for proportions (our A/B test) ---
from statsmodels.stats.proportion import proportions_ztest
count = np.array([treatment_conversions.sum(), control_conversions.sum()])
nobs = np.array([n_treatment, n_control])
z_stat, p_val_z = proportions_ztest(count, nobs, alternative='larger')
print('=== Z-test for Proportions (conversion rate) ===')
print(f'Z-statistic: {z_stat:.4f}')
print(f'P-value: {p_val_z:.4f}')
print(f'Significant at α=0.05? {"YES" if p_val_z < 0.05 else "NO"}')
print()
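To demystify the z-test, here is the same statistic computed by hand; statsmodels pools the conversion rate under H0 by default, so the numbers should match:
# z = (p_t - p_c) / sqrt(p_pool * (1 - p_pool) * (1/n_t + 1/n_c))
p_t_hat = treatment_conversions.mean()
p_c_hat = control_conversions.mean()
p_pool = (treatment_conversions.sum() + control_conversions.sum()) / (n_treatment + n_control)
z_manual = (p_t_hat - p_c_hat) / np.sqrt(p_pool * (1 - p_pool) * (1/n_treatment + 1/n_control))
p_manual = 1 - stats.norm.cdf(z_manual)  # one-sided, matching alternative='larger'
print(f'Manual z: {z_manual:.4f} (statsmodels: {z_stat:.4f})')
print(f'Manual p: {p_manual:.4f} (statsmodels: {p_val_z:.4f})')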
# --- Test 2: Independent t-test (continuous metric: revenue per user) ---
revenue_control = np.random.lognormal(3, 1, n_control)
revenue_treatment = np.random.lognormal(3.1, 1, n_treatment)
t_stat, p_val_t = stats.ttest_ind(revenue_treatment, revenue_control, alternative='greater')
print('=== Independent t-test (revenue per user) ===')
print(f'Control mean: ${revenue_control.mean():.2f}')
print(f'Treatment mean: ${revenue_treatment.mean():.2f}')
print(f'T-statistic: {t_stat:.4f}, P-value: {p_val_t:.4f}')
print(f'Significant at α=0.05? {"YES" if p_val_t < 0.05 else "NO"}')
print()
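Lognormal revenue is heavily right-skewed, so the t-test's normality assumption is doing real work here. The rank-based Mann-Whitney U from the selection guide above is a robust cross-check; a minimal sketch:
u_stat, p_val_u = stats.mannwhitneyu(revenue_treatment, revenue_control, alternative='greater')
print('=== Mann-Whitney U (revenue per user, rank-based) ===')
print(f'U-statistic: {u_stat:.1f}, P-value: {p_val_u:.4f}')
print(f'Significant at α=0.05? {"YES" if p_val_u < 0.05 else "NO"}')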
# --- Test 3: Chi-square (categorical: device type × converted) ---
# Contingency table: device × conversion
observed = np.array([
    [280, 120],  # Mobile: [not converted, converted]
    [380, 120],  # Desktop
    [200, 80],   # Tablet
])
chi2, p_chi2, dof, expected = stats.chi2_contingency(observed)
print('=== Chi-square Test (device type × conversion) ===')
print(f'Chi2: {chi2:.3f}, dof: {dof}, P-value: {p_chi2:.4f}')
print(f'Significant? {"YES" if p_chi2 < 0.05 else "NO"}')
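A significant chi-square says the rows and columns are associated, not where. Pearson residuals (observed minus expected, scaled) point at the cells driving the result; a common rule of thumb flags |residual| > 2:
residuals = (observed - expected) / np.sqrt(expected)
print(pd.DataFrame(residuals.round(2),
                   index=['Mobile', 'Desktop', 'Tablet'],
                   columns=['Not converted', 'Converted']))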
3. Statistical Power & Sample Size Planning
from statsmodels.stats.power import NormalIndPower, TTestIndPower
# Power = P(detect effect | effect truly exists) = 1 - β
# Standard targets: α=0.05 (5% false positive), power=0.80 (80% chance to detect)
def required_sample_size(p_baseline, min_detectable_effect, alpha=0.05, power=0.80):
    """
    Calculate the minimum sample size per group for a proportions test.
    min_detectable_effect: absolute change (e.g., 0.02 for +2pp)
    """
    p_treatment = p_baseline + min_detectable_effect
    effect_size = abs(p_treatment - p_baseline) / np.sqrt(
        p_baseline * (1 - p_baseline) / 2 + p_treatment * (1 - p_treatment) / 2
    )
    analysis = NormalIndPower()
    n = analysis.solve_power(effect_size=effect_size, power=power, alpha=alpha)
    return int(np.ceil(n))
# Show how sample size changes with effect size
effects = [0.005, 0.01, 0.02, 0.03, 0.05, 0.10]
baseline = 0.12
print(f'Baseline conversion: {baseline:.0%}')
print(f'{"Abs. Lift":<12} {"Rel. Lift":<12} {"N per group":<15} {"Total N"}')
print('-' * 52)
for effect in effects:
    n = required_sample_size(baseline, effect)
    rel_lift = effect / baseline
    print(f'{effect:<+12.1%} {rel_lift:<+12.0%} {n:<15,} {2*n:,}')
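As a sanity check on our standardized-difference effect size, statsmodels ships Cohen's h (an arcsine transform of the two proportions); both routes should give similar sample sizes:
from statsmodels.stats.proportion import proportion_effectsize

h = proportion_effectsize(baseline + 0.02, baseline)  # Cohen's h for +2pp
n_h = int(np.ceil(NormalIndPower().solve_power(effect_size=h, power=0.80, alpha=0.05)))
print(f"Cohen's h: {h:.4f} -> N per group: {n_h:,} (vs {required_sample_size(baseline, 0.02):,} above)")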
# Power curve visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5))
# Power vs sample size
ns = range(100, 3001, 100)
analysis = NormalIndPower()
p1, p2 = 0.12, 0.14
effect_size = abs(p2 - p1) / np.sqrt(p1*(1-p1)/2 + p2*(1-p2)/2)
powers = [analysis.solve_power(effect_size=effect_size, nobs1=n, alpha=0.05) for n in ns]
ax1.plot(ns, powers, 'b-', linewidth=2)
ax1.axhline(0.8, color='red', linestyle='--', label='80% power target')
ax1.axvline(required_sample_size(0.12, 0.02), color='green', linestyle='--', label=f'N={required_sample_size(0.12, 0.02)}')
ax1.set_xlabel('Sample size per group')
ax1.set_ylabel('Statistical Power')
ax1.set_title('Power vs Sample Size (12% → 14% conversion)')
ax1.legend()
# Effect size vs required N
mdes = np.linspace(0.005, 0.10, 50)
ns_required = [required_sample_size(0.12, e) for e in mdes]
ax2.plot(mdes * 100, ns_required, 'b-', linewidth=2)
ax2.set_xlabel('Minimum Detectable Effect (pp)')
ax2.set_ylabel('Required N per group')
ax2.set_title('Smaller effects require quadratically more data (N ∝ 1/MDE²)')
ax2.set_yscale('log')
plt.tight_layout()
plt.show()
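Analytic power formulas are easy to misapply, so it is worth validating them by brute force: simulate many experiments at the planned N and count how often the test rejects. A minimal sketch (two-sided, to match solve_power's default alternative):
def simulated_power(p_c, p_t, n, alpha=0.05, n_sims=2000):
    """Fraction of simulated experiments in which the z-test rejects H0."""
    rejections = 0
    for _ in range(n_sims):
        c = np.random.binomial(1, p_c, n)
        t = np.random.binomial(1, p_t, n)
        _, p = proportions_ztest([t.sum(), c.sum()], [n, n])  # two-sided
        if p < alpha:
            rejections += 1
    return rejections / n_sims

n_planned = required_sample_size(0.12, 0.02)
print(f'Planned N per group: {n_planned:,} (targeting 80% power)')
print(f'Simulated power: {simulated_power(0.12, 0.14, n_planned):.1%}')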
4. Full A/B Test Report: End-to-End Pattern
def run_ab_test_report(control, treatment, metric_name='conversion', alpha=0.05):
    """Full A/B test analysis with all key statistics."""
    n_c, n_t = len(control), len(treatment)
    mean_c, mean_t = control.mean(), treatment.mean()
    # Test
    t_stat, p_val = stats.ttest_ind(treatment, control, alternative='greater')
    # Effect size (Cohen's d), using sample variances (ddof=1)
    pooled_std = np.sqrt((control.var(ddof=1) + treatment.var(ddof=1)) / 2)
    cohens_d = (mean_t - mean_c) / pooled_std
    # Confidence interval (95%) on the difference
    diff = mean_t - mean_c
    se_diff = np.sqrt(control.var(ddof=1)/n_c + treatment.var(ddof=1)/n_t)
    ci_low = diff - 1.96 * se_diff
    ci_high = diff + 1.96 * se_diff
    # Power (post-hoc)
    analysis = TTestIndPower()
    achieved_power = analysis.solve_power(
        effect_size=cohens_d, nobs1=n_c, alpha=alpha, ratio=n_t/n_c
    ) if abs(cohens_d) > 0 else 0
    print(f'A/B TEST REPORT: {metric_name}')
    print('=' * 50)
    print(f'Control:   n={n_c:,}, mean={mean_c:.4f}')
    print(f'Treatment: n={n_t:,}, mean={mean_t:.4f}')
    print()
    print(f'Absolute lift: {diff:+.4f}')
    print(f'Relative lift: {diff/mean_c:+.1%}')
    print(f'95% CI on lift: [{ci_low:+.4f}, {ci_high:+.4f}]')
    d_label = ('negligible' if abs(cohens_d) < 0.2 else 'small' if abs(cohens_d) < 0.5
               else 'medium' if abs(cohens_d) < 0.8 else 'large')
    print(f"Cohen's d: {cohens_d:.3f} ({d_label})")
    print()
    print(f'P-value: {p_val:.4f}')
    print(f'Significance (α={alpha}): {"✅ SIGNIFICANT" if p_val < alpha else "❌ NOT SIGNIFICANT"}')
    print(f'Achieved power: {achieved_power:.1%}')
    print()
    if p_val < alpha:
        if ci_low > 0:
            print('Recommendation: SHIP IT - the entire 95% CI on the lift is above zero')
        else:
            print('Recommendation: CAUTION - significant, but the CI includes 0; the effect may be small')
    else:
        if achieved_power < 0.8:
            print('Recommendation: NEED MORE DATA - the test is underpowered')
        else:
            print('Recommendation: NO EFFECT - a well-powered test found no significant difference')
run_ab_test_report(control_conversions.astype(float), treatment_conversions.astype(float), 'Conversion Rate')
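The same report works unchanged for continuous metrics; for example, on the revenue data from Section 2:
run_ab_test_report(revenue_control, revenue_treatment, 'Revenue per User')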
Hypothesis Testing Cheat Sheet
Test                   Use When                     Python
----------------------------------------------------------------
Z-test (proportions)   Conversion rates, CTR        proportions_ztest()
Independent t-test     Revenue, time-on-site        ttest_ind()
Paired t-test          Same users, before/after     ttest_rel()
Mann-Whitney U         Non-normal continuous        mannwhitneyu()
Chi-square             Categorical × categorical    chi2_contingency()
ANOVA                  3+ groups, continuous        f_oneway()
Red Flags in A/B Testing:
❌ Peeking: stopping the test early as soon as p < 0.05
❌ Cherry-picking (p-hacking): testing many metrics and reporting only the significant ones (simulated in the sketch below)
❌ Novelty effect: users click new things, so the effect fades over time
❌ Leakage: treatment users interact with control users
✅ Pre-register: write down the hypothesis BEFORE looking at the data
✅ Run an A/A test first: verify your test infrastructure
✅ Sequential testing: allows early stopping with statistical validity
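To see the cherry-picking problem concretely, here is a minimal simulation under an assumed setup of 20 independent metrics with no true effect anywhere. On average about one comes up "significant" at α=0.05 by chance, and a Bonferroni correction removes it:
from statsmodels.stats.multitest import multipletests

null_pvals = []
for _ in range(20):
    a = np.random.normal(0, 1, 500)  # control: no true difference
    b = np.random.normal(0, 1, 500)  # treatment: no true difference
    null_pvals.append(stats.ttest_ind(a, b).pvalue)
null_pvals = np.array(null_pvals)

print(f'Naive "significant" metrics (no real effect exists): {(null_pvals < 0.05).sum()} of 20')
reject, p_adj, _, _ = multipletests(null_pvals, alpha=0.05, method='bonferroni')
print(f'After Bonferroni correction: {reject.sum()} of 20')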
Exercises
Run a simulation showing that peeking at results daily and stopping when p<0.05 inflates false positive rate to ~30%.
Implement the CUPED (Controlled-experiment Using Pre-Experiment Data) variance-reduction technique.
Simulate an A/A test (same treatment and control) and verify the p-value distribution is Uniform(0,1).
Use scipy.stats.kruskal to test whether conversion rates differ across 4 geographic regions.
Build a sample size calculator with a Streamlit UI that takes a baseline rate and an MDE as input and outputs the required N.