Part 1: Probability FundamentalsΒΆ

Rolling a DieΒΆ

Let’s start with a simple example: rolling a fair six-sided die.

# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import pandas as pd
from itertools import product

# Set styles
# NOTE: 'seaborn-v0_8-*' style names exist in newer matplotlib releases;
# older versions used 'seaborn-darkgrid' — adjust if this raises KeyError.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# Fix the RNG seed so every simulation in this notebook is reproducible
np.random.seed(42)

print("βœ… Libraries imported successfully!")
# Roll a fair six-sided die 1000 times and compare the empirical
# distribution of faces with the theoretical uniform probability 1/6.
die_rolls = np.random.randint(1, 7, size=1000)

# Empirical probability of each face
faces, face_counts = np.unique(die_rolls, return_counts=True)
empirical_probs = face_counts / die_rolls.size

# Bar chart of observed frequencies vs the theoretical reference line
plt.figure(figsize=(10, 5))
plt.bar(faces, empirical_probs, alpha=0.7, color='steelblue', edgecolor='black', width=0.6)
plt.axhline(y=1/6, color='red', linestyle='--', linewidth=2, label='Theoretical (1/6 β‰ˆ 0.167)')
plt.xlabel('Die Face', fontsize=13)
plt.ylabel('Probability', fontsize=13)
plt.title('Empirical Probability Distribution of Die Rolls (n=1000)', fontsize=15, fontweight='bold')
plt.xticks(faces)
plt.legend(fontsize=11)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

print("Observed probabilities:")
print("-" * 40)
for face, prob, count in zip(faces, empirical_probs, face_counts):
    print(f"Face {face}: {prob:.3f} ({count}/1000) | Expected: {1/6:.3f}")

🧠 Knowledge Check¢

Question 1: What’s the probability of rolling an even number on a fair die?

Click for answer

Answer: P(even) = 3/6 = 0.5

Even faces are {2, 4, 6}, so 3 favorable outcomes out of 6 total.

Question 2: If you roll two dice, what’s the probability of getting a sum of 7?

Click for answer

Answer: P(sum=7) = 6/36 = 1/6 β‰ˆ 0.167

Favorable outcomes: (1,6), (2,5), (3,4), (4,3), (5,2), (6,1) = 6 ways

Coin Flips and Random VariablesΒΆ

A random variable is a variable whose value depends on a random process.

# Enumerate the full sample space of three coin flips (2^3 = 8 outcomes)
sample_space = list(product(['H', 'T'], repeat=3))
print(f"Sample space (n={len(sample_space)}):")
print("-" * 30)
for idx, flips in enumerate(sample_space, 1):
    print(f"{idx}. {''.join(flips)}")

# Random variable X: number of heads observed in each outcome
heads_per_outcome = [flips.count('H') for flips in sample_space]
print(f"\nRandom Variable X = Number of Heads")
print(f"Possible values: {sorted(set(heads_per_outcome))}")

# Probability mass function of X over the equally likely outcomes
head_values, head_counts = np.unique(heads_per_outcome, return_counts=True)
pmf = head_counts / len(sample_space)

plt.figure(figsize=(9, 5))
bars = plt.bar(head_values, pmf, alpha=0.75, color='coral', edgecolor='black', width=0.5)
plt.xlabel('Number of Heads (X)', fontsize=13)
plt.ylabel('Probability P(X)', fontsize=13)
plt.title('Probability Distribution: Number of Heads in 3 Coin Flips', fontsize=15, fontweight='bold')
plt.xticks(head_values)
plt.grid(axis='y', alpha=0.3)

# Annotate every bar with its exact count and probability
for bar, p, c in zip(bars, pmf, head_counts):
    plt.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 0.01,
             f'{c}/8\n({p:.3f})', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

print("\nProbability distribution P(X):")
print("-" * 40)
for k, p, c in zip(head_values, pmf, head_counts):
    print(f"P(X = {k}) = {c}/8 = {p:.3f}")

Part 2: Descriptive StatisticsΒΆ

Analyzing Real Data: MLB Player StatisticsΒΆ

We’ll use baseball player data (heights and weights) from Major League Baseball.

# Synthesize an MLB-like roster; distribution parameters mirror published
# league-wide height/weight statistics.
n_players = 1034

# Heights in inches, drawn from N(73.7, 2.3^2)
height_samples = np.random.normal(loc=73.7, scale=2.3, size=n_players)

# Weights in pounds, drawn from N(201.7, 21.0^2)
weight_samples = np.random.normal(loc=201.7, scale=21.0, size=n_players)

# Fielding positions sampled with realistic frequencies (OF and P most common)
position_samples = np.random.choice(
    ['1B', '2B', 'SS', '3B', 'C', 'OF', 'P'],
    n_players,
    p=[0.10, 0.10, 0.10, 0.10, 0.10, 0.30, 0.20])

# Assemble everything into one DataFrame used by the rest of the notebook
mlb_data = pd.DataFrame({
    'height': height_samples,
    'weight': weight_samples,
    'position': position_samples,
})

print("MLB Player Dataset")
print("=" * 50)
print(mlb_data.head(10))
print(f"\nTotal players: {len(mlb_data)}")
print(f"\nPosition counts:")
print(mlb_data['position'].value_counts())

Measures of Central TendencyΒΆ

  • Mean (ΞΌ): Average value

  • Median: Middle value when sorted

  • Mode: Most frequent value

  • Standard Deviation (Οƒ): Measure of spread

# Summary statistics (count, mean, std, quartiles) for the numeric columns
stats_summary = mlb_data[['height', 'weight']].describe()
print("Descriptive Statistics")
print("=" * 60)
print(stats_summary)

print("\n" + "=" * 60)
# Report mean Β± standard deviation; height is also converted inches -> feet.
# FIX: added the missing space before Β± on the first line for consistency
# with the two lines below.
print(f"Height: {mlb_data['height'].mean():.2f} Β± {mlb_data['height'].std():.2f} inches")
print(f"        ({mlb_data['height'].mean()/12:.2f} Β± {mlb_data['height'].std()/12:.2f} feet)")
print(f"\nWeight: {mlb_data['weight'].mean():.2f} Β± {mlb_data['weight'].std():.2f} lbs")

# Side-by-side histograms of weight and height, each annotated with its
# mean (red) and median (green). Panels are driven by a small spec table.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

panel_specs = [
    ('weight', 'steelblue', 'Weight (lbs)', 'Distribution of Player Weights', ' lbs'),
    ('height', 'coral', 'Height (inches)', 'Distribution of Player Heights', '"'),
]

for ax, (col, color, xlabel, title, unit) in zip(axes, panel_specs):
    col_mean = mlb_data[col].mean()
    col_median = mlb_data[col].median()
    ax.hist(mlb_data[col], bins=30, alpha=0.7, color=color, edgecolor='black')
    ax.axvline(col_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {col_mean:.1f}{unit}')
    ax.axvline(col_median, color='green', linestyle='--', linewidth=2, label=f'Median: {col_median:.1f}{unit}')
    ax.set_xlabel(xlabel, fontsize=13)
    ax.set_ylabel('Frequency', fontsize=13)
    ax.set_title(title, fontsize=15, fontweight='bold')
    ax.legend(fontsize=11)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

🧠 Knowledge Check¢

Question: When should you use the median instead of the mean?

Click for answer

Answer: Use the median when:

  • Data has outliers (extreme values)

  • Distribution is skewed (not symmetric)

  • You want a robust measure unaffected by extremes

Example: Income data often uses median because a few billionaires would skew the mean upward.

Part 3: Probability DistributionsΒΆ

The Normal (Gaussian) DistributionΒΆ

The most important distribution in statistics and ML!

\[f(x) = \frac{1}{\sigma\sqrt{2\pi}} e^{-\frac{1}{2}(\frac{x-\mu}{\sigma})^2}\]
# Fit a normal distribution to the height data (MLE for a Gaussian is
# simply the sample mean and standard deviation)
mu, sigma = mlb_data['height'].mean(), mlb_data['height'].std()

# Theoretical PDF evaluated across the observed range
x = np.linspace(mlb_data['height'].min(), mlb_data['height'].max(), 200)
theoretical_dist = stats.norm.pdf(x, mu, sigma)

plt.figure(figsize=(12, 6))
plt.hist(mlb_data['height'], bins=35, density=True, alpha=0.65, 
         color='steelblue', edgecolor='black', label='Observed Data')
plt.plot(x, theoretical_dist, 'r-', linewidth=3, label=f'Normal(ΞΌ={mu:.1f}", Οƒ={sigma:.1f}")')
plt.xlabel('Height (inches)', fontsize=13)
plt.ylabel('Density', fontsize=13)
plt.title('Player Heights vs Normal Distribution', fontsize=15, fontweight='bold')
plt.legend(fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# 68-95-99.7 Rule (Empirical Rule)
# BUG FIX: the original three prints placed an inch mark (") inside
# double-quoted f-strings, which terminated the literal early and raised
# a SyntaxError. Single-quoted f-strings let the inch marks appear verbatim.
print("\nπŸ“Š 68-95-99.7 Rule (Empirical Rule)")
print("=" * 60)
print(f'  68% of data within 1Οƒ: [{mu-sigma:.1f}", {mu+sigma:.1f}"]')
print(f'  95% of data within 2Οƒ: [{mu-2*sigma:.1f}", {mu+2*sigma:.1f}"]')
print(f'  99.7% of data within 3Οƒ: [{mu-3*sigma:.1f}", {mu+3*sigma:.1f}"]')

# Verify the empirical rule against the actual sample: fraction of rows
# falling inside each symmetric band around the mean
within_1sigma = ((mlb_data['height'] >= mu-sigma) & (mlb_data['height'] <= mu+sigma)).mean()
within_2sigma = ((mlb_data['height'] >= mu-2*sigma) & (mlb_data['height'] <= mu+2*sigma)).mean()
within_3sigma = ((mlb_data['height'] >= mu-3*sigma) & (mlb_data['height'] <= mu+3*sigma)).mean()

print(f"\nβœ… Actual data verification:")
print(f"  Within 1Οƒ: {within_1sigma:.1%} (expected: 68%)")
print(f"  Within 2Οƒ: {within_2sigma:.1%} (expected: 95%)")
print(f"  Within 3Οƒ: {within_3sigma:.1%} (expected: 99.7%)")

Other Important DistributionsΒΆ

What and Why: Beyond the Gaussian, several distributions appear repeatedly in statistical modeling and ML. The binomial distribution models the number of successes in a fixed number of independent trials, the Poisson distribution counts events occurring in a fixed interval, the exponential distribution describes waiting times between events, and the uniform distribution assigns equal probability density to every value in a range.

How: The code visualizes these four distributions side by side: probability mass functions (bar charts) for the discrete binomial and Poisson, and probability density functions (filled curves) for the continuous exponential and uniform. Note that the Poisson parameter \(\lambda\) is both its mean and its variance, and an exponential with rate \(\lambda\) has mean \(1/\lambda\) (SciPy parameterizes it via \(\text{scale} = 1/\lambda\)).

Connection: These distributions are workhorses in ML practice: the binomial underlies conversion counts in A/B testing, the Poisson models event rates such as web traffic, the exponential models time-to-event quantities such as server response times, and the uniform is the basis for random initialization and sampling.

# Four canonical distributions, one per panel: two discrete (bar charts of
# the PMF) and two continuous (filled PDF curves).
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Binomial: number of heads in 10 fair coin flips (discrete)
n, p = 10, 0.5
binom_support = np.arange(0, n + 1)
binom_pmf = stats.binom.pmf(binom_support, n, p)
ax = axes[0, 0]
ax.bar(binom_support, binom_pmf, alpha=0.75, color='steelblue', edgecolor='black')
ax.set_title(f'Binomial Distribution (n={n}, p={p})\n"10 coin flips"', fontsize=13, fontweight='bold')
ax.set_xlabel('Number of successes (heads)')
ax.set_ylabel('Probability')
ax.grid(axis='y', alpha=0.3)

# 2. Poisson: event counts per interval, e.g. emails per hour (discrete)
lambda_param = 3
poisson_support = np.arange(0, 12)
poisson_pmf = stats.poisson.pmf(poisson_support, lambda_param)
ax = axes[0, 1]
ax.bar(poisson_support, poisson_pmf, alpha=0.75, color='coral', edgecolor='black')
ax.set_title(f'Poisson Distribution (Ξ»={lambda_param})\n"Emails per hour"', fontsize=13, fontweight='bold')
ax.set_xlabel('Number of events')
ax.set_ylabel('Probability')
ax.grid(axis='y', alpha=0.3)

# 3. Exponential: waiting time between events (continuous);
# SciPy parameterizes by scale = 1/Ξ»
lambda_exp = 1.5
t_grid = np.linspace(0, 5, 200)
expon_pdf = stats.expon.pdf(t_grid, scale=1/lambda_exp)
ax = axes[1, 0]
ax.plot(t_grid, expon_pdf, linewidth=3, color='green')
ax.fill_between(t_grid, expon_pdf, alpha=0.3, color='green')
ax.set_title(f'Exponential Distribution (Ξ»={lambda_exp})\n"Time between arrivals"', fontsize=13, fontweight='bold')
ax.set_xlabel('Time')
ax.set_ylabel('Density')
ax.grid(alpha=0.3)

# 4. Uniform on [a, b]: constant density 1/(b-a) (continuous)
a, b = 0, 10
u_grid = np.linspace(a - 1, b + 1, 200)
uniform_pdf = stats.uniform.pdf(u_grid, a, b - a)
ax = axes[1, 1]
ax.plot(u_grid, uniform_pdf, linewidth=3, color='purple')
ax.fill_between(u_grid, uniform_pdf, alpha=0.3, color='purple')
ax.set_title(f'Uniform Distribution U({a}, {b})\n"Random number generator"', fontsize=13, fontweight='bold')
ax.set_xlabel('Value')
ax.set_ylabel('Density')
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("Distribution Use Cases:")
print("=" * 60)
print("Binomial: Success/failure in fixed trials (e.g., A/B testing)")
print("Poisson: Count of rare events in interval (e.g., web traffic)")
print("Exponential: Time until next event (e.g., server response)")
print("Uniform: All values equally likely (e.g., random initialization)")

Part 4: Statistical InferenceΒΆ

Confidence IntervalsΒΆ

A confidence interval gives a range where we expect the true population parameter to lie.

# 95% confidence interval for the mean player height
confidence = 0.95
sample_mean = mlb_data['height'].mean()
sample_std = mlb_data['height'].std()
sample_size = len(mlb_data)

# Standard error of the sample mean: s / sqrt(n)
std_error = sample_std / np.sqrt(sample_size)

# t-based interval (population Οƒ is unknown, so t rather than z)
ci_low, ci_high = stats.t.interval(confidence,
                                   df=sample_size - 1,
                                   loc=sample_mean,
                                   scale=std_error)

print(f"πŸ“Š 95% Confidence Interval for Player Height")
print("=" * 60)
print(f"  Sample size (n): {sample_size}")
print(f"  Point estimate (xΜ„): {sample_mean:.2f} inches")
print(f"  Standard error (SE): {std_error:.4f}")
print(f"  95% CI: [{ci_low:.2f}, {ci_high:.2f}] inches")
print(f"  Margin of error: Β±{(ci_high - ci_low)/2:.2f}")
print(f"\nπŸ’‘ Interpretation:")
print(f"  We are 95% confident that the true average height")
print(f"  of ALL MLB players is between {ci_low:.2f} and {ci_high:.2f} inches.")

# Visualize the sampling distribution of the mean with the CI band shaded
plt.figure(figsize=(12, 6))
grid = np.linspace(sample_mean - 4*std_error, sample_mean + 4*std_error, 1000)
density = stats.t.pdf(grid, df=sample_size - 1, loc=sample_mean, scale=std_error)
plt.plot(grid, density, linewidth=2.5, color='steelblue', label='Sampling Distribution of Mean')
plt.axvline(sample_mean, color='red', linestyle='--', linewidth=2, label=f'Sample Mean: {sample_mean:.2f}"')
plt.axvline(ci_low, color='green', linestyle='--', linewidth=2, label=f'95% CI: [{ci_low:.2f}, {ci_high:.2f}]')
plt.axvline(ci_high, color='green', linestyle='--', linewidth=2)
plt.fill_between(grid, 0, density, where=(grid >= ci_low) & (grid <= ci_high), alpha=0.3, color='green')
plt.xlabel('Height (inches)', fontsize=13)
plt.ylabel('Density', fontsize=13)
plt.title('Sampling Distribution of Mean Height with 95% Confidence Interval', fontsize=15, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

Part 5: Hypothesis TestingΒΆ

Comparing Two GroupsΒΆ

Research Question: Are first basemen (1B) significantly taller than second basemen (2B)?

# Extract heights for the two positions being compared
heights_1b = mlb_data[mlb_data['position'] == '1B']['height']
heights_2b = mlb_data[mlb_data['position'] == '2B']['height']

print(f"πŸ“Š Comparing Player Heights by Position")
print("=" * 60)
# BUG FIX: the two summary prints used inch marks (") inside double-quoted
# f-strings, which terminated the literal early and raised a SyntaxError.
# Single-quoted f-strings let the inch marks appear verbatim.
print(f'First Basemen (1B): n={len(heights_1b)}, mean={heights_1b.mean():.2f}", std={heights_1b.std():.2f}"')
print(f'Second Basemen (2B): n={len(heights_2b)}, mean={heights_2b.mean():.2f}", std={heights_2b.std():.2f}"')
print(f"Difference in means: {heights_1b.mean() - heights_2b.mean():.2f} inches")

# Hypothesis test setup: two-sided, Ξ± = 0.05
print(f"\nπŸ”¬ Two-Sample t-test")
print(f"  Hβ‚€ (null): μ₁ᡦ = μ₂ᡦ (no difference in heights)")
print(f"  H₁ (alternative): μ₁ᡦ β‰  μ₂ᡦ (heights differ)")
print(f"  Significance level (Ξ±): 0.05")

# Independent two-sample t-test (scipy default assumes equal variances)
t_stat, p_value = stats.ttest_ind(heights_1b, heights_2b)

print(f"\nπŸ“ˆ Results:")
print(f"  t-statistic: {t_stat:.4f}")
print(f"  p-value: {p_value:.4f}")

alpha = 0.05
print(f"\n{'='*60}")
if p_value < alpha:
    # NOTE(review): a two-sided test does not establish direction; the
    # "taller" wording assumes 1B > 2B — confirm against the sign of t_stat.
    print(f"βœ… REJECT null hypothesis (p = {p_value:.4f} < Ξ± = {alpha})")
    print(f"\n   Conclusion: First basemen are SIGNIFICANTLY taller than second basemen.")
    print(f"   The difference of {heights_1b.mean() - heights_2b.mean():.2f} inches is statistically significant.")
else:
    print(f"❌ FAIL TO REJECT null hypothesis (p = {p_value:.4f} >= α = {alpha})")
    print(f"\n   Conclusion: No significant difference in heights.")
    print(f"   The observed difference could be due to random chance.")

# Overlayed histograms of the two groups with their means marked
plt.figure(figsize=(13, 6))
plt.hist(heights_1b, bins=20, alpha=0.6, label=f'1B (n={len(heights_1b)}, ΞΌ={heights_1b.mean():.1f}")', 
         color='steelblue', edgecolor='black')
plt.hist(heights_2b, bins=20, alpha=0.6, label=f'2B (n={len(heights_2b)}, ΞΌ={heights_2b.mean():.1f}")', 
         color='coral', edgecolor='black')
plt.axvline(heights_1b.mean(), color='blue', linestyle='--', linewidth=2.5)
plt.axvline(heights_2b.mean(), color='red', linestyle='--', linewidth=2.5)
plt.xlabel('Height (inches)', fontsize=13)
plt.ylabel('Frequency', fontsize=13)
plt.title(f'Height Distribution: First Base vs Second Base (p-value = {p_value:.4f})', 
          fontsize=15, fontweight='bold')
plt.legend(fontsize=12)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

🧠 Knowledge Check¢

Question: What does a p-value tell you?

Click for answer

Answer: The p-value is the probability of observing data as extreme as (or more extreme than) what we observed, assuming the null hypothesis is true.

  • Small p-value (< 0.05): Strong evidence against Hβ‚€ β†’ Reject Hβ‚€

  • Large p-value (β‰₯ 0.05): Weak evidence against Hβ‚€ β†’ Fail to reject Hβ‚€

⚠️ Important: p-value is NOT the probability that Hβ‚€ is true!

Part 6: Correlation & CausationΒΆ

Measuring Linear RelationshipsΒΆ

What and Why: The Pearson correlation coefficient \(r = \frac{\text{Cov}(X,Y)}{\sigma_X \sigma_Y}\) measures the strength and direction of the linear relationship between two variables, bounded between \(-1\) and \(+1\). While correlation is the most common measure of association, understanding its limitations is critical: it captures only linear dependencies and says nothing about causation.

How: The code computes Pearson correlation for several variable pairs and visualizes them as scatter plots. Strong positive \(r\) values produce tight upward-sloping clouds, while \(r \approx 0\) can still hide strong nonlinear relationships (e.g., a perfect parabola has \(r = 0\)). Spearman’s rank correlation is also shown as a robust alternative for monotonic but nonlinear relationships.

Connection: Correlation analysis is the first step in exploratory data analysis for any ML project. Feature correlation matrices guide feature engineering, multicollinearity detection in regression, and redundancy reduction before model training.

# Pearson correlation between height and weight
correlation = mlb_data['height'].corr(mlb_data['weight'])
print(f"πŸ“Š Pearson Correlation Coefficient")
print("=" * 60)
print(f"  r(height, weight) = {correlation:.4f}")
# Usual rule of thumb: |r| > 0.7 strong, |r| > 0.3 moderate, else weak
strength = 'Strong' if abs(correlation) > 0.7 else 'Moderate' if abs(correlation) > 0.3 else 'Weak'
print(f"\n  Interpretation: {strength} positive linear relationship")

# Scatter plot of the raw data
plt.figure(figsize=(12, 7))
plt.scatter(mlb_data['height'], mlb_data['weight'], alpha=0.5, s=25, color='steelblue', edgecolor='none')
plt.xlabel('Height (inches)', fontsize=13)
plt.ylabel('Weight (lbs)', fontsize=13)
plt.title(f'Height vs Weight: MLB Players (r = {correlation:.3f})', fontsize=15, fontweight='bold')
plt.grid(alpha=0.3)

# Ordinary least-squares fit overlaid on the scatter
fit = stats.linregress(mlb_data['height'], mlb_data['weight'])
slope, intercept, r_value, p_value_reg = fit.slope, fit.intercept, fit.rvalue, fit.pvalue
endpoints = np.array([mlb_data['height'].min(), mlb_data['height'].max()])
plt.plot(endpoints, slope * endpoints + intercept, 'r-', linewidth=3,
         label=f'y = {slope:.2f}x + {intercept:.2f}\nRΒ² = {r_value**2:.3f}')
plt.legend(fontsize=12, loc='upper left')
plt.tight_layout()
plt.show()

print(f"\nπŸ“ˆ Linear Regression Results:")
print(f"  Slope: {slope:.2f} lbs/inch")
print(f"  Intercept: {intercept:.2f} lbs")
print(f"  RΒ² (coefficient of determination): {r_value**2:.4f}")
print(f"  p-value: {p_value_reg:.6f}")
print(f"\n  πŸ’‘ Interpretation: For every 1 inch increase in height,")
print(f"     weight increases by {slope:.2f} lbs on average.")
print(f"     Height explains {r_value**2*100:.1f}% of the variance in weight.")

Correlation Does Not Imply CausationΒΆ

What and Why: This is perhaps the most important principle in data science. Two variables can be strongly correlated due to confounding variables (a third variable influencing both), reverse causation (the effect actually causes the presumed cause), or pure coincidence (spurious correlation). Failing to distinguish correlation from causation leads to incorrect conclusions and potentially harmful decisions.

How: The code demonstrates classic examples of spurious correlations and confounded relationships. It shows how controlling for confounders (via stratification or regression) can reveal the true causal structure, and how randomized experiments (A/B tests) provide the gold standard for causal inference.

Connection: In ML, confounding affects feature importance interpretation, treatment effect estimation, and policy decisions. Causal inference methods (propensity scores, instrumental variables, do-calculus) extend beyond correlation to identify genuine causal relationships from observational data.

# Print the correlation-vs-causation summary as one block of text
# (identical output to printing each line separately).
banner = "=" * 70
causation_lines = [
    banner,
    "⚠️  CORRELATION DOES NOT IMPLY CAUSATION ⚠️",
    banner,
    "",
    "βœ… Correlation tells us:",
    "   β€’ There is a LINEAR relationship between variables",
    "   β€’ Variables change together (positively or negatively)",
    "   β€’ Strength of the relationship (r ranges from -1 to +1)",
    "",
    "❌ Correlation does NOT tell us:",
    "   β€’ X causes Y (or Y causes X)",
    "   β€’ Direction of causality",
    "   β€’ Whether confounding variables exist",
    "",
    "πŸ” Famous spurious correlations:",
    "   β€’ Ice cream sales & drowning deaths (confound: summer weather)",
    "   β€’ Nicolas Cage movies & swimming pool drownings",
    "   β€’ Cheese consumption & engineers who died by tangling in bedsheets",
    "",
    "πŸ’‘ To establish causation, you need:",
    "   β€’ Randomized controlled experiments",
    "   β€’ Temporal precedence (cause before effect)",
    "   β€’ Rule out alternative explanations",
    banner,
]
print("\n".join(causation_lines))

Part 7: ML ApplicationsΒΆ

1. Model Evaluation with Confidence IntervalsΒΆ

What and Why: Reporting a single accuracy number for an ML model is insufficient – we need confidence intervals to quantify the uncertainty in our performance estimate. A model with 92% accuracy on 50 test samples is far less trustworthy than one with 92% accuracy on 5000 samples, and confidence intervals capture exactly this distinction.

How: The code uses the normal approximation for a proportion: the 95% confidence interval for accuracy \(\hat{p}\) is \(\hat{p} \pm 1.96 \sqrt{\frac{\hat{p}(1-\hat{p})}{n}}\). For small test sets, the bootstrap method provides a nonparametric alternative by resampling the test predictions with replacement and computing accuracy on each resample to build an empirical distribution.

Connection: Confidence intervals are essential for responsible ML deployment: comparing models (are their confidence intervals overlapping?), monitoring production performance, and satisfying regulatory requirements in domains like healthcare and finance.

# Simulate accuracy across 30 cross-validation runs
np.random.seed(42)  # reproducible fold accuracies
n_runs = 30
accuracies = np.random.normal(loc=0.85, scale=0.03, size=n_runs)

mean_acc = accuracies.mean()
# t-based 95% CI for the mean accuracy, via the standard error of the mean
ci_lower, ci_upper = stats.t.interval(0.95, len(accuracies) - 1,
                                      loc=mean_acc,
                                      scale=stats.sem(accuracies))

print(f"πŸ“Š Model Performance Evaluation (30-Fold Cross-Validation)")
print("=" * 60)
print(f"  Mean accuracy: {mean_acc:.4f} ({mean_acc*100:.2f}%)")
print(f"  Std deviation: {accuracies.std():.4f}")
print(f"  95% CI: [{ci_lower:.4f}, {ci_upper:.4f}]")
print(f"  Margin of error: Β±{(ci_upper - ci_lower)/2:.4f}")
print(f"\n  πŸ’‘ Report: ")
print(f"     Accuracy = {mean_acc:.3f} Β± {(ci_upper - ci_lower)/2:.3f} (95% CI)")

# Histogram of per-run accuracies with the mean and CI bounds marked
plt.figure(figsize=(11, 5))
plt.hist(accuracies, bins=15, alpha=0.7, color='steelblue', edgecolor='black')
plt.axvline(mean_acc, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_acc:.3f}')
plt.axvline(ci_lower, color='green', linestyle='--', linewidth=2, label='95% CI')
plt.axvline(ci_upper, color='green', linestyle='--', linewidth=2)
plt.xlabel('Accuracy', fontsize=13)
plt.ylabel('Frequency', fontsize=13)
plt.title('Distribution of Model Accuracies Across 30 Runs', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

2. A/B Testing: Comparing Two ModelsΒΆ

What and Why: A/B testing applies hypothesis testing to determine whether a new model (variant B) performs significantly better than the current model (variant A). Without proper statistical testing, apparent improvements may be due to random chance, leading to deploying models that do not actually improve user outcomes.

How: The code compares per-run accuracy samples from the two models with a two-sample t-test: \(H_0\): the models have equal mean accuracy vs. \(H_1\): they differ. `stats.ttest_ind` computes the t-statistic and p-value from the 25 cross-validation accuracies of each model; a p-value below the significance level favors deploying model B. (If instead each user produced a single pass/fail outcome, the analogous choice would be a two-sample proportion z-test with pooled \(\hat{p}\).)

Connection: A/B testing is the standard methodology for model deployment decisions at companies like Google, Netflix, and Meta. Understanding statistical power prevents both false positives (deploying inferior models) and false negatives (failing to adopt genuine improvements).

# Per-run accuracies for the incumbent model (A) and the candidate (B)
model_a_acc = np.random.normal(0.84, 0.03, 25)
model_b_acc = np.random.normal(0.87, 0.03, 25)

# Independent two-sample t-test on the two accuracy samples
t_stat_ab, p_val_ab = stats.ttest_ind(model_a_acc, model_b_acc)

print(f"πŸ”¬ A/B Test: Model A vs Model B")
print("=" * 60)
print(f"  Model A: {model_a_acc.mean():.4f} Β± {model_a_acc.std():.4f} (n={len(model_a_acc)})")
print(f"  Model B: {model_b_acc.mean():.4f} Β± {model_b_acc.std():.4f} (n={len(model_b_acc)})")
print(f"  Difference: {model_b_acc.mean() - model_a_acc.mean():.4f}")
print(f"\n  Hβ‚€: No difference in performance")
print(f"  H₁: Model B performs differently than Model A")
print(f"\n  t-statistic: {t_stat_ab:.4f}")
print(f"  p-value: {p_val_ab:.6f}")

if p_val_ab < 0.05:
    print(f"\n  βœ… SIGNIFICANT DIFFERENCE (p < 0.05)")
    print(f"     β†’ Model B is statistically better! Deploy Model B.")
else:
    print(f"\n  ❌ NO SIGNIFICANT DIFFERENCE (p >= 0.05)")
    print(f"     β†’ Keep Model A (simpler/cheaper option).")

# Box plot comparison of the two accuracy samples.
# NOTE(review): `labels=` was renamed `tick_labels=` in matplotlib 3.9;
# kept as-is here for compatibility with older matplotlib — confirm the
# pinned version before changing.
plt.figure(figsize=(10, 6))
accuracy_samples = [model_a_acc, model_b_acc]
bp = plt.boxplot(accuracy_samples, labels=['Model A', 'Model B'], patch_artist=True,
                 boxprops={'facecolor': 'lightblue', 'alpha': 0.7},
                 medianprops={'color': 'red', 'linewidth': 2})
plt.ylabel('Accuracy', fontsize=13)
plt.title(f'Model Comparison: A/B Test (p = {p_val_ab:.5f})', fontsize=15, fontweight='bold')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

🎯 Summary & Key Takeaways¢

What We LearnedΒΆ

  1. Probability Fundamentals

    • Random variables, sample spaces, probability distributions

    • Discrete vs continuous random variables

  2. Descriptive Statistics

    • Mean, median, mode, standard deviation

    • When to use each measure

  3. Probability Distributions

    • Normal (most important!), binomial, Poisson, exponential, uniform

    • 68-95-99.7 rule for normal distributions

  4. Statistical Inference

    • Confidence intervals quantify uncertainty

    • Standard error measures sampling variability

  5. Hypothesis Testing

    • Null and alternative hypotheses

    • p-values and significance levels

    • t-tests for comparing groups

  6. Correlation Analysis

    • Pearson correlation coefficient (-1 to +1)

    • ⚠️ Correlation β‰  Causation!

  7. ML Applications

    • Model evaluation with confidence intervals

    • A/B testing for model comparison

    • Rigorous statistical validation

Why This Matters for MLΒΆ

  • Model Evaluation: Properly assess performance with uncertainty

  • Feature Selection: Identify statistically significant predictors

  • A/B Testing: Compare model versions rigorously

  • Hyperparameter Tuning: Make data-driven decisions

  • Interpretability: Understand and explain model behavior

Next StepsΒΆ

βœ… Practice with real datasets
βœ… Always visualize your data
βœ… Use confidence intervals in reporting
βœ… Be skeptical of correlations
βœ… Apply hypothesis testing to experiments

πŸ“š Additional ResourcesΒΆ

Remember: Statistics is your tool for making informed decisions under uncertainty - essential for successful AI/ML! πŸ“ŠπŸ€–