import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.special import logsumexp
from sklearn.mixture import GaussianMixture
from sklearn.datasets import make_blobs, make_moons
from matplotlib.patches import Ellipse
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd

# Global plotting defaults and a fixed RNG seed so every run of the
# notebook reproduces the same samples and figures.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
np.random.seed(42)

1. Multivariate Gaussian Distribution

Definition

\[\mathcal{N}(\mathbf{x}|\boldsymbol{\mu}, \boldsymbol{\Sigma}) = \frac{1}{(2\pi)^{D/2}|\boldsymbol{\Sigma}|^{1/2}} \exp\left(-\frac{1}{2}(\mathbf{x}-\boldsymbol{\mu})^T\boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu})\right)\]

Where:

  • \(\boldsymbol{\mu}\) is the mean vector

  • \(\boldsymbol{\Sigma}\) is the covariance matrix

  • \(|\boldsymbol{\Sigma}|\) is the determinant of the covariance matrix

def plot_gaussian_contours(mu, cov, ax, n_std=3, **kwargs):
    """Plot confidence ellipses for a 2D Gaussian.

    Draws one ellipse per standard-deviation level (1..n_std).  The ellipse
    axes are the eigenvectors of ``cov`` and the semi-axis lengths are
    ``n * sqrt(eigenvalue)``.

    Parameters
    ----------
    mu : array-like of shape (2,)
        Mean of the Gaussian (ellipse center).
    cov : array-like of shape (2, 2)
        Symmetric covariance matrix.
    ax : matplotlib Axes
        Axes to draw on.
    n_std : int
        Number of standard-deviation contours to draw.
    **kwargs
        Forwarded to matplotlib.patches.Ellipse (color, alpha, label, ...).

    Returns
    -------
    ax : the same Axes, for chaining.
    """
    # eigh returns eigenvalues in ascending order; the columns of
    # `eigenvectors` are the corresponding orthonormal principal axes.
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    angle = np.degrees(np.arctan2(eigenvectors[1, 0], eigenvectors[0, 0]))

    # Attach any legend label to the first ellipse only; previously the same
    # label was applied to every ellipse, so a legend showed n_std duplicate
    # entries per call.
    label = kwargs.pop('label', None)

    # Plot ellipses for 1, 2, ..., n_std standard deviations
    for n in range(1, n_std + 1):
        width, height = 2 * n * np.sqrt(eigenvalues)
        ellipse = Ellipse(mu, width, height, angle=angle,
                          fill=False, linewidth=2,
                          label=label if n == 1 else None, **kwargs)
        ax.add_patch(ellipse)

    return ax

# Example: Different covariance structures
# For each covariance shape, draw 500 samples from a zero-mean 2D Gaussian
# and overlay the 1/2/3-sigma confidence ellipses on the scatter.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
mu = np.array([0, 0])

# (name, covariance) pairs covering the qualitative families of 2x2
# covariance matrices: isotropic, axis-aligned, correlated, and general.
covariances = [
    ("Spherical", np.array([[1, 0], [0, 1]])),
    ("Diagonal", np.array([[2, 0], [0, 0.5]])),
    ("Positive Correlation", np.array([[1, 0.8], [0.8, 1]])),
    ("Negative Correlation", np.array([[1, -0.8], [-0.8, 1]])),
    ("High Variance X", np.array([[3, 0], [0, 0.3]])),
    ("General", np.array([[2, 1.2], [1.2, 1.5]]))
]

for idx, (name, cov) in enumerate(covariances):
    ax = axes[idx // 3, idx % 3]  # fill the 2x3 grid row by row
    
    # Sample points (consumes the global RNG stream; the draw order matters
    # for reproducibility with the seed fixed at the top of the notebook)
    samples = np.random.multivariate_normal(mu, cov, 500)
    
    # Plot samples
    ax.scatter(samples[:, 0], samples[:, 1], alpha=0.3, s=10)
    
    # Plot contours
    plot_gaussian_contours(mu, cov, ax, color='red', alpha=0.8)
    
    # Mark the mean with an 'x'
    ax.scatter(mu[0], mu[1], c='red', marker='x', s=200, linewidths=3)
    ax.set_title(f'{name}\nΞ£ = {cov.tolist()}')
    ax.set_xlim(-5, 5)
    ax.set_ylim(-5, 5)
    ax.set_aspect('equal')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

1.1 Properties of Multivariate Gaussian

What: This code demonstrates two fundamental closure properties of the multivariate Gaussian: marginalization (integrating out variables preserves Gaussianity) and conditioning (fixing observed variables yields a Gaussian over the remaining variables). Given a 3D Gaussian \(\mathcal{N}(\boldsymbol{\mu}, \boldsymbol{\Sigma})\), we compute the marginal \(p(x_1, x_2)\) and the conditional \(p(x_3 | x_1, x_2)\).

Why: These properties make Gaussians uniquely tractable in ML. The conditional mean formula \(\boldsymbol{\mu}_{m|o} = \boldsymbol{\mu}_m + \boldsymbol{\Sigma}_{mo}\boldsymbol{\Sigma}_{oo}^{-1}(\mathbf{x}_o - \boldsymbol{\mu}_o)\) is the foundation of Gaussian process regression, Kalman filtering, and linear-Gaussian state space models. The conditional variance \(\boldsymbol{\Sigma}_{m|o} = \boldsymbol{\Sigma}_{mm} - \boldsymbol{\Sigma}_{mo}\boldsymbol{\Sigma}_{oo}^{-1}\boldsymbol{\Sigma}_{om}\) (the Schur complement) quantifies uncertainty reduction from observations – it depends only on which variables are observed, not their values.

Connection: Gaussian conditioning powers Gaussian processes for Bayesian optimization, geostatistical kriging, and sensor fusion. In deep learning, variational autoencoders rely on Gaussian conditionals for the reparameterization trick, and diffusion models use conditional Gaussians at each denoising step.

# Marginal and conditional distributions of a 3D Gaussian.
mu = np.array([1, 2, 3])
cov = np.array([[1.0, 0.5, 0.3],
                [0.5, 2.0, 0.4],
                [0.3, 0.4, 1.5]])

# Sample from 3D Gaussian
samples = np.random.multivariate_normal(mu, cov, 1000)

# Marginal distribution p(x1, x2): simply drop x3's entries from mu and cov.
mu_marginal = mu[:2]
cov_marginal = cov[:2, :2]

# Conditional distribution: p(x3 | x1=a, x2=b)
# For simplicity, condition on mean values
x_observed = mu[:2]

# Partition the parameters: "3" is the queried variable, "12" the observed pair.
mu_3 = mu[2]
mu_12 = mu[:2]
cov_3 = cov[2, 2]
cov_12 = cov[:2, :2]
cov_3_12 = cov[2, :2]

# Gaussian conditioning formulas:
#   mu_{3|12}  = mu_3 + S_{3,12} S_{12}^{-1} (x_obs - mu_12)
#   var_{3|12} = S_33 - S_{3,12} S_{12}^{-1} S_{12,3}   (Schur complement)
# Use np.linalg.solve instead of forming an explicit inverse: the original
# code computed np.linalg.inv(cov_12) twice; solve() is numerically better
# behaved and avoids the redundant inversion.
mu_conditional = mu_3 + cov_3_12 @ np.linalg.solve(cov_12, x_observed - mu_12)
cov_conditional = cov_3 - cov_3_12 @ np.linalg.solve(cov_12, cov_3_12)

print("3D Gaussian Properties:")
print(f"\nOriginal mean: {mu}")
print(f"Original covariance:\n{cov}")
print(f"\nMarginal p(x1, x2) mean: {mu_marginal}")
print(f"Marginal p(x1, x2) covariance:\n{cov_marginal}")
print(f"\nConditional p(x3|x1={x_observed[0]}, x2={x_observed[1]}) mean: {mu_conditional:.4f}")
print(f"Conditional p(x3|x1, x2) variance: {cov_conditional:.4f}")

# Visualize marginals
fig = plt.figure(figsize=(15, 5))

# Panel 1: scatter of the first two coordinates with the marginal's ellipses.
ax_marginal = plt.subplot(131)
ax_marginal.scatter(samples[:, 0], samples[:, 1], alpha=0.3, s=10)
plot_gaussian_contours(mu_marginal, cov_marginal, ax_marginal, color='red')
ax_marginal.set_xlabel('x₁')
ax_marginal.set_ylabel('xβ‚‚')
ax_marginal.set_title('Marginal p(x₁, xβ‚‚)')
ax_marginal.grid(True, alpha=0.3)

# Panel 2: the full 3D point cloud with its mean marked.
ax_cloud = fig.add_subplot(132, projection='3d')
ax_cloud.scatter(samples[:, 0], samples[:, 1], samples[:, 2], alpha=0.1, s=5)
ax_cloud.scatter([mu[0]], [mu[1]], [mu[2]], c='red', marker='x', s=200, linewidths=3)
ax_cloud.set_xlabel('x₁')
ax_cloud.set_ylabel('xβ‚‚')
ax_cloud.set_zlabel('x₃')
ax_cloud.set_title('Full 3D Distribution')

# Panel 3: histogram of the third coordinate against the conditional density.
ax_cond = plt.subplot(133)
ax_cond.hist(samples[:, 2], bins=50, density=True, alpha=0.6, label='Samples')
x_range = np.linspace(samples[:, 2].min(), samples[:, 2].max(), 100)
conditional_pdf = stats.norm(mu_conditional, np.sqrt(cov_conditional)).pdf(x_range)
ax_cond.plot(x_range, conditional_pdf, 'r-', linewidth=2,
             label=f'Conditional p(x₃|x₁={x_observed[0]}, xβ‚‚={x_observed[1]})')
ax_cond.set_xlabel('x₃')
ax_cond.set_ylabel('Density')
ax_cond.set_title('Conditional Distribution')
ax_cond.legend()
ax_cond.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

2. Maximum Likelihood Estimation

MLE for Multivariate Gaussian

Given data \(\mathcal{D} = \{\mathbf{x}_1, ..., \mathbf{x}_N\}\):

\[\hat{\boldsymbol{\mu}}_{MLE} = \frac{1}{N}\sum_{i=1}^{N} \mathbf{x}_i\]
\[\hat{\boldsymbol{\Sigma}}_{MLE} = \frac{1}{N}\sum_{i=1}^{N} (\mathbf{x}_i - \hat{\boldsymbol{\mu}})(\mathbf{x}_i - \hat{\boldsymbol{\mu}})^T\]
def gaussian_mle(X):
    """Closed-form maximum-likelihood estimates for a Gaussian.

    Returns
    -------
    (mu, Sigma) : the sample mean and the biased (1/N) sample covariance,
        which is the maximum-likelihood estimator of the covariance.
    """
    data = np.asarray(X)
    n = len(data)
    mu_hat = data.sum(axis=0) / n
    centered = data - mu_hat
    # Outer-product accumulation: (1/N) * sum_i (x_i - mu)(x_i - mu)^T
    sigma_hat = np.einsum('ni,nj->ij', centered, centered) / n
    return mu_hat, sigma_hat

# True parameters
mu_true = np.array([2, 3])
cov_true = np.array([[2, 1], [1, 1.5]])

# Test with different sample sizes
# NOTE: samples are drawn from the global RNG stream in this exact order,
# so the figure is reproducible only together with the seed set earlier.
sample_sizes = [10, 50, 100, 500, 1000, 5000]
mu_errors = []
cov_errors = []

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

for idx, n in enumerate(sample_sizes):
    ax = axes[idx // 3, idx % 3]  # fill the 2x3 grid row by row
    
    # Generate samples
    samples = np.random.multivariate_normal(mu_true, cov_true, n)
    
    # Estimate parameters
    mu_est, cov_est = gaussian_mle(samples)
    
    # Calculate errors
    # (Euclidean norm for the mean, Frobenius norm for the covariance.)
    mu_error = np.linalg.norm(mu_est - mu_true)
    cov_error = np.linalg.norm(cov_est - cov_true, 'fro')
    mu_errors.append(mu_error)
    cov_errors.append(cov_error)
    
    # Visualize: samples plus true (green) and estimated (red) contours
    ax.scatter(samples[:, 0], samples[:, 1], alpha=0.3, s=10, label='Samples')
    plot_gaussian_contours(mu_true, cov_true, ax, color='green', label='True')
    plot_gaussian_contours(mu_est, cov_est, ax, color='red', linestyle='--', label='Estimated')
    ax.scatter(mu_true[0], mu_true[1], c='green', marker='x', s=200, linewidths=3)
    ax.scatter(mu_est[0], mu_est[1], c='red', marker='+', s=200, linewidths=3)
    ax.set_title(f'n={n}\nΞΌ error={mu_error:.3f}, Ξ£ error={cov_error:.3f}')
    ax.set_aspect('equal')
    ax.grid(True, alpha=0.3)
    if idx == 0:
        ax.legend(loc='upper right')

plt.tight_layout()
plt.show()

# Plot convergence of both estimators on log-log axes.
fig, (ax_mu, ax_cov) = plt.subplots(1, 2, figsize=(14, 5))

# (axis, errors, marker style, extra kwargs, y-label, title) per panel.
panel_specs = [
    (ax_mu, mu_errors, 'o-', {},
     'Mean Estimation Error', 'Convergence of Mean Estimate'),
    (ax_cov, cov_errors, 's-', {'color': 'orange'},
     'Covariance Estimation Error (Frobenius norm)', 'Convergence of Covariance Estimate'),
]

for ax, errors, marker_style, extra, ylabel, title in panel_specs:
    ax.loglog(sample_sizes, errors, marker_style, linewidth=2, markersize=8, **extra)
    ax.set_xlabel('Sample Size')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

3. Gaussian Mixture Models (GMM)

Definition

A GMM is a weighted sum of Gaussian components:

\[p(\mathbf{x}) = \sum_{k=1}^{K} \pi_k \mathcal{N}(\mathbf{x}|\boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)\]

Where:

  • \(\pi_k\) are mixing coefficients (\(\sum_k \pi_k = 1\))

  • Each component has its own mean \(\boldsymbol{\mu}_k\) and covariance \(\boldsymbol{\Sigma}_k\)

# Generate GMM data
# Re-seed so this section is reproducible independently of the cells above.
np.random.seed(42)

# Three Gaussian components
n_samples = 300
means = [np.array([0, 0]), np.array([4, 4]), np.array([0, 4])]
covs = [np.array([[0.5, 0], [0, 0.5]]),
        np.array([[0.8, 0.4], [0.4, 0.8]]),
        np.array([[0.3, -0.1], [-0.1, 0.5]])]
weights = [0.3, 0.5, 0.2]  # mixing coefficients; sum to 1

# Generate samples from mixture
# Deterministic allocation: component i contributes int(n_samples * weight_i)
# points (rather than drawing component counts from a multinomial).
samples_gmm = []
labels_true = []

for i, (mean, cov, weight) in enumerate(zip(means, covs, weights)):
    n = int(n_samples * weight)
    samples_gmm.append(np.random.multivariate_normal(mean, cov, n))
    labels_true.extend([i] * n)

X_gmm = np.vstack(samples_gmm)
labels_true = np.array(labels_true)

# Shuffle rows so component membership is not encoded in row order.
indices = np.random.permutation(len(X_gmm))
X_gmm = X_gmm[indices]
labels_true = labels_true[indices]

# Visualize true components
plt.figure(figsize=(12, 5))

# Left: samples colored by their generating component, with each
# component's confidence ellipses overlaid.
ax_left = plt.subplot(1, 2, 1)
component_colors = ['red', 'blue', 'green']
for comp_idx, (mean, cov, color) in enumerate(zip(means, covs, component_colors)):
    members = labels_true == comp_idx
    ax_left.scatter(X_gmm[members, 0], X_gmm[members, 1], alpha=0.5, s=30,
                    c=color, label=f'Component {comp_idx+1}')
    plot_gaussian_contours(mean, cov, ax_left, color=color, alpha=0.8)
ax_left.set_title('True GMM Components')
ax_left.set_xlabel('x₁')
ax_left.set_ylabel('xβ‚‚')
ax_left.legend()
ax_left.grid(True, alpha=0.3)
ax_left.axis('equal')

# Right: the same points without labels -- what a clustering
# algorithm actually gets to see.
ax_right = plt.subplot(1, 2, 2)
ax_right.scatter(X_gmm[:, 0], X_gmm[:, 1], alpha=0.5, s=30, c='gray')
ax_right.set_title('Mixed Data (Labels Hidden)')
ax_right.set_xlabel('x₁')
ax_right.set_ylabel('xβ‚‚')
ax_right.grid(True, alpha=0.3)
ax_right.axis('equal')

plt.tight_layout()
plt.show()

4. Expectation-Maximization (EM) Algorithm

Algorithm

E-step: Compute responsibilities (posterior probabilities)
\[r_{ik} = \frac{\pi_k \mathcal{N}(\mathbf{x}_i|\boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)}{\sum_{j=1}^{K} \pi_j \mathcal{N}(\mathbf{x}_i|\boldsymbol{\mu}_j, \boldsymbol{\Sigma}_j)}\]

M-step: Update parameters
\[\pi_k = \frac{1}{N}\sum_{i=1}^{N} r_{ik}\]
\[\boldsymbol{\mu}_k = \frac{\sum_{i=1}^{N} r_{ik}\,\mathbf{x}_i}{\sum_{i=1}^{N} r_{ik}}\]
\[\boldsymbol{\Sigma}_k = \frac{\sum_{i=1}^{N} r_{ik}\,(\mathbf{x}_i - \boldsymbol{\mu}_k)(\mathbf{x}_i - \boldsymbol{\mu}_k)^T}{\sum_{i=1}^{N} r_{ik}}\]

class GaussianMixtureEM:
    """Gaussian Mixture Model fitted with the EM algorithm.

    Parameters
    ----------
    n_components : int
        Number of Gaussian components K.
    max_iter : int
        Maximum number of EM iterations.
    tol : float
        Convergence threshold on the absolute change in log-likelihood.

    Attributes (set by fit)
    -----------------------
    weights : (K,) mixing coefficients pi_k
    means : (K, D) component means
    covariances : list of K (D, D) covariance matrices
    log_likelihoods : per-iteration log-likelihood trace
    """

    def __init__(self, n_components=3, max_iter=100, tol=1e-6):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol

    def _initialize(self, X):
        """Initialize: K distinct random data points as means, identity
        covariances, uniform mixing weights.

        (The original docstring claimed k-means++; the implementation is a
        plain uniform random choice of data points.)
        """
        n_samples, n_features = X.shape

        indices = np.random.choice(n_samples, self.n_components, replace=False)
        self.means = X[indices]
        self.covariances = [np.eye(n_features) for _ in range(self.n_components)]
        self.weights = np.ones(self.n_components) / self.n_components

    def _weighted_log_prob(self, X):
        """Return the (n_samples, K) matrix log(pi_k) + log N(x_i | mu_k, Sigma_k).

        Shared by the E-step and the log-likelihood computation, which
        previously duplicated this loop.
        """
        log_prob = np.zeros((len(X), self.n_components))
        for k in range(self.n_components):
            log_prob[:, k] = np.log(self.weights[k]) + stats.multivariate_normal(
                self.means[k], self.covariances[k], allow_singular=True).logpdf(X)
        return log_prob

    def _e_step(self, X):
        """Expectation step: responsibilities r_ik = p(z_i = k | x_i).

        Normalization is done in log space with logsumexp.  The previous
        linear-space version divided by the row sums of the densities, which
        becomes 0/0 (NaN) as soon as every component's pdf underflows to
        zero for some point.
        """
        log_prob = self._weighted_log_prob(X)
        return np.exp(log_prob - logsumexp(log_prob, axis=1, keepdims=True))

    def _m_step(self, X, responsibilities):
        """Maximization step: closed-form parameter updates given responsibilities."""
        n_samples, n_features = X.shape

        # Effective number of points assigned to each component.
        nk = responsibilities.sum(axis=0)
        self.weights = nk / n_samples

        # Responsibility-weighted means.
        self.means = (responsibilities.T @ X) / nk[:, np.newaxis]

        # Responsibility-weighted covariances.
        for k in range(self.n_components):
            diff = X - self.means[k]
            self.covariances[k] = (responsibilities[:, k:k+1] * diff).T @ diff / nk[k]
            # Small ridge keeps the covariance positive definite even when a
            # component collapses onto very few points.
            self.covariances[k] += np.eye(n_features) * 1e-6

    def _compute_log_likelihood(self, X):
        """Observed-data log-likelihood: sum_i log sum_k pi_k N(x_i | mu_k, Sigma_k)."""
        return logsumexp(self._weighted_log_prob(X), axis=1).sum()

    def fit(self, X, verbose=False):
        """Run EM until the log-likelihood improvement drops below tol.

        Records the per-iteration trace in self.log_likelihoods and
        returns self.
        """
        self._initialize(X)
        self.log_likelihoods = []

        for iteration in range(self.max_iter):
            # E-step
            responsibilities = self._e_step(X)

            # M-step
            self._m_step(X, responsibilities)

            # Track progress
            log_likelihood = self._compute_log_likelihood(X)
            self.log_likelihoods.append(log_likelihood)

            if verbose and iteration % 10 == 0:
                print(f"Iteration {iteration}: log-likelihood = {log_likelihood:.4f}")

            # EM never decreases the likelihood, so a tiny change means convergence.
            if iteration > 0 and abs(self.log_likelihoods[-1] - self.log_likelihoods[-2]) < self.tol:
                if verbose:
                    print(f"Converged at iteration {iteration}")
                break

        return self

    def predict(self, X):
        """Hard cluster assignment: index of the most responsible component."""
        responsibilities = self._e_step(X)
        return np.argmax(responsibilities, axis=1)

# Fit custom GMM
# The custom implementation draws its random initialization from the global
# NumPy RNG, so results depend on the current seed state.
gmm_custom = GaussianMixtureEM(n_components=3, max_iter=100)
gmm_custom.fit(X_gmm, verbose=True)
labels_custom = gmm_custom.predict(X_gmm)

# Fit sklearn GMM
# random_state is fixed so sklearn's initialization is reproducible.
gmm_sklearn = GaussianMixture(n_components=3, random_state=42)
gmm_sklearn.fit(X_gmm)
labels_sklearn = gmm_sklearn.predict(X_gmm)

# Visualize results: one panel per labeling -- ground truth, our EM, sklearn.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

panels = [
    ('True Labels', labels_true, list(zip(means, covs))),
    ('Custom EM Algorithm', labels_custom,
     list(zip(gmm_custom.means, gmm_custom.covariances))),
    ('Sklearn GMM', labels_sklearn,
     list(zip(gmm_sklearn.means_, gmm_sklearn.covariances_))),
]

for ax, (title, point_labels, components) in zip(axes, panels):
    ax.scatter(X_gmm[:, 0], X_gmm[:, 1], c=point_labels, cmap='viridis',
               alpha=0.6, s=30)
    for mean, cov in components:
        plot_gaussian_contours(mean, cov, ax, color='red', alpha=0.8)
    ax.set_title(title)
    ax.set_xlabel('x₁')
    ax.set_ylabel('xβ‚‚')
    ax.grid(True, alpha=0.3)
    ax.axis('equal')

plt.tight_layout()
plt.show()

# Plot the per-iteration log-likelihood trace recorded during fitting.
fig_conv, ax_conv = plt.subplots(figsize=(10, 5))
ax_conv.plot(gmm_custom.log_likelihoods, 'o-', linewidth=2)
ax_conv.set_xlabel('Iteration')
ax_conv.set_ylabel('Log Likelihood')
ax_conv.set_title('EM Algorithm Convergence')
ax_conv.grid(True, alpha=0.3)
plt.show()

5. Missing Data Imputation with EM

What: This code tackles a common real-world problem: filling in missing values using the EM algorithm with a Gaussian model. Starting from data with 30% values missing completely at random (MCAR), it compares naive mean imputation against iterative EM imputation that exploits feature correlations.

Why: Missing data is pervasive in healthcare records, sensor networks, and survey data. Mean imputation destroys correlations and underestimates variance, leading to biased downstream models. EM imputation preserves the joint distribution structure by iteratively (1) estimating Gaussian parameters \(\boldsymbol{\mu}, \boldsymbol{\Sigma}\) from current imputed data (M-step), then (2) computing conditional expectations \(\mathbb{E}[x_{missing} | x_{observed}, \boldsymbol{\mu}, \boldsymbol{\Sigma}]\) to re-impute missing values (E-step). Each iteration increases the observed-data log-likelihood, converging to a local maximum.

How: The em_imputation function uses the Gaussian conditional formula: for each row with missing entries, the imputed values are \(\hat{\mathbf{x}}_m = \boldsymbol{\mu}_m + \boldsymbol{\Sigma}_{mo}\boldsymbol{\Sigma}_{oo}^{-1}(\mathbf{x}_o - \boldsymbol{\mu}_o)\). The RMSE comparison quantifies how much better EM preserves the true values compared to mean imputation. The contour plots visually confirm that EM recovers the correlation structure while mean imputation distorts it.

Connection: EM-based imputation is the engine behind the MICE algorithm widely used in clinical trials and epidemiology. Modern extensions handle mixed data types and non-Gaussian distributions, and the same principle underlies missing-data handling in recommendation systems (matrix completion) and time-series forecasting with gaps.

# Generate complete data
mu_complete = np.array([5, 10])
cov_complete = np.array([[2, 1.5], [1.5, 3]])
X_complete = np.random.multivariate_normal(mu_complete, cov_complete, 200)

# Introduce missing data (MCAR - Missing Completely At Random)
# Each entry is dropped independently with probability 0.3.
X_missing = X_complete.copy()
missing_mask = np.random.random(X_complete.shape) < 0.3  # 30% missing
X_missing[missing_mask] = np.nan

print(f"Total values: {X_complete.size}")
print(f"Missing values: {np.sum(missing_mask)}")
print(f"Missing percentage: {100 * np.sum(missing_mask) / X_complete.size:.1f}%")

# Simple imputation: mean imputation
# Baseline: replace each missing entry with its column's observed mean
# (nanmean ignores the NaNs).  This ignores inter-feature correlation.
X_mean_imputed = X_missing.copy()
col_means = np.nanmean(X_missing, axis=0)
for i in range(X_missing.shape[1]):
    X_mean_imputed[missing_mask[:, i], i] = col_means[i]

# EM-based imputation using Gaussian model
def em_imputation(X_missing, max_iter=50, tol=1e-6):
    """Impute missing values (NaNs) under a joint-Gaussian model via EM.

    Alternates between:
      M-step: estimate (mu, Sigma) from the currently imputed data;
      E-step: replace each row's missing entries with their conditional
              expectation  mu_m + S_mo S_oo^{-1} (x_o - mu_o).

    Parameters
    ----------
    X_missing : (n_samples, n_features) array with np.nan marking missing entries.
    max_iter : int
        Maximum number of EM sweeps.
    tol : float
        Stop when the Frobenius norm of the change in the imputed matrix
        falls below this.

    Returns
    -------
    (n_samples, n_features) array with NaNs replaced; observed entries are
    returned unchanged, and the input array is not modified.
    """
    X_imputed = X_missing.copy()

    # The missingness pattern never changes, so compute it once up front
    # instead of re-evaluating np.isnan per row inside every EM sweep.
    nan_mask = np.isnan(X_missing)
    rows_with_missing = np.where(nan_mask.any(axis=1))[0]

    # Initialize with per-column mean imputation (nanmean ignores the NaNs).
    col_means = np.nanmean(X_missing, axis=0)
    for j in range(X_missing.shape[1]):
        X_imputed[nan_mask[:, j], j] = col_means[j]

    for iteration in range(max_iter):
        # M-step: estimate parameters from imputed data
        mu_est = np.mean(X_imputed, axis=0)
        cov_est = np.cov(X_imputed.T)

        # E-step: re-impute missing values
        X_prev = X_imputed.copy()

        for i in rows_with_missing:
            missing_idx = nan_mask[i]
            observed_idx = ~missing_idx

            # A fully-missing row carries no information to condition on;
            # it keeps its current (mean-based) imputation.
            if np.any(observed_idx):
                mu_o = mu_est[observed_idx]
                mu_m = mu_est[missing_idx]
                cov_oo = cov_est[np.ix_(observed_idx, observed_idx)]
                cov_mo = cov_est[np.ix_(missing_idx, observed_idx)]

                x_o = X_missing[i, observed_idx]

                # Conditional expectation (solve avoids an explicit inverse).
                x_m = mu_m + cov_mo @ np.linalg.solve(cov_oo, x_o - mu_o)
                X_imputed[i, missing_idx] = x_m

        # Stop once the imputations have stabilized.
        change = np.linalg.norm(X_imputed - X_prev)
        if change < tol:
            print(f"Converged at iteration {iteration}")
            break

    return X_imputed

X_em_imputed = em_imputation(X_missing)

# Compare the two imputation methods.
# RMSE is computed only over the entries that were originally missing.
mean_residuals = X_complete[missing_mask] - X_mean_imputed[missing_mask]
em_residuals = X_complete[missing_mask] - X_em_imputed[missing_mask]
mean_imp_error = np.sqrt(np.mean(mean_residuals ** 2))
em_imp_error = np.sqrt(np.mean(em_residuals ** 2))

print(f"\nImputation RMSE:")
print(f"Mean imputation: {mean_imp_error:.4f}")
print(f"EM imputation: {em_imp_error:.4f}")

# Visualize the original data, the missingness pattern, and both imputations.
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Top-left: the fully observed dataset with its true Gaussian contours.
ax = axes[0, 0]
ax.scatter(X_complete[:, 0], X_complete[:, 1], alpha=0.6, s=50)
plot_gaussian_contours(mu_complete, cov_complete, ax, color='red')
ax.set_title('Complete Data')
ax.set_xlabel('x₁')
ax.set_ylabel('xβ‚‚')
ax.grid(True, alpha=0.3)

# Top-right: rows split by whether any coordinate is missing.
complete_rows = ~np.any(np.isnan(X_missing), axis=1)
ax = axes[0, 1]
ax.scatter(X_missing[complete_rows, 0], X_missing[complete_rows, 1],
           alpha=0.6, s=50, label='Complete')
ax.scatter(X_missing[~complete_rows, 0], X_missing[~complete_rows, 1],
           alpha=0.3, s=50, color='red', label='Incomplete')
ax.set_title(f'Data with {100*np.sum(missing_mask)/X_complete.size:.1f}% Missing')
ax.set_xlabel('x₁')
ax.set_ylabel('xβ‚‚')
ax.legend()
ax.grid(True, alpha=0.3)

# Bottom row: each imputation method with contours of the Gaussian
# re-estimated from its imputed data.
for ax, X_imp, err, method in [
    (axes[1, 0], X_mean_imputed, mean_imp_error, 'Mean Imputation'),
    (axes[1, 1], X_em_imputed, em_imp_error, 'EM Imputation'),
]:
    ax.scatter(X_imp[:, 0], X_imp[:, 1], alpha=0.6, s=50)
    ax.scatter(X_imp[missing_mask[:, 0], 0],
               X_imp[missing_mask[:, 0], 1],
               color='red', alpha=0.8, s=50, label='Imputed')
    mu_fit = np.mean(X_imp, axis=0)
    cov_fit = np.cov(X_imp.T)
    plot_gaussian_contours(mu_fit, cov_fit, ax, color='green')
    ax.set_title(f'{method} (RMSE={err:.3f})')
    ax.set_xlabel('x₁')
    ax.set_ylabel('xβ‚‚')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Summary

In this notebook, we covered:

  1. Multivariate Gaussian Distribution: Properties, marginals, and conditionals

  2. Maximum Likelihood Estimation: Parameter estimation and convergence

  3. Gaussian Mixture Models: Modeling complex distributions

  4. EM Algorithm: Iterative parameter estimation with latent variables

  5. Missing Data Imputation: Using EM for handling incomplete data

Key Takeaways

  • Multivariate Gaussians are closed under marginalization and conditioning

  • MLE provides simple closed-form solutions for Gaussian parameters

  • GMMs can model complex, multi-modal distributions

  • EM algorithm elegantly handles latent variables and missing data

  • EM iteratively improves likelihood but may converge to local optima

Exercises

  1. Implement model selection for GMMs using BIC or AIC

  2. Derive the conditional distribution formulas for multivariate Gaussian

  3. Implement diagonal and spherical covariance constraints for GMM

  4. Compare EM imputation with other methods (k-NN, regression)

  5. Visualize the EM algorithm’s parameter evolution over iterations