# Setup
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.mixture import GaussianMixture
from sklearn.datasets import make_blobs, make_moons
from matplotlib.patches import Ellipse
from scipy.spatial.distance import cdist

# Global plotting style and numeric display settings for the whole notebook.
sns.set_style('whitegrid')
np.random.seed(42)  # reproducible randomness for every cell below
np.set_printoptions(precision=4, suppress=True)  # compact, non-scientific array printing

11.1 Gaussian Mixture Model

Mixture Model:

\[p(\mathbf{x}) = \sum_{k=1}^K \pi_k p_k(\mathbf{x})\]

where:

  • K: number of components (clusters)

  • π_k: mixing coefficient (probability of cluster k)

  • p_k(x): component distribution (Gaussian)

GMM Specifically:

\[p(\mathbf{x}) = \sum_{k=1}^K \pi_k \mathcal{N}(\mathbf{x} | \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)\]

Parameters θ:

  • Means: μ_1, …, μ_K

  • Covariances: Σ_1, …, Σ_K

  • Mixing coefficients: π_1, …, π_K (with Σ_k π_k = 1)

Constraints:

\[\sum_{k=1}^K \pi_k = 1, \quad \pi_k \geq 0\]
# Generate data from a GMM

# Re-seed immediately before sampling so this cell is reproducible on its own.
np.random.seed(42)

def generate_gmm_data(n_samples=300, n_components=3):
    """Generate 2-D data from a predefined Gaussian Mixture Model.

    Parameters
    ----------
    n_samples : int
        Number of points to draw.
    n_components : int
        Number of mixture components to use, between 1 and 3. The first
        `n_components` predefined components are used and their mixing
        weights are renormalized to sum to 1. (Previously any value other
        than 3 crashed because the parameters were hard-coded for K=3.)

    Returns
    -------
    X : (n_samples, 2) ndarray of samples
    y : (n_samples,) ndarray of true component labels
    means, covariances, weights : the mixture parameters actually used

    Raises
    ------
    ValueError
        If `n_components` is outside [1, 3].
    """
    # Predefined component parameters (3 components available).
    all_means = np.array([[0, 0], [3, 3], [-2, 3]])
    all_covariances = [
        [[1.0, 0.5], [0.5, 1.0]],   # correlated
        [[0.8, 0.0], [0.0, 0.8]],   # spherical
        [[2.0, -0.5], [-0.5, 0.5]]  # elongated
    ]
    all_weights = np.array([0.3, 0.4, 0.3])

    if not 1 <= n_components <= len(all_means):
        raise ValueError(f"n_components must be in [1, {len(all_means)}]")

    # Restrict to the requested number of components and renormalize the
    # weights so they remain a valid probability distribution.
    means = all_means[:n_components]
    covariances = all_covariances[:n_components]
    weights = all_weights[:n_components] / all_weights[:n_components].sum()

    # Ancestral sampling: first pick a component, then draw from its Gaussian.
    X = []
    y = []

    for i in range(n_samples):
        # Choose component
        k = np.random.choice(n_components, p=weights)
        # Sample from that component
        x = np.random.multivariate_normal(means[k], covariances[k])
        X.append(x)
        y.append(k)

    return np.array(X), np.array(y), means, covariances, weights

# Draw the dataset used throughout the rest of the notebook.
X, y_true, true_means, true_covs, true_weights = generate_gmm_data()

print("Generated GMM Data")
print(f"Shape: {X.shape}")
print(f"True cluster proportions: {true_weights}")

# Visualize
plt.figure(figsize=(10, 6))
colors = ['red', 'blue', 'green']
for k in range(3):
    # Points of component k in its own color; a big 'X' marks the true mean.
    mask = y_true == k
    plt.scatter(X[mask, 0], X[mask, 1], c=colors[k], alpha=0.6, s=50, 
               label=f'Component {k+1} ({true_weights[k]:.1f})')
    plt.scatter(true_means[k, 0], true_means[k, 1], c=colors[k], 
               marker='X', s=300, edgecolors='black', linewidths=2)

plt.xlabel('x₁', fontsize=12)
plt.ylabel('xβ‚‚', fontsize=12)
plt.title('Data Generated from Gaussian Mixture Model', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()

print("\nGoal: Learn the parameters (means, covariances, weights) from data alone!")

11.2 Expectation-Maximization (EM) Algorithm

Problem:

Maximum likelihood estimation is hard because the logarithm does not distribute over the sum inside it:

\[\log p(\mathbf{X} | \boldsymbol{\theta}) = \sum_{n=1}^N \log \left( \sum_{k=1}^K \pi_k \mathcal{N}(\mathbf{x}_n | \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k) \right)\]

Solution: EM Algorithm

Introduce latent variables z (cluster assignments) and alternate between two steps:

E-step (Expectation): Compute responsibilities
\[r_{nk} = \frac{\pi_k \mathcal{N}(\mathbf{x}_n | \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)}{\sum_{j=1}^K \pi_j \mathcal{N}(\mathbf{x}_n | \boldsymbol{\mu}_j, \boldsymbol{\Sigma}_j)}\]

M-step (Maximization): Update parameters
\[\boldsymbol{\mu}_k = \frac{\sum_n r_{nk} \mathbf{x}_n}{\sum_n r_{nk}}, \quad \boldsymbol{\Sigma}_k = \frac{\sum_n r_{nk}(\mathbf{x}_n - \boldsymbol{\mu}_k)(\mathbf{x}_n - \boldsymbol{\mu}_k)^T}{\sum_n r_{nk}}\]

\[\pi_k = \frac{1}{N}\sum_n r_{nk}\]

r_nk is the responsibility: the probability that point n belongs to cluster k.

# Implement EM algorithm from scratch

class GMM_EM:
    """Gaussian Mixture Model fitted with the EM algorithm.

    Parameters
    ----------
    n_components : int
        Number of mixture components K.
    max_iter : int
        Maximum number of EM iterations.
    tol : float
        Convergence threshold on the change in log-likelihood.

    Attributes (set by `fit`)
    -------------------------
    means : (K, d) array of component means
    covariances : list of K (d, d) covariance matrices
    weights : (K,) mixing coefficients (non-negative, sum to 1)
    log_likelihoods : list of per-iteration log-likelihood values
    """
    
    def __init__(self, n_components=3, max_iter=100, tol=1e-4):
        self.K = n_components
        self.max_iter = max_iter
        self.tol = tol
        
    def _initialize(self, X):
        """Initialize: K distinct data points as means, identity covariances,
        uniform mixing weights."""
        n, d = X.shape
        
        # Random initialization (distinct points, so no two means coincide).
        self.means = X[np.random.choice(n, self.K, replace=False)]
        self.covariances = [np.eye(d) for _ in range(self.K)]
        self.weights = np.ones(self.K) / self.K
        
    def _e_step(self, X):
        """E-step: compute responsibilities r_nk = P(z_n = k | x_n).

        Returns an (n, K) array whose rows sum to 1.
        """
        n = X.shape[0]
        responsibilities = np.zeros((n, self.K))
        
        for k in range(self.K):
            # Numerator of the responsibility: pi_k * N(x | mu_k, Sigma_k).
            responsibilities[:, k] = self.weights[k] * stats.multivariate_normal.pdf(
                X, mean=self.means[k], cov=self.covariances[k]
            )
        
        # Normalize each row. The floor prevents 0/0 -> NaN for points so far
        # from every component that all densities underflow to zero.
        totals = responsibilities.sum(axis=1, keepdims=True)
        responsibilities /= np.maximum(totals, 1e-300)
        
        return responsibilities
    
    def _m_step(self, X, responsibilities):
        """M-step: re-estimate means, covariances and weights via the
        closed-form responsibility-weighted maximum-likelihood updates."""
        n, d = X.shape
        
        # Effective number of points assigned to each component.
        N_k = responsibilities.sum(axis=0)
        
        # Weighted means: mu_k = sum_n r_nk x_n / N_k.
        self.means = (responsibilities.T @ X) / N_k[:, np.newaxis]
        
        # Weighted covariances around the updated means.
        for k in range(self.K):
            diff = X - self.means[k]
            self.covariances[k] = (responsibilities[:, k:k+1] * diff).T @ diff / N_k[k]
            # Small ridge keeps the covariance positive definite even when a
            # component collapses onto very few points.
            self.covariances[k] += 1e-6 * np.eye(d)
        
        # Mixing weights: pi_k = N_k / n.
        self.weights = N_k / n
        
    def _compute_log_likelihood(self, X):
        """Total log-likelihood of X under the current parameters."""
        # Mixture density p(x_n) accumulated component by component.
        density = np.zeros(X.shape[0])
        for k in range(self.K):
            density += self.weights[k] * stats.multivariate_normal.pdf(
                X, mean=self.means[k], cov=self.covariances[k]
            )
        # The 1e-10 floor avoids log(0) for numerically zero-density points.
        return np.sum(np.log(density + 1e-10))
    
    def fit(self, X):
        """Run EM on X until convergence or `max_iter` iterations.

        Returns self; afterwards `means`, `covariances`, `weights` and
        `log_likelihoods` hold the fitted state.
        """
        self._initialize(X)
        
        self.log_likelihoods = []
        
        for iteration in range(self.max_iter):
            # E-step
            responsibilities = self._e_step(X)
            
            # M-step
            self._m_step(X, responsibilities)
            
            # Track progress; EM should increase this monotonically.
            ll = self._compute_log_likelihood(X)
            self.log_likelihoods.append(ll)
            
            # Stop once the log-likelihood improvement falls below tol.
            if iteration > 0 and abs(ll - self.log_likelihoods[-2]) < self.tol:
                print(f"Converged after {iteration + 1} iterations")
                break
        
        return self
    
    def predict_proba(self, X):
        """Return the (n, K) responsibility matrix for X (soft assignments)."""
        return self._e_step(X)
    
    def predict(self, X):
        """Return the index of the most responsible component per point
        (hard assignment)."""
        return self.predict_proba(X).argmax(axis=1)

# Fit GMM
# Run the from-scratch EM implementation on the sampled dataset.
gmm = GMM_EM(n_components=3, max_iter=100)
gmm.fit(X)

print("EM Algorithm Results")
print(f"\nLearned means:")
for k in range(3):
    print(f"  Component {k+1}: {gmm.means[k]}")

print(f"\nLearned weights: {gmm.weights}")

# NOTE(review): the learned component order need not match the true order
# (label switching), so compare printed values by content, not by index.
print(f"\nTrue means:")
for k in range(3):
    print(f"  Component {k+1}: {true_means[k]}")

print(f"\nTrue weights: {true_weights}")

# Plot convergence
# EM increases the log-likelihood at every iteration; the curve should flatten.
plt.figure(figsize=(10, 5))
plt.plot(gmm.log_likelihoods, 'b-', linewidth=2)
plt.xlabel('Iteration', fontsize=12)
plt.ylabel('Log-Likelihood', fontsize=12)
plt.title('EM Algorithm Convergence', fontsize=14)
plt.grid(True, alpha=0.3)
plt.show()
# Visualize GMM fit

def plot_gmm_ellipse(mean, cov, ax, color, n_std=2.0, **kwargs):
    """Draw an `n_std`-standard-deviation covariance ellipse on `ax`.

    Parameters
    ----------
    mean : (2,) center of the ellipse
    cov : (2, 2) covariance matrix (assumed symmetric PSD)
    ax : matplotlib Axes to draw on
    color : edge color of the ellipse
    n_std : number of standard deviations the ellipse spans
    **kwargs : forwarded to matplotlib.patches.Ellipse
    """
    # eigh is the correct decomposition for a symmetric covariance: unlike
    # eig it guarantees real eigenvalues/eigenvectors (eig can return
    # complex values from round-off on near-symmetric input).
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    # Clip round-off negatives so sqrt below stays real.
    eigenvalues = np.clip(eigenvalues, 0.0, None)
    
    # Rotation of the first principal axis relative to the x-axis.
    angle = np.degrees(np.arctan2(eigenvectors[1, 0], eigenvectors[0, 0]))
    
    # Full axis lengths at n_std standard deviations.
    width, height = 2 * n_std * np.sqrt(eigenvalues)
    
    ellipse = Ellipse(mean, width, height, angle=angle, 
                     facecolor='none', edgecolor=color, linewidth=2, **kwargs)
    ax.add_patch(ellipse)

# Predict clusters
y_pred = gmm.predict(X)

# Side-by-side comparison: true generating mixture (left, dashed ellipses)
# vs the mixture learned by EM (right, solid ellipses).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# True clusters
ax = axes[0]
for k in range(3):
    mask = y_true == k
    ax.scatter(X[mask, 0], X[mask, 1], c=colors[k], alpha=0.6, s=50, 
              label=f'True cluster {k+1}')
    ax.scatter(true_means[k, 0], true_means[k, 1], c=colors[k], 
              marker='X', s=300, edgecolors='black', linewidths=2)
    plot_gmm_ellipse(true_means[k], true_covs[k], ax, colors[k], linestyle='--')

ax.set_xlabel('x₁', fontsize=12)
ax.set_ylabel('xβ‚‚', fontsize=12)
ax.set_title('True GMM (Data Generation)', fontsize=13)
ax.legend()
ax.grid(True, alpha=0.3)

# Learned clusters
ax = axes[1]
for k in range(3):
    mask = y_pred == k
    ax.scatter(X[mask, 0], X[mask, 1], c=colors[k], alpha=0.6, s=50, 
              label=f'Learned cluster {k+1}')
    ax.scatter(gmm.means[k, 0], gmm.means[k, 1], c=colors[k], 
              marker='X', s=300, edgecolors='black', linewidths=2)
    plot_gmm_ellipse(gmm.means[k], gmm.covariances[k], ax, colors[k])

ax.set_xlabel('x₁', fontsize=12)
ax.set_ylabel('xβ‚‚', fontsize=12)
ax.set_title('Learned GMM (EM Algorithm)', fontsize=13)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("EM successfully recovered the true clusters!")

11.3 Soft vs Hard Clustering

Hard Clustering (K-means):

Each point is assigned to exactly one cluster: \(z_n \in \{1, 2, ..., K\}\)

Soft Clustering (GMM):

Each point has a probability of belonging to each cluster: \(r_{nk} = P(z_n = k | \mathbf{x}_n)\)

Advantage: Captures uncertainty in cluster assignment!

# Compare K-means (hard) vs GMM (soft)

from sklearn.cluster import KMeans

# Fit K-means
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
y_kmeans = kmeans.fit_predict(X)

# Get GMM responsibilities
responsibilities = gmm.predict_proba(X)

# Find points with uncertain assignments (high entropy).
# Shannon entropy of each point's responsibility distribution; the 1e-10
# offset avoids log(0). Max possible entropy for 3 clusters is ln 3 ~ 1.10.
entropy = -np.sum(responsibilities * np.log(responsibilities + 1e-10), axis=1)
uncertain_mask = entropy > 0.8  # High uncertainty (empirical threshold)

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# K-means (hard clustering)
ax = axes[0]
for k in range(3):
    mask = y_kmeans == k
    ax.scatter(X[mask, 0], X[mask, 1], c=colors[k], alpha=0.6, s=50)
    ax.scatter(kmeans.cluster_centers_[k, 0], kmeans.cluster_centers_[k, 1], 
              c=colors[k], marker='X', s=300, edgecolors='black', linewidths=2)
ax.set_xlabel('x₁')
ax.set_ylabel('xβ‚‚')
ax.set_title('K-means (Hard Clustering)')
ax.grid(True, alpha=0.3)

# GMM (soft clustering - showing hard assignment)
ax = axes[1]
for k in range(3):
    mask = y_pred == k
    ax.scatter(X[mask, 0], X[mask, 1], c=colors[k], alpha=0.6, s=50)
    ax.scatter(gmm.means[k, 0], gmm.means[k, 1], c=colors[k], 
              marker='X', s=300, edgecolors='black', linewidths=2)
    plot_gmm_ellipse(gmm.means[k], gmm.covariances[k], ax, colors[k])
ax.set_xlabel('x₁')
ax.set_ylabel('xβ‚‚')
ax.set_title('GMM (Soft Clustering - Hard Assignment)')
ax.grid(True, alpha=0.3)

# GMM uncertainty (entropy heat map; uncertain points circled in red)
ax = axes[2]
scatter = ax.scatter(X[:, 0], X[:, 1], c=entropy, cmap='viridis', 
                    alpha=0.6, s=50)
# Highlight uncertain points
ax.scatter(X[uncertain_mask, 0], X[uncertain_mask, 1], 
          facecolors='none', edgecolors='red', s=100, linewidths=2,
          label='Uncertain points')
plt.colorbar(scatter, ax=ax, label='Entropy (Uncertainty)')
ax.set_xlabel('x₁')
ax.set_ylabel('xβ‚‚')
ax.set_title('GMM Uncertainty')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Soft Clustering Advantages:")
print(f"  - {uncertain_mask.sum()} points have uncertain assignments")
print("  - These are typically near cluster boundaries")
print("  - GMM quantifies this uncertainty; K-means doesn't")

# Show example uncertain point
idx = np.argmax(entropy)
print(f"\nMost uncertain point: {X[idx]}")
print(f"  Responsibilities: {responsibilities[idx]}")
print(f"  Entropy: {entropy[idx]:.3f}")
# Visualize responsibilities with pie charts

# Select representative points from each cluster and boundaries
sample_indices = []

# One clear point from each cluster
for k in range(3):
    cluster_points = np.where(y_pred == k)[0]
    cluster_responsibilities = responsibilities[cluster_points]  # NOTE(review): unused
    # Find most confident point (lowest entropy)
    most_confident_idx = cluster_points[np.argmin(entropy[cluster_points])]
    sample_indices.append(most_confident_idx)

# Two uncertain points (high entropy)
uncertain_indices = np.argsort(entropy)[-2:]
sample_indices.extend(uncertain_indices)

fig, ax = plt.subplots(figsize=(12, 8))

# Plot all points
ax.scatter(X[:, 0], X[:, 1], c='lightgray', alpha=0.3, s=30)

# Plot selected points with pie charts
for idx in sample_indices:
    # Position
    x, y = X[idx]
    
    # Create small pie chart
    pie_size = 0.3
    pie_ax = fig.add_axes([0, 0, pie_size, pie_size])
    pie_ax.pie(responsibilities[idx], colors=colors, startangle=90)
    
    # Position pie chart
    # Convert data coordinates to figure coordinates
    # NOTE(review): positions are computed from the current layout; if the
    # figure is resized afterwards the pies may drift from their points.
    trans = ax.transData.transform([x, y])
    inv_trans = fig.transFigure.inverted().transform(trans)
    pie_ax.set_position([inv_trans[0] - pie_size/2, 
                         inv_trans[1] - pie_size/2, 
                         pie_size, pie_size])
    
    # Draw line to point
    ax.plot(x, y, 'ko', markersize=10)
    ax.annotate(f'{entropy[idx]:.2f}', xy=(x, y), xytext=(x+0.3, y+0.3),
               fontsize=10, bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Plot cluster centers
for k in range(3):
    ax.scatter(gmm.means[k, 0], gmm.means[k, 1], c=colors[k], 
              marker='X', s=300, edgecolors='black', linewidths=2)
    plot_gmm_ellipse(gmm.means[k], gmm.covariances[k], ax, colors[k])

ax.set_xlabel('x₁', fontsize=12)
ax.set_ylabel('xβ‚‚', fontsize=12)
ax.set_title('GMM Soft Assignments (Pie Charts Show Responsibilities)', fontsize=14)
ax.grid(True, alpha=0.3)

plt.show()

print("Pie charts show cluster probabilities for each point")
print("  - Confident points: One dominant color")
print("  - Uncertain points: Mixed colors")

11.4 Model Selection: Choosing K

Problem:

How many clusters should we use?

Methods:

  1. Elbow Method: Plot log-likelihood vs K

  2. Information Criteria: AIC, BIC

  3. Cross-Validation: Hold-out validation

  4. Domain Knowledge: Use prior information

BIC (Bayesian Information Criterion):

\[BIC = -2 \log p(\mathbf{X} | \boldsymbol{\theta}_{ML}) + p \log N\]

where:

  • p = number of parameters

  • N = number of data points

Lower BIC is better (the p log N term penalizes model complexity).

# Model selection: Compare different K values

K_values = range(1, 8)
bic_scores = []
aic_scores = []
log_likelihoods = []

for K in K_values:
    # Fit GMM with sklearn (faster)
    gmm_k = GaussianMixture(n_components=K, random_state=42, n_init=10)
    gmm_k.fit(X)
    
    bic_scores.append(gmm_k.bic(X))
    aic_scores.append(gmm_k.aic(X))
    # score(X) is the *average* per-sample log-likelihood, so multiply by N
    # to get the total log-likelihood.
    log_likelihoods.append(gmm_k.score(X) * len(X))

# Visualize
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Log-likelihood (monotonically improves with K -> cannot pick K by itself)
axes[0].plot(K_values, log_likelihoods, 'bo-', linewidth=2, markersize=8)
axes[0].axvline(x=3, color='r', linestyle='--', linewidth=2, label='True K=3')
axes[0].set_xlabel('Number of Components (K)')
axes[0].set_ylabel('Log-Likelihood')
axes[0].set_title('Log-Likelihood vs K')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# BIC (lower is better)
axes[1].plot(K_values, bic_scores, 'rs-', linewidth=2, markersize=8)
axes[1].axvline(x=3, color='g', linestyle='--', linewidth=2, label='True K=3')
axes[1].axvline(x=K_values[np.argmin(bic_scores)], color='b', linestyle='--', 
               linewidth=2, label=f'BIC optimal K={K_values[np.argmin(bic_scores)]}')
axes[1].set_xlabel('Number of Components (K)')
axes[1].set_ylabel('BIC Score')
axes[1].set_title('BIC vs K (Lower is Better)')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

# AIC (lower is better)
axes[2].plot(K_values, aic_scores, 'gs-', linewidth=2, markersize=8)
axes[2].axvline(x=3, color='r', linestyle='--', linewidth=2, label='True K=3')
axes[2].axvline(x=K_values[np.argmin(aic_scores)], color='b', linestyle='--', 
               linewidth=2, label=f'AIC optimal K={K_values[np.argmin(aic_scores)]}')
axes[2].set_xlabel('Number of Components (K)')
axes[2].set_ylabel('AIC Score')
axes[2].set_title('AIC vs K (Lower is Better)')
axes[2].legend()
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Model Selection Results:")
print(f"\nTrue K: 3")
print(f"BIC optimal K: {K_values[np.argmin(bic_scores)]}")
print(f"AIC optimal K: {K_values[np.argmin(aic_scores)]}")

print("\nObservations:")
print("  - Log-likelihood always increases with K (overfitting)")
print("  - BIC/AIC penalize complexity")
print("  - BIC correctly identifies K=3!")

11.5 GMM Applications and Limitations

Applications:

  1. Clustering: Soft assignments with uncertainty

  2. Density Estimation: Model complex distributions

  3. Anomaly Detection: Low probability → anomaly

  4. Image Segmentation: Cluster pixels

  5. Speech Recognition: Model phonemes

Advantages:

✅ Soft clustering (probabilistic) ✅ Handles elliptical clusters ✅ Principled probabilistic framework ✅ EM guarantees convergence (to a local optimum)

Limitations:

❌ Assumes Gaussian components ❌ Sensitive to initialization ❌ Can get stuck in local optima ❌ Need to choose K ❌ Computational cost for large datasets

# Demonstrate GMM limitations: Non-Gaussian data

# Generate non-Gaussian data (two moons)
from sklearn.datasets import make_moons  # NOTE(review): already imported at top of file
X_moons, y_moons = make_moons(n_samples=300, noise=0.1, random_state=42)

# Fit GMM with K=2
gmm_moons = GaussianMixture(n_components=2, random_state=42)
y_gmm_moons = gmm_moons.fit_predict(X_moons)

# Fit K-means for comparison
kmeans_moons = KMeans(n_clusters=2, random_state=42, n_init=10)
y_kmeans_moons = kmeans_moons.fit_predict(X_moons)

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# True clusters
axes[0].scatter(X_moons[:, 0], X_moons[:, 1], c=y_moons, cmap='viridis', s=50)
axes[0].set_title('True Clusters (Non-Gaussian)')
axes[0].set_xlabel('x₁')
axes[0].set_ylabel('xβ‚‚')
axes[0].grid(True, alpha=0.3)

# K-means
axes[1].scatter(X_moons[:, 0], X_moons[:, 1], c=y_kmeans_moons, cmap='viridis', s=50)
axes[1].scatter(kmeans_moons.cluster_centers_[:, 0], 
               kmeans_moons.cluster_centers_[:, 1],
               c='red', marker='X', s=300, edgecolors='black', linewidths=2)
axes[1].set_title('K-means (Fails)')
axes[1].set_xlabel('x₁')
axes[1].set_ylabel('xβ‚‚')
axes[1].grid(True, alpha=0.3)

# GMM: draw the two learned Gaussian ellipses over the crescents
axes[2].scatter(X_moons[:, 0], X_moons[:, 1], c=y_gmm_moons, cmap='viridis', s=50)
for k in range(2):
    plot_gmm_ellipse(gmm_moons.means_[k], gmm_moons.covariances_[k], 
                    axes[2], f'C{k}')
axes[2].set_title('GMM (Also Fails)')
axes[2].set_xlabel('x₁')
axes[2].set_ylabel('xβ‚‚')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Limitation: GMM assumes Gaussian components")
print("  - Two moons are non-Gaussian (crescent-shaped)")
print("  - Both K-means and GMM fail")
print("  - Solutions: Kernel methods, spectral clustering, DBSCAN")
# GMM for density estimation and anomaly detection

# Fit GMM to original data
gmm_density = GaussianMixture(n_components=3, random_state=42)
gmm_density.fit(X)

# Create grid covering the data plus a 2-unit margin on each side.
x1_range = np.linspace(X[:, 0].min() - 2, X[:, 0].max() + 2, 100)
x2_range = np.linspace(X[:, 1].min() - 2, X[:, 1].max() + 2, 100)
X1_grid, X2_grid = np.meshgrid(x1_range, x2_range)
X_grid = np.column_stack([X1_grid.ravel(), X2_grid.ravel()])

# Compute log-likelihood at each grid point
log_prob = gmm_density.score_samples(X_grid)
prob = np.exp(log_prob).reshape(X1_grid.shape)

# Compute log-likelihood for data points
log_prob_data = gmm_density.score_samples(X)

# Identify anomalies (low probability)
threshold = np.percentile(log_prob_data, 5)  # Bottom 5%
anomalies = log_prob_data < threshold

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Density estimation: filled contours of the mixture density with data overlaid
ax = axes[0]
contour = ax.contourf(X1_grid, X2_grid, prob, levels=20, cmap='viridis', alpha=0.6)
ax.contour(X1_grid, X2_grid, prob, levels=10, colors='black', linewidths=0.5, alpha=0.3)
ax.scatter(X[:, 0], X[:, 1], c='white', s=30, edgecolors='black', linewidths=0.5, alpha=0.8)
plt.colorbar(contour, ax=ax, label='Probability Density')
ax.set_xlabel('x₁')
ax.set_ylabel('xβ‚‚')
ax.set_title('GMM Density Estimation')

# Anomaly detection: flag the lowest-density 5% of the data
ax = axes[1]
ax.scatter(X[~anomalies, 0], X[~anomalies, 1], c='blue', s=50, alpha=0.6, label='Normal')
ax.scatter(X[anomalies, 0], X[anomalies, 1], c='red', s=100, marker='x', 
          linewidths=2, label='Anomalies')
for k in range(3):
    plot_gmm_ellipse(gmm_density.means_[k], gmm_density.covariances_[k], ax, 'black')
ax.set_xlabel('x₁')
ax.set_ylabel('xβ‚‚')
ax.set_title('Anomaly Detection')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Anomaly Detection:")
print(f"  - Identified {anomalies.sum()} anomalies (bottom 5% probability)")
print(f"  - These are far from all cluster centers")

Summary

Key Concepts from Chapter 11:

  1. GMM: Mixture of Gaussian distributions

  2. EM Algorithm: Iterative parameter estimation

  3. E-step: Compute responsibilities (soft assignments)

  4. M-step: Update parameters (means, covariances, weights)

  5. Soft Clustering: Probabilistic assignments with uncertainty

  6. Model Selection: BIC, AIC for choosing K

EM Algorithm:

Step

Operation

Formula

E-step

Compute responsibilities

r_nk = π_k N(x_n | μ_k, Σ_k) / Σ_j π_j N(x_n | μ_j, Σ_j)

M-step

Update means

μ_k = Σ_n r_nk x_n / Σ_n r_nk

M-step

Update covariances

Σ_k = Σ_n r_nk (x_n - μ_k)(x_n - μ_k)^T / Σ_n r_nk

M-step

Update weights

π_k = (1/N) Σ_n r_nk

GMM vs K-means:

Aspect

K-means

GMM

Assignment

Hard (one cluster)

Soft (probabilities)

Cluster shape

Spherical

Elliptical

Framework

Geometric

Probabilistic

Uncertainty

No

Yes

Speed

Fast

Slower

When to Use GMM:

✅ Good for:

  • Need soft clustering

  • Elliptical clusters

  • Density estimation

  • Anomaly detection

  • Uncertainty quantification

❌ Not good for:

  • Non-Gaussian data

  • Very large datasets

  • Need fast clustering

  • Complex cluster shapes

Connection to Other Chapters:

Concept

Chapter

Gaussian distribution

Chapter 6

Multivariate Gaussians

Chapter 6

Maximum likelihood

Chapter 6, 9

Covariance matrix

Chapter 6

Optimization (EM)

Chapter 7

Practical Tips:

  1. Initialize well: Run multiple times with different initializations

  2. Standardize data: Especially if features have different scales

  3. Use BIC/AIC: For model selection

  4. Check convergence: Monitor log-likelihood

  5. Visualize: Plot responsibilities and cluster ellipses

  6. Consider alternatives: Spectral clustering, DBSCAN for non-Gaussian

Next Steps:

  • Chapter 12: SVM (classification with margin maximization)

  • Advanced: Variational inference, Dirichlet Process GMM

  • Applications: Image segmentation, speech recognition

Practice: Apply GMM to a real dataset and interpret the clusters!