# Setup
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.mixture import GaussianMixture
from sklearn.datasets import make_blobs, make_moons
from matplotlib.patches import Ellipse
from scipy.spatial.distance import cdist
# Global configuration for the whole notebook: seaborn plot style,
# a fixed RNG seed for reproducible sampling, and compact array printing.
sns.set_style('whitegrid')
np.random.seed(42)
np.set_printoptions(precision=4, suppress=True)
11.1 Gaussian Mixture Model
Mixture Model:
$p(\mathbf{x}) = \sum_{k=1}^{K} \pi_k \, p_k(\mathbf{x})$
where:
K: number of components (clusters)
π_k: mixing coefficient (probability of cluster k)
p_k(x): component distribution (Gaussian)
GMM Specifically:
$p(\mathbf{x}) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}(\mathbf{x} \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)$
Parameters θ:
Means: μ_1, …, μ_K
Covariances: Σ_1, …, Σ_K
Mixing coefficients: π_1, …, π_K (Σ_k π_k = 1)
Constraints:
$0 \le \pi_k \le 1, \qquad \sum_{k=1}^{K} \pi_k = 1$
# Generate data from a GMM
np.random.seed(42)
def generate_gmm_data(n_samples=300, n_components=3):
    """Generate data from a Gaussian Mixture Model.

    Returns (X, y, means, covariances, weights) where X is (n_samples, 2),
    y holds the generating component index of each sample, and the last
    three are the ground-truth mixture parameters.
    """
    # Ground-truth parameters of a 3-component, 2-D mixture.
    means = np.array([[0, 0], [3, 3], [-2, 3]])
    covariances = [
        [[1.0, 0.5], [0.5, 1.0]],    # Correlated
        [[0.8, 0.0], [0.0, 0.8]],    # Spherical
        [[2.0, -0.5], [-0.5, 0.5]]   # Elongated
    ]
    weights = np.array([0.3, 0.4, 0.3])
    samples, labels = [], []
    # Ancestral sampling: pick a component by its mixing weight,
    # then draw one point from that component's Gaussian.
    for _ in range(n_samples):
        component = np.random.choice(n_components, p=weights)
        samples.append(np.random.multivariate_normal(means[component],
                                                     covariances[component]))
        labels.append(component)
    return np.array(samples), np.array(labels), means, covariances, weights
# Draw one dataset from the mixture and keep the ground-truth parameters
# around for later comparison against the EM estimates.
X, y_true, true_means, true_covs, true_weights = generate_gmm_data()
print("Generated GMM Data")
print(f"Shape: {X.shape}")
print(f"True cluster proportions: {true_weights}")
# Visualize: one color per generating component, 'X' marks each true mean.
plt.figure(figsize=(10, 6))
colors = ['red', 'blue', 'green']
for k in range(3):
    mask = y_true == k
    plt.scatter(X[mask, 0], X[mask, 1], c=colors[k], alpha=0.6, s=50,
                label=f'Component {k+1} ({true_weights[k]:.1f})')
    plt.scatter(true_means[k, 0], true_means[k, 1], c=colors[k],
                marker='X', s=300, edgecolors='black', linewidths=2)
plt.xlabel('$x_1$', fontsize=12)  # fixed mojibake axis label
plt.ylabel('$x_2$', fontsize=12)  # fixed mojibake axis label
plt.title('Data Generated from Gaussian Mixture Model', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()
print("\nGoal: Learn the parameters (means, covariances, weights) from data alone!")
11.2 Expectation-Maximization (EM) Algorithm
Problem:
Maximum likelihood estimation is hard because the log does not simplify:
$\log p(\mathbf{X} \mid \theta) = \sum_{n=1}^{N} \log \sum_{k=1}^{K} \pi_k \mathcal{N}(\mathbf{x}_n \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)$
(the log of a sum does not decouple into per-component terms)
Solution: EM Algorithm
Introduce latent variables z (cluster assignments) and alternate:
E-step (Expectation): compute responsibilities
$r_{nk} = \frac{\pi_k \mathcal{N}(\mathbf{x}_n \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)}{\sum_{j=1}^K \pi_j \mathcal{N}(\mathbf{x}_n \mid \boldsymbol{\mu}_j, \boldsymbol{\Sigma}_j)}$
M-step (Maximization): update parameters
$\boldsymbol{\mu}_k = \frac{\sum_n r_{nk} \mathbf{x}_n}{\sum_n r_{nk}}, \quad \boldsymbol{\Sigma}_k = \frac{\sum_n r_{nk}(\mathbf{x}_n - \boldsymbol{\mu}_k)(\mathbf{x}_n - \boldsymbol{\mu}_k)^T}{\sum_n r_{nk}}$
r_nk = responsibility: the probability that point n belongs to cluster k
# Implement EM algorithm from scratch
class GMM_EM:
    """Gaussian Mixture Model fitted with the EM algorithm.

    Parameters
    ----------
    n_components : int
        Number of Gaussian components K.
    max_iter : int
        Maximum number of EM iterations.
    tol : float
        Convergence threshold on the absolute change in log-likelihood.
    """
    def __init__(self, n_components=3, max_iter=100, tol=1e-4):
        self.K = n_components
        self.max_iter = max_iter
        self.tol = tol

    def _initialize(self, X):
        """Initialize: random distinct data points as means, identity
        covariances, uniform mixing weights."""
        n, d = X.shape
        self.means = X[np.random.choice(n, self.K, replace=False)]
        self.covariances = [np.eye(d) for _ in range(self.K)]
        self.weights = np.ones(self.K) / self.K

    def _e_step(self, X):
        """E-step: return the (n, K) responsibility matrix r_nk = P(z_n = k | x_n)."""
        n = X.shape[0]
        responsibilities = np.zeros((n, self.K))
        for k in range(self.K):
            # Numerator of the responsibility: pi_k * N(x | mu_k, Sigma_k).
            responsibilities[:, k] = self.weights[k] * stats.multivariate_normal.pdf(
                X, mean=self.means[k], cov=self.covariances[k]
            )
        # Normalize each row. Guard the denominator: for a far-away outlier the
        # density can underflow to 0 under *every* component, and the original
        # 0/0 division produced NaNs that poisoned all subsequent M-steps.
        totals = responsibilities.sum(axis=1, keepdims=True)
        responsibilities /= np.maximum(totals, np.finfo(float).tiny)
        return responsibilities

    def _m_step(self, X, responsibilities):
        """M-step: update weights, means and covariances from responsibilities."""
        n, d = X.shape
        # Effective number of points assigned to each cluster.
        N_k = responsibilities.sum(axis=0)
        # Responsibility-weighted means.
        self.means = (responsibilities.T @ X) / N_k[:, np.newaxis]
        for k in range(self.K):
            diff = X - self.means[k]
            # Responsibility-weighted covariance.
            self.covariances[k] = (responsibilities[:, k:k+1] * diff).T @ diff / N_k[k]
            # Small ridge keeps the covariance positive definite.
            self.covariances[k] += 1e-6 * np.eye(d)
        self.weights = N_k / n

    def _compute_log_likelihood(self, X):
        """Return sum_n log sum_k pi_k N(x_n | mu_k, Sigma_k)."""
        # Mixture density evaluated at every data point (renamed from the
        # misleading 'log_likelihood' -- it is a density, not a log yet).
        density = 0
        for k in range(self.K):
            density += self.weights[k] * stats.multivariate_normal.pdf(
                X, mean=self.means[k], cov=self.covariances[k]
            )
        # 1e-10 avoids log(0) for points with vanishing density.
        return np.sum(np.log(density + 1e-10))

    def fit(self, X):
        """Run EM until the log-likelihood change drops below tol.

        Records the per-iteration log-likelihood in self.log_likelihoods.
        """
        self._initialize(X)
        self.log_likelihoods = []
        for iteration in range(self.max_iter):
            responsibilities = self._e_step(X)
            self._m_step(X, responsibilities)
            ll = self._compute_log_likelihood(X)
            self.log_likelihoods.append(ll)
            # EM makes the log-likelihood non-decreasing, so a small change
            # signals convergence (checked from the second iteration on).
            if iteration > 0 and abs(ll - self.log_likelihoods[-2]) < self.tol:
                print(f"Converged after {iteration + 1} iterations")
                break
        return self

    def predict_proba(self, X):
        """Predict cluster probabilities (responsibilities)."""
        return self._e_step(X)

    def predict(self, X):
        """Predict cluster assignments (hard assignment = argmax responsibility)."""
        return self.predict_proba(X).argmax(axis=1)
# Fit our from-scratch GMM to the synthetic data.
gmm = GMM_EM(n_components=3, max_iter=100)
gmm.fit(X)
# Compare the learned parameters against the generating ones.
print("EM Algorithm Results")
print(f"\nLearned means:")
for idx, mu in enumerate(gmm.means):
    print(f" Component {idx+1}: {mu}")
print(f"\nLearned weights: {gmm.weights}")
print(f"\nTrue means:")
for idx, mu in enumerate(true_means):
    print(f" Component {idx+1}: {mu}")
print(f"\nTrue weights: {true_weights}")
# Plot the per-iteration log-likelihood recorded during fitting.
plt.figure(figsize=(10, 5))
plt.plot(gmm.log_likelihoods, 'b-', linewidth=2)
plt.xlabel('Iteration', fontsize=12)
plt.ylabel('Log-Likelihood', fontsize=12)
plt.title('EM Algorithm Convergence', fontsize=14)
plt.grid(True, alpha=0.3)
plt.show()
# Visualize GMM fit
def plot_gmm_ellipse(mean, cov, ax, color, n_std=2.0, **kwargs):
    """Draw an n_std-standard-deviation covariance ellipse on ax.

    mean : length-2 center of the ellipse.
    cov : 2x2 (symmetric) covariance matrix.
    ax : matplotlib axes to draw on.
    color : edge color; extra kwargs are forwarded to Ellipse.
    """
    # eigh (not eig) for a symmetric matrix: guarantees real, sorted
    # eigenvalues and orthonormal eigenvectors; eig can return unordered or
    # numerically complex results under floating-point round-off.
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    # Clip tiny negative eigenvalues caused by round-off before sqrt.
    eigenvalues = np.clip(eigenvalues, 0.0, None)
    # Orientation of the first principal axis, in degrees.
    angle = np.degrees(np.arctan2(eigenvectors[1, 0], eigenvectors[0, 0]))
    # Full width/height = 2 * n_std standard deviations along each axis.
    width, height = 2 * n_std * np.sqrt(eigenvalues)
    ellipse = Ellipse(mean, width, height, angle=angle,
                      facecolor='none', edgecolor=color, linewidth=2, **kwargs)
    ax.add_patch(ellipse)
# Hard cluster assignments from the fitted model.
y_pred = gmm.predict(X)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Left panel: ground-truth components with their true covariance ellipses.
ax = axes[0]
for k in range(3):
    mask = y_true == k
    ax.scatter(X[mask, 0], X[mask, 1], c=colors[k], alpha=0.6, s=50,
               label=f'True cluster {k+1}')
    ax.scatter(true_means[k, 0], true_means[k, 1], c=colors[k],
               marker='X', s=300, edgecolors='black', linewidths=2)
    plot_gmm_ellipse(true_means[k], true_covs[k], ax, colors[k], linestyle='--')
ax.set_xlabel('$x_1$', fontsize=12)  # fixed mojibake axis label
ax.set_ylabel('$x_2$', fontsize=12)  # fixed mojibake axis label
ax.set_title('True GMM (Data Generation)', fontsize=13)
ax.legend()
ax.grid(True, alpha=0.3)
# Right panel: EM-learned components and their estimated ellipses.
ax = axes[1]
for k in range(3):
    mask = y_pred == k
    ax.scatter(X[mask, 0], X[mask, 1], c=colors[k], alpha=0.6, s=50,
               label=f'Learned cluster {k+1}')
    ax.scatter(gmm.means[k, 0], gmm.means[k, 1], c=colors[k],
               marker='X', s=300, edgecolors='black', linewidths=2)
    plot_gmm_ellipse(gmm.means[k], gmm.covariances[k], ax, colors[k])
ax.set_xlabel('$x_1$', fontsize=12)
ax.set_ylabel('$x_2$', fontsize=12)
ax.set_title('Learned GMM (EM Algorithm)', fontsize=13)
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("EM successfully recovered the true clusters!")
11.3 Soft vs Hard Clustering
Hard Clustering (K-means):
Each point is assigned to exactly one cluster: $z_n \in \{1, 2, \dots, K\}$
Soft Clustering (GMM):
Each point has a probability of belonging to each cluster: $r_{nk} = P(z_n = k \mid \mathbf{x}_n)$
Advantage: captures the uncertainty in each cluster assignment!
# Compare K-means (hard) vs GMM (soft)
from sklearn.cluster import KMeans
# Fit K-means on the same data.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
y_kmeans = kmeans.fit_predict(X)
# GMM responsibilities (soft assignments).
responsibilities = gmm.predict_proba(X)
# Assignment uncertainty per point: entropy of its responsibility vector.
# 0 = fully confident; log(3) ~ 1.10 is the maximum for K=3.
entropy = -np.sum(responsibilities * np.log(responsibilities + 1e-10), axis=1)
uncertain_mask = entropy > 0.8  # High uncertainty
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
# Panel 1: K-means hard clustering with its centroids.
ax = axes[0]
for k in range(3):
    mask = y_kmeans == k
    ax.scatter(X[mask, 0], X[mask, 1], c=colors[k], alpha=0.6, s=50)
    ax.scatter(kmeans.cluster_centers_[k, 0], kmeans.cluster_centers_[k, 1],
               c=colors[k], marker='X', s=300, edgecolors='black', linewidths=2)
ax.set_xlabel('$x_1$')  # fixed mojibake axis label
ax.set_ylabel('$x_2$')  # fixed mojibake axis label
ax.set_title('K-means (Hard Clustering)')
ax.grid(True, alpha=0.3)
# Panel 2: GMM shown with hard (argmax) assignments and its ellipses.
ax = axes[1]
for k in range(3):
    mask = y_pred == k
    ax.scatter(X[mask, 0], X[mask, 1], c=colors[k], alpha=0.6, s=50)
    ax.scatter(gmm.means[k, 0], gmm.means[k, 1], c=colors[k],
               marker='X', s=300, edgecolors='black', linewidths=2)
    plot_gmm_ellipse(gmm.means[k], gmm.covariances[k], ax, colors[k])
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.set_title('GMM (Soft Clustering - Hard Assignment)')
ax.grid(True, alpha=0.3)
# Panel 3: points colored by entropy; red rings mark uncertain points.
ax = axes[2]
scatter = ax.scatter(X[:, 0], X[:, 1], c=entropy, cmap='viridis',
                     alpha=0.6, s=50)
ax.scatter(X[uncertain_mask, 0], X[uncertain_mask, 1],
           facecolors='none', edgecolors='red', s=100, linewidths=2,
           label='Uncertain points')
plt.colorbar(scatter, ax=ax, label='Entropy (Uncertainty)')
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.set_title('GMM Uncertainty')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Soft Clustering Advantages:")
print(f" - {uncertain_mask.sum()} points have uncertain assignments")
print(" - These are typically near cluster boundaries")
print(" - GMM quantifies this uncertainty; K-means doesn't")
# Show the single most uncertain point as a concrete example.
idx = np.argmax(entropy)
print(f"\nMost uncertain point: {X[idx]}")
print(f" Responsibilities: {responsibilities[idx]}")
print(f" Entropy: {entropy[idx]:.3f}")
# Visualize responsibilities with pie charts.
# Select representative points: the most confident point of each cluster
# plus the two most uncertain points overall.
sample_indices = []
for k in range(3):
    cluster_points = np.where(y_pred == k)[0]
    # Most confident point of the cluster = lowest assignment entropy.
    # (Removed an unused intermediate that sliced the responsibilities.)
    most_confident_idx = cluster_points[np.argmin(entropy[cluster_points])]
    sample_indices.append(most_confident_idx)
# Two most uncertain points (highest entropy).
uncertain_indices = np.argsort(entropy)[-2:]
sample_indices.extend(uncertain_indices)
fig, ax = plt.subplots(figsize=(12, 8))
# Background: all points in light gray.
ax.scatter(X[:, 0], X[:, 1], c='lightgray', alpha=0.3, s=30)
# Overlay a small responsibility pie chart at each selected point.
for idx in sample_indices:
    x, y = X[idx]
    pie_size = 0.3
    # Create the pie axes at a placeholder position, then move it.
    pie_ax = fig.add_axes([0, 0, pie_size, pie_size])
    pie_ax.pie(responsibilities[idx], colors=colors, startangle=90)
    # Re-position the pie: data coords -> display coords -> figure fraction.
    trans = ax.transData.transform([x, y])
    inv_trans = fig.transFigure.inverted().transform(trans)
    pie_ax.set_position([inv_trans[0] - pie_size/2,
                         inv_trans[1] - pie_size/2,
                         pie_size, pie_size])
    # Mark the data point and annotate it with its entropy value.
    ax.plot(x, y, 'ko', markersize=10)
    ax.annotate(f'{entropy[idx]:.2f}', xy=(x, y), xytext=(x+0.3, y+0.3),
                fontsize=10, bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
# Cluster centers and covariance ellipses of the learned model.
for k in range(3):
    ax.scatter(gmm.means[k, 0], gmm.means[k, 1], c=colors[k],
               marker='X', s=300, edgecolors='black', linewidths=2)
    plot_gmm_ellipse(gmm.means[k], gmm.covariances[k], ax, colors[k])
ax.set_xlabel('$x_1$', fontsize=12)  # fixed mojibake axis label
ax.set_ylabel('$x_2$', fontsize=12)  # fixed mojibake axis label
ax.set_title('GMM Soft Assignments (Pie Charts Show Responsibilities)', fontsize=14)
ax.grid(True, alpha=0.3)
plt.show()
print("Pie charts show cluster probabilities for each point")
print(" - Confident points: One dominant color")
print(" - Uncertain points: Mixed colors")
11.4 Model Selection: Choosing K
Problem:
How many clusters should we use?
Methods:
Elbow Method: plot log-likelihood vs K
Information Criteria: AIC, BIC
Cross-Validation: hold-out validation
Domain Knowledge: use prior information
BIC (Bayesian Information Criterion):
$\mathrm{BIC} = -2 \log \hat{L} + p \log N$
where:
p = number of parameters
N = number of data points
Lower BIC is better (the $p \log N$ term penalizes model complexity)
# Model selection: score a range of candidate K values.
K_values = range(1, 8)
bic_scores, aic_scores, log_likelihoods = [], [], []
for n_comp in K_values:
    # Use sklearn's GaussianMixture here (faster than our implementation).
    candidate = GaussianMixture(n_components=n_comp, random_state=42, n_init=10)
    candidate.fit(X)
    bic_scores.append(candidate.bic(X))
    aic_scores.append(candidate.aic(X))
    # score() returns the mean per-sample log-likelihood; rescale to a total.
    log_likelihoods.append(candidate.score(X) * len(X))
# Visualize the model-selection criteria.
# Hoisted: the optimal K under each criterion was recomputed three times.
best_k_bic = K_values[np.argmin(bic_scores)]
best_k_aic = K_values[np.argmin(aic_scores)]
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
# Log-likelihood: increases monotonically with K (no complexity penalty).
axes[0].plot(K_values, log_likelihoods, 'bo-', linewidth=2, markersize=8)
axes[0].axvline(x=3, color='r', linestyle='--', linewidth=2, label='True K=3')
axes[0].set_xlabel('Number of Components (K)')
axes[0].set_ylabel('Log-Likelihood')
axes[0].set_title('Log-Likelihood vs K')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# BIC (lower is better).
axes[1].plot(K_values, bic_scores, 'rs-', linewidth=2, markersize=8)
axes[1].axvline(x=3, color='g', linestyle='--', linewidth=2, label='True K=3')
axes[1].axvline(x=best_k_bic, color='b', linestyle='--',
                linewidth=2, label=f'BIC optimal K={best_k_bic}')
axes[1].set_xlabel('Number of Components (K)')
axes[1].set_ylabel('BIC Score')
axes[1].set_title('BIC vs K (Lower is Better)')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
# AIC (lower is better).
axes[2].plot(K_values, aic_scores, 'gs-', linewidth=2, markersize=8)
axes[2].axvline(x=3, color='r', linestyle='--', linewidth=2, label='True K=3')
axes[2].axvline(x=best_k_aic, color='b', linestyle='--',
                linewidth=2, label=f'AIC optimal K={best_k_aic}')
axes[2].set_xlabel('Number of Components (K)')
axes[2].set_ylabel('AIC Score')
axes[2].set_title('AIC vs K (Lower is Better)')
axes[2].legend()
axes[2].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Model Selection Results:")
print(f"\nTrue K: 3")
print(f"BIC optimal K: {best_k_bic}")
print(f"AIC optimal K: {best_k_aic}")
print("\nObservations:")
print(" - Log-likelihood always increases with K (overfitting)")
print(" - BIC/AIC penalize complexity")
print(" - BIC correctly identifies K=3!")
11.5 GMM Applications and Limitations
Applications:
Clustering: soft assignments with uncertainty
Density Estimation: model complex distributions
Anomaly Detection: low probability → anomaly
Image Segmentation: cluster pixels
Speech Recognition: model phonemes
Advantages:
✓ Soft clustering (probabilistic) ✓ Handles elliptical clusters ✓ Principled probabilistic framework ✓ EM guarantees convergence (to a local optimum)
Limitations:
✗ Assumes Gaussian components ✗ Sensitive to initialization ✗ Can get stuck in local optima ✗ Need to choose K ✗ Computational cost for large datasets
# Demonstrate GMM limitations: Non-Gaussian data.
# Generate non-Gaussian data (two interleaved crescents).
from sklearn.datasets import make_moons
X_moons, y_moons = make_moons(n_samples=300, noise=0.1, random_state=42)
# Fit GMM with K=2.
gmm_moons = GaussianMixture(n_components=2, random_state=42)
y_gmm_moons = gmm_moons.fit_predict(X_moons)
# Fit K-means for comparison.
kmeans_moons = KMeans(n_clusters=2, random_state=42, n_init=10)
y_kmeans_moons = kmeans_moons.fit_predict(X_moons)
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
# Ground-truth labels of the two moons.
axes[0].scatter(X_moons[:, 0], X_moons[:, 1], c=y_moons, cmap='viridis', s=50)
axes[0].set_title('True Clusters (Non-Gaussian)')
axes[0].set_xlabel('$x_1$')  # fixed mojibake axis label
axes[0].set_ylabel('$x_2$')  # fixed mojibake axis label
axes[0].grid(True, alpha=0.3)
# K-means partitions by distance to centroids, which cuts across the moons.
axes[1].scatter(X_moons[:, 0], X_moons[:, 1], c=y_kmeans_moons, cmap='viridis', s=50)
axes[1].scatter(kmeans_moons.cluster_centers_[:, 0],
                kmeans_moons.cluster_centers_[:, 1],
                c='red', marker='X', s=300, edgecolors='black', linewidths=2)
axes[1].set_title('K-means (Fails)')
axes[1].set_xlabel('$x_1$')
axes[1].set_ylabel('$x_2$')
axes[1].grid(True, alpha=0.3)
# GMM fits one ellipse per component; a crescent is not elliptical.
axes[2].scatter(X_moons[:, 0], X_moons[:, 1], c=y_gmm_moons, cmap='viridis', s=50)
for k in range(2):
    plot_gmm_ellipse(gmm_moons.means_[k], gmm_moons.covariances_[k],
                     axes[2], f'C{k}')
axes[2].set_title('GMM (Also Fails)')
axes[2].set_xlabel('$x_1$')
axes[2].set_ylabel('$x_2$')
axes[2].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Limitation: GMM assumes Gaussian components")
print(" - Two moons are non-Gaussian (crescent-shaped)")
print(" - Both K-means and GMM fail")
print(" - Solutions: Kernel methods, spectral clustering, DBSCAN")
# GMM for density estimation and anomaly detection.
# Fit a fresh sklearn GMM to the original synthetic data.
gmm_density = GaussianMixture(n_components=3, random_state=42)
gmm_density.fit(X)
# Evaluation grid covering the data range with a margin of 2 units.
x1_range = np.linspace(X[:, 0].min() - 2, X[:, 0].max() + 2, 100)
x2_range = np.linspace(X[:, 1].min() - 2, X[:, 1].max() + 2, 100)
X1_grid, X2_grid = np.meshgrid(x1_range, x2_range)
X_grid = np.column_stack([X1_grid.ravel(), X2_grid.ravel()])
# Mixture density on the grid (score_samples returns log p(x)).
log_prob = gmm_density.score_samples(X_grid)
prob = np.exp(log_prob).reshape(X1_grid.shape)
# Log-density of the actual data points.
log_prob_data = gmm_density.score_samples(X)
# Flag the lowest-density 5% of points as anomalies.
threshold = np.percentile(log_prob_data, 5)  # Bottom 5%
anomalies = log_prob_data < threshold
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Left panel: estimated density surface with the data overlaid.
ax = axes[0]
contour = ax.contourf(X1_grid, X2_grid, prob, levels=20, cmap='viridis', alpha=0.6)
ax.contour(X1_grid, X2_grid, prob, levels=10, colors='black', linewidths=0.5, alpha=0.3)
ax.scatter(X[:, 0], X[:, 1], c='white', s=30, edgecolors='black', linewidths=0.5, alpha=0.8)
plt.colorbar(contour, ax=ax, label='Probability Density')
ax.set_xlabel('$x_1$')  # fixed mojibake axis label
ax.set_ylabel('$x_2$')  # fixed mojibake axis label
ax.set_title('GMM Density Estimation')
# Right panel: normal points vs flagged anomalies, with component ellipses.
ax = axes[1]
ax.scatter(X[~anomalies, 0], X[~anomalies, 1], c='blue', s=50, alpha=0.6, label='Normal')
ax.scatter(X[anomalies, 0], X[anomalies, 1], c='red', s=100, marker='x',
           linewidths=2, label='Anomalies')
for k in range(3):
    plot_gmm_ellipse(gmm_density.means_[k], gmm_density.covariances_[k], ax, 'black')
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.set_title('Anomaly Detection')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Anomaly Detection:")
print(f" - Identified {anomalies.sum()} anomalies (bottom 5% probability)")
print(f" - These are far from all cluster centers")
Summary
Key Concepts from Chapter 11:
GMM: mixture of Gaussian distributions
EM Algorithm: iterative parameter estimation
E-step: compute responsibilities (soft assignments)
M-step: update parameters (means, covariances, weights)
Soft Clustering: probabilistic assignments with uncertainty
Model Selection: BIC, AIC for choosing K

EM Algorithm:

| Step | Operation | Formula |
|---|---|---|
| E-step | Compute responsibilities | $r_{nk} = \pi_k \mathcal{N}(\mathbf{x}_n \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k) / \sum_j \pi_j \mathcal{N}(\mathbf{x}_n \mid \boldsymbol{\mu}_j, \boldsymbol{\Sigma}_j)$ |
| M-step | Update means | $\boldsymbol{\mu}_k = \sum_n r_{nk} \mathbf{x}_n / \sum_n r_{nk}$ |
| M-step | Update covariances | $\boldsymbol{\Sigma}_k = \sum_n r_{nk} (\mathbf{x}_n - \boldsymbol{\mu}_k)(\mathbf{x}_n - \boldsymbol{\mu}_k)^T / \sum_n r_{nk}$ |
| M-step | Update weights | $\pi_k = \frac{1}{N} \sum_n r_{nk}$ |
GMM vs K-means:

| Aspect | K-means | GMM |
|---|---|---|
| Assignment | Hard (one cluster) | Soft (probabilities) |
| Cluster shape | Spherical | Elliptical |
| Framework | Geometric | Probabilistic |
| Uncertainty | No | Yes |
| Speed | Fast | Slower |
When to Use GMM:
✓ Good for:
Need soft clustering
Elliptical clusters
Density estimation
Anomaly detection
Uncertainty quantification
✗ Not good for:
Non-Gaussian data
Very large datasets
Need fast clustering
Complex cluster shapes
Connection to Other Chapters:

| Concept | Chapter |
|---|---|
| Gaussian distribution | Chapter 6 |
| Multivariate Gaussians | Chapter 6 |
| Maximum likelihood | Chapter 6, 9 |
| Covariance matrix | Chapter 6 |
| Optimization (EM) | Chapter 7 |

Practical Tips:
Initialize well: run multiple times with different initializations
Standardize data: especially if features have different scales
Use BIC/AIC: for model selection
Check convergence: monitor the log-likelihood
Visualize: plot responsibilities and cluster ellipses
Consider alternatives: spectral clustering or DBSCAN for non-Gaussian data
Next Steps:
Chapter 12: SVM (classification with margin maximization)
Advanced: variational inference, Dirichlet process GMMs
Applications: image segmentation, speech recognition
Practice: apply GMM to a real dataset and interpret the clusters!