import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering, AffinityPropagation
from sklearn.datasets import make_blobs, make_moons, make_circles
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.preprocessing import StandardScaler
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
np.random.seed(42)
1. K-Means Clustering (Detailed)
Objective
Minimize the within-cluster sum of squares (WCSS):
\(J = \sum_{k=1}^{K} \sum_{\mathbf{x}_i \in C_k} \|\mathbf{x}_i - \boldsymbol{\mu}_k\|^2\)
Algorithm
1. Initialize \(K\) cluster centers \(\{\boldsymbol{\mu}_k\}\)
2. Assignment step: Assign each point to its nearest center
3. Update step: Recompute each center as the mean of its assigned points
4. Repeat steps 2-3 until the assignments stop changing
Properties
- Guaranteed to converge (but possibly to a local optimum)
- Sensitive to initialization (use k-means++)
- Assumes spherical clusters
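To make the assignment/update loop concrete, here is a minimal from-scratch sketch of Lloyd's algorithm. It uses random initialization rather than k-means++ and simply keeps the old center if a cluster empties; a teaching sketch, not a replacement for sklearn's KMeans.
def lloyd_kmeans(X, k, n_iter=100, seed=0):
    """Minimal K-means (Lloyd's algorithm) with random initialization."""
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), size=k, replace=False)]  # init from data points
    for _ in range(n_iter):
        # Assignment step: nearest center by squared Euclidean distance
        d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
        labels = d2.argmin(axis=1)
        # Update step: recompute each center as the mean of its cluster
        new_centers = centers.copy()
        for j in range(k):
            members = X[labels == j]
            if len(members) > 0:  # keep the old center if a cluster empties
                new_centers[j] = members.mean(axis=0)
        if np.allclose(new_centers, centers):  # converged
            break
        centers = new_centers
    return labels, centers

# Quick check on synthetic blobs (make_blobs is imported above)
X_demo, _ = make_blobs(n_samples=200, centers=3, random_state=0)
demo_labels, demo_centers = lloyd_kmeans(X_demo, k=3)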
# Generate different cluster shapes
np.random.seed(42)
# Dataset 1: Well-separated blobs
X_blobs, y_blobs = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=42)
# Dataset 2: Moons (non-convex)
X_moons, y_moons = make_moons(n_samples=300, noise=0.05, random_state=42)
# Dataset 3: Circles (nested)
X_circles, y_circles = make_circles(n_samples=300, noise=0.05, factor=0.5, random_state=42)
# Dataset 4: Anisotropic (elongated)
X_aniso = np.random.randn(300, 2)
X_aniso = np.dot(X_aniso, [[0.6, -0.6], [-0.4, 0.8]])
y_aniso = np.array([0] * 100 + [1] * 100 + [2] * 100)
X_aniso[y_aniso == 0] += [2, 0]
X_aniso[y_aniso == 1] += [-2, 0]
X_aniso[y_aniso == 2] += [0, 2]
datasets = [
    (X_blobs, y_blobs, "Blobs"),
    (X_moons, y_moons, "Moons"),
    (X_circles, y_circles, "Circles"),
    (X_aniso, y_aniso, "Anisotropic")
]
# Apply K-means
fig, axes = plt.subplots(2, 4, figsize=(18, 10))
for idx, (X, y_true, name) in enumerate(datasets):
    # True labels
    axes[0, idx].scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis', alpha=0.6, s=30)
    axes[0, idx].set_title(f'{name}\n(True Labels)')
    axes[0, idx].grid(True, alpha=0.3)
    # K-means clustering
    n_clusters = len(np.unique(y_true))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    y_kmeans = kmeans.fit_predict(X)
    axes[1, idx].scatter(X[:, 0], X[:, 1], c=y_kmeans, cmap='viridis', alpha=0.6, s=30)
    axes[1, idx].scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
                         c='red', marker='X', s=200, edgecolors='black', linewidth=2)
    # Compute metrics
    ari = adjusted_rand_score(y_true, y_kmeans)
    silhouette = silhouette_score(X, y_kmeans)
    axes[1, idx].set_title(f'K-Means\nARI: {ari:.3f}, Sil: {silhouette:.3f}')
    axes[1, idx].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("K-Means Performance:")
print("- Works well: Blobs (spherical, well-separated)")
print("- Struggles: Moons, Circles (non-convex shapes)")
print("- Mediocre: Anisotropic (elongated clusters)")
2. Hierarchical Clustering
Types
- Agglomerative (bottom-up): Start with each point as its own cluster, then repeatedly merge the closest pair
- Divisive (top-down): Start with one all-encompassing cluster, then recursively split
Linkage Methods
Distance between clusters \(C_i\) and \(C_j\):
- Single linkage: \(\min_{x \in C_i, y \in C_j} d(x, y)\)
- Complete linkage: \(\max_{x \in C_i, y \in C_j} d(x, y)\)
- Average linkage: \(\frac{1}{|C_i||C_j|} \sum_{x \in C_i} \sum_{y \in C_j} d(x, y)\)
- Ward linkage: Merge the pair whose union minimizes the increase in within-cluster variance
Output
A dendrogram showing the hierarchical structure; cutting it at a chosen level yields flat clusters (see the sketch below).
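As a short sketch of how a dendrogram becomes flat clusters, SciPy's fcluster cuts the linkage tree at a chosen number of clusters; Ward linkage on the blobs data is an illustrative choice here.
from scipy.cluster.hierarchy import fcluster

Z_ward = linkage(X_blobs, method='ward')
flat_labels = fcluster(Z_ward, t=4, criterion='maxclust')  # labels in 1..4
print(np.bincount(flat_labels)[1:])  # points per cluster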
# Use blobs dataset
X = X_blobs
y_true = y_blobs
# Different linkage methods
linkage_methods = ['single', 'complete', 'average', 'ward']
fig, axes = plt.subplots(2, 4, figsize=(18, 10))
for idx, method in enumerate(linkage_methods):
    # Compute linkage
    Z = linkage(X, method=method)
    # Plot dendrogram
    dendrogram(Z, ax=axes[0, idx], no_labels=True, color_threshold=0)
    axes[0, idx].set_title(f'{method.capitalize()} Linkage\nDendrogram')
    axes[0, idx].set_ylabel('Distance')
    axes[0, idx].grid(True, alpha=0.3, axis='y')
    # Apply hierarchical clustering
    hc = AgglomerativeClustering(n_clusters=4, linkage=method)
    y_hc = hc.fit_predict(X)
    # Plot clusters
    axes[1, idx].scatter(X[:, 0], X[:, 1], c=y_hc, cmap='viridis', alpha=0.6, s=30)
    # Compute metrics
    ari = adjusted_rand_score(y_true, y_hc)
    silhouette = silhouette_score(X, y_hc)
    axes[1, idx].set_title(f'Clusters\nARI: {ari:.3f}, Sil: {silhouette:.3f}')
    axes[1, idx].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Hierarchical Clustering Linkage Methods:")
print("\nSingle Linkage:")
print(" - Can handle non-convex shapes")
print(" - Sensitive to noise (chaining effect)")
print("\nComplete Linkage:")
print(" - Produces compact clusters")
print(" - Sensitive to outliers")
print("\nAverage Linkage:")
print(" - Compromise between single and complete")
print(" - More robust")
print("\nWard Linkage:")
print(" - Minimizes within-cluster variance")
print(" - Tends to produce equal-sized clusters")
3. Spectral Clustering
Idea
Use the eigenvalues/eigenvectors of the graph Laplacian to embed the points, then cluster the embedding.
Algorithm
1. Construct a similarity matrix \(\mathbf{W}\) (e.g., RBF kernel)
2. Compute the degree matrix \(\mathbf{D}\): \(D_{ii} = \sum_j W_{ij}\)
3. Compute the Laplacian: \(\mathbf{L} = \mathbf{D} - \mathbf{W}\)
4. Find the eigenvectors belonging to the \(K\) smallest eigenvalues of \(\mathbf{L}\)
5. Run K-means on the rows of the eigenvector matrix (sketched below)
Advantages
- Can find non-convex clusters
- Works well when clusters are not linearly separable
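To make the Laplacian construction concrete, here is a minimal sketch of the unnormalized variant of these steps; sklearn's SpectralClustering uses a normalized Laplacian and is more robust, and gamma=10.0 is just an illustrative choice for the moons data.
from sklearn.metrics.pairwise import rbf_kernel

def spectral_sketch(X, k, gamma=1.0, seed=42):
    """Unnormalized spectral clustering: embed with Laplacian eigenvectors, then K-means."""
    W = rbf_kernel(X, gamma=gamma)      # similarity matrix
    D = np.diag(W.sum(axis=1))          # degree matrix
    L = D - W                           # unnormalized graph Laplacian
    # eigh returns eigenvalues in ascending order; take the K smallest
    _, eigvecs = np.linalg.eigh(L)
    U = eigvecs[:, :k]                  # rows are the spectral embedding
    return KMeans(n_clusters=k, n_init=10, random_state=seed).fit_predict(U)

sketch_labels = spectral_sketch(X_moons, k=2, gamma=10.0)
print(adjusted_rand_score(y_moons, sketch_labels))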
# Compare Spectral Clustering with K-means on non-convex data
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
test_datasets = [
    (X_moons, y_moons, "Moons"),
    (X_circles, y_circles, "Circles"),
    (X_blobs, y_blobs, "Blobs")
]
for idx, (X, y_true, name) in enumerate(test_datasets):
    n_clusters = len(np.unique(y_true))
    # K-means
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    y_kmeans = kmeans.fit_predict(X)
    axes[0, idx].scatter(X[:, 0], X[:, 1], c=y_kmeans, cmap='viridis', alpha=0.6, s=30)
    ari_kmeans = adjusted_rand_score(y_true, y_kmeans)
    axes[0, idx].set_title(f'{name} - K-Means\nARI: {ari_kmeans:.3f}')
    axes[0, idx].grid(True, alpha=0.3)
    # Spectral Clustering
    spectral = SpectralClustering(n_clusters=n_clusters, affinity='rbf',
                                  gamma=1.0, random_state=42)
    y_spectral = spectral.fit_predict(X)
    axes[1, idx].scatter(X[:, 0], X[:, 1], c=y_spectral, cmap='viridis', alpha=0.6, s=30)
    ari_spectral = adjusted_rand_score(y_true, y_spectral)
    axes[1, idx].set_title(f'{name} - Spectral\nARI: {ari_spectral:.3f}')
    axes[1, idx].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Spectral Clustering:")
print("β Handles non-convex shapes (moons, circles)")
print("β Works when K-means fails")
print("β More computationally expensive")
print("β Requires choosing similarity parameter (gamma)")
4. DBSCAN (Density-Based)
Idea
Clusters are dense regions of points separated by sparse regions.
Parameters
- \(\epsilon\): Neighborhood radius (a selection heuristic is sketched below)
- \(\text{minPts}\): Minimum number of points to form a dense region
Point Types
- Core point: Has \(\geq \text{minPts}\) neighbors within \(\epsilon\)
- Border point: Not a core point, but within \(\epsilon\) of one
- Noise point: Neither core nor border
Advantages
- Doesn't require specifying the number of clusters
- Can find arbitrarily shaped clusters
- Identifies outliers
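A common heuristic for choosing \(\epsilon\) is the k-distance plot: sort every point's distance to its minPts-th nearest neighbor and look for the "knee". A sketch on the moons data (note that kneighbors on the training set counts each point as its own nearest neighbor):
from sklearn.neighbors import NearestNeighbors

min_samples = 5
nn = NearestNeighbors(n_neighbors=min_samples).fit(X_moons)
dists, _ = nn.kneighbors(X_moons)   # column 0 is the point itself (distance 0)
k_dist = np.sort(dists[:, -1])      # distance to the min_samples-th neighbor

plt.plot(k_dist)
plt.xlabel('Points sorted by k-distance')
plt.ylabel(f'Distance to neighbor #{min_samples}')
plt.title('k-distance plot: choose eps near the knee')
plt.show()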
# DBSCAN on different datasets
fig, axes = plt.subplots(2, 4, figsize=(18, 10))
# eps chosen per dataset scale (see the k-distance heuristic above)
eps_values = [0.3, 0.15, 0.15, 0.5]
for idx, (X, y_true, name) in enumerate(datasets):
    # True labels
    axes[0, idx].scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis', alpha=0.6, s=30)
    axes[0, idx].set_title(f'{name}\n(True Labels)')
    axes[0, idx].grid(True, alpha=0.3)
    # DBSCAN
    dbscan = DBSCAN(eps=eps_values[idx], min_samples=5)
    y_dbscan = dbscan.fit_predict(X)
    # Plot (noise points, labeled -1, in gray)
    unique_labels = np.unique(y_dbscan)
    cluster_labels = unique_labels[unique_labels >= 0]
    colors = plt.cm.viridis(np.linspace(0, 1, len(cluster_labels)))
    for k, col in zip(cluster_labels, colors):
        class_mask = y_dbscan == k
        axes[1, idx].scatter(X[class_mask, 0], X[class_mask, 1],
                             c=[col], alpha=0.6, s=30, label=f'Cluster {k}')
    # Noise points
    if -1 in y_dbscan:
        noise_mask = y_dbscan == -1
        axes[1, idx].scatter(X[noise_mask, 0], X[noise_mask, 1],
                             c='gray', alpha=0.3, s=30, marker='x', label='Noise')
    n_clusters = len(cluster_labels)
    n_noise = np.sum(y_dbscan == -1)
    axes[1, idx].set_title(f'DBSCAN (eps={eps_values[idx]})\n{n_clusters} clusters, {n_noise} noise')
    axes[1, idx].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("DBSCAN Characteristics:")
print("β Finds clusters of arbitrary shape")
print("β Identifies outliers (noise points)")
print("β No need to specify number of clusters")
print("β Sensitive to parameters (eps, min_samples)")
print("β Struggles with varying densities")
5. Affinity Propagation
Idea
Finds "exemplars" (actual data points serving as cluster centers) by message passing.
Algorithm
1. Compute the similarity matrix \(s(i,k)\) (negative squared Euclidean distance)
2. Exchange two types of messages (sketched below):
   - Responsibility \(r(i,k)\): How suitable point \(k\) is as an exemplar for \(i\)
   - Availability \(a(i,k)\): How appropriate it is for \(i\) to choose \(k\) as its exemplar
3. Iterate until convergence
4. Exemplars are the points where \(r(k,k) + a(k,k) > 0\)
Advantages
- Automatically determines the number of clusters
- No initialization required
- Can use any similarity measure
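A bare-bones NumPy sketch of the responsibility/availability updates described above, assuming the standard update equations with damping and the median similarity as the preference; sklearn's AffinityPropagation adds convergence checks and careful tie-breaking.
def affinity_propagation_sketch(X, damping=0.9, n_iter=200):
    """Educational sketch of the two message-passing updates."""
    n = len(X)
    # Similarity: negative squared Euclidean distance; preference on the diagonal
    S = -((X[:, None, :] - X[None, :, :]) ** 2).sum(axis=2)
    S[np.arange(n), np.arange(n)] = np.median(S)
    R = np.zeros((n, n))
    A = np.zeros((n, n))
    for _ in range(n_iter):
        # Responsibility: r(i,k) = s(i,k) - max_{k' != k} [a(i,k') + s(i,k')]
        AS = A + S
        top = AS.argmax(axis=1)
        first = AS[np.arange(n), top].copy()
        AS[np.arange(n), top] = -np.inf
        second = AS.max(axis=1)
        R_new = S - first[:, None]
        R_new[np.arange(n), top] = S[np.arange(n), top] - second
        R = damping * R + (1 - damping) * R_new
        # Availability: a(i,k) = min(0, r(k,k) + sum_{i' not in {i,k}} max(0, r(i',k)))
        Rp = np.maximum(R, 0)
        Rp[np.arange(n), np.arange(n)] = R.diagonal()
        col = Rp.sum(axis=0)
        A_new = np.minimum(0, col[None, :] - Rp)
        A_new[np.arange(n), np.arange(n)] = col - Rp.diagonal()
        A = damping * A + (1 - damping) * A_new
    exemplars = np.where(R.diagonal() + A.diagonal() > 0)[0]
    # Assign each point to its most similar exemplar
    # (a full implementation also forces each exemplar to label itself)
    labels = exemplars[S[:, exemplars].argmax(axis=1)]
    return labels, exemplars

sketch_labels, sketch_exemplars = affinity_propagation_sketch(X_blobs)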
# Affinity Propagation
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
test_data = [X_blobs, X_moons, X_aniso]
test_names = ["Blobs", "Moons", "Anisotropic"]
for idx, (X, name) in enumerate(zip(test_data, test_names)):
    # Affinity Propagation
    af = AffinityPropagation(random_state=42, damping=0.9)
    y_af = af.fit_predict(X)
    # Plot clusters
    axes[idx].scatter(X[:, 0], X[:, 1], c=y_af, cmap='viridis', alpha=0.6, s=30)
    # Plot exemplars
    exemplars = af.cluster_centers_indices_
    axes[idx].scatter(X[exemplars, 0], X[exemplars, 1],
                      c='red', marker='*', s=300, edgecolors='black', linewidth=2,
                      label='Exemplars')
    n_clusters = len(exemplars)
    axes[idx].set_title(f'{name}\nAffinity Propagation\n{n_clusters} clusters found')
    axes[idx].legend()
    axes[idx].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Affinity Propagation:")
print("β Automatically determines number of clusters")
print("β Produces exemplars (representative points)")
print("β No random initialization")
print("β Quadratic complexity O(NΒ²)")
print("β Can be slow for large datasets")
6. Cluster Evaluation Metrics
Internal Metrics (no ground truth needed)
Silhouette Score: \(s(i) = \frac{b(i) - a(i)}{\max(a(i), b(i))}\)
where:
- \(a(i)\): Mean distance from \(i\) to the other points in its own cluster
- \(b(i)\): Mean distance from \(i\) to the points in the nearest other cluster
- Range: \([-1, 1]\) (higher is better)
Davies-Bouldin Index: Lower is better
Calinski-Harabasz Index: Higher is better
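To connect the formula to the library, here is a sketch that computes \(s(i)\) by hand for a single point and checks it against sklearn's per-point silhouette_samples (K-means labels on the blobs data are an illustrative setup):
from sklearn.metrics import silhouette_samples
from sklearn.metrics.pairwise import euclidean_distances

sil_labels = KMeans(n_clusters=4, random_state=42, n_init=10).fit_predict(X_blobs)
D_pair = euclidean_distances(X_blobs)

i = 0
same = sil_labels == sil_labels[i]
same[i] = False                                  # exclude the point itself from a(i)
a_i = D_pair[i, same].mean()
b_i = min(D_pair[i, sil_labels == c].mean()      # nearest other cluster
          for c in np.unique(sil_labels) if c != sil_labels[i])
s_i = (b_i - a_i) / max(a_i, b_i)
print(s_i, silhouette_samples(X_blobs, sil_labels)[i])  # should agree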
External Metrics (with ground truth)
Adjusted Rand Index (ARI):
- Measures similarity between two clusterings via pairwise agreements
- Range: \([-1, 1]\) (1 = perfect match)
- Adjusted for chance (0 ≈ random labeling)
Normalized Mutual Information (NMI):
- Information-theoretic measure
- Range: \([0, 1]\) (1 = perfect match)
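Both metrics compare partitions rather than raw label values, so permuting the cluster labels leaves the score unchanged; a quick sketch:
y_a = [0, 0, 0, 1, 1, 1]
y_b = [1, 1, 1, 0, 0, 0]  # same partition, labels swapped
print(adjusted_rand_score(y_a, y_b))             # 1.0
print(normalized_mutual_info_score(y_a, y_b))    # 1.0
print(adjusted_rand_score(y_a, [0, 1, 0, 1, 0, 1]))  # negative: below chance agreement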
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score
# Compare all algorithms on blobs dataset
X = X_blobs
y_true = y_blobs
algorithms = [
    ('K-Means', KMeans(n_clusters=4, random_state=42, n_init=10)),
    ('Hierarchical', AgglomerativeClustering(n_clusters=4)),
    ('Spectral', SpectralClustering(n_clusters=4, random_state=42)),
    ('DBSCAN', DBSCAN(eps=0.3, min_samples=5))
]
results = []
for name, algorithm in algorithms:
    y_pred = algorithm.fit_predict(X)
    # Skip if only one cluster or noise-only
    if len(np.unique(y_pred[y_pred >= 0])) < 2:
        continue
    # External metrics
    ari = adjusted_rand_score(y_true, y_pred)
    nmi = normalized_mutual_info_score(y_true, y_pred)
    # Internal metrics (only for non-noise points)
    mask = y_pred >= 0
    if np.sum(mask) > 0:
        silhouette = silhouette_score(X[mask], y_pred[mask])
        db_index = davies_bouldin_score(X[mask], y_pred[mask])
        ch_index = calinski_harabasz_score(X[mask], y_pred[mask])
    else:
        silhouette = db_index = ch_index = np.nan
    results.append({
        'Algorithm': name,
        'ARI': ari,
        'NMI': nmi,
        'Silhouette': silhouette,
        'Davies-Bouldin': db_index,
        'Calinski-Harabasz': ch_index
    })
# Create comparison table
df_results = pd.DataFrame(results)
print("\nClustering Algorithm Comparison (Blobs Dataset):")
print("="*80)
print(df_results.to_string(index=False))
print("\nMetric Interpretation:")
print(" ARI, NMI, Silhouette: Higher is better")
print(" Davies-Bouldin: Lower is better")
print(" Calinski-Harabasz: Higher is better")
# Visualize metrics
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
# External metrics
x = np.arange(len(df_results))
width = 0.35
axes[0].bar(x - width/2, df_results['ARI'], width, label='ARI', alpha=0.8)
axes[0].bar(x + width/2, df_results['NMI'], width, label='NMI', alpha=0.8)
axes[0].set_ylabel('Score')
axes[0].set_title('External Metrics\n(with ground truth)')
axes[0].set_xticks(x)
axes[0].set_xticklabels(df_results['Algorithm'], rotation=45)
axes[0].legend()
axes[0].grid(True, alpha=0.3, axis='y')
# Silhouette
axes[1].bar(x, df_results['Silhouette'], alpha=0.8, color='green')
axes[1].set_ylabel('Score')
axes[1].set_title('Silhouette Score\n(higher is better)')
axes[1].set_xticks(x)
axes[1].set_xticklabels(df_results['Algorithm'], rotation=45)
axes[1].grid(True, alpha=0.3, axis='y')
# Davies-Bouldin
axes[2].bar(x, df_results['Davies-Bouldin'], alpha=0.8, color='red')
axes[2].set_ylabel('Score')
axes[2].set_title('Davies-Bouldin Index\n(lower is better)')
axes[2].set_xticks(x)
axes[2].set_xticklabels(df_results['Algorithm'], rotation=45)
axes[2].grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
Summary
In this notebook, we covered major clustering algorithms:
- K-Means: Fast, spherical clusters
- Hierarchical: Dendrogram, nested clusters
- Spectral: Non-convex shapes via the graph Laplacian
- DBSCAN: Density-based, identifies outliers
- Affinity Propagation: Automatic cluster count via message passing
- Evaluation Metrics: Internal and external measures
Algorithm Comparison

| Algorithm | Cluster Shape | # Clusters | Outliers | Complexity | Best For |
|---|---|---|---|---|---|
| K-Means | Spherical | Fixed | No | O(nKT) | Large datasets, spherical clusters |
| Hierarchical | Any | Hierarchical | No | O(n²) or O(n³) | Small datasets, dendrograms |
| Spectral | Non-convex | Fixed | No | O(n³) | Non-convex shapes |
| DBSCAN | Arbitrary | Automatic | Yes | O(n log n) | Varying shapes, noise |
| Affinity Prop | Any | Automatic | No | O(n²T) | Exemplar-based, small datasets |
Choosing an Algorithm
Use K-Means if:
- Large dataset
- Spherical clusters
- Number of clusters is known
- Speed is important
Use Hierarchical if:
- You need a dendrogram
- You want nested clusters
- Small to medium dataset
- Flexible on cluster count
Use Spectral if:
- Non-convex cluster shapes
- Graph structure is important
- You can afford O(n³) complexity
Use DBSCAN if:
- Arbitrary cluster shapes
- Unknown number of clusters
- You need outlier detection
- Varying cluster sizes are OK
Use Affinity Propagation if:
- You need exemplars
- Unknown cluster count
- Small to medium dataset
- No initialization wanted
Practical Tips
- Always standardize data before clustering
- Try multiple algorithms and compare
- Use internal metrics when no ground truth is available
- Visualize results when possible (2D/3D)
- k-means++ initialization reduces sensitivity to the starting centers
- Use the elbow method to choose K in K-means (see the sketch below)
- Use silhouette analysis to assess cluster quality
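A sketch of the elbow method mentioned above: sweep K and plot the inertia (WCSS) on standardized blobs; the bend should appear near the true number of clusters (4 for X_blobs).
# Standardize first, then fit K-means for a range of K
X_std = StandardScaler().fit_transform(X_blobs)
ks = range(1, 11)
inertias = [KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_std).inertia_
            for k in ks]

plt.plot(list(ks), inertias, 'o-')
plt.xlabel('Number of clusters K')
plt.ylabel('Inertia (WCSS)')
plt.title('Elbow method')
plt.show()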
Exercises
1. Implement mini-batch K-means for large datasets
2. Use the elbow method to find the optimal K
3. Compare clustering on PCA-reduced data
4. Apply hierarchical clustering with different linkages
5. Tune DBSCAN parameters using grid search