import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering, AffinityPropagation
from sklearn.datasets import make_blobs, make_moons, make_circles
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.preprocessing import StandardScaler

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
np.random.seed(42)

1. K-Means Clustering (Detailed)

Objective

Minimize within-cluster sum of squares (WCSS):

\[J = \sum_{k=1}^K \sum_{i \in C_k} \|\mathbf{x}_i - \boldsymbol{\mu}_k\|^2\]

Algorithm

  1. Initialize \(K\) cluster centers \(\{\boldsymbol{\mu}_k\}\)

  2. Assignment step: Assign each point to nearest center

  3. Update step: Recompute centers as cluster means

  4. Repeat until convergence
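
The loop above can be written directly in NumPy. The sketch below is a minimal illustration of Lloyd's algorithm with random initialization (not k-means++), not a replacement for scikit-learn's KMeans used later in this notebook:

def kmeans_lloyd(X, K, n_iter=100, seed=0):
    """Minimal Lloyd's algorithm: random init, then assignment/update steps."""
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), size=K, replace=False)]  # step 1: initialize centers
    for _ in range(n_iter):
        # Step 2 (assignment): distance from every point to every center
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # Step 3 (update): recompute each center as the mean of its points
        new_centers = np.array([X[labels == k].mean(axis=0) if np.any(labels == k)
                                else centers[k] for k in range(K)])
        if np.allclose(new_centers, centers):  # step 4: stop at convergence
            break
        centers = new_centers
    return labels, centers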

Properties

  • Guaranteed to converge (but to local optimum)

  • Sensitive to initialization (use k-means++)

  • Assumes spherical clusters

# Generate different cluster shapes
np.random.seed(42)

# Dataset 1: Well-separated blobs
X_blobs, y_blobs = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=42)

# Dataset 2: Moons (non-convex)
X_moons, y_moons = make_moons(n_samples=300, noise=0.05, random_state=42)

# Dataset 3: Circles (nested)
X_circles, y_circles = make_circles(n_samples=300, noise=0.05, factor=0.5, random_state=42)

# Dataset 4: Anisotropic (elongated)
X_aniso = np.random.randn(300, 2)
X_aniso = np.dot(X_aniso, [[0.6, -0.6], [-0.4, 0.8]])
y_aniso = np.array([0] * 100 + [1] * 100 + [2] * 100)
X_aniso[y_aniso == 0] += [2, 0]
X_aniso[y_aniso == 1] += [-2, 0]
X_aniso[y_aniso == 2] += [0, 2]

datasets = [
    (X_blobs, y_blobs, "Blobs"),
    (X_moons, y_moons, "Moons"),
    (X_circles, y_circles, "Circles"),
    (X_aniso, y_aniso, "Anisotropic")
]

# Apply K-means
fig, axes = plt.subplots(2, 4, figsize=(18, 10))

for idx, (X, y_true, name) in enumerate(datasets):
    # True labels
    axes[0, idx].scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis', alpha=0.6, s=30)
    axes[0, idx].set_title(f'{name}\n(True Labels)')
    axes[0, idx].grid(True, alpha=0.3)
    
    # K-means clustering
    n_clusters = len(np.unique(y_true))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    y_kmeans = kmeans.fit_predict(X)
    
    axes[1, idx].scatter(X[:, 0], X[:, 1], c=y_kmeans, cmap='viridis', alpha=0.6, s=30)
    axes[1, idx].scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
                        c='red', marker='X', s=200, edgecolors='black', linewidth=2)
    
    # Compute metrics
    ari = adjusted_rand_score(y_true, y_kmeans)
    silhouette = silhouette_score(X, y_kmeans)
    
    axes[1, idx].set_title(f'K-Means\nARI: {ari:.3f}, Sil: {silhouette:.3f}')
    axes[1, idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("K-Means Performance:")
print("- Works well: Blobs (spherical, well-separated)")
print("- Struggles: Moons, Circles (non-convex shapes)")
print("- Mediocre: Anisotropic (elongated clusters)")

2. Hierarchical Clustering

Types

  1. Agglomerative (bottom-up): Start with each point as a cluster, merge

  2. Divisive (top-down): Start with one cluster, split

Linkage Methods

Distance between clusters \(C_i\) and \(C_j\):

  • Single linkage: \(\min_{x \in C_i, y \in C_j} d(x, y)\)

  • Complete linkage: \(\max_{x \in C_i, y \in C_j} d(x, y)\)

  • Average linkage: \(\frac{1}{|C_i||C_j|} \sum_{x \in C_i} \sum_{y \in C_j} d(x, y)\)

  • Ward linkage: Minimize within-cluster variance

Output

Dendrogram showing hierarchical structure
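
Besides inspecting the dendrogram visually, the hierarchy can be cut into flat clusters with scipy's fcluster. A minimal sketch, assuming the blobs data generated above and Ward linkage:

from scipy.cluster.hierarchy import fcluster

# Cut the Ward-linkage hierarchy into flat clusters
Z_ward = linkage(X_blobs, method='ward')
labels_k = fcluster(Z_ward, t=4, criterion='maxclust')     # ask for exactly 4 clusters
labels_h = fcluster(Z_ward, t=10.0, criterion='distance')  # cut at an illustrative height of 10
print(np.unique(labels_k).size, "clusters at maxclust=4,", np.unique(labels_h).size, "clusters at height 10")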

# Use blobs dataset
X = X_blobs
y_true = y_blobs

# Different linkage methods
linkage_methods = ['single', 'complete', 'average', 'ward']

fig, axes = plt.subplots(2, 4, figsize=(18, 10))

for idx, method in enumerate(linkage_methods):
    # Compute linkage
    Z = linkage(X, method=method)
    
    # Plot dendrogram
    dendrogram(Z, ax=axes[0, idx], no_labels=True, color_threshold=0)
    axes[0, idx].set_title(f'{method.capitalize()} Linkage\nDendrogram')
    axes[0, idx].set_ylabel('Distance')
    axes[0, idx].grid(True, alpha=0.3, axis='y')
    
    # Apply hierarchical clustering
    hc = AgglomerativeClustering(n_clusters=4, linkage=method)
    y_hc = hc.fit_predict(X)
    
    # Plot clusters
    axes[1, idx].scatter(X[:, 0], X[:, 1], c=y_hc, cmap='viridis', alpha=0.6, s=30)
    
    # Compute metrics
    ari = adjusted_rand_score(y_true, y_hc)
    silhouette = silhouette_score(X, y_hc)
    
    axes[1, idx].set_title(f'Clusters\nARI: {ari:.3f}, Sil: {silhouette:.3f}')
    axes[1, idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Hierarchical Clustering Linkage Methods:")
print("\nSingle Linkage:")
print("  - Can handle non-convex shapes")
print("  - Sensitive to noise (chaining effect)")
print("\nComplete Linkage:")
print("  - Produces compact clusters")
print("  - Sensitive to outliers")
print("\nAverage Linkage:")
print("  - Compromise between single and complete")
print("  - More robust")
print("\nWard Linkage:")
print("  - Minimizes within-cluster variance")
print("  - Tends to produce equal-sized clusters")

3. Spectral Clustering

Idea

Use eigenvalues/eigenvectors of graph Laplacian to find clusters.

Algorithm

  1. Construct similarity matrix \(\mathbf{W}\) (e.g., RBF kernel)

  2. Compute degree matrix \(\mathbf{D}\): \(D_{ii} = \sum_j W_{ij}\)

  3. Compute Laplacian: \(\mathbf{L} = \mathbf{D} - \mathbf{W}\)

  4. Find the eigenvectors corresponding to the \(K\) smallest eigenvalues of \(\mathbf{L}\)

  5. Run K-means on eigenvector matrix
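
As a rough illustration of these steps, the sketch below builds an RBF similarity matrix, forms the unnormalized Laplacian, and runs K-means on the eigenvectors of its smallest eigenvalues. Scikit-learn's SpectralClustering (used below) works with a normalized Laplacian and additional refinements:

def spectral_sketch(X, K, gamma=1.0, seed=0):
    """Minimal unnormalized spectral clustering (illustration only)."""
    # Step 1: RBF similarity matrix W
    sq_dists = np.sum((X[:, None, :] - X[None, :, :]) ** 2, axis=2)
    W = np.exp(-gamma * sq_dists)
    # Steps 2-3: degree matrix D and Laplacian L = D - W
    D = np.diag(W.sum(axis=1))
    L = D - W
    # Step 4: eigenvectors for the K smallest eigenvalues (eigh sorts ascending)
    _, eigvecs = np.linalg.eigh(L)
    U = eigvecs[:, :K]
    # Step 5: K-means on the rows of the eigenvector matrix
    return KMeans(n_clusters=K, n_init=10, random_state=seed).fit_predict(U)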

Advantages

  • Can find non-convex clusters

  • Works well when clusters are not linearly separable

# Compare Spectral Clustering with K-means on non-convex data
fig, axes = plt.subplots(2, 3, figsize=(16, 10))

test_datasets = [
    (X_moons, y_moons, "Moons"),
    (X_circles, y_circles, "Circles"),
    (X_blobs, y_blobs, "Blobs")
]

for idx, (X, y_true, name) in enumerate(test_datasets):
    n_clusters = len(np.unique(y_true))
    
    # K-means
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    y_kmeans = kmeans.fit_predict(X)
    
    axes[0, idx].scatter(X[:, 0], X[:, 1], c=y_kmeans, cmap='viridis', alpha=0.6, s=30)
    ari_kmeans = adjusted_rand_score(y_true, y_kmeans)
    axes[0, idx].set_title(f'{name} - K-Means\nARI: {ari_kmeans:.3f}')
    axes[0, idx].grid(True, alpha=0.3)
    
    # Spectral Clustering
    spectral = SpectralClustering(n_clusters=n_clusters, affinity='rbf', 
                                  gamma=1.0, random_state=42)
    y_spectral = spectral.fit_predict(X)
    
    axes[1, idx].scatter(X[:, 0], X[:, 1], c=y_spectral, cmap='viridis', alpha=0.6, s=30)
    ari_spectral = adjusted_rand_score(y_true, y_spectral)
    axes[1, idx].set_title(f'{name} - Spectral\nARI: {ari_spectral:.3f}')
    axes[1, idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Spectral Clustering:")
print("βœ“ Handles non-convex shapes (moons, circles)")
print("βœ“ Works when K-means fails")
print("βœ— More computationally expensive")
print("βœ— Requires choosing similarity parameter (gamma)")

4. DBSCAN (Density-Based)

Idea

Clusters are dense regions separated by sparse regions.

Parameters

  • \(\epsilon\): Neighborhood radius

  • \(\text{minPts}\): Minimum points to form dense region
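
A common heuristic for choosing \(\epsilon\) is the k-distance plot: sort every point's distance to its minPts-th nearest neighbor and look for the "knee". A minimal sketch using scikit-learn's NearestNeighbors on the moons data from above:

from sklearn.neighbors import NearestNeighbors

min_pts = 5
nn = NearestNeighbors(n_neighbors=min_pts).fit(X_moons)
dists, _ = nn.kneighbors(X_moons)   # each point's distances to its min_pts nearest neighbors (itself included)
k_dist = np.sort(dists[:, -1])      # distance to the min_pts-th neighbor, sorted ascending
plt.plot(k_dist)
plt.xlabel('Points sorted by k-distance')
plt.ylabel(f'Distance to {min_pts}-th nearest neighbor')
plt.title('k-distance plot: pick eps near the knee')
plt.show()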

Point Types

  1. Core point: Has \(\geq \text{minPts}\) neighbors within \(\epsilon\)

  2. Border point: Not core, but neighbor of core point

  3. Noise point: Neither core nor border

Advantages

  • Doesn't require specifying number of clusters

  • Can find arbitrarily shaped clusters

  • Identifies outliers

# DBSCAN on different datasets
fig, axes = plt.subplots(2, 4, figsize=(18, 10))

for idx, (X, y_true, name) in enumerate(datasets):
    # True labels
    axes[0, idx].scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis', alpha=0.6, s=30)
    axes[0, idx].set_title(f'{name}\n(True Labels)')
    axes[0, idx].grid(True, alpha=0.3)
    
    # DBSCAN
    # Choose eps based on dataset scale
    eps_values = [0.3, 0.15, 0.15, 0.5]
    dbscan = DBSCAN(eps=eps_values[idx], min_samples=5)
    y_dbscan = dbscan.fit_predict(X)
    
    # Plot (noise points in gray)
    unique_labels = np.unique(y_dbscan)
    colors = plt.cm.viridis(np.linspace(0, 1, len(unique_labels[unique_labels >= 0])))
    
    for k, col in zip(unique_labels[unique_labels >= 0], colors):
        class_mask = y_dbscan == k
        axes[1, idx].scatter(X[class_mask, 0], X[class_mask, 1], 
                           c=[col], alpha=0.6, s=30, label=f'Cluster {k}')
    
    # Noise points
    if -1 in y_dbscan:
        noise_mask = y_dbscan == -1
        axes[1, idx].scatter(X[noise_mask, 0], X[noise_mask, 1], 
                           c='gray', alpha=0.3, s=30, marker='x', label='Noise')
    
    n_clusters = len(unique_labels[unique_labels >= 0])
    n_noise = np.sum(y_dbscan == -1)
    
    axes[1, idx].set_title(f'DBSCAN (eps={eps_values[idx]})\n{n_clusters} clusters, {n_noise} noise')
    axes[1, idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("DBSCAN Characteristics:")
print("βœ“ Finds clusters of arbitrary shape")
print("βœ“ Identifies outliers (noise points)")
print("βœ“ No need to specify number of clusters")
print("βœ— Sensitive to parameters (eps, min_samples)")
print("βœ— Struggles with varying densities")

5. Affinity Propagation

Idea

Finds "exemplars" (cluster centers) by message passing.

Algorithm

  1. Compute similarity matrix \(s(i,j)\) (negative squared Euclidean distance)

  2. Exchange two types of messages:

    • Responsibility \(r(i,k)\): How suitable point \(k\) is as exemplar for \(i\)

    • Availability \(a(i,k)\): How appropriate for \(i\) to choose \(k\) as exemplar

  3. Iterate until convergence

  4. Exemplars are points where \(r(k,k) + a(k,k) > 0\)

Advantages

  • Automatically determines number of clusters

  • No initialization required

  • Can use any similarity measure
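
In scikit-learn, the number of clusters found is steered mainly by the preference parameter (the self-similarity \(s(k,k)\)), which defaults to the median of the input similarities; lower (more negative) values yield fewer exemplars. A small sketch on the blobs data, with illustrative preference values:

for pref in [None, -50, -200]:  # None = sklearn default (median similarity)
    af_demo = AffinityPropagation(preference=pref, damping=0.9, random_state=42).fit(X_blobs)
    print(f"preference={pref}: {len(af_demo.cluster_centers_indices_)} clusters")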

# Affinity Propagation
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

test_data = [X_blobs, X_moons, X_aniso]
test_names = ["Blobs", "Moons", "Anisotropic"]

for idx, (X, name) in enumerate(zip(test_data, test_names)):
    # Affinity Propagation
    af = AffinityPropagation(random_state=42, damping=0.9)
    y_af = af.fit_predict(X)
    
    # Plot clusters
    axes[idx].scatter(X[:, 0], X[:, 1], c=y_af, cmap='viridis', alpha=0.6, s=30)
    
    # Plot exemplars
    exemplars = af.cluster_centers_indices_
    axes[idx].scatter(X[exemplars, 0], X[exemplars, 1], 
                     c='red', marker='*', s=300, edgecolors='black', linewidth=2,
                     label='Exemplars')
    
    n_clusters = len(exemplars)
    axes[idx].set_title(f'{name}\nAffinity Propagation\n{n_clusters} clusters found')
    axes[idx].legend()
    axes[idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Affinity Propagation:")
print("βœ“ Automatically determines number of clusters")
print("βœ“ Produces exemplars (representative points)")
print("βœ“ No random initialization")
print("βœ— Quadratic complexity O(NΒ²)")
print("βœ— Can be slow for large datasets")

6. Cluster Evaluation Metrics

Internal Metrics (no ground truth needed)

Silhouette Score:

\[s(i) = \frac{b(i) - a(i)}{\max(a(i), b(i))}\]

where:

  • \(a(i)\): Mean distance to points in same cluster

  • \(b(i)\): Mean distance to points in nearest other cluster

  • Range: \([-1, 1]\) (higher is better)
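
scikit-learn exposes the per-point values \(s(i)\) via silhouette_samples; silhouette_score is simply their mean. A short sketch on a K-means clustering of the blobs data:

from sklearn.metrics import silhouette_samples

labels_demo = KMeans(n_clusters=4, random_state=42, n_init=10).fit_predict(X_blobs)
s_i = silhouette_samples(X_blobs, labels_demo)   # one s(i) per point
print(f"Mean silhouette: {s_i.mean():.3f}")      # same value as silhouette_score
print(f"Worst-assigned point: s(i) = {s_i.min():.3f}")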

Davies-Bouldin Index: Lower is better

Calinski-Harabasz Index: Higher is better

External Metrics (with ground truth)

Adjusted Rand Index (ARI):

  • Measures similarity between two clusterings

  • Range: \([-1, 1]\) (1 = perfect match)

  • Adjusted for chance

Normalized Mutual Information (NMI):

  • Information-theoretic measure

  • Range: \([0, 1]\) (1 = perfect match)

from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score

# Compare all algorithms on blobs dataset
X = X_blobs
y_true = y_blobs

algorithms = [
    ('K-Means', KMeans(n_clusters=4, random_state=42, n_init=10)),
    ('Hierarchical', AgglomerativeClustering(n_clusters=4)),
    ('Spectral', SpectralClustering(n_clusters=4, random_state=42)),
    ('DBSCAN', DBSCAN(eps=0.3, min_samples=5))
]

results = []

for name, algorithm in algorithms:
    y_pred = algorithm.fit_predict(X)
    
    # Skip if only one cluster or noise-only
    if len(np.unique(y_pred[y_pred >= 0])) < 2:
        continue
    
    # External metrics
    ari = adjusted_rand_score(y_true, y_pred)
    nmi = normalized_mutual_info_score(y_true, y_pred)
    
    # Internal metrics (only for non-noise points)
    mask = y_pred >= 0
    if np.sum(mask) > 0:
        silhouette = silhouette_score(X[mask], y_pred[mask])
        db_index = davies_bouldin_score(X[mask], y_pred[mask])
        ch_index = calinski_harabasz_score(X[mask], y_pred[mask])
    else:
        silhouette = db_index = ch_index = np.nan
    
    results.append({
        'Algorithm': name,
        'ARI': ari,
        'NMI': nmi,
        'Silhouette': silhouette,
        'Davies-Bouldin': db_index,
        'Calinski-Harabasz': ch_index
    })

# Create comparison table
df_results = pd.DataFrame(results)
print("\nClustering Algorithm Comparison (Blobs Dataset):")
print("="*80)
print(df_results.to_string(index=False))
print("\nMetric Interpretation:")
print("  ARI, NMI, Silhouette: Higher is better")
print("  Davies-Bouldin: Lower is better")
print("  Calinski-Harabasz: Higher is better")

# Visualize metrics
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# External metrics
x = np.arange(len(df_results))
width = 0.35
axes[0].bar(x - width/2, df_results['ARI'], width, label='ARI', alpha=0.8)
axes[0].bar(x + width/2, df_results['NMI'], width, label='NMI', alpha=0.8)
axes[0].set_ylabel('Score')
axes[0].set_title('External Metrics\n(with ground truth)')
axes[0].set_xticks(x)
axes[0].set_xticklabels(df_results['Algorithm'], rotation=45)
axes[0].legend()
axes[0].grid(True, alpha=0.3, axis='y')

# Silhouette
axes[1].bar(x, df_results['Silhouette'], alpha=0.8, color='green')
axes[1].set_ylabel('Score')
axes[1].set_title('Silhouette Score\n(higher is better)')
axes[1].set_xticks(x)
axes[1].set_xticklabels(df_results['Algorithm'], rotation=45)
axes[1].grid(True, alpha=0.3, axis='y')

# Davies-Bouldin
axes[2].bar(x, df_results['Davies-Bouldin'], alpha=0.8, color='red')
axes[2].set_ylabel('Score')
axes[2].set_title('Davies-Bouldin Index\n(lower is better)')
axes[2].set_xticks(x)
axes[2].set_xticklabels(df_results['Algorithm'], rotation=45)
axes[2].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

Summary

In this notebook, we covered major clustering algorithms:

  1. K-Means: Fast, spherical clusters

  2. Hierarchical: Dendrogram, nested clusters

  3. Spectral: Non-convex shapes via graph Laplacian

  4. DBSCAN: Density-based, identifies outliers

  5. Affinity Propagation: Automatic cluster count via message passing

  6. Evaluation Metrics: Internal and external measures

Algorithm Comparison

| Algorithm     | Cluster Shape | # Clusters   | Outliers | Complexity     | Best For                           |
|---------------|---------------|--------------|----------|----------------|------------------------------------|
| K-Means       | Spherical     | Fixed        | No       | O(nKT)         | Large datasets, spherical clusters |
| Hierarchical  | Any           | Hierarchical | No       | O(n²) or O(n³) | Small datasets, dendrograms        |
| Spectral      | Non-convex    | Fixed        | No       | O(n³)          | Non-convex shapes                  |
| DBSCAN        | Arbitrary     | Automatic    | Yes      | O(n log n)     | Varying shapes, noise              |
| Affinity Prop | Any           | Automatic    | No       | O(n²T)         | Exemplar-based, small datasets     |

Choosing an Algorithm

Use K-Means if:

  • Large dataset

  • Spherical clusters

  • Know number of clusters

  • Speed is important

Use Hierarchical if:

  • Need dendrogram

  • Want nested clusters

  • Small to medium dataset

  • Flexible on cluster count

Use Spectral if:

  • Non-convex cluster shapes

  • Graph structure important

  • Can afford O(n³) complexity

Use DBSCAN if:

  • Arbitrary cluster shapes

  • Unknown number of clusters

  • Need outlier detection

  • Varying cluster sizes OK

Use Affinity Propagation if:

  • Need exemplars

  • Unknown cluster count

  • Small to medium dataset

  • No initialization wanted

Practical Tips

  1. Always standardize data before clustering

  2. Try multiple algorithms and compare

  3. Use internal metrics when no ground truth

  4. Visualize results when possible (2D/3D)

  5. K-means++ initialization reduces sensitivity

  6. Elbow method for choosing K in K-means (see the sketch after this list)

  7. Silhouette analysis for cluster quality
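
A minimal sketch of the elbow method from tip 6: plot the K-means inertia (WCSS) over a range of K on the blobs data and look for the point where the curve flattens out:

inertias = []
K_range = range(1, 11)
for k in K_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_blobs)
    inertias.append(km.inertia_)  # WCSS for this K

plt.plot(list(K_range), inertias, marker='o')
plt.xlabel('Number of clusters K')
plt.ylabel('Inertia (WCSS)')
plt.title('Elbow method')
plt.show()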

Exercises

  1. Implement mini-batch K-means for large datasets

  2. Use elbow method to find optimal K

  3. Compare clustering on PCA-reduced data

  4. Apply hierarchical clustering with different linkages

  5. Tune DBSCAN parameters using grid search