Feature agglomeration vs. univariate selection
==============================================

This example compares 2 dimensionality reduction strategies:

  • univariate feature selection with ANOVA

  • feature agglomeration with Ward hierarchical clustering

Both methods are compared in a regression problem, using BayesianRidge as the supervised estimator.

Imports for Comparing Feature Agglomeration and Univariate Selection

This example compares two fundamentally different dimensionality reduction strategies in a regression pipeline: univariate feature selection (ANOVA F-test via SelectPercentile), which ranks each feature independently by its correlation with the target, and feature agglomeration (FeatureAgglomeration with Ward linkage), which groups spatially adjacent features into clusters and replaces each group with its mean. Both are combined with BayesianRidge regression, and their hyperparameters (the number of clusters or the percentile of features kept) are tuned via GridSearchCV.

Spatial structure matters: The synthetic data has a spatial structure – the true weight map has two square regions of opposite sign on a 40x40 image grid. Feature agglomeration with grid_to_graph connectivity respects this spatial layout, merging neighboring pixels into coherent parcels that preserve the block structure of the true weights. Univariate selection, by contrast, treats each pixel independently and selects scattered individual features. The inverse_transform method maps the learned coefficients back to the original image space for visualization, revealing that agglomeration recovers smoother, more spatially coherent weight maps while univariate selection produces a noisier, point-wise reconstruction. The joblib.Memory caching avoids redundant computation during the grid search cross-validation folds.
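Before the full example, it may help to see what FeatureAgglomeration's transform/inverse_transform pair does on its own. The toy data and variable names below are illustrative, not part of the example:

```python
import numpy as np
from sklearn.cluster import FeatureAgglomeration

# Toy data: 6 features forming two groups of exact duplicates,
# so Ward linkage merges each group at distance zero.
rng = np.random.RandomState(0)
base = rng.randn(50, 2)
X = np.hstack([base[:, [0]]] * 3 + [base[:, [1]]] * 3)

agglo = FeatureAgglomeration(n_clusters=2)
X_reduced = agglo.fit_transform(X)               # (50, 2): one column per cluster (its mean)
X_restored = agglo.inverse_transform(X_reduced)  # (50, 6): each cluster mean broadcast back
```

The round trip does not recover the original features; it assigns every feature in a cluster the same value, which is what produces the smooth, parcel-wise weight maps later in the example.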

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# %%
import shutil
import tempfile

import matplotlib.pyplot as plt
import numpy as np
from joblib import Memory
from scipy import linalg, ndimage

from sklearn import feature_selection
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline

# %%
# Set parameters
n_samples = 200
size = 40  # image size
roi_size = 15
snr = 5.0
np.random.seed(0)

# %%
# Generate data
coef = np.zeros((size, size))
coef[0:roi_size, 0:roi_size] = -1.0
coef[-roi_size:, -roi_size:] = 1.0

X = np.random.randn(n_samples, size**2)
for x in X:  # smooth data
    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()
X -= X.mean(axis=0)
X /= X.std(axis=0)

y = np.dot(X, coef.ravel())

# %%
# Add noise
noise = np.random.randn(y.shape[0])
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.0)) / linalg.norm(noise, 2)
y += noise_coef * noise
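The noise_coef scaling above fixes the ratio of the signal norm to the added-noise norm at exp(snr / 20). A quick self-contained check of that identity (standalone vectors, names illustrative):

```python
import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
signal = rng.randn(200)
noise = rng.randn(200)
snr = 5.0

# Same scaling as in the example: shrink the noise so that
# ||signal|| / ||noise_coef * noise|| == exp(snr / 20).
noise_coef = (linalg.norm(signal, 2) / np.exp(snr / 20.0)) / linalg.norm(noise, 2)
ratio = linalg.norm(signal, 2) / linalg.norm(noise_coef * noise, 2)
```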

# %%
# Compute the coefficients of a Bayesian Ridge with GridSearchCV
cv = KFold(2)  # cross-validation generator for model selection
ridge = BayesianRidge()
cachedir = tempfile.mkdtemp()
mem = Memory(location=cachedir, verbose=1)

# %%
# Ward agglomeration followed by BayesianRidge
connectivity = grid_to_graph(n_x=size, n_y=size)
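grid_to_graph turns the image geometry into a sparse pixel-adjacency matrix, which is what constrains Ward to merge only neighbouring pixels. A minimal sketch on a 3x3 grid (not part of the example):

```python
from sklearn.feature_extraction.image import grid_to_graph

# Each of the 9 pixels becomes a vertex; edges link 4-connected
# neighbours, giving a 9 x 9 sparse adjacency matrix.
conn = grid_to_graph(n_x=3, n_y=3)
dense = conn.toarray()
```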
ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity, memory=mem)
clf = Pipeline([("ward", ward), ("ridge", ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {"ward__n_clusters": [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)
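The selected number of parcels can also be read from best_params_, and fitted pipelines support direct indexing (clf.best_estimator_[-1] is equivalent to .steps[-1][1]). A self-contained sketch of the same grid-search pattern on toy data (all names illustrative):

```python
import numpy as np
from sklearn.cluster import FeatureAgglomeration
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

rng = np.random.RandomState(0)
X_toy = rng.randn(40, 16)
y_toy = X_toy[:, :4].sum(axis=1)

pipe = Pipeline([("ward", FeatureAgglomeration(n_clusters=4)),
                 ("ridge", BayesianRidge())])
search = GridSearchCV(pipe, {"ward__n_clusters": [2, 4, 8]}, cv=2)
search.fit(X_toy, y_toy)

best_n = search.best_params_["ward__n_clusters"]  # chosen parcel count
ridge_coef = search.best_estimator_[-1].coef_     # one coefficient per parcel
```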

# %%
# ANOVA univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([("anova", anova), ("ridge", ridge)])
# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, {"anova__percentile": [5, 10, 20]}, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
coef_selection_ = coef_.reshape(size, size)
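SelectPercentile's inverse_transform scatters values back into the full feature space, filling dropped features with zeros, which is what makes the point-wise weight map in the plot below possible. A self-contained sketch (toy data, names illustrative):

```python
import numpy as np
from sklearn.feature_selection import SelectPercentile, f_regression

rng = np.random.RandomState(0)
X = rng.randn(100, 20)
y = X[:, 0] + 0.1 * rng.randn(100)  # only feature 0 is informative

# Keep the top 10% of 20 features, i.e. 2 columns.
sel = SelectPercentile(f_regression, percentile=10).fit(X, y)
X_sel = sel.transform(X)             # (100, 2)
back = sel.inverse_transform(X_sel)  # (100, 20): zeros in the dropped columns
```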

# %%
# Invert the transformations to plot the results as images
plt.close("all")
plt.figure(figsize=(7.3, 2.7))
plt.subplot(1, 3, 1)
plt.imshow(coef, interpolation="nearest", cmap=plt.cm.RdBu_r)
plt.title("True weights")
plt.subplot(1, 3, 2)
plt.imshow(coef_selection_, interpolation="nearest", cmap=plt.cm.RdBu_r)
plt.title("Feature Selection")
plt.subplot(1, 3, 3)
plt.imshow(coef_agglomeration_, interpolation="nearest", cmap=plt.cm.RdBu_r)
plt.title("Feature Agglomeration")
plt.subplots_adjust(0.04, 0.0, 0.98, 0.94, 0.16, 0.26)
plt.show()

# %%
# Attempt to remove the temporary cachedir, but don't worry if it fails
shutil.rmtree(cachedir, ignore_errors=True)