import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import make_regression, make_classification
import pandas as pd

# Global notebook defaults: seaborn grid style, wide figures, and a fixed
# NumPy RNG seed so the randomly generated datasets below are reproducible.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
np.random.seed(42)

1. Lasso Regression (L1 Regularization)

Objective Function

\[\hat{\mathbf{w}}_{Lasso} = \arg\min_{\mathbf{w}} \|\mathbf{y} - \mathbf{X}\mathbf{w}\|_2^2 + \lambda\|\mathbf{w}\|_1\]

Where:

  • \(\|\mathbf{w}\|_1 = \sum_{j=1}^p |w_j|\) is the L1 norm

  • \(\lambda\) controls sparsity (larger \(\lambda\) → more zeros)

Key Properties

  • Sparsity: L1 penalty drives many coefficients exactly to zero

  • Feature selection: Automatically selects relevant features

  • No closed-form solution: Requires iterative algorithms (coordinate descent, LARS, etc.)

Soft Thresholding Operator

\[S(z, \gamma) = \text{sign}(z) \max(|z| - \gamma, 0)\]
# Visualize the soft thresholding operator S(z, γ) = sign(z) * max(|z| - γ, 0)
# next to the L1 vs L2 penalty shapes that explain why Lasso produces exact
# zeros while Ridge only shrinks.
z = np.linspace(-3, 3, 100)
gammas = [0, 0.5, 1.0, 1.5]

plt.figure(figsize=(12, 5))

# Left panel: S(z, γ) for several thresholds. For |z| <= γ the output is
# exactly zero -- this flat region is what zeroes out Lasso coefficients.
plt.subplot(1, 2, 1)
for gamma in gammas:
    s_z = np.sign(z) * np.maximum(np.abs(z) - gamma, 0)
    plt.plot(z, s_z, linewidth=2, label=f'γ={gamma}')
plt.plot(z, z, 'k--', alpha=0.3, label='Identity')
plt.xlabel('z')
plt.ylabel('S(z, γ)')
plt.title('Soft Thresholding Operator')
plt.legend()
plt.grid(True, alpha=0.3)

# Right panel: the penalty curves themselves. L1 has a kink at 0 (it is
# non-differentiable there), L2 is smooth -- the kink enables exact sparsity.
plt.subplot(1, 2, 2)
w = np.linspace(-2, 2, 100)
l1_penalty = np.abs(w)
l2_penalty = w**2

plt.plot(w, l1_penalty, 'b-', linewidth=2, label='L1: |w|')
plt.plot(w, l2_penalty, 'r-', linewidth=2, label='L2: w²')
plt.xlabel('w')
plt.ylabel('Penalty')
plt.title('L1 vs L2 Penalty Comparison')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Key difference:")
print("- L1 penalty is linear → encourages exact zeros")
print("- L2 penalty is quadratic → shrinks but doesn't zero out")

Coordinate Descent for Lasso

Update one coordinate at a time while holding others fixed:

\[w_j \leftarrow S\left(\frac{1}{n}\sum_{i=1}^n x_{ij}r_i, \lambda\right) / \left(\frac{1}{n}\sum_{i=1}^n x_{ij}^2\right)\]

Where \(r_i = y_i - \sum_{k \neq j} x_{ik}w_k\) is the partial residual.

class LassoCoordinateDescent:
    """Lasso regression solved by cyclic coordinate descent.

    Minimizes (1 / (2n)) * ||y - Xw||_2^2 + lambda_ * ||w||_1 by applying
    the soft-thresholding update to one coordinate at a time (note the 1/n
    normalization below -- lambda_ is therefore on the same scale as
    scikit-learn's ``alpha``).

    Parameters
    ----------
    lambda_ : float
        Regularization strength; larger values yield sparser solutions.
    max_iter : int
        Maximum number of full passes over all coordinates.
    tol : float
        Convergence threshold on the L2 norm of the weight change per pass.
    """

    def __init__(self, lambda_=1.0, max_iter=1000, tol=1e-4):
        self.lambda_ = lambda_
        self.max_iter = max_iter
        self.tol = tol
        self.coef_path = []  # snapshots of w, recorded every 10 passes

    def soft_threshold(self, z, gamma):
        """Soft thresholding operator S(z, gamma) = sign(z) * max(|z| - gamma, 0)."""
        return np.sign(z) * np.maximum(np.abs(z) - gamma, 0)

    def fit(self, X, y):
        """Run coordinate descent on (X, y) and return self.

        X is expected to be standardized (zero mean, unit variance columns);
        no intercept is fitted.
        """
        n_samples, n_features = X.shape

        # Initialize weights; also reset the path so refitting the same
        # estimator does not accumulate snapshots from earlier fit() calls.
        self.w = np.zeros(n_features)
        self.coef_path = []

        # Precompute (1/n) * ||x_j||^2 per column; constant across passes.
        X_col_norm_sq = np.sum(X**2, axis=0) / n_samples

        for iteration in range(self.max_iter):
            w_old = self.w.copy()

            # One full cyclic pass over the coordinates.
            for j in range(n_features):
                # Partial residual: the residual with feature j's own
                # contribution added back in.
                r_j = y - X @ self.w + X[:, j] * self.w[j]

                # (1/n) * <x_j, r_j>: the unregularized coordinate optimum
                # scaled by the column norm below.
                rho_j = np.dot(X[:, j], r_j) / n_samples

                # Soft-thresholding update; guard against all-zero columns.
                if X_col_norm_sq[j] > 1e-10:
                    self.w[j] = self.soft_threshold(rho_j, self.lambda_) / X_col_norm_sq[j]
                else:
                    self.w[j] = 0

            # Track the coefficient path every 10 passes.
            if iteration % 10 == 0:
                self.coef_path.append(self.w.copy())

            # Converged once a full pass barely moves the weights.
            if np.linalg.norm(self.w - w_old) < self.tol:
                break

        return self

    def predict(self, X):
        """Return predictions X @ w (no intercept term)."""
        return X @ self.w

    def mse(self, X, y):
        """Mean squared prediction error on (X, y)."""
        return np.mean((y - self.predict(X))**2)

    def n_nonzero(self):
        """Number of coefficients whose magnitude exceeds 1e-10."""
        return np.sum(np.abs(self.w) > 1e-10)

# Generate sparse data: 50 features, only 10 of which carry signal.
np.random.seed(42)
n_samples = 100
n_features = 50
n_informative = 10

X, y, true_coef = make_regression(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    noise=10,
    coef=True,
    random_state=42
)

# Standardize features (the L1 penalty is scale-sensitive).
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit custom Lasso
lasso_custom = LassoCoordinateDescent(lambda_=1.0, max_iter=1000)
lasso_custom.fit(X_train, y_train)

# Fit sklearn Lasso with a matching penalty. Both solvers minimize
# (1/(2n))||y - Xw||^2 + alpha*||w||_1: the custom coordinate update divides
# the correlation rho_j and the column norms by n_samples, so its lambda_ is
# already on sklearn's alpha scale. Dividing by len(X_train) here (as the
# original code did) would fit sklearn with a ~70x weaker penalty and make
# the comparison meaningless.
lasso_sklearn = Lasso(alpha=1.0, max_iter=1000)
lasso_sklearn.fit(X_train, y_train)

# Compare sparsity and train/test error of the two solvers.
print("Lasso Regression Results:")
print(f"True number of nonzero coefficients: {np.sum(np.abs(true_coef) > 1e-10)}")
print(f"\nCustom Lasso:")
print(f"  Number of nonzero coefficients: {lasso_custom.n_nonzero()}")
print(f"  Train MSE: {lasso_custom.mse(X_train, y_train):.2f}")
print(f"  Test MSE: {lasso_custom.mse(X_test, y_test):.2f}")

print(f"\nSklearn Lasso:")
print(f"  Number of nonzero coefficients: {np.sum(np.abs(lasso_sklearn.coef_) > 1e-10)}")
print(f"  Train MSE: {mean_squared_error(y_train, lasso_sklearn.predict(X_train)):.2f}")
print(f"  Test MSE: {mean_squared_error(y_test, lasso_sklearn.predict(X_test)):.2f}")

# Visualize coefficients
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# True coefficients
axes[0, 0].stem(range(n_features), true_coef, basefmt=' ')
axes[0, 0].set_xlabel('Feature index')
axes[0, 0].set_ylabel('Coefficient value')
axes[0, 0].set_title('True Coefficients (Sparse)')
axes[0, 0].grid(True, alpha=0.3)

# Custom Lasso coefficients
axes[0, 1].stem(range(n_features), lasso_custom.w, basefmt=' ')
axes[0, 1].set_xlabel('Feature index')
axes[0, 1].set_ylabel('Coefficient value')
axes[0, 1].set_title(f'Custom Lasso ({lasso_custom.n_nonzero()} nonzero)')
axes[0, 1].grid(True, alpha=0.3)

# Sklearn Lasso coefficients
axes[1, 0].stem(range(n_features), lasso_sklearn.coef_, basefmt=' ')
axes[1, 0].set_xlabel('Feature index')
axes[1, 0].set_ylabel('Coefficient value')
axes[1, 0].set_title(f'Sklearn Lasso ({np.sum(np.abs(lasso_sklearn.coef_) > 1e-10)} nonzero)')
axes[1, 0].grid(True, alpha=0.3)

# Coefficient convergence path
coef_path = np.array(lasso_custom.coef_path)
for j in range(min(10, n_features)):
    axes[1, 1].plot(coef_path[:, j], alpha=0.7)
axes[1, 1].set_xlabel('Iteration (Γ—10)')
axes[1, 1].set_ylabel('Coefficient value')
axes[1, 1].set_title('Coefficient Path (First 10 features)')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

2. Regularization Path: Ridge vs Lasso

What: This code sweeps \(\lambda\) across several orders of magnitude and tracks how each coefficient evolves, creating "regularization paths" for both Ridge and Lasso. It also plots sparsity (number of nonzero coefficients) and total weight magnitude as functions of \(\lambda\).

Why: The regularization path reveals the fundamental difference between L1 and L2 penalties. Ridge coefficients shrink smoothly toward zero but never reach it, while Lasso coefficients hit exactly zero at finite \(\lambda\) values — a consequence of the L1 norm's non-differentiability at the origin. The sparsity plot makes this explicit: Lasso transitions from a dense model (low \(\lambda\)) to a completely sparse one (high \(\lambda\)), enabling a principled trade-off between model complexity and fit.

Connection: Regularization paths are computed efficiently by the LARS algorithm (for Lasso) and analytically (for Ridge), making them practical tools for model selection. In genomics, regularization paths over thousands of genes help identify which few drive a disease phenotype. In production systems, visualizing the path helps ML engineers choose \(\lambda\) that balances interpretability with predictive accuracy.

# Sweep lambda across five orders of magnitude and record, for both Lasso
# and Ridge, the fitted coefficient vector and how many coefficients are
# numerically nonzero at each regularization strength.
lambdas = np.logspace(-3, 2, 50)

# Storage for coefficient paths
lasso_coefs = []
ridge_coefs = []
lasso_nonzero = []
ridge_nonzero = []

for lam in lambdas:
    # Lasso (sklearn's alpha is lam rescaled by the training-set size here)
    lasso = Lasso(alpha=lam/len(X_train), max_iter=5000)
    lasso.fit(X_train, y_train)
    lasso_coefs.append(lasso.coef_)
    # Count coefficients above a small numerical tolerance
    lasso_nonzero.append(np.sum(np.abs(lasso.coef_) > 1e-4))
    
    # Ridge (alpha used directly)
    ridge = Ridge(alpha=lam)
    ridge.fit(X_train, y_train)
    ridge_coefs.append(ridge.coef_)
    ridge_nonzero.append(np.sum(np.abs(ridge.coef_) > 1e-4))

# Stack into (n_lambdas, n_features) arrays for path plotting below
lasso_coefs = np.array(lasso_coefs)
ridge_coefs = np.array(ridge_coefs)

# Visualize the regularization paths for both penalties side by side.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Lasso path: coefficients hit exactly zero at finite λ.
axes[0, 0].semilogx(lambdas, lasso_coefs[:, :20], alpha=0.7)
axes[0, 0].set_xlabel('λ (regularization strength)')
axes[0, 0].set_ylabel('Coefficient value')
axes[0, 0].set_title('Lasso Regularization Path (First 20 features)')
axes[0, 0].axhline(y=0, color='k', linestyle='--', alpha=0.3)
axes[0, 0].grid(True, alpha=0.3)

# Ridge path: coefficients shrink smoothly toward (but not to) zero.
axes[0, 1].semilogx(lambdas, ridge_coefs[:, :20], alpha=0.7)
axes[0, 1].set_xlabel('λ (regularization strength)')
axes[0, 1].set_ylabel('Coefficient value')
axes[0, 1].set_title('Ridge Regularization Path (First 20 features)')
axes[0, 1].axhline(y=0, color='k', linestyle='--', alpha=0.3)
axes[0, 1].grid(True, alpha=0.3)

# Number of nonzero coefficients as a function of λ.
axes[1, 0].semilogx(lambdas, lasso_nonzero, 'b-', linewidth=2, label='Lasso')
axes[1, 0].semilogx(lambdas, ridge_nonzero, 'r--', linewidth=2, label='Ridge')
axes[1, 0].set_xlabel('λ (regularization strength)')
axes[1, 0].set_ylabel('Number of nonzero coefficients')
axes[1, 0].set_title('Sparsity Comparison')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# L2 norm of the full coefficient vector as a function of λ.
lasso_norms = [np.linalg.norm(c) for c in lasso_coefs]
ridge_norms = [np.linalg.norm(c) for c in ridge_coefs]
axes[1, 1].semilogx(lambdas, lasso_norms, 'b-', linewidth=2, label='Lasso')
axes[1, 1].semilogx(lambdas, ridge_norms, 'r--', linewidth=2, label='Ridge')
axes[1, 1].set_xlabel('λ (regularization strength)')
axes[1, 1].set_ylabel('||w|| (coefficient norm)')
axes[1, 1].set_title('Weight Magnitude Comparison')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Observations:")
print("- Lasso: Coefficients become exactly zero (sparse)")
print("- Ridge: Coefficients shrink but rarely reach zero")
print("- Lasso performs automatic feature selection")

3. Elastic Net: Combining L1 and L2

Objective Function

\[\hat{\mathbf{w}}_{EN} = \arg\min_{\mathbf{w}} \|\mathbf{y} - \mathbf{X}\mathbf{w}\|_2^2 + \lambda_1\|\mathbf{w}\|_1 + \lambda_2\|\mathbf{w}\|_2^2\]

Or equivalently:

\[\hat{\mathbf{w}}_{EN} = \arg\min_{\mathbf{w}} \|\mathbf{y} - \mathbf{X}\mathbf{w}\|_2^2 + \lambda\left(\alpha\|\mathbf{w}\|_1 + \frac{1-\alpha}{2}\|\mathbf{w}\|_2^2\right)\]

Where:

  • \(\alpha \in [0, 1]\) controls the mix of L1 and L2

  • \(\alpha = 1\): Pure Lasso

  • \(\alpha = 0\): Pure Ridge

Advantages

  • Combines benefits of both Lasso and Ridge

  • Can select groups of correlated features (unlike Lasso)

  • More stable than Lasso with correlated features

# Generate data with correlated features
np.random.seed(42)
n_samples = 100
n_features = 20

# Create correlation structure
X_uncorr = np.random.randn(n_samples, n_features)
# Make features 0-4 highly correlated: features 1-4 are copies of feature 0
# plus small independent noise (std 0.1)
X_corr = X_uncorr.copy()
for i in range(1, 5):
    X_corr[:, i] = X_corr[:, 0] + 0.1 * np.random.randn(n_samples)

# True coefficients (only first 5 features are informative)
true_coef = np.zeros(n_features)
true_coef[:5] = [5, 4, 3, 2, 1]

# Targets: linear signal plus Gaussian noise (std 2)
y_corr = X_corr @ true_coef + np.random.randn(n_samples) * 2

# Split
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_corr, y_corr, test_size=0.3, random_state=42)

# Standardize; scaler is fit on the training split only to avoid leakage
scaler = StandardScaler()
X_train_c = scaler.fit_transform(X_train_c)
X_test_c = scaler.transform(X_test_c)

# Compare Ridge, Lasso, and Elastic Net.
# NOTE(review): Ridge receives alpha_val directly while Lasso/ElasticNet
# receive alpha_val/len(X_train_c), so the three penalty strengths are on
# different scales -- confirm this is intentional before comparing MSEs.
alpha_val = 1.0

ridge = Ridge(alpha=alpha_val)
lasso = Lasso(alpha=alpha_val/len(X_train_c))
enet = ElasticNet(alpha=alpha_val/len(X_train_c), l1_ratio=0.5)  # 50-50 mix

ridge.fit(X_train_c, y_train_c)
lasso.fit(X_train_c, y_train_c)
enet.fit(X_train_c, y_train_c)

# Report sparsity (coefficients above a 1e-4 tolerance) and test error
print("Comparison on Correlated Features:")
print(f"\nTrue nonzero coefficients: {np.sum(np.abs(true_coef) > 0)}")
print(f"\nRidge:")
print(f"  Nonzero: {np.sum(np.abs(ridge.coef_) > 1e-4)}")
print(f"  Test MSE: {mean_squared_error(y_test_c, ridge.predict(X_test_c)):.2f}")
print(f"\nLasso:")
print(f"  Nonzero: {np.sum(np.abs(lasso.coef_) > 1e-4)}")
print(f"  Test MSE: {mean_squared_error(y_test_c, lasso.predict(X_test_c)):.2f}")
print(f"\nElastic Net:")
print(f"  Nonzero: {np.sum(np.abs(enet.coef_) > 1e-4)}")
print(f"  Test MSE: {mean_squared_error(y_test_c, enet.predict(X_test_c)):.2f}")

# Visualize how each estimator distributes weight across the correlated
# group (features 0-4, left of the dashed line) vs the irrelevant features.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# True coefficients
axes[0, 0].stem(range(n_features), true_coef, basefmt=' ')
axes[0, 0].axvline(x=4.5, color='r', linestyle='--', alpha=0.5, label='Correlated group')
axes[0, 0].set_xlabel('Feature index')
axes[0, 0].set_ylabel('Coefficient')
axes[0, 0].set_title('True Coefficients (Features 0-4 are correlated)')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Ridge: weight spread over the whole correlated group
axes[0, 1].stem(range(n_features), ridge.coef_, basefmt=' ')
axes[0, 1].axvline(x=4.5, color='r', linestyle='--', alpha=0.5)
axes[0, 1].set_xlabel('Feature index')
axes[0, 1].set_ylabel('Coefficient')
axes[0, 1].set_title('Ridge (keeps all correlated features)')
axes[0, 1].grid(True, alpha=0.3)

# Lasso: tends to keep a single member of the correlated group
axes[1, 0].stem(range(n_features), lasso.coef_, basefmt=' ')
axes[1, 0].axvline(x=4.5, color='r', linestyle='--', alpha=0.5)
axes[1, 0].set_xlabel('Feature index')
axes[1, 0].set_ylabel('Coefficient')
axes[1, 0].set_title('Lasso (picks one from correlated group)')
axes[1, 0].grid(True, alpha=0.3)

# Elastic Net: L2 component retains the group, L1 component keeps sparsity
axes[1, 1].stem(range(n_features), enet.coef_, basefmt=' ')
axes[1, 1].axvline(x=4.5, color='r', linestyle='--', alpha=0.5)
axes[1, 1].set_xlabel('Feature index')
axes[1, 1].set_ylabel('Coefficient')
axes[1, 1].set_title('Elastic Net (balances both)')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nKey insight:")
print("- Ridge: Shrinks all correlated features similarly")
print("- Lasso: Arbitrarily picks one from correlated group")
print("- Elastic Net: Keeps multiple correlated features (more stable)")

Elastic Net: Effect of l1_ratio

What: This code varies the l1_ratio parameter \(\alpha \in [0, 1]\) from pure Ridge (\(\alpha = 0\)) through mixed Elastic Net to pure Lasso (\(\alpha = 1\)), visualizing how each setting distributes nonzero coefficients across features — particularly the correlated group (features 0-4).

Why: The l1_ratio controls the geometry of the penalty region. At \(\alpha = 0\), the constraint set is a smooth ball (L2), producing dense solutions. At \(\alpha = 1\), it is a diamond (L1), producing sparse but unstable solutions with correlated features. Intermediate values create a rounded diamond that inherits sparsity from L1 while the L2 component ensures the grouping effect: correlated features are selected or dropped together rather than arbitrarily picking one.

Connection: Choosing l1_ratio is common in scikit-learn pipelines via ElasticNetCV. In practice, genomics and neuroimaging applications favor \(\alpha \approx 0.5\) because biological features (genes, brain regions) are naturally correlated, and selecting groups rather than individuals yields more reproducible and interpretable models.

# Sweep l1_ratio from pure Ridge (0) through mixed Elastic Net to pure
# Lasso (1) and plot the fitted coefficients for each mixture.
l1_ratios = [0, 0.25, 0.5, 0.75, 1.0]

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

for idx, l1_ratio in enumerate(l1_ratios):
    enet = ElasticNet(alpha=1.0/len(X_train_c), l1_ratio=l1_ratio, max_iter=5000)
    enet.fit(X_train_c, y_train_c)
    
    test_mse = mean_squared_error(y_test_c, enet.predict(X_test_c))
    n_nonzero = np.sum(np.abs(enet.coef_) > 1e-4)
    
    axes[idx].stem(range(n_features), enet.coef_, basefmt=' ')
    # Dashed line marks the boundary of the correlated group (features 0-4)
    axes[idx].axvline(x=4.5, color='r', linestyle='--', alpha=0.5)
    axes[idx].set_xlabel('Feature index')
    axes[idx].set_ylabel('Coefficient')
    
    # Label the endpoints as the pure penalties they reduce to.
    if l1_ratio == 0:
        title = 'Pure Ridge (α=0)'
    elif l1_ratio == 1:
        title = 'Pure Lasso (α=1)'
    else:
        title = f'Elastic Net (α={l1_ratio})'
    
    axes[idx].set_title(f'{title}\nNonzero: {n_nonzero}, MSE: {test_mse:.2f}')
    axes[idx].grid(True, alpha=0.3)

# Hide the unused sixth subplot
axes[5].axis('off')

plt.tight_layout()
plt.show()

4. Feature Selection with Lasso

What: This code uses LassoCV with 5-fold cross-validation to automatically select the optimal regularization strength \(\lambda\) on a 100-feature dataset with only 15 truly informative features. It then evaluates feature selection quality using precision (fraction of selected features that are truly informative) and recall (fraction of truly informative features that are recovered).

Why: In high-dimensional settings where \(p \gg n\) or \(p \approx n\), ordinary least squares overfits catastrophically. Lasso's L1 penalty provides a principled approach to embedded feature selection — simultaneously fitting the model and selecting features. Cross-validation ensures the chosen \(\lambda\) generalizes, avoiding both over-selection (too many spurious features at low \(\lambda\)) and under-selection (missing true features at high \(\lambda\)).

Connection: Lasso-based feature selection is a standard preprocessing step in bioinformatics (gene selection), NLP (vocabulary pruning), and financial modeling (factor selection). It often serves as a fast first pass before applying more expensive methods like stability selection or Bayesian variable selection to refine the feature set.

# Generate high-dimensional sparse data: 100 features, only 15 informative.
np.random.seed(42)
n_samples = 200
n_features = 100
n_informative = 15

X_sparse, y_sparse, true_coef_sparse = make_regression(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    noise=20,
    coef=True,
    random_state=42
)

# Standardize
scaler = StandardScaler()
X_sparse = scaler.fit_transform(X_sparse)

# Split
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_sparse, y_sparse, test_size=0.3, random_state=42)

# 5-fold cross-validation over LassoCV's automatically chosen alpha grid
# to pick the regularization strength that generalizes best.
from sklearn.linear_model import LassoCV

lasso_cv = LassoCV(cv=5, random_state=42, max_iter=5000)
lasso_cv.fit(X_train_s, y_train_s)

print("Feature Selection Results:")
print(f"Optimal alpha: {lasso_cv.alpha_:.4f}")
print(f"True informative features: {n_informative}")
print(f"Selected features (nonzero): {np.sum(np.abs(lasso_cv.coef_) > 1e-4)}")
print(f"Test R²: {lasso_cv.score(X_test_s, y_test_s):.4f}")

# "Selected" = coefficients above a small numerical tolerance.
selected_features = np.where(np.abs(lasso_cv.coef_) > 1e-4)[0]
true_features = np.where(np.abs(true_coef_sparse) > 1e-4)[0]

# Precision/recall of support recovery, guarding against empty sets.
true_positives = len(set(selected_features) & set(true_features))
precision = true_positives / len(selected_features) if len(selected_features) > 0 else 0
recall = true_positives / len(true_features) if len(true_features) > 0 else 0

print(f"\nFeature Selection Quality:")
print(f"Precision: {precision:.2f} (fraction of selected features that are true)")
print(f"Recall: {recall:.2f} (fraction of true features that are selected)")

# Visualize the true support, Lasso's selected support, and recovery quality.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# True coefficients
axes[0].stem(range(n_features), true_coef_sparse, basefmt=' ')
axes[0].set_xlabel('Feature index')
axes[0].set_ylabel('True coefficient')
axes[0].set_title(f'True Coefficients ({len(true_features)} informative)')
axes[0].grid(True, alpha=0.3)

# Lasso selected coefficients
axes[1].stem(range(n_features), lasso_cv.coef_, basefmt=' ')
axes[1].set_xlabel('Feature index')
axes[1].set_ylabel('Lasso coefficient')
axes[1].set_title(f'Lasso Selected ({len(selected_features)} features)')
axes[1].grid(True, alpha=0.3)

# Estimated vs true coefficients; points on the dashed diagonal would be
# perfectly recovered (the L1 penalty shrinks estimates toward zero)
axes[2].scatter(true_coef_sparse, lasso_cv.coef_, alpha=0.6)
axes[2].plot([-50, 50], [-50, 50], 'r--', alpha=0.5, label='Perfect recovery')
axes[2].set_xlabel('True coefficient')
axes[2].set_ylabel('Lasso coefficient')
axes[2].set_title('Coefficient Recovery')
axes[2].legend()
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

5. Sparse Logistic Regression

L1 regularization can also be applied to logistic regression for feature selection in classification.

\[\min_{\mathbf{w}} -\sum_{i=1}^n \left[y_i \log(\mu_i) + (1-y_i)\log(1-\mu_i)\right] + \lambda\|\mathbf{w}\|_1\]
# Generate high-dimensional classification data (50 features, 10 informative)
np.random.seed(42)
X_cls_sparse, y_cls_sparse = make_classification(
    n_samples=300,
    n_features=50,
    n_informative=10,
    n_redundant=0,
    n_classes=2,
    random_state=42
)

# Standardize
scaler = StandardScaler()
X_cls_sparse = scaler.fit_transform(X_cls_sparse)

# Split
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_cls_sparse, y_cls_sparse, test_size=0.3, random_state=42)

# Compare L1 and L2 logistic regression
C_value = 0.1  # Inverse of regularization strength (smaller C = stronger penalty)

# The liblinear solver is used because the default solver does not support
# penalty='l1'.
# NOTE(review): penalty=None requires scikit-learn >= 1.2 (older releases
# spell it penalty='none') -- confirm the pinned sklearn version.
logreg_l1 = LogisticRegression(penalty='l1', C=C_value, solver='liblinear', max_iter=1000)
logreg_l2 = LogisticRegression(penalty='l2', C=C_value, max_iter=1000)
logreg_none = LogisticRegression(penalty=None, max_iter=1000)

logreg_l1.fit(X_train_cls, y_train_cls)
logreg_l2.fit(X_train_cls, y_train_cls)
logreg_none.fit(X_train_cls, y_train_cls)

# Report sparsity and train/test accuracy for each penalty
print("Sparse Logistic Regression Results:")
print(f"\nL1 (Lasso):")
print(f"  Nonzero features: {np.sum(np.abs(logreg_l1.coef_) > 1e-4)}")
print(f"  Train accuracy: {logreg_l1.score(X_train_cls, y_train_cls):.4f}")
print(f"  Test accuracy: {logreg_l1.score(X_test_cls, y_test_cls):.4f}")

print(f"\nL2 (Ridge):")
print(f"  Nonzero features: {np.sum(np.abs(logreg_l2.coef_) > 1e-4)}")
print(f"  Train accuracy: {logreg_l2.score(X_train_cls, y_train_cls):.4f}")
print(f"  Test accuracy: {logreg_l2.score(X_test_cls, y_test_cls):.4f}")

print(f"\nNo regularization:")
print(f"  Nonzero features: {np.sum(np.abs(logreg_none.coef_) > 1e-4)}")
print(f"  Train accuracy: {logreg_none.score(X_train_cls, y_train_cls):.4f}")
print(f"  Test accuracy: {logreg_none.score(X_test_cls, y_test_cls):.4f}")

# Visualize the three coefficient vectors. coef_ is 2-D (one row per class
# pairing; a single row for this binary problem), hence the [0] indexing.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# L1
axes[0].stem(range(50), logreg_l1.coef_[0], basefmt=' ')
axes[0].set_xlabel('Feature index')
axes[0].set_ylabel('Coefficient')
axes[0].set_title(f'L1 Logistic Regression\n({np.sum(np.abs(logreg_l1.coef_) > 1e-4)} selected)')
axes[0].grid(True, alpha=0.3)

# L2
axes[1].stem(range(50), logreg_l2.coef_[0], basefmt=' ')
axes[1].set_xlabel('Feature index')
axes[1].set_ylabel('Coefficient')
axes[1].set_title('L2 Logistic Regression\n(all features used)')
axes[1].grid(True, alpha=0.3)

# No regularization
axes[2].stem(range(50), logreg_none.coef_[0], basefmt=' ')
axes[2].set_xlabel('Feature index')
axes[2].set_ylabel('Coefficient')
axes[2].set_title('No Regularization\n(potentially overfitting)')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Summary

In this notebook, we covered:

  1. Lasso (L1): Sparse solutions via soft thresholding

  2. Coordinate Descent: Efficient algorithm for Lasso

  3. Regularization Paths: How coefficients evolve with λ

  4. Elastic Net: Combining L1 and L2 for stability

  5. Feature Selection: Using sparsity for interpretability

  6. Sparse Logistic Regression: L1 regularization for classification

Key Takeaways

  • L1 vs L2:

    • L1 (Lasso): Induces sparsity, performs feature selection

    • L2 (Ridge): Shrinks coefficients, doesn't zero them out

  • Elastic Net: Best of both worlds

    • Handles correlated features better than Lasso

    • More stable than pure Lasso

    • Controlled by l1_ratio parameter

  • When to use:

    • Lasso: When you believe only few features are relevant

    • Ridge: When all features contribute, no sparsity needed

    • Elastic Net: When features are correlated and you want sparsity

Exercises

  1. Implement proximal gradient descent for Lasso

  2. Derive the soft thresholding operator

  3. Implement group Lasso for selecting feature groups

  4. Compare LARS (Least Angle Regression) with coordinate descent

  5. Implement adaptive Lasso (weighted L1 penalty)