import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import make_regression, make_classification
import pandas as pd
# Global plotting defaults for every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
# Fix the RNG seed so data generation and results are reproducible
np.random.seed(42)
1. Lasso Regression (L1 Regularization)

Objective Function

\[\min_{\mathbf{w}} \; \frac{1}{2n}\|\mathbf{y} - X\mathbf{w}\|_2^2 + \lambda \|\mathbf{w}\|_1\]

Where:
\(\|\mathbf{w}\|_1 = \sum_{j=1}^p |w_j|\) is the L1 norm
\(\lambda\) controls sparsity (larger \(\lambda\) → more zeros)
Key Properties

Sparsity: The L1 penalty drives many coefficients exactly to zero
Feature selection: Automatically selects relevant features
No closed-form solution: Requires iterative algorithms (coordinate descent, LARS, etc.)

Soft Thresholding Operator

\[S(z, \gamma) = \operatorname{sign}(z)\,\max(|z| - \gamma,\, 0)\]
# Visualize the soft thresholding operator S(z, γ) = sign(z)·max(|z| − γ, 0),
# the closed-form solution of the one-dimensional lasso subproblem.
z = np.linspace(-3, 3, 100)
gammas = [0, 0.5, 1.0, 1.5]
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
for gamma in gammas:
    s_z = np.sign(z) * np.maximum(np.abs(z) - gamma, 0)
    plt.plot(z, s_z, linewidth=2, label=f'γ={gamma}')
# γ = 0 coincides with the identity; larger γ widens the dead zone around 0
plt.plot(z, z, 'k--', alpha=0.3, label='Identity')
plt.xlabel('z')
plt.ylabel('S(z, γ)')
plt.title('Soft Thresholding Operator')
plt.legend()
plt.grid(True, alpha=0.3)
# Compare the L1 and L2 penalty functions themselves
plt.subplot(1, 2, 2)
w = np.linspace(-2, 2, 100)
l1_penalty = np.abs(w)
l2_penalty = w**2
plt.plot(w, l1_penalty, 'b-', linewidth=2, label='L1: |w|')
plt.plot(w, l2_penalty, 'r-', linewidth=2, label='L2: w²')
plt.xlabel('w')
plt.ylabel('Penalty')
plt.title('L1 vs L2 Penalty Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Key difference:")
print("- L1 penalty is linear → encourages exact zeros")
print("- L2 penalty is quadratic → shrinks but doesn't zero out")
Coordinate Descent for Lasso

Update one coordinate at a time while holding the others fixed:

\[w_j \leftarrow \frac{S\!\left(\frac{1}{n}\sum_{i=1}^n x_{ij} r_i,\; \lambda\right)}{\frac{1}{n}\sum_{i=1}^n x_{ij}^2}\]

Where \(r_i = y_i - \sum_{k \neq j} x_{ik}w_k\) is the partial residual.
class LassoCoordinateDescent:
    """Lasso regression fit by cyclic coordinate descent.

    Minimizes (1/2n)·||y − Xw||² + λ·||w||₁ by updating one coefficient
    at a time with the soft thresholding operator. Assumes X is
    standardized and fits no intercept.
    """

    def __init__(self, lambda_=1.0, max_iter=1000, tol=1e-4):
        self.lambda_ = lambda_    # regularization strength λ
        self.max_iter = max_iter  # maximum full sweeps over all coordinates
        self.tol = tol            # convergence threshold on ||w_new - w_old||
        self.coef_path = []       # coefficient snapshots recorded during fit

    def soft_threshold(self, z, gamma):
        """Soft thresholding operator S(z, γ) = sign(z)·max(|z| − γ, 0)."""
        return np.sign(z) * np.maximum(np.abs(z) - gamma, 0)

    def fit(self, X, y):
        """Fit coefficients self.w on (X, y) and return self."""
        n_samples, n_features = X.shape
        # Initialize weights
        self.w = np.zeros(n_features)
        # Reset the path so repeated fit() calls don't accumulate stale history
        self.coef_path = []
        # Precompute (1/n)·||x_j||² per column — denominator of the update
        X_col_norm_sq = np.sum(X**2, axis=0) / n_samples
        for iteration in range(self.max_iter):
            w_old = self.w.copy()
            # Cyclic sweep: update each coordinate with the others held fixed
            for j in range(n_features):
                # Partial residual: residual with feature j's contribution removed
                r_j = y - X @ self.w + X[:, j] * self.w[j]
                # ρ_j = (1/n)·x_jᵀ r_j, correlation of feature j with the residual
                rho_j = np.dot(X[:, j], r_j) / n_samples
                # Soft thresholding update
                if X_col_norm_sq[j] > 1e-10:  # guard against all-zero columns
                    self.w[j] = self.soft_threshold(rho_j, self.lambda_) / X_col_norm_sq[j]
                else:
                    self.w[j] = 0
            # Record a snapshot every 10 sweeps for later visualization
            if iteration % 10 == 0:
                self.coef_path.append(self.w.copy())
            # Converged when a full sweep barely moves the weights
            if np.linalg.norm(self.w - w_old) < self.tol:
                break
        return self

    def predict(self, X):
        """Predict X @ w (no intercept)."""
        return X @ self.w

    def mse(self, X, y):
        """Mean squared error of predictions on (X, y)."""
        return np.mean((y - self.predict(X))**2)

    def n_nonzero(self):
        """Number of coefficients that are not numerically zero."""
        return np.sum(np.abs(self.w) > 1e-10)
# Generate sparse data: only n_informative of the 50 features actually drive y
np.random.seed(42)
n_samples = 100
n_features = 50
n_informative = 10
X, y, true_coef = make_regression(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    noise=10,
    coef=True,  # also return the ground-truth coefficient vector
    random_state=42
)
# Standardize features so the penalty treats all columns on the same scale
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit custom Lasso
lasso_custom = LassoCoordinateDescent(lambda_=1.0, max_iter=1000)
lasso_custom.fit(X_train, y_train)
# Fit sklearn Lasso (alpha = lambda / n_samples)
# NOTE(review): sklearn's Lasso minimizes (1/2n)||y - Xw||^2 + alpha*||w||_1,
# which already matches the custom update's 1/n normalization with alpha = lambda;
# dividing by len(X_train) here makes the sklearn penalty weaker — verify intent.
lasso_sklearn = Lasso(alpha=1.0/len(X_train), max_iter=1000)
lasso_sklearn.fit(X_train, y_train)
# Compare sparsity and train/test error of the two implementations
print("Lasso Regression Results:")
print(f"True number of nonzero coefficients: {np.sum(np.abs(true_coef) > 1e-10)}")
print(f"\nCustom Lasso:")
print(f" Number of nonzero coefficients: {lasso_custom.n_nonzero()}")
print(f" Train MSE: {lasso_custom.mse(X_train, y_train):.2f}")
print(f" Test MSE: {lasso_custom.mse(X_test, y_test):.2f}")
print(f"\nSklearn Lasso:")
print(f" Number of nonzero coefficients: {np.sum(np.abs(lasso_sklearn.coef_) > 1e-10)}")
print(f" Train MSE: {mean_squared_error(y_train, lasso_sklearn.predict(X_train)):.2f}")
print(f" Test MSE: {mean_squared_error(y_test, lasso_sklearn.predict(X_test)):.2f}")
# Compare true, custom-lasso, and sklearn-lasso coefficients, plus convergence
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# True coefficients
axes[0, 0].stem(range(n_features), true_coef, basefmt=' ')
axes[0, 0].set_xlabel('Feature index')
axes[0, 0].set_ylabel('Coefficient value')
axes[0, 0].set_title('True Coefficients (Sparse)')
axes[0, 0].grid(True, alpha=0.3)
# Custom Lasso coefficients
axes[0, 1].stem(range(n_features), lasso_custom.w, basefmt=' ')
axes[0, 1].set_xlabel('Feature index')
axes[0, 1].set_ylabel('Coefficient value')
axes[0, 1].set_title(f'Custom Lasso ({lasso_custom.n_nonzero()} nonzero)')
axes[0, 1].grid(True, alpha=0.3)
# Sklearn Lasso coefficients
axes[1, 0].stem(range(n_features), lasso_sklearn.coef_, basefmt=' ')
axes[1, 0].set_xlabel('Feature index')
axes[1, 0].set_ylabel('Coefficient value')
axes[1, 0].set_title(f'Sklearn Lasso ({np.sum(np.abs(lasso_sklearn.coef_) > 1e-10)} nonzero)')
axes[1, 0].grid(True, alpha=0.3)
# Convergence: snapshots were recorded every 10 coordinate-descent sweeps
coef_path = np.array(lasso_custom.coef_path)
for j in range(min(10, n_features)):
    axes[1, 1].plot(coef_path[:, j], alpha=0.7)
axes[1, 1].set_xlabel('Iteration (×10)')
axes[1, 1].set_ylabel('Coefficient value')
axes[1, 1].set_title('Coefficient Path (First 10 features)')
axes[1, 1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
2. Regularization Path: Ridge vs Lasso

What: This code sweeps \(\lambda\) across several orders of magnitude and tracks how each coefficient evolves, creating "regularization paths" for both Ridge and Lasso. It also plots sparsity (number of nonzero coefficients) and total weight magnitude as functions of \(\lambda\).
Why: The regularization path reveals the fundamental difference between L1 and L2 penalties. Ridge coefficients shrink smoothly toward zero but never reach it, while Lasso coefficients hit exactly zero at finite \(\lambda\) values — a consequence of the L1 norm's non-differentiability at the origin. The sparsity plot makes this explicit: Lasso transitions from a dense model (low \(\lambda\)) to a completely sparse one (high \(\lambda\)), enabling a principled trade-off between model complexity and fit.
Connection: Regularization paths are computed efficiently by the LARS algorithm (for Lasso) and analytically (for Ridge), making them practical tools for model selection. In genomics, regularization paths over thousands of genes help identify which few drive a disease phenotype. In production systems, visualizing the path helps ML engineers choose \(\lambda\) that balances interpretability with predictive accuracy.
# Sweep λ over five orders of magnitude (log-spaced) for both penalties
lambdas = np.logspace(-3, 2, 50)
# Storage for coefficient paths and sparsity counts at each λ
lasso_coefs = []
ridge_coefs = []
lasso_nonzero = []
ridge_nonzero = []
for lam in lambdas:
    # Lasso
    lasso = Lasso(alpha=lam/len(X_train), max_iter=5000)
    lasso.fit(X_train, y_train)
    lasso_coefs.append(lasso.coef_)
    lasso_nonzero.append(np.sum(np.abs(lasso.coef_) > 1e-4))
    # Ridge
    ridge = Ridge(alpha=lam)
    ridge.fit(X_train, y_train)
    ridge_coefs.append(ridge.coef_)
    ridge_nonzero.append(np.sum(np.abs(ridge.coef_) > 1e-4))
# Stack into (n_lambdas, n_features) arrays for plotting
lasso_coefs = np.array(lasso_coefs)
ridge_coefs = np.array(ridge_coefs)
# Visualize regularization paths, sparsity, and weight magnitude vs λ
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# Lasso path: coefficients hit exactly zero as λ grows
axes[0, 0].semilogx(lambdas, lasso_coefs[:, :20], alpha=0.7)
axes[0, 0].set_xlabel('λ (regularization strength)')
axes[0, 0].set_ylabel('Coefficient value')
axes[0, 0].set_title('Lasso Regularization Path (First 20 features)')
axes[0, 0].axhline(y=0, color='k', linestyle='--', alpha=0.3)
axes[0, 0].grid(True, alpha=0.3)
# Ridge path: coefficients shrink smoothly but stay nonzero
axes[0, 1].semilogx(lambdas, ridge_coefs[:, :20], alpha=0.7)
axes[0, 1].set_xlabel('λ (regularization strength)')
axes[0, 1].set_ylabel('Coefficient value')
axes[0, 1].set_title('Ridge Regularization Path (First 20 features)')
axes[0, 1].axhline(y=0, color='k', linestyle='--', alpha=0.3)
axes[0, 1].grid(True, alpha=0.3)
# Number of nonzero coefficients as a function of λ
axes[1, 0].semilogx(lambdas, lasso_nonzero, 'b-', linewidth=2, label='Lasso')
axes[1, 0].semilogx(lambdas, ridge_nonzero, 'r--', linewidth=2, label='Ridge')
axes[1, 0].set_xlabel('λ (regularization strength)')
axes[1, 0].set_ylabel('Number of nonzero coefficients')
axes[1, 0].set_title('Sparsity Comparison')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)
# Total coefficient magnitude (L2 norm) as a function of λ
lasso_norms = [np.linalg.norm(c) for c in lasso_coefs]
ridge_norms = [np.linalg.norm(c) for c in ridge_coefs]
axes[1, 1].semilogx(lambdas, lasso_norms, 'b-', linewidth=2, label='Lasso')
axes[1, 1].semilogx(lambdas, ridge_norms, 'r--', linewidth=2, label='Ridge')
axes[1, 1].set_xlabel('λ (regularization strength)')
axes[1, 1].set_ylabel('||w|| (coefficient norm)')
axes[1, 1].set_title('Weight Magnitude Comparison')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Observations:")
print("- Lasso: Coefficients become exactly zero (sparse)")
print("- Ridge: Coefficients shrink but rarely reach zero")
print("- Lasso performs automatic feature selection")
3. Elastic Net: Combining L1 and L2

Objective Function

\[\min_{\mathbf{w}} \; \frac{1}{2n}\|\mathbf{y} - X\mathbf{w}\|_2^2 + \lambda\left(\alpha\|\mathbf{w}\|_1 + \frac{1-\alpha}{2}\|\mathbf{w}\|_2^2\right)\]

Or equivalently:

\[\min_{\mathbf{w}} \; \frac{1}{2n}\|\mathbf{y} - X\mathbf{w}\|_2^2 + \lambda_1\|\mathbf{w}\|_1 + \lambda_2\|\mathbf{w}\|_2^2\]

Where:
\(\alpha \in [0, 1]\) controls the mix of L1 and L2
\(\alpha = 1\): Pure Lasso
\(\alpha = 0\): Pure Ridge

Advantages

Combines the benefits of both Lasso and Ridge
Can select groups of correlated features (unlike Lasso)
More stable than Lasso with correlated features
# Generate data with a correlated feature group to contrast the three penalties
np.random.seed(42)
n_samples = 100
n_features = 20
# Create correlation structure
X_uncorr = np.random.randn(n_samples, n_features)
# Make features 1-4 near-copies of feature 0 (plus small noise)
X_corr = X_uncorr.copy()
for i in range(1, 5):
    X_corr[:, i] = X_corr[:, 0] + 0.1 * np.random.randn(n_samples)
# True coefficients (only the first 5 features are informative)
true_coef = np.zeros(n_features)
true_coef[:5] = [5, 4, 3, 2, 1]
y_corr = X_corr @ true_coef + np.random.randn(n_samples) * 2
# Split
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_corr, y_corr, test_size=0.3, random_state=42)
# Standardize (scaler is fit on the training split only, then applied to test)
scaler = StandardScaler()
X_train_c = scaler.fit_transform(X_train_c)
X_test_c = scaler.transform(X_test_c)
# Compare Ridge, Lasso, and Elastic Net at the same nominal strength
alpha_val = 1.0
ridge = Ridge(alpha=alpha_val)
lasso = Lasso(alpha=alpha_val/len(X_train_c))
enet = ElasticNet(alpha=alpha_val/len(X_train_c), l1_ratio=0.5)  # 50-50 mix
ridge.fit(X_train_c, y_train_c)
lasso.fit(X_train_c, y_train_c)
enet.fit(X_train_c, y_train_c)
# Report sparsity and test error for each model
print("Comparison on Correlated Features:")
print(f"\nTrue nonzero coefficients: {np.sum(np.abs(true_coef) > 0)}")
print(f"\nRidge:")
print(f" Nonzero: {np.sum(np.abs(ridge.coef_) > 1e-4)}")
print(f" Test MSE: {mean_squared_error(y_test_c, ridge.predict(X_test_c)):.2f}")
print(f"\nLasso:")
print(f" Nonzero: {np.sum(np.abs(lasso.coef_) > 1e-4)}")
print(f" Test MSE: {mean_squared_error(y_test_c, lasso.predict(X_test_c)):.2f}")
print(f"\nElastic Net:")
print(f" Nonzero: {np.sum(np.abs(enet.coef_) > 1e-4)}")
print(f" Test MSE: {mean_squared_error(y_test_c, enet.predict(X_test_c)):.2f}")
# Plot the four coefficient vectors side by side; the dashed red line marks
# the boundary of the correlated group (features 0-4).
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
panels = [
    (axes[0, 0], true_coef, 'True Coefficients (Features 0-4 are correlated)'),
    (axes[0, 1], ridge.coef_, 'Ridge (keeps all correlated features)'),
    (axes[1, 0], lasso.coef_, 'Lasso (picks one from correlated group)'),
    (axes[1, 1], enet.coef_, 'Elastic Net (balances both)'),
]
for ax, coefs, title in panels:
    ax.stem(range(n_features), coefs, basefmt=' ')
    if ax is axes[0, 0]:
        # Only the first panel labels the divider, so the legend appears once
        ax.axvline(x=4.5, color='r', linestyle='--', alpha=0.5, label='Correlated group')
        ax.legend()
    else:
        ax.axvline(x=4.5, color='r', linestyle='--', alpha=0.5)
    ax.set_xlabel('Feature index')
    ax.set_ylabel('Coefficient')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("\nKey insight:")
print("- Ridge: Shrinks all correlated features similarly")
print("- Lasso: Arbitrarily picks one from correlated group")
print("- Elastic Net: Keeps multiple correlated features (more stable)")
Elastic Net: Effect of l1_ratio

What: This code varies the l1_ratio parameter \(\alpha \in [0, 1]\) from pure Ridge (\(\alpha = 0\)) through mixed Elastic Net to pure Lasso (\(\alpha = 1\)), visualizing how each setting distributes nonzero coefficients across features — particularly the correlated group (features 0-4).
Why: The l1_ratio controls the geometry of the penalty region. At \(\alpha = 0\), the constraint set is a smooth ball (L2), producing dense solutions. At \(\alpha = 1\), it is a diamond (L1), producing sparse but unstable solutions with correlated features. Intermediate values create a rounded diamond that inherits sparsity from L1 while the L2 component ensures the grouping effect: correlated features are selected or dropped together rather than arbitrarily picking one.
Connection: Choosing l1_ratio is commonly done in scikit-learn pipelines via ElasticNetCV. In practice, genomics and neuroimaging applications favor \(\alpha \approx 0.5\) because biological features (genes, brain regions) are naturally correlated, and selecting groups rather than individuals yields more reproducible and interpretable models.
# Sweep the l1_ratio (α) from pure Ridge to pure Lasso and plot coefficients
l1_ratios = [0, 0.25, 0.5, 0.75, 1.0]
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()
for idx, l1_ratio in enumerate(l1_ratios):
    enet = ElasticNet(alpha=1.0/len(X_train_c), l1_ratio=l1_ratio, max_iter=5000)
    enet.fit(X_train_c, y_train_c)
    test_mse = mean_squared_error(y_test_c, enet.predict(X_test_c))
    n_nonzero = np.sum(np.abs(enet.coef_) > 1e-4)
    axes[idx].stem(range(n_features), enet.coef_, basefmt=' ')
    # Dashed line marks the correlated group boundary (features 0-4)
    axes[idx].axvline(x=4.5, color='r', linestyle='--', alpha=0.5)
    axes[idx].set_xlabel('Feature index')
    axes[idx].set_ylabel('Coefficient')
    if l1_ratio == 0:
        title = 'Pure Ridge (α=0)'
    elif l1_ratio == 1:
        title = 'Pure Lasso (α=1)'
    else:
        title = f'Elastic Net (α={l1_ratio})'
    axes[idx].set_title(f'{title}\nNonzero: {n_nonzero}, MSE: {test_mse:.2f}')
    axes[idx].grid(True, alpha=0.3)
# Hide the unused sixth subplot
axes[5].axis('off')
plt.tight_layout()
plt.show()
4. Feature Selection with Lasso

What: This code uses LassoCV with 5-fold cross-validation to automatically select the optimal regularization strength \(\lambda\) on a 100-feature dataset with only 15 truly informative features. It then evaluates feature selection quality using precision (fraction of selected features that are truly informative) and recall (fraction of truly informative features that are recovered).
Why: In high-dimensional settings where \(p \gg n\) or \(p \approx n\), ordinary least squares overfits catastrophically. Lasso's L1 penalty provides a principled approach to embedded feature selection — simultaneously fitting the model and selecting features. Cross-validation ensures the chosen \(\lambda\) generalizes, avoiding both over-selection (too many spurious features at low \(\lambda\)) and under-selection (missing true features at high \(\lambda\)).
Connection: Lasso-based feature selection is a standard preprocessing step in bioinformatics (gene selection), NLP (vocabulary pruning), and financial modeling (factor selection). It often serves as a fast first pass before applying more expensive methods like stability selection or Bayesian variable selection to refine the feature set.
# Generate high-dimensional sparse data (100 features, only 15 informative)
np.random.seed(42)
n_samples = 200
n_features = 100
n_informative = 15
X_sparse, y_sparse, true_coef_sparse = make_regression(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    noise=20,
    coef=True,  # also return the ground-truth coefficient vector
    random_state=42
)
# Standardize
scaler = StandardScaler()
X_sparse = scaler.fit_transform(X_sparse)
# Split
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_sparse, y_sparse, test_size=0.3, random_state=42)
# 5-fold cross-validation picks the regularization strength automatically
from sklearn.linear_model import LassoCV
lasso_cv = LassoCV(cv=5, random_state=42, max_iter=5000)
lasso_cv.fit(X_train_s, y_train_s)
# Report the CV-selected model and measure feature-selection quality
print("Feature Selection Results:")
print(f"Optimal alpha: {lasso_cv.alpha_:.4f}")
print(f"True informative features: {n_informative}")
print(f"Selected features (nonzero): {np.sum(np.abs(lasso_cv.coef_) > 1e-4)}")
print(f"Test R²: {lasso_cv.score(X_test_s, y_test_s):.4f}")
# Indices of selected vs truly informative features
selected_features = np.where(np.abs(lasso_cv.coef_) > 1e-4)[0]
true_features = np.where(np.abs(true_coef_sparse) > 1e-4)[0]
# Precision: selected features that are truly informative;
# Recall: informative features that were recovered
true_positives = len(set(selected_features) & set(true_features))
precision = true_positives / len(selected_features) if len(selected_features) > 0 else 0
recall = true_positives / len(true_features) if len(true_features) > 0 else 0
print(f"\nFeature Selection Quality:")
print(f"Precision: {precision:.2f} (fraction of selected features that are true)")
print(f"Recall: {recall:.2f} (fraction of true features that are selected)")
# Visualize true vs selected coefficients and recovery quality
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
# True coefficients
axes[0].stem(range(n_features), true_coef_sparse, basefmt=' ')
axes[0].set_xlabel('Feature index')
axes[0].set_ylabel('True coefficient')
axes[0].set_title(f'True Coefficients ({len(true_features)} informative)')
axes[0].grid(True, alpha=0.3)
# Lasso selected coefficients
axes[1].stem(range(n_features), lasso_cv.coef_, basefmt=' ')
axes[1].set_xlabel('Feature index')
axes[1].set_ylabel('Lasso coefficient')
axes[1].set_title(f'Lasso Selected ({len(selected_features)} features)')
axes[1].grid(True, alpha=0.3)
# Estimated vs true coefficients; perfect recovery lies on the y = x line
axes[2].scatter(true_coef_sparse, lasso_cv.coef_, alpha=0.6)
axes[2].plot([-50, 50], [-50, 50], 'r--', alpha=0.5, label='Perfect recovery')
axes[2].set_xlabel('True coefficient')
axes[2].set_ylabel('Lasso coefficient')
axes[2].set_title('Coefficient Recovery')
axes[2].legend()
axes[2].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
5. Sparse Logistic Regression

L1 regularization can also be applied to logistic regression for feature selection in classification.
# Generate high-dimensional binary classification data (10 informative of 50)
np.random.seed(42)
X_cls_sparse, y_cls_sparse = make_classification(
    n_samples=300,
    n_features=50,
    n_informative=10,
    n_redundant=0,
    n_classes=2,
    random_state=42
)
# Standardize
scaler = StandardScaler()
X_cls_sparse = scaler.fit_transform(X_cls_sparse)
# Split
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_cls_sparse, y_cls_sparse, test_size=0.3, random_state=42)
# Compare L1-, L2-, and un-regularized logistic regression
C_value = 0.1  # Inverse of regularization strength (smaller C = stronger penalty)
# The liblinear solver supports the L1 penalty
logreg_l1 = LogisticRegression(penalty='l1', C=C_value, solver='liblinear', max_iter=1000)
logreg_l2 = LogisticRegression(penalty='l2', C=C_value, max_iter=1000)
logreg_none = LogisticRegression(penalty=None, max_iter=1000)
logreg_l1.fit(X_train_cls, y_train_cls)
logreg_l2.fit(X_train_cls, y_train_cls)
logreg_none.fit(X_train_cls, y_train_cls)
# Report accuracy and sparsity for each penalty, then plot the coefficients
print("Sparse Logistic Regression Results:")
labeled_models = [
    ('L1 (Lasso)', logreg_l1),
    ('L2 (Ridge)', logreg_l2),
    ('No regularization', logreg_none),
]
for label, model in labeled_models:
    print(f"\n{label}:")
    print(f" Nonzero features: {np.sum(np.abs(model.coef_) > 1e-4)}")
    print(f" Train accuracy: {model.score(X_train_cls, y_train_cls):.4f}")
    print(f" Test accuracy: {model.score(X_test_cls, y_test_cls):.4f}")
# One stem plot of the coefficient vector per penalty
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
panels = [
    (logreg_l1.coef_[0],
     f'L1 Logistic Regression\n({np.sum(np.abs(logreg_l1.coef_) > 1e-4)} selected)'),
    (logreg_l2.coef_[0], 'L2 Logistic Regression\n(all features used)'),
    (logreg_none.coef_[0], 'No Regularization\n(potentially overfitting)'),
]
for ax, (coefs, title) in zip(axes, panels):
    ax.stem(range(50), coefs, basefmt=' ')
    ax.set_xlabel('Feature index')
    ax.set_ylabel('Coefficient')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Summary

In this notebook, we covered:
Lasso (L1): Sparse solutions via soft thresholding
Coordinate Descent: Efficient algorithm for Lasso
Regularization Paths: How coefficients evolve with λ
Elastic Net: Combining L1 and L2 for stability
Feature Selection: Using sparsity for interpretability
Sparse Logistic Regression: L1 regularization for classification

Key Takeaways

L1 vs L2:
L1 (Lasso): Induces sparsity, performs feature selection
L2 (Ridge): Shrinks coefficients, doesn't zero them out
Elastic Net: Best of both worlds
Handles correlated features better than Lasso
More stable than pure Lasso
Controlled by the l1_ratio parameter
When to use:
Lasso: When you believe only a few features are relevant
Ridge: When all features contribute and no sparsity is needed
Elastic Net: When features are correlated and you want sparsity

Exercises

1. Implement proximal gradient descent for Lasso
2. Derive the soft thresholding operator from the lasso optimality conditions
3. Implement group Lasso for selecting feature groups
4. Compare LARS (Least Angle Regression) with coordinate descent
5. Implement adaptive Lasso (weighted L1 penalty)