Feature Engineering: Turning Raw Data into Model Fuel
Better features beat better models. This notebook covers encoding, scaling, date features, interaction terms, and target encoding — the techniques that separate 70% accuracy from 90%.
import pandas as pd
import numpy as np
from sklearn.preprocessing import (
LabelEncoder, OrdinalEncoder, OneHotEncoder,
StandardScaler, MinMaxScaler, RobustScaler,
PolynomialFeatures
)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Synthetic e-commerce dataset. Seeded so every run draws identical data;
# note the order of the np.random calls below determines the exact values,
# so do not reorder them.
np.random.seed(42)
n = 2000
df = pd.DataFrame({
    'user_id': np.arange(n),
    # Signups every 12h starting 2020; logins in 2023 with random day jitter.
    'signup_date': pd.date_range('2020-01-01', periods=n, freq='12h'),
    'last_login': pd.date_range('2023-01-01', periods=n, freq='6h') + pd.to_timedelta(np.random.randint(0, 365, n), unit='D'),
    'country': np.random.choice(['US', 'UK', 'DE', 'FR', 'JP', 'Other'], n, p=[0.4, 0.15, 0.1, 0.1, 0.1, 0.15]),
    'plan': np.random.choice(['free', 'basic', 'pro', 'enterprise'], n, p=[0.5, 0.25, 0.15, 0.1]),
    # Lognormal gives the heavy right tail typical of revenue data.
    'total_spent': np.random.lognormal(4, 1.5, n).round(2),
    'n_purchases': np.random.poisson(5, n),
    'session_count': np.random.poisson(20, n),
    'avg_session_min': np.random.exponential(15, n).round(1),
    'support_tickets': np.random.poisson(1, n),
    'referral_source': np.random.choice(['organic', 'paid', 'email', 'social', 'partner'], n),
})
# Target: churned (1) or retained (0)
# Higher engagement → lower churn: logistic link on log-sessions minus a
# ticket penalty, plus Gaussian noise so the boundary is not deterministic.
churn_prob = 1 / (1 + np.exp(0.5 * np.log(df['session_count'] + 1) - 0.3 * df['support_tickets'] + np.random.normal(0, 1, n)))
df['churned'] = (churn_prob > 0.5).astype(int)
print(f'Dataset: {df.shape}, Churn rate: {df["churned"].mean():.1%}')
df.head()
1. Categorical Encoding — Choosing the Right Method
# Method 1: One-Hot Encoding (for low-cardinality, no order)
# Use when: < 10-15 categories, no natural order
country_ohe = pd.get_dummies(df['country'], prefix='country')
print(f'One-Hot Encoding: {country_ohe.shape[1]} columns')
print(country_ohe.head(3))
# Method 2: Ordinal Encoding (for ordered categories)
# Use when: there IS a natural order (free < basic < pro < enterprise)
# Nested list: OrdinalEncoder expects one category list per input column.
plan_order = [['free', 'basic', 'pro', 'enterprise']]
oe = OrdinalEncoder(categories=plan_order)
# Double brackets keep a 2-D frame, which OrdinalEncoder requires.
df['plan_encoded'] = oe.fit_transform(df[['plan']])
print('\nOrdinal encoded plan:')
print(df[['plan', 'plan_encoded']].drop_duplicates().sort_values('plan_encoded'))
# Method 3: Target Encoding (for high-cardinality, regression/classification)
# Use when: many categories (100+), each category's relationship to target matters
# WARNING: MUST be done only on training data to avoid leakage!
def target_encode(train: pd.DataFrame, test: pd.DataFrame, col: str, target: str, smoothing: float = 10.0) -> tuple:
    """Replace category labels with their smoothed mean-target value.

    Statistics come from *train* only (no leakage); both splits are then
    mapped through the same lookup. Categories absent from train fall back
    to the global target mean. Smoothing blends each category mean toward
    the global mean: the blend approaches the global mean as the category
    count goes to 0, and the raw category mean as the count grows.
    """
    prior = train[target].mean()
    per_category = train.groupby(col)[target]
    counts = per_category.count()
    means = per_category.mean()
    # Weighted blend of the category mean and the global prior.
    blended = (counts * means + smoothing * prior) / (counts + smoothing)

    def _apply(frame: pd.DataFrame) -> pd.Series:
        # map() leaves unseen categories as NaN; backfill with the prior.
        return frame[col].map(blended).fillna(prior)

    return _apply(train), _apply(test)
# Split first, THEN encode: the category statistics must never see test rows.
X_train, X_test, y_train, y_test = train_test_split(df, df['churned'], test_size=0.2, random_state=42)
# Both splits receive the mapping learned from X_train only.
X_train['country_te'], X_test['country_te'] = target_encode(X_train, X_test, 'country', 'churned')
X_train['source_te'], X_test['source_te'] = target_encode(X_train, X_test, 'referral_source', 'churned')
print('Target encoding churn rates by country:')
print(X_train.groupby('country')['country_te'].mean().sort_values(ascending=False))
2. Date & Time Features
# Fixed "today" anchor so recency features are reproducible across runs.
reference_date = pd.Timestamp('2024-01-01')
def extract_date_features(df: pd.DataFrame, date_col: str, ref_date=None) -> pd.DataFrame:
    """Extract rich numeric features from a datetime column.

    Parameters
    ----------
    df : DataFrame containing *date_col* as a datetime64 column.
    date_col : column name; also used as the prefix for every feature.
    ref_date : optional anchor Timestamp; when provided, a
        ``{date_col}_days_ago`` recency feature is added.

    Returns
    -------
    DataFrame of features, index-aligned with *df*.
    """
    dt = df[date_col]
    prefix = date_col
    features = pd.DataFrame({
        f'{prefix}_year': dt.dt.year,
        f'{prefix}_month': dt.dt.month,
        f'{prefix}_dayofweek': dt.dt.dayofweek,  # 0=Mon, 6=Sun
        f'{prefix}_is_weekend': dt.dt.dayofweek.isin([5, 6]).astype(int),
        f'{prefix}_quarter': dt.dt.quarter,
        f'{prefix}_is_month_start': dt.dt.is_month_start.astype(int),
        # Cyclical encoding for month (avoids December→January discontinuity)
        f'{prefix}_month_sin': np.sin(2 * np.pi * dt.dt.month / 12),
        f'{prefix}_month_cos': np.cos(2 * np.pi * dt.dt.month / 12),
    })
    # Explicit None check (was a bare truthiness test): the intent is
    # "was a reference date supplied", not "is it a truthy value".
    if ref_date is not None:
        features[f'{prefix}_days_ago'] = (ref_date - dt).dt.days
    return features
signup_features = extract_date_features(df, 'signup_date', reference_date)
print('Generated date features:')
print(signup_features.columns.tolist())
print(signup_features.head(3))
# Recency, Frequency, Monetary (RFM) — classic feature engineering pattern.
df['days_since_signup'] = (reference_date - df['signup_date']).dt.days
df['days_since_login'] = (reference_date - df['last_login']).dt.days
# The +1 denominators guard against division by zero for inactive users.
df['avg_spend_per_order'] = (df['total_spent'] / (df['n_purchases'] + 1)).round(2)
df['tickets_per_session'] = (df['support_tickets'] / (df['session_count'] + 1)).round(4)
# Composite score: log-damped session volume times session length,
# discounted by how long ago the user last logged in.
df['engagement_score'] = (
    np.log1p(df['session_count']) * df['avg_session_min'] / (df['days_since_login'] + 1)
).round(4)
print('RFM-style features:')
print(df[['days_since_signup', 'days_since_login', 'avg_spend_per_order',
          'tickets_per_session', 'engagement_score']].describe().round(2))
3. Scaling — Which Scaler for Which Situation
# Compare how each scaler reshapes the same 200-row sample of numeric features.
num_cols = ['total_spent', 'session_count', 'avg_session_min', 'support_tickets']
sample = df[num_cols].head(200)
scalers = {
    'Original': sample,
    'StandardScaler': pd.DataFrame(StandardScaler().fit_transform(sample), columns=num_cols),
    'MinMaxScaler': pd.DataFrame(MinMaxScaler().fit_transform(sample), columns=num_cols),
    'RobustScaler': pd.DataFrame(RobustScaler().fit_transform(sample), columns=num_cols),
}
# Grid of histograms: one row per scaler, one column per feature.
fig, axes = plt.subplots(len(scalers), len(num_cols), figsize=(16, 12))
for i, (scaler_name, data) in enumerate(scalers.items()):
    for j, col in enumerate(num_cols):
        axes[i, j].hist(data[col], bins=20, edgecolor='black', alpha=0.7)
        if j == 0:
            # Label each row once, on the leftmost subplot.
            axes[i, j].set_ylabel(scaler_name, fontsize=10, fontweight='bold')
        if i == 0:
            # Title each column once, on the top subplot.
            axes[i, j].set_title(col, fontsize=10)
plt.suptitle('Scaler Comparison', fontsize=14)
plt.tight_layout()
plt.show()
# Quick decision guide, printed line by line.
for guidance_line in (
    'When to use which scaler:',
    ' StandardScaler: normally distributed features, linear models, SVM',
    ' MinMaxScaler: bounded output [0,1] needed, neural networks',
    ' RobustScaler: outliers present (uses IQR, not std)',
):
    print(guidance_line)
4. Polynomial & Interaction Features
# Polynomial features: capture non-linear relationships.
# degree=2 on two columns yields x, y, x^2, x*y, y^2 (bias column dropped).
base_features = df[['session_count', 'total_spent']].head(200)
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly_features = pd.DataFrame(
    poly.fit_transform(base_features),
    columns=poly.get_feature_names_out(['session_count', 'total_spent'])
)
print('Polynomial (degree=2) features:')
print(poly_features.columns.tolist())
print(poly_features.head(3).round(2))
# Feature importance: compare raw vs. engineered features.
# (Dropped a redundant re-import of RandomForestClassifier — it is already
# imported at the top of the file.)
# Base features only
base_cols = ['n_purchases', 'session_count', 'avg_session_min', 'support_tickets', 'total_spent', 'plan_encoded']
# Engineered features added
eng_cols = base_cols + ['days_since_login', 'avg_spend_per_order', 'tickets_per_session', 'engagement_score']
target = 'churned'
# 80/20 positional holdout; was hard-coded to 1600, which only worked for
# n=2000. Deriving it keeps the same split here and generalizes.
split_idx = int(len(df) * 0.8)
results = {}
for name, features in [('Base features', base_cols), ('+ Engineered features', eng_cols)]:
    X_tr = df[features].iloc[:split_idx]
    X_te = df[features].iloc[split_idx:]
    y_tr = df[target].iloc[:split_idx]
    y_te = df[target].iloc[split_idx:]
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_tr, y_tr)
    acc = accuracy_score(y_te, rf.predict(X_te))
    results[name] = acc
    print(f'{name}: {acc:.3f} accuracy')
# Relative lift of engineered over base, in percent.
improvement = (results['+ Engineered features'] - results['Base features']) / results['Base features'] * 100
print(f'\nAccuracy improvement from feature engineering: {improvement:+.1f}%')
Feature Engineering Cheat Sheet
Data Type     → Feature Ideas
────────────────────────────────────────────────────
Datetime      → year, month, weekday, hour, is_weekend,
                cyclical sin/cos, days since reference
Counts        → log1p(x), sqrt(x), x_per_session
Monetary      → log1p(spend), spend_per_order, quantile_bin
Categorical   → OHE (low card), Ordinal (ordered), Target (high card)
Text          → TF-IDF, char count, word count, sentiment
IDs           → Count encode (frequency of that ID in data)
Relationships → ratio features, interaction products
Exercises
1. Add a `log1p` transformation to `total_spent` and `n_purchases`. Does it improve model accuracy?
2. Implement a `FrequencyEncoder` that replaces each category with its frequency in the training set.
3. Create a `days_active` feature: days between `signup_date` and `last_login`.
4. Use `SelectKBest(f_classif, k=10)` to automatically select the 10 most informative features.
5. Plot feature importances from the RandomForest for both base and engineered feature sets.