Feature Engineering: Turning Raw Data into Model Fuel
Better features beat better models. This notebook covers encoding, scaling, date features, interaction terms, and target encoding — the techniques that separate 70% accuracy from 90%.
import pandas as pd
import numpy as np
from sklearn.preprocessing import (
LabelEncoder, OrdinalEncoder, OneHotEncoder,
StandardScaler, MinMaxScaler, RobustScaler,
PolynomialFeatures
)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Synthetic e-commerce dataset. Seeded so every run draws identical data;
# note the order of the np.random calls below determines the exact values,
# so do not reorder them.
np.random.seed(42)
n = 2000
df = pd.DataFrame({
    'user_id': np.arange(n),
    # Signups every 12h starting 2020; logins in 2023 with random day jitter.
    'signup_date': pd.date_range('2020-01-01', periods=n, freq='12h'),
    'last_login': pd.date_range('2023-01-01', periods=n, freq='6h') + pd.to_timedelta(np.random.randint(0, 365, n), unit='D'),
    'country': np.random.choice(['US', 'UK', 'DE', 'FR', 'JP', 'Other'], n, p=[0.4, 0.15, 0.1, 0.1, 0.1, 0.15]),
    'plan': np.random.choice(['free', 'basic', 'pro', 'enterprise'], n, p=[0.5, 0.25, 0.15, 0.1]),
    # Lognormal gives the heavy right tail typical of revenue data.
    'total_spent': np.random.lognormal(4, 1.5, n).round(2),
    'n_purchases': np.random.poisson(5, n),
    'session_count': np.random.poisson(20, n),
    'avg_session_min': np.random.exponential(15, n).round(1),
    'support_tickets': np.random.poisson(1, n),
    'referral_source': np.random.choice(['organic', 'paid', 'email', 'social', 'partner'], n),
})
# Target: churned (1) or retained (0)
# Higher engagement → lower churn: logistic link on log-sessions minus a
# ticket penalty, plus Gaussian noise so the boundary is not deterministic.
churn_prob = 1 / (1 + np.exp(0.5 * np.log(df['session_count'] + 1) - 0.3 * df['support_tickets'] + np.random.normal(0, 1, n)))
df['churned'] = (churn_prob > 0.5).astype(int)
print(f'Dataset: {df.shape}, Churn rate: {df["churned"].mean():.1%}')
df.head()
1. Categorical Encoding — Choosing the Right Method
# Method 1: One-Hot Encoding (for low-cardinality, no order)
# Use when: < 10-15 categories, no natural order
country_ohe = pd.get_dummies(df['country'], prefix='country')
print(f'One-Hot Encoding: {country_ohe.shape[1]} columns')
print(country_ohe.head(3))
# Method 2: Ordinal Encoding (for ordered categories)
# Use when: there IS a natural order (free < basic < pro < enterprise)
# Nested list: OrdinalEncoder expects one category list per input column.
plan_order = [['free', 'basic', 'pro', 'enterprise']]
oe = OrdinalEncoder(categories=plan_order)
# Double brackets keep a 2-D frame, which OrdinalEncoder requires.
df['plan_encoded'] = oe.fit_transform(df[['plan']])
print('\nOrdinal encoded plan:')
print(df[['plan', 'plan_encoded']].drop_duplicates().sort_values('plan_encoded'))
# Method 3: Target Encoding (for high-cardinality, regression/classification)
# Use when: many categories (100+), each category's relationship to target matters
# WARNING: MUST be done only on training data to avoid leakage!
def target_encode(train: pd.DataFrame, test: pd.DataFrame, col: str, target: str, smoothing: float = 10.0) -> tuple:
    """Replace category labels with their smoothed mean-target value.

    Statistics come from *train* only (no leakage); both splits are then
    mapped through the same lookup. Categories absent from train fall back
    to the global target mean. Smoothing blends each category mean toward
    the global mean: the blend approaches the global mean as the category
    count goes to 0, and the raw category mean as the count grows.
    """
    prior = train[target].mean()
    per_category = train.groupby(col)[target]
    counts = per_category.count()
    means = per_category.mean()
    # Weighted blend of the category mean and the global prior.
    blended = (counts * means + smoothing * prior) / (counts + smoothing)

    def _apply(frame: pd.DataFrame) -> pd.Series:
        # map() leaves unseen categories as NaN; backfill with the prior.
        return frame[col].map(blended).fillna(prior)

    return _apply(train), _apply(test)
# Split first, THEN encode: the category statistics must never see test rows.
X_train, X_test, y_train, y_test = train_test_split(df, df['churned'], test_size=0.2, random_state=42)
# Both splits receive the mapping learned from X_train only.
X_train['country_te'], X_test['country_te'] = target_encode(X_train, X_test, 'country', 'churned')
X_train['source_te'], X_test['source_te'] = target_encode(X_train, X_test, 'referral_source', 'churned')
print('Target encoding churn rates by country:')
print(X_train.groupby('country')['country_te'].mean().sort_values(ascending=False))
2. Date & Time Features
# Fixed "today" anchor so recency features are reproducible across runs.
reference_date = pd.Timestamp('2024-01-01')
def extract_date_features(df: pd.DataFrame, date_col: str, ref_date=None) -> pd.DataFrame:
    """Extract rich numeric features from a datetime column.

    Parameters
    ----------
    df : DataFrame containing *date_col* as a datetime64 column.
    date_col : column name; also used as the prefix for every feature.
    ref_date : optional anchor Timestamp; when provided, a
        ``{date_col}_days_ago`` recency feature is added.

    Returns
    -------
    DataFrame of features, index-aligned with *df*.
    """
    dt = df[date_col]
    prefix = date_col
    features = pd.DataFrame({
        f'{prefix}_year': dt.dt.year,
        f'{prefix}_month': dt.dt.month,
        f'{prefix}_dayofweek': dt.dt.dayofweek,  # 0=Mon, 6=Sun
        f'{prefix}_is_weekend': dt.dt.dayofweek.isin([5, 6]).astype(int),
        f'{prefix}_quarter': dt.dt.quarter,
        f'{prefix}_is_month_start': dt.dt.is_month_start.astype(int),
        # Cyclical encoding for month (avoids December→January discontinuity)
        f'{prefix}_month_sin': np.sin(2 * np.pi * dt.dt.month / 12),
        f'{prefix}_month_cos': np.cos(2 * np.pi * dt.dt.month / 12),
    })
    # Explicit None check (was a bare truthiness test): the intent is
    # "was a reference date supplied", not "is it a truthy value".
    if ref_date is not None:
        features[f'{prefix}_days_ago'] = (ref_date - dt).dt.days
    return features
signup_features = extract_date_features(df, 'signup_date', reference_date)
print('Generated date features:')
print(signup_features.columns.tolist())
print(signup_features.head(3))
# Recency, Frequency, Monetary (RFM) — classic feature engineering pattern.
df['days_since_signup'] = (reference_date - df['signup_date']).dt.days
df['days_since_login'] = (reference_date - df['last_login']).dt.days
# The +1 denominators guard against division by zero for inactive users.
df['avg_spend_per_order'] = (df['total_spent'] / (df['n_purchases'] + 1)).round(2)
df['tickets_per_session'] = (df['support_tickets'] / (df['session_count'] + 1)).round(4)
# Composite score: log-damped session volume times session length,
# discounted by how long ago the user last logged in.
df['engagement_score'] = (
    np.log1p(df['session_count']) * df['avg_session_min'] / (df['days_since_login'] + 1)
).round(4)
print('RFM-style features:')
print(df[['days_since_signup', 'days_since_login', 'avg_spend_per_order',
          'tickets_per_session', 'engagement_score']].describe().round(2))
3. Scaling — Which Scaler for Which Situation
# Compare how each scaler reshapes the same 200-row sample of numeric features.
num_cols = ['total_spent', 'session_count', 'avg_session_min', 'support_tickets']
sample = df[num_cols].head(200)
scalers = {
    'Original': sample,
    'StandardScaler': pd.DataFrame(StandardScaler().fit_transform(sample), columns=num_cols),
    'MinMaxScaler': pd.DataFrame(MinMaxScaler().fit_transform(sample), columns=num_cols),
    'RobustScaler': pd.DataFrame(RobustScaler().fit_transform(sample), columns=num_cols),
}
# Grid of histograms: one row per scaler, one column per feature.
fig, axes = plt.subplots(len(scalers), len(num_cols), figsize=(16, 12))
for i, (scaler_name, data) in enumerate(scalers.items()):
    for j, col in enumerate(num_cols):
        axes[i, j].hist(data[col], bins=20, edgecolor='black', alpha=0.7)
        if j == 0:
            # Label each row once, on the leftmost subplot.
            axes[i, j].set_ylabel(scaler_name, fontsize=10, fontweight='bold')
        if i == 0:
            # Title each column once, on the top subplot.
            axes[i, j].set_title(col, fontsize=10)
plt.suptitle('Scaler Comparison', fontsize=14)
plt.tight_layout()
plt.show()
# Quick decision guide, printed line by line.
for guidance_line in (
    'When to use which scaler:',
    ' StandardScaler: normally distributed features, linear models, SVM',
    ' MinMaxScaler: bounded output [0,1] needed, neural networks',
    ' RobustScaler: outliers present (uses IQR, not std)',
):
    print(guidance_line)
4. Polynomial & Interaction Features
# Polynomial features: capture non-linear relationships.
# degree=2 on two columns yields x, y, x^2, x*y, y^2 (bias column dropped).
base_features = df[['session_count', 'total_spent']].head(200)
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly_features = pd.DataFrame(
    poly.fit_transform(base_features),
    columns=poly.get_feature_names_out(['session_count', 'total_spent'])
)
print('Polynomial (degree=2) features:')
print(poly_features.columns.tolist())
print(poly_features.head(3).round(2))
# Feature importance: compare raw vs. engineered features.
# (Dropped a redundant re-import of RandomForestClassifier — it is already
# imported at the top of the file.)
# Base features only
base_cols = ['n_purchases', 'session_count', 'avg_session_min', 'support_tickets', 'total_spent', 'plan_encoded']
# Engineered features added
eng_cols = base_cols + ['days_since_login', 'avg_spend_per_order', 'tickets_per_session', 'engagement_score']
target = 'churned'
# 80/20 positional holdout; was hard-coded to 1600, which only worked for
# n=2000. Deriving it keeps the same split here and generalizes.
split_idx = int(len(df) * 0.8)
results = {}
for name, features in [('Base features', base_cols), ('+ Engineered features', eng_cols)]:
    X_tr = df[features].iloc[:split_idx]
    X_te = df[features].iloc[split_idx:]
    y_tr = df[target].iloc[:split_idx]
    y_te = df[target].iloc[split_idx:]
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_tr, y_tr)
    acc = accuracy_score(y_te, rf.predict(X_te))
    results[name] = acc
    print(f'{name}: {acc:.3f} accuracy')
# Relative lift of engineered over base, in percent.
improvement = (results['+ Engineered features'] - results['Base features']) / results['Base features'] * 100
print(f'\nAccuracy improvement from feature engineering: {improvement:+.1f}%')
Feature Engineering Cheat Sheet
Data Type     → Feature Ideas
────────────────────────────────────────────────────
Datetime      → year, month, weekday, hour, is_weekend,
                cyclical sin/cos, days since reference
Counts        → log1p(x), sqrt(x), x_per_session
Monetary      → log1p(spend), spend_per_order, quantile_bin
Categorical   → OHE (low card), Ordinal (ordered), Target (high card)
Text          → TF-IDF, char count, word count, sentiment
IDs           → Count encode (frequency of that ID in data)
Relationships → ratio features, interaction products
Exercises
1. Add a `log1p` transformation to `total_spent` and `n_purchases`. Does it improve model accuracy?
2. Implement a `FrequencyEncoder` that replaces each category with its frequency in the training set.
3. Create a `days_active` feature: days between `signup_date` and `last_login`.
4. Use `SelectKBest(f_classif, k=10)` to automatically select the 10 most informative features.
5. Plot feature importances from the RandomForest for both base and engineered feature sets.