Data Science Interview Prep — Part 2: Q16 to Q30
Continuation of 00_INTERVIEW_PREP.ipynb. Covers SQL, Deep Learning, System Design, and Practical questions.
import re
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import sklearn
# Fix the global RNG seed so every cell below is reproducible run-to-run.
np.random.seed(42)
# Record library versions up front; exact outputs below can depend on them.
print('numpy:', np.__version__)
print('pandas:', pd.__version__)
print('sklearn:', sklearn.__version__)
Q16: SQL β JOIN types with a concrete exampleΒΆ
| Join | What it returns |
|---|---|
| INNER | Only matching rows from both tables |
| LEFT | All left rows + matching right (NULL if no match) |
| RIGHT | All right rows + matching left (NULL if no match) |
| FULL OUTER | All rows from both; NULL where no match |
Mnemonic: LEFT JOIN = "keep everything on the left, bring what you can from the right."
# Demonstrate the four JOIN flavours on a toy customers/orders schema.
# SQLite (pre-3.39) has no native RIGHT or FULL OUTER JOIN, so both are
# emulated: RIGHT by swapping table roles, FULL OUTER by UNION-ing two
# LEFT JOINs (UNION also deduplicates the overlap).
conn = sqlite3.connect(':memory:')
cursor = conn.cursor()
schema_sql = """
CREATE TABLE customers (id INTEGER, name TEXT);
INSERT INTO customers VALUES (1,'Alice'),(2,'Bob'),(3,'Carol'),(4,'Dave'),(5,'Eve');
CREATE TABLE orders (order_id INTEGER, customer_id INTEGER, amount REAL);
INSERT INTO orders VALUES (101,1,50),(102,2,30),(103,2,80),(104,3,20),(105,3,60),(106,99,10);
"""
cursor.executescript(schema_sql)
joins = {
    'INNER': "SELECT c.name, o.order_id FROM customers c INNER JOIN orders o ON c.id=o.customer_id",
    'LEFT': "SELECT c.name, o.order_id FROM customers c LEFT JOIN orders o ON c.id=o.customer_id",
    'RIGHT (simulated)': "SELECT c.name, o.order_id FROM orders o LEFT JOIN customers c ON c.id=o.customer_id",
    'FULL OUTER (simulated)': """
SELECT c.name, o.order_id FROM customers c LEFT JOIN orders o ON c.id=o.customer_id
UNION
SELECT c.name, o.order_id FROM orders o LEFT JOIN customers c ON c.id=o.customer_id""",
}
for label, query in joins.items():
    fetched = cursor.execute(query).fetchall()
    print(f"{label}: {len(fetched)} rows")
    frame = pd.DataFrame(fetched, columns=['name', 'order_id'])
    print(frame.to_string(index=False), '\n')
conn.close()
Q17: SQL β Find duplicate rows and delete keeping most recentΒΆ
Three approaches to find duplicates:
- GROUP BY ... HAVING COUNT(*) > 1 — simplest; shows which values are duplicated
- ROW_NUMBER() OVER (PARTITION BY ...) — assigns a rank within each group; keep rank = 1
- Self-join on the duplicate key columns — flexible for complex keys
Delete strategy: keep the row with the highest id (most recent insert).
# Three ways to surface duplicate (email, event) rows, then a window-function
# DELETE that keeps only the most recent row (highest id) per group.
con = sqlite3.connect(':memory:')
cur = con.cursor()
cur.executescript("""
CREATE TABLE events (id INTEGER PRIMARY KEY, email TEXT, event TEXT, ts TEXT);
INSERT INTO events VALUES
(1,'a@x.com','click','2024-01-01'),(2,'b@x.com','view','2024-01-01'),
(3,'a@x.com','click','2024-01-02'),(4,'c@x.com','click','2024-01-01'),
(5,'a@x.com','click','2024-01-03'),(6,'b@x.com','view','2024-01-02');
""")

def _show(title, query):
    # Helper: print a section header followed by the query result.
    print(title)
    print(pd.read_sql(query, con))

rn_sql = """WITH rn AS (SELECT *, ROW_NUMBER() OVER (PARTITION BY email,event ORDER BY id DESC) AS rn FROM events)
SELECT * FROM rn WHERE rn>1"""
sj = "SELECT a.* FROM events a JOIN events b ON a.email=b.email AND a.event=b.event AND a.id<b.id"

_show('=== GROUP BY HAVING ===', "SELECT email,event,COUNT(*) AS cnt FROM events GROUP BY email,event HAVING cnt>1")
_show('\n=== ROW_NUMBER CTE ===', rn_sql)
_show('\n=== SELF JOIN ===', sj)
# Delete duplicates: rank rows newest-first inside each (email, event) group
# and drop everything ranked below 1, i.e. keep the highest id.
cur.execute("""DELETE FROM events WHERE id IN (
SELECT id FROM (SELECT id, ROW_NUMBER() OVER (PARTITION BY email,event ORDER BY id DESC) rn FROM events) WHERE rn>1)""")
con.commit()
_show('\n=== After delete (deduped) ===', 'SELECT * FROM events ORDER BY id')
con.close()
Q18: SQL β Top 3 customers by revenue per monthΒΆ
Pattern: CTE β aggregate revenue β RANK() OVER (PARTITION BY month ORDER BY revenue DESC) β filter rank <= 3.
WITH monthly AS (
SELECT strftime('%Y-%m', order_date) AS month,
customer_id,
SUM(amount) AS revenue
FROM orders GROUP BY 1, 2
),
ranked AS (
SELECT *, RANK() OVER (PARTITION BY month ORDER BY revenue DESC) AS rnk
FROM monthly
)
SELECT * FROM ranked WHERE rnk <= 3;
# Top-3 customers by monthly revenue: aggregate in a CTE, RANK() within each
# month, filter to rnk <= 3. RANK() leaves gaps on ties (1, 1, 3).
con = sqlite3.connect(':memory:')
np.random.seed(0)  # deterministic synthetic orders
dates = pd.date_range('2024-01-01', periods=60, freq='3D').strftime('%Y-%m-%d')
df = pd.DataFrame({
    'order_date': np.random.choice(dates, 120),
    'customer_id': np.random.randint(1, 8, 120),
    'amount': np.random.uniform(10, 500, 120).round(2),
})
df.to_sql('orders', con, index=False)
top3_sql = """
WITH monthly AS (
SELECT strftime('%Y-%m', order_date) AS month, customer_id, SUM(amount) AS revenue
FROM orders GROUP BY 1, 2
),
ranked AS (
SELECT *, RANK() OVER (PARTITION BY month ORDER BY revenue DESC) AS rnk FROM monthly
)
SELECT month, customer_id, ROUND(revenue,2) AS revenue, rnk
FROM ranked WHERE rnk <= 3 ORDER BY month, rnk
"""
result = pd.read_sql(top3_sql, con)
print(result.to_string(index=False))
con.close()
Q19: How does backpropagation work? Implement manually.ΒΆ
Chain rule in one line: dL/dW1 = dL/dA2 · dA2/dZ2 · dZ2/dA1 · dA1/dZ1 · dZ1/dW1
Steps:
Forward pass β compute activations layer by layer
Compute loss (MSE)
Backward pass β propagate gradient from output to input using chain rule
Update weights with
W -= lr * grad
Gradient check: numerical gradient = (L(W+ε) − L(W−ε)) / (2ε) should match the analytical gradient.
def sig(x):
    """Elementwise logistic sigmoid. (def instead of lambda, per PEP 8 E731.)"""
    return 1 / (1 + np.exp(-x))

def dsig(x):
    """Derivative of the sigmoid: sig(x) * (1 - sig(x))."""
    return sig(x) * (1 - sig(x))

# Tiny 2-4-1 network: one training sample, one hidden layer, sigmoid everywhere.
X = np.array([[0.5, -0.3]]); y = np.array([[0.7]])
W1 = np.random.randn(2,4)*0.1; b1 = np.zeros((1,4))
W2 = np.random.randn(4,1)*0.1; b2 = np.zeros((1,1))

def forward(X, W1, b1, W2, b2):
    """Forward pass; returns (z1, a1, z2, a2) so backprop can reuse intermediates."""
    z1 = X @ W1 + b1
    a1 = sig(z1)
    z2 = a1 @ W2 + b2
    a2 = sig(z2)
    return z1, a1, z2, a2

def mse(pred, true):
    """Mean squared error loss."""
    return np.mean((pred - true) ** 2)
# Backward pass by hand: apply the chain rule layer by layer, then verify the
# analytical gradient of W2 against a central-difference estimate.
z1, a1, z2, a2 = forward(X, W1, b1, W2, b2)
loss = mse(a2, y)

# Output layer: dL/da2 for MSE, then back through the output sigmoid.
dL_da2 = 2 * (a2 - y) / y.size
dL_dz2 = dL_da2 * dsig(z2)
dL_dW2 = a1.T @ dL_dz2
# Hidden layer: push the gradient back through W2, then the hidden sigmoid.
dL_da1 = dL_dz2 @ W2.T
dL_dz1 = dL_da1 * dsig(z1)
dL_dW1 = X.T @ dL_dz1

# Central-difference gradient check on every entry of W2 (perturb, restore).
eps = 1e-5
num_grad_W2 = np.zeros_like(W2)
for r in range(W2.shape[0]):
    for c in range(W2.shape[1]):
        W2[r, c] += eps
        loss_plus = mse(forward(X, W1, b1, W2, b2)[3], y)
        W2[r, c] -= 2 * eps
        loss_minus = mse(forward(X, W1, b1, W2, b2)[3], y)
        W2[r, c] += eps  # restore the original weight
        num_grad_W2[r, c] = (loss_plus - loss_minus) / (2 * eps)

max_diff = np.max(np.abs(num_grad_W2 - dL_dW2))
print(f'Loss: {loss:.6f}')
print(f'Max |numerical - analytical| grad (W2): {max_diff:.2e} -- pass: {max_diff < 1e-5}')
Q20: Explain the attention mechanism.ΒΆ
RNN problem: the encoder compresses the entire input into one fixed-size vector — early tokens get "forgotten."
Attention solution: at each decoding step, look at ALL encoder states, compute a weighted sum.
Scaled dot-product attention: $\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^\top}{\sqrt{d_k}}\right) V$
Q (Query): what Iβm looking for
K (Key): what each token advertises
V (Value): what each token actually contains
Dividing by √d_k prevents the softmax from saturating in high dimensions
Self-attention: Q, K, V all derived from the same sequence β tokens attend to each other.
def softmax(x, axis=-1):
    """Numerically stable softmax along `axis` (subtracts the running max)."""
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def attention(Q, K, V):
    """Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Generalized to batched inputs: the original used `K.T`, which reverses
    ALL axes and therefore breaks for (..., seq, d_k) tensors. Swapping only
    the last two axes keeps 2-D behavior identical while supporting batches.

    Returns (output, weights); each weights row sums to 1.
    """
    d_k = Q.shape[-1]
    scores = Q @ np.swapaxes(K, -1, -2) / np.sqrt(d_k)  # (..., seq_q, seq_k)
    weights = softmax(scores)
    return weights @ V, weights
# Demo: random Q/K/V for a 4-token sequence, then visualize the attention map.
np.random.seed(7)
seq_len, d_k = 4, 8
Q = np.random.randn(seq_len, d_k)
K = np.random.randn(seq_len, d_k)
V = np.random.randn(seq_len, d_k)
out, weights = attention(Q, K, V)
print('Output shape:', out.shape)
print('Attention weights (rows=query tokens, cols=key tokens):')
print(np.round(weights, 3))

fig, ax = plt.subplots(figsize=(5, 4))
heatmap = ax.imshow(weights, cmap='Blues', vmin=0, vmax=1)
ax.set_xlabel('Key token')
ax.set_ylabel('Query token')
ax.set_title('Scaled Dot-Product Attention Weights')
plt.colorbar(heatmap, ax=ax)
plt.tight_layout()
plt.show()
Q21: Transfer learning β why does it work?ΒΆ
Hierarchy of learned features (CNNs):
Layer 1β2: edges, corners, color blobs (universal β same in any image domain)
Layer 3β4: textures, patterns (mostly universal)
Layer 5+: object parts, class-specific (task-specific)
Two strategies:
Strategy |
Frozen layers |
Trainable params |
When to use |
|---|---|---|---|
Feature extraction |
All conv layers |
Classifier head only (~512) |
Small dataset, similar domain |
Fine-tuning |
None (or early conv) |
All (~11M for ResNet18) |
Larger dataset, or different domain |
Why it works: SGD needs fewer examples when starting from a good initialization rather than random weights.
# Compare trainable-parameter counts for the two transfer-learning strategies.
# Falls back to hard-coded ResNet18 numbers when torchvision is unavailable.
try:
    import torchvision.models as models
    import torch.nn as nn

    model = models.resnet18(weights=None)
    # Strategy 1 -- feature extraction: freeze the backbone, swap in a new head.
    for p in model.parameters():
        p.requires_grad = False
    model.fc = nn.Linear(512, 10)
    fe_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    # Strategy 2 -- fine-tuning: every weight becomes trainable again.
    for p in model.parameters():
        p.requires_grad = True
    ft_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'Feature extraction: {fe_params:,} trainable params')
    print(f'Fine-tuning: {ft_params:,} trainable params')
except ImportError:
    # Simulate ResNet18 param counts
    resnet18_total = 11_181_642
    head_only = 512 * 10 + 10  # fc layer for 10 classes
    print(f'Feature extraction: {head_only:,} trainable params')
    print(f'Fine-tuning: {resnet18_total:,} trainable params')
    print('(torchvision not installed β showing simulated counts)')
Q22: Tokenization β word vs subword vs characterΒΆ
Method |
Vocab size |
Sequence length |
OOV handling |
|---|---|---|---|
Word |
50Kβ1M |
Short |
Fails on new words |
Character |
~256 |
Very long |
No OOV β every char known |
BPE / subword |
32Kβ50K |
Medium |
Rare words split into known pieces |
BPE (Byte-Pair Encoding): start with characters, repeatedly merge the most frequent adjacent pair.
"unhappiness" β ["un", "##happi", "##ness"] (WordPiece, used in BERT)
GPT-2/3/4 uses BPE with ~50K tokens. Allows handling any new word by splitting into subword units.
# Build the initial BPE vocabulary: each word becomes a space-separated
# character sequence terminated by the </w> end-of-word marker.
from collections import Counter

corpus = ['low lower lowest', 'newer newest new', 'low low lower']
vocab = Counter(
    ' '.join(word) + ' </w>'
    for sent in corpus
    for word in sent.split()
)
def get_pairs(vocab):
    """Count the frequency of every adjacent symbol pair across the vocabulary."""
    pairs = Counter()
    for word, freq in vocab.items():
        symbols = word.split()
        for left, right in zip(symbols, symbols[1:]):
            pairs[(left, right)] += freq
    return pairs
def merge(vocab, pair):
    """Apply one BPE merge step: fuse `pair` into a single symbol everywhere.

    Bug fix: the original used plain str.replace, which can merge across
    symbol boundaries -- e.g. pair ('a', 'a') on the word 'a ab' matched
    'a' plus the PREFIX of the distinct symbol 'ab', corrupting it to 'aab'.
    The lookaround-anchored regex matches whole symbols only (bounded by
    whitespace or string edges).
    """
    bigram = re.escape(' '.join(pair))
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    merged = ''.join(pair)
    return {pattern.sub(merged, word): freq for word, freq in vocab.items()}
# Run five greedy merge rounds, always fusing the most frequent pair, then
# contrast tokenization granularities for 'unhappiness'.
print('Initial vocab:', sorted(vocab.keys()))
for step in range(1, 6):
    best = get_pairs(vocab).most_common(1)[0][0]
    vocab = merge(vocab, best)
    print(f'Merge {step}: {best} β {"".join(best)}')
print('\nFinal vocab:', sorted(vocab.keys()))

word = 'unhappiness'
print('\nWord: 5 tokens (1 per word in short sentence)')
print(f'Char: {len(word)} tokens (1 per character)')
print('BPE: ~8 tokens (un + happi + ness + ...)')
Q23: System design β recommendation system for 100M usersΒΆ
Two-stage architecture (used by YouTube, Pinterest, TikTok):
Stage 1 β Candidate Generation:
2-tower model: user embedding + item embedding, dot product similarity
ANN index (FAISS / ScaNN): retrieve top-100 from millions in <10ms
Stage 2 β Ranking:
Full feature model (cross-features, context, user history)
Ranks 100 candidates with richer signals
Stage 3 β Re-ranking:
Diversity: avoid showing 10 identical items
Freshness: boost new content
Business rules: sponsored slots, content policies
# Architecture diagram for the two-stage (plus re-rank) recommender.
# NOTE(fix): the original cell's box-drawing characters were mojibake
# (UTF-8 decoded under the wrong codepage); restored to proper Unicode.
diagram = """
            User Request
                 │
                 ▼
      ┌─────────────────────┐
      │   Candidate Gen     │  2-tower model + ANN index
      │  (millions → 100)   │  <10ms
      └─────────┬───────────┘
                │ 100 candidates
                ▼
      ┌─────────────────────┐
      │   Ranking Model     │  full features, CTR/CVR prediction
      │ (100 → ranked 20)   │  <30ms
      └─────────┬───────────┘
                │ scored list
                ▼
      ┌─────────────────────┐
      │     Re-ranking      │  diversity, freshness, business rules
      │  (20 → serve 10)    │  <5ms
      └─────────┬───────────┘
                │
                ▼
           Served to User
"""
print(diagram)
class MockRecommender:
    """Toy three-stage recommender: retrieve -> rank -> re-rank."""

    def __init__(self, n_items=1_000_000):
        self.n_items = n_items
        # Random 32-d item embeddings stand in for a trained 2-tower model.
        self.item_embeddings = np.random.randn(n_items, 32)

    def candidate_gen(self, user_vec, k=100):
        """Stage 1: dot-product retrieval of the top-k items.

        Only the first 10K embeddings are scored, purely to keep the demo fast.
        """
        scores = self.item_embeddings[:10000] @ user_vec
        return np.argsort(scores)[-k:][::-1]

    def rank(self, candidates):
        """Stage 2: keep the 20 best candidates (random scores as a placeholder ranker)."""
        scores = np.random.rand(len(candidates))
        return candidates[np.argsort(scores)[::-1]][:20]

    def rerank(self, ranked, n=10):
        """Stage 3: truncate to n. (A real system would inject diversity,
        freshness, and business rules here; this mock just slices.)"""
        return ranked[:n]

    def recommend(self, user_vec, n=10):
        """Full pipeline: candidates -> ranked shortlist -> final n items."""
        return self.rerank(self.rank(self.candidate_gen(user_vec)), n)
# Build a recommender over the default 1M-item catalog and serve one user.
rec = MockRecommender()
# Random user embedding stands in for the output of a trained user tower.
user = np.random.randn(32)
recs = rec.recommend(user)
print(f'Top-10 recommendations: {recs}')
Q24: System design β real-time fraud detectionΒΆ
Key features:
Velocity: # transactions in last 1h / 24h
Amount z-score: how unusual is this amount vs userβs history?
Location anomaly: distance from usual location
Device fingerprint: new device not seen before?
Model architecture:
Rule engine: hard blocks (velocity > 20/hr, amount > $5K first txn)
GBM model: probabilistic score (must respond in <50ms)
Latency budget: feature lookup 5ms + model inference 10ms + rule check 2ms + overhead 33ms = 50ms total
import time
class FraudScorer:
    """Rule-based real-time fraud scorer combining velocity, amount-anomaly,
    and new-device signals. Per-user history is kept in memory."""

    def __init__(self):
        # user_id -> list of (ts, amount) for every previously scored txn.
        # NOTE: grows without bound; production would use a TTL'd feature store.
        self.user_history = {}

    def _velocity(self, uid, ts):
        """Count of PRIOR transactions in the hour before `ts`.

        Bug fix: the original check `ts - t < 3600` also counted transactions
        whose timestamps lie in the FUTURE of `ts` (negative deltas). Events
        can arrive out of order (the demo generates random timestamps), so the
        window is now bounded on both sides.
        """
        history = self.user_history.get(uid, [])
        return sum(1 for t, _ in history if 0 <= ts - t < 3600)

    def _amount_zscore(self, uid, amount):
        """Z-score of `amount` against the user's history (0.0 with <3 samples)."""
        amounts = [a for _, a in self.user_history.get(uid, [])]
        if len(amounts) < 3:
            return 0.0
        # 1e-9 guards against zero std when all past amounts are identical.
        return (amount - np.mean(amounts)) / (np.std(amounts) + 1e-9)

    def score(self, uid, amount, ts, new_device=False):
        """Score one transaction; returns (is_fraud, reasons) and records it."""
        vel = self._velocity(uid, ts)
        zscore = self._amount_zscore(uid, amount)
        reasons = []
        if vel > 5: reasons.append(f'HIGH_VELOCITY({vel}/hr)')
        if zscore > 3: reasons.append(f'AMOUNT_ANOMALY(z={zscore:.1f})')
        if new_device and amount > 200: reasons.append('NEW_DEVICE_HIGH_AMOUNT')
        # Record AFTER scoring so the current txn never counts toward itself.
        self.user_history.setdefault(uid, []).append((ts, amount))
        is_fraud = len(reasons) > 0
        return is_fraud, reasons
# Stream 1000 synthetic transactions through the scorer and report the flags.
scorer = FraudScorer()
np.random.seed(1)
ts0 = int(time.time())
fraud_events = []
for _ in range(1000):
    uid = np.random.randint(1, 50)
    amount = np.random.lognormal(4, 1)  # typical ~$55, long tail
    ts = ts0 + np.random.randint(0, 7200)
    new_dev = np.random.rand() < 0.05
    flagged, reasons = scorer.score(uid, round(amount, 2), ts, new_dev)
    if flagged:
        fraud_events.append({'uid': uid, 'amount': round(amount, 2), 'reasons': ', '.join(reasons)})
print(f'Processed 1000 transactions. Flagged {len(fraud_events)} as fraud ({len(fraud_events)/10:.1f}%)')
print(pd.DataFrame(fraud_events[:10]).to_string(index=False))
Q25: ML pipeline that auto-retrains when performance degradesΒΆ
Trigger conditions for retraining:
Data drift: input distribution shifts (KS test p-value < threshold)
Performance drop: AUC on recent labeled data falls below baseline
Scheduled: periodic retraining regardless (e.g., weekly)
Best practice: use shadow mode β new model runs in parallel before promoting.
from sklearn.metrics import roc_auc_score
class ModelHealthMonitor:
    """Decides when a deployed model should be retrained.

    Two independent triggers: input-distribution drift (two-sample KS test)
    and an AUC drop versus the recorded baseline.
    """

    def __init__(self, drift_alpha=0.05, perf_drop_threshold=0.05):
        self.drift_alpha = drift_alpha             # KS-test significance level
        self.perf_threshold = perf_drop_threshold  # maximum tolerated AUC drop

    def check_data_drift(self, reference, current):
        """Return True when `current` no longer looks drawn from `reference`."""
        ks_stat, p_value = stats.ks_2samp(reference, current)
        drift_detected = p_value < self.drift_alpha
        print(f' KS stat={ks_stat:.3f}, p={p_value:.4f} β drift_detected={drift_detected}')
        return drift_detected

    def check_performance_drop(self, recent_preds, recent_labels, baseline_auc):
        """Return True when recent AUC has fallen more than the threshold."""
        auc_now = roc_auc_score(recent_labels, recent_preds)
        delta = baseline_auc - auc_now
        degraded = delta > self.perf_threshold
        print(f' baseline={baseline_auc:.3f}, current={auc_now:.3f}, drop={delta:.3f} β perf_dropped={degraded}')
        return degraded

    def should_retrain(self, drift, perf_drop):
        """Either trigger alone is sufficient to retrain."""
        retrain = drift or perf_drop
        print(f' should_retrain={retrain}')
        return retrain
# Simulate three monitoring periods: stable, drifted inputs, degraded scores.
# (The 4th tuple element is an illustrative AUC; the check recomputes its own.)
monitor = ModelHealthMonitor()
ref = np.random.normal(0, 1, 1000)
baseline_auc = 0.85
periods = [('T1: Stable', np.random.normal(0,1,500), np.random.rand(200), 0.84),
           ('T2: Drift', np.random.normal(1,1.5,500), np.random.rand(200), 0.81),
           ('T3: Degraded', np.random.normal(2,2,500), np.random.rand(200)*0.3, 0.72)]
labels = (np.random.rand(200) > 0.5).astype(int)
for label, features_now, preds_now, _auc in periods:
    print(f'\n--- {label} ---')
    drift_flag = monitor.check_data_drift(ref, features_now)
    perf_flag = monitor.check_performance_drop(preds_now, labels, baseline_auc)
    monitor.should_retrain(drift_flag, perf_flag)
Q26: How to A/B test a new ML model?ΒΆ
Rollout stages:
Shadow (0%) β Canary (1%) β Ramp (10%) β Ramp (50%) β Full (100%)
Key rules:
Primary metric = business KPI (revenue, engagement), not model accuracy
Guardrails: p99 latency, error rate β auto-rollback if breached
Duration: at least 1β2 weeks to capture weekly seasonality
Sample ratio mismatch (SRM): check that traffic split is as intended
Multiple testing: use Bonferroni / sequential testing to avoid false positives
from scipy.stats import norm

# Power analysis: how long must an A/B test run at each traffic split to
# detect a 2% absolute lift on a 10% baseline conversion rate?
true_effect = 0.02       # minimum detectable absolute lift
baseline_rate = 0.10
daily_traffic = 100_000
power = 0.80
alpha = 0.05
z_alpha = norm.ppf(1 - alpha/2)  # two-sided significance
z_beta = norm.ppf(power)

def _split_stats(split):
    """Sample size/day, days to 80% power, and SE after a 14-day run for one split."""
    n_per_day = daily_traffic * split
    p = baseline_rate
    # Classic two-proportion z-test sample-size approximation.
    n_required = 2 * ((z_alpha + z_beta)**2 * p*(1-p)) / (true_effect**2)
    se_14d = np.sqrt(2 * p*(1-p) / (n_per_day * 14))
    return {'traffic_%': f'{split*100:.0f}%', 'n_per_day': int(n_per_day),
            'days_for_80pct_power': round(n_required / n_per_day, 1),
            'SE_after_14d': round(se_14d, 5)}

traffic_splits = [0.01, 0.05, 0.10, 0.25, 0.50]
results = [_split_stats(s) for s in traffic_splits]
df_ab = pd.DataFrame(results)
print(df_ab.to_string(index=False))
print('\nConclusion: at 1% traffic, need', results[0]['days_for_80pct_power'],
      'days. At 10%:', results[2]['days_for_80pct_power'], 'days.')
Q27: You get 99% accuracy. Should you deploy?ΒΆ
Short answer: probably not without deeper investigation.
With 1% positive class prevalence, a model that always predicts βnegativeβ achieves 99% accuracy but catches zero fraud/disease/defects.
Use the right metric for the task:
Fraud detection β Recall (minimize false negatives)
Spam filter β Precision (minimize false positives)
Medical screening β Sensitivity/Recall
General imbalanced β F1, PR-AUC, ROC-AUC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.dummy import DummyClassifier

# A majority-class predictor reaches 99% accuracy on 1%-prevalence data while
# catching exactly zero positives -- accuracy alone is misleading here.
np.random.seed(0)
n = 10_000
y_true = np.zeros(n, dtype=int)
y_true[:100] = 1  # 1% positive class
np.random.shuffle(y_true)

dummy = DummyClassifier(strategy='most_frequent')
# The "features" are irrelevant to this strategy; labels double as input.
dummy.fit(y_true.reshape(-1,1), y_true)
y_pred = dummy.predict(y_true.reshape(-1,1))
print(f'Accuracy: {accuracy_score(y_true, y_pred)*100:.1f}%')
print(classification_report(y_true, y_pred, target_names=['majority','minority']))

checklist = """
Before deploying "99% accuracy" model:
β‘ Check class balance β is 99% just predicting majority?
β‘ Evaluate on held-out test set (not validation)
β‘ Check precision/recall/F1 on the minority class
β‘ Calibration: are probabilities meaningful?
β‘ Fairness: does it perform equally across demographic groups?
β‘ Business metric: does it improve what actually matters?
β‘ Edge cases: what happens on rare/unusual inputs?
"""
print(checklist)
Q28: Model good offline, poor in production β debug it.ΒΆ
Top 3 causes:
Training-serving skew: preprocessing applied differently at train vs serve time (e.g., scaler fit on wrong data)
Label leakage: feature contains information from the future (e.g., a target-encoded column computed on full dataset)
Distribution shift: data in production has different statistical properties than training data (seasonal, demographic, behavioral changes)
Debug workflow: log raw features at serving time β compare distribution to training data β check feature pipelines for leakage.
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
# Reproduce two classic offline/online gaps on synthetic data:
# (1) leakage via a scaler fit on train+test, (2) distribution shift.
np.random.seed(42)
# Train: centered features; test: mean shifted by +0.3, labels shifted to match.
X_train = np.random.randn(800, 5); y_train = (X_train[:,0] + np.random.randn(800)*0.5 > 0).astype(int)
X_test = np.random.randn(200, 5) + 0.3; y_test = (X_test[:,0] + np.random.randn(200)*0.5 > 0.3).astype(int)
# --- Correct: fit scaler on train only ---
sc_correct = StandardScaler().fit(X_train)
clf_c = LogisticRegression().fit(sc_correct.transform(X_train), y_train)
auc_correct = roc_auc_score(y_test, clf_c.predict_proba(sc_correct.transform(X_test))[:,1])
# --- Leaky: fit scaler on ALL data (train+test) ---
# The scaler has seen test-set statistics, so the offline AUC is optimistic.
X_all = np.vstack([X_train, X_test])
sc_leaky = StandardScaler().fit(X_all)
clf_l = LogisticRegression().fit(sc_leaky.transform(X_train), y_train)
auc_leaky = roc_auc_score(y_test, clf_l.predict_proba(sc_leaky.transform(X_test))[:,1])
# --- Distribution shift: Q1 vs Q4 ---
# A model trained on Q1 data is scored on Q4 data whose features moved by +1.5,
# mimicking seasonal/behavioral drift between training time and production.
X_q1 = np.random.randn(500, 5); y_q1 = (X_q1[:,0] > 0).astype(int)
X_q4 = np.random.randn(200, 5) + 1.5; y_q4 = (X_q4[:,0] + 1.5 > 0).astype(int) # shifted
sc_q1 = StandardScaler().fit(X_q1)
clf_q1 = LogisticRegression().fit(sc_q1.transform(X_q1), y_q1)
auc_shift = roc_auc_score(y_q4, clf_q1.predict_proba(sc_q1.transform(X_q4))[:,1])
print(f'Correct preprocessing: AUC = {auc_correct:.3f}')
print(f'Leaky (scaler on all): AUC = {auc_leaky:.3f} β inflated offline score')
print(f'Distribution shift Q4: AUC = {auc_shift:.3f} β degraded in prod')
print('\nDiagnostic checklist:')
for line in [' [Skew] Was scaler/encoder fit on train ONLY?',
' [Leakage] Does any feature use future or target data?',
' [Shift] Compare feature distributions: train vs prod logs']:
    print(line)
Q29: Tell me about a project youβre proud of.ΒΆ
Use the STAR framework: Situation β Task β Action β Result
Worked Example: Churn Prediction
Situation: Our subscription business was losing ~8% of customers per month. The retention team was calling all at-risk customers manually, which was expensive and untargeted.
Task: Build a churn prediction model to prioritize which customers to contact, with the goal of reducing churn by 15% within 6 months.
Action:
Defined churn label carefully: no login in 30 days AND no subscription renewal (avoided leakage)
Engineered point-in-time features: login frequency trend, support ticket volume, feature adoption scores
Trained GBM model; PR-AUC 0.72 vs. baseline 0.41
Worked with eng to deploy as batch scoring job (weekly); integrated scores into CRM
A/B tested: intervention group (top 20% risk score) vs control β 2-week holdout
Result:
18% reduction in churn in intervention group (exceeded 15% target)
Retention team efficiency improved: same headcount, 3x more targeted outreach
Model monitoring dashboard in place; quarterly retraining cadence
Template prompts:
What was the business problem and why did it matter?
What was technically interesting or hard?
What decision did YOU make (vs. the team)?
How did you measure success?
What would you do differently?
Q30: What metrics do you use to evaluate a forecasting model?ΒΆ
| Metric | Formula | Weakness |
|---|---|---|
| MAE | mean\|y − ŷ\| | Scale-dependent; can't compare across series |
| RMSE | √mean(y − ŷ)² | Penalizes outliers heavily |
| MAPE | mean\|y − ŷ\| / \|y\| | Undefined when y = 0; asymmetric |
| sMAPE | mean 2\|y − ŷ\| / (\|y\| + \|ŷ\|) | Unstable when y and ŷ are both near 0 |
| MASE | MAE / MAE_naive | Scale-free, handles zeros |
| Winkler | Interval score penalizing width + coverage miss | For probabilistic forecasts |
Rule of thumb: MAPE when actuals are always positive and non-zero. MASE otherwise (especially for intermittent demand).
np.random.seed(3)
n = 52 # weeks
t = np.arange(n)
# Ground truth: linear trend + one annual sine cycle + Gaussian noise.
y_true = 100 + 5*t + 20*np.sin(2*np.pi*t/52) + np.random.randn(n)*8
# Forecast: deliberately slightly-off trend and seasonality coefficients.
y_pred = 102 + 4.8*t + 18*np.sin(2*np.pi*t/52) # slightly off model
# Prediction intervals: simple Gaussian band at ±1.96 * sigma.
sigma = 12
lower = y_pred - 1.96*sigma; upper = y_pred + 1.96*sigma
def mae(y, yh):
    """Mean absolute error."""
    return np.mean(np.abs(y - yh))

def rmse(y, yh):
    """Root mean squared error (penalizes large errors more than MAE)."""
    return np.sqrt(np.mean((y - yh) ** 2))

def mape(y, yh):
    """Mean absolute percentage error; undefined when any actual is 0."""
    return np.mean(np.abs(y - yh) / np.abs(y)) * 100

def smape(y, yh):
    """Symmetric MAPE, bounded to [0, 200]."""
    return np.mean(2 * np.abs(y - yh) / (np.abs(y) + np.abs(yh))) * 100

def mase(y, yh):
    """MAE scaled by the one-step naive (previous-value) forecast's MAE."""
    naive_mae = np.mean(np.abs(np.diff(y)))
    return mae(y, yh) / naive_mae

def winkler(y, lo, hi, alpha=0.05):
    """Winkler interval score: width plus a 2/alpha penalty per unit outside [lo, hi]."""
    width = hi - lo
    penalty = (2 / alpha) * np.where(y < lo, lo - y, np.where(y > hi, y - hi, 0))
    return np.mean(width + penalty)
# Score the synthetic forecast with every metric, then plot it with its band.
metrics = {
    'MAE': mae(y_true, y_pred),
    'RMSE': rmse(y_true, y_pred),
    'MAPE (%)': mape(y_true, y_pred),
    'sMAPE (%)': smape(y_true, y_pred),
    'MASE': mase(y_true, y_pred),
    'Winkler (95%)': winkler(y_true, lower, upper),
}
df_m = pd.DataFrame({'Metric': metrics.keys(),
                     'Value': [round(v, 4) for v in metrics.values()]})
print(df_m.to_string(index=False))
print('\nGuide: MAPE fails when actuals β 0; use sMAPE or MASE instead.')

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(t, y_true, label='Actual', color='steelblue')
ax.plot(t, y_pred, label='Forecast', color='tomato', linestyle='--')
ax.fill_between(t, lower, upper, alpha=0.15, color='tomato', label='95% interval')
ax.set_title('Synthetic Forecast vs Actual')
ax.legend()
plt.tight_layout()
plt.show()
Quick Reference CardΒΆ
30-Second Answers
p-value: P(data this extreme | H0 true) β NOT P(H0 is true)
Type I error: False positive (alpha = significance level)
Type II error: False negative (beta = 1 - power)
Bias-variance: Error = BiasΒ² + Variance + Noise
L1 vs L2: L1 = sparse (zeros), L2 = smooth (small)
Overfitting: low train error + high test error β regularize / more data
Random Forest: bootstrap + feature sampling β decorrelated trees β avg
Gradient Boosting: sequential residual fitting β additive model
Precision: TP/(TP+FP) β of flagged positives, how many were right?
Recall: TP/(TP+FN) β of actual positives, how many did we catch?
ROC-AUC: P(positive ranked above negative) β threshold-independent
Attention: softmax(QKα΅/βd)V β weighted sum over value vectors
Transfer learning: freeze early layers (universal), fine-tune late layers
Training-serving skew: model sees different distribution at serve time
Point-in-time feature: feature value as it was at prediction time (no future data)