# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import cProfile
import pstats
from io import StringIO
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
# Set random seed for reproducibility of the random-data demos below
np.random.seed(42)
# NOTE(review): in the original export this literal was split across a
# physical newline by a mojibake-damaged emoji (a SyntaxError); rejoined here.
print("✅ Libraries imported successfully!")
print("\nπ‘ For full profiling functionality, install:")
print(" pip install line-profiler memory-profiler")
CPU Profiling with cProfile: Finding Where Time Disappears
cProfile is Python's built-in deterministic profiler that instruments every function call, measuring both cumulative time (total time including sub-calls) and total time (time in the function itself, excluding sub-calls). The distinction matters: a function with high cumulative but low total time is a dispatcher that delegates to slow subroutines, while high total time indicates the actual hotspot. Profiling before optimizing is essential because developer intuition about bottlenecks is wrong roughly 90% of the time — Amdahl's Law tells us that optimizing a function consuming only 5% of total runtime can yield at most a 5% speedup, regardless of how clever the optimization.
Reading profiler output: sort by cumulative to find the call chain leading to slow code, or by tottime to find the individual functions consuming the most CPU. In ML pipelines, the usual suspects are Python-level loops over data (which should be vectorized), repeated DataFrame operations that trigger copies, and unparallelized cross-validation. For finer granularity, line_profiler profiles individual lines within a function using the @profile decorator.
# Example: Slow data processing function
def slow_data_processing(n=10000):
    """
    Build an (n x 10) table of standard-normal draws and return the per-row
    sums as a list.

    Deliberately unoptimized: one np.random.randn() call per cell and a
    DataFrame.iterrows() pass keep the interpreter busy on every element,
    serving as the "before" case for the profiling demos.
    """
    # Cell-by-cell random draws (same call order as a nested append loop).
    table = [[np.random.randn() for _ in range(10)] for _ in range(n)]
    # DataFrame construction from a list of lists is itself slow.
    frame = pd.DataFrame(table)
    # iterrows() materializes a Series per row -- the classic pandas anti-pattern.
    return [row.sum() for _, row in frame.iterrows()]
# Profile slow_data_processing with cProfile and print the hottest entries
profiler = cProfile.Profile()
profiler.enable()
result = slow_data_processing(n=5000)
profiler.disable()
# Render the stats, sorted by cumulative time, into an in-memory buffer
buffer = StringIO()
stats = pstats.Stats(profiler, stream=buffer)
stats.sort_stats('cumulative')
stats.print_stats(10)  # top 10 entries only
print("Top 10 Time-Consuming Functions:")
print("=" * 80)
print(buffer.getvalue())
Vectorized Optimization: Replacing Loops with NumPy
The single most impactful optimization in Python ML code is replacing row-by-row Python loops with vectorized NumPy or pandas operations. Python loops incur interpreter overhead on every iteration (type checking, reference counting, bytecode dispatch), while NumPy delegates to compiled C/Fortran routines that process entire arrays in a single call. For an array of \(n\) elements, a Python loop runs \(n\) interpreter cycles; a vectorized operation runs one C-level loop, often achieving 10-100x speedups. The fast_data_processing function below demonstrates this: np.random.randn(n, 10) generates all random numbers in a single call, and df.sum(axis=1) computes row sums without touching Python's interpreter loop.
def fast_data_processing(n=10000):
    """
    Vectorized counterpart of slow_data_processing: generate the same
    (n x 10) matrix of standard-normal draws and return the row sums
    as a NumPy array, using bulk operations throughout.
    """
    matrix = np.random.randn(n, 10)       # all n*10 draws in one C call
    frame = pd.DataFrame(matrix)          # direct construction from ndarray
    return frame.sum(axis=1).to_numpy()   # vectorized row sums
# Wall-clock comparison of the loop-based and vectorized implementations
print("Performance Comparison:\n")
t0 = time.time()
result_slow = slow_data_processing(n=5000)
time_slow = time.time() - t0
print(f"Slow version: {time_slow:.3f}s")
t0 = time.time()
result_fast = fast_data_processing(n=5000)
time_fast = time.time() - t0
print(f"Fast version: {time_fast:.3f}s")
speedup = time_slow / time_fast
print(f"\nπ Speedup: {speedup:.1f}x faster!")
# Visualize how both implementations scale with input size
sizes = [1000, 2000, 5000, 10000]
times_slow = []
times_fast = []
for n in sizes:
    # Time the slow version first, then the fast one, for each size.
    for impl, bucket in ((slow_data_processing, times_slow),
                         (fast_data_processing, times_fast)):
        t0 = time.time()
        impl(n=n)
        bucket.append(time.time() - t0)
# Draw both curves on a single axis
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(sizes, times_slow, 'o-', label='Slow (loops)', color='red', linewidth=2, markersize=8)
ax.plot(sizes, times_fast, 's-', label='Fast (vectorized)', color='green', linewidth=2, markersize=8)
ax.set_xlabel('Data Size (n)', fontsize=12)
ax.set_ylabel('Time (seconds)', fontsize=12)
ax.set_title('Performance: Loops vs Vectorization', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"\nAt n=10000: {times_slow[-1]/times_fast[-1]:.1f}x speedup")
Memory Profiling: Tracking Allocation and Leaks
Memory bottlenecks are often invisible until they cause out-of-memory crashes or trigger excessive garbage collection pauses. Python's sys.getsizeof() reports the shallow size of a single object (excluding referenced objects), while tracemalloc from the standard library tracks peak memory allocation across an entire code block. For ML workloads, the key insight is that NumPy arrays store data contiguously in C memory, using 8 bytes per float64 element, while Python lists store pointers to individually heap-allocated float objects, each consuming ~28 bytes plus the 8-byte pointer — roughly 4.5x more memory.
Common memory pitfalls in ML code: creating unnecessary copies with pandas operations (most pandas methods return new DataFrames by default), holding entire datasets in memory when only batches are needed, and accumulating intermediate arrays during feature engineering. Using float32 instead of float64 halves memory with negligible accuracy loss for most ML tasks, and memory_profiler's @profile decorator lets you track line-by-line memory consumption to pinpoint exactly where allocations spike.
import sys
def analyze_memory_usage(n=100000):
    """
    Print a memory-footprint comparison for storing ``n`` integers in a
    Python list, a NumPy array, and a pandas Series.

    Parameters
    ----------
    n : int, optional
        Number of integers stored in each container (default 100000).
        Previously hard-coded inside the function; exposed as a parameter
        so different sizes can be compared without editing the body.
    """
    print("=" * 60)
    print("MEMORY USAGE ANALYSIS")
    print("=" * 60)
    # Python list: sys.getsizeof is shallow -- it measures the pointer array
    # only, not the int objects it references.
    py_list = list(range(n))
    mem_list = sys.getsizeof(py_list)
    # NumPy array: nbytes is the exact size of the contiguous data buffer.
    np_array = np.arange(n)
    mem_array = np_array.nbytes
    # pandas Series: memory_usage() includes the index by default.
    pd_series = pd.Series(range(n))
    mem_series = pd_series.memory_usage()
    print(f"\nStoring {n:,} integers:\n")
    print(f"Python list: {mem_list:,} bytes ({mem_list/1024/1024:.2f} MB)")
    print(f"NumPy array: {mem_array:,} bytes ({mem_array/1024/1024:.2f} MB)")
    print(f"Pandas Series: {mem_series:,} bytes ({mem_series/1024/1024:.2f} MB)")
    print(f"\nπ‘ NumPy is {mem_list/mem_array:.1f}x more memory efficient!")
analyze_memory_usage()
# Memory-inefficient vs efficient code
def memory_inefficient(n=10000):
    """
    Deliberately wasteful pipeline: clones the working array three times,
    then allocates a fresh array for every arithmetic step.

    Returns (randn(n, 100) * 2 + 1) / 3 -- same math as memory_efficient,
    but with maximal intermediate allocation for the tracemalloc demo.
    """
    base = np.random.randn(n, 100)
    # Three redundant full copies, each kept alive until the function returns.
    clone_a = base.copy()
    clone_b = clone_a.copy()
    clone_c = clone_b.copy()
    # Every operator below allocates a brand-new array.
    out = clone_c * 2
    out = out + 1
    out = out / 3
    return out
def memory_efficient(n=10000):
    """
    Compute (randn(n, 100) * 2 + 1) / 3 with a single allocation: every
    arithmetic step mutates the one array in place via ufunc `out=`.
    """
    buf = np.random.randn(n, 100)
    np.multiply(buf, 2, out=buf)  # buf *= 2
    np.add(buf, 1, out=buf)       # buf += 1
    np.divide(buf, 3, out=buf)    # buf /= 3
    return buf
# Compare peak memory of the two implementations with tracemalloc
import tracemalloc

def _peak_megabytes(fn):
    """Run fn(n=10000) under tracemalloc; return (result, peak MB)."""
    tracemalloc.start()
    out = fn(n=10000)
    _, peak_bytes = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    return out, peak_bytes / 1024 / 1024

result1, mem_inefficient = _peak_megabytes(memory_inefficient)
result2, mem_efficient = _peak_megabytes(memory_efficient)
print("Memory Usage Comparison:\n")
print(f"Inefficient: {mem_inefficient:.2f} MB")
print(f"Efficient: {mem_efficient:.2f} MB")
print(f"\nπ Memory reduction: {100*(1 - mem_efficient/mem_inefficient):.1f}%")
Identifying Bottlenecks in ML Pipelines
ML pipelines have a characteristic performance profile: data loading and preprocessing often dominate wall-clock time (60-80%), yet practitioners focus optimization efforts on model training. A systematic timing breakdown — measuring each stage independently — reveals the true bottleneck. The pattern below uses time.time() checkpoints, but production pipelines benefit from structured instrumentation with tools like MLflow's autologging or custom decorators that record stage-level metrics to a monitoring dashboard.
Key optimization targets: preprocessing bottlenecks typically stem from row-wise Python loops (replace with StandardScaler.fit_transform or vectorized pandas operations), training bottlenecks respond to parallelization (n_jobs=-1 in scikit-learn distributes tree fitting across CPU cores), and evaluation overhead drops with parallel cross-validation. The optimized pipeline below demonstrates all three, often achieving 3-10x total speedup with minimal code changes.
# ML pipeline with potential bottlenecks
def ml_pipeline_slow(n_samples=10000):
    """
    End-to-end classification pipeline with a deliberately slow, row-by-row
    preprocessing step and no parallelism.

    Returns a dict mapping stage name ('data_generation', 'preprocessing',
    'training', 'evaluation') to elapsed seconds.
    """
    timings = {}

    # Stage 1: synthetic dataset
    t0 = time.time()
    X, y = make_classification(
        n_samples=n_samples, n_features=50, n_informative=30, random_state=42
    )
    timings['data_generation'] = time.time() - t0

    # Stage 2: per-row standardization in a Python loop (the bottleneck)
    t0 = time.time()
    normalized_rows = [(row - row.mean()) / row.std() for row in X]
    X_processed = np.array(normalized_rows)
    timings['preprocessing'] = time.time() - t0

    # Stage 3: single-threaded forest training
    t0 = time.time()
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_processed, y)
    timings['training'] = time.time() - t0

    # Stage 4: sequential 5-fold cross-validation
    t0 = time.time()
    scores = cross_val_score(model, X_processed, y, cv=5)
    timings['evaluation'] = time.time() - t0

    return timings
# Execute the slow pipeline and report the per-stage time breakdown
times_slow = ml_pipeline_slow(n_samples=5000)
print("Pipeline Timing Breakdown:\n")
total = sum(times_slow.values())
for step, duration in times_slow.items():
    print(f"{step:20s}: {duration:.3f}s ({100 * duration / total:.1f}%)")
print(f"\nTotal: {total:.3f}s")
# Optimized pipeline
from sklearn.preprocessing import StandardScaler

def ml_pipeline_fast(n_samples=10000):
    """
    Optimized counterpart of ml_pipeline_slow: vectorized scaling plus
    parallel training and cross-validation.

    Returns a dict mapping the same stage names to elapsed seconds.
    """
    timings = {}

    # Stage 1: synthetic dataset (identical to the slow pipeline)
    t0 = time.time()
    X, y = make_classification(
        n_samples=n_samples, n_features=50, n_informative=30, random_state=42
    )
    timings['data_generation'] = time.time() - t0

    # Stage 2: whole-matrix standardization in compiled code
    t0 = time.time()
    X_processed = StandardScaler().fit_transform(X)
    timings['preprocessing'] = time.time() - t0

    # Stage 3: n_jobs=-1 fans tree fitting out across every CPU core
    t0 = time.time()
    model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
    model.fit(X_processed, y)
    timings['training'] = time.time() - t0

    # Stage 4: the 5 CV folds also run in parallel
    t0 = time.time()
    scores = cross_val_score(model, X_processed, y, cv=5, n_jobs=-1)
    timings['evaluation'] = time.time() - t0

    return timings
# Execute the optimized pipeline and compare against the slow run above
times_fast = ml_pipeline_fast(n_samples=5000)
print("Optimized Pipeline Timing:\n")
total_fast = sum(times_fast.values())
for step, duration in times_fast.items():
    share = 100 * duration / total_fast
    print(f"{step:20s}: {duration:.3f}s ({share:.1f}%)")
print(f"\nTotal: {total_fast:.3f}s")
speedup = sum(times_slow.values()) / sum(times_fast.values())
print(f"\nπ Overall speedup: {speedup:.1f}x")
# Side-by-side horizontal bar charts of per-stage timings
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
steps = list(times_slow.keys())
durations_slow = list(times_slow.values())
durations_fast = list(times_fast.values())
# Same styling for both panels; only the data, colour, and title differ.
for ax, durations, colour, label in (
    (axes[0], durations_slow, 'red', 'Slow Pipeline'),
    (axes[1], durations_fast, 'green', 'Optimized Pipeline'),
):
    ax.barh(steps, durations, color=colour, alpha=0.7, edgecolor='black')
    ax.set_xlabel('Time (seconds)', fontsize=11)
    ax.set_title(f'{label}\nTotal: {sum(durations):.2f}s',
                 fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()
print("\nπ― Key optimizations:")
print(" β’ Vectorized preprocessing (StandardScaler)")
print(" β’ Parallel training (n_jobs=-1)")
print(" β’ Parallel cross-validation")
Common Optimization Techniques for ML Code
The following three techniques — vectorization, caching, and batch processing — form the optimization trifecta for Python ML code. They address different bottleneck types: vectorization eliminates interpreter overhead for numerical computation, caching eliminates redundant recomputation of expensive results, and batch processing amortizes per-call overhead (model initialization, memory allocation) across many samples. Applied together, they routinely transform minutes-long pipelines into seconds-long ones. The key principle is to measure first, optimize second: always profile to confirm where time is actually spent before applying any technique.
Vectorization: Broadcasting Over Loops
Vectorized distance computation replaces \(O(n \cdot m)\) Python-level iterations with a single matrix operation. The Euclidean distance between points \(\mathbf{x}\) and \(\mathbf{y}\) expands as \(\|\mathbf{x} - \mathbf{y}\|^2 = \|\mathbf{x}\|^2 + \|\mathbf{y}\|^2 - 2\mathbf{x}^T\mathbf{y}\), which maps directly to NumPy's np.dot(X, Y.T) plus precomputed norms. This algebraic trick avoids explicitly forming the \((n \times m \times d)\) difference tensor, reducing memory from \(O(nmd)\) to \(O(nm)\). The same broadcasting pattern applies to cosine similarity, Mahalanobis distance, and kernel evaluations — any pairwise computation can usually be decomposed into matrix multiplications and element-wise operations.
# Example: Distance calculation
def euclidean_distance_loop(X, Y):
    """Pairwise Euclidean distances via nested Python loops (slow baseline).

    X is (n, d), Y is (m, d); returns an (n, m) array of distances.
    """
    n, m = len(X), len(Y)
    flat = []
    for left in X:
        for right in Y:
            delta = left - right
            flat.append(np.sqrt(np.sum(delta ** 2)))
    # Row-major fill order means a simple reshape recovers the (n, m) grid.
    return np.array(flat).reshape(n, m)
def euclidean_distance_vectorized(X, Y):
    """
    Pairwise Euclidean distances via the algebraic expansion
    ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y, computed with broadcasting.

    Parameters
    ----------
    X : ndarray of shape (n, d)
    Y : ndarray of shape (m, d)

    Returns
    -------
    ndarray of shape (n, m) with distances[i, j] = ||X[i] - Y[j]||.
    """
    X_norm = np.sum(X ** 2, axis=1).reshape(-1, 1)   # (n, 1)
    Y_norm = np.sum(Y ** 2, axis=1).reshape(1, -1)   # (1, m)
    sq = X_norm + Y_norm - 2 * np.dot(X, Y.T)
    # Fix: floating-point cancellation can push true-zero distances slightly
    # negative, and sqrt of a negative yields NaN -- clamp at zero first.
    return np.sqrt(np.maximum(sq, 0.0))
# Benchmark: loop vs vectorized distance computation on 100x100 pairs
X = np.random.randn(100, 50)
Y = np.random.randn(100, 50)
start = time.time()
dist_loop = euclidean_distance_loop(X, Y)
time_loop = time.time() - start
start = time.time()
dist_vec = euclidean_distance_vectorized(X, Y)
time_vec = time.time() - start
print(f"Loop version: {time_loop:.3f}s")
print(f"Vectorized: {time_vec:.3f}s")
print(f"\nπ Speedup: {time_loop/time_vec:.0f}x")
# NOTE(review): the original had this literal split across a physical newline
# by a mojibake-damaged emoji (a SyntaxError); rejoined on one line.
print(f"✅ Results match: {np.allclose(dist_loop, dist_vec)}")
Caching and Memoization: Trading Memory for Speed
functools.lru_cache stores the results of expensive function calls in a dictionary keyed by arguments, returning cached results on subsequent calls with the same inputs. For deterministic functions called repeatedly with the same arguments — feature engineering pipelines, hyperparameter search evaluations, or API calls — caching transforms \(O(n \cdot T)\) total computation into \(O(k \cdot T + (n-k) \cdot O(1))\), where \(k\) is the number of unique inputs and \(T\) is the per-call cost. The maxsize parameter controls the LRU eviction policy; set it to None for unlimited caching when memory is not a concern. In ML contexts, scikit-learn's memory parameter in Pipeline provides disk-based caching of transformer outputs, which persists across Python sessions.
from functools import lru_cache
# Expensive computation
def expensive_computation(n):
    """Pretend-expensive function: sleep 0.1 s, then return n squared."""
    time.sleep(0.1)  # stand-in for real work
    return n * n
# With caching
@lru_cache(maxsize=128)
def expensive_computation_cached(n):
    """Memoized twin of expensive_computation: the 0.1 s cost is paid only
    once per distinct argument; repeats are served from the LRU cache."""
    time.sleep(0.1)
    return n * n
# Time three identical calls with and without memoization
def _time_three_calls(fn, arg):
    """Call fn(arg) three times; return total elapsed seconds."""
    t0 = time.time()
    for _ in range(3):
        result = fn(arg)
    return time.time() - t0

time_nocache = _time_three_calls(expensive_computation, 5)
time_cached = _time_three_calls(expensive_computation_cached, 5)
print(f"Without cache (3 calls): {time_nocache:.3f}s")
print(f"With cache (3 calls): {time_cached:.3f}s")
print(f"\nπ Speedup: {time_nocache/time_cached:.0f}x")
Batch Processing: Amortizing Per-Call Overhead
Calling model.predict() on one sample at a time is dramatically slower than predicting on an entire array because each call incurs fixed overhead: input validation, array shape checks, and (for tree ensembles) tree traversal setup. Batch prediction passes the full array once, allowing scikit-learn to optimize memory access patterns and leverage BLAS-level parallelism. The same principle applies to API-based inference (LLM providers charge per request, not per token, making batched requests cheaper), database queries (one SELECT with 1000 IDs vs. 1000 individual queries), and GPU computation (GPUs achieve peak throughput only when processing large batches that saturate their parallel cores).
# Example: per-sample vs batch prediction overhead
from sklearn.ensemble import RandomForestClassifier

# Train a small model to predict with
X_train, y_train = make_classification(n_samples=1000, n_features=20, random_state=42)
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X_train, y_train)

# 10,000 unseen rows to score
X_test = np.random.randn(10000, 20)

# Per-sample calls: validation/setup overhead is paid 10,000 times
start = time.time()
predictions_single = [model.predict(sample.reshape(1, -1))[0] for sample in X_test]
time_single = time.time() - start

# One batch call: overhead paid once for the whole array
start = time.time()
predictions_batch = model.predict(X_test)
time_batch = time.time() - start

print(f"One-by-one: {time_single:.3f}s")
print(f"Batch: {time_batch:.3f}s")
print(f"\nπ Speedup: {time_single/time_batch:.0f}x")
🎯 Key Takeaways
Always Profile First
Don't guess where bottlenecks are
Use cProfile, line_profiler, memory_profiler
Focus optimization efforts on hotspots
Vectorization is Key
Replace Python loops with NumPy operations
Use built-in functions (sum, mean, etc.)
Leverage broadcasting
Memory Matters
Use in-place operations when possible
Avoid unnecessary copies
Choose appropriate data types (float32 vs float64)
Parallelize When Possible
Use n_jobs=-1 in scikit-learn
Leverage multiprocessing for CPU-bound tasks
Batch operations for efficiency
Cache Expensive Operations
Use @lru_cache for repeated computations
Store intermediate results
Reuse preprocessed data
📋 Optimization Checklist
✅ Profile before optimizing
✅ Replace loops with vectorized operations
✅ Use appropriate data types
✅ Operate in-place when possible
✅ Parallelize independent operations
✅ Cache expensive computations
✅ Batch process when applicable
✅ Use compiled libraries (NumPy, scikit-learn)
Continue to Notebook 4 to learn model-specific debugging! 🚀