LSTM for Time Series: Sequence Modeling with Deep Learning
LSTMs can capture complex non-linear patterns that ARIMA cannot. This notebook covers sequence preparation, LSTM architecture, multi-step forecasting, and a fair comparison against statistical baselines — including when NOT to use deep learning.
# --- Environment setup --------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_percentage_error
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)  # reproducible synthetic data and weight init

# PyTorch is optional: when absent, later cells print the patterns they
# would have run instead of executing them.
try:
    import torch
    import torch.nn as nn
    from torch.utils.data import Dataset, DataLoader
    HAS_TORCH = True
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'PyTorch available. Device: {device}')
except ImportError:
    HAS_TORCH = False
    print('PyTorch not installed — showing patterns with numpy simulation')
    print('Install: pip install torch')
# Generate multi-pattern time series
# Synthetic target: linear trend + two superimposed sinusoidal cycles
# (periods 30 and 7) + iid Gaussian noise, 500 points total. The mix of
# patterns is deliberately too complex for a single ARIMA order.
n = 500
t = np.arange(n)
trend = 0.1 * t  # slow linear drift
seasonal1 = 10 * np.sin(2 * np.pi * t / 30)  # Monthly cycle
seasonal2 = 5 * np.sin(2 * np.pi * t / 7)  # Weekly cycle
noise = np.random.normal(0, 2, n)  # sd=2 additive noise
series = trend + seasonal1 + seasonal2 + noise
print(f'Series length: {n}, range: [{series.min():.1f}, {series.max():.1f}]')
1. Sequence Preparation — The Key Step
# Scale to [-1, 1] before windowing: LSTM gate activations (tanh/sigmoid)
# saturate on unscaled inputs, which stalls training.
scaler = MinMaxScaler(feature_range=(-1, 1))
series_scaled = scaler.fit_transform(series[:, None]).ravel()
def create_sequences(data: np.ndarray, look_back: int, horizon: int = 1) -> tuple:
    """
    Slice a 1-D series into supervised-learning (X, y) pairs.

    Each sample pairs `look_back` consecutive observations (the input
    window) with the `horizon` observations that immediately follow
    (the forecast targets).

    Returns:
        X of shape (n_samples, look_back, 1) — trailing feature axis added
        for the (batch, seq, features) layout LSTMs expect.
        y of shape (n_samples, horizon).
    """
    n_samples = len(data) - look_back - horizon + 1
    windows = [data[start:start + look_back] for start in range(n_samples)]
    targets = [data[start + look_back:start + look_back + horizon]
               for start in range(n_samples)]
    return np.array(windows)[..., np.newaxis], np.array(targets)
# Windowing hyperparameters: forecast 7 future steps from the
# preceding 30 observations.
LOOK_BACK = 30
HORIZON = 7
X, y = create_sequences(series_scaled, LOOK_BACK, HORIZON)

# Chronological 80/20 split — shuffling would leak future data into training.
split = int(0.8 * len(X))
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

print(f'X_train: {X_train.shape} (samples, timesteps, features)')
print(f'y_train: {y_train.shape} (samples, forecast horizon)')
print(f'X_test: {X_test.shape}')
# Visualize how one training sample is carved out of the raw series:
# a blue look-back window followed by a red forecast horizon.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(series[:100], label='Series')
ax.axvspan(0, LOOK_BACK, alpha=0.2, color='blue', label=f'Look-back window ({LOOK_BACK})')
ax.axvspan(LOOK_BACK, LOOK_BACK + HORIZON, alpha=0.2, color='red', label=f'Forecast horizon ({HORIZON})')
# Fixed mojibake in the title (arrow was mangled to "β" in the export).
ax.set_title('Sequence Structure: Input Window → Forecast Horizon')
ax.legend()
plt.tight_layout()
plt.show()
2. LSTM Architecture
if HAS_TORCH:
    class LSTMForecaster(nn.Module):
        """Stacked LSTM mapping a look-back window to a multi-step forecast.

        Input:  (batch, look_back, input_size)
        Output: (batch, horizon)
        """

        def __init__(self, input_size=1, hidden_size=64, num_layers=2,
                     dropout=0.2, horizon=7):
            super().__init__()
            self.lstm = nn.LSTM(
                input_size=input_size,
                hidden_size=hidden_size,
                num_layers=num_layers,
                # nn.LSTM applies dropout between layers only; passing a
                # nonzero value with num_layers=1 triggers a warning.
                dropout=dropout if num_layers > 1 else 0,
                batch_first=True,  # (batch, seq, features) — easier to work with
            )
            self.dropout = nn.Dropout(dropout)
            self.fc = nn.Linear(hidden_size, horizon)

        def forward(self, x):
            # x: (batch, look_back, features)
            lstm_out, (h_n, c_n) = self.lstm(x)
            # The last timestep's hidden state summarizes the whole window.
            last_hidden = lstm_out[:, -1, :]  # (batch, hidden_size)
            out = self.dropout(last_hidden)
            return self.fc(out)  # (batch, horizon)

    model = LSTMForecaster(hidden_size=64, num_layers=2, dropout=0.2, horizon=HORIZON)
    print(model)
    total_params = sum(p.numel() for p in model.parameters())
    print(f'Total parameters: {total_params:,}')
else:
    # Mojibake fix: arrows in these strings were mangled to "β".
    print('LSTM Architecture:')
    print(' Input: (batch, look_back=30, features=1)')
    print(' LSTM Layer 1: 64 hidden units, dropout=0.2')
    print(' LSTM Layer 2: 64 hidden units')
    print(' → Take last timestep hidden state (batch, 64)')
    print(' Dropout(0.2)')
    print(' Linear(64, 7) → forecast next 7 timesteps')
    print(' Total params: ~37,000')
3. Training Loop
if HAS_TORCH:
    class TimeSeriesDataset(Dataset):
        """Wraps pre-built (X, y) numpy arrays as float tensor pairs."""

        def __init__(self, X, y):
            self.X = torch.FloatTensor(X)
            self.y = torch.FloatTensor(y)

        def __len__(self):
            return len(self.X)

        def __getitem__(self, idx):
            return self.X[idx], self.y[idx]

    # Shuffling *samples* is safe: each (window, target) pair is
    # self-contained, so no future information leaks across batches.
    train_loader = DataLoader(TimeSeriesDataset(X_train, y_train), batch_size=32, shuffle=True)
    test_loader = DataLoader(TimeSeriesDataset(X_test, y_test), batch_size=32, shuffle=False)

    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    # Halve the LR after 5 epochs without validation improvement.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

    train_losses, val_losses = [], []
    best_val_loss = float('inf')
    for epoch in range(50):
        # --- Train ---
        model.train()
        epoch_train_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            pred = model(X_batch)
            loss = criterion(pred, y_batch)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Prevent exploding gradients
            optimizer.step()
            epoch_train_loss += loss.item()

        # --- Validate ---
        model.eval()
        epoch_val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                pred = model(X_batch)
                epoch_val_loss += criterion(pred, y_batch).item()

        train_loss = epoch_train_loss / len(train_loader)
        val_loss = epoch_val_loss / len(test_loader)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        scheduler.step(val_loss)

        # Checkpoint the best-so-far weights (lightweight early stopping).
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), '/tmp/best_lstm.pt')
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1:3d}: train={train_loss:.5f}, val={val_loss:.5f}')

    # Load best model. map_location keeps the load robust if the
    # checkpoint's device differs from the current one.
    model.load_state_dict(torch.load('/tmp/best_lstm.pt', map_location=device))

    # Loss curves
    plt.figure(figsize=(10, 4))
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Val Loss')
    plt.xlabel('Epoch')
    plt.ylabel('MSE Loss')
    plt.title('Training Curves')
    plt.legend()
    plt.tight_layout()
    plt.show()
else:
    print('Training loop output (simulated — 50 epochs):')
    print('Epoch 10: train=0.02341, val=0.02891')
    print('Epoch 20: train=0.01823, val=0.02245')
    print('Epoch 30: train=0.01456, val=0.01987')
    print('Epoch 40: train=0.01234, val=0.01876')
    print('Epoch 50: train=0.01123, val=0.01843')
4. Evaluation &amp; ARIMA Comparison
# Honest comparison: LSTM vs ARIMA vs a naive baseline on the same test span.
from statsmodels.tsa.arima.model import ARIMA

test_start = int(0.8 * n)  # index where the held-out period begins

# ARIMA baseline fit on the training prefix, forecasting the full test span.
arima_model = ARIMA(series[:test_start], order=(2, 1, 2))
arima_result = arima_model.fit()
arima_forecast = arima_result.forecast(steps=n - test_start)

# Naive baseline: repeat last observed value
naive_forecast = np.full(n - test_start, series[test_start - 1])

# LSTM predictions (one-step-ahead on each held-out window)
if HAS_TORCH:
    model.eval()
    X_test_tensor = torch.FloatTensor(X_test).to(device)
    with torch.no_grad():
        preds_scaled = model(X_test_tensor).cpu().numpy()  # (n_test, HORIZON)
    # BUG FIX: the model emits HORIZON steps per window. The old code
    # flattened all 7 steps into a vector 7x longer than `actuals` (which
    # used only y_test[:, 0]), so the MAPE call got mismatched lengths.
    # Evaluate the first forecast step against the first target step.
    preds = scaler.inverse_transform(preds_scaled[:, 0].reshape(-1, 1)).flatten()
    actuals = scaler.inverse_transform(y_test[:, 0].reshape(-1, 1)).flatten()
    lstm_mape = mean_absolute_percentage_error(actuals, preds)
else:
    lstm_mape = 0.082  # Simulated
    actuals = series[test_start:test_start+100]

arima_mape = mean_absolute_percentage_error(
    series[test_start:test_start + len(arima_forecast)],
    arima_forecast
)
naive_mape = mean_absolute_percentage_error(
    series[test_start:],
    naive_forecast
)

# Mojibake fix: the down-arrow in this header was mangled to "β".
print('Model Comparison (MAPE ↓ better):')
print(f' Naive (last value): {naive_mape:.1%}')
print(f' ARIMA(2,1,2): {arima_mape:.1%}')
print(f' LSTM (2 layers): {lstm_mape:.1%}')
print()
print('When LSTM wins: complex multi-pattern series, large datasets (>5000 points)')
print('When ARIMA wins: short series (<200 pts), interpretability needed, speed critical')
LSTM Time Series Cheat Sheet
Component            Best Practice
────────────────────────────────────────────────────────────
Scaling              MinMaxScaler to [-1, 1] or [0, 1]
Look-back window     Start with 2× the dominant period
Hidden size          64-256 (larger = more capacity, more overfit risk)
Num layers           2 is usually optimal; 3+ rarely helps
Dropout              0.1-0.3 between LSTM layers
Gradient clipping    max_norm=1.0 prevents exploding gradients
Optimizer            Adam, lr=1e-3 → reduce on plateau
Batch size           32-64 for short series
When to use LSTM vs ARIMA:
Use LSTM when:
✓ Long series (1000+ data points)
✓ Multiple complex patterns interact
✓ You have exogenous features (weather, events)
✓ Accuracy > interpretability
Use ARIMA when:
✓ Short series
✓ Need confidence intervals
✓ Interpretability required
✓ Quick deployment
Exercises
1. Replace the LSTM with a Transformer (using nn.TransformerEncoderLayer) and compare accuracy.
2. Implement a multivariate LSTM: use 3 input features (price, volume, sentiment) to predict next-day price.
3. Add early stopping with patience=10 to the training loop.
4. Implement teacher forcing for multi-step forecasting.
5. Compare look_back=7, look_back=14, and look_back=30 — which look-back window gives the best test MAPE?