LSTM for Time Series: Sequence Modeling with Deep Learning

LSTMs can capture complex non-linear patterns that ARIMA cannot. This notebook covers sequence preparation, LSTM architecture, multi-step forecasting, and a fair comparison against statistical baselines — including when NOT to use deep learning.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_percentage_error
import warnings
warnings.filterwarnings('ignore')

# Fix the RNG seed so the synthetic series (and thus all results) are reproducible.
np.random.seed(42)

# Optional torch import: when PyTorch is missing the notebook degrades to
# printing "simulated" output instead of training (see the else branches below).
try:
    import torch
    import torch.nn as nn
    from torch.utils.data import Dataset, DataLoader
    HAS_TORCH = True
    # Prefer the GPU when available; the model and all batches are moved here.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'PyTorch available. Device: {device}')
except ImportError:
    HAS_TORCH = False
    print('PyTorch not installed β€” showing patterns with numpy simulation')
    print('Install: pip install torch')

# Build a synthetic series mixing several patterns that a linear model would
# struggle to separate: linear trend + two sinusoidal cycles + Gaussian noise.
n = 500
t = np.arange(n)
series = (
    0.1 * t                              # slow upward trend
    + 10 * np.sin(2 * np.pi * t / 30)    # "monthly" cycle, period 30
    + 5 * np.sin(2 * np.pi * t / 7)      # "weekly" cycle, period 7
    + np.random.normal(0, 2, n)          # observation noise, std = 2
)

print(f'Series length: {n}, range: [{series.min():.1f}, {series.max():.1f}]')

1. Sequence Preparation — The Key Step

# Scale first (LSTM is sensitive to scale)
# MinMax to [-1, 1] keeps inputs in tanh's active range; the fitted scaler is
# kept so predictions can be inverse-transformed back to original units later.
# NOTE(review): the scaler is fit on the FULL series (train + test), a mild
# leak -- fitting on the training prefix only would be stricter.
scaler = MinMaxScaler(feature_range=(-1, 1))
series_scaled = scaler.fit_transform(series.reshape(-1, 1)).flatten()

def create_sequences(data: np.ndarray, look_back: int, horizon: int = 1) -> tuple:
    """
    Create (X, y) pairs for supervised sequence modeling via a sliding window.

    Parameters
    ----------
    data : np.ndarray
        1-D array of (scaled) observations.
    look_back : int
        How many past timesteps to use as input.
    horizon : int, default 1
        How many future timesteps to predict.

    Returns
    -------
    X : np.ndarray, shape (n_samples, look_back, 1)
    y : np.ndarray, shape (n_samples, horizon)

    Notes
    -----
    Fix over the original: when ``data`` is too short to form even one
    window, the naive ``np.array([])[..., np.newaxis]`` path produced
    shapes (0, 1) and (0,), which breaks any downstream consumer expecting
    a 3-D input; we now return correctly shaped empty arrays.
    """
    n_samples = len(data) - look_back - horizon + 1
    if n_samples <= 0:
        # Not enough data for a single (window, target) pair.
        return (np.empty((0, look_back, 1), dtype=float),
                np.empty((0, horizon), dtype=float))
    X = np.array([data[i : i + look_back] for i in range(n_samples)])
    y = np.array([data[i + look_back : i + look_back + horizon]
                  for i in range(n_samples)])
    return X[..., np.newaxis], y  # add trailing feature dimension

# Window hyperparameters. The cheat sheet below suggests look_back of about
# 2x the dominant period; the strongest cycle here has period 30.
LOOK_BACK = 30   # Use last 30 timesteps to predict
HORIZON   = 7    # Predict next 7 steps

X, y = create_sequences(series_scaled, LOOK_BACK, HORIZON)

# Train/test split (no shuffle — time series!)
# Chronological 80/20 split: shuffling would leak future values into training.
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

print(f'X_train: {X_train.shape}  (samples, timesteps, features)')
print(f'y_train: {y_train.shape}  (samples, forecast horizon)')
print(f'X_test:  {X_test.shape}')

# Visualize sequences: shade one example input window and the horizon it predicts.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(series[:100], label='Series')
# Blue span = model input (look-back window); red span = targets (forecast horizon).
ax.axvspan(0, LOOK_BACK, alpha=0.2, color='blue', label=f'Look-back window ({LOOK_BACK})')
ax.axvspan(LOOK_BACK, LOOK_BACK + HORIZON, alpha=0.2, color='red', label=f'Forecast horizon ({HORIZON})')
ax.set_title('Sequence Structure: Input Window β†’ Forecast Horizon')
ax.legend()
plt.tight_layout()
plt.show()

2. LSTM Architecture

if HAS_TORCH:
    class LSTMForecaster(nn.Module):
        """Stacked LSTM mapping a look-back window to a multi-step forecast.

        Input:  (batch, look_back, input_size)
        Output: (batch, horizon) -- one linear head emits all steps at once.
        """

        def __init__(self, input_size=1, hidden_size=64, num_layers=2,
                     dropout=0.2, horizon=7):
            super().__init__()
            # Inter-layer dropout only exists for stacked LSTMs; disable it
            # for a single layer to avoid a PyTorch warning.
            inter_layer_dropout = dropout if num_layers > 1 else 0
            self.lstm = nn.LSTM(
                input_size=input_size,
                hidden_size=hidden_size,
                num_layers=num_layers,
                dropout=inter_layer_dropout,
                batch_first=True,  # tensors arranged (batch, seq, features)
            )
            self.dropout = nn.Dropout(dropout)
            self.fc = nn.Linear(hidden_size, horizon)

        def forward(self, x):
            """x: (batch, look_back, features) -> (batch, horizon)."""
            outputs, _state = self.lstm(x)
            # Summarize the whole window by the final timestep's hidden state.
            summary = self.dropout(outputs[:, -1, :])  # (batch, hidden_size)
            return self.fc(summary)                    # (batch, horizon)

    model = LSTMForecaster(hidden_size=64, num_layers=2, dropout=0.2, horizon=HORIZON)
    print(model)
    total_params = sum(p.numel() for p in model.parameters())
    print(f'Total parameters: {total_params:,}')
else:
    print('LSTM Architecture:')
    print('  Input: (batch, look_back=30, features=1)')
    print('  LSTM Layer 1: 64 hidden units, dropout=0.2')
    print('  LSTM Layer 2: 64 hidden units')
    print('  β†’ Take last timestep hidden state (batch, 64)')
    print('  Dropout(0.2)')
    print('  Linear(64, 7) β†’ forecast next 7 timesteps')
    print('  Total params: ~37,000')

3. Training Loop

if HAS_TORCH:
    class TimeSeriesDataset(Dataset):
        # Minimal Dataset wrapper: converts the numpy (X, y) arrays to float32
        # tensors once up front, then indexes into them.
        def __init__(self, X, y):
            self.X = torch.FloatTensor(X)
            self.y = torch.FloatTensor(y)
        def __len__(self): return len(self.X)
        def __getitem__(self, idx): return self.X[idx], self.y[idx]
    
    # Shuffling TRAIN batches is safe: each (window, target) pair is already
    # self-contained, and the chronological split prevented leakage.
    train_loader = DataLoader(TimeSeriesDataset(X_train, y_train), batch_size=32, shuffle=True)
    test_loader  = DataLoader(TimeSeriesDataset(X_test,  y_test),  batch_size=32, shuffle=False)
    
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    # Halve the learning rate after 5 epochs with no validation improvement.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)
    
    train_losses, val_losses = [], []
    best_val_loss = float('inf')
    
    # NOTE(review): the test set doubles as the validation set and the best
    # checkpoint is selected on it -- fine for a tutorial, but it inflates the
    # final "test" score; a real setup would use a third split.
    for epoch in range(50):
        # Train
        model.train()
        epoch_train_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            pred = model(X_batch)
            loss = criterion(pred, y_batch)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Prevent exploding gradients
            optimizer.step()
            epoch_train_loss += loss.item()
        
        # Validate (no gradients; eval() disables dropout)
        model.eval()
        epoch_val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                pred = model(X_batch)
                epoch_val_loss += criterion(pred, y_batch).item()
        
        # Average the summed per-batch losses so epochs are comparable.
        train_loss = epoch_train_loss / len(train_loader)
        val_loss   = epoch_val_loss / len(test_loader)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        scheduler.step(val_loss)
        
        # Checkpoint the best-so-far weights (simple model selection).
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), '/tmp/best_lstm.pt')
        
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1:3d}: train={train_loss:.5f}, val={val_loss:.5f}')
    
    # Load best model (weights from the epoch with the lowest validation loss)
    model.load_state_dict(torch.load('/tmp/best_lstm.pt'))
    
    # Loss curves
    plt.figure(figsize=(10, 4))
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Val Loss')
    plt.xlabel('Epoch')
    plt.ylabel('MSE Loss')
    plt.title('Training Curves')
    plt.legend()
    plt.tight_layout()
    plt.show()
else:
    print('Training loop output (simulated β€” 50 epochs):')
    print('Epoch  10: train=0.02341, val=0.02891')
    print('Epoch  20: train=0.01823, val=0.02245')
    print('Epoch  30: train=0.01456, val=0.01987')
    print('Epoch  40: train=0.01234, val=0.01876')
    print('Epoch  50: train=0.01123, val=0.01843')

4. Evaluation & ARIMA Comparison

# Honest comparison: LSTM vs naive baselines
# statsmodels is imported here (mid-file) so the earlier cells run without it.
from statsmodels.tsa.arima.model import ARIMA

# ARIMA baseline
# Fit on the same 80% training prefix, in the ORIGINAL scale (ARIMA needs no
# scaling), then forecast the entire remaining 20% in one multi-step call.
arima_model = ARIMA(series[: int(0.8*n)], order=(2, 1, 2))
arima_result = arima_model.fit()
arima_forecast = arima_result.forecast(steps=len(series) - int(0.8*n))

# Naive baseline: repeat last observed value
# Any model worth deploying must beat this "persistence" forecast.
test_start = int(0.8*n)
naive_forecast = np.full(n - test_start, series[test_start - 1])

# LSTM predictions
if HAS_TORCH:
    model.eval()
    X_test_tensor = torch.FloatTensor(X_test).to(device)
    with torch.no_grad():
        preds_scaled = model(X_test_tensor).cpu().numpy()  # (n_samples, HORIZON)
    
    # Inverse-transform predictions AND targets with the same flattening.
    #
    # BUG FIX: the original flattened all HORIZON steps of the predictions
    # (n_samples * HORIZON values) but compared them against only the first
    # step of y_test (n_samples values) -- mismatched lengths that make
    # mean_absolute_percentage_error raise. Flattening y_test identically
    # scores every forecast step against its true value.
    preds = scaler.inverse_transform(
        preds_scaled.reshape(-1, 1)
    ).flatten()
    actuals = scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()
    lstm_mape = mean_absolute_percentage_error(actuals, preds)
else:
    lstm_mape = 0.082  # Simulated
    actuals = series[test_start:test_start+100]

# Score ARIMA and the naive baseline on the raw (unscaled) test tail.
# NOTE(review): the three MAPEs are not computed on identical targets -- the
# LSTM is scored per forecast window while ARIMA/naive are scored on the raw
# test segment -- so treat the comparison as indicative, not exact.
arima_mape = mean_absolute_percentage_error(
    series[test_start:test_start + len(arima_forecast)],
    arima_forecast
)
naive_mape = mean_absolute_percentage_error(
    series[test_start:],
    naive_forecast
)

print('Model Comparison (MAPE ↓ better):')
print(f'  Naive (last value):  {naive_mape:.1%}')
print(f'  ARIMA(2,1,2):        {arima_mape:.1%}')
print(f'  LSTM (2 layers):     {lstm_mape:.1%}')
print()
print('When LSTM wins: complex multi-pattern series, large datasets (>5000 points)')
print('When ARIMA wins: short series (<200 pts), interpretability needed, speed critical')

LSTM Time Series Cheat Sheet

Component              Best Practice
────────────────────────────────────────────────────────────
Scaling                MinMaxScaler to [-1, 1] or [0, 1]
Look-back window       Start with 2× the dominant period
Hidden size            64-256 (larger = more capacity, more overfit risk)
Num layers             2 is usually optimal; 3+ rarely helps
Dropout                0.1-0.3 between LSTM layers
Gradient clipping      max_norm=1.0 prevents exploding gradients
Optimizer              Adam, lr=1e-3 → reduce on plateau
Batch size             32-64 for short series

When to use LSTM vs ARIMA:
  Use LSTM when:
    → Long series (1000+ data points)
    → Multiple complex patterns interact
    → You have exogenous features (weather, events)
    → Accuracy > interpretability
  Use ARIMA when:
    → Short series
    → Need confidence intervals
    → Interpretability required
    → Quick deployment

Exercises

  1. Replace LSTM with a Transformer (using nn.TransformerEncoderLayer) and compare accuracy.

  2. Implement multi-variate LSTM: use 3 input features (price, volume, sentiment) to predict next-day price.

  3. Add early stopping with patience=10 to the training loop.

  4. Implement teacher forcing for multi-step forecasting.

  5. Compare look_back=7, look_back=14, look_back=30 — which look-back window gives the best test MAPE?