Time Series Fundamentals: Decomposition, Stationarity, and Autocorrelation

Time series data has structure that standard ML ignores: trend, seasonality, and autocorrelation. This notebook builds the intuition and tools to understand any time series before you model it.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

# --- Synthetic data: 3 years of monthly retail sales -----------------------
# Built from four known additive components so the decomposition sections
# below have a ground truth to recover.
n = 36
dates = pd.date_range('2021-01-01', periods=n, freq='MS')

t = np.arange(n)
trend = np.linspace(100, 160, n)                      # steady upward drift
seasonality = 20 * np.sin(2 * np.pi * t / 12)         # 12-month cycle
cyclical = 10 * np.sin(2 * np.pi * t / 24)            # slower 2-year wave
noise = np.random.normal(0, 5, n)                     # iid Gaussian noise

ts = pd.Series(trend + seasonality + cyclical + noise,
               index=dates, name='monthly_sales')

print(f'Time series: {len(ts)} months, {ts.index[0].date()} to {ts.index[-1].date()}')
print(f'Mean: {ts.mean():.1f}, Std: {ts.std():.1f}, Range: [{ts.min():.1f}, {ts.max():.1f}]')
ts.head()

1. Visual Exploration — The First Thing to Do

# ---- 1. Visual exploration: raw series, rolling stats, seasonal shape -----
MONTHS = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

fig, axes = plt.subplots(4, 1, figsize=(13, 12), sharex=False)
ax_raw, ax_roll, ax_yoy, ax_box = axes

# Panel 1: the series as-is.
ts.plot(ax=ax_raw, color='steelblue', linewidth=1.5)
ax_raw.set_title('Raw Time Series: Monthly Sales')
ax_raw.set_ylabel('Sales ($K)')

# Panel 2: 3-month rolling mean with a 2-sigma band around it.
rolling_mean = ts.rolling(window=3).mean()
rolling_std = ts.rolling(window=3).std()

ts.plot(ax=ax_roll, alpha=0.5, color='gray', label='Raw')
rolling_mean.plot(ax=ax_roll, color='blue', linewidth=2, label='3M rolling mean')
ax_roll.fill_between(ts.index,
                     rolling_mean - 2 * rolling_std,
                     rolling_mean + 2 * rolling_std,
                     alpha=0.2, color='blue', label='Β±2Οƒ band')
ax_roll.set_title('Rolling Mean & Confidence Band')
ax_roll.legend()

# Panel 3: overlay each year on a Jan-Dec axis to expose the seasonal shape.
for year in (2021, 2022, 2023):
    one_year = ts[ts.index.year == year]
    if one_year.empty:
        continue
    ax_yoy.plot(range(len(one_year)), one_year.values,
                marker='o', markersize=4, label=str(year))
ax_yoy.set_xticks(range(12))
ax_yoy.set_xticklabels(MONTHS)
ax_yoy.set_title('Year-over-Year Seasonal Pattern')
ax_yoy.legend()
ax_yoy.set_ylabel('Sales ($K)')

# Panel 4: per-calendar-month distributions pooled across the three years.
by_month = [ts[ts.index.month == m].values for m in range(1, 13)]
ax_box.boxplot(by_month, labels=MONTHS)
ax_box.set_title('Distribution by Month')
ax_box.set_ylabel('Sales ($K)')

plt.tight_layout()
plt.show()

2. Decomposition — Separating Signal from Noise

# ---- 2. Classical decomposition -------------------------------------------
# Additive:       observed = trend + seasonal + residual (seasonal amplitude CONSTANT)
# Multiplicative: observed = trend * seasonal * residual (amplitude GROWS with trend)
# One row of panels per model so the two assumptions can be compared directly.

fig, axes = plt.subplots(2, 4, figsize=(16, 8))

panel_titles = ['Observed', 'Trend', 'Seasonal', 'Residual']
for row, model in enumerate(['additive', 'multiplicative']):
    try:
        result = seasonal_decompose(ts, model=model, period=12)
        parts = [ts, result.trend, result.seasonal, result.resid]
        for idx, (part, title) in enumerate(zip(parts, panel_titles)):
            # Residual panel drawn in coral to set it apart from the signal panels.
            part.plot(ax=axes[row, idx], color='steelblue' if idx < 3 else 'coral')
            axes[row, idx].set_title(f'{model.title()}\n{title}')
            axes[row, idx].set_xlabel('')
    except Exception as err:
        # e.g. multiplicative decomposition fails on non-positive values.
        axes[row, 0].text(0.5, 0.5, str(err), transform=axes[row, 0].transAxes)

plt.tight_layout()
plt.show()

# ---- STL decomposition: loess-based, robust to outliers --------------------
try:
    from statsmodels.tsa.seasonal import STL
    fit = STL(ts, period=12, robust=True).fit()

    fig = fit.plot()
    fig.set_size_inches(12, 8)
    fig.suptitle('STL Decomposition (Robust β€” handles outliers)', fontsize=13)
    plt.tight_layout()
    plt.show()

    # Strength-of-component measures: 1 minus the share of variance the
    # residual keeps once the component is added back, floored at 0.
    resid_var = fit.resid.var()
    strength_trend = max(0, 1 - resid_var / (fit.trend + fit.resid).var())
    strength_seasonal = max(0, 1 - resid_var / (fit.seasonal + fit.resid).var())
    print(f'Trend strength:    {strength_trend:.3f}  (0=none, 1=dominant)')
    print(f'Seasonal strength: {strength_seasonal:.3f}  (>0.6 = strong seasonality)')
except ImportError:
    print('STL not available in this statsmodels version')

3. Stationarity — What ARIMA Actually Requires

def stationarity_tests(series: pd.Series, name: str = '') -> dict:
    """Run ADF and KPSS on *series*, print a joint verdict, return the results.

    The two tests have opposite null hypotheses:
      ADF:  H0 = non-stationary (unit root)  -> stationary when p < 0.05
      KPSS: H0 = stationary                  -> stationary when p > 0.05
    Requiring both to agree avoids the false conclusions either test
    alone can produce.
    """
    clean = series.dropna()
    adf_stat, adf_p = adfuller(clean)[:2]
    kpss_stat, kpss_p = kpss(clean, regression='c')[:2]

    stationary = adf_p < 0.05 and kpss_p > 0.05
    adf_verdict = "Stationary" if adf_p < 0.05 else "NON-stationary"
    kpss_verdict = "Stationary" if kpss_p > 0.05 else "NON-stationary"

    print(f'=== Stationarity Tests: {name} ===')
    print(f'ADF:  stat={adf_stat:.4f}, p={adf_p:.4f} β†’ {adf_verdict}')
    print(f'KPSS: stat={kpss_stat:.4f}, p={kpss_p:.4f} β†’ {kpss_verdict}')
    print(f'Conclusion: {"βœ… STATIONARY" if stationary else "❌ NOT STATIONARY β€” differencing needed"}')
    return {'stationary': stationary, 'adf_p': adf_p, 'kpss_p': kpss_p}

# Build the three standard transforms up front, then test each in turn.
ts_diff1 = ts.diff().dropna()           # first difference: removes linear trend
ts_sdiff = ts.diff(12).dropna()         # seasonal difference: removes annual cycle
ts_both = ts.diff().diff(12).dropna()   # both (SARIMA-style d=1, D=1)

result_raw = stationarity_tests(ts, 'Original series')
print()
result_d1 = stationarity_tests(ts_diff1, '1st difference')
print()
result_sd = stationarity_tests(ts_sdiff, 'Seasonal difference (lag=12)')
print()
result_both = stationarity_tests(ts_both, '1st + Seasonal difference')

# Visualize each transform with its mean line: a flat mean across the whole
# span is the visual signature of (mean-)stationarity.
panels = [
    (ts, 'Original'),
    (ts_diff1, '1st Difference'),
    (ts_sdiff, 'Seasonal Difference (12)'),
    (ts_both, 'Both Differences'),
]
fig, axes = plt.subplots(4, 1, figsize=(13, 10), sharex=False)
for ax, (series, title) in zip(axes, panels):
    series.plot(ax=ax, color='steelblue')
    mu = series.mean()
    ax.axhline(mu, color='red', linestyle='--', alpha=0.7, label=f'Mean={mu:.1f}')
    ax.set_title(title)
    ax.legend(fontsize=8)
plt.tight_layout()
plt.show()

4. ACF & PACF — Choosing ARIMA Orders

# ---- 4. ACF & PACF: choosing ARIMA orders ---------------------------------
# FIX: the original comment had the orders swapped (it tagged ACF with p and
# PACF with q). Correct mapping, consistent with the plot titles below:
#   ACF  -> MA order (q): total correlation between Y_t and Y_{t-k}.
#   PACF -> AR order (p): correlation at lag k after removing the effect of
#           the intermediate lags 1..k-1.
#
# Reading the plots to identify ARIMA(p,d,q):
#   AR(p): PACF cuts off after lag p, ACF decays gradually
#   MA(q): ACF cuts off after lag q, PACF decays gradually
#   Both:  Both decay gradually β†’ ARMA

stationary_ts = ts_both  # Already stationary after double differencing

# BUG FIX: ts_both keeps only len(ts) - 13 = 23 observations, so a fixed
# lags=24 exceeds statsmodels' limits (ACF needs lags < nobs; PACF needs
# lags < nobs // 2) and raises ValueError. Cap the lags from the data.
n_obs = len(stationary_ts)
acf_lags = min(24, n_obs - 1)
pacf_lags = min(24, n_obs // 2 - 1)

fig, axes = plt.subplots(2, 2, figsize=(14, 8))

# ACF with 95% confidence band (alpha=0.05).
plot_acf(stationary_ts, lags=acf_lags, ax=axes[0, 0], alpha=0.05)
axes[0, 0].set_title('ACF β€” Stationarized Series\n(Significant lags suggest MA order)')

# PACF via OLS regression on lagged values.
plot_pacf(stationary_ts, lags=pacf_lags, ax=axes[0, 1], alpha=0.05, method='ols')
axes[0, 1].set_title('PACF β€” Stationarized Series\n(Significant lags suggest AR order)')

# Lag plot: structure in the scatter means autocorrelation is present.
pd.plotting.lag_plot(stationary_ts, lag=1, ax=axes[1, 0])
axes[1, 0].set_title('Lag Plot (lag=1)\nStrong pattern = autocorrelation present')

# Bar chart of autocorrelations at lags 1..12 with the ~95% CI at 2/sqrt(n).
lags = range(1, 13)
correlations = [stationary_ts.autocorr(lag=k) for k in lags]
axes[1, 1].bar(lags, correlations,
               color=['red' if abs(c) > 0.2 else 'steelblue' for c in correlations])
ci = 2 / np.sqrt(n_obs)
axes[1, 1].axhline(0, color='black', linewidth=0.5)
axes[1, 1].axhline(ci, color='blue', linestyle='--', alpha=0.5, label='95% CI')
axes[1, 1].axhline(-ci, color='blue', linestyle='--', alpha=0.5)
axes[1, 1].set_xlabel('Lag')
axes[1, 1].set_ylabel('Autocorrelation')
axes[1, 1].set_title('Autocorrelations by Lag')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

Time Series Fundamentals Cheat Sheet

Component    Description                    How to Remove
─────────────────────────────────────────────────────────
Trend        Long-term direction            First difference (.diff(1))
Seasonality  Repeating pattern (period P)   Seasonal difference (.diff(P))
Cyclical     Long irregular waves           Hard — usually model as trend
Noise        Random variation               Smoothing (rolling mean, EWM)

Stationarity Rules:
  ADF p < 0.05 AND KPSS p > 0.05 → Stationary ✅
  ADF p > 0.05 → Still has unit root → Difference
  KPSS p < 0.05 → Trend present → Detrend

ACF/PACF Reading Guide:
  AR(p): PACF significant at lags 1..p, cuts off. ACF decays.
  MA(q): ACF significant at lags 1..q, cuts off. PACF decays.
  ARMA:  Both decay gradually → need both p and q
  Seasonal: Spikes at multiples of period S (12 for monthly)

Exercises

  1. Load statsmodels.datasets.co2.load_pandas() and run the full stationarity + decomposition analysis.

  2. Create an is_seasonal function that returns True if the seasonal component explains >40% of variance.

  3. Plot a correlogram for the log-transformed series β€” does it change the stationarity conclusion?

  4. Implement an Augmented Dickey-Fuller test loop that keeps differencing until the series is stationary.

  5. Compare additive vs multiplicative decomposition on a series with exponentially growing seasonal amplitude.