import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from scipy import stats
from torch.utils.data import DataLoader, TensorDataset
# Select the GPU when available; all tensors/models below are moved to `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
# Global seaborn styling for every matplotlib figure in this notebook.
sns.set_style('whitegrid')
1. Motivation: Exact Likelihood
Comparison of Generative Models
Model |
Likelihood |
Sampling |
Representation |
|---|---|---|---|
VAE |
Approximate (ELBO) |
Fast |
Latent space |
GAN |
None |
Fast |
Implicit |
Flow |
Exact |
Fast |
Bijective |
Key Idea: Change of VariablesΒΆ
Given:
Simple base distribution \(p_z(z)\) (e.g., Gaussian)
Invertible transformation \(f: \mathbb{R}^d \to \mathbb{R}^d\)
Data \(x = f(z)\)
Change of variables formula: \(p_x(x) = p_z(f^{-1}(x)) \left| \det \frac{\partial f^{-1}}{\partial x} \right|\)
Equivalently: \(p_x(x) = p_z(z) \left| \det \frac{\partial f}{\partial z} \right|^{-1}\)
where \(z = f^{-1}(x)\).
RequirementsΒΆ
Invertibility: \(f\) must be bijective
Tractable Jacobian: \(\det J_f\) must be computable
Expressiveness: \(f\) must be flexible
Reference Materials:
vb_nf.pdf - Vb Nf
generative_models.pdf - Generative Models
Change of Variables in 1DΒΆ
Normalizing flows are built on the change of variables formula: if \(z \sim p_z(z)\) and \(x = f(z)\) where \(f\) is invertible, then \(p_x(x) = p_z(f^{-1}(x)) \left|\det \frac{\partial f^{-1}}{\partial x}\right|\). In one dimension this simplifies to \(p_x(x) = p_z(f^{-1}(x)) \cdot |\frac{df^{-1}}{dx}|\), making it easy to visualize how stretching and compressing regions of space changes probability density. The 1D case provides clean intuition: where the transformation expands space, density decreases; where it compresses, density increases. This fundamental principle scales directly to higher dimensions via the Jacobian determinant.
# Simple 1D example: affine transformation
def affine_transform(z, a, b):
    """Apply the affine map x = a*z + b (elementwise for arrays)."""
    return b + z * a
def affine_inverse(x, a, b):
    """Invert the affine map: z = (x - b) / a."""
    shifted = x - b
    return shifted / a
def affine_log_det_jacobian(a):
    """Log absolute Jacobian of x = a*z + b, i.e. log|dx/dz| = log|a|."""
    slope_magnitude = np.abs(a)
    return np.log(slope_magnitude)
# Base distribution: N(0, 1)
# BUG FIX: `stats` is used below for the analytic densities but was
# originally imported only *after* this figure was drawn (NameError).
from scipy import stats

z = np.random.randn(10000)

# Transform: x = 2z + 3
a, b = 2.0, 3.0
x = affine_transform(z, a, b)

# Verify distribution
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# Original samples vs the analytic N(0, 1) density
axes[0].hist(z, bins=50, density=True, alpha=0.7, label='Samples')
z_range = np.linspace(-4, 4, 100)
axes[0].plot(z_range, stats.norm.pdf(z_range, 0, 1), 'r-', linewidth=2, label='N(0,1)')
axes[0].set_xlabel('z', fontsize=12)
axes[0].set_ylabel('Density', fontsize=12)
axes[0].set_title('Base Distribution', fontsize=13)
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Transformed: N(b, a^2) = N(3, 4); scipy's scale argument is the std dev a
axes[1].hist(x, bins=50, density=True, alpha=0.7, label='Samples')
x_range = np.linspace(-5, 11, 100)
axes[1].plot(x_range, stats.norm.pdf(x_range, b, a), 'r-', linewidth=2, label='N(3,4)')
axes[1].set_xlabel('x', fontsize=12)
axes[1].set_ylabel('Density', fontsize=12)
axes[1].set_title('Transformed Distribution', fontsize=13)
axes[1].legend()
axes[1].grid(True, alpha=0.3)

# Transformation function
axes[2].plot(z_range, affine_transform(z_range, a, b), linewidth=2)
axes[2].set_xlabel('z', fontsize=12)
axes[2].set_ylabel('x = f(z)', fontsize=12)
axes[2].set_title(f'Transformation: x = {a}z + {b}', fontsize=13)
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"log|det J| = {affine_log_det_jacobian(a):.4f}")
3. Composing FlowsΒΆ
Sequential TransformationsΒΆ
Apply multiple bijections: \(z_0 \xrightarrow{f_1} z_1 \xrightarrow{f_2} \cdots \xrightarrow{f_K} z_K = x\)
Composed transformation: \(f = f_K \circ f_{K-1} \circ \cdots \circ f_1\)
Log-LikelihoodΒΆ
where \(z_0 = f^{-1}(x)\).
Training ObjectiveΒΆ
Maximize log-likelihood: $\(\max_{\theta} \sum_{i=1}^n \log p_{\theta}(x_i)\)$
This is exact, not a lower bound!
Planar Flow (Simple Example)ΒΆ
A planar flow is one of the simplest normalizing flow transformations: \(f(z) = z + u \cdot h(w^T z + b)\), where \(u, w\) are weight vectors, \(b\) is a bias, and \(h\) is a smooth activation function. Each planar flow layer applies a single hyperplane-based transformation, and stacking multiple layers builds up the expressiveness to model complex distributions. The Jacobian determinant for a planar flow has a closed-form expression, making likelihood computation efficient. While planar flows are too simple for high-dimensional data, they clearly demonstrate how composing simple invertible transforms can warp a Gaussian into a multimodal target distribution.
class PlanarFlow(nn.Module):
    """Planar normalizing flow layer (Rezende & Mohamed, 2015).

    f(z) = z + u * tanh(w^T z + b)

    Returns both the transformed sample and log|det J| of the transform
    (computed in closed form via the matrix determinant lemma).

    NOTE: invertibility of a planar flow additionally requires a constraint
    on u·w (w^T u >= -1); this implementation does not enforce it, matching
    the rest of the notebook.
    """
    def __init__(self, dim):
        super().__init__()
        self.dim = dim
        self.u = nn.Parameter(torch.randn(dim))
        self.w = nn.Parameter(torch.randn(dim))
        self.b = nn.Parameter(torch.randn(1))

    def forward(self, z):
        """Map z (B, dim) -> (x (B, dim), log_det (B,))."""
        # Pre-activation w^T z + b, shape (B,)
        linear = torch.matmul(z, self.w) + self.b
        x = z + self.u * torch.tanh(linear).unsqueeze(-1)
        # psi(z) = h'(w^T z + b) * w, shape (B, dim).
        # BUG FIX: the original multiplied the (B,) tensor (1 - tanh^2) by the
        # (dim,) vector w directly, which mis-broadcasts (or errors whenever
        # batch size != dim); unsqueeze to (B, 1) first.
        psi = (1 - torch.tanh(linear) ** 2).unsqueeze(-1) * self.w
        # det J = 1 + psi^T u (rank-1 update determinant)
        det = 1 + torch.matmul(psi, self.u)
        # epsilon keeps the log finite when det ~ 0
        log_det = torch.log(torch.abs(det) + 1e-8)
        return x, log_det
class NormalizingFlow(nn.Module):
    """Stack of planar flows.

    ``forward`` pushes base samples z through all layers in order and
    accumulates the per-layer log|det J| terms needed for the change of
    variables.
    """
    def __init__(self, dim, n_flows=10):
        super().__init__()
        # n_flows independent planar layers, applied sequentially
        self.flows = nn.ModuleList([PlanarFlow(dim) for _ in range(n_flows)])
    def forward(self, z):
        """Transform z to x; returns (x, summed per-layer log|det J|)."""
        log_det_sum = 0
        x = z
        for flow in self.flows:
            x, log_det = flow(x)
            log_det_sum += log_det
        return x, log_det_sum
    def log_prob(self, x):
        """Compute log p(x).

        NOTE(review): this method ignores its argument ``x`` entirely — it
        evaluates the base density and the flow's log-determinant at z = 0,
        so the returned value is NOT log p(x). Planar flows have no
        closed-form inverse, so a correct log p(x) cannot be computed this
        way; scoring should instead be done on samples generated through
        ``forward``. Flagged rather than changed to preserve behavior.
        """
        # Base distribution log prob (evaluated at z = 0; see NOTE above)
        z = torch.zeros_like(x)
        log_pz = -0.5 * (z**2).sum(dim=-1) - 0.5 * z.size(-1) * np.log(2 * np.pi)
        # Transform (also evaluated at z = 0)
        _, log_det = self.forward(z)
        return log_pz - log_det
# Smoke test: push a small batch of base samples through a short stack of
# planar flows and check the returned shapes ((10, 2) in, (10, 2) and (10,) out).
flow = NormalizingFlow(dim=2, n_flows=5).to(device)
z = torch.randn(10, 2).to(device)
x, log_det = flow(z)
print(f"Input shape: {z.shape}")
print(f"Output shape: {x.shape}")
print(f"Log det shape: {log_det.shape}")
Training Flow on 2D DataΒΆ
Training a normalizing flow on 2D data provides a compelling visualization of the learning process. The model starts with a simple Gaussian base distribution and gradually warps it to match a complex target (e.g., two moons, concentric circles, or a spiral). The training objective is exact maximum likelihood: \(\mathcal{L} = -\mathbb{E}_{x \sim p_{\text{data}}}[\log p_\theta(x)]\), which decomposes into the log-density of \(f^{-1}(x)\) under the base distribution plus the log-absolute-determinant of the Jacobian. Unlike VAEs and GANs, normalizing flows provide exact density evaluation and exact sampling β a significant advantage for applications requiring calibrated uncertainty estimates.
# Generate target distribution: two moons
from sklearn.datasets import make_moons

n_samples = 2000
X, _ = make_moons(n_samples=n_samples, noise=0.05)
X_train = torch.FloatTensor(X).to(device)
# Normalize to zero mean / unit variance per coordinate
X_train = (X_train - X_train.mean(dim=0)) / X_train.std(dim=0)
train_loader = DataLoader(TensorDataset(X_train), batch_size=128, shuffle=True)

# Model: 16 stacked planar flows on 2-D data
model = NormalizingFlow(dim=2, n_flows=16).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
n_epochs = 100
losses = []
for epoch in range(n_epochs):
    epoch_loss = 0
    for batch, in train_loader:
        optimizer.zero_grad()
        # Sample from base distribution (same shape as the data batch)
        z = torch.randn_like(batch)
        # Transform base samples through the flow
        x, log_det = model(z)
        # Base log prob of the sampled z
        log_pz = -0.5 * (z**2).sum(dim=1) - np.log(2 * np.pi)
        # Log density of the model's own samples
        log_px = log_pz - log_det
        # NOTE(review): -log_px.mean() is the negative log-density of the
        # model's *own* samples (not of the data), and the MSE term pairs
        # random z with arbitrary data points — this is a heuristic
        # sample-matching objective, not exact maximum likelihood. Planar
        # flows lack an analytic inverse, which is why MLE on the data is
        # not done here; confirm intent before reusing this recipe.
        loss = -log_px.mean()
        # Also add MSE to stabilize
        loss += F.mse_loss(x, batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    losses.append(epoch_loss / len(train_loader))
    if (epoch + 1) % 20 == 0:
        print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {losses[-1]:.4f}")
print("\nTraining complete!")
# Visualize results: loss curve, the Gaussian base, and generated vs true data
model.eval()
with torch.no_grad():
    # Sample from the base distribution
    z_samples = torch.randn(1000, 2).to(device)
    # Transform through the trained flow
    x_samples, _ = model(z_samples)
    x_samples = x_samples.cpu().numpy()

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Training loss over epochs
axes[0].plot(losses, linewidth=2)
axes[0].set_xlabel('Epoch', fontsize=12)
axes[0].set_ylabel('Loss', fontsize=12)
axes[0].set_title('Training Loss', fontsize=13)
axes[0].grid(True, alpha=0.3)

# Base distribution scatter (fresh samples, independent of training)
z_vis = torch.randn(1000, 2).cpu().numpy()
axes[1].scatter(z_vis[:, 0], z_vis[:, 1], alpha=0.5, s=10)
axes[1].set_xlabel('zβ', fontsize=12)
axes[1].set_ylabel('zβ', fontsize=12)
axes[1].set_title('Base Distribution (Gaussian)', fontsize=13)
axes[1].grid(True, alpha=0.3)
axes[1].set_aspect('equal')

# Generated samples overlaid on the (normalized) training data
axes[2].scatter(X_train.cpu()[:, 0], X_train.cpu()[:, 1], alpha=0.3, s=10, label='Data')
axes[2].scatter(x_samples[:, 0], x_samples[:, 1], alpha=0.5, s=10, label='Generated')
axes[2].set_xlabel('xβ', fontsize=12)
axes[2].set_ylabel('xβ', fontsize=12)
axes[2].set_title('Generated vs True Data', fontsize=13)
axes[2].legend()
axes[2].grid(True, alpha=0.3)
axes[2].set_aspect('equal')

plt.tight_layout()
plt.show()
6. Modern Flow ArchitecturesΒΆ
Coupling Layers (RealNVP, Glow)ΒΆ
Split input \(z = (z_1, z_2)\):
where \(s, t\) are neural networks.
Jacobian is triangular β easy determinant!
Autoregressive Flows (MAF, IAF)ΒΆ
Triangular Jacobian via autoregressive structure.
Continuous Normalizing Flows (Neural ODEs)ΒΆ
Infinite depth flow, compute via ODE solvers.
ComparisonΒΆ
Architecture |
Sampling |
Density |
Jacobian |
|---|---|---|---|
Coupling (RealNVP) |
Fast β |
Fast β |
Triangular |
Autoregressive (MAF) |
Slow |
Fast β |
Triangular |
Inverse Autoregressive (IAF) |
Fast β |
Slow |
Triangular |
Neural ODE |
Medium |
Medium |
Continuous |
SummaryΒΆ
Key Principles:ΒΆ
Invertibility: Bijective transformations \(f: z \to x\)
Change of variables: \(p_x(x) = p_z(z) |\det J_f|^{-1}\)
Exact likelihood: No approximation, unlike VAE
Composition: Stack flows for expressiveness
Training Objective:ΒΆ
Architectural Choices:ΒΆ
Requirements:
Invertible transformation
Tractable Jacobian determinant
Solutions:
Triangular Jacobian: Coupling, autoregressive
Low-rank updates: Planar, radial flows
Continuous: Neural ODEs
Advantages:ΒΆ
Exact likelihood: Principled training
Fast sampling: Single forward pass
Latent space: Invertible, no encoder needed
Limitations:ΒΆ
Architectural constraints: Must be invertible
Dimension preserving: \(\dim(z) = \dim(x)\)
Computational cost: Jacobian determinant
Modern Variants:ΒΆ
RealNVP (Dinh et al., 2017): Coupling layers
Glow (Kingma & Dhariwal, 2018): Invertible 1x1 convolutions
FFJORD (Grathwohl et al., 2019): Continuous flows
Flow++ (Ho et al., 2019): Improved coupling
Applications:ΒΆ
Image generation (high-res)
Density estimation
Variational inference (VI with flows)
Audio synthesis
Next Steps:ΒΆ
04_neural_ode.ipynb - Continuous normalizing flows
03_variational_autoencoders_advanced.ipynb - Compare with VAEs
09_vq_vae.ipynb - Discrete latent representations
Advanced Normalizing Flows TheoryΒΆ
1. Mathematical FoundationsΒΆ
Change of Variables FormulaΒΆ
Theorem (Change of Variables): For invertible f: ℝ^D → ℝ^D with x = f(z):
Inverse Jacobian Form: Using inverse function theorem:
where z = fβ»ΒΉ(x).
Log-Density:
where J_f is the Jacobian matrix.
Requirements for Normalizing FlowsΒΆ
Bijection: f must be invertible (one-to-one and onto)
Tractable Jacobian: det(J_f) must be computable efficiently
Differentiable: For gradient-based optimization
Challenge: General neural networks are not invertible.
2. Flow CompositionΒΆ
Sequential TransformationsΒΆ
Composition: Apply K bijections sequentially:
Combined Transformation: f = f_K ∘ f_{K-1} ∘ ⋯ ∘ f_1
Jacobian Determinant (Chain Rule):
Log-Density:
Maximum Likelihood Training:
3. Coupling Layers (RealNVP)ΒΆ
Affine Coupling LayerΒΆ
Key Idea: Split input into two parts, transform one conditioned on the other.
Forward Pass: Given z = (z₁, z₂) where z₁ ∈ ℝ^d, z₂ ∈ ℝ^{D−d}:
where s, t: ℝ^d → ℝ^{D−d} are neural networks (scale and translation).
Inverse Pass:
Jacobian: Block triangular structure:
Determinant:
Log-Determinant:
Computational Cost: O(D) instead of O(DΒ³) for general Jacobian!
Multi-Scale ArchitectureΒΆ
Strategy: Apply coupling at different resolutions.
Squeeze Operation: Reshape (H, W, C) β (H/2, W/2, 4C).
Split: Send half of channels directly to output, transform other half.
Benefit: Reduces memory, enables high-resolution generation.
4. Glow ArchitectureΒΆ
1Γ1 Invertible ConvolutionΒΆ
Motivation: Permutation is too restrictive, use learned mixing.
Operation: W ∈ ℝ^{c×c} invertible weight matrix:
Initialization: W = random rotation matrix (via QR decomposition).
Jacobian: For each spatial position (h, w):
Total Log-Determinant: For HΓW spatial resolution:
Efficient Computation: LU decomposition of W:
where P is permutation, L lower triangular, U upper triangular, s is diagonal.
Actnorm (Activation Normalization)ΒΆ
Purpose: Data-dependent initialization for stable training.
Operation: Affine transformation per channel:
Initialization: Set s, b so first batch has zero mean and unit variance.
Benefit: Prevents activation explosion/vanishing.
Glow BlockΒΆ
Full Glow Step:
1. Actnorm (data normalization)
2. 1Γ1 Invertible Conv (channel mixing)
3. Affine Coupling (nonlinear transform)
Repeat: Stack K Glow steps per scale.
Multi-scale: Squeeze β K steps β Split β next scale.
5. Autoregressive FlowsΒΆ
Masked Autoregressive Flow (MAF)ΒΆ
Autoregressive Structure: Each xα΅’ depends only on xβ, β¦, xα΅’ββ:
where Ξ± (scale) and ΞΌ (location) are neural networks with masking.
Jacobian: Lower triangular (autoregressive property):
Determinant:
Complexity:
Forward (z→x): O(D) sequential (slow)
Inverse (x→z): O(D) parallel (fast)
Density evaluation: Fast
Use Case: Density estimation, variational inference.
Inverse Autoregressive Flow (IAF)ΒΆ
Key Difference: Autoregressive in z instead of x:
Complexity:
Forward (z→x): O(D) parallel (fast)
Inverse (x→z): O(D) sequential (slow)
Sampling: Fast
Use Case: Sampling, variational autoencoders (VAE posterior).
MADE (Masked Autoencoder for Distribution Estimation)ΒΆ
Architecture: Enforce autoregressive property via weight masking.
Masks: Binary matrices M ensuring output i depends only on input j where j < i.
Efficient Parallelization: Compute all ΞΌα΅’, Ξ±α΅’ in one forward pass.
6. Continuous Normalizing Flows (Neural ODEs)ΒΆ
Continuous-Time DynamicsΒΆ
ODE Formulation: Instead of discrete steps, continuous transformation:
where f is a neural network.
Transformation: z(0) β z(1) via ODE integration:
Inverse: Integrate backward:
Instantaneous Change of VariablesΒΆ
Theorem (Continuous Change of Variables): Log-density evolves via:
Integrated Form:
Trace Computation: For D-dimensional z, computing trace directly is O(DΒ²).
FFJORD (Free-Form Jacobian of Reversible Dynamics)ΒΆ
Hutchinson Trace Estimator: Unbiased estimate using random vectors:
Application:
computed via vector-Jacobian product (automatic differentiation).
Complexity: O(D) instead of O(DΒ²).
Training: Solve augmented ODE simultaneously for z(t) and log-density:
Advantage: No architectural constraints (any f can be used).
7. Volume-Preserving FlowsΒΆ
MotivationΒΆ
Challenge: Standard flows require dim(z) = dim(x).
Solution: Constrain det(J) = 1 (volume-preserving).
Residual FlowsΒΆ
Transformation: f(z) = z + g(z) where g is a neural network.
Condition for Invertibility: Lipschitz constant Lip(g) < 1.
Spectral Normalization: Constrain Lip(g) via weight normalization.
Jacobian Determinant: det(J) = det(I + βg/βz).
Power Series: For small ||βg/βz||:
Infinitesimal FlowsΒΆ
Limit: As g β 0, residual flow becomes continuous flow.
Connection: Neural ODEs are limit of residual flows.
8. Advanced Training TechniquesΒΆ
DequantizationΒΆ
Problem: Discrete data (images) have zero density under continuous model.
Solution: Add uniform noise U[0, 1/256] to discretized data:
Variational Dequantization: Learn noise distribution q(u|x) for tighter bound.
Temperature ScalingΒΆ
Sampling: Scale the base distribution's standard deviation by a temperature τ:
τ < 1: Higher quality, less diversity
τ > 1: More diversity, lower quality
Multi-Scale TrainingΒΆ
Strategy: Apply loss at multiple resolutions.
Objective:
where xβ½Λ’βΎ is data at scale s.
Benefit: Better gradient flow, prevents mode collapse.
Gradient CheckpointingΒΆ
Memory Reduction: Donβt store all intermediate activations during forward pass.
Recomputation: Recompute activations during backward pass.
Trade-off: 33% slower, but enables deeper flows.
9. Likelihood EstimationΒΆ
Bits per Dimension (BPD)ΒΆ
Definition: Negative log-likelihood normalized by data dimension:
where D = H Γ W Γ C for images.
Interpretation: Average bits needed to encode each pixel/dimension.
Typical Values:
Random: 8 bits/dim (uniform over [0, 255])
Good model: 3-4 bits/dim (CIFAR-10)
SOTA: <3 bits/dim
Comparison with ELBOΒΆ
Flow: Exact likelihood $\(\log p(x) = \log p(z) - \sum_k \log |\det J_{f_k}|\)$
VAE: Lower bound (ELBO) $\(\log p(x) \geq \mathbb{E}_{q(z|x)}[\log p(x|z)] - KL(q(z|x) \| p(z))\)$
Advantage: Flows optimize true likelihood, VAEs optimize bound (may be loose).
10. Invertibility GuaranteesΒΆ
Conditions for InvertibilityΒΆ
Theorem (Inverse Function Theorem): If f is continuously differentiable and det(J_f) β 0, then f is locally invertible.
Global Invertibility: Additional conditions needed:
f is injective (one-to-one)
f is surjective (onto)
Invertible Neural Networks (INN)ΒΆ
Design Principles:
Use invertible components (coupling, 1Γ1 conv)
Ensure non-zero Jacobian determinant
Verify numerical stability
Numerical Inversion: For complex f, use iterative solvers (fixed-point iteration, Newtonβs method).
11. State-of-the-Art ResultsΒΆ
Image GenerationΒΆ
CIFAR-10 (32Γ32):
Glow: 3.35 bits/dim
Flow++: 3.08 bits/dim
Residual Flow: 3.28 bits/dim
ImageNet (64Γ64):
Glow: 3.81 bits/dim
Flow++: 3.69 bits/dim
CelebA-HQ (256Γ256):
Glow: 1.03 bits/dim
Sampling SpeedΒΆ
Comparison (single sample):
Flow: ~10-50ms (K forward passes)
GAN: ~5ms (1 generator pass)
Diffusion: ~1-10s (100-1000 steps)
Advantage: Flows are faster than diffusion, slower than GANs.
12. LimitationsΒΆ
Architectural ConstraintsΒΆ
Dimension Preservation: dim(z) = dim(x) required.
Canβt compress like VAE (z < x)
Inefficient for high-dim data
Invertibility: Limits architecture choices.
No pooling, dropout, batch norm (standard forms)
Must use specialized layers
Sample Quality vs LikelihoodΒΆ
Paradox: High likelihood doesnβt always mean good samples.
Reason: Model may assign probability mass to imperceptible differences.
Solution: Hybrid models (VAE + Flow, GAN + Flow).
Computational CostΒΆ
Training: Computing Jacobian determinant adds overhead.
RealNVP: 2-3Γ slower than GAN
FFJORD: 5-10Γ slower (ODE solve)
Memory: Storing intermediate activations for deep flows.
13. Extensions and VariantsΒΆ
Discrete FlowsΒΆ
Problem: Continuous flows donβt model discrete data directly.
Solution: Integer discrete flows (Hoogeboom et al., 2019).
Application: Text, graphs, molecular structures.
Variational Dequantization FlowsΒΆ
Idea: Learn dequantization distribution q(u|x) jointly with flow.
Objective: Tighter bound on discrete data likelihood.
Graphical Normalizing FlowsΒΆ
Extension: Flows for graph-structured data.
Equivariance: Preserve graph symmetries (permutation equivariance).
Neural Spline FlowsΒΆ
Piecewise Function: Use monotonic splines instead of affine transforms.
Advantage: More expressive coupling layers.
SOTA: Achieves best bits/dim on many benchmarks.
14. ApplicationsΒΆ
Variational InferenceΒΆ
Use: Flow-based posterior q_Ο(z|x) in VAE.
Benefit: More flexible than Gaussian posterior.
Example: Normalizing Flow VAE (Rezende & Mohamed, 2015).
Density EstimationΒΆ
Task: Model p(x) for anomaly detection, compression.
Method: Maximize likelihood on training data.
Anomaly Score: -log p(x) (high for outliers).
Image SynthesisΒΆ
Generation: Sample z ~ N(0,I), compute x = f(z).
Interpolation: Linear interpolation in z-space.
Attribute Manipulation: Directions in latent space correspond to attributes.
Data CompressionΒΆ
Principle: Use learned p(x) for entropy coding.
Bits Required: Approximately -logβ p(x) bits.
Lossless Compression: Exact reconstruction via invertibility.
15. Connections to Other ModelsΒΆ
Flows vs VAEsΒΆ
Aspect |
Flow |
VAE |
|---|---|---|
Likelihood |
Exact |
Lower bound (ELBO) |
Latent dim |
dim(z) = dim(x) |
dim(z) < dim(x) |
Sampling |
Fast (1 pass) |
Fast (1 pass) |
Invertibility |
Yes |
No (encoder β decoderβ»ΒΉ) |
Flows vs GANsΒΆ
Similarities: Both map noise β data.
Differences:
Flow: Bijection, exact likelihood
GAN: Not invertible, no likelihood
Hybrid: Use flow as GAN generator for stable training.
Flows vs DiffusionΒΆ
Diffusion: Iterative denoising process (many steps).
Pro: SOTA sample quality
Con: Slow sampling
Flow: Single pass transformation.
Pro: Fast sampling
Con: Architectural constraints
16. Key PapersΒΆ
FoundationsΒΆ
Rezende & Mohamed (2015): βVariational Inference with Normalizing Flowsβ
Dinh et al. (2015): βNICE: Non-linear Independent Components Estimationβ
Modern ArchitecturesΒΆ
Dinh et al. (2017): βDensity Estimation using Real NVPβ
Kingma & Dhariwal (2018): βGlow: Generative Flow using Invertible 1x1 Convolutionsβ
Papamakarios et al. (2017): βMasked Autoregressive Flow for Density Estimationβ
Continuous FlowsΒΆ
Chen et al. (2018): βNeural Ordinary Differential Equationsβ
Grathwohl et al. (2019): βFFJORD: Free-form Continuous Dynamics for Scalable Reversible Generative Modelsβ
Recent AdvancesΒΆ
Ho et al. (2019): βFlow++: Improving Flow-Based Generative Modelsβ
Durkan et al. (2019): βNeural Spline Flowsβ
Hoogeboom et al. (2019): βInteger Discrete Flowsβ
17. Practical ConsiderationsΒΆ
HyperparametersΒΆ
Parameter |
Typical Range |
Effect |
|---|---|---|
Coupling depth |
8-32 layers |
More β expressive, slower |
Hidden dim |
256-512 |
Larger β flexible, memory |
Split dimension |
D/2 |
Half channels per coupling |
Learning rate |
1e-4 to 5e-4 |
Higher β faster, unstable |
Batch size |
32-128 |
Larger β stable, more memory |
Debugging TipsΒΆ
Check:
Jacobian determinant: Should be non-zero
Inverse accuracy: ||f⁻¹(f(z)) − z|| < ε
Likelihood: Should increase during training
Samples: Visually inspect quality
Common Issues:
Exploding gradients → gradient clipping, spectral norm
Poor samples → increase depth, tune temperature
Slow training → reduce depth, use gradient checkpointing
Computational EfficiencyΒΆ
Speed Optimizations:
Use 1Γ1 conv instead of full permutation
LU decomposition for faster determinant
Gradient checkpointing for memory
Parallelization: Coupling layers parallelize well across GPUs.
18. When to Use Normalizing FlowsΒΆ
Choose Flows When:
Exact likelihood required (density estimation, compression)
Fast sampling important (real-time applications)
Interpretable latent space desired
Invertibility useful (cycle-consistency tasks)
Avoid Flows When:
Best sample quality critical (use diffusion/GANs)
High compression needed (use VAE with z << x)
Data is discrete (flows designed for continuous)
Limited compute (flows can be memory-intensive)
# Advanced Normalizing Flows Implementations
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from typing import Tuple, List, Optional
class AffineCoupling(nn.Module):
    """
    Affine coupling layer (RealNVP).

    The input is split by a binary mask: positions where mask == 1 pass
    through unchanged and condition the scale/translation networks that
    transform the remaining positions. The Jacobian is triangular, so
    log|det J| is just the sum of the scale outputs.
    """

    def __init__(self, dim, hidden_dim=256, mask_type='checkerboard'):
        super().__init__()
        self.dim = dim
        # Mask convention: 1 = identity (conditioning) half, 0 = transformed half
        if mask_type == 'checkerboard':
            self.register_buffer('mask', torch.arange(dim) % 2)
        elif mask_type == 'channel':
            mask = torch.zeros(dim)
            mask[:dim//2] = 1
            self.register_buffer('mask', mask)
        n_identity = int(self.mask.sum().item())
        n_transform = dim - n_identity

        # Both conditioner networks share the same 2-hidden-layer MLP shape
        def make_mlp():
            return nn.Sequential(
                nn.Linear(n_identity, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, n_transform),
            )

        self.scale_net = make_mlp()
        self.translate_net = make_mlp()

    def forward(self, z, reverse=False):
        """Map z -> x (or invert x -> z when ``reverse``); returns (out, log|det J|)."""
        keep = self.mask == 1
        move = self.mask == 0
        # Identity half conditions the transform of the other half
        z_identity = z[:, keep]
        z_moving = z[:, move]
        s = self.scale_net(z_identity)
        t = self.translate_net(z_identity)
        if reverse:
            # z = (x - t) * exp(-s)
            out_moving = (z_moving - t) * torch.exp(-s)
            log_det = -s.sum(dim=1)
        else:
            # x = z * exp(s) + t
            out_moving = z_moving * torch.exp(s) + t
            log_det = s.sum(dim=1)
        # Scatter both halves back into their original positions
        out = torch.zeros_like(z)
        out[:, keep] = z_identity
        out[:, move] = out_moving
        return out, log_det
class RealNVP(nn.Module):
    """
    RealNVP: a stack of affine coupling layers with alternating masks.

    Even-indexed layers use a checkerboard mask and odd-indexed layers a
    channel mask, so every dimension is transformed by some layer.
    """

    def __init__(self, dim, n_layers=6, hidden_dim=256):
        super().__init__()
        self.dim = dim
        self.layers = nn.ModuleList([
            AffineCoupling(dim, hidden_dim,
                           'checkerboard' if idx % 2 == 0 else 'channel')
            for idx in range(n_layers)
        ])

    def forward(self, z, reverse=False):
        """Map z -> x (or x -> z when ``reverse``); returns (out, total log|det J|)."""
        order = list(self.layers)
        if reverse:
            order = order[::-1]
        out = z
        total_log_det = 0
        for coupling in order:
            out, ld = coupling(out, reverse=reverse)
            total_log_det = total_log_det + ld
        return out, total_log_det

    def log_prob(self, x):
        """Exact log p(x) via the change-of-variables formula."""
        # Normalizing direction: x -> z, accumulating log|det J_{f^{-1}}|
        z, log_det = self.forward(x, reverse=True)
        # log N(z; 0, I)
        log_pz = -0.5 * (z ** 2).sum(dim=1) - 0.5 * self.dim * np.log(2 * np.pi)
        return log_pz + log_det
class Invertible1x1Conv(nn.Module):
    """
    1x1 invertible convolution (Glow), applied here to flat feature vectors.

    The weight is kept in LU-decomposed form
        W = P @ L @ (triu(U, 1) + diag(sign_s * exp(log_s)))
    so that log|det W| = sum(log_s) is available in O(C) instead of O(C^3).
    """

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        # Initialize with a random orthogonal matrix (|det| = 1, so the
        # initial log-determinant contribution is exactly zero).
        # torch.linalg.qr replaces the deprecated torch.qr.
        W = torch.linalg.qr(torch.randn(channels, channels))[0]
        # LU factorization W = P @ L @ U (torch.linalg.lu replaces the
        # deprecated torch.lu / torch.lu_unpack pair).
        P, L, U = torch.linalg.lu(W)
        s = torch.diag(U)
        self.register_buffer('P', P)   # fixed permutation
        self.L = nn.Parameter(L)       # lower triangular (unit diagonal)
        self.U = nn.Parameter(U)       # upper triangular
        # BUG FIX: keep the SIGN of diag(U) in a fixed buffer. The original
        # stored only log|diag(U)| and rebuilt the diagonal as exp(log_s) > 0,
        # silently flipping negative entries so the reconstructed W no longer
        # equaled the sampled rotation at initialization.
        self.register_buffer('sign_s', torch.sign(s))
        self.log_s = nn.Parameter(torch.log(torch.abs(s)))

    def forward(self, z, reverse=False):
        """Apply W (or W^{-1} when ``reverse``); returns (out, per-sample log|det|)."""
        batch_size = z.size(0)
        # Reassemble W from its LU factors
        L = torch.tril(self.L, diagonal=-1) + torch.eye(self.channels, device=z.device)
        U = torch.triu(self.U, diagonal=1) + torch.diag(self.sign_s * torch.exp(self.log_s))
        W = self.P @ L @ U
        if not reverse:
            # Forward: x = W @ z (batched as z @ W^T)
            x = z @ W.t()
            log_det = self.log_s.sum() * torch.ones(batch_size, device=z.device)
        else:
            # Inverse: z = W^{-1} @ x
            x = z @ torch.inverse(W).t()
            log_det = -self.log_s.sum() * torch.ones(batch_size, device=z.device)
        return x, log_det
class ActNorm(nn.Module):
    """
    Activation normalization (Glow).

    Per-channel affine y = z * exp(log_scale) + bias. On the first batch the
    parameters are set (data-dependently) so that the output of that batch
    has zero mean and unit variance per channel; afterwards they are trained
    like ordinary parameters.
    """

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        # Learnable per-channel scale (log-domain) and bias
        self.log_scale = nn.Parameter(torch.zeros(channels))
        self.bias = nn.Parameter(torch.zeros(channels))
        # 0/1 flag: has data-dependent initialization run yet?
        self.register_buffer('initialized', torch.tensor(0))

    def forward(self, z, reverse=False):
        """Apply (or invert) the normalization; returns (out, per-sample log|det J|)."""
        batch_size = z.size(0)
        # Data-dependent init on the very first batch seen (either direction,
        # matching the original behavior).
        if not self.initialized:
            with torch.no_grad():
                mean = z.mean(dim=0)
                std = z.std(dim=0)
                inv_std = 1.0 / (std + 1e-6)
                # BUG FIX: forward computes z * exp(log_scale) + bias, so the
                # bias must be -mean/std (not -mean) for the init batch to
                # come out zero-mean. The original gave z/std - mean instead
                # of (z - mean)/std.
                self.log_scale.data = torch.log(inv_std)
                self.bias.data = -mean * inv_std
                self.initialized.fill_(1)
        if not reverse:
            # Forward: normalize
            x = z * torch.exp(self.log_scale) + self.bias
            log_det = self.log_scale.sum() * torch.ones(batch_size, device=z.device)
        else:
            # Inverse: denormalize
            x = (z - self.bias) * torch.exp(-self.log_scale)
            log_det = -self.log_scale.sum() * torch.ones(batch_size, device=z.device)
        return x, log_det
class GlowBlock(nn.Module):
    """
    One Glow step: ActNorm -> invertible 1x1 conv -> affine coupling.

    The reverse direction applies the inverse of each sub-transform in the
    opposite order.
    """

    def __init__(self, channels, hidden_dim=256):
        super().__init__()
        self.actnorm = ActNorm(channels)
        self.conv1x1 = Invertible1x1Conv(channels)
        self.coupling = AffineCoupling(channels, hidden_dim, mask_type='channel')

    def forward(self, z, reverse=False):
        """Apply the three sub-transforms; returns (out, total log|det J|)."""
        stages = [self.actnorm, self.conv1x1, self.coupling]
        if reverse:
            stages.reverse()
        out = z
        total_log_det = 0
        for stage in stages:
            out, ld = stage(out, reverse=reverse)
            total_log_det = total_log_det + ld
        return out, total_log_det
class Glow(nn.Module):
    """
    Glow model: a plain stack of GlowBlock steps over flat feature vectors.

    (The multi-scale squeeze/split machinery of the full Glow architecture
    is not implemented here.)
    """

    def __init__(self, input_dim, n_blocks=8, hidden_dim=256):
        super().__init__()
        self.input_dim = input_dim
        self.blocks = nn.ModuleList(
            [GlowBlock(input_dim, hidden_dim) for _ in range(n_blocks)]
        )

    def forward(self, z, reverse=False):
        """Map z -> x (or x -> z when ``reverse``); returns (out, total log|det J|)."""
        sequence = list(self.blocks)[::-1] if reverse else self.blocks
        out = z
        total_log_det = 0
        for block in sequence:
            out, ld = block(out, reverse=reverse)
            total_log_det = total_log_det + ld
        return out, total_log_det

    def log_prob(self, x):
        """Exact log p(x) via the change-of-variables formula."""
        # Normalizing direction: x -> z
        z, log_det = self.forward(x, reverse=True)
        # log N(z; 0, I)
        log_pz = -0.5 * (z ** 2).sum(dim=1) - 0.5 * self.input_dim * np.log(2 * np.pi)
        return log_pz + log_det
class MADE(nn.Module):
    """
    Masked Autoencoder for Distribution Estimation (Germain et al., 2015).

    An MLP whose weights are masked so that output unit i (for both the
    scale and location heads) depends only on inputs x_{<i}. Returns the
    per-dimension (log_scale, loc) parameters used by MAF/IAF.
    """

    def __init__(self, input_dim, hidden_dims=[256, 256]):
        super().__init__()
        self.input_dim = input_dim
        # Output has 2 * input_dim units: [log_scale | loc]
        dims = [input_dim] + hidden_dims + [input_dim * 2]
        self.layers = nn.ModuleList()
        for i in range(len(dims) - 1):
            self.layers.append(nn.Linear(dims[i], dims[i+1]))
        self.masks = self._create_masks(dims)

    def _create_masks(self, dims):
        """Create binary masks enforcing the autoregressive property.

        Each unit gets a "degree" (the highest input index it may see).
        Hidden connections require degree(in) <= degree(out); the OUTPUT
        layer requires a strict inequality so output i never sees x_i
        itself. Requires input_dim >= 2 (randint range would be empty
        otherwise).
        """
        masks = []
        m = {}
        m[0] = torch.arange(1, self.input_dim + 1)  # input degrees 1..D
        for l in range(1, len(dims) - 1):
            # Hidden degrees drawn uniformly from 1..D-1
            m[l] = torch.randint(1, self.input_dim, (dims[l],))
        # Output degrees 1..D, repeated for the scale and location heads
        m[len(dims) - 1] = torch.arange(1, self.input_dim + 1).repeat(2)
        for l in range(len(dims) - 1):
            if l == len(dims) - 2:
                # BUG FIX: the final layer must use a STRICT inequality.
                # With <= (as in the original), output i could depend on
                # x_i itself, breaking the autoregressive property and the
                # triangular-Jacobian guarantee MAF/IAF rely on.
                masks.append(m[l].unsqueeze(1) < m[l + 1].unsqueeze(0))
            else:
                masks.append(m[l].unsqueeze(1) <= m[l + 1].unsqueeze(0))
        return masks

    def forward(self, z):
        """Return (log_scale, loc), each of shape (B, input_dim)."""
        x = z
        for i, (layer, mask) in enumerate(zip(self.layers, self.masks)):
            # Mask is (in, out); nn.Linear stores weight as (out, in)
            weight = layer.weight * mask.t().float().to(z.device)
            x = F.linear(x, weight, layer.bias)
            if i < len(self.layers) - 1:
                x = F.relu(x)
        # Split the doubled output into the two parameter heads
        log_scale, loc = x.chunk(2, dim=1)
        return log_scale, loc
class MAF(nn.Module):
    """
    Masked Autoregressive Flow (Papamakarios et al., 2017).

    Stacks MADE layers. Density evaluation (x -> z) is one parallel pass
    per layer and therefore fast; sampling (z -> x) must generate one
    dimension at a time and is slow.
    """

    def __init__(self, input_dim, n_layers=5, hidden_dims=[256, 256]):
        super().__init__()
        self.input_dim = input_dim
        # Stack of MADE conditioners, composed in order in the forward pass
        self.made_layers = nn.ModuleList([
            MADE(input_dim, hidden_dims) for _ in range(n_layers)
        ])

    def forward(self, z, reverse=False):
        """Transform z -> x (sampling) or x -> z (density, ``reverse=True``).

        Returns (output, log|det J|) accumulated over all layers.

        BUG FIX vs. original: the forward branch always used
        ``made_layers[0]`` and ignored the rest of the stack, and the
        reverse branch referenced an undefined name ``x`` (NameError on
        every call, so ``log_prob`` always crashed) and did not invert the
        layers in the opposite order of composition.
        """
        if not reverse:
            # Sampling direction: compose layers in order; each layer needs
            # D sequential passes because x_i depends on x_{<i}.
            x = z
            log_det_total = torch.zeros(z.size(0), device=z.device)
            for made in self.made_layers:
                out = torch.zeros_like(x)
                for i in range(self.input_dim):
                    # Parameters for dim i depend only on already-filled dims
                    log_s, t = made(out)
                    out = out.clone()  # avoid in-place autograd issues
                    out[:, i] = x[:, i] * torch.exp(log_s[:, i]) + t[:, i]
                    log_det_total = log_det_total + log_s[:, i]
                x = out
            return x, log_det_total
        else:
            # Density direction: z = (x - t(x)) * exp(-log_s(x)), fully
            # parallel; invert the layers in reverse composition order.
            x = z  # in this direction the argument holds data x
            log_det_total = torch.zeros(x.size(0), device=x.device)
            for made in reversed(self.made_layers):
                log_s, t = made(x)
                x = (x - t) * torch.exp(-log_s)
                log_det_total = log_det_total - log_s.sum(dim=1)
            return x, log_det_total

    def log_prob(self, x):
        """Exact log p(x) via the fast parallel inverse pass."""
        z, log_det = self.forward(x, reverse=True)
        # log N(z; 0, I)
        log_pz = -0.5 * (z ** 2).sum(dim=1) - 0.5 * self.input_dim * np.log(2 * np.pi)
        return log_pz + log_det
class NeuralODEFlow(nn.Module):
    """
    Dynamics network for a Continuous Normalizing Flow.

    Parameterizes dz/dt = f(z, t; theta) as a small MLP over the state
    concatenated with scalar time. The Jacobian trace required by the
    instantaneous change-of-variables formula is estimated stochastically.
    """

    def __init__(self, dim, hidden_dim=64):
        """
        Args:
            dim: dimensionality of the state z.
            hidden_dim: width of the two hidden layers.
        """
        super().__init__()
        self.dim = dim
        # MLP over [z, t]; tanh keeps the dynamics smooth for integration.
        self.net = nn.Sequential(
            nn.Linear(dim + 1, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, dim),
        )

    def forward(self, t, z):
        """Evaluate dz/dt = f(z, t) for a batch of states."""
        # Broadcast the scalar time into a (batch, 1) column.
        time_col = torch.ones(len(z), 1, device=z.device) * t
        return self.net(torch.cat((z, time_col), dim=1))

    def trace_estimator(self, z, t, noise=None):
        """
        Hutchinson estimator of the divergence: Tr(df/dz) ~= eps^T (df/dz) eps.

        Args:
            z: (batch, dim) state; must be part of the autograd graph.
            t: scalar time.
            noise: optional probe vector eps; sampled ~N(0, I) if omitted.

        Returns:
            (batch,) per-sample trace estimates.
        """
        eps = torch.randn_like(z) if noise is None else noise
        drift = self.forward(t, z)
        # Vector-Jacobian product eps^T (df/dz) via reverse-mode autograd;
        # create_graph=True so the estimate itself remains differentiable.
        vjp = torch.autograd.grad(drift, z, eps, create_graph=True)[0]
        return (vjp * eps).sum(dim=1)
class CNFTrainer:
    """
    Trainer for Continuous Normalizing Flows.

    Integrates the state z(t) and the running log-density correction
    simultaneously with a fixed-step Euler scheme.
    """
    def __init__(self, flow_model, optimizer):
        """
        Args:
            flow_model: module with forward(t, z) -> dz/dt and
                trace_estimator(z, t) -> per-sample divergence estimate.
            optimizer: torch optimizer over flow_model's parameters.
        """
        self.flow = flow_model
        self.optimizer = optimizer

    def integrate_ode(self, z0, t_span, method='euler', n_steps=10):
        """
        Fixed-step Euler integration of dz/dt = f(z, t) jointly with
        d(log p)/dt = -Tr(df/dz).

        For production use an adaptive solver (e.g. torchdiffeq.odeint).

        Args:
            z0: (batch, dim) initial state.
            t_span: (t0, t1) integration interval.
            method: integration scheme; only 'euler' is implemented.
            n_steps: number of Euler steps (default 10, as before).

        Returns:
            (z, log_p): final state (detached from the graph) and the
            accumulated log-density change, shape (batch,).
        """
        z = z0
        log_p = torch.zeros(z0.size(0), device=z0.device)
        dt = (t_span[1] - t_span[0]) / n_steps
        t = t_span[0]
        for _ in range(n_steps):
            # Fix: the trace estimator needs grad w.r.t. z, but only LEAF
            # tensors may have their requires_grad flag flipped. The
            # original called requires_grad_(True) unconditionally, which
            # raises RuntimeError from the second step on (z becomes a
            # non-leaf that already tracks gradients through the network).
            if z.is_leaf and not z.requires_grad:
                z.requires_grad_(True)
            dz_dt = self.flow(t, z)
            # Divergence (Hutchinson estimate) for the density update.
            trace = self.flow.trace_estimator(z, t)
            # Euler step for state and log-density.
            z = z + dz_dt * dt
            log_p = log_p - trace * dt
            t = t + dt
        # NOTE(review): detaching z severs gradients through the final
        # state, so the MSE term in train_step does not train the dynamics;
        # only the log-density term does. Kept to preserve behavior.
        z = z.detach()
        return z, log_p

    def train_step(self, x_data):
        """
        Single training step: sample z0 ~ N(0, I), integrate to t=1, and
        minimize a negative-log-likelihood-style loss plus an MSE pull of
        the integrated samples toward the data batch.

        Args:
            x_data: (batch, dim) data batch.

        Returns:
            The scalar loss value (float).
        """
        # Base-distribution sample.
        z0 = torch.randn_like(x_data)
        # Integrate forward: z(0) -> z(1).
        z1, log_det = self.integrate_ode(z0, (0.0, 1.0))
        # log p at t=0 under the standard-normal base.
        log_pz0 = -0.5 * (z0 ** 2).sum(dim=1) - 0.5 * z0.size(1) * np.log(2 * np.pi)
        # log p at t=1 via the instantaneous change of variables.
        log_pz1 = log_pz0 - log_det
        loss = -log_pz1.mean() + F.mse_loss(z1, x_data)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()
# ============================================================================
# Demonstrations
# ============================================================================
print("=" * 70)
print("Normalizing Flows - Advanced Implementations")
print("=" * 70)
# 1. RealNVP
print("\n1. RealNVP (Affine Coupling):")
realnvp = RealNVP(dim=10, n_layers=6, hidden_dim=128)
z_test = torch.randn(4, 10)
x_test, log_det = realnvp(z_test)
log_px = realnvp.log_prob(x_test)
print(f" Input shape: {z_test.shape}")
print(f" Output shape: {x_test.shape}")
print(f" Log determinant: {log_det.shape}")
print(f" Log p(x): {log_px.shape}")
print(f" Parameters: {sum(p.numel() for p in realnvp.parameters()):,}")
# 2. Glow
print("\n2. Glow (ActNorm + 1Γ1Conv + Coupling):")
glow = Glow(input_dim=10, n_blocks=8, hidden_dim=128)
x_glow, log_det_glow = glow(z_test)
print(f" Glow blocks: 8")
print(f" Each block: ActNorm -> Inv1Γ1Conv -> AffineCoupling")
print(f" Parameters: {sum(p.numel() for p in glow.parameters()):,}")
print(f" Forward pass: z -> x")
print(f" Inverse pass: x -> z (for log p(x))")
# 3. MAF
# Evaluate density on the RealNVP samples (x_test) produced above.
print("\n3. MAF (Masked Autoregressive Flow):")
maf = MAF(input_dim=10, n_layers=3, hidden_dims=[64, 64])
log_px_maf = maf.log_prob(x_test)
print("  MADE layers: 3")
# Fix: the original wrote the next line as an f-string, but "{<i}" is not a
# valid replacement-field expression, so the whole file failed to compile
# (SyntaxError). A plain string prints the intended literal braces.
print("  Autoregressive: x_i = f(x_{<i}, z_i)")
print("  Density eval: FAST (parallel)")
print("  Sampling: SLOW (sequential)")
print(f"  Parameters: {sum(p.numel() for p in maf.parameters()):,}")
# 4. Neural ODE Flow
print("\n4. Neural ODE Flow (Continuous):")
ode_flow = NeuralODEFlow(dim=10, hidden_dim=32)
print(f" Dynamics: dz/dt = f(z, t; ΞΈ)")
print(f" Integration: 0 -> 1 (continuous time)")
print(f" Trace: Hutchinson estimator (efficient)")
print(f" Parameters: {sum(p.numel() for p in ode_flow.parameters()):,}")
# 5. Architecture comparison
print("\n5. Architecture Comparison:")
print(" ββββββββββββββββ¬ββββββββββββ¬βββββββββββββ¬βββββββββββββββ¬βββββββββββ")
print(" β Architecture β Sampling β Density β Jacobian β Use Case β")
print(" ββββββββββββββββΌββββββββββββΌβββββββββββββΌβββββββββββββββΌβββββββββββ€")
print(" β RealNVP β Fast β β Fast β β Triangular β General β")
print(" β Glow β Fast β β Fast β β Triangular β Images β")
print(" β MAF β Slow β Fast ββ β Triangular β Density β")
print(" β IAF β Fast ββ β Slow β Triangular β VAE β")
print(" β Neural ODE β Medium β Medium β Continuous β Flexible β")
print(" ββββββββββββββββ΄ββββββββββββ΄βββββββββββββ΄βββββββββββββββ΄βββββββββββ")
# 6. Jacobian determinant comparison
print("\n6. Jacobian Determinant Computation:")
print(" General matrix: O(DΒ³) - prohibitive")
print(" Triangular (coupling): O(D) - efficient β")
print(" ")
print(" RealNVP coupling:")
print(" βββββββββββββββββββ")
print(" β I_d 0 β Determinant = exp(Ξ£ s_i)")
print(" β βxβ/βzβ diag(s)β Cost: O(D)")
print(" βββββββββββββββββββ")
# 7. Invertibility verification
print("\n7. Invertibility Check:")
z_original = torch.randn(2, 10)
x_forward, _ = realnvp(z_original, reverse=False)
z_reconstructed, _ = realnvp(x_forward, reverse=True)
error = (z_original - z_reconstructed).abs().max().item()
print(f" z_original -> x -> z_reconstructed")
print(f" Max reconstruction error: {error:.2e}")
print(f" Numerically invertible: {'β' if error < 1e-4 else 'β'}")
# 8. When to use guide
print("\n8. When to Use Each Flow:")
print(" Use RealNVP when:")
print(" β Need exact likelihood")
print(" β Fast sampling important")
print(" β General-purpose generation")
print("\n Use Glow when:")
print(" β Image generation (high-res)")
print(" β Need 1Γ1 conv mixing")
print(" β Multi-scale architecture beneficial")
print("\n Use MAF when:")
print(" β Density estimation primary goal")
print(" β Sampling speed not critical")
print(" β Variational inference (VI)")
print("\n Use Neural ODE when:")
print(" β Architectural flexibility needed")
print(" β No invertibility constraints")
print(" β Continuous dynamics natural")
# 9. Exact likelihood advantage
print("\n9. Exact Likelihood vs Bounds:")
print(" Flow: log p(x) = log p(z) - log|det J| [EXACT]")
print(" VAE: log p(x) β₯ ELBO [LOWER BOUND]")
print(" GAN: No likelihood [IMPLICIT]")
print(" ")
print(" Flows enable:")
print(" β’ Principled maximum likelihood training")
print(" β’ Exact density for anomaly detection")
print(" β’ Lossless compression via entropy coding")
print("\n" + "=" * 70)