import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset
import seaborn as sns

# Select GPU when available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")
sns.set_style('whitegrid')

1. Motivation: Exact Likelihood

Comparison of Generative ModelsΒΆ

Model

Likelihood

Sampling

Representation

VAE

Approximate (ELBO)

Fast

Latent space

GAN

None

Fast

Implicit

Flow

Exact

Fast

Bijective

Key Idea: Change of VariablesΒΆ

Given:

  • Simple base distribution \(p_z(z)\) (e.g., Gaussian)

  • Invertible transformation \(f: \mathbb{R}^d \to \mathbb{R}^d\)

  • Data \(x = f(z)\)

Change of variables formula: \(p_x(x) = p_z(f^{-1}(x)) \left| \det \frac{\partial f^{-1}}{\partial x} \right|\)

Equivalently: \(p_x(x) = p_z(z) \left| \det \frac{\partial f}{\partial z} \right|^{-1}\)

where \(z = f^{-1}(x)\).

RequirementsΒΆ

  1. Invertibility: \(f\) must be bijective

  2. Tractable Jacobian: \(\det J_f\) must be computable

  3. Expressiveness: \(f\) must be flexible

πŸ“š Reference Materials:

2. Change of Variables in 1D

Normalizing flows are built on the change of variables formula: if \(z \sim p_z(z)\) and \(x = f(z)\) where \(f\) is invertible, then \(p_x(x) = p_z(f^{-1}(x)) \left|\det \frac{\partial f^{-1}}{\partial x}\right|\). In one dimension this simplifies to \(p_x(x) = p_z(f^{-1}(x)) \cdot |\frac{df^{-1}}{dx}|\), making it easy to visualize how stretching and compressing regions of space changes probability density. The 1D case provides clean intuition: where the transformation expands space, density decreases; where it compresses, density increases. This fundamental principle scales directly to higher dimensions via the Jacobian determinant.

# Simple 1D example: the affine bijection and its pieces
def affine_transform(z, a, b):
    """Apply the affine map x = a*z + b elementwise."""
    return b + z * a

def affine_inverse(x, a, b):
    """Invert the affine map: z = (x - b) / a."""
    shifted = x - b
    return shifted / a

def affine_log_det_jacobian(a):
    """Log absolute Jacobian of x = a*z + b, i.e. log|dx/dz| = log|a|."""
    magnitude = np.abs(a)
    return np.log(magnitude)

# Demonstrate the 1D change of variables with the affine map x = a*z + b.
# BUG FIX: `stats` was previously imported *after* its first use below
# (the import sat at the bottom of this cell), raising a NameError at
# the first stats.norm.pdf call; import it up front instead.
from scipy import stats

# Base distribution: N(0, 1)
z = np.random.randn(10000)

# Transform: x = 2z + 3, so x ~ N(3, 2^2) = N(3, 4)
a, b = 2.0, 3.0
x = affine_transform(z, a, b)

# Verify the sample histograms against the analytic densities.
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# Original samples vs the analytic N(0, 1) density
axes[0].hist(z, bins=50, density=True, alpha=0.7, label='Samples')
z_range = np.linspace(-4, 4, 100)
axes[0].plot(z_range, stats.norm.pdf(z_range, 0, 1), 'r-', linewidth=2, label='N(0,1)')
axes[0].set_xlabel('z', fontsize=12)
axes[0].set_ylabel('Density', fontsize=12)
axes[0].set_title('Base Distribution', fontsize=13)
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Transformed samples vs the analytic N(3, 4) density (loc=b, scale=a)
axes[1].hist(x, bins=50, density=True, alpha=0.7, label='Samples')
x_range = np.linspace(-5, 11, 100)
axes[1].plot(x_range, stats.norm.pdf(x_range, b, a), 'r-', linewidth=2, label='N(3,4)')
axes[1].set_xlabel('x', fontsize=12)
axes[1].set_ylabel('Density', fontsize=12)
axes[1].set_title('Transformed Distribution', fontsize=13)
axes[1].legend()
axes[1].grid(True, alpha=0.3)

# The transformation itself
axes[2].plot(z_range, affine_transform(z_range, a, b), linewidth=2)
axes[2].set_xlabel('z', fontsize=12)
axes[2].set_ylabel('x = f(z)', fontsize=12)
axes[2].set_title(f'Transformation: x = {a}z + {b}', fontsize=13)
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"log|det J| = {affine_log_det_jacobian(a):.4f}")

3. Composing FlowsΒΆ

Sequential TransformationsΒΆ

Apply multiple bijections: $\(z_0 \xrightarrow{f_1} z_1 \xrightarrow{f_2} \cdots \xrightarrow{f_K} z_K = x\)$

Composed transformation: $\(f = f_K \circ f_{K-1} \circ \cdots \circ f_1\)$

Log-LikelihoodΒΆ

\[\log p_x(x) = \log p_{z_0}(z_0) - \sum_{k=1}^K \log \left| \det \frac{\partial f_k}{\partial z_{k-1}} \right|\]

where \(z_0 = f^{-1}(x)\).

Training ObjectiveΒΆ

Maximize log-likelihood: $\(\max_{\theta} \sum_{i=1}^n \log p_{\theta}(x_i)\)$

This is exact, not a lower bound!

Planar Flow (Simple Example)ΒΆ

A planar flow is one of the simplest normalizing flow transformations: \(f(z) = z + u \cdot h(w^T z + b)\), where \(u, w\) are weight vectors, \(b\) is a bias, and \(h\) is a smooth activation function. Each planar flow layer applies a single hyperplane-based transformation, and stacking multiple layers builds up the expressiveness to model complex distributions. The Jacobian determinant for a planar flow has a closed-form expression, making likelihood computation efficient. While planar flows are too simple for high-dimensional data, they clearly demonstrate how composing simple invertible transforms can warp a Gaussian into a multimodal target distribution.

class PlanarFlow(nn.Module):
    """Planar normalizing flow layer.

    Implements f(z) = z + u * tanh(w^T z + b) (Rezende & Mohamed, 2015)
    with a closed-form log |det J| = log |1 + u^T psi(z)| where
    psi(z) = tanh'(w^T z + b) * w.

    NOTE(review): invertibility of a planar flow requires the
    constraint w^T u >= -1, which is not enforced here -- confirm
    whether the intended use relies on f being bijective.
    """
    def __init__(self, dim):
        super().__init__()
        self.dim = dim
        self.u = nn.Parameter(torch.randn(dim))  # update direction
        self.w = nn.Parameter(torch.randn(dim))  # hyperplane normal
        self.b = nn.Parameter(torch.randn(1))    # hyperplane offset

    def forward(self, z):
        """Map a batch z of shape (N, dim) to (x, log|det J|).

        Returns:
            x: transformed samples, shape (N, dim).
            log_det: per-sample log |det df/dz|, shape (N,).
        """
        # w^T z + b for every sample -> shape (N,)
        linear = torch.matmul(z, self.w) + self.b
        x = z + self.u * torch.tanh(linear).unsqueeze(-1)

        # psi(z) = h'(w^T z + b) * w, shape (N, dim).
        # BUG FIX: the (N,) derivative factor must be unsqueezed to
        # (N, 1) before multiplying by w; the original (N,) * (dim,)
        # product raised a broadcasting RuntimeError whenever N != dim.
        psi = (1 - torch.tanh(linear) ** 2).unsqueeze(-1) * self.w
        # det J = 1 + u^T psi(z), shape (N,); epsilon guards log(0).
        det = 1 + torch.matmul(psi, self.u)
        log_det = torch.log(torch.abs(det) + 1e-8)

        return x, log_det

class NormalizingFlow(nn.Module):
    """Stack of planar flows.

    Composes `n_flows` PlanarFlow layers; the forward pass maps base
    samples z to data-space samples x and accumulates the per-layer
    log-determinants required by the change-of-variables formula.
    """
    def __init__(self, dim, n_flows=10):
        super().__init__()
        self.flows = nn.ModuleList([PlanarFlow(dim) for _ in range(n_flows)])
    
    def forward(self, z):
        """Transform z to x.

        Returns (x, sum over layers of log|det J_k|), matching the
        composed-flow log-density decomposition.
        """
        log_det_sum = 0
        x = z
        
        for flow in self.flows:
            x, log_det = flow(x)
            log_det_sum += log_det
        
        return x, log_det_sum
    
    def log_prob(self, x):
        """Compute log p(x).

        NOTE(review): this implementation does NOT compute the density
        of `x`.  It sets z = 0 (ignoring `x` entirely, using it only
        for its shape) and pushes those zeros through the flow, because
        planar flows have no analytic inverse f^{-1}(x).  The returned
        value is therefore independent of the contents of `x`.  A
        correct log_prob would need a numerical inverse or an
        invertible architecture such as the coupling layers below.
        """
        # Base distribution log prob, evaluated at z = 0 (see note above).
        z = torch.zeros_like(x)
        log_pz = -0.5 * (z**2).sum(dim=-1) - 0.5 * z.size(-1) * np.log(2 * np.pi)
        
        # Forward-transform the zeros to obtain a log-det term.
        _, log_det = self.forward(z)
        
        return log_pz - log_det

# Test
flow = NormalizingFlow(dim=2, n_flows=5).to(device)
z = torch.randn(10, 2).to(device)
x, log_det = flow(z)

print(f"Input shape: {z.shape}")
print(f"Output shape: {x.shape}")
print(f"Log det shape: {log_det.shape}")

Training Flow on 2D DataΒΆ

Training a normalizing flow on 2D data provides a compelling visualization of the learning process. The model starts with a simple Gaussian base distribution and gradually warps it to match a complex target (e.g., two moons, concentric circles, or a spiral). The training objective is exact maximum likelihood: \(\mathcal{L} = -\mathbb{E}_{x \sim p_{\text{data}}}[\log p_\theta(x)]\), which decomposes into the log-density of \(f^{-1}(x)\) under the base distribution plus the log-absolute-determinant of the Jacobian. Unlike VAEs and GANs, normalizing flows provide exact density evaluation and exact sampling – a significant advantage for applications requiring calibrated uncertainty estimates.

# Generate target distribution: two moons
from sklearn.datasets import make_moons

n_samples = 2000
X, _ = make_moons(n_samples=n_samples, noise=0.05)
X_train = torch.FloatTensor(X).to(device)

# Standardize to zero mean / unit variance per dimension.
X_train = (X_train - X_train.mean(dim=0)) / X_train.std(dim=0)

train_loader = DataLoader(TensorDataset(X_train), batch_size=128, shuffle=True)

# Model: stack of planar flows mapping base Gaussian -> data space.
model = NormalizingFlow(dim=2, n_flows=16).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training
n_epochs = 100
losses = []

for epoch in range(n_epochs):
    epoch_loss = 0
    
    for batch, in train_loader:
        optimizer.zero_grad()
        
        # Draw one base sample z ~ N(0, I) per data point in the batch.
        z = torch.randn_like(batch)
        
        # Push base samples forward through the flow.
        x, log_det = model(z)
        
        # Base log prob; the constant is (d/2)*log(2*pi) with d = 2.
        log_pz = -0.5 * (z**2).sum(dim=1) - np.log(2 * np.pi)
        
        # Model log-density of the *generated* samples x = f(z).
        log_px = log_pz - log_det
        
        # NOTE(review): this is not maximum likelihood on the data.
        # True MLE needs z = f^{-1}(batch), which planar flows do not
        # provide analytically; here the model scores its own samples.
        loss = -log_px.mean()
        
        # MSE drags generated samples toward the batch; it depends on
        # the arbitrary pairing of z draws with batch rows and is what
        # actually anchors training to the data here.
        loss += F.mse_loss(x, batch)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    losses.append(epoch_loss / len(train_loader))
    
    if (epoch + 1) % 20 == 0:
        print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {losses[-1]:.4f}")

print("\nTraining complete!")
# Visualize results: loss curve, base distribution, and model samples.
model.eval()

with torch.no_grad():
    # Sample from the base distribution and push through the flow.
    z_samples = torch.randn(1000, 2).to(device)
    x_samples, _ = model(z_samples)
    x_samples = x_samples.cpu().numpy()

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Training loss
axes[0].plot(losses, linewidth=2)
axes[0].set_xlabel('Epoch', fontsize=12)
axes[0].set_ylabel('Loss', fontsize=12)
axes[0].set_title('Training Loss', fontsize=13)
axes[0].grid(True, alpha=0.3)

# Base distribution
# BUG FIX: the axis labels below were mojibake ('zβ‚‚', 'xβ‚‚') from a
# broken encoding round-trip; restored to the intended subscripts.
z_vis = torch.randn(1000, 2).cpu().numpy()
axes[1].scatter(z_vis[:, 0], z_vis[:, 1], alpha=0.5, s=10)
axes[1].set_xlabel('z₁', fontsize=12)
axes[1].set_ylabel('z₂', fontsize=12)
axes[1].set_title('Base Distribution (Gaussian)', fontsize=13)
axes[1].grid(True, alpha=0.3)
axes[1].set_aspect('equal')

# Generated samples overlaid on the training data
axes[2].scatter(X_train.cpu()[:, 0], X_train.cpu()[:, 1], alpha=0.3, s=10, label='Data')
axes[2].scatter(x_samples[:, 0], x_samples[:, 1], alpha=0.5, s=10, label='Generated')
axes[2].set_xlabel('x₁', fontsize=12)
axes[2].set_ylabel('x₂', fontsize=12)
axes[2].set_title('Generated vs True Data', fontsize=13)
axes[2].legend()
axes[2].grid(True, alpha=0.3)
axes[2].set_aspect('equal')

plt.tight_layout()
plt.show()

6. Modern Flow ArchitecturesΒΆ

Coupling Layers (RealNVP, Glow)ΒΆ

Split input \(z = (z_1, z_2)\):

\[x_1 = z_1\]
\[x_2 = z_2 \odot \exp(s(z_1)) + t(z_1)\]

where \(s, t\) are neural networks.

Jacobian is triangular β†’ easy determinant!

\[\det J = \prod_i \exp(s_i(z_1))\]

Autoregressive Flows (MAF, IAF)ΒΆ

\[x_i = z_i \cdot \exp(\alpha_i(x_{<i})) + \mu_i(x_{<i})\]

Triangular Jacobian via autoregressive structure.

Continuous Normalizing Flows (Neural ODEs)ΒΆ

\[\frac{dz(t)}{dt} = f(z(t), t; \theta)\]

Infinite depth flow, compute via ODE solvers.

ComparisonΒΆ

Architecture

Sampling

Density

Jacobian

Coupling (RealNVP)

Fast βœ“

Fast βœ“

Triangular

Autoregressive (MAF)

Slow

Fast βœ“

Triangular

Inverse Autoregressive (IAF)

Fast βœ“

Slow

Triangular

Neural ODE

Medium

Medium

Continuous

SummaryΒΆ

Key Principles:ΒΆ

  1. Invertibility: Bijective transformations \(f: z \to x\)

  2. Change of variables: \(p_x(x) = p_z(z) |\det J_f|^{-1}\)

  3. Exact likelihood: No approximation, unlike VAE

  4. Composition: Stack flows for expressiveness

Training Objective:ΒΆ

\[\max_{\theta} \mathbb{E}_{x \sim p_{data}}[\log p_{\theta}(x)]\]
\[= \max_{\theta} \mathbb{E}_x \left[ \log p_z(f^{-1}(x)) + \log \left| \det \frac{\partial f^{-1}}{\partial x} \right| \right]\]

Architectural Choices:ΒΆ

Requirements:

  • Invertible transformation

  • Tractable Jacobian determinant

Solutions:

  • Triangular Jacobian: Coupling, autoregressive

  • Low-rank updates: Planar, radial flows

  • Continuous: Neural ODEs

Advantages:ΒΆ

  • Exact likelihood: Principled training

  • Fast sampling: Single forward pass

  • Latent space: Invertible, no encoder needed

Limitations:ΒΆ

  • Architectural constraints: Must be invertible

  • Dimension preserving: \(\dim(z) = \dim(x)\)

  • Computational cost: Jacobian determinant

Modern Variants:ΒΆ

  • RealNVP (Dinh et al., 2017): Coupling layers

  • Glow (Kingma & Dhariwal, 2018): Invertible 1x1 convolutions

  • FFJORD (Grathwohl et al., 2019): Continuous flows

  • Flow++ (Ho et al., 2019): Improved coupling

Applications:ΒΆ

  • Image generation (high-res)

  • Density estimation

  • Variational inference (VI with flows)

  • Audio synthesis

Next Steps:ΒΆ

  • 04_neural_ode.ipynb - Continuous normalizing flows

  • 03_variational_autoencoders_advanced.ipynb - Compare with VAEs

  • 09_vq_vae.ipynb - Discrete latent representations

Advanced Normalizing Flows TheoryΒΆ

1. Mathematical FoundationsΒΆ

Change of Variables FormulaΒΆ

Theorem (Change of Variables): For invertible f: β„α΅ˆ β†’ β„α΅ˆ with x = f(z):

\[p_X(x) = p_Z(f^{-1}(x)) \left| \det \frac{\partial f^{-1}}{\partial x} \right|\]

Inverse Jacobian Form: Using inverse function theorem:

\[p_X(x) = p_Z(z) \left| \det \frac{\partial f}{\partial z} \right|^{-1}\]

where z = f⁻¹(x).

Log-Density:

\[\log p_X(x) = \log p_Z(z) - \log \left| \det J_f(z) \right|\]

where J_f is the Jacobian matrix.

Requirements for Normalizing FlowsΒΆ

  1. Bijection: f must be invertible (one-to-one and onto)

  2. Tractable Jacobian: det(J_f) must be computable efficiently

  3. Differentiable: For gradient-based optimization

Challenge: General neural networks are not invertible.

2. Flow CompositionΒΆ

Sequential TransformationsΒΆ

Composition: Apply K bijections sequentially:

\[z_0 \xrightarrow{f_1} z_1 \xrightarrow{f_2} z_2 \cdots \xrightarrow{f_K} z_K = x\]

Combined Transformation: f = f_K ∘ f_{K-1} ∘ … ∘ f_1

Jacobian Determinant (Chain Rule):

\[\det J_f = \prod_{k=1}^K \det J_{f_k}\]

Log-Density:

\[\log p_X(x) = \log p_{Z_0}(z_0) - \sum_{k=1}^K \log |\det J_{f_k}(z_{k-1})|\]

Maximum Likelihood Training:

\[\max_\theta \mathbb{E}_{x \sim p_{data}} \left[ \log p_{Z_0}(f^{-1}(x)) - \sum_k \log |\det J_{f_k}| \right]\]

3. Coupling Layers (RealNVP)ΒΆ

Affine Coupling LayerΒΆ

Key Idea: Split input into two parts, transform one conditioned on the other.

Forward Pass: Given z = (z₁, zβ‚‚) where z₁ ∈ β„α΅ˆ, zβ‚‚ ∈ β„α΄°β»α΅ˆ:

\[x_1 = z_1\]
\[x_2 = z_2 \odot \exp(s(z_1)) + t(z_1)\]

where s, t: β„α΅ˆ β†’ β„α΄°β»α΅ˆ are neural networks (scale and translation).

Inverse Pass:

\[z_1 = x_1\]
\[z_2 = (x_2 - t(x_1)) \odot \exp(-s(x_1))\]

Jacobian: Block triangular structure:

\[\begin{split}J = \begin{pmatrix} I_d & 0 \\ \frac{\partial x_2}{\partial z_1} & \text{diag}(\exp(s(z_1))) \end{pmatrix}\end{split}\]

Determinant:

\[\det J = \prod_{i=1}^{D-d} \exp(s_i(z_1)) = \exp\left(\sum_i s_i(z_1)\right)\]

Log-Determinant:

\[\log |\det J| = \sum_{i=1}^{D-d} s_i(z_1)\]

Computational Cost: O(D) instead of O(DΒ³) for general Jacobian!

Multi-Scale ArchitectureΒΆ

Strategy: Apply coupling at different resolutions.

Squeeze Operation: Reshape (H, W, C) β†’ (H/2, W/2, 4C).

Split: Send half of channels directly to output, transform other half.

Benefit: Reduces memory, enables high-resolution generation.

4. Glow ArchitectureΒΆ

1Γ—1 Invertible ConvolutionΒΆ

Motivation: Permutation is too restrictive, use learned mixing.

Operation: W ∈ β„αΆœΛ£αΆœ invertible weight matrix:

\[x = W \cdot z\]

Initialization: W = random rotation matrix (via QR decomposition).

Jacobian: For each spatial position (h, w):

\[\det J = \det(W)\]

Total Log-Determinant: For HΓ—W spatial resolution:

\[\log |\det J_{total}| = H \cdot W \cdot \log |\det W|\]

Efficient Computation: LU decomposition of W:

\[W = PL(U + \text{diag}(s))\]

where P is permutation, L lower triangular, U upper triangular, s is diagonal.

\[\log |\det W| = \sum_i \log |s_i|\]

Actnorm (Activation Normalization)ΒΆ

Purpose: Data-dependent initialization for stable training.

Operation: Affine transformation per channel:

\[y = s \odot x + b\]

Initialization: Set s, b so first batch has zero mean and unit variance.

Benefit: Prevents activation explosion/vanishing.

Glow BlockΒΆ

Full Glow Step:

1. Actnorm (data normalization)
2. 1Γ—1 Invertible Conv (channel mixing)
3. Affine Coupling (nonlinear transform)

Repeat: Stack K Glow steps per scale.

Multi-scale: Squeeze β†’ K steps β†’ Split β†’ next scale.

5. Autoregressive FlowsΒΆ

Masked Autoregressive Flow (MAF)ΒΆ

Autoregressive Structure: Each xα΅’ depends only on x₁, …, xᡒ₋₁:

\[x_i = z_i \cdot \exp(\alpha_i(x_{<i})) + \mu_i(x_{<i})\]

where Ξ± (scale) and ΞΌ (location) are neural networks with masking.

Jacobian: Lower triangular (autoregressive property):

\[\begin{split}J_{ij} = \begin{cases} \exp(\alpha_i(x_{<i})) & \text{if } i = j \\ 0 & \text{if } i < j \\ \frac{\partial x_i}{\partial x_j} & \text{if } i > j \end{cases}\end{split}\]

Determinant:

\[\det J = \prod_i \exp(\alpha_i(x_{<i}))\]

Complexity:

  • Forward (zβ†’x): O(D) sequential (slow)

  • Inverse (xβ†’z): O(D) parallel (fast)

  • Density evaluation: Fast

Use Case: Density estimation, variational inference.

Inverse Autoregressive Flow (IAF)ΒΆ

Key Difference: Autoregressive in z instead of x:

\[x_i = z_i \cdot \exp(\alpha_i(z_{<i})) + \mu_i(z_{<i})\]

Complexity:

  • Forward (zβ†’x): O(D) parallel (fast)

  • Inverse (xβ†’z): O(D) sequential (slow)

  • Sampling: Fast

Use Case: Sampling, variational autoencoders (VAE posterior).

MADE (Masked Autoencoder for Distribution Estimation)ΒΆ

Architecture: Enforce autoregressive property via weight masking.

Masks: Binary matrices M ensuring output i depends only on input j where j < i.

Efficient Parallelization: Compute all ΞΌα΅’, Ξ±α΅’ in one forward pass.

6. Continuous Normalizing Flows (Neural ODEs)ΒΆ

Continuous-Time DynamicsΒΆ

ODE Formulation: Instead of discrete steps, continuous transformation:

\[\frac{dz(t)}{dt} = f(z(t), t; \theta)\]

where f is a neural network.

Transformation: z(0) β†’ z(1) via ODE integration:

\[z(1) = z(0) + \int_0^1 f(z(t), t) dt\]

Inverse: Integrate backward:

\[z(0) = z(1) - \int_1^0 f(z(t), t) dt = z(1) + \int_0^1 f(z(1-t), 1-t) dt\]

Instantaneous Change of VariablesΒΆ

Theorem (Continuous Change of Variables): Log-density evolves via:

\[\frac{d}{dt} \log p(z(t)) = -\text{Tr}\left(\frac{\partial f}{\partial z(t)}\right)\]

Integrated Form:

\[\log p(z(1)) = \log p(z(0)) - \int_0^1 \text{Tr}\left(\frac{\partial f}{\partial z(t)}\right) dt\]

Trace Computation: For D-dimensional z, computing trace directly is O(DΒ²).

FFJORD (Free-Form Jacobian of Reversible Dynamics)ΒΆ

Hutchinson Trace Estimator: Unbiased estimate using random vectors:

\[\text{Tr}(A) = \mathbb{E}_{\epsilon \sim \mathcal{N}(0,I)} [\epsilon^T A \epsilon]\]

Application:

\[\text{Tr}\left(\frac{\partial f}{\partial z}\right) \approx \epsilon^T \frac{\partial f}{\partial z} \epsilon\]

computed via vector-Jacobian product (automatic differentiation).

Complexity: O(D) instead of O(DΒ²).

Training: Solve augmented ODE simultaneously for z(t) and log-density:

\[\begin{split}\frac{d}{dt} \begin{pmatrix} z(t) \\ \log p(t) \end{pmatrix} = \begin{pmatrix} f(z(t), t) \\ -\text{Tr}(\partial f/\partial z) \end{pmatrix}\end{split}\]

Advantage: No architectural constraints (any f can be used).

7. Volume-Preserving FlowsΒΆ

Motivation

Challenge: For a general transformation, computing log|det J| is expensive.

Solution: Constrain det(J) = 1 (volume-preserving), so the log-determinant term vanishes from the likelihood. Note that this does not relax the dim(z) = dim(x) requirement, which applies to all standard flows.

Residual FlowsΒΆ

Transformation: f(z) = z + g(z) where g is a neural network.

Condition for Invertibility: Lipschitz constant Lip(g) < 1.

Spectral Normalization: Constrain Lip(g) via weight normalization.

Jacobian Determinant: det(J) = det(I + βˆ‚g/βˆ‚z).

Power Series: For small ||βˆ‚g/βˆ‚z||:

\[\log \det(I + A) \approx \text{Tr}(A) - \frac{1}{2}\text{Tr}(A^2) + \cdots\]

Infinitesimal FlowsΒΆ

Limit: As g β†’ 0, residual flow becomes continuous flow.

Connection: Neural ODEs are limit of residual flows.

8. Advanced Training TechniquesΒΆ

DequantizationΒΆ

Problem: Discrete data (images) have zero density under continuous model.

Solution: Add uniform noise U[0, 1/256] to discretized data:

\[\tilde{x} = x + u, \quad u \sim \text{Uniform}[0, 1/256]\]

Variational Dequantization: Learn noise distribution q(u|x) for tighter bound.

Temperature ScalingΒΆ

Sampling: Scale base distribution std by temperature Ο„:

\[z \sim \mathcal{N}(0, \tau^2 I)\]
  • Ο„ < 1: Higher quality, less diversity

  • Ο„ > 1: More diversity, lower quality

Multi-Scale TrainingΒΆ

Strategy: Apply loss at multiple resolutions.

Objective:

\[\mathcal{L} = \sum_{s=1}^S \lambda_s \mathbb{E}_{x^{(s)}} [-\log p(x^{(s)})]\]

where x⁽˒⁾ is data at scale s.

Benefit: Better gradient flow, prevents mode collapse.

Gradient CheckpointingΒΆ

Memory Reduction: Don’t store all intermediate activations during forward pass.

Recomputation: Recompute activations during backward pass.

Trade-off: 33% slower, but enables deeper flows.

9. Likelihood EstimationΒΆ

Bits per Dimension (BPD)ΒΆ

Definition: Negative log-likelihood normalized by data dimension:

\[\text{BPD} = -\frac{1}{D \log 2} \mathbb{E}[\log p(x)]\]

where D = H Γ— W Γ— C for images.

Interpretation: Average bits needed to encode each pixel/dimension.

Typical Values:

  • Random: 8 bits/dim (uniform over [0, 255])

  • Good model: 3-4 bits/dim (CIFAR-10)

  • SOTA: <3 bits/dim

Comparison with ELBOΒΆ

Flow: Exact likelihood $\(\log p(x) = \log p(z) - \sum_k \log |\det J_{f_k}|\)$

VAE: Lower bound (ELBO) $\(\log p(x) \geq \mathbb{E}_{q(z|x)}[\log p(x|z)] - KL(q(z|x) \| p(z))\)$

Advantage: Flows optimize true likelihood, VAEs optimize bound (may be loose).

10. Invertibility GuaranteesΒΆ

Conditions for InvertibilityΒΆ

Theorem (Inverse Function Theorem): If f is continuously differentiable and det(J_f) β‰  0, then f is locally invertible.

Global Invertibility: Additional conditions needed:

  • f is injective (one-to-one)

  • f is surjective (onto)

Invertible Neural Networks (INN)ΒΆ

Design Principles:

  1. Use invertible components (coupling, 1Γ—1 conv)

  2. Ensure non-zero Jacobian determinant

  3. Verify numerical stability

Numerical Inversion: For complex f, use iterative solvers (fixed-point iteration, Newton’s method).

11. State-of-the-Art ResultsΒΆ

Image GenerationΒΆ

CIFAR-10 (32Γ—32):

  • Glow: 3.35 bits/dim

  • Flow++: 3.08 bits/dim

  • Residual Flow: 3.28 bits/dim

ImageNet (64Γ—64):

  • Glow: 3.81 bits/dim

  • Flow++: 3.69 bits/dim

CelebA-HQ (256Γ—256):

  • Glow: 1.03 bits/dim

Sampling SpeedΒΆ

Comparison (single sample):

  • Flow: ~10-50ms (K forward passes)

  • GAN: ~5ms (1 generator pass)

  • Diffusion: ~1-10s (100-1000 steps)

Advantage: Flows are faster than diffusion, slower than GANs.

12. LimitationsΒΆ

Architectural ConstraintsΒΆ

Dimension Preservation: dim(z) = dim(x) required.

  • Can’t compress like VAE (z < x)

  • Inefficient for high-dim data

Invertibility: Limits architecture choices.

  • No pooling, dropout, batch norm (standard forms)

  • Must use specialized layers

Sample Quality vs LikelihoodΒΆ

Paradox: High likelihood doesn’t always mean good samples.

Reason: Model may assign probability mass to imperceptible differences.

Solution: Hybrid models (VAE + Flow, GAN + Flow).

Computational CostΒΆ

Training: Computing Jacobian determinant adds overhead.

  • RealNVP: 2-3Γ— slower than GAN

  • FFJORD: 5-10Γ— slower (ODE solve)

Memory: Storing intermediate activations for deep flows.

13. Extensions and VariantsΒΆ

Discrete FlowsΒΆ

Problem: Continuous flows don’t model discrete data directly.

Solution: Integer discrete flows (Hoogeboom et al., 2019).

Application: Text, graphs, molecular structures.

Variational Dequantization FlowsΒΆ

Idea: Learn dequantization distribution q(u|x) jointly with flow.

Objective: Tighter bound on discrete data likelihood.

Graphical Normalizing FlowsΒΆ

Extension: Flows for graph-structured data.

Equivariance: Preserve graph symmetries (permutation equivariance).

Neural Spline FlowsΒΆ

Piecewise Function: Use monotonic splines instead of affine transforms.

Advantage: More expressive coupling layers.

SOTA: Achieves best bits/dim on many benchmarks.

14. ApplicationsΒΆ

Variational InferenceΒΆ

Use: Flow-based posterior q_Ο†(z|x) in VAE.

Benefit: More flexible than Gaussian posterior.

Example: Normalizing Flow VAE (Rezende & Mohamed, 2015).

Density EstimationΒΆ

Task: Model p(x) for anomaly detection, compression.

Method: Maximize likelihood on training data.

Anomaly Score: -log p(x) (high for outliers).

Image SynthesisΒΆ

Generation: Sample z ~ N(0,I), compute x = f(z).

Interpolation: Linear interpolation in z-space.

Attribute Manipulation: Directions in latent space correspond to attributes.

Data CompressionΒΆ

Principle: Use learned p(x) for entropy coding.

Bits Required: Approximately -logβ‚‚ p(x) bits.

Lossless Compression: Exact reconstruction via invertibility.

15. Connections to Other ModelsΒΆ

Flows vs VAEsΒΆ

Aspect

Flow

VAE

Likelihood

Exact

Lower bound (ELBO)

Latent dim

dim(z) = dim(x)

dim(z) < dim(x)

Sampling

Fast (1 pass)

Fast (1 pass)

Invertibility

Yes

No (encoder ≠ decoder⁻¹)

Flows vs GANsΒΆ

Similarities: Both map noise β†’ data.

Differences:

  • Flow: Bijection, exact likelihood

  • GAN: Not invertible, no likelihood

Hybrid: Use flow as GAN generator for stable training.

Flows vs DiffusionΒΆ

Diffusion: Iterative denoising process (many steps).

  • Pro: SOTA sample quality

  • Con: Slow sampling

Flow: Single pass transformation.

  • Pro: Fast sampling

  • Con: Architectural constraints

16. Key PapersΒΆ

FoundationsΒΆ

  • Rezende & Mohamed (2015): β€œVariational Inference with Normalizing Flows”

  • Dinh et al. (2015): β€œNICE: Non-linear Independent Components Estimation”

Modern ArchitecturesΒΆ

  • Dinh et al. (2017): β€œDensity Estimation using Real NVP”

  • Kingma & Dhariwal (2018): β€œGlow: Generative Flow using Invertible 1x1 Convolutions”

  • Papamakarios et al. (2017): β€œMasked Autoregressive Flow for Density Estimation”

Continuous FlowsΒΆ

  • Chen et al. (2018): β€œNeural Ordinary Differential Equations”

  • Grathwohl et al. (2019): β€œFFJORD: Free-form Continuous Dynamics for Scalable Reversible Generative Models”

Recent AdvancesΒΆ

  • Ho et al. (2019): β€œFlow++: Improving Flow-Based Generative Models”

  • Durkan et al. (2019): β€œNeural Spline Flows”

  • Hoogeboom et al. (2019): β€œInteger Discrete Flows”

17. Practical ConsiderationsΒΆ

HyperparametersΒΆ

Parameter

Typical Range

Effect

Coupling depth

8-32 layers

More β†’ expressive, slower

Hidden dim

256-512

Larger β†’ flexible, memory

Split dimension

D/2

Half channels per coupling

Learning rate

1e-4 to 5e-4

Higher β†’ faster, unstable

Batch size

32-128

Larger β†’ stable, more memory

Debugging TipsΒΆ

Check:

  • Jacobian determinant: Should be non-zero

  • Inverse accuracy: ||f⁻¹(f(z)) - z|| < Ξ΅

  • Likelihood: Should increase during training

  • Samples: Visually inspect quality

Common Issues:

  • Exploding gradients β†’ gradient clipping, spectral norm

  • Poor samples β†’ increase depth, tune temperature

  • Slow training β†’ reduce depth, use gradient checkpointing

Computational EfficiencyΒΆ

Speed Optimizations:

  • Use 1Γ—1 conv instead of full permutation

  • LU decomposition for faster determinant

  • Gradient checkpointing for memory

Parallelization: Coupling layers parallelize well across GPUs.

18. When to Use Normalizing FlowsΒΆ

Choose Flows When:

  • Exact likelihood required (density estimation, compression)

  • Fast sampling important (real-time applications)

  • Interpretable latent space desired

  • Invertibility useful (cycle-consistency tasks)

Avoid Flows When:

  • Best sample quality critical (use diffusion/GANs)

  • High compression needed (use VAE with z << x)

  • Data is discrete (flows designed for continuous)

  • Limited compute (flows can be memory-intensive)

# Advanced Normalizing Flows Implementations

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from typing import Tuple, List, Optional

class AffineCoupling(nn.Module):
    """
    Affine coupling layer (RealNVP, Dinh et al. 2017).

    The input is split by a binary mask: positions where mask == 1 pass
    through unchanged and condition the transform; positions where
    mask == 0 receive the elementwise affine map
        x = z * exp(s(z_cond)) + t(z_cond),
    giving a triangular Jacobian with log|det J| = sum(s).

    Args:
        dim: dimensionality of the input vectors.
        hidden_dim: width of the hidden layers in the s/t networks.
        mask_type: 'checkerboard' (alternating indices) or 'channel'
            (first half identity, second half transformed).

    Raises:
        ValueError: if mask_type is not recognized.  (The original code
            silently left `self.mask` undefined and failed later with
            an opaque AttributeError.)
    """

    def __init__(self, dim, hidden_dim=256, mask_type='checkerboard'):
        super().__init__()
        self.dim = dim

        # Create mask (1 = identity/conditioning part, 0 = transformed part)
        if mask_type == 'checkerboard':
            self.register_buffer('mask', torch.arange(dim) % 2)
        elif mask_type == 'channel':
            mask = torch.zeros(dim)
            mask[:dim//2] = 1
            self.register_buffer('mask', mask)
        else:
            raise ValueError(f"unknown mask_type: {mask_type!r}")

        mask_dim = int(self.mask.sum().item())  # size of identity part
        transform_dim = dim - mask_dim          # size of transformed part

        # Scale and translation networks share the same architecture.
        self.scale_net = self._make_net(mask_dim, hidden_dim, transform_dim)
        self.translate_net = self._make_net(mask_dim, hidden_dim, transform_dim)

    @staticmethod
    def _make_net(in_dim, hidden_dim, out_dim):
        """Two-hidden-layer ReLU MLP used for both s and t."""
        return nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim)
        )

    def forward(self, z, reverse=False):
        """Apply the coupling transform.

        Args:
            z: input batch of shape (N, dim).
            reverse: if False compute z -> x; if True compute x -> z.

        Returns:
            (output, log_det) where log_det has shape (N,) and is the
            log|det J| of the applied direction.
        """
        mask = self.mask.unsqueeze(0)

        # Split input into identity and transform parts.
        z_masked = z * mask
        z_transform = z * (1 - mask)

        # Conditioning vector: the untouched coordinates.
        z_cond = z_masked[:, self.mask == 1]

        s = self.scale_net(z_cond)      # log-scale, shape (N, transform_dim)
        t = self.translate_net(z_cond)  # translation, shape (N, transform_dim)

        if not reverse:
            # Forward: x = z * exp(s) + t
            x_transform = z_transform[:, self.mask == 0] * torch.exp(s) + t
            log_det = s.sum(dim=1)
        else:
            # Inverse: z = (x - t) * exp(-s)
            x_transform = (z_transform[:, self.mask == 0] - t) * torch.exp(-s)
            log_det = -s.sum(dim=1)

        # Scatter identity and transformed parts back into place.
        x = torch.zeros_like(z)
        x[:, self.mask == 1] = z_cond
        x[:, self.mask == 0] = x_transform

        return x, log_det


class RealNVP(nn.Module):
    """
    RealNVP density model: a stack of affine coupling layers whose
    masks alternate between 'checkerboard' and 'channel' patterns so
    that every coordinate is transformed by some layer.
    """

    def __init__(self, dim, n_layers=6, hidden_dim=256):
        super().__init__()
        self.dim = dim

        # Even-indexed layers use checkerboard masks, odd use channel masks.
        self.layers = nn.ModuleList()
        for idx in range(n_layers):
            pattern = 'checkerboard' if idx % 2 == 0 else 'channel'
            self.layers.append(AffineCoupling(dim, hidden_dim, pattern))

    def forward(self, z, reverse=False):
        """Run z -> x (reverse=False) or x -> z (reverse=True)."""
        # Inverting the composition applies the layers in reverse order.
        order = self.layers if not reverse else reversed(self.layers)

        out = z
        total_log_det = 0
        for coupling in order:
            out, step_log_det = coupling(out, reverse=reverse)
            total_log_det = total_log_det + step_log_det

        return out, total_log_det

    def log_prob(self, x):
        """Exact log-density log p(x) via the change of variables."""
        # Map data back to the base space; log_det is log|det J_{f^-1}|.
        z, log_det = self.forward(x, reverse=True)

        # Standard-normal base density N(0, I).
        log_pz = -0.5 * (z ** 2).sum(dim=1) - 0.5 * self.dim * np.log(2 * np.pi)

        # log p(x) = log p(z) + log |det J_f^{-1}(x)|
        return log_pz + log_det


class Invertible1x1Conv(nn.Module):
    """
    1x1 invertible convolution (Glow, Kingma & Dhariwal 2018).

    The mixing matrix is parameterized through an LU decomposition
        W = P @ L @ (U_strict + diag(sign_s * exp(log_s)))
    so that log|det W| = sum(log_s) is available in O(C) instead of
    O(C^3).

    Args:
        channels: number of channels C; W is C x C.
    """

    def __init__(self, channels):
        super().__init__()
        self.channels = channels

        # Initialize with a random rotation so the initial transform is
        # volume preserving.  (torch.linalg replaces the deprecated
        # torch.qr / torch.lu APIs used previously.)
        W = torch.linalg.qr(torch.randn(channels, channels)).Q

        # LU decomposition W = P L U.
        P, L, U = torch.linalg.lu(W)
        diag_U = torch.diag(U)

        self.register_buffer('P', P)  # permutation (fixed)
        self.L = nn.Parameter(L)      # lower triangular (unit diagonal used)
        self.U = nn.Parameter(U)      # only the strictly upper part is used

        # BUG FIX: the diagonal's sign must be stored separately.  The
        # original kept only log|diag(U)| and rebuilt the diagonal as
        # exp(log_s) > 0, silently flipping the sign of any negative
        # diagonal entries so the reconstructed W no longer matched the
        # initialization.
        self.register_buffer('sign_s', torch.sign(diag_U))
        self.log_s = nn.Parameter(torch.log(torch.abs(diag_U)))

    def forward(self, z, reverse=False):
        """Apply W (or W^{-1} when reverse=True) to a batch of shape (N, C).

        Returns:
            (output, log_det) with log_det of shape (N,).
        """
        batch_size = z.size(0)

        # Rebuild W = P @ L @ (U_strict + diag(sign_s * exp(log_s))).
        L = torch.tril(self.L, diagonal=-1) + torch.eye(self.channels, device=z.device)
        U = torch.triu(self.U, diagonal=1)
        W = self.P @ L @ (U + torch.diag(self.sign_s * torch.exp(self.log_s)))

        if not reverse:
            # Forward: x = W @ z (applied row-wise to the batch).
            x = z @ W.t()
            # |det W| = prod exp(log_s); the signs do not affect |det|.
            log_det = self.log_s.sum() * torch.ones(batch_size, device=z.device)
        else:
            # Inverse: z = W^{-1} @ x
            W_inv = torch.inverse(W)
            x = z @ W_inv.t()
            log_det = -self.log_s.sum() * torch.ones(batch_size, device=z.device)

        return x, log_det


class ActNorm(nn.Module):
    """
    Activation normalization (Glow).

    Per-channel affine transform, forward: y = z * exp(log_scale) + bias.
    The parameters are initialized from the statistics of the first batch
    so that the OUTPUT of that first pass has approximately zero mean and
    unit variance per channel (data-dependent initialization).

    Fixes vs. the original version:
      - The original set bias = -mean, but the bias is applied AFTER
        scaling, so the output mean was mean/std - mean, not 0. The bias
        must be -mean * exp(log_scale).
      - Initialization is now direction-aware: if the first pass runs in
        reverse (which is exactly what log_prob does on data), the old
        init *anti*-normalized the batch ((z + mean) * std).
    """

    def __init__(self, channels):
        super().__init__()
        self.channels = channels

        # Learnable per-channel scale (in log space) and bias.
        self.log_scale = nn.Parameter(torch.zeros(channels))
        self.bias = nn.Parameter(torch.zeros(channels))

        # 0 until the data-dependent initialization has run once.
        self.register_buffer('initialized', torch.tensor(0))

    def forward(self, z, reverse=False):
        """Apply (inverse) activation normalization.

        Returns (output, log|det|) with log|det| of shape (batch,).
        """
        batch_size = z.size(0)

        # Data-dependent init on the first batch, in whichever direction
        # the data first flows, so that pass emits ~N(0, I) per channel.
        if not self.initialized:
            with torch.no_grad():
                mean = z.mean(dim=0)
                std = z.std(dim=0)

                if not reverse:
                    # forward: y = z*exp(ls) + b  =>  exp(ls) = 1/std,
                    # b = -mean/std (bias applied after scaling).
                    log_scale = -torch.log(std + 1e-6)
                    self.log_scale.data = log_scale
                    self.bias.data = -mean * torch.exp(log_scale)
                else:
                    # reverse: y = (z - b)*exp(-ls)  =>  exp(-ls) = 1/std,
                    # b = mean.
                    self.log_scale.data = torch.log(std + 1e-6)
                    self.bias.data = mean

                self.initialized.fill_(1)

        if not reverse:
            # Forward: normalize.
            x = z * torch.exp(self.log_scale) + self.bias
            log_det = self.log_scale.sum() * torch.ones(batch_size, device=z.device)
        else:
            # Inverse: denormalize.
            x = (z - self.bias) * torch.exp(-self.log_scale)
            log_det = -self.log_scale.sum() * torch.ones(batch_size, device=z.device)

        return x, log_det


class GlowBlock(nn.Module):
    """
    One Glow step of flow: ActNorm -> invertible 1Γ—1 conv -> affine coupling.
    """

    def __init__(self, channels, hidden_dim=256):
        super().__init__()
        self.actnorm = ActNorm(channels)
        self.conv1x1 = Invertible1x1Conv(channels)
        self.coupling = AffineCoupling(channels, hidden_dim, mask_type='channel')

    def forward(self, z, reverse=False):
        """Run the three sub-transforms in order, accumulating their
        log-determinant contributions. In reverse mode the sub-transforms
        are inverted in the opposite order of the forward pass."""
        if reverse:
            stages = (self.coupling, self.conv1x1, self.actnorm)
        else:
            stages = (self.actnorm, self.conv1x1, self.coupling)

        x = z
        log_det_total = 0
        for stage in stages:
            x, stage_log_det = stage(x, reverse=reverse)
            log_det_total = log_det_total + stage_log_det

        return x, log_det_total


class Glow(nn.Module):
    """
    Glow: a stack of GlowBlocks with exact log-likelihood evaluation.
    """

    def __init__(self, input_dim, n_blocks=8, hidden_dim=256):
        super().__init__()
        self.input_dim = input_dim

        # Sequence of flow steps applied in order (reversed when inverting).
        self.blocks = nn.ModuleList(
            GlowBlock(input_dim, hidden_dim) for _ in range(n_blocks)
        )

    def forward(self, z, reverse=False):
        """Map z -> x (reverse=False) or x -> z (reverse=True)."""
        out = z
        total_log_det = 0

        ordered = list(self.blocks)
        if reverse:
            ordered.reverse()

        for block in ordered:
            out, block_log_det = block(out, reverse=reverse)
            total_log_det = total_log_det + block_log_det

        return out, total_log_det

    def log_prob(self, x):
        """Exact log p(x): push x back to the Gaussian base and add the
        accumulated log|det| of the inverse transform."""
        z, log_det = self.forward(x, reverse=True)

        gauss_const = 0.5 * self.input_dim * np.log(2 * np.pi)
        log_pz = -0.5 * z.pow(2).sum(dim=1) - gauss_const

        return log_pz + log_det


class MADE(nn.Module):
    """
    Masked Autoencoder for Distribution Estimation (Germain et al., 2015).

    A fully-connected network whose weights are masked so that output i
    (both its log-scale and location halves) depends only on inputs x_{<i}.

    Fixes vs. the original version:
      - The OUTPUT-layer mask must use a strict inequality (degree_in <
        degree_out). The original used `<=` for every layer, which let
        output i see hidden units that already depend on x_i, breaking
        the autoregressive property.
      - `hidden_dims` no longer uses a shared mutable default argument.
    """

    def __init__(self, input_dim, hidden_dims=None):
        super().__init__()
        self.input_dim = input_dim

        # Avoid the mutable-default-argument pitfall; default unchanged.
        if hidden_dims is None:
            hidden_dims = [256, 256]

        # Output layer is 2*D wide: one half log-scale, one half location.
        dims = [input_dim] + list(hidden_dims) + [input_dim * 2]

        self.layers = nn.ModuleList(
            nn.Linear(dims[i], dims[i + 1]) for i in range(len(dims) - 1)
        )

        # Binary connectivity masks enforcing the autoregressive ordering.
        self.masks = self._create_masks(dims)

    def _create_masks(self, dims):
        """Create binary masks ensuring output i depends only on x_{<i}."""
        # Degree of a unit = largest input index it is allowed to see.
        degrees = [torch.arange(1, self.input_dim + 1)]  # inputs: 1..D
        for l in range(1, len(dims) - 1):
            # Hidden degrees in [1, D-1]; max(2, D) guards the D == 1 case.
            degrees.append(torch.randint(1, max(2, self.input_dim), (dims[l],)))
        # Output degrees 1..D, duplicated for the (log-scale, loc) halves.
        degrees.append(torch.arange(1, self.input_dim + 1).repeat(2))

        masks = []
        n_layers = len(dims) - 1
        for l in range(n_layers):
            if l == n_layers - 1:
                # Output layer: STRICT `<` so output i never connects to a
                # unit that may already depend on x_i.
                masks.append(degrees[l].unsqueeze(1) < degrees[l + 1].unsqueeze(0))
            else:
                # Hidden layers: `<=` propagates degrees without loss.
                masks.append(degrees[l].unsqueeze(1) <= degrees[l + 1].unsqueeze(0))

        return masks

    def forward(self, z):
        """Return (log_scale, loc), each of shape (batch, input_dim)."""
        x = z

        for i, (layer, mask) in enumerate(zip(self.layers, self.masks)):
            # Mask is stored as (in, out); Linear weights are (out, in).
            weight = layer.weight * mask.t().float().to(z.device)
            x = F.linear(x, weight, layer.bias)

            if i < len(self.layers) - 1:
                x = F.relu(x)

        # Split the 2*D outputs into scale and location halves.
        log_scale, loc = x.chunk(2, dim=1)

        return log_scale, loc


class MAF(nn.Module):
    """
    Masked Autoregressive Flow (Papamakarios et al., 2017).

    Density evaluation (x -> z) is one parallel pass per MADE layer (fast);
    sampling (z -> x) must fill dimensions sequentially (slow).

    Fixes vs. the original version:
      - The reverse branch referenced an undefined local `x`
        (UnboundLocalError at runtime); it now uses the actual input.
      - The forward (sampling) branch only ever used `made_layers[0]`;
        it now inverts every layer, in the opposite order of the
        density pass, so forward and reverse are true inverses.
      - `hidden_dims` no longer uses a shared mutable default argument.
    """

    def __init__(self, input_dim, n_layers=5, hidden_dims=None):
        super().__init__()
        self.input_dim = input_dim

        if hidden_dims is None:
            hidden_dims = [256, 256]

        # Stack of MADE layers, applied in order for the density pass.
        self.made_layers = nn.ModuleList([
            MADE(input_dim, hidden_dims) for _ in range(n_layers)
        ])

    def forward(self, z, reverse=False):
        """Transform z -> x (reverse=False) or x -> z (reverse=True).

        Returns (output, log|det|) where log|det| has shape (batch,) and is
        signed for the direction actually applied (so forward and reverse
        log-dets of matching passes negate each other).
        """
        log_det_total = torch.zeros(z.size(0), device=z.device)

        if not reverse:
            # Sampling (z -> x): invert each layer sequentially, in the
            # OPPOSITE order of the density pass. O(D) network calls each.
            x = z
            for made in reversed(self.made_layers):
                u, x = x, torch.zeros_like(z)
                log_s = None
                for i in range(self.input_dim):
                    # s_i, t_i depend only on x_{<i}, which are already final.
                    log_s, t = made(x)
                    x[:, i] = u[:, i] * torch.exp(log_s[:, i]) + t[:, i]
                # After the loop, every log_s[:, i] was computed from the
                # final x_{<i}, so the full sum is the layer's log|det|.
                log_det_total = log_det_total + log_s.sum(dim=1)
            return x, log_det_total
        else:
            # Density direction (x -> z): all dims in parallel per layer.
            u = z  # `z` carries the data x in this direction
            for made in self.made_layers:
                log_s, t = made(u)
                u = (u - t) * torch.exp(-log_s)
                log_det_total = log_det_total - log_s.sum(dim=1)
            return u, log_det_total

    def log_prob(self, x):
        """Exact log p(x) - FAST for MAF (single parallel inverse pass)."""
        z, log_det = self.forward(x, reverse=True)

        # Standard-normal base density.
        log_pz = -0.5 * (z ** 2).sum(dim=1) - 0.5 * self.input_dim * np.log(2 * np.pi)

        return log_pz + log_det


class NeuralODEFlow(nn.Module):
    """
    Continuous Normalizing Flow: learns the dynamics dz/dt = f(z, t).

    The change in log-density along the trajectory is the negative
    divergence of f, estimated stochastically with Hutchinson's trick.
    (Integration itself is handled externally, e.g. by torchdiffeq.)
    """

    def __init__(self, dim, hidden_dim=64):
        super().__init__()
        self.dim = dim

        # Dynamics network f(z, t); time enters as one extra input feature.
        self.net = nn.Sequential(
            nn.Linear(dim + 1, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, dim),
        )

    def forward(self, t, z):
        """Evaluate the vector field dz/dt = f(z, t) for a batch of z."""
        time_col = torch.ones(z.size(0), 1, device=z.device) * t
        return self.net(torch.cat([z, time_col], dim=1))

    def trace_estimator(self, z, t, noise=None):
        """Hutchinson estimate of Tr(βˆ‚f/βˆ‚z): E[Ξ΅^T (βˆ‚f/βˆ‚z) Ξ΅].

        `z` must require grad. If `noise` is None a fresh Gaussian probe
        is drawn, making the estimate stochastic but unbiased.
        """
        eps = torch.randn_like(z) if noise is None else noise

        f_val = self.forward(t, z)

        # One vector-Jacobian product gives Ξ΅^T (βˆ‚f/βˆ‚z); keep the graph so
        # the estimate itself stays differentiable.
        vjp = torch.autograd.grad(f_val, z, eps, create_graph=True)[0]

        return (vjp * eps).sum(dim=1)


class CNFTrainer:
    """
    Trainer for Continuous Normalizing Flows.

    Jointly integrates the state z(t) and the log-density change using the
    instantaneous change-of-variables formula:

        d log p(z(t)) / dt = -Tr(βˆ‚f/βˆ‚z)

    Didactic fixed-step Euler implementation; use torchdiffeq in practice.
    """

    def __init__(self, flow_model, optimizer):
        # flow_model must be callable as flow(t, z) -> dz/dt and expose
        # trace_estimator(z, t) (e.g. NeuralODEFlow above).
        self.flow = flow_model
        self.optimizer = optimizer

    def integrate_ode(self, z0, t_span, method='euler', n_steps=10):
        """
        Integrate dz/dt = f(z, t) with fixed-step Euler.

        Args:
            z0: initial state, shape (batch, dim).
            t_span: (t_start, t_end) tuple.
            method: kept for API compatibility; only 'euler' is implemented.
            n_steps: number of Euler steps (default 10, as before).

        Returns:
            (z_final, log_det) where log_det = -∫ Tr(βˆ‚f/βˆ‚z) dt, i.e. the
            accumulated CHANGE in log-density along the trajectory.
        """
        z = z0
        log_p = torch.zeros(z0.size(0), device=z0.device)

        dt = (t_span[1] - t_span[0]) / n_steps
        t = t_span[0]

        for _ in range(n_steps):
            # trace_estimator takes a VJP w.r.t. z, so z must require grad.
            z.requires_grad_(True)

            dz_dt = self.flow(t, z)
            trace = self.flow.trace_estimator(z, t)

            # Euler update of the state and of the log-density change.
            z = z + dz_dt * dt
            log_p = log_p - trace * dt

            t = t + dt
            # NOTE(review): detaching truncates backprop through the
            # trajectory - gradients flow only through each step's local
            # f / trace terms. Confirm this truncation is intended.
            z = z.detach()

        return z, log_p

    def train_step(self, x_data):
        """Run one optimization step; returns the scalar loss value.

        NOTE(review): the objective mixes a likelihood-style term with an
        MSE pulling integrated samples toward the data batch - unusual vs.
        standard CNF maximum likelihood (which integrates data x backwards
        to the base). Verify against the intended training scheme.
        """
        # Sample from the base distribution N(0, I).
        z0 = torch.randn_like(x_data)

        # Integrate forward in time: z(0) -> z(1).
        z1, log_det = self.integrate_ode(z0, (0.0, 1.0))

        # log p(z0) under the standard normal base.
        log_pz0 = -0.5 * (z0 ** 2).sum(dim=1) - 0.5 * z0.size(1) * np.log(2 * np.pi)

        # BUG FIX: integrate_ode returns Ξ”log p = -∫Tr dt, so
        # log p(z1) = log p(z0) + log_det. The original subtracted it,
        # flipping the sign of the divergence correction.
        log_pz1 = log_pz0 + log_det

        # Negative log-likelihood plus an MSE anchor to the data batch.
        loss = -log_pz1.mean() + F.mse_loss(z1, x_data)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()


# ============================================================================
# Demonstrations
# ============================================================================

print("=" * 70)
print("Normalizing Flows - Advanced Implementations")
print("=" * 70)

# 1. RealNVP
# Smoke test of RealNVP (defined earlier in this file): sample a base batch,
# push it forward, and evaluate the exact log-likelihood of the result.
# NOTE: z_test / x_test are reused by later demo sections below.
print("\n1. RealNVP (Affine Coupling):")
realnvp = RealNVP(dim=10, n_layers=6, hidden_dim=128)
z_test = torch.randn(4, 10)
x_test, log_det = realnvp(z_test)
log_px = realnvp.log_prob(x_test)
print(f"   Input shape: {z_test.shape}")
print(f"   Output shape: {x_test.shape}")
print(f"   Log determinant: {log_det.shape}")
print(f"   Log p(x): {log_px.shape}")
print(f"   Parameters: {sum(p.numel() for p in realnvp.parameters()):,}")

# 2. Glow
# Build the Glow stack and run one forward pass. NOTE: this first pass also
# triggers ActNorm's data-dependent initialization on z_test.
print("\n2. Glow (ActNorm + 1Γ—1Conv + Coupling):")
glow = Glow(input_dim=10, n_blocks=8, hidden_dim=128)
x_glow, log_det_glow = glow(z_test)
print(f"   Glow blocks: 8")
print(f"   Each block: ActNorm -> Inv1Γ—1Conv -> AffineCoupling")
print(f"   Parameters: {sum(p.numel() for p in glow.parameters()):,}")
print(f"   Forward pass: z -> x")
print(f"   Inverse pass: x -> z (for log p(x))")

# 3. MAF
# Evaluate exact log-likelihood of x_test under a small MAF stack.
print("\n3. MAF (Masked Autoregressive Flow):")
maf = MAF(input_dim=10, n_layers=3, hidden_dims=[64, 64])
log_px_maf = maf.log_prob(x_test)
print(f"   MADE layers: 3")
# BUG FIX: this line was an f-string; "{<i}" is not a valid replacement
# field, so the whole file failed to parse (SyntaxError). A plain string
# prints the intended literal braces.
print("   Autoregressive: x_i = f(x_{<i}, z_i)")
print(f"   Density eval: FAST (parallel)")
print(f"   Sampling: SLOW (sequential)")
print(f"   Parameters: {sum(p.numel() for p in maf.parameters()):,}")

# 4. Neural ODE Flow
# Instantiate the continuous-time flow; no ODE integration is run here,
# only a parameter count and a summary of the approach.
print("\n4. Neural ODE Flow (Continuous):")
ode_flow = NeuralODEFlow(dim=10, hidden_dim=32)
print(f"   Dynamics: dz/dt = f(z, t; ΞΈ)")
print(f"   Integration: 0 -> 1 (continuous time)")
print(f"   Trace: Hutchinson estimator (efficient)")
print(f"   Parameters: {sum(p.numel() for p in ode_flow.parameters()):,}")

# 5. Architecture comparison
# Static reference table contrasting the flow families (output only).
print("\n5. Architecture Comparison:")
print("   β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”")
print("   β”‚ Architecture β”‚ Sampling  β”‚ Density    β”‚ Jacobian     β”‚ Use Case β”‚")
print("   β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€")
print("   β”‚ RealNVP      β”‚ Fast βœ“    β”‚ Fast βœ“     β”‚ Triangular   β”‚ General  β”‚")
print("   β”‚ Glow         β”‚ Fast βœ“    β”‚ Fast βœ“     β”‚ Triangular   β”‚ Images   β”‚")
print("   β”‚ MAF          β”‚ Slow      β”‚ Fast βœ“βœ“    β”‚ Triangular   β”‚ Density  β”‚")
print("   β”‚ IAF          β”‚ Fast βœ“βœ“   β”‚ Slow       β”‚ Triangular   β”‚ VAE      β”‚")
print("   β”‚ Neural ODE   β”‚ Medium    β”‚ Medium     β”‚ Continuous   β”‚ Flexible β”‚")
print("   β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜")

# 6. Jacobian determinant comparison
# Explains why coupling layers make the log-det tractable (output only).
print("\n6. Jacobian Determinant Computation:")
print("   General matrix: O(DΒ³) - prohibitive")
print("   Triangular (coupling): O(D) - efficient βœ“")
print("   ")
print("   RealNVP coupling:")
print("   β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”")
print("   β”‚ I_d      0      β”‚  Determinant = exp(Ξ£ s_i)")
print("   β”‚ βˆ‚xβ‚‚/βˆ‚z₁  diag(s)β”‚  Cost: O(D)")
print("   β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜")

# 7. Invertibility verification
# Numerical round trip z -> x -> z through the RealNVP built in section 1;
# the max-abs error should be at floating-point noise level.
print("\n7. Invertibility Check:")
z_original = torch.randn(2, 10)
x_forward, _ = realnvp(z_original, reverse=False)
z_reconstructed, _ = realnvp(x_forward, reverse=True)
error = (z_original - z_reconstructed).abs().max().item()
print(f"   z_original -> x -> z_reconstructed")
print(f"   Max reconstruction error: {error:.2e}")
print(f"   Numerically invertible: {'βœ“' if error < 1e-4 else 'βœ—'}")

# 8. When to use guide
# Practical selection guide for the implemented flow families (output only).
print("\n8. When to Use Each Flow:")
print("   Use RealNVP when:")
print("     βœ“ Need exact likelihood")
print("     βœ“ Fast sampling important")
print("     βœ“ General-purpose generation")
print("\n   Use Glow when:")
print("     βœ“ Image generation (high-res)")
print("     βœ“ Need 1Γ—1 conv mixing")
print("     βœ“ Multi-scale architecture beneficial")
print("\n   Use MAF when:")
print("     βœ“ Density estimation primary goal")
print("     βœ“ Sampling speed not critical")
print("     βœ“ Variational inference (VI)")
print("\n   Use Neural ODE when:")
print("     βœ“ Architectural flexibility needed")
print("     βœ“ No invertibility constraints")
print("     βœ“ Continuous dynamics natural")

# 9. Exact likelihood advantage
# Summary of the likelihood semantics across generative model families
# (output only; mirrors the table in the notebook header).
print("\n9. Exact Likelihood vs Bounds:")
print("   Flow:  log p(x) = log p(z) - log|det J|  [EXACT]")
print("   VAE:   log p(x) β‰₯ ELBO                   [LOWER BOUND]")
print("   GAN:   No likelihood                     [IMPLICIT]")
print("   ")
print("   Flows enable:")
print("     β€’ Principled maximum likelihood training")
print("     β€’ Exact density for anomaly detection")
print("     β€’ Lossless compression via entropy coding")

print("\n" + "=" * 70)