import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from scipy import stats
from torch.utils.data import DataLoader, TensorDataset
# Select the GPU when available; all tensors/models below are moved to `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
# Global seaborn styling for every matplotlib figure in this notebook.
sns.set_style('whitegrid')
1. Motivation: Exact Likelihood
Comparison of Generative Models
Model |
Likelihood |
Sampling |
Representation |
|---|---|---|---|
VAE |
Approximate (ELBO) |
Fast |
Latent space |
GAN |
None |
Fast |
Implicit |
Flow |
Exact |
Fast |
Bijective |
Key Idea: Change of VariablesΒΆ
Given:
Simple base distribution \(p_z(z)\) (e.g., Gaussian)
Invertible transformation \(f: \mathbb{R}^d \to \mathbb{R}^d\)
Data \(x = f(z)\)
Change of variables formula: \(p_x(x) = p_z(f^{-1}(x)) \left| \det \frac{\partial f^{-1}}{\partial x} \right|\)
Equivalently: \(p_x(x) = p_z(z) \left| \det \frac{\partial f}{\partial z} \right|^{-1}\)
where \(z = f^{-1}(x)\).
RequirementsΒΆ
Invertibility: \(f\) must be bijective
Tractable Jacobian: \(\det J_f\) must be computable
Expressiveness: \(f\) must be flexible
Reference Materials:
vb_nf.pdf - Vb Nf
generative_models.pdf - Generative Models
Change of Variables in 1DΒΆ
Normalizing flows are built on the change of variables formula: if \(z \sim p_z(z)\) and \(x = f(z)\) where \(f\) is invertible, then \(p_x(x) = p_z(f^{-1}(x)) \left|\det \frac{\partial f^{-1}}{\partial x}\right|\). In one dimension this simplifies to \(p_x(x) = p_z(f^{-1}(x)) \cdot |\frac{df^{-1}}{dx}|\), making it easy to visualize how stretching and compressing regions of space changes probability density. The 1D case provides clean intuition: where the transformation expands space, density decreases; where it compresses, density increases. This fundamental principle scales directly to higher dimensions via the Jacobian determinant.
# Simple 1D example: affine transformation
def affine_transform(z, a, b):
    """Apply the affine map x = a*z + b (elementwise for arrays)."""
    return b + z * a
def affine_inverse(x, a, b):
    """Invert the affine map: z = (x - b) / a."""
    shifted = x - b
    return shifted / a
def affine_log_det_jacobian(a):
    """Log absolute Jacobian of x = a*z + b, i.e. log|dx/dz| = log|a|."""
    slope_magnitude = np.abs(a)
    return np.log(slope_magnitude)
# Base distribution: N(0, 1)
# BUG FIX: `stats` is used below for the analytic densities but was
# originally imported only *after* this figure was drawn (NameError).
from scipy import stats

z = np.random.randn(10000)

# Transform: x = 2z + 3
a, b = 2.0, 3.0
x = affine_transform(z, a, b)

# Verify distribution
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# Original samples vs the analytic N(0, 1) density
axes[0].hist(z, bins=50, density=True, alpha=0.7, label='Samples')
z_range = np.linspace(-4, 4, 100)
axes[0].plot(z_range, stats.norm.pdf(z_range, 0, 1), 'r-', linewidth=2, label='N(0,1)')
axes[0].set_xlabel('z', fontsize=12)
axes[0].set_ylabel('Density', fontsize=12)
axes[0].set_title('Base Distribution', fontsize=13)
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Transformed: N(b, a^2) = N(3, 4); scipy's scale argument is the std dev a
axes[1].hist(x, bins=50, density=True, alpha=0.7, label='Samples')
x_range = np.linspace(-5, 11, 100)
axes[1].plot(x_range, stats.norm.pdf(x_range, b, a), 'r-', linewidth=2, label='N(3,4)')
axes[1].set_xlabel('x', fontsize=12)
axes[1].set_ylabel('Density', fontsize=12)
axes[1].set_title('Transformed Distribution', fontsize=13)
axes[1].legend()
axes[1].grid(True, alpha=0.3)

# Transformation function
axes[2].plot(z_range, affine_transform(z_range, a, b), linewidth=2)
axes[2].set_xlabel('z', fontsize=12)
axes[2].set_ylabel('x = f(z)', fontsize=12)
axes[2].set_title(f'Transformation: x = {a}z + {b}', fontsize=13)
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"log|det J| = {affine_log_det_jacobian(a):.4f}")
3. Composing FlowsΒΆ
Sequential TransformationsΒΆ
Apply multiple bijections: \(z_0 \xrightarrow{f_1} z_1 \xrightarrow{f_2} \cdots \xrightarrow{f_K} z_K = x\)
Composed transformation: \(f = f_K \circ f_{K-1} \circ \cdots \circ f_1\)
Log-LikelihoodΒΆ
where \(z_0 = f^{-1}(x)\).
Training ObjectiveΒΆ
Maximize log-likelihood: $\(\max_{\theta} \sum_{i=1}^n \log p_{\theta}(x_i)\)$
This is exact, not a lower bound!
Planar Flow (Simple Example)ΒΆ
A planar flow is one of the simplest normalizing flow transformations: \(f(z) = z + u \cdot h(w^T z + b)\), where \(u, w\) are weight vectors, \(b\) is a bias, and \(h\) is a smooth activation function. Each planar flow layer applies a single hyperplane-based transformation, and stacking multiple layers builds up the expressiveness to model complex distributions. The Jacobian determinant for a planar flow has a closed-form expression, making likelihood computation efficient. While planar flows are too simple for high-dimensional data, they clearly demonstrate how composing simple invertible transforms can warp a Gaussian into a multimodal target distribution.
class PlanarFlow(nn.Module):
    """Planar normalizing flow layer (Rezende & Mohamed, 2015).

    f(z) = z + u * tanh(w^T z + b)

    Returns both the transformed sample and log|det J| of the transform
    (computed in closed form via the matrix determinant lemma).

    NOTE: invertibility of a planar flow additionally requires a constraint
    on u·w (w^T u >= -1); this implementation does not enforce it, matching
    the rest of the notebook.
    """
    def __init__(self, dim):
        super().__init__()
        self.dim = dim
        self.u = nn.Parameter(torch.randn(dim))
        self.w = nn.Parameter(torch.randn(dim))
        self.b = nn.Parameter(torch.randn(1))

    def forward(self, z):
        """Map z (B, dim) -> (x (B, dim), log_det (B,))."""
        # Pre-activation w^T z + b, shape (B,)
        linear = torch.matmul(z, self.w) + self.b
        x = z + self.u * torch.tanh(linear).unsqueeze(-1)
        # psi(z) = h'(w^T z + b) * w, shape (B, dim).
        # BUG FIX: the original multiplied the (B,) tensor (1 - tanh^2) by the
        # (dim,) vector w directly, which mis-broadcasts (or errors whenever
        # batch size != dim); unsqueeze to (B, 1) first.
        psi = (1 - torch.tanh(linear) ** 2).unsqueeze(-1) * self.w
        # det J = 1 + psi^T u (rank-1 update determinant)
        det = 1 + torch.matmul(psi, self.u)
        # epsilon keeps the log finite when det ~ 0
        log_det = torch.log(torch.abs(det) + 1e-8)
        return x, log_det
class NormalizingFlow(nn.Module):
    """Stack of planar flows.

    ``forward`` pushes base samples z through all layers in order and
    accumulates the per-layer log|det J| terms needed for the change of
    variables.
    """
    def __init__(self, dim, n_flows=10):
        super().__init__()
        # n_flows independent planar layers, applied sequentially
        self.flows = nn.ModuleList([PlanarFlow(dim) for _ in range(n_flows)])
    def forward(self, z):
        """Transform z to x; returns (x, summed per-layer log|det J|)."""
        log_det_sum = 0
        x = z
        for flow in self.flows:
            x, log_det = flow(x)
            log_det_sum += log_det
        return x, log_det_sum
    def log_prob(self, x):
        """Compute log p(x).

        NOTE(review): this method ignores its argument ``x`` entirely — it
        evaluates the base density and the flow's log-determinant at z = 0,
        so the returned value is NOT log p(x). Planar flows have no
        closed-form inverse, so a correct log p(x) cannot be computed this
        way; scoring should instead be done on samples generated through
        ``forward``. Flagged rather than changed to preserve behavior.
        """
        # Base distribution log prob (evaluated at z = 0; see NOTE above)
        z = torch.zeros_like(x)
        log_pz = -0.5 * (z**2).sum(dim=-1) - 0.5 * z.size(-1) * np.log(2 * np.pi)
        # Transform (also evaluated at z = 0)
        _, log_det = self.forward(z)
        return log_pz - log_det
# Smoke test: push a small batch of base samples through a short stack of
# planar flows and check the returned shapes ((10, 2) in, (10, 2) and (10,) out).
flow = NormalizingFlow(dim=2, n_flows=5).to(device)
z = torch.randn(10, 2).to(device)
x, log_det = flow(z)
print(f"Input shape: {z.shape}")
print(f"Output shape: {x.shape}")
print(f"Log det shape: {log_det.shape}")
Training Flow on 2D DataΒΆ
Training a normalizing flow on 2D data provides a compelling visualization of the learning process. The model starts with a simple Gaussian base distribution and gradually warps it to match a complex target (e.g., two moons, concentric circles, or a spiral). The training objective is exact maximum likelihood: \(\mathcal{L} = -\mathbb{E}_{x \sim p_{\text{data}}}[\log p_\theta(x)]\), which decomposes into the log-density of \(f^{-1}(x)\) under the base distribution plus the log-absolute-determinant of the Jacobian. Unlike VAEs and GANs, normalizing flows provide exact density evaluation and exact sampling β a significant advantage for applications requiring calibrated uncertainty estimates.
# Generate target distribution: two moons
from sklearn.datasets import make_moons

n_samples = 2000
X, _ = make_moons(n_samples=n_samples, noise=0.05)
X_train = torch.FloatTensor(X).to(device)
# Normalize to zero mean / unit variance per coordinate
X_train = (X_train - X_train.mean(dim=0)) / X_train.std(dim=0)
train_loader = DataLoader(TensorDataset(X_train), batch_size=128, shuffle=True)

# Model: 16 stacked planar flows on 2-D data
model = NormalizingFlow(dim=2, n_flows=16).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
n_epochs = 100
losses = []
for epoch in range(n_epochs):
    epoch_loss = 0
    for batch, in train_loader:
        optimizer.zero_grad()
        # Sample from base distribution (same shape as the data batch)
        z = torch.randn_like(batch)
        # Transform base samples through the flow
        x, log_det = model(z)
        # Base log prob of the sampled z
        log_pz = -0.5 * (z**2).sum(dim=1) - np.log(2 * np.pi)
        # Log density of the model's own samples
        log_px = log_pz - log_det
        # NOTE(review): -log_px.mean() is the negative log-density of the
        # model's *own* samples (not of the data), and the MSE term pairs
        # random z with arbitrary data points — this is a heuristic
        # sample-matching objective, not exact maximum likelihood. Planar
        # flows lack an analytic inverse, which is why MLE on the data is
        # not done here; confirm intent before reusing this recipe.
        loss = -log_px.mean()
        # Also add MSE to stabilize
        loss += F.mse_loss(x, batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    losses.append(epoch_loss / len(train_loader))
    if (epoch + 1) % 20 == 0:
        print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {losses[-1]:.4f}")
print("\nTraining complete!")
# Visualize results: loss curve, the Gaussian base, and generated vs true data
model.eval()
with torch.no_grad():
    # Sample from the base distribution
    z_samples = torch.randn(1000, 2).to(device)
    # Transform through the trained flow
    x_samples, _ = model(z_samples)
    x_samples = x_samples.cpu().numpy()

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Training loss over epochs
axes[0].plot(losses, linewidth=2)
axes[0].set_xlabel('Epoch', fontsize=12)
axes[0].set_ylabel('Loss', fontsize=12)
axes[0].set_title('Training Loss', fontsize=13)
axes[0].grid(True, alpha=0.3)

# Base distribution scatter (fresh samples, independent of training)
z_vis = torch.randn(1000, 2).cpu().numpy()
axes[1].scatter(z_vis[:, 0], z_vis[:, 1], alpha=0.5, s=10)
axes[1].set_xlabel('zβ', fontsize=12)
axes[1].set_ylabel('zβ', fontsize=12)
axes[1].set_title('Base Distribution (Gaussian)', fontsize=13)
axes[1].grid(True, alpha=0.3)
axes[1].set_aspect('equal')

# Generated samples overlaid on the (normalized) training data
axes[2].scatter(X_train.cpu()[:, 0], X_train.cpu()[:, 1], alpha=0.3, s=10, label='Data')
axes[2].scatter(x_samples[:, 0], x_samples[:, 1], alpha=0.5, s=10, label='Generated')
axes[2].set_xlabel('xβ', fontsize=12)
axes[2].set_ylabel('xβ', fontsize=12)
axes[2].set_title('Generated vs True Data', fontsize=13)
axes[2].legend()
axes[2].grid(True, alpha=0.3)
axes[2].set_aspect('equal')

plt.tight_layout()
plt.show()
6. Modern Flow ArchitecturesΒΆ
Coupling Layers (RealNVP, Glow)ΒΆ
Split input \(z = (z_1, z_2)\):
where \(s, t\) are neural networks.
Jacobian is triangular β easy determinant!
Autoregressive Flows (MAF, IAF)ΒΆ
Triangular Jacobian via autoregressive structure.
Continuous Normalizing Flows (Neural ODEs)ΒΆ
Infinite depth flow, compute via ODE solvers.
ComparisonΒΆ
Architecture |
Sampling |
Density |
Jacobian |
|---|---|---|---|
Coupling (RealNVP) |
Fast β |
Fast β |
Triangular |
Autoregressive (MAF) |
Slow |
Fast β |
Triangular |
Inverse Autoregressive (IAF) |
Fast β |
Slow |
Triangular |
Neural ODE |
Medium |
Medium |
Continuous |
SummaryΒΆ
Key Principles:ΒΆ
Invertibility: Bijective transformations \(f: z \to x\)
Change of variables: \(p_x(x) = p_z(z) |\det J_f|^{-1}\)
Exact likelihood: No approximation, unlike VAE
Composition: Stack flows for expressiveness
Training Objective:ΒΆ
Architectural Choices:ΒΆ
Requirements:
Invertible transformation
Tractable Jacobian determinant
Solutions:
Triangular Jacobian: Coupling, autoregressive
Low-rank updates: Planar, radial flows
Continuous: Neural ODEs
Advantages:ΒΆ
Exact likelihood: Principled training
Fast sampling: Single forward pass
Latent space: Invertible, no encoder needed
Limitations:ΒΆ
Architectural constraints: Must be invertible
Dimension preserving: \(\dim(z) = \dim(x)\)
Computational cost: Jacobian determinant
Modern Variants:ΒΆ
RealNVP (Dinh et al., 2017): Coupling layers
Glow (Kingma & Dhariwal, 2018): Invertible 1x1 convolutions
FFJORD (Grathwohl et al., 2019): Continuous flows
Flow++ (Ho et al., 2019): Improved coupling
Applications:ΒΆ
Image generation (high-res)
Density estimation
Variational inference (VI with flows)
Audio synthesis
Next Steps:ΒΆ
04_neural_ode.ipynb - Continuous normalizing flows
03_variational_autoencoders_advanced.ipynb - Compare with VAEs
09_vq_vae.ipynb - Discrete latent representations
Advanced Normalizing Flows TheoryΒΆ
1. Mathematical FoundationsΒΆ
Change of Variables FormulaΒΆ
Theorem (Change of Variables): For invertible f: ℝ^D → ℝ^D with x = f(z):
Inverse Jacobian Form: Using inverse function theorem:
where z = fβ»ΒΉ(x).
Log-Density:
where J_f is the Jacobian matrix.
Requirements for Normalizing FlowsΒΆ
Bijection: f must be invertible (one-to-one and onto)
Tractable Jacobian: det(J_f) must be computable efficiently
Differentiable: For gradient-based optimization
Challenge: General neural networks are not invertible.
2. Flow CompositionΒΆ
Sequential TransformationsΒΆ
Composition: Apply K bijections sequentially:
Combined Transformation: f = f_K ∘ f_{K-1} ∘ ⋯ ∘ f_1
Jacobian Determinant (Chain Rule):
Log-Density:
Maximum Likelihood Training:
3. Coupling Layers (RealNVP)ΒΆ
Affine Coupling LayerΒΆ
Key Idea: Split input into two parts, transform one conditioned on the other.
Forward Pass: Given z = (z₁, z₂) where z₁ ∈ ℝ^d, z₂ ∈ ℝ^{D−d}:
where s, t: ℝ^d → ℝ^{D−d} are neural networks (scale and translation).
Inverse Pass:
Jacobian: Block triangular structure:
Determinant:
Log-Determinant:
Computational Cost: O(D) instead of O(DΒ³) for general Jacobian!
Multi-Scale ArchitectureΒΆ
Strategy: Apply coupling at different resolutions.
Squeeze Operation: Reshape (H, W, C) β (H/2, W/2, 4C).
Split: Send half of channels directly to output, transform other half.
Benefit: Reduces memory, enables high-resolution generation.
4. Glow ArchitectureΒΆ
1Γ1 Invertible ConvolutionΒΆ
Motivation: Permutation is too restrictive, use learned mixing.
Operation: W ∈ ℝ^{c×c} invertible weight matrix:
Initialization: W = random rotation matrix (via QR decomposition).
Jacobian: For each spatial position (h, w):
Total Log-Determinant: For HΓW spatial resolution:
Efficient Computation: LU decomposition of W:
where P is permutation, L lower triangular, U upper triangular, s is diagonal.
Actnorm (Activation Normalization)ΒΆ
Purpose: Data-dependent initialization for stable training.
Operation: Affine transformation per channel:
Initialization: Set s, b so first batch has zero mean and unit variance.
Benefit: Prevents activation explosion/vanishing.
Glow BlockΒΆ
Full Glow Step:
1. Actnorm (data normalization)
2. 1Γ1 Invertible Conv (channel mixing)
3. Affine Coupling (nonlinear transform)
Repeat: Stack K Glow steps per scale.
Multi-scale: Squeeze β K steps β Split β next scale.
5. Autoregressive FlowsΒΆ
Masked Autoregressive Flow (MAF)ΒΆ
Autoregressive Structure: Each xα΅’ depends only on xβ, β¦, xα΅’ββ:
where Ξ± (scale) and ΞΌ (location) are neural networks with masking.
Jacobian: Lower triangular (autoregressive property):
Determinant:
Complexity:
Forward (z→x): O(D) sequential (slow)
Inverse (x→z): O(D) parallel (fast)
Density evaluation: Fast
Use Case: Density estimation, variational inference.
Inverse Autoregressive Flow (IAF)ΒΆ
Key Difference: Autoregressive in z instead of x:
Complexity:
Forward (z→x): O(D) parallel (fast)
Inverse (x→z): O(D) sequential (slow)
Sampling: Fast
Use Case: Sampling, variational autoencoders (VAE posterior).
MADE (Masked Autoencoder for Distribution Estimation)ΒΆ
Architecture: Enforce autoregressive property via weight masking.
Masks: Binary matrices M ensuring output i depends only on input j where j < i.
Efficient Parallelization: Compute all ΞΌα΅’, Ξ±α΅’ in one forward pass.
6. Continuous Normalizing Flows (Neural ODEs)ΒΆ
Continuous-Time DynamicsΒΆ
ODE Formulation: Instead of discrete steps, continuous transformation:
where f is a neural network.
Transformation: z(0) β z(1) via ODE integration:
Inverse: Integrate backward:
Instantaneous Change of VariablesΒΆ
Theorem (Continuous Change of Variables): Log-density evolves via:
Integrated Form:
Trace Computation: For D-dimensional z, computing trace directly is O(DΒ²).
FFJORD (Free-Form Jacobian of Reversible Dynamics)ΒΆ
Hutchinson Trace Estimator: Unbiased estimate using random vectors:
Application:
computed via vector-Jacobian product (automatic differentiation).
Complexity: O(D) instead of O(DΒ²).
Training: Solve augmented ODE simultaneously for z(t) and log-density:
Advantage: No architectural constraints (any f can be used).
7. Volume-Preserving FlowsΒΆ
MotivationΒΆ
Challenge: Standard flows require dim(z) = dim(x).
Solution: Constrain det(J) = 1 (volume-preserving).
Residual FlowsΒΆ
Transformation: f(z) = z + g(z) where g is a neural network.
Condition for Invertibility: Lipschitz constant Lip(g) < 1.
Spectral Normalization: Constrain Lip(g) via weight normalization.
Jacobian Determinant: det(J) = det(I + βg/βz).
Power Series: For small ||βg/βz||:
Infinitesimal FlowsΒΆ
Limit: As g β 0, residual flow becomes continuous flow.
Connection: Neural ODEs are limit of residual flows.
8. Advanced Training TechniquesΒΆ
DequantizationΒΆ
Problem: Discrete data (images) have zero density under continuous model.
Solution: Add uniform noise U[0, 1/256] to discretized data:
Variational Dequantization: Learn noise distribution q(u|x) for tighter bound.
Temperature ScalingΒΆ
Sampling: Scale the base distribution's standard deviation by a temperature τ:
τ < 1: Higher quality, less diversity
τ > 1: More diversity, lower quality
Multi-Scale TrainingΒΆ
Strategy: Apply loss at multiple resolutions.
Objective:
where xβ½Λ’βΎ is data at scale s.
Benefit: Better gradient flow, prevents mode collapse.
Gradient CheckpointingΒΆ
Memory Reduction: Donβt store all intermediate activations during forward pass.
Recomputation: Recompute activations during backward pass.
Trade-off: 33% slower, but enables deeper flows.
9. Likelihood EstimationΒΆ
Bits per Dimension (BPD)ΒΆ
Definition: Negative log-likelihood normalized by data dimension:
where D = H Γ W Γ C for images.
Interpretation: Average bits needed to encode each pixel/dimension.
Typical Values:
Random: 8 bits/dim (uniform over [0, 255])
Good model: 3-4 bits/dim (CIFAR-10)
SOTA: <3 bits/dim
Comparison with ELBOΒΆ
Flow: Exact likelihood $\(\log p(x) = \log p(z) - \sum_k \log |\det J_{f_k}|\)$
VAE: Lower bound (ELBO) $\(\log p(x) \geq \mathbb{E}_{q(z|x)}[\log p(x|z)] - KL(q(z|x) \| p(z))\)$
Advantage: Flows optimize true likelihood, VAEs optimize bound (may be loose).
10. Invertibility GuaranteesΒΆ
Conditions for InvertibilityΒΆ
Theorem (Inverse Function Theorem): If f is continuously differentiable and det(J_f) β 0, then f is locally invertible.
Global Invertibility: Additional conditions needed:
f is injective (one-to-one)
f is surjective (onto)
Invertible Neural Networks (INN)ΒΆ
Design Principles:
Use invertible components (coupling, 1Γ1 conv)
Ensure non-zero Jacobian determinant
Verify numerical stability
Numerical Inversion: For complex f, use iterative solvers (fixed-point iteration, Newtonβs method).
11. State-of-the-Art ResultsΒΆ
Image GenerationΒΆ
CIFAR-10 (32Γ32):
Glow: 3.35 bits/dim
Flow++: 3.08 bits/dim
Residual Flow: 3.28 bits/dim
ImageNet (64Γ64):
Glow: 3.81 bits/dim
Flow++: 3.69 bits/dim
CelebA-HQ (256Γ256):
Glow: 1.03 bits/dim
Sampling SpeedΒΆ
Comparison (single sample):
Flow: ~10-50ms (K forward passes)
GAN: ~5ms (1 generator pass)
Diffusion: ~1-10s (100-1000 steps)
Advantage: Flows are faster than diffusion, slower than GANs.
12. LimitationsΒΆ
Architectural ConstraintsΒΆ
Dimension Preservation: dim(z) = dim(x) required.
Canβt compress like VAE (z < x)
Inefficient for high-dim data
Invertibility: Limits architecture choices.
No pooling, dropout, batch norm (standard forms)
Must use specialized layers
Sample Quality vs LikelihoodΒΆ
Paradox: High likelihood doesnβt always mean good samples.
Reason: Model may assign probability mass to imperceptible differences.
Solution: Hybrid models (VAE + Flow, GAN + Flow).
Computational CostΒΆ
Training: Computing Jacobian determinant adds overhead.
RealNVP: 2-3Γ slower than GAN
FFJORD: 5-10Γ slower (ODE solve)
Memory: Storing intermediate activations for deep flows.
13. Extensions and VariantsΒΆ
Discrete FlowsΒΆ
Problem: Continuous flows donβt model discrete data directly.
Solution: Integer discrete flows (Hoogeboom et al., 2019).
Application: Text, graphs, molecular structures.
Variational Dequantization FlowsΒΆ
Idea: Learn dequantization distribution q(u|x) jointly with flow.
Objective: Tighter bound on discrete data likelihood.
Graphical Normalizing FlowsΒΆ
Extension: Flows for graph-structured data.
Equivariance: Preserve graph symmetries (permutation equivariance).
Neural Spline FlowsΒΆ
Piecewise Function: Use monotonic splines instead of affine transforms.
Advantage: More expressive coupling layers.
SOTA: Achieves best bits/dim on many benchmarks.
14. ApplicationsΒΆ
Variational InferenceΒΆ
Use: Flow-based posterior q_Ο(z|x) in VAE.
Benefit: More flexible than Gaussian posterior.
Example: Normalizing Flow VAE (Rezende & Mohamed, 2015).
Density EstimationΒΆ
Task: Model p(x) for anomaly detection, compression.
Method: Maximize likelihood on training data.
Anomaly Score: -log p(x) (high for outliers).
Image SynthesisΒΆ
Generation: Sample z ~ N(0,I), compute x = f(z).
Interpolation: Linear interpolation in z-space.
Attribute Manipulation: Directions in latent space correspond to attributes.
Data CompressionΒΆ
Principle: Use learned p(x) for entropy coding.
Bits Required: Approximately -logβ p(x) bits.
Lossless Compression: Exact reconstruction via invertibility.
15. Connections to Other ModelsΒΆ
Flows vs VAEsΒΆ
Aspect |
Flow |
VAE |
|---|---|---|
Likelihood |
Exact |
Lower bound (ELBO) |
Latent dim |
dim(z) = dim(x) |
dim(z) < dim(x) |
Sampling |
Fast (1 pass) |
Fast (1 pass) |
Invertibility |
Yes |
No (encoder β decoderβ»ΒΉ) |
Flows vs GANsΒΆ
Similarities: Both map noise β data.
Differences:
Flow: Bijection, exact likelihood
GAN: Not invertible, no likelihood
Hybrid: Use flow as GAN generator for stable training.
Flows vs DiffusionΒΆ
Diffusion: Iterative denoising process (many steps).
Pro: SOTA sample quality
Con: Slow sampling
Flow: Single pass transformation.
Pro: Fast sampling
Con: Architectural constraints
16. Key PapersΒΆ
FoundationsΒΆ
Rezende & Mohamed (2015): βVariational Inference with Normalizing Flowsβ
Dinh et al. (2015): βNICE: Non-linear Independent Components Estimationβ
Modern ArchitecturesΒΆ
Dinh et al. (2017): βDensity Estimation using Real NVPβ
Kingma & Dhariwal (2018): βGlow: Generative Flow using Invertible 1x1 Convolutionsβ
Papamakarios et al. (2017): βMasked Autoregressive Flow for Density Estimationβ
Continuous FlowsΒΆ
Chen et al. (2018): βNeural Ordinary Differential Equationsβ
Grathwohl et al. (2019): βFFJORD: Free-form Continuous Dynamics for Scalable Reversible Generative Modelsβ
Recent AdvancesΒΆ
Ho et al. (2019): βFlow++: Improving Flow-Based Generative Modelsβ
Durkan et al. (2019): βNeural Spline Flowsβ
Hoogeboom et al. (2019): βInteger Discrete Flowsβ
17. Practical ConsiderationsΒΆ
HyperparametersΒΆ
Parameter |
Typical Range |
Effect |
|---|---|---|
Coupling depth |
8-32 layers |
More β expressive, slower |
Hidden dim |
256-512 |
Larger β flexible, memory |
Split dimension |
D/2 |
Half channels per coupling |
Learning rate |
1e-4 to 5e-4 |
Higher β faster, unstable |
Batch size |
32-128 |
Larger β stable, more memory |
Debugging TipsΒΆ
Check:
Jacobian determinant: Should be non-zero
Inverse accuracy: ||f⁻¹(f(z)) − z|| < ε
Likelihood: Should increase during training
Samples: Visually inspect quality
Common Issues:
Exploding gradients → gradient clipping, spectral norm
Poor samples → increase depth, tune temperature
Slow training → reduce depth, use gradient checkpointing
Computational EfficiencyΒΆ
Speed Optimizations:
Use 1Γ1 conv instead of full permutation
LU decomposition for faster determinant
Gradient checkpointing for memory
Parallelization: Coupling layers parallelize well across GPUs.
18. When to Use Normalizing FlowsΒΆ
Choose Flows When:
Exact likelihood required (density estimation, compression)
Fast sampling important (real-time applications)
Interpretable latent space desired
Invertibility useful (cycle-consistency tasks)
Avoid Flows When:
Best sample quality critical (use diffusion/GANs)
High compression needed (use VAE with z << x)
Data is discrete (flows designed for continuous)
Limited compute (flows can be memory-intensive)
# Advanced Normalizing Flows Implementations
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from typing import Tuple, List, Optional
class AffineCoupling(nn.Module):
    """
    Affine coupling layer (RealNVP).

    The input is split by a binary mask: positions where mask == 1 pass
    through unchanged and condition the scale/translation networks that
    transform the remaining positions. The Jacobian is triangular, so
    log|det J| is just the sum of the scale outputs.
    """

    def __init__(self, dim, hidden_dim=256, mask_type='checkerboard'):
        super().__init__()
        self.dim = dim
        # Mask convention: 1 = identity (conditioning) half, 0 = transformed half
        if mask_type == 'checkerboard':
            self.register_buffer('mask', torch.arange(dim) % 2)
        elif mask_type == 'channel':
            mask = torch.zeros(dim)
            mask[:dim//2] = 1
            self.register_buffer('mask', mask)
        n_identity = int(self.mask.sum().item())
        n_transform = dim - n_identity

        # Both conditioner networks share the same 2-hidden-layer MLP shape
        def make_mlp():
            return nn.Sequential(
                nn.Linear(n_identity, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, n_transform),
            )

        self.scale_net = make_mlp()
        self.translate_net = make_mlp()

    def forward(self, z, reverse=False):
        """Map z -> x (or invert x -> z when ``reverse``); returns (out, log|det J|)."""
        keep = self.mask == 1
        move = self.mask == 0
        # Identity half conditions the transform of the other half
        z_identity = z[:, keep]
        z_moving = z[:, move]
        s = self.scale_net(z_identity)
        t = self.translate_net(z_identity)
        if reverse:
            # z = (x - t) * exp(-s)
            out_moving = (z_moving - t) * torch.exp(-s)
            log_det = -s.sum(dim=1)
        else:
            # x = z * exp(s) + t
            out_moving = z_moving * torch.exp(s) + t
            log_det = s.sum(dim=1)
        # Scatter both halves back into their original positions
        out = torch.zeros_like(z)
        out[:, keep] = z_identity
        out[:, move] = out_moving
        return out, log_det
class RealNVP(nn.Module):
    """
    RealNVP: a stack of affine coupling layers with alternating masks.

    Even-indexed layers use a checkerboard mask and odd-indexed layers a
    channel mask, so every dimension is transformed by some layer.
    """

    def __init__(self, dim, n_layers=6, hidden_dim=256):
        super().__init__()
        self.dim = dim
        self.layers = nn.ModuleList([
            AffineCoupling(dim, hidden_dim,
                           'checkerboard' if idx % 2 == 0 else 'channel')
            for idx in range(n_layers)
        ])

    def forward(self, z, reverse=False):
        """Map z -> x (or x -> z when ``reverse``); returns (out, total log|det J|)."""
        order = list(self.layers)
        if reverse:
            order = order[::-1]
        out = z
        total_log_det = 0
        for coupling in order:
            out, ld = coupling(out, reverse=reverse)
            total_log_det = total_log_det + ld
        return out, total_log_det

    def log_prob(self, x):
        """Exact log p(x) via the change-of-variables formula."""
        # Normalizing direction: x -> z, accumulating log|det J_{f^{-1}}|
        z, log_det = self.forward(x, reverse=True)
        # log N(z; 0, I)
        log_pz = -0.5 * (z ** 2).sum(dim=1) - 0.5 * self.dim * np.log(2 * np.pi)
        return log_pz + log_det
class Invertible1x1Conv(nn.Module):
    """
    1x1 invertible convolution (Glow), applied here to flat feature vectors.

    The weight is kept in LU-decomposed form
        W = P @ L @ (triu(U, 1) + diag(sign_s * exp(log_s)))
    so that log|det W| = sum(log_s) is available in O(C) instead of O(C^3).
    """

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        # Initialize with a random orthogonal matrix (|det| = 1, so the
        # initial log-determinant contribution is exactly zero).
        # torch.linalg.qr replaces the deprecated torch.qr.
        W = torch.linalg.qr(torch.randn(channels, channels))[0]
        # LU factorization W = P @ L @ U (torch.linalg.lu replaces the
        # deprecated torch.lu / torch.lu_unpack pair).
        P, L, U = torch.linalg.lu(W)
        s = torch.diag(U)
        self.register_buffer('P', P)   # fixed permutation
        self.L = nn.Parameter(L)       # lower triangular (unit diagonal)
        self.U = nn.Parameter(U)       # upper triangular
        # BUG FIX: keep the SIGN of diag(U) in a fixed buffer. The original
        # stored only log|diag(U)| and rebuilt the diagonal as exp(log_s) > 0,
        # silently flipping negative entries so the reconstructed W no longer
        # equaled the sampled rotation at initialization.
        self.register_buffer('sign_s', torch.sign(s))
        self.log_s = nn.Parameter(torch.log(torch.abs(s)))

    def forward(self, z, reverse=False):
        """Apply W (or W^{-1} when ``reverse``); returns (out, per-sample log|det|)."""
        batch_size = z.size(0)
        # Reassemble W from its LU factors
        L = torch.tril(self.L, diagonal=-1) + torch.eye(self.channels, device=z.device)
        U = torch.triu(self.U, diagonal=1) + torch.diag(self.sign_s * torch.exp(self.log_s))
        W = self.P @ L @ U
        if not reverse:
            # Forward: x = W @ z (batched as z @ W^T)
            x = z @ W.t()
            log_det = self.log_s.sum() * torch.ones(batch_size, device=z.device)
        else:
            # Inverse: z = W^{-1} @ x
            x = z @ torch.inverse(W).t()
            log_det = -self.log_s.sum() * torch.ones(batch_size, device=z.device)
        return x, log_det
class ActNorm(nn.Module):
    """
    Activation normalization (Glow).

    Per-channel affine y = z * exp(log_scale) + bias. On the first batch the
    parameters are set (data-dependently) so that the output of that batch
    has zero mean and unit variance per channel; afterwards they are trained
    like ordinary parameters.
    """

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        # Learnable per-channel scale (log-domain) and bias
        self.log_scale = nn.Parameter(torch.zeros(channels))
        self.bias = nn.Parameter(torch.zeros(channels))
        # 0/1 flag: has data-dependent initialization run yet?
        self.register_buffer('initialized', torch.tensor(0))

    def forward(self, z, reverse=False):
        """Apply (or invert) the normalization; returns (out, per-sample log|det J|)."""
        batch_size = z.size(0)
        # Data-dependent init on the very first batch seen (either direction,
        # matching the original behavior).
        if not self.initialized:
            with torch.no_grad():
                mean = z.mean(dim=0)
                std = z.std(dim=0)
                inv_std = 1.0 / (std + 1e-6)
                # BUG FIX: forward computes z * exp(log_scale) + bias, so the
                # bias must be -mean/std (not -mean) for the init batch to
                # come out zero-mean. The original gave z/std - mean instead
                # of (z - mean)/std.
                self.log_scale.data = torch.log(inv_std)
                self.bias.data = -mean * inv_std
                self.initialized.fill_(1)
        if not reverse:
            # Forward: normalize
            x = z * torch.exp(self.log_scale) + self.bias
            log_det = self.log_scale.sum() * torch.ones(batch_size, device=z.device)
        else:
            # Inverse: denormalize
            x = (z - self.bias) * torch.exp(-self.log_scale)
            log_det = -self.log_scale.sum() * torch.ones(batch_size, device=z.device)
        return x, log_det
class GlowBlock(nn.Module):
    """
    One Glow step: ActNorm -> invertible 1x1 conv -> affine coupling.

    The reverse direction applies the inverse of each sub-transform in the
    opposite order.
    """

    def __init__(self, channels, hidden_dim=256):
        super().__init__()
        self.actnorm = ActNorm(channels)
        self.conv1x1 = Invertible1x1Conv(channels)
        self.coupling = AffineCoupling(channels, hidden_dim, mask_type='channel')

    def forward(self, z, reverse=False):
        """Apply the three sub-transforms; returns (out, total log|det J|)."""
        stages = [self.actnorm, self.conv1x1, self.coupling]
        if reverse:
            stages.reverse()
        out = z
        total_log_det = 0
        for stage in stages:
            out, ld = stage(out, reverse=reverse)
            total_log_det = total_log_det + ld
        return out, total_log_det
class Glow(nn.Module):
    """
    Glow model: a plain stack of GlowBlock steps over flat feature vectors.

    (The multi-scale squeeze/split machinery of the full Glow architecture
    is not implemented here.)
    """

    def __init__(self, input_dim, n_blocks=8, hidden_dim=256):
        super().__init__()
        self.input_dim = input_dim
        self.blocks = nn.ModuleList(
            [GlowBlock(input_dim, hidden_dim) for _ in range(n_blocks)]
        )

    def forward(self, z, reverse=False):
        """Map z -> x (or x -> z when ``reverse``); returns (out, total log|det J|)."""
        sequence = list(self.blocks)[::-1] if reverse else self.blocks
        out = z
        total_log_det = 0
        for block in sequence:
            out, ld = block(out, reverse=reverse)
            total_log_det = total_log_det + ld
        return out, total_log_det

    def log_prob(self, x):
        """Exact log p(x) via the change-of-variables formula."""
        # Normalizing direction: x -> z
        z, log_det = self.forward(x, reverse=True)
        # log N(z; 0, I)
        log_pz = -0.5 * (z ** 2).sum(dim=1) - 0.5 * self.input_dim * np.log(2 * np.pi)
        return log_pz + log_det
class MADE(nn.Module):
    """
    Masked Autoencoder for Distribution Estimation (Germain et al., 2015).

    An MLP whose weights are masked so that output unit i (for both the
    scale and location heads) depends only on inputs x_{<i}. Returns the
    per-dimension (log_scale, loc) parameters used by MAF/IAF.
    """

    def __init__(self, input_dim, hidden_dims=[256, 256]):
        super().__init__()
        self.input_dim = input_dim
        # Output has 2 * input_dim units: [log_scale | loc]
        dims = [input_dim] + hidden_dims + [input_dim * 2]
        self.layers = nn.ModuleList()
        for i in range(len(dims) - 1):
            self.layers.append(nn.Linear(dims[i], dims[i+1]))
        self.masks = self._create_masks(dims)

    def _create_masks(self, dims):
        """Create binary masks enforcing the autoregressive property.

        Each unit gets a "degree" (the highest input index it may see).
        Hidden connections require degree(in) <= degree(out); the OUTPUT
        layer requires a strict inequality so output i never sees x_i
        itself. Requires input_dim >= 2 (randint range would be empty
        otherwise).
        """
        masks = []
        m = {}
        m[0] = torch.arange(1, self.input_dim + 1)  # input degrees 1..D
        for l in range(1, len(dims) - 1):
            # Hidden degrees drawn uniformly from 1..D-1
            m[l] = torch.randint(1, self.input_dim, (dims[l],))
        # Output degrees 1..D, repeated for the scale and location heads
        m[len(dims) - 1] = torch.arange(1, self.input_dim + 1).repeat(2)
        for l in range(len(dims) - 1):
            if l == len(dims) - 2:
                # BUG FIX: the final layer must use a STRICT inequality.
                # With <= (as in the original), output i could depend on
                # x_i itself, breaking the autoregressive property and the
                # triangular-Jacobian guarantee MAF/IAF rely on.
                masks.append(m[l].unsqueeze(1) < m[l + 1].unsqueeze(0))
            else:
                masks.append(m[l].unsqueeze(1) <= m[l + 1].unsqueeze(0))
        return masks

    def forward(self, z):
        """Return (log_scale, loc), each of shape (B, input_dim)."""
        x = z
        for i, (layer, mask) in enumerate(zip(self.layers, self.masks)):
            # Mask is (in, out); nn.Linear stores weight as (out, in)
            weight = layer.weight * mask.t().float().to(z.device)
            x = F.linear(x, weight, layer.bias)
            if i < len(self.layers) - 1:
                x = F.relu(x)
        # Split the doubled output into the two parameter heads
        log_scale, loc = x.chunk(2, dim=1)
        return log_scale, loc
class MAF(nn.Module):
    """
    Masked Autoregressive Flow (Papamakarios et al., 2017).

    Stacks MADE layers. Density evaluation (x -> z) is one parallel pass
    per layer and therefore fast; sampling (z -> x) must generate one
    dimension at a time and is slow.
    """

    def __init__(self, input_dim, n_layers=5, hidden_dims=[256, 256]):
        super().__init__()
        self.input_dim = input_dim
        # Stack of MADE conditioners, composed in order in the forward pass
        self.made_layers = nn.ModuleList([
            MADE(input_dim, hidden_dims) for _ in range(n_layers)
        ])

    def forward(self, z, reverse=False):
        """Transform z -> x (sampling) or x -> z (density, ``reverse=True``).

        Returns (output, log|det J|) accumulated over all layers.

        BUG FIX vs. original: the forward branch always used
        ``made_layers[0]`` and ignored the rest of the stack, and the
        reverse branch referenced an undefined name ``x`` (NameError on
        every call, so ``log_prob`` always crashed) and did not invert the
        layers in the opposite order of composition.
        """
        if not reverse:
            # Sampling direction: compose layers in order; each layer needs
            # D sequential passes because x_i depends on x_{<i}.
            x = z
            log_det_total = torch.zeros(z.size(0), device=z.device)
            for made in self.made_layers:
                out = torch.zeros_like(x)
                for i in range(self.input_dim):
                    # Parameters for dim i depend only on already-filled dims
                    log_s, t = made(out)
                    out = out.clone()  # avoid in-place autograd issues
                    out[:, i] = x[:, i] * torch.exp(log_s[:, i]) + t[:, i]
                    log_det_total = log_det_total + log_s[:, i]
                x = out
            return x, log_det_total
        else:
            # Density direction: z = (x - t(x)) * exp(-log_s(x)), fully
            # parallel; invert the layers in reverse composition order.
            x = z  # in this direction the argument holds data x
            log_det_total = torch.zeros(x.size(0), device=x.device)
            for made in reversed(self.made_layers):
                log_s, t = made(x)
                x = (x - t) * torch.exp(-log_s)
                log_det_total = log_det_total - log_s.sum(dim=1)
            return x, log_det_total

    def log_prob(self, x):
        """Exact log p(x) via the fast parallel inverse pass."""
        z, log_det = self.forward(x, reverse=True)
        # log N(z; 0, I)
        log_pz = -0.5 * (z ** 2).sum(dim=1) - 0.5 * self.input_dim * np.log(2 * np.pi)
        return log_pz + log_det
class NeuralODEFlow(nn.Module):
    """
    Dynamics network for a Continuous Normalizing Flow.

    Parameterizes dz/dt = f(z, t; theta) as a small MLP over the state
    concatenated with scalar time. The Jacobian trace required by the
    instantaneous change-of-variables formula is estimated stochastically.
    """

    def __init__(self, dim, hidden_dim=64):
        """
        Args:
            dim: dimensionality of the state z.
            hidden_dim: width of the two hidden layers.
        """
        super().__init__()
        self.dim = dim
        # MLP over [z, t]; tanh keeps the dynamics smooth for integration.
        self.net = nn.Sequential(
            nn.Linear(dim + 1, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, dim),
        )

    def forward(self, t, z):
        """Evaluate dz/dt = f(z, t) for a batch of states."""
        # Broadcast the scalar time into a (batch, 1) column.
        time_col = torch.ones(len(z), 1, device=z.device) * t
        return self.net(torch.cat((z, time_col), dim=1))

    def trace_estimator(self, z, t, noise=None):
        """
        Hutchinson estimator of the divergence: Tr(df/dz) ~= eps^T (df/dz) eps.

        Args:
            z: (batch, dim) state; must be part of the autograd graph.
            t: scalar time.
            noise: optional probe vector eps; sampled ~N(0, I) if omitted.

        Returns:
            (batch,) per-sample trace estimates.
        """
        eps = torch.randn_like(z) if noise is None else noise
        drift = self.forward(t, z)
        # Vector-Jacobian product eps^T (df/dz) via reverse-mode autograd;
        # create_graph=True so the estimate itself remains differentiable.
        vjp = torch.autograd.grad(drift, z, eps, create_graph=True)[0]
        return (vjp * eps).sum(dim=1)
class CNFTrainer:
    """
    Trainer for Continuous Normalizing Flows.

    Integrates the state z(t) and the running log-density correction
    simultaneously with a fixed-step Euler scheme.
    """
    def __init__(self, flow_model, optimizer):
        """
        Args:
            flow_model: module with forward(t, z) -> dz/dt and
                trace_estimator(z, t) -> per-sample divergence estimate.
            optimizer: torch optimizer over flow_model's parameters.
        """
        self.flow = flow_model
        self.optimizer = optimizer

    def integrate_ode(self, z0, t_span, method='euler', n_steps=10):
        """
        Fixed-step Euler integration of dz/dt = f(z, t) jointly with
        d(log p)/dt = -Tr(df/dz).

        For production use an adaptive solver (e.g. torchdiffeq.odeint).

        Args:
            z0: (batch, dim) initial state.
            t_span: (t0, t1) integration interval.
            method: integration scheme; only 'euler' is implemented.
            n_steps: number of Euler steps (default 10, as before).

        Returns:
            (z, log_p): final state (detached from the graph) and the
            accumulated log-density change, shape (batch,).
        """
        z = z0
        log_p = torch.zeros(z0.size(0), device=z0.device)
        dt = (t_span[1] - t_span[0]) / n_steps
        t = t_span[0]
        for _ in range(n_steps):
            # Fix: the trace estimator needs grad w.r.t. z, but only LEAF
            # tensors may have their requires_grad flag flipped. The
            # original called requires_grad_(True) unconditionally, which
            # raises RuntimeError from the second step on (z becomes a
            # non-leaf that already tracks gradients through the network).
            if z.is_leaf and not z.requires_grad:
                z.requires_grad_(True)
            dz_dt = self.flow(t, z)
            # Divergence (Hutchinson estimate) for the density update.
            trace = self.flow.trace_estimator(z, t)
            # Euler step for state and log-density.
            z = z + dz_dt * dt
            log_p = log_p - trace * dt
            t = t + dt
        # NOTE(review): detaching z severs gradients through the final
        # state, so the MSE term in train_step does not train the dynamics;
        # only the log-density term does. Kept to preserve behavior.
        z = z.detach()
        return z, log_p

    def train_step(self, x_data):
        """
        Single training step: sample z0 ~ N(0, I), integrate to t=1, and
        minimize a negative-log-likelihood-style loss plus an MSE pull of
        the integrated samples toward the data batch.

        Args:
            x_data: (batch, dim) data batch.

        Returns:
            The scalar loss value (float).
        """
        # Base-distribution sample.
        z0 = torch.randn_like(x_data)
        # Integrate forward: z(0) -> z(1).
        z1, log_det = self.integrate_ode(z0, (0.0, 1.0))
        # log p at t=0 under the standard-normal base.
        log_pz0 = -0.5 * (z0 ** 2).sum(dim=1) - 0.5 * z0.size(1) * np.log(2 * np.pi)
        # log p at t=1 via the instantaneous change of variables.
        log_pz1 = log_pz0 - log_det
        loss = -log_pz1.mean() + F.mse_loss(z1, x_data)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()
# ============================================================================
# Demonstrations
# ============================================================================
print("=" * 70)
print("Normalizing Flows - Advanced Implementations")
print("=" * 70)
# 1. RealNVP
print("\n1. RealNVP (Affine Coupling):")
realnvp = RealNVP(dim=10, n_layers=6, hidden_dim=128)
z_test = torch.randn(4, 10)
x_test, log_det = realnvp(z_test)
log_px = realnvp.log_prob(x_test)
print(f" Input shape: {z_test.shape}")
print(f" Output shape: {x_test.shape}")
print(f" Log determinant: {log_det.shape}")
print(f" Log p(x): {log_px.shape}")
print(f" Parameters: {sum(p.numel() for p in realnvp.parameters()):,}")
# 2. Glow
print("\n2. Glow (ActNorm + 1Γ1Conv + Coupling):")
glow = Glow(input_dim=10, n_blocks=8, hidden_dim=128)
x_glow, log_det_glow = glow(z_test)
print(f" Glow blocks: 8")
print(f" Each block: ActNorm -> Inv1Γ1Conv -> AffineCoupling")
print(f" Parameters: {sum(p.numel() for p in glow.parameters()):,}")
print(f" Forward pass: z -> x")
print(f" Inverse pass: x -> z (for log p(x))")
# 3. MAF
# Evaluate density on the RealNVP samples (x_test) produced above.
print("\n3. MAF (Masked Autoregressive Flow):")
maf = MAF(input_dim=10, n_layers=3, hidden_dims=[64, 64])
log_px_maf = maf.log_prob(x_test)
print("  MADE layers: 3")
# Fix: the original wrote the next line as an f-string, but "{<i}" is not a
# valid replacement-field expression, so the whole file failed to compile
# (SyntaxError). A plain string prints the intended literal braces.
print("  Autoregressive: x_i = f(x_{<i}, z_i)")
print("  Density eval: FAST (parallel)")
print("  Sampling: SLOW (sequential)")
print(f"  Parameters: {sum(p.numel() for p in maf.parameters()):,}")
# 4. Neural ODE Flow
print("\n4. Neural ODE Flow (Continuous):")
ode_flow = NeuralODEFlow(dim=10, hidden_dim=32)
print(f" Dynamics: dz/dt = f(z, t; ΞΈ)")
print(f" Integration: 0 -> 1 (continuous time)")
print(f" Trace: Hutchinson estimator (efficient)")
print(f" Parameters: {sum(p.numel() for p in ode_flow.parameters()):,}")
# 5. Architecture comparison
print("\n5. Architecture Comparison:")
print(" ββββββββββββββββ¬ββββββββββββ¬βββββββββββββ¬βββββββββββββββ¬βββββββββββ")
print(" β Architecture β Sampling β Density β Jacobian β Use Case β")
print(" ββββββββββββββββΌββββββββββββΌβββββββββββββΌβββββββββββββββΌβββββββββββ€")
print(" β RealNVP β Fast β β Fast β β Triangular β General β")
print(" β Glow β Fast β β Fast β β Triangular β Images β")
print(" β MAF β Slow β Fast ββ β Triangular β Density β")
print(" β IAF β Fast ββ β Slow β Triangular β VAE β")
print(" β Neural ODE β Medium β Medium β Continuous β Flexible β")
print(" ββββββββββββββββ΄ββββββββββββ΄βββββββββββββ΄βββββββββββββββ΄βββββββββββ")
# 6. Jacobian determinant comparison
print("\n6. Jacobian Determinant Computation:")
print(" General matrix: O(DΒ³) - prohibitive")
print(" Triangular (coupling): O(D) - efficient β")
print(" ")
print(" RealNVP coupling:")
print(" βββββββββββββββββββ")
print(" β I_d 0 β Determinant = exp(Ξ£ s_i)")
print(" β βxβ/βzβ diag(s)β Cost: O(D)")
print(" βββββββββββββββββββ")
# 7. Invertibility verification
print("\n7. Invertibility Check:")
z_original = torch.randn(2, 10)
x_forward, _ = realnvp(z_original, reverse=False)
z_reconstructed, _ = realnvp(x_forward, reverse=True)
error = (z_original - z_reconstructed).abs().max().item()
print(f" z_original -> x -> z_reconstructed")
print(f" Max reconstruction error: {error:.2e}")
print(f" Numerically invertible: {'β' if error < 1e-4 else 'β'}")
# 8. When to use guide
print("\n8. When to Use Each Flow:")
print(" Use RealNVP when:")
print(" β Need exact likelihood")
print(" β Fast sampling important")
print(" β General-purpose generation")
print("\n Use Glow when:")
print(" β Image generation (high-res)")
print(" β Need 1Γ1 conv mixing")
print(" β Multi-scale architecture beneficial")
print("\n Use MAF when:")
print(" β Density estimation primary goal")
print(" β Sampling speed not critical")
print(" β Variational inference (VI)")
print("\n Use Neural ODE when:")
print(" β Architectural flexibility needed")
print(" β No invertibility constraints")
print(" β Continuous dynamics natural")
# 9. Exact likelihood advantage
print("\n9. Exact Likelihood vs Bounds:")
print(" Flow: log p(x) = log p(z) - log|det J| [EXACT]")
print(" VAE: log p(x) β₯ ELBO [LOWER BOUND]")
print(" GAN: No likelihood [IMPLICIT]")
print(" ")
print(" Flows enable:")
print(" β’ Principled maximum likelihood training")
print(" β’ Exact density for anomaly detection")
print(" β’ Lossless compression via entropy coding")
print("\n" + "=" * 70)