import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
# Select the compute device: GPU when available, otherwise CPU.
_backend = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_backend)
print(f"Device: {device}")
1. Implicit Neural Representations
Concept:
Represent a signal as a continuous function:
For an image: \(f_\theta(x, y) \rightarrow (r, g, b)\)
SIREN Activation:
Reference Materials:
cv_3d_foundation.pdf - CV 3D Foundation
cv_3d_research.pdf - CV 3D Research
class SineLayer(nn.Module):
    """Linear layer followed by a sine activation, with SIREN initialization.

    Args:
        in_features: Input dimension.
        out_features: Output dimension.
        omega_0: Frequency scale applied before the sine (SIREN's w0).
        is_first: First layers use the wider Uniform(-1/n, 1/n) scheme
            from the SIREN paper; later layers use Uniform(±sqrt(6/n)/w0).
    """
    def __init__(self, in_features, out_features, omega_0=30.0, is_first=False):
        super().__init__()
        self.omega_0 = omega_0
        self.linear = nn.Linear(in_features, out_features)
        # Special initialization. The in-place uniform_() must run under
        # torch.no_grad(): mutating a leaf Parameter that requires grad
        # would otherwise raise a RuntimeError.
        with torch.no_grad():
            if is_first:
                self.linear.weight.uniform_(-1 / in_features, 1 / in_features)
            else:
                bound = np.sqrt(6 / in_features) / omega_0
                self.linear.weight.uniform_(-bound, bound)
    def forward(self, x):
        # sin(w0 * (Wx + b)): omega_0 sets the frequency content.
        return torch.sin(self.omega_0 * self.linear(x))
print("SineLayer defined")
SIREN NetworkΒΆ
SIREN (Sinusoidal Representation Networks) uses \(\sin\) activation functions instead of ReLU, which gives the network and all its derivatives continuous, well-behaved structure. The key insight is that sinusoidal activations are their own derivatives (up to scaling), so a SIREN network is equally expressive for representing a function and its spatial gradients, Laplacian, or any higher-order differential quantity. The initialization scheme is critical: weights are drawn from \(\mathcal{U}(-\sqrt{6/n}, \sqrt{6/n})\) (except the first layer which uses a frequency parameter \(\omega_0\) scaling) to preserve the distribution of activations through the network. SIRENs have achieved state-of-the-art results for tasks ranging from image fitting to solving partial differential equations.
class SIREN(nn.Module):
    """Sinusoidal Representation Network (Sitzmann et al., 2020).

    A stack of SineLayers followed by a plain linear output layer.

    Args:
        in_features: Coordinate dimension (e.g. 2 for images).
        hidden_features: Width of each hidden layer.
        hidden_layers: Number of sine layers (including the first).
        out_features: Output dimension (e.g. 3 for RGB).
        omega_0: SIREN frequency parameter w0.
    """
    def __init__(self, in_features, hidden_features, hidden_layers, out_features, omega_0=30.0):
        super().__init__()
        # First layer uses the special is_first initialization.
        layers = [SineLayer(in_features, hidden_features, omega_0, is_first=True)]
        # Hidden sine layers.
        for _ in range(hidden_layers - 1):
            layers.append(SineLayer(hidden_features, hidden_features, omega_0))
        # Final layer is linear (no sine), initialized like a hidden layer.
        final_linear = nn.Linear(hidden_features, out_features)
        # torch.no_grad() is required: in-place uniform_() on a Parameter
        # that requires grad raises a RuntimeError otherwise.
        with torch.no_grad():
            bound = np.sqrt(6 / hidden_features) / omega_0
            final_linear.weight.uniform_(-bound, bound)
        layers.append(final_linear)
        self.net = nn.Sequential(*layers)
    def forward(self, coords):
        """Evaluate the network at coords [batch, in_features]."""
        return self.net(coords)
print("SIREN defined")
Image Fitting
In the image fitting task, the SIREN learns to map pixel coordinates \((x, y)\) directly to RGB color values, representing an entire image as a continuous function. The training data is simply all (coordinate, color) pairs from the target image, and the loss is MSE between predicted and actual colors. Unlike a discrete pixel grid, the learned continuous representation can be evaluated at arbitrary coordinates — enabling resolution-independent rendering and smooth spatial interpolation. This is the simplest demonstration of implicit neural representations, and the same principle extends to 3D shapes, radiance fields, and video.
def get_coordinates(H, W):
    """Return a flattened (H*W, 2) grid of (x, y) coordinates in [-1, 1]."""
    ys = torch.linspace(-1, 1, H)
    xs = torch.linspace(-1, 1, W)
    grid_y, grid_x = torch.meshgrid(ys, xs, indexing='ij')
    # Stack as (x, y) pairs, then flatten the spatial dimensions.
    stacked = torch.stack((grid_x, grid_y), dim=-1)
    return stacked.reshape(-1, 2)
def create_test_image(size=64):
    """Build a size x size RGB test image: red disk (radius 0.5) on green."""
    axis = torch.linspace(-1, 1, size)
    yy, xx = torch.meshgrid(axis, axis, indexing='ij')
    # Binary mask of the centered disk.
    inside = (torch.sqrt(xx ** 2 + yy ** 2) < 0.5).float()
    img = torch.zeros(size, size, 3)
    img[..., 0] = inside       # red channel: inside the disk
    img[..., 1] = 1 - inside   # green channel: outside the disk
    return img
# Create the 64x64 synthetic target image (red disk on a green background).
img = create_test_image(64)
H, W, C = img.shape
# Build the INR training pairs: one (x, y) coordinate per pixel plus the
# matching RGB value; both are moved to the active device up front.
coords = get_coordinates(H, W).to(device)  # (H*W, 2) in [-1, 1]
pixels = img.view(-1, C).to(device)        # (H*W, 3)
print(f"Image: {H}x{W}, Coords: {coords.shape}, Pixels: {pixels.shape}")
Train SIRENΒΆ
Training proceeds by sampling batches of coordinate-value pairs from the target signal and minimizing the reconstruction loss (MSE). SIRENs converge remarkably quickly compared to ReLU networks for this task, often fitting high-frequency image details in just a few hundred iterations. The frequency parameter \(\omega_0\) in the first layer acts as a prior on the spectral content: higher values allow the network to represent higher-frequency details but may cause optimization difficulties if set too large.
# Instantiate the SIREN: 2-D coordinate input -> RGB output,
# 3 sine layers of width 256 with the default omega_0 = 30.
model = SIREN(
    in_features=2,
    hidden_features=256,
    hidden_layers=3,
    out_features=3,
    omega_0=30.0
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Full-batch training: every pixel's (coordinate, color) pair each step.
losses = []
for epoch in range(500):
    pred = model(coords)
    loss = F.mse_loss(pred, pixels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    # Periodic progress report.
    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.6f}")
Visualize Results
Comparing the SIREN reconstruction with the original image at the training resolution and at higher resolutions reveals the quality of the learned implicit representation. At the training resolution, the reconstruction should be near-perfect. At higher resolutions (super-resolution), the continuous representation produces smooth, plausible interpolations rather than pixelated artifacts. Visualizing the network's gradient outputs (which are analytically available) shows edge detection and surface normal estimation for free — a unique benefit of differentiable implicit representations.
# Reconstruct the image by evaluating the trained SIREN at every training
# coordinate (no gradients needed at inference time).
model.eval()
with torch.no_grad():
    pred = model(coords)
    pred_img = pred.view(H, W, C).cpu().clamp(0, 1)  # image layout, valid RGB
# Plot: original vs. reconstruction vs. training curve.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# Original target image.
axes[0].imshow(img.cpu())
axes[0].set_title('Original', fontsize=12)
axes[0].axis('off')
# SIREN reconstruction at the training resolution.
axes[1].imshow(pred_img)
axes[1].set_title('SIREN Reconstruction', fontsize=12)
axes[1].axis('off')
# Loss curve; log scale makes the convergence rate visible.
axes[2].plot(losses)
axes[2].set_xlabel('Iteration', fontsize=11)
axes[2].set_ylabel('MSE Loss', fontsize=11)
axes[2].set_title('Training Loss', fontsize=12)
axes[2].grid(True, alpha=0.3)
axes[2].set_yscale('log')
plt.tight_layout()
plt.show()
Super-Resolution
Because the SIREN represents the image as a continuous function, we can query it at coordinates denser than the original pixel grid to produce a super-resolved image. Unlike traditional super-resolution methods that require training on paired low/high-resolution images, the implicit representation approach achieves super-resolution as a natural byproduct of the continuous coordinate mapping. The quality depends on how well the network has learned the underlying signal structure rather than memorizing pixel values — networks with appropriate frequency capacity produce smooth, natural-looking upscaled results.
# Query the trained network on a denser 128x128 grid: the continuous
# representation can be sampled at any resolution, not just the one it
# was trained at.
coords_hr = get_coordinates(128, 128).to(device)
with torch.no_grad():
    pred_hr = model(coords_hr)
    pred_hr_img = pred_hr.view(128, 128, 3).cpu().clamp(0, 1)
# Side-by-side: training resolution vs. super-resolved output.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
axes[0].imshow(pred_img)
axes[0].set_title('64x64 Reconstruction', fontsize=12)
axes[0].axis('off')
axes[1].imshow(pred_hr_img)
axes[1].set_title('128x128 Super-Resolution', fontsize=12)
axes[1].axis('off')
plt.tight_layout()
plt.show()
SummaryΒΆ
Implicit Neural Representations:ΒΆ
Key Ideas:
Continuous function parameterization
Coordinate-based input
Resolution-independent
Compact representation
SIREN:ΒΆ
Periodic sine activation
Special weight initialization
Natural for signals with derivatives
Better than ReLU for smooth functions
Advantages:ΒΆ
Memory efficient: Single network vs pixel array
Super-resolution: Query at any resolution
Derivatives: Analytical gradients
Compression: Compact signal storage
Applications:ΒΆ
3D shape representation (DeepSDF, NeRF)
Image compression
Video representation
PDEs solving
Novel view synthesis
Variants:ΒΆ
NeRF: 5D function (x,y,z,ΞΈ,Ο) β (rgb,Ο)
Fourier features: Positional encoding
BACON: Bias-free activation
WIRE: Random Fourier with nonlinearities
Advanced Implicit Neural Representations TheoryΒΆ
1. Introduction to Neural FieldsΒΆ
1.1 From Discrete to Continuous RepresentationsΒΆ
Traditional approach: Discrete grids (images, voxels, meshes)
Images: HΓWΓC array
3D shapes: NΓNΓN voxel grid
Videos: TΓHΓWΓC tensor
Limitations:
Fixed resolution (cannot zoom arbitrarily)
Memory grows cubically with resolution (O(NΒ³) for 3D)
Aliasing artifacts
Difficult to apply transformations
Implicit Neural Representations (INRs): Continuous functions via neural networks
f_θ: ℝᵈ → ℝᶜ
Coordinates → Signal values
Examples:
Images: f(x, y) β RGB
3D shapes: f(x, y, z) β occupancy or SDF
Videos: f(x, y, t) β RGB
Audio: f(t) β amplitude
1.2 Advantages of INRsΒΆ
Resolution independence: Query at any coordinate
Memory efficiency: Parameters independent of resolution
Smooth interpolation: Continuous by construction
Differentiable: Gradients available everywhere
Compact: Single network encodes entire signal
Applications:
Novel view synthesis (NeRF)
3D shape representation
Image/video compression
Solving PDEs
Generative modeling
2. Coordinate-Based MLPsΒΆ
2.1 Basic ArchitectureΒΆ
Standard MLP:
x β βα΅ β [Linear β ReLU]^L β Linear β y β βαΆ
Problem: Low-frequency bias (spectral bias)
MLPs learn low frequencies first
Struggles with high-frequency details (textures, edges)
Reason: ReLU/tanh have limited frequency content
2.2 Spectral Bias ProblemΒΆ
Observation (Rahaman et al., 2019): Neural networks are biased toward low frequencies
Experiment: Fit f(x) = sin(kx) for various k
Low k (slow oscillation): Converges quickly
High k (fast oscillation): Converges slowly or fails
Consequence: Images/shapes miss fine details
3. Positional Encoding (Fourier Features)ΒΆ
3.1 Random Fourier FeaturesΒΆ
Idea (Tancik et al., 2020): Map coordinates to higher-dimensional frequency space
Encoding:
γ(x) = [sin(2πb₁ᵀx), cos(2πb₁ᵀx), ..., sin(2πb_Mᵀx), cos(2πb_Mᵀx)]
where bᵢ ~ N(0, σ²I)
Network:
f_ΞΈ(x) = MLP(Ξ³(x))
Effect: Projects input to frequency spectrum
Enables learning high-frequency details
Ο controls frequency range (higher Ο β higher frequencies)
Relationship to kernel methods: Ξ³(x)α΅Ξ³(xβ) approximates kernel k(x, xβ)
3.2 Learned Positional EncodingΒΆ
Alternative: Learn encoding parameters
Ξ³(x) = [sin(Wβx + bβ), cos(Wβx + bβ), ..., sin(Wβx + bβ), cos(Wβx + bβ)]
where W, b are learned
Advantage: Adapts encoding to data
3.3 Frequency AnalysisΒΆ
NTK perspective (Neural Tangent Kernel):
Positional encoding changes kernelβs frequency response
Enables higher frequency eigenfunctions
Rule of thumb: Ο β max frequency in signal
4. Sinusoidal Representation Networks (SIREN)ΒΆ
4.1 ArchitectureΒΆ
Key innovation (Sitzmann et al., 2020): Use sine activations throughout
Network:
x β [Linear β sin]^L β Linear β y
Initialization: Critical for training stability
W ~ Uniform(-β(6/n_in), β(6/n_in)) (first layer: β(6/c))
where c is tuning parameter (typically c = 6)
4.2 PropertiesΒΆ
Derivatives: βsin(x)/βx = cos(x)
Periodic derivatives
Enables solving PDEs naturally
Higher-order derivatives: Available via automatic differentiation
βΒ²f_ΞΈ(x) = Laplacian (used in physics-informed learning)
Frequency content: Richer than ReLU
Captures both low and high frequencies
No need for positional encoding
4.3 Theoretical FoundationΒΆ
Observation: Sine activations create periodic basis functions
Fourier perspective: Network computes Fourier-like expansion
f(x) β Ξ£α΅’ aα΅’ sin(Οα΅’x + Οα΅’)
Advantage for PDEs: Derivatives remain periodic
5. Neural Radiance Fields (NeRF)ΒΆ
5.1 Problem: Novel View SynthesisΒΆ
Input: Images of scene from different viewpoints
Output: Render scene from new viewpoint
Traditional: Reconstruct 3D mesh β render
NeRF: Implicit volumetric representation
5.2 NeRF FormulationΒΆ
5D function:
F_θ: (x, y, z, θ, φ) → (r, g, b, σ)
(position, view direction) → (color, density)
Volume rendering equation:
C(r) = ∫ T(t) σ(r(t)) c(r(t), d) dt
where T(t) = exp(-∫₀ᵗ σ(r(s)) ds) (transmittance)
Discrete approximation (quadrature):
Ĉ(r) = Σᵢ Tᵢ (1 - exp(-σᵢ δᵢ)) cᵢ
where Tᵢ = exp(-Σⱼ₌₁^{i-1} σⱼ δⱼ)
5.3 NeRF ArchitectureΒΆ
Two MLPs:
Coarse network: Low-resolution sampling
Fine network: Importance sampling based on coarse
Input encoding:
Position (x,y,z): Positional encoding Ξ³(x) with 10 frequencies
Direction (ΞΈ,Ο): Positional encoding Ξ³(d) with 4 frequencies
Network structure:
Ξ³(x) β [256 β 256 β 256 β 256] β Ο
β (features)
[256 β 128] + Ξ³(d) β RGB
Hierarchical sampling:
Sample N_c points uniformly along ray
Evaluate coarse network β weights w_i
Importance sample N_f additional points based on w_i
Evaluate fine network β final color
5.4 Training NeRFΒΆ
Loss: Photometric reconstruction
L = Ξ£α΅£ ||C(r) - Δ(r)||Β²
Data: Multi-view images + camera poses
Optimization: Adam, ~300k iterations per scene
Challenges:
Slow rendering (many MLP queries per ray)
Per-scene optimization (no generalization)
Static scenes only
6. Improvements and VariantsΒΆ
6.1 Faster NeRF VariantsΒΆ
Instant NGP (MΓΌller et al., 2022):
Multi-resolution hash encoding
Speedup: 1000Γ faster training (5s vs. hours)
Real-time rendering
Plenoxels (Fridovich-Keil et al., 2022):
Explicit voxel grid + spherical harmonics
No neural network!
Faster optimization
TensoRF (Chen et al., 2022):
Tensor factorization for radiance field
Compact + fast
6.2 Generalizable NeRFΒΆ
pixelNeRF (Yu et al., 2021):
Encode image features via CNN
Query features + coordinates
Generalizes to new scenes
IBRNet (Wang et al., 2021):
Image-based rendering with transformers
Few-shot view synthesis
6.3 Dynamic NeRFΒΆ
D-NeRF (Pumarola et al., 2021):
Add time dimension: F(x, y, z, t, ΞΈ, Ο)
Deformation field + canonical space
HyperNeRF (Park et al., 2021):
Topological changes (e.g., cutting balloon)
K-Planes (Fridovich-Keil et al., 2023):
Factorized 4D space-time representation
6.4 NeRF for 360Β° ScenesΒΆ
Mip-NeRF 360 (Barron et al., 2022):
Unbounded scenes (not just objects)
Anti-aliasing via integrated positional encoding
Distortion loss for better geometry
7. Signed Distance Functions (SDFs)ΒΆ
7.1 SDF DefinitionΒΆ
SDF: Distance to nearest surface
SDF(x) = { d if x outside surface
{ -d if x inside surface
{ 0 if x on surface
Properties:
||βSDF(x)|| = 1 (eikonal equation)
Zero level set = surface
7.2 DeepSDFΒΆ
Idea (Park et al., 2019): Neural network as SDF
Network:
f_ΞΈ: βΒ³ β β
(x, y, z) β signed distance
Loss:
L = Ξ£α΅’ |f_ΞΈ(xα΅’) - SDFα΅’|
Shape representation: Implicit surface at f_ΞΈ(x) = 0
Advantages:
Watertight surfaces
Handles arbitrary topology
Compact encoding
7.3 Eikonal RegularizationΒΆ
Problem: Network may not satisfy ||βf|| = 1
Solution: Eikonal loss
L_eikonal = E_x[(||βf_ΞΈ(x)|| - 1)Β²]
Combined loss:
L = L_reconstruction + Ξ» L_eikonal
7.4 NeuS and VolSDFΒΆ
Challenge: SDF alone doesnβt give appearance
NeuS (Wang et al., 2021):
Combine SDF with color network
Volume rendering with SDF
Bias control parameter for surface sharpness
VolSDF (Yariv et al., 2021):
Similar approach with geometric initialization
Better surface reconstruction
8. Occupancy NetworksΒΆ
8.1 Occupancy RepresentationΒΆ
Occupancy function:
o: βΒ³ β [0, 1]
o(x) = probability that x is inside shape
Network:
f_ΞΈ(x, y, z) β Ο(logit) β [0, 1]
Loss (binary cross-entropy):
L = -Ξ£α΅’ [yα΅’ log o(xα΅’) + (1-yα΅’) log(1-o(xα΅’))]
8.2 Occupancy Networks (Mescheder et al., 2019)ΒΆ
Architecture:
Encoder: Point cloud/image β latent code z
Decoder: (x, z) β occupancy
Applications:
3D reconstruction from partial observations
Shape generation
Completion
9. Implicit Differentiation and Meta-LearningΒΆ
9.1 Hypernetworks for INRsΒΆ
Idea: Learn to generate network weights
Hypernetwork: Input data β ΞΈ (weights of INR)
INR: Coordinates β Signal values
Advantage: Amortize optimization across dataset
9.2 MAML for INRsΒΆ
Meta-learning: Learn initialization that adapts quickly
Procedure:
Initialize ΞΈ
For each task (scene/shape):
Fine-tune: ΞΈβ = ΞΈ - Ξ±βL_task(ΞΈ)
Accumulate meta-gradient
Update ΞΈ
Benefit: Fast adaptation to new data
10. Applications Beyond GraphicsΒΆ
10.1 Solving PDEsΒΆ
Physics-Informed Neural Networks (PINNs):
Use SIREN to represent solution u(x, t)
Loss = PDE residual + boundary conditions
Example (heat equation):
βu/βt = Ξ±βΒ²u
L = ||βu/βt - Ξ±βΒ²u||Β² + ||u(x, 0) - uβ||Β²
Advantage: Meshless, continuous solution
10.2 CompressionΒΆ
Image compression:
Fit INR to image
Store network weights (typically <1KB for small MLPs)
Decode at any resolution
Comparison:
JPEG: Fixed resolution, block artifacts
INR: Continuous, smooth, but slower decode
10.3 Inverse ProblemsΒΆ
Super-resolution: Low-res image β INR β High-res output
Inpainting: Masked image β INR (train on visible pixels) β Complete image
11. Computational ComplexityΒΆ
11.1 Training CostΒΆ
Per-iteration:
Forward: O(L Β· HΒ² Β· B) (L layers, H hidden dim, B batch)
Backward: Same as forward
Total: O(L Β· HΒ² Β· B Β· N_iter)
NeRF specific:
Rays per image: H_img Γ W_img
Samples per ray: N_c + N_f (typically 64 + 128)
Per scene: ~1M gradient steps
11.2 Inference CostΒΆ
Single query: O(L Β· HΒ²)
Full image:
Standard NeRF: O(H_img Β· W_img Β· N_samples Β· L Β· HΒ²)
Example: 800Γ800 pixels Γ 192 samples Γ 8 layers Γ 256Β² β 10ΒΉΒΉ ops
Slow! (~30s per image on GPU)
Fast variants:
Instant NGP: O(H_img Β· W_img Β· L) via hash table
Real-time (30 FPS)
12. Comparison of TechniquesΒΆ
12.1 Activation FunctionsΒΆ
Activation |
Frequency |
Derivatives |
Best For |
|---|---|---|---|
ReLU |
Low |
Discontinuous |
General tasks |
Tanh |
Low |
Smooth |
Smooth signals |
Sine |
High |
Periodic |
PDEs, high-freq |
GELU |
Medium |
Smooth |
Transformers |
12.2 Encoding StrategiesΒΆ
Method |
Frequencies |
Learnable |
Overhead |
|---|---|---|---|
None (Raw coords) |
Low |
No |
None |
Fourier Features |
Fixed |
No |
2M dim |
Learned Encoding |
Adaptive |
Yes |
2M dim |
SIREN |
Adaptive |
Implicit |
None |
Hash Encoding (NGP) |
Multi-scale |
Yes |
O(TΒ·F) |
13. Recent Advances (2020-2024)ΒΆ
13.1 3D Gaussian Splatting (2023)ΒΆ
Idea: Represent scene as 3D Gaussians (not neural field)
Each Gaussian: position, covariance, color, opacity
Differentiable rasterization
Much faster than NeRF (140 FPS vs. 0.03 FPS)
Advantage: Explicit representation + real-time rendering
13.2 Neural Light FieldsΒΆ
Plenoptic function: 5D light field L(x, y, z, ΞΈ, Ο)
Neural encoding: Replace volume rendering with learned interpolation
Speed: Faster inference than volumetric NeRF
13.3 Semantic NeRFΒΆ
Idea: Predict semantic labels along with color
F_ΞΈ: (x, y, z, ΞΈ, Ο) β (RGB, Ο, semantics)
Applications: Object removal, scene editing
13.4 NeRF for Generative ModelingΒΆ
Ο-GAN (Chan et al., 2021):
Generator: Latent z β NeRF parameters
Discriminator: Rendered images
3D-aware GAN
EG3D (Chan et al., 2022):
Efficient tri-plane representation
High-resolution 3D-aware generation
14. Limitations and ChallengesΒΆ
14.1 Current LimitationsΒΆ
Speed: Slow rendering (addressed by Instant NGP, 3DGS)
Generalization: Per-scene optimization (addressed by pixelNeRF)
Dynamic scenes: Complex deformations
Lighting: Baked lighting (canβt relight easily)
Reflections: Struggles with mirrors, specularities
Transparency: Semi-transparent objects difficult
14.2 Open Research ProblemsΒΆ
Editing: Interactive scene manipulation
Compositionality: Combine multiple objects
Few-shot learning: Reconstruct from 1-3 images
Physical consistency: Enforce physics constraints
Scalability: City-scale scenes
15. Software and ToolsΒΆ
15.1 LibrariesΒΆ
PyTorch: Most implementations use PyTorch
tiny-cuda-nn: Fast CUDA MLPs (used in Instant NGP)
nerfstudio: Unified framework for NeRF variants
threestudio: Text-to-3D with NeRF
15.2 DatasetsΒΆ
NeRF Synthetic: Blender-rendered objects
LLFF: Real forward-facing scenes
Tanks and Temples: Large-scale reconstruction
ShapeNet: 3D shape dataset for occupancy/SDF
16. Key TakeawaysΒΆ
INRs represent signals as continuous functions via neural networks
Spectral bias: MLPs struggle with high frequencies β positional encoding
SIREN: Sine activations enable derivatives for PDEs
NeRF: 5D radiance field for photorealistic novel view synthesis
SDFs: Implicit surfaces with geometric properties
Fast variants: Instant NGP, 3DGS achieve real-time rendering
Applications: Graphics, compression, PDEs, inverse problems
When to use INRs:
Need continuous representation (resolution independence)
Memory constraints (compact encoding)
Derivatives required (physics simulation)
Novel view synthesis (NeRF)
When NOT to use:
Speed critical (explicit representations faster)
Simple tasks (overkill for MNIST)
Limited data (overfitting risk)
17. Mathematical FoundationsΒΆ
17.1 Universal ApproximationΒΆ
Theorem: Neural networks can approximate any continuous function
For INRs: f_ΞΈ: [0,1]α΅ β βαΆ can represent any image/shape
17.2 Nyquist-Shannon SamplingΒΆ
Classic theorem: Sample rate β₯ 2Γ max frequency
For INRs: Positional encoding frequency Ο should match signal frequency
17.3 Volume Rendering EquationΒΆ
Continuous:
C(r) = β«β^β T(t) Ο(r(t)) c(r(t), d) dt
T(t) = exp(-β«βα΅ Ο(r(s)) ds)
Discrete (alpha compositing):
C = Ξ£α΅’ Ξ±α΅’ Tα΅’ cα΅’
where Ξ±α΅’ = 1 - exp(-Οα΅’Ξ΄α΅’), Tα΅’ = Ξ β±Όβββ±β»ΒΉ (1 - Ξ±β±Ό)
18. ReferencesΒΆ
Foundational:
Sitzmann et al. (2020): SIREN - Implicit Neural Representations with Periodic Activation Functions
Tancik et al. (2020): Fourier Features Let Networks Learn High Frequency Functions
Mildenhall et al. (2020): NeRF - Representing Scenes as Neural Radiance Fields
Fast NeRF:
MΓΌller et al. (2022): Instant Neural Graphics Primitives with a Multiresolution Hash Encoding
Kerbl et al. (2023): 3D Gaussian Splatting for Real-Time Radiance Field Rendering
Shape Representations:
Park et al. (2019): DeepSDF - Learning Continuous Signed Distance Functions
Mescheder et al. (2019): Occupancy Networks
Wang et al. (2021): NeuS - Learning Neural Implicit Surfaces
Analysis:
Rahaman et al. (2019): On the Spectral Bias of Neural Networks
Jacot et al. (2018): Neural Tangent Kernel (theoretical foundation)
Applications:
Raissi et al. (2019): Physics-Informed Neural Networks (PINNs)
Chan et al. (2022): EG3D - Efficient Geometry-aware 3D GANs
"""
Complete Implicit Neural Representations Implementations
=========================================================
Includes: SIREN, Fourier features, NeRF (simplified), DeepSDF, occupancy networks,
positional encoding, volume rendering.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# ============================================================================
# 1. Positional Encoding (Fourier Features)
# ============================================================================
class FourierFeatures(nn.Module):
    """Random Fourier feature positional encoding.

    gamma(x) = [sin(2*pi*Bx), cos(2*pi*Bx)] with B ~ N(0, scale^2 I).

    Args:
        input_dim: Coordinate dimensionality.
        num_frequencies: Number of random frequency rows M.
        scale: Std-dev of the frequency matrix (controls max frequency).
    """
    def __init__(self, input_dim, num_frequencies=256, scale=10.0):
        super(FourierFeatures, self).__init__()
        # Fixed (non-trainable) random projection; registered as a buffer
        # so it follows .to(device) and is saved in the state_dict.
        frequencies = torch.randn(num_frequencies, input_dim) * scale
        self.register_buffer('B', frequencies)
    def forward(self, x):
        """Encode coordinates x [batch, input_dim].

        Returns:
            [batch, 2*num_frequencies] concatenated sin/cos features.
        """
        projected = 2 * np.pi * (x @ self.B.T)
        return torch.cat((torch.sin(projected), torch.cos(projected)), dim=-1)
class LearnedFourierFeatures(nn.Module):
    """Fourier feature encoding with trainable frequencies and phases.

    Same form as FourierFeatures, but B (frequencies) and b (phases) are
    nn.Parameters, so the encoding adapts to the signal during training.
    """
    def __init__(self, input_dim, num_frequencies=256, scale=10.0):
        super(LearnedFourierFeatures, self).__init__()
        # Initialized like the fixed variant: Gaussian frequencies, zero phase.
        self.B = nn.Parameter(torch.randn(num_frequencies, input_dim) * scale)
        self.b = nn.Parameter(torch.zeros(num_frequencies))
    def forward(self, x):
        phase = x @ self.B.T + self.b
        angles = 2 * np.pi * phase
        return torch.cat((torch.sin(angles), torch.cos(angles)), dim=-1)
# ============================================================================
# 2. SIREN (Sinusoidal Representation Networks)
# ============================================================================
class SineLayer(nn.Module):
    """Linear transform followed by sin(omega_0 * .), with SIREN init.

    Args:
        in_features: Input dimension.
        out_features: Output dimension.
        bias: Whether the linear layer carries a bias term.
        is_first: First layers use the wider Uniform(-1/n, 1/n) init.
        omega_0: SIREN frequency parameter.
    """
    def __init__(self, in_features, out_features, bias=True,
                 is_first=False, omega_0=30.0):
        super(SineLayer, self).__init__()
        self.omega_0 = omega_0
        self.is_first = is_first
        self.linear = nn.Linear(in_features, out_features, bias=bias)
        self._init_weights()
    def _init_weights(self):
        """Apply the SIREN weight initialization in-place."""
        fan_in = self.linear.in_features
        # no_grad: in-place edits of a Parameter must bypass autograd.
        with torch.no_grad():
            if self.is_first:
                limit = 1 / fan_in
            else:
                limit = np.sqrt(6 / fan_in) / self.omega_0
            self.linear.weight.uniform_(-limit, limit)
    def forward(self, x):
        return torch.sin(self.omega_0 * self.linear(x))
class SIREN(nn.Module):
    """Sinusoidal Representation Network.

    A stack of SineLayers capped by a linear output layer:
    f(x) = W_n sin(w0 W_{n-1} sin(... sin(w0 W_1 x) ...))

    Args:
        input_dim: Coordinate dimension (2 for images, 3 for 3D).
        output_dim: Signal dimension (3 for RGB, 1 for SDF).
        hidden_dim: Hidden layer width.
        num_layers: Total layer count (sine layers + final linear).
        omega_0: SIREN frequency parameter.
    """
    def __init__(self, input_dim, output_dim, hidden_dim=256,
                 num_layers=5, omega_0=30.0):
        super(SIREN, self).__init__()
        modules = [SineLayer(input_dim, hidden_dim, is_first=True, omega_0=omega_0)]
        for _ in range(num_layers - 2):
            modules.append(SineLayer(hidden_dim, hidden_dim, is_first=False, omega_0=omega_0))
        # Output layer is purely linear; initialized with the hidden-layer bound.
        head = nn.Linear(hidden_dim, output_dim)
        with torch.no_grad():
            limit = np.sqrt(6 / hidden_dim) / omega_0
            head.weight.uniform_(-limit, limit)
        modules.append(head)
        self.network = nn.Sequential(*modules)
    def forward(self, x):
        """Evaluate the field at coordinates x [batch, input_dim]."""
        return self.network(x)
    def gradient(self, x):
        """Gradient of each output w.r.t. the input coordinates.

        Returns a [batch, output_dim, input_dim] tensor; useful for
        surface normals and physics-informed losses.
        """
        x = x.requires_grad_(True)
        y = self.forward(x)
        per_output = []
        for idx in range(y.shape[1]):
            g = torch.autograd.grad(
                y[:, idx].sum(), x, create_graph=True, retain_graph=True)[0]
            per_output.append(g)
        return torch.stack(per_output, dim=1)
# ============================================================================
# 3. Coordinate MLP with Fourier Features
# ============================================================================
class FourierMLP(nn.Module):
    """ReLU MLP operating on Fourier-encoded coordinates.

    Combines a fixed random Fourier positional encoding with a standard
    fully-connected network.
    """
    def __init__(self, input_dim, output_dim, hidden_dim=256, num_layers=4,
                 num_frequencies=256, freq_scale=10.0):
        super(FourierMLP, self).__init__()
        # Positional encoding front-end.
        self.fourier = FourierFeatures(input_dim, num_frequencies, freq_scale)
        blocks = []
        prev = 2 * num_frequencies  # the encoding emits sin + cos per frequency
        for _ in range(num_layers - 1):
            blocks.append(nn.Linear(prev, hidden_dim))
            blocks.append(nn.ReLU(inplace=True))
            prev = hidden_dim
        blocks.append(nn.Linear(hidden_dim, output_dim))
        self.network = nn.Sequential(*blocks)
    def forward(self, x):
        return self.network(self.fourier(x))
# ============================================================================
# 4. Neural Radiance Field (Simplified NeRF)
# ============================================================================
class NeRFEncoding(nn.Module):
    """NeRF-style positional encoding with octave-spaced frequencies.

    gamma(p) = [p, sin(2^0 pi p), cos(2^0 pi p), ...,
                sin(2^{L-1} pi p), cos(2^{L-1} pi p)]
    """
    def __init__(self, input_dim, num_freqs=10):
        super(NeRFEncoding, self).__init__()
        self.num_freqs = num_freqs
        # Octave frequencies 2^0 ... 2^{L-1}; a buffer so they follow the
        # module across devices.
        bands = 2.0 ** torch.linspace(0, num_freqs - 1, num_freqs)
        self.register_buffer('freq_bands', bands)
    def forward(self, x):
        """Encode x: [batch, d] -> [batch, d * (2*num_freqs + 1)]."""
        pieces = [x]  # raw coordinates are passed through unchanged
        for band in self.freq_bands:
            scaled = x * band * np.pi
            pieces.append(torch.sin(scaled))
            pieces.append(torch.cos(scaled))
        return torch.cat(pieces, dim=-1)
class SimpleNeRF(nn.Module):
    """Simplified Neural Radiance Field.

    F: (position, view direction) -> (RGB color, volume density).
    Density is a function of position only; color also depends on the
    viewing direction (view-dependent effects).

    Args:
        pos_freqs: Positional-encoding frequencies for the 3-D position.
        dir_freqs: Positional-encoding frequencies for the view direction.
        hidden_dim: Width of the trunk MLP.
    """
    def __init__(self, pos_freqs=10, dir_freqs=4, hidden_dim=256):
        super(SimpleNeRF, self).__init__()
        # Input encodings.
        self.pos_encoding = NeRFEncoding(3, pos_freqs)
        self.dir_encoding = NeRFEncoding(3, dir_freqs)
        # Encoded sizes: d * (2L + 1) with d = 3.
        pos_dim = 3 * (2 * pos_freqs + 1)
        dir_dim = 3 * (2 * dir_freqs + 1)
        # Trunk over the encoded position -> shared features.
        self.pos_net = nn.Sequential(
            nn.Linear(pos_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(inplace=True),
        )
        # Density head; ReLU keeps sigma non-negative.
        self.density_head = nn.Sequential(
            nn.Linear(hidden_dim, 1),
            nn.ReLU(inplace=True)
        )
        # Color head conditions on the view direction; Sigmoid -> [0, 1].
        self.color_net = nn.Sequential(
            nn.Linear(hidden_dim + dir_dim, hidden_dim // 2),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim // 2, 3),
            nn.Sigmoid()
        )
    def forward(self, pos, direction):
        """Query the field.

        Args:
            pos: Positions (x, y, z) [batch, 3].
            direction: Normalized view directions [batch, 3].
        Returns:
            rgb: Colors [batch, 3].
            sigma: Densities [batch, 1].
        """
        encoded_pos = self.pos_encoding(pos)
        encoded_dir = self.dir_encoding(direction)
        features = self.pos_net(encoded_pos)
        sigma = self.density_head(features)  # view-independent
        rgb = self.color_net(torch.cat((features, encoded_dir), dim=-1))
        return rgb, sigma
def volume_rendering(rgb, sigma, z_vals):
    """Discrete volume rendering (quadrature of the NeRF rendering integral).

    C = sum_i T_i * (1 - exp(-sigma_i * delta_i)) * c_i

    Works with any number of leading batch/ray dimensions: the sample
    axis is always the second-to-last axis of `rgb` and the last axis of
    `z_vals` (the original hard-coded `sum(dim=1)` only supported a
    single leading batch dimension).

    Args:
        rgb: Colors [..., num_samples, 3].
        sigma: Densities [..., num_samples, 1].
        z_vals: Sample depths along each ray [..., num_samples].
    Returns:
        rgb_map: Composited color [..., 3].
        depth_map: Expected termination depth [..., 1].
    """
    # delta_i: spacing between consecutive samples; the final interval is
    # effectively infinite so any remaining density terminates the ray.
    dists = z_vals[..., 1:] - z_vals[..., :-1]
    dists = torch.cat([dists, torch.ones_like(dists[..., :1]) * 1e10], dim=-1)
    # Per-sample opacity: alpha_i = 1 - exp(-sigma_i * delta_i).
    alpha = 1.0 - torch.exp(-sigma.squeeze(-1) * dists)
    # Transmittance T_i = prod_{j<i} (1 - alpha_j); the prepended ones make
    # T_0 = 1 (nothing occludes the first sample). 1e-10 guards cumprod
    # against exact zeros.
    transmittance = torch.cumprod(
        torch.cat([torch.ones_like(alpha[..., :1]), 1.0 - alpha + 1e-10], dim=-1),
        dim=-1
    )[..., :-1]
    # Compositing weights.
    weights = alpha * transmittance
    # Reduce over the sample axis (dim=-2 for rgb, dim=-1 for depth).
    rgb_map = (weights[..., None] * rgb).sum(dim=-2)
    depth_map = (weights * z_vals).sum(dim=-1, keepdim=True)
    return rgb_map, depth_map
# ============================================================================
# 5. DeepSDF (Signed Distance Functions)
# ============================================================================
class DeepSDF(nn.Module):
    """MLP mapping 3-D points to signed distance, with one skip connection.

    f: R^3 -> R. The raw input coordinates are re-injected (concatenated)
    midway through the network, as in DeepSDF (Park et al., 2019).

    Fix over the original: the skip connection's flat layer index was
    hard-coded to 8, which is only correct for num_layers=8 — other
    depths either silently lost the skip or crashed with a shape
    mismatch. The index is now tracked at construction time; behavior
    for the default num_layers=8 is unchanged.

    Args:
        hidden_dim: Hidden width.
        num_layers: Total linear layers (including the output layer).
    """
    # Hidden block (0-based) after which the input is re-injected; block 3
    # corresponds to the original flat index 8 for num_layers=8.
    _SKIP_BLOCK = 3
    def __init__(self, hidden_dim=256, num_layers=8):
        super(DeepSDF, self).__init__()
        layers = [nn.Linear(3, hidden_dim), nn.ReLU(inplace=True)]
        # Flat index (into self.layers) of the skip Linear, or None if the
        # network is too shallow to contain one.
        self.skip_index = None
        for i in range(num_layers - 2):
            if i == self._SKIP_BLOCK:
                self.skip_index = len(layers)
                # Skip layer consumes hidden features + raw coordinates.
                layers.extend([
                    nn.Linear(hidden_dim + 3, hidden_dim),
                    nn.ReLU(inplace=True)
                ])
            else:
                layers.extend([
                    nn.Linear(hidden_dim, hidden_dim),
                    nn.ReLU(inplace=True)
                ])
        # Output layer: scalar signed distance.
        layers.append(nn.Linear(hidden_dim, 1))
        self.layers = nn.ModuleList(layers)
    def forward(self, x):
        """Return signed distance [batch, 1] for points x [batch, 3]."""
        h = x
        for i, layer in enumerate(self.layers):
            if i == self.skip_index:
                # Skip connection: concatenate the raw input coordinates.
                h = torch.cat([h, x], dim=-1)
            h = layer(h)
        return h
    def eikonal_loss(self, x):
        """Eikonal regularizer E[(||grad f(x)|| - 1)^2].

        A true SDF has unit-norm gradient everywhere; penalizing the
        deviation pushes the network toward a valid distance field.
        """
        x = x.requires_grad_(True)
        sdf = self.forward(x)
        grad = torch.autograd.grad(sdf.sum(), x, create_graph=True)[0]
        return ((grad.norm(dim=-1) - 1) ** 2).mean()
# ============================================================================
# 6. Occupancy Network
# ============================================================================
class OccupancyNetwork(nn.Module):
    """
    Occupancy network: predicts the probability that a 3D point lies
    inside the shape.

    f: βΒ³ β [0, 1]
    """

    def __init__(self, hidden_dim=256, num_layers=5):
        super().__init__()
        # Input projection from xyz coordinates into feature space.
        modules = [nn.Linear(3, hidden_dim), nn.ReLU(inplace=True)]
        # num_layers counts every Linear: input + hidden + output.
        for _ in range(num_layers - 2):
            modules += [nn.Linear(hidden_dim, hidden_dim), nn.ReLU(inplace=True)]
        # Sigmoid squashes the final logit into a valid probability.
        modules += [nn.Linear(hidden_dim, 1), nn.Sigmoid()]
        self.network = nn.Sequential(*modules)

    def forward(self, x):
        """
        Args:
            x: 3D coordinates [batch, 3]

        Returns:
            occupancy: Probability [batch, 1]
        """
        return self.network(x)
# ============================================================================
# 7. Demonstrations
# ============================================================================
def _fit(model, coords, target, epochs=500, lr=1e-3, squash=False):
    """Run a full-batch Adam/MSE fitting loop, printing loss every 100 epochs.

    Args:
        model: Network mapping coords -> prediction.
        coords: Input coordinates [N, 2].
        target: Regression target [N, 1].
        epochs: Number of gradient steps.
        lr: Adam learning rate.
        squash: Apply sigmoid to the raw model output (used for models
            that have no output nonlinearity of their own).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        optimizer.zero_grad()
        pred = model(coords)
        if squash:
            pred = torch.sigmoid(pred)
        loss = F.mse_loss(pred, target)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 100 == 0:
            print(f" Epoch {epoch+1}: Loss = {loss.item():.6f}")


def demo_fourier_features():
    """Demonstrate Fourier features for image fitting.

    Fits the same high-frequency 2D binary pattern twice — once with a
    plain ReLU MLP on raw (x, y) coordinates and once with random Fourier
    features — to show the encoding overcoming the MLP's spectral bias.
    The two duplicated training loops of the original were factored into
    the _fit helper above.
    """
    print("="*70)
    print("Fourier Features Demo")
    print("="*70)
    # Dense grid of (x, y) coordinates over [-1, 1]Β².
    size = 32
    coords = torch.stack(torch.meshgrid(
        torch.linspace(-1, 1, size),
        torch.linspace(-1, 1, size),
        indexing='ij'
    ), dim=-1).reshape(-1, 2)
    # High-frequency binary target. NOTE(review): the AND of two stripe
    # patterns yields a grid of bright squares rather than a strict
    # alternating checkerboard; kept as-is to preserve behavior.
    target = ((coords[:, 0] * 8) % 2 < 1) & ((coords[:, 1] * 8) % 2 < 1)
    target = target.float().unsqueeze(-1)
    # Baseline: plain coordinate MLP with sigmoid output.
    print("Training without Fourier features...")
    model_no_enc = nn.Sequential(
        nn.Linear(2, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 1),
        nn.Sigmoid()
    )
    _fit(model_no_enc, coords, target)
    print("\nTraining with Fourier features...")
    # FourierMLP (defined earlier in this file) emits raw logits, so the
    # sigmoid is applied inside the fitting loop instead.
    model_fourier = FourierMLP(2, 1, hidden_dim=256, num_frequencies=128, freq_scale=5.0)
    _fit(model_fourier, coords, target, squash=True)
    print("\nFourier features enable learning high-frequency patterns!")
    print()
def demo_siren():
    """Demonstrate SIREN for image representation."""
    print("="*70)
    print("SIREN Demo")
    print("="*70)
    # Dense 64x64 coordinate grid over [-1, 1]Β².
    resolution = 64
    coords = torch.stack(torch.meshgrid(
        torch.linspace(-1, 1, resolution),
        torch.linspace(-1, 1, resolution),
        indexing='ij'
    ), dim=-1).reshape(-1, 2)
    # Target signal: concentric rings sin(10 * radius).
    radius = torch.sqrt(coords[:, 0]**2 + coords[:, 1]**2)
    target = torch.sin(10 * radius).unsqueeze(-1)
    print("Training SIREN...")
    model = SIREN(input_dim=2, output_dim=1, hidden_dim=256, num_layers=5, omega_0=30.0)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    for epoch in range(1000):
        optimizer.zero_grad()
        loss = F.mse_loss(model(coords), target)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 200 == 0:
            print(f" Epoch {epoch+1}: Loss = {loss.item():.6f}")
    # SIRENs are smooth everywhere — check that input gradients exist.
    probe = torch.randn(10, 2) * 0.5
    probe.requires_grad_(True)
    grad = torch.autograd.grad(model(probe).sum(), probe, create_graph=True)[0]
    print(f"\nGradient shape: {grad.shape}")
    print(f"Gradient norm (should be smooth): {grad.norm(dim=-1).mean():.4f}")
    print()
def demo_simple_nerf():
    """Demonstrate simplified NeRF."""
    print("="*70)
    print("Simplified NeRF Demo")
    print("="*70)
    nerf = SimpleNeRF(pos_freqs=6, dir_freqs=3, hidden_dim=128)
    # A single ray shooting along +z from (0, 0, -2).
    num_samples = 64
    origin = torch.tensor([[0.0, 0.0, -2.0]])
    direction = torch.tensor([[0.0, 0.0, 1.0]])
    # Uniformly spaced sample depths along the ray.
    z_vals = torch.linspace(0, 4, num_samples).unsqueeze(0)
    # x = o + t * d for every sampled depth t.
    positions = origin + z_vals.unsqueeze(-1) * direction
    # Each sample shares the ray's viewing direction.
    view_dirs = direction.expand(num_samples, -1)
    rgb, sigma = nerf(positions.squeeze(0), view_dirs)
    print(f"Ray samples: {num_samples}")
    print(f"RGB shape: {rgb.shape}")
    print(f"Sigma shape: {sigma.shape}")
    print(f"Sigma range: [{sigma.min().item():.4f}, {sigma.max().item():.4f}]")
    # Alpha-composite the samples into one pixel color and expected depth.
    rendered, depth = volume_rendering(
        rgb.unsqueeze(0), sigma.unsqueeze(0), z_vals)
    print(f"\nRendered RGB: {rendered.squeeze().tolist()}")
    print(f"Expected depth: {depth.item():.4f}")
    print()
def demo_deepsdf():
    """Demonstrate DeepSDF."""
    print("="*70)
    print("DeepSDF Demo")
    print("="*70)
    # Untrained network — outputs are arbitrary but shape-correct.
    sdf_net = DeepSDF(hidden_dim=128, num_layers=6)
    samples = torch.randn(100, 3)
    sdf_values = sdf_net(samples)
    print(f"Input points: {samples.shape}")
    print(f"SDF values shape: {sdf_values.shape}")
    print(f"SDF range: [{sdf_values.min().item():.4f}, {sdf_values.max().item():.4f}]")
    # Eikonal residual of the untrained field (large before training).
    eikonal = sdf_net.eikonal_loss(samples)
    print(f"\nEikonal loss (||βf|| - 1)Β²: {eikonal.item():.6f}")
    print("(Lower is better, should be close to 0 after training)")
    print()
def print_method_comparison():
    """Print a static side-by-side comparison table of INR methods.

    Purely informational: writes a pre-formatted reference table plus
    speed/memory notes and a decision guide to stdout. Takes no arguments
    and returns None.
    """
    print("="*70)
    print("Implicit Neural Representation Methods Comparison")
    print("="*70)
    print()
    # The table below is a hand-drawn box layout; keep column widths intact
    # if editing.
    comparison = """
ββββββββββββββββββββ¬βββββββββββββββββ¬ββββββββββββββββ¬βββββββββββββββ¬βββββββββββββββ
β Method β Activation β Encoding β Best For β Special β
ββββββββββββββββββββΌβββββββββββββββββΌββββββββββββββββΌβββββββββββββββΌβββββββββββββββ€
β Vanilla MLP β ReLU β None β Baseline β Spectral biasβ
ββββββββββββββββββββΌβββββββββββββββββΌββββββββββββββββΌβββββββββββββββΌβββββββββββββββ€
β Fourier MLP β ReLU β Random Fourierβ Images β High-freq OK β
β β β Features β β β
ββββββββββββββββββββΌβββββββββββββββββΌββββββββββββββββΌβββββββββββββββΌβββββββββββββββ€
β SIREN β Sine β None (implicitβ PDEs, images β Derivatives β
β β β in activation)β β β
ββββββββββββββββββββΌβββββββββββββββββΌββββββββββββββββΌβββββββββββββββΌβββββββββββββββ€
β NeRF β ReLU β Learned β Novel view β 5D function β
β β β positional β synthesis β β
ββββββββββββββββββββΌβββββββββββββββββΌββββββββββββββββΌβββββββββββββββΌβββββββββββββββ€
β Instant NGP β ReLU β Hash encoding β Fast NeRF β Real-time β
ββββββββββββββββββββΌβββββββββββββββββΌββββββββββββββββΌβββββββββββββββΌβββββββββββββββ€
β DeepSDF β ReLU + skip β None β 3D shapes β Eikonal loss β
ββββββββββββββββββββΌβββββββββββββββββΌββββββββββββββββΌβββββββββββββββΌβββββββββββββββ€
β Occupancy Net β ReLU β None β 3D shapes β Binary class β
ββββββββββββββββββββ΄βββββββββββββββββ΄ββββββββββββββββ΄βββββββββββββββ΄βββββββββββββββ
**Training Speed:**
- SIREN: Fast (smooth gradients)
- Fourier MLP: Medium (high-dim features)
- NeRF: Slow (per-scene optimization, ~hours)
- Instant NGP: Fast (~seconds with hash encoding)
**Memory:**
- All INRs: O(parameters) independent of resolution
- Typical: 1-10MB for full scene (vs. GB for voxels)
**Applications:**
1. **Graphics**: NeRF, SDF, occupancy β 3D reconstruction
2. **Compression**: Store weights instead of pixels
3. **Physics**: SIREN + PDE residual β solve differential equations
4. **Editing**: Manipulate latent codes or network weights
**Decision Guide:**
- **Need high-frequency details?** β Fourier features or SIREN
- **Novel view synthesis?** β NeRF (or Instant NGP for speed)
- **3D shape representation?** β DeepSDF or Occupancy
- **Solve PDEs?** β SIREN (derivatives crucial)
- **Real-time rendering?** β Instant NGP or 3D Gaussian Splatting
"""
    print(comparison)
    print()
# ============================================================================
# Run Demonstrations
# ============================================================================
if __name__ == "__main__":
    # Fixed seeds so demo losses are reproducible run-to-run.
    torch.manual_seed(42)
    np.random.seed(42)
    # Run every demonstration in order.
    for demo in (demo_fourier_features, demo_siren, demo_simple_nerf,
                 demo_deepsdf, print_method_comparison):
        demo()
    banner = "=" * 70
    print(banner)
    print("Implicit Neural Representations Implementations Complete")
    print(banner)
    print()
    print("Summary:")
    for summary_line in (
        " β’ Fourier Features: Random encoding for high-frequency learning",
        " β’ SIREN: Sine activations enable derivatives for PDEs",
        " β’ NeRF: 5D radiance field for photorealistic view synthesis",
        " β’ DeepSDF: Signed distance functions with eikonal regularization",
        " β’ Occupancy: Binary classification for 3D shape occupancy",
    ):
        print(summary_line)
    print()
    print("Key insight: INRs represent signals as continuous functions")
    print("Trade-off: Flexibility vs. computational cost")
    print("Applications: Novel view synthesis, compression, physics, 3D shapes")
    print()