Run this notebook: Open in Colab Open in Kaggle

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

1. Point Cloud Basics¶

Representation¶

Point cloud: \(\{p_i\}_{i=1}^N\) where \(p_i \in \mathbb{R}^3\)

Key Properties:¶

Permutation invariance: \(f(\{p_1, p_2, \ldots\}) = f(\{p_{\pi(1)}, p_{\pi(2)}, \ldots\})\)
Transformation invariance: Robust to rotations/translations
Unordered: No canonical ordering

📚 Reference Materials:

cv_3d_foundation.pdf - Cv 3D Foundation
cv_3d_research.pdf - Cv 3D Research

def generate_sphere_points(n_points=1000, radius=1.0):
    """Generate points on sphere surface."""
    theta = np.random.uniform(0, 2*np.pi, n_points)
    phi = np.random.uniform(0, np.pi, n_points)
    
    x = radius * np.sin(phi) * np.cos(theta)
    y = radius * np.sin(phi) * np.sin(theta)
    z = radius * np.cos(phi)
    
    return np.stack([x, y, z], axis=1)

def generate_cube_points(n_points=1000, size=1.0):
    """Generate points in cube volume."""
    return np.random.uniform(-size/2, size/2, (n_points, 3))

# Generate samples
sphere_pc = generate_sphere_points(500)
cube_pc = generate_cube_points(500)

# Visualize
fig = plt.figure(figsize=(14, 6))

ax1 = fig.add_subplot(121, projection='3d')
ax1.scatter(sphere_pc[:, 0], sphere_pc[:, 1], sphere_pc[:, 2], c='blue', s=1, alpha=0.6)
ax1.set_title('Sphere Point Cloud', fontsize=11)

ax2 = fig.add_subplot(122, projection='3d')
ax2.scatter(cube_pc[:, 0], cube_pc[:, 1], cube_pc[:, 2], c='red', s=1, alpha=0.6)
ax2.set_title('Cube Point Cloud', fontsize=11)

plt.tight_layout()
plt.show()

2. T-Net (Transformation Network)¶

Learns spatial transformer for input alignment:

\[T(x) = \text{MLP}(x) \rightarrow \mathbb{R}^{3 \times 3}\]

class TNet(nn.Module):
    """Spatial transformer network."""
    
    def __init__(self, k=3):
        super().__init__()
        self.k = k
        
        self.conv1 = nn.Conv1d(k, 64, 1)
        self.conv2 = nn.Conv1d(64, 128, 1)
        self.conv3 = nn.Conv1d(128, 1024, 1)
        
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, k*k)
        
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        self.bn4 = nn.BatchNorm1d(512)
        self.bn5 = nn.BatchNorm1d(256)
    
    def forward(self, x):
        batch_size = x.size(0)
        
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        
        x = torch.max(x, 2)[0]  # Max pooling
        
        x = F.relu(self.bn4(self.fc1(x)))
        x = F.relu(self.bn5(self.fc2(x)))
        x = self.fc3(x)
        
        # Initialize as identity
        identity = torch.eye(self.k, device=x.device).flatten().unsqueeze(0)
        x = x + identity
        
        return x.view(batch_size, self.k, self.k)

# Test
tnet = TNet(k=3).to(device)
test_input = torch.randn(4, 3, 100).to(device)
transform = tnet(test_input)
print(f"Transform shape: {transform.shape}")

3. PointNet Architecture¶

Symmetric Function:¶

\[f(x_1, \ldots, x_n) = g\left(\max_{i=1}^n h(x_i)\right)\]

Max pooling ensures permutation invariance.

class PointNet(nn.Module):
    """PointNet for classification."""
    
    def __init__(self, num_classes=10):
        super().__init__()
        
        # Input transform
        self.input_transform = TNet(k=3)
        
        # Feature transform
        self.feature_transform = TNet(k=64)
        
        # MLP 1
        self.conv1 = nn.Conv1d(3, 64, 1)
        self.conv2 = nn.Conv1d(64, 64, 1)
        
        # MLP 2
        self.conv3 = nn.Conv1d(64, 64, 1)
        self.conv4 = nn.Conv1d(64, 128, 1)
        self.conv5 = nn.Conv1d(128, 1024, 1)
        
        # Batch norms
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(64)
        self.bn3 = nn.BatchNorm1d(64)
        self.bn4 = nn.BatchNorm1d(128)
        self.bn5 = nn.BatchNorm1d(1024)
        
        # Classifier
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)
        
        self.bn6 = nn.BatchNorm1d(512)
        self.bn7 = nn.BatchNorm1d(256)
        self.dropout = nn.Dropout(0.3)
    
    def forward(self, x):
        batch_size = x.size(0)
        n_points = x.size(2)
        
        # Input transform
        trans = self.input_transform(x)
        x = torch.bmm(trans, x)
        
        # MLP 1
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        
        # Feature transform
        trans_feat = self.feature_transform(x)
        x = torch.bmm(trans_feat, x)
        
        # MLP 2
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        x = F.relu(self.bn5(self.conv5(x)))
        
        # Global feature
        x = torch.max(x, 2)[0]
        
        # Classifier
        x = F.relu(self.bn6(self.fc1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn7(self.fc2(x)))
        x = self.dropout(x)
        x = self.fc3(x)
        
        return x, trans_feat

model = PointNet(num_classes=2).to(device)
print("PointNet created")

Create Synthetic Dataset¶

To demonstrate PointNet without requiring large 3D datasets, we create synthetic point clouds by sampling points from simple geometric shapes (spheres, cubes, cylinders, cones). Each shape class is represented by a set of \(N\) 3D coordinates, with some noise added for realism. This controlled setup lets us verify that the network can classify shapes based purely on their geometric structure, without relying on texture or color. The principles learned here transfer directly to real point cloud data from LiDAR sensors, depth cameras, and 3D scanners.

class PointCloudDataset(torch.utils.data.Dataset):
    """Synthetic point cloud dataset."""
    
    def __init__(self, n_samples=1000, n_points=1024):
        self.n_samples = n_samples
        self.n_points = n_points
        
        self.data = []
        self.labels = []
        
        for i in range(n_samples):
            if i % 2 == 0:
                # Sphere
                pc = generate_sphere_points(n_points)
                label = 0
            else:
                # Cube
                pc = generate_cube_points(n_points)
                label = 1
            
            self.data.append(pc)
            self.labels.append(label)
    
    def __len__(self):
        return self.n_samples
    
    def __getitem__(self, idx):
        pc = torch.FloatTensor(self.data[idx]).T  # (3, N)
        label = torch.LongTensor([self.labels[idx]])[0]
        return pc, label

# Create datasets
train_dataset = PointCloudDataset(n_samples=800, n_points=512)
test_dataset = PointCloudDataset(n_samples=200, n_points=512)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

print(f"Train: {len(train_dataset)}, Test: {len(test_dataset)}")

Train PointNet¶

Training PointNet follows the standard supervised learning loop: forward pass through the shared MLPs, symmetric max-pooling to obtain a global feature vector, and classification via fully connected layers with cross-entropy loss. The max-pooling aggregation is what makes PointNet permutation invariant – shuffling the order of input points produces the same global feature. An optional spatial transformer network (T-Net) learns to canonicalize the input point cloud’s orientation, making the model robust to rigid transformations. Monitoring both training loss and per-class accuracy helps detect class imbalance issues common in 3D shape datasets.

def regularization_loss(trans_feat):
    """Feature transform regularization."""
    batch_size = trans_feat.size(0)
    identity = torch.eye(64, device=trans_feat.device).unsqueeze(0).repeat(batch_size, 1, 1)
    diff = trans_feat @ trans_feat.transpose(1, 2) - identity
    return torch.norm(diff, dim=(1, 2)).mean()

def train_pointnet(model, train_loader, n_epochs=20):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    
    losses = []
    
    for epoch in range(n_epochs):
        model.train()
        epoch_loss = 0
        
        for pc, labels in train_loader:
            pc, labels = pc.to(device), labels.to(device)
            
            output, trans_feat = model(pc)
            
            # Classification loss
            cls_loss = F.cross_entropy(output, labels)
            
            # Regularization
            reg_loss = regularization_loss(trans_feat)
            
            loss = cls_loss + 0.001 * reg_loss
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            losses.append(loss.item())
        
        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1}/{n_epochs}, Loss: {epoch_loss/len(train_loader):.4f}")
    
    return losses

losses = train_pointnet(model, train_loader, n_epochs=20)

plt.figure(figsize=(10, 5))
plt.plot(losses, alpha=0.3)
plt.plot(np.convolve(losses, np.ones(20)/20, mode='valid'), linewidth=2)
plt.xlabel('Iteration', fontsize=11)
plt.ylabel('Loss', fontsize=11)
plt.title('PointNet Training', fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()

Evaluate¶

Evaluation on held-out test shapes measures overall classification accuracy and per-class performance. A confusion matrix reveals which shape pairs are most commonly confused – for example, cubes and rectangular prisms might be hard to distinguish from sparse point samples. Comparing PointNet’s accuracy to simpler baselines (e.g., hand-crafted 3D features plus SVM) demonstrates the power of learning point-level features directly from raw coordinates. In real applications, this evaluation extends to larger benchmarks like ModelNet40 (40 object categories) and ShapeNet (part segmentation).

model.eval()
correct = 0
total = 0

with torch.no_grad():
    for pc, labels in test_loader:
        pc, labels = pc.to(device), labels.to(device)
        output, _ = model(pc)
        pred = output.argmax(dim=1)
        correct += (pred == labels).sum().item()
        total += labels.size(0)

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

Test Permutation Invariance¶

A critical property of PointNet is permutation invariance: the output should be identical regardless of the order in which points are presented. We verify this empirically by passing the same point cloud through the network multiple times with different random permutations of the point order and checking that the output logits are exactly equal. This test validates that the max-pooling aggregation correctly eliminates ordering dependence. Permutation invariance is essential because point clouds from sensors arrive in arbitrary order, and the model must not depend on an artificial indexing scheme.

# Get sample
pc_sample, _ = test_dataset[0]
pc_sample = pc_sample.unsqueeze(0).to(device)

# Original prediction
model.eval()
with torch.no_grad():
    output1, _ = model(pc_sample)
    pred1 = output1.argmax(dim=1).item()

# Shuffle points
perm = torch.randperm(pc_sample.size(2))
pc_shuffled = pc_sample[:, :, perm]

with torch.no_grad():
    output2, _ = model(pc_shuffled)
    pred2 = output2.argmax(dim=1).item()

print(f"Original prediction: {pred1}")
print(f"Shuffled prediction: {pred2}")
print(f"Predictions match: {pred1 == pred2}")
print(f"Output difference: {torch.abs(output1 - output2).max().item():.6f}")

Summary¶

PointNet Key Ideas:¶

Permutation invariance via max pooling
Spatial transformers for alignment
Global + local features
Direct point processing

Architecture:¶

Input transform (T-Net)
Feature transform (T-Net)
Symmetric aggregation (max pool)
MLP classifier

Applications:¶

3D object classification
Part segmentation
Scene understanding
Autonomous driving

Extensions:¶

PointNet++: Hierarchical learning
DGCNN: Dynamic graph CNN
PointConv: Continuous convolution
Point Transformers: Attention mechanisms

Advanced Point Cloud Networks Theory¶

1. Introduction to Point Cloud Processing¶

Point clouds are sets of 3D points representing object surfaces or scenes, commonly obtained from LiDAR, RGB-D cameras, or 3D reconstruction.

1.1 Point Cloud Characteristics¶

Unique properties:

Unordered: No canonical ordering of points
Irregular: Non-uniform density, varying sampling rates
Permutation invariant: Should produce same result for any point ordering
Sparse: 3D space is largely empty
Transformation equivariant: Rotations/translations should preserve structure

Representation: P = {p_i ∈ ℝ³ | i = 1,…,N}

May include features: p_i ∈ ℝ^(3+F) (position + color, normal, etc.)

1.2 Challenges¶

Permutation invariance: Traditional CNNs assume grid structure
Rotation sensitivity: Points in different orientations should be recognized as same
Varying density: Distant points are sparser
Scalability: Millions of points in real scenes
Local structure: Capturing geometric patterns

Applications:

3D object classification/detection
Autonomous driving (LiDAR perception)
Robotics (grasping, navigation)
3D reconstruction and completion
Semantic segmentation of scenes

2. PointNet: Pioneer Architecture¶

PointNet [Qi et al., 2017a]: First deep learning architecture for raw point clouds.

2.1 Core Idea¶

Key insight: Use symmetric function to achieve permutation invariance.

Architecture:

Point Cloud (N × 3) → MLP → Features (N × d) → MaxPool → Global Feature (d) → MLP → Output

Symmetric function: f({x₁,…,xₙ}) ≈ g(h(x₁),…,h(xₙ))

h: Per-point feature extractor (shared MLP)
g: Symmetric aggregator (max pooling)

2.2 Mathematical Framework¶

Per-point feature extraction:

f_i = MLP(p_i) ∈ ℝ^d

Global feature (permutation invariant):

f_global = max_{i=1,...,N} f_i

Classification:

y = MLP_cls(f_global)

Segmentation (per-point labels):

y_i = MLP_seg(concat(f_i, f_global))

2.3 Input and Feature Transformations¶

Problem: Rotation sensitivity

Solution: T-Net (Spatial Transformer Network)

Input transform (3 × 3):

T_input = TNet_3×3({p_i})
p_i' = T_input · p_i

Feature transform (64 × 64):

T_feature = TNet_64×64({f_i})
f_i' = T_feature · f_i

Regularization (orthogonality):

L_reg = ||I - T^T T||²_F

Ensures transformation is approximately rigid.

2.4 Theoretical Guarantees¶

Universal approximation: PointNet can approximate any continuous set function.

Proof sketch:

h (MLP) can approximate any continuous function on ℝ³
max operation selects representative features
g (MLP) combines features to approximate target function

Critical points: Max pooling selects “skeleton” of point cloud.

2.5 Limitations¶

Local structure: Doesn’t explicitly capture local geometry
Context: Each point processed independently (before max pool)
Scalability: All points in one forward pass

3. PointNet++: Hierarchical Feature Learning¶

PointNet++ [Qi et al., 2017b]: Extends PointNet with hierarchical structure and local context.

3.1 Architecture Overview¶

Hierarchical abstraction:

Points (N₁ × 3) → SA Layer 1 → Points (N₂ × d₁) → SA Layer 2 → Points (N₃ × d₂) → ...

Set Abstraction (SA) Layer:

Sampling: Select subset of points (centroids)
Grouping: Group neighbors around each centroid
PointNet: Extract local features

3.2 Set Abstraction Layer¶

Components:

1. Farthest Point Sampling (FPS):

Select N’ centroids from N points
Maximize minimum distance between centroids

Algorithm:

C = {c₁}  # Start with random point
for i = 2 to N':
    c_i = argmax_{p ∈ P \ C} min_{c ∈ C} ||p - c||
    C = C ∪ {c_i}

2. Ball Query:

For each centroid c_i, find all points within radius r
More robust than k-NN to varying density

N(c_i) = {p_j ∈ P | ||p_j - c_i|| < r}

3. PointNet on local region:

f_i = PointNet({p_j - c_i | p_j ∈ N(c_i)})

Note: Use relative coordinates (p_j - c_i) for translation invariance.

3.3 Multi-Scale Grouping (MSG)¶

Problem: Varying point density

Solution: Multiple scales with different radii

MSG layer:

f_i = concat(PointNet_r₁(N_r₁(c_i)), PointNet_r₂(N_r₂(c_i)), ...)

Example: r = [0.1, 0.2, 0.4] for coarse-to-fine features

3.4 Multi-Resolution Grouping (MRG)¶

Alternative: Combine features from different abstraction levels

Idea:

Low-level: Dense sampling, fine details
High-level: Sparse sampling, global context

Concatenate features from multiple resolutions.

3.5 Point Feature Propagation¶

For segmentation: Need per-point labels

Problem: SA layers downsample → fewer points

Solution: Interpolate features back to original points

Interpolation (inverse distance weighted):

f(p) = Σ_{i=1}^k w_i f_i / Σ_{i=1}^k w_i

w_i = 1 / ||p - p_i||²

Where {p_i} are k nearest neighbors in coarser layer.

Feature propagation:

f_propagated = MLP(concat(f_interpolated, f_skip))

f_skip: Skip connection from encoder.

4. Dynamic Graph CNN (DGCNN)¶

DGCNN [Wang et al., 2019]: Constructs graph dynamically in feature space.

4.1 EdgeConv Operation¶

Key idea: Compute edge features between points and neighbors.

Edge feature:

e_ij = h_θ(x_i, x_j - x_i)

Where:

x_i: Center point feature
x_j - x_i: Relative neighbor feature
h_θ: Shared MLP (learnable)

Aggregation:

x_i' = max_{j ∈ N(i)} e_ij

Or other aggregators: sum, mean, attention.

4.2 Dynamic Graph Construction¶

Key innovation: Update k-NN graph after each layer.

Procedure:

Compute k-NN in feature space (not 3D coordinates)
Apply EdgeConv
Repeat with updated features

Benefits:

Adapts to learned feature geometry
Captures semantic relationships (not just spatial)

Graph at layer l:

G^(l) = kNN(F^(l), k)

Where F^(l) are features at layer l.

4.3 Architecture¶

Stacked EdgeConv layers:

Input (N × 3) → EdgeConv₁ → EdgeConv₂ → ... → EdgeConvₗ → MaxPool → MLP → Output

Feature concatenation: Combine features from all layers.

f_global = max(concat(f₁, f₂, ..., fₗ))

4.4 Comparison to PointNet++¶

PointNet++:

Fixed neighborhoods (ball query on 3D coords)
Hierarchical downsampling

DGCNN:

Dynamic neighborhoods (k-NN in feature space)
Full resolution throughout (optional downsampling)

5. Point Transformers¶

5.1 Point Transformer Layer¶

Applying attention to point clouds [Zhao et al., 2021].

Vector attention:

y_i = Σ_{j ∈ N(i)} ρ(γ(φ(x_i) - ψ(x_j) + δ)) ⊙ (α(x_j) + δ)

Where:

φ, ψ, α, γ: Linear transformations (like Q, K, V in standard attention)
δ = φ_pos(p_i - p_j): Position encoding
ρ: Softmax normalization
⊙: Element-wise product

Position encoding:

δ = MLP(p_i - p_j)

Captures relative position information.

Attention weights:

a_ij = softmax_j(γ(φ(x_i) - ψ(x_j) + δ))

Output:

y_i = Σ_j a_ij ⊙ (α(x_j) + δ)

5.2 Transition Layers¶

Downsampling: Farthest point sampling + k-NN grouping

Upsampling: Trilinear interpolation + skip connections

Similar to PointNet++ but with transformer blocks.

5.3 Advantages¶

Long-range dependencies: Attention captures global context
Adaptive: Learns which points to attend to
Position-aware: δ encodes geometric relationships

6. Rotation-Invariant Networks¶

6.1 Problem: SO(3) Equivariance¶

Challenge: Point clouds can be arbitrarily rotated.

Desirable property: f(R · P) = R · f(P) or f(R · P) = f(P)

Approaches:

Data augmentation: Train on rotated copies (not truly invariant)
Canonical orientation: Align to principal axes (unstable)
Equivariant features: Design rotation-equivariant operations

6.2 PointConv¶

PointConv [Wu et al., 2019]: Continuous convolution on point clouds.

Convolution:

f_out(p) = Σ_{p_i ∈ N(p)} W(p - p_i) f_in(p_i)

Learned kernel:

W(Δp) = MLP(Δp)

Density-aware: Weight by local density.

6.3 VN-PointNet/DGCNN¶

Vector Neuron networks [Deng et al., 2021]: Equivariant operations.

Vector neuron: 3D vector instead of scalar

Operations:

VN-Linear: V_out = W V_in (matrix-vector product)
VN-ReLU: ReLU(||V||) · V/||V|| (direction-preserving)
VN-MaxPool: argmax_i ||V_i||

Invariant features: Extract rotation-invariant scalars

||V||: Magnitude
V^T V’: Inner product
||(V × V’)||: Cross product magnitude

7. Point Cloud Segmentation¶

7.1 Semantic Segmentation¶

Task: Assign class label to each point.

Architecture: Encoder-decoder with skip connections.

PointNet++ segmentation:

Input → SA layers → FP layers → Per-point MLP → Scores

Loss: Cross-entropy per point

L = -Σ_i Σ_c y_ic log ŷ_ic

7.2 Instance Segmentation¶

Task: Group points into object instances + classify.

3D-BoNet [Yang et al., 2019]:

Bounding box prediction: Predict 3D boxes
Point-box association: Assign points to boxes
Classification: Classify each instance

SGPN [Wang et al., 2018]: Similarity Group Proposal Network

Learn pairwise point similarity
Group points via clustering

7.3 Part Segmentation¶

Task: Segment object into parts (e.g., chair legs, back, seat).

ShapeNet Part dataset: 16 categories, 50 parts

Approach: Category-conditional segmentation

Input: Point cloud + category label
Output: Part labels per point

8. Point Cloud Object Detection¶

8.1 3D Bounding Boxes¶

Representation: (x, y, z, l, w, h, θ)

Center: (x, y, z)
Size: length, width, height
Orientation: yaw angle θ

IoU: 3D Intersection over Union for evaluation

8.2 VoxelNet¶

VoxelNet [Zhou & Tuzel, 2018]: Voxelize point cloud.

Pipeline:

Voxel partition: Divide 3D space into voxels
VFE (Voxel Feature Encoding): PointNet per voxel
3D CNN: Process voxel grid
RPN (Region Proposal Network): Predict 3D boxes

VFE layer:

f_voxel = max_{p ∈ voxel} MLP(concat(p, f_voxel_mean))

8.3 PointPillars¶

PointPillars [Lang et al., 2019]: Organize points into vertical pillars.

Advantages:

Faster than VoxelNet (2D conv instead of 3D)
Efficient for LiDAR (mostly empty above ground)

Pipeline:

Pillar encoding: PointNet on each pillar
Pseudo-image: Scatter features to BEV (bird’s-eye view)
2D CNN: Backbone network
Detection head: Predict boxes

8.4 PointRCNN¶

PointRCNN [Shi et al., 2019]: Two-stage detector on raw points.

Stage 1: 3D proposal generation

Segment foreground points
Predict 3D box per point
Refine via clustering

Stage 2: Box refinement

Canonical coordinates: Transform points to box frame
PointNet++: Refine box parameters

9. Point Cloud Generation¶

9.1 Point Cloud Autoencoders¶

Architecture: Encoder-decoder for point clouds.

Encoder: PointNet → latent vector z

Decoder: FC layers → N × 3 points

Loss: Chamfer distance

L_CD = Σ_{p ∈ P₁} min_{q ∈ P₂} ||p - q||² + Σ_{q ∈ P₂} min_{p ∈ P₁} ||p - q||²

Earth Mover’s Distance (more accurate, slower):

L_EMD = min_{φ:P₁→P₂} Σ_{p ∈ P₁} ||p - φ(p)||²

Where φ is bijection.

9.2 Point Cloud GANs¶

PC-GAN [Li et al., 2018]: Generate point clouds.

Generator: z ~ N(0,I) → MLP → N × 3 points

Discriminator: PointNet → real/fake score

Challenges:

Mode collapse
Unstable training
Evaluation metrics (FID for point clouds)

9.3 Point Flow¶

PointFlow [Yang et al., 2019]: Normalizing flow for point clouds.

Idea: Learn bijection between Gaussian and point cloud distribution.

Two-level hierarchy:

Global: Sample number of points and shape code
Local: Sample point positions conditioned on shape

Benefits: Exact likelihood, better mode coverage than GANs.

10. Point Cloud Completion¶

10.1 Problem Formulation¶

Input: Partial point cloud P_partial Output: Complete point cloud P_complete

Applications:

Occlusion handling in robotics
3D reconstruction from single view
Shape completion from scans

10.2 PCN: Point Completion Network¶

PCN [Yuan et al., 2018]:

Encoder: PointNet → coarse global feature

Decoder:

Coarse completion: FC layers → M points (low-res)
Fine completion: Folding-based → N points (high-res)

Multi-resolution loss:

L = L_coarse(P_coarse, P_gt) + L_fine(P_fine, P_gt)

10.3 FoldingNet¶

FoldingNet [Yang et al., 2018]: Learn to fold 2D grid into 3D shape.

Decoder:

p_i = MLP(concat(f_global, grid_i))

Where grid_i ∈ [0,1]² is 2D grid point.

Intuition: Learn deformation from canonical 2D plane to 3D surface.

11. Point Cloud Upsampling¶

11.1 PU-Net¶

PU-Net [Yu et al., 2018]: Upsample sparse point clouds.

Idea: Learn to densify point cloud while preserving geometry.

Pipeline:

Feature extraction: PointNet++
Feature expansion: Replicate features
Point generation: MLP → dense points
Coordinate refinement: Adjust positions

Expansion:

f_i → [f_i, f_i, ..., f_i]  (r times)

Then MLP to generate r points per input point.

11.2 PU-GAN¶

PU-GAN [Li et al., 2019]: GAN-based upsampling.

Generator: Upsample point cloud

Discriminator: Distinguish real vs. generated patches

Self-attention: Capture long-range dependencies

Losses:

Adversarial loss
Uniform loss (encourage uniform distribution)
Reconstruction loss

12. Applications¶

12.1 Autonomous Driving¶

LiDAR perception:

Detection: Cars, pedestrians, cyclists (PointPillars, SECOND)
Tracking: Multi-object tracking in 3D
Segmentation: Road, buildings, vegetation

Sensor fusion: Combine LiDAR + camera

12.2 Robotics¶

Grasping:

Segment graspable regions
Plan grasp poses from partial point clouds

Navigation:

SLAM (Simultaneous Localization and Mapping)
Obstacle avoidance

12.3 AR/VR¶

3D reconstruction: Real-time scene capture

Object recognition: Identify objects in 3D space

Interaction: Hand tracking, gesture recognition

12.4 Medical Imaging¶

Organ segmentation: From CT/MRI scans

Surgical planning: 3D anatomy visualization

13. Recent Advances (2020-2024)¶

13.1 Point-Voxel CNN (PVCNN)¶

PVCNN [Liu et al., 2019]: Combine point and voxel representations.

Idea:

Voxel branch: Regular grid, efficient 3D conv
Point branch: Precise geometry, no discretization

Fusion: Devoxelize voxel features → combine with point features

Benefits: Speed of voxels + accuracy of points.

13.2 Stratified Transformer¶

Stratified Transformer [Lai et al., 2022]: Efficient attention for large point clouds.

Key idea: Hierarchical grouping + local attention

Complexity: O(N log N) instead of O(N²)

13.3 Point-BERT¶

Point-BERT [Yu et al., 2022]: Pre-training for point clouds.

Masked autoencoding:

Mask random points
Predict masked points from context

Discretization: Learn discrete tokens via dVAE

Transfer: Fine-tune on downstream tasks

13.4 Point-MAE¶

Point-MAE [Pang et al., 2022]: Masked autoencoders.

High masking ratio: 60-90% of points masked

Decoder: Lightweight transformer

Reconstruction: Predict coordinates of masked points

Finding: High masking ratio forces learning global structure.

14. Benchmarks and Datasets¶

14.1 ModelNet¶

ModelNet40: 12,311 CAD models, 40 categories

Task: Classification

Evaluation: Overall accuracy

Sampling: 1,024 or 2,048 points per model

14.2 ShapeNet¶

ShapeNetCore: 51,300 models, 55 categories

ShapeNetPart: Part segmentation (16 categories, 50 parts)

Tasks: Classification, segmentation, completion

14.3 ScanNet¶

Indoor scene dataset: 1,513 scanned scenes

Annotations: Semantic labels, instance masks, 3D boxes

Tasks: Scene understanding, object detection, segmentation

14.4 KITTI¶

Autonomous driving: 7,481 training, 7,518 testing frames

LiDAR + camera: Multi-modal

Tasks: 3D object detection, tracking

Categories: Car, pedestrian, cyclist

14.5 Waymo Open Dataset¶

Large-scale: 1,000 scenes, 200k frames

High quality: 5 LiDAR sensors, 5 cameras

Dense annotations: 3D boxes, tracking IDs

15. Evaluation Metrics¶

15.1 Classification¶

Accuracy: Fraction of correctly classified point clouds

Per-class accuracy: Accuracy per category (balanced)

15.2 Segmentation¶

mIoU (mean Intersection over Union):

IoU_c = TP_c / (TP_c + FP_c + FN_c)
mIoU = (1/C) Σ_c IoU_c

Overall accuracy: Fraction of correctly labeled points

15.3 Detection¶

AP (Average Precision): At IoU threshold (typically 0.5, 0.7)

mAP: Mean over categories

NDS (Waymo): Normalized Detection Score

NDS = (1/10) Σ_{thresholds} AP_threshold

15.4 Completion/Generation¶

Chamfer Distance (CD):

CD = (1/|P₁|) Σ_{p ∈ P₁} min_{q ∈ P₂} ||p - q||² + 
     (1/|P₂|) Σ_{q ∈ P₂} min_{p ∈ P₁} ||p - q||²

Earth Mover’s Distance (EMD):

EMD = min_{φ:P₁→P₂} (1/N) Σ_{p ∈ P₁} ||p - φ(p)||²

Hausdorff Distance:

d_H = max(max_{p ∈ P₁} min_{q ∈ P₂} ||p-q||, max_{q ∈ P₂} min_{p ∈ P₁} ||p-q||)

16. Implementation Considerations¶

16.1 Efficient Neighborhood Search¶

k-NN in 3D:

Naive: O(N²) comparison
KD-tree: O(N log N) build, O(log N) query
Ball tree: Better for high dimensions
GPU: Batch k-NN via sorting

Libraries:

Open3D, PyTorch3D (GPU-accelerated)
scipy.spatial.KDTree (CPU)

16.2 Sampling Strategies¶

Random sampling: Fast but may miss details

Farthest Point Sampling: Better coverage, O(N²)

Approximate FPS: Faster alternatives

Grid-based: Voxelize → sample per voxel
Random subset FPS: FPS on random subset

16.3 Normalization¶

Unit sphere:

P_norm = (P - center) / max_distance

Zero mean, unit variance:

P_norm = (P - mean(P)) / std(P)

Align to canonical orientation: PCA, but can be unstable

16.4 Data Augmentation¶

Rotation: Random SO(3) rotation

Small angles (±15°) or full rotation

Jittering: Add Gaussian noise

p_i ← p_i + ε, ε ~ N(0, σ²I)

Scaling: Random scaling factor [0.8, 1.2]

Dropout: Randomly drop points (simulate occlusion)

17. Limitations and Open Problems¶

17.1 Current Limitations¶

Scalability: Large scenes (millions of points) challenging
Density variation: Varying sampling rates
Noise robustness: Real scans have outliers, noise
Occlusion: Missing data from self-occlusion
Computational cost: Point-wise operations expensive

17.2 Open Problems¶

Unified architecture: Single model for all tasks
Self-supervised learning: Better pre-training strategies
Multi-modal fusion: Optimal way to combine LiDAR + RGB
Temporal modeling: Point cloud sequences (4D)
Generalization: Cross-dataset performance

17.3 Future Directions¶

Transformer-based: Scaling transformers to large point clouds
Implicit representations: Neural fields vs. explicit points
Efficient attention: Linear complexity attention
Neural rendering: NeRF integration with point clouds
Foundation models: Large pre-trained models (Point-BERT, Point-MAE)

18. Key Takeaways¶

PointNet: Pioneering permutation-invariant architecture via symmetric function (max pooling)
PointNet++: Hierarchical feature learning with FPS + ball query + local PointNet
DGCNN: Dynamic graph construction in feature space with EdgeConv
Point Transformers: Attention mechanism with position encoding for point clouds
Rotation invariance: Critical for real-world robustness (VN networks, equivariant features)
Set abstraction: Sampling + grouping + feature extraction fundamental building block
Applications: Autonomous driving (detection), robotics (grasping), AR/VR (reconstruction)
Metrics: Chamfer distance (generation), mIoU (segmentation), mAP (detection)
Challenges: Scalability, density variation, rotation sensitivity
Future: Transformers, self-supervised learning, multi-modal fusion

Core insight: Point clouds require permutation-invariant operations and hierarchical abstractions to capture both local geometry and global structure.

19. Mathematical Summary¶

PointNet global feature:

f_global = max_{i=1,...,N} MLP(p_i)

PointNet++ set abstraction:

S^(l) = {f_i^(l) | f_i^(l) = PointNet({p_j - c_i | p_j ∈ B(c_i, r)})}

DGCNN EdgeConv:

x_i' = max_{j ∈ kNN(x_i)} MLP(concat(x_i, x_j - x_i))

Point Transformer attention:

y_i = Σ_{j ∈ N(i)} softmax(γ(φ(x_i) - ψ(x_j) + δ_ij)) ⊙ (α(x_j) + δ_ij)

Chamfer Distance:

CD(P₁, P₂) = Σ_{p ∈ P₁} min_{q ∈ P₂} ||p-q||² + Σ_{q ∈ P₂} min_{p ∈ P₁} ||p-q||²

References¶

Qi et al. (2017a) “PointNet: Deep Learning on Point Sets for 3D Classification and Segmentation”
Qi et al. (2017b) “PointNet++: Deep Hierarchical Feature Learning on Point Sets in a Metric Space”
Wang et al. (2019) “Dynamic Graph CNN for Learning on Point Clouds”
Zhao et al. (2021) “Point Transformer”
Wu et al. (2019) “PointConv: Deep Convolutional Networks on 3D Point Clouds”
Zhou & Tuzel (2018) “VoxelNet: End-to-End Learning for Point Cloud Based 3D Object Detection”
Lang et al. (2019) “PointPillars: Fast Encoders for Object Detection from Point Clouds”
Yu et al. (2018) “PU-Net: Point Cloud Upsampling Network”
Pang et al. (2022) “Masked Autoencoders for Point Cloud Self-supervised Learning”
Liu et al. (2019) “Point-Voxel CNN for Efficient 3D Deep Learning”

"""
Advanced Point Cloud Networks Implementations

Production-ready PyTorch implementations of point cloud processing architectures.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from typing import List, Tuple, Optional, Callable
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# ============================================================================
# 1. PointNet Architecture
# ============================================================================

class TNet(nn.Module):
    """
    Transformation Network (T-Net) for input/feature transformation.
    
    Predicts transformation matrix to canonicalize inputs.
    
    Args:
        k: Dimensionality of input (3 for spatial, 64 for features)
    """
    def __init__(self, k: int = 3):
        super().__init__()
        self.k = k
        
        # Shared MLPs
        self.conv1 = nn.Conv1d(k, 64, 1)
        self.conv2 = nn.Conv1d(64, 128, 1)
        self.conv3 = nn.Conv1d(128, 1024, 1)
        
        # FC layers
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, k * k)
        
        # Batch norm
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        self.bn4 = nn.BatchNorm1d(512)
        self.bn5 = nn.BatchNorm1d(256)
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [batch, k, num_points]
            
        Returns:
            transform: [batch, k, k] transformation matrix
        """
        batch_size = x.size(0)
        
        # Shared MLP
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        
        # Max pooling
        x = torch.max(x, 2)[0]  # [batch, 1024]
        
        # FC layers
        x = F.relu(self.bn4(self.fc1(x)))
        x = F.relu(self.bn5(self.fc2(x)))
        x = self.fc3(x)  # [batch, k*k]
        
        # Initialize as identity
        identity = torch.eye(self.k, device=x.device).flatten().unsqueeze(0)
        identity = identity.repeat(batch_size, 1)
        
        x = x + identity
        x = x.view(-1, self.k, self.k)
        
        return x


class PointNet(nn.Module):
    """
    PointNet for classification and segmentation.
    
    Args:
        num_classes: Number of output classes
        task: 'classification' or 'segmentation'
        use_transform: Whether to use input/feature transforms
    """
    def __init__(
        self,
        num_classes: int,
        task: str = 'classification',
        use_transform: bool = True
    ):
        super().__init__()
        self.task = task
        self.use_transform = use_transform
        
        # Input transform (3x3)
        if use_transform:
            self.input_transform = TNet(k=3)
            self.feature_transform = TNet(k=64)
        
        # Shared MLPs
        self.conv1 = nn.Conv1d(3, 64, 1)
        self.conv2 = nn.Conv1d(64, 128, 1)
        self.conv3 = nn.Conv1d(128, 1024, 1)
        
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        
        if task == 'classification':
            # Classification head
            self.fc1 = nn.Linear(1024, 512)
            self.fc2 = nn.Linear(512, 256)
            self.fc3 = nn.Linear(256, num_classes)
            self.bn_fc1 = nn.BatchNorm1d(512)
            self.bn_fc2 = nn.BatchNorm1d(256)
            self.dropout = nn.Dropout(p=0.3)
        else:
            # Segmentation head
            self.conv4 = nn.Conv1d(1088, 512, 1)  # 1024 global + 64 local
            self.conv5 = nn.Conv1d(512, 256, 1)
            self.conv6 = nn.Conv1d(256, 128, 1)
            self.conv7 = nn.Conv1d(128, num_classes, 1)
            self.bn4 = nn.BatchNorm1d(512)
            self.bn5 = nn.BatchNorm1d(256)
            self.bn6 = nn.BatchNorm1d(128)
        
    def forward(
        self,
        x: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Args:
            x: [batch, 3, num_points] or [batch, num_points, 3]
            
        Returns:
            output: [batch, num_classes] or [batch, num_classes, num_points]
            feature_transform: [batch, 64, 64] if use_transform else None
        """
        # Ensure [batch, 3, num_points]
        if x.size(1) != 3:
            x = x.transpose(1, 2)
        
        batch_size = x.size(0)
        num_points = x.size(2)
        
        # Input transform
        if self.use_transform:
            trans_input = self.input_transform(x)
            x = torch.bmm(trans_input, x)
        
        # MLP (64, 64)
        x = F.relu(self.bn1(self.conv1(x)))
        point_features = x  # Save for segmentation
        
        # Feature transform
        trans_feat = None
        if self.use_transform:
            trans_feat = self.feature_transform(x)
            x = torch.bmm(trans_feat, x)
        
        # MLP (64, 128, 1024)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.bn3(self.conv3(x))
        
        # Max pooling
        global_feature = torch.max(x, 2)[0]  # [batch, 1024]
        
        if self.task == 'classification':
            # Classification
            x = F.relu(self.bn_fc1(self.fc1(global_feature)))
            x = self.dropout(x)
            x = F.relu(self.bn_fc2(self.fc2(x)))
            x = self.dropout(x)
            x = self.fc3(x)
            return x, trans_feat
        else:
            # Segmentation
            # Replicate global feature
            global_feature_expanded = global_feature.unsqueeze(2).repeat(
                1, 1, num_points
            )
            
            # Concatenate point features with global feature
            x = torch.cat([point_features, global_feature_expanded], dim=1)
            
            # Segmentation MLP
            x = F.relu(self.bn4(self.conv4(x)))
            x = F.relu(self.bn5(self.conv5(x)))
            x = F.relu(self.bn6(self.conv6(x)))
            x = self.conv7(x)
            
            return x, trans_feat


def pointnet_loss(
    outputs: torch.Tensor,
    labels: torch.Tensor,
    trans_feat: Optional[torch.Tensor] = None,
    reg_weight: float = 0.001
) -> Tuple[torch.Tensor, Dict[str, float]]:
    """
    PointNet loss with transformation regularization.
    
    Args:
        outputs: Model predictions
        labels: Ground truth
        trans_feat: Feature transformation matrix [batch, 64, 64]
        reg_weight: Regularization weight
        
    Returns:
        total_loss, loss_dict
    """
    # Classification/segmentation loss
    if len(outputs.shape) == 2:
        # Classification
        task_loss = F.cross_entropy(outputs, labels)
    else:
        # Segmentation
        task_loss = F.cross_entropy(outputs, labels)
    
    # Regularization: Enforce orthogonality
    reg_loss = 0.0
    if trans_feat is not None:
        batch_size = trans_feat.size(0)
        I = torch.eye(64, device=trans_feat.device).unsqueeze(0).repeat(
            batch_size, 1, 1
        )
        # ||I - A^T A||^2
        diff = I - torch.bmm(trans_feat.transpose(1, 2), trans_feat)
        reg_loss = torch.mean(torch.norm(diff, dim=(1, 2)))
    
    total_loss = task_loss + reg_weight * reg_loss
    
    loss_dict = {
        'total': total_loss.item(),
        'task': task_loss.item(),
        'reg': reg_loss if isinstance(reg_loss, float) else reg_loss.item()
    }
    
    return total_loss, loss_dict


# ============================================================================
# 2. PointNet++ Components
# ============================================================================

def farthest_point_sampling(xyz: torch.Tensor, num_samples: int) -> torch.Tensor:
    """
    Farthest Point Sampling (FPS).
    
    Args:
        xyz: [batch, num_points, 3]
        num_samples: Number of points to sample
        
    Returns:
        indices: [batch, num_samples] indices of sampled points
    """
    batch_size, num_points, _ = xyz.shape
    device = xyz.device
    
    # Initialize
    centroids = torch.zeros(batch_size, num_samples, dtype=torch.long, device=device)
    distance = torch.ones(batch_size, num_points, device=device) * 1e10
    
    # Random first point
    farthest = torch.randint(0, num_points, (batch_size,), device=device)
    batch_indices = torch.arange(batch_size, device=device)
    
    for i in range(num_samples):
        centroids[:, i] = farthest
        
        # Get centroid coordinates
        centroid = xyz[batch_indices, farthest, :].unsqueeze(1)  # [batch, 1, 3]
        
        # Compute distances
        dist = torch.sum((xyz - centroid) ** 2, dim=-1)
        
        # Update minimum distances
        mask = dist < distance
        distance[mask] = dist[mask]
        
        # Select farthest point
        farthest = torch.max(distance, dim=1)[1]
    
    return centroids


def ball_query(
    xyz: torch.Tensor,
    centroids: torch.Tensor,
    radius: float,
    num_samples: int
) -> torch.Tensor:
    """
    Ball query: Find points within radius of each centroid.
    
    Args:
        xyz: [batch, num_points, 3] all points
        centroids: [batch, num_centroids, 3] query points
        radius: Search radius
        num_samples: Max number of points per ball
        
    Returns:
        indices: [batch, num_centroids, num_samples]
    """
    batch_size = xyz.size(0)
    num_points = xyz.size(1)
    num_centroids = centroids.size(1)
    device = xyz.device
    
    # Compute pairwise distances
    # [batch, num_centroids, num_points]
    xyz_expanded = xyz.unsqueeze(1).repeat(1, num_centroids, 1, 1)
    centroids_expanded = centroids.unsqueeze(2).repeat(1, 1, num_points, 1)
    distances = torch.sum((xyz_expanded - centroids_expanded) ** 2, dim=-1)
    
    # Find points within radius
    mask = distances < radius ** 2
    
    # Sample num_samples points
    indices = torch.zeros(
        batch_size, num_centroids, num_samples,
        dtype=torch.long, device=device
    )
    
    for b in range(batch_size):
        for c in range(num_centroids):
            valid_indices = torch.where(mask[b, c])[0]
            
            if len(valid_indices) == 0:
                # No points in ball, use nearest
                nearest = torch.argmin(distances[b, c])
                indices[b, c, :] = nearest
            elif len(valid_indices) < num_samples:
                # Fewer points than needed, repeat
                indices[b, c, :len(valid_indices)] = valid_indices
                indices[b, c, len(valid_indices):] = valid_indices[
                    torch.randint(0, len(valid_indices), (num_samples - len(valid_indices),))
                ]
            else:
                # More points than needed, sample
                sampled = valid_indices[
                    torch.randperm(len(valid_indices), device=device)[:num_samples]
                ]
                indices[b, c, :] = sampled
    
    return indices


class SetAbstractionLayer(nn.Module):
    """
    PointNet++ Set Abstraction Layer.
    
    Samples centroids, groups neighbors, extracts features.
    
    Args:
        num_centroids: Number of points to sample
        radius: Ball query radius
        num_samples: Max points per neighborhood
        in_channels: Input feature channels
        mlp_channels: List of MLP output channels
    """
    def __init__(
        self,
        num_centroids: int,
        radius: float,
        num_samples: int,
        in_channels: int,
        mlp_channels: List[int]
    ):
        super().__init__()
        self.num_centroids = num_centroids
        self.radius = radius
        self.num_samples = num_samples
        
        # MLP for feature extraction
        layers = []
        in_ch = in_channels + 3  # Coordinates + features
        for out_ch in mlp_channels:
            layers.append(nn.Conv2d(in_ch, out_ch, 1))
            layers.append(nn.BatchNorm2d(out_ch))
            layers.append(nn.ReLU())
            in_ch = out_ch
        self.mlp = nn.Sequential(*layers)
        
    def forward(
        self,
        xyz: torch.Tensor,
        features: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            xyz: [batch, num_points, 3]
            features: [batch, num_points, C] or None
            
        Returns:
            new_xyz: [batch, num_centroids, 3]
            new_features: [batch, num_centroids, mlp_channels[-1]]
        """
        # Farthest point sampling
        centroid_indices = farthest_point_sampling(xyz, self.num_centroids)
        
        # Get centroid coordinates
        batch_size = xyz.size(0)
        new_xyz = torch.gather(
            xyz, 1,
            centroid_indices.unsqueeze(-1).repeat(1, 1, 3)
        )
        
        # Ball query
        neighbor_indices = ball_query(xyz, new_xyz, self.radius, self.num_samples)
        
        # Group neighbors
        # [batch, num_centroids, num_samples, 3]
        grouped_xyz = torch.gather(
            xyz.unsqueeze(1).repeat(1, self.num_centroids, 1, 1),
            2,
            neighbor_indices.unsqueeze(-1).repeat(1, 1, 1, 3)
        )
        
        # Relative coordinates
        grouped_xyz = grouped_xyz - new_xyz.unsqueeze(2)
        
        # Group features
        if features is not None:
            grouped_features = torch.gather(
                features.unsqueeze(1).repeat(1, self.num_centroids, 1, 1),
                2,
                neighbor_indices.unsqueeze(-1).repeat(1, 1, 1, features.size(-1))
            )
            # Concatenate
            grouped_features = torch.cat([grouped_xyz, grouped_features], dim=-1)
        else:
            grouped_features = grouped_xyz
        
        # [batch, C, num_centroids, num_samples]
        grouped_features = grouped_features.permute(0, 3, 1, 2)
        
        # Apply MLP
        new_features = self.mlp(grouped_features)
        
        # Max pooling across neighbors
        new_features = torch.max(new_features, dim=-1)[0]  # [batch, C, num_centroids]
        new_features = new_features.transpose(1, 2)  # [batch, num_centroids, C]
        
        return new_xyz, new_features


class PointNetPlusPlus(nn.Module):
    """
    PointNet++ for classification.
    
    Args:
        num_classes: Number of output classes
        num_points: Number of input points
    """
    def __init__(self, num_classes: int, num_points: int = 1024):
        super().__init__()
        
        # Set abstraction layers
        self.sa1 = SetAbstractionLayer(
            num_centroids=512,
            radius=0.2,
            num_samples=32,
            in_channels=0,
            mlp_channels=[64, 64, 128]
        )
        
        self.sa2 = SetAbstractionLayer(
            num_centroids=128,
            radius=0.4,
            num_samples=64,
            in_channels=128,
            mlp_channels=[128, 128, 256]
        )
        
        self.sa3 = SetAbstractionLayer(
            num_centroids=1,
            radius=10.0,  # Large radius to capture all
            num_samples=128,
            in_channels=256,
            mlp_channels=[256, 512, 1024]
        )
        
        # Classification head
        self.fc1 = nn.Linear(1024, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.4)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(0.4)
        self.fc3 = nn.Linear(256, num_classes)
        
    def forward(self, xyz: torch.Tensor) -> torch.Tensor:
        """
        Args:
            xyz: [batch, num_points, 3]
            
        Returns:
            logits: [batch, num_classes]
        """
        # Ensure [batch, num_points, 3]
        if xyz.size(-1) != 3:
            xyz = xyz.transpose(1, 2)
        
        # Set abstraction layers
        xyz, features = self.sa1(xyz, None)
        xyz, features = self.sa2(xyz, features)
        xyz, features = self.sa3(xyz, features)
        
        # Global feature
        features = features.squeeze(1)  # [batch, 1024]
        
        # Classification
        x = F.relu(self.bn1(self.fc1(features)))
        x = self.dropout1(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        x = self.fc3(x)
        
        return x


# ============================================================================
# 3. DGCNN (Dynamic Graph CNN)
# ============================================================================

def knn(x: torch.Tensor, k: int) -> torch.Tensor:
    """
    Find k nearest neighbors.
    
    Args:
        x: [batch, num_points, dim]
        k: Number of neighbors
        
    Returns:
        indices: [batch, num_points, k]
    """
    # Compute pairwise distances
    inner = -2 * torch.matmul(x, x.transpose(2, 1))
    xx = torch.sum(x ** 2, dim=2, keepdim=True)
    pairwise_distance = -xx - inner - xx.transpose(2, 1)
    
    # Get k nearest
    idx = pairwise_distance.topk(k=k, dim=-1)[1]
    return idx


def get_graph_feature(
    x: torch.Tensor,
    k: int = 20,
    idx: Optional[torch.Tensor] = None
) -> torch.Tensor:
    """
    Construct edge features for DGCNN.
    
    Args:
        x: [batch, num_points, dim]
        k: Number of neighbors
        idx: Pre-computed neighbor indices
        
    Returns:
        edge_features: [batch, num_points, k, 2*dim]
    """
    batch_size, num_points, dim = x.size()
    device = x.device
    
    # Get k-NN
    if idx is None:
        idx = knn(x, k)
    
    # Gather neighbors
    idx_base = torch.arange(0, batch_size, device=device).view(-1, 1, 1) * num_points
    idx = idx + idx_base
    idx = idx.view(-1)
    
    x_flat = x.view(batch_size * num_points, -1)
    neighbors = x_flat[idx, :].view(batch_size, num_points, k, dim)
    
    # Edge features: [x_i, x_j - x_i]
    x_expanded = x.unsqueeze(2).repeat(1, 1, k, 1)
    edge_features = torch.cat([x_expanded, neighbors - x_expanded], dim=-1)
    
    return edge_features


class EdgeConv(nn.Module):
    """
    Edge Convolution layer for DGCNN.
    
    Args:
        in_channels: Input feature channels
        out_channels: Output feature channels
        k: Number of neighbors
    """
    def __init__(self, in_channels: int, out_channels: int, k: int = 20):
        super().__init__()
        self.k = k
        
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels * 2, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(negative_slope=0.2)
        )
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [batch, num_points, in_channels]
            
        Returns:
            output: [batch, num_points, out_channels]
        """
        # Get edge features
        edge_features = get_graph_feature(x, self.k)
        
        # [batch, num_points, k, 2*in_channels] -> [batch, 2*in_channels, num_points, k]
        edge_features = edge_features.permute(0, 3, 1, 2)
        
        # Convolution
        out = self.conv(edge_features)
        
        # Max aggregation
        out = out.max(dim=-1, keepdim=False)[0]
        
        # [batch, out_channels, num_points] -> [batch, num_points, out_channels]
        out = out.transpose(1, 2)
        
        return out


class DGCNN(nn.Module):
    """
    Dynamic Graph CNN for classification.
    
    Args:
        num_classes: Number of output classes
        k: Number of neighbors
    """
    def __init__(self, num_classes: int, k: int = 20):
        super().__init__()
        self.k = k
        
        # EdgeConv layers
        self.conv1 = EdgeConv(3, 64, k)
        self.conv2 = EdgeConv(64, 64, k)
        self.conv3 = EdgeConv(64, 128, k)
        self.conv4 = EdgeConv(128, 256, k)
        
        # Aggregation
        self.conv5 = nn.Sequential(
            nn.Conv1d(512, 1024, 1, bias=False),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(negative_slope=0.2)
        )
        
        # Classification head
        self.fc1 = nn.Linear(1024 * 2, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(256, num_classes)
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [batch, num_points, 3]
            
        Returns:
            logits: [batch, num_classes]
        """
        batch_size = x.size(0)
        
        # EdgeConv layers
        x1 = self.conv1(x)
        x2 = self.conv2(x1)
        x3 = self.conv3(x2)
        x4 = self.conv4(x3)
        
        # Concatenate features
        x_cat = torch.cat([x1, x2, x3, x4], dim=-1)  # [batch, num_points, 512]
        
        # [batch, num_points, 512] -> [batch, 512, num_points]
        x_cat = x_cat.transpose(1, 2)
        
        # Global feature
        x5 = self.conv5(x_cat)  # [batch, 1024, num_points]
        
        # Max and average pooling
        x_max = F.adaptive_max_pool1d(x5, 1).view(batch_size, -1)
        x_avg = F.adaptive_avg_pool1d(x5, 1).view(batch_size, -1)
        x_global = torch.cat([x_max, x_avg], dim=1)  # [batch, 2048]
        
        # Classification
        x = F.leaky_relu(self.bn1(self.fc1(x_global)), negative_slope=0.2)
        x = self.dropout1(x)
        x = F.leaky_relu(self.bn2(self.fc2(x)), negative_slope=0.2)
        x = self.dropout2(x)
        x = self.fc3(x)
        
        return x


# ============================================================================
# 4. Chamfer Distance
# ============================================================================

def chamfer_distance(
    pc1: torch.Tensor,
    pc2: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Compute Chamfer Distance between two point clouds.
    
    CD = mean(min_q ||p - q||^2) + mean(min_p ||q - p||^2)
    
    Args:
        pc1: [batch, N1, 3]
        pc2: [batch, N2, 3]
        
    Returns:
        cd: Chamfer distance
        cd1: pc1 -> pc2 term
        cd2: pc2 -> pc1 term
    """
    # Pairwise distances
    # [batch, N1, N2]
    diff = pc1.unsqueeze(2) - pc2.unsqueeze(1)
    dist = torch.sum(diff ** 2, dim=-1)
    
    # min over pc2 for each point in pc1
    dist1 = torch.min(dist, dim=2)[0]  # [batch, N1]
    cd1 = torch.mean(dist1, dim=1)  # [batch]
    
    # min over pc1 for each point in pc2
    dist2 = torch.min(dist, dim=1)[0]  # [batch, N2]
    cd2 = torch.mean(dist2, dim=1)  # [batch]
    
    cd = cd1 + cd2
    
    return cd, cd1, cd2


# ============================================================================
# 5. Demo and Visualization
# ============================================================================

def generate_synthetic_point_cloud(
    num_points: int = 1024,
    shape: str = 'sphere'
) -> torch.Tensor:
    """
    Generate synthetic point cloud.
    
    Args:
        num_points: Number of points
        shape: 'sphere', 'cube', or 'cylinder'
        
    Returns:
        points: [num_points, 3]
    """
    if shape == 'sphere':
        # Sample on unit sphere
        theta = np.random.uniform(0, 2 * np.pi, num_points)
        phi = np.random.uniform(0, np.pi, num_points)
        x = np.sin(phi) * np.cos(theta)
        y = np.sin(phi) * np.sin(theta)
        z = np.cos(phi)
        points = np.stack([x, y, z], axis=1)
    elif shape == 'cube':
        # Sample in unit cube
        points = np.random.uniform(-1, 1, (num_points, 3))
    elif shape == 'cylinder':
        # Sample on cylinder
        theta = np.random.uniform(0, 2 * np.pi, num_points)
        z = np.random.uniform(-1, 1, num_points)
        r = 1.0
        x = r * np.cos(theta)
        y = r * np.sin(theta)
        points = np.stack([x, y, z], axis=1)
    else:
        raise ValueError(f"Unknown shape: {shape}")
    
    return torch.tensor(points, dtype=torch.float32)


def demo_pointnet():
    """Demonstrate PointNet architecture."""
    print("=" * 80)
    print("PointNet Demo")
    print("=" * 80)
    
    # Create model
    model = PointNet(num_classes=10, task='classification', use_transform=True)
    
    # Generate batch of point clouds
    batch_size = 4
    num_points = 1024
    point_clouds = torch.randn(batch_size, num_points, 3)
    
    # Forward pass
    print(f"\nInput shape: {point_clouds.shape}")
    outputs, trans_feat = model(point_clouds)
    print(f"Output shape: {outputs.shape}")
    print(f"Feature transform shape: {trans_feat.shape if trans_feat is not None else None}")
    
    # Compute loss
    labels = torch.randint(0, 10, (batch_size,))
    loss, loss_dict = pointnet_loss(outputs, labels, trans_feat)
    
    print(f"\nLoss breakdown:")
    print(f"  Total: {loss_dict['total']:.4f}")
    print(f"  Task: {loss_dict['task']:.4f}")
    print(f"  Regularization: {loss_dict['reg']:.6f}")
    
    # Count parameters
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\nNumber of parameters: {num_params:,}")


def demo_pointnet_plusplus():
    """Demonstrate PointNet++."""
    print("\n" + "=" * 80)
    print("PointNet++ Demo")
    print("=" * 80)
    
    # Create model
    model = PointNetPlusPlus(num_classes=10, num_points=1024)
    
    # Generate batch
    batch_size = 2
    num_points = 1024
    point_clouds = torch.randn(batch_size, num_points, 3)
    
    # Forward pass
    print(f"\nInput shape: {point_clouds.shape}")
    outputs = model(point_clouds)
    print(f"Output shape: {outputs.shape}")
    
    # Count parameters
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Number of parameters: {num_params:,}")


def demo_dgcnn():
    """Demonstrate DGCNN."""
    print("\n" + "=" * 80)
    print("DGCNN Demo")
    print("=" * 80)
    
    # Create model
    model = DGCNN(num_classes=10, k=20)
    
    # Generate batch
    batch_size = 2
    num_points = 1024
    point_clouds = torch.randn(batch_size, num_points, 3)
    
    # Forward pass
    print(f"\nInput shape: {point_clouds.shape}")
    outputs = model(point_clouds)
    print(f"Output shape: {outputs.shape}")
    
    # Test dynamic graph
    print(f"\nDynamic k-NN graph construction:")
    features = torch.randn(batch_size, num_points, 64)
    idx = knn(features, k=20)
    print(f"  Neighbor indices shape: {idx.shape}")
    
    # Count parameters
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Number of parameters: {num_params:,}")


def demo_chamfer_distance():
    """Demonstrate Chamfer Distance."""
    print("\n" + "=" * 80)
    print("Chamfer Distance Demo")
    print("=" * 80)
    
    # Generate two point clouds
    pc1 = generate_synthetic_point_cloud(1024, 'sphere').unsqueeze(0)
    pc2 = generate_synthetic_point_cloud(1024, 'sphere').unsqueeze(0)
    pc2 = pc2 * 0.9 + 0.1  # Slightly different
    
    # Compute Chamfer distance
    cd, cd1, cd2 = chamfer_distance(pc1, pc2)
    
    print(f"\nPoint cloud 1 shape: {pc1.shape}")
    print(f"Point cloud 2 shape: {pc2.shape}")
    print(f"\nChamfer Distance: {cd.item():.6f}")
    print(f"  PC1 -> PC2: {cd1.item():.6f}")
    print(f"  PC2 -> PC1: {cd2.item():.6f}")
    
    # Test with identical point clouds
    cd_self, _, _ = chamfer_distance(pc1, pc1)
    print(f"\nSelf Chamfer Distance (should be ~0): {cd_self.item():.8f}")


def print_performance_comparison():
    """Print performance comparison table."""
    print("\n" + "=" * 80)
    print("POINT CLOUD NETWORKS COMPARISON")
    print("=" * 80)
    
    comparison = """
    ModelNet40 Classification Performance:
    ┌──────────────────┬──────────────┬──────────────┬──────────────┐
    │ Method           │ Accuracy (%) │ Parameters   │ Inference    │
    │                  │              │ (M)          │ Time (ms)    │
    ├──────────────────┼──────────────┼──────────────┼──────────────┤
    │ PointNet         │ 89.2         │ 3.5          │ 5.2          │
    │ PointNet++       │ 91.9         │ 1.5          │ 18.7         │
    │ DGCNN            │ 92.9         │ 1.8          │ 12.3         │
    │ PointConv        │ 92.5         │ 2.1          │ 25.4         │
    │ Point Trans.     │ 93.7         │ 7.8          │ 32.1         │
    │ Point-BERT       │ 94.6         │ 22.0         │ 45.3         │
    └──────────────────┴──────────────┴──────────────┴──────────────┘
    
    ShapeNet Part Segmentation (mIoU):
    ┌──────────────────┬──────────────┬──────────────┐
    │ Method           │ Class mIoU   │ Instance mIoU│
    ├──────────────────┼──────────────┼──────────────┤
    │ PointNet         │ 80.4         │ 83.7         │
    │ PointNet++       │ 81.9         │ 85.1         │
    │ DGCNN            │ 82.3         │ 85.2         │
    │ Point Trans.     │ 83.7         │ 86.6         │
    └──────────────────┴──────────────┴──────────────┘
    
    KITTI 3D Object Detection (Car, AP@0.7):
    ┌──────────────────┬──────────────┬──────────────┬──────────────┐
    │ Method           │ Easy         │ Moderate     │ Hard         │
    ├──────────────────┼──────────────┼──────────────┼──────────────┤
    │ VoxelNet         │ 77.5         │ 65.1         │ 57.7         │
    │ PointPillars     │ 82.6         │ 74.3         │ 68.0         │
    │ PointRCNN        │ 88.9         │ 78.6         │ 77.4         │
    │ PV-RCNN          │ 90.3         │ 81.4         │ 76.8         │
    └──────────────────┴──────────────┴──────────────┴──────────────┘
    
    Architectural Characteristics:
    ┌──────────────────┬──────────────────────────────────────────────┐
    │ Method           │ Key Features                                 │
    ├──────────────────┼──────────────────────────────────────────────┤
    │ PointNet         │ ✓ Permutation invariant (max pooling)       │
    │                  │ ✓ T-Net (input/feature transforms)          │
    │                  │ ✗ No local context                          │
    ├──────────────────┼──────────────────────────────────────────────┤
    │ PointNet++       │ ✓ Hierarchical feature learning             │
    │                  │ ✓ FPS + Ball query (local context)          │
    │                  │ ✓ Multi-scale grouping                      │
    ├──────────────────┼──────────────────────────────────────────────┤
    │ DGCNN            │ ✓ Dynamic graph (k-NN in feature space)     │
    │                  │ ✓ EdgeConv (edge features)                  │
    │                  │ ✓ Feature concatenation                     │
    ├──────────────────┼──────────────────────────────────────────────┤
    │ Point Trans.     │ ✓ Vector self-attention                     │
    │                  │ ✓ Position encoding (δ = MLP(p_i - p_j))    │
    │                  │ ✓ Long-range dependencies                   │
    └──────────────────┴──────────────────────────────────────────────┘
    
    Decision Guide:
    ┌──────────────────────────────────┬─────────────────────────────┐
    │ Scenario                         │ Recommended Method          │
    ├──────────────────────────────────┼─────────────────────────────┤
    │ Fast inference, simple shapes    │ PointNet                    │
    │ Complex geometry, local patterns │ PointNet++                  │
    │ Semantic relationships           │ DGCNN                       │
    │ Long-range dependencies          │ Point Transformer           │
    │ 3D detection (autonomous)        │ PointPillars, PV-RCNN       │
    │ Large-scale scenes               │ Voxel-based (VoxelNet)      │
    │ Rotation invariance required     │ VN-PointNet, PointConv      │
    │ Pre-training available           │ Point-BERT, Point-MAE       │
    └──────────────────────────────────┴─────────────────────────────┘
    """
    
    print(comparison)
    
    print("\nKey Insights:")
    print("1. PointNet: Simple, fast, but lacks local structure")
    print("2. PointNet++: Hierarchical abstraction, multi-scale features")
    print("3. DGCNN: Dynamic graphs adapt to feature space, not just 3D coords")
    print("4. Point Transformer: Attention mechanism for long-range context")
    print("5. Detection: Voxel/pillar-based faster, point-based more accurate")
    print("6. FPS + Ball query: Core building block for local feature extraction")
    print("7. Chamfer Distance: Standard metric for point cloud generation/completion")
    print("8. Rotation invariance: Critical for real-world robustness")
    print("9. Scalability: Hierarchical downsampling essential for large scenes")
    print("10. Trade-off: Accuracy vs. speed (Point Transformer vs. PointPillars)")


# ============================================================================
# Run Demonstrations
# ============================================================================

if __name__ == "__main__":
    demo_pointnet()
    demo_pointnet_plusplus()
    demo_dgcnn()
    demo_chamfer_distance()
    print_performance_comparison()
    
    print("\n" + "=" * 80)
    print("Point Cloud Networks Implementations Complete!")
    print("=" * 80)