import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Select GPU when available; every model/tensor below is moved to this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
1. Point Cloud Basics
Representation
Point cloud: \(\{p_i\}_{i=1}^N\) where \(p_i \in \mathbb{R}^3\)
Key Properties:ΒΆ
Permutation invariance: \(f(\{p_1, p_2, \ldots\}) = f(\{p_{\pi(1)}, p_{\pi(2)}, \ldots\})\)
Transformation invariance: Robust to rotations/translations
Unordered: No canonical ordering
Reference Materials:
cv_3d_foundation.pdf - Cv 3D Foundation
cv_3d_research.pdf - Cv 3D Research
def generate_sphere_points(n_points=1000, radius=1.0):
    """Generate points uniformly distributed on a sphere surface.

    Fix: sampling the polar angle phi uniformly on [0, pi] clusters points
    at the poles (area element is sin(phi) dphi dtheta). Sampling cos(phi)
    uniformly on [-1, 1] yields uniform density over the surface.

    Args:
        n_points: Number of points to sample.
        radius: Sphere radius.

    Returns:
        np.ndarray of shape (n_points, 3); every row has norm == radius.
    """
    theta = np.random.uniform(0, 2 * np.pi, n_points)
    # Uniform in cos(phi) <=> uniform area density on the sphere.
    cos_phi = np.random.uniform(-1.0, 1.0, n_points)
    sin_phi = np.sqrt(1.0 - cos_phi ** 2)
    x = radius * sin_phi * np.cos(theta)
    y = radius * sin_phi * np.sin(theta)
    z = radius * cos_phi
    return np.stack([x, y, z], axis=1)
def generate_cube_points(n_points=1000, size=1.0):
    """Sample points uniformly inside an axis-aligned cube centered at the origin.

    Args:
        n_points: Number of points to sample.
        size: Edge length of the cube.

    Returns:
        np.ndarray of shape (n_points, 3) with coordinates in [-size/2, size/2].
    """
    half = size / 2
    return np.random.uniform(-half, half, size=(n_points, 3))
# Generate samples
sphere_pc = generate_sphere_points(500)
cube_pc = generate_cube_points(500)
# Visualize: side-by-side 3D scatter plots of the two shape classes.
fig = plt.figure(figsize=(14, 6))
ax1 = fig.add_subplot(121, projection='3d')
ax1.scatter(sphere_pc[:, 0], sphere_pc[:, 1], sphere_pc[:, 2], c='blue', s=1, alpha=0.6)
ax1.set_title('Sphere Point Cloud', fontsize=11)
ax2 = fig.add_subplot(122, projection='3d')
ax2.scatter(cube_pc[:, 0], cube_pc[:, 1], cube_pc[:, 2], c='red', s=1, alpha=0.6)
ax2.set_title('Cube Point Cloud', fontsize=11)
plt.tight_layout()
plt.show()
2. T-Net (Transformation Network)
Learns spatial transformer for input alignment:
class TNet(nn.Module):
    """Spatial transformer: predicts a k x k matrix to align its input.

    The final FC layer predicts an offset that is added to the identity,
    so an untrained network starts near the identity transform.
    """

    def __init__(self, k=3):
        super().__init__()
        self.k = k
        # Shared per-point MLP implemented as 1x1 convolutions.
        self.conv1 = nn.Conv1d(k, 64, 1)
        self.conv2 = nn.Conv1d(64, 128, 1)
        self.conv3 = nn.Conv1d(128, 1024, 1)
        # Regression head producing the k*k matrix entries.
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, k * k)
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        self.bn4 = nn.BatchNorm1d(512)
        self.bn5 = nn.BatchNorm1d(256)

    def forward(self, x):
        """Map (batch, k, n_points) -> (batch, k, k) transform matrices."""
        n_batch = x.size(0)
        # Per-point feature lifting.
        for conv, bn in ((self.conv1, self.bn1),
                         (self.conv2, self.bn2),
                         (self.conv3, self.bn3)):
            x = F.relu(bn(conv(x)))
        # Symmetric max-pool over the point dimension.
        x, _ = x.max(dim=2)
        x = F.relu(self.bn4(self.fc1(x)))
        x = F.relu(self.bn5(self.fc2(x)))
        offset = self.fc3(x)
        # Bias the prediction toward the identity matrix.
        eye = torch.eye(self.k, device=offset.device).reshape(1, self.k * self.k)
        return (offset + eye).view(n_batch, self.k, self.k)
# Test: smoke-test the T-Net with a batch of 4 clouds, 3 coords, 100 points.
tnet = TNet(k=3).to(device)
test_input = torch.randn(4, 3, 100).to(device)
transform = tnet(test_input)
print(f"Transform shape: {transform.shape}")  # expected: (4, 3, 3)
3. PointNet ArchitectureΒΆ
Symmetric Function:ΒΆ
Max pooling ensures permutation invariance.
class PointNet(nn.Module):
    """PointNet classifier: shared per-point MLPs + symmetric max-pooling.

    Forward expects ``x`` of shape (batch, 3, n_points) and returns a tuple
    ``(logits, trans_feat)`` where ``logits`` is (batch, num_classes) and
    ``trans_feat`` is the (batch, 64, 64) feature transform, returned so the
    training loss can apply an orthogonality regularizer.

    Fix vs. original: removed unused locals ``batch_size``/``n_points``
    from forward.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Input transform: learned 3x3 alignment of raw xyz coordinates.
        self.input_transform = TNet(k=3)
        # Feature transform: learned 64x64 alignment of point features.
        self.feature_transform = TNet(k=64)
        # MLP 1: per-point lifting 3 -> 64 -> 64 (1x1 convolutions).
        self.conv1 = nn.Conv1d(3, 64, 1)
        self.conv2 = nn.Conv1d(64, 64, 1)
        # MLP 2: 64 -> 64 -> 128 -> 1024 before global pooling.
        self.conv3 = nn.Conv1d(64, 64, 1)
        self.conv4 = nn.Conv1d(64, 128, 1)
        self.conv5 = nn.Conv1d(128, 1024, 1)
        # Batch norms (one per conv above).
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(64)
        self.bn3 = nn.BatchNorm1d(64)
        self.bn4 = nn.BatchNorm1d(128)
        self.bn5 = nn.BatchNorm1d(1024)
        # Classification head on the 1024-d global feature.
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)
        self.bn6 = nn.BatchNorm1d(512)
        self.bn7 = nn.BatchNorm1d(256)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Classify point clouds.

        Args:
            x: (batch, 3, n_points) tensor of xyz coordinates.

        Returns:
            (logits, trans_feat): class logits and the 64x64 feature
            transform used for regularization.
        """
        # Align input coordinates with the learned 3x3 transform.
        trans = self.input_transform(x)
        x = torch.bmm(trans, x)
        # MLP 1
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        # Align intermediate features; matrix is returned to the caller.
        trans_feat = self.feature_transform(x)
        x = torch.bmm(trans_feat, x)
        # MLP 2
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        x = F.relu(self.bn5(self.conv5(x)))
        # Symmetric aggregation -> permutation-invariant global feature.
        x = torch.max(x, 2)[0]
        # Classifier
        x = F.relu(self.bn6(self.fc1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn7(self.fc2(x)))
        x = self.dropout(x)
        x = self.fc3(x)
        return x, trans_feat
# Instantiate a binary classifier (sphere vs. cube) on the selected device.
model = PointNet(num_classes=2).to(device)
print("PointNet created")
Create Synthetic DatasetΒΆ
To demonstrate PointNet without requiring large 3D datasets, we create synthetic point clouds by sampling points from simple geometric shapes (spheres, cubes, cylinders, cones). Each shape class is represented by a set of \(N\) 3D coordinates, with some noise added for realism. This controlled setup lets us verify that the network can classify shapes based purely on their geometric structure, without relying on texture or color. The principles learned here transfer directly to real point cloud data from LiDAR sensors, depth cameras, and 3D scanners.
class PointCloudDataset(torch.utils.data.Dataset):
    """Synthetic two-class dataset: spheres (label 0) and cubes (label 1).

    All clouds are generated eagerly in __init__; __getitem__ only converts
    to tensors in the (3, N) channel-first layout expected by Conv1d.
    """

    def __init__(self, n_samples=1000, n_points=1024):
        self.n_samples = n_samples
        self.n_points = n_points
        self.data = []
        self.labels = []
        for idx in range(n_samples):
            # Alternate shape classes so the dataset is perfectly balanced.
            if idx % 2 == 0:
                self.data.append(generate_sphere_points(n_points))
                self.labels.append(0)
            else:
                self.data.append(generate_cube_points(n_points))
                self.labels.append(1)

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        # Transpose to (3, N) for channel-first 1D convolutions.
        points = torch.from_numpy(self.data[idx]).float().T
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return points, label
# Create datasets (512 points per cloud keeps training fast).
train_dataset = PointCloudDataset(n_samples=800, n_points=512)
test_dataset = PointCloudDataset(n_samples=200, n_points=512)
# Shuffle only the training split; evaluation order doesn't matter.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)
print(f"Train: {len(train_dataset)}, Test: {len(test_dataset)}")
Train PointNet
Training PointNet follows the standard supervised learning loop: forward pass through the shared MLPs, symmetric max-pooling to obtain a global feature vector, and classification via fully connected layers with cross-entropy loss. The max-pooling aggregation is what makes PointNet permutation invariant — shuffling the order of input points produces the same global feature. An optional spatial transformer network (T-Net) learns to canonicalize the input point cloud's orientation, making the model robust to rigid transformations. Monitoring both training loss and per-class accuracy helps detect class imbalance issues common in 3D shape datasets.
def regularization_loss(trans_feat):
    """Orthogonality penalty for a predicted feature transform.

    Encourages T @ T^T ~= I so the learned transform is approximately
    rigid (PointNet's L_reg = ||I - T T^T||_F).

    Generalized vs. original: the matrix size is inferred from the input
    instead of being hard-coded to 64, so it works for any k x k transform
    (e.g. the 3x3 input transform).

    Args:
        trans_feat: (batch, k, k) batch of transform matrices.

    Returns:
        Scalar tensor: mean Frobenius norm of (T T^T - I) over the batch.
    """
    batch_size = trans_feat.size(0)
    k = trans_feat.size(1)  # infer dimension instead of hard-coding 64
    identity = torch.eye(k, device=trans_feat.device).unsqueeze(0).repeat(batch_size, 1, 1)
    diff = trans_feat @ trans_feat.transpose(1, 2) - identity
    return torch.norm(diff, dim=(1, 2)).mean()
def train_pointnet(model, train_loader, n_epochs=20):
    """Train PointNet with cross-entropy plus orthogonality regularization.

    Relies on the module-level ``device`` and ``regularization_loss``.

    Args:
        model: PointNet returning (logits, trans_feat).
        train_loader: DataLoader yielding (points, labels) batches.
        n_epochs: Number of passes over the training set.

    Returns:
        List of per-iteration loss values.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    history = []
    for epoch in range(n_epochs):
        model.train()
        epoch_loss = 0
        for points, targets in train_loader:
            points, targets = points.to(device), targets.to(device)
            logits, trans_feat = model(points)
            # Classification term + small weight on the orthogonality term.
            loss = F.cross_entropy(logits, targets) + 0.001 * regularization_loss(trans_feat)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            history.append(loss.item())
        # Report the average epoch loss every 5 epochs.
        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1}/{n_epochs}, Loss: {epoch_loss/len(train_loader):.4f}")
    return history
# Train and plot the per-iteration loss curve.
losses = train_pointnet(model, train_loader, n_epochs=20)
plt.figure(figsize=(10, 5))
plt.plot(losses, alpha=0.3)  # raw per-iteration loss (noisy)
# 20-iteration moving average for a smoother trend line.
plt.plot(np.convolve(losses, np.ones(20)/20, mode='valid'), linewidth=2)
plt.xlabel('Iteration', fontsize=11)
plt.ylabel('Loss', fontsize=11)
plt.title('PointNet Training', fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()
Evaluate
Evaluation on held-out test shapes measures overall classification accuracy and per-class performance. A confusion matrix reveals which shape pairs are most commonly confused — for example, cubes and rectangular prisms might be hard to distinguish from sparse point samples. Comparing PointNet's accuracy to simpler baselines (e.g., hand-crafted 3D features plus SVM) demonstrates the power of learning point-level features directly from raw coordinates. In real applications, this evaluation extends to larger benchmarks like ModelNet40 (40 object categories) and ShapeNet (part segmentation).
# Evaluate classification accuracy on the held-out test set.
model.eval()
correct = 0
total = 0
with torch.no_grad():  # inference only; no gradients needed
    for pc, labels in test_loader:
        pc, labels = pc.to(device), labels.to(device)
        output, _ = model(pc)
        pred = output.argmax(dim=1)  # class with the highest logit
        correct += (pred == labels).sum().item()
        total += labels.size(0)
accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")
Test Permutation InvarianceΒΆ
A critical property of PointNet is permutation invariance: the output should be identical regardless of the order in which points are presented. We verify this empirically by passing the same point cloud through the network multiple times with different random permutations of the point order and checking that the output logits are exactly equal. This test validates that the max-pooling aggregation correctly eliminates ordering dependence. Permutation invariance is essential because point clouds from sensors arrive in arbitrary order, and the model must not depend on an artificial indexing scheme.
# Empirical permutation-invariance check: the same cloud, shuffled, should
# produce (numerically) identical logits thanks to max-pool aggregation.
# Get sample
pc_sample, _ = test_dataset[0]
pc_sample = pc_sample.unsqueeze(0).to(device)  # add batch dim: (1, 3, N)
# Original prediction
model.eval()
with torch.no_grad():
    output1, _ = model(pc_sample)
    pred1 = output1.argmax(dim=1).item()
# Shuffle points along the point axis (dim 2)
perm = torch.randperm(pc_sample.size(2))
pc_shuffled = pc_sample[:, :, perm]
with torch.no_grad():
    output2, _ = model(pc_shuffled)
    pred2 = output2.argmax(dim=1).item()
print(f"Original prediction: {pred1}")
print(f"Shuffled prediction: {pred2}")
print(f"Predictions match: {pred1 == pred2}")
print(f"Output difference: {torch.abs(output1 - output2).max().item():.6f}")
SummaryΒΆ
PointNet Key Ideas:ΒΆ
Permutation invariance via max pooling
Spatial transformers for alignment
Global + local features
Direct point processing
Architecture:ΒΆ
Input transform (T-Net)
Feature transform (T-Net)
Symmetric aggregation (max pool)
MLP classifier
Applications:ΒΆ
3D object classification
Part segmentation
Scene understanding
Autonomous driving
Extensions:ΒΆ
PointNet++: Hierarchical learning
DGCNN: Dynamic graph CNN
PointConv: Continuous convolution
Point Transformers: Attention mechanisms
Advanced Point Cloud Networks TheoryΒΆ
1. Introduction to Point Cloud ProcessingΒΆ
Point clouds are sets of 3D points representing object surfaces or scenes, commonly obtained from LiDAR, RGB-D cameras, or 3D reconstruction.
1.1 Point Cloud CharacteristicsΒΆ
Unique properties:
Unordered: No canonical ordering of points
Irregular: Non-uniform density, varying sampling rates
Permutation invariant: Should produce same result for any point ordering
Sparse: 3D space is largely empty
Transformation equivariant: Rotations/translations should preserve structure
Representation: P = {p_i ∈ ℝ³ | i = 1, …, N}
May include features: p_i ∈ ℝ^(3+F) (position + color, normal, etc.)
1.2 ChallengesΒΆ
Permutation invariance: Traditional CNNs assume grid structure
Rotation sensitivity: Points in different orientations should be recognized as same
Varying density: Distant points are sparser
Scalability: Millions of points in real scenes
Local structure: Capturing geometric patterns
Applications:
3D object classification/detection
Autonomous driving (LiDAR perception)
Robotics (grasping, navigation)
3D reconstruction and completion
Semantic segmentation of scenes
2. PointNet: Pioneer ArchitectureΒΆ
PointNet [Qi et al., 2017a]: First deep learning architecture for raw point clouds.
2.1 Core IdeaΒΆ
Key insight: Use symmetric function to achieve permutation invariance.
Architecture:
Point Cloud (N × 3) → MLP → Features (N × d) → MaxPool → Global Feature (d) → MLP → Output
Symmetric function: f({x₁, …, x_N}) ≈ g(h(x₁), …, h(x_N))
h: Per-point feature extractor (shared MLP)
g: Symmetric aggregator (max pooling)
2.2 Mathematical FrameworkΒΆ
Per-point feature extraction:
f_i = MLP(p_i) β β^d
Global feature (permutation invariant):
f_global = max_{i=1,...,N} f_i
Classification:
y = MLP_cls(f_global)
Segmentation (per-point labels):
y_i = MLP_seg(concat(f_i, f_global))
2.3 Input and Feature TransformationsΒΆ
Problem: Rotation sensitivity
Solution: T-Net (Spatial Transformer Network)
Input transform (3 Γ 3):
T_input = TNet_3Γ3({p_i})
p_i' = T_input Β· p_i
Feature transform (64 Γ 64):
T_feature = TNet_64Γ64({f_i})
f_i' = T_feature Β· f_i
Regularization (orthogonality):
L_reg = ||I - T^T T||Β²_F
Ensures transformation is approximately rigid.
2.4 Theoretical GuaranteesΒΆ
Universal approximation: PointNet can approximate any continuous set function.
Proof sketch:
h (MLP) can approximate any continuous function on βΒ³
max operation selects representative features
g (MLP) combines features to approximate target function
Critical points: Max pooling selects βskeletonβ of point cloud.
2.5 LimitationsΒΆ
Local structure: Doesnβt explicitly capture local geometry
Context: Each point processed independently (before max pool)
Scalability: All points in one forward pass
3. PointNet++: Hierarchical Feature LearningΒΆ
PointNet++ [Qi et al., 2017b]: Extends PointNet with hierarchical structure and local context.
3.1 Architecture OverviewΒΆ
Hierarchical abstraction:
Points (Nβ Γ 3) β SA Layer 1 β Points (Nβ Γ dβ) β SA Layer 2 β Points (Nβ Γ dβ) β ...
Set Abstraction (SA) Layer:
Sampling: Select subset of points (centroids)
Grouping: Group neighbors around each centroid
PointNet: Extract local features
3.2 Set Abstraction LayerΒΆ
Components:
1. Farthest Point Sampling (FPS):
Select Nβ centroids from N points
Maximize minimum distance between centroids
Algorithm:
C = {cβ} # Start with random point
for i = 2 to N':
c_i = argmax_{p β P \ C} min_{c β C} ||p - c||
C = C βͺ {c_i}
2. Ball Query:
For each centroid c_i, find all points within radius r
More robust than k-NN to varying density
N(c_i) = {p_j β P | ||p_j - c_i|| < r}
3. PointNet on local region:
f_i = PointNet({p_j - c_i | p_j β N(c_i)})
Note: Use relative coordinates (p_j - c_i) for translation invariance.
3.3 Multi-Scale Grouping (MSG)ΒΆ
Problem: Varying point density
Solution: Multiple scales with different radii
MSG layer:
f_i = concat(PointNet_rβ(N_rβ(c_i)), PointNet_rβ(N_rβ(c_i)), ...)
Example: r = [0.1, 0.2, 0.4] for coarse-to-fine features
3.4 Multi-Resolution Grouping (MRG)ΒΆ
Alternative: Combine features from different abstraction levels
Idea:
Low-level: Dense sampling, fine details
High-level: Sparse sampling, global context
Concatenate features from multiple resolutions.
3.5 Point Feature PropagationΒΆ
For segmentation: Need per-point labels
Problem: SA layers downsample β fewer points
Solution: Interpolate features back to original points
Interpolation (inverse distance weighted):
f(p) = Ξ£_{i=1}^k w_i f_i / Ξ£_{i=1}^k w_i
w_i = 1 / ||p - p_i||Β²
Where {p_i} are k nearest neighbors in coarser layer.
Feature propagation:
f_propagated = MLP(concat(f_interpolated, f_skip))
f_skip: Skip connection from encoder.
4. Dynamic Graph CNN (DGCNN)ΒΆ
DGCNN [Wang et al., 2019]: Constructs graph dynamically in feature space.
4.1 EdgeConv OperationΒΆ
Key idea: Compute edge features between points and neighbors.
Edge feature:
e_ij = h_ΞΈ(x_i, x_j - x_i)
Where:
x_i: Center point feature
x_j - x_i: Relative neighbor feature
h_ΞΈ: Shared MLP (learnable)
Aggregation:
x_i' = max_{j β N(i)} e_ij
Or other aggregators: sum, mean, attention.
4.2 Dynamic Graph ConstructionΒΆ
Key innovation: Update k-NN graph after each layer.
Procedure:
Compute k-NN in feature space (not 3D coordinates)
Apply EdgeConv
Repeat with updated features
Benefits:
Adapts to learned feature geometry
Captures semantic relationships (not just spatial)
Graph at layer l:
G^(l) = kNN(F^(l), k)
Where F^(l) are features at layer l.
4.3 ArchitectureΒΆ
Stacked EdgeConv layers:
Input (N Γ 3) β EdgeConvβ β EdgeConvβ β ... β EdgeConvβ β MaxPool β MLP β Output
Feature concatenation: Combine features from all layers.
f_global = max(concat(fβ, fβ, ..., fβ))
4.4 Comparison to PointNet++ΒΆ
PointNet++:
Fixed neighborhoods (ball query on 3D coords)
Hierarchical downsampling
DGCNN:
Dynamic neighborhoods (k-NN in feature space)
Full resolution throughout (optional downsampling)
5. Point TransformersΒΆ
5.1 Point Transformer LayerΒΆ
Applying attention to point clouds [Zhao et al., 2021].
Vector attention:
y_i = Ξ£_{j β N(i)} Ο(Ξ³(Ο(x_i) - Ο(x_j) + Ξ΄)) β (Ξ±(x_j) + Ξ΄)
Where:
Ο, Ο, Ξ±, Ξ³: Linear transformations (like Q, K, V in standard attention)
Ξ΄ = Ο_pos(p_i - p_j): Position encoding
Ο: Softmax normalization
β: Element-wise product
Position encoding:
Ξ΄ = MLP(p_i - p_j)
Captures relative position information.
Attention weights:
a_ij = softmax_j(Ξ³(Ο(x_i) - Ο(x_j) + Ξ΄))
Output:
y_i = Ξ£_j a_ij β (Ξ±(x_j) + Ξ΄)
5.2 Transition LayersΒΆ
Downsampling: Farthest point sampling + k-NN grouping
Upsampling: Trilinear interpolation + skip connections
Similar to PointNet++ but with transformer blocks.
5.3 AdvantagesΒΆ
Long-range dependencies: Attention captures global context
Adaptive: Learns which points to attend to
Position-aware: Ξ΄ encodes geometric relationships
6. Rotation-Invariant NetworksΒΆ
6.1 Problem: SO(3) EquivarianceΒΆ
Challenge: Point clouds can be arbitrarily rotated.
Desirable property: f(R Β· P) = R Β· f(P) or f(R Β· P) = f(P)
Approaches:
Data augmentation: Train on rotated copies (not truly invariant)
Canonical orientation: Align to principal axes (unstable)
Equivariant features: Design rotation-equivariant operations
6.2 PointConvΒΆ
PointConv [Wu et al., 2019]: Continuous convolution on point clouds.
Convolution:
f_out(p) = Ξ£_{p_i β N(p)} W(p - p_i) f_in(p_i)
Learned kernel:
W(Ξp) = MLP(Ξp)
Density-aware: Weight by local density.
6.3 VN-PointNet/DGCNNΒΆ
Vector Neuron networks [Deng et al., 2021]: Equivariant operations.
Vector neuron: 3D vector instead of scalar
Operations:
VN-Linear: V_out = W V_in (matrix-vector product)
VN-ReLU: ReLU(||V||) Β· V/||V|| (direction-preserving)
VN-MaxPool: argmax_i ||V_i||
Invariant features: Extract rotation-invariant scalars
||V||: Magnitude
V^T Vβ: Inner product
||(V Γ Vβ)||: Cross product magnitude
7. Point Cloud SegmentationΒΆ
7.1 Semantic SegmentationΒΆ
Task: Assign class label to each point.
Architecture: Encoder-decoder with skip connections.
PointNet++ segmentation:
Input β SA layers β FP layers β Per-point MLP β Scores
Loss: Cross-entropy per point
L = -Ξ£_i Ξ£_c y_ic log Ε·_ic
7.2 Instance SegmentationΒΆ
Task: Group points into object instances + classify.
3D-BoNet [Yang et al., 2019]:
Bounding box prediction: Predict 3D boxes
Point-box association: Assign points to boxes
Classification: Classify each instance
SGPN [Wang et al., 2018]: Similarity Group Proposal Network
Learn pairwise point similarity
Group points via clustering
7.3 Part SegmentationΒΆ
Task: Segment object into parts (e.g., chair legs, back, seat).
ShapeNet Part dataset: 16 categories, 50 parts
Approach: Category-conditional segmentation
Input: Point cloud + category label
Output: Part labels per point
8. Point Cloud Object DetectionΒΆ
8.1 3D Bounding BoxesΒΆ
Representation: (x, y, z, l, w, h, ΞΈ)
Center: (x, y, z)
Size: length, width, height
Orientation: yaw angle ΞΈ
IoU: 3D Intersection over Union for evaluation
8.2 VoxelNetΒΆ
VoxelNet [Zhou & Tuzel, 2018]: Voxelize point cloud.
Pipeline:
Voxel partition: Divide 3D space into voxels
VFE (Voxel Feature Encoding): PointNet per voxel
3D CNN: Process voxel grid
RPN (Region Proposal Network): Predict 3D boxes
VFE layer:
f_voxel = max_{p β voxel} MLP(concat(p, f_voxel_mean))
8.3 PointPillarsΒΆ
PointPillars [Lang et al., 2019]: Organize points into vertical pillars.
Advantages:
Faster than VoxelNet (2D conv instead of 3D)
Efficient for LiDAR (mostly empty above ground)
Pipeline:
Pillar encoding: PointNet on each pillar
Pseudo-image: Scatter features to BEV (birdβs-eye view)
2D CNN: Backbone network
Detection head: Predict boxes
8.4 PointRCNNΒΆ
PointRCNN [Shi et al., 2019]: Two-stage detector on raw points.
Stage 1: 3D proposal generation
Segment foreground points
Predict 3D box per point
Refine via clustering
Stage 2: Box refinement
Canonical coordinates: Transform points to box frame
PointNet++: Refine box parameters
9. Point Cloud GenerationΒΆ
9.1 Point Cloud AutoencodersΒΆ
Architecture: Encoder-decoder for point clouds.
Encoder: PointNet β latent vector z
Decoder: FC layers β N Γ 3 points
Loss: Chamfer distance
L_CD = Ξ£_{p β Pβ} min_{q β Pβ} ||p - q||Β² + Ξ£_{q β Pβ} min_{p β Pβ} ||p - q||Β²
Earth Moverβs Distance (more accurate, slower):
L_EMD = min_{Ο:PββPβ} Ξ£_{p β Pβ} ||p - Ο(p)||Β²
Where Ο is bijection.
9.2 Point Cloud GANsΒΆ
PC-GAN [Li et al., 2018]: Generate point clouds.
Generator: z ~ N(0,I) β MLP β N Γ 3 points
Discriminator: PointNet β real/fake score
Challenges:
Mode collapse
Unstable training
Evaluation metrics (FID for point clouds)
9.3 Point FlowΒΆ
PointFlow [Yang et al., 2019]: Normalizing flow for point clouds.
Idea: Learn bijection between Gaussian and point cloud distribution.
Two-level hierarchy:
Global: Sample number of points and shape code
Local: Sample point positions conditioned on shape
Benefits: Exact likelihood, better mode coverage than GANs.
10. Point Cloud CompletionΒΆ
10.1 Problem FormulationΒΆ
Input: Partial point cloud P_partial Output: Complete point cloud P_complete
Applications:
Occlusion handling in robotics
3D reconstruction from single view
Shape completion from scans
10.2 PCN: Point Completion NetworkΒΆ
PCN [Yuan et al., 2018]:
Encoder: PointNet β coarse global feature
Decoder:
Coarse completion: FC layers β M points (low-res)
Fine completion: Folding-based β N points (high-res)
Multi-resolution loss:
L = L_coarse(P_coarse, P_gt) + L_fine(P_fine, P_gt)
10.3 FoldingNetΒΆ
FoldingNet [Yang et al., 2018]: Learn to fold 2D grid into 3D shape.
Decoder:
p_i = MLP(concat(f_global, grid_i))
Where grid_i β [0,1]Β² is 2D grid point.
Intuition: Learn deformation from canonical 2D plane to 3D surface.
11. Point Cloud UpsamplingΒΆ
11.1 PU-NetΒΆ
PU-Net [Yu et al., 2018]: Upsample sparse point clouds.
Idea: Learn to densify point cloud while preserving geometry.
Pipeline:
Feature extraction: PointNet++
Feature expansion: Replicate features
Point generation: MLP β dense points
Coordinate refinement: Adjust positions
Expansion:
f_i β [f_i, f_i, ..., f_i] (r times)
Then MLP to generate r points per input point.
11.2 PU-GANΒΆ
PU-GAN [Li et al., 2019]: GAN-based upsampling.
Generator: Upsample point cloud
Discriminator: Distinguish real vs. generated patches
Self-attention: Capture long-range dependencies
Losses:
Adversarial loss
Uniform loss (encourage uniform distribution)
Reconstruction loss
12. ApplicationsΒΆ
12.1 Autonomous DrivingΒΆ
LiDAR perception:
Detection: Cars, pedestrians, cyclists (PointPillars, SECOND)
Tracking: Multi-object tracking in 3D
Segmentation: Road, buildings, vegetation
Sensor fusion: Combine LiDAR + camera
12.2 RoboticsΒΆ
Grasping:
Segment graspable regions
Plan grasp poses from partial point clouds
Navigation:
SLAM (Simultaneous Localization and Mapping)
Obstacle avoidance
12.3 AR/VRΒΆ
3D reconstruction: Real-time scene capture
Object recognition: Identify objects in 3D space
Interaction: Hand tracking, gesture recognition
12.4 Medical ImagingΒΆ
Organ segmentation: From CT/MRI scans
Surgical planning: 3D anatomy visualization
13. Recent Advances (2020-2024)ΒΆ
13.1 Point-Voxel CNN (PVCNN)ΒΆ
PVCNN [Liu et al., 2019]: Combine point and voxel representations.
Idea:
Voxel branch: Regular grid, efficient 3D conv
Point branch: Precise geometry, no discretization
Fusion: Devoxelize voxel features β combine with point features
Benefits: Speed of voxels + accuracy of points.
13.2 Stratified TransformerΒΆ
Stratified Transformer [Lai et al., 2022]: Efficient attention for large point clouds.
Key idea: Hierarchical grouping + local attention
Complexity: O(N log N) instead of O(NΒ²)
13.3 Point-BERTΒΆ
Point-BERT [Yu et al., 2022]: Pre-training for point clouds.
Masked autoencoding:
Mask random points
Predict masked points from context
Discretization: Learn discrete tokens via dVAE
Transfer: Fine-tune on downstream tasks
13.4 Point-MAEΒΆ
Point-MAE [Pang et al., 2022]: Masked autoencoders.
High masking ratio: 60-90% of points masked
Decoder: Lightweight transformer
Reconstruction: Predict coordinates of masked points
Finding: High masking ratio forces learning global structure.
14. Benchmarks and DatasetsΒΆ
14.1 ModelNetΒΆ
ModelNet40: 12,311 CAD models, 40 categories
Task: Classification
Evaluation: Overall accuracy
Sampling: 1,024 or 2,048 points per model
14.2 ShapeNetΒΆ
ShapeNetCore: 51,300 models, 55 categories
ShapeNetPart: Part segmentation (16 categories, 50 parts)
Tasks: Classification, segmentation, completion
14.3 ScanNetΒΆ
Indoor scene dataset: 1,513 scanned scenes
Annotations: Semantic labels, instance masks, 3D boxes
Tasks: Scene understanding, object detection, segmentation
14.4 KITTIΒΆ
Autonomous driving: 7,481 training, 7,518 testing frames
LiDAR + camera: Multi-modal
Tasks: 3D object detection, tracking
Categories: Car, pedestrian, cyclist
14.5 Waymo Open DatasetΒΆ
Large-scale: 1,000 scenes, 200k frames
High quality: 5 LiDAR sensors, 5 cameras
Dense annotations: 3D boxes, tracking IDs
15. Evaluation MetricsΒΆ
15.1 ClassificationΒΆ
Accuracy: Fraction of correctly classified point clouds
Per-class accuracy: Accuracy per category (balanced)
15.2 SegmentationΒΆ
mIoU (mean Intersection over Union):
IoU_c = TP_c / (TP_c + FP_c + FN_c)
mIoU = (1/C) Ξ£_c IoU_c
Overall accuracy: Fraction of correctly labeled points
15.3 DetectionΒΆ
AP (Average Precision): At IoU threshold (typically 0.5, 0.7)
mAP: Mean over categories
NDS (Waymo): Normalized Detection Score
NDS = (1/10) Ξ£_{thresholds} AP_threshold
15.4 Completion/GenerationΒΆ
Chamfer Distance (CD):
CD = (1/|Pβ|) Ξ£_{p β Pβ} min_{q β Pβ} ||p - q||Β² +
(1/|Pβ|) Ξ£_{q β Pβ} min_{p β Pβ} ||p - q||Β²
Earth Moverβs Distance (EMD):
EMD = min_{Ο:PββPβ} (1/N) Ξ£_{p β Pβ} ||p - Ο(p)||Β²
Hausdorff Distance:
d_H = max(max_{p β Pβ} min_{q β Pβ} ||p-q||, max_{q β Pβ} min_{p β Pβ} ||p-q||)
16. Implementation ConsiderationsΒΆ
16.1 Efficient Neighborhood SearchΒΆ
k-NN in 3D:
Naive: O(NΒ²) comparison
KD-tree: O(N log N) build, O(log N) query
Ball tree: Better for high dimensions
GPU: Batch k-NN via sorting
Libraries:
Open3D, PyTorch3D (GPU-accelerated)
scipy.spatial.KDTree (CPU)
16.2 Sampling StrategiesΒΆ
Random sampling: Fast but may miss details
Farthest Point Sampling: Better coverage, O(NΒ²)
Approximate FPS: Faster alternatives
Grid-based: Voxelize β sample per voxel
Random subset FPS: FPS on random subset
16.3 NormalizationΒΆ
Unit sphere:
P_norm = (P - center) / max_distance
Zero mean, unit variance:
P_norm = (P - mean(P)) / std(P)
Align to canonical orientation: PCA, but can be unstable
16.4 Data AugmentationΒΆ
Rotation: Random SO(3) rotation
Small angles (Β±15Β°) or full rotation
Jittering: Add Gaussian noise
p_i β p_i + Ξ΅, Ξ΅ ~ N(0, ΟΒ²I)
Scaling: Random scaling factor [0.8, 1.2]
Dropout: Randomly drop points (simulate occlusion)
17. Limitations and Open ProblemsΒΆ
17.1 Current LimitationsΒΆ
Scalability: Large scenes (millions of points) challenging
Density variation: Varying sampling rates
Noise robustness: Real scans have outliers, noise
Occlusion: Missing data from self-occlusion
Computational cost: Point-wise operations expensive
17.2 Open ProblemsΒΆ
Unified architecture: Single model for all tasks
Self-supervised learning: Better pre-training strategies
Multi-modal fusion: Optimal way to combine LiDAR + RGB
Temporal modeling: Point cloud sequences (4D)
Generalization: Cross-dataset performance
17.3 Future DirectionsΒΆ
Transformer-based: Scaling transformers to large point clouds
Implicit representations: Neural fields vs. explicit points
Efficient attention: Linear complexity attention
Neural rendering: NeRF integration with point clouds
Foundation models: Large pre-trained models (Point-BERT, Point-MAE)
18. Key TakeawaysΒΆ
PointNet: Pioneering permutation-invariant architecture via symmetric function (max pooling)
PointNet++: Hierarchical feature learning with FPS + ball query + local PointNet
DGCNN: Dynamic graph construction in feature space with EdgeConv
Point Transformers: Attention mechanism with position encoding for point clouds
Rotation invariance: Critical for real-world robustness (VN networks, equivariant features)
Set abstraction: Sampling + grouping + feature extraction fundamental building block
Applications: Autonomous driving (detection), robotics (grasping), AR/VR (reconstruction)
Metrics: Chamfer distance (generation), mIoU (segmentation), mAP (detection)
Challenges: Scalability, density variation, rotation sensitivity
Future: Transformers, self-supervised learning, multi-modal fusion
Core insight: Point clouds require permutation-invariant operations and hierarchical abstractions to capture both local geometry and global structure.
19. Mathematical SummaryΒΆ
PointNet global feature:
f_global = max_{i=1,...,N} MLP(p_i)
PointNet++ set abstraction:
S^(l) = {f_i^(l) | f_i^(l) = PointNet({p_j - c_i | p_j β B(c_i, r)})}
DGCNN EdgeConv:
x_i' = max_{j β kNN(x_i)} MLP(concat(x_i, x_j - x_i))
Point Transformer attention:
y_i = Ξ£_{j β N(i)} softmax(Ξ³(Ο(x_i) - Ο(x_j) + Ξ΄_ij)) β (Ξ±(x_j) + Ξ΄_ij)
Chamfer Distance:
CD(Pβ, Pβ) = Ξ£_{p β Pβ} min_{q β Pβ} ||p-q||Β² + Ξ£_{q β Pβ} min_{p β Pβ} ||p-q||Β²
ReferencesΒΆ
Qi et al. (2017a) βPointNet: Deep Learning on Point Sets for 3D Classification and Segmentationβ
Qi et al. (2017b) βPointNet++: Deep Hierarchical Feature Learning on Point Sets in a Metric Spaceβ
Wang et al. (2019) βDynamic Graph CNN for Learning on Point Cloudsβ
Zhao et al. (2021) βPoint Transformerβ
Wu et al. (2019) βPointConv: Deep Convolutional Networks on 3D Point Cloudsβ
Zhou & Tuzel (2018) βVoxelNet: End-to-End Learning for Point Cloud Based 3D Object Detectionβ
Lang et al. (2019) βPointPillars: Fast Encoders for Object Detection from Point Cloudsβ
Yu et al. (2018) βPU-Net: Point Cloud Upsampling Networkβ
Pang et al. (2022) βMasked Autoencoders for Point Cloud Self-supervised Learningβ
Liu et al. (2019) βPoint-Voxel CNN for Efficient 3D Deep Learningβ
"""
Advanced Point Cloud Networks Implementations
Production-ready PyTorch implementations of point cloud processing architectures.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from typing import List, Tuple, Optional, Callable
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# ============================================================================
# 1. PointNet Architecture
# ============================================================================
class TNet(nn.Module):
    """
    Transformation Network (T-Net) for input/feature transformation.
    Regresses a k x k matrix, biased toward the identity at initialization.
    Args:
        k: Dimensionality of input (3 for spatial, 64 for features)
    """

    def __init__(self, k: int = 3):
        super().__init__()
        self.k = k
        # Shared per-point MLP (1x1 convolutions over the point axis).
        self.conv1 = nn.Conv1d(k, 64, 1)
        self.conv2 = nn.Conv1d(64, 128, 1)
        self.conv3 = nn.Conv1d(128, 1024, 1)
        # Regression head for the k*k matrix entries.
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, k * k)
        # One batch norm per conv / fc layer above.
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        self.bn4 = nn.BatchNorm1d(512)
        self.bn5 = nn.BatchNorm1d(256)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [batch, k, num_points]
        Returns:
            transform: [batch, k, k] transformation matrix
        """
        n_batch = x.size(0)
        # Lift each point to a 1024-d feature.
        for conv, bn in ((self.conv1, self.bn1),
                         (self.conv2, self.bn2),
                         (self.conv3, self.bn3)):
            x = F.relu(bn(conv(x)))
        # Symmetric max-pool over points -> [batch, 1024].
        x, _ = x.max(dim=2)
        x = F.relu(self.bn4(self.fc1(x)))
        x = F.relu(self.bn5(self.fc2(x)))
        offset = self.fc3(x)  # [batch, k*k]
        # Add the flattened identity so the initial transform is ~identity.
        eye = torch.eye(self.k, device=offset.device).flatten().unsqueeze(0)
        matrices = offset + eye.repeat(n_batch, 1)
        return matrices.view(-1, self.k, self.k)
class PointNet(nn.Module):
    """
    PointNet for point cloud classification or per-point segmentation.

    Args:
        num_classes: Number of output classes.
        task: 'classification' or 'segmentation'.
        use_transform: Whether to apply learned input/feature transforms.
    """

    def __init__(
        self,
        num_classes: int,
        task: str = 'classification',
        use_transform: bool = True
    ):
        super().__init__()
        self.task = task
        self.use_transform = use_transform
        # Learned 3x3 input and 64x64 feature alignment networks.
        if use_transform:
            self.input_transform = TNet(k=3)
            self.feature_transform = TNet(k=64)
        # Shared per-point MLP (1x1 convolutions).
        self.conv1 = nn.Conv1d(3, 64, 1)
        self.conv2 = nn.Conv1d(64, 128, 1)
        self.conv3 = nn.Conv1d(128, 1024, 1)
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        if task == 'classification':
            # MLP head over the pooled global feature.
            self.fc1 = nn.Linear(1024, 512)
            self.fc2 = nn.Linear(512, 256)
            self.fc3 = nn.Linear(256, num_classes)
            self.bn_fc1 = nn.BatchNorm1d(512)
            self.bn_fc2 = nn.BatchNorm1d(256)
            self.dropout = nn.Dropout(p=0.3)
        else:
            # Per-point head over concatenated local (64) + global (1024).
            self.conv4 = nn.Conv1d(1088, 512, 1)
            self.conv5 = nn.Conv1d(512, 256, 1)
            self.conv6 = nn.Conv1d(256, 128, 1)
            self.conv7 = nn.Conv1d(128, num_classes, 1)
            self.bn4 = nn.BatchNorm1d(512)
            self.bn5 = nn.BatchNorm1d(256)
            self.bn6 = nn.BatchNorm1d(128)

    def forward(
        self,
        x: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Args:
            x: [batch, 3, num_points] or [batch, num_points, 3]

        Returns:
            output: [batch, num_classes] (classification) or
                [batch, num_classes, num_points] (segmentation)
            feature_transform: [batch, 64, 64] if use_transform else None
        """
        # Accept either layout; work channel-first internally.
        if x.size(1) != 3:
            x = x.transpose(1, 2)
        n_pts = x.size(2)

        # Canonicalize input coordinates.
        if self.use_transform:
            x = torch.bmm(self.input_transform(x), x)

        x = F.relu(self.bn1(self.conv1(x)))
        local_feat = x  # kept for the segmentation branch

        # Canonicalize the 64-d feature space.
        trans_feat = self.feature_transform(x) if self.use_transform else None
        if trans_feat is not None:
            x = torch.bmm(trans_feat, x)

        x = F.relu(self.bn2(self.conv2(x)))
        x = self.bn3(self.conv3(x))
        # Order-invariant global descriptor.
        pooled = x.max(dim=2).values  # [batch, 1024]

        if self.task == 'classification':
            h = F.relu(self.bn_fc1(self.fc1(pooled)))
            h = self.dropout(h)
            h = F.relu(self.bn_fc2(self.fc2(h)))
            h = self.dropout(h)
            return self.fc3(h), trans_feat

        # Segmentation: fuse per-point features with the tiled global one.
        tiled = pooled.unsqueeze(2).expand(-1, -1, n_pts)
        h = torch.cat([local_feat, tiled], dim=1)
        h = F.relu(self.bn4(self.conv4(h)))
        h = F.relu(self.bn5(self.conv5(h)))
        h = F.relu(self.bn6(self.conv6(h)))
        return self.conv7(h), trans_feat
def pointnet_loss(
    outputs: torch.Tensor,
    labels: torch.Tensor,
    trans_feat: Optional[torch.Tensor] = None,
    reg_weight: float = 0.001
) -> Tuple[torch.Tensor, Dict[str, float]]:
    """
    PointNet loss: task loss plus a feature-transform orthogonality term.

    Fixes a NameError in the original: the return annotation referenced
    `Dict`, which was never imported from `typing`.

    The regularizer ||I - A^T A||_F encourages the predicted feature
    transform to stay close to an orthogonal matrix. The identity size is
    taken from `trans_feat` itself, so any k x k transform is supported
    (the original hard-coded 64).

    Args:
        outputs: Predictions; [batch, C] for classification or
            [batch, C, N] for segmentation (cross_entropy handles both).
        labels: Ground-truth class indices.
        trans_feat: Feature transformation matrix [batch, k, k], or None.
        reg_weight: Weight of the orthogonality regularizer.

    Returns:
        total_loss: Scalar tensor suitable for backprop.
        loss_dict: Float breakdown with keys 'total', 'task', 'reg'.
    """
    # F.cross_entropy accepts both 2-D and K-dimensional inputs, so the
    # original's identical if/else branches are unnecessary.
    task_loss = F.cross_entropy(outputs, labels)

    # Orthogonality regularization: ||I - A^T A||_F averaged over batch.
    reg_loss = outputs.new_zeros(())
    if trans_feat is not None:
        k = trans_feat.size(-1)
        eye = torch.eye(k, device=trans_feat.device).expand_as(trans_feat)
        diff = eye - torch.bmm(trans_feat.transpose(1, 2), trans_feat)
        reg_loss = torch.norm(diff, dim=(1, 2)).mean()

    total_loss = task_loss + reg_weight * reg_loss
    loss_dict = {
        'total': total_loss.item(),
        'task': task_loss.item(),
        'reg': reg_loss.item(),
    }
    return total_loss, loss_dict
# ============================================================================
# 2. PointNet++ Components
# ============================================================================
def farthest_point_sampling(xyz: torch.Tensor, num_samples: int) -> torch.Tensor:
    """
    Iterative Farthest Point Sampling (FPS).

    Starting from a random seed point per batch element, repeatedly picks
    the point that is farthest (in squared Euclidean distance) from the
    set of points already chosen.

    Args:
        xyz: [batch, num_points, 3] point coordinates.
        num_samples: Number of points to select.

    Returns:
        [batch, num_samples] long tensor of selected point indices.
    """
    bsz, n_pts, _ = xyz.shape
    dev = xyz.device

    selected = torch.zeros(bsz, num_samples, dtype=torch.long, device=dev)
    # Running minimum squared distance from each point to the selected set.
    min_dist = torch.full((bsz, n_pts), 1e10, device=dev)
    # Random seed point per batch element.
    current = torch.randint(0, n_pts, (bsz,), device=dev)
    rows = torch.arange(bsz, device=dev)

    for step in range(num_samples):
        selected[:, step] = current
        anchor = xyz[rows, current].unsqueeze(1)   # [batch, 1, 3]
        d = ((xyz - anchor) ** 2).sum(dim=-1)      # [batch, n_pts]
        min_dist = torch.minimum(min_dist, d)
        # Next pick: the point farthest from everything selected so far.
        current = min_dist.argmax(dim=1)
    return selected
def ball_query(
    xyz: torch.Tensor,
    centroids: torch.Tensor,
    radius: float,
    num_samples: int
) -> torch.Tensor:
    """
    Ball query: for each centroid, gather indices of points within `radius`.

    If a ball contains more than `num_samples` points, a random subset is
    taken; if it contains fewer, members are repeated at random; if it is
    empty, the single nearest point fills every slot.

    Improvement over the original: pairwise distances are computed via
    broadcasting instead of materializing two repeated
    [batch, num_centroids, num_points, 3] tensors, which cuts peak memory
    substantially for large clouds. Results are unchanged.

    Args:
        xyz: [batch, num_points, 3] all points.
        centroids: [batch, num_centroids, 3] query points.
        radius: Search radius.
        num_samples: Number of neighbor slots per ball.

    Returns:
        indices: [batch, num_centroids, num_samples] long tensor.
    """
    batch_size = xyz.size(0)
    num_centroids = centroids.size(1)
    device = xyz.device

    # Pairwise squared distances via broadcasting:
    # [batch, num_centroids, num_points]
    distances = torch.sum(
        (centroids.unsqueeze(2) - xyz.unsqueeze(1)) ** 2, dim=-1
    )
    # Membership mask (compare squared distances; avoids a sqrt).
    mask = distances < radius ** 2

    indices = torch.zeros(
        batch_size, num_centroids, num_samples,
        dtype=torch.long, device=device
    )
    for b in range(batch_size):
        for c in range(num_centroids):
            valid_indices = torch.where(mask[b, c])[0]
            if len(valid_indices) == 0:
                # Empty ball: fall back to the nearest point everywhere.
                indices[b, c, :] = torch.argmin(distances[b, c])
            elif len(valid_indices) < num_samples:
                # Too few members: pad with random repeats of the members.
                indices[b, c, :len(valid_indices)] = valid_indices
                pad = torch.randint(
                    0, len(valid_indices),
                    (num_samples - len(valid_indices),)
                )
                indices[b, c, len(valid_indices):] = valid_indices[pad]
            else:
                # Enough members: random subset without replacement.
                perm = torch.randperm(len(valid_indices), device=device)
                indices[b, c, :] = valid_indices[perm[:num_samples]]
    return indices
class SetAbstractionLayer(nn.Module):
    """
    PointNet++ set abstraction layer.

    Samples centroids with FPS, groups each centroid's ball-query
    neighborhood, and encodes every local region with a shared MLP
    followed by max pooling over the neighbors.

    Args:
        num_centroids: Number of points to sample.
        radius: Ball query radius.
        num_samples: Max points per neighborhood.
        in_channels: Input feature channels (0 when only xyz is used).
        mlp_channels: Output channels of each shared-MLP stage.
    """

    def __init__(
        self,
        num_centroids: int,
        radius: float,
        num_samples: int,
        in_channels: int,
        mlp_channels: List[int]
    ):
        super().__init__()
        self.num_centroids = num_centroids
        self.radius = radius
        self.num_samples = num_samples
        # Shared MLP over grouped neighborhoods; relative xyz (3 dims) is
        # concatenated to the input features.
        blocks = []
        prev = in_channels + 3
        for width in mlp_channels:
            blocks.append(nn.Conv2d(prev, width, 1))
            blocks.append(nn.BatchNorm2d(width))
            blocks.append(nn.ReLU())
            prev = width
        self.mlp = nn.Sequential(*blocks)

    def forward(
        self,
        xyz: torch.Tensor,
        features: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            xyz: [batch, num_points, 3]
            features: [batch, num_points, C] or None

        Returns:
            new_xyz: [batch, num_centroids, 3]
            new_features: [batch, num_centroids, mlp_channels[-1]]
        """
        # 1) Choose centroids via farthest point sampling.
        fps_idx = farthest_point_sampling(xyz, self.num_centroids)
        new_xyz = torch.gather(xyz, 1, fps_idx.unsqueeze(-1).repeat(1, 1, 3))

        # 2) Neighbors of each centroid within the ball.
        nbr_idx = ball_query(xyz, new_xyz, self.radius, self.num_samples)

        # 3) Gather neighbor coordinates, expressed relative to the centroid.
        expand_xyz = xyz.unsqueeze(1).repeat(1, self.num_centroids, 1, 1)
        local_xyz = torch.gather(
            expand_xyz, 2, nbr_idx.unsqueeze(-1).repeat(1, 1, 1, 3)
        ) - new_xyz.unsqueeze(2)

        # 4) Optionally append per-point input features.
        if features is None:
            region = local_xyz
        else:
            c = features.size(-1)
            expand_feat = features.unsqueeze(1).repeat(
                1, self.num_centroids, 1, 1
            )
            nbr_feat = torch.gather(
                expand_feat, 2, nbr_idx.unsqueeze(-1).repeat(1, 1, 1, c)
            )
            region = torch.cat([local_xyz, nbr_feat], dim=-1)

        # 5) Shared MLP over [batch, C, centroids, samples],
        #    then max over the neighbor axis.
        region = region.permute(0, 3, 1, 2)
        encoded = self.mlp(region)
        pooled = encoded.max(dim=-1).values        # [batch, C, centroids]
        return new_xyz, pooled.transpose(1, 2)
class PointNetPlusPlus(nn.Module):
    """
    PointNet++ (single-scale grouping) classifier.

    Three stacked set-abstraction stages progressively coarsen the cloud
    down to a single global descriptor, followed by an MLP head.

    Args:
        num_classes: Number of output classes.
        num_points: Expected number of input points.
    """

    def __init__(self, num_classes: int, num_points: int = 1024):
        super().__init__()
        self.sa1 = SetAbstractionLayer(
            num_centroids=512,
            radius=0.2,
            num_samples=32,
            in_channels=0,
            mlp_channels=[64, 64, 128]
        )
        self.sa2 = SetAbstractionLayer(
            num_centroids=128,
            radius=0.4,
            num_samples=64,
            in_channels=128,
            mlp_channels=[128, 128, 256]
        )
        # Final stage collapses everything to one centroid; the large
        # radius is intended to cover the entire (normalized) cloud.
        self.sa3 = SetAbstractionLayer(
            num_centroids=1,
            radius=10.0,
            num_samples=128,
            in_channels=256,
            mlp_channels=[256, 512, 1024]
        )
        # Classification head.
        self.fc1 = nn.Linear(1024, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.4)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(0.4)
        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, xyz: torch.Tensor) -> torch.Tensor:
        """
        Args:
            xyz: [batch, num_points, 3] (or [batch, 3, num_points]).

        Returns:
            logits: [batch, num_classes]
        """
        # Accept either layout; work points-last-dim-3 internally.
        if xyz.size(-1) != 3:
            xyz = xyz.transpose(1, 2)
        coords, feats = self.sa1(xyz, None)
        coords, feats = self.sa2(coords, feats)
        coords, feats = self.sa3(coords, feats)
        h = feats.squeeze(1)  # [batch, 1024] global descriptor
        h = self.dropout1(F.relu(self.bn1(self.fc1(h))))
        h = self.dropout2(F.relu(self.bn2(self.fc2(h))))
        return self.fc3(h)
# ============================================================================
# 3. DGCNN (Dynamic Graph CNN)
# ============================================================================
def knn(x: torch.Tensor, k: int) -> torch.Tensor:
    """
    Indices of the k nearest neighbors of every point (self included,
    since each point is at distance zero from itself).

    Uses ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2 and ranks by the negated
    squared distance so topk returns the closest points first.

    Args:
        x: [batch, num_points, dim]
        k: Number of neighbors.

    Returns:
        [batch, num_points, k] neighbor indices.
    """
    sq = torch.sum(x ** 2, dim=2, keepdim=True)          # [b, n, 1]
    cross = torch.matmul(x, x.transpose(2, 1))           # [b, n, n]
    neg_sq_dist = 2 * cross - sq - sq.transpose(2, 1)    # -(||a-b||^2)
    return neg_sq_dist.topk(k=k, dim=-1)[1]
def get_graph_feature(
    x: torch.Tensor,
    k: int = 20,
    idx: Optional[torch.Tensor] = None
) -> torch.Tensor:
    """
    Build DGCNN edge features [x_i, x_j - x_i] over each point's k
    nearest neighbors.

    Args:
        x: [batch, num_points, dim]
        k: Number of neighbors.
        idx: Optional pre-computed neighbor indices [batch, num_points, k].

    Returns:
        edge_features: [batch, num_points, k, 2*dim]
    """
    batch_size, num_points, dim = x.size()
    if idx is None:
        idx = knn(x, k)
    # Flatten batch and point dims so all neighbors are fetched in one
    # fancy-indexing operation.
    offsets = (
        torch.arange(batch_size, device=x.device).view(-1, 1, 1) * num_points
    )
    flat_idx = (idx + offsets).reshape(-1)
    gathered = x.reshape(batch_size * num_points, dim)[flat_idx]
    neighbors = gathered.view(batch_size, num_points, k, dim)
    # Edge feature per (point, neighbor): anchor plus relative offset.
    anchor = x.unsqueeze(2).expand(-1, -1, k, -1)
    return torch.cat([anchor, neighbors - anchor], dim=-1)
class EdgeConv(nn.Module):
    """
    DGCNN edge convolution.

    Builds [x_i, x_j - x_i] edge features over the k-NN graph, applies a
    shared 1x1 convolution, and max-pools over each neighborhood.

    Args:
        in_channels: Input feature channels.
        out_channels: Output feature channels.
        k: Number of neighbors.
    """

    def __init__(self, in_channels: int, out_channels: int, k: int = 20):
        super().__init__()
        self.k = k
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels * 2, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(negative_slope=0.2)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [batch, num_points, in_channels]

        Returns:
            [batch, num_points, out_channels]
        """
        # [batch, num_points, k, 2*C] -> channels-first for Conv2d.
        edges = get_graph_feature(x, self.k).permute(0, 3, 1, 2)
        convolved = self.conv(edges)
        # Symmetric aggregation over the k neighbors, back to channels-last.
        return convolved.max(dim=-1).values.transpose(1, 2)
class DGCNN(nn.Module):
    """
    Dynamic Graph CNN classifier.

    Stacks EdgeConv blocks (each rebuilds a k-NN graph in its own feature
    space -- the "dynamic" graph), fuses multi-level features, and
    classifies from dual (max + mean) global pooling.

    Args:
        num_classes: Number of output classes.
        k: Number of neighbors per point.
    """

    def __init__(self, num_classes: int, k: int = 20):
        super().__init__()
        self.k = k
        # EdgeConv backbone.
        self.conv1 = EdgeConv(3, 64, k)
        self.conv2 = EdgeConv(64, 64, k)
        self.conv3 = EdgeConv(64, 128, k)
        self.conv4 = EdgeConv(128, 256, k)
        # Lift the fused 512-d features to 1024 dims.
        self.conv5 = nn.Sequential(
            nn.Conv1d(512, 1024, 1, bias=False),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(negative_slope=0.2)
        )
        # Classification head over [max-pool ; avg-pool] = 2048 dims.
        self.fc1 = nn.Linear(1024 * 2, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [batch, num_points, 3]

        Returns:
            logits: [batch, num_classes]
        """
        bsz = x.size(0)
        # EdgeConv stack; each stage re-derives the graph from its input.
        f1 = self.conv1(x)
        f2 = self.conv2(f1)
        f3 = self.conv3(f2)
        f4 = self.conv4(f3)
        # Multi-level fusion -> [batch, 512, num_points].
        fused = torch.cat([f1, f2, f3, f4], dim=-1).transpose(1, 2)
        lifted = self.conv5(fused)  # [batch, 1024, num_points]
        # Dual global pooling -> [batch, 2048].
        pooled = torch.cat(
            [
                F.adaptive_max_pool1d(lifted, 1).view(bsz, -1),
                F.adaptive_avg_pool1d(lifted, 1).view(bsz, -1),
            ],
            dim=1,
        )
        h = F.leaky_relu(self.bn1(self.fc1(pooled)), negative_slope=0.2)
        h = self.dropout1(h)
        h = F.leaky_relu(self.bn2(self.fc2(h)), negative_slope=0.2)
        h = self.dropout2(h)
        return self.fc3(h)
# ============================================================================
# 4. Chamfer Distance
# ============================================================================
def chamfer_distance(
    pc1: torch.Tensor,
    pc2: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Symmetric Chamfer Distance between two point clouds.

    CD = mean_p min_q ||p - q||^2 + mean_q min_p ||q - p||^2

    Args:
        pc1: [batch, N1, 3]
        pc2: [batch, N2, 3]

    Returns:
        cd: [batch] symmetric Chamfer distance.
        cd1: [batch] pc1 -> pc2 term.
        cd2: [batch] pc2 -> pc1 term.
    """
    # All pairwise squared distances: [batch, N1, N2].
    d2 = torch.sum((pc1.unsqueeze(2) - pc2.unsqueeze(1)) ** 2, dim=-1)
    # Forward term: each pc1 point to its closest pc2 point.
    cd1 = d2.min(dim=2).values.mean(dim=1)
    # Backward term: each pc2 point to its closest pc1 point.
    cd2 = d2.min(dim=1).values.mean(dim=1)
    return cd1 + cd2, cd1, cd2
# ============================================================================
# 5. Demo and Visualization
# ============================================================================
def generate_synthetic_point_cloud(
    num_points: int = 1024,
    shape: str = 'sphere'
) -> torch.Tensor:
    """
    Sample a synthetic point cloud of the requested shape.

    Args:
        num_points: Number of points to draw.
        shape: 'sphere' (unit-sphere surface), 'cube' (volume of
            [-1, 1]^3), or 'cylinder' (unit-radius lateral surface).

    Returns:
        points: [num_points, 3] float32 tensor.

    Raises:
        ValueError: If `shape` is not recognized.
    """
    if shape == 'sphere':
        # Spherical angles (note: uniform phi clusters points at poles).
        azimuth = np.random.uniform(0, 2 * np.pi, num_points)
        polar = np.random.uniform(0, np.pi, num_points)
        coords = np.stack(
            [np.sin(polar) * np.cos(azimuth),
             np.sin(polar) * np.sin(azimuth),
             np.cos(polar)],
            axis=1,
        )
    elif shape == 'cube':
        coords = np.random.uniform(-1, 1, (num_points, 3))
    elif shape == 'cylinder':
        azimuth = np.random.uniform(0, 2 * np.pi, num_points)
        height = np.random.uniform(-1, 1, num_points)
        coords = np.stack(
            [np.cos(azimuth), np.sin(azimuth), height], axis=1
        )
    else:
        raise ValueError(f"Unknown shape: {shape}")
    return torch.tensor(coords, dtype=torch.float32)
def demo_pointnet():
    """Demonstrate PointNet: forward pass, loss breakdown, parameter count."""
    print("=" * 80)
    print("PointNet Demo")
    print("=" * 80)

    model = PointNet(num_classes=10, task='classification', use_transform=True)

    # Random batch of clouds: [batch, points, xyz].
    clouds = torch.randn(4, 1024, 3)
    print(f"\nInput shape: {clouds.shape}")
    logits, trans_feat = model(clouds)
    print(f"Output shape: {logits.shape}")
    print(f"Feature transform shape: {trans_feat.shape if trans_feat is not None else None}")

    # Loss with transform-orthogonality regularization.
    targets = torch.randint(0, 10, (4,))
    _, parts = pointnet_loss(logits, targets, trans_feat)
    print(f"\nLoss breakdown:")
    print(f" Total: {parts['total']:.4f}")
    print(f" Task: {parts['task']:.4f}")
    print(f" Regularization: {parts['reg']:.6f}")

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\nNumber of parameters: {trainable:,}")
def demo_pointnet_plusplus():
    """Demonstrate PointNet++: forward pass and parameter count."""
    print("\n" + "=" * 80)
    print("PointNet++ Demo")
    print("=" * 80)

    net = PointNetPlusPlus(num_classes=10, num_points=1024)

    clouds = torch.randn(2, 1024, 3)
    print(f"\nInput shape: {clouds.shape}")
    logits = net(clouds)
    print(f"Output shape: {logits.shape}")

    trainable = sum(p.numel() for p in net.parameters() if p.requires_grad)
    print(f"Number of parameters: {trainable:,}")
def demo_dgcnn():
    """Demonstrate DGCNN: forward pass, dynamic graph, parameter count."""
    print("\n" + "=" * 80)
    print("DGCNN Demo")
    print("=" * 80)

    net = DGCNN(num_classes=10, k=20)

    clouds = torch.randn(2, 1024, 3)
    print(f"\nInput shape: {clouds.shape}")
    logits = net(clouds)
    print(f"Output shape: {logits.shape}")

    # Show that the k-NN graph can be built in an arbitrary feature space.
    print(f"\nDynamic k-NN graph construction:")
    feats = torch.randn(2, 1024, 64)
    nbr_idx = knn(feats, k=20)
    print(f" Neighbor indices shape: {nbr_idx.shape}")

    trainable = sum(p.numel() for p in net.parameters() if p.requires_grad)
    print(f"Number of parameters: {trainable:,}")
def demo_chamfer_distance():
    """Demonstrate Chamfer Distance on two similar sphere samples."""
    print("\n" + "=" * 80)
    print("Chamfer Distance Demo")
    print("=" * 80)

    reference = generate_synthetic_point_cloud(1024, 'sphere').unsqueeze(0)
    perturbed = generate_synthetic_point_cloud(1024, 'sphere').unsqueeze(0)
    perturbed = perturbed * 0.9 + 0.1  # Slightly different

    cd, cd1, cd2 = chamfer_distance(reference, perturbed)
    print(f"\nPoint cloud 1 shape: {reference.shape}")
    print(f"Point cloud 2 shape: {perturbed.shape}")
    print(f"\nChamfer Distance: {cd.item():.6f}")
    print(f" PC1 -> PC2: {cd1.item():.6f}")
    print(f" PC2 -> PC1: {cd2.item():.6f}")

    # Identical clouds should give (numerically) zero distance.
    cd_self, _, _ = chamfer_distance(reference, reference)
    print(f"\nSelf Chamfer Distance (should be ~0): {cd_self.item():.8f}")
def print_performance_comparison() -> None:
    """Print performance comparison table."""
    # NOTE(review): the table below contains mojibake (box-drawing and bullet
    # characters corrupted by a past encoding round-trip) — left byte-for-byte
    # as found; restoring the original glyphs would be guesswork.
    print("\n" + "=" * 80)
    print("POINT CLOUD NETWORKS COMPARISON")
    print("=" * 80)
    # Static, pre-formatted summary tables (benchmark figures as published).
    comparison = """
ModelNet40 Classification Performance:
ββββββββββββββββββββ¬βββββββββββββββ¬βββββββββββββββ¬βββββββββββββββ
β Method β Accuracy (%) β Parameters β Inference β
β β β (M) β Time (ms) β
ββββββββββββββββββββΌβββββββββββββββΌβββββββββββββββΌβββββββββββββββ€
β PointNet β 89.2 β 3.5 β 5.2 β
β PointNet++ β 91.9 β 1.5 β 18.7 β
β DGCNN β 92.9 β 1.8 β 12.3 β
β PointConv β 92.5 β 2.1 β 25.4 β
β Point Trans. β 93.7 β 7.8 β 32.1 β
β Point-BERT β 94.6 β 22.0 β 45.3 β
ββββββββββββββββββββ΄βββββββββββββββ΄βββββββββββββββ΄βββββββββββββββ
ShapeNet Part Segmentation (mIoU):
ββββββββββββββββββββ¬βββββββββββββββ¬βββββββββββββββ
β Method β Class mIoU β Instance mIoUβ
ββββββββββββββββββββΌβββββββββββββββΌβββββββββββββββ€
β PointNet β 80.4 β 83.7 β
β PointNet++ β 81.9 β 85.1 β
β DGCNN β 82.3 β 85.2 β
β Point Trans. β 83.7 β 86.6 β
ββββββββββββββββββββ΄βββββββββββββββ΄βββββββββββββββ
KITTI 3D Object Detection (Car, AP@0.7):
ββββββββββββββββββββ¬βββββββββββββββ¬βββββββββββββββ¬βββββββββββββββ
β Method β Easy β Moderate β Hard β
ββββββββββββββββββββΌβββββββββββββββΌβββββββββββββββΌβββββββββββββββ€
β VoxelNet β 77.5 β 65.1 β 57.7 β
β PointPillars β 82.6 β 74.3 β 68.0 β
β PointRCNN β 88.9 β 78.6 β 77.4 β
β PV-RCNN β 90.3 β 81.4 β 76.8 β
ββββββββββββββββββββ΄βββββββββββββββ΄βββββββββββββββ΄βββββββββββββββ
Architectural Characteristics:
ββββββββββββββββββββ¬βββββββββββββββββββββββββββββββββββββββββββββββ
β Method β Key Features β
ββββββββββββββββββββΌβββββββββββββββββββββββββββββββββββββββββββββββ€
β PointNet β β Permutation invariant (max pooling) β
β β β T-Net (input/feature transforms) β
β β β No local context β
ββββββββββββββββββββΌβββββββββββββββββββββββββββββββββββββββββββββββ€
β PointNet++ β β Hierarchical feature learning β
β β β FPS + Ball query (local context) β
β β β Multi-scale grouping β
ββββββββββββββββββββΌβββββββββββββββββββββββββββββββββββββββββββββββ€
β DGCNN β β Dynamic graph (k-NN in feature space) β
β β β EdgeConv (edge features) β
β β β Feature concatenation β
ββββββββββββββββββββΌβββββββββββββββββββββββββββββββββββββββββββββββ€
β Point Trans. β β Vector self-attention β
β β β Position encoding (Ξ΄ = MLP(p_i - p_j)) β
β β β Long-range dependencies β
ββββββββββββββββββββ΄βββββββββββββββββββββββββββββββββββββββββββββββ
Decision Guide:
ββββββββββββββββββββββββββββββββββββ¬ββββββββββββββββββββββββββββββ
β Scenario β Recommended Method β
ββββββββββββββββββββββββββββββββββββΌββββββββββββββββββββββββββββββ€
β Fast inference, simple shapes β PointNet β
β Complex geometry, local patterns β PointNet++ β
β Semantic relationships β DGCNN β
β Long-range dependencies β Point Transformer β
β 3D detection (autonomous) β PointPillars, PV-RCNN β
β Large-scale scenes β Voxel-based (VoxelNet) β
β Rotation invariance required β VN-PointNet, PointConv β
β Pre-training available β Point-BERT, Point-MAE β
ββββββββββββββββββββββββββββββββββββ΄ββββββββββββββββββββββββββββββ
"""
    print(comparison)
    print("\nKey Insights:")
    print("1. PointNet: Simple, fast, but lacks local structure")
    print("2. PointNet++: Hierarchical abstraction, multi-scale features")
    print("3. DGCNN: Dynamic graphs adapt to feature space, not just 3D coords")
    print("4. Point Transformer: Attention mechanism for long-range context")
    print("5. Detection: Voxel/pillar-based faster, point-based more accurate")
    print("6. FPS + Ball query: Core building block for local feature extraction")
    print("7. Chamfer Distance: Standard metric for point cloud generation/completion")
    print("8. Rotation invariance: Critical for real-world robustness")
    print("9. Scalability: Hierarchical downsampling essential for large scenes")
    print("10. Trade-off: Accuracy vs. speed (Point Transformer vs. PointPillars)")
# ============================================================================
# Run Demonstrations
# ============================================================================
if __name__ == "__main__":
    # Run every architecture demo end-to-end, then print the summary tables.
    demo_pointnet()
    demo_pointnet_plusplus()
    demo_dgcnn()
    demo_chamfer_distance()
    print_performance_comparison()
    print("\n" + "=" * 80)
    print("Point Cloud Networks Implementations Complete!")
    print("=" * 80)