Object Detection: From Bounding Boxes to Modern Detectors
Image classification tells you what's in an image. Object detection tells you what AND where. This notebook covers the key concepts — sliding windows, anchor boxes, IoU, NMS — and shows how to use modern pretrained detectors (YOLO, Faster R-CNN) for practical tasks.
# Core numeric / plotting stack. torch + torchvision are optional so the
# concept sections (IoU, NMS, mAP) still run without a deep-learning install.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image, ImageDraw
import warnings

warnings.filterwarnings('ignore')

try:
    import torch
    import torchvision
    from torchvision.models.detection import fasterrcnn_resnet50_fpn
    from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
    HAS_TORCHVISION = True
    # Prefer GPU when available; the detector and tensors go to this device later.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'torchvision available. Device: {device}')
except ImportError:
    HAS_TORCHVISION = False
    # NOTE(review): the dash in this message was mojibake-garbled; restored.
    print('torchvision not installed — showing core concepts with numpy')
    print('Install: pip install torch torchvision')
# Build a toy image containing a few colored rectangles ("objects")
def create_scene(size=400):
    """Return (image, annotations) for a synthetic detection scene.

    The image is a gray uint8 canvas of shape (size, size, 3) with three
    solid rectangles painted on it; each annotation dict carries the box in
    [x1, y1, x2, y2] pixel coordinates, its RGB fill color, and a label.
    """
    canvas = np.full((size, size, 3), 200, dtype=np.uint8)  # gray background
    annotations = [
        {'box': [50, 80, 130, 160], 'color': (220, 80, 80), 'label': 'car'},
        {'box': [200, 150, 300, 280], 'color': (80, 150, 220), 'label': 'truck'},
        {'box': [320, 50, 380, 130], 'color': (80, 220, 80), 'label': 'person'},
    ]
    for ann in annotations:
        left, top, right, bottom = ann['box']
        canvas[top:bottom, left:right, :] = ann['color']
    return canvas, annotations
# Materialize the demo scene and report its ground-truth annotations.
scene_img, ground_truth = create_scene()
print(f'Scene created: {len(ground_truth)} objects')
for gt in ground_truth:
    label, box = gt['label'], gt['box']
    print(f' {label}: box={box}')
1. Key Concepts: IoU, NMS, and Anchor Boxes
# Intersection over Union (IoU)
def calculate_iou(box1: list, box2: list) -> float:
    """Compute IoU of two axis-aligned boxes given as [x1, y1, x2, y2].

    IoU = intersection area / union area; returns 0 when the union is empty.
    """
    # Corners of the overlap rectangle (width/height clamp to 0 if disjoint).
    ix1, iy1 = max(box1[0], box2[0]), max(box1[1], box2[1])
    ix2, iy2 = min(box1[2], box2[2]), min(box1[3], box2[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    a1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    a2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = a1 + a2 - inter
    return inter / union if union > 0 else 0
# Non-Maximum Suppression
def nms(boxes: list, scores: list, iou_threshold: float = 0.5) -> list:
    """Greedy NMS: return indices of the boxes to keep.

    Repeatedly takes the highest-scoring remaining box, keeps it, and
    discards every other remaining box whose IoU with it reaches
    iou_threshold.
    """
    if not boxes:
        return []
    # Candidate indices ordered best-first (stable on score ties).
    remaining = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    keep = []
    while remaining:
        best, *rest = remaining
        keep.append(best)
        # Survivors: boxes that do not overlap the winner too strongly.
        remaining = [j for j in rest
                     if calculate_iou(boxes[best], boxes[j]) < iou_threshold]
    return keep
# Demonstrate IoU: one ground-truth box vs. three predictions of varying quality.
gt_box = ground_truth[0]['box']  # [50, 80, 130, 160]
pred_boxes = [
    [55, 85, 125, 155],    # Good match
    [200, 200, 280, 300],  # No overlap
    [30, 60, 160, 180],    # Partial overlap
]
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, pred in zip(axes, pred_boxes):
    iou = calculate_iou(gt_box, pred)
    ax.imshow(scene_img)
    # Ground truth: solid green rectangle.
    x1, y1, x2, y2 = gt_box
    ax.add_patch(patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                   linewidth=2, edgecolor='green',
                                   facecolor='none'))
    # Prediction: dashed red rectangle.
    x1, y1, x2, y2 = pred
    ax.add_patch(patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                   linewidth=2, edgecolor='red',
                                   facecolor='none', linestyle='--'))
    # NOTE(review): the verdict text was mojibake-garbled (a literal newline
    # split the f-string); restored as a plain-ASCII verdict.
    verdict = 'Good (>0.5)' if iou > 0.5 else 'Poor (<0.5)'
    ax.set_title(f'IoU = {iou:.2f}\n{verdict}')
    ax.axis('off')
    # Legend via proxy artists (the rectangles themselves carry no labels);
    # pass them as handles= so matplotlib reads their label kwargs.
    ax.legend(handles=[patches.Patch(color='green', label='GT'),
                       patches.Patch(color='red', label='Pred')], fontsize=8)
plt.suptitle('Intersection over Union (IoU) Examples', fontsize=13)
plt.tight_layout()
plt.show()
# NMS demonstration: three near-duplicate boxes on one object + one distinct box.
print('Non-Maximum Suppression demo:')
sim_boxes = [[50,80,130,160], [55,85,125,155], [58,82,135,162], [200,150,300,280]]
sim_scores = [0.95, 0.87, 0.82, 0.91]
kept = nms(sim_boxes, sim_scores, iou_threshold=0.5)
print(f' Before NMS: {len(sim_boxes)} boxes')
# NOTE(review): the arrow in this message was mojibake-garbled; restored.
print(f' After NMS: {len(kept)} boxes kept → indices {kept}')
print(f' Removed overlapping low-confidence boxes: {len(sim_boxes)-len(kept)}')
2. Using Pretrained Faster R-CNN
# COCO label map as used by torchvision detection models: 91 entries indexed
# by the raw label id the detector emits. Index 0 is the background class and
# 'N/A' marks ids absent from the 80-class subset, so COCO_CLASSES[label] is
# valid for any predicted label id.
COCO_CLASSES = [
'__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]
if HAS_TORCHVISION:
    # Load pretrained detector (COCO-trained Faster R-CNN, ResNet-50 FPN backbone).
    weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
    detector = fasterrcnn_resnet50_fpn(weights=weights)
    detector.eval().to(device)  # inference mode on the chosen device
    total_params = sum(p.numel() for p in detector.parameters())
    print(f'Faster R-CNN: {total_params:,} parameters')
    # Inference on synthetic image
    import torchvision.transforms.functional as F_tv
    # to_tensor: HWC uint8 [0,255] -> CHW float [0,1]; unsqueeze adds a batch dim.
    img_tensor = F_tv.to_tensor(scene_img).unsqueeze(0).to(device)
    with torch.no_grad():  # no autograd bookkeeping needed for inference
        predictions = detector(img_tensor)
    # One dict per input image with 'boxes' [N,4], 'labels' [N], 'scores' [N].
    pred = predictions[0]
    boxes = pred['boxes'].cpu().numpy()
    labels = pred['labels'].cpu().numpy()
    scores = pred['scores'].cpu().numpy()
    # Filter by confidence
    threshold = 0.5
    mask = scores > threshold
    print(f'Detections above {threshold:.0%} confidence: {mask.sum()}')
    for box, label, score in zip(boxes[mask], labels[mask], scores[mask]):
        print(f' {COCO_CLASSES[label]}: {score:.2%} at [{box[0]:.0f},{box[1]:.0f},{box[2]:.0f},{box[3]:.0f}]')
else:
    # torchvision missing: print the usage pattern instead of running it.
    print('Faster R-CNN usage pattern (torchvision):')
    print()
    print("from torchvision.models.detection import fasterrcnn_resnet50_fpn")
    print("detector = fasterrcnn_resnet50_fpn(weights='DEFAULT')")
    print("detector.eval()")
    print()
    print("# Inference:")
    print("img_tensor = torchvision.transforms.functional.to_tensor(pil_image)")
    print("predictions = detector([img_tensor])")
    print("# Returns: {'boxes': ..., 'labels': ..., 'scores': ...}")
# Quick latency/accuracy comparison of popular detector families.
model_table = [
    ('YOLOv8-nano', '5ms/img', '37.3', 'Real-time, edge'),
    ('YOLOv8-large', '25ms/img', '52.9', 'High accuracy'),
    ('Faster R-CNN', '80ms/img', '46.7', 'Baseline, two-stage'),
    ('DETR', '120ms/img', '42.0', 'Transformer-based'),
    ('RT-DETR-L', '32ms/img', '53.0', 'Modern real-time'),
]
print()
print('Object detection models (comparison):')
header = f'{"Model":<20} {"Speed":<12} {"mAP (COCO)":<12} {"Use case"}'
print(header)
print('-' * 60)
for name, speed, coco_map, note in model_table:
    print(f'{name:<20} {speed:<12} {coco_map:<12} {note}')
3. Evaluation: mAP
def compute_ap(recalls: list, precisions: list) -> float:
    """Average precision via 11-point interpolation (PASCAL VOC style).

    Samples recall thresholds t = 0.0, 0.1, ..., 1.0 and averages, over the
    11 thresholds, the maximum precision achieved at recall >= t (0 when no
    point reaches that recall level).
    """
    ap = 0
    for threshold in np.linspace(0, 1, 11):
        candidates = [p for r, p in zip(recalls, precisions) if r >= threshold]
        if candidates:
            ap += max(candidates) / 11
    return ap
# Simulate scored detections for one class and trace the precision-recall curve.
np.random.seed(42)  # reproducible simulation
n_gt = 20  # ground-truth objects: fixed denominator for recall
# 30 fake detections: confidence uniform in [0.5, 1.0], ~70% true positives.
# (RNG calls happen in the same order as before: uniform, then random, per item.)
detections = [
    {'score': np.random.uniform(0.5, 1.0), 'is_tp': np.random.random() < 0.7}
    for _ in range(30)
]
detections.sort(key=lambda d: d['score'], reverse=True)
precisions, recalls = [], []
cumulative_tp = cumulative_fp = 0
# Sweep the score-ranked detections, accumulating TP/FP counts.
for det in detections:
    if det['is_tp']:
        cumulative_tp += 1
    else:
        cumulative_fp += 1
    precisions.append(cumulative_tp / (cumulative_tp + cumulative_fp))
    recalls.append(cumulative_tp / n_gt)
# Compute AP for the simulated class and plot its precision-recall curve.
ap = compute_ap(recalls, precisions)
fig, ax = plt.subplots(figsize=(8, 5))
ax.step(recalls, precisions, 'b-', linewidth=2)
ax.fill_between(recalls, precisions, alpha=0.2, color='blue')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
# NOTE(review): the em dashes in the title and final print were
# mojibake-garbled in the original; restored.
ax.set_title(f'Precision-Recall Curve — AP = {ap:.3f}')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1.05])
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f'Average Precision (AP): {ap:.3f}')
print()
print('mAP = mean AP across all classes')
print('mAP@0.5 = mAP with IoU threshold 0.5 (easy)')
print('mAP@0.5:0.95 = averaged over IoU thresholds 0.5, 0.55, ..., 0.95 (strict)')
print('COCO standard: mAP@0.5:0.95 — YOLOv8-large achieves ~53 mAP')
Object Detection Cheat Sheet
Concept Description
────────────────────────────────────────────────────────────────
Bounding box [x1, y1, x2, y2] or [x_center, y_center, w, h]
IoU Intersection / Union. >0.5 = match, >0.75 = good
NMS Remove duplicate boxes — keep highest score
Anchor boxes Pre-defined box shapes at each grid cell
mAP Mean Average Precision — primary evaluation metric
Model Selection:
Real-time (< 10ms): YOLOv8-nano, YOLOv8-small
Balanced: YOLOv8-medium, RT-DETR
Highest accuracy: YOLOv8-x, Co-DETR, DINO
2-stage (research): Faster R-CNN, Cascade R-CNN
Usage Pattern (YOLO via ultralytics):
from ultralytics import YOLO
model = YOLO('yolov8n.pt')
results = model('image.jpg') # Detect
results = model.train(data='dataset.yaml', epochs=50) # Fine-tune
Fine-tuning detector checklist:
1. Annotate data (Roboflow, CVAT, LabelImg)
2. Export in YOLO format (txt files with normalized coords)
3. Create dataset.yaml with class names and paths
4. Start from pretrained weights (always)
5. Freeze backbone first, then unfreeze
6. Evaluate with mAP@0.5 on validation set
Exercises
Implement `calculate_iou` for the `[x_center, y_center, w, h]` format and verify it matches the `[x1, y1, x2, y2]` version.
Run Faster R-CNN (torchvision) on an image from disk — filter by class and confidence, draw bounding boxes.
Implement a simple sliding window detector using a CNN classifier: slide a 32Γ32 window over a larger image.
Compare YOLO vs Faster R-CNN speed on the same image (use `timeit` with 100 runs).
Implement the PASCAL VOC mAP evaluation: given predicted boxes + GT boxes for multiple classes, compute mAP@0.5.