Object Detection: From Bounding Boxes to Modern Detectors¶

Image classification tells you what's in an image. Object detection tells you what AND where. This notebook covers the key concepts — sliding windows, anchor boxes, IoU, NMS — and shows how to use modern pretrained detectors (YOLO, Faster R-CNN) for practical tasks.

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image, ImageDraw
import warnings
warnings.filterwarnings('ignore')

# Optional dependency: the detector sections need torch/torchvision, but the
# IoU/NMS/mAP concept sections run on numpy alone.
try:
    import torch
    import torchvision
    from torchvision.models.detection import fasterrcnn_resnet50_fpn
    from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
    HAS_TORCHVISION = True
    # Prefer GPU when available; all detector inference below runs on `device`.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'torchvision available. Device: {device}')
except ImportError:
    # Fall back gracefully — later cells branch on HAS_TORCHVISION.
    HAS_TORCHVISION = False
    print('torchvision not installed β€” showing core concepts with numpy')
    print('Install: pip install torch torchvision')

# Create synthetic image with multiple objects
def create_scene(size=400):
    """Synthetic scene with rectangles representing objects."""
    img = np.ones((size, size, 3), dtype=np.uint8) * 200  # Gray background
    objects = [
        {'box': [50, 80, 130, 160], 'color': (220, 80, 80),  'label': 'car'},
        {'box': [200, 150, 300, 280], 'color': (80, 150, 220), 'label': 'truck'},
        {'box': [320, 50, 380, 130], 'color': (80, 220, 80),  'label': 'person'},
    ]
    for obj in objects:
        x1, y1, x2, y2 = obj['box']
        img[y1:y2, x1:x2, :] = obj['color']
    return img, objects

# Build the demo scene used throughout the notebook.
scene_img, ground_truth = create_scene()
print('Scene created: %d objects' % len(ground_truth))
for annotation in ground_truth:
    label, box = annotation['label'], annotation['box']
    print(f'  {label}: box={box}')

1. Key Concepts: IoU, NMS, and Anchor Boxes¶

# Intersection over Union (IoU)
# Intersection over Union (IoU)
def calculate_iou(box1: list, box2: list) -> float:
    """Compute Intersection over Union of two axis-aligned boxes.

    Args:
        box1, box2: Boxes in [x1, y1, x2, y2] corner format.

    Returns:
        IoU = intersection area / union area, in [0.0, 1.0].
        Returns 0.0 when the union is degenerate (both boxes have zero area).
    """
    # Corners of the intersection rectangle.
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    # Clamp to 0 so disjoint boxes contribute no (negative) intersection area.
    intersection = max(0, x2 - x1) * max(0, y2 - y1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - intersection

    # Return 0.0 (not int 0) on the degenerate path so the declared float
    # return type holds for every input.
    return intersection / union if union > 0 else 0.0

# Non-Maximum Suppression
# Non-Maximum Suppression
def nms(boxes: list, scores: list, iou_threshold: float = 0.5) -> list:
    """Greedy non-maximum suppression over a set of detections.

    Repeatedly keeps the highest-scoring remaining box and discards every
    other remaining box whose IoU with it reaches ``iou_threshold``.

    Returns:
        Indices (into ``boxes``) of the detections kept, in descending
        score order.
    """
    if not boxes:
        return []

    # Candidate indices, best score first.
    remaining = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    kept = []

    while remaining:
        winner, *rest = remaining
        kept.append(winner)
        # Survivors are those that do not overlap the winner too much.
        remaining = [idx for idx in rest
                     if calculate_iou(boxes[winner], boxes[idx]) < iou_threshold]

    return kept

# Demonstrate IoU
gt_box = ground_truth[0]['box']  # [50, 80, 130, 160]
pred_boxes = [
    [55, 85, 125, 155],   # Good match
    [200, 200, 280, 300], # No overlap
    [30, 60, 160, 180],   # Partial overlap
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, pred in zip(axes, pred_boxes):
    iou = calculate_iou(gt_box, pred)
    ax.imshow(scene_img)
    # Ground truth
    x1, y1, x2, y2 = gt_box
    rect_gt = patches.Rectangle((x1,y1), x2-x1, y2-y1, linewidth=2, edgecolor='green', facecolor='none')
    ax.add_patch(rect_gt)
    # Prediction
    x1, y1, x2, y2 = pred
    rect_pred = patches.Rectangle((x1,y1), x2-x1, y2-y1, linewidth=2, edgecolor='red', facecolor='none', linestyle='--')
    ax.add_patch(rect_pred)
    ax.set_title(f'IoU = {iou:.2f}\n{"βœ… Good (>0.5)" if iou>0.5 else "❌ Poor (<0.5)"}')
    ax.axis('off')
    ax.legend([patches.Patch(color='green', label='GT'), patches.Patch(color='red', label='Pred')], fontsize=8)

plt.suptitle('Intersection over Union (IoU) Examples', fontsize=13)
plt.tight_layout()
plt.show()

# NMS demonstration: three heavily-overlapping boxes around the "car" plus
# one distinct detection — NMS should collapse the overlaps to one winner.
print('Non-Maximum Suppression demo:')
sim_boxes = [
    [50, 80, 130, 160],
    [55, 85, 125, 155],
    [58, 82, 135, 162],
    [200, 150, 300, 280],
]
sim_scores = [0.95, 0.87, 0.82, 0.91]
kept = nms(sim_boxes, sim_scores, iou_threshold=0.5)
n_removed = len(sim_boxes) - len(kept)
print(f'  Before NMS: {len(sim_boxes)} boxes')
print(f'  After NMS:  {len(kept)} boxes kept β†’ indices {kept}')
print(f'  Removed overlapping low-confidence boxes: {n_removed}')

2. Using Pretrained Faster R-CNN¶

# COCO category labels as used by torchvision detection models: the model's
# integer labels index into the original 91-slot COCO category list, so this
# list has 91 entries — index 0 is '__background__' and the 'N/A' entries are
# category ids unused by the detection task (80 real object classes remain).
COCO_CLASSES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

if HAS_TORCHVISION:
    # Load pretrained detector (COCO-trained weights).
    weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
    detector = fasterrcnn_resnet50_fpn(weights=weights)
    # eval() switches the model to inference mode (returns detections, not losses).
    detector.eval().to(device)
    
    total_params = sum(p.numel() for p in detector.parameters())
    print(f'Faster R-CNN: {total_params:,} parameters')
    
    # Inference on synthetic image
    import torchvision.transforms.functional as F_tv
    # to_tensor: HWC uint8 -> CHW float in [0, 1]; unsqueeze adds the batch dim.
    img_tensor = F_tv.to_tensor(scene_img).unsqueeze(0).to(device)
    
    # no_grad: inference only — skip autograd bookkeeping.
    with torch.no_grad():
        predictions = detector(img_tensor)
    
    # One result dict per input image: 'boxes' [N, 4], 'labels' [N], 'scores' [N].
    pred = predictions[0]
    boxes  = pred['boxes'].cpu().numpy()
    labels = pred['labels'].cpu().numpy()
    scores = pred['scores'].cpu().numpy()
    
    # Filter by confidence
    threshold = 0.5
    mask = scores > threshold
    
    print(f'Detections above {threshold:.0%} confidence: {mask.sum()}')
    for box, label, score in zip(boxes[mask], labels[mask], scores[mask]):
        print(f'  {COCO_CLASSES[label]}: {score:.2%} at [{box[0]:.0f},{box[1]:.0f},{box[2]:.0f},{box[3]:.0f}]')
else:
    # torchvision unavailable — print the usage pattern and a model comparison
    # table instead of running inference.
    print('Faster R-CNN usage pattern (torchvision):')
    print()
    print("from torchvision.models.detection import fasterrcnn_resnet50_fpn")
    print("detector = fasterrcnn_resnet50_fpn(weights='DEFAULT')")
    print("detector.eval()")
    print()
    print("# Inference:")
    print("img_tensor = torchvision.transforms.functional.to_tensor(pil_image)")
    print("predictions = detector([img_tensor])")
    print("# Returns: {'boxes': ..., 'labels': ..., 'scores': ...}")
    print()
    print('Object detection models (comparison):')
    print(f'{"Model":<20} {"Speed":<12} {"mAP (COCO)":<12} {"Use case"}')
    print('-' * 60)
    for m, s, mAP, use in [
        ('YOLOv8-nano',    '5ms/img',  '37.3',  'Real-time, edge'),
        ('YOLOv8-large',  '25ms/img', '52.9',  'High accuracy'),
        ('Faster R-CNN',  '80ms/img', '46.7',  'Baseline, two-stage'),
        ('DETR',          '120ms/img','42.0',  'Transformer-based'),
        ('RT-DETR-L',     '32ms/img', '53.0',  'Modern real-time'),
    ]:
        print(f'{m:<20} {s:<12} {mAP:<12} {use}')

3. Evaluation: mAP¶

def compute_ap(recalls: list, precisions: list) -> float:
    """Average Precision via PASCAL VOC 11-point interpolation.

    For each recall threshold t in {0.0, 0.1, ..., 1.0}, take the maximum
    precision among points whose recall is at least t, and average the
    eleven values. Thresholds with no qualifying point contribute zero.
    """
    total = 0
    for threshold in np.linspace(0, 1, 11):
        eligible = [prec for rec, prec in zip(recalls, precisions) if rec >= threshold]
        if eligible:
            total += max(eligible) / 11
    return total

# Simulate detection results for one class
np.random.seed(42)
n_gt = 20  # Ground truth objects

# Draw 30 fake detections — a confidence score plus whether the detection
# matched a ground-truth box — then rank them by confidence, as a real
# evaluator would before sweeping the score threshold.
detections = []
for _ in range(30):
    detections.append({
        'score': np.random.uniform(0.5, 1.0),
        'is_tp': np.random.random() < 0.7,
    })
detections.sort(key=lambda d: d['score'], reverse=True)

cumulative_tp = 0
cumulative_fp = 0
precisions = []
recalls = []

# Walk down the ranked list, accumulating TP/FP counts to trace out the
# precision-recall curve point by point.
for det in detections:
    if det['is_tp']:
        cumulative_tp += 1
    else:
        cumulative_fp += 1
    precisions.append(cumulative_tp / (cumulative_tp + cumulative_fp))
    recalls.append(cumulative_tp / n_gt)

# Score the simulated detector with 11-point interpolated AP.
ap = compute_ap(recalls, precisions)

fig, ax = plt.subplots(figsize=(8, 5))
# Step plot: precision only changes at discrete ranked-detection cutoffs.
ax.step(recalls, precisions, 'b-', linewidth=2)
ax.fill_between(recalls, precisions, alpha=0.2, color='blue')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title(f'Precision-Recall Curve β€” AP = {ap:.3f}')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1.05])
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f'Average Precision (AP): {ap:.3f}')
print()
print('mAP = mean AP across all classes')
print('mAP@0.5 = mAP with IoU threshold 0.5 (easy)')
print('mAP@0.5:0.95 = averaged over IoU thresholds 0.5, 0.55, ..., 0.95 (strict)')
print('COCO standard: mAP@0.5:0.95 β€” YOLOv8-large achieves ~53 mAP')

Object Detection Cheat Sheet¶

Concept          Description
────────────────────────────────────────────────────────────────
Bounding box     [x1, y1, x2, y2] or [x_center, y_center, w, h]
IoU              Intersection / Union. >0.5 = match, >0.75 = good
NMS              Remove duplicate boxes — keep highest score
Anchor boxes     Pre-defined box shapes at each grid cell
mAP              Mean Average Precision — primary evaluation metric

Model Selection:
  Real-time (< 10ms):   YOLOv8-nano, YOLOv8-small
  Balanced:             YOLOv8-medium, RT-DETR
  Highest accuracy:     YOLOv8-x, Co-DETR, DINO
  2-stage (research):   Faster R-CNN, Cascade R-CNN

Usage Pattern (YOLO via ultralytics):
  from ultralytics import YOLO
  model = YOLO('yolov8n.pt')
  results = model('image.jpg')  # Detect
  results = model.train(data='dataset.yaml', epochs=50)  # Fine-tune

Fine-tuning detector checklist:
  1. Annotate data (Roboflow, CVAT, LabelImg)
  2. Export in YOLO format (txt files with normalized coords)
  3. Create dataset.yaml with class names and paths
  4. Start from pretrained weights (always)
  5. Freeze backbone first, then unfreeze
  6. Evaluate with mAP@0.5 on validation set

Exercises¶

  1. Implement calculate_iou for the [x_center, y_center, w, h] format and verify it matches the [x1,y1,x2,y2] version.

  2. Run Faster R-CNN (torchvision) on an image from disk β€” filter by class and confidence, draw bounding boxes.

  3. Implement a simple sliding window detector using a CNN classifier: slide a 32×32 window over a larger image.

  4. Compare YOLO vs Faster R-CNN speed on the same image (use timeit with 100 runs).

  5. Implement the PASCAL VOC mAP evaluation: given predicted boxes + GT boxes for multiple classes, compute mAP@0.5.