Solutions: Computer Vision Track

Worked solutions to all exercises from the computer-vision/ notebooks.

01 — Image Processing Basics

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import warnings; warnings.filterwarnings('ignore')

# Synthetic RGB test image (H=64, W=64, C=3), uint8 values in [0, 255].
# Seeded so every run of the notebook sees the same pixels.
np.random.seed(42)
img_np = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)
img_pil = Image.fromarray(img_np)

# Exercise 1: Center crop and random crop
# Key insight: Center crop is deterministic and used at inference;
# random crop introduces spatial augmentation during training.
def center_crop(img_arr, crop_h, crop_w):
    """Crop the central crop_h x crop_w region of an (H, W, ...) array."""
    height, width = img_arr.shape[:2]
    row0 = (height - crop_h) // 2
    col0 = (width - crop_w) // 2
    return img_arr[row0:row0 + crop_h, col0:col0 + crop_w]

def random_crop(img_arr, crop_h, crop_w):
    """Crop a uniformly random crop_h x crop_w window (train-time augmentation)."""
    height, width = img_arr.shape[:2]
    # Row offset is drawn first, then column — both inclusive of the last valid start.
    row0 = np.random.randint(0, height - crop_h + 1)
    col0 = np.random.randint(0, width - crop_w + 1)
    return img_arr[row0:row0 + crop_h, col0:col0 + crop_w]

# Sanity checks: both crops keep all three channels and the requested spatial size.
cc = center_crop(img_np, 32, 32)
rc = random_crop(img_np, 32, 32)
print(f'Center crop shape: {cc.shape}')  # (32, 32, 3)
print(f'Random crop shape: {rc.shape}')  # (32, 32, 3)
assert cc.shape == (32, 32, 3)
assert rc.shape == (32, 32, 3)
# Exercise 2: HSV -> RGB conversion (inverse of RGB -> HSV)
# Key insight: HSV is easier for human-intuitive color manipulation;
# the inverse uses sector-based piecewise reconstruction of RGB from (H, S, V).
def rgb_to_hsv(rgb):
    """Convert an (..., 3) RGB array (0-255) to HSV with H in degrees [0, 360)."""
    red, green, blue = (rgb[..., c] / 255.0 for c in range(3))
    c_max = np.maximum(np.maximum(red, green), blue)
    c_min = np.minimum(np.minimum(red, green), blue)
    # Tiny epsilon keeps the divisions defined for gray pixels (delta == 0).
    delta = c_max - c_min + 1e-10
    # Hue sector depends on which channel is the maximum (red wins ties first).
    hue = np.select(
        [c_max == red, c_max == green],
        [(green - blue) / delta % 6, (blue - red) / delta + 2],
        default=(red - green) / delta + 4) * 60
    sat = np.where(c_max == 0, 0, delta / c_max)
    return np.stack([hue, sat, c_max], axis=-1)

def hsv_to_rgb(hsv):
    """Invert rgb_to_hsv: map an (..., 3) HSV array (H in degrees) to uint8 RGB."""
    hue, sat, val = hsv[..., 0], hsv[..., 1], hsv[..., 2]
    chroma = val * sat
    # Secondary component ramps up/down within each 60-degree sector.
    second = chroma * (1 - np.abs((hue / 60) % 2 - 1))
    base = val - chroma  # added back to every channel to restore brightness
    sector = (hue // 60).astype(int) % 6
    zero = 0 * chroma
    in_sector = [sector == k for k in range(6)]
    # Per-sector assignment of (chroma, second, 0) to the three channels.
    r1 = np.select(in_sector, [chroma, second, zero, zero, second, chroma])
    g1 = np.select(in_sector, [second, chroma, chroma, second, zero, zero])
    b1 = np.select(in_sector, [zero, zero, second, chroma, chroma, second])
    channels = [(r1 + base) * 255, (g1 + base) * 255, (b1 + base) * 255]
    return np.stack(channels, axis=-1).clip(0, 255).astype(np.uint8)

# Round-trip verification on an 8x8 patch: RGB -> HSV -> RGB should
# reproduce the input up to uint8 rounding error.
small = img_np[:8, :8]
reconstructed = hsv_to_rgb(rgb_to_hsv(small.astype(float)))
diff = np.abs(small.astype(int) - reconstructed.astype(int)).max()
print(f'Max round-trip pixel error: {diff} (should be <= 2 due to float rounding)')
# Exercise 3: Sobel X+Y combined edge detector -> binary edge map
# Key insight: Sobel combines horizontal and vertical gradients;
# thresholding the magnitude creates a clean binary edge map.
from scipy.ndimage import convolve

# Horizontal-gradient (Kx) and vertical-gradient (Ky) Sobel kernels.
Kx = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=float)
Ky = np.array([[-1,-2,-1], [ 0, 0, 0], [ 1, 2, 1]], dtype=float)

gray = img_np.mean(axis=2)  # grayscale as the plain channel average
Gx   = convolve(gray, Kx)
Gy   = convolve(gray, Ky)
mag  = np.sqrt(Gx**2 + Gy**2)
# Keep only the strongest 20% of gradient responses as edges.
edge_binary = (mag > np.percentile(mag, 80)).astype(np.uint8)

fig, axes = plt.subplots(1, 3, figsize=(10, 3))
axes[0].imshow(gray, cmap='gray'); axes[0].set_title('Grayscale')
axes[1].imshow(mag, cmap='hot');   axes[1].set_title('Sobel Magnitude')
axes[2].imshow(edge_binary, cmap='gray'); axes[2].set_title('Binary Edges')
plt.tight_layout(); plt.show()
# Exercise 4: 3x3 median filter for salt-and-pepper noise
# Key insight: Median filter replaces each pixel with the neighborhood median,
# removing impulse noise while preserving edges better than Gaussian blur.
from scipy.ndimage import median_filter

# Add salt-and-pepper noise: ~5% of pixels forced to white, ~5% to black.
# The two index sets are drawn independently, so a small overlap is possible
# (pepper is applied last, so overlapping pixels end up black).
noisy = gray.copy()
salt_idx   = np.random.choice(noisy.size, noisy.size // 20, replace=False)
pepper_idx = np.random.choice(noisy.size, noisy.size // 20, replace=False)
noisy.flat[salt_idx]   = 255
noisy.flat[pepper_idx] = 0

# Manual 3x3 median filter
def median_filter_3x3(img):
    """Median-filter the interior of a 2-D image with a 3x3 window.

    Border pixels are left unchanged (no padding is applied); the result
    is cast to uint8.
    """
    rows, cols = img.shape
    out = img.copy().astype(float)
    for r in range(1, rows - 1):
        for c in range(1, cols - 1):
            window = img[r - 1:r + 2, c - 1:c + 2].ravel()
            # Median of 9 values is the 5th smallest.
            out[r, c] = np.sort(window)[4]
    return out.astype(np.uint8)

# Compare the hand-rolled filter against scipy's reference implementation.
cleaned_manual = median_filter_3x3(noisy)
cleaned_scipy  = median_filter(noisy, size=3)

fig, axes = plt.subplots(1, 3, figsize=(10, 3))
axes[0].imshow(noisy,           cmap='gray'); axes[0].set_title('Noisy')
axes[1].imshow(cleaned_manual,  cmap='gray'); axes[1].set_title('Manual 3x3 Median')
axes[2].imshow(cleaned_scipy,   cmap='gray'); axes[2].set_title('Scipy Median')
plt.tight_layout(); plt.show()
# Some mismatch is expected at the borders: scipy filters them (using its
# default padding) while the manual version passes them through unfiltered.
print(f'Manual vs scipy max diff: {np.abs(cleaned_manual.astype(int) - cleaned_scipy.astype(int)).max()}')
# Exercise 5: Gaussian blur at sigma=1,2,3 and frequency domain comparison
# Key insight: Gaussian blur is a low-pass filter; larger sigma suppresses higher
# frequencies more aggressively, visible as dimmer outer rings in the FFT magnitude.
from scipy.ndimage import gaussian_filter

sigmas = [1, 2, 3]
blurred = [gaussian_filter(gray, sigma=s) for s in sigmas]
# fftshift centers the DC component so low frequencies sit in the middle of the plot.
ffts    = [np.fft.fftshift(np.abs(np.fft.fft2(b))) for b in blurred]

fig, axes = plt.subplots(2, 3, figsize=(12, 6))
for i, (b, f, s) in enumerate(zip(blurred, ffts, sigmas)):
    axes[0, i].imshow(b, cmap='gray'); axes[0, i].set_title(f'Blur sigma={s}')
    # log1p compresses the large dynamic range of FFT magnitudes for display.
    axes[1, i].imshow(np.log1p(f), cmap='inferno'); axes[1, i].set_title(f'FFT log-magnitude sigma={s}')
plt.tight_layout(); plt.show()

02 — CNN from Scratch

# Optional PyTorch dependency: every CNN exercise below is gated on
# TORCH_AVAILABLE so the notebook degrades gracefully instead of crashing.
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset
    TORCH_AVAILABLE = True
except ImportError:
    print('PyTorch not installed.'); TORCH_AVAILABLE = False

if TORCH_AVAILABLE:
    # Synthetic dataset: 3-class image classification, 32x32 RGB
    torch.manual_seed(42)
    N_img = 600
    X_img = torch.randn(N_img, 3, 32, 32)
    y_img = torch.randint(0, 3, (N_img,))
    # Make classes separable: boost one color channel per class so a small
    # CNN can learn the mapping from dominant channel -> label.
    X_img[y_img == 0, 0] += 1.0
    X_img[y_img == 1, 1] += 1.0
    X_img[y_img == 2, 2] += 1.0

    # 80/20 train/validation split (600 -> 480 train + 120 val).
    split = 480
    tr_ds = TensorDataset(X_img[:split], y_img[:split])
    va_ds = TensorDataset(X_img[split:], y_img[split:])
    tr_dl = DataLoader(tr_ds, batch_size=32, shuffle=True)
    va_dl = DataLoader(va_ds, batch_size=32)
if TORCH_AVAILABLE:
    def train_cnn(model, tr_dl, va_dl, epochs=15, lr=1e-3):
        """Train `model` with Adam + cross-entropy.

        Returns (tr_losses, va_losses): per-epoch mean batch losses.
        Validation runs with the model in eval mode under torch.no_grad().
        """
        opt  = optim.Adam(model.parameters(), lr=lr)
        crit = nn.CrossEntropyLoss()
        tr_losses, va_losses = [], []
        for ep in range(epochs):
            model.train(); running_loss = 0
            for xb, yb in tr_dl:
                opt.zero_grad(); loss = crit(model(xb), yb); loss.backward(); opt.step()
                running_loss += loss.item()
            tr_losses.append(running_loss / len(tr_dl))
            model.eval(); va_loss = 0
            with torch.no_grad():
                for xb, yb in va_dl:
                    va_loss += crit(model(xb), yb).item()
            va_losses.append(va_loss / len(va_dl))
        return tr_losses, va_losses

    # Exercise 1: BatchNorm before vs after ReLU
    # Key insight: BN before ReLU normalizes pre-activations; BN after ReLU
    # normalizes positive values only. BN-before is more common and usually converges faster.
    class CNNBNBefore(nn.Module):
        # Conv -> BN -> ReLU ordering (the conventional placement).
        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv2d(3, 16, 3, padding=1), nn.BatchNorm2d(16), nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Conv2d(16, 32, 3, padding=1), nn.BatchNorm2d(32), nn.ReLU(),
                nn.AdaptiveAvgPool2d(4), nn.Flatten(), nn.Linear(32*4*4, 3))
        def forward(self, x): return self.net(x)

    class CNNBNAfter(nn.Module):
        # Conv -> ReLU -> BN ordering (normalizes post-activation values).
        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(), nn.BatchNorm2d(16),
                nn.MaxPool2d(2),
                nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.BatchNorm2d(32),
                nn.AdaptiveAvgPool2d(4), nn.Flatten(), nn.Linear(32*4*4, 3))
        def forward(self, x): return self.net(x)

    tr_bn_before, va_bn_before = train_cnn(CNNBNBefore(), tr_dl, va_dl, epochs=15)
    tr_bn_after,  va_bn_after  = train_cnn(CNNBNAfter(),  tr_dl, va_dl, epochs=15)

    plt.figure(figsize=(10, 4))
    plt.plot(va_bn_before, label='BN before ReLU (val)')
    plt.plot(va_bn_after,  label='BN after ReLU (val)')
    plt.title('BatchNorm Placement: Val Loss'); plt.xlabel('Epoch'); plt.legend(); plt.show()
if TORCH_AVAILABLE:
    # Exercise 2: Replace MaxPool2d with strided Conv2d
    # Key insight: Strided conv learns the downsampling kernel (more parameters)
    # whereas MaxPool2d is a fixed operation with no parameters.
    class CNNStrided(nn.Module):
        # Downsampling via stride-2 convolutions (learned).
        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv2d(3, 16, 3, padding=1),  nn.ReLU(),
                nn.Conv2d(16, 16, 3, stride=2, padding=1),  # replaces MaxPool
                nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(),
                nn.Conv2d(32, 32, 3, stride=2, padding=1),  # replaces MaxPool
                nn.AdaptiveAvgPool2d(4), nn.Flatten(), nn.Linear(32*4*4, 3))
        def forward(self, x): return self.net(x)

    class CNNMaxPool(nn.Module):
        # Downsampling via fixed 2x2 max pooling (no extra parameters).
        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
                nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
                nn.AdaptiveAvgPool2d(4), nn.Flatten(), nn.Linear(32*4*4, 3))
        def forward(self, x): return self.net(x)

    m_strided = CNNStrided()
    m_maxpool = CNNMaxPool()
    # Parameter-count comparison: the strided variant carries extra conv weights.
    params_strided = sum(p.numel() for p in m_strided.parameters())
    params_maxpool = sum(p.numel() for p in m_maxpool.parameters())
    print(f'Strided Conv params: {params_strided}')
    print(f'MaxPool Conv params: {params_maxpool}')

    _, va_strided = train_cnn(m_strided, tr_dl, va_dl, epochs=15)
    _, va_maxpool = train_cnn(m_maxpool, tr_dl, va_dl, epochs=15)
    print(f'Final val loss β€” Strided: {va_strided[-1]:.4f}, MaxPool: {va_maxpool[-1]:.4f}')
if TORCH_AVAILABLE:
    # Exercise 3: Plot training vs validation loss curves — identify overfitting epoch
    # Key insight: Overfitting starts where validation loss stops decreasing
    # while training loss keeps falling; use early stopping at that epoch.
    class OverfitCNN(nn.Module):
        # Deliberately higher-capacity model so the train/val gap becomes visible.
        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
                nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
                nn.Conv2d(64, 64, 3, padding=1), nn.ReLU(),
                nn.AdaptiveAvgPool2d(2), nn.Flatten(),
                nn.Linear(64*4, 128), nn.ReLU(), nn.Linear(128, 3))
        def forward(self, x): return self.net(x)

    tr_losses, va_losses = train_cnn(OverfitCNN(), tr_dl, va_dl, epochs=40)
    # The epoch with minimum validation loss is the early-stopping point.
    overfit_epoch = np.argmin(va_losses)
    print(f'Best validation epoch: {overfit_epoch + 1} (val loss={va_losses[overfit_epoch]:.4f})')

    plt.figure(figsize=(10, 4))
    plt.plot(tr_losses, label='Train loss')
    plt.plot(va_losses, label='Val loss')
    plt.axvline(overfit_epoch, color='red', linestyle='--', label=f'Best epoch={overfit_epoch+1}')
    plt.title('Train vs Val Loss β€” Overfitting Curve'); plt.xlabel('Epoch'); plt.legend(); plt.show()
if TORCH_AVAILABLE:
    # Exercise 4: Visualize first-layer filters before and after training
    # Key insight: Untrained filters are random noise; after training they show
    # oriented edges and color blobs — the lowest-level visual primitives.
    model_vis = CNNMaxPool()
    w_before  = model_vis.net[0].weight.data.clone()  # Conv2d is index 0

    train_cnn(model_vis, tr_dl, va_dl, epochs=20)
    w_after = model_vis.net[0].weight.data.clone()

    def show_filters(weights, title, n=8):
        """Plot the first n conv filters as RGB tiles, each min-max normalized."""
        fig, axes = plt.subplots(1, n, figsize=(2*n, 2))
        for i, ax in enumerate(axes):
            f = weights[i].permute(1, 2, 0).numpy()
            # Normalize each filter independently to [0, 1] for display.
            f = (f - f.min()) / (f.max() - f.min() + 1e-8)
            ax.imshow(f); ax.axis('off')
        fig.suptitle(title); plt.tight_layout(); plt.show()

    show_filters(w_before, 'First-layer filters BEFORE training')
    show_filters(w_after,  'First-layer filters AFTER training')
    print('After training, filters develop structured patterns: edges, color gradients.')
if TORCH_AVAILABLE:
    # Exercise 5: Gradient visualization — dLoss/dInput (saliency map)
    # Key insight: The gradient of the loss w.r.t. input pixels shows which
    # spatial regions most influence the prediction — a simple saliency map.
    model_grad = CNNMaxPool()
    train_cnn(model_grad, tr_dl, va_dl, epochs=15)
    model_grad.eval()

    # Track gradients w.r.t. the input image itself, not the weights.
    sample_img = X_img[0:1].clone().requires_grad_(True)
    logits = model_grad(sample_img)
    pred_class = logits.argmax(dim=1).item()
    # Backprop from the predicted class's logit to get dLogit/dPixel.
    logits[0, pred_class].backward()

    saliency = sample_img.grad.data.abs().squeeze()
    saliency_map = saliency.max(dim=0)[0].numpy()  # max across channels

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))
    ax1.imshow(sample_img[0].detach().permute(1,2,0).numpy().clip(0,1))
    ax1.set_title(f'Input (predicted class {pred_class})')
    ax2.imshow(saliency_map, cmap='hot'); ax2.set_title('Gradient Saliency Map')
    plt.tight_layout(); plt.show()

03 — Transfer Learning

if TORCH_AVAILABLE:
    import torchvision.models as models
    import torchvision.transforms as T

    # Exercise 1: ResNet18 vs EfficientNet-B0 feature extraction
    # Key insight: EfficientNet scales depth/width/resolution jointly; its features
    # are often more discriminative per-parameter than ResNet for the same compute.
    # Standard ImageNet preprocessing: resize + per-channel normalization.
    transform = T.Compose([T.Resize((224, 224)), T.ToTensor(),
                            T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])])

    # Synthetic PIL images
    pil_images = [Image.fromarray(np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8))
                  for _ in range(10)]
    imgs_tensor = torch.stack([transform(im) for im in pil_images])

    # Replace the classification head with Identity so the forward pass
    # returns pooled backbone features instead of class logits.
    resnet = models.resnet18(weights=None)
    resnet.fc = nn.Identity()
    resnet.eval()

    try:
        effnet = models.efficientnet_b0(weights=None)
        effnet.classifier = nn.Identity()
        effnet.eval()
        with torch.no_grad():
            feat_eff = effnet(imgs_tensor)
        print(f'EfficientNet-B0 feature dim: {feat_eff.shape[1]}')
    except AttributeError:
        # Older torchvision releases do not ship efficientnet_b0.
        print('EfficientNet not available in this torchvision version')

    with torch.no_grad():
        feat_res = resnet(imgs_tensor)
    print(f'ResNet18 feature dim: {feat_res.shape[1]}')
    print('ResNet18 params:', sum(p.numel() for p in resnet.parameters()))
if TORCH_AVAILABLE:
    # Exercise 2: LR warmup — linearly increase from 0 to target LR over 3 epochs
    # Key insight: Warmup prevents large early updates that destabilize pretrained
    # weights; the scheduler linearly ramps to avoid gradient shock.
    from torch.optim.lr_scheduler import LambdaLR

    model_ft = CNNMaxPool()
    opt_ft   = optim.Adam(model_ft.parameters(), lr=1e-3)
    warmup_epochs = 3
    total_epochs  = 20

    def warmup_lambda(epoch):
        """Multiplicative LR factor: ramps 1/3, 2/3, 1.0, then stays at 1.0."""
        if epoch < warmup_epochs:
            return (epoch + 1) / warmup_epochs
        return 1.0

    scheduler = LambdaLR(opt_ft, lr_lambda=warmup_lambda)
    crit = nn.CrossEntropyLoss()
    lrs  = []
    for ep in range(total_epochs):
        model_ft.train()
        for xb, yb in tr_dl:
            opt_ft.zero_grad(); crit(model_ft(xb), yb).backward(); opt_ft.step()
        # Record the LR that was actually used this epoch, then advance the schedule.
        lrs.append(opt_ft.param_groups[0]['lr'])
        scheduler.step()

    plt.figure(figsize=(8, 3))
    plt.plot(lrs, marker='o')
    plt.title('LR Warmup Schedule'); plt.xlabel('Epoch'); plt.ylabel('LR'); plt.show()
if TORCH_AVAILABLE:
    # Exercise 3: Gradient clipping during fine-tuning — track gradient norms
    # Key insight: Large gradient norms indicate unstable optimization; clipping
    # caps them at max_norm=1.0, preventing catastrophic forgetting of pretrained weights.
    model_gc = CNNMaxPool()
    opt_gc   = optim.Adam(model_gc.parameters(), lr=1e-3)
    crit     = nn.CrossEntropyLoss()
    grad_norms_clip = []  # pre-clip total gradient norm, one entry per batch
    # (removed unused `grad_norms_no` — it was declared but never populated or read)

    for ep in range(15):
        model_gc.train()
        for xb, yb in tr_dl:
            opt_gc.zero_grad()
            crit(model_gc(xb), yb).backward()
            # Total L2 norm across all parameter gradients, recorded BEFORE
            # clipping so the plot shows how often the threshold is actually hit.
            total_norm = sum(p.grad.data.norm(2).item()**2
                             for p in model_gc.parameters() if p.grad is not None)**0.5
            grad_norms_clip.append(total_norm)
            nn.utils.clip_grad_norm_(model_gc.parameters(), max_norm=1.0)
            opt_gc.step()

    plt.figure(figsize=(10, 3))
    plt.plot(grad_norms_clip, alpha=0.7, label='Gradient norm (clipped at 1.0)')
    plt.axhline(1.0, color='red', linestyle='--', label='Clip threshold')
    plt.title('Gradient Norms During Fine-Tuning'); plt.xlabel('Batch'); plt.legend(); plt.show()
if TORCH_AVAILABLE:
    # Exercise 4: Test-time augmentation (TTA) — average over 5 augmented versions
    # Key insight: TTA reduces prediction variance by averaging over augmentations;
    # equivalent to a cheap ensemble that costs only extra forward passes.
    model_tta = CNNMaxPool()
    train_cnn(model_tta, tr_dl, va_dl, epochs=15)
    model_tta.eval()

    # Five views of each batch: identity plus four perturbations
    # (the noise transform is stochastic; the rest are deterministic).
    tta_transforms = [
        lambda x: x,
        lambda x: torch.flip(x, dims=[3]),          # horizontal flip
        lambda x: torch.flip(x, dims=[2]),          # vertical flip
        lambda x: x + 0.05 * torch.randn_like(x),  # Gaussian noise
        lambda x: torch.roll(x, shifts=2, dims=3), # shift
    ]

    xb_test, yb_test = next(iter(va_dl))
    with torch.no_grad():
        single_pred = model_tta(xb_test).softmax(dim=1)
        # Average the softmax outputs over all augmented views.
        tta_pred    = torch.stack([model_tta(aug(xb_test)).softmax(dim=1)
                                   for aug in tta_transforms]).mean(dim=0)

    single_acc = (single_pred.argmax(1) == yb_test).float().mean()
    tta_acc    = (tta_pred.argmax(1)    == yb_test).float().mean()
    print(f'Single-pass accuracy: {single_acc:.3f}')
    print(f'TTA accuracy (5 aug):  {tta_acc:.3f}')
if TORCH_AVAILABLE:
    # Exercise 5: t-SNE of features before and after fine-tuning
    # Key insight: Pre-trained features cluster by visual similarity but not task label;
    # after fine-tuning, class-discriminative clusters should tighten.
    from sklearn.manifold import TSNE

    def extract_features(model_backbone, data_tensor):
        """Return flattened penultimate-layer features for data_tensor, batched by 32."""
        feats = []
        model_backbone.eval()
        with torch.no_grad():
            for i in range(0, len(data_tensor), 32):
                batch = data_tensor[i:i+32]
                # Use the model without final layer
                x = model_backbone.net[:-1](batch) if hasattr(model_backbone, 'net') else batch
                feats.append(x.view(len(batch), -1))
        return torch.cat(feats).numpy()

    # Before fine-tuning (random init)
    model_before = CNNMaxPool()
    feats_before = extract_features(model_before, X_img[:120])

    # After fine-tuning
    model_after = CNNMaxPool()
    train_cnn(model_after, tr_dl, va_dl, epochs=20)
    feats_after = extract_features(model_after, X_img[:120])

    labels_120 = y_img[:120].numpy()
    tsne = TSNE(n_components=2, random_state=42, perplexity=20)

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for ax, feats, title in zip(axes, [feats_before, feats_after],
                                 ['t-SNE Before Fine-tuning', 't-SNE After Fine-tuning']):
        # fit_transform is called once per feature set; reusing the TSNE object refits it.
        emb = tsne.fit_transform(feats)
        for cls in range(3):
            mask = labels_120 == cls
            ax.scatter(emb[mask, 0], emb[mask, 1], label=f'Class {cls}', alpha=0.7)
        ax.set_title(title); ax.legend()
    plt.tight_layout(); plt.show()

04 — Object Detection

# Exercise 1: IoU for [x_center, y_center, w, h] format
# Key insight: YOLO uses center format internally; always convert to corner format
# before computing IoU to avoid sign errors.
def cxcywh_to_xyxy(box):
    """Convert a [cx, cy, w, h] box to corner format [x1, y1, x2, y2]."""
    cx, cy, w, h = box
    half_w, half_h = w / 2, h / 2
    return [cx - half_w, cy - half_h, cx + half_w, cy + half_h]

def iou_xyxy(b1, b2):
    """Intersection-over-union of two corner-format boxes [x1, y1, x2, y2]."""
    ix1, iy1 = max(b1[0], b2[0]), max(b1[1], b2[1])
    ix2, iy2 = min(b1[2], b2[2]), min(b1[3], b2[3])
    # Negative extents mean the boxes do not overlap, hence the clamp to zero.
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    # Epsilon guards against division by zero for degenerate boxes.
    return inter / (area1 + area2 - inter + 1e-8)

def iou_cxcywh(b1, b2):
    # IoU for center-format boxes: convert both to corners, then reuse iou_xyxy.
    return iou_xyxy(cxcywh_to_xyxy(b1), cxcywh_to_xyxy(b2))

# Verify: both formats give same IoU
# The cxcywh pairs below describe the SAME two rectangles as the xyxy pairs,
# so the two IoU values must agree to floating-point precision.
box1_xyxy  = [0.1, 0.1, 0.5, 0.5]
box2_xyxy  = [0.3, 0.3, 0.7, 0.7]
box1_cxcywh = [0.3, 0.3, 0.4, 0.4]
box2_cxcywh = [0.5, 0.5, 0.4, 0.4]

iou_from_xyxy  = iou_xyxy(box1_xyxy, box2_xyxy)
iou_from_cx    = iou_cxcywh(box1_cxcywh, box2_cxcywh)
print(f'IoU (xyxy format):    {iou_from_xyxy:.4f}')
print(f'IoU (cxcywh format):  {iou_from_cx:.4f}')
print(f'Match (same boxes):   {abs(iou_from_xyxy - iou_from_cx) < 1e-6}')
if TORCH_AVAILABLE:
    # Exercise 2: Run Faster R-CNN on a synthetic image, filter by class and confidence
    # Key insight: Faster R-CNN outputs boxes per class with confidence scores;
    # NMS + confidence threshold are applied post-model to get clean detections.
    import torchvision

    # Synthetic image: white background with a colored rectangle
    syn_img = np.ones((300, 400, 3), dtype=np.uint8) * 200
    syn_img[80:180, 100:250] = [220, 50, 50]
    img_tensor = T.ToTensor()(Image.fromarray(syn_img))

    try:
        # weights=None -> randomly initialized: detections are meaningless,
        # but the output format (boxes/labels/scores dict) is exercised fully.
        frcnn = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=None)
        frcnn.eval()
        with torch.no_grad():
            preds = frcnn([img_tensor])[0]

        conf_thresh = 0.3
        mask = preds['scores'] > conf_thresh
        filtered_boxes  = preds['boxes'][mask]
        filtered_labels = preds['labels'][mask]
        filtered_scores = preds['scores'][mask]
        print(f'Detections above {conf_thresh}: {len(filtered_boxes)}')
    except Exception as e:
        # Fall back to a hand-written detection so the plotting code below still runs.
        print(f'Faster R-CNN: {e}')
        print('Using synthetic detection output for demo.')
        filtered_boxes  = torch.tensor([[100., 80., 250., 180.]])
        filtered_scores = torch.tensor([0.85])
        filtered_labels = torch.tensor([1])

    plt.figure(figsize=(6, 4))
    plt.imshow(syn_img)
    for box, score, label in zip(filtered_boxes, filtered_scores, filtered_labels):
        x1,y1,x2,y2 = box.tolist()
        rect = plt.Rectangle((x1,y1), x2-x1, y2-y1, fill=False, edgecolor='lime', linewidth=2)
        plt.gca().add_patch(rect)
        plt.text(x1, y1-5, f'cls={label.item()} {score.item():.2f}', color='lime', fontsize=8)
    plt.title('Faster R-CNN detections'); plt.axis('off'); plt.show()
if TORCH_AVAILABLE:
    # Exercise 3: Sliding window detector over 128x128 with 32x32 windows
    # Key insight: Sliding window is computationally expensive (O(H*W/stride^2))
    # but conceptually simple; modern detectors use shared feature maps instead.
    class TinyCNN(nn.Module):
        # Minimal binary classifier applied independently to each window.
        def __init__(self, num_classes=2):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d(4),
                nn.Flatten(), nn.Linear(8*4*4, num_classes))
        def forward(self, x): return self.net(x)

    tiny_clf = TinyCNN(num_classes=2)
    tiny_clf.eval()

    large_img = torch.rand(1, 3, 128, 128)
    win_size, stride = 32, 16
    detections = []

    # Score every 32x32 window at stride 16: 7 x 7 = 49 windows total.
    with torch.no_grad():
        for y in range(0, 128 - win_size + 1, stride):
            for x in range(0, 128 - win_size + 1, stride):
                patch = large_img[:, :, y:y+win_size, x:x+win_size]
                # Softmax probability of the "object" class (index 1).
                score = tiny_clf(patch).softmax(dim=1)[0, 1].item()
                detections.append({'x': x, 'y': y, 'score': score})

    top_det = sorted(detections, key=lambda d: d['score'], reverse=True)[:3]
    print(f'Total windows evaluated: {len(detections)}')
    print('Top-3 detections:', top_det)
if TORCH_AVAILABLE:
    # Exercise 4: Time YOLO vs Faster R-CNN on 100 images
    # Key insight: YOLO is a single-stage detector (one forward pass per image);
    # Faster R-CNN is two-stage (region proposals + classification), so slower but more accurate.
    import time

    n_imgs = 20  # reduced for demo speed
    test_imgs = [torch.rand(3, 320, 320) for _ in range(n_imgs)]

    # Time Faster R-CNN
    try:
        frcnn_time = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=None)
        frcnn_time.eval()
        start = time.perf_counter()
        with torch.no_grad():
            for img in test_imgs:
                _ = frcnn_time([img])
        frcnn_elapsed = time.perf_counter() - start
        frcnn_fps = n_imgs / frcnn_elapsed
        print(f'Faster R-CNN: {frcnn_fps:.1f} FPS ({frcnn_elapsed:.2f}s for {n_imgs} images)')
    except Exception as e:
        print(f'Faster R-CNN timing skipped: {e}')

    # Simulate YOLO with a tiny single-stage model (placeholder)
    # NOTE: this is a stand-in, not a real YOLO; it only illustrates the
    # cost profile of one forward pass per image.
    yolo_sim = nn.Sequential(nn.Conv2d(3, 32, 3, stride=2, padding=1), nn.ReLU(),
                              nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(32, 10))
    yolo_sim.eval()
    start = time.perf_counter()
    with torch.no_grad():
        for img in test_imgs:
            _ = yolo_sim(img.unsqueeze(0))
    yolo_elapsed = time.perf_counter() - start
    print(f'YOLO-like (tiny model): {n_imgs/yolo_elapsed:.1f} FPS β€” single-stage is faster')
# Exercise 5: PASCAL VOC mAP@0.5 computation
# Key insight: mAP averages per-class AP scores; AP is the area under the
# precision-recall curve, computed by accumulating TP/FP sorted by confidence.

def compute_ap(recalls, precisions):
    """Compute AP using 11-point interpolation (VOC 2007 style)."""
    ap = 0.0
    for threshold in np.linspace(0, 1, 11):
        # Best precision attained at any recall level >= this threshold.
        eligible = [p for p, r in zip(precisions, recalls) if r >= threshold]
        ap += (max(eligible) if eligible else 0) / 11
    return ap

def voc_ap_for_class(pred_boxes, pred_scores, gt_boxes, iou_thresh=0.5):
    """AP at a fixed IoU threshold for a single class (VOC protocol).

    pred_boxes: list of [x1, y1, x2, y2]; pred_scores: list of floats;
    gt_boxes: list of [x1, y1, x2, y2].  Detections are visited in
    descending-score order and matched greedily: a detection counts as a TP
    when its best-overlapping GT box clears iou_thresh and is still unclaimed.
    """
    order = np.argsort(-np.array(pred_scores))
    claimed = set()   # indices of GT boxes already matched to a detection
    tp, fp = [], []   # per-detection 0/1 flags, in score-sorted order
    for k in order:
        det = pred_boxes[k]
        best_iou, best_j = 0, -1
        for j, gb in enumerate(gt_boxes):
            overlap = iou_xyxy(det, gb)
            if overlap > best_iou:
                best_iou, best_j = overlap, j
        if best_iou >= iou_thresh and best_j not in claimed:
            claimed.add(best_j)
            tp.append(1)
            fp.append(0)
        else:
            tp.append(0)
            fp.append(1)
    tp_cum, fp_cum = np.cumsum(tp), np.cumsum(fp)
    # Small epsilons guard the no-GT / no-detection edge cases.
    recalls    = (tp_cum / (len(gt_boxes) + 1e-8)).tolist()
    precisions = (tp_cum / (tp_cum + fp_cum + 1e-8)).tolist()
    return compute_ap(recalls, precisions)

# Synthetic 3-class detection results
# Per class: 5 GT boxes, 5 jittered copies as high-confidence detections (likely TPs),
# and 3 low-confidence random boxes as FPs; then compute AP per class and mAP.
np.random.seed(7)
class_aps = []
for cls in range(3):
    # Corners sampled so boxes usually satisfy x1<x2, y1<y2 (ranges overlap only in [0.4, 0.6]).
    gt   = [[np.random.uniform(0, 0.6), np.random.uniform(0, 0.6),
              np.random.uniform(0.4, 1), np.random.uniform(0.4, 1)] for _ in range(5)]
    # Predictions = GT boxes with +-0.1 uniform jitter on every coordinate.
    pred = [[b[0]+np.random.uniform(-0.1,0.1), b[1]+np.random.uniform(-0.1,0.1),
             b[2]+np.random.uniform(-0.1,0.1), b[3]+np.random.uniform(-0.1,0.1)] for b in gt]
    # NOTE(review): [u]*2 repeats a single sample, so each FP box has x1==y1 and
    # x2==y2 (corners on the diagonal) -- presumably acceptable for synthetic FPs; confirm intent.
    pred += [[np.random.uniform(0,0.5)]*2 + [np.random.uniform(0.5,1)]*2 for _ in range(3)]  # FP
    # High scores for the jittered TP candidates, low scores for the random FPs.
    scores = [np.random.uniform(0.5, 1) for _ in gt] + [np.random.uniform(0, 0.4) for _ in range(3)]
    ap = voc_ap_for_class(pred, scores, gt)
    class_aps.append(ap)
    print(f'Class {cls} AP@0.5: {ap:.3f}')

# mAP = unweighted mean of the per-class APs.
print(f'mAP@0.5: {np.mean(class_aps):.3f}')

05 — Image Segmentation¶

if TORCH_AVAILABLE:
    # Minimal U-Net for 3-class segmentation (64x64 synthetic scenes)
    class UNet(nn.Module):
        """Two-level U-Net: conv encoder/decoder with skip connections.

        Spatial size is halved twice by max-pooling and restored by
        transposed convolutions; each encoder stage's features are
        concatenated onto the matching decoder stage before decoding.
        """
        def __init__(self, in_ch=3, out_ch=3):
            super().__init__()

            def double_conv(ic, oc):
                # Two 3x3 conv+ReLU pairs; padding=1 preserves spatial size.
                return nn.Sequential(
                    nn.Conv2d(ic, oc, 3, padding=1), nn.ReLU(),
                    nn.Conv2d(oc, oc, 3, padding=1), nn.ReLU())

            # Module creation order is deliberately unchanged: it fixes the
            # order in which parameter initialization consumes the RNG.
            self.enc1 = double_conv(in_ch, 16)
            self.pool1 = nn.MaxPool2d(2)
            self.enc2 = double_conv(16, 32)
            self.pool2 = nn.MaxPool2d(2)
            self.bottleneck = double_conv(32, 64)
            self.up2 = nn.ConvTranspose2d(64, 32, 2, stride=2)
            self.dec2 = double_conv(64, 32)
            self.up1 = nn.ConvTranspose2d(32, 16, 2, stride=2)
            self.dec1 = double_conv(32, 16)
            self.out = nn.Conv2d(16, out_ch, 1)

        def forward(self, x):
            skip1 = self.enc1(x)
            skip2 = self.enc2(self.pool1(skip1))
            bottom = self.bottleneck(self.pool2(skip2))
            up = self.dec2(torch.cat([self.up2(bottom), skip2], dim=1))
            up = self.dec1(torch.cat([self.up1(up), skip1], dim=1))
            return self.out(up)

    # Synthetic segmentation dataset: random inputs with a fixed 3-region label layout.
    N_seg = 80; H_seg = W_seg = 64; n_classes = 3
    X_seg = torch.rand(N_seg, 3, H_seg, W_seg)  # random RGB-like inputs in [0, 1)
    # Every sample shares the same spatial label layout (labels are input-independent).
    y_seg = torch.zeros(N_seg, H_seg, W_seg, dtype=torch.long)
    y_seg[:, :32, :]  = 0  # top half = class 0 (no-op: the tensor is already zeros)
    y_seg[:, 32:, :32] = 1  # bottom-left = class 1
    y_seg[:, 32:, 32:] = 2  # bottom-right = class 2

    seg_split = 60  # 60 train / 20 validation samples
    seg_tr = TensorDataset(X_seg[:seg_split], y_seg[:seg_split])
    seg_va = TensorDataset(X_seg[seg_split:], y_seg[seg_split:])
    seg_tr_dl = DataLoader(seg_tr, batch_size=8, shuffle=True)
    seg_va_dl = DataLoader(seg_va, batch_size=8)  # no shuffle needed for evaluation

    def compute_miou(preds, targets, n_cls=3):
        """Mean IoU over the classes with non-empty support.

        A class absent from both `preds` and `targets` (union == 0) is
        skipped rather than counted as IoU 0; returns 0.0 when no class
        has any support at all.
        """
        flat_p = preds.flatten()
        flat_t = targets.flatten()
        per_class = []
        for c in range(n_cls):
            in_p, in_t = flat_p == c, flat_t == c
            union = (in_p | in_t).sum().float()
            if union > 0:
                inter = (in_p & in_t).sum().float()
                per_class.append((inter / union).item())
        return np.mean(per_class) if per_class else 0.0
if TORCH_AVAILABLE:
    # Exercise 1: Train U-Net for 30 epochs, plot mIoU learning curve
    # Key insight: mIoU measures overlap per class then averages;
    # it penalizes class imbalance less than pixel accuracy.
    unet = UNet()
    opt_unet = optim.Adam(unet.parameters(), lr=1e-3)
    crit_ce  = nn.CrossEntropyLoss()
    miou_curve = []  # one validation mIoU value per epoch

    for ep in range(30):
        unet.train()
        for xb, yb in seg_tr_dl:
            # One optimization step: zero grads, forward, CE loss, backward, update.
            opt_unet.zero_grad(); crit_ce(unet(xb), yb).backward(); opt_unet.step()
        unet.eval(); miou_ep = []
        with torch.no_grad():
            for xb, yb in seg_va_dl:
                p = unet(xb).argmax(dim=1)  # per-pixel predicted class, shape (B, H, W)
                miou_ep.append(compute_miou(p, yb))
        miou_curve.append(np.mean(miou_ep))  # average over validation batches

    plt.figure(figsize=(8, 4))
    plt.plot(miou_curve, marker='o', markersize=3)
    plt.title('U-Net mIoU Learning Curve'); plt.xlabel('Epoch'); plt.ylabel('mIoU'); plt.show()
    print(f'Final mIoU: {miou_curve[-1]:.4f}')
if TORCH_AVAILABLE:
    # Exercise 2: Dice Loss vs CrossEntropyLoss convergence
    # Key insight: Dice loss directly optimizes the IoU-like overlap measure,
    # making it better for imbalanced segmentation tasks.
    def dice_loss(logits, targets, smooth=1.0):
        """Multi-class soft Dice loss, averaged over classes.

        logits: (N, C, ...) raw class scores; targets: (N, ...) integer labels.
        Per class c: 1 - (2*|p.t| + smooth) / (|p| + |t| + smooth), where p is
        the softmax probability map for c and t the binary target mask.
        """
        probs = logits.softmax(dim=1)

        def class_term(c):
            p = probs[:, c].flatten()
            t = (targets == c).float().flatten()
            overlap = (p * t).sum()
            return 1 - (2 * overlap + smooth) / (p.sum() + t.sum() + smooth)

        return torch.stack([class_term(c) for c in range(logits.shape[1])]).mean()

    def train_seg(loss_fn, epochs=20):
        """Train a freshly initialized UNet with `loss_fn`.

        Returns the list of validation mIoU values, one per epoch, so
        different losses can be compared on the same learning-curve plot.
        """
        model = UNet()
        opt = optim.Adam(model.parameters(), lr=1e-3)
        history = []
        for _ in range(epochs):
            model.train()
            for xb, yb in seg_tr_dl:
                opt.zero_grad()
                loss = loss_fn(model(xb), yb)
                loss.backward()
                opt.step()
            model.eval()
            scores = []
            with torch.no_grad():
                for xb, yb in seg_va_dl:
                    scores.append(compute_miou(model(xb).argmax(1), yb))
            history.append(np.mean(scores))
        return history

    # Train one model per loss on the same data and compare validation mIoU curves.
    # NOTE(review): neither run is seeded (fresh init + shuffled loader), so the
    # comparison varies from run to run -- confirm whether seeding is intended.
    miou_ce   = train_seg(nn.CrossEntropyLoss(), epochs=20)
    miou_dice = train_seg(dice_loss, epochs=20)

    plt.figure(figsize=(8, 4))
    plt.plot(miou_ce,   label='CrossEntropy')
    plt.plot(miou_dice, label='Dice Loss')
    plt.title('Dice vs CrossEntropy mIoU'); plt.xlabel('Epoch'); plt.legend(); plt.show()
if TORCH_AVAILABLE:
    # Exercise 3: Class-weighted loss (inversely proportional to pixel frequency)
    # Key insight: rare classes barely move an unweighted CE loss; inverse-frequency
    # weights restore their share of the gradient and improve their IoU.
    pixel_counts = torch.zeros(n_classes)
    for _, yb in seg_tr_dl:
        # bincount tallies every class in the batch in a single pass.
        pixel_counts += torch.bincount(yb.flatten(), minlength=n_classes)

    # Normalized inverse frequency: rarer class -> larger weight; sums to 1.
    inv_freq_weights = 1.0 / (pixel_counts / pixel_counts.sum() + 1e-6)
    inv_freq_weights /= inv_freq_weights.sum()
    print('Class pixel counts:', pixel_counts.tolist())
    print('Inverse-frequency weights:', inv_freq_weights.tolist())

    weighted_ce = nn.CrossEntropyLoss(weight=inv_freq_weights)
    miou_weighted = train_seg(weighted_ce, epochs=20)
    print(f'Final mIoU with class weighting: {miou_weighted[-1]:.4f}')
if TORCH_AVAILABLE:
    # Exercise 4: DeepLabV3 pretrained on a PIL image
    # Key insight: DeepLabV3 uses Atrous Spatial Pyramid Pooling (ASPP) to capture
    # multi-scale context; pretrained COCO weights work zero-shot on natural images.
    # NOTE(review): weights=None below builds a RANDOMLY initialized model (no download),
    # so the mask produced here is meaningless; pass pretrained weights for a real demo.
    try:
        from torchvision.models.segmentation import deeplabv3_resnet50
        deeplab = deeplabv3_resnet50(weights=None)
        deeplab.eval()

        # Random-noise image as a stand-in input.
        pil_seg  = Image.fromarray(np.random.randint(0, 256, (256, 256, 3), dtype=np.uint8))
        # Standard ImageNet normalization expected by torchvision backbones.
        seg_tfm  = T.Compose([T.ToTensor(),
                              T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])])
        inp      = seg_tfm(pil_seg).unsqueeze(0)  # add batch dim -> (1, 3, 256, 256)

        with torch.no_grad():
            out = deeplab(inp)['out']  # model returns a dict; 'out' holds the main logits

        pred_mask = out.argmax(dim=1).squeeze().numpy()  # (256, 256) per-pixel label map
        plt.figure(figsize=(8, 3))
        plt.subplot(1,2,1); plt.imshow(pil_seg); plt.title('Input Image')
        plt.subplot(1,2,2); plt.imshow(pred_mask, cmap='tab20'); plt.title('DeepLabV3 Mask')
        plt.tight_layout(); plt.show()
    except Exception as e:
        # Best-effort demo: torchvision segmentation models may be unavailable.
        print(f'DeepLabV3 error: {e}')
if TORCH_AVAILABLE:
    # Exercise 5: Sliding window inference with overlapping tiles
    # Key insight: Overlapping tiles and averaging predictions at overlap regions
    # reduces boundary artifacts compared to non-overlapping tiling.
    def sliding_window_inference(model, img_tensor, tile_size=128, stride=96, n_cls=3):
        """Segment a large image by averaging logits over overlapping tiles.

        model: callable mapping (1, C, t, t) -> (1, n_cls, h', w') logits.
        img_tensor: (1, C, H, W) input; assumes H >= tile_size and W >= tile_size.
        Returns an (H, W) label map: argmax over the per-pixel averaged logits
        (the original docstring claimed raw logits were returned; it is argmax).

        Bug fix: the naive `range(0, H - tile_size + 1, stride)` never visits
        the right/bottom margins when (H - tile_size) is not a multiple of
        stride, leaving those pixels with count=0 and a spurious class-0 label.
        The final tile position along each axis is now clamped to the edge.
        With the defaults (512, 128, 96) the positions are unchanged.
        """
        def _positions(length):
            # Window start offsets along one axis, guaranteed to reach the edge.
            pos = list(range(0, length - tile_size + 1, stride))
            last = length - tile_size
            if last >= 0 and (not pos or pos[-1] != last):
                pos.append(last)  # clamp the final window to the image border
            return pos

        _, C, H, W = img_tensor.shape
        accum = torch.zeros(n_cls, H, W)   # summed logits per pixel
        count = torch.zeros(1, H, W)       # number of tiles covering each pixel
        model.eval()
        with torch.no_grad():
            for y in _positions(H):
                for x in _positions(W):
                    tile = img_tensor[:, :, y:y+tile_size, x:x+tile_size]
                    # Resize model output back to tile size in case the model
                    # (e.g. an encoder-decoder) emits a different resolution.
                    pred = torch.nn.functional.interpolate(
                        model(tile), size=(tile_size, tile_size),
                        mode='bilinear', align_corners=False)
                    accum[:, y:y+tile_size, x:x+tile_size] += pred.squeeze(0)
                    count[0, y:y+tile_size, x:x+tile_size] += 1
        # Guard against division by zero should any pixel remain uncovered.
        count = count.clamp(min=1)
        return (accum / count).argmax(dim=0)

    # Demo: 512x512 input, 128px tiles, stride 96 (32px overlap between adjacent tiles).
    large_input = torch.rand(1, 3, 512, 512)
    # Use a simple conv model for demo (U-Net expects specific sizes)
    demo_seg = nn.Sequential(nn.Conv2d(3, 3, 3, padding=1))  # 3-channel output = 3-class "logits"
    result   = sliding_window_inference(demo_seg, large_input, tile_size=128, stride=96, n_cls=3)
    print(f'Sliding window output shape: {result.shape}')  # (512, 512)