Solutions: Computer Vision Track
Worked solutions to all exercises from the computer-vision/ notebooks.
01 – Image Processing Basics
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import warnings; warnings.filterwarnings('ignore')
# Synthetic RGB test image (H=64, W=64, C=3)
# Fixed seed so the random image (and every crop/noise result below) is reproducible.
np.random.seed(42)
img_np = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)
# PIL counterpart of the same pixel array.
img_pil = Image.fromarray(img_np)
# Exercise 1: Center crop and random crop
# Key insight: Center crop is deterministic and used at inference;
# random crop introduces spatial augmentation during training.
def center_crop(img_arr, crop_h, crop_w):
    """Return the centered crop_h x crop_w window of img_arr (first two dims)."""
    height, width = img_arr.shape[:2]
    row0 = (height - crop_h) // 2
    col0 = (width - crop_w) // 2
    return img_arr[row0:row0 + crop_h, col0:col0 + crop_w]
def random_crop(img_arr, crop_h, crop_w):
    """Return a uniformly random crop_h x crop_w window of img_arr (training augmentation)."""
    height, width = img_arr.shape[:2]
    # randint upper bound is exclusive, so +1 keeps the edge-aligned crop reachable.
    row0 = np.random.randint(0, height - crop_h + 1)
    col0 = np.random.randint(0, width - crop_w + 1)
    return img_arr[row0:row0 + crop_h, col0:col0 + crop_w]
# Both crops reduce 64x64 to the requested 32x32 spatial size; channels are kept.
cc = center_crop(img_np, 32, 32)
rc = random_crop(img_np, 32, 32)
print(f'Center crop shape: {cc.shape}') # (32, 32, 3)
print(f'Random crop shape: {rc.shape}') # (32, 32, 3)
assert cc.shape == (32, 32, 3)
assert rc.shape == (32, 32, 3)
# Exercise 2: HSV -> RGB conversion (inverse of RGB -> HSV)
# Key insight: HSV is easier for human-intuitive color manipulation;
# the inverse uses sector-based piecewise reconstruction of RGB from (H, S, V).
def rgb_to_hsv(rgb):
    """Convert an (..., 3) RGB array in [0, 255] to HSV.

    Returns an (..., 3) array with H in degrees [0, 360), S and V in [0, 1].
    Fix vs. the original: instead of adding 1e-10 to delta (which biased S
    slightly above 1 for saturated colors and relied on the global warning
    filter to hide a 0/0 division), divisions are masked explicitly.
    """
    r, g, b = rgb[..., 0] / 255.0, rgb[..., 1] / 255.0, rgb[..., 2] / 255.0
    cmax = np.maximum(np.maximum(r, g), b)
    cmin = np.minimum(np.minimum(r, g), b)
    delta = cmax - cmin
    # Where delta == 0 the hue is undefined; divide by 1 there and force H=0 after.
    safe_delta = np.where(delta == 0, 1.0, delta)
    H = np.where(cmax == r, (g - b) / safe_delta % 6,
                 np.where(cmax == g, (b - r) / safe_delta + 2,
                          (r - g) / safe_delta + 4)) * 60
    H = np.where(delta == 0, 0.0, H)
    # S is 0 for black (cmax == 0); divide by 1 there to avoid a 0/0 warning.
    S = np.where(cmax == 0, 0.0, delta / np.where(cmax == 0, 1.0, cmax))
    V = cmax
    return np.stack([H, S, V], axis=-1)
def hsv_to_rgb(hsv):
    """Convert an (..., 3) HSV array (H in degrees, S and V in [0, 1]) back to uint8 RGB.

    Uses the standard six-sector piecewise reconstruction: chroma C = V*S,
    intermediate X, and offset m = V - C select the (R, G, B) ordering per sector.
    """
    H, S, V = hsv[..., 0], hsv[..., 1], hsv[..., 2]
    chroma = V * S
    X = chroma * (1 - np.abs((H / 60) % 2 - 1))
    offset = V - chroma
    sector = (H // 60).astype(int) % 6
    zero = 0 * chroma
    # Per-sector component tables, indexed by hue sector 0..5.
    sector_masks = [sector == k for k in range(6)]
    R1 = np.select(sector_masks, [chroma, X, zero, zero, X, chroma])
    G1 = np.select(sector_masks, [X, chroma, chroma, X, zero, zero])
    B1 = np.select(sector_masks, [zero, zero, X, chroma, chroma, X])
    channels = [(R1 + offset) * 255, (G1 + offset) * 255, (B1 + offset) * 255]
    return np.stack(channels, axis=-1).clip(0, 255).astype(np.uint8)
# Round-trip verification: RGB -> HSV -> RGB should reproduce the input
# up to small quantization error from the float <-> uint8 conversions.
small = img_np[:8, :8]
reconstructed = hsv_to_rgb(rgb_to_hsv(small.astype(float)))
diff = np.abs(small.astype(int) - reconstructed.astype(int)).max()
print(f'Max round-trip pixel error: {diff} (should be <= 2 due to float rounding)')
# Exercise 3: Sobel X+Y combined edge detector -> binary edge map
# Key insight: Sobel combines horizontal and vertical gradients;
# thresholding the magnitude creates a clean binary edge map.
from scipy.ndimage import convolve
# Horizontal (Kx) and vertical (Ky) Sobel kernels.
Kx = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=float)
Ky = np.array([[-1,-2,-1], [ 0, 0, 0], [ 1, 2, 1]], dtype=float)
# Grayscale via unweighted channel mean (not luminance-weighted).
gray = img_np.mean(axis=2)
Gx = convolve(gray, Kx)
Gy = convolve(gray, Ky)
# Gradient magnitude; keep the strongest 20% of responses as edge pixels.
mag = np.sqrt(Gx**2 + Gy**2)
edge_binary = (mag > np.percentile(mag, 80)).astype(np.uint8)
fig, axes = plt.subplots(1, 3, figsize=(10, 3))
axes[0].imshow(gray, cmap='gray'); axes[0].set_title('Grayscale')
axes[1].imshow(mag, cmap='hot'); axes[1].set_title('Sobel Magnitude')
axes[2].imshow(edge_binary, cmap='gray'); axes[2].set_title('Binary Edges')
plt.tight_layout(); plt.show()
# Exercise 4: 3x3 median filter for salt-and-pepper noise
# Key insight: Median filter replaces each pixel with the neighborhood median,
# removing impulse noise while preserving edges better than Gaussian blur.
from scipy.ndimage import median_filter
# Add salt-and-pepper noise: ~5% of pixels set to white, ~5% to black.
# The two index sets are drawn independently, so they may overlap;
# pepper is written second and wins on overlapping pixels.
noisy = gray.copy()
salt_idx = np.random.choice(noisy.size, noisy.size // 20, replace=False)
pepper_idx = np.random.choice(noisy.size, noisy.size // 20, replace=False)
noisy.flat[salt_idx] = 255
noisy.flat[pepper_idx] = 0
# Manual 3x3 median filter
def median_filter_3x3(img):
    """Apply a 3x3 median filter to a 2-D image.

    The 1-pixel outer border is left unchanged (no padding is used), so the
    result differs from padded implementations only along the border.
    """
    rows, cols = img.shape
    result = img.astype(float).copy()
    for r in range(1, rows - 1):
        for c in range(1, cols - 1):
            window = img[r - 1:r + 2, c - 1:c + 2]
            result[r, c] = np.median(window)
    return result.astype(np.uint8)
cleaned_manual = median_filter_3x3(noisy)
cleaned_scipy = median_filter(noisy, size=3)
fig, axes = plt.subplots(1, 3, figsize=(10, 3))
axes[0].imshow(noisy, cmap='gray'); axes[0].set_title('Noisy')
axes[1].imshow(cleaned_manual, cmap='gray'); axes[1].set_title('Manual 3x3 Median')
axes[2].imshow(cleaned_scipy, cmap='gray'); axes[2].set_title('Scipy Median')
plt.tight_layout(); plt.show()
# Differences concentrate at the 1-pixel border: the manual filter leaves the
# border untouched while scipy filters it (it pads the image — default mode).
print(f'Manual vs scipy max diff: {np.abs(cleaned_manual.astype(int) - cleaned_scipy.astype(int)).max()}')
# Exercise 5: Gaussian blur at sigma=1,2,3 and frequency-domain comparison
# Key insight: Gaussian blur is a low-pass filter; larger sigma suppresses higher
# frequencies more aggressively, visible as dimmer outer rings in the FFT magnitude.
from scipy.ndimage import gaussian_filter
sigmas = [1, 2, 3]
blurred = [gaussian_filter(gray, sigma=s) for s in sigmas]
# Centered FFT magnitude of each blurred image for side-by-side comparison.
ffts = [np.fft.fftshift(np.abs(np.fft.fft2(b))) for b in blurred]
fig, axes = plt.subplots(2, 3, figsize=(12, 6))
for col, s in enumerate(sigmas):
    axes[0, col].imshow(blurred[col], cmap='gray')
    axes[0, col].set_title(f'Blur sigma={s}')
    axes[1, col].imshow(np.log1p(ffts[col]), cmap='inferno')
    axes[1, col].set_title(f'FFT log-magnitude sigma={s}')
plt.tight_layout()
plt.show()
02 – CNN from Scratch
# PyTorch is optional: fall back gracefully when it is not installed.
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset
    TORCH_AVAILABLE = True
except ImportError:
    print('PyTorch not installed.')
    TORCH_AVAILABLE = False
if TORCH_AVAILABLE:
    # Synthetic dataset: 3-class image classification, 32x32 RGB.
    torch.manual_seed(42)
    N_img = 600
    X_img = torch.randn(N_img, 3, 32, 32)
    y_img = torch.randint(0, 3, (N_img,))
    # Shift one color channel per class so the classes become separable.
    for cls in range(3):
        X_img[y_img == cls, cls] += 1.0
    split = 480
    tr_ds = TensorDataset(X_img[:split], y_img[:split])
    va_ds = TensorDataset(X_img[split:], y_img[split:])
    tr_dl = DataLoader(tr_ds, batch_size=32, shuffle=True)
    va_dl = DataLoader(va_ds, batch_size=32)
if TORCH_AVAILABLE:
    def train_cnn(model, tr_dl, va_dl, epochs=15, lr=1e-3):
        """Train `model` with Adam + cross-entropy.

        Returns (train_losses, val_losses): one mean-batch-loss value per epoch.
        """
        optimizer = optim.Adam(model.parameters(), lr=lr)
        loss_fn = nn.CrossEntropyLoss()
        tr_losses, va_losses = [], []
        for _ in range(epochs):
            # One optimization pass over the training loader.
            model.train()
            epoch_loss = 0.0
            for xb, yb in tr_dl:
                optimizer.zero_grad()
                loss = loss_fn(model(xb), yb)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
            tr_losses.append(epoch_loss / len(tr_dl))
            # Validation pass without gradient tracking.
            model.eval()
            total_val = 0.0
            with torch.no_grad():
                for xb, yb in va_dl:
                    total_val += loss_fn(model(xb), yb).item()
            va_losses.append(total_val / len(va_dl))
        return tr_losses, va_losses
if TORCH_AVAILABLE:
    # Exercise 1: BatchNorm before vs after ReLU
    # Key insight: BN before ReLU normalizes pre-activations; BN after ReLU
    # normalizes positive values only. BN-before is more common and usually converges faster.
    class CNNBNBefore(nn.Module):
        """Conv -> BN -> ReLU ordering (the conventional placement)."""
        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv2d(3, 16, 3, padding=1), nn.BatchNorm2d(16), nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Conv2d(16, 32, 3, padding=1), nn.BatchNorm2d(32), nn.ReLU(),
                nn.AdaptiveAvgPool2d(4), nn.Flatten(), nn.Linear(32 * 4 * 4, 3))

        def forward(self, x):
            return self.net(x)

    class CNNBNAfter(nn.Module):
        """Conv -> ReLU -> BN ordering (normalizes rectified activations only)."""
        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(), nn.BatchNorm2d(16),
                nn.MaxPool2d(2),
                nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.BatchNorm2d(32),
                nn.AdaptiveAvgPool2d(4), nn.Flatten(), nn.Linear(32 * 4 * 4, 3))

        def forward(self, x):
            return self.net(x)

    tr_bn_before, va_bn_before = train_cnn(CNNBNBefore(), tr_dl, va_dl, epochs=15)
    tr_bn_after, va_bn_after = train_cnn(CNNBNAfter(), tr_dl, va_dl, epochs=15)
    plt.figure(figsize=(10, 4))
    plt.plot(va_bn_before, label='BN before ReLU (val)')
    plt.plot(va_bn_after, label='BN after ReLU (val)')
    plt.title('BatchNorm Placement: Val Loss')
    plt.xlabel('Epoch')
    plt.legend()
    plt.show()
if TORCH_AVAILABLE:
    # Exercise 2: Replace MaxPool2d with strided Conv2d
    # Key insight: Strided conv learns the downsampling kernel (more parameters)
    # whereas MaxPool2d is a fixed operation with no parameters.
    class CNNStrided(nn.Module):
        """Downsamples with stride-2 convolutions instead of pooling."""
        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
                nn.Conv2d(16, 16, 3, stride=2, padding=1),  # replaces MaxPool
                nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(),
                nn.Conv2d(32, 32, 3, stride=2, padding=1),  # replaces MaxPool
                nn.AdaptiveAvgPool2d(4), nn.Flatten(), nn.Linear(32 * 4 * 4, 3))

        def forward(self, x):
            return self.net(x)

    class CNNMaxPool(nn.Module):
        """Conventional conv + max-pool baseline (reused by later exercises)."""
        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
                nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
                nn.AdaptiveAvgPool2d(4), nn.Flatten(), nn.Linear(32 * 4 * 4, 3))

        def forward(self, x):
            return self.net(x)

    m_strided = CNNStrided()
    m_maxpool = CNNMaxPool()
    # Parameter counts differ because the strided convs carry learnable kernels.
    params_strided = sum(p.numel() for p in m_strided.parameters())
    params_maxpool = sum(p.numel() for p in m_maxpool.parameters())
    print(f'Strided Conv params: {params_strided}')
    print(f'MaxPool Conv params: {params_maxpool}')
    _, va_strided = train_cnn(m_strided, tr_dl, va_dl, epochs=15)
    _, va_maxpool = train_cnn(m_maxpool, tr_dl, va_dl, epochs=15)
    print(f'Final val loss β Strided: {va_strided[-1]:.4f}, MaxPool: {va_maxpool[-1]:.4f}')
if TORCH_AVAILABLE:
    # Exercise 3: Plot training vs validation loss curves and find the overfitting epoch.
    # Key insight: Overfitting starts where validation loss stops decreasing
    # while training loss keeps falling; use early stopping at that epoch.
    class OverfitCNN(nn.Module):
        """Deliberately high-capacity CNN so the train/val gap becomes visible."""
        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
                nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
                nn.Conv2d(64, 64, 3, padding=1), nn.ReLU(),
                nn.AdaptiveAvgPool2d(2), nn.Flatten(),
                nn.Linear(64 * 4, 128), nn.ReLU(), nn.Linear(128, 3))

        def forward(self, x):
            return self.net(x)

    tr_losses, va_losses = train_cnn(OverfitCNN(), tr_dl, va_dl, epochs=40)
    # Epoch with the lowest validation loss = the early-stopping point.
    overfit_epoch = np.argmin(va_losses)
    print(f'Best validation epoch: {overfit_epoch + 1} (val loss={va_losses[overfit_epoch]:.4f})')
    plt.figure(figsize=(10, 4))
    plt.plot(tr_losses, label='Train loss')
    plt.plot(va_losses, label='Val loss')
    plt.axvline(overfit_epoch, color='red', linestyle='--', label=f'Best epoch={overfit_epoch+1}')
    plt.title('Train vs Val Loss β Overfitting Curve')
    plt.xlabel('Epoch')
    plt.legend()
    plt.show()
if TORCH_AVAILABLE:
    # Exercise 4: Visualize first-layer filters before and after training
    # Key insight: Untrained filters are random noise; after training they show
    # oriented edges and color blobs — the lowest-level visual primitives.
    model_vis = CNNMaxPool()
    w_before = model_vis.net[0].weight.data.clone()  # index 0 is the first Conv2d
    train_cnn(model_vis, tr_dl, va_dl, epochs=20)
    w_after = model_vis.net[0].weight.data.clone()

    def show_filters(weights, title, n=8):
        """Render the first n 3-channel filters, min-max normalized per filter."""
        fig, axes = plt.subplots(1, n, figsize=(2 * n, 2))
        for idx, ax in enumerate(axes):
            kernel = weights[idx].permute(1, 2, 0).numpy()
            kernel = (kernel - kernel.min()) / (kernel.max() - kernel.min() + 1e-8)
            ax.imshow(kernel)
            ax.axis('off')
        fig.suptitle(title)
        plt.tight_layout()
        plt.show()

    show_filters(w_before, 'First-layer filters BEFORE training')
    show_filters(w_after, 'First-layer filters AFTER training')
    print('After training, filters develop structured patterns: edges, color gradients.')
if TORCH_AVAILABLE:
    # Exercise 5: Gradient visualization — dLoss/dInput (saliency map)
    # Key insight: The gradient of the loss w.r.t. input pixels shows which
    # spatial regions most influence the prediction — a simple saliency map.
    model_grad = CNNMaxPool()
    train_cnn(model_grad, tr_dl, va_dl, epochs=15)
    model_grad.eval()
    sample_img = X_img[0:1].clone().requires_grad_(True)
    logits = model_grad(sample_img)
    pred_class = logits.argmax(dim=1).item()
    # Backprop from the predicted-class logit down to the input pixels.
    logits[0, pred_class].backward()
    saliency = sample_img.grad.data.abs().squeeze()
    saliency_map = saliency.max(dim=0)[0].numpy()  # strongest gradient across channels
    fig, (ax_input, ax_sal) = plt.subplots(1, 2, figsize=(8, 3))
    ax_input.imshow(sample_img[0].detach().permute(1, 2, 0).numpy().clip(0, 1))
    ax_input.set_title(f'Input (predicted class {pred_class})')
    ax_sal.imshow(saliency_map, cmap='hot')
    ax_sal.set_title('Gradient Saliency Map')
    plt.tight_layout()
    plt.show()
03 – Transfer Learning
if TORCH_AVAILABLE:
    import torchvision.models as models
    import torchvision.transforms as T
    # Exercise 1: ResNet18 vs EfficientNet-B0 feature extraction
    # Key insight: EfficientNet scales depth/width/resolution jointly; its features
    # are often more discriminative per-parameter than ResNet for the same compute.
    transform = T.Compose([
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    # Ten random synthetic PIL images stand in for a real dataset.
    pil_images = [Image.fromarray(np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8))
                  for _ in range(10)]
    imgs_tensor = torch.stack([transform(im) for im in pil_images])
    resnet = models.resnet18(weights=None)
    resnet.fc = nn.Identity()  # drop the classifier head to expose pooled features
    resnet.eval()
    try:
        effnet = models.efficientnet_b0(weights=None)
        effnet.classifier = nn.Identity()
        effnet.eval()
        with torch.no_grad():
            feat_eff = effnet(imgs_tensor)
        print(f'EfficientNet-B0 feature dim: {feat_eff.shape[1]}')
    except AttributeError:
        print('EfficientNet not available in this torchvision version')
    with torch.no_grad():
        feat_res = resnet(imgs_tensor)
    print(f'ResNet18 feature dim: {feat_res.shape[1]}')
    print('ResNet18 params:', sum(p.numel() for p in resnet.parameters()))
if TORCH_AVAILABLE:
    # Exercise 2: LR warmup — linearly increase from 0 to target LR over 3 epochs
    # Key insight: Warmup prevents large early updates that destabilize pretrained
    # weights; the scheduler linearly ramps to avoid gradient shock.
    from torch.optim.lr_scheduler import LambdaLR
    model_ft = CNNMaxPool()
    opt_ft = optim.Adam(model_ft.parameters(), lr=1e-3)
    warmup_epochs = 3
    total_epochs = 20

    def warmup_lambda(epoch):
        """LR multiplier: ramps linearly over warmup_epochs, then holds at 1.0."""
        return (epoch + 1) / warmup_epochs if epoch < warmup_epochs else 1.0

    scheduler = LambdaLR(opt_ft, lr_lambda=warmup_lambda)
    crit = nn.CrossEntropyLoss()
    lrs = []
    for ep in range(total_epochs):
        model_ft.train()
        for xb, yb in tr_dl:
            opt_ft.zero_grad()
            crit(model_ft(xb), yb).backward()
            opt_ft.step()
        # Record the LR actually used this epoch, then advance the schedule.
        lrs.append(opt_ft.param_groups[0]['lr'])
        scheduler.step()
    plt.figure(figsize=(8, 3))
    plt.plot(lrs, marker='o')
    plt.title('LR Warmup Schedule')
    plt.xlabel('Epoch')
    plt.ylabel('LR')
    plt.show()
if TORCH_AVAILABLE:
    # Exercise 3: Gradient clipping during fine-tuning — track gradient norms
    # Key insight: Large gradient norms indicate unstable optimization; clipping
    # caps them at max_norm=1.0, preventing catastrophic forgetting of pretrained weights.
    model_gc = CNNMaxPool()
    opt_gc = optim.Adam(model_gc.parameters(), lr=1e-3)
    crit = nn.CrossEntropyLoss()
    # Fix: the original also declared `grad_norms_no`, which was never used; removed.
    grad_norms_clip = []
    for ep in range(15):
        model_gc.train()
        for xb, yb in tr_dl:
            opt_gc.zero_grad()
            crit(model_gc(xb), yb).backward()
            # Total L2 norm across all parameter gradients, recorded BEFORE clipping
            # so the plot shows how often the raw norm exceeds the threshold.
            total_norm = sum(p.grad.data.norm(2).item()**2
                             for p in model_gc.parameters() if p.grad is not None)**0.5
            grad_norms_clip.append(total_norm)
            nn.utils.clip_grad_norm_(model_gc.parameters(), max_norm=1.0)
            opt_gc.step()
    plt.figure(figsize=(10, 3))
    plt.plot(grad_norms_clip, alpha=0.7, label='Gradient norm (clipped at 1.0)')
    plt.axhline(1.0, color='red', linestyle='--', label='Clip threshold')
    plt.title('Gradient Norms During Fine-Tuning')
    plt.xlabel('Batch')
    plt.legend()
    plt.show()
if TORCH_AVAILABLE:
    # Exercise 4: Test-time augmentation (TTA) — average over 5 augmented versions
    # Key insight: TTA reduces prediction variance by averaging over augmentations;
    # equivalent to a cheap ensemble that costs only extra forward passes.
    model_tta = CNNMaxPool()
    train_cnn(model_tta, tr_dl, va_dl, epochs=15)
    model_tta.eval()
    tta_transforms = [
        lambda x: x,                                # identity
        lambda x: torch.flip(x, dims=[3]),          # horizontal flip
        lambda x: torch.flip(x, dims=[2]),          # vertical flip
        lambda x: x + 0.05 * torch.randn_like(x),   # Gaussian noise
        lambda x: torch.roll(x, shifts=2, dims=3),  # shift
    ]
    xb_test, yb_test = next(iter(va_dl))
    with torch.no_grad():
        single_pred = model_tta(xb_test).softmax(dim=1)
        # Mean of per-augmentation softmax outputs = 5-way prediction ensemble.
        tta_pred = torch.stack([model_tta(aug(xb_test)).softmax(dim=1)
                                for aug in tta_transforms]).mean(dim=0)
    single_acc = (single_pred.argmax(1) == yb_test).float().mean()
    tta_acc = (tta_pred.argmax(1) == yb_test).float().mean()
    print(f'Single-pass accuracy: {single_acc:.3f}')
    print(f'TTA accuracy (5 aug): {tta_acc:.3f}')
if TORCH_AVAILABLE:
    # Exercise 5: t-SNE of features before and after fine-tuning
    # Key insight: Pre-trained features cluster by visual similarity but not task label;
    # after fine-tuning, class-discriminative clusters should tighten.
    from sklearn.manifold import TSNE

    def extract_features(model_backbone, data_tensor):
        """Run data_tensor through the model minus its last layer; return (N, D) numpy features."""
        model_backbone.eval()
        feats = []
        with torch.no_grad():
            for start in range(0, len(data_tensor), 32):
                batch = data_tensor[start:start + 32]
                # Strip the final layer when the model exposes a .net Sequential.
                x = model_backbone.net[:-1](batch) if hasattr(model_backbone, 'net') else batch
                feats.append(x.view(len(batch), -1))
        return torch.cat(feats).numpy()

    model_before = CNNMaxPool()  # random init stands in for "before fine-tuning"
    feats_before = extract_features(model_before, X_img[:120])
    model_after = CNNMaxPool()
    train_cnn(model_after, tr_dl, va_dl, epochs=20)
    feats_after = extract_features(model_after, X_img[:120])
    labels_120 = y_img[:120].numpy()
    tsne = TSNE(n_components=2, random_state=42, perplexity=20)
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for ax, feats, title in zip(axes, [feats_before, feats_after],
                                ['t-SNE Before Fine-tuning', 't-SNE After Fine-tuning']):
        emb = tsne.fit_transform(feats)
        for cls in range(3):
            mask = labels_120 == cls
            ax.scatter(emb[mask, 0], emb[mask, 1], label=f'Class {cls}', alpha=0.7)
        ax.set_title(title)
        ax.legend()
    plt.tight_layout()
    plt.show()
04 – Object Detection
# Exercise 1: IoU for [x_center, y_center, w, h] format
# Key insight: YOLO uses center format internally; always convert to corner format
# before computing IoU to avoid sign errors.
def cxcywh_to_xyxy(box):
cx, cy, w, h = box
return [cx - w/2, cy - h/2, cx + w/2, cy + h/2]
def iou_xyxy(b1, b2):
x1 = max(b1[0], b2[0]); y1 = max(b1[1], b2[1])
x2 = min(b1[2], b2[2]); y2 = min(b1[3], b2[3])
inter = max(0, x2-x1) * max(0, y2-y1)
a1 = (b1[2]-b1[0]) * (b1[3]-b1[1])
a2 = (b2[2]-b2[0]) * (b2[3]-b2[1])
return inter / (a1 + a2 - inter + 1e-8)
def iou_cxcywh(b1, b2):
return iou_xyxy(cxcywh_to_xyxy(b1), cxcywh_to_xyxy(b2))
# Verify: both formats give same IoU
# The cxcywh boxes below are the SAME two boxes as the xyxy pair,
# re-expressed as [center_x, center_y, width, height].
box1_xyxy = [0.1, 0.1, 0.5, 0.5]
box2_xyxy = [0.3, 0.3, 0.7, 0.7]
box1_cxcywh = [0.3, 0.3, 0.4, 0.4]
box2_cxcywh = [0.5, 0.5, 0.4, 0.4]
iou_from_xyxy = iou_xyxy(box1_xyxy, box2_xyxy)
iou_from_cx = iou_cxcywh(box1_cxcywh, box2_cxcywh)
print(f'IoU (xyxy format): {iou_from_xyxy:.4f}')
print(f'IoU (cxcywh format): {iou_from_cx:.4f}')
print(f'Match (same boxes): {abs(iou_from_xyxy - iou_from_cx) < 1e-6}')
if TORCH_AVAILABLE:
    # Exercise 2: Run Faster R-CNN on a synthetic image, filter by class and confidence
    # Key insight: Faster R-CNN outputs boxes per class with confidence scores;
    # NMS + confidence threshold are applied post-model to get clean detections.
    import torchvision
    # Synthetic scene: light-gray background with one red rectangle.
    syn_img = np.ones((300, 400, 3), dtype=np.uint8) * 200
    syn_img[80:180, 100:250] = [220, 50, 50]
    img_tensor = T.ToTensor()(Image.fromarray(syn_img))
    try:
        frcnn = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=None)
        frcnn.eval()
        with torch.no_grad():
            preds = frcnn([img_tensor])[0]
        conf_thresh = 0.3
        keep = preds['scores'] > conf_thresh
        filtered_boxes = preds['boxes'][keep]
        filtered_labels = preds['labels'][keep]
        filtered_scores = preds['scores'][keep]
        print(f'Detections above {conf_thresh}: {len(filtered_boxes)}')
    except Exception as e:
        # Fall back to a hand-written detection so the plotting code still runs.
        print(f'Faster R-CNN: {e}')
        print('Using synthetic detection output for demo.')
        filtered_boxes = torch.tensor([[100., 80., 250., 180.]])
        filtered_scores = torch.tensor([0.85])
        filtered_labels = torch.tensor([1])
    plt.figure(figsize=(6, 4))
    plt.imshow(syn_img)
    for box, score, label in zip(filtered_boxes, filtered_scores, filtered_labels):
        x1, y1, x2, y2 = box.tolist()
        frame = plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='lime', linewidth=2)
        plt.gca().add_patch(frame)
        plt.text(x1, y1 - 5, f'cls={label.item()} {score.item():.2f}', color='lime', fontsize=8)
    plt.title('Faster R-CNN detections')
    plt.axis('off')
    plt.show()
if TORCH_AVAILABLE:
    # Exercise 3: Sliding window detector over 128x128 with 32x32 windows
    # Key insight: Sliding window is computationally expensive (O(H*W/stride^2))
    # but conceptually simple; modern detectors use shared feature maps instead.
    class TinyCNN(nn.Module):
        """Minimal window classifier used to score each sliding crop."""
        def __init__(self, num_classes=2):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d(4),
                nn.Flatten(), nn.Linear(8 * 4 * 4, num_classes))

        def forward(self, x):
            return self.net(x)

    tiny_clf = TinyCNN(num_classes=2)
    tiny_clf.eval()
    large_img = torch.rand(1, 3, 128, 128)
    win_size, stride = 32, 16
    detections = []
    with torch.no_grad():
        for top in range(0, 128 - win_size + 1, stride):
            for left in range(0, 128 - win_size + 1, stride):
                window = large_img[:, :, top:top + win_size, left:left + win_size]
                # Softmax probability of the "object" class (index 1).
                obj_score = tiny_clf(window).softmax(dim=1)[0, 1].item()
                detections.append({'x': left, 'y': top, 'score': obj_score})
    top_det = sorted(detections, key=lambda d: d['score'], reverse=True)[:3]
    print(f'Total windows evaluated: {len(detections)}')
    print('Top-3 detections:', top_det)
if TORCH_AVAILABLE:
    # Exercise 4: Time YOLO vs Faster R-CNN on 100 images
    # Key insight: YOLO is a single-stage detector (one forward pass per image);
    # Faster R-CNN is two-stage (region proposals + classification), so slower but more accurate.
    import time
    n_imgs = 20  # reduced for demo speed
    test_imgs = [torch.rand(3, 320, 320) for _ in range(n_imgs)]
    # Two-stage baseline: Faster R-CNN.
    try:
        frcnn_time = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=None)
        frcnn_time.eval()
        t0 = time.perf_counter()
        with torch.no_grad():
            for img in test_imgs:
                _ = frcnn_time([img])
        frcnn_elapsed = time.perf_counter() - t0
        frcnn_fps = n_imgs / frcnn_elapsed
        print(f'Faster R-CNN: {frcnn_fps:.1f} FPS ({frcnn_elapsed:.2f}s for {n_imgs} images)')
    except Exception as e:
        print(f'Faster R-CNN timing skipped: {e}')
    # Single-stage stand-in: a tiny model timed the same way (placeholder for YOLO).
    yolo_sim = nn.Sequential(nn.Conv2d(3, 32, 3, stride=2, padding=1), nn.ReLU(),
                             nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(32, 10))
    yolo_sim.eval()
    t0 = time.perf_counter()
    with torch.no_grad():
        for img in test_imgs:
            _ = yolo_sim(img.unsqueeze(0))
    yolo_elapsed = time.perf_counter() - t0
    print(f'YOLO-like (tiny model): {n_imgs/yolo_elapsed:.1f} FPS β single-stage is faster')
# Exercise 5: PASCAL VOC mAP@0.5 computation
# Key insight: mAP averages per-class AP scores; AP is the area under the
# precision-recall curve, computed by accumulating TP/FP sorted by confidence.
def compute_ap(recalls, precisions):
    """Compute AP using 11-point interpolation (VOC 2007 style)."""
    total = 0.0
    for threshold in np.linspace(0, 1, 11):
        # Interpolated precision: best precision at any recall >= threshold.
        candidates = [precisions[i] for i, r in enumerate(recalls) if r >= threshold]
        total += (max(candidates) if candidates else 0) / 11
    return total
def voc_ap_for_class(pred_boxes, pred_scores, gt_boxes, iou_thresh=0.5):
    """pred_boxes: list of [x1,y1,x2,y2], pred_scores: list of floats, gt_boxes: list of [x1,y1,x2,y2].

    Greedy VOC-style matching: predictions are visited in descending
    confidence; each prediction is a TP when its best-overlap GT box clears
    iou_thresh and has not been matched already, otherwise an FP.
    """
    order = np.argsort(-np.array(pred_scores))  # most confident first
    matched_gt = set()
    tp, fp = [], []
    for idx in order:
        box = pred_boxes[idx]
        # Locate the ground-truth box with the highest overlap.
        best_iou, best_j = 0, -1
        for j, gt in enumerate(gt_boxes):
            overlap = iou_xyxy(box, gt)
            if overlap > best_iou:
                best_iou, best_j = overlap, j
        hit = best_iou >= iou_thresh and best_j not in matched_gt
        tp.append(int(hit))
        fp.append(int(not hit))
        if hit:
            matched_gt.add(best_j)
    tp_cum, fp_cum = np.cumsum(tp), np.cumsum(fp)
    recalls = (tp_cum / (len(gt_boxes) + 1e-8)).tolist()
    precisions = (tp_cum / (tp_cum + fp_cum + 1e-8)).tolist()
    return compute_ap(recalls, precisions)
# Synthetic 3-class detection results
np.random.seed(7)
class_aps = []
for class_idx in range(3):
    # Five ground-truth boxes with corners drawn from opposite ranges.
    gt = [[np.random.uniform(0, 0.6), np.random.uniform(0, 0.6),
           np.random.uniform(0.4, 1), np.random.uniform(0.4, 1)] for _ in range(5)]
    # Jittered copies of the GT boxes act as likely true positives...
    pred = [[corner + np.random.uniform(-0.1, 0.1) for corner in box] for box in gt]
    # ...plus three unrelated boxes as false positives.
    pred += [[np.random.uniform(0, 0.5)] * 2 + [np.random.uniform(0.5, 1)] * 2 for _ in range(3)]
    # High confidence for the jittered matches, low for the random boxes.
    scores = [np.random.uniform(0.5, 1) for _ in gt] + [np.random.uniform(0, 0.4) for _ in range(3)]
    ap = voc_ap_for_class(pred, scores, gt)
    class_aps.append(ap)
    print(f'Class {class_idx} AP@0.5: {ap:.3f}')
print(f'mAP@0.5: {np.mean(class_aps):.3f}')
05 – Image Segmentation
if TORCH_AVAILABLE:
    # Minimal U-Net for 3-class segmentation (64x64 synthetic scenes)
    class UNet(nn.Module):
        """Two-level encoder/decoder with skip connections at each resolution."""
        def __init__(self, in_ch=3, out_ch=3):
            super().__init__()

            def block(ic, oc):
                # Two 3x3 convs, each followed by ReLU.
                return nn.Sequential(nn.Conv2d(ic, oc, 3, padding=1), nn.ReLU(),
                                     nn.Conv2d(oc, oc, 3, padding=1), nn.ReLU())

            # Submodules are created in the same order as the original so
            # parameter initialization draws from the RNG identically.
            self.enc1 = block(in_ch, 16)
            self.pool1 = nn.MaxPool2d(2)
            self.enc2 = block(16, 32)
            self.pool2 = nn.MaxPool2d(2)
            self.bottleneck = block(32, 64)
            self.up2 = nn.ConvTranspose2d(64, 32, 2, stride=2)
            self.dec2 = block(64, 32)
            self.up1 = nn.ConvTranspose2d(32, 16, 2, stride=2)
            self.dec1 = block(32, 16)
            self.out = nn.Conv2d(16, out_ch, 1)

        def forward(self, x):
            e1 = self.enc1(x)
            e2 = self.enc2(self.pool1(e1))
            bottom = self.bottleneck(self.pool2(e2))
            # Decoder: upsample, concatenate the matching encoder skip, refine.
            d2 = self.dec2(torch.cat([self.up2(bottom), e2], dim=1))
            d1 = self.dec1(torch.cat([self.up1(d2), e1], dim=1))
            return self.out(d1)
if TORCH_AVAILABLE:
    # Synthetic segmentation dataset: fixed three-region layout per image.
    N_seg = 80; H_seg = W_seg = 64; n_classes = 3
    X_seg = torch.rand(N_seg, 3, H_seg, W_seg)
    y_seg = torch.zeros(N_seg, H_seg, W_seg, dtype=torch.long)
    y_seg[:, :32, :] = 0   # top half = class 0
    y_seg[:, 32:, :32] = 1 # bottom-left = class 1
    y_seg[:, 32:, 32:] = 2 # bottom-right = class 2
    seg_split = 60
    seg_tr = TensorDataset(X_seg[:seg_split], y_seg[:seg_split])
    seg_va = TensorDataset(X_seg[seg_split:], y_seg[seg_split:])
    seg_tr_dl = DataLoader(seg_tr, batch_size=8, shuffle=True)
    seg_va_dl = DataLoader(seg_va, batch_size=8)

    def compute_miou(preds, targets, n_cls=3):
        """Mean IoU over classes whose union of pred/target pixels is non-empty."""
        flat_p, flat_t = preds.flatten(), targets.flatten()
        per_class = []
        for c in range(n_cls):
            inter = ((flat_p == c) & (flat_t == c)).sum().float()
            union = ((flat_p == c) | (flat_t == c)).sum().float()
            if union > 0:
                per_class.append((inter / union).item())
        return np.mean(per_class) if per_class else 0.0
if TORCH_AVAILABLE:
    # Exercise 1: Train U-Net for 30 epochs, plot mIoU learning curve
    # Key insight: mIoU measures overlap per class then averages;
    # it penalizes class imbalance less than pixel accuracy.
    unet = UNet()
    opt_unet = optim.Adam(unet.parameters(), lr=1e-3)
    crit_ce = nn.CrossEntropyLoss()
    miou_curve = []
    for ep in range(30):
        unet.train()
        for xb, yb in seg_tr_dl:
            opt_unet.zero_grad()
            crit_ce(unet(xb), yb).backward()
            opt_unet.step()
        # Validation mIoU averaged over batches.
        unet.eval()
        epoch_scores = []
        with torch.no_grad():
            for xb, yb in seg_va_dl:
                epoch_scores.append(compute_miou(unet(xb).argmax(dim=1), yb))
        miou_curve.append(np.mean(epoch_scores))
    plt.figure(figsize=(8, 4))
    plt.plot(miou_curve, marker='o', markersize=3)
    plt.title('U-Net mIoU Learning Curve')
    plt.xlabel('Epoch')
    plt.ylabel('mIoU')
    plt.show()
    print(f'Final mIoU: {miou_curve[-1]:.4f}')
if TORCH_AVAILABLE:
    # Exercise 2: Dice Loss vs CrossEntropyLoss convergence
    # Key insight: Dice loss directly optimizes the IoU-like overlap measure,
    # making it better for imbalanced segmentation tasks.
    def dice_loss(logits, targets, smooth=1.0):
        """Soft Dice loss: mean over classes of (1 - Dice coefficient)."""
        probs = logits.softmax(dim=1)
        per_class = []
        for c in range(logits.shape[1]):
            p = probs[:, c].flatten()
            t = (targets == c).float().flatten()
            overlap = (p * t).sum()
            per_class.append(1 - (2 * overlap + smooth) / (p.sum() + t.sum() + smooth))
        return torch.stack(per_class).mean()

    def train_seg(loss_fn, epochs=20):
        """Train a fresh UNet with loss_fn; return per-epoch validation mIoU."""
        model = UNet()
        optimizer = optim.Adam(model.parameters(), lr=1e-3)
        history = []
        for _ in range(epochs):
            model.train()
            for xb, yb in seg_tr_dl:
                optimizer.zero_grad()
                loss_fn(model(xb), yb).backward()
                optimizer.step()
            model.eval()
            scores = []
            with torch.no_grad():
                for xb, yb in seg_va_dl:
                    scores.append(compute_miou(model(xb).argmax(1), yb))
            history.append(np.mean(scores))
        return history

    miou_ce = train_seg(nn.CrossEntropyLoss(), epochs=20)
    miou_dice = train_seg(dice_loss, epochs=20)
    plt.figure(figsize=(8, 4))
    plt.plot(miou_ce, label='CrossEntropy')
    plt.plot(miou_dice, label='Dice Loss')
    plt.title('Dice vs CrossEntropy mIoU')
    plt.xlabel('Epoch')
    plt.legend()
    plt.show()
if TORCH_AVAILABLE:
    # Exercise 3: Class-weighted loss (inversely proportional to pixel frequency)
    # Key insight: Rare classes contribute little to standard CE loss;
    # inverse-frequency weights up-weight rare classes, improving their IoU.
    pixel_counts = torch.zeros(n_classes)
    for _, yb in seg_tr_dl:
        for cls_id in range(n_classes):
            pixel_counts[cls_id] += (yb == cls_id).sum()
    # Inverse of each class's pixel share, re-normalized to sum to 1.
    inv_freq_weights = 1.0 / (pixel_counts / pixel_counts.sum() + 1e-6)
    inv_freq_weights /= inv_freq_weights.sum()
    print('Class pixel counts:', pixel_counts.tolist())
    print('Inverse-frequency weights:', inv_freq_weights.tolist())
    weighted_ce = nn.CrossEntropyLoss(weight=inv_freq_weights)
    miou_weighted = train_seg(weighted_ce, epochs=20)
    print(f'Final mIoU with class weighting: {miou_weighted[-1]:.4f}')
if TORCH_AVAILABLE:
    # Exercise 4: DeepLabV3 pretrained on a PIL image
    # Key insight: DeepLabV3 uses Atrous Spatial Pyramid Pooling (ASPP) to capture
    # multi-scale context; pretrained COCO weights work zero-shot on natural images.
    try:
        from torchvision.models.segmentation import deeplabv3_resnet50
        deeplab = deeplabv3_resnet50(weights=None)
        deeplab.eval()
        pil_seg = Image.fromarray(np.random.randint(0, 256, (256, 256, 3), dtype=np.uint8))
        seg_tfm = T.Compose([T.ToTensor(),
                             T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
        inp = seg_tfm(pil_seg).unsqueeze(0)
        with torch.no_grad():
            out = deeplab(inp)['out']
        # Per-pixel argmax over the class dimension gives the predicted mask.
        pred_mask = out.argmax(dim=1).squeeze().numpy()
        plt.figure(figsize=(8, 3))
        plt.subplot(1, 2, 1)
        plt.imshow(pil_seg)
        plt.title('Input Image')
        plt.subplot(1, 2, 2)
        plt.imshow(pred_mask, cmap='tab20')
        plt.title('DeepLabV3 Mask')
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f'DeepLabV3 error: {e}')
if TORCH_AVAILABLE:
    # Exercise 5: Sliding window inference with overlapping tiles
    # Key insight: Overlapping tiles and averaging predictions at overlap regions
    # reduces boundary artifacts compared to non-overlapping tiling.
    def _tile_starts(full, tile, stride):
        """Start offsets covering [0, full) completely.

        Bug fix: the original used range(0, full - tile + 1, stride), which
        leaves the trailing edge with zero coverage whenever (full - tile) is
        not a multiple of stride; count.clamp(min=1) then silently turned the
        uncovered region into all-zero logits (argmax -> class 0). We append a
        final edge-aligned start so the last tile always reaches the border.
        """
        starts = list(range(0, max(full - tile, 0) + 1, stride))
        if full > tile and starts[-1] != full - tile:
            starts.append(full - tile)
        return starts

    def sliding_window_inference(model, img_tensor, tile_size=128, stride=96, n_cls=3):
        """img_tensor: (1, C, H, W); returns (H, W) argmax over tile-averaged logits."""
        _, C, H, W = img_tensor.shape
        accum = torch.zeros(n_cls, H, W)   # summed logits per pixel
        count = torch.zeros(1, H, W)       # how many tiles covered each pixel
        model.eval()
        with torch.no_grad():
            for y in _tile_starts(H, tile_size, stride):
                for x in _tile_starts(W, tile_size, stride):
                    tile = img_tensor[:, :, y:y+tile_size, x:x+tile_size]
                    # Resize model output back to the tile footprint before accumulating.
                    pred = torch.nn.functional.interpolate(
                        model(tile), size=(tile_size, tile_size), mode='bilinear', align_corners=False)
                    accum[:, y:y+tile_size, x:x+tile_size] += pred.squeeze(0)
                    count[0, y:y+tile_size, x:x+tile_size] += 1
            # Every pixel is now covered at least once; clamp stays as a safety net.
            count = count.clamp(min=1)
        return (accum / count).argmax(dim=0)

    large_input = torch.rand(1, 3, 512, 512)
    # Use a simple conv model for demo (U-Net expects specific sizes)
    demo_seg = nn.Sequential(nn.Conv2d(3, 3, 3, padding=1))
    result = sliding_window_inference(demo_seg, large_input, tile_size=128, stride=96, n_cls=3)
    print(f'Sliding window output shape: {result.shape}') # (512, 512)