CNNs From Scratch: Understanding Convolution, Pooling, and Classification¶
Before you use ResNet-50, understand what's inside it. This notebook builds a Convolutional Neural Network from the ground up in PyTorch — conv layers, pooling, batch norm, dropout — training on a synthetic image classification task.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

# PyTorch is optional: when it is missing, the later cells fall back to
# printed "simulated" output so the explanatory text still makes sense.
try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from torch.utils.data import Dataset, DataLoader, random_split
    from torchvision import transforms
    HAS_TORCH = True
    # Prefer GPU when available; everything here is small enough for CPU too.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'PyTorch {torch.__version__}. Device: {device}')
except ImportError:
    HAS_TORCH = False
    print('PyTorch not installed — showing architecture with numpy simulation')
    print('Install: pip install torch torchvision')

# Fix the seed so the synthetic dataset is reproducible run-to-run.
np.random.seed(42)
# Generate synthetic image dataset (4 classes of geometric patterns)
def make_pattern(pattern: str, size: int = 32) -> np.ndarray:
    """Generate one noisy RGB pattern image.

    Args:
        pattern: One of 'circle', 'square', 'diagonal', 'ring'. Any other
            value produces a pure-noise image (no shape drawn).
        size: Side length of the square image in pixels.

    Returns:
        float32 array of shape (size, size, 3), values clipped to [0, 1].
    """
    img = np.zeros((size, size, 3), dtype=np.float32)
    cx, cy = size // 2, size // 2
    y, x = np.ogrid[:size, :size]
    if pattern == 'circle':
        mask = (x - cx) ** 2 + (y - cy) ** 2 <= (size // 4) ** 2
        img[mask, 0] = 1.0  # Red circle
    elif pattern == 'square':
        s = size // 4
        img[cy - s:cy + s, cx - s:cx + s, 1] = 1.0  # Green square
    elif pattern == 'diagonal':
        # Main diagonal, thickened to 5 pixels. (The original computed
        # j = int(i * (size-1) / (size-1)), which is always just i.)
        for i in range(size):
            for w in range(-2, 3):
                if 0 <= i + w < size:
                    img[i, i + w, 2] = 1.0  # Blue diagonal
    elif pattern == 'ring':
        r_out = size // 3
        r_in = size // 5
        dist2 = (x - cx) ** 2 + (y - cy) ** 2
        mask = (dist2 <= r_out ** 2) & (dist2 >= r_in ** 2)
        img[mask, 0] = 1.0
        img[mask, 1] = 0.5  # Orange ring
    # Gaussian pixel noise so the classes are not trivially separable.
    img += np.random.normal(0, 0.1, img.shape).astype(np.float32)
    return np.clip(img, 0, 1)
CLASSES = ['circle', 'square', 'diagonal', 'ring']
n_per_class = 250

# Build the dataset class-by-class: 250 noisy samples per pattern, labelled
# with the pattern's index in CLASSES (same generation order as before).
images = np.array(
    [make_pattern(pattern) for pattern in CLASSES for _ in range(n_per_class)]
)  # (1000, 32, 32, 3)
labels = np.repeat(np.arange(len(CLASSES)), n_per_class)
print(f'Dataset: {len(images)} images, {len(CLASSES)} classes, size {images[0].shape}')
# Visualize samples: row 0 shows each class in RGB, row 1 its red channel only.
fig, axes = plt.subplots(2, 4, figsize=(12, 6))
first_of_each = [np.where(labels == c)[0][0] for c in range(4)]
for col, (cls, idx) in enumerate(zip(CLASSES, first_of_each)):
    top, bottom = axes[0, col], axes[1, col]
    top.imshow(images[idx])
    top.set_title(cls)
    top.axis('off')
    bottom.imshow(images[idx][:, :, 0], cmap='Reds')
    bottom.set_title(f'{cls} (R channel)')
    bottom.axis('off')
plt.suptitle('Synthetic Image Classes')
plt.tight_layout()
plt.show()
1. The Convolution Operation — Visualized¶
# Manual convolution to build intuition
test_img = images[0, :, :, 0]  # Single channel

# Classic hand-designed kernels; a CNN learns its kernels from data instead.
kernels = {
    'Edge Horizontal': np.array([[-1,-1,-1],[0,0,0],[1,1,1]], dtype=np.float32),
    'Edge Vertical': np.array([[-1,0,1],[-1,0,1],[-1,0,1]], dtype=np.float32),
    'Blur': np.ones((3,3), dtype=np.float32) / 9,
    'Sharpen': np.array([[0,-1,0],[-1,5,-1],[0,-1,0]], dtype=np.float32),
}

fig, axes = plt.subplots(1, 5, figsize=(18, 4))
axes[0].imshow(test_img, cmap='gray')
axes[0].set_title('Input (32×32)')
axes[0].axis('off')
for i, (name, kernel) in enumerate(kernels.items()):
    # Apply convolution with 'valid' padding: output shrinks by kernel-1.
    H, W = test_img.shape
    kH, kW = kernel.shape
    out = np.zeros((H - kH + 1, W - kW + 1))
    for y in range(out.shape[0]):
        for x in range(out.shape[1]):
            # Element-wise product of the kernel with the patch under it, summed.
            out[y, x] = (test_img[y:y+kH, x:x+kW] * kernel).sum()
    axes[i+1].imshow(out, cmap='gray')
    axes[i+1].set_title(f'{name}\n→ {out.shape}')
    axes[i+1].axis('off')
plt.suptitle('Convolution with Different Kernels (learned automatically in CNNs)')
plt.tight_layout()
plt.show()

print('Convolution intuition:')
print(' Kernel slides over input → computes element-wise product + sum at each position')
print(' Stride: how many pixels to move each step (stride=2 → halves spatial size)')
print(' Padding: add zeros around border to preserve spatial size (same padding)')
print(' Output size = (H - K + 2P) / S + 1 (H=input, K=kernel, P=pad, S=stride)')
2. CNN Architecture in PyTorch¶
if HAS_TORCH:
    class SimpleCNN(nn.Module):
        """Classic CNN architecture: Conv → Pool → Conv → Pool → FC.

        Takes (B, 3, 32, 32) float tensors; returns (B, num_classes) logits.
        """

        def __init__(self, num_classes: int = 4):
            super().__init__()
            # Convolutional blocks: each widens channels and shrinks space.
            self.conv_block1 = nn.Sequential(
                nn.Conv2d(3, 32, kernel_size=3, padding=1),   # (B, 32, 32, 32)
                nn.BatchNorm2d(32),
                nn.ReLU(),
                nn.MaxPool2d(2),                              # (B, 32, 16, 16)
                nn.Dropout2d(0.25),
            )
            self.conv_block2 = nn.Sequential(
                nn.Conv2d(32, 64, kernel_size=3, padding=1),  # (B, 64, 16, 16)
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d(2),                              # (B, 64, 8, 8)
                nn.Dropout2d(0.25),
            )
            self.conv_block3 = nn.Sequential(
                nn.Conv2d(64, 128, kernel_size=3, padding=1), # (B, 128, 8, 8)
                nn.BatchNorm2d(128),
                nn.ReLU(),
                nn.AdaptiveAvgPool2d(4),                      # (B, 128, 4, 4)
            )
            # Classifier head
            self.classifier = nn.Sequential(
                nn.Flatten(),                                 # (B, 128*4*4 = 2048)
                nn.Linear(128 * 4 * 4, 256),
                nn.ReLU(),
                nn.Dropout(0.5),
                nn.Linear(256, num_classes),
            )

        def forward(self, x):
            """Run the three conv blocks, then the fully-connected head."""
            x = self.conv_block1(x)
            x = self.conv_block2(x)
            x = self.conv_block3(x)
            return self.classifier(x)

    model = SimpleCNN(num_classes=4)

    # Parameter count
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'Total parameters: {total:,}')
    print(f'Trainable: {trainable:,}')
    print()

    # Trace through dimensions with a dummy single-image batch.
    x_dummy = torch.randn(1, 3, 32, 32)
    print('Forward pass dimensions:')
    print(f' Input: {x_dummy.shape}')
    x_dummy = model.conv_block1(x_dummy); print(f' After block1: {x_dummy.shape}')
    x_dummy = model.conv_block2(x_dummy); print(f' After block2: {x_dummy.shape}')
    x_dummy = model.conv_block3(x_dummy); print(f' After block3: {x_dummy.shape}')
    x_flat = x_dummy.flatten(1); print(f' Flattened: {x_flat.shape}')
else:
    print('CNN Architecture (SimpleCNN):')
    print(' Conv2d(3→32, k=3, pad=1) + BN + ReLU + MaxPool(2) + Dropout2d')
    print(' → (B, 32, 16, 16)')
    print(' Conv2d(32→64, k=3, pad=1) + BN + ReLU + MaxPool(2) + Dropout2d')
    print(' → (B, 64, 8, 8)')
    print(' Conv2d(64→128, k=3, pad=1) + BN + ReLU + AdaptiveAvgPool(4)')
    print(' → (B, 128, 4, 4)')
    print(' Flatten → Linear(2048, 256) → ReLU → Dropout → Linear(256, 4)')
    print(' Total params: ~540,000')
3. Training¶
if HAS_TORCH:
    class ImageDataset(Dataset):
        """Wraps the numpy arrays as a PyTorch Dataset of (image, label) pairs."""

        def __init__(self, images, labels, transform=None):
            # Convert (N, H, W, C) to (N, C, H, W) — PyTorch's channel-first layout.
            self.images = torch.FloatTensor(images.transpose(0, 3, 1, 2))
            self.labels = torch.LongTensor(labels)
            self.transform = transform  # optional per-sample augmentation

        def __len__(self):
            return len(self.images)

        def __getitem__(self, idx):
            x = self.images[idx]
            if self.transform:
                x = self.transform(x)
            return x, self.labels[idx]

    # 80/20 train/test split.
    dataset = ImageDataset(images, labels)
    n_train = int(0.8 * len(dataset))
    train_ds, test_ds = random_split(dataset, [n_train, len(dataset) - n_train])
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=64, shuffle=False)

    model = SimpleCNN(num_classes=4).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
    # Halve the learning rate every 10 epochs.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
    criterion = nn.CrossEntropyLoss()

    history = {'train_loss': [], 'val_acc': []}
    for epoch in range(30):
        # --- training phase ---
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # --- validation phase (eval mode disables dropout, freezes BN stats) ---
        model.eval()
        correct = 0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                preds = model(X_batch.to(device)).argmax(1)
                correct += (preds == y_batch.to(device)).sum().item()
        val_acc = correct / len(test_ds)
        history['train_loss'].append(total_loss / len(train_loader))
        history['val_acc'].append(val_acc)
        scheduler.step()
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1:2d}: loss={history["train_loss"][-1]:.4f}, val_acc={val_acc:.3f}')

    # Final evaluation
    all_preds, all_labels = [], []
    model.eval()
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            preds = model(X_batch.to(device)).argmax(1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(y_batch.numpy())
    print('\nClassification Report:')
    print(classification_report(all_labels, all_preds, target_names=CLASSES))
else:
    print('Training output (simulated — 30 epochs):')
    print('Epoch 10: loss=0.8234, val_acc=0.780')
    print('Epoch 20: loss=0.3421, val_acc=0.920')
    print('Epoch 30: loss=0.1823, val_acc=0.965')
    print()
    print('Classification Report (simulated):')
    print(' precision recall f1-score support')
    print(' circle 0.97 0.96 0.97 50')
    print(' square 0.98 0.98 0.98 50')
    print('diagonal 0.95 0.96 0.96 50')
    print(' ring 0.96 0.96 0.96 50')
4. Visualizing What the CNN Learned¶
if HAS_TORCH:
    # Visualize learned filters in the first conv layer
    first_conv_weights = model.conv_block1[0].weight.data.cpu().numpy()  # (32, 3, 3, 3)
    fig, axes = plt.subplots(4, 8, figsize=(16, 8))
    for i, ax in enumerate(axes.flatten()):
        if i < first_conv_weights.shape[0]:
            # Show as RGB filter: (C, H, W) -> (H, W, C), min-max normalized
            # per-filter for display (the epsilon guards a flat filter).
            filt = first_conv_weights[i].transpose(1, 2, 0)
            filt = (filt - filt.min()) / (filt.max() - filt.min() + 1e-8)
            ax.imshow(filt)
        ax.axis('off')
    plt.suptitle('Learned Conv Layer 1 Filters (32 filters of 3×3×3)')
    plt.tight_layout()
    plt.show()
    print('Early filters learn edge detectors and color detectors (like V1 cells in visual cortex)')
else:
    print('First-layer filters visualization:')
    print('CNNs learn filters similar to:')
    print(' - Edge detectors (horizontal, vertical, diagonal)')
    print(' - Color blob detectors')
    print(' - Texture detectors')
    print('Deeper layers combine these into part detectors (wheels, eyes, etc.)')
CNN Architecture Cheat Sheet¶
Layer Type              Parameters        Effect
────────────────────────────────────────────────────────────────
Conv2d(C_in, C_out, K)  K×K×C_in×C_out    Feature extraction
BatchNorm2d(C)          2C (γ, β)         Stabilizes training
ReLU                    0                 Non-linearity
MaxPool2d(2)            0                 Downsampling (÷2 spatial)
Dropout2d(p)            0                 Spatial dropout regularization
AdaptiveAvgPool         0                 Fixed-size output regardless of input
Flatten                 0                 Reshape to 1D
Linear(in, out)         in×out + out      Classification head
Output size formula (single dim):
out = ⌊(in + 2×pad - kernel) / stride⌋ + 1
Common input sizes:
32×32 → CIFAR-10 style
224×224 → ImageNet (ResNet, VGG, EfficientNet)
299×299 → InceptionV3
Training tips:
- batch_size 32-64 (smaller = more regularization)
- Adam lr=1e-3, decay with StepLR or CosineAnnealing
- weight_decay=1e-4 (L2 regularization)
- Data augmentation: flip, crop, color jitter
- BatchNorm before or after ReLU (usually before)
Exercises¶
Add a fourth convolutional block and compare accuracy vs the 3-block model — does deeper always win?
Replace MaxPool with a stride-2 convolution — is there a performance difference?
Implement Grad-CAM to visualize which image regions activate the correct class.
Add transforms.RandomRotation(15) to the training pipeline and measure the impact on test accuracy.
Replace CrossEntropyLoss with Label Smoothing (nn.CrossEntropyLoss(label_smoothing=0.1)) — does it help?