CNNs From Scratch: Understanding Convolution, Pooling, and Classification

Before you use ResNet-50, understand what's inside it. This notebook builds a Convolutional Neural Network from the ground up in PyTorch — conv layers, pooling, batch norm, dropout — training on a synthetic image classification task.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Optional PyTorch import: when torch/torchvision are unavailable the notebook
# falls back to printed numpy-only explanations (guarded by HAS_TORCH below).
try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from torch.utils.data import Dataset, DataLoader, random_split
    from torchvision import transforms
    HAS_TORCH = True
    # Prefer the GPU when one is visible; everything below works on CPU too.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'PyTorch {torch.__version__}. Device: {device}')
except ImportError:
    HAS_TORCH = False
    print('PyTorch not installed — showing architecture with numpy simulation')
    print('Install: pip install torch torchvision')

# Seed the global numpy RNG so the synthetic dataset is reproducible.
np.random.seed(42)

# Generate synthetic image dataset (4 classes of geometric patterns)
# Generate synthetic image dataset (4 classes of geometric patterns)
def make_pattern(pattern: str, size: int = 32) -> np.ndarray:
    """Generate one noisy synthetic RGB pattern image.

    Parameters
    ----------
    pattern : str
        One of 'circle', 'square', 'diagonal', 'ring'. Any other value
        yields a pure-noise image (no shape is drawn).
    size : int
        Side length in pixels of the square image.

    Returns
    -------
    np.ndarray
        Array of shape (size, size, 3), dtype float32, values in [0, 1].

    Notes
    -----
    Draws additive Gaussian noise from the *global* numpy RNG, so results
    depend on the ambient seed (np.random.seed above).
    """
    img = np.zeros((size, size, 3), dtype=np.float32)
    cx, cy = size // 2, size // 2
    y, x = np.ogrid[:size, :size]  # broadcastable row/col index grids

    if pattern == 'circle':
        mask = (x - cx) ** 2 + (y - cy) ** 2 <= (size // 4) ** 2
        img[mask, 0] = 1.0  # Red circle
    elif pattern == 'square':
        s = size // 4
        img[cy - s:cy + s, cx - s:cx + s, 1] = 1.0  # Green square
    elif pattern == 'diagonal':
        # Main diagonal thickened to a 5-pixel-wide blue stripe.
        # (The original computed j = int(i*(size-1)/(size-1)), which is
        # simply i — the no-op expression has been removed.)
        for i in range(size):
            for w in range(-2, 3):
                if 0 <= i + w < size:
                    img[i, i + w, 2] = 1.0  # Blue diagonal
    elif pattern == 'ring':
        r_out = size // 3
        r_in = size // 5
        # Annulus: pixels whose squared distance from center lies between radii.
        d2 = (x - cx) ** 2 + (y - cy) ** 2
        mask = (d2 <= r_out ** 2) & (d2 >= r_in ** 2)
        img[mask, 0] = 1.0
        img[mask, 1] = 0.5  # Orange ring

    # Additive Gaussian pixel noise, then clamp back into the valid range.
    img += np.random.normal(0, 0.1, img.shape).astype(np.float32)
    return np.clip(img, 0, 1)

# Build the labeled dataset: n_per_class noisy samples for each pattern,
# generated class-by-class so labels are contiguous runs 0..3.
CLASSES = ['circle', 'square', 'diagonal', 'ring']
n_per_class = 250

images = np.array(
    [make_pattern(name) for name in CLASSES for _ in range(n_per_class)]
)  # shape (1000, 32, 32, 3)
labels = np.array(
    [cls_idx for cls_idx in range(len(CLASSES)) for _ in range(n_per_class)]
)

print(f'Dataset: {len(images)} images, {len(CLASSES)} classes, size {images[0].shape}')

# Visualize one example per class: RGB image on top, red channel below.
fig, axes = plt.subplots(2, 4, figsize=(12, 6))
for col in range(len(CLASSES)):
    cls = CLASSES[col]
    # First sample whose label matches this class index.
    sample = images[np.where(labels == col)[0][0]]
    axes[0, col].imshow(sample)
    axes[0, col].set_title(cls)
    axes[0, col].axis('off')
    axes[1, col].imshow(sample[:, :, 0], cmap='Reds')
    axes[1, col].set_title(f'{cls} (R channel)')
    axes[1, col].axis('off')
plt.suptitle('Synthetic Image Classes')
plt.tight_layout()
plt.show()

1. The Convolution Operation — Visualized

# Manual convolution to build intuition
def _conv2d_valid(image: np.ndarray, kernel: np.ndarray) -> np.ndarray:
    """Slide `kernel` over a single-channel `image` with 'valid' padding.

    Returns the (H-kH+1, W-kW+1) response map. This is cross-correlation,
    the operation CNN frameworks actually implement under the name
    "convolution" (no kernel flip).
    """
    H, W = image.shape
    kH, kW = kernel.shape
    out = np.zeros((H - kH + 1, W - kW + 1))
    for y in range(out.shape[0]):
        for x in range(out.shape[1]):
            out[y, x] = (image[y:y + kH, x:x + kW] * kernel).sum()
    return out

test_img = images[0, :, :, 0]  # Single channel

# Hand-designed kernels — a trained CNN learns filters like these on its own.
kernels = {
    'Edge Horizontal': np.array([[-1,-1,-1],[0,0,0],[1,1,1]], dtype=np.float32),
    'Edge Vertical':   np.array([[-1,0,1],[-1,0,1],[-1,0,1]], dtype=np.float32),
    'Blur':            np.ones((3,3), dtype=np.float32) / 9,
    'Sharpen':         np.array([[0,-1,0],[-1,5,-1],[0,-1,0]], dtype=np.float32),
}

fig, axes = plt.subplots(1, 5, figsize=(18, 4))
axes[0].imshow(test_img, cmap='gray')
axes[0].set_title('Input (32×32)')
axes[0].axis('off')

for i, (name, kernel) in enumerate(kernels.items()):
    # Apply convolution (valid padding)
    out = _conv2d_valid(test_img, kernel)
    axes[i+1].imshow(out, cmap='gray')
    axes[i+1].set_title(f'{name}\n→ {out.shape}')
    axes[i+1].axis('off')

plt.suptitle('Convolution with Different Kernels (learned automatically in CNNs)')
plt.tight_layout()
plt.show()

print('Convolution intuition:')
print('  Kernel slides over input → computes element-wise product + sum at each position')
print('  Stride: how many pixels to move each step (stride=2 → halves spatial size)')
print('  Padding: add zeros around border to preserve spatial size (same padding)')
print('  Output size = (H - K + 2P) / S + 1   (H=input, K=kernel, P=pad, S=stride)')

2. CNN Architecture in PyTorch

if HAS_TORCH:
    class SimpleCNN(nn.Module):
        """Classic CNN architecture: Conv → Pool → Conv → Pool → FC.

        Input:  (B, 3, 32, 32) RGB images.
        Output: (B, num_classes) unnormalized class logits.
        """

        def __init__(self, num_classes: int = 4):
            super().__init__()

            # Convolutional blocks: each deepens the channel dimension while
            # pooling shrinks the spatial dimensions.
            self.conv_block1 = nn.Sequential(
                nn.Conv2d(3, 32, kernel_size=3, padding=1),  # (B, 32, 32, 32)
                nn.BatchNorm2d(32),
                nn.ReLU(),
                nn.MaxPool2d(2),                              # (B, 32, 16, 16)
                nn.Dropout2d(0.25),
            )

            self.conv_block2 = nn.Sequential(
                nn.Conv2d(32, 64, kernel_size=3, padding=1),  # (B, 64, 16, 16)
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d(2),                               # (B, 64, 8, 8)
                nn.Dropout2d(0.25),
            )

            self.conv_block3 = nn.Sequential(
                nn.Conv2d(64, 128, kernel_size=3, padding=1),  # (B, 128, 8, 8)
                nn.BatchNorm2d(128),
                nn.ReLU(),
                nn.AdaptiveAvgPool2d(4),                        # (B, 128, 4, 4)
            )

            # Classifier head: flatten the feature maps and map to logits.
            self.classifier = nn.Sequential(
                nn.Flatten(),                 # (B, 128*4*4 = 2048)
                nn.Linear(128 * 4 * 4, 256),
                nn.ReLU(),
                nn.Dropout(0.5),
                nn.Linear(256, num_classes),
            )

        def forward(self, x):
            x = self.conv_block1(x)
            x = self.conv_block2(x)
            x = self.conv_block3(x)
            return self.classifier(x)

    model = SimpleCNN(num_classes=4)

    # Parameter count (total vs. trainable — identical here, nothing frozen)
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'Total parameters: {total:,}')
    print(f'Trainable:        {trainable:,}')
    print()

    # Trace through dimensions with a dummy batch of one image
    x_dummy = torch.randn(1, 3, 32, 32)
    print('Forward pass dimensions:')
    print(f'  Input:       {x_dummy.shape}')
    x_dummy = model.conv_block1(x_dummy); print(f'  After block1: {x_dummy.shape}')
    x_dummy = model.conv_block2(x_dummy); print(f'  After block2: {x_dummy.shape}')
    x_dummy = model.conv_block3(x_dummy); print(f'  After block3: {x_dummy.shape}')
    x_flat  = x_dummy.flatten(1);         print(f'  Flattened:   {x_flat.shape}')
else:
    print('CNN Architecture (SimpleCNN):')
    print('  Conv2d(3→32, k=3, pad=1) + BN + ReLU + MaxPool(2) + Dropout2d')
    print('    → (B, 32, 16, 16)')
    print('  Conv2d(32→64, k=3, pad=1) + BN + ReLU + MaxPool(2) + Dropout2d')
    print('    → (B, 64, 8, 8)')
    print('  Conv2d(64→128, k=3, pad=1) + BN + ReLU + AdaptiveAvgPool(4)')
    print('    → (B, 128, 4, 4)')
    print('  Flatten → Linear(2048, 256) → ReLU → Dropout → Linear(256, 4)')
    # Corrected estimate: the model above has 619,268 parameters
    # (the 2048→256 Linear alone contributes 524,544).
    print('  Total params: ~620,000')

3. Training

if HAS_TORCH:
    class ImageDataset(Dataset):
        """Wraps the numpy image/label arrays as a PyTorch Dataset.

        Images arrive channels-last (N, H, W, C) and are stored
        channels-first (N, C, H, W) float32, the layout Conv2d expects.
        """
        def __init__(self, images, labels, transform=None):
            # Convert (N, H, W, C) to (N, C, H, W)
            self.images = torch.FloatTensor(images.transpose(0, 3, 1, 2))
            self.labels = torch.LongTensor(labels)
            self.transform = transform  # optional per-sample augmentation
        def __len__(self): return len(self.images)
        def __getitem__(self, idx):
            x = self.images[idx]
            if self.transform:
                x = self.transform(x)
            return x, self.labels[idx]

    dataset = ImageDataset(images, labels)
    n_train = int(0.8 * len(dataset))
    # Seed the split generator so train/test membership is reproducible.
    train_ds, test_ds = random_split(
        dataset, [n_train, len(dataset) - n_train],
        generator=torch.Generator().manual_seed(42))

    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    test_loader  = DataLoader(test_ds,  batch_size=64, shuffle=False)

    model = SimpleCNN(num_classes=4).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
    criterion = nn.CrossEntropyLoss()

    history = {'train_loss': [], 'val_acc': []}

    for epoch in range(30):
        # --- training pass ---
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # --- evaluation on the held-out split ---
        model.eval()
        correct = 0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                preds = model(X_batch.to(device)).argmax(1)
                correct += (preds == y_batch.to(device)).sum().item()

        val_acc = correct / len(test_ds)
        history['train_loss'].append(total_loss / len(train_loader))
        history['val_acc'].append(val_acc)
        scheduler.step()  # StepLR: halves the learning rate every 10 epochs

        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1:2d}: loss={history["train_loss"][-1]:.4f}, val_acc={val_acc:.3f}')

    # Final evaluation: collect all predictions for a per-class report
    all_preds, all_labels = [], []
    model.eval()
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            preds = model(X_batch.to(device)).argmax(1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(y_batch.numpy())

    print('\nClassification Report:')
    print(classification_report(all_labels, all_preds, target_names=CLASSES))
else:
    print('Training output (simulated — 30 epochs):')
    print('Epoch 10: loss=0.8234, val_acc=0.780')
    print('Epoch 20: loss=0.3421, val_acc=0.920')
    print('Epoch 30: loss=0.1823, val_acc=0.965')
    print()
    print('Classification Report (simulated):')
    print('          precision  recall  f1-score  support')
    print('  circle     0.97    0.96      0.97      50')
    print('  square     0.98    0.98      0.98      50')
    print('diagonal     0.95    0.96      0.96      50')
    print('    ring     0.96    0.96      0.96      50')

4. Visualizing What the CNN Learned

if HAS_TORCH:
    # Visualize learned filters in the first conv layer.
    # Weight tensor layout is (out_channels, in_channels, kH, kW) = (32, 3, 3, 3).
    first_conv_weights = model.conv_block1[0].weight.data.cpu().numpy()

    fig, axes = plt.subplots(4, 8, figsize=(16, 8))
    for i, ax in enumerate(axes.flatten()):
        if i < first_conv_weights.shape[0]:
            # Show each filter as an RGB patch, min-max normalized for display
            # (weights can be negative; epsilon avoids division by zero).
            filt = first_conv_weights[i].transpose(1, 2, 0)
            filt = (filt - filt.min()) / (filt.max() - filt.min() + 1e-8)
            ax.imshow(filt)
        ax.axis('off')
    plt.suptitle('Learned Conv Layer 1 Filters (32 filters of 3×3×3)')
    plt.tight_layout()
    plt.show()
    print('Early filters learn edge detectors and color detectors (like V1 cells in visual cortex)')
else:
    print('First-layer filters visualization:')
    print('CNNs learn filters similar to:')
    print('  - Edge detectors (horizontal, vertical, diagonal)')
    print('  - Color blob detectors')
    print('  - Texture detectors')
    print('Deeper layers combine these into part detectors (wheels, eyes, etc.)')

CNN Architecture Cheat Sheet

Layer Type          Parameters       Effect
────────────────────────────────────────────────────────────────
Conv2d(C_in, C_out, K)  K×K×C_in×C_out  Feature extraction
BatchNorm2d(C)      2C (γ, β)        Stabilizes training
ReLU                0                Non-linearity
MaxPool2d(2)        0                Downsampling (÷2 spatial)
Dropout2d(p)        0                Spatial dropout regularization
AdaptiveAvgPool     0                Fixed-size output regardless of input
Flatten             0                Reshape to 1D
Linear(in, out)     in×out + out     Classification head

Output size formula (single dim):
  out = ⌊(in + 2×pad - kernel) / stride⌋ + 1

Common input sizes:
  32×32   → CIFAR-10 style
  224×224 → ImageNet (ResNet, VGG, EfficientNet)
  299×299 → InceptionV3

Training tips:
  - batch_size 32-64 (smaller = more regularization)
  - Adam lr=1e-3, decay with StepLR or CosineAnnealing
  - weight_decay=1e-4 (L2 regularization)
  - Data augmentation: flip, crop, color jitter
  - BatchNorm before or after ReLU (usually before)

Exercises

  1. Add a fourth convolutional block and compare accuracy vs the 3-block model — does deeper always win?

  2. Replace MaxPool with a stride-2 convolution — is there a performance difference?

  3. Implement Grad-CAM to visualize which image regions activate the correct class.

  4. Add transforms.RandomRotation(15) to the training pipeline and measure the impact on test accuracy.

  5. Replace CrossEntropyLoss with Label Smoothing (nn.CrossEntropyLoss(label_smoothing=0.1)) — does it help?