import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPT2LMHeadModel
# Reproducibility: fix the RNG seeds for both torch and numpy so the
# random tensors / weights below are the same on every run.
torch.manual_seed(42)
np.random.seed(42)
# NOTE(review): the original success message was mojibake-corrupted and
# split across lines; reconstructed as a single checkmark message.
print("✅ Imports successful!")
Why Transformers Changed Everything¶
Paper: “Attention Is All You Need” (2017)
Key Innovation: Replace recurrence with attention!
Before Transformers (RNNs/LSTMs):
Sequential processing: t1 → t2 → t3 → t4
❌ Slow (can't parallelize)
❌ Vanishing gradients for long sequences
❌ Limited memory
Transformers:
Parallel processing: All tokens processed simultaneously
✅ Fast training (parallelizable)
✅ Long-range dependencies (attention)
✅ Scalable to billions of parameters
Impact:
BERT, GPT, T5, DALL-E, ChatGPT, Claude
Powers modern NLP, vision, multimodal AI
Enabled the AI revolution we're experiencing now
2. Positional Encoding¶
Problem: Attention has no notion of position!
“The cat sat on the mat” = “mat the on sat cat The” (same attention, different meaning)
Solution: Add positional information to embeddings
\[PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d}}\right), \qquad PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d}}\right)\]
Where:
\(pos\) = position in sequence
\(i\) = dimension index
\(d\) = embedding dimension
class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding (Vaswani et al., 2017).

    Adds a fixed, non-learned position signal to token embeddings:
        PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

    Args:
        d_model: Embedding dimension. Odd values are supported (the
            original code crashed for odd d_model because the cosine
            slice has one fewer column than div_term).
        max_len: Longest sequence length the table is precomputed for.
    """
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        # Precompute the (max_len, d_model) encoding table once.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # Frequency term: 10000^(-2i/d_model) for each even dimension index.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        # Even dimensions use sine.
        pe[:, 0::2] = torch.sin(position * div_term)
        # Odd dimensions use cosine. Slicing div_term keeps the shapes
        # aligned when d_model is odd (one fewer odd column than even).
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])
        # Add batch dimension: (1, max_len, d_model).
        pe = pe.unsqueeze(0)
        # Buffer: saved in state_dict and moved by .to(), but not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Input embeddings (batch, seq_len, d_model)
        Returns:
            x + positional encoding, same shape as x.
        """
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len, :]
        return x
# ---- Visualize the sinusoidal positional encodings as a heatmap ----
d_model = 64
max_len = 100
pe_layer = PositionalEncoding(d_model, max_len)

# The encodings live in the module's 'pe' buffer: (1, max_len, d_model).
pe = pe_layer.pe[0].numpy()  # (max_len, d_model)

# Heatmap: rows = embedding dimensions, columns = sequence positions.
plt.figure(figsize=(12, 6))
plt.imshow(pe.T, cmap='RdBu', aspect='auto')
plt.colorbar(label='Encoding Value')
plt.xlabel('Position in Sequence')
plt.ylabel('Embedding Dimension')
plt.title('Positional Encoding Visualization')
plt.tight_layout()
plt.show()

# NOTE(review): emoji/bullets below were mojibake-corrupted; reconstructed.
print("\n💡 Key Properties:")
print(" • Each position gets a unique encoding")
print(" • Relative positions can be computed (PE(pos+k) is a linear function of PE(pos))")
print(" • Works for sequences longer than training length")
print(" • Different frequencies capture different position scales")
3. Feedforward Network¶
After attention, transformers apply a feedforward network to each position:
Two linear layers with ReLU in between
Applied independently to each position
Expands then contracts dimension
class FeedForward(nn.Module):
    """
    Position-wise feed-forward network.

    Each position is transformed independently: project up from d_model
    to d_ff, apply ReLU and dropout, then project back down to d_model.
    """
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the FFN to x of shape (batch, seq_len, d_model)."""
        expanded = self.linear1(x).relu()   # (batch, seq_len, d_ff)
        expanded = self.dropout(expanded)
        return self.linear2(expanded)       # back to (batch, seq_len, d_model)
# Quick sanity check of the feed-forward block on a dummy batch.
d_model = 512
d_ff = 2048  # conventional choice: 4x d_model
ff = FeedForward(d_model, d_ff)

x = torch.randn(2, 10, d_model)  # (batch=2, seq_len=10, d_model=512)
output = ff(x)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print(f"Parameters: {sum(p.numel() for p in ff.parameters()):,}")
4. Transformer Encoder Layer¶
One encoder layer consists of:
Multi-head self-attention
Add & Norm (residual connection + layer normalization)
Feed-forward network
Add & Norm
class TransformerEncoderLayer(nn.Module):
    """
    One transformer encoder layer: multi-head self-attention followed by
    a position-wise feed-forward network, each wrapped in a residual
    connection plus post-layer-normalization (as in the original paper).
    """
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        # Sub-layer 1: multi-head self-attention.
        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout)
        # Sub-layer 2: position-wise feed-forward network.
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        # One LayerNorm per sub-layer.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """
        Args:
            x: Input (seq_len, batch, d_model) -- the default layout
               expected by nn.MultiheadAttention.
            mask: Optional attention mask.
        Returns:
            Tensor of shape (seq_len, batch, d_model).
        """
        # Self-attention: query = key = value = x.
        attended, _ = self.self_attn(x, x, x, attn_mask=mask)
        x = self.norm1(x + self.dropout(attended))      # add & norm
        # Position-wise feed-forward.
        transformed = self.feed_forward(x)
        x = self.norm2(x + self.dropout(transformed))   # add & norm
        return x
# Exercise a single encoder layer with random data.
d_model = 512
num_heads = 8
d_ff = 2048
encoder_layer = TransformerEncoderLayer(d_model, num_heads, d_ff)

# nn.MultiheadAttention's default layout: (seq_len, batch, d_model).
x = torch.randn(10, 2, d_model)
output = encoder_layer(x)

print(f"Input: {x.shape}")
print(f"Output: {output.shape}")
print(f"\nEncoder layer parameters: {sum(p.numel() for p in encoder_layer.parameters()):,}")
5. Complete Transformer Encoder¶
Assembling the Full Stack¶
A complete transformer encoder combines token embeddings, positional encoding, and a stack of \(N\) identical encoder layers. The token embedding lookup converts integer token IDs into dense vectors, which are scaled by \(\sqrt{d_{\text{model}}}\) to keep the magnitude comparable to the positional encodings that are added next. The output then passes through each encoder layer in sequence, with residual connections and layer normalization ensuring stable gradient flow even in deep stacks. The original “Attention Is All You Need” paper used \(N=6\) layers with \(d_{\text{model}}=512\) and 8 attention heads; modern models like BERT-large use 24 layers with \(d_{\text{model}}=1024\).
class TransformerEncoder(nn.Module):
    """
    Full transformer encoder: token embedding (scaled by sqrt(d_model)),
    sinusoidal positional encoding, dropout, and a stack of N identical
    encoder layers.
    """
    def __init__(self, vocab_size, d_model=512, num_heads=8,
                 d_ff=2048, num_layers=6, dropout=0.1, max_len=5000):
        super().__init__()
        # Token-ID -> dense vector lookup.
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Fixed sinusoidal position signal.
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        # N identical encoder layers.
        self.layers = nn.ModuleList(
            TransformerEncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        )
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def forward(self, x, mask=None):
        """
        Args:
            x: Token indices (batch, seq_len)
            mask: Optional attention mask passed to every layer.
        Returns:
            Encoder output of shape (seq_len, batch, d_model).
        """
        # Scale embeddings so their magnitude matches the position signal.
        hidden = self.embedding(x) * math.sqrt(self.d_model)  # (batch, seq, d_model)
        hidden = self.dropout(self.pos_encoding(hidden))
        # nn.MultiheadAttention expects (seq, batch, d_model) by default.
        hidden = hidden.transpose(0, 1)
        for layer in self.layers:
            hidden = layer(hidden, mask)
        return hidden
# Build a full 6-layer encoder and push a batch of token IDs through it.
vocab_size = 10000
encoder = TransformerEncoder(
    vocab_size=vocab_size,
    d_model=512,
    num_heads=8,
    d_ff=2048,
    num_layers=6
)

batch_size = 2
seq_len = 20
x = torch.randint(0, vocab_size, (batch_size, seq_len))
output = encoder(x)

print(f"Input (token indices): {x.shape}")
print(f"Output: {output.shape}")
print(f"\nTotal parameters: {sum(p.numel() for p in encoder.parameters()):,}")
6. Simple Classification with Transformer¶
From Encoder Representations to Predictions¶
A transformer encoder produces a contextualized representation for every position in the input sequence. To make a single prediction for the entire sequence (e.g., sentiment classification), we need a pooling strategy that reduces the sequence of vectors into a fixed-size representation. Common approaches include taking the mean across positions, using only the first token's representation (as BERT does with the [CLS] token), or applying learned attention pooling. The pooled vector is then fed through a linear classification head that maps it to class logits. The model below demonstrates this pattern with a small transformer followed by mean pooling and a nn.Linear layer.
class TransformerClassifier(nn.Module):
    """
    Sequence classifier: transformer encoder -> mean pooling over
    positions -> linear head producing class logits.
    """
    def __init__(self, vocab_size, num_classes, d_model=128,
                 num_heads=4, d_ff=512, num_layers=2):
        super().__init__()
        self.encoder = TransformerEncoder(
            vocab_size, d_model, num_heads, d_ff, num_layers
        )
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """
        Args:
            x: Token indices (batch, seq_len)
        Returns:
            Logits of shape (batch, num_classes).
        """
        encoded = self.encoder(x)        # (seq, batch, d_model)
        # Mean-pool across positions (dim 0 in this layout).
        pooled = encoded.mean(dim=0)     # (batch, d_model)
        return self.classifier(pooled)   # (batch, num_classes)
# Demo: binary sentiment classification on random token IDs.
vocab_size = 5000
num_classes = 2  # Positive / Negative
model = TransformerClassifier(vocab_size, num_classes)

batch_size = 4
seq_len = 30
x = torch.randint(0, vocab_size, (batch_size, seq_len))
logits = model(x)
predictions = torch.argmax(logits, dim=1)

print(f"Input shape: {x.shape}")
print(f"Output logits: {logits.shape}")
print(f"Predictions: {predictions}")
print(f"\nModel size: {sum(p.numel() for p in model.parameters()):,} parameters")
7. Using Pre-trained Transformers¶
Building transformers from scratch is educational, but in practice we use pre-trained models!
Popular Pre-trained Models:¶
BERT: Bidirectional encoder (understanding)
GPT: Autoregressive decoder (generation)
T5: Encoder-decoder (text-to-text)
RoBERTa, ALBERT, DistilBERT: BERT variants
Example 1: BERT for Feature Extraction¶
BERT (Bidirectional Encoder Representations from Transformers) is pre-trained on massive text corpora using two self-supervised objectives: masked language modeling and next-sentence prediction. As a result, its internal representations capture rich semantic and syntactic information about language. By passing a sentence through BERT and extracting the hidden state at the special [CLS] token, we obtain a 768-dimensional sentence embedding that can be used for downstream tasks like semantic search, clustering, or classification — often with minimal additional training. The code below loads bert-base-uncased via HuggingFace and extracts embeddings for a few example sentences.
# ---- Extract sentence embeddings from pre-trained BERT ----
print("Loading BERT...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # inference mode: disables dropout

sentences = [
    "The cat sat on the mat.",
    "I love deep learning and transformers!",
    "Neural networks are amazing."
]

print("\nProcessing sentences...\n")
for sent in sentences:
    # Tokenize into BERT's subword vocabulary.
    inputs = tokenizer(sent, return_tensors='pt', padding=True, truncation=True)
    # No gradients needed for feature extraction.
    with torch.no_grad():
        outputs = model(**inputs)
    # Hidden state at the [CLS] token serves as the sentence embedding.
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # (1, 768)
    print(f"Sentence: {sent}")
    print(f" Tokens: {tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])}")
    print(f" [CLS] embedding shape: {cls_embedding.shape}")
    print(f" Embedding norm: {cls_embedding.norm().item():.2f}")
    print()

# NOTE(review): emoji/bullets below were mojibake-corrupted; reconstructed.
print("\n💡 Use cases:")
print(" • Semantic search (compare sentence embeddings)")
print(" • Text classification (add classifier on top)")
print(" • Named entity recognition")
print(" • Question answering")
Example 2: GPT for Text Generation¶
GPT (Generative Pre-trained Transformer) is a decoder-only transformer trained to predict the next token given all previous tokens. Because it processes text left-to-right with causal (masked) attention, it naturally generates coherent continuations of any given prompt. The model.generate() API handles the autoregressive loop: at each step it feeds the current sequence through the model, samples or selects the next token, appends it, and repeats until a stop condition is met. Parameters like temperature (controls randomness), top_k, and top_p (nucleus sampling) let you trade off between creative and deterministic outputs.
# ---- Generate text continuations with pre-trained GPT-2 ----
print("Loading GPT-2...")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

# GPT-2 has no pad token by default; reuse EOS so generate() can pad.
tokenizer.pad_token = tokenizer.eos_token

prompts = [
    "Once upon a time,",
    "The future of artificial intelligence is",
    "Deep learning models like transformers"
]

print("\nGenerating text...\n")
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors='pt')
    # Autoregressive sampling; no gradients needed.
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            max_length=50,
            num_return_sequences=1,
            temperature=0.8,   # soften the next-token distribution
            do_sample=True,    # sample rather than greedy-decode
            pad_token_id=tokenizer.eos_token_id
        )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}")
    print(f"Generated: {generated_text}")
    print()

# NOTE(review): emoji/bullets below were mojibake-corrupted; reconstructed.
print("\n💡 Generation parameters:")
print(" • temperature: Controls randomness (higher = more creative)")
print(" • top_k: Sample from top k tokens")
print(" • top_p (nucleus): Sample from smallest set with cumulative prob > p")
print(" • num_beams: Beam search for better quality")
8. Fine-tuning Example¶
Adapting a Pre-trained Transformer to Your Task¶
Fine-tuning takes a model that has already learned general language understanding (like BERT) and specializes it for a specific downstream task by training on a smaller, task-specific dataset. The standard recipe is: (1) load the pre-trained weights, (2) attach a new classification head (nn.Linear) on top, (3) optionally freeze the base model layers initially, and (4) train end-to-end with a small learning rate (typically 1e-5 to 3e-5). Fine-tuning is far more data-efficient than training from scratch because the pre-trained encoder already captures grammar, semantics, and world knowledge — the classification head only needs to learn the mapping from those rich representations to your target labels.
class BERTClassifier(nn.Module):
    """
    Pre-trained BERT encoder topped with a dropout + linear head for
    sequence classification.
    """
    def __init__(self, num_classes):
        super().__init__()
        # Pre-trained backbone.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Task-specific classification head.
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, num_classes)  # 768 = BERT-base hidden size

    def forward(self, input_ids, attention_mask):
        """
        Args:
            input_ids: Token IDs (batch, seq_len)
            attention_mask: 1 for real tokens, 0 for padding (batch, seq_len)
        Returns:
            Class logits of shape (batch, num_classes).
        """
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output is BERT's tanh-dense projection of the [CLS] state.
        pooled = bert_out.pooler_output            # (batch, 768)
        return self.classifier(self.dropout(pooled))
# ---- Forward pass through the (not-yet-fine-tuned) BERT classifier ----
num_classes = 3  # e.g., Positive, Negative, Neutral
classifier = BERTClassifier(num_classes)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
texts = [
    "I love this product!",
    "This is terrible.",
    "It's okay, nothing special."
]

# Tokenize the batch with padding so sequences align.
inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)

classifier.eval()
with torch.no_grad():
    logits = classifier(inputs['input_ids'], inputs['attention_mask'])
    predictions = torch.argmax(logits, dim=1)

print("Fine-tuning example:")
print(f"\nInput: {len(texts)} sentences")
print(f"Output logits: {logits.shape}")
print(f"Predictions: {predictions}")
print(f"\nModel parameters: {sum(p.numel() for p in classifier.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in classifier.parameters() if p.requires_grad):,}")
# NOTE(review): emoji below was mojibake-corrupted; reconstructed.
print("\n💡 Fine-tuning tips:")
print(" 1. Freeze BERT layers initially, train classifier")
print(" 2. Then unfreeze and fine-tune with small learning rate (1e-5)")
print(" 3. Use warmup for learning rate")
print(" 4. Monitor for overfitting (early stopping)")
print(" 5. Use gradient accumulation for large batches")
9. Transformer Architecture Diagram¶
The Big Picture¶
The diagram below summarizes the complete transformer encoder architecture. Data enters as token IDs, passes through an embedding layer and positional encoding, then flows through \(N\) identical blocks. Each block applies multi-head self-attention (so every position can attend to every other position), followed by a position-wise feed-forward network, with residual connections and layer normalization wrapping each sub-layer. This architecture is the foundation of BERT, GPT, T5, and virtually every large language model in production today. The elegance of the design lies in its simplicity and parallelizability — there are no recurrent connections, so the entire sequence can be processed simultaneously on modern hardware.
# NOTE(review): the original box-drawing characters were wholesale
# mojibake-corrupted; the diagram below is reconstructed from the
# readable labels, preserving the same structure and text.
print("""
╔═══════════════════════════════════════════════════════════════╗
║                   TRANSFORMER ARCHITECTURE                    ║
╠═══════════════════════════════════════════════════════════════╣
║                                                               ║
║  Input Tokens                                                 ║
║       ↓                                                       ║
║  Token Embedding + Positional Encoding                        ║
║       ↓                                                       ║
║  ┌─────────────────────────────────────┐                      ║
║  │        ENCODER (×N layers)          │                      ║
║  │  ┌─────────────────────────────┐    │                      ║
║  │  │ Multi-Head Self-Attention   │    │                      ║
║  │  └─────────────────────────────┘    │                      ║
║  │               ↓                     │                      ║
║  │  ┌─────────────────────────────┐    │                      ║
║  │  │   Add & Norm (Residual)     │    │                      ║
║  │  └─────────────────────────────┘    │                      ║
║  │               ↓                     │                      ║
║  │  ┌─────────────────────────────┐    │                      ║
║  │  │   Feed-Forward Network      │    │                      ║
║  │  └─────────────────────────────┘    │                      ║
║  │               ↓                     │                      ║
║  │  ┌─────────────────────────────┐    │                      ║
║  │  │   Add & Norm (Residual)     │    │                      ║
║  │  └─────────────────────────────┘    │                      ║
║  └─────────────────────────────────────┘                      ║
║       ↓                                                       ║
║  Task-Specific Head                                           ║
║  (Classification, Generation, etc.)                           ║
║       ↓                                                       ║
║  Output                                                       ║
║                                                               ║
╠═══════════════════════════════════════════════════════════════╣
║  Key Components:                                              ║
║  • Multi-Head Attention: Parallel attention mechanisms        ║
║  • Residual Connections: Skip connections for gradient flow   ║
║  • Layer Normalization: Stabilize training                    ║
║  • Feed-Forward: Position-wise transformation                 ║
║  • Positional Encoding: Add position information              ║
╚═══════════════════════════════════════════════════════════════╝
""")
Summary¶
✅ What You Learned¶
Transformer Architecture: Complete understanding of the revolutionary model
Positional Encoding: Adding position information to embeddings
Encoder Layers: Self-attention + feed-forward with residuals
Pre-trained Models: BERT for understanding, GPT for generation
Fine-tuning: Adapting pre-trained models to your tasks
Text Generation: Sampling strategies and parameters
π Key Architecture ComponentsΒΆ
# Encoder Layer
x = x + MultiHeadAttention(x, x, x) # Self-attention + residual
x = LayerNorm(x)
x = x + FeedForward(x) # FFN + residual
x = LayerNorm(x)
💡 Key Insights¶
Parallelization: All positions processed simultaneously
Scalability: Can train on massive datasets with billions of parameters
Transfer Learning: Pre-train once, fine-tune for many tasks
Versatility: Works for NLP, vision, audio, multimodal tasks
Residual Connections: Essential for training deep networks
Layer Normalization: Stabilizes training
🎯 Model Families¶
Encoder-only (BERT family):
Use: Understanding, classification, NER
Models: BERT, RoBERTa, ALBERT, DistilBERT
Decoder-only (GPT family):
Use: Text generation, few-shot learning
Models: GPT-2, GPT-3, GPT-4, LLaMA
Encoder-Decoder (T5 family):
Use: Translation, summarization, Q&A
Models: T5, BART, mT5
🚀 Next Steps¶
To continue learning:
Fine-tune BERT on your own dataset
Experiment with different model sizes
Try prompt engineering with GPT models
Explore vision transformers (ViT)
Learn about efficient transformers (ALBERT, DistilBERT)
Study recent advances (LLaMA, GPT-4, Claude)
📚 Additional Resources¶
Papers:
Tutorials:
Code:
🎉 Congratulations!¶
You've completed the Neural Networks & Transformers module!
You now understand:
✅ Neural network fundamentals
✅ Backpropagation and training
✅ PyTorch and modern frameworks
✅ Attention mechanisms
✅ Transformer architecture
✅ Pre-trained models and fine-tuning
This knowledge enables you to:
Build and train neural networks from scratch
Use state-of-the-art pre-trained models
Fine-tune models for your specific tasks
Understand how modern AI systems work
Read and implement research papers
You're now ready to:
Build your own AI applications
Contribute to open-source AI projects
Continue learning advanced topics
Apply transformers to real-world problems
Keep learning and building! 🚀
The transformer revolution has just begun, and you're now equipped to be part of it!