LLM Fine-Tuning with LoRA and QLoRA
Fine-tuning a 7B-parameter model used to require 8× A100s. LoRA changed that: it adapts large models with <1% of trainable parameters, fitting on a single GPU or even a CPU. This notebook covers the theory and practice of LoRA/QLoRA fine-tuning.
1. Setup
Import required libraries and check hardware availability.
import numpy as np
import math
import sys
from typing import Optional, List, Dict
# --- PyTorch ---
try:
import torch
import torch.nn as nn
HAS_TORCH = True
print(f"PyTorch {torch.__version__} available")
except ImportError:
HAS_TORCH = False
    print("PyTorch not available - some cells will show patterns only")
# --- Transformers ---
try:
import transformers
from transformers import (
AutoTokenizer, AutoModelForCausalLM,
TrainingArguments, Trainer,
DataCollatorForLanguageModeling,
)
HAS_TRANSFORMERS = True
print(f"Transformers {transformers.__version__} available")
except ImportError:
HAS_TRANSFORMERS = False
    print("Transformers not available - will simulate outputs")
# --- PEFT ---
try:
import peft
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
HAS_PEFT = True
print(f"PEFT {peft.__version__} available")
except ImportError:
HAS_PEFT = False
    print("PEFT not available - will show code patterns and simulate outputs")
# --- Datasets ---
try:
from datasets import Dataset
HAS_DATASETS = True
print("Datasets library available")
except ImportError:
HAS_DATASETS = False
    print("Datasets not available - will use plain Python structures")
# --- Print available memory ---
if HAS_TORCH:
if torch.cuda.is_available():
device = torch.device("cuda")
total_mem = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
free_mem, _ = torch.cuda.mem_get_info()
free_mem_gb = free_mem / (1024 ** 3)
print(f"\nGPU: {torch.cuda.get_device_name(0)}")
print(f" Total VRAM : {total_mem:.1f} GB")
print(f" Free VRAM : {free_mem_gb:.1f} GB")
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
device = torch.device("mps")
print("\nApple MPS (Metal) backend available")
import psutil
ram_gb = psutil.virtual_memory().total / (1024 ** 3)
print(f" System RAM (shared with GPU): {ram_gb:.1f} GB")
else:
device = torch.device("cpu")
try:
import psutil
ram_gb = psutil.virtual_memory().total / (1024 ** 3)
avail_gb = psutil.virtual_memory().available / (1024 ** 3)
            print(f"\nCPU only - Total RAM: {ram_gb:.1f} GB, Available: {avail_gb:.1f} GB")
except ImportError:
print("\nCPU only")
print(f"Device selected: {device}")
else:
device = None
    print("\nNo torch - device: N/A")
2. Why Full Fine-Tuning Is Expensive
Before understanding LoRA, we need to appreciate why full fine-tuning is prohibitive.
Memory breakdown for training
During training you need to hold in memory:

| Component | Memory cost |
|---|---|
| Model weights | 2 bytes/param (fp16) |
| Gradients | 2 bytes/param (fp16) |
| Optimizer states | 12 bytes/param (Adam: fp32 copy + m + v) |
| Activations | Varies with batch size and sequence length |

Formula (fp16 mixed precision with Adam):

Total ≈ params × (2 + 2 + 12) bytes = params × 16 bytes
                 |   |    └── Adam: fp32 param copy (4) + m (4) + v (4)
                 |   └─────── fp16 gradients
                 └─────────── fp16 weights

For a 7B model: 7 × 10⁹ × 16 bytes = 112 GB - more than a single A100 80GB can hold, just for parameters, gradients, and optimizer state!
def compute_training_memory(params: int, dtype: str = "fp16") -> Dict[str, float]:
"""
Estimate GPU memory (GB) for training a model with Adam optimizer.
Assumptions:
- Mixed-precision training: weights + grads in `dtype`, optimizer states in fp32
- Adam keeps: fp32 param copy, first moment (m), second moment (v)
"""
bytes_map = {"fp32": 4, "fp16": 2, "bf16": 2, "int8": 1, "int4": 0.5}
bpp = bytes_map[dtype] # bytes per parameter for weights/grads
weights_gb = params * bpp / 1e9
gradients_gb = params * bpp / 1e9
# Adam in fp32: param_copy(4) + m(4) + v(4) = 12 bytes per param
optimizer_gb = params * 12 / 1e9
total_gb = weights_gb + gradients_gb + optimizer_gb
return {
"dtype": dtype,
"weights_GB": round(weights_gb, 2),
"gradients_GB": round(gradients_gb, 2),
"optimizer_GB": round(optimizer_gb, 2),
"total_GB": round(total_gb, 2),
}
def inference_memory(params: int, dtype: str) -> float:
"""Estimate inference-only memory (weights only, no optimizer)."""
bytes_map = {"fp32": 4, "fp16": 2, "bf16": 2, "int8": 1, "int4": 0.5}
return round(params * bytes_map[dtype] / 1e9, 2)
# ── GPT-2 (117M params) ────────────────────────────────────────────────────
gpt2_params = 117_000_000
print("=" * 62)
print(f"GPT-2 ({gpt2_params/1e6:.0f}M params)")
print("=" * 62)
for dtype in ["fp32", "fp16"]:
m = compute_training_memory(gpt2_params, dtype)
print(f" [{dtype:>4}] weights={m['weights_GB']:.2f} GB "
f"grads={m['gradients_GB']:.2f} GB "
f"optim={m['optimizer_GB']:.2f} GB "
          f"→ TOTAL={m['total_GB']:.2f} GB")
print()
# ── LLaMA-2-7B (7B params) ────────────────────────────────────────────────
llama_params = 7_000_000_000
print("=" * 62)
print(f"LLaMA-2-7B ({llama_params/1e9:.0f}B params)")
print("=" * 62)
for dtype in ["fp32", "fp16", "int8", "int4"]:
m = compute_training_memory(llama_params, dtype)
print(f" [{dtype:>4}] weights={m['weights_GB']:6.1f} GB "
f"grads={m['gradients_GB']:6.1f} GB "
f"optim={m['optimizer_GB']:6.1f} GB "
          f"→ TOTAL={m['total_GB']:6.1f} GB")
print()
print("Inference-only (weights only, no optimizer):")
print(f" {'dtype':<6} {'LLaMA-7B':>10}")
print(f" {'------':<6} {'--------':>10}")
for dtype in ["fp32", "fp16", "int8", "int4"]:
gb = inference_memory(llama_params, dtype)
print(f" {dtype:<6} {gb:>9.1f} GB")
print()
print("Key insight: even fp16 mixed-precision training needs ~112 GB - multiple A100s.")
print("4-bit QLoRA brings this down to ~4-5 GB - it fits on a consumer GPU!")
3. LoRA: Low-Rank Adaptation
The key insight: pre-trained weight matrices live in a high-dimensional space, but the task-specific update (ΔW) has a much lower intrinsic rank.
The Math
Given a pre-trained weight matrix W₀ ∈ ℝ^(d×k), instead of learning a full update ΔW ∈ ℝ^(d×k), LoRA decomposes it:

ΔW = B·A

where B ∈ ℝ^(d×r) and A ∈ ℝ^(r×k), with rank r ≪ min(d, k).

The modified forward pass becomes:

h = W₀·x + (α/r)·B·A·x

where α is a scaling hyperparameter (often set to 2r).
Initialization
A is initialized with a Gaussian (random noise) so the LoRA branch starts active
B is initialized to zero so ΔW = 0 at the start - identical to the pretrained model
Parameter savings
For a weight matrix of shape (4096 × 4096) with rank r=8:

| | Parameters |
|---|---|
| Original W | 4096 × 4096 = 16,777,216 |
| LoRA A + B | (4096×8) + (8×4096) = 65,536 |
| Compression ratio | 256× fewer parameters |
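To double-check the parameter savings above, the counts can be reproduced with a few lines of plain Python arithmetic:

```python
# Dense update vs low-rank update for a (4096 x 4096) weight with rank r=8
d = k = 4096
r = 8
full_update = d * k          # every entry of the dense delta-W
lora_update = d * r + r * k  # B is (d x r), A is (r x k)
print(full_update)                 # 16777216
print(lora_update)                 # 65536
print(full_update // lora_update)  # 256
```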
Where to inject LoRA
LoRA is typically injected into the attention projection matrices:

- q_proj (query) - most impactful
- v_proj (value) - most impactful
- Optionally: k_proj, o_proj, up_proj, down_proj

Feed-forward layers are less commonly targeted but can improve results.
4. LoRA from Scratch
Implement a minimal LoRALinear layer to understand the mechanics.
if HAS_TORCH:
class LoRALinear(nn.Module):
"""
Linear layer augmented with Low-Rank Adaptation (LoRA).
        Forward pass: y = W₀·x + (α/r)·B·A·x
        Only A and B are trained; W₀ is frozen.
"""
def __init__(self, in_features: int, out_features: int,
rank: int = 4, alpha: float = 1.0):
super().__init__()
self.linear = nn.Linear(in_features, out_features, bias=False)
self.lora_A = nn.Linear(in_features, rank, bias=False)
self.lora_B = nn.Linear(rank, out_features, bias=False)
self.alpha = alpha
self.rank = rank
            # Freeze the original weights - they will not accumulate gradients
self.linear.weight.requires_grad = False
# A ~ N(0, 0.02) so the LoRA branch is non-zero from the start
nn.init.normal_(self.lora_A.weight, std=0.02)
            # B = 0 so ΔW = B·A = 0 initially - identical to the pretrained model
nn.init.zeros_(self.lora_B.weight)
def forward(self, x: torch.Tensor) -> torch.Tensor:
# Original path (frozen) + LoRA path (trainable)
return self.linear(x) + (self.alpha / self.rank) * self.lora_B(self.lora_A(x))
def count_parameters(self) -> Dict[str, int]:
total = sum(p.numel() for p in self.parameters())
frozen = sum(p.numel() for p in self.parameters() if not p.requires_grad)
trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
return {"total": total, "frozen": frozen, "trainable": trainable}
    # ── Demonstration ─────────────────────────────────────────────────────
in_f, out_f, rank = 512, 512, 8
original_layer = nn.Linear(in_f, out_f, bias=False)
lora_layer = LoRALinear(in_f, out_f, rank=rank, alpha=16.0)
orig_params = sum(p.numel() for p in original_layer.parameters())
stats = lora_layer.count_parameters()
    print(f"Layer shape: ({in_f} × {out_f}), LoRA rank: {rank}")
print()
print(f" Original layer parameters : {orig_params:>10,}")
print(f" LoRA total parameters : {stats['total']:>10,}")
    print(f"    ├─ Frozen (W₀)      : {stats['frozen']:>10,}")
    print(f"    └─ Trainable (A + B): {stats['trainable']:>10,}")
print()
print(f" LoRA A shape: {tuple(lora_layer.lora_A.weight.shape)} "
          f"(in={in_f} → rank={rank})")
print(f" LoRA B shape: {tuple(lora_layer.lora_B.weight.shape)} "
          f"(rank={rank} → out={out_f})")
print()
compression = orig_params / stats['trainable']
    print(f"  Compression ratio: {compression:.1f}× "
          f"({stats['trainable'] / orig_params * 100:.2f}% of original)")
    # ── Forward pass verification ─────────────────────────────────────────
batch_size, seq_len = 2, 32
x = torch.randn(batch_size, seq_len, in_f)
lora_layer.eval()
with torch.no_grad():
out = lora_layer(x)
print()
print(f"Forward pass:")
print(f" Input shape : {tuple(x.shape)}")
print(f" Output shape : {tuple(out.shape)}")
print(f" Output mean : {out.mean().item():.6f}")
print(f" Output std : {out.std().item():.6f}")
print()
    print("Verification: B=0 init means output == W₀·x at epoch 0")
# Compare initial output to base linear (Wβ)
# Copy base weights into lora_layer.linear for direct comparison
lora_layer.linear.weight.data = original_layer.weight.data.clone()
    # Reinitialize B to zero to confirm ΔW = 0
nn.init.zeros_(lora_layer.lora_B.weight)
with torch.no_grad():
base_out = original_layer(x)
lora_out = lora_layer(x)
max_diff = (lora_out - base_out).abs().max().item()
    print(f"  Max difference (B=0): {max_diff:.2e} (should be ~0)")
else:
print("PyTorch not available. LoRA layer code pattern:")
print()
print("class LoRALinear(nn.Module):")
print(" def __init__(self, in_features, out_features, rank=4, alpha=1.0):")
print(" super().__init__()")
print(" self.linear = nn.Linear(in_features, out_features, bias=False)")
print(" self.lora_A = nn.Linear(in_features, rank, bias=False)")
print(" self.lora_B = nn.Linear(rank, out_features, bias=False)")
print(" self.linear.weight.requires_grad = False")
print(" nn.init.normal_(self.lora_A.weight, std=0.02)")
print(" nn.init.zeros_(self.lora_B.weight)")
print()
print(" def forward(self, x):")
print(" return self.linear(x) + (self.alpha / self.rank) * self.lora_B(self.lora_A(x))")
print()
print("Simulated parameter counts for (512 Γ 512) layer with rank=8:")
print(f" Original: {512*512:>10,} params")
print(f" LoRA A+B: {512*8 + 8*512:>10,} params")
    print(f"  Compression: {512*512 / (512*8 + 8*512):.1f}×")
5. LoRA with the PEFT Library
The Hugging Face PEFT (Parameter-Efficient Fine-Tuning) library wraps any transformer model with LoRA adapters in a few lines. It handles:
Automatic injection into target modules
Freezing the base model
Saving/loading only adapter weights (~MBs instead of GBs)
Merging adapters back into the base model for deployment
def simulate_peft_output():
"""Print what PEFT output looks like when libraries are unavailable."""
print("PEFT code pattern (requires: pip install peft transformers):")
print()
print("from peft import LoraConfig, get_peft_model, TaskType")
print()
print("config = LoraConfig(")
print(' r=8, # LoRA rank')
print(' lora_alpha=32, # scaling = alpha/r = 4')
print(' target_modules=["q_proj", "v_proj"],# which layers to adapt')
print(' lora_dropout=0.1, # dropout on LoRA path')
print(' bias="none", # do not adapt biases')
print(' task_type=TaskType.CAUSAL_LM, # task type')
print(")")
print()
print("base_model = AutoModelForCausalLM.from_pretrained('gpt2')")
print("model = get_peft_model(base_model, config)")
print("model.print_trainable_parameters()")
print()
print("Simulated output for GPT-2 (124M) with LoRA r=8 on q_proj + v_proj:")
# Simulate parameter counts
# GPT-2 small: 12 layers, 768 hidden, q/v projections are 768Γ768
gpt2_total = 124_000_000
n_layers = 12
hidden = 768
r = 8
    # Each q_proj and v_proj: 768×768
    # LoRA per proj: (768×r + r×768) = 2 × 768 × r
lora_per_proj = 2 * hidden * r # A + B matrices
lora_total = n_layers * 2 * lora_per_proj # 2 projections per layer
pct = lora_total / gpt2_total * 100
print(f" trainable params: {lora_total:,} || "
f"all params: {gpt2_total:,} || "
f"trainable%: {pct:.4f}")
if HAS_TRANSFORMERS and HAS_PEFT:
print("Loading GPT-2 and applying LoRA via PEFT...")
print()
# Load base model
base_model = AutoModelForCausalLM.from_pretrained("gpt2")
# Define LoRA configuration
config = LoraConfig(
r=8,
lora_alpha=32,
target_modules=["c_attn"], # GPT-2 uses c_attn (combined qkv)
lora_dropout=0.1,
bias="none",
task_type=TaskType.CAUSAL_LM,
)
# Wrap the base model
peft_model = get_peft_model(base_model, config)
# Print trainable parameters
peft_model.print_trainable_parameters()
# Inspect the architecture
print()
print("LoRA config:")
print(f" r (rank) : {config.r}")
print(f" alpha : {config.lora_alpha}")
print(f" alpha/r ratio : {config.lora_alpha / config.r:.1f}")
print(f" target modules: {config.target_modules}")
print(f" dropout : {config.lora_dropout}")
print(f" bias : {config.bias}")
# Show which modules have LoRA adapters
print()
print("Modules with LoRA adapters:")
for name, module in peft_model.named_modules():
if "lora_A" in name and "weight" not in name:
print(f" {name}")
# Clean up to free memory
del base_model, peft_model
if HAS_TORCH and torch.cuda.is_available():
torch.cuda.empty_cache()
else:
simulate_peft_output()
6. Dataset Preparation for Instruction Tuning
Alpaca Format
The most common format for instruction fine-tuning follows the Alpaca template:
### Instruction:
{instruction}
### Input:
{input}
### Response:
{output}
When there is no additional input, the ### Input: section is omitted.
import random
random.seed(42)
np.random.seed(42)
# ── Alpaca prompt template ────────────────────────────────────────────────
PROMPT_TEMPLATE = """### Instruction:
{instruction}
### Input:
{input}
### Response:
{output}"""
PROMPT_NO_INPUT = """### Instruction:
{instruction}
### Response:
{output}"""
def format_alpaca(example: Dict) -> str:
"""Format a single example into the Alpaca prompt format."""
if example.get("input", "").strip():
return PROMPT_TEMPLATE.format(**example)
return PROMPT_NO_INPUT.format(**example)
# ── Synthetic sentiment classification dataset (50 examples) ──────────────
POSITIVE_PHRASES = [
"I absolutely love this product!",
"Best purchase I've made all year.",
"Exceptional quality and fast shipping.",
"Highly recommend to everyone.",
"Exceeded my expectations in every way.",
"Outstanding customer service.",
"Will definitely buy again.",
"Fantastic value for the price.",
"Works perfectly right out of the box.",
"Five stars, no hesitation.",
]
NEGATIVE_PHRASES = [
"Terrible quality, broke after one day.",
"Very disappointed with this purchase.",
"Does not work as described.",
"Complete waste of money.",
"The worst product I have ever bought.",
"Arrived damaged and customer support ignored me.",
"Would not recommend to anyone.",
"Returned immediately, total garbage.",
"Instructions were useless and product failed.",
"One star is too generous.",
]
NEUTRAL_PHRASES = [
"The product is okay, nothing special.",
"It does what it says, nothing more.",
"Average quality for the price.",
"Some features are good, others lacking.",
"Acceptable but room for improvement.",
]
raw_dataset = []
for _ in range(18):
raw_dataset.append({
"instruction": "Classify the sentiment of the following product review as positive, negative, or neutral.",
"input": random.choice(POSITIVE_PHRASES),
"output": "positive",
})
for _ in range(18):
raw_dataset.append({
"instruction": "Classify the sentiment of the following product review as positive, negative, or neutral.",
"input": random.choice(NEGATIVE_PHRASES),
"output": "negative",
})
for _ in range(14):
raw_dataset.append({
"instruction": "Classify the sentiment of the following product review as positive, negative, or neutral.",
"input": random.choice(NEUTRAL_PHRASES),
"output": "neutral",
})
random.shuffle(raw_dataset)
print(f"Dataset size: {len(raw_dataset)} examples")
print()
print("Sample formatted prompt:")
print("-" * 50)
print(format_alpaca(raw_dataset[0]))
print("-" * 50)
# ── Tokenization ──────────────────────────────────────────────────────────
if HAS_TRANSFORMERS:
print("\nTokenizing dataset with GPT-2 tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token # GPT-2 has no pad token
def tokenize_example(example: Dict, max_length: int = 128) -> Dict:
text = format_alpaca(example)
tokens = tokenizer(
text,
truncation=True,
max_length=max_length,
padding="max_length",
return_tensors=None,
)
tokens["labels"] = tokens["input_ids"].copy()
return tokens
tokenized = [tokenize_example(ex) for ex in raw_dataset]
print(f" Tokenized {len(tokenized)} examples")
print(f" Input IDs shape: ({len(tokenized)}, {len(tokenized[0]['input_ids'])})")
print(f" Vocab size: {tokenizer.vocab_size:,}")
print()
# Show token length distribution
    # (length before padding - count non-pad tokens)
actual_lengths = [
sum(1 for tok in ex["input_ids"] if tok != tokenizer.pad_token_id)
for ex in tokenized
]
print(f" Token length stats (excluding padding):")
print(f" min={min(actual_lengths)}, "
f"max={max(actual_lengths)}, "
f"mean={np.mean(actual_lengths):.1f}")
if HAS_DATASETS:
hf_dataset = Dataset.from_list(tokenized)
hf_dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)
print(f"\n Train split: {len(hf_dataset['train'])} examples")
print(f" Test split : {len(hf_dataset['test'])} examples")
# Data collator
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False, # causal LM, not masked LM
)
print("\n DataCollatorForLanguageModeling (causal LM) ready")
else:
print("\nTransformers not available. Showing tokenization pattern:")
print()
print("tokenizer = AutoTokenizer.from_pretrained('gpt2')")
print("tokenizer.pad_token = tokenizer.eos_token")
print()
print("tokens = tokenizer(text, truncation=True, max_length=128,")
print(" padding='max_length', return_tensors=None)")
print("tokens['labels'] = tokens['input_ids'].copy()")
7. Training with PEFT Trainer
A complete fine-tuning script using Hugging Face Trainer with a PEFT model.
Key training arguments for memory efficiency

| Argument | Value | Why |
|---|---|---|
| fp16 | True | Half-precision training, ~2× memory saving |
| gradient_checkpointing | True | Recompute activations on the backward pass |
| per_device_train_batch_size | 4 | Small batch fits in memory |
| gradient_accumulation_steps | 4 | Effective batch = 4×4 = 16 |
| optim | "adamw_torch" | Standard AdamW optimizer |
def simulate_training_log():
"""Print what a LoRA fine-tuning training log looks like."""
print("Simulated training output (GPU not available):")
print()
print("trainable params: 294,912 || all params: 124,734,720 || "
"trainable%: 0.2364")
print()
header = f"{'Step':>6} {'Training Loss':>14} {'Epoch':>6}"
print(header)
print("-" * len(header))
np.random.seed(1)
base_loss = 3.8
for step in range(1, 12):
epoch = round(step * 3 / 11, 2)
loss = base_loss * np.exp(-0.15 * step) + np.random.normal(0, 0.02)
print(f"{step:>6} {loss:>14.4f} {epoch:>6.2f}")
print()
print("Training complete.")
print(" Runtime : ~4 min on A100, ~15 min on T4")
print(" Adapter size : ~1.2 MB (vs ~500 MB for full GPT-2)")
print(" Peak VRAM used : ~3.1 GB (full fine-tune would need ~8 GB)")
print()
# Memory savings comparison
print("Memory savings comparison (GPT-2 124M):")
print(f" {'Method':<25} {'VRAM':>8} {'Adapter size':>14}")
print(f" {'-'*25} {'-'*8} {'-'*14}")
print(f" {'Full fine-tuning':<25} {'~8 GB':>8} {'~500 MB':>14}")
print(f" {'LoRA (r=8, q+v)':<25} {'~3 GB':>8} {'~1.2 MB':>14}")
print(f" {'LoRA (r=4, q+v)':<25} {'~2 GB':>8} {'~0.6 MB':>14}")
if HAS_TRANSFORMERS and HAS_PEFT and HAS_DATASETS and HAS_TORCH:
if device is not None and str(device) != "cpu":
print("Setting up PEFT Trainer for LoRA fine-tuning...")
# Load fresh base model + tokenizer
tokenizer_ft = AutoTokenizer.from_pretrained("gpt2")
tokenizer_ft.pad_token = tokenizer_ft.eos_token
base_model_ft = AutoModelForCausalLM.from_pretrained("gpt2")
# Apply LoRA
lora_config = LoraConfig(
r=8,
lora_alpha=32,
target_modules=["c_attn"],
lora_dropout=0.1,
bias="none",
task_type=TaskType.CAUSAL_LM,
)
peft_model_ft = get_peft_model(base_model_ft, lora_config)
peft_model_ft.print_trainable_parameters()
# Training arguments optimized for memory efficiency
training_args = TrainingArguments(
output_dir="./lora_gpt2_sentiment",
num_train_epochs=3,
per_device_train_batch_size=4,
gradient_accumulation_steps=4, # effective batch size = 16
fp16=True, # half-precision
gradient_checkpointing=True, # trade compute for memory
learning_rate=2e-4,
lr_scheduler_type="cosine",
warmup_ratio=0.05,
logging_steps=1,
save_strategy="no",
report_to="none",
optim="adamw_torch",
)
trainer = Trainer(
model=peft_model_ft,
args=training_args,
train_dataset=hf_dataset["train"],
eval_dataset=hf_dataset["test"],
data_collator=data_collator,
)
print("\nStarting training...")
trainer.train()
print("Training complete!")
del base_model_ft, peft_model_ft, trainer
torch.cuda.empty_cache()
else:
print("No GPU available β showing training setup and simulated output.")
print()
print("Training arguments for memory-efficient fine-tuning:")
print()
print("training_args = TrainingArguments(")
print(' output_dir="./lora_gpt2_sentiment",')
print(' num_train_epochs=3,')
print(' per_device_train_batch_size=4,')
print(' gradient_accumulation_steps=4, # effective batch = 16')
print(' fp16=True, # half-precision')
print(' gradient_checkpointing=True, # recompute activations')
print(' learning_rate=2e-4,')
print(' lr_scheduler_type="cosine",')
print(' warmup_ratio=0.05,')
print(' optim="adamw_torch",')
print(")")
print()
simulate_training_log()
else:
simulate_training_log()
8. QLoRA: Quantized LoRA
QLoRA (Dettmers et al., 2023) combines 4-bit quantization of the base model with LoRA adapters, enabling fine-tuning of 65B-parameter models on a single 48GB GPU - or 7B models on a 6GB GPU.
Three innovations in QLoRA
NF4 (NormalFloat4) - a data type optimal for normally distributed weights. Unlike standard int4, NF4 places its levels so each carries equal probability mass, minimizing quantization error for weights that follow a bell curve.
Double quantization - quantize the quantization constants themselves. Each block of 64 weights has one scale factor; these scale factors are in turn quantized from 32-bit to 8-bit, saving roughly 0.37 bits per parameter.
Paged optimizers - use NVIDIA unified memory to page optimizer states to CPU RAM when the GPU is full, preventing OOM errors during long sequences.
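The double-quantization saving can be estimated with back-of-envelope arithmetic. This sketch assumes a 64-weight quantization block and a 256-entry second-level block (block sizes taken as assumptions from the QLoRA paper's setup, not from this notebook):

```python
# Overhead of storing quantization constants, in bits per parameter
block = 64    # weights per first-level block (one scale factor each)
block2 = 256  # first-level scales per second-level fp32 scale

single = 32 / block                         # one fp32 scale amortized over its block
double = 8 / block + 32 / (block * block2)  # int8 scales + fp32 second-level scale

print(f"single quantization: {single:.3f} bits/param")  # 0.500
print(f"double quantization: {double:.3f} bits/param")  # 0.127
print(f"saved:               {single - double:.3f} bits/param")  # 0.373
```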
Memory comparison: LLaMA-7B

| Method | VRAM | Notes |
|---|---|---|
| Full fine-tune fp32 | ~140 GB | Not feasible on a single GPU |
| Full fine-tune fp16 | ~112 GB | Still needs multiple A100 80GB GPUs |
| LoRA fp16 | 14–18 GB | Fits an A100 40GB or RTX 3090/4090 (24 GB) |
| QLoRA 4-bit | 4–5 GB | Fits an RTX 3090 or T4 (16 GB) easily |
# QLoRA configuration pattern
# Note: bitsandbytes requires a CUDA GPU; shown as a code pattern here
print("QLoRA setup (requires: pip install bitsandbytes transformers peft):")
print()
print("from transformers import BitsAndBytesConfig")
print()
print("# Step 1: Configure 4-bit quantization")
print("bnb_config = BitsAndBytesConfig(")
print(" load_in_4bit=True, # enable 4-bit loading")
print(' bnb_4bit_quant_type="nf4", # NormalFloat4 data type')
print(" bnb_4bit_compute_dtype=torch.bfloat16,# compute in bf16")
print(" bnb_4bit_use_double_quant=True, # quantize scale factors too")
print(")")
print()
print("# Step 2: Load model in 4-bit")
print("model = AutoModelForCausalLM.from_pretrained(")
print(' "meta-llama/Llama-2-7b-hf",')
print(" quantization_config=bnb_config,")
print(' device_map="auto",')
print(")")
print()
print("# Step 3: Prepare for k-bit training (important!)")
print("from peft import prepare_model_for_kbit_training")
print("model = prepare_model_for_kbit_training(model)")
print()
print("# Step 4: Apply LoRA on top of the quantized model")
print("lora_config = LoraConfig(")
print(" r=16,")
print(" lora_alpha=64,")
print(' target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],')
print(" lora_dropout=0.05,")
print(' bias="none",')
print(" task_type=TaskType.CAUSAL_LM,")
print(")")
print("model = get_peft_model(model, lora_config)")
print("model.print_trainable_parameters()")
print()
# ── Memory breakdown ──────────────────────────────────────────────────────
llama_7b = 7_000_000_000
lora_params = 4 * 32 * (4096 * 16 + 16 * 4096)  # q,k,v,o × 32 layers, r=16 (A+B per projection)
# 4-bit base weights, fp16 LoRA adapters
base_mem_4bit = llama_7b * 0.5 / 1e9 # 0.5 bytes per param in 4-bit
lora_mem = lora_params * 2 / 1e9 # fp16 for adapters
# Optimizer for LoRA only (Adam, fp32 copy + m + v = 12 bytes per param)
optim_mem = lora_params * 12 / 1e9
total_qlora = base_mem_4bit + lora_mem + optim_mem
print("Memory breakdown for QLoRA on LLaMA-2-7B (r=16, q+k+v+o):")
print(f" 4-bit base weights (frozen) : {base_mem_4bit:.2f} GB")
print(f" LoRA adapter weights (fp16) : {lora_mem:.3f} GB")
print(f" Adam optimizer states (fp32) : {optim_mem:.3f} GB")
print(f"  ─────────────────────────────")
print(f" TOTAL (approx, no activations): {total_qlora:.2f} GB")
print()
print(f"  Compare: full fp16 training would need ~{llama_7b*16/1e9:.0f} GB")
print(f"  QLoRA savings: ~{llama_7b*16/1e9 / total_qlora:.1f}× memory reduction")
if HAS_TORCH:
print()
    print("NF4 quantization illustration - comparing quantization grids:")
# NF4 uses quantile-based levels for normally distributed data
# Simulate what NF4 vs int4 levels look like
weights = np.random.normal(0, 1, 1000)
# int4: uniform levels in [-1, 1]
int4_levels = np.linspace(-1, 1, 16)
# NF4: quantile-based levels (equal probability mass between levels)
quantiles = np.linspace(0, 1, 17)
nf4_levels = np.quantile(weights, quantiles[:-1] + np.diff(quantiles) / 2)
nf4_levels = nf4_levels / np.max(np.abs(nf4_levels)) # normalize to [-1, 1]
# Quantization error comparison
def quantize(vals, levels):
clipped = np.clip(vals, levels.min(), levels.max())
return levels[np.argmin(np.abs(clipped[:, None] - levels[None, :]), axis=1)]
int4_q = quantize(weights / np.max(np.abs(weights)), int4_levels)
nf4_q = quantize(weights / np.max(np.abs(weights)), nf4_levels)
print(f" int4 quantization error (MSE): {np.mean((weights/np.max(np.abs(weights)) - int4_q)**2):.6f}")
print(f" NF4 quantization error (MSE): {np.mean((weights/np.max(np.abs(weights)) - nf4_q)**2):.6f}")
    print("  → NF4 has lower error for normally distributed weights")
9. Merging LoRA Weights
After fine-tuning, you have two options for deployment:

Option A: Keep adapters separate (recommended for flexibility)
- Base model stays unchanged
- Adapter checkpoint is tiny (~MBs)
- Load with PeftModel.from_pretrained(base_model, adapter_path)
- Can swap adapters at runtime for different tasks
- Slight inference overhead (an extra low-rank matmul on every adapted layer)

Option B: Merge adapters into base weights (recommended for production)
- model.merge_and_unload() computes W = W₀ + (α/r)·B·A and stores the result
- No runtime overhead - a single standard forward pass
- Cannot be un-merged (keep the adapter checkpoint as a backup)
- Required when deploying to inference engines that don't support PEFT
def demonstrate_merge_pattern():
"""Show the merge workflow as a code pattern."""
    print("Merging LoRA adapters into base model - code pattern:")
print()
print("# After training:")
print("# 1. Save adapter weights only (small checkpoint)")
print('peft_model.save_pretrained("./adapter_only")')
print("# Saves: adapter_config.json, adapter_model.safetensors")
print("# Size: ~1-10 MB vs 14 GB for the full model")
print()
print("# 2. Load and merge for deployment")
print('base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")')
print('peft_model = PeftModel.from_pretrained(base, "./adapter_only")')
    print("merged_model = peft_model.merge_and_unload()  # W = W₀ + (α/r)·B·A")
print()
print("# 3. Save the merged model")
print('merged_model.save_pretrained("./finetuned_model")')
print('tokenizer.save_pretrained("./finetuned_model")')
print()
print("# 4. Use exactly like any other HF model β no PEFT needed")
print('final = AutoModelForCausalLM.from_pretrained("./finetuned_model")')
if HAS_TORCH:  # the demo below uses the scratch LoRALinear, so only torch is required
print("Demonstrating merge with a simple LoRA model...")
print()
# Create a tiny linear model to demonstrate merge_and_unload equivalence
in_f, out_f, r = 64, 64, 4
lora_demo = LoRALinear(in_f, out_f, rank=r, alpha=8.0)
    # Simulate some training - give lora_B non-zero values
with torch.no_grad():
nn.init.normal_(lora_demo.lora_B.weight, std=0.1)
# Test input
torch.manual_seed(42)
x_test = torch.randn(4, in_f)
# Output BEFORE merge
lora_demo.eval()
with torch.no_grad():
out_before = lora_demo(x_test)
    # Manual merge: W_merged = W₀ + (α/r)·B·A
with torch.no_grad():
W0 = lora_demo.linear.weight.data # (out_f, in_f)
A = lora_demo.lora_A.weight.data # (r, in_f)
B = lora_demo.lora_B.weight.data # (out_f, r)
scale = lora_demo.alpha / lora_demo.rank
delta_W = scale * (B @ A) # (out_f, in_f)
W_merged = W0 + delta_W
# Create merged linear layer
merged_layer = nn.Linear(in_f, out_f, bias=False)
merged_layer.weight.data = W_merged
# Output AFTER merge
with torch.no_grad():
out_after = merged_layer(x_test)
# Verify equivalence
max_diff = (out_before - out_after).abs().max().item()
print(f"Max difference before vs after merge: {max_diff:.2e}")
print(f"Outputs are identical: {max_diff < 1e-5}")
print()
    print(f"ΔW stats:")
    print(f"  ΔW shape       : {delta_W.shape}")
    print(f"  ΔW Frobenius   : {torch.norm(delta_W).item():.4f}")
    print(f"  W₀ Frobenius   : {torch.norm(W0).item():.4f}")
    print(f"  Relative change: {torch.norm(delta_W)/torch.norm(W0)*100:.2f}%")
print()
demonstrate_merge_pattern()
else:
demonstrate_merge_pattern()
print()
print("When to merge vs keep separate:")
print(" Keep separate : experimenting, A/B testing, multi-task (swap adapters)")
print(" Merge : production deployment, serving with standard runtimes")
10. Practical Tips Cheat Sheet

Hyperparameter   Typical range    Notes
────────────────────────────────────────────────────────────────────────────
rank (r)         4–64             Higher = more capacity, more memory
                                  r=8 is a good default start
                                  r=64 approaches full fine-tuning quality
alpha            r–4r             Keep the alpha/r ratio at ~2–4
                                  alpha=2r means scale factor = 2.0
                                  alpha=r means scale factor = 1.0
target_modules   q_proj, v_proj   Minimum effective set
                 + k_proj, o_proj Better results, ~2× more params
                 + up/down_proj   Max coverage for instruction tuning
lora_dropout     0.05–0.1         Regularization
                                  Set to 0 for very small datasets
                                  (<500 examples)
learning_rate    1e-4 – 3e-4      Higher than full fine-tuning (1e-5);
                                  LoRA params are randomly initialized
epochs           1–3              LLMs overfit quickly
                                  Use early stopping on validation loss
batch_size       4–32             Use gradient accumulation to reach an
(with GA)                         effective batch of 16–32
scheduler        cosine / linear  Cosine with warmup (5–10%) works well
────────────────────────────────────────────────────────────────────────────
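The alpha rows in the cheat sheet reduce to the effective scale α/r applied to the LoRA branch; a tiny helper makes the relationship explicit:

```python
def lora_scale(alpha: float, r: int) -> float:
    """Effective multiplier on the B·A path: alpha / r."""
    return alpha / r

for r in (4, 8, 16):
    # alpha=2r gives scale 2.0; alpha=r gives scale 1.0
    print(r, lora_scale(2 * r, r), lora_scale(r, r))
```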
Common pitfalls
- Forgetting prepare_model_for_kbit_training with QLoRA - leads to NaN losses
- Using fp16 with bfloat16 models - LLaMA uses bf16 internally; use compute_dtype=torch.bfloat16
- Catastrophic forgetting - fine-tune on too much data and the model forgets general capabilities; keep epochs low
- Data formatting errors - missing EOS token, wrong instruction template; always inspect 5-10 tokenized examples
- Rank too high - r=64 with small datasets leads to overfitting; start with r=8
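The data-formatting pitfall can be caught with a scripted sanity check. This is a minimal sketch using toy token IDs (pad=0 and eos=2 are made-up values; real IDs come from the tokenizer - and since this notebook sets GPT-2's pad token to EOS, padding must be stripped before checking):

```python
def ends_with_eos(input_ids, eos_token_id, pad_token_id):
    """Return True if the last non-padding token is the EOS token."""
    ids = [t for t in input_ids if t != pad_token_id]
    return bool(ids) and ids[-1] == eos_token_id

# Toy examples with hypothetical IDs: pad=0, eos=2
print(ends_with_eos([5, 7, 2, 0, 0], eos_token_id=2, pad_token_id=0))  # True
print(ends_with_eos([5, 7, 9, 0, 0], eos_token_id=2, pad_token_id=0))  # False
```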
11. When to Use What

| Method | GPU Memory | Training Speed | Quality | Use Case |
|---|---|---|---|---|
| Full fine-tuning | 40–80 GB | Slowest | Best | Large budget, highest quality |
| LoRA | 8–16 GB | Fast | Near-full | Most cases |
| QLoRA | 4–8 GB | Medium | Slight loss | Consumer GPU / Colab |
| Prompt tuning | <1 GB | Fastest | Limited | Very constrained setups |
Decision flowchart

Have access to multiple A100/H100s?
├── Yes → Full fine-tuning (maximum quality)
└── No
    ├── Single A100 (40/80 GB) or RTX 4090 (24 GB)?
    │   └── LoRA (fp16) - best quality/memory tradeoff
    ├── T4 (16 GB) or RTX 3090 (24 GB)?
    │   └── QLoRA (4-bit) - can fine-tune 7B–13B models
    ├── Colab free tier / RTX 3060 (8–12 GB)?
    │   └── QLoRA on 3B–7B models, or LoRA on GPT-2 / small models
    └── CPU only?
        └── Prompt engineering / RAG - fine-tuning not practical
Model size guidelines for QLoRA on consumer hardware

| Model size | Min VRAM | Recommended GPU |
|---|---|---|
| 1–3B | 4 GB | RTX 3060 / T4 |
| 7B | 6–8 GB | RTX 3080 / T4 16GB |
| 13B | 10–12 GB | RTX 3090 / A10G |
| 30B | 20–24 GB | RTX 4090 / A5000 |
| 70B | 40–48 GB | A100 40GB (tight) |
12. Exercises
Exercise 1: Rank ablation study
Experiment with rank r ∈ {4, 8, 16, 32} and measure:
Number of trainable parameters
Validation loss after 3 epochs
Training time per epoch
Plot trainable parameters vs validation loss to find the sweet spot.
results = []
for r in [4, 8, 16, 32]:
config = LoraConfig(r=r, lora_alpha=2*r, target_modules=["c_attn"],
bias="none", task_type=TaskType.CAUSAL_LM)
model = get_peft_model(AutoModelForCausalLM.from_pretrained("gpt2"), config)
# ... train and evaluate ...
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
results.append({"r": r, "trainable": trainable, "val_loss": val_loss})
Exercise 2: Target module comparison
Apply LoRA to different module combinations and compare:

- ["c_attn"] - query+key+value combined (GPT-2)
- ["c_attn", "c_proj"] - attention + output projection
- ["c_attn", "c_proj", "mlp.c_fc", "mlp.c_proj"] - attention + MLP

Expected: more modules → slightly better quality, more parameters.
Exercise 3: Early stopping on perplexity
Implement an early-stopping callback (a TrainerCallback) that monitors validation perplexity:
from transformers import TrainerCallback
class PerplexityEarlyStopping(TrainerCallback):
def __init__(self, patience=3):
self.patience = patience
self.best_ppl = float('inf')
self.counter = 0
def on_evaluate(self, args, state, control, metrics, **kwargs):
ppl = math.exp(metrics["eval_loss"])
if ppl < self.best_ppl:
self.best_ppl = ppl
self.counter = 0
else:
self.counter += 1
if self.counter >= self.patience:
control.should_training_stop = True
return control
Exercise 4: Domain-specific fine-tuning
Create a synthetic dataset for a domain-specific task:
Option A - Code generation:
examples = [
{"instruction": "Write a Python function that sorts a list",
"input": "", "output": "def sort_list(lst): return sorted(lst)"},
# ... 50+ examples
]
Option B - Medical QA (synthetic):
examples = [
{"instruction": "Answer the medical question briefly",
"input": "What is hypertension?",
"output": "High blood pressure, defined as readings consistently above 130/80 mmHg."},
# ... 50+ examples
]
Fine-tune GPT-2 with LoRA (r=8) and measure domain-specific perplexity.
Exercise 5: Verify merge equivalence
Fine-tune a small LoRA model, then verify that merged outputs are identical:
# After fine-tuning peft_model:
test_input = tokenizer("Once upon a time", return_tensors="pt")
# Output from PEFT model (adapter active)
with torch.no_grad():
out_peft = peft_model(**test_input).logits
# Merge and output from merged model
merged = peft_model.merge_and_unload()
with torch.no_grad():
out_merged = merged(**test_input).logits
# Should be < 1e-5
print(f"Max diff: {(out_peft - out_merged).abs().max().item():.2e}")
assert (out_peft - out_merged).abs().max().item() < 1e-4, "Merge not equivalent!"
Expected result: Max difference < 1e-4 (small floating-point error only).