LLM Fine-Tuning with LoRA and QLoRAΒΆ

Fine-tuning a 7B parameter model used to require 8× A100s. LoRA changed that — it adapts large models with <1% of trainable parameters, fitting on a single GPU or even a CPU. This notebook covers the theory and practice of LoRA/QLoRA fine-tuning.

1. SetupΒΆ

Import required libraries and check hardware availability.

import numpy as np
import math
import sys
from typing import Optional, List, Dict

# --- PyTorch ---
try:
    import torch
    import torch.nn as nn
    HAS_TORCH = True
    print(f"PyTorch {torch.__version__} available")
except ImportError:
    HAS_TORCH = False
    print("PyTorch not available — some cells will show patterns only")

# --- Transformers ---
try:
    import transformers
    from transformers import (
        AutoTokenizer, AutoModelForCausalLM,
        TrainingArguments, Trainer,
        DataCollatorForLanguageModeling,
    )
    HAS_TRANSFORMERS = True
    print(f"Transformers {transformers.__version__} available")
except ImportError:
    HAS_TRANSFORMERS = False
    print("Transformers not available — will simulate outputs")

# --- PEFT ---
try:
    import peft
    from peft import LoraConfig, get_peft_model, TaskType, PeftModel
    HAS_PEFT = True
    print(f"PEFT {peft.__version__} available")
except ImportError:
    HAS_PEFT = False
    print("PEFT not available — will show code patterns and simulate outputs")

# --- Datasets ---
try:
    from datasets import Dataset
    HAS_DATASETS = True
    print("Datasets library available")
except ImportError:
    HAS_DATASETS = False
    print("Datasets not available — will use plain Python structures")

# --- Print available memory ---
# Select the best available device (cuda > mps > cpu) and report its memory.
if HAS_TORCH:
    if torch.cuda.is_available():
        device = torch.device("cuda")
        total_mem = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
        free_mem, _ = torch.cuda.mem_get_info()
        free_mem_gb = free_mem / (1024 ** 3)
        print(f"\nGPU: {torch.cuda.get_device_name(0)}")
        print(f"  Total VRAM : {total_mem:.1f} GB")
        print(f"  Free VRAM  : {free_mem_gb:.1f} GB")
    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        device = torch.device("mps")
        print("\nApple MPS (Metal) backend available")
        # Fix: guard the psutil import like the CPU branch does — previously a
        # missing psutil crashed this path while the CPU path degraded gracefully.
        try:
            import psutil
            ram_gb = psutil.virtual_memory().total / (1024 ** 3)
            print(f"  System RAM (shared with GPU): {ram_gb:.1f} GB")
        except ImportError:
            print("  (install psutil to report shared system RAM)")
    else:
        device = torch.device("cpu")
        try:
            import psutil
            ram_gb = psutil.virtual_memory().total / (1024 ** 3)
            avail_gb = psutil.virtual_memory().available / (1024 ** 3)
            print(f"\nCPU only — Total RAM: {ram_gb:.1f} GB, Available: {avail_gb:.1f} GB")
        except ImportError:
            print("\nCPU only")
    print(f"Device selected: {device}")
else:
    device = None
    print("\nNo torch — device: N/A")

2. Why Full Fine-Tuning Is ExpensiveΒΆ

Before understanding LoRA, we need to appreciate why full fine-tuning is prohibitive.

Memory breakdown for trainingΒΆ

During training you need to hold in memory:

Component

Memory cost

Model weights

params Γ— bytes_per_param

Gradients

params Γ— bytes_per_param (same dtype)

Optimizer states

params Γ— 8 bytes (Adam keeps fp32 copy + two moments)

Activations

Varies with batch size and sequence length

Formula (fp16 mixed precision with Adam):

Total β‰ˆ params Γ— (2 + 2 + 12) bytes  =  params Γ— 16 bytes
                  |   |   └── Adam: fp32 param copy (4) + m (4) + v (4)
                  |   └────── fp16 gradients
                  └────────── fp16 weights

For a 7B model: 7 × 10⁹ × 16 = 112 GB — more than an entire A100 80GB GPU can hold, before activations are even counted!

def compute_training_memory(params: int, dtype: str = "fp16") -> Dict[str, float]:
    """
    Estimate GPU memory (GB) required to train `params` parameters with Adam.

    Assumptions:
      - Mixed-precision training: weights and gradients stored in `dtype`,
        optimizer states kept in fp32.
      - Adam holds three fp32 tensors per parameter: a master copy of the
        weights, the first moment (m), and the second moment (v).

    Returns a dict with per-component and total memory in decimal GB (1e9 bytes).
    """
    dtype_to_bytes = {"fp32": 4, "fp16": 2, "bf16": 2, "int8": 1, "int4": 0.5}
    per_param = dtype_to_bytes[dtype]  # bytes per parameter for weights/grads

    weights = params * per_param / 1e9
    grads = weights  # gradients mirror the weight dtype, so same footprint
    # Adam (fp32): master copy (4 B) + m (4 B) + v (4 B) = 12 B per parameter
    adam_states = params * 12 / 1e9

    return {
        "dtype":        dtype,
        "weights_GB":   round(weights, 2),
        "gradients_GB": round(grads, 2),
        "optimizer_GB": round(adam_states, 2),
        "total_GB":     round(weights + grads + adam_states, 2),
    }


def inference_memory(params: int, dtype: str) -> float:
    """Estimate inference-only memory in GB: model weights alone, no optimizer."""
    per_param_bytes = {"fp32": 4, "fp16": 2, "bf16": 2, "int8": 1, "int4": 0.5}[dtype]
    return round(params * per_param_bytes / 1e9, 2)


# ── GPT-2 (117 M params) ───────────────────────────────────────────────────
# Side-by-side training-memory estimates for a small model (GPT-2) and a 7B
# model, using the helper functions defined above.
gpt2_params = 117_000_000
print("=" * 62)
print(f"GPT-2  ({gpt2_params/1e6:.0f}M params)")
print("=" * 62)
for dtype in ["fp32", "fp16"]:
    m = compute_training_memory(gpt2_params, dtype)
    print(f"  [{dtype:>4}]  weights={m['weights_GB']:.2f} GB  "
          f"grads={m['gradients_GB']:.2f} GB  "
          f"optim={m['optimizer_GB']:.2f} GB  "
          f"→ TOTAL={m['total_GB']:.2f} GB")

print()

# ── LLaMA-2-7B (7 B params) ───────────────────────────────────────────────
llama_params = 7_000_000_000
print("=" * 62)
print(f"LLaMA-2-7B  ({llama_params/1e9:.0f}B params)")
print("=" * 62)
for dtype in ["fp32", "fp16", "int8", "int4"]:
    m = compute_training_memory(llama_params, dtype)
    print(f"  [{dtype:>4}]  weights={m['weights_GB']:6.1f} GB  "
          f"grads={m['gradients_GB']:6.1f} GB  "
          f"optim={m['optimizer_GB']:6.1f} GB  "
          f"→ TOTAL={m['total_GB']:6.1f} GB")

print()
print("Inference-only (weights only, no optimizer):")
print(f"  {'dtype':<6}  {'LLaMA-7B':>10}")
print(f"  {'------':<6}  {'--------':>10}")
for dtype in ["fp32", "fp16", "int8", "int4"]:
    gb = inference_memory(llama_params, dtype)
    print(f"  {dtype:<6}  {gb:>9.1f} GB")

print()
# Fix: the table above shows fp32 training = 140 GB and fp16 = 112 GB; the
# original message wrongly attributed the 112 GB figure to fp32.
print("Key insight: fp16 mixed-precision training needs ~112 GB — requires multiple A100s.")
print("4-bit QLoRA brings this to ~4-5 GB — fits on a consumer GPU!")

3. LoRA: Low-Rank AdaptationΒΆ

The key insight: Pre-trained weight matrices live in a high-dimensional space, but the task-specific change (Ξ”W) has a much lower intrinsic rank.

The MathΒΆ

Given a pre-trained weight matrix Wβ‚€ ∈ ℝ^(dΓ—k), instead of learning a full update Ξ”W ∈ ℝ^(dΓ—k), LoRA decomposes it:

\[\Delta W = BA\]

where B ∈ ℝ^(dΓ—r) and A ∈ ℝ^(rΓ—k), with rank r β‰ͺ min(d, k).

The modified forward pass becomes:

\[h = W_0 x + \frac{\alpha}{r} BA \cdot x\]

where Ξ± is a scaling hyperparameter (often set to 2r).

InitializationΒΆ

  • A is initialized with a Gaussian (random noise) so the LoRA branch starts active

  • B is initialized to zero so Ξ”W = 0 at the start β€” identical to the pretrained model

Parameter savingsΒΆ

For a weight matrix of shape (4096 Γ— 4096) with rank r=8:

Parameters

Original W

4096 Γ— 4096 = 16,777,216

LoRA A + B

(4096Γ—8) + (8Γ—4096) = 65,536

Compression ratio

256Γ— fewer parameters

Where to inject LoRAΒΆ

LoRA is typically injected into attention projection matrices:

  • q_proj (query) β€” most impactful

  • v_proj (value) β€” most impactful

  • Optionally: k_proj, o_proj, up_proj, down_proj

Feed-forward layers are less commonly targeted but can improve results.

4. LoRA from ScratchΒΆ

Implement a minimal LoRALinear layer to understand the mechanics.

if HAS_TORCH:
    class LoRALinear(nn.Module):
        """
        Linear layer augmented with Low-Rank Adaptation (LoRA).

        Forward pass: y = Wβ‚€Β·x + (Ξ±/r)Β·BΒ·AΒ·x
        Only A and B are trained; Wβ‚€ is frozen.
        """
        def __init__(self, in_features: int, out_features: int,
                     rank: int = 4, alpha: float = 1.0):
            super().__init__()
            self.linear = nn.Linear(in_features, out_features, bias=False)
            self.lora_A  = nn.Linear(in_features, rank, bias=False)
            self.lora_B  = nn.Linear(rank, out_features, bias=False)
            self.alpha   = alpha
            self.rank    = rank

            # Freeze the original weights β€” they will not accumulate gradients
            self.linear.weight.requires_grad = False

            # A ~ N(0, 0.02) so the LoRA branch is non-zero from the start
            nn.init.normal_(self.lora_A.weight, std=0.02)
            # B = 0 so Ξ”W = BΒ·A = 0 initially β†’ identical to pretrained model
            nn.init.zeros_(self.lora_B.weight)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # Original path (frozen) + LoRA path (trainable)
            return self.linear(x) + (self.alpha / self.rank) * self.lora_B(self.lora_A(x))

        def count_parameters(self) -> Dict[str, int]:
            total    = sum(p.numel() for p in self.parameters())
            frozen   = sum(p.numel() for p in self.parameters() if not p.requires_grad)
            trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
            return {"total": total, "frozen": frozen, "trainable": trainable}


    # ── Demonstration ─────────────────────────────────────────────────────────
    in_f, out_f, rank = 512, 512, 8
    original_layer = nn.Linear(in_f, out_f, bias=False)
    lora_layer     = LoRALinear(in_f, out_f, rank=rank, alpha=16.0)

    orig_params  = sum(p.numel() for p in original_layer.parameters())
    stats        = lora_layer.count_parameters()

    print(f"Layer shape: ({in_f} Γ— {out_f}), LoRA rank: {rank}")
    print()
    print(f"  Original layer parameters  : {orig_params:>10,}")
    print(f"  LoRA total parameters       : {stats['total']:>10,}")
    print(f"    β”œβ”€ Frozen (Wβ‚€)            : {stats['frozen']:>10,}")
    print(f"    └─ Trainable (A + B)      : {stats['trainable']:>10,}")
    print()
    print(f"  LoRA A shape: {tuple(lora_layer.lora_A.weight.shape)}  "
          f"(in={in_f} β†’ rank={rank})")
    print(f"  LoRA B shape: {tuple(lora_layer.lora_B.weight.shape)}  "
          f"(rank={rank} β†’ out={out_f})")
    print()
    compression = orig_params / stats['trainable']
    print(f"  Compression ratio: {compression:.1f}Γ—  "
          f"({stats['trainable'] / orig_params * 100:.2f}% of original)")

    # ── Forward pass verification ──────────────────────────────────────────────
    batch_size, seq_len = 2, 32
    x = torch.randn(batch_size, seq_len, in_f)

    lora_layer.eval()
    with torch.no_grad():
        out = lora_layer(x)

    print()
    print(f"Forward pass:")
    print(f"  Input shape  : {tuple(x.shape)}")
    print(f"  Output shape : {tuple(out.shape)}")
    print(f"  Output mean  : {out.mean().item():.6f}")
    print(f"  Output std   : {out.std().item():.6f}")
    print()
    print("Verification: B=0 init means output β‰ˆ Wβ‚€Β·x at epoch 0")

    # Compare initial output to base linear (Wβ‚€)
    # Copy base weights into lora_layer.linear for direct comparison
    lora_layer.linear.weight.data = original_layer.weight.data.clone()
    # Reinitialize B to zero to confirm Ξ”W=0
    nn.init.zeros_(lora_layer.lora_B.weight)

    with torch.no_grad():
        base_out = original_layer(x)
        lora_out = lora_layer(x)
        max_diff = (lora_out - base_out).abs().max().item()

    print(f"  Max difference (B=0): {max_diff:.2e}  ← should be ~0")

else:
    print("PyTorch not available. LoRA layer code pattern:")
    print()
    print("class LoRALinear(nn.Module):")
    print("    def __init__(self, in_features, out_features, rank=4, alpha=1.0):")
    print("        super().__init__()")
    print("        self.linear = nn.Linear(in_features, out_features, bias=False)")
    print("        self.lora_A = nn.Linear(in_features, rank, bias=False)")
    print("        self.lora_B = nn.Linear(rank, out_features, bias=False)")
    print("        self.linear.weight.requires_grad = False")
    print("        nn.init.normal_(self.lora_A.weight, std=0.02)")
    print("        nn.init.zeros_(self.lora_B.weight)")
    print()
    print("    def forward(self, x):")
    print("        return self.linear(x) + (self.alpha / self.rank) * self.lora_B(self.lora_A(x))")
    print()
    print("Simulated parameter counts for (512 Γ— 512) layer with rank=8:")
    print(f"  Original:    {512*512:>10,}  params")
    print(f"  LoRA A+B:    {512*8 + 8*512:>10,}  params")
    print(f"  Compression: {512*512 / (512*8 + 8*512):.1f}Γ—")

5. LoRA with the PEFT LibraryΒΆ

The Hugging Face PEFT (Parameter-Efficient Fine-Tuning) library wraps any transformer model with LoRA adapters in a few lines. It handles:

  • Automatic injection into target modules

  • Freezing the base model

  • Saving/loading only adapter weights (~MBs instead of GBs)

  • Merging adapters back into the base model for deployment

def simulate_peft_output():
    """Print the PEFT usage pattern plus a simulated parameter-count summary."""
    # Static part of the output: the code recipe the reader would run.
    pattern = (
        "PEFT code pattern (requires: pip install peft transformers):",
        "",
        "from peft import LoraConfig, get_peft_model, TaskType",
        "",
        "config = LoraConfig(",
        '    r=8,                               # LoRA rank',
        '    lora_alpha=32,                     # scaling = alpha/r = 4',
        '    target_modules=["q_proj", "v_proj"],# which layers to adapt',
        '    lora_dropout=0.1,                  # dropout on LoRA path',
        '    bias="none",                       # do not adapt biases',
        '    task_type=TaskType.CAUSAL_LM,      # task type',
        ")",
        "",
        "base_model = AutoModelForCausalLM.from_pretrained('gpt2')",
        "model = get_peft_model(base_model, config)",
        "model.print_trainable_parameters()",
        "",
        "Simulated output for GPT-2 (124M) with LoRA r=8 on q_proj + v_proj:",
    )
    for line in pattern:
        print(line)

    # GPT-2 small: 12 layers, hidden size 768; q_proj and v_proj are 768×768.
    total_params = 124_000_000
    num_layers, hidden_size, r = 12, 768, 8
    # One adapter = A (768×r) + B (r×768) parameters.
    adapter_params = 2 * hidden_size * r
    # Two adapted projections (q and v) in each of the 12 layers.
    trainable = num_layers * 2 * adapter_params
    share = trainable / total_params * 100
    print(f"  trainable params: {trainable:,} || "
          f"all params: {total_params:,} || "
          f"trainable%: {share:.4f}")


if HAS_TRANSFORMERS and HAS_PEFT:
    print("Loading GPT-2 and applying LoRA via PEFT...")
    print()

    # Load base model
    # (downloads weights on first run; afterwards served from the local cache)
    base_model = AutoModelForCausalLM.from_pretrained("gpt2")

    # Define LoRA configuration
    config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["c_attn"],  # GPT-2 uses c_attn (combined qkv)
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )

    # Wrap the base model
    peft_model = get_peft_model(base_model, config)

    # Print trainable parameters
    peft_model.print_trainable_parameters()

    # Inspect the architecture
    print()
    print("LoRA config:")
    print(f"  r (rank)      : {config.r}")
    print(f"  alpha         : {config.lora_alpha}")
    print(f"  alpha/r ratio : {config.lora_alpha / config.r:.1f}")
    print(f"  target modules: {config.target_modules}")
    print(f"  dropout       : {config.lora_dropout}")
    print(f"  bias          : {config.bias}")

    # Show which modules have LoRA adapters
    print()
    print("Modules with LoRA adapters:")
    for name, module in peft_model.named_modules():
        # Print each lora_A adapter site once; the "weight" check looks
        # defensive — named_modules yields modules, not parameters.
        if "lora_A" in name and "weight" not in name:
            print(f"  {name}")

    # Clean up to free memory
    del base_model, peft_model
    if HAS_TORCH and torch.cuda.is_available():
        torch.cuda.empty_cache()

else:
    simulate_peft_output()

6. Dataset Preparation for Instruction TuningΒΆ

Alpaca FormatΒΆ

The most common format for instruction fine-tuning follows the Alpaca template:

### Instruction:
{instruction}

### Input:
{input}

### Response:
{output}

When there is no additional input, the ### Input: section is omitted.

import random
random.seed(42)
np.random.seed(42)

# ── Alpaca prompt template ─────────────────────────────────────────────────
# Full template: instruction plus an auxiliary input context block.
PROMPT_TEMPLATE = """### Instruction:
{instruction}

### Input:
{input}

### Response:
{output}"""

# Variant used when the example carries no auxiliary input.
PROMPT_NO_INPUT = """### Instruction:
{instruction}

### Response:
{output}"""


def format_alpaca(example: Dict) -> str:
    """Render one example dict as an Alpaca-format prompt string.

    The full template is used when `input` is present and non-blank; otherwise
    the "### Input:" section is dropped (str.format ignores unused keys).
    """
    has_context = bool(example.get("input", "").strip())
    template = PROMPT_TEMPLATE if has_context else PROMPT_NO_INPUT
    return template.format(**example)


# ── Synthetic sentiment classification dataset (50 examples) ──────────────
POSITIVE_PHRASES = [
    "I absolutely love this product!",
    "Best purchase I've made all year.",
    "Exceptional quality and fast shipping.",
    "Highly recommend to everyone.",
    "Exceeded my expectations in every way.",
    "Outstanding customer service.",
    "Will definitely buy again.",
    "Fantastic value for the price.",
    "Works perfectly right out of the box.",
    "Five stars, no hesitation.",
]
NEGATIVE_PHRASES = [
    "Terrible quality, broke after one day.",
    "Very disappointed with this purchase.",
    "Does not work as described.",
    "Complete waste of money.",
    "The worst product I have ever bought.",
    "Arrived damaged and customer support ignored me.",
    "Would not recommend to anyone.",
    "Returned immediately, total garbage.",
    "Instructions were useless and product failed.",
    "One star is too generous.",
]
NEUTRAL_PHRASES = [
    "The product is okay, nothing special.",
    "It does what it says, nothing more.",
    "Average quality for the price.",
    "Some features are good, others lacking.",
    "Acceptable but room for improvement.",
]

# Build 18 positive + 18 negative + 14 neutral examples (50 total).
# The (pool, label, count) order matches the original so the seeded
# random.choice / random.shuffle sequence — and hence the dataset — is identical.
raw_dataset = []
for pool, label, count in (
    (POSITIVE_PHRASES, "positive", 18),
    (NEGATIVE_PHRASES, "negative", 18),
    (NEUTRAL_PHRASES, "neutral", 14),
):
    for _ in range(count):
        raw_dataset.append({
            "instruction": "Classify the sentiment of the following product review as positive, negative, or neutral.",
            "input": random.choice(pool),
            "output": label,
        })

random.shuffle(raw_dataset)
print(f"Dataset size: {len(raw_dataset)} examples")
print()
print("Sample formatted prompt:")
print("-" * 50)
print(format_alpaca(raw_dataset[0]))
print("-" * 50)
print("-" * 50)

# ── Tokenization ──────────────────────────────────────────────────────────
if HAS_TRANSFORMERS:
    print("\nTokenizing dataset with GPT-2 tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token

    def tokenize_example(example: Dict, max_length: int = 128) -> Dict:
        """Format one example as an Alpaca prompt and tokenize to a fixed length."""
        text = format_alpaca(example)
        tokens = tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            padding="max_length",
            return_tensors=None,  # keep plain Python lists, not tensors
        )
        # Causal LM: labels mirror input_ids (the model shifts them internally).
        # NOTE(review): labels are a verbatim copy, so loss is also taken on
        # pad (= eos) tokens; masking them to -100 is the usual refinement —
        # confirm whether that matters for this demo.
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    tokenized = [tokenize_example(ex) for ex in raw_dataset]

    print(f"  Tokenized {len(tokenized)} examples")
    print(f"  Input IDs shape: ({len(tokenized)}, {len(tokenized[0]['input_ids'])})")
    print(f"  Vocab size: {tokenizer.vocab_size:,}")
    print()

    # Show token length distribution
    # (before padding — count non-pad tokens)
    actual_lengths = [
        sum(1 for tok in ex["input_ids"] if tok != tokenizer.pad_token_id)
        for ex in tokenized
    ]
    print(f"  Token length stats (excluding padding):")
    print(f"    min={min(actual_lengths)}, "
          f"max={max(actual_lengths)}, "
          f"mean={np.mean(actual_lengths):.1f}")

    if HAS_DATASETS:
        # 90/10 train/test split, seeded for reproducibility.
        hf_dataset = Dataset.from_list(tokenized)
        hf_dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)
        print(f"\n  Train split: {len(hf_dataset['train'])} examples")
        print(f"  Test split : {len(hf_dataset['test'])} examples")

    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # causal LM, not masked LM
    )
    print("\n  DataCollatorForLanguageModeling (causal LM) ready")

else:
    print("\nTransformers not available. Showing tokenization pattern:")
    print()
    print("tokenizer = AutoTokenizer.from_pretrained('gpt2')")
    print("tokenizer.pad_token = tokenizer.eos_token")
    print()
    print("tokens = tokenizer(text, truncation=True, max_length=128,")
    print("                   padding='max_length', return_tensors=None)")
    print("tokens['labels'] = tokens['input_ids'].copy()")

7. Training with PEFT TrainerΒΆ

A complete fine-tuning script using Hugging Face Trainer with a PEFT model.

Key training arguments for memory efficiencyΒΆ

Argument

Value

Why

fp16=True

True

Half-precision training, ~2Γ— memory saving

gradient_checkpointing

True

Recompute activations on backward pass

per_device_train_batch_size

4

Small batch fits in memory

gradient_accumulation_steps

4

Effective batch = 4Γ—4 = 16

optim

"adamw_torch"

Standard optimizer

def simulate_training_log():
    """Print a simulated LoRA fine-tuning log for when no GPU is available."""
    print("Simulated training output (GPU not available):")
    print()
    print("trainable params: 294,912 || all params: 124,734,720 || "
          "trainable%: 0.2364")
    print()
    header = f"{'Step':>6}  {'Training Loss':>14}  {'Epoch':>6}"
    print(header)
    print("-" * len(header))

    # Deterministic synthetic curve: exponential decay plus small seeded noise.
    np.random.seed(1)
    start_loss = 3.8
    for step_idx in range(1, 12):
        epoch_frac = round(step_idx * 3 / 11, 2)
        step_loss = start_loss * np.exp(-0.15 * step_idx) + np.random.normal(0, 0.02)
        print(f"{step_idx:>6}  {step_loss:>14.4f}  {epoch_frac:>6.2f}")

    # Static wrap-up lines (runtime/adapter/VRAM figures are illustrative).
    for line in (
        "",
        "Training complete.",
        "  Runtime        : ~4 min on A100, ~15 min on T4",
        "  Adapter size   : ~1.2 MB  (vs ~500 MB for full GPT-2)",
        "  Peak VRAM used : ~3.1 GB  (full fine-tune would need ~8 GB)",
        "",
        "Memory savings comparison (GPT-2 124M):",
    ):
        print(line)
    print(f"  {'Method':<25}  {'VRAM':>8}  {'Adapter size':>14}")
    print(f"  {'-'*25}  {'-'*8}  {'-'*14}")
    print(f"  {'Full fine-tuning':<25}  {'~8 GB':>8}  {'~500 MB':>14}")
    print(f"  {'LoRA (r=8, q+v)':<25}  {'~3 GB':>8}  {'~1.2 MB':>14}")
    print(f"  {'LoRA (r=4, q+v)':<25}  {'~2 GB':>8}  {'~0.6 MB':>14}")


if HAS_TRANSFORMERS and HAS_PEFT and HAS_DATASETS and HAS_TORCH:
    if device is not None and str(device) != "cpu":
        print("Setting up PEFT Trainer for LoRA fine-tuning...")

        # Load fresh base model + tokenizer
        tokenizer_ft = AutoTokenizer.from_pretrained("gpt2")
        tokenizer_ft.pad_token = tokenizer_ft.eos_token
        base_model_ft = AutoModelForCausalLM.from_pretrained("gpt2")

        # Apply LoRA
        lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=["c_attn"],
            lora_dropout=0.1,
            bias="none",
            task_type=TaskType.CAUSAL_LM,
        )
        peft_model_ft = get_peft_model(base_model_ft, lora_config)
        peft_model_ft.print_trainable_parameters()

        # Training arguments optimized for memory efficiency
        training_args = TrainingArguments(
            output_dir="./lora_gpt2_sentiment",
            num_train_epochs=3,
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,      # effective batch size = 16
            fp16=True,                          # half-precision
            gradient_checkpointing=True,         # trade compute for memory
            learning_rate=2e-4,
            lr_scheduler_type="cosine",
            warmup_ratio=0.05,
            logging_steps=1,
            save_strategy="no",
            report_to="none",
            optim="adamw_torch",
        )

        trainer = Trainer(
            model=peft_model_ft,
            args=training_args,
            train_dataset=hf_dataset["train"],
            eval_dataset=hf_dataset["test"],
            data_collator=data_collator,
        )

        print("\nStarting training...")
        trainer.train()
        print("Training complete!")

        del base_model_ft, peft_model_ft, trainer
        torch.cuda.empty_cache()

    else:
        print("No GPU available β€” showing training setup and simulated output.")
        print()
        print("Training arguments for memory-efficient fine-tuning:")
        print()
        print("training_args = TrainingArguments(")
        print('    output_dir="./lora_gpt2_sentiment",')  
        print('    num_train_epochs=3,')
        print('    per_device_train_batch_size=4,')
        print('    gradient_accumulation_steps=4,  # effective batch = 16')
        print('    fp16=True,                      # half-precision')
        print('    gradient_checkpointing=True,     # recompute activations')
        print('    learning_rate=2e-4,')
        print('    lr_scheduler_type="cosine",')
        print('    warmup_ratio=0.05,')
        print('    optim="adamw_torch",')
        print(")")
        print()
        simulate_training_log()
else:
    simulate_training_log()

8. QLoRA: Quantized LoRAΒΆ

QLoRA (Dettmers et al., 2023) combines 4-bit quantization of the base model with LoRA adapters, enabling fine-tuning of 65B parameter models on a single 48GB GPU β€” or 7B models on a 6GB GPU.

Three innovations in QLoRAΒΆ

  1. NF4 (NormalFloat4) β€” A new data type optimal for normally distributed weights. Unlike standard int4, NF4 has equal spacing in probability space, minimizing quantization error for weights that follow a bell curve.

  2. Double quantization β€” Quantize the quantization constants themselves. Each block of 64 weights has one scale factor; these scale factors are then quantized from 32-bit to 8-bit, saving ~0.5 bits per parameter.

  3. Paged optimizers β€” Use NVIDIA unified memory to page optimizer states to CPU RAM when GPU is full, preventing OOM errors during long sequences.

Memory comparison: LLaMA-7BΒΆ

Method

VRAM

Notes

Full fine-tune fp32

112 GB

Not feasible on single GPU

Full fine-tune fp16

~112 GB

Still multi-GPU territory (fp16 weights alone are 14 GB; Adam states dominate)

LoRA fp16

8–10 GB

Requires A100 40GB or RTX 4090

QLoRA 4-bit

4–5 GB

Fits on RTX 3090 / T4 (16GB) easily

# QLoRA configuration pattern
# Note: bitsandbytes requires a CUDA GPU; shown as a code pattern here
# (copy the printed recipe into a CUDA environment to actually run it)

print("QLoRA setup (requires: pip install bitsandbytes transformers peft):")
print()
print("from transformers import BitsAndBytesConfig")
print()
print("# Step 1: Configure 4-bit quantization")
print("bnb_config = BitsAndBytesConfig(")
print("    load_in_4bit=True,                    # enable 4-bit loading")
print('    bnb_4bit_quant_type="nf4",            # NormalFloat4 data type')
print("    bnb_4bit_compute_dtype=torch.bfloat16,# compute in bf16")
print("    bnb_4bit_use_double_quant=True,       # quantize scale factors too")
print(")")
print()
print("# Step 2: Load model in 4-bit")
print("model = AutoModelForCausalLM.from_pretrained(")
print('    "meta-llama/Llama-2-7b-hf",')
print("    quantization_config=bnb_config,")
print('    device_map="auto",')
print(")")
print()
print("# Step 3: Prepare for k-bit training (important!)")
print("from peft import prepare_model_for_kbit_training")
print("model = prepare_model_for_kbit_training(model)")
print()
print("# Step 4: Apply LoRA on top of the quantized model")
print("lora_config = LoraConfig(")
print("    r=16,")
print("    lora_alpha=64,")
print('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],')
print("    lora_dropout=0.05,")
print('    bias="none",')
print("    task_type=TaskType.CAUSAL_LM,")
print(")")
print("model = get_peft_model(model, lora_config)")
print("model.print_trainable_parameters()")
print()

# ── Memory breakdown ─────────────────────────────────────────────────────────
# LLaMA-2-7B: 32 decoder layers, hidden size 4096.
llama_7b = 7_000_000_000
n_layers, hidden, r = 32, 4096, 16
# Each adapted projection carries A (hidden×r) + B (r×hidden) parameters;
# four projections (q, k, v, o) are adapted in every layer.
# (Fixes the original estimate, which used 28 layers and a stray ×2 factor,
# overcounting the adapter size by ~1.75×.)
lora_params = n_layers * 4 * (hidden * r + r * hidden)
# 4-bit base weights (0.5 bytes/param), fp16 LoRA adapters (2 bytes/param)
base_mem_4bit = llama_7b * 0.5 / 1e9
lora_mem      = lora_params * 2 / 1e9
# Optimizer for LoRA only (Adam, fp32 copy + m + v = 12 bytes per param)
optim_mem     = lora_params * 12 / 1e9
total_qlora   = base_mem_4bit + lora_mem + optim_mem

print("Memory breakdown for QLoRA on LLaMA-2-7B (r=16, q+k+v+o):")
print(f"  4-bit base weights (frozen)   : {base_mem_4bit:.2f} GB")
print(f"  LoRA adapter weights (fp16)   : {lora_mem:.3f} GB")
print(f"  Adam optimizer states (fp32)  : {optim_mem:.3f} GB")
print(f"  ─────────────────────────────")
print(f"  TOTAL (approx, no activations): {total_qlora:.2f} GB")
print()
print(f"  Compare: full fp16 training would need ~{llama_7b*16/1e9:.0f} GB")
print(f"  QLoRA savings: ~{llama_7b*16/1e9 / total_qlora:.1f}× memory reduction")

if HAS_TORCH:
    print()
    print("NF4 quantization illustration β€” comparing quantization grids:")
    # NF4 uses quantile-based levels for normally distributed data
    # Simulate what NF4 vs int4 levels look like
    weights = np.random.normal(0, 1, 1000)

    # int4: uniform levels in [-1, 1]
    int4_levels = np.linspace(-1, 1, 16)

    # NF4: quantile-based levels (equal probability mass between levels)
    quantiles = np.linspace(0, 1, 17)
    nf4_levels = np.quantile(weights, quantiles[:-1] + np.diff(quantiles) / 2)
    nf4_levels = nf4_levels / np.max(np.abs(nf4_levels))  # normalize to [-1, 1]

    # Quantization error comparison
    def quantize(vals, levels):
        clipped = np.clip(vals, levels.min(), levels.max())
        return levels[np.argmin(np.abs(clipped[:, None] - levels[None, :]), axis=1)]

    int4_q = quantize(weights / np.max(np.abs(weights)), int4_levels)
    nf4_q  = quantize(weights / np.max(np.abs(weights)), nf4_levels)

    print(f"  int4 quantization error (MSE): {np.mean((weights/np.max(np.abs(weights)) - int4_q)**2):.6f}")
    print(f"  NF4  quantization error (MSE): {np.mean((weights/np.max(np.abs(weights)) - nf4_q)**2):.6f}")
    print("  β†’ NF4 has lower error for normally distributed weights")

9. Merging LoRA WeightsΒΆ

After fine-tuning, you have two options for deployment: (1) keep the adapter separate — load the base model and attach the LoRA weights with PeftModel.from_pretrained, letting many task adapters share one base model; or (2) merge the adapter into the base weights with merge_and_unload(), which removes the extra LoRA computation at inference and produces a single standalone checkpoint.

10. Practical Tips Cheat SheetΒΆ

Hyperparameter      Typical range       Notes
────────────────────────────────────────────────────────────────────────────
rank (r)            4–64                Higher = more capacity, more memory
                                        r=8 is a good default start
                                        r=64 approaches full fine-tuning quality

alpha               r–4r                Keep alpha/r ratio ~2–4
                                        alpha=2r means scale factor = 2.0
                                        alpha=r means scale factor = 1.0

target_modules      q_proj, v_proj      Minimum effective set
                    + k_proj, o_proj    Better results, ~2Γ— more params
                    + up/down_proj      Max coverage for instruction tuning

lora_dropout        0.05–0.1            Regularization
                                        Set to 0 for very small datasets
                                        (<500 examples)

learning_rate       1e-4 – 3e-4         Higher than full fine-tuning (1e-5)
                                        LoRA params are random-initialized

epochs              1–3                 LLMs overfit quickly
                                        Use early stopping on validation loss

batch_size          4–32                Use gradient accumulation to reach
                    (with GA)           effective batch of 16–32

scheduler           cosine / linear     Cosine with warmup (5–10%) works well

────────────────────────────────────────────────────────────────────────────

Common pitfallsΒΆ

  • Forgetting prepare_model_for_kbit_training with QLoRA β€” leads to NaN losses

  • Using fp16 with bfloat16 models β€” LLaMA uses bf16 internally; use compute_dtype=torch.bfloat16

  • Catastrophic forgetting β€” fine-tune on too much data and the model forgets general capabilities; keep epochs low

  • Data formatting errors β€” missing EOS token, wrong instruction template; always inspect 5-10 tokenized examples

  • rank too high β€” r=64 with small datasets leads to overfitting; start with r=8

11. When to Use WhatΒΆ

Method

GPU Memory

Training Speed

Quality

Use Case

Full fine-tuning

40–80 GB

Slowest

Best

Large budget, highest quality

LoRA

8–16 GB

Fast

Near-full

Most cases

QLoRA

4–8 GB

Medium

Slight loss

Consumer GPU / Colab

Prompt tuning

<1 GB

Fastest

Limited

Very constrained

Decision flowchartΒΆ

Have access to multiple A100/H100s?
β”œβ”€β”€ Yes β†’ Full fine-tuning (maximum quality)
└── No
    β”œβ”€β”€ Single A100 (40/80 GB) or RTX 4090 (24 GB)?
    β”‚   └── LoRA (fp16) β€” best quality/memory tradeoff
    β”œβ”€β”€ T4 (16 GB) or RTX 3090 (24 GB)?
    β”‚   └── QLoRA (4-bit) β€” can fine-tune 7B–13B models
    β”œβ”€β”€ Colab free tier / RTX 3060 (8-12 GB)?
    β”‚   └── QLoRA on 3B–7B models, or LoRA on GPT-2 / small models
    └── CPU only?
        └── Prompt engineering / RAG β€” fine-tuning not practical

Model size guidelines for QLoRA on consumer hardwareΒΆ

Model size

Min VRAM

Recommended GPU

1–3B

4 GB

RTX 3060 / T4

7B

6–8 GB

RTX 3080 / T4 16GB

13B

10–12 GB

RTX 3090 / A10G

30B

20–24 GB

RTX 4090 / A5000

70B

40–48 GB

A100 40GB (tight)

12. ExercisesΒΆ

Exercise 1: Rank ablation studyΒΆ

Experiment with rank r ∈ {4, 8, 16, 32} and measure:

  • Number of trainable parameters

  • Validation loss after 3 epochs

  • Training time per epoch

Plot trainable parameters vs validation loss to find the sweet spot.

# Exercise sketch — not runnable as-is: fill in the training loop.
results = []
for r in [4, 8, 16, 32]:
    # alpha = 2r keeps the LoRA scale factor at a constant 2.0 across ranks.
    config = LoraConfig(r=r, lora_alpha=2*r, target_modules=["c_attn"],
                        bias="none", task_type=TaskType.CAUSAL_LM)
    # A fresh GPT-2 per rank so the runs do not share adapter state.
    model = get_peft_model(AutoModelForCausalLM.from_pretrained("gpt2"), config)
    # ... train and evaluate ...
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    # NOTE: val_loss must come from your evaluation loop above — undefined as written.
    results.append({"r": r, "trainable": trainable, "val_loss": val_loss})

Exercise 2: Target module comparisonΒΆ

Apply LoRA to different module combinations and compare:

  • ["c_attn"] β€” query+key+value combined (GPT-2)

  • ["c_attn", "c_proj"] β€” attention + output projection

  • ["c_attn", "c_proj", "mlp.c_fc", "mlp.c_proj"] β€” attention + MLP

Expected: more modules β†’ slightly better quality, more parameters.

Exercise 3: Early stopping on perplexityΒΆ

Implement a EarlyStoppingCallback that monitors validation perplexity:

from transformers import TrainerCallback

class PerplexityEarlyStopping(TrainerCallback):
    """Stop training once validation perplexity fails to improve `patience` times."""

    def __init__(self, patience=3):
        self.patience = patience        # evaluations without improvement allowed
        self.best_ppl = float('inf')    # lowest perplexity observed so far
        self.counter = 0                # consecutive non-improving evaluations

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        # Perplexity is exp(cross-entropy loss) on the eval set.
        current_ppl = math.exp(metrics["eval_loss"])
        if current_ppl >= self.best_ppl:
            self.counter += 1
            if self.counter >= self.patience:
                control.should_training_stop = True
        else:
            self.best_ppl = current_ppl
            self.counter = 0
        return control

Exercise 4: Domain-specific fine-tuningΒΆ

Create a synthetic dataset for a domain-specific task:

Option A β€” Code generation:

# Seed examples in the Alpaca schema used throughout this notebook.
examples = [
    {"instruction": "Write a Python function that sorts a list",
     "input": "", "output": "def sort_list(lst): return sorted(lst)"},
    # ... 50+ examples
]

Option B β€” Medical QA (synthetic):

# Synthetic QA pairs — for the exercise only, not medical reference data.
examples = [
    {"instruction": "Answer the medical question briefly",
     "input": "What is hypertension?",
     "output": "High blood pressure, defined as readings consistently above 130/80 mmHg."},
    # ... 50+ examples
]

Fine-tune GPT-2 with LoRA (r=8) and measure domain-specific perplexity.

Exercise 5: Verify merge equivalenceΒΆ

Fine-tune a small LoRA model, then verify that merged outputs are identical:

# After fine-tuning peft_model:
test_input = tokenizer("Once upon a time", return_tensors="pt")

# Output from PEFT model (adapter active)
with torch.no_grad():
    out_peft = peft_model(**test_input).logits

# Merge and output from merged model
# merge_and_unload folds (α/r)·B·A into W₀ and strips the adapter modules.
merged = peft_model.merge_and_unload()
with torch.no_grad():
    out_merged = merged(**test_input).logits

# Should be < 1e-4 (matching the assert below; only float rounding is expected)
print(f"Max diff: {(out_peft - out_merged).abs().max().item():.2e}")
assert (out_peft - out_merged).abs().max().item() < 1e-4, "Merge not equivalent!"

Expected result: Max difference < 1e-4 (small floating-point error only).