Quantization: GPTQ, AWQ, GGUF & bitsandbytes

Compress LLMs by 2-4x in size and memory with minimal quality loss — run 70B models on consumer GPUs.

Why Quantization?

  • Llama 3 70B in float16 = 140 GB VRAM — requires 2x A100 GPUs

  • Llama 3 70B in 4-bit = ~40 GB — fits on a single A100 or 2x 3090s

  • Llama 3 70B Q4_K_M (GGUF) = ~43 GB — runs on Mac M2 Ultra

Methods Comparison

Method

Bits

Format

Speed

Quality

Use Case

bitsandbytes NF4

4-bit

HF native

Medium

★★★★

QLoRA training

GPTQ

4-bit

GPTQ

Fast

★★★★

GPU inference

AWQ

4-bit

AWQ

Fastest

★★★★★

GPU inference (best)

GGUF Q4_K_M

~4-bit

GGUF

Fast CPU

★★★★

CPU / Apple Silicon

GGUF Q8_0

8-bit

GGUF

Slower

★★★★★

Near lossless, CPU

# Install dependencies
# !pip install transformers accelerate bitsandbytes
# For GPTQ: !pip install auto-gptq optimum
# For AWQ:  !pip install autoawq
# For GGUF: !pip install llama-cpp-python (or use Ollama)

1. bitsandbytes — 4-bit NF4 (QLoRA's Engine)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# NF4 quantization config (used by QLoRA).
# NF4 ("Normal Float 4") is a 4-bit data type whose quantization levels follow
# a normal distribution — a good match for pretrained LLM weight distributions.
bnb_4bit_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',          # 'nf4' (recommended for LLM weights) or 'fp4'
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute in bf16: wider exponent than fp16 avoids overflow
    bnb_4bit_use_double_quant=True,     # also quantize the per-block quantization constants (~0.4 bits/param saved)
)

# 8-bit quantization (LLM.int8()-style): simpler setup, larger footprint than 4-bit.
bnb_8bit_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,  # outlier threshold — activations above it are kept in fp16, the rest in int8
)

def load_quantized_bnb(model_id: str, bits: int = 4) -> tuple:
    """Load a causal LM with bitsandbytes quantization.

    Args:
        model_id: HuggingFace Hub id or local path of the model.
        bits: 4 for NF4 (QLoRA-style) or 8 for LLM.int8().

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: if ``bits`` is neither 4 nor 8.  (The original version
        silently fell back to 8-bit for any non-4 value, e.g. ``bits=16``.)
    """
    if bits == 4:
        config = bnb_4bit_config
    elif bits == 8:
        config = bnb_8bit_config
    else:
        raise ValueError(f'bits must be 4 or 8, got {bits}')

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=config,
        device_map='auto',  # shard / offload automatically across available devices
        torch_dtype=torch.bfloat16
    )
    return model, tokenizer

def measure_model_memory(model) -> float:
    """Estimate the in-memory size of ``model`` in GB (decimal, 1 GB = 1e9 bytes).

    Counts both parameters and registered buffers (e.g. BatchNorm running
    statistics) — the original parameter-only sum undercounted any model
    that carries buffers.

    Note: for quantized models this reports the size of the packed storage
    tensors actually held, not the logical bit-width.
    """
    param_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_bytes = sum(b.numel() * b.element_size() for b in model.buffers())
    return (param_bytes + buffer_bytes) / 1e9

# Example usage:
# model, tokenizer = load_quantized_bnb('meta-llama/Meta-Llama-3-8B', bits=4)
# print(f'Memory: {measure_model_memory(model):.2f} GB')
# An 8B model lands around ~4.5 GB in 4-bit (vs 16 GB in float16).

print('bitsandbytes configs ready.')
print()
memory_table = [
    'Memory comparison for 8B model:',
    '  float16:  ~16 GB',
    '  8-bit:    ~8 GB   (2x compression)',
    '  4-bit NF4: ~4.5 GB (3.5x compression)',
]
print('\n'.join(memory_table))

2. AWQ — Activation-Aware Weight Quantization (Best Quality)

# AWQ is the highest quality 4-bit method for GPU inference
# Pre-quantized AWQ models available on HuggingFace: TheBloke/*, solidrust/*, etc.

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# --- Option A: Load a pre-quantized AWQ model (recommended) ---
# Many models available as: TheBloke/Llama-3-8B-AWQ, etc.

def load_awq_pretrained(model_id: str) -> tuple:
    """Load a pre-quantized AWQ checkpoint from the HuggingFace Hub.

    Returns a ``(model, tokenizer)`` pair ready for inference.
    """
    awq_model = AutoAWQForCausalLM.from_quantized(
        model_id,
        fuse_layers=True,      # fused attention layers for extra speed
        trust_remote_code=False,
        safetensors=True,
    )
    tok = AutoTokenizer.from_pretrained(model_id)
    return awq_model, tok

# --- Option B: Quantize a model yourself ---
def quantize_with_awq(model_id: str, output_dir: str, calibration_samples: int = 128) -> None:
    """
    Quantize a model to AWQ 4-bit and save it to ``output_dir``.

    Needs a calibration dataset (~128 samples) to find per-channel weight
    scales; we use WikiText-2 as a stand-in for the training distribution.

    Args:
        model_id: HuggingFace Hub id or local path of the fp16 model.
        output_dir: directory to write the quantized model + tokenizer to.
        calibration_samples: number of calibration texts to collect.
    """
    from datasets import load_dataset

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoAWQForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

    # Calibration data: filter *first*, then take N samples.  The original
    # select-then-filter produced fewer than `calibration_samples` rows
    # (WikiText-2 has many short/empty lines).  autoawq's `calib_data`
    # accepts a list of raw strings, so we pass text, not tokenized tensors.
    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
    calibration_data = []
    for row in dataset:
        if len(row['text']) > 100:  # skip headings / empty lines
            calibration_data.append(row['text'])
        if len(calibration_data) >= calibration_samples:
            break

    quant_config = {
        'zero_point': True,
        'q_group_size': 128,  # Group size for quantization
        'w_bit': 4,           # 4-bit weights
        'version': 'GEMM',    # GEMM kernels for best speed
    }

    model.quantize(tokenizer, quant_config=quant_config, calib_data=calibration_data)
    model.save_quantized(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f'AWQ quantized model saved to {output_dir}')

print('AWQ functions defined.')
print()
awq_notes = (
    'Why AWQ beats GPTQ:',
    '  β€’ Identifies which weights matter most (activation-aware)',
    '  β€’ Preserves salient channels in fp16, quantizes the rest',
    '  β€’ ~1.5x faster inference than GPTQ on GPU',
)
print('\n'.join(awq_notes))

3. GPTQ — GPU-Optimized Post-Training Quantization

from transformers import AutoModelForCausalLM, GPTQConfig, AutoTokenizer

# GPTQ quantizes layer-by-layer using second-order information
# Pre-quantized GPTQ models: TheBloke/* on HuggingFace

def load_gptq_pretrained(model_id: str) -> tuple:
    """Load a pre-quantized GPTQ model.

    No explicit quantization config is needed here: pre-quantized repos ship
    their GPTQ settings inside the checkpoint.
    """
    tok = AutoTokenizer.from_pretrained(model_id)
    mdl = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map='auto',
        torch_dtype=torch.float16,
    )
    return mdl, tok

def quantize_with_gptq(model_id: str, output_dir: str) -> None:
    """Quantize ``model_id`` to GPTQ 4-bit and save the result to ``output_dir``."""
    tok = AutoTokenizer.from_pretrained(model_id)

    # Quantization runs inside from_pretrained when a GPTQConfig is supplied.
    cfg = GPTQConfig(
        bits=4,
        dataset='wikitext2',  # built-in calibration dataset name
        tokenizer=tok,
        group_size=128,       # smaller groups = better quality but a larger model
        desc_act=True,        # activation-order quantization (better quality)
    )
    quantized = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=cfg,
        torch_dtype=torch.float16,
    )

    quantized.save_pretrained(output_dir)
    tok.save_pretrained(output_dir)
    print(f'GPTQ model saved to {output_dir}')

print('GPTQ functions defined.')
print()
group_size_notes = [
    'GPTQ group sizes:',
    '  group_size=128  β†’ Standard, good balance',
    '  group_size=64   β†’ Better quality, slightly larger',
    '  group_size=32   β†’ Best quality, larger model',
    '  group_size=-1   β†’ No grouping (lower quality)',
]
print('\n'.join(group_size_notes))

4. GGUF — Universal Format for CPU + Apple Silicon

# GGUF (successor to GGML) is used by llama.cpp and Ollama
# Run quantized models on CPU, Apple Metal, or GPU with llama.cpp

from llama_cpp import Llama

# GGUF quantization levels
GGUF_QUANT_LEVELS = {
    'Q2_K':   {'size': '~2.5 bit', 'quality': 'β˜…β˜…β˜…',   'notes': 'Very small, noticeable quality loss'},
    'Q3_K_M': {'size': '~3.35 bit', 'quality': 'β˜…β˜…β˜…Β½',  'notes': 'Small model, acceptable quality'},
    'Q4_K_M': {'size': '~4.5 bit', 'quality': 'β˜…β˜…β˜…β˜…',  'notes': 'Recommended: best size/quality balance'},
    'Q5_K_M': {'size': '~5.5 bit', 'quality': 'β˜…β˜…β˜…β˜…Β½', 'notes': 'Near lossless on most tasks'},
    'Q6_K':   {'size': '~6.6 bit', 'quality': 'β˜…β˜…β˜…β˜…β˜…', 'notes': 'Excellent quality, still efficient'},
    'Q8_0':   {'size': '~8.5 bit', 'quality': 'β˜…β˜…β˜…β˜…β˜…', 'notes': 'Near float16 quality'},
    'F16':    {'size': '16 bit',   'quality': 'β˜…β˜…β˜…β˜…β˜…', 'notes': 'Full quality, no compression'},
}

def load_gguf_model(model_path: str, n_gpu_layers: int = -1, context_size: int = 4096) -> Llama:
    """
    Open a GGUF checkpoint via the llama.cpp bindings.

    Args:
        model_path: path to the ``.gguf`` file on disk.
        n_gpu_layers: -1 offloads every layer to the GPU; 0 keeps it CPU-only.
        context_size: context window (``n_ctx``) in tokens.
    """
    llm = Llama(
        model_path=model_path,
        n_ctx=context_size,
        n_gpu_layers=n_gpu_layers,
        verbose=False,
    )
    return llm

def gguf_generate(model: Llama, prompt: str, max_tokens: int = 256) -> str:
    """Run a single completion with a GGUF model and return the stripped text."""
    stop_sequences = ['</s>', 'User:', '\n\n']
    result = model(
        prompt,
        max_tokens=max_tokens,
        stop=stop_sequences,
        echo=False,
    )
    completion = result['choices'][0]['text']
    return completion.strip()

print('GGUF quantization levels:')
for level_name, meta in GGUF_QUANT_LEVELS.items():
    row = '  {:8s} ({:8s}) {}  {}'.format(
        level_name, meta['size'], meta['quality'], meta['notes']
    )
    print(row)

print()
print('Recommendation: Q4_K_M for most use cases')
print('Download GGUF models: ollama pull llama3.3 or from HF (TheBloke/*)')

5. Benchmark: Quality vs. Size

import matplotlib.pyplot as plt
import numpy as np

# Approximate perplexity on WikiText-2 for Llama 3 8B
benchmarks = [
    {'method': 'float16 (baseline)', 'size_gb': 16.0,  'ppl': 6.14, 'color': '#2ecc71'},
    {'method': 'NF4 (bitsandbytes)', 'size_gb': 4.5,   'ppl': 6.56, 'color': '#3498db'},
    {'method': 'AWQ 4-bit',          'size_gb': 4.5,   'ppl': 6.25, 'color': '#9b59b6'},
    {'method': 'GPTQ 4-bit',         'size_gb': 4.5,   'ppl': 6.38, 'color': '#e67e22'},
    {'method': 'GGUF Q4_K_M',        'size_gb': 4.7,   'ppl': 6.30, 'color': '#e74c3c'},
    {'method': 'GGUF Q8_0',          'size_gb': 8.6,   'ppl': 6.16, 'color': '#1abc9c'},
]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

names = [b['method'] for b in benchmarks]
sizes = [b['size_gb'] for b in benchmarks]
ppls = [b['ppl'] for b in benchmarks]
colors = [b['color'] for b in benchmarks]

# Size comparison
bars = ax1.barh(names, sizes, color=colors)
ax1.set_xlabel('Model Size (GB)')
ax1.set_title('Memory Footprint (Llama 3 8B)')
ax1.axvline(x=16, color='gray', linestyle='--', alpha=0.5, label='float16')

# Perplexity (lower = better)
bars2 = ax2.barh(names, ppls, color=colors)
ax2.set_xlabel('Perplexity on WikiText-2 (lower = better)')
ax2.set_title('Quality: Perplexity')
ax2.axvline(x=6.14, color='gray', linestyle='--', alpha=0.5, label='float16 baseline')
ax2.set_xlim(5.8, 7.0)

plt.tight_layout()
plt.show()

print('Key takeaway: AWQ and GGUF Q4_K_M achieve ~97-99% of float16 quality at 30% the size.')

6. Choosing the Right Method

Deployment Target              → Recommended Method
──────────────────────────────────────────────────────
GPU inference (production)     → AWQ (fastest, best quality)
GPU inference (alternative)    → GPTQ (widely supported)
QLoRA fine-tuning              → bitsandbytes NF4 (required for QLoRA)
CPU / Apple Silicon (Mac)      → GGUF Q4_K_M (via llama.cpp / Ollama)
Edge devices                   → GGUF Q3_K_M or Q2_K
Near-lossless quality          → GGUF Q8_0 or bitsandbytes 8-bit

Exercises

  1. Load a 7B model in float16 vs. NF4 and measure VRAM with torch.cuda.memory_allocated().

  2. Download a pre-quantized AWQ model from HuggingFace and benchmark generation speed.

  3. Run a GGUF model with Ollama — compare Q4_K_M vs. Q8_0 quality on your task.

  4. Quantize a small model (GPT-2) with GPTQ and measure perplexity before and after.

  5. Compare inference speed: AWQ vs. GPTQ vs. NF4 on the same prompt.