Quantization: GPTQ, AWQ, GGUF & bitsandbytes
Compress LLMs by 2–4x in size and memory with minimal quality loss — run 70B models on consumer GPUs.
Why Quantization?
Llama 3 70B in float16 = 140 GB VRAM → requires 2x A100 GPUs
Llama 3 70B in 4-bit = ~40 GB → fits on a single A100 or 2x 3090s
Llama 3 70B Q4_K_M (GGUF) = ~43 GB → runs on Mac M2 Ultra
Methods Comparison

| Method | Bits | Format | Speed | Quality | Use Case |
|---|---|---|---|---|---|
| bitsandbytes NF4 | 4-bit | HF native | Medium | ★★★★ | QLoRA training |
| GPTQ | 4-bit | GPTQ | Fast | ★★★★ | GPU inference |
| AWQ | 4-bit | AWQ | Fastest | ★★★★★ | GPU inference (best) |
| GGUF Q4_K_M | ~4-bit | GGUF | Fast CPU | ★★★★ | CPU / Apple Silicon |
| GGUF Q8_0 | 8-bit | GGUF | Slower | ★★★★★ | Near lossless, CPU |
# Install dependencies
# !pip install transformers accelerate bitsandbytes
# For GPTQ: !pip install auto-gptq optimum
# For AWQ: !pip install autoawq
# For GGUF: !pip install llama-cpp-python (or use Ollama)
1. bitsandbytes — 4-bit NF4 (QLoRA's Engine)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# NF4 quantization config (used by QLoRA).
_nf4_settings = {
    'load_in_4bit': True,
    'bnb_4bit_quant_type': 'nf4',              # Normal Float 4 — best fit for LLM weight distributions
    'bnb_4bit_compute_dtype': torch.bfloat16,  # compute in bf16, not fp16 — avoids overflow
    'bnb_4bit_use_double_quant': True,         # quantize the quantization constants too (~0.4 bits/param saved)
}
bnb_4bit_config = BitsAndBytesConfig(**_nf4_settings)

# 8-bit quantization (simpler, slightly lower quality).
bnb_8bit_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,  # outlier threshold — higher = more 8-bit, lower = more fp16
)
def load_quantized_bnb(model_id: str, bits: int = 4) -> tuple:
    """Load *model_id* with bitsandbytes quantization (4-bit NF4 or 8-bit).

    Returns (model, tokenizer).
    """
    if bits == 4:
        quant_config = bnb_4bit_config
    else:
        quant_config = bnb_8bit_config
    tok = AutoTokenizer.from_pretrained(model_id)
    quantized_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quant_config,
        device_map='auto',  # auto-distribute layers across available GPUs
        torch_dtype=torch.bfloat16,
    )
    return quantized_model, tok
def measure_model_memory(model) -> float:
    """Estimate the memory footprint of a PyTorch model in GB (1e9 bytes).

    Counts both parameters and registered buffers (e.g. BatchNorm running
    statistics). The previous version summed only parameters — and stored
    the byte count in a variable named ``total_params`` — so it
    under-reported memory for any model that carries buffers.
    """
    param_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_bytes = sum(b.numel() * b.element_size() for b in model.buffers())
    return (param_bytes + buffer_bytes) / 1e9
# Example usage:
# model, tokenizer = load_quantized_bnb('meta-llama/Meta-Llama-3-8B', bits=4)
# print(f'Memory: {measure_model_memory(model):.2f} GB')
# -> roughly 4.5 GB for an 8B model in 4-bit (vs 16 GB in float16)
_bnb_summary = (
    'bitsandbytes configs ready.',
    '',
    'Memory comparison for 8B model:',
    '  float16: ~16 GB',
    '  8-bit: ~8 GB (2x compression)',
    '  4-bit NF4: ~4.5 GB (3.5x compression)',
)
for _summary_line in _bnb_summary:
    print(_summary_line)
2. AWQ — Activation-Aware Weight Quantization (Best Quality)
# AWQ is the highest quality 4-bit method for GPU inference
# Pre-quantized AWQ models available on HuggingFace: TheBloke/*, solidrust/*, etc.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
# --- Option A: Load a pre-quantized AWQ model (recommended) ---
# Many models available as: TheBloke/Llama-3-8B-AWQ, etc.
def load_awq_pretrained(model_id: str) -> tuple:
    """Load a pre-quantized AWQ model from HuggingFace.

    Returns (model, tokenizer).
    """
    awq_model = AutoAWQForCausalLM.from_quantized(
        model_id,
        fuse_layers=True,         # fuse attention layers for faster inference
        trust_remote_code=False,  # refuse to execute repo-provided code
        safetensors=True,         # require the safetensors weight format
    )
    tok = AutoTokenizer.from_pretrained(model_id)
    return awq_model, tok
# --- Option B: Quantize a model yourself ---
def quantize_with_awq(model_id: str, output_dir: str, calibration_samples: int = 128) -> None:
    """
    Quantize *model_id* to AWQ 4-bit and save the result to *output_dir*.

    AWQ needs a small calibration set (~128 text samples) to measure
    activation scales and decide which weight channels to protect.

    Fixes vs. the original version:
      * The original took the first ``calibration_samples`` dataset rows and
        *then* filtered out short ones, so fewer than the requested number of
        samples could survive. We now filter first and stop once we have enough.
      * AutoAWQ's ``quantize()`` expects ``calib_data`` as a list of raw
        strings (it tokenizes internally), not pre-tokenized tensors.
    """
    from datasets import load_dataset

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoAWQForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

    # Calibration data: plain text drawn from a general-domain corpus.
    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
    calibration_data: list = []
    for row in dataset:
        text = row['text']
        if len(text) > 100:  # skip headers and near-empty lines
            calibration_data.append(text)
        if len(calibration_data) >= calibration_samples:
            break

    quant_config = {
        'zero_point': True,   # asymmetric quantization (zero-point per group)
        'q_group_size': 128,  # group size for quantization scales
        'w_bit': 4,           # 4-bit weights
        'version': 'GEMM',    # GEMM kernels give the best GPU speed
    }
    model.quantize(tokenizer, quant_config=quant_config, calib_data=calibration_data)
    model.save_quantized(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f'AWQ quantized model saved to {output_dir}')
# Summary of AWQ's advantages. NOTE: the bullet characters in these output
# strings were mojibake ('â€¢') in the original; restored to '•'.
print('AWQ functions defined.')
print()
print('Why AWQ beats GPTQ:')
print('  • Identifies which weights matter most (activation-aware)')
print('  • Preserves salient channels in fp16, quantizes the rest')
print('  • ~1.5x faster inference than GPTQ on GPU')
3. GPTQ — GPU-Optimized Post-Training Quantization
from transformers import AutoModelForCausalLM, GPTQConfig, AutoTokenizer
# GPTQ quantizes layer-by-layer using second-order information
# Pre-quantized GPTQ models: TheBloke/* on HuggingFace
def load_gptq_pretrained(model_id: str) -> tuple:
    """Load a pre-quantized GPTQ model.

    The quantization config ships inside the checkpoint, so a plain
    ``from_pretrained`` call is enough. Returns (model, tokenizer).
    """
    tok = AutoTokenizer.from_pretrained(model_id)
    load_kwargs = {'device_map': 'auto', 'torch_dtype': torch.float16}
    gptq_model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
    return gptq_model, tok
def quantize_with_gptq(model_id: str, output_dir: str) -> None:
    """Quantize *model_id* to GPTQ 4-bit and save it under *output_dir*.

    GPTQ quantizes layer-by-layer using second-order (Hessian) information
    gathered from a calibration dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    quantization_settings = GPTQConfig(
        bits=4,
        dataset='wikitext2',  # calibration dataset
        tokenizer=tokenizer,
        group_size=128,       # smaller group = better quality, larger model
        desc_act=True,        # activation-order quantization (better quality)
    )
    quantized = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_settings,
        torch_dtype=torch.float16,
    )
    quantized.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f'GPTQ model saved to {output_dir}')
# GPTQ group-size cheat sheet. NOTE: the arrows in these output strings
# were mojibake ('â†’') in the original; restored to '→'.
print('GPTQ functions defined.')
print()
print('GPTQ group sizes:')
print('  group_size=128 → Standard, good balance')
print('  group_size=64 → Better quality, slightly larger')
print('  group_size=32 → Best quality, larger model')
print('  group_size=-1 → No grouping (lower quality)')
4. GGUF — Universal Format for CPU + Apple Silicon
# GGUF (successor to GGML) is used by llama.cpp and Ollama
# Run quantized models on CPU, Apple Metal, or GPU with llama.cpp
from llama_cpp import Llama
# GGUF quantization levels.
# NOTE: the star-rating strings were garbled by a text-encoding round trip
# (each '★'/'½' character had been split across lines as mojibake); they
# are restored here as plain Unicode star ratings.
GGUF_QUANT_LEVELS = {
    'Q2_K':   {'size': '~2.5 bit',  'quality': '★★★',   'notes': 'Very small, noticeable quality loss'},
    'Q3_K_M': {'size': '~3.35 bit', 'quality': '★★★½',  'notes': 'Small model, acceptable quality'},
    'Q4_K_M': {'size': '~4.5 bit',  'quality': '★★★★',  'notes': 'Recommended: best size/quality balance'},
    'Q5_K_M': {'size': '~5.5 bit',  'quality': '★★★★½', 'notes': 'Near lossless on most tasks'},
    'Q6_K':   {'size': '~6.6 bit',  'quality': '★★★★★', 'notes': 'Excellent quality, still efficient'},
    'Q8_0':   {'size': '~8.5 bit',  'quality': '★★★★★', 'notes': 'Near float16 quality'},
    'F16':    {'size': '16 bit',    'quality': '★★★★★', 'notes': 'Full quality, no compression'},
}
def load_gguf_model(model_path: str, n_gpu_layers: int = -1, context_size: int = 4096) -> Llama:
    """
    Open a GGUF checkpoint via the llama.cpp bindings.

    n_gpu_layers=-1 offloads every layer to the GPU; 0 keeps inference on CPU.
    """
    llama_kwargs = {
        'model_path': model_path,
        'n_gpu_layers': n_gpu_layers,
        'n_ctx': context_size,
        'verbose': False,
    }
    return Llama(**llama_kwargs)
def gguf_generate(model: Llama, prompt: str, max_tokens: int = 256) -> str:
    """Run one completion through a llama.cpp model and return the stripped text."""
    stop_sequences = ['</s>', 'User:', '\n\n']
    completion = model(
        prompt,
        max_tokens=max_tokens,
        stop=stop_sequences,
        echo=False,
    )
    first_choice = completion['choices'][0]
    return first_choice['text'].strip()
# Print a cheat-sheet of the quantization levels defined above.
print('GGUF quantization levels:')
for quant_name in GGUF_QUANT_LEVELS:
    details = GGUF_QUANT_LEVELS[quant_name]
    print('  {0:8s} ({1:8s}) {2} {3}'.format(
        quant_name, details['size'], details['quality'], details['notes']))
print()
print('Recommendation: Q4_K_M for most use cases')
print('Download GGUF models: ollama pull llama3.3 or from HF (TheBloke/*)')
5. Benchmark: Quality vs. Size
import matplotlib.pyplot as plt
import numpy as np
# Approximate perplexity on WikiText-2 for Llama 3 8B.
# Rows: (method label, model size in GB, perplexity, plot color).
_bench_rows = [
    ('float16 (baseline)', 16.0, 6.14, '#2ecc71'),
    ('NF4 (bitsandbytes)', 4.5, 6.56, '#3498db'),
    ('AWQ 4-bit', 4.5, 6.25, '#9b59b6'),
    ('GPTQ 4-bit', 4.5, 6.38, '#e67e22'),
    ('GGUF Q4_K_M', 4.7, 6.30, '#e74c3c'),
    ('GGUF Q8_0', 8.6, 6.16, '#1abc9c'),
]
benchmarks = [
    {'method': method, 'size_gb': size_gb, 'ppl': ppl, 'color': color}
    for method, size_gb, ppl, color in _bench_rows
]
# Side-by-side bar charts: memory footprint vs. perplexity for each method.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

names, sizes, ppls, colors = [], [], [], []
for entry in benchmarks:
    names.append(entry['method'])
    sizes.append(entry['size_gb'])
    ppls.append(entry['ppl'])
    colors.append(entry['color'])

# Left panel: model size in GB, with a float16 reference line at 16 GB.
bars = ax1.barh(names, sizes, color=colors)
ax1.set_xlabel('Model Size (GB)')
ax1.set_title('Memory Footprint (Llama 3 8B)')
ax1.axvline(x=16, color='gray', linestyle='--', alpha=0.5, label='float16')

# Right panel: WikiText-2 perplexity (lower = better), baseline at 6.14.
bars2 = ax2.barh(names, ppls, color=colors)
ax2.set_xlabel('Perplexity on WikiText-2 (lower = better)')
ax2.set_title('Quality: Perplexity')
ax2.axvline(x=6.14, color='gray', linestyle='--', alpha=0.5, label='float16 baseline')
ax2.set_xlim(5.8, 7.0)

plt.tight_layout()
plt.show()

print('Key takeaway: AWQ and GGUF Q4_K_M achieve ~97-99% of float16 quality at 30% the size.')
6. Choosing the Right Method

Deployment Target            → Recommended Method
──────────────────────────────────────────────────
GPU inference (production)   → AWQ (fastest, best quality)
GPU inference (alternative)  → GPTQ (widely supported)
QLoRA fine-tuning            → bitsandbytes NF4 (required for QLoRA)
CPU / Apple Silicon (Mac)    → GGUF Q4_K_M (via llama.cpp / Ollama)
Edge devices                 → GGUF Q3_K_M or Q2_K
Near-lossless quality        → GGUF Q8_0 or bitsandbytes 8-bit
Exercises

1. Load a 7B model in float16 vs. NF4 and measure VRAM with torch.cuda.memory_allocated().
2. Download a pre-quantized AWQ model from HuggingFace and benchmark generation speed.
3. Run a GGUF model with Ollama — compare Q4_K_M vs. Q8_0 quality on your task.
4. Quantize a small model (GPT-2) with GPTQ and measure perplexity before and after.
5. Compare inference speed: AWQ vs. GPTQ vs. NF4 on the same prompt.