Section 8: Regression & Release Validation¶
Golden Baselines, Cross-Version Testing & CI/CD for AI Hardware¶
Duration: 4 hours
Difficulty: Intermediate–Advanced
8.1 Why Regression Validation Matters¶
Every release of a driver, firmware, SDK, or compiler can break something. Regression validation ensures:
Correctness hasn't changed (outputs match golden baselines)
Performance hasn't degraded (throughput/latency within tolerance)
Functionality hasn't been lost (all tests that previously passed still pass)
At companies like AMD, NVIDIA, and Qualcomm, regression suites run thousands of tests on every commit.
8.2 Golden Baselines¶
What is a Golden Baseline?¶
A golden baseline is a known-correct reference result for a specific:
Model + weights + input → expected output
Kernel + input shape + dtype → expected output tensor
Benchmark + config → expected performance range
import torch
import json
import hashlib
from pathlib import Path
class GoldenBaselineManager:
    """Manage golden baseline results for regression testing.

    Each baseline is stored as a pair of files keyed by test name plus a
    hash of its config: ``<key>.json`` (metadata) and ``<key>.pt`` (the
    reference tensor, always saved on CPU so it is device-independent).
    """

    def __init__(self, baseline_dir="baselines"):
        self.baseline_dir = Path(baseline_dir)
        self.baseline_dir.mkdir(parents=True, exist_ok=True)

    def _key(self, test_name, config):
        """Generate a unique, stable key for a test configuration."""
        # sort_keys makes the hash independent of dict insertion order
        config_str = json.dumps(config, sort_keys=True)
        return f"{test_name}_{hashlib.sha256(config_str.encode()).hexdigest()[:12]}"

    def save_baseline(self, test_name, config, result_tensor, metadata=None):
        """Save a golden baseline result (metadata JSON + tensor file)."""
        key = self._key(test_name, config)
        baseline = {
            "test_name": test_name,
            "config": config,
            "metadata": metadata or {},
            "tensor_shape": list(result_tensor.shape),
            "tensor_dtype": str(result_tensor.dtype),
        }
        # Save metadata
        meta_path = self.baseline_dir / f"{key}.json"
        with open(meta_path, "w") as f:
            json.dump(baseline, f, indent=2)
        # Save tensor on CPU so the baseline loads on any device
        tensor_path = self.baseline_dir / f"{key}.pt"
        torch.save(result_tensor.cpu(), tensor_path)
        print(f"Saved baseline: {test_name} [{key}]")

    def load_baseline(self, test_name, config):
        """Load a golden baseline; returns (metadata, tensor) or (None, None)."""
        key = self._key(test_name, config)
        meta_path = self.baseline_dir / f"{key}.json"
        tensor_path = self.baseline_dir / f"{key}.pt"
        # Fix: require BOTH files. Previously a present .json with a missing
        # .pt raised FileNotFoundError from torch.load instead of falling
        # back to "no baseline".
        if not meta_path.exists() or not tensor_path.exists():
            return None, None
        with open(meta_path) as f:
            metadata = json.load(f)
        # weights_only=True avoids unpickling arbitrary objects from disk
        tensor = torch.load(tensor_path, weights_only=True)
        return metadata, tensor

    def compare_to_baseline(self, test_name, config, actual_tensor,
                            atol=1e-5, rtol=1e-4):
        """Compare current result to golden baseline.

        Returns "NEW_BASELINE" (no baseline existed; current result saved),
        "PASS" (within atol/rtol of the baseline), or "REGRESSION".
        """
        metadata, expected = self.load_baseline(test_name, config)
        if expected is None:
            # Fix: message contained a mis-encoded dash ("β")
            print(f"NO BASELINE [{test_name}] -- saving current result as baseline")
            self.save_baseline(test_name, config, actual_tensor)
            return "NEW_BASELINE"
        # Compare in float32 on CPU regardless of original device/dtype
        actual_cpu = actual_tensor.cpu().float()
        expected_f = expected.float()
        if torch.allclose(actual_cpu, expected_f, atol=atol, rtol=rtol):
            print(f"PASS [{test_name}] matches baseline")
            return "PASS"
        else:
            diff = (actual_cpu - expected_f).abs()
            max_diff = diff.max().item()
            mean_diff = diff.mean().item()
            # Count elements outside the same elementwise bound allclose uses
            violations = (diff > atol + rtol * expected_f.abs()).sum().item()
            print(f"REGRESSION [{test_name}] "
                  f"max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}, "
                  f"violations={violations}/{actual_cpu.numel()}")
            return "REGRESSION"
# Usage
baseline_mgr = GoldenBaselineManager("./baselines")
# Generate baseline for GEMM
# Fixed seed makes inputs -- and therefore the saved baseline -- reproducible.
torch.manual_seed(42)
# NOTE(review): requires a CUDA device; 1024x1024 FP16 matmul as the test op.
A = torch.randn(1024, 1024, device='cuda', dtype=torch.float16)
B = torch.randn(1024, 1024, device='cuda', dtype=torch.float16)
C = torch.matmul(A, B)
# The config dict is hashed into the baseline key (see GoldenBaselineManager._key).
config = {"op": "gemm", "M": 1024, "N": 1024, "K": 1024, "dtype": "fp16"}
# First run returns "NEW_BASELINE"; subsequent runs return "PASS"/"REGRESSION".
result = baseline_mgr.compare_to_baseline("gemm_1024", config, C)
Performance Baselines¶
Performance baselines record throughput (TFLOPS, tokens/sec, images/sec) and latency from a known-good configuration, along with a configurable tolerance threshold (typically 5%). Unlike correctness baselines (which compare tensor values), performance baselines compare scalar metrics and flag a regression when the current measurement drops below baseline × (1 - tolerance). The PerformanceBaseline class maintains a history of all recorded values, enabling trend analysis: gradual 1-2% degradation over multiple releases is just as actionable as a sudden 10% drop, but only visible when historical data is preserved. Hardware vendors use these baselines to gate every driver and firmware release — a release that regresses GEMM TFLOPS by more than 5% is blocked until the root cause is identified.
Define __init__()¶
Usage
Define save()¶
Usage
Define record()¶
Usage
Define gemm_benchmark()¶
Usage
import time
class PerformanceBaseline:
    """Track and validate performance metrics against baselines.

    Baselines are persisted to a single JSON file. Each metric is keyed
    "test_name/metric_name" and carries its reference value, tolerance
    (percent), and the full history of recorded measurements.
    """

    def __init__(self, baseline_path="perf_baselines.json"):
        self.path = Path(baseline_path)
        self.baselines = {}
        # Reload any baselines persisted by a previous run.
        if self.path.exists():
            self.baselines = json.loads(self.path.read_text())

    def save(self):
        """Persist all baselines to the JSON file."""
        with open(self.path, "w") as f:
            json.dump(self.baselines, f, indent=2)

    def record(self, test_name, metric_name, value, tolerance_pct=5):
        """Record a performance metric and check for regression.

        Returns "NEW_BASELINE" on first sight of a metric, then "PASS" or
        "REGRESSION" depending on whether the value stays within the
        stored tolerance below the reference.
        """
        key = f"{test_name}/{metric_name}"
        entry = self.baselines.get(key)
        if entry is None:
            # First measurement becomes the reference value.
            self.baselines[key] = {
                "baseline": value,
                "tolerance_pct": tolerance_pct,
                "history": [value],
            }
            self.save()
            print(f"NEW PERF BASELINE [{key}]: {value:.4f}")
            return "NEW_BASELINE"
        ref = entry["baseline"]
        tol = entry["tolerance_pct"]
        # Always append to history first so even regressions are recorded.
        entry["history"].append(value)
        self.save()
        pct_change = (value - ref) / ref * 100
        if value < ref * (1 - tol / 100):
            print(f"PERF REGRESSION [{key}]: {value:.4f} "
                  f"(baseline={ref:.4f}, {pct_change:.1f}%, "
                  f"tolerance={tol}%)")
            return "REGRESSION"
        print(f"PASS [{key}]: {value:.4f} "
              f"(baseline={ref:.4f}, {pct_change:+.1f}%)")
        return "PASS"
# Usage
perf = PerformanceBaseline()
# Benchmark and record
def gemm_benchmark():
    """Measure sustained 4096x4096x4096 FP16 GEMM throughput in TFLOPS.

    NOTE(review): requires a CUDA device; results vary with clocks and
    thermal state, which is why a tolerance band is used downstream.
    """
    A = torch.randn(4096, 4096, device='cuda', dtype=torch.float16)
    B = torch.randn(4096, 4096, device='cuda', dtype=torch.float16)
    # Warmup so library heuristics/caches settle before timing.
    for _ in range(10):
        torch.matmul(A, B)
    torch.cuda.synchronize()  # drain warmup work before starting the clock
    start = time.perf_counter()
    for _ in range(100):
        torch.matmul(A, B)
    torch.cuda.synchronize()  # kernels launch async; wait before stopping the clock
    elapsed = time.perf_counter() - start
    # One NxNxN GEMM costs 2*N^3 FLOPs; 100 iterations; 1e12 FLOPs per TFLOP.
    tflops = 2 * 4096**3 * 100 / elapsed / 1e12
    return tflops
tflops = gemm_benchmark()
perf.record("gemm_4096_fp16", "tflops", tflops, tolerance_pct=5)
8.3 Cross-Version Testing¶
Version Matrix¶
Cross-version testing systematically validates all supported combinations of driver version × CUDA/ROCm toolkit × framework version × model. The combinatorial explosion is real — 3 drivers × 3 CUDA versions × 3 PyTorch versions × 3 models = 81 test configurations. Incompatible combinations are filtered (e.g., CUDA 12.6 requires driver >= 550), and each valid combination runs the full kernel + model validation suite. The version compatibility report is a key deliverable for hardware vendors: customers (Meta, Microsoft, Google) require explicit certification that their framework version works on the vendor's latest driver. A single failure in the matrix blocks the release for that combination, and the report is reviewed by engineering management before sign-off.
import itertools
def generate_test_matrix():
    """Generate a cross-version test matrix.

    Returns:
        List of dicts, one per valid combination of driver x CUDA toolkit
        x PyTorch version x model. Known-incompatible combinations
        (CUDA 12.6 on pre-550 drivers) are filtered out.
    """
    drivers = ["535.129.03", "545.23.08", "550.54.15"]  # NVIDIA driver
    cuda_versions = ["12.2", "12.4", "12.6"]            # CUDA toolkit
    pytorch_versions = ["2.2.0", "2.3.0", "2.4.0"]      # PyTorch
    models = ["resnet50", "llama-7b", "whisper-base"]   # Test models
    matrix = []
    for driver, cuda, pytorch, model in itertools.product(
        drivers, cuda_versions, pytorch_versions, models
    ):
        # Skip incompatible combinations.
        # Fix: compare the numeric major version instead of a lexicographic
        # string comparison (driver < "550"), which breaks for version
        # strings whose major component has a different number of digits.
        driver_major = int(driver.split(".")[0])
        if cuda == "12.6" and driver_major < 550:
            continue
        matrix.append({
            "driver": driver,
            "cuda": cuda,
            "pytorch": pytorch,
            "model": model,
        })
    print(f"Test matrix: {len(matrix)} combinations")
    return matrix
def version_compatibility_report(results):
    """Print a version compatibility report.

    Args:
        results: list of dicts with keys "driver", "cuda", "pytorch",
            "model" and a boolean "passed" flag (missing counts as FAIL).
    """
    print("\n=== Version Compatibility Report ===\n")
    print(f"{'Driver':<15} {'CUDA':<8} {'PyTorch':<10} {'Model':<15} {'Status'}")
    print("-" * 65)
    for r in results:
        status = "PASS" if r.get("passed") else "FAIL"
        print(f"{r['driver']:<15} {r['cuda']:<8} {r['pytorch']:<10} "
              f"{r['model']:<15} {status}")
    passed = sum(1 for r in results if r.get("passed"))
    # Fix: guard against an empty result list, which previously raised
    # ZeroDivisionError when computing the pass percentage.
    if results:
        print(f"\n{passed}/{len(results)} passed "
              f"({100*passed/len(results):.0f}%)")
    else:
        print("\nNo results to report")
Docker-Based Version Testing¶
# Dockerfile for testing specific version combinations
# Build-time args select the CUDA base image; override with --build-arg.
ARG CUDA_VERSION=12.4.0
ARG UBUNTU_VERSION=22.04
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# ARGs declared after FROM are scoped to this build stage.
ARG PYTORCH_VERSION=2.4.0
ARG PYTHON_VERSION=3.11
RUN apt-get update && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-pip
# Pin the framework version under test; torchvision/torchaudio resolve to compatible builds.
RUN pip install torch==${PYTORCH_VERSION} torchvision torchaudio
COPY test_suite/ /tests/
# Default command runs the whole suite when the container starts.
CMD ["python", "-m", "pytest", "/tests/", "-v", "--tb=short"]
# Run tests across versions
for cuda in 12.2.0 12.4.0 12.6.0; do
  for pytorch in 2.2.0 2.3.0 2.4.0; do
    echo "Testing CUDA=${cuda} PyTorch=${pytorch}"
    # Build one image per (CUDA, PyTorch) pair, tagged with both versions.
    docker build \
      --build-arg CUDA_VERSION=${cuda} \
      --build-arg PYTORCH_VERSION=${pytorch} \
      -t test-${cuda}-${pytorch} .
    # Run the suite with GPU access; show only the last 5 lines (summary).
    docker run --gpus all test-${cuda}-${pytorch} 2>&1 | tail -5
  done
done
8.4 CI/CD Pipeline for AI Hardware Validation¶
GitHub Actions / GitLab CI Pipeline¶
# .github/workflows/gpu-validation.yml
name: GPU Validation Suite
on:
  push:
    branches: [main, release/*]
  schedule:
    - cron: '0 2 * * *' # Daily at 2am
jobs:
  # Kernel correctness + performance regression checks on one A100 runner.
  kernel-tests:
    runs-on: [self-hosted, gpu, A100]
    timeout-minutes: 60
    steps:
      - uses: actions/checkout@v4
      - name: Setup Environment
        run: |
          pip install torch torchvision onnxruntime
          nvidia-smi
      - name: Kernel Correctness Tests
        run: python -m pytest tests/kernels/ -v --tb=short
      - name: Performance Benchmarks
        run: python benchmarks/run_benchmarks.py --output results.json
      - name: Check Performance Regression
        run: python scripts/check_regression.py results.json --baseline baselines/
      # Persist results so regressions can be inspected after the run.
      - name: Upload Results
        uses: actions/upload-artifact@v4
        with:
          name: validation-results
          path: results.json
  # Model-level validation fanned out over a model x dtype matrix.
  model-tests:
    runs-on: [self-hosted, gpu, A100]
    timeout-minutes: 120
    strategy:
      matrix:
        model: [resnet50, vit_b_16, llama-7b]
        dtype: [fp16, bf16]
    steps:
      - uses: actions/checkout@v4
      - name: Model Validation
        run: |
          python tests/models/validate_model.py \
            --model ${{ matrix.model }} \
            --dtype ${{ matrix.dtype }} \
            --baseline baselines/
  # Multi-GPU collective and convergence checks on an 8xA100 node.
  distributed-tests:
    runs-on: [self-hosted, gpu-cluster, 8xA100]
    timeout-minutes: 60
    steps:
      - uses: actions/checkout@v4
      - name: All-Reduce Correctness
        run: torchrun --nproc_per_node=8 tests/distributed/test_allreduce.py
      - name: Training Convergence
        run: torchrun --nproc_per_node=8 tests/distributed/test_convergence.py
Test Suite Organization¶
tests/
├── kernels/
│   ├── test_gemm.py           # GEMM correctness + edge cases
│   ├── test_conv.py           # Convolution tests
│   ├── test_attention.py      # Attention kernel tests
│   ├── test_softmax.py        # Softmax tests
│   └── test_layernorm.py      # LayerNorm/RMSNorm tests
├── frameworks/
│   ├── test_pytorch_ops.py    # PyTorch op coverage
│   ├── test_onnx_export.py    # ONNX export/import parity
│   └── test_torch_compile.py  # Compiler correctness
├── models/
│   ├── test_llm.py            # LLM correctness + performance
│   ├── test_cv.py             # CV model tests
│   └── test_speech.py         # Speech model tests
├── distributed/
│   ├── test_allreduce.py      # Collective correctness
│   ├── test_bandwidth.py      # Communication bandwidth
│   └── test_convergence.py    # Training convergence parity
├── pipeline/
│   ├── test_e2e_inference.py  # End-to-end pipeline
│   └── test_data_loading.py   # Data pipeline performance
├── hardware/
│   ├── test_power.py          # Power validation
│   ├── test_thermal.py        # Thermal validation
│   └── test_memory.py         # Memory integrity
└── conftest.py                # Shared fixtures (GPU setup, tolerances)
Pytest Conftest for GPU Tests¶
# tests/conftest.py
import pytest
import torch

# Per-dtype comparison tolerances: lower-precision dtypes get looser bounds.
TOLERANCES = {
    torch.float32: {"atol": 1e-5, "rtol": 1e-4},
    torch.float16: {"atol": 1e-3, "rtol": 1e-2},
    torch.bfloat16: {"atol": 1e-2, "rtol": 5e-2},
}

@pytest.fixture(autouse=True)
def gpu_setup():
    """Ensure GPU is ready before each test.

    autouse=True runs this around every test: skip when no CUDA device is
    present, and quiesce/clear the device both before and after the test so
    one test's allocations cannot bleed into the next.
    """
    if not torch.cuda.is_available():
        pytest.skip("No GPU available")
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    yield
    # Post-test cleanup mirrors the pre-test state.
    torch.cuda.synchronize()
    torch.cuda.empty_cache()

@pytest.fixture
def baseline_manager():
    """Provide shared access to golden baselines under ./baselines."""
    from baseline_manager import GoldenBaselineManager
    return GoldenBaselineManager("baselines")

@pytest.fixture(params=[torch.float16, torch.bfloat16])
def dtype(request):
    # Parametrized fixture: any test requesting `dtype` runs once per dtype.
    return request.param

def pytest_addoption(parser):
    """Register suite-wide command-line options for baseline dir and tolerance."""
    parser.addoption("--baseline-dir", default="baselines")
    parser.addoption("--tolerance-mode", default="normal",
                     choices=["strict", "normal", "relaxed"])
8.5 Release Validation Workflow¶
Pre-Release Checklist¶
## Release Validation Checklist: Driver vX.Y.Z
### Hardware
- [ ] Power validation: sustained load within TDP ±5%
- [ ] Thermal validation: stabilized temps, no unexpected throttling
- [ ] Memory integrity: 24h soak test, zero ECC errors
- [ ] Stability: 48h mixed workload, no hangs or crashes
### Kernels
- [ ] GEMM: all shapes × {FP32, FP16, BF16, FP8} → PASS
- [ ] Convolution: ResNet/YOLO shapes → PASS
- [ ] Attention: FlashAttention + standard → PASS
- [ ] Softmax: all edge cases → PASS
- [ ] LayerNorm/RMSNorm: all configs → PASS
### Frameworks
- [ ] PyTorch op coverage: >99% of critical ops
- [ ] ONNX Runtime EP: all supported ops → PASS
- [ ] torch.compile: correctness verified
- [ ] Mixed precision (AMP): correctness verified
### Models
- [ ] LLM (Llama 7B/70B): perplexity within ±0.5 of baseline
- [ ] CV (ResNet-50, ViT): accuracy within ±0.5%
- [ ] Speech (Whisper): WER within ±1%
- [ ] Performance: throughput within ±5% of baseline
### Distributed
- [ ] AllReduce correctness: 2, 4, 8 GPU → PASS
- [ ] Multi-node: 2-node, 4-node → PASS
- [ ] Training convergence: matches single-GPU baseline
### Release
- [ ] All regression tests pass
- [ ] No new XID errors / RAS events
- [ ] Performance baselines updated
- [ ] Release notes reviewed
Automated Release Gate¶
import json
import sys
def evaluate_release_gate(results_path):
    """Determine if a release passes the quality gate.

    Reads a results JSON file and evaluates two tiers of gates: hard
    gates block the release when any fails; soft gates only warn.
    Returns True when every hard gate passes.
    """
    with open(results_path) as f:
        results = json.load(f)
    # Hard gates (must pass). Missing keys default to the failing side so
    # an incomplete results file can never be approved.
    hard_gates = {
        "kernel_correctness": results.get("kernel_pass_rate", 0) >= 100,
        "model_correctness": results.get("model_accuracy_parity", False),
        "no_crashes": results.get("crash_count", 1) == 0,
        "no_ecc_errors": results.get("ecc_uncorrectable", 1) == 0,
    }
    # Soft gates (warn if failing)
    soft_gates = {
        "performance_within_5pct": results.get("max_perf_regression_pct", 100) <= 5,
        "memory_within_10pct": results.get("max_mem_regression_pct", 100) <= 10,
    }
    print("=== Release Gate Evaluation ===")
    all_pass = all(hard_gates.values())
    for name, ok in hard_gates.items():
        print(f" [HARD] {name}: {'PASS' if ok else 'BLOCK'}")
    for name, ok in soft_gates.items():
        print(f" [SOFT] {name}: {'PASS' if ok else 'WARN'}")
    print(f"\nRelease: {'APPROVED' if all_pass else 'BLOCKED'}")
    return all_pass
# Usage
# if not evaluate_release_gate("results.json"):
# sys.exit(1)
8.6 Continuous Benchmarking¶
Performance Tracking Over Time¶
Continuous benchmarking records every benchmark run with timestamps and uses statistical analysis to detect regressions automatically. The detect_regression method computes a z-score against a sliding window of historical results: if the current value is more than \(2\sigma\) below the mean, it is flagged as a regression. This statistical approach is superior to fixed-threshold comparisons because it adapts to natural performance variance (thermal conditions, background processes, DVFS state). The benchmark history is stored in JSONL format for easy integration with visualization tools (Grafana, custom dashboards) and CI/CD systems (GitHub Actions, GitLab CI, Jenkins). Over months of nightly runs, this data reveals gradual trends like memory usage creep or latency degradation that per-release testing would miss.
import json
from datetime import datetime
class ContinuousBenchmark:
    """Track performance over time for trend analysis.

    Results are appended to a JSONL file (one JSON object per line), which
    is cheap to append to and easy to ingest into dashboards or CI tooling.
    """

    def __init__(self, db_path="benchmark_history.jsonl"):
        self.db_path = Path(db_path)

    def record(self, test_name, metrics, metadata=None):
        """Append a benchmark result with a timestamp.

        Args:
            test_name: identifier for the benchmark.
            metrics: dict of metric name -> numeric value.
            metadata: optional dict of extra context (versions, hardware).
        """
        entry = {
            "timestamp": datetime.now().isoformat(),
            "test_name": test_name,
            "metrics": metrics,
            "metadata": metadata or {},
        }
        with open(self.db_path, "a") as f:
            f.write(json.dumps(entry) + "\n")

    def get_history(self, test_name, last_n=50):
        """Return up to the last `last_n` recorded entries for a test."""
        results = []
        if self.db_path.exists():
            with open(self.db_path) as f:
                for line in f:
                    entry = json.loads(line)
                    if entry["test_name"] == test_name:
                        results.append(entry)
        return results[-last_n:]

    def detect_regression(self, test_name, current_value, metric_name,
                          window=10, threshold_sigma=2):
        """Detect a regression via z-score against recent history.

        Flags a regression when the current value falls more than
        `threshold_sigma` standard deviations below the mean of the last
        `window` results. Returns (is_regression, detail_message).
        """
        history = self.get_history(test_name, last_n=window)
        if len(history) < 5:
            # Too little data for a meaningful mean/std estimate
            return False, "Insufficient history"
        values = [h["metrics"][metric_name] for h in history
                  if metric_name in h["metrics"]]
        if not values:
            return False, "No metric data"
        mean = sum(values) / len(values)
        variance = sum((v - mean) ** 2 for v in values) / len(values)
        std = variance ** 0.5
        if std == 0:
            # Perfectly flat history: z-score is undefined, so fall back
            # to a fixed 5%-below-mean threshold.
            return current_value < mean * 0.95, f"Flat baseline, current={current_value:.4f}"
        z_score = (current_value - mean) / std
        is_regression = z_score < -threshold_sigma
        # Fix: the threshold unit was a mis-encoded "Ο"; it is sigma (σ).
        return is_regression, (
            f"mean={mean:.4f}, std={std:.4f}, z={z_score:.2f}, "
            f"threshold=-{threshold_sigma}σ"
        )
Key Takeaways¶
Golden baselines are the foundation of regression testing — save known-good results
Cross-version testing catches compatibility issues before customers do
CI/CD automation ensures every commit is validated (no manual "it works on my machine")
Performance tracking over time detects gradual degradation (not just sudden breaks)
Release gates prevent shipping broken software — hard gates block, soft gates warn
Previous: 07_datacenter_validation.ipynb
Next: 09_benchmarking_industry.ipynb
Back to Overview: README.md