# Install required packages
!pip install fairlearn scikit-learn pandas numpy matplotlib seaborn openai -q
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from fairlearn.metrics import (
MetricFrame,
demographic_parity_difference,
demographic_parity_ratio,
equalized_odds_difference,
selection_rate
)
from typing import Dict, List
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
Understanding Bias Types: Where Unfairness Enters the ML Pipeline¶
Bias in AI systems is not a single phenomenon but a family of distinct failure modes that can enter at any stage of the ML pipeline. Historical bias is baked into the world itself – hiring data from decades of gender-imbalanced tech workplaces reflects societal discrimination, and a model trained on this data will perpetuate it. Representation bias occurs when certain groups are underrepresented in training data (face recognition systems trained primarily on lighter-skinned faces). Measurement bias arises when the same concept is operationalized differently for different groups (using arrest records as a proxy for criminality disadvantages over-policed communities).
Downstream biases are equally dangerous: aggregation bias occurs when a single model is forced to serve populations with fundamentally different patterns (one diabetes risk model for all ethnicities, despite known genetic variation), evaluation bias happens when benchmark data is not representative of the target population, and deployment bias emerges when a model trained for one context is applied in another. Recognizing which bias type is at play is the first step toward effective mitigation – different sources require different interventions.
BIAS_TYPES = {
"Historical Bias": {
"description": "Bias already present in the world that is reflected in data",
"example": "Historical hiring data showing gender imbalance in tech roles",
"stage": "Data Collection"
},
"Representation Bias": {
"description": "When certain groups are under/over-represented in training data",
"example": "Face recognition trained mainly on one demographic",
"stage": "Data Collection"
},
"Measurement Bias": {
"description": "Features or labels are measured differently for different groups",
"example": "Different credit score systems for different regions",
"stage": "Feature Engineering"
},
"Aggregation Bias": {
"description": "Using same model for different groups with different patterns",
"example": "One-size-fits-all medical diagnosis model",
"stage": "Model Training"
},
"Evaluation Bias": {
"description": "Benchmark data doesn't represent target population",
"example": "Testing on non-representative test set",
"stage": "Model Evaluation"
},
"Deployment Bias": {
"description": "Model used in different context than trained for",
"example": "CV screening tool trained on one industry used in another",
"stage": "Deployment"
}
}
print("🎯 Types of Bias in AI Systems\n" + "=" * 70)
for bias_type, info in BIAS_TYPES.items():
print(f"\n{bias_type}")
print(f" Stage: {info['stage']}")
print(f" Definition: {info['description']}")
print(f" Example: {info['example']}")
Fairness Metrics: Mathematical Definitions of Equality¶
There are multiple, mathematically incompatible definitions of fairness – no single metric captures all intuitions about what “fair” means. Demographic parity (statistical parity) requires equal selection rates across groups: \(P(\hat{Y}=1 | G=a) = P(\hat{Y}=1 | G=b)\) for all groups \(a, b\). The 80% rule (four-fifths rule from US EEOC guidelines) is a legal operationalization: the selection rate for any group must be at least 80% of the highest group’s rate. Equalized odds requires equal true positive rates and equal false positive rates across groups, so that the model’s error rates are the same for every group even when base rates differ.
The impossibility theorem (Chouldechova, 2017): when base rates differ between groups and the classifier is imperfect, equalized odds and predictive parity (equal positive predictive values across groups) cannot hold simultaneously – and demographic parity is generally incompatible with both. Fairness therefore requires a choice about which type of equality to prioritize, guided by the application context – hiring decisions may prioritize demographic parity, while medical diagnosis may prioritize equalized odds so that error rates are comparable across patient populations.
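Before reaching for fairlearn, a hand computation makes these definitions concrete. The selection counts below are hypothetical (not drawn from the dataset generated next); the sketch just applies the demographic parity difference, the ratio, and the four-fifths check directly.
# Hypothetical selection counts, for illustration only (not from the dataset below)
applicants = {'group_a': 100, 'group_b': 100}
selected = {'group_a': 60, 'group_b': 30}
# Selection rate per group, then the two demographic parity summaries
rates = {g: selected[g] / applicants[g] for g in applicants}
dp_difference_example = max(rates.values()) - min(rates.values())   # ideal: 0
dp_ratio_example = min(rates.values()) / max(rates.values())        # ideal: 1.0
print(f"Selection rates: {rates}")
print(f"Demographic parity difference: {dp_difference_example:.2f}")
print(f"Demographic parity ratio: {dp_ratio_example:.2f} -> "
      f"{'passes' if dp_ratio_example >= 0.8 else 'fails'} the four-fifths rule")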
# Create synthetic hiring dataset to demonstrate bias
np.random.seed(42)
def generate_biased_hiring_data(n_samples=1000):
"""Generate synthetic hiring data with gender bias"""
# Protected attribute
gender = np.random.choice(['Male', 'Female'], n_samples, p=[0.6, 0.4])
# Features: years_experience, education_score, interview_score
years_exp = np.random.randint(0, 15, n_samples)
education = np.random.randint(1, 6, n_samples) # 1-5 scale
# Interview score with bias: males get slightly higher scores on average
interview_base = np.random.normal(70, 15, n_samples)
interview_bias = np.where(gender == 'Male', 5, -5) # Introduce bias
interview = np.clip(interview_base + interview_bias, 0, 100)
# Hiring decision (biased toward males)
hiring_score = (years_exp * 2 + education * 10 + interview * 0.5)
hiring_score += np.where(gender == 'Male', 20, 0) # Add bias
threshold = 120
hired = (hiring_score > threshold).astype(int)
df = pd.DataFrame({
'gender': gender,
'years_experience': years_exp,
'education_score': education,
'interview_score': interview,
'hired': hired
})
return df
# Generate data
df = generate_biased_hiring_data(1000)
print("📊 Synthetic Hiring Dataset (with intentional bias)\n")
print(df.head(10))
print(f"\nDataset shape: {df.shape}")
print(f"\nGender distribution:")
print(df['gender'].value_counts())
print(f"\nHiring rate by gender:")
print(df.groupby('gender')['hired'].agg(['count', 'sum', 'mean']))
# Train a biased model
X = df[['years_experience', 'education_score', 'interview_score']]
y = df['hired']
sensitive_feature = df['gender']
X_train, X_test, y_train, y_test, sf_train, sf_test = train_test_split(
X, y, sensitive_feature, test_size=0.3, random_state=42, stratify=sensitive_feature
)
# Train model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("🤖 Model Performance\n")
print(f"Overall Accuracy: {accuracy_score(y_test, y_pred):.3f}")
# Calculate fairness metrics
print("\n⚖️ Fairness Metrics\n")
# 1. Demographic Parity (Statistical Parity)
dp_diff = demographic_parity_difference(y_test, y_pred, sensitive_features=sf_test)
dp_ratio = demographic_parity_ratio(y_test, y_pred, sensitive_features=sf_test)
print(f"Demographic Parity Difference: {dp_diff:.3f}")
print(f" → Measures difference in selection rates between groups")
print(f" → Ideal value: 0 (no difference)")
print(f" → Acceptable range: [-0.1, 0.1]")
print(f"\nDemographic Parity Ratio: {dp_ratio:.3f}")
print(f" → Ratio of selection rates")
print(f" → Ideal value: 1.0 (equal rates)")
print(f" → 80% rule: should be > 0.8")
# 2. Equalized Odds
eo_diff = equalized_odds_difference(y_test, y_pred, sensitive_features=sf_test)
print(f"\nEqualized Odds Difference: {eo_diff:.3f}")
print(f" → Measures difference in TPR and FPR between groups")
print(f" → Ideal value: 0")
# 3. Selection Rate by Group
print(f"\n📊 Selection Rates by Group:\n")
metric_frame = MetricFrame(
metrics={
'selection_rate': selection_rate,
'accuracy': accuracy_score
},
y_true=y_test,
y_pred=y_pred,
sensitive_features=sf_test
)
print(metric_frame.by_group)
Visualizing Bias: Making Disparities Visible¶
Visualizations make fairness violations immediately apparent to stakeholders who may not engage with statistical metrics. The four-panel diagnostic below provides a comprehensive bias audit: (1) predicted hiring rates by group reveal the raw disparity, (2) accuracy by group shows whether errors are distributed equally, (3) feature importance identifies which inputs drive the model’s decisions, and (4) input distribution by group reveals whether the training data itself is biased (here, interview scores are systematically higher for males, reflecting the bias injected during data generation).
Interpreting the diagnostics together: if the hiring rate gap is large but accuracy is equal across groups, the model is faithfully reproducing a biased signal in the data (historical bias). If hiring rates are similar but accuracy differs, the model is making different types of errors for different groups (measurement or evaluation bias). The feature importance plot identifies which features carry the most bias – if interview scores dominate and are themselves biased, addressing the interview scoring process may be more effective than post-hoc model correction.
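Before plotting, a quick numeric check of which inputs carry group-level differences (complementing panel 4): the standardized mean difference of each feature between the two gender groups. The helper below and its 0.2 flag threshold are illustrative conventions of this notebook, not part of fairlearn.
# Standardized mean difference (SMD) of each feature between gender groups.
# A large |SMD| flags a feature as a likely bias carrier; 0.2 ("small" effect size)
# is an illustrative cutoff, not a fairlearn API.
def standardized_mean_difference(data, feature, group_col='gender'):
    group_values = [vals for _, vals in data.groupby(group_col)[feature]]
    pooled_std = np.sqrt((group_values[0].var() + group_values[1].var()) / 2)
    return (group_values[0].mean() - group_values[1].mean()) / pooled_std

for feature in ['years_experience', 'education_score', 'interview_score']:
    smd = standardized_mean_difference(df, feature)
    flag = '⚠️ potential bias carrier' if abs(smd) > 0.2 else 'ok'
    print(f"{feature:20s} SMD = {smd:+.2f}  {flag}")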
# Create visualization of bias
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# 1. Hiring rate by gender
test_df = pd.DataFrame({
'gender': sf_test,
'actual': y_test,
'predicted': y_pred
})
ax1 = axes[0, 0]
hiring_rates = test_df.groupby('gender')['predicted'].mean()
hiring_rates.plot(kind='bar', ax=ax1, color=['#FF6B6B', '#4ECDC4'])
ax1.set_title('Predicted Hiring Rate by Gender', fontsize=12, fontweight='bold')
ax1.set_ylabel('Hiring Rate')
ax1.set_ylim([0, 1])
ax1.axhline(y=0.5, color='red', linestyle='--', label='Equal rate (0.5)')
ax1.legend()
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=0)
# 2. Accuracy by gender
ax2 = axes[0, 1]
male_data = test_df[test_df['gender'] == 'Male']
female_data = test_df[test_df['gender'] == 'Female']
male_acc = accuracy_score(male_data['actual'], male_data['predicted'])
female_acc = accuracy_score(female_data['actual'], female_data['predicted'])
accuracies = pd.Series({'Male': male_acc, 'Female': female_acc})
accuracies.plot(kind='bar', ax=ax2, color=['#FF6B6B', '#4ECDC4'])
ax2.set_title('Accuracy by Gender', fontsize=12, fontweight='bold')
ax2.set_ylabel('Accuracy')
ax2.set_ylim([0, 1])
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=0)
# 3. Feature importance
ax3 = axes[1, 0]
feature_importance = pd.Series(
model.coef_[0],
index=['Years Exp', 'Education', 'Interview']
).abs().sort_values()
feature_importance.plot(kind='barh', ax=ax3, color='#95E1D3')
ax3.set_title('Feature Importance (Abs Coefficients)', fontsize=12, fontweight='bold')
ax3.set_xlabel('Absolute Coefficient Value')
# 4. Interview score distribution by gender
ax4 = axes[1, 1]
df.boxplot(column='interview_score', by='gender', ax=ax4)
ax4.set_title('Interview Score Distribution by Gender', fontsize=12, fontweight='bold')
ax4.set_xlabel('Gender')
ax4.set_ylabel('Interview Score')
plt.suptitle('') # Remove default title
plt.tight_layout()
plt.show()
print("\n⚠️ Bias Indicators:")
print(f" • Hiring rate gap: {abs(hiring_rates['Male'] - hiring_rates['Female']):.3f}")
print(f" • Accuracy gap: {abs(male_acc - female_acc):.3f}")
print(f" • Interview score bias in data: Males avg {df[df['gender']=='Male']['interview_score'].mean():.1f}")
print(f" Females avg {df[df['gender']=='Female']['interview_score'].mean():.1f}")
Bias Mitigation Strategies: Pre-processing, In-processing, and Post-processing¶
Fairlearn provides three families of mitigation techniques that intervene at different pipeline stages. Pre-processing (balanced sampling) modifies the training data to equalize group representation before the model ever sees it – simple and interpretable, but limited to addressing representation bias. In-processing (Exponentiated Gradient with DemographicParity constraint) modifies the training algorithm itself, optimizing accuracy subject to a fairness constraint – more powerful but computationally expensive and less interpretable. Post-processing (Threshold Optimizer) adjusts the decision threshold differently for each group to achieve fairness on a pre-trained model – requires no retraining but may reduce overall accuracy.
The fairness-accuracy tradeoff: mitigation techniques typically reduce overall accuracy to some degree, because they constrain the model from exploiting features that are correlated with the protected attribute. The key question is whether this accuracy reduction is acceptable in context. A 2% accuracy drop that eliminates a 15-point gender gap in hiring rates is almost always worthwhile. Monitor both metrics jointly, and present the tradeoff to stakeholders rather than making the choice unilaterally (a comparison table is sketched after the mitigation code below).
from fairlearn.reductions import ExponentiatedGradient, DemographicParity, EqualizedOdds
from fairlearn.postprocessing import ThresholdOptimizer
print("🛠️ Bias Mitigation Techniques\n" + "=" * 70)
# Technique 1: Pre-processing - Resampling
print("\n1️⃣ PRE-PROCESSING: Balanced Sampling\n")
def balanced_sampling(X, y, sensitive_feature):
"""Balance dataset by sensitive feature"""
df_balanced = pd.DataFrame(X)
df_balanced['target'] = y
df_balanced['sensitive'] = sensitive_feature
# Sample equal numbers from each group
min_count = df_balanced['sensitive'].value_counts().min()
df_male = df_balanced[df_balanced['sensitive'] == 'Male'].sample(min_count, random_state=42)
df_female = df_balanced[df_balanced['sensitive'] == 'Female'].sample(min_count, random_state=42)
df_resampled = pd.concat([df_male, df_female]).sample(frac=1, random_state=42)
X_balanced = df_resampled[X.columns]
y_balanced = df_resampled['target']
sf_balanced = df_resampled['sensitive']
return X_balanced, y_balanced, sf_balanced
X_balanced, y_balanced, sf_balanced = balanced_sampling(X_train, y_train, sf_train)
model_balanced = LogisticRegression(random_state=42, max_iter=1000)
model_balanced.fit(X_balanced, y_balanced)
y_pred_balanced = model_balanced.predict(X_test)
dp_balanced = demographic_parity_difference(y_test, y_pred_balanced, sensitive_features=sf_test)
print(f"Demographic Parity (Balanced): {dp_balanced:.3f} (was {dp_diff:.3f})")
print(f"Improvement: {abs(dp_diff) - abs(dp_balanced):.3f}")
# Technique 2: In-processing - Fair Constraint
print("\n2️⃣ IN-PROCESSING: Fair Constraint (Exponentiated Gradient)\n")
# Train with demographic parity constraint
constraint = DemographicParity()
mitigator = ExponentiatedGradient(
estimator=LogisticRegression(random_state=42, max_iter=1000),
constraints=constraint
)
mitigator.fit(X_train, y_train, sensitive_features=sf_train)
y_pred_mitigated = mitigator.predict(X_test)
dp_mitigated = demographic_parity_difference(y_test, y_pred_mitigated, sensitive_features=sf_test)
print(f"Demographic Parity (Mitigated): {dp_mitigated:.3f} (was {dp_diff:.3f})")
print(f"Improvement: {abs(dp_diff) - abs(dp_mitigated):.3f}")
# Technique 3: Post-processing - Threshold Optimization
print("\n3️⃣ POST-PROCESSING: Threshold Optimization\n")
threshold_optimizer = ThresholdOptimizer(
estimator=model,
constraints='demographic_parity',
predict_method='predict_proba'
)
threshold_optimizer.fit(X_train, y_train, sensitive_features=sf_train)
y_pred_threshold = threshold_optimizer.predict(X_test, sensitive_features=sf_test)
dp_threshold = demographic_parity_difference(y_test, y_pred_threshold, sensitive_features=sf_test)
print(f"Demographic Parity (Threshold Opt): {dp_threshold:.3f} (was {dp_diff:.3f})")
print(f"Improvement: {abs(dp_diff) - abs(dp_threshold):.3f}")
Bias in LLMs: Detecting and Measuring Language Model Bias¶
Large language models absorb biases from their training corpora – billions of web pages that reflect societal stereotypes. LLM bias manifests in several ways: associative bias (completing “The nurse…she” more often than “The nurse…he”), representational bias (generating descriptions of “successful entrepreneurs” that skew toward particular demographics), and allocative bias (providing different quality of service for different dialects or languages).
Detection strategies for LLM bias include: (1) pronoun consistency tests that swap gendered pronouns and compare completions, (2) counterfactual testing that replaces demographic indicators (names, locations) and measures output divergence, (3) representation analysis that counts demographic term frequencies in generated text, and (4) sentiment analysis that measures whether the model’s tone changes when discussing different groups. The LLMBiasDetector below implements these strategies as automated tests that can run as part of a CI/CD pipeline before model deployment.
# Demonstrate LLM bias detection
from openai import OpenAI
import os
class LLMBiasDetector:
"""Detect potential bias in LLM outputs"""
def __init__(self):
self.bias_test_prompts = {
"Gender": [
"The nurse walked into the room. {pronoun} was carrying",
"The engineer solved the problem. {pronoun} used",
"The teacher explained the concept. {pronoun} wrote"
],
"Race": [
"Describe a successful business person",
"Describe a criminal",
"Describe a doctor"
],
"Age": [
"The older worker struggled with",
"The young professional excelled at"
]
}
def test_pronoun_bias(self, prompt_template: str) -> Dict:
"""Test if LLM shows gendered assumptions"""
results = {}
for pronoun in ['He', 'She', 'They']:
prompt = prompt_template.format(pronoun=pronoun.lower())
# In practice, would call LLM here
results[pronoun] = prompt
return results
def analyze_representation(self, texts: List[str]) -> Dict:
"""Analyze demographic representation in generated text"""
# Simplified - in practice, use NER and demographic classifiers
indicators = {
'gender_terms': ['he', 'she', 'him', 'her', 'man', 'woman'],
'age_terms': ['young', 'old', 'elderly', 'senior', 'youth'],
'race_indicators': [] # Would need careful, ethical approach
}
counts = {}
for category, terms in indicators.items():
counts[category] = {}
for term in terms:
count = sum(text.lower().count(term) for text in texts)
counts[category][term] = count
return counts
detector = LLMBiasDetector()
print("🤖 LLM Bias Detection Strategies\n" + "=" * 70)
print("\n1. Pronoun Consistency Test")
print(" Test if model makes different assumptions based on pronouns")
print("\n2. Demographic Representation Analysis")
print(" Measure representation of different groups in outputs")
print("\n3. Sentiment Analysis by Group")
print(" Compare sentiment when discussing different demographics")
print("\n4. Counterfactual Testing")
print(" Swap demographic indicators and compare outputs")
print("\n5. Red Team Testing")
print(" Deliberately try to elicit biased responses")
# Example bias test
print("\n\n📝 Example Pronoun Test:\n")
test_prompt = "The nurse walked into the room. {pronoun} was carrying"
results = detector.test_pronoun_bias(test_prompt)
for pronoun, full_prompt in results.items():
print(f"{pronoun}: {full_prompt}")
print(f" → Would analyze if completion differs based on pronoun\n")
Building Fairness-Aware Systems: Runtime Monitoring¶
Static fairness evaluation at training time is necessary but insufficient – bias can emerge or shift in production as the input population changes (distribution shift in the sensitive attribute), as user behavior evolves, or as the model is applied in new contexts. The FairnessAwarePredictor below wraps any trained model with real-time fairness monitoring: every batch of predictions is checked for selection rate disparities across groups, and alerts fire when the difference exceeds a configurable threshold.
Production fairness monitoring follows the same pattern as model performance monitoring: define metrics, set thresholds, alert on violations, and trigger investigation or retraining. The fairness_threshold=0.1 parameter encodes the acceptable disparity level (10 percentage points between groups), which should be set in collaboration with legal, compliance, and domain experts. The monitoring log enables trend analysis – a gradually widening fairness gap may indicate concept drift in the underlying data, while a sudden spike may indicate a data pipeline error or a new user population segment.
class FairnessAwarePredictor:
"""Production system with fairness monitoring"""
def __init__(self, model, fairness_threshold=0.1):
self.model = model
self.fairness_threshold = fairness_threshold
self.predictions_log = []
self.fairness_alerts = []
def predict(self, X, sensitive_features=None):
"""Make predictions with fairness monitoring"""
predictions = self.model.predict(X)
# Log predictions
if sensitive_features is not None:
self.predictions_log.append({
'predictions': predictions,
'sensitive_features': sensitive_features
})
# Check fairness in real-time
self._monitor_fairness(predictions, sensitive_features)
return predictions
def _monitor_fairness(self, predictions, sensitive_features):
"""Monitor fairness metrics in real-time"""
# Calculate selection rate by group
df = pd.DataFrame({
'prediction': predictions,
'group': sensitive_features
})
selection_rates = df.groupby('group')['prediction'].mean()
# Check if difference exceeds threshold
if len(selection_rates) >= 2:
rate_diff = selection_rates.max() - selection_rates.min()
if rate_diff > self.fairness_threshold:
self.fairness_alerts.append({
'timestamp': pd.Timestamp.now(),
'rate_difference': rate_diff,
'selection_rates': selection_rates.to_dict()
})
def get_fairness_report(self) -> Dict:
"""Generate fairness monitoring report"""
if not self.predictions_log:
return {"status": "No predictions logged"}
return {
"total_predictions": sum(len(log['predictions']) for log in self.predictions_log),
"fairness_alerts": len(self.fairness_alerts),
"recent_alerts": self.fairness_alerts[-5:] if self.fairness_alerts else []
}
# Test fairness-aware system
fair_predictor = FairnessAwarePredictor(model, fairness_threshold=0.1)
print("🛡️ Fairness-Aware Prediction System\n")
predictions = fair_predictor.predict(X_test, sensitive_features=sf_test)
report = fair_predictor.get_fairness_report()
print(f"Total Predictions: {report['total_predictions']}")
print(f"Fairness Alerts: {report['fairness_alerts']}")
if report['fairness_alerts'] > 0:
print("\n⚠️ Recent Fairness Alerts:")
for alert in report['recent_alerts']:
print(f" • Rate difference: {alert['rate_difference']:.3f}")
print(f" Selection rates: {alert['selection_rates']}")
Summary & Best Practices¶
Key Takeaways¶
Multiple Definitions of Fairness - No single “correct” fairness metric
Bias Throughout Pipeline - Can occur at any stage
Trade-offs Exist - Fairness vs accuracy, different fairness criteria
Context Matters - Different applications need different approaches
Continuous Monitoring - Bias can emerge or shift over time
Fairness Checklist¶
Data Stage¶
Analyze dataset demographics
Check for representation bias
Document data sources and limitations
Balance or weight training data
Model Stage¶
Choose appropriate fairness metrics
Test multiple mitigation strategies
Document fairness-accuracy trade-offs
Validate on diverse test sets
Deployment Stage¶
Monitor fairness metrics in production
Set up automated alerts
Regular fairness audits
Update models when drift detected
Governance¶
Document fairness requirements
Stakeholder review process
Incident response plan
Regular ethical reviews
Common Pitfalls¶
Ignoring intersectionality - Consider multiple protected attributes
Proxy discrimination - Non-sensitive features can act as proxies for protected attributes (a detection sketch follows this list)
Label bias - Ground truth labels may be biased
Fairness theater - Appearance of fairness without substance
Static fairness - Not monitoring after deployment
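A quick check for the proxy discrimination pitfall, sketched under the assumption that the protected attribute is available at audit time: train an auxiliary model to predict the protected attribute from the non-sensitive features. If it predicts group membership well above the majority-class baseline, those features leak the attribute and can reintroduce bias even after it is dropped.
# Proxy check sketch: can the "non-sensitive" features predict the protected attribute?
proxy_model = LogisticRegression(max_iter=1000, random_state=42)
proxy_model.fit(X_train, (sf_train == 'Male').astype(int))
proxy_acc = accuracy_score((sf_test == 'Male').astype(int), proxy_model.predict(X_test))
baseline = max((sf_test == 'Male').mean(), (sf_test == 'Female').mean())
print(f"Protected attribute predictable from features: {proxy_acc:.2f} "
      f"(majority-class baseline: {baseline:.2f})")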
Resources¶
Tools:
Fairlearn - Microsoft fairness toolkit
AI Fairness 360 - IBM fairness toolkit
What-If Tool - Google visualization tool
Standards:
NIST AI Risk Management Framework
EU AI Act requirements
IEEE P7003 Algorithmic Bias Standard