# Install required packages (run once)
# !pip install nltk rouge-score bert-score sacrebleu

import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# NLP metrics
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bert_score

# Download NLTK data
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

print("✅ Setup complete")

Part 1: Introduction to LLM Evaluation

Why is LLM Evaluation Different?

Traditional ML: Single correct answer

  • Classification: Dog or Cat

  • Regression: Exact price

LLM Output: Many correct answers!

  • “The cat sat on the mat”

  • “A feline was resting on the rug”

  • “On the mat, there was a cat sitting”

Categories of LLM Metrics

  1. N-gram Overlap: BLEU, ROUGE, METEOR

    • Compare word/phrase overlap with reference

  2. Model-Based: Perplexity, BERTScore

    • Use ML models to evaluate

  3. Human Evaluation: Fluency, coherence, relevance

    • Human judgment (gold standard)

  4. Task-Specific: Accuracy, F1 for specific tasks

    • Depends on use case

Part 2: BLEU Score

BLEU (Bilingual Evaluation Understudy)

Originally for: Machine translation

How it works:

  1. Count n-gram matches (1-gram, 2-gram, 3-gram, 4-gram)

  2. Apply brevity penalty (penalize short outputs)

  3. Combine scores

Range: 0 to 1 (or 0 to 100)

  • 1.0: Perfect match

  • 0.0: No match

Good for: Translation, text generation

Limitations:

  • Doesn’t consider meaning

  • Requires reference text

  • Can miss paraphrases

def calculate_bleu(reference, candidate):
    """Calculate BLEU score"""
    reference_tokens = [reference.split()]
    candidate_tokens = candidate.split()
    
    # Use smoothing for short sentences
    smoothing = SmoothingFunction().method1
    
    # Calculate BLEU with different n-gram weights; apply smoothing consistently
    # so that missing higher-order matches don't zero out the score
    bleu1 = sentence_bleu(reference_tokens, candidate_tokens, weights=(1, 0, 0, 0),
                          smoothing_function=smoothing)
    bleu2 = sentence_bleu(reference_tokens, candidate_tokens, weights=(0.5, 0.5, 0, 0),
                          smoothing_function=smoothing)
    bleu4 = sentence_bleu(reference_tokens, candidate_tokens, weights=(0.25, 0.25, 0.25, 0.25),
                          smoothing_function=smoothing)
    
    return {
        'BLEU-1': bleu1,
        'BLEU-2': bleu2,
        'BLEU-4': bleu4
    }

# Example: Translation evaluation
reference = "The cat is sitting on the mat"

candidates = [
    ("The cat is sitting on the mat", "Perfect match"),
    ("The cat sits on the mat", "Close match"),
    ("A cat is on the mat", "Good match"),
    ("The feline rests on the rug", "Paraphrase (different words)"),
    ("Dog is running in park", "Completely wrong")
]

print("BLEU Score Examples:")
print("=" * 70)
print(f"Reference: '{reference}'\n")

for candidate, description in candidates:
    scores = calculate_bleu(reference, candidate)
    print(f"{description}:")
    print(f"  Candidate: '{candidate}'")
    print(f"  BLEU-1: {scores['BLEU-1']:.3f}")
    print(f"  BLEU-2: {scores['BLEU-2']:.3f}")
    print(f"  BLEU-4: {scores['BLEU-4']:.3f}")
    print()

Understanding BLEU Components

BLEU combines two key ideas: modified n-gram precision and a brevity penalty. The precision component counts how many n-grams in the candidate text appear in the reference, clipping counts to avoid rewarding repetition. The brevity penalty (BP) penalizes candidates that are shorter than the reference, preventing a system from gaming precision by outputting only the words it is most confident about. The final BLEU score is \(\text{BP} \cdot \exp\left(\sum_{n=1}^{N} w_n \log p_n\right)\), where \(p_n\) is the modified precision for n-grams of size \(n\) and \(w_n\) are weights (typically uniform at \(1/N\)). Understanding these internals helps explain why BLEU sometimes disagrees with human judgment – it is purely lexical and ignores word order beyond n-gram windows.

def detailed_bleu_analysis(reference, candidate):
    """Show detailed BLEU calculation"""
    ref_tokens = reference.split()
    cand_tokens = candidate.split()
    
    # 1-gram (unigram) matches
    ref_unigrams = Counter(ref_tokens)
    cand_unigrams = Counter(cand_tokens)
    
    matches = sum((ref_unigrams & cand_unigrams).values())
    precision = matches / len(cand_tokens) if len(cand_tokens) > 0 else 0
    
    # Brevity penalty
    bp = 1.0 if len(cand_tokens) >= len(ref_tokens) else np.exp(1 - len(ref_tokens)/len(cand_tokens))
    
    print(f"Reference: '{reference}' ({len(ref_tokens)} words)")
    print(f"Candidate: '{candidate}' ({len(cand_tokens)} words)")
    print(f"\n1-gram matches: {matches}/{len(cand_tokens)}")
    print(f"Precision: {precision:.3f}")
    print(f"Brevity Penalty: {bp:.3f}")
    
    if len(cand_tokens) < len(ref_tokens):
        print("  ⚠️ Candidate is shorter - penalty applied!")

# Demonstrate brevity penalty
print("\n📊 Brevity Penalty Example:")
print("=" * 60)
detailed_bleu_analysis(
    "The cat is sitting on the mat",
    "The cat"  # Too short!
)
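
The clipping mentioned above is what stops degenerate outputs from scoring well: a candidate word only counts as many times as it appears in the reference. A minimal sketch comparing clipped and unclipped unigram precision on a made-up repeated-word candidate:

from collections import Counter

def unigram_precision(reference, candidate, clipped=True):
    """Unigram precision with or without count clipping."""
    ref_counts = Counter(reference.split())
    cand_counts = Counter(candidate.split())
    if clipped:
        # Each candidate word counts at most as often as it appears in the reference
        matches = sum((ref_counts & cand_counts).values())
    else:
        matches = sum(c for w, c in cand_counts.items() if w in ref_counts)
    return matches / sum(cand_counts.values())

reference = "The cat is sitting on the mat"
degenerate = "the the the the the the the"  # tries to game precision by repetition

print(f"Unclipped precision: {unigram_precision(reference, degenerate, clipped=False):.2f}")
print(f"Clipped precision:   {unigram_precision(reference, degenerate, clipped=True):.2f}")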

Part 3: ROUGE Metrics

ROUGE (Recall-Oriented Understudy for Gisting Evaluation)

Originally for: Text summarization

Key Difference from BLEU:

  • BLEU is precision-oriented: what fraction of the candidate's n-grams appear in the reference?

  • ROUGE is recall-oriented: what fraction of the reference's n-grams appear in the candidate?

ROUGE Variants

  1. ROUGE-N: N-gram overlap (like BLEU)

  2. ROUGE-L: Longest Common Subsequence (a word-level sketch follows this list)

  3. ROUGE-S: Skip-bigram matching
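
ROUGE-L scores the longest common subsequence (LCS) between reference and candidate: precision is LCS length over the candidate length, recall is LCS length over the reference length, and the reported score is their F-measure. A minimal word-level sketch (ignoring stemming and the library's exact weighting) shows the idea:

def lcs_length(a_tokens, b_tokens):
    """Length of the longest common subsequence between two token lists."""
    dp = [[0] * (len(b_tokens) + 1) for _ in range(len(a_tokens) + 1)]
    for i, a in enumerate(a_tokens, 1):
        for j, b in enumerate(b_tokens, 1):
            dp[i][j] = dp[i-1][j-1] + 1 if a == b else max(dp[i-1][j], dp[i][j-1])
    return dp[-1][-1]

def rouge_l(reference, candidate):
    """Word-level ROUGE-L precision, recall, and F1 (no stemming)."""
    ref_tokens = reference.lower().split()
    cand_tokens = candidate.lower().split()
    lcs = lcs_length(ref_tokens, cand_tokens)
    precision = lcs / len(cand_tokens) if cand_tokens else 0.0
    recall = lcs / len(ref_tokens) if ref_tokens else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    return precision, recall, f1

p, r, f1 = rouge_l("The cat is sitting on the mat", "A cat is on the mat")
print(f"ROUGE-L  P={p:.3f}  R={r:.3f}  F1={f1:.3f}")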

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Example: Summarization evaluation
reference_summary = """The quick brown fox jumped over the lazy dog. 
The dog was sleeping under a tree."""

generated_summaries = [
    ("The quick brown fox jumped over the lazy dog sleeping under a tree.", "Good summary"),
    ("A brown fox jumped over a dog.", "Acceptable summary"),
    ("The fox was quick and brown.", "Partial information"),
    ("There was a cat in the garden.", "Incorrect summary")
]

print("ROUGE Score Examples:")
print("=" * 70)
print(f"Reference: {reference_summary}\n")

for summary, description in generated_summaries:
    scores = scorer.score(reference_summary, summary)
    
    print(f"{description}:")
    print(f"  Generated: '{summary}'")
    print(f"  ROUGE-1: P={scores['rouge1'].precision:.3f}, R={scores['rouge1'].recall:.3f}, F1={scores['rouge1'].fmeasure:.3f}")
    print(f"  ROUGE-2: P={scores['rouge2'].precision:.3f}, R={scores['rouge2'].recall:.3f}, F1={scores['rouge2'].fmeasure:.3f}")
    print(f"  ROUGE-L: P={scores['rougeL'].precision:.3f}, R={scores['rougeL'].recall:.3f}, F1={scores['rougeL'].fmeasure:.3f}")
    print()

BLEU vs ROUGE: When to Use What?

| Task               | Metric  | Why                                       |
|--------------------|---------|-------------------------------------------|
| Translation        | BLEU    | Precision matters - don't add extra words |
| Summarization      | ROUGE   | Recall matters - capture key information  |
| Question Answering | Both    | Need precision AND recall                 |
| Creative Writing   | Neither | Semantic meaning more important           |

# Compare BLEU and ROUGE on same example
def compare_bleu_rouge(reference, candidate):
    # BLEU
    bleu_scores = calculate_bleu(reference, candidate)
    
    # ROUGE
    rouge_scores = scorer.score(reference, candidate)
    
    print(f"Reference: '{reference}'")
    print(f"Candidate: '{candidate}'\n")
    
    print("BLEU Scores (Precision-focused):")
    print(f"  BLEU-4: {bleu_scores['BLEU-4']:.3f}")
    
    print("\nROUGE Scores (Recall-focused):")
    print(f"  ROUGE-1 F1: {rouge_scores['rouge1'].fmeasure:.3f}")
    print(f"  ROUGE-L F1: {rouge_scores['rougeL'].fmeasure:.3f}")

print("Example 1: Short candidate (missing info)")
print("=" * 60)
compare_bleu_rouge(
    "The cat is sitting on the mat and sleeping",
    "The cat is sitting"  # Incomplete
)

print("\n\nExample 2: Long candidate (extra info)")
print("=" * 60)
compare_bleu_rouge(
    "The cat is sitting",
    "The cat is sitting on the mat and sleeping"  # Too much
)

Part 4: Perplexity

What is Perplexity?

Definition: How “surprised” is the model by the text?

Formula: \(\text{Perplexity} = \exp\left(-\frac{1}{N}\sum_{i=1}^{N} \log p(x_i \mid x_{<i})\right)\), i.e. the exponential of the average per-token negative log-likelihood

Lower is Better:

  • Lower perplexity = model is confident

  • Higher perplexity = model is confused

Typical Values (a rough guide; perplexity depends heavily on the dataset and tokenizer, so only compare models on the same test set):

  • 20-60: Excellent

  • 60-100: Good

  • 100-200: Okay

  • > 200: Poor

Use Case: Evaluate language model quality (not individual outputs)

def calculate_perplexity(probabilities):
    """
    Calculate perplexity from token probabilities
    
    Args:
        probabilities: List of probabilities for each token
    """
    # Avoid log(0)
    probabilities = np.clip(probabilities, 1e-10, 1.0)
    
    # Calculate negative log-likelihood
    nll = -np.mean(np.log(probabilities))
    
    # Perplexity is exp(NLL)
    perplexity = np.exp(nll)
    
    return perplexity

# Simulate model predictions for "The cat sat on the mat"
scenarios = [
    {
        "name": "Confident Model (Good)",
        "probs": [0.95, 0.90, 0.92, 0.88, 0.91, 0.93],  # High probabilities
        "description": "Model is very confident about each word"
    },
    {
        "name": "Uncertain Model (Poor)",
        "probs": [0.30, 0.25, 0.28, 0.22, 0.26, 0.24],  # Low probabilities
        "description": "Model is confused about each word"
    },
    {
        "name": "Mixed Model",
        "probs": [0.95, 0.85, 0.40, 0.90, 0.35, 0.92],  # Variable
        "description": "Model confident about some words, confused about others"
    }
]

print("Perplexity Examples:")
print("=" * 70)

for scenario in scenarios:
    perp = calculate_perplexity(scenario['probs'])
    avg_prob = np.mean(scenario['probs'])
    
    print(f"\n{scenario['name']}:")
    print(f"  {scenario['description']}")
    print(f"  Token probabilities: {scenario['probs']}")
    print(f"  Average probability: {avg_prob:.3f}")
    print(f"  Perplexity: {perp:.2f}")
    
    if perp < 2:
        print("  ✅ Excellent - very confident")
    elif perp < 5:
        print("  ✅ Good - confident")
    elif perp < 10:
        print("  ⚠️ Fair - somewhat uncertain")
    else:
        print("  ❌ Poor - very uncertain")

Practical Perplexity Usage

Perplexity measures the language model’s overall confidence on a corpus, not the quality of any individual output. A model with perplexity 25 on English news text is, roughly speaking, “25 words uncertain” at each position – it has narrowed down the next token to about 25 equally likely candidates on average.
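
This "effective branching factor" reading follows directly from the formula: if the model assigns a uniform probability of 1/k to every token, its perplexity is exactly k. A quick check using the calculate_perplexity function defined above:

# If every token gets probability 1/k, perplexity is exactly k ("k-way uncertain")
for k in [2, 25, 1000]:
    uniform_probs = [1.0 / k] * 10  # ten tokens, each assigned probability 1/k
    print(f"Uniform p = 1/{k:<4d} -> perplexity = {calculate_perplexity(uniform_probs):.1f}")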

Good for:

  • Comparing language models on the same test set (lower perplexity = better language modeling)

  • Tracking training progress (perplexity should decrease over epochs)

  • Detecting out-of-distribution text (domain-shifted text yields higher perplexity)

Not good for:

  • Evaluating factual correctness (a model can be very confident and very wrong)

  • Measuring output quality for specific tasks like summarization

  • Comparing models across different tokenizers (different vocabularies make perplexity incomparable)

# Simulate perplexity on different text types
text_types = {
    "Training Domain": [0.85, 0.88, 0.90, 0.87, 0.89],
    "Similar Domain": [0.70, 0.75, 0.72, 0.68, 0.74],
    "Different Domain": [0.40, 0.35, 0.42, 0.38, 0.41],
    "Random Noise": [0.10, 0.08, 0.12, 0.09, 0.11]
}

results = []
for text_type, probs in text_types.items():
    perp = calculate_perplexity(probs)
    results.append({'Text Type': text_type, 'Perplexity': perp})

df = pd.DataFrame(results)
print("\nPerplexity by Text Type:")
print("=" * 50)
print(df.to_string(index=False))

print("\n💡 Insight: Perplexity increases as text becomes more")
print("   unfamiliar to the model!")

Part 5: Semantic Similarity (BERTScore)

Why BERTScore?

Problem with BLEU/ROUGE: Only lexical overlap

  • “The movie was great” vs “The film was excellent” = Low score!

  • But they mean the same thing!

BERTScore Solution: Use BERT embeddings

  • Captures semantic meaning

  • Handles paraphrases

  • More correlated with human judgment

How it works:

  1. Get BERT embeddings for each token

  2. Compute cosine similarity between embeddings

  3. Find best matching tokens

  4. Calculate precision, recall, F1

# BERTScore example
reference = "The movie was fantastic and entertaining"

candidates = [
    "The movie was fantastic and entertaining",  # Exact match
    "The film was excellent and enjoyable",      # Paraphrase
    "The movie was bad and boring",              # Opposite meaning
    "The cat sat on the mat"                     # Unrelated
]

print("BERTScore Evaluation:")
print("=" * 70)
print(f"Reference: '{reference}'\n")

# Calculate BERTScore
P, R, F1 = bert_score(candidates, [reference] * len(candidates), 
                      lang='en', verbose=False)

for i, candidate in enumerate(candidates):
    # Also calculate BLEU for comparison
    bleu = calculate_bleu(reference, candidate)['BLEU-4']
    
    print(f"Candidate {i+1}: '{candidate}'")
    print(f"  BLEU-4:     {bleu:.3f}")
    print(f"  BERTScore:  {F1[i]:.3f}")
    
    if i == 1:  # Paraphrase
        print("  💡 Note: BERTScore is higher because it understands synonyms!")
    print()

When to Use BERTScore

BERTScore leverages contextual embeddings from pre-trained transformers to match tokens between candidate and reference texts based on semantic similarity rather than exact string matching. A word like “excellent” will score highly against “fantastic” because their embedding vectors are close in BERT’s representation space.
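
To make the matching step concrete, here is a toy sketch of BERTScore-style greedy matching using made-up 2-D "embeddings" (real BERTScore uses contextual BERT vectors and optional IDF weighting, so treat this purely as an illustration of the mechanics):

import numpy as np

# Made-up embeddings: similar words get similar vectors (purely illustrative)
toy_embeddings = {
    "movie":     np.array([0.9, 0.1]),
    "film":      np.array([0.85, 0.15]),
    "fantastic": np.array([0.2, 0.95]),
    "excellent": np.array([0.25, 0.9]),
}

def cosine(u, v):
    return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v)))

def toy_bertscore(candidate_tokens, reference_tokens):
    """Greedy matching: each token pairs with its most similar counterpart."""
    sim = np.array([[cosine(toy_embeddings[c], toy_embeddings[r])
                     for r in reference_tokens] for c in candidate_tokens])
    precision = sim.max(axis=1).mean()   # best reference match for each candidate token
    recall = sim.max(axis=0).mean()      # best candidate match for each reference token
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1

p, r, f1 = toy_bertscore(["film", "excellent"], ["movie", "fantastic"])
print(f"Toy BERTScore: P={p:.3f}  R={r:.3f}  F1={f1:.3f}")

Even though no token matches exactly, the paraphrase scores close to 1.0 because the vectors for "film"/"movie" and "excellent"/"fantastic" point in nearly the same directions.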

Use BERTScore when:

  • Semantic meaning matters more than exact wording (e.g., paraphrase evaluation)

  • Evaluating creative or diverse outputs where many valid phrasings exist

  • You need a metric that correlates more closely with human judgment than BLEU/ROUGE

Use BLEU/ROUGE when:

  • Exact wording matters (e.g., code generation, factual extraction)

  • You need fast, lightweight metrics without GPU inference

  • Standard benchmark comparison requires established metrics for reproducibility

Part 6: Human Evaluation

Why Human Evaluation?

Automated metrics can’t capture:

  • Factual correctness

  • Helpfulness

  • Harmfulness

  • Cultural appropriateness

  • Creative quality

Common Evaluation Dimensions

# Human evaluation rubric example
evaluation_rubric = pd.DataFrame([
    {
        "Dimension": "Fluency",
        "Question": "Is the text grammatically correct and natural?",
        "Scale": "1 (Poor) - 5 (Excellent)",
        "Example 5": "Well-written, natural language",
        "Example 1": "Broken grammar, nonsensical"
    },
    {
        "Dimension": "Relevance",
        "Question": "Does it answer the question/task?",
        "Scale": "1 (Irrelevant) - 5 (Perfectly relevant)",
        "Example 5": "Directly addresses all aspects",
        "Example 1": "Completely off-topic"
    },
    {
        "Dimension": "Coherence",
        "Question": "Does it make logical sense?",
        "Scale": "1 (Incoherent) - 5 (Very coherent)",
        "Example 5": "Logical flow, clear reasoning",
        "Example 1": "Contradictory, confusing"
    },
    {
        "Dimension": "Factuality",
        "Question": "Is the information accurate?",
        "Scale": "1 (Many errors) - 5 (Fully accurate)",
        "Example 5": "All facts verified",
        "Example 1": "Multiple false claims"
    },
    {
        "Dimension": "Helpfulness",
        "Question": "Would this help the user?",
        "Scale": "1 (Not helpful) - 5 (Very helpful)",
        "Example 5": "Comprehensive, actionable",
        "Example 1": "Vague, unhelpful"
    }
])

print("Human Evaluation Rubric:")
print("=" * 100)
print(evaluation_rubric.to_string(index=False))

print("\n\n💡 Best Practices:")
print("  • Use multiple evaluators (3-5)")
print("  • Provide clear guidelines")
print("  • Calculate inter-rater agreement (Cohen's Kappa)")
print("  • Include examples for each score")
print("  • Randomize order to avoid bias")

Simulated Human Evaluation

Automated metrics can only approximate quality along predefined axes, but human evaluation captures nuances that no metric can: factual accuracy, helpfulness, cultural sensitivity, and reasoning coherence. The standard approach is to have multiple annotators rate outputs on Likert scales (typically 1-5) across several dimensions. Inter-annotator agreement, measured via Cohen's Kappa or Krippendorff's Alpha, tells you whether your rubric is clear enough for consistent scoring. Below, we simulate already-averaged ratings for three model outputs to illustrate how to aggregate and visualize multi-dimensional human evaluation results.
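
Before trusting aggregated scores, check that annotators actually agree. A minimal sketch using scikit-learn's cohen_kappa_score on two hypothetical annotators' fluency ratings (the ratings are made up, and scikit-learn is assumed to be installed):

from sklearn.metrics import cohen_kappa_score

# Hypothetical 1-5 fluency ratings from two annotators on the same 8 outputs
annotator_a = [5, 4, 4, 3, 5, 2, 4, 3]
annotator_b = [5, 4, 3, 3, 5, 2, 4, 4]

# weights='quadratic' penalizes large disagreements more than off-by-one ones
kappa = cohen_kappa_score(annotator_a, annotator_b, weights='quadratic')
print(f"Quadratic-weighted Cohen's Kappa: {kappa:.3f}")

if kappa > 0.8:
    print("  ✅ Strong agreement - rubric is working well")
elif kappa > 0.6:
    print("  ⚠️ Moderate agreement - consider clarifying guidelines")
else:
    print("  ❌ Weak agreement - rubric needs revision")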

# Simulate human ratings for 3 model outputs
evaluations = pd.DataFrame([
    {"Model": "GPT-4", "Fluency": 5, "Relevance": 5, "Coherence": 5, "Factuality": 4, "Helpfulness": 5},
    {"Model": "GPT-3.5", "Fluency": 4, "Relevance": 4, "Coherence": 4, "Factuality": 3, "Helpfulness": 4},
    {"Model": "Basic LLM", "Fluency": 3, "Relevance": 3, "Coherence": 2, "Factuality": 2, "Helpfulness": 2}
])

# Calculate average scores
evaluations['Average'] = evaluations[['Fluency', 'Relevance', 'Coherence', 'Factuality', 'Helpfulness']].mean(axis=1)

print("Human Evaluation Results:")
print("=" * 80)
print(evaluations.to_string(index=False))

# Visualize
fig, ax = plt.subplots(figsize=(12, 6))
evaluations_plot = evaluations.set_index('Model')[['Fluency', 'Relevance', 'Coherence', 'Factuality', 'Helpfulness']]
evaluations_plot.plot(kind='bar', ax=ax)
plt.title('Human Evaluation Scores by Model', fontsize=14, fontweight='bold')
plt.ylabel('Score (1-5)', fontsize=12)
plt.xlabel('Model', fontsize=12)
plt.legend(title='Dimension', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=0)
plt.grid(alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

Part 7: RAG Evaluation

RAG-Specific Metrics

RAG has 2 components to evaluate:

  1. Retrieval Quality:

    • Precision@K: How many retrieved docs are relevant?

    • Recall@K: What % of relevant docs were retrieved?

    • MRR (Mean Reciprocal Rank): Position of first relevant doc

  2. Generation Quality:

    • Faithfulness: Does output match retrieved docs?

    • Answer Relevance: Does it answer the question?

    • Context Relevance: Are retrieved docs relevant?

def precision_at_k(retrieved_docs, relevant_docs, k):
    """Calculate Precision@K for retrieval"""
    retrieved_k = set(retrieved_docs[:k])
    relevant_set = set(relevant_docs)
    
    return len(retrieved_k & relevant_set) / k if k > 0 else 0

def recall_at_k(retrieved_docs, relevant_docs, k):
    """Calculate Recall@K for retrieval"""
    retrieved_k = set(retrieved_docs[:k])
    relevant_set = set(relevant_docs)
    
    return len(retrieved_k & relevant_set) / len(relevant_set) if len(relevant_set) > 0 else 0

def mean_reciprocal_rank(retrieved_docs, relevant_docs):
    """Calculate MRR - position of first relevant document"""
    for i, doc in enumerate(retrieved_docs, 1):
        if doc in relevant_docs:
            return 1 / i
    return 0

# Example: Document retrieval evaluation
retrieved = ['doc3', 'doc1', 'doc7', 'doc2', 'doc9']  # Retrieved by RAG system
relevant = ['doc1', 'doc2', 'doc5']  # Actually relevant

print("RAG Retrieval Evaluation:")
print("=" * 60)
print(f"Retrieved documents: {retrieved}")
print(f"Relevant documents:  {relevant}\n")

for k in [1, 3, 5]:
    prec = precision_at_k(retrieved, relevant, k)
    rec = recall_at_k(retrieved, relevant, k)
    
    print(f"@{k}:")
    print(f"  Precision@{k}: {prec:.3f} ({prec*k:.0f}/{k} retrieved docs are relevant)")
    print(f"  Recall@{k}:    {rec:.3f} ({rec*len(relevant):.0f}/{len(relevant)} relevant docs found)")

mrr = mean_reciprocal_rank(retrieved, relevant)
print(f"\nMRR: {mrr:.3f}")
print(f"  First relevant doc found at position: {int(1/mrr) if mrr > 0 else 'Not found'}")

Faithfulness Evaluation

Key Question: Does the generated answer stay true to the retrieved context?

Faithfulness is the cornerstone metric for RAG systems. A response can be fluent and relevant yet completely fabricated if the model hallucinates information not present in the retrieved documents. Measuring faithfulness typically involves decomposing the answer into individual claims and verifying each claim against the source context. In practice, this is done using Natural Language Inference (NLI) models or an LLM-as-judge that checks entailment. The simplified overlap-based approach below provides intuition, but production systems should use NLI for more reliable detection of subtle fabrications.

# Example faithfulness evaluation (simplified)
def evaluate_faithfulness(context, answer):
    """
    Simple faithfulness check using text overlap
    In practice, use an LLM judge or NLI model
    """
    context_words = set(context.lower().split())
    answer_words = set(answer.lower().split())
    
    # Remove common words
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'is', 'was'}
    context_words -= stop_words
    answer_words -= stop_words
    
    # Calculate overlap
    overlap = len(context_words & answer_words) / len(answer_words) if len(answer_words) > 0 else 0
    
    return overlap

# Examples
context = "The Eiffel Tower was completed in 1889 and stands 330 meters tall."

answers = [
    ("The Eiffel Tower was completed in 1889.", "Faithful"),
    ("The Eiffel Tower stands 330 meters tall.", "Faithful"),
    ("The Eiffel Tower was completed in 1900.", "Unfaithful - wrong date!"),
    ("Paris is the capital of France.", "Unfaithful - not in context")
]

print("Faithfulness Evaluation:")
print("=" * 70)
print(f"Context: '{context}'\n")

for answer, description in answers:
    faithfulness = evaluate_faithfulness(context, answer)
    print(f"{description}:")
    print(f"  Answer: '{answer}'")
    print(f"  Faithfulness score: {faithfulness:.3f}")
    print()
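
As noted above, word overlap misses subtle fabrications: the "completed in 1900" answer overlaps heavily with the context yet contradicts it. A more reliable check scores entailment with an NLI model. The sketch below uses Hugging Face transformers with roberta-large-mnli as an assumed checkpoint; verify the label ordering against model.config.id2label before relying on it.

# Hedged sketch: NLI-based faithfulness (requires `pip install transformers torch`)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_name = "roberta-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
nli_model = AutoModelForSequenceClassification.from_pretrained(model_name)

def nli_faithfulness(context, answer):
    """Probability that the context entails the answer (higher = more faithful)."""
    inputs = tokenizer(context, answer, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = nli_model(**inputs).logits
    probs = torch.softmax(logits, dim=-1)[0]
    # Assumed label order for roberta-large-mnli: contradiction, neutral, entailment
    # (confirm via nli_model.config.id2label)
    return probs[2].item()

context = "The Eiffel Tower was completed in 1889 and stands 330 meters tall."
print(f"Faithful:   {nli_faithfulness(context, 'The Eiffel Tower was completed in 1889.'):.3f}")
print(f"Unfaithful: {nli_faithfulness(context, 'The Eiffel Tower was completed in 1900.'):.3f}")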

🎯 Knowledge Check

Q1: When is ROUGE better than BLEU?
Q2: What does low perplexity indicate?
Q3: Why is BERTScore better for semantic evaluation?
Q4: What are the two components of RAG evaluation?

Answers:

A1: Summarization tasks where recall (capturing key info) matters
A2: Model is confident and text is in-distribution
A3: It understands semantic meaning and synonyms, not just exact matches
A4: Retrieval quality (Precision@K, Recall@K, MRR) and generation quality (faithfulness, relevance)

📚 Summary

Metric Selection Guide

| Task               | Primary Metrics            | Secondary Metrics     |
|--------------------|----------------------------|-----------------------|
| Translation        | BLEU, BERTScore            | Human eval            |
| Summarization      | ROUGE, BERTScore           | Faithfulness          |
| Question Answering | Exact Match, F1, BERTScore | Human relevance       |
| Creative Writing   | Human eval, BERTScore      | Diversity metrics     |
| RAG Systems        | Precision@K, Faithfulness  | Answer relevance      |
| Model Comparison   | Perplexity                 | Various task metrics  |

🚀 Next Steps

  1. Complete LLM Evaluation Challenge

  2. Read Notebook 4: Bias & Fairness Metrics

  3. Try evaluating your own LLM outputs

  4. Design a human evaluation study

Excellent! You now know how to properly evaluate LLM outputs! 🤖