Named Entity Recognition (NER)¶

BIO tagging, transformer-based NER with Hugging Face Transformers, rule-based and production NER systems, and evaluation.

# Install dependencies
# !pip install transformers torch datasets seqeval spacy

BIO Tagging Scheme¶

from typing import List, Tuple, Dict
from dataclasses import dataclass

@dataclass
class Entity:
    """Named entity representation"""
    text: str
    label: str
    start: int
    end: int
    confidence: float = 1.0

class BIOTagger:
    """BIO (Beginning, Inside, Outside) tagging scheme"""
    
    def __init__(self):
        # BIO tags: B-PER (Begin Person), I-PER (Inside Person), O (Outside)
        self.tag_set = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
        self.tag_to_id = {tag: i for i, tag in enumerate(self.tag_set)}
        self.id_to_tag = {i: tag for i, tag in enumerate(self.tag_set)}
    
    def tags_to_entities(self, tokens: List[str], tags: List[str]) -> List[Entity]:
        """Convert BIO tags to entities"""
        entities = []
        current_entity = None
        current_tokens = []
        
        for i, (token, tag) in enumerate(zip(tokens, tags)):
            if tag.startswith('B-'):
                # Save previous entity
                if current_entity:
                    entities.append(Entity(
                        text=' '.join(current_tokens),
                        label=current_entity,
                        start=i - len(current_tokens),
                        end=i
                    ))
                # Start new entity
                current_entity = tag[2:]  # Remove 'B-' prefix
                current_tokens = [token]
            elif tag.startswith('I-') and current_entity == tag[2:]:
                # Continue current entity
                current_tokens.append(token)
            else:
                # Outside, or an I- tag whose label does not match the open entity
                if current_entity:
                    entities.append(Entity(
                        text=' '.join(current_tokens),
                        label=current_entity,
                        start=i - len(current_tokens),
                        end=i
                    ))
                current_entity = None
                current_tokens = []
                if tag.startswith('I-'):
                    # Treat a stray I- tag as starting a new entity (CoNLL convention)
                    current_entity = tag[2:]
                    current_tokens = [token]
        
        # Handle last entity
        if current_entity:
            entities.append(Entity(
                text=' '.join(current_tokens),
                label=current_entity,
                start=len(tokens) - len(current_tokens),
                end=len(tokens)
            ))
        
        return entities

# Test BIO tagger
tagger = BIOTagger()

tokens = ['Apple', 'Inc.', 'CEO', 'Tim', 'Cook', 'visited', 'New', 'York']
tags = ['B-ORG', 'I-ORG', 'O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC']

entities = tagger.tags_to_entities(tokens, tags)

print("BIO Tagging Example:\n")
print("Tokens:", tokens)
print("Tags:  ", tags)
print("\nExtracted Entities:")
for entity in entities:
    print(f"  {entity.text:15s} → {entity.label} (pos {entity.start}-{entity.end})")
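The inverse mapping is useful when preparing training data: given entities with token-level offsets, emit the BIO tags. A minimal sketch (`entities_to_tags` is a hypothetical helper, not a library function; the `Entity` dataclass is repeated here so the snippet is self-contained):

```python
from dataclasses import dataclass
from typing import List

@dataclass
class Entity:
    """Same shape as the Entity dataclass above"""
    text: str
    label: str
    start: int  # token index
    end: int    # token index (exclusive)

def entities_to_tags(tokens: List[str], entities: List[Entity]) -> List[str]:
    """Inverse of tags_to_entities: emit BIO tags from token-offset entities."""
    tags = ['O'] * len(tokens)
    for e in entities:
        tags[e.start] = 'B-' + e.label       # first token of the span
        for i in range(e.start + 1, e.end):
            tags[i] = 'I-' + e.label         # remaining tokens of the span
    return tags

tokens = ['Apple', 'Inc.', 'CEO', 'Tim', 'Cook']
entities = [Entity('Apple Inc.', 'ORG', 0, 2), Entity('Tim Cook', 'PER', 3, 5)]
print(entities_to_tags(tokens, entities))
# ['B-ORG', 'I-ORG', 'O', 'B-PER', 'I-PER']
```

Round-tripping through both directions is a quick sanity check for annotation-conversion code.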

NER with Transformers¶

# BERT-based NER (requires transformers)
'''
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load pre-trained NER model
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Extract entities
text = "Apple Inc. CEO Tim Cook announced new products in Cupertino, California."
entities = ner_pipeline(text)

for entity in entities:
    print(f"{entity['word']:20s} {entity['entity_group']:5s} {entity['score']:.3f}")
'''

print("Popular NER Models:\n")
print("General English:")
print("  • dslim/bert-base-NER")
print("  • dbmdz/bert-large-cased-finetuned-conll03-english")
print("  • Jean-Baptiste/roberta-large-ner-english")
print("\nMultilingual:")
print("  • xlm-roberta-large-finetuned-conll03-english")
print("  • Davlan/bert-base-multilingual-cased-ner-hrl")
print("\nDomain-Specific:")
print("  • allenai/scibert_scivocab_uncased (scientific text; base model, fine-tune for NER)")
print("  • ProsusAI/finbert (financial; base model, fine-tune for NER)")

Custom NER Model¶

import re

class SimpleNER:
    """Simple rule-based NER for demonstration"""
    
    def __init__(self):
        # Simple gazetteers (in production: use comprehensive lists)
        self.persons = {'tim cook', 'elon musk', 'jeff bezos', 'bill gates'}
        self.orgs = {'apple', 'google', 'microsoft', 'amazon', 'tesla'}
        self.locs = {'cupertino', 'new york', 'san francisco', 'seattle', 'london'}
    
    def extract_entities(self, text: str) -> List[Entity]:
        """Extract entities by gazetteer lookup with word-boundary matching"""
        entities = []
        gazetteers = [(self.persons, 'PER'), (self.orgs, 'ORG'), (self.locs, 'LOC')]
        
        for names, label in gazetteers:
            for name in names:
                # Match every occurrence on word boundaries (avoids e.g.
                # matching 'apple' inside 'applesauce'), case-insensitively
                pattern = r'\b' + re.escape(name) + r'\b'
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    entities.append(Entity(
                        text=match.group(),
                        label=label,
                        start=match.start(),
                        end=match.end()
                    ))
        
        # Sort by position
        entities.sort(key=lambda e: e.start)
        return entities

# Test simple NER
ner = SimpleNER()

test_texts = [
    "Tim Cook leads Apple in Cupertino.",
    "Elon Musk's Tesla is based in California.",
    "Microsoft was founded by Bill Gates in Seattle."
]

print("Simple NER Results:\n")
for text in test_texts:
    entities = ner.extract_entities(text)
    print(f"Text: {text}")
    for entity in entities:
        print(f"  → {entity.text:15s} ({entity.label})")
    print()

Production NER System¶

from collections import defaultdict

class ProductionNER:
    """Production-ready NER with caching and batch processing"""
    
    def __init__(self, model_name: str = "bert-base-ner"):
        self.model_name = model_name
        # Demo backend; in production, swap in a transformer NER pipeline here
        self.ner = SimpleNER()
        self.cache = {}
        self.stats = {
            "total_requests": 0,
            "cache_hits": 0,
            "entities_extracted": defaultdict(int)
        }
    
    def extract(self, text: str, use_cache: bool = True) -> List[Entity]:
        """Extract entities with caching"""
        self.stats["total_requests"] += 1
        
        # Check cache
        if use_cache and text in self.cache:
            self.stats["cache_hits"] += 1
            return self.cache[text]
        
        # Extract entities
        entities = self.ner.extract_entities(text)
        
        # Update stats
        for entity in entities:
            self.stats["entities_extracted"][entity.label] += 1
        
        # Cache result
        if use_cache:
            self.cache[text] = entities
        
        return entities
    
    def batch_extract(self, texts: List[str]) -> List[List[Entity]]:
        """Process multiple texts"""
        return [self.extract(text) for text in texts]
    
    def get_stats(self) -> Dict:
        """Get extraction statistics"""
        cache_hit_rate = (
            self.stats["cache_hits"] / max(self.stats["total_requests"], 1)
        )
        return {
            "total_requests": self.stats["total_requests"],
            "cache_hits": self.stats["cache_hits"],
            "cache_hit_rate": cache_hit_rate,
            "entities_by_type": dict(self.stats["entities_extracted"]),
            "total_entities": sum(self.stats["entities_extracted"].values())
        }
    
    def clear_cache(self):
        """Clear cache"""
        self.cache.clear()

# Test production NER
prod_ner = ProductionNER()

texts = [
    "Tim Cook announced Apple products.",
    "Microsoft office in Seattle.",
    "Tim Cook announced Apple products.",  # Duplicate
    "Google office in New York."
]

print("Processing texts...\n")
results = prod_ner.batch_extract(texts)

for text, entities in zip(texts, results):
    print(f"Text: {text}")
    print(f"  Entities: {len(entities)}")

# Print stats
stats = prod_ner.get_stats()
print(f"\nStatistics:")
print(f"  Total requests: {stats['total_requests']}")
print(f"  Cache hit rate: {stats['cache_hit_rate']:.1%}")
print(f"  Total entities: {stats['total_entities']}")
print(f"  By type: {stats['entities_by_type']}")
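One caveat: the plain-dict cache above grows without bound. A sketch of a size-capped alternative with least-recently-used eviction, built on the standard library's `OrderedDict` (the class name and default size are illustrative, not from any particular library):

```python
from collections import OrderedDict

class LRUCache:
    """Size-capped cache with least-recently-used eviction."""

    def __init__(self, max_size: int = 10_000):
        self.max_size = max_size
        self._data = OrderedDict()

    def get(self, key):
        if key not in self._data:
            return None
        self._data.move_to_end(key)          # mark as most recently used
        return self._data[key]

    def put(self, key, value):
        self._data[key] = value
        self._data.move_to_end(key)
        if len(self._data) > self.max_size:
            self._data.popitem(last=False)   # evict the least recently used entry
```

Swapping `self.cache = {}` for such a structure bounds memory at the cost of occasionally re-extracting evicted texts.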

Evaluation Metrics¶

def compute_ner_metrics(predicted: List[Entity], gold: List[Entity]) -> Dict[str, float]:
    """Compute NER evaluation metrics"""
    # Exact match (text + label + position)
    pred_set = {(e.text, e.label, e.start, e.end) for e in predicted}
    gold_set = {(e.text, e.label, e.start, e.end) for e in gold}
    
    tp = len(pred_set & gold_set)
    fp = len(pred_set - gold_set)
    fn = len(gold_set - pred_set)
    
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "true_positives": tp,
        "false_positives": fp,
        "false_negatives": fn
    }

# Test metrics
predicted = [
    Entity("Apple", "ORG", 0, 5),
    Entity("Tim Cook", "PER", 10, 18),
]

gold = [
    Entity("Apple", "ORG", 0, 5),
    Entity("Tim Cook", "PER", 10, 18),
    Entity("Cupertino", "LOC", 30, 39),  # Missed
]

metrics = compute_ner_metrics(predicted, gold)
print("NER Evaluation:")
print(f"  Precision: {metrics['precision']:.2%}")
print(f"  Recall: {metrics['recall']:.2%}")
print(f"  F1 Score: {metrics['f1']:.2%}")
print(f"\n  True Positives: {metrics['true_positives']}")
print(f"  False Positives: {metrics['false_positives']}")
print(f"  False Negatives: {metrics['false_negatives']}")
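Exact-span matching is strict: a boundary that is off by one token counts as both a false positive and a false negative. A sketch of an overlap-based ("partial match") variant in the spirit of the relaxed MUC/SemEval schemes; `relaxed_f1` is a hypothetical helper, and the `Entity` dataclass is repeated so the snippet stands alone:

```python
from dataclasses import dataclass
from typing import List

@dataclass
class Entity:
    """Same shape as the Entity dataclass above"""
    text: str
    label: str
    start: int
    end: int

def relaxed_f1(predicted: List[Entity], gold: List[Entity]) -> float:
    """A prediction counts as correct if it shares the gold label
    and its character span overlaps the gold span."""
    def overlaps(a: Entity, b: Entity) -> bool:
        return a.label == b.label and a.start < b.end and b.start < a.end

    tp = sum(any(overlaps(p, g) for g in gold) for p in predicted)
    fp = len(predicted) - tp
    fn = sum(not any(overlaps(p, g) for p in predicted) for g in gold)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0

# Boundary mismatch: exact match would score 0, relaxed match gives credit
print(relaxed_f1([Entity('Apple Inc.', 'ORG', 0, 10)],
                 [Entity('Apple', 'ORG', 0, 5)]))  # 1.0
```

Reporting both exact and relaxed scores makes it easier to tell boundary errors apart from outright misses.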

Best Practices¶

1. Model Selection¶

  • General: Use BERT/RoBERTa fine-tuned on CoNLL-2003

  • Domain-specific: Fine-tune on your domain (biomedical, legal, finance)

  • Multilingual: Use XLM-RoBERTa or mBERT

  • Fast inference: Use DistilBERT or quantized models

2. Data Annotation¶

  • Use consistent annotation guidelines

  • Annotate at least 1,000-5,000 examples

  • Include difficult edge cases

  • Measure inter-annotator agreement

3. Training Tips¶

  • Use the BIO or BIOES tagging scheme

  • Apply data augmentation (synonym replacement, back-translation)

  • Handle class imbalance with a weighted loss

  • Add a CRF layer on top of BERT for better-formed tag sequences
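One subword detail behind these training tips: WordPiece/BPE tokenizers split words into pieces, so word-level BIO labels must be aligned to subword tokens before computing the loss. A common convention (label the first subword, mask the rest with -100) can be sketched without library dependencies; here `word_ids` mimics what Hugging Face fast tokenizers return from `word_ids()`:

```python
from typing import List, Optional

def align_labels(word_labels: List[int],
                 word_ids: List[Optional[int]],
                 ignore_index: int = -100) -> List[int]:
    """Align word-level label ids to subword tokens.

    word_ids[i] is the source-word index of subword i, or None for
    special tokens ([CLS], [SEP], padding)."""
    aligned = []
    prev = None
    for wid in word_ids:
        if wid is None:
            aligned.append(ignore_index)      # special token: ignored by the loss
        elif wid != prev:
            aligned.append(word_labels[wid])  # first subword keeps the word's label
        else:
            aligned.append(ignore_index)      # later subwords are masked out
        prev = wid
    return aligned

# 'Cupertino' split into two subwords: only the first keeps its label
word_labels = [5, 0]              # e.g. B-LOC, O
word_ids = [None, 0, 0, 1, None]  # [CLS] Cuper ##tino . [SEP]
print(align_labels(word_labels, word_ids))
# [-100, 5, -100, 0, -100]
```

The -100 value matters because PyTorch's cross-entropy loss ignores targets equal to its default `ignore_index` of -100.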

4. Production Deployment¶

  • Cache frequent inputs

  • Batch requests for efficiency

  • Monitor entity distribution drift

  • Implement a rule-based fallback

Common Challenges¶

  1. Nested entities: "Bank of America" contains "America"

  2. Ambiguity: "Washington" (person or location?)

  3. Out-of-vocabulary: new entities not seen in training data

  4. Context dependency: "Apple" (fruit or company?)

Key Takeaways¶

✅ NER is a sequence labeling task

✅ BIO tagging is the standard scheme

✅ BERT-based models achieve state-of-the-art results

✅ Fine-tuning on domain data is crucial

✅ Entity-level F1 is the primary evaluation metric

✅ Production systems need caching and monitoring

Next: 02_translation.ipynb - Machine Translation