Named Entity Recognition (NER)
BIO tagging, transformer-based NER with spaCy and Hugging Face, custom entity training, and evaluation.
# Install dependencies
# !pip install transformers torch datasets seqeval spacy
BIO Tagging Scheme
from typing import List, Tuple, Dict
from dataclasses import dataclass
@dataclass
class Entity:
    """Named entity representation.

    NOTE: start/end units depend on the producer — BIOTagger emits
    token indices, SimpleNER emits character offsets (end exclusive).
    """
    text: str  # surface form of the entity
    label: str  # entity type tag, e.g. "PER", "ORG", "LOC"
    start: int  # start position (token index or char offset; see class docstring)
    end: int  # exclusive end position
    confidence: float = 1.0  # extraction confidence; defaults to fully confident
class BIOTagger:
    """BIO (Beginning, Inside, Outside) tagging scheme helper.

    Maps between tag strings and integer ids, and decodes aligned
    token/tag sequences into Entity spans (token-index based).
    """

    def __init__(self):
        # BIO tags: B-X begins an entity of type X, I-X continues it, O is outside.
        self.tag_set = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
        self.tag_to_id = {tag: i for i, tag in enumerate(self.tag_set)}
        self.id_to_tag = {i: tag for i, tag in enumerate(self.tag_set)}

    def tags_to_entities(self, tokens: List[str], tags: List[str]) -> List[Entity]:
        """Convert BIO tags to entities.

        Decoding is lenient (IOB2-style repair): an I-X that does not
        continue a running X entity closes the current entity (if any)
        and starts a new X entity. The previous implementation silently
        dropped such tokens, which also corrupted the start offsets of
        the entity being built (start was derived from len(current_tokens)).

        Args:
            tokens: token strings.
            tags: BIO tag strings aligned with tokens.

        Returns:
            Entities with token-index start/end (end exclusive).
        """
        entities: List[Entity] = []
        current_label = None
        current_tokens: List[str] = []

        def _flush(end_index: int):
            # Close the running entity (if any), ending at end_index (exclusive).
            nonlocal current_label, current_tokens
            if current_label:
                entities.append(Entity(
                    text=' '.join(current_tokens),
                    label=current_label,
                    start=end_index - len(current_tokens),
                    end=end_index,
                ))
            current_label = None
            current_tokens = []

        for i, (token, tag) in enumerate(zip(tokens, tags)):
            if tag.startswith('B-'):
                _flush(i)
                current_label = tag[2:]  # Remove 'B-' prefix
                current_tokens = [token]
            elif tag.startswith('I-'):
                if current_label == tag[2:]:
                    # Continue current entity
                    current_tokens.append(token)
                else:
                    # Mismatched or orphan I- tag: repair by starting a new entity
                    # rather than losing the token.
                    _flush(i)
                    current_label = tag[2:]
                    current_tokens = [token]
            else:
                # 'O' (or any unrecognized tag) ends any running entity.
                _flush(i)

        # Handle an entity that runs to the end of the sequence.
        _flush(len(tokens))
        return entities
# Test BIO tagger
tagger = BIOTagger()
tokens = ['Apple', 'Inc.', 'CEO', 'Tim', 'Cook', 'visited', 'New', 'York']
tags = ['B-ORG', 'I-ORG', 'O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC']
entities = tagger.tags_to_entities(tokens, tags)

print("BIO Tagging Example:\n")
print("Tokens:", tokens)
print("Tags: ", tags)
print("\nExtracted Entities:")
for entity in entities:
    # Arrow glyph restored ("→" was garbled to "β" in the export).
    print(f"  {entity.text:15s} → {entity.label} (pos {entity.start}-{entity.end})")
NER with Transformers
# BERT-based NER (requires transformers); kept as a string so the notebook
# runs without the heavy dependency installed.
'''
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load pre-trained NER model
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Extract entities
text = "Apple Inc. CEO Tim Cook announced new products in Cupertino, California."
entities = ner_pipeline(text)

for entity in entities:
    print(f"{entity['word']:20s} {entity['entity_group']:5s} {entity['score']:.3f}")
'''

# Bullet glyphs restored ("•" was garbled to "β’" in the export).
print("Popular NER Models:\n")
print("General English:")
print("  • dslim/bert-base-NER")
print("  • dbmdz/bert-large-cased-finetuned-conll03-english")
print("  • Jean-Baptiste/roberta-large-ner-english")
print("\nMultilingual:")
print("  • xlm-roberta-large-finetuned-conll03-english")
print("  • Davlan/bert-base-multilingual-cased-ner-hrl")
print("\nDomain-Specific:")
print("  • allenai/scibert_scivocab_uncased (biomedical)")
print("  • ProsusAI/finbert (financial)")
Custom NER Model
import numpy as np
class SimpleNER:
    """Simple rule-based (gazetteer) NER for demonstration."""

    def __init__(self):
        # Simple gazetteers (in production: use comprehensive lists)
        self.persons = {'tim cook', 'elon musk', 'jeff bezos', 'bill gates'}
        self.orgs = {'apple', 'google', 'microsoft', 'amazon', 'tesla'}
        self.locs = {'cupertino', 'new york', 'san francisco', 'seattle', 'london'}

    def extract_entities(self, text: str) -> List[Entity]:
        """Extract entities via case-insensitive substring matching.

        Fix over the previous version: every occurrence of each
        gazetteer entry is reported (str.find with a moving start
        offset), not just the first one. The three near-identical
        per-category loops are collapsed into one.

        NOTE: plain substring matching can fire inside longer words
        (e.g. "london" in "londoner") — acceptable for a demo.

        Returns:
            Entities with character offsets, sorted by start position.
        """
        text_lower = text.lower()
        entities: List[Entity] = []
        gazetteers = [
            (self.persons, "PER"),
            (self.orgs, "ORG"),
            (self.locs, "LOC"),
        ]
        for names, label in gazetteers:
            for name in names:
                start = text_lower.find(name)
                while start != -1:
                    end = start + len(name)
                    entities.append(Entity(
                        text=text[start:end],  # preserve original casing
                        label=label,
                        start=start,
                        end=end,
                    ))
                    start = text_lower.find(name, end)
        # Sort by position
        entities.sort(key=lambda e: e.start)
        return entities
# Test simple NER
ner = SimpleNER()
test_texts = [
    "Tim Cook leads Apple in Cupertino.",
    "Elon Musk's Tesla is based in California.",
    "Microsoft was founded by Bill Gates in Seattle."
]

print("Simple NER Results:\n")
for text in test_texts:
    entities = ner.extract_entities(text)
    print(f"Text: {text}")
    for entity in entities:
        # Arrow glyph restored ("→" was garbled to "β" in the export).
        print(f"  → {entity.text:15s} ({entity.label})")
    print()
Production NER System
from collections import defaultdict
import time
class ProductionNER:
    """Production-style NER with caching and batch processing.

    Delegates extraction to SimpleNER; model_name is kept for API
    compatibility but no transformer model is loaded here.
    """

    def __init__(self, model_name: str = "bert-base-ner"):
        self.model_name = model_name
        self.ner = SimpleNER()
        self.cache: Dict[str, List[Entity]] = {}
        # Running counters; entities_extracted maps label -> count.
        self.stats = {
            "total_requests": 0,
            "cache_hits": 0,
            "entities_extracted": defaultdict(int)
        }

    def extract(self, text: str, use_cache: bool = True) -> List[Entity]:
        """Extract entities, optionally serving repeated texts from cache.

        Fix: cached results are stored and served as copies so a caller
        mutating the returned list cannot corrupt the cache (the
        previous version handed out the cached list object itself).
        """
        self.stats["total_requests"] += 1
        # Check cache
        if use_cache and text in self.cache:
            self.stats["cache_hits"] += 1
            return list(self.cache[text])
        # Extract entities
        entities = self.ner.extract_entities(text)
        # Update stats — per-label counts grow only on cache misses,
        # matching the original behavior (hits are not re-counted).
        for entity in entities:
            self.stats["entities_extracted"][entity.label] += 1
        # Cache result (a private copy, see docstring).
        if use_cache:
            self.cache[text] = list(entities)
        return entities

    def batch_extract(self, texts: List[str]) -> List[List[Entity]]:
        """Process multiple texts sequentially through extract()."""
        return [self.extract(text) for text in texts]

    def get_stats(self) -> Dict:
        """Return a snapshot of extraction statistics."""
        # max(..., 1) avoids division by zero before any request.
        cache_hit_rate = (
            self.stats["cache_hits"] / max(self.stats["total_requests"], 1)
        )
        return {
            "total_requests": self.stats["total_requests"],
            "cache_hits": self.stats["cache_hits"],
            "cache_hit_rate": cache_hit_rate,
            "entities_by_type": dict(self.stats["entities_extracted"]),
            "total_entities": sum(self.stats["entities_extracted"].values())
        }

    def clear_cache(self):
        """Drop all cached results (statistics are kept)."""
        self.cache.clear()
# Test production NER
prod_ner = ProductionNER()

texts = [
    "Tim Cook announced Apple products.",
    "Microsoft office in Seattle.",
    "Tim Cook announced Apple products.",  # Duplicate
    "Google office in New York."
]

print("Processing texts...\n")

# Run the whole batch, then report the entity count for each text.
results = prod_ner.batch_extract(texts)
for txt, ents in zip(texts, results):
    print(f"Text: {txt}")
    print(f"  Entities: {len(ents)}")

# Print stats
stats = prod_ner.get_stats()
print(f"\nStatistics:")
print(f"  Total requests: {stats['total_requests']}")
print(f"  Cache hit rate: {stats['cache_hit_rate']:.1%}")
print(f"  Total entities: {stats['total_entities']}")
print(f"  By type: {stats['entities_by_type']}")
Evaluation Metrics
def compute_ner_metrics(predicted: List[Entity], gold: List[Entity]) -> Dict[str, float]:
    """Compute exact-match precision, recall and F1 for NER.

    An entity counts as a true positive only when its text, label,
    start and end all match a gold entity exactly.

    Args:
        predicted: entities produced by the system.
        gold: reference (ground-truth) entities.

    Returns:
        Dict with precision/recall/f1 plus raw TP/FP/FN counts.
    """
    span = lambda e: (e.text, e.label, e.start, e.end)
    pred_set = {span(e) for e in predicted}
    gold_set = {span(e) for e in gold}

    tp = len(pred_set & gold_set)
    fp = len(pred_set) - tp  # predictions with no exact gold match
    fn = len(gold_set) - tp  # gold entities the system missed

    precision = tp / (tp + fp) if tp + fp else 0
    recall = tp / (tp + fn) if tp + fn else 0
    denom = precision + recall
    f1 = 2 * precision * recall / denom if denom else 0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "true_positives": tp,
        "false_positives": fp,
        "false_negatives": fn
    }
# Test metrics
pred_entities = [
    Entity("Apple", "ORG", 0, 5),
    Entity("Tim Cook", "PER", 10, 18),
]
gold_entities = [
    Entity("Apple", "ORG", 0, 5),
    Entity("Tim Cook", "PER", 10, 18),
    Entity("Cupertino", "LOC", 30, 39),  # Missed
]

scores = compute_ner_metrics(pred_entities, gold_entities)

print("NER Evaluation:")
print(f"  Precision: {scores['precision']:.2%}")
print(f"  Recall: {scores['recall']:.2%}")
print(f"  F1 Score: {scores['f1']:.2%}")
print(f"\n  True Positives: {scores['true_positives']}")
print(f"  False Positives: {scores['false_positives']}")
print(f"  False Negatives: {scores['false_negatives']}")
Best Practices
1. Model Selection
General: Use BERT/RoBERTa fine-tuned on CoNLL-2003
Domain-specific: Fine-tune on your domain (biomedical, legal, finance)
Multilingual: Use XLM-RoBERTa or mBERT
Fast inference: Use DistilBERT or quantized models
2. Data Annotation
Use consistent annotation guidelines
Annotate at least 1000-5000 examples
Include difficult edge cases
Use inter-annotator agreement metrics
3. Training Tips
Use BIO or BIOES tagging scheme
Apply data augmentation (synonym replacement, back-translation)
Handle class imbalance with weighted loss
Use a CRF layer on top of BERT for better sequence predictions
4. Production Deployment
Cache frequent entities
Batch requests for efficiency
Monitor entity distribution drift
Implement fallback to rule-based systems
Common Challenges
Nested entities: "Bank of America" contains "America"
Ambiguity: "Washington" (person or location?)
Out-of-vocabulary: New entities not in training data
Context dependency: "Apple" (fruit or company?)
Key Takeaways
✓ NER is a sequence labeling task
✓ BIO tagging is the standard scheme
✓ BERT-based models achieve SOTA results
✓ Fine-tuning on domain data is crucial
✓ F1 score is the primary evaluation metric
✓ Production systems need caching and monitoring
Next: 02_translation.ipynb - Machine Translation