Information Extraction from Documents¶
Relation extraction, event detection, and structured data extraction from unstructured text using LLMs and NLP pipelines.
# Install dependencies
# !pip install transformers torch spacy
# !python -m spacy download en_core_web_sm
Relation Extraction with Transformers¶
# Relation extraction with transformers (requires transformers / spacy).
# Kept inside a string literal so this cell runs without the optional deps.
'''
from transformers import pipeline
# Zero-shot relation classification
classifier = pipeline(
"zero-shot-classification",
model="facebook/bart-large-mnli"
)
text = "Bill Gates founded Microsoft in 1975."
relations = ["founded", "works_at", "acquired", "invested_in"]
result = classifier(text, relations)
print(result)
# Span-based relation extraction with SpaCy
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple Inc. acquired Voysis for $30M.")
# Extract entities
for ent in doc.ents:
print(f"{ent.text} ({ent.label_})")
# Define relation patterns
matcher = Matcher(nlp.vocab)
pattern = [{"ENT_TYPE": "ORG"}, {"LEMMA": "acquire"}, {"ENT_TYPE": "ORG"}]
matcher.add("ACQUISITION", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
span = doc[start:end]
print(f"Relation: {span.text}")
'''
# Overview of common models/frameworks.
# Fix: the bullet characters were mis-encoded ("โข"); restored to "•".
print("Popular Relation Extraction Models:\n")
print("Pretrained:")
print(" • tacred models (40+ relation types)")
print(" • SpanBERT (span-based relations)")
print(" • REBEL (end-to-end relation extraction)")
print("\nFrameworks:")
print(" • spaCy (rule-based + ML)")
print(" • Stanford CoreNLP (dependency parsing)")
print(" • AllenNLP (neural IE models)")
Simple Relation Extraction¶
from typing import List, Tuple, Optional
from dataclasses import dataclass
import re
@dataclass
class Entity:
    """A named-entity mention located in a source string.

    `start`/`end` are character offsets into the original text
    (end-exclusive, i.e. text[start:end] is the mention).
    """
    text: str   # surface form exactly as it appears in the source text
    label: str  # entity type, e.g. 'PERSON', 'ORG', 'LOC' (or 'UNKNOWN')
    start: int  # character offset of the first character (inclusive)
    end: int    # character offset one past the last character (exclusive)
@dataclass
class Relation:
    """A binary relation: (subject, predicate, object) over two entities."""
    subject: Entity
    predicate: str
    object: Entity
    confidence: float = 1.0

    def __str__(self):
        # Render as an (subject, predicate, object) triple.
        return "({}, {}, {})".format(
            self.subject.text, self.predicate, self.object.text
        )
class SimpleRelationExtractor:
    """Rule-based relation extraction over a small fixed gazetteer.

    Entities are matched case-insensitively against hard-coded name lists;
    relations are pulled out with verb-centred regex patterns whose first
    capture group is the subject and second is the object.
    """

    def __init__(self):
        # Simple entity gazetteer (all lowercase; matching is done on
        # lowered text).
        self.persons = {'tim cook', 'bill gates', 'elon musk', 'jeff bezos'}
        self.orgs = {'apple', 'microsoft', 'tesla', 'amazon', 'google'}
        self.locations = {'cupertino', 'seattle', 'austin', 'dublin'}
        # Relation patterns (verb-based): group 1 = subject, group 2 = object.
        self.relation_patterns = {
            'CEO_OF': r'(\w+(?:\s+\w+)*?)\s+(?:is|was)\s+(?:the\s+)?(?:ceo|chief executive)\s+of\s+(\w+(?:\s+\w+)*)',
            'FOUNDED': r'(\w+(?:\s+\w+)*?)\s+founded\s+(\w+(?:\s+\w+)*)',
            'ACQUIRED': r'(\w+(?:\s+\w+)*?)\s+acquired\s+(\w+(?:\s+\w+)*)',
            'LOCATED_IN': r'(\w+(?:\s+\w+)*?)\s+(?:in|at|located in)\s+(\w+(?:\s+\w+)*)',
            'WORKS_AT': r'(\w+(?:\s+\w+)*?)\s+(?:works at|employed by)\s+(\w+(?:\s+\w+)*)'
        }

    def _extract_entities(self, text: str) -> List[Entity]:
        """Extract entities via gazetteer lookup (simplified NER).

        Matching runs on the lower-cased text, but each returned Entity
        carries the original surface form sliced from `text`. Result is
        sorted by start offset.
        """
        entities = []
        text_lower = text.lower()
        # One data-driven pass instead of three copy-pasted loops
        # (previously persons/orgs/locations each had identical code).
        gazetteers = (
            (self.persons, 'PERSON'),
            (self.orgs, 'ORG'),
            (self.locations, 'LOC'),
        )
        for names, label in gazetteers:
            for name in names:
                for match in re.finditer(re.escape(name), text_lower):
                    entities.append(Entity(
                        text=text[match.start():match.end()],
                        label=label,
                        start=match.start(),
                        end=match.end()
                    ))
        return sorted(entities, key=lambda e: e.start)

    def extract_relations(self, text: str) -> Tuple[List[Entity], List[Relation]]:
        """Extract (entities, relations) from `text`.

        Every pattern match yields one Relation; subject/object strings not
        found in the gazetteer become ad-hoc UNKNOWN entities.
        """
        entities = self._extract_entities(text)
        relations = []
        text_lower = text.lower()
        # Apply every relation pattern against the lowered text.
        for relation_type, pattern in self.relation_patterns.items():
            for match in re.finditer(pattern, text_lower):
                subject_text = match.group(1).strip()
                object_text = match.group(2).strip()
                # Prefer a gazetteer entity with the same surface form;
                # otherwise fall back to an UNKNOWN entity over the span.
                subject_entity = next(
                    (e for e in entities if e.text.lower() == subject_text),
                    Entity(subject_text, 'UNKNOWN', match.start(1), match.end(1))
                )
                object_entity = next(
                    (e for e in entities if e.text.lower() == object_text),
                    Entity(object_text, 'UNKNOWN', match.start(2), match.end(2))
                )
                relations.append(Relation(
                    subject=subject_entity,
                    predicate=relation_type,
                    object=object_entity,
                    confidence=0.8
                ))
        return entities, relations
# Exercise the rule-based extractor on a few hand-picked sentences.
extractor = SimpleRelationExtractor()
texts = [
    "Tim Cook is the CEO of Apple.",
    "Bill Gates founded Microsoft.",
    "Apple acquired a startup in Dublin.",
    "Elon Musk works at Tesla in Austin."
]
print("Relation Extraction Results:\n")
for sample in texts:
    found_entities, found_relations = extractor.extract_relations(sample)
    entity_display = [f'{e.text}({e.label})' for e in found_entities]
    relation_display = [str(r) for r in found_relations]
    print(f"Text: {sample}")
    print(f"Entities: {entity_display}")
    print(f"Relations: {relation_display}")
    print()
Event Extraction¶
from typing import Dict, Any
from datetime import datetime
@dataclass
class Event:
    """A typed event instance: trigger word plus filled argument slots."""
    event_type: str
    trigger: str
    slots: Dict[str, Any]
    confidence: float = 1.0

    def __str__(self):
        # Render as EVENT_TYPE(slot=value, ...).
        rendered = ["{}={}".format(name, value) for name, value in self.slots.items()]
        return "{}({})".format(self.event_type, ", ".join(rendered))
class EventExtractor:
    """Extract structured events from text using regex templates."""

    def __init__(self):
        # Event templates. For each event type:
        #   triggers — trigger lexicon (verbs that signal the event)
        #   slots    — slot names in capture-group order (group i -> slots[i])
        #   pattern  — the extraction regex
        self.event_patterns = {
            'ACQUISITION': {
                'triggers': ['acquired', 'bought', 'purchased'],
                'slots': ['acquirer', 'target', 'amount', 'date'],
                'pattern': r'(\w+(?:\s+\w+)*?)\s+(?:acquired|bought)\s+(\w+(?:\s+\w+)*?)(?:\s+for\s+(\$[\d.]+[MBK]?))?(?:\s+(?:in|on)\s+(\w+\s+\d{4}))?'
            },
            'PRODUCT_LAUNCH': {
                'triggers': ['launched', 'released', 'announced'],
                'slots': ['company', 'product', 'date'],
                'pattern': r'(\w+(?:\s+\w+)*?)\s+(?:launched|released)\s+(\w+(?:\s+\w+)*?)(?:\s+(?:in|on)\s+(\w+\s+\d{4}))?'
            },
            'FUNDING': {
                'triggers': ['raised', 'secured', 'received'],
                'slots': ['company', 'amount', 'investors', 'date'],
                'pattern': r'(\w+(?:\s+\w+)*?)\s+(?:raised|secured)\s+(\$[\d.]+[MBK]?)(?:\s+from\s+(\w+(?:\s+\w+)*))?'
            }
        }

    def extract_events(self, text: str) -> List[Event]:
        """Extract events from text.

        Slots are filled data-driven: capture group i maps onto
        config['slots'][i]; empty groups are dropped. The reported trigger
        is the trigger word actually present in the matched span (bug fix:
        the first template trigger used to be reported unconditionally,
        e.g. 'acquired' for "Google bought YouTube").
        """
        events = []
        for event_type, config in self.event_patterns.items():
            for match in re.finditer(config['pattern'], text, re.IGNORECASE):
                # Pair slot names with capture groups, skipping unmatched
                # (None) and empty groups — same filtering as before.
                slots = {
                    name: group.strip()
                    for name, group in zip(config['slots'], match.groups())
                    if group
                }
                # Identify which trigger word fired within the matched span;
                # fall back to the first trigger if none is found verbatim.
                matched_text = match.group(0).lower()
                trigger = next(
                    (t for t in config['triggers'] if t in matched_text),
                    config['triggers'][0]
                )
                events.append(Event(
                    event_type=event_type,
                    trigger=trigger,
                    slots=slots,
                    confidence=0.75
                ))
        return events
# Run the event extractor over a few short news snippets.
event_extractor = EventExtractor()
news_articles = [
    "Apple acquired Voysis for $30M in April 2020.",
    "Tesla launched the Cybertruck in November 2019.",
    "Stripe raised $600M from investors.",
    "Google bought YouTube for $1.65B in October 2006."
]
print("Event Extraction Results:\n")
for story in news_articles:
    detected = event_extractor.extract_events(story)
    print(f"Article: {story}")
    if not detected:
        print(" No events detected")
    else:
        for event in detected:
            print(f" Event: {event}")
            print(f" Type: {event.event_type}")
            print(f" Trigger: {event.trigger}")
            for slot, value in event.slots.items():
                print(f" {slot}: {value}")
    print()
Knowledge Graph Construction¶
from collections import defaultdict
@dataclass
class Triple:
    """RDF-style triple (subject, predicate, object).

    Explicitly hashable/comparable so triples deduplicate in a set.
    """
    subject: str
    predicate: str
    object: str

    def __hash__(self):
        return hash((self.subject, self.predicate, self.object))

    def __eq__(self, other):
        # Robustness fix: comparing against a non-Triple used to raise
        # AttributeError; defer to the other operand instead.
        if not isinstance(other, Triple):
            return NotImplemented
        return (self.subject, self.predicate, self.object) == \
               (other.subject, other.predicate, other.object)
class KnowledgeGraph:
    """Simple in-memory knowledge graph of (subject, predicate, object) triples."""

    def __init__(self):
        self.triples = set()                 # deduplicated Triple objects
        self.entities = set()                # every subject and object seen
        self.relations = defaultdict(list)   # predicate -> [(subject, object), ...]

    def add_triple(self, subject: str, predicate: str, obj: str):
        """Add a triple to the graph (the triple set deduplicates exact repeats)."""
        triple = Triple(subject, predicate, obj)
        self.triples.add(triple)
        self.entities.add(subject)
        self.entities.add(obj)
        # NOTE: this list records every add, including repeats of an
        # existing triple (original behavior preserved).
        self.relations[predicate].append((subject, obj))

    def query(self, subject: Optional[str] = None, predicate: Optional[str] = None, obj: Optional[str] = None) -> List[Triple]:
        """Return all triples matching the given components; None = wildcard.

        Robustness fix: filters are compared with `is not None`, so an
        empty string can now be queried explicitly (truthiness previously
        treated "" as "no filter").
        """
        results = []
        for triple in self.triples:
            if subject is not None and triple.subject != subject:
                continue
            if predicate is not None and triple.predicate != predicate:
                continue
            if obj is not None and triple.object != obj:
                continue
            results.append(triple)
        return results

    def get_entity_info(self, entity: str) -> Dict[str, List[str]]:
        """Collect everything known about `entity` as subject and as object.

        Incoming edges are reported under an "INVERSE_<predicate>" key.
        """
        info = defaultdict(list)
        # Entity as subject: outgoing edges.
        for triple in self.query(subject=entity):
            info[triple.predicate].append(triple.object)
        # Entity as object: incoming edges, keyed as INVERSE_*.
        for triple in self.query(obj=entity):
            info[f"INVERSE_{triple.predicate}"].append(triple.subject)
        return dict(info)

    def get_stats(self) -> Dict:
        """Return basic size statistics for the graph."""
        return {
            "num_triples": len(self.triples),
            "num_entities": len(self.entities),
            "num_relations": len(self.relations),
            "relations": list(self.relations.keys())
        }
# Build knowledge graph from extracted information
kg = KnowledgeGraph()
relation_extractor = SimpleRelationExtractor()
documents = [
    "Tim Cook is the CEO of Apple.",
    "Apple is located in Cupertino.",
    "Bill Gates founded Microsoft.",
    "Microsoft is located in Seattle."
]
print("Building Knowledge Graph...\n")
for doc in documents:
    entities, relations = relation_extractor.extract_relations(doc)
    for relation in relations:
        kg.add_triple(
            subject=relation.subject.text,
            predicate=relation.predicate,
            obj=relation.object.text
        )
# Query the knowledge graph
print("Knowledge Graph Statistics:")
stats = kg.get_stats()
print(f" Triples: {stats['num_triples']}")
print(f" Entities: {stats['num_entities']}")
print(f" Relations: {stats['num_relations']}")
print(f" Relation types: {stats['relations']}\n")
print("All triples:")
for triple in kg.triples:
    print(f" ({triple.subject}, {triple.predicate}, {triple.object})")
print("\nQuery: Who is the CEO of Apple?")
# Bug fix: gazetteer entities keep their original surface casing
# ("Apple", "Microsoft"), so the previous lowercase queries
# ("apple"/"microsoft") matched nothing and both demos printed no answers.
results = kg.query(predicate="CEO_OF", obj="Apple")
for triple in results:
    print(f" Answer: {triple.subject}")
print("\nQuery: What do we know about Microsoft?")
info = kg.get_entity_info("Microsoft")
for relation, values in info.items():
    print(f" {relation}: {values}")
Production Information Extraction System¶
import time
from collections import Counter
class ProductionIESystem:
    """Production-ready information extraction system.

    Bundles relation extraction, event extraction, and a knowledge graph
    behind one per-document entry point, and keeps running counters.
    """

    def __init__(self):
        self.relation_extractor = SimpleRelationExtractor()
        self.event_extractor = EventExtractor()
        self.knowledge_graph = KnowledgeGraph()
        # Running totals and per-type distributions across all documents.
        self.stats = {
            "documents_processed": 0,
            "entities_extracted": 0,
            "relations_extracted": 0,
            "events_extracted": 0,
            "entity_types": Counter(),
            "relation_types": Counter(),
            "event_types": Counter()
        }

    def process_document(self, text: str) -> Dict:
        """Extract entities, relations, and events from one document.

        Extracted relations are also fed into the knowledge graph. Returns
        a dict with the extractions plus processing time in milliseconds.
        """
        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump if the wall clock is adjusted, skewing the measurement.
        start = time.perf_counter()
        # Extract entities and relations
        entities, relations = self.relation_extractor.extract_relations(text)
        # Extract events
        events = self.event_extractor.extract_events(text)
        # Add to knowledge graph
        for relation in relations:
            self.knowledge_graph.add_triple(
                subject=relation.subject.text,
                predicate=relation.predicate,
                obj=relation.object.text
            )
        self._update_stats(entities, relations, events)
        processing_time = time.perf_counter() - start
        return {
            "entities": entities,
            "relations": relations,
            "events": events,
            "processing_time_ms": processing_time * 1000
        }

    def _update_stats(self, entities, relations, events):
        """Fold one document's extractions into the running statistics."""
        self.stats["documents_processed"] += 1
        self.stats["entities_extracted"] += len(entities)
        self.stats["relations_extracted"] += len(relations)
        self.stats["events_extracted"] += len(events)
        # Counter.update counts an iterable of labels in one pass,
        # replacing the three manual increment loops.
        self.stats["entity_types"].update(e.label for e in entities)
        self.stats["relation_types"].update(r.predicate for r in relations)
        self.stats["event_types"].update(ev.event_type for ev in events)

    def batch_process(self, documents: List[str]) -> List[Dict]:
        """Process multiple documents in order; one result dict per document."""
        return [self.process_document(doc) for doc in documents]

    def get_stats(self) -> Dict:
        """Return aggregate extraction statistics as plain dicts."""
        return {
            "documents_processed": self.stats["documents_processed"],
            "entities_extracted": self.stats["entities_extracted"],
            "relations_extracted": self.stats["relations_extracted"],
            "events_extracted": self.stats["events_extracted"],
            "avg_entities_per_doc": self.stats["entities_extracted"] / max(self.stats["documents_processed"], 1),
            "entity_type_distribution": dict(self.stats["entity_types"]),
            "relation_type_distribution": dict(self.stats["relation_types"]),
            "event_type_distribution": dict(self.stats["event_types"]),
            "knowledge_graph_stats": self.knowledge_graph.get_stats()
        }
# Drive the full pipeline over a small corpus and report statistics.
ie_system = ProductionIESystem()
news_corpus = [
    "Tim Cook is the CEO of Apple in Cupertino.",
    "Apple acquired Voysis for $30M in April 2020.",
    "Bill Gates founded Microsoft.",
    "Tesla launched the Cybertruck in November 2019.",
    "Elon Musk works at Tesla in Austin."
]
print("Processing news corpus...\n")
results = ie_system.batch_process(news_corpus)
for doc_num, (document, outcome) in enumerate(zip(news_corpus, results), 1):
    print(f"Document {doc_num}: {document}")
    print(f" Entities: {len(outcome['entities'])}")
    print(f" Relations: {len(outcome['relations'])}")
    print(f" Events: {len(outcome['events'])}")
    print(f" Processing time: {outcome['processing_time_ms']:.2f}ms\n")
# Aggregate statistics across the whole corpus.
stats = ie_system.get_stats()
print("\nExtraction Statistics:")
print(f" Documents processed: {stats['documents_processed']}")
print(f" Total entities: {stats['entities_extracted']}")
print(f" Total relations: {stats['relations_extracted']}")
print(f" Total events: {stats['events_extracted']}")
print(f" Avg entities/doc: {stats['avg_entities_per_doc']:.1f}")
print(f"\nEntity types: {stats['entity_type_distribution']}")
print(f"Relation types: {stats['relation_type_distribution']}")
print(f"Event types: {stats['event_type_distribution']}")
print(f"\nKnowledge graph: {stats['knowledge_graph_stats']['num_triples']} triples, {stats['knowledge_graph_stats']['num_entities']} entities")
Best Practices¶
1. Model Selection¶
Relation extraction: SpanBERT, REBEL (end-to-end)
Event extraction: OneIE, ACE05 models
Coreference: Neuralcoref, SpanBERT coref
Knowledge graphs: REBEL, KnowBERT, LUKE
Domain-specific: Fine-tune on your domain (finance, biomedical)
2. Data Preparation¶
Clean and normalize text
Use high-quality entity linking
Handle ambiguous entities (Apple company vs fruit)
Resolve coreferences before extraction
Annotate training data carefully
3. Training Tips¶
Use TACRED, ACE05, or domain-specific datasets
Pretrain on distant supervision data
Fine-tune on manually annotated examples
Use negative sampling for relations
Monitor precision and recall separately
4. Production Optimization¶
Pipeline NER → Coreference → Relations → Events
Batch processing for efficiency
Cache extracted knowledge
Implement confidence thresholds
Validate extractions with rules
Deduplicate triples in knowledge graphs
Common Challenges¶
Ambiguity: "Apple" (company vs fruit), "Amazon" (company vs river)
Long-distance dependencies: Relations spanning multiple sentences
Implicit relations: "Microsoft's Bill Gates" (FOUNDED implied)
Negation: "Apple did NOT acquire Voysis"
Temporal reasoning: "Former CEO" vs "Current CEO"
Coreference errors: Incorrect pronoun resolution
Evaluation Metrics¶
Relation Extraction:¶
Precision: Correctly extracted / Total extracted
Recall: Correctly extracted / Total gold relations
F1: Harmonic mean of precision and recall
Event Extraction:¶
Trigger identification: Detect event triggers
Argument extraction: Extract event slots
End-to-end: Both trigger and arguments correct
Knowledge Graphs:¶
Triple correctness: Accuracy of (subject, predicate, object)
Completeness: Coverage of domain knowledge
Consistency: No contradictory triples
Key Takeaways¶
✅ IE transforms unstructured text into structured knowledge
✅ Relations connect entities, events add temporal structure
✅ Knowledge graphs enable complex querying and reasoning
✅ Production systems need pipelines: NER → Coref → Relations → Events
✅ Domain-specific fine-tuning is essential for accuracy
✅ Confidence thresholds and validation rules improve precision
🎉 Series Complete!¶
You've completed the NLP specialization series. You now understand:
Named Entity Recognition (01_ner.ipynb)
Machine Translation (02_translation.ipynb)
Text Summarization (03_summarization.ipynb)
Sentiment Analysis (04_sentiment_analysis.ipynb)
Information Extraction (05_information_extraction.ipynb)
Next Steps:¶
Build domain-specific NLP applications
Fine-tune models on your data
Combine multiple tasks in pipelines
Deploy production NLP systems
Explore advanced topics (multi-modal, few-shot learning)
Return to: 00_START_HERE.ipynb - NLP Overview