Information Extraction from Documents

This notebook covers relation extraction, event detection, and structured-data extraction from unstructured text using LLMs and classical NLP pipelines.

# Install dependencies
# !pip install transformers torch spacy
# !python -m spacy download en_core_web_sm

Relation Extraction with Transformers

# Relation extraction with transformers (disabled by default: the block is
# wrapped in a string literal; remove the triple quotes to run it after
# installing transformers and the spaCy model above)
'''
from transformers import pipeline

# Zero-shot relation classification
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli"
)

text = "Bill Gates founded Microsoft in 1975."
relations = ["founded", "works_at", "acquired", "invested_in"]
result = classifier(text, relations)
print(result)

# Pattern-based relation extraction with spaCy
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple Inc. acquired Voysis for $30M.")

# Extract entities
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")

# Define relation patterns
matcher = Matcher(nlp.vocab)
pattern = [{"ENT_TYPE": "ORG"}, {"LEMMA": "acquire"}, {"ENT_TYPE": "ORG"}]
matcher.add("ACQUISITION", [pattern])

matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Relation: {span.text}")
'''

print("Popular Relation Extraction Models:\n")
print("Pretrained:")
print("  • TACRED-trained models (40+ relation types)")
print("  • SpanBERT (span-based relations)")
print("  • REBEL (end-to-end relation extraction)")
print("\nFrameworks:")
print("  • spaCy (rule-based + ML)")
print("  • Stanford CoreNLP (dependency parsing)")
print("  • AllenNLP (neural IE models)")

Simple Relation Extraction

from typing import List, Tuple, Optional
from dataclasses import dataclass
import re

@dataclass
class Entity:
    """Named entity"""
    text: str
    label: str
    start: int
    end: int

@dataclass
class Relation:
    """Relation between two entities"""
    subject: Entity
    predicate: str
    object: Entity
    confidence: float = 1.0
    
    def __str__(self):
        return f"({self.subject.text}, {self.predicate}, {self.object.text})"

class SimpleRelationExtractor:
    """Rule-based relation extraction"""
    
    def __init__(self):
        # Simple entity gazetteer
        self.persons = {'tim cook', 'bill gates', 'elon musk', 'jeff bezos'}
        self.orgs = {'apple', 'microsoft', 'tesla', 'amazon', 'google'}
        self.locations = {'cupertino', 'seattle', 'austin', 'dublin'}
        
        # Relation patterns (verb-based)
        self.relation_patterns = {
            'CEO_OF': r'(\w+(?:\s+\w+)*?)\s+(?:is|was)\s+(?:the\s+)?(?:ceo|chief executive)\s+of\s+(\w+(?:\s+\w+)*)',
            'FOUNDED': r'(\w+(?:\s+\w+)*?)\s+founded\s+(\w+(?:\s+\w+)*)',
            'ACQUIRED': r'(\w+(?:\s+\w+)*?)\s+acquired\s+(\w+(?:\s+\w+)*)',
            'LOCATED_IN': r'(\w+(?:\s+\w+)*?)\s+(?:is|was)\s+(?:located|based|headquartered)\s+in\s+(\w+(?:\s+\w+)*)',
            'WORKS_AT': r'(\w+(?:\s+\w+)*?)\s+(?:works at|employed by)\s+(\w+(?:\s+\w+)*)'
        }
    
    def _extract_entities(self, text: str) -> List[Entity]:
        """Extract entities (simplified NER)"""
        entities = []
        text_lower = text.lower()
        
        # Find persons
        for person in self.persons:
            for match in re.finditer(re.escape(person), text_lower):
                entities.append(Entity(
                    text=text[match.start():match.end()],
                    label='PERSON',
                    start=match.start(),
                    end=match.end()
                ))
        
        # Find organizations
        for org in self.orgs:
            for match in re.finditer(re.escape(org), text_lower):
                entities.append(Entity(
                    text=text[match.start():match.end()],
                    label='ORG',
                    start=match.start(),
                    end=match.end()
                ))
        
        # Find locations
        for loc in self.locations:
            for match in re.finditer(re.escape(loc), text_lower):
                entities.append(Entity(
                    text=text[match.start():match.end()],
                    label='LOC',
                    start=match.start(),
                    end=match.end()
                ))
        
        return sorted(entities, key=lambda e: e.start)
    
    def extract_relations(self, text: str) -> Tuple[List[Entity], List[Relation]]:
        """Extract entities and relations from text"""
        entities = self._extract_entities(text)
        relations = []
        
        text_lower = text.lower()
        
        # Apply relation patterns
        for relation_type, pattern in self.relation_patterns.items():
            for match in re.finditer(pattern, text_lower):
                subject_text = match.group(1).strip()
                object_text = match.group(2).strip()
                
                # Find matching entities
                subject_entity = next(
                    (e for e in entities if e.text.lower() == subject_text),
                    Entity(subject_text, 'UNKNOWN', match.start(1), match.end(1))
                )
                object_entity = next(
                    (e for e in entities if e.text.lower() == object_text),
                    Entity(object_text, 'UNKNOWN', match.start(2), match.end(2))
                )
                
                relations.append(Relation(
                    subject=subject_entity,
                    predicate=relation_type,
                    object=object_entity,
                    confidence=0.8
                ))
        
        return entities, relations

# Test relation extraction
extractor = SimpleRelationExtractor()

texts = [
    "Tim Cook is the CEO of Apple.",
    "Bill Gates founded Microsoft.",
    "Apple acquired a startup in Dublin.",
    "Elon Musk works at Tesla in Austin."
]

print("Relation Extraction Results:\n")
for text in texts:
    entities, relations = extractor.extract_relations(text)
    
    print(f"Text: {text}")
    print(f"Entities: {[f'{e.text}({e.label})' for e in entities]}")
    print(f"Relations: {[str(r) for r in relations]}")
    print()

Event Extraction

from typing import Dict, Any

@dataclass
class Event:
    """Event with structured slots"""
    event_type: str
    trigger: str
    slots: Dict[str, Any]
    confidence: float = 1.0
    
    def __str__(self):
        slots_str = ', '.join(f"{k}={v}" for k, v in self.slots.items())
        return f"{self.event_type}({slots_str})"

class EventExtractor:
    """Extract structured events from text"""
    
    def __init__(self):
        # Event templates
        self.event_patterns = {
            'ACQUISITION': {
                'triggers': ['acquired', 'bought', 'purchased'],
                'slots': ['acquirer', 'target', 'amount', 'date'],
                'pattern': r'(\w+(?:\s+\w+)*?)\s+(?:acquired|bought|purchased)\s+(\w+(?:\s+\w+)*?)(?:\s+for\s+(\$[\d.]+[MBK]?))?(?:\s+(?:in|on)\s+(\w+\s+\d{4}))?(?=[.,;!?]|$)'
            },
            'PRODUCT_LAUNCH': {
                'triggers': ['launched', 'released', 'announced'],
                'slots': ['company', 'product', 'date'],
                'pattern': r'(\w+(?:\s+\w+)*?)\s+(?:launched|released|announced)\s+(\w+(?:\s+\w+)*?)(?:\s+(?:in|on)\s+(\w+\s+\d{4}))?(?=[.,;!?]|$)'
            },
            'FUNDING': {
                'triggers': ['raised', 'secured', 'received'],
                'slots': ['company', 'amount', 'investors', 'date'],
                'pattern': r'(\w+(?:\s+\w+)*?)\s+(?:raised|secured)\s+(\$[\d.]+[MBK]?)(?:\s+from\s+(\w+(?:\s+\w+)*))?'
            }
        }
    
    def extract_events(self, text: str) -> List[Event]:
        """Extract events from text"""
        events = []
        
        for event_type, config in self.event_patterns.items():
            pattern = config['pattern']
            
            for match in re.finditer(pattern, text, re.IGNORECASE):
                slots = {}
                
                # Recover the trigger word actually matched, rather than
                # hard-coding one per event type
                matched_text = match.group(0).lower()
                trigger = next(
                    (t for t in config['triggers'] if t in matched_text),
                    config['triggers'][0]
                )
                
                if event_type == 'ACQUISITION':
                    slots['acquirer'] = match.group(1).strip() if match.group(1) else None
                    slots['target'] = match.group(2).strip() if match.group(2) else None
                    slots['amount'] = match.group(3).strip() if match.group(3) else None
                    slots['date'] = match.group(4).strip() if match.group(4) else None
                    
                elif event_type == 'PRODUCT_LAUNCH':
                    slots['company'] = match.group(1).strip() if match.group(1) else None
                    slots['product'] = match.group(2).strip() if match.group(2) else None
                    slots['date'] = match.group(3).strip() if match.group(3) else None
                    
                elif event_type == 'FUNDING':
                    slots['company'] = match.group(1).strip() if match.group(1) else None
                    slots['amount'] = match.group(2).strip() if match.group(2) else None
                    slots['investors'] = match.group(3).strip() if match.group(3) else None
                
                # Filter out None values
                slots = {k: v for k, v in slots.items() if v is not None}
                
                events.append(Event(
                    event_type=event_type,
                    trigger=trigger,
                    slots=slots,
                    confidence=0.75
                ))
        
        return events

# Test event extraction
event_extractor = EventExtractor()

news_articles = [
    "Apple acquired Voysis for $30M in April 2020.",
    "Tesla launched the Cybertruck in November 2019.",
    "Stripe raised $600M from investors.",
    "Google bought YouTube for $1.65B in October 2006."
]

print("Event Extraction Results:\n")
for article in news_articles:
    events = event_extractor.extract_events(article)
    
    print(f"Article: {article}")
    if events:
        for event in events:
            print(f"  Event: {event}")
            print(f"  Type: {event.event_type}")
            print(f"  Trigger: {event.trigger}")
            for slot, value in event.slots.items():
                print(f"    {slot}: {value}")
    else:
        print("  No events detected")
    print()

Knowledge Graph Construction

from collections import defaultdict

@dataclass(frozen=True)
class Triple:
    """RDF-style triple (subject, predicate, object); frozen=True makes it hashable"""
    subject: str
    predicate: str
    object: str

class KnowledgeGraph:
    """Simple knowledge graph"""
    
    def __init__(self):
        self.triples = set()
        self.entities = set()
        self.relations = defaultdict(list)
    
    def add_triple(self, subject: str, predicate: str, obj: str):
        """Add a triple to the graph"""
        triple = Triple(subject, predicate, obj)
        self.triples.add(triple)
        self.entities.add(subject)
        self.entities.add(obj)
        self.relations[predicate].append((subject, obj))
    
    def query(self, subject: Optional[str] = None, predicate: Optional[str] = None, obj: Optional[str] = None) -> List[Triple]:
        """Query the knowledge graph"""
        results = []
        
        for triple in self.triples:
            if subject and triple.subject != subject:
                continue
            if predicate and triple.predicate != predicate:
                continue
            if obj and triple.object != obj:
                continue
            results.append(triple)
        
        return results
    
    def get_entity_info(self, entity: str) -> Dict[str, List[str]]:
        """Get all information about an entity"""
        info = defaultdict(list)
        
        # Entity as subject
        for triple in self.query(subject=entity):
            info[triple.predicate].append(triple.object)
        
        # Entity as object
        for triple in self.query(obj=entity):
            info[f"INVERSE_{triple.predicate}"].append(triple.subject)
        
        return dict(info)
    
    def get_stats(self) -> Dict:
        """Get graph statistics"""
        return {
            "num_triples": len(self.triples),
            "num_entities": len(self.entities),
            "num_relations": len(self.relations),
            "relations": list(self.relations.keys())
        }

# Build knowledge graph from extracted information
kg = KnowledgeGraph()
relation_extractor = SimpleRelationExtractor()

documents = [
    "Tim Cook is the CEO of Apple.",
    "Apple is located in Cupertino.",
    "Bill Gates founded Microsoft.",
    "Microsoft is located in Seattle."
]

print("Building Knowledge Graph...\n")
for doc in documents:
    entities, relations = relation_extractor.extract_relations(doc)
    
    for relation in relations:
        kg.add_triple(
            subject=relation.subject.text,
            predicate=relation.predicate,
            obj=relation.object.text
        )

# Query the knowledge graph
print("Knowledge Graph Statistics:")
stats = kg.get_stats()
print(f"  Triples: {stats['num_triples']}")
print(f"  Entities: {stats['num_entities']}")
print(f"  Relations: {stats['num_relations']}")
print(f"  Relation types: {stats['relations']}\n")

print("All triples:")
for triple in kg.triples:
    print(f"  ({triple.subject}, {triple.predicate}, {triple.object})")

print("\nQuery: Who is the CEO of Apple?")
results = kg.query(predicate="CEO_OF", obj="Apple")
for triple in results:
    print(f"  Answer: {triple.subject}")

print("\nQuery: What do we know about Microsoft?")
info = kg.get_entity_info("Microsoft")
for relation, values in info.items():
    print(f"  {relation}: {values}")

Production Information Extraction System

import time
from collections import Counter

class ProductionIESystem:
    """Production-ready information extraction system"""
    
    def __init__(self):
        self.relation_extractor = SimpleRelationExtractor()
        self.event_extractor = EventExtractor()
        self.knowledge_graph = KnowledgeGraph()
        self.stats = {
            "documents_processed": 0,
            "entities_extracted": 0,
            "relations_extracted": 0,
            "events_extracted": 0,
            "entity_types": Counter(),
            "relation_types": Counter(),
            "event_types": Counter()
        }
    
    def process_document(self, text: str) -> Dict:
        """Extract all information from a document"""
        start = time.time()
        
        # Extract entities and relations
        entities, relations = self.relation_extractor.extract_relations(text)
        
        # Extract events
        events = self.event_extractor.extract_events(text)
        
        # Add to knowledge graph
        for relation in relations:
            self.knowledge_graph.add_triple(
                subject=relation.subject.text,
                predicate=relation.predicate,
                obj=relation.object.text
            )
        
        # Update stats
        self.stats["documents_processed"] += 1
        self.stats["entities_extracted"] += len(entities)
        self.stats["relations_extracted"] += len(relations)
        self.stats["events_extracted"] += len(events)
        
        for entity in entities:
            self.stats["entity_types"][entity.label] += 1
        
        for relation in relations:
            self.stats["relation_types"][relation.predicate] += 1
        
        for event in events:
            self.stats["event_types"][event.event_type] += 1
        
        processing_time = time.time() - start
        
        return {
            "entities": entities,
            "relations": relations,
            "events": events,
            "processing_time_ms": processing_time * 1000
        }
    
    def batch_process(self, documents: List[str]) -> List[Dict]:
        """Process multiple documents"""
        return [self.process_document(doc) for doc in documents]
    
    def get_stats(self) -> Dict:
        """Get extraction statistics"""
        return {
            "documents_processed": self.stats["documents_processed"],
            "entities_extracted": self.stats["entities_extracted"],
            "relations_extracted": self.stats["relations_extracted"],
            "events_extracted": self.stats["events_extracted"],
            "avg_entities_per_doc": self.stats["entities_extracted"] / max(self.stats["documents_processed"], 1),
            "entity_type_distribution": dict(self.stats["entity_types"]),
            "relation_type_distribution": dict(self.stats["relation_types"]),
            "event_type_distribution": dict(self.stats["event_types"]),
            "knowledge_graph_stats": self.knowledge_graph.get_stats()
        }

# Test production system
ie_system = ProductionIESystem()

news_corpus = [
    "Tim Cook is the CEO of Apple in Cupertino.",
    "Apple acquired Voysis for $30M in April 2020.",
    "Bill Gates founded Microsoft.",
    "Tesla launched the Cybertruck in November 2019.",
    "Elon Musk works at Tesla in Austin."
]

print("Processing news corpus...\n")
results = ie_system.batch_process(news_corpus)

for i, (doc, result) in enumerate(zip(news_corpus, results), 1):
    print(f"Document {i}: {doc}")
    print(f"  Entities: {len(result['entities'])}")
    print(f"  Relations: {len(result['relations'])}")
    print(f"  Events: {len(result['events'])}")
    print(f"  Processing time: {result['processing_time_ms']:.2f}ms\n")

# Print statistics
stats = ie_system.get_stats()
print("\nExtraction Statistics:")
print(f"  Documents processed: {stats['documents_processed']}")
print(f"  Total entities: {stats['entities_extracted']}")
print(f"  Total relations: {stats['relations_extracted']}")
print(f"  Total events: {stats['events_extracted']}")
print(f"  Avg entities/doc: {stats['avg_entities_per_doc']:.1f}")
print(f"\nEntity types: {stats['entity_type_distribution']}")
print(f"Relation types: {stats['relation_type_distribution']}")
print(f"Event types: {stats['event_type_distribution']}")
print(f"\nKnowledge graph: {stats['knowledge_graph_stats']['num_triples']} triples, {stats['knowledge_graph_stats']['num_entities']} entities")

Best Practices

1. Model Selection

  • Relation extraction: SpanBERT, REBEL (end-to-end)

  • Event extraction: OneIE, ACE05-trained models

  • Coreference: Neuralcoref, SpanBERT coref

  • Knowledge graphs: REBEL, KnowBERT, LUKE

  • Domain-specific: fine-tune on your domain (finance, biomedical)

2. Data Preparation

  • Clean and normalize text

  • Use high-quality entity linking

  • Handle ambiguous entities (Apple the company vs. the fruit)

  • Resolve coreferences before extraction

  • Annotate training data carefully

3. Training Tips

  • Use TACRED, ACE05, or domain-specific datasets

  • Pretrain on distant-supervision data

  • Fine-tune on manually annotated examples

  • Use negative sampling for relations

  • Monitor precision and recall separately

4. Production Optimization

  • Pipeline: NER → Coreference → Relations → Events

  • Batch processing for efficiency

  • Cache extracted knowledge

  • Implement confidence thresholds

  • Validate extractions with rules

  • Deduplicate triples in knowledge graphs

Common Challenges

  1. Ambiguity: "Apple" (company vs. fruit), "Amazon" (company vs. river)

  2. Long-distance dependencies: relations spanning multiple sentences

  3. Implicit relations: "Microsoft's Bill Gates" (FOUNDED implied)

  4. Negation: "Apple did NOT acquire Voysis"

  5. Temporal reasoning: "former CEO" vs. "current CEO"

  6. Coreference errors: incorrect pronoun resolution
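
Negation (challenge 4) can be partially caught by scanning a short window before the relation trigger for cue words. This is a toy filter with an assumed cue list and window size; it does no real scope resolution:

```python
import re

NEGATION_CUES = re.compile(r'\b(?:not|never|denied|no longer)\b', re.IGNORECASE)

def is_negated(sentence, trigger, window=30):
    """True if a negation cue appears within `window` chars before the trigger."""
    idx = sentence.lower().find(trigger.lower())
    if idx == -1:
        return False
    return bool(NEGATION_CUES.search(sentence[max(0, idx - window):idx]))

print(is_negated("Apple did not acquire Voysis.", "acquire"))  # True
print(is_negated("Apple acquired Voysis.", "acquired"))        # False
```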

Evaluation Metrics

Relation Extraction:

  • Precision: correctly extracted / total extracted

  • Recall: correctly extracted / total gold relations

  • F1: harmonic mean of precision and recall

Event Extraction:

  • Trigger identification: detect event triggers

  • Argument extraction: extract event slots

  • End-to-end: both trigger and arguments correct

Knowledge Graphs:

  • Triple correctness: accuracy of (subject, predicate, object)

  • Completeness: coverage of domain knowledge

  • Consistency: no contradictory triples

Key Takeaways

✅ IE transforms unstructured text into structured knowledge

✅ Relations connect entities; events add temporal structure

✅ Knowledge graphs enable complex querying and reasoning

✅ Production systems need pipelines: NER → Coref → Relations → Events

✅ Domain-specific fine-tuning is essential for accuracy

✅ Confidence thresholds and validation rules improve precision

🎉 Series Complete!

You've completed the NLP specialization series. You now understand:

  • Named Entity Recognition (01_ner.ipynb)

  • Machine Translation (02_translation.ipynb)

  • Text Summarization (03_summarization.ipynb)

  • Sentiment Analysis (04_sentiment_analysis.ipynb)

  • Information Extraction (05_information_extraction.ipynb)

Next Steps:

  1. Build domain-specific NLP applications

  2. Fine-tune models on your data

  3. Combine multiple tasks in pipelines

  4. Deploy production NLP systems

  5. Explore advanced topics (multi-modal, few-shot learning)

Return to: 00_START_HERE.ipynb - NLP Overview