Advanced NLP Specialization — Start Here¶
Overview of the NLP track: NER, machine translation, summarization, sentiment analysis, and information extraction.
# Install dependencies
# !pip install transformers torch datasets spacy nltk sentencepiece
NLP Task OverviewΒΆ
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
@dataclass
class NLPTaskExample:
    """Example for NLP task demonstration.

    A plain record used only for display in the overview printed below;
    no behavior beyond the generated dataclass methods.
    """
    task: str  # human-readable task name, e.g. "Named Entity Recognition"
    input_text: str  # sample input text for the task
    output: str  # illustrative expected output for that input
    description: str  # one-line summary of what the task does
# Catalogue of one illustrative example per task covered in this track.
task_examples = [
    NLPTaskExample(
        task="Named Entity Recognition",
        input_text="Apple Inc. CEO Tim Cook announced new products in Cupertino.",
        output="[Apple Inc.: ORG], [Tim Cook: PERSON], [Cupertino: LOC]",
        description="Extract and classify named entities",
    ),
    NLPTaskExample(
        task="Machine Translation",
        input_text="Hello, how are you?",
        output="Bonjour, comment allez-vous? (French)",
        description="Translate between languages",
    ),
    NLPTaskExample(
        task="Text Summarization",
        input_text="[Long article about climate change...]",
        output="Climate change poses significant risks. Scientists recommend urgent action.",
        description="Generate concise summaries",
    ),
    NLPTaskExample(
        task="Sentiment Analysis",
        input_text="This product is amazing! Best purchase ever.",
        output="Sentiment: POSITIVE (confidence: 0.95)",
        description="Classify emotional tone",
    ),
    NLPTaskExample(
        task="Information Extraction",
        input_text="John Smith signed a 3-year contract worth $500,000.",
        output="Person: John Smith, Duration: 3 years, Amount: $500,000",
        description="Extract structured information",
    ),
]

# Pretty-print the catalogue under a banner line.
banner = "NLP Task Examples:\n" + "=" * 80
print(banner)
for ex in task_examples:
    print(f"\n{ex.task}")
    print(f" Description: {ex.description}")
    print(f" Input: {ex.input_text}")
    print(f" Output: {ex.output}")
Text Preprocessing PipelineΒΆ
import re
from typing import List
class TextPreprocessor:
    """Advanced text preprocessing for NLP tasks.

    Offers whitespace normalization, optional lowercasing and
    special-character stripping, a whitespace tokenizer, and a small
    built-in stopword filter.
    """

    def __init__(self, lowercase: bool = True, remove_special: bool = False):
        # Flags consulted by clean_text(); both default to the common case.
        self.lowercase = lowercase
        self.remove_special = remove_special

    def clean_text(self, text: str) -> str:
        """Collapse whitespace, then apply the configured normalizations."""
        cleaned = re.sub(r'\s+', ' ', text).strip()
        if self.lowercase:
            cleaned = cleaned.lower()
        if self.remove_special:
            # Keep only alphanumerics, whitespace and basic punctuation.
            cleaned = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', cleaned)
        return cleaned

    def tokenize(self, text: str) -> List[str]:
        """Split on whitespace (use spaCy/NLTK/HF tokenizers in production)."""
        return text.split()

    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        """Filter out tokens whose lowercase form is in a small stopword set."""
        stopwords = {'the', 'is', 'at', 'which', 'on', 'a', 'an',
                     'and', 'or', 'but', 'in', 'with'}
        return [tok for tok in tokens if tok.lower() not in stopwords]

    def preprocess(self, text: str, remove_stopwords: bool = False) -> List[str]:
        """Clean then tokenize *text*, optionally dropping stopwords."""
        tokens = self.tokenize(self.clean_text(text))
        return self.remove_stopwords(tokens) if remove_stopwords else tokens
# Exercise the preprocessing pipeline on a deliberately messy string.
preprocessor = TextPreprocessor()
sample_text = " Hello, World! This is a test with extra spaces. "
cleaned = preprocessor.clean_text(sample_text)
print(f"Original: '{sample_text}'")
print(f"\nCleaned: '{cleaned}'")
print(f"\nTokens: {preprocessor.tokenize(cleaned)}")
print(f"\nWithout stopwords: {preprocessor.preprocess(sample_text, remove_stopwords=True)}")
Model Architecture OverviewΒΆ
# Common NLP model architectures (requires transformers)
# NOTE: the snippet below is deliberately kept inside a string literal so
# this cell runs even when the `transformers` package is not installed.
'''
from transformers import AutoTokenizer, AutoModel
# BERT-based models (encoder-only)
# Good for: Classification, NER, sentiment analysis
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# GPT-based models (decoder-only)
# Good for: Text generation, completion
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# T5/BART models (encoder-decoder)
# Good for: Translation, summarization, question answering
model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
'''
# Plain-text cheat sheet mapping each architecture family to the tasks
# it suits best.
# NOTE(review): the "β" prefix in the strings below looks like a mojibake
# artifact (probably an original check-mark or arrow glyph) — confirm the
# intended character before changing it, since it is part of the printed
# output.
print("NLP Model Architecture Guide:\n")
print("Encoder-Only (BERT, RoBERTa, DistilBERT):")
print(" β Named Entity Recognition")
print(" β Sentiment Analysis")
print(" β Text Classification")
print(" β Question Answering (extractive)")
print("\nDecoder-Only (GPT-2, GPT-3):")
print(" β Text Generation")
print(" β Story Writing")
print(" β Code Generation")
print("\nEncoder-Decoder (T5, BART, mBART):")
print(" β Translation")
print(" β Summarization")
print(" β Question Answering (generative)")
print(" β Paraphrasing")
Task-Specific DatasetsΒΆ
# Loading datasets with Hugging Face (requires datasets library)
# NOTE: kept inside a string literal so this cell runs without the
# `datasets` dependency installed (the loads also download large corpora).
'''
from datasets import load_dataset
# Named Entity Recognition
ner_dataset = load_dataset("conll2003")
print(f"NER samples: {len(ner_dataset['train'])}")
# Translation
translation_dataset = load_dataset("wmt14", "de-en")
print(f"Translation pairs: {len(translation_dataset['train'])}")
# Summarization
summarization_dataset = load_dataset("cnn_dailymail", "3.0.0")
print(f"Summarization examples: {len(summarization_dataset['train'])}")
# Sentiment Analysis
sentiment_dataset = load_dataset("imdb")
print(f"Sentiment examples: {len(sentiment_dataset['train'])}")
'''
# Reference list of popular benchmark datasets per task.
# NOTE(review): the "β’" bullet below looks like a mojibake artifact
# (probably an original "•") — confirm before changing, since it is part
# of the printed output.
print("Popular NLP Datasets:\n")
print("Named Entity Recognition:")
print(" β’ CoNLL-2003 (English news)")
print(" β’ OntoNotes 5.0 (multi-domain)")
print(" β’ WikiANN (multilingual)")
print("\nTranslation:")
print(" β’ WMT (Workshop on Machine Translation)")
print(" β’ OPUS (parallel corpora)")
print(" β’ Tatoeba (multilingual sentences)")
print("\nSummarization:")
print(" β’ CNN/DailyMail (news articles)")
print(" β’ XSum (extreme summarization)")
print(" β’ PubMed (scientific papers)")
print("\nSentiment Analysis:")
print(" β’ IMDb (movie reviews)")
print(" β’ Yelp (business reviews)")
print(" β’ Twitter Sentiment140")
Evaluation MetricsΒΆ
class NLPMetrics:
    """Common NLP evaluation metrics (simplified reference implementations)."""

    @staticmethod
    def precision_recall_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]:
        """Compute precision, recall and F1 from raw TP/FP/FN counts.

        Degenerate denominators (no predicted positives, no actual
        positives, or both scores zero) yield 0.0 rather than raising
        ZeroDivisionError.  Fixed: those branches previously returned
        int ``0`` despite the float-annotated return type.
        """
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        denom = precision + recall
        f1 = 2 * (precision * recall) / denom if denom > 0 else 0.0
        return precision, recall, f1

    @staticmethod
    def bleu_score_simple(reference: List[str], hypothesis: List[str]) -> float:
        """Unigram-overlap precision (toy BLEU).

        In production use sacrebleu or nltk.translate.bleu_score.
        Returns 0.0 for an empty hypothesis.
        """
        ref_set = set(reference)
        hyp_set = set(hypothesis)
        if not hyp_set:
            return 0.0
        return len(ref_set & hyp_set) / len(hyp_set)

    @staticmethod
    def rouge_score_simple(reference: str, hypothesis: str) -> Dict[str, float]:
        """Unigram-overlap precision/recall/F1 (toy ROUGE).

        In production use the rouge-score library.  Both inputs are
        lowercased and whitespace-tokenized; empty inputs yield 0.0.
        """
        ref_set = set(reference.lower().split())
        hyp_set = set(hypothesis.lower().split())
        overlap = len(ref_set & hyp_set)
        precision = overlap / len(hyp_set) if hyp_set else 0.0
        recall = overlap / len(ref_set) if ref_set else 0.0
        denom = precision + recall
        f1 = 2 * (precision * recall) / denom if denom > 0 else 0.0
        return {"precision": precision, "recall": recall, "f1": f1}
# Demonstrate the metric helpers on toy numbers.
metrics = NLPMetrics()

# Classification metrics from raw counts.
tp, fp, fn = 80, 10, 20
precision, recall, f_score = metrics.precision_recall_f1(tp, fp, fn)
print(f"Classification Metrics (TP={tp}, FP={fp}, FN={fn}):")
print(f" Precision: {precision:.2%}")
print(f" Recall: {recall:.2%}")
print(f" F1 Score: {f_score:.2%}")

# ROUGE on a classic reference/hypothesis pair.
ref_sentence = "The cat sat on the mat"
hyp_sentence = "The cat was on the mat"
scores = metrics.rouge_score_simple(ref_sentence, hyp_sentence)
print(f"\nROUGE Score:")
print(f" Precision: {scores['precision']:.2%}")
print(f" Recall: {scores['recall']:.2%}")
print(f" F1: {scores['f1']:.2%}")
NLP Pipeline TemplateΒΆ
from abc import ABC, abstractmethod
class NLPPipeline(ABC):
    """Base class for NLP pipelines.

    Subclasses implement ``process``; preprocessing and batching are
    provided here.  Fixed: the builtin ``any`` was used as a type
    annotation where ``typing.Any`` was intended.
    """

    def __init__(self, model_name: str):
        # Identifier only — no model is actually loaded in this template.
        self.model_name = model_name
        self.preprocessor = TextPreprocessor()

    @abstractmethod
    def process(self, text: str) -> Any:
        """Process text and return task-specific output."""

    def batch_process(self, texts: List[str]) -> List[Any]:
        """Process multiple texts sequentially via ``process``."""
        return [self.process(text) for text in texts]

    def preprocess(self, text: str) -> str:
        """Normalize raw text before task-specific processing."""
        return self.preprocessor.clean_text(text)
# Example implementation
class SimpleSentimentPipeline(NLPPipeline):
    """Rule-based sentiment pipeline (demo only — not for production).

    Fixed: the builtin ``any`` was used as a type annotation where
    ``typing.Any`` was intended.
    """

    def process(self, text: str) -> Dict[str, Any]:
        """Classify sentiment by counting lexicon hits in *text*.

        Returns a dict with the cleaned text, a POSITIVE/NEGATIVE/NEUTRAL
        label, a fixed confidence score, and the raw hit counts.
        """
        text = self.preprocess(text)
        # Tiny hand-rolled lexicons; real systems use learned models.
        positive_words = {'good', 'great', 'excellent', 'amazing', 'wonderful', 'best'}
        negative_words = {'bad', 'terrible', 'awful', 'worst', 'horrible', 'poor'}
        words = set(text.lower().split())
        pos_count = len(words & positive_words)
        neg_count = len(words & negative_words)
        # Majority vote with hard-coded confidence scores.
        if pos_count > neg_count:
            sentiment = "POSITIVE"
            score = 0.8
        elif neg_count > pos_count:
            sentiment = "NEGATIVE"
            score = 0.2
        else:
            sentiment = "NEUTRAL"
            score = 0.5
        return {
            "text": text,
            "sentiment": sentiment,
            "score": score,
            "positive_words": pos_count,
            "negative_words": neg_count,
        }
# Run the rule-based pipeline over a few sample reviews.
pipeline = SimpleSentimentPipeline("rule-based")
test_texts = [
    "This product is amazing! Best purchase ever.",
    "Terrible quality. Worst experience.",
    "It's okay, nothing special.",
]
print("Sentiment Analysis Results:\n")
for sample in test_texts:
    analysis = pipeline.process(sample)
    print(f"Text: {sample}")
    print(f" Sentiment: {analysis['sentiment']} (score: {analysis['score']:.2f})")
    print()
Series RoadmapΒΆ
Module 1: Named Entity RecognitionΒΆ
Sequence labeling with BERT
BIO/IOB tagging schemes
Custom entity types
Production NER systems
Module 2: Machine TranslationΒΆ
Sequence-to-sequence models
Attention mechanisms
Multilingual models (mBART, M2M-100)
Translation quality evaluation (BLEU, chrF)
Module 3: Text SummarizationΒΆ
Extractive vs abstractive
BART/T5 for summarization
Long document handling
ROUGE evaluation
Module 4: Sentiment AnalysisΒΆ
Fine-grained sentiment (1-5 stars)
Aspect-based sentiment
Emotion detection
Real-time sentiment monitoring
Module 5: Information ExtractionΒΆ
Relation extraction
Event extraction
Knowledge graph construction
Template filling
Best PracticesΒΆ
1. Model SelectionΒΆ
Start with pre-trained models (Hugging Face)
Fine-tune on domain-specific data
Consider model size vs accuracy tradeoff
Use DistilBERT for speed, RoBERTa for accuracy
2. Data PreparationΒΆ
Clean and normalize text consistently
Handle special characters and unicode
Balance training data across classes
Use data augmentation when needed
3. TrainingΒΆ
Use learning rate warmup
Apply gradient clipping
Monitor validation metrics
Save checkpoints regularly
4. Production DeploymentΒΆ
Optimize with ONNX/TensorRT
Implement caching for common inputs
Batch requests when possible
Monitor performance and errors
Key TakeawaysΒΆ
β Different tasks require different architectures
β Pre-trained models accelerate development
β Fine-tuning is essential for domain-specific tasks
β Evaluation metrics vary by task (F1, BLEU, ROUGE)
β Production systems need optimization and monitoring
β Multilingual models enable global applications
Next: 01_ner.ipynb - Named Entity Recognition