Advanced NLP Specialization: Start Here¶

Overview of the NLP track: NER, machine translation, summarization, sentiment analysis, and information extraction.

# Install dependencies
# !pip install transformers torch datasets spacy nltk sentencepiece

NLP Task Overview¶

from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
import numpy as np

@dataclass
class NLPTaskExample:
    """Example for NLP task demonstration"""
    task: str
    input_text: str
    output: str
    description: str

# Define task examples
task_examples = [
    NLPTaskExample(
        task="Named Entity Recognition",
        input_text="Apple Inc. CEO Tim Cook announced new products in Cupertino.",
        output="[Apple Inc.: ORG], [Tim Cook: PERSON], [Cupertino: LOC]",
        description="Extract and classify named entities"
    ),
    NLPTaskExample(
        task="Machine Translation",
        input_text="Hello, how are you?",
        output="Bonjour, comment allez-vous? (French)",
        description="Translate between languages"
    ),
    NLPTaskExample(
        task="Text Summarization",
        input_text="[Long article about climate change...]",
        output="Climate change poses significant risks. Scientists recommend urgent action.",
        description="Generate concise summaries"
    ),
    NLPTaskExample(
        task="Sentiment Analysis",
        input_text="This product is amazing! Best purchase ever.",
        output="Sentiment: POSITIVE (confidence: 0.95)",
        description="Classify emotional tone"
    ),
    NLPTaskExample(
        task="Information Extraction",
        input_text="John Smith signed a 3-year contract worth $500,000.",
        output="Person: John Smith, Duration: 3 years, Amount: $500,000",
        description="Extract structured information"
    ),
]

# Display examples
print("NLP Task Examples:\n" + "=" * 80)
for example in task_examples:
    print(f"\n{example.task}")
    print(f"  Description: {example.description}")
    print(f"  Input:  {example.input_text}")
    print(f"  Output: {example.output}")

Text Preprocessing Pipeline¶

import re
from typing import List

class TextPreprocessor:
    """Advanced text preprocessing for NLP tasks"""
    
    def __init__(self, lowercase: bool = True, remove_special: bool = False):
        self.lowercase = lowercase
        self.remove_special = remove_special
    
    def clean_text(self, text: str) -> str:
        """Clean text"""
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Lowercase
        if self.lowercase:
            text = text.lower()
        
        # Remove special characters (optional)
        if self.remove_special:
            text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
        
        return text
    
    def tokenize(self, text: str) -> List[str]:
        """Simple word tokenization"""
        # In production: use proper tokenizer (spaCy, NLTK, HuggingFace)
        return text.split()
    
    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        """Remove common stopwords"""
        # Simple stopwords list
        stopwords = {'the', 'is', 'at', 'which', 'on', 'a', 'an', 'and', 'or', 'but', 'in', 'with'}
        return [token for token in tokens if token.lower() not in stopwords]
    
    def preprocess(self, text: str, remove_stopwords: bool = False) -> List[str]:
        """Full preprocessing pipeline"""
        text = self.clean_text(text)
        tokens = self.tokenize(text)
        if remove_stopwords:
            tokens = self.remove_stopwords(tokens)
        return tokens

# Test preprocessor
preprocessor = TextPreprocessor()

sample_text = "  Hello, World!   This is a    test with   extra   spaces.  "
print(f"Original: '{sample_text}'")
print(f"\nCleaned: '{preprocessor.clean_text(sample_text)}'")
print(f"\nTokens: {preprocessor.tokenize(preprocessor.clean_text(sample_text))}")
print(f"\nWithout stopwords: {preprocessor.preprocess(sample_text, remove_stopwords=True)}")
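The `split()`-based tokenizer above keeps punctuation attached to words ("hello," stays a single token), which is why the comment recommends a proper tokenizer for production. Even without spaCy or NLTK, a regex can already separate words from punctuation. A minimal sketch (the `regex_tokenize` helper is illustrative, not part of the preprocessor above):

```python
import re
from typing import List

def regex_tokenize(text: str) -> List[str]:
    """Split text into word tokens and standalone punctuation marks."""
    # \w+ matches runs of word characters; [^\w\s] matches a single punctuation char
    return re.findall(r"\w+|[^\w\s]", text)

print(regex_tokenize("Hello, World! This is a test."))
# ['Hello', ',', 'World', '!', 'This', 'is', 'a', 'test', '.']
```

This still falls short of a real tokenizer (no handling of contractions, URLs, or multilingual text), but it avoids the "hello," vs "hello" vocabulary split that plain `split()` produces.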

Model Architecture Overview¶

# Common NLP model architectures (requires transformers)
'''
from transformers import AutoTokenizer, AutoModel

# BERT-based models (encoder-only)
# Good for: Classification, NER, sentiment analysis
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# GPT-based models (decoder-only)
# Good for: Text generation, completion
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# T5/BART models (encoder-decoder)
# Good for: Translation, summarization, question answering
model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
'''

print("NLP Model Architecture Guide:\n")
print("Encoder-Only (BERT, RoBERTa, DistilBERT):")
print("  ✓ Named Entity Recognition")
print("  ✓ Sentiment Analysis")
print("  ✓ Text Classification")
print("  ✓ Question Answering (extractive)")
print("\nDecoder-Only (GPT-2, GPT-3):")
print("  ✓ Text Generation")
print("  ✓ Story Writing")
print("  ✓ Code Generation")
print("\nEncoder-Decoder (T5, BART, mBART):")
print("  ✓ Translation")
print("  ✓ Summarization")
print("  ✓ Question Answering (generative)")
print("  ✓ Paraphrasing")

Task-Specific Datasets¶

# Loading datasets with Hugging Face (requires datasets library)
'''
from datasets import load_dataset

# Named Entity Recognition
ner_dataset = load_dataset("conll2003")
print(f"NER samples: {len(ner_dataset['train'])}")

# Translation
translation_dataset = load_dataset("wmt14", "de-en")
print(f"Translation pairs: {len(translation_dataset['train'])}")

# Summarization
summarization_dataset = load_dataset("cnn_dailymail", "3.0.0")
print(f"Summarization examples: {len(summarization_dataset['train'])}")

# Sentiment Analysis
sentiment_dataset = load_dataset("imdb")
print(f"Sentiment examples: {len(sentiment_dataset['train'])}")
'''

print("Popular NLP Datasets:\n")
print("Named Entity Recognition:")
print("  • CoNLL-2003 (English news)")
print("  • OntoNotes 5.0 (multi-domain)")
print("  • WikiANN (multilingual)")
print("\nTranslation:")
print("  • WMT (Workshop on Machine Translation)")
print("  • OPUS (parallel corpora)")
print("  • Tatoeba (multilingual sentences)")
print("\nSummarization:")
print("  • CNN/DailyMail (news articles)")
print("  • XSum (extreme summarization)")
print("  • PubMed (scientific papers)")
print("\nSentiment Analysis:")
print("  • IMDb (movie reviews)")
print("  • Yelp (business reviews)")
print("  • Twitter Sentiment140")

Evaluation Metrics¶

class NLPMetrics:
    """Common NLP evaluation metrics"""
    
    @staticmethod
    def precision_recall_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]:
        """Compute precision, recall, F1 score"""
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        return precision, recall, f1
    
    @staticmethod
    def bleu_score_simple(reference: List[str], hypothesis: List[str]) -> float:
        """Simplified BLEU proxy (unigram precision over unique tokens)"""
        # In production: use sacrebleu or nltk.translate.bleu_score
        ref_set = set(reference)
        hyp_set = set(hypothesis)
        overlap = len(ref_set & hyp_set)
        return overlap / len(hyp_set) if hyp_set else 0.0
    
    @staticmethod
    def rouge_score_simple(reference: str, hypothesis: str) -> Dict[str, float]:
        """Simplified ROUGE score"""
        # In production: use rouge-score library
        ref_words = reference.lower().split()
        hyp_words = hypothesis.lower().split()
        
        ref_set = set(ref_words)
        hyp_set = set(hyp_words)
        overlap = len(ref_set & hyp_set)
        
        precision = overlap / len(hyp_set) if hyp_set else 0.0
        recall = overlap / len(ref_set) if ref_set else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        
        return {"precision": precision, "recall": recall, "f1": f1}

# Test metrics
metrics = NLPMetrics()

# Test precision/recall/F1
tp, fp, fn = 80, 10, 20
p, r, f1 = metrics.precision_recall_f1(tp, fp, fn)
print(f"Classification Metrics (TP={tp}, FP={fp}, FN={fn}):")
print(f"  Precision: {p:.2%}")
print(f"  Recall: {r:.2%}")
print(f"  F1 Score: {f1:.2%}")

# Test ROUGE
reference = "The cat sat on the mat"
hypothesis = "The cat was on the mat"
rouge = metrics.rouge_score_simple(reference, hypothesis)
print(f"\nROUGE Score:")
print(f"  Precision: {rouge['precision']:.2%}")
print(f"  Recall: {rouge['recall']:.2%}")
print(f"  F1: {rouge['f1']:.2%}")
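Because `bleu_score_simple` works on sets, it ignores repeated tokens. Real BLEU uses *clipped* n-gram counts: each hypothesis token is credited at most as many times as it appears in the reference. A sketch of clipped unigram precision (the `clipped_unigram_precision` helper is illustrative; use sacrebleu or NLTK for real evaluation):

```python
from collections import Counter
from typing import List

def clipped_unigram_precision(reference: List[str], hypothesis: List[str]) -> float:
    """Unigram precision with hypothesis counts clipped to reference frequency."""
    if not hypothesis:
        return 0.0
    ref_counts = Counter(reference)
    hyp_counts = Counter(hypothesis)
    # Each hypothesis token counts at most ref_counts[token] times
    clipped = sum(min(count, ref_counts[token]) for token, count in hyp_counts.items())
    return clipped / len(hypothesis)

ref = "the cat sat on the mat".split()
hyp = "the the the cat mat".split()
print(f"{clipped_unigram_precision(ref, hyp):.2f}")  # 0.80: 'the' is clipped at 2
```

The clipping is what stops a degenerate hypothesis like "the the the the" from scoring perfect precision just because "the" appears in the reference.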

NLP Pipeline Template¶

from abc import ABC, abstractmethod
from typing import Any

class NLPPipeline(ABC):
    """Base class for NLP pipelines"""
    
    def __init__(self, model_name: str):
        self.model_name = model_name
        self.preprocessor = TextPreprocessor()
    
    @abstractmethod
    def process(self, text: str) -> Any:
        """Process text and return task-specific output"""
        pass
    
    def batch_process(self, texts: List[str]) -> List[Any]:
        """Process multiple texts"""
        return [self.process(text) for text in texts]
    
    def preprocess(self, text: str) -> str:
        """Preprocess text"""
        return self.preprocessor.clean_text(text)

# Example implementation
class SimpleSentimentPipeline(NLPPipeline):
    """Simple sentiment analysis pipeline"""
    
    def process(self, text: str) -> Dict[str, Any]:
        """Analyze sentiment"""
        text = self.preprocess(text)
        
        # Simple rule-based sentiment (demo only)
        positive_words = {'good', 'great', 'excellent', 'amazing', 'wonderful', 'best'}
        negative_words = {'bad', 'terrible', 'awful', 'worst', 'horrible', 'poor'}
        
        words = set(text.lower().split())
        pos_count = len(words & positive_words)
        neg_count = len(words & negative_words)
        
        if pos_count > neg_count:
            sentiment = "POSITIVE"
            score = 0.8
        elif neg_count > pos_count:
            sentiment = "NEGATIVE"
            score = 0.2
        else:
            sentiment = "NEUTRAL"
            score = 0.5
        
        return {
            "text": text,
            "sentiment": sentiment,
            "score": score,
            "positive_words": pos_count,
            "negative_words": neg_count
        }

# Test pipeline
pipeline = SimpleSentimentPipeline("rule-based")

test_texts = [
    "This product is amazing! Best purchase ever.",
    "Terrible quality. Worst experience.",
    "It's okay, nothing special."
]

print("Sentiment Analysis Results:\n")
for text in test_texts:
    result = pipeline.process(text)
    print(f"Text: {text}")
    print(f"  Sentiment: {result['sentiment']} (score: {result['score']:.2f})")
    print()

Series Roadmap¶

Module 1: Named Entity Recognition¶

  • Sequence labeling with BERT

  • BIO/IOB tagging schemes

  • Custom entity types

  • Production NER systems

Module 2: Machine Translation¶

  • Sequence-to-sequence models

  • Attention mechanisms

  • Multilingual models (mBART, M2M-100)

  • Translation quality evaluation (BLEU, chrF)

Module 3: Text Summarization¶

  • Extractive vs abstractive

  • BART/T5 for summarization

  • Long document handling

  • ROUGE evaluation

Module 4: Sentiment Analysis¶

  • Fine-grained sentiment (1-5 stars)

  • Aspect-based sentiment

  • Emotion detection

  • Real-time sentiment monitoring

Module 5: Information Extraction¶

  • Relation extraction

  • Event extraction

  • Knowledge graph construction

  • Template filling

Best Practices¶

1. Model Selection¶

  • Start with pre-trained models (Hugging Face)

  • Fine-tune on domain-specific data

  • Consider model size vs accuracy tradeoff

  • Use DistilBERT for speed, RoBERTa for accuracy

2. Data Preparation¶

  • Clean and normalize text consistently

  • Handle special characters and unicode

  • Balance training data across classes

  • Use data augmentation when needed
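"Handle special characters and unicode" usually starts with Unicode normalization: NFKC folds visually equivalent characters (ligatures, fullwidth forms) into one canonical encoding so the tokenizer sees a single vocabulary entry. A stdlib-only sketch (the `normalize_text` wrapper is illustrative):

```python
import unicodedata

def normalize_text(text: str) -> str:
    """Apply NFKC normalization so equivalent characters share one encoding."""
    return unicodedata.normalize("NFKC", text)

# The 'ﬁ' ligature and fullwidth digits collapse to plain ASCII under NFKC
print(normalize_text("ﬁle ５０％"))  # 'file 50%'
```

NFKC is lossy by design (it discards formatting distinctions), so for tasks where the original glyphs matter, NFC is the conservative alternative.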

3. Training¶

  • Use learning rate warmup

  • Apply gradient clipping

  • Monitor validation metrics

  • Save checkpoints regularly
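Learning-rate warmup is simple to state as a function of the step count: ramp linearly from zero to the base rate, then decay. A minimal sketch of the multiplier math (names and the 3e-5 default are illustrative; in practice, libraries such as transformers provide ready-made schedulers):

```python
def lr_with_warmup(step: int, warmup_steps: int, total_steps: int, base_lr: float = 3e-5) -> float:
    """Linear warmup to base_lr, then linear decay to zero."""
    if step < warmup_steps:
        # Ramp up: step 0 -> 0.0, step warmup_steps -> base_lr
        return base_lr * step / max(1, warmup_steps)
    # Decay linearly over the remaining steps
    remaining = max(0, total_steps - step)
    return base_lr * remaining / max(1, total_steps - warmup_steps)

for step in [0, 50, 100, 500, 1000]:
    print(f"step {step:4d}: lr = {lr_with_warmup(step, warmup_steps=100, total_steps=1000):.2e}")
```

The warmup phase keeps early updates small while optimizer statistics and layer norms stabilize, which is why it pairs naturally with the gradient clipping mentioned above.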

4. Production Deployment¶

  • Optimize with ONNX/TensorRT

  • Implement caching for common inputs

  • Batch requests when possible

  • Monitor performance and errors
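For "caching common inputs", the stdlib already covers the simplest case: memoize the model call keyed on the input text. A sketch using `functools.lru_cache` (the `predict_sentiment` body is a hypothetical stand-in for a real model call; production systems typically use an external cache like Redis instead):

```python
from functools import lru_cache

@lru_cache(maxsize=1024)
def predict_sentiment(text: str) -> str:
    """Stand-in for an expensive model call; results are memoized per input."""
    # A real system would run model inference here
    return "POSITIVE" if "great" in text.lower() else "NEUTRAL"

predict_sentiment("Great service!")    # computed
predict_sentiment("Great service!")    # served from cache
print(predict_sentiment.cache_info())  # hits=1, misses=1
```

Note the caveat: `lru_cache` requires hashable arguments and is per-process, so it helps most for repeated identical queries behind a single worker.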

Key Takeaways¶

✅ Different tasks require different architectures

✅ Pre-trained models accelerate development

✅ Fine-tuning is essential for domain-specific tasks

✅ Evaluation metrics vary by task (F1, BLEU, ROUGE)

✅ Production systems need optimization and monitoring

✅ Multilingual models enable global applications

Next: 01_ner.ipynb - Named Entity Recognition