Machine Translation¶

Neural machine translation with Helsinki-NLP MarianMT, M2M-100, and NLLB-200 models. Evaluation with BLEU and related metrics.

# Install dependencies
# !pip install transformers torch sentencepiece sacrebleu datasets

Translation with Transformers¶

# Machine translation with Hugging Face (requires transformers)
'''
from transformers import MarianMTModel, MarianTokenizer

# Load translation model (English to French)
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Translate text
text = "Hello, how are you today?"
inputs = tokenizer(text, return_tensors="pt", padding=True)
outputs = model.generate(**inputs, max_length=50)
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"English: {text}")
print(f"French:  {translation}")

# Multilingual translation with M2M-100
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# English to Spanish
tokenizer.src_lang = "en"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id("es"))
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Spanish: {translation}")
'''

print("Popular Translation Models:\n")
print("Bilingual (Helsinki-NLP OPUS):")
print("  • opus-mt-en-fr (English → French)")
print("  • opus-mt-en-es (English → Spanish)")
print("  • opus-mt-en-de (English → German)")
print("\nMultilingual:")
print("  • facebook/m2m100_418M (100 languages)")
print("  • facebook/mbart-large-50-many-to-many-mmt (50 languages)")
print("  • google/mt5-base (multilingual T5)")
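NLLB-200, mentioned in the intro, follows the same pattern but selects languages via FLORES-200 codes such as `eng_Latn` and `fra_Latn`. A hedged sketch in the same commented-out style as above (model name and language-code handling should be checked against your transformers version):

```python
# Many-to-many translation with NLLB-200 (requires transformers)
'''
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

inputs = tokenizer("Hello, how are you today?", return_tensors="pt")
# Force the first decoder token to the target language code
outputs = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.convert_tokens_to_ids("fra_Latn"),
    max_length=50,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
'''
```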

Simple Translation System¶

from typing import Dict, List
from dataclasses import dataclass
import re

@dataclass
class Translation:
    """Translation result"""
    source_text: str
    target_text: str
    source_lang: str
    target_lang: str
    confidence: float = 1.0

class SimpleTranslator:
    """Simple dictionary-based translator (demo only)"""
    
    def __init__(self):
        # Simple bilingual dictionaries
        self.en_to_fr = {
            'hello': 'bonjour',
            'world': 'monde',
            'how': 'comment',
            'are': 'allez',
            'you': 'vous',
            'good': 'bon',
            'morning': 'matin',
            'thank': 'merci',
            'please': 's\'il vous plaît'
        }
        
        self.en_to_es = {
            'hello': 'hola',
            'world': 'mundo',
            'how': 'cómo',
            'are': 'estás',
            'you': 'tú',
            'good': 'bueno',
            'morning': 'mañana',
            'thank': 'gracias',
            'please': 'por favor'
        }
    
    def translate(self, text: str, target_lang: str = "fr") -> Translation:
        """Translate text (word-by-word, demo only)"""
        # Clean and tokenize
        text_lower = text.lower()
        words = re.findall(r'\b\w+\b', text_lower)
        
        # Select dictionary
        if target_lang == "fr":
            dictionary = self.en_to_fr
        elif target_lang == "es":
            dictionary = self.en_to_es
        else:
            return Translation(text, text, "en", target_lang, 0.0)
        
        # Translate word by word
        translated_words = [dictionary.get(word, word) for word in words]
        translated_text = ' '.join(translated_words)
        
        return Translation(
            source_text=text,
            target_text=translated_text,
            source_lang="en",
            target_lang=target_lang,
            confidence=0.7
        )
    
    def batch_translate(self, texts: List[str], target_lang: str = "fr") -> List[Translation]:
        """Translate multiple texts"""
        return [self.translate(text, target_lang) for text in texts]

# Test simple translator
translator = SimpleTranslator()

test_texts = [
    "Hello world",
    "How are you",
    "Good morning"
]

print("English → French:\n")
for text in test_texts:
    result = translator.translate(text, "fr")
    print(f"{result.source_text:20s} → {result.target_text}")

print("\nEnglish → Spanish:\n")
for text in test_texts:
    result = translator.translate(text, "es")
    print(f"{result.source_text:20s} → {result.target_text}")

Translation Quality Metrics¶

from collections import Counter
import math

class TranslationMetrics:
    """Translation evaluation metrics"""
    
    @staticmethod
    def bleu_score(reference: str, hypothesis: str) -> float:
        """Simplified BLEU (unigram precision with brevity penalty)"""
        # In production: use the sacrebleu library
        ref_tokens = reference.lower().split()
        hyp_tokens = hypothesis.lower().split()
        
        if not hyp_tokens:
            return 0.0
        
        # Count matches
        ref_counter = Counter(ref_tokens)
        hyp_counter = Counter(hyp_tokens)
        
        matches = sum((hyp_counter & ref_counter).values())
        total = len(hyp_tokens)
        
        precision = matches / total if total > 0 else 0
        
        # Brevity penalty
        bp = 1.0
        if len(hyp_tokens) < len(ref_tokens):
            bp = math.exp(1 - len(ref_tokens) / len(hyp_tokens))
        
        return bp * precision
    
    @staticmethod
    def chrf_score(reference: str, hypothesis: str) -> float:
        """Character F-score (chrF)"""
        ref_chars = list(reference.replace(' ', ''))
        hyp_chars = list(hypothesis.replace(' ', ''))
        
        ref_counter = Counter(ref_chars)
        hyp_counter = Counter(hyp_chars)
        
        matches = sum((ref_counter & hyp_counter).values())
        
        precision = matches / len(hyp_chars) if hyp_chars else 0
        recall = matches / len(ref_chars) if ref_chars else 0
        
        f_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        
        return f_score
    
    @staticmethod
    def ter_score(reference: str, hypothesis: str) -> float:
        """Translation Edit Rate (simplified)"""
        # Number of edits needed / reference length
        ref_tokens = reference.lower().split()
        hyp_tokens = hypothesis.lower().split()
        
        # Crude approximation: length difference plus positional mismatches
        # (real TER computes a full edit distance and allows phrase shifts)
        edits = abs(len(ref_tokens) - len(hyp_tokens))
        for r, h in zip(ref_tokens, hyp_tokens):
            if r != h:
                edits += 1
        
        ter = edits / len(ref_tokens) if ref_tokens else 0
        return ter
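`ter_score` above only counts positional mismatches, so it over-penalizes when a single word is inserted early in the sentence (every later position then disagrees). A closer sketch uses word-level Levenshtein distance; real TER additionally allows block shifts. `ter_edit` and `word_edit_distance` are new names, not used elsewhere in this notebook:

```python
def word_edit_distance(ref_tokens, hyp_tokens):
    """Word-level Levenshtein distance (insertions, deletions, substitutions)."""
    m, n = len(ref_tokens), len(hyp_tokens)
    prev = list(range(n + 1))
    for i in range(1, m + 1):
        curr = [i] + [0] * n
        for j in range(1, n + 1):
            cost = 0 if ref_tokens[i - 1] == hyp_tokens[j - 1] else 1
            curr[j] = min(prev[j] + 1,         # deletion
                          curr[j - 1] + 1,     # insertion
                          prev[j - 1] + cost)  # substitution / match
        prev = curr
    return prev[n]

def ter_edit(reference: str, hypothesis: str) -> float:
    """TER without shifts: word-level edits / reference length."""
    ref = reference.lower().split()
    hyp = hypothesis.lower().split()
    return word_edit_distance(ref, hyp) / len(ref) if ref else 0.0

print(f"{ter_edit('the cat sat on the mat', 'the cat on the mat'):.3f}")
```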

# Test metrics
metrics = TranslationMetrics()

reference = "Bonjour, comment allez-vous aujourd'hui?"
hypothesis1 = "Bonjour, comment allez-vous?"  # Good translation
hypothesis2 = "Hello, how are you?"  # Bad translation (English)

print("Translation Metrics:\n")
print(f"Reference: {reference}\n")

for i, hyp in enumerate([hypothesis1, hypothesis2], 1):
    bleu = metrics.bleu_score(reference, hyp)
    chrf = metrics.chrf_score(reference, hyp)
    ter = metrics.ter_score(reference, hyp)
    
    print(f"Hypothesis {i}: {hyp}")
    print(f"  BLEU:  {bleu:.3f}")
    print(f"  chrF:  {chrf:.3f}")
    print(f"  TER:   {ter:.3f}")
    print()
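The `bleu_score` above stops at unigrams. A sketch closer to real BLEU combines modified n-gram precisions (orders 1 to 4) with a geometric mean and a brevity penalty; the add-one smoothing here is one of several common variants, and `sacrebleu` remains the reference implementation:

```python
import math
from collections import Counter

def ngrams(tokens, n):
    """All contiguous n-grams of a token list."""
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def bleu(reference: str, hypothesis: str, max_n: int = 4) -> float:
    """BLEU with modified n-gram precisions, geometric mean, brevity penalty."""
    ref = reference.lower().split()
    hyp = hypothesis.lower().split()
    if not hyp:
        return 0.0
    log_precisions = []
    for n in range(1, max_n + 1):
        hyp_counts = Counter(ngrams(hyp, n))
        ref_counts = Counter(ngrams(ref, n))
        matches = sum((hyp_counts & ref_counts).values())
        total = max(sum(hyp_counts.values()), 1)
        if n > 1:  # add-one smoothing for higher orders
            matches, total = matches + 1, total + 1
        if matches == 0:
            return 0.0
        log_precisions.append(math.log(matches / total))
    # Brevity penalty punishes hypotheses shorter than the reference
    bp = 1.0 if len(hyp) >= len(ref) else math.exp(1 - len(ref) / len(hyp))
    return bp * math.exp(sum(log_precisions) / max_n)

print(f"{bleu('the cat sat on the mat', 'the cat sat on the mat'):.3f}")  # 1.000
print(f"{bleu('the cat sat on the mat', 'the cat on the mat'):.3f}")
```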

Production Translation System¶

from collections import defaultdict, deque
import time
import hashlib

class ProductionTranslator:
    """Production-ready translation system"""
    
    def __init__(self, supported_languages: List[str] = None):
        self.translator = SimpleTranslator()
        self.supported_languages = supported_languages or ["fr", "es"]
        self.cache = {}
        self.stats = {
            "total_translations": 0,
            "cache_hits": 0,
            "translations_by_lang": defaultdict(int),
            "avg_translation_time": 0,
            "translation_times": deque(maxlen=100)
        }
    
    def _get_cache_key(self, text: str, target_lang: str) -> str:
        """Generate cache key"""
        key_str = f"{text}_{target_lang}"
        return hashlib.md5(key_str.encode()).hexdigest()
    
    def translate(self, text: str, target_lang: str, use_cache: bool = True) -> Translation:
        """Translate with caching and monitoring"""
        start = time.time()
        
        # Validate language
        if target_lang not in self.supported_languages:
            raise ValueError(f"Language '{target_lang}' not supported. Supported: {self.supported_languages}")
        
        # Check cache
        cache_key = self._get_cache_key(text, target_lang)
        if use_cache and cache_key in self.cache:
            self.stats["cache_hits"] += 1
            self.stats["total_translations"] += 1
            return self.cache[cache_key]
        
        # Translate
        translation = self.translator.translate(text, target_lang)
        
        # Cache result
        if use_cache:
            self.cache[cache_key] = translation
        
        # Update stats
        translation_time = time.time() - start
        self.stats["translation_times"].append(translation_time)
        self.stats["total_translations"] += 1
        self.stats["translations_by_lang"][target_lang] += 1
        self.stats["avg_translation_time"] = sum(self.stats["translation_times"]) / len(self.stats["translation_times"])
        
        return translation
    
    def batch_translate(self, texts: List[str], target_lang: str) -> List[Translation]:
        """Translate multiple texts"""
        return [self.translate(text, target_lang) for text in texts]
    
    def get_stats(self) -> Dict:
        """Get translation statistics"""
        total = max(self.stats["total_translations"], 1)
        return {
            "total_translations": self.stats["total_translations"],
            "cache_hits": self.stats["cache_hits"],
            "cache_hit_rate": self.stats["cache_hits"] / total,
            "translations_by_language": dict(self.stats["translations_by_lang"]),
            "avg_translation_time_ms": self.stats["avg_translation_time"] * 1000
        }
    
    def clear_cache(self):
        """Clear translation cache"""
        self.cache.clear()

# Test production translator
prod_translator = ProductionTranslator()

# Translate texts
texts = [
    "Hello world",
    "Good morning",
    "Hello world",  # Duplicate (cache hit)
    "How are you"
]

print("Translating to French...\n")
for text in texts:
    result = prod_translator.translate(text, "fr")
    print(f"{result.source_text:20s} → {result.target_text}")

# Print stats
stats = prod_translator.get_stats()
print(f"\nStatistics:")
print(f"  Total translations: {stats['total_translations']}")
print(f"  Cache hits: {stats['cache_hits']}")
print(f"  Cache hit rate: {stats['cache_hit_rate']:.1%}")
print(f"  By language: {stats['translations_by_language']}")
print(f"  Avg time: {stats['avg_translation_time_ms']:.2f}ms")
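One caveat on the design above: `self.cache` grows without bound. Capping it with least-recently-used (LRU) eviction is a common fix; a minimal sketch with `collections.OrderedDict` (the `LRUCache` class is illustrative, not part of `ProductionTranslator`):

```python
from collections import OrderedDict

class LRUCache:
    """Bounded cache that evicts the least-recently-used entry."""

    def __init__(self, max_size: int = 10_000):
        self.max_size = max_size
        self._data = OrderedDict()

    def get(self, key):
        if key not in self._data:
            return None
        self._data.move_to_end(key)  # mark as recently used
        return self._data[key]

    def put(self, key, value):
        if key in self._data:
            self._data.move_to_end(key)
        self._data[key] = value
        if len(self._data) > self.max_size:
            self._data.popitem(last=False)  # evict the oldest entry

cache = LRUCache(max_size=2)
cache.put("hello_fr", "bonjour")
cache.put("world_fr", "monde")
cache.get("hello_fr")             # touch: "hello_fr" is now most recent
cache.put("morning_fr", "matin")  # evicts "world_fr"
print(cache.get("world_fr"))      # None
```

For simple cases, `functools.lru_cache` on a translation function achieves the same effect with less code.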

Best Practices¶

1. Model Selection¶

  • Bilingual: Use Helsinki-NLP OPUS models (fast, accurate)

  • Multilingual: M2M-100 (100 languages) or mBART-50 (50 languages)

  • Domain-specific: Fine-tune on technical/medical texts

  • Low-resource: Use transfer learning from related languages

2. Data Preparation¶

  • Align parallel sentences carefully

  • Clean and normalize text (punctuation, encoding)

  • Handle special characters and emoji

  • Use back-translation for data augmentation

3. Training Tips¶

  • Use byte-pair encoding (BPE) or SentencePiece

  • Apply label smoothing (0.1)

  • Use beam search for decoding (beam size 4-5)

  • Monitor BLEU score on the validation set

4. Production Optimization¶

  • Cache frequent translations

  • Batch requests for efficiency

  • Use quantization (INT8) for speed

  • Implement fallback to simpler models

Common Challenges¶

  1. Word order: Languages have different syntax

  2. Idioms: "It's raining cats and dogs" ≠ literal translation

  3. Context: "Bank" (financial or river?)

  4. Gender/formality: Spanish "tú" vs "usted"

  5. Low-resource languages: Limited training data

Evaluation Metrics¶

  • BLEU: Precision-based n-gram overlap (0-100, higher better)

  • chrF: Character-level F-score (better for morphologically rich languages)

  • TER: Translation Edit Rate (lower better)

  • COMET: Neural metric using cross-lingual embeddings

Key Takeaways¶

✅ Transformers are the current state-of-the-art

✅ M2M-100 enables direct translation between 100 languages

✅ BLEU is the standard metric but has limitations

✅ Caching improves production performance significantly

✅ Domain-specific fine-tuning is crucial for specialized text

✅ Beam search improves translation quality

Next: 03_summarization.ipynb - Text Summarization