Machine Translation¶
Neural machine translation with MarianMT, NLLB-200, and Helsinki-NLP models. Evaluation with BLEU scores.
# Install dependencies
# !pip install transformers torch sentencepiece sacrebleu datasets
Translation with Transformers¶
# Machine translation with Hugging Face (requires transformers)
# NOTE: the example below is kept inside a string literal so the notebook
# still runs when the optional `transformers` dependency is not installed.
# Copy it into a cell (without the quotes) to execute it.
'''
from transformers import MarianMTModel, MarianTokenizer
# Load translation model (English to French)
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
# Translate text
text = "Hello, how are you today?"
inputs = tokenizer(text, return_tensors="pt", padding=True)
outputs = model.generate(**inputs, max_length=50)
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"English: {text}")
print(f"French: {translation}")
# Multilingual translation with M2M-100
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
# English to Spanish
tokenizer.src_lang = "en"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id("es"))
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Spanish: {translation}")
'''
# Reference list of popular pretrained translation models.
# Fix: bullet (•) and arrow (→) characters were mojibake in the original.
print("Popular Translation Models:\n")
print("Bilingual (Helsinki-NLP OPUS):")
print(" • opus-mt-en-fr (English → French)")
print(" • opus-mt-en-es (English → Spanish)")
print(" • opus-mt-en-de (English → German)")
print("\nMultilingual:")
print(" • facebook/m2m100_418M (100 languages)")
print(" • facebook/mbart-large-50-many-to-many-mmt (50 languages)")
print(" • google/mt5-base (multilingual T5)")
Simple Translation System¶
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
@dataclass
class Translation:
    """Result of a single translation request.

    confidence is a heuristic score in [0, 1]: 1.0 by default,
    0.7 for word-by-word lookups, 0.0 when the target language
    is unsupported and the input is echoed back unchanged.
    """
    source_text: str
    target_text: str
    source_lang: str
    target_lang: str
    confidence: float = 1.0

class SimpleTranslator:
    """Simple dictionary-based translator (demo only).

    Performs naive word-by-word substitution from tiny English→French
    and English→Spanish lexicons; unknown words pass through unchanged.
    Not a real MT system — no grammar, agreement, or reordering.
    """

    def __init__(self):
        # Tiny bilingual lexicons: lowercase English word -> translation.
        # Fix: accented characters were mojibake in the original
        # (e.g. 'plaรฎt', 'cรณmo') and are restored to proper UTF-8 here.
        self.en_to_fr = {
            'hello': 'bonjour',
            'world': 'monde',
            'how': 'comment',
            'are': 'allez',
            'you': 'vous',
            'good': 'bon',
            'morning': 'matin',
            'thank': 'merci',
            'please': "s'il vous plaît"
        }
        self.en_to_es = {
            'hello': 'hola',
            'world': 'mundo',
            'how': 'cómo',
            'are': 'estás',
            'you': 'tú',
            'good': 'bueno',
            'morning': 'mañana',
            'thank': 'gracias',
            'please': 'por favor'
        }

    def translate(self, text: str, target_lang: str = "fr") -> Translation:
        """Translate text word-by-word into target_lang ('fr' or 'es').

        Unsupported target languages return the input unchanged with
        confidence 0.0; otherwise confidence is a fixed 0.7 to reflect
        the crudeness of word-by-word translation.
        """
        # Lowercase, then extract word tokens (punctuation is dropped).
        text_lower = text.lower()
        words = re.findall(r'\b\w+\b', text_lower)
        # Dispatch table instead of an if/elif chain.
        dictionary = {"fr": self.en_to_fr, "es": self.en_to_es}.get(target_lang)
        if dictionary is None:
            return Translation(text, text, "en", target_lang, 0.0)
        # Unknown words fall through untranslated.
        translated_words = [dictionary.get(word, word) for word in words]
        translated_text = ' '.join(translated_words)
        return Translation(
            source_text=text,
            target_text=translated_text,
            source_lang="en",
            target_lang=target_lang,
            confidence=0.7
        )

    def batch_translate(self, texts: List[str], target_lang: str = "fr") -> List[Translation]:
        """Translate multiple texts; results are in input order."""
        return [self.translate(text, target_lang) for text in texts]
# Test simple translator on a few phrases in both target languages.
# Fix: arrow (→) characters were mojibake in the original strings.
translator = SimpleTranslator()
test_texts = [
    "Hello world",
    "How are you",
    "Good morning"
]
print("English → French:\n")
for text in test_texts:
    result = translator.translate(text, "fr")
    print(f"{result.source_text:20s} → {result.target_text}")
print("\nEnglish → Spanish:\n")
for text in test_texts:
    result = translator.translate(text, "es")
    print(f"{result.source_text:20s} → {result.target_text}")
Translation Quality Metrics¶
from collections import Counter
import math
class TranslationMetrics:
    """Lightweight reference-vs-hypothesis translation quality metrics."""

    @staticmethod
    def bleu_score(reference: str, hypothesis: str, n: int = 4) -> float:
        """Simplified BLEU score (unigram only for simplicity)"""
        # In production: use sacrebleu library
        ref = reference.lower().split()
        hyp = hypothesis.lower().split()
        if not hyp:
            return 0.0
        # Clipped unigram precision: overlap of token multisets.
        overlap = sum((Counter(hyp) & Counter(ref)).values())
        precision = overlap / len(hyp)
        # Brevity penalty punishes hypotheses shorter than the reference.
        penalty = math.exp(1 - len(ref) / len(hyp)) if len(hyp) < len(ref) else 1.0
        return penalty * precision

    @staticmethod
    def chrf_score(reference: str, hypothesis: str) -> float:
        """Character F-score (chrF)"""
        # Count characters with spaces removed; a Counter over a string
        # counts its characters directly.
        ref_counts = Counter(reference.replace(' ', ''))
        hyp_counts = Counter(hypothesis.replace(' ', ''))
        common = sum((ref_counts & hyp_counts).values())
        n_hyp = sum(hyp_counts.values())
        n_ref = sum(ref_counts.values())
        p = common / n_hyp if n_hyp else 0
        r = common / n_ref if n_ref else 0
        return 2 * p * r / (p + r) if (p + r) > 0 else 0

    @staticmethod
    def ter_score(reference: str, hypothesis: str) -> float:
        """Translation Edit Rate (simplified)"""
        ref = reference.lower().split()
        hyp = hypothesis.lower().split()
        if not ref:
            return 0
        # Length difference approximates insertions/deletions; aligned
        # positions that differ count as substitutions.
        substitutions = sum(1 for a, b in zip(ref, hyp) if a != b)
        return (abs(len(ref) - len(hyp)) + substitutions) / len(ref)
# Demo: score a good and a bad hypothesis against one French reference.
metrics = TranslationMetrics()
reference = "Bonjour, comment allez-vous aujourd'hui?"
hypothesis1 = "Bonjour, comment allez-vous?"  # Good translation
hypothesis2 = "Hello, how are you?"  # Bad translation (English)
print("Translation Metrics:\n")
print(f"Reference: {reference}\n")
hypotheses = [hypothesis1, hypothesis2]
for i, hyp in enumerate(hypotheses, start=1):
    scores = {
        "bleu": metrics.bleu_score(reference, hyp),
        "chrf": metrics.chrf_score(reference, hyp),
        "ter": metrics.ter_score(reference, hyp),
    }
    print(f"Hypothesis {i}: {hyp}")
    print(f" BLEU: {scores['bleu']:.3f}")
    print(f" chrF: {scores['chrf']:.3f}")
    print(f" TER: {scores['ter']:.3f}")
    print()
Production Translation System¶
from collections import defaultdict, deque
import time
import hashlib
class ProductionTranslator:
    """Production-ready translation system.

    Wraps SimpleTranslator with an in-memory result cache, per-language
    counters, and a rolling latency window (last 100 translations).
    """

    def __init__(self, supported_languages: Optional[List[str]] = None):
        """Create a translator.

        Args:
            supported_languages: target language codes to accept;
                defaults to ["fr", "es"]. (Fix: annotation was
                `List[str] = None`, which is not a valid type for None.)
        """
        self.translator = SimpleTranslator()
        self.supported_languages = supported_languages or ["fr", "es"]
        self.cache = {}  # cache_key -> Translation; unbounded, see clear_cache()
        self.stats = {
            "total_translations": 0,
            "cache_hits": 0,
            "translations_by_lang": defaultdict(int),
            "avg_translation_time": 0,
            "translation_times": deque(maxlen=100)  # rolling latency window
        }

    def _get_cache_key(self, text: str, target_lang: str) -> str:
        """Generate a stable cache key for (text, target_lang)."""
        key_str = f"{text}_{target_lang}"
        # md5 is fine here: keys only need uniqueness, not security.
        return hashlib.md5(key_str.encode()).hexdigest()

    def translate(self, text: str, target_lang: str, use_cache: bool = True) -> Translation:
        """Translate with caching and monitoring.

        Raises:
            ValueError: if target_lang is not in supported_languages.
        """
        start = time.time()
        # Validate language before doing any work.
        if target_lang not in self.supported_languages:
            raise ValueError(f"Language '{target_lang}' not supported. Supported: {self.supported_languages}")
        cache_key = self._get_cache_key(text, target_lang)
        if use_cache and cache_key in self.cache:
            # Cache hits count toward totals but intentionally skip the
            # latency window and per-language counters (those track
            # actual translation work only).
            self.stats["cache_hits"] += 1
            self.stats["total_translations"] += 1
            return self.cache[cache_key]
        translation = self.translator.translate(text, target_lang)
        if use_cache:
            self.cache[cache_key] = translation
        # Record latency and refresh the rolling average.
        translation_time = time.time() - start
        self.stats["translation_times"].append(translation_time)
        self.stats["total_translations"] += 1
        self.stats["translations_by_lang"][target_lang] += 1
        self.stats["avg_translation_time"] = sum(self.stats["translation_times"]) / len(self.stats["translation_times"])
        return translation

    def batch_translate(self, texts: List[str], target_lang: str) -> List[Translation]:
        """Translate multiple texts sequentially (shares the cache)."""
        return [self.translate(text, target_lang) for text in texts]

    def get_stats(self) -> Dict:
        """Return a summary of translation statistics."""
        total = max(self.stats["total_translations"], 1)  # avoid div-by-zero
        return {
            "total_translations": self.stats["total_translations"],
            "cache_hits": self.stats["cache_hits"],
            "cache_hit_rate": self.stats["cache_hits"] / total,
            "translations_by_language": dict(self.stats["translations_by_lang"]),
            "avg_translation_time_ms": self.stats["avg_translation_time"] * 1000
        }

    def clear_cache(self):
        """Clear translation cache"""
        self.cache.clear()
# Test production translator: translate a batch (with one duplicate to
# exercise the cache) and then report statistics.
# Fix: arrow (→) character was mojibake in the original f-string.
prod_translator = ProductionTranslator()
# Translate texts
texts = [
    "Hello world",
    "Good morning",
    "Hello world",  # Duplicate (cache hit)
    "How are you"
]
print("Translating to French...\n")
for text in texts:
    result = prod_translator.translate(text, "fr")
    print(f"{result.source_text:20s} → {result.target_text}")
# Print stats
stats = prod_translator.get_stats()
print(f"\nStatistics:")
print(f" Total translations: {stats['total_translations']}")
print(f" Cache hits: {stats['cache_hits']}")
print(f" Cache hit rate: {stats['cache_hit_rate']:.1%}")
print(f" By language: {stats['translations_by_language']}")
print(f" Avg time: {stats['avg_translation_time_ms']:.2f}ms")
Best Practices¶
1. Model Selection¶
Bilingual: Use Helsinki-NLP OPUS models (fast, accurate)
Multilingual: M2M-100 or mBART (100+ languages)
Domain-specific: Fine-tune on technical/medical texts
Low-resource: Use transfer learning from related languages
2. Data Preparation¶
Align parallel sentences carefully
Clean and normalize text (punctuation, encoding)
Handle special characters and emoji
Use back-translation for data augmentation
3. Training Tips¶
Use byte-pair encoding (BPE) or SentencePiece
Apply label smoothing (0.1)
Use beam search for decoding (beam size 4-5)
Monitor BLEU score on validation set
4. Production Optimization¶
Cache frequent translations
Batch requests for efficiency
Use quantization (INT8) for speed
Implement fallback to simpler models
Common Challenges¶
Word order: Languages have different syntax
Idioms: “It’s raining cats and dogs” → literal translation
Context: “Bank” (financial or river?)
Gender/formality: Spanish “tú” vs “usted”
Low-resource languages: Limited training data
Evaluation Metrics¶
BLEU: Precision-based n-gram overlap (0-100, higher better)
chrF: Character-level F-score (better for morphologically rich languages)
TER: Translation Edit Rate (lower better)
COMET: Neural metric using cross-lingual embeddings
Key Takeaways¶
✓ Transformers are the current state-of-the-art
✓ M2M-100 enables direct translation between 100 languages
✓ BLEU is the standard metric but has limitations
✓ Caching improves production performance significantly
✓ Domain-specific fine-tuning is crucial for specialized text
✓ Beam search improves translation quality
Next: 03_summarization.ipynb - Text Summarization