# Integration Guide: Using Tokenizers with Popular Frameworks
## Connecting Tokenizers to ML Workflows
This guide shows how to integrate tokenizers with:
- 🤗 Transformers
- PyTorch & TensorFlow
- FastAPI / Flask (APIs)
- Streaming applications
- Database storage
## 1. Hugging Face Transformers Integration
### Loading Pretrained Models with Tokenizers
from transformers import AutoTokenizer, AutoModel
import torch
# Load the tokenizer and the model from the same checkpoint name so that
# the token IDs produced by the tokenizer are the ones the model expects.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Tokenize; return_tensors="pt" asks for PyTorch tensors.
text = "Hello, world!"
inputs = tokenizer(text, return_tensors="pt")

# This is inference only, so disable gradient tracking for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# Extract embeddings
embeddings = outputs.last_hidden_state
print(f"Shape: {embeddings.shape}")  # [batch_size, seq_len, hidden_dim]
### Batch Processing with Transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Tokenizer and classification model come from the same checkpoint.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Three short reviews processed as a single batch.
texts = ["This movie is great!", "I didn't like it.", "Amazing performance!"]

# padding=True pads to the longest text in the batch; truncation=True with
# max_length=512 cuts anything longer; return_tensors="pt" gives PyTorch tensors.
inputs = tokenizer(
    texts, padding=True, truncation=True, max_length=512, return_tensors="pt"
)

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# Turn the logits into per-class probabilities along the last axis.
predictions = outputs.logits.softmax(dim=-1)
print(f"Predictions shape: {predictions.shape}")  # [3, num_classes]
### Custom Dataset with Tokenization
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
class TextDataset(Dataset):
    """Dataset that tokenizes one text per access.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (the
    tokenizer output flattened to 1-D) and ``label`` as a ``torch.long``
    tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text, label = self.texts[idx], self.labels[idx]
        # Pad/truncate every sample to max_length so items share one length.
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(label, dtype=torch.long)
        return sample
# Usage: wrap three toy samples and inspect the first shuffled batch.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
texts = ["Text 1", "Text 2", "Text 3"]
labels = [0, 1, 0]
dataset = TextDataset(texts, labels, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Only the first batch is needed to show the shapes.
batch = next(iter(dataloader))
for title, key in (
    ("Input IDs", 'input_ids'),
    ("Attention mask", 'attention_mask'),
    ("Labels", 'label'),
):
    print(f"{title} shape: {batch[key].shape}")
## 2. PyTorch Integration
### Using Tokenizers in PyTorch Models
import torch
import torch.nn as nn
from tokenizers import Tokenizer
class TextClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier over token IDs.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        num_classes: Number of scores produced per example.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super().__init__()
        # padding_idx=0: token ID 0 maps to an all-zero, non-trained vector.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class scores of shape [batch, num_classes].

        Args:
            input_ids: Token IDs, [batch_size, seq_len].
            attention_mask: Same shape as ``input_ids``; its row sums are
                used as sequence lengths, so this assumes padding sits at
                the end of each row (TODO confirm for your tokenizer).
        """
        embedded = self.embedding(input_ids)  # [batch, seq, embed]

        # pack_padded_sequence rejects lengths <= 0 and wants int64 lengths
        # on the CPU. Clamp so a fully-masked row is fed as a single step
        # instead of crashing the whole batch.
        lengths = attention_mask.sum(dim=1).clamp(min=1).to(torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )

        # hidden: [1, batch, hidden] for the single-layer, one-direction LSTM.
        _, (hidden, _) = self.lstm(packed)

        # Drop the leading layer axis before classifying.
        return self.fc(hidden.squeeze(0))  # [batch, num_classes]
# Load tokenizer
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.get_vocab_size()

# Turn on padding so encode_batch() returns equal-length sequences;
# ragged id lists cannot be converted into a rectangular tensor below.
pad_token = "[PAD]"
tokenizer.enable_padding(
    pad_id=tokenizer.token_to_id(pad_token), pad_token=pad_token
)

# Create model
model = TextClassifier(
    vocab_size=vocab_size, embedding_dim=128, hidden_dim=256, num_classes=2
)

# Tokenize input
texts = ["This is great!", "Not good."]
encodings = tokenizer.encode_batch(texts)

# Convert to tensors
input_ids = torch.tensor([enc.ids for enc in encodings])
attention_mask = torch.tensor([enc.attention_mask for enc in encodings])

# Forward pass
outputs = model(input_ids, attention_mask)
print(f"Outputs: {outputs.shape}")  # [2, 2]
## 3. TensorFlow / Keras Integration
### TensorFlow Dataset Pipeline
import tensorflow as tf
from transformers import AutoTokenizer
def create_tf_dataset(texts, labels, tokenizer, max_length=128, batch_size=32):
    """Tokenize ``texts`` up front and return a batched ``tf.data.Dataset``.

    Every text is padded/truncated to ``max_length``. Each dataset element
    is a ``(features, label)`` pair where ``features`` is a dict with
    ``input_ids`` and ``attention_mask``.
    """
    tokenized = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
    )
    # Keep only the two fields the model consumes.
    features = {
        name: tokenized[name] for name in ('input_ids', 'attention_mask')
    }
    slices = tf.data.Dataset.from_tensor_slices((features, labels))
    return slices.batch(batch_size)
# Usage
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
texts = ["Text 1", "Text 2", "Text 3"] * 10
labels = [0, 1, 0] * 10
dataset = create_tf_dataset(texts, labels, tokenizer)

# Unpack into batch-specific names so the module-level `labels` list is not
# overwritten by the first batch's label tensor.
for batch_inputs, batch_labels in dataset.take(1):
    print(f"Input IDs: {batch_inputs['input_ids'].shape}")
    print(f"Labels: {batch_labels.shape}")
### Custom Keras Model
import tensorflow as tf
from transformers import TFAutoModel
class TokenizedTextModel(tf.keras.Model):
    """Pretrained transformer followed by dropout and a softmax classifier.

    The model consumes already-tokenized inputs; it does not tokenize text.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.transformer = TFAutoModel.from_pretrained(model_name)
        self.dropout = tf.keras.layers.Dropout(0.3)
        self.classifier = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, inputs, training=False):
        """Classify from the first token's hidden state.

        ``inputs`` is a dict with 'input_ids' and 'attention_mask'.
        """
        hidden_states = self.transformer(inputs, training=training).last_hidden_state
        # Position 0 of the sequence axis is the [CLS] token representation.
        first_token = hidden_states[:, 0, :]
        return self.classifier(self.dropout(first_token, training=training))
# Create model
model = TokenizedTextModel('bert-base-uncased', num_classes=2)

# Compile. The string shortcut 'adam' would use Keras' default learning rate
# (1e-3), which is far too large for fine-tuning pretrained transformer
# weights; use a small explicit rate instead.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train (dataset from above)
# model.fit(dataset, epochs=3)
## 4. FastAPI Integration (REST API)
### Creating a Tokenization API
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer
from typing import List, Optional
import uvicorn
app = FastAPI(title="Tokenization API")

# Module-level handle shared by all endpoints; stays None until the startup
# hook below has run.
tokenizer = None


@app.on_event("startup")
async def load_tokenizer():
    """Load the tokenizer once when the application starts."""
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    print("✅ Tokenizer loaded")
# Request/Response models
# Request body of POST /tokenize.
class TokenizeRequest(BaseModel):
    text: str  # the text to tokenize
    max_length: Optional[int] = 512  # forwarded as the tokenizer's max_length
    add_special_tokens: Optional[bool] = True  # forwarded to the tokenizer as-is
# Response body of POST /tokenize.
class TokenizeResponse(BaseModel):
    tokens: List[str]  # token strings obtained from the token IDs
    token_ids: List[int]  # the encoding's input_ids
    attention_mask: List[int]  # the encoding's attention_mask
    num_tokens: int  # len(input_ids)
# Request body of POST /batch_tokenize.
class BatchTokenizeRequest(BaseModel):
    texts: List[str]  # the texts to tokenize in one call
    max_length: Optional[int] = 512  # forwarded as the tokenizer's max_length
    padding: Optional[bool] = True  # forwarded as the tokenizer's padding argument
# Endpoints
@app.post("/tokenize", response_model=TokenizeResponse)
async def tokenize_text(request: TokenizeRequest):
    """Tokenize a single text.

    Returns:
        TokenizeResponse with the token strings, token IDs, attention mask
        and the number of token IDs.

    Raises:
        HTTPException: 503 if the tokenizer has not been loaded yet;
            500 (with the error text as detail) if tokenization fails.
    """
    # Report "not ready" explicitly instead of failing on calling None.
    if tokenizer is None:
        raise HTTPException(status_code=503, detail="Tokenizer not loaded")

    try:
        encoding = tokenizer(
            request.text,
            max_length=request.max_length,
            truncation=True,
            add_special_tokens=request.add_special_tokens,
            return_attention_mask=True,
        )
        token_ids = encoding['input_ids']
        return TokenizeResponse(
            tokens=tokenizer.convert_ids_to_tokens(token_ids),
            token_ids=token_ids,
            attention_mask=encoding['attention_mask'],
            num_tokens=len(token_ids),
        )
    except Exception as e:
        # Boundary handler: keep the original cause attached for server logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.post("/batch_tokenize")
async def batch_tokenize(request: BatchTokenizeRequest):
    """Tokenize multiple texts in one tokenizer call.

    Returns:
        One dict per input text, in input order, with 'text', 'tokens',
        'token_ids', 'attention_mask' and 'num_tokens'. 'num_tokens' is the
        length of 'token_ids', which includes padding positions when padding
        is enabled; 'attention_mask' tells padding apart from real tokens.

    Raises:
        HTTPException: 503 if the tokenizer has not been loaded yet;
            500 (with the error text as detail) if tokenization fails.
    """
    if tokenizer is None:
        raise HTTPException(status_code=503, detail="Tokenizer not loaded")
    # Nothing to tokenize: answer directly without calling the tokenizer.
    if not request.texts:
        return []

    try:
        encodings = tokenizer(
            request.texts,
            max_length=request.max_length,
            truncation=True,
            padding=request.padding,
            return_attention_mask=True,
        )
        return [
            {
                'text': text,
                'tokens': tokenizer.convert_ids_to_tokens(ids),
                'token_ids': ids,
                'attention_mask': mask,
                'num_tokens': len(ids),
            }
            for text, ids, mask in zip(
                request.texts,
                encodings['input_ids'],
                encodings['attention_mask'],
            )
        ]
    except Exception as e:
        # Boundary handler: keep the original cause attached for server logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/vocab_size")
async def get_vocab_size():
    """Get tokenizer vocabulary size.

    Raises:
        HTTPException: 503 if the tokenizer has not been loaded yet.
    """
    # len(None) would raise TypeError; report "not ready" instead.
    if tokenizer is None:
        raise HTTPException(status_code=503, detail="Tokenizer not loaded")
    return {"vocab_size": len(tokenizer)}
@app.get("/health")
async def health_check():
    """Report service status and whether the tokenizer has been loaded."""
    loaded = tokenizer is not None
    return {"status": "healthy", "tokenizer_loaded": loaded}
# Run: uvicorn api:app --reload
# When executed directly as a script, serve the app with uvicorn on port 8000
# with host "0.0.0.0".
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
### Client Example
import requests
# requests waits indefinitely unless a timeout is given, so always pass one.
TIMEOUT_SECONDS = 10

# Tokenize single text
response = requests.post(
    "http://localhost:8000/tokenize",
    json={"text": "Hello, world!", "max_length": 512},
    timeout=TIMEOUT_SECONDS,
)
# Fail loudly on 4xx/5xx instead of printing an error payload as a result.
response.raise_for_status()
print(response.json())

# Batch tokenize
response = requests.post(
    "http://localhost:8000/batch_tokenize",
    json={"texts": ["Text 1", "Text 2", "Text 3"], "padding": True},
    timeout=TIMEOUT_SECONDS,
)
response.raise_for_status()
print(response.json())
## 5. Flask Integration (Simple API)
from flask import Flask, request, jsonify
from transformers import AutoTokenizer
# Flask application object; the tokenizer is created once at import time and
# read by the /tokenize handler below on each request.
app = Flask(__name__)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
@app.route('/tokenize', methods=['POST'])
def tokenize():
    """Tokenize endpoint.

    Expects a JSON object with a non-empty string under 'text'. Responds
    with 'tokens', 'token_ids' and 'num_tokens', or with a 400 and an
    'error' message when no usable text was sent.
    """
    # silent=True returns None for a missing or malformed JSON body rather
    # than raising, so every bad request gets the same JSON error below.
    data = request.get_json(silent=True)
    text = data.get('text', '') if isinstance(data, dict) else ''
    # Reject empty input and non-string values (numbers, lists, ...).
    if not isinstance(text, str) or not text:
        return jsonify({'error': 'No text provided'}), 400

    # Tokenize
    encoding = tokenizer(text, return_attention_mask=True)
    tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'])
    return jsonify({
        'tokens': tokens,
        'token_ids': encoding['input_ids'],
        'num_tokens': len(tokens),
    })
@app.route('/health', methods=['GET'])
def health():
    """Liveness probe: always answers with a fixed status payload."""
    payload = {'status': 'ok'}
    return jsonify(payload)
if __name__ == '__main__':
    import os

    # Flask's debug mode enables the interactive debugger, which can execute
    # arbitrary code from the browser. Keep it off unless explicitly
    # requested with FLASK_DEBUG=1 for local development.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1', port=5000)
## 6. Database Integration
### Storing Tokenized Data
import sqlite3
import json
from transformers import AutoTokenizer
class TokenizedDatabase:
    """Store pre-tokenized texts in a SQLite database.

    Each row keeps the original text, its token IDs serialized as a JSON
    array, and the number of token IDs. A fresh connection is opened per
    operation and is always closed, even when the operation fails.
    """

    def __init__(self, db_path='tokenized_data.db'):
        """Load the tokenizer and make sure the table exists at ``db_path``."""
        self.db_path = db_path
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self._init_db()

    def _init_db(self):
        """Create the ``tokenized_texts`` table if it does not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            # `with conn` commits on success and rolls back on error; it does
            # not close the connection, hence the surrounding try/finally.
            with conn:
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS tokenized_texts (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        original_text TEXT NOT NULL,
                        token_ids TEXT NOT NULL,
                        num_tokens INTEGER,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    )
                ''')
        finally:
            conn.close()

    def add_text(self, text):
        """Tokenize ``text``, store it, and return the new row id."""
        input_ids = self.tokenizer(text)['input_ids']
        row = (text, json.dumps(input_ids), len(input_ids))

        conn = sqlite3.connect(self.db_path)
        try:
            with conn:
                cursor = conn.execute('''
                    INSERT INTO tokenized_texts (original_text, token_ids, num_tokens)
                    VALUES (?, ?, ?)
                ''', row)
            return cursor.lastrowid
        finally:
            conn.close()

    def get_tokenized(self, text_id):
        """Return the stored record for ``text_id``, or None if not found.

        The record is a dict with 'text', 'token_ids' (decoded back into a
        list) and 'num_tokens'.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            row = conn.execute('''
                SELECT original_text, token_ids, num_tokens
                FROM tokenized_texts
                WHERE id = ?
            ''', (text_id,)).fetchone()
        finally:
            conn.close()

        if row is None:
            return None
        original_text, token_ids_json, num_tokens = row
        return {
            'text': original_text,
            'token_ids': json.loads(token_ids_json),
            'num_tokens': num_tokens,
        }

    def batch_add(self, texts):
        """Tokenize ``texts`` in one call and insert them in one transaction.

        An empty ``texts`` is a no-op. Returns None.
        """
        # Skip the tokenizer and the database round trip for an empty batch.
        if not texts:
            return

        encodings = self.tokenizer(texts)
        rows = [
            (text, json.dumps(ids), len(ids))
            for text, ids in zip(texts, encodings['input_ids'])
        ]

        conn = sqlite3.connect(self.db_path)
        try:
            # One transaction: either every row is stored or none is.
            with conn:
                conn.executemany('''
                    INSERT INTO tokenized_texts (original_text, token_ids, num_tokens)
                    VALUES (?, ?, ?)
                ''', rows)
        finally:
            conn.close()
# Usage: store one text, then read it back by the id returned from add_text().
db = TokenizedDatabase()
text_id = db.add_text("Hello, world!")
result = db.get_tokenized(text_id)
print(result)
## 7. Streaming Applications
### Process Streaming Data
from transformers import AutoTokenizer
import time
class StreamingTokenizer:
    """Collect incoming texts in a buffer and tokenize them batch-wise."""

    def __init__(self, model_name="bert-base-uncased", buffer_size=100):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, text):
        """Queue ``text``; return a processed batch once the buffer is full.

        Returns None while the buffer holds fewer than ``buffer_size`` texts.
        """
        self.buffer.append(text)
        if len(self.buffer) < self.buffer_size:
            return None
        return self.flush()

    def flush(self):
        """Tokenize everything buffered and empty the buffer.

        Returns a list of ``(text, input_ids, attention_mask)`` tuples, or an
        empty list when nothing is buffered.
        """
        if not self.buffer:
            return []

        encoded = self.tokenizer(
            self.buffer, padding=True, truncation=True, return_tensors='pt'
        )
        # Reset only after tokenization succeeded, so a failing tokenizer
        # call leaves the buffered texts in place.
        pending, self.buffer = self.buffer, []
        return list(zip(pending, encoded['input_ids'], encoded['attention_mask']))
# Usage - simulate streaming
# A buffer of 10 with 25 messages yields two full batches inside the loop
# and leaves 5 messages for the final flush().
stream_tokenizer = StreamingTokenizer(buffer_size=10)
# Simulate incoming data
for i in range(25):
    text = f"Streaming message {i}"
    # add() returns None until the buffer is full, then the processed batch.
    result = stream_tokenizer.add(text)
    if result:
        print(f"Processed batch of {len(result)} texts")
        # Process the batch...
    # NOTE(review): the pause is assumed to apply to every message, not only
    # to iterations that produced a batch - confirm against the original page.
    time.sleep(0.1)
# Don't forget remaining items
final_batch = stream_tokenizer.flush()
if final_batch:
    print(f"Processed final batch of {len(final_batch)} texts")
## 8. Integration Checklist
"""
INTEGRATION CHECKLIST
=====================
Model Loading:
✅ Tokenizer loaded at startup (not per request)
✅ Model and tokenizer versions match
✅ Proper error handling for loading failures
Performance:
✅ Using batch encoding for multiple texts
✅ Appropriate max_length set
✅ Padding strategy chosen (max_length vs longest)
✅ Caching enabled if applicable
Memory Management:
✅ Tokenizer shared across requests (not recreated)
✅ Large batches split into smaller chunks
✅ Proper cleanup of large tensors
Error Handling:
✅ Empty input validation
✅ Unicode error handling
✅ Truncation warnings
✅ OOM protection
API Design (if applicable):
✅ Health check endpoint
✅ Batch processing endpoint
✅ Rate limiting
✅ Request validation
Testing:
✅ Unit tests for edge cases
✅ Integration tests with models
✅ Load testing
✅ Multilingual test cases
"""
## Summary
Key Integration Points:
- 🤗 Transformers: Use `AutoTokenizer` for seamless integration
- PyTorch/TF: Create custom datasets with tokenization
- APIs: Load the tokenizer once at startup, use batch processing
- Databases: Store pre-tokenized data for faster retrieval
- Streaming: Buffer and batch process for efficiency

Remember: Always match the tokenizer to your model! 🎯