Token Exploration
Token Exploration - Advanced Examples
Explore how different types of text get tokenized.
Learning objectives:
See how common vs uncommon words tokenize
Understand context-dependent tokenization
Compare different languages
Analyze code tokenization
Explore special characters and numbers
Run this after understanding the basics in tiktoken_example.py
Setup
Import tiktoken for all tokenization operations in this notebook. We will use the cl100k_base encoding throughout — the same BPE vocabulary used by GPT-4 — so every observation about token counts, boundaries, and efficiency directly applies to real OpenAI API usage.
import tiktoken
Helper: Print Section Headers
A small utility function to visually separate the eight exploration topics with formatted banners. Keeping display logic in a helper keeps the analysis cells clean.
def print_section(title):
    """Print ``title`` framed above and below by a 70-character ``=`` rule.

    A blank line is emitted before the top rule so consecutive sections are
    visually separated.
    """
    rule = "=" * 70
    # One call, newline-separated, yields the same four output lines as
    # printing the blank+rule, the title, and the closing rule one by one.
    print("\n" + rule, title, rule, sep="\n")
Helper: Analyze Text Tokenization
What: Given a text string and an encoding, display the full tokenization breakdown: token count, character count, characters-per-token ratio, and a line-by-line mapping of each token ID to its text piece.
Why: The characters-per-token ratio is a key efficiency metric. English prose averages about 4 characters per token (~0.75 words per token) with cl100k_base. Ratios significantly below 4 indicate the tokenizer is splitting the text into many small pieces (common with rare words, non-English text, or special characters), while ratios above 4 indicate highly efficient tokenization of common words.
def analyze_text(text, encoding, label=""):
    """Print a tokenization report for ``text``.

    The report shows the label, the quoted text, the token count, the
    character count, the characters-per-token ratio, and one line per token
    pairing its ID with the text obtained by decoding that single ID.

    Args:
        text: The string to tokenize.
        encoding: Tokenizer object exposing ``encode(text)`` (returning a
            sequence of integer token IDs) and ``decode(ids)`` (returning a
            string), such as the object returned by
            ``tiktoken.get_encoding``.
        label: Optional heading printed above the report.

    Returns:
        None. All output is written to stdout.
    """
    tokens = encoding.encode(text)
    # Decode one ID at a time so each token's text piece can be shown alone.
    pieces = [encoding.decode([token_id]) for token_id in tokens]

    print(f"\n{label}")
    print(f"Text: '{text}'")
    print(f"Token count: {len(tokens)}")
    print(f"Character count: {len(text)}")
    if tokens:
        print(f"Ratio: {len(text) / len(tokens):.2f} chars/token")
    else:
        # No tokens (e.g. empty text): the ratio is undefined, so report
        # that instead of dividing by zero.
        print("Ratio: n/a (no tokens)")
    print("\nToken breakdown:")
    for token_id, piece in zip(tokens, pieces):
        print(f"  {token_id:6d} → '{piece}'")
def main():
    """Run the eight-part tokenization walkthrough and print the results.

    Loads the ``cl100k_base`` encoding once and uses it to print, in order:
    common vs uncommon words, context-dependent variants of one word, code
    snippets, numbers/special characters/emoji, the same question in several
    languages, alternative phrasings of one request, whitespace variants,
    and a set of real-world text samples, followed by a summary.

    Returns:
        None. All output is written to stdout.
    """
    # Initialize encoding (GPT-4)
    encoding = tiktoken.get_encoding("cl100k_base")

    # =================================================================
    # PART 1: Common vs Uncommon Words
    # =================================================================
    print_section("PART 1: COMMON VS UNCOMMON WORDS")
    common_words = [
        "hello",
        "world",
        "computer",
        "artificial intelligence",
    ]
    print("\n📝 Common words (usually 1 token each):")
    for word in common_words:
        analyze_text(word, encoding, f"Word: {word}")

    uncommon_words = [
        "supercalifragilisticexpialidocious",
        "pneumonoultramicroscopicsilicovolcanoconiosis",
        "antidisestablishmentarianism",
    ]
    print("\n🔍 Uncommon words (split into multiple subword tokens):")
    for word in uncommon_words:
        tokens = encoding.encode(word)
        print(f"\n'{word}': {len(tokens)} tokens")
        decoded = [encoding.decode([t]) for t in tokens]
        print(f"  Split as: {decoded}")

    # =================================================================
    # PART 2: Context-Dependent Tokenization
    # =================================================================
    print_section("PART 2: CONTEXT-DEPENDENT TOKENIZATION")
    print("\n🔍 Same word, different tokens based on context:")
    contexts = [
        "red",   # No space, lowercase
        " red",  # Leading space
        "Red",   # Capitalized, no space
        " Red",  # Leading space + capitalized
        "RED",   # All caps
        " RED",  # Leading space + all caps
    ]
    for ctx in contexts:
        tokens = encoding.encode(ctx)
        # Show every ID: a variant is not guaranteed to be a single token,
        # and printing only the first one would hide the rest.
        print(f"'{ctx}' → Token ID(s): {tokens}")
    print("\n📚 Why? Tokenizers learn from real text patterns:")
    print("   - Words mid-sentence usually have leading space")
    print("   - Sentence starts are capitalized without space")
    print("   - This context-awareness improves efficiency")

    # =================================================================
    # PART 3: Code Tokenization
    # =================================================================
    print_section("PART 3: CODE TOKENIZATION")
    code_samples = [
        'def hello_world():',
        'print("Hello, World!")',
        'x = [1, 2, 3, 4, 5]',
        'if x > 10:',
        '    return True',
        '// This is a comment',
        'const greeting = "hello";',
    ]
    print("\n💻 How code gets tokenized:")
    for code in code_samples:
        analyze_text(code, encoding, f"Code: {code}")

    # =================================================================
    # PART 4: Numbers and Special Characters
    # =================================================================
    print_section("PART 4: NUMBERS AND SPECIAL CHARACTERS")
    print("\n🔢 Numbers:")
    numbers = ["123", "1234", "12345", "123456", "1,234,567", "3.14159"]
    for num in numbers:
        tokens = encoding.encode(num)
        print(f"'{num}' → {len(tokens)} tokens: {tokens}")

    print("\n🔣 Special characters:")
    special = ["!!!", "???", "...", "---", "***", "@@", "###"]
    for spec in special:
        tokens = encoding.encode(spec)
        print(f"'{spec}' → {len(tokens)} tokens: {tokens}")

    print("\n😀 Emojis:")
    # NOTE(review): representative emoji samples; the decorative emoji in
    # this function could not be recovered from a mis-encoded copy of the
    # file — confirm against the original notebook.
    emojis = ["😀", "🎉", "❤️", "🚀", "👍"]
    for emoji in emojis:
        tokens = encoding.encode(emoji)
        print(f"'{emoji}' → {len(tokens)} tokens: {tokens}")

    # =================================================================
    # PART 5: Different Languages
    # =================================================================
    print_section("PART 5: MULTILINGUAL TOKENIZATION")
    translations = {
        "English": "How are you?",
        "Spanish": "¿Cómo estás?",
        "French": "Comment allez-vous?",
        "German": "Wie geht es dir?",
        "Chinese": "你好吗?",
        "Japanese": "元気ですか?",
        "Arabic": "كيف حالك؟",
        "Russian": "Как дела?",
        "Korean": "어떻게 지내세요?",
    }
    print("\n🌍 Same question in different languages:")
    print(f"{'Language':<12} {'Text':<25} {'Tokens':<8} {'Chars':<8} {'Ratio'}")
    print("-" * 70)
    for lang, text in translations.items():
        tokens = encoding.encode(text)
        ratio = len(text) / len(tokens)
        print(f"{lang:<12} {text:<25} {len(tokens):<8} {len(text):<8} {ratio:.2f}")
    print("\n💡 Notice: Non-English languages often need more tokens!")
    print("   This means:")
    print("   - Higher API costs for same semantic content")
    print("   - Less text fits in the same context window")
    print("   - Processing may be slower")

    # =================================================================
    # PART 6: Token Efficiency Comparison
    # =================================================================
    print_section("PART 6: TOKEN EFFICIENCY ANALYSIS")
    print("\n📊 Comparing different ways to express the same idea:")
    variations = [
        "Please help me with this problem.",
        "Please assist me with this issue.",
        "Could you please help me solve this?",
        "I need help with this.",
        "Help me with this.",
    ]
    print("\nSame request, different token counts:")
    for var in variations:
        tokens = encoding.encode(var)
        print(f"{len(tokens):2d} tokens: '{var}'")
    print("\n💡 Tip: Concise language = fewer tokens = lower costs!")

    # =================================================================
    # PART 7: Whitespace Handling
    # =================================================================
    print_section("PART 7: WHITESPACE AND FORMATTING")
    print("\n🔍 How whitespace affects tokenization:")
    whitespace_examples = [
        "hello world",    # Single space
        "hello  world",   # Double space
        "hello   world",  # Triple space
        "hello\nworld",   # Newline
        "hello\tworld",   # Tab
        "hello world ",   # Trailing space
        " hello world",   # Leading space
    ]
    # Built once outside the loop: maps each whitespace character to a
    # visible stand-in so the printed sample shows exactly what was encoded.
    visible_whitespace = str.maketrans({" ": "·", "\n": "↵", "\t": "→"})
    for example in whitespace_examples:
        tokens = encoding.encode(example)
        visual = example.translate(visible_whitespace)
        print(f"'{visual}' → {len(tokens)} tokens")

    # =================================================================
    # PART 8: Real-World Examples
    # =================================================================
    print_section("PART 8: REAL-WORLD EXAMPLES")
    print("\n📄 Typical text samples:")
    samples = {
        "Tweet": "Just learned about tokenization! 🎉 It's how LLMs process text. #AI #MachineLearning",
        "Email": "Dear John,\n\nI hope this email finds you well. I wanted to follow up on our meeting yesterday.",
        "Code comment": "# Calculate the sum of all even numbers in the list",
        "Function name": "calculate_total_revenue_for_quarter",
        "URL": "https://platform.openai.com/docs/api-reference",
        "JSON": '{"name": "John", "age": 30, "city": "New York"}',
        "SQL": "SELECT * FROM users WHERE age > 18 ORDER BY created_at DESC;",
    }
    print(f"\n{'Type':<15} {'Tokens':<8} {'Characters':<12} {'Efficiency'}")
    print("-" * 55)
    for text_type, text in samples.items():
        tokens = encoding.encode(text)
        efficiency = len(text) / len(tokens)
        print(f"{text_type:<15} {len(tokens):<8} {len(text):<12} {efficiency:.2f} c/t")
    print("\n💡 Different content types have different token efficiency!")

    # =================================================================
    # SUMMARY
    # =================================================================
    print_section("KEY TAKEAWAYS")
    print("""
1. Common words → 1 token, uncommon words → multiple tokens
2. Context matters: " red" ≠ "red" ≠ "Red" ≠ " Red"
3. Code is tokenized efficiently (keywords, operators, identifiers)
4. Numbers: Usually 1-3 digits per token
5. Non-English languages need MORE tokens for same content
   → Higher costs, less context window space
6. Whitespace (spaces, newlines, tabs) affects tokenization
7. Token efficiency varies by content type:
   - Code: ~3-4 chars/token
   - English text: ~4 chars/token
   - Non-English: ~2-3 chars/token
8. To minimize tokens:
   ✓ Use concise language
   ✓ Avoid repetition
   ✓ Remove unnecessary whitespace
   ✓ Stick to common words when possible
""")
    print("\n" + "=" * 70)
    print("Experiment with your own text! Modify the examples above.")
    print("=" * 70 + "\n")
# Run the full exploration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()