Text-to-Speech: TTS with OpenAI, Coqui & Edge TTS
Convert text to natural-sounding speech using cloud APIs and open-source local models.
TTS Options in 2026
| Model | Provider | Quality | Cost | Local? |
|---|---|---|---|---|
| TTS-1 / TTS-1-HD | OpenAI | ★★★★★ | $0.015/1K chars | No |
| Edge TTS | Microsoft | ★★★★ | Free | No (API) |
| Coqui TTS | Open source | ★★★★ | Free | ✅ Yes |
| Bark | Suno AI | ★★★★ | Free | ✅ Yes |
| Kokoro | Open source | ★★★★★ | Free | ✅ Yes |
| gTTS | Google | ★★★ | Free | No |
Rule of thumb: OpenAI TTS for production, Kokoro/Coqui for local/privacy.
# Install dependencies
# !pip install openai edge-tts gtts pydub
# For local: !pip install TTS (Coqui)
# For Bark: !pip install git+https://github.com/suno-ai/bark.git
1. OpenAI TTS API
from openai import OpenAI
from pathlib import Path

# Module-wide OpenAI client shared by the functions below; by default the SDK
# reads the API key from the OPENAI_API_KEY environment variable.
client = OpenAI()
def text_to_speech_openai(
    text: str,
    output_path: str = 'output.mp3',
    voice: str = 'alloy',
    model: str = 'tts-1'
) -> str:
    """
    Convert text to speech using OpenAI TTS.

    Args:
        text: Text to synthesize (the API accepts up to 4096 chars per request).
        output_path: Destination audio file path.
        voice: One of: alloy, echo, fable, onyx, nova, shimmer.
        model: tts-1 (fast, $0.015/1K) or tts-1-hd (high quality, $0.030/1K).

    Returns:
        The path the audio was written to.
    """
    # `response.stream_to_file()` on a plain response is deprecated in the
    # openai>=1.x SDK; the recommended pattern is the streaming-response
    # context manager, which also streams to disk instead of buffering the
    # whole file in memory.
    with client.audio.speech.with_streaming_response.create(
        model=model,
        voice=voice,
        input=text,
        response_format='mp3'
    ) as response:
        response.stream_to_file(output_path)
    print(f'Saved: {output_path}')
    return output_path
# The six OpenAI voices, each with a short character description.
OPENAI_VOICES = {
    'alloy': 'Neutral, balanced',
    'echo': 'Soft, measured',
    'fable': 'Warm, storytelling',
    'onyx': 'Deep, authoritative',
    'nova': 'Bright, energetic',
    'shimmer': 'Gentle, friendly',
}

print('OpenAI TTS voices:')
for voice_name, description in OPENAI_VOICES.items():
    print(f' {voice_name:10s} β {description}')
print('\ntts_openai() function ready.')
# Example:
# text_to_speech_openai('Hello! This is OpenAI TTS.', 'hello.mp3', voice='nova')
2. Voice Comparison — Same Text, All 6 Voices
import os

# A single sentence used for side-by-side voice comparison.
SAMPLE_TEXT = 'Welcome to the world of AI-generated speech. Each voice has a unique character.'

def compare_voices(text: str, output_dir: str = 'voice_samples') -> None:
    """Synthesize *text* once per OpenAI voice so the results can be compared."""
    os.makedirs(output_dir, exist_ok=True)
    for voice_name in OPENAI_VOICES:
        sample_path = f'{output_dir}/{voice_name}.mp3'
        text_to_speech_openai(text, sample_path, voice=voice_name)
        print(f' {voice_name}: {sample_path}')
    print(f'\nAll voices saved to {output_dir}/')

# Uncomment to run:
# compare_voices(SAMPLE_TEXT)
print('compare_voices() function ready β generates all 6 OpenAI voice samples.')
3. Edge TTS — Free, 400+ Voices, 100+ Languages
# edge-tts uses Microsoft's Azure TTS engine for free via Edge browser API
# !pip install edge-tts
import asyncio
import edge_tts

async def edge_tts_speak(
    text: str,
    output_path: str = 'edge_output.mp3',
    voice: str = 'en-US-JennyNeural'
) -> str:
    """Synthesize *text* with Microsoft Edge TTS (free, 400+ voices) and save it."""
    tts_stream = edge_tts.Communicate(text, voice)
    await tts_stream.save(output_path)
    return output_path

def tts_edge(text: str, output_path: str = 'edge_output.mp3', voice: str = 'en-US-JennyNeural') -> str:
    """Blocking convenience wrapper around edge_tts_speak()."""
    coro = edge_tts_speak(text, output_path=output_path, voice=voice)
    return asyncio.run(coro)
# A small, popular subset of the 400+ available neural voices.
EDGE_VOICES = [
    'en-US-JennyNeural',      # US English female (natural)
    'en-US-GuyNeural',        # US English male
    'en-GB-SoniaNeural',      # British English female
    'en-AU-NatashaNeural',    # Australian English female
    'es-ES-ElviraNeural',     # Spanish female
    'fr-FR-DeniseNeural',     # French female
    'de-DE-KatjaNeural',      # German female
    'ja-JP-NanamiNeural',     # Japanese female
    'zh-CN-XiaoxiaoNeural',   # Chinese Mandarin female
]

print('Edge TTS voices (sample):')
for voice_id in EDGE_VOICES:
    print(f' {voice_id}')
# List all voices:
# voices = await edge_tts.list_voices()
# en_voices = [v for v in voices if v['Locale'].startswith('en')]
print('\ntts_edge() ready β free, no API key needed.')
4. gTTS — Google Text-to-Speech (Simple, Free)
# !pip install gtts
from gtts import gTTS

def tts_google(text: str, output_path: str = 'gtts_output.mp3', lang: str = 'en') -> str:
    """
    Minimal TTS via the Google Translate speech endpoint.

    lang: 'en', 'es', 'fr', 'de', 'ja', 'zh', etc.
    Returns the path the MP3 was saved to.
    """
    speech = gTTS(text=text, lang=lang, slow=False)
    speech.save(output_path)
    return output_path
# Multilingual demo: (phrase, gTTS language code) pairs.
phrases = [
    ('Hello, world!', 'en'),
    ('Bonjour le monde!', 'fr'),
    ('Hola mundo!', 'es'),
    ('Hallo Welt!', 'de'),
    ('γγγ«γ‘γ―δΈηοΌ', 'ja'),
]

print('gTTS multilingual examples:')
for sample_phrase, lang_code in phrases:
    print(f' [{lang_code}] {sample_phrase}')
    # Uncomment to generate:
    # tts_google(sample_phrase, f'hello_{lang_code}.mp3', lang=lang_code)
print('\ntts_google() ready β supports 50+ languages.')
5. Coqui TTS — Local, High Quality, Voice Cloning
# Coqui TTS: local models, voice cloning, XTTS for multilingual
# !pip install TTS
# List available models:
# from TTS.api import TTS
# TTS.list_models()

# Key models:
COQUI_MODELS = {
    'tts_models/en/ljspeech/tacotron2-DDC': 'Classic English, fast',
    'tts_models/multilingual/multi-dataset/xtts_v2': 'XTTS v2 β multilingual + voice cloning (best)',
    'tts_models/en/vctk/vits': 'Multiple speakers (109 voices)',
}

def tts_coqui_basic(text: str, output_path: str = 'coqui_output.wav') -> str:
    """Synthesize English speech locally with the single-speaker LJSpeech model."""
    # Lazy import: the TTS package is heavy and optional.
    from TTS.api import TTS
    engine = TTS('tts_models/en/ljspeech/tacotron2-DDC')
    engine.tts_to_file(text=text, file_path=output_path)
    return output_path
def tts_coqui_clone(text: str, speaker_wav: str, output_path: str = 'cloned.wav', language: str = 'en') -> str:
    """
    Clone a voice with XTTS v2 and speak *text* with it.

    speaker_wav: path to 6+ second reference audio of the target voice.
    Supports: en, es, fr, de, it, pt, pl, tr, ru, nl, cs, ar, zh, hu, ko, ja.
    Returns the path the cloned-voice audio was written to.
    """
    # Lazy import: the TTS package is heavy and optional.
    from TTS.api import TTS
    xtts = TTS('tts_models/multilingual/multi-dataset/xtts_v2')
    xtts.tts_to_file(
        text=text,
        speaker_wav=speaker_wav,
        language=language,
        file_path=output_path,
    )
    return output_path
# Show the curated model list plus a voice-cloning usage hint.
print('Coqui TTS models:')
for model_id, summary in COQUI_MODELS.items():
    print(f' {model_id}')
    print(f' β {summary}')
print('\nVoice cloning usage:')
print(' tts_coqui_clone("Hello!", speaker_wav="your_voice.wav")')
print(' β Generates speech in the cloned voice.')
6. Speech Rate & Pitch Control
# Edge TTS supports SSML for fine control over rate, pitch, and emphasis
async def edge_tts_ssml(
    text: str,
    output_path: str,
    voice: str = 'en-US-JennyNeural',
    rate: str = '+0%',     # -50% to +200%
    pitch: str = '+0Hz',   # -50Hz to +50Hz
    volume: str = '+0%'    # -100% to +100%
) -> None:
    """Save *text* as speech with explicit rate/pitch/volume prosody settings."""
    speech = edge_tts.Communicate(
        text,
        voice,
        rate=rate,
        pitch=pitch,
        volume=volume,
    )
    await speech.save(output_path)
# Illustrative rate settings: (rate string, typical use case).
rate_examples = [
    ('-30%', 'Slow β audiobooks, accessibility'),
    ('+0%', 'Normal'),
    ('+30%', 'Fast β quick summaries'),
    ('+100%', 'Very fast β power user'),
]

print('Rate examples:')
for rate_value, use_case in rate_examples:
    print(f' rate="{rate_value}" β {use_case}')
print('\nUsage:')
print(' asyncio.run(edge_tts_ssml("Hello!", "out.mp3", rate="-20%", pitch="+5Hz"))')
7. Audio Playback in Jupyter
from IPython.display import Audio, display

def play_audio(file_path: str) -> None:
    """Render an inline, autoplaying audio widget for *file_path* in Jupyter."""
    widget = Audio(file_path, autoplay=True)
    display(widget)

# After generating speech:
# output = text_to_speech_openai('Hello!', 'demo.mp3')
# play_audio('demo.mp3')
print('play_audio() β call after generating speech to hear it in the notebook.')
8. Real-World Pipeline: Article → Podcast
import textwrap

def article_to_podcast(
    article_text: str,
    output_path: str = 'podcast.mp3',
    voice: str = 'nova',
    max_chars: int = 4096
) -> str:
    """
    Convert a long article to a podcast audio file.

    OpenAI TTS accepts at most 4096 characters per request, so longer texts
    are split into chunks, synthesized separately, and concatenated.

    Args:
        article_text: Full article text (wrapping normalizes whitespace).
        output_path: Destination MP3 path.
        voice: OpenAI TTS voice name.
        max_chars: Maximum characters per TTS request.

    Returns:
        The output file path.
    """
    # Split into chunks that never exceed the per-request limit.
    chunks = textwrap.wrap(article_text, max_chars, break_long_words=False, break_on_hyphens=False)
    print(f'Splitting into {len(chunks)} chunk(s)...')
    if len(chunks) == 1:
        return text_to_speech_openai(article_text, output_path, voice=voice)

    import os
    import tempfile

    from pydub import AudioSegment

    # Use a managed temp directory instead of hard-coded '/tmp/' paths:
    # portable (no /tmp on Windows) and cleaned up even if synthesis or
    # concatenation raises, so no chunk files leak.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Generate each chunk
        chunk_files = []
        for i, chunk in enumerate(chunks):
            path = os.path.join(tmp_dir, f'chunk_{i}.mp3')
            text_to_speech_openai(chunk, path, voice=voice)
            chunk_files.append(path)

        # Concatenate with pydub
        combined = AudioSegment.empty()
        for f in chunk_files:
            combined += AudioSegment.from_mp3(f)
        combined.export(output_path, format='mp3')

    print(f'Podcast saved: {output_path} ({len(combined)/1000:.1f}s)')
    return output_path
# Demo input for article_to_podcast().
_raw_article = """
Artificial intelligence is transforming every industry.
From healthcare to finance, AI systems are automating complex tasks
that previously required human expertise. Large language models can
now write code, analyze data, and answer complex questions with
remarkable accuracy. The pace of progress continues to accelerate.
"""
sample_article = _raw_article.strip()

print('article_to_podcast() ready.')
print('Usage: article_to_podcast(long_article_text, "episode1.mp3")')
9. TTS Comparison: Speed, Quality, Cost
# Benchmark results on same text (approximately):

# OpenAI TTS-1
# ✅ Quality: Excellent (natural prosody)
# ✅ Speed: ~1s for short text
# ❌ Cost: $0.015/1K chars
# ❌ Requires API key

# Edge TTS (Microsoft)
# ✅ Quality: Very good
# ✅ Speed: ~1-2s
# ✅ Free
# ✅ 400+ voices, 100+ languages
# ❌ Requires internet

# Coqui XTTS v2
# ✅ Quality: Excellent
# ✅ Free, local, private
# ✅ Voice cloning from 6s sample
# ❌ Slow on CPU (~5-30x real-time)
# ❌ Needs GPU for production

# gTTS (Google Translate)
# ✅ Free
# ✅ 50+ languages
# ❌ Robotic quality
# ❌ No rate/pitch control
Exercises
1. Use OpenAI TTS to generate the same text in all 6 voices and compare them.
2. Build a CLI tool that reads a text file aloud using Edge TTS.
3. Convert a news article to a podcast MP3 using article_to_podcast().
4. Use Coqui XTTS to clone your own voice from a 10-second recording.
5. Build a multilingual greeting app: detect language and use matching TTS voice.