Text-to-Speech: TTS with OpenAI, Coqui & Edge TTS
Convert text to natural-sounding speech using cloud APIs and open-source local models.
TTS Options in 2026
| Model | Provider | Quality | Cost | Local? |
|---|---|---|---|---|
| TTS-1 / TTS-1-HD | OpenAI | ★★★★★ | $0.015/1K chars | No |
| Edge TTS | Microsoft | ★★★★ | Free | No (API) |
| Coqui TTS | Open source | ★★★★ | Free | ✅ Yes |
| Bark | Suno AI | ★★★★ | Free | ✅ Yes |
| Kokoro | Open source | ★★★★★ | Free | ✅ Yes |
| gTTS | Google | ★★★ | Free | No |
Rule of thumb: OpenAI TTS for production, Kokoro/Coqui for local/privacy.
# Install dependencies
# !pip install openai edge-tts gtts pydub
# For local: !pip install TTS (Coqui)
# For Bark: !pip install git+https://github.com/suno-ai/bark.git
1. OpenAI TTS API
from openai import OpenAI
from pathlib import Path

# Module-wide OpenAI client shared by the functions below; by default the SDK
# reads the API key from the OPENAI_API_KEY environment variable.
client = OpenAI()
def text_to_speech_openai(
    text: str,
    output_path: str = 'output.mp3',
    voice: str = 'alloy',
    model: str = 'tts-1'
) -> str:
    """
    Convert text to speech using OpenAI TTS.

    Args:
        text: Text to synthesize (the API accepts up to 4096 chars per request).
        output_path: Destination audio file path.
        voice: One of: alloy, echo, fable, onyx, nova, shimmer.
        model: tts-1 (fast, $0.015/1K) or tts-1-hd (high quality, $0.030/1K).

    Returns:
        The path the audio was written to.
    """
    # `response.stream_to_file()` on a plain response is deprecated in the
    # openai>=1.x SDK; the recommended pattern is the streaming-response
    # context manager, which also streams to disk instead of buffering the
    # whole file in memory.
    with client.audio.speech.with_streaming_response.create(
        model=model,
        voice=voice,
        input=text,
        response_format='mp3'
    ) as response:
        response.stream_to_file(output_path)
    print(f'Saved: {output_path}')
    return output_path
# The six OpenAI voices, each with a short character description.
OPENAI_VOICES = {
    'alloy': 'Neutral, balanced',
    'echo': 'Soft, measured',
    'fable': 'Warm, storytelling',
    'onyx': 'Deep, authoritative',
    'nova': 'Bright, energetic',
    'shimmer': 'Gentle, friendly',
}

print('OpenAI TTS voices:')
for voice_name, description in OPENAI_VOICES.items():
    print(f' {voice_name:10s} β {description}')
print('\ntts_openai() function ready.')
# Example:
# text_to_speech_openai('Hello! This is OpenAI TTS.', 'hello.mp3', voice='nova')
2. Voice Comparison — Same Text, All 6 Voices
import os

# A single sentence used for side-by-side voice comparison.
SAMPLE_TEXT = 'Welcome to the world of AI-generated speech. Each voice has a unique character.'

def compare_voices(text: str, output_dir: str = 'voice_samples') -> None:
    """Synthesize *text* once per OpenAI voice so the results can be compared."""
    os.makedirs(output_dir, exist_ok=True)
    for voice_name in OPENAI_VOICES:
        sample_path = f'{output_dir}/{voice_name}.mp3'
        text_to_speech_openai(text, sample_path, voice=voice_name)
        print(f' {voice_name}: {sample_path}')
    print(f'\nAll voices saved to {output_dir}/')

# Uncomment to run:
# compare_voices(SAMPLE_TEXT)
print('compare_voices() function ready β generates all 6 OpenAI voice samples.')
3. Edge TTS — Free, 400+ Voices, 100+ Languages
# edge-tts uses Microsoft's Azure TTS engine for free via Edge browser API
# !pip install edge-tts
import asyncio
import edge_tts

async def edge_tts_speak(
    text: str,
    output_path: str = 'edge_output.mp3',
    voice: str = 'en-US-JennyNeural'
) -> str:
    """Synthesize *text* with Microsoft Edge TTS (free, 400+ voices) and save it."""
    tts_stream = edge_tts.Communicate(text, voice)
    await tts_stream.save(output_path)
    return output_path

def tts_edge(text: str, output_path: str = 'edge_output.mp3', voice: str = 'en-US-JennyNeural') -> str:
    """Blocking convenience wrapper around edge_tts_speak()."""
    coro = edge_tts_speak(text, output_path=output_path, voice=voice)
    return asyncio.run(coro)
# A small, popular subset of the 400+ available neural voices.
EDGE_VOICES = [
    'en-US-JennyNeural',      # US English female (natural)
    'en-US-GuyNeural',        # US English male
    'en-GB-SoniaNeural',      # British English female
    'en-AU-NatashaNeural',    # Australian English female
    'es-ES-ElviraNeural',     # Spanish female
    'fr-FR-DeniseNeural',     # French female
    'de-DE-KatjaNeural',      # German female
    'ja-JP-NanamiNeural',     # Japanese female
    'zh-CN-XiaoxiaoNeural',   # Chinese Mandarin female
]

print('Edge TTS voices (sample):')
for voice_id in EDGE_VOICES:
    print(f' {voice_id}')
# List all voices:
# voices = await edge_tts.list_voices()
# en_voices = [v for v in voices if v['Locale'].startswith('en')]
print('\ntts_edge() ready β free, no API key needed.')
4. gTTS — Google Text-to-Speech (Simple, Free)
# !pip install gtts
from gtts import gTTS

def tts_google(text: str, output_path: str = 'gtts_output.mp3', lang: str = 'en') -> str:
    """
    Minimal TTS via the Google Translate speech endpoint.

    lang: 'en', 'es', 'fr', 'de', 'ja', 'zh', etc.
    Returns the path the MP3 was saved to.
    """
    speech = gTTS(text=text, lang=lang, slow=False)
    speech.save(output_path)
    return output_path
# Multilingual demo: (phrase, gTTS language code) pairs.
phrases = [
    ('Hello, world!', 'en'),
    ('Bonjour le monde!', 'fr'),
    ('Hola mundo!', 'es'),
    ('Hallo Welt!', 'de'),
    ('γγγ«γ‘γ―δΈηοΌ', 'ja'),
]

print('gTTS multilingual examples:')
for sample_phrase, lang_code in phrases:
    print(f' [{lang_code}] {sample_phrase}')
    # Uncomment to generate:
    # tts_google(sample_phrase, f'hello_{lang_code}.mp3', lang=lang_code)
print('\ntts_google() ready β supports 50+ languages.')
5. Coqui TTS — Local, High Quality, Voice Cloning
# Coqui TTS: local models, voice cloning, XTTS for multilingual
# !pip install TTS
# List available models:
# from TTS.api import TTS
# TTS.list_models()

# Key models:
COQUI_MODELS = {
    'tts_models/en/ljspeech/tacotron2-DDC': 'Classic English, fast',
    'tts_models/multilingual/multi-dataset/xtts_v2': 'XTTS v2 β multilingual + voice cloning (best)',
    'tts_models/en/vctk/vits': 'Multiple speakers (109 voices)',
}

def tts_coqui_basic(text: str, output_path: str = 'coqui_output.wav') -> str:
    """Synthesize English speech locally with the single-speaker LJSpeech model."""
    # Lazy import: the TTS package is heavy and optional.
    from TTS.api import TTS
    engine = TTS('tts_models/en/ljspeech/tacotron2-DDC')
    engine.tts_to_file(text=text, file_path=output_path)
    return output_path
def tts_coqui_clone(text: str, speaker_wav: str, output_path: str = 'cloned.wav', language: str = 'en') -> str:
    """
    Clone a voice with XTTS v2 and speak *text* with it.

    speaker_wav: path to 6+ second reference audio of the target voice.
    Supports: en, es, fr, de, it, pt, pl, tr, ru, nl, cs, ar, zh, hu, ko, ja.
    Returns the path the cloned-voice audio was written to.
    """
    # Lazy import: the TTS package is heavy and optional.
    from TTS.api import TTS
    xtts = TTS('tts_models/multilingual/multi-dataset/xtts_v2')
    xtts.tts_to_file(
        text=text,
        speaker_wav=speaker_wav,
        language=language,
        file_path=output_path,
    )
    return output_path
# Show the curated model list plus a voice-cloning usage hint.
print('Coqui TTS models:')
for model_id, summary in COQUI_MODELS.items():
    print(f' {model_id}')
    print(f' β {summary}')
print('\nVoice cloning usage:')
print(' tts_coqui_clone("Hello!", speaker_wav="your_voice.wav")')
print(' β Generates speech in the cloned voice.')
6. Speech Rate & Pitch Control
# Edge TTS supports SSML for fine control over rate, pitch, and emphasis
async def edge_tts_ssml(
    text: str,
    output_path: str,
    voice: str = 'en-US-JennyNeural',
    rate: str = '+0%',     # -50% to +200%
    pitch: str = '+0Hz',   # -50Hz to +50Hz
    volume: str = '+0%'    # -100% to +100%
) -> None:
    """Save *text* as speech with explicit rate/pitch/volume prosody settings."""
    speech = edge_tts.Communicate(
        text,
        voice,
        rate=rate,
        pitch=pitch,
        volume=volume,
    )
    await speech.save(output_path)
# Illustrative rate settings: (rate string, typical use case).
rate_examples = [
    ('-30%', 'Slow β audiobooks, accessibility'),
    ('+0%', 'Normal'),
    ('+30%', 'Fast β quick summaries'),
    ('+100%', 'Very fast β power user'),
]

print('Rate examples:')
for rate_value, use_case in rate_examples:
    print(f' rate="{rate_value}" β {use_case}')
print('\nUsage:')
print(' asyncio.run(edge_tts_ssml("Hello!", "out.mp3", rate="-20%", pitch="+5Hz"))')
7. Audio Playback in Jupyter
from IPython.display import Audio, display

def play_audio(file_path: str) -> None:
    """Render an inline, autoplaying audio widget for *file_path* in Jupyter."""
    widget = Audio(file_path, autoplay=True)
    display(widget)

# After generating speech:
# output = text_to_speech_openai('Hello!', 'demo.mp3')
# play_audio('demo.mp3')
print('play_audio() β call after generating speech to hear it in the notebook.')
8. Real-World Pipeline: Article → Podcast
import textwrap

def article_to_podcast(
    article_text: str,
    output_path: str = 'podcast.mp3',
    voice: str = 'nova',
    max_chars: int = 4096
) -> str:
    """
    Convert a long article to a podcast audio file.

    OpenAI TTS accepts at most 4096 characters per request, so longer texts
    are split into chunks, synthesized separately, and concatenated.

    Args:
        article_text: Full article text (wrapping normalizes whitespace).
        output_path: Destination MP3 path.
        voice: OpenAI TTS voice name.
        max_chars: Maximum characters per TTS request.

    Returns:
        The output file path.
    """
    # Split into chunks that never exceed the per-request limit.
    chunks = textwrap.wrap(article_text, max_chars, break_long_words=False, break_on_hyphens=False)
    print(f'Splitting into {len(chunks)} chunk(s)...')
    if len(chunks) == 1:
        return text_to_speech_openai(article_text, output_path, voice=voice)

    import os
    import tempfile

    from pydub import AudioSegment

    # Use a managed temp directory instead of hard-coded '/tmp/' paths:
    # portable (no /tmp on Windows) and cleaned up even if synthesis or
    # concatenation raises, so no chunk files leak.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Generate each chunk
        chunk_files = []
        for i, chunk in enumerate(chunks):
            path = os.path.join(tmp_dir, f'chunk_{i}.mp3')
            text_to_speech_openai(chunk, path, voice=voice)
            chunk_files.append(path)

        # Concatenate with pydub
        combined = AudioSegment.empty()
        for f in chunk_files:
            combined += AudioSegment.from_mp3(f)
        combined.export(output_path, format='mp3')

    print(f'Podcast saved: {output_path} ({len(combined)/1000:.1f}s)')
    return output_path
# Demo input for article_to_podcast().
_raw_article = """
Artificial intelligence is transforming every industry.
From healthcare to finance, AI systems are automating complex tasks
that previously required human expertise. Large language models can
now write code, analyze data, and answer complex questions with
remarkable accuracy. The pace of progress continues to accelerate.
"""
sample_article = _raw_article.strip()

print('article_to_podcast() ready.')
print('Usage: article_to_podcast(long_article_text, "episode1.mp3")')
9. TTS Comparison: Speed, Quality, Cost
# Benchmark results on same text (approximately):

# OpenAI TTS-1
# ✅ Quality: Excellent (natural prosody)
# ✅ Speed: ~1s for short text
# ❌ Cost: $0.015/1K chars
# ❌ Requires API key

# Edge TTS (Microsoft)
# ✅ Quality: Very good
# ✅ Speed: ~1-2s
# ✅ Free
# ✅ 400+ voices, 100+ languages
# ❌ Requires internet

# Coqui XTTS v2
# ✅ Quality: Excellent
# ✅ Free, local, private
# ✅ Voice cloning from 6s sample
# ❌ Slow on CPU (~5-30x real-time)
# ❌ Needs GPU for production

# gTTS (Google Translate)
# ✅ Free
# ✅ 50+ languages
# ❌ Robotic quality
# ❌ No rate/pitch control
Exercises
1. Use OpenAI TTS to generate the same text in all 6 voices and compare them.
2. Build a CLI tool that reads a text file aloud using Edge TTS.
3. Convert a news article to a podcast MP3 using article_to_podcast().
4. Use Coqui XTTS to clone your own voice from a 10-second recording.
5. Build a multilingual greeting app: detect language and use matching TTS voice.