Whisper: Speech Recognition & Audio Understanding

OpenAI Whisper is a state-of-the-art open-source model for automatic speech recognition (ASR): it transcribes audio to text in 99 languages.

Why Whisper?

  • Multilingual: 99 languages, automatic language detection

  • Robust: Handles accents, background noise, and technical jargon

  • Open source: Free, runs locally, no API key needed

  • Timestamps: Word-level timestamps for subtitle generation

  • Translation: Transcribe + translate to English in one step

| Model    | Parameters | Relative speed | Quality   |
|----------|------------|----------------|-----------|
| tiny     | 39M        | Fastest        | Good      |
| base     | 74M        | Fast           | Better    |
| small    | 244M       | Medium         | Great     |
| medium   | 769M       | Slow           | Excellent |
| large-v3 | 1.5B       | Slowest        | Best      |
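
If you are not sure which checkpoints your installed package supports, it can list them itself. A small sketch, assuming the openai-whisper package; note that the English-only '.en' variants (tiny.en, base.en, ...) trade multilingual support for slightly better English accuracy at the same size:

import whisper

# Lists every checkpoint this whisper version can download,
# including the English-only '.en' variants not shown in the table above.
print(whisper.available_models())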

# Install dependencies
# !pip install openai-whisper torch torchaudio ffmpeg-python
# Also needs: brew install ffmpeg (Mac) or apt install ffmpeg (Linux)

1. Local Whisper Transcription

import whisper
import torch

# Load model (downloads on first use)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = whisper.load_model('base', device=device)
print(f'Loaded Whisper base on {device}')

def transcribe(audio_path: str, language: str | None = None) -> dict:
    """Transcribe audio file. language=None for auto-detection."""
    result = model.transcribe(
        audio_path,
        language=language,
        word_timestamps=True,
        verbose=False
    )
    return result

# Example usage (replace with your audio file):
# result = transcribe('my_audio.mp3')
# print('Text:', result['text'])
# print('Language:', result['language'])
print('transcribe() function ready; pass any audio file path.')
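
If you only need the language rather than a full transcript, Whisper's lower-level API can identify it from the first 30 seconds of audio. A minimal sketch reusing the model loaded above, following the detection example in the openai-whisper README:

def detect_language(audio_path: str) -> str:
    """Return the most probable language code for the first 30 seconds."""
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)  # Whisper operates on 30-second windows
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)  # dict: language code -> probability
    return max(probs, key=probs.get)

# detect_language('my_audio.mp3')  # e.g. 'en'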

2. OpenAI Whisper API (Faster, No Local GPU)

from openai import OpenAI

client = OpenAI()

def transcribe_api(audio_path: str, language: str | None = None, task: str = 'transcribe'):
    """
    Transcribe via the OpenAI API.
    task: 'transcribe' (keep original language) or 'translate' (to English)
    Cost: $0.006/minute.
    Returns plain text for 'translate', a verbose-JSON response object
    (with word timestamps) for 'transcribe'.
    """
    with open(audio_path, 'rb') as f:
        if task == 'translate':
            return client.audio.translations.create(
                model='whisper-1',
                file=f,
                response_format='text'
            )
        # Only pass `language` when the caller supplies one,
        # so the API auto-detects otherwise.
        kwargs = {'language': language} if language else {}
        return client.audio.transcriptions.create(
            model='whisper-1',
            file=f,
            response_format='verbose_json',
            timestamp_granularities=['word'],
            **kwargs
        )

print('OpenAI Whisper API function defined.')
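
The API rejects uploads larger than 25 MB, so long recordings have to be split client-side. One possible approach is sketched below, assuming pydub is installed (pip install pydub); transcribe_long and the temporary file name are illustrative, not part of any library:

from pydub import AudioSegment

def transcribe_long(audio_path: str, chunk_minutes: int = 10) -> str:
    """Split audio into chunks under the API size limit, then join transcripts."""
    audio = AudioSegment.from_file(audio_path)
    chunk_ms = chunk_minutes * 60 * 1000
    texts = []
    for start in range(0, len(audio), chunk_ms):  # len() is duration in ms
        audio[start:start + chunk_ms].export('_chunk.mp3', format='mp3')
        with open('_chunk.mp3', 'rb') as f:
            texts.append(client.audio.transcriptions.create(
                model='whisper-1', file=f, response_format='text'))
    return ' '.join(texts)

Fixed boundaries can cut a word in half mid-chunk; for production use, split on silence or overlap chunks slightly and deduplicate the seams.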

3. Generate SRT Subtitles

def seconds_to_srt_time(seconds: float) -> str:
    """Convert seconds to the SRT timestamp format: HH:MM:SS,mmm"""
    total_ms = int(round(seconds * 1000))  # round once to avoid float truncation
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, millis = divmod(rem, 1_000)
    return f'{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}'

def segments_to_srt(segments: list) -> str:
    """Convert Whisper segments to SRT subtitle format."""
    lines = []
    for i, seg in enumerate(segments, 1):
        start = seconds_to_srt_time(seg['start'])
        end   = seconds_to_srt_time(seg['end'])
        text  = seg['text'].strip()
        lines.append(f'{i}\n{start} --> {end}\n{text}\n')
    return '\n'.join(lines)

def transcribe_to_srt(audio_path: str, output_path: str | None = None) -> str:
    """Transcribe audio and save as SRT subtitle file."""
    result = model.transcribe(audio_path, word_timestamps=True)
    srt_content = segments_to_srt(result['segments'])

    if output_path is None:
        output_path = audio_path.rsplit('.', 1)[0] + '.srt'

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(srt_content)

    print(f'Saved subtitles to: {output_path}')
    return srt_content

# Demo with synthetic segments
demo_segments = [
    {'start': 0.0,  'end': 3.5,  'text': 'Welcome to the multimodal AI course.'},
    {'start': 3.5,  'end': 7.2,  'text': 'Today we will learn about Whisper speech recognition.'},
    {'start': 7.2,  'end': 11.0, 'text': 'It can transcribe audio in 99 languages automatically.'},
]

print('Sample SRT output:')
print(segments_to_srt(demo_segments))
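
HTML5 video players expect WebVTT rather than SRT. The cue format differs only in a WEBVTT header and a dot (instead of a comma) before the milliseconds, so a converter is a near copy of the one above; a small sketch built on the same helpers:

def segments_to_vtt(segments: list) -> str:
    """Convert Whisper segments to WebVTT (for HTML5 <track> elements)."""
    lines = ['WEBVTT', '']
    for seg in segments:
        start = seconds_to_srt_time(seg['start']).replace(',', '.')
        end = seconds_to_srt_time(seg['end']).replace(',', '.')
        lines.append(f"{start} --> {end}\n{seg['text'].strip()}\n")
    return '\n'.join(lines)

print(segments_to_vtt(demo_segments))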

4. Faster Whisper (4x Speed, Same Quality)

# faster-whisper uses the CTranslate2 backend: up to 4x faster, about half the memory
# !pip install faster-whisper

from faster_whisper import WhisperModel

# int8 quantization for even more speed on CPU
fast_model = WhisperModel('base', device='cpu', compute_type='int8')

def fast_transcribe(audio_path: str):
    segments, info = fast_model.transcribe(audio_path, beam_size=5)
    print(f'Detected language: {info.language} (probability: {info.language_probability:.2%})')
    full_text = ' '.join(seg.text for seg in segments)
    return full_text

print('faster-whisper model loaded (int8 quantized for speed)')
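
faster-whisper returns segments as a generator, so decoding happens lazily as you iterate, and it supports per-word timestamps too. A short sketch assuming the fast_model loaded above:

def fast_transcribe_words(audio_path: str):
    """Print word-level timestamps as segments are decoded."""
    segments, _ = fast_model.transcribe(audio_path, word_timestamps=True)
    for seg in segments:        # generator: decoding happens here, lazily
        for word in seg.words:  # Word objects with .start, .end, .word
            print(f'[{word.start:6.2f}s - {word.end:6.2f}s] {word.word}')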

5. Real-World Applications

# Meeting transcription pipeline: local Whisper for the transcript, an LLM for the summary
def process_meeting(audio_file: str) -> dict:
    # 1. Transcribe with the transcribe() helper from section 1
    result = transcribe(audio_file)
    # 2. Summarize the transcript with an LLM
    from openai import OpenAI
    client = OpenAI()
    summary = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{'role': 'user', 'content': f'Summarize this meeting transcript in bullet points:\n\n{result["text"]}'}]
    ).choices[0].message.content
    return {'transcript': result['text'], 'summary': summary, 'language': result['language']}
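
Usage mirrors the earlier helpers (the filename is a placeholder):

# meeting = process_meeting('standup_recording.mp3')
# print(meeting['summary'])
print('process_meeting() pipeline defined.')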

Exercises

  1. Record a 1-minute audio clip and transcribe it with Whisper base vs. large-v3. Compare accuracy.

  2. Download a YouTube video (using yt-dlp), extract the audio, and generate subtitles.

  3. Build a meeting transcription + summarization pipeline using Whisper + GPT-4o.

  4. Test multilingual transcription: record yourself speaking two languages and check auto-detection.