OpenAI Audio & Vision: Whisper and GPT-4V · Lesson 1

Whisper: Audio Pre- and Post-Processing

Improve Whisper transcription quality by trimming leading silence, splitting long files into segments, and running GPT post-processing to add punctuation, fix financial terminology, and remove non-ASCII artefacts.

25 min read · 3 questions in quiz · Ready prompt included · In progress

Practical exercise

What to do after this lesson

Download any WAV file longer than 2 minutes, trim leading silence with PyDub, split into 60-second segments, transcribe each through Whisper, and join the results. Pass the final text through GPT to add punctuation.

Task grader

Download any WAV file longer than 2 minutes, trim leading silence with PyDub, split into 60-second segments, transcribe each through Whisper, and join the results. Pass the final text through GPT to add punctuation.

Your answer

Ready-to-use prompt

Template for this lesson

Copy and adapt to your context. Text in angle brackets should be replaced.

from pydub import AudioSegment
from openai import OpenAI
from pathlib import Path
import os

# Shared OpenAI client; transcribe_segments() sends its Whisper requests
# through it.
# NOTE(review): presumably credentials are picked up from the environment
# (e.g. OPENAI_API_KEY) since none are passed here — confirm.
client = OpenAI()

def trim_and_segment(path, seg_ms=60_000, threshold_db=-20.0, chunk_ms=10):
    """Load an audio file, trim leading silence, and split it into segments.

    The audio is scanned from the start in ``chunk_ms`` windows. Everything
    before the first window whose ``dBFS`` level is at least
    ``threshold_db`` is dropped. If no window reaches the threshold,
    nothing is trimmed.

    Args:
        path: Location of the audio file, passed to
            ``AudioSegment.from_file``.
        seg_ms: Length of each returned segment in milliseconds. The last
            segment may be shorter.
        threshold_db: Level (in dBFS) at or above which a window counts as
            sound rather than silence.
        chunk_ms: Size of the scan window in milliseconds.

    Returns:
        A list of consecutive slices of the trimmed audio, in order.

    Raises:
        ValueError: If ``seg_ms`` or ``chunk_ms`` is not positive.
    """
    # Validate before decoding the file: a negative step would make range()
    # empty and silently drop all audio, and a zero step would only fail
    # after the (potentially slow) load.
    if seg_ms <= 0:
        raise ValueError(f"seg_ms must be positive, got {seg_ms!r}")
    if chunk_ms <= 0:
        raise ValueError(f"chunk_ms must be positive, got {chunk_ms!r}")

    audio = AudioSegment.from_file(path)

    # Offset of the first window that is loud enough; 0 when none is.
    trim = next(
        (start for start in range(0, len(audio), chunk_ms)
         if audio[start:start + chunk_ms].dBFS >= threshold_db),
        0,
    )
    audio = audio[trim:]
    return [audio[s:s + seg_ms] for s in range(0, len(audio), seg_ms)]

def transcribe_segments(segments):
    """Transcribe each audio segment with Whisper and join the results.

    Every segment is exported as a WAV file into a private temporary
    directory, sent to the ``whisper-1`` model through the module-level
    ``client``, and the returned texts are joined in segment order.

    Args:
        segments: Iterable of objects that provide
            ``export(path, format="wav")``, such as the slices returned by
            ``trim_and_segment``.

    Returns:
        The transcripts of all segments joined by single spaces; an empty
        string when ``segments`` is empty.
    """
    import tempfile  # local import so the template needs no extra header line

    texts = []
    # A private temporary directory replaces the hard-coded /tmp path: it
    # works on platforms without /tmp, cannot collide with another run
    # writing seg_00.wav at the same time, and is removed afterwards.
    with tempfile.TemporaryDirectory(prefix="whisper_segments_") as workdir:
        for i, seg in enumerate(segments):
            tmp = os.path.join(workdir, f"seg_{i:02d}.wav")
            seg.export(tmp, format="wav")
            with open(tmp, "rb") as f:
                response = client.audio.transcriptions.create(
                    model="whisper-1", file=f)
            texts.append(response.text)
    return " ".join(texts)

Step 1: trim silence with PyDub

from pathlib import Path

from pydub import AudioSegment


def milliseconds_until_sound(sound, silence_threshold_in_decibels=-20.0, chunk_size=10):
    """Return how many milliseconds of leading silence ``sound`` has.

    The audio is scanned from the start in ``chunk_size``-millisecond
    windows until a window's ``dBFS`` level reaches the threshold or the
    end of the audio is passed.

    Args:
        sound: Audio object supporting ``len()`` and millisecond slicing
            whose slices expose ``dBFS``.
        silence_threshold_in_decibels: Windows below this level count as
            silence.
        chunk_size: Window size in milliseconds; must be positive.

    Returns:
        The offset in milliseconds (a multiple of ``chunk_size``) of the
        first non-silent window. For fully silent audio the value is at
        least ``len(sound)``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Without this guard a non-positive chunk_size never advances trim_ms
    # and the loop below spins forever on audio that starts with silence.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    trim_ms = 0
    # Bounds check first, so nothing is sliced past the end of the audio.
    while (trim_ms < len(sound)
           and sound[trim_ms:trim_ms + chunk_size].dBFS < silence_threshold_in_decibels):
        trim_ms += chunk_size
    return trim_ms


def trim_start(filepath):
    """Trim leading silence from a WAV file and save the result next to it.

    Args:
        filepath: Path of the WAV file to process.

    Returns:
        A ``(trimmed, new_filename)`` tuple: the trimmed audio and the path
        of the exported copy, named ``trimmed_<original name>`` in the same
        directory as the input.
    """
    path = Path(filepath)
    audio = AudioSegment.from_file(filepath, format="wav")
    start_trim = milliseconds_until_sound(audio)
    trimmed = audio[start_trim:]
    new_filename = path.parent / f"trimmed_{path.name}"
    trimmed.export(new_filename, format="wav")
    return trimmed, new_filename

Step 2: segment and transcribe

import os

from openai import OpenAI

client = OpenAI()
one_minute = 60 * 1000  # ms


def transcribe_audio(file, output_dir):
    """Transcribe one audio file from ``output_dir`` with Whisper.

    Args:
        file: Name of the audio file inside ``output_dir``.
        output_dir: Directory that contains ``file``.

    Returns:
        The transcript text returned by the ``whisper-1`` model.
    """
    audio_path = os.path.join(output_dir, file)
    with open(audio_path, "rb") as f:
        result = client.audio.transcriptions.create(model="whisper-1", file=f)
    return result.text


# Segment the trimmed audio
# NOTE(review): AudioSegment is imported in Step 1; trimmed_filename is
# presumably the path returned by trim_start() there — confirm.
trimmed_audio = AudioSegment.from_wav(trimmed_filename)
for i, start in enumerate(range(0, len(trimmed_audio), one_minute)):
    segment = trimmed_audio[start:start + one_minute]
    segment.export(f"seg_{i:02d}.wav", format="wav")

Step 3: GPT post-processing

def punctuation_assistant(text):
    """Ask the chat model to punctuate and capitalize a raw transcript.

    Args:
        text: Transcript text to clean up.

    Returns:
        The content of the first choice returned by ``gpt-3.5-turbo``.
    """
    messages = [
        {"role": "system",
         "content": "Add punctuation and capitalize. Preserve original words."},
        {"role": "user", "content": text},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0,  # same settings on every call
        messages=messages,
    )
    return completion.choices[0].message.content


# Remove non-ASCII (for English transcripts only)
ascii_text = "".join(ch for ch in raw_transcript if ch.isascii())
clean = punctuation_assistant(ascii_text)

For domain-specific term correction (e.g. "five two nine" → "529 (Education Savings Plan)"), use a GPT-4 model with a detailed system prompt that describes the financial context and the terms to correct.

Report a bug

Whisper: Audio Pre- and Post-Processing

Task grader

Prompt sandbox

Quiz — 3 questions

Discussion

Why processing around Whisper matters

Step 1: trim silence with PyDub

Step 2: segment and transcribe

Step 3: GPT post-processing