Whisper: Audio Pre- and Post-Processing
Improve Whisper transcription quality by trimming leading silence, splitting long files into segments, and running GPT post-processing to add punctuation, fix financial terminology, and remove non-ASCII artefacts.
Download any WAV file longer than 2 minutes, trim its leading silence with PyDub, split the remainder into 60-second segments, transcribe each segment with Whisper, and join the resulting text. Then pass the joined transcript through GPT to add punctuation.
Task grader
Copy and adapt to your context. Text in angle brackets should be replaced.
from pydub import AudioSegment
from openai import OpenAI
from pathlib import Path
import os
# Shared OpenAI API client; transcribe_segments() sends its Whisper
# transcription requests through it.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment (e.g. OPENAI_API_KEY) — confirm for your deployment.
client = OpenAI()
def trim_and_segment(path, seg_ms=60_000, threshold_db=-20.0, chunk_ms=10):
    """Load an audio file, drop its leading silence, and cut it into segments.

    The audio is scanned from the start in windows of ``chunk_ms``
    milliseconds. Everything before the first window whose loudness
    (``dBFS``) reaches ``threshold_db`` is discarded. If no window reaches
    the threshold, the audio is left untrimmed.

    Args:
        path: Audio file to load with ``AudioSegment.from_file``.
        seg_ms: Maximum length of each returned segment, in milliseconds.
        threshold_db: Loudness in dBFS at or above which a window counts as
            non-silent.
        chunk_ms: Width, in milliseconds, of the silence-detection window.

    Returns:
        A list of consecutive slices of the trimmed audio, each at most
        ``seg_ms`` long (the last one may be shorter). The list is empty
        when the loaded audio has zero length.

    Raises:
        ValueError: If ``seg_ms`` or ``chunk_ms`` is not positive.
    """
    # Validate before loading: a zero step makes range() fail with an opaque
    # message, and a negative one would silently yield no segments at all.
    if seg_ms <= 0:
        raise ValueError(f"seg_ms must be positive, got {seg_ms!r}")
    if chunk_ms <= 0:
        raise ValueError(f"chunk_ms must be positive, got {chunk_ms!r}")

    audio = AudioSegment.from_file(path)

    # Offset of the first window loud enough to count as sound; fall back to
    # 0 (no trimming) when the whole file stays below the threshold.
    trim_ms = next(
        (
            start
            for start in range(0, len(audio), chunk_ms)
            if audio[start:start + chunk_ms].dBFS >= threshold_db
        ),
        0,
    )
    trimmed = audio[trim_ms:]

    return [
        trimmed[start:start + seg_ms]
        for start in range(0, len(trimmed), seg_ms)
    ]
def transcribe_segments(segments):
    """Transcribe audio segments with Whisper and join the resulting text.

    Each segment is exported to a WAV file inside a private temporary
    directory, uploaded to the ``whisper-1`` transcription endpoint through
    the module-level ``client``, and its text collected. The temporary
    directory and every file in it are removed before returning, including
    when a request raises.

    Args:
        segments: Iterable of audio segments exposing
            ``export(path, format=...)``, e.g. the list returned by
            ``trim_and_segment``.

    Returns:
        The per-segment transcripts joined with single spaces, in input
        order; an empty string when ``segments`` is empty.
    """
    import tempfile  # local import: not among the module-level imports

    texts = []
    # A fresh directory per call avoids the fixed /tmp/seg_NN.wav paths, which
    # were never deleted, collided between concurrent runs, and do not exist
    # on Windows.
    with tempfile.TemporaryDirectory(prefix="whisper_seg_") as tmp_dir:
        for index, segment in enumerate(segments):
            wav_path = Path(tmp_dir) / f"seg_{index:02d}.wav"
            # pydub's export() returns the file handle it wrote to; close it
            # right away instead of leaving it to the garbage collector.
            segment.export(str(wav_path), format="wav").close()
            with open(wav_path, "rb") as wav_file:
                response = client.audio.transcriptions.create(
                    model="whisper-1", file=wav_file)
            texts.append(response.text)
    return " ".join(texts)