#!/usr/bin/env python3
"""
Transcribe audio files using local Faster-Whisper (CPU-only)

Usage:
    transcribe.py <audio_file> [--model MODEL] [--output-format text|json|srt]

Environment:
    WHISPER_MODEL   When set to a non-empty value, replaces the --model value.
"""

import argparse
import os
import sys
import json

from faster_whisper import WhisperModel


def transcribe(audio_path: str, model_size: str = "base", output_format: str = "text") -> str:
    """Transcribe an audio file and print the result to stdout.

    Status messages are written to stderr, so stdout carries only the
    transcript in the requested format.

    Args:
        audio_path: Path to the audio file to transcribe.
        model_size: Model name handed to ``WhisperModel``.
        output_format: ``"json"`` or ``"srt"``; any other value prints the
            plain text transcript.

    Returns:
        The full transcript: every segment's stripped text joined with
        single spaces (regardless of ``output_format``).

    Raises:
        SystemExit: with status 1 if ``audio_path`` does not exist.
    """
    if not os.path.exists(audio_path):
        print(f"Error: File not found: {audio_path}", file=sys.stderr)
        sys.exit(1)

    # Load model (cached in ~/.cache/huggingface/hub)
    print(f"Loading Whisper model: {model_size}", file=sys.stderr)
    model = WhisperModel(model_size, device="cpu", compute_type="int8")

    print(f"Transcribing: {audio_path}", file=sys.stderr)
    segments, info = model.transcribe(audio_path, beam_size=5)

    # NOTE(review): per the faster-whisper docs `segments` is a lazy
    # generator, so the actual decoding happens while this list is built.
    results = [
        {
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip(),
        }
        for segment in segments
    ]
    full_text = " ".join(entry["text"] for entry in results)

    if output_format == "json":
        output = {
            "language": info.language,
            "language_probability": info.language_probability,
            "segments": results,
            "text": full_text,
        }
        print(json.dumps(output, indent=2))
    elif output_format == "srt":
        for index, entry in enumerate(results, 1):
            start = format_timestamp(entry["start"])
            end = format_timestamp(entry["end"])
            print(f"{index}")
            print(f"{start} --> {end}")
            # The trailing "\n" plus print's own newline yields the blank
            # line that separates SRT cues.
            print(f"{entry['text']}\n")
    else:  # text
        print(full_text)

    return full_text


def format_timestamp(seconds: float) -> str:
    """Format a duration in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    The value is rounded once to whole milliseconds and then split with
    integer arithmetic. Truncating ``(seconds % 1) * 1000`` instead loses a
    millisecond whenever the float sits just below the intended value
    (e.g. ``1.001`` would render as ``00:00:01,000``).

    Args:
        seconds: Offset from the start of the audio, in seconds. Negative
            values are clamped to zero because SRT has no negative times.

    Returns:
        The timestamp string; hours are zero-padded to at least two digits.
    """
    total_millis = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_millis, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def main() -> None:
    """Parse command-line arguments and run the transcription."""
    parser = argparse.ArgumentParser(description="Transcribe audio using Faster-Whisper")
    parser.add_argument("audio_file", help="Path to audio file")
    parser.add_argument(
        "--model",
        default="base",
        choices=["tiny", "base", "small", "medium", "large"],
        help="Whisper model size (default: base)",
    )
    parser.add_argument(
        "--output-format",
        default="text",
        choices=["text", "json", "srt"],
        help="Output format (default: text)",
    )
    args = parser.parse_args()

    # Allow override from environment. WHISPER_MODEL takes precedence over
    # --model and is not checked against the --model choices. An empty
    # value is treated as unset so it cannot reach WhisperModel as "".
    model = os.environ.get("WHISPER_MODEL") or args.model

    transcribe(args.audio_file, model, args.output_format)


if __name__ == "__main__":
    main()