88 lines
2.9 KiB
Python
Executable File
88 lines
2.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Transcribe audio files using local Faster-Whisper (CPU-only)
|
|
Usage: transcribe.py <audio_file> [--model MODEL] [--output-format text|json|srt]
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
import json
|
|
from faster_whisper import WhisperModel
|
|
|
|
def transcribe(audio_path, model_size="base", output_format="text"):
    """Transcribe an audio file with Faster-Whisper and print the result.

    Progress messages are written to stderr; the transcript itself is
    written to stdout so it can be piped or redirected cleanly.

    Args:
        audio_path: Path to the audio file to transcribe.
        model_size: Model name handed to ``WhisperModel``.
        output_format: ``"json"`` prints a JSON document with language,
            segments and full text; ``"srt"`` prints numbered SRT cues;
            any other value prints the plain transcript.

    Returns:
        The full transcript: each segment's stripped text joined by spaces.

    Raises:
        SystemExit: With exit code 1 when ``audio_path`` does not exist or
            is a directory.
    """
    if not os.path.exists(audio_path):
        print(f"Error: File not found: {audio_path}", file=sys.stderr)
        sys.exit(1)
    # A directory passes the existence check but can never be decoded;
    # reject it here with a clear message instead of failing later.
    # (isdir rather than isfile so FIFOs and device paths still work.)
    if os.path.isdir(audio_path):
        print(f"Error: Not a file: {audio_path}", file=sys.stderr)
        sys.exit(1)

    # Load model (cached in ~/.cache/huggingface/hub)
    print(f"Loading Whisper model: {model_size}", file=sys.stderr)
    model = WhisperModel(model_size, device="cpu", compute_type="int8")

    # Transcribe
    print(f"Transcribing: {audio_path}", file=sys.stderr)
    segments, info = model.transcribe(audio_path, beam_size=5)

    # Walk the segments once, stripping each text a single time.
    results = [
        {
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip(),
        }
        for segment in segments
    ]
    transcript = " ".join(entry["text"] for entry in results)

    # Output format
    if output_format == "json":
        output = {
            "language": info.language,
            "language_probability": info.language_probability,
            "segments": results,
            "text": transcript,
        }
        print(json.dumps(output, indent=2))
    elif output_format == "srt":
        for index, entry in enumerate(results, 1):
            start = format_timestamp(entry["start"])
            end = format_timestamp(entry["end"])
            print(f"{index}")
            print(f"{start} --> {end}")
            # Trailing newline yields the blank line that separates SRT cues.
            print(f"{entry['text']}\n")
    else:  # text
        print(transcript)

    return transcript
|
|
|
|
def format_timestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The offset is converted to a whole number of milliseconds exactly once
    (with rounding) and then split using integer arithmetic. Truncating the
    fractional part separately loses a millisecond to floating-point error
    (for example ``1.001`` would render as ``00:00:01,000``).

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero, since SRT timestamps cannot be negative.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    total_millis = max(0, int(round(seconds * 1000)))
    total_secs, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, resolve the model name,
    # then run a single transcription.
    cli = argparse.ArgumentParser(
        description="Transcribe audio using Faster-Whisper"
    )
    cli.add_argument("audio_file", help="Path to audio file")
    cli.add_argument(
        "--model",
        choices=["tiny", "base", "small", "medium", "large"],
        default="base",
        help="Whisper model size (default: base)",
    )
    cli.add_argument(
        "--output-format",
        choices=["text", "json", "srt"],
        default="text",
        help="Output format (default: text)",
    )
    options = cli.parse_args()

    # Allow override from environment: when WHISPER_MODEL is set it wins over
    # --model, and its value is not checked against the choices list above.
    selected_model = os.environ.get("WHISPER_MODEL", options.model)

    transcribe(options.audio_file, selected_model, options.output_format)
|