Initial commit: workspace setup with skills, memory, config
This commit is contained in:
87
skills/local-whisper-stt/scripts/transcribe.py
Executable file
87
skills/local-whisper-stt/scripts/transcribe.py
Executable file
@@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Transcribe audio files using local Faster-Whisper (CPU-only)
|
||||
Usage: transcribe.py <audio_file> [--model MODEL] [--output-format text|json|srt]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
def transcribe(audio_path, model_size="base", output_format="text"):
    """Transcribe an audio file with Faster-Whisper and print the result.

    Progress and error messages are written to stderr; the transcription
    itself is printed to stdout in the requested format.

    Args:
        audio_path: Path to the audio file to transcribe.
        model_size: Model name handed to ``WhisperModel`` (default "base").
        output_format: "json" prints a JSON document with language info,
            segments and full text; "srt" prints numbered subtitle cues;
            any other value prints the plain text.

    Returns:
        The stripped text of every segment, joined by single spaces.

    Note:
        Terminates the process with exit status 1 when ``audio_path`` is
        not an existing regular file.
    """
    # isfile() instead of exists(): a directory would pass exists() and only
    # fail later, after the model has already been loaded.
    if not os.path.isfile(audio_path):
        print(f"Error: File not found: {audio_path}", file=sys.stderr)
        sys.exit(1)

    # Load model (cached in ~/.cache/huggingface/hub)
    print(f"Loading Whisper model: {model_size}", file=sys.stderr)
    model = WhisperModel(model_size, device="cpu", compute_type="int8")

    # Transcribe
    print(f"Transcribing: {audio_path}", file=sys.stderr)
    segments, info = model.transcribe(audio_path, beam_size=5)

    language = info.language
    language_prob = info.language_probability

    # Walk the segments exactly once and keep only the fields we output.
    results = [
        {
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip(),
        }
        for segment in segments
    ]
    full_text = " ".join(entry["text"] for entry in results)

    # Output format
    if output_format == "json":
        output = {
            "language": language,
            "language_probability": language_prob,
            "segments": results,
            "text": full_text,
        }
        print(json.dumps(output, indent=2))
    elif output_format == "srt":
        for index, entry in enumerate(results, 1):
            start = format_timestamp(entry["start"])
            end = format_timestamp(entry["end"])
            print(f"{index}")
            print(f"{start} --> {end}")
            # Trailing newline yields the blank line that separates SRT cues.
            print(f"{entry['text']}\n")
    else:  # text
        print(full_text)

    return full_text
|
||||
|
||||
def format_timestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp.

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero.

    Returns:
        A string of the form ``HH:MM:SS,mmm``.
    """
    # Round once to whole milliseconds and do the rest in integers.
    # Truncating the float remainder instead loses a millisecond on values
    # such as 1.001, whose fractional part is stored as 0.000999...
    total_millis = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_millis, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, pick the model name and
    # run a single transcription.
    cli = argparse.ArgumentParser(
        description="Transcribe audio using Faster-Whisper"
    )
    cli.add_argument("audio_file", help="Path to audio file")
    cli.add_argument(
        "--model",
        choices=["tiny", "base", "small", "medium", "large"],
        default="base",
        help="Whisper model size (default: base)",
    )
    cli.add_argument(
        "--output-format",
        choices=["text", "json", "srt"],
        default="text",
        help="Output format (default: text)",
    )
    options = cli.parse_args()

    # When WHISPER_MODEL is set it takes precedence over --model; its value
    # is passed through as-is and is not checked against the choices above.
    model_name = os.environ.get("WHISPER_MODEL")
    if model_name is None:
        model_name = options.model

    transcribe(options.audio_file, model_name, options.output_format)
|
||||
Reference in New Issue
Block a user