Initial commit: workspace setup with skills, memory, config
This commit is contained in:
79
skills/local-whisper-stt/SKILL.md
Normal file
79
skills/local-whisper-stt/SKILL.md
Normal file
@@ -0,0 +1,79 @@
|
||||
---
|
||||
name: local-whisper-stt
|
||||
description: Local speech-to-text transcription using Faster-Whisper. Use when receiving voice messages in Telegram (or other channels) that need to be transcribed to text. Automatically downloads and transcribes audio files using local CPU-based Whisper models. Supports multiple model sizes (tiny, base, small, medium, large) with automatic language detection.
|
||||
---
|
||||
|
||||
# Local Whisper STT
|
||||
|
||||
## Overview
|
||||
|
||||
Transcribes voice messages to text using local Faster-Whisper (CPU-based, no GPU required).
|
||||
|
||||
## When to Use
|
||||
|
||||
- User sends a voice message in Telegram
|
||||
- Need to transcribe audio to text locally (free, private)
|
||||
- Any audio transcription task where cloud STT is not desired
|
||||
|
||||
## Models Available
|
||||
|
||||
| Model | Size | Speed | Accuracy | Use Case |
|
||||
|-------|------|-------|----------|----------|
|
||||
| tiny | 39MB | Fastest | Basic | Quick testing, low resources |
|
||||
| base | 74MB | Fast | Good | Default for most uses |
|
||||
| small | 244MB | Medium | Better | Better accuracy needed |
|
||||
| medium | 769MB | Slower | Very Good | High accuracy, more RAM |
|
||||
| large | 1550MB | Slowest | Best | Maximum accuracy |
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Receive voice message (Telegram provides OGG/Opus)
|
||||
2. Download audio file to temp location
|
||||
3. Load Faster-Whisper model (cached after first use)
|
||||
4. Transcribe audio to text
|
||||
5. Return transcription to conversation
|
||||
6. Cleanup temp file
|
||||
|
||||
## Usage
|
||||
|
||||
### From Telegram Voice Message
|
||||
|
||||
When a voice message arrives, the skill:
|
||||
1. Downloads the voice file from Telegram
|
||||
2. Transcribes using the configured model
|
||||
3. Returns text to the agent context
|
||||
|
||||
### Manual Transcription
|
||||
|
||||
```python
|
||||
# Transcribe a local audio file
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
model = WhisperModel("base", device="cpu", compute_type="int8")
|
||||
segments, info = model.transcribe("/path/to/audio.ogg", beam_size=5)
|
||||
|
||||
for segment in segments:
|
||||
print(segment.text)
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
Default model: `base` (good balance of speed/accuracy on CPU)
|
||||
|
||||
To change model, edit the script or set environment variable:
|
||||
```bash
|
||||
export WHISPER_MODEL=small
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
- Python 3.8+
|
||||
- faster-whisper package
|
||||
- ~100MB-1.5GB disk space (depending on model)
|
||||
- No GPU required (CPU-only)
|
||||
|
||||
## Resources
|
||||
|
||||
### scripts/
|
||||
- `transcribe.py` - Main transcription script
|
||||
- `telegram_voice_handler.py` - Telegram-specific voice message handler
|
||||
96
skills/local-whisper-stt/scripts/telegram_voice_handler.py
Executable file
96
skills/local-whisper-stt/scripts/telegram_voice_handler.py
Executable file
@@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Handle Telegram voice messages - download and transcribe
|
||||
Usage: telegram_voice_handler.py <bot_token> <file_id> [--model MODEL]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import urllib.request
|
||||
import tempfile
|
||||
|
||||
def download_voice_file(bot_token, file_id, output_path):
    """Download a Telegram voice file to *output_path*.

    Resolves the file_id to a server-side path via the Bot API getFile
    method, then streams the file contents to disk. Exits the process
    with status 1 on any API or network failure (CLI helper, not a
    library function).

    Returns output_path on success.
    """

    # Step 1: Resolve file_id -> file_path via the Bot API.
    file_info_url = f"https://api.telegram.org/bot{bot_token}/getFile?file_id={file_id}"

    try:
        # Explicit timeout so a dead network cannot hang the handler forever.
        with urllib.request.urlopen(file_info_url, timeout=30) as response:
            data = json.loads(response.read().decode())
            if not data.get("ok"):
                print(f"Error getting file info: {data}", file=sys.stderr)
                sys.exit(1)

            file_path = data["result"]["file_path"]
    except Exception as e:
        print(f"Error fetching file info: {e}", file=sys.stderr)
        sys.exit(1)

    # Step 2: Download the actual file. Stream in chunks via urlopen
    # (with a timeout) instead of the legacy urlretrieve() interface,
    # which offers no timeout and is slated for possible deprecation.
    download_url = f"https://api.telegram.org/file/bot{bot_token}/{file_path}"

    try:
        with urllib.request.urlopen(download_url, timeout=60) as resp, \
                open(output_path, "wb") as out:
            while chunk := resp.read(65536):
                out.write(chunk)
        return output_path
    except Exception as e:
        print(f"Error downloading file: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
def transcribe_with_whisper(audio_path, model_size="base"):
    """Run local Faster-Whisper over an audio file.

    Returns a dict with the joined transcript text plus the detected
    language and its probability.
    """

    # Imported lazily so the download step can run without the package.
    from faster_whisper import WhisperModel

    # Model weights are cached on disk after the first load.
    whisper = WhisperModel(model_size, device="cpu", compute_type="int8")

    seg_stream, detect_info = whisper.transcribe(audio_path, beam_size=5)

    # Consuming the lazy segment stream performs the actual transcription.
    pieces = [seg.text.strip() for seg in seg_stream]

    return {
        "text": " ".join(pieces),
        "language": detect_info.language,
        "language_probability": detect_info.language_probability,
    }
|
||||
|
||||
if __name__ == "__main__":
    # CLI: telegram_voice_handler.py <bot_token> <file_id> [--model MODEL]
    arg_parser = argparse.ArgumentParser(description="Download and transcribe Telegram voice message")
    arg_parser.add_argument("bot_token", help="Telegram bot token")
    arg_parser.add_argument("file_id", help="Telegram voice file_id")
    arg_parser.add_argument("--model", default="base",
                            choices=["tiny", "base", "small", "medium", "large"],
                            help="Whisper model size (default: base)")
    opts = arg_parser.parse_args()

    # The WHISPER_MODEL environment variable wins over --model.
    chosen_model = os.environ.get("WHISPER_MODEL", opts.model)

    # Reserve a temp path for the download; the file outlives the
    # with-block because delete=False — we remove it ourselves below.
    with tempfile.NamedTemporaryFile(suffix=".ogg", delete=False) as handle:
        voice_path = handle.name

    try:
        print("Downloading voice file...", file=sys.stderr)
        download_voice_file(opts.bot_token, opts.file_id, voice_path)

        print(f"Transcribing with {chosen_model} model...", file=sys.stderr)
        transcription = transcribe_with_whisper(voice_path, chosen_model)

        # Machine-readable result on stdout; progress went to stderr.
        print(json.dumps(transcription))

    finally:
        # Always remove the temp audio file, even on failure.
        if os.path.exists(voice_path):
            os.remove(voice_path)
|
||||
87
skills/local-whisper-stt/scripts/transcribe.py
Executable file
87
skills/local-whisper-stt/scripts/transcribe.py
Executable file
@@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Transcribe audio files using local Faster-Whisper (CPU-only)
|
||||
Usage: transcribe.py <audio_file> [--model MODEL] [--output-format text|json|srt]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
def transcribe(audio_path, model_size="base", output_format="text"):
    """Transcribe an audio file with local Faster-Whisper (CPU only).

    Prints the transcript on stdout in the requested format ("text",
    "json" or "srt") and returns the joined plain-text transcript.
    Progress messages go to stderr. Exits with status 1 when the audio
    file does not exist.
    """

    if not os.path.exists(audio_path):
        print(f"Error: File not found: {audio_path}", file=sys.stderr)
        sys.exit(1)

    # Model weights are cached in ~/.cache/huggingface/hub after the
    # first download.
    print(f"Loading Whisper model: {model_size}", file=sys.stderr)
    whisper = WhisperModel(model_size, device="cpu", compute_type="int8")

    print(f"Transcribing: {audio_path}", file=sys.stderr)
    seg_stream, detect = whisper.transcribe(audio_path, beam_size=5)

    language = detect.language
    language_prob = detect.language_probability

    # Materializing the lazy segment stream performs the transcription.
    results = [
        {"start": seg.start, "end": seg.end, "text": seg.text.strip()}
        for seg in seg_stream
    ]
    full_text = [entry["text"] for entry in results]
    joined = " ".join(full_text)

    if output_format == "json":
        print(json.dumps({
            "language": language,
            "language_probability": language_prob,
            "segments": results,
            "text": joined,
        }, indent=2))
    elif output_format == "srt":
        for idx, entry in enumerate(results, 1):
            print(f"{idx}")
            print(f"{format_timestamp(entry['start'])} --> {format_timestamp(entry['end'])}")
            print(f"{entry['text']}\n")
    else:  # plain text (default)
        print(joined)

    return joined
|
||||
|
||||
def format_timestamp(seconds):
    """Format a duration in seconds as an SRT timestamp (HH:MM:SS,mmm).

    Rounds to the nearest millisecond instead of truncating the
    fractional part. Truncation via int((seconds % 1) * 1000) loses a
    millisecond on common float values — e.g. 4.8 % 1 == 0.7999...,
    which formatted as 00:00:04,799 instead of 00:00:04,800.
    """
    total_millis = round(seconds * 1000)
    # Carry milliseconds -> seconds -> minutes -> hours with divmod so
    # the components always stay in range.
    total_secs, millis = divmod(total_millis, 1000)
    minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
|
||||
|
||||
if __name__ == "__main__":
    # CLI: transcribe.py <audio_file> [--model MODEL] [--output-format FMT]
    cli = argparse.ArgumentParser(description="Transcribe audio using Faster-Whisper")
    cli.add_argument("audio_file", help="Path to audio file")
    cli.add_argument("--model", default="base",
                     choices=["tiny", "base", "small", "medium", "large"],
                     help="Whisper model size (default: base)")
    cli.add_argument("--output-format", default="text",
                     choices=["text", "json", "srt"],
                     help="Output format (default: text)")
    opts = cli.parse_args()

    # The WHISPER_MODEL environment variable overrides --model.
    selected_model = os.environ.get("WHISPER_MODEL", opts.model)

    transcribe(opts.audio_file, selected_model, opts.output_format)
|
||||
Reference in New Issue
Block a user