Initial commit: workspace setup with skills, memory, config
This commit is contained in:
79
skills/local-whisper-stt/SKILL.md
Normal file
79
skills/local-whisper-stt/SKILL.md
Normal file
@@ -0,0 +1,79 @@
|
||||
---
|
||||
name: local-whisper-stt
|
||||
description: Local speech-to-text transcription using Faster-Whisper. Use when receiving voice messages in Telegram (or other channels) that need to be transcribed to text. Automatically downloads and transcribes audio files using local CPU-based Whisper models. Supports multiple model sizes (tiny, base, small, medium, large) with automatic language detection.
|
||||
---
|
||||
|
||||
# Local Whisper STT
|
||||
|
||||
## Overview
|
||||
|
||||
Transcribes voice messages to text using local Faster-Whisper (CPU-based, no GPU required).
|
||||
|
||||
## When to Use
|
||||
|
||||
- User sends a voice message in Telegram
|
||||
- Need to transcribe audio to text locally (free, private)
|
||||
- Any audio transcription task where cloud STT is not desired
|
||||
|
||||
## Models Available
|
||||
|
||||
| Model | Size | Speed | Accuracy | Use Case |
|
||||
|-------|------|-------|----------|----------|
|
||||
| tiny | 39MB | Fastest | Basic | Quick testing, low resources |
|
||||
| base | 74MB | Fast | Good | Default for most uses |
|
||||
| small | 244MB | Medium | Better | Better accuracy needed |
|
||||
| medium | 769MB | Slower | Very Good | High accuracy, more RAM |
|
||||
| large | 1550MB | Slowest | Best | Maximum accuracy |
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Receive voice message (Telegram provides OGG/Opus)
|
||||
2. Download audio file to temp location
|
||||
3. Load Faster-Whisper model (cached after first use)
|
||||
4. Transcribe audio to text
|
||||
5. Return transcription to conversation
|
||||
6. Cleanup temp file
|
||||
|
||||
## Usage
|
||||
|
||||
### From Telegram Voice Message
|
||||
|
||||
When a voice message arrives, the skill:
|
||||
1. Downloads the voice file from Telegram
|
||||
2. Transcribes using the configured model
|
||||
3. Returns text to the agent context
|
||||
|
||||
### Manual Transcription
|
||||
|
||||
```python
|
||||
# Transcribe a local audio file
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
model = WhisperModel("base", device="cpu", compute_type="int8")
|
||||
segments, info = model.transcribe("/path/to/audio.ogg", beam_size=5)
|
||||
|
||||
for segment in segments:
|
||||
print(segment.text)
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
Default model: `base` (good balance of speed/accuracy on CPU)
|
||||
|
||||
To change model, edit the script or set environment variable:
|
||||
```bash
|
||||
export WHISPER_MODEL=small
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
- Python 3.8+
|
||||
- faster-whisper package
|
||||
- ~100MB-1.5GB disk space (depending on model)
|
||||
- No GPU required (CPU-only)
|
||||
|
||||
## Resources
|
||||
|
||||
### scripts/
|
||||
- `transcribe.py` - Main transcription script
|
||||
- `telegram_voice_handler.py` - Telegram-specific voice message handler
|
||||
96
skills/local-whisper-stt/scripts/telegram_voice_handler.py
Executable file
96
skills/local-whisper-stt/scripts/telegram_voice_handler.py
Executable file
@@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Handle Telegram voice messages - download and transcribe
|
||||
Usage: telegram_voice_handler.py <bot_token> <file_id> [--model MODEL]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import urllib.request
|
||||
import tempfile
|
||||
|
||||
def download_voice_file(bot_token, file_id, output_path):
    """Download a Telegram voice file to *output_path*.

    Resolves the file_id to a server-side path via the Bot API getFile
    method, then streams the file contents to disk. Exits the process
    with status 1 on any API or network failure (CLI helper, not a
    library function).

    Returns output_path on success.
    """

    # Step 1: Resolve file_id -> file_path via the Bot API.
    file_info_url = f"https://api.telegram.org/bot{bot_token}/getFile?file_id={file_id}"

    try:
        # Explicit timeout so a dead network cannot hang the handler forever.
        with urllib.request.urlopen(file_info_url, timeout=30) as response:
            data = json.loads(response.read().decode())
            if not data.get("ok"):
                print(f"Error getting file info: {data}", file=sys.stderr)
                sys.exit(1)

            file_path = data["result"]["file_path"]
    except Exception as e:
        print(f"Error fetching file info: {e}", file=sys.stderr)
        sys.exit(1)

    # Step 2: Download the actual file. Stream in chunks via urlopen
    # (with a timeout) instead of the legacy urlretrieve() interface,
    # which offers no timeout and is slated for possible deprecation.
    download_url = f"https://api.telegram.org/file/bot{bot_token}/{file_path}"

    try:
        with urllib.request.urlopen(download_url, timeout=60) as resp, \
                open(output_path, "wb") as out:
            while chunk := resp.read(65536):
                out.write(chunk)
        return output_path
    except Exception as e:
        print(f"Error downloading file: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
def transcribe_with_whisper(audio_path, model_size="base"):
    """Run local Faster-Whisper over an audio file.

    Returns a dict with the joined transcript text plus the detected
    language and its probability.
    """

    # Imported lazily so the download step can run without the package.
    from faster_whisper import WhisperModel

    # Model weights are cached on disk after the first load.
    whisper = WhisperModel(model_size, device="cpu", compute_type="int8")

    seg_stream, detect_info = whisper.transcribe(audio_path, beam_size=5)

    # Consuming the lazy segment stream performs the actual transcription.
    pieces = [seg.text.strip() for seg in seg_stream]

    return {
        "text": " ".join(pieces),
        "language": detect_info.language,
        "language_probability": detect_info.language_probability,
    }
|
||||
|
||||
if __name__ == "__main__":
    # CLI: telegram_voice_handler.py <bot_token> <file_id> [--model MODEL]
    arg_parser = argparse.ArgumentParser(description="Download and transcribe Telegram voice message")
    arg_parser.add_argument("bot_token", help="Telegram bot token")
    arg_parser.add_argument("file_id", help="Telegram voice file_id")
    arg_parser.add_argument("--model", default="base",
                            choices=["tiny", "base", "small", "medium", "large"],
                            help="Whisper model size (default: base)")
    opts = arg_parser.parse_args()

    # The WHISPER_MODEL environment variable wins over --model.
    chosen_model = os.environ.get("WHISPER_MODEL", opts.model)

    # Reserve a temp path for the download; the file outlives the
    # with-block because delete=False — we remove it ourselves below.
    with tempfile.NamedTemporaryFile(suffix=".ogg", delete=False) as handle:
        voice_path = handle.name

    try:
        print("Downloading voice file...", file=sys.stderr)
        download_voice_file(opts.bot_token, opts.file_id, voice_path)

        print(f"Transcribing with {chosen_model} model...", file=sys.stderr)
        transcription = transcribe_with_whisper(voice_path, chosen_model)

        # Machine-readable result on stdout; progress went to stderr.
        print(json.dumps(transcription))

    finally:
        # Always remove the temp audio file, even on failure.
        if os.path.exists(voice_path):
            os.remove(voice_path)
|
||||
87
skills/local-whisper-stt/scripts/transcribe.py
Executable file
87
skills/local-whisper-stt/scripts/transcribe.py
Executable file
@@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Transcribe audio files using local Faster-Whisper (CPU-only)
|
||||
Usage: transcribe.py <audio_file> [--model MODEL] [--output-format text|json|srt]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
def transcribe(audio_path, model_size="base", output_format="text"):
    """Transcribe an audio file with local Faster-Whisper (CPU only).

    Prints the transcript on stdout in the requested format ("text",
    "json" or "srt") and returns the joined plain-text transcript.
    Progress messages go to stderr. Exits with status 1 when the audio
    file does not exist.
    """

    if not os.path.exists(audio_path):
        print(f"Error: File not found: {audio_path}", file=sys.stderr)
        sys.exit(1)

    # Model weights are cached in ~/.cache/huggingface/hub after the
    # first download.
    print(f"Loading Whisper model: {model_size}", file=sys.stderr)
    whisper = WhisperModel(model_size, device="cpu", compute_type="int8")

    print(f"Transcribing: {audio_path}", file=sys.stderr)
    seg_stream, detect = whisper.transcribe(audio_path, beam_size=5)

    language = detect.language
    language_prob = detect.language_probability

    # Materializing the lazy segment stream performs the transcription.
    results = [
        {"start": seg.start, "end": seg.end, "text": seg.text.strip()}
        for seg in seg_stream
    ]
    full_text = [entry["text"] for entry in results]
    joined = " ".join(full_text)

    if output_format == "json":
        print(json.dumps({
            "language": language,
            "language_probability": language_prob,
            "segments": results,
            "text": joined,
        }, indent=2))
    elif output_format == "srt":
        for idx, entry in enumerate(results, 1):
            print(f"{idx}")
            print(f"{format_timestamp(entry['start'])} --> {format_timestamp(entry['end'])}")
            print(f"{entry['text']}\n")
    else:  # plain text (default)
        print(joined)

    return joined
|
||||
|
||||
def format_timestamp(seconds):
    """Format a duration in seconds as an SRT timestamp (HH:MM:SS,mmm).

    Rounds to the nearest millisecond instead of truncating the
    fractional part. Truncation via int((seconds % 1) * 1000) loses a
    millisecond on common float values — e.g. 4.8 % 1 == 0.7999...,
    which formatted as 00:00:04,799 instead of 00:00:04,800.
    """
    total_millis = round(seconds * 1000)
    # Carry milliseconds -> seconds -> minutes -> hours with divmod so
    # the components always stay in range.
    total_secs, millis = divmod(total_millis, 1000)
    minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
|
||||
|
||||
if __name__ == "__main__":
    # CLI: transcribe.py <audio_file> [--model MODEL] [--output-format FMT]
    cli = argparse.ArgumentParser(description="Transcribe audio using Faster-Whisper")
    cli.add_argument("audio_file", help="Path to audio file")
    cli.add_argument("--model", default="base",
                     choices=["tiny", "base", "small", "medium", "large"],
                     help="Whisper model size (default: base)")
    cli.add_argument("--output-format", default="text",
                     choices=["text", "json", "srt"],
                     help="Output format (default: text)")
    opts = cli.parse_args()

    # The WHISPER_MODEL environment variable overrides --model.
    selected_model = os.environ.get("WHISPER_MODEL", opts.model)

    transcribe(opts.audio_file, selected_model, opts.output_format)
|
||||
Reference in New Issue
Block a user