Files
jarvis-memory/skills/qdrant-memory/scripts/llm_router.py

103 lines
3.1 KiB
Python

#!/usr/bin/env python3
"""LLM Router for cheap metadata + compaction.
Goal:
- Prefer Minimax m2.5 for tagging + compaction.
- Fallback to Gemini Flash (or any other OpenRouter model) if Minimax fails.
This uses OpenRouter's OpenAI-compatible API.
Env:
OPENROUTER_API_KEY (required)
OPENROUTER_BASE_URL default: https://openrouter.ai/api/v1
LLM_PRIMARY_MODEL default: openrouter/minimax/minimax-m2.5
LLM_FALLBACK_MODEL default: openrouter/google/gemini-2.5-flash
LLM_TIMEOUT default: 60 (seconds, applied to each HTTP request)
Notes:
- We keep this dependency-light (urllib only).
- chat_json() asks the API for strict JSON (json_object response format); chat_text() does not.
"""
import json
import os
import sys
import urllib.request
# Runtime configuration, read from the environment once at import time.

# Endpoint root; trailing slashes are trimmed so paths can be appended with "/".
BASE_URL = os.environ.get(
    "OPENROUTER_BASE_URL",
    "https://openrouter.ai/api/v1",
).rstrip("/")

# Bearer token; empty when unset (the request helper refuses to run without it).
API_KEY = os.environ.get("OPENROUTER_API_KEY", "")

# Model tried first, and the model tried when the first attempt fails.
# NOTE(review): OpenRouter's published model slugs are usually `vendor/model`;
# confirm the endpoint in use accepts the `openrouter/` prefix in these defaults.
PRIMARY_MODEL = os.environ.get(
    "LLM_PRIMARY_MODEL",
    "openrouter/minimax/minimax-m2.5",
)
FALLBACK_MODEL = os.environ.get(
    "LLM_FALLBACK_MODEL",
    "openrouter/google/gemini-2.5-flash",
)

# Per-request timeout in seconds; a non-integer value raises ValueError on import.
TIMEOUT = int(os.environ.get("LLM_TIMEOUT", "60"))
def _post_chat(model: str, messages, response_format=None, temperature=0.2):
    """POST one chat-completion request and return the decoded JSON reply.

    Args:
        model: Model identifier placed in the request body.
        messages: Chat messages placed in the request body (the callers in
            this module pass ``{"role": ..., "content": ...}`` dicts).
        response_format: Optional ``response_format`` object; it is sent only
            when truthy.
        temperature: Sampling temperature placed in the request body.

    Returns:
        The response body parsed from JSON.

    Raises:
        RuntimeError: If ``OPENROUTER_API_KEY`` is not set, or if the endpoint
            answers with an ``error`` payload instead of ``choices``.
        OSError: On transport failures, timeouts, or HTTP error statuses
            (``urllib.error.URLError`` / ``HTTPError`` raised by ``urlopen``).
        ValueError: If the response body is not valid UTF-8 JSON.
    """
    if not API_KEY:
        raise RuntimeError("OPENROUTER_API_KEY is required")

    body = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
    }
    if response_format:
        body["response_format"] = response_format

    req = urllib.request.Request(
        f"{BASE_URL}/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {API_KEY}",
        },
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
        payload = json.loads(r.read().decode("utf-8"))

    # A reply can carry an "error" object even with a successful HTTP status.
    # Surface the API's own message here; otherwise callers only see an opaque
    # KeyError('choices') when they index into the payload.
    if isinstance(payload, dict) and payload.get("error") and not payload.get("choices"):
        err = payload["error"]
        detail = err.get("message") if isinstance(err, dict) else None
        raise RuntimeError(f"LLM API returned an error for model {model!r}: {detail or err}")

    return payload
def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown ``` fence, if it has one."""
    body = text.strip()
    if len(body) < 6 or not (body.startswith("```") and body.endswith("```")):
        return body
    inner = body[3:-3]
    tag, newline, rest = inner.partition("\n")
    # Drop the opening fence line when it is empty or a language tag ("json").
    if newline and (not tag.strip() or tag.strip().isalnum()):
        inner = rest
    return inner.strip()


def chat_json(system: str, user: str) -> dict:
    """Return the model's reply parsed as a JSON object.

    The primary model is tried first, then the fallback model. An attempt
    counts as failed when the request raises, the reply has no usable
    content, the content is not valid JSON (after tolerating a surrounding
    Markdown code fence), or the JSON is not an object.

    Args:
        system: Content of the system message.
        user: Content of the user message.

    Returns:
        The parsed JSON object from the first model that succeeds.

    Raises:
        RuntimeError: If both models fail; the last underlying error is
            included in the message and chained as ``__cause__``.
    """
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    last_err = None
    for model in (PRIMARY_MODEL, FALLBACK_MODEL):
        try:
            resp = _post_chat(
                model,
                messages,
                response_format={"type": "json_object"},
                temperature=0.2,
            )
            content = resp["choices"][0]["message"]["content"]
            try:
                parsed = json.loads(content)
            except json.JSONDecodeError:
                # Some models wrap JSON in a ```json fence despite the
                # requested response format; retry on the unwrapped text.
                parsed = json.loads(_strip_code_fence(content))
            if not isinstance(parsed, dict):
                raise ValueError(
                    f"expected a JSON object from {model}, got {type(parsed).__name__}"
                )
            return parsed
        except Exception as e:
            # Deliberately broad: any failure moves on to the next model.
            last_err = e
    raise RuntimeError(f"LLM failed on both primary and fallback: {last_err}") from last_err
def chat_text(system: str, user: str) -> str:
    """Return the model's reply as plain text.

    The primary model is tried first, then the fallback model. An attempt
    counts as failed when the request raises or the reply's content is not
    a string (for example ``null``).

    Args:
        system: Content of the system message.
        user: Content of the user message.

    Returns:
        The message content from the first model that succeeds.

    Raises:
        RuntimeError: If both models fail; the last underlying error is
            included in the message and chained as ``__cause__``.
    """
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    last_err = None
    for model in (PRIMARY_MODEL, FALLBACK_MODEL):
        try:
            resp = _post_chat(model, messages, response_format=None, temperature=0.2)
            content = resp["choices"][0]["message"]["content"]
            # Honour the ``-> str`` contract: null or structured content is a
            # failed attempt, not a value to hand back to the caller.
            if not isinstance(content, str):
                raise ValueError(
                    f"expected text content from {model}, got {type(content).__name__}"
                )
            return content
        except Exception as e:
            # Deliberately broad: any failure moves on to the next model.
            last_err = e
    raise RuntimeError(f"LLM failed on both primary and fallback: {last_err}") from last_err
if __name__ == "__main__":
    # Manual smoke test: when the first argument is `--ping`, run one JSON
    # round trip through the router and print the parsed reply. Any other
    # invocation does nothing.
    if sys.argv[1:2] == ["--ping"]:
        reply = chat_json("Return JSON with key ok=true", "ping")
        print(json.dumps(reply))