103 lines
3.1 KiB
Python
103 lines
3.1 KiB
Python
#!/usr/bin/env python3
|
|
"""LLM Router for cheap metadata + compaction.

Goal:
- Prefer Minimax m2.5 for tagging + compaction.
- Fall back to Gemini Flash (or any other OpenRouter model) if Minimax fails.

This uses OpenRouter's OpenAI-compatible API.

Env:
  OPENROUTER_API_KEY   (required)
  OPENROUTER_BASE_URL  default: https://openrouter.ai/api/v1
  LLM_PRIMARY_MODEL    default: openrouter/minimax/minimax-m2.5
  LLM_FALLBACK_MODEL   default: openrouter/google/gemini-2.5-flash
  LLM_TIMEOUT          default: 60 (seconds, parsed as an integer)

Notes:
- We keep this dependency-light (urllib only).
- chat_json() requests a JSON-object response format; chat_text() does not.
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import urllib.request
|
|
|
|
# Root of the OpenAI-compatible API; trailing slashes are stripped so that
# endpoint paths can be appended with a single "/".
BASE_URL = os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1").rstrip("/")

# Bearer token for the API; an empty string when the variable is unset.
API_KEY = os.environ.get("OPENROUTER_API_KEY", "")

# Model tried first, and the model tried when the first attempt fails.
PRIMARY_MODEL = os.environ.get("LLM_PRIMARY_MODEL", "openrouter/minimax/minimax-m2.5")
FALLBACK_MODEL = os.environ.get("LLM_FALLBACK_MODEL", "openrouter/google/gemini-2.5-flash")

# Per-request timeout handed to urlopen; must parse as an integer.
TIMEOUT = int(os.environ.get("LLM_TIMEOUT", "60"))
|
|
|
|
|
|
def _post_chat(model: str, messages, response_format=None, temperature=0.2):
    """POST one chat-completions request and return the decoded JSON reply.

    Args:
        model: Model identifier placed in the request body's "model" field.
        messages: Chat messages, sent unchanged in the "messages" field.
        response_format: Optional value for the "response_format" field;
            the field is omitted from the request when this is falsy.
        temperature: Value sent in the "temperature" field.

    Returns:
        The response body decoded as UTF-8 and parsed with ``json.loads``.

    Raises:
        RuntimeError: If ``API_KEY`` is empty.

    Errors raised by ``urlopen`` or by JSON decoding are not handled here and
    propagate to the caller.
    """
    if not API_KEY:
        raise RuntimeError("OPENROUTER_API_KEY is required")

    # Only send response_format when the caller supplied a truthy value.
    optional = {"response_format": response_format} if response_format else {}
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        **optional,
    }

    endpoint = f"{BASE_URL}/chat/completions"
    request = urllib.request.Request(
        endpoint,
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {API_KEY}",
        },
    )

    with urllib.request.urlopen(request, timeout=TIMEOUT) as reply:
        raw = reply.read()
    return json.loads(raw.decode("utf-8"))
|
|
|
|
|
|
def chat_json(system: str, user: str) -> dict:
    """Return the model's reply parsed as a JSON object.

    The request is sent to ``PRIMARY_MODEL`` first and, if that attempt fails
    for any reason, to ``FALLBACK_MODEL``. An attempt counts as failed when
    the request raises, the response lacks ``choices[0].message.content``,
    the content is not valid JSON, or the parsed JSON is not an object.

    Args:
        system: Content of the system message.
        user: Content of the user message.

    Returns:
        The parsed JSON object from the first model that succeeds.

    Raises:
        RuntimeError: If both models fail. The message lists every model's
            failure, and the last underlying exception is chained as
            ``__cause__``.
    """
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]

    failures = []
    last_err = None
    for model in (PRIMARY_MODEL, FALLBACK_MODEL):
        try:
            resp = _post_chat(
                model,
                messages,
                response_format={"type": "json_object"},
                temperature=0.2,
            )
            parsed = json.loads(resp["choices"][0]["message"]["content"])
            # The declared return type is dict; a bare list/string/number is
            # treated as a failed attempt so the fallback model gets a chance.
            if not isinstance(parsed, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(parsed).__name__}"
                )
        except Exception as e:
            # Deliberately broad: any failure of one model should trigger the
            # fallback. Each failure is recorded so none is lost in the report.
            last_err = e
            failures.append(f"{model}: {type(e).__name__}: {e}")
            continue
        return parsed

    raise RuntimeError(
        "LLM failed on both primary and fallback: " + "; ".join(failures)
    ) from last_err
|
|
|
|
|
|
def chat_text(system: str, user: str) -> str:
    """Return the model's reply as text.

    The request is sent to ``PRIMARY_MODEL`` first and, if that attempt fails
    for any reason, to ``FALLBACK_MODEL``. An attempt counts as failed when
    the request raises, the response lacks ``choices[0].message.content``, or
    the content is not a string (for example ``null``).

    Args:
        system: Content of the system message.
        user: Content of the user message.

    Returns:
        The message content from the first model that succeeds.

    Raises:
        RuntimeError: If both models fail. The message lists every model's
            failure, and the last underlying exception is chained as
            ``__cause__``.
    """
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]

    failures = []
    last_err = None
    for model in (PRIMARY_MODEL, FALLBACK_MODEL):
        try:
            resp = _post_chat(model, messages, response_format=None, temperature=0.2)
            content = resp["choices"][0]["message"]["content"]
            # The declared return type is str; a missing/null content is
            # treated as a failed attempt so the fallback model gets a chance.
            if not isinstance(content, str):
                raise ValueError(
                    f"expected text content, got {type(content).__name__}"
                )
        except Exception as e:
            # Deliberately broad: any failure of one model should trigger the
            # fallback. Each failure is recorded so none is lost in the report.
            last_err = e
            failures.append(f"{model}: {type(e).__name__}: {e}")
            continue
        return content

    raise RuntimeError(
        "LLM failed on both primary and fallback: " + "; ".join(failures)
    ) from last_err
|
|
|
|
|
|
if __name__ == "__main__":
    # Minimal smoke test: when the first CLI argument is `--ping`, send one
    # JSON-mode request and print the parsed reply as JSON. Any other
    # invocation does nothing.
    if sys.argv[1:2] == ["--ping"]:
        reply = chat_json("Return JSON with key ok=true", "ping")
        print(json.dumps(reply))
|