jarvis-memory/skills/qdrant-memory/scripts/llm_router.py

#!/usr/bin/env python3
"""LLM Router for cheap metadata + compaction.

Goal:
- Prefer Minimax m2.5 for tagging + compaction.
- Fallback to Gemini Flash (or any other OpenRouter model) if Minimax fails.

This uses OpenRouter's OpenAI-compatible API.

Env:
  OPENROUTER_API_KEY   (required)
  OPENROUTER_BASE_URL  default: https://openrouter.ai/api/v1
  LLM_PRIMARY_MODEL    default: openrouter/minimax/minimax-m2.5
  LLM_FALLBACK_MODEL   default: openrouter/google/gemini-2.5-flash
  LLM_TIMEOUT          default: 60 (seconds)

Notes:
- We keep this dependency-light (urllib only).
- We request strict JSON when asked.
"""

import json
import os
import sys
import urllib.request

# Configuration is read from the environment once, at import time.
# Trailing slashes are dropped so the endpoint path can be appended with "/".
BASE_URL: str = os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1").rstrip("/")
# An empty key is tolerated here; _post_chat refuses to send a request without one.
API_KEY: str = os.environ.get("OPENROUTER_API_KEY", "")
# Model tried first, followed by the one tried when the first attempt fails.
# NOTE(review): these ids are sent verbatim as the request's "model" field;
# confirm the endpoint accepts the "openrouter/" prefix -- TODO verify.
PRIMARY_MODEL: str = os.environ.get("LLM_PRIMARY_MODEL", "openrouter/minimax/minimax-m2.5")
FALLBACK_MODEL: str = os.environ.get("LLM_FALLBACK_MODEL", "openrouter/google/gemini-2.5-flash")
# Per-request timeout handed to urlopen; a non-integer LLM_TIMEOUT raises
# ValueError when the module is imported.
TIMEOUT: int = int(os.environ.get("LLM_TIMEOUT", "60"))


def _post_chat(model: str, messages, response_format=None, temperature=0.2):
    """POST one chat-completions request and return the decoded JSON reply.

    Args:
        model: Model identifier placed in the request's "model" field.
        messages: Chat messages, sent as-is under "messages".
        response_format: Optional value for "response_format"; left out of
            the payload when falsy.
        temperature: Sampling temperature forwarded in the payload.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        RuntimeError: If no API key is configured.

    Errors raised by ``urlopen`` or while decoding the reply propagate
    unchanged.
    """
    if not API_KEY:
        raise RuntimeError("OPENROUTER_API_KEY is required")

    payload = {"model": model, "messages": messages, "temperature": temperature}
    if response_format:
        payload["response_format"] = response_format

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}",
    }
    # Supplying a request body is what makes urllib issue a POST.
    request = urllib.request.Request(
        f"{BASE_URL}/chat/completions",
        data=json.dumps(payload).encode("utf-8"),
        headers=headers,
    )

    with urllib.request.urlopen(request, timeout=TIMEOUT) as response:
        raw = response.read()
    return json.loads(raw.decode("utf-8"))


def chat_json(system: str, user: str) -> dict:
    """Return the model's reply parsed as a JSON object.

    The primary model is tried first, then the fallback. An attempt counts
    as failed when the request raises, the response lacks the
    ``choices[0].message.content`` path, the content is not valid JSON, or
    the decoded JSON is not an object.

    Args:
        system: Text sent as the system message.
        user: Text sent as the user message.

    Returns:
        The decoded JSON object from the first model that succeeds.

    Raises:
        RuntimeError: If both attempts fail. The message lists each model's
            error, and the last underlying exception is chained as the cause.
    """
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]

    failures = []
    last_err = None
    for model in (PRIMARY_MODEL, FALLBACK_MODEL):
        try:
            resp = _post_chat(model, messages, response_format={"type": "json_object"}, temperature=0.2)
            parsed = json.loads(resp["choices"][0]["message"]["content"])
            # The contract is a JSON *object*; a bare list/string/number is
            # treated as a failed attempt so the fallback gets a chance.
            if not isinstance(parsed, dict):
                raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
            return parsed
        except Exception as e:
            # Deliberately broad: any failure (transport, response shape,
            # parsing) should move on to the next model.
            last_err = e
            failures.append(f"{model}: {type(e).__name__}: {e}")

    # Report every attempt, not just the last one, and keep the traceback
    # of the final error attached.
    raise RuntimeError(
        "LLM failed on both primary and fallback: " + "; ".join(failures)
    ) from last_err


def chat_text(system: str, user: str) -> str:
    """Return the model's reply as text.

    The primary model is tried first, then the fallback. An attempt counts
    as failed when the request raises, the response lacks the
    ``choices[0].message.content`` path, or the content is not a string
    (for example ``null``).

    Args:
        system: Text sent as the system message.
        user: Text sent as the user message.

    Returns:
        The message content from the first model that succeeds.

    Raises:
        RuntimeError: If both attempts fail. The message lists each model's
            error, and the last underlying exception is chained as the cause.
    """
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]

    failures = []
    last_err = None
    for model in (PRIMARY_MODEL, FALLBACK_MODEL):
        try:
            resp = _post_chat(model, messages, response_format=None, temperature=0.2)
            content = resp["choices"][0]["message"]["content"]
            # Honour the ``-> str`` contract: a missing/null content is a
            # failed attempt, not a value to hand back to the caller.
            if not isinstance(content, str):
                raise ValueError(f"expected text content, got {type(content).__name__}")
            return content
        except Exception as e:
            # Deliberately broad: any failure (transport, response shape)
            # should move on to the next model.
            last_err = e
            failures.append(f"{model}: {type(e).__name__}: {e}")

    # Report every attempt, not just the last one, and keep the traceback
    # of the final error attached.
    raise RuntimeError(
        "LLM failed on both primary and fallback: " + "; ".join(failures)
    ) from last_err


if __name__ == "__main__":
    # Minimal self-test: with `--ping` as the first argument, run one JSON
    # round trip through chat_json and print the decoded reply.
    if sys.argv[1:2] == ["--ping"]:
        print(json.dumps(chat_json("Return JSON with key ok=true", "ping")))