jarvis-memory/skills/mem-redis/scripts/cron_capture.py

#!/usr/bin/env python3
"""
Cron Capture: Append NEW session transcript messages to Redis (no LLM / no heartbeat).

Goal: minimize token spend by capturing context out-of-band.

- Tracks per-session file offsets (byte position) in a JSON state file.
- No-ops if the transcript file hasn't changed since last run.
- Stores user/assistant visible text to Redis (chronological order via RPUSH).
- Optionally stores model "thinking" separately (disabled by default) so it can be
  queried only when explicitly needed.

Usage:
  python3 cron_capture.py [--user-id rob] [--include-thinking]

Suggested cron (every 5 minutes):
  */5 * * * * cd ~/.openclaw/workspace && python3 skills/mem-redis/scripts/cron_capture.py --user-id $USER

Env:
  OPENCLAW_WORKSPACE: override workspace path (default: ~/.openclaw/workspace)
  OPENCLAW_SESSIONS_DIR: override sessions dir (default: ~/.openclaw/agents/main/sessions)
  REDIS_HOST / REDIS_PORT / USER_ID
"""

import argparse
import json
import os
import sys
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
USER_ID = os.getenv("USER_ID", "yourname")

DEFAULT_WORKSPACE = Path(os.getenv("OPENCLAW_WORKSPACE", str(Path.home() / ".openclaw" / "workspace")))
DEFAULT_SESSIONS_DIR = Path(os.getenv("OPENCLAW_SESSIONS_DIR", str(Path.home() / ".openclaw" / "agents" / "main" / "sessions")))

STATE_FILE = DEFAULT_WORKSPACE / ".mem_capture_state.json"


@dataclass
class ParsedMessage:
    role: str  # user|assistant
    text: str
    thinking: Optional[str]
    timestamp: str
    session_id: str


def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat()


def find_latest_transcript(sessions_dir: Path) -> Optional[Path]:
    files = list(sessions_dir.glob("*.jsonl"))
    if not files:
        return None
    return max(files, key=lambda p: p.stat().st_mtime)


def load_state() -> Dict[str, Any]:
    if not STATE_FILE.exists():
        return {}
    try:
        return json.loads(STATE_FILE.read_text())
    except Exception:
        return {}


def save_state(state: Dict[str, Any]) -> None:
    try:
        STATE_FILE.write_text(json.dumps(state, indent=2, sort_keys=True))
    except Exception as e:
        print(f"[cron_capture] Warning: could not write state: {e}", file=sys.stderr)


def extract_text_and_thinking(content: Any) -> Tuple[str, Optional[str]]:
    """Extract visible text and optional thinking from OpenClaw message content."""
    if isinstance(content, str):
        return content, None

    text_parts: List[str] = []
    thinking_parts: List[str] = []

    if isinstance(content, list):
        for item in content:
            if not isinstance(item, dict):
                continue
            if "text" in item and isinstance(item["text"], str):
                text_parts.append(item["text"])
            if "thinking" in item and isinstance(item["thinking"], str):
                thinking_parts.append(item["thinking"])

    text = "".join(text_parts).strip()
    thinking = "\n".join(thinking_parts).strip() if thinking_parts else None
    return text, thinking


def parse_new_messages(transcript_path: Path, start_offset: int, include_thinking: bool) -> Tuple[List[ParsedMessage], int]:
    """Parse messages from transcript_path starting at byte offset."""
    session_id = transcript_path.stem
    msgs: List[ParsedMessage] = []

    with transcript_path.open("rb") as f:
        f.seek(start_offset)
        while True:
            line = f.readline()
            if not line:
                break
            try:
                entry = json.loads(line.decode("utf-8", errors="replace").strip())
            except Exception:
                continue

            if entry.get("type") != "message" or "message" not in entry:
                continue
            msg = entry.get("message") or {}
            role = msg.get("role")
            if role not in ("user", "assistant"):
                continue

            # Skip tool results explicitly
            if role == "toolResult":
                continue

            text, thinking = extract_text_and_thinking(msg.get("content"))
            if not text and not (include_thinking and thinking):
                continue

            msgs.append(
                ParsedMessage(
                    role=role,
                    text=text[:8000],
                    thinking=(thinking[:16000] if (include_thinking and thinking) else None),
                    timestamp=entry.get("timestamp") or _now_iso(),
                    session_id=session_id,
                )
            )

        end_offset = f.tell()

    return msgs, end_offset


def append_to_redis(user_id: str, messages: List[ParsedMessage]) -> int:
    if not messages:
        return 0

    import redis  # lazy import so --dry-run works without deps
    r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)

    key = f"mem:{user_id}"
    thinking_key = f"mem_thinking:{user_id}"

    # RPUSH keeps chronological order.
    for m in messages:
        payload: Dict[str, Any] = {
            "role": m.role,
            "content": m.text,
            "timestamp": m.timestamp,
            "user_id": user_id,
            "session": m.session_id,
        }
        r.rpush(key, json.dumps(payload))

        if m.thinking:
            t_payload = {
                "role": m.role,
                "thinking": m.thinking,
                "timestamp": m.timestamp,
                "user_id": user_id,
                "session": m.session_id,
            }
            r.rpush(thinking_key, json.dumps(t_payload))

    return len(messages)


def main() -> None:
    parser = argparse.ArgumentParser(description="Cron capture: append new transcript messages to Redis")
    parser.add_argument("--user-id", default=USER_ID)
    parser.add_argument("--include-thinking", action="store_true", help="Store thinking into mem_thinking:<user>")
    parser.add_argument("--sessions-dir", default=str(DEFAULT_SESSIONS_DIR))
    parser.add_argument("--dry-run", action="store_true", help="Parse + update state, but do not write to Redis")
    args = parser.parse_args()

    sessions_dir = Path(args.sessions_dir)
    transcript = find_latest_transcript(sessions_dir)
    if not transcript:
        print("[cron_capture] No session transcripts found")
        return

    st = load_state()
    key = str(transcript)
    info = st.get(key, {})
    last_offset = int(info.get("offset", 0))
    last_size = int(info.get("size", 0))

    cur_size = transcript.stat().st_size
    if cur_size == last_size and last_offset > 0:
        print("[cron_capture] No changes")
        return

    messages, end_offset = parse_new_messages(transcript, last_offset, include_thinking=args.include_thinking)
    if not messages:
        # Still update size/offset so we don't re-read noise lines.
        st[key] = {"offset": end_offset, "size": cur_size, "updated_at": _now_iso()}
        save_state(st)
        print("[cron_capture] No new user/assistant messages")
        return

    if args.dry_run:
        st[key] = {"offset": end_offset, "size": cur_size, "updated_at": _now_iso()}
        save_state(st)
        print(f"[cron_capture] DRY RUN: would append {len(messages)} messages to Redis mem:{args.user_id}")
        return

    count = append_to_redis(args.user_id, messages)

    st[key] = {"offset": end_offset, "size": cur_size, "updated_at": _now_iso()}
    save_state(st)

    print(f"[cron_capture] Appended {count} messages to Redis mem:{args.user_id}")


if __name__ == "__main__":
    main()