forked from SpeedyFoxAi/jarvis-memory
231 lines
7.5 KiB
Python
231 lines
7.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Cron Capture: Append NEW session transcript messages to Redis (no LLM / no heartbeat).

Goal: minimize token spend by capturing context out-of-band.

- Tracks per-session file offsets (byte position) in a JSON state file.
- No-ops if the transcript file hasn't changed since last run.
- Stores user/assistant visible text to Redis (chronological order via RPUSH).
- Optionally stores model "thinking" separately (disabled by default) so it can be
  queried only when explicitly needed.

Usage:
  python3 cron_capture.py [--user-id rob] [--include-thinking] [--sessions-dir DIR] [--dry-run]

Suggested cron (every 5 minutes):
  */5 * * * * cd ~/.openclaw/workspace && python3 skills/mem-redis/scripts/cron_capture.py --user-id $USER

Env:
  OPENCLAW_WORKSPACE: override workspace path (default: ~/.openclaw/workspace)
  OPENCLAW_SESSIONS_DIR: override sessions dir (default: ~/.openclaw/agents/main/sessions)
  REDIS_HOST / REDIS_PORT / USER_ID
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
# Redis connection settings and the default memory owner; each can be
# overridden through the environment variable of the same name.
REDIS_HOST = os.environ.get("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.environ.get("REDIS_PORT", "6379"))
USER_ID = os.environ.get("USER_ID", "yourname")

# Filesystem locations. An env override replaces the whole default path.
_OPENCLAW_HOME = Path.home() / ".openclaw"
DEFAULT_WORKSPACE = Path(
    os.environ.get("OPENCLAW_WORKSPACE", str(_OPENCLAW_HOME / "workspace"))
)
DEFAULT_SESSIONS_DIR = Path(
    os.environ.get(
        "OPENCLAW_SESSIONS_DIR",
        str(_OPENCLAW_HOME / "agents" / "main" / "sessions"),
    )
)

# JSON file that remembers, per transcript, how far capture has progressed.
STATE_FILE = DEFAULT_WORKSPACE / ".mem_capture_state.json"
|
|
|
|
|
|
@dataclass
class ParsedMessage:
    """One user or assistant message extracted from a session transcript."""

    # Speaker of the message: "user" or "assistant".
    role: str
    # Visible message text.
    text: str
    # Model "thinking" text, or None when absent or not captured.
    thinking: Optional[str]
    # Timestamp string associated with the message.
    timestamp: str
    # Identifier of the session the message belongs to.
    session_id: str
|
|
|
|
|
|
def _now_iso() -> str:
|
|
return datetime.now(timezone.utc).isoformat()
|
|
|
|
|
|
def find_latest_transcript(sessions_dir: Path) -> Optional[Path]:
    """Return the most recently modified ``*.jsonl`` file in *sessions_dir*.

    Returns None when the directory holds no matching file.
    """

    def _modified_at(candidate: Path) -> float:
        return candidate.stat().st_mtime

    return max(sessions_dir.glob("*.jsonl"), key=_modified_at, default=None)
|
|
|
|
|
|
def load_state() -> Dict[str, Any]:
    """Load the capture state from ``STATE_FILE``.

    Returns:
        The decoded JSON object. An empty dict is returned when the state
        file is missing, unreadable, not valid JSON, or does not contain a
        JSON object, so callers can always use ``.get`` on the result.
    """
    try:
        state = json.loads(STATE_FILE.read_text(encoding="utf-8"))
    except FileNotFoundError:
        # First run: nothing has been captured yet, so stay quiet.
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        print(f"[cron_capture] Warning: could not read state: {e}", file=sys.stderr)
        return {}

    if not isinstance(state, dict):
        # A list/null/scalar would crash callers that expect a mapping.
        print(
            "[cron_capture] Warning: state file does not contain a JSON object; ignoring it",
            file=sys.stderr,
        )
        return {}
    return state
|
|
|
|
|
|
def save_state(state: Dict[str, Any]) -> None:
    """Persist *state* to ``STATE_FILE`` as pretty-printed, key-sorted JSON.

    The JSON is written to a temporary sibling file first and then moved
    over ``STATE_FILE`` with ``os.replace``, so an interrupted run cannot
    leave a half-written state file behind.

    Saving is best-effort: any failure is reported on stderr and swallowed,
    and the temporary file is removed if it was created.
    """
    # The PID keeps overlapping runs from sharing one temporary file.
    tmp_path = STATE_FILE.with_name(f"{STATE_FILE.name}.{os.getpid()}.tmp")
    try:
        tmp_path.write_text(json.dumps(state, indent=2, sort_keys=True))
        os.replace(tmp_path, STATE_FILE)
    except Exception as e:
        print(f"[cron_capture] Warning: could not write state: {e}", file=sys.stderr)
        try:
            tmp_path.unlink()
        except OSError:
            # Nothing to clean up (never created, or already moved/removed).
            pass
|
|
|
|
|
|
def extract_text_and_thinking(content: Any) -> Tuple[str, Optional[str]]:
    """Split OpenClaw message content into visible text and optional thinking.

    A plain string is returned unchanged as the text, with no thinking.
    For a list, every dict item contributes its string ``"text"`` value
    (concatenated without a separator) and its string ``"thinking"`` value
    (joined with newlines); both results are stripped. Anything else yields
    empty text. Thinking is None when no thinking part was found.
    """
    if isinstance(content, str):
        return content, None

    items = content if isinstance(content, list) else []
    blocks = [block for block in items if isinstance(block, dict)]

    visible = [b["text"] for b in blocks if isinstance(b.get("text"), str)]
    hidden = [b["thinking"] for b in blocks if isinstance(b.get("thinking"), str)]

    text = "".join(visible).strip()
    if not hidden:
        return text, None
    return text, "\n".join(hidden).strip()
|
|
|
|
|
|
def parse_new_messages(transcript_path: Path, start_offset: int, include_thinking: bool) -> Tuple[List[ParsedMessage], int]:
    """Parse user/assistant messages from a JSONL transcript, starting at a byte offset.

    Args:
        transcript_path: Transcript file; its stem is used as the session id.
        start_offset: Byte position to resume from. A negative offset, or one
            beyond the current end of the file (the file was truncated or
            replaced), restarts parsing from the beginning.
        include_thinking: When true, thinking text is kept on the parsed
            messages, and a message that has only thinking is still returned.

    Returns:
        ``(messages, end_offset)`` where ``end_offset`` is the byte position to
        resume from on the next call. A trailing line that has no newline and
        does not parse is treated as still being written: it is not consumed,
        so the next call reads it again once it is complete.

    Lines that are not valid JSON, are not JSON objects, are not
    ``type == "message"`` entries, or whose role is not ``user``/``assistant``
    (tool results included) are skipped.
    """
    session_id = transcript_path.stem
    msgs: List[ParsedMessage] = []

    with transcript_path.open("rb") as f:
        file_size = f.seek(0, os.SEEK_END)
        if start_offset < 0 or start_offset > file_size:
            # The stored offset no longer fits this file; start over.
            start_offset = 0
        f.seek(start_offset)

        while True:
            line_start = f.tell()
            line = f.readline()
            if not line:
                break

            try:
                entry = json.loads(line.decode("utf-8", errors="replace").strip())
            except (ValueError, RecursionError):
                if not line.endswith(b"\n"):
                    # Unterminated and unparsable: the writer is mid-line.
                    # Rewind so the next run re-reads the completed line.
                    f.seek(line_start)
                    break
                continue

            if not isinstance(entry, dict) or entry.get("type") != "message":
                continue
            msg = entry.get("message")
            if not isinstance(msg, dict):
                continue

            # Only conversational roles are captured; this also drops tool results.
            role = msg.get("role")
            if role not in ("user", "assistant"):
                continue

            text, thinking = extract_text_and_thinking(msg.get("content"))
            kept_thinking = thinking if (include_thinking and thinking) else None
            if not text and not kept_thinking:
                continue

            msgs.append(
                ParsedMessage(
                    role=role,
                    # Cap stored sizes so one huge message cannot bloat Redis.
                    text=text[:8000],
                    thinking=(kept_thinking[:16000] if kept_thinking else None),
                    # Fall back to capture time when the entry has no timestamp.
                    timestamp=entry.get("timestamp") or _now_iso(),
                    session_id=session_id,
                )
            )

        end_offset = f.tell()

    return msgs, end_offset
|
|
|
|
|
|
def append_to_redis(user_id: str, messages: List[ParsedMessage]) -> int:
    """Append *messages* to the user's Redis lists and return how many were sent.

    Each message is JSON-encoded and RPUSHed to ``mem:<user_id>``. When a
    message carries thinking text, a second record is RPUSHed to
    ``mem_thinking:<user_id>``. RPUSH keeps chronological order.

    All pushes are queued on one transactional pipeline (MULTI/EXEC), so the
    batch is applied as a whole or not at all. A failure therefore cannot
    leave a partially written batch that the next run would append again.

    Returns 0 without importing ``redis`` or connecting when *messages* is
    empty. Errors raised by the Redis client propagate to the caller.
    """
    if not messages:
        return 0

    import redis  # lazy import so --dry-run works without deps

    r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)

    key = f"mem:{user_id}"
    thinking_key = f"mem_thinking:{user_id}"

    with r.pipeline(transaction=True) as pipe:
        for m in messages:
            payload: Dict[str, Any] = {
                "role": m.role,
                "content": m.text,
                "timestamp": m.timestamp,
                "user_id": user_id,
                "session": m.session_id,
            }
            pipe.rpush(key, json.dumps(payload))

            if m.thinking:
                t_payload: Dict[str, Any] = {
                    "role": m.role,
                    "thinking": m.thinking,
                    "timestamp": m.timestamp,
                    "user_id": user_id,
                    "session": m.session_id,
                }
                pipe.rpush(thinking_key, json.dumps(t_payload))

        # Nothing is sent to Redis until execute() runs the queued commands.
        pipe.execute()

    return len(messages)
|
|
|
|
|
|
def main() -> None:
    """CLI entry point: capture new messages from the latest transcript.

    Finds the most recently modified transcript, resumes from the offset
    recorded for it in the state file, appends any new user/assistant
    messages to Redis (skipped with ``--dry-run``), and records the new
    offset and file size. The state is updated on a dry run too.
    """
    cli = argparse.ArgumentParser(description="Cron capture: append new transcript messages to Redis")
    cli.add_argument("--user-id", default=USER_ID)
    cli.add_argument("--include-thinking", action="store_true", help="Store thinking into mem_thinking:<user>")
    cli.add_argument("--sessions-dir", default=str(DEFAULT_SESSIONS_DIR))
    cli.add_argument("--dry-run", action="store_true", help="Parse + update state, but do not write to Redis")
    args = cli.parse_args()

    transcript = find_latest_transcript(Path(args.sessions_dir))
    if not transcript:
        print("[cron_capture] No session transcripts found")
        return

    state = load_state()
    state_key = str(transcript)
    previous = state.get(state_key, {})
    last_offset = int(previous.get("offset", 0))
    last_size = int(previous.get("size", 0))

    cur_size = transcript.stat().st_size
    if last_offset > 0 and cur_size == last_size:
        print("[cron_capture] No changes")
        return

    messages, end_offset = parse_new_messages(
        transcript, last_offset, include_thinking=args.include_thinking
    )

    # Redis is written before the state is saved, and only on a real run
    # that actually found messages.
    count = 0
    if messages and not args.dry_run:
        count = append_to_redis(args.user_id, messages)

    # The offset is recorded even when nothing was captured, so noise lines
    # are not re-read on the next run.
    state[state_key] = {"offset": end_offset, "size": cur_size, "updated_at": _now_iso()}
    save_state(state)

    if not messages:
        print("[cron_capture] No new user/assistant messages")
    elif args.dry_run:
        print(f"[cron_capture] DRY RUN: would append {len(messages)} messages to Redis mem:{args.user_id}")
    else:
        print(f"[cron_capture] Appended {count} messages to Redis mem:{args.user_id}")
|
|
|
|
|
|
# Run the capture only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|