Initial commit: Jarvis Memory system

2026-02-23 12:13:04 -06:00
commit e8854cd959
72 changed files with 14801 additions and 0 deletions
--- a/skills/qdrant-memory/scripts/harvest_sessions.py
+++ b/skills/qdrant-memory/scripts/harvest_sessions.py
@@ -0,0 +1,341 @@
+#!/usr/bin/env python3
+"""
+Harvest all session JSONL files and store to Qdrant.
+
+Scans all session files, extracts conversation turns, and stores to Qdrant
+with proper user_id and deduplication.
+
+Usage: python3 harvest_sessions.py [--user-id rob] [--dry-run]
+"""
+
+import argparse
+import hashlib
+import json
+import os
+import sys
+import urllib.request
+import uuid
+from datetime import datetime
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+
+QDRANT_URL = "http://10.0.0.40:6333"
+COLLECTION_NAME = "kimi_memories"
+OLLAMA_URL = "http://10.0.0.10:11434/v1"
+SESSIONS_DIR = Path("/root/.openclaw/agents/main/sessions")
+
+# In-memory cache for deduplication
+_recent_hashes = set()
+
+def get_content_hash(user_msg: str, ai_response: str) -> str:
+    """Generate hash for deduplication"""
+    content = f"{user_msg.strip()}::{ai_response.strip()}"
+    return hashlib.md5(content.encode()).hexdigest()
+
+def is_duplicate(user_id: str, content_hash: str) -> bool:
+    """Check if this content already exists for this user"""
+    if content_hash in _recent_hashes:
+        return True
+    
+    try:
+        search_body = {
+            "filter": {
+                "must": [
+                    {"key": "user_id", "match": {"value": user_id}},
+                    {"key": "content_hash", "match": {"value": content_hash}}
+                ]
+            },
+            "limit": 1,
+            "with_payload": False
+        }
+        
+        req = urllib.request.Request(
+            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
+            data=json.dumps(search_body).encode(),
+            headers={"Content-Type": "application/json"}
+        )
+        
+        with urllib.request.urlopen(req, timeout=10) as response:
+            result = json.loads(response.read().decode())
+            points = result.get("result", {}).get("points", [])
+            if len(points) > 0:
+                return True
+    except Exception:
+        pass
+    
+    return False
+
+def get_embedding(text: str) -> Optional[List[float]]:
+    """Generate embedding using snowflake-arctic-embed2"""
+    data = json.dumps({
+        "model": "snowflake-arctic-embed2",
+        "input": text[:8192]
+    }).encode()
+    
+    req = urllib.request.Request(
+        f"{OLLAMA_URL}/embeddings",
+        data=data,
+        headers={"Content-Type": "application/json"}
+    )
+    
+    try:
+        with urllib.request.urlopen(req, timeout=30) as response:
+            result = json.loads(response.read().decode())
+            return result["data"][0]["embedding"]
+    except Exception as e:
+        print(f"[Harvest] Embedding error: {e}", file=sys.stderr)
+        return None
+
+def store_turn(user_id: str, user_msg: str, ai_response: str, 
+               date_str: str, conversation_id: str, turn_number: int,
+               session_id: str, dry_run: bool = False) -> Dict:
+    """Store a single conversation turn to Qdrant"""
+    
+    content_hash = get_content_hash(user_msg, ai_response)
+    
+    # Check duplicate
+    if is_duplicate(user_id, content_hash):
+        return {"skipped": True, "reason": "duplicate"}
+    
+    if dry_run:
+        return {"skipped": False, "dry_run": True}
+    
+    # Generate embeddings
+    user_embedding = get_embedding(f"[{user_id}]: {user_msg}")
+    ai_embedding = get_embedding(f"[Kimi]: {ai_response}")
+    summary = f"Q: {user_msg[:200]} A: {ai_response[:300]}..."
+    summary_embedding = get_embedding(summary)
+    
+    if not all([user_embedding, ai_embedding, summary_embedding]):
+        return {"skipped": True, "reason": "embedding_failed"}
+    
+    tags = ["conversation", "harvested", f"user:{user_id}", date_str]
+    importance = "high" if any(kw in (user_msg + ai_response).lower() 
+                               for kw in ["remember", "important", "always", "never", "rule"]) else "medium"
+    
+    points = []
+    
+    # User message
+    points.append({
+        "id": str(uuid.uuid4()),
+        "vector": user_embedding,
+        "payload": {
+            "user_id": user_id,
+            "text": f"[{user_id}]: {user_msg[:2000]}",
+            "date": date_str,
+            "tags": tags + ["user-message"],
+            "importance": importance,
+            "source": "session_harvest",
+            "source_type": "user",
+            "category": "Full Conversation",
+            "confidence": "high",
+            "verified": True,
+            "created_at": datetime.now().isoformat(),
+            "conversation_id": conversation_id,
+            "turn_number": turn_number,
+            "session_id": session_id,
+            "content_hash": content_hash
+        }
+    })
+    
+    # AI response
+    points.append({
+        "id": str(uuid.uuid4()),
+        "vector": ai_embedding,
+        "payload": {
+            "user_id": user_id,
+            "text": f"[Kimi]: {ai_response[:2000]}",
+            "date": date_str,
+            "tags": tags + ["ai-response"],
+            "importance": importance,
+            "source": "session_harvest",
+            "source_type": "assistant",
+            "category": "Full Conversation",
+            "confidence": "high",
+            "verified": True,
+            "created_at": datetime.now().isoformat(),
+            "conversation_id": conversation_id,
+            "turn_number": turn_number,
+            "session_id": session_id,
+            "content_hash": content_hash
+        }
+    })
+    
+    # Summary
+    if summary_embedding:
+        points.append({
+            "id": str(uuid.uuid4()),
+            "vector": summary_embedding,
+            "payload": {
+                "user_id": user_id,
+                "text": f"[Turn {turn_number}] {summary}",
+                "date": date_str,
+                "tags": tags + ["summary"],
+                "importance": importance,
+                "source": "session_harvest_summary",
+                "source_type": "system",
+                "category": "Conversation Summary",
+                "confidence": "high",
+                "verified": True,
+                "created_at": datetime.now().isoformat(),
+                "conversation_id": conversation_id,
+                "turn_number": turn_number,
+                "session_id": session_id,
+                "content_hash": content_hash,
+                "user_message": user_msg[:500],
+                "ai_response": ai_response[:800]
+            }
+        })
+    
+    # Upload
+    upsert_data = {"points": points}
+    
+    req = urllib.request.Request(
+        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
+        data=json.dumps(upsert_data).encode(),
+        headers={"Content-Type": "application/json"},
+        method="PUT"
+    )
+    
+    try:
+        with urllib.request.urlopen(req, timeout=30) as response:
+            result = json.loads(response.read().decode())
+            if result.get("status") == "ok":
+                _recent_hashes.add(content_hash)
+                return {"skipped": False, "stored": True}
+    except Exception as e:
+        print(f"[Harvest] Storage error: {e}", file=sys.stderr)
+    
+    return {"skipped": True, "reason": "upload_failed"}
+
+def parse_session_file(filepath: Path) -> List[Dict]:
+    """Parse a session JSONL file and extract conversation turns"""
+    turns = []
+    turn_number = 0
+    
+    try:
+        with open(filepath, 'r') as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    entry = json.loads(line)
+                    if entry.get('type') == 'message' and 'message' in entry:
+                        msg = entry['message']
+                        role = msg.get('role')
+                        
+                        if role == 'toolResult':
+                            continue
+                        
+                        content = ""
+                        if isinstance(msg.get('content'), list):
+                            for item in msg['content']:
+                                if isinstance(item, dict):
+                                    if 'text' in item:
+                                        content += item['text']
+                                    elif 'thinking' in item:
+                                        content += f"[thinking: {item['thinking'][:200]}...]"
+                        elif isinstance(msg.get('content'), str):
+                            content = msg['content']
+                        
+                        if content and role in ('user', 'assistant'):
+                            turn_number += 1
+                            timestamp = entry.get('timestamp', '')
+                            date_str = timestamp[:10] if timestamp else datetime.now().strftime("%Y-%m-%d")
+                            
+                            turns.append({
+                                'turn': turn_number,
+                                'role': role,
+                                'content': content[:2000],
+                                'date': date_str,
+                                'session': filepath.stem
+                            })
+                except json.JSONDecodeError:
+                    continue
+    except Exception as e:
+        print(f"[Harvest] Error reading {filepath}: {e}", file=sys.stderr)
+    
+    return turns
+
+def main():
+    parser = argparse.ArgumentParser(description="Harvest session files to Qdrant")
+    parser.add_argument("--user-id", default="yourname", help="User ID for storage")
+    parser.add_argument("--dry-run", action="store_true", help="Don't actually store")
+    parser.add_argument("--limit", type=int, default=0, help="Limit sessions (0=all)")
+    args = parser.parse_args()
+    
+    # Find all session files
+    session_files = sorted(SESSIONS_DIR.glob("*.jsonl"), key=lambda p: p.stat().st_mtime)
+    
+    if args.limit > 0:
+        session_files = session_files[:args.limit]
+    
+    print(f"Found {len(session_files)} session files")
+    
+    total_stored = 0
+    total_skipped = 0
+    total_failed = 0
+    
+    for i, session_file in enumerate(session_files, 1):
+        print(f"\n[{i}/{len(session_files)}] Processing: {session_file.name}")
+        
+        turns = parse_session_file(session_file)
+        if not turns:
+            print("  No turns found")
+            continue
+        
+        print(f"  Found {len(turns)} turns")
+        
+        # Pair user messages with AI responses
+        conversation_id = str(uuid.uuid4())
+        j = 0
+        while j < len(turns):
+            turn = turns[j]
+            
+            if turn['role'] == 'user':
+                user_msg = turn['content']
+                ai_response = ""
+                
+                # Look for next AI response
+                if j + 1 < len(turns) and turns[j + 1]['role'] == 'assistant':
+                    ai_response = turns[j + 1]['content']
+                    j += 2
+                else:
+                    j += 1
+                
+                if user_msg and ai_response:
+                    result = store_turn(
+                        user_id=args.user_id,
+                        user_msg=user_msg,
+                        ai_response=ai_response,
+                        date_str=turn['date'],
+                        conversation_id=conversation_id,
+                        turn_number=turn['turn'],
+                        session_id=turn['session'],
+                        dry_run=args.dry_run
+                    )
+                    
+                    if result.get("skipped"):
+                        if result.get("reason") == "duplicate":
+                            total_skipped += 1
+                        else:
+                            total_failed += 1
+                    else:
+                        total_stored += 1
+                        if total_stored % 10 == 0:
+                            print(f"  Progress: {total_stored} stored, {total_skipped} skipped")
+            else:
+                j += 1
+    
+    print(f"\n{'='*50}")
+    print(f"Harvest complete:")
+    print(f"  Stored: {total_stored} turns ({total_stored * 3} embeddings)")
+    print(f"  Skipped (duplicates): {total_skipped}")
+    print(f"  Failed: {total_failed}")
+    
+    if args.dry_run:
+        print("\n[DRY RUN] Nothing was actually stored")
+
+if __name__ == "__main__":
+    main()