#!/usr/bin/env python3
"""
Harvest all session JSONL files and store to Qdrant.

Scans all session files, extracts conversation turns, and stores them
to Qdrant with proper user_id and deduplication.

Usage: python3 harvest_sessions.py [--user-id rob] [--dry-run]
"""
import argparse
import hashlib
import json
import os
import sys
import urllib.request
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any

QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_URL = "http://10.0.0.10:11434/v1"
SESSIONS_DIR = Path("/root/.openclaw/agents/main/sessions")

# In-memory cache of hashes stored during this run; avoids a Qdrant
# round-trip for turns we have already uploaded in this process.
_recent_hashes = set()


def get_content_hash(user_msg: str, ai_response: str) -> str:
    """Return a stable fingerprint of a (user, assistant) pair for dedup.

    MD5 is used as a content fingerprint, not for security.  Changing the
    algorithm would invalidate the content_hash values already stored in
    Qdrant, so it must stay MD5 for compatibility.
    """
    content = f"{user_msg.strip()}::{ai_response.strip()}"
    return hashlib.md5(content.encode()).hexdigest()


def is_duplicate(user_id: str, content_hash: str) -> bool:
    """Check whether this content already exists for this user.

    Checks the in-run cache first, then falls back to a Qdrant scroll
    query filtered on (user_id, content_hash).  Best-effort: any Qdrant
    error is logged and treated as "not a duplicate" so the harvest can
    continue.
    """
    if content_hash in _recent_hashes:
        return True
    search_body = {
        "filter": {
            "must": [
                {"key": "user_id", "match": {"value": user_id}},
                {"key": "content_hash", "match": {"value": content_hash}},
            ]
        },
        "limit": 1,
        "with_payload": False,
    }
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
        data=json.dumps(search_body).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(req, timeout=10) as response:
            result = json.loads(response.read().decode())
        points = result.get("result", {}).get("points", [])
        return len(points) > 0
    except Exception as e:
        # Was a silent `pass`; log so connectivity problems are visible,
        # but still fall through to "not a duplicate" (best-effort).
        print(f"[Harvest] Dedup check error: {e}", file=sys.stderr)
        return False


def get_embedding(text: str) -> Optional[List[float]]:
    """Generate an embedding using snowflake-arctic-embed2 via Ollama.

    Input is truncated to 8192 characters.  Returns None (and logs to
    stderr) on any request/parse failure.
    """
    data = json.dumps({
        "model": "snowflake-arctic-embed2",
        "input": text[:8192],
    }).encode()
    req = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=data,
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
        # OpenAI-compatible response shape: {"data": [{"embedding": [...]}]}
        return result["data"][0]["embedding"]
    except Exception as e:
        print(f"[Harvest] Embedding error: {e}", file=sys.stderr)
        return None


def store_turn(user_id: str, user_msg: str, ai_response: str,
               date_str: str, conversation_id: str, turn_number: int,
               session_id: str, dry_run: bool = False) -> Dict:
    """Store a single conversation turn to Qdrant.

    Uploads three points sharing one content_hash: the user message, the
    assistant response, and a short Q/A summary.

    Returns a status dict: {"skipped": True, "reason": ...} when nothing
    was stored, {"skipped": False, "dry_run": True} in dry-run mode, or
    {"skipped": False, "stored": True} on success.
    """
    content_hash = get_content_hash(user_msg, ai_response)

    if is_duplicate(user_id, content_hash):
        return {"skipped": True, "reason": "duplicate"}

    if dry_run:
        return {"skipped": False, "dry_run": True}

    # One embedding per stored point.
    user_embedding = get_embedding(f"[{user_id}]: {user_msg}")
    ai_embedding = get_embedding(f"[Kimi]: {ai_response}")
    summary = f"Q: {user_msg[:200]} A: {ai_response[:300]}..."
    summary_embedding = get_embedding(summary)

    # All three must succeed or we store nothing (no partial turns).
    if not all([user_embedding, ai_embedding, summary_embedding]):
        return {"skipped": True, "reason": "embedding_failed"}

    tags = ["conversation", "harvested", f"user:{user_id}", date_str]
    importance = (
        "high"
        if any(kw in (user_msg + ai_response).lower()
               for kw in ["remember", "important", "always", "never", "rule"])
        else "medium"
    )

    # Payload fields shared by all three points for this turn.
    common: Dict[str, Any] = {
        "user_id": user_id,
        "date": date_str,
        "importance": importance,
        "confidence": "high",
        "verified": True,
        "created_at": datetime.now().isoformat(),
        "conversation_id": conversation_id,
        "turn_number": turn_number,
        "session_id": session_id,
        "content_hash": content_hash,
    }
    points = [
        # User message
        {
            "id": str(uuid.uuid4()),
            "vector": user_embedding,
            "payload": {
                **common,
                "text": f"[{user_id}]: {user_msg[:2000]}",
                "tags": tags + ["user-message"],
                "source": "session_harvest",
                "source_type": "user",
                "category": "Full Conversation",
            },
        },
        # AI response
        {
            "id": str(uuid.uuid4()),
            "vector": ai_embedding,
            "payload": {
                **common,
                "text": f"[Kimi]: {ai_response[:2000]}",
                "tags": tags + ["ai-response"],
                "source": "session_harvest",
                "source_type": "assistant",
                "category": "Full Conversation",
            },
        },
        # Summary (the earlier all(...) check guarantees the embedding
        # exists, so the original redundant `if summary_embedding:` guard
        # was dropped — the point is always appended, as before).
        {
            "id": str(uuid.uuid4()),
            "vector": summary_embedding,
            "payload": {
                **common,
                "text": f"[Turn {turn_number}] {summary}",
                "tags": tags + ["summary"],
                "source": "session_harvest_summary",
                "source_type": "system",
                "category": "Conversation Summary",
                "user_message": user_msg[:500],
                "ai_response": ai_response[:800],
            },
        },
    ]

    # Upload all three points in a single upsert.
    upsert_data = {"points": points}
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
        data=json.dumps(upsert_data).encode(),
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
        if result.get("status") == "ok":
            _recent_hashes.add(content_hash)
            return {"skipped": False, "stored": True}
    except Exception as e:
        print(f"[Harvest] Storage error: {e}", file=sys.stderr)
    return {"skipped": True, "reason": "upload_failed"}


def parse_session_file(filepath: Path) -> List[Dict]:
    """Parse a session JSONL file and extract conversation turns.

    Returns a list of dicts with keys: turn, role, content (truncated to
    2000 chars), date (YYYY-MM-DD), session.  Malformed JSON lines are
    skipped; a file-level read error logs to stderr and returns whatever
    was collected so far.
    """
    turns: List[Dict] = []
    turn_number = 0
    try:
        # Explicit encoding: session files are JSON and hence UTF-8;
        # the original relied on the platform default.
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                    if entry.get('type') == 'message' and 'message' in entry:
                        msg = entry['message']
                        role = msg.get('role')
                        if role == 'toolResult':
                            continue
                        content = ""
                        # Content may be a list of parts or a plain string.
                        if isinstance(msg.get('content'), list):
                            for item in msg['content']:
                                if isinstance(item, dict):
                                    if 'text' in item:
                                        content += item['text']
                                    elif 'thinking' in item:
                                        content += f"[thinking: {item['thinking'][:200]}...]"
                        elif isinstance(msg.get('content'), str):
                            content = msg['content']
                        if content and role in ('user', 'assistant'):
                            turn_number += 1
                            timestamp = entry.get('timestamp', '')
                            # ISO timestamps start with YYYY-MM-DD.
                            date_str = timestamp[:10] if timestamp else datetime.now().strftime("%Y-%m-%d")
                            turns.append({
                                'turn': turn_number,
                                'role': role,
                                'content': content[:2000],
                                'date': date_str,
                                'session': filepath.stem,
                            })
                except json.JSONDecodeError:
                    continue
    except Exception as e:
        print(f"[Harvest] Error reading {filepath}: {e}", file=sys.stderr)
    return turns


def main():
    """Harvest every session file under SESSIONS_DIR into Qdrant."""
    parser = argparse.ArgumentParser(description="Harvest session files to Qdrant")
    parser.add_argument("--user-id", default="yourname", help="User ID for storage")
    parser.add_argument("--dry-run", action="store_true", help="Don't actually store")
    parser.add_argument("--limit", type=int, default=0, help="Limit sessions (0=all)")
    args = parser.parse_args()

    # Oldest-first by modification time, so harvesting is chronological.
    session_files = sorted(SESSIONS_DIR.glob("*.jsonl"), key=lambda p: p.stat().st_mtime)
    if args.limit > 0:
        session_files = session_files[:args.limit]

    print(f"Found {len(session_files)} session files")

    total_stored = 0
    total_skipped = 0
    total_failed = 0

    for i, session_file in enumerate(session_files, 1):
        print(f"\n[{i}/{len(session_files)}] Processing: {session_file.name}")
        turns = parse_session_file(session_file)
        if not turns:
            print("  No turns found")
            continue
        print(f"  Found {len(turns)} turns")

        # Pair each user message with the immediately following assistant
        # response; unpaired user messages are skipped.
        conversation_id = str(uuid.uuid4())
        j = 0
        while j < len(turns):
            turn = turns[j]
            if turn['role'] == 'user':
                user_msg = turn['content']
                ai_response = ""
                if j + 1 < len(turns) and turns[j + 1]['role'] == 'assistant':
                    ai_response = turns[j + 1]['content']
                    j += 2
                else:
                    j += 1
                if user_msg and ai_response:
                    result = store_turn(
                        user_id=args.user_id,
                        user_msg=user_msg,
                        ai_response=ai_response,
                        date_str=turn['date'],
                        conversation_id=conversation_id,
                        turn_number=turn['turn'],
                        session_id=turn['session'],
                        dry_run=args.dry_run,
                    )
                    if result.get("skipped"):
                        if result.get("reason") == "duplicate":
                            total_skipped += 1
                        else:
                            total_failed += 1
                    else:
                        total_stored += 1
                        if total_stored % 10 == 0:
                            print(f"  Progress: {total_stored} stored, {total_skipped} skipped")
            else:
                # Assistant (or other) turn with no preceding user message.
                j += 1

    print(f"\n{'='*50}")
    print(f"Harvest complete:")
    print(f"  Stored: {total_stored} turns ({total_stored * 3} embeddings)")
    print(f"  Skipped (duplicates): {total_skipped}")
    print(f"  Failed: {total_failed}")
    if args.dry_run:
        print("\n[DRY RUN] Nothing was actually stored")


if __name__ == "__main__":
    main()