Files
jarvis-memory/skills/qdrant-memory/scripts/harvest_sessions.py

341 lines
12 KiB
Python
Raw Normal View History

2026-02-23 12:13:04 -06:00
#!/usr/bin/env python3
"""
Harvest all session JSONL files and store to Qdrant.
Scans all session files, extracts conversation turns, and stores to Qdrant
with proper user_id and deduplication.
Usage: python3 harvest_sessions.py [--user-id USER_ID] [--limit N] [--dry-run]
"""
import argparse
import hashlib
import json
import os
import sys
import urllib.request
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any
# Service endpoints and storage locations. Each value can be overridden with an
# environment variable so the script is not tied to one network layout; an
# unset or empty variable falls back to the original hard-coded default.
# Trailing slashes are dropped from the URLs because request paths are appended
# with a leading "/".
QDRANT_URL = (os.environ.get("QDRANT_URL") or "http://10.0.0.40:6333").rstrip("/")
COLLECTION_NAME = os.environ.get("QDRANT_COLLECTION") or "kimi_memories"
# Base URL of the embedding API; get_embedding() posts to "<OLLAMA_URL>/embeddings".
OLLAMA_URL = (os.environ.get("OLLAMA_URL") or "http://10.0.0.10:11434/v1").rstrip("/")
# Directory scanned by main() for "*.jsonl" session files.
SESSIONS_DIR = Path(
    os.environ.get("OPENCLAW_SESSIONS_DIR") or "/root/.openclaw/agents/main/sessions"
)

# In-memory cache for deduplication: content hashes already stored during this
# run, checked before Qdrant is queried.
_recent_hashes = set()
def get_content_hash(user_msg: str, ai_response: str) -> str:
    """Return the MD5 hex digest used to deduplicate a user/assistant exchange.

    Both messages are stripped of surrounding whitespace and joined as
    ``"<user>::<assistant>"`` before hashing, so exchanges that differ only in
    leading/trailing whitespace map to the same hash. MD5 serves only as a
    content fingerprint here, not as a security measure.

    Args:
        user_msg: Text of the user's message.
        ai_response: Text of the assistant's reply.

    Returns:
        A 32-character lowercase hexadecimal digest.
    """
    content = f"{user_msg.strip()}::{ai_response.strip()}"
    # Text decoded from JSON may contain lone surrogates, which the strict
    # UTF-8 codec rejects with UnicodeEncodeError. "surrogatepass" encodes them
    # instead of raising and leaves the digest of well-formed text unchanged.
    encoded = content.encode("utf-8", errors="surrogatepass")
    return hashlib.md5(encoded).hexdigest()
def is_duplicate(user_id: str, content_hash: str) -> bool:
    """Check whether this content hash is already stored for the given user.

    The in-memory ``_recent_hashes`` cache is consulted first. On a miss, the
    Qdrant collection is scrolled with a filter on ``user_id`` and
    ``content_hash``, requesting at most one point without its payload.

    Args:
        user_id: User whose stored points are searched.
        content_hash: Digest produced by ``get_content_hash``.

    Returns:
        True if the hash is cached or Qdrant returns a matching point;
        False otherwise, including when the lookup itself fails.
    """
    if content_hash in _recent_hashes:
        return True

    search_body = {
        "filter": {
            "must": [
                {"key": "user_id", "match": {"value": user_id}},
                {"key": "content_hash", "match": {"value": content_hash}},
            ]
        },
        "limit": 1,
        "with_payload": False,
    }
    try:
        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
            data=json.dumps(search_body).encode(),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=10) as response:
            result = json.loads(response.read().decode())
        # "result" may be missing or null in an error response; treat both as
        # "no points".
        points = (result.get("result") or {}).get("points") or []
        return bool(points)
    except Exception as e:
        # Best effort: a failed lookup must not abort the harvest, so the
        # content is treated as new. The failure is reported rather than
        # swallowed, because it can lead to duplicate points being stored.
        print(f"[Harvest] Duplicate check error: {e}", file=sys.stderr)
        return False
def get_embedding(text: str) -> Optional[List[float]]:
    """Request an embedding vector for ``text`` from the embedding endpoint.

    The text is cut to its first 8192 characters and sent with the
    ``snowflake-arctic-embed2`` model name to ``<OLLAMA_URL>/embeddings``.

    Args:
        text: Text to embed.

    Returns:
        The ``embedding`` field of the first item in the response's ``data``
        list, or None if the request or response parsing fails (the error is
        printed to stderr).
    """
    payload = {"model": "snowflake-arctic-embed2", "input": text[:8192]}
    request = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )

    try:
        with urllib.request.urlopen(request, timeout=30) as reply:
            raw = reply.read()
        parsed = json.loads(raw.decode())
        first_item = parsed["data"][0]
        return first_item["embedding"]
    except Exception as err:
        print(f"[Harvest] Embedding error: {err}", file=sys.stderr)
        return None
def store_turn(user_id: str, user_msg: str, ai_response: str,
               date_str: str, conversation_id: str, turn_number: int,
               session_id: str, dry_run: bool = False) -> Dict:
    """Store a single conversation turn to Qdrant.

    A turn is written as three points in one upsert: the user message, the AI
    response, and a short "Q: ... A: ..." summary, each with its own embedding.
    All three carry the same content hash so later runs can detect the turn as
    a duplicate.

    Args:
        user_id: Owner of the stored points.
        user_msg: Text of the user's message.
        ai_response: Text of the assistant's reply.
        date_str: Date string stored in the payload and added to the tags.
        conversation_id: Identifier shared by all turns of one session run.
        turn_number: Position of the turn within its session.
        session_id: Identifier of the source session.
        dry_run: If True, only the duplicate check runs; nothing is embedded
            or uploaded.

    Returns:
        One of:
        ``{"skipped": True, "reason": "duplicate"}``,
        ``{"skipped": False, "dry_run": True}``,
        ``{"skipped": True, "reason": "embedding_failed"}``,
        ``{"skipped": False, "stored": True}``,
        ``{"skipped": True, "reason": "upload_failed"}``.
    """
    content_hash = get_content_hash(user_msg, ai_response)

    # Check duplicate
    if is_duplicate(user_id, content_hash):
        return {"skipped": True, "reason": "duplicate"}

    if dry_run:
        return {"skipped": False, "dry_run": True}

    summary = f"Q: {user_msg[:200]} A: {ai_response[:300]}..."

    # Generate embeddings in order, stopping at the first failure: all three
    # are required, so further requests after a failure would be wasted.
    embeddings = []
    for text in (f"[{user_id}]: {user_msg}", f"[Kimi]: {ai_response}", summary):
        vector = get_embedding(text)
        if not vector:
            return {"skipped": True, "reason": "embedding_failed"}
        embeddings.append(vector)
    user_embedding, ai_embedding, summary_embedding = embeddings

    tags = ["conversation", "harvested", f"user:{user_id}", date_str]
    combined_text = (user_msg + ai_response).lower()
    keywords = ("remember", "important", "always", "never", "rule")
    importance = "high" if any(kw in combined_text for kw in keywords) else "medium"

    # Payload fields common to all three points. The timestamp is taken once
    # so the points of one turn share the same created_at value.
    shared = {
        "user_id": user_id,
        "date": date_str,
        "importance": importance,
        "confidence": "high",
        "verified": True,
        "created_at": datetime.now().isoformat(),
        "conversation_id": conversation_id,
        "turn_number": turn_number,
        "session_id": session_id,
        "content_hash": content_hash,
    }

    points = [
        # User message
        {
            "id": str(uuid.uuid4()),
            "vector": user_embedding,
            "payload": {
                **shared,
                "text": f"[{user_id}]: {user_msg[:2000]}",
                "tags": tags + ["user-message"],
                "source": "session_harvest",
                "source_type": "user",
                "category": "Full Conversation",
            },
        },
        # AI response
        {
            "id": str(uuid.uuid4()),
            "vector": ai_embedding,
            "payload": {
                **shared,
                "text": f"[Kimi]: {ai_response[:2000]}",
                "tags": tags + ["ai-response"],
                "source": "session_harvest",
                "source_type": "assistant",
                "category": "Full Conversation",
            },
        },
        # Summary (also keeps truncated copies of both messages)
        {
            "id": str(uuid.uuid4()),
            "vector": summary_embedding,
            "payload": {
                **shared,
                "text": f"[Turn {turn_number}] {summary}",
                "tags": tags + ["summary"],
                "source": "session_harvest_summary",
                "source_type": "system",
                "category": "Conversation Summary",
                "user_message": user_msg[:500],
                "ai_response": ai_response[:800],
            },
        },
    ]

    # Upload
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
        data=json.dumps({"points": points}).encode(),
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
        if result.get("status") == "ok":
            # Remember the hash so repeats within this run skip the Qdrant lookup.
            _recent_hashes.add(content_hash)
            return {"skipped": False, "stored": True}
        print(
            f"[Harvest] Storage error: unexpected response status {result.get('status')!r}",
            file=sys.stderr,
        )
    except Exception as e:
        print(f"[Harvest] Storage error: {e}", file=sys.stderr)

    return {"skipped": True, "reason": "upload_failed"}
def _extract_message_text(raw_content: Any) -> str:
    """Flatten a message's ``content`` field into a single string.

    A plain string is returned as-is. For a list, each dict item contributes
    its ``text`` value, or, when it has no ``text`` key, a
    ``[thinking: ...]`` marker holding the first 200 characters of its
    ``thinking`` value. Anything else yields an empty string.
    """
    if isinstance(raw_content, str):
        return raw_content
    if not isinstance(raw_content, list):
        return ""

    parts = []
    for item in raw_content:
        if not isinstance(item, dict):
            continue
        if 'text' in item:
            # Non-string values (e.g. null) are skipped instead of raising.
            if isinstance(item['text'], str):
                parts.append(item['text'])
        elif 'thinking' in item:
            if isinstance(item['thinking'], str):
                parts.append(f"[thinking: {item['thinking'][:200]}...]")
    return "".join(parts)


def parse_session_file(filepath: Path) -> List[Dict]:
    """Parse a session JSONL file and extract conversation turns.

    Each non-empty line is parsed as JSON. Entries with ``type == "message"``
    whose message role is ``user`` or ``assistant`` and whose content is
    non-empty become turns; all other lines (invalid JSON, other entry types,
    tool results, malformed entries) are skipped individually so one bad line
    does not discard the rest of the file.

    Args:
        filepath: Path to the ``.jsonl`` session file.

    Returns:
        A list of dicts with keys ``turn`` (1-based counter), ``role``,
        ``content`` (at most 2000 characters), ``date`` (first 10 characters
        of the entry's timestamp, or today's date when there is no usable
        timestamp) and ``session`` (the file name without its suffix).
        If the file cannot be read, the error is printed to stderr and the
        turns collected so far (possibly none) are returned.
    """
    turns: List[Dict] = []
    try:
        # Explicit UTF-8 avoids depending on the platform default encoding;
        # undecodable bytes are replaced so they cannot abort the whole file.
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue

                if not isinstance(entry, dict) or entry.get('type') != 'message':
                    continue
                msg = entry.get('message')
                if not isinstance(msg, dict):
                    continue
                role = msg.get('role')
                if role not in ('user', 'assistant'):
                    # Covers 'toolResult' and any other non-conversation role.
                    continue

                content = _extract_message_text(msg.get('content'))
                if not content:
                    continue

                timestamp = entry.get('timestamp')
                if isinstance(timestamp, str) and timestamp:
                    date_str = timestamp[:10]
                else:
                    date_str = datetime.now().strftime("%Y-%m-%d")

                turns.append({
                    'turn': len(turns) + 1,
                    'role': role,
                    'content': content[:2000],
                    'date': date_str,
                    'session': filepath.stem,
                })
    except OSError as e:
        print(f"[Harvest] Error reading {filepath}: {e}", file=sys.stderr)
    return turns
def main():
    """Command-line entry point: harvest every session file into Qdrant.

    Session files (``*.jsonl`` under ``SESSIONS_DIR``) are processed from
    oldest to newest modification time, optionally capped by ``--limit``.
    Within each file, every user turn that is immediately followed by an
    assistant turn is passed to ``store_turn``; the outcomes are tallied and
    a summary is printed at the end.
    """
    cli = argparse.ArgumentParser(description="Harvest session files to Qdrant")
    cli.add_argument("--user-id", default="yourname", help="User ID for storage")
    cli.add_argument("--dry-run", action="store_true", help="Don't actually store")
    cli.add_argument("--limit", type=int, default=0, help="Limit sessions (0=all)")
    args = cli.parse_args()

    def modified_time(path):
        return path.stat().st_mtime

    # Collect the session files, oldest first.
    session_files = sorted(SESSIONS_DIR.glob("*.jsonl"), key=modified_time)
    if args.limit > 0:
        session_files = session_files[:args.limit]
    file_count = len(session_files)
    print(f"Found {file_count} session files")

    stored = 0
    duplicates = 0
    failures = 0

    for position, session_file in enumerate(session_files, 1):
        print(f"\n[{position}/{file_count}] Processing: {session_file.name}")
        turns = parse_session_file(session_file)
        if not turns:
            print(" No turns found")
            continue
        print(f" Found {len(turns)} turns")

        # One conversation id per session file, shared by all of its turns.
        conversation_id = str(uuid.uuid4())

        # A pair is a user turn directly followed by an assistant turn.
        # Scanning adjacent turns finds exactly those pairs, in file order.
        for current, following in zip(turns, turns[1:]):
            if current['role'] != 'user' or following['role'] != 'assistant':
                continue
            user_msg = current['content']
            ai_response = following['content']
            if not (user_msg and ai_response):
                continue

            outcome = store_turn(
                user_id=args.user_id,
                user_msg=user_msg,
                ai_response=ai_response,
                date_str=current['date'],
                conversation_id=conversation_id,
                turn_number=current['turn'],
                session_id=current['session'],
                dry_run=args.dry_run,
            )

            if not outcome.get("skipped"):
                stored += 1
                if stored % 10 == 0:
                    print(f" Progress: {stored} stored, {duplicates} skipped")
            elif outcome.get("reason") == "duplicate":
                duplicates += 1
            else:
                failures += 1

    print("\n" + "=" * 50)
    print("Harvest complete:")
    print(f" Stored: {stored} turns ({stored * 3} embeddings)")
    print(f" Skipped (duplicates): {duplicates}")
    print(f" Failed: {failures}")
    if args.dry_run:
        print("\n[DRY RUN] Nothing was actually stored")


if __name__ == "__main__":
    main()