forked from SpeedyFoxAi/jarvis-memory
Initial commit: Jarvis Memory system
This commit is contained in:
341
skills/qdrant-memory/scripts/harvest_sessions.py
Executable file
341
skills/qdrant-memory/scripts/harvest_sessions.py
Executable file
@@ -0,0 +1,341 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Harvest all session JSONL files and store to Qdrant.
|
||||
|
||||
Scans all session files, extracts conversation turns, and stores to Qdrant
|
||||
with proper user_id and deduplication.
|
||||
|
||||
Usage: python3 harvest_sessions.py [--user-id rob] [--dry-run] [--limit N]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, Any
|
||||
|
||||
# Endpoints and locations the harvester talks to.
QDRANT_URL: str = "http://10.0.0.40:6333"          # Qdrant REST base URL
COLLECTION_NAME: str = "kimi_memories"             # target Qdrant collection
OLLAMA_URL: str = "http://10.0.0.10:11434/v1"      # base URL for /embeddings
SESSIONS_DIR: Path = Path("/root/.openclaw/agents/main/sessions")  # *.jsonl inputs

# Content hashes already stored during this process; consulted before
# querying Qdrant so repeated turns in one run are skipped cheaply.
_recent_hashes: set = set()
|
||||
|
||||
def get_content_hash(user_msg: str, ai_response: str) -> str:
    """Return the deduplication key for one user/assistant exchange.

    Both texts are stripped of surrounding whitespace, joined with ``::``
    and hashed with MD5; the hex digest is returned.
    """
    normalized = "::".join((user_msg.strip(), ai_response.strip()))
    digest = hashlib.md5(normalized.encode())
    return digest.hexdigest()
|
||||
|
||||
def is_duplicate(user_id: str, content_hash: str) -> bool:
    """Check whether this content hash is already stored for this user.

    The in-process ``_recent_hashes`` cache is consulted first. On a miss,
    Qdrant's scroll endpoint is queried for a single point whose payload
    matches both ``user_id`` and ``content_hash``.

    The check is best-effort: if the lookup fails (network error, bad
    response), the failure is reported on stderr and ``False`` is returned
    so harvesting can continue.

    Args:
        user_id: Value matched against the ``user_id`` payload field.
        content_hash: Value matched against the ``content_hash`` payload field.

    Returns:
        True if the hash is cached locally or Qdrant returned at least one
        matching point; False otherwise (including when the lookup failed).
    """
    if content_hash in _recent_hashes:
        return True

    search_body = {
        "filter": {
            "must": [
                {"key": "user_id", "match": {"value": user_id}},
                {"key": "content_hash", "match": {"value": content_hash}},
            ]
        },
        "limit": 1,
        "with_payload": False,
    }

    try:
        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
            data=json.dumps(search_body).encode(),
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=10) as response:
            result = json.loads(response.read().decode())
    except Exception as e:
        # Deliberately non-fatal, but no longer silent: a failed lookup means
        # duplicates may be written, so the operator needs to see it.
        print(f"[Harvest] Duplicate check error: {e}", file=sys.stderr)
        return False

    # Tolerate a null/missing "result" or "points" instead of raising.
    scroll_result = result.get("result") if isinstance(result, dict) else None
    points = scroll_result.get("points") if isinstance(scroll_result, dict) else None
    return bool(points)
|
||||
|
||||
def get_embedding(text: str) -> Optional[List[float]]:
    """Request an embedding vector for *text* from the embeddings endpoint.

    The input is cut to its first 8192 characters and sent, together with
    the model name ``snowflake-arctic-embed2``, as JSON to
    ``{OLLAMA_URL}/embeddings``.

    Returns:
        The ``embedding`` of the first item in the response's ``data`` list,
        or ``None`` if the request or response parsing raised (the error is
        printed to stderr).
    """
    payload = {"model": "snowflake-arctic-embed2", "input": text[:8192]}
    request = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )

    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            body = response.read()
        parsed = json.loads(body.decode())
        return parsed["data"][0]["embedding"]
    except Exception as e:
        print(f"[Harvest] Embedding error: {e}", file=sys.stderr)
        return None
|
||||
|
||||
def _make_point(vector: List[float], base_payload: Dict[str, Any],
                **fields: Any) -> Dict[str, Any]:
    """Build one Qdrant point: random UUID id, vector, base payload plus *fields*."""
    return {
        "id": str(uuid.uuid4()),
        "vector": vector,
        "payload": {**base_payload, **fields},
    }


def store_turn(user_id: str, user_msg: str, ai_response: str,
               date_str: str, conversation_id: str, turn_number: int,
               session_id: str, dry_run: bool = False) -> Dict:
    """Store a single conversation turn to Qdrant.

    A turn is written as three points that share one ``content_hash``:
    the user message, the AI response, and a short "Q: ... A: ..." summary.

    Args:
        user_id: Owner of the memory; stored in the payload and used in tags.
        user_msg: The user's message text.
        ai_response: The assistant's reply text.
        date_str: Date string stored in the payload and added to the tags.
        conversation_id: Identifier stored on all three points.
        turn_number: Turn index stored on all three points.
        session_id: Session identifier stored on all three points.
        dry_run: When True, stop after the duplicate check without
            generating embeddings or writing anything.

    Returns:
        One of:
        ``{"skipped": True, "reason": "duplicate"}``,
        ``{"skipped": False, "dry_run": True}``,
        ``{"skipped": True, "reason": "embedding_failed"}``,
        ``{"skipped": False, "stored": True}``,
        ``{"skipped": True, "reason": "upload_failed"}``.
    """
    content_hash = get_content_hash(user_msg, ai_response)

    if is_duplicate(user_id, content_hash):
        return {"skipped": True, "reason": "duplicate"}

    if dry_run:
        return {"skipped": False, "dry_run": True}

    summary = f"Q: {user_msg[:200]} A: {ai_response[:300]}..."

    # Embed in order and stop at the first failure: all three vectors are
    # required, so there is no point waiting on further requests once one
    # has failed.
    embeddings = []
    for text in (f"[{user_id}]: {user_msg}", f"[Kimi]: {ai_response}", summary):
        vector = get_embedding(text)
        if not vector:
            return {"skipped": True, "reason": "embedding_failed"}
        embeddings.append(vector)
    user_embedding, ai_embedding, summary_embedding = embeddings

    tags = ["conversation", "harvested", f"user:{user_id}", date_str]
    keywords = ("remember", "important", "always", "never", "rule")
    combined_text = (user_msg + ai_response).lower()
    importance = "high" if any(kw in combined_text for kw in keywords) else "medium"

    # Fields common to all three points; one timestamp for the whole turn.
    base_payload = {
        "user_id": user_id,
        "date": date_str,
        "importance": importance,
        "confidence": "high",
        "verified": True,
        "created_at": datetime.now().isoformat(),
        "conversation_id": conversation_id,
        "turn_number": turn_number,
        "session_id": session_id,
        "content_hash": content_hash,
    }

    points = [
        _make_point(
            user_embedding, base_payload,
            text=f"[{user_id}]: {user_msg[:2000]}",
            tags=tags + ["user-message"],
            source="session_harvest",
            source_type="user",
            category="Full Conversation",
        ),
        _make_point(
            ai_embedding, base_payload,
            text=f"[Kimi]: {ai_response[:2000]}",
            tags=tags + ["ai-response"],
            source="session_harvest",
            source_type="assistant",
            category="Full Conversation",
        ),
        _make_point(
            summary_embedding, base_payload,
            text=f"[Turn {turn_number}] {summary}",
            tags=tags + ["summary"],
            source="session_harvest_summary",
            source_type="system",
            category="Conversation Summary",
            user_message=user_msg[:500],
            ai_response=ai_response[:800],
        ),
    ]

    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
        data=json.dumps({"points": points}).encode(),
        headers={"Content-Type": "application/json"},
        method="PUT",
    )

    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
        if result.get("status") == "ok":
            # Remember the hash so later identical turns in this run are
            # skipped without another Qdrant lookup.
            _recent_hashes.add(content_hash)
            return {"skipped": False, "stored": True}
        print(f"[Harvest] Storage error: unexpected status {result.get('status')!r}",
              file=sys.stderr)
    except Exception as e:
        print(f"[Harvest] Storage error: {e}", file=sys.stderr)

    return {"skipped": True, "reason": "upload_failed"}
|
||||
|
||||
def _extract_message_text(msg: Dict[str, Any]) -> str:
    """Flatten a message's ``content`` (string or list of parts) into one string."""
    raw = msg.get('content')
    if isinstance(raw, str):
        return raw
    if not isinstance(raw, list):
        return ""

    parts = []
    for item in raw:
        if not isinstance(item, dict):
            continue
        # A part carrying 'text' is used as text; 'thinking' is only
        # considered when there is no 'text' key at all.
        if 'text' in item:
            if isinstance(item['text'], str):
                parts.append(item['text'])
        elif 'thinking' in item:
            if isinstance(item['thinking'], str):
                parts.append(f"[thinking: {item['thinking'][:200]}...]")
    return "".join(parts)


def parse_session_file(filepath: Path) -> List[Dict]:
    """Parse a session JSONL file and extract conversation turns.

    Each non-empty line is decoded as JSON. Records with ``type == 'message'``
    whose message role is ``user`` or ``assistant`` and whose content is
    non-empty become one turn. Lines that are not valid JSON, and records
    with an unexpected shape, are skipped individually so a single bad
    record does not discard the rest of the file.

    Args:
        filepath: Path to the ``.jsonl`` session file.

    Returns:
        A list of dicts with keys ``turn`` (1-based counter), ``role``,
        ``content`` (truncated to 2000 characters), ``date`` (first 10
        characters of the record's ``timestamp`` string, or today's date
        when that is missing or not a string) and ``session`` (the file
        stem). If the file cannot be read, the error is printed to stderr
        and the turns collected so far are returned.
    """
    turns: List[Dict] = []

    try:
        # Explicit UTF-8 so decoding does not depend on the platform locale;
        # undecodable bytes are replaced rather than aborting the file.
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue

                if not isinstance(entry, dict) or entry.get('type') != 'message':
                    continue
                msg = entry.get('message')
                if not isinstance(msg, dict):
                    continue

                # Only user/assistant messages are turns (this also drops
                # 'toolResult' records).
                role = msg.get('role')
                if role not in ('user', 'assistant'):
                    continue

                content = _extract_message_text(msg)
                if not content:
                    continue

                timestamp = entry.get('timestamp')
                if isinstance(timestamp, str) and timestamp:
                    date_str = timestamp[:10]
                else:
                    date_str = datetime.now().strftime("%Y-%m-%d")

                turns.append({
                    'turn': len(turns) + 1,
                    'role': role,
                    'content': content[:2000],
                    'date': date_str,
                    'session': filepath.stem,
                })
    except Exception as e:
        # Best-effort boundary: report and return whatever was parsed.
        print(f"[Harvest] Error reading {filepath}: {e}", file=sys.stderr)

    return turns
|
||||
|
||||
def main():
    """Command-line entry point: harvest every session file into Qdrant.

    Session files under ``SESSIONS_DIR`` are processed oldest-first (by
    modification time), optionally capped by ``--limit``. Within a file,
    each user turn that is immediately followed by an assistant turn is
    handed to ``store_turn``; totals are printed at the end.
    """
    cli = argparse.ArgumentParser(description="Harvest session files to Qdrant")
    cli.add_argument("--user-id", default="yourname", help="User ID for storage")
    cli.add_argument("--dry-run", action="store_true", help="Don't actually store")
    cli.add_argument("--limit", type=int, default=0, help="Limit sessions (0=all)")
    args = cli.parse_args()

    # Oldest sessions first.
    session_files = sorted(
        SESSIONS_DIR.glob("*.jsonl"),
        key=lambda path: path.stat().st_mtime,
    )
    if args.limit > 0:
        session_files = session_files[:args.limit]

    file_count = len(session_files)
    print(f"Found {file_count} session files")

    total_stored = 0
    total_skipped = 0
    total_failed = 0

    for index, session_file in enumerate(session_files, 1):
        print(f"\n[{index}/{file_count}] Processing: {session_file.name}")

        turns = parse_session_file(session_file)
        if not turns:
            print(" No turns found")
            continue

        print(f" Found {len(turns)} turns")

        # One conversation id per session file.
        conversation_id = str(uuid.uuid4())
        pos = 0
        while pos < len(turns):
            current = turns[pos]
            pos += 1

            # Only a user turn can open a pair.
            if current['role'] != 'user':
                continue

            # The reply must be the very next turn; otherwise the user
            # message is dropped and scanning resumes from that next turn.
            if pos >= len(turns) or turns[pos]['role'] != 'assistant':
                continue

            question = current['content']
            reply = turns[pos]['content']
            pos += 1

            if not (question and reply):
                continue

            outcome = store_turn(
                user_id=args.user_id,
                user_msg=question,
                ai_response=reply,
                date_str=current['date'],
                conversation_id=conversation_id,
                turn_number=current['turn'],
                session_id=current['session'],
                dry_run=args.dry_run,
            )

            if not outcome.get("skipped"):
                total_stored += 1
                if total_stored % 10 == 0:
                    print(f" Progress: {total_stored} stored, {total_skipped} skipped")
            elif outcome.get("reason") == "duplicate":
                total_skipped += 1
            else:
                total_failed += 1

    print("\n" + "=" * 50)
    print("Harvest complete:")
    print(f" Stored: {total_stored} turns ({total_stored * 3} embeddings)")
    print(f" Skipped (duplicates): {total_skipped}")
    print(f" Failed: {total_failed}")

    if args.dry_run:
        print("\n[DRY RUN] Nothing was actually stored")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user