#!/usr/bin/env python3
"""
Harvest all session JSONL files and store them in Qdrant.

Scans all session files, extracts conversation turns, and stores them in
Qdrant with the proper user_id and with deduplication.

Usage: python3 harvest_sessions.py [--user-id rob] [--dry-run]
"""

import argparse
import hashlib
import json
import sys
import urllib.request
import uuid
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_URL = "http://10.0.0.10:11434/v1"
SESSIONS_DIR = Path("/root/.openclaw/agents/main/sessions")

# In-memory cache for deduplication
_recent_hashes = set()
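
# NOTE: this script assumes COLLECTION_NAME already exists in Qdrant with a
# single unnamed vector sized to the embedding model's output (1024 dims for
# snowflake-arctic-embed2 is an assumption; verify against your model).
# A minimal setup sketch:
#
#   curl -X PUT "http://10.0.0.40:6333/collections/kimi_memories" \
#     -H "Content-Type: application/json" \
#     -d '{"vectors": {"size": 1024, "distance": "Cosine"}}'
#
# Optional: keyword payload indexes on user_id and content_hash speed up the
# dedup filter used in is_duplicate():
#
#   curl -X PUT "http://10.0.0.40:6333/collections/kimi_memories/index" \
#     -H "Content-Type: application/json" \
#     -d '{"field_name": "content_hash", "field_schema": "keyword"}'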

def get_content_hash(user_msg: str, ai_response: str) -> str:
    """Generate a hash for deduplication (MD5 is fine here; nothing security-critical)."""
    content = f"{user_msg.strip()}::{ai_response.strip()}"
    return hashlib.md5(content.encode()).hexdigest()

def is_duplicate(user_id: str, content_hash: str) -> bool:
    """Check whether this content already exists for this user."""
    if content_hash in _recent_hashes:
        return True

    try:
        search_body = {
            "filter": {
                "must": [
                    {"key": "user_id", "match": {"value": user_id}},
                    {"key": "content_hash", "match": {"value": content_hash}}
                ]
            },
            "limit": 1,
            "with_payload": False
        }

        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
            data=json.dumps(search_body).encode(),
            headers={"Content-Type": "application/json"}
        )

        with urllib.request.urlopen(req, timeout=10) as response:
            result = json.loads(response.read().decode())
            points = result.get("result", {}).get("points", [])
            if len(points) > 0:
                return True
    except Exception:
        # Fail open: if Qdrant is unreachable, treat the turn as new rather
        # than dropping it silently.
        pass

    return False

def get_embedding(text: str) -> Optional[List[float]]:
    """Generate an embedding with snowflake-arctic-embed2 via Ollama's
    OpenAI-compatible /v1/embeddings endpoint."""
    data = json.dumps({
        "model": "snowflake-arctic-embed2",
        "input": text[:8192]
    }).encode()

    req = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=data,
        headers={"Content-Type": "application/json"}
    )

    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
            # OpenAI-style response shape: {"data": [{"embedding": [...]}], ...}
            return result["data"][0]["embedding"]
    except Exception as e:
        print(f"[Harvest] Embedding error: {e}", file=sys.stderr)
        return None

def store_turn(user_id: str, user_msg: str, ai_response: str,
               date_str: str, conversation_id: str, turn_number: int,
               session_id: str, dry_run: bool = False) -> Dict:
    """Store a single conversation turn in Qdrant as three points:
    user message, AI response, and a Q/A summary."""

    content_hash = get_content_hash(user_msg, ai_response)

    # Skip anything already stored for this user
    if is_duplicate(user_id, content_hash):
        return {"skipped": True, "reason": "duplicate"}

    if dry_run:
        return {"skipped": False, "dry_run": True}

    # Generate embeddings for the user message, the AI response, and a short summary
    user_embedding = get_embedding(f"[{user_id}]: {user_msg}")
    ai_embedding = get_embedding(f"[Kimi]: {ai_response}")
    summary = f"Q: {user_msg[:200]} A: {ai_response[:300]}..."
    summary_embedding = get_embedding(summary)

    if not all([user_embedding, ai_embedding, summary_embedding]):
        return {"skipped": True, "reason": "embedding_failed"}

    tags = ["conversation", "harvested", f"user:{user_id}", date_str]
    importance = "high" if any(kw in (user_msg + ai_response).lower()
                               for kw in ["remember", "important", "always", "never", "rule"]) else "medium"

    points = []
    # User message
    points.append({
        "id": str(uuid.uuid4()),
        "vector": user_embedding,
        "payload": {
            "user_id": user_id,
            "text": f"[{user_id}]: {user_msg[:2000]}",
            "date": date_str,
            "tags": tags + ["user-message"],
            "importance": importance,
            "source": "session_harvest",
            "source_type": "user",
            "category": "Full Conversation",
            "confidence": "high",
            "verified": True,
            "created_at": datetime.now().isoformat(),
            "conversation_id": conversation_id,
            "turn_number": turn_number,
            "session_id": session_id,
            "content_hash": content_hash
        }
    })

    # AI response
    points.append({
        "id": str(uuid.uuid4()),
        "vector": ai_embedding,
        "payload": {
            "user_id": user_id,
            "text": f"[Kimi]: {ai_response[:2000]}",
            "date": date_str,
            "tags": tags + ["ai-response"],
            "importance": importance,
            "source": "session_harvest",
            "source_type": "assistant",
            "category": "Full Conversation",
            "confidence": "high",
            "verified": True,
            "created_at": datetime.now().isoformat(),
            "conversation_id": conversation_id,
            "turn_number": turn_number,
            "session_id": session_id,
            "content_hash": content_hash
        }
    })

    # Summary (all three embeddings were verified above)
    points.append({
        "id": str(uuid.uuid4()),
        "vector": summary_embedding,
        "payload": {
            "user_id": user_id,
            "text": f"[Turn {turn_number}] {summary}",
            "date": date_str,
            "tags": tags + ["summary"],
            "importance": importance,
            "source": "session_harvest_summary",
            "source_type": "system",
            "category": "Conversation Summary",
            "confidence": "high",
            "verified": True,
            "created_at": datetime.now().isoformat(),
            "conversation_id": conversation_id,
            "turn_number": turn_number,
            "session_id": session_id,
            "content_hash": content_hash,
            "user_message": user_msg[:500],
            "ai_response": ai_response[:800]
        }
    })

    # Upload all three points in one upsert
    upsert_data = {"points": points}

    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
        data=json.dumps(upsert_data).encode(),
        headers={"Content-Type": "application/json"},
        method="PUT"
    )

    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
            if result.get("status") == "ok":
                _recent_hashes.add(content_hash)
                return {"skipped": False, "stored": True}
    except Exception as e:
        print(f"[Harvest] Storage error: {e}", file=sys.stderr)

    return {"skipped": True, "reason": "upload_failed"}
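
# Each session file is JSONL. An illustrative line, inferred from the parsing
# logic below (these are the field names the parser expects; the exact schema
# of your session files may differ):
#
#   {"type": "message", "timestamp": "2025-01-15T10:32:00Z",
#    "message": {"role": "user", "content": [{"type": "text", "text": "hi"}]}}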

def parse_session_file(filepath: Path) -> List[Dict]:
    """Parse a session JSONL file and extract conversation turns."""
    turns = []
    turn_number = 0

    try:
        with open(filepath, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                    if entry.get('type') == 'message' and 'message' in entry:
                        msg = entry['message']
                        role = msg.get('role')

                        # Tool output is noise for memory purposes
                        if role == 'toolResult':
                            continue

                        # Content may be a list of blocks or a plain string
                        content = ""
                        if isinstance(msg.get('content'), list):
                            for item in msg['content']:
                                if isinstance(item, dict):
                                    if 'text' in item:
                                        content += item['text']
                                    elif 'thinking' in item:
                                        content += f"[thinking: {item['thinking'][:200]}...]"
                        elif isinstance(msg.get('content'), str):
                            content = msg['content']

                        if content and role in ('user', 'assistant'):
                            turn_number += 1
                            timestamp = entry.get('timestamp', '')
                            date_str = timestamp[:10] if timestamp else datetime.now().strftime("%Y-%m-%d")

                            turns.append({
                                'turn': turn_number,
                                'role': role,
                                'content': content[:2000],
                                'date': date_str,
                                'session': filepath.stem
                            })
                except json.JSONDecodeError:
                    # Skip malformed lines rather than aborting the whole file
                    continue
    except Exception as e:
        print(f"[Harvest] Error reading {filepath}: {e}", file=sys.stderr)

    return turns

def main():
    parser = argparse.ArgumentParser(description="Harvest session files to Qdrant")
    parser.add_argument("--user-id", default="yourname", help="User ID for storage")
    parser.add_argument("--dry-run", action="store_true", help="Don't actually store")
    parser.add_argument("--limit", type=int, default=0, help="Limit sessions (0=all)")
    args = parser.parse_args()

    # Find all session files, oldest first
    session_files = sorted(SESSIONS_DIR.glob("*.jsonl"), key=lambda p: p.stat().st_mtime)

    if args.limit > 0:
        session_files = session_files[:args.limit]

    print(f"Found {len(session_files)} session files")

    total_stored = 0
    total_skipped = 0
    total_failed = 0

    for i, session_file in enumerate(session_files, 1):
        print(f"\n[{i}/{len(session_files)}] Processing: {session_file.name}")

        turns = parse_session_file(session_file)
        if not turns:
            print(" No turns found")
            continue

        print(f" Found {len(turns)} turns")

        # Pair each user message with the AI response that follows it
        conversation_id = str(uuid.uuid4())
        j = 0
        while j < len(turns):
            turn = turns[j]

            if turn['role'] == 'user':
                user_msg = turn['content']
                ai_response = ""

                # Look for the next AI response
                if j + 1 < len(turns) and turns[j + 1]['role'] == 'assistant':
                    ai_response = turns[j + 1]['content']
                    j += 2
                else:
                    j += 1

                if user_msg and ai_response:
                    result = store_turn(
                        user_id=args.user_id,
                        user_msg=user_msg,
                        ai_response=ai_response,
                        date_str=turn['date'],
                        conversation_id=conversation_id,
                        turn_number=turn['turn'],
                        session_id=turn['session'],
                        dry_run=args.dry_run
                    )

                    if result.get("skipped"):
                        if result.get("reason") == "duplicate":
                            total_skipped += 1
                        else:
                            total_failed += 1
                    else:
                        total_stored += 1
                        if total_stored % 10 == 0:
                            print(f" Progress: {total_stored} stored, {total_skipped} skipped")
            else:
                j += 1

    print(f"\n{'='*50}")
    print("Harvest complete:")
    print(f" Stored: {total_stored} turns ({total_stored * 3} embeddings)")
    print(f" Skipped (duplicates): {total_skipped}")
    print(f" Failed: {total_failed}")

    if args.dry_run:
        print("\n[DRY RUN] Nothing was actually stored")

if __name__ == "__main__":
    main()
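
# Typical invocations (a sketch; the default --user-id "yourname" is a
# placeholder you will want to override):
#   python3 harvest_sessions.py --user-id rob --dry-run   # count turns, store nothing
#   python3 harvest_sessions.py --user-id rob --limit 5   # oldest five sessions only
#   python3 harvest_sessions.py --user-id rob             # full harvest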