Files
true-recall/migrate_memories.py

187 lines
5.3 KiB
Python

#!/usr/bin/env python3
"""
Migrate memories from kimi_memories to memories_tr
- Reads from kimi_memories (Qdrant)
- Cleans/strips noise (metadata, thinking tags)
- Stores to memories_tr (Qdrant)
- Keeps original kimi_memories intact
"""
import json
import urllib.request
import urllib.error
from datetime import datetime
from typing import List, Dict, Any
# Base URL of the Qdrant REST API; every request in this script is built on it.
QDRANT_URL = "http://10.0.0.40:6333"
# Collection the points are read from (this script never writes to it).
SOURCE_COLLECTION = "kimi_memories"
# Collection the cleaned points are written to.
TARGET_COLLECTION = "memories_tr"
def clean_content(text: str) -> str:
    """Strip known noise patterns from a message and tidy its whitespace.

    Applied in order:
      1. "Conversation info (untrusted metadata):" fenced JSON blocks
      2. ``[thinking: ...]`` tags
      3. bracketed timestamps such as ``[Mon 2024-01-15 10:30 UTC]``
      4. runs of three or more newlines, collapsed to exactly two

    The result is finally stripped of leading/trailing whitespace.

    Args:
        text: Raw message text. Any falsy value yields "".

    Returns:
        The cleaned text.
    """
    # Kept function-local: the module's top-level imports do not include re.
    import re

    if not text:
        return ""

    # (pattern, replacement) pairs; order matters because the whitespace
    # collapse must see the gaps left behind by the earlier removals.
    substitutions = (
        (r'Conversation info \(untrusted metadata\):\s*```json\s*\{[\s\S]*?\}\s*```', ''),
        (r'\[thinking:[^\]]*\]', ''),
        (r'\[\w{3} \d{4}-\d{2}-\d{2} \d{2}:\d{2} [A-Z]{3}\]', ''),
        (r'\n{3,}', '\n\n'),
    )

    result = text
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()
def get_all_points(collection: str) -> List[Dict]:
    """Fetch every point of a Qdrant collection via the scroll endpoint.

    Pages through ``POST /collections/{collection}/points/scroll`` 100 points
    at a time, requesting both payloads and vectors, and follows the
    ``next_page_offset`` returned by each page until none is returned.

    Args:
        collection: Name of the collection to read.

    Returns:
        The points gathered. If a request fails, the error is printed and
        whatever was collected before the failure is returned, so the list
        may be partial or empty.
    """
    max_iterations = 1000  # safety cap so a misbehaving server cannot loop us forever
    all_points = []
    offset = None

    for _ in range(max_iterations):
        scroll_data = {
            "limit": 100,
            "with_payload": True,
            "with_vector": True,
        }
        # Compare against None explicitly: a falsy offset (e.g. integer ID 0)
        # is still a real offset and must be forwarded.
        if offset is not None:
            scroll_data["offset"] = offset

        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{collection}/points/scroll",
            data=json.dumps(scroll_data).encode(),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=60) as response:
                result = json.loads(response.read().decode())
        except urllib.error.URLError as e:
            # URLError is the base class of HTTPError, so this covers both
            # HTTP error statuses and connection-level failures.
            print(f"Error: {e}")
            break

        page = result.get("result", {})
        points = page.get("points", [])
        if not points:
            break
        all_points.extend(points)

        offset = page.get("next_page_offset")
        if offset is None:
            break
    else:
        # Only reached when the cap was hit while the server still had pages.
        print(
            f"Warning: stopped after {max_iterations} pages; "
            f"{collection} may contain more points"
        )

    return all_points
def store_points(collection: str, points: List[Dict]) -> int:
    """Upsert points into a Qdrant collection in batches of 100.

    Each batch is sent with ``PUT /collections/{collection}/points?wait=true``
    so the request only returns once the write has been applied; a scroll of
    the collection issued afterwards therefore sees the stored points.

    Args:
        collection: Name of the collection to write to.
        points: Point dicts to send as the request's ``points`` list.

    Returns:
        Number of points in batches that were answered with HTTP 200.
        A failed batch is reported on stdout and skipped; later batches are
        still attempted.
    """
    if not points:
        return 0

    batch_size = 100
    stored = 0

    for start in range(0, len(points), batch_size):
        batch = points[start:start + batch_size]
        req = urllib.request.Request(
            # wait=true: without it Qdrant may acknowledge before applying
            # the write, which would make an immediate count too low.
            f"{QDRANT_URL}/collections/{collection}/points?wait=true",
            data=json.dumps({"points": batch}).encode(),
            headers={"Content-Type": "application/json"},
            method="PUT",
        )
        try:
            with urllib.request.urlopen(req, timeout=60) as response:
                if response.status == 200:
                    stored += len(batch)
        except urllib.error.URLError as e:
            # URLError is the base class of HTTPError, so connection failures
            # are reported the same way as HTTP error statuses.
            print(f"Error storing batch: {e}")

    return stored
def migrate_point(point: Dict) -> Dict:
    """Build the cleaned copy of a single point.

    The ``user_message`` and ``ai_response`` payload fields are passed
    through ``clean_content``; all other payload fields are carried over
    unchanged, and ``migrated_from`` / ``migrated_at`` markers are added.

    Args:
        point: Source point dict; its ``id``, ``vector`` and ``payload``
            keys are read.

    Returns:
        A new dict with the original ``id`` and ``vector`` and the cleaned
        payload. The input point is not modified.
    """
    # `or {}` also covers a payload key that is present but null, which
    # `.get("payload", {})` would hand back as None.
    payload = point.get("payload") or {}

    cleaned_payload = {
        **payload,
        "user_message": clean_content(payload.get("user_message", "")),
        "ai_response": clean_content(payload.get("ai_response", "")),
        "migrated_from": "kimi_memories",
        "migrated_at": datetime.now().isoformat(),
    }

    return {
        "id": point.get("id"),
        "vector": point.get("vector"),
        "payload": cleaned_payload,
    }
def main():
    """Run the migration end to end and print a progress report.

    Steps: read all points from SOURCE_COLLECTION, clean each one, store the
    cleaned points in TARGET_COLLECTION, re-read the target to report its
    size, then print a summary. Returns early if the source yields no points.
    """
    banner = "=" * 60

    print(banner)
    print("Memory Migration: kimi_memories → memories_tr")
    print(banner)
    print()

    # Read the source collection
    print(f"📥 Reading from {SOURCE_COLLECTION}...")
    source_points = get_all_points(SOURCE_COLLECTION)
    total = len(source_points)
    print(f" Found {total} points")

    if total == 0:
        print("❌ No points to migrate")
        return

    # Clean every point
    print(f"\n🧹 Cleaning {total} points...")
    cleaned_points = list(map(migrate_point, source_points))
    print(" ✓ Cleaned")

    # Write to the target collection
    print(f"\n💾 Storing to {TARGET_COLLECTION}...")
    stored = store_points(TARGET_COLLECTION, cleaned_points)
    print(f" ✓ Stored {stored} points")

    # Re-read the target to report its size
    print("\n🔍 Verifying...")
    target_total = len(get_all_points(TARGET_COLLECTION))
    print(f" Target now has {target_total} points")

    # Summary
    print()
    print(banner)
    print("Migration Summary:")
    print(f" Source ({SOURCE_COLLECTION}): {total} points")
    print(f" Target ({TARGET_COLLECTION}): {target_total} points")
    print(f" Cleaned & migrated: {stored} points")
    print(banner)

    if stored != total:
        print(f"\n⚠️ Warning: Only migrated {stored}/{total} points")
    else:
        print("\n✅ Migration complete!")
# Run the migration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()