Initial commit: TrueRecall v2.2 with 30b curator and timer-based curation
This commit is contained in:
187
migrate_memories.py
Normal file
187
migrate_memories.py
Normal file
@@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Migrate memories from kimi_memories to memories_tr
|
||||
- Reads from kimi_memories (Qdrant)
|
||||
- Cleans/strips noise (metadata, thinking tags)
|
||||
- Stores to memories_tr (Qdrant)
|
||||
- Keeps original kimi_memories intact
|
||||
"""
|
||||
|
||||
import json
import re
import urllib.error
import urllib.request
from datetime import datetime
from typing import Any, Dict, List
# Qdrant HTTP API endpoint and the two collections involved in the migration.
QDRANT_URL = "http://10.0.0.40:6333"
SOURCE_COLLECTION = "kimi_memories"  # read-only source; left intact
TARGET_COLLECTION = "memories_tr"    # cleaned copies are written here
# Noise patterns, compiled once at import time. The original re-ran
# `import re` and re-resolved each pattern on every call; hoisting is the
# standard loop-invariant fix and the regexes themselves are unchanged.
_METADATA_RE = re.compile(
    r'Conversation info \(untrusted metadata\):\s*```json\s*\{[\s\S]*?\}\s*```'
)
_THINKING_RE = re.compile(r'\[thinking:[^\]]*\]')
_TIMESTAMP_RE = re.compile(r'\[\w{3} \d{4}-\d{2}-\d{2} \d{2}:\d{2} [A-Z]{3}\]')
_EXTRA_BLANKS_RE = re.compile(r'\n{3,}')


def clean_content(text: str) -> str:
    """Strip conversation noise from *text* and return the cleaned string.

    Removes, in order:
      - ``Conversation info (untrusted metadata)`` fenced-JSON blocks
      - ``[thinking: ...]`` tags
      - ``[Mon 2024-01-02 03:04 UTC]``-style timestamp markers

    Runs of three or more newlines are collapsed to a single blank line,
    and the result is stripped of surrounding whitespace. Empty or falsy
    input (including None) yields "".
    """
    if not text:
        return ""

    cleaned = _METADATA_RE.sub('', text)
    cleaned = _THINKING_RE.sub('', cleaned)
    cleaned = _TIMESTAMP_RE.sub('', cleaned)
    cleaned = _EXTRA_BLANKS_RE.sub('\n\n', cleaned)
    return cleaned.strip()
||||
def get_all_points(collection: str) -> List[Dict]:
    """Fetch every point in *collection* via Qdrant's scroll API.

    Pages through ``/points/scroll`` 100 points at a time (payload and
    vector included) until the server stops returning a
    ``next_page_offset``. Network errors are printed and end the scan
    early, returning whatever was fetched so far.
    """
    all_points: List[Dict] = []
    offset = None
    max_iterations = 1000  # hard cap: safety net against a misbehaving server
    iterations = 0

    while iterations < max_iterations:
        iterations += 1
        scroll_data: Dict[str, Any] = {
            "limit": 100,
            "with_payload": True,
            "with_vector": True,
        }
        # Offsets are point IDs, which may legitimately be falsy (e.g. 0),
        # so compare against None instead of using truthiness.
        if offset is not None:
            scroll_data["offset"] = offset

        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{collection}/points/scroll",
            data=json.dumps(scroll_data).encode(),
            headers={"Content-Type": "application/json"},
            method="POST",
        )

        try:
            with urllib.request.urlopen(req, timeout=60) as response:
                result = json.loads(response.read().decode())
        except urllib.error.URLError as e:  # URLError also covers HTTPError
            print(f"Error: {e}")
            break

        page = result.get("result", {})
        points = page.get("points", [])
        if not points:
            break
        all_points.extend(points)

        offset = page.get("next_page_offset")
        if offset is None:
            break

    return all_points
def store_points(collection: str, points: List[Dict]) -> int:
    """Upsert *points* into *collection* in batches of 100 via PUT.

    Returns the number of points acknowledged with HTTP 200. Failed
    batches are reported and skipped rather than aborting, so the return
    value may be less than ``len(points)``; the caller compares the two
    to detect a partial migration.
    """
    if not points:
        return 0

    batch_size = 100
    stored = 0

    for start in range(0, len(points), batch_size):
        batch = points[start:start + batch_size]

        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{collection}/points",
            data=json.dumps({"points": batch}).encode(),
            headers={"Content-Type": "application/json"},
            method="PUT",
        )

        try:
            with urllib.request.urlopen(req, timeout=60) as response:
                if response.status == 200:
                    stored += len(batch)
        except urllib.error.URLError as e:  # URLError also covers HTTPError
            print(f"Error storing batch: {e}")

    return stored
def migrate_point(point: Dict) -> Dict:
    """Return a copy of *point* with its text payload fields cleaned.

    The ``user_message`` and ``ai_response`` payload entries are run
    through ``clean_content``, and two provenance fields
    (``migrated_from``, ``migrated_at``) are stamped on. All other
    payload keys, the id, and the vector are carried over untouched.
    """
    original_payload = point.get("payload", {})

    new_payload = dict(original_payload)
    new_payload["user_message"] = clean_content(original_payload.get("user_message", ""))
    new_payload["ai_response"] = clean_content(original_payload.get("ai_response", ""))
    new_payload["migrated_from"] = "kimi_memories"
    new_payload["migrated_at"] = datetime.now().isoformat()

    return {
        "id": point.get("id"),
        "vector": point.get("vector"),
        "payload": new_payload,
    }
def main():
    """Drive the one-shot migration: fetch, clean, store, then verify."""
    rule = "=" * 60
    print(rule)
    print("Memory Migration: kimi_memories → memories_tr")
    print(rule)
    print()

    # Pull every point out of the source collection.
    print(f"📥 Reading from {SOURCE_COLLECTION}...")
    src = get_all_points(SOURCE_COLLECTION)
    print(f" Found {len(src)} points")

    if not src:
        print("❌ No points to migrate")
        return

    # Strip noise from each point's payload.
    print(f"\n🧹 Cleaning {len(src)} points...")
    cleaned = [migrate_point(p) for p in src]
    print(f" ✓ Cleaned")

    # Upload the cleaned copies.
    print(f"\n💾 Storing to {TARGET_COLLECTION}...")
    n_stored = store_points(TARGET_COLLECTION, cleaned)
    print(f" ✓ Stored {n_stored} points")

    # Re-read the target so the summary reflects what actually landed.
    print(f"\n🔍 Verifying...")
    in_target = get_all_points(TARGET_COLLECTION)
    print(f" Target now has {len(in_target)} points")

    print()
    print(rule)
    print("Migration Summary:")
    print(f" Source ({SOURCE_COLLECTION}): {len(src)} points")
    print(f" Target ({TARGET_COLLECTION}): {len(in_target)} points")
    print(f" Cleaned & migrated: {n_stored} points")
    print(rule)

    if n_stored == len(src):
        print("\n✅ Migration complete!")
    else:
        print(f"\n⚠️ Warning: Only migrated {n_stored}/{len(src)} points")


if __name__ == "__main__":
    main()
Reference in New Issue
Block a user