187 lines
5.3 KiB
Python
187 lines
5.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Migrate memories from kimi_memories to memories_tr
|
|
- Reads from kimi_memories (Qdrant)
|
|
- Cleans/strips noise (metadata, thinking tags)
|
|
- Stores to memories_tr (Qdrant)
|
|
- Keeps original kimi_memories intact
|
|
"""
|
|
|
|
import json
import re
import urllib.error
import urllib.request
from datetime import datetime
from typing import Any, Dict, List
|
|
|
|
QDRANT_URL = "http://10.0.0.40:6333"
|
|
SOURCE_COLLECTION = "kimi_memories"
|
|
TARGET_COLLECTION = "memories_tr"
|
|
|
|
def clean_content(text: str) -> str:
    """Strip extraction noise from a memory's text content.

    Removes fenced JSON metadata blocks, inline ``[thinking: ...]``
    annotations, and bracketed timestamp markers, then collapses runs of
    blank lines and trims surrounding whitespace.

    Args:
        text: Raw memory content; may be empty or falsy.

    Returns:
        The cleaned text, or "" when *text* is empty/falsy.
    """
    if not text:
        return ""

    cleaned = text

    # Remove "Conversation info (untrusted metadata): ```json {...}```" blocks.
    cleaned = re.sub(
        r'Conversation info \(untrusted metadata\):\s*```json\s*\{[\s\S]*?\}\s*```',
        '',
        cleaned,
    )

    # Remove inline [thinking: ...] annotations.
    cleaned = re.sub(r'\[thinking:[^\]]*\]', '', cleaned)

    # Remove timestamp markers like "[Mon 2024-01-02 03:04 UTC]".
    cleaned = re.sub(r'\[\w{3} \d{4}-\d{2}-\d{2} \d{2}:\d{2} [A-Z]{3}\]', '', cleaned)

    # Collapse 3+ consecutive newlines into one blank line, then trim ends.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    return cleaned.strip()
|
|
|
|
def get_all_points(collection: str) -> List[Dict]:
    """Fetch every point (payload and vector) from a Qdrant collection.

    Pages through the scroll API 100 points at a time, with a hard cap on
    the number of requests as a guard against non-terminating pagination.

    Args:
        collection: Name of the Qdrant collection to read.

    Returns:
        A list of point dicts as returned by Qdrant; whatever was fetched
        so far (possibly empty) if a request fails.
    """
    all_points: List[Dict] = []
    offset = None
    max_iterations = 1000  # safety cap: at most 100k points

    for _ in range(max_iterations):
        scroll_data: Dict[str, Any] = {
            "limit": 100,
            "with_payload": True,
            "with_vector": True,
        }
        # Compare against None, not truthiness: a falsy cursor (e.g. the
        # integer point id 0) is a valid next_page_offset and must be sent.
        if offset is not None:
            scroll_data["offset"] = offset

        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{collection}/points/scroll",
            data=json.dumps(scroll_data).encode(),
            headers={"Content-Type": "application/json"},
            method="POST",
        )

        try:
            with urllib.request.urlopen(req, timeout=60) as response:
                result = json.loads(response.read().decode())
        except urllib.error.URLError as e:  # also covers HTTPError (subclass)
            print(f"Error: {e}")
            break

        points = result.get("result", {}).get("points", [])
        if not points:
            break
        all_points.extend(points)

        offset = result.get("result", {}).get("next_page_offset")
        if offset is None:  # None signals the final page
            break

    return all_points
|
|
|
|
def store_points(collection: str, points: List[Dict]) -> int:
    """Upsert points into a Qdrant collection in batches of 100.

    Args:
        collection: Target collection name.
        points: Point dicts carrying "id", "vector" and "payload".

    Returns:
        The number of points successfully stored (0 for empty input).
        Failed batches are logged and skipped rather than aborting the run.
    """
    if not points:
        return 0

    batch_size = 100
    stored = 0

    for i in range(0, len(points), batch_size):
        batch = points[i:i + batch_size]

        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{collection}/points",
            data=json.dumps({"points": batch}).encode(),
            headers={"Content-Type": "application/json"},
            method="PUT",
        )

        try:
            with urllib.request.urlopen(req, timeout=60) as response:
                if response.status == 200:
                    stored += len(batch)
        # Catch URLError (not just HTTPError) so a connection-level failure
        # (refused socket, DNS) skips the batch instead of crashing mid-run.
        except urllib.error.URLError as e:
            print(f"Error storing batch: {e}")

    return stored
|
|
|
|
def migrate_point(point: Dict) -> Dict:
    """Return a cleaned copy of *point* ready for the target collection.

    The two free-text payload fields are scrubbed with clean_content and
    migration provenance fields are appended; id and vector pass through
    unchanged. The input dict is not modified.
    """
    original_payload = point.get("payload", {})

    # Scrub noise from the free-text fields; missing fields become "".
    scrubbed = {
        field: clean_content(original_payload.get(field, ""))
        for field in ("user_message", "ai_response")
    }

    new_payload = dict(original_payload)
    new_payload.update(scrubbed)
    new_payload["migrated_from"] = "kimi_memories"
    new_payload["migrated_at"] = datetime.now().isoformat()

    return {
        "id": point.get("id"),
        "vector": point.get("vector"),
        "payload": new_payload,
    }
|
|
|
|
def main():
    """Drive the migration end to end: read, clean, store, verify, report."""
    divider = "=" * 60
    print(divider)
    print("Memory Migration: kimi_memories → memories_tr")
    print(divider)
    print()

    # Step 1: pull everything out of the source collection.
    print(f"📥 Reading from {SOURCE_COLLECTION}...")
    source_points = get_all_points(SOURCE_COLLECTION)
    print(f" Found {len(source_points)} points")

    if not source_points:
        print("❌ No points to migrate")
        return

    # Step 2: scrub noise from every point's payload.
    print(f"\n🧹 Cleaning {len(source_points)} points...")
    cleaned_points = list(map(migrate_point, source_points))
    print(" ✓ Cleaned")

    # Step 3: upsert the cleaned points into the target collection.
    print(f"\n💾 Storing to {TARGET_COLLECTION}...")
    stored = store_points(TARGET_COLLECTION, cleaned_points)
    print(f" ✓ Stored {stored} points")

    # Step 4: re-read the target to confirm the writes landed.
    print("\n🔍 Verifying...")
    target_points = get_all_points(TARGET_COLLECTION)
    print(f" Target now has {len(target_points)} points")

    # Step 5: summary report.
    print()
    print(divider)
    print("Migration Summary:")
    print(f" Source ({SOURCE_COLLECTION}): {len(source_points)} points")
    print(f" Target ({TARGET_COLLECTION}): {len(target_points)} points")
    print(f" Cleaned & migrated: {stored} points")
    print(divider)

    if stored == len(source_points):
        print("\n✅ Migration complete!")
    else:
        print(f"\n⚠️ Warning: Only migrated {stored}/{len(source_points)} points")
|
|
|
|
# Script entry point: run the migration when executed directly.
if __name__ == "__main__":
    main()