Files
jarvis-memory/skills/qdrant-memory/scripts/bulk_migrate.py

299 lines
9.9 KiB
Python
Raw Normal View History

2026-02-23 12:13:04 -06:00
#!/usr/bin/env python3
"""
Bulk memory migration to Qdrant kimi_memories collection
Uses snowflake-arctic-embed2 (1024 dimensions)
"""
import json
import os
import re
import sys
import urllib.request
import uuid
from datetime import datetime
# Base URL of the Qdrant REST API; store_memory() appends the collection's
# points endpoint to it.
QDRANT_URL = "http://10.0.0.40:6333"
# Name of the Qdrant collection that receives every migrated memory.
COLLECTION_NAME = "kimi_memories"
# Base URL of the Ollama API; get_embedding() appends "/embeddings" to it.
OLLAMA_URL = "http://localhost:11434/v1"
# Directory that main() lists for markdown files to migrate.
MEMORY_DIR = "/root/.openclaw/workspace/memory"
# Markdown file read by extract_core_memories_from_memory_md().
MEMORY_MD = "/root/.openclaw/workspace/MEMORY.md"
def get_embedding(text):
    """Generate an embedding for *text* using snowflake-arctic-embed2 via Ollama.

    POSTs a JSON body to ``{OLLAMA_URL}/embeddings`` and reads the vector
    from ``data[0]["embedding"]`` in the JSON reply.

    Args:
        text: Text to embed. Only the first 8192 characters are sent.

    Returns:
        The embedding taken from the response, or ``None`` when *text* is
        empty/whitespace-only, the request fails, or the response does not
        contain a usable vector. Failures are reported on stderr.
    """
    # Nothing meaningful to embed: skip the network round trip instead of
    # sending an empty input to the model.
    if not text or (isinstance(text, str) and not text.strip()):
        print("Error generating embedding: empty text", file=sys.stderr)
        return None

    data = json.dumps({
        "model": "snowflake-arctic-embed2",
        "input": text[:8192],  # Limit text length
    }).encode()
    req = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=data,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=60) as response:
            result = json.loads(response.read().decode())
        embedding = result["data"][0]["embedding"]
        # An empty/null vector cannot be stored; treat it like any other
        # failed request so the caller counts it as a failure.
        if not embedding:
            raise ValueError("response contained an empty embedding")
        return embedding
    except Exception as e:
        # Best effort: one failed embedding must not abort a bulk run.
        print(f"Error generating embedding: {e}", file=sys.stderr)
        return None
def store_memory(text, embedding, tags=None, importance="medium", date=None,
                 source="memory_backup", confidence="high", source_type="user",
                 verified=True):
    """Store one memory in Qdrant with metadata.

    Builds a single point with a fresh UUID, *embedding* as its vector and
    the remaining arguments as payload, then upserts it into
    ``COLLECTION_NAME`` and waits for the operation (``wait=true``).

    Args:
        text: Memory text, stored in the payload.
        embedding: Vector to store for the point.
        tags: Optional list of tags; stored as ``[]`` when falsy.
        importance: Importance label stored in the payload.
        date: ``YYYY-MM-DD`` string; defaults to today's local date.
        source: Origin label stored in the payload.
        confidence: Confidence label stored in the payload.
        source_type: Source-type label stored in the payload.
        verified: Verification flag stored in the payload.

    Returns:
        ``True`` when Qdrant reports the upsert as accepted, ``False`` on
        any request, HTTP, or response-parsing error (logged to stderr).
    """
    now = datetime.now()
    if date is None:
        date = now.strftime("%Y-%m-%d")

    point = {
        "id": str(uuid.uuid4()),
        "vector": embedding,
        "payload": {
            "text": text,
            "date": date,
            "tags": tags or [],
            "importance": importance,
            "confidence": confidence,
            "source_type": source_type,
            "verified": verified,
            "source": source,
            "created_at": now.isoformat(),
            "access_count": 0,
        },
    }

    data = json.dumps({"points": [point]}).encode()
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
        data=data,
        headers={"Content-Type": "application/json"},
        # Upsert is a PUT on the points endpoint; urllib would default to
        # POST here, which Qdrant treats as "retrieve points by id".
        method="PUT",
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            body = json.loads(response.read().decode())
        # Qdrant replies with a top-level "status": "ok" and the state of
        # the update operation under result.status.
        operation = body.get("result") or {}
        return (
            body.get("status") == "ok"
            and operation.get("status") in ("completed", "acknowledged")
        )
    except Exception as e:
        # Best effort: report and let the caller count the failure.
        print(f"Error storing memory: {e}", file=sys.stderr)
        return False
def _append_section_memory(memories, section, section_lines, date, importance):
    """Append a memory for *section* when its body is longer than 20 chars."""
    if not section or not section_lines:
        return
    section_text = '\n'.join(section_lines).strip()
    if len(section_text) > 20:
        memories.append({
            "text": f"{section}: {section_text}",
            "date": date,
            "tags": extract_tags(section, section_text),
            "importance": importance,
        })


def extract_memories_from_file(filepath, importance="medium"):
    """Extract memory entries from a markdown file.

    Every ``## `` or ``### `` header starts a new section; the lines up to
    the next such header form its body. Sections whose stripped body is
    longer than 20 characters become one memory each. A ``# `` title line
    containing "Memory" is skipped, and lines before the first header are
    ignored.

    Args:
        filepath: Path of the markdown file. The first ``YYYY-MM-DD``
            found in the path is used as the memories' date, otherwise
            today's local date.
        importance: Importance label attached to every extracted memory.

    Returns:
        A list of dicts with ``text``, ``date``, ``tags`` and
        ``importance`` keys; empty when the file cannot be read (the error
        is printed to stderr).
    """
    memories = []
    try:
        # Read as UTF-8 explicitly so non-ASCII notes do not depend on the
        # process locale.
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except Exception as e:
        print(f"Error reading {filepath}: {e}", file=sys.stderr)
        return memories

    # Extract date from the file path, falling back to today.
    date_match = re.search(r'(\d{4}-\d{2}-\d{2})', filepath)
    date = date_match.group(1) if date_match else datetime.now().strftime("%Y-%m-%d")

    current_section = None
    current_content = []
    for line in content.split('\n'):
        if line.startswith('# ') and 'Memory' in line:
            continue  # Skip title
        if line.startswith(('## ', '### ')):
            # A new header closes the section collected so far.
            _append_section_memory(
                memories, current_section, current_content, date, importance)
            header = line[3:] if line.startswith('## ') else line[4:]
            current_section = header.strip()
            current_content = []
        elif current_section:
            current_content.append(line)

    # Save final section
    _append_section_memory(
        memories, current_section, current_content, date, importance)
    return memories
def extract_tags(section, content):
    """Extract relevant tags from a section title and its content.

    Section keywords are matched as case-insensitive substrings of
    *section*; each matching keyword group contributes its tag group.
    Content keywords are matched as case-insensitive substrings of
    *content*, except the user's name ("rob"), which must appear as a
    whole word so that words such as "problem" or "probably" do not
    produce a ``user`` tag.

    Args:
        section: Section title.
        content: Section body text.

    Returns:
        A list of unique tags in first-seen order.
    """
    # (keywords looked for in the section title, tags they contribute)
    section_rules = (
        (('voice', 'tts', 'stt', 'audio'), ('voice', 'audio')),
        (('memory', 'qdrant', 'remember'), ('memory', 'qdrant')),
        (('redis', 'agent', 'message', 'max'), ('redis', 'messaging', 'agent')),
        (('youtube', 'seo', 'content'), ('youtube', 'content')),
        (('search', 'searxng', 'web'), ('search', 'web')),
        (('setup', 'install', 'bootstrap'), ('setup', 'configuration')),
    )
    section_lower = section.lower()
    content_lower = content.lower()

    tags = []
    # Section-based tags
    for keywords, rule_tags in section_rules:
        if any(word in section_lower for word in keywords):
            tags.extend(rule_tags)

    # Content-based tags
    for keyword in ('voice', 'memory', 'qdrant', 'redis', 'youtube'):
        if keyword in content_lower:
            tags.append(keyword)
    # Whole-word match: a plain substring test also fires on "problem",
    # "probably", "robust", ...
    if re.search(r'\brob\b', content_lower):
        tags.append('user')

    # Remove duplicates while keeping a deterministic order.
    return list(dict.fromkeys(tags))
def extract_core_memories_from_memory_md():
    """Extract high-importance memories from MEMORY.md.

    For each known core ``## `` section, every ``### `` subsection whose
    text (header line included) is longer than 50 characters becomes one
    memory. The memory text is ``"<section> - <subsection>: <text>"`` with
    the subsection text truncated to 500 characters, tagged with the
    result of ``extract_tags`` plus ``core`` and ``longterm``.

    Returns:
        A list of dicts with ``text``, ``date``, ``tags`` and
        ``importance`` keys; empty when MEMORY.md cannot be read (the
        error is printed to stderr).
    """
    memories = []
    try:
        with open(MEMORY_MD, 'r', encoding='utf-8') as f:
            content = f.read()
    except Exception as e:
        print(f"Error reading MEMORY.md: {e}", file=sys.stderr)
        return memories

    # Core sections with high importance
    sections = [
        ("Identity & Names", "high"),
        ("Core Preferences", "high"),
        ("Communication Rules", "high"),
        ("Voice Settings", "high"),
        ("Lessons Learned", "high"),
    ]

    # Headers are anchored at line start: an unanchored "## " lookahead
    # also matches inside "### " and would cut each section off at its
    # first subsection.
    flags = re.DOTALL | re.MULTILINE
    subsection_re = re.compile(r"^### ([^\n]+)\n?.*?(?=^### |\Z)", flags)

    for section_name, importance in sections:
        section_match = re.search(
            rf"^## {re.escape(section_name)}.*?(?=^## |\Z)", content, flags)
        if not section_match:
            continue
        section_text = section_match.group(0).strip()

        # Walk the subsections in document order; finditer also copes with
        # two subsections sharing the same title.
        for sub_match in subsection_re.finditer(section_text):
            sub = sub_match.group(1).strip()
            sub_text = sub_match.group(0).strip()
            if len(sub_text) > 50:
                memories.append({
                    "text": f"{section_name} - {sub}: {sub_text[:500]}",
                    # Fixed date used for all core memories.
                    "date": "2026-02-10",
                    "tags": extract_tags(section_name, sub_text) + ['core', 'longterm'],
                    "importance": importance,
                })
    return memories
def main():
    """Run the bulk migration and report the outcome.

    Collects memories from every ``2026*.md`` file in ``MEMORY_DIR`` and
    from MEMORY.md, embeds each one with ``get_embedding`` and stores it
    with ``store_memory``, printing progress and a final summary.

    Returns:
        ``0`` when every memory was stored, ``1`` when at least one
        embedding or store operation failed.
    """
    print("Starting bulk memory migration to kimi_memories...")
    print(f"Collection: {COLLECTION_NAME}")
    print("Model: snowflake-arctic-embed2 (1024 dims)")
    print()

    all_memories = []

    # Extract from daily logs. A missing or unreadable directory is
    # reported like the other read errors and must not prevent MEMORY.md
    # from being migrated.
    try:
        filenames = sorted(os.listdir(MEMORY_DIR))
    except OSError as e:
        print(f"Error listing {MEMORY_DIR}: {e}", file=sys.stderr)
        filenames = []
    for filename in filenames:
        if not (filename.endswith('.md') and filename.startswith('2026')):
            continue
        filepath = os.path.join(MEMORY_DIR, filename)
        print(f"Processing {filename}...")
        memories = extract_memories_from_file(filepath, importance="medium")
        all_memories.extend(memories)
        print(f" Extracted {len(memories)} memories")

    # Extract from MEMORY.md
    print("Processing MEMORY.md...")
    core_memories = extract_core_memories_from_memory_md()
    all_memories.extend(core_memories)
    print(f" Extracted {len(core_memories)} core memories")
    print(f"\nTotal memories to store: {len(all_memories)}")
    print()

    # Store each memory
    success_count = 0
    fail_count = 0
    total = len(all_memories)
    for i, memory in enumerate(all_memories, 1):
        print(f"[{i}/{total}] Storing: {memory['text'][:60]}...")

        # Generate embedding
        embedding = get_embedding(memory['text'])
        if embedding is None:
            print(" ❌ Failed to generate embedding")
            fail_count += 1
            continue

        # Store in Qdrant
        stored = store_memory(
            text=memory['text'],
            embedding=embedding,
            tags=memory['tags'],
            importance=memory['importance'],
            date=memory['date'],
            source="bulk_migration",
            confidence="high",
            source_type="user",
            verified=True,
        )
        if stored:
            print(" ✅ Stored")
            success_count += 1
        else:
            print(" ❌ Failed to store")
            fail_count += 1

    print()
    print("=" * 50)
    print("Migration complete!")
    print(f" Success: {success_count}")
    print(f" Failed: {fail_count}")
    print(f" Total: {total}")
    print("=" * 50)

    # Let shells and schedulers detect a partially failed migration.
    return 1 if fail_count else 0


if __name__ == "__main__":
    sys.exit(main())