#!/usr/bin/env python3
"""
Bulk memory migration to Qdrant kimi_memories collection
Uses snowflake-arctic-embed2 (1024 dimensions)
"""
import json
import os
import re
import sys
import urllib.error
import urllib.request
import uuid
from datetime import datetime

QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_URL = "http://localhost:11434/v1"
MEMORY_DIR = "/root/.openclaw/workspace/memory"
MEMORY_MD = "/root/.openclaw/workspace/MEMORY.md"


def get_embedding(text):
    """Generate embedding using snowflake-arctic-embed2 via Ollama"""
    data = json.dumps({
        "model": "snowflake-arctic-embed2",
        "input": text[:8192]  # Limit text length
    }).encode()
    # Ollama's OpenAI-compatible endpoint: POST /v1/embeddings
    req = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=data,
        headers={"Content-Type": "application/json"}
    )
    try:
        with urllib.request.urlopen(req, timeout=60) as response:
            result = json.loads(response.read().decode())
            return result["data"][0]["embedding"]
    except Exception as e:
        print(f"Error generating embedding: {e}", file=sys.stderr)
        return None


def store_memory(text, embedding, tags=None, importance="medium", date=None,
                 source="memory_backup", confidence="high", source_type="user",
                 verified=True):
    """Store memory in Qdrant with metadata"""
    if date is None:
        date = datetime.now().strftime("%Y-%m-%d")

    point_id = str(uuid.uuid4())
    payload = {
        "text": text,
        "date": date,
        "tags": tags or [],
        "importance": importance,
        "confidence": confidence,
        "source_type": source_type,
        "verified": verified,
        "source": source,
        "created_at": datetime.now().isoformat(),
        "access_count": 0
    }
    point = {
        "id": point_id,
        "vector": embedding,
        "payload": payload
    }

    data = json.dumps({"points": [point]}).encode()
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
        data=data,
        headers={"Content-Type": "application/json"},
        method="PUT"  # Qdrant upserts points via PUT; urllib defaults to POST when data is set
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
            # The top-level "status" field is "ok" on success; the nested
            # result.status reports the operation state ("completed" with wait=true)
            return result.get("status") == "ok"
    except Exception as e:
        print(f"Error storing memory: {e}", file=sys.stderr)
        return False


def extract_memories_from_file(filepath, importance="medium"):
    """Extract memory entries from a markdown file"""
    memories = []
    try:
        with open(filepath, 'r') as f:
            content = f.read()
    except Exception as e:
        print(f"Error reading {filepath}: {e}", file=sys.stderr)
        return memories

    # Extract date from filename or content
    date_match = re.search(r'(\d{4}-\d{2}-\d{2})', filepath)
    date = date_match.group(1) if date_match else datetime.now().strftime("%Y-%m-%d")

    def save_section(section, content_lines):
        """Append the accumulated section as a memory if it has enough text"""
        section_text = '\n'.join(content_lines).strip()
        if len(section_text) > 20:
            memories.append({
                "text": f"{section}: {section_text}",
                "date": date,
                "tags": extract_tags(section, section_text),
                "importance": importance
            })

    # Parse sections
    current_section = None
    current_content = []

    for line in content.split('\n'):
        if line.startswith('# ') and 'Memory' in line:
            continue  # Skip title
        elif line.startswith('## '):
            if current_section and current_content:
                save_section(current_section, current_content)
            current_section = line[3:].strip()
            current_content = []
        elif line.startswith('### '):
            if current_section and current_content:
                save_section(current_section, current_content)
            current_section = line[4:].strip()
            current_content = []
        else:
            if current_section:
                current_content.append(line)

    # Save final section
    if current_section and current_content:
        save_section(current_section, current_content)

    return memories
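
# The migration assumes the kimi_memories collection already exists with
# 1024-dimensional vectors. Below is a minimal sketch of creating it if
# missing, via Qdrant's PUT /collections/{name} endpoint. The "Cosine"
# distance is an assumption for illustration, not taken from this script;
# match it to however the collection was actually configured.
def ensure_collection():
    """Create the collection with 1024-dim vectors if it does not exist (sketch)"""
    try:
        # GET returns collection info, or raises HTTPError 404 if it is missing
        urllib.request.urlopen(f"{QDRANT_URL}/collections/{COLLECTION_NAME}", timeout=10)
        return True
    except urllib.error.HTTPError:
        pass
    data = json.dumps({"vectors": {"size": 1024, "distance": "Cosine"}}).encode()
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}",
        data=data,
        headers={"Content-Type": "application/json"},
        method="PUT"  # Qdrant creates collections via PUT /collections/{name}
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            return json.loads(response.read().decode()).get("status") == "ok"
    except Exception as e:
        print(f"Error creating collection: {e}", file=sys.stderr)
        return False
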
def extract_tags(section, content):
    """Extract relevant tags from section and content"""
    tags = []
    section_lower = section.lower()

    # Section-based tags
    if any(word in section_lower for word in ['voice', 'tts', 'stt', 'audio']):
        tags.extend(['voice', 'audio'])
    if any(word in section_lower for word in ['memory', 'qdrant', 'remember']):
        tags.extend(['memory', 'qdrant'])
    if any(word in section_lower for word in ['redis', 'agent', 'message', 'max']):
        tags.extend(['redis', 'messaging', 'agent'])
    if any(word in section_lower for word in ['youtube', 'seo', 'content']):
        tags.extend(['youtube', 'content'])
    if any(word in section_lower for word in ['search', 'searxng', 'web']):
        tags.extend(['search', 'web'])
    if any(word in section_lower for word in ['setup', 'install', 'bootstrap']):
        tags.extend(['setup', 'configuration'])

    # Content-based tags
    content_lower = content.lower()
    if 'voice' in content_lower:
        tags.append('voice')
    if 'memory' in content_lower:
        tags.append('memory')
    if 'qdrant' in content_lower:
        tags.append('qdrant')
    if 'redis' in content_lower:
        tags.append('redis')
    if 'youtube' in content_lower:
        tags.append('youtube')
    if 'rob' in content_lower:
        tags.append('user')

    return list(set(tags))  # Remove duplicates


def extract_core_memories_from_memory_md():
    """Extract high-importance memories from MEMORY.md"""
    memories = []
    try:
        with open(MEMORY_MD, 'r') as f:
            content = f.read()
    except Exception as e:
        print(f"Error reading MEMORY.md: {e}", file=sys.stderr)
        return memories

    # Core sections with high importance
    sections = [
        ("Identity & Names", "high"),
        ("Core Preferences", "high"),
        ("Communication Rules", "high"),
        ("Voice Settings", "high"),
        ("Lessons Learned", "high"),
    ]

    for section_name, importance in sections:
        # Escape the section name, since it is interpolated into a regex
        pattern = f"## {re.escape(section_name)}.*?(?=## |$)"
        match = re.search(pattern, content, re.DOTALL)
        if not match:
            continue
        section_text = match.group(0).strip()

        # Extract subsections
        subsections = re.findall(r'### (.+?)\n', section_text)
        for sub in subsections:
            sub_pattern = f"### {re.escape(sub)}.*?(?=### |## |$)"
            sub_match = re.search(sub_pattern, section_text, re.DOTALL)
            if sub_match:
                sub_text = sub_match.group(0).strip()
                if len(sub_text) > 50:
                    memories.append({
                        "text": f"{section_name} - {sub}: {sub_text[:500]}",
                        "date": "2026-02-10",
                        "tags": extract_tags(section_name, sub_text) + ['core', 'longterm'],
                        "importance": importance
                    })

    return memories
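
# Not used by the migration itself, but a handy way to spot-check the stored
# data: a minimal retrieval sketch against Qdrant's POST /points/search
# endpoint. The query is embedded with the same model so the vectors are
# comparable; the default limit of 5 is an arbitrary choice for illustration.
def search_memories(query, limit=5):
    """Return (score, text) pairs for the memories most similar to the query (sketch)"""
    embedding = get_embedding(query)
    if embedding is None:
        return []
    data = json.dumps({
        "vector": embedding,
        "limit": limit,
        "with_payload": True
    }).encode()
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/search",
        data=data,
        headers={"Content-Type": "application/json"}
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
            # Each hit carries a similarity score and the stored payload
            return [(hit["score"], hit["payload"]["text"])
                    for hit in result.get("result", [])]
    except Exception as e:
        print(f"Error searching memories: {e}", file=sys.stderr)
        return []
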
def main():
    print("Starting bulk memory migration to kimi_memories...")
    print(f"Collection: {COLLECTION_NAME}")
    print("Model: snowflake-arctic-embed2 (1024 dims)")
    print()

    all_memories = []

    # Extract from daily logs
    for filename in sorted(os.listdir(MEMORY_DIR)):
        if filename.endswith('.md') and filename.startswith('2026'):
            filepath = os.path.join(MEMORY_DIR, filename)
            print(f"Processing {filename}...")
            memories = extract_memories_from_file(filepath, importance="medium")
            all_memories.extend(memories)
            print(f"  Extracted {len(memories)} memories")

    # Extract from MEMORY.md
    print("Processing MEMORY.md...")
    core_memories = extract_core_memories_from_memory_md()
    all_memories.extend(core_memories)
    print(f"  Extracted {len(core_memories)} core memories")

    print(f"\nTotal memories to store: {len(all_memories)}")
    print()

    # Store each memory
    success_count = 0
    fail_count = 0

    for i, memory in enumerate(all_memories, 1):
        print(f"[{i}/{len(all_memories)}] Storing: {memory['text'][:60]}...")

        # Generate embedding
        embedding = get_embedding(memory['text'])
        if embedding is None:
            print("  ❌ Failed to generate embedding")
            fail_count += 1
            continue

        # Store in Qdrant
        if store_memory(
            text=memory['text'],
            embedding=embedding,
            tags=memory['tags'],
            importance=memory['importance'],
            date=memory['date'],
            source="bulk_migration",
            confidence="high",
            source_type="user",
            verified=True
        ):
            print("  ✅ Stored")
            success_count += 1
        else:
            print("  ❌ Failed to store")
            fail_count += 1

    print()
    print("=" * 50)
    print("Migration complete!")
    print(f"  Success: {success_count}")
    print(f"  Failed: {fail_count}")
    print(f"  Total: {len(all_memories)}")
    print("=" * 50)


if __name__ == "__main__":
    main()
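
# To verify the migration afterwards, Qdrant's count endpoint gives an exact
# point total (the collection name below matches COLLECTION_NAME above):
#
#   curl -X POST http://10.0.0.40:6333/collections/kimi_memories/points/count \
#        -H "Content-Type: application/json" -d '{"exact": true}'
#
# The "result": {"count": N} in the response should match the script's
# reported success count (plus any points stored by earlier runs).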