#!/usr/bin/env python3
"""
Store content to kimi_kb (Knowledge Base) - Manual only with batch support

Usage:
    Single entry:
        python3 kb_store.py "Content text" --title "Title" --domain "Category" --tags "tag1,tag2"
        python3 kb_store.py "Content" --title "X" --url "https://example.com" --source "docs.site"

    Batch mode:
        python3 kb_store.py --batch-file entries.json --batch-size 100

Features:
- Single or batch upload
- Duplicate detection by title/URL
- Domain categorization
- Access tracking
"""

import argparse
import json
import os
import sys
import urllib.request
import urllib.error
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any

QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION = "kimi_kb"
OLLAMA_URL = "http://localhost:11434/v1"
DEFAULT_BATCH_SIZE = 100


def _scroll_by_field(field: str, value: str) -> List[Dict[str, Any]]:
    """Return Qdrant points whose payload `field` exactly matches `value`.

    Uses the collection's /points/scroll endpoint; raises on transport errors
    (callers decide how to handle failures).
    """
    scroll_data = json.dumps({
        "limit": 10,
        "with_payload": True,
        "filter": {"must": [{"key": field, "match": {"value": value}}]}
    }).encode()
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll",
        data=scroll_data,
        headers={"Content-Type": "application/json"},
        method="POST"
    )
    with urllib.request.urlopen(req, timeout=10) as response:
        result = json.loads(response.read().decode())
    return result.get("result", {}).get("points", [])


def check_existing(title: str = None, url: str = None) -> tuple:
    """Check whether an entry already exists, by URL first, then by title.

    Returns:
        (point_id, "url") or (point_id, "title") when a match is found;
        (None, None) when nothing matches or the lookup fails.
        Lookup errors are reported on stderr and treated as "not found"
        so storing degrades gracefully when Qdrant is unreachable.
    """
    try:
        # URL is the stronger identity signal, so prefer it when provided.
        if url:
            points = _scroll_by_field("url", url)
            if points:
                return points[0]["id"], "url"
        if title:
            points = _scroll_by_field("title", title)
            if points:
                return points[0]["id"], "title"
    except Exception as e:
        print(f"Warning: Could not check existing: {e}", file=sys.stderr)
    return None, None


def get_embedding(text: str) -> Optional[List[float]]:
    """Generate an embedding with snowflake-arctic-embed2 via Ollama.

    Input is truncated to 8192 characters before embedding.
    Returns the embedding vector, or None on any failure (error on stderr).
    """
    data = json.dumps({
        "model": "snowflake-arctic-embed2",
        "input": text[:8192]
    }).encode()
    req = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=data,
        headers={"Content-Type": "application/json"}
    )
    try:
        with urllib.request.urlopen(req, timeout=60) as response:
            result = json.loads(response.read().decode())
        return result["data"][0]["embedding"]
    except Exception as e:
        print(f"Error generating embedding: {e}", file=sys.stderr)
        return None


def batch_upload_embeddings(texts: List[str]) -> List[Optional[List[float]]]:
    """Generate embeddings for multiple texts in a single request.

    Each text is truncated to 8192 characters. On failure, returns a list of
    None with the same length as `texts` so callers can zip safely.
    """
    if not texts:
        return []
    data = json.dumps({
        "model": "snowflake-arctic-embed2",
        "input": [t[:8192] for t in texts]
    }).encode()
    req = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=data,
        headers={"Content-Type": "application/json"}
    )
    try:
        with urllib.request.urlopen(req, timeout=120) as response:
            result = json.loads(response.read().decode())
        return [d["embedding"] for d in result["data"]]
    except Exception as e:
        print(f"Error generating batch embeddings: {e}", file=sys.stderr)
        return [None] * len(texts)


def upload_points_batch(points: List[Dict[str, Any]], batch_size: int = DEFAULT_BATCH_SIZE) -> tuple:
    """Upsert points into Qdrant in chunks of `batch_size`.

    Returns:
        (uploaded, failed) point counts. A whole chunk is counted as failed
        when its request errors or Qdrant reports a non-"ok" status.
    """
    total = len(points)
    uploaded = 0
    failed = 0
    for i in range(0, total, batch_size):
        batch = points[i:i + batch_size]
        batch_num = i // batch_size + 1
        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true",
            data=json.dumps({"points": batch}).encode(),
            headers={"Content-Type": "application/json"},
            method="PUT"
        )
        try:
            with urllib.request.urlopen(req, timeout=60) as response:
                result = json.loads(response.read().decode())
            if result.get("status") == "ok":
                uploaded += len(batch)
                print(f"  ✅ Uploaded batch {batch_num}: {len(batch)} points")
            else:
                print(f"  ❌ Batch {batch_num} failed: {result}")
                failed += len(batch)
        except Exception as e:
            print(f"  ❌ Batch {batch_num} error: {e}", file=sys.stderr)
            failed += len(batch)
    return uploaded, failed


def store_single(
    text: str,
    embedding: List[float],
    title: str = None,
    url: str = None,
    source: str = None,
    domain: str = "general",
    tags: List[str] = None,
    content_type: str = "document",
    replace: bool = False
) -> bool:
    """Store a single KB entry in Qdrant.

    Checks for an existing entry by title/URL first; refuses to overwrite
    unless `replace` is True (reusing the existing point ID on replace).
    Returns True on success, False on duplicate-refusal or upload error.
    """
    existing_id, match_type = check_existing(title=title, url=url)
    if existing_id and not replace:
        print(f"⚠️ Entry '{title}' already exists (matched by {match_type}, ID: {existing_id})")
        print(f" Use --replace to overwrite")
        return False

    point_id = existing_id if existing_id else str(uuid.uuid4())
    now = datetime.now()  # single timestamp so date/created_at never disagree
    payload = {
        "text": text,
        "title": title or "Untitled",
        "url": url or "",
        "source": source or "manual",
        "domain": domain or "general",
        "tags": tags or [],
        "content_type": content_type,
        "date": now.strftime("%Y-%m-%d"),
        "created_at": now.isoformat(),
        "access_count": 0
    }
    point = {
        "points": [{
            "id": point_id,
            "vector": embedding,
            "payload": payload
        }]
    }
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true",
        data=json.dumps(point).encode(),
        headers={"Content-Type": "application/json"},
        method="PUT"
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
        return result.get("status") == "ok"
    except Exception as e:
        print(f"Error storing to KB: {e}", file=sys.stderr)
        return False


def store_batch(
    entries: List[Dict[str, Any]],
    batch_size: int = DEFAULT_BATCH_SIZE,
    check_duplicates: bool = True
) -> tuple:
    """Store multiple KB entries with optional duplicate checking.

    Each entry is a dict with a required "content" key and optional
    title/url/source/domain/tags/type keys.

    Returns:
        (uploaded, failed). Skipped duplicates are NOT counted as failures;
        entries missing "content" or whose embedding fails are.
    """
    if not entries:
        return 0, 0

    print(f"Processing {len(entries)} entries...")

    # Reject malformed entries up front instead of crashing mid-batch.
    invalid = 0
    valid_entries = []
    for entry in entries:
        if "content" not in entry:
            print(f"  ❌ Skipping entry without 'content': {entry.get('title', 'Untitled')}")
            invalid += 1
        else:
            valid_entries.append(entry)

    # Filter duplicates if requested.
    duplicates = 0
    if check_duplicates:
        entries_to_process = []
        for entry in valid_entries:
            existing_id, match_type = check_existing(
                title=entry.get("title"), url=entry.get("url")
            )
            if existing_id:
                print(f"  ⏭️ Skipping duplicate: {entry.get('title', 'Untitled')} ({match_type})")
                duplicates += 1
            else:
                entries_to_process.append(entry)
    else:
        entries_to_process = valid_entries

    if not entries_to_process:
        print(f"All {len(entries)} entries already exist")
        return 0, invalid

    print(f"Generating embeddings for {len(entries_to_process)} entries...")
    texts = [e["content"] for e in entries_to_process]
    embeddings = batch_upload_embeddings(texts)

    # Build Qdrant points, dropping entries whose embedding failed.
    points = []
    failed_embeddings = 0
    for entry, embedding in zip(entries_to_process, embeddings):
        if embedding is None:
            failed_embeddings += 1
            continue
        now = datetime.now()
        payload = {
            "text": entry["content"],
            "title": entry.get("title", "Untitled"),
            "url": entry.get("url", ""),
            "source": entry.get("source", "manual"),
            "domain": entry.get("domain", "general"),
            "tags": entry.get("tags", []),
            "content_type": entry.get("type", "document"),
            "date": now.strftime("%Y-%m-%d"),
            "created_at": now.isoformat(),
            "access_count": 0
        }
        points.append({
            "id": str(uuid.uuid4()),
            "vector": embedding,
            "payload": payload
        })

    if not points:
        return 0, failed_embeddings + invalid

    print(f"Uploading {len(points)} entries in batches of {batch_size}...")
    uploaded, failed_upload = upload_points_batch(points, batch_size)
    return uploaded, failed_embeddings + failed_upload + invalid


def main():
    parser = argparse.ArgumentParser(description="Store content to kimi_kb")
    parser.add_argument("content", nargs="?", help="Content to store")
    parser.add_argument("--title", default=None, help="Title of the content")
    parser.add_argument("--url", default=None, help="Source URL if from web")
    parser.add_argument("--source", default=None, help="Source name")
    parser.add_argument("--domain", default="general", help="Domain/category")
    parser.add_argument("--tags", default=None, help="Comma-separated tags")
    parser.add_argument("--type", default="document",
                        choices=["document", "web", "code", "note"],
                        help="Content type")
    parser.add_argument("--replace", action="store_true", help="Replace existing entry")
    parser.add_argument("--batch-file", help="JSON file with multiple entries")
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE,
                        help="Batch size")
    parser.add_argument("--no-check-duplicates", action="store_true",
                        help="Skip duplicate checking in batch mode")
    args = parser.parse_args()

    # Batch mode: read a JSON array of entry dicts from disk.
    if args.batch_file:
        print(f"Batch mode: Loading entries from {args.batch_file}")
        try:
            with open(args.batch_file, 'r') as f:
                entries = json.load(f)
            if not isinstance(entries, list):
                print("Batch file must contain a JSON array", file=sys.stderr)
                sys.exit(1)
            print(f"Loaded {len(entries)} entries")
            uploaded, failed = store_batch(
                entries, args.batch_size,
                check_duplicates=not args.no_check_duplicates
            )
            print(f"\n{'=' * 50}")
            print(f"Batch complete: {uploaded} uploaded, {failed} failed")
            sys.exit(0 if failed == 0 else 1)
        except Exception as e:
            print(f"Error: {e}", file=sys.stderr)
            sys.exit(1)

    # Single entry mode.
    if not args.content:
        print("Error: Provide content or use --batch-file", file=sys.stderr)
        parser.print_help()
        sys.exit(1)

    tags = [t.strip() for t in args.tags.split(",")] if args.tags else []

    print("Generating embedding...")
    embedding = get_embedding(args.content)
    if embedding is None:
        print("❌ Failed to generate embedding")
        sys.exit(1)

    print(f"Storing to kimi_kb: {args.title or 'Untitled'}...")
    if store_single(
        text=args.content,
        embedding=embedding,
        title=args.title,
        url=args.url,
        source=args.source,
        domain=args.domain,
        tags=tags,
        content_type=args.type,
        replace=args.replace
    ):
        print(f"✅ Stored to kimi_kb ({args.domain})")
    else:
        print("❌ Failed to store")
        sys.exit(1)


if __name__ == "__main__":
    main()