#!/usr/bin/env python3
"""
Fact Extraction Script - Parse daily logs and extract atomic memories
This script parses memory/YYYY-MM-DD.md files and extracts individual facts
for storage in Qdrant as atomic memory units (Mem0-style), NOT whole files.
NOTE: Configured for COMPREHENSIVE capture (even minor facts) - user has
abundant storage resources. Thresholds are intentionally low to maximize
memory retention. Use --min-length flag to adjust filtering if needed.
Usage:
extract_facts.py [--date 2026-02-15] [--dry-run] [--batch-size 50]
extract_facts.py --backfill-all # Process all missing dates
Features:
- Parses markdown sections as individual facts
- Generates embeddings per fact (not per file)
- Stores with rich metadata (tags, importance, source)
- Batch upload support
- Duplicate detection
"""
import argparse
import json
import os
import re
import sys
import urllib.request
import urllib.error
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any, Tuple
# Configuration
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_EMBED_URL = "http://localhost:11434/v1"
MEMORY_DIR = Path("/root/.openclaw/workspace/memory")
DEFAULT_BATCH_SIZE = 50
def get_embedding(text: str) -> Optional[List[float]]:
    """Generate one embedding for *text* via Ollama's OpenAI-compatible API.

    Uses the snowflake-arctic-embed2 model. Returns the embedding vector,
    or None if the request fails for any reason (error goes to stderr).
    """
    # Cap the payload at 8k characters before sending
    body = {"model": "snowflake-arctic-embed2", "input": text[:8192]}
    request = urllib.request.Request(
        f"{OLLAMA_EMBED_URL}/embeddings",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            parsed = json.loads(resp.read().decode())
        return parsed["data"][0]["embedding"]
    except Exception as exc:
        print(f"Error generating embedding: {exc}", file=sys.stderr)
        return None
def batch_get_embeddings(texts: List[str]) -> List[Optional[List[float]]]:
    """Generate embeddings for several texts in one Ollama API call.

    Returns one vector per input text (order preserved). On failure, returns
    a list of None of the same length so callers can count failures per item.
    """
    if not texts:
        return []
    # Each text is capped at 8k characters, mirroring get_embedding()
    body = {
        "model": "snowflake-arctic-embed2",
        "input": [t[:8192] for t in texts],
    }
    request = urllib.request.Request(
        f"{OLLAMA_EMBED_URL}/embeddings",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=120) as resp:
            parsed = json.loads(resp.read().decode())
        return [entry["embedding"] for entry in parsed["data"]]
    except Exception as exc:
        print(f"Error generating batch embeddings: {exc}", file=sys.stderr)
        return [None] * len(texts)
def parse_markdown_sections(content: str, date_str: str) -> List[Dict[str, Any]]:
    """
    Parse markdown content into atomic facts - COMPREHENSIVE CAPTURE.

    Extracts EVERYTHING:
    - ## Headers as fact categories
    - Individual bullet points as atomic facts
    - Paragraphs as standalone facts
    - Code blocks as facts (indentation inside fences is preserved)
    - Table rows as facts
    - Lines with **bold** as critical rules
    - URLs/links as facts
    - Key-value pairs (Key: Value)

    Args:
        content: Raw markdown text of one daily log file.
        date_str: Date string (YYYY-MM-DD) tagged onto every fact.

    Returns:
        List of fact dicts with keys: text, tags, importance,
        source_type, category.
    """
    facts = []
    lines = content.split('\n')
    current_section = "General"
    current_section_content = []
    in_code_block = False
    code_block_content = []
    code_block_language = ""

    def flush_section_content():
        """Convert accumulated section content into paragraph/sentence facts."""
        nonlocal current_section_content
        if not current_section_content:
            return
        # Join lines and split into paragraphs
        full_text = '\n'.join(current_section_content)
        paragraphs = [p.strip() for p in full_text.split('\n\n') if p.strip()]
        for para in paragraphs:
            if len(para) < 5:  # Skip very short fragments
                continue
            if len(para) > 300:
                # Split long paragraphs into sentence-level facts
                sentences = [s.strip() for s in para.replace('. ', '.\n').split('\n') if s.strip()]
                for sentence in sentences:
                    if len(sentence) > 10:
                        facts.append({
                            "text": f"{current_section}: {sentence[:500]}",
                            "tags": extract_tags(sentence, date_str),
                            "importance": "high" if "**" in sentence else "medium",
                            "source_type": "inferred",
                            "category": current_section
                        })
            else:
                # Store whole paragraph as fact
                facts.append({
                    "text": f"{current_section}: {para[:500]}",
                    "tags": extract_tags(para, date_str),
                    "importance": "high" if "**" in para else "medium",
                    "source_type": "inferred",
                    "category": current_section
                })
        current_section_content = []

    def extract_tags(text: str, date_str: str) -> List[str]:
        """Extract relevant tags from text via keyword -> tag mapping."""
        tags = ["atomic-fact", date_str]
        # Content-based tags
        text_lower = text.lower()
        tag_mappings = {
            "preference": "preferences",
            "config": "configuration",
            "hardware": "hardware",
            "security": "security",
            "youtube": "youtube",
            "video": "video",
            "workflow": "workflow",
            "rule": "rules",
            "critical": "critical",
            "decision": "decisions",
            "research": "research",
            "process": "process",
            "step": "steps",
        }
        for keyword, tag in tag_mappings.items():
            if keyword in text_lower:
                tags.append(tag)
        return tags

    for i, raw_line in enumerate(lines):
        line = raw_line.strip()
        # Code block fences (``` or ```lang)
        if line.startswith('```'):
            if in_code_block:
                # End of code block: emit the whole block as one fact
                if code_block_content:
                    code_text = '\n'.join(code_block_content)
                    facts.append({
                        "text": f"{current_section} [Code: {code_block_language}]: {code_text[:800]}",
                        "tags": ["code-block", "atomic-fact", date_str, code_block_language],
                        "importance": "medium",
                        "source_type": "inferred",
                        "category": current_section
                    })
                code_block_content = []
                code_block_language = ""
                in_code_block = False
            else:
                # Start of code block
                flush_section_content()
                in_code_block = True
                code_block_language = line[3:].strip() or "text"
            continue
        if in_code_block:
            # BUGFIX: append the raw (unstripped) line so indentation inside
            # fenced code survives; stripping mangled e.g. Python snippets.
            code_block_content.append(raw_line)
            continue
        # Empty line ends the current paragraph accumulation
        if not line:
            flush_section_content()
            continue
        # Section headers (##)
        if line.startswith('## '):
            flush_section_content()
            current_section = line[3:].strip()
            facts.append({
                "text": f"Section: {current_section}",
                "tags": ["section-header", "atomic-fact", date_str],
                "importance": "medium",
                "source_type": "inferred",
                "category": current_section
            })
            continue
        # Skip main title (# Title) but only on the first line
        if line.startswith('# ') and i == 0:
            continue
        # Bullet points (all levels; leading indent was stripped above)
        if line.startswith('- ') or line.startswith('* ') or line.startswith('+ '):
            flush_section_content()
            fact_text = line[2:].strip()
            if len(fact_text) > 3:
                facts.append({
                    "text": f"{current_section}: {fact_text[:500]}",
                    "tags": extract_tags(fact_text, date_str),
                    "importance": "high" if "**" in fact_text else "medium",
                    "source_type": "inferred",
                    "category": current_section
                })
            continue
        # Numbered lists
        if re.match(r'^\d+\.\s', line):
            flush_section_content()
            fact_text = re.sub(r'^\d+\.\s*', '', line)
            if len(fact_text) > 3:
                facts.append({
                    "text": f"{current_section}: {fact_text[:500]}",
                    "tags": extract_tags(fact_text, date_str),
                    "importance": "high" if "**" in fact_text else "medium",
                    "source_type": "inferred",
                    "category": current_section
                })
            continue
        # URLs / Links (only reasonably short lines)
        url_match = re.search(r'https?://[^\s<>"\')\]]+', line)
        if url_match and len(line) < 300:
            facts.append({
                "text": f"{current_section}: {line[:400]}",
                "tags": ["url", "link", "atomic-fact", date_str],
                "importance": "medium",
                "source_type": "inferred",
                "category": current_section
            })
            continue
        # Key-value pairs (Key: Value) with a short key
        if ':' in line and len(line) < 200 and not line.startswith('**'):
            key_part = line.split(':')[0].strip()
            if key_part and len(key_part) < 50 and not key_part.startswith('#'):
                facts.append({
                    "text": f"{current_section}: {line[:400]}",
                    "tags": extract_tags(line, date_str) + ["key-value"],
                    "importance": "medium",
                    "source_type": "inferred",
                    "category": current_section
                })
                continue
        # Bold text / critical rules (tagged as user-sourced, high importance)
        if '**' in line:
            flush_section_content()
            facts.append({
                "text": f"{current_section}: {line[:500]}",
                "tags": ["critical-rule", "high-priority", date_str],
                "importance": "high",
                "source_type": "user",
                "category": current_section
            })
            continue
        # Table rows (| col1 | col2 |); all-dash separator rows are skipped
        if '|' in line and not line.startswith('#'):
            cells = [c.strip() for c in line.split('|') if c.strip()]
            if cells and not all(c.replace('-', '').replace(':', '') == '' for c in cells):
                facts.append({
                    "text": f"{current_section} [Table]: {' | '.join(cells)[:400]}",
                    "tags": ["table-row", "atomic-fact", date_str],
                    "importance": "medium",
                    "source_type": "inferred",
                    "category": current_section
                })
            continue
        # Accumulate regular content for paragraph-level flushing
        if len(line) > 2:
            current_section_content.append(line)
    # Flush remaining content
    flush_section_content()
    return facts
def check_existing_facts(date_str: str) -> set:
    """Return 100-char text previews of facts already stored for *date_str*.

    Queries Qdrant's scroll API filtered by the date tag. On any failure
    (network, malformed response) a warning goes to stderr and an empty set
    is returned, so callers simply treat every fact as new.
    """
    try:
        # NOTE(review): limit=1000 means dates with more facts dedupe only partially
        scroll_body = json.dumps({
            "limit": 1000,
            "with_payload": True,
            "filter": {
                "must": [{"key": "tags", "match": {"value": date_str}}]
            }
        }).encode()
        request = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
            data=scroll_body,
            headers={"Content-Type": "application/json"},
            method="POST"
        )
        with urllib.request.urlopen(request, timeout=30) as resp:
            reply = json.loads(resp.read().decode())
        previews = set()
        for point in reply.get("result", {}).get("points", []):
            if "text" in point["payload"]:
                previews.add(point["payload"]["text"][:100])
        return previews
    except Exception as e:
        print(f"Warning: Could not check existing facts: {e}", file=sys.stderr)
        return set()
def upload_facts_batch(facts: List[Dict[str, Any]], batch_size: int = 50) -> Tuple[int, int]:
    """Embed and upsert facts into Qdrant in batches.

    For each batch: generate embeddings, build points (facts whose embedding
    failed count as failures), then PUT them with wait=true.

    Returns:
        (uploaded, failed) totals across all batches.
    """
    uploaded = 0
    failed = 0
    for start in range(0, len(facts), batch_size):
        chunk = facts[start:start + batch_size]
        batch_num = start // batch_size + 1
        # One embedding per fact text, same order as the chunk
        embeddings = batch_get_embeddings([f["text"] for f in chunk])
        points = []
        for fact, vector in zip(chunk, embeddings):
            if vector is None:
                failed += 1
                continue
            payload = {
                "text": fact["text"],
                "date": fact.get("date", datetime.now().strftime("%Y-%m-%d")),
                "tags": fact.get("tags", []),
                "importance": fact.get("importance", "medium"),
                "source": fact.get("source", "fact-extraction"),
                "source_type": fact.get("source_type", "inferred"),
                "category": fact.get("category", "general"),
                "confidence": fact.get("confidence", "high"),
                "verified": fact.get("verified", True),
                "created_at": datetime.now().isoformat(),
                "access_count": 0,
                "last_accessed": datetime.now().isoformat()
            }
            # NOTE: Memories never expire - user requested permanent retention.
            # No expires_at field set = memories persist indefinitely.
            points.append({
                "id": str(uuid.uuid4()),
                "vector": vector,
                "payload": payload
            })
        if not points:
            continue
        request = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
            data=json.dumps({"points": points}).encode(),
            headers={"Content-Type": "application/json"},
            method="PUT"
        )
        try:
            with urllib.request.urlopen(request, timeout=60) as resp:
                outcome = json.loads(resp.read().decode())
            if outcome.get("status") == "ok":
                uploaded += len(points)
                print(f" ✅ Batch {batch_num}: {len(points)} facts uploaded")
            else:
                print(f" ❌ Batch {batch_num}: Failed")
                failed += len(points)
        except Exception as e:
            print(f" ❌ Batch {batch_num}: {e}", file=sys.stderr)
            failed += len(points)
    return uploaded, failed
def process_single_date(date_str: str, dry_run: bool = False, batch_size: int = 50,
                        force: bool = False) -> Tuple[int, int]:
    """Process a single date's memory file into Qdrant.

    Args:
        date_str: Date (YYYY-MM-DD); reads MEMORY_DIR/<date_str>.md.
        dry_run: If True, report what would be uploaded without uploading.
        batch_size: Batch size passed through to upload_facts_batch.
        force: If True, skip duplicate detection and re-upload every fact.

    Returns:
        (uploaded, failed) counts; in dry-run mode (would_upload, 0).
    """
    file_path = MEMORY_DIR / f"{date_str}.md"
    if not file_path.exists():
        print(f" ⚠️ File not found: {file_path}")
        return 0, 0
    print(f"Processing {date_str}...")
    # Explicit UTF-8: the old open() without encoding decoded with the
    # locale default, which can corrupt these markdown logs on some systems.
    content = file_path.read_text(encoding="utf-8")
    # Parse into atomic facts
    facts = parse_markdown_sections(content, date_str)
    if not facts:
        print(f" ⚠️ No facts extracted from {date_str}")
        return 0, 0
    print(f" 📄 Extracted {len(facts)} atomic facts")
    # Duplicate detection compares the first 100 chars of each fact's text;
    # --force callers can bypass it entirely.
    existing = set() if force else check_existing_facts(date_str)
    new_facts = [f for f in facts if f["text"][:100] not in existing]
    if existing:
        print(f" ⏭️ Skipping {len(facts) - len(new_facts)} duplicates")
    if not new_facts:
        print(f" ✅ All facts already stored for {date_str}")
        return 0, 0
    print(f" 📤 Uploading {len(new_facts)} new facts...")
    if dry_run:
        print(f" [DRY RUN] Would upload {len(new_facts)} facts")
        for f in new_facts[:3]:  # Show first 3 as a preview
            print(f" - {f['text'][:80]}...")
        if len(new_facts) > 3:
            print(f" ... and {len(new_facts) - 3} more")
        return len(new_facts), 0
    # Stamp the date onto each fact so upload_facts_batch stores it
    for f in new_facts:
        f["date"] = date_str
    uploaded, failed = upload_facts_batch(new_facts, batch_size)
    return uploaded, failed
def get_all_memory_dates() -> List[str]:
    """Return every date (YYYY-MM-DD) with a memory file, sorted ascending."""
    if not MEMORY_DIR.exists():
        return []
    # File stems are the dates themselves (memory/YYYY-MM-DD.md)
    return sorted(path.stem for path in MEMORY_DIR.glob("????-??-??.md"))
def main():
    """CLI entry point: extract facts for one date, all dates, or today.

    Modes (first match wins):
      --date YYYY-MM-DD  process a single file
      --backfill-all     process every memory file found
      (neither)          process today's file
    """
    parser = argparse.ArgumentParser(
        description="Extract atomic facts from daily logs and store in Qdrant"
    )
    parser.add_argument("--date", help="Specific date to process (YYYY-MM-DD)")
    parser.add_argument("--backfill-all", action="store_true",
                        help="Process all memory files")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be stored without uploading")
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE,
                        help=f"Batch size for uploads (default: {DEFAULT_BATCH_SIZE})")
    parser.add_argument("--force", action="store_true",
                        help="Re-process even if already stored")
    args = parser.parse_args()
    # NOTE(review): --force is parsed but never acted on below; pass it
    # through to process_single_date (which performs dedup) to make it work.
    print("=== Fact Extraction ===")  # was an f-string with no placeholders
    print(f"Time: {datetime.now().isoformat()}")
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print(f"Batch size: {args.batch_size}")
    print()
    if args.date:
        # Single explicit date (takes precedence over --backfill-all)
        uploaded, failed = process_single_date(args.date, args.dry_run, args.batch_size)
        print(f"\n{'=' * 50}")
        print(f"Summary for {args.date}:")
        print(f" Uploaded: {uploaded}")
        print(f" Failed: {failed}")
    elif args.backfill_all:
        # All dates found on disk
        dates = get_all_memory_dates()
        print(f"Found {len(dates)} memory files to process")
        print()
        total_uploaded = 0
        total_failed = 0
        for date_str in dates:
            uploaded, failed = process_single_date(date_str, args.dry_run, args.batch_size)
            total_uploaded += uploaded
            total_failed += failed
            print()
        print(f"{'=' * 50}")
        print(f"Total Summary:")
        print(f" Files processed: {len(dates)}")
        print(f" Total uploaded: {total_uploaded}")
        print(f" Total failed: {total_failed}")
    else:
        # Default to today's log
        today = datetime.now().strftime("%Y-%m-%d")
        uploaded, failed = process_single_date(today, args.dry_run, args.batch_size)
        print(f"\n{'=' * 50}")
        print(f"Summary for {today}:")
        print(f" Uploaded: {uploaded}")
        print(f" Failed: {failed}")
    print()
    print("✅ Fact extraction complete!")
    print("\nNext steps:")
    print(" - Search facts: python3 search_memories.py 'your query'")
    print(" - View by date: Check Qdrant with tag filter for date")


if __name__ == "__main__":
    main()