#!/usr/bin/env python3
"""
Fact Extraction Script - Parse daily logs and extract atomic memories
This script parses memory/YYYY-MM-DD.md files and extracts individual facts
for storage in Qdrant as atomic memory units (Mem0-style), NOT whole files.
NOTE: Configured for COMPREHENSIVE capture (even minor facts) - user has
abundant storage resources. Thresholds are intentionally low to maximize
memory retention. Use --min-length flag to adjust filtering if needed.
Usage:
extract_facts.py [--date 2026-02-15] [--dry-run] [--batch-size 50]
extract_facts.py --backfill-all # Process all missing dates
Features:
- Parses markdown sections as individual facts
- Generates embeddings per fact (not per file)
- Stores with rich metadata (tags, importance, source)
- Batch upload support
- Duplicate detection
"""
import argparse
import json
import os
import re
import sys
import urllib.request
import urllib.error
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any, Tuple
# Configuration
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_EMBED_URL = "http://localhost:11434/v1"
MEMORY_DIR = Path("/root/.openclaw/workspace/memory")
DEFAULT_BATCH_SIZE = 50
def get_embedding(text: str) -> Optional[List[float]]:
    """Generate one embedding for *text* via Ollama's OpenAI-compatible API.

    Uses the snowflake-arctic-embed2 model. Returns the embedding vector,
    or None if the request fails for any reason (error goes to stderr).
    """
    # Cap the payload at 8k characters before sending
    body = {"model": "snowflake-arctic-embed2", "input": text[:8192]}
    request = urllib.request.Request(
        f"{OLLAMA_EMBED_URL}/embeddings",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            parsed = json.loads(resp.read().decode())
        return parsed["data"][0]["embedding"]
    except Exception as exc:
        print(f"Error generating embedding: {exc}", file=sys.stderr)
        return None
def batch_get_embeddings(texts: List[str]) -> List[Optional[List[float]]]:
    """Generate embeddings for several texts in one Ollama API call.

    Returns one vector per input text (order preserved). On failure, returns
    a list of None of the same length so callers can count failures per item.
    """
    if not texts:
        return []
    # Each text is capped at 8k characters, mirroring get_embedding()
    body = {
        "model": "snowflake-arctic-embed2",
        "input": [t[:8192] for t in texts],
    }
    request = urllib.request.Request(
        f"{OLLAMA_EMBED_URL}/embeddings",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=120) as resp:
            parsed = json.loads(resp.read().decode())
        return [entry["embedding"] for entry in parsed["data"]]
    except Exception as exc:
        print(f"Error generating batch embeddings: {exc}", file=sys.stderr)
        return [None] * len(texts)
def parse_markdown_sections(content: str, date_str: str) -> List[Dict[str, Any]]:
    """
    Parse markdown content into atomic facts - COMPREHENSIVE CAPTURE.

    Extracts EVERYTHING:
    - ## Headers as fact categories
    - Individual bullet points as atomic facts
    - Paragraphs as standalone facts
    - Code blocks as facts (indentation inside fences is preserved)
    - Table rows as facts
    - Lines with **bold** as critical rules
    - URLs/links as facts
    - Key-value pairs (Key: Value)

    Args:
        content: Raw markdown text of one daily log file.
        date_str: Date string (YYYY-MM-DD) tagged onto every fact.

    Returns:
        List of fact dicts with keys: text, tags, importance,
        source_type, category.
    """
    facts = []
    lines = content.split('\n')
    current_section = "General"
    current_section_content = []
    in_code_block = False
    code_block_content = []
    code_block_language = ""

    def flush_section_content():
        """Convert accumulated section content into paragraph/sentence facts."""
        nonlocal current_section_content
        if not current_section_content:
            return
        # Join lines and split into paragraphs
        full_text = '\n'.join(current_section_content)
        paragraphs = [p.strip() for p in full_text.split('\n\n') if p.strip()]
        for para in paragraphs:
            if len(para) < 5:  # Skip very short fragments
                continue
            if len(para) > 300:
                # Split long paragraphs into sentence-level facts
                sentences = [s.strip() for s in para.replace('. ', '.\n').split('\n') if s.strip()]
                for sentence in sentences:
                    if len(sentence) > 10:
                        facts.append({
                            "text": f"{current_section}: {sentence[:500]}",
                            "tags": extract_tags(sentence, date_str),
                            "importance": "high" if "**" in sentence else "medium",
                            "source_type": "inferred",
                            "category": current_section
                        })
            else:
                # Store whole paragraph as fact
                facts.append({
                    "text": f"{current_section}: {para[:500]}",
                    "tags": extract_tags(para, date_str),
                    "importance": "high" if "**" in para else "medium",
                    "source_type": "inferred",
                    "category": current_section
                })
        current_section_content = []

    def extract_tags(text: str, date_str: str) -> List[str]:
        """Extract relevant tags from text via keyword -> tag mapping."""
        tags = ["atomic-fact", date_str]
        # Content-based tags
        text_lower = text.lower()
        tag_mappings = {
            "preference": "preferences",
            "config": "configuration",
            "hardware": "hardware",
            "security": "security",
            "youtube": "youtube",
            "video": "video",
            "workflow": "workflow",
            "rule": "rules",
            "critical": "critical",
            "decision": "decisions",
            "research": "research",
            "process": "process",
            "step": "steps",
        }
        for keyword, tag in tag_mappings.items():
            if keyword in text_lower:
                tags.append(tag)
        return tags

    for i, raw_line in enumerate(lines):
        line = raw_line.strip()
        # Code block fences (``` or ```lang)
        if line.startswith('```'):
            if in_code_block:
                # End of code block: emit the whole block as one fact
                if code_block_content:
                    code_text = '\n'.join(code_block_content)
                    facts.append({
                        "text": f"{current_section} [Code: {code_block_language}]: {code_text[:800]}",
                        "tags": ["code-block", "atomic-fact", date_str, code_block_language],
                        "importance": "medium",
                        "source_type": "inferred",
                        "category": current_section
                    })
                code_block_content = []
                code_block_language = ""
                in_code_block = False
            else:
                # Start of code block
                flush_section_content()
                in_code_block = True
                code_block_language = line[3:].strip() or "text"
            continue
        if in_code_block:
            # BUGFIX: append the raw (unstripped) line so indentation inside
            # fenced code survives; stripping mangled e.g. Python snippets.
            code_block_content.append(raw_line)
            continue
        # Empty line ends the current paragraph accumulation
        if not line:
            flush_section_content()
            continue
        # Section headers (##)
        if line.startswith('## '):
            flush_section_content()
            current_section = line[3:].strip()
            facts.append({
                "text": f"Section: {current_section}",
                "tags": ["section-header", "atomic-fact", date_str],
                "importance": "medium",
                "source_type": "inferred",
                "category": current_section
            })
            continue
        # Skip main title (# Title) but only on the first line
        if line.startswith('# ') and i == 0:
            continue
        # Bullet points (all levels; leading indent was stripped above)
        if line.startswith('- ') or line.startswith('* ') or line.startswith('+ '):
            flush_section_content()
            fact_text = line[2:].strip()
            if len(fact_text) > 3:
                facts.append({
                    "text": f"{current_section}: {fact_text[:500]}",
                    "tags": extract_tags(fact_text, date_str),
                    "importance": "high" if "**" in fact_text else "medium",
                    "source_type": "inferred",
                    "category": current_section
                })
            continue
        # Numbered lists
        if re.match(r'^\d+\.\s', line):
            flush_section_content()
            fact_text = re.sub(r'^\d+\.\s*', '', line)
            if len(fact_text) > 3:
                facts.append({
                    "text": f"{current_section}: {fact_text[:500]}",
                    "tags": extract_tags(fact_text, date_str),
                    "importance": "high" if "**" in fact_text else "medium",
                    "source_type": "inferred",
                    "category": current_section
                })
            continue
        # URLs / Links (only reasonably short lines)
        url_match = re.search(r'https?://[^\s<>"\')\]]+', line)
        if url_match and len(line) < 300:
            facts.append({
                "text": f"{current_section}: {line[:400]}",
                "tags": ["url", "link", "atomic-fact", date_str],
                "importance": "medium",
                "source_type": "inferred",
                "category": current_section
            })
            continue
        # Key-value pairs (Key: Value) with a short key
        if ':' in line and len(line) < 200 and not line.startswith('**'):
            key_part = line.split(':')[0].strip()
            if key_part and len(key_part) < 50 and not key_part.startswith('#'):
                facts.append({
                    "text": f"{current_section}: {line[:400]}",
                    "tags": extract_tags(line, date_str) + ["key-value"],
                    "importance": "medium",
                    "source_type": "inferred",
                    "category": current_section
                })
                continue
        # Bold text / critical rules (tagged as user-sourced, high importance)
        if '**' in line:
            flush_section_content()
            facts.append({
                "text": f"{current_section}: {line[:500]}",
                "tags": ["critical-rule", "high-priority", date_str],
                "importance": "high",
                "source_type": "user",
                "category": current_section
            })
            continue
        # Table rows (| col1 | col2 |); all-dash separator rows are skipped
        if '|' in line and not line.startswith('#'):
            cells = [c.strip() for c in line.split('|') if c.strip()]
            if cells and not all(c.replace('-', '').replace(':', '') == '' for c in cells):
                facts.append({
                    "text": f"{current_section} [Table]: {' | '.join(cells)[:400]}",
                    "tags": ["table-row", "atomic-fact", date_str],
                    "importance": "medium",
                    "source_type": "inferred",
                    "category": current_section
                })
            continue
        # Accumulate regular content for paragraph-level flushing
        if len(line) > 2:
            current_section_content.append(line)
    # Flush remaining content
    flush_section_content()
    return facts
def check_existing_facts(date_str: str) -> set:
    """Return 100-char text previews of facts already stored for *date_str*.

    Queries Qdrant's scroll API filtered by the date tag. On any failure
    (network, malformed response) a warning goes to stderr and an empty set
    is returned, so callers simply treat every fact as new.
    """
    try:
        # NOTE(review): limit=1000 means dates with more facts dedupe only partially
        scroll_body = json.dumps({
            "limit": 1000,
            "with_payload": True,
            "filter": {
                "must": [{"key": "tags", "match": {"value": date_str}}]
            }
        }).encode()
        request = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
            data=scroll_body,
            headers={"Content-Type": "application/json"},
            method="POST"
        )
        with urllib.request.urlopen(request, timeout=30) as resp:
            reply = json.loads(resp.read().decode())
        previews = set()
        for point in reply.get("result", {}).get("points", []):
            if "text" in point["payload"]:
                previews.add(point["payload"]["text"][:100])
        return previews
    except Exception as e:
        print(f"Warning: Could not check existing facts: {e}", file=sys.stderr)
        return set()
def upload_facts_batch(facts: List[Dict[str, Any]], batch_size: int = 50) -> Tuple[int, int]:
    """Embed and upsert facts into Qdrant in batches.

    For each batch: generate embeddings, build points (facts whose embedding
    failed count as failures), then PUT them with wait=true.

    Returns:
        (uploaded, failed) totals across all batches.
    """
    uploaded = 0
    failed = 0
    for start in range(0, len(facts), batch_size):
        chunk = facts[start:start + batch_size]
        batch_num = start // batch_size + 1
        # One embedding per fact text, same order as the chunk
        embeddings = batch_get_embeddings([f["text"] for f in chunk])
        points = []
        for fact, vector in zip(chunk, embeddings):
            if vector is None:
                failed += 1
                continue
            payload = {
                "text": fact["text"],
                "date": fact.get("date", datetime.now().strftime("%Y-%m-%d")),
                "tags": fact.get("tags", []),
                "importance": fact.get("importance", "medium"),
                "source": fact.get("source", "fact-extraction"),
                "source_type": fact.get("source_type", "inferred"),
                "category": fact.get("category", "general"),
                "confidence": fact.get("confidence", "high"),
                "verified": fact.get("verified", True),
                "created_at": datetime.now().isoformat(),
                "access_count": 0,
                "last_accessed": datetime.now().isoformat()
            }
            # NOTE: Memories never expire - user requested permanent retention.
            # No expires_at field set = memories persist indefinitely.
            points.append({
                "id": str(uuid.uuid4()),
                "vector": vector,
                "payload": payload
            })
        if not points:
            continue
        request = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
            data=json.dumps({"points": points}).encode(),
            headers={"Content-Type": "application/json"},
            method="PUT"
        )
        try:
            with urllib.request.urlopen(request, timeout=60) as resp:
                outcome = json.loads(resp.read().decode())
            if outcome.get("status") == "ok":
                uploaded += len(points)
                print(f" ✅ Batch {batch_num}: {len(points)} facts uploaded")
            else:
                print(f" ❌ Batch {batch_num}: Failed")
                failed += len(points)
        except Exception as e:
            print(f" ❌ Batch {batch_num}: {e}", file=sys.stderr)
            failed += len(points)
    return uploaded, failed
def process_single_date(date_str: str, dry_run: bool = False, batch_size: int = 50,
                        force: bool = False) -> Tuple[int, int]:
    """Process a single date's memory file into Qdrant.

    Args:
        date_str: Date (YYYY-MM-DD); reads MEMORY_DIR/<date_str>.md.
        dry_run: If True, report what would be uploaded without uploading.
        batch_size: Batch size passed through to upload_facts_batch.
        force: If True, skip duplicate detection and re-upload every fact.

    Returns:
        (uploaded, failed) counts; in dry-run mode (would_upload, 0).
    """
    file_path = MEMORY_DIR / f"{date_str}.md"
    if not file_path.exists():
        print(f" ⚠️ File not found: {file_path}")
        return 0, 0
    print(f"Processing {date_str}...")
    # Explicit UTF-8: the old open() without encoding decoded with the
    # locale default, which can corrupt these markdown logs on some systems.
    content = file_path.read_text(encoding="utf-8")
    # Parse into atomic facts
    facts = parse_markdown_sections(content, date_str)
    if not facts:
        print(f" ⚠️ No facts extracted from {date_str}")
        return 0, 0
    print(f" 📄 Extracted {len(facts)} atomic facts")
    # Duplicate detection compares the first 100 chars of each fact's text;
    # --force callers can bypass it entirely.
    existing = set() if force else check_existing_facts(date_str)
    new_facts = [f for f in facts if f["text"][:100] not in existing]
    if existing:
        print(f" ⏭️ Skipping {len(facts) - len(new_facts)} duplicates")
    if not new_facts:
        print(f" ✅ All facts already stored for {date_str}")
        return 0, 0
    print(f" 📤 Uploading {len(new_facts)} new facts...")
    if dry_run:
        print(f" [DRY RUN] Would upload {len(new_facts)} facts")
        for f in new_facts[:3]:  # Show first 3 as a preview
            print(f" - {f['text'][:80]}...")
        if len(new_facts) > 3:
            print(f" ... and {len(new_facts) - 3} more")
        return len(new_facts), 0
    # Stamp the date onto each fact so upload_facts_batch stores it
    for f in new_facts:
        f["date"] = date_str
    uploaded, failed = upload_facts_batch(new_facts, batch_size)
    return uploaded, failed
def get_all_memory_dates() -> List[str]:
    """Return every date (YYYY-MM-DD) with a memory file, sorted ascending."""
    if not MEMORY_DIR.exists():
        return []
    # File stems are the dates themselves (memory/YYYY-MM-DD.md)
    return sorted(path.stem for path in MEMORY_DIR.glob("????-??-??.md"))
def main():
    """CLI entry point: extract facts for one date, all dates, or today.

    Modes (first match wins):
      --date YYYY-MM-DD  process a single file
      --backfill-all     process every memory file found
      (neither)          process today's file
    """
    parser = argparse.ArgumentParser(
        description="Extract atomic facts from daily logs and store in Qdrant"
    )
    parser.add_argument("--date", help="Specific date to process (YYYY-MM-DD)")
    parser.add_argument("--backfill-all", action="store_true",
                        help="Process all memory files")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be stored without uploading")
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE,
                        help=f"Batch size for uploads (default: {DEFAULT_BATCH_SIZE})")
    parser.add_argument("--force", action="store_true",
                        help="Re-process even if already stored")
    args = parser.parse_args()
    # NOTE(review): --force is parsed but never acted on below; pass it
    # through to process_single_date (which performs dedup) to make it work.
    print("=== Fact Extraction ===")  # was an f-string with no placeholders
    print(f"Time: {datetime.now().isoformat()}")
    print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
    print(f"Batch size: {args.batch_size}")
    print()
    if args.date:
        # Single explicit date (takes precedence over --backfill-all)
        uploaded, failed = process_single_date(args.date, args.dry_run, args.batch_size)
        print(f"\n{'=' * 50}")
        print(f"Summary for {args.date}:")
        print(f" Uploaded: {uploaded}")
        print(f" Failed: {failed}")
    elif args.backfill_all:
        # All dates found on disk
        dates = get_all_memory_dates()
        print(f"Found {len(dates)} memory files to process")
        print()
        total_uploaded = 0
        total_failed = 0
        for date_str in dates:
            uploaded, failed = process_single_date(date_str, args.dry_run, args.batch_size)
            total_uploaded += uploaded
            total_failed += failed
            print()
        print(f"{'=' * 50}")
        print(f"Total Summary:")
        print(f" Files processed: {len(dates)}")
        print(f" Total uploaded: {total_uploaded}")
        print(f" Total failed: {total_failed}")
    else:
        # Default to today's log
        today = datetime.now().strftime("%Y-%m-%d")
        uploaded, failed = process_single_date(today, args.dry_run, args.batch_size)
        print(f"\n{'=' * 50}")
        print(f"Summary for {today}:")
        print(f" Uploaded: {uploaded}")
        print(f" Failed: {failed}")
    print()
    print("✅ Fact extraction complete!")
    print("\nNext steps:")
    print(" - Search facts: python3 search_memories.py 'your query'")
    print(" - View by date: Check Qdrant with tag filter for date")


if __name__ == "__main__":
    main()