jarvis-memory/skills/qdrant-memory/scripts/scrape_to_kb.py

#!/usr/bin/env python3
"""
Scrape web content and store it in the knowledge_base collection.

Usage: scrape_to_kb.py <url> <domain> <path> [--title "Title"] [--subjects "a,b,c"]
                       [--category CATEGORY] [--content-type CONTENT_TYPE]
"""

import argparse
import sys
import re
import hashlib
import urllib.request
import urllib.error
from html import unescape

# Base URL of the Qdrant HTTP API that store_in_kb() sends points to.
QDRANT_URL = "http://10.0.0.40:6333"
# Name of the Qdrant collection that receives the scraped chunks.
COLLECTION_NAME = "knowledge_base"
# Ollama endpoint that get_embedding() posts text to for embedding vectors.
OLLAMA_EMBED_URL = "http://localhost:11434/api/embed"

def fetch_url(url):
    """Fetch a URL and return its body decoded to text.

    The body is decoded with the charset declared in the response's
    Content-Type header. When no charset is declared, or the declared name
    is not a codec Python knows, UTF-8 is used instead. Bytes that cannot be
    decoded are dropped.

    Args:
        url: Address to request.

    Returns:
        The decoded body as ``str``, or ``None`` if the request failed (the
        error is reported on stderr).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        # Built inside the try so a malformed URL (ValueError) is reported
        # like any other fetch failure instead of escaping as a traceback.
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=30) as response:
            charset = response.headers.get_content_charset() or 'utf-8'
            body = response.read()
    except Exception as e:
        # Deliberately broad: this is the CLI's fetch boundary, and any
        # failure here means "no content", which the caller checks for.
        print(f"❌ Error fetching {url}: {e}", file=sys.stderr)
        return None

    try:
        return body.decode(charset, errors='ignore')
    except LookupError:
        # The server advertised a codec name Python does not recognise.
        return body.decode('utf-8', errors='ignore')

def extract_text(html):
    """Extract a page title and clean, line-oriented text from HTML.

    Script, style, nav, header and footer elements are removed together
    with their contents, HTML comments are dropped, closing block tags and
    ``<br>`` become line breaks, and all remaining tags are stripped.
    Entities are unescaped, runs of spaces/tabs are collapsed, each line is
    trimmed and empty lines are discarded, so the returned text separates
    lines with single newlines only.

    Args:
        html: Raw HTML markup.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the whitespace-normalised
        content of the first ``<title>`` element, or ``"Untitled"`` when the
        element is missing or empty.
    """
    element_flags = re.DOTALL | re.IGNORECASE

    # Remove script and style elements including their contents.
    html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=element_flags)
    html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=element_flags)

    # Remove comments explicitly: the generic tag pattern below stops at the
    # first '>', so a comment such as "<!-- a > b -->" would otherwise leak
    # " b -->" into the extracted text.
    html = re.sub(r'<!--.*?-->', ' ', html, flags=re.DOTALL)

    # Extract the title; unescape before normalising so entity-encoded
    # whitespace is trimmed too, and fall back when nothing usable is left.
    title_match = re.search(r'<title[^>]*>([^<]*)</title>', html, re.IGNORECASE)
    raw_title = title_match.group(1) if title_match else ""
    title = ' '.join(unescape(raw_title).split()) or "Untitled"

    # Remove common page-chrome elements.
    html = re.sub(r'<nav[^>]*>.*?</nav>', ' ', html, flags=element_flags)
    html = re.sub(r'<header[^>]*>.*?</header>', ' ', html, flags=element_flags)
    html = re.sub(r'<footer[^>]*>.*?</footer>', ' ', html, flags=element_flags)

    # Turn the end of block-level elements and <br> into line breaks.
    html = re.sub(r'</(p|div|h[1-6]|li|tr)>', '\n', html, flags=re.IGNORECASE)
    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.IGNORECASE)

    # Strip every remaining tag.
    text = re.sub(r'<[^>]+>', ' ', html)

    # Normalise whitespace: collapse spaces/tabs, trim lines, drop blank ones.
    text = unescape(text)
    text = re.sub(r'[ \t]+', ' ', text)
    trimmed_lines = (line.strip() for line in text.split('\n'))
    text = '\n'.join(line for line in trimmed_lines if line)

    return title, text

def chunk_text(text, max_chars=2000, overlap=200):
    """Split text into overlapping chunks.

    Each chunk spans at most ``max_chars`` characters. When a chunk would end
    before the end of the text, the cut is moved back to the last paragraph
    break (``'\\n\\n'``) or, failing that, to just after the last sentence
    terminator (``'. '``, ``'? '``, ``'! '``), provided the break lies far
    enough into the chunk. The next chunk starts ``overlap`` characters
    before the previous cut. Chunks of 100 characters or fewer (after
    stripping) are skipped.

    Args:
        text: Text to split.
        max_chars: Maximum chunk length in characters.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        List of chunk strings; empty when no chunk exceeds 100 characters.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``max_chars``; the
            window could never advance.
    """
    if overlap >= max_chars:
        raise ValueError("overlap must be smaller than max_chars")

    min_chunk_chars = 100
    # A break is only accepted this far past the chunk start. It is never
    # less than the overlap, so the next start always moves forward.
    min_break_offset = max(500, overlap)

    chunks = []
    text_len = len(text)
    start = 0

    while start < text_len:
        end = start + max_chars

        # Try to break at a paragraph or sentence boundary.
        if end < text_len:
            earliest = start + min_break_offset
            para_break = text.rfind('\n\n', start, end)
            if para_break > earliest:
                end = para_break
            else:
                sent_break = max(
                    text.rfind(mark, start, end) for mark in ('. ', '? ', '! ')
                )
                if sent_break > earliest:
                    end = sent_break + 1  # keep the punctuation in this chunk

        chunk = text[start:end].strip()
        if len(chunk) > min_chunk_chars:  # Skip tiny chunks
            chunks.append(chunk)

        # Stop once the end of the text is covered. Stepping back by
        # `overlap` here would only re-emit a tail already contained in the
        # chunk just produced.
        if end >= text_len:
            break
        start = end - overlap

    return chunks

def get_embedding(text):
    """Request an embedding vector for *text* from the Ollama embed endpoint.

    Posts a JSON body naming the ``nomic-embed-text`` model to
    ``OLLAMA_EMBED_URL`` and returns the first entry of the ``embeddings``
    field of the JSON reply.

    Returns:
        The first embedding from the reply, or ``None`` when the request or
        the parsing of the reply fails (the error is printed to stderr) or
        when the reply carries no ``embeddings`` field.
    """
    import json

    body = json.dumps({"model": "nomic-embed-text", "input": text}).encode()
    request = urllib.request.Request(
        OLLAMA_EMBED_URL,
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    try:
        with urllib.request.urlopen(request, timeout=60) as response:
            raw_reply = response.read().decode()
        parsed = json.loads(raw_reply)
        vectors = parsed.get("embeddings", [None])
        return vectors[0]
    except Exception as err:
        print(f"❌ Error generating embedding: {err}", file=sys.stderr)
        return None

def compute_checksum(text):
    """Return the SHA-256 hex digest of *text*, prefixed with ``sha256:``."""
    digest = hashlib.sha256(text.encode()).hexdigest()
    return "sha256:" + digest

def store_in_kb(text, metadata):
    """Embed *text* and upsert it as a single point into the collection.

    The point gets a random UUID as its id, the embedding as its vector and
    *metadata* as its payload. It is sent with a PUT request to the
    collection's ``points`` endpoint under ``QDRANT_URL``.

    Returns:
        ``True`` when the reply's ``status`` field equals ``"ok"``;
        ``False`` when no embedding could be produced, the request failed
        (the error is printed to stderr) or the status was anything else.
    """
    import json
    import uuid

    vector = get_embedding(text)
    if not vector:
        return False

    endpoint = f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points"
    record = {
        "id": str(uuid.uuid4()),
        "vector": vector,
        "payload": metadata,
    }
    body = json.dumps({"points": [record]}).encode()
    request = urllib.request.Request(
        endpoint,
        data=body,
        headers={"Content-Type": "application/json"},
        method="PUT",
    )

    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            raw_reply = response.read().decode()
        reply = json.loads(raw_reply)
        return reply.get("status") == "ok"
    except Exception as err:
        print(f"❌ Error storing: {err}", file=sys.stderr)
        return False

def main():
    """Command-line entry point: scrape one URL into the knowledge base.

    Fetches the URL, extracts its title and text, splits the text into
    chunks and stores every chunk with its metadata. Storing is best-effort
    per chunk: a failed chunk is reported and the remaining ones are still
    attempted.

    Exits with status 1 when the page cannot be fetched, the extracted text
    is shorter than 200 characters, no chunks are produced, or at least one
    chunk failed to store.
    """
    # Local import, matching the function-scope imports used elsewhere in
    # this script.
    from datetime import date

    parser = argparse.ArgumentParser(description="Scrape URL to knowledge base")
    parser.add_argument("url", help="URL to scrape")
    parser.add_argument("domain", help="Knowledge domain (e.g., Python, OpenClaw)")
    parser.add_argument("path", help="Hierarchical path (e.g., OpenClaw/Docs/Overview)")
    parser.add_argument("--title", help="Override title")
    parser.add_argument("--subjects", help="Comma-separated subjects")
    parser.add_argument("--category", default="reference", help="Category: reference|tutorial|snippet|troubleshooting|concept")
    parser.add_argument("--content-type", default="web_page", help="Content type: web_page|code|markdown|pdf|note")

    args = parser.parse_args()

    print(f"🔍 Fetching {args.url}...")
    html = fetch_url(args.url)
    if not html:
        sys.exit(1)

    print("✂️  Extracting text...")
    title, text = extract_text(html)
    if args.title:
        title = args.title

    print(f"📄 Title: {title}")
    print(f"📝 Content length: {len(text)} chars")

    if len(text) < 200:
        print("❌ Content too short, skipping", file=sys.stderr)
        sys.exit(1)

    print("🧩 Chunking...")
    chunks = chunk_text(text)
    total = len(chunks)
    print(f"   {total} chunks")

    if not chunks:
        print("❌ No chunks produced, nothing to store", file=sys.stderr)
        sys.exit(1)

    # Drop empty entries so input like "a,,b," does not store blank subjects.
    subjects = []
    if args.subjects:
        subjects = [s.strip() for s in args.subjects.split(",") if s.strip()]

    # The checksum covers the whole extracted text and is shared by all chunks.
    checksum = compute_checksum(text)
    # Record the actual ingestion date rather than a fixed literal.
    date_added = date.today().isoformat()

    print("💾 Storing chunks...")
    stored = 0
    for index, chunk in enumerate(chunks, start=1):
        preview = (chunk[:200] + "...") if len(chunk) > 200 else chunk
        chunk_metadata = {
            "domain": args.domain,
            "path": f"{args.path}/chunk-{index}",
            "subjects": subjects,
            "category": args.category,
            "content_type": args.content_type,
            "title": f"{title} (part {index}/{total})",
            "checksum": checksum,
            "source_url": args.url,
            "date_added": date_added,
            "chunk_index": index,
            "total_chunks": total,
            "text_preview": preview,
        }

        if store_in_kb(chunk, chunk_metadata):
            stored += 1
            print(f"   ✓ Chunk {index}/{total}")
        else:
            print(f"   ✗ Chunk {index}/{total} failed")

    print(f"\n🎉 Stored {stored}/{total} chunks in knowledge_base")
    print(f"   Domain: {args.domain}")
    print(f"   Path: {args.path}")

    # Signal incomplete ingestion to the caller instead of exiting 0.
    if stored < total:
        print(f"❌ {total - stored} chunk(s) failed to store", file=sys.stderr)
        sys.exit(1)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()