#!/usr/bin/env python3
"""
Scrape web content and store in knowledge_base collection

Usage: scrape_to_kb.py <url> <domain> <path> [--title "Title"] [--subjects "a,b,c"]
"""

import argparse
import datetime
import hashlib
import json
import re
import sys
import urllib.error
import urllib.request
import uuid
from html import unescape

QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "knowledge_base"
OLLAMA_EMBED_URL = "http://localhost:11434/api/embed"

_HTML_FLAGS = re.DOTALL | re.IGNORECASE


def _element_re(tag):
    """Compile a pattern matching a whole ``<tag ...>...</tag>`` element."""
    return re.compile(rf'<{tag}\b[^>]*>.*?</{tag}\s*>', _HTML_FLAGS)


# Elements whose content is never page text.
_SCRIPT_STYLE_RES = tuple(_element_re(tag) for tag in ("script", "style"))
# Page chrome that is dropped together with its content.
_CHROME_RES = tuple(_element_re(tag) for tag in ("nav", "header", "footer"))
_TITLE_RE = re.compile(r'<title\b[^>]*>([^<]*)</title\s*>', re.IGNORECASE)
# Closing block-level tags and <br> become line breaks.
_BLOCK_END_RE = re.compile(r'</(?:p|div|h[1-6]|li|tr)\s*>', re.IGNORECASE)
_BR_RE = re.compile(r'<br\s*/?>', re.IGNORECASE)
_ANY_TAG_RE = re.compile(r'<[^>]+>')
_SPACES_RE = re.compile(r'[ \t]+')
_BLANK_RUN_RE = re.compile(r'\n{3,}')


def fetch_url(url):
    """Fetch URL content.

    Args:
        url: Address to download.

    Returns:
        The response body decoded to ``str``, or ``None`` if the request
        failed (the error is reported on stderr).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    req = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            raw = response.read()
            # Honour the charset declared in Content-Type; default to UTF-8.
            charset = response.headers.get_content_charset() or 'utf-8'
    except Exception as e:  # CLI boundary: report and let the caller exit
        print(f"❌ Error fetching {url}: {e}", file=sys.stderr)
        return None
    try:
        return raw.decode(charset, errors='ignore')
    except LookupError:
        # The server declared a codec Python does not know.
        return raw.decode('utf-8', errors='ignore')


def extract_text(html):
    """Extract clean text from HTML.

    Args:
        html: Raw HTML markup.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the unescaped content of the
        ``<title>`` element, or ``"Untitled"`` when there is none. ``text``
        is the tag-free page text with entities unescaped, one stripped line
        per source line, and paragraphs separated by a single blank line.
    """
    # Remove script and style tags
    for pattern in _SCRIPT_STYLE_RES:
        html = pattern.sub(' ', html)

    # Extract title
    title_match = _TITLE_RE.search(html)
    title = unescape(title_match.group(1).strip()) if title_match else "Untitled"

    # Remove nav/header/footer common patterns
    for pattern in _CHROME_RES:
        html = pattern.sub(' ', html)

    # Convert common block elements to newlines
    html = _BLOCK_END_RE.sub('\n', html)
    html = _BR_RE.sub('\n', html)

    # Remove all remaining tags
    text = _ANY_TAG_RE.sub(' ', html)

    # Clean up whitespace. Blank lines are collapsed to one rather than
    # dropped so that chunk_text() can still find paragraph breaks.
    text = unescape(text)
    text = _SPACES_RE.sub(' ', text)
    text = '\n'.join(line.strip() for line in text.split('\n'))
    text = _BLANK_RUN_RE.sub('\n\n', text).strip()

    return title, text


def chunk_text(text, max_chars=2000, overlap=200, min_chars=100):
    """Split text into overlapping chunks.

    Each chunk is at most ``max_chars`` long. When a paragraph break
    (blank line) or, failing that, a sentence end lies beyond the first
    quarter of the window, the chunk is cut there instead of mid-sentence.

    Args:
        text: Text to split.
        max_chars: Maximum chunk length in characters.
        overlap: Number of characters shared between consecutive chunks.
        min_chars: Chunks that are not longer than this are skipped.

    Returns:
        List of stripped chunk strings (empty for empty input).

    Raises:
        ValueError: If ``max_chars`` is not positive or ``overlap`` is
            negative or not smaller than ``max_chars``.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be positive")
    if not 0 <= overlap < max_chars:
        raise ValueError("overlap must be >= 0 and smaller than max_chars")

    min_break = max_chars // 4  # 500 with the default window
    text_len = len(text)
    chunks = []
    start = 0

    while start < text_len:
        end = min(start + max_chars, text_len)

        # Try to break at sentence or paragraph
        if end < text_len:
            para_break = text.rfind('\n\n', start, end)
            if para_break > start + min_break:
                end = para_break
            else:
                sent_break = max(
                    text.rfind(mark, start, end) for mark in ('. ', '? ', '! ')
                )
                if sent_break > start + min_break:
                    end = sent_break + 1  # keep the punctuation mark

        chunk = text[start:end].strip()
        if len(chunk) > min_chars:  # Skip tiny chunks
            chunks.append(chunk)

        # The chunk that reaches the end of the text is the last one; going
        # on would only re-emit a tail already contained in it.
        if end >= text_len:
            break

        # Step back by the overlap, but never at the cost of forward progress.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks


def get_embedding(text):
    """Generate embedding via Ollama.

    Args:
        text: Text to embed.

    Returns:
        The first embedding vector of the response, or ``None`` when the
        request fails or the response carries no embedding (the error is
        reported on stderr).
    """
    data = {
        "model": "nomic-embed-text",
        "input": text
    }

    req = urllib.request.Request(
        OLLAMA_EMBED_URL,
        data=json.dumps(data).encode(),
        headers={"Content-Type": "application/json"},
        method="POST"
    )

    try:
        with urllib.request.urlopen(req, timeout=60) as response:
            result = json.loads(response.read().decode())
    except Exception as e:  # CLI boundary: report and signal failure
        print(f"❌ Error generating embedding: {e}", file=sys.stderr)
        return None

    embeddings = result.get("embeddings") or []
    if not embeddings:
        print("❌ Error generating embedding: empty response", file=sys.stderr)
        return None
    return embeddings[0]


def compute_checksum(text):
    """Compute SHA256 checksum.

    Args:
        text: Text to hash (encoded as UTF-8).

    Returns:
        The hex digest prefixed with ``"sha256:"``.
    """
    digest = hashlib.sha256(text.encode('utf-8')).hexdigest()
    return f"sha256:{digest}"


def store_in_kb(text, metadata):
    """Store chunk in knowledge_base.

    Embeds ``text`` and upserts one point, with a random UUID as id and
    ``metadata`` as payload, into the configured Qdrant collection.

    Args:
        text: Chunk text to embed.
        metadata: Dict stored as the point payload.

    Returns:
        ``True`` if the server answered with status ``"ok"``; ``False`` if
        the embedding or the request failed.
    """
    embedding = get_embedding(text)
    if not embedding:
        return False

    point = {
        "id": str(uuid.uuid4()),
        "vector": embedding,
        "payload": metadata
    }

    url = f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points"
    req = urllib.request.Request(
        url,
        data=json.dumps({"points": [point]}).encode(),
        headers={"Content-Type": "application/json"},
        method="PUT"
    )

    try:
        with urllib.request.urlopen(req, timeout=10) as response:
            result = json.loads(response.read().decode())
            return result.get("status") == "ok"
    except Exception as e:  # CLI boundary: report and signal failure
        print(f"❌ Error storing: {e}", file=sys.stderr)
        return False


def main():
    """Scrape one URL, chunk its text and store every chunk.

    Exits with status 1 when the page cannot be fetched, when the extracted
    text is shorter than 200 characters, or when any chunk fails to store.
    """
    parser = argparse.ArgumentParser(description="Scrape URL to knowledge base")
    parser.add_argument("url", help="URL to scrape")
    parser.add_argument("domain", help="Knowledge domain (e.g., Python, OpenClaw)")
    parser.add_argument("path", help="Hierarchical path (e.g., OpenClaw/Docs/Overview)")
    parser.add_argument("--title", help="Override title")
    parser.add_argument("--subjects", help="Comma-separated subjects")
    parser.add_argument("--category", default="reference",
                        help="Category: reference|tutorial|snippet|troubleshooting|concept")
    parser.add_argument("--content-type", default="web_page",
                        help="Content type: web_page|code|markdown|pdf|note")

    args = parser.parse_args()

    print(f"🔍 Fetching {args.url}...")
    html = fetch_url(args.url)
    if not html:
        sys.exit(1)

    print("✂️ Extracting text...")
    title, text = extract_text(html)

    if args.title:
        title = args.title

    print(f"📄 Title: {title}")
    print(f"📝 Content length: {len(text)} chars")

    if len(text) < 200:
        print("❌ Content too short, skipping", file=sys.stderr)
        sys.exit(1)

    print("🧩 Chunking...")
    chunks = chunk_text(text)
    print(f" {len(chunks)} chunks")

    # Drop empty entries produced by inputs such as "a,,b" or "a, ".
    subjects = (
        [s.strip() for s in args.subjects.split(",") if s.strip()]
        if args.subjects else []
    )

    checksum = compute_checksum(text)
    # Stamp the actual ingestion date (UTC) instead of a fixed literal.
    date_added = datetime.datetime.now(datetime.timezone.utc).date().isoformat()
    total = len(chunks)

    print("💾 Storing chunks...")
    stored = 0
    for index, chunk in enumerate(chunks, start=1):
        preview = chunk[:200] + "..." if len(chunk) > 200 else chunk
        # NOTE(review): only a 200-character preview is stored in the
        # payload, not the full chunk text - confirm this is intended.
        chunk_metadata = {
            "domain": args.domain,
            "path": f"{args.path}/chunk-{index}",
            "subjects": subjects,
            "category": args.category,
            "content_type": args.content_type,
            "title": f"{title} (part {index}/{total})",
            "checksum": checksum,
            "source_url": args.url,
            "date_added": date_added,
            "chunk_index": index,
            "total_chunks": total,
            "text_preview": preview
        }

        if store_in_kb(chunk, chunk_metadata):
            stored += 1
            print(f" ✓ Chunk {index}/{total}")
        else:
            print(f" ✗ Chunk {index}/{total} failed")

    print(f"\n🎉 Stored {stored}/{total} chunks in knowledge_base")
    print(f" Domain: {args.domain}")
    print(f" Path: {args.path}")

    # Let calling scripts detect partial or total storage failure.
    if stored < total:
        sys.exit(1)


if __name__ == "__main__":
    main()