Files
jarvis-memory/skills/qdrant-memory/scripts/scrape_to_kb.py

221 lines
7.3 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Scrape web content and store in knowledge_base collection
Usage: scrape_to_kb.py <url> <domain> <path> [--title "Title"] [--subjects "a,b,c"]
"""
import argparse
import sys
import re
import hashlib
import urllib.request
import urllib.error
from html import unescape
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "knowledge_base"
OLLAMA_EMBED_URL = "http://localhost:11434/api/embed"
def fetch_url(url):
"""Fetch URL content"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
req = urllib.request.Request(url, headers=headers)
try:
with urllib.request.urlopen(req, timeout=30) as response:
return response.read().decode('utf-8', errors='ignore')
except Exception as e:
print(f"❌ Error fetching {url}: {e}", file=sys.stderr)
return None
def extract_text(html):
"""Extract clean text from HTML"""
# Remove script and style tags
html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
# Extract title
title_match = re.search(r'<title[^>]*>([^<]*)</title>', html, re.IGNORECASE)
title = title_match.group(1).strip() if title_match else "Untitled"
title = unescape(title)
# Remove nav/header/footer common patterns
html = re.sub(r'<nav[^>]*>.*?</nav>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
html = re.sub(r'<header[^>]*>.*?</header>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
html = re.sub(r'<footer[^>]*>.*?</footer>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
# Convert common block elements to newlines
html = re.sub(r'</(p|div|h[1-6]|li|tr)>', '\n', html, flags=re.IGNORECASE)
html = re.sub(r'<br\s*/?>', '\n', html, flags=re.IGNORECASE)
# Remove all remaining tags
text = re.sub(r'<[^>]+>', ' ', html)
# Clean up whitespace
text = unescape(text)
text = re.sub(r'\n\s*\n', '\n\n', text)
text = re.sub(r'[ \t]+', ' ', text)
text = '\n'.join(line.strip() for line in text.split('\n'))
text = '\n'.join(line for line in text.split('\n') if line)
return title, text
def chunk_text(text, max_chars=2000, overlap=200):
"""Split text into overlapping chunks"""
chunks = []
start = 0
while start < len(text):
end = start + max_chars
# Try to break at sentence or paragraph
if end < len(text):
# Look for paragraph break
para_break = text.rfind('\n\n', start, end)
if para_break > start + 500:
end = para_break
else:
# Look for sentence break
sent_break = max(
text.rfind('. ', start, end),
text.rfind('? ', start, end),
text.rfind('! ', start, end)
)
if sent_break > start + 500:
end = sent_break + 1
chunk = text[start:end].strip()
if len(chunk) > 100: # Skip tiny chunks
chunks.append(chunk)
start = end - overlap
if start >= len(text):
break
return chunks
def get_embedding(text):
"""Generate embedding via Ollama"""
import json
data = {
"model": "nomic-embed-text",
"input": text
}
req = urllib.request.Request(
OLLAMA_EMBED_URL,
data=json.dumps(data).encode(),
headers={"Content-Type": "application/json"},
method="POST"
)
try:
with urllib.request.urlopen(req, timeout=60) as response:
result = json.loads(response.read().decode())
return result.get("embeddings", [None])[0]
except Exception as e:
print(f"❌ Error generating embedding: {e}", file=sys.stderr)
return None
def compute_checksum(text):
"""Compute SHA256 checksum"""
return f"sha256:{hashlib.sha256(text.encode()).hexdigest()}"
def store_in_kb(text, metadata):
"""Store chunk in knowledge_base"""
import json
import uuid
embedding = get_embedding(text)
if not embedding:
return False
point = {
"id": str(uuid.uuid4()),
"vector": embedding,
"payload": metadata
}
url = f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points"
req = urllib.request.Request(
url,
data=json.dumps({"points": [point]}).encode(),
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("status") == "ok"
except Exception as e:
print(f"❌ Error storing: {e}", file=sys.stderr)
return False
def main():
parser = argparse.ArgumentParser(description="Scrape URL to knowledge base")
parser.add_argument("url", help="URL to scrape")
parser.add_argument("domain", help="Knowledge domain (e.g., Python, OpenClaw)")
parser.add_argument("path", help="Hierarchical path (e.g., OpenClaw/Docs/Overview)")
parser.add_argument("--title", help="Override title")
parser.add_argument("--subjects", help="Comma-separated subjects")
parser.add_argument("--category", default="reference", help="Category: reference|tutorial|snippet|troubleshooting|concept")
parser.add_argument("--content-type", default="web_page", help="Content type: web_page|code|markdown|pdf|note")
args = parser.parse_args()
print(f"🔍 Fetching {args.url}...")
html = fetch_url(args.url)
if not html:
sys.exit(1)
print("✂️ Extracting text...")
title, text = extract_text(html)
if args.title:
title = args.title
print(f"📄 Title: {title}")
print(f"📝 Content length: {len(text)} chars")
if len(text) < 200:
print("❌ Content too short, skipping", file=sys.stderr)
sys.exit(1)
print("🧩 Chunking...")
chunks = chunk_text(text)
print(f" {len(chunks)} chunks")
subjects = [s.strip() for s in args.subjects.split(",")] if args.subjects else []
checksum = compute_checksum(text)
date_added = "2026-02-05"
print("💾 Storing chunks...")
stored = 0
for i, chunk in enumerate(chunks):
chunk_metadata = {
"domain": args.domain,
"path": f"{args.path}/chunk-{i+1}",
"subjects": subjects,
"category": args.category,
"content_type": args.content_type,
"title": f"{title} (part {i+1}/{len(chunks)})",
"checksum": checksum,
"source_url": args.url,
"date_added": date_added,
"chunk_index": i + 1,
"total_chunks": len(chunks),
"text_preview": chunk[:200] + "..." if len(chunk) > 200 else chunk
}
if store_in_kb(chunk, chunk_metadata):
stored += 1
print(f" ✓ Chunk {i+1}/{len(chunks)}")
else:
print(f" ✗ Chunk {i+1}/{len(chunks)} failed")
print(f"\n🎉 Stored {stored}/{len(chunks)} chunks in knowledge_base")
print(f" Domain: {args.domain}")
print(f" Path: {args.path}")
if __name__ == "__main__":
main()