Initial commit: Jarvis Memory system
This commit is contained in:
159
skills/qdrant-memory/scripts/batch_crawl.py
Executable file
159
skills/qdrant-memory/scripts/batch_crawl.py
Executable file
@@ -0,0 +1,159 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Batch URL Crawler - Scrape multiple URLs to knowledge base
|
||||
Usage: batch_crawl.py urls.txt --domain "Python" --path "Docs/Tutorials"
|
||||
"""
|
||||
|
||||
import argparse
import concurrent.futures
import json
import sys
import urllib.request
from datetime import date
from pathlib import Path

# Make the sibling scrape_to_kb module importable regardless of CWD.
sys.path.insert(0, str(Path(__file__).parent))

from scrape_to_kb import fetch_url, extract_text, chunk_text, get_embedding, compute_checksum, store_in_kb
|
||||
|
||||
# Qdrant vector-database endpoint (hard-coded LAN address) and target collection.
# NOTE(review): neither constant is referenced anywhere in this script —
# the scrape_to_kb helpers presumably carry their own copies; confirm
# before relying on these values.
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "knowledge_base"
|
||||
|
||||
def load_urls(url_source):
    """Load (url, title, subjects) entries from a .txt or .json file.

    JSON format: a list of objects with a required "url" key and optional
    "title" and "subjects" keys.

    Text format: one entry per line, ``URL [Title] #subject1,#subject2``;
    the title and subjects parts are optional.  Blank lines and lines
    starting with '#' are skipped as comments.

    Args:
        url_source: Path to the list file; a ".json" suffix selects the
            JSON parser, anything else the plain-text parser.

    Returns:
        list of (url, title, subjects) tuples, where title is None and
        subjects is [] when not supplied.
    """
    if url_source.endswith('.json'):
        with open(url_source) as f:
            data = json.load(f)
        return [(item['url'], item.get('title'), item.get('subjects', []))
                for item in data]

    urls = []
    with open(url_source) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            # Parse URL [title] [subjects]
            parts = line.split(' ', 1)
            url = parts[0]
            title = None
            subjects = []
            if len(parts) > 1:
                # Check for [Title] and #subject1,#subject2
                rest = parts[1]
                if '[' in rest and ']' in rest:
                    title = rest[rest.find('[') + 1:rest.find(']')]
                    rest = rest[rest.find(']') + 1:]
                if '#' in rest:
                    # BUG FIX: the original kept the ',' separators from the
                    # documented "#a,#b" format, yielding subjects like "a,".
                    # Strip separator commas and surrounding whitespace.
                    cleaned = [s.strip().rstrip(',').strip()
                               for s in rest.split('#')]
                    subjects = [s for s in cleaned if s]
            urls.append((url, title, subjects))
    return urls
|
||||
|
||||
def scrape_single(url_data, domain, path, category, content_type):
    """Fetch one URL, chunk its text, and store each chunk in the KB.

    Args:
        url_data: (url, title_override, subjects) tuple as produced by
            load_urls(); a non-empty title_override replaces the page's
            extracted title.
        domain: Knowledge-domain tag stored on every chunk.
        path: Hierarchical path prefix; chunk i is stored under
            "{path}/chunk-{i}".
        category: Category label (e.g. "reference", "tutorial").
        content_type: Content-type label (e.g. "web_page").

    Returns:
        dict: always has "url" and "status" keys; status is one of
        "success" (plus "chunks"/"stored"/"title"), "failed" (fetch
        returned nothing), "skipped" (page text too short), or "error"
        (unexpected exception, with "error" text).
    """
    url, title_override, subjects = url_data

    try:
        print(f"🔍 {url}")
        html = fetch_url(url)
        if not html:
            return {"url": url, "status": "failed", "error": "fetch"}

        title, text = extract_text(html)
        if title_override:
            title = title_override

        # Very short pages are usually error/redirect stubs — not worth storing.
        if len(text) < 200:
            return {"url": url, "status": "skipped", "reason": "too_short"}

        chunks = chunk_text(text)
        checksum = compute_checksum(text)

        stored = 0
        total = len(chunks)
        for i, chunk in enumerate(chunks, start=1):
            chunk_metadata = {
                "domain": domain,
                "path": f"{path}/chunk-{i}",
                "subjects": subjects,
                "category": category,
                "content_type": content_type,
                "title": f"{title} (part {i}/{total})",
                "checksum": checksum,
                "source_url": url,
                # BUG FIX: was the hard-coded literal "2026-02-05", which
                # would mis-stamp every ingestion after that day; record
                # the actual run date instead.
                "date_added": date.today().isoformat(),
                "chunk_index": i,
                "total_chunks": total,
                "text_preview": chunk[:200] + "..." if len(chunk) > 200 else chunk
            }

            if store_in_kb(chunk, chunk_metadata):
                stored += 1

        return {
            "url": url,
            "status": "success",
            "chunks": total,
            "stored": stored,
            "title": title
        }
    except Exception as e:
        # A batch run must survive one bad page: report it and move on.
        return {"url": url, "status": "error", "error": str(e)}
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, then batch-scrape the URL list."""
    cli = argparse.ArgumentParser(description="Batch scrape URLs to knowledge base")
    cli.add_argument("urls", help="File with URLs (.txt or .json)")
    cli.add_argument("--domain", required=True, help="Knowledge domain")
    cli.add_argument("--path", required=True, help="Hierarchical path")
    cli.add_argument("--category", default="reference",
                     choices=["reference", "tutorial", "snippet", "troubleshooting", "concept"])
    cli.add_argument("--content-type", default="web_page")
    cli.add_argument("--workers", type=int, default=3, help="Concurrent workers (default: 3)")
    cli.add_argument("--dry-run", action="store_true", help="Test without storing")

    opts = cli.parse_args()

    url_entries = load_urls(opts.urls)
    print(f"📋 Loaded {len(url_entries)} URLs")
    print(f"🏷️ Domain: {opts.domain}")
    print(f"📂 Path: {opts.path}")
    print(f"⚡ Workers: {opts.workers}")

    if opts.dry_run:
        # Preview mode: show what would be scraped, store nothing.
        print("\n🔍 DRY RUN - No storage\n")
        for url, title, subjects in url_entries:
            print(f" Would scrape: {url}")
            if title:
                print(f" Title: {title}")
            if subjects:
                print(f" Subjects: {', '.join(subjects)}")
        return

    outcomes = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=opts.workers) as pool:
        # Submit every URL up front; print each result as it completes.
        pending = {}
        for entry in url_entries:
            task = pool.submit(scrape_single, entry, opts.domain, opts.path,
                               opts.category, opts.content_type)
            pending[task] = entry

        for task in concurrent.futures.as_completed(pending):
            outcome = task.result()
            outcomes.append(outcome)

            status = outcome["status"]
            if status == "success":
                print(f" ✓ {outcome['title'][:50]}... ({outcome['stored']}/{outcome['chunks']} chunks)")
            elif status == "skipped":
                print(f" ⚠ Skipped: {outcome.get('reason')}")
            else:
                print(f" ✗ Failed: {outcome.get('error', 'unknown')}")

    # Tally per-status counts for the summary.
    success = 0
    failed = 0
    skipped = 0
    for outcome in outcomes:
        if outcome["status"] == "success":
            success += 1
        elif outcome["status"] == "skipped":
            skipped += 1
        elif outcome["status"] in ("failed", "error"):
            failed += 1

    print(f"\n📊 Summary:")
    print(f" ✓ Success: {success}")
    print(f" ✗ Failed: {failed}")
    print(f" ⚠ Skipped: {skipped}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user