Initial commit: Jarvis Memory system

2026-02-23 12:13:04 -06:00
commit e8854cd959
72 changed files with 14801 additions and 0 deletions
--- a/skills/qdrant-memory/scripts/smart_search.py
+++ b/skills/qdrant-memory/scripts/smart_search.py
@@ -0,0 +1,321 @@
+#!/usr/bin/env python3
+"""
+Hybrid search: knowledge_base first, then web search, store new findings.
+Usage: smart_search.py "query" [--domain "Domain"] [--min-kb-score 0.5] [--store-new] [--web-limit 3] [--json]
+"""
+
+import argparse
+import sys
+import json
+import urllib.request
+import urllib.parse
+import re
+from datetime import datetime
+
+# NOTE(review): these addresses look deployment-specific (private LAN / localhost) — confirm before reuse.
+# Base URL of the Qdrant REST API used for knowledge-base search and upserts.
+QDRANT_URL = "http://10.0.0.40:6333"
+# Ollama embedding endpoint that get_embedding() POSTs text to.
+OLLAMA_EMBED_URL = "http://localhost:11434/api/embed"
+# Base URL of the SearXNG instance queried by web_search().
+SEARXNG_URL = "http://10.0.0.8:8888"
+# Name of the Qdrant collection holding knowledge-base entries.
+KB_COLLECTION = "knowledge_base"
+
+def get_embedding(text):
+    """Return the embedding vector for ``text`` from the Ollama embed endpoint.
+
+    Only the first 1000 characters are sent, to keep the call fast.
+    Any request or parsing failure is reported on stderr and yields ``None``.
+    """
+    body = json.dumps({
+        "model": "nomic-embed-text",
+        "input": text[:1000],  # Limit for speed
+    }).encode()
+    request = urllib.request.Request(
+        OLLAMA_EMBED_URL,
+        data=body,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(request, timeout=30) as resp:
+            raw = resp.read().decode()
+        parsed = json.loads(raw)
+        vectors = parsed.get("embeddings", [None])
+        # One input string was sent, so the first vector is the one we want
+        return vectors[0]
+    except Exception as err:
+        print(f"⚠️  Embedding error: {err}", file=sys.stderr)
+        return None
+
+def search_knowledge_base(query, domain=None, limit=5, min_score=0.5):
+    """Return knowledge-base hits for ``query`` scoring at least ``min_score``.
+
+    The query is embedded, sent to the Qdrant points/search endpoint
+    (optionally restricted to ``domain``), and the hits are filtered
+    client-side. An empty list is returned when no embedding is available
+    or the search request fails.
+    """
+    vector = get_embedding(query)
+    if not vector:
+        return []
+
+    # The score threshold is deliberately applied client-side rather than via
+    # Qdrant's score_threshold, which filters aggressively.
+    body = {"vector": vector, "limit": limit, "with_payload": True}
+    if domain:
+        domain_clause = {"key": "domain", "match": {"value": domain}}
+        body["filter"] = {"must": [domain_clause]}
+
+    request = urllib.request.Request(
+        f"{QDRANT_URL}/collections/{KB_COLLECTION}/points/search",
+        data=json.dumps(body).encode(),
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+
+    try:
+        with urllib.request.urlopen(request, timeout=10) as resp:
+            parsed = json.loads(resp.read().decode())
+            kept = []
+            for hit in parsed.get("result", []):
+                if hit.get("score", 0) >= min_score:
+                    kept.append(hit)
+            return kept
+    except Exception as err:
+        print(f"⚠️  KB search error: {err}", file=sys.stderr)
+        return []
+
+def web_search(query, limit=5):
+    """Query SearXNG for ``query`` and return at most ``limit`` result dicts.
+
+    Failures are reported on stderr and produce an empty list.
+    """
+    quoted = urllib.parse.quote(query)
+    search_url = f"{SEARXNG_URL}/?q={quoted}&format=json&safesearch=0"
+
+    try:
+        request = urllib.request.Request(
+            search_url, headers={"Accept": "application/json"}
+        )
+        with urllib.request.urlopen(request, timeout=15) as resp:
+            parsed = json.loads(resp.read().decode())
+            hits = parsed.get("results", [])
+            return hits[:limit]
+    except Exception as err:
+        print(f"⚠️  Web search error: {err}", file=sys.stderr)
+        return []
+
+def fetch_and_extract(url):
+    """Fetch a web page and reduce it to a title plus plain text.
+
+    Args:
+        url: Absolute http(s) URL to download.
+
+    Returns:
+        A ``(title, text)`` tuple. ``title`` is the page's <title> content
+        ("Untitled" when the page has none) and ``text`` is the tag-stripped,
+        entity-decoded, whitespace-collapsed body capped at 3000 characters.
+        ``(None, None)`` is returned on any failure: missing or malformed URL,
+        unsupported scheme, network error, or decoding error.
+    """
+    import html as html_lib
+
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
+
+    try:
+        # URLs come from search results, so only web schemes are followed;
+        # urlopen would otherwise also accept e.g. file:// URLs.
+        scheme = urllib.parse.urlsplit(url or "").scheme.lower()
+        if scheme not in ("http", "https"):
+            raise ValueError(f"unsupported URL {url!r}")
+
+        # Request() itself raises on malformed URLs, so it lives inside the try
+        req = urllib.request.Request(url, headers=headers)
+        with urllib.request.urlopen(req, timeout=20) as response:
+            markup = response.read().decode('utf-8', errors='ignore')
+    except Exception as e:
+        print(f"⚠️  Fetch error: {e}", file=sys.stderr)
+        return None, None
+
+    # Extract title
+    title_match = re.search(r'<title[^>]*>([^<]*)</title>', markup, re.IGNORECASE)
+    title = html_lib.unescape(title_match.group(1)).strip() if title_match else "Untitled"
+
+    # Clean HTML: drop script/style bodies first, then every remaining tag
+    markup = re.sub(r'<script[^>]*>.*?</script>', ' ', markup, flags=re.DOTALL | re.IGNORECASE)
+    markup = re.sub(r'<style[^>]*>.*?</style>', ' ', markup, flags=re.DOTALL | re.IGNORECASE)
+    markup = re.sub(r'<[^>]+>', ' ', markup)
+    # Decode entities only after tag stripping so an escaped "&lt;b&gt;" is
+    # kept as text instead of being removed as a tag
+    text = re.sub(r'\s+', ' ', html_lib.unescape(markup)).strip()
+
+    return title, text[:3000]  # Limit content
+
+def is_substantial(text, min_length=500):
+    """Return True when ``text`` has at least ``min_length`` characters.
+
+    ``None`` (e.g. the result of a failed fetch) counts as not substantial
+    instead of raising ``TypeError``.
+    """
+    if text is None:
+        return False
+    return len(text) >= min_length
+
+def is_unique_content(text, kb_results, similarity_threshold=0.8):
+    """Check whether ``text`` is new relative to the given KB search hits.
+
+    Args:
+        text: Candidate content.
+        kb_results: Search hits; each may carry a ``payload`` dict with a
+            ``text_preview`` string. Missing or null values are skipped.
+        similarity_threshold: Fraction of a preview's distinct words that
+            must also appear in ``text`` for it to count as a duplicate.
+
+    Returns:
+        False when any preview longer than 100 characters has more than
+        ``similarity_threshold`` of its words present in ``text``;
+        True otherwise.
+    """
+    if not kb_results:
+        return True
+
+    # The candidate's word set is the same for every hit, so build it once
+    new_words = set(text.lower().split())
+    if not new_words:
+        return True
+
+    for result in kb_results:
+        # Tolerate hits whose payload or preview is absent or null
+        payload = result.get("payload") or {}
+        kb_text = (payload.get("text_preview") or "").lower()
+
+        # Short previews carry too little signal for an overlap check
+        if len(kb_text) <= 100:
+            continue
+
+        kb_words = set(kb_text.split())
+        if not kb_words:
+            continue
+
+        # Simple word overlap, measured against the KB preview's vocabulary
+        overlap = len(kb_words & new_words) / len(kb_words)
+        if overlap > similarity_threshold:
+            return False
+    return True
+
+def store_in_kb(text, metadata):
+    """Embed ``text`` and upsert it into the knowledge-base collection.
+
+    Args:
+        text: Content to store; its first 1000 characters are embedded.
+        metadata: Payload fields for the new point. The dict is copied, so
+            the caller's object is left unmodified.
+
+    Returns:
+        True when Qdrant answers with status "ok"; False when no embedding
+        could be produced or the request fails (the error goes to stderr).
+    """
+    import uuid
+    import hashlib
+
+    embedding = get_embedding(text[:1000])
+    if not embedding:
+        return False
+
+    # Work on a copy so the derived fields do not leak into the caller's dict
+    payload = dict(metadata or {})
+    payload["checksum"] = f"sha256:{hashlib.sha256(text.encode()).hexdigest()[:16]}"
+    payload["date_scraped"] = datetime.now().isoformat()
+    payload["text_preview"] = text[:300] + "..." if len(text) > 300 else text
+
+    point = {
+        "id": str(uuid.uuid4()),
+        "vector": embedding,
+        "payload": payload
+    }
+
+    url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points"
+    req = urllib.request.Request(
+        url,
+        data=json.dumps({"points": [point]}).encode(),
+        headers={"Content-Type": "application/json"},
+        method="PUT"
+    )
+
+    try:
+        with urllib.request.urlopen(req, timeout=10) as response:
+            result = json.loads(response.read().decode())
+            return result.get("status") == "ok"
+    except Exception as e:
+        print(f"⚠️  Store error: {e}", file=sys.stderr)
+        return False
+
+def suggest_domain(query, title, content):
+    """Suggest a knowledge-base domain from the query, title and content.
+
+    The lowercased query, title and first 500 characters of content are
+    scanned for keywords; the first domain (in declaration order) with a
+    matching keyword wins, and "General" is returned when nothing matches.
+
+    Keywords only match at the start of a word, and keywords of three
+    characters or fewer must match a whole word. This stops "ai" from
+    matching "maintenance" or "js" from matching "json", while still letting
+    "network" match "networking". ``None`` arguments are treated as empty.
+    """
+    # Keyword mapping
+    domains = {
+        "Python": ["python", "pip", "django", "flask", "asyncio"],
+        "JavaScript": ["javascript", "js", "node", "react", "vue", "angular"],
+        "Linux": ["linux", "ubuntu", "debian", "systemd", "bash", "shell"],
+        "Networking": ["network", "dns", "tcp", "http", "ssl", "vpn"],
+        "Docker": ["docker", "container", "kubernetes", "k8s"],
+        "AI/ML": ["ai", "ml", "machine learning", "llm", "gpt", "model"],
+        "OpenClaw": ["openclaw"],
+        "Database": ["database", "sql", "postgres", "mysql", "redis"],
+        "Security": ["security", "encryption", "auth", "oauth", "jwt"],
+        "DevOps": ["devops", "ci/cd", "github actions", "jenkins"]
+    }
+
+    combined = " ".join([
+        (query or "").lower(),
+        (title or "").lower(),
+        (content or "")[:500].lower(),
+    ])
+
+    for domain, keywords in domains.items():
+        for kw in keywords:
+            # Short keywords are too ambiguous as prefixes, so they must end
+            # on a word boundary as well
+            tail = r"(?!\w)" if len(kw) <= 3 else ""
+            if re.search(r"(?<!\w)" + re.escape(kw) + tail, combined):
+                return domain
+
+    return "General"
+
+def main():
+    """Run the smart-search CLI.
+
+    Steps: search the knowledge base, search the web, then fetch each web
+    result and — when it is substantial and not already covered by the KB
+    hits — either store it (``--store-new``) or suggest doing so.
+
+    With ``--json`` the result document is the only thing written to stdout;
+    progress messages go to stderr so the output stays parseable.
+    """
+    parser = argparse.ArgumentParser(description="Smart search: KB first, then web, store new")
+    parser.add_argument("query", help="Search query")
+    parser.add_argument("--domain", help="Filter KB by domain")
+    parser.add_argument("--min-kb-score", type=float, default=0.5, help="Minimum KB match score (default: 0.5)")
+    parser.add_argument("--store-new", action="store_true", help="Automatically store new web findings")
+    parser.add_argument("--web-limit", type=int, default=3, help="Number of web results to check")
+    parser.add_argument("--json", action="store_true", help="Output as JSON")
+
+    args = parser.parse_args()
+
+    # Keep stdout clean for the JSON document when --json is requested
+    log_stream = sys.stderr if args.json else sys.stdout
+
+    def log(message):
+        print(message, file=log_stream)
+
+    results = {
+        "query": args.query,
+        "kb_results": [],
+        "web_results": [],
+        "stored_count": 0,
+        "timestamp": datetime.now().isoformat()
+    }
+
+    # Step 1: Search knowledge base
+    log(f"🔍 Searching knowledge base (min score: {args.min_kb_score})...")
+    kb_results = search_knowledge_base(args.query, args.domain, limit=5, min_score=args.min_kb_score)
+    results["kb_results"] = kb_results
+
+    if kb_results:
+        log(f"   ✓ Found {len(kb_results)} KB entries")
+        for r in kb_results:
+            # Payload and its fields may be absent or null on stored points
+            payload = r.get("payload") or {}
+            score = r.get("score", 0)
+            title = (payload.get('title') or 'Untitled')[:50]
+            source = (payload.get('source_url') or 'N/A')[:40]
+            log(f"     • {title}... (score: {score:.2f}) [{source}...]")
+    else:
+        log(f"   ✗ No KB matches above threshold ({args.min_kb_score})")
+
+    # Step 2: Web search
+    log("\n🌐 Searching web...")
+    web_results = web_search(args.query, limit=args.web_limit)
+    results["web_results"] = web_results
+
+    if not web_results:
+        log("   ✗ No web results")
+        if args.json:
+            print(json.dumps(results, indent=2))
+        return
+
+    log(f"   ✓ Found {len(web_results)} web results")
+
+    # Step 3: Check and optionally store new findings
+    new_stored = 0
+
+    for web_result in web_results:
+        url = web_result.get("url") or ""
+        title = web_result.get("title") or "Untitled"
+
+        log(f"\n📄 Checking: {title}")
+        log(f"   URL: {url}")
+
+        # Fetch full content; a result without a URL cannot be fetched at all
+        fetched_title, content = fetch_and_extract(url) if url else (None, None)
+        if not content:
+            log("   ⚠️  Could not fetch content")
+            continue
+
+        title = fetched_title or title
+
+        # Check if substantial
+        if not is_substantial(content):
+            log(f"   ⏭️  Content too short ({len(content)} chars), skipping")
+            continue
+
+        # Check if unique
+        if not is_unique_content(content, kb_results):
+            log("   ⏭️  Similar content already in KB")
+            continue
+
+        log(f"   ✓ New substantial content ({len(content)} chars)")
+
+        # Auto-store or suggest
+        if args.store_new:
+            domain = suggest_domain(args.query, title, content)
+            subjects = [s.strip() for s in args.query.lower().split() if len(s) > 3]
+            # Computed outside the f-string: a backslash inside an f-string
+            # expression is a SyntaxError before Python 3.12
+            safe_title = re.sub(r'[^\w\s-]', '', title)[:30]
+
+            metadata = {
+                "domain": domain,
+                "path": f"{domain}/Web/{safe_title}",
+                "subjects": subjects,
+                "category": "reference",
+                "content_type": "web_page",
+                "title": title,
+                "source_url": url,
+                "date_added": datetime.now().strftime("%Y-%m-%d")
+            }
+
+            if store_in_kb(content, metadata):
+                log(f"   ✅ Stored in KB (domain: {domain})")
+                new_stored += 1
+            else:
+                log("   ❌ Failed to store")
+        else:
+            log("   💡 Use --store-new to save this")
+
+    results["stored_count"] = new_stored
+
+    # Summary
+    log("\n📊 Summary:")
+    log(f"   KB results: {len(kb_results)}")
+    log(f"   Web results checked: {len(web_results)}")
+    log(f"   New items stored: {new_stored}")
+
+    if args.json:
+        print(json.dumps(results, indent=2))
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()