#!/usr/bin/env python3
"""
Hybrid search: knowledge_base first, then web search, store new findings.
Usage: smart_search.py "query" [--domain "Domain"] [--min-kb-score 0.5] [--store-new]
"""

import argparse
import sys
import json
import urllib.request
import urllib.parse
import re
from datetime import datetime

QDRANT_URL = "http://10.0.0.40:6333"
OLLAMA_EMBED_URL = "http://localhost:11434/api/embed"
SEARXNG_URL = "http://10.0.0.8:8888"
KB_COLLECTION = "knowledge_base"

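# Example invocations (illustrative only; the flags match the argparse setup in
# main() below, while the query strings are made up, and the host/port constants
# above are specific to this deployment):
#   ./smart_search.py "qdrant payload indexing"
#   ./smart_search.py "systemd timer units" --domain "Linux" --min-kb-score 0.4 --store-new --json
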
def get_embedding(text):
    """Generate embedding via Ollama"""
    data = {
        "model": "nomic-embed-text",
        "input": text[:1000]  # Limit for speed
    }
    req = urllib.request.Request(
        OLLAMA_EMBED_URL,
        data=json.dumps(data).encode(),
        headers={"Content-Type": "application/json"},
        method="POST"
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
            return result.get("embeddings", [None])[0]
    except Exception as e:
        print(f"⚠️ Embedding error: {e}", file=sys.stderr)
        return None

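# Response shape assumed by get_embedding() (approximate): Ollama's /api/embed
# returns one vector per input string, e.g.
#   {"model": "nomic-embed-text", "embeddings": [[0.012, -0.083, ...]]}
# so result["embeddings"][0] is the vector for the single truncated input.
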
def search_knowledge_base(query, domain=None, limit=5, min_score=0.5):
    """Search knowledge base via vector similarity"""
    embedding = get_embedding(query)
    if not embedding:
        return []

    search_data = {
        "vector": embedding,
        "limit": limit,
        "with_payload": True
    }

    # Note: score_threshold filters aggressively; we filter client-side instead
    # to show users what scores were returned

    if domain:
        search_data["filter"] = {
            "must": [{"key": "domain", "match": {"value": domain}}]
        }

    url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points/search"
    req = urllib.request.Request(
        url,
        data=json.dumps(search_data).encode(),
        headers={"Content-Type": "application/json"},
        method="POST"
    )

    try:
        with urllib.request.urlopen(req, timeout=10) as response:
            result = json.loads(response.read().decode())
            results = result.get("result", [])
            # Filter by min_score client-side
            return [r for r in results if r.get("score", 0) >= min_score]
    except Exception as e:
        print(f"⚠️ KB search error: {e}", file=sys.stderr)
        return []

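# Qdrant's POST /collections/<name>/points/search responds roughly as
#   {"result": [{"id": "...", "score": 0.73, "payload": {...}}, ...], "status": "ok"}
# The client-side min_score filter assumes higher score = closer match, which
# holds for cosine/dot collections; re-tune the threshold if the collection was
# created with a different distance metric.
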
def web_search(query, limit=5):
    """Search via SearXNG"""
    encoded_query = urllib.parse.quote(query)
    url = f"{SEARXNG_URL}/?q={encoded_query}&format=json&safesearch=0"

    try:
        req = urllib.request.Request(url, headers={"Accept": "application/json"})
        with urllib.request.urlopen(req, timeout=15) as response:
            data = json.loads(response.read().decode())
            return data.get("results", [])[:limit]
    except Exception as e:
        print(f"⚠️ Web search error: {e}", file=sys.stderr)
        return []

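# Note: SearXNG only serves format=json when "json" is enabled under
# search.formats in its settings.yml; otherwise this request typically fails
# (e.g. 403) and web_search() returns an empty list via the except branch.
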
def fetch_and_extract(url):
    """Fetch URL and extract clean text"""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    req = urllib.request.Request(url, headers=headers)

    try:
        with urllib.request.urlopen(req, timeout=20) as response:
            html = response.read().decode('utf-8', errors='ignore')

        # Extract title
        title_match = re.search(r'<title[^>]*>([^<]*)</title>', html, re.IGNORECASE)
        title = title_match.group(1).strip() if title_match else "Untitled"

        # Clean HTML
        html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
        html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
        html = re.sub(r'<[^>]+>', ' ', html)
        text = re.sub(r'\s+', ' ', html).strip()

        return title, text[:3000]  # Limit content
    except Exception:
        return None, None


def is_substantial(text, min_length=500):
    """Check if content is substantial enough to store"""
    return len(text) >= min_length

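# fetch_and_extract() keeps to stdlib regexes rather than a real HTML parser,
# so navigation text and other page boilerplate can survive into the stored
# text; the 3000-char cap at least bounds that noise.
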
def is_unique_content(text, kb_results, similarity_threshold=0.8):
    """Check if content is unique compared to existing KB entries"""
    if not kb_results:
        return True

    # Simple check: if any KB result has very similar content, skip
    text_lower = text.lower()
    for result in kb_results:
        payload = result.get("payload", {})
        kb_text = payload.get("text_preview", "").lower()

        # Check for substantial overlap
        if kb_text and len(kb_text) > 100:
            # Simple word overlap check
            kb_words = set(kb_text.split())
            new_words = set(text_lower.split())
            if kb_words and new_words:
                overlap = len(kb_words & new_words) / len(kb_words)
                if overlap > similarity_threshold:
                    return False
    return True

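# Dedup note: the comparison only sees the ~300-char text_preview stored with
# each KB hit, and the ratio |kb_words ∩ new_words| / |kb_words| is asymmetric,
# so this is a cheap guard against obvious re-stores rather than real
# near-duplicate detection.
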
def store_in_kb(text, metadata):
    """Store content in knowledge base"""
    import uuid
    import hashlib

    embedding = get_embedding(text[:1000])
    if not embedding:
        return False

    # Add metadata fields
    metadata["checksum"] = f"sha256:{hashlib.sha256(text.encode()).hexdigest()[:16]}"
    metadata["date_scraped"] = datetime.now().isoformat()
    metadata["text_preview"] = text[:300] + "..." if len(text) > 300 else text

    point = {
        "id": str(uuid.uuid4()),
        "vector": embedding,
        "payload": metadata
    }

    url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points"
    req = urllib.request.Request(
        url,
        data=json.dumps({"points": [point]}).encode(),
        headers={"Content-Type": "application/json"},
        method="PUT"
    )

    try:
        with urllib.request.urlopen(req, timeout=10) as response:
            result = json.loads(response.read().decode())
            return result.get("status") == "ok"
    except Exception as e:
        print(f"⚠️ Store error: {e}", file=sys.stderr)
        return False

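# store_in_kb() upserts via PUT /collections/<name>/points with a fresh UUID
# per call, so re-running a query can add near-duplicate points; the only guard
# is the is_unique_content() check in main(). Qdrant replies with
# {"status": "ok", ...} on success, which is what the return value tests.
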
def suggest_domain(query, title, content):
    """Suggest a domain based on query and content"""
    query_lower = query.lower()
    title_lower = title.lower()
    content_lower = content[:500].lower()

    # Keyword mapping
    domains = {
        "Python": ["python", "pip", "django", "flask", "asyncio"],
        "JavaScript": ["javascript", "js", "node", "react", "vue", "angular"],
        "Linux": ["linux", "ubuntu", "debian", "systemd", "bash", "shell"],
        "Networking": ["network", "dns", "tcp", "http", "ssl", "vpn"],
        "Docker": ["docker", "container", "kubernetes", "k8s"],
        "AI/ML": ["ai", "ml", "machine learning", "llm", "gpt", "model"],
        "OpenClaw": ["openclaw"],
        "Database": ["database", "sql", "postgres", "mysql", "redis"],
        "Security": ["security", "encryption", "auth", "oauth", "jwt"],
        "DevOps": ["devops", "ci/cd", "github actions", "jenkins"]
    }

    combined = query_lower + " " + title_lower + " " + content_lower

    for domain, keywords in domains.items():
        for kw in keywords:
            if kw in combined:
                return domain

    return "General"

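# The first keyword hit wins, so dict insertion order above is the precedence
# order, and matching is plain substring search: short keywords like "ai" or
# "js" can fire inside unrelated words (e.g. "ai" inside "maintain"), which
# biases classification toward whichever domain is listed earlier.
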
def main():
    parser = argparse.ArgumentParser(description="Smart search: KB first, then web, store new")
    parser.add_argument("query", help="Search query")
    parser.add_argument("--domain", help="Filter KB by domain")
    parser.add_argument("--min-kb-score", type=float, default=0.5, help="Minimum KB match score (default: 0.5)")
    parser.add_argument("--store-new", action="store_true", help="Automatically store new web findings")
    parser.add_argument("--web-limit", type=int, default=3, help="Number of web results to check")
    parser.add_argument("--json", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    results = {
        "query": args.query,
        "kb_results": [],
        "web_results": [],
        "stored_count": 0,
        "timestamp": datetime.now().isoformat()
    }
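    # With --json, this dict is printed verbatim at the end: kb_results carries
    # the raw Qdrant hits (id/score/payload) and web_results the raw SearXNG
    # entries (title/url/content), so downstream tooling sees unmodified payloads.
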
    # Step 1: Search knowledge base
    print(f"🔍 Searching knowledge base (min score: {args.min_kb_score})...")
    kb_results = search_knowledge_base(args.query, args.domain, limit=5, min_score=args.min_kb_score)
    results["kb_results"] = kb_results

    if kb_results:
        print(f" ✓ Found {len(kb_results)} KB entries")
        for r in kb_results:
            payload = r.get("payload", {})
            score = r.get("score", 0)
            title = payload.get('title', 'Untitled')[:50]
            source = payload.get('source_url', 'N/A')[:40]
            print(f" • {title}... (score: {score:.2f}) [{source}...]")
    else:
        print(f" ✗ No KB matches above threshold ({args.min_kb_score})")

    # Step 2: Web search
    print(f"\n🌐 Searching web...")
    web_results = web_search(args.query, limit=args.web_limit)
    results["web_results"] = web_results

    if not web_results:
        print(f" ✗ No web results")
        if args.json:
            print(json.dumps(results, indent=2))
        return

    print(f" ✓ Found {len(web_results)} web results")

    # Step 3: Check and optionally store new findings
    new_stored = 0

    for web_result in web_results:
        url = web_result.get("url", "")
        title = web_result.get("title", "Untitled")
        snippet = web_result.get("content", "")  # search snippet; unused, the full page is fetched below

        print(f"\n📄 Checking: {title}")
        print(f" URL: {url}")

        # Fetch full content
        fetched_title, content = fetch_and_extract(url)
        if not content:
            print(f" ⚠️ Could not fetch content")
            continue

        title = fetched_title or title

        # Check if substantial
        if not is_substantial(content):
            print(f" ⏭️ Content too short ({len(content)} chars), skipping")
            continue

        # Check if unique
        if not is_unique_content(content, kb_results):
            print(f" ⏭️ Similar content already in KB")
            continue

        print(f" ✓ New substantial content ({len(content)} chars)")

        # Auto-store or suggest
        if args.store_new:
            domain = suggest_domain(args.query, title, content)
            subjects = [s.strip() for s in args.query.lower().split() if len(s) > 3]

            # Sanitize the title outside the f-string: backslashes inside an
            # f-string expression are a SyntaxError before Python 3.12
            safe_title = re.sub(r'[^\w\s-]', '', title)[:30]
            metadata = {
                "domain": domain,
                "path": f"{domain}/Web/{safe_title}",
                "subjects": subjects,
                "category": "reference",
                "content_type": "web_page",
                "title": title,
                "source_url": url,
                "date_added": datetime.now().strftime("%Y-%m-%d")
            }

            if store_in_kb(content, metadata):
                print(f" ✅ Stored in KB (domain: {domain})")
                new_stored += 1
            else:
                print(f" ❌ Failed to store")
        else:
            print(f" 💡 Use --store-new to save this")

    results["stored_count"] = new_stored

    # Summary
    print(f"\n📊 Summary:")
    print(f" KB results: {len(kb_results)}")
    print(f" Web results checked: {len(web_results)}")
    print(f" New items stored: {new_stored}")

    if args.json:
        print(json.dumps(results, indent=2))


if __name__ == "__main__":
    main()