Files
jarvis-memory/skills/qdrant-memory/scripts/kb_store.py

380 lines
12 KiB
Python
Raw Normal View History

2026-02-23 12:13:04 -06:00
#!/usr/bin/env python3
"""
Store content to kimi_kb (Knowledge Base) - Manual only with batch support
Usage:
Single entry:
python3 kb_store.py "Content text" --title "Title" --domain "Category" --tags "tag1,tag2"
python3 kb_store.py "Content" --title "X" --url "https://example.com" --source "docs.site"
Batch mode:
python3 kb_store.py --batch-file entries.json --batch-size 100
Features:
- Single or batch upload
- Duplicate detection by title/URL
- Domain categorization
- Access tracking
"""
import argparse
import json
import os
import sys
import urllib.request
import urllib.error
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any
# Base URL of the Qdrant HTTP API; every scroll and upsert request in this
# script is built from it.
QDRANT_URL = "http://10.0.0.40:6333"
# Name of the Qdrant collection that knowledge-base points are read from and
# written to.
COLLECTION = "kimi_kb"
# Base URL of the embeddings service; "/embeddings" is appended per request.
OLLAMA_URL = "http://localhost:11434/v1"
# Default number of points sent per upsert request in batch mode
# (overridable with --batch-size).
DEFAULT_BATCH_SIZE = 100
def check_existing(title: str = None, url: str = None) -> tuple:
    """Look up an already-stored entry that shares this URL or title.

    The URL (when given) is probed before the title. Each probe is a scroll
    request against the collection with an exact-match payload filter.

    Returns:
        ``(point_id, matched_field)`` where ``matched_field`` is ``"url"`` or
        ``"title"``. ``(None, None)`` is returned when nothing matched, when
        neither criterion was supplied, or when a lookup raised (in which
        case a warning is written to stderr and remaining probes are
        skipped).
    """
    # Order matters: a URL match wins over a title match.
    probes = (("url", url), ("title", title))
    try:
        for field, wanted in probes:
            if not wanted:
                continue
            body = json.dumps({
                "limit": 10,
                "with_payload": True,
                "filter": {"must": [{"key": field, "match": {"value": wanted}}]},
            }).encode()
            request = urllib.request.Request(
                f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll",
                data=body,
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            with urllib.request.urlopen(request, timeout=10) as resp:
                reply = json.loads(resp.read().decode())
            hits = reply.get("result", {}).get("points", [])
            if hits:
                return hits[0]["id"], field
    except Exception as exc:
        # Best effort: a failed lookup is reported but never aborts the caller.
        print(f"Warning: Could not check existing: {exc}", file=sys.stderr)
    return None, None
def get_embedding(text: str) -> Optional[List[float]]:
    """Embed a single text with the snowflake-arctic-embed2 model.

    Only the first 8192 characters of ``text`` are sent to the embeddings
    endpoint.

    Returns:
        The embedding taken from the first item of the response's ``data``
        list, or ``None`` if the request or response parsing raised (the
        error is printed to stderr).
    """
    payload = {"model": "snowflake-arctic-embed2", "input": text[:8192]}
    request = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=60) as resp:
            reply = json.loads(resp.read().decode())
        first_item = reply["data"][0]
        return first_item["embedding"]
    except Exception as exc:
        print(f"Error generating embedding: {exc}", file=sys.stderr)
        return None
def batch_upload_embeddings(texts: List[str]) -> List[Optional[List[float]]]:
    """Generate embeddings for several texts with a single request.

    Each text is truncated to its first 8192 characters before being sent.

    Args:
        texts: Texts to embed; an empty list short-circuits to ``[]`` without
            any network access.

    Returns:
        A list with exactly ``len(texts)`` items, positionally aligned with
        ``texts``. If the request fails, the response cannot be parsed, or
        the service returns a different number of embeddings than texts
        sent, every item is ``None`` and the error is printed to stderr.
    """
    if not texts:
        return []
    data = json.dumps({
        "model": "snowflake-arctic-embed2",
        "input": [t[:8192] for t in texts],
    }).encode()
    req = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=data,
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(req, timeout=120) as response:
            result = json.loads(response.read().decode())
        embeddings = [d["embedding"] for d in result["data"]]
    except Exception as e:
        print(f"Error generating batch embeddings: {e}", file=sys.stderr)
        return [None] * len(texts)
    # Callers pair the result with their inputs by position; a count mismatch
    # would silently drop or misalign entries, so treat it as a failed batch.
    if len(embeddings) != len(texts):
        print(
            "Error generating batch embeddings: "
            f"expected {len(texts)} embeddings, got {len(embeddings)}",
            file=sys.stderr,
        )
        return [None] * len(texts)
    return embeddings
def upload_points_batch(points: List[Dict[str, Any]], batch_size: int = DEFAULT_BATCH_SIZE) -> tuple:
    """Upsert points into the collection in chunks of ``batch_size``.

    Args:
        points: Point dicts (``id`` / ``vector`` / ``payload``) placed
            verbatim in the ``points`` array of each upsert request.
        batch_size: Maximum number of points per request; must be >= 1.

    Returns:
        ``(uploaded, failed)`` point counts. A whole chunk counts as failed
        when its request raises or the response ``status`` is not ``"ok"``;
        later chunks are still attempted.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Without this check a
            negative size would upload nothing and still report zero
            failures.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    uploaded = 0
    failed = 0
    chunk_starts = range(0, len(points), batch_size)
    for batch_number, start in enumerate(chunk_starts, start=1):
        batch = points[start:start + batch_size]
        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true",
            data=json.dumps({"points": batch}).encode(),
            headers={"Content-Type": "application/json"},
            method="PUT",
        )
        try:
            with urllib.request.urlopen(req, timeout=60) as response:
                result = json.loads(response.read().decode())
            if result.get("status") == "ok":
                uploaded += len(batch)
                print(f" ✅ Uploaded batch {batch_number}: {len(batch)} points")
            else:
                print(f" ❌ Batch {batch_number} failed: {result}")
                failed += len(batch)
        except Exception as e:
            # One bad chunk must not abort the remaining ones.
            print(f" ❌ Batch {batch_number} error: {e}", file=sys.stderr)
            failed += len(batch)
    return uploaded, failed
def store_single(
    text: str,
    embedding: List[float],
    title: str = None,
    url: str = None,
    source: str = None,
    domain: str = "general",
    tags: List[str] = None,
    content_type: str = "document",
    replace: bool = False
) -> bool:
    """Upsert one knowledge-base entry.

    An existing entry is looked up by URL/title first. Without ``replace`` a
    match aborts the store; with ``replace`` the matched point id is reused
    so the upsert overwrites it. The payload is always rebuilt from scratch,
    so an overwrite also resets ``created_at`` and ``access_count``.

    Returns:
        ``True`` when the upsert response reports status ``"ok"``; ``False``
        when a duplicate blocked the store or the request raised (the error
        is printed to stderr).
    """
    existing_id, match_type = check_existing(title=title, url=url)
    if existing_id and not replace:
        print(f"⚠️ Entry '{title}' already exists (matched by {match_type}, ID: {existing_id})")
        print(" Use --replace to overwrite")
        return False

    # Reuse the matched id when overwriting, otherwise mint a fresh one.
    point_id = existing_id or str(uuid.uuid4())

    record = {
        "id": point_id,
        "vector": embedding,
        "payload": {
            "text": text,
            "title": title or "Untitled",
            "url": url or "",
            "source": source or "manual",
            "domain": domain or "general",
            "tags": tags or [],
            "content_type": content_type,
            "date": datetime.now().strftime("%Y-%m-%d"),
            "created_at": datetime.now().isoformat(),
            "access_count": 0,
        },
    }
    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true",
        data=json.dumps({"points": [record]}).encode(),
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            reply = json.loads(resp.read().decode())
            return reply.get("status") == "ok"
    except Exception as exc:
        print(f"Error storing to KB: {exc}", file=sys.stderr)
        return False
def store_batch(
    entries: List[Dict[str, Any]],
    batch_size: int = DEFAULT_BATCH_SIZE,
    check_duplicates: bool = True
) -> tuple:
    """Store multiple KB entries: validate, de-duplicate, embed, upload.

    Args:
        entries: Entry dicts. ``content`` (non-empty string) is required;
            ``title``, ``url``, ``source``, ``domain``, ``tags`` and ``type``
            are optional and fall back to the same defaults as single-entry
            storage when missing, null or empty.
        batch_size: Number of points per upsert request.
        check_duplicates: When true, entries whose URL or title already
            exists in the collection are skipped.

    Returns:
        ``(uploaded, failed)``. ``failed`` sums invalid entries, skipped
        duplicates, embedding failures and upload failures. The one
        exception is a batch made up solely of duplicates, which reports
        ``(0, 0)``.
    """
    if not entries:
        return 0, 0
    print(f"Processing {len(entries)} entries...")

    pending = []
    duplicates = 0
    invalid = 0
    for position, entry in enumerate(entries, start=1):
        # A malformed entry must not abort the whole batch with a KeyError;
        # skip it (before spending a duplicate lookup on it) and count it.
        content = entry.get("content") if isinstance(entry, dict) else None
        if not isinstance(content, str) or not content:
            print(
                f" ⚠️ Skipping entry {position}: missing or empty 'content'",
                file=sys.stderr,
            )
            invalid += 1
            continue
        if check_duplicates:
            existing_id, match_type = check_existing(
                title=entry.get("title"),
                url=entry.get("url")
            )
            if existing_id:
                print(f" ⏭️ Skipping duplicate: {entry.get('title', 'Untitled')} ({match_type})")
                duplicates += 1
                continue
        pending.append(entry)

    if not pending:
        if not invalid:
            print(f"All {len(entries)} entries already exist")
            return 0, 0
        return 0, invalid + duplicates

    print(f"Generating embeddings for {len(pending)} entries...")
    embeddings = batch_upload_embeddings([e["content"] for e in pending])
    if len(embeddings) != len(pending):
        # Positional pairing is unreliable on a count mismatch; zip() would
        # silently drop entries, so fail them all explicitly instead.
        print(
            f"Error: expected {len(pending)} embeddings, got {len(embeddings)}",
            file=sys.stderr,
        )
        embeddings = [None] * len(pending)

    points = []
    failed_embeddings = 0
    for entry, embedding in zip(pending, embeddings):
        if embedding is None:
            failed_embeddings += 1
            continue
        now = datetime.now()  # one reading so date and created_at agree
        points.append({
            "id": str(uuid.uuid4()),
            "vector": embedding,
            "payload": {
                "text": entry["content"],
                # "or" rather than a .get() default so JSON null / "" also
                # fall back, matching single-entry storage.
                "title": entry.get("title") or "Untitled",
                "url": entry.get("url") or "",
                "source": entry.get("source") or "manual",
                "domain": entry.get("domain") or "general",
                "tags": entry.get("tags") or [],
                "content_type": entry.get("type") or "document",
                "date": now.strftime("%Y-%m-%d"),
                "created_at": now.isoformat(),
                "access_count": 0,
            },
        })

    skipped = failed_embeddings + duplicates + invalid
    if not points:
        return 0, skipped

    print(f"Uploading {len(points)} entries in batches of {batch_size}...")
    uploaded, failed_upload = upload_points_batch(points, batch_size)
    return uploaded, skipped + failed_upload
def main():
    """CLI entry point: store one entry, or a JSON batch file, in kimi_kb.

    Batch mode (``--batch-file``) exits 0 only when no entry was counted as
    failed, otherwise 1. Single-entry mode exits 1 when the content is
    missing, the embedding cannot be generated, or the store fails. Usage
    errors reported by argparse exit with its own status.
    """
    parser = argparse.ArgumentParser(description="Store content to kimi_kb")
    parser.add_argument("content", nargs="?", help="Content to store")
    parser.add_argument("--title", default=None, help="Title of the content")
    parser.add_argument("--url", default=None, help="Source URL if from web")
    parser.add_argument("--source", default=None, help="Source name")
    parser.add_argument("--domain", default="general", help="Domain/category")
    parser.add_argument("--tags", default=None, help="Comma-separated tags")
    parser.add_argument("--type", default="document", choices=["document", "web", "code", "note"],
                        help="Content type")
    parser.add_argument("--replace", action="store_true", help="Replace existing entry")
    parser.add_argument("--batch-file", help="JSON file with multiple entries")
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE,
                        help=f"Batch size (default: {DEFAULT_BATCH_SIZE})")
    parser.add_argument("--no-check-duplicates", action="store_true",
                        help="Skip duplicate checking in batch mode")
    args = parser.parse_args()

    # Batch mode
    if args.batch_file:
        # A non-positive size would either crash in range() or upload nothing
        # while still reporting success, so reject it up front.
        if args.batch_size < 1:
            parser.error("--batch-size must be a positive integer")
        print(f"Batch mode: Loading entries from {args.batch_file}")
        try:
            # JSON is UTF-8 by specification; do not depend on the locale's
            # default encoding.
            with open(args.batch_file, 'r', encoding="utf-8") as f:
                entries = json.load(f)
            if not isinstance(entries, list):
                print("Batch file must contain a JSON array", file=sys.stderr)
                sys.exit(1)
            print(f"Loaded {len(entries)} entries")
            uploaded, failed = store_batch(
                entries,
                args.batch_size,
                check_duplicates=not args.no_check_duplicates
            )
            print(f"\n{'=' * 50}")
            print(f"Batch complete: {uploaded} uploaded, {failed} failed")
            sys.exit(0 if failed == 0 else 1)
        except Exception as e:
            # Top-level boundary: SystemExit passes through, anything else is
            # reported and turned into a failing exit status.
            print(f"Error: {e}", file=sys.stderr)
            sys.exit(1)

    # Single entry mode
    if not args.content:
        print("Error: Provide content or use --batch-file", file=sys.stderr)
        parser.print_help()
        sys.exit(1)

    # Drop blanks produced by stray commas such as "a,,b" or "a,b,".
    tags = []
    if args.tags:
        tags = [t.strip() for t in args.tags.split(",") if t.strip()]

    print("Generating embedding...")
    embedding = get_embedding(args.content)
    if embedding is None:
        print("❌ Failed to generate embedding")
        sys.exit(1)

    print(f"Storing to kimi_kb: {args.title or 'Untitled'}...")
    if store_single(
        text=args.content,
        embedding=embedding,
        title=args.title,
        url=args.url,
        source=args.source,
        domain=args.domain,
        tags=tags,
        content_type=args.type,
        replace=args.replace
    ):
        print(f"✅ Stored to kimi_kb ({args.domain})")
    else:
        print("❌ Failed to store")
        sys.exit(1)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()