#!/usr/bin/env python3
"""
Store content to kimi_kb (Knowledge Base) - manual only, with batch support.

Usage:
    Single entry:
        python3 kb_store.py "Content text" --title "Title" --domain "Category" --tags "tag1,tag2"
        python3 kb_store.py "Content" --title "X" --url "https://example.com" --source "docs.site"

    Batch mode:
        python3 kb_store.py --batch-file entries.json --batch-size 100

Features:
    - Single or batch upload
    - Duplicate detection by title/URL
    - Domain categorization
    - Access tracking
"""
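# The batch file is a JSON array of objects. A minimal sketch of the expected
# shape, inferred from the keys store_batch() reads below ("content" is the
# only required field; everything else falls back to the defaults shown there):
#
#   [
#     {
#       "content": "Text to embed and store",
#       "title": "Example title",
#       "url": "https://example.com/page",
#       "source": "docs.site",
#       "domain": "general",
#       "tags": ["tag1", "tag2"],
#       "type": "document"
#     }
#   ]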
import argparse
import json
import sys
import urllib.request
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION = "kimi_kb"
OLLAMA_URL = "http://localhost:11434/v1"
EMBED_MODEL = "snowflake-arctic-embed2"
DEFAULT_BATCH_SIZE = 100
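# This script assumes the kimi_kb collection already exists in Qdrant with a
# vector size matching the embedding model's output (1024 dims for
# snowflake-arctic-embed2 by default; verify against your deployment).
# A sketch of creating it:
#
#   curl -X PUT "http://10.0.0.40:6333/collections/kimi_kb" \
#     -H "Content-Type: application/json" \
#     -d '{"vectors": {"size": 1024, "distance": "Cosine"}}'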
def _scroll_by_field(key: str, value: str) -> List[Dict[str, Any]]:
    """Scroll the collection for points whose payload field `key` matches `value`."""
    scroll_data = json.dumps({
        "limit": 10,
        "with_payload": True,
        "filter": {"must": [{"key": key, "match": {"value": value}}]}
    }).encode()
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll",
        data=scroll_data,
        headers={"Content-Type": "application/json"},
        method="POST"
    )
    with urllib.request.urlopen(req, timeout=10) as response:
        result = json.loads(response.read().decode())
    return result.get("result", {}).get("points", [])


def check_existing(title: Optional[str] = None, url: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
    """Check if an entry already exists, matching by URL first, then by title.

    Returns (point_id, match_type) on a hit, (None, None) otherwise.
    """
    try:
        if url:
            points = _scroll_by_field("url", url)
            if points:
                return points[0]["id"], "url"
        if title:
            points = _scroll_by_field("title", title)
            if points:
                return points[0]["id"], "title"
    except Exception as e:
        print(f"Warning: Could not check existing: {e}", file=sys.stderr)
    return None, None
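# Exact-match filters like the ones above work without payload indexes, but on
# larger collections Qdrant recommends indexing the filtered fields. A sketch:
#
#   curl -X PUT "http://10.0.0.40:6333/collections/kimi_kb/index" \
#     -H "Content-Type: application/json" \
#     -d '{"field_name": "url", "field_schema": "keyword"}'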
def get_embedding(text: str) -> Optional[List[float]]:
    """Generate an embedding with snowflake-arctic-embed2 via Ollama."""
    data = json.dumps({
        "model": EMBED_MODEL,
        "input": text[:8192]  # truncate to stay within the model's input limit
    }).encode()

    req = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=data,
        headers={"Content-Type": "application/json"}
    )

    try:
        with urllib.request.urlopen(req, timeout=60) as response:
            result = json.loads(response.read().decode())
        return result["data"][0]["embedding"]
    except Exception as e:
        print(f"Error generating embedding: {e}", file=sys.stderr)
        return None
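# The /embeddings responses parsed above and below are expected to follow the
# OpenAI shape (abridged sketch; exact extra fields depend on the Ollama
# version):
#
#   {"object": "list",
#    "data": [{"object": "embedding", "index": 0, "embedding": [0.0123, ...]}],
#    "model": "snowflake-arctic-embed2"}
#
# In batch mode there is one "data" item per input, keyed by "index".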
def batch_upload_embeddings(texts: List[str]) -> List[Optional[List[float]]]:
    """Generate embeddings for multiple texts in a single request."""
    if not texts:
        return []

    data = json.dumps({
        "model": EMBED_MODEL,
        "input": [t[:8192] for t in texts]
    }).encode()

    req = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=data,
        headers={"Content-Type": "application/json"}
    )

    try:
        with urllib.request.urlopen(req, timeout=120) as response:
            result = json.loads(response.read().decode())
        # Sort by "index" so results line up with the input order even if the
        # server returns them out of order.
        return [d["embedding"] for d in sorted(result["data"], key=lambda d: d["index"])]
    except Exception as e:
        print(f"Error generating batch embeddings: {e}", file=sys.stderr)
        return [None] * len(texts)
def upload_points_batch(points: List[Dict[str, Any]], batch_size: int = DEFAULT_BATCH_SIZE) -> Tuple[int, int]:
    """Upload points to Qdrant in batches; returns (uploaded, failed) counts."""
    total = len(points)
    uploaded = 0
    failed = 0

    for i in range(0, total, batch_size):
        batch = points[i:i + batch_size]
        batch_num = i // batch_size + 1

        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true",
            data=json.dumps({"points": batch}).encode(),
            headers={"Content-Type": "application/json"},
            method="PUT"
        )

        try:
            with urllib.request.urlopen(req, timeout=60) as response:
                result = json.loads(response.read().decode())
            if result.get("status") == "ok":
                uploaded += len(batch)
                print(f"  ✅ Uploaded batch {batch_num}: {len(batch)} points")
            else:
                print(f"  ❌ Batch {batch_num} failed: {result}")
                failed += len(batch)
        except Exception as e:
            print(f"  ❌ Batch {batch_num} error: {e}", file=sys.stderr)
            failed += len(batch)

    return uploaded, failed
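# Qdrant upsert semantics: a PUT to /collections/{name}/points with an existing
# point ID overwrites that point in place, which is why store_single below
# reuses the existing ID when --replace is set. The ?wait=true flag makes the
# call block until the write is persisted.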
def _build_payload(
    text: str,
    title: Optional[str] = None,
    url: Optional[str] = None,
    source: Optional[str] = None,
    domain: str = "general",
    tags: Optional[List[str]] = None,
    content_type: str = "document"
) -> Dict[str, Any]:
    """Assemble the payload stored alongside each vector."""
    now = datetime.now()
    return {
        "text": text,
        "title": title or "Untitled",
        "url": url or "",
        "source": source or "manual",
        "domain": domain or "general",
        "tags": tags or [],
        "content_type": content_type,
        "date": now.strftime("%Y-%m-%d"),
        "created_at": now.isoformat(),
        "access_count": 0
    }


def store_single(
    text: str,
    embedding: List[float],
    title: Optional[str] = None,
    url: Optional[str] = None,
    source: Optional[str] = None,
    domain: str = "general",
    tags: Optional[List[str]] = None,
    content_type: str = "document",
    replace: bool = False
) -> bool:
    """Store a single KB entry."""

    # Check for an existing entry; reuse its ID when replacing so the upsert
    # overwrites it in place.
    existing_id, match_type = check_existing(title=title, url=url)
    if existing_id and not replace:
        print(f"⚠️ Entry '{title}' already exists (matched by {match_type}, ID: {existing_id})")
        print("   Use --replace to overwrite")
        return False

    point_id = existing_id if existing_id else str(uuid.uuid4())
    payload = _build_payload(text, title, url, source, domain, tags, content_type)

    data = json.dumps({
        "points": [{
            "id": point_id,
            "vector": embedding,
            "payload": payload
        }]
    }).encode()
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true",
        data=data,
        headers={"Content-Type": "application/json"},
        method="PUT"
    )

    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
        return result.get("status") == "ok"
    except Exception as e:
        print(f"Error storing to KB: {e}", file=sys.stderr)
        return False
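# Programmatic single-entry use (a sketch; assumes Ollama and Qdrant are
# reachable at the URLs configured above):
#
#   emb = get_embedding("Qdrant stores vectors alongside JSON payloads.")
#   if emb:
#       store_single("Qdrant stores vectors alongside JSON payloads.", emb,
#                    title="Qdrant note", domain="databases", tags=["qdrant"])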
def store_batch(
    entries: List[Dict[str, Any]],
    batch_size: int = DEFAULT_BATCH_SIZE,
    check_duplicates: bool = True
) -> Tuple[int, int]:
    """Store multiple KB entries in batch, with optional duplicate checking."""
    if not entries:
        return 0, 0

    print(f"Processing {len(entries)} entries...")

    # Filter out duplicates (if requested) and entries with no content, which
    # would otherwise fail at embedding time.
    entries_to_process = []
    skipped = 0

    for entry in entries:
        if not entry.get("content"):
            print(f"  ⏭️ Skipping entry without content: {entry.get('title', 'Untitled')}")
            skipped += 1
            continue
        if check_duplicates:
            existing_id, match_type = check_existing(
                title=entry.get("title"),
                url=entry.get("url")
            )
            if existing_id:
                print(f"  ⏭️ Skipping duplicate: {entry.get('title', 'Untitled')} ({match_type})")
                skipped += 1
                continue
        entries_to_process.append(entry)

    if not entries_to_process:
        print(f"All {len(entries)} entries already exist or are empty")
        return 0, 0

    print(f"Generating embeddings for {len(entries_to_process)} entries...")
    texts = [e["content"] for e in entries_to_process]
    embeddings = batch_upload_embeddings(texts)

    # Prepare points, dropping entries whose embedding failed
    points = []
    failed_embeddings = 0

    for entry, embedding in zip(entries_to_process, embeddings):
        if embedding is None:
            failed_embeddings += 1
            continue

        points.append({
            "id": str(uuid.uuid4()),
            "vector": embedding,
            "payload": _build_payload(
                entry["content"],
                title=entry.get("title"),
                url=entry.get("url"),
                source=entry.get("source"),
                domain=entry.get("domain", "general"),
                tags=entry.get("tags"),
                content_type=entry.get("type", "document")
            )
        })

    if not points:
        return 0, failed_embeddings + skipped

    # Upload in batches
    print(f"Uploading {len(points)} entries in batches of {batch_size}...")
    uploaded, failed_upload = upload_points_batch(points, batch_size)

    return uploaded, failed_embeddings + failed_upload + skipped
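# Programmatic batch use (a sketch; entries follow the shape documented at the
# top of this file):
#
#   entries = [
#       {"content": "First note", "title": "Note 1", "domain": "notes"},
#       {"content": "Second note", "title": "Note 2", "domain": "notes"},
#   ]
#   uploaded, failed = store_batch(entries, batch_size=50)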
def main():
    parser = argparse.ArgumentParser(description="Store content to kimi_kb")
    parser.add_argument("content", nargs="?", help="Content to store")
    parser.add_argument("--title", default=None, help="Title of the content")
    parser.add_argument("--url", default=None, help="Source URL if from web")
    parser.add_argument("--source", default=None, help="Source name")
    parser.add_argument("--domain", default="general", help="Domain/category")
    parser.add_argument("--tags", default=None, help="Comma-separated tags")
    parser.add_argument("--type", default="document", choices=["document", "web", "code", "note"],
                        help="Content type")
    parser.add_argument("--replace", action="store_true", help="Replace existing entry")
    parser.add_argument("--batch-file", help="JSON file with multiple entries")
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE,
                        help=f"Batch size (default: {DEFAULT_BATCH_SIZE})")
    parser.add_argument("--no-check-duplicates", action="store_true",
                        help="Skip duplicate checking in batch mode")

    args = parser.parse_args()

    # Batch mode
    if args.batch_file:
        print(f"Batch mode: Loading entries from {args.batch_file}")
        try:
            with open(args.batch_file, "r", encoding="utf-8") as f:
                entries = json.load(f)

            if not isinstance(entries, list):
                print("Batch file must contain a JSON array", file=sys.stderr)
                sys.exit(1)

            print(f"Loaded {len(entries)} entries")
            uploaded, failed = store_batch(
                entries,
                args.batch_size,
                check_duplicates=not args.no_check_duplicates
            )

            print(f"\n{'=' * 50}")
            print(f"Batch complete: {uploaded} uploaded, {failed} failed")
            sys.exit(0 if failed == 0 else 1)

        except Exception as e:
            print(f"Error: {e}", file=sys.stderr)
            sys.exit(1)

    # Single entry mode
    if not args.content:
        print("Error: Provide content or use --batch-file", file=sys.stderr)
        parser.print_help()
        sys.exit(1)

    tags = [t.strip() for t in args.tags.split(",")] if args.tags else []

    print("Generating embedding...")
    embedding = get_embedding(args.content)

    if embedding is None:
        print("❌ Failed to generate embedding")
        sys.exit(1)

    print(f"Storing to kimi_kb: {args.title or 'Untitled'}...")

    if store_single(
        text=args.content,
        embedding=embedding,
        title=args.title,
        url=args.url,
        source=args.source,
        domain=args.domain,
        tags=tags,
        content_type=args.type,
        replace=args.replace
    ):
        print(f"✅ Stored to kimi_kb ({args.domain})")
    else:
        print("❌ Failed to store")
        sys.exit(1)


if __name__ == "__main__":
    main()