#!/usr/bin/env python3
"""
Daily memory backup script with batch upload support.

Backs up all memory files to the kimi_memories collection in Qdrant.
Uses batch uploads (256 points per request) for a roughly 20x speedup.
Avoids duplicates by checking which dates are already backed up.

Usage:
    daily_backup.py [--dry-run] [--batch-size N] [--parallel N] [--force]

Features:
- Batch upload with configurable size (default 256)
- Parallel processing support
- Duplicate detection via date-based scroll
- Progress reporting
"""

import argparse
import json
import os
import sys
import urllib.request
import urllib.error
import uuid
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_URL = "http://localhost:11434/v1"
MEMORY_DIR = Path("/root/.openclaw/workspace/memory")
DEFAULT_BATCH_SIZE = 256
DEFAULT_PARALLEL = 4
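# Note: the endpoints and paths above are deployment-specific (a Qdrant
# instance at 10.0.0.40:6333, Ollama on localhost, and this machine's memory
# directory); adjust them before running the script elsewhere.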


def get_embedding(text):
    """Generate embedding using snowflake-arctic-embed2 via Ollama"""
    data = json.dumps({
        "model": "snowflake-arctic-embed2",
        "input": text[:8192]  # Limit to 8k chars for embedding
    }).encode()

    req = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=data,
        headers={"Content-Type": "application/json"}
    )

    try:
        with urllib.request.urlopen(req, timeout=60) as response:
            result = json.loads(response.read().decode())
            return result["data"][0]["embedding"]
    except Exception as e:
        print(f"Error generating embedding: {e}", file=sys.stderr)
        return None


def get_embedding_batch(texts):
    """Generate embeddings for multiple texts in batch"""
    data = json.dumps({
        "model": "snowflake-arctic-embed2",
        "input": [t[:8192] for t in texts]
    }).encode()

    req = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=data,
        headers={"Content-Type": "application/json"}
    )

    try:
        with urllib.request.urlopen(req, timeout=120) as response:
            result = json.loads(response.read().decode())
            return [d["embedding"] for d in result["data"]]
    except Exception as e:
        print(f"Error generating batch embeddings: {e}", file=sys.stderr)
        return [None] * len(texts)
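
# Both helpers above call Ollama's OpenAI-compatible /v1/embeddings endpoint,
# whose response is shaped roughly like {"data": [{"embedding": [...]}, ...]},
# hence the result["data"][i]["embedding"] lookups. Note that
# get_embedding_batch() is provided for batched embedding but is not called
# from main() in this version of the script.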


def get_existing_dates():
    """Return the set of dates already backed up via daily-backup (not manual stores)"""
    try:
        scroll_data = json.dumps({
            "limit": 10000,
            "with_payload": True,
            "with_vectors": False
        }).encode()

        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
            data=scroll_data,
            headers={"Content-Type": "application/json"},
            method="POST"
        )
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
            if result.get("result", {}).get("points"):
                # Only count entries from the daily-backup source, not manual stores
                backup_dates = set()
                for p in result["result"]["points"]:
                    payload = p.get("payload", {})
                    date = payload.get("date")
                    source = payload.get("source")
                    tags = payload.get("tags", [])
                    # Only skip if this was a daily backup (not conversation/manual)
                    if date and source == "daily-backup":
                        backup_dates.add(date)
                    # Also check for the daily-backup tag as a fallback
                    elif date and "daily-backup" in tags:
                        backup_dates.add(date)
                return backup_dates
    except Exception as e:
        print(f"Warning: Could not check existing dates: {e}", file=sys.stderr)
    # Fall through to an empty set if the scroll failed or returned no points,
    # so callers can always treat the result as a set.
    return set()
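
# The scroll above pulls up to 10,000 points and filters client-side by
# payload. Qdrant's scroll API also accepts a server-side filter, so the
# request body could likely be reduced to something like:
#     {"filter": {"must": [{"key": "source", "match": {"value": "daily-backup"}}]},
#      "limit": 10000, "with_payload": ["date"], "with_vectors": false}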


def batch_upload_points(points, batch_size=256):
    """Upload points in batches of batch_size"""
    total = len(points)
    uploaded = 0
    failed = 0

    for i in range(0, total, batch_size):
        batch = points[i:i + batch_size]

        upsert_data = {
            "points": batch
        }

        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
            data=json.dumps(upsert_data).encode(),
            headers={"Content-Type": "application/json"},
            method="PUT"
        )

        try:
            with urllib.request.urlopen(req, timeout=60) as response:
                result = json.loads(response.read().decode())
                if result.get("status") == "ok":
                    uploaded += len(batch)
                    print(f"  ✅ Batch {i//batch_size + 1}: {len(batch)} points uploaded")
                else:
                    print(f"  ❌ Batch {i//batch_size + 1}: Failed - {result}")
                    failed += len(batch)
        except Exception as e:
            print(f"  ❌ Batch {i//batch_size + 1}: Error - {e}", file=sys.stderr)
            failed += len(batch)

    return uploaded, failed
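
# The ?wait=true query parameter makes Qdrant apply the upsert before
# responding, so a returned status of "ok" means the batch has actually been
# written. Larger --batch-size values mean fewer HTTP round-trips per backup,
# at the cost of bigger request bodies.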


def prepare_memory_point(content, date_str):
    """Prepare a memory point for upload"""
    embedding = get_embedding(content)
    if embedding is None:
        return None

    point_id = str(uuid.uuid4())

    payload = {
        "text": content,
        "date": date_str,
        "tags": ["daily-backup", f"backup-{date_str}"],
        "importance": "high",
        "source": "daily-backup",
        "source_type": "inferred",
        "confidence": "high",
        "verified": True,
        "created_at": datetime.now().isoformat(),
        "backup_timestamp": datetime.now().isoformat(),
        "access_count": 0,
        "last_accessed": datetime.now().isoformat()
    }

    return {
        "id": point_id,
        "vector": embedding,
        "payload": payload
    }
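
# The returned dict follows Qdrant's {"id", "vector", "payload"} upsert format;
# the kimi_memories collection must be configured with a vector size matching
# the output dimension of snowflake-arctic-embed2 for the upload in
# batch_upload_points() to succeed.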


def process_file_batch(files_batch):
    """Process a batch of (date, path) files and return prepared points (intended for parallel workers)"""
    results = []
    for date_str, file_path in files_batch:
        try:
            with open(file_path, 'r') as f:
                content = f.read()

            point = prepare_memory_point(content, date_str)
            if point:
                results.append(point)
        except Exception as e:
            print(f"  ❌ {date_str}: Failed to process - {e}")

    return results
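
# process_file_batch() and the ThreadPoolExecutor import are not used by
# main() below, which embeds files one at a time. A sketch of how --parallel
# could drive it (hypothetical wiring, not part of the original flow):
#
#     chunks = [files_to_backup[i::args.parallel] for i in range(args.parallel)]
#     with ThreadPoolExecutor(max_workers=args.parallel) as pool:
#         futures = [pool.submit(process_file_batch, c) for c in chunks if c]
#         all_points = [p for f in as_completed(futures) for p in f.result()]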


def get_memory_files():
    """Get all memory markdown files sorted by date"""
    if not MEMORY_DIR.exists():
        return []

    files = []
    for f in MEMORY_DIR.glob("????-??-??.md"):
        # Defensive check; the date-named *.md glob already excludes this file
        if f.name != "heartbeat-timestamps.txt":
            files.append((f.stem, f))  # (date string, file path)

    # Sort by date
    files.sort(key=lambda x: x[0])
    return files


def main():
    parser = argparse.ArgumentParser(description="Daily memory backup with batch upload")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be backed up without uploading")
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE, help=f"Batch size for uploads (default: {DEFAULT_BATCH_SIZE})")
    parser.add_argument("--parallel", type=int, default=DEFAULT_PARALLEL, help=f"Parallel embedding generation (default: {DEFAULT_PARALLEL})")
    parser.add_argument("--force", action="store_true", help="Force re-backup of existing dates")
    args = parser.parse_args()
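
    # Example invocations using the flags defined above:
    #   daily_backup.py --dry-run
    #   daily_backup.py --batch-size 128 --force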

    print("=== Daily Memory Backup ===")
    print(f"Time: {datetime.now().isoformat()}")
    print(f"Batch size: {args.batch_size}")
    print(f"Parallel: {args.parallel}")
    if args.dry_run:
        print("Mode: DRY RUN (no actual upload)")
    print()

    # Get existing dates to avoid duplicates
    print("Checking for existing backups...")
    existing_dates = get_existing_dates()
    print(f"Found {len(existing_dates)} existing backups")

    # Get memory files
    memory_files = get_memory_files()
    print(f"Found {len(memory_files)} memory files")

    # Filter out already backed-up dates (unless --force)
    files_to_backup = []
    for date_str, file_path in memory_files:
        if date_str in existing_dates and not args.force:
            print(f"  ⏭️  {date_str} - Already backed up, skipping")
            continue
        files_to_backup.append((date_str, file_path))

    if not files_to_backup:
        print("\n✅ All memories already backed up (no new files)")
        return 0

    print(f"\nBacking up {len(files_to_backup)} files...")
    print()

    if args.dry_run:
        for date_str, file_path in files_to_backup:
            print(f"  📄 {date_str} - Would back up ({file_path.stat().st_size} bytes)")
        print(f"\nDry run complete. {len(files_to_backup)} files would be backed up.")
        return 0

    # Prepare all points with embeddings
    all_points = []
    failed_files = []

    print("Generating embeddings...")
    for date_str, file_path in files_to_backup:
        try:
            with open(file_path, 'r') as f:
                content = f.read()

            print(f"  📦 {date_str} - Generating embedding...")
            point = prepare_memory_point(content, date_str)

            if point:
                all_points.append(point)
            else:
                failed_files.append(date_str)
        except Exception as e:
            print(f"  ❌ {date_str} - Failed to read: {e}")
            failed_files.append(date_str)

    if not all_points:
        print("\n❌ No points to upload")
        return 1

    print(f"\nGenerated {len(all_points)} embeddings, uploading in batches of {args.batch_size}...")
    print()

    # Upload in batches
    uploaded, failed = batch_upload_points(all_points, args.batch_size)

    # Summary
    print(f"\n{'=' * 50}")
    print("SUMMARY:")
    print(f"  Total files: {len(files_to_backup)}")
    print(f"  Successfully embedded: {len(all_points)}")
    print(f"  Successfully uploaded: {uploaded}")
    print(f"  Failed to embed: {len(failed_files)}")
    print(f"  Failed to upload: {failed}")

    if failed_files:
        print(f"\nFailed files: {', '.join(failed_files)}")

    if uploaded > 0:
        print("\n✅ Daily backup complete!")
        return 0
    elif failed > 0 or failed_files:
        print("\n⚠️ Backup completed with errors")
        return 1
    else:
        print("\n✅ All memories already backed up")
        return 0


if __name__ == "__main__":
    sys.exit(main())
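
# Intended for a once-a-day run; a typical cron entry might look like this
# (the script path is hypothetical - adjust to where this file actually lives):
#     0 3 * * * /usr/bin/python3 /path/to/daily_backup.py >> /var/log/daily_backup.log 2>&1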