#!/usr/bin/env python3 """ Daily memory backup script with batch upload support Backs up all memory files to kimi_memories collection in Qdrant Uses batch uploads (256 points) for 20x performance improvement Avoids duplicates by checking existing dates Usage: daily_backup.py [--dry-run] [--batch-size N] Features: - Batch upload with configurable size (default 256) - Parallel processing support - Duplicate detection via date-based scroll - Progress reporting """ import argparse import json import os import sys import urllib.request import urllib.error import uuid from datetime import datetime from pathlib import Path from concurrent.futures import ThreadPoolExecutor, as_completed QDRANT_URL = "http://10.0.0.40:6333" COLLECTION_NAME = "kimi_memories" OLLAMA_URL = "http://localhost:11434/v1" MEMORY_DIR = Path("/root/.openclaw/workspace/memory") DEFAULT_BATCH_SIZE = 256 DEFAULT_PARALLEL = 4 def get_embedding(text): """Generate embedding using snowflake-arctic-embed2 via Ollama""" data = json.dumps({ "model": "snowflake-arctic-embed2", "input": text[:8192] # Limit to 8k chars for embedding }).encode() req = urllib.request.Request( f"{OLLAMA_URL}/embeddings", data=data, headers={"Content-Type": "application/json"} ) try: with urllib.request.urlopen(req, timeout=60) as response: result = json.loads(response.read().decode()) return result["data"][0]["embedding"] except Exception as e: print(f"Error generating embedding: {e}", file=sys.stderr) return None def get_embedding_batch(texts): """Generate embeddings for multiple texts in batch""" data = json.dumps({ "model": "snowflake-arctic-embed2", "input": [t[:8192] for t in texts] }).encode() req = urllib.request.Request( f"{OLLAMA_URL}/embeddings", data=data, headers={"Content-Type": "application/json"} ) try: with urllib.request.urlopen(req, timeout=120) as response: result = json.loads(response.read().decode()) return [d["embedding"] for d in result["data"]] except Exception as e: print(f"Error generating batch embeddings: {e}", file=sys.stderr) return [None] * len(texts) def get_existing_dates(): """Get list of dates already backed up via daily-backup (not manual stores)""" try: scroll_data = json.dumps({ "limit": 10000, "with_payload": True, "with_vectors": False }).encode() req = urllib.request.Request( f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll", data=scroll_data, headers={"Content-Type": "application/json"}, method="POST" ) with urllib.request.urlopen(req, timeout=30) as response: result = json.loads(response.read().decode()) if result.get("result", {}).get("points"): # Only count entries from daily-backup source, not manual stores backup_dates = set() for p in result["result"]["points"]: payload = p.get("payload", {}) date = payload.get("date") source = payload.get("source") tags = payload.get("tags", []) # Only skip if this was a daily-backup (not conversation/manual) if date and source == "daily-backup": backup_dates.add(date) # Also check for daily-backup tag as fallback elif date and "daily-backup" in tags: backup_dates.add(date) return backup_dates except Exception as e: print(f"Warning: Could not check existing dates: {e}", file=sys.stderr) return set() def batch_upload_points(points, batch_size=256): """Upload points in batches using batch_size""" total = len(points) uploaded = 0 failed = 0 for i in range(0, total, batch_size): batch = points[i:i + batch_size] upsert_data = { "points": batch } req = urllib.request.Request( f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true", 
        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
            data=json.dumps(upsert_data).encode(),
            headers={"Content-Type": "application/json"},
            method="PUT"
        )
        try:
            with urllib.request.urlopen(req, timeout=60) as response:
                result = json.loads(response.read().decode())
                if result.get("status") == "ok":
                    uploaded += len(batch)
                    print(f" ✅ Batch {i//batch_size + 1}: {len(batch)} points uploaded")
                else:
                    print(f" ❌ Batch {i//batch_size + 1}: Failed - {result}")
                    failed += len(batch)
        except Exception as e:
            print(f" ❌ Batch {i//batch_size + 1}: Error - {e}", file=sys.stderr)
            failed += len(batch)

    return uploaded, failed


def prepare_memory_point(content, date_str):
    """Prepare a single memory point (id, vector, payload) for upload."""
    embedding = get_embedding(content)
    if embedding is None:
        return None

    point_id = str(uuid.uuid4())
    payload = {
        "text": content,
        "date": date_str,
        "tags": ["daily-backup", f"backup-{date_str}"],
        "importance": "high",
        "source": "daily-backup",
        "source_type": "inferred",
        "confidence": "high",
        "verified": True,
        "created_at": datetime.now().isoformat(),
        "backup_timestamp": datetime.now().isoformat(),
        "access_count": 0,
        "last_accessed": datetime.now().isoformat()
    }
    return {
        "id": point_id,
        "vector": embedding,
        "payload": payload
    }


def process_file_batch(files_batch):
    """Process a batch of (date, path) tuples into upload-ready points."""
    results = []
    for date_str, file_path in files_batch:
        try:
            with open(file_path, 'r') as f:
                content = f.read()
            point = prepare_memory_point(content, date_str)
            if point:
                results.append(point)
        except Exception as e:
            print(f" ❌ {date_str}: Failed to process - {e}")
    return results


def get_memory_files():
    """Get all memory markdown files sorted by date."""
    if not MEMORY_DIR.exists():
        return []
    files = []
    for f in MEMORY_DIR.glob("????-??-??.md"):
        if f.name != "heartbeat-timestamps.txt":
            files.append((f.stem, f))  # (date string, file path)
    # Sort by date
    files.sort(key=lambda x: x[0])
    return files


def main():
    parser = argparse.ArgumentParser(description="Daily memory backup with batch upload")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be backed up without uploading")
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE,
                        help=f"Batch size for uploads (default: {DEFAULT_BATCH_SIZE})")
    parser.add_argument("--parallel", type=int, default=DEFAULT_PARALLEL,
                        help=f"Parallel embedding generation threads (default: {DEFAULT_PARALLEL})")
    parser.add_argument("--force", action="store_true",
                        help="Force re-backup of existing dates")
    args = parser.parse_args()

    print("=== Daily Memory Backup ===")
    print(f"Time: {datetime.now().isoformat()}")
    print(f"Batch size: {args.batch_size}")
    print(f"Parallel: {args.parallel}")
    if args.dry_run:
        print("Mode: DRY RUN (no actual upload)")
    print()

    # Get existing dates to avoid duplicates
    print("Checking for existing backups...")
    existing_dates = get_existing_dates()
    print(f"Found {len(existing_dates)} existing backups")

    # Get memory files
    memory_files = get_memory_files()
    print(f"Found {len(memory_files)} memory files")

    # Filter out already backed up dates (unless --force)
    files_to_backup = []
    for date_str, file_path in memory_files:
        if date_str in existing_dates and not args.force:
            print(f" ⏭️ {date_str} - Already backed up, skipping")
            continue
        files_to_backup.append((date_str, file_path))

    if not files_to_backup:
        print("\n✅ All memories already backed up (no new files)")
        return 0

    print(f"\nBacking up {len(files_to_backup)} files...")
    print()

    if args.dry_run:
        for date_str, file_path in files_to_backup:
            print(f" 📄 {date_str} - Would back up ({file_path.stat().st_size} bytes)")
        print(f"\nDry run complete. {len(files_to_backup)} files would be backed up.")
        return 0

    # Prepare all points with embeddings, fanning file reads and embedding calls
    # out across --parallel worker threads
    all_points = []
    failed_files = []
    print("Generating embeddings...")

    def embed_file(item):
        date_str, file_path = item
        with open(file_path, 'r') as f:
            content = f.read()
        print(f" 📦 {date_str} - Generating embedding...")
        return prepare_memory_point(content, date_str)

    with ThreadPoolExecutor(max_workers=args.parallel) as executor:
        futures = {executor.submit(embed_file, item): item[0] for item in files_to_backup}
        for future in as_completed(futures):
            date_str = futures[future]
            try:
                point = future.result()
                if point:
                    all_points.append(point)
                else:
                    failed_files.append(date_str)
            except Exception as e:
                print(f" ❌ {date_str} - Failed to read: {e}")
                failed_files.append(date_str)

    if not all_points:
        print("\n❌ No points to upload")
        return 1

    print(f"\nGenerated {len(all_points)} embeddings, uploading in batches of {args.batch_size}...")
    print()

    # Upload in batches
    uploaded, failed = batch_upload_points(all_points, args.batch_size)

    # Summary
    print(f"\n{'=' * 50}")
    print("SUMMARY:")
    print(f" Total files: {len(files_to_backup)}")
    print(f" Successfully embedded: {len(all_points)}")
    print(f" Successfully uploaded: {uploaded}")
    print(f" Failed to embed: {len(failed_files)}")
    print(f" Failed to upload: {failed}")
    if failed_files:
        print(f"\nFailed files: {', '.join(failed_files)}")

    if uploaded > 0:
        print("\n✅ Daily backup complete!")
        return 0
    elif failed > 0 or failed_files:
        print("\n⚠️ Backup completed with errors")
        return 1
    else:
        print("\n✅ All memories already backed up")
        return 0


if __name__ == "__main__":
    sys.exit(main())
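
# A possible way to schedule this as a daily job (illustrative only; the install
# path, log location, and schedule below are assumptions, not part of this script):
#
#   0 3 * * * /usr/bin/python3 /path/to/daily_backup.py >> /var/log/daily_backup.log 2>&1
#
# Running `daily_backup.py --dry-run` first shows which dates would be uploaded
# without touching the collection.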