#!/usr/bin/env python3
"""
Daily memory backup script with batch upload support.

Backs up all memory files to the kimi_memories collection in Qdrant.
Uses batch uploads (256 points per request) for a roughly 20x speedup.
Avoids duplicates by checking which dates are already backed up.

Usage:
    daily_backup.py [--dry-run] [--batch-size N] [--parallel N] [--force]

Features:
- Batch upload with configurable size (default 256)
- Parallel processing support
- Duplicate detection via date-based scroll
- Progress reporting
"""

import argparse
import json
import os
import sys
import urllib.request
import urllib.error
import uuid
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_URL = "http://localhost:11434/v1"
MEMORY_DIR = Path("/root/.openclaw/workspace/memory")
DEFAULT_BATCH_SIZE = 256
DEFAULT_PARALLEL = 4
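# Note: the endpoints and paths above are deployment-specific (a Qdrant
# instance at 10.0.0.40:6333, Ollama on localhost, and this machine's memory
# directory); adjust them before running the script elsewhere.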


def get_embedding(text):
    """Generate embedding using snowflake-arctic-embed2 via Ollama"""
    data = json.dumps({
        "model": "snowflake-arctic-embed2",
        "input": text[:8192]  # Limit to 8k chars for embedding
    }).encode()

    req = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=data,
        headers={"Content-Type": "application/json"}
    )

    try:
        with urllib.request.urlopen(req, timeout=60) as response:
            result = json.loads(response.read().decode())
            return result["data"][0]["embedding"]
    except Exception as e:
        print(f"Error generating embedding: {e}", file=sys.stderr)
        return None


def get_embedding_batch(texts):
    """Generate embeddings for multiple texts in batch"""
    data = json.dumps({
        "model": "snowflake-arctic-embed2",
        "input": [t[:8192] for t in texts]
    }).encode()

    req = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=data,
        headers={"Content-Type": "application/json"}
    )

    try:
        with urllib.request.urlopen(req, timeout=120) as response:
            result = json.loads(response.read().decode())
            return [d["embedding"] for d in result["data"]]
    except Exception as e:
        print(f"Error generating batch embeddings: {e}", file=sys.stderr)
        return [None] * len(texts)
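
# Both helpers above call Ollama's OpenAI-compatible /v1/embeddings endpoint,
# whose response is shaped roughly like {"data": [{"embedding": [...]}, ...]},
# hence the result["data"][i]["embedding"] lookups. Note that
# get_embedding_batch() is provided for batched embedding but is not called
# from main() in this version of the script.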


def get_existing_dates():
    """Return the set of dates already backed up via daily-backup (not manual stores)"""
    try:
        scroll_data = json.dumps({
            "limit": 10000,
            "with_payload": True,
            "with_vectors": False
        }).encode()

        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
            data=scroll_data,
            headers={"Content-Type": "application/json"},
            method="POST"
        )
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
            if result.get("result", {}).get("points"):
                # Only count entries from the daily-backup source, not manual stores
                backup_dates = set()
                for p in result["result"]["points"]:
                    payload = p.get("payload", {})
                    date = payload.get("date")
                    source = payload.get("source")
                    tags = payload.get("tags", [])
                    # Only skip if this was a daily backup (not conversation/manual)
                    if date and source == "daily-backup":
                        backup_dates.add(date)
                    # Also check for the daily-backup tag as a fallback
                    elif date and "daily-backup" in tags:
                        backup_dates.add(date)
                return backup_dates
    except Exception as e:
        print(f"Warning: Could not check existing dates: {e}", file=sys.stderr)
    # Fall through to an empty set if the scroll failed or returned no points,
    # so callers can always treat the result as a set.
    return set()
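
# The scroll above pulls up to 10,000 points and filters client-side by
# payload. Qdrant's scroll API also accepts a server-side filter, so the
# request body could likely be reduced to something like:
#     {"filter": {"must": [{"key": "source", "match": {"value": "daily-backup"}}]},
#      "limit": 10000, "with_payload": ["date"], "with_vectors": false}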


def batch_upload_points(points, batch_size=256):
    """Upload points in batches of batch_size"""
    total = len(points)
    uploaded = 0
    failed = 0

    for i in range(0, total, batch_size):
        batch = points[i:i + batch_size]

        upsert_data = {
            "points": batch
        }

        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
            data=json.dumps(upsert_data).encode(),
            headers={"Content-Type": "application/json"},
            method="PUT"
        )

        try:
            with urllib.request.urlopen(req, timeout=60) as response:
                result = json.loads(response.read().decode())
                if result.get("status") == "ok":
                    uploaded += len(batch)
                    print(f"  ✅ Batch {i//batch_size + 1}: {len(batch)} points uploaded")
                else:
                    print(f"  ❌ Batch {i//batch_size + 1}: Failed - {result}")
                    failed += len(batch)
        except Exception as e:
            print(f"  ❌ Batch {i//batch_size + 1}: Error - {e}", file=sys.stderr)
            failed += len(batch)

    return uploaded, failed
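
# The ?wait=true query parameter makes Qdrant apply the upsert before
# responding, so a returned status of "ok" means the batch has actually been
# written. Larger --batch-size values mean fewer HTTP round-trips per backup,
# at the cost of bigger request bodies.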


def prepare_memory_point(content, date_str):
    """Prepare a memory point for upload"""
    embedding = get_embedding(content)
    if embedding is None:
        return None

    point_id = str(uuid.uuid4())

    payload = {
        "text": content,
        "date": date_str,
        "tags": ["daily-backup", f"backup-{date_str}"],
        "importance": "high",
        "source": "daily-backup",
        "source_type": "inferred",
        "confidence": "high",
        "verified": True,
        "created_at": datetime.now().isoformat(),
        "backup_timestamp": datetime.now().isoformat(),
        "access_count": 0,
        "last_accessed": datetime.now().isoformat()
    }

    return {
        "id": point_id,
        "vector": embedding,
        "payload": payload
    }
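
# The returned dict follows Qdrant's {"id", "vector", "payload"} upsert format;
# the kimi_memories collection must be configured with a vector size matching
# the output dimension of snowflake-arctic-embed2 for the upload in
# batch_upload_points() to succeed.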


def process_file_batch(files_batch):
    """Process a batch of (date, path) files and return prepared points (intended for parallel workers)"""
    results = []
    for date_str, file_path in files_batch:
        try:
            with open(file_path, 'r') as f:
                content = f.read()

            point = prepare_memory_point(content, date_str)
            if point:
                results.append(point)
        except Exception as e:
            print(f"  ❌ {date_str}: Failed to process - {e}")

    return results
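
# process_file_batch() and the ThreadPoolExecutor import are not used by
# main() below, which embeds files one at a time. A sketch of how --parallel
# could drive it (hypothetical wiring, not part of the original flow):
#
#     chunks = [files_to_backup[i::args.parallel] for i in range(args.parallel)]
#     with ThreadPoolExecutor(max_workers=args.parallel) as pool:
#         futures = [pool.submit(process_file_batch, c) for c in chunks if c]
#         all_points = [p for f in as_completed(futures) for p in f.result()]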


def get_memory_files():
    """Get all memory markdown files sorted by date"""
    if not MEMORY_DIR.exists():
        return []

    files = []
    for f in MEMORY_DIR.glob("????-??-??.md"):
        # Defensive check; the date-named *.md glob already excludes this file
        if f.name != "heartbeat-timestamps.txt":
            files.append((f.stem, f))  # (date string, file path)

    # Sort by date
    files.sort(key=lambda x: x[0])
    return files


def main():
    parser = argparse.ArgumentParser(description="Daily memory backup with batch upload")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be backed up without uploading")
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE, help=f"Batch size for uploads (default: {DEFAULT_BATCH_SIZE})")
    parser.add_argument("--parallel", type=int, default=DEFAULT_PARALLEL, help=f"Parallel embedding generation (default: {DEFAULT_PARALLEL})")
    parser.add_argument("--force", action="store_true", help="Force re-backup of existing dates")
    args = parser.parse_args()
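
    # Example invocations using the flags defined above:
    #   daily_backup.py --dry-run
    #   daily_backup.py --batch-size 128 --force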

    print("=== Daily Memory Backup ===")
    print(f"Time: {datetime.now().isoformat()}")
    print(f"Batch size: {args.batch_size}")
    print(f"Parallel: {args.parallel}")
    if args.dry_run:
        print("Mode: DRY RUN (no actual upload)")
    print()

    # Get existing dates to avoid duplicates
    print("Checking for existing backups...")
    existing_dates = get_existing_dates()
    print(f"Found {len(existing_dates)} existing backups")

    # Get memory files
    memory_files = get_memory_files()
    print(f"Found {len(memory_files)} memory files")

    # Filter out already backed-up dates (unless --force)
    files_to_backup = []
    for date_str, file_path in memory_files:
        if date_str in existing_dates and not args.force:
            print(f"  ⏭️  {date_str} - Already backed up, skipping")
            continue
        files_to_backup.append((date_str, file_path))

    if not files_to_backup:
        print("\n✅ All memories already backed up (no new files)")
        return 0

    print(f"\nBacking up {len(files_to_backup)} files...")
    print()

    if args.dry_run:
        for date_str, file_path in files_to_backup:
            print(f"  📄 {date_str} - Would back up ({file_path.stat().st_size} bytes)")
        print(f"\nDry run complete. {len(files_to_backup)} files would be backed up.")
        return 0

    # Prepare all points with embeddings
    all_points = []
    failed_files = []

    print("Generating embeddings...")
    for date_str, file_path in files_to_backup:
        try:
            with open(file_path, 'r') as f:
                content = f.read()

            print(f"  📦 {date_str} - Generating embedding...")
            point = prepare_memory_point(content, date_str)

            if point:
                all_points.append(point)
            else:
                failed_files.append(date_str)
        except Exception as e:
            print(f"  ❌ {date_str} - Failed to read: {e}")
            failed_files.append(date_str)

    if not all_points:
        print("\n❌ No points to upload")
        return 1

    print(f"\nGenerated {len(all_points)} embeddings, uploading in batches of {args.batch_size}...")
    print()

    # Upload in batches
    uploaded, failed = batch_upload_points(all_points, args.batch_size)

    # Summary
    print(f"\n{'=' * 50}")
    print("SUMMARY:")
    print(f"  Total files: {len(files_to_backup)}")
    print(f"  Successfully embedded: {len(all_points)}")
    print(f"  Successfully uploaded: {uploaded}")
    print(f"  Failed to embed: {len(failed_files)}")
    print(f"  Failed to upload: {failed}")

    if failed_files:
        print(f"\nFailed files: {', '.join(failed_files)}")

    if uploaded > 0:
        print("\n✅ Daily backup complete!")
        return 0
    elif failed > 0 or failed_files:
        print("\n⚠️ Backup completed with errors")
        return 1
    else:
        print("\n✅ All memories already backed up")
        return 0


if __name__ == "__main__":
    sys.exit(main())
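
# Intended for a once-a-day run; a typical cron entry might look like this
# (the script path is hypothetical - adjust to where this file actually lives):
#     0 3 * * * /usr/bin/python3 /path/to/daily_backup.py >> /var/log/daily_backup.log 2>&1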