From e8854cd959afdcb8cb15287b469ed78d600929bb Mon Sep 17 00:00:00 2001 From: SpeedyFoxAi Date: Mon, 23 Feb 2026 12:13:04 -0600 Subject: [PATCH] Initial commit: Jarvis Memory system --- .gitignore | 45 ++ CHANGELOG.md | 148 ++++ CONTRIBUTORS.md | 48 ++ MANIFEST.md | 190 +++++ README.md | 554 +++++++++++++ RESTORE.md | 187 +++++ TUTORIAL.md | 281 +++++++ config/HEARTBEAT.md | 64 ++ docker-compose.yml | 71 ++ docs/MEM_DIAGRAM.md | 738 ++++++++++++++++++ install.sh | 463 +++++++++++ memory-architecture.html | 540 +++++++++++++ requirements.txt | 20 + skills/mem-redis/SKILL.md | 42 + skills/mem-redis/scripts/cron_backup.py | 204 +++++ skills/mem-redis/scripts/cron_capture.py | 230 ++++++ skills/mem-redis/scripts/hb_append.py | 161 ++++ skills/mem-redis/scripts/mem_retrieve.py | 87 +++ skills/mem-redis/scripts/save_mem.py | 149 ++++ skills/mem-redis/scripts/search_mem.py | 242 ++++++ skills/qdrant-memory/HARVEST.md | 137 ++++ skills/qdrant-memory/SKILL.md | 53 ++ skills/qdrant-memory/scripts/activity_log.py | 273 +++++++ skills/qdrant-memory/scripts/agent_chat.py | 191 +++++ skills/qdrant-memory/scripts/agent_check.py | 181 +++++ skills/qdrant-memory/scripts/api_scraper.py | 275 +++++++ skills/qdrant-memory/scripts/auto_store.py | 388 +++++++++ .../qdrant-memory/scripts/backfill_emails.py | 145 ++++ .../qdrant-memory/scripts/background_store.py | 78 ++ skills/qdrant-memory/scripts/batch_crawl.py | 159 ++++ skills/qdrant-memory/scripts/bulk_migrate.py | 298 +++++++ .../scripts/create_daily_memory.py | 72 ++ skills/qdrant-memory/scripts/daily_backup.py | 317 ++++++++ .../scripts/daily_conversation_backup.py | 347 ++++++++ skills/qdrant-memory/scripts/extract_facts.py | 553 +++++++++++++ .../scripts/get_conversation_context.py | 236 ++++++ .../qdrant-memory/scripts/get_user_context.py | 86 ++ .../qdrant-memory/scripts/harvest_newest.py | 191 +++++ .../qdrant-memory/scripts/harvest_sessions.py | 341 ++++++++ .../qdrant-memory/scripts/hb_check_email.py | 186 +++++ 
skills/qdrant-memory/scripts/hybrid_search.py | 135 ++++ .../scripts/init_all_collections.py | 242 ++++++ skills/qdrant-memory/scripts/init_kimi_kb.py | 112 +++ .../scripts/init_kimi_memories.py | 114 +++ .../scripts/init_projects_collection.py | 113 +++ skills/qdrant-memory/scripts/js_scraper.py | 190 +++++ skills/qdrant-memory/scripts/kb_review.py | 183 +++++ skills/qdrant-memory/scripts/kb_search.py | 136 ++++ skills/qdrant-memory/scripts/kb_store.py | 379 +++++++++ skills/qdrant-memory/scripts/llm_router.py | 102 +++ skills/qdrant-memory/scripts/log_activity.py | 77 ++ .../scripts/metadata_and_compact.py | 190 +++++ .../scripts/migrate_qd_snowflake.py | 158 ++++ .../scripts/monitor_ollama_models.py | 207 +++++ .../scripts/monitor_openclaw_repo.py | 249 ++++++ skills/qdrant-memory/scripts/notify_check.py | 65 ++ skills/qdrant-memory/scripts/q_save.py | 70 ++ skills/qdrant-memory/scripts/qd.py | 427 ++++++++++ skills/qdrant-memory/scripts/scrape_to_kb.py | 220 ++++++ .../qdrant-memory/scripts/search_memories.py | 189 +++++ skills/qdrant-memory/scripts/send_email.py | 64 ++ .../qdrant-memory/scripts/sliding_backup.sh | 20 + skills/qdrant-memory/scripts/smart_parser.py | 211 +++++ skills/qdrant-memory/scripts/smart_search.py | 321 ++++++++ .../scripts/store_conversation.py | 303 +++++++ skills/qdrant-memory/scripts/store_memory.py | 394 ++++++++++ skills/qdrant-memory/scripts/tagger.py | 88 +++ skills/task-queue/SKILL.md | 33 + skills/task-queue/scripts/add_task.py | 91 +++ skills/task-queue/scripts/heartbeat_worker.py | 443 +++++++++++ skills/task-queue/scripts/list_tasks.py | 77 ++ uninstall.sh | 227 ++++++ 72 files changed, 14801 insertions(+) create mode 100644 .gitignore create mode 100644 CHANGELOG.md create mode 100644 CONTRIBUTORS.md create mode 100644 MANIFEST.md create mode 100644 README.md create mode 100644 RESTORE.md create mode 100644 TUTORIAL.md create mode 100644 config/HEARTBEAT.md create mode 100644 docker-compose.yml create mode 100644 
docs/MEM_DIAGRAM.md create mode 100755 install.sh create mode 100644 memory-architecture.html create mode 100644 requirements.txt create mode 100644 skills/mem-redis/SKILL.md create mode 100755 skills/mem-redis/scripts/cron_backup.py create mode 100644 skills/mem-redis/scripts/cron_capture.py create mode 100755 skills/mem-redis/scripts/hb_append.py create mode 100755 skills/mem-redis/scripts/mem_retrieve.py create mode 100755 skills/mem-redis/scripts/save_mem.py create mode 100755 skills/mem-redis/scripts/search_mem.py create mode 100644 skills/qdrant-memory/HARVEST.md create mode 100644 skills/qdrant-memory/SKILL.md create mode 100755 skills/qdrant-memory/scripts/activity_log.py create mode 100755 skills/qdrant-memory/scripts/agent_chat.py create mode 100755 skills/qdrant-memory/scripts/agent_check.py create mode 100755 skills/qdrant-memory/scripts/api_scraper.py create mode 100755 skills/qdrant-memory/scripts/auto_store.py create mode 100755 skills/qdrant-memory/scripts/backfill_emails.py create mode 100755 skills/qdrant-memory/scripts/background_store.py create mode 100755 skills/qdrant-memory/scripts/batch_crawl.py create mode 100755 skills/qdrant-memory/scripts/bulk_migrate.py create mode 100755 skills/qdrant-memory/scripts/create_daily_memory.py create mode 100755 skills/qdrant-memory/scripts/daily_backup.py create mode 100755 skills/qdrant-memory/scripts/daily_conversation_backup.py create mode 100755 skills/qdrant-memory/scripts/extract_facts.py create mode 100755 skills/qdrant-memory/scripts/get_conversation_context.py create mode 100755 skills/qdrant-memory/scripts/get_user_context.py create mode 100755 skills/qdrant-memory/scripts/harvest_newest.py create mode 100755 skills/qdrant-memory/scripts/harvest_sessions.py create mode 100755 skills/qdrant-memory/scripts/hb_check_email.py create mode 100755 skills/qdrant-memory/scripts/hybrid_search.py create mode 100755 skills/qdrant-memory/scripts/init_all_collections.py create mode 100755 
skills/qdrant-memory/scripts/init_kimi_kb.py create mode 100755 skills/qdrant-memory/scripts/init_kimi_memories.py create mode 100755 skills/qdrant-memory/scripts/init_projects_collection.py create mode 100755 skills/qdrant-memory/scripts/js_scraper.py create mode 100755 skills/qdrant-memory/scripts/kb_review.py create mode 100755 skills/qdrant-memory/scripts/kb_search.py create mode 100755 skills/qdrant-memory/scripts/kb_store.py create mode 100644 skills/qdrant-memory/scripts/llm_router.py create mode 100755 skills/qdrant-memory/scripts/log_activity.py create mode 100644 skills/qdrant-memory/scripts/metadata_and_compact.py create mode 100755 skills/qdrant-memory/scripts/migrate_qd_snowflake.py create mode 100755 skills/qdrant-memory/scripts/monitor_ollama_models.py create mode 100755 skills/qdrant-memory/scripts/monitor_openclaw_repo.py create mode 100755 skills/qdrant-memory/scripts/notify_check.py create mode 100755 skills/qdrant-memory/scripts/q_save.py create mode 100755 skills/qdrant-memory/scripts/qd.py create mode 100755 skills/qdrant-memory/scripts/scrape_to_kb.py create mode 100755 skills/qdrant-memory/scripts/search_memories.py create mode 100755 skills/qdrant-memory/scripts/send_email.py create mode 100755 skills/qdrant-memory/scripts/sliding_backup.sh create mode 100755 skills/qdrant-memory/scripts/smart_parser.py create mode 100755 skills/qdrant-memory/scripts/smart_search.py create mode 100755 skills/qdrant-memory/scripts/store_conversation.py create mode 100755 skills/qdrant-memory/scripts/store_memory.py create mode 100644 skills/qdrant-memory/scripts/tagger.py create mode 100644 skills/task-queue/SKILL.md create mode 100755 skills/task-queue/scripts/add_task.py create mode 100755 skills/task-queue/scripts/heartbeat_worker.py create mode 100755 skills/task-queue/scripts/list_tasks.py create mode 100755 uninstall.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d5ed37c --- /dev/null +++ b/.gitignore @@ -0,0 +1,45 @@ +# 
Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environment +.env +.memory_env +*.env + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Logs +*.log + +# OS +.DS_Store +Thumbs.db + +# OpenClaw specific +.mem_last_turn +.gmail_imap.json +.google_tokens.json diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..71471a8 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,148 @@ +# Changelog + +All notable changes to the OpenClaw Jarvis-Like Memory System blueprint. + +## [1.5.0] - 2026-02-19 + +### Added (Community PR #1 by ecomm-michael) +- **cron_capture.py** - Token-free transcript capture via cron (no LLM calls, saves money) +- **Safer Redis→Qdrant flush** - Only clears Redis if ALL user turns stored successfully +- **Auto-dependency installation** - install.sh now auto-installs Docker, Python, Redis if missing +- **llm_router.py** - Routes to cheap LLMs (Minimax) via OpenRouter with fallback +- **metadata_and_compact.py** - Auto-generates tags, titles, summaries using cheap LLM +- **tagger.py** - Content tagging for better organization +- **Portable defaults** - Changed hardcoded 10.0.0.x IPs to localhost (127.0.0.1) with env overrides +- **PEP 668 compliance** - Creates Python venv if pip --user blocked + +### Changed +- **cron_backup.py** - Better error handling, preserves Redis on Qdrant failure +- **hb_append.py** - Doesn't store thinking in main buffer (separate mem_thinking key) +- **auto_store.py** - Uses SHA256 instead of MD5 for content hashing (portable) +- **init_kimi_memories.py** - Env-driven config with defaults +- **task-queue scripts** - Removed hardcoded SSH credentials (security cleanup) +- **docker-compose.yml** - Disabled container healthcheck (qdrant image lacks curl) + +### Security +- Changed default USER_ID from "rob" to "yourname" in all scripts (privacy) +- Removed 
hardcoded credentials from task-queue + +### Contributors +- **ecomm-michael** - Major contribution: portability, cron capture, safer backups, metadata pipeline + +--- + +## [1.4.0] - 2026-02-19 + +### Added +- **Compaction threshold recommendation** - Added guide to set OpenClaw to 90% to reduce timing window +- **Manual setup steps** - Clear instructions (not automated) for adjusting compaction setting +- **Explanation** - Why 90% helps and how it relates to the known timing issue + +### Changed +- README Known Issues section expanded with "Adjust Compaction Threshold" subsection +- Added manual configuration steps that users should do post-installation + +--- + +## [1.3.0] - 2026-02-19 + +### Added +- **Complete command reference** in README - documents all 4 memory commands with usage +- **Known Issues section** - documents the compaction timing window issue +- Command table showing what each command does, which layer it hits, and when to use it + +### Changed +- README Memory Commands section expanded with detailed reference table +- Added data flow diagrams for both manual and automated memory storage + +--- + +## [1.2.0] - 2026-02-19 + +### Added +- **Automatic backup functionality** in `install.sh` - backs up all modified files before changes +- **RESTORE.md** - Complete manual backup/restore documentation +- **Version tracking** - Added version number to README and this CHANGELOG + +### Changed +- `install.sh` now creates `.backups/` directory with timestamped `.bak.rush` files +- `install.sh` generates `MANIFEST.txt` with exact restore commands +- README now documents every single file that gets modified or created + +### Files Modified in This Release +- `install.sh` - Added backup functionality (Step 5) +- `README.md` - Added version header, file inventory section +- `MANIFEST.md` - Updated component list, added RESTORE.md + +### Files Added in This Release +- `RESTORE.md` - Complete restore documentation +- `CHANGELOG.md` - This file + +--- + +## 
[1.1.0] - 2026-02-19 + +### Added +- **uninstall.sh** - Interactive recovery/uninstall script +- Uninstall script removes: cron jobs, Redis buffer, Qdrant collections (optional), config files + +### Changed +- `README.md` - Added uninstall section +- `MANIFEST.md` - Added uninstall.sh to file list + +### Files Added in This Release +- `uninstall.sh` - Recovery script + +--- + +## [1.0.0] - 2026-02-18 + +### Added +- Initial release of complete Jarvis-like memory system +- **52 Python scripts** across 3 skills: + - mem-redis (5 scripts) - Fast buffer layer + - qdrant-memory (43 scripts) - Vector database layer + - task-queue (3 scripts) - Background job processing +- **install.sh** - One-command installer +- **docker-compose.yml** - Complete infrastructure setup (Qdrant, Redis, Ollama) +- **README.md** - Complete documentation +- **TUTORIAL.md** - YouTube video script +- **MANIFEST.md** - File index +- **docs/MEM_DIAGRAM.md** - Architecture documentation +- **.gitignore** - Excludes cache files, credentials + +### Features +- Three-layer memory architecture (Redis → Files → Qdrant) +- User-centric storage (not session-based) +- Semantic search with 1024-dim embeddings +- Automatic daily backups via cron +- Deduplication via content hashing +- Conversation threading with metadata + +### Infrastructure +- Qdrant at 10.0.0.40:6333 +- Redis at 10.0.0.36:6379 +- Ollama at 10.0.0.10:11434 with snowflake-arctic-embed2 + +--- + +## Version History Summary + +| Version | Date | Key Changes | +|---------|------|-------------| +| 1.2.0 | 2026-02-19 | Auto-backup, RESTORE.md, version tracking | +| 1.1.0 | 2026-02-19 | uninstall.sh recovery script | +| 1.0.0 | 2026-02-18 | Initial release, 52 scripts, full tutorial | + +--- + +## Version Numbering + +We follow [Semantic Versioning](https://semver.org/): +- **MAJOR** (X.0.0) - Breaking changes, major architecture changes +- **MINOR** (x.X.0) - New features, backwards compatible +- **PATCH** (x.x.X) - Bug fixes, small improvements 
+ +--- + +*Last updated: February 19, 2026* diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md new file mode 100644 index 0000000..c0f623e --- /dev/null +++ b/CONTRIBUTORS.md @@ -0,0 +1,48 @@ +# Contributors + +Thank you to everyone who has contributed to the OpenClaw Jarvis-Like Memory System! + +## Core Development + +**mdkrush** (Rob) +- Original creator and maintainer +- Architecture design +- Documentation and tutorials +- GitHub: [@mdkrush](https://github.com/mdkrush) + +## Community Contributors + +### ecomm-michael +**Pull Request #1** - Major contribution (February 19, 2026) +- ✅ `cron_capture.py` - Token-free transcript capture via cron +- ✅ Safer Redis→Qdrant flush with better error handling +- ✅ Auto-dependency installation in `install.sh` +- ✅ Portable defaults (localhost vs hardcoded IPs) +- ✅ `llm_router.py` for cheap LLM routing +- ✅ `metadata_and_compact.py` for auto-tagging +- ✅ `tagger.py` for content organization +- ✅ Security cleanup (removed hardcoded credentials) +- ✅ SHA256 hashing for cross-platform compatibility +- GitHub: [@ecomm-michael](https://github.com/ecomm-michael) + +--- + +## How to Contribute + +1. **Fork** the repository +2. **Make your changes** (follow existing code style) +3. **Test thoroughly** (especially install/uninstall scripts) +4. **Document** what you changed +5. 
**Submit a Pull Request** + +### Contribution Guidelines + +- **Privacy first** - No personal identifiers in code +- **Portability** - Use env vars with sane defaults, not hardcoded paths +- **Backwards compatibility** - Don't break existing installs +- **Documentation** - Update README/CHANGELOG for user-facing changes +- **MIT licensed** - Your contributions will be MIT licensed + +--- + +*Thank you for making AI memory better for everyone!* 🚀 diff --git a/MANIFEST.md b/MANIFEST.md new file mode 100644 index 0000000..5d43833 --- /dev/null +++ b/MANIFEST.md @@ -0,0 +1,190 @@ +# OpenClaw Jarvis-Like Memory System - Complete Blueprint + +> **Version:** 1.5.0 +> **Date:** February 19, 2026 +> **Purpose:** Build an AI assistant that actually remembers + +--- + +## 📦 What's Included + +This blueprint contains everything needed to build a production-grade, multi-layer memory system for OpenClaw. + +### Core Components + +| Component | Purpose | Status | +|-----------|---------|--------| +| **mem-redis** | Redis buffer (Layer 1) | ✅ Complete | +| **qdrant-memory** | Vector DB (Layer 3) | ✅ Complete | +| **task-queue** | Background jobs | ✅ Complete | +| **install.sh** | One-command installer (with auto-backup) | ✅ Complete | +| **uninstall.sh** | Recovery/uninstall script | ✅ Complete | +| **RESTORE.md** | Manual backup/restore guide | ✅ Complete | +| **CHANGELOG.md** | Version history | ✅ Complete | +| **docker-compose.yml** | Infrastructure | ✅ Complete | + +### Files Overview + +``` +blueprint/ +├── install.sh ⭐ Main installer (auto-backs up existing files) +├── uninstall.sh 🧹 Recovery/uninstall script +├── RESTORE.md 🛡️ Manual backup/restore guide +├── CHANGELOG.md 📋 Version history +├── README.md ⭐ Start here (includes command reference & known issues) +├── TUTORIAL.md 🎬 YouTube script +├── docker-compose.yml 🐳 Infrastructure +├── requirements.txt 📦 Python deps +│ +├── skills/ +│ ├── mem-redis/ 🚀 Fast buffer +│ │ ├── SKILL.md +│ │ └── scripts/ +│ │ ├── hb_append.py 
# Heartbeat: new turns +│ │ ├── save_mem.py # Manual: all turns +│ │ ├── cron_backup.py # Daily: flush to Qdrant +│ │ ├── mem_retrieve.py # Read from Redis +│ │ └── search_mem.py # Search Redis+Qdrant +│ │ +│ ├── qdrant-memory/ 🧠 Long-term storage +│ │ ├── SKILL.md +│ │ ├── HARVEST.md +│ │ └── scripts/ +│ │ ├── auto_store.py # Store with embeddings +│ │ ├── q_save.py # Quick save +│ │ ├── search_memories.py # Semantic search +│ │ ├── init_kimi_memories.py # Initialize collection +│ │ ├── init_kimi_kb.py +│ │ ├── init_private_court_docs.py +│ │ ├── daily_conversation_backup.py +│ │ ├── harvest_sessions.py +│ │ ├── harvest_newest.py +│ │ ├── sliding_backup.sh +│ │ ├── store_conversation.py +│ │ ├── store_memory.py +│ │ ├── get_conversation_context.py +│ │ └── smart_search.py +│ │ +│ └── task-queue/ 📋 Background jobs +│ ├── SKILL.md +│ └── scripts/ +│ ├── add_task.py +│ ├── list_tasks.py +│ └── heartbeat_worker.py +│ +├── config/ +│ └── HEARTBEAT.md.template 📝 Copy to HEARTBEAT.md +│ +└── docs/ + └── MEM_DIAGRAM.md 📖 Full architecture docs +``` + +--- + +## 🚀 Quick Start + +```bash +# 1. Copy this blueprint to your workspace +cp -r blueprint/* ~/.openclaw/workspace/ + +# 2. Run the installer +cd ~/.openclaw/workspace +chmod +x install.sh +./install.sh + +# 3. 
Source environment and test +source .memory_env +python3 skills/mem-redis/scripts/save_mem.py --user-id yourname +``` + +--- + +## 🎥 For YouTube Creators + +See `TUTORIAL.md` for: +- Complete video script +- Section timestamps +- Thumbnail ideas +- Description template +- Tag suggestions + +--- + +## 🏗️ Architecture + +``` +Layer 1: Redis Buffer (fast, real-time) + ↓ +Layer 2: Daily Files (.md, human-readable) + ↓ +Layer 3: Qdrant (semantic, searchable) +``` + +**Commands:** +- `save mem` → Redis + File +- `save q` → Qdrant (embeddings) +- `q <topic>` → Semantic search + +--- + +## 📊 Statistics + +| Metric | Value | +|--------|-------| +| Python Scripts | 52 | +| Lines of Code | ~5,000 | +| Documentation | 3,000+ lines | +| Architecture Diagrams | 5 | +| Skills | 3 | +| Installer Backups | Automatic `.bak.rush` files | + +## Version History + +| **Version** | **Date** | **Changes** | +|-------------|----------|-------------| +| 1.4.0 | Feb 19, 2026 | Compaction threshold recommendation (90%), manual setup docs | +| 1.3.0 | Feb 19, 2026 | Command reference, known issues documentation | +| 1.2.0 | Feb 19, 2026 | Auto-backup, RESTORE.md, version tracking | +| 1.1.0 | Feb 19, 2026 | Added uninstall.sh recovery script | +| 1.0.0 | Feb 18, 2026 | Initial release - 52 scripts, full tutorial | + +--- + +## ✅ Verification Checklist + +Before sharing this blueprint, verify: + +- [ ] All scripts are executable (`chmod +x`) +- [ ] Docker Compose starts all services +- [ ] Install script runs without errors +- [ ] Installer creates `.bak.rush` backups before modifying files +- [ ] `save mem` works +- [ ] `save q` works +- [ ] `q <topic>` search works +- [ ] Cron jobs are configured +- [ ] HEARTBEAT.md template is correct +- [ ] RESTORE.md explains manual restore process + +--- + +## 🔗 Related Files + +| File | Description | +|------|-------------| +| MEM_DIAGRAM.md | Complete architecture documentation | +| install.sh | Automated installer (auto-backs up before changes) | +| uninstall.sh | 
Recovery/uninstall script | +| RESTORE.md | Manual backup/restore documentation | +| CHANGELOG.md | Version history | +| TUTORIAL.md | YouTube video script | +| docker-compose.yml | Infrastructure as code | + +--- + +## 📝 License + +MIT - Use this however you want. Attribution appreciated. + +--- + +**Ready to build Jarvis?** Run `./install.sh` 🚀 diff --git a/README.md b/README.md new file mode 100644 index 0000000..1df61f0 --- /dev/null +++ b/README.md @@ -0,0 +1,554 @@ +# OpenClaw Jarvis-Like Memory System + +> **Build an AI assistant that actually remembers you.** +> +> **GitHub:** https://github.com/mdkrush/openclaw-jarvis-memory +> +> **Version: 1.5.0** (February 19, 2026) +> +> **Changelog:** +> - v1.5.0: Merged community PR #1 - cron capture (token-free), safer backups, auto-dependencies, portable defaults +> - v1.4.0: Added compaction threshold recommendation (90%) with manual setup steps +> - v1.3.0: Added complete command reference, documented known issues with compaction timing +> - v1.2.0: Added automatic backup to installer, RESTORE.md documentation +> - v1.1.0: Added uninstall.sh recovery script +> - v1.0.0: Initial release with 52 scripts, complete tutorial + +This is a complete blueprint for implementing a production-grade, multi-layer memory system for OpenClaw that provides persistent, searchable, cross-session context — just like Jarvis from Iron Man. + +**Why not just use OpenClaw's built-in features or skills?** +> *I want a portable brain — one I can take to the next OpenClaw, or whatever AI project I adopt next.* +> +> This system is **modular and independent**. Your memories live in standard infrastructure (Redis, Qdrant, Markdown files) that any AI can access. You're not locked into a single platform. + +**⚙️ Configuration:** Copy `.memory_env.template` to `.memory_env` and set your infrastructure IPs/ports. All scripts use environment variables — no hardcoded addresses. 
+ +[![YouTube Tutorial](https://img.shields.io/badge/YouTube-Tutorial-red)](https://youtube.com) +[![License](https://img.shields.io/badge/License-MIT-blue)]() + +## 🎯 What This Builds + +A three-layer memory architecture: + +``` +┌─────────────────────────────────────────────────────────────┐ +│ LAYER 1: Redis Buffer (Fast Short-Term) │ +│ • Real-time accumulation │ +│ • Multi-session persistence │ +│ • Daily flush to Qdrant │ +├─────────────────────────────────────────────────────────────┤ +│ LAYER 2: Daily File Logs (.md) │ +│ • Human-readable audit trail │ +│ • Git-tracked, never lost │ +│ • Always accessible │ +├─────────────────────────────────────────────────────────────┤ +│ LAYER 3: Qdrant Vector DB (Semantic Long-Term) │ +│ • 1024-dim embeddings (snowflake-arctic-embed2) │ +│ • Semantic search across ALL conversations │ +│ • User-centric (Mem0-style architecture) │ +└─────────────────────────────────────────────────────────────┘ +``` + +## 🚀 Quick Start + +```bash +# 1. Clone/copy this blueprint to your workspace +cp -r openclaw-jarvis-memory/* ~/.openclaw/workspace/ + +# 2. Configure your environment +cd ~/.openclaw/workspace +cp .memory_env.template .memory_env +# Edit .memory_env with your actual IP addresses/ports + +# 3. Run the installer (automatically backs up existing files) +chmod +x install.sh +./install.sh + +# 4. Source the environment +source .memory_env + +# 5. Test it +python3 skills/mem-redis/scripts/save_mem.py --user-id yourname +``` + +**🔒 The installer automatically backs up** your existing `HEARTBEAT.md`, `.memory_env`, and crontab before making changes. Backups are stored in `.backups/` with timestamps. + +**See [RESTORE.md](RESTORE.md)** for how to restore from backups manually. 
+ +--- + +## 📋 Files Modified by Installer + +When you run `./install.sh`, the following files in your OpenClaw workspace are **modified** (backed up first as `.bak.rush` files): + +### Files That Get Modified (with Backup) + +| File | Location | What Installer Does | Backup Location | +|------|----------|---------------------|-----------------| +| **crontab** | System crontab | Adds 2 daily cron jobs for backups | `.backups/install_*_crontab.bak.rush` | +| **HEARTBEAT.md** | `~/.openclaw/workspace/HEARTBEAT.md` | Creates or overwrites with memory automation | `.backups/install_*_HEARTBEAT.md.bak.rush` | +| **.memory_env** | `~/.openclaw/workspace/.memory_env` | Creates environment variables file | `.backups/install_*_memory_env.bak.rush` | + +### Files That Get Created (New) + +| File | Location | Purpose | +|------|----------|---------| +| **52 Python scripts** | `~/.openclaw/workspace/skills/mem-redis/scripts/` (5 files)
`~/.openclaw/workspace/skills/qdrant-memory/scripts/` (43 files)
`~/.openclaw/workspace/skills/task-queue/scripts/` (3 files) | Core memory system functionality | +| **SKILL.md** | `~/.openclaw/workspace/skills/mem-redis/SKILL.md` | Redis skill documentation | +| **SKILL.md** | `~/.openclaw/workspace/skills/qdrant-memory/SKILL.md` | Qdrant skill documentation | +| **SKILL.md** | `~/.openclaw/workspace/skills/task-queue/SKILL.md` | Task queue documentation | +| **memory/** | `~/.openclaw/workspace/memory/` | Daily markdown log files directory | +| **.gitkeep** | `~/.openclaw/workspace/memory/.gitkeep` | Keeps memory dir in git | +| **Backup Manifest** | `~/.openclaw/workspace/.backups/install_*_MANIFEST.txt` | Lists all backups with restore commands | + +### Full Path List for Manual Restore + +If you need to restore manually without using the uninstaller, here's every single file path: + +**Configuration Files (Modified):** +``` +~/.openclaw/workspace/HEARTBEAT.md # Automation config +~/.openclaw/workspace/.memory_env # Environment variables +~/.openclaw/workspace/.mem_last_turn # State tracking (created) +``` + +**Skill Files (Created - 52 total scripts):** +``` +# Redis Buffer (5 scripts) +~/.openclaw/workspace/skills/mem-redis/scripts/hb_append.py +~/.openclaw/workspace/skills/mem-redis/scripts/save_mem.py +~/.openclaw/workspace/skills/mem-redis/scripts/cron_backup.py +~/.openclaw/workspace/skills/mem-redis/scripts/mem_retrieve.py +~/.openclaw/workspace/skills/mem-redis/scripts/search_mem.py +~/.openclaw/workspace/skills/mem-redis/SKILL.md + +# Qdrant Memory (43 scripts - key ones listed) +~/.openclaw/workspace/skills/qdrant-memory/scripts/auto_store.py +~/.openclaw/workspace/skills/qdrant-memory/scripts/q_save.py +~/.openclaw/workspace/skills/qdrant-memory/scripts/search_memories.py +~/.openclaw/workspace/skills/qdrant-memory/scripts/init_kimi_memories.py +~/.openclaw/workspace/skills/qdrant-memory/scripts/init_kimi_kb.py +~/.openclaw/workspace/skills/qdrant-memory/scripts/init_private_court_docs.py 
+~/.openclaw/workspace/skills/qdrant-memory/scripts/daily_conversation_backup.py +~/.openclaw/workspace/skills/qdrant-memory/scripts/harvest_sessions.py +~/.openclaw/workspace/skills/qdrant-memory/scripts/sliding_backup.sh +~/.openclaw/workspace/skills/qdrant-memory/scripts/store_conversation.py +~/.openclaw/workspace/skills/qdrant-memory/SKILL.md +~/.openclaw/workspace/skills/qdrant-memory/HARVEST.md +# ... (33 more scripts - see skills/qdrant-memory/scripts/) + +# Task Queue (3 scripts) +~/.openclaw/workspace/skills/task-queue/scripts/add_task.py +~/.openclaw/workspace/skills/task-queue/scripts/heartbeat_worker.py +~/.openclaw/workspace/skills/task-queue/scripts/list_tasks.py +~/.openclaw/workspace/skills/task-queue/SKILL.md +``` + +**Directories Created:** +``` +~/.openclaw/workspace/skills/mem-redis/scripts/ +~/.openclaw/workspace/skills/qdrant-memory/scripts/ +~/.openclaw/workspace/skills/task-queue/scripts/ +~/.openclaw/workspace/memory/ +~/.openclaw/workspace/.backups/ +``` + +--- + +### 🧹 Uninstall/Recovery + +If you need to remove the memory system: + +```bash +./uninstall.sh +``` + +This interactive script will: +- Remove cron jobs +- Clear Redis buffer +- Optionally delete Qdrant collections (your memories) +- Remove configuration files +- Optionally remove all skill files + +## 📋 Prerequisites + +### Required Infrastructure + +| Service | Purpose | Install | +|---------|---------|---------| +| **Qdrant** | Vector database | `docker run -p 6333:6333 qdrant/qdrant` | +| **Redis** | Fast buffer | `docker run -p 6379:6379 redis` | +| **Ollama** | Embeddings | [ollama.ai](https://ollama.ai) + `ollama pull snowflake-arctic-embed2` | + +### Software Requirements + +- Python 3.8+ +- OpenClaw (obviously) +- `pip3 install redis qdrant-client requests` + +## 🏗️ Architecture + +### Memory Commands Reference + +These are the commands you can use once the memory system is installed: + +| Command | What It Does | Data Layer | When to Use | 
+|---------|--------------|------------|-------------| +| **`save mem`** | Saves ALL conversation turns to Redis buffer + daily file | Layer 1 (Redis) + Layer 2 (Files) | When you want to capture current session | +| **`save q`** | Stores current exchange to Qdrant with embeddings | Layer 3 (Qdrant) | When you want immediate long-term searchable memory | +| **`q <topic>`** | Semantic search across all stored memories | Layer 3 (Qdrant) | Find past conversations by meaning, not keywords | +| **`remember this`** | Quick note to daily file (manual note) | Layer 2 (Files) | Important facts you want to log | + +**Data Flow:** +``` +User: "save mem" → Redis Buffer + File Log (fast, persistent) +User: "save q" → Qdrant Vector DB (semantic, searchable) +User: "q <topic>" → Searches embeddings for similar content +``` + +### Automated Flow + +``` +Every Message (capture option A: heartbeat, capture option B: cron capture) + ↓ +Redis Buffer (fast, survives session reset) + ↓ +File Log (permanent, human-readable markdown) + ↓ +[Optional: User says "save q"] → Qdrant (semantic search) + +Cost note: cron capture avoids LLM heartbeats entirely and is the recommended default for token savings. + +Cron capture quick test (no Redis required): +```bash +python3 skills/mem-redis/scripts/cron_capture.py --dry-run --user-id yourname +``` + +Daily 3:00 AM (cron) + ↓ +Redis Buffer → Flush → Qdrant (with embeddings) + ↓ +Clear Redis (ready for new day) + +Daily 3:30 AM (cron) + ↓ +Daily Files → Sliding Backup → Archive +``` + +### Cron Capture (Token-Free Alternative) + +**New in v1.5.0:** `cron_capture.py` provides a **zero-token** alternative to heartbeat capture. 
+ +**Why use it:** +- **Saves money** - No LLM calls to capture transcripts +- **Runs every 5 minutes** via cron (no session API needed) +- **Tracks file position** - Only reads NEW content since last run +- **Optional thinking capture** - Store model thinking separately + +**Setup:** +```bash +# Add to crontab (runs every 5 minutes) +*/5 * * * * cd ~/.openclaw/workspace && python3 skills/mem-redis/scripts/cron_capture.py --user-id yourname +``` + +**Test it:** +```bash +# Dry run (shows what would be captured) +python3 skills/mem-redis/scripts/cron_capture.py --dry-run --user-id yourname + +# Run for real +python3 skills/mem-redis/scripts/cron_capture.py --user-id yourname +``` + +**Capture Options Comparison:** + +| Method | Token Cost | Trigger | Best For | +|--------|------------|---------|----------| +| **Heartbeat** | ~1K tokens/turn | Every OpenClaw message | Real-time, always-on | +| **Cron Capture** | **FREE** | Every 5 minutes | Cost-conscious, periodic | +| **Manual `save mem`** | FREE | On demand | Important sessions | + +**Note:** You can use BOTH - cron capture for background accumulation, heartbeat for real-time critical sessions. 
+ +--- + +## 📁 Project Structure + +``` +openclaw-jarvis-memory/ +├── install.sh # One-command installer +├── README.md # This file +├── docker-compose.yml # Spin up all infrastructure +├── requirements.txt # Python dependencies +├── .memory_env.template # Environment configuration template +├── skills/ +│ ├── mem-redis/ # Redis buffer skill +│ │ ├── SKILL.md +│ │ └── scripts/ +│ │ ├── hb_append.py +│ │ ├── save_mem.py +│ │ ├── cron_backup.py +│ │ ├── mem_retrieve.py +│ │ └── search_mem.py +│ └── qdrant-memory/ # Qdrant storage skill +│ ├── SKILL.md +│ ├── HARVEST.md +│ └── scripts/ +│ ├── auto_store.py +│ ├── q_save.py +│ ├── search_memories.py +│ ├── daily_conversation_backup.py +│ ├── harvest_sessions.py +│ ├── init_*.py +│ └── sliding_backup.sh +├── config/ +│ └── HEARTBEAT.md.template +└── docs/ + └── MEM_DIAGRAM.md # Complete architecture docs +``` + +## 🔧 Manual Setup (Without install.sh) + +### Step 1: Create Directory Structure + +```bash +mkdir -p ~/.openclaw/workspace/{skills/{mem-redis,qdrant-memory}/scripts,memory} +``` + +### Step 2: Copy Scripts + +See `skills/` directory in this repository. 
+ +### Step 3: Configure Environment + +Create `~/.openclaw/workspace/.memory_env`: + +```bash +export USER_ID="yourname" +export REDIS_HOST="127.0.0.1" +export REDIS_PORT="6379" +export QDRANT_URL="http://127.0.0.1:6333" +export OLLAMA_URL="http://127.0.0.1:11434" +``` + +### Step 4: Initialize Qdrant Collections + +```bash +cd ~/.openclaw/workspace/skills/qdrant-memory/scripts +python3 init_kimi_memories.py +python3 init_kimi_kb.py +python3 init_private_court_docs.py +``` + +### Step 5: Set Up Cron + +```bash +# 3:00 AM - Redis to Qdrant flush +0 3 * * * cd ~/.openclaw/workspace && python3 skills/mem-redis/scripts/cron_backup.py + +# 3:30 AM - File backup +30 3 * * * ~/.openclaw/workspace/skills/qdrant-memory/scripts/sliding_backup.sh +``` + +### Step 6: Configure Heartbeat + +Add to `HEARTBEAT.md`: + +````markdown +## Memory Buffer (Every Heartbeat) + +```bash +python3 /root/.openclaw/workspace/skills/mem-redis/scripts/save_mem.py --user-id yourname +``` +```` + +## 🎥 YouTube Video Outline + +If you're making a video about this: + +1. **Introduction** (0-2 min) + - The problem: AI that forgets everything + - The solution: Multi-layer memory + +2. **Demo** (2-5 min) + - "What did we talk about yesterday?" + - Semantic search in action + +3. **Architecture** (5-10 min) + - Show the three layers + - Why each layer exists + +4. **Live Build** (10-25 min) + - Set up Qdrant + Redis + - Install the scripts + - Test the commands + +5. **Advanced Features** (25-30 min) + - Session harvesting + - Email integration + - Task queue + +6. **Wrap-up** (30-32 min) + - Recap + - GitHub link + - Call to action + +## 🔍 How It Works + +### Deduplication + +Each memory generates a SHA-256 content hash. Before storing to Qdrant, the system checks if this user already has this exact content — preventing duplicates while allowing the same content for different users. + +### Embeddings + +Every turn generates **3 embeddings**: +1. User message embedding +2. AI response embedding +3. 
Combined summary embedding + +This enables searching by user query, AI response, or overall concept. + +### Threading + +Memories are tagged with: +- `user_id`: Persistent identity +- `conversation_id`: Groups related turns +- `session_id`: Which chat instance +- `turn_number`: Sequential ordering + +## 🛠️ Customization + +### Change Embedding Model + +Edit `skills/qdrant-memory/scripts/auto_store.py`: + +```python +# Change this line +EMBEDDING_MODEL = "snowflake-arctic-embed2" # or your preferred model +``` + +### Add New Collections + +Copy `init_kimi_memories.py` and modify: + +```python +COLLECTION_NAME = "my_custom_collection" +``` + +### Adjust Cron Schedule + +Edit your crontab: + +```bash +# Every 6 hours instead of daily +0 */6 * * * cd ~/.openclaw/workspace && python3 skills/mem-redis/scripts/cron_backup.py +``` + +## 📊 Monitoring + +### Check System Status + +```bash +# Redis buffer size +redis-cli -h $REDIS_HOST LLEN mem:yourname + +# Qdrant collection size +curl -s $QDRANT_URL/collections/kimi_memories | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" + +# Recent memories +python3 skills/mem-redis/scripts/mem_retrieve.py --limit 10 +``` + +## ⚠️ Known Issues + +### Gap Between Heartbeat/Save and Compaction + +**The Issue:** +There is a small timing window where data can be lost: + +1. OpenClaw session JSONL files get "compacted" (rotated/archived) periodically +2. If a heartbeat or `save mem` runs *after* compaction but *before* a new session starts, it may miss the last few turns +3. 
The Redis buffer tracks turns by number, but the source file has changed + +**Impact:** +- Low - happens only during active session compaction +- Affects only the most recent turns if timing is unlucky +- Daily file logs usually still have the data + +**Workaround:** +- Run `save mem` manually before ending important sessions +- The cron job at 3:00 AM catches anything missed during the day +- Use `save q` for critical exchanges (goes directly to Qdrant immediately) + +### Recommendation: Adjust Compaction Threshold + +To reduce how often this issue occurs, **set OpenClaw's session compaction threshold to 90%** (default is often lower). This makes compaction happen less frequently, shrinking the timing window. + +**Manual Steps (Not in Installer):** + +1. **Locate your OpenClaw config:** + ```bash + # Find your OpenClaw configuration file + ls ~/.openclaw/config/ # or wherever your config lives + ``` + +2. **Edit the compaction setting:** + ```bash + # Look for session or compaction settings + # Add or modify: + # "session_compaction_threshold": 90 + ``` + +3. **Alternative - via environment variable:** + ```bash + # Add to your shell profile or .memory_env: + export OPENCLAW_COMPACTION_THRESHOLD=90 + ``` + +4. **Restart OpenClaw gateway:** + ```bash + openclaw gateway restart + ``` + +**Why 90%?** +- Default is often 50-70%, causing frequent compactions +- 90% means files grow larger before rotation +- Less frequent compaction = smaller timing window for data loss +- Still protects disk space from runaway log files + +**Note:** The installer does NOT change this setting automatically, as it requires OpenClaw gateway restart and may vary by installation. This is a manual optimization step. 
+ +--- + +## 🐛 Troubleshooting + +| Issue | Solution | +|-------|----------| +| "Redis connection failed" | Check Redis is running: `redis-cli -h $REDIS_HOST ping` | +| "Qdrant connection failed" | Check Qdrant: `curl $QDRANT_URL/collections` | +| "Embedding failed" | Ensure Ollama has snowflake-arctic-embed2 loaded | +| "No memories found" | Run `save q` first, or check collection exists | +| Cron not running | Check logs: `tail /var/log/memory-backup.log` | + +## 🤝 Contributing + +This is a community blueprint! If you improve it: + +1. Fork the repo +2. Make your changes +3. Submit a PR +4. Share your video/tutorial! + +## 📜 License + +MIT License — use this however you want. Attribution appreciated but not required. + +## 🙏 Credits + +- OpenClaw community +- Mem0 for the user-centric memory architecture inspiration +- Qdrant for the amazing vector database + +--- + +**Ready to build?** Run `./install.sh` and let's make AI that actually remembers! 🚀 diff --git a/RESTORE.md b/RESTORE.md new file mode 100644 index 0000000..2d2fe66 --- /dev/null +++ b/RESTORE.md @@ -0,0 +1,187 @@ +# Manual Backup & Restore Guide + +> **Peace of mind**: Every file modified by the installer is backed up before changes are made. 
+ +## 📁 Where Backups Are Stored + +Backups are stored in: +``` +~/.openclaw/workspace/.backups/ +``` + +Each installation creates a unique timestamped backup set: +``` +.backups/ +├── install_20260219_083012_crontab.bak.rush +├── install_20260219_083012_HEARTBEAT.md.bak.rush +├── install_20260219_083012_memory_env.bak.rush +└── install_20260219_083012_MANIFEST.txt +``` + +## 📋 What Gets Backed Up + +| File | Why It's Backed Up | Restore Command | +|------|-------------------|-----------------| +| **Crontab** | Installer adds 2 cron jobs for daily backups | `crontab .backups/install_*_crontab.bak.rush` | +| **HEARTBEAT.md** | Installer creates/modifies automation config | `cp .backups/install_*_HEARTBEAT.md.bak.rush HEARTBEAT.md` | +| **.memory_env** | Installer creates environment variables | `cp .backups/install_*_memory_env.bak.rush .memory_env` | + +## 🔄 How to Restore + +### Quick Restore (One Command) + +Each backup includes a `MANIFEST.txt` with exact restore commands: + +```bash +cd ~/.openclaw/workspace/.backups +cat install_20260219_083012_MANIFEST.txt +``` + +### Step-by-Step Restore + +#### 1. Find Your Backup + +```bash +ls -la ~/.openclaw/workspace/.backups/ +``` + +Look for files with pattern: `install_YYYYMMDD_HHMMSS_*.bak.rush` + +#### 2. Restore Crontab (removes auto-backup jobs) + +```bash +# List current crontab +crontab -l + +# Restore from backup +crontab ~/.openclaw/workspace/.backups/install_20260219_083012_crontab.bak.rush + +# Verify +crontab -l +``` + +#### 3. Restore HEARTBEAT.md + +```bash +# Backup current first (just in case) +cp ~/.openclaw/workspace/HEARTBEAT.md ~/.openclaw/workspace/HEARTBEAT.md.manual_backup + +# Restore from installer backup +cp ~/.openclaw/workspace/.backups/install_20260219_083012_HEARTBEAT.md.bak.rush \ + ~/.openclaw/workspace/HEARTBEAT.md +``` + +#### 4. 
Restore .memory_env + +```bash +# Restore environment file +cp ~/.openclaw/workspace/.backups/install_20260219_083012_memory_env.bak.rush \ + ~/.openclaw/workspace/.memory_env + +# Re-source it +source ~/.openclaw/workspace/.memory_env +``` + +## 🛡️ Creating Your Own Backups + +Before making changes manually, create your own backup: + +```bash +cd ~/.openclaw/workspace + +# Back up everything important +tar -czf my_backup_$(date +%Y%m%d).tar.gz \ + HEARTBEAT.md \ + .memory_env \ + .backups/ \ + memory/ + +# Store it somewhere safe +cp my_backup_$(date +%Y%m%d).tar.gz ~/Documents/ +``` + +## ⚠️ When to Restore + +| Situation | Action | +|-----------|--------| +| Cron jobs causing issues | Restore crontab | +| HEARTBEAT.md corrupted | Restore HEARTBEAT.md | +| Wrong environment settings | Restore .memory_env | +| Complete removal wanted | Run `uninstall.sh` instead | +| Something broke | Check backup manifest, restore specific file | + +## 🔧 Full System Restore Example + +```bash +# 1. Go to workspace +cd ~/.openclaw/workspace + +# 2. Identify your backup timestamp +BACKUP_DATE="20260219_083012" + +# 3. Restore all files +crontab .backups/install_${BACKUP_DATE}_crontab.bak.rush +cp .backups/install_${BACKUP_DATE}_HEARTBEAT.md.bak.rush HEARTBEAT.md +cp .backups/install_${BACKUP_DATE}_memory_env.bak.rush .memory_env + +# 4. Source the restored environment +source .memory_env + +# 5. 
Verify +echo "Crontab:" +crontab -l | grep -E "(Memory System|cron_backup|sliding_backup)" + +echo "" +echo "HEARTBEAT.md exists:" +ls -la HEARTBEAT.md + +echo "" +echo ".memory_env:" +cat .memory_env +``` + +## 📝 Backup Naming Convention + +| Pattern | Meaning | +|---------|---------| +| `install_YYYYMMDD_HHMMSS_*.bak.rush` | Automatic backup from installer | +| `*.manual_backup` | User-created manual backup | +| `*_crontab.bak.rush` | Crontab backup | +| `*_HEARTBEAT.md.bak.rush` | HEARTBEAT.md backup | +| `*_memory_env.bak.rush` | Environment file backup | + +## 🗑️ Cleaning Up Old Backups + +Backups don't auto-delete. Clean up periodically: + +```bash +# List all backups +ls -la ~/.openclaw/workspace/.backups/ + +# Remove backups older than 30 days +find ~/.openclaw/workspace/.backups/ -name "*.bak.rush" -mtime +30 -delete + +# Or remove specific timestamp +rm ~/.openclaw/workspace/.backups/install_20260219_083012_* +``` + +## ❓ FAQ + +**Q: Will the installer overwrite my existing HEARTBEAT.md?** +A: It will back up the existing file first (as `install_YYYYMMDD_HHMMSS_HEARTBEAT.md.bak.rush` in `.backups/`), then create the new one. + +**Q: Can I run the installer multiple times?** +A: Yes! Each run creates new backups. The installer is idempotent (safe to run again). + +**Q: What if I don't have a crontab yet?** +A: No problem - the installer detects this and won't try to back up a non-existent file. + +**Q: Are my memories (Qdrant data) backed up?** +A: No - these backups are for configuration files only. Your actual memories stay in Qdrant until you explicitly delete them via `uninstall.sh`. + +**Q: Where is the backup manifest?** +A: Each backup set includes an `install_YYYYMMDD_HHMMSS_MANIFEST.txt` with exact restore commands. 
+ +--- + +*Remember: When in doubt, backup first!* 🛡️ diff --git a/TUTORIAL.md b/TUTORIAL.md new file mode 100644 index 0000000..dace101 --- /dev/null +++ b/TUTORIAL.md @@ -0,0 +1,281 @@ +# YouTube Tutorial Script: Building Jarvis-Like Memory for OpenClaw + +> **Video Title Ideas:** +> - "I Built a Jarvis Memory System for My AI Assistant" +> - "OpenClaw Memory That Actually Works (Full Build)" +> - "From Goldfish to Elephant: AI Memory Architecture" + +--- + +## Video Sections + +### [0:00-2:00] Introduction: The Problem + +**On screen:** Split screen showing normal AI vs. AI with memory + +**Script:** + +"Hey everyone! You know how most AI assistants are like goldfish? You say something, they respond, and then... poof. It's gone. Start a new session? Everything's gone. Reset the conversation? Gone. + +But what if I told you we can build an AI assistant that actually **remembers**? Not just the current session. Not just recent messages. But months of conversations, projects, preferences — all instantly searchable and semantically understood. + +Today we're building a Jarvis-like memory system for OpenClaw. Three layers. Full persistence. Semantic search. And it's all self-hosted." + +**Visual:** Show the three-layer architecture diagram + +--- + +### [2:00-5:00] Demo: Show It Working + +**On screen:** Live terminal demo + +**Script:** + +"Before we build, let me show you what this actually looks like. + +[Type] `q docker networking` + +See that? It found a conversation from two weeks ago where we talked about Docker networking. It didn't just keyword search — it understood the semantic meaning of my question. + +[Type] `save q` + +This saves our current conversation to long-term memory. Now even if I reset my session, this conversation is searchable forever. + +[Type] `save mem` + +This saves everything to the fast Redis buffer. Every night at 3 AM, this automatically flushes to our vector database. + +The result? 
An AI assistant that knows my infrastructure, remembers my projects, and can recall anything we've ever discussed." + +**Visual:** Show search results appearing from Qdrant + +--- + +### [5:00-10:00] Architecture Deep Dive + +**On screen:** Architecture diagram with each layer highlighted + +**Script:** + +"So how does this work? Three layers. + +**Layer 1: Redis Buffer** — Fast, real-time accumulation. Every message gets stored here instantly. It survives session resets because it's external to OpenClaw. Every night at 3 AM, we flush this to Qdrant. + +**Layer 2: Daily File Logs** — Human-readable Markdown files. Git-tracked, never lost, always accessible. This is your audit trail. You can grep these, read them, they're just text files. + +**Layer 3: Qdrant Vector Database** — The magic happens here. We generate 1024-dimensional embeddings using the snowflake-arctic-embed2 model. Every turn gets THREE embeddings: one for the user message, one for the AI response, and one combined summary. This enables semantic search. + +**Deduplication** — We hash every piece of content. Same user, same content? Skip it. Different user, same content? Store it. This prevents bloat. + +**User-centric design** — Memories follow YOU, not the session. Ask 'what did I say about X?' and it searches across ALL your conversations." + +**Visual:** Animated data flow showing messages → Redis → Files → Qdrant + +--- + +### [10:00-25:00] Live Build + +**On screen:** Terminal, code editor + +**Script:** + +"Alright, let's build this. I'm going to assume you have OpenClaw running. If not, check my previous video. + +**Step 1: Infrastructure** + +We need three things: Qdrant for vectors, Redis for fast buffer, and Ollama for embeddings. + +[Show] `docker-compose up -d` + +This spins up everything. Let's verify: + +[Show] `curl http://localhost:6333/collections` +[Show] `redis-cli ping` +[Show] `curl http://localhost:11434/api/tags` + +All green? Perfect. 
+ +**Step 2: Install Python Dependencies** + +[Show] `pip3 install redis qdrant-client requests` + +**Step 3: Create Directory Structure** + +[Show] `mkdir -p skills/{mem-redis,qdrant-memory}/scripts memory` + +**Step 4: Copy the Scripts** + +Now we copy the scripts from the blueprint. I'm going to show you the key ones. + +[Show hb_append.py - explain the heartbeat logic] +[Show save_mem.py - explain Redis buffer] +[Show auto_store.py - explain Qdrant storage] +[Show search_memories.py - explain semantic search] + +Each script has a specific job. Let's trace through the data flow. + +When you say 'save mem', it calls save_mem.py which dumps all conversation turns to Redis. + +When you say 'save q', it calls auto_store.py which generates embeddings and stores to Qdrant. + +When you say 'q topic', it calls search_memories.py which converts your query to an embedding and finds similar vectors. + +**Step 5: Initialize Qdrant Collections** + +We need to create the collections before we can store anything. + +[Show] `python3 init_kimi_memories.py` + +This creates the collection with the right settings: 1024 dimensions, cosine similarity, user_id metadata. + +**Step 6: Test End-to-End** + +Let's save something. + +[Show] `python3 save_mem.py --user-id $(whoami)` + +Check Redis: + +[Show] `redis-cli LLEN mem:$(whoami)` + +See that? Our conversation is now in the buffer. + +Let's make it semantically searchable: + +[Show] `python3 auto_store.py` + +Now search for it: + +[Show] `python3 search_memories.py "your test query"` + +Boom! We just built a memory system." + +**Visual:** Code on left, terminal output on right + +--- + +### [25:00-30:00] Advanced Features + +**On screen:** Show additional scripts + +**Script:** + +"Once you have the basics, here are some advanced features. + +**Session Harvesting** — Got old OpenClaw sessions you want to import? Use harvest_sessions.py to bulk-import them into Qdrant. + +**Task Queue** — Want background jobs? 
The task-queue skill lets you queue tasks and execute them on heartbeat. + +**Email Integration** — Want your AI to check email? hb_check_email.py connects to Gmail and stores emails as memories. + +**QMD (Query Markdown)** — This is experimental but cool. It's a local-first hybrid search using BM25 + vectors. Works offline. + +Each of these extends the core system in different directions." + +**Visual:** Show each script running briefly + +--- + +### [30:00-32:00] Conclusion + +**On screen:** Summary slide with GitHub link + +**Script:** + +"So that's it! A complete Jarvis-like memory system for OpenClaw. + +We've built: +✅ Three-layer persistent memory +✅ Semantic search across all conversations +✅ User-centric storage (not session-based) +✅ Automatic daily backups +✅ Git-tracked audit trails + +The full blueprint is on GitHub — link in the description. It includes all the scripts, the install.sh one-command installer, docker-compose for infrastructure, and this complete documentation. + +If you build this, tag me on socials! I'd love to see your implementations. + +Questions? Drop them in the comments. If this was helpful, like and subscribe for more AI infrastructure content. + +Thanks for watching — now go build something that remembers! 🚀" + +**Visual:** End screen with subscribe button, social links + +--- + +## B-Roll / Screen Capture Checklist + +- [ ] Opening shot of architecture diagram +- [ ] Terminal showing `q` command working +- [ ] Redis CLI showing buffer size +- [ ] Qdrant web UI (if using) +- [ ] Daily Markdown file being opened +- [ ] Code editor showing scripts +- [ ] Docker Compose starting up +- [ ] Animated data flow diagram +- [ ] Search results appearing +- [ ] End screen with links + +## Thumbnail Ideas + +1. **Jarvis helmet** + "AI Memory" text +2. **Three-layer cake** diagram with labels +3. **Before/After split**: Goldfish vs. Elephant +4. 
**Terminal screenshot** with search results visible + +## Description Template + +``` +Build an AI assistant that actually REMEMBERS with this complete Jarvis-like memory system for OpenClaw. + +🧠 THREE-LAYER ARCHITECTURE: +• Redis buffer (fast, real-time) +• Daily file logs (human-readable) +• Qdrant vector DB (semantic search) + +🔧 WHAT YOU'LL LEARN: +• Multi-layer memory architecture +• Semantic search with embeddings +• User-centric storage (Mem0-style) +• Automatic backup systems +• Self-hosted infrastructure + +📦 RESOURCES: +Full blueprint: [GitHub link] +Docker Compose: Included +Install script: One-command setup + +⏱️ TIMESTAMPS: +0:00 - The Problem (AI goldfish) +2:00 - Live Demo +5:00 - Architecture Deep Dive +10:00 - Live Build +25:00 - Advanced Features +30:00 - Conclusion + +🛠️ STACK: +• OpenClaw +• Qdrant (vectors) +• Redis (buffer) +• Ollama (embeddings) + +#OpenClaw #AI #Memory #SelfHosted #Jarvis +``` + +## Tags for YouTube + +OpenClaw, AI Memory, Vector Database, Qdrant, Redis, Ollama, Self-Hosted AI, Jarvis AI, Memory Architecture, Semantic Search, Embeddings, LLM Memory + +--- + +## Follow-Up Video Ideas + +1. "Advanced Memory: Session Harvesting Tutorial" +2. "Building an AI Task Queue with Redis" +3. "Email Integration: AI That Reads Your Mail" +4. "QMD vs Qdrant: Which Memory System Should You Use?" +5. "Scaling Memory: From Personal to Multi-User" + +--- + +*Ready to record? Good luck! 🎬* diff --git a/config/HEARTBEAT.md b/config/HEARTBEAT.md new file mode 100644 index 0000000..fd293e0 --- /dev/null +++ b/config/HEARTBEAT.md @@ -0,0 +1,64 @@ +# HEARTBEAT.md + +# Keep this file empty (or with only comments) to skip heartbeat API calls. + +# Add tasks below when you want the agent to check something periodically. + +## Memory Buffer (Every Heartbeat) + +Saves ALL current session context to Redis short-term buffer. Runs automatically. +Does NOT clear buffer — preserves turns from other sessions until daily backup. 
+ +```bash +python3 /root/.openclaw/workspace/skills/mem-redis/scripts/save_mem.py --user-id YOUR_USER_ID +``` + +Multiple sessions per day accumulate in Redis. Daily cron (3:00 AM) flushes everything to Qdrant. + +## Email Check (Every Heartbeat) + +Checks Gmail for messages from authorized senders. **Respond to any new emails found.** + +```bash +python3 /root/.openclaw/workspace/skills/qdrant-memory/scripts/hb_check_email.py +``` + +**Authorized senders only:** `your_email@gmail.com`, `spouse_email@gmail.com` + +*Edit `skills/qdrant-memory/scripts/hb_check_email.py` to set your authorized senders* + +**When new email found:** +1. Read the email subject and body +2. Search Qdrant for relevant context about the topic +3. Respond to the email with a helpful reply +4. Store the email and your response to Qdrant for memory + +--- + +## Manual Mode Only + +All OTHER heartbeat actions are **manual only** when explicitly requested. + +### When User Requests: +- **Check delayed notifications:** I will manually check the queue + +### No Automatic Actions: +❌ Auto-sending notifications from queue +❌ Auto-logging heartbeat timestamps + +## Available Manual Commands + +```bash +# Check delayed notifications +redis-cli -h 10.0.0.36 LRANGE delayed:notifications 0 0 + +# Manual full context save to Redis (all current session turns) +python3 /root/.openclaw/workspace/skills/mem-redis/scripts/save_mem.py --user-id YOUR_USER_ID +``` + +## Daily Tasks + +- Redis → Qdrant backup (cron 3:00 AM): `cron_backup.py` +- File-based backup (cron 3:30 AM): `sliding_backup.sh` + +## Future Tasks (add as needed) diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..86071c2 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,71 @@ +version: '3.8' + +services: + # Vector Database - Long-term semantic memory + qdrant: + image: qdrant/qdrant:latest + container_name: qdrant-memory + ports: + - "6333:6333" + - "6334:6334" + volumes: + - qdrant-storage:/qdrant/storage 
+ environment: + - QDRANT__SERVICE__HTTP_PORT=6333 + restart: unless-stopped + # qdrant image does not ship with curl/wget; disable container-level healthcheck. + # Host-level checks (curl to localhost:6333) are sufficient. + # healthcheck: + # test: ["CMD", "sh", "-lc", ": + sh -c " + ollama serve & + sleep 5 + ollama pull snowflake-arctic-embed2 + wait + " + +volumes: + qdrant-storage: + driver: local + redis-data: + driver: local + ollama-models: + driver: local + +networks: + default: + name: memory-system + driver: bridge diff --git a/docs/MEM_DIAGRAM.md b/docs/MEM_DIAGRAM.md new file mode 100644 index 0000000..c759bee --- /dev/null +++ b/docs/MEM_DIAGRAM.md @@ -0,0 +1,738 @@ +# Memory System Architecture Diagrams + +**Created:** February 18, 2026 +**Updated:** February 18, 2026 (v2.0 - Added QMD, Task Queue, Session Harvesting, Email Integration) +**Purpose:** Complete backup of memory system architecture for Google Slides presentations + +--- + +## Table of Contents + +1. [Part 1: Built-in Memory System (OpenClaw Default)](#part-1-built-in-memory-system-openclaw-default) +2. [Part 2: Custom Memory System (What We Built)](#part-2-custom-memory-system-what-we-built) +3. [Part 3: Comparison — Built-in vs Custom](#part-3-comparison--built-in-vs-custom) +4. [Part 4: QMD (Query Markdown) — OpenClaw Experimental](#part-4-qmd-query-markdown--openclaw-experimental) +5. [Part 5: Task Queue System](#part-5-task-queue-system) +6. [Part 6: Session Harvesting](#part-6-session-harvesting) +7. [Part 7: Email Integration](#part-7-email-integration) +8. [Part 8: PROJECTNAME.md Workflow](#part-8-projectnamemd-workflow) +9. 
[Part 9: Complete Infrastructure Reference](#part-9-complete-infrastructure-reference) + +--- + +## Part 1: Built-in Memory System (OpenClaw Default) + +### Architecture Diagram + +``` +┌─────────────────────────────────────┐ +│ OpenClaw Gateway Service │ +│ (Manages session state & routing) │ +└──────────────┬──────────────────────┘ + │ + ┌──────▼──────┐ + │ Session │ + │ Context │ + │ (In-Memory) │ + └──────┬──────┘ + │ + ┌──────▼──────────────────┐ + │ Message History Buffer │ + │ (Last N messages) │ + │ Default: 8k-32k tokens │ + └──────┬──────────────────┘ + │ + ┌──────▼────────┐ + │ Model Input │ + │ (LLM Call) │ + └───────────────┘ +``` + +### How Built-in Memory Works + +**Process Flow:** +1. **User sends message** → Added to session context +2. **Context accumulates** in memory (not persistent) +3. **Model receives** last N messages as context +4. **Session ends** → Context is **LOST** + +**Key Characteristics:** +- ✅ Works automatically (no setup) +- ✅ Fast (in-memory) +- ❌ **Lost on /new or /reset** +- ❌ **Lost when session expires** +- ❌ No cross-session memory +- ❌ Limited context window (~8k-32k tokens) + +### Built-in Limitations + +| Feature | Status | +|---------|--------| +| Session Persistence | ❌ NO | +| Cross-Session Memory | ❌ NO | +| User-Centric Storage | ❌ NO | +| Long-Term Memory | ❌ NO | +| Semantic Search | ❌ NO | +| Conversation Threading | ❌ NO | +| Automatic Backup | ❌ NO | + +--- + +## Part 2: Custom Memory System (What We Built) + +### Complete Architecture Diagram + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ MULTI-LAYER MEMORY SYSTEM │ +├─────────────────────────────────────────────────────────────────────┤ +│ │ +│ LAYER 0: Real-Time Session Context (OpenClaw Gateway) │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ Session JSONL → Live context (temporary only) │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ 
┌──────────────────────────▼──────────────────────────────────┐ │ +│ │ LAYER 1: Redis Buffer (Fast Short-Term) │ │ +│ │ ├─ Key: mem:rob │ │ +│ │ ├─ Accumulates new turns since last check │ │ +│ │ ├─ Heartbeat: Append-only (hb_append.py) │ │ +│ │ ├─ Manual: Full dump (save_mem.py) │ │ +│ │ └─ Flush: Daily 3:00 AM → Qdrant │ │ +│ └──────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ┌──────────────────────────▼──────────────────────────────────┐ │ +│ │ LAYER 2: Daily File Logs (.md) │ │ +│ │ ├─ Location: memory/YYYY-MM-DD.md │ │ +│ │ ├─ Format: Human-readable Markdown │ │ +│ │ ├─ Backup: 3:30 AM sliding_backup.sh │ │ +│ │ └─ Retention: Permanent (git-tracked) │ │ +│ └──────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ┌──────────────────────────▼──────────────────────────────────┐ │ +│ │ LAYER 3: Qdrant Vector DB (Semantic Long-Term) │ │ +│ │ ├─ Host: 10.0.0.40:6333 │ │ +│ │ ├─ Embeddings: snowflake-arctic-embed2 (1024-dim) │ │ +│ │ ├─ Collections: │ │ +│ │ │ • kimi_memories (conversations) │ │ +│ │ │ • kimi_kb (knowledge base) │ │ +│ │ │ • private_court_docs (legal) │ │ +│ │ ├─ Deduplication: Content hash per user │ │ +│ │ └─ User-centric: user_id: "rob" │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ CROSS-CUTTING: Task Queue (Redis) │ │ +│ │ ├─ tasks:pending → tasks:active → tasks:completed │ │ +│ │ └─ Heartbeat worker for background jobs │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ CROSS-CUTTING: Email Integration (Gmail) │ │ +│ │ ├─ hb_check_email.py (Heartbeat) │ │ +│ │ └─ Authorized senders: your_email@gmail.com │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### Detailed Component Breakdown + +#### 
Component 1: Daily File Logs +- **Location:** `/root/.openclaw/workspace/memory/YYYY-MM-DD.md` +- **Format:** Markdown with timestamps +- **Content:** Full conversation history +- **Access:** Direct file read +- **Retention:** Permanent (until deleted) +- **Auto-created:** Yes, every session +- **Backup:** `sliding_backup.sh` at 3:30 AM + +#### Component 2: Redis Buffer (mem-redis skill) +- **Host:** `10.0.0.36:6379` +- **Key:** `mem:rob` +- **Type:** List (LPUSH append) +- **Purpose:** Fast access, multi-session accumulation +- **Flush:** Daily at 3:00 AM to Qdrant +- **No TTL:** Data persists until successfully backed up +- **Fail-safe:** If cron fails, data stays in Redis + +**Scripts:** +| Script | Purpose | +|--------|---------| +| `hb_append.py` | Heartbeat: Add NEW turns only | +| `save_mem.py` | Manual: Save ALL turns (with --reset option) | +| `cron_backup.py` | Daily: Process Redis → Qdrant → Clear Redis | +| `mem_retrieve.py` | Manual: Retrieve recent turns from Redis | +| `search_mem.py` | Search both Redis (exact) + Qdrant (semantic) | + +#### Component 3: Qdrant Vector Database +- **Host:** `http://10.0.0.40:6333` +- **Embeddings Model:** `snowflake-arctic-embed2` at `10.0.0.10:11434` +- **Vector Dimensions:** 1024 +- **User-Centric:** All memories tagged with `user_id: "rob"` +- **Cross-Chat Search:** Find info from ANY past conversation + +**Collections:** +| Collection | Purpose | Content | +|------------|---------|---------| +| `kimi_memories` | Personal conversations | User + AI messages | +| `kimi_kb` | Knowledge base | Web data, docs, tutorials | +| `private_court_docs` | Legal documents | Court files, legal research | + +#### Component 4: Full Context Mode (Mem0-Style) + +**3 Embeddings Per Turn:** +1. User message embedding +2. AI response embedding +3. 
Combined summary embedding + +**Threading Metadata:** +- `user_id`: "rob" (persistent identifier) +- `conversation_id`: Groups related turns +- `session_id`: Which chat instance +- `turn_number`: Sequential ordering + +#### Deduplication System + +**What It Is:** +A content-based duplicate detection system that prevents storing the exact same information multiple times for the same user. + +**How It Works:** +1. **Content Hash Generation:** Each memory generates a SHA-256 hash of its content +2. **Per-User Scope:** Deduplication is per-user (same content from different users = allowed) +3. **Pre-Storage Check:** Before storing to Qdrant, check if hash exists for this user +4. **Skip if Duplicate:** If hash exists → skip storage, return "already exists" +5. **Store if New:** If hash doesn't exist → generate embeddings and store + +**Deduplication by Layer:** + +| Layer | Deduplication | Behavior | +|-------|---------------|----------| +| **Daily Files** | ❌ No | All turns appended (intentional — audit trail) | +| **Redis Buffer** | ❌ No | All turns stored (temporary, flushed daily) | +| **Qdrant (kimi_memories)** | ✅ Yes | Per-user content hash check | +| **Qdrant (kimi_kb)** | ✅ Yes | Per-collection content hash check | + +### Complete Script Reference + +``` +/root/.openclaw/workspace/ +├── memory/ +│ └── YYYY-MM-DD.md (daily logs) +│ +├── skills/ +│ ├── mem-redis/ +│ │ └── scripts/ +│ │ ├── hb_append.py (heartbeat: new turns only) +│ │ ├── save_mem.py (manual: all turns) +│ │ ├── cron_backup.py (daily flush to Qdrant) +│ │ ├── mem_retrieve.py (read from Redis) +│ │ └── search_mem.py (search Redis + Qdrant) +│ │ +│ ├── qdrant-memory/ +│ │ └── scripts/ +│ │ ├── auto_store.py (immediate Qdrant storage) +│ │ ├── background_store.py (async storage) +│ │ ├── q_save.py (quick save trigger) +│ │ ├── daily_conversation_backup.py (file → Qdrant) +│ │ ├── get_conversation_context.py (retrieve threads) +│ │ ├── search_memories.py (semantic search) +│ │ ├── 
harvest_sessions.py (bulk import old sessions) +│ │ ├── harvest_newest.py (specific sessions) +│ │ ├── hb_check_email.py (email integration) +│ │ ├── sliding_backup.sh (file backup) +│ │ ├── kb_store.py / kb_search.py (knowledge base) +│ │ └── court_store.py / court_search.py (legal docs) +│ │ +│ └── task-queue/ +│ └── scripts/ +│ ├── heartbeat_worker.py (process tasks) +│ ├── add_task.py (add background task) +│ └── list_tasks.py (view queue status) +│ +└── MEMORY_DEF/ + ├── README.md + ├── daily-backup.md + └── agent-messaging.md +``` + +### Technical Flow + +#### Real-Time (Every Message) +``` +User Input → AI Response + ↓ +Redis Buffer (fast append) + ↓ +File Log (persistent) + ↓ +[Optional: "save q"] → Qdrant (semantic) +``` + +#### Heartbeat (Every ~30-60 min) +``` +hb_append.py → Check for new turns → Append to Redis +hb_check_email.py → Check Gmail → Process new emails +heartbeat_worker.py → Check task queue → Execute tasks +``` + +#### Daily Backup (3:00 AM & 3:30 AM) +``` +3:00 AM: Redis Buffer → Flush → Qdrant (kimi_memories) + └─> Clear Redis after successful write + +3:30 AM: Daily Files → sliding_backup.sh → Archive + └─> daily_conversation_backup.py → Qdrant +``` + +#### On Retrieval ("search q" or "q ") +``` +Search Query + ↓ +search_mem.py + ├──► Redis (exact text match, recent) + └──► Qdrant (semantic similarity, long-term) + ↓ +Combined Results (Redis first, then Qdrant) + ↓ +Return context-enriched response +``` + +--- + +## Part 3: Comparison — Built-in vs Custom + +### Feature Comparison Table + +| Feature | Built-in | Custom System | +|---------|----------|---------------| +| **Session Persistence** | ❌ Lost on reset | ✅ Survives forever | +| **Cross-Session Memory** | ❌ None | ✅ All sessions linked | +| **User-Centric** | ❌ Session-based | ✅ User-based (Mem0-style) | +| **Semantic Search** | ❌ None | ✅ Full semantic retrieval | +| **Conversation Threading** | ❌ Linear only | ✅ Thread-aware | +| **Long-Term Storage** | ❌ Hours only | ✅ 
Permanent (disk + vector) | +| **Backup & Recovery** | ❌ None | ✅ Multi-layer redundancy | +| **Privacy** | ⚠️ Cloud dependent | ✅ Fully local/self-hosted | +| **Speed** | ✅ Fast (RAM) | ✅ Fast (Redis) + Deep (Qdrant) | +| **Cost** | ❌ OpenAI API tokens | ✅ Free (local infrastructure) | +| **Embeddings** | ❌ None | ✅ 1024-dim (snowflake) | +| **Cross-Reference** | ❌ None | ✅ Links related memories | +| **Task Queue** | ❌ None | ✅ Background job processing | +| **Email Integration** | ❌ None | ✅ Gmail via Pub/Sub | +| **Deduplication** | ❌ None | ✅ Content hash-based | + +### Why It's Better — Key Advantages + +#### 1. Mem0-Style Architecture +- Memories follow the **USER**, not the session +- Ask "what did I say about X?" → finds from **ANY** past conversation +- Persistent identity across all chats + +#### 2. Hybrid Storage Strategy +- **Redis:** Speed (real-time access) +- **Files:** Durability (never lost, human-readable) +- **Qdrant:** Intelligence (semantic search, similarity) + +#### 3. Multi-Modal Retrieval +- **Exact match:** File grep, exact text search +- **Semantic search:** Vector similarity, conceptual matching +- **Thread reconstruction:** Conversation_id grouping + +#### 4. Local-First Design +- No cloud dependencies +- No API costs (except initial setup) +- Full privacy control +- Works offline +- Self-hosted infrastructure + +#### 5. Triple Redundancy +| Layer | Purpose | Persistence | +|-------|---------|-------------| +| Redis | Speed | Temporary (daily flush) | +| Files | Durability | Permanent | +| Qdrant | Intelligence | Permanent | + +--- + +## Part 4: QMD (Query Markdown) — OpenClaw Experimental + +### What is QMD? + +**QMD** = **Query Markdown** — OpenClaw's experimental local-first memory backend that replaces the built-in SQLite indexer. 
+ +**Key Difference:** +- Current system: SQLite + vector embeddings +- QMD: **BM25 + vectors + reranking** in a standalone binary + +### QMD Architecture + +``` +┌─────────────────────────────────────────────┐ +│ QMD Sidecar (Experimental) │ +│ ├─ BM25 (exact token matching) │ +│ ├─ Vector similarity (semantic) │ +│ └─ Reranking (smart result ordering) │ +└──────────────────┬──────────────────────────┘ + │ + ┌──────────▼──────────┐ + │ Markdown Source │ + │ memory/*.md │ + │ MEMORY.md │ + └─────────────────────┘ +``` + +### QMD vs Current System + +| Feature | Current (Qdrant) | QMD (Experimental) | +|---------|------------------|-------------------| +| **Storage** | Qdrant server (10.0.0.40) | Local SQLite + files | +| **Network** | Requires network | Fully offline | +| **Search** | Vector only | Hybrid (BM25 + vector) | +| **Exact tokens** | Weak | Strong (BM25) | +| **Embeddings** | snowflake-arctic-embed2 | Local GGUF models | +| **Git-friendly** | ❌ Opaque vectors | ✅ Markdown source | +| **Explainable** | Partial | Full (file.md#L12 citations) | +| **Status** | Production | Experimental | + +### When QMD Might Be Better + +✅ **Use QMD if:** +- You want **full offline** operation (no 10.0.0.40 dependency) +- You frequently search for **exact tokens** (IDs, function names, error codes) +- You want **human-editable** memory files +- You want **git-tracked** memory that survives system rebuilds + +❌ **Stick with Qdrant if:** +- Your current system is stable +- You need **multi-device** access to same memory +- You're happy with **semantic-only** search +- You need **production reliability** + +### QMD Configuration (OpenClaw) + +```json5 +memory: { + backend: "qmd", + citations: "auto", + qmd: { + includeDefaultMemory: true, + update: { interval: "5m", debounceMs: 15000 }, + limits: { maxResults: 6, timeoutMs: 4000 }, + paths: [ + { name: "docs", path: "~/notes", pattern: "**/*.md" } + ] + } +} +``` + +### QMD Prerequisites + +```bash +# Install QMD binary +bun 
install -g https://github.com/tobi/qmd + +# Install SQLite with extensions (macOS) +brew install sqlite + +# QMD auto-downloads GGUF models on first run (~0.6GB) +``` + +--- + +## Part 5: Task Queue System + +### Architecture + +``` +┌─────────────────────────────────────────────┐ +│ Redis Task Queue │ +│ ├─ tasks:pending (FIFO) │ +│ ├─ tasks:active (currently running) │ +│ ├─ tasks:completed (history) │ +│ └─ task:{id} (hash with details) │ +└──────────────────┬────────────────────────┘ + │ + ┌──────────▼──────────┐ + │ Heartbeat Worker │ + │ heartbeat_worker.py│ + └─────────────────────┘ +``` + +### Task Fields +- `id` - Unique task ID +- `description` - What to do +- `status` - pending/active/completed/failed +- `created_at` - Timestamp +- `created_by` - Who created the task +- `result` - Output from execution + +### Usage + +```bash +# Add a task +python3 skills/task-queue/scripts/add_task.py "Check server disk space" + +# List tasks +python3 skills/task-queue/scripts/list_tasks.py + +# Heartbeat auto-executes pending tasks +python3 skills/task-queue/scripts/heartbeat_worker.py +``` + +--- + +## Part 6: Session Harvesting + +### What is Session Harvesting? + +Bulk import of historical OpenClaw session JSONL files into Qdrant memory. + +### When to Use + +- After setting up new memory system → backfill existing sessions +- After discovering missed backups → recover data +- Periodically → if cron jobs missed data + +### Scripts + +| Script | Purpose | +|--------|---------| +| `harvest_sessions.py` | Auto-harvest (limited by memory) | +| `harvest_newest.py` | Specific sessions (recommended) | + +### Usage + +```bash +# Harvest specific sessions (recommended) +python3 harvest_newest.py --user-id rob session-1.jsonl session-2.jsonl + +# Find newest sessions to harvest +ls -t /root/.openclaw/agents/main/sessions/*.jsonl | head -20 + +# Auto-harvest with limit +python3 harvest_sessions.py --user-id rob --limit 10 +``` + +### How It Works + +1. 
**Parse** → Reads JSONL session file +2. **Pair** → Matches user message with AI response +3. **Embed** → Generates 3 embeddings (user, AI, summary) +4. **Deduplicate** → Checks content_hash before storing +5. **Store** → Upserts to Qdrant with user_id, conversation_id + +--- + +## Part 7: Email Integration + +### Architecture + +``` +┌─────────────────────────────────────────────┐ +│ Gmail Inbox │ +│ (your_email@gmail.com) │ +└──────────────────┬──────────────────────────┘ + │ + ┌──────────▼──────────┐ + │ hb_check_email.py │ + │ (Heartbeat) │ + └─────────────────────┘ +``` + +### Authorized Senders +- `your_email@gmail.com` (Configure in hb_check_email.py) +- Add more as needed + +### Usage + +```bash +# Check emails (runs automatically in heartbeat) +python3 skills/qdrant-memory/scripts/hb_check_email.py +``` + +### How It Works +1. Polls Gmail for new messages +2. Filters by authorized senders +3. Reads subject and body +4. Searches Qdrant for context +5. Responds with helpful reply +6. Stores email + response to Qdrant + +--- + +## Part 8: PROJECTNAME.md Workflow + +*See original document for full details — this is a summary reference.* + +### Purpose +Preserve context, decisions, and progress across sessions. + +### The Golden Rule — Append Only +**NEVER Overwrite. 
ALWAYS Append.** + +### File Structure Template +```markdown +# PROJECTNAME.md + +## Project Overview +- **Goal:** What we're achieving +- **Scope:** What's in/out +- **Success Criteria:** How we know it's done + +## Current Status +- [x] Completed tasks +- [ ] In progress +- [ ] Upcoming + +## Decisions Log +| Date | Decision | Rationale | +|------|----------|-----------| +| 2026-02-18 | Use X over Y | Because of Z | + +## Technical Details +- Infrastructure specs +- Code snippets +- Configuration + +## Blockers & Risks +- What's blocking progress +- Known issues + +## Next Steps +- Immediate actions +- Questions to resolve +``` + +### Real Examples + +| File | Project | Status | +|------|---------|--------| +| `MEM_DIAGRAM.md` | Memory system documentation | ✅ Active | +| `AUDIT-PLAN.md` | OpenClaw infrastructure audit | ✅ Completed | +| `YOUTUBE_UPDATE.md` | Video description optimization | 🔄 Ongoing | + +--- + +## Part 9: Complete Infrastructure Reference + +### Hardware/Network Topology + +``` +┌────────────────────────────────────────────────────────────────┐ +│ PROXMOX CLUSTER │ +├────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Ollama │ │ Qdrant │ │ Redis │ │ +│ │ 10.0.0.10 │ │ 10.0.0.40 │ │ 10.0.0.36 │ │ +│ │ GPU Node │ │ LXC │ │ LXC │ │ +│ │ Embeddings │ │ Vector DB │ │ Task Queue │ │ +│ │ 11434 │ │ 6333 │ │ 6379 │ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ │ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ SearXNG │ │ Kokoro TTS │ │ OpenClaw │ │ +│ │ 10.0.0.8 │ │ 10.0.0.228 │ │ Workspace │ │ +│ │ Search │ │ Voice │ │ Kimi │ │ +│ │ 8888 │ │ 8880 │ │ │ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ │ +│ │ +└────────────────────────────────────────────────────────────────┘ +``` + +### Service Reference + +| Service | Purpose | Address | Model/Version | +|---------|---------|---------|-------------| +| Qdrant | Vector database | 
10.0.0.40:6333 | v1.x | +| Redis | Buffer + tasks | 10.0.0.36:6379 | v7.x | +| Ollama | Embeddings | 10.0.0.10:11434 | snowflake-arctic-embed2 | +| SearXNG | Search | 10.0.0.8:8888 | Local | +| Kokoro TTS | Voice | 10.0.0.228:8880 | TTS | + +### Daily Automation Schedule + +| Time | Task | Script | +|------|------|--------| +| 3:00 AM | Redis → Qdrant flush | `cron_backup.py` | +| 3:30 AM | File-based sliding backup | `sliding_backup.sh` | +| Every 30-60 min | Heartbeat checks | `hb_append.py`, `hb_check_email.py` | + +### Manual Triggers + +| Command | What It Does | +|---------|--------------| +| `"save mem"` | Save ALL context to Redis + File | +| `"save q"` | Immediate Qdrant storage | +| `"q "` | Semantic search | +| `"search q "` | Full semantic search | +| `"remember this"` | Quick note to daily file | +| `"check messages"` | Check Redis for agent messages | +| `"send to Max"` | Send message to Max via Redis | + +### Environment Variables + +```bash +# Qdrant +QDRANT_URL=http://10.0.0.40:6333 + +# Redis +REDIS_HOST=10.0.0.36 +REDIS_PORT=6379 + +# Ollama +OLLAMA_URL=http://10.0.0.10:11434 + +# User +DEFAULT_USER_ID=rob +``` + +--- + +## Version History + +| Date | Version | Changes | +|------|---------|---------| +| 2026-02-18 | 1.0 | Initial documentation | +| 2026-02-18 | 2.0 | Added QMD, Task Queue, Session Harvesting, Email Integration, complete script reference | + +--- + +## Quick Reference Card + +### Memory Commands +``` +save mem → Redis + File (all turns) +save q → Qdrant (semantic, embeddings) +q → Search Qdrant +remember this → Quick note to file +``` + +### Architecture Layers +``` +Layer 0: Session Context (temporary) +Layer 1: Redis Buffer (fast, 3:00 AM flush) +Layer 2: File Logs (permanent, human-readable) +Layer 3: Qdrant (semantic, searchable) +``` + +### Key Files +``` +memory/YYYY-MM-DD.md → Daily conversation logs +MEMORY.md → Curated long-term memory +MEMORY_DEF/*.md → System documentation +skills/*/scripts/*.py → Automation scripts 
+``` + +### Infrastructure +``` +10.0.0.40:6333 → Qdrant (vectors) +10.0.0.36:6379 → Redis (buffer + tasks) +10.0.0.10:11434 → Ollama (embeddings) +``` + +--- + +*This document serves as the complete specification for the memory system.* +*For questions or updates, see MEMORY.md or the SKILL.md files in each skill directory.* diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..2780e73 --- /dev/null +++ b/install.sh @@ -0,0 +1,463 @@ +#!/bin/bash +# OpenClaw Jarvis-Like Memory System - Installation Script +# This script sets up the complete memory system from scratch + +set -e + +# Colors for output +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Configuration +WORKSPACE_DIR="${WORKSPACE_DIR:-$HOME/.openclaw/workspace}" +USER_ID="${USER_ID:-$(whoami)}" +REDIS_HOST="${REDIS_HOST:-127.0.0.1}" +REDIS_PORT="${REDIS_PORT:-6379}" +QDRANT_URL="${QDRANT_URL:-http://127.0.0.1:6333}" +OLLAMA_URL="${OLLAMA_URL:-http://127.0.0.1:11434}" + +# Optional toggles (avoid touching host config during tests) +SKIP_CRON="${SKIP_CRON:-0}" +SKIP_HEARTBEAT="${SKIP_HEARTBEAT:-0}" +SKIP_QDRANT_INIT="${SKIP_QDRANT_INIT:-0}" + +# If Redis/Qdrant/Ollama aren't reachable, attempt to start them via docker compose (recommended). 
+START_DOCKER="${START_DOCKER:-1}" + +# Backup directory +BACKUP_DIR="$WORKSPACE_DIR/.backups" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +BACKUP_PREFIX="$BACKUP_DIR/install_${TIMESTAMP}" + +echo "═══════════════════════════════════════════════════════════════" +echo " OpenClaw Jarvis-Like Memory System - Installer" +echo "═══════════════════════════════════════════════════════════════" +echo "" +echo -e "${BLUE}Backup Location: $BACKUP_DIR${NC}" +echo "" + +# Create backup directory +mkdir -p "$BACKUP_DIR" + +# Function to backup a file before modifying +backup_file() { + local file="$1" + local backup_name="$2" + if [ -f "$file" ]; then + cp "$file" "$backup_name" + echo -e "${GREEN} ✓ Backed up: $(basename $file) → $(basename $backup_name)${NC}" + return 0 + fi + return 1 +} + +# Helpers +have_cmd() { command -v "$1" >/dev/null 2>&1; } + +need_sudo() { + if [ "$(id -u)" -eq 0 ]; then + return 1 + fi + return 0 +} + +run_root() { + if need_sudo; then + sudo "$@" + else + "$@" + fi +} + +install_pkg_debian() { + local pkgs=("$@") + run_root apt-get update -y + run_root apt-get install -y "${pkgs[@]}" +} + +install_docker_debian() { + # Prefer distro packages for speed/simplicity. + install_pkg_debian ca-certificates curl gnupg lsb-release + if ! have_cmd docker; then + install_pkg_debian docker.io + run_root systemctl enable --now docker >/dev/null 2>&1 || true + fi + # docker compose v2 plugin (preferred) + if ! docker compose version >/dev/null 2>&1; then + install_pkg_debian docker-compose-plugin || true + fi + # fallback: docker-compose v1 + if ! docker compose version >/dev/null 2>&1 && ! have_cmd docker-compose; then + install_pkg_debian docker-compose || true + fi +} + +# Step 1: Check/install system dependencies +echo -e "${YELLOW}[1/10] Checking system dependencies...${NC}" + +if ! 
have_cmd curl; then + echo " • Installing curl" + if have_cmd apt-get; then + install_pkg_debian curl + else + echo -e "${RED} ✗ Missing curl and no supported package manager detected.${NC}" + exit 1 + fi +fi + +# Useful for quick testing (optional) +if ! have_cmd redis-cli; then + echo " • Installing redis-cli (redis-tools)" + if have_cmd apt-get; then + install_pkg_debian redis-tools + fi +fi + +if ! have_cmd python3; then + echo " • Installing python3" + if have_cmd apt-get; then + install_pkg_debian python3 python3-pip python3-venv + else + echo -e "${RED} ✗ Python 3 not found. Please install Python 3.8+${NC}" + exit 1 + fi +fi + +if ! have_cmd pip3; then + echo " • Installing pip3" + if have_cmd apt-get; then + install_pkg_debian python3-pip python3-venv + else + echo -e "${RED} ✗ pip3 not found. Please install python3-pip${NC}" + exit 1 + fi +fi + +# Docker is optional but recommended (used by docker-compose.yml) +if ! have_cmd docker; then + echo " • Docker not found — installing" + if have_cmd apt-get; then + install_docker_debian + else + echo -e "${RED} ✗ Docker not found and no supported package manager detected.${NC}" + echo " Install Docker manually, then re-run install.sh" + exit 1 + fi +else + echo " ✓ Docker found" +fi + +# Docker compose (v2 plugin preferred) +if docker compose version >/dev/null 2>&1; then + echo " ✓ docker compose (v2) found" +elif have_cmd docker-compose; then + echo " ✓ docker-compose (v1) found" +else + echo " • docker compose not found — attempting install" + if have_cmd apt-get; then + install_docker_debian + fi +fi + +PYTHON_VERSION=$(python3 --version 2>&1 | awk '{print $2}') +echo " ✓ Python $PYTHON_VERSION found" + +# Step 2: Create directory structure + copy blueprint files +echo -e "${YELLOW}[2/10] Creating directory structure...${NC}" +mkdir -p "$WORKSPACE_DIR"/{skills/{mem-redis,qdrant-memory,task-queue}/scripts,memory,MEMORY_DEF,docs,config} +touch "$WORKSPACE_DIR/memory/.gitkeep" + +# Copy scripts/docs from this 
repo into workspace (portable install) +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +cp -r "$SCRIPT_DIR/skills"/* "$WORKSPACE_DIR/skills/" 2>/dev/null || true +cp -r "$SCRIPT_DIR/docs"/* "$WORKSPACE_DIR/docs/" 2>/dev/null || true +cp -r "$SCRIPT_DIR/config"/* "$WORKSPACE_DIR/config/" 2>/dev/null || true + +echo " ✓ Directories created and files copied" + +# Step 3: Install Python dependencies (PEP 668-safe) +echo -e "${YELLOW}[3/10] Installing Python dependencies...${NC}" +REQ_FILE="$(dirname "$0")/requirements.txt" +PYTHON_BIN="python3" + +pip_user_install() { + if [ -f "$REQ_FILE" ]; then + pip3 install --user -r "$REQ_FILE" + else + pip3 install --user redis qdrant-client requests urllib3 + fi +} + +pip_venv_install() { + local venv_dir="$WORKSPACE_DIR/.venv" + python3 -m venv "$venv_dir" + "$venv_dir/bin/pip" install --upgrade pip setuptools wheel >/dev/null 2>&1 || true + if [ -f "$REQ_FILE" ]; then + "$venv_dir/bin/pip" install -r "$REQ_FILE" + else + "$venv_dir/bin/pip" install redis qdrant-client requests urllib3 + fi + PYTHON_BIN="$venv_dir/bin/python" +} + +# Try user install first (fast path) +if pip_user_install >/dev/null 2>&1; then + echo " ✓ Dependencies installed (pip --user)" +else + echo " ℹ️ pip --user install blocked (PEP 668). 
Creating venv in $WORKSPACE_DIR/.venv" + pip_venv_install + echo " ✓ Dependencies installed (venv)" +fi + +# Step 4: Test infrastructure connectivity +echo -e "${YELLOW}[4/10] Testing infrastructure...${NC}" + +docker_compose_up() { + local compose_dir="$(cd "$(dirname "$0")" && pwd)" + if docker compose version >/dev/null 2>&1; then + run_root docker compose -f "$compose_dir/docker-compose.yml" up -d + elif have_cmd docker-compose; then + run_root docker-compose -f "$compose_dir/docker-compose.yml" up -d + fi +} + +redis_ok=0 +qdrant_ok=0 +ollama_ok=0 + +# Test Redis +if "$PYTHON_BIN" -c "import redis; r=redis.Redis(host='$REDIS_HOST', port=$REDIS_PORT); r.ping()" 2>/dev/null; then + redis_ok=1 +fi + +# Test Qdrant +if curl -s "$QDRANT_URL/collections" >/dev/null 2>&1; then + qdrant_ok=1 +fi + +# Test Ollama +if curl -s "$OLLAMA_URL/api/tags" >/dev/null 2>&1; then + ollama_ok=1 +fi + +if [ "$START_DOCKER" = "1" ] && { [ $redis_ok -eq 0 ] || [ $qdrant_ok -eq 0 ] || [ $ollama_ok -eq 0 ]; }; then + echo " ℹ️ Infrastructure not reachable; attempting to start via docker compose" + docker_compose_up || true + sleep 2 + if "$PYTHON_BIN" -c "import redis; r=redis.Redis(host='$REDIS_HOST', port=$REDIS_PORT); r.ping()" 2>/dev/null; then redis_ok=1; fi + if curl -s "$QDRANT_URL/collections" >/dev/null 2>&1; then qdrant_ok=1; fi + if curl -s "$OLLAMA_URL/api/tags" >/dev/null 2>&1; then ollama_ok=1; fi +fi + +if [ $redis_ok -eq 1 ]; then + echo " ✓ Redis connection OK" +else + echo -e "${RED} ✗ Redis connection failed ($REDIS_HOST:$REDIS_PORT)${NC}" +fi + +if [ $qdrant_ok -eq 1 ]; then + echo " ✓ Qdrant connection OK" +else + echo -e "${RED} ✗ Qdrant connection failed ($QDRANT_URL)${NC}" +fi + +if [ $ollama_ok -eq 1 ]; then + echo " ✓ Ollama connection OK" +else + echo -e "${YELLOW} ⚠️ Ollama not reachable ($OLLAMA_URL)${NC}" + echo " Embeddings will fail until Ollama is running with snowflake-arctic-embed2." 
+fi + +# Step 5: Backup existing files before modifying +echo "" +echo -e "${YELLOW}[5/10] Creating backups of existing files...${NC}" +BACKUP_COUNT=0 + +# Backup existing crontab +if crontab -l 2>/dev/null >/dev/null; then + crontab -l > "${BACKUP_PREFIX}_crontab.bak.rush" 2>/dev/null + echo -e "${GREEN} ✓ Backed up crontab → .backups/install_${TIMESTAMP}_crontab.bak.rush${NC}" + ((BACKUP_COUNT++)) +else + echo " ℹ️ No existing crontab to backup" +fi + +# Backup existing HEARTBEAT.md +if backup_file "$WORKSPACE_DIR/HEARTBEAT.md" "${BACKUP_PREFIX}_HEARTBEAT.md.bak.rush"; then + ((BACKUP_COUNT++)) +fi + +# Backup existing .memory_env +if backup_file "$WORKSPACE_DIR/.memory_env" "${BACKUP_PREFIX}_memory_env.bak.rush"; then + ((BACKUP_COUNT++)) +fi + +if [ $BACKUP_COUNT -eq 0 ]; then + echo " ℹ️ No existing files to backup (fresh install)" +else + echo -e "${GREEN} ✓ $BACKUP_COUNT file(s) backed up to $BACKUP_DIR${NC}" +fi + +# Step 6: Create environment configuration +echo "" +echo -e "${YELLOW}[6/10] Creating environment configuration...${NC}" +cat > "$WORKSPACE_DIR/.memory_env" </dev/null > "$CRON_FILE" || true + + # Add memory backup cron jobs if not present + if ! grep -q "cron_backup.py" "$CRON_FILE" 2>/dev/null; then + echo "" >> "$CRON_FILE" + echo "# Memory System - Daily backup (3:00 AM)" >> "$CRON_FILE" + echo "0 3 * * * cd $WORKSPACE_DIR && $PYTHON_BIN skills/mem-redis/scripts/cron_backup.py >> /var/log/memory-backup.log 2>&1 || true" >> "$CRON_FILE" + fi + + if ! 
grep -q "sliding_backup.sh" "$CRON_FILE" 2>/dev/null; then + echo "" >> "$CRON_FILE" + echo "# Memory System - File backup (3:30 AM)" >> "$CRON_FILE" + echo "30 3 * * * $WORKSPACE_DIR/skills/qdrant-memory/scripts/sliding_backup.sh >> /var/log/memory-backup.log 2>&1 || true" >> "$CRON_FILE" + fi + + crontab "$CRON_FILE" + rm "$CRON_FILE" + echo " ✓ Cron jobs configured" +fi + +# Step 9: Create HEARTBEAT.md template +echo -e "${YELLOW}[9/10] Creating HEARTBEAT.md...${NC}" +if [ "$SKIP_HEARTBEAT" = "1" ]; then + echo " ℹ️ SKIP_HEARTBEAT=1 set; skipping HEARTBEAT.md write" +else + cat > "$WORKSPACE_DIR/HEARTBEAT.md" <<'EOF' +# HEARTBEAT.md - Memory System Automation + +## Memory Buffer (Every Heartbeat) + +Saves current session context to Redis buffer: + +```bash +python3 ~/.openclaw/workspace/skills/mem-redis/scripts/save_mem.py --user-id YOUR_USER_ID +``` + +## Daily Backup Schedule + +- **3:00 AM**: Redis buffer → Qdrant flush +- **3:30 AM**: File-based sliding backup + +## Manual Commands + +| Command | Action | +|---------|--------| +| `save mem` | Save all context to Redis | +| `save q` | Store immediately to Qdrant | +| `q ` | Search memories | + +EOF + echo " ✓ HEARTBEAT.md created" +fi + +# Create backup manifest +echo "" +echo -e "${YELLOW}Creating backup manifest...${NC}" +MANIFEST_FILE="${BACKUP_PREFIX}_MANIFEST.txt" +cat > "$MANIFEST_FILE" <> "$MANIFEST_FILE" + fi +done + +cat >> "$MANIFEST_FILE" < + + + + + + + + Memory Architecture Diagrams | SpeedyFoxAI + + + + + + + + + + +
+
+
+ + Architecture Reference +
+

+ Memory Architecture
+ Complete Diagrams +

+

+ Visual reference for the three-layer Jarvis-like memory system. Redis buffer, Markdown logs, and Qdrant vector database working together. +

+
+
+ + +
+
+

System Architecture Overview

+ + +
+
+
0
+

Session Context (OpenClaw Gateway)

+ Temporary Only +
+
+

Session JSONL → Live Context

+

Lost on /reset or session expiration • ~8k-32k tokens

+
+
+ +
+ + +
+
+
1
+

Redis Buffer (Fast Short-Term)

+ Real-Time +
+
+
+

Key: mem:rob

+

List data structure • LPUSH append

+
+
+

Flush: Daily 3:00 AM

+

cron_backup.py → Qdrant

+
+
+
+ hb_append.py + save_mem.py + mem:rob +
+
+ +
+ + +
+
+
2
+

Daily File Logs (.md)

+ Persistent +
+
+
+

Location: memory/YYYY-MM-DD.md

+

Human-readable Markdown

+
+
+

Git-tracked

+

Never lost • Always accessible

+
+
+
+ sliding_backup.sh + 3:30 AM daily +
+
+ +
+ + +
+
+
3
+

Qdrant Vector DB (Semantic Long-Term)

+ Searchable +
+
+
+

Embeddings

+

snowflake-arctic-embed2
1024 dimensions

+
+
+

Collections

+

kimi_memories
kimi_kb
private_court_docs

+
+
+

User-Centric

+

user_id: "rob"
Cross-session search

+
+
+
+ auto_store.py + search_memories.py + q +
+
+
+
+ + +
+
+

Data Flow & Automation

+ +
+ +
+
+
+ +
+

Real-Time

+
+
+User Input → AI Response + ↓ +Redis Buffer (fast) + ↓ +File Log (persistent) + ↓ +[Optional: save q] → Qdrant +
+

Every message triggers automatic append to Redis and file log.

+
+ + +
+
+
+ +
+

Heartbeat

+
+
+Every 30-60 min: + hb_append.py → New turns + hb_check_email.py → Gmail + heartbeat_worker.py → Tasks +
+

Periodic checks accumulate new turns since last save.

+
+ + +
+
+
+ +
+

Daily Backup

+
+
+3:00 AM: Redis → Qdrant + Clear Redis + +3:30 AM: Files → Backup + → Qdrant embeddings +
+

Nightly cron jobs flush buffers to permanent storage.

+
+
+
+
+ + +
+
+

Infrastructure Topology

+ +
+
+ Proxmox Cluster +
+ +
+
+
+ +
+

Ollama

+

10.0.0.10:11434

+

Embeddings

+
+ +
+
+ +
+

Qdrant

+

10.0.0.40:6333

+

Vector DB

+
+ +
+
+ +
+

Redis

+

10.0.0.36:6379

+

Buffer + Task Queue

+
+ +
+
+ +
+

SearXNG

+

10.0.0.8:8888

+

Search

+
+
+
+
+
+ + +
+
+

Command Reference

+ +
+
+

+ + User Commands +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Command | Action | Layer
save mem | Save all context | Redis + File
save q | Store to Qdrant | Vector DB
q topic | Semantic search | Vector DB
remember | Quick note | File
+
+ +
+

+ + Automation +

+ + + + + + + + + + + + + + + + + + + + + + + + + +
Schedule | Script | Action
Heartbeat | hb_append.py | New turns → Redis
3:00 AM | cron_backup.py | Redis → Qdrant
3:30 AM | sliding_backup.sh | File → Backup
+
+
+
+
+ + +
+
+
+

Get the Complete Blueprint

+

+ Download the full Jarvis Memory blueprint with all scripts, documentation, and installation guide. +

+ + + Go to Downloads + +
+
+ + +
+
+
+
SpeedyFoxAI
+
+ © 2026 SpeedyFoxAI. All rights reserved. +
+
+ Home + Downloads +
+
+
+
+ + + + + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..41bd0ad --- /dev/null +++ b/requirements.txt @@ -0,0 +1,20 @@ +# Python dependencies for Jarvis-Like Memory System +# Install with: pip3 install -r requirements.txt + +# Redis client for buffer layer +redis>=5.0.0 + +# Qdrant client for vector database +qdrant-client>=1.7.0 + +# HTTP requests for API calls +requests>=2.31.0 +urllib3>=2.0.0 + +# Date/time handling +python-dateutil>=2.8.0 + +# For Google integration (optional) +# google-auth-oauthlib>=1.0.0 +# google-auth-httplib2>=0.1.0 +# google-api-python-client>=2.100.0 diff --git a/skills/mem-redis/SKILL.md b/skills/mem-redis/SKILL.md new file mode 100644 index 0000000..3eb5169 --- /dev/null +++ b/skills/mem-redis/SKILL.md @@ -0,0 +1,42 @@ +# Memory Buffer Skill + +Redis-based short-term memory buffer for OpenClaw. + +## What It Does + +Accumulates conversation turns in real-time and flushes to Qdrant daily. + +## Commands + +```bash +# Manual save (all turns) +python3 scripts/save_mem.py --user-id yourname + +# Retrieve from buffer +python3 scripts/mem_retrieve.py --limit 10 + +# Search Redis + Qdrant +python3 scripts/search_mem.py "your query" +``` + +## Heartbeat Integration + +Add to HEARTBEAT.md: +```bash +python3 /path/to/skills/mem-redis/scripts/hb_append.py +``` + +## Cron + +```bash +# Daily flush at 3:00 AM +0 3 * * * python3 scripts/cron_backup.py +``` + +## Files + +- `hb_append.py` - Heartbeat: append new turns only +- `save_mem.py` - Manual: save all turns +- `cron_backup.py` - Daily: flush to Qdrant +- `mem_retrieve.py` - Read from Redis +- `search_mem.py` - Search Redis + Qdrant diff --git a/skills/mem-redis/scripts/cron_backup.py b/skills/mem-redis/scripts/cron_backup.py new file mode 100755 index 0000000..b7fed86 --- /dev/null +++ b/skills/mem-redis/scripts/cron_backup.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +""" +Daily Cron: Process Redis buffer → Qdrant → Clear 
Redis.
+
+This script runs once daily (via cron) to move buffered conversation
+turns from Redis to durable Qdrant storage. Only clears Redis after
+successful Qdrant write.
+
+Usage: python3 cron_backup.py [--user-id rob] [--dry-run]
+"""
+
+import os
+import sys
+import json
+import redis
+import argparse
+from datetime import datetime, timezone
+from pathlib import Path
+
+# Add qdrant-memory to path (portable)
+from pathlib import Path as _Path
+WORKSPACE = _Path(os.getenv("OPENCLAW_WORKSPACE", str(_Path.home() / ".openclaw" / "workspace")))
+sys.path.insert(0, str(WORKSPACE / "skills" / "qdrant-memory" / "scripts"))
+
+try:
+    from auto_store import store_conversation_turn
+    QDRANT_AVAILABLE = True
+except ImportError:
+    QDRANT_AVAILABLE = False
+    print("Warning: Qdrant storage not available; Redis buffer will be preserved", file=sys.stderr)
+
+# Config
+REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
+REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
+USER_ID = os.getenv("USER_ID", "yourname")
+
+def get_redis_items(user_id):
+    """Return (turns, key) for the list at mem:<user_id>, or (None, None) on any Redis error."""
+    try:
+        r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
+        key = f"mem:{user_id}"
+
+        # Get all items (0 to -1 = entire list)
+        items = r.lrange(key, 0, -1)
+
+        # Parse JSON
+        turns = []
+        for item in items:
+            try:
+                turn = json.loads(item)
+                turns.append(turn)
+            except json.JSONDecodeError:
+                continue  # malformed entries are skipped, not fatal
+
+        return turns, key
+    except Exception as e:
+        print(f"Error reading from Redis: {e}", file=sys.stderr)
+        return None, None
+
+def store_to_qdrant(turns, user_id):
+    """Store every user turn (paired with the next assistant reply) to Qdrant; True only if all were stored/skipped."""
+    if not QDRANT_AVAILABLE:
+        print("Qdrant storage unavailable; these turns were NOT stored:", file=sys.stderr)
+        for turn in turns[:3]:
+            print(f"  - Turn {turn.get('turn', '?')}: {turn.get('role', '?')}", file=sys.stderr)
+        if len(turns) > 3:
+            print(f"  ... and {len(turns) - 3} more", file=sys.stderr)
+        return False  # never report success without a real write, or main() would clear Redis
+
+    # Ensure chronological order (older -> newer)
+    try:
+        turns_sorted = sorted(turns, key=lambda t: (t.get('timestamp', ''), t.get('turn', 0)))
+    except Exception:
+        turns_sorted = turns  # mixed/unsortable keys: fall back to buffer order
+
+    user_turns = [t for t in turns_sorted if t.get('role') == 'user']
+    if not user_turns:
+        return True  # nothing to store counts as success
+
+    success_count = 0
+    attempted = 0
+
+    for i, turn in enumerate(turns_sorted):
+        if turn.get('role') != 'user':
+            continue
+        attempted += 1
+        try:
+            # Pair with the next assistant message in chronological order (best effort)
+            ai_response = ""
+            j = i + 1
+            while j < len(turns_sorted):
+                if turns_sorted[j].get('role') == 'assistant':
+                    ai_response = turns_sorted[j].get('content', '')
+                    break
+                if turns_sorted[j].get('role') == 'user':
+                    break
+                j += 1
+
+            result = store_conversation_turn(
+                user_message=turn.get('content', ''),
+                ai_response=ai_response,
+                user_id=user_id,
+                turn_number=turn.get('turn', i),
+                conversation_id=f"mem-buffer-{str(turn.get('timestamp') or 'unknown')[:10]}"
+            )
+
+            # store_conversation_turn returns success/skipped; treat skipped as ok
+            if result.get('success') or result.get('skipped'):
+                success_count += 1
+        except Exception as e:
+            print(f"Error storing user turn {turn.get('turn', '?')}: {e}", file=sys.stderr)
+
+    # Only consider Qdrant storage successful if we stored/skipped ALL user turns.
+    return attempted > 0 and success_count == attempted
+
+def store_to_file(turns, user_id):
+    """Fallback: write turns as JSONL under <workspace>/memory/redis-backups; return True on success."""
+    from datetime import datetime
+
+    workspace = Path(os.getenv("OPENCLAW_WORKSPACE", str(Path.home() / ".openclaw" / "workspace")))
+    backup_dir = workspace / "memory" / "redis-backups"
+    backup_dir.mkdir(parents=True, exist_ok=True)  # parents=True: memory/ may not exist yet
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    filename = backup_dir / f"mem-backup-{user_id}-{timestamp}.jsonl"
+
+    try:
+        with open(filename, 'w') as f:
+            for turn in turns:
+                f.write(json.dumps(turn) + '\n')
+        print(f"✅ Backed up {len(turns)} turns to file: {filename}")
+        return True
+    except Exception as e:
+        print(f"❌ File backup failed: {e}", file=sys.stderr)
+        return False
+
+def clear_redis(key):
+    """Delete the Redis list at `key`; return True on success, False on any Redis error."""
+    try:
+        r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
+        r.delete(key)
+        return True
+    except Exception as e:
+        print(f"Error clearing Redis: {e}", file=sys.stderr)
+        return False
+
+def main():  # CLI entry point: read buffer -> Qdrant -> clear Redis only after a full Qdrant write
+    parser = argparse.ArgumentParser(description="Backup Redis mem buffer to Qdrant")
+    parser.add_argument("--user-id", default=USER_ID, help="User ID")
+    parser.add_argument("--dry-run", action="store_true", help="Don't actually clear Redis")
+    args = parser.parse_args()
+
+    # Get items from Redis
+    turns, key = get_redis_items(args.user_id)
+
+    if turns is None:
+        print("❌ Failed to read from Redis")
+        sys.exit(1)
+
+    if not turns:
+        print(f"No items in Redis buffer (mem:{args.user_id})")
+        sys.exit(0)
+
+    print(f"Found {len(turns)} turns in Redis buffer")
+
+    # Try Qdrant first
+    qdrant_success = False
+    if not args.dry_run:
+        qdrant_success = store_to_qdrant(turns, args.user_id)
+        if qdrant_success:
+            print(f"✅ Stored Redis buffer to Qdrant (all user turns)")
+        else:
+            print("⚠️ Qdrant storage incomplete; will NOT clear Redis", file=sys.stderr)
+    else:
+        print("[DRY RUN] Would attempt Qdrant storage")
+        qdrant_success = True  # Dry run pretends success
+
+    # If Qdrant failed/incomplete, try file backup (still do NOT clear Redis unless user chooses)
+    file_success = False
+    if not qdrant_success:
+        print("⚠️ Qdrant storage failed/incomplete, writing file backup (Redis preserved)...")
+        file_success = store_to_file(turns, args.user_id)
+        if not file_success:
+            print("❌ Both Qdrant and file backup failed - Redis buffer preserved")
+            sys.exit(1)
+        # Exit non-zero so monitoring can alert; keep Redis for re-try.
+        sys.exit(1)
+
+    # Clear Redis (only if not dry-run)
+    if args.dry_run:
+        print("[DRY RUN] Would clear Redis buffer")
+        sys.exit(0)
+
+    if clear_redis(key):
+        print(f"✅ Cleared Redis buffer (mem:{args.user_id})")
+    else:
+        print(f"⚠️ Backup succeeded but failed to clear Redis - may duplicate on next run")
+        sys.exit(1)
+
+    backup_type = "Qdrant" if qdrant_success else "file"
+    print(f"\n🎉 Successfully backed up {len(turns)} turns to {backup_type} long-term memory")
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/mem-redis/scripts/cron_capture.py b/skills/mem-redis/scripts/cron_capture.py
new file mode 100644
index 0000000..992209f
--- /dev/null
+++ b/skills/mem-redis/scripts/cron_capture.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+"""
+Cron Capture: Append NEW session transcript messages to Redis (no LLM / no heartbeat).
+
+Goal: minimize token spend by capturing context out-of-band.
+
+- Tracks per-session file offsets (byte position) in a JSON state file.
+- No-ops if the transcript file hasn't changed since last run.
+- Stores user/assistant visible text to Redis (chronological order via RPUSH).
+- Optionally stores model "thinking" separately (disabled by default) so it can be
+  queried only when explicitly needed.
import argparse
import json
import os
import sys
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
USER_ID = os.getenv("USER_ID", "yourname")

DEFAULT_WORKSPACE = Path(os.getenv("OPENCLAW_WORKSPACE", str(Path.home() / ".openclaw" / "workspace")))
DEFAULT_SESSIONS_DIR = Path(os.getenv("OPENCLAW_SESSIONS_DIR", str(Path.home() / ".openclaw" / "agents" / "main" / "sessions")))

STATE_FILE = DEFAULT_WORKSPACE / ".mem_capture_state.json"


@dataclass
class ParsedMessage:
    """One user/assistant message extracted from a session transcript."""

    role: str  # user|assistant
    text: str  # visible text, truncated by the parser
    thinking: Optional[str]  # model thinking; None unless explicitly requested
    timestamp: str
    session_id: str  # stem of the transcript file name


def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


def find_latest_transcript(sessions_dir: Path) -> Optional[Path]:
    """Return the most recently modified ``*.jsonl`` file in *sessions_dir*, or None."""
    files = list(sessions_dir.glob("*.jsonl"))
    if not files:
        return None
    return max(files, key=lambda p: p.stat().st_mtime)


def load_state() -> Dict[str, Any]:
    """Load the per-transcript offset state; an unreadable or malformed file yields ``{}``."""
    try:
        state = json.loads(STATE_FILE.read_text())
    except (OSError, ValueError):
        return {}
    # main() calls .get() on the result, so anything but a dict is unusable.
    return state if isinstance(state, dict) else {}


def save_state(state: Dict[str, Any]) -> None:
    """Persist *state* to STATE_FILE; failures are reported on stderr, never raised."""
    tmp = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    try:
        # Write-then-rename: a crash mid-write must not leave a truncated state
        # file, because an unreadable state makes the next run re-read (and
        # re-append) the whole transcript.
        tmp.write_text(json.dumps(state, indent=2, sort_keys=True))
        os.replace(tmp, STATE_FILE)
    except (OSError, TypeError, ValueError) as e:
        print(f"[cron_capture] Warning: could not write state: {e}", file=sys.stderr)


def extract_text_and_thinking(content: Any) -> Tuple[str, Optional[str]]:
    """Extract visible text and optional thinking from OpenClaw message content.

    A plain string is returned unchanged with no thinking.  For a list, the
    string ``"text"`` values of its dict items are concatenated and stripped,
    and the string ``"thinking"`` values are joined with newlines.  Any other
    content yields ``("", None)``.
    """
    if isinstance(content, str):
        return content, None

    text_parts: List[str] = []
    thinking_parts: List[str] = []

    if isinstance(content, list):
        for item in content:
            if not isinstance(item, dict):
                continue
            if isinstance(item.get("text"), str):
                text_parts.append(item["text"])
            if isinstance(item.get("thinking"), str):
                thinking_parts.append(item["thinking"])

    text = "".join(text_parts).strip()
    thinking = "\n".join(thinking_parts).strip() if thinking_parts else None
    return text, thinking


def parse_new_messages(transcript_path: Path, start_offset: int, include_thinking: bool) -> Tuple[List[ParsedMessage], int]:
    """Parse user/assistant messages from *transcript_path* starting at a byte offset.

    Returns:
        ``(messages, end_offset)``.  ``end_offset`` is the byte position after
        the last fully consumed line.  An unterminated trailing line that does
        not parse (the writer is still mid-line) is NOT consumed, so the next
        run re-reads it once it is complete.
    """
    session_id = transcript_path.stem
    msgs: List[ParsedMessage] = []
    end_offset = start_offset

    with transcript_path.open("rb") as f:
        f.seek(start_offset)
        while True:
            line = f.readline()
            if not line:
                break
            try:
                entry = json.loads(line.decode("utf-8", errors="replace"))
            except ValueError:
                if not line.endswith(b"\n"):
                    # Half-written last line: leave the offset in front of it.
                    break
                end_offset += len(line)  # complete but unparseable: skip for good
                continue
            end_offset += len(line)

            if not isinstance(entry, dict) or entry.get("type") != "message":
                continue
            msg = entry.get("message")
            if not isinstance(msg, dict):
                continue
            role = msg.get("role")
            # Tool results and any other roles are not stored.
            if role not in ("user", "assistant"):
                continue

            text, thinking = extract_text_and_thinking(msg.get("content"))
            if not text and not (include_thinking and thinking):
                continue

            msgs.append(
                ParsedMessage(
                    role=role,
                    text=text[:8000],
                    thinking=(thinking[:16000] if (include_thinking and thinking) else None),
                    timestamp=entry.get("timestamp") or _now_iso(),
                    session_id=session_id,
                )
            )

    return msgs, end_offset


def append_to_redis(user_id: str, messages: List[ParsedMessage]) -> int:
    """Append *messages* to ``mem:{user_id}`` (thinking to ``mem_thinking:{user_id}``).

    Returns the number of messages appended.  Redis errors propagate to the
    caller, which then does not advance its saved offset.
    """
    if not messages:
        return 0

    import redis  # lazy import so --dry-run works without deps
    r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)

    key = f"mem:{user_id}"
    thinking_key = f"mem_thinking:{user_id}"

    # One MULTI/EXEC transaction: either every message is appended or none is.
    # A partial append followed by an exception would be pushed again on the
    # next run, because the offset is only saved after this function returns.
    pipe = r.pipeline(transaction=True)

    # RPUSH keeps chronological order.
    for m in messages:
        payload: Dict[str, Any] = {
            "role": m.role,
            "content": m.text,
            "timestamp": m.timestamp,
            "user_id": user_id,
            "session": m.session_id,
        }
        pipe.rpush(key, json.dumps(payload))

        if m.thinking:
            t_payload = {
                "role": m.role,
                "thinking": m.thinking,
                "timestamp": m.timestamp,
                "user_id": user_id,
                "session": m.session_id,
            }
            pipe.rpush(thinking_key, json.dumps(t_payload))

    pipe.execute()
    return len(messages)


def main() -> None:
    """CLI entry point: append messages added to the newest transcript since the last run."""
    parser = argparse.ArgumentParser(description="Cron capture: append new transcript messages to Redis")
    parser.add_argument("--user-id", default=USER_ID)
    parser.add_argument("--include-thinking", action="store_true", help="Store thinking into mem_thinking:")
    parser.add_argument("--sessions-dir", default=str(DEFAULT_SESSIONS_DIR))
    parser.add_argument("--dry-run", action="store_true", help="Parse + update state, but do not write to Redis")
    args = parser.parse_args()

    sessions_dir = Path(args.sessions_dir)
    transcript = find_latest_transcript(sessions_dir)
    if not transcript:
        print("[cron_capture] No session transcripts found")
        return

    st = load_state()
    key = str(transcript)
    info = st.get(key)
    if not isinstance(info, dict):
        info = {}
    try:
        last_offset = int(info.get("offset", 0))
        last_size = int(info.get("size", 0))
    except (TypeError, ValueError):
        last_offset = last_size = 0

    cur_size = transcript.stat().st_size
    if last_offset > cur_size:
        # The file shrank (truncated or replaced): the old offset is meaningless
        # and seeking to it would silently read nothing forever.
        last_offset = 0
    elif cur_size == last_size and last_offset > 0:
        print("[cron_capture] No changes")
        return

    messages, end_offset = parse_new_messages(transcript, last_offset, include_thinking=args.include_thinking)
    new_info = {"offset": end_offset, "size": cur_size, "updated_at": _now_iso()}

    if not messages:
        # Still update size/offset so we don't re-read noise lines.
        st[key] = new_info
        save_state(st)
        print("[cron_capture] No new user/assistant messages")
        return

    if args.dry_run:
        # NOTE(review): as the --dry-run help text states, the offset IS
        # advanced here, so these messages will not be captured by a later real
        # run.  Confirm this is intended before relying on --dry-run.
        st[key] = new_info
        save_state(st)
        print(f"[cron_capture] DRY RUN: would append {len(messages)} messages to Redis mem:{args.user_id}")
        return

    count = append_to_redis(args.user_id, messages)

    st[key] = new_info
    save_state(st)

    print(f"[cron_capture] Appended {count} messages to Redis mem:{args.user_id}")


if __name__ == "__main__":
    main()
import os
import sys
import json
import argparse
from datetime import datetime, timezone
from pathlib import Path

# Config
REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
USER_ID = os.getenv("USER_ID", "yourname")

# Paths (portable)
WORKSPACE = Path(os.getenv("OPENCLAW_WORKSPACE", str(Path.home() / ".openclaw" / "workspace")))
MEMORY_DIR = WORKSPACE / "memory"
SESSIONS_DIR = Path(os.getenv("OPENCLAW_SESSIONS_DIR", str(Path.home() / ".openclaw" / "agents" / "main" / "sessions")))
STATE_FILE = WORKSPACE / ".mem_last_turn"


def get_session_transcript(sessions_dir=None):
    """Return the most recently modified session JSONL file, or None if there is none.

    Args:
        sessions_dir: directory to search; defaults to SESSIONS_DIR.
    """
    directory = SESSIONS_DIR if sessions_dir is None else Path(sessions_dir)
    files = list(directory.glob("*.jsonl"))
    if not files:
        return None
    # Get most recently modified
    return max(files, key=lambda p: p.stat().st_mtime)


def _message_text(msg):
    """Return the visible text of an OpenClaw message (content string or content array)."""
    content = msg.get('content')
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # Intentionally do NOT store model thinking in the main buffer.
        # If you need thinking, use cron_capture.py --include-thinking to store
        # it separately under mem_thinking:.
        return "".join(
            item['text']
            for item in content
            if isinstance(item, dict) and isinstance(item.get('text'), str)
        )
    return ""


def parse_turns_since(last_turn_num, transcript_file=None):
    """Extract the conversation turns that come after turn *last_turn_num*.

    Turns are numbered 1..N by their position among the user/assistant
    messages of the transcript.  Only turns numbered above *last_turn_num* are
    returned.  (Previously every turn of the transcript was returned on every
    call, merely renumbered, so each heartbeat re-appended the whole session.)

    If the transcript holds fewer turns than *last_turn_num*, the counter
    cannot refer to this file (a new session started), so all of its turns are
    returned.  This is a best-effort heuristic: the state file stores no
    session id.

    Args:
        last_turn_num: number of the last turn already processed (0 = none).
        transcript_file: transcript to read; defaults to the newest session file.

    Returns:
        List of turn dicts (``turn``, ``role``, ``content``, ``timestamp``,
        ``user_id``, ``session``); empty when nothing is new or on read errors.
    """
    if transcript_file is None:
        transcript_file = get_session_transcript()
    if not transcript_file:
        return []
    transcript_file = Path(transcript_file)
    if not transcript_file.exists():
        return []

    session = str(transcript_file.name).replace('.jsonl', '')
    turns = []
    try:
        with open(transcript_file, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # OpenClaw format: {"type": "message", "message": {"role": "...", ...}}
                # Guard the shapes so one odd entry cannot discard the whole file.
                if not isinstance(entry, dict) or entry.get('type') != 'message':
                    continue
                msg = entry.get('message')
                if not isinstance(msg, dict):
                    continue
                role = msg.get('role')
                # Tool results (and any other roles) are not stored.
                if role not in ('user', 'assistant'):
                    continue
                content = _message_text(msg)
                if not content:
                    continue
                turns.append({
                    'turn': len(turns) + 1,
                    'role': role,
                    'content': content[:2000],
                    'timestamp': entry.get('timestamp', datetime.now(timezone.utc).isoformat()),
                    'user_id': USER_ID,
                    'session': session
                })
    except OSError as e:
        print(f"Error reading transcript: {e}", file=sys.stderr)
        return []

    if last_turn_num > len(turns):
        return turns
    return [t for t in turns if t['turn'] > last_turn_num]


def get_last_turn():
    """Get last turn number from state file (0 when missing or unreadable)."""
    try:
        return int(STATE_FILE.read_text().strip())
    except (OSError, ValueError):
        return 0


def save_last_turn(turn_num):
    """Save last turn number to state file; failures are reported, not raised."""
    try:
        with open(STATE_FILE, 'w') as f:
            f.write(str(turn_num))
    except Exception as e:
        print(f"Warning: Could not save state: {e}", file=sys.stderr)


def append_to_redis(turns, user_id):
    """Append turns to the Redis list ``mem:{user_id}``.

    Returns:
        Number of turns appended, or 0 when *turns* is empty or Redis failed.
    """
    if not turns:
        return 0

    try:
        # Imported lazily so the parsing helpers above can be used without the
        # Redis client installed.
        import redis

        r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
        key = f"mem:{user_id}"

        # One MULTI/EXEC transaction: a failure half-way must not leave some
        # turns pushed, because the caller then keeps the old turn counter and
        # would push them again on the next heartbeat.
        pipe = r.pipeline(transaction=True)
        # Add all turns to list (LPUSH puts newest at front)
        for turn in turns:
            pipe.lpush(key, json.dumps(turn))
        pipe.execute()

        return len(turns)
    except Exception as e:
        print(f"Error writing to Redis: {e}", file=sys.stderr)
        return 0
import os
import sys
import json
import argparse
from datetime import datetime, timezone

# Config
REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
USER_ID = os.getenv("USER_ID", "yourname")


def _chronological_key(turn):
    """Sort key: timestamp first, then turn number; tolerant of missing/None values."""
    number = turn.get('turn')
    return (str(turn.get('timestamp') or ''), number if isinstance(number, (int, float)) else 0)


def _select_recent(turns, limit):
    """Return the *limit* newest turns of *turns*, oldest first."""
    if limit <= 0:
        return []
    return sorted(turns, key=_chronological_key)[-limit:]


def get_recent_turns(user_id, limit=20):
    """Get the *limit* most recent turns from the Redis buffer, in chronological order.

    The buffer is ordered by timestamp here rather than trusting list
    position: hb_append.py LPUSHes (newest at the head) while cron_capture.py
    RPUSHes and save_mem.py pushes oldest-first to the same key, so reading
    "the first N items" returned the oldest turns for two of the three writers.

    Returns:
        List of turn dicts, oldest first; empty on Redis errors or ``limit <= 0``.
    """
    try:
        # Imported lazily so format_turn() can be used without the Redis client.
        import redis

        r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
        key = f"mem:{user_id}"
        items = r.lrange(key, 0, -1)
    except Exception as e:
        print(f"Error reading from Redis: {e}", file=sys.stderr)
        return []

    turns = []
    for item in items:
        try:
            turn = json.loads(item)
        except json.JSONDecodeError:
            continue
        if isinstance(turn, dict):
            turns.append(turn)

    return _select_recent(turns, limit)


def format_turn(turn):
    """Format a turn for display; content longer than 500 characters is truncated."""
    role = turn.get('role', 'unknown')
    content = turn.get('content') or ''  # tolerate an explicit None
    turn_num = turn.get('turn', '?')

    # Truncate long content
    if len(content) > 500:
        content = content[:500] + "..."

    role_icon = "👤" if role == 'user' else "🤖"
    return f"{role_icon} Turn {turn_num} ({role}):\n{content}\n"


def main():
    """CLI entry point: print the most recent buffered turns for a user."""
    parser = argparse.ArgumentParser(description="Retrieve recent turns from mem buffer")
    parser.add_argument("--user-id", default=USER_ID, help="User ID")
    parser.add_argument("--limit", type=int, default=20, help="Number of turns to retrieve")
    args = parser.parse_args()

    # Get turns
    turns = get_recent_turns(args.user_id, args.limit)

    if not turns:
        print(f"No recent turns in memory buffer (mem:{args.user_id})")
        print("\nPossible reasons:")
        print("  - Heartbeat hasn't run yet")
        print("  - Cron already backed up and cleared Redis")
        print("  - Redis connection issue")
        sys.exit(0)

    # Display
    print(f"=== Recent {len(turns)} Turn(s) from Memory Buffer ===\n")
    for turn in turns:
        print(format_turn(turn))

    print(f"\nBuffer key: mem:{args.user_id}")
    print("Note: These turns are also in Redis until daily cron backs them up to Qdrant.")


if __name__ == "__main__":
    main()
import os
import sys
import json
import argparse
from datetime import datetime, timezone
from pathlib import Path

# Config
REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
USER_ID = os.getenv("USER_ID", "yourname")

# Paths (portable)
WORKSPACE = Path(os.getenv("OPENCLAW_WORKSPACE", str(Path.home() / ".openclaw" / "workspace")))
SESSIONS_DIR = Path(os.getenv("OPENCLAW_SESSIONS_DIR", str(Path.home() / ".openclaw" / "agents" / "main" / "sessions")))
STATE_FILE = WORKSPACE / ".mem_last_turn"


def get_session_transcript(sessions_dir=None):
    """Return the most recently modified session JSONL file, or None if there is none.

    Args:
        sessions_dir: directory to search; defaults to SESSIONS_DIR.
    """
    directory = SESSIONS_DIR if sessions_dir is None else Path(sessions_dir)
    files = list(directory.glob("*.jsonl"))
    if not files:
        return None
    return max(files, key=lambda p: p.stat().st_mtime)


def _message_text(msg):
    """Return the visible text of an OpenClaw message (content string or content array)."""
    content = msg.get('content')
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # Do not mix thinking into the main content buffer.
        return "".join(
            item['text']
            for item in content
            if isinstance(item, dict) and isinstance(item.get('text'), str)
        )
    return ""


def parse_all_turns(transcript_file=None):
    """Extract ALL user/assistant turns from a session transcript.

    Args:
        transcript_file: transcript to read; defaults to the newest session file.

    Returns:
        List of turn dicts numbered from 1 (``turn``, ``role``, ``content``
        truncated to 2000 characters, ``timestamp``, ``user_id``, ``session``);
        empty when there is no transcript or it cannot be read.
    """
    if transcript_file is None:
        transcript_file = get_session_transcript()
    if not transcript_file:
        return []
    transcript_file = Path(transcript_file)
    if not transcript_file.exists():
        return []

    session = str(transcript_file.name).replace('.jsonl', '')
    turns = []
    try:
        with open(transcript_file, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # Guard the shapes so one odd entry cannot discard the whole file.
                if not isinstance(entry, dict) or entry.get('type') != 'message':
                    continue
                msg = entry.get('message')
                if not isinstance(msg, dict):
                    continue
                role = msg.get('role')
                # Tool results (and any other roles) are not stored.
                if role not in ('user', 'assistant'):
                    continue
                content = _message_text(msg)
                if not content:
                    continue
                turns.append({
                    'turn': len(turns) + 1,
                    'role': role,
                    'content': content[:2000],
                    'timestamp': entry.get('timestamp', datetime.now(timezone.utc).isoformat()),
                    'user_id': USER_ID,
                    'session': session
                })
    except OSError as e:
        print(f"Error reading transcript: {e}", file=sys.stderr)
        return []

    return turns


def save_to_redis(turns, user_id, reset=False):
    """Save turns to the Redis list ``mem:{user_id}``. If reset, clear existing first.

    Returns:
        Number of turns saved, or 0 when *turns* is empty or Redis failed.
    """
    if not turns:
        return 0

    try:
        # Imported lazily so the parsing helpers above can be used without the
        # Redis client installed.
        import redis

        r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
        key = f"mem:{user_id}"

        # One MULTI/EXEC transaction, so a failure cannot leave the buffer
        # deleted (with --reset) but only partly refilled.
        pipe = r.pipeline(transaction=True)
        if reset:
            pipe.delete(key)

        # Add all turns (LPUSH puts newest at front, so we reverse to keep order)
        for turn in reversed(turns):
            pipe.lpush(key, json.dumps(turn))
        pipe.execute()

        if reset:
            print(f"Cleared existing Redis buffer ({key})")
        return len(turns)
    except Exception as e:
        print(f"Error writing to Redis: {e}", file=sys.stderr)
        return 0


def update_state(last_turn_num):
    """Update last turn tracker (shared with hb_append.py); failures are reported, not raised."""
    try:
        with open(STATE_FILE, 'w') as f:
            f.write(str(last_turn_num))
    except Exception as e:
        print(f"Warning: Could not save state: {e}", file=sys.stderr)
import os
import sys
import json
import argparse
from pathlib import Path
from datetime import datetime

# Config
# Default matches the scripts that write this buffer (hb_append, cron_capture,
# save_mem, cron_backup), so an unconfigured install searches the same Redis
# it writes to.  Set REDIS_HOST to point at a remote server.
REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
USER_ID = os.getenv("USER_ID", "yourname")

QDRANT_URL = os.getenv("QDRANT_URL", "http://10.0.0.40:6333")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://10.0.0.10:11434/v1")


def _recency_key(match):
    """Sort key for newest-first ordering; tolerant of missing/None timestamp and turn."""
    number = match.get('turn')
    return (str(match.get('timestamp') or ''), number if isinstance(number, (int, float)) else 0)


def search_redis(query, user_id, limit=20):
    """Search the Redis buffer ``mem:{user_id}`` for case-insensitive substring matches.

    Returns:
        Up to *limit* match dicts, newest first; empty on no match or Redis error.
    """
    try:
        # Imported lazily so the formatting helpers work without the Redis client.
        import redis

        r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
        key = f"mem:{user_id}"

        # Get all items from list
        items = r.lrange(key, 0, -1)
    except Exception as e:
        print(f"Redis search error: {e}", file=sys.stderr)
        return []

    query_lower = query.lower()
    matches = []
    for item in items:
        try:
            turn = json.loads(item)
        except json.JSONDecodeError:
            continue
        if not isinstance(turn, dict):
            continue
        content = turn.get('content')
        if not isinstance(content, str) or query_lower not in content.lower():
            continue
        matches.append({
            'source': 'redis',
            'turn': turn.get('turn'),
            'role': turn.get('role'),
            'content': content,
            'timestamp': turn.get('timestamp'),
            'score': 'exact'
        })

    # Newest first.  Entries written by cron_capture.py carry no 'turn', so the
    # timestamp leads and the turn number only breaks ties; sorting on the raw
    # 'turn' value raised TypeError (None vs int) and emptied the result.
    matches.sort(key=_recency_key, reverse=True)
    return matches[:limit]


def get_embedding(text):
    """Get an embedding vector for *text* from Ollama's OpenAI-compatible endpoint.

    Returns the embedding list, or None on any request/parse error.
    """
    import urllib.request

    payload = json.dumps({
        "model": "snowflake-arctic-embed2",
        "input": text
    }).encode()

    req = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST"
    )

    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            result = json.loads(resp.read().decode())
            return result.get('data', [{}])[0].get('embedding')
    except Exception as e:
        print(f"Embedding error: {e}", file=sys.stderr)
        return None


def search_qdrant(query, user_id, limit=10):
    """Search the ``kimi_memories`` Qdrant collection for points similar to *query*.

    Results are filtered to payloads whose ``user_id`` equals *user_id*.

    Returns:
        List of match dicts in the order Qdrant returned them; empty when the
        embedding or the search request fails.
    """
    import urllib.request

    embedding = get_embedding(query)
    if not embedding:
        return []

    body = json.dumps({
        "vector": embedding,
        "limit": limit,
        "with_payload": True,
        "filter": {
            "must": [
                {"key": "user_id", "match": {"value": user_id}}
            ]
        }
    }).encode()

    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/kimi_memories/points/search",
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST"
    )

    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            result = json.loads(resp.read().decode())

        matches = []
        for point in result.get('result', []):
            stored = point.get('payload') or {}
            matches.append({
                'source': 'qdrant',
                'score': round(point.get('score', 0), 3),
                'turn': stored.get('turn_number'),
                'role': stored.get('role'),
                'content': stored.get('user_message') or stored.get('content', ''),
                'ai_response': stored.get('ai_response', ''),
                'timestamp': stored.get('timestamp'),
                'conversation_id': stored.get('conversation_id')
            })
        return matches
    except Exception as e:
        print(f"Qdrant search error: {e}", file=sys.stderr)
        return []


def format_result(result, index):
    """Format a single search result as a multi-line display string.

    Missing or None ``role``/``turn``/``score`` fall back to ``unknown``/``?``
    (Qdrant results always carry these keys, often with a None value), and
    None content is shown as empty.
    """
    source = result.get('source', 'unknown')
    role = result.get('role') or 'unknown'
    turn = result.get('turn')
    if turn is None:
        turn = '?'
    score = result.get('score')
    if score is None:
        score = '?'

    content = result.get('content') or ''
    if len(content) > 200:
        content = content[:200] + "..."

    # Role emoji
    role_emoji = "👤" if role == "user" else "🤖"

    # Source indicator
    source_icon = "🔴" if source == "redis" else "🔵"

    lines = [
        f"{source_icon} [{index}] Turn {turn} ({role}):",
        f"    {role_emoji} {content}"
    ]

    ai_response = result.get('ai_response')
    if source == "qdrant" and ai_response:
        ai_resp = ai_response[:150]
        if len(ai_response) > 150:
            ai_resp += "..."
        lines.append(f"    💬 AI: {ai_resp}")

    if score != 'exact':
        lines.append(f"    📊 Score: {score}")
    else:
        lines.append("    📊 Match: exact (Redis)")

    return "\n".join(lines)


def main():
    """CLI entry point: search Redis (exact) and then Qdrant (semantic), and print the results."""
    parser = argparse.ArgumentParser(description="Search memory: Redis first, then Qdrant")
    parser.add_argument("query", help="Search query")
    parser.add_argument("--limit", type=int, default=10, help="Results per source (default: 10)")
    parser.add_argument("--user-id", default=USER_ID, help="User ID")
    # Passing both flags used to search nothing at all; reject the combination.
    only = parser.add_mutually_exclusive_group()
    only.add_argument("--redis-only", action="store_true", help="Only search Redis")
    only.add_argument("--qdrant-only", action="store_true", help="Only search Qdrant")
    args = parser.parse_args()

    print(f"🔍 Searching for: \"{args.query}\"\n")

    redis_results = []
    qdrant_results = []

    # Search Redis first (unless qdrant-only)
    if not args.qdrant_only:
        print("📍 Searching Redis (exact match)...")
        redis_results = search_redis(args.query, args.user_id, limit=args.limit)
        if redis_results:
            print(f"✅ Found {len(redis_results)} matches in Redis\n")
        else:
            print("❌ No exact matches in Redis\n")

    # Search Qdrant (unless redis-only)
    if not args.redis_only:
        print("🧠 Searching Qdrant (semantic similarity)...")
        qdrant_results = search_qdrant(args.query, args.user_id, limit=args.limit)
        if qdrant_results:
            print(f"✅ Found {len(qdrant_results)} matches in Qdrant\n")
        else:
            print("❌ No semantic matches in Qdrant\n")

    total = len(redis_results) + len(qdrant_results)

    # Display results
    if not total:
        print("No results found in either Redis or Qdrant.")
        sys.exit(0)

    print(f"=== Search Results ({total} total) ===\n")

    # Redis results keep their newest-first order; Qdrant results go by score.
    qdrant_sorted = sorted(qdrant_results, key=lambda x: x.get('score', 0), reverse=True)

    # Display Redis results first
    if redis_results:
        print("🔴 FROM REDIS (Recent Buffer):\n")
        for i, result in enumerate(redis_results, 1):
            print(format_result(result, i))
            print()

    # Then Qdrant results
    if qdrant_sorted:
        print("🔵 FROM QDRANT (Long-term Memory):\n")
        for i, result in enumerate(qdrant_sorted, len(redis_results) + 1):
            print(format_result(result, i))
            print()

    print(f"=== {total} results ===")
    if redis_results:
        print(f"  🔴 Redis: {len(redis_results)} (exact, recent)")
    if qdrant_sorted:
        print(f"  🔵 Qdrant: {len(qdrant_sorted)} (semantic, long-term)")


if __name__ == "__main__":
    main()
+ +## When to Use + +- **After setting up a new memory system** — harvest existing sessions +- **After discovering missed backups** — recover data from session files +- **Periodically** — if cron jobs missed any data + +## Scripts + +| Script | Purpose | Usage | +|--------|---------|-------| +| `harvest_sessions.py` | Harvest all sessions (auto-sorts by mtime) | Limited by memory, may timeout | +| `harvest_newest.py` | Harvest specific sessions by name | Recommended for batch control | + +## Location + +``` +/root/.openclaw/workspace/skills/qdrant-memory/scripts/ +├── harvest_sessions.py # Auto-harvest (use --limit to control) +└── harvest_newest.py # Manual batch (specify session names) +``` + +## Usage + +### Method 1: Auto-Harvest with Limit + +```bash +# Harvest oldest 10 sessions (default sort) +python3 harvest_sessions.py --user-id rob --limit 10 + +# Dry run to see what would be stored +python3 harvest_sessions.py --user-id rob --dry-run --limit 5 +``` + +### Method 2: Batch by Session Name (Recommended) + +```bash +# Harvest specific sessions (newest first recommended) +python3 harvest_newest.py --user-id rob \ + session-uuid-1.jsonl \ + session-uuid-2.jsonl \ + session-uuid-3.jsonl +``` + +### Finding Newest Sessions + +```bash +# List 20 newest session files +ls -t /root/.openclaw/agents/main/sessions/*.jsonl | head -20 + +# Get just filenames for copy-paste +ls -t /root/.openclaw/agents/main/sessions/*.jsonl | head -20 | xargs -I{} basename {} +``` + +## How It Works + +1. **Parse** — Reads JSONL session file, extracts user/AI turns +2. **Pair** — Matches user message with next AI response +3. **Embed** — Generates 3 embeddings (user, AI, summary) via Ollama +4. **Deduplicate** — Checks content_hash before storing +5. 
**Store** — Upserts to Qdrant with user_id, conversation_id, turn_number + +## Deduplication + +- Uses MD5 hash of `user_message::ai_response` +- Checks Qdrant for existing `user_id + content_hash` +- Skips if already stored (returns "duplicate") +- Safe to run multiple times on same sessions + +## Output Format + +``` +[1] session-uuid.jsonl + Stored: 10, Skipped: 6 + +Total: 44 stored, 6 skipped +``` + +- **Stored** = New memories added to Qdrant +- **Skipped** = Duplicates (already in Qdrant) + +## Troubleshooting + +### Timeout / SIGKILL + +The embedding process is CPU-intensive. If killed: + +```bash +# Use smaller batches +python3 harvest_newest.py --user-id rob session1.jsonl session2.jsonl +``` + +### Check Qdrant Status + +```bash +curl -s http://10.0.0.40:6333/collections/kimi_memories | \ + python3 -c "import sys,json; d=json.load(sys.stdin); print(d['result']['points_count'])" +``` + +### Check Session Content + +```bash +# Count turns in a session +python3 -c " +import json +from pathlib import Path +f = Path('/root/.openclaw/agents/main/sessions/YOUR-SESSION.jsonl') +count = sum(1 for line in open(f) if 'user' in line or 'assistant' in line) +print(f'~{count} messages') +" +``` + +## Memory Architecture + +``` +Session JSONL (raw) + │ + ▼ + harvest_*.py + │ + ├──► Embeddings (Ollama snowflake-arctic-embed2) + │ + ▼ + Qdrant kimi_memories + │ + └──► Searchable via user_id: "rob" +``` + +--- + +**Created:** February 17, 2026 +**Author:** Kimi (audit session) diff --git a/skills/qdrant-memory/SKILL.md b/skills/qdrant-memory/SKILL.md new file mode 100644 index 0000000..9bc8880 --- /dev/null +++ b/skills/qdrant-memory/SKILL.md @@ -0,0 +1,53 @@ +# Qdrant Memory Skill + +Vector database storage for long-term semantic memory. + +## What It Does + +Stores conversations with embeddings for semantic search. 
def simple_embed(text: str, size: Optional[int] = None) -> list[float]:
    """Return an L2-normalised hashed bag-of-words vector for *text*.

    The text is lower-cased and split on whitespace; each of the first 100
    words increments one bucket of the vector, and the result is scaled to
    unit length so cosine similarity groups texts that share words.

    Buckets are chosen from a SHA-256 digest instead of the built-in
    ``hash()``: ``str`` hashes are salted per interpreter process, so vectors
    written by one run would not line up with query vectors produced by a
    later run.

    Args:
        text: Text to embed. Empty or whitespace-only text yields an
            all-zero vector.
        size: Vector length. Defaults to the module-level ``VECTOR_SIZE``.

    Returns:
        A list of ``size`` floats with unit L2 norm (or all zeros).

    Raises:
        ValueError: If the requested vector length is smaller than 1.
    """
    dim = VECTOR_SIZE if size is None else size
    if dim < 1:
        raise ValueError(f"vector size must be >= 1, got {dim}")

    vector = [0.0] * dim
    for word in text.lower().split()[:100]:  # Limit to first 100 words
        digest = hashlib.sha256(word.encode("utf-8")).digest()
        # 8 bytes of digest is plenty to spread words evenly over the buckets.
        vector[int.from_bytes(digest[:8], "big") % dim] += 1.0

    # Normalize
    norm = sum(x * x for x in vector) ** 0.5
    if norm > 0:
        vector = [x / norm for x in vector]
    return vector
def get_recent_activities(
    agent: Optional[str] = None,
    action_type: Optional[str] = None,
    hours: int = 24,
    limit: int = 50
) -> list[dict]:
    """
    Query recent activities

    Every point in the collection is scanned (page by page) and filtered
    client-side. Scroll order follows point IDs, which are random UUIDs here,
    so reading only the first page would drop arbitrary recent entries once
    the collection outgrows one page.

    Args:
        agent: Filter by agent name ("Kimi" or "Max") or None for both
        action_type: Filter by action type or None for all
        hours: Look back this many hours
        limit: Max results

    Returns:
        Matching payload dicts, newest first, at most ``limit`` of them.
    """
    client = QdrantClient(url=QDRANT_URL)
    cutoff = datetime.now(timezone.utc).timestamp() - (hours * 3600)

    results = []
    offset = None
    while True:
        # scroll() returns (points, next_page_offset); the offset is None
        # after the last page.
        points, offset = client.scroll(
            collection_name=COLLECTION_NAME,
            limit=1000,
            offset=offset
        )

        for point in points:
            payload = point.payload or {}
            ts = payload.get("timestamp", "")
            try:
                point_time = datetime.fromisoformat(ts.replace("Z", "+00:00")).timestamp()
            except (AttributeError, TypeError, ValueError):
                # Missing, non-string or malformed timestamp: skip the entry.
                continue

            if point_time < cutoff:
                continue

            if agent and payload.get("agent") != agent:
                continue

            if action_type and payload.get("action_type") != action_type:
                continue

            results.append(payload)

        if offset is None:
            break

    # Sort by timestamp descending
    results.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
    return results[:limit]
hours=hours) + + keywords = description_keywords.lower().split() + for activity in recent: + desc = activity.get("description", "").lower() + if all(kw in desc for kw in keywords): + print(f"⚠️ Duplicate detected: {activity['agent']} did similar work {activity['timestamp']}") + print(f" Description: {activity['description']}") + return True + + return False + +def main(): + parser = argparse.ArgumentParser(description="Shared Activity Log for Kimi/Max") + subparsers = parser.add_subparsers(dest="command", help="Command to run") + + # Log command + log_parser = subparsers.add_parser("log", help="Log an activity") + log_parser.add_argument("--agent", required=True, choices=["Kimi", "Max"], help="Which agent performed the action") + log_parser.add_argument("--action", required=True, help="Action type (e.g., cron_created, file_edited)") + log_parser.add_argument("--description", required=True, help="What was done") + log_parser.add_argument("--files", nargs="*", help="Files/systems affected") + log_parser.add_argument("--status", default="completed", choices=["completed", "in_progress", "blocked", "failed"]) + log_parser.add_argument("--check-duplicate", action="store_true", help="Check for duplicates before logging") + log_parser.add_argument("--duplicate-keywords", help="Keywords to check for duplicates (if different from description)") + + # Recent command + recent_parser = subparsers.add_parser("recent", help="Show recent activities") + recent_parser.add_argument("--agent", choices=["Kimi", "Max"], help="Filter by agent") + recent_parser.add_argument("--action", help="Filter by action type") + recent_parser.add_argument("--hours", type=int, default=24, help="Hours to look back") + recent_parser.add_argument("--limit", type=int, default=20, help="Max results") + + # Search command + search_parser = subparsers.add_parser("search", help="Search activities") + search_parser.add_argument("query", help="Search query") + search_parser.add_argument("--limit", type=int, 
default=10) + + # Check command + check_parser = subparsers.add_parser("check", help="Check for duplicate work") + check_parser.add_argument("--action", required=True, help="Action type") + check_parser.add_argument("--keywords", required=True, help="Keywords to check") + check_parser.add_argument("--hours", type=int, default=6, help="Hours to look back") + + args = parser.parse_args() + + if args.command == "log": + if args.check_duplicate: + keywords = args.duplicate_keywords or args.description + if check_for_duplicates(args.action, keywords): + response = input("Proceed anyway? (y/n): ") + if response.lower() != "y": + print("Cancelled.") + sys.exit(0) + + activity_id = log_activity( + agent=args.agent, + action_type=args.action, + description=args.description, + affected_files=args.files, + status=args.status + ) + print(f"✓ Logged activity: {activity_id}") + + elif args.command == "recent": + activities = get_recent_activities( + agent=args.agent, + action_type=args.action, + hours=args.hours, + limit=args.limit + ) + + print(f"\nRecent activities (last {args.hours}h):\n") + for a in activities: + agent_icon = "🤖" if a["agent"] == "Max" else "🎙️" + status_icon = { + "completed": "✓", + "in_progress": "◐", + "blocked": "✗", + "failed": "⚠" + }.get(a["status"], "?") + + print(f"{agent_icon} [{a['timestamp'][:19]}] {status_icon} {a['action_type']}") + print(f" {a['description']}") + if a['affected_files']: + print(f" Files: {', '.join(a['affected_files'])}") + print() + + elif args.command == "search": + results = search_activities(args.query, args.limit) + + print(f"\nSearch results for '{args.query}':\n") + for r in results: + print(f"[{r['agent']}] {r['action_type']}: {r['description']}") + print(f" {r['timestamp'][:19]} | Status: {r['status']}") + print() + + elif args.command == "check": + is_dup = check_for_duplicates(args.action, args.keywords, args.hours) + sys.exit(1 if is_dup else 0) + + else: + parser.print_help() + +if __name__ == "__main__": + 
class AgentChat:
    """One agent's view of the shared Redis stream ``STREAM_NAME``.

    Messages are stream entries with the fields ``agent``, ``type``,
    ``message``, ``timestamp``, ``reply_to`` and ``from_user``. Each agent
    keeps its own read cursor under ``LAST_READ_KEY``.
    """

    def __init__(self, agent_name):
        """Remember the agent name and create the Redis client."""
        self.agent = agent_name
        self.r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)

    def send(self, msg_type, message, reply_to=None, from_user=False):
        """Send a message to the stream and return the new entry ID."""
        entry = {
            "agent": self.agent,
            "type": msg_type,  # idea, question, update, reply
            "message": message,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "reply_to": reply_to or "",
            "from_user": str(from_user).lower()  # "true" if from Rob, "false" if from agent
        }

        msg_id = self.r.xadd(STREAM_NAME, entry)
        print(f"[{self.agent}] Sent: {msg_id}")
        return msg_id

    def read_new(self, block_ms=1000):
        """Read messages since last check and advance this agent's cursor.

        Waits up to ``block_ms`` milliseconds for new entries. Returns a list
        of dicts (entry fields plus ``id``), oldest first; empty if nothing
        arrived.
        """
        cursor_key = LAST_READ_KEY.format(agent=self.agent)
        last_id = self.r.get(cursor_key) or "0"

        result = self.r.xread({STREAM_NAME: last_id}, block=block_ms)
        if not result:
            return []

        messages = []
        newest_id = None
        for _stream, entries in result:
            for msg_id, data in entries:
                messages.append({"id": msg_id, **data})
                newest_id = msg_id

        # One write for the whole batch instead of one per message.
        if newest_id is not None:
            self.r.set(cursor_key, newest_id)

        return messages

    def read_all(self, count=50):
        """Read last N messages regardless of read status (newest first)."""
        entries = self.r.xrevrange(STREAM_NAME, count=count)
        return [{"id": msg_id, **data} for msg_id, data in entries]

    def read_since(self, hours=24):
        """Read messages from last N hours (oldest first, at most 1000)."""
        cutoff_ms = int((time.time() - hours * 3600) * 1000)

        # Get messages since cutoff (approximate using ID which is timestamp-based)
        entries = self.r.xrange(STREAM_NAME, min=f"{cutoff_ms}-0", count=1000)
        return [{"id": msg_id, **data} for msg_id, data in entries]

    def wait_for_reply(self, reply_to_id, timeout_sec=30):
        """Block until a reply to a specific message arrives.

        Scans the stream from the beginning, then waits for new entries.
        Returns the first entry whose ``reply_to`` equals ``reply_to_id``, or
        None once ``timeout_sec`` seconds have elapsed in total.
        """
        deadline = time.monotonic() + timeout_sec
        last_seen = "0"

        while True:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return None

            # Only block for the time that is left. Never pass 0: for XREAD
            # that means "block forever".
            block_ms = max(1, int(remaining * 1000))
            result = self.r.xread({STREAM_NAME: last_seen}, block=block_ms)

            for _stream, entries in result or []:
                for msg_id, data in entries:
                    last_seen = msg_id
                    if data.get("reply_to") == reply_to_id:
                        return {"id": msg_id, **data}

    def format_message(self, msg):
        """Pretty print a message as a single display line."""
        ts = msg.get("timestamp", "")[11:19]  # HH:MM:SS only
        agent = msg.get("agent", "?")
        msg_type = msg.get("type", "?")
        text = msg.get("message", "")
        reply_to = msg.get("reply_to", "")
        from_user = msg.get("from_user", "false") == "true"

        icon = "🤖" if agent == "Max" else "🎙️"
        type_icon = {
            "idea": "💡",
            "question": "❓",
            "update": "📢",
            "reply": "↩️"
        }.get(msg_type, "•")

        # Show 📝 if message is from Rob (relayed by agent), otherwise show agent icon only
        source_icon = "📝" if from_user else icon

        reply_info = f" [reply to {reply_to[:8]}...]" if reply_to else ""
        return f"[{ts}] {source_icon} {agent} {type_icon} {text}{reply_info}"
send_p.add_argument("--type", default="update", choices=["idea", "question", "update", "reply"]) + send_p.add_argument("--message", "-m", required=True, help="Message text") + send_p.add_argument("--reply-to", help="Reply to message ID") + send_p.add_argument("--from-user", action="store_true", help="Mark as message from Rob (not from agent)") + + # Read command + read_p = subparsers.add_parser("read", help="Read messages") + read_p.add_argument("--new", action="store_true", help="Only unread messages") + read_p.add_argument("--all", action="store_true", help="Last 50 messages") + read_p.add_argument("--since", type=int, help="Messages from last N hours") + read_p.add_argument("--wait", action="store_true", help="Wait for new messages (blocking)") + + args = parser.parse_args() + + chat = AgentChat(args.agent) + + if args.command == "send": + msg_id = chat.send(args.type, args.message, args.reply_to, args.from_user) + print(f"Message ID: {msg_id}") + + elif args.command == "read": + if args.new or args.wait: + if args.wait: + print("Waiting for messages... 
(Ctrl+C to stop)") + try: + while True: + msgs = chat.read_new(block_ms=5000) + for m in msgs: + print(chat.format_message(m)) + except KeyboardInterrupt: + print("\nStopped.") + else: + msgs = chat.read_new() + for m in msgs: + print(chat.format_message(m)) + if not msgs: + print("No new messages.") + + elif args.since: + msgs = chat.read_since(args.since) + for m in msgs: + print(chat.format_message(m)) + if not msgs: + print(f"No messages in last {args.since} hours.") + + else: # default --all + msgs = chat.read_all() + for m in reversed(msgs): # Chronological order + print(chat.format_message(m)) + if not msgs: + print("No messages in stream.") + + else: + parser.print_help() + +if __name__ == "__main__": + main() diff --git a/skills/qdrant-memory/scripts/agent_check.py b/skills/qdrant-memory/scripts/agent_check.py new file mode 100755 index 0000000..2c81126 --- /dev/null +++ b/skills/qdrant-memory/scripts/agent_check.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +""" +Check agent messages from Redis stream +Usage: agent_check.py [--list N] [--check] [--last-minutes M] +""" + +import argparse +import sys +import json +import time +from datetime import datetime, timezone + +# Add parent to path for imports +sys.path.insert(0, '/root/.openclaw/workspace/skills/qdrant-memory') + +try: + import redis +except ImportError: + print("❌ Redis module not available") + sys.exit(1) + +REDIS_HOST = "10.0.0.36" +REDIS_PORT = 6379 +STREAM_KEY = "agent-messages" +LAST_CHECKED_KEY = "agent:last_check_timestamp" + +def get_redis_client(): + """Get Redis connection""" + try: + return redis.Redis( + host=REDIS_HOST, + port=REDIS_PORT, + decode_responses=True, + socket_connect_timeout=5, + socket_timeout=5 + ) + except Exception as e: + print(f"❌ Redis connection failed: {e}") + return None + +def get_messages_since(last_check=None, count=10): + """Get messages from Redis stream since last check""" + r = get_redis_client() + if not r: + return [] + + try: + # Get last N messages 
def format_message(msg):
    """Format a message for display

    ``msg`` is a dict with ``time`` (a datetime) and ``data`` (the stream
    entry's field dict). Content longer than 200 characters is truncated and
    suffixed with ``...``.

    The sender and text are looked up under ``sender``/``content`` first and
    then under ``agent``/``message``, which are the field names agent_chat.py
    writes to the same stream.
    """
    time_str = msg['time'].strftime('%Y-%m-%d %H:%M:%S UTC')
    data = msg['data']

    sender = data.get('sender') or data.get('agent') or 'unknown'
    recipient = data.get('recipient', 'all')
    msg_type = data.get('type', 'message')
    content = data.get('content') or data.get('message') or ''

    suffix = '...' if len(content) > 200 else ''
    return f"[{time_str}] {sender} → {recipient} ({msg_type}):\n {content[:200]}{suffix}"
class APIScraper:
    """Small JSON REST client with page-number pagination and text formatting."""

    def __init__(self, base_url, headers=None, rate_limit=0):
        """Store the base URL, request headers and delay between page requests."""
        self.base_url = base_url
        self.headers = headers or {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
            'Accept': 'application/json'
        }
        self.rate_limit = rate_limit  # seconds between requests

    def fetch(self, url, params=None):
        """Fetch JSON from API

        ``params`` (a dict) is URL-encoded and appended to ``url``. Returns
        the decoded JSON value, or None after printing the error to stderr.
        """
        if params:
            import urllib.parse
            query = urllib.parse.urlencode(params)
            separator = '&' if '?' in url else '?'
            url = f"{url}{separator}{query}"

        req = urllib.request.Request(url, headers=self.headers)

        try:
            with urllib.request.urlopen(req, timeout=30) as response:
                return json.loads(response.read().decode())
        except urllib.error.HTTPError as e:
            print(f"❌ HTTP {e.code}: {e.reason}", file=sys.stderr)
            return None
        except Exception as e:
            print(f"❌ Error: {e}", file=sys.stderr)
            return None

    @staticmethod
    def _extract_items(data, data_key=None):
        """Return the list of items contained in one decoded API response."""
        if isinstance(data, dict):
            if data_key:
                items = data.get(data_key, [])
            else:
                # Try common keys
                for key in ['data', 'items', 'results', 'records', 'docs']:
                    if key in data:
                        items = data[key]
                        break
                else:
                    items = [data]  # Single item
        else:
            # A top-level array is the item list itself, whatever data_key says.
            items = data

        if items is None:
            return []
        if not isinstance(items, list):
            # A single object under the data key: keep it as one item rather
            # than letting list.extend() spread its keys.
            return [items]
        return items

    def paginate(self, endpoint, page_param="page", size_param="limit",
                 size=100, max_pages=None, data_key=None):
        """Fetch paginated results

        Requests pages 1, 2, ... until a request fails, a page is empty or
        shorter than ``size``, or ``max_pages`` is reached. ``endpoint`` is
        used as-is when it starts with ``http``; otherwise it is appended to
        the base URL. Returns all collected items in one list.
        """
        import time  # local import: needed only here

        url = endpoint if endpoint.startswith('http') else f"{self.base_url}{endpoint}"
        all_data = []
        page = 1

        while True:
            print(f"📄 Fetching page {page}...")
            data = self.fetch(url, {page_param: page, size_param: size})

            if not data:
                break

            # Extract items from response
            items = self._extract_items(data, data_key)
            if not items:
                break

            all_data.extend(items)

            # Check for more pages
            if max_pages and page >= max_pages:
                print(f" Reached max pages ({max_pages})")
                break

            # Check if we got less than requested (last page)
            if len(items) < size:
                break

            page += 1

            if self.rate_limit:
                time.sleep(self.rate_limit)

        return all_data

    def format_for_kb(self, items, format_template=None):
        """Format API items as text for knowledge base

        With ``format_template``, each dict item is rendered through
        ``str.format`` using the item's fields plus ``index`` (1-based
        position; a field of the item named ``index`` takes precedence). If
        the template references a missing field the item is dumped as
        indented JSON. Items that are not dicts, and all items when no
        template is given, go through ``_auto_format``. Rendered items are
        joined with a ``---`` separator line.
        """
        if not items:
            return ""

        parts = []

        for position, item in enumerate(items, start=1):
            if format_template and isinstance(item, dict):
                # Use custom template
                try:
                    text = format_template.format(**{"index": position, **item})
                except (KeyError, IndexError):
                    text = json.dumps(item, indent=2)
            else:
                # Auto-format
                text = self._auto_format(item)

            parts.append(text)

        return "\n\n---\n\n".join(parts)

    def _auto_format(self, item):
        """Auto-format a JSON item as readable text

        Strings are returned unchanged and other non-dict values as indented
        JSON. For a dict: the first of name/title/id/key present becomes a
        ``# `` heading, the first of description/summary/content/body/text
        becomes the body, and every remaining non-None field outside those
        two groups is appended as ``**key:** value``.
        """
        if isinstance(item, str):
            return item

        if not isinstance(item, dict):
            return json.dumps(item, indent=2)

        title_keys = ['name', 'title', 'id', 'key']
        body_keys = ['description', 'summary', 'content', 'body', 'text']
        parts = []

        # Title/Name first
        for key in title_keys:
            if key in item:
                parts.append(f"# {item[key]}")
                break

        # Description/summary
        for key in body_keys:
            if key in item:
                parts.append(f"\n{item[key]}")
                break

        # Other fields
        skip = title_keys + body_keys
        for key, value in item.items():
            if key in skip or value is None:
                continue
            if isinstance(value, (list, dict)):
                value = json.dumps(value, indent=2)
            parts.append(f"\n**{key}:** {value}")

        return "\n".join(parts)
+ parser.add_argument("--title", help="Content title") + parser.add_argument("--output", "-o", help="Save to JSON file instead of KB") + parser.add_argument("--rate-limit", type=float, default=0.5, + help="Seconds between requests (default: 0.5)") + + args = parser.parse_args() + + # Build headers + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)', + 'Accept': 'application/json' + } + if args.header: + for key, value in args.header: + headers[key] = value + + scraper = APIScraper(args.url, headers=headers, rate_limit=args.rate_limit) + + print(f"🔌 API: {args.url}") + print(f"🏷️ Domain: {args.domain}") + print(f"📂 Path: {args.path}") + + # Fetch data + if args.paginate: + print("📄 Pagination enabled\n") + items = scraper.paginate( + args.url, + page_param=args.page_param, + size_param=args.size_param, + size=args.size, + max_pages=args.max_pages, + data_key=args.data_key + ) + else: + print("📄 Single request\n") + data = scraper.fetch(args.url) + if data_key := args.data_key: + items = data.get(data_key, []) if data else [] + elif isinstance(data, list): + items = data + else: + items = [data] if data else [] + + if not items: + print("❌ No data fetched", file=sys.stderr) + sys.exit(1) + + print(f"✓ Fetched {len(items)} items") + + if args.output: + with open(args.output, 'w') as f: + json.dump(items, f, indent=2) + print(f"💾 Saved raw data to {args.output}") + return + + # Format for KB + text = scraper.format_for_kb(items, args.format) + + print(f"📝 Formatted: {len(text)} chars") + + if len(text) < 200: + print("❌ Content too short", file=sys.stderr) + sys.exit(1) + + chunks = chunk_text(text) + print(f"🧩 Chunks: {len(chunks)}") + + subjects = [s.strip() for s in args.subjects.split(",")] if args.subjects else [] + checksum = compute_checksum(text) + title = args.title or f"API Data from {args.url}" + + print("💾 Storing...") + stored = 0 + for i, chunk in enumerate(chunks): + chunk_metadata = { + "domain": args.domain, + "path": 
f"{args.path}/chunk-{i+1}", + "subjects": subjects, + "category": args.category, + "content_type": args.content_type, + "title": f"{title} (part {i+1}/{len(chunks)})", + "checksum": checksum, + "source_url": args.url, + "date_added": datetime.now().strftime("%Y-%m-%d"), + "chunk_index": i + 1, + "total_chunks": len(chunks), + "text_preview": chunk[:200] + "..." if len(chunk) > 200 else chunk, + "scraper_type": "api_rest", + "item_count": len(items), + "api_endpoint": args.url + } + + if store_in_kb(chunk, chunk_metadata): + stored += 1 + print(f" ✓ Chunk {i+1}") + + print(f"\n🎉 Stored {stored}/{len(chunks)} chunks") + print(f" Source: {args.url}") + print(f" Items: {len(items)}") + +if __name__ == "__main__": + main() diff --git a/skills/qdrant-memory/scripts/auto_store.py b/skills/qdrant-memory/scripts/auto_store.py new file mode 100755 index 0000000..aa2b3f3 --- /dev/null +++ b/skills/qdrant-memory/scripts/auto_store.py @@ -0,0 +1,388 @@ +#!/usr/bin/env python3 +""" +Auto Conversation Memory - TRUE Mem0-style Full Context Storage + +User-centric memory - all conversations link to persistent user_id. +NOT session/chat-centric like old version. 
def is_duplicate(user_id: str, user_msg: str, ai_response: str) -> bool:
    """
    Check if this conversation turn already exists for this user.

    Uses: user_id + content_hash

    The per-process cache of recent hashes is consulted first; otherwise
    Qdrant is asked for one point matching both ``user_id`` and the content
    hash. The check is best-effort: if Qdrant cannot be queried the error is
    reported on stderr and the turn is treated as not a duplicate.

    NOTE(review): the in-memory cache is keyed by content hash only, so a hit
    does not depend on ``user_id`` — confirm that is intended.
    """
    content_hash = get_content_hash(user_msg, ai_response)

    # Check in-memory cache first
    if content_hash in _recent_hashes:
        return True

    # Check Qdrant for existing entry with this user_id + content_hash
    search_body = {
        "filter": {
            "must": [
                {"key": "user_id", "match": {"value": user_id}},
                {"key": "content_hash", "match": {"value": content_hash}}
            ]
        },
        "limit": 1,
        "with_payload": False
    }

    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
        data=json.dumps(search_body).encode(),
        headers={"Content-Type": "application/json"}
    )

    try:
        with urllib.request.urlopen(req, timeout=10) as response:
            result = json.loads(response.read().decode())
        points = result.get("result", {}).get("points", [])
        return len(points) > 0
    except Exception as e:
        # Deliberately non-fatal, but no longer silent: an unreachable Qdrant
        # would otherwise let duplicates through unnoticed.
        print(f"[AutoMemory] Duplicate check error: {e}", file=sys.stderr)
        return False
def _store_summary_point(
    user_id: str,
    user_message: str,
    ai_response: str,
    date_str: str,
    conversation_id: str,
    turn_number: int,
    session_id: Optional[str],
    tags: List[str],
    importance: str,
    content_hash: str
) -> None:
    """Best-effort upsert of the combined Q/A summary point for one turn.

    Nothing is written when no embedding can be produced; storage errors are
    reported on stderr and otherwise ignored.
    """
    summary = generate_conversation_summary(user_message, ai_response)
    summary_text = f"[Turn {turn_number}] {summary}"

    summary_embedding = get_embedding(summary_text)
    if not summary_embedding:
        return

    now_iso = datetime.now().isoformat()
    summary_payload = {
        "user_id": user_id,
        "text": summary_text,
        "date": date_str,
        "tags": tags + ["summary", "combined"],
        "importance": importance,
        "source": "conversation_summary",
        "source_type": "system",
        "category": "Conversation Summary",
        "confidence": "high",
        "verified": True,
        "created_at": now_iso,
        "access_count": 0,
        "last_accessed": now_iso,
        "conversation_id": conversation_id,
        "turn_number": turn_number,
        "session_id": session_id or "",
        "content_hash": content_hash,
        # Truncated copies of the raw turn, kept next to the summary text.
        "user_message": user_message[:500],
        "ai_response": ai_response[:800]
    }

    upsert_data = {
        "points": [{
            "id": str(uuid.uuid4()),
            "vector": summary_embedding,
            "payload": summary_payload
        }]
    }

    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
        data=json.dumps(upsert_data).encode(),
        headers={"Content-Type": "application/json"},
        method="PUT"
    )

    try:
        with urllib.request.urlopen(req, timeout=10) as response:
            json.loads(response.read().decode())
    except Exception as e:
        print(f"[AutoMemory] Summary storage error: {e}", file=sys.stderr)


def store_conversation_turn(
    user_id: str,
    user_message: str,
    ai_response: str,
    conversation_id: Optional[str] = None,
    turn_number: Optional[int] = None,
    session_id: Optional[str] = None,
    date_str: Optional[str] = None,
    skip_if_duplicate: bool = True
) -> Dict[str, Any]:
    """
    Store a full conversation turn to Qdrant (Mem0-style)

    Up to three points are written: the user message, the AI response and a
    combined summary. The summary is best-effort and does not affect the
    reported success.

    Args:
        user_id: PERSISTENT user identifier (e.g., "rob") - REQUIRED
        user_message: User's message
        ai_response: AI's response
        conversation_id: Groups related turns (auto-generated if None)
        turn_number: Sequential turn number (defaults to 1 if None)
        session_id: Optional chat session identifier
        date_str: Date in YYYY-MM-DD format (defaults to today)
        skip_if_duplicate: When True, do nothing if is_duplicate() reports
            that this user already has this exact turn stored

    Returns:
        dict with keys user_point_id, ai_point_id, user_id, conversation_id,
        turn_number, success and skipped. On a skipped duplicate both point
        ids are None and success is True.

    Raises:
        ValueError: if user_id is empty.
    """
    if not user_id:
        raise ValueError("user_id is required for Mem0-style storage")

    if date_str is None:
        date_str = datetime.now().strftime("%Y-%m-%d")

    # Default only a missing turn number, so an explicit turn 0 is reported
    # the same way on the duplicate path as on the normal path.
    if turn_number is None:
        turn_number = 1

    # Check for duplicates (per user)
    if skip_if_duplicate and is_duplicate(user_id, user_message, ai_response):
        return {
            "user_point_id": None,
            "ai_point_id": None,
            "user_id": user_id,
            "conversation_id": conversation_id or "",
            "turn_number": turn_number,
            "success": True,
            "skipped": True
        }

    if conversation_id is None:
        conversation_id = str(uuid.uuid4())

    # Tags include user_id for easy filtering
    tags = [
        "conversation",
        f"user:{user_id}",
        date_str
    ]
    if session_id:
        tags.append(f"session:{session_id[:8]}")

    # A turn is "high" importance when either side mentions a trigger keyword.
    combined_text = (user_message + ai_response).lower()
    trigger_keywords = ("remember", "important", "always", "never", "rule")
    importance = "high" if any(kw in combined_text for kw in trigger_keywords) else "medium"

    content_hash = get_content_hash(user_message, ai_response)

    # Store user message
    user_id_point = store_memory_point(
        user_id=user_id,
        text=f"[{user_id}]: {user_message}",
        speaker="user",
        date_str=date_str,
        conversation_id=conversation_id,
        turn_number=turn_number,
        session_id=session_id,
        tags=tags + ["user-message"],
        importance=importance,
        content_hash=content_hash
    )

    # Store AI response
    ai_id_point = store_memory_point(
        user_id=user_id,
        text=f"[Kimi]: {ai_response}",
        speaker="assistant",
        date_str=date_str,
        conversation_id=conversation_id,
        turn_number=turn_number,
        session_id=session_id,
        tags=tags + ["ai-response"],
        importance=importance,
        content_hash=content_hash
    )

    # Store summary (best-effort)
    _store_summary_point(
        user_id=user_id,
        user_message=user_message,
        ai_response=ai_response,
        date_str=date_str,
        conversation_id=conversation_id,
        turn_number=turn_number,
        session_id=session_id,
        tags=tags,
        importance=importance,
        content_hash=content_hash
    )

    # Only remember the turn in the in-memory cache when both halves landed.
    stored_both = bool(user_id_point and ai_id_point)
    if stored_both:
        mark_stored(user_message, ai_response)

    return {
        "user_point_id": user_id_point,
        "ai_point_id": ai_id_point,
        "user_id": user_id,
        "conversation_id": conversation_id,
        "turn_number": turn_number,
        "success": stored_both,
        "skipped": False
    }
message") + parser.add_argument("ai_response", help="The AI's response") + parser.add_argument("--user-id", required=True, + help="REQUIRED: Persistent user ID (e.g., 'rob')") + parser.add_argument("--conversation-id", + help="Conversation ID for threading (auto-generated if not provided)") + parser.add_argument("--turn", type=int, help="Turn number in conversation") + parser.add_argument("--session-id", + help="Optional: Session/chat instance ID") + parser.add_argument("--date", default=datetime.now().strftime("%Y-%m-%d"), + help="Date in YYYY-MM-DD format") + + args = parser.parse_args() + + result = store_conversation_turn( + user_id=args.user_id, + user_message=args.user_message, + ai_response=args.ai_response, + conversation_id=args.conversation_id, + turn_number=args.turn, + session_id=args.session_id, + date_str=args.date + ) + + if result.get("skipped"): + print(f"⚡ Skipped duplicate (already stored for user {result['user_id']})") + elif result["success"]: + print(f"✅ Stored for user '{result['user_id']}' turn {result['turn_number']}") + print(f" Conversation: {result['conversation_id'][:8]}...") + else: + print("❌ Failed to store conversation", file=sys.stderr) + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/skills/qdrant-memory/scripts/backfill_emails.py b/skills/qdrant-memory/scripts/backfill_emails.py new file mode 100755 index 0000000..1c55e1b --- /dev/null +++ b/skills/qdrant-memory/scripts/backfill_emails.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +""" +Backfill emails to Qdrant for a specific user. +One-time use to populate memories from existing emails. 
def backfill(user_id=None, limit=20):
    """Backfill emails for specific user or all authorized senders.

    Looks at the newest ``limit`` messages in the inbox and hands every
    message whose From header contains an authorized address to
    store_email_memory().

    Args:
        user_id: If given, only senders mapped to this user id in
            AUTHORIZED_SENDERS are considered; otherwise all of them are.
        limit: Maximum number of most-recent inbox messages to inspect.
            Values <= 0 inspect nothing.

    Returns:
        None. Progress and errors are printed.
    """
    creds = load_credentials()
    if not creds:
        return

    email_addr = creds.get("email")
    app_password = creds.get("app_password")
    if not email_addr or not app_password:
        print("Error: credentials file is missing 'email' or 'app_password'")
        return

    # Lower-cased sender address -> user id, restricted to the requested user.
    targets = {
        addr.lower(): uid
        for addr, uid in AUTHORIZED_SENDERS.items()
        if not user_id or uid == user_id
    }
    if not targets:
        print("No authorized senders match this request; nothing to do.")
        return

    # email_ids[-0:] would be the WHOLE list, so reject non-positive limits.
    if limit <= 0:
        print("Limit must be positive; nothing to do.")
        return

    try:
        # The context manager logs out even when an error interrupts the run.
        with imaplib.IMAP4_SSL(IMAP_SERVER, IMAP_PORT) as mail:
            mail.login(email_addr, app_password)
            # Read-only so fetching RFC822 does not mark messages as read.
            mail.select("inbox", readonly=True)

            # Get ALL emails
            status, messages = mail.search(None, "ALL")
            if status != "OK" or not messages[0]:
                print("No emails found.")
                return

            email_ids = messages[0].split()
            print(f"Found {len(email_ids)} total emails")

            stored_count = 0
            for eid in email_ids[-limit:]:
                status, msg_data = mail.fetch(eid, "(RFC822)")
                if status != "OK":
                    continue

                msg = email.message_from_bytes(msg_data[0][1], policy=default)
                sender = msg.get("From", "").lower()

                # First authorized address contained in the From header wins.
                uid = next((u for addr, u in targets.items() if addr in sender), None)
                if uid is None:
                    continue

                subject = msg.get("Subject", "")
                date = msg.get("Date", "")

                # Extract body: first text/plain part, or the whole payload
                # for a single-part message.
                body = ""
                if msg.is_multipart():
                    for part in msg.walk():
                        if part.get_content_type() == "text/plain":
                            body = part.get_content()
                            break
                else:
                    body = msg.get_content()
                # Non-text single-part messages yield bytes; store no body then.
                if not isinstance(body, str):
                    body = ""
                body = body.strip()[:2000]

                print(f"\nStoring for {uid}:")
                store_email_memory(uid, sender, subject, body, date)
                stored_count += 1

            print(f"\nDone! Stored {stored_count} emails to Qdrant.")

    except Exception as e:
        # Top-level boundary of a one-off script: report and stop.
        print(f"Error: {e}")
def store_in_background(
    user_id: str,
    user_message: str,
    ai_response: str,
    turn: int = None,
    session_id: str = None
):
    """Fire off storage without waiting - returns immediately

    Launches auto_store.py as a detached child process and does not wait for
    it or inspect its result.

    Args:
        user_id: Persistent user identifier passed as --user-id.
        user_message: The user's message (first positional argument).
        ai_response: The AI's reply (second positional argument).
        turn: Optional turn number; forwarded whenever it is not None, so
            turn 0 is preserved.
        session_id: Optional session/chat id; forwarded when non-empty.

    Returns:
        True once the child process has been spawned.
    """
    # "--opt=value" keeps values that start with "-" attached to their option.
    cmd = [
        sys.executable,
        str(AUTO_STORE),
        f"--user-id={user_id}"
    ]

    if turn is not None:
        cmd.append(f"--turn={turn}")

    if session_id:
        cmd.append(f"--session-id={session_id}")

    # "--" ends option parsing, so a message such as "--help" or "-x" is
    # treated as text by the child's argparse instead of as a flag.
    cmd.extend(["--", user_message, ai_response])

    # Fire and forget: fully detached, no inherited stdio.
    subprocess.Popen(
        cmd,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True
    )

    return True
def load_urls(url_source):
    """Load URLs from file or JSON

    A path ending in ``.json`` must hold a list of objects with a ``url`` key
    and optional ``title`` / ``subjects`` keys. Any other path is read as
    text, one entry per line::

        URL [Optional Title] #subject1,#subject2

    Blank lines and lines starting with ``#`` are skipped.

    Args:
        url_source: Path of the file to read.

    Returns:
        List of ``(url, title_or_None, subjects_list)`` tuples.
    """
    if url_source.endswith('.json'):
        with open(url_source, encoding='utf-8') as f:
            data = json.load(f)
        return [(item['url'], item.get('title'), item.get('subjects', []))
                for item in data]

    urls = []
    with open(url_source, encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue

            # Split on any whitespace so tab-separated entries work too.
            parts = line.split(None, 1)
            url = parts[0]
            rest = parts[1] if len(parts) > 1 else ''

            # [Title]: require the closing bracket to come after the opening one.
            title = None
            open_idx = rest.find('[')
            close_idx = rest.find(']', open_idx + 1) if open_idx != -1 else -1
            if close_idx != -1:
                title = rest[open_idx + 1:close_idx]
                rest = rest[close_idx + 1:]

            # "#a,#b" -> ["a", "b"]: drop the comma separators and padding.
            subjects = []
            if '#' in rest:
                cleaned = (piece.strip(' ,\t') for piece in rest.split('#'))
                subjects = [piece for piece in cleaned if piece]

            urls.append((url, title, subjects))
    return urls
if len(chunk) > 200 else chunk + } + + if store_in_kb(chunk, chunk_metadata): + stored += 1 + + return { + "url": url, + "status": "success", + "chunks": len(chunks), + "stored": stored, + "title": title + } + except Exception as e: + return {"url": url, "status": "error", "error": str(e)} + +def main(): + parser = argparse.ArgumentParser(description="Batch scrape URLs to knowledge base") + parser.add_argument("urls", help="File with URLs (.txt or .json)") + parser.add_argument("--domain", required=True, help="Knowledge domain") + parser.add_argument("--path", required=True, help="Hierarchical path") + parser.add_argument("--category", default="reference", + choices=["reference", "tutorial", "snippet", "troubleshooting", "concept"]) + parser.add_argument("--content-type", default="web_page") + parser.add_argument("--workers", type=int, default=3, help="Concurrent workers (default: 3)") + parser.add_argument("--dry-run", action="store_true", help="Test without storing") + + args = parser.parse_args() + + urls = load_urls(args.urls) + print(f"📋 Loaded {len(urls)} URLs") + print(f"🏷️ Domain: {args.domain}") + print(f"📂 Path: {args.path}") + print(f"⚡ Workers: {args.workers}") + + if args.dry_run: + print("\n🔍 DRY RUN - No storage\n") + for url, title, subjects in urls: + print(f" Would scrape: {url}") + if title: + print(f" Title: {title}") + if subjects: + print(f" Subjects: {', '.join(subjects)}") + return + + results = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as executor: + futures = { + executor.submit(scrape_single, url_data, args.domain, args.path, + args.category, args.content_type): url_data + for url_data in urls + } + + for future in concurrent.futures.as_completed(futures): + result = future.result() + results.append(result) + + if result["status"] == "success": + print(f" ✓ {result['title'][:50]}... 
def store_memory(text, embedding, tags=None, importance="medium", date=None,
                 source="memory_backup", confidence="high", source_type="user",
                 verified=True):
    """Store memory in Qdrant with metadata

    Args:
        text: Memory text saved in the payload.
        embedding: Vector for the point (as returned by get_embedding()).
        tags: Optional list of tag strings (defaults to an empty list).
        importance: Importance label stored in the payload.
        date: YYYY-MM-DD date string; defaults to today.
        source: Origin label stored in the payload.
        confidence: Confidence label stored in the payload.
        source_type: Source-type label stored in the payload.
        verified: Verification flag stored in the payload.

    Returns:
        True when Qdrant acknowledges the upsert, False on any error.
    """
    if date is None:
        date = datetime.now().strftime("%Y-%m-%d")

    point = {
        "id": str(uuid.uuid4()),
        "vector": embedding,
        "payload": {
            "text": text,
            "date": date,
            "tags": tags or [],
            "importance": importance,
            "confidence": confidence,
            "source_type": source_type,
            "verified": verified,
            "source": source,
            "created_at": datetime.now().isoformat(),
            "access_count": 0
        }
    }

    # Upsert is PUT on /points; a Request carrying data defaults to POST,
    # which Qdrant routes to "retrieve points" instead.
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
        data=json.dumps({"points": [point]}).encode(),
        headers={"Content-Type": "application/json"},
        method="PUT"
    )

    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
    except Exception as e:
        print(f"Error storing memory: {e}", file=sys.stderr)
        return False

    # The overall outcome is the top-level "status"; result["result"]["status"]
    # is the operation state ("completed"/"acknowledged"), never "ok".
    return result.get("status") == "ok"
"tags": extract_tags(current_section, section_text), + "importance": importance + }) + current_section = line[3:].strip() + current_content = [] + elif line.startswith('### '): + # Save previous section + if current_section and current_content: + section_text = '\n'.join(current_content).strip() + if len(section_text) > 20: + memories.append({ + "text": f"{current_section}: {section_text}", + "date": date, + "tags": extract_tags(current_section, section_text), + "importance": importance + }) + current_section = line[4:].strip() + current_content = [] + else: + if current_section: + current_content.append(line) + + # Save final section + if current_section and current_content: + section_text = '\n'.join(current_content).strip() + if len(section_text) > 20: + memories.append({ + "text": f"{current_section}: {section_text}", + "date": date, + "tags": extract_tags(current_section, section_text), + "importance": importance + }) + + return memories + +def extract_tags(section, content): + """Extract relevant tags from section and content""" + tags = [] + + # Section-based tags + if any(word in section.lower() for word in ['voice', 'tts', 'stt', 'audio']): + tags.extend(['voice', 'audio']) + if any(word in section.lower() for word in ['memory', 'qdrant', 'remember']): + tags.extend(['memory', 'qdrant']) + if any(word in section.lower() for word in ['redis', 'agent', 'message', 'max']): + tags.extend(['redis', 'messaging', 'agent']) + if any(word in section.lower() for word in ['youtube', 'seo', 'content']): + tags.extend(['youtube', 'content']) + if any(word in section.lower() for word in ['search', 'searxng', 'web']): + tags.extend(['search', 'web']) + if any(word in section.lower() for word in ['setup', 'install', 'bootstrap']): + tags.extend(['setup', 'configuration']) + + # Content-based tags + content_lower = content.lower() + if 'voice' in content_lower: + tags.append('voice') + if 'memory' in content_lower: + tags.append('memory') + if 'qdrant' in content_lower: + 
def extract_core_memories_from_memory_md():
    """Extract high-importance memories from MEMORY.md

    For each known core ``## `` section, every ``### `` subsection longer
    than 50 characters becomes one memory whose text is capped at 500
    characters of the subsection.

    Returns:
        List of dicts with keys text, date, tags and importance. Empty when
        MEMORY.md cannot be read or no section matches.
    """
    memories = []

    try:
        with open(MEMORY_MD, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading MEMORY.md: {e}", file=sys.stderr)
        return memories

    # Core sections with high importance
    sections = [
        ("Identity & Names", "high"),
        ("Core Preferences", "high"),
        ("Communication Rules", "high"),
        ("Voice Settings", "high"),
        ("Lessons Learned", "high"),
    ]

    flags = re.DOTALL | re.MULTILINE
    # Headers are anchored to line starts: an unanchored "## " also matches
    # inside "### ", which would end a section at its first subsection.
    subsection_header = re.compile(r"^### (.+)$", re.MULTILINE)

    for section_name, importance in sections:
        pattern = rf"^## {re.escape(section_name)}.*?(?=^## |\Z)"
        match = re.search(pattern, content, flags)
        if not match:
            continue

        section_text = match.group(0).strip()
        headers = list(subsection_header.finditer(section_text))
        for idx, header in enumerate(headers):
            # A subsection runs up to the next "### " header or section end.
            end = headers[idx + 1].start() if idx + 1 < len(headers) else len(section_text)
            sub = header.group(1).strip()
            sub_text = section_text[header.start():end].strip()
            if len(sub_text) > 50:
                memories.append({
                    "text": f"{section_name} - {sub}: {sub_text[:500]}",
                    # NOTE(review): fixed migration date kept from the original.
                    "date": "2026-02-10",
                    "tags": extract_tags(section_name, sub_text) + ['core', 'longterm'],
                    "importance": importance
                })

    return memories
def get_cst_date(now=None):
    """Get current date in US Central time (America/Chicago)

    Args:
        now: Optional datetime to convert instead of the current time. A
            naive value is treated as UTC. Defaults to the current UTC time.

    Returns:
        The Central-time calendar date as a ``YYYY-MM-DD`` string.
    """
    from datetime import datetime, timedelta, timezone

    if now is None:
        now = datetime.now(timezone.utc)
    elif now.tzinfo is None:
        now = now.replace(tzinfo=timezone.utc)

    # Preferred path: real tz database rules (Python 3.9+ with tz data).
    try:
        from zoneinfo import ZoneInfo
        central = ZoneInfo("America/Chicago")
    except (ImportError, KeyError):
        # No zoneinfo module, or no tz data (ZoneInfoNotFoundError is a KeyError).
        central = None

    if central is not None:
        return now.astimezone(central).strftime('%Y-%m-%d')

    # Fallback: current US rule. DST runs from the second Sunday of March,
    # 02:00 CST (08:00 UTC), to the first Sunday of November, 02:00 CDT
    # (07:00 UTC).
    utc_now = now.astimezone(timezone.utc)

    def nth_sunday_utc(month, n, hour_utc):
        first = datetime(utc_now.year, month, 1, hour_utc, tzinfo=timezone.utc)
        days_to_sunday = (6 - first.weekday()) % 7  # weekday(): Mon=0 .. Sun=6
        return first + timedelta(days=days_to_sunday + 7 * (n - 1))

    dst_start = nth_sunday_utc(3, 2, 8)
    dst_end = nth_sunday_utc(11, 1, 7)
    offset_hours = -5 if dst_start <= utc_now < dst_end else -6

    # timedelta arithmetic rolls the date over; replace(hour=...) would not.
    return (utc_now + timedelta(hours=offset_hours)).strftime('%Y-%m-%d')
def get_existing_dates():
    """Get list of dates already backed up via daily-backup (not manual stores)

    Scrolls the whole collection page by page and collects the ``date`` of
    every point whose ``source`` is "daily-backup" or whose ``tags`` contain
    "daily-backup".

    Returns:
        Set of date strings. If a request fails part-way, the dates gathered
        so far are returned and a warning is printed to stderr.
    """
    backup_dates = set()
    next_offset = None

    try:
        while True:
            scroll_body = {
                "limit": 1000,
                "with_payload": True,
                "with_vectors": False
            }
            if next_offset is not None:
                scroll_body["offset"] = next_offset

            req = urllib.request.Request(
                f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
                data=json.dumps(scroll_body).encode(),
                headers={"Content-Type": "application/json"},
                method="POST"
            )
            with urllib.request.urlopen(req, timeout=30) as response:
                result = json.loads(response.read().decode())

            page = result.get("result") or {}
            for p in page.get("points") or []:
                payload = p.get("payload") or {}
                date = payload.get("date")
                if not date:
                    continue
                # Only count entries written by daily-backup (source field,
                # or the tag as a fallback), not conversation/manual stores.
                if payload.get("source") == "daily-backup" or \
                        "daily-backup" in (payload.get("tags") or []):
                    backup_dates.add(date)

            # Qdrant returns the id to resume from, or null on the last page.
            next_offset = page.get("next_page_offset")
            if next_offset is None:
                break
    except Exception as e:
        print(f"Warning: Could not check existing dates: {e}", file=sys.stderr)

    return backup_dates
def main():
    """Back up daily memory markdown files to Qdrant and return an exit status.

    Workflow: fetch the dates that already have a backup, select the memory
    files that still need one (all of them with ``--force``), generate one
    embedding per file, then upsert the resulting points in batches.

    Returns:
        0 when there is nothing to back up, after a dry run, or when every
        selected file was embedded and uploaded; 1 when any file could not be
        read/embedded or any point failed to upload.
    """
    parser = argparse.ArgumentParser(description="Daily memory backup with batch upload")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be backed up without uploading")
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE, help=f"Batch size for uploads (default: {DEFAULT_BATCH_SIZE})")
    # NOTE: --parallel is accepted and echoed for CLI compatibility, but the
    # embedding loop below is sequential and does not read it.
    parser.add_argument("--parallel", type=int, default=DEFAULT_PARALLEL, help=f"Parallel embedding generation (default: {DEFAULT_PARALLEL})")
    parser.add_argument("--force", action="store_true", help="Force re-backup of existing dates")
    args = parser.parse_args()

    # A zero step would make range() raise inside the uploader; reject it here
    # with a usage error instead of a traceback.
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    print("=== Daily Memory Backup ===")
    print(f"Time: {datetime.now().isoformat()}")
    print(f"Batch size: {args.batch_size}")
    print(f"Parallel: {args.parallel}")
    if args.dry_run:
        print("Mode: DRY RUN (no actual upload)")
    print()

    # Get existing dates to avoid duplicates
    print("Checking for existing backups...")
    existing_dates = get_existing_dates()
    print(f"Found {len(existing_dates)} existing backups")

    # Get memory files
    memory_files = get_memory_files()
    print(f"Found {len(memory_files)} memory files")

    # Filter out already backed up dates (unless force)
    files_to_backup = []
    for date_str, file_path in memory_files:
        if date_str in existing_dates and not args.force:
            print(f" ⏭️ {date_str} - Already backed up, skipping")
            continue
        files_to_backup.append((date_str, file_path))

    if not files_to_backup:
        print("\n✅ All memories already backed up (no new files)")
        return 0

    print(f"\nBacking up {len(files_to_backup)} files...")
    print()

    if args.dry_run:
        for date_str, file_path in files_to_backup:
            print(f" 📄 {date_str} - Would back up ({file_path.stat().st_size} bytes)")
        print(f"\nDry run complete. {len(files_to_backup)} files would be backed up.")
        return 0

    # Prepare all points with embeddings
    all_points = []
    failed_files = []

    print("Generating embeddings...")
    for date_str, file_path in files_to_backup:
        try:
            # Explicit UTF-8 so decoding does not depend on the locale the
            # job happens to run under.
            with open(file_path, 'r', encoding="utf-8") as f:
                content = f.read()

            print(f" 📦 {date_str} - Generating embedding...")
            point = prepare_memory_point(content, date_str)

            if point:
                all_points.append(point)
            else:
                failed_files.append(date_str)
        except Exception as e:
            print(f" ❌ {date_str} - Failed to read: {e}")
            failed_files.append(date_str)

    if not all_points:
        print("\n❌ No points to upload")
        return 1

    print(f"\nGenerated {len(all_points)} embeddings, uploading in batches of {args.batch_size}...")
    print()

    # Upload in batches
    uploaded, failed = batch_upload_points(all_points, args.batch_size)

    # Summary
    print(f"\n{'=' * 50}")
    print("SUMMARY:")
    print(f" Total files: {len(files_to_backup)}")
    print(f" Successfully embedded: {len(all_points)}")
    print(f" Successfully uploaded: {uploaded}")
    print(f" Failed to embed: {len(failed_files)}")
    print(f" Failed to upload: {failed}")

    if failed_files:
        print(f"\nFailed files: {', '.join(failed_files)}")

    # Any failure makes the run unsuccessful, even if other files went
    # through; otherwise a scheduler sees exit 0 and the missing dates go
    # unnoticed. all_points is non-empty here, so these two outcomes are
    # exhaustive.
    if failed > 0 or failed_files:
        print("\n⚠️ Backup completed with errors")
        return 1

    print("\n✅ Daily backup complete!")
    return 0
def get_content_hash(user_msg: str, ai_response: str) -> str:
    """Return the MD5 hex digest used as the deduplication key for a turn.

    Both messages are stripped of surrounding whitespace and joined with
    ``::`` before hashing, so leading/trailing whitespace differences do not
    produce a different key.
    """
    digest = hashlib.md5()
    digest.update("::".join((user_msg.strip(), ai_response.strip())).encode())
    return digest.hexdigest()
def parse_daily_log(date_str: str) -> List[Dict[str, Any]]:
    """Parse the daily memory file for ``date_str`` into conversation turns.

    The file ``<MEMORY_DIR>/<date_str>.md`` is split on ``## `` headers. For
    each section the first line is the header and the rest is the body:

    * user message: the header text after a leading ``[...]`` bracket, or the
      whole header when there is no bracket;
    * AI response: the text following a ``Kimi``/``Assistant``/``AI`` label in
      the body; otherwise everything after the body's first paragraph, or the
      whole body when it has a single paragraph.

    Sections with an empty user message or empty AI response are dropped.

    Returns:
        A list of dicts with keys ``user``, ``ai``, ``turn_number`` (1-based
        int) and ``date``; empty when the log file does not exist.
    """
    log_file = os.path.join(MEMORY_DIR, f"{date_str}.md")

    try:
        # Explicit UTF-8 so decoding does not depend on the process locale.
        with open(log_file, 'r', encoding="utf-8") as f:
            content = f.read()
    except FileNotFoundError:
        print(f"[DailyBackup] No log file found for {date_str}")
        return []

    conversations = []
    turn_number = 0

    # Split by headers (## [timestamp] ...). \A also consumes a header on the
    # very first line; with a bare r'\n##\s+' that first header kept its
    # literal "## " prefix and leaked it into the stored user message.
    sections = re.split(r'(?:\A|\n)##\s+', content)

    for section in sections:
        stripped = section.strip()
        if not stripped:
            continue

        lines = stripped.split('\n')
        header = lines[0]
        body = '\n'.join(lines[1:]).strip()

        # Extract user message from header
        user_match = re.search(r'\[.*?\]\s*(.+)', header)
        user_msg = user_match.group(1) if user_match else header

        # Extract AI response
        ai_match = re.search(
            r'(?:Kimi|Assistant|AI)[:\s]+(.+?)(?=\n##|\Z)',
            body,
            re.DOTALL | re.IGNORECASE,
        )
        if ai_match:
            ai_response = ai_match.group(1).strip()
        else:
            # No speaker label: treat the first paragraph as context and the
            # remainder as the response, or the whole body if it is a single
            # paragraph.
            paragraphs = body.split('\n\n')
            if len(paragraphs) > 1:
                ai_response = '\n\n'.join(paragraphs[1:]).strip()
            else:
                ai_response = body

        if user_msg and ai_response:
            turn_number += 1
            conversations.append({
                'user': user_msg,
                'ai': ai_response,
                'turn_number': turn_number,
                'date': date_str
            })

    return conversations
Generate embeddings + user_embedding = get_embedding(user_message) + ai_embedding = get_embedding(ai_response) + summary = f"Q: {user_message[:200]}... A: {ai_response[:300]}..." + summary_embedding = get_embedding(summary) + + if not all([user_embedding, ai_embedding, summary_embedding]): + return False + + tags = ["conversation", "daily-backup", date_str, f"user:{user_id}"] + importance = "high" if any(kw in (user_message + ai_response).lower() + for kw in ["remember", "important", "always", "never", "rule", "decision"]) else "medium" + + points = [] + + # User message + user_id_point = str(uuid.uuid4()) + points.append({ + "id": user_id_point, + "vector": user_embedding, + "payload": { + "user_id": user_id, + "text": f"[{user_id}]: {user_message}", + "date": date_str, + "tags": tags + ["user-message"], + "importance": importance, + "source": "conversation_daily_backup", + "source_type": "user", + "category": "Full Conversation", + "confidence": "high", + "verified": True, + "created_at": datetime.now().isoformat(), + "access_count": 0, + "last_accessed": datetime.now().isoformat(), + "conversation_id": conversation_id, + "turn_number": turn_number, + "content_hash": content_hash + } + }) + + # AI response + ai_id = str(uuid.uuid4()) + points.append({ + "id": ai_id, + "vector": ai_embedding, + "payload": { + "user_id": user_id, + "text": f"[Kimi]: {ai_response}", + "date": date_str, + "tags": tags + ["ai-response"], + "importance": importance, + "source": "conversation_daily_backup", + "source_type": "assistant", + "category": "Full Conversation", + "confidence": "high", + "verified": True, + "created_at": datetime.now().isoformat(), + "access_count": 0, + "last_accessed": datetime.now().isoformat(), + "conversation_id": conversation_id, + "turn_number": turn_number, + "content_hash": content_hash + } + }) + + # Summary + summary_id = str(uuid.uuid4()) + points.append({ + "id": summary_id, + "vector": summary_embedding, + "payload": { + "user_id": user_id, + 
def main():
    """Back up one day's conversation log to Qdrant and exit with a status.

    Parses the log for the requested date (default: yesterday), skips turns
    whose content hash is already stored for the user, and stores the rest.
    Exits with status 0 when no turn failed (including when no conversations
    were found) and 1 when at least one turn could not be stored.
    """
    parser = argparse.ArgumentParser(
        description="Daily conversation backup to Qdrant (Mem0-style)"
    )
    parser.add_argument(
        "date",
        nargs="?",
        help="Date to process (YYYY-MM-DD). Default: yesterday"
    )
    parser.add_argument(
        "--user-id",
        default=DEFAULT_USER_ID,
        help=f"User ID (default: {DEFAULT_USER_ID})"
    )

    args = parser.parse_args()

    if args.date:
        date_str = args.date
    else:
        yesterday = datetime.now() - timedelta(days=1)
        date_str = yesterday.strftime("%Y-%m-%d")

    user_id = args.user_id

    print(f"📅 Processing daily log for {date_str} (user: {user_id})...")

    conversations = parse_daily_log(date_str)

    if not conversations:
        print(f"⚠️ No conversations found for {date_str}")
        sys.exit(0)

    print(f"📝 Found {len(conversations)} conversation turns")

    # One conversation per user per day. Drawing a fresh uuid4 per turn gave
    # every turn its own conversation_id, so looking a conversation up by id
    # could never return more than a single turn. The id is derived
    # deterministically so that a re-run (which only stores the turns that
    # are not yet present) files them under the same conversation.
    conversation_id = str(
        uuid.uuid5(uuid.NAMESPACE_URL, f"conversation/{user_id}/{date_str}")
    )

    stored = 0
    skipped = 0
    failed = 0

    for conv in conversations:
        # Checked here (as well as inside store_conversation_turn) so that
        # duplicates are reported as skipped rather than stored.
        content_hash = get_content_hash(conv['user'], conv['ai'])
        if is_duplicate(user_id, content_hash):
            skipped += 1
            print(f" ⏭️ Turn {conv['turn_number']} skipped (duplicate)")
            continue

        success = store_conversation_turn(
            user_id=user_id,
            user_message=conv['user'],
            ai_response=conv['ai'],
            conversation_id=conversation_id,
            turn_number=conv['turn_number'],
            date_str=date_str
        )

        if success:
            stored += 1
            print(f" ✅ Turn {conv['turn_number']} stored")
        else:
            failed += 1
            print(f" ❌ Turn {conv['turn_number']} failed")

    print(f"\n{'='*50}")
    print(f"Daily backup complete for {date_str} (user: {user_id}):")
    print(f" Stored: {stored} turns ({stored * 3} embeddings)")
    print(f" Skipped: {skipped} turns (duplicates)")
    print(f" Failed: {failed} turns")

    if stored > 0:
        print(f"\n✅ Daily backup: {stored} conversations stored to Qdrant")

    sys.exit(0 if failed == 0 else 1)
def batch_get_embeddings(texts: List[str]) -> List[Optional[List[float]]]:
    """Generate embeddings for several texts with a single request.

    Each text is truncated to 8192 characters before being sent.

    Returns:
        A list with exactly ``len(texts)`` entries, where entry *i* is the
        embedding for ``texts[i]``. If the request fails, or the response
        does not contain one embedding per input, every entry is ``None``
        (the error is reported on stderr).
    """
    if not texts:
        return []

    data = json.dumps({
        "model": "snowflake-arctic-embed2",
        "input": [t[:8192] for t in texts]
    }).encode()

    req = urllib.request.Request(
        f"{OLLAMA_EMBED_URL}/embeddings",
        data=data,
        headers={"Content-Type": "application/json"}
    )

    try:
        with urllib.request.urlopen(req, timeout=120) as response:
            result = json.loads(response.read().decode())

        items = result["data"]
        # Callers zip() this list against their inputs; a short response
        # would silently drop the trailing inputs, so treat it as a failure.
        if len(items) != len(texts):
            raise ValueError(
                f"expected {len(texts)} embeddings, got {len(items)}"
            )

        # Honour the per-item "index" field when the server provides it;
        # fall back to response order when it is absent.
        ordered = sorted(
            enumerate(items),
            key=lambda pair: pair[1].get("index", pair[0]),
        )
        return [item["embedding"] for _, item in ordered]
    except Exception as e:
        print(f"Error generating batch embeddings: {e}", file=sys.stderr)
        return [None] * len(texts)
', '.\n').split('\n') if s.strip()] + for sentence in sentences: + if len(sentence) > 10: + facts.append({ + "text": f"{current_section}: {sentence[:500]}", + "tags": extract_tags(sentence, date_str), + "importance": "high" if "**" in sentence else "medium", + "source_type": "inferred", + "category": current_section + }) + else: + # Store whole paragraph as fact + facts.append({ + "text": f"{current_section}: {para[:500]}", + "tags": extract_tags(para, date_str), + "importance": "high" if "**" in para else "medium", + "source_type": "inferred", + "category": current_section + }) + + current_section_content = [] + + def extract_tags(text: str, date_str: str) -> List[str]: + """Extract relevant tags from text""" + tags = ["atomic-fact", date_str] + + # Content-based tags + text_lower = text.lower() + tag_mappings = { + "preference": "preferences", + "config": "configuration", + "hardware": "hardware", + "security": "security", + "youtube": "youtube", + "video": "video", + "workflow": "workflow", + "rule": "rules", + "critical": "critical", + "decision": "decisions", + "research": "research", + "process": "process", + "step": "steps", + } + + for keyword, tag in tag_mappings.items(): + if keyword in text_lower: + tags.append(tag) + + return tags + + for i, line in enumerate(lines): + line = line.strip() + + # Code blocks + if line.startswith('```'): + if in_code_block: + # End of code block + if code_block_content: + code_text = '\n'.join(code_block_content) + facts.append({ + "text": f"{current_section} [Code: {code_block_language}]: {code_text[:800]}", + "tags": ["code-block", "atomic-fact", date_str, code_block_language], + "importance": "medium", + "source_type": "inferred", + "category": current_section + }) + code_block_content = [] + code_block_language = "" + in_code_block = False + else: + # Start of code block + flush_section_content() + in_code_block = True + code_block_language = line[3:].strip() or "text" + continue + + if in_code_block: + 
code_block_content.append(line) + continue + + # Skip empty lines + if not line: + flush_section_content() + continue + + # Section headers (##) + if line.startswith('## '): + flush_section_content() + current_section = line[3:].strip() + facts.append({ + "text": f"Section: {current_section}", + "tags": ["section-header", "atomic-fact", date_str], + "importance": "medium", + "source_type": "inferred", + "category": current_section + }) + continue + + # Skip main title (# Title) + if line.startswith('# ') and i == 0: + continue + + # Bullet points (all levels) + if line.startswith('- ') or line.startswith('* ') or line.startswith('+ '): + flush_section_content() + fact_text = line[2:].strip() + if len(fact_text) > 3: + facts.append({ + "text": f"{current_section}: {fact_text[:500]}", + "tags": extract_tags(fact_text, date_str), + "importance": "high" if "**" in fact_text else "medium", + "source_type": "inferred", + "category": current_section + }) + continue + + # Numbered lists + if re.match(r'^\d+\.\s', line): + flush_section_content() + fact_text = re.sub(r'^\d+\.\s*', '', line) + if len(fact_text) > 3: + facts.append({ + "text": f"{current_section}: {fact_text[:500]}", + "tags": extract_tags(fact_text, date_str), + "importance": "high" if "**" in fact_text else "medium", + "source_type": "inferred", + "category": current_section + }) + continue + + # URLs / Links + url_match = re.search(r'https?://[^\s<>"\')\]]+', line) + if url_match and len(line) < 300: + facts.append({ + "text": f"{current_section}: {line[:400]}", + "tags": ["url", "link", "atomic-fact", date_str], + "importance": "medium", + "source_type": "inferred", + "category": current_section + }) + continue + + # Key-value pairs (Key: Value) + if ':' in line and len(line) < 200 and not line.startswith('**'): + key_part = line.split(':')[0].strip() + if key_part and len(key_part) < 50 and not key_part.startswith('#'): + facts.append({ + "text": f"{current_section}: {line[:400]}", + "tags": 
def check_existing_facts(date_str: str) -> set:
    """Collect text previews of the points already stored for ``date_str``.

    Scrolls the collection for points whose ``tags`` contain ``date_str`` and
    follows ``next_page_offset`` until the result is exhausted, so dates with
    more than one page (1000 points) of stored entries are fully covered.

    Returns:
        A set holding the first 100 characters of each stored point's
        ``text``. Best effort: if a request fails, a warning is written to
        stderr and the previews gathered so far (possibly none) are returned.
    """
    existing = set()
    offset = None

    try:
        while True:
            body = {
                "limit": 1000,
                "with_payload": True,
                "filter": {
                    "must": [{"key": "tags", "match": {"value": date_str}}]
                }
            }
            if offset is not None:
                body["offset"] = offset

            req = urllib.request.Request(
                f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
                data=json.dumps(body).encode(),
                headers={"Content-Type": "application/json"},
                method="POST"
            )
            with urllib.request.urlopen(req, timeout=30) as response:
                result = json.loads(response.read().decode())

            page = result.get("result") or {}
            for point in page.get("points") or []:
                # A point may come back with a null payload or without a
                # text field; skip it rather than abort the whole check.
                text = (point.get("payload") or {}).get("text")
                if isinstance(text, str):
                    # Text previews (first 100 chars) are the comparison key
                    existing.add(text[:100])

            offset = page.get("next_page_offset")
            if offset is None:
                break
    except Exception as e:
        print(f"Warning: Could not check existing facts: {e}", file=sys.stderr)

    return existing
Any]], batch_size: int = 50) -> Tuple[int, int]: + """Upload facts to Qdrant in batches""" + total = len(facts) + uploaded = 0 + failed = 0 + + for i in range(0, total, batch_size): + batch = facts[i:i + batch_size] + + # Generate embeddings for this batch + texts = [f["text"] for f in batch] + embeddings = batch_get_embeddings(texts) + + # Prepare points + points = [] + for fact, embedding in zip(batch, embeddings): + if embedding is None: + failed += 1 + continue + + point_id = str(uuid.uuid4()) + date_str = fact.get("date", datetime.now().strftime("%Y-%m-%d")) + + payload = { + "text": fact["text"], + "date": date_str, + "tags": fact.get("tags", []), + "importance": fact.get("importance", "medium"), + "source": fact.get("source", "fact-extraction"), + "source_type": fact.get("source_type", "inferred"), + "category": fact.get("category", "general"), + "confidence": fact.get("confidence", "high"), + "verified": fact.get("verified", True), + "created_at": datetime.now().isoformat(), + "access_count": 0, + "last_accessed": datetime.now().isoformat() + } + + # NOTE: Memories never expire - user requested permanent retention + # No expires_at field set = memories persist indefinitely + + points.append({ + "id": point_id, + "vector": embedding, + "payload": payload + }) + + if not points: + continue + + # Upload batch + upsert_data = {"points": points} + req = urllib.request.Request( + f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true", + data=json.dumps(upsert_data).encode(), + headers={"Content-Type": "application/json"}, + method="PUT" + ) + + try: + with urllib.request.urlopen(req, timeout=60) as response: + result = json.loads(response.read().decode()) + if result.get("status") == "ok": + uploaded += len(points) + print(f" ✅ Batch {i//batch_size + 1}: {len(points)} facts uploaded") + else: + print(f" ❌ Batch {i//batch_size + 1}: Failed") + failed += len(points) + except Exception as e: + print(f" ❌ Batch {i//batch_size + 1}: {e}", file=sys.stderr) + 
def process_single_date(date_str: str, dry_run: bool = False, batch_size: int = 50) -> Tuple[int, int]:
    """Extract the facts from one day's memory file and upload the new ones.

    Args:
        date_str: Date of the memory file, used as ``<date_str>.md`` under
            ``MEMORY_DIR`` and stamped onto every uploaded fact.
        dry_run: When true, only print what would be uploaded.
        batch_size: Passed through to ``upload_facts_batch``.

    Returns:
        ``(uploaded, failed)`` fact counts. ``(0, 0)`` when the file is
        missing, yields no facts, or has nothing new. On a dry run the first
        element is the number of facts that would be uploaded.
    """
    file_path = MEMORY_DIR / f"{date_str}.md"

    if not file_path.exists():
        print(f" ⚠️ File not found: {file_path}")
        return 0, 0

    print(f"Processing {date_str}...")

    # Explicit UTF-8 so decoding does not depend on the process locale.
    with open(file_path, 'r', encoding="utf-8") as fh:
        content = fh.read()

    # Parse into atomic facts
    facts = parse_markdown_sections(content, date_str)

    if not facts:
        print(f" ⚠️ No facts extracted from {date_str}")
        return 0, 0

    print(f" 📄 Extracted {len(facts)} atomic facts")

    # Skip facts that are already stored (matched on the 100-char preview)
    # and exact repeats within this file. Without the second check a line
    # repeated in the log was uploaded once per occurrence on the first run,
    # yet skipped entirely on every later run.
    existing = check_existing_facts(date_str)
    seen_texts = set()
    new_facts = []
    for fact in facts:
        text = fact["text"]
        if text[:100] in existing or text in seen_texts:
            continue
        seen_texts.add(text)
        new_facts.append(fact)

    skipped = len(facts) - len(new_facts)
    if skipped:
        print(f" ⏭️ Skipping {skipped} duplicates")

    if not new_facts:
        print(f" ✅ All facts already stored for {date_str}")
        return 0, 0

    print(f" 📤 Uploading {len(new_facts)} new facts...")

    if dry_run:
        print(f" [DRY RUN] Would upload {len(new_facts)} facts")
        for fact in new_facts[:3]:  # Show first 3
            print(f" - {fact['text'][:80]}...")
        if len(new_facts) > 3:
            print(f" ... and {len(new_facts) - 3} more")
        return len(new_facts), 0

    # Add date to each fact
    for fact in new_facts:
        fact["date"] = date_str

    uploaded, failed = upload_facts_batch(new_facts, batch_size)
    return uploaded, failed
print(f" Files processed: {len(dates)}") + print(f" Total uploaded: {total_uploaded}") + print(f" Total failed: {total_failed}") + + else: + # Default to today + today = datetime.now().strftime("%Y-%m-%d") + uploaded, failed = process_single_date(today, args.dry_run, args.batch_size) + print(f"\n{'=' * 50}") + print(f"Summary for {today}:") + print(f" Uploaded: {uploaded}") + print(f" Failed: {failed}") + + print() + print("✅ Fact extraction complete!") + print("\nNext steps:") + print(" - Search facts: python3 search_memories.py 'your query'") + print(" - View by date: Check Qdrant with tag filter for date") + + +if __name__ == "__main__": + main() diff --git a/skills/qdrant-memory/scripts/get_conversation_context.py b/skills/qdrant-memory/scripts/get_conversation_context.py new file mode 100755 index 0000000..23c7cb5 --- /dev/null +++ b/skills/qdrant-memory/scripts/get_conversation_context.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 +""" +Mem0-Style Conversation Retrieval - User-centric memory search + +Retrieves memories by USER, not by session/chat. +Cross-conversation search across all of Rob's memories. + +Usage: + # Search user's memories across all conversations + python3 scripts/get_conversation_context.py --user-id "rob" "what was the decision about Qdrant?" + + # Get specific conversation + python3 scripts/get_conversation_context.py --user-id "rob" --conversation-id + + # Get all conversations for user + python3 scripts/get_conversation_context.py --user-id "rob" --limit 50 + +Mem0-style: Memories belong to USER, not to session. 
def get_embedding(text: str) -> Optional[List[float]]:
    """Return the snowflake-arctic-embed2 embedding for ``text``.

    Only the first 8192 characters are sent. Returns ``None`` (after writing
    the error to stderr) when the request or response decoding fails.
    """
    request_body = {
        "model": "snowflake-arctic-embed2",
        "input": text[:8192],
    }
    request = urllib.request.Request(
        f"{OLLAMA_URL}/embeddings",
        data=json.dumps(request_body).encode(),
        headers={"Content-Type": "application/json"},
    )

    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            decoded = json.loads(response.read().decode())
            first_item = decoded["data"][0]
            return first_item["embedding"]
    except Exception as e:
        print(f"[Retrieval] Embedding error: {e}", file=sys.stderr)
        return None
def get_user_conversations(user_id: str, limit: int = 100) -> List[Dict]:
    """Fetch up to ``limit`` summary points stored for ``user_id``.

    Sends a single scroll request to ``COLLECTION_NAME`` filtered on the
    given ``user_id`` and on ``source_type == "system"`` (the summaries).
    On any error the problem is printed to stderr and ``[]`` is returned.
    """
    must_conditions = [
        {"key": "user_id", "match": {"value": user_id}},
        {"key": "source_type", "match": {"value": "system"}},  # summaries only
    ]
    body = {
        "limit": limit,
        "with_payload": True,
        "filter": {"must": must_conditions},
    }
    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            raw = response.read().decode()
        page = json.loads(raw).get("result", {})
        return page.get("points", [])
    except Exception as e:
        print(f"[Retrieval] Fetch error: {e}", file=sys.stderr)
        return []
def format_conversation(points: List[Dict]) -> str:
    """Format conversation points into a readable transcript.

    Points are ordered by ``turn_number``; within one turn, ``user`` and
    ``assistant`` points come before any other ``source_type``. A
    ``--- Turn N [date] ---`` header is emitted each time the turn number
    changes. The running turn starts at 0, so points without a turn number
    get no header. Points whose ``source`` is ``"conversation_summary"`` are
    left out.

    Args:
        points: Point dicts as returned by the search/scroll helpers. A
            missing or null ``payload``, ``turn_number`` or ``text`` is
            treated as empty / 0 / "" instead of raising.

    Returns:
        The transcript lines joined with newlines ("" when there are no
        points).
    """

    def payload_of(point: Dict) -> Dict:
        # The key can be present with a null value, so `.get(key, {})` is not enough.
        return point.get("payload") or {}

    def turn_of(payload: Dict) -> int:
        return payload.get("turn_number") or 0

    def sort_key(point: Dict):
        payload = payload_of(point)
        is_dialogue = payload.get("source_type", "") in ("user", "assistant")
        return (turn_of(payload), 0 if is_dialogue else 1)

    output = []
    current_turn = 0

    for point in sorted(points, key=sort_key):
        payload = payload_of(point)

        if payload.get("source") == "conversation_summary":
            continue

        turn = turn_of(payload)
        if turn != current_turn:
            date = payload.get("date", "unknown")
            output.append(f"\n--- Turn {turn} [{date}] ---")
            current_turn = turn

        output.append(payload.get("text") or "")

    return "\n".join(output)
--user-id is required for Mem0-style retrieval", file=sys.stderr) + sys.exit(1) + + points = [] + + if args.conversation_id: + print(f"🔍 Fetching conversation for user '{args.user_id}': {args.conversation_id}") + points = get_conversation_by_id(args.user_id, args.conversation_id, args.limit * 3) + + elif args.query: + print(f"🔍 Searching memories for user '{args.user_id}': {args.query}") + points = search_user_memories(args.user_id, args.query, args.limit) + + else: + print(f"🔍 Fetching all memories for user '{args.user_id}'") + points = get_user_conversations(args.user_id, args.limit) + + if not points: + print(f"❌ No memories found for user '{args.user_id}'") + sys.exit(1) + + if args.format == "json": + print(json.dumps(points, indent=2)) + else: + # Group by conversation_id + conversations = {} + for p in points: + convo_id = p.get("payload", {}).get("conversation_id") + if convo_id not in conversations: + conversations[convo_id] = [] + conversations[convo_id].append(p) + + for i, (convo_id, convo_points) in enumerate(conversations.items(), 1): + print(f"\n{'='*60}") + print(f"📜 Conversation {i}: {convo_id}") + print(f"{'='*60}") + print(format_conversation(convo_points)) + +if __name__ == "__main__": + main() diff --git a/skills/qdrant-memory/scripts/get_user_context.py b/skills/qdrant-memory/scripts/get_user_context.py new file mode 100755 index 0000000..3fc1704 --- /dev/null +++ b/skills/qdrant-memory/scripts/get_user_context.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +""" +Quick user context for email replies. +Returns recent memory summary, not full conversations. 
def _summarize_context_points(points) -> str:
    """Condense scrolled points into one ``[PROFILE] || [USER] || [CONTEXT]`` line."""
    profile = None
    recent_user = None
    first_context = None

    for point in points:
        # "payload" may be present but null; treat that like an empty payload.
        payload = point.get("payload") or {}
        text = payload.get("text") or ""
        source_type = payload.get("source_type", "")
        lowered = text.lower()

        if "profile" in lowered or "lives in" in lowered:
            # Every matching point overwrites the previous one: last match wins.
            profile = text[:200]
        elif source_type == "user" and not recent_user:
            recent_user = text[:150]
        elif source_type in ("assistant", "system") and first_context is None:
            # Only the first assistant/system entry is ever shown.
            first_context = text.replace("\r\n", " ").replace("\n", " ")[:150]

    parts = []
    if profile:
        parts.append(f"[PROFILE] {profile}")
    if recent_user:
        parts.append(f"[USER] {recent_user}")
    if first_context is not None:
        parts.append(f"[CONTEXT] {first_context[:100]}")
    return " || ".join(parts)


def get_user_context(user_id: str, limit: int = 5) -> str:
    """Get recent context for a user as a single formatted summary line.

    Args:
        user_id: Value matched against the ``user_id`` payload field.
        limit: Number of memories to consider. At least 10 points are always
            requested so that a profile entry has a chance to be found;
            larger values widen the scroll accordingly.

    Returns:
        A `` || ``-joined string of ``[PROFILE]``, ``[USER]`` and
        ``[CONTEXT]`` parts, or "" when nothing is found or the lookup fails.
        Failures are reported on stderr only, so stdout stays clean for
        callers that capture it.
    """
    data = json.dumps({
        "limit": max(limit, 10),  # never fewer than 10, to find the profile
        "with_payload": True,
        "filter": {
            "must": [
                {"key": "user_id", "match": {"value": user_id}}
            ]
        }
    }).encode()

    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
        data=data,
        headers={"Content-Type": "application/json"},
        method="POST"
    )

    try:
        with urllib.request.urlopen(req, timeout=10) as response:
            result = json.loads(response.read().decode())
        points = (result.get("result") or {}).get("points") or []
        if not points:
            return ""
        return _summarize_context_points(points)
    except Exception as e:
        # Best-effort lookup: report the failure but never raise.
        print(f"[UserContext] Lookup failed: {e}", file=sys.stderr)
        return ""
def get_content_hash(user_msg: str, ai_response: str) -> str:
    """Return the MD5 hex digest identifying one user/assistant exchange.

    Both messages are stripped of surrounding whitespace and joined with
    ``::`` before hashing, so exchanges that differ only in padding produce
    the same digest.
    """
    fingerprint = "::".join((user_msg.strip(), ai_response.strip()))
    digest = hashlib.md5()
    digest.update(fingerprint.encode())
    return digest.hexdigest()
def store_turn(user_id: str, user_msg: str, ai_response: str, date_str: str,
               conversation_id: str, turn_number: int, session_id: str) -> bool:
    """Store one user/assistant exchange as three points.

    The exchange is written as a user point, an assistant point and a
    summary point in a single upsert. Returns ``True`` only when the upsert
    response reports status ``"ok"``; a duplicate, a failed embedding or a
    failed upload all return ``False``.
    """
    content_hash = get_content_hash(user_msg, ai_response)
    if is_duplicate(user_id, content_hash):
        return False  # already stored

    # All three embeddings are requested before any of them is checked.
    embed_inputs = (
        f"[{user_id}]: {user_msg}",
        f"[Kimi]: {ai_response}",
        f"Q: {user_msg[:200]} A: {ai_response[:300]}",
    )
    vectors = [get_embedding(chunk) for chunk in embed_inputs]
    if not all(vectors):
        return False
    user_emb, ai_emb, summary_emb = vectors

    base_tags = ["conversation", "harvested", f"user:{user_id}", date_str]
    haystack = (user_msg + ai_response).lower()
    flagged = any(kw in haystack for kw in ("remember", "important", "always", "never", "rule"))
    importance = "high" if flagged else "medium"

    def build_point(vector, text, tag, source_type, category, **extra):
        payload = {
            "user_id": user_id,
            "text": text,
            "date": date_str,
            "tags": base_tags + [tag],
            "importance": importance,
            "source": "session_harvest",
            "source_type": source_type,
            "category": category,
            "confidence": "high",
            "conversation_id": conversation_id,
            "turn_number": turn_number,
            "session_id": session_id,
            "content_hash": content_hash,
        }
        payload.update(extra)
        return {"id": str(uuid.uuid4()), "vector": vector, "payload": payload}

    points = [
        build_point(user_emb, f"[{user_id}]: {user_msg[:2000]}",
                    "user-message", "user", "Full Conversation"),
        build_point(ai_emb, f"[Kimi]: {ai_response[:2000]}",
                    "ai-response", "assistant", "Full Conversation"),
        build_point(summary_emb,
                    f"[Turn {turn_number}] Q: {user_msg[:200]} A: {ai_response[:300]}",
                    "summary", "system", "Conversation Summary",
                    user_message=user_msg[:500], ai_response=ai_response[:800]),
    ]

    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
        data=json.dumps({"points": points}).encode(),
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            reply = json.loads(response.read().decode())
        if reply.get("status") == "ok":
            _recent_hashes.add(content_hash)
            return True
    except Exception:
        pass
    return False
def main():
    """Harvest the session files named on the command line.

    Each positional argument is resolved relative to ``SESSIONS_DIR``.
    Missing files are reported and skipped; the rest are handed to
    ``parse_and_store``. A grand total is printed at the end.
    """
    cli = argparse.ArgumentParser(description="Harvest sessions by name")
    cli.add_argument("--user-id", default="yourname")
    cli.add_argument("sessions", nargs="*", help="Session filenames to process")
    args = cli.parse_args()

    total_stored = 0
    total_skipped = 0
    for index, session_name in enumerate(args.sessions, start=1):
        session_path = SESSIONS_DIR / session_name
        if session_path.exists():
            print(f"[{index}] {session_name}")
            stored, skipped = parse_and_store(session_path, args.user_id)
            total_stored += stored
            total_skipped += skipped
            # Per-file counts are only printed when something was stored.
            if stored > 0:
                print(f" Stored: {stored}, Skipped: {skipped}")
        else:
            print(f"[{index}] Not found: {session_name}")

    print(f"\nTotal: {total_stored} stored, {total_skipped} skipped")
def is_duplicate(user_id: str, content_hash: str) -> bool:
    """Report whether ``content_hash`` is already stored for ``user_id``.

    The in-memory ``_recent_hashes`` cache is consulted first. Otherwise a
    one-point scroll filtered on ``user_id`` and ``content_hash`` is issued.
    Any error during that lookup is treated as "not a duplicate".
    """
    if content_hash in _recent_hashes:
        return True

    conditions = [
        {"key": "user_id", "match": {"value": user_id}},
        {"key": "content_hash", "match": {"value": content_hash}},
    ]
    query = {
        "filter": {"must": conditions},
        "limit": 1,
        "with_payload": False,
    }

    try:
        request = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
            data=json.dumps(query).encode(),
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=10) as response:
            reply = json.loads(response.read().decode())
        matches = reply.get("result", {}).get("points", [])
        return len(matches) > 0
    except Exception:
        return False
json.loads(response.read().decode()) + return result["data"][0]["embedding"] + except Exception as e: + print(f"[Harvest] Embedding error: {e}", file=sys.stderr) + return None + +def store_turn(user_id: str, user_msg: str, ai_response: str, + date_str: str, conversation_id: str, turn_number: int, + session_id: str, dry_run: bool = False) -> Dict: + """Store a single conversation turn to Qdrant""" + + content_hash = get_content_hash(user_msg, ai_response) + + # Check duplicate + if is_duplicate(user_id, content_hash): + return {"skipped": True, "reason": "duplicate"} + + if dry_run: + return {"skipped": False, "dry_run": True} + + # Generate embeddings + user_embedding = get_embedding(f"[{user_id}]: {user_msg}") + ai_embedding = get_embedding(f"[Kimi]: {ai_response}") + summary = f"Q: {user_msg[:200]} A: {ai_response[:300]}..." + summary_embedding = get_embedding(summary) + + if not all([user_embedding, ai_embedding, summary_embedding]): + return {"skipped": True, "reason": "embedding_failed"} + + tags = ["conversation", "harvested", f"user:{user_id}", date_str] + importance = "high" if any(kw in (user_msg + ai_response).lower() + for kw in ["remember", "important", "always", "never", "rule"]) else "medium" + + points = [] + + # User message + points.append({ + "id": str(uuid.uuid4()), + "vector": user_embedding, + "payload": { + "user_id": user_id, + "text": f"[{user_id}]: {user_msg[:2000]}", + "date": date_str, + "tags": tags + ["user-message"], + "importance": importance, + "source": "session_harvest", + "source_type": "user", + "category": "Full Conversation", + "confidence": "high", + "verified": True, + "created_at": datetime.now().isoformat(), + "conversation_id": conversation_id, + "turn_number": turn_number, + "session_id": session_id, + "content_hash": content_hash + } + }) + + # AI response + points.append({ + "id": str(uuid.uuid4()), + "vector": ai_embedding, + "payload": { + "user_id": user_id, + "text": f"[Kimi]: {ai_response[:2000]}", + "date": 
def _message_text(msg: Dict) -> str:
    """Flatten a message's ``content`` (a string or a list of parts) into one string."""
    content = msg.get('content')
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""

    pieces = []
    for item in content:
        if not isinstance(item, dict):
            continue
        if 'text' in item:
            # A null/non-string text part is skipped rather than crashing.
            if isinstance(item['text'], str):
                pieces.append(item['text'])
        elif isinstance(item.get('thinking'), str):
            pieces.append(f"[thinking: {item['thinking'][:200]}...]")
    return "".join(pieces)


def parse_session_file(filepath: Path) -> List[Dict]:
    """Parse a session JSONL file and extract conversation turns.

    Only entries with ``type == 'message'`` whose message role is ``user``
    or ``assistant`` and whose flattened content is non-empty become turns.
    Lines that are not valid JSON, or that hold valid JSON of an unexpected
    shape, are skipped individually so that one bad line does not discard
    the rest of the file.

    Args:
        filepath: Path of the ``.jsonl`` session file.

    Returns:
        A list of dicts with keys ``turn`` (1-based counter), ``role``,
        ``content`` (capped at 2000 characters), ``date`` (first 10
        characters of the entry's string timestamp, else today's date) and
        ``session`` (the file stem). Read errors are printed to stderr and
        the turns collected so far are returned.
    """
    turns = []

    try:
        # Explicit UTF-8 so parsing does not depend on the platform locale;
        # undecodable bytes are replaced instead of aborting the file.
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue

                if not isinstance(entry, dict) or entry.get('type') != 'message':
                    continue
                msg = entry.get('message')
                if not isinstance(msg, dict):
                    continue

                role = msg.get('role')
                if role not in ('user', 'assistant'):
                    continue  # also drops 'toolResult' entries

                content = _message_text(msg)
                if not content:
                    continue

                timestamp = entry.get('timestamp')
                if isinstance(timestamp, str) and timestamp:
                    date_str = timestamp[:10]
                else:
                    date_str = datetime.now().strftime("%Y-%m-%d")

                turns.append({
                    'turn': len(turns) + 1,
                    'role': role,
                    'content': content[:2000],
                    'date': date_str,
                    'session': filepath.stem
                })
    except Exception as e:
        # Best-effort harvest: report and return whatever was parsed.
        print(f"[Harvest] Error reading {filepath}: {e}", file=sys.stderr)

    return turns
len(turns): + turn = turns[j] + + if turn['role'] == 'user': + user_msg = turn['content'] + ai_response = "" + + # Look for next AI response + if j + 1 < len(turns) and turns[j + 1]['role'] == 'assistant': + ai_response = turns[j + 1]['content'] + j += 2 + else: + j += 1 + + if user_msg and ai_response: + result = store_turn( + user_id=args.user_id, + user_msg=user_msg, + ai_response=ai_response, + date_str=turn['date'], + conversation_id=conversation_id, + turn_number=turn['turn'], + session_id=turn['session'], + dry_run=args.dry_run + ) + + if result.get("skipped"): + if result.get("reason") == "duplicate": + total_skipped += 1 + else: + total_failed += 1 + else: + total_stored += 1 + if total_stored % 10 == 0: + print(f" Progress: {total_stored} stored, {total_skipped} skipped") + else: + j += 1 + + print(f"\n{'='*50}") + print(f"Harvest complete:") + print(f" Stored: {total_stored} turns ({total_stored * 3} embeddings)") + print(f" Skipped (duplicates): {total_skipped}") + print(f" Failed: {total_failed}") + + if args.dry_run: + print("\n[DRY RUN] Nothing was actually stored") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/skills/qdrant-memory/scripts/hb_check_email.py b/skills/qdrant-memory/scripts/hb_check_email.py new file mode 100755 index 0000000..f97ad12 --- /dev/null +++ b/skills/qdrant-memory/scripts/hb_check_email.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +""" +Email checker for heartbeat using Redis ID tracking. +Tracks seen email IDs in Redis to avoid missing read emails. +Stores emails to Qdrant with sender-specific user_id for memory. +Only alerts on emails from authorized senders. 
def _extract_plain_body(msg):
    """Return the message's first text/plain body, stripped and capped at 2000 chars."""
    body = ""
    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == "text/plain":
                body = part.get_content()
                break
    else:
        body = msg.get_content()

    # get_content() can return bytes for non-text payloads; only text is usable.
    if not isinstance(body, str):
        return ""
    return body.strip()[:2000]


def _authorized_user_id(from_header):
    """Map a From header to its AUTHORIZED_SENDERS user ID, or None if not listed."""
    from email.utils import parseaddr

    # Compare the parsed address exactly. A substring test on the raw header
    # would also accept the address appearing in a display name or as part of
    # a longer address.
    address = parseaddr(from_header)[1].strip().lower()
    if not address:
        return None
    for auth_email, uid in AUTHORIZED_SENDERS.items():
        if auth_email.strip().lower() == address:
            return uid
    return None


def check_emails():
    """Check the inbox for new mail and print alerts for authorized senders.

    Looks at the last 10 message IDs returned by an ``ALL`` search, skips
    those already recorded in the Redis set ``REDIS_KEY``, and records every
    newly examined ID there. For a sender listed in ``AUTHORIZED_SENDERS``
    the email is passed to ``store_email_memory`` and an ``[EMAIL]`` line
    (plus ``[CONTEXT]`` when available) is printed. At the end the set is
    trimmed to its 100 highest numeric IDs.

    Missing credentials, an unavailable Redis, or any IMAP error end the run
    silently, so the only output is the alert lines themselves.

    NOTE(review): ``IMAP4.search`` returns message sequence numbers, which a
    server renumbers after an expunge; UIDs (``IMAP4.uid``) are the stable
    identifier. Switching would change what is stored under ``REDIS_KEY``,
    so it is left as-is here - confirm whether other scripts share that key.
    """
    creds = load_credentials()
    if not creds:
        return  # Silent fail

    email_addr = creds.get("email")
    app_password = creds.get("app_password")
    if not email_addr or not app_password:
        return  # Silent fail

    r = get_redis()
    if not r:
        return  # Silent fail if Redis unavailable

    mail = None
    try:
        mail = imaplib.IMAP4_SSL(IMAP_SERVER, IMAP_PORT)
        mail.login(email_addr, app_password)
        mail.select("inbox")

        # Get ALL emails (not just unseen)
        status, messages = mail.search(None, "ALL")
        if status != "OK" or not messages[0]:
            return  # No emails

        email_ids = messages[0].split()
        seen_ids = set(r.smembers(REDIS_KEY))

        # Check last 10 emails for new ones
        for eid in email_ids[-10:]:
            eid_str = eid.decode() if isinstance(eid, bytes) else str(eid)
            if eid_str in seen_ids:
                continue

            status, msg_data = mail.fetch(eid, "(RFC822)")
            if status != "OK":
                continue

            try:
                msg = email.message_from_bytes(msg_data[0][1], policy=default)
                sender = str(msg.get("From", "")).lower()
                subject = msg.get("Subject", "")
                date = msg.get("Date", "")
                body = _extract_plain_body(msg)
            except Exception:
                # A message that cannot be parsed is marked seen and skipped;
                # otherwise it would be retried on every run and keep the
                # messages after it from ever being processed.
                r.sadd(REDIS_KEY, eid_str)
                continue

            user_id = _authorized_user_id(sender)

            # Mark as seen in Redis regardless of sender (avoid re-checking)
            r.sadd(REDIS_KEY, eid_str)

            if user_id:
                store_email_memory(user_id, sender, subject, body, date)
                context = get_user_context(user_id)
                # Output for Kimi to respond (with context hint)
                print(f"[EMAIL] User: {user_id} | From: {sender.strip()} | Subject: {subject} | Date: {date}")
                if context:
                    print(f"[CONTEXT] {context}")

        # Cleanup old IDs (keep the 100 highest numeric IDs)
        numeric_ids = sorted((x for x in r.smembers(REDIS_KEY) if x.isdigit()), key=int)
        stale = numeric_ids[:-100]
        if stale:
            r.srem(REDIS_KEY, *stale)

    except Exception:
        # Silent fail - no output
        pass
    finally:
        # Always release the IMAP connection, including on early return/error.
        if mail is not None:
            try:
                mail.logout()
            except Exception:
                pass
def search_qdrant(query, limit=3):
    """Run the search_memories script and return its results as vector hits.

    The script is invoked with ``--json``; on a zero exit code its stdout is
    parsed and every result is tagged with ``type="vector"`` and
    ``source="qdrant"``. A non-zero exit code yields ``[]``; any exception is
    reported on stderr and also yields ``[]``.
    """
    command = [
        "python3",
        f"{WORKSPACE}/skills/qdrant-memory/scripts/search_memories.py",
        query,
        "--limit", str(limit),
        "--json",
    ]
    try:
        completed = subprocess.run(command, capture_output=True, text=True, timeout=60)
        if completed.returncode != 0:
            return []

        hits = json.loads(completed.stdout)
        for hit in hits:
            hit["type"] = "vector"
            hit["source"] = "qdrant"
        return hits
    except Exception as e:
        print(f"Qdrant search failed (falling back to files only): {e}", file=sys.stderr)

    return []
found matching your query.") + sys.exit(0) + + if args.json: + print(json.dumps(all_results, indent=2)) + else: + print(f"📁 File-based results ({len(file_results)}):") + print("-" * 50) + for r in file_results: + print(f"[{r['date']}] Score: {r['score']:.2f}") + print(r['text'][:300]) + if len(r['text']) > 300: + print("...") + print() + + print(f"\n🔍 Vector (Qdrant) results ({len(vector_results)}):") + print("-" * 50) + for r in vector_results: + print(f"[{r.get('date', 'unknown')}] Score: {r.get('score', 0):.3f} [{r.get('importance', 'medium')}]") + text = r.get('text', '') + print(text[:300]) + if len(text) > 300: + print("...") + if r.get('tags'): + print(f"Tags: {', '.join(r['tags'])}") + print() diff --git a/skills/qdrant-memory/scripts/init_all_collections.py b/skills/qdrant-memory/scripts/init_all_collections.py new file mode 100755 index 0000000..1ca10df --- /dev/null +++ b/skills/qdrant-memory/scripts/init_all_collections.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 +""" +Initialize Qdrant collections for Kimi Memory System +Creates 3 collections with snowflake-arctic-embed2 (1024 dims) using Qdrant 2025 best practices: + +1. kimi_memories - Personal memories, preferences, lessons learned +2. kimi_kb - Knowledge base for web search, documents, scraped data +3. 
def make_request(url, data=None, method="GET"):
    """Build (but do not send) a ``urllib.request.Request``.

    Args:
        url: Target URL.
        data: Optional JSON-serialisable body. When given, it is encoded as
            JSON and a ``Content-Type: application/json`` header is added.
        method: HTTP method to use.

    Returns:
        The prepared ``urllib.request.Request`` object.
    """
    import urllib.request

    req = urllib.request.Request(url, method=method)
    # Compare against None so that an explicit empty body such as {} is still
    # sent instead of being dropped by a truthiness check.
    if data is not None:
        req.data = json.dumps(data).encode()
        req.add_header("Content-Type", "application/json")
    return req
def main():
    """Create (or, with ``--recreate``, rebuild) every collection in COLLECTIONS.

    Checks that Qdrant is reachable (exits 1 if not), reports whether the
    local Ollama endpoint answers, then walks COLLECTIONS. Existing
    collections are skipped unless ``--recreate`` is given; collections that
    hold points are only deleted when ``--force`` is given as well. A summary
    is printed at the end and the process exits 1 if any collection failed.
    """
    import urllib.request

    parser = argparse.ArgumentParser(description="Initialize all Qdrant collections with 2025 best practices")
    parser.add_argument("--recreate", action="store_true", help="Delete and recreate all collections")
    parser.add_argument("--force", action="store_true", help="Force recreate even with existing data")
    args = parser.parse_args()

    if args.force and not args.recreate:
        # --force only lifts the data-loss guard of --recreate; say so rather
        # than silently ignoring the flag.
        print("⚠️ --force has no effect without --recreate", file=sys.stderr)

    # Check Qdrant connection
    try:
        req = urllib.request.Request(f"{QDRANT_URL}/")
        with urllib.request.urlopen(req, timeout=3):
            pass
    except Exception as e:
        print(f"❌ Cannot connect to Qdrant at {QDRANT_URL}: {e}", file=sys.stderr)
        sys.exit(1)

    print(f"✅ Connected to Qdrant at {QDRANT_URL}\n")

    # Check if Ollama is available for embeddings (informational only)
    try:
        req = urllib.request.Request("http://localhost:11434/api/tags")
        with urllib.request.urlopen(req, timeout=3):
            ollama_status = "✅"
    except Exception:
        ollama_status = "⚠️"

    print(f"Ollama (localhost): {ollama_status} - Embeddings endpoint\n")

    created = []
    skipped = []
    errors = []
    recreated = []

    for name, config in COLLECTIONS.items():
        print(f"--- {name} ---")
        print(f" Description: {config['description']}")

        # True only when this run actually deleted a pre-existing collection;
        # this is what separates "Recreated" from "Created" in the summary.
        was_deleted = False

        if collection_exists(name):
            info = get_collection_info(name)
            if not info:
                # The point count is unknown, so never delete blindly; record
                # the collection instead of dropping it from the summary.
                print(f" ⚠️ Could not read collection info, skipping", file=sys.stderr)
                skipped.append(name)
                print()
                continue

            result = info.get("result", {})
            vectors = result.get("config", {}).get("params", {}).get("vectors", {})
            actual_size = vectors.get("size", "?")
            points = result.get("points_count", 0)
            on_disk = vectors.get("on_disk", False)

            print(f" ℹ️ Existing collection:")
            print(f" Points: {points}")
            print(f" Vector size: {actual_size}")
            print(f" On disk: {on_disk}")

            if not args.recreate:
                print(f" ⚠️ Already exists, skipping (use --recreate to update)")
                skipped.append(name)
                continue

            if points > 0 and not args.force:
                print(f" ⚠️ Collection has {points} points. Use --force to recreate with data loss.")
                skipped.append(name)
                continue

            print(f" Deleting existing collection...")
            if not delete_collection(name):
                print(f" ❌ Failed to delete", file=sys.stderr)
                errors.append(name)
                continue
            print(f" ✅ Deleted")
            was_deleted = True

        # Reaching this point means the collection does not exist (any more).
        print(f" Creating collection with 2025 best practices...")
        print(f" - on_disk=True (vectors)")
        print(f" - on_disk_payload=True")
        print(f" - Binary quantization")
        print(f" - Optimizer config")

        if create_collection(name, config["vector_size"]):
            print(f" ✅ Created (vector size: {config['vector_size']})")
            if was_deleted:
                recreated.append(name)
            else:
                created.append(name)
        else:
            print(f" ❌ Failed to create", file=sys.stderr)
            errors.append(name)
        print()

    # Summary
    print("=" * 50)
    print("SUMMARY:")
    if created:
        print(f" Created: {', '.join(created)}")
    if recreated:
        print(f" Recreated: {', '.join(recreated)}")
    if skipped:
        print(f" Skipped: {', '.join(skipped)}")
    if errors:
        print(f" Errors: {', '.join(errors)}")
        sys.exit(1)

    print("\n🎉 All collections ready with 2025 best practices!")
    print("\nCollections configured for snowflake-arctic-embed2 (1024 dims)")
    print("- kimi_memories: Personal memories (on_disk=True)")
    print("- kimi_kb: Knowledge base (on_disk=True)")
    print("- private_court_docs: Court documents (on_disk=True)")
    print("\nFeatures enabled:")
    print(" ✓ Vectors stored on disk (minimizes RAM)")
    print(" ✓ Payload stored on disk")
    print(" ✓ Binary quantization for fast search")
    print(" ✓ Optimized indexing thresholds")
def _existing_collection_stats(info):
    """Return ``(vector_size, points_count)`` from a collection-info response.

    Qdrant's collection-info response nests the vector parameters under
    ``result.config.params.vectors``. ``"?"`` is returned for the size when
    it is not present there (for example with a named-vector layout).
    """
    result = info.get("result", {})
    vectors = result.get("config", {}).get("params", {}).get("vectors", {})
    size = vectors.get("size", "?") if isinstance(vectors, dict) else "?"
    return size, result.get("points_count", 0)


if __name__ == "__main__":
    # CLI entry point: create the collection, or delete and recreate it when
    # --recreate is given. Exits 1 on any failure, 0 if it already exists.
    parser = argparse.ArgumentParser(description="Initialize kimi_kb collection")
    parser.add_argument("--recreate", action="store_true", help="Delete and recreate")
    args = parser.parse_args()

    # Fail fast when Qdrant is unreachable.
    try:
        req = make_request(f"{QDRANT_URL}/")
        with urllib.request.urlopen(req, timeout=3):
            pass
    except Exception as e:
        print(f"❌ Cannot connect to Qdrant: {e}", file=sys.stderr)
        sys.exit(1)

    print(f"✅ Qdrant: {QDRANT_URL}")
    print(f"Collection: {COLLECTION_NAME}")
    print(f"Vector size: {VECTOR_SIZE} (snowflake-arctic-embed2)\n")

    exists = collection_exists()

    if exists:
        if args.recreate:
            print(f"Deleting existing...")
            # Stop here if the delete failed: creating on top of a collection
            # that is still present would only fail with a vaguer error.
            if not delete_collection():
                print(f"❌ Failed to delete", file=sys.stderr)
                sys.exit(1)
            exists = False
        else:
            info = get_info()
            if info:
                size, points = _existing_collection_stats(info)
                print(f"⚠️ Already exists (vector size: {size}, points: {points})")
            sys.exit(0)

    if not exists:
        if create_collection():
            print(f"✅ Created {COLLECTION_NAME}")
            print(f" Vector size: {VECTOR_SIZE}, Distance: Cosine")
        else:
            print(f"❌ Failed", file=sys.stderr)
            sys.exit(1)
def _existing_collection_stats(info):
    """Return ``(vector_size, points_count)`` from a collection-info response.

    Qdrant's collection-info response nests the vector parameters under
    ``result.config.params.vectors``. ``"?"`` is returned for the size when
    it is not present there (for example with a named-vector layout).
    """
    result = info.get("result", {})
    vectors = result.get("config", {}).get("params", {}).get("vectors", {})
    size = vectors.get("size", "?") if isinstance(vectors, dict) else "?"
    return size, result.get("points_count", 0)


if __name__ == "__main__":
    # CLI entry point: create the collection, or delete and recreate it when
    # --recreate is given. Exits 1 on any failure, 0 if it already exists.
    parser = argparse.ArgumentParser(description="Initialize kimi_memories collection")
    parser.add_argument("--recreate", action="store_true", help="Delete and recreate")
    args = parser.parse_args()

    # Fail fast when Qdrant is unreachable.
    try:
        req = make_request(f"{QDRANT_URL}/")
        with urllib.request.urlopen(req, timeout=3):
            pass
    except Exception as e:
        print(f"❌ Cannot connect to Qdrant: {e}", file=sys.stderr)
        sys.exit(1)

    print(f"✅ Qdrant: {QDRANT_URL}")
    print(f"Collection: {COLLECTION_NAME}")
    print(f"Vector size: {VECTOR_SIZE} (snowflake-arctic-embed2)\n")

    exists = collection_exists()

    if exists:
        if args.recreate:
            print(f"Deleting existing...")
            # Stop here if the delete failed: creating on top of a collection
            # that is still present would only fail with a vaguer error.
            if not delete_collection():
                print(f"❌ Failed to delete", file=sys.stderr)
                sys.exit(1)
            exists = False
        else:
            info = get_info()
            if info:
                size, points = _existing_collection_stats(info)
                print(f"⚠️ Already exists (vector size: {size}, points: {points})")
            sys.exit(0)

    if not exists:
        if create_collection():
            print(f"✅ Created {COLLECTION_NAME}")
            print(f" Vector size: {VECTOR_SIZE}, Distance: Cosine")
        else:
            print(f"❌ Failed", file=sys.stderr)
            sys.exit(1)
def delete_collection():
    """Delete the collection and report whether Qdrant confirmed it.

    Returns:
        True when the DELETE request succeeds and the JSON response carries
        ``"status": "ok"``; False on any error (the error is printed to
        stderr).
    """
    req = make_request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}",
        method="DELETE"
    )

    try:
        with urllib.request.urlopen(req, timeout=5) as response:
            # Check the reported status instead of assuming that any
            # response at all means the collection is gone.
            result = json.loads(response.read().decode())
            return result.get("status") == "ok"
    except Exception as e:
        print(f"Error deleting collection: {e}", file=sys.stderr)
        return False
b/skills/qdrant-memory/scripts/js_scraper.py new file mode 100755 index 0000000..06ca6d2 --- /dev/null +++ b/skills/qdrant-memory/scripts/js_scraper.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +""" +JavaScript Scraper - Headless browser for JS-heavy sites +Uses Playwright to render dynamic content before scraping +Usage: js_scraper.py --domain "React" --path "Docs/Hooks" --wait-for "#content" +""" + +import argparse +import sys +import json +from pathlib import Path +from playwright.sync_api import sync_playwright + +sys.path.insert(0, str(Path(__file__).parent)) +from scrape_to_kb import chunk_text, get_embedding, compute_checksum, store_in_kb + +QDRANT_URL = "http://10.0.0.40:6333" +COLLECTION_NAME = "knowledge_base" + +def scrape_js_site(url, wait_for=None, wait_time=2000, scroll=False, viewport=None): + """Scrape JavaScript-rendered site using Playwright""" + + with sync_playwright() as p: + browser = p.chromium.launch(headless=True) + + context_options = {} + if viewport: + context_options["viewport"] = {"width": viewport[0], "height": viewport[1]} + + context = browser.new_context(**context_options) + page = context.new_page() + + # Set user agent + page.set_extra_http_headers({ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + }) + + try: + print(f"🌐 Loading {url}...") + page.goto(url, wait_until="networkidle", timeout=30000) + + # Wait for specific element if requested + if wait_for: + print(f"⏳ Waiting for {wait_for}...") + page.wait_for_selector(wait_for, timeout=10000) + + # Additional wait for any animations/final renders + page.wait_for_timeout(wait_time) + + # Scroll to bottom if requested (for infinite scroll pages) + if scroll: + print("📜 Scrolling...") + prev_height = 0 + while True: + page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + page.wait_for_timeout(500) + new_height = page.evaluate("document.body.scrollHeight") + if new_height == prev_height: + break + prev_height = new_height + + # Get page 
def _parse_viewport(spec):
    """Parse a ``WIDTHxHEIGHT`` string such as ``1920x1080`` into ``(w, h)``.

    Raises:
        ValueError: if ``spec`` is not two positive integers joined by "x".
    """
    parts = spec.lower().split("x")
    if len(parts) != 2:
        raise ValueError(f"expected WIDTHxHEIGHT, got {spec!r}")
    width, height = int(parts[0]), int(parts[1])
    if width <= 0 or height <= 0:
        raise ValueError(f"viewport dimensions must be positive, got {spec!r}")
    return width, height


def main():
    """Scrape one JavaScript-rendered page and store it in the knowledge base.

    Parses the command line, renders the page with ``scrape_js_site``, splits
    the text with ``chunk_text`` and stores each chunk through
    ``store_in_kb``. Exits 1 when scraping fails or when the page yields
    fewer than 200 characters of text.
    """
    from datetime import date

    parser = argparse.ArgumentParser(description="Scrape JavaScript-heavy sites")
    parser.add_argument("url", help="URL to scrape")
    parser.add_argument("--domain", required=True, help="Knowledge domain")
    parser.add_argument("--path", required=True, help="Hierarchical path")
    parser.add_argument("--wait-for", help="CSS selector to wait for")
    parser.add_argument("--wait-time", type=int, default=2000, help="Wait time in ms after load")
    parser.add_argument("--scroll", action="store_true", help="Scroll to bottom (for infinite scroll)")
    parser.add_argument("--viewport", help="Viewport size (e.g., 1920x1080)")
    parser.add_argument("--category", default="reference")
    parser.add_argument("--content-type", default="web_page")
    parser.add_argument("--subjects", help="Comma-separated subjects")
    parser.add_argument("--title", help="Override title")

    args = parser.parse_args()

    viewport = None
    if args.viewport:
        try:
            viewport = _parse_viewport(args.viewport)
        except ValueError as e:
            # Report bad input as a usage error instead of a raw traceback.
            parser.error(f"invalid --viewport: {e}")

    try:
        result = scrape_js_site(
            args.url,
            wait_for=args.wait_for,
            wait_time=args.wait_time,
            scroll=args.scroll,
            viewport=viewport
        )
    except Exception as e:
        print(f"❌ Error: {e}", file=sys.stderr)
        sys.exit(1)

    title = args.title or result["title"]
    text = result["text"]

    print(f"📄 Title: {title}")
    print(f"📝 Content: {len(text)} chars")

    if len(text) < 200:
        print("❌ Content too short", file=sys.stderr)
        sys.exit(1)

    # Add meta description if available
    if result["meta_description"]:
        text = f"Description: {result['meta_description']}\n\n{text}"

    chunks = chunk_text(text)
    print(f"🧩 Chunks: {len(chunks)}")

    subjects = [s.strip() for s in args.subjects.split(",")] if args.subjects else []
    checksum = compute_checksum(text)

    # Stamp entries with the actual scrape date (YYYY-MM-DD). A fixed literal
    # here would give every entry the same date, whatever day it was stored.
    date_added = date.today().isoformat()
    total_chunks = len(chunks)

    print("💾 Storing...")
    stored = 0
    for i, chunk in enumerate(chunks):
        chunk_metadata = {
            "domain": args.domain,
            "path": f"{args.path}/chunk-{i+1}",
            "subjects": subjects,
            "category": args.category,
            "content_type": args.content_type,
            "title": f"{title} (part {i+1}/{total_chunks})",
            "checksum": checksum,
            "source_url": result["url"],
            "date_added": date_added,
            "chunk_index": i + 1,
            "total_chunks": total_chunks,
            "text_preview": chunk[:200] + "..." if len(chunk) > 200 else chunk,
            "scraper_type": "playwright_headless",
            "rendered": True
        }

        if store_in_kb(chunk, chunk_metadata):
            stored += 1
            print(f" ✓ Chunk {i+1}")

    print(f"\n🎉 Stored {stored}/{total_chunks} chunks")
def is_outdated(entry, threshold_days, fast_moving_multiplier=0.5):
    """Decide whether a knowledge-base entry is older than its threshold.

    The entry's date comes from ``payload["date_scraped"]``, falling back to
    ``payload["date_added"]``. Entries whose domain is listed in
    FAST_MOVING_DOMAINS use ``int(threshold_days * fast_moving_multiplier)``
    as their threshold; all others use ``threshold_days``.

    Returns:
        ``(False, None)`` when the entry has no parseable date; otherwise
        ``(is_old, details)`` where ``details`` holds ``age_days``,
        ``threshold``, ``domain`` and ``date``.
    """
    meta = entry.get("payload", {})

    # Prefer the scrape date; fall back to the date the entry was added.
    raw_date = meta.get("date_scraped") or meta.get("date_added")
    parsed = parse_date(raw_date)
    if not parsed:
        # Without a date the age cannot be determined.
        return False, None

    domain = meta.get("domain", "")
    threshold = (
        int(threshold_days * fast_moving_multiplier)
        if domain in FAST_MOVING_DOMAINS
        else threshold_days
    )

    age_days = (datetime.now() - parsed).days
    details = {
        "age_days": age_days,
        "threshold": threshold,
        "domain": domain,
        "date": raw_date
    }
    return age_days > threshold, details
+ if not entries: + print("❌ No entries found") + return + + print(f" Total entries: {len(entries)}") + + # Filter by domain if specified + if args.domains: + target_domains = [d.strip() for d in args.domains.split(",")] + entries = [e for e in entries if e.get("payload", {}).get("domain") in target_domains] + print(f" Filtered to domains: {target_domains}") + elif args.fast_moving_only: + entries = [e for e in entries if e.get("payload", {}).get("domain") in FAST_MOVING_DOMAINS] + print(f" Filtered to fast-moving domains: {FAST_MOVING_DOMAINS}") + + # Check for outdated entries + outdated = [] + for entry in entries: + is_old, info = is_outdated(entry, args.days) + if is_old: + outdated.append({ + "entry": entry, + "info": info + }) + + if not outdated: + print(f"\n✅ No outdated entries found!") + return + + print(f"\n⚠️ Found {len(outdated)} outdated entries:") + print(f" (Threshold: {args.days} days, fast-moving: {int(args.days * 0.5)} days)") + + for item in outdated: + entry = item["entry"] + info = item["info"] + payload = entry.get("payload", {}) + + print(f"\n 📄 {payload.get('title', 'Untitled')}") + print(f" Domain: {info['domain']} | Age: {info['age_days']} days | Threshold: {info['threshold']} days") + print(f" Date: {info['date']}") + print(f" Path: {payload.get('path', 'N/A')}") + + if args.delete and not args.dry_run: + if delete_entry(entry.get("id")): + print(f" ✅ Deleted") + else: + print(f" ❌ Failed to delete") + elif args.dry_run: + print(f" [Would delete in non-dry-run mode]") + + # Summary + print(f"\n📊 Summary:") + print(f" Total checked: {len(entries)}") + print(f" Outdated: {len(outdated)}") + + if args.dry_run: + print(f"\n💡 Use --delete to remove these entries") + elif not args.delete: + print(f"\n💡 Use --dry-run to preview, --delete to remove") + +if __name__ == "__main__": + main() diff --git a/skills/qdrant-memory/scripts/kb_search.py b/skills/qdrant-memory/scripts/kb_search.py new file mode 100755 index 0000000..ff00599 --- /dev/null 
def search_kb(query, domain=None, limit=5):
    """Run a vector search against the knowledge-base collection.

    Args:
        query: Text to embed and search for.
        domain: Optional value; when given, only points whose ``domain``
            payload field matches it are searched.
        limit: Maximum number of points to return.

    Returns:
        The list found under ``"result"`` in the search response, or None
        when the embedding or the search request failed.
    """
    vector = get_embedding(query)
    if vector is None:
        return None

    body = {
        "vector": vector,
        "limit": limit,
        "with_payload": True,
        "with_vector": False
    }
    # Attach a filter only when a domain was requested.
    if domain:
        body["filter"] = {
            "must": [
                {"key": "domain", "match": {"value": domain}}
            ]
        }

    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points/search",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"}
    )

    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            parsed = json.loads(response.read().decode())
    except Exception as e:
        print(f"Error searching KB: {e}", file=sys.stderr)
        return None
    return parsed.get("result", [])
def main():
    """Command-line entry point: search kimi_kb and print the results.

    With ``--json`` the results are written to stdout as a JSON array (an
    empty array when nothing matched) and all status text goes to stderr, so
    the output can be piped straight into a JSON parser. Without ``--json``
    a human-readable listing is printed. Exits 1 when the search fails.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Search kimi_kb")
    parser.add_argument("query", help="Search query")
    parser.add_argument("--domain", default=None, help="Filter by domain")
    parser.add_argument("--limit", type=int, default=5, help="Number of results")
    parser.add_argument("--json", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    # In JSON mode stdout must carry nothing but the JSON document.
    status = sys.stderr if args.json else sys.stdout

    print(f"🔍 Searching kimi_kb: {args.query}", file=status)
    if args.domain:
        print(f" Filter: domain={args.domain}", file=status)
    print(file=status)

    results = search_kb(args.query, args.domain, args.limit)

    if results is None:
        print("❌ Search failed", file=sys.stderr)
        sys.exit(1)

    if args.json:
        # An empty result list is still valid JSON ("[]").
        print(json.dumps(results, indent=2))
        return

    if not results:
        print("No results found in kimi_kb")
        return

    print(f"Found {len(results)} results:\n")
    for i, point in enumerate(results, 1):
        print(format_result(point, i))
"X" --url "https://example.com" --source "docs.site" + + Batch mode: + python3 kb_store.py --batch-file entries.json --batch-size 100 + +Features: + - Single or batch upload + - Duplicate detection by title/URL + - Domain categorization + - Access tracking +""" + +import argparse +import json +import os +import sys +import urllib.request +import urllib.error +import uuid +from datetime import datetime +from pathlib import Path +from typing import List, Optional, Dict, Any + +QDRANT_URL = "http://10.0.0.40:6333" +COLLECTION = "kimi_kb" +OLLAMA_URL = "http://localhost:11434/v1" +DEFAULT_BATCH_SIZE = 100 + + +def check_existing(title: str = None, url: str = None) -> tuple: + """Check if entry already exists by title or URL""" + try: + # Check by URL first if provided + if url: + scroll_data = json.dumps({ + "limit": 10, + "with_payload": True, + "filter": {"must": [{"key": "url", "match": {"value": url}}]} + }).encode() + req = urllib.request.Request( + f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll", + data=scroll_data, + headers={"Content-Type": "application/json"}, + method="POST" + ) + with urllib.request.urlopen(req, timeout=10) as response: + result = json.loads(response.read().decode()) + points = result.get("result", {}).get("points", []) + if points: + return points[0]["id"], "url" + + # Check by title + if title: + scroll_data = json.dumps({ + "limit": 10, + "with_payload": True, + "filter": {"must": [{"key": "title", "match": {"value": title}}]} + }).encode() + req = urllib.request.Request( + f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll", + data=scroll_data, + headers={"Content-Type": "application/json"}, + method="POST" + ) + with urllib.request.urlopen(req, timeout=10) as response: + result = json.loads(response.read().decode()) + points = result.get("result", {}).get("points", []) + if points: + return points[0]["id"], "title" + except Exception as e: + print(f"Warning: Could not check existing: {e}", file=sys.stderr) + return None, None 
+ + +def get_embedding(text: str) -> Optional[List[float]]: + """Generate embedding using snowflake-arctic-embed2""" + data = json.dumps({ + "model": "snowflake-arctic-embed2", + "input": text[:8192] + }).encode() + + req = urllib.request.Request( + f"{OLLAMA_URL}/embeddings", + data=data, + headers={"Content-Type": "application/json"} + ) + + try: + with urllib.request.urlopen(req, timeout=60) as response: + result = json.loads(response.read().decode()) + return result["data"][0]["embedding"] + except Exception as e: + print(f"Error generating embedding: {e}", file=sys.stderr) + return None + + +def batch_upload_embeddings(texts: List[str]) -> List[Optional[List[float]]]: + """Generate embeddings for multiple texts in batch""" + if not texts: + return [] + + data = json.dumps({ + "model": "snowflake-arctic-embed2", + "input": [t[:8192] for t in texts] + }).encode() + + req = urllib.request.Request( + f"{OLLAMA_URL}/embeddings", + data=data, + headers={"Content-Type": "application/json"} + ) + + try: + with urllib.request.urlopen(req, timeout=120) as response: + result = json.loads(response.read().decode()) + return [d["embedding"] for d in result["data"]] + except Exception as e: + print(f"Error generating batch embeddings: {e}", file=sys.stderr) + return [None] * len(texts) + + +def upload_points_batch(points: List[Dict[str, Any]], batch_size: int = DEFAULT_BATCH_SIZE) -> tuple: + """Upload points in batches to Qdrant""" + total = len(points) + uploaded = 0 + failed = 0 + + for i in range(0, total, batch_size): + batch = points[i:i + batch_size] + + upsert_data = {"points": batch} + + req = urllib.request.Request( + f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true", + data=json.dumps(upsert_data).encode(), + headers={"Content-Type": "application/json"}, + method="PUT" + ) + + try: + with urllib.request.urlopen(req, timeout=60) as response: + result = json.loads(response.read().decode()) + if result.get("status") == "ok": + uploaded += len(batch) + 
print(f" ✅ Uploaded batch {i//batch_size + 1}: {len(batch)} points") + else: + print(f" ❌ Batch {i//batch_size + 1} failed: {result}") + failed += len(batch) + except Exception as e: + print(f" ❌ Batch {i//batch_size + 1} error: {e}", file=sys.stderr) + failed += len(batch) + + return uploaded, failed + + +def store_single( + text: str, + embedding: List[float], + title: str = None, + url: str = None, + source: str = None, + domain: str = "general", + tags: List[str] = None, + content_type: str = "document", + replace: bool = False +) -> bool: + """Store single KB entry""" + + # Check for existing entry + existing_id, match_type = check_existing(title=title, url=url) + if existing_id: + if not replace: + print(f"⚠️ Entry '{title}' already exists (matched by {match_type}, ID: {existing_id})") + print(f" Use --replace to overwrite") + return False + + point_id = existing_id if existing_id else str(uuid.uuid4()) + + payload = { + "text": text, + "title": title or "Untitled", + "url": url or "", + "source": source or "manual", + "domain": domain or "general", + "tags": tags or [], + "content_type": content_type, + "date": datetime.now().strftime("%Y-%m-%d"), + "created_at": datetime.now().isoformat(), + "access_count": 0 + } + + point = { + "points": [{ + "id": point_id, + "vector": embedding, + "payload": payload + }] + } + + data = json.dumps(point).encode() + req = urllib.request.Request( + f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true", + data=data, + headers={"Content-Type": "application/json"}, + method="PUT" + ) + + try: + with urllib.request.urlopen(req, timeout=30) as response: + result = json.loads(response.read().decode()) + return result.get("status") == "ok" + except Exception as e: + print(f"Error storing to KB: {e}", file=sys.stderr) + return False + + +def store_batch( + entries: List[Dict[str, Any]], + batch_size: int = DEFAULT_BATCH_SIZE, + check_duplicates: bool = True +) -> tuple: + """Store multiple KB entries in batch with optional 
duplicate checking""" + if not entries: + return 0, 0 + + print(f"Processing {len(entries)} entries...") + + # Filter duplicates if requested + entries_to_process = [] + duplicates = 0 + + if check_duplicates: + for entry in entries: + existing_id, match_type = check_existing( + title=entry.get("title"), + url=entry.get("url") + ) + if existing_id: + print(f" ⏭️ Skipping duplicate: {entry.get('title', 'Untitled')} ({match_type})") + duplicates += 1 + else: + entries_to_process.append(entry) + else: + entries_to_process = entries + + if not entries_to_process: + print(f"All {len(entries)} entries already exist") + return 0, 0 + + print(f"Generating embeddings for {len(entries_to_process)} entries...") + texts = [e["content"] for e in entries_to_process] + embeddings = batch_upload_embeddings(texts) + + # Prepare points + points = [] + failed_embeddings = 0 + + for entry, embedding in zip(entries_to_process, embeddings): + if embedding is None: + failed_embeddings += 1 + continue + + point_id = str(uuid.uuid4()) + + payload = { + "text": entry["content"], + "title": entry.get("title", "Untitled"), + "url": entry.get("url", ""), + "source": entry.get("source", "manual"), + "domain": entry.get("domain", "general"), + "tags": entry.get("tags", []), + "content_type": entry.get("type", "document"), + "date": datetime.now().strftime("%Y-%m-%d"), + "created_at": datetime.now().isoformat(), + "access_count": 0 + } + + points.append({ + "id": point_id, + "vector": embedding, + "payload": payload + }) + + if not points: + return 0, failed_embeddings + duplicates + + # Upload in batches + print(f"Uploading {len(points)} entries in batches of {batch_size}...") + uploaded, failed_upload = upload_points_batch(points, batch_size) + + return uploaded, failed_embeddings + failed_upload + duplicates + + +def main(): + parser = argparse.ArgumentParser(description="Store content to kimi_kb") + parser.add_argument("content", nargs="?", help="Content to store") + 
parser.add_argument("--title", default=None, help="Title of the content") + parser.add_argument("--url", default=None, help="Source URL if from web") + parser.add_argument("--source", default=None, help="Source name") + parser.add_argument("--domain", default="general", help="Domain/category") + parser.add_argument("--tags", default=None, help="Comma-separated tags") + parser.add_argument("--type", default="document", choices=["document", "web", "code", "note"], + help="Content type") + parser.add_argument("--replace", action="store_true", help="Replace existing entry") + parser.add_argument("--batch-file", help="JSON file with multiple entries") + parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE, help=f"Batch size") + parser.add_argument("--no-check-duplicates", action="store_true", help="Skip duplicate checking in batch mode") + + args = parser.parse_args() + + # Batch mode + if args.batch_file: + print(f"Batch mode: Loading entries from {args.batch_file}") + try: + with open(args.batch_file, 'r') as f: + entries = json.load(f) + + if not isinstance(entries, list): + print("Batch file must contain a JSON array", file=sys.stderr) + sys.exit(1) + + print(f"Loaded {len(entries)} entries") + uploaded, failed = store_batch( + entries, + args.batch_size, + check_duplicates=not args.no_check_duplicates + ) + + print(f"\n{'=' * 50}") + print(f"Batch complete: {uploaded} uploaded, {failed} failed") + sys.exit(0 if failed == 0 else 1) + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + # Single entry mode + if not args.content: + print("Error: Provide content or use --batch-file", file=sys.stderr) + parser.print_help() + sys.exit(1) + + tags = [t.strip() for t in args.tags.split(",")] if args.tags else [] + + print(f"Generating embedding...") + embedding = get_embedding(args.content) + + if embedding is None: + print("❌ Failed to generate embedding") + sys.exit(1) + + print(f"Storing to kimi_kb: {args.title or 
'Untitled'}...") + + if store_single( + text=args.content, + embedding=embedding, + title=args.title, + url=args.url, + source=args.source, + domain=args.domain, + tags=tags, + content_type=args.type, + replace=args.replace + ): + print(f"✅ Stored to kimi_kb ({args.domain})") + else: + print("❌ Failed to store") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/skills/qdrant-memory/scripts/llm_router.py b/skills/qdrant-memory/scripts/llm_router.py new file mode 100644 index 0000000..aa73a5d --- /dev/null +++ b/skills/qdrant-memory/scripts/llm_router.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +"""LLM Router for cheap metadata + compaction. + +Goal: +- Prefer Minimax m2.5 for tagging + compaction. +- Fallback to Gemini Flash (or any other OpenRouter model) if Minimax fails. + +This uses OpenRouter's OpenAI-compatible API. + +Env: + OPENROUTER_API_KEY (required) + OPENROUTER_BASE_URL default: https://openrouter.ai/api/v1 + LLM_PRIMARY_MODEL default: openrouter/minimax/minimax-m2.5 + LLM_FALLBACK_MODEL default: openrouter/google/gemini-2.5-flash + LLM_TIMEOUT default: 60 + +Notes: +- We keep this dependency-light (urllib only). +- We request strict JSON when asked. 
+""" + +import json +import os +import sys +import urllib.request + +BASE_URL = os.getenv("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1").rstrip("/") +API_KEY = os.getenv("OPENROUTER_API_KEY", "") +PRIMARY_MODEL = os.getenv("LLM_PRIMARY_MODEL", "openrouter/minimax/minimax-m2.5") +FALLBACK_MODEL = os.getenv("LLM_FALLBACK_MODEL", "openrouter/google/gemini-2.5-flash") +TIMEOUT = int(os.getenv("LLM_TIMEOUT", "60")) + + +def _post_chat(model: str, messages, response_format=None, temperature=0.2): + if not API_KEY: + raise RuntimeError("OPENROUTER_API_KEY is required") + + body = { + "model": model, + "messages": messages, + "temperature": temperature, + } + if response_format: + body["response_format"] = response_format + + req = urllib.request.Request( + f"{BASE_URL}/chat/completions", + data=json.dumps(body).encode("utf-8"), + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {API_KEY}", + }, + ) + + with urllib.request.urlopen(req, timeout=TIMEOUT) as r: + return json.loads(r.read().decode("utf-8")) + + +def chat_json(system: str, user: str) -> dict: + """Return parsed JSON object. Try primary then fallback.""" + messages = [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ] + + last_err = None + for model in (PRIMARY_MODEL, FALLBACK_MODEL): + try: + resp = _post_chat(model, messages, response_format={"type": "json_object"}, temperature=0.2) + content = resp["choices"][0]["message"]["content"] + return json.loads(content) + except Exception as e: + last_err = e + continue + + raise RuntimeError(f"LLM failed on both primary and fallback: {last_err}") + + +def chat_text(system: str, user: str) -> str: + """Return text. 
Try primary then fallback.""" + messages = [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ] + + last_err = None + for model in (PRIMARY_MODEL, FALLBACK_MODEL): + try: + resp = _post_chat(model, messages, response_format=None, temperature=0.2) + return resp["choices"][0]["message"]["content"] + except Exception as e: + last_err = e + continue + + raise RuntimeError(f"LLM failed on both primary and fallback: {last_err}") + + +if __name__ == "__main__": + # tiny self-test + if len(sys.argv) > 1 and sys.argv[1] == "--ping": + out = chat_json("Return JSON with key ok=true", "ping") + print(json.dumps(out)) diff --git a/skills/qdrant-memory/scripts/log_activity.py b/skills/qdrant-memory/scripts/log_activity.py new file mode 100755 index 0000000..4486325 --- /dev/null +++ b/skills/qdrant-memory/scripts/log_activity.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +""" +Convenience wrapper for activity logging +Add to your scripts: from log_activity import log_done, check_other_agent +""" + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from activity_log import log_activity, check_for_duplicates, get_recent_activities + +AGENT_NAME = "Kimi" # Change to "Max" on that instance + +def log_done(action_type: str, description: str, files=None, status="completed"): + """ + Quick log of completed work + + Example: + log_done("cron_created", "Set up daily OpenClaw repo monitoring", + files=["/path/to/script.py"]) + """ + activity_id = log_activity( + agent=AGENT_NAME, + action_type=action_type, + description=description, + affected_files=files or [], + status=status + ) + print(f"[ActivityLog] Logged: {action_type} → {activity_id[:8]}...") + return activity_id + +def check_other_agent(action_type: str, keywords: str, hours: int = 6) -> bool: + """ + Check if Max (or Kimi) already did this recently + + Example: + if check_other_agent("cron_created", "openclaw repo monitoring"): + print("Max already set this 
up!") + return + """ + other_agent = "Max" if AGENT_NAME == "Kimi" else "Kimi" + + recent = get_recent_activities(agent=other_agent, action_type=action_type, hours=hours) + + keywords_lower = keywords.lower().split() + for activity in recent: + desc = activity.get("description", "").lower() + if all(kw in desc for kw in keywords_lower): + print(f"[ActivityLog] ⚠️ {other_agent} already did this!") + print(f" When: {activity['timestamp'][:19]}") + print(f" What: {activity['description']}") + return True + + return False + +def show_recent_collaboration(hours: int = 24): + """Show what both agents have been up to""" + activities = get_recent_activities(hours=hours, limit=50) + + print(f"\n[ActivityLog] Both agents' work (last {hours}h):\n") + for a in activities: + agent = a['agent'] + icon = "🤖" if agent == "Max" else "🎙️" + print(f"{icon} [{a['timestamp'][11:19]}] {agent}: {a['action_type']}") + print(f" {a['description']}") + +if __name__ == "__main__": + # Quick test + print(f"Agent: {AGENT_NAME}") + print("Functions available:") + print(" log_done(action_type, description, files=[], status='completed')") + print(" check_other_agent(action_type, keywords, hours=6)") + print(" show_recent_collaboration(hours=24)") + print() + print("Recent activity:") + show_recent_collaboration(hours=24) diff --git a/skills/qdrant-memory/scripts/metadata_and_compact.py b/skills/qdrant-memory/scripts/metadata_and_compact.py new file mode 100644 index 0000000..bca9811 --- /dev/null +++ b/skills/qdrant-memory/scripts/metadata_and_compact.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +"""Metadata + Compaction pipeline. + +This script is designed to be run on a schedule (cron). It will: +1) Detect if anything new exists in Redis buffer since last run. 
+2) If new content exists, generate: + - title + - tags + - entities + - category + - compact summary + using a cheap LLM (Minimax m2.5) with fallback (Gemini Flash) +3) Store the metadata + summary into Qdrant as a single point (collection: kimi_kb by default) + while leaving raw transcripts in files/Redis. + +It is intentionally conservative: if nothing new, it exits quickly. + +Env: + REDIS_HOST/REDIS_PORT + QDRANT_URL + QDRANT_META_COLLECTION (default: kimi_kb) + OPENROUTER_API_KEY (required for LLM) + LLM_PRIMARY_MODEL / LLM_FALLBACK_MODEL + +Usage: + python3 metadata_and_compact.py --user-id michael + python3 metadata_and_compact.py --user-id michael --max-items 200 + +""" + +import argparse +import json +import os +import sys +import uuid +from datetime import datetime + +import redis + +from llm_router import chat_json + +REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1") +REDIS_PORT = int(os.getenv("REDIS_PORT", "6379")) + +QDRANT_URL = os.getenv("QDRANT_URL", "http://127.0.0.1:6333").rstrip("/") +META_COLLECTION = os.getenv("QDRANT_META_COLLECTION", "kimi_kb") + +STATE_DIR = os.getenv("MEMORY_STATE_DIR", os.path.join(os.path.expanduser("~"), ".openclaw", "memory_state")) + +SYSTEM_PROMPT = ( + "You are a metadata extractor and compactor for conversation logs. " + "Return STRICT JSON with keys: title (string), category (string), " + "tags (array of short lowercase hyphenated strings), entities (array of strings), " + "summary (string, <= 1200 chars). " + "Prefer 6-14 tags. Tags should be searchable facets (client/project/infra/topic)." 
+) + + +def _state_path(user_id: str) -> str: + os.makedirs(STATE_DIR, exist_ok=True) + return os.path.join(STATE_DIR, f"meta_state_{user_id}.json") + + +def load_state(user_id: str) -> dict: + p = _state_path(user_id) + if not os.path.exists(p): + return {"last_redis_len": 0, "updated_at": None} + try: + with open(p, "r") as f: + return json.load(f) + except Exception: + return {"last_redis_len": 0, "updated_at": None} + + +def save_state(user_id: str, st: dict) -> None: + p = _state_path(user_id) + st["updated_at"] = datetime.utcnow().isoformat() + "Z" + with open(p, "w") as f: + json.dump(st, f, indent=2, sort_keys=True) + + +def redis_get_new_items(user_id: str, max_items: int, last_len: int): + r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True) + key = f"mem:{user_id}" + cur_len = r.llen(key) + if cur_len <= last_len: + return [], cur_len + + # Only grab the delta (best effort). Our list is chronological if RPUSH is used. + start = last_len + end = min(cur_len - 1, last_len + max_items - 1) + items = r.lrange(key, start, end) + turns = [] + for it in items: + try: + turns.append(json.loads(it)) + except Exception: + continue + return turns, cur_len + + +def qdrant_upsert(point_id: str, vector, payload: dict): + body = {"points": [{"id": point_id, "vector": vector, "payload": payload}]} + import urllib.request + + req = urllib.request.Request( + f"{QDRANT_URL}/collections/{META_COLLECTION}/points?wait=true", + data=json.dumps(body).encode("utf-8"), + headers={"Content-Type": "application/json"}, + method="PUT", + ) + with urllib.request.urlopen(req, timeout=15) as resp: + out = json.loads(resp.read().decode("utf-8")) + return out.get("status") == "ok" + + +def ollama_embed(text: str): + # Uses the same Ollama embed endpoint as auto_store + import urllib.request + + ollama_url = os.getenv("OLLAMA_URL", "http://127.0.0.1:11434/v1") + data = json.dumps({"model": "snowflake-arctic-embed2", "input": text[:8192]}).encode("utf-8") + req = 
urllib.request.Request( + f"{ollama_url}/embeddings", + data=data, + headers={"Content-Type": "application/json"}, + ) + with urllib.request.urlopen(req, timeout=60) as resp: + out = json.loads(resp.read().decode("utf-8")) + return out["data"][0]["embedding"] + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--user-id", required=True) + ap.add_argument("--max-items", type=int, default=200) + args = ap.parse_args() + + st = load_state(args.user_id) + last_len = int(st.get("last_redis_len", 0)) + + turns, cur_len = redis_get_new_items(args.user_id, args.max_items, last_len) + if not turns: + print("No new turns; skipping") + return + + # Build compact source text + lines = [] + for t in turns: + role = t.get("role", "") + content = t.get("content", "") + if not content: + continue + lines.append(f"{role.upper()}: {content}") + source_text = "\n".join(lines) + + meta = chat_json(SYSTEM_PROMPT, source_text[:24000]) + + # basic validation + for k in ("title", "category", "tags", "entities", "summary"): + if k not in meta: + raise SystemExit(f"Missing key in meta: {k}") + + summary = str(meta.get("summary", ""))[:2000] + emb = ollama_embed(summary) + + payload = { + "user_id": args.user_id, + "title": str(meta.get("title", ""))[:200], + "category": str(meta.get("category", ""))[:120], + "tags": meta.get("tags", [])[:30], + "entities": meta.get("entities", [])[:30], + "summary": summary, + "source": "redis_delta", + "created_at": datetime.utcnow().isoformat() + "Z", + "redis_range": {"from": last_len, "to": cur_len - 1}, + } + + ok = qdrant_upsert(str(uuid.uuid4()), emb, payload) + if not ok: + raise SystemExit("Failed to upsert metadata point") + + st["last_redis_len"] = cur_len + save_state(args.user_id, st) + + print(f"Stored metadata point for {args.user_id} (redis {last_len}->{cur_len})") + + +if __name__ == "__main__": + main() diff --git a/skills/qdrant-memory/scripts/migrate_qd_snowflake.py b/skills/qdrant-memory/scripts/migrate_qd_snowflake.py 
new file mode 100755 index 0000000..be3a16e --- /dev/null +++ b/skills/qdrant-memory/scripts/migrate_qd_snowflake.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +""" +Migrate Qdrant_Documents to 1024D vectors (snowflake-arctic-embed2) - BATCH VERSION +""" + +import json +import sys +import urllib.request +import uuid +from datetime import datetime + +QDRANT_URL = "http://10.0.0.40:6333" +COLLECTION = "Qdrant_Documents" +OLLAMA_URL = "http://localhost:11434/v1" +EXPORT_FILE = "/tmp/qd_export.json" +BATCH_SIZE = 50 + +def get_embeddings_batch(texts): + """Generate embeddings in batch using snowflake-arctic-embed2""" + # Truncate each text + truncated = [t[:8000] for t in texts] + data = json.dumps({ + "model": "snowflake-arctic-embed2", + "input": truncated + }).encode() + req = urllib.request.Request( + f"{OLLAMA_URL}/embeddings", + data=data, + headers={"Content-Type": "application/json"} + ) + try: + with urllib.request.urlopen(req, timeout=180) as r: + result = json.loads(r.read().decode()) + return [item["embedding"] for item in result["data"]] + except Exception as e: + print(f"Batch embed error: {e}", file=sys.stderr) + return None + +def make_request(url, data=None, method="GET"): + req = urllib.request.Request(url, method=method) + if data: + req.data = json.dumps(data).encode() + req.add_header("Content-Type", "application/json") + return req + +def delete_collection(): + print(f"Deleting {COLLECTION}...") + req = make_request(f"{QDRANT_URL}/collections/{COLLECTION}", method="DELETE") + try: + with urllib.request.urlopen(req, timeout=10) as r: + print(f"✅ Deleted") + except Exception as e: + print(f"Delete error: {e}") + +def create_collection(): + print(f"Creating {COLLECTION} with 1024D vectors...") + config = { + "vectors": { + "size": 1024, + "distance": "Cosine" + } + } + req = make_request(f"{QDRANT_URL}/collections/{COLLECTION}", data=config, method="PUT") + try: + with urllib.request.urlopen(req, timeout=30) as r: + result = 
json.loads(r.read().decode()) + if result.get("result") == True: + print(f"✅ Created (1024D, Cosine)") + else: + print(f"❌ Failed: {result}") + sys.exit(1) + except Exception as e: + print(f"❌ Create error: {e}") + sys.exit(1) + +def upsert_batch(points): + """Upsert batch of points""" + data = json.dumps({"points": points}).encode() + req = urllib.request.Request( + f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true", + data=data, + headers={"Content-Type": "application/json"}, + method="PUT" + ) + try: + with urllib.request.urlopen(req, timeout=60) as r: + return json.loads(r.read().decode()).get("status") == "ok" + except Exception as e: + print(f"Upsert error: {e}", file=sys.stderr) + return False + +# Load exported docs +print(f"Loading {EXPORT_FILE}...") +with open(EXPORT_FILE, 'r') as f: + docs = json.load(f) +print(f"Loaded {len(docs)} documents\n") + +# Delete and recreate +delete_collection() +create_collection() +print() + +# Process in batches +print(f"Re-embedding with snowflake-arctic-embed2 (batch={BATCH_SIZE})...\n") +success = 0 +failed = 0 +total_batches = (len(docs) + BATCH_SIZE - 1) // BATCH_SIZE + +for batch_num in range(total_batches): + start = batch_num * BATCH_SIZE + end = min(start + BATCH_SIZE, len(docs)) + batch_docs = docs[start:end] + + print(f"Batch {batch_num + 1}/{total_batches} ({start}-{end})...", end=" ", flush=True) + + # Get texts for embedding + texts = [d.get("payload", {}).get("text", "") for d in batch_docs] + + # Get embeddings + embeddings = get_embeddings_batch(texts) + if not embeddings: + print(f"❌ embed failed") + failed += len(batch_docs) + continue + + # Build points + points = [] + for doc, emb in zip(batch_docs, embeddings): + points.append({ + "id": doc.get("id", str(uuid.uuid4())), + "vector": emb, + "payload": doc.get("payload", {}) + }) + + # Upsert + if upsert_batch(points): + success += len(batch_docs) + print(f"✅") + else: + failed += len(batch_docs) + print(f"❌") + +print() +print("=" * 50) 
+print(f"MIGRATION COMPLETE") +print(f" Success: {success}") +print(f" Failed: {failed}") +print(f" Total: {len(docs)}") +print("=" * 50) + +# Verify +req = make_request(f"{QDRANT_URL}/collections/{COLLECTION}") +with urllib.request.urlopen(req, timeout=5) as r: + info = json.loads(r.read().decode())["result"] + print(f"\n📚 {COLLECTION}") + print(f" Points: {info['points_count']:,}") + print(f" Vector size: {info['config']['params']['vectors']['size']}") + print(f" Distance: {info['config']['params']['vectors']['distance']}") diff --git a/skills/qdrant-memory/scripts/monitor_ollama_models.py b/skills/qdrant-memory/scripts/monitor_ollama_models.py new file mode 100755 index 0000000..1e2d0b1 --- /dev/null +++ b/skills/qdrant-memory/scripts/monitor_ollama_models.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 +""" +Monitor Ollama model library for 100B+ parameter models +Only outputs/announces when there are significant new large models. +Always exits with code 0 to prevent "exec failed" logs. 
+Usage: monitor_ollama_models.py [--json] +""" + +import argparse +import sys +import json +import urllib.request +import re +import hashlib +from datetime import datetime + +QDRANT_URL = "http://10.0.0.40:6333" +KB_COLLECTION = "knowledge_base" +OLLAMA_LIBRARY_URL = "https://ollama.com/library" + +LARGE_MODEL_TAGS = ["100b", "120b", "200b", "400b", "70b", "8x7b", "8x22b"] +GOOD_FOR_OPENCLAW = ["code", "coding", "instruct", "chat", "reasoning", "llama", "qwen", "mistral", "deepseek", "gemma", "mixtral"] + +def fetch_library(): + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'} + req = urllib.request.Request(OLLAMA_LIBRARY_URL, headers=headers) + try: + with urllib.request.urlopen(req, timeout=20) as response: + return response.read().decode('utf-8', errors='ignore') + except: + return None + +def extract_models(html): + models = [] + model_blocks = re.findall(r']*href="/library/([^"]+)"[^>]*>(.*?)', html, re.DOTALL) + + for model_name, block in model_blocks[:50]: + model_info = { + "name": model_name, "url": f"https://ollama.com/library/{model_name}", + "is_large": False, "is_new": False, "tags": [], "description": "" + } + + tag_matches = re.findall(r']*>([^<]+(?:b|B))', block) + model_info["tags"] = [t.lower() for t in tag_matches] + + for tag in model_info["tags"]: + if any(large_tag in tag for large_tag in LARGE_MODEL_TAGS): + if "70b" in tag and not ("8x" in model_name.lower() or "mixtral" in model_name.lower()): + continue + model_info["is_large"] = True + break + + desc_match = re.search(r']*>([^<]+)

', block) + if desc_match: + model_info["description"] = desc_match.group(1).strip() + + updated_match = re.search(r'(\d+)\s+(hours?|days?)\s+ago', block, re.IGNORECASE) + if updated_match: + num = int(updated_match.group(1)) + unit = updated_match.group(2).lower() + if (unit.startswith("hour") and num <= 24) or (unit.startswith("day") and num <= 2): + model_info["is_new"] = True + + desc_lower = model_info["description"].lower() + name_lower = model_name.lower() + model_info["good_for_openclaw"] = any(kw in desc_lower or kw in name_lower for kw in GOOD_FOR_OPENCLAW) + + models.append(model_info) + return models + +def get_embedding(text): + data = {"model": "nomic-embed-text", "input": text[:500]} + req = urllib.request.Request("http://localhost:11434/api/embed", + data=json.dumps(data).encode(), + headers={"Content-Type": "application/json"}, method="POST") + try: + with urllib.request.urlopen(req, timeout=30) as response: + result = json.loads(response.read().decode()) + return result.get("embeddings", [None])[0] + except: + return None + +def search_kb_for_model(model_name): + url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points/scroll" + data = {"limit": 100, "with_payload": True, "filter": {"must": [ + {"key": "domain", "match": {"value": "AI/LLM"}}, + {"key": "path", "match": {"text": model_name}} + ]}} + req = urllib.request.Request(url, data=json.dumps(data).encode(), + headers={"Content-Type": "application/json"}, method="POST") + try: + with urllib.request.urlopen(req, timeout=10) as response: + result = json.loads(response.read().decode()) + return result.get("result", {}).get("points", []) + except: + return [] + +def store_model(model_info): + import uuid + text = f"{model_info['name']}: {model_info['description']}\nTags: {', '.join(model_info['tags'])}" + embedding = get_embedding(text) + if not embedding: + return False + + metadata = { + "domain": "AI/LLM", "path": f"AI/LLM/Ollama/Models/{model_info['name']}", + "subjects": ["ollama", "models", 
"llm", "100b+"] + model_info['tags'], + "category": "reference", "content_type": "web_page", + "title": f"Ollama Model: {model_info['name']}", "source_url": model_info['url'], + "date_added": datetime.now().strftime("%Y-%m-%d"), "date_scraped": datetime.now().isoformat(), + "model_tags": model_info['tags'], "is_large": model_info['is_large'], "is_new": model_info['is_new'], + "text_preview": text[:300] + } + + point = {"id": str(uuid.uuid4()), "vector": embedding, "payload": metadata} + url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points" + req = urllib.request.Request(url, data=json.dumps({"points": [point]}).encode(), + headers={"Content-Type": "application/json"}, method="PUT") + try: + with urllib.request.urlopen(req, timeout=10) as response: + result = json.loads(response.read().decode()) + return result.get("status") == "ok" + except: + return False + +def evaluate_candidate(model_info): + score = 0 + reasons = [] + + if not model_info["is_large"]: + return {"is_candidate": False, "score": 0, "reasons": []} + + score += 5 + reasons.append("🦣 100B+ parameters") + + if model_info.get("good_for_openclaw"): + score += 2 + reasons.append("✨ Good for OpenClaw") + + if model_info["is_new"]: + score += 2 + reasons.append("🆕 Recently updated") + + return {"is_candidate": score >= 5, "score": score, "reasons": reasons} + +def format_notification(candidates): + lines = ["🤖 New Large Model Alert (100B+)", f"📅 {datetime.now().strftime('%Y-%m-%d')}", ""] + lines.append(f"📊 {len(candidates)} new large model(s) found:") + lines.append("") + + for model in candidates[:5]: + eval_info = model["evaluation"] + lines.append(f"• {model['name']}") + lines.append(f" {model['description'][:60]}...") + lines.append(f" Tags: {', '.join(model['tags'][:3])}") + for reason in eval_info["reasons"]: + lines.append(f" {reason}") + lines.append(f" 🔗 {model['url']}") + lines.append("") + + lines.append("💡 Potential gpt-oss:120b replacement") + return "\n".join(lines) + +def main(): + 
parser = argparse.ArgumentParser() + parser.add_argument("--json", action="store_true") + args = parser.parse_args() + + html = fetch_library() + if not html: + if args.json: + print("{}") + sys.exit(0) # Silent fail with exit 0 + + models = extract_models(html) + large_models = [m for m in models if m["is_large"]] + + candidates = [] + + for model in large_models: + existing = search_kb_for_model(model["name"]) + is_new_to_kb = len(existing) == 0 + + evaluation = evaluate_candidate(model) + model["evaluation"] = evaluation + + if is_new_to_kb: + store_model(model) + + if evaluation["is_candidate"] and is_new_to_kb: + candidates.append(model) + + # Output results + if args.json: + if candidates: + print(json.dumps({"candidates": candidates, "notification": format_notification(candidates)})) + else: + print("{}") + elif candidates: + print(format_notification(candidates)) + # No output if no candidates (silent) + + # Always exit 0 to prevent "exec failed" logs + sys.exit(0) + +if __name__ == "__main__": + main() diff --git a/skills/qdrant-memory/scripts/monitor_openclaw_repo.py b/skills/qdrant-memory/scripts/monitor_openclaw_repo.py new file mode 100755 index 0000000..0c160a8 --- /dev/null +++ b/skills/qdrant-memory/scripts/monitor_openclaw_repo.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +""" +Monitor OpenClaw GitHub repo for relevant updates +Only outputs/announces when there are significant changes affecting our setup. +Always exits with code 0 to prevent "exec failed" logs. 
+Usage: monitor_openclaw_repo.py [--json] +""" + +import argparse +import sys +import json +import urllib.request +import re +import hashlib +from datetime import datetime + +QDRANT_URL = "http://10.0.0.40:6333" +KB_COLLECTION = "knowledge_base" + +# Keywords that indicate relevance to our setup +RELEVANT_KEYWORDS = [ + "ollama", "model", "embedding", "llm", "ai", + "telegram", "webchat", "signal", "discord", + "skill", "skills", "qdrant", "memory", "search", + "whisper", "tts", "voice", "cron", + "gateway", "agent", "session", "vector", + "browser", "exec", "read", "edit", "write", + "breaking", "deprecated", "removed", "changed", + "fix", "bug", "patch", "security", "vulnerability" +] + +HIGH_PRIORITY_AREAS = [ + "ollama", "telegram", "qdrant", "memory", "skills", + "voice", "cron", "gateway", "browser" +] + +def fetch_github_api(url): + headers = { + 'User-Agent': 'OpenClaw-KB-Monitor', + 'Accept': 'application/vnd.github.v3+json' + } + req = urllib.request.Request(url, headers=headers) + try: + with urllib.request.urlopen(req, timeout=20) as response: + return json.loads(response.read().decode()) + except Exception as e: + return None + +def fetch_github_html(url): + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'} + req = urllib.request.Request(url, headers=headers) + try: + with urllib.request.urlopen(req, timeout=20) as response: + html = response.read().decode('utf-8', errors='ignore') + text = re.sub(r']*>.*?', ' ', html, flags=re.DOTALL | re.IGNORECASE) + text = re.sub(r']*>.*?', ' ', text, flags=re.DOTALL | re.IGNORECASE) + text = re.sub(r'<[^>]+>', ' ', text) + text = re.sub(r'\s+', ' ', text).strip() + return text[:5000] + except: + return None + +def get_embedding(text): + import json as jsonlib + data = {"model": "nomic-embed-text", "input": text[:1000]} + req = urllib.request.Request( + "http://localhost:11434/api/embed", + data=jsonlib.dumps(data).encode(), + headers={"Content-Type": "application/json"}, + method="POST" + ) + 
try: + with urllib.request.urlopen(req, timeout=30) as response: + result = jsonlib.loads(response.read().decode()) + return result.get("embeddings", [None])[0] + except: + return None + +def search_kb_by_path(path_prefix): + url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points/scroll" + data = {"limit": 100, "with_payload": True} + req = urllib.request.Request(url, data=json.dumps(data).encode(), + headers={"Content-Type": "application/json"}, method="POST") + try: + with urllib.request.urlopen(req, timeout=10) as response: + result = json.loads(response.read().decode()) + points = result.get("result", {}).get("points", []) + return [p for p in points if p.get("payload", {}).get("path", "").startswith(path_prefix)] + except: + return [] + +def store_in_kb(text, metadata): + import uuid + embedding = get_embedding(text) + if not embedding: + return None + metadata["checksum"] = f"sha256:{hashlib.sha256(text.encode()).hexdigest()[:16]}" + metadata["date_scraped"] = datetime.now().isoformat() + metadata["text_preview"] = text[:300] + "..." 
if len(text) > 300 else text + point = {"id": str(uuid.uuid4()), "vector": embedding, "payload": metadata} + url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points" + req = urllib.request.Request(url, data=json.dumps({"points": [point]}).encode(), + headers={"Content-Type": "application/json"}, method="PUT") + try: + with urllib.request.urlopen(req, timeout=10) as response: + result = json.loads(response.read().decode()) + return result.get("status") == "ok" + except: + return False + +def delete_kb_entry(entry_id): + url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points/delete" + data = {"points": [entry_id]} + req = urllib.request.Request(url, data=json.dumps(data).encode(), + headers={"Content-Type": "application/json"}, method="POST") + try: + with urllib.request.urlopen(req, timeout=10) as response: + result = json.loads(response.read().decode()) + return result.get("status") == "ok" + except: + return False + +def is_relevant_change(text): + text_lower = text.lower() + found_keywords = [kw for kw in RELEVANT_KEYWORDS if kw in text_lower] + high_priority_found = [area for area in HIGH_PRIORITY_AREAS if area in text_lower] + return { + "relevant": len(found_keywords) > 0, + "keywords": found_keywords, + "high_priority": high_priority_found, + "score": len(found_keywords) + (len(high_priority_found) * 2) + } + +def evaluate_significance(changes): + total_score = sum(c["analysis"]["score"] for c in changes) + high_priority_count = sum(len(c["analysis"]["high_priority"]) for c in changes) + return { + "significant": total_score >= 3 or high_priority_count > 0, + "total_score": total_score, + "high_priority_count": high_priority_count + } + +def format_summary(changes, significance): + lines = ["📊 OpenClaw Repo Update", f"📅 {datetime.now().strftime('%Y-%m-%d')}", ""] + by_section = {} + for change in changes: + section = change["section"] + if section not in by_section: + by_section[section] = [] + by_section[section].append(change) + + for section, items in 
by_section.items(): + lines.append(f"📁 {section}") + for item in items[:3]: + title = item["title"][:50] + "..." if len(item["title"]) > 50 else item["title"] + lines.append(f" • {title}") + if item["analysis"]["high_priority"]: + lines.append(f" ⚠️ Affects: {', '.join(item['analysis']['high_priority'][:2])}") + if len(items) > 3: + lines.append(f" ... and {len(items) - 3} more") + lines.append("") + return "\n".join(lines) + +def scrape_all_sections(): + sections = [] + main_text = fetch_github_html("https://github.com/openclaw/openclaw") + if main_text: + sections.append({"section": "Main Repo", "title": "openclaw/openclaw README", + "url": "https://github.com/openclaw/openclaw", "content": main_text}) + + releases = fetch_github_api("https://api.github.com/repos/openclaw/openclaw/releases?per_page=5") + if releases: + for release in releases: + sections.append({"section": "Release", "title": release.get("name", release.get("tag_name", "Unknown")), + "url": release.get("html_url", ""), "content": release.get("body", "")[:2000], + "published": release.get("published_at", "")}) + + issues = fetch_github_api("https://api.github.com/repos/openclaw/openclaw/issues?state=open&per_page=5") + if issues: + for issue in issues: + if "pull_request" not in issue: + sections.append({"section": "Issue", "title": issue.get("title", "Unknown"), + "url": issue.get("html_url", ""), "content": issue.get("body", "")[:1500] if issue.get("body") else "No description", + "labels": [l.get("name", "") for l in issue.get("labels", [])]}) + return sections + +def check_and_update(): + sections = scrape_all_sections() + if not sections: + return None, "No data scraped" + + existing_entries = search_kb_by_path("OpenClaw/GitHub") + existing_checksums = {e.get("payload", {}).get("checksum", ""): e for e in existing_entries} + changes_detected = [] + + for section in sections: + content = section["content"] + if not content: + continue + checksum = 
f"sha256:{hashlib.sha256(content.encode()).hexdigest()[:16]}" + if checksum in existing_checksums: + continue + + analysis = is_relevant_change(content + " " + section["title"]) + section["analysis"] = analysis + section["checksum"] = checksum + changes_detected.append(section) + + for old_checksum, old_entry in existing_checksums.items(): + if old_entry.get("payload", {}).get("title", "") == section["title"]: + delete_kb_entry(old_entry.get("id")) + break + + metadata = { + "domain": "OpenClaw", "path": f"OpenClaw/GitHub/{section['section']}/{section['title'][:30]}", + "subjects": ["openclaw", "github", section['section'].lower()], "category": "reference", + "content_type": "web_page", "title": section["title"], "source_url": section["url"], + "date_added": datetime.now().strftime("%Y-%m-%d") + } + store_in_kb(content, metadata) + + if changes_detected: + significance = evaluate_significance(changes_detected) + if significance["significant"]: + return {"changes": changes_detected, "significance": significance, + "summary": format_summary(changes_detected, significance)}, None + else: + return None, "Changes not significant" + return None, "No changes detected" + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--json", action="store_true") + args = parser.parse_args() + + result, reason = check_and_update() + + # Always output JSON for cron compatibility, even if empty + if args.json: + print(json.dumps(result if result else {})) + elif result: + print(result["summary"]) + # If no result, output nothing (silent) + + # Always exit 0 to prevent "exec failed" logs + sys.exit(0) + +if __name__ == "__main__": + main() diff --git a/skills/qdrant-memory/scripts/notify_check.py b/skills/qdrant-memory/scripts/notify_check.py new file mode 100755 index 0000000..21b83a2 --- /dev/null +++ b/skills/qdrant-memory/scripts/notify_check.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +""" +Lightweight notification checker for agent messages +Cron job: Check 
Redis stream hourly, notify if new messages +""" + +import json +import redis +import os +from datetime import datetime, timezone + +REDIS_HOST = "10.0.0.36" +REDIS_PORT = 6379 +STREAM_NAME = "agent-messages" +LAST_NOTIFIED_KEY = "agent:notifications:last_id" + +# Simple stdout notification (OpenClaw captures stdout for alerts) +def notify(messages): + if not messages: + return + + other_agent = messages[0].get("agent", "Agent") + count = len(messages) + + # Single line notification - minimal tokens + print(f"📨 {other_agent}: {count} new message(s) in agent-messages") + + # Optional: preview first message (uncomment if wanted) + # if messages: + # preview = messages[0].get("message", "")[:50] + # print(f" Latest: {preview}...") + +def check_notifications(): + r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True) + + # Get last position we notified about + last_id = r.get(LAST_NOTIFIED_KEY) or "0" + + # Read new messages since last notification + result = r.xread({STREAM_NAME: last_id}, block=100, count=100) + + if not result: + return # No new messages, silent exit + + messages = [] + new_last_id = last_id + + for stream_name, entries in result: + for msg_id, data in entries: + messages.append(data) + new_last_id = msg_id + + if messages: + # Filter out our own messages (don't notify about messages we sent) + my_agent = os.environ.get("AGENT_NAME", "Kimi") # Set in cron env + other_messages = [m for m in messages if m.get("agent") != my_agent] + + if other_messages: + notify(other_messages) + + # Update last notified position regardless + r.set(LAST_NOTIFIED_KEY, new_last_id) + +if __name__ == "__main__": + check_notifications() diff --git a/skills/qdrant-memory/scripts/q_save.py b/skills/qdrant-memory/scripts/q_save.py new file mode 100755 index 0000000..2115bd8 --- /dev/null +++ b/skills/qdrant-memory/scripts/q_save.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +""" +Q Save - Trigger conversation storage (Mem0-style) + +Usage: + q_save.py 
--user-id "rob" "User message" "AI response" [--turn N] + +Called when user says "save q" or "q save" to immediately +store the current conversation to Qdrant. + +Mem0-style: user_id is REQUIRED and persistent across all chats. +""" + +import argparse +import subprocess +import sys +from pathlib import Path + +SCRIPT_DIR = Path(__file__).parent.resolve() +BACKGROUND_STORE = SCRIPT_DIR / "background_store.py" + +def q_save( + user_id: str, + user_message: str, + ai_response: str, + turn: int = None +): + """Save conversation to Qdrant (background, zero delay)""" + + cmd = [ + sys.executable, + str(BACKGROUND_STORE), + user_message, + ai_response, + "--user-id", user_id + ] + + if turn: + cmd.extend(["--turn", str(turn)]) + + # Fire and forget + subprocess.Popen( + cmd, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + start_new_session=True + ) + + return True + +def main(): + parser = argparse.ArgumentParser( + description='Q Save - Mem0-style trigger (user-centric)' + ) + parser.add_argument("--user-id", required=True, + help="REQUIRED: User ID (e.g., 'rob')") + parser.add_argument("user_message", help="User's message") + parser.add_argument("ai_response", help="AI's response") + parser.add_argument("--turn", type=int, help="Turn number") + + args = parser.parse_args() + + if q_save(args.user_id, args.user_message, args.ai_response, args.turn): + print(f"✅ Saved for user '{args.user_id}'") + else: + print("❌ Failed to save", file=sys.stderr) + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/skills/qdrant-memory/scripts/qd.py b/skills/qdrant-memory/scripts/qd.py new file mode 100755 index 0000000..ebda4a1 --- /dev/null +++ b/skills/qdrant-memory/scripts/qd.py @@ -0,0 +1,427 @@ +#!/usr/bin/env python3 +""" +Qdrant_Documents - Complete management script +Usage: qd.py [options] + +Commands: + list - List collection info and stats + search - Search documents + store - Store new document + delete - Delete document by ID + export - Export all 
documents to JSON + import - Import documents from JSON + count - Get total document count + tags - List unique tags +""" + +import argparse +import json +import sys +import urllib.request +import uuid +from datetime import datetime + +QDRANT_URL = "http://10.0.0.40:6333" +COLLECTION = "Qdrant_Documents" +OLLAMA_URL = "http://localhost:11434/v1" + +# ============================================================================ +# UTILITIES +# ============================================================================ + +def get_embedding(text, model="nomic-embed-text"): + """Generate embedding using Ollama""" + data = json.dumps({"model": model, "input": text[:8000]}).encode() + req = urllib.request.Request( + f"{OLLAMA_URL}/embeddings", + data=data, + headers={"Content-Type": "application/json"} + ) + try: + with urllib.request.urlopen(req, timeout=60) as r: + return json.loads(r.read().decode())["data"][0]["embedding"] + except Exception as e: + print(f"Embedding error: {e}", file=sys.stderr) + return None + +def make_request(url, data=None, method="GET"): + """Make HTTP request""" + req = urllib.request.Request(url, method=method) + if data: + req.data = json.dumps(data).encode() + req.add_header("Content-Type", "application/json") + return req + +def check_collection(): + """Verify collection exists""" + try: + req = make_request(f"{QDRANT_URL}/collections/{COLLECTION}") + with urllib.request.urlopen(req, timeout=5) as r: + return r.read() + except: + return None + +# ============================================================================ +# COMMANDS +# ============================================================================ + +def cmd_list(args): + """List collection info""" + data = check_collection() + if not data: + print(f"❌ Collection '{COLLECTION}' not found") + sys.exit(1) + + info = json.loads(data.decode())["result"] + + print(f"\n📚 Collection: {COLLECTION}") + print(f" Status: {info['status']}") + print(f" Points: {info['points_count']:,}") 
+ print(f" Vectors: {info['indexed_vectors_count']:,}") + print(f" Segments: {info['segments_count']}") + print(f" Vector size: {info['config']['params']['vectors']['size']}") + print(f" Distance: {info['config']['params']['vectors']['distance']}") + print(f" Optimizer: {info['optimizer_status']}") + print() + + # Show payload schema + print("📋 Payload Schema:") + for field, schema in info.get("payload_schema", {}).items(): + if isinstance(schema, dict) and "data_type" in schema: + print(f" - {field}: {schema['data_type']} ({schema.get('points',0):,} points)") + print() + +def cmd_count(args): + """Get document count""" + req = make_request(f"{QDRANT_URL}/collections/{COLLECTION}") + with urllib.request.urlopen(req, timeout=5) as r: + count = json.loads(r.read().decode())["result"]["points_count"] + print(f"{count}") + +def cmd_search(args): + """Search documents""" + embedding = get_embedding(args.query) + if not embedding: + print("❌ Failed to generate embedding") + sys.exit(1) + + search_body = { + "vector": embedding, + "limit": args.limit, + "with_payload": True, + "with_vector": False + } + + if args.tag: + search_body["filter"] = {"must": [{"key": "tag", "match": {"value": args.tag}}]} + + data = json.dumps(search_body).encode() + req = urllib.request.Request( + f"{QDRANT_URL}/collections/{COLLECTION}/points/search", + data=data, + headers={"Content-Type": "application/json"} + ) + + try: + with urllib.request.urlopen(req, timeout=30) as r: + results = json.loads(r.read().decode())["result"] + except Exception as e: + print(f"❌ Search failed: {e}") + sys.exit(1) + + if not results: + print("No results found") + return + + print(f"Found {len(results)} results:\n") + for i, r in enumerate(results, 1): + p = r.get("payload", {}) + print(f"[{i}] Score: {r['score']:.3f}") + print(f" Tags: {p.get('tag', 'none')}") + text = p.get('text', '')[:args.chars] + if len(p.get('text', '')) > args.chars: + text += "..." 
+ print(f" Text: {text}") + print() + +def cmd_store(args): + """Store a document""" + # Read from file or use text argument + if args.file: + with open(args.file, 'r') as f: + text = f.read() + else: + text = args.text + + if not text: + print("❌ No text to store") + sys.exit(1) + + embedding = get_embedding(text) + if not embedding: + print("❌ Failed to generate embedding") + sys.exit(1) + + # Parse tags + tags = args.tag.split(",") if args.tag else [] + sections = args.section.split(",") if args.section else [] + + point = { + "points": [{ + "id": str(uuid.uuid4()), + "vector": embedding, + "payload": { + "text": text, + "tag": tags, + "sections": sections, + "date": datetime.now().strftime("%Y-%m-%d"), + "created_at": datetime.now().isoformat() + } + }] + } + + data = json.dumps(point).encode() + req = urllib.request.Request( + f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true", + data=data, + headers={"Content-Type": "application/json"}, + method="PUT" + ) + + try: + with urllib.request.urlopen(req, timeout=30) as r: + result = json.loads(r.read().decode()) + if result.get("status") == "ok": + print(f"✅ Stored document ({len(text)} chars, {len(embedding)}D vector)") + else: + print(f"❌ Store failed: {result}") + sys.exit(1) + except Exception as e: + print(f"❌ Store error: {e}") + sys.exit(1) + +def cmd_delete(args): + """Delete a document by ID""" + req = make_request( + f"{QDRANT_URL}/collections/{COLLECTION}/points/{args.id}", + method="DELETE" + ) + + try: + with urllib.request.urlopen(req, timeout=10) as r: + print(f"✅ Deleted point {args.id}") + except Exception as e: + print(f"❌ Delete error: {e}") + sys.exit(1) + +def cmd_export(args): + """Export all documents to JSON""" + print(f"Exporting {COLLECTION}...", file=sys.stderr) + + # Get all points + all_points = [] + offset = None + + while True: + scroll_body = {"limit": 100, "with_payload": True, "with_vector": False} + if offset: + scroll_body["offset"] = offset + + req = 
urllib.request.Request( + f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll", + data=json.dumps(scroll_body).encode(), + headers={"Content-Type": "application/json"} + ) + + try: + with urllib.request.urlopen(req, timeout=30) as r: + result = json.loads(r.read().decode()) + points = result.get("result", {}).get("points", []) + if not points: + break + all_points.extend(points) + offset = result.get("result", {}).get("next_page_offset") + if not offset: + break + except Exception as e: + print(f"❌ Export error: {e}") + sys.exit(1) + + # Format output + output = [] + for p in all_points: + output.append({ + "id": p["id"], + "payload": p.get("payload", {}) + }) + + if args.output: + with open(args.output, 'w') as f: + json.dump(output, f, indent=2) + print(f"✅ Exported {len(output)} documents to {args.output}") + else: + print(json.dumps(output, indent=2)) + +def cmd_import(args): + """Import documents from JSON""" + with open(args.file, 'r') as f: + documents = json.load(f) + + print(f"Importing {len(documents)} documents...") + + success = 0 + for doc in documents: + text = doc.get("payload", {}).get("text", "") + if not text: + continue + + embedding = get_embedding(text) + if not embedding: + print(f" ⚠️ Skipping {doc.get('id')}: embedding failed") + continue + + point = { + "points": [{ + "id": doc.get("id", str(uuid.uuid4())), + "vector": embedding, + "payload": doc.get("payload", {}) + }] + } + + data = json.dumps(point).encode() + req = urllib.request.Request( + f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true", + data=data, + headers={"Content-Type": "application/json"}, + method="PUT" + ) + + try: + with urllib.request.urlopen(req, timeout=30) as r: + if json.loads(r.read().decode()).get("status") == "ok": + success += 1 + except: + pass + + print(f"✅ Imported {success}/{len(documents)} documents") + +def cmd_tags(args): + """List unique tags""" + # Use scroll to get all tags + all_tags = set() + offset = None + + while True: + scroll_body = 
{"limit": 100, "with_payload": True, "with_vector": False} + if offset: + scroll_body["offset"] = offset + + req = urllib.request.Request( + f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll", + data=json.dumps(scroll_body).encode(), + headers={"Content-Type": "application/json"} + ) + + try: + with urllib.request.urlopen(req, timeout=30) as r: + result = json.loads(r.read().decode()) + points = result.get("result", {}).get("points", []) + if not points: + break + for p in points: + tags = p.get("payload", {}).get("tag", []) + if isinstance(tags, list): + all_tags.update(tags) + elif tags: + all_tags.add(tags) + offset = result.get("result", {}).get("next_page_offset") + if not offset: + break + except Exception as e: + print(f"❌ Error: {e}") + sys.exit(1) + + print(f"\n🏷️ Unique tags ({len(all_tags)}):") + for tag in sorted(all_tags): + print(f" - {tag}") + print() + +# ============================================================================ +# MAIN +# ============================================================================ + +def main(): + parser = argparse.ArgumentParser( + description=f"Qdrant_Documents management ({COLLECTION})", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + qd.py list # Show collection stats + qd.py search "docker volumes" # Search documents + qd.py search "query" --tag kubernetes # Filter by tag + qd.py store "text here" --tag "docker" # Store document + qd.py store --file README.md --tag "doc" + qd.py export --output backup.json # Export all + qd.py tags # List all tags + """ + ) + + subparsers = parser.add_subparsers(dest="cmd", required=True) + + # list + subparsers.add_parser("list", help="Show collection info") + + # count + subparsers.add_parser("count", help="Get document count") + + # search + p_search = subparsers.add_parser("search", help="Search documents") + p_search.add_argument("query", help="Search query") + p_search.add_argument("--tag", help="Filter by tag") + 
p_search.add_argument("--limit", type=int, default=5) + p_search.add_argument("--chars", type=int, default=200) + + # store + p_store = subparsers.add_parser("store", help="Store document") + p_store.add_argument("text", nargs="?", help="Text to store") + p_store.add_argument("--file", help="Read from file") + p_store.add_argument("--tag", help="Comma-separated tags") + p_store.add_argument("--section", help="Comma-separated sections", default="") + + # delete + p_delete = subparsers.add_parser("delete", help="Delete by ID") + p_delete.add_argument("id", help="Point ID to delete") + + # export + p_export = subparsers.add_parser("export", help="Export to JSON") + p_export.add_argument("--output", "-o", help="Output file") + + # import + p_import = subparsers.add_parser("import", help="Import from JSON") + p_import.add_argument("file", help="JSON file to import") + + # tags + subparsers.add_parser("tags", help="List unique tags") + + args = parser.parse_args() + + # Run command + if args.cmd == "list": + cmd_list(args) + elif args.cmd == "count": + cmd_count(args) + elif args.cmd == "search": + cmd_search(args) + elif args.cmd == "store": + cmd_store(args) + elif args.cmd == "delete": + cmd_delete(args) + elif args.cmd == "export": + cmd_export(args) + elif args.cmd == "import": + cmd_import(args) + elif args.cmd == "tags": + cmd_tags(args) + +if __name__ == "__main__": + main() diff --git a/skills/qdrant-memory/scripts/scrape_to_kb.py b/skills/qdrant-memory/scripts/scrape_to_kb.py new file mode 100755 index 0000000..f9852bb --- /dev/null +++ b/skills/qdrant-memory/scripts/scrape_to_kb.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +""" +Scrape web content and store in knowledge_base collection +Usage: scrape_to_kb.py [--title "Title"] [--subjects "a,b,c"] +""" + +import argparse +import sys +import re +import hashlib +import urllib.request +import urllib.error +from html import unescape + +QDRANT_URL = "http://10.0.0.40:6333" +COLLECTION_NAME = "knowledge_base" 
+OLLAMA_EMBED_URL = "http://localhost:11434/api/embed" + +def fetch_url(url): + """Fetch URL content""" + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' + } + req = urllib.request.Request(url, headers=headers) + try: + with urllib.request.urlopen(req, timeout=30) as response: + return response.read().decode('utf-8', errors='ignore') + except Exception as e: + print(f"❌ Error fetching {url}: {e}", file=sys.stderr) + return None + +def extract_text(html): + """Extract clean text from HTML""" + # Remove script and style tags + html = re.sub(r']*>.*?', ' ', html, flags=re.DOTALL | re.IGNORECASE) + html = re.sub(r']*>.*?', ' ', html, flags=re.DOTALL | re.IGNORECASE) + + # Extract title + title_match = re.search(r']*>([^<]*)', html, re.IGNORECASE) + title = title_match.group(1).strip() if title_match else "Untitled" + title = unescape(title) + + # Remove nav/header/footer common patterns + html = re.sub(r']*>.*?', ' ', html, flags=re.DOTALL | re.IGNORECASE) + html = re.sub(r']*>.*?', ' ', html, flags=re.DOTALL | re.IGNORECASE) + html = re.sub(r']*>.*?', ' ', html, flags=re.DOTALL | re.IGNORECASE) + + # Convert common block elements to newlines + html = re.sub(r'', '\n', html, flags=re.IGNORECASE) + html = re.sub(r'', '\n', html, flags=re.IGNORECASE) + + # Remove all remaining tags + text = re.sub(r'<[^>]+>', ' ', html) + + # Clean up whitespace + text = unescape(text) + text = re.sub(r'\n\s*\n', '\n\n', text) + text = re.sub(r'[ \t]+', ' ', text) + text = '\n'.join(line.strip() for line in text.split('\n')) + text = '\n'.join(line for line in text.split('\n') if line) + + return title, text + +def chunk_text(text, max_chars=2000, overlap=200): + """Split text into overlapping chunks""" + chunks = [] + start = 0 + + while start < len(text): + end = start + max_chars + + # Try to break at sentence or paragraph + if end < len(text): + # Look for paragraph break + para_break = text.rfind('\n\n', start, end) + if para_break > start 
+ 500: + end = para_break + else: + # Look for sentence break + sent_break = max( + text.rfind('. ', start, end), + text.rfind('? ', start, end), + text.rfind('! ', start, end) + ) + if sent_break > start + 500: + end = sent_break + 1 + + chunk = text[start:end].strip() + if len(chunk) > 100: # Skip tiny chunks + chunks.append(chunk) + + start = end - overlap + if start >= len(text): + break + + return chunks + +def get_embedding(text): + """Generate embedding via Ollama""" + import json + data = { + "model": "nomic-embed-text", + "input": text + } + req = urllib.request.Request( + OLLAMA_EMBED_URL, + data=json.dumps(data).encode(), + headers={"Content-Type": "application/json"}, + method="POST" + ) + try: + with urllib.request.urlopen(req, timeout=60) as response: + result = json.loads(response.read().decode()) + return result.get("embeddings", [None])[0] + except Exception as e: + print(f"❌ Error generating embedding: {e}", file=sys.stderr) + return None + +def compute_checksum(text): + """Compute SHA256 checksum""" + return f"sha256:{hashlib.sha256(text.encode()).hexdigest()}" + +def store_in_kb(text, metadata): + """Store chunk in knowledge_base""" + import json + import uuid + + embedding = get_embedding(text) + if not embedding: + return False + + point = { + "id": str(uuid.uuid4()), + "vector": embedding, + "payload": metadata + } + + url = f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points" + req = urllib.request.Request( + url, + data=json.dumps({"points": [point]}).encode(), + headers={"Content-Type": "application/json"}, + method="PUT" + ) + + try: + with urllib.request.urlopen(req, timeout=10) as response: + result = json.loads(response.read().decode()) + return result.get("status") == "ok" + except Exception as e: + print(f"❌ Error storing: {e}", file=sys.stderr) + return False + +def main(): + parser = argparse.ArgumentParser(description="Scrape URL to knowledge base") + parser.add_argument("url", help="URL to scrape") + parser.add_argument("domain", 
help="Knowledge domain (e.g., Python, OpenClaw)") + parser.add_argument("path", help="Hierarchical path (e.g., OpenClaw/Docs/Overview)") + parser.add_argument("--title", help="Override title") + parser.add_argument("--subjects", help="Comma-separated subjects") + parser.add_argument("--category", default="reference", help="Category: reference|tutorial|snippet|troubleshooting|concept") + parser.add_argument("--content-type", default="web_page", help="Content type: web_page|code|markdown|pdf|note") + + args = parser.parse_args() + + print(f"🔍 Fetching {args.url}...") + html = fetch_url(args.url) + if not html: + sys.exit(1) + + print("✂️ Extracting text...") + title, text = extract_text(html) + if args.title: + title = args.title + + print(f"📄 Title: {title}") + print(f"📝 Content length: {len(text)} chars") + + if len(text) < 200: + print("❌ Content too short, skipping", file=sys.stderr) + sys.exit(1) + + print("🧩 Chunking...") + chunks = chunk_text(text) + print(f" {len(chunks)} chunks") + + subjects = [s.strip() for s in args.subjects.split(",")] if args.subjects else [] + checksum = compute_checksum(text) + date_added = "2026-02-05" + + print("💾 Storing chunks...") + stored = 0 + for i, chunk in enumerate(chunks): + chunk_metadata = { + "domain": args.domain, + "path": f"{args.path}/chunk-{i+1}", + "subjects": subjects, + "category": args.category, + "content_type": args.content_type, + "title": f"{title} (part {i+1}/{len(chunks)})", + "checksum": checksum, + "source_url": args.url, + "date_added": date_added, + "chunk_index": i + 1, + "total_chunks": len(chunks), + "text_preview": chunk[:200] + "..." 
def update_access_stats(point_id, current_payload):
    """Bump ``access_count`` and refresh ``last_accessed`` for one memory.

    Args:
        point_id: ID of the Qdrant point that was just retrieved.
        current_payload: Payload returned with the search hit; its
            ``access_count`` (missing or null counts as 0) is incremented.

    Returns:
        True if Qdrant answered with status "ok", False on any failure.
        Failures are deliberately not raised so that a tracking problem
        never breaks the search that triggered it.
    """
    access_count = (current_payload.get("access_count") or 0) + 1
    last_accessed = datetime.now().isoformat()

    # The payload endpoint expects the new fields under "payload" and the
    # target point IDs under "points".
    update_body = {
        "payload": {
            "access_count": access_count,
            "last_accessed": last_accessed,
        },
        "points": [point_id],
    }

    # POST merges the given keys into the stored payload; PUT on the same
    # path would replace the whole payload and drop text, tags, etc.
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/payload?wait=true",
        data=json.dumps(update_body).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    try:
        with urllib.request.urlopen(req, timeout=5) as response:
            result = json.loads(response.read().decode())
    except Exception:
        # Best effort: access tracking must not break the search itself.
        return False
    return result.get("status") == "ok"
help="Number of results (default: 5)") + parser.add_argument("--filter-tag", help="Filter by tag") + parser.add_argument("--json", action="store_true", help="Output as JSON") + parser.add_argument("--no-track", action="store_true", help="Don't update access stats") + + args = parser.parse_args() + + print(f"Generating query embedding...", file=sys.stderr) + query_vector = get_embedding(args.query) + + if query_vector is None: + print("❌ Failed to generate embedding", file=sys.stderr) + sys.exit(1) + + print(f"Searching Qdrant...", file=sys.stderr) + results = search_memories(query_vector, args.limit, args.filter_tag, track_access=not args.no_track) + + if not results: + print("No matching memories found.") + sys.exit(0) + + if args.json: + # JSON output with all metadata + output = [] + for r in results: + payload = r["payload"] + output.append({ + "id": r.get("id"), + "score": r["score"], + "text": payload.get("text", ""), + "date": payload.get("date", ""), + "tags": payload.get("tags", []), + "importance": payload.get("importance", "medium"), + "confidence": payload.get("confidence", "medium"), + "verified": payload.get("verified", False), + "source_type": payload.get("source_type", "inferred"), + "access_count": payload.get("access_count", 0), + "last_accessed": payload.get("last_accessed", ""), + "expires_at": payload.get("expires_at", None) + }) + print(json.dumps(output, indent=2)) + else: + # Human-readable output + print(f"\n🔍 Found {len(results)} similar memories:\n") + for i, r in enumerate(results, 1): + payload = r["payload"] + score = r["score"] + text = payload.get("text", "")[:200] + if len(payload.get("text", "")) > 200: + text += "..." + date = payload.get("date", "unknown") + tags = ", ".join(payload.get("tags", [])) + importance = payload.get("importance", "medium") + access_count = payload.get("access_count", 0) + verified = "✓" if payload.get("verified", False) else "?" + + print(f"{i}. 
def send_email(to_email, subject, body, reply_to=None, attachment_path=None):
    """Send a plain-text email through Gmail SMTP, optionally with one file.

    Args:
        to_email: Recipient address.
        subject: Subject line.
        body: Plain-text message body.
        reply_to: Optional Message-ID of the mail being answered; it is
            written to the In-Reply-To and References headers.
        attachment_path: Optional path of a file to attach. If the path
            does not exist a warning is printed and the mail is sent
            without the attachment.

    Raises:
        OSError, KeyError, smtplib.SMTPException: propagated from reading
            the credentials file or talking to the SMTP server.
    """
    creds = load_credentials()
    smtp_server = "smtp.gmail.com"
    smtp_port = 587

    msg = MIMEMultipart()
    msg['From'] = f"Kimi <{creds['email']}>"
    msg['To'] = to_email
    msg['Subject'] = subject
    if reply_to:
        msg['In-Reply-To'] = reply_to
        msg['References'] = reply_to

    # Attach body
    msg.attach(MIMEText(body, 'plain'))

    # Attach file if provided
    if attachment_path:
        if os.path.exists(attachment_path):
            with open(attachment_path, 'rb') as f:
                mime_base = MIMEBase('application', 'octet-stream')
                mime_base.set_payload(f.read())

            encoders.encode_base64(mime_base)
            filename = os.path.basename(attachment_path)
            # Passing the filename as a header parameter lets the email
            # package quote/encode names with spaces or non-ASCII characters.
            mime_base.add_header('Content-Disposition', 'attachment', filename=filename)
            msg.attach(mime_base)
            print(f"📎 Attached: {filename}")
        else:
            print(f"Warning: attachment not found, sending without it: {attachment_path}",
                  file=sys.stderr)

    # A socket timeout keeps an unreachable server from hanging the script.
    with smtplib.SMTP(smtp_server, smtp_port, timeout=30) as server:
        server.starttls()
        server.login(creds['email'], creds['app_password'])
        server.send_message(msg)

    print(f"✉️ Sent to {to_email}")
#!/bin/bash
# Daily Conversation Backup - 7-Day Sliding Window
# Re-processes the last 7 days (oldest first, today last) to catch any
# conversations that an earlier run missed.

SCRIPT_DIR="/root/.openclaw/workspace/skills/qdrant-memory"
LOG_FILE="/var/log/qdrant-daily-backup.log"

echo "==============================================" >> "$LOG_FILE"
echo "7-Day Sliding Window Backup - $(date)" >> "$LOG_FILE"
echo "==============================================" >> "$LOG_FILE"

# The backup script is addressed relative to SCRIPT_DIR; stop early (and say
# so in the log) if that directory is missing.
cd "$SCRIPT_DIR" || { echo "Cannot cd to $SCRIPT_DIR" >> "$LOG_FILE"; exit 1; }

# Process last 7 days.
# Offsets must be positive: with "ago", a negative count such as
# "-6 days ago" resolves to a date six days in the FUTURE.
for day_offset in 6 5 4 3 2 1 0; do
    date_str=$(date -d "$day_offset days ago" +%Y-%m-%d)
    echo "Processing: $date_str..." >> "$LOG_FILE"
    python3 scripts/daily_conversation_backup.py "$date_str" >> "$LOG_FILE" 2>&1
done

echo "Backup complete at $(date)" >> "$LOG_FILE"
echo "" >> "$LOG_FILE"
def main():
    """CLI entry point: fetch a URL, extract content with CSS selectors,
    then either save the text to a file (--output) or chunk it and store
    the chunks in the knowledge base.

    Exits with status 1 when the page cannot be fetched or the extracted
    text is shorter than 200 characters.
    """
    # Local import: this script has no module-level datetime import.
    from datetime import datetime

    parser = argparse.ArgumentParser(description="Smart HTML parser with CSS selectors")
    parser.add_argument("url", help="URL to parse")
    parser.add_argument("--domain", required=True, help="Knowledge domain")
    parser.add_argument("--path", required=True, help="Hierarchical path")
    parser.add_argument("--selector", "-s", action='append', nargs=2, metavar=('NAME', 'CSS'),
                        help="CSS selector (e.g., -s content article -s title h1)")
    parser.add_argument("--content-only", action="store_true", help="Only extract main content")
    parser.add_argument("--title-selector", help="CSS selector for title")
    parser.add_argument("--remove", action='append', help="Selectors to remove")
    parser.add_argument("--category", default="reference")
    parser.add_argument("--content-type", default="web_page")
    parser.add_argument("--subjects", help="Comma-separated subjects")
    parser.add_argument("--title", help="Override title")
    parser.add_argument("--output", "-o", help="Save to file instead of KB")

    args = parser.parse_args()

    # Build selectors dict
    selectors = {}
    if args.selector:
        for name, css in args.selector:
            selectors[name] = css

    if args.content_only:
        selectors["_content"] = "main, article, [role='main'], .content, .post, .entry, #content, body"

    if args.title_selector:
        selectors["_title"] = args.title_selector

    if args.remove:
        # NOTE(review): parse_with_selectors has no branch that acts on
        # "_remove", so --remove currently has no effect on the output.
        selectors["_remove"] = ", ".join(args.remove)

    print(f"🔍 Fetching {args.url}...")
    html = fetch_url(args.url)
    if not html:
        sys.exit(1)

    print("🔧 Parsing...")
    data = parse_with_selectors(html, selectors)

    if args.title:
        data["title"] = args.title

    text = format_extracted(data)

    print(f"📄 Title: {data['title']}")
    print(f"📝 Content: {len(text)} chars")
    print(f"📊 Sections: {len(data['sections'])}")

    if args.output:
        # Explicit UTF-8 so scraped non-ASCII text is written correctly
        # regardless of the locale's default encoding.
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"💾 Saved to {args.output}")
        return

    if len(text) < 200:
        print("❌ Content too short", file=sys.stderr)
        sys.exit(1)

    chunks = chunk_text(text)
    print(f"🧩 Chunks: {len(chunks)}")

    subjects = [s.strip() for s in args.subjects.split(",")] if args.subjects else []
    checksum = compute_checksum(text)
    # Record the actual ingestion date instead of a fixed literal.
    date_added = datetime.now().strftime("%Y-%m-%d")

    print("💾 Storing...")
    stored = 0
    for i, chunk in enumerate(chunks):
        chunk_metadata = {
            "domain": args.domain,
            "path": f"{args.path}/chunk-{i+1}",
            "subjects": subjects,
            "category": args.category,
            "content_type": args.content_type,
            "title": f"{data['title']} (part {i+1}/{len(chunks)})",
            "checksum": checksum,
            "source_url": args.url,
            "date_added": date_added,
            "chunk_index": i + 1,
            "total_chunks": len(chunks),
            "text_preview": chunk[:200] + "..." if len(chunk) > 200 else chunk,
            "scraper_type": "smart_parser_bs4",
            "extracted_sections": [s["name"] for s in data["sections"]]
        }

        if store_in_kb(chunk, chunk_metadata):
            stored += 1
            print(f" ✓ Chunk {i+1}")
        else:
            # Make failed chunks visible instead of only lowering the total.
            print(f" ✗ Chunk {i+1} failed")

    print(f"\n🎉 Stored {stored}/{len(chunks)} chunks")
def get_embedding(text):
    """Ask Ollama's embed endpoint for the embedding of *text*.

    Only the first 1000 characters are sent, to keep requests fast.
    Returns the first embedding of the response, or None when the request
    or the decoding of the response fails (the error goes to stderr).
    """
    body = json.dumps({
        "model": "nomic-embed-text",
        "input": text[:1000],  # Limit for speed
    }).encode()
    request = urllib.request.Request(
        OLLAMA_EMBED_URL,
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            decoded = json.loads(response.read().decode())
        embeddings = decoded.get("embeddings", [None])
        return embeddings[0]
    except Exception as e:
        print(f"⚠️ Embedding error: {e}", file=sys.stderr)
        return None
def fetch_and_extract(url):
    """Fetch *url* and reduce the HTML to a title and plain text.

    Args:
        url: Address of the page to download.

    Returns:
        ``(title, text)`` where *title* is the content of the first
        ``<title>`` element ("Untitled" if there is none) and *text* is the
        page with script/style blocks and all tags removed, whitespace
        collapsed, truncated to 3000 characters. ``(None, None)`` if the
        page could not be fetched.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    req = urllib.request.Request(url, headers=headers)

    try:
        with urllib.request.urlopen(req, timeout=20) as response:
            html = response.read().decode('utf-8', errors='ignore')
    except Exception as e:
        print(f"⚠️ Fetch error: {e}", file=sys.stderr)
        return None, None

    # Extract title
    title_match = re.search(r'<title[^>]*>([^<]*)</title>', html, re.IGNORECASE)
    title = title_match.group(1).strip() if title_match else "Untitled"

    # Drop script/style blocks together with their content first, so their
    # source code does not end up in the extracted text.
    html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
    # Then strip every remaining tag and collapse whitespace.
    html = re.sub(r'<[^>]+>', ' ', html)
    text = re.sub(r'\s+', ' ', html).strip()

    return title, text[:3000]  # Limit content
def suggest_domain(query, title, content):
    """Suggest a knowledge-base domain from keywords in query/title/content.

    The query, the title and the first 500 characters of the content are
    lower-cased and scanned for each domain's keywords, in the order the
    domains are listed; the first domain with a hit wins.

    Keywords of up to three characters ("ai", "ml", "js", "sql", ...) must
    appear as whole tokens, because as plain substrings they hit unrelated
    words ("ai" in "email", "ml" in "html"). Longer keywords keep substring
    matching so that e.g. "containers" still matches "container".

    Returns:
        The matching domain name, or "General" when nothing matches.
    """
    # Keyword mapping
    domains = {
        "Python": ["python", "pip", "django", "flask", "asyncio"],
        "JavaScript": ["javascript", "js", "node", "react", "vue", "angular"],
        "Linux": ["linux", "ubuntu", "debian", "systemd", "bash", "shell"],
        "Networking": ["network", "dns", "tcp", "http", "ssl", "vpn"],
        "Docker": ["docker", "container", "kubernetes", "k8s"],
        "AI/ML": ["ai", "ml", "machine learning", "llm", "gpt", "model"],
        "OpenClaw": ["openclaw"],
        "Database": ["database", "sql", "postgres", "mysql", "redis"],
        "Security": ["security", "encryption", "auth", "oauth", "jwt"],
        "DevOps": ["devops", "ci/cd", "github actions", "jenkins"]
    }

    combined = " ".join((query.lower(), title.lower(), content[:500].lower()))

    def mentions(keyword):
        if len(keyword) > 3:
            return keyword in combined
        # Short keyword: require non-alphanumeric (or string) boundaries.
        pattern = r"(?<![a-z0-9])" + re.escape(keyword) + r"(?![a-z0-9])"
        return re.search(pattern, combined) is not None

    for domain, keywords in domains.items():
        if any(mentions(kw) for kw in keywords):
            return domain

    return "General"
(score: {score:.2f}) [{source}...]") + else: + print(f" ✗ No KB matches above threshold ({args.min_kb_score})") + + # Step 2: Web search + print(f"\n🌐 Searching web...") + web_results = web_search(args.query, limit=args.web_limit) + results["web_results"] = web_results + + if not web_results: + print(f" ✗ No web results") + if args.json: + print(json.dumps(results, indent=2)) + return + + print(f" ✓ Found {len(web_results)} web results") + + # Step 3: Check and optionally store new findings + new_stored = 0 + + for web_result in web_results: + url = web_result.get("url", "") + title = web_result.get("title", "Untitled") + snippet = web_result.get("content", "") + + print(f"\n📄 Checking: {title}") + print(f" URL: {url}") + + # Fetch full content + fetched_title, content = fetch_and_extract(url) + if not content: + print(f" ⚠️ Could not fetch content") + continue + + title = fetched_title or title + + # Check if substantial + if not is_substantial(content): + print(f" ⏭️ Content too short ({len(content)} chars), skipping") + continue + + # Check if unique + if not is_unique_content(content, kb_results): + print(f" ⏭️ Similar content already in KB") + continue + + print(f" ✓ New substantial content ({len(content)} chars)") + + # Auto-store or suggest + if args.store_new: + domain = suggest_domain(args.query, title, content) + subjects = [s.strip() for s in args.query.lower().split() if len(s) > 3] + + metadata = { + "domain": domain, + "path": f"{domain}/Web/{re.sub(r'[^\w\s-]', '', title)[:30]}", + "subjects": subjects, + "category": "reference", + "content_type": "web_page", + "title": title, + "source_url": url, + "date_added": datetime.now().strftime("%Y-%m-%d") + } + + if store_in_kb(content, metadata): + print(f" ✅ Stored in KB (domain: {domain})") + new_stored += 1 + else: + print(f" ❌ Failed to store") + else: + print(f" 💡 Use --store-new to save this") + + results["stored_count"] = new_stored + + # Summary + print(f"\n📊 Summary:") + print(f" KB results: 
def extract_tags(text: str, date_str: str) -> List[str]:
    """Derive tags for a piece of text from a fixed keyword table.

    Args:
        text: Text to scan; matching is case-insensitive substring search.
        date_str: Date string that is added as a tag of its own.

    Returns:
        De-duplicated tags in a stable order: the three base tags
        ("conversation-turn", "atomic-fact", *date_str*) followed by the
        keyword tags in table order.
    """
    tags = ["conversation-turn", "atomic-fact", date_str]

    text_lower = text.lower()
    tag_mappings = {
        "youtube": "youtube",
        "video": "video",
        "workflow": "workflow",
        "process": "process",
        "qdrant": "qdrant",
        "memory": "memory",
        "fact": "facts",
        "extract": "extraction",
        "config": "configuration",
        "setting": "settings",
        "rule": "rules",
        "decision": "decisions",
        "preference": "preferences",
        "hardware": "hardware",
        "security": "security",
        "research": "research",
        "step": "steps",
        "grok": "grok",
        "thumbnail": "thumbnail",
        "title": "title",
        "description": "description",
        "seo": "seo",
        "tags": "tags",
    }

    tags.extend(tag for keyword, tag in tag_mappings.items() if keyword in text_lower)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # stored tag list is identical from run to run (a set is not ordered).
    return list(dict.fromkeys(tags))
def store_conversation_pair(
    user_message: str,
    ai_response: str,
    date_str: str,
    tags: List[str] = None,
    importance: str = "medium"
) -> tuple:
    """Store one user/AI exchange as two turns sharing a conversation ID.

    The user message is stored as turn 1 (speaker "Rob") and the AI
    response as turn 2 (speaker "Kimi"), both under a freshly generated
    conversation ID.

    Returns:
        ``(user_id, ai_id)`` - whatever ``store_turn`` returned for each
        turn (None for a turn that could not be stored).
    """
    # Keyword arguments common to both turns.
    shared = {
        "date_str": date_str,
        "tags": tags,
        "conversation_id": str(uuid.uuid4()),
        "importance": importance,
    }

    user_id = store_turn(speaker="Rob", message=user_message, turn_number=1, **shared)
    ai_id = store_turn(speaker="Kimi", message=ai_response, turn_number=2, **shared)

    return user_id, ai_id
def main():
    """CLI entry point: store a single conversation pair or a batch file.

    Modes:
        * ``--file conversations.json``: a JSON array of objects with
          "user" and "ai" keys (optional "tags"); every pair is stored.
        * two positional arguments: one user message and one AI response;
          with ``--extract-facts`` the response is additionally split
          into sentence-level facts that are uploaded in one request.

    Exits with status 1 if the single pair cannot be stored or if no
    usable arguments were given.
    """
    parser = argparse.ArgumentParser(description="Store conversational turns to Qdrant")
    parser.add_argument("user_message", nargs="?", help="User's message/query")
    parser.add_argument("ai_response", nargs="?", help="AI's response")
    parser.add_argument("--date", default=datetime.now().strftime("%Y-%m-%d"), help="Date (YYYY-MM-DD)")
    parser.add_argument("--tags", help="Comma-separated tags")
    parser.add_argument("--importance", default="medium", choices=["low", "medium", "high"])
    parser.add_argument("--file", help="JSON file with conversation array")
    parser.add_argument("--extract-facts", action="store_true", help="Also extract atomic facts from response")

    args = parser.parse_args()

    # Normalise "--tags a, b,,c": strip whitespace and drop empty entries so
    # tags are stored exactly as they will later be filtered on.
    tags = None
    if args.tags:
        tags = [t.strip() for t in args.tags.split(",") if t.strip()] or None

    if args.file:
        # Batch mode from JSON file (JSON text is read as UTF-8)
        with open(args.file, 'r', encoding='utf-8') as f:
            conversations = json.load(f)

        total = 0
        for conv in conversations:
            user_id, ai_id = store_conversation_pair(
                conv["user"],
                conv["ai"],
                args.date,
                tags or conv.get("tags"),
                args.importance
            )
            # Only pairs where both turns were stored are counted.
            if user_id and ai_id:
                total += 2

        print(f"✅ Stored {total} conversation turns")

    elif args.user_message and args.ai_response:
        # Single pair mode
        user_id, ai_id = store_conversation_pair(
            args.user_message,
            args.ai_response,
            args.date,
            tags,
            args.importance
        )

        if user_id and ai_id:
            print(f"✅ Stored conversation pair")
            print(f" User turn: {user_id[:8]}...")
            print(f" AI turn: {ai_id[:8]}...")

            if args.extract_facts:
                facts = extract_facts_from_text(args.ai_response, args.date)
                if facts:
                    # Upload facts
                    upsert_data = {"points": facts}
                    req = urllib.request.Request(
                        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
                        data=json.dumps(upsert_data).encode(),
                        headers={"Content-Type": "application/json"},
                        method="PUT"
                    )
                    try:
                        with urllib.request.urlopen(req, timeout=30) as response:
                            print(f" Extracted {len(facts)} additional facts")
                    except Exception as e:
                        # Fact extraction is an extra; the pair itself is
                        # already stored, so only warn.
                        print(f" Warning: Could not store extracted facts: {e}")
        else:
            print("❌ Failed to store conversation")
            sys.exit(1)
    else:
        parser.print_help()
        sys.exit(1)
def check_existing(date: str = None) -> Optional[str]:
    """Look up an already-stored point for ``date`` in the Qdrant collection.

    Args:
        date: Value matched against the ``date`` payload key. A falsy value
            short-circuits without contacting Qdrant.

    Returns:
        The ID of the first point returned by the scroll query, or ``None``
        when no date was given, nothing matched, or the lookup failed (a
        warning is printed to stderr in the failure case).
    """
    if not date:
        return None

    scroll_query = {
        "limit": 100,
        "with_payload": True,
        "filter": {
            "must": [{"key": "date", "match": {"value": date}}]
        },
    }

    try:
        request = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
            data=json.dumps(scroll_query).encode(),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=10) as response:
            reply = json.loads(response.read().decode())
        matches = reply.get("result", {}).get("points", [])
        # Only the first match is reported, even if several share the date.
        return matches[0]["id"] if matches else None
    except Exception as e:
        # Best effort: a failed lookup is treated the same as "no entry".
        print(f"Warning: Could not check existing: {e}", file=sys.stderr)
        return None
def upload_points_batch(points: List[Dict[str, Any]], batch_size: int = DEFAULT_BATCH_SIZE) -> tuple:
    """Upsert ``points`` into the Qdrant collection in fixed-size batches.

    Each batch is sent as one PUT request with ``wait=true``. A batch counts
    as uploaded only when the response reports ``"status": "ok"``; any other
    response or any exception counts the whole batch as failed and
    processing continues with the next batch.

    Args:
        points: Point dicts ready for upsert.
        batch_size: Maximum number of points per request; must be >= 1.

    Returns:
        ``(uploaded, failed)`` point counts.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step makes range() empty, which would silently skip every
    # point and report (0, 0) as if nothing had needed uploading.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    uploaded = 0
    failed = 0

    for start in range(0, len(points), batch_size):
        batch = points[start:start + batch_size]

        req = urllib.request.Request(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
            data=json.dumps({"points": batch}).encode(),
            headers={"Content-Type": "application/json"},
            method="PUT"
        )

        try:
            with urllib.request.urlopen(req, timeout=60) as response:
                result = json.loads(response.read().decode())
            if result.get("status") == "ok":
                uploaded += len(batch)
            else:
                print(f"Batch upload failed: {result}", file=sys.stderr)
                failed += len(batch)
        except urllib.error.HTTPError as e:
            # Include the response body in the message, as
            # store_single_memory does for its own HTTP errors.
            error_body = e.read().decode(errors="replace")
            print(f"Batch upload error: HTTP Error {e.code}: {error_body}", file=sys.stderr)
            failed += len(batch)
        except Exception as e:
            print(f"Batch upload error: {e}", file=sys.stderr)
            failed += len(batch)

    return uploaded, failed
+ existing_id = check_existing(date=date) if date else None + if existing_id and not replace: + print(f"⚠️ Entry for {date} already exists (ID: {existing_id})") + print(f" Use --replace to overwrite") + return None + + # Use existing ID if replacing, otherwise generate new + point_id = existing_id if existing_id else str(uuid.uuid4()) + + # Build payload with all metadata + payload = { + "text": text, + "date": date, + "tags": tags or [], + "importance": importance, + "source": source, + "confidence": confidence, + "source_type": source_type, + "verified": verified, + "created_at": datetime.now().isoformat(), + "access_count": 0, + "last_accessed": datetime.now().isoformat() + } + + # Optional metadata + if expires_at: + payload["expires_at"] = expires_at + if related_memories: + payload["related_memories"] = related_memories + + # Qdrant upsert format + upsert_data = { + "points": [{ + "id": point_id, + "vector": embedding, + "payload": payload + }] + } + + req = urllib.request.Request( + f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true", + data=json.dumps(upsert_data).encode(), + headers={"Content-Type": "application/json"}, + method="PUT" + ) + + try: + with urllib.request.urlopen(req, timeout=10) as response: + result = json.loads(response.read().decode()) + if result.get("status") == "ok": + return point_id + else: + print(f"Qdrant response: {result}", file=sys.stderr) + return None + except urllib.error.HTTPError as e: + error_body = e.read().decode() + print(f"HTTP Error {e.code}: {error_body}", file=sys.stderr) + return None + except Exception as e: + print(f"Error storing memory: {e}", file=sys.stderr) + return None + + +def store_memories_batch( + memories: List[Dict[str, Any]], + batch_size: int = DEFAULT_BATCH_SIZE +) -> tuple: + """Store multiple memories in batch""" + if not memories: + return 0, 0 + + # Generate embeddings for all + texts = [m["text"] for m in memories] + print(f"Generating embeddings for {len(texts)} memories...") + 
def parse_date(date_str: str) -> Optional[str]:
    """Check that ``date_str`` parses as ``%Y-%m-%d``.

    Args:
        date_str: Candidate date string; a falsy value means "no date".

    Returns:
        The input string unchanged when it parses, otherwise ``None``. A
        message is printed to stderr only when a non-empty value fails to
        parse.
    """
    if not date_str:
        return None

    try:
        # Only the parse outcome matters; the datetime itself is discarded.
        datetime.strptime(date_str, "%Y-%m-%d")
    except ValueError:
        print(f"Invalid date format: {date_str}. Use YYYY-MM-DD.", file=sys.stderr)
        return None

    return date_str
file: {e}", file=sys.stderr) + sys.exit(1) + + # Single memory mode + if not args.text: + print("Error: Either provide text argument or use --batch-file", file=sys.stderr) + parser.print_help() + sys.exit(1) + + # Parse tags and related memories + tags = [t.strip() for t in args.tags.split(",")] if args.tags else [] + related = [r.strip() for r in args.related.split(",")] if args.related else None + + # Validate date + date = parse_date(args.date) + if args.date and not date: + sys.exit(1) + + print(f"Generating embedding...") + embedding = get_embedding(args.text) + + if embedding is None: + print("❌ Failed to generate embedding", file=sys.stderr) + sys.exit(1) + + print(f"Storing memory (vector dim: {len(embedding)})...") + point_id = store_single_memory( + text=args.text, + embedding=embedding, + tags=tags, + importance=args.importance, + date=date, + source=args.source, + confidence=args.confidence, + source_type=args.source_type, + verified=args.verified, + expires_at=args.expires, + related_memories=related, + replace=args.replace + ) + + if point_id: + print(f"✅ Memory stored successfully") + print(f" ID: {point_id}") + print(f" Tags: {tags}") + print(f" Importance: {args.importance}") + print(f" Confidence: {args.confidence}") + print(f" Source: {args.source_type}") + if args.expires: + print(f" Expires: {args.expires}") + else: + print(f"❌ Failed to store memory", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/skills/qdrant-memory/scripts/tagger.py b/skills/qdrant-memory/scripts/tagger.py new file mode 100644 index 0000000..9bd5ace --- /dev/null +++ b/skills/qdrant-memory/scripts/tagger.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +"""Tagger (optional): generate lightweight tags + title for a conversation chunk. + +Design goals: +- Cheap: run on a smaller model (e.g. Minimax 2.5 or any OpenAI-compatible endpoint). +- Portable: fully env-configured. +- Deterministic-ish: JSON output. + +This is intentionally optional. 
def call_openai_compat(text: str) -> dict:
    """Request tagging metadata for ``text`` from the configured chat endpoint.

    Sends the module's SYSTEM prompt plus the first 12000 characters of
    ``text`` to ``{BASE_URL}/chat/completions`` and parses the content of the
    first choice as JSON.

    Args:
        text: Conversation chunk to describe.

    Returns:
        The JSON object decoded from the model's reply.

    Raises:
        RuntimeError: If TAGGER_BASE_URL or TAGGER_API_KEY is not configured.
        Exception: Network, HTTP and JSON decoding errors are not caught
            here and propagate to the caller.
    """
    if not (BASE_URL and API_KEY):
        raise RuntimeError("TAGGER_BASE_URL and TAGGER_API_KEY must be set")

    messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": text[:12000]},
    ]
    encoded_body = json.dumps({
        "model": MODEL,
        "messages": messages,
        "temperature": 0.2,
        "response_format": {"type": "json_object"},
    }).encode("utf-8")

    request = urllib.request.Request(
        f"{BASE_URL}/chat/completions",
        data=encoded_body,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {API_KEY}",
        },
    )

    with urllib.request.urlopen(request, timeout=TIMEOUT) as reply:
        envelope = json.loads(reply.read().decode("utf-8"))

    # The model's message content is itself a JSON document.
    return json.loads(envelope["choices"][0]["message"]["content"])
"__main__": + main() diff --git a/skills/task-queue/SKILL.md b/skills/task-queue/SKILL.md new file mode 100644 index 0000000..22cbcbf --- /dev/null +++ b/skills/task-queue/SKILL.md @@ -0,0 +1,33 @@ +# Task Queue Skill + +Redis-based task queue for background jobs. + +## What It Does + +Queues and executes tasks via heartbeat worker. + +## Commands + +```bash +# Add a task +python3 scripts/add_task.py "Check disk space" + +# List tasks +python3 scripts/list_tasks.py + +# Execute (runs on heartbeat) +python3 scripts/heartbeat_worker.py +``` + +## Heartbeat Integration + +Add to HEARTBEAT.md: +```bash +python3 /path/to/skills/task-queue/scripts/heartbeat_worker.py +``` + +## Files + +- `add_task.py` - Add task to queue +- `list_tasks.py` - View queue status +- `heartbeat_worker.py` - Execute pending tasks diff --git a/skills/task-queue/scripts/add_task.py b/skills/task-queue/scripts/add_task.py new file mode 100755 index 0000000..46a33ac --- /dev/null +++ b/skills/task-queue/scripts/add_task.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +""" +Add a task to the queue. 
def generate_task_id():
    """Build a task ID of the form ``task_<unix seconds>_<8 hex chars>``."""
    epoch_seconds = int(time.time())
    # Four random bytes render as eight hex characters; the slice is kept so
    # the suffix length stays pinned at eight.
    random_suffix = os.urandom(4).hex()[:8]
    return f"task_{epoch_seconds}_{random_suffix}"
def get_pending_task(r):
    """Take the next task ID from the head of the ``tasks:pending`` list.

    Args:
        r: Redis client (anything exposing ``lpop``).

    Returns:
        The task ID at the head of the queue, or ``None`` when the queue is
        empty.
    """
    # add_task.py LPUSHes high-priority IDs onto the head of the list and
    # RPUSHes everything else onto the tail. Consuming from the head (LPOP)
    # therefore serves high-priority tasks first and the rest in FIFO order.
    # Popping from the tail would run the newest normal task first and leave
    # high-priority tasks until last.
    task_id = r.lpop("tasks:pending")
    if task_id:
        return task_id
    return None
+Respond ONLY with valid JSON in this format: +{{ + "commands": [ + "ssh -t {ssh_user}@{target_host} 'sudo apt update'", + "ssh -t {ssh_user}@{target_host} 'sudo apt install -y docker.io'" + ], + "expected_results": [ + "apt updated successfully", + "docker installed and running" + ], + "explanation": "Updating packages and installing Docker" +}} + +Rules: +- Commands should use ssh -t (allocates TTY for sudo) to execute on the remote host +- Use sudo only when needed +- Keep commands safe and idempotent where possible +- If task is unclear, ask for clarification in explanation + +For Docker-related tasks: +- Search Docker Hub for official images (docker.io/library/ or verified publishers) +- Prefer latest stable versions +- Use official images over community when available +- Verify image exists before trying to pull +- Map volumes as specified in the task (e.g., -v /root/html:/usr/share/nginx/html) +""" + + user_prompt = f"Task: {task_description}\n\nGenerate the commands to complete this task." 
def execute_ssh_command_with_sudo(command, sudo_pass, timeout=300):
    """
    Execute a shell/SSH command on a pseudo-terminal, answering a sudo prompt.

    If the command starts with "ssh " and does not already contain "-t", a
    "-t" flag is inserted so the remote side allocates a TTY. The command's
    stdin/stdout/stderr are attached to a local pty; the first time a chunk
    of output contains "password:" or "password for" (case-insensitive),
    ``sudo_pass`` is written back once.

    Args:
        command: Shell command line (run with ``shell=True``).
        sudo_pass: Password sent in response to the first password prompt.
        timeout: Maximum number of seconds to let the command run.

    Returns:
        dict with keys ``success`` (exit code was 0), ``stdout`` (everything
        read from the pty), ``stderr`` (empty on success, otherwise the same
        text as ``stdout``, or an error/timeout message) and ``exit_code``
        (-1 on timeout or internal error).
    """
    # NOTE(review): `command` is executed through the shell and, for GPT
    # tasks, originates from model output -- treat it as untrusted input.
    master_fd = None

    def timed_out(collected):
        return {
            "success": False,
            "stdout": "".join(collected),
            "stderr": "Command timed out",
            "exit_code": -1
        }

    try:
        # Imported here (inside the try) so a platform without pty support
        # yields an error result instead of an ImportError.
        import pty
        import select

        # Ensure command has -t flag for TTY
        if "-t" not in command and command.startswith("ssh "):
            command = command.replace("ssh ", "ssh -t ", 1)

        master_fd, slave_fd = pty.openpty()
        try:
            process = subprocess.Popen(
                command,
                shell=True,
                stdin=slave_fd,
                stdout=slave_fd,
                stderr=slave_fd,
                preexec_fn=os.setsid
            )
        finally:
            # The child holds its own copy; keeping ours open would prevent
            # the master side from ever seeing end-of-output.
            os.close(slave_fd)

        output = []
        password_sent = False
        deadline = time.time() + timeout

        while True:
            exited = process.poll() is not None

            if time.time() > deadline:
                if exited:
                    break
                process.kill()
                process.wait()
                return timed_out(output)

            # Once the child has exited, only drain what is already buffered.
            ready, _, _ = select.select([master_fd], [], [], 0 if exited else 0.1)
            if not ready:
                if exited:
                    break
                continue

            try:
                raw = os.read(master_fd, 1024)
            except OSError:
                # Raised once every slave descriptor has been closed.
                break
            if not raw:
                break

            # errors="replace": a multibyte character may be split across
            # two 1024-byte reads; that must not abort the whole command.
            data = raw.decode(errors="replace")
            output.append(data)

            # Check for sudo password prompt
            lowered = data.lower()
            if not password_sent and ("password:" in lowered or "password for" in lowered):
                os.write(master_fd, (sudo_pass + "\n").encode())
                password_sent = True
                time.sleep(0.5)

        # Reap the child so returncode is always populated.
        try:
            process.wait(timeout=max(0.0, deadline - time.time()))
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()
            return timed_out(output)

        stdout = "".join(output)
        return {
            "success": process.returncode == 0,
            "stdout": stdout,
            "stderr": "" if process.returncode == 0 else stdout,
            "exit_code": process.returncode
        }

    except Exception as e:
        return {
            "success": False,
            "stdout": "",
            "stderr": str(e),
            "exit_code": -1
        }
    finally:
        # Always release the pty master, including on timeout and error.
        if master_fd is not None:
            try:
                os.close(master_fd)
            except OSError:
                pass
+ """ + try: + result = subprocess.run( + command, + shell=True, + capture_output=True, + text=True, + timeout=timeout + ) + return { + "success": result.returncode == 0, + "stdout": result.stdout, + "stderr": result.stderr, + "exit_code": result.returncode + } + except subprocess.TimeoutExpired: + return { + "success": False, + "stdout": "", + "stderr": "Command timed out", + "exit_code": -1 + } + except Exception as e: + return { + "success": False, + "stdout": "", + "stderr": str(e), + "exit_code": -1 + } + +def execute_task_with_gpt(task): + """ + Execute task using GPT to generate commands, then run via SSH. + """ + task_description = task.get("description", "No description") + target_host = task.get("target_host", "10.0.0.38") + ssh_user = task.get("ssh_user", "n8n") + sudo_pass = task.get("sudo_pass", "passw0rd") + + print(f"[GPT] Generating commands for: {task_description}") + + # Get commands from GPT + gpt_plan = ask_gpt_for_commands(task_description, target_host, ssh_user, sudo_pass) + + if not gpt_plan.get("commands"): + comments = f"GPT failed to generate commands: {gpt_plan.get('explanation', 'Unknown error')}" + return { + "success": False, + "gpt_plan": gpt_plan, + "execution_results": [], + "comments": comments + } + + print(f"[GPT] Plan: {gpt_plan.get('explanation', 'No explanation')}") + print(f"[EXEC] Running {len(gpt_plan['commands'])} commands...") + + # Execute each command + execution_results = [] + any_failed = False + + for i, cmd in enumerate(gpt_plan["commands"]): + print(f"[CMD {i+1}] {cmd[:80]}...") + + # Check if command uses sudo + if "sudo" in cmd.lower(): + result = execute_ssh_command_with_sudo(cmd, sudo_pass) + else: + result = execute_ssh_command_simple(cmd) + + execution_results.append({ + "command": cmd, + "result": result + }) + + if not result["success"]: + any_failed = True + print(f"[FAIL] Exit code {result['exit_code']}: {result['stderr'][:100]}") + else: + print(f"[OK] Success") + + # Build comments field + if 
def mark_completed(r, task_id, result_data):
    """Record a finished task in Redis and move it to the completed list.

    Writes status, completion time, result and comments onto the
    ``task:<id>`` hash, removes the ID from ``tasks:active`` and pushes it
    onto ``tasks:completed``.

    Args:
        r: Redis client.
        task_id: ID of the task that just ran.
        result_data: Executor result. Must contain ``success``; may contain
            ``comments`` and either ``result`` (simple tasks) or
            ``gpt_plan`` / ``execution_results`` (GPT-driven tasks).
    """
    succeeded = result_data["success"]

    if "result" in result_data:
        result_payload = result_data["result"]
    else:
        # GPT-driven executions report their outcome under gpt_plan /
        # execution_results and have no "result" key. Persist those so the
        # stored record is not just an empty string.
        result_payload = {
            key: result_data[key]
            for key in ("gpt_plan", "execution_results")
            if key in result_data
        } or ""

    r.hset(f"task:{task_id}", mapping={
        "status": "completed" if succeeded else "failed",
        "completed_at": str(int(time.time())),
        "result": json.dumps(result_payload),
        "comments": result_data.get("comments", "")
    })
    r.lrem("tasks:active", 0, task_id)
    r.lpush("tasks:completed", task_id)

    status = "DONE" if succeeded else "FAILED"
    print(f"[{status}] {task_id}")
    # "OK" is the executors' no-news marker; only echo real comments.
    if result_data.get("comments") and result_data["comments"] != "OK":
        print(f"[COMMENTS] {result_data['comments'][:200]}")
def format_time(timestamp):
    """Render a Unix timestamp as local ``HH:MM:SS`` for display.

    Args:
        timestamp: Seconds since the epoch, as a string or number. A falsy
            value or the string "0" means "not set".

    Returns:
        "-" when the timestamp is not set, the formatted local time when it
        converts cleanly, otherwise the input value unchanged.
    """
    if not timestamp or timestamp == "0":
        return "-"
    try:
        dt = datetime.fromtimestamp(int(timestamp))
        return dt.strftime("%H:%M:%S")
    except (TypeError, ValueError, OverflowError, OSError):
        # Only conversion problems fall back to the raw value; a bare
        # except would also swallow KeyboardInterrupt/SystemExit.
        return timestamp
# Recovery/Uninstall Script
# This script reverses all changes made by install.sh
#
# Every step asks for confirmation before changing anything; answering
# anything other than y/Y skips that step.

set -e

# Colors for output
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

echo "═══════════════════════════════════════════════════════════════"
echo " OpenClaw Jarvis-Like Memory System - Recovery Tool"
echo "═══════════════════════════════════════════════════════════════"
echo ""
echo -e "${YELLOW}⚠️ This will undo changes made by the installer${NC}"
echo ""

# Configuration (each value can be overridden from the environment)
WORKSPACE_DIR="${WORKSPACE_DIR:-$HOME/.openclaw/workspace}"
USER_ID="${USER_ID:-$(whoami)}"
REDIS_HOST="${REDIS_HOST:-10.0.0.36}"
REDIS_PORT="${REDIS_PORT:-6379}"
QDRANT_URL="${QDRANT_URL:-http://10.0.0.40:6333}"

# Track what we've done
REMOVED_ITEMS=()

# Function to confirm action.
# $1 = prompt text; succeeds only when the reply is exactly y or Y.
confirm() {
    # -r keeps backslashes in the reply literal.
    read -r -p "$1 (y/N): " response
    [[ "$response" =~ ^[Yy]$ ]]
}

# Function to log removal.
# $1 = description, remembered for the final summary and echoed immediately.
log_remove() {
    REMOVED_ITEMS+=("$1")
    echo -e "${GREEN} ✓ $1${NC}"
}

# Step 1: Remove cron jobs
echo -e "${YELLOW}[1/6] Removing cron jobs...${NC}"
if confirm "Remove memory system cron jobs (3:00 AM & 3:30 AM backups)"; then
    CRON_FILE=$(mktemp)
    # An empty or missing crontab makes `crontab -l` fail; treat it as empty.
    crontab -l 2>/dev/null > "$CRON_FILE" || true

    # Remove memory-related cron entries. grep exits 1 when nothing is left
    # to print, which must not trip `set -e`.
    grep -v -e "cron_backup.py" -e "sliding_backup.sh" -e "# Memory System -" \
        "$CRON_FILE" > "${CRON_FILE}.new" || true
    mv "${CRON_FILE}.new" "$CRON_FILE"

    # Clean up empty lines at end
    sed -i -e :a -e '/^\n*$/{$d;N;};/\n$/ba' "$CRON_FILE" 2>/dev/null || true

    crontab "$CRON_FILE"
    rm "$CRON_FILE"
    log_remove "Removed cron jobs"
else
    echo " ⏭️ Skipped"
fi

# Step 2: Clear Redis buffer
echo ""
echo -e "${YELLOW}[2/6] Clearing Redis buffer...${NC}"
if confirm "Clear Redis memory buffer (mem:$USER_ID)"; then
    # The here-document is unquoted on purpose: the shell substitutes
    # $REDIS_HOST, $REDIS_PORT and $USER_ID into the Python source.
    if python3 <<EOF 2>/dev/null; then
import redis
import sys
try:
    r = redis.Redis(host='$REDIS_HOST', port=$REDIS_PORT, decode_responses=True)
    key = "mem:$USER_ID"
    count = r.llen(key)
    if count > 0:
        r.delete(key)
        print(f" Deleted {count} items from Redis")
    else:
        print(" Buffer was already empty")
except Exception as e:
    print(f" Could not connect to Redis: {e}")
    # Report the failure to the shell so the step is not logged as done.
    sys.exit(1)
EOF
        log_remove "Cleared Redis buffer"
    else
        echo -e "${RED} ✗ Could not clear Redis${NC}"
    fi
else
    echo " ⏭️ Skipped"
fi

# Step 3: Delete Qdrant collections
echo ""
echo -e "${YELLOW}[3/6] Managing Qdrant collections...${NC}"
echo -e "${BLUE}Available collections:${NC}"
curl -s "$QDRANT_URL/collections" 2>/dev/null | python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    collections = data.get('result', {}).get('collections', [])
    for c in collections:
        name = c.get('name', '')
        if name in ['kimi_memories', 'kimi_kb', 'private_court_docs']:
            print(f' • {name}')
except Exception:
    print(' (Could not fetch collections)')
" || echo " (Qdrant not accessible)"

echo ""
echo -e "${RED}⚠️ WARNING: This permanently deletes ALL stored memories!${NC}"
if confirm "Delete Qdrant collections (kimi_memories, kimi_kb, private_court_docs)"; then
    for collection in kimi_memories kimi_kb private_court_docs; do
        # `|| true`: a failed curl (Qdrant down) would otherwise abort the
        # whole script under `set -e`; an empty response falls through to
        # the warning branch below.
        response=$(curl -s -X DELETE "$QDRANT_URL/collections/$collection" 2>/dev/null) || true
        if echo "$response" | grep -q '"status":"ok"' 2>/dev/null; then
            echo " ✓ Deleted: $collection"
        else
            echo " ⚠️ $collection (may not exist)"
        fi
    done
    log_remove "Deleted Qdrant collections"
else
    echo " ⏭️ Skipped (collections preserved)"
fi

# Step 4: Remove environment file
echo ""
echo -e "${YELLOW}[4/6] Removing environment configuration...${NC}"
ENV_FILE="$WORKSPACE_DIR/.memory_env"
if [ -f "$ENV_FILE" ]; then
    if confirm "Remove .memory_env file"; then
        rm "$ENV_FILE"
        log_remove "Removed .memory_env"
    else
        echo " ⏭️ Skipped"
    fi
else
    echo " ℹ️ .memory_env not found"
fi

# Step 5: Remove HEARTBEAT.md changes
echo ""
echo -e "${YELLOW}[5/6] Checking HEARTBEAT.md...${NC}"
HEARTBEAT_FILE="$WORKSPACE_DIR/HEARTBEAT.md"
if [ -f "$HEARTBEAT_FILE" ]; then
    if grep -q "Memory Buffer (Every Heartbeat)" "$HEARTBEAT_FILE" 2>/dev/null; then
        echo -e "${BLUE}HEARTBEAT.md contains memory automation.${NC}"
        echo "You may want to edit it manually to remove memory-related sections."
    fi
fi

if confirm "Remove auto-generated HEARTBEAT.md (if it matches our template)"; then
    # Only delete the file when it still carries the installer's marker line.
    if [ -f "$HEARTBEAT_FILE" ] && grep -q "Generated by OpenClaw Jarvis Memory installer" "$HEARTBEAT_FILE" 2>/dev/null; then
        rm "$HEARTBEAT_FILE"
        log_remove "Removed auto-generated HEARTBEAT.md"
    else
        echo " ℹ️ HEARTBEAT.md not removed (may contain custom changes)"
    fi
else
    echo " ⏭️ Skipped"
fi

# Step 6: Remove state file
echo ""
echo -e "${YELLOW}[6/6] Cleaning up state files...${NC}"
STATE_FILE="$WORKSPACE_DIR/.mem_last_turn"
if [ -f "$STATE_FILE" ]; then
    if confirm "Remove turn tracking state file"; then
        rm "$STATE_FILE"
        log_remove "Removed .mem_last_turn"
    else
        echo " ⏭️ Skipped"
    fi
else
    echo " ℹ️ No state file found"
fi

# Optional: Remove all skill files
if confirm "⚠️ FULL UNINSTALL: Remove ALL skill files and scripts"; then
    echo ""
    echo -e "${RED}Removing all skill files...${NC}"

    for skill in mem-redis qdrant-memory task-queue; do
        SKILL_DIR="$WORKSPACE_DIR/skills/$skill"
        if [ -d "$SKILL_DIR" ]; then
            rm -rf "$SKILL_DIR"
            log_remove "Removed skills/$skill"
        fi
    done

    # Also remove memory directory if empty
    if [ -d "$WORKSPACE_DIR/memory" ]; then
        if [ -z "$(ls -A "$WORKSPACE_DIR/memory" 2>/dev/null)" ]; then
            rmdir "$WORKSPACE_DIR/memory" 2>/dev/null && log_remove "Removed empty memory directory"
        else
            echo -e "${YELLOW} ℹ️ memory/ directory contains files - not removed${NC}"
        fi
    fi
else
    echo ""
    echo -e "${BLUE}Keeping skill files for manual review${NC}"
fi

# Summary
echo ""
echo "═══════════════════════════════════════════════════════════════"
echo -e "${GREEN} Recovery Complete!${NC}"
echo "═══════════════════════════════════════════════════════════════"
echo ""

if [ ${#REMOVED_ITEMS[@]} -eq 0 ]; then
    echo "No changes were made."
else
    echo "Items removed/reset:"
    for item in "${REMOVED_ITEMS[@]}"; do
        echo " ✓ $item"
    done
fi

echo ""
echo -e "${BLUE}Remaining cleanup (if full uninstall was not selected):${NC}"
echo " • Skill files remain in: $WORKSPACE_DIR/skills/"
echo " • Daily memory files remain in: $WORKSPACE_DIR/memory/"
echo " • Backups remain in: /root/.openclaw/workspace/memory/"
echo ""
echo "To reinstall later, run: ./install.sh"
echo ""
echo "🧹 Cleanup complete!"