Initial commit: Jarvis Memory system

2026-02-23 12:13:04 -06:00
commit e8854cd959
72 changed files with 14801 additions and 0 deletions

skills/mem-redis/SKILL.md

@@ -0,0 +1,42 @@
# Memory Buffer Skill
Redis-based short-term memory buffer for OpenClaw.
## What It Does
Accumulates conversation turns in real-time and flushes to Qdrant daily.
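Each turn is stored as one JSON object in a Redis list keyed `mem:<user_id>` (see `hb_append.py` below). A minimal sketch for peeking at the buffer, assuming the default host/port and user ID used throughout this skill:
```python
# Sketch: inspect the newest buffered turns (defaults match the scripts below)
import json

import redis

r = redis.Redis(host="127.0.0.1", port=6379, decode_responses=True)
for raw in r.lrange("mem:yourname", 0, 4):  # newest entries sit at the front
    turn = json.loads(raw)  # keys: turn, role, content, timestamp, user_id, session
    print(turn["turn"], turn["role"], turn["content"][:60])
```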
## Commands
```bash
# Manual save (all turns)
python3 scripts/save_mem.py --user-id yourname
# Retrieve from buffer
python3 scripts/mem_retrieve.py --limit 10
# Search Redis + Qdrant
python3 scripts/search_mem.py "your query"
```
## Heartbeat Integration
Add to HEARTBEAT.md:
```bash
python3 /path/to/skills/mem-redis/scripts/hb_append.py
```
## Cron
```bash
# Daily flush at 3:00 AM
0 3 * * * python3 /path/to/skills/mem-redis/scripts/cron_backup.py
```
## Files
- `hb_append.py` - Heartbeat: append new turns only
- `save_mem.py` - Manual: save all turns
- `cron_backup.py` - Daily: flush to Qdrant
- `mem_retrieve.py` - Read from Redis
- `search_mem.py` - Search Redis + Qdrant

skills/mem-redis/scripts/cron_backup.py

@@ -0,0 +1,204 @@
#!/usr/bin/env python3
"""
Daily Cron: Process Redis buffer → Qdrant → Clear Redis.
This script runs once daily (via cron) to move buffered conversation
turns from Redis to durable Qdrant storage. Only clears Redis after
successful Qdrant write.
Usage: python3 cron_backup.py [--user-id rob] [--dry-run]
"""
import os
import sys
import json
import redis
import argparse
from datetime import datetime, timezone
from pathlib import Path
# Add qdrant-memory to path (portable)
from pathlib import Path as _Path
WORKSPACE = _Path(os.getenv("OPENCLAW_WORKSPACE", str(_Path.home() / ".openclaw" / "workspace")))
sys.path.insert(0, str(WORKSPACE / "skills" / "qdrant-memory" / "scripts"))
try:
from auto_store import store_conversation_turn
QDRANT_AVAILABLE = True
except ImportError:
QDRANT_AVAILABLE = False
print("Warning: Qdrant storage not available, will simulate", file=sys.stderr)
# Config
REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
USER_ID = os.getenv("USER_ID", "yourname")
def get_redis_items(user_id):
"""Get all items from Redis list."""
try:
r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
key = f"mem:{user_id}"
# Get all items (0 to -1 = entire list)
items = r.lrange(key, 0, -1)
# Parse JSON
turns = []
for item in items:
try:
turn = json.loads(item)
turns.append(turn)
except json.JSONDecodeError:
continue
return turns, key
except Exception as e:
print(f"Error reading from Redis: {e}", file=sys.stderr)
return None, None
def store_to_qdrant(turns, user_id):
"""Store turns to Qdrant with file fallback."""
    if not QDRANT_AVAILABLE:
        print("Qdrant storage unavailable; would have stored:", file=sys.stderr)
        for turn in turns[:3]:
            print(f" - Turn {turn.get('turn', '?')}: {turn.get('role', '?')}", file=sys.stderr)
        if len(turns) > 3:
            print(f" ... and {len(turns) - 3} more", file=sys.stderr)
        # Report failure so main() falls back to the file backup and keeps Redis,
        # honoring the "only clear after a successful Qdrant write" contract.
        return False
# Ensure chronological order (older -> newer)
try:
turns_sorted = sorted(turns, key=lambda t: (t.get('timestamp', ''), t.get('turn', 0)))
except Exception:
turns_sorted = turns
user_turns = [t for t in turns_sorted if t.get('role') == 'user']
if not user_turns:
return True
success_count = 0
attempted = 0
for i, turn in enumerate(turns_sorted):
if turn.get('role') != 'user':
continue
attempted += 1
try:
# Pair with the next assistant message in chronological order (best effort)
ai_response = ""
j = i + 1
while j < len(turns_sorted):
if turns_sorted[j].get('role') == 'assistant':
ai_response = turns_sorted[j].get('content', '')
break
if turns_sorted[j].get('role') == 'user':
break
j += 1
result = store_conversation_turn(
user_message=turn.get('content', ''),
ai_response=ai_response,
user_id=user_id,
turn_number=turn.get('turn', i),
conversation_id=f"mem-buffer-{turn.get('timestamp', 'unknown')[:10]}"
)
# store_conversation_turn returns success/skipped; treat skipped as ok
if result.get('success') or result.get('skipped'):
success_count += 1
except Exception as e:
print(f"Error storing user turn {turn.get('turn', '?')}: {e}", file=sys.stderr)
# Only consider Qdrant storage successful if we stored/skipped ALL user turns.
return attempted > 0 and success_count == attempted
def store_to_file(turns, user_id):
"""Fallback: Store turns to JSONL file."""
workspace = Path(os.getenv("OPENCLAW_WORKSPACE", str(Path.home() / ".openclaw" / "workspace")))
backup_dir = workspace / "memory" / "redis-backups"
    backup_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = backup_dir / f"mem-backup-{user_id}-{timestamp}.jsonl"
try:
with open(filename, 'w') as f:
for turn in turns:
f.write(json.dumps(turn) + '\n')
print(f"✅ Backed up {len(turns)} turns to file: {filename}")
return True
except Exception as e:
print(f"❌ File backup failed: {e}", file=sys.stderr)
return False
def clear_redis(key):
"""Clear Redis list after successful backup."""
try:
r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
r.delete(key)
return True
except Exception as e:
print(f"Error clearing Redis: {e}", file=sys.stderr)
return False
def main():
parser = argparse.ArgumentParser(description="Backup Redis mem buffer to Qdrant")
parser.add_argument("--user-id", default=USER_ID, help="User ID")
parser.add_argument("--dry-run", action="store_true", help="Don't actually clear Redis")
args = parser.parse_args()
# Get items from Redis
turns, key = get_redis_items(args.user_id)
if turns is None:
print("❌ Failed to read from Redis")
sys.exit(1)
if not turns:
print(f"No items in Redis buffer (mem:{args.user_id})")
sys.exit(0)
print(f"Found {len(turns)} turns in Redis buffer")
# Try Qdrant first
qdrant_success = False
if not args.dry_run:
qdrant_success = store_to_qdrant(turns, args.user_id)
if qdrant_success:
print(f"✅ Stored Redis buffer to Qdrant (all user turns)")
else:
print("⚠️ Qdrant storage incomplete; will NOT clear Redis", file=sys.stderr)
else:
print("[DRY RUN] Would attempt Qdrant storage")
qdrant_success = True # Dry run pretends success
# If Qdrant failed/incomplete, try file backup (still do NOT clear Redis unless user chooses)
file_success = False
if not qdrant_success:
print("⚠️ Qdrant storage failed/incomplete, writing file backup (Redis preserved)...")
file_success = store_to_file(turns, args.user_id)
if not file_success:
print("❌ Both Qdrant and file backup failed - Redis buffer preserved")
sys.exit(1)
# Exit non-zero so monitoring can alert; keep Redis for re-try.
sys.exit(1)
# Clear Redis (only if not dry-run)
if args.dry_run:
print("[DRY RUN] Would clear Redis buffer")
sys.exit(0)
if clear_redis(key):
print(f"✅ Cleared Redis buffer (mem:{args.user_id})")
else:
print(f"⚠️ Backup succeeded but failed to clear Redis - may duplicate on next run")
sys.exit(1)
    # Reaching this point implies Qdrant storage succeeded (failures exit above).
    print(f"\n🎉 Successfully backed up {len(turns)} turns to Qdrant long-term memory")
if __name__ == "__main__":
main()

skills/mem-redis/scripts/cron_capture.py

@@ -0,0 +1,230 @@
#!/usr/bin/env python3
"""
Cron Capture: Append NEW session transcript messages to Redis (no LLM / no heartbeat).
Goal: minimize token spend by capturing context out-of-band.
- Tracks per-session file offsets (byte position) in a JSON state file.
- No-ops if the transcript file hasn't changed since last run.
- Stores user/assistant visible text to Redis (chronological order via RPUSH).
- Optionally stores model "thinking" separately (disabled by default) so it can be
queried only when explicitly needed.
Usage:
python3 cron_capture.py [--user-id rob] [--include-thinking]
Suggested cron (every 5 minutes):
*/5 * * * * cd ~/.openclaw/workspace && python3 skills/mem-redis/scripts/cron_capture.py --user-id $USER
Env:
OPENCLAW_WORKSPACE: override workspace path (default: ~/.openclaw/workspace)
OPENCLAW_SESSIONS_DIR: override sessions dir (default: ~/.openclaw/agents/main/sessions)
REDIS_HOST / REDIS_PORT / USER_ID
"""
import argparse
import json
import os
import sys
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
USER_ID = os.getenv("USER_ID", "yourname")
DEFAULT_WORKSPACE = Path(os.getenv("OPENCLAW_WORKSPACE", str(Path.home() / ".openclaw" / "workspace")))
DEFAULT_SESSIONS_DIR = Path(os.getenv("OPENCLAW_SESSIONS_DIR", str(Path.home() / ".openclaw" / "agents" / "main" / "sessions")))
STATE_FILE = DEFAULT_WORKSPACE / ".mem_capture_state.json"
@dataclass
class ParsedMessage:
role: str # user|assistant
text: str
thinking: Optional[str]
timestamp: str
session_id: str
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def find_latest_transcript(sessions_dir: Path) -> Optional[Path]:
files = list(sessions_dir.glob("*.jsonl"))
if not files:
return None
return max(files, key=lambda p: p.stat().st_mtime)
def load_state() -> Dict[str, Any]:
if not STATE_FILE.exists():
return {}
try:
return json.loads(STATE_FILE.read_text())
except Exception:
return {}
def save_state(state: Dict[str, Any]) -> None:
try:
STATE_FILE.write_text(json.dumps(state, indent=2, sort_keys=True))
except Exception as e:
print(f"[cron_capture] Warning: could not write state: {e}", file=sys.stderr)
def extract_text_and_thinking(content: Any) -> Tuple[str, Optional[str]]:
"""Extract visible text and optional thinking from OpenClaw message content."""
if isinstance(content, str):
return content, None
text_parts: List[str] = []
thinking_parts: List[str] = []
if isinstance(content, list):
for item in content:
if not isinstance(item, dict):
continue
if "text" in item and isinstance(item["text"], str):
text_parts.append(item["text"])
if "thinking" in item and isinstance(item["thinking"], str):
thinking_parts.append(item["thinking"])
text = "".join(text_parts).strip()
thinking = "\n".join(thinking_parts).strip() if thinking_parts else None
return text, thinking
def parse_new_messages(transcript_path: Path, start_offset: int, include_thinking: bool) -> Tuple[List[ParsedMessage], int]:
"""Parse messages from transcript_path starting at byte offset."""
session_id = transcript_path.stem
msgs: List[ParsedMessage] = []
with transcript_path.open("rb") as f:
f.seek(start_offset)
while True:
line = f.readline()
if not line:
break
try:
entry = json.loads(line.decode("utf-8", errors="replace").strip())
except Exception:
continue
if entry.get("type") != "message" or "message" not in entry:
continue
msg = entry.get("message") or {}
role = msg.get("role")
            if role not in ("user", "assistant"):
                # tool results and any other roles never reach the buffer
                continue
text, thinking = extract_text_and_thinking(msg.get("content"))
if not text and not (include_thinking and thinking):
continue
msgs.append(
ParsedMessage(
role=role,
text=text[:8000],
thinking=(thinking[:16000] if (include_thinking and thinking) else None),
timestamp=entry.get("timestamp") or _now_iso(),
session_id=session_id,
)
)
end_offset = f.tell()
return msgs, end_offset
def append_to_redis(user_id: str, messages: List[ParsedMessage]) -> int:
if not messages:
return 0
import redis # lazy import so --dry-run works without deps
r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
key = f"mem:{user_id}"
thinking_key = f"mem_thinking:{user_id}"
# RPUSH keeps chronological order.
for m in messages:
payload: Dict[str, Any] = {
"role": m.role,
"content": m.text,
"timestamp": m.timestamp,
"user_id": user_id,
"session": m.session_id,
}
r.rpush(key, json.dumps(payload))
if m.thinking:
t_payload = {
"role": m.role,
"thinking": m.thinking,
"timestamp": m.timestamp,
"user_id": user_id,
"session": m.session_id,
}
r.rpush(thinking_key, json.dumps(t_payload))
return len(messages)
def main() -> None:
parser = argparse.ArgumentParser(description="Cron capture: append new transcript messages to Redis")
parser.add_argument("--user-id", default=USER_ID)
parser.add_argument("--include-thinking", action="store_true", help="Store thinking into mem_thinking:<user>")
parser.add_argument("--sessions-dir", default=str(DEFAULT_SESSIONS_DIR))
parser.add_argument("--dry-run", action="store_true", help="Parse + update state, but do not write to Redis")
args = parser.parse_args()
sessions_dir = Path(args.sessions_dir)
transcript = find_latest_transcript(sessions_dir)
if not transcript:
print("[cron_capture] No session transcripts found")
return
st = load_state()
key = str(transcript)
info = st.get(key, {})
last_offset = int(info.get("offset", 0))
    last_size = int(info.get("size", 0))
    cur_size = transcript.stat().st_size
    if cur_size < last_offset:
        # Transcript shrank (rotated or replaced): reset and re-read from the start.
        last_offset = 0
    if cur_size == last_size and last_offset > 0:
        print("[cron_capture] No changes")
        return
messages, end_offset = parse_new_messages(transcript, last_offset, include_thinking=args.include_thinking)
if not messages:
# Still update size/offset so we don't re-read noise lines.
st[key] = {"offset": end_offset, "size": cur_size, "updated_at": _now_iso()}
save_state(st)
print("[cron_capture] No new user/assistant messages")
return
if args.dry_run:
st[key] = {"offset": end_offset, "size": cur_size, "updated_at": _now_iso()}
save_state(st)
print(f"[cron_capture] DRY RUN: would append {len(messages)} messages to Redis mem:{args.user_id}")
return
count = append_to_redis(args.user_id, messages)
st[key] = {"offset": end_offset, "size": cur_size, "updated_at": _now_iso()}
save_state(st)
print(f"[cron_capture] Appended {count} messages to Redis mem:{args.user_id}")
if __name__ == "__main__":
main()

skills/mem-redis/scripts/hb_append.py

@@ -0,0 +1,161 @@
#!/usr/bin/env python3
"""
Heartbeat: Append new conversation turns to Redis short-term buffer.
This script runs during heartbeat to capture recent conversation context
before it gets compacted away. Stores in Redis until daily cron backs up to Qdrant.
Usage: python3 hb_append.py [--user-id rob]
"""
import os
import sys
import json
import redis
import argparse
from datetime import datetime, timezone
from pathlib import Path
# Config
REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
USER_ID = os.getenv("USER_ID", "yourname")
# Paths (portable)
WORKSPACE = Path(os.getenv("OPENCLAW_WORKSPACE", str(Path.home() / ".openclaw" / "workspace")))
MEMORY_DIR = WORKSPACE / "memory"
SESSIONS_DIR = Path(os.getenv("OPENCLAW_SESSIONS_DIR", str(Path.home() / ".openclaw" / "agents" / "main" / "sessions")))
STATE_FILE = WORKSPACE / ".mem_last_turn"
def get_session_transcript():
"""Find the current session JSONL file."""
files = list(SESSIONS_DIR.glob("*.jsonl"))
if not files:
return None
# Get most recently modified
return max(files, key=lambda p: p.stat().st_mtime)
def parse_turns_since(last_turn_num):
"""Extract conversation turns since last processed."""
transcript_file = get_session_transcript()
if not transcript_file or not transcript_file.exists():
return []
    turns = []
    # Absolute count of user/assistant messages in this session; messages at or
    # below last_turn_num were captured on a previous run and are skipped below.
    turn_counter = 0
try:
with open(transcript_file, 'r') as f:
for line in f:
line = line.strip()
if not line:
continue
try:
entry = json.loads(line)
# OpenClaw format: {"type": "message", "message": {"role": "...", ...}}
if entry.get('type') == 'message' and 'message' in entry:
msg = entry['message']
role = msg.get('role')
# Skip tool results for memory storage
if role == 'toolResult':
continue
# Get content from message content array or string
content = ""
if isinstance(msg.get('content'), list):
# Extract text from content array
for item in msg['content']:
if isinstance(item, dict):
if 'text' in item:
content += item['text']
# Intentionally do NOT store model thinking in the main buffer.
# If you need thinking, use cron_capture.py --include-thinking to store it
# separately under mem_thinking:<user_id>.
elif 'thinking' in item:
pass
elif isinstance(msg.get('content'), str):
content = msg['content']
                        if content and role in ('user', 'assistant'):
                            turn_counter += 1
                            if turn_counter <= last_turn_num:
                                continue  # already captured on a previous heartbeat
turns.append({
'turn': turn_counter,
'role': role,
'content': content[:2000],
'timestamp': entry.get('timestamp', datetime.now(timezone.utc).isoformat()),
'user_id': USER_ID,
'session': str(transcript_file.name).replace('.jsonl', '')
})
except json.JSONDecodeError:
continue
except Exception as e:
print(f"Error reading transcript: {e}", file=sys.stderr)
return []
return turns
def get_last_turn():
"""Get last turn number from state file."""
if STATE_FILE.exists():
try:
with open(STATE_FILE) as f:
return int(f.read().strip())
        except (ValueError, OSError):
            pass
return 0
def save_last_turn(turn_num):
"""Save last turn number to state file."""
try:
with open(STATE_FILE, 'w') as f:
f.write(str(turn_num))
except Exception as e:
print(f"Warning: Could not save state: {e}", file=sys.stderr)
def append_to_redis(turns, user_id):
"""Append turns to Redis list."""
if not turns:
return 0
try:
r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
key = f"mem:{user_id}"
# Add all turns to list (LPUSH puts newest at front)
for turn in turns:
r.lpush(key, json.dumps(turn))
return len(turns)
except Exception as e:
print(f"Error writing to Redis: {e}", file=sys.stderr)
return 0
def main():
parser = argparse.ArgumentParser(description="Append new turns to Redis mem buffer")
parser.add_argument("--user-id", default=USER_ID, help="User ID for key naming")
args = parser.parse_args()
# Get last processed turn
last_turn = get_last_turn()
# Get new turns
new_turns = parse_turns_since(last_turn)
if not new_turns:
print(f"No new turns since turn {last_turn}")
sys.exit(0)
# Append to Redis
count = append_to_redis(new_turns, args.user_id)
if count > 0:
# Update last turn tracker
max_turn = max(t['turn'] for t in new_turns)
save_last_turn(max_turn)
print(f"✅ Appended {count} turns to Redis (mem:{args.user_id})")
else:
print("❌ Failed to append to Redis")
sys.exit(1)
if __name__ == "__main__":
main()

skills/mem-redis/scripts/mem_retrieve.py

@@ -0,0 +1,87 @@
#!/usr/bin/env python3
"""
Manual Retrieval: Get recent conversation turns from Redis buffer.
Use this when context has been compacted or you need to recall recent details.
Usage: python3 mem_retrieve.py [--limit 20] [--user-id rob]
"""
import os
import sys
import json
import redis
import argparse
from datetime import datetime, timezone
# Config
REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
USER_ID = os.getenv("USER_ID", "yourname")
def get_recent_turns(user_id, limit=20):
"""Get recent turns from Redis buffer."""
try:
r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
key = f"mem:{user_id}"
# Get most recent N items (0 to limit-1)
items = r.lrange(key, 0, limit - 1)
# Parse and reverse (so oldest first)
turns = []
for item in items:
try:
turn = json.loads(item)
turns.append(turn)
except json.JSONDecodeError:
continue
# Reverse to chronological order
turns.reverse()
return turns
except Exception as e:
print(f"Error reading from Redis: {e}", file=sys.stderr)
return []
def format_turn(turn):
"""Format a turn for display."""
role = turn.get('role', 'unknown')
content = turn.get('content', '')
turn_num = turn.get('turn', '?')
# Truncate long content
if len(content) > 500:
content = content[:500] + "..."
role_icon = "👤" if role == 'user' else "🤖"
return f"{role_icon} Turn {turn_num} ({role}):\n{content}\n"
def main():
parser = argparse.ArgumentParser(description="Retrieve recent turns from mem buffer")
parser.add_argument("--user-id", default=USER_ID, help="User ID")
parser.add_argument("--limit", type=int, default=20, help="Number of turns to retrieve")
args = parser.parse_args()
# Get turns
turns = get_recent_turns(args.user_id, args.limit)
if not turns:
print(f"No recent turns in memory buffer (mem:{args.user_id})")
print("\nPossible reasons:")
print(" - Heartbeat hasn't run yet")
print(" - Cron already backed up and cleared Redis")
print(" - Redis connection issue")
sys.exit(0)
# Display
print(f"=== Recent {len(turns)} Turn(s) from Memory Buffer ===\n")
for turn in turns:
print(format_turn(turn))
print(f"\nBuffer key: mem:{args.user_id}")
print("Note: These turns are also in Redis until daily cron backs them up to Qdrant.")
if __name__ == "__main__":
main()

skills/mem-redis/scripts/save_mem.py

@@ -0,0 +1,149 @@
#!/usr/bin/env python3
"""
Save all conversation context to Redis (not just new turns).
Unlike hb_append.py which only saves NEW turns since last run,
this script saves ALL context from the session (or resets and saves fresh).
Usage: python3 save_mem.py [--user-id rob] [--reset]
"""
import os
import sys
import json
import redis
import argparse
from datetime import datetime, timezone
from pathlib import Path
# Config
REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
USER_ID = os.getenv("USER_ID", "yourname")
# Paths (portable)
WORKSPACE = Path(os.getenv("OPENCLAW_WORKSPACE", str(Path.home() / ".openclaw" / "workspace")))
SESSIONS_DIR = Path(os.getenv("OPENCLAW_SESSIONS_DIR", str(Path.home() / ".openclaw" / "agents" / "main" / "sessions")))
STATE_FILE = WORKSPACE / ".mem_last_turn"
def get_session_transcript():
"""Find the current session JSONL file."""
files = list(SESSIONS_DIR.glob("*.jsonl"))
if not files:
return None
return max(files, key=lambda p: p.stat().st_mtime)
def parse_all_turns():
"""Extract ALL conversation turns from current session."""
transcript_file = get_session_transcript()
if not transcript_file or not transcript_file.exists():
return []
turns = []
turn_counter = 0
try:
with open(transcript_file, 'r') as f:
for line in f:
line = line.strip()
if not line:
continue
try:
entry = json.loads(line)
if entry.get('type') == 'message' and 'message' in entry:
msg = entry['message']
role = msg.get('role')
if role == 'toolResult':
continue
content = ""
if isinstance(msg.get('content'), list):
for item in msg['content']:
if isinstance(item, dict):
if 'text' in item:
content += item['text']
# Do not mix thinking into the main content buffer.
elif 'thinking' in item:
pass
elif isinstance(msg.get('content'), str):
content = msg['content']
if content and role in ('user', 'assistant'):
turn_counter += 1
turns.append({
'turn': turn_counter,
'role': role,
'content': content[:2000],
'timestamp': entry.get('timestamp', datetime.now(timezone.utc).isoformat()),
'user_id': USER_ID,
'session': str(transcript_file.name).replace('.jsonl', '')
})
except json.JSONDecodeError:
continue
except Exception as e:
print(f"Error reading transcript: {e}", file=sys.stderr)
return []
return turns
def save_to_redis(turns, user_id, reset=False):
"""Save turns to Redis. If reset, clear existing first."""
if not turns:
return 0
try:
r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
key = f"mem:{user_id}"
# Clear existing if reset
if reset:
r.delete(key)
print(f"Cleared existing Redis buffer ({key})")
        # LPUSH each chronological turn so the newest ends up at the front,
        # matching hb_append.py and the ordering mem_retrieve.py expects.
        for turn in turns:
            r.lpush(key, json.dumps(turn))
return len(turns)
except Exception as e:
print(f"Error writing to Redis: {e}", file=sys.stderr)
return 0
def update_state(last_turn_num):
"""Update last turn tracker."""
try:
with open(STATE_FILE, 'w') as f:
f.write(str(last_turn_num))
except Exception as e:
print(f"Warning: Could not save state: {e}", file=sys.stderr)
def main():
parser = argparse.ArgumentParser(description="Save all conversation context to Redis")
parser.add_argument("--user-id", default=USER_ID, help="User ID for key naming")
parser.add_argument("--reset", action="store_true", help="Clear existing buffer first")
args = parser.parse_args()
# Get all turns
turns = parse_all_turns()
if not turns:
print("No conversation turns found in session")
sys.exit(0)
# Save to Redis
count = save_to_redis(turns, args.user_id, reset=args.reset)
if count > 0:
# Update state to track last turn
max_turn = max(t['turn'] for t in turns)
update_state(max_turn)
action = "Reset and saved" if args.reset else "Saved"
print(f"{action} {count} turns to Redis (mem:{args.user_id})")
print(f" State updated to turn {max_turn}")
else:
print("❌ Failed to save to Redis")
sys.exit(1)
if __name__ == "__main__":
main()

skills/mem-redis/scripts/search_mem.py

@@ -0,0 +1,242 @@
#!/usr/bin/env python3
"""
Search memory: First Redis (exact), then Qdrant (semantic).
Usage: python3 search_mem.py "your search query" [--limit 10] [--user-id rob]
Searches:
1. Redis (mem:{user_id}) - exact text match in recent buffer
2. Qdrant (kimi_memories) - semantic similarity search
"""
import os
import sys
import json
import redis
import argparse
from pathlib import Path
from datetime import datetime
# Config
REDIS_HOST = os.getenv("REDIS_HOST", "10.0.0.36")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
USER_ID = os.getenv("USER_ID", "yourname")
QDRANT_URL = os.getenv("QDRANT_URL", "http://10.0.0.40:6333")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://10.0.0.10:11434/v1")
def search_redis(query, user_id, limit=20):
"""Search Redis buffer for exact text matches."""
try:
r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
key = f"mem:{user_id}"
# Get all items from list
items = r.lrange(key, 0, -1)
if not items:
return []
query_lower = query.lower()
matches = []
for item in items:
try:
turn = json.loads(item)
content = turn.get('content', '').lower()
if query_lower in content:
matches.append({
'source': 'redis',
'turn': turn.get('turn'),
'role': turn.get('role'),
'content': turn.get('content'),
'timestamp': turn.get('timestamp'),
'score': 'exact'
})
except json.JSONDecodeError:
continue
# Sort by turn number descending (newest first)
matches.sort(key=lambda x: x.get('turn', 0), reverse=True)
return matches[:limit]
except Exception as e:
print(f"Redis search error: {e}", file=sys.stderr)
return []
def get_embedding(text):
"""Get embedding from Ollama."""
import urllib.request
payload = json.dumps({
"model": "snowflake-arctic-embed2",
"input": text
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=payload,
headers={"Content-Type": "application/json"},
method="POST"
)
try:
with urllib.request.urlopen(req, timeout=30) as resp:
result = json.loads(resp.read().decode())
return result.get('data', [{}])[0].get('embedding')
except Exception as e:
print(f"Embedding error: {e}", file=sys.stderr)
return None
def search_qdrant(query, user_id, limit=10):
"""Search Qdrant for semantic similarity."""
import urllib.request
embedding = get_embedding(query)
if not embedding:
return []
payload = json.dumps({
"vector": embedding,
"limit": limit,
"with_payload": True,
"filter": {
"must": [
{"key": "user_id", "match": {"value": user_id}}
]
}
}).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/kimi_memories/points/search",
data=payload,
headers={"Content-Type": "application/json"},
method="POST"
)
try:
with urllib.request.urlopen(req, timeout=30) as resp:
result = json.loads(resp.read().decode())
points = result.get('result', [])
matches = []
for point in points:
payload = point.get('payload', {})
matches.append({
'source': 'qdrant',
'score': round(point.get('score', 0), 3),
'turn': payload.get('turn_number'),
'role': payload.get('role'),
'content': payload.get('user_message') or payload.get('content', ''),
'ai_response': payload.get('ai_response', ''),
'timestamp': payload.get('timestamp'),
'conversation_id': payload.get('conversation_id')
})
return matches
except Exception as e:
print(f"Qdrant search error: {e}", file=sys.stderr)
return []
def format_result(result, index):
"""Format a single search result."""
source = result.get('source', 'unknown')
role = result.get('role', 'unknown')
turn = result.get('turn', '?')
score = result.get('score', '?')
content = result.get('content', '')
if len(content) > 200:
content = content[:200] + "..."
# Role emoji
role_emoji = "👤" if role == "user" else "🤖"
# Source indicator
source_icon = "🔴" if source == "redis" else "🔵"
lines = [
f"{source_icon} [{index}] Turn {turn} ({role}):",
f" {role_emoji} {content}"
]
if source == "qdrant" and result.get('ai_response'):
ai_resp = result['ai_response'][:150]
if len(result['ai_response']) > 150:
ai_resp += "..."
lines.append(f" 💬 AI: {ai_resp}")
if score != 'exact':
lines.append(f" 📊 Score: {score}")
else:
lines.append(f" 📊 Match: exact (Redis)")
return "\n".join(lines)
def main():
parser = argparse.ArgumentParser(description="Search memory: Redis first, then Qdrant")
parser.add_argument("query", help="Search query")
parser.add_argument("--limit", type=int, default=10, help="Results per source (default: 10)")
parser.add_argument("--user-id", default=USER_ID, help="User ID")
parser.add_argument("--redis-only", action="store_true", help="Only search Redis")
parser.add_argument("--qdrant-only", action="store_true", help="Only search Qdrant")
args = parser.parse_args()
print(f"🔍 Searching for: \"{args.query}\"\n")
all_results = []
# Search Redis first (unless qdrant-only)
if not args.qdrant_only:
print("📍 Searching Redis (exact match)...")
redis_results = search_redis(args.query, args.user_id, limit=args.limit)
if redis_results:
print(f"✅ Found {len(redis_results)} matches in Redis\n")
all_results.extend(redis_results)
else:
print("❌ No exact matches in Redis\n")
# Search Qdrant (unless redis-only)
if not args.redis_only:
print("🧠 Searching Qdrant (semantic similarity)...")
qdrant_results = search_qdrant(args.query, args.user_id, limit=args.limit)
if qdrant_results:
print(f"✅ Found {len(qdrant_results)} matches in Qdrant\n")
all_results.extend(qdrant_results)
else:
print("❌ No semantic matches in Qdrant\n")
# Display results
if not all_results:
print("No results found in either Redis or Qdrant.")
sys.exit(0)
print(f"=== Search Results ({len(all_results)} total) ===\n")
# Sort: Redis first (chronological), then Qdrant (by score)
redis_sorted = [r for r in all_results if r['source'] == 'redis']
qdrant_sorted = sorted(
[r for r in all_results if r['source'] == 'qdrant'],
key=lambda x: x.get('score', 0),
reverse=True
)
# Display Redis results first
if redis_sorted:
print("🔴 FROM REDIS (Recent Buffer):\n")
for i, result in enumerate(redis_sorted, 1):
print(format_result(result, i))
print()
# Then Qdrant results
if qdrant_sorted:
print("🔵 FROM QDRANT (Long-term Memory):\n")
for i, result in enumerate(qdrant_sorted, len(redis_sorted) + 1):
print(format_result(result, i))
print()
print(f"=== {len(all_results)} results ===")
if redis_sorted:
print(f" 🔴 Redis: {len(redis_sorted)} (exact, recent)")
if qdrant_sorted:
print(f" 🔵 Qdrant: {len(qdrant_sorted)} (semantic, long-term)")
if __name__ == "__main__":
main()


@@ -0,0 +1,137 @@
# Session Harvest Instructions
## What is Session Harvesting?
Session harvesting extracts conversation turns from OpenClaw session JSONL files and stores them in Qdrant long-term memory with proper embeddings and user_id linking.
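The harvesters consume the same transcript shape the mem-redis scripts parse: one JSON object per line, with visible text nested in a `content` array. A minimal sketch of that extraction, mirroring the parsers elsewhere in this commit:
```python
# Sketch: yield (role, text) pairs from one OpenClaw session JSONL file
import json
from pathlib import Path

def iter_turns(path: Path):
    for line in path.read_text().splitlines():
        try:
            entry = json.loads(line)
        except json.JSONDecodeError:
            continue
        if entry.get("type") != "message":
            continue
        msg = entry.get("message") or {}
        if msg.get("role") not in ("user", "assistant"):
            continue
        content = msg.get("content")
        if isinstance(content, list):  # visible text lives in {"text": ...} items
            content = "".join(i.get("text", "") for i in content if isinstance(i, dict))
        if content:
            yield msg["role"], content
```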
## When to Use
- **After setting up a new memory system** — harvest existing sessions
- **After discovering missed backups** — recover data from session files
- **Periodically** — if cron jobs missed any data
## Scripts
| Script | Purpose | Notes |
|--------|---------|-------|
| `harvest_sessions.py` | Harvest all sessions (auto-sorted by mtime) | Memory-bound; may time out |
| `harvest_newest.py` | Harvest specific sessions by name | Recommended for batch control |
## Location
```
/root/.openclaw/workspace/skills/qdrant-memory/scripts/
├── harvest_sessions.py # Auto-harvest (use --limit to control)
└── harvest_newest.py # Manual batch (specify session names)
```
## Usage
### Method 1: Auto-Harvest with Limit
```bash
# Harvest oldest 10 sessions (default sort)
python3 harvest_sessions.py --user-id rob --limit 10
# Dry run to see what would be stored
python3 harvest_sessions.py --user-id rob --dry-run --limit 5
```
### Method 2: Batch by Session Name (Recommended)
```bash
# Harvest specific sessions (newest first recommended)
python3 harvest_newest.py --user-id rob \
session-uuid-1.jsonl \
session-uuid-2.jsonl \
session-uuid-3.jsonl
```
### Finding Newest Sessions
```bash
# List 20 newest session files
ls -t /root/.openclaw/agents/main/sessions/*.jsonl | head -20
# Get just filenames for copy-paste
ls -t /root/.openclaw/agents/main/sessions/*.jsonl | head -20 | xargs -I{} basename {}
```
## How It Works
1. **Parse** — Reads JSONL session file, extracts user/AI turns
2. **Pair** — Matches user message with next AI response
3. **Embed** — Generates 3 embeddings (user, AI, summary) via Ollama
4. **Deduplicate** — Checks content_hash before storing
5. **Store** — Upserts to Qdrant with user_id, conversation_id, turn_number
## Deduplication
- Uses an MD5 hash of `user_message::ai_response` (sketched after this list)
- Checks Qdrant for existing `user_id + content_hash`
- Skips if already stored (returns "duplicate")
- Safe to run multiple times on same sessions
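A minimal sketch of that hash scheme (the helper name is illustrative):
```python
import hashlib

def content_hash(user_message: str, ai_response: str) -> str:
    # MD5 over the paired turn; identical pairs hash identically across runs
    return hashlib.md5(f"{user_message}::{ai_response}".encode()).hexdigest()
```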
## Output Format
```
[1] session-uuid.jsonl
Stored: 10, Skipped: 6
Total: 44 stored, 6 skipped
```
- **Stored** = New memories added to Qdrant
- **Skipped** = Duplicates (already in Qdrant)
## Troubleshooting
### Timeout / SIGKILL
The embedding process is CPU-intensive. If killed:
```bash
# Use smaller batches
python3 harvest_newest.py --user-id rob session1.jsonl session2.jsonl
```
### Check Qdrant Status
```bash
curl -s http://10.0.0.40:6333/collections/kimi_memories | \
python3 -c "import sys,json; d=json.load(sys.stdin); print(d['result']['points_count'])"
```
### Check Session Content
```bash
# Count turns in a session
python3 -c "
import json
from pathlib import Path
f = Path('/root/.openclaw/agents/main/sessions/YOUR-SESSION.jsonl')
count = sum(1 for line in open(f) if 'user' in line or 'assistant' in line)
print(f'~{count} messages')
"
```
## Memory Architecture
```
Session JSONL (raw)
  │
  ▼
harvest_*.py
  ├──► Embeddings (Ollama snowflake-arctic-embed2)
  ▼
Qdrant kimi_memories
  └──► Searchable via user_id: "rob"
```
---
**Created:** February 17, 2026
**Author:** Kimi (audit session)

skills/qdrant-memory/SKILL.md

@@ -0,0 +1,53 @@
# Qdrant Memory Skill
Vector database storage for long-term semantic memory.
## What It Does
Stores conversations with embeddings for semantic search.
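Retrieval follows the same flow as `search_mem.py` in this commit: embed the query through Ollama's OpenAI-compatible endpoint, then POST the vector to Qdrant's search API. A condensed sketch, assuming the same hosts and model as the other scripts here:
```python
# Sketch: semantic query against kimi_memories (endpoints/model as in search_mem.py)
import json
import urllib.request

def post_json(url: str, payload: dict) -> dict:
    req = urllib.request.Request(url, data=json.dumps(payload).encode(),
                                 headers={"Content-Type": "application/json"})
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read().decode())

emb = post_json("http://10.0.0.10:11434/v1/embeddings",
                {"model": "snowflake-arctic-embed2", "input": "your query"})["data"][0]["embedding"]
hits = post_json("http://10.0.0.40:6333/collections/kimi_memories/points/search",
                 {"vector": emb, "limit": 5, "with_payload": True})["result"]
for hit in hits:
    print(round(hit["score"], 3), hit["payload"].get("user_message", "")[:80])
```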
## Commands
```bash
# Initialize collections
python3 scripts/init_kimi_memories.py
python3 scripts/init_kimi_kb.py
# Store immediately
python3 scripts/auto_store.py
# Search memories
python3 scripts/search_memories.py "your query"
# Harvest old sessions
python3 scripts/harvest_sessions.py --limit 10
```
## Heartbeat Integration
Add to HEARTBEAT.md:
```bash
python3 /path/to/skills/qdrant-memory/scripts/daily_conversation_backup.py
```
## Cron
```bash
# Daily backup at 3:30 AM
30 3 * * * /path/to/skills/qdrant-memory/scripts/sliding_backup.sh
```
## Collections
- `kimi_memories` - Conversations
- `kimi_kb` - Knowledge base
- `private_court_docs` - Legal docs
## Files
- `auto_store.py` - Store with embeddings
- `search_memories.py` - Semantic search
- `init_*.py` - Collection initialization
- `harvest_*.py` - Session harvesting
- `daily_conversation_backup.py` - Daily cron
- `sliding_backup.sh` - File backup


@@ -0,0 +1,273 @@
#!/usr/bin/env python3
"""
Shared Activity Log for Kimi and Max
Prevents duplicate work by logging actions to Qdrant
"""
import argparse
import hashlib
import json
import sys
import uuid
from datetime import datetime, timezone
from typing import Optional
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "activity_log"
VECTOR_SIZE = 768 # nomic-embed-text
# Embedding function (simple keyword-based for now, or use nomic)
def simple_embed(text: str) -> list[float]:
"""Simple hash-based embedding for semantic similarity"""
# In production, use nomic-embed-text via API
# For now, use a simple approach that groups similar texts
words = text.lower().split()
vector = [0.0] * VECTOR_SIZE
for i, word in enumerate(words[:100]): # Limit to first 100 words
h = hash(word) % VECTOR_SIZE
vector[h] += 1.0
# Normalize
norm = sum(x*x for x in vector) ** 0.5
if norm > 0:
vector = [x/norm for x in vector]
return vector
def init_collection(client: QdrantClient):
"""Create activity_log collection if not exists"""
collections = [c.name for c in client.get_collections().collections]
if COLLECTION_NAME not in collections:
client.create_collection(
collection_name=COLLECTION_NAME,
vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
)
print(f"Created collection: {COLLECTION_NAME}")
def log_activity(
agent: str,
action_type: str,
description: str,
affected_files: Optional[list] = None,
status: str = "completed",
metadata: Optional[dict] = None
) -> str:
"""
Log an activity to the shared activity log
Args:
agent: "Kimi" or "Max"
action_type: e.g., "cron_created", "file_edited", "config_changed", "task_completed"
description: Human-readable description of what was done
affected_files: List of file paths or systems affected
status: "completed", "in_progress", "blocked", "failed"
metadata: Additional key-value pairs
Returns:
activity_id (UUID)
"""
client = QdrantClient(url=QDRANT_URL)
init_collection(client)
activity_id = str(uuid.uuid4())
timestamp = datetime.now(timezone.utc).isoformat()
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
# Build searchable text
searchable_text = f"{agent} {action_type} {description} {' '.join(affected_files or [])}"
vector = simple_embed(searchable_text)
payload = {
"agent": agent,
"action_type": action_type,
"description": description,
"affected_files": affected_files or [],
"status": status,
"timestamp": timestamp,
"date": date_str,
"activity_id": activity_id,
"metadata": metadata or {}
}
client.upsert(
collection_name=COLLECTION_NAME,
points=[PointStruct(id=activity_id, vector=vector, payload=payload)]
)
return activity_id
def get_recent_activities(
agent: Optional[str] = None,
action_type: Optional[str] = None,
hours: int = 24,
limit: int = 50
) -> list[dict]:
"""
Query recent activities
Args:
agent: Filter by agent name ("Kimi" or "Max") or None for both
action_type: Filter by action type or None for all
hours: Look back this many hours
limit: Max results
"""
client = QdrantClient(url=QDRANT_URL)
# Get all points and filter client-side (Qdrant payload filtering can be tricky)
# For small collections, this is fine. For large ones, use scroll with filter
all_points = client.scroll(
collection_name=COLLECTION_NAME,
limit=1000 # Get recent batch
)[0]
results = []
cutoff = datetime.now(timezone.utc).timestamp() - (hours * 3600)
for point in all_points:
payload = point.payload
ts = payload.get("timestamp", "")
try:
point_time = datetime.fromisoformat(ts.replace("Z", "+00:00")).timestamp()
        except (ValueError, TypeError):
            continue
if point_time < cutoff:
continue
if agent and payload.get("agent") != agent:
continue
if action_type and payload.get("action_type") != action_type:
continue
results.append(payload)
# Sort by timestamp descending
results.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
return results[:limit]
def search_activities(query: str, limit: int = 10) -> list[dict]:
"""Semantic search across activity descriptions"""
client = QdrantClient(url=QDRANT_URL)
vector = simple_embed(query)
results = client.search(
collection_name=COLLECTION_NAME,
query_vector=vector,
limit=limit
)
return [r.payload for r in results]
def check_for_duplicates(action_type: str, description_keywords: str, hours: int = 6) -> bool:
"""
Check if similar work was recently done
Returns True if duplicate detected, False otherwise
"""
recent = get_recent_activities(action_type=action_type, hours=hours)
keywords = description_keywords.lower().split()
for activity in recent:
desc = activity.get("description", "").lower()
if all(kw in desc for kw in keywords):
print(f"⚠️ Duplicate detected: {activity['agent']} did similar work {activity['timestamp']}")
print(f" Description: {activity['description']}")
return True
return False
def main():
parser = argparse.ArgumentParser(description="Shared Activity Log for Kimi/Max")
subparsers = parser.add_subparsers(dest="command", help="Command to run")
# Log command
log_parser = subparsers.add_parser("log", help="Log an activity")
log_parser.add_argument("--agent", required=True, choices=["Kimi", "Max"], help="Which agent performed the action")
log_parser.add_argument("--action", required=True, help="Action type (e.g., cron_created, file_edited)")
log_parser.add_argument("--description", required=True, help="What was done")
log_parser.add_argument("--files", nargs="*", help="Files/systems affected")
log_parser.add_argument("--status", default="completed", choices=["completed", "in_progress", "blocked", "failed"])
log_parser.add_argument("--check-duplicate", action="store_true", help="Check for duplicates before logging")
log_parser.add_argument("--duplicate-keywords", help="Keywords to check for duplicates (if different from description)")
# Recent command
recent_parser = subparsers.add_parser("recent", help="Show recent activities")
recent_parser.add_argument("--agent", choices=["Kimi", "Max"], help="Filter by agent")
recent_parser.add_argument("--action", help="Filter by action type")
recent_parser.add_argument("--hours", type=int, default=24, help="Hours to look back")
recent_parser.add_argument("--limit", type=int, default=20, help="Max results")
# Search command
search_parser = subparsers.add_parser("search", help="Search activities")
search_parser.add_argument("query", help="Search query")
search_parser.add_argument("--limit", type=int, default=10)
# Check command
check_parser = subparsers.add_parser("check", help="Check for duplicate work")
check_parser.add_argument("--action", required=True, help="Action type")
check_parser.add_argument("--keywords", required=True, help="Keywords to check")
check_parser.add_argument("--hours", type=int, default=6, help="Hours to look back")
args = parser.parse_args()
if args.command == "log":
if args.check_duplicate:
keywords = args.duplicate_keywords or args.description
if check_for_duplicates(args.action, keywords):
response = input("Proceed anyway? (y/n): ")
if response.lower() != "y":
print("Cancelled.")
sys.exit(0)
activity_id = log_activity(
agent=args.agent,
action_type=args.action,
description=args.description,
affected_files=args.files,
status=args.status
)
print(f"✓ Logged activity: {activity_id}")
elif args.command == "recent":
activities = get_recent_activities(
agent=args.agent,
action_type=args.action,
hours=args.hours,
limit=args.limit
)
print(f"\nRecent activities (last {args.hours}h):\n")
for a in activities:
agent_icon = "🤖" if a["agent"] == "Max" else "🎙️"
            status_icon = {
                "completed": "✅",
                "in_progress": "🔄",
                "blocked": "🚫",
                "failed": "❌"
            }.get(a["status"], "?")
print(f"{agent_icon} [{a['timestamp'][:19]}] {status_icon} {a['action_type']}")
print(f" {a['description']}")
if a['affected_files']:
print(f" Files: {', '.join(a['affected_files'])}")
print()
elif args.command == "search":
results = search_activities(args.query, args.limit)
print(f"\nSearch results for '{args.query}':\n")
for r in results:
print(f"[{r['agent']}] {r['action_type']}: {r['description']}")
print(f" {r['timestamp'][:19]} | Status: {r['status']}")
print()
elif args.command == "check":
is_dup = check_for_duplicates(args.action, args.keywords, args.hours)
sys.exit(1 if is_dup else 0)
else:
parser.print_help()
if __name__ == "__main__":
main()
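
Assuming the script is saved as `activity_log.py` (the filename is not shown in this view), typical invocations match the subcommands defined above:
```bash
# Hypothetical filename; subcommands and flags are as defined in the argparse setup above
python3 activity_log.py log --agent Kimi --action cron_created \
  --description "Added 3am Redis flush cron" --check-duplicate
python3 activity_log.py recent --hours 24 --agent Max
```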


@@ -0,0 +1,191 @@
#!/usr/bin/env python3
"""
Agent Messaging System - Redis Streams
Kimi and Max shared communication channel
"""
import argparse
import json
import time
import sys
from datetime import datetime, timezone
import redis
REDIS_HOST = "10.0.0.36"
REDIS_PORT = 6379
STREAM_NAME = "agent-messages"
LAST_READ_KEY = "agent:last_read:{agent}"
class AgentChat:
def __init__(self, agent_name):
self.agent = agent_name
self.r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
def send(self, msg_type, message, reply_to=None, from_user=False):
"""Send a message to the stream"""
entry = {
"agent": self.agent,
"type": msg_type, # idea, question, update, reply
"message": message,
"timestamp": datetime.now(timezone.utc).isoformat(),
"reply_to": reply_to or "",
"from_user": str(from_user).lower() # "true" if from Rob, "false" if from agent
}
msg_id = self.r.xadd(STREAM_NAME, entry)
print(f"[{self.agent}] Sent: {msg_id}")
return msg_id
def read_new(self, block_ms=1000):
"""Read messages since last check"""
last_id = self.r.get(LAST_READ_KEY.format(agent=self.agent)) or "0"
result = self.r.xread(
{STREAM_NAME: last_id},
block=block_ms
)
if not result:
return []
messages = []
for stream_name, entries in result:
for msg_id, data in entries:
messages.append({"id": msg_id, **data})
# Update last read position
self.r.set(LAST_READ_KEY.format(agent=self.agent), msg_id)
return messages
def read_all(self, count=50):
"""Read last N messages regardless of read status"""
entries = self.r.xrevrange(STREAM_NAME, count=count)
messages = []
for msg_id, data in entries:
messages.append({"id": msg_id, **data})
return messages
def read_since(self, hours=24):
"""Read messages from last N hours"""
cutoff = time.time() - (hours * 3600)
cutoff_ms = int(cutoff * 1000)
# Get messages since cutoff (approximate using ID which is timestamp-based)
entries = self.r.xrange(STREAM_NAME, min=f"{cutoff_ms}-0", count=1000)
messages = []
for msg_id, data in entries:
messages.append({"id": msg_id, **data})
return messages
def wait_for_reply(self, reply_to_id, timeout_sec=30):
"""Block until a reply to a specific message arrives"""
start = time.time()
last_check = "0"
while time.time() - start < timeout_sec:
result = self.r.xread({STREAM_NAME: last_check}, block=timeout_sec*1000)
if result:
for stream_name, entries in result:
for msg_id, data in entries:
last_check = msg_id
if data.get("reply_to") == reply_to_id:
return {"id": msg_id, **data}
time.sleep(0.5)
return None
def format_message(self, msg):
"""Pretty print a message"""
ts = msg.get("timestamp", "")[11:19] # HH:MM:SS only
agent = msg.get("agent", "?")
msg_type = msg.get("type", "?")
text = msg.get("message", "")
reply_to = msg.get("reply_to", "")
from_user = msg.get("from_user", "false") == "true"
icon = "🤖" if agent == "Max" else "🎙️"
        type_icon = {
            "idea": "💡",
            "question": "❓",
            "update": "📢",
            "reply": "↩️"
        }.get(msg_type, "")
# Show 📝 if message is from Rob (relayed by agent), otherwise show agent icon only
source_icon = "📝" if from_user else icon
reply_info = f" [reply to {reply_to[:8]}...]" if reply_to else ""
return f"[{ts}] {source_icon} {agent} {type_icon} {text}{reply_info}"
def main():
parser = argparse.ArgumentParser(description="Agent messaging via Redis Streams")
parser.add_argument("--agent", required=True, choices=["Kimi", "Max"], help="Your agent name")
subparsers = parser.add_subparsers(dest="command", help="Command")
# Send command
send_p = subparsers.add_parser("send", help="Send a message")
send_p.add_argument("--type", default="update", choices=["idea", "question", "update", "reply"])
send_p.add_argument("--message", "-m", required=True, help="Message text")
send_p.add_argument("--reply-to", help="Reply to message ID")
send_p.add_argument("--from-user", action="store_true", help="Mark as message from Rob (not from agent)")
# Read command
read_p = subparsers.add_parser("read", help="Read messages")
read_p.add_argument("--new", action="store_true", help="Only unread messages")
read_p.add_argument("--all", action="store_true", help="Last 50 messages")
read_p.add_argument("--since", type=int, help="Messages from last N hours")
read_p.add_argument("--wait", action="store_true", help="Wait for new messages (blocking)")
args = parser.parse_args()
chat = AgentChat(args.agent)
if args.command == "send":
msg_id = chat.send(args.type, args.message, args.reply_to, args.from_user)
print(f"Message ID: {msg_id}")
elif args.command == "read":
if args.new or args.wait:
if args.wait:
print("Waiting for messages... (Ctrl+C to stop)")
try:
while True:
msgs = chat.read_new(block_ms=5000)
for m in msgs:
print(chat.format_message(m))
except KeyboardInterrupt:
print("\nStopped.")
else:
msgs = chat.read_new()
for m in msgs:
print(chat.format_message(m))
if not msgs:
print("No new messages.")
elif args.since:
msgs = chat.read_since(args.since)
for m in msgs:
print(chat.format_message(m))
if not msgs:
print(f"No messages in last {args.since} hours.")
else: # default --all
msgs = chat.read_all()
for m in reversed(msgs): # Chronological order
print(chat.format_message(m))
if not msgs:
print("No messages in stream.")
else:
parser.print_help()
if __name__ == "__main__":
main()
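
Assuming the script is saved as `agent_chat.py` (the filename is not shown in this view), usage mirrors the argparse definitions above:
```bash
# Hypothetical filename; flags match the parser above
python3 agent_chat.py --agent Kimi send --type question -m "Did the 3am flush run?"
python3 agent_chat.py --agent Max read --new
```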

agent_check.py

@@ -0,0 +1,181 @@
#!/usr/bin/env python3
"""
Check agent messages from Redis stream
Usage: agent_check.py [--list N] [--check] [--last-minutes M]
"""
import argparse
import sys
import json
import time
from datetime import datetime, timezone
# Add parent to path for imports
sys.path.insert(0, '/root/.openclaw/workspace/skills/qdrant-memory')
try:
import redis
except ImportError:
print("❌ Redis module not available")
sys.exit(1)
REDIS_HOST = "10.0.0.36"
REDIS_PORT = 6379
STREAM_KEY = "agent-messages"
LAST_CHECKED_KEY = "agent:last_check_timestamp"
def get_redis_client():
"""Get Redis connection"""
try:
return redis.Redis(
host=REDIS_HOST,
port=REDIS_PORT,
decode_responses=True,
socket_connect_timeout=5,
socket_timeout=5
)
except Exception as e:
print(f"❌ Redis connection failed: {e}")
return None
def get_messages_since(last_check=None, count=10):
"""Get messages from Redis stream since last check"""
r = get_redis_client()
if not r:
return []
try:
# Get last N messages from stream
messages = r.xrevrange(STREAM_KEY, count=count)
result = []
for msg_id, msg_data in messages:
# Parse message data
data = {}
for k, v in msg_data.items():
data[k] = v
# Extract timestamp from message ID
timestamp_ms = int(msg_id.split('-')[0])
msg_time = datetime.fromtimestamp(timestamp_ms / 1000, tz=timezone.utc)
# Filter by last check if provided
if last_check:
if timestamp_ms <= last_check:
continue
result.append({
'id': msg_id,
'time': msg_time,
'data': data
})
return result
except Exception as e:
print(f"❌ Error reading stream: {e}")
return []
def update_last_check():
"""Update the last check timestamp"""
r = get_redis_client()
if not r:
return False
try:
now_ms = int(time.time() * 1000)
r.set(LAST_CHECKED_KEY, str(now_ms))
return True
except Exception as e:
print(f"❌ Error updating timestamp: {e}")
return False
def get_last_check_time():
"""Get the last check timestamp"""
r = get_redis_client()
if not r:
return None
try:
last = r.get(LAST_CHECKED_KEY)
if last:
return int(last)
return None
except:
return None
def format_message(msg):
"""Format a message for display"""
time_str = msg['time'].strftime('%Y-%m-%d %H:%M:%S UTC')
data = msg['data']
sender = data.get('sender', 'unknown')
recipient = data.get('recipient', 'all')
msg_type = data.get('type', 'message')
content = data.get('content', '')
return f"[{time_str}] {sender}{recipient} ({msg_type}):\n {content[:200]}{'...' if len(content) > 200 else ''}"
def main():
parser = argparse.ArgumentParser(description="Check agent messages from Redis")
parser.add_argument("--list", "-l", type=int, metavar="N", help="List last N messages")
parser.add_argument("--check", "-c", action="store_true", help="Check for new messages since last check")
parser.add_argument("--last-minutes", "-m", type=int, metavar="M", help="Check messages from last M minutes")
parser.add_argument("--mark-read", action="store_true", help="Update last check timestamp after reading")
args = parser.parse_args()
if args.check:
last_check = get_last_check_time()
messages = get_messages_since(last_check)
if messages:
print(f"🔔 {len(messages)} new message(s):")
for msg in reversed(messages): # Oldest first
print(format_message(msg))
print()
else:
print("✅ No new messages")
if args.mark_read:
update_last_check()
print("📌 Last check time updated")
elif args.last_minutes:
since_ms = int((time.time() - args.last_minutes * 60) * 1000)
messages = get_messages_since(since_ms)
if messages:
print(f"📨 {len(messages)} message(s) from last {args.last_minutes} minutes:")
for msg in reversed(messages):
print(format_message(msg))
print()
else:
print(f"✅ No messages in last {args.last_minutes} minutes")
elif args.list:
messages = get_messages_since(count=args.list)
if messages:
print(f"📜 Last {len(messages)} message(s):")
for msg in reversed(messages):
print(format_message(msg))
print()
else:
print("📭 No messages in stream")
else:
# Default: check for new messages
last_check = get_last_check_time()
messages = get_messages_since(last_check)
if messages:
print(f"🔔 {len(messages)} new message(s):")
for msg in reversed(messages):
print(format_message(msg))
print()
update_last_check()
else:
print("✅ No new messages")
if __name__ == "__main__":
main()

api_scraper.py

@@ -0,0 +1,275 @@
#!/usr/bin/env python3
"""
API Scraper - REST API client with pagination support
Usage: api_scraper.py https://api.example.com/items --domain "API" --path "Endpoints/Items"
"""
import argparse
import sys
import json
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
from datetime import datetime
sys.path.insert(0, str(Path(__file__).parent))
from scrape_to_kb import chunk_text, get_embedding, compute_checksum, store_in_kb
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "knowledge_base"
class APIScraper:
def __init__(self, base_url, headers=None, rate_limit=0):
self.base_url = base_url
self.headers = headers or {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
'Accept': 'application/json'
}
self.rate_limit = rate_limit # seconds between requests
def fetch(self, url, params=None):
"""Fetch JSON from API"""
        if params:
            query = urllib.parse.urlencode(params)
            url = f"{url}?{query}" if '?' not in url else f"{url}&{query}"
req = urllib.request.Request(url, headers=self.headers)
try:
with urllib.request.urlopen(req, timeout=30) as response:
return json.loads(response.read().decode())
except urllib.error.HTTPError as e:
print(f"❌ HTTP {e.code}: {e.reason}", file=sys.stderr)
return None
except Exception as e:
print(f"❌ Error: {e}", file=sys.stderr)
return None
def paginate(self, endpoint, page_param="page", size_param="limit",
size=100, max_pages=None, data_key=None):
"""Fetch paginated results"""
all_data = []
page = 1
while True:
params = {page_param: page, size_param: size}
url = f"{self.base_url}{endpoint}" if not endpoint.startswith('http') else endpoint
print(f"📄 Fetching page {page}...")
data = self.fetch(url, params)
if not data:
break
# Extract items from response
if data_key:
items = data.get(data_key, [])
elif isinstance(data, list):
items = data
else:
# Try common keys
for key in ['data', 'items', 'results', 'records', 'docs']:
if key in data:
items = data[key]
break
else:
items = [data] # Single item
if not items:
break
all_data.extend(items)
# Check for more pages
if max_pages and page >= max_pages:
print(f" Reached max pages ({max_pages})")
break
# Check if we got less than requested (last page)
if len(items) < size:
break
            page += 1
            if self.rate_limit:
                time.sleep(self.rate_limit)
return all_data
def format_for_kb(self, items, format_template=None):
"""Format API items as text for knowledge base"""
if not items:
return ""
parts = []
for i, item in enumerate(items):
if format_template:
# Use custom template
try:
text = format_template.format(**item, index=i+1)
except KeyError:
text = json.dumps(item, indent=2)
else:
# Auto-format
text = self._auto_format(item)
parts.append(text)
return "\n\n---\n\n".join(parts)
def _auto_format(self, item):
"""Auto-format a JSON item as readable text"""
if isinstance(item, str):
return item
if not isinstance(item, dict):
return json.dumps(item, indent=2)
parts = []
# Title/Name first
for key in ['name', 'title', 'id', 'key']:
if key in item:
parts.append(f"# {item[key]}")
break
# Description/summary
for key in ['description', 'summary', 'content', 'body', 'text']:
if key in item:
parts.append(f"\n{item[key]}")
break
# Other fields
skip = ['name', 'title', 'id', 'key', 'description', 'summary', 'content', 'body', 'text']
for key, value in item.items():
if key in skip:
continue
if value is None:
continue
if isinstance(value, (list, dict)):
value = json.dumps(value, indent=2)
parts.append(f"\n**{key}:** {value}")
return "\n".join(parts)
def main():
parser = argparse.ArgumentParser(description="Scrape REST API to knowledge base")
parser.add_argument("url", help="API endpoint URL")
parser.add_argument("--domain", required=True, help="Knowledge domain")
parser.add_argument("--path", required=True, help="Hierarchical path")
parser.add_argument("--paginate", action="store_true", help="Enable pagination")
parser.add_argument("--page-param", default="page", help="Page parameter name")
parser.add_argument("--size-param", default="limit", help="Page size parameter name")
parser.add_argument("--size", type=int, default=100, help="Items per page")
parser.add_argument("--max-pages", type=int, help="Max pages to fetch")
parser.add_argument("--data-key", help="Key containing data array in response")
parser.add_argument("--header", action='append', nargs=2, metavar=('KEY', 'VALUE'),
help="Custom headers (e.g., --header Authorization 'Bearer token')")
parser.add_argument("--format", help="Python format string for item display")
parser.add_argument("--category", default="reference")
parser.add_argument("--content-type", default="api_data")
parser.add_argument("--subjects", help="Comma-separated subjects")
parser.add_argument("--title", help="Content title")
parser.add_argument("--output", "-o", help="Save to JSON file instead of KB")
parser.add_argument("--rate-limit", type=float, default=0.5,
help="Seconds between requests (default: 0.5)")
args = parser.parse_args()
# Build headers
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
'Accept': 'application/json'
}
if args.header:
for key, value in args.header:
headers[key] = value
scraper = APIScraper(args.url, headers=headers, rate_limit=args.rate_limit)
print(f"🔌 API: {args.url}")
print(f"🏷️ Domain: {args.domain}")
print(f"📂 Path: {args.path}")
# Fetch data
if args.paginate:
print("📄 Pagination enabled\n")
items = scraper.paginate(
args.url,
page_param=args.page_param,
size_param=args.size_param,
size=args.size,
max_pages=args.max_pages,
data_key=args.data_key
)
else:
print("📄 Single request\n")
data = scraper.fetch(args.url)
if data_key := args.data_key:
items = data.get(data_key, []) if data else []
elif isinstance(data, list):
items = data
else:
items = [data] if data else []
if not items:
print("❌ No data fetched", file=sys.stderr)
sys.exit(1)
print(f"✓ Fetched {len(items)} items")
if args.output:
with open(args.output, 'w') as f:
json.dump(items, f, indent=2)
print(f"💾 Saved raw data to {args.output}")
return
# Format for KB
text = scraper.format_for_kb(items, args.format)
print(f"📝 Formatted: {len(text)} chars")
if len(text) < 200:
print("❌ Content too short", file=sys.stderr)
sys.exit(1)
chunks = chunk_text(text)
print(f"🧩 Chunks: {len(chunks)}")
subjects = [s.strip() for s in args.subjects.split(",")] if args.subjects else []
checksum = compute_checksum(text)
title = args.title or f"API Data from {args.url}"
print("💾 Storing...")
stored = 0
for i, chunk in enumerate(chunks):
chunk_metadata = {
"domain": args.domain,
"path": f"{args.path}/chunk-{i+1}",
"subjects": subjects,
"category": args.category,
"content_type": args.content_type,
"title": f"{title} (part {i+1}/{len(chunks)})",
"checksum": checksum,
"source_url": args.url,
"date_added": datetime.now().strftime("%Y-%m-%d"),
"chunk_index": i + 1,
"total_chunks": len(chunks),
"text_preview": chunk[:200] + "..." if len(chunk) > 200 else chunk,
"scraper_type": "api_rest",
"item_count": len(items),
"api_endpoint": args.url
}
if store_in_kb(chunk, chunk_metadata):
stored += 1
print(f" ✓ Chunk {i+1}")
print(f"\n🎉 Stored {stored}/{len(chunks)} chunks")
print(f" Source: {args.url}")
print(f" Items: {len(items)}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,388 @@
#!/usr/bin/env python3
"""
Auto Conversation Memory - TRUE Mem0-style Full Context Storage
User-centric memory - all conversations link to persistent user_id.
NOT session/chat-centric like old version.
Features:
- Persistent user_id (e.g., "rob") across all conversations
- Cross-conversation retrieval (find memories from any chat)
- Automatic conversation threading
- Deduplication
- Mem0-style: memories belong to USER, not to session
Usage:
python3 scripts/auto_store.py "user_message" "ai_response" \
--user-id "rob" \
--conversation-id <uuid> \
--turn <n>
Mem0 Architecture:
- user_id: "rob" (persistent across all your chats)
- conversation_id: Groups turns within one conversation
- session_id: Optional - tracks specific chat instance
- Retrieved by: user_id + semantic similarity (NOT session_id)
"""
import argparse
import hashlib
import json
import os
import sys
import urllib.request
import uuid
from datetime import datetime
from typing import List, Optional, Dict, Any
QDRANT_URL = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
COLLECTION_NAME = os.getenv("QDRANT_COLLECTION", "kimi_memories")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://127.0.0.1:11434/v1")
# In-memory cache for deduplication (per process)
_recent_hashes = set()
def get_content_hash(user_msg: str, ai_response: str) -> str:
"""Generate hash for deduplication (stable across platforms)."""
content = f"{user_msg.strip()}::{ai_response.strip()}".encode("utf-8", errors="replace")
return hashlib.sha256(content).hexdigest()
def is_duplicate(user_id: str, user_msg: str, ai_response: str) -> bool:
"""
Check if this conversation turn already exists for this user.
Uses: user_id + content_hash
"""
content_hash = get_content_hash(user_msg, ai_response)
# Check in-memory cache first
if content_hash in _recent_hashes:
return True
# Check Qdrant for existing entry with this user_id + content_hash
try:
search_body = {
"filter": {
"must": [
{"key": "user_id", "match": {"value": user_id}},
{"key": "content_hash", "match": {"value": content_hash}}
]
},
"limit": 1,
"with_payload": False
}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
data=json.dumps(search_body).encode(),
headers={"Content-Type": "application/json"}
)
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
points = result.get("result", {}).get("points", [])
if len(points) > 0:
return True
except Exception:
pass
return False
def mark_stored(user_msg: str, ai_response: str):
"""Mark content as stored in memory cache"""
content_hash = get_content_hash(user_msg, ai_response)
_recent_hashes.add(content_hash)
if len(_recent_hashes) > 1000:
_recent_hashes.clear()
def get_embedding(text: str) -> Optional[List[float]]:
"""Generate embedding using snowflake-arctic-embed2"""
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": text[:8192]
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
return result["data"][0]["embedding"]
except Exception as e:
print(f"[AutoMemory] Embedding error: {e}", file=sys.stderr)
return None
def generate_conversation_summary(user_msg: str, ai_response: str) -> str:
"""Generate a searchable summary of the conversation turn"""
summary = f"Q: {user_msg[:200]} A: {ai_response[:300]}"
return summary
def store_memory_point(
user_id: str,
text: str,
speaker: str,
date_str: str,
conversation_id: str,
turn_number: int,
session_id: Optional[str],
tags: List[str],
importance: str = "medium",
content_hash: Optional[str] = None
) -> Optional[str]:
"""Store a single memory point to Qdrant with user_id"""
embedding = get_embedding(text)
if embedding is None:
return None
point_id = str(uuid.uuid4())
payload = {
# MEM0-STYLE: user_id is PRIMARY key
"user_id": user_id,
"text": text,
"date": date_str,
"tags": tags,
"importance": importance,
"source": "conversation_auto",
"source_type": "user" if speaker == "user" else "assistant",
"category": "Full Conversation",
"confidence": "high",
"verified": True,
"created_at": datetime.now().isoformat(),
"access_count": 0,
"last_accessed": datetime.now().isoformat(),
"conversation_id": conversation_id,
"turn_number": turn_number,
"session_id": session_id or ""
}
if content_hash:
payload["content_hash"] = content_hash
upsert_data = {
"points": [{
"id": point_id,
"vector": embedding,
"payload": payload
}]
}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
data=json.dumps(upsert_data).encode(),
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
if result.get("status") == "ok":
return point_id
except Exception as e:
print(f"[AutoMemory] Storage error: {e}", file=sys.stderr)
return None
def store_conversation_turn(
user_id: str,
user_message: str,
ai_response: str,
conversation_id: Optional[str] = None,
turn_number: Optional[int] = None,
session_id: Optional[str] = None,
date_str: Optional[str] = None,
skip_if_duplicate: bool = True
) -> Dict[str, Any]:
"""
Store a full conversation turn to Qdrant (Mem0-style)
Args:
user_id: PERSISTENT user identifier (e.g., "rob") - REQUIRED
user_message: User's message
ai_response: AI's response
conversation_id: Groups related turns (auto-generated if None)
turn_number: Sequential turn number
session_id: Optional chat session identifier
date_str: Date in YYYY-MM-DD format
Returns:
dict with success status and memory IDs
"""
if not user_id:
raise ValueError("user_id is required for Mem0-style storage")
if date_str is None:
date_str = datetime.now().strftime("%Y-%m-%d")
# Check for duplicates (per user)
if skip_if_duplicate and is_duplicate(user_id, user_message, ai_response):
return {
"user_point_id": None,
"ai_point_id": None,
"user_id": user_id,
"conversation_id": conversation_id or "",
"turn_number": turn_number or 1,
"success": True,
"skipped": True
}
if conversation_id is None:
conversation_id = str(uuid.uuid4())
if turn_number is None:
turn_number = 1
# Tags include user_id for easy filtering
tags = [
"conversation",
f"user:{user_id}",
date_str
]
if session_id:
tags.append(f"session:{session_id[:8]}")
# Determine importance
importance = "high" if any(kw in (user_message + ai_response).lower()
for kw in ["remember", "important", "always", "never", "rule"]) else "medium"
content_hash = get_content_hash(user_message, ai_response)
# Store user message
user_text = f"[{user_id}]: {user_message}"
user_id_point = store_memory_point(
user_id=user_id,
text=user_text,
speaker="user",
date_str=date_str,
conversation_id=conversation_id,
turn_number=turn_number,
session_id=session_id,
tags=tags + ["user-message"],
importance=importance,
content_hash=content_hash
)
# Store AI response
ai_text = f"[Kimi]: {ai_response}"
ai_id_point = store_memory_point(
user_id=user_id,
text=ai_text,
speaker="assistant",
date_str=date_str,
conversation_id=conversation_id,
turn_number=turn_number,
session_id=session_id,
tags=tags + ["ai-response"],
importance=importance,
content_hash=content_hash
)
# Store summary
summary = generate_conversation_summary(user_message, ai_response)
summary_text = f"[Turn {turn_number}] {summary}"
summary_embedding = get_embedding(summary_text)
if summary_embedding:
summary_id = str(uuid.uuid4())
summary_payload = {
"user_id": user_id,
"text": summary_text,
"date": date_str,
"tags": tags + ["summary", "combined"],
"importance": importance,
"source": "conversation_summary",
"source_type": "system",
"category": "Conversation Summary",
"confidence": "high",
"verified": True,
"created_at": datetime.now().isoformat(),
"access_count": 0,
"last_accessed": datetime.now().isoformat(),
"conversation_id": conversation_id,
"turn_number": turn_number,
"session_id": session_id or "",
"content_hash": content_hash,
"user_message": user_message[:500],
"ai_response": ai_response[:800]
}
upsert_data = {
"points": [{
"id": summary_id,
"vector": summary_embedding,
"payload": summary_payload
}]
}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
data=json.dumps(upsert_data).encode(),
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
json.loads(response.read().decode())
except Exception as e:
print(f"[AutoMemory] Summary storage error: {e}", file=sys.stderr)
# Mark as stored
if user_id_point and ai_id_point:
mark_stored(user_message, ai_response)
return {
"user_point_id": user_id_point,
"ai_point_id": ai_id_point,
"user_id": user_id,
"conversation_id": conversation_id,
"turn_number": turn_number,
"success": bool(user_id_point and ai_id_point),
"skipped": False
}
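# Usage sketch (illustrative values):
#   result = store_conversation_turn("rob", "Remember: backups run at 3am",
#                                    "Noted - 3am backups.", turn_number=7)
#   result["skipped"] is True when this turn was already stored for the user.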
def main():
parser = argparse.ArgumentParser(
description="Auto-store conversation turns to Qdrant (TRUE Mem0-style with user_id)"
)
parser.add_argument("user_message", help="The user's message")
parser.add_argument("ai_response", help="The AI's response")
parser.add_argument("--user-id", required=True,
help="REQUIRED: Persistent user ID (e.g., 'rob')")
parser.add_argument("--conversation-id",
help="Conversation ID for threading (auto-generated if not provided)")
parser.add_argument("--turn", type=int, help="Turn number in conversation")
parser.add_argument("--session-id",
help="Optional: Session/chat instance ID")
parser.add_argument("--date", default=datetime.now().strftime("%Y-%m-%d"),
help="Date in YYYY-MM-DD format")
args = parser.parse_args()
result = store_conversation_turn(
user_id=args.user_id,
user_message=args.user_message,
ai_response=args.ai_response,
conversation_id=args.conversation_id,
turn_number=args.turn,
session_id=args.session_id,
date_str=args.date
)
if result.get("skipped"):
print(f"⚡ Skipped duplicate (already stored for user {result['user_id']})")
elif result["success"]:
print(f"✅ Stored for user '{result['user_id']}' turn {result['turn_number']}")
print(f" Conversation: {result['conversation_id'][:8]}...")
else:
print("❌ Failed to store conversation", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,145 @@
#!/usr/bin/env python3
"""
Backfill emails to Qdrant for a specific user.
One-time use to populate memories from existing emails.
"""
import imaplib
import email
from email.policy import default
import json
import sys
import subprocess
# Authorized senders with their user IDs
# Add your authorized emails here
AUTHORIZED_SENDERS = {
# "your_email@gmail.com": "yourname",
# "spouse_email@gmail.com": "spousename"
}
# Gmail IMAP settings
IMAP_SERVER = "imap.gmail.com"
IMAP_PORT = 993
# Load credentials
CRED_FILE = "/root/.openclaw/workspace/.gmail_imap.json"
def load_credentials():
try:
with open(CRED_FILE, 'r') as f:
return json.load(f)
except Exception as e:
print(f"Error loading credentials: {e}")
return None
def store_email_memory(user_id, sender, subject, body, date):
"""Store email to Qdrant as memory for the user."""
try:
# Format as conversation-like entry
email_text = f"[EMAIL from {sender}]\nSubject: {subject}\nDate: {date}\n\n{body}"
# Store using auto_store.py (waits for completion)
script_path = "/root/.openclaw/workspace/skills/qdrant-memory/scripts/auto_store.py"
result = subprocess.run([
"python3", script_path,
f"[Email] {subject}",
email_text,
"--user-id", user_id
], capture_output=True, text=True, timeout=30)
if result.returncode == 0:
print(f" ✓ Stored: {subject[:50]}")
else:
print(f" ✗ Failed: {subject[:50]}")
except Exception as e:
print(f" ✗ Error: {e}")
def backfill(user_id=None, limit=20):
"""Backfill emails for specific user or all authorized senders."""
creds = load_credentials()
if not creds:
return
email_addr = creds.get("email")
app_password = creds.get("app_password")
if not email_addr or not app_password:
return
try:
# Connect to IMAP
mail = imaplib.IMAP4_SSL(IMAP_SERVER, IMAP_PORT)
mail.login(email_addr, app_password)
mail.select("inbox")
# Get ALL emails
status, messages = mail.search(None, "ALL")
if status != "OK" or not messages[0]:
print("No emails found.")
mail.logout()
return
email_ids = messages[0].split()
print(f"Found {len(email_ids)} total emails")
# Filter by user if specified
target_emails = []
if user_id:
# Find email address for this user
for auth_email, uid in AUTHORIZED_SENDERS.items():
if uid == user_id:
target_emails.append(auth_email.lower())
else:
target_emails = [e.lower() for e in AUTHORIZED_SENDERS.keys()]
# Process emails
stored_count = 0
for eid in email_ids[-limit:]:
status, msg_data = mail.fetch(eid, "(RFC822)")
if status != "OK":
continue
msg = email.message_from_bytes(msg_data[0][1], policy=default)
sender = msg.get("From", "").lower()
subject = msg.get("Subject", "")
date = msg.get("Date", "")
# Extract body
body = ""
if msg.is_multipart():
for part in msg.walk():
if part.get_content_type() == "text/plain":
body = part.get_content()
break
else:
body = msg.get_content()
body = body.strip()[:2000] if body else ""
# Check if from target sender
for auth_email, uid in AUTHORIZED_SENDERS.items():
if auth_email.lower() in sender:
if user_id and uid != user_id:
continue
print(f"\nStoring for {uid}:")
store_email_memory(uid, sender, subject, body, date)
stored_count += 1
break
print(f"\nDone! Stored {stored_count} emails to Qdrant.")
mail.close()
mail.logout()
except Exception as e:
print(f"Error: {e}")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Backfill emails to Qdrant")
parser.add_argument("--user-id", help="Specific user to backfill (rob or jennifer)")
parser.add_argument("--limit", type=int, default=20, help="Max emails to process")
args = parser.parse_args()
backfill(user_id=args.user_id, limit=args.limit)
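# Usage sketch (script name is illustrative; fill AUTHORIZED_SENDERS first):
#   python3 backfill_emails.py --user-id yourname --limit 50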

View File

@@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""
Background Conversation Storage - Fire-and-forget wrapper (Mem0-style)
Usage:
background_store.py "user_message" "ai_response" \
--user-id "rob" \
[--turn N] \
[--session-id UUID]
Zero delay for user - storage happens asynchronously.
Mem0-style: user_id is REQUIRED (persistent across all chats).
"""
import argparse
import subprocess
import sys
from pathlib import Path
SCRIPT_DIR = Path(__file__).parent.resolve()
AUTO_STORE = SCRIPT_DIR / "auto_store.py"
def store_in_background(
user_id: str,
user_message: str,
ai_response: str,
turn: int = None,
session_id: str = None
):
"""Fire off storage without waiting - returns immediately"""
cmd = [
sys.executable,
str(AUTO_STORE),
user_message,
ai_response,
"--user-id", user_id
]
if turn:
cmd.extend(["--turn", str(turn)])
if session_id:
cmd.extend(["--session-id", session_id])
# Fire and forget
subprocess.Popen(
cmd,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
start_new_session=True
)
return True
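# Usage sketch from another Python process (illustrative values):
#   from background_store import store_in_background
#   store_in_background("rob", "What's next?", "Next up: the daily flush.", turn=3)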
def main():
parser = argparse.ArgumentParser(
description="Store conversation in background (Mem0-style, zero delay)"
)
parser.add_argument("user_message", help="User's message")
parser.add_argument("ai_response", help="AI's response")
parser.add_argument("--user-id", required=True,
help="REQUIRED: Persistent user ID (e.g., 'rob')")
parser.add_argument("--turn", type=int, help="Turn number")
parser.add_argument("--session-id", help="Optional session/chat ID")
args = parser.parse_args()
store_in_background(
user_id=args.user_id,
user_message=args.user_message,
ai_response=args.ai_response,
turn=args.turn,
session_id=args.session_id
)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,159 @@
#!/usr/bin/env python3
"""
Batch URL Crawler - Scrape multiple URLs to knowledge base
Usage: batch_crawl.py urls.txt --domain "Python" --path "Docs/Tutorials"
"""
import argparse
import sys
import json
import concurrent.futures
import urllib.request
from pathlib import Path
from datetime import datetime
sys.path.insert(0, str(Path(__file__).parent))
from scrape_to_kb import fetch_url, extract_text, chunk_text, get_embedding, compute_checksum, store_in_kb
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "knowledge_base"
def load_urls(url_source):
"""Load URLs from file or JSON"""
if url_source.endswith('.json'):
with open(url_source) as f:
data = json.load(f)
return [(item['url'], item.get('title'), item.get('subjects', []))
for item in data]
else:
with open(url_source) as f:
urls = []
for line in f:
line = line.strip()
if line and not line.startswith('#'):
# Parse URL [title] [subjects]
parts = line.split(' ', 1)
url = parts[0]
title = None
subjects = []
if len(parts) > 1:
# Check for [Title] and #subject1,#subject2
rest = parts[1]
if '[' in rest and ']' in rest:
title_match = rest[rest.find('[')+1:rest.find(']')]
title = title_match
rest = rest[rest.find(']')+1:]
                        if '#' in rest:
                            # Strip whitespace and the trailing comma in "#a,#b" style lists
                            subjects = [s.strip().strip(',') for s in rest.split('#') if s.strip()]
urls.append((url, title, subjects))
return urls
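# Example urls.txt accepted by load_urls (hypothetical URLs):
#   # comment lines and blanks are skipped
#   https://example.com/docs/intro [Intro Guide] #python #tutorial
#   https://example.com/docs/api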
def scrape_single(url_data, domain, path, category, content_type):
"""Scrape a single URL"""
url, title_override, subjects = url_data
try:
print(f"🔍 {url}")
html = fetch_url(url)
if not html:
return {"url": url, "status": "failed", "error": "fetch"}
title, text = extract_text(html)
if title_override:
title = title_override
if len(text) < 200:
return {"url": url, "status": "skipped", "reason": "too_short"}
chunks = chunk_text(text)
checksum = compute_checksum(text)
stored = 0
for i, chunk in enumerate(chunks):
chunk_metadata = {
"domain": domain,
"path": f"{path}/chunk-{i+1}",
"subjects": subjects,
"category": category,
"content_type": content_type,
"title": f"{title} (part {i+1}/{len(chunks)})",
"checksum": checksum,
"source_url": url,
"date_added": "2026-02-05",
"chunk_index": i + 1,
"total_chunks": len(chunks),
"text_preview": chunk[:200] + "..." if len(chunk) > 200 else chunk
}
if store_in_kb(chunk, chunk_metadata):
stored += 1
return {
"url": url,
"status": "success",
"chunks": len(chunks),
"stored": stored,
"title": title
}
except Exception as e:
return {"url": url, "status": "error", "error": str(e)}
def main():
parser = argparse.ArgumentParser(description="Batch scrape URLs to knowledge base")
parser.add_argument("urls", help="File with URLs (.txt or .json)")
parser.add_argument("--domain", required=True, help="Knowledge domain")
parser.add_argument("--path", required=True, help="Hierarchical path")
parser.add_argument("--category", default="reference",
choices=["reference", "tutorial", "snippet", "troubleshooting", "concept"])
parser.add_argument("--content-type", default="web_page")
parser.add_argument("--workers", type=int, default=3, help="Concurrent workers (default: 3)")
parser.add_argument("--dry-run", action="store_true", help="Test without storing")
args = parser.parse_args()
urls = load_urls(args.urls)
print(f"📋 Loaded {len(urls)} URLs")
print(f"🏷️ Domain: {args.domain}")
print(f"📂 Path: {args.path}")
print(f"⚡ Workers: {args.workers}")
if args.dry_run:
print("\n🔍 DRY RUN - No storage\n")
for url, title, subjects in urls:
print(f" Would scrape: {url}")
if title:
print(f" Title: {title}")
if subjects:
print(f" Subjects: {', '.join(subjects)}")
return
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
futures = {
executor.submit(scrape_single, url_data, args.domain, args.path,
args.category, args.content_type): url_data
for url_data in urls
}
for future in concurrent.futures.as_completed(futures):
result = future.result()
results.append(result)
if result["status"] == "success":
print(f"{result['title'][:50]}... ({result['stored']}/{result['chunks']} chunks)")
elif result["status"] == "skipped":
print(f" ⚠ Skipped: {result.get('reason')}")
else:
print(f" ✗ Failed: {result.get('error', 'unknown')}")
# Summary
success = sum(1 for r in results if r["status"] == "success")
failed = sum(1 for r in results if r["status"] in ["failed", "error"])
skipped = sum(1 for r in results if r["status"] == "skipped")
print(f"\n📊 Summary:")
print(f" ✓ Success: {success}")
print(f" ✗ Failed: {failed}")
print(f" ⚠ Skipped: {skipped}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,298 @@
#!/usr/bin/env python3
"""
Bulk memory migration to Qdrant kimi_memories collection
Uses snowflake-arctic-embed2 (1024 dimensions)
"""
import json
import os
import re
import sys
import urllib.request
import uuid
from datetime import datetime
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_URL = "http://localhost:11434/v1"
MEMORY_DIR = "/root/.openclaw/workspace/memory"
MEMORY_MD = "/root/.openclaw/workspace/MEMORY.md"
def get_embedding(text):
"""Generate embedding using snowflake-arctic-embed2 via Ollama"""
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": text[:8192] # Limit text length
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=60) as response:
result = json.loads(response.read().decode())
return result["data"][0]["embedding"]
except Exception as e:
print(f"Error generating embedding: {e}", file=sys.stderr)
return None
def store_memory(text, embedding, tags=None, importance="medium", date=None,
source="memory_backup", confidence="high", source_type="user",
verified=True):
"""Store memory in Qdrant with metadata"""
if date is None:
date = datetime.now().strftime("%Y-%m-%d")
point_id = str(uuid.uuid4())
payload = {
"text": text,
"date": date,
"tags": tags or [],
"importance": importance,
"confidence": confidence,
"source_type": source_type,
"verified": verified,
"source": source,
"created_at": datetime.now().isoformat(),
"access_count": 0
}
point = {
"id": point_id,
"vector": embedding,
"payload": payload
}
data = json.dumps({"points": [point]}).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
            # Qdrant reports success via the top-level "status"; result.status is the op state
            return result.get("status") == "ok"
except Exception as e:
print(f"Error storing memory: {e}", file=sys.stderr)
return False
def extract_memories_from_file(filepath, importance="medium"):
"""Extract memory entries from a markdown file"""
memories = []
try:
with open(filepath, 'r') as f:
content = f.read()
except Exception as e:
print(f"Error reading {filepath}: {e}", file=sys.stderr)
return memories
# Extract date from filename or content
date_match = re.search(r'(\d{4}-\d{2}-\d{2})', filepath)
date = date_match.group(1) if date_match else datetime.now().strftime("%Y-%m-%d")
# Parse sections
lines = content.split('\n')
current_section = None
current_content = []
for line in lines:
# Section headers
if line.startswith('# ') and 'Memory' in line:
continue # Skip title
elif line.startswith('## '):
# Save previous section
if current_section and current_content:
section_text = '\n'.join(current_content).strip()
if len(section_text) > 20:
memories.append({
"text": f"{current_section}: {section_text}",
"date": date,
"tags": extract_tags(current_section, section_text),
"importance": importance
})
current_section = line[3:].strip()
current_content = []
elif line.startswith('### '):
# Save previous section
if current_section and current_content:
section_text = '\n'.join(current_content).strip()
if len(section_text) > 20:
memories.append({
"text": f"{current_section}: {section_text}",
"date": date,
"tags": extract_tags(current_section, section_text),
"importance": importance
})
current_section = line[4:].strip()
current_content = []
else:
if current_section:
current_content.append(line)
# Save final section
if current_section and current_content:
section_text = '\n'.join(current_content).strip()
if len(section_text) > 20:
memories.append({
"text": f"{current_section}: {section_text}",
"date": date,
"tags": extract_tags(current_section, section_text),
"importance": importance
})
return memories
def extract_tags(section, content):
"""Extract relevant tags from section and content"""
tags = []
# Section-based tags
if any(word in section.lower() for word in ['voice', 'tts', 'stt', 'audio']):
tags.extend(['voice', 'audio'])
if any(word in section.lower() for word in ['memory', 'qdrant', 'remember']):
tags.extend(['memory', 'qdrant'])
if any(word in section.lower() for word in ['redis', 'agent', 'message', 'max']):
tags.extend(['redis', 'messaging', 'agent'])
if any(word in section.lower() for word in ['youtube', 'seo', 'content']):
tags.extend(['youtube', 'content'])
if any(word in section.lower() for word in ['search', 'searxng', 'web']):
tags.extend(['search', 'web'])
if any(word in section.lower() for word in ['setup', 'install', 'bootstrap']):
tags.extend(['setup', 'configuration'])
# Content-based tags
content_lower = content.lower()
if 'voice' in content_lower:
tags.append('voice')
if 'memory' in content_lower:
tags.append('memory')
if 'qdrant' in content_lower:
tags.append('qdrant')
if 'redis' in content_lower:
tags.append('redis')
if 'youtube' in content_lower:
tags.append('youtube')
if 'rob' in content_lower:
tags.append('user')
return list(set(tags)) # Remove duplicates
def extract_core_memories_from_memory_md():
"""Extract high-importance memories from MEMORY.md"""
memories = []
try:
with open(MEMORY_MD, 'r') as f:
content = f.read()
except Exception as e:
print(f"Error reading MEMORY.md: {e}", file=sys.stderr)
return memories
# Core sections with high importance
sections = [
("Identity & Names", "high"),
("Core Preferences", "high"),
("Communication Rules", "high"),
("Voice Settings", "high"),
("Lessons Learned", "high"),
]
for section_name, importance in sections:
pattern = f"## {section_name}.*?(?=## |$)"
match = re.search(pattern, content, re.DOTALL)
if match:
section_text = match.group(0).strip()
# Extract subsections
subsections = re.findall(r'### (.+?)\n', section_text)
for sub in subsections:
sub_pattern = f"### {re.escape(sub)}.*?(?=### |## |$)"
sub_match = re.search(sub_pattern, section_text, re.DOTALL)
if sub_match:
sub_text = sub_match.group(0).strip()
if len(sub_text) > 50:
memories.append({
"text": f"{section_name} - {sub}: {sub_text[:500]}",
"date": "2026-02-10",
"tags": extract_tags(section_name, sub_text) + ['core', 'longterm'],
"importance": importance
})
return memories
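# MEMORY.md shape this parser expects (illustrative content):
#   ## Core Preferences
#   ### Voice Settings
#   Prefers short confirmations; default TTS voice is "calm".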
def main():
print("Starting bulk memory migration to kimi_memories...")
print(f"Collection: {COLLECTION_NAME}")
print(f"Model: snowflake-arctic-embed2 (1024 dims)")
print()
all_memories = []
# Extract from daily logs
for filename in sorted(os.listdir(MEMORY_DIR)):
if filename.endswith('.md') and filename.startswith('2026'):
filepath = os.path.join(MEMORY_DIR, filename)
print(f"Processing {filename}...")
memories = extract_memories_from_file(filepath, importance="medium")
all_memories.extend(memories)
print(f" Extracted {len(memories)} memories")
# Extract from MEMORY.md
print("Processing MEMORY.md...")
core_memories = extract_core_memories_from_memory_md()
all_memories.extend(core_memories)
print(f" Extracted {len(core_memories)} core memories")
print(f"\nTotal memories to store: {len(all_memories)}")
print()
# Store each memory
success_count = 0
fail_count = 0
for i, memory in enumerate(all_memories, 1):
print(f"[{i}/{len(all_memories)}] Storing: {memory['text'][:60]}...")
# Generate embedding
embedding = get_embedding(memory['text'])
if embedding is None:
print(f" ❌ Failed to generate embedding")
fail_count += 1
continue
# Store in Qdrant
if store_memory(
text=memory['text'],
embedding=embedding,
tags=memory['tags'],
importance=memory['importance'],
date=memory['date'],
source="bulk_migration",
confidence="high",
source_type="user",
verified=True
):
print(f" ✅ Stored")
success_count += 1
else:
print(f" ❌ Failed to store")
fail_count += 1
print()
print("=" * 50)
print(f"Migration complete!")
print(f" Success: {success_count}")
print(f" Failed: {fail_count}")
print(f" Total: {len(all_memories)}")
print("=" * 50)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,72 @@
#!/usr/bin/env python3
"""
Create today's memory file if it doesn't exist
Usage: create_daily_memory.py [date]
"""
import sys
import os
from datetime import datetime, timezone
def get_cst_date():
    """Get current date in America/Chicago (CST/CDT)"""
    try:
        from zoneinfo import ZoneInfo  # Python 3.9+
        return datetime.now(ZoneInfo("America/Chicago")).strftime('%Y-%m-%d')
    except ImportError:
        # Fallback approximation: shift UTC by -5 (CDT) or -6 (CST).
        # Use timedelta (not hour replacement) so the date rolls over correctly
        # near midnight.
        import time
        from datetime import timedelta
        offset = -5 if time.localtime().tm_isdst > 0 else -6
        return (datetime.now(timezone.utc) + timedelta(hours=offset)).strftime('%Y-%m-%d')
def create_daily_memory(date_str=None):
"""Create memory file for the given date"""
if date_str is None:
date_str = get_cst_date()
memory_dir = "/root/.openclaw/workspace/memory"
filepath = os.path.join(memory_dir, f"{date_str}.md")
# Ensure directory exists
os.makedirs(memory_dir, exist_ok=True)
# Check if file already exists
if os.path.exists(filepath):
print(f"✅ Memory file already exists: {filepath}")
return filepath
# Create new daily memory file
content = f"""# {date_str} — Daily Memory Log
## Session Start
- **Date:** {date_str}
- **Agent:** Kimi
## Activities
*(Log activities, decisions, and important context here)*
## Notes
---
*Stored for long-term memory retention*
"""
try:
with open(filepath, 'w') as f:
f.write(content)
print(f"✅ Created memory file: {filepath}")
return filepath
except Exception as e:
print(f"❌ Error creating memory file: {e}")
return None
if __name__ == "__main__":
date_arg = sys.argv[1] if len(sys.argv) > 1 else None
create_daily_memory(date_arg)

View File

@@ -0,0 +1,317 @@
#!/usr/bin/env python3
"""
Daily memory backup script with batch upload support
Backs up all memory files to kimi_memories collection in Qdrant
Uses batch uploads (256 points) for 20x performance improvement
Avoids duplicates by checking existing dates
Usage:
daily_backup.py [--dry-run] [--batch-size N]
Features:
- Batch upload with configurable size (default 256)
- Parallel processing support
- Duplicate detection via date-based scroll
- Progress reporting
"""
import argparse
import json
import os
import sys
import urllib.request
import urllib.error
import uuid
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_URL = "http://localhost:11434/v1"
MEMORY_DIR = Path("/root/.openclaw/workspace/memory")
DEFAULT_BATCH_SIZE = 256
DEFAULT_PARALLEL = 4
def get_embedding(text):
"""Generate embedding using snowflake-arctic-embed2 via Ollama"""
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": text[:8192] # Limit to 8k chars for embedding
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=60) as response:
result = json.loads(response.read().decode())
return result["data"][0]["embedding"]
except Exception as e:
print(f"Error generating embedding: {e}", file=sys.stderr)
return None
def get_embedding_batch(texts):
"""Generate embeddings for multiple texts in batch"""
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": [t[:8192] for t in texts]
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=120) as response:
result = json.loads(response.read().decode())
return [d["embedding"] for d in result["data"]]
except Exception as e:
print(f"Error generating batch embeddings: {e}", file=sys.stderr)
return [None] * len(texts)
def get_existing_dates():
"""Get list of dates already backed up via daily-backup (not manual stores)"""
try:
scroll_data = json.dumps({
"limit": 10000,
"with_payload": True,
"with_vectors": False
}).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
data=scroll_data,
headers={"Content-Type": "application/json"},
method="POST"
)
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
if result.get("result", {}).get("points"):
# Only count entries from daily-backup source, not manual stores
backup_dates = set()
for p in result["result"]["points"]:
payload = p.get("payload", {})
date = payload.get("date")
source = payload.get("source")
tags = payload.get("tags", [])
# Only skip if this was a daily-backup (not conversation/manual)
if date and source == "daily-backup":
backup_dates.add(date)
# Also check for daily-backup tag as fallback
elif date and "daily-backup" in tags:
backup_dates.add(date)
return backup_dates
except Exception as e:
print(f"Warning: Could not check existing dates: {e}", file=sys.stderr)
return set()
def batch_upload_points(points, batch_size=256):
"""Upload points in batches using batch_size"""
total = len(points)
uploaded = 0
failed = 0
for i in range(0, total, batch_size):
batch = points[i:i + batch_size]
upsert_data = {
"points": batch
}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
data=json.dumps(upsert_data).encode(),
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=60) as response:
result = json.loads(response.read().decode())
if result.get("status") == "ok":
uploaded += len(batch)
print(f" ✅ Batch {i//batch_size + 1}: {len(batch)} points uploaded")
else:
print(f" ❌ Batch {i//batch_size + 1}: Failed - {result}")
failed += len(batch)
except Exception as e:
print(f" ❌ Batch {i//batch_size + 1}: Error - {e}", file=sys.stderr)
failed += len(batch)
return uploaded, failed
def prepare_memory_point(content, date_str):
"""Prepare a memory point for upload"""
embedding = get_embedding(content)
if embedding is None:
return None
point_id = str(uuid.uuid4())
payload = {
"text": content,
"date": date_str,
"tags": ["daily-backup", f"backup-{date_str}"],
"importance": "high",
"source": "daily-backup",
"source_type": "inferred",
"confidence": "high",
"verified": True,
"created_at": datetime.now().isoformat(),
"backup_timestamp": datetime.now().isoformat(),
"access_count": 0,
"last_accessed": datetime.now().isoformat()
}
return {
"id": point_id,
"vector": embedding,
"payload": payload
}
def process_file_batch(files_batch):
"""Process a batch of files in parallel"""
results = []
for date_str, file_path in files_batch:
try:
with open(file_path, 'r') as f:
content = f.read()
point = prepare_memory_point(content, date_str)
if point:
results.append(point)
except Exception as e:
print(f"{date_str}: Failed to process - {e}")
return results
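# NOTE: process_file_batch and --parallel are currently scaffolding; main() below
# generates embeddings serially. Wiring them in (ThreadPoolExecutor is already
# imported) could look roughly like:
#   with ThreadPoolExecutor(max_workers=args.parallel) as ex:
#       futures = [ex.submit(process_file_batch, [fb]) for fb in files_to_backup]
#       for fut in as_completed(futures):
#           all_points.extend(fut.result())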
def get_memory_files():
"""Get all memory markdown files sorted by date"""
if not MEMORY_DIR.exists():
return []
files = []
for f in MEMORY_DIR.glob("????-??-??.md"):
if f.name != "heartbeat-timestamps.txt":
files.append((f.stem, f)) # (date string, file path)
# Sort by date
files.sort(key=lambda x: x[0])
return files
def main():
parser = argparse.ArgumentParser(description="Daily memory backup with batch upload")
parser.add_argument("--dry-run", action="store_true", help="Show what would be backed up without uploading")
parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE, help=f"Batch size for uploads (default: {DEFAULT_BATCH_SIZE})")
parser.add_argument("--parallel", type=int, default=DEFAULT_PARALLEL, help=f"Parallel embedding generation (default: {DEFAULT_PARALLEL})")
parser.add_argument("--force", action="store_true", help="Force re-backup of existing dates")
args = parser.parse_args()
print(f"=== Daily Memory Backup ===")
print(f"Time: {datetime.now().isoformat()}")
print(f"Batch size: {args.batch_size}")
print(f"Parallel: {args.parallel}")
if args.dry_run:
print("Mode: DRY RUN (no actual upload)")
print()
# Get existing dates to avoid duplicates
print(f"Checking for existing backups...")
existing_dates = get_existing_dates()
print(f"Found {len(existing_dates)} existing backups")
# Get memory files
memory_files = get_memory_files()
print(f"Found {len(memory_files)} memory files")
# Filter out already backed up dates (unless force)
files_to_backup = []
for date_str, file_path in memory_files:
if date_str in existing_dates and not args.force:
print(f" ⏭️ {date_str} - Already backed up, skipping")
continue
files_to_backup.append((date_str, file_path))
if not files_to_backup:
print(f"\n✅ All memories already backed up (no new files)")
return 0
print(f"\nBacking up {len(files_to_backup)} files...")
print()
if args.dry_run:
for date_str, file_path in files_to_backup:
print(f" 📄 {date_str} - Would back up ({file_path.stat().st_size} bytes)")
print(f"\nDry run complete. {len(files_to_backup)} files would be backed up.")
return 0
# Prepare all points with embeddings
all_points = []
failed_files = []
print("Generating embeddings...")
for date_str, file_path in files_to_backup:
try:
with open(file_path, 'r') as f:
content = f.read()
print(f" 📦 {date_str} - Generating embedding...")
point = prepare_memory_point(content, date_str)
if point:
all_points.append(point)
else:
failed_files.append(date_str)
except Exception as e:
print(f"{date_str} - Failed to read: {e}")
failed_files.append(date_str)
if not all_points:
print("\n❌ No points to upload")
return 1
print(f"\nGenerated {len(all_points)} embeddings, uploading in batches of {args.batch_size}...")
print()
# Upload in batches
uploaded, failed = batch_upload_points(all_points, args.batch_size)
# Summary
print(f"\n{'=' * 50}")
print("SUMMARY:")
print(f" Total files: {len(files_to_backup)}")
print(f" Successfully embedded: {len(all_points)}")
print(f" Successfully uploaded: {uploaded}")
print(f" Failed to embed: {len(failed_files)}")
print(f" Failed to upload: {failed}")
if failed_files:
print(f"\nFailed files: {', '.join(failed_files)}")
if uploaded > 0:
print(f"\n✅ Daily backup complete!")
return 0
elif failed > 0 or failed_files:
print(f"\n⚠️ Backup completed with errors")
return 1
else:
print(f"\n✅ All memories already backed up")
return 0
if __name__ == "__main__":
sys.exit(main())

View File

@@ -0,0 +1,347 @@
#!/usr/bin/env python3
"""
Daily Conversation Backup - Store day's conversations to Qdrant (Mem0-style)
Reads the daily memory file and stores all conversation turns to Qdrant
as full context (Mem0-style) with persistent user_id. Run at 3:30am daily.
Usage:
daily_conversation_backup.py [YYYY-MM-DD]
# If no date provided, processes yesterday's log
Mem0-style: All conversations linked to persistent user_id.
"""
import argparse
import hashlib
import json
import os
import re
import sys
import urllib.request
import uuid
from datetime import datetime, timedelta
from typing import List, Optional, Dict, Any
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_URL = "http://10.0.0.10:11434/v1"
MEMORY_DIR = "/root/.openclaw/workspace/memory"
# DEFAULT USER - Mem0-style: memories belong to user
DEFAULT_USER_ID = "yourname"
def get_content_hash(user_msg: str, ai_response: str) -> str:
    """Generate hash for deduplication (sha256, matching auto_store.py so
    duplicates are detected across both scripts)"""
    content = f"{user_msg.strip()}::{ai_response.strip()}".encode("utf-8", errors="replace")
    return hashlib.sha256(content).hexdigest()
def get_embedding(text: str) -> Optional[List[float]]:
"""Generate embedding using snowflake-arctic-embed2"""
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": text[:8192]
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
return result["data"][0]["embedding"]
except Exception as e:
print(f"[DailyBackup] Embedding error: {e}", file=sys.stderr)
return None
def is_duplicate(user_id: str, content_hash: str) -> bool:
"""Check if already stored for this user"""
try:
search_body = {
"filter": {
"must": [
{"key": "user_id", "match": {"value": user_id}},
{"key": "content_hash", "match": {"value": content_hash}}
]
},
"limit": 1,
"with_payload": False
}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
data=json.dumps(search_body).encode(),
headers={"Content-Type": "application/json"}
)
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
points = result.get("result", {}).get("points", [])
return len(points) > 0
except Exception:
pass
return False
def parse_daily_log(date_str: str) -> List[Dict[str, str]]:
"""Parse the daily memory file into conversation turns"""
log_file = os.path.join(MEMORY_DIR, f"{date_str}.md")
if not os.path.exists(log_file):
print(f"[DailyBackup] No log file found for {date_str}")
return []
with open(log_file, 'r') as f:
content = f.read()
conversations = []
turn_number = 0
# Split by headers (## [timestamp] ...)
sections = re.split(r'\n##\s+', content)
for section in sections:
if not section.strip():
continue
lines = section.strip().split('\n')
if not lines:
continue
header = lines[0]
body = '\n'.join(lines[1:]).strip()
# Extract user message from header
user_match = re.search(r'\[.*?\]\s*(.+)', header)
if user_match:
user_msg = user_match.group(1)
else:
user_msg = header
# Extract AI response
ai_match = re.search(r'(?:Kimi|Assistant|AI)[:\s]+(.+?)(?=\n##|\Z)', body, re.DOTALL | re.IGNORECASE)
if ai_match:
ai_response = ai_match.group(1).strip()
else:
paragraphs = body.split('\n\n')
if len(paragraphs) > 1:
ai_response = '\n\n'.join(paragraphs[1:]).strip()
else:
ai_response = body
if user_msg and ai_response:
turn_number += 1
conversations.append({
'user': user_msg,
'ai': ai_response,
'turn_number': turn_number,
'date': date_str
})
return conversations
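# Daily log shape parse_daily_log expects (illustrative content):
#   ## [09:14] asked about the backup cadence
#   Kimi: Backups flush to Qdrant at 3:30am via cron.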
def store_conversation_turn(
user_id: str,
user_message: str,
ai_response: str,
conversation_id: str,
turn_number: int,
date_str: str
) -> bool:
"""Store a single conversation turn to Qdrant (Mem0-style)"""
content_hash = get_content_hash(user_message, ai_response)
# Check duplicate
if is_duplicate(user_id, content_hash):
return True # Already stored, skip silently
# Generate embeddings
user_embedding = get_embedding(user_message)
ai_embedding = get_embedding(ai_response)
summary = f"Q: {user_message[:200]}... A: {ai_response[:300]}..."
summary_embedding = get_embedding(summary)
if not all([user_embedding, ai_embedding, summary_embedding]):
return False
tags = ["conversation", "daily-backup", date_str, f"user:{user_id}"]
importance = "high" if any(kw in (user_message + ai_response).lower()
for kw in ["remember", "important", "always", "never", "rule", "decision"]) else "medium"
points = []
# User message
user_id_point = str(uuid.uuid4())
points.append({
"id": user_id_point,
"vector": user_embedding,
"payload": {
"user_id": user_id,
"text": f"[{user_id}]: {user_message}",
"date": date_str,
"tags": tags + ["user-message"],
"importance": importance,
"source": "conversation_daily_backup",
"source_type": "user",
"category": "Full Conversation",
"confidence": "high",
"verified": True,
"created_at": datetime.now().isoformat(),
"access_count": 0,
"last_accessed": datetime.now().isoformat(),
"conversation_id": conversation_id,
"turn_number": turn_number,
"content_hash": content_hash
}
})
# AI response
ai_id = str(uuid.uuid4())
points.append({
"id": ai_id,
"vector": ai_embedding,
"payload": {
"user_id": user_id,
"text": f"[Kimi]: {ai_response}",
"date": date_str,
"tags": tags + ["ai-response"],
"importance": importance,
"source": "conversation_daily_backup",
"source_type": "assistant",
"category": "Full Conversation",
"confidence": "high",
"verified": True,
"created_at": datetime.now().isoformat(),
"access_count": 0,
"last_accessed": datetime.now().isoformat(),
"conversation_id": conversation_id,
"turn_number": turn_number,
"content_hash": content_hash
}
})
# Summary
summary_id = str(uuid.uuid4())
points.append({
"id": summary_id,
"vector": summary_embedding,
"payload": {
"user_id": user_id,
"text": f"[Turn {turn_number}] {summary}",
"date": date_str,
"tags": tags + ["summary", "combined"],
"importance": importance,
"source": "conversation_summary",
"source_type": "system",
"category": "Conversation Summary",
"confidence": "high",
"verified": True,
"created_at": datetime.now().isoformat(),
"access_count": 0,
"last_accessed": datetime.now().isoformat(),
"conversation_id": conversation_id,
"turn_number": turn_number,
"content_hash": content_hash,
"user_message": user_message[:500],
"ai_response": ai_response[:800]
}
})
# Upload to Qdrant
upsert_data = {"points": points}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
data=json.dumps(upsert_data).encode(),
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
return result.get("status") == "ok"
except Exception as e:
print(f"[DailyBackup] Storage error: {e}", file=sys.stderr)
return False
def main():
parser = argparse.ArgumentParser(
description="Daily conversation backup to Qdrant (Mem0-style)"
)
parser.add_argument(
"date",
nargs="?",
help="Date to process (YYYY-MM-DD). Default: yesterday"
)
parser.add_argument(
"--user-id",
default=DEFAULT_USER_ID,
help=f"User ID (default: {DEFAULT_USER_ID})"
)
args = parser.parse_args()
if args.date:
date_str = args.date
else:
yesterday = datetime.now() - timedelta(days=1)
date_str = yesterday.strftime("%Y-%m-%d")
user_id = args.user_id
print(f"📅 Processing daily log for {date_str} (user: {user_id})...")
conversations = parse_daily_log(date_str)
if not conversations:
print(f"⚠️ No conversations found for {date_str}")
sys.exit(0)
print(f"📝 Found {len(conversations)} conversation turns")
stored = 0
skipped = 0
failed = 0
for conv in conversations:
conversation_id = str(uuid.uuid4())
content_hash = get_content_hash(conv['user'], conv['ai'])
if is_duplicate(user_id, content_hash):
skipped += 1
print(f" ⏭️ Turn {conv['turn_number']} skipped (duplicate)")
continue
success = store_conversation_turn(
user_id=user_id,
user_message=conv['user'],
ai_response=conv['ai'],
conversation_id=conversation_id,
turn_number=conv['turn_number'],
date_str=date_str
)
if success:
stored += 1
print(f" ✅ Turn {conv['turn_number']} stored")
else:
failed += 1
print(f" ❌ Turn {conv['turn_number']} failed")
print(f"\n{'='*50}")
print(f"Daily backup complete for {date_str} (user: {user_id}):")
print(f" Stored: {stored} turns ({stored * 3} embeddings)")
print(f" Skipped: {skipped} turns (duplicates)")
print(f" Failed: {failed} turns")
if stored > 0:
print(f"\n✅ Daily backup: {stored} conversations stored to Qdrant")
sys.exit(0 if failed == 0 else 1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,553 @@
#!/usr/bin/env python3
"""
Fact Extraction Script - Parse daily logs and extract atomic memories
This script parses memory/YYYY-MM-DD.md files and extracts individual facts
for storage in Qdrant as atomic memory units (Mem0-style), NOT whole files.
NOTE: Configured for COMPREHENSIVE capture (even minor facts) - user has
abundant storage resources. Thresholds are intentionally low to maximize
memory retention. Use --min-length flag to adjust filtering if needed.
Usage:
extract_facts.py [--date 2026-02-15] [--dry-run] [--batch-size 50]
extract_facts.py --backfill-all # Process all missing dates
Features:
- Parses markdown sections as individual facts
- Generates embeddings per fact (not per file)
- Stores with rich metadata (tags, importance, source)
- Batch upload support
- Duplicate detection
"""
import argparse
import json
import os
import re
import sys
import urllib.request
import urllib.error
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any, Tuple
# Configuration
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_EMBED_URL = "http://localhost:11434/v1"
MEMORY_DIR = Path("/root/.openclaw/workspace/memory")
DEFAULT_BATCH_SIZE = 50
def get_embedding(text: str) -> Optional[List[float]]:
"""Generate embedding using snowflake-arctic-embed2 via Ollama"""
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": text[:8192] # Limit to 8k chars
}).encode()
req = urllib.request.Request(
f"{OLLAMA_EMBED_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
return result["data"][0]["embedding"]
except Exception as e:
print(f"Error generating embedding: {e}", file=sys.stderr)
return None
def batch_get_embeddings(texts: List[str]) -> List[Optional[List[float]]]:
"""Generate embeddings for multiple texts in batch"""
if not texts:
return []
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": [t[:8192] for t in texts]
}).encode()
req = urllib.request.Request(
f"{OLLAMA_EMBED_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=120) as response:
result = json.loads(response.read().decode())
return [d["embedding"] for d in result["data"]]
except Exception as e:
print(f"Error generating batch embeddings: {e}", file=sys.stderr)
return [None] * len(texts)
def parse_markdown_sections(content: str, date_str: str) -> List[Dict[str, Any]]:
"""
Parse markdown content into atomic facts - COMPREHENSIVE CAPTURE.
Extracts EVERYTHING:
- ## Headers as fact categories
- Individual bullet points as atomic facts
- Paragraphs as standalone facts
- Code blocks as facts
- Table rows as facts
- Lines with **bold** as critical rules
- URLs/links as facts
- Key-value pairs (Key: Value)
"""
facts = []
lines = content.split('\n')
current_section = "General"
current_section_content = []
in_code_block = False
code_block_content = []
code_block_language = ""
def flush_section_content():
"""Convert accumulated section content into facts"""
nonlocal current_section_content
if not current_section_content:
return
# Join lines and split into paragraphs
full_text = '\n'.join(current_section_content)
paragraphs = [p.strip() for p in full_text.split('\n\n') if p.strip()]
for para in paragraphs:
if len(para) < 5: # Skip very short fragments
continue
# Split long paragraphs into sentence-level facts
if len(para) > 300:
sentences = [s.strip() for s in para.replace('. ', '.\n').split('\n') if s.strip()]
for sentence in sentences:
if len(sentence) > 10:
facts.append({
"text": f"{current_section}: {sentence[:500]}",
"tags": extract_tags(sentence, date_str),
"importance": "high" if "**" in sentence else "medium",
"source_type": "inferred",
"category": current_section
})
else:
# Store whole paragraph as fact
facts.append({
"text": f"{current_section}: {para[:500]}",
"tags": extract_tags(para, date_str),
"importance": "high" if "**" in para else "medium",
"source_type": "inferred",
"category": current_section
})
current_section_content = []
def extract_tags(text: str, date_str: str) -> List[str]:
"""Extract relevant tags from text"""
tags = ["atomic-fact", date_str]
# Content-based tags
text_lower = text.lower()
tag_mappings = {
"preference": "preferences",
"config": "configuration",
"hardware": "hardware",
"security": "security",
"youtube": "youtube",
"video": "video",
"workflow": "workflow",
"rule": "rules",
"critical": "critical",
"decision": "decisions",
"research": "research",
"process": "process",
"step": "steps",
}
for keyword, tag in tag_mappings.items():
if keyword in text_lower:
tags.append(tag)
return tags
for i, line in enumerate(lines):
line = line.strip()
# Code blocks
if line.startswith('```'):
if in_code_block:
# End of code block
if code_block_content:
code_text = '\n'.join(code_block_content)
facts.append({
"text": f"{current_section} [Code: {code_block_language}]: {code_text[:800]}",
"tags": ["code-block", "atomic-fact", date_str, code_block_language],
"importance": "medium",
"source_type": "inferred",
"category": current_section
})
code_block_content = []
code_block_language = ""
in_code_block = False
else:
# Start of code block
flush_section_content()
in_code_block = True
code_block_language = line[3:].strip() or "text"
continue
if in_code_block:
code_block_content.append(line)
continue
# Skip empty lines
if not line:
flush_section_content()
continue
# Section headers (##)
if line.startswith('## '):
flush_section_content()
current_section = line[3:].strip()
facts.append({
"text": f"Section: {current_section}",
"tags": ["section-header", "atomic-fact", date_str],
"importance": "medium",
"source_type": "inferred",
"category": current_section
})
continue
# Skip main title (# Title)
if line.startswith('# ') and i == 0:
continue
# Bullet points (all levels)
if line.startswith('- ') or line.startswith('* ') or line.startswith('+ '):
flush_section_content()
fact_text = line[2:].strip()
if len(fact_text) > 3:
facts.append({
"text": f"{current_section}: {fact_text[:500]}",
"tags": extract_tags(fact_text, date_str),
"importance": "high" if "**" in fact_text else "medium",
"source_type": "inferred",
"category": current_section
})
continue
# Numbered lists
if re.match(r'^\d+\.\s', line):
flush_section_content()
fact_text = re.sub(r'^\d+\.\s*', '', line)
if len(fact_text) > 3:
facts.append({
"text": f"{current_section}: {fact_text[:500]}",
"tags": extract_tags(fact_text, date_str),
"importance": "high" if "**" in fact_text else "medium",
"source_type": "inferred",
"category": current_section
})
continue
# URLs / Links
url_match = re.search(r'https?://[^\s<>"\')\]]+', line)
if url_match and len(line) < 300:
facts.append({
"text": f"{current_section}: {line[:400]}",
"tags": ["url", "link", "atomic-fact", date_str],
"importance": "medium",
"source_type": "inferred",
"category": current_section
})
continue
# Key-value pairs (Key: Value)
if ':' in line and len(line) < 200 and not line.startswith('**'):
key_part = line.split(':')[0].strip()
if key_part and len(key_part) < 50 and not key_part.startswith('#'):
facts.append({
"text": f"{current_section}: {line[:400]}",
"tags": extract_tags(line, date_str) + ["key-value"],
"importance": "medium",
"source_type": "inferred",
"category": current_section
})
continue
# Bold text / critical rules
if '**' in line:
flush_section_content()
facts.append({
"text": f"{current_section}: {line[:500]}",
"tags": ["critical-rule", "high-priority", date_str],
"importance": "high",
"source_type": "user",
"category": current_section
})
continue
# Table rows (| col1 | col2 |)
if '|' in line and not line.startswith('#'):
cells = [c.strip() for c in line.split('|') if c.strip()]
if cells and not all(c.replace('-', '').replace(':', '') == '' for c in cells):
facts.append({
"text": f"{current_section} [Table]: {' | '.join(cells)[:400]}",
"tags": ["table-row", "atomic-fact", date_str],
"importance": "medium",
"source_type": "inferred",
"category": current_section
})
continue
# Accumulate regular content
if len(line) > 2:
current_section_content.append(line)
# Flush remaining content
flush_section_content()
return facts
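# Illustrative shape of one extracted fact, as produced above (values abridged):
#   {"text": "Hardware: GPU upgraded to ...",
#    "tags": ["atomic-fact", "2026-02-05", "hardware"],
#    "importance": "medium", "source_type": "inferred", "category": "Hardware"}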
def check_existing_facts(date_str: str) -> set:
"""Check which facts from this date are already stored"""
try:
scroll_data = json.dumps({
"limit": 1000,
"with_payload": True,
"filter": {
"must": [{"key": "tags", "match": {"value": date_str}}]
}
}).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
data=scroll_data,
headers={"Content-Type": "application/json"},
method="POST"
)
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
points = result.get("result", {}).get("points", [])
# Return set of text previews (first 100 chars) for comparison
return {p["payload"]["text"][:100] for p in points if "text" in p["payload"]}
except Exception as e:
print(f"Warning: Could not check existing facts: {e}", file=sys.stderr)
return set()
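# Dedup note: facts are compared by the first 100 characters of their text, so a
# reworded fact that shares an identical 100-char prefix is treated as already stored.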
def upload_facts_batch(facts: List[Dict[str, Any]], batch_size: int = 50) -> Tuple[int, int]:
"""Upload facts to Qdrant in batches"""
total = len(facts)
uploaded = 0
failed = 0
for i in range(0, total, batch_size):
batch = facts[i:i + batch_size]
# Generate embeddings for this batch
texts = [f["text"] for f in batch]
embeddings = batch_get_embeddings(texts)
# Prepare points
points = []
for fact, embedding in zip(batch, embeddings):
if embedding is None:
failed += 1
continue
point_id = str(uuid.uuid4())
date_str = fact.get("date", datetime.now().strftime("%Y-%m-%d"))
payload = {
"text": fact["text"],
"date": date_str,
"tags": fact.get("tags", []),
"importance": fact.get("importance", "medium"),
"source": fact.get("source", "fact-extraction"),
"source_type": fact.get("source_type", "inferred"),
"category": fact.get("category", "general"),
"confidence": fact.get("confidence", "high"),
"verified": fact.get("verified", True),
"created_at": datetime.now().isoformat(),
"access_count": 0,
"last_accessed": datetime.now().isoformat()
}
# NOTE: Memories never expire - user requested permanent retention
# No expires_at field set = memories persist indefinitely
points.append({
"id": point_id,
"vector": embedding,
"payload": payload
})
if not points:
continue
# Upload batch
upsert_data = {"points": points}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
data=json.dumps(upsert_data).encode(),
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=60) as response:
result = json.loads(response.read().decode())
if result.get("status") == "ok":
uploaded += len(points)
print(f" ✅ Batch {i//batch_size + 1}: {len(points)} facts uploaded")
else:
print(f" ❌ Batch {i//batch_size + 1}: Failed")
failed += len(points)
except Exception as e:
print(f" ❌ Batch {i//batch_size + 1}: {e}", file=sys.stderr)
failed += len(points)
return uploaded, failed
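# The batch upload above assumes Qdrant's standard points upsert shape, e.g.:
#   PUT /collections/<name>/points?wait=true
#   {"points": [{"id": "<uuid>", "vector": [...1024 floats...], "payload": {...}}]}
# with a {"status": "ok"} body returned on success.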
def process_single_date(date_str: str, dry_run: bool = False, batch_size: int = 50) -> Tuple[int, int]:
"""Process a single date's memory file"""
file_path = MEMORY_DIR / f"{date_str}.md"
if not file_path.exists():
print(f" ⚠️ File not found: {file_path}")
return 0, 0
print(f"Processing {date_str}...")
with open(file_path, 'r') as f:
content = f.read()
# Parse into atomic facts
facts = parse_markdown_sections(content, date_str)
if not facts:
print(f" ⚠️ No facts extracted from {date_str}")
return 0, 0
print(f" 📄 Extracted {len(facts)} atomic facts")
# Check for existing (skip duplicates)
existing = check_existing_facts(date_str)
new_facts = [f for f in facts if f["text"][:100] not in existing]
if existing:
print(f" ⏭️ Skipping {len(facts) - len(new_facts)} duplicates")
if not new_facts:
print(f" ✅ All facts already stored for {date_str}")
return 0, 0
print(f" 📤 Uploading {len(new_facts)} new facts...")
if dry_run:
print(f" [DRY RUN] Would upload {len(new_facts)} facts")
for f in new_facts[:3]: # Show first 3
print(f" - {f['text'][:80]}...")
if len(new_facts) > 3:
print(f" ... and {len(new_facts) - 3} more")
return len(new_facts), 0
# Add date to each fact
for f in new_facts:
f["date"] = date_str
uploaded, failed = upload_facts_batch(new_facts, batch_size)
return uploaded, failed
def get_all_memory_dates() -> List[str]:
"""Get all memory file dates sorted"""
if not MEMORY_DIR.exists():
return []
dates = []
for f in MEMORY_DIR.glob("????-??-??.md"):
dates.append(f.stem)
dates.sort()
return dates
def main():
parser = argparse.ArgumentParser(
description="Extract atomic facts from daily logs and store in Qdrant"
)
parser.add_argument("--date", help="Specific date to process (YYYY-MM-DD)")
parser.add_argument("--backfill-all", action="store_true",
help="Process all memory files")
parser.add_argument("--dry-run", action="store_true",
help="Show what would be stored without uploading")
parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE,
help=f"Batch size for uploads (default: {DEFAULT_BATCH_SIZE})")
parser.add_argument("--force", action="store_true",
help="Re-process even if already stored")
args = parser.parse_args()
print(f"=== Fact Extraction ===")
print(f"Time: {datetime.now().isoformat()}")
print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
print(f"Batch size: {args.batch_size}")
print()
if args.date:
# Single date
uploaded, failed = process_single_date(args.date, args.dry_run, args.batch_size)
print(f"\n{'=' * 50}")
print(f"Summary for {args.date}:")
print(f" Uploaded: {uploaded}")
print(f" Failed: {failed}")
elif args.backfill_all:
# All dates
dates = get_all_memory_dates()
print(f"Found {len(dates)} memory files to process")
print()
total_uploaded = 0
total_failed = 0
for date_str in dates:
uploaded, failed = process_single_date(date_str, args.dry_run, args.batch_size)
total_uploaded += uploaded
total_failed += failed
print()
print(f"{'=' * 50}")
print(f"Total Summary:")
print(f" Files processed: {len(dates)}")
print(f" Total uploaded: {total_uploaded}")
print(f" Total failed: {total_failed}")
else:
# Default to today
today = datetime.now().strftime("%Y-%m-%d")
uploaded, failed = process_single_date(today, args.dry_run, args.batch_size)
print(f"\n{'=' * 50}")
print(f"Summary for {today}:")
print(f" Uploaded: {uploaded}")
print(f" Failed: {failed}")
print()
print("✅ Fact extraction complete!")
print("\nNext steps:")
print(" - Search facts: python3 search_memories.py 'your query'")
print(" - View by date: Check Qdrant with tag filter for date")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,236 @@
#!/usr/bin/env python3
"""
Mem0-Style Conversation Retrieval - User-centric memory search
Retrieves memories by USER, not by session/chat.
Cross-conversation search across all of a user's memories.
Usage:
# Search user's memories across all conversations
python3 scripts/get_conversation_context.py --user-id "rob" "what was the decision about Qdrant?"
# Get specific conversation
python3 scripts/get_conversation_context.py --user-id "rob" --conversation-id <id>
# Get all conversations for user
python3 scripts/get_conversation_context.py --user-id "rob" --limit 50
Mem0-style: Memories belong to USER, not to session.
"""
import argparse
import json
import sys
import urllib.request
from datetime import datetime
from typing import List, Optional, Dict, Any
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_URL = "http://10.0.0.10:11434/v1"
def get_embedding(text: str) -> Optional[List[float]]:
"""Generate embedding using snowflake-arctic-embed2"""
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": text[:8192]
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
return result["data"][0]["embedding"]
except Exception as e:
print(f"[Retrieval] Embedding error: {e}", file=sys.stderr)
return None
def search_user_memories(user_id: str, query: str, limit: int = 10) -> List[Dict]:
"""
MEM0-STYLE: Search memories for a specific user across all conversations.
NOT session-based - user-centric.
"""
embedding = get_embedding(query)
if embedding is None:
return []
# Search with user_id filter (MEM0: memories belong to user)
search_data = json.dumps({
"vector": embedding,
"limit": limit,
"with_payload": True,
"filter": {
"must": [
{"key": "user_id", "match": {"value": user_id}},
{"key": "source_type", "match": {"value": "system"}} # Search summaries
]
}
}).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/search",
data=search_data,
headers={"Content-Type": "application/json"},
method="POST"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("result", [])
except Exception as e:
print(f"[Retrieval] Search error: {e}", file=sys.stderr)
return []
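# Filter semantics (Qdrant "must" clauses AND together): only points whose payload
# has both user_id == <user_id> and source_type == "system" are searched, i.e. the
# per-turn summaries rather than the raw user/assistant messages.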
def get_user_conversations(user_id: str, limit: int = 100) -> List[Dict]:
"""Get all conversations for a user (Mem0-style)"""
scroll_data = json.dumps({
"limit": limit,
"with_payload": True,
"filter": {
"must": [
{"key": "user_id", "match": {"value": user_id}},
{"key": "source_type", "match": {"value": "system"}} # Get summaries
]
}
}).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
data=scroll_data,
headers={"Content-Type": "application/json"},
method="POST"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("result", {}).get("points", [])
except Exception as e:
print(f"[Retrieval] Fetch error: {e}", file=sys.stderr)
return []
def get_conversation_by_id(user_id: str, conversation_id: str, limit: int = 100) -> List[Dict]:
"""Get full conversation by ID (with user verification)"""
scroll_data = json.dumps({
"limit": limit,
"with_payload": True,
"filter": {
"must": [
{"key": "user_id", "match": {"value": user_id}},
{"key": "conversation_id", "match": {"value": conversation_id}}
]
}
}).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
data=scroll_data,
headers={"Content-Type": "application/json"},
method="POST"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("result", {}).get("points", [])
except Exception as e:
print(f"[Retrieval] Fetch error: {e}", file=sys.stderr)
return []
def format_conversation(points: List[Dict]) -> str:
"""Format conversation into readable transcript"""
def sort_key(p):
turn = p.get("payload", {}).get("turn_number", 0)
source = p.get("payload", {}).get("source_type", "")
return (turn, 0 if source in ["user", "assistant"] else 1)
sorted_points = sorted(points, key=sort_key)
output = []
current_turn = 0
for point in sorted_points:
payload = point.get("payload", {})
text = payload.get("text", "")
source = payload.get("source_type", "unknown")
turn = payload.get("turn_number", 0)
date = payload.get("date", "unknown")
user = payload.get("user_id", "unknown")
if payload.get("source") == "conversation_summary":
continue
if turn != current_turn:
output.append(f"\n--- Turn {turn} [{date}] ---")
current_turn = turn
output.append(text)
return "\n".join(output)
def main():
parser = argparse.ArgumentParser(
description="Mem0-style conversation retrieval (user-centric)"
)
parser.add_argument("query", nargs="?", help="Search query")
parser.add_argument("--user-id", required=True,
help="REQUIRED: User ID (e.g., 'rob')")
parser.add_argument("--conversation-id",
help="Get specific conversation")
parser.add_argument("--limit", type=int, default=10,
help="Max results")
parser.add_argument("--format", choices=["transcript", "json"],
default="transcript")
args = parser.parse_args()
if not args.user_id:
print("❌ --user-id is required for Mem0-style retrieval", file=sys.stderr)
sys.exit(1)
points = []
if args.conversation_id:
print(f"🔍 Fetching conversation for user '{args.user_id}': {args.conversation_id}")
points = get_conversation_by_id(args.user_id, args.conversation_id, args.limit * 3)
elif args.query:
print(f"🔍 Searching memories for user '{args.user_id}': {args.query}")
points = search_user_memories(args.user_id, args.query, args.limit)
else:
print(f"🔍 Fetching all memories for user '{args.user_id}'")
points = get_user_conversations(args.user_id, args.limit)
if not points:
print(f"❌ No memories found for user '{args.user_id}'")
sys.exit(1)
if args.format == "json":
print(json.dumps(points, indent=2))
else:
# Group by conversation_id
conversations = {}
for p in points:
convo_id = p.get("payload", {}).get("conversation_id")
if convo_id not in conversations:
conversations[convo_id] = []
conversations[convo_id].append(p)
for i, (convo_id, convo_points) in enumerate(conversations.items(), 1):
print(f"\n{'='*60}")
print(f"📜 Conversation {i}: {convo_id}")
print(f"{'='*60}")
print(format_conversation(convo_points))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,86 @@
#!/usr/bin/env python3
"""
Quick user context for email replies.
Returns recent memory summary, not full conversations.
"""
import json
import sys
import urllib.request
from typing import Optional
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
def get_user_context(user_id: str, limit: int = 5) -> str:
"""Get recent context for user - returns formatted summary."""
# Use scroll to get recent memories for user
data = json.dumps({
"limit": 10, # Get more to find profile
"with_payload": True,
"filter": {
"must": [
{"key": "user_id", "match": {"value": user_id}}
]
}
}).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
data=data,
headers={"Content-Type": "application/json"},
method="POST"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
points = result.get("result", {}).get("points", [])
if not points:
return ""
# Prioritize: 1) Profile info, 2) Recent user message, 3) Recent context
profile = None
recent_user = None
recent_context = []
for point in points:
payload = point.get("payload", {})
text = payload.get("text", "")
source_type = payload.get("source_type", "")
# Look for profile (contains "Profile" or key identifying info)
if "profile" in text.lower() or "lives in" in text.lower():
profile = text[:200]
elif source_type == "user" and not recent_user:
recent_user = text[:150]
elif source_type in ["assistant", "system"]:
clean = text.replace("\r\n", " ").replace("\n", " ")[:150]
recent_context.append(clean)
# Build output: profile first if exists, then recent context
parts = []
if profile:
parts.append(f"[PROFILE] {profile}")
if recent_user:
parts.append(f"[USER] {recent_user}")
if recent_context:
parts.append(f"[CONTEXT] {recent_context[0][:100]}")
return " || ".join(parts) if parts else ""
    except Exception:
        return ""  # Context is optional; fail quietly rather than block the caller
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Get quick user context")
parser.add_argument("--user-id", required=True, help="User ID")
parser.add_argument("--limit", type=int, default=5, help="Max memories")
args = parser.parse_args()
context = get_user_context(args.user_id, args.limit)
if context:
print(context)

View File

@@ -0,0 +1,191 @@
#!/usr/bin/env python3
"""
Harvest specific session files, given as an explicit list (typically newest first).
"""
import argparse
import hashlib
import json
import os
import sys
import urllib.request
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_URL = "http://10.0.0.10:11434/v1"
SESSIONS_DIR = Path("/root/.openclaw/agents/main/sessions")
_recent_hashes = set()
def get_content_hash(user_msg: str, ai_response: str) -> str:
content = f"{user_msg.strip()}::{ai_response.strip()}"
return hashlib.md5(content.encode()).hexdigest()
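# Dedup key example (illustrative): the same Q/A pair always hashes identically, so
#   get_content_hash("deploy it", "Done.") == get_content_hash("deploy it ", " Done.")
# because both sides are stripped before hashing.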
def is_duplicate(user_id: str, content_hash: str) -> bool:
if content_hash in _recent_hashes:
return True
try:
search_body = {
"filter": {
"must": [
{"key": "user_id", "match": {"value": user_id}},
{"key": "content_hash", "match": {"value": content_hash}}
]
},
"limit": 1,
"with_payload": False
}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
data=json.dumps(search_body).encode(),
headers={"Content-Type": "application/json"}
)
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
if result.get("result", {}).get("points", []):
return True
except Exception:
pass
return False
def get_embedding(text: str) -> Optional[List[float]]:
data = json.dumps({"model": "snowflake-arctic-embed2", "input": text[:8192]}).encode()
req = urllib.request.Request(f"{OLLAMA_URL}/embeddings", data=data, headers={"Content-Type": "application/json"})
try:
with urllib.request.urlopen(req, timeout=30) as response:
return json.loads(response.read().decode())["data"][0]["embedding"]
except Exception:
return None
def store_turn(user_id: str, user_msg: str, ai_response: str, date_str: str,
conversation_id: str, turn_number: int, session_id: str) -> bool:
content_hash = get_content_hash(user_msg, ai_response)
if is_duplicate(user_id, content_hash):
return False # Skipped (duplicate)
user_emb = get_embedding(f"[{user_id}]: {user_msg}")
ai_emb = get_embedding(f"[Kimi]: {ai_response}")
summary_emb = get_embedding(f"Q: {user_msg[:200]} A: {ai_response[:300]}")
if not all([user_emb, ai_emb, summary_emb]):
return False
tags = ["conversation", "harvested", f"user:{user_id}", date_str]
importance = "high" if any(kw in (user_msg + ai_response).lower() for kw in ["remember", "important", "always", "never", "rule"]) else "medium"
points = [
{"id": str(uuid.uuid4()), "vector": user_emb, "payload": {
"user_id": user_id, "text": f"[{user_id}]: {user_msg[:2000]}", "date": date_str,
"tags": tags + ["user-message"], "importance": importance, "source": "session_harvest",
"source_type": "user", "category": "Full Conversation", "confidence": "high",
"conversation_id": conversation_id, "turn_number": turn_number, "session_id": session_id, "content_hash": content_hash
}},
{"id": str(uuid.uuid4()), "vector": ai_emb, "payload": {
"user_id": user_id, "text": f"[Kimi]: {ai_response[:2000]}", "date": date_str,
"tags": tags + ["ai-response"], "importance": importance, "source": "session_harvest",
"source_type": "assistant", "category": "Full Conversation", "confidence": "high",
"conversation_id": conversation_id, "turn_number": turn_number, "session_id": session_id, "content_hash": content_hash
}},
{"id": str(uuid.uuid4()), "vector": summary_emb, "payload": {
"user_id": user_id, "text": f"[Turn {turn_number}] Q: {user_msg[:200]} A: {ai_response[:300]}", "date": date_str,
"tags": tags + ["summary"], "importance": importance, "source": "session_harvest",
"source_type": "system", "category": "Conversation Summary", "confidence": "high",
"conversation_id": conversation_id, "turn_number": turn_number, "session_id": session_id,
"content_hash": content_hash, "user_message": user_msg[:500], "ai_response": ai_response[:800]
}}
]
req = urllib.request.Request(f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
data=json.dumps({"points": points}).encode(), headers={"Content-Type": "application/json"}, method="PUT")
try:
with urllib.request.urlopen(req, timeout=30) as response:
if json.loads(response.read().decode()).get("status") == "ok":
_recent_hashes.add(content_hash)
return True
except Exception:
pass
return False
def parse_and_store(filepath: Path, user_id: str) -> tuple:
turns = []
turn_num = 0
try:
with open(filepath, 'r') as f:
for line in f:
line = line.strip()
if not line:
continue
try:
entry = json.loads(line)
if entry.get('type') != 'message' or 'message' not in entry:
continue
msg = entry['message']
role = msg.get('role')
if role == 'toolResult':
continue
content = ""
if isinstance(msg.get('content'), list):
for item in msg['content']:
if isinstance(item, dict) and 'text' in item:
content += item['text']
elif isinstance(msg.get('content'), str):
content = msg['content']
if content and role in ('user', 'assistant'):
turn_num += 1
ts = entry.get('timestamp', '')
turns.append({'turn': turn_num, 'role': role, 'content': content[:2000],
'date': ts[:10] if ts else datetime.now().strftime("%Y-%m-%d")})
except json.JSONDecodeError:
continue
except Exception as e:
print(f" Error: {e}", file=sys.stderr)
return 0, 0
stored, skipped = 0, 0
conv_id = str(uuid.uuid4())
i = 0
    while i < len(turns):
        if turns[i]['role'] == 'user':
            user_msg = turns[i]['content']
            turn_date = turns[i]['date']
            turn_no = turns[i]['turn']
            ai_resp = ""
            if i + 1 < len(turns) and turns[i + 1]['role'] == 'assistant':
                ai_resp = turns[i + 1]['content']
                i += 2
            else:
                i += 1
            if user_msg and ai_resp:
                # Attribute date/turn to the user message that opened the pair,
                # not to whatever index the loop has already advanced past
                if store_turn(user_id, user_msg, ai_resp, turn_date, conv_id, turn_no, filepath.stem):
                    stored += 1
                else:
                    skipped += 1
        else:
            i += 1
return stored, skipped
def main():
parser = argparse.ArgumentParser(description="Harvest sessions by name")
parser.add_argument("--user-id", default="yourname")
parser.add_argument("sessions", nargs="*", help="Session filenames to process")
args = parser.parse_args()
total_stored, total_skipped = 0, 0
for i, name in enumerate(args.sessions, 1):
path = SESSIONS_DIR / name
if not path.exists():
print(f"[{i}] Not found: {name}")
continue
print(f"[{i}] {name}")
s, sk = parse_and_store(path, args.user_id)
total_stored += s
total_skipped += sk
if s > 0:
print(f" Stored: {s}, Skipped: {sk}")
print(f"\nTotal: {total_stored} stored, {total_skipped} skipped")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,341 @@
#!/usr/bin/env python3
"""
Harvest all session JSONL files and store to Qdrant.
Scans all session files, extracts conversation turns, and stores to Qdrant
with proper user_id and deduplication.
Usage: python3 harvest_sessions.py [--user-id rob] [--dry-run]
"""
import argparse
import hashlib
import json
import os
import sys
import urllib.request
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_URL = "http://10.0.0.10:11434/v1"
SESSIONS_DIR = Path("/root/.openclaw/agents/main/sessions")
# In-memory cache for deduplication
_recent_hashes = set()
def get_content_hash(user_msg: str, ai_response: str) -> str:
"""Generate hash for deduplication"""
content = f"{user_msg.strip()}::{ai_response.strip()}"
return hashlib.md5(content.encode()).hexdigest()
def is_duplicate(user_id: str, content_hash: str) -> bool:
"""Check if this content already exists for this user"""
if content_hash in _recent_hashes:
return True
try:
search_body = {
"filter": {
"must": [
{"key": "user_id", "match": {"value": user_id}},
{"key": "content_hash", "match": {"value": content_hash}}
]
},
"limit": 1,
"with_payload": False
}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
data=json.dumps(search_body).encode(),
headers={"Content-Type": "application/json"}
)
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
points = result.get("result", {}).get("points", [])
if len(points) > 0:
return True
except Exception:
pass
return False
def get_embedding(text: str) -> Optional[List[float]]:
"""Generate embedding using snowflake-arctic-embed2"""
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": text[:8192]
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
return result["data"][0]["embedding"]
except Exception as e:
print(f"[Harvest] Embedding error: {e}", file=sys.stderr)
return None
def store_turn(user_id: str, user_msg: str, ai_response: str,
date_str: str, conversation_id: str, turn_number: int,
session_id: str, dry_run: bool = False) -> Dict:
"""Store a single conversation turn to Qdrant"""
content_hash = get_content_hash(user_msg, ai_response)
# Check duplicate
if is_duplicate(user_id, content_hash):
return {"skipped": True, "reason": "duplicate"}
if dry_run:
return {"skipped": False, "dry_run": True}
# Generate embeddings
user_embedding = get_embedding(f"[{user_id}]: {user_msg}")
ai_embedding = get_embedding(f"[Kimi]: {ai_response}")
summary = f"Q: {user_msg[:200]} A: {ai_response[:300]}..."
summary_embedding = get_embedding(summary)
if not all([user_embedding, ai_embedding, summary_embedding]):
return {"skipped": True, "reason": "embedding_failed"}
tags = ["conversation", "harvested", f"user:{user_id}", date_str]
importance = "high" if any(kw in (user_msg + ai_response).lower()
for kw in ["remember", "important", "always", "never", "rule"]) else "medium"
points = []
# User message
points.append({
"id": str(uuid.uuid4()),
"vector": user_embedding,
"payload": {
"user_id": user_id,
"text": f"[{user_id}]: {user_msg[:2000]}",
"date": date_str,
"tags": tags + ["user-message"],
"importance": importance,
"source": "session_harvest",
"source_type": "user",
"category": "Full Conversation",
"confidence": "high",
"verified": True,
"created_at": datetime.now().isoformat(),
"conversation_id": conversation_id,
"turn_number": turn_number,
"session_id": session_id,
"content_hash": content_hash
}
})
# AI response
points.append({
"id": str(uuid.uuid4()),
"vector": ai_embedding,
"payload": {
"user_id": user_id,
"text": f"[Kimi]: {ai_response[:2000]}",
"date": date_str,
"tags": tags + ["ai-response"],
"importance": importance,
"source": "session_harvest",
"source_type": "assistant",
"category": "Full Conversation",
"confidence": "high",
"verified": True,
"created_at": datetime.now().isoformat(),
"conversation_id": conversation_id,
"turn_number": turn_number,
"session_id": session_id,
"content_hash": content_hash
}
})
# Summary
if summary_embedding:
points.append({
"id": str(uuid.uuid4()),
"vector": summary_embedding,
"payload": {
"user_id": user_id,
"text": f"[Turn {turn_number}] {summary}",
"date": date_str,
"tags": tags + ["summary"],
"importance": importance,
"source": "session_harvest_summary",
"source_type": "system",
"category": "Conversation Summary",
"confidence": "high",
"verified": True,
"created_at": datetime.now().isoformat(),
"conversation_id": conversation_id,
"turn_number": turn_number,
"session_id": session_id,
"content_hash": content_hash,
"user_message": user_msg[:500],
"ai_response": ai_response[:800]
}
})
# Upload
upsert_data = {"points": points}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
data=json.dumps(upsert_data).encode(),
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
if result.get("status") == "ok":
_recent_hashes.add(content_hash)
return {"skipped": False, "stored": True}
except Exception as e:
print(f"[Harvest] Storage error: {e}", file=sys.stderr)
return {"skipped": True, "reason": "upload_failed"}
def parse_session_file(filepath: Path) -> List[Dict]:
"""Parse a session JSONL file and extract conversation turns"""
turns = []
turn_number = 0
try:
with open(filepath, 'r') as f:
for line in f:
line = line.strip()
if not line:
continue
try:
entry = json.loads(line)
if entry.get('type') == 'message' and 'message' in entry:
msg = entry['message']
role = msg.get('role')
if role == 'toolResult':
continue
content = ""
if isinstance(msg.get('content'), list):
for item in msg['content']:
if isinstance(item, dict):
if 'text' in item:
content += item['text']
elif 'thinking' in item:
content += f"[thinking: {item['thinking'][:200]}...]"
elif isinstance(msg.get('content'), str):
content = msg['content']
if content and role in ('user', 'assistant'):
turn_number += 1
timestamp = entry.get('timestamp', '')
date_str = timestamp[:10] if timestamp else datetime.now().strftime("%Y-%m-%d")
turns.append({
'turn': turn_number,
'role': role,
'content': content[:2000],
'date': date_str,
'session': filepath.stem
})
except json.JSONDecodeError:
continue
except Exception as e:
print(f"[Harvest] Error reading {filepath}: {e}", file=sys.stderr)
return turns
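# Assumed JSONL entry shape for session files (one JSON object per line):
#   {"type": "message", "timestamp": "2026-02-05T14:03:11Z",
#    "message": {"role": "user", "content": [{"text": "..."}]}}
# Entries of other types, toolResult roles, and empty content are skipped.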
def main():
parser = argparse.ArgumentParser(description="Harvest session files to Qdrant")
parser.add_argument("--user-id", default="yourname", help="User ID for storage")
parser.add_argument("--dry-run", action="store_true", help="Don't actually store")
parser.add_argument("--limit", type=int, default=0, help="Limit sessions (0=all)")
args = parser.parse_args()
# Find all session files
session_files = sorted(SESSIONS_DIR.glob("*.jsonl"), key=lambda p: p.stat().st_mtime)
if args.limit > 0:
session_files = session_files[:args.limit]
print(f"Found {len(session_files)} session files")
total_stored = 0
total_skipped = 0
total_failed = 0
for i, session_file in enumerate(session_files, 1):
print(f"\n[{i}/{len(session_files)}] Processing: {session_file.name}")
turns = parse_session_file(session_file)
if not turns:
print(" No turns found")
continue
print(f" Found {len(turns)} turns")
# Pair user messages with AI responses
conversation_id = str(uuid.uuid4())
j = 0
while j < len(turns):
turn = turns[j]
if turn['role'] == 'user':
user_msg = turn['content']
ai_response = ""
# Look for next AI response
if j + 1 < len(turns) and turns[j + 1]['role'] == 'assistant':
ai_response = turns[j + 1]['content']
j += 2
else:
j += 1
if user_msg and ai_response:
result = store_turn(
user_id=args.user_id,
user_msg=user_msg,
ai_response=ai_response,
date_str=turn['date'],
conversation_id=conversation_id,
turn_number=turn['turn'],
session_id=turn['session'],
dry_run=args.dry_run
)
if result.get("skipped"):
if result.get("reason") == "duplicate":
total_skipped += 1
else:
total_failed += 1
else:
total_stored += 1
if total_stored % 10 == 0:
print(f" Progress: {total_stored} stored, {total_skipped} skipped")
else:
j += 1
print(f"\n{'='*50}")
print(f"Harvest complete:")
print(f" Stored: {total_stored} turns ({total_stored * 3} embeddings)")
print(f" Skipped (duplicates): {total_skipped}")
print(f" Failed: {total_failed}")
if args.dry_run:
print("\n[DRY RUN] Nothing was actually stored")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,186 @@
#!/usr/bin/env python3
"""
Email checker for heartbeat using Redis ID tracking.
Tracks seen email IDs in Redis so new mail is detected even after it has been marked read.
Stores emails to Qdrant with sender-specific user_id for memory.
Only alerts on emails from authorized senders.
"""
import imaplib
import email
from email.policy import default
import json
import sys
import redis
import subprocess
from datetime import datetime
# Authorized senders with their user IDs for Qdrant storage
# Add your authorized emails here
AUTHORIZED_SENDERS = {
# "your_email@gmail.com": "yourname",
# "spouse_email@gmail.com": "spousename"
}
# Gmail IMAP settings
IMAP_SERVER = "imap.gmail.com"
IMAP_PORT = 993
# Redis config
REDIS_HOST = "10.0.0.36"
REDIS_PORT = 6379
REDIS_KEY = "email:seen_ids"
# Load credentials
CRED_FILE = "/root/.openclaw/workspace/.gmail_imap.json"
def load_credentials():
try:
with open(CRED_FILE, 'r') as f:
return json.load(f)
except Exception as e:
return None
def get_redis():
try:
r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
r.ping() # Test connection
return r
except Exception as e:
return None
def store_email_memory(user_id, sender, subject, body, date):
"""Store email to Qdrant as memory for the user."""
try:
# Format as conversation-like entry
email_text = f"[EMAIL from {sender}]\nSubject: {subject}\n\n{body}"
# Store using background_store.py (fire-and-forget)
script_path = "/root/.openclaw/workspace/skills/qdrant-memory/scripts/background_store.py"
subprocess.Popen([
"python3", script_path,
f"[Email] {subject}",
email_text,
"--user-id", user_id
], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
except Exception as e:
pass # Silent fail
def get_user_context(user_id):
"""Fetch recent context from Qdrant for the user."""
try:
script_path = "/root/.openclaw/workspace/skills/qdrant-memory/scripts/get_user_context.py"
result = subprocess.run([
"python3", script_path,
"--user-id", user_id,
"--limit", "3"
], capture_output=True, text=True, timeout=10)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
except Exception as e:
pass
return None
def check_emails():
creds = load_credentials()
if not creds:
return # Silent fail
email_addr = creds.get("email")
app_password = creds.get("app_password")
if not email_addr or not app_password:
return # Silent fail
r = get_redis()
if not r:
return # Silent fail if Redis unavailable
try:
# Connect to IMAP
mail = imaplib.IMAP4_SSL(IMAP_SERVER, IMAP_PORT)
mail.login(email_addr, app_password)
mail.select("inbox")
# Get ALL emails (not just unseen)
status, messages = mail.search(None, "ALL")
if status != "OK" or not messages[0]:
mail.logout()
return # No emails
email_ids = messages[0].split()
# Get already-seen IDs from Redis
seen_ids = set(r.smembers(REDIS_KEY))
# Check last 10 emails for new ones
for eid in email_ids[-10:]:
eid_str = eid.decode() if isinstance(eid, bytes) else str(eid)
# Skip if already seen
if eid_str in seen_ids:
continue
status, msg_data = mail.fetch(eid, "(RFC822)")
if status != "OK":
continue
msg = email.message_from_bytes(msg_data[0][1], policy=default)
sender = msg.get("From", "").lower()
subject = msg.get("Subject", "")
date = msg.get("Date", "")
# Extract email body
body = ""
if msg.is_multipart():
for part in msg.walk():
if part.get_content_type() == "text/plain":
body = part.get_content()
break
else:
body = msg.get_content()
# Clean up body (limit size)
body = body.strip()[:2000] if body else ""
# Check if sender is authorized and get their user_id
user_id = None
for auth_email, uid in AUTHORIZED_SENDERS.items():
if auth_email.lower() in sender:
user_id = uid
break
# Mark as seen in Redis regardless of sender (avoid re-checking)
r.sadd(REDIS_KEY, eid_str)
if user_id:
# Store to Qdrant for memory
store_email_memory(user_id, sender, subject, body, date)
# Get user context from Qdrant before alerting
context = get_user_context(user_id)
# Output for Kimi to respond (with context hint)
print(f"[EMAIL] User: {user_id} | From: {sender.strip()} | Subject: {subject} | Date: {date}")
if context:
print(f"[CONTEXT] {context}")
# Cleanup old IDs (keep last 100)
all_ids = r.smembers(REDIS_KEY)
if len(all_ids) > 100:
# Convert to int, sort, keep only highest 100
id_ints = sorted([int(x) for x in all_ids if x.isdigit()])
to_remove = id_ints[:-100]
for old_id in to_remove:
r.srem(REDIS_KEY, str(old_id))
mail.close()
mail.logout()
except Exception as e:
# Silent fail - no output
pass
if __name__ == "__main__":
check_emails()
sys.exit(0)

View File

@@ -0,0 +1,135 @@
#!/usr/bin/env python3
"""
Hybrid search: Search both file-based memory and Qdrant vectors
Usage: hybrid_search.py "Query text" [--file-limit 3] [--vector-limit 3]
"""
import argparse
import json
import os
import subprocess
import sys
import re
from datetime import datetime, timedelta
WORKSPACE = "/root/.openclaw/workspace"
MEMORY_DIR = f"{WORKSPACE}/memory"
def search_files(query, limit=3):
"""Search recent memory files for keyword matches"""
results = []
# Get recent memory files (last 30 days)
files = []
today = datetime.now()
for i in range(30):
date_str = (today - timedelta(days=i)).strftime("%Y-%m-%d")
filepath = f"{MEMORY_DIR}/{date_str}.md"
if os.path.exists(filepath):
files.append((date_str, filepath))
# Simple keyword search
query_lower = query.lower()
keywords = set(query_lower.split())
for date_str, filepath in files[:7]: # Check last 7 days max
try:
with open(filepath, 'r') as f:
content = f.read()
# Find sections that match
lines = content.split('\n')
for i, line in enumerate(lines):
line_lower = line.lower()
if any(kw in line_lower for kw in keywords):
# Get context (3 lines before and after)
start = max(0, i - 3)
end = min(len(lines), i + 4)
context = '\n'.join(lines[start:end])
# Simple relevance score based on keyword matches
score = sum(1 for kw in keywords if kw in line_lower) / len(keywords)
results.append({
"source": f"file:{filepath}",
"date": date_str,
"score": score,
"text": context.strip(),
"type": "file"
})
if len(results) >= limit * 2: # Get more then dedupe
break
except Exception as e:
continue
# Sort by score and return top N
results.sort(key=lambda x: x["score"], reverse=True)
return results[:limit]
def search_qdrant(query, limit=3):
"""Search Qdrant using the search_memories script"""
try:
script_path = f"{WORKSPACE}/skills/qdrant-memory/scripts/search_memories.py"
result = subprocess.run(
["python3", script_path, query, "--limit", str(limit), "--json"],
capture_output=True, text=True, timeout=60
)
if result.returncode == 0:
memories = json.loads(result.stdout)
for m in memories:
m["type"] = "vector"
m["source"] = "qdrant"
return memories
except Exception as e:
print(f"Qdrant search failed (falling back to files only): {e}", file=sys.stderr)
return []
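# search_memories.py --json is assumed to emit a JSON array of objects shaped like:
#   [{"text": "...", "score": 0.83, "date": "2026-02-05", "importance": "high", "tags": [...]}]
# (these are the fields the formatter below reads; extra keys pass through untouched).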
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Hybrid memory search")
parser.add_argument("query", help="Search query")
parser.add_argument("--file-limit", type=int, default=3, help="Max file results")
parser.add_argument("--vector-limit", type=int, default=3, help="Max vector results")
parser.add_argument("--json", action="store_true", help="Output as JSON")
args = parser.parse_args()
print(f"Searching for: '{args.query}'\n", file=sys.stderr)
# Search both sources
file_results = search_files(args.query, args.file_limit)
vector_results = search_qdrant(args.query, args.vector_limit)
# Combine results
all_results = file_results + vector_results
if not all_results:
print("No memories found matching your query.")
sys.exit(0)
if args.json:
print(json.dumps(all_results, indent=2))
else:
print(f"📁 File-based results ({len(file_results)}):")
print("-" * 50)
for r in file_results:
print(f"[{r['date']}] Score: {r['score']:.2f}")
print(r['text'][:300])
if len(r['text']) > 300:
print("...")
print()
print(f"\n🔍 Vector (Qdrant) results ({len(vector_results)}):")
print("-" * 50)
for r in vector_results:
print(f"[{r.get('date', 'unknown')}] Score: {r.get('score', 0):.3f} [{r.get('importance', 'medium')}]")
text = r.get('text', '')
print(text[:300])
if len(text) > 300:
print("...")
if r.get('tags'):
print(f"Tags: {', '.join(r['tags'])}")
print()

View File

@@ -0,0 +1,242 @@
#!/usr/bin/env python3
"""
Initialize Qdrant collections for Kimi Memory System
Creates 3 collections with snowflake-arctic-embed2 (1024 dims) using Qdrant 2025 best practices:
1. kimi_memories - Personal memories, preferences, lessons learned
2. kimi_kb - Knowledge base for web search, documents, scraped data
3. private_court_docs - Court documents and legal discussions
Features:
- on_disk=True for vectors (minimize RAM usage)
- on_disk_payload=True for payload
- Optimizer config for efficient indexing
- Binary quantization support (2025+ feature)
Usage: init_all_collections.py [--recreate]
"""
import argparse
import json
import sys
QDRANT_URL = "http://10.0.0.40:6333"
# Collection configurations
COLLECTIONS = {
"kimi_memories": {
"description": "Personal memories, preferences, lessons learned",
"vector_size": 1024
},
"kimi_kb": {
"description": "Knowledge base - web data, documents, reference materials",
"vector_size": 1024
},
"private_court_docs": {
"description": "Court documents and legal discussions",
"vector_size": 1024
}
}
def make_request(url, data=None, method="GET"):
"""Make HTTP request with proper method"""
import urllib.request
req = urllib.request.Request(url, method=method)
if data:
req.data = json.dumps(data).encode()
req.add_header("Content-Type", "application/json")
return req
def collection_exists(name):
"""Check if collection exists"""
import urllib.request
import urllib.error
try:
req = make_request(f"{QDRANT_URL}/collections/{name}")
with urllib.request.urlopen(req, timeout=5) as response:
return True
except urllib.error.HTTPError as e:
if e.code == 404:
return False
raise
except Exception:
return False
def get_collection_info(name):
"""Get collection info"""
import urllib.request
try:
req = make_request(f"{QDRANT_URL}/collections/{name}")
with urllib.request.urlopen(req, timeout=5) as response:
return json.loads(response.read().decode())
except Exception as e:
return None
def create_collection(name, vector_size=1024):
"""Create a collection with Qdrant 2025 best practices"""
import urllib.request
config = {
"vectors": {
"size": vector_size,
"distance": "Cosine",
"on_disk": True, # Store vectors on disk to minimize RAM
"quantization_config": {
"binary": {
"always_ram": True # Keep compressed vectors in RAM for fast search
}
}
},
"on_disk_payload": True, # Store payload on disk
"shard_number": 1, # Single node setup
"replication_factor": 1, # Single copy (set to 2 for production with HA)
"optimizers_config": {
"indexing_threshold": 20000, # Start indexing after 20k points
"default_segment_number": 0, # Fewer/larger segments for better throughput
"deleted_threshold": 0.2, # Vacuum when 20% deleted
"vacuum_min_vector_number": 1000 # Min vectors before vacuum
}
}
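    # Note: binary quantization and these optimizer fields assume a reasonably
    # recent Qdrant release; an older server may reject this config.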
req = make_request(
f"{QDRANT_URL}/collections/{name}",
data=config,
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("result") == True
except Exception as e:
print(f"Error creating collection {name}: {e}", file=sys.stderr)
return False
def delete_collection(name):
"""Delete a collection"""
import urllib.request
req = make_request(f"{QDRANT_URL}/collections/{name}", method="DELETE")
try:
with urllib.request.urlopen(req, timeout=5) as response:
result = json.loads(response.read().decode())
return result.get("status") == "ok"
except Exception as e:
print(f"Error deleting collection {name}: {e}", file=sys.stderr)
return False
def main():
import urllib.request
parser = argparse.ArgumentParser(description="Initialize all Qdrant collections with 2025 best practices")
parser.add_argument("--recreate", action="store_true", help="Delete and recreate all collections")
parser.add_argument("--force", action="store_true", help="Force recreate even with existing data")
args = parser.parse_args()
# Check Qdrant connection
try:
req = urllib.request.Request(f"{QDRANT_URL}/")
with urllib.request.urlopen(req, timeout=3) as response:
pass
except Exception as e:
print(f"❌ Cannot connect to Qdrant at {QDRANT_URL}: {e}", file=sys.stderr)
sys.exit(1)
print(f"✅ Connected to Qdrant at {QDRANT_URL}\n")
# Check if Ollama is available for embeddings
try:
req = urllib.request.Request("http://localhost:11434/api/tags")
with urllib.request.urlopen(req, timeout=3) as response:
            ollama_status = "✅"
except Exception:
ollama_status = "⚠️"
print(f"Ollama (localhost): {ollama_status} - Embeddings endpoint\n")
created = []
skipped = []
errors = []
recreated = []
for name, config in COLLECTIONS.items():
print(f"--- {name} ---")
print(f" Description: {config['description']}")
exists = collection_exists(name)
if exists:
info = get_collection_info(name)
if info:
actual_size = info.get("result", {}).get("config", {}).get("params", {}).get("vectors", {}).get("size", "?")
points = info.get("result", {}).get("points_count", 0)
on_disk = info.get("result", {}).get("config", {}).get("params", {}).get("vectors", {}).get("on_disk", False)
print(f" Existing collection:")
print(f" Points: {points}")
print(f" Vector size: {actual_size}")
print(f" On disk: {on_disk}")
if args.recreate:
if points > 0 and not args.force:
print(f" ⚠️ Collection has {points} points. Use --force to recreate with data loss.")
skipped.append(name)
continue
print(f" Deleting existing collection...")
if delete_collection(name):
print(f" ✅ Deleted")
exists = False
else:
print(f" ❌ Failed to delete", file=sys.stderr)
errors.append(name)
continue
else:
print(f" ⚠️ Already exists, skipping (use --recreate to update)")
skipped.append(name)
continue
if not exists:
print(f" Creating collection with 2025 best practices...")
print(f" - on_disk=True (vectors)")
print(f" - on_disk_payload=True")
print(f" - Binary quantization")
print(f" - Optimizer config")
if create_collection(name, config["vector_size"]):
print(f" ✅ Created (vector size: {config['vector_size']})")
                if args.recreate:
                    recreated.append(name)
                else:
                    created.append(name)
else:
print(f" ❌ Failed to create", file=sys.stderr)
errors.append(name)
print()
# Summary
print("=" * 50)
print("SUMMARY:")
if created:
print(f" Created: {', '.join(created)}")
if recreated:
print(f" Recreated: {', '.join(recreated)}")
if skipped:
print(f" Skipped: {', '.join(skipped)}")
if errors:
print(f" Errors: {', '.join(errors)}")
sys.exit(1)
print("\n🎉 All collections ready with 2025 best practices!")
print("\nCollections configured for snowflake-arctic-embed2 (1024 dims)")
print("- kimi_memories: Personal memories (on_disk=True)")
print("- kimi_kb: Knowledge base (on_disk=True)")
print("- private_court_docs: Court documents (on_disk=True)")
print("\nFeatures enabled:")
print(" ✓ Vectors stored on disk (minimizes RAM)")
print(" ✓ Payload stored on disk")
print(" ✓ Binary quantization for fast search")
print(" ✓ Optimized indexing thresholds")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""
Initialize kimi_kb collection (Knowledge Base)
Vector size: 1024 (snowflake-arctic-embed2)
Usage: init_kimi_kb.py [--recreate]
"""
import argparse
import sys
import urllib.request
import json
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_kb"
VECTOR_SIZE = 1024
def make_request(url, data=None, method="GET"):
req = urllib.request.Request(url, method=method)
if data:
req.data = json.dumps(data).encode()
req.add_header("Content-Type", "application/json")
return req
def collection_exists():
try:
req = make_request(f"{QDRANT_URL}/collections/{COLLECTION_NAME}")
with urllib.request.urlopen(req, timeout=5) as response:
return True
except urllib.error.HTTPError as e:
if e.code == 404:
return False
raise
except Exception:
return False
def get_info():
try:
req = make_request(f"{QDRANT_URL}/collections/{COLLECTION_NAME}")
with urllib.request.urlopen(req, timeout=5) as response:
return json.loads(response.read().decode())
except Exception:
return None
def create_collection():
config = {
"vectors": {
"size": VECTOR_SIZE,
"distance": "Cosine"
}
}
req = make_request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}",
data=config,
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("result") == True
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
return False
def delete_collection():
req = make_request(f"{QDRANT_URL}/collections/{COLLECTION_NAME}", method="DELETE")
try:
with urllib.request.urlopen(req, timeout=5) as response:
return json.loads(response.read().decode()).get("status") == "ok"
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
return False
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Initialize kimi_kb collection")
parser.add_argument("--recreate", action="store_true", help="Delete and recreate")
args = parser.parse_args()
try:
req = make_request(f"{QDRANT_URL}/")
with urllib.request.urlopen(req, timeout=3) as response:
pass
except Exception as e:
print(f"❌ Cannot connect to Qdrant: {e}", file=sys.stderr)
sys.exit(1)
print(f"✅ Qdrant: {QDRANT_URL}")
print(f"Collection: {COLLECTION_NAME}")
print(f"Vector size: {VECTOR_SIZE} (snowflake-arctic-embed2)\n")
exists = collection_exists()
if exists:
if args.recreate:
print(f"Deleting existing...")
delete_collection()
exists = False
else:
info = get_info()
if info:
size = info.get("result", {}).get("config", {}).get("params", {}).get("vectors", {}).get("size", "?")
points = info.get("result", {}).get("points_count", 0)
print(f"⚠️ Already exists (vector size: {size}, points: {points})")
sys.exit(0)
if not exists:
if create_collection():
print(f"✅ Created {COLLECTION_NAME}")
print(f" Vector size: {VECTOR_SIZE}, Distance: Cosine")
else:
print(f"❌ Failed", file=sys.stderr)
sys.exit(1)

View File

@@ -0,0 +1,114 @@
#!/usr/bin/env python3
"""
Initialize kimi_memories collection (Personal Memories)
Vector size: 1024 (snowflake-arctic-embed2)
Usage: init_kimi_memories.py [--recreate]
"""
import argparse
import sys
import urllib.request
import json
import os
QDRANT_URL = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
COLLECTION_NAME = os.getenv("QDRANT_COLLECTION", "kimi_memories")
VECTOR_SIZE = int(os.getenv("QDRANT_VECTOR_SIZE", "1024"))
def make_request(url, data=None, method="GET"):
req = urllib.request.Request(url, method=method)
if data:
req.data = json.dumps(data).encode()
req.add_header("Content-Type", "application/json")
return req
def collection_exists():
try:
req = make_request(f"{QDRANT_URL}/collections/{COLLECTION_NAME}")
with urllib.request.urlopen(req, timeout=5) as response:
return True
except urllib.error.HTTPError as e:
if e.code == 404:
return False
raise
except Exception:
return False
def get_info():
try:
req = make_request(f"{QDRANT_URL}/collections/{COLLECTION_NAME}")
with urllib.request.urlopen(req, timeout=5) as response:
return json.loads(response.read().decode())
except Exception:
return None
def create_collection():
config = {
"vectors": {
"size": VECTOR_SIZE,
"distance": "Cosine"
}
}
req = make_request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}",
data=config,
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("result") == True
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
return False
def delete_collection():
req = make_request(f"{QDRANT_URL}/collections/{COLLECTION_NAME}", method="DELETE")
try:
with urllib.request.urlopen(req, timeout=5) as response:
return json.loads(response.read().decode()).get("status") == "ok"
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
return False
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Initialize kimi_memories collection")
parser.add_argument("--recreate", action="store_true", help="Delete and recreate")
args = parser.parse_args()
try:
req = make_request(f"{QDRANT_URL}/")
with urllib.request.urlopen(req, timeout=3) as response:
pass
except Exception as e:
print(f"❌ Cannot connect to Qdrant: {e}", file=sys.stderr)
sys.exit(1)
print(f"✅ Qdrant: {QDRANT_URL}")
print(f"Collection: {COLLECTION_NAME}")
print(f"Vector size: {VECTOR_SIZE} (snowflake-arctic-embed2)\n")
exists = collection_exists()
if exists:
if args.recreate:
print(f"Deleting existing...")
delete_collection()
exists = False
else:
info = get_info()
if info:
size = info.get("result", {}).get("config", {}).get("params", {}).get("vectors", {}).get("size", "?")
points = info.get("result", {}).get("points_count", 0)
print(f"⚠️ Already exists (vector size: {size}, points: {points})")
sys.exit(0)
if not exists:
if create_collection():
print(f"✅ Created {COLLECTION_NAME}")
print(f" Vector size: {VECTOR_SIZE}, Distance: Cosine")
else:
print(f"❌ Failed", file=sys.stderr)
sys.exit(1)

View File

@@ -0,0 +1,113 @@
#!/usr/bin/env python3
"""
Initialize Qdrant collection for Projects
Usage: init_projects_collection.py [--recreate]
"""
import argparse
import sys
import urllib.request
import json
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "projects"
def make_request(url, data=None, method="GET"):
"""Make HTTP request with proper method"""
req = urllib.request.Request(url, method=method)
if data:
req.data = json.dumps(data).encode()
req.add_header("Content-Type", "application/json")
return req
def collection_exists():
"""Check if collection exists"""
try:
req = make_request(f"{QDRANT_URL}/collections/{COLLECTION_NAME}")
with urllib.request.urlopen(req, timeout=5) as response:
return True
except urllib.error.HTTPError as e:
if e.code == 404:
return False
raise
except Exception as e:
print(f"Error checking collection: {e}", file=sys.stderr)
return False
def create_collection():
"""Create the projects collection using PUT"""
config = {
"vectors": {
"size": 768, # nomic-embed-text outputs 768 dimensions
"distance": "Cosine"
}
}
req = make_request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}",
data=config,
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("result") == True
except Exception as e:
print(f"Error creating collection: {e}", file=sys.stderr)
return False
def delete_collection():
"""Delete collection if exists"""
req = make_request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}",
method="DELETE"
)
try:
with urllib.request.urlopen(req, timeout=5) as response:
return True
except Exception as e:
print(f"Error deleting collection: {e}", file=sys.stderr)
return False
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Initialize Qdrant projects collection")
parser.add_argument("--recreate", action="store_true", help="Delete and recreate collection")
args = parser.parse_args()
# Check if Qdrant is reachable
try:
req = make_request(f"{QDRANT_URL}/")
with urllib.request.urlopen(req, timeout=3) as response:
pass
except Exception as e:
print(f"❌ Cannot connect to Qdrant at {QDRANT_URL}: {e}", file=sys.stderr)
sys.exit(1)
print(f"✅ Connected to Qdrant at {QDRANT_URL}")
exists = collection_exists()
if exists and args.recreate:
print(f"Deleting existing collection '{COLLECTION_NAME}'...")
if delete_collection():
print(f"✅ Deleted collection")
exists = False
else:
print(f"❌ Failed to delete collection", file=sys.stderr)
sys.exit(1)
if not exists:
print(f"Creating collection '{COLLECTION_NAME}'...")
if create_collection():
print(f"✅ Created collection '{COLLECTION_NAME}'")
print(f" Vector size: 768, Distance: Cosine")
else:
print(f"❌ Failed to create collection", file=sys.stderr)
sys.exit(1)
else:
print(f"✅ Collection '{COLLECTION_NAME}' already exists")
print("\n🎉 Qdrant projects collection ready!")

View File

@@ -0,0 +1,190 @@
#!/usr/bin/env python3
"""
JavaScript Scraper - Headless browser for JS-heavy sites
Uses Playwright to render dynamic content before scraping
Usage: js_scraper.py <url> --domain "React" --path "Docs/Hooks" --wait-for "#content"
"""
import argparse
import sys
import json
from datetime import datetime
from pathlib import Path
from playwright.sync_api import sync_playwright
sys.path.insert(0, str(Path(__file__).parent))
from scrape_to_kb import chunk_text, get_embedding, compute_checksum, store_in_kb
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "knowledge_base"
def scrape_js_site(url, wait_for=None, wait_time=2000, scroll=False, viewport=None):
"""Scrape JavaScript-rendered site using Playwright"""
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
context_options = {}
if viewport:
context_options["viewport"] = {"width": viewport[0], "height": viewport[1]}
context = browser.new_context(**context_options)
page = context.new_page()
# Set user agent
page.set_extra_http_headers({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
})
try:
print(f"🌐 Loading {url}...")
page.goto(url, wait_until="networkidle", timeout=30000)
# Wait for specific element if requested
if wait_for:
print(f"⏳ Waiting for {wait_for}...")
page.wait_for_selector(wait_for, timeout=10000)
# Additional wait for any animations/final renders
page.wait_for_timeout(wait_time)
# Scroll to bottom if requested (for infinite scroll pages)
if scroll:
print("📜 Scrolling...")
prev_height = 0
while True:
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
page.wait_for_timeout(500)
new_height = page.evaluate("document.body.scrollHeight")
if new_height == prev_height:
break
prev_height = new_height
# Get page data
title = page.title()
# Extract clean text
text = page.evaluate("""() => {
// Remove script/style/nav/header/footer
const scripts = document.querySelectorAll('script, style, nav, header, footer, aside, .advertisement, .ads');
scripts.forEach(el => el.remove());
// Get main content if available, else body
const main = document.querySelector('main, article, [role="main"], .content, .post-content, .entry-content');
const content = main || document.body;
return content.innerText;
}""")
# Get any JSON-LD structured data
json_ld = page.evaluate("""() => {
const scripts = document.querySelectorAll('script[type="application/ld+json"]');
const data = [];
scripts.forEach(s => {
try {
data.push(JSON.parse(s.textContent));
} catch(e) {}
});
return data;
}""")
# Get meta description
meta_desc = page.evaluate("""() => {
const meta = document.querySelector('meta[name=\"description\"], meta[property=\"og:description\"]');
return meta ? meta.content : '';
}""")
browser.close()
return {
"title": title,
"text": text,
"meta_description": meta_desc,
"json_ld": json_ld,
"url": page.url # Final URL after redirects
}
        except Exception:
            browser.close()
            raise
def main():
parser = argparse.ArgumentParser(description="Scrape JavaScript-heavy sites")
parser.add_argument("url", help="URL to scrape")
parser.add_argument("--domain", required=True, help="Knowledge domain")
parser.add_argument("--path", required=True, help="Hierarchical path")
parser.add_argument("--wait-for", help="CSS selector to wait for")
parser.add_argument("--wait-time", type=int, default=2000, help="Wait time in ms after load")
parser.add_argument("--scroll", action="store_true", help="Scroll to bottom (for infinite scroll)")
parser.add_argument("--viewport", help="Viewport size (e.g., 1920x1080)")
parser.add_argument("--category", default="reference")
parser.add_argument("--content-type", default="web_page")
parser.add_argument("--subjects", help="Comma-separated subjects")
parser.add_argument("--title", help="Override title")
args = parser.parse_args()
viewport = None
if args.viewport:
w, h = args.viewport.split('x')
viewport = (int(w), int(h))
try:
result = scrape_js_site(
args.url,
wait_for=args.wait_for,
wait_time=args.wait_time,
scroll=args.scroll,
viewport=viewport
)
except Exception as e:
print(f"❌ Error: {e}", file=sys.stderr)
sys.exit(1)
title = args.title or result["title"]
text = result["text"]
print(f"📄 Title: {title}")
print(f"📝 Content: {len(text)} chars")
if len(text) < 200:
print("❌ Content too short", file=sys.stderr)
sys.exit(1)
# Add meta description if available
if result["meta_description"]:
text = f"Description: {result['meta_description']}\n\n{text}"
chunks = chunk_text(text)
print(f"🧩 Chunks: {len(chunks)}")
subjects = [s.strip() for s in args.subjects.split(",")] if args.subjects else []
checksum = compute_checksum(text)
print("💾 Storing...")
stored = 0
for i, chunk in enumerate(chunks):
chunk_metadata = {
"domain": args.domain,
"path": f"{args.path}/chunk-{i+1}",
"subjects": subjects,
"category": args.category,
"content_type": args.content_type,
"title": f"{title} (part {i+1}/{len(chunks)})",
"checksum": checksum,
"source_url": result["url"],
"date_added": "2026-02-05",
"chunk_index": i + 1,
"total_chunks": len(chunks),
"text_preview": chunk[:200] + "..." if len(chunk) > 200 else chunk,
"scraper_type": "playwright_headless",
"rendered": True
}
if store_in_kb(chunk, chunk_metadata):
stored += 1
print(f" ✓ Chunk {i+1}")
print(f"\n🎉 Stored {stored}/{len(chunks)} chunks")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,183 @@
#!/usr/bin/env python3
"""
Review knowledge base for outdated entries
Usage: kb_review.py [--days 180] [--domains "Domain1,Domain2"] [--dry-run]
"""
import argparse
import sys
import json
import urllib.request
from datetime import datetime, timedelta
QDRANT_URL = "http://10.0.0.40:6333"
KB_COLLECTION = "knowledge_base"
# Domains where freshness matters (tech changes fast)
FAST_MOVING_DOMAINS = ["AI/ML", "Python", "JavaScript", "Docker", "OpenClaw", "DevOps"]
def make_request(url, data=None, method="GET"):
"""Make HTTP request"""
req = urllib.request.Request(url, method=method)
if data:
req.data = json.dumps(data).encode()
req.add_header("Content-Type", "application/json")
return req
def get_all_entries(limit=1000):
"""Get all entries from knowledge base"""
url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points/scroll"
data = {
"limit": limit,
"with_payload": True
}
req = make_request(url, data, "POST")
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
return result.get("result", {}).get("points", [])
except Exception as e:
print(f"❌ Error fetching entries: {e}", file=sys.stderr)
return []
def parse_date(date_str):
"""Parse date string to datetime"""
if not date_str:
return None
formats = [
"%Y-%m-%d",
"%Y-%m-%dT%H:%M:%S",
"%Y-%m-%dT%H:%M:%S.%f"
]
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    return None
def is_outdated(entry, threshold_days, fast_moving_multiplier=0.5):
"""Check if entry is outdated"""
payload = entry.get("payload", {})
# Check date_scraped first, then date_added
date_str = payload.get("date_scraped") or payload.get("date_added")
entry_date = parse_date(date_str)
if not entry_date:
return False, None # No date, can't determine
domain = payload.get("domain", "")
# Fast-moving domains get shorter threshold
if domain in FAST_MOVING_DOMAINS:
effective_threshold = int(threshold_days * fast_moving_multiplier)
else:
effective_threshold = threshold_days
age = datetime.now() - entry_date
is_old = age.days > effective_threshold
return is_old, {
"age_days": age.days,
"threshold": effective_threshold,
"domain": domain,
"date": date_str
}
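# Worked example: with the default --days 180, an "AI/ML" entry (fast-moving,
# multiplier 0.5) is flagged once it is older than 90 days, while entries in
# any other domain keep the full 180-day threshold.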
def delete_entry(entry_id):
"""Delete entry from knowledge base"""
url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points/delete"
data = {"points": [entry_id]}
req = make_request(url, data, "POST")
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("status") == "ok"
except Exception as e:
print(f"❌ Error deleting: {e}", file=sys.stderr)
return False
def main():
parser = argparse.ArgumentParser(description="Review knowledge base for outdated entries")
parser.add_argument("--days", type=int, default=180, help="Age threshold in days")
parser.add_argument("--domains", help="Comma-separated domains to check (default: all)")
parser.add_argument("--fast-moving-only", action="store_true", help="Only check fast-moving domains")
parser.add_argument("--dry-run", action="store_true", help="Show what would be deleted")
parser.add_argument("--delete", action="store_true", help="Actually delete outdated entries")
args = parser.parse_args()
print(f"🔍 Fetching knowledge base entries...")
entries = get_all_entries()
if not entries:
print("❌ No entries found")
return
print(f" Total entries: {len(entries)}")
# Filter by domain if specified
if args.domains:
target_domains = [d.strip() for d in args.domains.split(",")]
entries = [e for e in entries if e.get("payload", {}).get("domain") in target_domains]
print(f" Filtered to domains: {target_domains}")
elif args.fast_moving_only:
entries = [e for e in entries if e.get("payload", {}).get("domain") in FAST_MOVING_DOMAINS]
print(f" Filtered to fast-moving domains: {FAST_MOVING_DOMAINS}")
# Check for outdated entries
outdated = []
for entry in entries:
is_old, info = is_outdated(entry, args.days)
if is_old:
outdated.append({
"entry": entry,
"info": info
})
if not outdated:
print(f"\n✅ No outdated entries found!")
return
print(f"\n⚠️ Found {len(outdated)} outdated entries:")
print(f" (Threshold: {args.days} days, fast-moving: {int(args.days * 0.5)} days)")
for item in outdated:
entry = item["entry"]
info = item["info"]
payload = entry.get("payload", {})
print(f"\n 📄 {payload.get('title', 'Untitled')}")
print(f" Domain: {info['domain']} | Age: {info['age_days']} days | Threshold: {info['threshold']} days")
print(f" Date: {info['date']}")
print(f" Path: {payload.get('path', 'N/A')}")
if args.delete and not args.dry_run:
if delete_entry(entry.get("id")):
print(f" ✅ Deleted")
else:
print(f" ❌ Failed to delete")
elif args.dry_run:
print(f" [Would delete in non-dry-run mode]")
# Summary
print(f"\n📊 Summary:")
print(f" Total checked: {len(entries)}")
print(f" Outdated: {len(outdated)}")
if args.dry_run:
print(f"\n💡 Use --delete to remove these entries")
elif not args.delete:
print(f"\n💡 Use --dry-run to preview, --delete to remove")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
Search kimi_kb (Knowledge Base) - Manual only
Usage:
python3 kb_search.py "query"
python3 kb_search.py "docker volumes" --domain "Docker"
python3 kb_search.py "query" --include-urls
"""
import json
import sys
import urllib.request
from pathlib import Path
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION = "kimi_kb"
OLLAMA_URL = "http://localhost:11434/v1"
def get_embedding(text):
"""Generate embedding using snowflake-arctic-embed2"""
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": text[:8192]
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=60) as response:
result = json.loads(response.read().decode())
return result["data"][0]["embedding"]
except Exception as e:
print(f"Error generating embedding: {e}", file=sys.stderr)
return None
def search_kb(query, domain=None, limit=5):
"""Search knowledge base"""
embedding = get_embedding(query)
if embedding is None:
return None
# Build filter if domain specified
filter_clause = {}
if domain:
filter_clause = {
"must": [
{"key": "domain", "match": {"value": domain}}
]
}
search_body = {
"vector": embedding,
"limit": limit,
"with_payload": True,
"with_vector": False
}
if filter_clause:
search_body["filter"] = filter_clause
data = json.dumps(search_body).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION}/points/search",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
return result.get("result", [])
except Exception as e:
print(f"Error searching KB: {e}", file=sys.stderr)
return None
def format_result(point, idx):
"""Format a search result for display"""
payload = point.get("payload", {})
score = point.get("score", 0)
output = f"\n[{idx}] {payload.get('title', 'Untitled')} (score: {score:.3f})\n"
output += f" Domain: {payload.get('domain', 'unknown')}\n"
if payload.get('url'):
output += f" URL: {payload['url']}\n"
if payload.get('source'):
output += f" Source: {payload['source']}\n"
text = payload.get('text', '')[:300]
if len(payload.get('text', '')) > 300:
text += "..."
output += f" Content: {text}\n"
return output
def main():
import argparse
parser = argparse.ArgumentParser(description="Search kimi_kb")
parser.add_argument("query", help="Search query")
parser.add_argument("--domain", default=None, help="Filter by domain")
parser.add_argument("--limit", type=int, default=5, help="Number of results")
parser.add_argument("--json", action="store_true", help="Output as JSON")
args = parser.parse_args()
print(f"🔍 Searching kimi_kb: {args.query}")
if args.domain:
print(f" Filter: domain={args.domain}")
print()
results = search_kb(args.query, args.domain, args.limit)
if results is None:
print("❌ Search failed", file=sys.stderr)
sys.exit(1)
if not results:
print("No results found in kimi_kb")
return
if args.json:
print(json.dumps(results, indent=2))
else:
print(f"Found {len(results)} results:\n")
for i, point in enumerate(results, 1):
print(format_result(point, i))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,379 @@
#!/usr/bin/env python3
"""
Store content to kimi_kb (Knowledge Base) - Manual only with batch support
Usage:
Single entry:
python3 kb_store.py "Content text" --title "Title" --domain "Category" --tags "tag1,tag2"
python3 kb_store.py "Content" --title "X" --url "https://example.com" --source "docs.site"
Batch mode:
python3 kb_store.py --batch-file entries.json --batch-size 100
Features:
- Single or batch upload
- Duplicate detection by title/URL
- Domain categorization
- Access tracking
"""
import argparse
import json
import os
import sys
import urllib.request
import urllib.error
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION = "kimi_kb"
OLLAMA_URL = "http://localhost:11434/v1"
DEFAULT_BATCH_SIZE = 100
def check_existing(title: str = None, url: str = None) -> tuple:
"""Check if entry already exists by title or URL"""
try:
# Check by URL first if provided
if url:
scroll_data = json.dumps({
"limit": 10,
"with_payload": True,
"filter": {"must": [{"key": "url", "match": {"value": url}}]}
}).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll",
data=scroll_data,
headers={"Content-Type": "application/json"},
method="POST"
)
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
points = result.get("result", {}).get("points", [])
if points:
return points[0]["id"], "url"
# Check by title
if title:
scroll_data = json.dumps({
"limit": 10,
"with_payload": True,
"filter": {"must": [{"key": "title", "match": {"value": title}}]}
}).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll",
data=scroll_data,
headers={"Content-Type": "application/json"},
method="POST"
)
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
points = result.get("result", {}).get("points", [])
if points:
return points[0]["id"], "title"
except Exception as e:
print(f"Warning: Could not check existing: {e}", file=sys.stderr)
return None, None
def get_embedding(text: str) -> Optional[List[float]]:
"""Generate embedding using snowflake-arctic-embed2"""
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": text[:8192]
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=60) as response:
result = json.loads(response.read().decode())
return result["data"][0]["embedding"]
except Exception as e:
print(f"Error generating embedding: {e}", file=sys.stderr)
return None
def batch_upload_embeddings(texts: List[str]) -> List[Optional[List[float]]]:
"""Generate embeddings for multiple texts in batch"""
if not texts:
return []
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": [t[:8192] for t in texts]
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=120) as response:
result = json.loads(response.read().decode())
return [d["embedding"] for d in result["data"]]
except Exception as e:
print(f"Error generating batch embeddings: {e}", file=sys.stderr)
return [None] * len(texts)
def upload_points_batch(points: List[Dict[str, Any]], batch_size: int = DEFAULT_BATCH_SIZE) -> tuple:
"""Upload points in batches to Qdrant"""
total = len(points)
uploaded = 0
failed = 0
for i in range(0, total, batch_size):
batch = points[i:i + batch_size]
upsert_data = {"points": batch}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true",
data=json.dumps(upsert_data).encode(),
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=60) as response:
result = json.loads(response.read().decode())
if result.get("status") == "ok":
uploaded += len(batch)
print(f" ✅ Uploaded batch {i//batch_size + 1}: {len(batch)} points")
else:
print(f" ❌ Batch {i//batch_size + 1} failed: {result}")
failed += len(batch)
except Exception as e:
print(f" ❌ Batch {i//batch_size + 1} error: {e}", file=sys.stderr)
failed += len(batch)
return uploaded, failed
def store_single(
text: str,
embedding: List[float],
title: str = None,
url: str = None,
source: str = None,
domain: str = "general",
tags: List[str] = None,
content_type: str = "document",
replace: bool = False
) -> bool:
"""Store single KB entry"""
# Check for existing entry
existing_id, match_type = check_existing(title=title, url=url)
if existing_id:
if not replace:
print(f"⚠️ Entry '{title}' already exists (matched by {match_type}, ID: {existing_id})")
print(f" Use --replace to overwrite")
return False
point_id = existing_id if existing_id else str(uuid.uuid4())
payload = {
"text": text,
"title": title or "Untitled",
"url": url or "",
"source": source or "manual",
"domain": domain or "general",
"tags": tags or [],
"content_type": content_type,
"date": datetime.now().strftime("%Y-%m-%d"),
"created_at": datetime.now().isoformat(),
"access_count": 0
}
point = {
"points": [{
"id": point_id,
"vector": embedding,
"payload": payload
}]
}
data = json.dumps(point).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true",
data=data,
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
return result.get("status") == "ok"
except Exception as e:
print(f"Error storing to KB: {e}", file=sys.stderr)
return False
def store_batch(
entries: List[Dict[str, Any]],
batch_size: int = DEFAULT_BATCH_SIZE,
check_duplicates: bool = True
) -> tuple:
"""Store multiple KB entries in batch with optional duplicate checking"""
if not entries:
return 0, 0
print(f"Processing {len(entries)} entries...")
# Filter duplicates if requested
entries_to_process = []
duplicates = 0
if check_duplicates:
for entry in entries:
existing_id, match_type = check_existing(
title=entry.get("title"),
url=entry.get("url")
)
if existing_id:
print(f" ⏭️ Skipping duplicate: {entry.get('title', 'Untitled')} ({match_type})")
duplicates += 1
else:
entries_to_process.append(entry)
else:
entries_to_process = entries
if not entries_to_process:
print(f"All {len(entries)} entries already exist")
return 0, 0
print(f"Generating embeddings for {len(entries_to_process)} entries...")
texts = [e["content"] for e in entries_to_process]
embeddings = batch_upload_embeddings(texts)
# Prepare points
points = []
failed_embeddings = 0
for entry, embedding in zip(entries_to_process, embeddings):
if embedding is None:
failed_embeddings += 1
continue
point_id = str(uuid.uuid4())
payload = {
"text": entry["content"],
"title": entry.get("title", "Untitled"),
"url": entry.get("url", ""),
"source": entry.get("source", "manual"),
"domain": entry.get("domain", "general"),
"tags": entry.get("tags", []),
"content_type": entry.get("type", "document"),
"date": datetime.now().strftime("%Y-%m-%d"),
"created_at": datetime.now().isoformat(),
"access_count": 0
}
points.append({
"id": point_id,
"vector": embedding,
"payload": payload
})
if not points:
return 0, failed_embeddings + duplicates
# Upload in batches
print(f"Uploading {len(points)} entries in batches of {batch_size}...")
uploaded, failed_upload = upload_points_batch(points, batch_size)
return uploaded, failed_embeddings + failed_upload + duplicates
def main():
parser = argparse.ArgumentParser(description="Store content to kimi_kb")
parser.add_argument("content", nargs="?", help="Content to store")
parser.add_argument("--title", default=None, help="Title of the content")
parser.add_argument("--url", default=None, help="Source URL if from web")
parser.add_argument("--source", default=None, help="Source name")
parser.add_argument("--domain", default="general", help="Domain/category")
parser.add_argument("--tags", default=None, help="Comma-separated tags")
parser.add_argument("--type", default="document", choices=["document", "web", "code", "note"],
help="Content type")
parser.add_argument("--replace", action="store_true", help="Replace existing entry")
parser.add_argument("--batch-file", help="JSON file with multiple entries")
parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE, help=f"Batch size")
parser.add_argument("--no-check-duplicates", action="store_true", help="Skip duplicate checking in batch mode")
args = parser.parse_args()
# Batch mode
if args.batch_file:
print(f"Batch mode: Loading entries from {args.batch_file}")
try:
with open(args.batch_file, 'r') as f:
entries = json.load(f)
if not isinstance(entries, list):
print("Batch file must contain a JSON array", file=sys.stderr)
sys.exit(1)
print(f"Loaded {len(entries)} entries")
uploaded, failed = store_batch(
entries,
args.batch_size,
check_duplicates=not args.no_check_duplicates
)
print(f"\n{'=' * 50}")
print(f"Batch complete: {uploaded} uploaded, {failed} failed")
sys.exit(0 if failed == 0 else 1)
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
# Single entry mode
if not args.content:
print("Error: Provide content or use --batch-file", file=sys.stderr)
parser.print_help()
sys.exit(1)
tags = [t.strip() for t in args.tags.split(",")] if args.tags else []
print(f"Generating embedding...")
embedding = get_embedding(args.content)
if embedding is None:
print("❌ Failed to generate embedding")
sys.exit(1)
print(f"Storing to kimi_kb: {args.title or 'Untitled'}...")
if store_single(
text=args.content,
embedding=embedding,
title=args.title,
url=args.url,
source=args.source,
domain=args.domain,
tags=tags,
content_type=args.type,
replace=args.replace
):
print(f"✅ Stored to kimi_kb ({args.domain})")
else:
print("❌ Failed to store")
sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""LLM Router for cheap metadata + compaction.
Goal:
- Prefer Minimax m2.5 for tagging + compaction.
- Fallback to Gemini Flash (or any other OpenRouter model) if Minimax fails.
This uses OpenRouter's OpenAI-compatible API.
Env:
OPENROUTER_API_KEY (required)
OPENROUTER_BASE_URL default: https://openrouter.ai/api/v1
LLM_PRIMARY_MODEL default: openrouter/minimax/minimax-m2.5
LLM_FALLBACK_MODEL default: openrouter/google/gemini-2.5-flash
LLM_TIMEOUT default: 60
Notes:
- We keep this dependency-light (urllib only).
- We request strict JSON when asked.
"""
import json
import os
import sys
import urllib.request
BASE_URL = os.getenv("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1").rstrip("/")
API_KEY = os.getenv("OPENROUTER_API_KEY", "")
PRIMARY_MODEL = os.getenv("LLM_PRIMARY_MODEL", "openrouter/minimax/minimax-m2.5")
FALLBACK_MODEL = os.getenv("LLM_FALLBACK_MODEL", "openrouter/google/gemini-2.5-flash")
TIMEOUT = int(os.getenv("LLM_TIMEOUT", "60"))
def _post_chat(model: str, messages, response_format=None, temperature=0.2):
if not API_KEY:
raise RuntimeError("OPENROUTER_API_KEY is required")
body = {
"model": model,
"messages": messages,
"temperature": temperature,
}
if response_format:
body["response_format"] = response_format
req = urllib.request.Request(
f"{BASE_URL}/chat/completions",
data=json.dumps(body).encode("utf-8"),
headers={
"Content-Type": "application/json",
"Authorization": f"Bearer {API_KEY}",
},
)
with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
return json.loads(r.read().decode("utf-8"))
def chat_json(system: str, user: str) -> dict:
"""Return parsed JSON object. Try primary then fallback."""
messages = [
{"role": "system", "content": system},
{"role": "user", "content": user},
]
last_err = None
for model in (PRIMARY_MODEL, FALLBACK_MODEL):
try:
resp = _post_chat(model, messages, response_format={"type": "json_object"}, temperature=0.2)
content = resp["choices"][0]["message"]["content"]
return json.loads(content)
except Exception as e:
last_err = e
continue
raise RuntimeError(f"LLM failed on both primary and fallback: {last_err}")
def chat_text(system: str, user: str) -> str:
"""Return text. Try primary then fallback."""
messages = [
{"role": "system", "content": system},
{"role": "user", "content": user},
]
last_err = None
for model in (PRIMARY_MODEL, FALLBACK_MODEL):
try:
resp = _post_chat(model, messages, response_format=None, temperature=0.2)
return resp["choices"][0]["message"]["content"]
except Exception as e:
last_err = e
continue
raise RuntimeError(f"LLM failed on both primary and fallback: {last_err}")
if __name__ == "__main__":
# tiny self-test
if len(sys.argv) > 1 and sys.argv[1] == "--ping":
out = chat_json("Return JSON with key ok=true", "ping")
print(json.dumps(out))

View File

@@ -0,0 +1,77 @@
#!/usr/bin/env python3
"""
Convenience wrapper for activity logging
Add to your scripts: from log_activity import log_done, check_other_agent
"""
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from activity_log import log_activity, check_for_duplicates, get_recent_activities
AGENT_NAME = "Kimi" # Change to "Max" on that instance
def log_done(action_type: str, description: str, files=None, status="completed"):
"""
Quick log of completed work
Example:
log_done("cron_created", "Set up daily OpenClaw repo monitoring",
files=["/path/to/script.py"])
"""
activity_id = log_activity(
agent=AGENT_NAME,
action_type=action_type,
description=description,
affected_files=files or [],
status=status
)
print(f"[ActivityLog] Logged: {action_type}{activity_id[:8]}...")
return activity_id
def check_other_agent(action_type: str, keywords: str, hours: int = 6) -> bool:
"""
Check if Max (or Kimi) already did this recently
Example:
if check_other_agent("cron_created", "openclaw repo monitoring"):
print("Max already set this up!")
return
"""
other_agent = "Max" if AGENT_NAME == "Kimi" else "Kimi"
recent = get_recent_activities(agent=other_agent, action_type=action_type, hours=hours)
keywords_lower = keywords.lower().split()
for activity in recent:
desc = activity.get("description", "").lower()
if all(kw in desc for kw in keywords_lower):
print(f"[ActivityLog] ⚠️ {other_agent} already did this!")
print(f" When: {activity['timestamp'][:19]}")
print(f" What: {activity['description']}")
return True
return False
def show_recent_collaboration(hours: int = 24):
"""Show what both agents have been up to"""
activities = get_recent_activities(hours=hours, limit=50)
print(f"\n[ActivityLog] Both agents' work (last {hours}h):\n")
for a in activities:
agent = a['agent']
icon = "🤖" if agent == "Max" else "🎙️"
print(f"{icon} [{a['timestamp'][11:19]}] {agent}: {a['action_type']}")
print(f" {a['description']}")
if __name__ == "__main__":
# Quick test
print(f"Agent: {AGENT_NAME}")
print("Functions available:")
print(" log_done(action_type, description, files=[], status='completed')")
print(" check_other_agent(action_type, keywords, hours=6)")
print(" show_recent_collaboration(hours=24)")
print()
print("Recent activity:")
show_recent_collaboration(hours=24)

View File

@@ -0,0 +1,190 @@
#!/usr/bin/env python3
"""Metadata + Compaction pipeline.
This script is designed to be run on a schedule (cron). It will:
1) Detect if anything new exists in Redis buffer since last run.
2) If new content exists, generate:
- title
- tags
- entities
- category
- compact summary
using a cheap LLM (Minimax m2.5) with fallback (Gemini Flash)
3) Store the metadata + summary into Qdrant as a single point (collection: kimi_kb by default)
while leaving raw transcripts in files/Redis.
It is intentionally conservative: if nothing new, it exits quickly.
Env:
REDIS_HOST/REDIS_PORT
QDRANT_URL
QDRANT_META_COLLECTION (default: kimi_kb)
OPENROUTER_API_KEY (required for LLM)
LLM_PRIMARY_MODEL / LLM_FALLBACK_MODEL
Usage:
python3 metadata_and_compact.py --user-id michael
python3 metadata_and_compact.py --user-id michael --max-items 200
"""
import argparse
import json
import os
import sys
import uuid
from datetime import datetime
import redis
from llm_router import chat_json
REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
QDRANT_URL = os.getenv("QDRANT_URL", "http://127.0.0.1:6333").rstrip("/")
META_COLLECTION = os.getenv("QDRANT_META_COLLECTION", "kimi_kb")
STATE_DIR = os.getenv("MEMORY_STATE_DIR", os.path.join(os.path.expanduser("~"), ".openclaw", "memory_state"))
SYSTEM_PROMPT = (
"You are a metadata extractor and compactor for conversation logs. "
"Return STRICT JSON with keys: title (string), category (string), "
"tags (array of short lowercase hyphenated strings), entities (array of strings), "
"summary (string, <= 1200 chars). "
"Prefer 6-14 tags. Tags should be searchable facets (client/project/infra/topic)."
)
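# Example of the JSON shape the prompt above asks for (illustrative values):
# {
#   "title": "Redis buffer compaction run",
#   "category": "infrastructure",
#   "tags": ["redis", "qdrant", "memory-pipeline", "cron", "compaction", "openclaw"],
#   "entities": ["Redis", "Qdrant", "OpenRouter"],
#   "summary": "Flushed 55 new turns to Qdrant; discussed cron timing and payload schema."
# }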
def _state_path(user_id: str) -> str:
os.makedirs(STATE_DIR, exist_ok=True)
return os.path.join(STATE_DIR, f"meta_state_{user_id}.json")
def load_state(user_id: str) -> dict:
p = _state_path(user_id)
if not os.path.exists(p):
return {"last_redis_len": 0, "updated_at": None}
try:
with open(p, "r") as f:
return json.load(f)
except Exception:
return {"last_redis_len": 0, "updated_at": None}
def save_state(user_id: str, st: dict) -> None:
p = _state_path(user_id)
st["updated_at"] = datetime.utcnow().isoformat() + "Z"
with open(p, "w") as f:
json.dump(st, f, indent=2, sort_keys=True)
def redis_get_new_items(user_id: str, max_items: int, last_len: int):
r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
key = f"mem:{user_id}"
cur_len = r.llen(key)
if cur_len <= last_len:
return [], cur_len
# Only grab the delta (best effort). Our list is chronological if RPUSH is used.
start = last_len
end = min(cur_len - 1, last_len + max_items - 1)
items = r.lrange(key, start, end)
turns = []
for it in items:
try:
turns.append(json.loads(it))
except Exception:
continue
return turns, cur_len
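# Example: if the last run recorded last_redis_len=40 and the list now holds
# 55 items, this pulls indices 40..54 (fewer if --max-items caps the window).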
def qdrant_upsert(point_id: str, vector, payload: dict):
body = {"points": [{"id": point_id, "vector": vector, "payload": payload}]}
import urllib.request
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{META_COLLECTION}/points?wait=true",
data=json.dumps(body).encode("utf-8"),
headers={"Content-Type": "application/json"},
method="PUT",
)
with urllib.request.urlopen(req, timeout=15) as resp:
out = json.loads(resp.read().decode("utf-8"))
return out.get("status") == "ok"
def ollama_embed(text: str):
# Uses the same Ollama embed endpoint as auto_store
import urllib.request
ollama_url = os.getenv("OLLAMA_URL", "http://127.0.0.1:11434/v1")
data = json.dumps({"model": "snowflake-arctic-embed2", "input": text[:8192]}).encode("utf-8")
req = urllib.request.Request(
f"{ollama_url}/embeddings",
data=data,
headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=60) as resp:
out = json.loads(resp.read().decode("utf-8"))
return out["data"][0]["embedding"]
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--user-id", required=True)
ap.add_argument("--max-items", type=int, default=200)
args = ap.parse_args()
st = load_state(args.user_id)
last_len = int(st.get("last_redis_len", 0))
turns, cur_len = redis_get_new_items(args.user_id, args.max_items, last_len)
if not turns:
print("No new turns; skipping")
return
# Build compact source text
lines = []
for t in turns:
role = t.get("role", "")
content = t.get("content", "")
if not content:
continue
lines.append(f"{role.upper()}: {content}")
source_text = "\n".join(lines)
meta = chat_json(SYSTEM_PROMPT, source_text[:24000])
# basic validation
for k in ("title", "category", "tags", "entities", "summary"):
if k not in meta:
raise SystemExit(f"Missing key in meta: {k}")
summary = str(meta.get("summary", ""))[:2000]
emb = ollama_embed(summary)
payload = {
"user_id": args.user_id,
"title": str(meta.get("title", ""))[:200],
"category": str(meta.get("category", ""))[:120],
"tags": meta.get("tags", [])[:30],
"entities": meta.get("entities", [])[:30],
"summary": summary,
"source": "redis_delta",
"created_at": datetime.utcnow().isoformat() + "Z",
"redis_range": {"from": last_len, "to": cur_len - 1},
}
ok = qdrant_upsert(str(uuid.uuid4()), emb, payload)
if not ok:
raise SystemExit("Failed to upsert metadata point")
st["last_redis_len"] = cur_len
save_state(args.user_id, st)
print(f"Stored metadata point for {args.user_id} (redis {last_len}->{cur_len})")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,158 @@
#!/usr/bin/env python3
"""
Migrate Qdrant_Documents to 1024D vectors (snowflake-arctic-embed2) - BATCH VERSION
"""
import json
import sys
import urllib.request
import uuid
from datetime import datetime
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION = "Qdrant_Documents"
OLLAMA_URL = "http://localhost:11434/v1"
EXPORT_FILE = "/tmp/qd_export.json"
BATCH_SIZE = 50
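# Expected shape of /tmp/qd_export.json (matches how docs are read below):
# a JSON array of points, each {"id": ..., "payload": {"text": "...", ...}};
# only payload["text"] is re-embedded, the rest of the payload is carried over.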
def get_embeddings_batch(texts):
"""Generate embeddings in batch using snowflake-arctic-embed2"""
# Truncate each text
truncated = [t[:8000] for t in texts]
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": truncated
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=180) as r:
result = json.loads(r.read().decode())
return [item["embedding"] for item in result["data"]]
except Exception as e:
print(f"Batch embed error: {e}", file=sys.stderr)
return None
def make_request(url, data=None, method="GET"):
req = urllib.request.Request(url, method=method)
if data:
req.data = json.dumps(data).encode()
req.add_header("Content-Type", "application/json")
return req
def delete_collection():
print(f"Deleting {COLLECTION}...")
req = make_request(f"{QDRANT_URL}/collections/{COLLECTION}", method="DELETE")
try:
with urllib.request.urlopen(req, timeout=10) as r:
print(f"✅ Deleted")
except Exception as e:
print(f"Delete error: {e}")
def create_collection():
print(f"Creating {COLLECTION} with 1024D vectors...")
config = {
"vectors": {
"size": 1024,
"distance": "Cosine"
}
}
req = make_request(f"{QDRANT_URL}/collections/{COLLECTION}", data=config, method="PUT")
try:
with urllib.request.urlopen(req, timeout=30) as r:
result = json.loads(r.read().decode())
if result.get("result") == True:
print(f"✅ Created (1024D, Cosine)")
else:
print(f"❌ Failed: {result}")
sys.exit(1)
except Exception as e:
print(f"❌ Create error: {e}")
sys.exit(1)
def upsert_batch(points):
"""Upsert batch of points"""
data = json.dumps({"points": points}).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true",
data=data,
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=60) as r:
return json.loads(r.read().decode()).get("status") == "ok"
except Exception as e:
print(f"Upsert error: {e}", file=sys.stderr)
return False
# Load exported docs
print(f"Loading {EXPORT_FILE}...")
with open(EXPORT_FILE, 'r') as f:
docs = json.load(f)
print(f"Loaded {len(docs)} documents\n")
# Delete and recreate
delete_collection()
create_collection()
print()
# Process in batches
print(f"Re-embedding with snowflake-arctic-embed2 (batch={BATCH_SIZE})...\n")
success = 0
failed = 0
total_batches = (len(docs) + BATCH_SIZE - 1) // BATCH_SIZE
for batch_num in range(total_batches):
start = batch_num * BATCH_SIZE
end = min(start + BATCH_SIZE, len(docs))
batch_docs = docs[start:end]
print(f"Batch {batch_num + 1}/{total_batches} ({start}-{end})...", end=" ", flush=True)
# Get texts for embedding
texts = [d.get("payload", {}).get("text", "") for d in batch_docs]
# Get embeddings
embeddings = get_embeddings_batch(texts)
if not embeddings:
print(f"❌ embed failed")
failed += len(batch_docs)
continue
# Build points
points = []
for doc, emb in zip(batch_docs, embeddings):
points.append({
"id": doc.get("id", str(uuid.uuid4())),
"vector": emb,
"payload": doc.get("payload", {})
})
# Upsert
    if upsert_batch(points):
        success += len(batch_docs)
        print("✅")
    else:
        failed += len(batch_docs)
        print("❌")
print()
print("=" * 50)
print(f"MIGRATION COMPLETE")
print(f" Success: {success}")
print(f" Failed: {failed}")
print(f" Total: {len(docs)}")
print("=" * 50)
# Verify
req = make_request(f"{QDRANT_URL}/collections/{COLLECTION}")
with urllib.request.urlopen(req, timeout=5) as r:
info = json.loads(r.read().decode())["result"]
print(f"\n📚 {COLLECTION}")
print(f" Points: {info['points_count']:,}")
print(f" Vector size: {info['config']['params']['vectors']['size']}")
print(f" Distance: {info['config']['params']['vectors']['distance']}")

View File

@@ -0,0 +1,207 @@
#!/usr/bin/env python3
"""
Monitor Ollama model library for 100B+ parameter models
Only outputs/announces when there are significant new large models.
Always exits with code 0 to prevent "exec failed" logs.
Usage: monitor_ollama_models.py [--json]
"""
import argparse
import sys
import json
import urllib.request
import re
import hashlib
from datetime import datetime
QDRANT_URL = "http://10.0.0.40:6333"
KB_COLLECTION = "knowledge_base"
OLLAMA_LIBRARY_URL = "https://ollama.com/library"
LARGE_MODEL_TAGS = ["100b", "120b", "200b", "400b", "70b", "8x7b", "8x22b"]
GOOD_FOR_OPENCLAW = ["code", "coding", "instruct", "chat", "reasoning", "llama", "qwen", "mistral", "deepseek", "gemma", "mixtral"]
def fetch_library():
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
req = urllib.request.Request(OLLAMA_LIBRARY_URL, headers=headers)
try:
with urllib.request.urlopen(req, timeout=20) as response:
return response.read().decode('utf-8', errors='ignore')
except:
return None
def extract_models(html):
models = []
model_blocks = re.findall(r'<a[^>]*href="/library/([^"]+)"[^>]*>(.*?)</a>', html, re.DOTALL)
for model_name, block in model_blocks[:50]:
model_info = {
"name": model_name, "url": f"https://ollama.com/library/{model_name}",
"is_large": False, "is_new": False, "tags": [], "description": ""
}
tag_matches = re.findall(r'<span[^>]*>([^<]+(?:b|B))</span>', block)
model_info["tags"] = [t.lower() for t in tag_matches]
for tag in model_info["tags"]:
if any(large_tag in tag for large_tag in LARGE_MODEL_TAGS):
if "70b" in tag and not ("8x" in model_name.lower() or "mixtral" in model_name.lower()):
continue
model_info["is_large"] = True
break
desc_match = re.search(r'<p[^>]*>([^<]+)</p>', block)
if desc_match:
model_info["description"] = desc_match.group(1).strip()
updated_match = re.search(r'(\d+)\s+(hours?|days?)\s+ago', block, re.IGNORECASE)
if updated_match:
num = int(updated_match.group(1))
unit = updated_match.group(2).lower()
if (unit.startswith("hour") and num <= 24) or (unit.startswith("day") and num <= 2):
model_info["is_new"] = True
desc_lower = model_info["description"].lower()
name_lower = model_name.lower()
model_info["good_for_openclaw"] = any(kw in desc_lower or kw in name_lower for kw in GOOD_FOR_OPENCLAW)
models.append(model_info)
return models
def get_embedding(text):
data = {"model": "nomic-embed-text", "input": text[:500]}
req = urllib.request.Request("http://localhost:11434/api/embed",
data=json.dumps(data).encode(),
headers={"Content-Type": "application/json"}, method="POST")
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
return result.get("embeddings", [None])[0]
except:
return None
def search_kb_for_model(model_name):
url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points/scroll"
data = {"limit": 100, "with_payload": True, "filter": {"must": [
{"key": "domain", "match": {"value": "AI/LLM"}},
{"key": "path", "match": {"text": model_name}}
]}}
req = urllib.request.Request(url, data=json.dumps(data).encode(),
headers={"Content-Type": "application/json"}, method="POST")
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("result", {}).get("points", [])
except:
return []
def store_model(model_info):
import uuid
text = f"{model_info['name']}: {model_info['description']}\nTags: {', '.join(model_info['tags'])}"
embedding = get_embedding(text)
if not embedding:
return False
metadata = {
"domain": "AI/LLM", "path": f"AI/LLM/Ollama/Models/{model_info['name']}",
"subjects": ["ollama", "models", "llm", "100b+"] + model_info['tags'],
"category": "reference", "content_type": "web_page",
"title": f"Ollama Model: {model_info['name']}", "source_url": model_info['url'],
"date_added": datetime.now().strftime("%Y-%m-%d"), "date_scraped": datetime.now().isoformat(),
"model_tags": model_info['tags'], "is_large": model_info['is_large'], "is_new": model_info['is_new'],
"text_preview": text[:300]
}
point = {"id": str(uuid.uuid4()), "vector": embedding, "payload": metadata}
url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points"
req = urllib.request.Request(url, data=json.dumps({"points": [point]}).encode(),
headers={"Content-Type": "application/json"}, method="PUT")
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("status") == "ok"
except:
return False
def evaluate_candidate(model_info):
score = 0
reasons = []
if not model_info["is_large"]:
return {"is_candidate": False, "score": 0, "reasons": []}
score += 5
reasons.append("🦣 100B+ parameters")
if model_info.get("good_for_openclaw"):
score += 2
reasons.append("✨ Good for OpenClaw")
if model_info["is_new"]:
score += 2
reasons.append("🆕 Recently updated")
return {"is_candidate": score >= 5, "score": score, "reasons": reasons}
def format_notification(candidates):
lines = ["🤖 New Large Model Alert (100B+)", f"📅 {datetime.now().strftime('%Y-%m-%d')}", ""]
lines.append(f"📊 {len(candidates)} new large model(s) found:")
lines.append("")
for model in candidates[:5]:
eval_info = model["evaluation"]
lines.append(f"{model['name']}")
lines.append(f" {model['description'][:60]}...")
lines.append(f" Tags: {', '.join(model['tags'][:3])}")
for reason in eval_info["reasons"]:
lines.append(f" {reason}")
lines.append(f" 🔗 {model['url']}")
lines.append("")
lines.append("💡 Potential gpt-oss:120b replacement")
return "\n".join(lines)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--json", action="store_true")
args = parser.parse_args()
html = fetch_library()
if not html:
if args.json:
print("{}")
sys.exit(0) # Silent fail with exit 0
models = extract_models(html)
large_models = [m for m in models if m["is_large"]]
candidates = []
for model in large_models:
existing = search_kb_for_model(model["name"])
is_new_to_kb = len(existing) == 0
evaluation = evaluate_candidate(model)
model["evaluation"] = evaluation
if is_new_to_kb:
store_model(model)
if evaluation["is_candidate"] and is_new_to_kb:
candidates.append(model)
# Output results
if args.json:
if candidates:
print(json.dumps({"candidates": candidates, "notification": format_notification(candidates)}))
else:
print("{}")
elif candidates:
print(format_notification(candidates))
# No output if no candidates (silent)
# Always exit 0 to prevent "exec failed" logs
sys.exit(0)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,249 @@
#!/usr/bin/env python3
"""
Monitor OpenClaw GitHub repo for relevant updates
Only outputs/announces when there are significant changes affecting our setup.
Always exits with code 0 to prevent "exec failed" logs.
Usage: monitor_openclaw_repo.py [--json]
"""
import argparse
import sys
import json
import urllib.request
import re
import hashlib
from datetime import datetime
QDRANT_URL = "http://10.0.0.40:6333"
KB_COLLECTION = "knowledge_base"
# Keywords that indicate relevance to our setup
RELEVANT_KEYWORDS = [
"ollama", "model", "embedding", "llm", "ai",
"telegram", "webchat", "signal", "discord",
"skill", "skills", "qdrant", "memory", "search",
"whisper", "tts", "voice", "cron",
"gateway", "agent", "session", "vector",
"browser", "exec", "read", "edit", "write",
"breaking", "deprecated", "removed", "changed",
"fix", "bug", "patch", "security", "vulnerability"
]
HIGH_PRIORITY_AREAS = [
"ollama", "telegram", "qdrant", "memory", "skills",
"voice", "cron", "gateway", "browser"
]
def fetch_github_api(url):
headers = {
'User-Agent': 'OpenClaw-KB-Monitor',
'Accept': 'application/vnd.github.v3+json'
}
req = urllib.request.Request(url, headers=headers)
try:
with urllib.request.urlopen(req, timeout=20) as response:
return json.loads(response.read().decode())
except Exception as e:
return None
def fetch_github_html(url):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
req = urllib.request.Request(url, headers=headers)
try:
with urllib.request.urlopen(req, timeout=20) as response:
html = response.read().decode('utf-8', errors='ignore')
text = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r'<[^>]+>', ' ', text)
text = re.sub(r'\s+', ' ', text).strip()
return text[:5000]
except:
return None
def get_embedding(text):
    data = {"model": "nomic-embed-text", "input": text[:1000]}
    req = urllib.request.Request(
        "http://localhost:11434/api/embed",
        data=json.dumps(data).encode(),
        headers={"Content-Type": "application/json"},
        method="POST"
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
            return result.get("embeddings", [None])[0]
    except:
        return None
def search_kb_by_path(path_prefix):
url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points/scroll"
data = {"limit": 100, "with_payload": True}
req = urllib.request.Request(url, data=json.dumps(data).encode(),
headers={"Content-Type": "application/json"}, method="POST")
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
points = result.get("result", {}).get("points", [])
return [p for p in points if p.get("payload", {}).get("path", "").startswith(path_prefix)]
except:
return []
def store_in_kb(text, metadata):
import uuid
embedding = get_embedding(text)
if not embedding:
return None
metadata["checksum"] = f"sha256:{hashlib.sha256(text.encode()).hexdigest()[:16]}"
metadata["date_scraped"] = datetime.now().isoformat()
metadata["text_preview"] = text[:300] + "..." if len(text) > 300 else text
point = {"id": str(uuid.uuid4()), "vector": embedding, "payload": metadata}
url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points"
req = urllib.request.Request(url, data=json.dumps({"points": [point]}).encode(),
headers={"Content-Type": "application/json"}, method="PUT")
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("status") == "ok"
except:
return False
def delete_kb_entry(entry_id):
url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points/delete"
data = {"points": [entry_id]}
req = urllib.request.Request(url, data=json.dumps(data).encode(),
headers={"Content-Type": "application/json"}, method="POST")
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("status") == "ok"
except:
return False
def is_relevant_change(text):
text_lower = text.lower()
found_keywords = [kw for kw in RELEVANT_KEYWORDS if kw in text_lower]
high_priority_found = [area for area in HIGH_PRIORITY_AREAS if area in text_lower]
return {
"relevant": len(found_keywords) > 0,
"keywords": found_keywords,
"high_priority": high_priority_found,
"score": len(found_keywords) + (len(high_priority_found) * 2)
}
def evaluate_significance(changes):
total_score = sum(c["analysis"]["score"] for c in changes)
high_priority_count = sum(len(c["analysis"]["high_priority"]) for c in changes)
return {
"significant": total_score >= 3 or high_priority_count > 0,
"total_score": total_score,
"high_priority_count": high_priority_count
}
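# Worked example: a release note mentioning "qdrant" and "fix" scores
# 2 keywords + 2 for the high-priority "qdrant" hit = 4; the single
# high-priority match alone already makes the update significant.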
def format_summary(changes, significance):
lines = ["📊 OpenClaw Repo Update", f"📅 {datetime.now().strftime('%Y-%m-%d')}", ""]
by_section = {}
for change in changes:
section = change["section"]
if section not in by_section:
by_section[section] = []
by_section[section].append(change)
for section, items in by_section.items():
lines.append(f"📁 {section}")
for item in items[:3]:
title = item["title"][:50] + "..." if len(item["title"]) > 50 else item["title"]
lines.append(f"{title}")
if item["analysis"]["high_priority"]:
lines.append(f" ⚠️ Affects: {', '.join(item['analysis']['high_priority'][:2])}")
if len(items) > 3:
lines.append(f" ... and {len(items) - 3} more")
lines.append("")
return "\n".join(lines)
def scrape_all_sections():
sections = []
main_text = fetch_github_html("https://github.com/openclaw/openclaw")
if main_text:
sections.append({"section": "Main Repo", "title": "openclaw/openclaw README",
"url": "https://github.com/openclaw/openclaw", "content": main_text})
releases = fetch_github_api("https://api.github.com/repos/openclaw/openclaw/releases?per_page=5")
if releases:
for release in releases:
sections.append({"section": "Release", "title": release.get("name", release.get("tag_name", "Unknown")),
"url": release.get("html_url", ""), "content": release.get("body", "")[:2000],
"published": release.get("published_at", "")})
issues = fetch_github_api("https://api.github.com/repos/openclaw/openclaw/issues?state=open&per_page=5")
if issues:
for issue in issues:
if "pull_request" not in issue:
sections.append({"section": "Issue", "title": issue.get("title", "Unknown"),
"url": issue.get("html_url", ""), "content": issue.get("body", "")[:1500] if issue.get("body") else "No description",
"labels": [l.get("name", "") for l in issue.get("labels", [])]})
return sections
def check_and_update():
sections = scrape_all_sections()
if not sections:
return None, "No data scraped"
existing_entries = search_kb_by_path("OpenClaw/GitHub")
existing_checksums = {e.get("payload", {}).get("checksum", ""): e for e in existing_entries}
changes_detected = []
for section in sections:
content = section["content"]
if not content:
continue
checksum = f"sha256:{hashlib.sha256(content.encode()).hexdigest()[:16]}"
if checksum in existing_checksums:
continue
analysis = is_relevant_change(content + " " + section["title"])
section["analysis"] = analysis
section["checksum"] = checksum
changes_detected.append(section)
for old_checksum, old_entry in existing_checksums.items():
if old_entry.get("payload", {}).get("title", "") == section["title"]:
delete_kb_entry(old_entry.get("id"))
break
metadata = {
"domain": "OpenClaw", "path": f"OpenClaw/GitHub/{section['section']}/{section['title'][:30]}",
"subjects": ["openclaw", "github", section['section'].lower()], "category": "reference",
"content_type": "web_page", "title": section["title"], "source_url": section["url"],
"date_added": datetime.now().strftime("%Y-%m-%d")
}
store_in_kb(content, metadata)
if changes_detected:
significance = evaluate_significance(changes_detected)
if significance["significant"]:
return {"changes": changes_detected, "significance": significance,
"summary": format_summary(changes_detected, significance)}, None
else:
return None, "Changes not significant"
return None, "No changes detected"
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--json", action="store_true")
args = parser.parse_args()
result, reason = check_and_update()
# Always output JSON for cron compatibility, even if empty
if args.json:
print(json.dumps(result if result else {}))
elif result:
print(result["summary"])
# If no result, output nothing (silent)
# Always exit 0 to prevent "exec failed" logs
sys.exit(0)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""
Lightweight notification checker for agent messages
Cron job: Check Redis stream hourly, notify if new messages
"""
import json
import redis
import os
from datetime import datetime, timezone
REDIS_HOST = "10.0.0.36"
REDIS_PORT = 6379
STREAM_NAME = "agent-messages"
LAST_NOTIFIED_KEY = "agent:notifications:last_id"
# Simple stdout notification (OpenClaw captures stdout for alerts)
def notify(messages):
if not messages:
return
other_agent = messages[0].get("agent", "Agent")
count = len(messages)
# Single line notification - minimal tokens
print(f"📨 {other_agent}: {count} new message(s) in agent-messages")
# Optional: preview first message (uncomment if wanted)
# if messages:
# preview = messages[0].get("message", "")[:50]
# print(f" Latest: {preview}...")
def check_notifications():
r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
# Get last position we notified about
last_id = r.get(LAST_NOTIFIED_KEY) or "0"
# Read new messages since last notification
result = r.xread({STREAM_NAME: last_id}, block=100, count=100)
if not result:
return # No new messages, silent exit
messages = []
new_last_id = last_id
for stream_name, entries in result:
for msg_id, data in entries:
messages.append(data)
new_last_id = msg_id
if messages:
# Filter out our own messages (don't notify about messages we sent)
my_agent = os.environ.get("AGENT_NAME", "Kimi") # Set in cron env
other_messages = [m for m in messages if m.get("agent") != my_agent]
if other_messages:
notify(other_messages)
# Update last notified position regardless
r.set(LAST_NOTIFIED_KEY, new_last_id)
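# Example: how an agent publishes to the stream this checker reads
# (a sketch; the field names match what notify() uses above):
#   r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
#   r.xadd(STREAM_NAME, {"agent": "Max", "message": "Rebuilt the KB index"})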
if __name__ == "__main__":
check_notifications()

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python3
"""
Q Save - Trigger conversation storage (Mem0-style)
Usage:
q_save.py --user-id "rob" "User message" "AI response" [--turn N]
Called when user says "save q" or "q save" to immediately
store the current conversation to Qdrant.
Mem0-style: user_id is REQUIRED and persistent across all chats.
"""
import argparse
import subprocess
import sys
from pathlib import Path
SCRIPT_DIR = Path(__file__).parent.resolve()
BACKGROUND_STORE = SCRIPT_DIR / "background_store.py"
def q_save(
user_id: str,
user_message: str,
ai_response: str,
turn: int = None
):
"""Save conversation to Qdrant (background, zero delay)"""
cmd = [
sys.executable,
str(BACKGROUND_STORE),
user_message,
ai_response,
"--user-id", user_id
]
    if turn is not None:
cmd.extend(["--turn", str(turn)])
# Fire and forget
subprocess.Popen(
cmd,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
start_new_session=True
)
return True
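# Example (programmatic use; mirrors the CLI below):
#   q_save("rob", "What's in the Redis buffer?", "55 turns since 3am.", turn=12)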
def main():
parser = argparse.ArgumentParser(
description='Q Save - Mem0-style trigger (user-centric)'
)
parser.add_argument("--user-id", required=True,
help="REQUIRED: User ID (e.g., 'rob')")
parser.add_argument("user_message", help="User's message")
parser.add_argument("ai_response", help="AI's response")
parser.add_argument("--turn", type=int, help="Turn number")
args = parser.parse_args()
if q_save(args.user_id, args.user_message, args.ai_response, args.turn):
print(f"✅ Saved for user '{args.user_id}'")
else:
print("❌ Failed to save", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,427 @@
#!/usr/bin/env python3
"""
Qdrant_Documents - Complete management script
Usage: qd.py <command> [options]
Commands:
list - List collection info and stats
search - Search documents
store - Store new document
delete - Delete document by ID
export - Export all documents to JSON
import - Import documents from JSON
count - Get total document count
tags - List unique tags
"""
import argparse
import json
import sys
import urllib.request
import uuid
from datetime import datetime
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION = "Qdrant_Documents"
OLLAMA_URL = "http://localhost:11434/v1"
# ============================================================================
# UTILITIES
# ============================================================================
def get_embedding(text, model="nomic-embed-text"):
"""Generate embedding using Ollama"""
data = json.dumps({"model": model, "input": text[:8000]}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=60) as r:
return json.loads(r.read().decode())["data"][0]["embedding"]
except Exception as e:
print(f"Embedding error: {e}", file=sys.stderr)
return None
def make_request(url, data=None, method="GET"):
"""Make HTTP request"""
req = urllib.request.Request(url, method=method)
if data:
req.data = json.dumps(data).encode()
req.add_header("Content-Type", "application/json")
return req
def check_collection():
"""Verify collection exists"""
try:
req = make_request(f"{QDRANT_URL}/collections/{COLLECTION}")
with urllib.request.urlopen(req, timeout=5) as r:
return r.read()
except:
return None
# ============================================================================
# COMMANDS
# ============================================================================
def cmd_list(args):
"""List collection info"""
data = check_collection()
if not data:
print(f"❌ Collection '{COLLECTION}' not found")
sys.exit(1)
info = json.loads(data.decode())["result"]
print(f"\n📚 Collection: {COLLECTION}")
print(f" Status: {info['status']}")
print(f" Points: {info['points_count']:,}")
print(f" Vectors: {info['indexed_vectors_count']:,}")
print(f" Segments: {info['segments_count']}")
print(f" Vector size: {info['config']['params']['vectors']['size']}")
print(f" Distance: {info['config']['params']['vectors']['distance']}")
print(f" Optimizer: {info['optimizer_status']}")
print()
# Show payload schema
print("📋 Payload Schema:")
for field, schema in info.get("payload_schema", {}).items():
if isinstance(schema, dict) and "data_type" in schema:
print(f" - {field}: {schema['data_type']} ({schema.get('points',0):,} points)")
print()
def cmd_count(args):
"""Get document count"""
req = make_request(f"{QDRANT_URL}/collections/{COLLECTION}")
with urllib.request.urlopen(req, timeout=5) as r:
count = json.loads(r.read().decode())["result"]["points_count"]
print(f"{count}")
def cmd_search(args):
"""Search documents"""
embedding = get_embedding(args.query)
if not embedding:
print("❌ Failed to generate embedding")
sys.exit(1)
search_body = {
"vector": embedding,
"limit": args.limit,
"with_payload": True,
"with_vector": False
}
if args.tag:
search_body["filter"] = {"must": [{"key": "tag", "match": {"value": args.tag}}]}
data = json.dumps(search_body).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION}/points/search",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=30) as r:
results = json.loads(r.read().decode())["result"]
except Exception as e:
print(f"❌ Search failed: {e}")
sys.exit(1)
if not results:
print("No results found")
return
print(f"Found {len(results)} results:\n")
for i, r in enumerate(results, 1):
p = r.get("payload", {})
print(f"[{i}] Score: {r['score']:.3f}")
print(f" Tags: {p.get('tag', 'none')}")
text = p.get('text', '')[:args.chars]
if len(p.get('text', '')) > args.chars:
text += "..."
print(f" Text: {text}")
print()
def cmd_store(args):
"""Store a document"""
# Read from file or use text argument
if args.file:
with open(args.file, 'r') as f:
text = f.read()
else:
text = args.text
if not text:
print("❌ No text to store")
sys.exit(1)
embedding = get_embedding(text)
if not embedding:
print("❌ Failed to generate embedding")
sys.exit(1)
# Parse tags
tags = args.tag.split(",") if args.tag else []
sections = args.section.split(",") if args.section else []
point = {
"points": [{
"id": str(uuid.uuid4()),
"vector": embedding,
"payload": {
"text": text,
"tag": tags,
"sections": sections,
"date": datetime.now().strftime("%Y-%m-%d"),
"created_at": datetime.now().isoformat()
}
}]
}
data = json.dumps(point).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true",
data=data,
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=30) as r:
result = json.loads(r.read().decode())
if result.get("status") == "ok":
print(f"✅ Stored document ({len(text)} chars, {len(embedding)}D vector)")
else:
print(f"❌ Store failed: {result}")
sys.exit(1)
except Exception as e:
print(f"❌ Store error: {e}")
sys.exit(1)
def cmd_delete(args):
    """Delete a document by ID"""
    # Qdrant deletes points via POST /points/delete with a list of IDs
    req = make_request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points/delete?wait=true",
        data={"points": [args.id]},
        method="POST"
    )
    try:
        with urllib.request.urlopen(req, timeout=10) as r:
            print(f"✅ Deleted point {args.id}")
    except Exception as e:
        print(f"❌ Delete error: {e}")
        sys.exit(1)
def cmd_export(args):
"""Export all documents to JSON"""
print(f"Exporting {COLLECTION}...", file=sys.stderr)
# Get all points
all_points = []
offset = None
while True:
scroll_body = {"limit": 100, "with_payload": True, "with_vector": False}
if offset:
scroll_body["offset"] = offset
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll",
data=json.dumps(scroll_body).encode(),
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=30) as r:
result = json.loads(r.read().decode())
points = result.get("result", {}).get("points", [])
if not points:
break
all_points.extend(points)
offset = result.get("result", {}).get("next_page_offset")
if not offset:
break
except Exception as e:
print(f"❌ Export error: {e}")
sys.exit(1)
# Format output
output = []
for p in all_points:
output.append({
"id": p["id"],
"payload": p.get("payload", {})
})
if args.output:
with open(args.output, 'w') as f:
json.dump(output, f, indent=2)
print(f"✅ Exported {len(output)} documents to {args.output}")
else:
print(json.dumps(output, indent=2))
def cmd_import(args):
"""Import documents from JSON"""
with open(args.file, 'r') as f:
documents = json.load(f)
print(f"Importing {len(documents)} documents...")
success = 0
for doc in documents:
text = doc.get("payload", {}).get("text", "")
if not text:
continue
embedding = get_embedding(text)
if not embedding:
print(f" ⚠️ Skipping {doc.get('id')}: embedding failed")
continue
point = {
"points": [{
"id": doc.get("id", str(uuid.uuid4())),
"vector": embedding,
"payload": doc.get("payload", {})
}]
}
data = json.dumps(point).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true",
data=data,
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=30) as r:
if json.loads(r.read().decode()).get("status") == "ok":
success += 1
        except Exception:
            pass  # counted as failed; the summary below reports the shortfall
print(f"✅ Imported {success}/{len(documents)} documents")
def cmd_tags(args):
"""List unique tags"""
# Use scroll to get all tags
all_tags = set()
offset = None
while True:
scroll_body = {"limit": 100, "with_payload": True, "with_vector": False}
if offset:
scroll_body["offset"] = offset
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION}/points/scroll",
data=json.dumps(scroll_body).encode(),
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=30) as r:
result = json.loads(r.read().decode())
points = result.get("result", {}).get("points", [])
if not points:
break
for p in points:
tags = p.get("payload", {}).get("tag", [])
if isinstance(tags, list):
all_tags.update(tags)
elif tags:
all_tags.add(tags)
offset = result.get("result", {}).get("next_page_offset")
if not offset:
break
except Exception as e:
print(f"❌ Error: {e}")
sys.exit(1)
print(f"\n🏷️ Unique tags ({len(all_tags)}):")
for tag in sorted(all_tags):
print(f" - {tag}")
print()
# ============================================================================
# MAIN
# ============================================================================
def main():
parser = argparse.ArgumentParser(
description=f"Qdrant_Documents management ({COLLECTION})",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
qd.py list # Show collection stats
qd.py search "docker volumes" # Search documents
qd.py search "query" --tag kubernetes # Filter by tag
qd.py store "text here" --tag "docker" # Store document
qd.py store --file README.md --tag "doc"
qd.py export --output backup.json # Export all
qd.py tags # List all tags
"""
)
subparsers = parser.add_subparsers(dest="cmd", required=True)
# list
subparsers.add_parser("list", help="Show collection info")
# count
subparsers.add_parser("count", help="Get document count")
# search
p_search = subparsers.add_parser("search", help="Search documents")
p_search.add_argument("query", help="Search query")
p_search.add_argument("--tag", help="Filter by tag")
p_search.add_argument("--limit", type=int, default=5)
p_search.add_argument("--chars", type=int, default=200)
# store
p_store = subparsers.add_parser("store", help="Store document")
p_store.add_argument("text", nargs="?", help="Text to store")
p_store.add_argument("--file", help="Read from file")
p_store.add_argument("--tag", help="Comma-separated tags")
p_store.add_argument("--section", help="Comma-separated sections", default="")
# delete
p_delete = subparsers.add_parser("delete", help="Delete by ID")
p_delete.add_argument("id", help="Point ID to delete")
# export
p_export = subparsers.add_parser("export", help="Export to JSON")
p_export.add_argument("--output", "-o", help="Output file")
# import
p_import = subparsers.add_parser("import", help="Import from JSON")
p_import.add_argument("file", help="JSON file to import")
# tags
subparsers.add_parser("tags", help="List unique tags")
args = parser.parse_args()
# Run command
if args.cmd == "list":
cmd_list(args)
elif args.cmd == "count":
cmd_count(args)
elif args.cmd == "search":
cmd_search(args)
elif args.cmd == "store":
cmd_store(args)
elif args.cmd == "delete":
cmd_delete(args)
elif args.cmd == "export":
cmd_export(args)
elif args.cmd == "import":
cmd_import(args)
elif args.cmd == "tags":
cmd_tags(args)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,220 @@
#!/usr/bin/env python3
"""
Scrape web content and store in knowledge_base collection
Usage: scrape_to_kb.py <url> <domain> <path> [--title "Title"] [--subjects "a,b,c"]
"""
import argparse
import sys
import re
import hashlib
import urllib.request
import urllib.error
from html import unescape
from datetime import datetime
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "knowledge_base"
OLLAMA_EMBED_URL = "http://localhost:11434/api/embed"
def fetch_url(url):
"""Fetch URL content"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
req = urllib.request.Request(url, headers=headers)
try:
with urllib.request.urlopen(req, timeout=30) as response:
return response.read().decode('utf-8', errors='ignore')
except Exception as e:
print(f"❌ Error fetching {url}: {e}", file=sys.stderr)
return None
def extract_text(html):
"""Extract clean text from HTML"""
# Remove script and style tags
html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
# Extract title
title_match = re.search(r'<title[^>]*>([^<]*)</title>', html, re.IGNORECASE)
title = title_match.group(1).strip() if title_match else "Untitled"
title = unescape(title)
# Remove nav/header/footer common patterns
html = re.sub(r'<nav[^>]*>.*?</nav>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
html = re.sub(r'<header[^>]*>.*?</header>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
html = re.sub(r'<footer[^>]*>.*?</footer>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
# Convert common block elements to newlines
html = re.sub(r'</(p|div|h[1-6]|li|tr)>', '\n', html, flags=re.IGNORECASE)
html = re.sub(r'<br\s*/?>', '\n', html, flags=re.IGNORECASE)
# Remove all remaining tags
text = re.sub(r'<[^>]+>', ' ', html)
# Clean up whitespace
text = unescape(text)
text = re.sub(r'\n\s*\n', '\n\n', text)
text = re.sub(r'[ \t]+', ' ', text)
text = '\n'.join(line.strip() for line in text.split('\n'))
text = '\n'.join(line for line in text.split('\n') if line)
return title, text
def chunk_text(text, max_chars=2000, overlap=200):
"""Split text into overlapping chunks"""
chunks = []
start = 0
while start < len(text):
end = start + max_chars
# Try to break at sentence or paragraph
if end < len(text):
# Look for paragraph break
para_break = text.rfind('\n\n', start, end)
if para_break > start + 500:
end = para_break
else:
# Look for sentence break
sent_break = max(
text.rfind('. ', start, end),
text.rfind('? ', start, end),
text.rfind('! ', start, end)
)
if sent_break > start + 500:
end = sent_break + 1
chunk = text[start:end].strip()
if len(chunk) > 100: # Skip tiny chunks
chunks.append(chunk)
start = end - overlap
if start >= len(text):
break
return chunks
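# Behavior note (numbers are illustrative): with max_chars=2000 and overlap=200,
# a 5,000-char page yields roughly three chunks, and each chunk re-includes the
# last ~200 chars of its predecessor so sentences spanning a boundary stay
# retrievable from either chunk.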
def get_embedding(text):
"""Generate embedding via Ollama"""
import json
data = {
"model": "nomic-embed-text",
"input": text
}
req = urllib.request.Request(
OLLAMA_EMBED_URL,
data=json.dumps(data).encode(),
headers={"Content-Type": "application/json"},
method="POST"
)
try:
with urllib.request.urlopen(req, timeout=60) as response:
result = json.loads(response.read().decode())
return result.get("embeddings", [None])[0]
except Exception as e:
print(f"❌ Error generating embedding: {e}", file=sys.stderr)
return None
def compute_checksum(text):
"""Compute SHA256 checksum"""
return f"sha256:{hashlib.sha256(text.encode()).hexdigest()}"
def store_in_kb(text, metadata):
"""Store chunk in knowledge_base"""
import json
import uuid
embedding = get_embedding(text)
if not embedding:
return False
point = {
"id": str(uuid.uuid4()),
"vector": embedding,
"payload": metadata
}
url = f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points"
req = urllib.request.Request(
url,
data=json.dumps({"points": [point]}).encode(),
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("status") == "ok"
except Exception as e:
print(f"❌ Error storing: {e}", file=sys.stderr)
return False
def main():
parser = argparse.ArgumentParser(description="Scrape URL to knowledge base")
parser.add_argument("url", help="URL to scrape")
parser.add_argument("domain", help="Knowledge domain (e.g., Python, OpenClaw)")
parser.add_argument("path", help="Hierarchical path (e.g., OpenClaw/Docs/Overview)")
parser.add_argument("--title", help="Override title")
parser.add_argument("--subjects", help="Comma-separated subjects")
parser.add_argument("--category", default="reference", help="Category: reference|tutorial|snippet|troubleshooting|concept")
parser.add_argument("--content-type", default="web_page", help="Content type: web_page|code|markdown|pdf|note")
args = parser.parse_args()
print(f"🔍 Fetching {args.url}...")
html = fetch_url(args.url)
if not html:
sys.exit(1)
print("✂️ Extracting text...")
title, text = extract_text(html)
if args.title:
title = args.title
print(f"📄 Title: {title}")
print(f"📝 Content length: {len(text)} chars")
if len(text) < 200:
print("❌ Content too short, skipping", file=sys.stderr)
sys.exit(1)
print("🧩 Chunking...")
chunks = chunk_text(text)
print(f" {len(chunks)} chunks")
subjects = [s.strip() for s in args.subjects.split(",")] if args.subjects else []
checksum = compute_checksum(text)
date_added = "2026-02-05"
print("💾 Storing chunks...")
stored = 0
for i, chunk in enumerate(chunks):
chunk_metadata = {
"domain": args.domain,
"path": f"{args.path}/chunk-{i+1}",
"subjects": subjects,
"category": args.category,
"content_type": args.content_type,
"title": f"{title} (part {i+1}/{len(chunks)})",
"checksum": checksum,
"source_url": args.url,
"date_added": date_added,
"chunk_index": i + 1,
"total_chunks": len(chunks),
"text_preview": chunk[:200] + "..." if len(chunk) > 200 else chunk
}
if store_in_kb(chunk, chunk_metadata):
stored += 1
print(f" ✓ Chunk {i+1}/{len(chunks)}")
else:
print(f" ✗ Chunk {i+1}/{len(chunks)} failed")
print(f"\n🎉 Stored {stored}/{len(chunks)} chunks in knowledge_base")
print(f" Domain: {args.domain}")
print(f" Path: {args.path}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,189 @@
#!/usr/bin/env python3
"""
Search memories by semantic similarity in Qdrant
Usage: search_memories.py "Query text" [--limit 5] [--filter-tag tag] [--track-access]
Now with access tracking - updates access_count and last_accessed when memories are retrieved.
"""
import argparse
import json
import sys
import urllib.request
from datetime import datetime
import os
QDRANT_URL = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
COLLECTION_NAME = os.getenv("QDRANT_COLLECTION", "kimi_memories")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://127.0.0.1:11434/v1")
def get_embedding(text):
"""Generate embedding using snowflake-arctic-embed2 via Ollama"""
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": text[:8192]
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
return result["data"][0]["embedding"]
except Exception as e:
print(f"Error generating embedding: {e}", file=sys.stderr)
return None
def update_access_stats(point_id, current_payload):
"""Update access_count and last_accessed for a memory"""
# Get current values or defaults
access_count = current_payload.get("access_count", 0) + 1
last_accessed = datetime.now().isoformat()
    # Merge the new fields into the existing payload: POST /points/payload
    # sets (merges) keys, while PUT would overwrite the whole payload and
    # wipe text/tags/etc.
    update_body = {
        "payload": {
            "access_count": access_count,
            "last_accessed": last_accessed
        },
        "points": [point_id]
    }
    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/payload?wait=true",
        data=json.dumps(update_body).encode(),
        headers={"Content-Type": "application/json"},
        method="POST"
    )
try:
with urllib.request.urlopen(req, timeout=5) as response:
result = json.loads(response.read().decode())
return result.get("status") == "ok"
except Exception as e:
# Silently fail - don't break search if update fails
return False
def search_memories(query_vector, limit=5, tag_filter=None, track_access=True):
"""Search memories in Qdrant with optional access tracking"""
search_body = {
"vector": query_vector,
"limit": limit,
"with_payload": True,
"with_vector": False
}
# Add filter if tag specified
if tag_filter:
search_body["filter"] = {
"must": [
{
"key": "tags",
"match": {
"value": tag_filter
}
}
]
}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/search",
data=json.dumps(search_body).encode(),
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
results = result.get("result", [])
# Track access for retrieved memories
if track_access and results:
for r in results:
point_id = r.get("id")
payload = r.get("payload", {})
if point_id:
update_access_stats(point_id, payload)
return results
except Exception as e:
print(f"Error searching memories: {e}", file=sys.stderr)
return []
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Search memories by semantic similarity")
parser.add_argument("query", help="Search query text")
parser.add_argument("--limit", type=int, default=5, help="Number of results (default: 5)")
parser.add_argument("--filter-tag", help="Filter by tag")
parser.add_argument("--json", action="store_true", help="Output as JSON")
parser.add_argument("--no-track", action="store_true", help="Don't update access stats")
args = parser.parse_args()
print(f"Generating query embedding...", file=sys.stderr)
query_vector = get_embedding(args.query)
if query_vector is None:
print("❌ Failed to generate embedding", file=sys.stderr)
sys.exit(1)
print(f"Searching Qdrant...", file=sys.stderr)
results = search_memories(query_vector, args.limit, args.filter_tag, track_access=not args.no_track)
if not results:
print("No matching memories found.")
sys.exit(0)
if args.json:
# JSON output with all metadata
output = []
for r in results:
payload = r["payload"]
output.append({
"id": r.get("id"),
"score": r["score"],
"text": payload.get("text", ""),
"date": payload.get("date", ""),
"tags": payload.get("tags", []),
"importance": payload.get("importance", "medium"),
"confidence": payload.get("confidence", "medium"),
"verified": payload.get("verified", False),
"source_type": payload.get("source_type", "inferred"),
"access_count": payload.get("access_count", 0),
"last_accessed": payload.get("last_accessed", ""),
"expires_at": payload.get("expires_at", None)
})
print(json.dumps(output, indent=2))
else:
# Human-readable output
print(f"\n🔍 Found {len(results)} similar memories:\n")
for i, r in enumerate(results, 1):
payload = r["payload"]
score = r["score"]
text = payload.get("text", "")[:200]
if len(payload.get("text", "")) > 200:
text += "..."
date = payload.get("date", "unknown")
tags = ", ".join(payload.get("tags", []))
importance = payload.get("importance", "medium")
access_count = payload.get("access_count", 0)
verified = "" if payload.get("verified", False) else "?"
print(f"{i}. [{date}] (score: {score:.3f}) [{importance}] {verified}")
print(f" {text}")
if tags:
print(f" Tags: {tags}")
if access_count > 0:
print(f" Accessed: {access_count} times")
print()

View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""Send email via Gmail SMTP with attachment support."""
import smtplib
import json
import sys
import os
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email import encoders
CRED_FILE = "/root/.openclaw/workspace/.gmail_imap.json"
def load_credentials():
with open(CRED_FILE) as f:
return json.load(f)
def send_email(to_email, subject, body, reply_to=None, attachment_path=None):
creds = load_credentials()
smtp_server = "smtp.gmail.com"
smtp_port = 587
msg = MIMEMultipart()
msg['From'] = f"Kimi <{creds['email']}>"
msg['To'] = to_email
msg['Subject'] = subject
if reply_to:
msg['In-Reply-To'] = reply_to
msg['References'] = reply_to
# Attach body
msg.attach(MIMEText(body, 'plain'))
# Attach file if provided
if attachment_path and os.path.exists(attachment_path):
with open(attachment_path, 'rb') as f:
mime_base = MIMEBase('application', 'octet-stream')
mime_base.set_payload(f.read())
encoders.encode_base64(mime_base)
filename = os.path.basename(attachment_path)
mime_base.add_header('Content-Disposition', f'attachment; filename={filename}')
msg.attach(mime_base)
print(f"📎 Attached: {filename}")
with smtplib.SMTP(smtp_server, smtp_port) as server:
server.starttls()
server.login(creds['email'], creds['app_password'])
server.send_message(msg)
print(f"✉️ Sent to {to_email}")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--to", required=True)
parser.add_argument("--subject", required=True)
parser.add_argument("--body", required=True)
parser.add_argument("--reply-to")
parser.add_argument("--attach", help="Path to file to attach")
args = parser.parse_args()
send_email(args.to, args.subject, args.body, args.reply_to, args.attach)

View File

@@ -0,0 +1,20 @@
#!/bin/bash
# Daily Conversation Backup - 7-Day Sliding Window
# Processes last 7 days to catch any missed conversations
SCRIPT_DIR="/root/.openclaw/workspace/skills/qdrant-memory"
LOG_FILE="/var/log/qdrant-daily-backup.log"
echo "==============================================" >> "$LOG_FILE"
echo "7-Day Sliding Window Backup - $(date)" >> "$LOG_FILE"
echo "==============================================" >> "$LOG_FILE"
# Process last 7 days
# GNU date: "N days ago" already counts backwards, so offsets must be
# positive ("-6 days ago" would resolve to a date in the future)
for day_offset in 6 5 4 3 2 1 0; do
    date_str=$(date -d "$day_offset days ago" +%Y-%m-%d)
echo "Processing: $date_str..." >> "$LOG_FILE"
cd "$SCRIPT_DIR" && python3 scripts/daily_conversation_backup.py "$date_str" >> "$LOG_FILE" 2>&1
done
echo "Backup complete at $(date)" >> "$LOG_FILE"
echo "" >> "$LOG_FILE"

View File

@@ -0,0 +1,211 @@
#!/usr/bin/env python3
"""
Smart Parser - BeautifulSoup with CSS selectors for custom extraction
Usage: smart_parser.py <url> --selector "article .content" --domain "Blog" --path "Tech/AI"
"""
import argparse
import sys
import json
import re
from datetime import datetime
from pathlib import Path
from bs4 import BeautifulSoup
import urllib.request
sys.path.insert(0, str(Path(__file__).parent))
from scrape_to_kb import chunk_text, get_embedding, compute_checksum, store_in_kb, fetch_url
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "knowledge_base"
def parse_with_selectors(html, selectors):
"""Extract content using CSS selectors"""
soup = BeautifulSoup(html, 'lxml')
# Default: get title
title_tag = soup.find('title')
title = title_tag.get_text().strip() if title_tag else "Untitled"
results = {
"title": title,
"content": "",
"sections": [],
"metadata": {}
}
    # Honor --remove selectors up front so stripped nodes never reach extraction
    remove_sel = selectors.pop("_remove", None)
    if remove_sel:
        for el in soup.select(remove_sel):
            el.decompose()
    for name, selector in selectors.items():
if name == "_content":
# Main content selector
elements = soup.select(selector)
if elements:
results["content"] = "\n\n".join(el.get_text(separator='\n', strip=True) for el in elements)
elif name == "_title":
# Title override selector
el = soup.select_one(selector)
if el:
results["title"] = el.get_text(strip=True)
elif name.startswith("_"):
# Special selectors
if name == "_code_blocks":
# Extract code separately
code_blocks = soup.select(selector)
results["metadata"]["code_blocks"] = [
{"lang": el.get('class', [''])[0].replace('language-', '').replace('lang-', ''),
"code": el.get_text()}
for el in code_blocks
]
elif name == "_links":
links = soup.select(selector)
results["metadata"]["links"] = [
{"text": el.get_text(strip=True), "href": el.get('href')}
for el in links if el.get('href')
]
else:
# Named section
elements = soup.select(selector)
if elements:
section_text = "\n\n".join(el.get_text(separator='\n', strip=True) for el in elements)
results["sections"].append({"name": name, "content": section_text})
# If no content selector matched, try to auto-extract main content
if not results["content"]:
# Try common content selectors
for sel in ['main', 'article', '[role="main"]', '.content', '.post', '.entry', '#content']:
el = soup.select_one(sel)
if el:
# Remove nav/footer from content
for unwanted in el.find_all(['nav', 'footer', 'aside', 'header']):
unwanted.decompose()
results["content"] = el.get_text(separator='\n', strip=True)
break
# Fallback: body minus nav/header/footer
if not results["content"]:
body = soup.find('body')
if body:
for unwanted in body.find_all(['nav', 'header', 'footer', 'aside', 'script', 'style']):
unwanted.decompose()
results["content"] = body.get_text(separator='\n', strip=True)
return results
def format_extracted(data, include_sections=True):
"""Format extracted data into clean text"""
parts = []
# Title
parts.append(f"# {data['title']}\n")
# Content
if data["content"]:
parts.append(data["content"])
# Sections
if include_sections and data["sections"]:
for section in data["sections"]:
parts.append(f"\n## {section['name']}\n")
parts.append(section["content"])
# Metadata
if data["metadata"].get("code_blocks"):
parts.append("\n\n## Code Examples\n")
for cb in data["metadata"]["code_blocks"]:
lang = cb["lang"] or "text"
parts.append(f"\n```{lang}\n{cb['code']}\n```\n")
return "\n".join(parts)
def main():
parser = argparse.ArgumentParser(description="Smart HTML parser with CSS selectors")
parser.add_argument("url", help="URL to parse")
parser.add_argument("--domain", required=True, help="Knowledge domain")
parser.add_argument("--path", required=True, help="Hierarchical path")
parser.add_argument("--selector", "-s", action='append', nargs=2, metavar=('NAME', 'CSS'),
help="CSS selector (e.g., -s content article -s title h1)")
parser.add_argument("--content-only", action="store_true", help="Only extract main content")
parser.add_argument("--title-selector", help="CSS selector for title")
parser.add_argument("--remove", action='append', help="Selectors to remove")
parser.add_argument("--category", default="reference")
parser.add_argument("--content-type", default="web_page")
parser.add_argument("--subjects", help="Comma-separated subjects")
parser.add_argument("--title", help="Override title")
parser.add_argument("--output", "-o", help="Save to file instead of KB")
args = parser.parse_args()
# Build selectors dict
selectors = {}
if args.selector:
for name, css in args.selector:
selectors[name] = css
if args.content_only:
selectors["_content"] = "main, article, [role='main'], .content, .post, .entry, #content, body"
if args.title_selector:
selectors["_title"] = args.title_selector
if args.remove:
selectors["_remove"] = ", ".join(args.remove)
print(f"🔍 Fetching {args.url}...")
html = fetch_url(args.url)
if not html:
sys.exit(1)
print("🔧 Parsing...")
data = parse_with_selectors(html, selectors)
if args.title:
data["title"] = args.title
text = format_extracted(data)
print(f"📄 Title: {data['title']}")
print(f"📝 Content: {len(text)} chars")
print(f"📊 Sections: {len(data['sections'])}")
if args.output:
with open(args.output, 'w') as f:
f.write(text)
print(f"💾 Saved to {args.output}")
return
if len(text) < 200:
print("❌ Content too short", file=sys.stderr)
sys.exit(1)
chunks = chunk_text(text)
print(f"🧩 Chunks: {len(chunks)}")
subjects = [s.strip() for s in args.subjects.split(",")] if args.subjects else []
checksum = compute_checksum(text)
print("💾 Storing...")
stored = 0
for i, chunk in enumerate(chunks):
chunk_metadata = {
"domain": args.domain,
"path": f"{args.path}/chunk-{i+1}",
"subjects": subjects,
"category": args.category,
"content_type": args.content_type,
"title": f"{data['title']} (part {i+1}/{len(chunks)})",
"checksum": checksum,
"source_url": args.url,
"date_added": "2026-02-05",
"chunk_index": i + 1,
"total_chunks": len(chunks),
"text_preview": chunk[:200] + "..." if len(chunk) > 200 else chunk,
"scraper_type": "smart_parser_bs4",
"extracted_sections": [s["name"] for s in data["sections"]]
}
if store_in_kb(chunk, chunk_metadata):
stored += 1
print(f" ✓ Chunk {i+1}")
print(f"\n🎉 Stored {stored}/{len(chunks)} chunks")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,321 @@
#!/usr/bin/env python3
"""
Hybrid search: knowledge_base first, then web search, store new findings.
Usage: smart_search.py "query" [--domain "Domain"] [--min-kb-score 0.5] [--store-new]
"""
import argparse
import sys
import json
import urllib.request
import urllib.parse
import re
from datetime import datetime
QDRANT_URL = "http://10.0.0.40:6333"
OLLAMA_EMBED_URL = "http://localhost:11434/api/embed"
SEARXNG_URL = "http://10.0.0.8:8888"
KB_COLLECTION = "knowledge_base"
def get_embedding(text):
"""Generate embedding via Ollama"""
data = {
"model": "nomic-embed-text",
"input": text[:1000] # Limit for speed
}
req = urllib.request.Request(
OLLAMA_EMBED_URL,
data=json.dumps(data).encode(),
headers={"Content-Type": "application/json"},
method="POST"
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
return result.get("embeddings", [None])[0]
except Exception as e:
print(f"⚠️ Embedding error: {e}", file=sys.stderr)
return None
def search_knowledge_base(query, domain=None, limit=5, min_score=0.5):
"""Search knowledge base via vector similarity"""
embedding = get_embedding(query)
if not embedding:
return []
search_data = {
"vector": embedding,
"limit": limit,
"with_payload": True
}
# Note: score_threshold filters aggressively; we filter client-side instead
# to show users what scores were returned
if domain:
search_data["filter"] = {
"must": [{"key": "domain", "match": {"value": domain}}]
}
url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points/search"
req = urllib.request.Request(
url,
data=json.dumps(search_data).encode(),
headers={"Content-Type": "application/json"},
method="POST"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
results = result.get("result", [])
# Filter by min_score client-side
return [r for r in results if r.get("score", 0) >= min_score]
except Exception as e:
print(f"⚠️ KB search error: {e}", file=sys.stderr)
return []
def web_search(query, limit=5):
"""Search via SearXNG"""
encoded_query = urllib.parse.quote(query)
url = f"{SEARXNG_URL}/?q={encoded_query}&format=json&safesearch=0"
try:
req = urllib.request.Request(url, headers={"Accept": "application/json"})
with urllib.request.urlopen(req, timeout=15) as response:
data = json.loads(response.read().decode())
return data.get("results", [])[:limit]
except Exception as e:
print(f"⚠️ Web search error: {e}", file=sys.stderr)
return []
def fetch_and_extract(url):
"""Fetch URL and extract clean text"""
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
req = urllib.request.Request(url, headers=headers)
try:
with urllib.request.urlopen(req, timeout=20) as response:
html = response.read().decode('utf-8', errors='ignore')
# Extract title
title_match = re.search(r'<title[^>]*>([^<]*)</title>', html, re.IGNORECASE)
title = title_match.group(1).strip() if title_match else "Untitled"
# Clean HTML
html = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
html = re.sub(r'<style[^>]*>.*?</style>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
html = re.sub(r'<[^>]+>', ' ', html)
text = re.sub(r'\s+', ' ', html).strip()
return title, text[:3000] # Limit content
except Exception as e:
return None, None
def is_substantial(text, min_length=500):
"""Check if content is substantial enough to store"""
return len(text) >= min_length
def is_unique_content(text, kb_results, similarity_threshold=0.8):
"""Check if content is unique compared to existing KB entries"""
if not kb_results:
return True
# Simple check: if any KB result has very similar content, skip
text_lower = text.lower()
for result in kb_results:
payload = result.get("payload", {})
kb_text = payload.get("text_preview", "").lower()
# Check for substantial overlap
if kb_text and len(kb_text) > 100:
# Simple word overlap check
kb_words = set(kb_text.split())
new_words = set(text_lower.split())
if kb_words and new_words:
overlap = len(kb_words & new_words) / len(kb_words)
if overlap > similarity_threshold:
return False
return True
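# Note: this is a cheap word-overlap guard against near-duplicates, not a
# vector comparison; any page covering more than similarity_threshold of an
# existing KB preview's words is skipped.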
def store_in_kb(text, metadata):
"""Store content in knowledge base"""
import uuid
import hashlib
embedding = get_embedding(text[:1000])
if not embedding:
return False
# Add metadata fields
metadata["checksum"] = f"sha256:{hashlib.sha256(text.encode()).hexdigest()[:16]}"
metadata["date_scraped"] = datetime.now().isoformat()
metadata["text_preview"] = text[:300] + "..." if len(text) > 300 else text
point = {
"id": str(uuid.uuid4()),
"vector": embedding,
"payload": metadata
}
url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points"
req = urllib.request.Request(
url,
data=json.dumps({"points": [point]}).encode(),
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
return result.get("status") == "ok"
except Exception as e:
print(f"⚠️ Store error: {e}", file=sys.stderr)
return False
def suggest_domain(query, title, content):
"""Suggest a domain based on query and content"""
query_lower = query.lower()
title_lower = title.lower()
content_lower = content[:500].lower()
# Keyword mapping
domains = {
"Python": ["python", "pip", "django", "flask", "asyncio"],
"JavaScript": ["javascript", "js", "node", "react", "vue", "angular"],
"Linux": ["linux", "ubuntu", "debian", "systemd", "bash", "shell"],
"Networking": ["network", "dns", "tcp", "http", "ssl", "vpn"],
"Docker": ["docker", "container", "kubernetes", "k8s"],
"AI/ML": ["ai", "ml", "machine learning", "llm", "gpt", "model"],
"OpenClaw": ["openclaw"],
"Database": ["database", "sql", "postgres", "mysql", "redis"],
"Security": ["security", "encryption", "auth", "oauth", "jwt"],
"DevOps": ["devops", "ci/cd", "github actions", "jenkins"]
}
combined = query_lower + " " + title_lower + " " + content_lower
for domain, keywords in domains.items():
for kw in keywords:
if kw in combined:
return domain
return "General"
def main():
parser = argparse.ArgumentParser(description="Smart search: KB first, then web, store new")
parser.add_argument("query", help="Search query")
parser.add_argument("--domain", help="Filter KB by domain")
parser.add_argument("--min-kb-score", type=float, default=0.5, help="Minimum KB match score (default: 0.5)")
parser.add_argument("--store-new", action="store_true", help="Automatically store new web findings")
parser.add_argument("--web-limit", type=int, default=3, help="Number of web results to check")
parser.add_argument("--json", action="store_true", help="Output as JSON")
args = parser.parse_args()
results = {
"query": args.query,
"kb_results": [],
"web_results": [],
"stored_count": 0,
"timestamp": datetime.now().isoformat()
}
# Step 1: Search knowledge base
print(f"🔍 Searching knowledge base (min score: {args.min_kb_score})...")
kb_results = search_knowledge_base(args.query, args.domain, limit=5, min_score=args.min_kb_score)
results["kb_results"] = kb_results
if kb_results:
print(f" ✓ Found {len(kb_results)} KB entries")
for r in kb_results:
payload = r.get("payload", {})
score = r.get("score", 0)
title = payload.get('title', 'Untitled')[:50]
source = payload.get('source_url', 'N/A')[:40]
print(f"{title}... (score: {score:.2f}) [{source}...]")
else:
print(f" ✗ No KB matches above threshold ({args.min_kb_score})")
# Step 2: Web search
print(f"\n🌐 Searching web...")
web_results = web_search(args.query, limit=args.web_limit)
results["web_results"] = web_results
if not web_results:
print(f" ✗ No web results")
if args.json:
print(json.dumps(results, indent=2))
return
print(f" ✓ Found {len(web_results)} web results")
# Step 3: Check and optionally store new findings
new_stored = 0
for web_result in web_results:
url = web_result.get("url", "")
title = web_result.get("title", "Untitled")
snippet = web_result.get("content", "")
print(f"\n📄 Checking: {title}")
print(f" URL: {url}")
# Fetch full content
fetched_title, content = fetch_and_extract(url)
if not content:
print(f" ⚠️ Could not fetch content")
continue
title = fetched_title or title
# Check if substantial
if not is_substantial(content):
print(f" ⏭️ Content too short ({len(content)} chars), skipping")
continue
# Check if unique
if not is_unique_content(content, kb_results):
print(f" ⏭️ Similar content already in KB")
continue
print(f" ✓ New substantial content ({len(content)} chars)")
# Auto-store or suggest
if args.store_new:
domain = suggest_domain(args.query, title, content)
            subjects = [s.strip() for s in args.query.lower().split() if len(s) > 3]
            # Build the slug outside the f-string: backslashes inside f-string
            # expressions are a SyntaxError on Python < 3.12
            title_slug = re.sub(r'[^\w\s-]', '', title)[:30]
            metadata = {
                "domain": domain,
                "path": f"{domain}/Web/{title_slug}",
"subjects": subjects,
"category": "reference",
"content_type": "web_page",
"title": title,
"source_url": url,
"date_added": datetime.now().strftime("%Y-%m-%d")
}
if store_in_kb(content, metadata):
print(f" ✅ Stored in KB (domain: {domain})")
new_stored += 1
else:
print(f" ❌ Failed to store")
else:
print(f" 💡 Use --store-new to save this")
results["stored_count"] = new_stored
# Summary
print(f"\n📊 Summary:")
print(f" KB results: {len(kb_results)}")
print(f" Web results checked: {len(web_results)}")
print(f" New items stored: {new_stored}")
if args.json:
print(json.dumps(results, indent=2))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,303 @@
#!/usr/bin/env python3
"""
Conversation Memory Capture - Store conversational turns to Qdrant
This script stores the full conversational context (user messages + AI responses)
as atomic facts in Qdrant, not just summaries written to daily logs.
Usage:
store_conversation.py "User message" "AI response" --date 2026-02-15 --tags "workflow"
store_conversation.py --file conversation.json # Batch mode
Features:
- Stores both user queries and AI responses
- Generates embeddings for semantic search
- Links related turns with conversation IDs
- Extracts facts from responses automatically
"""
import argparse
import json
import os
import sys
import urllib.request
import urllib.error
import uuid
from datetime import datetime
from typing import List, Optional, Dict, Any
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_URL = "http://localhost:11434/v1"
def get_embedding(text: str) -> Optional[List[float]]:
"""Generate embedding using snowflake-arctic-embed2"""
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": text[:8192]
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
return result["data"][0]["embedding"]
except Exception as e:
print(f"Error generating embedding: {e}", file=sys.stderr)
return None
def extract_tags(text: str, date_str: str) -> List[str]:
"""Extract relevant tags from text"""
tags = ["conversation-turn", "atomic-fact", date_str]
text_lower = text.lower()
tag_mappings = {
"youtube": "youtube",
"video": "video",
"workflow": "workflow",
"process": "process",
"qdrant": "qdrant",
"memory": "memory",
"fact": "facts",
"extract": "extraction",
"config": "configuration",
"setting": "settings",
"rule": "rules",
"decision": "decisions",
"preference": "preferences",
"hardware": "hardware",
"security": "security",
"research": "research",
"step": "steps",
"grok": "grok",
"thumbnail": "thumbnail",
"title": "title",
"description": "description",
"seo": "seo",
"tags": "tags",
}
for keyword, tag in tag_mappings.items():
if keyword in text_lower:
tags.append(tag)
return list(set(tags))
def store_turn(
speaker: str,
message: str,
date_str: str,
tags: List[str] = None,
conversation_id: str = None,
turn_number: int = None,
importance: str = "medium"
) -> Optional[str]:
"""Store a single conversational turn"""
embedding = get_embedding(message)
if embedding is None:
return None
point_id = str(uuid.uuid4())
if tags is None:
tags = extract_tags(message, date_str)
payload = {
"text": f"[{speaker}]: {message}",
"date": date_str,
"tags": tags,
"importance": importance,
"source": "conversation",
"source_type": "user" if speaker == "Rob" else "assistant",
"category": "Conversation",
"confidence": "high",
"verified": True,
"created_at": datetime.now().isoformat(),
"access_count": 0,
"last_accessed": datetime.now().isoformat(),
"conversation_id": conversation_id or str(uuid.uuid4()),
"turn_number": turn_number or 0
}
upsert_data = {
"points": [{
"id": point_id,
"vector": embedding,
"payload": payload
}]
}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
data=json.dumps(upsert_data).encode(),
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
if result.get("status") == "ok":
return point_id
except Exception as e:
print(f"Error storing turn: {e}", file=sys.stderr)
return None
def store_conversation_pair(
user_message: str,
ai_response: str,
date_str: str,
tags: List[str] = None,
importance: str = "medium"
) -> tuple:
"""Store both user query and AI response as linked turns"""
conversation_id = str(uuid.uuid4())
user_id = store_turn(
speaker="Rob",
message=user_message,
date_str=date_str,
tags=tags,
conversation_id=conversation_id,
turn_number=1,
importance=importance
)
ai_id = store_turn(
speaker="Kimi",
message=ai_response,
date_str=date_str,
tags=tags,
conversation_id=conversation_id,
turn_number=2,
importance=importance
)
return user_id, ai_id
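# Example (messages are illustrative):
#   user_id, ai_id = store_conversation_pair(
#       "How do I flush Redis to Qdrant?",
#       "Run cron_backup.py; it clears Redis only after a successful write.",
#       "2026-02-15", tags=["memory", "workflow"])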
def extract_facts_from_text(text: str, date_str: str) -> List[Dict[str, Any]]:
"""Extract atomic facts from a text block"""
facts = []
# Split into sentences
sentences = [s.strip() for s in text.replace('. ', '.\n').split('\n') if s.strip()]
for sentence in sentences:
if len(sentence) < 10:
continue
embedding = get_embedding(sentence)
if embedding is None:
continue
point_id = str(uuid.uuid4())
facts.append({
"id": point_id,
"vector": embedding,
"payload": {
"text": sentence[:500],
"date": date_str,
"tags": extract_tags(sentence, date_str),
"importance": "high" if "**" in sentence else "medium",
"source": "fact-extraction",
"source_type": "inferred",
"category": "Extracted Fact",
"confidence": "medium",
"verified": False,
"created_at": datetime.now().isoformat(),
"access_count": 0,
"last_accessed": datetime.now().isoformat()
}
})
return facts
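# Note: the sentence split above is heuristic ('. ' boundaries); extracted
# facts are stored unverified with medium confidence so they can be reviewed
# or pruned later.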
def main():
parser = argparse.ArgumentParser(description="Store conversational turns to Qdrant")
parser.add_argument("user_message", nargs="?", help="User's message/query")
parser.add_argument("ai_response", nargs="?", help="AI's response")
parser.add_argument("--date", default=datetime.now().strftime("%Y-%m-%d"), help="Date (YYYY-MM-DD)")
parser.add_argument("--tags", help="Comma-separated tags")
parser.add_argument("--importance", default="medium", choices=["low", "medium", "high"])
parser.add_argument("--file", help="JSON file with conversation array")
parser.add_argument("--extract-facts", action="store_true", help="Also extract atomic facts from response")
args = parser.parse_args()
tags = args.tags.split(",") if args.tags else None
if args.file:
# Batch mode from JSON file
with open(args.file, 'r') as f:
conversations = json.load(f)
total = 0
for conv in conversations:
user_id, ai_id = store_conversation_pair(
conv["user"],
conv["ai"],
args.date,
tags or conv.get("tags"),
args.importance
)
if user_id and ai_id:
total += 2
print(f"✅ Stored {total} conversation turns")
elif args.user_message and args.ai_response:
# Single pair mode
user_id, ai_id = store_conversation_pair(
args.user_message,
args.ai_response,
args.date,
tags,
args.importance
)
if user_id and ai_id:
print(f"✅ Stored conversation pair")
print(f" User turn: {user_id[:8]}...")
print(f" AI turn: {ai_id[:8]}...")
if args.extract_facts:
facts = extract_facts_from_text(args.ai_response, args.date)
if facts:
# Upload facts
upsert_data = {"points": facts}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
data=json.dumps(upsert_data).encode(),
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
print(f" Extracted {len(facts)} additional facts")
except Exception as e:
print(f" Warning: Could not store extracted facts: {e}")
else:
print("❌ Failed to store conversation")
sys.exit(1)
else:
parser.print_help()
sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,394 @@
#!/usr/bin/env python3
"""
Enhanced memory storage with metadata support and batch upload capability
Usage: store_memory.py "Memory text" [--tags tag1,tag2] [--importance medium]
[--confidence high] [--source user|inferred|external]
[--verified] [--expires 2026-03-01] [--related id1,id2]
[--batch-mode] [--batch-size N]
Features:
- Single or batch memory storage
- Duplicate detection with --replace flag
- Enhanced metadata (importance, confidence, source_type, etc.)
- Access tracking (access_count, last_accessed)
"""
import argparse
import json
import sys
import urllib.request
import urllib.error
import uuid
from datetime import datetime, timedelta
from typing import List, Optional, Dict, Any
QDRANT_URL = "http://10.0.0.40:6333"
COLLECTION_NAME = "kimi_memories"
OLLAMA_URL = "http://localhost:11434/v1"
DEFAULT_BATCH_SIZE = 100
def check_existing(date: str = None) -> Optional[str]:
"""Check if entry already exists for this date"""
if not date:
return None
try:
scroll_data = json.dumps({
"limit": 100,
"with_payload": True,
"filter": {
"must": [{"key": "date", "match": {"value": date}}]
}
}).encode()
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/scroll",
data=scroll_data,
headers={"Content-Type": "application/json"},
method="POST"
)
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
points = result.get("result", {}).get("points", [])
if points:
return points[0]["id"] # Return existing ID
except Exception as e:
print(f"Warning: Could not check existing: {e}", file=sys.stderr)
return None
def get_embedding(text: str) -> Optional[List[float]]:
"""Generate embedding using snowflake-arctic-embed2 via Ollama"""
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": text[:8192] # Limit to 8k chars
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=30) as response:
result = json.loads(response.read().decode())
return result["data"][0]["embedding"]
except Exception as e:
print(f"Error generating embedding: {e}", file=sys.stderr)
return None
def batch_upload_embeddings(texts: List[str]) -> List[Optional[List[float]]]:
"""Generate embeddings for multiple texts in one batch"""
if not texts:
return []
data = json.dumps({
"model": "snowflake-arctic-embed2",
"input": [t[:8192] for t in texts]
}).encode()
req = urllib.request.Request(
f"{OLLAMA_URL}/embeddings",
data=data,
headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=120) as response:
result = json.loads(response.read().decode())
return [d["embedding"] for d in result["data"]]
except Exception as e:
print(f"Error generating batch embeddings: {e}", file=sys.stderr)
return [None] * len(texts)
def upload_points_batch(points: List[Dict[str, Any]], batch_size: int = DEFAULT_BATCH_SIZE) -> tuple:
"""Upload points in batches to Qdrant"""
total = len(points)
uploaded = 0
failed = 0
for i in range(0, total, batch_size):
batch = points[i:i + batch_size]
upsert_data = {"points": batch}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
data=json.dumps(upsert_data).encode(),
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=60) as response:
result = json.loads(response.read().decode())
if result.get("status") == "ok":
uploaded += len(batch)
else:
print(f"Batch upload failed: {result}", file=sys.stderr)
failed += len(batch)
except Exception as e:
print(f"Batch upload error: {e}", file=sys.stderr)
failed += len(batch)
return uploaded, failed
def store_single_memory(
text: str,
embedding: List[float],
tags: List[str] = None,
importance: str = "medium",
date: str = None,
source: str = "conversation",
confidence: str = "high",
source_type: str = "user",
verified: bool = True,
expires_at: str = None,
related_memories: List[str] = None,
replace: bool = False
) -> Optional[str]:
"""Store a single memory in Qdrant with enhanced metadata"""
if date is None:
date = datetime.now().strftime("%Y-%m-%d")
# Check for existing entry on same date
existing_id = check_existing(date=date) if date else None
if existing_id and not replace:
print(f"⚠️ Entry for {date} already exists (ID: {existing_id})")
print(f" Use --replace to overwrite")
return None
# Use existing ID if replacing, otherwise generate new
point_id = existing_id if existing_id else str(uuid.uuid4())
# Build payload with all metadata
payload = {
"text": text,
"date": date,
"tags": tags or [],
"importance": importance,
"source": source,
"confidence": confidence,
"source_type": source_type,
"verified": verified,
"created_at": datetime.now().isoformat(),
"access_count": 0,
"last_accessed": datetime.now().isoformat()
}
# Optional metadata
if expires_at:
payload["expires_at"] = expires_at
if related_memories:
payload["related_memories"] = related_memories
# Qdrant upsert format
upsert_data = {
"points": [{
"id": point_id,
"vector": embedding,
"payload": payload
}]
}
req = urllib.request.Request(
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points?wait=true",
data=json.dumps(upsert_data).encode(),
headers={"Content-Type": "application/json"},
method="PUT"
)
try:
with urllib.request.urlopen(req, timeout=10) as response:
result = json.loads(response.read().decode())
if result.get("status") == "ok":
return point_id
else:
print(f"Qdrant response: {result}", file=sys.stderr)
return None
except urllib.error.HTTPError as e:
error_body = e.read().decode()
print(f"HTTP Error {e.code}: {error_body}", file=sys.stderr)
return None
except Exception as e:
print(f"Error storing memory: {e}", file=sys.stderr)
return None
def store_memories_batch(
memories: List[Dict[str, Any]],
batch_size: int = DEFAULT_BATCH_SIZE
) -> tuple:
"""Store multiple memories in batch"""
if not memories:
return 0, 0
# Generate embeddings for all
texts = [m["text"] for m in memories]
print(f"Generating embeddings for {len(texts)} memories...")
embeddings = batch_upload_embeddings(texts)
# Prepare points
points = []
failed_indices = []
for i, (memory, embedding) in enumerate(zip(memories, embeddings)):
if embedding is None:
failed_indices.append(i)
continue
point_id = str(uuid.uuid4())
date = memory.get("date", datetime.now().strftime("%Y-%m-%d"))
payload = {
"text": memory["text"],
"date": date,
"tags": memory.get("tags", []),
"importance": memory.get("importance", "medium"),
"source": memory.get("source", "conversation"),
"confidence": memory.get("confidence", "high"),
"source_type": memory.get("source_type", "user"),
"verified": memory.get("verified", True),
"created_at": datetime.now().isoformat(),
"access_count": 0,
"last_accessed": datetime.now().isoformat()
}
        # NOTE: User requested NO memory expiration - permanent retention.
        # expires_at is stored as metadata for compatibility, but nothing
        # enforces or prunes it.
if memory.get("expires_at"):
payload["expires_at"] = memory["expires_at"]
if memory.get("related_memories"):
payload["related_memories"] = memory["related_memories"]
points.append({
"id": point_id,
"vector": embedding,
"payload": payload
})
if not points:
return 0, len(memories)
# Upload in batches
print(f"Uploading {len(points)} memories in batches of {batch_size}...")
uploaded, failed_upload = upload_points_batch(points, batch_size)
return uploaded, len(failed_indices) + failed_upload
def parse_date(date_str: str) -> Optional[str]:
"""Validate date format"""
if not date_str:
return None
try:
datetime.strptime(date_str, "%Y-%m-%d")
return date_str
except ValueError:
print(f"Invalid date format: {date_str}. Use YYYY-MM-DD.", file=sys.stderr)
return None
def main():
parser = argparse.ArgumentParser(description="Store memories in Qdrant with metadata")
parser.add_argument("text", nargs="?", help="Memory text to store")
parser.add_argument("--tags", help="Comma-separated tags")
parser.add_argument("--importance", default="medium", choices=["low", "medium", "high"])
parser.add_argument("--date", help="Date in YYYY-MM-DD format")
parser.add_argument("--source", default="conversation", help="Source of the memory")
parser.add_argument("--confidence", default="high", choices=["high", "medium", "low"])
parser.add_argument("--source-type", default="user", choices=["user", "inferred", "external"])
parser.add_argument("--verified", action="store_true", default=True)
parser.add_argument("--expires", help="Expiration date YYYY-MM-DD (NOTE: User prefers permanent retention)")
parser.add_argument("--related", help="Comma-separated related memory IDs")
parser.add_argument("--replace", action="store_true", help="Replace existing entry for the same date")
parser.add_argument("--batch-file", help="JSON file with multiple memories for batch upload")
parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE, help=f"Batch size (default: {DEFAULT_BATCH_SIZE})")
args = parser.parse_args()
# Batch mode
if args.batch_file:
print(f"Batch mode: Loading memories from {args.batch_file}")
try:
with open(args.batch_file, 'r') as f:
memories = json.load(f)
if not isinstance(memories, list):
print("Batch file must contain a JSON array of memories", file=sys.stderr)
sys.exit(1)
print(f"Loaded {len(memories)} memories for batch upload")
uploaded, failed = store_memories_batch(memories, args.batch_size)
print(f"\n{'=' * 50}")
print(f"Batch upload complete:")
print(f" Uploaded: {uploaded}")
print(f" Failed: {failed}")
sys.exit(0 if failed == 0 else 1)
except Exception as e:
print(f"Error loading batch file: {e}", file=sys.stderr)
sys.exit(1)
# Single memory mode
if not args.text:
print("Error: Either provide text argument or use --batch-file", file=sys.stderr)
parser.print_help()
sys.exit(1)
# Parse tags and related memories
tags = [t.strip() for t in args.tags.split(",")] if args.tags else []
related = [r.strip() for r in args.related.split(",")] if args.related else None
# Validate date
date = parse_date(args.date)
if args.date and not date:
sys.exit(1)
print(f"Generating embedding...")
embedding = get_embedding(args.text)
if embedding is None:
print("❌ Failed to generate embedding", file=sys.stderr)
sys.exit(1)
print(f"Storing memory (vector dim: {len(embedding)})...")
point_id = store_single_memory(
text=args.text,
embedding=embedding,
tags=tags,
importance=args.importance,
date=date,
source=args.source,
confidence=args.confidence,
source_type=args.source_type,
verified=args.verified,
expires_at=args.expires,
related_memories=related,
replace=args.replace
)
if point_id:
print(f"✅ Memory stored successfully")
print(f" ID: {point_id}")
print(f" Tags: {tags}")
print(f" Importance: {args.importance}")
print(f" Confidence: {args.confidence}")
print(f" Source: {args.source_type}")
if args.expires:
print(f" Expires: {args.expires}")
else:
print(f"❌ Failed to store memory", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,88 @@
#!/usr/bin/env python3
"""Tagger (optional): generate lightweight tags + title for a conversation chunk.
Design goals:
- Cheap: run on a smaller model (e.g. Minimax 2.5 or any OpenAI-compatible endpoint).
- Portable: fully env-configured.
- Deterministic-ish: JSON output.
This is intentionally optional. The memory system works without it.
Env (OpenAI-compatible):
TAGGER_BASE_URL e.g. https://api.minimax.chat/v1
TAGGER_API_KEY token
TAGGER_MODEL default: minimax-2.5
Usage:
python3 tagger.py --text "..."
Output:
{"title": "...", "tags": ["..."], "entities": ["..."], "category": "..."}
"""
import argparse
import json
import os
import sys
import urllib.request
BASE_URL = os.getenv("TAGGER_BASE_URL", "").rstrip("/")
API_KEY = os.getenv("TAGGER_API_KEY", "")
MODEL = os.getenv("TAGGER_MODEL", "minimax-2.5")
TIMEOUT = int(os.getenv("TAGGER_TIMEOUT", "30"))
SYSTEM = (
"You generate compact metadata for retrieving old conversation context. "
"Return STRICT JSON with keys: title (string), tags (array of short strings), "
"entities (array of short strings), category (string). "
"Tags should be lowercase, hyphenated, <= 4 words each. "
"Prefer 5-12 tags."
)
def call_openai_compat(text: str) -> dict:
if not BASE_URL or not API_KEY:
raise RuntimeError("TAGGER_BASE_URL and TAGGER_API_KEY must be set")
body = {
"model": MODEL,
"messages": [
{"role": "system", "content": SYSTEM},
{"role": "user", "content": text[:12000]},
],
"temperature": 0.2,
"response_format": {"type": "json_object"},
}
req = urllib.request.Request(
f"{BASE_URL}/chat/completions",
data=json.dumps(body).encode("utf-8"),
headers={
"Content-Type": "application/json",
"Authorization": f"Bearer {API_KEY}",
},
)
with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
resp = json.loads(r.read().decode("utf-8"))
content = resp["choices"][0]["message"]["content"]
return json.loads(content)
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--text", required=True)
args = ap.parse_args()
try:
out = call_openai_compat(args.text)
except Exception as e:
print(f"[tagger] error: {e}", file=sys.stderr)
sys.exit(1)
print(json.dumps(out, indent=2, sort_keys=True))
if __name__ == "__main__":
main()
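# --- Example wiring (a sketch; the payload shape is an assumption, not an
# API defined elsewhere in this repo) ---
#
#   from tagger import call_openai_compat
#
#   meta = call_openai_compat(chunk_text)
#   payload = {
#       "text": chunk_text,
#       "title": meta.get("title", ""),
#       "tags": meta.get("tags", []),
#       "entities": meta.get("entities", []),
#       "category": meta.get("category", ""),
#   }
#   # ...merge `payload` into whatever the Qdrant writer stores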

View File

@@ -0,0 +1,33 @@
# Task Queue Skill

Redis-based task queue for background jobs.

## What It Does

Queues and executes tasks via heartbeat worker.

## Commands

```bash
# Add a task
python3 scripts/add_task.py "Check disk space"

# List tasks
python3 scripts/list_tasks.py

# Execute (runs on heartbeat)
python3 scripts/heartbeat_worker.py
```

## Heartbeat Integration

Add to HEARTBEAT.md:

```bash
python3 /path/to/skills/task-queue/scripts/heartbeat_worker.py
```

## Files

- `add_task.py` - Add task to queue
- `list_tasks.py` - View queue status
- `heartbeat_worker.py` - Execute pending tasks
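
## Examples

`add_task.py` supports `--type`, `--priority`, `--message`, `--command`, and `--by` (see the script below). A few illustrative invocations:

```bash
# Jump the queue with a high-priority task
python3 scripts/add_task.py "Restart nginx" --priority high

# Notify task: the worker logs the message
python3 scripts/add_task.py "Send reminder" --type notify --message "Backup finished"

# Command task: the worker runs the shell command directly
python3 scripts/add_task.py "Check disk" --type command --command "df -h /"
```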

View File

@@ -0,0 +1,91 @@
#!/usr/bin/env python3
"""
Add a task to the queue.
Usage: python3 add_task.py "Task description" [options]
"""
import redis
import sys
import time
import os
import argparse
REDIS_HOST = os.environ.get("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379))
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD", None)
def get_redis():
return redis.Redis(
host=REDIS_HOST,
port=REDIS_PORT,
password=REDIS_PASSWORD,
decode_responses=True
)
def generate_task_id():
    # 4 random bytes -> exactly 8 hex chars, enough to disambiguate tasks
    # created within the same second
    return f"task_{int(time.time())}_{os.urandom(4).hex()}"
def add_task(description, task_type="default", priority="medium", created_by="Kimi", message=None, command=None):
r = get_redis()
task_id = generate_task_id()
timestamp = str(int(time.time()))
# Build task data
task_data = {
"id": task_id,
"description": description,
"type": task_type,
"status": "pending",
"created_at": timestamp,
"created_by": created_by,
"priority": priority,
"started_at": "",
"completed_at": "",
"result": ""
}
# Add type-specific fields
if task_type == "notify" and message:
task_data["message"] = message
elif task_type == "command" and command:
task_data["command"] = command
# Store task details
r.hset(f"task:{task_id}", mapping=task_data)
# Add to pending queue
# For priority: high=lpush (front), others=rpush (back)
if priority == "high":
r.lpush("tasks:pending", task_id)
else:
r.rpush("tasks:pending", task_id)
print(f"[ADDED] {task_id}: {description} ({priority}, {task_type})")
return task_id
def main():
parser = argparse.ArgumentParser(description="Add a task to the queue")
parser.add_argument("description", help="Task description")
parser.add_argument("--type", choices=["default", "notify", "command"],
default="default", help="Task type")
parser.add_argument("--priority", choices=["high", "medium", "low"],
default="medium", help="Task priority")
parser.add_argument("--by", default="Kimi", help="Who created the task")
parser.add_argument("--message", help="Message to send (for notify type)")
parser.add_argument("--command", help="Shell command to run (for command type)")
args = parser.parse_args()
task_id = add_task(
args.description,
args.type,
args.priority,
args.by,
args.message,
args.command
)
print(f"Task ID: {task_id}")
if __name__ == "__main__":
main()
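# --- Data layout (for inspection with redis-cli; the task id is illustrative) ---
#
# Each task is a hash plus an entry in a list:
#   task:<id>       hash holding the fields built in add_task() above
#   tasks:pending   list; high priority LPUSHed to the front, the rest RPUSHed
#
#   redis-cli LRANGE tasks:pending 0 -1
#   redis-cli HGETALL task:task_1700000000_1a2b3c4d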

View File

@@ -0,0 +1,443 @@
#!/usr/bin/env python3
"""
Heartbeat worker - GPT-powered task execution.
Sends tasks to Ollama for command generation, executes via SSH.
"""
import redis
import json
import time
import os
import sys
import subprocess
import requests
from datetime import datetime
REDIS_HOST = os.environ.get("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379))
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD", None)
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434")
TASK_LLM_MODEL = os.environ.get("TASK_LLM_MODEL", "kimi-k2.5:cloud")
DEFAULT_TARGET_HOST = os.environ.get("TASK_SSH_HOST", "")
DEFAULT_SSH_USER = os.environ.get("TASK_SSH_USER", "")
DEFAULT_SUDO_PASS = os.environ.get("TASK_SUDO_PASS", "")
def get_redis():
return redis.Redis(
host=REDIS_HOST,
port=REDIS_PORT,
password=REDIS_PASSWORD,
decode_responses=True
)
def generate_task_id():
return f"task_{int(time.time())}_{os.urandom(4).hex()}"
def check_active_task(r):
    """Check if there's already an active task."""
    active = r.lrange("tasks:active", 0, -1)
    if active:
        task_id = active[0]
        task = r.hgetall(f"task:{task_id}")
        # started_at is stored as a string and may be empty, so guard int()
        started_at = int(task.get("started_at") or 0)
        elapsed = time.time() - started_at
        print(f"[BUSY] Task {task_id} active for {elapsed:.0f}s")
        return True
    return False
def get_pending_task(r):
    """Pop the next task from the front of the pending queue."""
    # add_task.py LPUSHes high-priority tasks to the front and RPUSHes the
    # rest to the back, so popping from the front serves high priority first
    # and everything else in FIFO order.
    return r.lpop("tasks:pending")
def clean_json_content(content):
"""Strip markdown code blocks if present."""
cleaned = content.strip()
if cleaned.startswith("```json"):
cleaned = cleaned[7:]
elif cleaned.startswith("```"):
cleaned = cleaned[3:]
if cleaned.endswith("```"):
cleaned = cleaned[:-3]
return cleaned.strip()
def ask_gpt_for_commands(task_description, target_host=None, ssh_user=None, sudo_pass=None):
"""
Send task to Ollama/GPT to generate SSH commands.
Returns dict with commands, expected results, and explanation.
"""
target_host = target_host or DEFAULT_TARGET_HOST
ssh_user = ssh_user or DEFAULT_SSH_USER
sudo_pass = sudo_pass if sudo_pass is not None else DEFAULT_SUDO_PASS
if not target_host or not ssh_user:
raise ValueError("TASK_SSH_HOST and TASK_SSH_USER must be set (or passed explicitly)")
sudo_line = (
f"Sudo password: {sudo_pass}"
if sudo_pass
else "Sudo password: (not provided; avoid sudo unless absolutely necessary)"
)
system_prompt = f"""You have SSH access to {ssh_user}@{target_host}
{sudo_line}
Your job is to generate shell commands to complete the given task.
Respond ONLY with valid JSON in this format:
{{
"commands": [
"ssh -t {ssh_user}@{target_host} 'sudo apt update'",
"ssh -t {ssh_user}@{target_host} 'sudo apt install -y docker.io'"
],
"expected_results": [
"apt updated successfully",
"docker installed and running"
],
"explanation": "Updating packages and installing Docker"
}}
Rules:
- Commands should use ssh -t (allocates TTY for sudo) to execute on the remote host
- Use sudo only when needed
- Keep commands safe and idempotent where possible
- If task is unclear, ask for clarification in explanation
For Docker-related tasks:
- Search Docker Hub for official images (docker.io/library/ or verified publishers)
- Prefer latest stable versions
- Use official images over community when available
- Verify image exists before trying to pull
- Map volumes as specified in the task (e.g., -v /root/html:/usr/share/nginx/html)
"""
user_prompt = f"Task: {task_description}\n\nGenerate the commands to complete this task."
try:
response = requests.post(
f"{OLLAMA_URL}/api/chat",
json={
"model": TASK_LLM_MODEL,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
],
"stream": False,
"format": "json"
},
timeout=120
)
response.raise_for_status()
result = response.json()
content = result.get("message", {}).get("content", "{}")
# Parse the JSON response
try:
cleaned = clean_json_content(content)
gpt_plan = json.loads(cleaned)
return gpt_plan
except json.JSONDecodeError:
# If GPT didn't return valid JSON, wrap the raw response
return {
"commands": [],
"expected_results": [],
"explanation": f"GPT response: {content[:200]}",
"parse_error": "GPT did not return valid JSON"
}
except Exception as e:
return {
"commands": [],
"expected_results": [],
"explanation": f"Failed to get commands from GPT: {e}",
"error": str(e)
}
def execute_ssh_command_with_sudo(command, sudo_pass, timeout=300):
"""
Execute an SSH command with sudo password handling.
Uses -t flag for TTY allocation and handles sudo password prompt.
"""
    try:
        # Ensure the ssh command allocates a TTY so sudo can prompt
        if command.startswith("ssh ") and "-t" not in command:
            command = command.replace("ssh ", "ssh -t ", 1)
        # Drive the command through a pseudo-terminal so we can watch for the
        # sudo password prompt and answer it
        import pty
        import select
master_fd, slave_fd = pty.openpty()
process = subprocess.Popen(
command,
shell=True,
stdin=slave_fd,
stdout=slave_fd,
stderr=slave_fd,
preexec_fn=os.setsid
)
os.close(slave_fd)
output = []
password_sent = False
start_time = time.time()
        while process.poll() is None:
            if time.time() - start_time > timeout:
                process.kill()
                os.close(master_fd)
                return {
                    "success": False,
                    "stdout": "".join(output),
                    "stderr": "Command timed out",
                    "exit_code": -1
                }
ready, _, _ = select.select([master_fd], [], [], 0.1)
if ready:
try:
data = os.read(master_fd, 1024).decode()
output.append(data)
# Check for sudo password prompt
if "password:" in data.lower() or "password for" in data.lower():
if not password_sent:
os.write(master_fd, (sudo_pass + "\n").encode())
password_sent = True
time.sleep(0.5)
except OSError:
break
os.close(master_fd)
stdout = "".join(output)
return {
"success": process.returncode == 0,
"stdout": stdout,
"stderr": "" if process.returncode == 0 else stdout,
"exit_code": process.returncode
}
except Exception as e:
return {
"success": False,
"stdout": "",
"stderr": str(e),
"exit_code": -1
}
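# Design note: a plain subprocess.run() with capture_output would typically
# hang here, because sudo reads the password from the controlling terminal;
# the pty loop above gives the command a terminal and types the password when
# a prompt containing "password" appears. Commands without sudo skip this and
# use the simpler runner below.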
def execute_ssh_command_simple(command, timeout=300):
"""
Execute an SSH command without sudo (simple version).
"""
try:
result = subprocess.run(
command,
shell=True,
capture_output=True,
text=True,
timeout=timeout
)
return {
"success": result.returncode == 0,
"stdout": result.stdout,
"stderr": result.stderr,
"exit_code": result.returncode
}
except subprocess.TimeoutExpired:
return {
"success": False,
"stdout": "",
"stderr": "Command timed out",
"exit_code": -1
}
except Exception as e:
return {
"success": False,
"stdout": "",
"stderr": str(e),
"exit_code": -1
}
def execute_task_with_gpt(task):
"""
Execute task using GPT to generate commands, then run via SSH.
"""
    task_description = task.get("description", "No description")
    # Prefer per-task overrides, falling back to the env-configured defaults
    # rather than hardcoding hosts or credentials in the worker
    target_host = task.get("target_host") or DEFAULT_TARGET_HOST
    ssh_user = task.get("ssh_user") or DEFAULT_SSH_USER
    sudo_pass = task.get("sudo_pass") or DEFAULT_SUDO_PASS
print(f"[GPT] Generating commands for: {task_description}")
# Get commands from GPT
gpt_plan = ask_gpt_for_commands(task_description, target_host, ssh_user, sudo_pass)
if not gpt_plan.get("commands"):
comments = f"GPT failed to generate commands: {gpt_plan.get('explanation', 'Unknown error')}"
return {
"success": False,
"gpt_plan": gpt_plan,
"execution_results": [],
"comments": comments
}
print(f"[GPT] Plan: {gpt_plan.get('explanation', 'No explanation')}")
print(f"[EXEC] Running {len(gpt_plan['commands'])} commands...")
# Execute each command
execution_results = []
any_failed = False
for i, cmd in enumerate(gpt_plan["commands"]):
print(f"[CMD {i+1}] {cmd[:80]}...")
# Check if command uses sudo
if "sudo" in cmd.lower():
result = execute_ssh_command_with_sudo(cmd, sudo_pass)
else:
result = execute_ssh_command_simple(cmd)
execution_results.append({
"command": cmd,
"result": result
})
if not result["success"]:
any_failed = True
print(f"[FAIL] Exit code {result['exit_code']}: {result['stderr'][:100]}")
else:
print(f"[OK] Success")
# Build comments field
if any_failed:
failed_cmds = [r for r in execution_results if not r["result"]["success"]]
comments = f"ERRORS ({len(failed_cmds)} failed):\n"
for r in failed_cmds:
comments += f"- Command: {r['command'][:60]}...\n"
comments += f" Error: {r['result']['stderr'][:200]}\n"
else:
comments = "OK"
return {
"success": not any_failed,
"gpt_plan": gpt_plan,
"execution_results": execution_results,
"comments": comments
}
def execute_simple_task(task):
"""
Execute simple tasks (notify, command) without GPT.
"""
    task_type = task.get("type", "default")
    description = task.get("description", "No description")
    sudo_pass = task.get("sudo_pass") or DEFAULT_SUDO_PASS
if task_type == "notify":
# For now, just log it (messaging handled elsewhere)
return {
"success": True,
"result": f"Notification: {task.get('message', description)}",
"comments": "OK"
}
elif task_type == "command":
# Execute shell command directly
command = task.get("command", "")
if command:
if "sudo" in command.lower():
result = execute_ssh_command_with_sudo(command, sudo_pass)
else:
result = execute_ssh_command_simple(command)
comments = "OK" if result["success"] else f"Error: {result['stderr'][:500]}"
return {
"success": result["success"],
"result": result["stdout"][:500],
"comments": comments
}
else:
return {
"success": False,
"result": "No command specified",
"comments": "ERROR: No command provided"
}
else:
# Default: use GPT
return execute_task_with_gpt(task)
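# Example task hashes as the worker reads them back from Redis (illustrative
# values; the fields are the ones add_task.py writes):
#
#   notify:  {"type": "notify", "description": "Send reminder",
#             "message": "Backup finished", "status": "pending", ...}
#   command: {"type": "command", "description": "Check disk",
#             "command": "df -h /", "status": "pending", ...}
#   default: any other type is routed through execute_task_with_gpt()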
def mark_completed(r, task_id, result_data):
"""Mark task as completed with full result data."""
r.hset(f"task:{task_id}", mapping={
"status": "completed" if result_data["success"] else "failed",
"completed_at": str(int(time.time())),
"result": json.dumps(result_data.get("result", "")),
"comments": result_data.get("comments", "")
})
r.lrem("tasks:active", 0, task_id)
r.lpush("tasks:completed", task_id)
status = "DONE" if result_data["success"] else "FAILED"
print(f"[{status}] {task_id}")
if result_data.get("comments") and result_data["comments"] != "OK":
print(f"[COMMENTS] {result_data['comments'][:200]}")
def mark_failed(r, task_id, error):
"""Mark task as failed."""
r.hset(f"task:{task_id}", mapping={
"status": "failed",
"completed_at": str(int(time.time())),
"result": f"Error: {error}",
"comments": f"Worker error: {error}"
})
r.lrem("tasks:active", 0, task_id)
r.lpush("tasks:completed", task_id)
print(f"[FAILED] {task_id}: {error}")
def main():
r = get_redis()
# Check if already busy
if check_active_task(r):
sys.exit(0)
# Get next pending task
task_id = get_pending_task(r)
if not task_id:
print("[IDLE] No pending tasks")
sys.exit(0)
# Load task details
task = r.hgetall(f"task:{task_id}")
if not task:
print(f"[ERROR] Task {task_id} not found")
sys.exit(1)
# Move to active
r.hset(f"task:{task_id}", mapping={
"status": "active",
"started_at": str(int(time.time()))
})
r.lpush("tasks:active", task_id)
print(f"[START] {task_id}: {task.get('description', 'No description')}")
try:
# Execute the task
result_data = execute_simple_task(task)
mark_completed(r, task_id, result_data)
print(f"[WAKE] Task complete - check comments field for status")
except Exception as e:
mark_failed(r, task_id, str(e))
sys.exit(1)
if __name__ == "__main__":
main()
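# --- Lifecycle (descriptive, matching the code above) ---
#
#   tasks:pending --LPOP--> tasks:active --mark_completed/mark_failed--> tasks:completed
#
# The worker is single-flight: if tasks:active is non-empty it exits at once,
# so at most one task runs per heartbeat tick. A hard-killed worker would
# leave its task in tasks:active until the entry is cleared by hand.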

View File

@@ -0,0 +1,77 @@
#!/usr/bin/env python3
"""
List tasks in the queue - pending, active, and recent completed.
"""
import redis
import os
from datetime import datetime
REDIS_HOST = os.environ.get("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379))
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD", None)
def get_redis():
    return redis.Redis(host=REDIS_HOST, port=REDIS_PORT,
                       password=REDIS_PASSWORD, decode_responses=True)
def format_time(timestamp):
if not timestamp or timestamp == "0":
return "-"
try:
dt = datetime.fromtimestamp(int(timestamp))
return dt.strftime("%H:%M:%S")
    except Exception:
return timestamp
def show_tasks(r, key, title, status_filter=None, limit=10):
task_ids = r.lrange(key, 0, limit - 1)
if not task_ids:
print(f"\n{title}: (empty)")
return
print(f"\n{title}:")
print("-" * 80)
for task_id in task_ids:
task = r.hgetall(f"task:{task_id}")
if not task:
print(f" {task_id}: [missing data]")
continue
status = task.get("status", "?")
desc = task.get("description", "no description")[:50]
priority = task.get("priority", "medium")
created = format_time(task.get("created_at"))
if status_filter and status != status_filter:
continue
print(f" [{status:10}] {task_id} | {priority:6} | {created} | {desc}")
def main():
r = get_redis()
print("=" * 80)
print("TASK QUEUE STATUS")
print("=" * 80)
# Show counts
pending_count = r.llen("tasks:pending")
active_count = r.llen("tasks:active")
completed_count = r.llen("tasks:completed")
print(f"\nCounts: {pending_count} pending | {active_count} active | {completed_count} completed")
# Show pending
show_tasks(r, "tasks:pending", "PENDING TASKS", limit=10)
# Show active
show_tasks(r, "tasks:active", "ACTIVE TASKS")
# Show recent completed
show_tasks(r, "tasks:completed", "RECENT COMPLETED (last 10)", limit=10)
print("\n" + "=" * 80)
if __name__ == "__main__":
main()
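# --- Illustrative output (values invented; layout follows the format strings above) ---
#
#   ================================================================================
#   TASK QUEUE STATUS
#   ================================================================================
#
#   Counts: 2 pending | 0 active | 14 completed
#
#   PENDING TASKS:
#   --------------------------------------------------------------------------------
#     [pending   ] task_1700000000_1a2b3c4d | high   | 09:14:02 | Restart nginx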