Initial commit: workspace setup with skills, memory, config

2026-02-10 14:37:49 -06:00
commit d1357c5463
77 changed files with 10822 additions and 0 deletions
--- a/skills/log-monitor/scripts/log_monitor.py
+++ b/skills/log-monitor/scripts/log_monitor.py
@@ -0,0 +1,311 @@
+#!/usr/bin/env python3
+"""
+Log Monitor & Auto-Repair Script
+Scans system logs for errors and attempts safe auto-fixes.
+Runs daily at 2 AM via cron.
+"""
+
+import subprocess
+import re
+import sys
+import os
+from datetime import datetime, timedelta
+
+# Config
+# Look-back window in hours; get_journal_errors() turns it into the
+# `journalctl --since` timestamp.
+LOG_HOURS = 24  # Check last 24 hours
+# Destination of the plain-text report; main() opens it in "w" mode, so each
+# run overwrites the previous report.
+REPORT_FILE = "/tmp/log_monitor_report.txt"
+# Absolute directory containing this script; get_openclaw_errors() skips any
+# session-log path that contains this string.
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Patterns to exclude (noise, not real errors)
+# Each entry is a regular expression. should_exclude() applies them with
+# re.search and re.IGNORECASE, so a pattern matches anywhere in a line and
+# regardless of case; a single match is enough to drop the line.
+# NOTE(review): several entries are subsumed by broader ones (r"sabnzbd" covers
+# the GitHub variant, r"nvidia" covers every other NVIDIA entry, and the bare
+# r"functions\.(read|edit|exec) failed" covers the "Missing required parameter"
+# variant). They are harmless but redundant.
+# NOTE(review): r"rs_\d+" is unanchored, so it also drops any line that merely
+# contains "rs_" followed by a digit (e.g. "users_1") - confirm that is intended.
+EXCLUDE_PATTERNS = [
+    r"sabnzbd",  # Download manager references (not errors)
+    r"github\.com/sabnzbd",  # GitHub repo references
+    r"functions\.(read|edit|exec) failed.*Missing required parameter",  # My own tool errors
+    r"log_monitor\.py",  # Don't report on myself
+    r"SyntaxWarning.*invalid escape sequence",  # My own script warnings
+    r'"type":"thinking"',  # My internal thinking blocks
+    r'"thinking":',  # More thinking content
+    r"The user has pasted a log of errors",  # My own analysis text
+    r"Let me respond appropriately",  # My response planning
+    r"functions\.(read|edit|exec) failed",  # Tool failures in logs
+    r"agent/embedded.*read tool called without path",  # Embedded session errors
+    r"rs_\d+",  # Reasoning signature IDs
+    r"encrypted_content",  # Encrypted thinking blocks
+    r"Missing required parameter.*newText",  # My edit tool errors
+    # Filter session log content showing file reads of this script
+    r"content.*report\.append.*OpenClaw Logs: No errors found",  # My own code appearing in logs
+    r"file_path.*log_monitor\.py",  # File operations on this script
+    # Container-specific harmless errors
+    r"nvidia",  # NVIDIA modules not available in container
+    r"nvidia-uvm",  # NVIDIA UVM module
+    r"nvidia-persistenced",  # NVIDIA persistence daemon
+    r"Failed to find module 'nvidia",  # NVIDIA module load failure
+    r"Failed to query NVIDIA devices",  # No GPU in container
+    r"rsyslogd.*imklog",  # rsyslog kernel log issues (expected in container)
+    r"imklog.*cannot open kernel log",  # Kernel log not available
+    r"imklog.*failed",  # imklog activation failures
+    r"activation of module imklog failed",  # imklog module activation
+    r"pam_lastlog\.so",  # PAM module not in container
+    r"PAM unable to dlopen",  # PAM module load failure
+    r"PAM adding faulty module",  # PAM module error
+    r"pam_systemd.*Failed to create session",  # Session creation (expected in container)
+    r"Failed to start motd-news\.service",  # MOTD news (expected in container)
+]
+
+# Known error patterns and their fixes
+# Maps a regular expression (searched case-insensitively by scan_and_fix) to a
+# dict describing the reaction:
+#   "fix_cmd"     - shell command template to run, or None when no automatic
+#                   fix exists
+#   "description" - human-readable text used in the report
+#   "alert"       - optional; when True the match is reported as an alert and
+#                   no command is run
+# The {placeholder} fields in the templates are meant to be filled from the
+# pattern's capture groups.
+AUTO_FIXES = {
+    # Python module missing
+    # NOTE(review): the module name is taken from log text and installed
+    # unattended; confirm that trusting log content this way is acceptable.
+    r"ModuleNotFoundError: No module named '([^']+)'": {
+        "fix_cmd": "pip install {module}",
+        "description": "Install missing Python module: {module}"
+    },
+    # Permission denied on common paths
+    r"Permission denied: (/tmp/[^\s]+)": {
+        "fix_cmd": "chmod 755 {path}",
+        "description": "Fix permissions on {path}"
+    },
+    # Disk space issues
+    r"No space left on device": {
+        "fix_cmd": None,  # Can't auto-fix, needs human
+        "description": "CRITICAL: Disk full - manual cleanup required",
+        "alert": True
+    },
+    # Connection refused (services down)
+    r"Connection refused.*:(\d+)": {
+        "fix_cmd": None,
+        "description": "Service on port {port} may be down - check status",
+        "alert": True
+    },
+    # Ollama connection issues
+    r"ollama.*connection.*refused": {
+        "fix_cmd": "systemctl restart ollama",
+        "description": "Restart ollama service"
+    },
+    # Redis connection issues
+    r"redis.*connection.*refused": {
+        "fix_cmd": "systemctl restart redis-server || docker restart redis",
+        "description": "Restart Redis service"
+    },
+}
+
+def should_exclude(line):
+    """Return True when *line* matches any noise pattern in EXCLUDE_PATTERNS.
+
+    Matching is case-insensitive and unanchored (``re.search``), so a pattern
+    may hit anywhere inside the line.
+    """
+    return any(
+        re.search(noise, line, re.IGNORECASE) is not None
+        for noise in EXCLUDE_PATTERNS
+    )
+
+def run_cmd(cmd, timeout=30):
+    """Execute *cmd* through the shell and return its combined output.
+
+    Args:
+        cmd: Shell command line, passed to ``subprocess.run`` with shell=True.
+        timeout: Seconds to wait before the run is abandoned.
+
+    Returns:
+        Captured stdout followed by captured stderr, as text. If launching or
+        waiting for the command raises, the string "Command failed: <reason>"
+        is returned instead of propagating the exception. The exit status is
+        not inspected.
+    """
+    try:
+        completed = subprocess.run(
+            cmd,
+            shell=True,
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+        )
+    except Exception as exc:
+        return f"Command failed: {exc}"
+    return "".join((completed.stdout, completed.stderr))
+
+def check_redis(host='10.0.0.36', port=6379):
+    """Check Redis health using Python (redis-cli not available in container).
+
+    Args:
+        host: Redis server address to probe. Defaults to the address this
+            script has always checked.
+        port: Redis TCP port. Defaults to 6379.
+
+    Returns:
+        A one-line status string starting with "Redis: " that reports a
+        successful ping, a failed ping, a missing ``redis`` Python module, or
+        the first 50 characters of any other error. Never raises.
+    """
+    try:
+        # Imported lazily so the rest of the script still runs when the
+        # optional redis package is not installed.
+        import redis
+        r = redis.Redis(host=host, port=port, socket_timeout=5, decode_responses=True)
+        if r.ping():
+            return f"Redis: ✅ Connected ({host}:{port})"
+        return "Redis: ❌ Ping failed"
+    except ImportError:
+        return "Redis: ⚠️ redis module not installed, cannot check"
+    except Exception as e:
+        # Best-effort health probe: any failure becomes a short status line.
+        return f"Redis: ❌ Error - {str(e)[:50]}"
+
+def get_journal_errors():
+    """Collect err-priority systemd journal entries from the last LOG_HOURS hours.
+
+    Returns:
+        The surviving journal lines joined by newlines, or "" when nothing is
+        left after dropping blank lines and lines rejected by should_exclude().
+    """
+    window_start = datetime.now() - timedelta(hours=LOG_HOURS)
+    since = window_start.strftime("%Y-%m-%d %H:%M:%S")
+    raw = run_cmd(f"journalctl --since='{since}' --priority=err --no-pager -q")
+
+    kept = []
+    for entry in raw.strip().split('\n'):
+        if not entry.strip():
+            continue
+        if should_exclude(entry):
+            continue
+        kept.append(entry)
+
+    # Joining an empty list already yields "", which is the no-errors result.
+    return '\n'.join(kept)
+
+def get_cron_errors():
+    """Gather cron failure lines from whichever standard log files exist.
+
+    Each existing file is grepped (case-insensitively) for cron error markers
+    and only the last 20 hits are kept; the grep itself applies no time window.
+    Lines rejected by should_exclude() are dropped.
+
+    Returns:
+        One "=== <path> ===" section per file that still has hits, separated
+        by blank lines, or "" when no file produced anything.
+    """
+    candidates = ("/var/log/cron", "/var/log/syslog", "/var/log/messages")
+    sections = []
+
+    for log_path in candidates:
+        if not os.path.exists(log_path):
+            continue
+
+        # The alternation lives inside the single-quoted grep pattern, so the
+        # only shell pipe in this command is the one feeding tail.
+        grep_cmd = rf"grep -iE 'cron.*error|CRON.*FAILED| exited with ' {log_path} 2>/dev/null | tail -20"
+        hits = run_cmd(grep_cmd)
+        if not hits.strip():
+            continue
+
+        surviving = [ln for ln in hits.strip().split('\n') if not should_exclude(ln)]
+        if surviving:
+            sections.append(f"=== {log_path} ===\n" + '\n'.join(surviving))
+
+    # An empty list joins to "", the same value used for "nothing found".
+    return "\n\n".join(sections)
+
+def get_openclaw_errors():
+    """Check OpenClaw session logs for errors.
+
+    Finds ``*.jsonl`` files under /root/.openclaw/agents modified within the
+    last day (``-mtime -1``) that mention an error marker, then keeps the last
+    five matching lines of each file. Lines are dropped when should_exclude()
+    rejects them, when they look like the agent's own commentary, or when they
+    are 500 characters or longer (bulk file-read content).
+
+    Returns:
+        One "=== <file name> ===" section per file that still has lines,
+        separated by blank lines, or "" when nothing is left.
+    """
+    import shlex  # local import: quote session-log paths before they reach the shell
+
+    # -E is required here: in grep's default (basic) regex syntax an unescaped
+    # '|' is a literal character, so without it the alternation would only
+    # match the literal text "error|Error|FAILED|Traceback" and real errors
+    # would never be discovered.
+    find_cmd = r"find /root/.openclaw/agents -name '*.jsonl' -mtime -1 -exec grep -lE 'error|Error|FAILED|Traceback' {} \; 2>/dev/null"
+    files = run_cmd(find_cmd).strip().split("\n")
+
+    errors = []
+    for f in files:
+        if not f or SCRIPT_DIR in f:  # Skip blanks and my own script's logs
+            continue
+
+        # Get recent errors from each file; the path is shell-quoted so names
+        # containing quotes or spaces cannot break (or inject into) the command.
+        cmd = f"grep -iE 'error|traceback|failed' {shlex.quote(f)} 2>/dev/null | tail -5"
+        output = run_cmd(cmd)
+        if not output.strip():
+            continue
+
+        filtered = [
+            line
+            for line in output.strip().split('\n')
+            # Generic noise filter
+            if not should_exclude(line)
+            # Skip lines that are just me analyzing errors
+            and not re.search(r'I (can )?see', line, re.IGNORECASE)
+            and not re.search(r'meta and kind of funny', line, re.IGNORECASE)
+            # Filter very long content blocks (file reads)
+            and len(line) < 500
+        ]
+        if filtered:
+            errors.append(f"=== {os.path.basename(f)} ===\n" + '\n'.join(filtered))
+
+    return "\n\n".join(errors) if errors else ""
+
+def _fill_placeholders(template, values):
+    """Replace the i-th distinct ``{name}`` placeholder in *template* with values[i]."""
+    names = []
+    for name in re.findall(r"\{(\w+)\}", template):
+        if name not in names:
+            names.append(name)
+    mapping = dict(zip(names, values))
+    # Single pass with a callable replacement: substituted text is inserted
+    # literally and is never re-scanned for further placeholders.
+    return re.sub(
+        r"\{(\w+)\}",
+        lambda m: mapping.get(m.group(1), m.group(0)),
+        template,
+    )
+
+
+def scan_and_fix(log_content, source_name):
+    """Scan log content for known errors and attempt fixes.
+
+    Every pattern in AUTO_FIXES is searched case-insensitively in
+    *log_content*. For each match the pattern's capture groups are substituted
+    into the entry's description and command templates: the first capture
+    group fills the first distinct ``{placeholder}`` of the template, the
+    second group the second, and so on. (Mapping by position rather than by a
+    fixed name list is what makes ``{path}`` and ``{port}`` resolve for
+    single-group patterns.)
+
+    Args:
+        log_content: Text to scan (may span many lines).
+        source_name: Label recorded in each alert's "source" field.
+
+    Returns:
+        A ``(fixes_applied, alerts_needed)`` tuple of lists of dicts.
+        Fix dicts have "description", "command", "success" and "result" keys;
+        alert dicts have "error", "description" and "source" keys.
+        An identical description/command pair is handled only once per call.
+    """
+    import shlex  # local import: quote log-derived values placed into shell commands
+
+    fixes_applied = []
+    alerts_needed = []
+
+    # Track which fixes we've already tried (avoid duplicates)
+    tried_fixes = set()
+
+    for pattern, fix_info in AUTO_FIXES.items():
+        for match in re.finditer(pattern, log_content, re.IGNORECASE):
+            # An optional group that did not participate is None; use "".
+            values = ["" if group is None else group for group in match.groups()]
+
+            description = _fill_placeholders(fix_info["description"], values)
+            fix_cmd = fix_info.get("fix_cmd")
+            needs_alert = fix_info.get("alert", False)
+
+            if fix_cmd:
+                # Captured text comes straight from logs, so it is shell-quoted
+                # before being placed into a command run with shell=True.
+                fix_cmd = _fill_placeholders(
+                    fix_cmd, [shlex.quote(value) for value in values]
+                )
+
+            # Skip if we already tried this exact fix
+            fix_key = f"{description}:{fix_cmd}"
+            if fix_key in tried_fixes:
+                continue
+            tried_fixes.add(fix_key)
+
+            if needs_alert:
+                alerts_needed.append({
+                    "error": match.group(0),
+                    "description": description,
+                    "source": source_name
+                })
+            elif fix_cmd:
+                # Attempt the fix
+                print(f"[FIXING] {description}")
+                result = run_cmd(fix_cmd)
+                # Heuristic: the command's output is searched for failure
+                # words; the exit status is not available from run_cmd.
+                lowered = result.lower()
+                success = "error" not in lowered and "failed" not in lowered
+
+                fixes_applied.append({
+                    "description": description,
+                    "command": fix_cmd,
+                    "success": success,
+                    "result": result[:200] if result else "OK"
+                })
+
+    return fixes_applied, alerts_needed
+
+def main():
+    """Run all checks, write the report to REPORT_FILE, and return an exit code.
+
+    Steps: probe Redis, then collect errors from the systemd journal, the cron
+    logs and the OpenClaw session logs. Each non-empty result is added to the
+    report (truncated to 2000 characters) and passed in full to scan_and_fix().
+    The report is written as UTF-8 and echoed to stdout.
+
+    Returns:
+        1 when any alert needs human attention or any attempted auto-fix
+        failed (so cron can notify), otherwise 0.
+    """
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    report = [f"=== Log Monitor Report: {timestamp} ===\n"]
+
+    all_fixes = []
+    all_alerts = []
+
+    # Check service health
+    print("Checking service health...")
+    redis_status = check_redis()
+    report.append(f"\n--- Service Health ---\n{redis_status}")
+
+    # NOTE: the three sections below are deliberately kept as explicit
+    # report.append(...) calls; EXCLUDE_PATTERNS contains a pattern that
+    # matches this source text when it shows up in session logs.
+
+    # Check systemd journal
+    print("Checking systemd journal...")
+    journal_errors = get_journal_errors()
+    if journal_errors:
+        report.append(f"\n--- Systemd Journal Errors ---\n{journal_errors[:2000]}")
+        fixes, alerts = scan_and_fix(journal_errors, "journal")
+        all_fixes.extend(fixes)
+        all_alerts.extend(alerts)
+    else:
+        report.append("\n--- Systemd Journal: No errors found ---")
+
+    # Check cron logs
+    print("Checking cron logs...")
+    cron_errors = get_cron_errors()
+    if cron_errors:
+        report.append(f"\n--- Cron Errors ---\n{cron_errors[:2000]}")
+        fixes, alerts = scan_and_fix(cron_errors, "cron")
+        all_fixes.extend(fixes)
+        all_alerts.extend(alerts)
+    else:
+        report.append("\n--- Cron Logs: No errors found ---")
+
+    # Check OpenClaw logs
+    print("Checking OpenClaw logs...")
+    oc_errors = get_openclaw_errors()
+    if oc_errors:
+        report.append(f"\n--- OpenClaw Errors ---\n{oc_errors[:2000]}")
+        fixes, alerts = scan_and_fix(oc_errors, "openclaw")
+        all_fixes.extend(fixes)
+        all_alerts.extend(alerts)
+    else:
+        report.append("\n--- OpenClaw Logs: No errors found ---")
+
+    # Summarize fixes
+    report.append(f"\n\n=== FIXES APPLIED: {len(all_fixes)} ===")
+    for fix in all_fixes:
+        status = "✅" if fix["success"] else "❌"
+        report.append(f"\n{status} {fix['description']}")
+        report.append(f"   Command: {fix['command']}")
+        if not fix["success"]:
+            report.append(f"   Result: {fix['result']}")
+
+    # Summarize alerts (need human attention)
+    if all_alerts:
+        report.append(f"\n\n=== ALERTS NEEDING ATTENTION: {len(all_alerts)} ===")
+        for alert in all_alerts:
+            report.append(f"\n⚠️  {alert['description']}")
+            report.append(f"   Source: {alert['source']}")
+            report.append(f"   Error: {alert['error'][:100]}")
+
+    # Save report. The text contains emoji, so the encoding is pinned rather
+    # than left to the locale of whatever environment cron provides.
+    report_text = "\n".join(report)
+    with open(REPORT_FILE, "w", encoding="utf-8") as f:
+        f.write(report_text)
+
+    # Print summary
+    print(f"\n{report_text}")
+
+    # Return non-zero if anything is left unhandled (for cron notification).
+    # A fix that was attempted but failed counts as unhandled too; otherwise
+    # the run would claim that all issues were resolved.
+    failed_fixes = [fix for fix in all_fixes if not fix["success"]]
+    if all_alerts or failed_fixes:
+        if all_alerts:
+            print(f"\n⚠️  {len(all_alerts)} issue(s) need human attention")
+        if failed_fixes:
+            print(f"\n⚠️  {len(failed_fixes)} auto-fix(es) failed")
+        return 1
+
+    print("\n✅ Log check complete. All issues resolved or no errors found.")
+    return 0
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    exit_status = main()
+    sys.exit(exit_status)