Initial commit: workspace setup with skills, memory, config

This commit is contained in:
root
2026-02-10 14:37:49 -06:00
commit d1357c5463
77 changed files with 10822 additions and 0 deletions

View File

@@ -0,0 +1,311 @@
#!/usr/bin/env python3
"""
Log Monitor & Auto-Repair Script
Scans system logs for errors and attempts safe auto-fixes.
Runs daily at 2 AM via cron.
"""
import subprocess
import re
import sys
import os
from datetime import datetime, timedelta
# --- Configuration ---------------------------------------------------------
# Look-back window, in hours, used when querying the systemd journal.
LOG_HOURS = 24
# Where main() writes the plain-text report.
REPORT_FILE = "/tmp/log_monitor_report.txt"
# Directory holding this script; session logs whose path contains it are skipped.
SCRIPT_DIR = os.path.split(os.path.abspath(__file__))[0]
# Regular expressions describing log lines that are noise rather than real
# errors. A line matching ANY of these is dropped before reporting.
EXCLUDE_PATTERNS = [
    # -- Download manager: plain references, not errors --
    r"sabnzbd",
    r"github\.com/sabnzbd",

    # -- The agent's own tool errors, script output and internal text --
    r"functions\.(read|edit|exec) failed.*Missing required parameter",
    r"log_monitor\.py",  # never report on this script itself
    r"SyntaxWarning.*invalid escape sequence",
    r'"type":"thinking"',
    r'"thinking":',
    r"The user has pasted a log of errors",
    r"Let me respond appropriately",
    r"functions\.(read|edit|exec) failed",
    r"agent/embedded.*read tool called without path",
    # NOTE(review): matches any "rs_<digits>" substring, which may be broader
    # than reasoning-signature IDs alone - confirm against real log samples.
    r"rs_\d+",
    r"encrypted_content",
    r"Missing required parameter.*newText",

    # -- Session-log content that merely shows this script being read --
    r"content.*report\.append.*OpenClaw Logs: No errors found",
    r"file_path.*log_monitor\.py",

    # -- Harmless in a container: no GPU / NVIDIA kernel modules --
    r"nvidia",
    r"nvidia-uvm",
    r"nvidia-persistenced",
    r"Failed to find module 'nvidia",
    r"Failed to query NVIDIA devices",

    # -- Harmless in a container: rsyslog cannot reach the kernel log --
    r"rsyslogd.*imklog",
    r"imklog.*cannot open kernel log",
    r"imklog.*failed",
    r"activation of module imklog failed",

    # -- Harmless in a container: PAM modules / session / MOTD --
    r"pam_lastlog\.so",
    r"PAM unable to dlopen",
    r"PAM adding faulty module",
    r"pam_systemd.*Failed to create session",
    r"Failed to start motd-news\.service",
]
# Known error signatures mapped to their remediation.
#
# Key:   regex searched for in the collected log text.
# Value: dict with
#   "fix_cmd"     - shell command template to run, or None when no automatic
#                   fix exists;
#   "description" - human-readable summary template;
#   "alert"       - optional; when True the match is reported for human
#                   attention instead of being auto-fixed.
# "{name}" placeholders in the templates are filled from the regex's capture
# groups by scan_and_fix().
AUTO_FIXES = {
    # A Python import failed because the package is not installed.
    r"ModuleNotFoundError: No module named '([^']+)'": {
        "fix_cmd": "pip install {module}",
        "description": "Install missing Python module: {module}",
    },

    # Permission problems, restricted to paths under /tmp.
    r"Permission denied: (/tmp/[^\s]+)": {
        "fix_cmd": "chmod 755 {path}",
        "description": "Fix permissions on {path}",
    },

    # Full disk: cannot be repaired automatically, escalate to a human.
    r"No space left on device": {
        "fix_cmd": None,
        "description": "CRITICAL: Disk full - manual cleanup required",
        "alert": True,
    },

    # Generic refused connection: some service is probably down.
    r"Connection refused.*:(\d+)": {
        "fix_cmd": None,
        "description": "Service on port {port} may be down - check status",
        "alert": True,
    },

    # Ollama is unreachable.
    r"ollama.*connection.*refused": {
        "fix_cmd": "systemctl restart ollama",
        "description": "Restart ollama service",
    },

    # Redis is unreachable (try the systemd unit first, then the container).
    r"redis.*connection.*refused": {
        "fix_cmd": "systemctl restart redis-server || docker restart redis",
        "description": "Restart Redis service",
    },
}
def should_exclude(line):
    """Tell whether a log line is known noise.

    Returns True when *line* matches at least one regex in EXCLUDE_PATTERNS
    (searched case-insensitively), otherwise False.
    """
    return any(
        re.search(noise_rx, line, re.IGNORECASE) is not None
        for noise_rx in EXCLUDE_PATTERNS
    )
def run_cmd(cmd, timeout=30):
    """Run a shell command and return its output as text.

    Args:
        cmd: Command line, executed through the shell (``shell=True``).
        timeout: Seconds to wait before the command is aborted.

    Returns:
        The command's stdout followed by its stderr. If the command cannot
        be run or times out, the string ``"Command failed: <reason>"`` is
        returned instead; this function never raises.
    """
    try:
        completed = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            # Log files routinely contain bytes that are not valid in the
            # locale encoding. With strict decoding a single such byte
            # raised UnicodeDecodeError and the whole output was replaced
            # by "Command failed: ..."; substitute U+FFFD instead.
            errors="replace",
            timeout=timeout,
        )
    except Exception as exc:
        # Deliberate best-effort boundary: callers treat the return value
        # as plain text and must not crash on a failing helper command.
        return f"Command failed: {exc}"
    return completed.stdout + completed.stderr
def check_redis(host='10.0.0.36', port=6379):
    """Check Redis health with the Python client (redis-cli is not available in the container).

    Args:
        host: Redis server address. Defaults to the previously hard-coded
            10.0.0.36.
        port: Redis server port. Defaults to 6379.

    Returns:
        A one-line status string beginning with "Redis: ". It reports a
        successful ping, a failed ping, a missing ``redis`` module, or the
        first 50 characters of any other error. This function never raises.
    """
    try:
        # Imported lazily so a missing client library is reported as a
        # status line instead of breaking the whole script at import time.
        import redis
    except ImportError:
        return "Redis: ⚠️ redis module not installed, cannot check"

    try:
        client = redis.Redis(host=host, port=port, socket_timeout=5, decode_responses=True)
        if client.ping():
            # The endpoint shown is the one actually used for the connection,
            # so the message cannot drift from the connection arguments.
            return f"Redis: ✅ Connected ({host}:{port})"
        return "Redis: ❌ Ping failed"
    except Exception as e:
        return f"Redis: ❌ Error - {str(e)[:50]}"
def get_journal_errors():
    """Fetch error-priority systemd journal entries from the last LOG_HOURS hours.

    Returns the surviving lines joined by newlines after blank lines and
    lines matched by should_exclude() are removed; an empty string when
    nothing is left.
    """
    window_start = datetime.now() - timedelta(hours=LOG_HOURS)
    since = window_start.strftime("%Y-%m-%d %H:%M:%S")
    raw = run_cmd(f"journalctl --since='{since}' --priority=err --no-pager -q")

    kept = []
    for entry in raw.strip().split('\n'):
        if not entry.strip():
            continue  # blank line
        if should_exclude(entry):
            continue  # known noise
        kept.append(entry)
    return '\n'.join(kept)
def get_cron_errors():
    """Collect cron failure lines from whichever common log files exist.

    Each existing log is grepped (case-insensitive, extended regex) and only
    the last 20 hits are considered. Lines matched by should_exclude() are
    dropped.

    Returns one "=== <path> ===" section per log that still has lines,
    separated by blank lines; an empty string when nothing was found.
    """
    candidate_logs = ("/var/log/cron", "/var/log/syslog", "/var/log/messages")
    sections = []

    for log_path in candidate_logs:
        if not os.path.exists(log_path):
            continue

        cmd = rf"grep -iE 'cron.*error|CRON.*FAILED| exited with ' {log_path} 2>/dev/null | tail -20"
        output = run_cmd(cmd)
        if not output.strip():
            continue

        kept = [
            entry
            for entry in output.strip().split('\n')
            if not should_exclude(entry)
        ]
        if kept:
            sections.append(f"=== {log_path} ===\n" + '\n'.join(kept))

    return "\n\n".join(sections)
def get_openclaw_errors():
    """Check OpenClaw session logs for recent errors.

    Looks under /root/.openclaw/agents for ``*.jsonl`` files modified within
    the last day (``find -mtime -1``) that contain an error keyword, then
    takes the last five error-like lines of each file and filters out noise.

    Returns:
        One "=== <file name> ===" section per file that still has lines after
        filtering, separated by blank lines; an empty string when nothing was
        found.
    """
    # Local import keeps this block self-contained.
    import shlex

    # -E is required: with plain (basic-regex) grep the "|" characters are
    # literal, so the alternation never matched and no file was ever listed.
    find_cmd = (
        r"find /root/.openclaw/agents -name '*.jsonl' -mtime -1 "
        r"-exec grep -lE 'error|Error|FAILED|Traceback' {} \; 2>/dev/null"
    )
    candidates = run_cmd(find_cmd).strip().split("\n")

    # Lines that are just the agent commenting on errors, not errors themselves.
    self_talk = re.compile(r'I (can )?see', re.IGNORECASE)
    meta_talk = re.compile(r'meta and kind of funny', re.IGNORECASE)

    errors = []
    for path in candidates:
        # Skip blanks and anything located under this script's own directory.
        if not path or SCRIPT_DIR in path:
            continue
        # run_cmd() reports failures as text ("Command failed: ..."); never
        # treat such a line as a file to grep.
        if not os.path.isfile(path):
            continue

        # Quote the path so spaces or quotes in a file name cannot break
        # (or inject into) the shell command.
        cmd = f"grep -iE 'error|traceback|failed' {shlex.quote(path)} 2>/dev/null | tail -5"
        output = run_cmd(cmd)
        if not output.strip():
            continue

        kept = [
            line
            for line in output.strip().split('\n')
            if not should_exclude(line)
            and not self_talk.search(line)
            and not meta_talk.search(line)
            # Very long lines are embedded file contents, not error messages.
            and len(line) < 500
        ]
        if kept:
            errors.append(f"=== {os.path.basename(path)} ===\n" + '\n'.join(kept))

    return "\n\n".join(errors)
def scan_and_fix(log_content, source_name):
    """Scan log text for known error signatures and attempt the safe fixes.

    Every regex in AUTO_FIXES is searched (case-insensitively) in
    *log_content*. For each distinct match either an alert is recorded
    (entries flagged ``"alert": True``) or the entry's ``fix_cmd`` is run.

    Placeholder filling: the ``{name}`` placeholders of an entry are listed
    in order of first appearance (description first, then fix_cmd), and the
    i-th placeholder receives the i-th capture group of the regex. Earlier,
    names were chosen purely by group position ("module", "path", ...), so
    single-group patterns using ``{path}`` or ``{port}`` were never filled
    in and e.g. ``chmod 755 {path}`` was executed literally.

    Args:
        log_content: Text to scan (possibly many lines).
        source_name: Label stored in each alert's "source" field.

    Returns:
        A tuple ``(fixes_applied, alerts_needed)``:
        * fixes_applied - list of dicts with keys "description", "command",
          "success" and "result" (first 200 chars of the command output,
          or "OK" when there was none);
        * alerts_needed - list of dicts with keys "error" (the matched
          text), "description" and "source".
    """
    # Local import keeps this block self-contained.
    import shlex

    fixes_applied = []
    alerts_needed = []
    # (description, command) pairs already handled, to avoid repeating work.
    tried_fixes = set()

    for pattern, fix_info in AUTO_FIXES.items():
        description_template = fix_info["description"]
        fix_cmd_template = fix_info.get("fix_cmd")
        needs_alert = fix_info.get("alert", False)

        # Unique placeholder names, in order of first appearance.
        placeholder_names = []
        templates = f"{description_template} {fix_cmd_template or ''}"
        for name in re.findall(r"\{(\w+)\}", templates):
            if name not in placeholder_names:
                placeholder_names.append(name)

        for match in re.finditer(pattern, log_content, re.IGNORECASE):
            description = description_template
            fix_cmd = fix_cmd_template

            for name, value in zip(placeholder_names, match.groups()):
                # An optional group that did not participate yields None.
                value = "" if value is None else value
                token = f"{{{name}}}"
                description = description.replace(token, value)
                if fix_cmd:
                    # The value comes straight from log text, which is
                    # untrusted: quote it before it reaches the shell.
                    fix_cmd = fix_cmd.replace(token, shlex.quote(value))

            # Skip if we already handled this exact fix.
            fix_key = f"{description}:{fix_cmd}"
            if fix_key in tried_fixes:
                continue
            tried_fixes.add(fix_key)

            if needs_alert:
                alerts_needed.append({
                    "error": match.group(0),
                    "description": description,
                    "source": source_name,
                })
            elif fix_cmd:
                print(f"[FIXING] {description}")
                result = run_cmd(fix_cmd)
                # Heuristic: run_cmd() exposes no exit status, so success is
                # judged from the absence of "error"/"failed" in the output.
                lowered = result.lower()
                success = "error" not in lowered and "failed" not in lowered
                fixes_applied.append({
                    "description": description,
                    "command": fix_cmd,
                    "success": success,
                    "result": result[:200] if result else "OK",
                })

    return fixes_applied, alerts_needed
def main():
    """Run all checks, attempt auto-fixes, write the report and print it.

    Steps: Redis health check, then for each log source (systemd journal,
    cron logs, OpenClaw session logs) collect errors and pass them to
    scan_and_fix(). The report is written to REPORT_FILE and echoed to
    stdout.

    Returns:
        Process exit status: 1 when anything still needs human attention
        (an alert was raised or an attempted fix did not succeed), else 0.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    report = [f"=== Log Monitor Report: {timestamp} ===\n"]
    all_fixes = []
    all_alerts = []

    # Service health
    print("Checking service health...")
    redis_status = check_redis()
    report.append(f"\n--- Service Health ---\n{redis_status}")

    # One row per log source:
    # (progress message, collector, source label, heading when errors exist,
    #  line when clean). The three sources were previously copy-pasted.
    sources = (
        ("Checking systemd journal...", get_journal_errors, "journal",
         "\n--- Systemd Journal Errors ---\n",
         "\n--- Systemd Journal: No errors found ---"),
        ("Checking cron logs...", get_cron_errors, "cron",
         "\n--- Cron Errors ---\n",
         "\n--- Cron Logs: No errors found ---"),
        ("Checking OpenClaw logs...", get_openclaw_errors, "openclaw",
         "\n--- OpenClaw Errors ---\n",
         "\n--- OpenClaw Logs: No errors found ---"),
    )
    for progress_msg, collect, source_name, error_heading, clean_line in sources:
        print(progress_msg)
        found = collect()
        if not found:
            report.append(clean_line)
            continue
        # Only the first 2000 characters go into the report; the full text
        # is still scanned for fixable errors.
        report.append(f"{error_heading}{found[:2000]}")
        fixes, alerts = scan_and_fix(found, source_name)
        all_fixes.extend(fixes)
        all_alerts.extend(alerts)

    # Summarize fixes
    report.append(f"\n\n=== FIXES APPLIED: {len(all_fixes)} ===")
    for fix in all_fixes:
        # Both branches used to be the empty string, so succeeded and failed
        # fixes looked identical in the report.
        status = "✅" if fix["success"] else "❌"
        report.append(f"\n{status} {fix['description']}")
        report.append(f" Command: {fix['command']}")
        if not fix["success"]:
            report.append(f" Result: {fix['result']}")

    # Summarize alerts (need human attention)
    if all_alerts:
        report.append(f"\n\n=== ALERTS NEEDING ATTENTION: {len(all_alerts)} ===")
        for alert in all_alerts:
            report.append(f"\n⚠️ {alert['description']}")
            report.append(f" Source: {alert['source']}")
            report.append(f" Error: {alert['error'][:100]}")

    # Save report. The text contains emoji, so pin the encoding rather than
    # depend on the locale cron happens to provide.
    report_text = "\n".join(report)
    try:
        with open(REPORT_FILE, "w", encoding="utf-8") as f:
            f.write(report_text)
    except OSError as exc:
        # Still print the report below instead of losing it entirely.
        print(f"Could not write {REPORT_FILE}: {exc}", file=sys.stderr)

    # Print summary
    print(f"\n{report_text}")

    # Non-zero exit triggers the cron notification. A fix that was attempted
    # but did not succeed is unresolved too, so it must not be reported as
    # "all issues resolved".
    failed_fixes = [fix for fix in all_fixes if not fix["success"]]
    unresolved = len(all_alerts) + len(failed_fixes)
    if unresolved:
        print(f"\n⚠️ {unresolved} issue(s) need human attention")
        return 1

    print("\n✅ Log check complete. All issues resolved or no errors found.")
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())