Initial commit: workspace setup with skills, memory, config
This commit is contained in:
60
skills/log-monitor/SKILL.md
Normal file
60
skills/log-monitor/SKILL.md
Normal file
@@ -0,0 +1,60 @@
|
||||
# Log Monitor Skill
|
||||
|
||||
Automatic log scanning and error repair for OpenClaw/agent systems.
|
||||
|
||||
## Purpose
|
||||
|
||||
Runs daily at 2 AM to:
|
||||
1. Scan system logs (journald, cron, OpenClaw) for errors
|
||||
2. Attempt safe auto-fixes for known issues
|
||||
3. Report unhandled errors needing human attention
|
||||
|
||||
## Auto-Fixes Supported
|
||||
|
||||
| Error Pattern | Fix Action |
|
||||
|---------------|------------|
|
||||
| Missing Python module (`ModuleNotFoundError`) | `pip install <module>` |
|
||||
| Permission denied on temp files | `chmod 755 <path>` |
|
||||
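| Ollama connection issues | `systemctl restart ollama` |
| Redis connection issues | `systemctl restart redis-server`, falling back to `docker restart redis` |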
|
||||
| Disk full | Alert only (requires manual cleanup) |
|
||||
| Service down (connection refused) | Alert only (investigate first) |
|
||||
|
||||
## Usage
|
||||
|
||||
### Manual Run
|
||||
```bash
|
||||
cd /root/.openclaw/workspace/skills/log-monitor/scripts
|
||||
python3 log_monitor.py
|
||||
```
|
||||
|
||||
### View Latest Report
|
||||
```bash
|
||||
cat /tmp/log_monitor_report.txt
|
||||
```
|
||||
|
||||
### Cron Schedule
|
||||
Runs daily at 2:00 AM via `openclaw cron`.
|
||||
|
||||
## Adding New Auto-Fixes
|
||||
|
||||
Edit `log_monitor.py` and add an entry to the `AUTO_FIXES` dictionary:
|
||||
|
||||
```python
|
||||
AUTO_FIXES = {
|
||||
r"your-regex-pattern-here": {
|
||||
"fix_cmd": "command-to-run {placeholder}",
|
||||
"description": "Human-readable description with {placeholder}"
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
Use `{module}`, `{path}`, `{port}`, `{service}` as capture group placeholders.
|
||||
|
||||
Set `"alert": True` for issues that should notify you but not auto-fix.
|
||||
|
||||
## Safety
|
||||
|
||||
- Only "safe" fixes are automated (package installs, restarts, permissions)
|
||||
- Critical issues (disk full, service down) alert but don't auto-fix
|
||||
- All actions are logged to `/tmp/log_monitor_report.txt`
|
||||
- The cron job exits with code 1 if human attention is needed (this triggers a notification)
|
||||
311
skills/log-monitor/scripts/log_monitor.py
Executable file
311
skills/log-monitor/scripts/log_monitor.py
Executable file
@@ -0,0 +1,311 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Log Monitor & Auto-Repair Script
|
||||
Scans system logs for errors and attempts safe auto-fixes.
|
||||
Runs daily at 2 AM via cron.
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# Configuration
LOG_HOURS = 24  # Look-back window, in hours, used for the journal query
REPORT_FILE = "/tmp/log_monitor_report.txt"  # main() rewrites this file on every run
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))  # Directory containing this script
|
||||
|
||||
# Noise filters: should_exclude() drops any log line that matches one of these
# regexes (searched anywhere in the line, case-insensitively).
EXCLUDE_PATTERNS = [
    # --- Unrelated tooling ---
    r"sabnzbd",  # Download manager references (not errors)
    r"github\.com/sabnzbd",  # GitHub repo references (already covered by the entry above)

    # --- The agent's own tool calls, reasoning and analysis text ---
    r"functions\.(read|edit|exec) failed.*Missing required parameter",  # Own tool errors
    r"log_monitor\.py",  # Don't report on this script itself
    r"SyntaxWarning.*invalid escape sequence",  # Warnings raised by this script
    r'"type":"thinking"',  # Internal thinking blocks
    r'"thinking":',  # More thinking content
    r"The user has pasted a log of errors",  # Own analysis text
    r"Let me respond appropriately",  # Own response planning
    r"functions\.(read|edit|exec) failed",  # Tool failures in logs
    r"agent/embedded.*read tool called without path",  # Embedded session errors
    # NOTE(review): unanchored and case-insensitive, so this also matches text
    # such as "errors_12"; confirm that is acceptable.
    r"rs_\d+",  # Reasoning signature IDs
    r"encrypted_content",  # Encrypted thinking blocks
    r"Missing required parameter.*newText",  # Own edit tool errors

    # --- Session-log content that merely shows this script being read ---
    r"content.*report\.append.*OpenClaw Logs: No errors found",  # Own code appearing in logs
    r"file_path.*log_monitor\.py",  # File operations on this script

    # --- Container-specific harmless errors ---
    r"nvidia",  # NVIDIA modules not available in container
    r"nvidia-uvm",  # NVIDIA UVM module (already covered by r"nvidia")
    r"nvidia-persistenced",  # NVIDIA persistence daemon (already covered by r"nvidia")
    r"Failed to find module 'nvidia",  # NVIDIA module load failure
    r"Failed to query NVIDIA devices",  # No GPU in container
    r"rsyslogd.*imklog",  # rsyslog kernel log issues (expected in container)
    r"imklog.*cannot open kernel log",  # Kernel log not available
    r"imklog.*failed",  # imklog activation failures
    r"activation of module imklog failed",  # imklog module activation
    r"pam_lastlog\.so",  # PAM module not in container
    r"PAM unable to dlopen",  # PAM module load failure
    r"PAM adding faulty module",  # PAM module error
    r"pam_systemd.*Failed to create session",  # Session creation (expected in container)
    r"Failed to start motd-news\.service",  # MOTD news (expected in container)
]
|
||||
|
||||
# Known error signatures, keyed by regex. Each entry provides:
#   "fix_cmd"     - shell command to run, or None when nothing can be automated
#   "description" - human-readable text used in the report
#   "alert"       - optional; True means "report to a human, do not run anything"
# scan_and_fix() fills {placeholders} from the regex capture groups.
AUTO_FIXES = {
    # Python module missing
    r"ModuleNotFoundError: No module named '([^']+)'": {
        "fix_cmd": "pip install {module}",
        "description": "Install missing Python module: {module}",
    },
    # Permission denied on paths under /tmp
    r"Permission denied: (/tmp/[^\s]+)": {
        "fix_cmd": "chmod 755 {path}",
        "description": "Fix permissions on {path}",
    },
    # Disk space issues: cannot be auto-fixed, needs a human
    r"No space left on device": {
        "fix_cmd": None,
        "description": "CRITICAL: Disk full - manual cleanup required",
        "alert": True,
    },
    # Connection refused (services down): alert only
    r"Connection refused.*:(\d+)": {
        "fix_cmd": None,
        "description": "Service on port {port} may be down - check status",
        "alert": True,
    },
    # Ollama connection issues
    r"ollama.*connection.*refused": {
        "fix_cmd": "systemctl restart ollama",
        "description": "Restart ollama service",
    },
    # Redis connection issues
    r"redis.*connection.*refused": {
        "fix_cmd": "systemctl restart redis-server || docker restart redis",
        "description": "Restart Redis service",
    },
}
|
||||
|
||||
def should_exclude(line):
    """Tell whether a log line is known noise.

    Returns True as soon as any regex in EXCLUDE_PATTERNS is found anywhere
    in *line* (case-insensitive search), otherwise False.
    """
    return any(
        re.search(noise, line, re.IGNORECASE) is not None
        for noise in EXCLUDE_PATTERNS
    )
|
||||
|
||||
def run_cmd(cmd, timeout=30):
    """Execute *cmd* through the shell and return everything it printed.

    Args:
        cmd: Shell command line (run with shell=True).
        timeout: Seconds to wait before giving up.

    Returns:
        Captured stdout followed by captured stderr, as text. If the command
        cannot be run or times out, the string "Command failed: <reason>" is
        returned instead of raising. The exit status is not reported.
    """
    try:
        completed = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except Exception as exc:
        # Best effort by design: callers treat the message as ordinary output.
        return f"Command failed: {exc}"
    return completed.stdout + completed.stderr
|
||||
|
||||
def check_redis(host="10.0.0.36", port=6379, timeout=5):
    """Probe a Redis server with PING and describe the outcome in one line.

    The check is done from Python because redis-cli is not available in the
    container.

    Args:
        host: Redis host to contact. Defaults to the previously hard-coded
            address, so existing callers behave the same.
        port: Redis TCP port.
        timeout: Socket timeout in seconds passed to the client.

    Returns:
        A status string starting with "Redis: " for the report. This function
        never raises: a missing ``redis`` package and any connection error
        are both reported in the returned text.
    """
    try:
        import redis
    except ImportError:
        return "Redis: ⚠️ redis module not installed, cannot check"

    try:
        client = redis.Redis(
            host=host, port=port, socket_timeout=timeout, decode_responses=True
        )
        if client.ping():
            return f"Redis: ✅ Connected ({host}:{port})"
        return "Redis: ❌ Ping failed"
    except Exception as e:
        # Keep the report compact: only the first 50 characters of the error.
        return f"Redis: ❌ Error - {str(e)[:50]}"
|
||||
|
||||
def get_journal_errors():
    """Collect error-priority systemd journal entries from the last LOG_HOURS hours.

    Returns:
        The journalctl output with blank lines and lines matched by
        should_exclude() removed, joined by newlines; "" when nothing is left.
    """
    window_start = datetime.now() - timedelta(hours=LOG_HOURS)
    since = window_start.strftime("%Y-%m-%d %H:%M:%S")
    raw = run_cmd(f"journalctl --since='{since}' --priority=err --no-pager -q")

    kept = []
    for entry in raw.strip().split('\n'):
        if not entry.strip():
            continue
        if should_exclude(entry):
            continue
        kept.append(entry)

    # Joining an empty list already yields "".
    return '\n'.join(kept)
|
||||
|
||||
def get_cron_errors():
    """Collect cron-related error lines from the usual syslog locations.

    Each existing file among /var/log/cron, /var/log/syslog and
    /var/log/messages is grepped, and only the last 20 hits are kept.

    Returns:
        One "=== <path> ===" section per file that still has lines after
        noise filtering, separated by blank lines; "" when there are none.
    """
    sections = []

    for log_path in ("/var/log/cron", "/var/log/syslog", "/var/log/messages"):
        if not os.path.exists(log_path):
            continue

        # The alternation bars are part of the quoted grep -E pattern, not shell pipes.
        output = run_cmd(
            rf"grep -iE 'cron.*error|CRON.*FAILED| exited with ' {log_path} 2>/dev/null | tail -20"
        )
        if not output.strip():
            continue

        kept = [
            entry
            for entry in output.strip().split('\n')
            if not should_exclude(entry)
        ]
        if kept:
            sections.append(f"=== {log_path} ===\n" + '\n'.join(kept))

    return "\n\n".join(sections)
|
||||
|
||||
def get_openclaw_errors():
    """Collect recent error lines from OpenClaw session logs.

    Looks for ``*.jsonl`` files under /root/.openclaw/agents modified within
    the last day that mention an error, then keeps the last five matching
    lines of each file after noise filtering.

    Returns:
        One "=== <file name> ===" section per file that still has lines after
        filtering, separated by blank lines; "" when there are none.
    """
    import shlex  # local import: only needed to quote file names for the shell

    # -E is required here: in grep's default (basic) syntax a bare "|" is a
    # literal character, so without it the alternation never matched and no
    # file was ever listed.
    find_cmd = (
        r"find /root/.openclaw/agents -name '*.jsonl' -mtime -1 "
        r"-exec grep -lE 'error|Error|FAILED|Traceback' {} \; 2>/dev/null"
    )
    candidates = run_cmd(find_cmd).strip().split("\n")

    errors = []
    for path in candidates:
        # Skip the empty entry produced by empty output, and this script's own logs.
        if not path or SCRIPT_DIR in path:
            continue

        # Quote the file name so spaces or quotes in it cannot break the command.
        output = run_cmd(
            f"grep -iE 'error|traceback|failed' {shlex.quote(path)} 2>/dev/null | tail -5"
        )
        if not output.strip():
            continue

        filtered = [
            line
            for line in output.strip().split('\n')
            if not should_exclude(line)
            # Skip lines that are just the agent analysing earlier errors.
            and not re.search(r'I (can )?see', line, re.IGNORECASE)
            and not re.search(r'meta and kind of funny', line, re.IGNORECASE)
            # Very long lines are embedded file contents, not log messages.
            and len(line) < 500
        ]
        if filtered:
            errors.append(f"=== {os.path.basename(path)} ===\n" + '\n'.join(filtered))

    return "\n\n".join(errors)
|
||||
|
||||
# Placeholder names historically tied to capture-group positions 0..3.
_LEGACY_PLACEHOLDERS = ("module", "path", "port", "service")

# A "{name}" placeholder; "${name}" is left alone so shell variables survive.
_PLACEHOLDER_RE = r"(?<!\$)\{(\w+)\}"


def _placeholder_values(match, templates):
    """Map the placeholder names used in *templates* to text captured by *match*."""
    wanted = []
    for template in templates:
        for name in re.findall(_PLACEHOLDER_RE, template or ""):
            if name not in wanted:
                wanted.append(name)

    # Named groups bind by name first.
    values = {
        name: text
        for name, text in match.groupdict().items()
        if text is not None and name in wanted
    }
    consumed = {match.re.groupindex[name] for name in values}

    # Then the positional convention (group 0 -> module, 1 -> path, ...).
    spare = []
    for index, text in enumerate(match.groups()):
        if text is None or index + 1 in consumed:
            continue  # unmatched optional group, or already bound by name
        if index < len(_LEGACY_PLACEHOLDERS):
            legacy = _LEGACY_PLACEHOLDERS[index]
        else:
            legacy = f"group{index}"
        if legacy in wanted and legacy not in values:
            values[legacy] = text
        else:
            spare.append(text)

    # Finally, hand leftover captures to still-unfilled placeholders in order,
    # so a single-group pattern can use {path} or {port}.
    for name in wanted:
        if name not in values and spare:
            values[name] = spare.pop(0)
    return values


def _render_template(template, values, quote=False):
    """Substitute known placeholders in *template*, shell-quoting them if *quote*."""
    import shlex  # local import keeps this block self-contained

    def substitute(found):
        name = found.group(1)
        if name not in values:
            return found.group(0)  # unknown placeholder: leave untouched
        return shlex.quote(values[name]) if quote else values[name]

    return re.sub(_PLACEHOLDER_RE, substitute, template)


def scan_and_fix(log_content, source_name):
    """Scan log text for known error signatures and act on them.

    Every regex in AUTO_FIXES is searched case-insensitively in
    *log_content*. Entries flagged ``"alert": True`` are only recorded;
    entries with a ``fix_cmd`` are executed through run_cmd(). An identical
    description/command pair is handled once per call.

    Captured text comes from log files, so it is shell-quoted before being
    placed into a command. Descriptions receive the raw text.

    Args:
        log_content: Log text to scan.
        source_name: Label stored in each alert's "source" field.

    Returns:
        A ``(fixes_applied, alerts_needed)`` tuple. Each fix is a dict with
        "description", "command", "success" and "result" (at most the first
        200 characters of output, or "OK"). Each alert is a dict with
        "error", "description" and "source".
    """
    fixes_applied = []
    alerts_needed = []

    # Description/command pairs already handled, to avoid duplicates.
    tried_fixes = set()

    for pattern, fix_info in AUTO_FIXES.items():
        needs_alert = fix_info.get("alert", False)

        for match in re.finditer(pattern, log_content, re.IGNORECASE):
            fix_cmd = fix_info.get("fix_cmd")
            values = _placeholder_values(match, (fix_cmd, fix_info["description"]))

            description = _render_template(fix_info["description"], values)
            if fix_cmd:
                fix_cmd = _render_template(fix_cmd, values, quote=True)

            fix_key = f"{description}:{fix_cmd}"
            if fix_key in tried_fixes:
                continue
            tried_fixes.add(fix_key)

            if needs_alert:
                alerts_needed.append({
                    "error": match.group(0),
                    "description": description,
                    "source": source_name,
                })
            elif fix_cmd:
                print(f"[FIXING] {description}")
                result = run_cmd(fix_cmd)
                # run_cmd() exposes only text, so success is inferred from the
                # absence of "error"/"failed" in the command's output.
                lowered = result.lower()
                success = "error" not in lowered and "failed" not in lowered

                fixes_applied.append({
                    "description": description,
                    "command": fix_cmd,
                    "success": success,
                    "result": result[:200] if result else "OK",
                })

    return fixes_applied, alerts_needed
|
||||
|
||||
def main():
    """Run one monitoring pass and return the process exit status.

    Checks Redis, then scans the systemd journal, the cron logs and the
    OpenClaw session logs. For each source the errors found (first 2000
    characters) are added to the report and passed to scan_and_fix(). The
    report is written to REPORT_FILE and printed.

    Returns:
        1 when something needs human attention, meaning at least one alert
        was raised or at least one attempted auto-fix did not succeed;
        0 otherwise.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    report = [f"=== Log Monitor Report: {timestamp} ===\n"]

    all_fixes = []
    all_alerts = []

    # Service health
    print("Checking service health...")
    redis_status = check_redis()
    report.append(f"\n--- Service Health ---\n{redis_status}")

    # One row per log source:
    # (progress message, collector, source label, header when errors are
    #  found, report line when the source is clean)
    sources = (
        (
            "Checking systemd journal...",
            get_journal_errors,
            "journal",
            "\n--- Systemd Journal Errors ---\n",
            "\n--- Systemd Journal: No errors found ---",
        ),
        (
            "Checking cron logs...",
            get_cron_errors,
            "cron",
            "\n--- Cron Errors ---\n",
            "\n--- Cron Logs: No errors found ---",
        ),
        (
            "Checking OpenClaw logs...",
            get_openclaw_errors,
            "openclaw",
            "\n--- OpenClaw Errors ---\n",
            "\n--- OpenClaw Logs: No errors found ---",
        ),
    )
    for progress, collect, source_name, found_header, clean_line in sources:
        print(progress)
        errors = collect()
        if not errors:
            report.append(clean_line)
            continue
        # Only the report is truncated; the full text is scanned for fixes.
        report.append(f"{found_header}{errors[:2000]}")
        fixes, alerts = scan_and_fix(errors, source_name)
        all_fixes.extend(fixes)
        all_alerts.extend(alerts)

    # Summarize fixes
    report.append(f"\n\n=== FIXES APPLIED: {len(all_fixes)} ===")
    for fix in all_fixes:
        status = "✅" if fix["success"] else "❌"
        report.append(f"\n{status} {fix['description']}")
        report.append(f" Command: {fix['command']}")
        if not fix["success"]:
            report.append(f" Result: {fix['result']}")

    # Summarize alerts (need human attention)
    if all_alerts:
        report.append(f"\n\n=== ALERTS NEEDING ATTENTION: {len(all_alerts)} ===")
        for alert in all_alerts:
            report.append(f"\n⚠️ {alert['description']}")
            report.append(f" Source: {alert['source']}")
            report.append(f" Error: {alert['error'][:100]}")

    # Save report. It contains emoji, so the encoding is fixed rather than
    # left to the locale the cron job happens to run under.
    report_text = "\n".join(report)
    with open(REPORT_FILE, "w", encoding="utf-8") as f:
        f.write(report_text)

    # Print summary
    print(f"\n{report_text}")

    # A fix that was attempted but did not succeed is as unresolved as an
    # alert, so it also produces a non-zero status (for cron notification).
    failed_fixes = [fix for fix in all_fixes if not fix["success"]]
    needs_attention = len(all_alerts) + len(failed_fixes)
    if needs_attention:
        print(f"\n⚠️ {needs_attention} issue(s) need human attention")
        return 1

    print("\n✅ Log check complete. All issues resolved or no errors found.")
    return 0
|
||||
|
||||
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
|
||||
Reference in New Issue
Block a user