#!/usr/bin/env node
/**
 * OpenClaw Agent Health Monitor & Auto-Healing System
 *
 * What this script does:
 * - Loads the monitored-service list by running `parse_agents.py services`
 *   (falls back to a single built-in "gateway" entry when that fails)
 * - Polls every service on a fixed interval (CLI command, systemd user unit,
 *   or HTTP health URL)
 * - Restarts local services that stay down past a grace period, rate limited
 *   to a maximum number of restarts per time window
 * - Can supervise a spawned child process and restart it after a crash
 *   (see `monitorProcess`)
 * - Sends Telegram notifications on events
 * - Logs to the console and to a per-day log file
 */

const fs = require('fs');
const path = require('path');
const { spawn, exec, execFile, execSync } = require('child_process');
const util = require('util');

const execAsync = util.promisify(exec);
const execFileAsync = util.promisify(execFile);

const WORKSPACE = '/root/.openclaw/workspace';
const PARSE_AGENTS = `python3 ${WORKSPACE}/scripts/parse_agents.py`;
const OPENCLAW_BIN = '/www/server/nodejs/v24.13.1/bin/openclaw';
const TELEGRAM_CHAT_ID = '5237946060';

class AgentHealthMonitor {
  /**
   * Loads configuration and the service list, prepares the log directory and
   * installs the SIGTERM/SIGINT handlers. Every service starts out assumed
   * healthy until `start()` performs the first real check.
   */
  constructor() {
    this.config = this.loadConfig();
    this.logDir = `${WORKSPACE}/logs/agents`;
    this.workspaceDir = WORKSPACE;
    this.processes = new Map();
    this.restartCounts = new Map();
    this.maxRestarts = 5;
    this.restartWindow = 300000; // 5 minutes
    this.gracePeriod = 60000; // 60s grace period after first failure (upgrade tolerance)
    this.heartbeatInterval = 600000; // 10 minutes
    this.checkInterval = 30000; // 30s between polling cycles
    this.checkTimeout = 20000; // upper bound for a single check command
    this.notifyTimeout = 10000; // upper bound for a single notification call
    this.checkInProgress = false;
    this.restartLimitNotified = {};

    // The log directory must exist before anything calls this.log().
    this.ensureLogDir();

    this.services = this.loadMonitoredServices();
    this.lastKnownState = {};
    this.firstFailureTime = {};
    for (const svc of this.services) {
      this.lastKnownState[svc.name] = true;
      this.firstFailureTime[svc.name] = 0;
    }

    this.setupSignalHandlers();
    this.log('Agent Health Monitor initialized', 'info');
  }

  /**
   * Builds the list of monitored services from the tab-separated output of
   * `parse_agents.py services` (one service per line).
   *
   * @returns {Array<object>} Service descriptors. Empty when the parser prints
   *   nothing; a single built-in gateway entry when the parser fails.
   */
  loadMonitoredServices() {
    try {
      const output = execSync(`${PARSE_AGENTS} services`, { encoding: 'utf8' }).trim();
      if (!output) {
        return [];
      }
      return output
        .split('\n')
        .filter((line) => line.trim() !== '')
        .map((line) => this.parseServiceLine(line));
    } catch (error) {
      this.log(`Failed to load agents.yaml: ${error.message}`, 'error');
      return [{
        name: 'gateway',
        type: 'local-cli',
        checkCmd: `${OPENCLAW_BIN} gateway status 2>&1 || echo "not running"`,
        startCmd: `${OPENCLAW_BIN} gateway start`,
        // The lookbehinds keep "not running" (echoed by checkCmd on failure)
        // and "inactive" from being mistaken for a healthy status.
        checkFn: (stdout) => /(?<!not )running|(?<!in)active|RPC probe: ok|Listening:/.test(stdout),
      }];
    }
  }

  /**
   * Converts one tab-separated parser line into a service descriptor.
   * Column 0 is the name, column 1 the type; the rest depend on the type.
   *
   * @param {string} line - One line of parser output.
   * @returns {object} Service descriptor.
   */
  parseServiceLine(line) {
    const parts = line.split('\t');
    const [name, type] = parts;

    switch (type) {
      case 'local-cli': {
        const [, , checkCmd, startCmd, pattern] = parts;
        return {
          name,
          type,
          checkCmd,
          startCmd,
          checkFn: this.buildPatternCheck(name, pattern),
        };
      }
      case 'local-systemd':
        return { name, type, unit: parts[2] };
      case 'remote-http':
        return {
          name,
          type,
          healthUrl: parts[2],
          // A missing, zero or non-numeric value falls back to 5000 ms.
          timeout: Number.parseInt(parts[3], 10) || 5000,
        };
      default:
        return { name, type };
    }
  }

  /**
   * Compiles a health pattern once and returns a predicate over command output.
   * A missing pattern compiles to an empty regex, which matches any output.
   *
   * @param {string} name - Service name, used only for the error log.
   * @param {string|undefined} pattern - Regular expression source.
   * @returns {(stdout: string) => boolean} Health predicate.
   */
  buildPatternCheck(name, pattern) {
    let regex;
    try {
      regex = new RegExp(pattern);
    } catch (error) {
      // An unusable pattern can never confirm health, so report unhealthy.
      this.log(`Invalid health pattern for ${name}: ${error.message}`, 'error');
      return () => false;
    }
    return (stdout) => regex.test(stdout);
  }

  /**
   * Reads the OpenClaw JSON config file.
   *
   * @returns {object} Parsed config, or `{}` when the file is missing or unreadable.
   */
  loadConfig() {
    try {
      const configPath = '/root/.openclaw/openclaw.json';
      if (fs.existsSync(configPath)) {
        return JSON.parse(fs.readFileSync(configPath, 'utf8'));
      }
    } catch (error) {
      console.error('Failed to load OpenClaw config:', error.message);
    }
    return {};
  }

  /** Creates the log directory (and any missing parents) if needed. */
  ensureLogDir() {
    if (!fs.existsSync(this.logDir)) {
      fs.mkdirSync(this.logDir, { recursive: true });
    }
  }

  /** Routes SIGTERM and SIGINT to `gracefulShutdown`. */
  setupSignalHandlers() {
    process.on('SIGTERM', () => this.gracefulShutdown());
    process.on('SIGINT', () => this.gracefulShutdown());
  }

  /**
   * Sends SIGTERM to every child process registered by `monitorProcess`, then
   * exits with status 0.
   */
  async gracefulShutdown() {
    this.log('Graceful shutdown initiated', 'info');

    // Stop all monitored processes
    for (const [name, proc] of this.processes.entries()) {
      try {
        proc.kill('SIGTERM');
        this.log(`Stopped process: ${name}`, 'info');
      } catch (error) {
        this.log(`Error stopping ${name}: ${error.message}`, 'error');
      }
    }

    process.exit(0);
  }

  /**
   * Writes a timestamped entry to the console and to today's log file.
   * A failure to write the file is reported on stderr and never thrown, so
   * logging cannot take the monitor down.
   *
   * @param {string} message - Text to log.
   * @param {string} [severity='info'] - Severity label (upper-cased in the entry).
   */
  log(message, severity = 'info') {
    const timestamp = new Date().toISOString();
    const logEntry = `[${timestamp}] [${severity.toUpperCase()}] ${message}\n`;

    // Console output
    console.log(logEntry.trim());

    // File logging: one file per UTC calendar day.
    const logFile = path.join(this.logDir, `health-${timestamp.split('T')[0]}.log`);
    try {
      fs.appendFileSync(logFile, logEntry);
    } catch (error) {
      console.error(`Failed to write log file ${logFile}: ${error.message}`);
    }
  }

  /**
   * Logs a message and forwards it to the notification channels.
   * Telegram is used when enabled in the config with a bot token; the
   * OpenClaw CLI is additionally used for `critical` and `error` severities.
   *
   * @param {string} message - Notification text.
   * @param {string} [severity='info'] - One of critical, error, warning, info.
   */
  async sendNotification(message, severity = 'info') {
    this.log(message, severity);

    // Send via Telegram if configured
    const telegramConfig = this.config.channels?.telegram;
    if (telegramConfig?.enabled && telegramConfig.botToken) {
      await this.sendTelegramNotification(message, severity);
    }

    // Also send via OpenClaw message tool if available
    if (severity === 'critical' || severity === 'error') {
      await this.sendOpenClawNotification(message, severity);
    }
  }

  /**
   * Posts a message to the Telegram Bot API. Failures are reported on stderr
   * and never thrown.
   *
   * @param {string} message - Notification text.
   * @param {string} severity - Selects the emoji prefix.
   */
  async sendTelegramNotification(message, severity) {
    const botToken = this.config.channels?.telegram?.botToken;
    if (!botToken) {
      return;
    }

    try {
      const url = `https://api.telegram.org/bot${botToken}/sendMessage`;
      const emojis = {
        critical: '🚨',
        error: '❌',
        warning: '⚠️',
        info: 'ℹ️',
      };

      // Legacy Markdown treats _ * ` [ as markup; unescaped occurrences in
      // arbitrary text (paths, error messages) can make the API reject the
      // whole message.
      const safeMessage = String(message).replace(/([_*`\[])/g, '\\$1');

      const payload = {
        chat_id: TELEGRAM_CHAT_ID,
        text: `${emojis[severity] || '📢'} *OpenClaw Alert* (${severity})\n\n${safeMessage}`,
        parse_mode: 'Markdown',
      };

      const response = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload),
        // Bound the request so a stalled connection cannot stall monitoring.
        signal: AbortSignal.timeout(this.notifyTimeout),
      });

      if (!response.ok) {
        throw new Error(`Telegram API error: ${response.status}`);
      }
    } catch (error) {
      console.error('Telegram notification error:', error.message);
    }
  }

  /**
   * Sends a message through the OpenClaw CLI (`openclaw message send`).
   * Failures are reported on stderr and never thrown.
   *
   * @param {string} message - Notification text.
   * @param {string} severity - Included in the message header.
   */
  async sendOpenClawNotification(message, severity) {
    try {
      // Arguments are passed as an array (no shell), so quotes, `$` or
      // backticks inside the message cannot alter the command.
      const text = `🚨 OpenClaw Service Alert (${severity})\\n\\n${message}`;
      await execFileAsync(
        OPENCLAW_BIN,
        ['message', 'send', '--channel', 'telegram', '--target', TELEGRAM_CHAT_ID, '--message', text],
        { timeout: this.notifyTimeout },
      );
    } catch (error) {
      console.error('OpenClaw notification error:', error.message);
    }
  }

  /**
   * Sliding-window restart limiter. When a restart is allowed, it is recorded
   * as happening now.
   *
   * @param {string} processName - Key identifying the process or service.
   * @returns {boolean} `true` when another restart is allowed, `false` when
   *   `maxRestarts` restarts already happened within `restartWindow`.
   */
  checkRestartLimit(processName) {
    const now = Date.now();
    const restarts = this.restartCounts.get(processName) || [];

    // Keep only restarts within the window
    const recentRestarts = restarts.filter((time) => now - time < this.restartWindow);

    if (recentRestarts.length >= this.maxRestarts) {
      this.restartCounts.set(processName, recentRestarts);
      return false; // Too many restarts
    }

    this.restartCounts.set(processName, [...recentRestarts, now]);
    return true;
  }

  /**
   * Spawns a child process, mirrors its output into the log and restarts it
   * (after 5 seconds) whenever it exits with a non-zero code or a signal,
   * subject to the restart limit.
   *
   * @param {string} name - Key under which the process is tracked.
   * @param {string} command - Executable to spawn.
   * @param {string[]} [args=[]] - Arguments for the executable.
   * @param {object} [options={}]
   * @param {Function} [options.healthCheck] - Called periodically; a falsy
   *   result sends SIGTERM to the process, which triggers the restart path.
   * @param {number} [options.healthCheckInterval=30000] - Milliseconds between health checks.
   * @param {object} [options.env={}] - Extra environment variables.
   * @param {string} [options.cwd] - Working directory (defaults to the workspace).
   */
  async monitorProcess(name, command, args = [], options = {}) {
    const {
      healthCheck,
      healthCheckInterval = 30000,
      env = {},
      cwd = this.workspaceDir,
    } = options;

    const logOutput = (severity) => (data) => {
      const text = data.toString().trim();
      if (text) {
        this.log(`[${name}] ${text}`, severity);
      }
    };

    const startProcess = () => {
      const proc = spawn(command, args, {
        cwd,
        env: { ...process.env, ...env },
        stdio: ['ignore', 'pipe', 'pipe'],
      });

      proc.stdout.on('data', logOutput('info'));
      proc.stderr.on('data', logOutput('error'));

      proc.on('error', async (error) => {
        this.log(`[${name}] Process error: ${error.message}`, 'critical');
        await this.sendNotification(`${name} failed to start: ${error.message}`, 'critical');
      });

      proc.on('close', async (code, signal) => {
        if (this.processes.get(name) === proc) {
          this.processes.delete(name);
        }
        this.log(`[${name}] Process exited with code ${code}, signal ${signal}`, 'warning');

        // A clean exit (code 0, no signal) is not restarted.
        if (code === 0 && !signal) {
          return;
        }

        if (this.checkRestartLimit(name)) {
          this.log(`[${name}] Auto-restarting...`, 'warning');
          await this.sendNotification(
            `${name} crashed (code: ${code}, signal: ${signal}). Restarting...`,
            'error',
          );
          setTimeout(startProcess, 5000);
        } else {
          await this.sendNotification(
            `${name} crashed ${this.maxRestarts} times in ${this.restartWindow / 60000} minutes. Giving up.`,
            'critical',
          );
        }
      });

      this.processes.set(name, proc);
      return proc;
    };

    // Start the process
    startProcess();

    // Set up health checks
    if (healthCheck) {
      setInterval(async () => {
        try {
          const isHealthy = await healthCheck();
          if (!isHealthy) {
            await this.sendNotification(`${name} health check failed`, 'warning');
            // Killing the process hands over to the 'close' restart logic.
            const proc = this.processes.get(name);
            if (proc) {
              proc.kill('SIGTERM');
            }
          }
        } catch (error) {
          await this.sendNotification(`${name} health check error: ${error.message}`, 'error');
        }
      }, healthCheckInterval);
    }
  }

  /**
   * @returns {object} The current environment plus the runtime-dir and D-Bus
   *   variables for uid 0, passed to the check and start commands.
   */
  getUserEnv() {
    return {
      ...process.env,
      XDG_RUNTIME_DIR: '/run/user/0',
      DBUS_SESSION_BUS_ADDRESS: 'unix:path=/run/user/0/bus',
    };
  }

  /**
   * Runs the health check matching the service type.
   *
   * @param {object} svc - Service descriptor from `loadMonitoredServices`.
   * @returns {Promise<boolean>} `true` when healthy; `false` when unhealthy,
   *   when the type is unknown, or when the check itself fails or times out.
   */
  async checkService(svc) {
    try {
      const execOptions = { env: this.getUserEnv(), timeout: this.checkTimeout };

      if (svc.type === 'local-cli') {
        const { stdout } = await execAsync(svc.checkCmd, execOptions);
        return svc.checkFn(stdout);
      }

      if (svc.type === 'local-systemd') {
        const { stdout } = await execAsync(
          `systemctl --user is-active ${svc.unit} 2>&1 || echo "inactive"`,
          execOptions,
        );
        return stdout.trim() === 'active';
      }

      if (svc.type === 'remote-http') {
        const controller = new AbortController();
        const timer = setTimeout(() => controller.abort(), svc.timeout || 5000);
        try {
          const resp = await fetch(svc.healthUrl, { signal: controller.signal });
          return resp.ok;
        } catch {
          // Network errors and timeouts both count as "down".
          return false;
        } finally {
          clearTimeout(timer);
        }
      }

      return false;
    } catch (error) {
      this.log(`${svc.name} check error: ${error.message}`, 'error');
      return false;
    }
  }

  /**
   * Starts a local service. Remote services cannot be started from this host.
   *
   * @param {object} svc - Service descriptor.
   * @throws {Error} When the start command fails or the service is remote.
   */
  async startService(svc) {
    const env = this.getUserEnv();
    try {
      if (svc.type === 'local-cli') {
        const { stdout } = await execAsync(svc.startCmd, { env });
        this.log(`${svc.name} started: ${stdout}`, 'info');
      } else if (svc.type === 'local-systemd') {
        const { stdout } = await execAsync(`systemctl --user start ${svc.unit}`, { env });
        this.log(`${svc.name} started: ${stdout}`, 'info');
      } else if (svc.type === 'remote-http') {
        this.log(`${svc.name} is remote; cannot auto-start from this host`, 'warning');
        throw new Error('Remote auto-start not supported');
      }
    } catch (error) {
      this.log(`Failed to start ${svc.name}: ${error.message}`, 'error');
      throw error;
    }
  }

  /**
   * Reacts to a failed health check. The first failure only opens a grace
   * period; once it has elapsed a restart is attempted, subject to the
   * restart limit.
   *
   * @param {string} serviceName - Name of the failing service.
   * @param {() => Promise<void>} startFn - Starts the service; rejects on failure.
   */
  async handleServiceDown(serviceName, startFn) {
    const now = Date.now();

    if (this.lastKnownState[serviceName]) {
      this.firstFailureTime[serviceName] = now;
      this.lastKnownState[serviceName] = false;
      this.log(
        `${serviceName} detected down, entering grace period (${this.gracePeriod / 1000}s)...`,
        'warning',
      );
      return;
    }

    if (now - this.firstFailureTime[serviceName] < this.gracePeriod) {
      return;
    }

    if (!this.checkRestartLimit(serviceName)) {
      // Alert once per limit episode instead of on every polling cycle.
      if (!this.restartLimitNotified[serviceName]) {
        this.restartLimitNotified[serviceName] = true;
        await this.sendNotification(
          `${serviceName} crashed ${this.maxRestarts} times in ${this.restartWindow / 60000} min. Auto-restart disabled until window resets.`,
          'critical',
        );
      }
      return;
    }
    this.restartLimitNotified[serviceName] = false;

    await this.sendNotification(`${serviceName} is down. Attempting restart...`, 'error');

    try {
      await startFn();
      this.lastKnownState[serviceName] = true;
      this.firstFailureTime[serviceName] = 0;
      await this.sendNotification(`${serviceName} restarted successfully`, 'info');
    } catch (error) {
      await this.sendNotification(`Failed to restart ${serviceName}: ${error.message}`, 'critical');
    }
  }

  /**
   * Checks every service once, in order, updating the tracked state and
   * triggering restart handling for services that are down.
   *
   * @returns {Promise<object>} Map of service name to health (`true` = OK).
   */
  async runCheckCycle() {
    const status = {};
    for (const svc of this.services) {
      const ok = await this.checkService(svc);
      status[svc.name] = ok;

      if (ok) {
        if (!this.lastKnownState[svc.name]) {
          this.log(`${svc.name} recovered`, 'info');
        }
        this.lastKnownState[svc.name] = true;
        this.firstFailureTime[svc.name] = 0;
      } else {
        await this.handleServiceDown(svc.name, () => this.startService(svc));
      }
    }
    return status;
  }

  /**
   * Starts the periodic polling loop. A cycle is skipped while the previous
   * one is still running, and a heartbeat summary is logged once every
   * `heartbeatInterval / checkInterval` completed cycles.
   */
  async monitorOpenClawService() {
    const names = this.services.map((s) => s.name).join(' + ');
    this.log(`Starting service monitoring (${names})...`, 'info');

    let heartbeatCounter = 0;

    this.monitorTimer = setInterval(async () => {
      if (this.checkInProgress) {
        return;
      }
      this.checkInProgress = true;

      try {
        const status = await this.runCheckCycle();

        heartbeatCounter++;
        if (heartbeatCounter >= this.heartbeatInterval / this.checkInterval) {
          const summary = this.services
            .map((s) => `${s.name}=${status[s.name] ? 'OK' : 'DOWN'}`)
            .join(', ');
          this.log(`Heartbeat: ${summary}`, 'info');
          heartbeatCounter = 0;
        }
      } catch (error) {
        // Nothing awaits this callback, so an escaping error would surface
        // as an unhandled rejection.
        this.log(`Monitoring cycle error: ${error.message}`, 'error');
      } finally {
        this.checkInProgress = false;
      }
    }, this.checkInterval);
  }

  /**
   * Runs one initial check per service to seed the tracked state, then starts
   * the polling loop.
   */
  async start() {
    this.log('Agent Health Monitor starting...', 'info');

    for (const svc of this.services) {
      const ok = await this.checkService(svc);
      this.lastKnownState[svc.name] = ok;
      this.log(`Initial check: ${svc.name}=${ok ? 'OK' : 'DOWN'}`, 'info');
    }

    await this.monitorOpenClawService();
    this.log('Monitor is now active. Press Ctrl+C to stop.', 'info');
  }
}

// Start the monitor
const monitor = new AgentHealthMonitor();
monitor.start().catch(console.error);