#!/usr/bin/env node /** * OpenClaw Agent Health Monitor & Auto-Healing System * * Features: * - Process crash detection and auto-restart * - Memory leak monitoring * - Service health checks * - Telegram notifications on events * - Comprehensive logging * - Systemd integration */ const fs = require('fs'); const path = require('path'); const { spawn } = require('child_process'); const { exec } = require('child_process'); const util = require('util'); const execAsync = util.promisify(exec); class AgentHealthMonitor { constructor() { this.config = this.loadConfig(); this.logDir = '/root/.openclaw/workspace/logs/agents'; this.workspaceDir = '/root/.openclaw/workspace'; this.processes = new Map(); this.restartCounts = new Map(); this.maxRestarts = 5; this.restartWindow = 300000; // 5 minutes this.gracePeriod = 60000; // 60s grace period after first failure (upgrade tolerance) this.heartbeatInterval = 600000; // 10 minutes this.services = this.loadMonitoredServices(); this.lastKnownState = {}; this.firstFailureTime = {}; for (const svc of this.services) { this.lastKnownState[svc.name] = true; this.firstFailureTime[svc.name] = 0; } this.ensureLogDir(); this.setupSignalHandlers(); this.log('Agent Health Monitor initialized', 'info'); } loadMonitoredServices() { return [ { name: 'gateway', type: 'local-cli', checkCmd: 'openclaw gateway status 2>&1 || echo "not running"', startCmd: 'openclaw gateway start', checkFn: (stdout) => stdout.includes('running') || stdout.includes('active') || stdout.includes('RPC probe: ok') || stdout.includes('Listening:'), }, { name: 'life', type: 'local-systemd', unit: 'openclaw-gateway-life.service', }, // To add a remote agent, use type: 'remote-http': // { // name: 'remote-agent', // type: 'remote-http', // healthUrl: 'http://100.115.94.X:18789/health', // timeout: 5000, // }, ]; } loadConfig() { try { const configPath = '/root/.openclaw/openclaw.json'; if (fs.existsSync(configPath)) { return JSON.parse(fs.readFileSync(configPath, 'utf8')); } } catch (error) { console.error('Failed to load OpenClaw config:', error.message); } return {}; } ensureLogDir() { if (!fs.existsSync(this.logDir)) { fs.mkdirSync(this.logDir, { recursive: true }); } } setupSignalHandlers() { process.on('SIGTERM', () => this.gracefulShutdown()); process.on('SIGINT', () => this.gracefulShutdown()); } async gracefulShutdown() { this.log('Graceful shutdown initiated', 'info'); // Stop all monitored processes for (const [name, proc] of this.processes.entries()) { try { proc.kill('SIGTERM'); this.log(`Stopped process: ${name}`, 'info'); } catch (error) { this.log(`Error stopping ${name}: ${error.message}`, 'error'); } } process.exit(0); } log(message, severity = 'info') { const timestamp = new Date().toISOString(); const logEntry = `[${timestamp}] [${severity.toUpperCase()}] ${message}\n`; // Console output console.log(logEntry.trim()); // File logging const logFile = path.join(this.logDir, `health-${new Date().toISOString().split('T')[0]}.log`); fs.appendFileSync(logFile, logEntry); } async sendNotification(message, severity = 'info') { this.log(message, severity); // Send via Telegram if configured const telegramConfig = this.config.channels?.telegram; if (telegramConfig?.enabled && telegramConfig.botToken) { await this.sendTelegramNotification(message, severity); } // Also send via OpenClaw message tool if available if (severity === 'critical' || severity === 'error') { await this.sendOpenClawNotification(message, severity); } } async sendTelegramNotification(message, severity) { const botToken = this.config.channels.telegram.botToken; const chatId = '5237946060'; if (!botToken) { return; } try { const url = `https://api.telegram.org/bot${botToken}/sendMessage`; const emojis = { critical: '🚨', error: '❌', warning: 'âš ī¸', info: 'â„šī¸' }; const payload = { chat_id: chatId, text: `${emojis[severity] || 'đŸ“ĸ'} *OpenClaw Alert* (${severity})\n\n${message}`, parse_mode: 'Markdown' }; const response = await fetch(url, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(payload) }); if (!response.ok) { throw new Error(`Telegram API error: ${response.status}`); } } catch (error) { console.error('Telegram notification error:', error.message); } } async sendOpenClawNotification(message, severity) { try { // Use OpenClaw's message tool via exec const cmd = `openclaw message send --channel telegram --target 5237946060 --message "🚨 OpenClaw Service Alert (${severity})\\n\\n${message}"`; await execAsync(cmd); } catch (error) { console.error('OpenClaw notification error:', error.message); } } checkRestartLimit(processName) { const now = Date.now(); const restarts = this.restartCounts.get(processName) || []; // Filter restarts within the window const recentRestarts = restarts.filter(time => now - time < this.restartWindow); if (recentRestarts.length >= this.maxRestarts) { return false; // Too many restarts } this.restartCounts.set(processName, [...recentRestarts, now]); return true; } async monitorProcess(name, command, args = [], options = {}) { const { healthCheck, healthCheckInterval = 30000, env = {}, cwd = this.workspaceDir } = options; const startProcess = () => { return new Promise((resolve, reject) => { const proc = spawn(command, args, { cwd, env: { ...process.env, ...env }, stdio: ['ignore', 'pipe', 'pipe'] }); proc.stdout.on('data', (data) => { this.log(`[${name}] ${data.toString().trim()}`, 'info'); }); proc.stderr.on('data', (data) => { this.log(`[${name}] ${data.toString().trim()}`, 'error'); }); proc.on('error', async (error) => { this.log(`[${name}] Process error: ${error.message}`, 'critical'); await this.sendNotification(`${name} failed to start: ${error.message}`, 'critical'); reject(error); }); proc.on('close', async (code, signal) => { this.processes.delete(name); this.log(`[${name}] Process exited with code ${code}, signal ${signal}`, 'warning'); // Auto-restart logic if (code !== 0 || signal) { if (this.checkRestartLimit(name)) { this.log(`[${name}] Auto-restarting...`, 'warning'); await this.sendNotification(`${name} crashed (code: ${code}, signal: ${signal}). Restarting...`, 'error'); setTimeout(() => startProcess(), 5000); } else { await this.sendNotification( `${name} crashed ${this.maxRestarts} times in ${this.restartWindow/60000} minutes. Giving up.`, 'critical' ); } } }); this.processes.set(name, proc); resolve(proc); }); }; // Start the process await startProcess(); // Set up health checks if (healthCheck) { setInterval(async () => { try { const isHealthy = await healthCheck(); if (!isHealthy) { await this.sendNotification(`${name} health check failed`, 'warning'); // Restart unhealthy process const proc = this.processes.get(name); if (proc) { proc.kill('SIGTERM'); } } } catch (error) { await this.sendNotification(`${name} health check error: ${error.message}`, 'error'); } }, healthCheckInterval); } } getUserEnv() { return { ...process.env, XDG_RUNTIME_DIR: '/run/user/0', DBUS_SESSION_BUS_ADDRESS: 'unix:path=/run/user/0/bus' }; } async checkService(svc) { try { if (svc.type === 'local-cli') { const { stdout } = await execAsync(svc.checkCmd, { env: this.getUserEnv() }); return svc.checkFn(stdout); } else if (svc.type === 'local-systemd') { const { stdout } = await execAsync( `systemctl --user is-active ${svc.unit} 2>&1 || echo "inactive"`, { env: this.getUserEnv() } ); return stdout.trim() === 'active'; } else if (svc.type === 'remote-http') { const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), svc.timeout || 5000); try { const resp = await fetch(svc.healthUrl, { signal: controller.signal }); clearTimeout(timer); return resp.ok; } catch { clearTimeout(timer); return false; } } return false; } catch (error) { this.log(`${svc.name} check error: ${error.message}`, 'error'); return false; } } async startService(svc) { const env = this.getUserEnv(); try { if (svc.type === 'local-cli') { const { stdout } = await execAsync(svc.startCmd, { env }); this.log(`${svc.name} started: ${stdout}`, 'info'); } else if (svc.type === 'local-systemd') { const { stdout } = await execAsync(`systemctl --user start ${svc.unit}`, { env }); this.log(`${svc.name} started: ${stdout}`, 'info'); } else if (svc.type === 'remote-http') { this.log(`${svc.name} is remote; cannot auto-start from this host`, 'warning'); throw new Error('Remote auto-start not supported'); } } catch (error) { this.log(`Failed to start ${svc.name}: ${error.message}`, 'error'); throw error; } } async handleServiceDown(serviceName, startFn) { const now = Date.now(); if (this.lastKnownState[serviceName]) { this.firstFailureTime[serviceName] = now; this.lastKnownState[serviceName] = false; this.log(`${serviceName} detected down, entering grace period (${this.gracePeriod / 1000}s)...`, 'warning'); return; } if (now - this.firstFailureTime[serviceName] < this.gracePeriod) { return; } if (!this.checkRestartLimit(serviceName)) { await this.sendNotification( `${serviceName} crashed ${this.maxRestarts} times in ${this.restartWindow / 60000} min. Auto-restart disabled until window resets.`, 'critical' ); return; } await this.sendNotification(`${serviceName} is down. Attempting restart...`, 'error'); try { await startFn(); this.lastKnownState[serviceName] = true; this.firstFailureTime[serviceName] = 0; await this.sendNotification(`${serviceName} restarted successfully`, 'info'); } catch (error) { await this.sendNotification(`Failed to restart ${serviceName}: ${error.message}`, 'critical'); } } async monitorOpenClawService() { const names = this.services.map(s => s.name).join(' + '); this.log(`Starting service monitoring (${names})...`, 'info'); let heartbeatCounter = 0; setInterval(async () => { const status = {}; for (const svc of this.services) { const ok = await this.checkService(svc); status[svc.name] = ok; if (ok) { if (!this.lastKnownState[svc.name]) { this.log(`${svc.name} recovered`, 'info'); } this.lastKnownState[svc.name] = true; this.firstFailureTime[svc.name] = 0; } else { await this.handleServiceDown(svc.name, () => this.startService(svc)); } } heartbeatCounter++; if (heartbeatCounter >= (this.heartbeatInterval / 30000)) { const summary = this.services.map(s => `${s.name}=${status[s.name] ? 'OK' : 'DOWN'}`).join(', '); this.log(`Heartbeat: ${summary}`, 'info'); heartbeatCounter = 0; } }, 30000); } async start() { this.log('Agent Health Monitor starting...', 'info'); for (const svc of this.services) { const ok = await this.checkService(svc); this.lastKnownState[svc.name] = ok; this.log(`Initial check: ${svc.name}=${ok ? 'OK' : 'DOWN'}`, 'info'); } await this.monitorOpenClawService(); this.log('Monitor is now active. Press Ctrl+C to stop.', 'info'); } } // Start the monitor const monitor = new AgentHealthMonitor(); monitor.start().catch(console.error);