#!/usr/bin/env node
/**
 * OpenClaw Agent Health Monitor & Auto-Healing System
 *
 * Features:
 * - Process crash detection and auto-restart
 * - Service health checks
 * - Telegram notifications on events
 * - Comprehensive logging
 *
 * NOTE(review): memory-leak monitoring and systemd integration were listed as
 * features of this script, but nothing in this file implements them.
 */

const fs = require('fs');
const path = require('path');
const { spawn, exec, execFile } = require('child_process');
const util = require('util');

const execAsync = util.promisify(exec);
const execFileAsync = util.promisify(execFile);

// Telegram chat that receives every alert.
const ALERT_CHAT_ID = '5237946060';
// Upper bound for a single notification attempt, so a stalled network call or
// CLI invocation cannot hold up the restart logic that awaits it.
const NOTIFY_TIMEOUT_MS = 10000;
// Delay between a crash and the restart attempt.
const RESTART_DELAY_MS = 5000;
// Polling period of the OpenClaw Gateway watchdog.
const GATEWAY_CHECK_INTERVAL_MS = 30000;

class AgentHealthMonitor {
  /**
   * Loads the OpenClaw config, prepares the log directory and installs the
   * SIGTERM/SIGINT handlers.
   */
  constructor() {
    this.config = this.loadConfig();
    this.logDir = '/root/.openclaw/workspace/logs/agents';
    this.workspaceDir = '/root/.openclaw/workspace';
    this.processes = new Map();
    this.restartCounts = new Map();
    this.maxRestarts = 5;
    this.restartWindow = 300000; // 5 minutes

    this.ensureLogDir();
    this.setupSignalHandlers();
    this.log('Agent Health Monitor initialized', 'info');
  }

  /**
   * Reads /root/.openclaw/openclaw.json.
   *
   * @returns {object} The parsed config, or `{}` when the file is missing or
   *   cannot be read/parsed (the failure is reported on stderr).
   */
  loadConfig() {
    try {
      const configPath = '/root/.openclaw/openclaw.json';
      if (fs.existsSync(configPath)) {
        return JSON.parse(fs.readFileSync(configPath, 'utf8'));
      }
    } catch (error) {
      console.error('Failed to load OpenClaw config:', error.message);
    }
    return {};
  }

  /** Creates the log directory (and any missing parents) if it does not exist. */
  ensureLogDir() {
    if (!fs.existsSync(this.logDir)) {
      fs.mkdirSync(this.logDir, { recursive: true });
    }
  }

  /** Routes SIGTERM and SIGINT to {@link AgentHealthMonitor#gracefulShutdown}. */
  setupSignalHandlers() {
    process.on('SIGTERM', () => this.gracefulShutdown());
    process.on('SIGINT', () => this.gracefulShutdown());
  }

  /**
   * Sends SIGTERM to every monitored child process and exits with code 0.
   * The exit is immediate; it does not wait for the children to terminate.
   */
  async gracefulShutdown() {
    this.log('Graceful shutdown initiated', 'info');

    // Stop all monitored processes
    for (const [name, proc] of this.processes.entries()) {
      try {
        proc.kill('SIGTERM');
        this.log(`Stopped process: ${name}`, 'info');
      } catch (error) {
        this.log(`Error stopping ${name}: ${error.message}`, 'error');
      }
    }

    process.exit(0);
  }

  /**
   * Writes one log line to stdout and appends it to the per-day log file
   * `health-YYYY-MM-DD.log` inside `this.logDir`.
   *
   * @param {string} message - Text to log.
   * @param {string} [severity='info'] - Severity label, upper-cased in the output.
   */
  log(message, severity = 'info') {
    const timestamp = new Date().toISOString();
    const logEntry = `[${timestamp}] [${severity.toUpperCase()}] ${message}\n`;

    // Console output
    console.log(logEntry.trim());

    // File logging. log() is called from timers and event handlers, so a disk
    // error here must not propagate and take the whole monitor down.
    const logFile = path.join(this.logDir, `health-${timestamp.split('T')[0]}.log`);
    try {
      fs.appendFileSync(logFile, logEntry);
    } catch (error) {
      console.error(`Failed to write log file ${logFile}: ${error.message}`);
    }
  }

  /**
   * Logs the message and forwards it to the configured notification channels.
   *
   * @param {string} message - Alert text.
   * @param {string} [severity='info'] - One of 'info', 'warning', 'error', 'critical'.
   */
  async sendNotification(message, severity = 'info') {
    this.log(message, severity);

    // Send via Telegram if configured
    const telegramConfig = this.config.channels?.telegram;
    if (telegramConfig?.enabled && telegramConfig.botToken) {
      await this.sendTelegramNotification(message, severity);
    }

    // Also send via OpenClaw message tool if available
    if (severity === 'critical' || severity === 'error') {
      await this.sendOpenClawNotification(message, severity);
    }
  }

  /**
   * Escapes the characters that Telegram's legacy Markdown mode treats as
   * entity delimiters, so arbitrary text (process names, error output) cannot
   * make the API reject the message as unparsable.
   *
   * @param {string} text - Raw text.
   * @returns {string} Text with `_`, `*`, `` ` `` and `[` backslash-escaped.
   */
  escapeTelegramMarkdown(text) {
    return String(text).replace(/[_*\[`]/g, '\\$&');
  }

  /**
   * Posts an alert to the Telegram Bot API. Failures are reported on stderr
   * and never thrown.
   *
   * @param {string} message - Alert text (escaped before sending).
   * @param {string} severity - Selects the emoji prefix.
   */
  async sendTelegramNotification(message, severity) {
    const botToken = this.config.channels?.telegram?.botToken;
    if (!botToken) {
      return;
    }

    try {
      const url = `https://api.telegram.org/bot${botToken}/sendMessage`;
      const emojis = {
        critical: '🚨',
        error: '❌',
        warning: '⚠️',
        info: 'ℹ️',
      };

      const payload = {
        chat_id: ALERT_CHAT_ID,
        text: `${emojis[severity] || '📢'} *OpenClaw Alert* (${severity})\n\n${this.escapeTelegramMarkdown(message)}`,
        parse_mode: 'Markdown',
      };

      const response = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload),
        signal: AbortSignal.timeout(NOTIFY_TIMEOUT_MS),
      });

      if (!response.ok) {
        throw new Error(`Telegram API error: ${response.status}`);
      }
    } catch (error) {
      console.error('Telegram notification error:', error.message);
    }
  }

  /**
   * Sends an alert through the `openclaw message send` CLI. Failures are
   * reported on stderr and never thrown.
   *
   * @param {string} message - Alert text.
   * @param {string} severity - Severity label included in the alert.
   */
  async sendOpenClawNotification(message, severity) {
    try {
      // execFile passes the text as a single argv entry without a shell, so
      // quotes, backticks or `$(...)` inside the message cannot alter or
      // inject commands.
      const text = `🚨 OpenClaw Service Alert (${severity})\\n\\n${message}`;
      await execFileAsync(
        'openclaw',
        ['message', 'send', '--channel', 'telegram', '--target', ALERT_CHAT_ID, '--message', text],
        { timeout: NOTIFY_TIMEOUT_MS },
      );
    } catch (error) {
      console.error('OpenClaw notification error:', error.message);
    }
  }

  /**
   * Records a restart attempt for `processName` unless `maxRestarts` attempts
   * already happened within the last `restartWindow` milliseconds.
   *
   * @param {string} processName - Key of the monitored process.
   * @returns {boolean} `true` if the restart is allowed (and has been recorded),
   *   `false` if the limit is reached.
   */
  checkRestartLimit(processName) {
    const now = Date.now();
    const restarts = this.restartCounts.get(processName) || [];

    // Filter restarts within the window
    const recentRestarts = restarts.filter((time) => now - time < this.restartWindow);

    if (recentRestarts.length >= this.maxRestarts) {
      return false; // Too many restarts
    }

    this.restartCounts.set(processName, [...recentRestarts, now]);
    return true;
  }

  /**
   * Spawns a child process, mirrors its output into the log, restarts it after
   * abnormal exits (subject to the restart limit) and optionally polls a
   * health check that kills the process when it reports unhealthy.
   *
   * @param {string} name - Key used in logs, notifications and `this.processes`.
   * @param {string} command - Executable to spawn.
   * @param {string[]} [args=[]] - Arguments for the executable.
   * @param {object} [options={}]
   * @param {Function} [options.healthCheck] - Async function resolving to a
   *   truthy value when the process is healthy.
   * @param {number} [options.healthCheckInterval=30000] - Poll period in ms.
   * @param {object} [options.env={}] - Extra environment variables.
   * @param {string} [options.cwd] - Working directory; defaults to the workspace.
   * @returns {Promise<void>}
   */
  async monitorProcess(name, command, args = [], options = {}) {
    const {
      healthCheck,
      healthCheckInterval = 30000,
      env = {},
      cwd = this.workspaceDir,
    } = options;

    // spawn() reports launch failures through the asynchronous 'error' event,
    // so there is nothing to await here: the child is registered right away
    // and failures are handled by the listeners below.
    const startProcess = () => {
      const proc = spawn(command, args, {
        cwd,
        env: { ...process.env, ...env },
        stdio: ['ignore', 'pipe', 'pipe'],
      });

      proc.stdout.on('data', (data) => {
        this.log(`[${name}] ${data.toString().trim()}`, 'info');
      });

      proc.stderr.on('data', (data) => {
        this.log(`[${name}] ${data.toString().trim()}`, 'error');
      });

      proc.on('error', async (error) => {
        this.log(`[${name}] Process error: ${error.message}`, 'critical');
        await this.sendNotification(`${name} failed to start: ${error.message}`, 'critical');
      });

      proc.on('close', async (code, signal) => {
        // Only unregister if the map still points at this child.
        if (this.processes.get(name) === proc) {
          this.processes.delete(name);
        }
        this.log(`[${name}] Process exited with code ${code}, signal ${signal}`, 'warning');

        // Clean exit: nothing to heal.
        if (code === 0 && !signal) {
          return;
        }

        // Auto-restart logic
        if (!this.checkRestartLimit(name)) {
          await this.sendNotification(
            `${name} crashed ${this.maxRestarts} times in ${this.restartWindow / 60000} minutes. Giving up.`,
            'critical',
          );
          return;
        }

        this.log(`[${name}] Auto-restarting...`, 'warning');
        await this.sendNotification(
          `${name} crashed (code: ${code}, signal: ${signal}). Restarting...`,
          'error',
        );
        setTimeout(async () => {
          try {
            startProcess();
          } catch (error) {
            // spawn() can throw synchronously (e.g. invalid arguments); inside
            // a timer that would otherwise be an uncaught exception.
            await this.sendNotification(`${name} failed to restart: ${error.message}`, 'critical');
          }
        }, RESTART_DELAY_MS);
      });

      this.processes.set(name, proc);
      return proc;
    };

    // Start the process
    startProcess();

    // Set up health checks
    if (healthCheck) {
      // Skip a tick while the previous check is still running so a slow check
      // cannot pile up and kill the process several times.
      let checkInFlight = false;
      setInterval(async () => {
        if (checkInFlight) {
          return;
        }
        checkInFlight = true;
        try {
          const isHealthy = await healthCheck();
          if (!isHealthy) {
            await this.sendNotification(`${name} health check failed`, 'warning');
            // Restart unhealthy process: the 'close' handler sees the signal
            // and schedules the restart.
            const proc = this.processes.get(name);
            if (proc) {
              proc.kill('SIGTERM');
            }
          }
        } catch (error) {
          await this.sendNotification(`${name} health check error: ${error.message}`, 'error');
        } finally {
          checkInFlight = false;
        }
      }, healthCheckInterval);
    }
  }

  /**
   * Decides whether `openclaw gateway status` output describes a running
   * gateway.
   *
   * A plain substring test is not enough: "not running" contains "running"
   * and "inactive" contains "active", so those negative phrases are removed
   * before looking for the positive keywords.
   *
   * @param {string} output - Combined stdout/stderr of the status command.
   * @returns {boolean} `true` if the output reports a running/active gateway.
   */
  isGatewayStatusHealthy(output) {
    const withoutNegatives = String(output).replace(/not\s+running|inactive/g, '');
    return withoutNegatives.includes('running') || withoutNegatives.includes('active');
  }

  /**
   * Runs `openclaw gateway status` and interprets its output.
   *
   * @returns {Promise<boolean>} `true` if the gateway is reported as running;
   *   `false` otherwise, including when the command cannot be executed.
   */
  async checkOpenClawGateway() {
    try {
      const { stdout } = await execAsync('openclaw gateway status 2>&1 || echo "not running"');
      return this.isGatewayStatusHealthy(stdout);
    } catch {
      return false;
    }
  }

  /**
   * Runs `openclaw gateway start`.
   *
   * @throws {Error} Re-throws the command failure after logging it.
   */
  async startOpenClawGateway() {
    try {
      await execAsync('openclaw gateway start');
      this.log('OpenClaw Gateway started', 'info');
    } catch (error) {
      this.log(`Failed to start OpenClaw Gateway: ${error.message}`, 'error');
      throw error;
    }
  }

  /**
   * Starts a 30-second watchdog that restarts the OpenClaw Gateway whenever
   * the status check reports it as down, notifying on each attempt.
   */
  async monitorOpenClawService() {
    this.log('Starting OpenClaw Gateway monitoring...', 'info');

    // A restart plus its notifications may take longer than one period; skip
    // ticks while one is in progress to avoid concurrent restarts and
    // duplicate alerts.
    let checkInFlight = false;

    // Check every 30 seconds
    setInterval(async () => {
      if (checkInFlight) {
        return;
      }
      checkInFlight = true;
      try {
        const isRunning = await this.checkOpenClawGateway();
        if (isRunning) {
          return;
        }

        this.log('OpenClaw Gateway is not running! Attempting to restart...', 'critical');
        await this.sendNotification('🚨 OpenClaw Gateway stopped unexpectedly. Restarting...', 'critical');

        try {
          await this.startOpenClawGateway();
          await this.sendNotification('✅ OpenClaw Gateway has been restarted successfully', 'info');
        } catch (error) {
          await this.sendNotification(`❌ Failed to restart OpenClaw Gateway: ${error.message}`, 'critical');
        }
      } finally {
        checkInFlight = false;
      }
    }, GATEWAY_CHECK_INTERVAL_MS);
  }

  /** Starts the gateway watchdog; the interval timer keeps the process alive. */
  async start() {
    this.log('Agent Health Monitor starting...', 'info');

    // Monitor OpenClaw Gateway service
    await this.monitorOpenClawService();

    // Keep the monitor running
    this.log('Monitor is now active. Press Ctrl+C to stop.', 'info');
  }
}

// Start the monitor
const monitor = new AgentHealthMonitor();
monitor.start().catch(console.error);