|
|
#!/usr/bin/env node |
|
|
|
|
|
/**
 * OpenClaw Agent Health Monitor & Auto-Healing System
 *
 * Features:
 * - Process crash detection and auto-restart
 * - Memory leak monitoring
 * - Service health checks
 * - Telegram notifications on events
 * - Comprehensive logging
 * - Systemd integration
 */
|
|
|
|
|
const fs = require('fs'); |
|
|
const path = require('path'); |
|
|
const { spawn } = require('child_process'); |
|
|
const { exec } = require('child_process'); |
|
|
const util = require('util'); |
|
|
const execAsync = util.promisify(exec); |
|
|
|
|
|
/**
 * Polls a fixed list of OpenClaw services, restarts the ones that are down,
 * and reports events to a daily log file and to Telegram.
 *
 * State kept per service name:
 * - `lastKnownState`: result of the most recent health decision
 * - `firstFailureTime`: epoch ms when the current outage was first seen
 * - `restartCounts`: timestamps of recent restart attempts (rate limiting)
 */
class AgentHealthMonitor {
  /**
   * Loads configuration, initialises per-service state, creates the log
   * directory and installs SIGTERM/SIGINT handlers.
   */
  constructor() {
    this.config = this.loadConfig();
    this.logDir = '/root/.openclaw/workspace/logs/agents';
    this.workspaceDir = '/root/.openclaw/workspace';
    this.processes = new Map();
    this.restartCounts = new Map();
    this.maxRestarts = 5;
    this.restartWindow = 300000; // 5 minutes
    this.gracePeriod = 60000; // 60s grace period after first failure (upgrade tolerance)
    this.heartbeatInterval = 600000; // 10 minutes
    this.services = this.loadMonitoredServices();
    this.lastKnownState = {};
    this.firstFailureTime = {};
    for (const svc of this.services) {
      this.lastKnownState[svc.name] = true;
      this.firstFailureTime[svc.name] = 0;
    }

    this.ensureLogDir();
    this.setupSignalHandlers();
    this.log('Agent Health Monitor initialized', 'info');
  }

  /**
   * Returns the descriptors of the services to monitor.
   *
   * Supported `type` values (see checkService/startService):
   * - `local-cli`: `checkCmd` is run through the shell and its stdout is
   *   passed to `checkFn`; `startCmd` starts the service.
   * - `local-systemd`: `unit` is queried/started with `systemctl --user`.
   * - `remote-http`: `healthUrl` is fetched; cannot be auto-started.
   *
   * @returns {Array<object>} service descriptors
   */
  loadMonitoredServices() {
    // Whole-word matches that reject negated forms. A plain substring test
    // would treat the "not running" fallback echoed by checkCmd (and the word
    // "inactive") as healthy, so a stopped gateway would never be restarted.
    const runningMarker = /(?<!\bnot[\s-]+)\brunning\b/;
    const activeMarker = /(?<!\bnot[\s-]+)\bactive\b/;

    return [
      {
        name: 'gateway',
        type: 'local-cli',
        checkCmd: 'openclaw gateway status 2>&1 || echo "not running"',
        startCmd: 'openclaw gateway start',
        checkFn: (stdout) =>
          runningMarker.test(stdout) ||
          activeMarker.test(stdout) ||
          stdout.includes('RPC probe: ok') ||
          stdout.includes('Listening:'),
      },
      {
        name: 'life',
        type: 'local-systemd',
        unit: 'openclaw-gateway-life.service',
      },
      // To add a remote agent, use type: 'remote-http':
      // {
      //   name: 'remote-agent',
      //   type: 'remote-http',
      //   healthUrl: 'http://100.115.94.X:18789/health',
      //   timeout: 5000,
      // },
    ];
  }

  /**
   * Reads and parses the OpenClaw JSON config file.
   *
   * @returns {object} parsed config, or `{}` when the file is missing or
   *   cannot be read/parsed (the error is printed to stderr)
   */
  loadConfig() {
    try {
      const configPath = '/root/.openclaw/openclaw.json';
      if (fs.existsSync(configPath)) {
        return JSON.parse(fs.readFileSync(configPath, 'utf8'));
      }
    } catch (error) {
      console.error('Failed to load OpenClaw config:', error.message);
    }
    return {};
  }

  /** Creates the log directory (and parents) when it does not exist yet. */
  ensureLogDir() {
    if (!fs.existsSync(this.logDir)) {
      fs.mkdirSync(this.logDir, { recursive: true });
    }
  }

  /** Routes SIGTERM and SIGINT to gracefulShutdown(). */
  setupSignalHandlers() {
    process.on('SIGTERM', () => this.gracefulShutdown());
    process.on('SIGINT', () => this.gracefulShutdown());
  }

  /**
   * Sends SIGTERM to every child process registered in `this.processes`
   * and then exits this process with status 0.
   */
  async gracefulShutdown() {
    this.log('Graceful shutdown initiated', 'info');

    // Stop all monitored processes
    for (const [name, proc] of this.processes.entries()) {
      try {
        proc.kill('SIGTERM');
        this.log(`Stopped process: ${name}`, 'info');
      } catch (error) {
        this.log(`Error stopping ${name}: ${error.message}`, 'error');
      }
    }

    process.exit(0);
  }

  /**
   * Writes one log line to the console and appends it to the per-day file
   * `health-YYYY-MM-DD.log` inside `this.logDir`.
   *
   * A failure to write the file is reported on stderr and otherwise ignored:
   * this method is called from timers and event handlers, where a thrown
   * error would take the whole monitor down.
   *
   * @param {string} message - text to log
   * @param {string} [severity='info'] - label printed in upper case
   */
  log(message, severity = 'info') {
    // One timestamp for both the entry and the file name, so an entry written
    // at midnight cannot land in a file for a different day than it states.
    const timestamp = new Date().toISOString();
    const logEntry = `[${timestamp}] [${severity.toUpperCase()}] ${message}\n`;

    // Console output
    console.log(logEntry.trim());

    // File logging
    const logFile = path.join(this.logDir, `health-${timestamp.split('T')[0]}.log`);
    try {
      fs.appendFileSync(logFile, logEntry);
    } catch (error) {
      console.error('Failed to write log file:', error.message);
    }
  }

  /**
   * Logs the message and forwards it to the notification channels.
   *
   * Telegram is used when `config.channels.telegram` is enabled and has a
   * bot token. Messages of severity `critical` or `error` are additionally
   * sent through the `openclaw message send` CLI.
   *
   * @param {string} message - alert text
   * @param {string} [severity='info'] - severity label
   */
  async sendNotification(message, severity = 'info') {
    this.log(message, severity);

    // Send via Telegram if configured
    const telegramConfig = this.config.channels?.telegram;
    if (telegramConfig?.enabled && telegramConfig.botToken) {
      await this.sendTelegramNotification(message, severity);
    }

    // Also send via OpenClaw message tool if available
    if (severity === 'critical' || severity === 'error') {
      await this.sendOpenClawNotification(message, severity);
    }
  }

  /**
   * Escapes the characters that Telegram's legacy `Markdown` parse mode
   * treats as entity delimiters (`_`, `*`, backtick, `[`).
   *
   * Without this, a message containing e.g. an unpaired underscore is
   * rejected by the Bot API and the alert is lost.
   *
   * @param {string} text - raw message text
   * @returns {string} text that is safe to embed in a Markdown message
   */
  escapeTelegramMarkdown(text) {
    return String(text).replace(/[_*`\[]/g, '\\$&');
  }

  /**
   * Posts the alert to the Telegram Bot API `sendMessage` endpoint.
   * Does nothing when no bot token is configured. Errors (including the
   * 10-second timeout) are printed to stderr and not rethrown.
   *
   * @param {string} message - alert text
   * @param {string} severity - selects the emoji prefix
   */
  async sendTelegramNotification(message, severity) {
    const botToken = this.config.channels?.telegram?.botToken;
    const chatId = '5237946060';

    if (!botToken) {
      return;
    }

    // Abort a stalled request so a slow network cannot block the caller
    // (the monitoring loop awaits notifications).
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), 10000);

    try {
      const url = `https://api.telegram.org/bot${botToken}/sendMessage`;
      const emojis = {
        critical: '🚨',
        error: '❌',
        warning: '⚠️',
        info: 'ℹ️'
      };

      const payload = {
        chat_id: chatId,
        text: `${emojis[severity] || '📢'} *OpenClaw Alert* (${severity})\n\n${this.escapeTelegramMarkdown(message)}`,
        parse_mode: 'Markdown'
      };

      const response = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload),
        signal: controller.signal
      });

      if (!response.ok) {
        throw new Error(`Telegram API error: ${response.status}`);
      }
    } catch (error) {
      console.error('Telegram notification error:', error.message);
    } finally {
      clearTimeout(timer);
    }
  }

  /**
   * Sends the alert through the `openclaw message send` CLI.
   *
   * The CLI is started without a shell and the text is passed as a single
   * argv entry, so quotes, `$` or backticks inside `message` (common in
   * error messages) cannot break or inject into the command line.
   * Errors are printed to stderr and not rethrown.
   *
   * @param {string} message - alert text
   * @param {string} severity - severity label included in the text
   */
  async sendOpenClawNotification(message, severity) {
    try {
      // Use OpenClaw's message tool. The "\\n\\n" is a literal backslash-n
      // pair, exactly what the CLI received when this ran through a shell.
      const cliArgs = [
        'message', 'send',
        '--channel', 'telegram',
        '--target', '5237946060',
        '--message', `🚨 OpenClaw Service Alert (${severity})\\n\\n${message}`
      ];

      await new Promise((resolve, reject) => {
        const child = spawn('openclaw', cliArgs, { stdio: 'ignore' });
        child.once('error', reject);
        child.once('close', (code, signal) => {
          if (code === 0) {
            resolve();
          } else {
            reject(new Error(`openclaw message send exited with code ${code}, signal ${signal}`));
          }
        });
      });
    } catch (error) {
      console.error('OpenClaw notification error:', error.message);
    }
  }

  /**
   * Rate limiter for restarts. Returns whether another restart is allowed
   * and, if so, records the current time as a restart attempt.
   *
   * @param {string} processName - key in `this.restartCounts`
   * @returns {boolean} `false` when `maxRestarts` attempts already happened
   *   within the last `restartWindow` ms, otherwise `true`
   */
  checkRestartLimit(processName) {
    const now = Date.now();
    const restarts = this.restartCounts.get(processName) || [];

    // Filter restarts within the window
    const recentRestarts = restarts.filter((time) => now - time < this.restartWindow);

    if (recentRestarts.length >= this.maxRestarts) {
      return false; // Too many restarts
    }

    this.restartCounts.set(processName, [...recentRestarts, now]);
    return true;
  }

  /**
   * Spawns a child process, mirrors its output into the log, restarts it
   * after an abnormal exit (subject to checkRestartLimit) and optionally
   * runs a periodic health check that SIGTERMs the child when it fails.
   *
   * The returned promise settles as soon as spawn() has returned; a spawn
   * failure is reported through a notification, not through rejection.
   *
   * @param {string} name - key under which the child is tracked
   * @param {string} command - executable to spawn
   * @param {string[]} [args=[]] - arguments for the executable
   * @param {object} [options]
   * @param {Function} [options.healthCheck] - async function returning a
   *   truthy value while the process is healthy
   * @param {number} [options.healthCheckInterval=30000] - period in ms
   * @param {object} [options.env={}] - extra environment variables
   * @param {string} [options.cwd] - working directory (default: workspaceDir)
   */
  async monitorProcess(name, command, args = [], options = {}) {
    const {
      healthCheck,
      healthCheckInterval = 30000,
      env = {},
      cwd = this.workspaceDir
    } = options;

    const startProcess = () => {
      return new Promise((resolve) => {
        const proc = spawn(command, args, {
          cwd,
          env: { ...process.env, ...env },
          stdio: ['ignore', 'pipe', 'pipe']
        });

        proc.stdout.on('data', (data) => {
          this.log(`[${name}] ${data.toString().trim()}`, 'info');
        });

        proc.stderr.on('data', (data) => {
          this.log(`[${name}] ${data.toString().trim()}`, 'error');
        });

        // 'error' is emitted after this executor has already resolved, so a
        // failed spawn can only be reported, not turned into a rejection.
        proc.on('error', async (error) => {
          this.log(`[${name}] Process error: ${error.message}`, 'critical');
          await this.sendNotification(`${name} failed to start: ${error.message}`, 'critical');
        });

        proc.on('close', async (code, signal) => {
          this.processes.delete(name);
          this.log(`[${name}] Process exited with code ${code}, signal ${signal}`, 'warning');

          // Auto-restart logic: a clean exit (code 0, no signal) is final.
          if (code !== 0 || signal) {
            if (this.checkRestartLimit(name)) {
              this.log(`[${name}] Auto-restarting...`, 'warning');
              await this.sendNotification(`${name} crashed (code: ${code}, signal: ${signal}). Restarting...`, 'error');
              setTimeout(() => startProcess(), 5000);
            } else {
              await this.sendNotification(
                `${name} crashed ${this.maxRestarts} times in ${this.restartWindow/60000} minutes. Giving up.`,
                'critical'
              );
            }
          }
        });

        this.processes.set(name, proc);
        resolve(proc);
      });
    };

    // Start the process
    await startProcess();

    // Set up health checks
    if (healthCheck) {
      setInterval(async () => {
        try {
          const isHealthy = await healthCheck();
          if (!isHealthy) {
            await this.sendNotification(`${name} health check failed`, 'warning');

            // Restart unhealthy process: the SIGTERM makes 'close' fire with
            // a signal, which goes through the auto-restart path above.
            const proc = this.processes.get(name);
            if (proc) {
              proc.kill('SIGTERM');
            }
          }
        } catch (error) {
          await this.sendNotification(`${name} health check error: ${error.message}`, 'error');
        }
      }, healthCheckInterval);
    }
  }

  /**
   * Environment for child commands: the current environment plus the
   * runtime-dir and D-Bus session variables set for uid 0.
   *
   * @returns {object} environment variables
   */
  getUserEnv() {
    return {
      ...process.env,
      XDG_RUNTIME_DIR: '/run/user/0',
      DBUS_SESSION_BUS_ADDRESS: 'unix:path=/run/user/0/bus'
    };
  }

  /**
   * Runs the health check that matches the service type.
   *
   * @param {object} svc - descriptor from loadMonitoredServices()
   * @returns {Promise<boolean>} `true` when healthy; `false` when unhealthy,
   *   when the type is unknown, or when the check itself failed (logged)
   */
  async checkService(svc) {
    try {
      if (svc.type === 'local-cli') {
        const { stdout } = await execAsync(svc.checkCmd, { env: this.getUserEnv() });
        return svc.checkFn(stdout);
      }

      if (svc.type === 'local-systemd') {
        const { stdout } = await execAsync(
          `systemctl --user is-active ${svc.unit} 2>&1 || echo "inactive"`,
          { env: this.getUserEnv() }
        );
        return stdout.trim() === 'active';
      }

      if (svc.type === 'remote-http') {
        const controller = new AbortController();
        const timer = setTimeout(() => controller.abort(), svc.timeout || 5000);
        try {
          const resp = await fetch(svc.healthUrl, { signal: controller.signal });
          return resp.ok;
        } catch {
          // Network error or timeout: treated as "down", not as a check error.
          return false;
        } finally {
          clearTimeout(timer);
        }
      }

      return false;
    } catch (error) {
      this.log(`${svc.name} check error: ${error.message}`, 'error');
      return false;
    }
  }

  /**
   * Starts a service according to its type.
   *
   * @param {object} svc - descriptor from loadMonitoredServices()
   * @throws {Error} when the start command fails, or for `remote-http`
   *   services, which cannot be started from this host
   */
  async startService(svc) {
    const env = this.getUserEnv();
    try {
      if (svc.type === 'local-cli') {
        const { stdout } = await execAsync(svc.startCmd, { env });
        this.log(`${svc.name} started: ${stdout}`, 'info');
      } else if (svc.type === 'local-systemd') {
        const { stdout } = await execAsync(`systemctl --user start ${svc.unit}`, { env });
        this.log(`${svc.name} started: ${stdout}`, 'info');
      } else if (svc.type === 'remote-http') {
        this.log(`${svc.name} is remote; cannot auto-start from this host`, 'warning');
        throw new Error('Remote auto-start not supported');
      }
    } catch (error) {
      this.log(`Failed to start ${svc.name}: ${error.message}`, 'error');
      throw error;
    }
  }

  /**
   * Reacts to a failed health check.
   *
   * The first failure after a healthy state only records the time and
   * returns. Later failures are ignored until `gracePeriod` has elapsed;
   * after that a restart is attempted unless checkRestartLimit() refuses.
   *
   * @param {string} serviceName - service key in the state maps
   * @param {Function} startFn - async function that starts the service
   */
  async handleServiceDown(serviceName, startFn) {
    const now = Date.now();

    if (this.lastKnownState[serviceName]) {
      this.firstFailureTime[serviceName] = now;
      this.lastKnownState[serviceName] = false;
      this.log(`${serviceName} detected down, entering grace period (${this.gracePeriod / 1000}s)...`, 'warning');
      return;
    }

    if (now - this.firstFailureTime[serviceName] < this.gracePeriod) {
      return;
    }

    if (!this.checkRestartLimit(serviceName)) {
      await this.sendNotification(
        `${serviceName} crashed ${this.maxRestarts} times in ${this.restartWindow / 60000} min. Auto-restart disabled until window resets.`,
        'critical'
      );
      return;
    }

    await this.sendNotification(`${serviceName} is down. Attempting restart...`, 'error');
    try {
      await startFn();
      // Marked healthy without re-checking; the next poll verifies it.
      this.lastKnownState[serviceName] = true;
      this.firstFailureTime[serviceName] = 0;
      await this.sendNotification(`${serviceName} restarted successfully`, 'info');
    } catch (error) {
      await this.sendNotification(`Failed to restart ${serviceName}: ${error.message}`, 'critical');
    }
  }

  /**
   * Starts the polling loop: every 30 seconds each service is checked and
   * either marked healthy or handed to handleServiceDown(). A heartbeat
   * line with all statuses is logged once per `heartbeatInterval`.
   */
  async monitorOpenClawService() {
    const checkIntervalMs = 30000;
    const names = this.services.map((s) => s.name).join(' + ');
    this.log(`Starting service monitoring (${names})...`, 'info');
    let heartbeatCounter = 0;

    setInterval(async () => {
      const status = {};

      for (const svc of this.services) {
        const ok = await this.checkService(svc);
        status[svc.name] = ok;

        if (ok) {
          if (!this.lastKnownState[svc.name]) {
            this.log(`${svc.name} recovered`, 'info');
          }
          this.lastKnownState[svc.name] = true;
          this.firstFailureTime[svc.name] = 0;
        } else {
          await this.handleServiceDown(svc.name, () => this.startService(svc));
        }
      }

      heartbeatCounter++;
      if (heartbeatCounter >= (this.heartbeatInterval / checkIntervalMs)) {
        const summary = this.services.map((s) => `${s.name}=${status[s.name] ? 'OK' : 'DOWN'}`).join(', ');
        this.log(`Heartbeat: ${summary}`, 'info');
        heartbeatCounter = 0;
      }
    }, checkIntervalMs);
  }

  /**
   * Entry point: records the initial state of every service and then
   * starts the polling loop.
   */
  async start() {
    this.log('Agent Health Monitor starting...', 'info');

    for (const svc of this.services) {
      const ok = await this.checkService(svc);
      this.lastKnownState[svc.name] = ok;
      this.log(`Initial check: ${svc.name}=${ok ? 'OK' : 'DOWN'}`, 'info');
    }

    await this.monitorOpenClawService();
    this.log('Monitor is now active. Press Ctrl+C to stop.', 'info');
  }
}
|
|
|
|
|
// Start the monitor. A failed start is printed and also reflected in the
// exit code, so whatever launched this script can tell it from a clean stop.
const monitor = new AgentHealthMonitor();
monitor.start().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
|
|
|