You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

438 lines
14 KiB

#!/usr/bin/env node
/**
* OpenClaw Agent Health Monitor & Auto-Healing System
*
* Features:
* - Process crash detection and auto-restart
* - Service health checks (process/systemd liveness, or an HTTP health URL for remote services)
* - Telegram notifications on events
* - Comprehensive logging
* - Systemd integration
*
* LIMITATIONS (why an agent can misbehave without any error being reported or repaired):
* - Only checks liveness (e.g. gateway status, systemctl is-active, or an HTTP health URL).
* It does NOT verify that the agent can actually reply (e.g. API/Telegram/config issues).
* - First time a service is detected DOWN: a warning notification is sent and a 60s grace
* period starts during which no restart is attempted; the first check after the grace
* period attempts the restart and sends a further notification.
* - If the agent is unable to reply because of configuration (e.g. Telegram groupAllowFrom
* is empty), fix the config and restart the gateway; the monitor will not detect this as a failure.
*/
const fs = require('fs');
const path = require('path');
const { spawn, execSync } = require('child_process');
const { exec } = require('child_process');
const util = require('util');
const execAsync = util.promisify(exec);
// Root directory of the OpenClaw workspace on this host.
const WORKSPACE = '/root/.openclaw/workspace';
// Shell command prefix that runs the agents parser script; a sub-command
// (e.g. "services") is appended by the caller.
const PARSE_AGENTS = ['python3', `${WORKSPACE}/scripts/parse_agents.py`].join(' ');
/**
 * Periodically checks a list of services and tries to restart the ones that
 * are down, logging every event and sending notifications (Telegram Bot API
 * and the OpenClaw CLI).
 *
 * Service descriptors are plain objects with a `name` and a `type`:
 *   - `local-cli`:     { checkCmd, startCmd, checkFn(stdout) -> boolean }
 *   - `local-systemd`: { unit }
 *   - `remote-http`:   { healthUrl, timeout }
 */
class AgentHealthMonitor {
  constructor() {
    this.config = this.loadConfig();
    this.logDir = '/root/.openclaw/workspace/logs/agents';
    this.workspaceDir = '/root/.openclaw/workspace';
    // Single source of truth for values that were previously repeated inline.
    this.openclawBin = '/www/server/nodejs/v24.13.1/bin/openclaw';
    this.notifyTarget = '5237946060'; // Telegram chat id that receives alerts
    this.processes = new Map();
    this.restartCounts = new Map();
    // Services for which the "restart limit exceeded" alert was already sent.
    this.restartLimitAlerted = new Set();
    this.maxRestarts = 5;
    this.restartWindow = 300000; // 5 minutes
    this.gracePeriod = 60000; // 60s grace period after first failure (upgrade tolerance)
    this.checkInterval = 30000; // 30s between service check cycles
    this.heartbeatInterval = 600000; // 10 minutes
    this.ensureLogDir();
    this.services = this.loadMonitoredServices();
    this.lastKnownState = {};
    this.firstFailureTime = {};
    for (const svc of this.services) {
      this.lastKnownState[svc.name] = true;
      this.firstFailureTime[svc.name] = 0;
    }
    this.setupSignalHandlers();
    this.log('Agent Health Monitor initialized', 'info');
  }

  /**
   * Builds a service descriptor from one tab-separated line of parser output.
   * Columns: name, type, then type-specific fields.
   * @param {string} line - One line of `parse_agents.py services` output.
   * @returns {object} Service descriptor (only `{ name, type }` for unknown types).
   */
  parseServiceLine(line) {
    const parts = line.split('\t');
    const [name, type] = parts;
    if (type === 'local-cli') {
      const checkCmd = parts[2];
      const startCmd = parts[3];
      const pattern = parts[4];
      return {
        name, type, checkCmd, startCmd,
        checkFn: (stdout) => new RegExp(pattern).test(stdout),
      };
    }
    if (type === 'local-systemd') {
      return { name, type, unit: parts[2] };
    }
    if (type === 'remote-http') {
      return { name, type, healthUrl: parts[2], timeout: Number.parseInt(parts[3], 10) || 5000 };
    }
    return { name, type };
  }

  /**
   * Loads the monitored services by running the agents parser script.
   * Falls back to a single built-in "gateway" service if the script fails.
   * @returns {object[]} Service descriptors.
   */
  loadMonitoredServices() {
    try {
      const output = execSync(`${PARSE_AGENTS} services`, { encoding: 'utf8' }).trim();
      if (!output) return [];
      return output
        .split('\n')
        .map((line) => line.replace(/\r$/, ''))
        // Blank lines would otherwise become nameless services that are always "down".
        .filter((line) => line.trim() !== '')
        .map((line) => this.parseServiceLine(line));
    } catch (error) {
      this.log(`Failed to load agents.yaml: ${error.message}`, 'error');
      const ocBin = this.openclawBin;
      return [{
        name: 'gateway',
        type: 'local-cli',
        checkCmd: `${ocBin} gateway status 2>&1 || echo "not running"`,
        startCmd: `${ocBin} gateway start`,
        checkFn: (stdout) => /running|active|RPC probe: ok|Listening:/.test(stdout),
      }];
    }
  }

  /**
   * Reads the OpenClaw JSON config.
   * @returns {object} Parsed config, or `{}` when the file is missing or unreadable.
   */
  loadConfig() {
    try {
      const configPath = '/root/.openclaw/openclaw.json';
      if (fs.existsSync(configPath)) {
        return JSON.parse(fs.readFileSync(configPath, 'utf8'));
      }
    } catch (error) {
      console.error('Failed to load OpenClaw config:', error.message);
    }
    return {};
  }

  /** Creates the log directory if it does not exist yet. */
  ensureLogDir() {
    if (!fs.existsSync(this.logDir)) {
      fs.mkdirSync(this.logDir, { recursive: true });
    }
  }

  /** Registers SIGTERM/SIGINT handlers that trigger a graceful shutdown. */
  setupSignalHandlers() {
    process.on('SIGTERM', () => this.gracefulShutdown());
    process.on('SIGINT', () => this.gracefulShutdown());
  }

  /** Sends SIGTERM to every process started by monitorProcess(), then exits with code 0. */
  async gracefulShutdown() {
    this.log('Graceful shutdown initiated', 'info');
    // Stop all monitored processes
    for (const [name, proc] of this.processes.entries()) {
      try {
        proc.kill('SIGTERM');
        this.log(`Stopped process: ${name}`, 'info');
      } catch (error) {
        this.log(`Error stopping ${name}: ${error.message}`, 'error');
      }
    }
    process.exit(0);
  }

  /**
   * Writes a log line to the console and to the daily log file.
   * File logging is best-effort: a write failure is reported on stderr and
   * never thrown, because this method runs inside timers and event handlers.
   * @param {string} message - Text to log.
   * @param {string} [severity='info'] - Severity label.
   */
  log(message, severity = 'info') {
    // One timestamp for both the entry and the file name, so they cannot
    // disagree across a midnight boundary.
    const timestamp = new Date().toISOString();
    const logEntry = `[${timestamp}] [${severity.toUpperCase()}] ${message}\n`;
    // Console output
    console.log(logEntry.trim());
    // File logging
    const logFile = path.join(this.logDir, `health-${timestamp.split('T')[0]}.log`);
    try {
      fs.appendFileSync(logFile, logEntry);
    } catch (error) {
      console.error(`Failed to write log file ${logFile}: ${error.message}`);
    }
  }

  /**
   * Logs the message and forwards it to the notification channels:
   * Telegram when enabled in the config, plus the OpenClaw CLI for
   * 'critical' and 'error' severities.
   * @param {string} message - Notification text.
   * @param {string} [severity='info'] - 'info' | 'warning' | 'error' | 'critical'.
   */
  async sendNotification(message, severity = 'info') {
    this.log(message, severity);
    // Send via Telegram if configured
    const telegramConfig = this.config.channels?.telegram;
    if (telegramConfig?.enabled && telegramConfig.botToken) {
      await this.sendTelegramNotification(message, severity);
    }
    // Also send via OpenClaw message tool if available
    if (severity === 'critical' || severity === 'error') {
      await this.sendOpenClawNotification(message, severity);
    }
  }

  /**
   * Escapes the characters that are special in Telegram's legacy Markdown
   * parse mode, so arbitrary text (paths, error messages) cannot make the
   * API reject the message.
   * @param {string} text - Raw text.
   * @returns {string} Text with `_`, `*`, backtick and `[` backslash-escaped.
   */
  escapeTelegramMarkdown(text) {
    return String(text).replace(/[_*`\[]/g, '\\$&');
  }

  /**
   * Wraps a value in single quotes for safe use as one POSIX shell argument.
   * @param {string} value - Raw argument.
   * @returns {string} Quoted argument.
   */
  shellQuote(value) {
    return `'${String(value).replace(/'/g, "'\\''")}'`;
  }

  /**
   * Posts the message through the Telegram Bot API. Failures are reported on
   * stderr only; this method never throws.
   * @param {string} message - Notification text.
   * @param {string} severity - Severity label (selects the emoji).
   */
  async sendTelegramNotification(message, severity) {
    const botToken = this.config.channels?.telegram?.botToken;
    const chatId = this.notifyTarget;
    if (!botToken) {
      return;
    }
    try {
      const url = `https://api.telegram.org/bot${botToken}/sendMessage`;
      const emojis = {
        critical: '🚨',
        error: '❌',
        warning: '⚠',
        info: 'ℹ'
      };
      const payload = {
        chat_id: chatId,
        // Only the fixed header uses Markdown; the free-form message is escaped.
        text: `${emojis[severity] || '📢'} *OpenClaw Alert* (${severity})\n\n${this.escapeTelegramMarkdown(message)}`,
        parse_mode: 'Markdown'
      };
      const response = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload)
      });
      if (!response.ok) {
        throw new Error(`Telegram API error: ${response.status}`);
      }
    } catch (error) {
      console.error('Telegram notification error:', error.message);
    }
  }

  /**
   * Sends the message through the OpenClaw CLI. Failures are reported on
   * stderr only; this method never throws.
   * @param {string} message - Notification text.
   * @param {string} severity - Severity label.
   */
  async sendOpenClawNotification(message, severity) {
    try {
      // Use OpenClaw's message tool via exec. The text is single-quoted so
      // quotes, `$` or backticks in the message are not interpreted by the shell.
      const text = `🚨 OpenClaw Service Alert (${severity})\\n\\n${message}`;
      const cmd = `${this.openclawBin} message send --channel telegram --target ${this.notifyTarget} --message ${this.shellQuote(text)}`;
      await execAsync(cmd);
    } catch (error) {
      console.error('OpenClaw notification error:', error.message);
    }
  }

  /**
   * Records a restart attempt unless the limit is already reached.
   * @param {string} processName - Process or service name.
   * @returns {boolean} false when `maxRestarts` attempts already happened
   *   within `restartWindow`; true otherwise (and the attempt is recorded).
   */
  checkRestartLimit(processName) {
    const now = Date.now();
    const restarts = this.restartCounts.get(processName) || [];
    // Filter restarts within the window
    const recentRestarts = restarts.filter(time => now - time < this.restartWindow);
    if (recentRestarts.length >= this.maxRestarts) {
      return false; // Too many restarts
    }
    this.restartCounts.set(processName, [...recentRestarts, now]);
    return true;
  }

  /**
   * Spawns a child process, logs its output, and restarts it after a
   * non-zero exit or a signal (subject to the restart limit).
   * @param {string} name - Label used in logs and notifications.
   * @param {string} command - Executable to spawn.
   * @param {string[]} [args=[]] - Arguments for the executable.
   * @param {object} [options] - { healthCheck, healthCheckInterval, env, cwd }.
   *   `healthCheck` is an async function; a falsy result makes the monitor
   *   SIGTERM the process, which then goes through the restart path.
   */
  async monitorProcess(name, command, args = [], options = {}) {
    const {
      healthCheck,
      healthCheckInterval = 30000,
      env = {},
      cwd = this.workspaceDir
    } = options;
    const startProcess = () => {
      return new Promise((resolve, reject) => {
        const proc = spawn(command, args, {
          cwd,
          env: { ...process.env, ...env },
          stdio: ['ignore', 'pipe', 'pipe']
        });
        proc.stdout.on('data', (data) => {
          this.log(`[${name}] ${data.toString().trim()}`, 'info');
        });
        proc.stderr.on('data', (data) => {
          this.log(`[${name}] ${data.toString().trim()}`, 'error');
        });
        proc.on('error', async (error) => {
          this.log(`[${name}] Process error: ${error.message}`, 'critical');
          await this.sendNotification(`${name} failed to start: ${error.message}`, 'critical');
          reject(error);
        });
        proc.on('close', async (code, signal) => {
          this.processes.delete(name);
          this.log(`[${name}] Process exited with code ${code}, signal ${signal}`, 'warning');
          // Auto-restart logic
          if (code !== 0 || signal) {
            if (this.checkRestartLimit(name)) {
              this.log(`[${name}] Auto-restarting...`, 'warning');
              await this.sendNotification(`${name} crashed (code: ${code}, signal: ${signal}). Restarting...`, 'error');
              setTimeout(() => {
                // Nothing awaits this restart, so handle a rejection here
                // instead of leaving a floating promise.
                startProcess().catch((error) => {
                  this.log(`[${name}] Restart failed: ${error.message}`, 'critical');
                });
              }, 5000);
            } else {
              await this.sendNotification(
                `${name} crashed ${this.maxRestarts} times in ${this.restartWindow/60000} minutes. Giving up.`,
                'critical'
              );
            }
          }
        });
        this.processes.set(name, proc);
        resolve(proc);
      });
    };
    // Start the process
    await startProcess();
    // Set up health checks
    if (healthCheck) {
      setInterval(async () => {
        try {
          const isHealthy = await healthCheck();
          if (!isHealthy) {
            await this.sendNotification(`${name} health check failed`, 'warning');
            // Restart unhealthy process
            const proc = this.processes.get(name);
            if (proc) {
              proc.kill('SIGTERM');
            }
          }
        } catch (error) {
          await this.sendNotification(`${name} health check error: ${error.message}`, 'error');
        }
      }, healthCheckInterval);
    }
  }

  /**
   * @returns {object} Current environment plus the runtime-dir and D-Bus
   *   variables for uid 0, passed to the check/start commands.
   */
  getUserEnv() {
    return {
      ...process.env,
      XDG_RUNTIME_DIR: '/run/user/0',
      DBUS_SESSION_BUS_ADDRESS: 'unix:path=/run/user/0/bus'
    };
  }

  /**
   * Checks whether a service is up.
   * @param {object} svc - Service descriptor.
   * @returns {Promise<boolean>} true when up; false when down, when the check
   *   itself fails, or when the service type is unknown.
   */
  async checkService(svc) {
    try {
      if (svc.type === 'local-cli') {
        const { stdout } = await execAsync(svc.checkCmd, { env: this.getUserEnv() });
        return svc.checkFn(stdout);
      } else if (svc.type === 'local-systemd') {
        const { stdout } = await execAsync(
          `systemctl --user is-active ${svc.unit} 2>&1 || echo "inactive"`,
          { env: this.getUserEnv() }
        );
        return stdout.trim() === 'active';
      } else if (svc.type === 'remote-http') {
        const controller = new AbortController();
        const timer = setTimeout(() => controller.abort(), svc.timeout || 5000);
        try {
          const resp = await fetch(svc.healthUrl, { signal: controller.signal });
          return resp.ok;
        } catch {
          // Network error or timeout: treat as down.
          return false;
        } finally {
          clearTimeout(timer);
        }
      }
      return false;
    } catch (error) {
      this.log(`${svc.name} check error: ${error.message}`, 'error');
      return false;
    }
  }

  /**
   * Starts a service.
   * @param {object} svc - Service descriptor.
   * @throws {Error} When the start command fails, the service is remote, or
   *   the service type is not supported (so callers never report a restart
   *   that did not happen).
   */
  async startService(svc) {
    const env = this.getUserEnv();
    try {
      if (svc.type === 'local-cli') {
        const { stdout } = await execAsync(svc.startCmd, { env });
        this.log(`${svc.name} started: ${stdout}`, 'info');
      } else if (svc.type === 'local-systemd') {
        const { stdout } = await execAsync(`systemctl --user start ${svc.unit}`, { env });
        this.log(`${svc.name} started: ${stdout}`, 'info');
      } else if (svc.type === 'remote-http') {
        this.log(`${svc.name} is remote; cannot auto-start from this host`, 'warning');
        throw new Error('Remote auto-start not supported');
      } else {
        throw new Error(`Unsupported service type: ${svc.type}`);
      }
    } catch (error) {
      this.log(`Failed to start ${svc.name}: ${error.message}`, 'error');
      throw error;
    }
  }

  /**
   * Handles one "service is down" observation.
   * First observation: sends a warning and starts the grace period.
   * Within the grace period: does nothing.
   * Afterwards: attempts a restart, unless the restart limit is reached.
   * @param {string} serviceName - Service name.
   * @param {Function} startFn - Async function that starts the service.
   */
  async handleServiceDown(serviceName, startFn) {
    const now = Date.now();
    // First detection: record it, send a warning, and enter the grace period (no restart yet)
    if (this.lastKnownState[serviceName]) {
      this.firstFailureTime[serviceName] = now;
      this.lastKnownState[serviceName] = false;
      this.log(`${serviceName} detected down, entering grace period (${this.gracePeriod / 1000}s)...`, 'warning');
      await this.sendNotification(`${serviceName} is down (grace period ${this.gracePeriod / 1000}s before auto-restart).`, 'warning');
      return;
    }
    if (now - this.firstFailureTime[serviceName] < this.gracePeriod) {
      return;
    }
    if (!this.checkRestartLimit(serviceName)) {
      // Alert once per episode instead of on every check cycle.
      if (!this.restartLimitAlerted.has(serviceName)) {
        this.restartLimitAlerted.add(serviceName);
        await this.sendNotification(
          `${serviceName} crashed ${this.maxRestarts} times in ${this.restartWindow / 60000} min. Auto-restart disabled until window resets.`,
          'critical'
        );
      }
      return;
    }
    this.restartLimitAlerted.delete(serviceName);
    await this.sendNotification(`${serviceName} is down. Attempting restart...`, 'error');
    try {
      await startFn();
      this.lastKnownState[serviceName] = true;
      this.firstFailureTime[serviceName] = 0;
      await this.sendNotification(`${serviceName} restarted successfully`, 'info');
    } catch (error) {
      await this.sendNotification(`Failed to restart ${serviceName}: ${error.message}`, 'critical');
    }
  }

  /**
   * Runs one check cycle over all services, updating the tracked state and
   * triggering the down-handling for services that fail their check.
   * @returns {Promise<object>} Map of service name to boolean check result.
   */
  async runCheckCycle() {
    const status = {};
    for (const svc of this.services) {
      const ok = await this.checkService(svc);
      status[svc.name] = ok;
      if (ok) {
        if (!this.lastKnownState[svc.name]) {
          this.log(`${svc.name} recovered`, 'info');
        }
        this.lastKnownState[svc.name] = true;
        this.firstFailureTime[svc.name] = 0;
        this.restartLimitAlerted.delete(svc.name);
      } else {
        await this.handleServiceDown(svc.name, () => this.startService(svc));
      }
    }
    return status;
  }

  /**
   * Starts the periodic check loop (every `checkInterval` ms) and writes a
   * heartbeat log line every `heartbeatInterval` ms.
   */
  async monitorOpenClawService() {
    const names = this.services.map(s => s.name).join(' + ');
    this.log(`Starting service monitoring (${names})...`, 'info');
    let heartbeatCounter = 0;
    setInterval(async () => {
      try {
        const status = await this.runCheckCycle();
        heartbeatCounter++;
        if (heartbeatCounter >= (this.heartbeatInterval / this.checkInterval)) {
          const summary = this.services.map(s => `${s.name}=${status[s.name] ? 'OK' : 'DOWN'}`).join(', ');
          this.log(`Heartbeat: ${summary}`, 'info');
          heartbeatCounter = 0;
        }
      } catch (error) {
        // An exception escaping a timer callback would be an unhandled rejection.
        this.log(`Check cycle failed: ${error.message}`, 'error');
      }
    }, this.checkInterval);
  }

  /** Runs an initial check of every service, then starts the periodic monitoring. */
  async start() {
    this.log('Agent Health Monitor starting...', 'info');
    for (const svc of this.services) {
      const ok = await this.checkService(svc);
      this.lastKnownState[svc.name] = ok;
      this.log(`Initial check: ${svc.name}=${ok ? 'OK' : 'DOWN'}`, 'info');
    }
    await this.monitorOpenClawService();
    this.log('Monitor is now active. Press Ctrl+C to stop.', 'info');
  }
}
// Entry point: build the monitor and start it; a startup failure is printed to stderr.
const monitor = new AgentHealthMonitor();
monitor.start().catch((startupError) => {
  console.error(startupError);
});