|
|
|
|
|
#!/usr/bin/env node
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * OpenClaw Agent Health Monitor & Auto-Healing System
 *
 * Features:
 * - Process crash detection and auto-restart
 * - Memory leak monitoring
 * - Service health checks
 * - Telegram notifications on events
 * - Comprehensive logging
 * - Systemd integration
 */
|
|
|
|
|
|
|
|
|
|
|
|
const fs = require('fs');
const path = require('path');
const { spawn } = require('child_process');
const { exec } = require('child_process');
const { execFile } = require('child_process');
const util = require('util');
const execAsync = util.promisify(exec);
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Watchdog for the OpenClaw gateway and "life" agent services.
 *
 * Polls both services on a fixed interval, restarts a service that stays
 * down past a grace period, rate-limits those restarts, and reports events to
 * a daily log file, Telegram, and the OpenClaw CLI. It can also supervise an
 * arbitrary child process through monitorProcess().
 */
class AgentHealthMonitor {
  /**
   * Loads the configuration, initialises bookkeeping state, makes sure the
   * log directory exists and installs the signal handlers.
   */
  constructor() {
    this.config = this.loadConfig();
    this.logDir = '/root/.openclaw/workspace/logs/agents';
    this.workspaceDir = '/root/.openclaw/workspace';
    this.processes = new Map(); // name -> ChildProcess started by monitorProcess()
    this.restartCounts = new Map(); // name -> timestamps (ms) of recent restarts
    this.maxRestarts = 5;
    this.restartWindow = 300000; // 5 minutes
    this.gracePeriod = 60000; // 60s grace period after first failure (upgrade tolerance)
    this.heartbeatInterval = 600000; // 10 minutes
    this.checkInterval = 30000; // period of the gateway/life poll
    this.commandTimeout = 60000; // upper bound for any spawned CLI command
    this.requestTimeout = 10000; // upper bound for one Telegram HTTP request
    this.notificationTarget = '5237946060'; // Telegram chat that receives alerts
    this.lastKnownState = { gateway: true, life: true };
    this.firstFailureTime = { gateway: 0, life: 0 };
    // Services whose "restart limit reached" alert was already sent for the
    // current outage, so the alert is not repeated on every poll.
    this.restartLimitNotified = new Set();

    this.ensureLogDir();
    this.setupSignalHandlers();
    this.log('Agent Health Monitor initialized', 'info');
  }

  /**
   * Reads /root/.openclaw/openclaw.json.
   *
   * @returns {object} The parsed configuration, or `{}` when the file is
   *   missing, unreadable, not valid JSON, or not a JSON object.
   */
  loadConfig() {
    const configPath = '/root/.openclaw/openclaw.json';
    try {
      if (fs.existsSync(configPath)) {
        const parsed = JSON.parse(fs.readFileSync(configPath, 'utf8'));
        // Later code dereferences this.config.channels, so anything that is
        // not a plain JSON object (null, array, scalar) is rejected here.
        if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
          return parsed;
        }
        console.error('Failed to load OpenClaw config:', 'top-level JSON value is not an object');
      }
    } catch (error) {
      console.error('Failed to load OpenClaw config:', error.message);
    }
    return {};
  }

  /** Creates the log directory (and missing parents) if it does not exist. */
  ensureLogDir() {
    if (!fs.existsSync(this.logDir)) {
      fs.mkdirSync(this.logDir, { recursive: true });
    }
  }

  /** Routes SIGTERM and SIGINT to gracefulShutdown(). */
  setupSignalHandlers() {
    process.on('SIGTERM', () => this.gracefulShutdown());
    process.on('SIGINT', () => this.gracefulShutdown());
  }

  /**
   * Sends SIGTERM to every process started by monitorProcess() and exits
   * with status 0.
   */
  async gracefulShutdown() {
    this.log('Graceful shutdown initiated', 'info');

    // Stop all monitored processes
    for (const [name, proc] of this.processes.entries()) {
      try {
        proc.kill('SIGTERM');
        this.log(`Stopped process: ${name}`, 'info');
      } catch (error) {
        this.log(`Error stopping ${name}: ${error.message}`, 'error');
      }
    }

    process.exit(0);
  }

  /**
   * Writes one log line to the console and appends it to the daily log file
   * `health-YYYY-MM-DD.log` inside `this.logDir`.
   *
   * A failed file write is reported on stderr and otherwise ignored: logging
   * is called from every code path and must never take the monitor down.
   *
   * @param {string} message - Text to log.
   * @param {string} [severity='info'] - Label printed in upper case.
   */
  log(message, severity = 'info') {
    // One timestamp for both the entry and the file name, so an entry written
    // around midnight lands in the file that matches its own date.
    const timestamp = new Date().toISOString();
    const logEntry = `[${timestamp}] [${String(severity).toUpperCase()}] ${message}\n`;

    // Console output
    console.log(logEntry.trim());

    // File logging
    const logFile = path.join(this.logDir, `health-${timestamp.split('T')[0]}.log`);
    try {
      fs.appendFileSync(logFile, logEntry);
    } catch (error) {
      console.error(`Failed to write log file ${logFile}: ${error.message}`);
    }
  }

  /**
   * Logs a message and forwards it to the notification channels.
   *
   * Telegram is used when `config.channels.telegram` is enabled and has a bot
   * token; the OpenClaw CLI is used in addition for 'critical' and 'error'.
   *
   * @param {string} message - Notification text.
   * @param {string} [severity='info'] - 'critical', 'error', 'warning' or 'info'.
   */
  async sendNotification(message, severity = 'info') {
    this.log(message, severity);

    // Send via Telegram if configured
    const telegramConfig = this.config.channels?.telegram;
    if (telegramConfig?.enabled && telegramConfig.botToken) {
      await this.sendTelegramNotification(message, severity);
    }

    // Also send via OpenClaw message tool if available
    if (severity === 'critical' || severity === 'error') {
      await this.sendOpenClawNotification(message, severity);
    }
  }

  /**
   * Escapes the characters that are special in Telegram's legacy "Markdown"
   * parse mode (`_`, `*`, backtick, `[`) by prefixing them with a backslash,
   * so arbitrary text (error messages, service names) is not parsed as
   * formatting.
   *
   * @param {*} text - Value to escape; converted with String().
   * @returns {string} The escaped text.
   */
  static escapeTelegramMarkdown(text) {
    return String(text).replace(/[_*`\[]/g, '\\$&');
  }

  /**
   * Posts the message to the Telegram Bot API `sendMessage` endpoint.
   * Does nothing without a bot token; failures are reported on stderr only.
   *
   * @param {string} message - Notification text (escaped before sending).
   * @param {string} severity - Selects the emoji prefix.
   */
  async sendTelegramNotification(message, severity) {
    const botToken = this.config?.channels?.telegram?.botToken;
    if (!botToken) {
      return;
    }

    const emojis = {
      critical: '🚨',
      error: '❌',
      warning: '⚠️',
      info: 'ℹ️',
    };
    const body = AgentHealthMonitor.escapeTelegramMarkdown(message);

    try {
      const url = `https://api.telegram.org/bot${botToken}/sendMessage`;
      const payload = {
        chat_id: this.notificationTarget,
        text: `${emojis[severity] || '📢'} *OpenClaw Alert* (${severity})\n\n${body}`,
        parse_mode: 'Markdown',
      };

      const response = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload),
        // Callers await this method from the polling loop, so a stalled HTTP
        // request must not be allowed to block it indefinitely.
        signal: AbortSignal.timeout(this.requestTimeout),
      });

      if (!response.ok) {
        throw new Error(`Telegram API error: ${response.status}`);
      }
    } catch (error) {
      console.error('Telegram notification error:', error.message);
    }
  }

  /**
   * Builds the argument vector for `openclaw message send`.
   *
   * The message is passed as its own argument, so no shell ever interprets
   * it. The literal backslash-n pairs are what the previous shell-based
   * command delivered to the CLI and are kept unchanged.
   *
   * @param {string} message - Notification text.
   * @param {string} severity - Severity label shown in the title.
   * @returns {string[]} Arguments for the `openclaw` executable.
   */
  buildOpenClawNotificationArgs(message, severity) {
    return [
      'message',
      'send',
      '--channel',
      'telegram',
      '--target',
      this.notificationTarget,
      '--message',
      `🚨 OpenClaw Service Alert (${severity})\\n\\n${message}`,
    ];
  }

  /**
   * Sends the message through the OpenClaw CLI. Failures are reported on
   * stderr only.
   *
   * @param {string} message - Notification text.
   * @param {string} severity - Severity label shown in the title.
   */
  async sendOpenClawNotification(message, severity) {
    try {
      // execFile (no shell): the message may contain quotes, `$` or backticks
      // coming from error output and must not be expanded or executed.
      const execFileAsync = util.promisify(execFile);
      await execFileAsync('openclaw', this.buildOpenClawNotificationArgs(message, severity), {
        timeout: this.commandTimeout,
      });
    } catch (error) {
      console.error('OpenClaw notification error:', error.message);
    }
  }

  /**
   * Sliding-window restart limiter.
   *
   * @param {string} processName - Key under which restarts are counted.
   * @returns {boolean} `true` when another restart is allowed (the restart is
   *   recorded as a side effect), `false` when `maxRestarts` restarts already
   *   happened within the last `restartWindow` milliseconds.
   */
  checkRestartLimit(processName) {
    const now = Date.now();
    const restarts = this.restartCounts.get(processName) || [];

    // Filter restarts within the window
    const recentRestarts = restarts.filter((time) => now - time < this.restartWindow);

    if (recentRestarts.length >= this.maxRestarts) {
      this.restartCounts.set(processName, recentRestarts); // drop expired entries
      return false; // Too many restarts
    }

    this.restartCounts.set(processName, [...recentRestarts, now]);
    return true;
  }

  /**
   * Spawns a child process and keeps it running.
   *
   * The child's stdout/stderr are copied to the log. When it exits with a
   * non-zero code or because of a signal it is restarted after 5 seconds,
   * subject to checkRestartLimit(). An optional health check is polled and a
   * failing check terminates the child with SIGTERM, which then goes through
   * the same restart path.
   *
   * @param {string} name - Label used in logs and as the `processes` key.
   * @param {string} command - Executable to run.
   * @param {string[]} [args=[]] - Arguments for the executable.
   * @param {object} [options={}]
   * @param {Function} [options.healthCheck] - Async function resolving to a
   *   truthy value while the process is healthy.
   * @param {number} [options.healthCheckInterval=30000] - Poll period in ms.
   * @param {object} [options.env={}] - Variables added on top of process.env.
   * @param {string} [options.cwd] - Working directory; defaults to the workspace.
   * @returns {Promise<void>} Resolves once the first instance has spawned.
   * @throws Rejects with the spawn error when the first instance cannot be started.
   */
  async monitorProcess(name, command, args = [], options = {}) {
    const {
      healthCheck,
      healthCheckInterval = 30000,
      env = {},
      cwd = this.workspaceDir,
    } = options;

    const startProcess = () =>
      new Promise((resolve, reject) => {
        const proc = spawn(command, args, {
          cwd,
          env: { ...process.env, ...env },
          stdio: ['ignore', 'pipe', 'pipe'],
        });
        this.processes.set(name, proc);

        proc.stdout.on('data', (data) => {
          this.log(`[${name}] ${data.toString().trim()}`, 'info');
        });

        proc.stderr.on('data', (data) => {
          this.log(`[${name}] ${data.toString().trim()}`, 'error');
        });

        // Settle only once the outcome is known: 'spawn' on success, 'error'
        // when the executable could not be started. Resolving right after
        // spawn() would make the reject() below unreachable.
        proc.once('spawn', () => resolve(proc));

        proc.on('error', async (error) => {
          if (proc.pid === undefined && this.processes.get(name) === proc) {
            this.processes.delete(name); // never started; nothing to stop later
          }
          this.log(`[${name}] Process error: ${error.message}`, 'critical');
          await this.sendNotification(`${name} failed to start: ${error.message}`, 'critical');
          reject(error); // no-op when the process had already spawned
        });

        proc.on('close', async (code, signal) => {
          // Only forget the entry if it still refers to this instance.
          if (this.processes.get(name) === proc) {
            this.processes.delete(name);
          }
          this.log(`[${name}] Process exited with code ${code}, signal ${signal}`, 'warning');

          // Auto-restart logic: a clean exit (code 0, no signal) is final.
          if (code === 0 && !signal) {
            return;
          }

          if (!this.checkRestartLimit(name)) {
            await this.sendNotification(
              `${name} crashed ${this.maxRestarts} times in ${this.restartWindow / 60000} minutes. Giving up.`,
              'critical'
            );
            return;
          }

          this.log(`[${name}] Auto-restarting...`, 'warning');
          await this.sendNotification(
            `${name} crashed (code: ${code}, signal: ${signal}). Restarting...`,
            'error'
          );
          setTimeout(() => {
            // The 'error' handler already reports the failure; the catch only
            // keeps the rejection from becoming an unhandled one.
            startProcess().catch((error) => {
              this.log(`[${name}] Restart failed: ${error.message}`, 'error');
            });
          }, 5000);
        });
      });

    // Start the process
    await startProcess();

    // Set up health checks
    if (healthCheck) {
      setInterval(async () => {
        try {
          const isHealthy = await healthCheck();
          if (!isHealthy) {
            await this.sendNotification(`${name} health check failed`, 'warning');

            // Restart unhealthy process: SIGTERM triggers the 'close' handler.
            const proc = this.processes.get(name);
            if (proc) {
              proc.kill('SIGTERM');
            }
          }
        } catch (error) {
          await this.sendNotification(`${name} health check error: ${error.message}`, 'error');
        }
      }, healthCheckInterval);
    }
  }

  /**
   * Environment for `systemctl --user` style commands: process.env plus the
   * runtime directory and session bus address of uid 0.
   *
   * @returns {object} Environment variables for exec().
   */
  systemdUserEnv() {
    return {
      ...process.env,
      XDG_RUNTIME_DIR: '/run/user/0',
      DBUS_SESSION_BUS_ADDRESS: 'unix:path=/run/user/0/bus',
    };
  }

  /**
   * Decides whether `openclaw gateway status` output describes a running
   * gateway.
   *
   * Matching is done on whole words after removing "not running": a plain
   * substring search would treat the failure sentinel "not running" (it
   * contains "running") and "inactive" (it contains "active") as healthy.
   *
   * @param {*} output - Combined stdout/stderr of the status command.
   * @returns {boolean} `true` when a positive marker is present.
   */
  static isGatewayStatusHealthy(output) {
    const text = String(output ?? '').replace(/\bnot\s+running\b/gi, ' ');
    return (
      /\brunning\b/.test(text) ||
      /\bactive\b/.test(text) ||
      text.includes('RPC probe: ok') ||
      text.includes('Listening:')
    );
  }

  /**
   * Runs `openclaw gateway status` and interprets its output.
   *
   * @returns {Promise<boolean>} `true` when the gateway looks healthy;
   *   `false` otherwise, including when the command fails or times out.
   */
  async checkOpenClawGateway() {
    try {
      const { stdout } = await execAsync('openclaw gateway status 2>&1 || echo "not running"', {
        timeout: this.commandTimeout,
      });
      return AgentHealthMonitor.isGatewayStatusHealthy(stdout);
    } catch (error) {
      this.log(`Gateway status check error: ${error.message}`, 'error');
      return false;
    }
  }

  /**
   * Asks systemd whether `openclaw-gateway-life.service` is active.
   *
   * @returns {Promise<boolean>} `true` only when the output is exactly
   *   "active"; `false` otherwise, including when the command fails.
   */
  async checkLifeAgent() {
    try {
      const { stdout } = await execAsync(
        'systemctl --user is-active openclaw-gateway-life.service 2>&1 || echo "inactive"',
        { env: this.systemdUserEnv(), timeout: this.commandTimeout }
      );
      return stdout.trim() === 'active';
    } catch (error) {
      this.log(`Life agent status check error: ${error.message}`, 'error');
      return false;
    }
  }

  /**
   * Starts the gateway with `openclaw gateway start`.
   *
   * @throws Rethrows the exec error after logging it.
   */
  async startOpenClawGateway() {
    try {
      const { stdout } = await execAsync('openclaw gateway start', {
        env: this.systemdUserEnv(),
        timeout: this.commandTimeout,
      });
      this.log(`OpenClaw Gateway started: ${stdout}`, 'info');
    } catch (error) {
      this.log(`Failed to start OpenClaw Gateway: ${error.message}`, 'error');
      throw error;
    }
  }

  /**
   * Starts the life agent with `systemctl --user start`.
   *
   * @throws Rethrows the exec error after logging it.
   */
  async startLifeAgent() {
    try {
      const { stdout } = await execAsync('systemctl --user start openclaw-gateway-life.service', {
        env: this.systemdUserEnv(),
        timeout: this.commandTimeout,
      });
      this.log(`Life agent started: ${stdout}`, 'info');
    } catch (error) {
      this.log(`Failed to start Life agent: ${error.message}`, 'error');
      throw error;
    }
  }

  /**
   * Records that a service check succeeded, logging a recovery message when
   * the service was previously considered down.
   *
   * @param {string} serviceName - Key in `lastKnownState` ('gateway' or 'life').
   * @param {string} label - Name used in the "<label> recovered" log line.
   */
  markServiceHealthy(serviceName, label) {
    if (!this.lastKnownState[serviceName]) {
      this.log(`${label} recovered`, 'info');
    }
    this.lastKnownState[serviceName] = true;
    this.firstFailureTime[serviceName] = 0;
    this.restartLimitNotified.delete(serviceName);
  }

  /**
   * Reacts to a failed service check.
   *
   * The first failure after a healthy state only starts the grace period.
   * Once the service has stayed down past the grace period a restart is
   * attempted, unless the restart limit is reached; in that case one critical
   * notification is sent per outage instead of one per poll.
   *
   * @param {string} serviceName - Key in `lastKnownState` ('gateway' or 'life').
   * @param {Function} startFn - Async function that starts the service.
   */
  async handleServiceDown(serviceName, startFn) {
    const now = Date.now();

    if (this.lastKnownState[serviceName]) {
      this.firstFailureTime[serviceName] = now;
      this.lastKnownState[serviceName] = false;
      this.restartLimitNotified.delete(serviceName); // a new outage begins
      this.log(
        `${serviceName} detected down, entering grace period (${this.gracePeriod / 1000}s)...`,
        'warning'
      );
      return;
    }

    if (now - this.firstFailureTime[serviceName] < this.gracePeriod) {
      return;
    }

    if (!this.checkRestartLimit(serviceName)) {
      if (!this.restartLimitNotified.has(serviceName)) {
        this.restartLimitNotified.add(serviceName);
        await this.sendNotification(
          `${serviceName} crashed ${this.maxRestarts} times in ${this.restartWindow / 60000} min. Auto-restart disabled until window resets.`,
          'critical'
        );
      }
      return;
    }
    this.restartLimitNotified.delete(serviceName);

    await this.sendNotification(`${serviceName} is down. Attempting restart...`, 'error');
    try {
      await startFn();
      this.lastKnownState[serviceName] = true;
      this.firstFailureTime[serviceName] = 0;
      await this.sendNotification(`${serviceName} restarted successfully`, 'info');
    } catch (error) {
      await this.sendNotification(`Failed to restart ${serviceName}: ${error.message}`, 'critical');
    }
  }

  /**
   * Starts the periodic gateway/life poll (every `checkInterval` ms) and
   * writes a heartbeat log line every `heartbeatInterval` ms.
   */
  async monitorOpenClawService() {
    this.log('Starting service monitoring (gateway + life)...', 'info');
    const ticksPerHeartbeat = this.heartbeatInterval / this.checkInterval;
    let heartbeatCounter = 0;
    let tickRunning = false;

    setInterval(async () => {
      // A tick awaits external commands; skip this one if the previous tick
      // has not finished, so slow commands cannot pile up.
      if (tickRunning) {
        return;
      }
      tickRunning = true;

      try {
        const [gatewayOk, lifeOk] = await Promise.all([
          this.checkOpenClawGateway(),
          this.checkLifeAgent(),
        ]);

        if (gatewayOk) {
          this.markServiceHealthy('gateway', 'Gateway');
        } else {
          await this.handleServiceDown('gateway', () => this.startOpenClawGateway());
        }

        if (lifeOk) {
          this.markServiceHealthy('life', 'Life agent');
        } else {
          await this.handleServiceDown('life', () => this.startLifeAgent());
        }

        heartbeatCounter++;
        if (heartbeatCounter >= ticksPerHeartbeat) {
          this.log(
            `Heartbeat: gateway=${gatewayOk ? 'OK' : 'DOWN'}, life=${lifeOk ? 'OK' : 'DOWN'}`,
            'info'
          );
          heartbeatCounter = 0;
        }
      } catch (error) {
        // An exception here would otherwise surface as an unhandled rejection.
        this.log(`Monitoring tick failed: ${error.message}`, 'error');
      } finally {
        tickRunning = false;
      }
    }, this.checkInterval);
  }

  /**
   * Runs one initial check of both services, seeds `lastKnownState` with the
   * results and starts the periodic monitoring.
   */
  async start() {
    this.log('Agent Health Monitor starting...', 'info');

    const gatewayOk = await this.checkOpenClawGateway();
    const lifeOk = await this.checkLifeAgent();
    this.log(
      `Initial check: gateway=${gatewayOk ? 'OK' : 'DOWN'}, life=${lifeOk ? 'OK' : 'DOWN'}`,
      'info'
    );
    this.lastKnownState.gateway = gatewayOk;
    this.lastKnownState.life = lifeOk;

    await this.monitorOpenClawService();
    this.log('Monitor is now active. Press Ctrl+C to stop.', 'info');
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
// Start the monitor. A rejected start() is printed and also recorded in the
// exit code, so a failed startup is not reported as a clean exit (status 0).
const monitor = new AgentHealthMonitor();
monitor.start().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|