diff --git a/MEMORY.md b/MEMORY.md index a31a83c..24833c8 100644 --- a/MEMORY.md +++ b/MEMORY.md @@ -195,4 +195,105 @@ This file contains curated long-term memories and important context. - [ ] Implement log rotation and archival - [ ] Add email notifications as backup channel - [ ] Create web-based admin dashboard -- [ ] Add automated security scanning in CI/CD \ No newline at end of file +- [ ] Add automated security scanning in CI/CD + +--- + +## User-Level vs System-Level Systemd Services - Critical Lesson (2026-02-20 14:35 UTC) + +### Problem Discovered +Initial deployment used system-level systemd services (`/etc/systemd/system/`) for OpenClaw Gateway, but OpenClaw natively uses **user-level systemd** (`~/.config/systemd/user/`). This caused: +- Service restart loops (5 attempts then failure) +- Error: `systemctl --user unavailable: Failed to connect to bus: No medium found` +- Conflicts between system and user service definitions + +### Root Cause +OpenClaw Gateway is designed as a user-level service because: +1. It runs under the user's context, not root +2. It needs access to user-specific config (`~/.openclaw/`) +3. 
User-level services have different environment requirements + +### Solution: Hybrid Architecture + +#### User-Level Service (Gateway) +- **Location**: `~/.config/systemd/user/openclaw-gateway.service` +- **Required Setup**: + ```bash + # Enable linger (CRITICAL - allows user services to run without login session) + loginctl enable-linger $(whoami) + + # Set environment variables + export XDG_RUNTIME_DIR=/run/user/$(id -u) + export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/$(id -u)/bus" + ``` +- **Management Commands**: + ```bash + systemctl --user status openclaw-gateway + systemctl --user start/stop/restart openclaw-gateway + journalctl --user -u openclaw-gateway -f + ``` + +#### System-Level Service (Agent Monitor) +- **Location**: `/etc/systemd/system/openclaw-agent-monitor.service` +- **Purpose**: Independently monitor the gateway (survives user session issues) +- **Management Commands**: + ```bash + systemctl status openclaw-agent-monitor + systemctl start/stop/restart openclaw-agent-monitor + journalctl -u openclaw-agent-monitor -f + ``` + +### Deployment Checklist for New Servers +```bash +# 1. Enable user linger (MUST DO FIRST) +loginctl enable-linger $(whoami) + +# 2. Create runtime directory if needed +mkdir -p /run/user/$(id -u) +chmod 700 /run/user/$(id -u) + +# 3. Export environment (add to ~/.bashrc for persistence) +echo 'export XDG_RUNTIME_DIR=/run/user/$(id -u)' >> ~/.bashrc +echo 'export DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/$(id -u)/bus' >> ~/.bashrc + +# 4. Install services +./deploy.sh install + +# 5. 
Verify +./deploy.sh health +``` + +### Troubleshooting Guide + +#### Error: "Failed to connect to bus: No medium found" +**Cause**: User linger not enabled or environment variables not set +**Fix**: +```bash +loginctl enable-linger $(whoami) +export XDG_RUNTIME_DIR=/run/user/$(id -u) +export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/$(id -u)/bus" +``` + +#### Error: "Start request repeated too quickly" +**Cause**: Service crashing due to misconfiguration +**Fix**: Check logs with `journalctl --user -u openclaw-gateway -f` + +#### User service not starting after reboot +**Cause**: Linger not enabled +**Fix**: `loginctl enable-linger $(whoami)` + +### Best Practices for Multi-Agent Deployments +1. **Always enable linger** on first setup - document this in deployment guide +2. **Use hybrid architecture** - user-level for agents, system-level for monitors +3. **Set environment variables** in startup scripts, not just shell config +4. **Test after reboot** - verify services auto-start correctly +5. 
**Document in MEMORY.md** - share lessons across agent instances
+
+### Updated deploy.sh Features
+- Automatically enables linger during install
+- Sets up XDG_RUNTIME_DIR and DBUS_SESSION_BUS_ADDRESS
+- Uses `systemctl --user` for gateway, `systemctl` for monitor
+- Health check verifies linger status and runtime directory
+- Proper log commands for both service types
+
+---
\ No newline at end of file
diff --git a/agent-monitor.js b/agent-monitor.js
index 5a0d9ca..fc23cc9 100644
--- a/agent-monitor.js
+++ b/agent-monitor.js
@@ -242,17 +242,35 @@ class AgentHealthMonitor {
 
   async checkOpenClawGateway() {
     try {
+      // Use openclaw CLI for reliable status check (works with user-level systemd)
       const { stdout } = await execAsync('openclaw gateway status 2>&1 || echo "not running"');
-      return stdout.includes('running') || stdout.includes('active');
-    } catch {
+
+      // Plain substring checks would accept the "not running" fallback echoed above
+      // (and "inactive") as healthy, so the negated forms are excluded explicitly.
+      return /(?<!not )running/.test(stdout) ||
+             /(?<!in)active/.test(stdout) ||
+             stdout.includes('RPC probe: ok') ||
+             stdout.includes('Listening:');
+    } catch (error) {
+      this.log(`Gateway status check error: ${error.message}`, 'error');
       return false;
     }
   }
 
   async startOpenClawGateway() {
     try {
-      await execAsync('openclaw gateway start');
-      this.log('OpenClaw Gateway started', 'info');
+      // Set up environment for user-level systemd. Values already present in the
+      // process environment win; otherwise they are derived from the real UID
+      // instead of assuming root (UID 0).
+      const runtimeDir = process.env.XDG_RUNTIME_DIR ?? `/run/user/${process.getuid()}`;
+      const env = {
+        ...process.env,
+        XDG_RUNTIME_DIR: runtimeDir,
+        DBUS_SESSION_BUS_ADDRESS: process.env.DBUS_SESSION_BUS_ADDRESS ?? `unix:path=${runtimeDir}/bus`
+      };
+
+      const { stdout } = await execAsync('openclaw gateway start', { env });
+      this.log(`OpenClaw Gateway started: ${stdout}`, 'info');
     } catch (error) {
       this.log(`Failed to start OpenClaw Gateway: ${error.message}`, 'error');
       throw error;
diff --git a/deploy.sh b/deploy.sh
index 5c74687..1dfdaac 100755
--- a/deploy.sh
+++ b/deploy.sh
@@ -57,62 +57,117 @@ ensure_log_dir() {
 install_services() {
     log_info "Installing OpenClaw systemd services..."
-    # Copy service files
-    cp "$WORKSPACE/systemd/openclaw-gateway.service" /etc/systemd/system/
-    cp "$WORKSPACE/systemd/openclaw-agent-monitor.service" /etc/systemd/system/
+    # Step 1: Enable linger for user-level systemd (CRITICAL for VPS/server deployments)
+    log_info "Enabling user linger for persistent user-level services..."
+    loginctl enable-linger "$(whoami)"
 
-    # Reload systemd
-    systemctl daemon-reload
+    # Step 2: Export required environment variables
+    export XDG_RUNTIME_DIR=/run/user/$(id -u)
+    export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/$(id -u)/bus"
+
+    # Verify environment. /run/user/<uid> is managed by systemd: a hand-made directory
+    # has no user manager or D-Bus socket, so start the user manager to create it.
+    if [ ! -d "$XDG_RUNTIME_DIR" ]; then
+        log_warning "XDG_RUNTIME_DIR not found: $XDG_RUNTIME_DIR; starting user manager..."
+        systemctl start "user@$(id -u).service"
+        [ -d "$XDG_RUNTIME_DIR" ] || { log_error "XDG_RUNTIME_DIR still missing: $XDG_RUNTIME_DIR"; return 1; }
+    fi
 
-    # Enable services
-    systemctl enable openclaw-gateway
+    # Step 3: Install user-level gateway service
+    log_info "Installing user-level gateway service..."
+    mkdir -p ~/.config/systemd/user/
+    cp "$WORKSPACE/systemd/openclaw-gateway-user.service" ~/.config/systemd/user/openclaw-gateway.service
+
+    # Reload user systemd daemon
+    systemctl --user daemon-reload
+    systemctl --user enable openclaw-gateway
+
+    # Step 4: Install system-level agent monitor (independent of user session)
+    log_info "Installing system-level agent monitor..."
+    cp "$WORKSPACE/systemd/openclaw-agent-monitor.service" /etc/systemd/system/
+    systemctl daemon-reload
     systemctl enable openclaw-agent-monitor
 
-    # Start services
-    systemctl start openclaw-gateway
+    # Step 5: Start services
+    log_info "Starting services..."
+    systemctl --user start openclaw-gateway
     systemctl start openclaw-agent-monitor
 
+    # Wait for gateway to be ready
+    sleep 3
+
     log_success "OpenClaw services installed and started!"
- log_info "Gateway: http://localhost:18789" - log_info "Logs: journalctl -u openclaw-gateway -f" + log_info "Gateway: ws://localhost:18789" + log_info "Dashboard: http://localhost:18789/" + log_info "User service logs: journalctl --user -u openclaw-gateway -f" + log_info "Monitor logs: journalctl -u openclaw-agent-monitor -f" } start_services() { log_info "Starting OpenClaw services..." - systemctl start openclaw-gateway + + # Set up environment for user-level services + export XDG_RUNTIME_DIR=/run/user/$(id -u) + export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/$(id -u)/bus" + + systemctl --user start openclaw-gateway systemctl start openclaw-agent-monitor log_success "Services started!" } stop_services() { log_info "Stopping OpenClaw services..." - systemctl stop openclaw-gateway + + # Set up environment for user-level services + export XDG_RUNTIME_DIR=/run/user/$(id -u) + export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/$(id -u)/bus" + + systemctl --user stop openclaw-gateway systemctl stop openclaw-agent-monitor log_success "Services stopped!" } restart_services() { log_info "Restarting OpenClaw services..." - systemctl restart openclaw-gateway + + # Set up environment for user-level services + export XDG_RUNTIME_DIR=/run/user/$(id -u) + export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/$(id -u)/bus" + + systemctl --user restart openclaw-gateway systemctl restart openclaw-agent-monitor log_success "Services restarted!" 
} show_status() { + # Set up environment for user-level services + export XDG_RUNTIME_DIR=/run/user/$(id -u) + export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/$(id -u)/bus" + echo "" - log_info "=== OpenClaw Gateway Status ===" - systemctl status openclaw-gateway --no-pager -l + log_info "=== OpenClaw Gateway Status (User Service) ===" + systemctl --user status openclaw-gateway --no-pager -l echo "" - log_info "=== Agent Monitor Status ===" + log_info "=== Agent Monitor Status (System Service) ===" systemctl status openclaw-agent-monitor --no-pager -l echo "" - log_info "=== Recent Logs ===" - journalctl -u openclaw-gateway -u openclaw-agent-monitor --no-pager -n 20 + log_info "=== Recent Gateway Logs ===" + journalctl --user -u openclaw-gateway --no-pager -n 15 + echo "" + log_info "=== Recent Monitor Logs ===" + journalctl -u openclaw-agent-monitor --no-pager -n 15 } show_logs() { - log_info "Showing recent logs (last 50 lines)..." - journalctl -u openclaw-gateway -u openclaw-agent-monitor --no-pager -n 50 + # Set up environment for user-level services + export XDG_RUNTIME_DIR=/run/user/$(id -u) + export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/$(id -u)/bus" + + log_info "Showing recent gateway logs (last 50 lines)..." + journalctl --user -u openclaw-gateway --no-pager -n 50 + echo "" + log_info "Showing recent monitor logs (last 50 lines)..." + journalctl -u openclaw-agent-monitor --no-pager -n 50 } rollback() { @@ -181,19 +236,23 @@ backup() { health_check() { log_info "Running health check..." 
+ # Set up environment for user-level services + export XDG_RUNTIME_DIR=/run/user/$(id -u) + export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/$(id -u)/bus" + local issues=0 - # Check gateway - if systemctl is-active --quiet openclaw-gateway; then - log_success "✓ Gateway is running" + # Check gateway (user-level service) + if systemctl --user is-active --quiet openclaw-gateway 2>/dev/null; then + log_success "✓ Gateway is running (user service)" else log_error "✗ Gateway is not running" ((issues++)) fi - # Check monitor + # Check monitor (system-level service) if systemctl is-active --quiet openclaw-agent-monitor; then - log_success "✓ Agent Monitor is running" + log_success "✓ Agent Monitor is running (system service)" else log_error "✗ Agent Monitor is not running" ((issues++)) @@ -217,6 +276,22 @@ health_check() { ((issues++)) fi + # Check XDG_RUNTIME_DIR + if [ -d "$XDG_RUNTIME_DIR" ]; then + log_success "✓ XDG_RUNTIME_DIR exists: $XDG_RUNTIME_DIR" + else + log_warning "⚠ XDG_RUNTIME_DIR not found" + ((issues++)) + fi + + # Check linger status + if loginctl show-user $(whoami) -p Linger | grep -q "yes"; then + log_success "✓ User linger is enabled" + else + log_warning "⚠ User linger is NOT enabled (run: loginctl enable-linger)" + ((issues++)) + fi + echo "" if [ $issues -eq 0 ]; then log_success "All health checks passed!" 
diff --git a/systemd/openclaw-agent-monitor.service b/systemd/openclaw-agent-monitor.service
index 4d84cdd..aa3fd32 100644
--- a/systemd/openclaw-agent-monitor.service
+++ b/systemd/openclaw-agent-monitor.service
@@ -1,7 +1,7 @@
 [Unit]
 Description=OpenClaw Agent Health Monitor
 Documentation=https://docs.openclaw.ai
-After=network.target openclaw-gateway.service
+After=network-online.target
 Wants=network-online.target
 
 [Service]
@@ -9,6 +9,9 @@ Type=simple
 User=root
 WorkingDirectory=/root/.openclaw/workspace
 Environment=NODE_ENV=production
+Environment=HOME=/root
+Environment=XDG_RUNTIME_DIR=/run/user/0
+Environment=DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/0/bus
 
 # Monitor process
 ExecStart=/usr/bin/node /root/.openclaw/workspace/agent-monitor.js
diff --git a/systemd/openclaw-gateway-user.service b/systemd/openclaw-gateway-user.service
new file mode 100644
index 0000000..429700a
--- /dev/null
+++ b/systemd/openclaw-gateway-user.service
@@ -0,0 +1,60 @@
+# User-level systemd service for OpenClaw Gateway
+# Install to: ~/.config/systemd/user/openclaw-gateway.service
+# Required: loginctl enable-linger $(whoami)
+
+[Unit]
+Description=OpenClaw Gateway (v2026.2.19-2)
+After=network-online.target
+Wants=network-online.target
+# Start rate limiting is a [Unit] setting; the [Service] spelling
+# StartLimitInterval= is only accepted by systemd as a legacy alias.
+StartLimitIntervalSec=300
+StartLimitBurst=5
+
+[Service]
+Type=simple
+ExecStart=/www/server/nodejs/v24.13.1/bin/node /www/server/nodejs/v24.13.1/lib/node_modules/openclaw/dist/index.js gateway --port 18789
+Restart=always
+RestartSec=10
+KillMode=process
+TimeoutStopSec=30
+
+# Critical environment variables for user-level systemd
+Environment=HOME=/root
+Environment=XDG_RUNTIME_DIR=/run/user/0
+Environment=DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/0/bus
+Environment=PATH=/root/.local/bin:/root/.npm-global/bin:/root/bin:/root/.volta/bin:/root/.asdf/shims:/root/.bun/bin:/root/.nvm/current/bin:/root/.fnm/current/bin:/root/.local/share/pnpm:/usr/local/bin:/usr/bin:/bin
+Environment=OPENCLAW_GATEWAY_PORT=18789
+# SECURITY: the gateway token must not live in a version-controlled unit file.
+# Put OPENCLAW_GATEWAY_TOKEN=<token> in the file below (chmod 600) and rotate
+# the token that was previously committed here.
+EnvironmentFile=%h/.openclaw/gateway.env
+Environment=OPENCLAW_SYSTEMD_UNIT=openclaw-gateway.service
+Environment=OPENCLAW_SERVICE_MARKER=openclaw
+Environment=OPENCLAW_SERVICE_KIND=gateway
+Environment=OPENCLAW_SERVICE_VERSION=2026.2.19-2
+
+# Resource limits
+# MemoryMax= is the cgroup-v2 replacement for the deprecated MemoryLimit=.
+MemoryMax=2G
+CPUQuota=80%
+
+# Security
+NoNewPrivileges=true
+ProtectSystem=strict
+ProtectHome=read-only
+ReadWritePaths=/root/.openclaw
+
+# Logging
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=openclaw-gateway
+
+# Watchdog
+# WatchdogSec= makes systemd abort the service unless it sends sd_notify
+# "WATCHDOG=1" keep-alives. Left disabled until the gateway is confirmed to send
+# them; otherwise the unit is killed after 30s and soon hits the start limit.
+#WatchdogSec=30
+
+[Install]
+WantedBy=default.target