#!/bin/bash
###############################################################################
# OpenClaw System Deployment & Management Script
#
# Config-driven: reads agent list from agents.yaml via parse_agents.py
# No hardcoded agent references -- add/remove agents by editing agents.yaml.
#
# Usage:
#   ./deploy.sh install               - Install and start all services
#   ./deploy.sh start                 - Start all services
#   ./deploy.sh stop                  - Stop all services
#   ./deploy.sh restart               - Restart all services
#   ./deploy.sh status                - Show service status
#   ./deploy.sh logs                  - Show recent logs
#   ./deploy.sh health                - Run health check
#   ./deploy.sh rollback              - Rollback to previous git commit
#   ./deploy.sh rollback-to <hash>    - Rollback to a specific commit or tag
#   ./deploy.sh backup                - Full backup (workspace + Qdrant snapshot + agent profiles)
#   ./deploy.sh backup quick          - Quick backup (workspace files only, no Qdrant)
#   ./deploy.sh restore <dir>         - Restore workspace + config from backup directory
#   ./deploy.sh restore-qdrant <file> - Restore Qdrant collection from snapshot file
#   ./deploy.sh debug-stop            - Stop ALL services (including monitor) for debugging
#   ./deploy.sh debug-start           - Start ALL services after debugging
#   ./deploy.sh fix-service           - Re-inject EnvironmentFile after OpenClaw UI upgrade
###############################################################################

# Abort on any unhandled command failure. Commands whose failure is expected
# are guarded explicitly (if / || true) throughout this script.
set -e

WORKSPACE="/root/.openclaw/workspace"
LOG_DIR="/root/.openclaw/workspace/logs/system"
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
# Kept as a plain string and expanded unquoted on purpose so that it word-splits
# into "python3 <script>" at every call site.
PARSE_AGENTS="python3 $WORKSPACE/scripts/parse_agents.py"

# ANSI colour escapes; interpreted by `echo -e` in the log helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Logging helpers: print "$1" to stdout with a coloured severity tag.
log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }

# Create the system log directory if it does not exist yet.
ensure_log_dir() { mkdir -p "$LOG_DIR"; }

# Export the variables `systemctl --user` / `journalctl --user` need to reach
# the per-user systemd instance of the invoking user.
setup_user_env() {
  local uid
  uid=$(id -u)
  export XDG_RUNTIME_DIR="/run/user/${uid}"
  export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/${uid}/bus"
}

# Ask the operator for confirmation.
# Returns 0 only when the reply is a single "y" or "Y".
confirm_action() {
  local reply=""
  read -r -p "Are you sure? (y/N): " reply || true
  [[ $reply =~ ^[Yy]$ ]]
}

# Iterate over agents from agents.yaml and perform an action per type.
# Usage: for_each_agent <action>
#
# Each row of `$PARSE_AGENTS services` is read as tab-separated fields:
# id, type, and up to three type-specific fields.
#   local-cli     : field 3 = check command, field 4 = start command
#                   (the stop command is derived by replacing the first
#                   "start" in the start command with "stop")
#   local-systemd : field 3 = unit name
#   remote-http   : nothing is done locally except for `status`
# Actions: start | stop | restart | status | logs (plus enable | disable for
# local-systemd agents).
for_each_agent() {
  local action="$1"
  local aid atype f3 f4 f5
  local check_cmd start_cmd stop_cmd unit

  setup_user_env

  while IFS=$'\t' read -r aid atype f3 f4 f5; do
    case "$atype" in
      local-cli)
        check_cmd="$f3"
        start_cmd="$f4"
        stop_cmd="${start_cmd/start/stop}"
        # The eval'd commands get </dev/null so they cannot swallow the
        # remaining agent rows this loop is reading from stdin.
        # NOTE(review): eval runs whatever agents.yaml provides; that file
        # must stay trusted.
        case "$action" in
          start)
            if eval "$start_cmd" 2>/dev/null </dev/null; then
              log_info "Started $aid"
            else
              log_warning "$aid start failed"
            fi
            ;;
          stop)
            eval "$stop_cmd" 2>/dev/null </dev/null || true
            log_info "Stopped $aid"
            ;;
          restart)
            eval "$stop_cmd" 2>/dev/null </dev/null || true
            sleep 1
            if eval "$start_cmd" 2>/dev/null </dev/null; then
              log_info "Restarted $aid"
            else
              log_warning "$aid restart failed"
            fi
            ;;
          status)
            echo ""
            log_info "=== $aid (local-cli) ==="
            eval "$check_cmd" </dev/null || true
            ;;
          logs)
            log_info "=== $aid logs ==="
            journalctl --user -u openclaw-gateway --no-pager -n 50 2>/dev/null || true
            ;;
        esac
        ;;
      local-systemd)
        unit="$f3"
        case "$action" in
          start)
            if systemctl --user start "$unit" 2>/dev/null; then
              log_info "Started $aid ($unit)"
            else
              log_warning "$aid start failed"
            fi
            ;;
          stop)
            systemctl --user stop "$unit" 2>/dev/null || true
            log_info "Stopped $aid"
            ;;
          restart)
            if systemctl --user restart "$unit" 2>/dev/null; then
              log_info "Restarted $aid ($unit)"
            else
              log_warning "$aid restart failed"
            fi
            ;;
          enable)
            systemctl --user enable "$unit" 2>/dev/null
            ;;
          disable)
            systemctl --user disable "$unit" 2>/dev/null
            ;;
          status)
            echo ""
            log_info "=== $aid (systemd: $unit) ==="
            systemctl --user status "$unit" --no-pager -l 2>&1 || true
            ;;
          logs)
            log_info "=== $aid logs ==="
            journalctl --user -u "$unit" --no-pager -n 50 2>/dev/null || true
            ;;
        esac
        ;;
      remote-http)
        case "$action" in
          status)
            log_info "=== $aid (remote) ==="
            echo "  Remote agent -- check via health URL"
            ;;
          *)
            log_info "$aid is remote; skipping $action"
            ;;
        esac
        ;;
    esac
  done < <($PARSE_AGENTS services)
}

# Install the user-level gateway unit, any local-systemd agent units that have
# a template under $WORKSPACE/systemd, and the system-level agent monitor;
# then start everything.
install_services() {
  local aid atype f3 f4 f5
  local unit svc_template agent_names

  log_info "Installing OpenClaw systemd services..."

  # Linger keeps the user's systemd instance alive without a login session.
  loginctl enable-linger "$(whoami)"

  setup_user_env

  if [ ! -d "$XDG_RUNTIME_DIR" ]; then
    log_warning "Creating runtime directory..."
    mkdir -p "$XDG_RUNTIME_DIR"
    chmod 700 "$XDG_RUNTIME_DIR"
  fi

  mkdir -p "$HOME/.config/systemd/user/"

  # Install main gateway service
  cp "$WORKSPACE/systemd/openclaw-gateway-user.service" \
    "$HOME/.config/systemd/user/openclaw-gateway.service"

  # Install any local-systemd agents from agents.yaml
  while IFS=$'\t' read -r aid atype f3 f4 f5; do
    if [ "$atype" = "local-systemd" ]; then
      unit="$f3"
      svc_template="$WORKSPACE/systemd/$unit"
      if [ -f "$svc_template" ]; then
        cp "$svc_template" "$HOME/.config/systemd/user/$unit"
        systemctl --user enable "$unit" 2>/dev/null
        log_info "Installed $unit"
      fi
    fi
  done < <($PARSE_AGENTS services)

  systemctl --user daemon-reload
  systemctl --user enable openclaw-gateway

  # Install system-level agent monitor
  log_info "Installing system-level agent monitor..."
  cp "$WORKSPACE/systemd/openclaw-agent-monitor.service" /etc/systemd/system/
  systemctl daemon-reload
  systemctl enable openclaw-agent-monitor

  fix_service_files

  log_info "Starting services..."
  for_each_agent start
  systemctl start openclaw-agent-monitor

  sleep 3

  log_success "OpenClaw services installed and started!"
  # Purely informational: a failure to list ids must not fail the install.
  agent_names=$($PARSE_AGENTS ids) || true
  log_info "Active agents: $agent_names"
  log_info "Gateway logs: journalctl --user -u openclaw-gateway -f"
  log_info "Monitor logs: journalctl -u openclaw-agent-monitor -f"
}

# Start every registered agent, then the system-level monitor.
start_services() {
  log_info "Starting OpenClaw services..."
  for_each_agent start
  systemctl start openclaw-agent-monitor
  log_success "All services started"
}

# Stop every registered agent, then the system-level monitor.
stop_services() {
  log_info "Stopping OpenClaw services..."
  for_each_agent stop
  systemctl stop openclaw-agent-monitor
  log_success "All services stopped"
}

# Restart every registered agent, then the system-level monitor.
restart_services() {
  log_info "Restarting OpenClaw services..."
  for_each_agent restart
  systemctl restart openclaw-agent-monitor
  log_success "All services restarted"
}

# Stop the monitor first and then all agents, so the agents stay down while
# debugging (per the warning text, the monitor would otherwise restart the
# gateway).
debug_stop() {
  log_warning "=== DEBUG MODE: Stopping ALL services ==="
  log_warning "Monitor will NOT auto-restart gateway while in debug mode."
  log_warning "Run './deploy.sh debug-start' when done debugging."

  systemctl stop openclaw-agent-monitor 2>/dev/null || true
  for_each_agent stop

  log_success "All services stopped. Safe to debug."
  echo ""
  log_info "Useful debug commands:"
  log_info "  openclaw gateway start                         # start gateway in foreground"
  log_info "  journalctl --user -u openclaw-gateway -n 100"
}

# Leave debug mode: start agents and the monitor, then run the health check
# (whose non-zero status propagates to the caller).
debug_start() {
  log_info "=== Exiting DEBUG MODE: Restarting ALL services ==="
  for_each_agent start
  systemctl start openclaw-agent-monitor
  sleep 2
  log_success "All services restored. Monitor is active again."
  health_check
}

# Make sure every installed unit file of an agent that declares an ENV_FILE
# contains an `EnvironmentFile=` line pointing at $WORKSPACE/systemd/<ENV_FILE>.
# The line is appended right after the `[Service]` header when missing.
# Reloads the user systemd daemon if any file was changed.
fix_service_files() {
  local aid atype f3 f4 f5
  local ENV_FILE AGENT_TYPE
  local env_path svc_file
  local changed=0

  log_info "Ensuring EnvironmentFile= is present in installed service files..."
  setup_user_env

  while IFS=$'\t' read -r aid atype f3 f4 f5; do
    # Reset per agent: otherwise an agent without ENV_FILE would silently
    # inherit the value left behind by the previous iteration.
    ENV_FILE=""
    AGENT_TYPE=""
    # NOTE(review): eval of parse_agents.py output; it is expected to emit
    # shell-safe KEY=value lines -- confirm against parse_agents.py.
    eval "$($PARSE_AGENTS info "$aid" 2>/dev/null | grep -E '^(ENV_FILE|AGENT_TYPE)=')"

    if [ -z "$ENV_FILE" ]; then
      continue
    fi

    env_path="$WORKSPACE/systemd/$ENV_FILE"
    svc_file=""
    if [ "$AGENT_TYPE" = "local-cli" ]; then
      svc_file="$HOME/.config/systemd/user/openclaw-gateway.service"
    elif [ "$AGENT_TYPE" = "local-systemd" ]; then
      svc_file="$HOME/.config/systemd/user/$f3"
    fi

    if [ -n "$svc_file" ] && [ -f "$svc_file" ] && [ -f "$env_path" ]; then
      if ! grep -q "EnvironmentFile=.*${ENV_FILE}" "$svc_file" 2>/dev/null; then
        # Leading "-" makes systemd tolerate a missing environment file.
        sed -i "/^\[Service\]/a EnvironmentFile=-${env_path}" "$svc_file"
        log_info "Injected EnvironmentFile into $(basename "$svc_file")"
        changed=1
      else
        log_info "$(basename "$svc_file") already has EnvironmentFile"
      fi
    fi
  done < <($PARSE_AGENTS services)

  if [ "$changed" -eq 1 ]; then
    systemctl --user daemon-reload
    log_success "Service files updated. Run './deploy.sh restart' to apply."
  else
    log_success "All service files are up to date."
  fi
}

# Print the status of every agent followed by the system-level monitor.
show_status() {
  for_each_agent status
  echo ""
  log_info "=== Agent Monitor (System Service) ==="
  systemctl status openclaw-agent-monitor --no-pager -l 2>&1 || true
}

# Print the last 50 log lines of every agent and of the monitor.
show_logs() {
  setup_user_env
  for_each_agent logs
  echo ""
  log_info "=== Monitor logs (last 50 lines) ==="
  journalctl -u openclaw-agent-monitor --no-pager -n 50
}

# After confirmation: take a full backup, hard-reset the workspace to HEAD~1
# and restart all services.
rollback() {
  log_warning "This will rollback the workspace to the previous git commit!"
  if ! confirm_action; then
    log_info "Rollback cancelled."
    return
  fi

  cd "$WORKSPACE"
  backup
  log_info "Current commit:"
  git log -1 --oneline
  git reset --hard HEAD~1
  log_success "Rolled back to previous commit!"
  log_info "Restarting services to apply changes..."
  restart_services
}

# After confirmation: take a full backup, hard-reset the workspace to the
# given commit hash or tag ($1) and restart all services.
# Exits 1 when no commit is given.
rollback_to() {
  if [ -z "$1" ]; then
    log_error "Please specify a commit hash or tag"
    exit 1
  fi

  log_warning "This will rollback the workspace to commit: $1"
  if ! confirm_action; then
    log_info "Rollback cancelled."
    return
  fi

  cd "$WORKSPACE"
  backup
  git reset --hard "$1"
  log_success "Rolled back to commit: $1"
  restart_services
}

# Create a backup under /root/.openclaw/backups/$TIMESTAMP.
# Arguments: $1 - "full" (default, also used for an empty value) or any other
#                 value (e.g. "quick") to skip the Qdrant part.
# Always saved: workspace tarball (without .git and logs), every agent's
# openclaw.json, the mem0 docker-compose.yml (best effort) and a manifest.
# Full mode additionally requests a Qdrant snapshot of mem0_v4_shared and
# records the collection's point count. Qdrant problems only produce warnings.
# Afterwards only the 10 most recent backups are kept.
backup() {
  local mode="${1:-full}"
  local backup_dir="/root/.openclaw/backups/$TIMESTAMP"
  local profile rel

  mkdir -p "$backup_dir"
  log_info "Creating $mode backup -> $backup_dir"

  # --- Layer 1+2: workspace files ---
  log_info "Backing up workspace (Layer 1+2)..."
  tar -czf "$backup_dir/workspace.tar.gz" \
    --exclude='.git' \
    --exclude='logs' \
    -C /root/.openclaw workspace

  # --- Config: all agent openclaw.json profiles ---
  log_info "Backing up agent profiles..."
  for profile in /root/.openclaw/openclaw.json /root/.openclaw-*/openclaw.json; do
    if [ -f "$profile" ]; then
      # Flatten the path: /root/.openclaw/openclaw.json -> .openclaw__openclaw.json
      rel="${profile#/root/}"
      cp "$profile" "$backup_dir/${rel//\//__}" 2>/dev/null || true
    fi
  done

  # --- Config: docker-compose ---
  cp /opt/mem0-center/docker-compose.yml "$backup_dir/" 2>/dev/null || true

  if [ "$mode" = "full" ]; then
    # --- Layer 4: Qdrant snapshot ---
    log_info "Creating Qdrant snapshot (mem0_v4_shared)..."
    local snap_response snap_name snap_src
    # The request is tested inside `if` so that, under `set -e`, an
    # unreachable Qdrant yields the warning below instead of aborting.
    if snap_response=$(curl -sf -X POST "http://localhost:6333/collections/mem0_v4_shared/snapshots" 2>/dev/null) \
      && [ -n "$snap_response" ]; then
      snap_name=$(printf '%s\n' "$snap_response" \
        | python3 -c "import sys,json; print(json.load(sys.stdin).get('result',{}).get('name',''))" 2>/dev/null) \
        || snap_name=""
      if [ -n "$snap_name" ]; then
        snap_src="/opt/mem0-center/snapshots/mem0_v4_shared/$snap_name"
        if [ -f "$snap_src" ]; then
          cp "$snap_src" "$backup_dir/qdrant-mem0_v4_shared.snapshot"
          log_success "Qdrant snapshot saved: $snap_name"
        else
          log_warning "Snapshot file not found at $snap_src"
        fi
      else
        log_warning "Could not parse snapshot name from response"
      fi
    else
      log_warning "Qdrant snapshot failed (is Qdrant running?)"
    fi

    # --- Layer 4: pre-backup memory count ---
    local mem_count
    mem_count=$(curl -sf "http://localhost:6333/collections/mem0_v4_shared" 2>/dev/null \
      | python3 -c "import sys,json; print(json.load(sys.stdin).get('result',{}).get('points_count',0))" 2>/dev/null \
      || echo "unknown")
    echo "$mem_count" > "$backup_dir/qdrant-point-count.txt"
    log_info "Qdrant point count: $mem_count"
  fi

  # --- Manifest ---
  # NOTE(review): the header fields above "Contents:" were reconstructed;
  # confirm them against a manifest written by an earlier version.
  cat > "$backup_dir/manifest.txt" <<EOF
OpenClaw Backup Manifest
Timestamp: $TIMESTAMP
Mode: $mode
Git commit: $(cd "$WORKSPACE" && git log -1 --oneline 2>/dev/null || echo "unknown")

Contents:
  workspace.tar.gz               - Layer 1+2 workspace files
  .openclaw__openclaw.json       - main agent profile
  docker-compose.yml             - Qdrant docker config
EOF
  if [ "$mode" = "full" ]; then
    echo "  qdrant-mem0_v4_shared.snapshot - Layer 4 vector data" >> "$backup_dir/manifest.txt"
  fi

  log_success "Backup complete: $backup_dir"

  # --- Retention: keep last 10 backups ---
  # Backup directories are named by timestamp, so glob order is oldest first.
  local parent="/root/.openclaw/backups"
  local -a existing=("$parent"/[0-9]*)
  local count=${#existing[@]}
  local old
  if [ "$count" -gt 10 ]; then
    for old in "${existing[@]:0:count-10}"; do
      rm -rf -- "$old"
      log_info "Pruned old backup: $(basename "$old")"
    done
  fi
}

# Restore the workspace tarball and agent profiles from a backup directory.
# Arguments: $1 - backup directory (as created by `backup`).
# Exits 1 (after listing available backups) when $1 is missing or not a
# directory. Takes a quick backup of the current state before overwriting.
restore_workspace() {
  local restore_dir="$1"
  local candidate profile base target

  if [ -z "$restore_dir" ] || [ ! -d "$restore_dir" ]; then
    log_error "Usage: $0 restore <backup_dir>"
    log_info "Available backups:"
    for candidate in /root/.openclaw/backups/[0-9]*; do
      [ -e "$candidate" ] || continue
      echo "  $candidate"
    done
    exit 1
  fi

  log_warning "This will restore workspace from: $restore_dir"
  log_warning "Current workspace will be overwritten!"
  if ! confirm_action; then
    log_info "Restore cancelled."
    return
  fi

  # Pre-restore backup
  log_info "Creating pre-restore backup..."
  backup quick

  if [ -f "$restore_dir/workspace.tar.gz" ]; then
    log_info "Restoring workspace files..."
    tar -xzf "$restore_dir/workspace.tar.gz" -C /root/.openclaw/
    log_success "Workspace restored"
  fi

  # Restore agent profiles
  for profile in "$restore_dir"/.openclaw__openclaw.json "$restore_dir"/.openclaw-*__openclaw.json; do
    [ -f "$profile" ] || continue
    # Undo the flattening done by backup: "__" -> "/" below /root.
    base="${profile##*/}"
    target="/root/${base//__//}"
    mkdir -p "$(dirname "$target")"
    cp "$profile" "$target"
    log_info "Restored: $target"
  done

  log_success "Restore complete. Run './deploy.sh restart' to apply."
}

# Replace the Qdrant collection mem0_v4_shared with the given snapshot.
# Arguments: $1 - path to a snapshot file.
# Exits 1 when $1 is missing or not a file; returns 1 when Qdrant rejects the
# recovery request or cannot be reached.
restore_qdrant() {
  local snap_file="$1"

  if [ -z "$snap_file" ]; then
    log_error "Usage: $0 restore-qdrant <snapshot_file>"
    log_info "Example: $0 restore-qdrant /root/.openclaw/backups/20260306-120000/qdrant-mem0_v4_shared.snapshot"
    exit 1
  fi

  if [ ! -f "$snap_file" ]; then
    log_error "Snapshot file not found: $snap_file"
    exit 1
  fi

  log_warning "This will REPLACE collection mem0_v4_shared with snapshot data!"
  log_warning "Snapshot: $snap_file"
  if ! confirm_action; then
    log_info "Restore cancelled."
    return
  fi

  # Copy snapshot into Qdrant snapshots directory
  local qdrant_snap_dir="/opt/mem0-center/snapshots/mem0_v4_shared"
  local snap_name
  snap_name="$(basename "$snap_file")"
  mkdir -p "$qdrant_snap_dir"
  cp "$snap_file" "$qdrant_snap_dir/$snap_name"

  log_info "Recovering Qdrant snapshot..."
  local result count
  # The recover request references the snapshot under /qdrant/snapshots/...
  # NOTE(review): presumably the container-side mount of $qdrant_snap_dir's
  # parent -- confirm against docker-compose.yml.
  # Tested inside `if` so a failure is reported instead of tripping `set -e`.
  if result=$(curl -sf -X PUT "http://localhost:6333/collections/mem0_v4_shared/snapshots/recover" \
    -H "Content-Type: application/json" \
    -d "{\"location\":\"/qdrant/snapshots/mem0_v4_shared/$snap_name\"}" 2>&1); then
    log_success "Qdrant snapshot recovered: $snap_name"
    count=$(curl -sf "http://localhost:6333/collections/mem0_v4_shared" 2>/dev/null \
      | python3 -c "import sys,json; print(json.load(sys.stdin).get('result',{}).get('points_count',0))" 2>/dev/null \
      || echo "unknown")
    log_info "Collection point count after restore: $count"
  else
    log_error "Qdrant snapshot recovery failed: $result"
    return 1
  fi
}

# Check every agent, the monitor, disk and memory usage (< 80 %), the user
# runtime directory and user linger.
# Returns 0 when every check passed, 1 otherwise.
health_check() {
  local aid atype f3 f4 f5
  local check_cmd check_pattern output unit health_url
  local issues=0

  log_info "Running health check..."
  setup_user_env

  while IFS=$'\t' read -r aid atype f3 f4 f5; do
    case "$atype" in
      local-cli)
        check_cmd="$f3"
        check_pattern="$f5"
        # A stopped agent makes the check command fail; that must be
        # reported below, not abort the script via `set -e`.
        output=$(eval "$check_cmd" 2>&1 </dev/null) || true
        if grep -qE -- "$check_pattern" <<<"$output"; then
          log_success "✓ $aid is running"
        else
          log_error "✗ $aid is not running"
          issues=$((issues + 1))
        fi
        ;;
      local-systemd)
        unit="$f3"
        if systemctl --user is-active --quiet "$unit" 2>/dev/null; then
          log_success "✓ $aid is running ($unit)"
        else
          log_error "✗ $aid is not running ($unit)"
          issues=$((issues + 1))
        fi
        ;;
      remote-http)
        health_url="$f3"
        # NOTE(review): field 4 looks like a per-agent timeout, but its unit
        # is not visible here, so the fixed 5 second limit is kept.
        if curl -sf --max-time 5 "$health_url" >/dev/null 2>&1; then
          log_success "✓ $aid is reachable"
        else
          log_warning "⚠ $aid is unreachable ($health_url)"
          issues=$((issues + 1))
        fi
        ;;
    esac
  done < <($PARSE_AGENTS services)

  if systemctl is-active --quiet openclaw-agent-monitor; then
    log_success "✓ Agent Monitor is running"
  else
    log_error "✗ Agent Monitor is not running"
    issues=$((issues + 1))
  fi

  local disk_usage
  disk_usage=$(df -h /root | tail -1 | awk '{print $5}' | sed 's/%//') || true
  if [ "$disk_usage" -lt 80 ]; then
    log_success "✓ Disk usage: ${disk_usage}%"
  else
    log_warning "⚠ Disk usage: ${disk_usage}%"
    issues=$((issues + 1))
  fi

  local mem_usage
  mem_usage=$(free | grep Mem | awk '{printf("%.0f", $3/$2 * 100.0)}') || true
  if [ "$mem_usage" -lt 80 ]; then
    log_success "✓ Memory usage: ${mem_usage}%"
  else
    log_warning "⚠ Memory usage: ${mem_usage}%"
    issues=$((issues + 1))
  fi

  if [ -d "$XDG_RUNTIME_DIR" ]; then
    log_success "✓ XDG_RUNTIME_DIR exists"
  else
    log_warning "⚠ XDG_RUNTIME_DIR not found"
    issues=$((issues + 1))
  fi

  if loginctl show-user "$(whoami)" -p Linger | grep -q "yes"; then
    log_success "✓ User linger enabled"
  else
    log_warning "⚠ User linger NOT enabled"
    issues=$((issues + 1))
  fi

  echo ""
  if [ "$issues" -eq 0 ]; then
    log_success "All health checks passed!"
    return 0
  else
    log_error "$issues health check(s) failed!"
    return 1
  fi
}

# Print the command overview followed by the agents reported by
# `$PARSE_AGENTS list`.
show_help() {
  local id type name

  echo "OpenClaw System Management Script (config-driven via agents.yaml)"
  echo ""
  echo "Usage: $0 <command>"
  echo ""
  echo "Commands:"
  echo "  install               - Install and start all systemd services"
  echo "  start                 - Start all registered agent services + monitor"
  echo "  stop                  - Stop all services"
  echo "  restart               - Restart all services"
  echo "  status                - Show service status"
  echo "  logs                  - Show recent logs"
  echo "  health                - Run health check"
  echo "  backup                - Full backup (workspace + Qdrant snapshot + agent profiles)"
  echo "  backup quick          - Quick backup (workspace files only, no Qdrant)"
  echo "  restore <dir>         - Restore workspace + config from backup directory"
  echo "  restore-qdrant <file> - Restore Qdrant from snapshot file"
  echo "  rollback              - Rollback to previous git commit"
  echo "  rollback-to <hash>    - Rollback to specific commit"
  echo "  debug-stop            - Stop ALL services including monitor (safe for debugging)"
  echo "  debug-start           - Restart all services after debugging"
  echo "  fix-service           - Re-inject EnvironmentFile after OpenClaw UI upgrade"
  echo "  help                  - Show this help message"
  echo ""
  echo "Registered agents:"
  $PARSE_AGENTS list | while IFS=$'\t' read -r id type name; do
    echo "  $id ($type) - $name"
  done
  echo ""
}

# Main: dispatch on the first argument; defaults to `help`.
main() {
  case "${1:-help}" in
    install)
      install_services
      ;;
    start)
      start_services
      ;;
    stop)
      stop_services
      ;;
    restart)
      restart_services
      ;;
    status)
      show_status
      ;;
    logs)
      show_logs
      ;;
    health)
      health_check
      ;;
    backup)
      backup "$2"
      ;;
    restore)
      restore_workspace "$2"
      ;;
    restore-qdrant)
      restore_qdrant "$2"
      ;;
    rollback)
      rollback
      ;;
    rollback-to)
      rollback_to "$2"
      ;;
    debug-stop)
      debug_stop
      ;;
    debug-start)
      debug_start
      ;;
    fix-service)
      fix_service_files
      ;;
    help | --help | -h)
      show_help
      ;;
    *)
      log_error "Unknown command: $1"
      show_help
      exit 1
      ;;
  esac
}

main "$@"