openClaw_agent_dm/deploy.sh

#!/bin/bash

###############################################################################
# OpenClaw System Deployment & Management Script
# 
# Config-driven: reads agent list from agents.yaml via parse_agents.py
# No hardcoded agent references -- add/remove agents by editing agents.yaml.
#
# Usage:
#   ./deploy.sh install      - Install and start all services
#   ./deploy.sh start        - Start all services
#   ./deploy.sh stop         - Stop all services
#   ./deploy.sh restart      - Restart all services
#   ./deploy.sh status       - Show service status
#   ./deploy.sh logs         - Show recent logs
#   ./deploy.sh health       - Run health check
#   ./deploy.sh rollback     - Rollback to previous git commit
#   ./deploy.sh rollback-to <ref>      - Rollback to a specific commit hash or tag
#   ./deploy.sh backup       - Full backup (workspace + Qdrant snapshot + agent profiles)
#   ./deploy.sh backup quick - Quick backup (workspace files only, no Qdrant)
#   ./deploy.sh restore <dir>          - Restore workspace + config from backup directory
#   ./deploy.sh restore-qdrant <file>  - Restore Qdrant collection from snapshot file
#   ./deploy.sh debug-stop   - Stop ALL services (including monitor) for debugging
#   ./deploy.sh debug-start  - Start ALL services after debugging
#   ./deploy.sh fix-service  - Re-inject EnvironmentFile after OpenClaw UI upgrade
###############################################################################

# Abort on the first unhandled command failure.
set -e

# Filesystem layout and the helper command that reads agents.yaml.
WORKSPACE="/root/.openclaw/workspace"
LOG_DIR="${WORKSPACE}/logs/system"
# Captured once per run; used as the backup directory name.
TIMESTAMP="$(date +%Y%m%d-%H%M%S)"
# Expanded unquoted at call sites, so it is split into command + script path.
PARSE_AGENTS="python3 ${WORKSPACE}/scripts/parse_agents.py"

# ANSI color sequences, stored as unexpanded backslash escapes; the log_*
# helpers expand them when printing.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

# Level-tagged, colorized message helpers. Each writes one line to stdout:
# a colored "[LEVEL]" tag followed by the text in $1. Backslash escapes in
# the color variables (and in the message itself) are expanded on output.
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}

log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}

log_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}

log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}

# Create the system log directory, including any missing parents.
ensure_log_dir() {
    mkdir -p "$LOG_DIR"
}

# Export the environment that `systemctl --user` / `journalctl --user` need
# to reach the current user's session: the per-UID runtime directory and the
# D-Bus socket inside it.
setup_user_env() {
    local runtime_dir="/run/user/$(id -u)"

    export XDG_RUNTIME_DIR="$runtime_dir"
    export DBUS_SESSION_BUS_ADDRESS="unix:path=${runtime_dir}/bus"
}

# Run one action against every agent reported by `$PARSE_AGENTS services`.
#
# Usage: for_each_agent <start|stop|restart|enable|disable|status|logs>
#
# Each parser line is tab-separated: agent id, agent type, then up to three
# type-specific fields. As consumed here:
#   local-cli      f3 = status/check command, f4 = start command
#   local-systemd  f3 = systemd user unit name
#   remote-http    nothing is managed locally
# An action that a type does not implement is ignored for that agent
# (remote-http logs a "skipping" line instead).
#
# The agent list is read on file descriptor 3 instead of stdin. The start and
# check commands are eval'd straight from the config; if one of them read
# stdin while the list was attached to it, the remaining agents would be
# silently swallowed.
for_each_agent() {
    local action="$1"
    local aid atype f3 f4 f5
    setup_user_env

    while IFS=$'\t' read -r -u 3 aid atype f3 f4 f5; do
        case "$atype" in
            local-cli)
                local check_cmd="$f3" start_cmd="$f4"
                # The stop command is derived by replacing the first "start"
                # in the start command with "stop".
                local stop_cmd="${start_cmd/start/stop}"
                case "$action" in
                    start)
                        if eval "$start_cmd" 2>/dev/null; then
                            log_info "Started $aid"
                        else
                            log_warning "$aid start failed"
                        fi
                        ;;
                    stop)
                        # Best effort: a failed stop is still reported as stopped.
                        eval "$stop_cmd" 2>/dev/null || true
                        log_info "Stopped $aid"
                        ;;
                    restart)
                        eval "$stop_cmd" 2>/dev/null || true
                        sleep 1
                        if eval "$start_cmd" 2>/dev/null; then
                            log_info "Restarted $aid"
                        else
                            log_warning "$aid restart failed"
                        fi
                        ;;
                    status)
                        echo ""
                        log_info "=== $aid (local-cli) ==="
                        eval "$check_cmd" || true
                        ;;
                    logs)
                        log_info "=== $aid logs ==="
                        journalctl --user -u openclaw-gateway --no-pager -n 50 2>/dev/null || true
                        ;;
                esac
                ;;
            local-systemd)
                local unit="$f3"
                case "$action" in
                    start)
                        if systemctl --user start "$unit" 2>/dev/null; then
                            log_info "Started $aid ($unit)"
                        else
                            log_warning "$aid start failed"
                        fi
                        ;;
                    stop)
                        systemctl --user stop "$unit" 2>/dev/null || true
                        log_info "Stopped $aid"
                        ;;
                    restart)
                        if systemctl --user restart "$unit" 2>/dev/null; then
                            log_info "Restarted $aid ($unit)"
                        else
                            log_warning "$aid restart failed"
                        fi
                        ;;
                    # enable/disable are not guarded: a failure propagates to
                    # the caller (and aborts the script under `set -e`).
                    enable)  systemctl --user enable "$unit" 2>/dev/null ;;
                    disable) systemctl --user disable "$unit" 2>/dev/null ;;
                    status)
                        echo ""
                        log_info "=== $aid (systemd: $unit) ==="
                        systemctl --user status "$unit" --no-pager -l 2>&1 || true
                        ;;
                    logs)
                        log_info "=== $aid logs ==="
                        journalctl --user -u "$unit" --no-pager -n 50 2>/dev/null || true
                        ;;
                esac
                ;;
            remote-http)
                case "$action" in
                    status)
                        log_info "=== $aid (remote) ==="
                        echo "  Remote agent -- check via health URL"
                        ;;
                    *)
                        log_info "$aid is remote; skipping $action"
                        ;;
                esac
                ;;
        esac
    done 3< <($PARSE_AGENTS services)
}

# Install and start everything:
#   1. enable lingering for the current user and make sure the user runtime
#      directory exists,
#   2. copy the gateway unit plus every local-systemd agent unit that has a
#      template under $WORKSPACE/systemd into the user unit directory,
#   3. install and enable the system-level agent monitor,
#   4. inject EnvironmentFile= lines, then start agents and the monitor.
install_services() {
    local user_units="$HOME/.config/systemd/user"
    local templates="$WORKSPACE/systemd"
    local agent_id agent_type unit_name _f4 _f5

    log_info "Installing OpenClaw systemd services..."

    loginctl enable-linger "$(whoami)"
    setup_user_env

    if [ ! -d "$XDG_RUNTIME_DIR" ]; then
        log_warning "Creating runtime directory..."
        mkdir -p "$XDG_RUNTIME_DIR"
        chmod 700 "$XDG_RUNTIME_DIR"
    fi

    mkdir -p "$user_units/"

    # Main gateway unit (installed under a different name than its template).
    cp "$templates/openclaw-gateway-user.service" "$user_units/openclaw-gateway.service"

    # Units for local-systemd agents; agents without a template are skipped.
    while IFS=$'\t' read -r agent_id agent_type unit_name _f4 _f5; do
        [ "$agent_type" = "local-systemd" ] || continue
        [ -f "$templates/$unit_name" ] || continue
        cp "$templates/$unit_name" "$user_units/$unit_name"
        systemctl --user enable "$unit_name" 2>/dev/null
        log_info "Installed $unit_name"
    done < <($PARSE_AGENTS services)

    systemctl --user daemon-reload
    systemctl --user enable openclaw-gateway

    # The monitor is a system unit, not a user unit.
    log_info "Installing system-level agent monitor..."
    cp "$templates/openclaw-agent-monitor.service" /etc/systemd/system/
    systemctl daemon-reload
    systemctl enable openclaw-agent-monitor

    fix_service_files

    log_info "Starting services..."
    for_each_agent start
    systemctl start openclaw-agent-monitor

    sleep 3

    log_success "OpenClaw services installed and started!"
    log_info "Active agents: $($PARSE_AGENTS ids)"
    log_info "Gateway logs:  journalctl --user -u openclaw-gateway -f"
    log_info "Monitor logs:  journalctl -u openclaw-agent-monitor -f"
}

# Start every registered agent, then the system-level agent monitor.
start_services() {
    local monitor_unit="openclaw-agent-monitor"

    log_info "Starting OpenClaw services..."
    for_each_agent start
    systemctl start "$monitor_unit"
    log_success "All services started"
}

# Stop every registered agent, then the system-level agent monitor.
stop_services() {
    local monitor_unit="openclaw-agent-monitor"

    log_info "Stopping OpenClaw services..."
    for_each_agent stop
    systemctl stop "$monitor_unit"
    log_success "All services stopped"
}

# Restart every registered agent, then the system-level agent monitor.
restart_services() {
    local monitor_unit="openclaw-agent-monitor"

    log_info "Restarting OpenClaw services..."
    for_each_agent restart
    systemctl restart "$monitor_unit"
    log_success "All services restarted"
}

# Enter debug mode: stop the monitor first (so it cannot restart anything),
# then stop every agent, and print a few commands useful while debugging.
debug_stop() {
    local line

    for line in \
        "=== DEBUG MODE: Stopping ALL services ===" \
        "Monitor will NOT auto-restart gateway while in debug mode." \
        "Run './deploy.sh debug-start' when done debugging."; do
        log_warning "$line"
    done

    # The monitor may already be stopped; that is not an error here.
    systemctl stop openclaw-agent-monitor 2>/dev/null || true
    for_each_agent stop

    log_success "All services stopped. Safe to debug."
    printf '\n'
    for line in \
        "Useful debug commands:" \
        "  openclaw gateway start          # start gateway in foreground" \
        "  journalctl --user -u openclaw-gateway -n 100"; do
        log_info "$line"
    done
}

# Leave debug mode: start every agent and the monitor again, give them a
# moment to come up, then run the health check. The function's exit status
# is the health check's.
debug_start() {
    local monitor_unit="openclaw-agent-monitor"

    log_info "=== Exiting DEBUG MODE: Restarting ALL services ==="
    for_each_agent start
    systemctl start "$monitor_unit"

    sleep 2
    log_success "All services restored. Monitor is active again."
    health_check
}

# Make sure each installed user unit loads its agent's environment file.
#
# For every agent listed by `$PARSE_AGENTS services`, the agent's ENV_FILE and
# AGENT_TYPE are taken from `$PARSE_AGENTS info <id>`. When the agent has an
# ENV_FILE, the file exists under $WORKSPACE/systemd, and the matching unit is
# installed, an "EnvironmentFile=-<path>" line is appended after the unit's
# [Service] header unless a line naming that file is already there.
#   local-cli      -> openclaw-gateway.service
#   local-systemd  -> the unit named in the services listing (third field)
# Reloads the user systemd daemon if anything was changed.
fix_service_files() {
    log_info "Ensuring EnvironmentFile= is present in installed service files..."
    setup_user_env

    local changed=0
    local aid atype f3 f4 f5
    local ENV_FILE AGENT_TYPE
    local env_path svc_file svc_name

    while IFS=$'\t' read -r aid atype f3 f4 f5; do
        # Reset per agent: otherwise an agent whose info has no ENV_FILE line
        # would silently reuse the previous agent's value.
        ENV_FILE=""
        AGENT_TYPE=""
        # The info output is assumed to be shell-style KEY=value lines (it
        # has always been eval'd); only the two keys used here are kept.
        eval "$($PARSE_AGENTS info "$aid" 2>/dev/null | grep -E '^(ENV_FILE|AGENT_TYPE)=')"
        if [ -z "$ENV_FILE" ]; then
            continue
        fi

        env_path="$WORKSPACE/systemd/$ENV_FILE"
        svc_file=""

        if [ "$AGENT_TYPE" = "local-cli" ]; then
            svc_file="$HOME/.config/systemd/user/openclaw-gateway.service"
        elif [ "$AGENT_TYPE" = "local-systemd" ]; then
            svc_file="$HOME/.config/systemd/user/$f3"
        fi

        if [ -n "$svc_file" ] && [ -f "$svc_file" ] && [ -f "$env_path" ]; then
            svc_name="$(basename "$svc_file")"
            if ! grep -q "EnvironmentFile=.*${ENV_FILE}" "$svc_file" 2>/dev/null; then
                # The "-" prefix makes systemd tolerate a missing file.
                sed -i "/^\[Service\]/a EnvironmentFile=-${env_path}" "$svc_file"
                log_info "Injected EnvironmentFile into $svc_name"
                changed=1
            else
                log_info "$svc_name already has EnvironmentFile"
            fi
        fi
    done < <($PARSE_AGENTS services)

    if [ "$changed" -eq 1 ]; then
        systemctl --user daemon-reload
        log_success "Service files updated. Run './deploy.sh restart' to apply."
    else
        log_success "All service files are up to date."
    fi
}

# Print the status of every agent followed by the system-level monitor's.
# A non-zero status from systemctl (e.g. an inactive unit) is tolerated.
show_status() {
    local monitor_unit="openclaw-agent-monitor"

    for_each_agent status
    printf '\n'
    log_info "=== Agent Monitor (System Service) ==="
    systemctl status "$monitor_unit" --no-pager -l 2>&1 || :
}

# Print recent logs for every agent, then the last 50 lines of the
# system-level monitor's journal.
show_logs() {
    local monitor_unit="openclaw-agent-monitor"

    setup_user_env
    for_each_agent logs
    printf '\n'
    log_info "=== Monitor logs (last 50 lines) ==="
    journalctl -u "$monitor_unit" --no-pager -n 50
}

# Roll the workspace back by one git commit after interactive confirmation.
# A backup is taken first; services are restarted afterwards so the
# rolled-back files take effect. Any answer other than y/Y cancels.
rollback() {
    log_warning "This will rollback the workspace to the previous git commit!"
    read -p "Are you sure? (y/N): " confirm

    if [[ ! $confirm =~ ^[Yy]$ ]]; then
        log_info "Rollback cancelled."
        return
    fi

    cd "$WORKSPACE"
    backup

    # Show what is about to be discarded before resetting.
    log_info "Current commit:"
    git log -1 --oneline
    git reset --hard HEAD~1
    log_success "Rolled back to previous commit!"

    log_info "Restarting services to apply changes..."
    restart_services
}

# Roll the workspace back to a specific commit after interactive confirmation.
#   $1 - commit hash or tag to reset to (required; the script exits 1 without it)
# A backup is taken before the reset and services are restarted after it.
# Any answer other than y/Y cancels.
rollback_to() {
    if [ -z "$1" ]; then
        log_error "Please specify a commit hash or tag"
        exit 1
    fi

    log_warning "This will rollback the workspace to commit: $1"
    read -p "Are you sure? (y/N): " confirm

    if [[ ! $confirm =~ ^[Yy]$ ]]; then
        log_info "Rollback cancelled."
        return
    fi

    cd "$WORKSPACE"
    backup
    git reset --hard "$1"
    log_success "Rolled back to commit: $1"
    restart_services
}

# Create a timestamped backup under /root/.openclaw/backups/$TIMESTAMP.
#   $1 - "full" (default) or any other value (documented: "quick") to skip
#        the Qdrant part.
# Always saved: the workspace tarball (without .git and logs), every agent's
# openclaw.json profile, the mem0 docker-compose file (if present) and a
# manifest. In full mode a Qdrant snapshot of collection mem0_v4_shared and
# its point count are saved as well.
# Qdrant problems are reported as warnings and never abort the backup: under
# `set -e` a bare failing `var=$(curl ...)` would terminate the script before
# the warning could be logged, so those calls are guarded explicitly.
# After the backup, only the ten most recent backup directories are kept.
backup() {
    local mode="${1:-full}"
    local backups_root="/root/.openclaw/backups"
    local backup_dir="$backups_root/$TIMESTAMP"
    mkdir -p "$backup_dir"

    log_info "Creating $mode backup -> $backup_dir"

    # --- Layer 1+2: workspace files ---
    log_info "Backing up workspace (Layer 1+2)..."
    tar -czf "$backup_dir/workspace.tar.gz" \
        --exclude='.git' \
        --exclude='logs' \
        -C /root/.openclaw workspace

    # --- Config: all agent openclaw.json profiles ---
    # Each profile is stored flat: the leading /root/ is dropped and every
    # remaining "/" becomes "__" (e.g. .openclaw__openclaw.json).
    log_info "Backing up agent profiles..."
    local profile flat_name
    for profile in /root/.openclaw/openclaw.json /root/.openclaw-*/openclaw.json; do
        [ -f "$profile" ] || continue   # also skips an unmatched glob
        flat_name="${profile#/root/}"
        flat_name="${flat_name//\//__}"
        cp "$profile" "$backup_dir/$flat_name" 2>/dev/null || true
    done

    # --- Config: docker-compose (optional) ---
    cp /opt/mem0-center/docker-compose.yml "$backup_dir/" 2>/dev/null || true

    if [ "$mode" = "full" ]; then
        local collection_url="http://localhost:6333/collections/mem0_v4_shared"

        # --- Layer 4: Qdrant snapshot ---
        log_info "Creating Qdrant snapshot (mem0_v4_shared)..."
        local snap_response="" snap_name="" snap_src
        if snap_response=$(curl -sf -X POST "$collection_url/snapshots" 2>/dev/null) \
            && [ -n "$snap_response" ]; then
            snap_name=$(printf '%s\n' "$snap_response" \
                | python3 -c "import sys,json; print(json.load(sys.stdin).get('result',{}).get('name',''))" 2>/dev/null) \
                || snap_name=""
            if [ -n "$snap_name" ]; then
                snap_src="/opt/mem0-center/snapshots/mem0_v4_shared/$snap_name"
                if [ -f "$snap_src" ]; then
                    cp "$snap_src" "$backup_dir/qdrant-mem0_v4_shared.snapshot"
                    log_success "Qdrant snapshot saved: $snap_name"
                else
                    log_warning "Snapshot file not found at $snap_src"
                fi
            else
                log_warning "Could not parse snapshot name from response"
            fi
        else
            log_warning "Qdrant snapshot failed (is Qdrant running?)"
        fi

        # --- Layer 4: pre-backup memory count ---
        local mem_count
        mem_count=$(curl -sf "$collection_url" 2>/dev/null | \
            python3 -c "import sys,json; print(json.load(sys.stdin).get('result',{}).get('points_count',0))" 2>/dev/null || echo "unknown")
        echo "$mem_count" > "$backup_dir/qdrant-point-count.txt"
        log_info "Qdrant point count: $mem_count"
    fi

    # --- Manifest ---
    cat > "$backup_dir/manifest.txt" <<EOF
OpenClaw Backup - $TIMESTAMP
Mode: $mode
Date: $(date -Iseconds)
Agents: $($PARSE_AGENTS ids 2>/dev/null || echo "unknown")
Contents:
  workspace.tar.gz          - Layer 1+2 workspace files
  .openclaw__openclaw.json  - main agent profile
  docker-compose.yml        - Qdrant docker config
EOF
    if [ "$mode" = "full" ]; then
        echo "  qdrant-mem0_v4_shared.snapshot - Layer 4 vector data" >> "$backup_dir/manifest.txt"
    fi

    log_success "Backup complete: $backup_dir"

    # --- Retention: keep last 10 backups ---
    # Glob expansion is sorted, and the timestamped names sort oldest first,
    # so the entries to drop are at the front of the array.
    local -a existing=("$backups_root"/[0-9]*)
    local excess=$(( ${#existing[@]} - 10 ))
    local old
    if [ "$excess" -gt 0 ]; then
        for old in "${existing[@]:0:excess}"; do
            rm -rf -- "${old:?}"
            log_info "Pruned old backup: $(basename "$old")"
        done
    fi
}

# Restore the workspace tarball and agent profiles from a backup directory.
#   $1 - backup directory (as created by `backup`); when missing or not a
#        directory, usage plus the available backups are printed and the
#        script exits 1.
# Asks for confirmation (anything but y/Y cancels), takes a quick pre-restore
# backup, extracts workspace.tar.gz into /root/.openclaw/ and copies each
# flattened "<dir>__openclaw.json" profile back to /root/<dir>/openclaw.json.
# Returns 1 if the backup directory disappeared before it could be restored.
restore_workspace() {
    local restore_dir="$1"
    local confirm candidate profile target

    if [ -z "$restore_dir" ] || [ ! -d "$restore_dir" ]; then
        log_error "Usage: $0 restore <backup-directory>"
        log_info "Available backups:"
        for candidate in /root/.openclaw/backups/[0-9]*; do
            [ -e "$candidate" ] || continue   # unmatched glob stays literal
            echo "  $candidate"
        done
        exit 1
    fi

    log_warning "This will restore workspace from: $restore_dir"
    log_warning "Current workspace will be overwritten!"
    read -r -p "Are you sure? (y/N): " confirm
    if [[ ! $confirm =~ ^[Yy]$ ]]; then
        log_info "Restore cancelled."
        return
    fi

    # Pre-restore backup
    log_info "Creating pre-restore backup..."
    backup quick

    # The pre-restore backup applies the backup retention limit and may have
    # pruned the oldest backup -- which can be the one being restored.
    # Report that instead of claiming a successful restore.
    if [ ! -d "$restore_dir" ]; then
        log_error "Backup directory no longer exists (pruned by backup retention?): $restore_dir"
        return 1
    fi

    if [ -f "$restore_dir/workspace.tar.gz" ]; then
        log_info "Restoring workspace files..."
        tar -xzf "$restore_dir/workspace.tar.gz" -C /root/.openclaw/
        log_success "Workspace restored"
    else
        log_warning "No workspace.tar.gz in $restore_dir; workspace files were not restored"
    fi

    # Restore agent profiles: "__" in the flattened file name maps back to "/".
    for profile in "$restore_dir"/.openclaw__openclaw.json "$restore_dir"/.openclaw-*__openclaw.json; do
        [ -f "$profile" ] || continue
        target="/root/$(basename "$profile" | sed 's|__|/|g')"
        mkdir -p "$(dirname "$target")"
        cp "$profile" "$target"
        log_info "Restored: $target"
    done

    log_success "Restore complete. Run './deploy.sh restart' to apply."
}

# Replace the Qdrant collection mem0_v4_shared with the contents of a snapshot.
#   $1 - path to a snapshot file (required, must exist; otherwise the script
#        exits 1 after printing usage / an error).
# Asks for confirmation (anything but y/Y cancels), copies the snapshot into
# /opt/mem0-center/snapshots/mem0_v4_shared and asks Qdrant to recover from
# /qdrant/snapshots/mem0_v4_shared/<name> (presumably the same directory as
# seen from inside the Qdrant container -- confirm against docker-compose).
# Returns 1 when the recovery request fails.
restore_qdrant() {
    local snap_file="$1"
    local confirm

    if [ -z "$snap_file" ]; then
        log_error "Usage: $0 restore-qdrant <snapshot-file>"
        log_info "Example: $0 restore-qdrant /root/.openclaw/backups/20260306-120000/qdrant-mem0_v4_shared.snapshot"
        exit 1
    fi
    if [ ! -f "$snap_file" ]; then
        log_error "Snapshot file not found: $snap_file"
        exit 1
    fi

    log_warning "This will REPLACE collection mem0_v4_shared with snapshot data!"
    log_warning "Snapshot: $snap_file"
    read -r -p "Are you sure? (y/N): " confirm
    if [[ ! $confirm =~ ^[Yy]$ ]]; then
        log_info "Restore cancelled."
        return
    fi

    # Copy snapshot into Qdrant snapshots directory
    local qdrant_snap_dir="/opt/mem0-center/snapshots/mem0_v4_shared"
    local collection_url="http://localhost:6333/collections/mem0_v4_shared"
    local snap_name
    snap_name="$(basename "$snap_file")"
    mkdir -p "$qdrant_snap_dir"
    cp "$snap_file" "$qdrant_snap_dir/$snap_name"

    log_info "Recovering Qdrant snapshot..."
    local result count
    # The request is tested directly in the `if`: under `set -e` a bare
    # failing `result=$(curl ...)` would abort the script before the failure
    # could be reported. -S keeps curl's error text so it ends up in $result.
    if result=$(curl -sS -f -X PUT "$collection_url/snapshots/recover" \
        -H "Content-Type: application/json" \
        -d "{\"location\":\"/qdrant/snapshots/mem0_v4_shared/$snap_name\"}" 2>&1); then
        log_success "Qdrant snapshot recovered: $snap_name"
        count=$(curl -sf "$collection_url" 2>/dev/null | \
            python3 -c "import sys,json; print(json.load(sys.stdin).get('result',{}).get('points_count',0))" 2>/dev/null || echo "unknown")
        log_info "Collection point count after restore: $count"
    else
        log_error "Qdrant snapshot recovery failed: $result"
        return 1
    fi
}

# Run all health checks and print one line per check.
#
# Per agent (from `$PARSE_AGENTS services`, tab-separated):
#   local-cli      run the check command (f3) and look for the extended regex
#                  in f5 in its combined output
#   local-systemd  `systemctl --user is-active` on the unit in f3
#   remote-http    HTTP GET of the health URL in f3 (5 second limit)
# Then: the system agent monitor, disk usage of /root (< 80%), memory usage
# (< 80%), existence of XDG_RUNTIME_DIR, and user lingering.
#
# Returns 0 when every check passed, 1 otherwise.
health_check() {
    log_info "Running health check..."
    setup_user_env

    local issues=0
    local aid atype f3 f4 f5 output

    # The list is read on fd 3 so an eval'd check command that reads stdin
    # cannot swallow the remaining agents.
    while IFS=$'\t' read -r -u 3 aid atype f3 f4 f5; do
        case "$atype" in
            local-cli)
                local check_cmd="$f3" check_pattern="$f5"
                # A check command that exits non-zero is an expected outcome
                # (agent down); it must not abort the script under `set -e`.
                output=$(eval "$check_cmd" 2>&1) || true
                if grep -qE -- "$check_pattern" <<<"$output"; then
                    log_success "✓ $aid is running"
                else
                    log_error "✗ $aid is not running"
                    issues=$((issues + 1))
                fi
                ;;
            local-systemd)
                local unit="$f3"
                if systemctl --user is-active --quiet "$unit" 2>/dev/null; then
                    log_success "✓ $aid is running ($unit)"
                else
                    log_error "✗ $aid is not running ($unit)"
                    issues=$((issues + 1))
                fi
                ;;
            remote-http)
                # NOTE(review): f4 carries a per-agent timeout value that is
                # not applied; the limit below is fixed. Confirm its unit
                # before wiring it in.
                local health_url="$f3"
                if curl -sf --max-time 5 "$health_url" >/dev/null 2>&1; then
                    log_success "✓ $aid is reachable"
                else
                    log_warning "⚠ $aid is unreachable ($health_url)"
                    issues=$((issues + 1))
                fi
                ;;
        esac
    done 3< <($PARSE_AGENTS services)

    if systemctl is-active --quiet openclaw-agent-monitor; then
        log_success "✓ Agent Monitor is running"
    else
        log_error "✗ Agent Monitor is not running"
        issues=$((issues + 1))
    fi

    # -P keeps each filesystem on a single line; column 5 is the use percentage.
    local disk_usage
    disk_usage=$(df -P /root | awk 'NR == 2 { sub(/%/, "", $5); print $5 }') || true
    if [[ "$disk_usage" =~ ^[0-9]+$ ]] && [ "$disk_usage" -lt 80 ]; then
        log_success "✓ Disk usage: ${disk_usage}%"
    else
        log_warning "⚠ Disk usage: ${disk_usage}%"
        issues=$((issues + 1))
    fi

    # used / total from the "Mem" line of `free`, rounded to a whole percent.
    local mem_usage
    mem_usage=$(free | awk '/Mem/ { printf("%.0f", $3/$2 * 100.0) }') || true
    if [[ "$mem_usage" =~ ^[0-9]+$ ]] && [ "$mem_usage" -lt 80 ]; then
        log_success "✓ Memory usage: ${mem_usage}%"
    else
        log_warning "⚠ Memory usage: ${mem_usage}%"
        issues=$((issues + 1))
    fi

    if [ -d "$XDG_RUNTIME_DIR" ]; then
        log_success "✓ XDG_RUNTIME_DIR exists"
    else
        log_warning "⚠ XDG_RUNTIME_DIR not found"
        issues=$((issues + 1))
    fi

    if loginctl show-user "$(whoami)" -p Linger | grep -q "yes"; then
        log_success "✓ User linger enabled"
    else
        log_warning "⚠ User linger NOT enabled"
        issues=$((issues + 1))
    fi

    echo ""
    if [ "$issues" -eq 0 ]; then
        log_success "All health checks passed!"
        return 0
    else
        log_error "$issues health check(s) failed!"
        return 1
    fi
}

# Print usage, the list of commands, and the agents currently registered in
# agents.yaml (one "id (type) - name" line per agent from `$PARSE_AGENTS list`).
show_help() {
    local agent_id agent_type agent_name

    cat <<EOF
OpenClaw System Management Script (config-driven via agents.yaml)

Usage: $0 <command>

Commands:
  install       - Install and start all systemd services
  start         - Start all registered agent services + monitor
  stop          - Stop all services
  restart       - Restart all services
  status        - Show service status
  logs          - Show recent logs
  health        - Run health check
  backup        - Full backup (workspace + Qdrant snapshot + agent profiles)
  backup quick  - Quick backup (workspace files only, no Qdrant)
  restore <dir> - Restore workspace + config from backup directory
  restore-qdrant <file> - Restore Qdrant from snapshot file
  rollback      - Rollback to previous git commit
  rollback-to   - Rollback to specific commit
  debug-stop    - Stop ALL services including monitor (safe for debugging)
  debug-start   - Restart all services after debugging
  fix-service   - Re-inject EnvironmentFile after OpenClaw UI upgrade
  help          - Show this help message

Registered agents:
EOF
    while IFS=$'\t' read -r agent_id agent_type agent_name; do
        printf '  %s (%s) - %s\n' "$agent_id" "$agent_type" "$agent_name"
    done < <($PARSE_AGENTS list)
    printf '\n'
}

# Main: dispatch the first command-line argument to its handler. With no
# argument the help text is shown; an unknown command prints an error plus
# the help text and exits 1. "$2" is forwarded to the commands that take an
# operand (backup mode, backup directory, snapshot file, commit).
main() {
    case "${1:-help}" in
        install)
            install_services
            ;;
        start)
            start_services
            ;;
        stop)
            stop_services
            ;;
        restart)
            restart_services
            ;;
        status)
            show_status
            ;;
        logs)
            show_logs
            ;;
        health)
            health_check
            ;;
        backup)
            backup "$2"
            ;;
        restore)
            restore_workspace "$2"
            ;;
        restore-qdrant)
            restore_qdrant "$2"
            ;;
        rollback)
            rollback
            ;;
        rollback-to)
            rollback_to "$2"
            ;;
        debug-stop)
            debug_stop
            ;;
        debug-start)
            debug_start
            ;;
        fix-service)
            fix_service_files
            ;;
        help | --help | -h)
            show_help
            ;;
        *)
            log_error "Unknown command: $1"
            show_help
            exit 1
            ;;
    esac
}

main "$@"