You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

589 lines
21 KiB

#!/bin/bash
###############################################################################
# OpenClaw System Deployment & Management Script
#
# Config-driven: reads agent list from agents.yaml via parse_agents.py
# No hardcoded agent references -- add/remove agents by editing agents.yaml.
#
# Usage:
# ./deploy.sh install - Install and start all services
# ./deploy.sh start - Start all services
# ./deploy.sh stop - Stop all services
# ./deploy.sh restart - Restart all services
# ./deploy.sh status - Show service status
# ./deploy.sh logs - Show recent logs
# ./deploy.sh health - Run health check
# ./deploy.sh rollback - Rollback to previous git commit
# ./deploy.sh rollback-to <ref> - Rollback to a specific commit hash or tag
# ./deploy.sh backup - Full backup (workspace + Qdrant snapshot + agent profiles)
# ./deploy.sh backup quick - Quick backup (workspace + agent profiles + docker-compose, no Qdrant)
# ./deploy.sh restore <dir> - Restore workspace + config from backup directory
# ./deploy.sh restore-qdrant <file> - Restore Qdrant collection from snapshot file
# ./deploy.sh debug-stop - Stop ALL services (including monitor) for debugging
# ./deploy.sh debug-start - Start ALL services after debugging
# ./deploy.sh fix-service - Re-inject EnvironmentFile after OpenClaw UI upgrade
###############################################################################
# Abort on the first unhandled command failure.
set -o errexit

# Filesystem layout, a per-run timestamp, and the agents.yaml parser command.
WORKSPACE="/root/.openclaw/workspace"
LOG_DIR="${WORKSPACE}/logs/system"
TIMESTAMP="$(date +%Y%m%d-%H%M%S)"
PARSE_AGENTS="python3 ${WORKSPACE}/scripts/parse_agents.py"

# ANSI colour escapes; kept as literal backslash sequences and expanded by
# the log_* helpers at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
# Print "$1" to stdout behind a [INFO] tag wrapped in the BLUE/NC escapes.
# Backslash escapes in the message are expanded, as with `echo -e`.
log_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
# Print "$1" to stdout behind a [SUCCESS] tag wrapped in the GREEN/NC escapes.
# Backslash escapes in the message are expanded, as with `echo -e`.
log_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
# Print "$1" to stdout behind a [WARNING] tag wrapped in the YELLOW/NC escapes.
# Backslash escapes in the message are expanded, as with `echo -e`.
log_warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
# Print "$1" to stdout behind an [ERROR] tag wrapped in the RED/NC escapes.
# Backslash escapes in the message are expanded, as with `echo -e`.
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
# Create the directory named by LOG_DIR, including missing parents.
ensure_log_dir() {
  local target="$LOG_DIR"
  mkdir -p "$target"
}
# Export XDG_RUNTIME_DIR and DBUS_SESSION_BUS_ADDRESS for the current uid;
# every `systemctl --user` / `journalctl --user` call in this script runs
# after this function.
setup_user_env() {
  local runtime_dir="/run/user/$(id -u)"
  export XDG_RUNTIME_DIR="$runtime_dir"
  export DBUS_SESSION_BUS_ADDRESS="unix:path=${runtime_dir}/bus"
}
# Iterate over agents from agents.yaml and perform an action per type.
# Usage: for_each_agent <start|stop|restart|enable|disable|status|logs>
#
# Records come from `$PARSE_AGENTS services`, one agent per line, as
# tab-separated fields: <id> <type> <f3> <f4> <f5>.
#   local-cli      f3 = check command, f4 = start command (both eval'd)
#   local-systemd  f3 = systemd user unit name
#   remote-http    nothing to do locally; only reported
# enable/disable only apply to local-systemd agents. Agents of any other type
# are skipped silently.
#
# A failure for one agent is logged and never aborts the loop, so the
# remaining agents are still processed when the script runs under `set -e`.
for_each_agent() {
  local action="$1"
  local aid atype f3 f4 f5
  local check_cmd start_cmd stop_cmd unit
  setup_user_env
  # The records are read on fd 3 so that the commands run below keep the
  # script's own stdin and cannot swallow the remaining agent records.
  while IFS=$'\t' read -r -u 3 aid atype f3 f4 f5; do
    case "$atype" in
      local-cli)
        check_cmd="$f3"
        start_cmd="$f4"
        # The stop command is the start command with its first "start"
        # replaced by "stop".
        stop_cmd="${start_cmd/start/stop}"
        case "$action" in
          start)
            if eval "$start_cmd" 2>/dev/null; then
              log_info "Started $aid"
            else
              log_warning "$aid start failed"
            fi
            ;;
          stop)
            eval "$stop_cmd" 2>/dev/null || true
            log_info "Stopped $aid"
            ;;
          restart)
            eval "$stop_cmd" 2>/dev/null || true
            sleep 1
            if eval "$start_cmd" 2>/dev/null; then
              log_info "Restarted $aid"
            else
              log_warning "$aid restart failed"
            fi
            ;;
          status)
            echo ""
            log_info "=== $aid (local-cli) ==="
            eval "$check_cmd" || true
            ;;
          logs)
            log_info "=== $aid logs ==="
            journalctl --user -u openclaw-gateway --no-pager -n 50 2>/dev/null || true
            ;;
        esac
        ;;
      local-systemd)
        unit="$f3"
        case "$action" in
          start)
            if systemctl --user start "$unit" 2>/dev/null; then
              log_info "Started $aid ($unit)"
            else
              log_warning "$aid start failed"
            fi
            ;;
          stop)
            systemctl --user stop "$unit" 2>/dev/null || true
            log_info "Stopped $aid"
            ;;
          restart)
            if systemctl --user restart "$unit" 2>/dev/null; then
              log_info "Restarted $aid ($unit)"
            else
              log_warning "$aid restart failed"
            fi
            ;;
          enable)
            systemctl --user enable "$unit" 2>/dev/null \
              || log_warning "$aid enable failed"
            ;;
          disable)
            systemctl --user disable "$unit" 2>/dev/null \
              || log_warning "$aid disable failed"
            ;;
          status)
            echo ""
            log_info "=== $aid (systemd: $unit) ==="
            systemctl --user status "$unit" --no-pager -l 2>&1 || true
            ;;
          logs)
            log_info "=== $aid logs ==="
            journalctl --user -u "$unit" --no-pager -n 50 2>/dev/null || true
            ;;
        esac
        ;;
      remote-http)
        case "$action" in
          status)
            log_info "=== $aid (remote) ==="
            echo " Remote agent -- check via health URL"
            ;;
          *)
            log_info "$aid is remote; skipping $action"
            ;;
        esac
        ;;
    esac
  done 3< <($PARSE_AGENTS services)
}
# Install and start everything:
#   1. enable linger for the current user and make sure XDG_RUNTIME_DIR exists,
#   2. copy the gateway unit and every local-systemd agent unit that has a
#      template under $WORKSPACE/systemd into ~/.config/systemd/user,
#   3. install and enable the system-level monitor unit,
#   4. run fix_service_files, start all agents and the monitor.
install_services() {
  local user_units="$HOME/.config/systemd/user"
  local templates="$WORKSPACE/systemd"

  log_info "Installing OpenClaw systemd services..."
  loginctl enable-linger "$(whoami)"
  setup_user_env
  if [ ! -d "$XDG_RUNTIME_DIR" ]; then
    log_warning "Creating runtime directory..."
    mkdir -p "$XDG_RUNTIME_DIR"
    chmod 700 "$XDG_RUNTIME_DIR"
  fi
  mkdir -p "$user_units/"

  # Main gateway unit: the template carries a "-user" suffix that the
  # installed copy drops.
  cp "$templates/openclaw-gateway-user.service" "$user_units/openclaw-gateway.service"

  # Units for local-systemd agents; agents without a template file are skipped.
  while IFS=$'\t' read -r aid atype f3 f4 f5; do
    [ "$atype" = "local-systemd" ] || continue
    local unit="$f3"
    local svc_template="$templates/$unit"
    [ -f "$svc_template" ] || continue
    cp "$svc_template" "$user_units/$unit"
    systemctl --user enable "$unit" 2>/dev/null
    log_info "Installed $unit"
  done < <($PARSE_AGENTS services)

  systemctl --user daemon-reload
  systemctl --user enable openclaw-gateway

  # The monitor is a system unit, not a user unit.
  log_info "Installing system-level agent monitor..."
  cp "$templates/openclaw-agent-monitor.service" /etc/systemd/system/
  systemctl daemon-reload
  systemctl enable openclaw-agent-monitor

  fix_service_files

  log_info "Starting services..."
  for_each_agent start
  systemctl start openclaw-agent-monitor
  sleep 3
  log_success "OpenClaw services installed and started!"

  local agent_names=$($PARSE_AGENTS ids)
  log_info "Active agents: $agent_names"
  log_info "Gateway logs: journalctl --user -u openclaw-gateway -f"
  log_info "Monitor logs: journalctl -u openclaw-agent-monitor -f"
}
# Start every registered agent, then the system-level monitor unit.
start_services() {
  local monitor_unit="openclaw-agent-monitor"
  log_info "Starting OpenClaw services..."
  for_each_agent start
  systemctl start "$monitor_unit"
  log_success "All services started"
}
# Stop every registered agent, then the system-level monitor unit.
stop_services() {
  local monitor_unit="openclaw-agent-monitor"
  log_info "Stopping OpenClaw services..."
  for_each_agent stop
  systemctl stop "$monitor_unit"
  log_success "All services stopped"
}
# Restart every registered agent, then the system-level monitor unit.
restart_services() {
  local monitor_unit="openclaw-agent-monitor"
  log_info "Restarting OpenClaw services..."
  for_each_agent restart
  systemctl restart "$monitor_unit"
  log_success "All services restarted"
}
# Enter debug mode: stop the monitor first, then every agent, and print a
# few commands that are handy while debugging. A failure to stop the monitor
# is ignored.
debug_stop() {
  local banner
  for banner in \
    "=== DEBUG MODE: Stopping ALL services ===" \
    "Monitor will NOT auto-restart gateway while in debug mode." \
    "Run './deploy.sh debug-start' when done debugging."; do
    log_warning "$banner"
  done

  systemctl stop openclaw-agent-monitor 2>/dev/null || true
  for_each_agent stop
  log_success "All services stopped. Safe to debug."

  echo ""
  local hint
  for hint in \
    "Useful debug commands:" \
    " openclaw gateway start # start gateway in foreground" \
    " journalctl --user -u openclaw-gateway -n 100"; do
    log_info "$hint"
  done
}
# Leave debug mode: start every agent and the monitor, wait briefly, then run
# the health check. The health check's status is this function's status.
debug_start() {
  local settle_seconds=2
  log_info "=== Exiting DEBUG MODE: Restarting ALL services ==="
  for_each_agent start
  systemctl start openclaw-agent-monitor
  sleep "$settle_seconds"
  log_success "All services restored. Monitor is active again."
  health_check
}
# Make sure every installed unit file of an agent that declares an env file
# carries an "EnvironmentFile=-<path>" line directly after its [Service] header.
#
# For each agent listed by `$PARSE_AGENTS services`, the ENV_FILE= and
# AGENT_TYPE= lines printed by `$PARSE_AGENTS info <id>` are evaluated.
# local-cli agents map to the gateway unit, local-systemd agents to the unit
# named in field 3. A unit is edited only when both the unit file and the env
# file exist and the unit does not already reference the env file.
# When anything changed, `systemctl --user daemon-reload` is run.
fix_service_files() {
  log_info "Ensuring EnvironmentFile= is present in installed service files..."
  setup_user_env
  local changed=0
  local aid atype f3 f4 f5
  # Local, so the values set by eval below do not leak into the caller.
  local ENV_FILE AGENT_TYPE
  local env_path svc_file svc_name
  while IFS=$'\t' read -r aid atype f3 f4 f5; do
    # Reset per agent: without this, an agent whose info output has no
    # ENV_FILE line would silently reuse the previous agent's value.
    ENV_FILE=""
    AGENT_TYPE=""
    # NOTE(review): this evaluates parser output as shell code; it relies on
    # parse_agents.py emitting safely quoted KEY=value lines.
    # stdin is /dev/null so the parser cannot consume the loop's records.
    eval "$($PARSE_AGENTS info "$aid" 2>/dev/null </dev/null \
      | grep -E '^(ENV_FILE|AGENT_TYPE)=')"
    if [ -z "$ENV_FILE" ]; then continue; fi

    env_path="$WORKSPACE/systemd/$ENV_FILE"
    svc_file=""
    if [ "$AGENT_TYPE" = "local-cli" ]; then
      svc_file="$HOME/.config/systemd/user/openclaw-gateway.service"
    elif [ "$AGENT_TYPE" = "local-systemd" ]; then
      svc_file="$HOME/.config/systemd/user/$f3"
    fi

    if [ -n "$svc_file" ] && [ -f "$svc_file" ] && [ -f "$env_path" ]; then
      svc_name="$(basename "$svc_file")"
      if ! grep -q "EnvironmentFile=.*${ENV_FILE}" "$svc_file" 2>/dev/null; then
        # The leading "-" tells systemd to ignore a missing env file.
        sed -i "/^\[Service\]/a EnvironmentFile=-${env_path}" "$svc_file"
        log_info "Injected EnvironmentFile into $svc_name"
        changed=1
      else
        log_info "$svc_name already has EnvironmentFile"
      fi
    fi
  done < <($PARSE_AGENTS services)

  if [ "$changed" -eq 1 ]; then
    systemctl --user daemon-reload
    log_success "Service files updated. Run './deploy.sh restart' to apply."
  else
    log_success "All service files are up to date."
  fi
}
# Print the status of every agent followed by the system-level monitor unit.
# A non-zero status from `systemctl status` is ignored.
show_status() {
  local monitor_unit="openclaw-agent-monitor"
  for_each_agent status
  echo ""
  log_info "=== Agent Monitor (System Service) ==="
  systemctl status "$monitor_unit" --no-pager -l 2>&1 || true
}
# Print recent logs of every agent, then the last 50 journal lines of the
# system-level monitor unit.
show_logs() {
  local monitor_unit="openclaw-agent-monitor"
  local tail_lines=50
  setup_user_env
  for_each_agent logs
  echo ""
  log_info "=== Monitor logs (last 50 lines) ==="
  journalctl -u "$monitor_unit" --no-pager -n "$tail_lines"
}
# Interactively reset the workspace repository to the commit before HEAD.
# After a "y"/"Y" answer: take a backup, show the current commit, hard-reset
# to HEAD~1 and restart all services. Any other answer cancels.
rollback() {
  log_warning "This will rollback the workspace to the previous git commit!"
  read -p "Are you sure? (y/N): " confirm
  if [[ ! $confirm =~ ^[Yy]$ ]]; then
    log_info "Rollback cancelled."
    return
  fi

  cd "$WORKSPACE"
  backup
  log_info "Current commit:"
  git log -1 --oneline
  # Hard reset: uncommitted workspace changes are discarded as well.
  git reset --hard HEAD~1
  log_success "Rolled back to previous commit!"
  log_info "Restarting services to apply changes..."
  restart_services
}
# Interactively hard-reset the workspace repository to the commit or tag
# given as $1. Exits the script with status 1 when $1 is empty. After a
# "y"/"Y" answer: take a backup, reset and restart all services.
rollback_to() {
  [ -n "$1" ] || {
    log_error "Please specify a commit hash or tag"
    exit 1
  }

  log_warning "This will rollback the workspace to commit: $1"
  read -p "Are you sure? (y/N): " confirm
  if [[ ! $confirm =~ ^[Yy]$ ]]; then
    log_info "Rollback cancelled."
    return
  fi

  cd "$WORKSPACE"
  backup
  git reset --hard "$1"
  log_success "Rolled back to commit: $1"
  restart_services
}
# Create a timestamped backup directory and prune old ones.
#
# Arguments:
#   $1 - mode: "full" (default) also takes a Qdrant snapshot and records the
#        collection's point count; any other value skips the Qdrant steps.
#   $2 - optional backup root, default /root/.openclaw/backups.
# Globals: TIMESTAMP (names the backup directory), PARSE_AGENTS.
#
# Every mode stores workspace.tar.gz (without .git and logs), all
# openclaw.json agent profiles found under /root/.openclaw*, and the
# mem0-center docker-compose.yml when present. A Qdrant failure is logged as
# a warning and never aborts the backup. Afterwards only the 10 newest
# backups (directories whose name starts with a digit) are kept.
backup() {
  local mode="${1:-full}"
  local parent="${2:-/root/.openclaw/backups}"
  local backup_dir="$parent/$TIMESTAMP"
  local snapshot_out="$backup_dir/qdrant-mem0_v4_shared.snapshot"
  mkdir -p "$backup_dir"
  log_info "Creating $mode backup -> $backup_dir"

  # --- Layer 1+2: workspace files ---
  log_info "Backing up workspace (Layer 1+2)..."
  tar -czf "$backup_dir/workspace.tar.gz" \
    --exclude='.git' \
    --exclude='logs' \
    -C /root/.openclaw workspace

  # --- Config: all agent openclaw.json profiles ---
  # The source path is flattened into the file name ("/" becomes "__") so
  # restore can map it back to its location under /root.
  log_info "Backing up agent profiles..."
  local profile flat_name
  for profile in /root/.openclaw/openclaw.json /root/.openclaw-*/openclaw.json; do
    if [ -f "$profile" ]; then
      flat_name="$(echo "$profile" | sed 's|/root/||;s|/|__|g')"
      cp "$profile" "$backup_dir/$flat_name" 2>/dev/null || true
    fi
  done

  # --- Config: docker-compose (best effort) ---
  cp /opt/mem0-center/docker-compose.yml "$backup_dir/" 2>/dev/null || true

  if [ "$mode" = "full" ]; then
    # --- Layer 4: Qdrant snapshot ---
    log_info "Creating Qdrant snapshot (mem0_v4_shared)..."
    local snap_response snap_name snap_src
    # curl runs inside the `if` condition: as a bare assignment a failure
    # would terminate the script under `set -e` before the warning is logged.
    if snap_response=$(curl -sf -X POST "http://localhost:6333/collections/mem0_v4_shared/snapshots" 2>/dev/null) \
      && [ -n "$snap_response" ]; then
      snap_name=$(echo "$snap_response" \
        | python3 -c "import sys,json; print(json.load(sys.stdin).get('result',{}).get('name',''))" 2>/dev/null) \
        || snap_name=""
      if [ -n "$snap_name" ]; then
        snap_src="/opt/mem0-center/snapshots/mem0_v4_shared/$snap_name"
        if [ -f "$snap_src" ]; then
          cp "$snap_src" "$snapshot_out"
          log_success "Qdrant snapshot saved: $snap_name"
        else
          log_warning "Snapshot file not found at $snap_src"
        fi
      else
        log_warning "Could not parse snapshot name from response"
      fi
    else
      log_warning "Qdrant snapshot failed (is Qdrant running?)"
    fi

    # --- Layer 4: pre-backup memory count ---
    local mem_count
    mem_count=$(curl -sf "http://localhost:6333/collections/mem0_v4_shared" 2>/dev/null | \
      python3 -c "import sys,json; print(json.load(sys.stdin).get('result',{}).get('points_count',0))" 2>/dev/null || echo "unknown")
    echo "$mem_count" > "$backup_dir/qdrant-point-count.txt"
    log_info "Qdrant point count: $mem_count"
  fi

  # --- Manifest ---
  cat > "$backup_dir/manifest.txt" <<EOF
OpenClaw Backup - $TIMESTAMP
Mode: $mode
Date: $(date -Iseconds)
Agents: $($PARSE_AGENTS ids 2>/dev/null || echo "unknown")
Contents:
workspace.tar.gz - Layer 1+2 workspace files
.openclaw__openclaw.json - main agent profile
docker-compose.yml - Qdrant docker config
EOF
  # List the snapshot only when it was actually written.
  if [ -f "$snapshot_out" ]; then
    echo " qdrant-mem0_v4_shared.snapshot - Layer 4 vector data" >> "$backup_dir/manifest.txt"
  fi
  log_success "Backup complete: $backup_dir"

  # --- Retention: keep last 10 backups ---
  # Glob results are sorted, and timestamped names sort oldest first.
  local -a existing=()
  local entry
  for entry in "$parent"/[0-9]*; do
    if [ -e "$entry" ]; then
      existing+=("$entry")
    fi
  done
  local count=${#existing[@]}
  if [ "$count" -gt 10 ]; then
    local to_remove=$((count - 10))
    for entry in "${existing[@]:0:to_remove}"; do
      rm -rf -- "$entry"
      log_info "Pruned old backup: $(basename "$entry")"
    done
  fi
}
# Restore the workspace archive and agent profiles from the backup directory
# given as $1.
#
# With a missing or non-directory argument: print usage plus the available
# backups and exit the script with status 1. Otherwise ask for confirmation,
# take a quick backup of the current state, unpack workspace.tar.gz into
# /root/.openclaw/ (when present) and copy every flattened profile file back
# to its original location under /root.
restore_workspace() {
  local restore_dir="$1"

  if [ -z "$restore_dir" ] || [ ! -d "$restore_dir" ]; then
    log_error "Usage: $0 restore <backup-directory>"
    log_info "Available backups:"
    local candidate
    for candidate in /root/.openclaw/backups/[0-9]*; do
      # An unmatched glob stays literal; skip it.
      if [ -e "$candidate" ]; then
        echo " $candidate"
      fi
    done
    exit 1
  fi

  log_warning "This will restore workspace from: $restore_dir"
  log_warning "Current workspace will be overwritten!"
  read -p "Are you sure? (y/N): " confirm
  case "$confirm" in
    [Yy]) ;;
    *)
      log_info "Restore cancelled."
      return
      ;;
  esac

  # Safety net: snapshot the current state before overwriting it.
  log_info "Creating pre-restore backup..."
  backup quick

  local archive="$restore_dir/workspace.tar.gz"
  if [ -f "$archive" ]; then
    log_info "Restoring workspace files..."
    tar -xzf "$archive" -C /root/.openclaw/
    log_success "Workspace restored"
  fi

  # Profile file names encode their path below /root with "__" for "/".
  local f
  for f in "$restore_dir"/.openclaw__openclaw.json "$restore_dir"/.openclaw-*__openclaw.json; do
    if [ -f "$f" ]; then
      local target="/root/$(basename "$f" | sed 's|__|/|g')"
      local target_dir="$(dirname "$target")"
      mkdir -p "$target_dir"
      cp "$f" "$target"
      log_info "Restored: $target"
    fi
  done

  log_success "Restore complete. Run './deploy.sh restart' to apply."
}
# Replace the Qdrant collection mem0_v4_shared with the snapshot file $1.
#
# Exits the script with status 1 when $1 is empty or not a regular file.
# After a "y"/"Y" confirmation the file is copied into the Qdrant snapshots
# directory and recovery is requested through the Qdrant HTTP API.
# Returns 0 on success or cancellation, and 1 when the recovery request fails
# (after logging what curl printed).
restore_qdrant() {
  local snap_file="$1"
  if [ -z "$snap_file" ]; then
    log_error "Usage: $0 restore-qdrant <snapshot-file>"
    log_info "Example: $0 restore-qdrant /root/.openclaw/backups/20260306-120000/qdrant-mem0_v4_shared.snapshot"
    exit 1
  fi
  if [ ! -f "$snap_file" ]; then
    log_error "Snapshot file not found: $snap_file"
    exit 1
  fi

  log_warning "This will REPLACE collection mem0_v4_shared with snapshot data!"
  log_warning "Snapshot: $snap_file"
  local confirm
  read -r -p "Are you sure? (y/N): " confirm
  if [[ ! $confirm =~ ^[Yy]$ ]]; then
    log_info "Restore cancelled."
    return
  fi

  # Copy snapshot into Qdrant snapshots directory
  local qdrant_snap_dir="/opt/mem0-center/snapshots/mem0_v4_shared"
  mkdir -p "$qdrant_snap_dir"
  local snap_name
  snap_name="$(basename "$snap_file")"
  cp "$snap_file" "$qdrant_snap_dir/$snap_name"

  log_info "Recovering Qdrant snapshot..."
  # The request names the file by the path /qdrant/snapshots/...
  # NOTE(review): presumably the host snapshots directory is mounted there
  # inside the Qdrant container -- confirm against docker-compose.yml.
  local result
  # curl runs inside the `if` condition: as a bare assignment a failure would
  # terminate the script under `set -e` before the error could be logged.
  if result=$(curl -sf -X PUT "http://localhost:6333/collections/mem0_v4_shared/snapshots/recover" \
    -H "Content-Type: application/json" \
    -d "{\"location\":\"/qdrant/snapshots/mem0_v4_shared/$snap_name\"}" 2>&1); then
    log_success "Qdrant snapshot recovered: $snap_name"
    local count
    count=$(curl -sf "http://localhost:6333/collections/mem0_v4_shared" 2>/dev/null | \
      python3 -c "import sys,json; print(json.load(sys.stdin).get('result',{}).get('points_count',0))" 2>/dev/null || echo "unknown")
    log_info "Collection point count after restore: $count"
  else
    log_error "Qdrant snapshot recovery failed: $result"
    return 1
  fi
}
# Run all health checks and report each result through the log_* helpers.
#
# Per agent (records from `$PARSE_AGENTS services`, tab-separated):
#   local-cli      eval the check command (f3) and match its combined output
#                  against the extended regex in f5
#   local-systemd  `systemctl --user is-active` on the unit named in f3
#   remote-http    GET the health URL (f3) with curl, using f4 as timeout
# Then: the monitor unit, disk usage of /root (< 80%), memory usage (< 80%),
# existence of XDG_RUNTIME_DIR and user linger.
#
# Returns 0 when every check passed, 1 otherwise.
health_check() {
  log_info "Running health check..."
  setup_user_env
  local issues=0
  local aid atype f3 f4 f5
  local check_cmd check_pattern output unit health_url timeout
  # Records are read on fd 3 so an eval'd check command cannot consume them.
  while IFS=$'\t' read -r -u 3 aid atype f3 f4 f5; do
    case "$atype" in
      local-cli)
        check_cmd="$f3"
        check_pattern="$f5"
        # A check command that exits non-zero must not abort the script under
        # `set -e`; only its output is evaluated.
        output=$(eval "$check_cmd" 2>&1) || true
        if echo "$output" | grep -qE "$check_pattern"; then
          log_success "$aid is running"
        else
          log_error "$aid is not running"
          ((issues++)) || true
        fi
        ;;
      local-systemd)
        unit="$f3"
        if systemctl --user is-active --quiet "$unit" 2>/dev/null; then
          log_success "$aid is running ($unit)"
        else
          log_error "$aid is not running ($unit)"
          ((issues++)) || true
        fi
        ;;
      remote-http)
        health_url="$f3"
        timeout="$f4"
        # Honour the per-agent timeout; fall back to 5 when the field is
        # missing or not a plain number.
        # NOTE(review): assumes the field is in seconds -- confirm against
        # parse_agents.py / agents.yaml.
        [[ "$timeout" =~ ^[0-9]+([.][0-9]+)?$ ]] || timeout=5
        if curl -sf --max-time "$timeout" "$health_url" >/dev/null 2>&1; then
          log_success "$aid is reachable"
        else
          log_warning "$aid is unreachable ($health_url)"
          ((issues++)) || true
        fi
        ;;
    esac
  done 3< <($PARSE_AGENTS services)

  if systemctl is-active --quiet openclaw-agent-monitor; then
    log_success "✓ Agent Monitor is running"
  else
    log_error "✗ Agent Monitor is not running"
    ((issues++)) || true
  fi

  # Use% column of the last `df` line, without the percent sign.
  local disk_usage
  disk_usage=$(df -h /root | tail -1 | awk '{print $5}' | sed 's/%//')
  if [ "$disk_usage" -lt 80 ]; then
    log_success "✓ Disk usage: ${disk_usage}%"
  else
    log_warning "⚠ Disk usage: ${disk_usage}%"
    ((issues++)) || true
  fi

  # used / total from the "Mem" line of `free`, as a rounded percentage.
  local mem_usage
  mem_usage=$(free | grep Mem | awk '{printf("%.0f", $3/$2 * 100.0)}')
  if [ "$mem_usage" -lt 80 ]; then
    log_success "✓ Memory usage: ${mem_usage}%"
  else
    log_warning "⚠ Memory usage: ${mem_usage}%"
    ((issues++)) || true
  fi

  if [ -d "$XDG_RUNTIME_DIR" ]; then
    log_success "✓ XDG_RUNTIME_DIR exists"
  else
    log_warning "⚠ XDG_RUNTIME_DIR not found"
    ((issues++)) || true
  fi

  if loginctl show-user "$(whoami)" -p Linger | grep -q "yes"; then
    log_success "✓ User linger enabled"
  else
    log_warning "⚠ User linger NOT enabled"
    ((issues++)) || true
  fi

  echo ""
  if [ "$issues" -eq 0 ]; then
    log_success "All health checks passed!"
    return 0
  else
    log_error "$issues health check(s) failed!"
    return 1
  fi
}
# Print the usage text followed by one line per agent reported by
# `$PARSE_AGENTS list` (tab-separated: id, type, name).
show_help() {
  cat <<EOF
OpenClaw System Management Script (config-driven via agents.yaml)

Usage: $0 <command>

Commands:
 install - Install and start all systemd services
 start - Start all registered agent services + monitor
 stop - Stop all services
 restart - Restart all services
 status - Show service status
 logs - Show recent logs
 health - Run health check
 backup - Full backup (workspace + Qdrant snapshot + agent profiles)
 backup quick - Quick backup (workspace files only, no Qdrant)
 restore <dir> - Restore workspace + config from backup directory
 restore-qdrant <file> - Restore Qdrant from snapshot file
 rollback - Rollback to previous git commit
 rollback-to - Rollback to specific commit
 debug-stop - Stop ALL services including monitor (safe for debugging)
 debug-start - Restart all services after debugging
 fix-service - Re-inject EnvironmentFile after OpenClaw UI upgrade
 help - Show this help message

Registered agents:
EOF
  # The loop runs as the last stage of a pipeline, so its variables stay in
  # that subshell.
  $PARSE_AGENTS list | while IFS=$'\t' read -r id type name; do
    printf ' %s (%s) - %s\n' "$id" "$type" "$name"
  done
  echo ""
}
# Main: dispatch on the first command-line argument. With no argument the
# help text is shown; an unknown command prints an error plus the help text
# and exits with status 1. The second argument, where used, is forwarded
# unchanged.
case "${1:-help}" in
  install)
    install_services
    ;;
  start)
    start_services
    ;;
  stop)
    stop_services
    ;;
  restart)
    restart_services
    ;;
  status)
    show_status
    ;;
  logs)
    show_logs
    ;;
  health)
    health_check
    ;;
  backup)
    backup "${2:-}"
    ;;
  restore)
    restore_workspace "${2:-}"
    ;;
  restore-qdrant)
    restore_qdrant "${2:-}"
    ;;
  rollback)
    rollback
    ;;
  rollback-to)
    rollback_to "${2:-}"
    ;;
  debug-stop)
    debug_stop
    ;;
  debug-start)
    debug_start
    ;;
  fix-service)
    fix_service_files
    ;;
  help | --help | -h)
    show_help
    ;;
  *)
    log_error "Unknown command: $1"
    show_help
    exit 1
    ;;
esac