#!/usr/bin/env bash
# watchdog.sh — helper script for tmux-agent-orchestrate-monitor
#
# Metadata for SKILL.md:
#   description: "Watchdog helper that keeps subscriber alive and exits when JOB is done"
#   usage: "watchdog.sh JOB_ID WORKDIR [--help]"
#
# Behavior:
#   Repeatedly reads the job status from the registry. When the status is
#   completed, error or permission_required the watchdog exits 0. Otherwise
#   it runs job_subscriber.py under a 120 second timeout, logs the
#   subscriber's exit code, sleeps one second and polls again.
#
# Arguments:
#   $1  JOB_ID   job identifier passed to registry.py / job_subscriber.py
#   $2  WORKDIR  directory containing .venv/, skills/ and .hermes/jobs/
#
# Exit status:
#   0  job reached a terminal state, or --help/-h was requested
#   1  the log directory could not be created or WORKDIR could not be entered
#   2  required arguments are missing

# Fail loudly on typos in variable names instead of expanding them to "".
set -u

# Print the one-line usage text to stdout.
usage() {
  printf 'Usage: %s JOB_ID WORKDIR\n' "$0"
}

if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
  usage
  exit 0
fi

# Missing arguments are a usage error: report on stderr, exit non-zero.
if (( $# < 2 )); then
  usage >&2
  exit 2
fi

JOB_ID="$1"
WORKDIR="$2"

if ! mkdir -p -- "$WORKDIR/.hermes/jobs"; then
  printf 'watchdog: cannot create %s/.hermes/jobs\n' "$WORKDIR" >&2
  exit 1
fi

# Resolve WORKDIR to an absolute path. Log files are opened after a
# `cd "$WORKDIR"`, so a relative WORKDIR would otherwise be applied twice.
# CDPATH is cleared so cd neither searches other roots nor prints the path.
if ! WORKDIR="$(CDPATH= cd -- "$WORKDIR" && pwd)"; then
  printf 'watchdog: cannot enter workdir %s\n' "$2" >&2
  exit 1
fi

LOG_DIR="$WORKDIR/.hermes/jobs"
readonly JOB_ID WORKDIR LOG_DIR

# Write a UTC-timestamped message to stdout.
# Arguments: message words, joined with spaces.
log() {
  printf '[%s] %s\n' "$(date -u +'%Y-%m-%dT%H:%M:%SZ')" "$*"
}

# Print the job's current status on stdout.
# Globals:  WORKDIR, JOB_ID (read)
# Outputs:  the "status" field of the JSON printed by registry.py, or
#           "unknown" when the lookup or the JSON parsing fails or the
#           field is empty.
read_job_status() {
  local status
  status=$(
    cd "$WORKDIR" \
      && .venv/bin/python \
        skills/tmux-agent-orchestrate-delegate-job/scripts/registry.py \
        get --job "$JOB_ID" 2>/dev/null \
      | python3 -c '
import sys, json
try:
    data = json.load(sys.stdin)
    print(data.get("status", "unknown"))
except Exception:
    print("unknown")
' 2>/dev/null
  ) || status="unknown"
  printf '%s\n' "${status:-unknown}"
}

# Run one subscriber session, capturing its stdout/stderr in a log file.
# Globals:   WORKDIR, JOB_ID (read)
# Arguments: $1 - absolute path of the log file to (over)write
# Returns:   the exit status of the cd/timeout command (timeout(1) reports
#            124 when the 120 s limit is hit), not that of the trailing
#            log line.
run_subscriber() {
  local log_file=$1
  local rc=0

  # Subshell so the cd never leaks into the caller's working directory.
  (
    cd "$WORKDIR" \
      && timeout 120 .venv/bin/python \
        skills/tmux-agent-orchestrate-delegate-job/scripts/job_subscriber.py \
        --job "$JOB_ID" --timeout 120 --idle-timeout 999999 \
        --registry-dir .hermes/jobs
  ) >"$log_file" 2>&1 || rc=$?

  echo "[$(date -u +'%Y-%m-%dT%H:%M:%SZ')] subscriber exited" >>"$log_file"
  return "$rc"
}

log "watchdog started for JOB=$JOB_ID workdir=$WORKDIR"

while true; do
  # 1) Get current job status
  STATUS=$(read_job_status)
  log "JOB status: $STATUS"

  # 2) Terminal check
  case "$STATUS" in
    completed | error | permission_required)
      log "JOB reached terminal state ($STATUS), watchdog exiting"
      exit 0
      ;;
  esac

  # 3) Start subscriber (2min hard limit)
  LOG_FILE="$LOG_DIR/subscriber-${JOB_ID}-$(date +%s).log"
  log "starting subscriber (2min hard limit, log: $LOG_FILE)"
  run_subscriber "$LOG_FILE" &
  SUB_PID=$!
  log "subscriber PID=$SUB_PID"

  # 4) Wait for subscriber to exit or timeout
  wait "$SUB_PID" 2>/dev/null
  EXIT_CODE=$?
  log "subscriber exited code=$EXIT_CODE"

  # Brief pause so a subscriber that fails instantly cannot spin the loop.
  sleep 1
done