From e8eebe5eb195a67218ec6ace039d7b2beb4fec93 Mon Sep 17 00:00:00 2001
From: Godopu
Date: Fri, 19 Jun 2026 23:33:46 +0000
Subject: [PATCH] feat(tmux-agent-orchestrate-monitor): integrate watchdog pattern as skill
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Moved /tmp/subscriber-watchdog.sh → skills/tmux-agent-orchestrate-monitor/scripts/watchdog.sh
(skill-managed lifecycle, no longer lives outside workspace).

Added lib.sh::start_watchdog() helper:
- Spawns watchdog as background nohup process
- Writes watchdog log to .hermes/jobs/<job_id>.watchdog.log
- Returns watchdog PID via stdout

Wired create_session.sh --submit-job to auto-start watchdog after JOB registration.

Fixes:
- Bug: registry.py get first-line parse was fragile (empty status → infinite loop)
  → Now uses python3 json.load for robust parsing
- Bug: old path skills/delegate-job/scripts/job_subscriber.py hardcoded
  → Now uses skills/tmux-agent-orchestrate-delegate-job/scripts/job_subscriber.py

Verified on isolated server -L agy-watchdog-skill-test (kill-server after):
- Syntax check PASS
- E2E: register job → start watchdog → publish completed → watchdog exits
- Global skill non-interference verified
- Main isolated server -L multi-agent-canary untouched
---
 skills/lib.sh                                 | 22 +++++++
 .../scripts/create_session.sh                 |  2 +
 .../scripts/watchdog.sh                       | 65 +++++++++++++++++++
 3 files changed, 89 insertions(+)
 create mode 100755 skills/tmux-agent-orchestrate-monitor/scripts/watchdog.sh

diff --git a/skills/lib.sh b/skills/lib.sh
index db2a291..caf7d82 100644
--- a/skills/lib.sh
+++ b/skills/lib.sh
@@ -425,3 +425,25 @@ delegate_publish_event() {
   "$py_bin" "$pub" --job "$job_id" --event "$event" --detail "$detail" || true
 }
 
+# start_watchdog <job_id> [workdir]
+# Spawns a watchdog process to monitor a delegate-job JOB in the background.
+# The watchdog re-spawns the subscriber every 2 minutes (or whatever hard
+# limit we set) and exits automatically when the JOB reaches terminal state.
+# Returns the watchdog PID via stdout. Returns 1 without printing a PID when the
+# watchdog script is missing or the log directory cannot be created.
+start_watchdog() {
+  local job_id="$1"
+  local workdir="${2:-$PWD}"
+  local watchdog_script="$workdir/skills/tmux-agent-orchestrate-monitor/scripts/watchdog.sh"
+  local log_file="$workdir/.hermes/jobs/${job_id}.watchdog.log"
+
+  if [ ! -x "$watchdog_script" ]; then
+    echo "ERROR: watchdog not found or not executable: $watchdog_script" >&2
+    return 1
+  fi
+
+  # Create the log directory first: the redirect below fails if it is missing.
+  mkdir -p "${log_file%/*}" || return 1
+  nohup "$watchdog_script" "$job_id" "$workdir" > "$log_file" 2>&1 &
+  echo "$!"
+}
diff --git a/skills/tmux-agent-orchestrate-create/scripts/create_session.sh b/skills/tmux-agent-orchestrate-create/scripts/create_session.sh
index 64ec0e0..edc87df 100755
--- a/skills/tmux-agent-orchestrate-create/scripts/create_session.sh
+++ b/skills/tmux-agent-orchestrate-create/scripts/create_session.sh
@@ -263,6 +263,8 @@ echo "tmux session: $SESSION_NAME (pane pid $PANE_PID, cmd $PANE_CMD, cwd $PANE_
 if [ -n "$DELEGATE_JOB_ID" ]; then
   echo "delegate job: $DELEGATE_JOB_ID"
   delegate_publish_event "$DELEGATE_JOB_ID" started "tmux-agent-orchestrate session created"
+  WD_PID=$(start_watchdog "$DELEGATE_JOB_ID" "$WORKSPACE")
+  echo "watchdog PID: $WD_PID"
 fi
 echo "agent-sessions.yaml updated"
 echo
diff --git a/skills/tmux-agent-orchestrate-monitor/scripts/watchdog.sh b/skills/tmux-agent-orchestrate-monitor/scripts/watchdog.sh
new file mode 100755
index 0000000..3c26e1c
--- /dev/null
+++ b/skills/tmux-agent-orchestrate-monitor/scripts/watchdog.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+# watchdog.sh — helper script for tmux-agent-orchestrate-monitor
+#
+# Repeatedly starts job_subscriber.py for one JOB (each run capped at 2 minutes)
+# and exits once registry.py reports a terminal status for that JOB.
+#
+# Metadata for SKILL.md:
+# description: "Watchdog helper that keeps subscriber alive and exits when JOB is done"
+# usage: "watchdog.sh <job_id> <workdir> [--help]"
+
+if [ "${1:-}" = "--help" ] || [ "${1:-}" = "-h" ]; then
+  echo "Usage: $0 <job_id> <workdir>"
+  exit 0
+fi
+if [ $# -lt 2 ]; then
+  # Missing arguments are an error: report on stderr with a non-zero status.
+  echo "Usage: $0 <job_id> <workdir>" >&2
+  exit 2
+fi
+
+JOB_ID="$1"
+WORKDIR="$2"
+LOG_DIR="$WORKDIR/.hermes/jobs"
+RETRY_DELAY=1 # seconds to sleep between subscriber runs (grows on fast failures)
+
+mkdir -p "$LOG_DIR"
+
+# Print a message prefixed with a UTC ISO-8601 timestamp.
+log() {
+  echo "[$(date -u +'%Y-%m-%dT%H:%M:%SZ')] $*"
+}
+
+log "watchdog started for JOB=$JOB_ID workdir=$WORKDIR"
+
+while true; do
+  # 1) Get current job status with robust Python parsing; anything that is not
+  #    a JSON object with a "status" key is reported as "unknown".
+  STATUS=$(cd "$WORKDIR" && .venv/bin/python skills/tmux-agent-orchestrate-delegate-job/scripts/registry.py get --job "$JOB_ID" 2>/dev/null | python3 -c '
+import sys, json
+try:
+    data = json.load(sys.stdin)
+    print(data.get("status", "unknown"))
+except Exception:
+    print("unknown")
+' 2>/dev/null || echo "unknown")
+
+  log "JOB status: $STATUS"
+
+  # 2) Terminal check
+  case "$STATUS" in
+    completed|error|permission_required)
+      log "JOB reached terminal state ($STATUS), watchdog exiting"
+      exit 0
+      ;;
+  esac
+
+  # 3) Start subscriber (2min hard limit)
+  LOG_FILE="$LOG_DIR/subscriber-${JOB_ID}-$(date +%s).log"
+  log "starting subscriber (2min hard limit, log: $LOG_FILE)"
+  STARTED_AT=$SECONDS
+
+  (
+    cd "$WORKDIR" && timeout 120 .venv/bin/python skills/tmux-agent-orchestrate-delegate-job/scripts/job_subscriber.py \
+      --job "$JOB_ID" --timeout 120 --idle-timeout 999999 --registry-dir .hermes/jobs > "$LOG_FILE" 2>&1
+    SUB_RC=$?
+    echo "[$(date -u +'%Y-%m-%dT%H:%M:%SZ')] subscriber exited" >> "$LOG_FILE"
+    # Propagate the subscriber's status; otherwise wait sees the echo's status.
+    exit "$SUB_RC"
+  ) &
+
+  SUB_PID=$!
+  log "subscriber PID=$SUB_PID"
+
+  # 4) Wait for subscriber to exit or timeout
+  wait "$SUB_PID" 2>/dev/null
+  EXIT_CODE=$?
+  log "subscriber exited code=$EXIT_CODE"
+
+  # Back off (doubling, capped at 60s) while the subscriber keeps failing within
+  # a few seconds, so a broken setup cannot respawn once per second and flood
+  # $LOG_DIR with log files. Any other outcome resets the delay to 1s.
+  if [ "$EXIT_CODE" -ne 0 ] && [ $((SECONDS - STARTED_AT)) -lt 5 ]; then
+    RETRY_DELAY=$((RETRY_DELAY * 2))
+    [ "$RETRY_DELAY" -le 60 ] || RETRY_DELAY=60
+  else
+    RETRY_DELAY=1
+  fi
+  sleep "$RETRY_DELAY"
+done