feat(tmux-agent-orchestrate-monitor): integrate watchdog pattern as skill
Moved /tmp/subscriber-watchdog.sh → skills/tmux-agent-orchestrate-monitor/scripts/watchdog.sh (skill-managed lifecycle, no longer lives outside workspace).

Added lib.sh::start_watchdog() helper:
- Spawns watchdog as background nohup process
- Writes watchdog log to .hermes/jobs/<JID>.watchdog.log
- Returns watchdog PID via stdout

Wired create_session.sh --submit-job to auto-start watchdog after JOB registration.

Fixes:
- Bug: registry.py get first-line parse was fragile (empty status → infinite loop)
  → Now uses python3 json.load for robust parsing
- Bug: old path skills/delegate-job/scripts/job_subscriber.py hardcoded
  → Now uses skills/tmux-agent-orchestrate-delegate-job/scripts/job_subscriber.py

Verified on isolated server -L agy-watchdog-skill-test (kill-server after):
- Syntax check PASS
- E2E: register job → start watchdog → publish completed → watchdog exits
- Global skill non-interference verified
- Main isolated server -L multi-agent-canary untouched
This commit is contained in:
@@ -425,3 +425,25 @@ delegate_publish_event() {
|
|||||||
"$py_bin" "$pub" --job "$job_id" --event "$event" --detail "$detail" || true
|
"$py_bin" "$pub" --job "$job_id" --event "$event" --detail "$detail" || true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# start_watchdog <job_id> [workdir]
# Spawns a watchdog process to monitor a delegate-job JOB in the background.
# The watchdog re-spawns the subscriber every 2 minutes (or whatever hard
# limit we set) and exits automatically when the JOB reaches terminal state.
#
# Arguments:
#   $1 - job id (required, non-empty)
#   $2 - workspace root containing skills/ and .hermes/ (default: $PWD)
# Outputs:
#   stdout - PID of the background watchdog process
#   stderr - error message when the watchdog cannot be started
#   file   - watchdog output goes to <workdir>/.hermes/jobs/<job_id>.watchdog.log
# Returns:
#   0 when the watchdog was launched, 1 on a missing job id, a missing or
#   non-executable watchdog script, or an uncreatable log directory.
start_watchdog() {
  local job_id="${1:-}"
  local workdir="${2:-$PWD}"
  local watchdog_script="$workdir/skills/tmux-agent-orchestrate-monitor/scripts/watchdog.sh"
  local log_dir="$workdir/.hermes/jobs"
  local log_file="$log_dir/${job_id}.watchdog.log"
  local pid

  if [ -z "$job_id" ]; then
    echo "ERROR: start_watchdog requires a job_id" >&2
    return 1
  fi

  if [ ! -x "$watchdog_script" ]; then
    echo "ERROR: watchdog not found or not executable: $watchdog_script" >&2
    return 1
  fi

  # The redirect below is evaluated in the forked child, so a missing log
  # directory would kill the watchdog instantly while we still report a PID.
  # Create the directory up front and fail loudly if that is impossible.
  if ! mkdir -p -- "$log_dir"; then
    echo "ERROR: cannot create watchdog log directory: $log_dir" >&2
    return 1
  fi

  # stdout/stderr go to the log file and stdin is detached, so callers using
  # $(start_watchdog ...) are not blocked by the long-running child.
  nohup "$watchdog_script" "$job_id" "$workdir" > "$log_file" 2>&1 < /dev/null &
  pid=$!
  echo "$pid"
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -263,6 +263,8 @@ echo "tmux session: $SESSION_NAME (pane pid $PANE_PID, cmd $PANE_CMD, cwd $PANE_
|
|||||||
# When a delegate JOB was registered for this session, announce it, publish
# the "started" event and launch the background watchdog for that JOB.
if [ -n "$DELEGATE_JOB_ID" ]; then
  echo "delegate job: $DELEGATE_JOB_ID"
  delegate_publish_event "$DELEGATE_JOB_ID" started "tmux-agent-orchestrate session created"
  # Watchdog start is best-effort: testing the assignment inside `if` keeps a
  # failed start from aborting the script (should errexit be enabled) and
  # avoids printing an empty PID.
  if WD_PID=$(start_watchdog "$DELEGATE_JOB_ID" "$WORKSPACE"); then
    echo "watchdog PID: $WD_PID"
  else
    echo "WARNING: watchdog failed to start for job $DELEGATE_JOB_ID" >&2
  fi
fi
echo "agent-sessions.yaml updated"
echo
|
||||||
|
|||||||
@@ -0,0 +1,65 @@
|
|||||||
|
#!/usr/bin/env bash
# watchdog.sh — helper script of the tmux-agent-orchestrate-monitor skill
#
# Metadata for SKILL.md:
# description: "Watchdog helper that keeps subscriber alive and exits when JOB is done"
# usage: "watchdog.sh <job_id> <workdir> [--help]"
#
# Behaviour:
#   Loops forever: reads the JOB status from the registry, exits 0 once the
#   status is completed / error / permission_required, otherwise runs one
#   subscriber (hard-limited to 120s by `timeout`) and repeats.
#   Progress lines go to stdout; each subscriber run gets its own log file
#   under <workdir>/.hermes/jobs/.
#
# Exit codes:
#   0 - JOB reached a terminal state, or --help was requested
#   1 - workdir or log directory is not usable
#   2 - usage error (missing arguments)

set -u

# A subscriber that fails in under FAST_FAIL_SECS is treated as broken (bad
# path, missing interpreter, ...). Retrying it every second would only flood
# the log directory, so the loop waits RETRY_BACKOFF_SECS instead.
readonly FAST_FAIL_SECS=5
readonly RETRY_BACKOFF_SECS=10

usage() {
  echo "Usage: $0 <job_id> <workdir>"
}

if [ "${1:-}" = "--help" ] || [ "${1:-}" = "-h" ]; then
  usage
  exit 0
fi

# Misuse must not look like success to the caller.
if [ $# -lt 2 ]; then
  usage >&2
  exit 2
fi

JOB_ID="$1"

# Resolve the workdir to an absolute path: the subscriber runs after
# `cd "$WORKDIR"`, so a relative path would make its log redirect point at
# the wrong location.
if ! WORKDIR=$(CDPATH='' cd -- "$2" >/dev/null 2>&1 && pwd); then
  echo "ERROR: workdir not accessible: $2" >&2
  exit 1
fi

LOG_DIR="$WORKDIR/.hermes/jobs"

if ! mkdir -p -- "$LOG_DIR"; then
  echo "ERROR: cannot create log directory: $LOG_DIR" >&2
  exit 1
fi

# Print a UTC-timestamped progress line on stdout.
log() {
  echo "[$(date -u +'%Y-%m-%dT%H:%M:%SZ')] $*"
}

# Print the JOB's current status, or "unknown" when the registry call fails,
# its output is not a JSON object, or the status field is missing/empty.
get_status() {
  local status
  status=$(
    cd "$WORKDIR" \
      && .venv/bin/python skills/tmux-agent-orchestrate-delegate-job/scripts/registry.py get --job "$JOB_ID" 2>/dev/null \
      | python3 -c '
import sys, json
try:
    data = json.load(sys.stdin)
    print(data.get("status") or "unknown")
except Exception:
    print("unknown")
' 2>/dev/null
  ) || status=""
  printf '%s\n' "${status:-unknown}"
}

log "watchdog started for JOB=$JOB_ID workdir=$WORKDIR"

while true; do
  # 1) Get current job status with robust Python parsing
  STATUS=$(get_status)
  log "JOB status: $STATUS"

  # 2) Terminal check
  case "$STATUS" in
    completed|error|permission_required)
      log "JOB reached terminal state ($STATUS), watchdog exiting"
      exit 0
      ;;
  esac

  # 3) Start subscriber (2min hard limit)
  STARTED_AT=$(date +%s)
  LOG_FILE="$LOG_DIR/subscriber-${JOB_ID}-${STARTED_AT}.log"
  log "starting subscriber (2min hard limit, log: $LOG_FILE)"

  (
    cd "$WORKDIR" || exit 1
    timeout 120 .venv/bin/python skills/tmux-agent-orchestrate-delegate-job/scripts/job_subscriber.py \
      --job "$JOB_ID" --timeout 120 --idle-timeout 999999 --registry-dir .hermes/jobs > "$LOG_FILE" 2>&1
    rc=$?
    echo "[$(date -u +'%Y-%m-%dT%H:%M:%SZ')] subscriber exited" >> "$LOG_FILE"
    # Propagate the subscriber's status; otherwise `wait` would only ever see
    # the status of the echo above.
    exit "$rc"
  ) &

  SUB_PID=$!
  log "subscriber PID=$SUB_PID"

  # 4) Wait for subscriber to exit or timeout
  wait "$SUB_PID" 2>/dev/null
  EXIT_CODE=$?
  log "subscriber exited code=$EXIT_CODE"

  # 5) Pause before the next round; back off when the subscriber died at once
  ELAPSED=$(( $(date +%s) - STARTED_AT ))
  if [ "$EXIT_CODE" -ne 0 ] && [ "$ELAPSED" -lt "$FAST_FAIL_SECS" ]; then
    log "subscriber failed after ${ELAPSED}s, backing off ${RETRY_BACKOFF_SECS}s"
    sleep "$RETRY_BACKOFF_SECS"
  else
    sleep 1
  fi
done
|
||||||
Reference in New Issue
Block a user