feat(monitor): consolidate per-job watchdogs into shared wildcard subscriber (FW-W3)

This commit is contained in:
2026-06-23 00:35:48 +09:00
parent 31f18b2e5a
commit 12dceb14b2
8 changed files with 97 additions and 83 deletions
+18 -6
View File
@@ -723,16 +723,28 @@ delegate_publish_event() {
#######################################
# Ensure the shared wildcard monitor subscriber is running for a workspace.
#
# A single `reconcile.sh --subscribe` daemon is used per workspace, so an
# already-running instance is reused instead of spawning one per job.
#
# Arguments:
#   $1 - job id (not passed to the daemon; kept so existing call sites that
#        pass it keep working)
#   $2 - workspace directory (defaults to $PWD)
# Outputs:
#   stdout: PID of the reused or newly started subscriber
#   stderr: diagnostic when the monitor script is missing or the workspace
#           cannot be prepared
# Returns:
#   0 on success, 1 on failure
#######################################
start_watchdog() {
  # shellcheck disable=SC2034  # retained for interface compatibility
  local job_id="${1:-}"
  local workdir="${2:-$PWD}"
  local monitor_script="$workdir/.agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh"
  local log_file="$workdir/.mam/multi-agent-mux-monitor.log"

  if [ ! -f "$monitor_script" ]; then
    echo "ERROR: monitor script not found: $monitor_script" >&2
    return 1
  fi

  # Check if reconcile.sh --subscribe is already running on this workspace.
  # The script path embeds $workdir, so the match is workspace-specific.
  # Only the first match is kept so callers always receive a single PID;
  # "no match" (non-zero pgrep status) is expected and not an error.
  local pid
  pid=$(pgrep -f -- "bash $monitor_script --subscribe" | head -n 1) || true

  if [ -z "$pid" ]; then
    # The log lives under .mam/; make sure it exists, otherwise the
    # redirection below fails and $! would name a process that never ran.
    if ! mkdir -p "$workdir/.mam"; then
      echo "ERROR: cannot create log directory: $workdir/.mam" >&2
      return 1
    fi

    # Start the wildcard monitor subscriber daemon with --idle-timeout 0
    # (never idle out), with $workdir as cwd to anchor relative log paths.
    local orig_pwd="$PWD"
    if ! cd "$workdir"; then
      echo "ERROR: cannot enter workdir: $workdir" >&2
      return 1
    fi
    nohup bash "$monitor_script" --subscribe --idle-timeout 0 >> "$log_file" 2>&1 &
    pid=$!
    # Restore the caller's cwd; the daemon is already running, so a failure
    # here is reported but does not hide the PID from the caller.
    cd "$orig_pwd" || echo "WARNING: could not return to $orig_pwd" >&2
  fi

  echo "$pid"
}
@@ -55,16 +55,32 @@ if [ "$SUBSCRIBE" = "1" ]; then
# The MQTT subscribe loop exits 3 to signal "broker unavailable → poll instead".
set +e
YAML_PATH="$AGENT_SESSIONS_YAML" HOME_DIR="$HOME_DIR" CLAUDE_PROJECT_DIR="$CLAUDE_PROJECT_DIR" LOCAL_BIN="$LOCAL_BIN" \
SUB_TIMEOUT="$SUB_TIMEOUT" SUB_IDLE_TIMEOUT="$SUB_IDLE_TIMEOUT" \
WORKSPACE_ROOT="$WORKSPACE_ROOT" SUB_TIMEOUT="$SUB_TIMEOUT" SUB_IDLE_TIMEOUT="$SUB_IDLE_TIMEOUT" \
SKILLS_DIR="$SKILLS_DIR" LIB_SH="$LIB_SH" \
"$PYBIN" - <<'PYEOF'
import os, sys, json, time, subprocess
lib_sh = os.environ.get('LIB_SH', '')
skills_dir = os.environ.get('SKILLS_DIR', '')
yaml_path = os.environ.get('YAML_PATH', '')
workspace_root = os.environ.get('WORKSPACE_ROOT', '')
timeout = int(os.environ.get('SUB_TIMEOUT', '0') or '0') # 0 = no overall timeout
idle_timeout = int(os.environ.get('SUB_IDLE_TIMEOUT', '3600') or '0') # 0 = no idle timeout
# Prevent duplicate wildcard subscribers for this workspace (concurrency race)
import fcntl
lock_file_path = os.path.join(workspace_root or '.', '.mam', 'monitor.lock')
try:
os.makedirs(os.path.dirname(lock_file_path), exist_ok=True)
lock_file = open(lock_file_path, 'w')
fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB)
except BlockingIOError:
print("MQTT Monitor: another subscriber is already running for this workspace. Exiting.", flush=True)
sys.exit(0)
except Exception as e:
print(f"MQTT Monitor: failed to acquire monitor lock ({e}). Exiting.", flush=True)
sys.exit(1)
# Locate skills/multi-agent-mux-delegate-job/scripts to import mqtt_common — relative first, then
# an upward walk from cwd. No hardcoded absolute path (review item 6).
cand = os.path.join(skills_dir, 'multi-agent-mux-delegate-job', 'scripts') if skills_dir else ''
@@ -85,6 +101,7 @@ else:
d = os.path.dirname(d)
import mqtt_common
import registry
# Executed INSIDE lib.sh::atomic_dump_yaml (system python3 + PyYAML), under the
# YAML flock with schema-validate + .bak (review item 5). Marks matching running
@@ -132,6 +149,7 @@ def handle_terminal(jid, event):
state = {'last_msg': time.time(), 'connected': False, 'failed': False}
last_seqs = {}
def on_message(_client, _userdata, msg):
@@ -140,7 +158,48 @@ def on_message(_client, _userdata, msg):
payload = json.loads(msg.payload.decode("utf-8"))
jid = payload.get("job_id")
event = payload.get("event")
if jid and event in ("completed", "error"):
if not jid or not event:
return
if workspace_root:
registry_dir = os.path.join(workspace_root, '.mam', 'jobs')
else:
yaml_dir = os.path.dirname(yaml_path) if yaml_path else ""
registry_dir = os.path.join(yaml_dir, 'jobs') if yaml_dir else '.mam/jobs'
try:
job = registry.load_job(jid, registry_dir)
except FileNotFoundError:
# Silently ignore events for jobs not in the local registry
return
expected_token = job.get("auth_token")
if not mqtt_common.verify_hmac(payload, expected_token):
print(f"MQTT Monitor: drop event for job {jid}: HMAC verify failed", flush=True)
return
seq = payload.get("seq")
if seq is None or not isinstance(seq, int):
print(f"MQTT Monitor: drop event for job {jid}: missing or invalid seq", flush=True)
return
if seq <= last_seqs.get(jid, 0):
print(f"MQTT Monitor: drop event for job {jid}: seq {seq} not monotonic (last {last_seqs.get(jid, 0)})", flush=True)
return
last_seqs[jid] = seq
# Append the event to events.ndjson audit trail
mqtt_common.append_event(jid, {
"event": "received",
"source_event": event,
"seq": seq,
"topic": msg.topic,
"timestamp": payload.get("timestamp"),
"detail": payload.get("detail", ""),
})
print(f"MQTT Monitor: recorded event {event} for job {jid} (seq={seq})", flush=True)
if event in ("completed", "error"):
print(f"MQTT Monitor: received terminal event {event} for job {jid}", flush=True)
handle_terminal(jid, event)
except Exception as e:
@@ -1,65 +0,0 @@
#!/usr/bin/env bash
# watchdog.sh — companion script of multi-agent-mux-monitor
#
# Metadata for SKILL.md:
# description: "Watchdog helper that keeps subscriber alive and exits when JOB is done"
# usage: "watchdog.sh <job_id> <workdir> [--help]"
#
# Loop: read the job status from the registry, exit once it is terminal,
# otherwise run one time-boxed subscriber and repeat after it exits.

# Print usage for --help/-h and also when fewer than two arguments are given.
# NOTE: both cases exit 0, matching the script's established behavior.
if [ "${1:-}" = "--help" ] || [ "${1:-}" = "-h" ] || [ $# -lt 2 ]; then
  echo "Usage: $0 <job_id> <workdir>"
  exit 0
fi

JOB_ID="$1"
WORKDIR="$2"
LOG_DIR="$WORKDIR/.mam/jobs"
mkdir -p "$LOG_DIR"

# Emit a UTC-timestamped log line on stdout.
log() {
  echo "[$(date -u +'%Y-%m-%dT%H:%M:%SZ')] $*"
}

log "watchdog started for JOB=$JOB_ID workdir=$WORKDIR"

while true; do
  # 1) Get current job status with robust Python parsing.
  #    Any failure (cd, registry lookup, malformed JSON) yields "unknown".
  #    The inline Python must stay indented: it lives in a single-quoted
  #    string, so the shell passes its whitespace through verbatim.
  STATUS=$(cd "$WORKDIR" && .venv/bin/python .agents/skills/multi-agent-mux-delegate-job/scripts/registry.py get --job "$JOB_ID" 2>/dev/null | python3 -c '
import sys, json
try:
    data = json.load(sys.stdin)
    print(data.get("status", "unknown"))
except Exception:
    print("unknown")
' 2>/dev/null || echo "unknown")
  log "JOB status: $STATUS"

  # 2) Terminal check
  case "$STATUS" in
    completed|error|permission_required)
      log "JOB reached terminal state ($STATUS), watchdog exiting"
      exit 0
      ;;
  esac

  # 3) Start subscriber (2min hard limit)
  LOG_FILE="$LOG_DIR/subscriber-${JOB_ID}-$(date +%s).log"
  log "starting subscriber (2min hard limit, log: $LOG_FILE)"
  (
    cd "$WORKDIR" && timeout 120 .venv/bin/python .agents/skills/multi-agent-mux-delegate-job/scripts/job_subscriber.py \
      --job "$JOB_ID" --timeout 120 --idle-timeout 999999 --registry-dir .mam/jobs > "$LOG_FILE" 2>&1
    # Preserve the subscriber's status; without this the subshell would
    # exit with the status of the echo below and the watchdog would log
    # a meaningless exit code.
    rc=$?
    echo "[$(date -u +'%Y-%m-%dT%H:%M:%SZ')] subscriber exited" >> "$LOG_FILE"
    exit "$rc"
  ) &
  SUB_PID=$!
  log "subscriber PID=$SUB_PID"

  # 4) Wait for subscriber to exit or timeout
  wait "$SUB_PID" 2>/dev/null
  EXIT_CODE=$?
  log "subscriber exited code=$EXIT_CODE"

  # Brief pause so a subscriber that fails immediately does not spin the loop.
  sleep 1
done