feat(monitor): consolidate per-job watchdogs into shared wildcard subscriber (FW-W3)

This commit is contained in:
2026-06-23 00:35:48 +09:00
parent 31f18b2e5a
commit 12dceb14b2
8 changed files with 97 additions and 83 deletions
+18 -6
View File
@@ -723,16 +723,28 @@ delegate_publish_event() {
# start_watchdog JOB_ID [WORKDIR]
#
# Ensure the shared wildcard monitor subscriber (`reconcile.sh --subscribe`)
# is running for WORKDIR and print its PID on stdout.
#
# Arguments:
#   $1 - job id. Still accepted so existing callers of the former per-job
#        watchdog keep working; the shared subscriber is not job-specific,
#        so the value is not used here.
#   $2 - workspace directory (default: $PWD). A relative path is resolved
#        against the caller's current directory.
# Outputs:
#   stdout - PID of the already-running or newly started subscriber.
#   stderr - error message when the monitor script is missing.
# Returns:
#   0 on success, 1 if the monitor script is missing or .mam/ cannot be
#   created.
start_watchdog() {
  local workdir="${2:-$PWD}"

  # Anchor a relative workdir now: the subscriber is launched from inside
  # $workdir, where a relative script/log path would no longer resolve.
  case "$workdir" in
    /*) ;;
    *) workdir="$PWD/$workdir" ;;
  esac

  local monitor_script="$workdir/.agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh"
  local log_file="$workdir/.mam/multi-agent-mux-monitor.log"

  if [ ! -f "$monitor_script" ]; then
    echo "ERROR: monitor script not found: $monitor_script" >&2
    return 1
  fi

  # Reuse an existing subscriber for this workspace if there is one. pgrep
  # prints one PID per line and may report several; keep only the first so
  # callers always receive a single PID. `|| true` keeps a "no match" result
  # from aborting callers that run under `set -e` / `pipefail`.
  local pid
  pid=$(pgrep -f "bash $monitor_script --subscribe" | head -n 1) || true

  if [ -z "$pid" ]; then
    # The log lives under .mam/; create it so the append redirection below
    # cannot fail on a fresh workspace.
    mkdir -p "$workdir/.mam" || return 1

    # Start the wildcard subscriber with --idle-timeout 0 (never idle out).
    # It runs in a subshell with $workdir as cwd so relative paths used by the
    # monitor are anchored there while the caller's cwd is left untouched.
    # `exec` makes the subshell PID the daemon PID reported via $!.
    (
      cd "$workdir" || exit 1
      exec nohup bash "$monitor_script" --subscribe --idle-timeout 0
    ) >> "$log_file" 2>&1 &
    pid=$!
  fi

  echo "$pid"
}
@@ -55,16 +55,32 @@ if [ "$SUBSCRIBE" = "1" ]; then
# The MQTT subscribe loop exits 3 to signal "broker unavailable → poll instead".
set +e
YAML_PATH="$AGENT_SESSIONS_YAML" HOME_DIR="$HOME_DIR" CLAUDE_PROJECT_DIR="$CLAUDE_PROJECT_DIR" LOCAL_BIN="$LOCAL_BIN" \
SUB_TIMEOUT="$SUB_TIMEOUT" SUB_IDLE_TIMEOUT="$SUB_IDLE_TIMEOUT" \
WORKSPACE_ROOT="$WORKSPACE_ROOT" SUB_TIMEOUT="$SUB_TIMEOUT" SUB_IDLE_TIMEOUT="$SUB_IDLE_TIMEOUT" \
SKILLS_DIR="$SKILLS_DIR" LIB_SH="$LIB_SH" \
"$PYBIN" - <<'PYEOF'
import os, sys, json, time, subprocess
lib_sh = os.environ.get('LIB_SH', '')
skills_dir = os.environ.get('SKILLS_DIR', '')
yaml_path = os.environ.get('YAML_PATH', '')
workspace_root = os.environ.get('WORKSPACE_ROOT', '')
timeout = int(os.environ.get('SUB_TIMEOUT', '0') or '0') # 0 = no overall timeout
idle_timeout = int(os.environ.get('SUB_IDLE_TIMEOUT', '3600') or '0') # 0 = no idle timeout
# Prevent duplicate wildcard subscribers for this workspace (concurrency race)
import fcntl
lock_file_path = os.path.join(workspace_root or '.', '.mam', 'monitor.lock')
try:
os.makedirs(os.path.dirname(lock_file_path), exist_ok=True)
lock_file = open(lock_file_path, 'w')
fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB)
except BlockingIOError:
print("MQTT Monitor: another subscriber is already running for this workspace. Exiting.", flush=True)
sys.exit(0)
except Exception as e:
print(f"MQTT Monitor: failed to acquire monitor lock ({e}). Exiting.", flush=True)
sys.exit(1)
# Locate skills/multi-agent-mux-delegate-job/scripts to import mqtt_common — relative first, then
# an upward walk from cwd. No hardcoded absolute path (review item 6).
cand = os.path.join(skills_dir, 'multi-agent-mux-delegate-job', 'scripts') if skills_dir else ''
@@ -85,6 +101,7 @@ else:
d = os.path.dirname(d)
import mqtt_common
import registry
# Executed INSIDE lib.sh::atomic_dump_yaml (system python3 + PyYAML), under the
# YAML flock with schema-validate + .bak (review item 5). Marks matching running
@@ -132,6 +149,7 @@ def handle_terminal(jid, event):
state = {'last_msg': time.time(), 'connected': False, 'failed': False}
last_seqs = {}
def on_message(_client, _userdata, msg):
@@ -140,7 +158,48 @@ def on_message(_client, _userdata, msg):
payload = json.loads(msg.payload.decode("utf-8"))
jid = payload.get("job_id")
event = payload.get("event")
if jid and event in ("completed", "error"):
if not jid or not event:
return
if workspace_root:
registry_dir = os.path.join(workspace_root, '.mam', 'jobs')
else:
yaml_dir = os.path.dirname(yaml_path) if yaml_path else ""
registry_dir = os.path.join(yaml_dir, 'jobs') if yaml_dir else '.mam/jobs'
try:
job = registry.load_job(jid, registry_dir)
except FileNotFoundError:
# Silently ignore events for jobs not in the local registry
return
expected_token = job.get("auth_token")
if not mqtt_common.verify_hmac(payload, expected_token):
print(f"MQTT Monitor: drop event for job {jid}: HMAC verify failed", flush=True)
return
seq = payload.get("seq")
if seq is None or not isinstance(seq, int):
print(f"MQTT Monitor: drop event for job {jid}: missing or invalid seq", flush=True)
return
if seq <= last_seqs.get(jid, 0):
print(f"MQTT Monitor: drop event for job {jid}: seq {seq} not monotonic (last {last_seqs.get(jid, 0)})", flush=True)
return
last_seqs[jid] = seq
# Append the event to events.ndjson audit trail
mqtt_common.append_event(jid, {
"event": "received",
"source_event": event,
"seq": seq,
"topic": msg.topic,
"timestamp": payload.get("timestamp"),
"detail": payload.get("detail", ""),
})
print(f"MQTT Monitor: recorded event {event} for job {jid} (seq={seq})", flush=True)
if event in ("completed", "error"):
print(f"MQTT Monitor: received terminal event {event} for job {jid}", flush=True)
handle_terminal(jid, event)
except Exception as e:
@@ -1,65 +0,0 @@
#!/usr/bin/env bash
# watchdog.sh — helper script for multi-agent-mux-monitor
#
# Metadata for SKILL.md:
# description: "Watchdog helper that keeps subscriber alive and exits when JOB is done"
# usage: "watchdog.sh <job_id> <workdir> [--help]"
#
# Loop: read the job status from the registry; exit once the job is in a
# terminal state; otherwise run a job subscriber for at most 120 seconds and
# repeat. All log() output goes to stdout.

# Print usage for --help/-h or when fewer than two arguments are given.
# Exit status stays 0 in both cases, as existing callers may rely on it.
if [ "${1:-}" = "--help" ] || [ "${1:-}" = "-h" ] || [ $# -lt 2 ]; then
  echo "Usage: $0 <job_id> <workdir>"
  exit 0
fi

JOB_ID="$1"
WORKDIR="$2"
LOG_DIR="$WORKDIR/.mam/jobs"
mkdir -p "$LOG_DIR"

# log MESSAGE... — print MESSAGE prefixed with a UTC ISO-8601 timestamp.
log() {
  echo "[$(date -u +'%Y-%m-%dT%H:%M:%SZ')] $*"
}

log "watchdog started for JOB=$JOB_ID workdir=$WORKDIR"

while true; do
  # 1) Get current job status with robust Python parsing. Any failure
  #    (registry error, invalid JSON, missing key) maps to "unknown".
  #    The embedded Python must keep its indentation: without it the snippet
  #    is a syntax error and every status would read as "unknown".
  STATUS=$(cd "$WORKDIR" && .venv/bin/python .agents/skills/multi-agent-mux-delegate-job/scripts/registry.py get --job "$JOB_ID" 2>/dev/null | python3 -c '
import sys, json
try:
    data = json.load(sys.stdin)
    print(data.get("status", "unknown"))
except Exception:
    print("unknown")
' 2>/dev/null || echo "unknown")
  log "JOB status: $STATUS"

  # 2) Terminal check
  case "$STATUS" in
    completed|error|permission_required)
      log "JOB reached terminal state ($STATUS), watchdog exiting"
      exit 0
      ;;
  esac

  # 3) Start subscriber (2min hard limit)
  LOG_FILE="$LOG_DIR/subscriber-${JOB_ID}-$(date +%s).log"
  log "starting subscriber (2min hard limit, log: $LOG_FILE)"
  (
    cd "$WORKDIR" && timeout 120 .venv/bin/python .agents/skills/multi-agent-mux-delegate-job/scripts/job_subscriber.py \
      --job "$JOB_ID" --timeout 120 --idle-timeout 999999 --registry-dir .mam/jobs > "$LOG_FILE" 2>&1
    # Remember the subscriber's status before the trailing echo overwrites $?,
    # so the parent logs the real exit code rather than echo's.
    sub_rc=$?
    echo "[$(date -u +'%Y-%m-%dT%H:%M:%SZ')] subscriber exited" >> "$LOG_FILE"
    exit "$sub_rc"
  ) &
  SUB_PID=$!
  log "subscriber PID=$SUB_PID"

  # 4) Wait for subscriber to exit or timeout
  wait "$SUB_PID" 2>/dev/null
  EXIT_CODE=$?
  log "subscriber exited code=$EXIT_CODE"

  # Brief pause so a subscriber that fails immediately does not spin the loop.
  sleep 1
done
+4 -2
View File
@@ -7,7 +7,7 @@
## 요약
- **처리 항목**: FW-01 ~ FW-16, FW-L1 ~ FW-L3, FW-N1 ~ FW-N4, Infra Pattern (총 24개)
- **처리 항목**: FW-01 ~ FW-16, FW-L1 ~ FW-L3, FW-N1 ~ FW-N7, FW-W3, Infra Pattern (총 28개)
- **Working tree**: clean
- **검증 결과**: 모든 장기 과제, 신규 발견 항목 및 분석 인프라 개선 완료 (agy-existing, claude-existing 교차 검증 PASS)
@@ -44,6 +44,7 @@
| FW-N5 | `job-protocol.md` 보안 프로토콜 규격 갱신 (HMAC 서명 기준) | `6a88f10, 450722b` | Hermes 직접 | 문서/설계 정합성 패스 완료 (PASS) |
| FW-N6 | `registry.py` 내 `auth_token` 자동 생성 및 CLI 연동 지원 | `6a88f10` | Hermes 직접 | `--auth-token` 인자 추가 및 보안 브로커 감지 시 자동 생성 처리 완료 (PASS) |
| FW-N7 | `job_subscriber.py` 내 시퀀스 단조 증가 검증을 통한 Replay Attack 방어 | `6a88f10` | Hermes 직접 | Watcher 내 last_seq 추적 및 seq 단조 증가 검사 로직 구현 완료 (PASS) |
| FW-W3 | 개별 잡 와치독을 단일 와일드카드 구독자로 통합 | `358c72b` | Antigravity | watchdog.sh를 제거하고 reconcile.sh --subscribe 단일 구독자로 이벤트 처리 및 와치독 역할 통합 완료 (PASS) |
---
@@ -98,4 +99,5 @@ a6f7c04 feat(delegate-job): bump default --timeout 600s -> 3600s (1h wall-clock
## 날짜
2026-06-21 (Sun) 03:52 ~ 07:00 KST
- 2026-06-21 (Sun) 03:52 ~ 07:00 KST (FW-01 ~ FW-16, FW-L1 ~ FW-L3, FW-N1 ~ FW-N7, Infra)
- 2026-06-22 (Mon) 23:44 ~ KST (FW-W3)
+4 -2
View File
@@ -7,7 +7,7 @@
## Summary
- **Completed Items**: FW-01 ~ FW-16, FW-L1 ~ FW-L3, FW-N1 ~ FW-N4, Infra Pattern (total of 24 items)
- **Completed Items**: FW-01 ~ FW-16, FW-L1 ~ FW-L3, FW-N1 ~ FW-N7, FW-W3, Infra Pattern (total of 28 items)
- **Working Tree**: clean
- **Verification Results**: All long-term tasks, newly discovered items, and analysis infrastructure improvements have been completed (mutual verification PASS from `agy-existing` and `claude-existing`).
@@ -44,6 +44,7 @@
| FW-N5 | Update `job-protocol.md` security protocol spec (to HMAC signatures) | `6a88f10, 450722b` | Hermes Direct | Documentation/Design consistency pass completed (PASS) |
| FW-N6 | Support auto-generated `auth_token` and CLI integration in `registry.py` | `6a88f10` | Hermes Direct | Added `--auth-token` argument, auto-generation on secure broker detection (PASS) |
| FW-N7 | Prevent Replay Attacks via sequence monotonic increase validation in `job_subscriber.py` | `6a88f10` | Hermes Direct | Added seq tracking in watcher to verify monotonic increase (PASS) |
| FW-W3 | Consolidate per-job watchdogs into shared wildcard subscriber | `358c72b` | Antigravity | Consolidated watchdog logic into `reconcile.sh --subscribe` and removed `watchdog.sh` (PASS) |
---
@@ -100,4 +101,5 @@ a6f7c04 feat(delegate-job): bump default --timeout 600s -> 3600s (1h wall-clock
## Date
2026-06-21 (Sun) 03:52 ~ 07:00 KST
- 2026-06-21 (Sun) 03:52 ~ 07:00 KST (FW-01 ~ FW-16, FW-L1 ~ FW-L3, FW-N1 ~ FW-N7, Infra)
- 2026-06-22 (Mon) 23:44 ~ KST (FW-W3)
+4 -2
View File
@@ -22,7 +22,6 @@
| **FW-P7** | 모니터 종료 경로에 대한 HMAC 서명 검증 및 활성 상태 체크 강화 | P1 (High) | 중 | **이식성 / 보안**: `reconcile.sh`가 `verify_hmac` 서명 검증 없이 `completed`/`error` 이벤트만으로 세션을 즉시 강제 종료하는 리스크 해결. 모니터링 이벤트 핸들러(`on_message`)에서 보안 토큰 검증을 필수 처리하고, `kill-session` 전 실제 tmux 활성 여부와 예상 아티팩트 보존 상태를 대조하게 설계 | 없음 |
| **FW-W1** | 글로벌 레지스트리 락을 세밀한 락(Fine-grained locks)으로 대체 | P2 (Medium) | 중 | **동시성 / 확장성**: 모든 세션 및 progress/sequence 업데이트가 단일 `.mam/jobs/` 글로벌 fcntl lock을 거치며 생기는 병목 차단. 잡 단위의 개별 락 파일 도입 | 없음 |
| **FW-W2** | 블라인드 TUI 키 입력 방지를 위한 실행 준비도 검증 | P2 (Medium) | 대 | **워크플로우**: 세션 생성, 재개, 중지 시 단순 sleep(예: 6초) 대신 터미널 스크린 스크랩이나 준비도 프로브(Readiness Probe)를 활용하여 다이얼로그나 예외 창을 안전하게 차단 | 없음 |
| **FW-W3** | 개별 잡 와치독을 단일 와일드카드 구독자로 통합 | P2 (Medium) | 중 | **워크플로우 / 효율성**: 잡 단위로 watchdog.sh 및 120초 주기로 재연결되는 구독자 프로세스가 매번 뜨는 문제를 해결하고, `reconcile.sh`의 와일드카드 MQTT 구독자 하나로 이벤트 처리를 일원화 | 없음 |
| **FW-W4** | 구독자 시퀀스 번호(last_seq)의 디스크 영속화 | P1 (High) | 중 | **워크플로우 / 보안**: 와치독 재기동 시 시퀀스 카운터가 리셋되는 구조적 취약을 방지하기 위해 `subscriber.last_seq`를 디스크/DB에 기록하여 잡 라이프타임 전체를 커버하는 Replay 방어선 유지 | 없음 |
| **FW-W5** | 리뷰어 판정을 위한 구조적 메시지 스키마 정의 | P2 (Medium) | 중 | **워크플로우**: PM 에이전트가 터미널 스크롤백 문자열을 무가공 grep 파싱하는 대신, 전용 리뷰 피드백 토픽(예: `reviews/<job_id>/verdicts`) 및 정형화된 JSON 포맷(`PASS`/`NOT_PASS` + 차단 요인) 도입 | 없음 |
| **FW-W6** | 모니터링 복구 루프의 Hermes 에이전트 지원 확장 | P2 (Medium) | 중 | **워크플로우 / 일관성**: `reconcile.sh` 내 자동 등록(drift-B) 및 ID 동기화(drift-C) 로직에 `hermes` 세션을 완전 편입시켜 Claude/Agy 세션과 동일한 모니터링 및 복구 수준 지원 | 없음 |
@@ -41,4 +40,7 @@
* 하위 경로 탐색 시 특정 파일의 상대 경로 깊이(`../..` 등)에 의존하는 구조는 디렉터리 리팩토링이나 래퍼 이동 시 치명적 취약점으로 작용합니다. 디렉터리 트리를 따라 `.git`이나 `.mam` 등 알려진 루트 표시 마커를 동적으로 검색하는 방식을 채택하여 스크립트 실행 안정성과 이식 속도를 획기적으로 개선합니다.
4. **모니터 종료 권한 제어 강화 (FW-P7)**:
* 세션 강제 종료(`tmux kill-session`) 권한은 안전하게 제어되어야 합니다. 모니터(`reconcile.sh`)가 와일드카드 토픽을 무검증 수신하여 즉시 세션을 정리하면 위조 주입 공격에 취약해집니다. 종료 이벤트 수신부에 HMAC 서명 검증을 의무화하고, 세션 강제 중지 전 예상되는 작업 결과물(Artifact) 존속 상태를 교차 검토하도록 설계합니다.
* 세션 강제 종료(`tmux kill-session`) 권한은 안전하게 제어되어야 합니다. 모니터(`reconcile.sh`)가 와일드카드 토픽을 무검증 수신하여 즉시 세션을 정리하면 위조 주입 공격에 취약해집니다. 종료 이벤트 수신부에 HMAC 서명 검증을 의무화하고, 세션 강제 중지 전 예상되는 작업 결과물(Artifact) 존속 상태를 교차 검토하도록 설계합니다.
5. **개별 잡 와치독의 단일 와일드카드 구독자 통합 (FW-W3)**:
* 매 잡마다 개별적으로 실행되어 2분 주기로 끊고 재연결하던 `watchdog.sh` 프로세스 방식 대신, 상시 기동되는 `reconcile.sh --subscribe` 단일 와일드카드 구독자 구조로 이벤트 처리, HMAC 보안 검증 및 시퀀스 추적 로직을 완전히 통일했습니다. 이를 통해 불필요한 MQTT 커넥션 급증을 원천 차단하고 세션 정리 과정을 간소화했으며, 메모리 캐시 기반 시퀀스 추적을 통해 Replay 공격 차단 정합성을 동시 실행 중인 모든 잡에 대해 안정적으로 제공합니다.
+4 -2
View File
@@ -22,7 +22,6 @@ Below is the list of pending future work items. These items were proposed based
| **FW-P7** | Enforce HMAC verification and liveness checks on monitor termination | P1 (High) | Medium | **Portability / Security**: Prevent remote session killing by unauthorized or spoofed events. Integrate `verify_hmac` inside the monitor (`reconcile.sh`'s `on_message` handler) and confirm expected artifacts exist before executing `tmux kill-session`. | None |
| **FW-W1** | Replace global registry lock with fine-grained locks | P2 (Medium) | Medium | **Concurrency / Scaling**: Eliminate throughput bottlenecks where all progress/sequence updates channel through a single fcntl lock on `.mam/jobs/`. Implement per-job lock files. | None |
| **FW-W2** | Implement readiness probes for blind TUI key inputs | P2 (Medium) | Large | **Workflow**: Replace fixed timing sleeps in create, resume, and stop scripts with dynamic terminal readiness probes (e.g. scrapers or CLI checking hooks) to dismiss trust dialogs robustly. | None |
| **FW-W3** | Consolidate per-job watchdogs into shared wildcard subscriber | P2 (Medium) | Medium | **Workflow / Efficiency**: Drop per-job watchdog + subscriber churn (which reconnects every 120s) and migrate their handling to the wildcard MQTT subscriber already running in `reconcile.sh`. | None |
| **FW-W4** | Persist subscriber sequence numbers alongside job records | P1 (High) | Medium | **Workflow / Security**: Persist `subscriber.last_seq` to disk or SQLite to prevent sequence counter reset on subscriber restart, locking down the replay defense window for the full job lifetime. | None |
| **FW-W5** | Define structured message schema for reviewer verdicts | P2 (Medium) | Medium | **Workflow**: Create a dedicated reviewer topic (e.g., `reviews/<job_id>/verdicts`) emitting structured JSON verdicts (`PASS` / `NOT_PASS` + details) to eliminate raw text grepping by the PM. | None |
| **FW-W6** | Expand monitor reconciliation support to Hermes agent | P2 (Medium) | Medium | **Workflow / Consistency**: Fully integrate `hermes` sessions into auto-registration (drift-B) and ID materialization (drift-C) under `reconcile.sh` to match Claude/Agy monitoring coverage. | None |
@@ -41,4 +40,7 @@ Below is the list of pending future work items. These items were proposed based
* Hardcoding relative depth limits (like `../..` relative to a skill's location) creates direct fragility when moving directories or refactoring. By walking up the directory tree to search for known anchors (like `.git` or `.mam`), we establish a single canonical root path and prevent scripts from breaking when their execution wrappers are relocated.
4. **Monitor Termination Authorization (FW-P7)**:
* Auto-termination must not trust unauthenticated events. Since `reconcile.sh` listens to a wildcard topic, any client on a public broker could spoof a terminal message and trigger `tmux kill-session`. Requiring HMAC signature verification on the terminal event path, combined with artifact validation, mitigates spoofing and accidental session cleanup.
* Auto-termination must not trust unauthenticated events. Since `reconcile.sh` listens to a wildcard topic, any client on a public broker could spoof a terminal message and trigger `tmux kill-session`. Requiring HMAC signature verification on the terminal event path, combined with artifact validation, mitigates spoofing and accidental session cleanup.
5. **Consolidation of per-job watchdogs (FW-W3)**:
* Instead of spawning an independent `watchdog.sh` process for each job, each of which reconnected every 2 minutes, we consolidated event handling, HMAC verification, and sequence tracking into a single persistent wildcard subscriber running under `reconcile.sh --subscribe`. This drastically reduces MQTT broker connections, simplifies cleanup logic, and keeps each job's last-seen sequence number in the subscriber's in-process memory so that replayed (non-monotonic) events are rejected across all concurrently running jobs.
+2 -2
View File
@@ -295,8 +295,8 @@ graph LR
```
* Pre-seeds agent instruction headers via stdin to enforce that the agent runs `publish_event.py` for its transitions.
* Blocks on `wait $sub_pid`, and finally prints the audit log directory.
2. **`multi-agent-mux-monitor` (`reconcile.sh` & `watchdog.sh`)**:
* **Watchdog Integration**: Starts a subscriber monitoring loop (`watchdog.sh`) to detect orphaned agent panes or locked workspaces.
2. **`multi-agent-mux-monitor` (`reconcile.sh`)**:
* **Wildcard Monitor Integration**: Runs a unified background subscriber loop (`reconcile.sh --subscribe`) to capture progress, verify security tokens (HMAC) and sequences, write audit logs, and automatically clean up tmux sessions upon terminal events.
* **Reconciliation loop**: Subscribes to the global job topic. On terminal events, it invokes `lib.sh::atomic_dump_yaml` to sync status drifts (e.g. setting tmux sessions to `terminated` in `agent-sessions.yaml` once the agent exits).
3. **`multi-agent-mux-create / stop / resume`**:
* Integrates the job life status into session metadata updates, ensuring standard tmux cleanup triggers state updates in the registry and audit logs.