diff --git a/skills/agent-sessions-monitor/SKILL.md b/skills/agent-sessions-monitor/SKILL.md index 97bb5f5..e38534e 100644 --- a/skills/agent-sessions-monitor/SKILL.md +++ b/skills/agent-sessions-monitor/SKILL.md @@ -94,16 +94,20 @@ The worker calls this script every 30s. It: ```bash # Reconcile + auto-update YAML (atomic, flock-guarded). Emits JSON drift to stdout. -bash ~/PuKi/lab/agent_sessions/skills/agent-sessions-monitor/scripts/reconcile.sh --once --emit-diff +bash skills/agent-sessions-monitor/scripts/reconcile.sh --once --emit-diff # Read-only: compute drift WITHOUT writing the YAML (use for "what's running?" checks). -bash ~/PuKi/lab/agent_sessions/skills/agent-sessions-monitor/scripts/reconcile.sh --once --emit-diff --dry-run +bash skills/agent-sessions-monitor/scripts/reconcile.sh --once --emit-diff --dry-run # Push-based MQTT Monitor: listen to delegated job events on the broker and update the YAML instantly. -bash ~/PuKi/lab/agent_sessions/skills/agent-sessions-monitor/scripts/reconcile.sh --subscribe +# Bounded run that exits after 5 min idle, or 1 h wall-clock; falls back to polling if the broker is down. +bash skills/agent-sessions-monitor/scripts/reconcile.sh --subscribe --idle-timeout 300 --timeout 3600 + +# Persistent monitor (no timeouts): runs until interrupted; still polls if the broker is unreachable. +bash skills/agent-sessions-monitor/scripts/reconcile.sh --subscribe --idle-timeout 0 ``` -Flags: `--once` (single pass), `--emit-diff` (print JSON), `--dry-run` (P1-E — no mutation), `--subscribe` (persistent push-based MQTT subscription monitoring; falls back to polling if connection fails). There are **no** `--workspace` / `--agent` / `--comment-card` flags; the worker turns the emitted JSON `drifts[]` into `kanban_comment` calls itself. +Flags: `--once` (single pass), `--emit-diff` (print JSON), `--dry-run` (P1-E — no mutation), `--subscribe` (push-based MQTT subscription monitoring). `--subscribe` sub-flags: `--timeout N` (exit after N seconds of wall-clock; `0` = no limit, default), `--idle-timeout N` (exit after N seconds with no message; default `600`, `0` = never idle-out). On a broker connection failure (connect error **or** non-zero CONNACK), `--subscribe` falls back to a polling loop that re-runs `--once --emit-diff` every `RECONCILE_POLL_INTERVAL` (default 15) seconds until `--timeout`. Terminal-event YAML updates are written through `lib.sh::atomic_dump_yaml` (flock + schema-validate + `.bak`). There are **no** `--workspace` / `--agent` / `--comment-card` flags; the worker turns the emitted JSON `drifts[]` into `kanban_comment` calls itself. ## Drift classes (what the script handles) diff --git a/skills/agent-sessions-monitor/scripts/reconcile.sh b/skills/agent-sessions-monitor/scripts/reconcile.sh index a1682de..5c1225f 100755 --- a/skills/agent-sessions-monitor/scripts/reconcile.sh +++ b/skills/agent-sessions-monitor/scripts/reconcile.sh @@ -22,6 +22,11 @@ ONCE=0 EMIT_DIFF=0 DRY_RUN=0 SUBSCRIBE=0 +# --subscribe controls (review item 4): 0 = no overall timeout; idle default 600s +# (raised from the old hardcoded 120s); idle 0 = never idle-out. +SUB_TIMEOUT=0 +SUB_IDLE_TIMEOUT=600 +POLL_INTERVAL="${RECONCILE_POLL_INTERVAL:-15}" while [ $# -gt 0 ]; do case "$1" in @@ -29,7 +34,9 @@ while [ $# -gt 0 ]; do --emit-diff) EMIT_DIFF=1; shift ;; --dry-run) DRY_RUN=1; shift ;; --subscribe) SUBSCRIBE=1; shift ;; - -h|--help) echo "Usage: $0 [--once] [--emit-diff] [--dry-run] [--subscribe]"; exit 0 ;; + --timeout) SUB_TIMEOUT="$2"; shift 2 ;; + --idle-timeout) SUB_IDLE_TIMEOUT="$2"; shift 2 ;; + -h|--help) echo "Usage: $0 [--once] [--emit-diff] [--dry-run] [--subscribe [--timeout N] [--idle-timeout N]]"; exit 0 ;; *) echo "ERROR: unknown arg: $1" >&2; exit 2 ;; esac done @@ -37,122 +44,176 @@ done [ -f "$AGENT_SESSIONS_YAML" ] || { echo "ERROR: $AGENT_SESSIONS_YAML not found" >&2; exit 1; } if [ "$SUBSCRIBE" = "1" ]; then - SUBSCRIBE_MODE=1 env_python "$AGENT_SESSIONS_YAML" <<'PYEOF' -import os, sys, json, fcntl, tempfile, subprocess -from datetime import datetime, timezone -import yaml + # Paths resolved relative to this script (review item 6): skills/ dir + lib.sh. + SKILLS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" + LIB_SH="$SKILLS_DIR/lib.sh" + # MQTT client lives in the project venv (has paho). All YAML work is delegated + # to lib.sh::atomic_dump_yaml, which runs the system python3 (has PyYAML) — so + # no single interpreter needs both paho and PyYAML (review items 4/5/6). + PYBIN="$(_delegate_py_bin)" -yaml_path = os.environ['YAML_PATH'] -home = os.environ['HOME_DIR'] + # The MQTT subscribe loop exits 3 to signal "broker unavailable → poll instead". + set +e + YAML_PATH="$AGENT_SESSIONS_YAML" HOME_DIR="$HOME" \ + SUB_TIMEOUT="$SUB_TIMEOUT" SUB_IDLE_TIMEOUT="$SUB_IDLE_TIMEOUT" \ + SKILLS_DIR="$SKILLS_DIR" LIB_SH="$LIB_SH" \ + "$PYBIN" - <<'PYEOF' +import os, sys, json, time, subprocess -# Add skills/delegate-job/scripts to path to import mqtt_common -script_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd() -path_candidate = os.path.join('/home/godopu16/PuKi/laa/canary_projects/advanced_multi_agent', 'skills', 'delegate-job', 'scripts') -if os.path.isdir(path_candidate): - sys.path.append(path_candidate) +lib_sh = os.environ.get('LIB_SH', '') +skills_dir = os.environ.get('SKILLS_DIR', '') +timeout = int(os.environ.get('SUB_TIMEOUT', '0') or '0') # 0 = no overall timeout +idle_timeout = int(os.environ.get('SUB_IDLE_TIMEOUT', '600') or '0') # 0 = no idle timeout + +# Locate skills/delegate-job/scripts to import mqtt_common — relative first, then +# an upward walk from cwd. No hardcoded absolute path (review item 6). +cand = os.path.join(skills_dir, 'delegate-job', 'scripts') if skills_dir else '' +if cand and os.path.isdir(cand): + sys.path.append(cand) else: - d = script_dir - found = False - while d != '/' and d: - p = os.path.join(d, 'skills', 'delegate-job', 'scripts') - if os.path.isdir(p): - sys.path.append(p) - found = True - break - p2 = os.path.join(d, 'delegate-job', 'scripts') - if os.path.isdir(p2): - sys.path.append(p2) - found = True + d = os.getcwd() + while d and d != '/': + hit = None + for sub in (('skills', 'delegate-job', 'scripts'), ('delegate-job', 'scripts')): + p = os.path.join(d, *sub) + if os.path.isdir(p): + hit = p + break + if hit: + sys.path.append(hit) break d = os.path.dirname(d) import mqtt_common -cfg = mqtt_common.broker_config_from_env() -client = mqtt_common.make_client("monitor_sub", cfg) +# Executed INSIDE lib.sh::atomic_dump_yaml (system python3 + PyYAML), under the +# YAML flock with schema-validate + .bak (review item 5). Marks matching running +# sessions terminated and kills their tmux (review item 3 behaviour preserved), +# or aborts the write entirely when nothing matches. The untrusted MQTT job id / +# event arrive via env (MQTT_JID / MQTT_EVENT) — never spliced into source (P1-B). +_MUTATION = r''' +import os, subprocess +from datetime import datetime, timezone +_jid = os.environ['MQTT_JID'] +_event = os.environ['MQTT_EVENT'] +_now = datetime.now(timezone.utc) +_changed = False +for s in d.get('tmux_sessions', []): + if s.get('delegate_job_id') == _jid and s.get('status') == 'running': + s['status'] = 'terminated' + s['terminated_at'] = _now.strftime('%Y-%m-%dT%H:%M:%SZ') + s['terminated_at_epoch'] = int(_now.timestamp()) + s['termination_mode'] = 'auto-detected (MQTT ' + _event + ')' + _name = s.get('name') + _srv = s.get('tmux_server') or 'default' + _cmd = ['tmux'] + (['-L', _srv] if _srv != 'default' else []) + ['kill-session', '-t', _name] + subprocess.run(_cmd, capture_output=True) + print('MQTT Monitor: terminated + killed ' + str(_name) + ' on ' + str(_srv), flush=True) + _changed = True +if not _changed: + raise SystemExit(0) # nothing matched — skip the write entirely +''' -def on_message(client, userdata, msg): + +def handle_terminal(jid, event): + if not lib_sh or not os.path.isfile(lib_sh): + print('MQTT Monitor: lib.sh not found, cannot update YAML', flush=True) + return + env = dict(os.environ) + env['MQTT_JID'] = jid + env['MQTT_EVENT'] = event + cmd = ['bash', '-c', + 'source "$LIB_SH"; atomic_dump_yaml "$YAML_PATH" MQTT_JID="$MQTT_JID" MQTT_EVENT="$MQTT_EVENT"'] + r = subprocess.run(cmd, input=_MUTATION, text=True, env=env, capture_output=True) + if (r.stdout or '').strip(): + print(r.stdout.strip(), flush=True) + if r.returncode != 0 and (r.stderr or '').strip(): + print('MQTT Monitor: atomic_dump_yaml stderr: ' + r.stderr.strip(), flush=True) + + +state = {'last_msg': time.time(), 'connected': False, 'failed': False} + + +def on_message(_client, _userdata, msg): + state['last_msg'] = time.time() try: payload = json.loads(msg.payload.decode("utf-8")) jid = payload.get("job_id") event = payload.get("event") - if not jid or not event: - return - - if event in ("completed", "error"): + if jid and event in ("completed", "error"): print(f"MQTT Monitor: received terminal event {event} for job {jid}", flush=True) - update_session_by_job(jid, event) + handle_terminal(jid, event) except Exception as e: print(f"MQTT Monitor error parsing message: {e}", flush=True) - -def update_session_by_job(jid, event): - lock_path = yaml_path + '.lock' - lock_fh = open(lock_path, 'w') - fcntl.flock(lock_fh, fcntl.LOCK_EX) - try: - if os.path.exists(yaml_path): - with open(yaml_path) as f: - d_local = yaml.safe_load(f) or {} - else: - d_local = {} - - sessions = d_local.setdefault('tmux_sessions', []) - updated = False - for s in sessions: - if s.get('delegate_job_id') == jid and s.get('status') == 'running': - s['status'] = 'terminated' - now_iso = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') - s['terminated_at'] = now_iso - s['terminated_at_epoch'] = int(datetime.now(timezone.utc).timestamp()) - s['termination_mode'] = f"auto-detected (MQTT {event})" - name = s.get('name') - srv = s.get('tmux_server') or 'default' - kill_tmux_session(name, srv) - updated = True - - if updated: - dir_ = os.path.dirname(yaml_path) or '.' - fd, tmp = tempfile.mkstemp(dir=dir_, prefix='.agent-sessions.', suffix='.tmp') - try: - with os.fdopen(fd, 'w') as f: - yaml.safe_dump(d_local, f, default_flow_style=False, sort_keys=False, - allow_unicode=True, width=4096) - os.replace(tmp, yaml_path) - print(f"MQTT Monitor: updated YAML for job {jid} to terminated", flush=True) - except Exception as e: - if os.path.exists(tmp): - os.remove(tmp) - print(f"MQTT Monitor error writing YAML: {e}", flush=True) - finally: - fcntl.flock(lock_fh, fcntl.LOCK_UN) - lock_fh.close() -def kill_tmux_session(name, srv): - try: - cmd = ['tmux'] - if srv != 'default': - cmd += ['-L', srv] - cmd += ['kill-session', '-t', name] - subprocess.run(cmd, capture_output=True) - print(f"MQTT Monitor: killed tmux session {name} on server {srv}", flush=True) - except Exception as e: - print(f"MQTT Monitor error killing tmux: {e}", flush=True) - -client.on_message = on_message def on_connect(_c, _u, _flags, reason_code, _props): rc = mqtt_common.reason_code_value(reason_code) if rc == 0: + state['connected'] = True _c.subscribe("python/mqtt/jobs/+/events", qos=1) print("MQTT Monitor: subscribed to python/mqtt/jobs/+/events", flush=True) else: - print(f"MQTT Monitor connection failed: {rc}", flush=True) - + state['failed'] = True + print(f"MQTT Monitor connection failed: rc={rc}", flush=True) + + +cfg = mqtt_common.broker_config_from_env() +client = mqtt_common.make_client("monitor_sub", cfg) +client.on_message = on_message client.on_connect = on_connect print(f"MQTT Monitor: connecting to {cfg.host}:{cfg.port} (TLS={cfg.tls})...", flush=True) -client.connect(cfg.host, cfg.port, cfg.keepalive) -client.loop_forever() + +# Connection failure → fall back to polling (review item 4). +try: + client.connect(cfg.host, cfg.port, cfg.keepalive) +except Exception as e: + print(f"MQTT Monitor: connect failed ({e}); falling back to polling", flush=True) + sys.exit(3) + +client.loop_start() +_wait = time.time() +while time.time() - _wait < 5 and not state['connected'] and not state['failed']: + time.sleep(0.1) +if not state['connected']: + print("MQTT Monitor: broker did not accept connection; falling back to polling", flush=True) + client.loop_stop() + sys.exit(3) + +start = time.time() +try: + while True: + now = time.time() + if timeout and (now - start) >= timeout: + print(f"MQTT Monitor: --timeout {timeout}s reached, exiting", flush=True) + break + if idle_timeout and (now - state['last_msg']) >= idle_timeout: + print(f"MQTT Monitor: --idle-timeout {idle_timeout}s reached, exiting", flush=True) + break + time.sleep(0.5) +finally: + client.loop_stop() + try: + client.disconnect() + except Exception: + pass +sys.exit(0) PYEOF + sub_rc=$? + set -e + + if [ "$sub_rc" = "3" ]; then + echo "MQTT Monitor: broker unavailable — falling back to polling (interval ${POLL_INTERVAL}s)" >&2 + _self="$SKILLS_DIR/agent-sessions-monitor/scripts/reconcile.sh" + _start=$(date +%s) + while :; do + bash "$_self" --once --emit-diff >/dev/null 2>&1 || true + if [ "$SUB_TIMEOUT" != "0" ] && [ "$(( $(date +%s) - _start ))" -ge "$SUB_TIMEOUT" ]; then + break + fi + sleep "$POLL_INTERVAL" + done + fi exit 0 fi diff --git a/skills/lib.sh b/skills/lib.sh index da0a5ed..94b7efb 100644 --- a/skills/lib.sh +++ b/skills/lib.sh @@ -362,37 +362,66 @@ PYEOF } # --------------------------------------------------------------------------- -# delegate_submit_job +# delegate-job integration helpers # -# Register a job in the delegate-job registry. Auto-detects the virtualenv python -# and prints the new JID on stdout. +# All paths are resolved relative to lib.sh's own location (BASH_SOURCE), so the +# skill tree is relocatable — no hardcoded absolute paths (review item 6). # --------------------------------------------------------------------------- -delegate_submit_job() { - local prompt="$1" agent="$2" session="$3" - local skill_dir - skill_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - - local py_bin="python3" - local d="$skill_dir" + +# _delegate_py_bin — echo the virtualenv python (walk up from skills/), else python3. +_delegate_py_bin() { + local d + d="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" while [ "$d" != "/" ] && [ -n "$d" ]; do if [ -x "$d/.venv/bin/python" ]; then - py_bin="$d/.venv/bin/python" - break + printf '%s\n' "$d/.venv/bin/python"; return 0 fi d="$(dirname "$d")" done + printf '%s\n' "python3" +} - local registry_py="$skill_dir/delegate-job/scripts/registry.py" - if [ ! -f "$registry_py" ]; then - registry_py="$(find "$skill_dir" -name "registry.py" | head -n 1 || echo "")" - fi +# _delegate_script — echo the path to a delegate-job script, resolved +# relative to skills/ (lib.sh dir). Empty if not found. +_delegate_script() { + local name="$1" skill_dir cand + skill_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + cand="$skill_dir/delegate-job/scripts/$name" + if [ -f "$cand" ]; then printf '%s\n' "$cand"; return 0; fi + printf '%s\n' "$(find "$skill_dir" -name "$name" 2>/dev/null | head -n 1 || true)" +} + +# delegate_submit_job +# +# Register a job in the delegate-job registry. Prints the new JID on stdout. +delegate_submit_job() { + local prompt="$1" agent="$2" session="$3" + local py_bin registry_py + py_bin="$(_delegate_py_bin)" + registry_py="$(_delegate_script registry.py)" if [ -z "$registry_py" ] || [ ! -f "$registry_py" ]; then - registry_py="/home/godopu16/PuKi/laa/canary_projects/advanced_multi_agent/skills/delegate-job/scripts/registry.py" + echo "ERROR: delegate-job registry.py not found under skills/" >&2 + return 1 fi - "$py_bin" "$registry_py" register \ --prompt "$prompt" \ --agent "$agent" \ --agent-session "$session" } +# delegate_publish_event [detail] +# +# Publish a lifecycle event to the delegate-job registry. Consolidates the +# inline .venv-walk + publish_event.py blocks that were duplicated across +# create/delete/resume (review item 7). Non-fatal by contract: an empty job id, +# a missing script, or a broker failure never aborts the caller. +delegate_publish_event() { + local job_id="$1" event="$2" detail="${3:-}" + [ -n "$job_id" ] || return 0 + local py_bin pub + py_bin="$(_delegate_py_bin)" + pub="$(_delegate_script publish_event.py)" + [ -n "$pub" ] && [ -f "$pub" ] || return 0 + "$py_bin" "$pub" --job "$job_id" --event "$event" --detail "$detail" || true +} + diff --git a/skills/multi-agent-create/scripts/create_session.sh b/skills/multi-agent-create/scripts/create_session.sh index 11a444a..0a5a982 100755 --- a/skills/multi-agent-create/scripts/create_session.sh +++ b/skills/multi-agent-create/scripts/create_session.sh @@ -262,17 +262,7 @@ echo "=== created ===" echo "tmux session: $SESSION_NAME (pane pid $PANE_PID, cmd $PANE_CMD, cwd $PANE_CWD)" if [ -n "$DELEGATE_JOB_ID" ]; then echo "delegate job: $DELEGATE_JOB_ID" - py_bin="python3" - d_dir="$(dirname "${BASH_SOURCE[0]}")" - while [ "$d_dir" != "/" ] && [ -n "$d_dir" ]; do - if [ -x "$d_dir/.venv/bin/python" ]; then - py_bin="$d_dir/.venv/bin/python" - break - fi - d_dir="$(dirname "$d_dir")" - done - pub_script="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/delegate-job/scripts/publish_event.py" - "$py_bin" "$pub_script" --job "$DELEGATE_JOB_ID" --event started --detail "canary session created" || true + delegate_publish_event "$DELEGATE_JOB_ID" started "canary session created" fi echo "agent-sessions.yaml updated" echo diff --git a/skills/multi-agent-delete/scripts/delete_session.sh b/skills/multi-agent-delete/scripts/delete_session.sh index 0b32014..13360bc 100755 --- a/skills/multi-agent-delete/scripts/delete_session.sh +++ b/skills/multi-agent-delete/scripts/delete_session.sh @@ -61,9 +61,10 @@ if [ -z "$AGENT" ]; then esac fi -# 세션이 YAML 에 있는지 + 해당 row 의 워크스페이스 cwd 및 delegate_job_id 추출 +# 세션이 YAML 에 있는지 + 해당 row 의 워크스페이스 cwd 및 delegate_job_id 추출. +# JSON 으로 emit — cwd 에 '|' 가 들어가도 안전 (review item 7; 기존 cwd|jid 파서 대체). MAPPED_DATA=$(env_python "$AGENT_SESSIONS_YAML" SESSION_NAME="$SESSION_NAME" <<'PYEOF' -import os, yaml +import os, json, yaml name = os.environ['SESSION_NAME'] with open(os.environ['YAML_PATH']) as f: d = yaml.safe_load(f) or {} @@ -71,7 +72,7 @@ for s in d.get('tmux_sessions', []): if s.get('name') == name: cwd = (s.get('pane') or {}).get('cwd', '') jid = s.get('delegate_job_id', '') or '' - print(f"{cwd}|{jid}") + print(json.dumps({"cwd": cwd, "job_id": jid})) raise SystemExit(0) raise SystemExit(7) PYEOF @@ -80,8 +81,8 @@ PYEOF exit 1 } -TARGET_CWD="${MAPPED_DATA%|*}" -DELEGATE_JOB_ID="${MAPPED_DATA#*|}" +TARGET_CWD=$(printf '%s' "$MAPPED_DATA" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("cwd",""))') +DELEGATE_JOB_ID=$(printf '%s' "$MAPPED_DATA" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("job_id",""))') # purge 확인 if [ "$PURGE" = "1" ] && [ "$YES" != "1" ]; then @@ -109,19 +110,7 @@ if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then LAST_STATUS=$(tmux capture-pane -t "$SESSION_NAME" -p -S -10 2>/dev/null | tr '\n' ' ' | head -c 500 || true) fi -if [ -n "$DELEGATE_JOB_ID" ]; then - py_bin="python3" - d_dir="$(dirname "${BASH_SOURCE[0]}")" - while [ "$d_dir" != "/" ] && [ -n "$d_dir" ]; do - if [ -x "$d_dir/.venv/bin/python" ]; then - py_bin="$d_dir/.venv/bin/python" - break - fi - d_dir="$(dirname "$d_dir")" - done - pub_script="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/delegate-job/scripts/publish_event.py" - "$py_bin" "$pub_script" --job "$DELEGATE_JOB_ID" --event progress --detail "terminating" || true -fi +delegate_publish_event "$DELEGATE_JOB_ID" progress "terminating" # hard 모드면 tmux 죽임 if [ "$MODE" = "hard" ] && [ "$TMUX_ALIVE" = "1" ]; then @@ -206,19 +195,7 @@ elif purge and not purge_uuid: print(f"updated: {name} status={target['status']}", flush=True) PYEOF -if [ -n "$DELEGATE_JOB_ID" ]; then - py_bin="python3" - d_dir="$(dirname "${BASH_SOURCE[0]}")" - while [ "$d_dir" != "/" ] && [ -n "$d_dir" ]; do - if [ -x "$d_dir/.venv/bin/python" ]; then - py_bin="$d_dir/.venv/bin/python" - break - fi - d_dir="$(dirname "$d_dir")" - done - pub_script="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/delegate-job/scripts/publish_event.py" - "$py_bin" "$pub_script" --job "$DELEGATE_JOB_ID" --event completed --detail "session terminated" || true -fi +delegate_publish_event "$DELEGATE_JOB_ID" completed "session terminated" echo echo "=== delete complete ===" diff --git a/skills/multi-agent-resume/scripts/update_yaml_resumed.sh b/skills/multi-agent-resume/scripts/update_yaml_resumed.sh index b2394b0..75ad1fe 100755 --- a/skills/multi-agent-resume/scripts/update_yaml_resumed.sh +++ b/skills/multi-agent-resume/scripts/update_yaml_resumed.sh @@ -118,16 +118,4 @@ snap.pop('terminated_at_epoch', None) print(f"updated: {name} status=running (resume id -> per-row own id)", flush=True) PYEOF -if [ -n "$DELEGATE_JOB_ID" ]; then - py_bin="python3" - d_dir="$(dirname "${BASH_SOURCE[0]}")" - while [ "$d_dir" != "/" ] && [ -n "$d_dir" ]; do - if [ -x "$d_dir/.venv/bin/python" ]; then - py_bin="$d_dir/.venv/bin/python" - break - fi - d_dir="$(dirname "$d_dir")" - done - pub_script="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/delegate-job/scripts/publish_event.py" - "$py_bin" "$pub_script" --job "$DELEGATE_JOB_ID" --event progress --detail "resumed" || true -fi +delegate_publish_event "$DELEGATE_JOB_ID" progress "resumed" diff --git a/skills/multi-agent-status/scripts/status.sh b/skills/multi-agent-status/scripts/status.sh index 9c94cb8..dde3ded 100755 --- a/skills/multi-agent-status/scripts/status.sh +++ b/skills/multi-agent-status/scripts/status.sh @@ -24,7 +24,11 @@ if [ "$JSON" = "1" ]; then exit 0 fi -DRIFT_JSON="$DRIFT_JSON" env_python "$AGENT_SESSIONS_YAML" <<'PYEOF' +# Project root (parent of skills/) holds the delegate-job .hermes registry. +# Resolved relative to this script — no hardcoded absolute path (review item 6). +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" + +DRIFT_JSON="$DRIFT_JSON" env_python "$AGENT_SESSIONS_YAML" PROJECT_ROOT="$PROJECT_ROOT" <<'PYEOF' import os, json, glob import yaml @@ -64,36 +68,23 @@ def get_job_status(s): jid = s.get('delegate_job_id') if not jid: return ('-', '-') - - # Try workspace relative - path = os.path.join('.hermes', 'jobs', f"{jid}.json") - if os.path.exists(path): - try: - with open(path) as jf: - job_data = json.load(jf) - return (jid, job_data.get('status', 'unknown')) - except Exception: - pass - # Try fixed absolute path of registry - path_fixed = os.path.join('/home/godopu16/PuKi/laa/canary_projects/advanced_multi_agent', '.hermes', 'jobs', f"{jid}.json") - if os.path.exists(path_fixed): - try: - with open(path_fixed) as jf: - job_data = json.load(jf) - return (jid, job_data.get('status', 'unknown')) - except Exception: - pass - - # Try audit log status.json - path_audit = os.path.join('/home/godopu16/PuKi/laa/canary_projects/advanced_multi_agent', '.hermes', 'delegate_job_logs', jid, 'status.json') - if os.path.exists(path_audit): - try: - with open(path_audit) as jf: - job_data = json.load(jf) - return (jid, job_data.get('status', 'unknown')) - except Exception: - pass + project_root = os.environ.get('PROJECT_ROOT', '.') + # Candidate locations (review item 6: project-root-relative, no hardcoded abs paths): + # 1) cwd-relative registry 2) project-root registry 3) project-root audit log + candidates = [ + os.path.join('.hermes', 'jobs', f"{jid}.json"), + os.path.join(project_root, '.hermes', 'jobs', f"{jid}.json"), + os.path.join(project_root, '.hermes', 'delegate_job_logs', jid, 'status.json'), + ] + for path in candidates: + if os.path.exists(path): + try: + with open(path) as jf: + job_data = json.load(jf) + return (jid, job_data.get('status', 'unknown')) + except Exception: + pass return (jid, 'unknown')