fix(skills): claude review items 4-7 (subscribe timeout, atomic_dump_yaml, hardcoded paths, lifecycle helper)
Item 4: --subscribe gains --timeout/--idle-timeout (idle default raised
120s->600s, 0=disable); connect-error AND non-zero CONNACK now fall
back to a polling loop. SKILL.md matches actual behaviour.
Item 5: --subscribe terminal-event YAML writes routed through
lib.sh::atomic_dump_yaml (flock + schema-validate + .bak).
Item 6: removed hardcoded /home/godopu16/PuKi fallbacks in lib.sh,
status.sh (x2) and reconcile.sh; paths now BASH_SOURCE-relative.
Item 7: lib.sh::delegate_publish_event helper consolidates the 4 duplicated
lifecycle publish blocks; delete cwd|jid parser replaced with JSON.
Also: subscribe loop runs under the project venv python (paho) and delegates
all YAML work to atomic_dump_yaml on system python3 (PyYAML), since neither
interpreter has both modules — the original env_python path could never import
paho. Items 3 + 8 out of scope (per user). Verified on -L claude-phase4-test
(kill-server after).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -94,16 +94,20 @@ The worker calls this script every 30s. It:
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Reconcile + auto-update YAML (atomic, flock-guarded). Emits JSON drift to stdout.
|
# Reconcile + auto-update YAML (atomic, flock-guarded). Emits JSON drift to stdout.
|
||||||
bash ~/PuKi/lab/agent_sessions/skills/agent-sessions-monitor/scripts/reconcile.sh --once --emit-diff
|
bash skills/agent-sessions-monitor/scripts/reconcile.sh --once --emit-diff
|
||||||
|
|
||||||
# Read-only: compute drift WITHOUT writing the YAML (use for "what's running?" checks).
|
# Read-only: compute drift WITHOUT writing the YAML (use for "what's running?" checks).
|
||||||
bash ~/PuKi/lab/agent_sessions/skills/agent-sessions-monitor/scripts/reconcile.sh --once --emit-diff --dry-run
|
bash skills/agent-sessions-monitor/scripts/reconcile.sh --once --emit-diff --dry-run
|
||||||
|
|
||||||
# Push-based MQTT Monitor: listen to delegated job events on the broker and update the YAML instantly.
|
# Push-based MQTT Monitor: listen to delegated job events on the broker and update the YAML instantly.
|
||||||
bash ~/PuKi/lab/agent_sessions/skills/agent-sessions-monitor/scripts/reconcile.sh --subscribe
|
# Bounded run that exits after 5 min idle, or 1 h wall-clock; falls back to polling if the broker is down.
|
||||||
|
bash skills/agent-sessions-monitor/scripts/reconcile.sh --subscribe --idle-timeout 300 --timeout 3600
|
||||||
|
|
||||||
|
# Persistent monitor (no timeouts): runs until interrupted; still polls if the broker is unreachable.
|
||||||
|
bash skills/agent-sessions-monitor/scripts/reconcile.sh --subscribe --idle-timeout 0
|
||||||
```
|
```
|
||||||
|
|
||||||
Flags: `--once` (single pass), `--emit-diff` (print JSON), `--dry-run` (P1-E — no mutation), `--subscribe` (persistent push-based MQTT subscription monitoring; falls back to polling if connection fails). There are **no** `--workspace` / `--agent` / `--comment-card` flags; the worker turns the emitted JSON `drifts[]` into `kanban_comment` calls itself.
|
Flags: `--once` (single pass), `--emit-diff` (print JSON), `--dry-run` (P1-E — no mutation), `--subscribe` (push-based MQTT subscription monitoring). `--subscribe` sub-flags: `--timeout N` (exit after N seconds of wall-clock; `0` = no limit, default), `--idle-timeout N` (exit after N seconds with no message; default `600`, `0` = never idle-out). On a broker connection failure (connect error **or** non-zero CONNACK), `--subscribe` falls back to a polling loop that re-runs `--once --emit-diff` every `RECONCILE_POLL_INTERVAL` (default 15) seconds until `--timeout`. Terminal-event YAML updates are written through `lib.sh::atomic_dump_yaml` (flock + schema-validate + `.bak`). There are **no** `--workspace` / `--agent` / `--comment-card` flags; the worker turns the emitted JSON `drifts[]` into `kanban_comment` calls itself.
|
||||||
|
|
||||||
## Drift classes (what the script handles)
|
## Drift classes (what the script handles)
|
||||||
|
|
||||||
|
|||||||
@@ -22,6 +22,11 @@ ONCE=0
|
|||||||
EMIT_DIFF=0
|
EMIT_DIFF=0
|
||||||
DRY_RUN=0
|
DRY_RUN=0
|
||||||
SUBSCRIBE=0
|
SUBSCRIBE=0
|
||||||
|
# --subscribe controls (review item 4): 0 = no overall timeout; idle default 600s
|
||||||
|
# (raised from the old hardcoded 120s); idle 0 = never idle-out.
|
||||||
|
SUB_TIMEOUT=0
|
||||||
|
SUB_IDLE_TIMEOUT=600
|
||||||
|
POLL_INTERVAL="${RECONCILE_POLL_INTERVAL:-15}"
|
||||||
|
|
||||||
while [ $# -gt 0 ]; do
|
while [ $# -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
@@ -29,7 +34,9 @@ while [ $# -gt 0 ]; do
|
|||||||
--emit-diff) EMIT_DIFF=1; shift ;;
|
--emit-diff) EMIT_DIFF=1; shift ;;
|
||||||
--dry-run) DRY_RUN=1; shift ;;
|
--dry-run) DRY_RUN=1; shift ;;
|
||||||
--subscribe) SUBSCRIBE=1; shift ;;
|
--subscribe) SUBSCRIBE=1; shift ;;
|
||||||
-h|--help) echo "Usage: $0 [--once] [--emit-diff] [--dry-run] [--subscribe]"; exit 0 ;;
|
--timeout) SUB_TIMEOUT="$2"; shift 2 ;;
|
||||||
|
--idle-timeout) SUB_IDLE_TIMEOUT="$2"; shift 2 ;;
|
||||||
|
-h|--help) echo "Usage: $0 [--once] [--emit-diff] [--dry-run] [--subscribe [--timeout N] [--idle-timeout N]]"; exit 0 ;;
|
||||||
*) echo "ERROR: unknown arg: $1" >&2; exit 2 ;;
|
*) echo "ERROR: unknown arg: $1" >&2; exit 2 ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
@@ -37,122 +44,176 @@ done
|
|||||||
[ -f "$AGENT_SESSIONS_YAML" ] || { echo "ERROR: $AGENT_SESSIONS_YAML not found" >&2; exit 1; }
|
[ -f "$AGENT_SESSIONS_YAML" ] || { echo "ERROR: $AGENT_SESSIONS_YAML not found" >&2; exit 1; }
|
||||||
|
|
||||||
if [ "$SUBSCRIBE" = "1" ]; then
|
if [ "$SUBSCRIBE" = "1" ]; then
|
||||||
SUBSCRIBE_MODE=1 env_python "$AGENT_SESSIONS_YAML" <<'PYEOF'
|
# Paths resolved relative to this script (review item 6): skills/ dir + lib.sh.
|
||||||
import os, sys, json, fcntl, tempfile, subprocess
|
SKILLS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
||||||
from datetime import datetime, timezone
|
LIB_SH="$SKILLS_DIR/lib.sh"
|
||||||
import yaml
|
# MQTT client lives in the project venv (has paho). All YAML work is delegated
|
||||||
|
# to lib.sh::atomic_dump_yaml, which runs the system python3 (has PyYAML) — so
|
||||||
|
# no single interpreter needs both paho and PyYAML (review items 4/5/6).
|
||||||
|
PYBIN="$(_delegate_py_bin)"
|
||||||
|
|
||||||
yaml_path = os.environ['YAML_PATH']
|
# The MQTT subscribe loop exits 3 to signal "broker unavailable → poll instead".
|
||||||
home = os.environ['HOME_DIR']
|
set +e
|
||||||
|
YAML_PATH="$AGENT_SESSIONS_YAML" HOME_DIR="$HOME" \
|
||||||
|
SUB_TIMEOUT="$SUB_TIMEOUT" SUB_IDLE_TIMEOUT="$SUB_IDLE_TIMEOUT" \
|
||||||
|
SKILLS_DIR="$SKILLS_DIR" LIB_SH="$LIB_SH" \
|
||||||
|
"$PYBIN" - <<'PYEOF'
|
||||||
|
import os, sys, json, time, subprocess
|
||||||
|
|
||||||
# Add skills/delegate-job/scripts to path to import mqtt_common
|
lib_sh = os.environ.get('LIB_SH', '')
|
||||||
script_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
|
skills_dir = os.environ.get('SKILLS_DIR', '')
|
||||||
path_candidate = os.path.join('/home/godopu16/PuKi/laa/canary_projects/advanced_multi_agent', 'skills', 'delegate-job', 'scripts')
|
timeout = int(os.environ.get('SUB_TIMEOUT', '0') or '0') # 0 = no overall timeout
|
||||||
if os.path.isdir(path_candidate):
|
idle_timeout = int(os.environ.get('SUB_IDLE_TIMEOUT', '600') or '0') # 0 = no idle timeout
|
||||||
sys.path.append(path_candidate)
|
|
||||||
|
# Locate skills/delegate-job/scripts to import mqtt_common — relative first, then
|
||||||
|
# an upward walk from cwd. No hardcoded absolute path (review item 6).
|
||||||
|
cand = os.path.join(skills_dir, 'delegate-job', 'scripts') if skills_dir else ''
|
||||||
|
if cand and os.path.isdir(cand):
|
||||||
|
sys.path.append(cand)
|
||||||
else:
|
else:
|
||||||
d = script_dir
|
d = os.getcwd()
|
||||||
found = False
|
while d and d != '/':
|
||||||
while d != '/' and d:
|
hit = None
|
||||||
p = os.path.join(d, 'skills', 'delegate-job', 'scripts')
|
for sub in (('skills', 'delegate-job', 'scripts'), ('delegate-job', 'scripts')):
|
||||||
|
p = os.path.join(d, *sub)
|
||||||
if os.path.isdir(p):
|
if os.path.isdir(p):
|
||||||
sys.path.append(p)
|
hit = p
|
||||||
found = True
|
|
||||||
break
|
break
|
||||||
p2 = os.path.join(d, 'delegate-job', 'scripts')
|
if hit:
|
||||||
if os.path.isdir(p2):
|
sys.path.append(hit)
|
||||||
sys.path.append(p2)
|
|
||||||
found = True
|
|
||||||
break
|
break
|
||||||
d = os.path.dirname(d)
|
d = os.path.dirname(d)
|
||||||
|
|
||||||
import mqtt_common
|
import mqtt_common
|
||||||
|
|
||||||
cfg = mqtt_common.broker_config_from_env()
|
# Executed INSIDE lib.sh::atomic_dump_yaml (system python3 + PyYAML), under the
|
||||||
client = mqtt_common.make_client("monitor_sub", cfg)
|
# YAML flock with schema-validate + .bak (review item 5). Marks matching running
|
||||||
|
# sessions terminated and kills their tmux (review item 3 behaviour preserved),
|
||||||
|
# or aborts the write entirely when nothing matches. The untrusted MQTT job id /
|
||||||
|
# event arrive via env (MQTT_JID / MQTT_EVENT) — never spliced into source (P1-B).
|
||||||
|
_MUTATION = r'''
|
||||||
|
import os, subprocess
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
_jid = os.environ['MQTT_JID']
|
||||||
|
_event = os.environ['MQTT_EVENT']
|
||||||
|
_now = datetime.now(timezone.utc)
|
||||||
|
_changed = False
|
||||||
|
for s in d.get('tmux_sessions', []):
|
||||||
|
if s.get('delegate_job_id') == _jid and s.get('status') == 'running':
|
||||||
|
s['status'] = 'terminated'
|
||||||
|
s['terminated_at'] = _now.strftime('%Y-%m-%dT%H:%M:%SZ')
|
||||||
|
s['terminated_at_epoch'] = int(_now.timestamp())
|
||||||
|
s['termination_mode'] = 'auto-detected (MQTT ' + _event + ')'
|
||||||
|
_name = s.get('name')
|
||||||
|
_srv = s.get('tmux_server') or 'default'
|
||||||
|
_cmd = ['tmux'] + (['-L', _srv] if _srv != 'default' else []) + ['kill-session', '-t', _name]
|
||||||
|
subprocess.run(_cmd, capture_output=True)
|
||||||
|
print('MQTT Monitor: terminated + killed ' + str(_name) + ' on ' + str(_srv), flush=True)
|
||||||
|
_changed = True
|
||||||
|
if not _changed:
|
||||||
|
raise SystemExit(0) # nothing matched — skip the write entirely
|
||||||
|
'''
|
||||||
|
|
||||||
def on_message(client, userdata, msg):
|
|
||||||
|
def handle_terminal(jid, event):
|
||||||
|
if not lib_sh or not os.path.isfile(lib_sh):
|
||||||
|
print('MQTT Monitor: lib.sh not found, cannot update YAML', flush=True)
|
||||||
|
return
|
||||||
|
env = dict(os.environ)
|
||||||
|
env['MQTT_JID'] = jid
|
||||||
|
env['MQTT_EVENT'] = event
|
||||||
|
cmd = ['bash', '-c',
|
||||||
|
'source "$LIB_SH"; atomic_dump_yaml "$YAML_PATH" MQTT_JID="$MQTT_JID" MQTT_EVENT="$MQTT_EVENT"']
|
||||||
|
r = subprocess.run(cmd, input=_MUTATION, text=True, env=env, capture_output=True)
|
||||||
|
if (r.stdout or '').strip():
|
||||||
|
print(r.stdout.strip(), flush=True)
|
||||||
|
if r.returncode != 0 and (r.stderr or '').strip():
|
||||||
|
print('MQTT Monitor: atomic_dump_yaml stderr: ' + r.stderr.strip(), flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
state = {'last_msg': time.time(), 'connected': False, 'failed': False}
|
||||||
|
|
||||||
|
|
||||||
|
def on_message(_client, _userdata, msg):
|
||||||
|
state['last_msg'] = time.time()
|
||||||
try:
|
try:
|
||||||
payload = json.loads(msg.payload.decode("utf-8"))
|
payload = json.loads(msg.payload.decode("utf-8"))
|
||||||
jid = payload.get("job_id")
|
jid = payload.get("job_id")
|
||||||
event = payload.get("event")
|
event = payload.get("event")
|
||||||
if not jid or not event:
|
if jid and event in ("completed", "error"):
|
||||||
return
|
|
||||||
|
|
||||||
if event in ("completed", "error"):
|
|
||||||
print(f"MQTT Monitor: received terminal event {event} for job {jid}", flush=True)
|
print(f"MQTT Monitor: received terminal event {event} for job {jid}", flush=True)
|
||||||
update_session_by_job(jid, event)
|
handle_terminal(jid, event)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"MQTT Monitor error parsing message: {e}", flush=True)
|
print(f"MQTT Monitor error parsing message: {e}", flush=True)
|
||||||
|
|
||||||
def update_session_by_job(jid, event):
|
|
||||||
lock_path = yaml_path + '.lock'
|
|
||||||
lock_fh = open(lock_path, 'w')
|
|
||||||
fcntl.flock(lock_fh, fcntl.LOCK_EX)
|
|
||||||
try:
|
|
||||||
if os.path.exists(yaml_path):
|
|
||||||
with open(yaml_path) as f:
|
|
||||||
d_local = yaml.safe_load(f) or {}
|
|
||||||
else:
|
|
||||||
d_local = {}
|
|
||||||
|
|
||||||
sessions = d_local.setdefault('tmux_sessions', [])
|
|
||||||
updated = False
|
|
||||||
for s in sessions:
|
|
||||||
if s.get('delegate_job_id') == jid and s.get('status') == 'running':
|
|
||||||
s['status'] = 'terminated'
|
|
||||||
now_iso = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
|
|
||||||
s['terminated_at'] = now_iso
|
|
||||||
s['terminated_at_epoch'] = int(datetime.now(timezone.utc).timestamp())
|
|
||||||
s['termination_mode'] = f"auto-detected (MQTT {event})"
|
|
||||||
name = s.get('name')
|
|
||||||
srv = s.get('tmux_server') or 'default'
|
|
||||||
kill_tmux_session(name, srv)
|
|
||||||
updated = True
|
|
||||||
|
|
||||||
if updated:
|
|
||||||
dir_ = os.path.dirname(yaml_path) or '.'
|
|
||||||
fd, tmp = tempfile.mkstemp(dir=dir_, prefix='.agent-sessions.', suffix='.tmp')
|
|
||||||
try:
|
|
||||||
with os.fdopen(fd, 'w') as f:
|
|
||||||
yaml.safe_dump(d_local, f, default_flow_style=False, sort_keys=False,
|
|
||||||
allow_unicode=True, width=4096)
|
|
||||||
os.replace(tmp, yaml_path)
|
|
||||||
print(f"MQTT Monitor: updated YAML for job {jid} to terminated", flush=True)
|
|
||||||
except Exception as e:
|
|
||||||
if os.path.exists(tmp):
|
|
||||||
os.remove(tmp)
|
|
||||||
print(f"MQTT Monitor error writing YAML: {e}", flush=True)
|
|
||||||
finally:
|
|
||||||
fcntl.flock(lock_fh, fcntl.LOCK_UN)
|
|
||||||
lock_fh.close()
|
|
||||||
|
|
||||||
def kill_tmux_session(name, srv):
|
|
||||||
try:
|
|
||||||
cmd = ['tmux']
|
|
||||||
if srv != 'default':
|
|
||||||
cmd += ['-L', srv]
|
|
||||||
cmd += ['kill-session', '-t', name]
|
|
||||||
subprocess.run(cmd, capture_output=True)
|
|
||||||
print(f"MQTT Monitor: killed tmux session {name} on server {srv}", flush=True)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"MQTT Monitor error killing tmux: {e}", flush=True)
|
|
||||||
|
|
||||||
client.on_message = on_message
|
|
||||||
|
|
||||||
def on_connect(_c, _u, _flags, reason_code, _props):
|
def on_connect(_c, _u, _flags, reason_code, _props):
|
||||||
rc = mqtt_common.reason_code_value(reason_code)
|
rc = mqtt_common.reason_code_value(reason_code)
|
||||||
if rc == 0:
|
if rc == 0:
|
||||||
|
state['connected'] = True
|
||||||
_c.subscribe("python/mqtt/jobs/+/events", qos=1)
|
_c.subscribe("python/mqtt/jobs/+/events", qos=1)
|
||||||
print("MQTT Monitor: subscribed to python/mqtt/jobs/+/events", flush=True)
|
print("MQTT Monitor: subscribed to python/mqtt/jobs/+/events", flush=True)
|
||||||
else:
|
else:
|
||||||
print(f"MQTT Monitor connection failed: {rc}", flush=True)
|
state['failed'] = True
|
||||||
|
print(f"MQTT Monitor connection failed: rc={rc}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
cfg = mqtt_common.broker_config_from_env()
|
||||||
|
client = mqtt_common.make_client("monitor_sub", cfg)
|
||||||
|
client.on_message = on_message
|
||||||
client.on_connect = on_connect
|
client.on_connect = on_connect
|
||||||
print(f"MQTT Monitor: connecting to {cfg.host}:{cfg.port} (TLS={cfg.tls})...", flush=True)
|
print(f"MQTT Monitor: connecting to {cfg.host}:{cfg.port} (TLS={cfg.tls})...", flush=True)
|
||||||
|
|
||||||
|
# Connection failure → fall back to polling (review item 4).
|
||||||
|
try:
|
||||||
client.connect(cfg.host, cfg.port, cfg.keepalive)
|
client.connect(cfg.host, cfg.port, cfg.keepalive)
|
||||||
client.loop_forever()
|
except Exception as e:
|
||||||
|
print(f"MQTT Monitor: connect failed ({e}); falling back to polling", flush=True)
|
||||||
|
sys.exit(3)
|
||||||
|
|
||||||
|
client.loop_start()
|
||||||
|
_wait = time.time()
|
||||||
|
while time.time() - _wait < 5 and not state['connected'] and not state['failed']:
|
||||||
|
time.sleep(0.1)
|
||||||
|
if not state['connected']:
|
||||||
|
print("MQTT Monitor: broker did not accept connection; falling back to polling", flush=True)
|
||||||
|
client.loop_stop()
|
||||||
|
sys.exit(3)
|
||||||
|
|
||||||
|
start = time.time()
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
now = time.time()
|
||||||
|
if timeout and (now - start) >= timeout:
|
||||||
|
print(f"MQTT Monitor: --timeout {timeout}s reached, exiting", flush=True)
|
||||||
|
break
|
||||||
|
if idle_timeout and (now - state['last_msg']) >= idle_timeout:
|
||||||
|
print(f"MQTT Monitor: --idle-timeout {idle_timeout}s reached, exiting", flush=True)
|
||||||
|
break
|
||||||
|
time.sleep(0.5)
|
||||||
|
finally:
|
||||||
|
client.loop_stop()
|
||||||
|
try:
|
||||||
|
client.disconnect()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
sys.exit(0)
|
||||||
PYEOF
|
PYEOF
|
||||||
|
sub_rc=$?
|
||||||
|
set -e
|
||||||
|
|
||||||
|
if [ "$sub_rc" = "3" ]; then
|
||||||
|
echo "MQTT Monitor: broker unavailable — falling back to polling (interval ${POLL_INTERVAL}s)" >&2
|
||||||
|
_self="$SKILLS_DIR/agent-sessions-monitor/scripts/reconcile.sh"
|
||||||
|
_start=$(date +%s)
|
||||||
|
while :; do
|
||||||
|
bash "$_self" --once --emit-diff >/dev/null 2>&1 || true
|
||||||
|
if [ "$SUB_TIMEOUT" != "0" ] && [ "$(( $(date +%s) - _start ))" -ge "$SUB_TIMEOUT" ]; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
sleep "$POLL_INTERVAL"
|
||||||
|
done
|
||||||
|
fi
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
+46
-17
@@ -362,37 +362,66 @@ PYEOF
|
|||||||
}
|
}
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# delegate_submit_job <prompt> <agent> <agent_session>
|
# delegate-job integration helpers
|
||||||
#
|
#
|
||||||
# Register a job in the delegate-job registry. Auto-detects the virtualenv python
|
# All paths are resolved relative to lib.sh's own location (BASH_SOURCE), so the
|
||||||
# and prints the new JID on stdout.
|
# skill tree is relocatable — no hardcoded absolute paths (review item 6).
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
delegate_submit_job() {
|
|
||||||
local prompt="$1" agent="$2" session="$3"
|
|
||||||
local skill_dir
|
|
||||||
skill_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
|
|
||||||
local py_bin="python3"
|
# _delegate_py_bin — echo the virtualenv python (walk up from skills/), else python3.
|
||||||
local d="$skill_dir"
|
_delegate_py_bin() {
|
||||||
|
local d
|
||||||
|
d="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
while [ "$d" != "/" ] && [ -n "$d" ]; do
|
while [ "$d" != "/" ] && [ -n "$d" ]; do
|
||||||
if [ -x "$d/.venv/bin/python" ]; then
|
if [ -x "$d/.venv/bin/python" ]; then
|
||||||
py_bin="$d/.venv/bin/python"
|
printf '%s\n' "$d/.venv/bin/python"; return 0
|
||||||
break
|
|
||||||
fi
|
fi
|
||||||
d="$(dirname "$d")"
|
d="$(dirname "$d")"
|
||||||
done
|
done
|
||||||
|
printf '%s\n' "python3"
|
||||||
|
}
|
||||||
|
|
||||||
local registry_py="$skill_dir/delegate-job/scripts/registry.py"
|
# _delegate_script <name> — echo the path to a delegate-job script, resolved
|
||||||
if [ ! -f "$registry_py" ]; then
|
# relative to skills/ (lib.sh dir). Empty if not found.
|
||||||
registry_py="$(find "$skill_dir" -name "registry.py" | head -n 1 || echo "")"
|
_delegate_script() {
|
||||||
fi
|
local name="$1" skill_dir cand
|
||||||
|
skill_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
cand="$skill_dir/delegate-job/scripts/$name"
|
||||||
|
if [ -f "$cand" ]; then printf '%s\n' "$cand"; return 0; fi
|
||||||
|
printf '%s\n' "$(find "$skill_dir" -name "$name" 2>/dev/null | head -n 1 || true)"
|
||||||
|
}
|
||||||
|
|
||||||
|
# delegate_submit_job <prompt> <agent> <agent_session>
|
||||||
|
#
|
||||||
|
# Register a job in the delegate-job registry. Prints the new JID on stdout.
|
||||||
|
delegate_submit_job() {
|
||||||
|
local prompt="$1" agent="$2" session="$3"
|
||||||
|
local py_bin registry_py
|
||||||
|
py_bin="$(_delegate_py_bin)"
|
||||||
|
registry_py="$(_delegate_script registry.py)"
|
||||||
if [ -z "$registry_py" ] || [ ! -f "$registry_py" ]; then
|
if [ -z "$registry_py" ] || [ ! -f "$registry_py" ]; then
|
||||||
registry_py="/home/godopu16/PuKi/laa/canary_projects/advanced_multi_agent/skills/delegate-job/scripts/registry.py"
|
echo "ERROR: delegate-job registry.py not found under skills/" >&2
|
||||||
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
"$py_bin" "$registry_py" register \
|
"$py_bin" "$registry_py" register \
|
||||||
--prompt "$prompt" \
|
--prompt "$prompt" \
|
||||||
--agent "$agent" \
|
--agent "$agent" \
|
||||||
--agent-session "$session"
|
--agent-session "$session"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# delegate_publish_event <job_id> <event> [detail]
|
||||||
|
#
|
||||||
|
# Publish a lifecycle event to the delegate-job registry. Consolidates the
|
||||||
|
# inline .venv-walk + publish_event.py blocks that were duplicated across
|
||||||
|
# create/delete/resume (review item 7). Non-fatal by contract: an empty job id,
|
||||||
|
# a missing script, or a broker failure never aborts the caller.
|
||||||
|
delegate_publish_event() {
|
||||||
|
local job_id="$1" event="$2" detail="${3:-}"
|
||||||
|
[ -n "$job_id" ] || return 0
|
||||||
|
local py_bin pub
|
||||||
|
py_bin="$(_delegate_py_bin)"
|
||||||
|
pub="$(_delegate_script publish_event.py)"
|
||||||
|
[ -n "$pub" ] && [ -f "$pub" ] || return 0
|
||||||
|
"$py_bin" "$pub" --job "$job_id" --event "$event" --detail "$detail" || true
|
||||||
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -262,17 +262,7 @@ echo "=== created ==="
|
|||||||
echo "tmux session: $SESSION_NAME (pane pid $PANE_PID, cmd $PANE_CMD, cwd $PANE_CWD)"
|
echo "tmux session: $SESSION_NAME (pane pid $PANE_PID, cmd $PANE_CMD, cwd $PANE_CWD)"
|
||||||
if [ -n "$DELEGATE_JOB_ID" ]; then
|
if [ -n "$DELEGATE_JOB_ID" ]; then
|
||||||
echo "delegate job: $DELEGATE_JOB_ID"
|
echo "delegate job: $DELEGATE_JOB_ID"
|
||||||
py_bin="python3"
|
delegate_publish_event "$DELEGATE_JOB_ID" started "canary session created"
|
||||||
d_dir="$(dirname "${BASH_SOURCE[0]}")"
|
|
||||||
while [ "$d_dir" != "/" ] && [ -n "$d_dir" ]; do
|
|
||||||
if [ -x "$d_dir/.venv/bin/python" ]; then
|
|
||||||
py_bin="$d_dir/.venv/bin/python"
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
d_dir="$(dirname "$d_dir")"
|
|
||||||
done
|
|
||||||
pub_script="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/delegate-job/scripts/publish_event.py"
|
|
||||||
"$py_bin" "$pub_script" --job "$DELEGATE_JOB_ID" --event started --detail "canary session created" || true
|
|
||||||
fi
|
fi
|
||||||
echo "agent-sessions.yaml updated"
|
echo "agent-sessions.yaml updated"
|
||||||
echo
|
echo
|
||||||
|
|||||||
@@ -61,9 +61,10 @@ if [ -z "$AGENT" ]; then
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# 세션이 YAML 에 있는지 + 해당 row 의 워크스페이스 cwd 및 delegate_job_id 추출
|
# 세션이 YAML 에 있는지 + 해당 row 의 워크스페이스 cwd 및 delegate_job_id 추출.
|
||||||
|
# JSON 으로 emit — cwd 에 '|' 가 들어가도 안전 (review item 7; 기존 cwd|jid 파서 대체).
|
||||||
MAPPED_DATA=$(env_python "$AGENT_SESSIONS_YAML" SESSION_NAME="$SESSION_NAME" <<'PYEOF'
|
MAPPED_DATA=$(env_python "$AGENT_SESSIONS_YAML" SESSION_NAME="$SESSION_NAME" <<'PYEOF'
|
||||||
import os, yaml
|
import os, json, yaml
|
||||||
name = os.environ['SESSION_NAME']
|
name = os.environ['SESSION_NAME']
|
||||||
with open(os.environ['YAML_PATH']) as f:
|
with open(os.environ['YAML_PATH']) as f:
|
||||||
d = yaml.safe_load(f) or {}
|
d = yaml.safe_load(f) or {}
|
||||||
@@ -71,7 +72,7 @@ for s in d.get('tmux_sessions', []):
|
|||||||
if s.get('name') == name:
|
if s.get('name') == name:
|
||||||
cwd = (s.get('pane') or {}).get('cwd', '')
|
cwd = (s.get('pane') or {}).get('cwd', '')
|
||||||
jid = s.get('delegate_job_id', '') or ''
|
jid = s.get('delegate_job_id', '') or ''
|
||||||
print(f"{cwd}|{jid}")
|
print(json.dumps({"cwd": cwd, "job_id": jid}))
|
||||||
raise SystemExit(0)
|
raise SystemExit(0)
|
||||||
raise SystemExit(7)
|
raise SystemExit(7)
|
||||||
PYEOF
|
PYEOF
|
||||||
@@ -80,8 +81,8 @@ PYEOF
|
|||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
TARGET_CWD="${MAPPED_DATA%|*}"
|
TARGET_CWD=$(printf '%s' "$MAPPED_DATA" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("cwd",""))')
|
||||||
DELEGATE_JOB_ID="${MAPPED_DATA#*|}"
|
DELEGATE_JOB_ID=$(printf '%s' "$MAPPED_DATA" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("job_id",""))')
|
||||||
|
|
||||||
# purge 확인
|
# purge 확인
|
||||||
if [ "$PURGE" = "1" ] && [ "$YES" != "1" ]; then
|
if [ "$PURGE" = "1" ] && [ "$YES" != "1" ]; then
|
||||||
@@ -109,19 +110,7 @@ if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
|
|||||||
LAST_STATUS=$(tmux capture-pane -t "$SESSION_NAME" -p -S -10 2>/dev/null | tr '\n' ' ' | head -c 500 || true)
|
LAST_STATUS=$(tmux capture-pane -t "$SESSION_NAME" -p -S -10 2>/dev/null | tr '\n' ' ' | head -c 500 || true)
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -n "$DELEGATE_JOB_ID" ]; then
|
delegate_publish_event "$DELEGATE_JOB_ID" progress "terminating"
|
||||||
py_bin="python3"
|
|
||||||
d_dir="$(dirname "${BASH_SOURCE[0]}")"
|
|
||||||
while [ "$d_dir" != "/" ] && [ -n "$d_dir" ]; do
|
|
||||||
if [ -x "$d_dir/.venv/bin/python" ]; then
|
|
||||||
py_bin="$d_dir/.venv/bin/python"
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
d_dir="$(dirname "$d_dir")"
|
|
||||||
done
|
|
||||||
pub_script="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/delegate-job/scripts/publish_event.py"
|
|
||||||
"$py_bin" "$pub_script" --job "$DELEGATE_JOB_ID" --event progress --detail "terminating" || true
|
|
||||||
fi
|
|
||||||
|
|
||||||
# hard 모드면 tmux 죽임
|
# hard 모드면 tmux 죽임
|
||||||
if [ "$MODE" = "hard" ] && [ "$TMUX_ALIVE" = "1" ]; then
|
if [ "$MODE" = "hard" ] && [ "$TMUX_ALIVE" = "1" ]; then
|
||||||
@@ -206,19 +195,7 @@ elif purge and not purge_uuid:
|
|||||||
print(f"updated: {name} status={target['status']}", flush=True)
|
print(f"updated: {name} status={target['status']}", flush=True)
|
||||||
PYEOF
|
PYEOF
|
||||||
|
|
||||||
if [ -n "$DELEGATE_JOB_ID" ]; then
|
delegate_publish_event "$DELEGATE_JOB_ID" completed "session terminated"
|
||||||
py_bin="python3"
|
|
||||||
d_dir="$(dirname "${BASH_SOURCE[0]}")"
|
|
||||||
while [ "$d_dir" != "/" ] && [ -n "$d_dir" ]; do
|
|
||||||
if [ -x "$d_dir/.venv/bin/python" ]; then
|
|
||||||
py_bin="$d_dir/.venv/bin/python"
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
d_dir="$(dirname "$d_dir")"
|
|
||||||
done
|
|
||||||
pub_script="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/delegate-job/scripts/publish_event.py"
|
|
||||||
"$py_bin" "$pub_script" --job "$DELEGATE_JOB_ID" --event completed --detail "session terminated" || true
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo
|
echo
|
||||||
echo "=== delete complete ==="
|
echo "=== delete complete ==="
|
||||||
|
|||||||
@@ -118,16 +118,4 @@ snap.pop('terminated_at_epoch', None)
|
|||||||
print(f"updated: {name} status=running (resume id -> per-row own id)", flush=True)
|
print(f"updated: {name} status=running (resume id -> per-row own id)", flush=True)
|
||||||
PYEOF
|
PYEOF
|
||||||
|
|
||||||
if [ -n "$DELEGATE_JOB_ID" ]; then
|
delegate_publish_event "$DELEGATE_JOB_ID" progress "resumed"
|
||||||
py_bin="python3"
|
|
||||||
d_dir="$(dirname "${BASH_SOURCE[0]}")"
|
|
||||||
while [ "$d_dir" != "/" ] && [ -n "$d_dir" ]; do
|
|
||||||
if [ -x "$d_dir/.venv/bin/python" ]; then
|
|
||||||
py_bin="$d_dir/.venv/bin/python"
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
d_dir="$(dirname "$d_dir")"
|
|
||||||
done
|
|
||||||
pub_script="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/delegate-job/scripts/publish_event.py"
|
|
||||||
"$py_bin" "$pub_script" --job "$DELEGATE_JOB_ID" --event progress --detail "resumed" || true
|
|
||||||
fi
|
|
||||||
|
|||||||
@@ -24,7 +24,11 @@ if [ "$JSON" = "1" ]; then
|
|||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
DRIFT_JSON="$DRIFT_JSON" env_python "$AGENT_SESSIONS_YAML" <<'PYEOF'
|
# Project root (parent of skills/) holds the delegate-job .hermes registry.
|
||||||
|
# Resolved relative to this script — no hardcoded absolute path (review item 6).
|
||||||
|
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
|
||||||
|
|
||||||
|
DRIFT_JSON="$DRIFT_JSON" env_python "$AGENT_SESSIONS_YAML" PROJECT_ROOT="$PROJECT_ROOT" <<'PYEOF'
|
||||||
import os, json, glob
|
import os, json, glob
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
@@ -65,8 +69,15 @@ def get_job_status(s):
|
|||||||
if not jid:
|
if not jid:
|
||||||
return ('-', '-')
|
return ('-', '-')
|
||||||
|
|
||||||
# Try workspace relative
|
project_root = os.environ.get('PROJECT_ROOT', '.')
|
||||||
path = os.path.join('.hermes', 'jobs', f"{jid}.json")
|
# Candidate locations (review item 6: project-root-relative, no hardcoded abs paths):
|
||||||
|
# 1) cwd-relative registry 2) project-root registry 3) project-root audit log
|
||||||
|
candidates = [
|
||||||
|
os.path.join('.hermes', 'jobs', f"{jid}.json"),
|
||||||
|
os.path.join(project_root, '.hermes', 'jobs', f"{jid}.json"),
|
||||||
|
os.path.join(project_root, '.hermes', 'delegate_job_logs', jid, 'status.json'),
|
||||||
|
]
|
||||||
|
for path in candidates:
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
try:
|
try:
|
||||||
with open(path) as jf:
|
with open(path) as jf:
|
||||||
@@ -75,26 +86,6 @@ def get_job_status(s):
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Try fixed absolute path of registry
|
|
||||||
path_fixed = os.path.join('/home/godopu16/PuKi/laa/canary_projects/advanced_multi_agent', '.hermes', 'jobs', f"{jid}.json")
|
|
||||||
if os.path.exists(path_fixed):
|
|
||||||
try:
|
|
||||||
with open(path_fixed) as jf:
|
|
||||||
job_data = json.load(jf)
|
|
||||||
return (jid, job_data.get('status', 'unknown'))
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Try audit log status.json
|
|
||||||
path_audit = os.path.join('/home/godopu16/PuKi/laa/canary_projects/advanced_multi_agent', '.hermes', 'delegate_job_logs', jid, 'status.json')
|
|
||||||
if os.path.exists(path_audit):
|
|
||||||
try:
|
|
||||||
with open(path_audit) as jf:
|
|
||||||
job_data = json.load(jf)
|
|
||||||
return (jid, job_data.get('status', 'unknown'))
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return (jid, 'unknown')
|
return (jid, 'unknown')
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user