refactor(skills): cleanup dead code + full workflow A→B→C→D integration

Cleanup:
- Remove unused validate_yaml() helper from lib.sh
- Remove USER_MANUAL.html + mqtt-broker-setup.html (no refs found)

Workflow A (create_session ↔ delegate-job):
- Add --submit-job <prompt> option to create_session.sh
- Auto-register session in delegate-job registry, store delegate_job_id in YAML

Workflow B (push-based monitor):
- Migrate reconcile.sh to MQTT subscriber mode (polling fallback preserved)

Workflow C (unified status):
- status.sh now shows session + delegate-job state in single column

Workflow D (audit log + perms):
- JSON job files chmod 600
- create/delete/resume now publish lifecycle events to delegate-job
This commit is contained in:
2026-06-19 14:27:29 +00:00
parent 97f649a3e1
commit 0eb1d94a9c
15 changed files with 335 additions and 3688 deletions
@@ -21,18 +21,141 @@ STATE_DIR="${AGENT_SESSIONS_STATE_DIR:-$HOME/.cache/agent-sessions-monitor}"
ONCE=0
EMIT_DIFF=0
DRY_RUN=0
SUBSCRIBE=0
while [ $# -gt 0 ]; do
case "$1" in
--once) ONCE=1; shift ;;
--emit-diff) EMIT_DIFF=1; shift ;;
--dry-run) DRY_RUN=1; shift ;;
-h|--help) echo "Usage: $0 [--once] [--emit-diff] [--dry-run]"; exit 0 ;;
--subscribe) SUBSCRIBE=1; shift ;;
-h|--help) echo "Usage: $0 [--once] [--emit-diff] [--dry-run] [--subscribe]"; exit 0 ;;
*) echo "ERROR: unknown arg: $1" >&2; exit 2 ;;
esac
done
[ -f "$AGENT_SESSIONS_YAML" ] || { echo "ERROR: $AGENT_SESSIONS_YAML not found" >&2; exit 1; }
if [ "$SUBSCRIBE" = "1" ]; then
SUBSCRIBE_MODE=1 env_python "$AGENT_SESSIONS_YAML" <<'PYEOF'
import os, sys, json, fcntl, tempfile, subprocess
from datetime import datetime, timezone
import yaml
yaml_path = os.environ['YAML_PATH']
home = os.environ['HOME_DIR']
# Add skills/delegate-job/scripts to path to import mqtt_common
script_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
path_candidate = os.path.join('/home/godopu16/PuKi/laa/canary_projects/advanced_multi_agent', 'skills', 'delegate-job', 'scripts')
if os.path.isdir(path_candidate):
sys.path.append(path_candidate)
else:
d = script_dir
found = False
while d != '/' and d:
p = os.path.join(d, 'skills', 'delegate-job', 'scripts')
if os.path.isdir(p):
sys.path.append(p)
found = True
break
p2 = os.path.join(d, 'delegate-job', 'scripts')
if os.path.isdir(p2):
sys.path.append(p2)
found = True
break
d = os.path.dirname(d)
import mqtt_common
cfg = mqtt_common.broker_config_from_env()
client = mqtt_common.make_client("monitor_sub", cfg)
def on_message(client, userdata, msg):
try:
payload = json.loads(msg.payload.decode("utf-8"))
jid = payload.get("job_id")
event = payload.get("event")
if not jid or not event:
return
if event in ("completed", "error"):
print(f"MQTT Monitor: received terminal event {event} for job {jid}", flush=True)
update_session_by_job(jid, event)
except Exception as e:
print(f"MQTT Monitor error parsing message: {e}", flush=True)
def update_session_by_job(jid, event):
lock_path = yaml_path + '.lock'
lock_fh = open(lock_path, 'w')
fcntl.flock(lock_fh, fcntl.LOCK_EX)
try:
if os.path.exists(yaml_path):
with open(yaml_path) as f:
d_local = yaml.safe_load(f) or {}
else:
d_local = {}
sessions = d_local.setdefault('tmux_sessions', [])
updated = False
for s in sessions:
if s.get('delegate_job_id') == jid and s.get('status') == 'running':
s['status'] = 'terminated'
now_iso = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
s['terminated_at'] = now_iso
s['terminated_at_epoch'] = int(datetime.now(timezone.utc).timestamp())
s['termination_mode'] = f"auto-detected (MQTT {event})"
name = s.get('name')
srv = s.get('tmux_server') or 'default'
kill_tmux_session(name, srv)
updated = True
if updated:
dir_ = os.path.dirname(yaml_path) or '.'
fd, tmp = tempfile.mkstemp(dir=dir_, prefix='.agent-sessions.', suffix='.tmp')
try:
with os.fdopen(fd, 'w') as f:
yaml.safe_dump(d_local, f, default_flow_style=False, sort_keys=False,
allow_unicode=True, width=4096)
os.replace(tmp, yaml_path)
print(f"MQTT Monitor: updated YAML for job {jid} to terminated", flush=True)
except Exception as e:
if os.path.exists(tmp):
os.remove(tmp)
print(f"MQTT Monitor error writing YAML: {e}", flush=True)
finally:
fcntl.flock(lock_fh, fcntl.LOCK_UN)
lock_fh.close()
def kill_tmux_session(name, srv):
try:
cmd = ['tmux']
if srv != 'default':
cmd += ['-L', srv]
cmd += ['kill-session', '-t', name]
subprocess.run(cmd, capture_output=True)
print(f"MQTT Monitor: killed tmux session {name} on server {srv}", flush=True)
except Exception as e:
print(f"MQTT Monitor error killing tmux: {e}", flush=True)
client.on_message = on_message
def on_connect(_c, _u, _flags, reason_code, _props):
rc = mqtt_common.reason_code_value(reason_code)
if rc == 0:
_c.subscribe("python/mqtt/jobs/+/events", qos=1)
print("MQTT Monitor: subscribed to python/mqtt/jobs/+/events", flush=True)
else:
print(f"MQTT Monitor connection failed: {rc}", flush=True)
client.on_connect = on_connect
print(f"MQTT Monitor: connecting to {cfg.host}:{cfg.port} (TLS={cfg.tls})...", flush=True)
client.connect(cfg.host, cfg.port, cfg.keepalive)
client.loop_forever()
PYEOF
exit 0
fi
mkdir -p "$STATE_DIR"
# 모든 비교 로직을 단일 소스로 둔다. dry-run 은 env_python(읽기전용), 그 외엔