mam 설치

2026-06-25 12:19:24 +09:00
parent 06a95a6d5b
commit b76249a2a6
25 changed files with 5780 additions and 0 deletions
@@ -0,0 +1,787 @@
 #!/usr/bin/env bash
 # lib.sh — shared library for the multi-agent-mux-* skills.
 #
 # Single source of truth for the four things that were inconsistently
 # re-implemented across create/resume/delete/monitor (REVIEW.md §4.1):
 #   - derive_session_name : the tmux session slug                               (P0-A)
 #   - atomic_dump_yaml     : SQLite db transaction + temp+rename + .bak + validate (P0-B)
 #   - env_python           : env-safe Python (no heredoc injection)             (P0-B / P1-B)
 #   - find_workspace_uuid  : workspace-SCOPED resume id lookup                 (P0-C)
 #
 # Source it from each script with a path computed from the script location:
 #   source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
 #
 # HARD RULE: the agent-sessions.yaml file is only ever written through
 # atomic_dump_yaml. Never `open(yaml_path, 'w')` anywhere else.
 # Directory that contains this library, resolved from its own source path.
 SKILL_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 # Workspace root: two directory levels above SKILL_DIR.
 WORKSPACE_ROOT="$(cd "$SKILL_DIR/../.." && pwd)"
 # Each setting below keeps a non-empty value that is already present (for
 # example from the environment) and otherwise takes the default shown.
 # AGENT_SESSIONS_YAML and HOME_DIR default to workspace-relative locations;
 # CLAUDE_PROJECT_DIR and LOCAL_BIN default to locations under $HOME.
 : "${AGENT_SESSIONS_YAML:=$WORKSPACE_ROOT/.mam/agent-sessions.yaml}"
 : "${HOME_DIR:=$WORKSPACE_ROOT}"
 : "${CLAUDE_PROJECT_DIR:=$HOME/.claude/projects}"
 : "${LOCAL_BIN:=$HOME/.local/bin}"
 # ---------------------------------------------------------------------------
 # Tmux Server Isolation support
 # ---------------------------------------------------------------------------
 # A PATH entry (or a cached tmux path) containing either of these substrings
 # is treated as a shim/wrapper location and skipped when the real tmux binary
 # is resolved. Both can be overridden by presetting the variable.
 : "${_TMUX_SHIM_DIR_PATTERN:=/multi-agent-tmux-shim/}"
 : "${_TMUX_SKILLS_BIN_PATTERN:=/.agents/skills/.bin}"
 # Name of the tmux server to talk to; "default" (or empty) means that no
 # `-L <name>` option is added to tmux invocations.
 : "${TMUX_SERVER_NAME:=default}"
 # _resolve_real_tmux_path
 #
 # Make sure _REAL_TMUX_PATH names the real tmux binary.
 #   - A value that is already set and contains neither shim/wrapper substring
 #     is kept untouched.
 #   - Otherwise PATH is scanned left to right; shim/wrapper directories are
 #     skipped and the first executable regular file "<dir>/tmux" wins.
 #   - When nothing is found the bare command name "tmux" is stored as a last
 #     resort.
 # After a (re)scan the variable is exported so child processes reuse it.
 #
 # PATH is split with parameter expansion instead of an unquoted `$PATH`
 # expansion: entries are therefore used literally (no pathname expansion of
 # entries that contain glob characters) and IFS is never modified. Empty PATH
 # entries are ignored.
 _resolve_real_tmux_path() {
  local current="${_REAL_TMUX_PATH:-}"
  if [ -n "$current" ] \
    && [[ "$current" != *"${_TMUX_SHIM_DIR_PATTERN}"* ]] \
    && [[ "$current" != *"${_TMUX_SKILLS_BIN_PATTERN}"* ]]; then
    return 0
  fi

  # The trailing ':' guarantees that every entry, including the last one, is
  # terminated, so the loop below always makes progress.
  local remaining="${PATH:-}:" dir
  _REAL_TMUX_PATH=""
  while [ -n "$remaining" ]; do
    dir="${remaining%%:*}"
    remaining="${remaining#*:}"
    [ -n "$dir" ] || continue
    if [[ "$dir" == *"${_TMUX_SHIM_DIR_PATTERN}"* ]] || [[ "$dir" == *"${_TMUX_SKILLS_BIN_PATTERN}"* ]]; then
      continue
    fi
    if [ -f "$dir/tmux" ] && [ -x "$dir/tmux" ]; then
      _REAL_TMUX_PATH="$dir/tmux"
      break
    fi
  done
  if [ -z "$_REAL_TMUX_PATH" ]; then
    _REAL_TMUX_PATH="tmux"
  fi
  export _REAL_TMUX_PATH
 }
 # _init_tmux_isolation
 #
 # Bring PATH in line with the current TMUX_SERVER_NAME.
 #   - Non-default server name: install a small `tmux` wrapper script under
 #     ${TMPDIR:-/tmp}<shim pattern><server name>/ and put that directory first
 #     on PATH, so child processes that call plain `tmux` get
 #     `-L <TMUX_SERVER_NAME>` added.
 #   - Empty or "default": remove every shim/wrapper directory from PATH.
 #
 # Details worth knowing:
 #   - The wrapper is (re)written whenever its directory is not yet on PATH,
 #     and also when the directory is on PATH but the wrapper file is missing
 #     or not executable (for example after a tmp cleaner removed it).
 #   - The real tmux path is embedded with printf %q, so unusual characters in
 #     the path cannot break the generated script.
 #   - No wrapper is installed when _REAL_TMUX_PATH is a bare command name
 #     (no '/'): such a wrapper would find itself through PATH and exec itself
 #     again and again.
 #   - PATH is split with parameter expansion, so entries are taken literally
 #     (no pathname expansion) and IFS is never modified.
 _init_tmux_isolation() {
  _resolve_real_tmux_path
  if [ -n "${TMUX_SERVER_NAME:-}" ] && [ "$TMUX_SERVER_NAME" != "default" ]; then
    # A bare name means the resolver found no real binary; skip the shim.
    case "${_REAL_TMUX_PATH:-}" in
      */*) ;;
      *) return 0 ;;
    esac
    local wrapper_dir="${TMPDIR:-/tmp}${_TMUX_SHIM_DIR_PATTERN}${TMUX_SERVER_NAME}"
    local wrapper="$wrapper_dir/tmux"
    local on_path=0 real_quoted
    if [[ ":$PATH:" == *":$wrapper_dir:"* ]]; then
      on_path=1
    fi
    if [ "$on_path" -eq 0 ] || [ ! -x "$wrapper" ]; then
      mkdir -p "$wrapper_dir"
      printf -v real_quoted '%q' "$_REAL_TMUX_PATH"
      # The wrapper decides at run time (from its own environment) whether to
      # add -L, mirroring the logic of _tmux.
      {
        printf '%s\n' '#!/usr/bin/env bash'
        printf '%s\n' 'if [ -z "${TMUX_SERVER_NAME:-}" ] || [ "$TMUX_SERVER_NAME" = "default" ]; then'
        printf '  exec %s "$@"\n' "$real_quoted"
        printf '%s\n' 'else'
        printf '  exec %s -L "$TMUX_SERVER_NAME" "$@"\n' "$real_quoted"
        printf '%s\n' 'fi'
      } > "$wrapper"
      chmod +x "$wrapper"
    fi
    if [ "$on_path" -eq 0 ]; then
      export PATH="$wrapper_dir:$PATH"
    fi
  else
    # Isolation disabled: clean up automatically by dropping shim/wrapper
    # directories from PATH.
    local new_path="" dir remaining="${PATH:-}:"
    while [ -n "$remaining" ]; do
      dir="${remaining%%:*}"
      remaining="${remaining#*:}"
      if [[ "$dir" == *"${_TMUX_SHIM_DIR_PATTERN}"* ]] || [[ "$dir" == *"${_TMUX_SKILLS_BIN_PATTERN}"* ]]; then
        continue
      fi
      if [ -z "$new_path" ]; then
        new_path="$dir"
      else
        new_path="$new_path:$dir"
      fi
    done
    export PATH="$new_path"
  fi
 }
 # _tmux [args...]
 #
 # Run the real tmux binary with the given arguments, adding
 # `-L <TMUX_SERVER_NAME>` when a non-default server name is configured.
 # _init_tmux_isolation runs first, so _REAL_TMUX_PATH is resolved and PATH
 # matches the current isolation setting. The exit status is that of tmux.
 #
 # The binary is started through the `command` builtin, which skips shell
 # function lookup. _REAL_TMUX_PATH may be the bare name "tmux" (the
 # resolver's last resort); invoked directly, that name would resolve to the
 # tmux() function below and recurse without end.
 _tmux() {
  _init_tmux_isolation
  if [ -z "${TMUX_SERVER_NAME:-}" ] || [ "$TMUX_SERVER_NAME" = "default" ]; then
    command "$_REAL_TMUX_PATH" "$@"
  else
    command "$_REAL_TMUX_PATH" -L "$TMUX_SERVER_NAME" "$@"
  fi
 }
 # tmux [args...]
 #
 # Shadows the tmux command inside any shell that sources this library, so a
 # plain `tmux ...` call is routed through _tmux.
 tmux() {
  _tmux "$@"
 }
 # ---------------------------------------------------------------------------
 # resolve_tmux_server <session_name>
 #
 # Print, on stdout, the tmux server name recorded for <session_name>.
 #
 # Lookup order (implemented by the embedded Python):
 #   1. If the SQLite file next to the YAML (<yaml path without extension>.db)
 #      exists: the `sessions` row with that name, then the `tmux_sessions`
 #      list stored in the `state` row with id=1. The YAML file is not read in
 #      this case.
 #   2. Otherwise, if agent-sessions.yaml exists: its `tmux_sessions` list.
 #   3. Fallback: the TMUX_SERVER_NAME environment variable as seen by the
 #      Python process, or 'default' when it is not in the environment.
 # Only a non-empty `tmux_server` field counts as a hit. An
 # sqlite3.OperationalError from the `sessions` query (e.g. the table does not
 # exist) is ignored; any other exception is swallowed and leads to the
 # fallback.
 # ---------------------------------------------------------------------------
 resolve_tmux_server() {
  local session_name="$1"
  # The session name travels through the environment (SESSION_NAME) and the
  # heredoc delimiter is quoted, so the shell never expands anything inside
  # the Python source.
  SESSION_NAME="$session_name" env_python "$AGENT_SESSIONS_YAML" <<'PYEOF'
 import os, sys, sqlite3, json, yaml
 name = os.environ['SESSION_NAME']
 yaml_path = os.environ['YAML_PATH']
 db_path = os.path.splitext(yaml_path)[0] + '.db'
 try:
    if os.path.exists(db_path):
        conn = sqlite3.connect(db_path, timeout=10.0)
        try:
            row = conn.execute('SELECT data FROM sessions WHERE name=?', (name,)).fetchone()
            if row:
                s = json.loads(row[0])
                server = s.get('tmux_server')
                if server:
                    print(server)
                    sys.exit(0)
        except sqlite3.OperationalError:
            pass
        row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
        if row:
            d = json.loads(row[0])
            for s in d.get('tmux_sessions', []):
                if s.get('name') == name:
                    server = s.get('tmux_server')
                    if server:
                        print(server)
                        sys.exit(0)
        conn.close()
    elif os.path.exists(yaml_path):
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
        for s in d.get('tmux_sessions', []):
            if s.get('name') == name:
                server = s.get('tmux_server')
                if server:
                    print(server)
                    sys.exit(0)
 except Exception:
    pass
 # Fallback
 print(os.environ.get('TMUX_SERVER_NAME', 'default'))
 PYEOF
 }
 # ---------------------------------------------------------------------------
 # derive_session_name <workspace> <agent>
 #
 # Canonical builder of the tmux session name. Prints
 #   "<slug>-creator-<agent>"
 # on stdout (no trailing newline), where <slug> is made from the last two
 # path components of the workspace: joined with '-', lowercased, '_' turned
 # into '-', and every character outside [a-zA-Z0-9-] removed.
 #
 # The workspace is made absolute with `cd && pwd` when the directory can be
 # entered; otherwise the argument is used as given. A missing parent
 # component becomes "workspace" and a missing last component becomes "root".
 #
 #   Example:
 #     $WORKSPACE_ROOT/landing_page/refer_landing_page + claude
 #       -> landing-page-refer-landing-page-creator-claude
 #
 # Decision (REVIEW P0-A): the workspace basename (refer_landing_page) IS part
 # of the name. The hand-written historical entry that dropped it
 # (lab-landing-page-creator-claude) was the bug, not the convention.
 # Every script and SKILL.md must use exactly this rule.
 # ---------------------------------------------------------------------------
 derive_session_name() {
  local workspace="$1" agent="$2"
  local resolved parent_part leaf_part
  if ! resolved="$(cd "$workspace" 2>/dev/null && pwd)"; then
    resolved="$workspace"
  fi
  parent_part="$(basename "$(dirname "$resolved")" 2>/dev/null || echo "")"
  leaf_part="$(basename "$resolved" 2>/dev/null || echo "root")"
  # Components that carry no usable name are replaced by fixed placeholders.
  case "$parent_part" in
    "" | "/" | ".") parent_part="workspace" ;;
  esac
  case "$leaf_part" in
    "" | "/" | ".") leaf_part="root" ;;
  esac
  printf '%s-creator-%s' \
    "$(printf '%s-%s' "$parent_part" "$leaf_part" \
      | tr '[:upper:]' '[:lower:]' \
      | tr '_' '-' \
      | tr -cd 'a-zA-Z0-9-')" \
    "$agent"
 }
 # ---------------------------------------------------------------------------
 # env_python <yaml_path> [KEY=VALUE ...]   (Python source read from stdin)
 #
 # Run python3 with the source supplied on stdin via a *quoted* heredoc, so the
 # shell never interpolates the source. All values are passed through the
 # environment (YAML_PATH plus any KEY=VALUE pairs). Untrusted data (workspace
 # paths, capture-pane text) must travel as env vars and be read via os.environ
 # inside the script — never spliced into the source. Read-only by convention;
 # use atomic_dump_yaml when you need to write the YAML.
 # ---------------------------------------------------------------------------
 # _validate_env_key <name>
 #
 # Decide whether <name> may be used as the KEY of a caller-supplied KEY=VALUE
 # pair. Returns 0 when it is acceptable. Returns 1, after printing an error
 # on stderr, when the name is not a plain identifier (letters, digits and
 # '_', not starting with a digit) or when it is one of the loader / Python
 # start-up variables listed below.
 _validate_env_key() {
  local candidate="$1"
  local identifier_re='^[a-zA-Z_][a-zA-Z0-9_]*$'
  if ! [[ "$candidate" =~ $identifier_re ]]; then
    echo "ERROR: Invalid environment variable name: $candidate" >&2
    return 1
  fi
  local denied
  for denied in LD_PRELOAD LD_LIBRARY_PATH PYTHONPATH PYTHONHOME PYTHONINSPECT PYTHONSTARTUP; do
    if [ "$candidate" = "$denied" ]; then
      echo "ERROR: Blocked environment variable: $candidate" >&2
      return 1
    fi
  done
  return 0
 }
 # env_python <yaml_path> [KEY=VALUE ...] [args...]
 #
 # Run `python3 -` (program text read from stdin) with a controlled set of
 # extra environment variables: YAML_PATH, HOME_DIR, CLAUDE_PROJECT_DIR and
 # LOCAL_BIN, followed by the caller's leading KEY=VALUE arguments. Each KEY
 # is checked with _validate_env_key; a rejected key makes the function
 # return 1 before Python is started. The first argument without '=' ends the
 # KEY=VALUE list; it and everything after it are passed to the Python
 # program as its arguments. The exit status is otherwise that of python3.
 env_python() {
  local yaml_path="$1"
  shift
  local -a child_env=(
    "YAML_PATH=$yaml_path"
    "HOME_DIR=$HOME_DIR"
    "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR"
    "LOCAL_BIN=$LOCAL_BIN"
  )
  local pair
  while [ $# -gt 0 ] && [[ "$1" == *=* ]]; do
    pair="$1"
    _validate_env_key "${pair%%=*}" || return 1
    child_env+=("$pair")
    shift
  done
  env "${child_env[@]}" python3 - "$@"
 }
 # ---------------------------------------------------------------------------
 # atomic_dump_yaml <yaml_path> [KEY=VALUE ...]   (mutation source from stdin)
 #
 # The ONLY sanctioned way to write agent-sessions.yaml. It:
 #   1. takes an exclusive SQLite BEGIN IMMEDIATE transaction lock on
 #      agent-sessions.db (serialises all writers)
 #   2. loads the current state into `d` (seeds from YAML if DB is empty)
 #   3. exec()s the caller's mutation source (sees d, yaml, os, datetime,
 #      timezone, glob, subprocess; reads values via os.environ). The mutation
 #      may print and may `raise SystemExit(n)` to abort *without* writing.
 #   4. validates the resulting schema
 #   5. backs up to <yaml_path>.bak, then writes YAML atomically (temp + os.replace)
 #      when a session transitions to a finished state.
 #
 # The mutation source is passed via env and exec()'d — it is never string
 # spliced and untrusted data never lands in Python source (P0-B / P1-B).
 # ---------------------------------------------------------------------------
 # _check_is_nfs <path>
 #
 # Return 0 when <path> lives on a network filesystem (NFS, CIFS/SMB, SSHFS)
 # and 1 otherwise, including when the mount point or its type cannot be
 # determined. File locking behaves differently on such filesystems.
 #
 # How it works:
 #   1. Ask df for the mount point of <path>: GNU `--output=target` first,
 #      then POSIX `df -P` for platforms whose df lacks that option.
 #   2. Look for the `mount` line whose mount point is EXACTLY that directory
 #      and read the filesystem type from it. Both the Linux layout
 #      ("src on /mnt type nfs4 (opts)") and the BSD/macOS layout
 #      ("src on /mnt (nfs, opts)") are understood. When several lines match
 #      (stacked mounts) the last one wins.
 # The mount point is compared as a literal string, never as a regular
 # expression, so a mount point such as "/" cannot match unrelated lines and a
 # network mount elsewhere on the system does not cause a false positive.
 _check_is_nfs() {
  local f="$1"
  local mountpoint="" fstype="" line rest
  mountpoint="$(df --output=target "$f" 2>/dev/null | tail -n 1)"
  if [ -z "$mountpoint" ]; then
    # POSIX format: the mount point is the 6th column and may contain spaces.
    mountpoint="$(df -P "$f" 2>/dev/null \
      | awk 'NR == 2 { for (i = 6; i <= NF; i++) printf "%s%s", (i > 6 ? " " : ""), $i }')"
  fi
  [ -n "$mountpoint" ] || return 1

  while IFS= read -r line; do
    case "$line" in
      *" on $mountpoint type "*)
        rest="${line##*" on $mountpoint type "}"
        fstype="${rest%% *}"
        ;;
      *" on $mountpoint ("*)
        rest="${line##*" on $mountpoint ("}"
        fstype="${rest%%[,)]*}"
        ;;
    esac
  done < <(mount 2>/dev/null)

  case "$fstype" in
    nfs | nfs[0-9]* | cifs | smb3 | smbfs | fuse.sshfs) return 0 ;;
  esac
  return 1
 }
 # atomic_dump_yaml <yaml_path> [KEY=VALUE ...]   (mutation source on stdin)
 #
 # Shell side:
 #   - decides, unless MAM_IS_NFS is already set, whether the directory of
 #     <yaml_path> is on a network filesystem and exports the answer as
 #     MAM_IS_NFS=true|false (a warning is printed on stderr when true);
 #   - collects the leading KEY=VALUE arguments (each KEY checked with
 #     _validate_env_key) into the environment of the Python process;
 #   - reads the mutation source from stdin and hands it to Python in the
 #     AGENT_SESSIONS_MUTATION environment variable.
 # Python side (quoted heredoc, never expanded by the shell):
 #   - opens <yaml path without extension>.db; journal_mode is DELETE when
 #     MAM_IS_NFS is 'true', WAL otherwise;
 #   - BEGIN IMMEDIATE, then creates the `state` / `sessions` tables and the
 #     pane_cwd index if they do not exist;
 #   - loads `d` from the state row with id=1, or seeds it from the YAML file
 #     when there is no such row, and replaces d['tmux_sessions'] with the
 #     rows of the sessions table when that table has rows;
 #   - exec()s the mutation in module scope, validates `d`, rewrites both
 #     tables from `d` and commits;
 #   - only when the name->status map of stopped/terminated/archived sessions
 #     differs from the one taken before the mutation: copies the YAML to
 #     <yaml_path>.bak (failures ignored), replaces the YAML through a temp
 #     file + os.replace, then attempts a WAL checkpoint;
 #   - always closes the connection and tries to chmod the db, -wal and -shm
 #     files to 0600.
 # Returns 1 without starting Python when a KEY is rejected; otherwise the
 # exit status is that of the python3 process.
 atomic_dump_yaml() {
  local yaml_path="$1"; shift
  # Probe the filesystem only once: the exported MAM_IS_NFS value is reused by
  # later calls in this shell and inherited by child processes.
  if [ -z "${MAM_IS_NFS:-}" ]; then
    if _check_is_nfs "$(dirname "$yaml_path")"; then
      export MAM_IS_NFS="true"
      echo "WARNING: $(dirname "$yaml_path") appears to be a network filesystem (NFS/CIFS/SSHFS)." >&2
      echo "WARNING: SQLite journal_mode automatically falls back to DELETE." >&2
    else
      export MAM_IS_NFS="false"
    fi
  fi
  # Fixed variables first; caller-supplied pairs are appended after them.
  local -a envs=("YAML_PATH=$yaml_path" "HOME_DIR=$HOME_DIR" "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR" "LOCAL_BIN=$LOCAL_BIN" "MAM_IS_NFS=$MAM_IS_NFS")
  # Consume leading KEY=VALUE arguments; the first argument without '=' ends
  # the list.
  while [ $# -gt 0 ]; do
    case "$1" in
      *=*)
        local key="${1%%=*}"
        _validate_env_key "$key" || return 1
        envs+=("$1")
        shift
        ;;
      *)
        break
        ;;
    esac
  done
  # python3's stdin is taken by the heredoc below, so the caller's mutation
  # source is read here and passed through the environment instead.
  local mutation; mutation="$(cat)"
  env "${envs[@]}" AGENT_SESSIONS_MUTATION="$mutation" python3 - <<'PYEOF'
 import os, sys, tempfile, shutil, glob, subprocess, json, sqlite3
 from datetime import datetime, timezone
 import yaml
 yaml_path = os.environ['YAML_PATH']
 db_path = os.path.splitext(yaml_path)[0] + '.db'
 def _validate(d):
    if not isinstance(d, dict):
        raise SystemExit("VALIDATE: top-level is not a mapping")
    sessions = d.get('tmux_sessions', [])
    if not isinstance(sessions, list):
        raise SystemExit("VALIDATE: tmux_sessions is not a list")
    valid = {'running', 'terminated', 'archived', 'stopped'}
    for i, s in enumerate(sessions):
        if not isinstance(s, dict):
            raise SystemExit(f"VALIDATE: tmux_sessions[{i}] not a mapping")
        if not s.get('name') or not s.get('status'):
            raise SystemExit(f"VALIDATE: tmux_sessions[{i}] missing name/status")
        if s['status'] not in valid:
            raise SystemExit(f"VALIDATE: tmux_sessions[{i}] {s.get('name')!r} bad status {s['status']!r}")
        if not isinstance(s.get('pane'), dict):
            raise SystemExit(f"VALIDATE: tmux_sessions[{i}] {s.get('name')!r} missing pane")
 def get_terminal_set(d):
    return {s.get('name'): s.get('status') for s in d.get('tmux_sessions', []) if s.get('status') in ('stopped', 'terminated', 'archived')}
 os.makedirs(os.path.dirname(db_path) or '.', exist_ok=True)
 conn = sqlite3.connect(db_path, timeout=60.0)
 for f in [db_path, db_path + '-wal', db_path + '-shm']:
    if os.path.exists(f):
        try:
            os.chmod(f, 0o600)
        except Exception:
            pass
 is_nfs = os.environ.get('MAM_IS_NFS') == 'true'
 if is_nfs:
    conn.execute('PRAGMA journal_mode=DELETE')
 else:
    conn.execute('PRAGMA journal_mode=WAL')
 try:
    # Disable auto-commit by explicitly starting a transaction with BEGIN IMMEDIATE
    # This prevents the read-modify-write lost update race condition.
    conn.execute('BEGIN IMMEDIATE')
    conn.execute('CREATE TABLE IF NOT EXISTS state (id INTEGER PRIMARY KEY, data TEXT)')
    conn.execute('CREATE TABLE IF NOT EXISTS sessions (name TEXT PRIMARY KEY, status TEXT, pane_cwd TEXT, data JSON)')
    conn.execute('CREATE INDEX IF NOT EXISTS idx_sessions_pane_cwd ON sessions(pane_cwd)')
    row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
    if row:
        d = json.loads(row[0])
    else:
        # Seed from YAML
        if os.path.exists(yaml_path):
            with open(yaml_path) as f:
                d = yaml.safe_load(f) or {}
        else:
            d = {}
    # Assemble d['tmux_sessions'] from sessions table if table contains data
    db_sessions = []
    cursor = conn.execute('SELECT name, status, pane_cwd, data FROM sessions')
    for s_row in cursor.fetchall():
        s_data = json.loads(s_row[3])
        s_data['name'] = s_row[0]
        s_data['status'] = s_row[1]
        if 'pane' not in s_data:
            s_data['pane'] = {}
        s_data['pane']['cwd'] = s_row[2]
        db_sessions.append(s_data)
    if db_sessions:
        d['tmux_sessions'] = db_sessions
    elif 'tmux_sessions' not in d:
        d['tmux_sessions'] = []
    old_terminals = get_terminal_set(d)
    # --- caller mutation (module scope: sees d, yaml, os, glob, subprocess) ---
    exec(compile(os.environ['AGENT_SESSIONS_MUTATION'], '<mutation>', 'exec'), globals())
    _validate(d)
    # Separate globals and sessions for normalization
    d_state = {k: v for k, v in d.items() if k != 'tmux_sessions'}
    conn.execute('REPLACE INTO state (id, data) VALUES (1, ?)', (json.dumps(d_state),))
    current_names = []
    for s in d.get('tmux_sessions', []):
        name = s.get('name')
        status = s.get('status')
        pane_cwd = (s.get('pane') or {}).get('cwd', '')
        conn.execute('REPLACE INTO sessions (name, status, pane_cwd, data) VALUES (?, ?, ?, ?)',
                     (name, status, pane_cwd, json.dumps(s)))
        current_names.append(name)
    if current_names:
        placeholders = ','.join('?' for _ in current_names)
        conn.execute(f'DELETE FROM sessions WHERE name NOT IN ({placeholders})', current_names)
    else:
        conn.execute('DELETE FROM sessions')
    new_terminals = get_terminal_set(d)
    conn.commit()
    # Write to YAML ONLY when a session transitions to a finished state
    # (Moved after conn.commit() per Claude's feedback)
    if new_terminals != old_terminals:
        if os.path.exists(yaml_path):
            try:
                shutil.copy2(yaml_path, yaml_path + '.bak')
            except Exception:
                pass
        dir_ = os.path.dirname(yaml_path) or '.'
        fd, tmp = tempfile.mkstemp(dir=dir_, prefix='.agent-sessions.', suffix='.tmp')
        try:
            with os.fdopen(fd, 'w') as f:
                yaml.safe_dump(d, f, default_flow_style=False, sort_keys=False,
                               allow_unicode=True, width=4096)
            os.replace(tmp, yaml_path)
        except Exception:
            if os.path.exists(tmp):
                os.remove(tmp)
            raise
        try:
            conn.execute('PRAGMA wal_checkpoint(TRUNCATE)')
        except Exception:
            pass
 except Exception:
    conn.rollback()
    raise
 finally:
    conn.close()
    # H3: Re-apply chmod 0600 after close to cover newly created -wal / -shm files
    try:
        os.chmod(db_path, 0o600)
        wal = db_path + '-wal'
        if os.path.exists(wal): os.chmod(wal, 0o600)
        shm = db_path + '-shm'
        if os.path.exists(shm): os.chmod(shm, 0o600)
    except Exception:
        pass
 PYEOF
 }
 # ---------------------------------------------------------------------------
 # find_workspace_uuid <workspace> <agent>
 #
 # Workspace-SCOPED resolution of the resume UUID (P0-C). <agent> is one of
 # claude, agy or hermes. An id is only returned when it is tied to THIS
 # workspace; in particular a cached agent_identities id is used only when it
 # comes from the cache entry whose project_cwd equals this workspace.
 # Resolution order:
 #   1) session rows whose pane.cwd == this workspace -> per-row own id
 #      (claude_session_id_own / agy_conversation_id_own /
 #       hermes_conversation_id_own)
 #   2) on-disk scan scoped to this workspace
 #      (claude: <CLAUDE_PROJECT_DIR>/<key>/*.jsonl, newest first ;
 #       agy: last_conversations.json[cwd] ;
 #       hermes: newest `sessions` row for this cwd in <HOME_DIR>/.mam/state.db)
 #   3) agent_identities[<agent>], ONLY when its project_cwd == this workspace
 # Every candidate except the hermes tier-2 hit is also checked for an
 # existing backing file / database row before it is printed.
 # Prints the UUID on stdout (empty line if none) and exits 0 whenever the
 # embedded script runs to completion.
 # ---------------------------------------------------------------------------
 find_workspace_uuid() {
  local workspace="$1" agent="$2"
  # Use the absolute path reported by pwd; fall back to the argument as given
  # when the directory cannot be entered.
  local abs; abs="$(cd "$workspace" 2>/dev/null && pwd)" || abs="$workspace"
  WS_ABS="$abs" AGENT="$agent" env_python "$AGENT_SESSIONS_YAML" <<'PYEOF'
 import os, json, glob, sqlite3
 import yaml
 ws = os.environ['WS_ABS']
 agent = os.environ['AGENT']
 home = os.environ['HOME_DIR']
 yaml_path = os.environ['YAML_PATH']
 db_path = os.path.splitext(yaml_path)[0] + '.db'
 claude_project_dir = os.environ.get('CLAUDE_PROJECT_DIR', f"{home}/.claude/projects")
 # --- existence checks for a candidate id, one per agent kind ---
 def jsonl_exists(uuid):
    key = ws.replace('/', '-').replace('_', '-')
    return os.path.exists(f"{claude_project_dir}/{key}/{uuid}.jsonl")
 def db_exists(uuid):
    return os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{uuid}.db")
 def hermes_exists(uuid):
    hdb = f"{home}/.mam/state.db"
    if not os.path.exists(hdb):
        return False
    try:
        conn = sqlite3.connect(hdb)
        r = conn.execute("SELECT 1 FROM sessions WHERE id=?", (uuid,)).fetchone()
        conn.close()
        return r is not None
    except Exception:
        return False
 def emit(u):
    # Print the winning id and stop; later tiers are not consulted.
    print(u)
    raise SystemExit(0)
 # 1) per-row own id for THIS workspace (optimized with direct sqlite query if db exists)
 sessions = []
 try:
    if os.path.exists(db_path):
        conn = sqlite3.connect(db_path, timeout=10.0)
        has_sessions_table = False
        try:
            cursor = conn.execute('SELECT data FROM sessions WHERE pane_cwd=?', (ws,))
            for row in cursor.fetchall():
                sessions.append(json.loads(row[0]))
            has_sessions_table = True
        except sqlite3.OperationalError:
            pass
        # No usable sessions table, or no row for this cwd: look at the
        # tmux_sessions list stored inside the state row instead.
        if not has_sessions_table or not sessions:
            row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
            if row:
                d = json.loads(row[0])
                for s in d.get('tmux_sessions', []):
                    if isinstance(s, dict) and (s.get('pane') or {}).get('cwd') == ws:
                        sessions.append(s)
        conn.close()
    elif os.path.exists(yaml_path):
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
        for s in d.get('tmux_sessions', []):
            if isinstance(s, dict) and (s.get('pane') or {}).get('cwd') == ws:
                sessions.append(s)
 except Exception:
    pass
 for s in sessions:
    name = s.get('name', '')
    if agent == 'claude' and name.endswith('-creator-claude'):
        cand = s.get('claude_session_id_own')
        if cand and jsonl_exists(cand):
            emit(cand)
    if agent == 'agy' and name.endswith('-creator-agy'):
        cand = s.get('agy_conversation_id_own')
        if cand and db_exists(cand):
            emit(cand)
    if agent == 'hermes' and name.endswith('-creator-hermes'):
        cand = s.get('hermes_conversation_id_own')
        if cand and hermes_exists(cand):
            emit(cand)
 # 2) disk scan scoped to THIS workspace
 if agent == 'claude':
    key = ws.replace('/', '-').replace('_', '-')
    proj = f"{claude_project_dir}/{key}"
    if os.path.isdir(proj):
        for j in sorted(glob.glob(f"{proj}/*.jsonl"), key=os.path.getmtime, reverse=True):
            sid = None
            try:
                with open(j) as f:
                    first = f.readline().strip()
                if first:
                    sid = json.loads(first).get('sessionId')
            except Exception:
                sid = None
            # Fall back to the file name without its ".jsonl" suffix.
            cand = sid or os.path.basename(j)[:-6]
            if cand and jsonl_exists(cand):
                emit(cand)
 elif agent == 'agy':
    lc = f"{home}/.gemini/antigravity-cli/cache/last_conversations.json"
    if os.path.exists(lc):
        cand = None
        try:
            with open(lc) as f:
                cand = json.load(f).get(ws)
        except Exception:
            cand = None
        if cand and db_exists(cand):
            emit(cand)
 elif agent == 'hermes':
    hdb = f"{home}/.mam/state.db"
    if os.path.exists(hdb):
        cand = None
        try:
            conn = sqlite3.connect(hdb)
            r = conn.execute("SELECT id FROM sessions WHERE cwd=? ORDER BY started_at DESC LIMIT 1", (ws,)).fetchone()
            conn.close()
            if r:
                cand = r[0]
        except Exception:
            cand = None
        if cand:
            emit(cand)
 # 3) agent_identities cache, ONLY when its project_cwd == this workspace
 ai = {}
 try:
    if os.path.exists(db_path):
        conn = sqlite3.connect(db_path, timeout=10.0)
        row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
        if row:
            ai = json.loads(row[0]).get('agent_identities', {})
        conn.close()
    elif os.path.exists(yaml_path):
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
        ai = d.get('agent_identities', {})
 except Exception:
    pass
 # A null or non-mapping cache (e.g. an empty YAML key) counts as "no cache".
 if not isinstance(ai, dict):
    ai = {}
 ai_agent = ai.get(agent) or {}
 if not isinstance(ai_agent, dict):
    ai_agent = {}
 if ai_agent.get('project_cwd') == ws:
    # Every id below is read from ai_agent, the entry whose project_cwd was
    # just matched, never from the top level of agent_identities.
    if agent == 'claude':
        cand = ai_agent.get('session_id')
        if cand and jsonl_exists(cand):
            emit(cand)
    elif agent == 'agy':
        cand = ai_agent.get('conversation_id')
        if cand and db_exists(cand):
            emit(cand)
    elif agent == 'hermes':
        cand = ai_agent.get('session_id') or ai_agent.get('conversation_id')
        if cand and hermes_exists(cand):
            emit(cand)
 print('')
 PYEOF
 }
 # ---------------------------------------------------------------------------
 # capture_conversation_id <agent> <workdir>
 #
 # Convenience wrapper around find_workspace_uuid with the two arguments in
 # the opposite order: it calls `find_workspace_uuid <workdir> <agent>` and
 # passes its stdout (the conversation id for this workspace, or an empty
 # line) and its exit status straight through.
 # ---------------------------------------------------------------------------
 capture_conversation_id() {
  local agent_kind="$1"
  local target_dir="$2"
  find_workspace_uuid "$target_dir" "$agent_kind"
 }
 # ---------------------------------------------------------------------------
 # is_already_stopped <session_name>
 #
 # Exits 0 if the row's status is 'stopped', printing "stopped_at=<ts>" on
 # stdout ('?' when the row has no stopped_at field); exits 1 otherwise,
 # including not-found. Used for idempotency: a second stop on an
 # already-stopped session is a no-op.
 #
 # Where the row is looked up (embedded Python):
 #   - If the SQLite file next to the YAML exists: the `sessions` table; the
 #     `tmux_sessions` list in the `state` row is consulted only when the
 #     `sessions` query raised sqlite3.OperationalError. The YAML file is not
 #     read in this case.
 #   - Otherwise, if the YAML file exists: its `tmux_sessions` list.
 # Any other exception is swallowed and results in exit status 1.
 # ---------------------------------------------------------------------------
 is_already_stopped() {
  local session_name="$1"
  # The session name travels through the environment (SESSION_NAME); the
  # quoted heredoc delimiter keeps the shell from expanding the Python source.
  SESSION_NAME="$session_name" env_python "$AGENT_SESSIONS_YAML" <<'PYEOF'
 import os, yaml, sqlite3, json
 name = os.environ['SESSION_NAME']
 yaml_path = os.environ['YAML_PATH']
 db_path = os.path.splitext(yaml_path)[0] + '.db'
 try:
    if os.path.exists(db_path):
        conn = sqlite3.connect(db_path, timeout=10.0)
        has_sessions_table = False
        try:
            row = conn.execute('SELECT status, data FROM sessions WHERE name=?', (name,)).fetchone()
            if row:
                status, s_data_str = row[0], row[1]
                if status == 'stopped':
                    s = json.loads(s_data_str)
                    print(f"stopped_at={s.get('stopped_at', '?')}")
                    raise SystemExit(0)
            has_sessions_table = True
        except sqlite3.OperationalError:
            pass
        if not has_sessions_table:
            row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
            if row:
                d = json.loads(row[0])
                for s in d.get('tmux_sessions', []):
                    if s.get('name') == name and s.get('status') == 'stopped':
                        print(f"stopped_at={s.get('stopped_at', '?')}")
                        raise SystemExit(0)
        conn.close()
        raise SystemExit(1)
    elif os.path.exists(yaml_path):
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
        for s in d.get('tmux_sessions', []):
            if s.get('name') == name and s.get('status') == 'stopped':
                print(f"stopped_at={s.get('stopped_at', '?')}")
                raise SystemExit(0)
 except Exception:
    pass
 raise SystemExit(1)
 PYEOF
 }
 # ---------------------------------------------------------------------------
 # multi-agent-mux-delegate-job integration helpers
 #
 # All paths are resolved relative to lib.sh's own location (BASH_SOURCE), so the
 # skill tree is relocatable — no hardcoded absolute paths (review item 6).
 # ---------------------------------------------------------------------------
 # _delegate_py_bin
 #
 # Print the Python interpreter to use for the delegate-job scripts.
 #   1. AGENT_PYTHON_BIN, when it is set and names an executable file.
 #   2. The first "<dir>/.venv/bin/python" found while walking from this
 #      library's directory up towards (but not including) "/".
 #   3. `command -v python3`, or the bare name "python3" when that fails.
 # The result of steps 2 and 3 is stored in the (unexported) shell variable
 # AGENT_PYTHON_BIN. Note that the stored value only persists for the caller
 # when the function is not run inside a command substitution.
 _delegate_py_bin() {
  if [ -z "${AGENT_PYTHON_BIN:-}" ] || [ ! -x "$AGENT_PYTHON_BIN" ]; then
    local cursor found=""
    cursor="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    until [ "$cursor" = "/" ] || [ -z "$cursor" ]; do
      if [ -x "$cursor/.venv/bin/python" ]; then
        found="$cursor/.venv/bin/python"
        break
      fi
      cursor="$(dirname "$cursor")"
    done
    if [ -z "$found" ]; then
      found="$(command -v python3 || echo python3)"
    fi
    AGENT_PYTHON_BIN="$found"
  fi
  printf '%s\n' "$AGENT_PYTHON_BIN"
 }
 # _delegate_script <name>
 #
 # Print the path of a multi-agent-mux-delegate-job script. The conventional
 # location "<lib dir>/multi-agent-mux-delegate-job/scripts/<name>" is used
 # when that regular file exists; otherwise the first `find -name <name>` hit
 # anywhere below the library directory is printed. An empty line is printed
 # when nothing is found. Always returns 0.
 _delegate_script() {
  local name="$1"
  local base_dir preferred hit
  base_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  preferred="$base_dir/multi-agent-mux-delegate-job/scripts/$name"
  if [ -f "$preferred" ]; then
    hit="$preferred"
  else
    hit="$(find "$base_dir" -name "$name" 2>/dev/null | head -n 1 || true)"
  fi
  printf '%s\n' "$hit"
 }
 # delegate_submit_job <prompt> <agent> <agent_session>
 #
 # Register a job by running the delegate-job `registry.py register` command
 # with the interpreter chosen by _delegate_py_bin. Whatever registry.py
 # prints (the new JID, per the registry's contract) goes to stdout and its
 # exit status is returned. Returns 1 with an error on stderr when
 # registry.py cannot be located.
 delegate_submit_job() {
  local prompt="$1" agent="$2" session="$3"
  local interpreter registry
  interpreter="$(_delegate_py_bin)"
  registry="$(_delegate_script registry.py)"
  if [ -n "$registry" ] && [ -f "$registry" ]; then
    local -a register_args=(
      register
      --prompt "$prompt"
      --agent "$agent"
      --agent-session "$session"
    )
    "$interpreter" "$registry" "${register_args[@]}"
    return
  fi
  echo "ERROR: multi-agent-mux-delegate-job registry.py not found under .agents/skills/" >&2
  return 1
 }
 # delegate_publish_event <job_id> <event> [detail]
 #
 # Publish a lifecycle event by running the delegate-job publish_event.py
 # script with `--job`, `--event` and `--detail` (empty when omitted). This
 # is the single shared replacement for the inline .venv-walk +
 # publish_event.py blocks that create/delete/resume used to duplicate
 # (review item 7).
 # Non-fatal by contract: the function returns 0 when the job id is empty,
 # when the script cannot be located, and when the script itself fails.
 delegate_publish_event() {
  local job_id="$1" event="$2" detail="${3:-}"
  if [ -z "$job_id" ]; then
    return 0
  fi
  local interpreter publisher
  interpreter="$(_delegate_py_bin)"
  publisher="$(_delegate_script publish_event.py)"
  if [ -z "$publisher" ] || [ ! -f "$publisher" ]; then
    return 0
  fi
  "$interpreter" "$publisher" --job "$job_id" --event "$event" --detail "$detail" || true
 }
 # start_watchdog <job_id> [workdir]
 #
 # Make sure the monitor subscriber
 #   bash <workdir>/.agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh --subscribe
 # is running for <workdir> (default: $PWD) and print its PID on stdout.
 #   - If pgrep already finds such a process, the first PID it reports is
 #     printed and nothing is started.
 #   - Otherwise the subscriber is started in the background with
 #     `--idle-timeout 0`, with <workdir> as its working directory and with
 #     its output appended to <workdir>/.mam/multi-agent-mux-monitor.log.
 # Returns 1 with an error on stderr when the monitor script does not exist.
 #
 # <job_id> is accepted for interface compatibility; the subscriber that is
 # started is not tied to a particular job by this function.
 #
 # Implementation notes:
 #   - The directory change happens inside a subshell, so the caller's
 #     working directory and OLDPWD are never touched, and the subscriber is
 #     not started at all when <workdir> cannot be entered.
 #   - The log directory is created first; without it the redirection would
 #     fail and no process would be started.
 start_watchdog() {
  local job_id="$1"
  local workdir="${2:-$PWD}"
  local monitor_script="$workdir/.agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh"
  local log_file="$workdir/.mam/multi-agent-mux-monitor.log"
  if [ ! -f "$monitor_script" ]; then
    echo "ERROR: monitor script not found: $monitor_script" >&2
    return 1
  fi
  # Reuse a subscriber that is already running for this workspace. pgrep may
  # report several PIDs; only the first is returned so that callers always
  # receive a single PID.
  local pid
  pid="$(pgrep -f "bash $monitor_script --subscribe" | head -n 1 || true)"
  if [ -z "$pid" ]; then
    mkdir -p "$(dirname "$log_file")"
    # `exec` replaces the subshell, so $! is the PID of the subscriber itself.
    (
      cd "$workdir" || exit 1
      exec nohup bash "$monitor_script" --subscribe --idle-timeout 0
    ) >> "$log_file" 2>&1 &
    pid=$!
  fi
  echo "$pid"
 }
@@ -0,0 +1,220 @@
 ---
 name: multi-agent-mux-create
 description: "Create a new agent session (claude, antigravity/agy) in a dedicated tmux session for context-preserving long-running work. Always creates a tmux session — never backgrounds with nohup/disown. Writes the new session to .mam/agent-sessions.yaml. Use when you want to start a fresh agent (no prior UUID) for a new project workspace."
 version: 1.0.0
 author: godopu
 license: MIT
 platforms: [linux, macos]
 environments: [terminal, tmux]
 metadata:
  hermes:
    tags: [agent, tmux, claude, antigravity, agy, multi-agent, context, session]
    related_skills: [multi-agent-mux-resume, multi-agent-mux-stop, multi-agent-mux-monitor, claude-code]
    prereq_skills: [claude-code]
 ---
 # Multi-Agent Create — Start a Fresh Agent in a tmux Session
 > **Companion skills**: `multi-agent-mux-resume` (resume an existing UUID), `multi-agent-mux-stop` (terminate), `multi-agent-mux-monitor` (live status).
 > **Single source of truth**: `./.mam/agent-sessions.yaml` (this skill writes to it; never read it ad-hoc — go through this skill).
 ## What this skill does
 Spawn a new agent (`claude` or `agy`/antigravity-cli) in a **dedicated tmux session** for context-preserving long-running work. The tmux session is the *container*; the agent's session ID is *data* inside the container. **This skill creates the container + starts the agent — but does not resume an old conversation** (use `multi-agent-mux-resume` for that).
 For all agents: the tmux session name is produced by **`lib.sh::derive_session_name`** — the single source of truth shared by create/resume/stop/status/monitor (P0-A). The rule (verbatim from the function):
 > slug = the **two trailing path components** of the absolute workspace, `_`→`-`, lowercased, joined with `-`; name = `<slug>-creator-<agent>`.
 So `$WORKSPACE_ROOT/landing_page/refer_landing_page` + `claude` → `landing-page-refer-landing-page-creator-claude`. The workspace basename (`refer_landing_page`) **is** included; the hand-written historical entry that dropped it (`lab-landing-page-creator-claude`) was the bug, not the convention.
 ## Pre-flight checks
 Before doing anything, verify the environment:
 ```bash
 # 1) tmux available and isolated server status
 command -v tmux || { echo "ERROR: tmux not installed"; exit 1; }
 echo "Tmux server name: ${TMUX_SERVER_NAME:-default}"
 # 2) claude / agy available
 command -v claude  # required for --agent claude
 command -v agy     # required for --agent agy
 # 3) claude auth (if --agent claude)
 claude auth status 2>&1 | python3 -c "import json,sys; d=json.load(sys.stdin); assert d.get('loggedIn'), 'claude not logged in'"
 # 4) target workspace exists
 test -d "$WORKSPACE" || { echo "ERROR: workspace $WORKSPACE not a directory"; exit 1; }
 ```
 If any check fails → `kanban_block(reason="...")` (worker path) or report to user (interactive path). Do not proceed with a half-broken setup.
 ## Standard names
 - **tmux session name**: `derive_session_name <workspace> <agent>` (lib.sh)
  - `<workspace-slug>` = `basename $(dirname $WORKSPACE)` `-` `basename $WORKSPACE` (lowercase, `_`→`-`)
  - examples: `landing-page-refer-landing-page-creator-claude`, `paper-pdf2md-creator-agy`
  - never re-derive this by hand — source lib.sh and call the function
 - **wrapper script** (claude only): `~/.local/bin/<workspace-slug>-creator-claude`
  - contents: tmux new-session with `claude` inside, auto-handles trust/bypass dialogs
  - see `<workdir>/agent_sessions.md` for the canonical wrapper template
 ## Tmux Server Isolation (격리 서버)
 When running multiple agent sessions alongside other workflows (e.g., cmux, Kanban workers, manual tmux sessions), sharing the default tmux server can lead to session name conflicts, monitoring clutter, and accidental destruction of user sessions via global commands.
 To prevent this, you can run this skill inside an **isolated tmux server** using the `TMUX_SERVER_NAME` environment variable or the `--tmux-server <name>` flag (opt-in).
 ### How to use
 1. **Via Environment Variable**:
   ```bash
   export TMUX_SERVER_NAME=multi-agent-canary
   # All subsequent commands (create, status, stop, etc.) will run in the isolated 'multi-agent-canary' tmux server.
   ```
 2. **Via Option Flag**:
   ```bash
   bash scripts/create_session.sh --workspace /path/to/project --agent claude --tmux-server multi-agent-canary
   ```
 3. **Submit Job Integration**:
   You can automatically register a delegated job with a prompt when creating a session:
   ```bash
   bash scripts/create_session.sh --workspace /path/to/project --agent claude --submit-job "Task prompt here"
   ```
 ### Recommended Alias
 You can set an alias in your shell to easily query sessions on the isolated server:
 ```bash
 alias tmc='tmux -L multi-agent-canary'
 tmc ls  # Lists only your multi-agent sessions
 ```
 ### Safety Rules (Pitfall 29 Summary)
 - Never use global server termination commands like `tmux kill-server` or `tmux kill-session -a` as they will destroy all sessions on that server (including your own workspace sessions if they share the server).
 - By using an isolated server via `TMUX_SERVER_NAME`, your agent sessions are completely separated from your default user workspace, ensuring 0% interference.
 ## Workflow
 ```bash
 WORKSPACE=/path/to/project
 AGENT=claude  # or agy
 source .agents/skills/lib.sh
 SESSION_NAME="$(derive_session_name "$WORKSPACE" "$AGENT")"
 # 1. If session already alive, fail fast
 tmux has-session -t "$SESSION_NAME" 2>/dev/null && {
  echo "ERROR: tmux session '$SESSION_NAME' already exists. Use multi-agent-mux-resume to attach or multi-agent-mux-stop first."
  exit 1
 }
 # 2. Spawn the tmux session with the agent inside
 case "$AGENT" in
  claude)
    # Use the wrapper if it exists, else inline tmux new-session
    # Use the wrapper if it exists (LOCAL_BIN env var overrides default $HOME/.local/bin)
    local_bin="${LOCAL_BIN:-$HOME/.local/bin}"
    if [ -x "$local_bin/$SESSION_NAME" ]; then
      nohup "$local_bin/$SESSION_NAME" >/dev/null 2>&1 &
    else
      tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "claude"
    fi
    ;;
  agy)
    tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "agy --dangerously-skip-permissions"
    ;;
  *) echo "ERROR: --agent must be claude or agy, got: $AGENT"; exit 2 ;;
 esac
 # 3. Wait for agent TUI to be ready (varies: claude ~5s, agy ~3s)
 sleep 6
 # 4. Capture pane metadata
 PANE_PID=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}')
 PANE_CWD=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_current_path}')
 PANE_CMD=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_current_command}')
 TMUX_EPOCH=$(tmux list-sessions -F '#{session_created}' -t "$SESSION_NAME" 2>/dev/null | head -1)
 ```
 ## Registering the session in agent-sessions.yaml
 After spawn, append a new `tmux_sessions[]` entry to `.mam/agent-sessions.yaml`:
 ```yaml
 - name: <SESSION_NAME>
  status: running
  tmux_session_created_at: 2026-06-17T...Z   # ISO 8601 UTC
  tmux_session_epoch: <TMUX_EPOCH>
  tmux_server: <TMUX_SERVER_NAME>           # Isolated server name (default: 'default')
  pane:
    index: 0
    pid: <PANE_PID>
    cmd: <AGENT>            # 'claude' or 'agy'
    cmd_full: <full command line, see table below>
    cwd: <PANE_CWD>
  tui:                     # only for claude
    model: <from TUI status>
    provider: <from TUI status>
    plan: <from TUI status>
    account: <from TUI status>
    version: <from TUI status>
  start_command: <the exact tmux new-session command used>
  attach_command: "tmux attach -t <SESSION_NAME>"
  kill_command: "tmux kill-session -t <SESSION_NAME>"
 ```
 `cmd_full` per agent (this is the actual command line in the pane, not the resume command):
 | agent | cmd_full |
 |---|---|
 | claude (interactive) | `claude` |
 | agy (interactive) | `agy --dangerously-skip-permissions` |
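 Prefer `scripts/create_session.sh` over editing the YAML by hand: it performs the whole workflow above (pre-flight, spawn, pane capture) and appends the entry through `lib.sh::atomic_dump_yaml`. Run it **instead of** the manual spawn commands, not after them — it exits with code 3 if the tmux session already exists: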
 ```bash
 bash .agents/skills/multi-agent-mux-create/scripts/create_session.sh \
  --workspace "$WORKSPACE" --agent "$AGENT" --session "$SESSION_NAME"
 ```
 The script handles the YAML append, pane capture, and the `last_visible_status` placeholder.
 ## Pitfalls
 - **Don't use `nohup`/`disown`/`setsid` for the agent itself** — those background the agent outside tmux. The whole point of this skill is *the tmux session is the supervisor*. `nohup` is OK only for *launching the wrapper* (which itself creates the tmux session via `tmux new-session -d`).
 - **Don't trust `--session-id <uuid>` flags blindly** — claude/agy may not accept a fixed session id on first spawn. The session id is *assigned* on first user message; you can read it back from `~/.claude/projects/.../session.jsonl` headers or `~/.gemini/.../cache/last_conversations.json` AFTER the first message.
 - **Wrapper script MUST NOT be created via `hermes profile alias`** — that command writes a `hermes -p <profile>` wrapper that destroys the tmux behavior. Create wrappers manually (see `lab-landing-page-creator-claude` template).
 - **Always pass an absolute workspace path** as the tmux `cwd` (`-c`) — relative paths break when tmux respawns in a different shell context.
 - **The first `claude` message generates the session id** — `multi-agent-mux-create` only sets up the *container*. If you need a known session id for later resume, send a placeholder message (e.g. "init") and read it back, then call `multi-agent-mux-resume` later.
 ## Verification
 After spawn + YAML append:
 ```bash
 # 1. tmux session is alive
 tmux has-session -t "$SESSION_NAME" && echo OK || echo MISSING
 # 2. pane has the expected cmd + cwd
 tmux list-panes -t "$SESSION_NAME" -F 'cmd=#{pane_current_command} cwd=#{pane_current_path}'
 # 3. agent-sessions.yaml has the new entry
 python3 -c "
 import yaml
 d = yaml.safe_load(open('.mam/agent-sessions.yaml'))
 names = [s['name'] for s in d['tmux_sessions']]
 assert '$SESSION_NAME' in names, 'session not registered'
 print('OK:', names)
 "
 # 4. Optional: send a probe via tmux send-keys and capture-pane
 tmux send-keys -t "$SESSION_NAME" "" Enter
 sleep 2
 tmux capture-pane -t "$SESSION_NAME" -p -S -20
 ```
 ## When NOT to use this skill
 - **Resuming an old conversation** → `multi-agent-mux-resume`
 - **Killing an existing session** → `multi-agent-mux-stop`
 - **Just attaching to an existing session** → `tmux attach -t <name>` (no skill needed)
 - **One-shot print mode (claude -p "...")** → no tmux needed; use `claude-code` skill's print mode
@@ -0,0 +1,294 @@
 #!/usr/bin/env bash
 # create_session.sh — helper script for the multi-agent-mux-create skill
 # Usage:
 #   bash create_session.sh --workspace <path> --agent <claude|agy|hermes> [--session <name>]
 #                          [--wrapper] [--dry-run] [--tmux-server <name>] [--submit-job <prompt>]
 #
 # Steps:
 #   1) preflight: tmux and agent CLI availability, agent auth, workspace exists
 #   2) decide the tmux session name (derived automatically when --session is omitted)
 #   3) start the tmux session (claude prefers the wrapper; agy and hermes run inline)
 #   4) capture pane metadata (pid, cmd, cwd)
 #   5) append a tmux_sessions[] entry to agent-sessions.yaml
 #   6) print a verification summary
 #
 # Exit codes:
 #   0  = success
 #   1  = preflight failure
 #   2  = invalid args
 #   3  = tmux session already exists (use multi-agent-mux-resume, or multi-agent-mux-stop first)
 #   4  = agent-sessions.yaml append failure
 set -euo pipefail
 # lib.sh is resolved two directories above this script's own directory.
 source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
 # usage
 # Print the command-line help on stdout. The heredoc delimiter is unquoted, so
 # $0 expands to the name the script was invoked with. Callers decide the exit
 # code themselves (0 for --help, 2 for argument errors).
 usage() {
  cat <<EOF
 Usage: $0 --workspace <path> --agent <claude|agy|hermes> [options]
 Options:
  --workspace PATH    project directory (required)
  --agent AGENT       claude | agy | hermes (required)
  --session NAME      tmux session name (default: derived from workspace)
  --wrapper           force use of ~/.local/bin/<session> wrapper even if not present
  --dry-run           print commands without executing
  --tmux-server NAME  specify isolated tmux server name
  --submit-job PROMPT submit a job to multi-agent-mux-delegate-job registry with the given prompt
  -h, --help          this help
 EOF
 }
 WORKSPACE=""
 AGENT=""
 SESSION_NAME=""
 USE_WRAPPER=0
 DRY_RUN=0
 TMUX_SERVER_OPT=""
 SUBMIT_JOB_PROMPT=""
 # _cs_require_value <flag> [value ...]
 # Abort with exit 2 (invalid args) when a value-taking flag is the last
 # argument. Without this guard `set -u` stops the script with an opaque
 # "unbound variable" error and the wrong exit code.
 _cs_require_value() {
  if [ "$#" -lt 2 ]; then
    echo "ERROR: $1 requires a value" >&2
    usage
    exit 2
  fi
 }
 while [ $# -gt 0 ]; do
  case "$1" in
    --workspace)   _cs_require_value "$@"; WORKSPACE="$2"; shift 2 ;;
    --agent)       _cs_require_value "$@"; AGENT="$2"; shift 2 ;;
    --session)     _cs_require_value "$@"; SESSION_NAME="$2"; shift 2 ;;
    --wrapper)     USE_WRAPPER=1; shift ;;
    --dry-run)     DRY_RUN=1; shift ;;
    --tmux-server) _cs_require_value "$@"; TMUX_SERVER_OPT="$2"; shift 2 ;;
    --submit-job)  _cs_require_value "$@"; SUBMIT_JOB_PROMPT="$2"; shift 2 ;;
    -h|--help)     usage; exit 0 ;;
    *) echo "ERROR: unknown arg: $1" >&2; usage; exit 2 ;;
  esac
 done
 # The --tmux-server flag wins over an inherited TMUX_SERVER_NAME.
 if [ -n "$TMUX_SERVER_OPT" ]; then
  export TMUX_SERVER_NAME="$TMUX_SERVER_OPT"
 fi
 # Preflight
 [ -n "$WORKSPACE" ] || { echo "ERROR: --workspace required" >&2; usage; exit 2; }
 [ -n "$AGENT" ]    || { echo "ERROR: --agent required" >&2; usage; exit 2; }
 # Reject unsupported agents up front (exit 2 = invalid args). Otherwise any
 # command name found on PATH would pass the CLI check below, and a typo would
 # be reported as a missing CLI with exit 1.
 case "$AGENT" in
  claude|agy|hermes) ;;
  *) echo "ERROR: --agent must be claude, agy or hermes, got: $AGENT" >&2; exit 2 ;;
 esac
 [ -d "$WORKSPACE" ] || { echo "ERROR: workspace $WORKSPACE not a directory" >&2; exit 1; }
 command -v tmux >/dev/null || { echo "ERROR: tmux not installed" >&2; exit 1; }
 command -v "$AGENT" >/dev/null || { echo "ERROR: $AGENT CLI not in PATH" >&2; exit 1; }
 # Auth Check (OAuth check for agy, loggedIn check for claude, status for hermes)
 if [ "$AGENT" = "claude" ]; then
  # Capture the output first instead of piping into `grep -q`: under pipefail
  # an early-exiting grep can SIGPIPE the producer and fail the check
  # spuriously. [[:space:]] replaces the GNU-only `\s` so the pattern also
  # works with BSD grep on macOS.
  claude_auth=""
  if ! claude_auth="$(claude auth status 2>/dev/null)" \
    || ! grep -q '"loggedIn":[[:space:]]*true' <<<"$claude_auth"; then
    echo "ERROR: claude not logged in. Run 'claude auth login' first." >&2
    exit 1
  fi
 elif [ "$AGENT" = "agy" ]; then
  if ! agy models >/dev/null 2>&1; then
    echo "ERROR: agy is not authenticated. Please log in first." >&2
    exit 1
  fi
 elif [ "$AGENT" = "hermes" ]; then
  if ! hermes status >/dev/null 2>&1; then
    echo "ERROR: hermes is not functional. Run 'hermes setup' first." >&2
    exit 1
  fi
 fi
 # Session name — lib.sh::derive_session_name is the single source of truth (P0-A)
 if [ -z "$SESSION_NAME" ]; then
  SESSION_NAME="$(derive_session_name "$WORKSPACE" "$AGENT")"
 fi
 # Fail if the session is already alive
 if _tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
  echo "ERROR: tmux session '$SESSION_NAME' already exists. Use multi-agent-mux-resume to attach, or multi-agent-mux-stop first." >&2
  exit 3
 fi
 # Launch settings for the tmux session
 LOCAL_BIN="${LOCAL_BIN:-$HOME/.local/bin}"
 WRAPPER="$LOCAL_BIN/$SESSION_NAME"
 # spawn
 # Start the agent for $AGENT inside a detached tmux session named
 # $SESSION_NAME with $WORKSPACE as working directory.
 #
 # Globals read: AGENT, WRAPPER, USE_WRAPPER, SESSION_NAME, WORKSPACE
 # For claude the wrapper script is launched instead of tmux when --wrapper was
 # given, or when $WRAPPER is executable and is not itself named "claude".
 # Exits the script with status 2 on an unsupported agent.
 spawn() {
  local agent_cmd
  local via_wrapper=0

  case "$AGENT" in
    claude) agent_cmd="claude --dangerously-skip-permissions" ;;
    agy)    agent_cmd="agy --dangerously-skip-permissions" ;;
    hermes) agent_cmd="hermes" ;;
    *) echo "ERROR: --agent must be claude, agy or hermes, got: $AGENT" >&2; exit 2 ;;
  esac

  # Only claude has a wrapper launch path.
  if [ "$AGENT" = "claude" ]; then
    if [ "$USE_WRAPPER" = "1" ]; then
      via_wrapper=1
    elif [ -x "$WRAPPER" ] && [ "$(basename "$WRAPPER")" != "claude" ]; then
      via_wrapper=1
    fi
  fi

  if [ "$via_wrapper" = "1" ]; then
    # The wrapper creates the tmux session itself; detach it from this shell.
    nohup "$WRAPPER" >/dev/null 2>&1 &
    disown
    return
  fi

  _tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "$agent_cmd"
 }
 if [ "$DRY_RUN" = "1" ]; then
  echo "[dry-run] would spawn: tmux session '$SESSION_NAME' in $WORKSPACE (agent=$AGENT)"
  exit 0
 fi
 spawn
 # Wait for the agent TUI to come up
 sleep 6
 # Never register a session that did not come up. The wrapper is launched in
 # the background with its output discarded, and an agent that exits at once
 # takes its tmux session with it; both cases would otherwise be written to
 # agent-sessions.yaml as status=running with pid 0.
 if ! _tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
  echo "ERROR: tmux session '$SESSION_NAME' is not running after spawn (agent exited or wrapper failed); nothing was registered." >&2
  exit 1
 fi
 # Capture pane metadata
 PANE_PID=$(_tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}' 2>/dev/null || echo "")
 PANE_CWD=$(_tmux list-panes -t "$SESSION_NAME" -F '#{pane_current_path}' 2>/dev/null || echo "$WORKSPACE")
 PANE_CMD=$(_tmux list-panes -t "$SESSION_NAME" -F '#{pane_current_command}' 2>/dev/null || echo "$AGENT")
 TMUX_EPOCH=$(date +%s)
 NOW_ISO=$(date -u +'%Y-%m-%dT%H:%M:%SZ')
 # Decide cmd_full: the command line recorded for the pane
 case "$AGENT" in
  claude) CMD_FULL='claude --dangerously-skip-permissions' ;;
  agy)    CMD_FULL='agy --dangerously-skip-permissions' ;;
  hermes) CMD_FULL='hermes' ;;
 esac
 # Start command recorded in the registry; includes -L <server> on an isolated server
 local_tmux="tmux"
 if [ -n "${TMUX_SERVER_NAME:-}" ] && [ "$TMUX_SERVER_NAME" != "default" ]; then
  local_tmux="tmux -L $TMUX_SERVER_NAME"
 fi
 case "$AGENT" in
  claude)
    if [ -x "$WRAPPER" ]; then
      START_CMD="$WRAPPER   # ~/.local/bin 의 래퍼"
    else
      START_CMD="$local_tmux new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"claude --dangerously-skip-permissions\""
    fi
    ;;
  agy|hermes)
    START_CMD="$local_tmux new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"$CMD_FULL\""
    ;;
 esac
 # Optional delegate-job registration (--submit-job), then make sure the
 # registry file exists before the append step.
 DELEGATE_JOB_ID=""
 if [ -n "$SUBMIT_JOB_PROMPT" ]; then
  # Map the CLI agent name onto the delegate-job agent identifier.
  case "$AGENT" in
    claude) delegate_agent="claude-code" ;;
    hermes) delegate_agent="hermes-agent" ;;
    *)      delegate_agent="antigravity-cli" ;;
  esac
  agent_session="tmux:$SESSION_NAME"
  DELEGATE_JOB_ID=$(delegate_submit_job "$SUBMIT_JOB_PROMPT" "$delegate_agent" "$agent_session")
  echo "Submitted delegated job: $DELEGATE_JOB_ID"
 fi
 # Seed an empty registry on first use.
 if ! [ -f "$AGENT_SESSIONS_YAML" ]; then
  mkdir -p "$(dirname "$AGENT_SESSIONS_YAML")"
  printf '%s\n' "tmux_sessions: []" > "$AGENT_SESSIONS_YAML"
 fi
 # atomic_dump_yaml: flock + temp+rename + .bak + schema validate (P0-B).
 # All values are passed as environment variables — no heredoc interpolation (P1-B).
 # The child pid is resolved up front in bash with pgrep (P2: filter by tool name).
 CHILD_PID=0
 if { [ "$AGENT" = "agy" ] || [ "$AGENT" = "hermes" ]; } && [ -n "$PANE_PID" ]; then
  CHILD_PID=$(pgrep -P "$PANE_PID" -x "$AGENT" 2>/dev/null | head -1 || true)
  CHILD_PID="${CHILD_PID:-0}"
 fi
 # The quoted 'PYEOF' delimiter keeps the Python body literal. The body uses
 # `d` and `os` without defining them — presumably atomic_dump_yaml supplies the
 # loaded YAML document as `d` and imports os; confirm in lib.sh.
 atomic_dump_yaml "$AGENT_SESSIONS_YAML" \
  SESSION_NAME="$SESSION_NAME" AGENT="$AGENT" NOW_ISO="$NOW_ISO" \
  TMUX_EPOCH="$TMUX_EPOCH" PANE_PID="$PANE_PID" PANE_CWD="$PANE_CWD" \
  CMD_FULL="$CMD_FULL" START_CMD="$START_CMD" CHILD_PID="$CHILD_PID" \
  TMUX_SERVER_NAME="${TMUX_SERVER_NAME:-default}" \
  DELEGATE_JOB_ID="$DELEGATE_JOB_ID" <<'PYEOF'
 name = os.environ['SESSION_NAME']
 agent = os.environ['AGENT']
 pid = os.environ.get('PANE_PID', '')
 epoch = os.environ.get('TMUX_EPOCH', '')
 server_name = os.environ.get('TMUX_SERVER_NAME', 'default')
 server_opt = f"-L {server_name} " if server_name and server_name != 'default' else ""
 sessions = d.setdefault('tmux_sessions', [])
 # P0-D: 같은 이름 엔트리가 status=running 이면만 거부. terminated/archived 는
 # 재사용 가능 — 낡은 엔트리를 제거하고 새로 append (create -> delete -> create).
 running_same = [s for s in sessions if s.get('name') == name and s.get('status') == 'running']
 if running_same:
    print(f"ERROR: {name} already running in agent-sessions.yaml", flush=True)
    raise SystemExit(4)
 sessions[:] = [s for s in sessions if s.get('name') != name]
 entry = {
    'name': name,
    'status': 'running',
    'tmux_session_created_at': os.environ['NOW_ISO'],
    'tmux_session_epoch': int(epoch) if epoch.isdigit() else 0,
    'tmux_server': server_name,
    'delegate_job_id': os.environ.get('DELEGATE_JOB_ID', '') or None,
    'pane': {
        'index': 0,
        'pid': int(pid) if pid.isdigit() else 0,
        'cmd': agent,
        'cmd_full': os.environ['CMD_FULL'],
        'cwd': os.environ['PANE_CWD'],
    },
    'start_command': os.environ['START_CMD'],
    'attach_command': f'tmux {server_opt}attach -t {name}',
    'kill_command': f'tmux {server_opt}kill-session -t {name}',
 }
 if agent == 'claude':
    entry['tui'] = {
        'model': '(unknown — capture after first message)',
        'provider': 'anthropic',
        'plan': '(unknown)',
        'account': '(unknown — read from claude auth status)',
        'version': '(unknown — read from TUI)',
    }
    entry['claude_session_id_own'] = None
    entry['last_visible_status'] = "TUI started; awaiting first user message"
 elif agent == 'agy':
    cp = os.environ.get('CHILD_PID', '0')
    entry['child_pid'] = int(cp) if cp.isdigit() else 0
    entry['agy_conversation_id_own'] = None
    entry['mcp_attachments'] = [
        {
            'name': 'stitch',
            'transport': 'mcp-remote',
            'endpoint': 'https://stitch.googleapis.com/mcp'
        }
    ]
    entry['last_visible_status'] = "TUI started; awaiting first user message"
 elif agent == 'hermes':
    cp = os.environ.get('CHILD_PID', '0')
    entry['child_pid'] = int(cp) if cp.isdigit() else 0
    entry['hermes_conversation_id_own'] = None
    entry['last_visible_status'] = "TUI started; awaiting first user message"
 sessions.append(entry)
 snap = d.setdefault('snapshot', {})
 snap['taken_at'] = os.environ['NOW_ISO']
 snap['cwd'] = os.environ['PANE_CWD']
 print(f"appended: {name}", flush=True)
 PYEOF
 # Final summary for the operator. By this point the session is running and
 # registered, so nothing below may abort the script.
 echo
 echo "=== created ==="
 echo "tmux session: $SESSION_NAME (pane pid $PANE_PID, cmd $PANE_CMD, cwd $PANE_CWD)"
 if [ -n "$DELEGATE_JOB_ID" ]; then
  echo "delegate job: $DELEGATE_JOB_ID"
  delegate_publish_event "$DELEGATE_JOB_ID" started "multi-agent-mux session created"
  # start_watchdog returns non-zero when the monitor script is not present
  # under the workspace. A bare `WD_PID=$(...)` would then trip `set -e` and
  # exit 1 after a successful create, hiding the attach hint below, so the
  # failure is downgraded to a warning.
  if WD_PID="$(start_watchdog "$DELEGATE_JOB_ID" "$WORKSPACE")"; then
    echo "watchdog PID: $WD_PID"
  else
    echo "WARNING: watchdog not started for job $DELEGATE_JOB_ID; use the multi-agent-mux-monitor skill to observe it." >&2
  fi
 fi
 echo "agent-sessions.yaml updated"
 echo
 # Show the attach command for the server the session actually lives on.
 if [ -n "${TMUX_SERVER_NAME:-}" ] && [ "$TMUX_SERVER_NAME" != "default" ]; then
  echo "Attach:  tmux -L $TMUX_SERVER_NAME attach -t $SESSION_NAME"
 else
  echo "Attach:  tmux attach -t $SESSION_NAME"
 fi
 echo "Delete:  use multi-agent-mux-stop skill"
 echo "Resume:  use multi-agent-mux-resume skill (after first message creates a session id)"
@@ -0,0 +1,11 @@
 # multi-agent-mux-delegate-job 스킬
 작업(Job)을 자율 에이전트(claude-code/codex/opencode/human)에게 위임하고 MQTT
 이벤트 채널로 비동기 관찰하는 Hermes 스킬. **시작점은 [`SKILL.md`](./SKILL.md).**
 - 프로토콜/스키마: [`job-protocol.md`](./job-protocol.md)
 - 브로커 PoC→운영 전환: [`mqtt-broker-setup.md`](./mqtt-broker-setup.md)
 - 레지스트리 포맷/동시성: [`registry.md`](./registry.md)
 - 참조 구현: [`multi-agent-mux-delegate-job`](./multi-agent-mux-delegate-job) (bash wrapper), [`scripts/publish_event.py`](./scripts/publish_event.py), [`scripts/job_subscriber.py`](./scripts/job_subscriber.py), [`scripts/registry.py`](./scripts/registry.py), [`scripts/mqtt_common.py`](./scripts/mqtt_common.py)
 - 영구 감사 로그: `.mam/delegate_job_logs/<job_id>/` (`meta.json`·`events.ndjson`·`status.json`)
  — `multi-agent-mux-delegate-job logs <id>` 또는 `multi-agent-mux-delegate-job logs --list`로 조회 (SKILL.md "Audit Logs" 참조)
@@ -0,0 +1,385 @@
 ---
 name: multi-agent-mux-delegate-job
 description: "Delegate a unit of work to any autonomous agent (claude-code, codex, opencode, or a human) and observe it asynchronously over an MQTT event channel. Each job gets a unique id, a registry record (prompt, broker, status, timeouts), and a single per-job topic that carries started/permission_required/progress/completed/error events as schema-versioned JSON. The delegator starts a subscriber first, runs the agent, and treats a completed/error event or a timeout as the job's terminal state. Ships a working reference implementation (publish_event.py, job_subscriber.py, registry.py, mqtt_common.py, multi-agent-mux-delegate-job wrapper) plus a PoC-to-production path: validate on a public broker, then move to an authenticated TLS broker by changing config only — no code change. Use when you need fire-and-observe delegation, multi-job fan-out across tmux sessions, or a uniform completion-signal protocol shared by several agent types."
 version: 1.0.0
 author: Hermes Agent
 license: MIT
 platforms: [linux, macos, windows]
 metadata:
  hermes:
    tags: [agent-delegation, mqtt, jobs, orchestration, async-completion]
    related_skills: [claude-code, codex, opencode, hermes-agent-skill-authoring]
 ---
 # multi-agent-mux-delegate-job — Async Job Delegation over MQTT
 Delegate a unit of work to an autonomous agent, then **observe** it instead of
 blocking on it. Every job gets a unique id and a registry record; the agent
 publishes lifecycle events (`started`, `permission_required`, `progress`,
 `completed`, `error`) to a per-job MQTT topic; the delegator subscribes and
 treats `completed`/`error` — or a timeout — as the terminal state.
 This skill is a **reference implementation**: copy the files in this directory
 into your project and customise. The `communication_over_mqtt` project is the
 canonical concrete instance.
 ## Overview
 The model is deliberately small. A **job** is one delegated task. An **agent**
 is a worker (a claude-code tmux session, a codex run, a human). The **registry**
 (`.mam/jobs/<id>.json`) holds everything about a job so nothing important
 lives in environment variables — which means one tmux session can process many
 jobs sequentially, and many sessions can fan out in parallel, with no env
 collisions. The **event channel** is one MQTT topic per job carrying JSON
 payloads; `event` discriminates the type.
 Responsibility is split into exactly one entry point each:
 [`publish_event.py`](./scripts/publish_event.py) emits events (registry lookup,
 monotonic `seq`, retry+backoff) and [`job_subscriber.py`](./scripts/job_subscriber.py)
 observes them (timeouts, terminal state machine, defensive parsing). Shared
 logic lives in [`mqtt_common.py`](./scripts/mqtt_common.py); registry I/O in
 [`registry.py`](./scripts/registry.py). The demo `publisher.py`/`subscriber.py`
 in the host project stay frozen.
 Two stages, same code. **PoC** runs on the public `broker.hivemq.com` to wire up
 the protocol. **Production** moves to your own authenticated TLS broker — the
 switch is **config only** (env vars + the registry `broker.*` block), never a
 code change. See [`mqtt-broker-setup.md`](./mqtt-broker-setup.md).
 ## When to Use / When NOT to Use
 **Use when:**
 - you want **fire-and-observe** delegation — kick off work and get a completion
  signal rather than blocking a terminal;
 - several agent types (claude-code, codex, opencode, human) must follow **one**
  completion protocol;
 - you need **multi-job fan-out** across tmux sessions with safe job claiming;
 - you want a clean PoC → authenticated-broker upgrade path.
 **Do NOT use when:**
 - a one-shot `claude -p '…'` that returns inline is enough (no async signal
  needed) — just use the [claude-code](../claude-code/SKILL.md) skill directly;
 - you need request/response RPC or large artifact transfer (this is a
  one-direction event stream, not a data bus);
 - the payload would carry secrets and you're still on the public broker — move
  to the own-broker stage first.
 ## Quick Start
 The one-line wrapper handles register + subscriber-first + agent launch. If
 you're new, **start here** and only fall back to the manual 5-step flow when
 you need finer control.
 ```bash
 # 1) one line: register → start subscriber → launch agent in tmux
 #    (uses public broker by default; last stdout line is the audit-log dir)
 multi-agent-mux-delegate-job submit \
  --agent claude-code \
  --prompt "정렬 문제 10개를 만들어 sort_problems.md로 저장" \
  --workdir /path/to/project \
  --agent-session tmux:demo \
  --timeout 3600 --idle-timeout 120
 # → stdout: registered job: <JID>
 #          subscriber pid: …
 #          agent launched in tmux session: demo
 #          subscriber output: <one line per event>
 #          /path/to/project/.mam/delegate_job_logs/<JID>     ← audit log dir
 # 2) at any time, query the job or its audit log
 multi-agent-mux-delegate-job status --job <JID>
 multi-agent-mux-delegate-job logs   <JID>            # pretty timeline
 multi-agent-mux-delegate-job logs   --list           # every job, live status
 # 3) run a user-supplied validator against the job's artifacts
 multi-agent-mux-delegate-job verify --job <JID> --validate ./validate.sh
 ```
 The wrapper enforces the **subscribe-before-publish** ordering and **forwards
 the freshly-minted `JOB_ID` into the agent's prompt** (so the agent calls
 `publish_event.py --job <JID>` with the right id — see Pitfall §"Wrong job_id
 propagated to the agent"). When you need finer control, the manual flow is:
 ```bash
 # Manual 5-step (same outcome, more knobs)
 PY=.venv/bin/python
 SKILL=./.agents/skills/multi-agent-mux-delegate-job/scripts
 # 1) register
 JID=$($PY "$SKILL/registry.py" register \
        --prompt "…" --agent claude-code --agent-session tmux:demo \
        --timeout 3600 --idle-timeout 120)
 # 2) START THE SUBSCRIBER FIRST (MQTT does not queue non-retained msgs)
 $PY "$SKILL/job_subscriber.py" --job "$JID" --timeout 3600 --idle-timeout 120 &
 # 3) pass JID to the agent and instruct it to publish events with --job "$JID"
 #    (don't hard-code a job id you saw earlier — see Pitfall §"Wrong job_id")
 # 4) on completion the subscriber prints events and exits 0/1/2
 # 5) inspect any time
 $PY "$SKILL/registry.py" get       --job "$JID"
 $PY "$SKILL/registry.py" logs      "$JID"        # positional job id
 $PY "$SKILL/registry.py" logs --list
 ```
 ## Job Protocol
 One topic per job: `python/mqtt/jobs/<job_id>/events`. Payload (JSON, UTF-8,
 `schema_version=1`):
 ```json
 { "schema_version": 1, "seq": 7, "job_id": "abc12345",
  "event": "started|permission_required|progress|completed|error",
  "timestamp": "2026-06-19T09:32:00Z", "detail": "generalised text",
  "data": { "optional": "metadata" } }
 ```
 - `seq` is monotonic per job (first = 1); the subscriber uses it to spot
  reorder/duplication.
 - `timestamp` is advisory — timeouts are measured from **receive** time.
 - `detail`/`data` carry **no** secrets or absolute paths.
 - A `schema_version` or `job_id` mismatch is **dropped** (defensive parsing).
 `started` and `completed`/`error` are the mandatory bookends; `completed`→exit 0,
 `error`→exit 1. Full catalogue + production `auth_token` handling:
 [`job-protocol.md`](./job-protocol.md).
 ## Registry Format
 ```
 .mam/jobs/<id>.json        # metadata record (single source of truth)
 .mam/jobs/<id>.events.log  # append-only JSON-lines log (debug, optional)
 .mam/jobs/.lock            # fcntl advisory lock for the registry
 ```
 The record holds `status`, `prompt`, `agent`, `agent_session`, a `broker` block,
 `topic_prefix`, `timeout_sec`/`idle_timeout_sec`, `expected_artifacts`,
 `last_seq`, and (production) `auth_token`. Because the `broker` block lives in
 the record, `publish_event.py` connects from the registry alone. Concurrency,
 the atomic rename trick, and multi-session job claiming are in
 [`registry.md`](./registry.md).
 ## Audit Logs
 Every job's lifecycle is mirrored to a **persistent, append-only audit log**
 under `.mam/delegate_job_logs/` (override with `DELEGATE_JOB_LOGS_DIR`;
 default `<cwd>/.mam/delegate_job_logs`). Unlike the registry — live state
 mutated in place and liable to be cleaned up — the audit log is durable
 history you can replay after the fact. It is git-ignored.
 ```
 .mam/delegate_job_logs/<job_id>/
  meta.json      # registration snapshot: prompt, agent, broker, timeouts, …
  events.ndjson  # append-only, one JSON event per line, in time order
  status.json    # current status only (fast point-query)
 ```
 **What is logged, automatically:**
 | When | `events.ndjson` line | Written by |
 |------|----------------------|------------|
 | job registered | `registered` (also seeds meta.json + status.json) | `registry.register_job` |
 | any status change | `status_changed` (`from`/`to`; also rewrites status.json) | `update_job_status`, `pick_pending` |
 | event published | `published` (carries the exact payload — reproducible) | `publish_event.py` |
 | event received | `received` (subscriber's external view) | `job_subscriber.py` |
 Both the emitter side (`published`) and the observer side (`received`) are
 recorded, so a dropped publish or a missed receive is still visible from the
 other. Every write is **best-effort and isolated** — an fcntl-locked append
 guarded by `try/except` that only ever emits a `logger.warning`, so a logging
 failure can never break a publish, a subscribe, or a registry write. stdout is
 never touched.
 **Reading them:**
 ```bash
 multi-agent-mux-delegate-job logs <job_id>     # pretty-print one job's timeline
 multi-agent-mux-delegate-job logs --list       # summarise every logged job (with live status)
 # or directly via the registry CLI:
 $PY scripts/registry.py logs <job_id> [--tail N] [--json]
 $PY scripts/registry.py logs --list [--json]
 ```
 `submit` prints the job's audit-log directory as its last stdout line, so a
 caller can `tail -n1` to locate it.
 ## Broker Setup
 | Stage | Broker | Auth | Transport |
 |-------|--------|------|-----------|
 | PoC | `broker.hivemq.com` | none | 1883 plaintext |
 | Production | self-hosted Mosquitto/EMQX | user/pass + ACL | 8883 TLS |
 All connection settings come from env (`MQTT_BROKER`, `MQTT_PORT`, `MQTT_TLS`,
 `MQTT_USERNAME`/`MQTT_PASSWORD`, `MQTT_CA_CERTS`, …) resolved by
 `broker_config_from_env()`, with the registry `broker.*` block overriding per
 job. Moving to your own broker is **config only**: install Mosquitto, set
 `persistence true` + `acl_file` + `password_file` + a TLS `listener 8883`, grant
 the worker `write python/mqtt/jobs/+/events` and Hermes `read`, then flip
 `MQTT_TLS=1` and fill the registry `broker.*`. Step-by-step (conf, ACL,
 `mosquitto_passwd`, self-signed/private-CA certs, cut-over verification):
 [`mqtt-broker-setup.md`](./mqtt-broker-setup.md).
 ## Agent Adapters
 Each agent voluntarily follows the contract: receive a `JOB_ID` (or registry
 path), call `publish_event.py` at lifecycle points, exit 0/1/2. **The contract
 in one line**: every event call uses `--job "$JOB_ID"` where `$JOB_ID` is the
 **freshly-issued id from the registry record for *this* delegation** — never a
 job_id you saw in an earlier session (Pitfall §"Wrong job_id propagated to the
 agent").
 - **claude-code** — Claude Code calls `publish_event.py` via its Bash tool at
  lifecycle points. `submit` pastes a prompt that already names `$JOB_ID` into
  the agent's tmux session; if you drive claude manually, hand it the id
  explicitly. Reference instruction block (the wrapper injects something
  equivalent):
  ```text
  Your job_id is "$JOB_ID" (read it from the registry record for this delegation —
  do not reuse any job_id you saw before).
  On start:        $PY multi-agent-mux-delegate-job/scripts/publish_event.py --job "$JOB_ID" --event started
  On permission:   $PY … --job "$JOB_ID" --event permission_required --detail "<tool>:<what>"
  On progress:     $PY … --job "$JOB_ID" --event progress --detail "<short status>"
  On success:      $PY … --job "$JOB_ID" --event completed --detail "<one-line summary>"
  On failure:      $PY … --job "$JOB_ID" --event error     --detail "<one-line reason>"
  Task: <the user's prompt>
  The subscriber for "$JOB_ID" is already running; your completed/error event
  ends the job. Exit codes: 0 completed, 1 error, 2 publish failure.
  ```
  See [claude-code](../claude-code/SKILL.md) for tmux orchestration patterns.
 - **codex** — same contract. Invoke `codex exec "<instruction-block-above>"` or
  wire `publish_event.py` as an MCP tool so the agent can call it directly.
 - **opencode** — wire `publish_event.py` as a tool/command the agent can call;
  identical event points.
 - **human** — a person does the work, reads the registry record, then runs
  `publish_event.py --job <id> --event completed` (or `error`) by hand.
 ## User Interface
 The [`multi-agent-mux-delegate-job`](./multi-agent-mux-delegate-job) bash wrapper bundles register +
 subscribe-first + run-agent + validate:
 ```bash
 multi-agent-mux-delegate-job submit  --agent claude-code \
   --prompt "정렬 문제 10개를 만들어 sort_problems.md로 저장" \
    --workdir /path/to/project --timeout 3600 [--validate ./validate.sh]
 multi-agent-mux-delegate-job status  --job <id>          # one record, pretty-printed
 multi-agent-mux-delegate-job list                        # all jobs, one line each
 multi-agent-mux-delegate-job verify  --job <id> --validate ./validate.sh   # runs it, reports exit code
 multi-agent-mux-delegate-job wait    [--job <id>]        # block until terminal (else --wait-any)
 ```
 `submit` **always starts the subscriber before the agent** (the ordering
 dependency), pastes the instructions into an already-running agent tmux session
 (the wrapper is tmux-interactive only; the former one-shot `--mode print` was
 removed), and calls `--validate` afterward if given. The skill automates job-id
 generation, registry creation, broker resolution, subscriber-first ordering,
 the hand-off to the agent session, and completion detection; it does **not**
 automate the agent's internals or your business-logic validation — those are
 hooks you fill (`validate.sh` reads `$JOB_ID`/`$REGISTRY_DIR`).
 ## Common Pitfalls
 - **Publishing before subscribing** — MQTT does not queue non-retained messages
  for absent subscribers. Start `job_subscriber.py` *before* the agent, or rely
  on retained terminal events (production). `submit` enforces this.
 - **Wrong job_id propagated to the agent** — the wrapper prints a fresh `JOB_ID`
  on every `submit`. If your agent instruction (or the wrapper's prompt template)
  hard-codes an old job_id, the agent calls `publish_event.py --job <wrong>`,
  the subscriber's defensive parser drops it as a `job_id` mismatch, and the
  delegator waits until idle timeout (exit 2). Fix: instruct the agent to
  **read the job_id from the registry record for *this* delegation** (or pass it
  in via env / `--prompt` interpolation), never from prior runs. `submit`'s
  default prompt template interpolates `$JOB_ID` for you — if you build a custom
  prompt, do the same.
 - **tmux session missing or shared** — `submit` derives the target session
  name from `--agent-session tmux:<name>` (default `tmux:claude`) and pastes the
  instructions into that *already-running* session; it never creates one. If no
  session with that name exists the wrapper aborts with an error and the agent
  never receives the task — start the agent first (e.g. via
  `multi-agent-mux-create` / `multi-agent-mux-resume`). Pick a distinct
  `--agent-session` per concurrent delegation (e.g. `tmux:demo`,
  `tmux:claude-a`, `tmux:claude-b`) so two jobs are not pasted into the same
  pane.
 - **Timeout before `started`** — a cold-starting agent may not emit `started`
  for a while; the wall-clock timeout starts at subscribe time so a stuck agent
  still terminates. Don't set `--timeout` so low you false-positive a slow start.
 - **No retry on publish** — a dropped `completed` would hang the delegator
  forever; `publish_event.py` retries with exponential backoff and exits 2 if it
  still fails, so the delegator is never left waiting silently.
 - **QoS-1 duplicates / reorders** — a terminal event can arrive twice, or
  `error` can trail `completed`; the subscriber's terminal state machine
  finalises each job once and ignores the rest.
 - **Trusting the public broker** — anyone can publish there; never make a real
  decision on a PoC signal. Add `auth_token` + an authenticated broker first.
 - **Secrets in `detail`/`data`** — keep payloads generalised; no paths, keys, or
  tokens. The only security field allowed in `data` is the production
  `hmac_sig` signature; the raw per-job `auth_token` itself never goes on the
  wire.
 ## Subagent Orchestration Pattern
 When using this skill from a Hermes `delegate_task` subagent to dispatch work to
 a coding-agent CLI (agy/claude) running in a tmux session, the following pattern
 has been verified (2026-06-21, 6-batch refactoring sprint):
 ### Roles
 - **Main worker** (implementation): one agent session (e.g. `agy-new`) receives
  brief files and executes code changes.
 - **Reviewers** (spec compliance + code quality): two other agent sessions
  (e.g. `agy-existing`, `claude-existing`) review the diff in parallel.
 - **Hermes** (orchestrator): dispatches subagents, verifies diffs, commits,
  and falls back to direct fixes when reviewers find issues.
 ### Key lessons learned
 1. **Brief delivery via file path** — don't paste long briefs inline via
   `tmux send-keys`; the TUI may swallow them. Instead, send a short instruction
   like "follow /tmp/batch1-brief.md" and let the agent read the file.
 2. **Polling vs MQTT subscriber** — for short tasks (<5min), pane polling
   (`capture-pane` + grep for completion markers) is simpler and more reliable
   than registering a job via `registry.py` + `job_subscriber.py`. Use MQTT
   subscriber only for long-running jobs (>5min) where push notification matters.
 3. **Reviewers catch different bugs** — in practice, agy (Flash) caught
   semantic issues (slash matching, export scope), while claude (Opus) caught
   API signature mismatches (paho v2 5-arg vs 4-arg `on_disconnect`). Two
   reviewers with different models provide complementary coverage.
 4. **Hermes fallback fix** — when reviewers find a small, well-defined issue
   (wrong argument count, missing slash), Hermes should fix it directly rather
   than re-dispatching the implementer. This saves a full round-trip.
 5. **Batch grouping** — group 2-3 FW items per batch when they touch different
   files (no file overlap). This amortises the dispatch overhead. Items touching
   the same file must be in separate batches to avoid conflicts.
 6. **Pane Snapshots & Truncation Prevention** — to prevent long agent responses from being scrolled out and truncated due to TUI viewport limitations, enforce the following snapshotting pattern:
   - Immediately after dispatching a brief, capture the pre-brief pane buffer via `capture-pane -S -200`.
   - During long execution, run a background loop taking incremental snapshots (e.g. every 30 seconds `>> /tmp/pane-snap.txt`).
   - Immediately after job termination, capture the entire final pane state to ensure no terminal logs are lost.
 ## Verification Checklist
 - [ ] `started` → `completed` over the public broker: subscriber prints the
      lines and exits **0**.
 - [ ] `error` path: subscriber exits **1**.
 - [ ] timeout path: no terminal event within `--timeout`/`--idle-timeout` →
      exit **2**.
 - [ ] polluted payload (bad JSON, wrong `schema_version`, wrong `job_id`) is
      dropped with a warning, not crashed on.
 - [ ] one tmux session processes two registry jobs in sequence; a second
      session with a different `agent_session` claims only its own.
 - [ ] broker cut-over: same scripts reach an authenticated TLS broker with env
      changes only; a credential without write ACL is rejected; a late
      subscriber still receives the retained terminal event.
 - [ ] `publisher.py`/`subscriber.py`/`README.md` demo on `python/mqtt/sample`
      still works unchanged (regression).
 - [ ] **audit log integrity** — for a completed job,
      `.mam/delegate_job_logs/<JID>/events.ndjson` contains `registered` →
      `received started` → `published completed` (in that order), and
      `status.json.status == "completed"` matches the registry record. A
      logging failure (e.g. read-only log dir) does not break the publish or
      subscribe path — only a `logger.warning` is emitted.
 - [ ] **end-to-end demo smoke** — run
      `multi-agent-mux-delegate-job submit --agent claude-code --agent-session tmux:demo-smoke
       --prompt "echo hello and call publish_event.py --job <JID>
       --event completed" --timeout 120` and confirm
      (a) registered job id echoed, (b) subscriber pid echoed, (c) tmux session
      name printed, (d) `events.ndjson` grows as the agent runs, (e) final
      stdout line is the audit-log dir.
@@ -0,0 +1,114 @@
 # Job Event Protocol
 The wire contract every multi-agent-mux-delegate-job agent (claude-code, codex, opencode,
 human, …) speaks. One job → one MQTT topic → JSON event payloads. Stable across
 the PoC (public broker) and production (own broker) stages; only transport
 hardening changes, never the payload shape.
 Reference implementation: [`./scripts/publish_event.py`](./scripts/publish_event.py)
 (emit) and [`./scripts/job_subscriber.py`](./scripts/job_subscriber.py) (observe).
 ---
 ## 1. Topic design
 | Topic | Purpose |
 |-------|---------|
 | `python/mqtt/sample` | Legacy demo topic — **never changed** (README compat). |
 | `python/mqtt/jobs/<job_id>/events` | Per-job event stream (this protocol). |
 - One topic per job, JSON payload, `event` field discriminates the type.
 - Single-direction publish only (worker → observer). No request/response.
 - Future split is reserved but not required:
  `<job_id>/events`, `<job_id>/logs`, `<job_id>/artifacts`.
 - `topic_prefix` is stored in the job record so publishers resolve the topic
  from the registry alone (`<topic_prefix>/events`).
 ---
 ## 2. Payload schema (JSON, UTF-8, `schema_version = 1`)
 ```json
 {
  "schema_version": 1,
  "seq": 7,
  "job_id": "abc12345",
  "event": "started | permission_required | progress | completed | error",
  "timestamp": "2026-06-19T09:32:00Z",
  "detail": "generalised, whitelisted human-readable string",
  "data": { "optional": "metadata" }
 }
 ```
 | Field | Rule |
 |-------|------|
 | `schema_version` | If publisher/subscriber disagree, the subscriber **drops** the event with a warning (defensive parsing). |
 | `seq` | Monotonic **per `job_id`**, first publish = 1. Lets the subscriber detect reorder/duplication. Persisted in the registry (`last_seq`) so it survives restarts. |
 | `job_id` | Subscriber drops any event whose `job_id` it did not subscribe for. |
 | `timestamp` | Publisher host clock, **advisory only**. The delegator's timeout is measured from *receive* time, not this field. |
 | `detail` | Generalised text only. **No absolute paths, keys, or tokens.** |
 | `data` | Optional metadata. Production may add `hmac_sig`, `build_id`, etc. |
 ---
 ## 3. Event catalogue
 | event | When emitted | `detail` example | seq |
 |-------|--------------|------------------|-----|
 | `started` | Agent first picks up the job | `"Job a1b2c3d4 started"` | 1 |
 | `permission_required` | Agent needs a tool/permission grant | `"needs to write sort_problems.md"` | as it happens |
 | `progress` | Optional intermediate checkpoint | `"creating problem 5/10"` | as it happens |
 | `completed` | Successful terminal state | `"saved to sort_problems.md"` | last |
 | `error` | Failure / exception terminal state | `"internal error, see logs"` | last |
 `started` and `completed`/`error` are mandatory bookends; `permission_required`
 and `progress` are optional. `detail` must stay on the whitelist of generalised
 phrasings — never leak secrets through it.
 ### Terminal semantics
 - `completed` → subscriber exits 0; `error` → exits 1.
 - The subscriber runs a **terminal state machine**: it finalises a job on the
  first `completed`/`error` it sees and ignores any later terminal event for
  that job (QoS-1 duplicate, or an `error`-after-`completed` reorder). When all
  watched jobs are finalised it exits.
 - Wall-clock timeout *or* idle timeout before a terminal event → exit 2.
 ---
 ## 4. Production hardening (own broker stage)
 The payload shape is unchanged; the transport and trust model tighten. See
 [`mqtt-broker-setup.md`](./mqtt-broker-setup.md) for the broker side.
 - **Auth / ACL** — username/password + per-topic ACL. `jobs/+/events` publish is
  granted to the worker credential, subscribe to the Hermes credential.
 - **HMAC Signature Verification (`data.hmac_sig`)** — to authenticate the publisher and verify message integrity without exposing the raw secret token over the wire, each job record contains a per-job `auth_token` (`secrets.token_urlsafe(32)`). The publisher computes an HMAC-SHA256 signature over the serialized payload (excluding `data.hmac_sig` itself) using the `auth_token` as the key, and appends it to **`data.hmac_sig`**. The subscriber reconstructs this signature and **drops any message that does not match or lacks a valid signature**.
  ```json
  { "...": "...", "data": { "hmac_sig": "d2f3...", "build_id": "42" } }
  ```
 - **TLS** — port 8883 + private CA. Toggled with `MQTT_TLS=1` (+ `MQTT_CA_CERTS`);
  no code change.
 - **Retained terminal events** — `completed`/`error` publish with `retain=True`
  so a subscriber that joins late immediately receives the last terminal state
  instead of a stale view. The reference publisher auto-retains terminal events;
  `--retained` forces it for any event.
 - **Dual timeouts** — total wall-clock budget + last-activity idle detection,
  both measured from receive time.
 - **Clock trust** — never trust the payload `timestamp` for timeout decisions.
 ---
 ## 5. Why a public broker is PoC-only
 On `broker.hivemq.com` anyone can publish/subscribe the same topic. Therefore:
 - No secret data in payloads.
 - `started`/`completed`/`error` are *signals*, never a basis for a security
  decision.
 - Non-retained messages are **not queued** for absent subscribers — start the
  subscriber **before** the agent (ordering dependency), or rely on retained
  terminal events in production.
 - Real operational decisions belong to the own-broker stage with auth + ACL.
@@ -0,0 +1,176 @@
 # MQTT Broker Setup — PoC → Production
 The multi-agent-mux-delegate-job scripts read **all** broker settings from environment
 variables (or a job record's `broker.*` block) through a single helper,
 `broker_config_from_env()` in
 [`./scripts/mqtt_common.py`](./scripts/mqtt_common.py). The design goal:
 **switch from the public PoC broker to your own broker with config only — no
 code change.**
 | Env var | Meaning | PoC default | Production |
 |---------|---------|-------------|-----------|
 | `MQTT_BROKER` | host | `broker.hivemq.com` | internal hostname/IP |
 | `MQTT_PORT` | port | `1883` | `8883` (TLS) |
 | `MQTT_TLS` | TLS on/off (`1`/`0`) | `0` | `1` |
 | `MQTT_USERNAME` / `MQTT_PASSWORD` | auth | (none) | broker-issued |
 | `MQTT_CA_CERTS` | CA bundle path | (none) | private CA path |
 | `MQTT_CERTFILE` / `MQTT_KEYFILE` | client cert (optional mTLS) | (none) | per-client |
 | `MQTT_CLIENT_ID_PREFIX` | client id prefix | `hermes` | per-environment |
 ---
 ## 1. PoC: public broker (`broker.hivemq.com`)
 **Pros** — zero setup, reachable from anywhere, perfect for wiring up the
 publish/subscribe loop and the timeout/state-machine logic.
 **Cons / accepted assumptions** — no auth, no integrity, shared with the world:
 - no secrets in payloads;
 - `started`/`completed`/`error` are advisory signals only;
 - non-retained messages are **not queued** for absent subscribers, so the
  subscriber must start before the agent;
 - a re-subscribing client cannot recover past (non-retained) events.
 Use it only to validate the protocol, never for real decisions.
 ---
 ## 2. Production: self-hosted Mosquitto (or EMQX)
 Both support MQTT 5 + ACL + TLS. Mosquitto shown below; EMQX is a drop-in for
 the same env vars.
 ### 2.1 Install
 ```bash
 # macOS
 brew install mosquitto
 # Debian/Ubuntu
 sudo apt-get update && sudo apt-get install -y mosquitto mosquitto-clients
 # Docker
 docker run -d --name mosquitto -p 8883:8883 \
  -v "$PWD/mosquitto.conf:/mosquitto/config/mosquitto.conf" \
  -v "$PWD/certs:/mosquitto/certs" \
  -v "$PWD/auth:/mosquitto/auth" \
  eclipse-mosquitto:2
 ```
 ### 2.2 `mosquitto.conf` (key lines)
 ```conf
 persistence true
 persistence_location /mosquitto/data/
 password_file /mosquitto/auth/passwd
 acl_file      /mosquitto/auth/acl
 allow_anonymous false
 listener 8883
 cafile   /mosquitto/certs/ca.crt
 certfile /mosquitto/certs/server.crt
 keyfile  /mosquitto/certs/server.key
 ```
 `persistence true` + QoS 1 + retained terminal events means a subscriber that
 joins after a job finished still sees the final `completed`/`error`.
 ### 2.3 Users (username/password)
 ```bash
 # create the file with the first user (-c), then add further users without -c
 mosquitto_passwd -c /mosquitto/auth/passwd hermes        # subscriber/delegator
 mosquitto_passwd    /mosquitto/auth/passwd claude-worker # publisher/agent
 # (omit -c after the first; -c truncates the file)
 ```
 ### 2.4 ACL — least privilege
 The worker only **publishes** events; Hermes only **subscribes**:
 ```conf
 # /mosquitto/auth/acl
 # claude-worker: may publish job events, may not read others' streams
 user claude-worker
 topic write python/mqtt/jobs/+/events
 # hermes: observes every job's events
 user hermes
 topic read python/mqtt/jobs/+/events
 # keep the legacy demo topic usable for both, if desired
 pattern readwrite python/mqtt/sample
 ```
 ### 2.5 TLS certificates
 **Quick self-signed (single host, internal only):**
 ```bash
 mkdir -p certs && cd certs
 openssl req -x509 -newkey rsa:2048 -nodes -days 825 \
  -keyout server.key -out server.crt \
  -subj "/CN=mqtt.internal"
 cp server.crt ca.crt   # clients trust this as the CA bundle
 ```
 **Private CA (recommended — separate CA from server cert):**
 ```bash
 # 1) CA
 openssl genrsa -out ca.key 4096
 openssl req -x509 -new -nodes -key ca.key -days 3650 -out ca.crt -subj "/CN=Hermes-CA"
 # 2) server cert signed by the CA
 openssl genrsa -out server.key 2048
 openssl req -new -key server.key -out server.csr -subj "/CN=mqtt.internal"
 openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial \
  -out server.crt -days 825
 ```
 Clients trust `ca.crt` via `MQTT_CA_CERTS=/path/to/ca.crt`.
 ---
 ## 3. Cut-over verification (config-only, no code change)
 Goal: prove the **same scripts** talk to your broker by changing only env/registry.
 ```bash
 # 1) point the env at the new broker
 export MQTT_BROKER=mqtt.internal
 export MQTT_PORT=8883
 export MQTT_TLS=1
 export MQTT_CA_CERTS=$PWD/certs/ca.crt
 export MQTT_USERNAME=hermes
 export MQTT_PASSWORD=…            # subscriber side
 # (publisher side uses claude-worker creds via the job record's broker block)
 # 2) sanity-check with the mosquitto CLI first
 mosquitto_sub -h "$MQTT_BROKER" -p 8883 --cafile "$MQTT_CA_CERTS" \
  -u hermes -P "$MQTT_PASSWORD" -t 'python/mqtt/jobs/+/events' -v &
 # 3) run the unchanged multi-agent-mux-delegate-job loop
 PY=.venv/bin/python
 JID=$($PY scripts/registry.py register --prompt "broker cutover smoke")
 $PY scripts/job_subscriber.py --job "$JID" --timeout 30 &
 sleep 3
 $PY scripts/publish_event.py --job "$JID" --event started
 $PY scripts/publish_event.py --job "$JID" --event completed   # auto-retained
 ```
 Expected:
 - subscriber prints the `started` and `completed` lines and exits 0;
 - `mosquitto_sub` shows the same events (ACL allows `hermes` to read);
 - publishing as a credential **without** write ACL is rejected by the broker;
 - a subscriber started *after* `completed` still receives it (retained).
 If all four hold, the migration is config-only. Persist the broker block into
 each job record so `publish_event.py` connects from the registry alone:
 ```json
 "broker": { "host": "mqtt.internal", "port": 8883, "tls": true,
            "username": "claude-worker", "password": "…" }
 ```
@@ -0,0 +1,277 @@
 #!/usr/bin/env bash
 # multi-agent-mux-delegate-job — user-facing orchestrator for the multi-agent-mux-delegate-job skill.
 #
 # Subcommands:
 #   submit   register a job, start the subscriber FIRST, then hand the task to
 #            the agent's tmux session, then (optionally) run a validation script.
 #   status   show one job record.
 #   list     list all jobs.
 #   verify   run a user-supplied --validate script against a job's artifacts.
 #   wait     block until all running/pending jobs reach a terminal state.
 #   logs     show one job's persistent audit log, or list all logged jobs.
 #
 # This is a reference wrapper: it shells out to the python scripts that live
 # next to it. Copy it into your project and customise as needed. With --dry-run
 # `submit` only prints the instructions it would send; a real `submit` needs
 # tmux and an already-running agent session and fails with an error otherwise.
 #
 # Strict mode: abort on command failure, unset variables and failed pipelines.
 set -euo pipefail
 # Absolute directory of this script; the python helpers live in scripts/ below it.
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 # Pick the Python interpreter used to run the helper scripts and print its
 # path on stdout.
 #
 # Resolution order:
 #   1. $DELEGATE_JOB_PYTHON, when set and non-empty
 #   2. <WORKDIR>/.venv/bin/python (WORKDIR falls back to "." when unset)
 #   3. .venv/bin/python under the current directory (printed as absolute path)
 #   4. python3 from PATH
 # The chosen interpreter must be able to `import paho.mqtt`; otherwise an
 # install hint is written to stderr and the calling (sub)shell exits with 1.
 # Globals: reads DELEGATE_JOB_PYTHON, WORKDIR, SCRIPT_DIR.
 pick_python() {
  local py_bin
  # Resolve the fallback once so the -x test and the assignment always agree;
  # expanding a bare ${WORKDIR} would abort under `set -u` when it is unset.
  local workdir="${WORKDIR:-.}"
  if [[ -n "${DELEGATE_JOB_PYTHON:-}" ]]; then
    py_bin="$DELEGATE_JOB_PYTHON"
  elif [[ -x "$workdir/.venv/bin/python" ]]; then
    py_bin="$workdir/.venv/bin/python"
  elif [[ -x ".venv/bin/python" ]]; then
    py_bin="$(pwd)/.venv/bin/python"
  else
    py_bin="python3"
  fi
  # Fail early with an actionable hint instead of a traceback from the helpers.
  if ! "$py_bin" -c "import paho.mqtt" 2>/dev/null; then
    echo "ERROR: paho-mqtt package is missing for $py_bin." >&2
    echo "       Please create a virtual environment and install it:" >&2
    echo "       python3 -m venv .venv && .venv/bin/pip install -r \"$SCRIPT_DIR/requirements.txt\"" >&2
    exit 1
  fi
  echo "$py_bin"
 }
 # Default job-registry directory (a relative path); it seeds REGISTRY_DIR and
 # can be overridden per invocation with --registry-dir.
 REGISTRY_DIR_DEFAULT=".mam/jobs"
 # Print the command-line synopsis for every subcommand to stdout.
 # The heredoc delimiter is quoted, so the text is emitted literally with no
 # variable or command expansion.
 usage() {
  cat <<'EOF'
 multi-agent-mux-delegate-job <command> [options]
  submit  --agent <name> --prompt <text> [--workdir <dir>] [--agent-session <label>]
          [--timeout <sec>] [--idle-timeout <sec>] [--validate <script>]
          [--registry-dir <dir>] [--dry-run]
          # The skill is tmux-interactive only; --mode print was removed.
  status  --job <id> [--registry-dir <dir>]
  list    [--registry-dir <dir>]
  verify  --job <id> --validate <script> [--registry-dir <dir>]
  wait    [--job <id>] [--timeout <sec>] [--registry-dir <dir>]
  logs    <job_id> | --list      # persistent audit log (delegate_job_logs/)
 EOF
 }
 # ---- option defaults (overwritten by parse_opts) ---------------------------
 # Which agent receives the task, and the task text itself.
 AGENT="claude-code"
 PROMPT=""
 # Where the job runs and which tmux session hosts the agent.
 WORKDIR="$(pwd)"
 AGENT_SESSION="tmux:claude"
 # Wall-clock and idle budgets, in seconds.
 TIMEOUT=3600
 IDLE_TIMEOUT=120
 # Optional validation script and the dry-run switch (0 = off, 1 = on).
 VALIDATE=""
 DRY_RUN=0
 # Job selection and registry location.
 JOB_ID=""
 REGISTRY_DIR="$REGISTRY_DIR_DEFAULT"
 # Parse the long options shared by every subcommand into the global option
 # variables (AGENT, PROMPT, WORKDIR, AGENT_SESSION, TIMEOUT, IDLE_TIMEOUT,
 # VALIDATE, JOB_ID, REGISTRY_DIR, DRY_RUN).
 #
 # Arguments: the subcommand's remaining command-line words.
 # Exits 1 (after printing usage) on an unknown option or when a value-taking
 # option is the last word. Without that check the missing value surfaces as a
 # cryptic "$2: unbound variable" under `set -u`.
 parse_opts() {
  local opt
  while [[ $# -gt 0 ]]; do
    opt="$1"
    case "$opt" in
      --dry-run)
        DRY_RUN=1
        shift
        continue
        ;;
      --agent|--prompt|--workdir|--agent-session|--timeout|--idle-timeout|--validate|--job|--registry-dir)
        # An empty string is a valid value; a missing word is not.
        if [[ $# -lt 2 ]]; then
          echo "option $opt requires a value" >&2
          usage
          exit 1
        fi
        ;;
      *)
        echo "unknown option: $opt" >&2
        usage
        exit 1
        ;;
    esac
    case "$opt" in
      --agent) AGENT="$2";;
      --prompt) PROMPT="$2";;
      --workdir) WORKDIR="$2";;
      --agent-session) AGENT_SESSION="$2";;
      --timeout) TIMEOUT="$2";;
      --idle-timeout) IDLE_TIMEOUT="$2";;
      --validate) VALIDATE="$2";;
      --job) JOB_ID="$2";;
      --registry-dir) REGISTRY_DIR="$2";;
    esac
    shift 2
  done
 }
 # submit: register a job, start its subscriber, hand the task to the agent
 # session, wait for the terminal event, then run the optional validation hook.
 #
 # Arguments: the options understood by parse_opts (--prompt is required).
 # Globals:   reads AGENT, PROMPT, AGENT_SESSION, TIMEOUT, IDLE_TIMEOUT,
 #            VALIDATE, DRY_RUN, SCRIPT_DIR; writes PY and JOB_ID; normalises
 #            WORKDIR and REGISTRY_DIR to absolute paths.
 # Outputs:   progress lines on stdout; the LAST stdout line is the audit-log
 #            directory for the job, so callers can scrape `tail -n1`.
 # NOTE(review): the printed default is $WORKDIR/delegate_job_logs — confirm it
 # matches the directory the python audit logger actually writes to.
 cmd_submit() {
  parse_opts "$@"
  [[ -n "$PROMPT" ]] || { echo "submit requires --prompt" >&2; exit 1; }
  # Make --workdir absolute BEFORE picking the interpreter: pick_python may
  # return "$WORKDIR/.venv/bin/python", which would stop resolving after the
  # `cd` below if WORKDIR were still relative.
  WORKDIR="$(cd -- "$WORKDIR" >/dev/null && pwd)" \
    || { echo "cannot enter workdir" >&2; exit 1; }
  PY="$(pick_python)"
  cd "$WORKDIR"
  mkdir -p "$REGISTRY_DIR"
  # The agent runs in its own tmux session whose cwd may differ from ours, so
  # every path that ends up in its instructions must be absolute.
  REGISTRY_DIR="$(cd -- "$REGISTRY_DIR" >/dev/null && pwd)"
  # 1) register job (prints the new job id)
  JOB_ID="$("$PY" "$SCRIPT_DIR/scripts/registry.py" --registry-dir "$REGISTRY_DIR" register \
      --prompt "$PROMPT" --agent "$AGENT" --agent-session "$AGENT_SESSION" \
      --timeout "$TIMEOUT" --idle-timeout "$IDLE_TIMEOUT")"
  [[ -n "$JOB_ID" ]] || { echo "ERROR: registry.py returned no job id" >&2; exit 1; }
  echo "registered job: $JOB_ID"
  # 2) START THE SUBSCRIBER FIRST (ordering dependency — MQTT does not queue
  #    non-retained messages for absent subscribers). A dry-run sends nothing,
  #    so it must not leave a background subscriber behind.
  local logf="$REGISTRY_DIR/$JOB_ID.subscriber.out"
  local sub_pid=""
  if [[ "$DRY_RUN" == "1" ]]; then
    echo "[dry-run] would start subscriber (log: $logf)"
  else
    "$PY" "$SCRIPT_DIR/scripts/job_subscriber.py" --registry-dir "$REGISTRY_DIR" \
        --job "$JOB_ID" --timeout "$TIMEOUT" --idle-timeout "$IDLE_TIMEOUT" \
        >"$logf" 2>&1 &
    sub_pid=$!
    echo "subscriber pid: $sub_pid (log: $logf)"
    sleep 1  # give the subscriber time to CONNACK + SUBSCRIBE before the agent runs
  fi
  # 3) build the instructions and hand them to the agent. The publish command
  #    is shell-quoted word by word because the agent executes it verbatim.
  local pub
  printf -v pub '%q %q --registry-dir %q --job %q' \
      "$PY" "$SCRIPT_DIR/scripts/publish_event.py" "$REGISTRY_DIR" "$JOB_ID"
  # NOTE: the agent MUST use --job "$JOB_ID" (the one we just minted). Hard-coding
  # an id from an earlier session is the #1 reason a delegated job sits idle and
  # times out (see SKILL.md "Wrong job_id propagated to the agent"). We make the
  # freshness explicit in the instruction header.
  local instructions="Your job_id is \"$JOB_ID\" (the one just registered for THIS delegation — read it from the registry record, do NOT reuse any job_id you saw in earlier runs).
 On start run:        $pub --event started.
 On permission/tool prompt run: $pub --event permission_required --detail '<tool>:<what>'.
 On progress (optional): $pub --event progress --detail '<short status>'.
 On success run:      $pub --event completed --detail '<one-line summary>'.
 On failure run:      $pub --event error     --detail '<one-line reason>'.
 The subscriber for this job_id is already running; your completed/error event ends the job. Exit codes: 0 completed, 1 error, 2 publish failure.
 Task: $PROMPT"
  run_agent "$JOB_ID" "$instructions"
  # Last stdout line: the persistent audit-log dir for this job (see SKILL.md
  # "Audit Logs").
  local logs_root="${DELEGATE_JOB_LOGS_DIR:-$WORKDIR/delegate_job_logs}"
  if [[ "$DRY_RUN" == "1" ]]; then
    # Nothing was sent and no subscriber is running: there is nothing to wait
    # for and no artifact to validate.
    if [[ -n "$VALIDATE" ]]; then
      echo "[dry-run] would run validation: $VALIDATE"
    fi
    echo "$logs_root/$JOB_ID"
    return 0
  fi
  # 4) block until the subscriber saw a terminal event (or timed out). Its exit
  #    status is deliberately not propagated; the output below shows the outcome.
  wait "$sub_pid" || true
  echo "subscriber output:"; cat "$logf" || true
  # 5) optional validation hook. run_agent only pastes the task into the tmux
  #    session and returns at once, so validating any earlier than this would
  #    inspect artifacts the agent has not produced yet.
  if [[ -n "$VALIDATE" ]]; then
    echo "running validation: $VALIDATE"
    local rc=0
    JOB_ID="$JOB_ID" REGISTRY_DIR="$REGISTRY_DIR" bash "$VALIDATE" || rc=$?
    if [[ "$rc" -eq 0 ]]; then
      echo "validation: PASS"
    else
      echo "validation: FAIL (exit $rc)"
    fi
  fi
  echo "$logs_root/$JOB_ID"
 }
 # Hand the instruction text to the agent that is already running in a tmux
 # session.
 #
 # Arguments: $1 - job id, $2 - full instruction text to paste.
 # Globals:   reads AGENT, AGENT_SESSION, DRY_RUN, TMUX_SERVER_NAME, PY,
 #            SCRIPT_DIR, REGISTRY_DIR.
 # Returns:   0 once the text was pasted (or for the human agent / dry-run);
 #            1 when tmux is not installed or the target session does not exist.
 #
 # The skill is INTERACTIVE-ONLY. We never invoke `claude -p` or any other
 # one-shot print mode, because:
 #   - claude -p exits the moment stdin is drained, so there's nothing to
 #     `tmux attach` to afterwards.
 #   - fire-and-forget via wrapper defeats the whole point of the audit log
 #     (you can't tell what happened if the agent crashes mid-turn).
 #   - the job registry already gives us an authoritative completion signal,
 #     so we don't need a wrapper-side exit code to know "done".
 # The instructions are pasted into the existing session through a tmux buffer
 # and submitted with Enter; the user can attach to the session to follow along.
 run_agent() {
  local job_id="$1" instructions="$2"
  if [[ "$AGENT" == "human" ]]; then
    echo "[human agent] complete the task, then run publish_event.py --event completed"
    return
  fi
  local sess="${AGENT_SESSION#tmux:}"
  if [[ "$DRY_RUN" == "1" ]]; then
    echo "[dry-run] would delegate task to running agent '$AGENT' in tmux session '$sess' with instructions:"
    echo "----"; echo "$instructions"; echo "----"
    return
  fi
  if ! command -v tmux >/dev/null 2>&1; then
    echo "ERROR: this skill requires tmux (interactive agent sessions)." >&2
    echo "       Install with: brew install tmux   (or your package manager)" >&2
    return 1
  fi
  # Keep the tmux invocation as an array so a server name is passed as one
  # argument instead of being re-split by the shell.
  local -a tmux_cmd=(tmux)
  if [[ -n "${TMUX_SERVER_NAME:-}" ]]; then
    tmux_cmd+=(-L "$TMUX_SERVER_NAME")
  fi
  if ! "${tmux_cmd[@]}" has-session -t "$sess" 2>/dev/null; then
    echo "ERROR: 에이전트 세션 '$sess'이 존재하지 않습니다. 작업을 위임하기 전에 먼저 에이전트 세션을 기동해 주세요." >&2
    echo "       팁: 'multi-agent-mux-resume' 또는 'multi-agent-mux-create'를 통해 에이전트를 먼저 생성할 수 있습니다." >&2
    return 1
  fi
  # If any tmux step below aborts the script (set -e), publish an `error`
  # event so the subscriber finishes immediately instead of idling to timeout.
  # The values are expanded and shell-quoted NOW, so the trap does not depend
  # on this function's locals at exit time, and the publish targets the same
  # registry directory as every other publish_event.py call.
  if [[ -n "${job_id:-}" && -n "${PY:-}" ]]; then
    local on_exit
    printf -v on_exit 'rc=$?; if [ "$rc" -ne 0 ]; then %q %q --registry-dir %q --job %q --event error --detail "agent bootstrap failed (exit $rc)"; fi' \
        "$PY" "$SCRIPT_DIR/scripts/publish_event.py" "$REGISTRY_DIR" "$job_id"
    trap "$on_exit" EXIT
  fi
  echo "살아있는 에이전트 세션 '$sess'에 작업을 위임합니다..."
  local buf="job_buf_$job_id"
  "${tmux_cmd[@]}" set-buffer -b "$buf" "$instructions"
  # -p wraps the paste in bracketed-paste codes when the application asked for
  # them, so the multi-line instructions arrive as ONE input rather than being
  # submitted line by line. It is a no-op for applications that did not.
  "${tmux_cmd[@]}" paste-buffer -p -b "$buf" -t "$sess"
  "${tmux_cmd[@]}" send-keys -t "$sess" C-m
  "${tmux_cmd[@]}" delete-buffer -b "$buf"
  echo "작업이 세션 '$sess'에 전송되었습니다. (연결하려면: ${tmux_cmd[*]} attach -t $sess)"
  trap - EXIT
 }
 # status: print the registry record of a single job.
 # Arguments: options for parse_opts; --job is mandatory.
 # Exits 1 with a message on stderr when no job id was supplied.
 cmd_status() {
  parse_opts "$@"
  if [[ -z "$JOB_ID" ]]; then
    echo "status requires --job" >&2
    exit 1
  fi
  PY="$(pick_python)"
  local registry_cli="$SCRIPT_DIR/scripts/registry.py"
  "$PY" "$registry_cli" --registry-dir "$REGISTRY_DIR" get --job "$JOB_ID"
 }
 # list: print every job in the registry via the registry CLI.
 # Arguments: options for parse_opts (only --registry-dir is used here).
 cmd_list() {
  parse_opts "$@"
  PY="$(pick_python)"
  local registry_cli="$SCRIPT_DIR/scripts/registry.py"
  "$PY" "$registry_cli" --registry-dir "$REGISTRY_DIR" list
 }
 # verify: run a user-supplied validation script for one job and exit with the
 # script's status.
 # Arguments: options for parse_opts; --job and --validate are both mandatory.
 # The script is run with bash and receives JOB_ID and REGISTRY_DIR in its
 # environment.
 cmd_verify() {
  parse_opts "$@"
  if [[ -z "$JOB_ID" ]]; then
    echo "verify requires --job" >&2
    exit 1
  fi
  if [[ -z "$VALIDATE" ]]; then
    echo "verify requires --validate <script>" >&2
    exit 1
  fi
  echo "verifying job $JOB_ID with $VALIDATE"
  local status=0
  JOB_ID="$JOB_ID" REGISTRY_DIR="$REGISTRY_DIR" bash "$VALIDATE" || status=$?
  if [[ "$status" -ne 0 ]]; then
    echo "verify: FAIL (exit $status)"
    exit "$status"
  fi
  echo "verify: PASS (exit 0)"
  exit 0
 }
 # logs: show the persistent audit log through registry.py's `logs` CLI.
 #
 # Usage: logs <job_id> [extra registry.py flags]
 #        logs --list   [extra registry.py flags]
 # Any words after the job id / --list (e.g. the --tail N and --json flags the
 # skill documentation lists for `registry.py logs`) are forwarded unchanged
 # instead of being silently dropped.
 # The audit log is read from $DELEGATE_JOB_LOGS_DIR (or
 # <cwd>/delegate_job_logs). Run from your project dir so the default resolves.
 # Exits 1 when neither a job id nor --list is given.
 cmd_logs() {
  PY="$(pick_python)"
  if [[ "${1:-}" == "--list" ]]; then
    shift
    "$PY" "$SCRIPT_DIR/scripts/registry.py" logs --list "$@"
  else
    local jid="${1:-}"
    [[ -n "$jid" ]] || { echo "logs requires <job_id> or --list" >&2; exit 1; }
    shift
    "$PY" "$SCRIPT_DIR/scripts/registry.py" logs "$jid" "$@"
  fi
 }
 # wait: run the subscriber in the foreground until a terminal state is seen.
 # With --job it watches that single job; without it the subscriber is started
 # with --wait-any. --timeout is passed through in both cases.
 cmd_wait() {
  parse_opts "$@"
  PY="$(pick_python)"
  local -a target
  if [[ -z "$JOB_ID" ]]; then
    target=(--wait-any)
  else
    target=(--job "$JOB_ID")
  fi
  "$PY" "$SCRIPT_DIR/scripts/job_subscriber.py" --registry-dir "$REGISTRY_DIR" \
      "${target[@]}" --timeout "$TIMEOUT"
 }
 # Dispatch the first command-line word to its cmd_<name> handler and pass the
 # remaining words along. No command or a help flag prints the usage text; an
 # unknown command prints an error plus usage and exits 1.
 main() {
  local command_name="${1:-}"
  if [[ $# -gt 0 ]]; then
    shift
  fi
  case "$command_name" in
    ""|-h|--help|help)
      usage
      ;;
    submit|status|list|verify|wait|logs)
      "cmd_$command_name" "$@"
      ;;
    *)
      echo "unknown command: $command_name" >&2
      usage
      exit 1
      ;;
  esac
 }
 # Script entry point: forward every command-line argument to main.
 main "$@"
@@ -0,0 +1,183 @@
 # Job Registry
 The registry is the **single source of truth** for delegated work. Job metadata
 (id, prompt, broker, status, timeouts) lives in files, **not** environment
 variables — so one tmux session can handle many jobs sequentially or in
 parallel without collisions, and `publish_event.py` / `job_subscriber.py` can
 reconstruct everything they need from the registry alone.
 Reference implementation: [`./scripts/registry.py`](./scripts/registry.py)
 (library + CLI) over the primitives in
 [`./scripts/mqtt_common.py`](./scripts/mqtt_common.py).
 ---
 ## 1. Directory layout
 ```
 .mam/jobs/
  <job_id>.json          # job metadata record (schema below)
  <job_id>.events.log    # append-only JSON-lines event log (debug, optional)
  .lock                  # registry-wide advisory lock file (exclusive fcntl.flock)
 ```
 `registry_dir` defaults to `.mam/jobs` and is overridable everywhere via
 `--registry-dir`.
 ---
 ## 2. Job record schema
 ```json
 {
  "schema_version": 1,
  "job_id": "abc12345",
  "status": "pending | running | completed | error | cancelled",
  "created_at": "2026-06-19T09:30:00Z",
  "updated_at": "2026-06-19T09:32:00Z",
  "prompt": "정렬 문제 10개를 만들어 sort_problems.md로 저장…",
  "agent": "claude-code",
  "agent_session": "tmux:claude",
  "broker": {
    "host": "broker.hivemq.com",
    "port": 1883,
    "tls": false,
    "username": null,
    "password": null
  },
  "topic_prefix": "python/mqtt/jobs/abc12345",
  "timeout_sec": 3600,
  "idle_timeout_sec": 120,
  "expected_artifacts": ["sort_problems.md"],
  "last_seq": 0,
  "auth_token": null
 }
 ```
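 - `broker` lets `publish_event.py` connect from the record alone. Environment
  variables only supply values for fields the record leaves `null` or omits
  (e.g. the TLS certificate paths or keepalive); any non-null value in the
  block — including `"tls": false` — takes precedence over the environment.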
 - `topic_prefix` → the events topic is `<topic_prefix>/events`.
 - `last_seq` backs the monotonic `seq` counter so it survives process restarts.
 - `expected_artifacts` is the hook a user `validate.sh` checks (existence/content).
 - `auth_token` is `null` in PoC; production sets `secrets.token_urlsafe(32)`.
 ---
 ## 3. Concurrency rules
 ### PoC — fcntl advisory lock
 Every read-modify-write (`register_job`, `pick_pending`, `update_status`,
 `next_seq`) runs inside `registry_lock(registry_dir)`, an exclusive
 `fcntl.flock` over `.lock`. Single-host, good enough for many tmux sessions on
 one machine.
 ### Production — SQLite WAL
 When delegation spans **multiple hosts**, the file lock no longer serialises
 across machines. Migrate the same operations to a SQLite database in WAL mode
 (`PRAGMA journal_mode=WAL`) with a transaction per claim. The function
 signatures stay identical; only the storage backend changes.
 ---
 ## 4. How multiple sessions take only their own work
 Each tmux session carries an `agent_session` label (`tmux:claude`,
 `tmux:claude-a`, `tmux:claude-b`, …). `pick_pending(agent_session)`:
 1. acquires the registry lock,
 2. scans for the **oldest** record with `status == "pending"` **and**
   matching `agent_session`,
 3. flips it to `running` and writes it back **atomically**,
 4. releases the lock and returns the `job_id` (or `None`).
 Because the scan + flip happen under one lock, two sessions can never claim the
 same job. Sessions with distinct labels naturally partition the work; sessions
 sharing a label compete safely — first to acquire the lock wins, the other sees
 the job already `running` and moves on.
 ```bash
 # session A only ever runs its own pending jobs
 $PY scripts/registry.py pick --agent-session tmux:claude-a   # prints id or exits 3
 ```
 ---
 ## 5. Atomic status updates
 All writes use a temp-file + `os.replace` rename, which is atomic on POSIX:
 1. take the registry lock,
 2. load the current record,
 3. mutate fields + refresh `updated_at` (and `last_seq` for `next_seq`),
 4. write to `.<job_id>.<rand>.tmp` in the **same directory**, `fsync`,
 5. `os.replace(tmp, <job_id>.json)`,
 6. release the lock.
 A reader therefore always sees either the old or the new complete record, never
 a half-written file. This is the file-based equivalent of the rename trick
 (`pending.<session>` → `running.<session>`) and maps cleanly onto a single
 SQLite transaction when you migrate.
 ---
 ## 6. CLI quick reference
 ```bash
 PY=.venv/bin/python
 $PY scripts/registry.py register --prompt "…" --agent claude-code \
    --agent-session tmux:claude --timeout 3600 --idle-timeout 120   # → prints job_id
 $PY scripts/registry.py list                                       # human table
 $PY scripts/registry.py list --json                                # full records
 $PY scripts/registry.py get    --job <id>                          # one record
 $PY scripts/registry.py status --job <id> --set completed          # set status
 $PY scripts/registry.py pick   --agent-session tmux:claude         # claim → running
 ```
 Exit codes: `0` ok, `1` not found / bad status, `3` (`pick`) no pending job for
 that session.
 ---
 ## 7. Persistent audit log
 Separate from the registry, every job is also mirrored to a durable append-only
 audit log at `.mam/delegate_job_logs/<job_id>/` (override with
 `DELEGATE_JOB_LOGS_DIR`, default `<cwd>/.mam/delegate_job_logs`). The registry
 is **live state** mutated in place; the audit log is **history** that survives
 even after the registry dir is cleaned up. It is git-ignored.
 ```
 .mam/delegate_job_logs/<job_id>/
  meta.json      # registration snapshot (the full job record at register time)
  events.ndjson  # append-only, one JSON event per line, time-ordered
  status.json    # current status only (fast point-query)
 ```
 `events.ndjson` lines are written automatically at four points:
 | Trigger | line `event` | Source |
 |---------|-------------|--------|
 | `register_job` | `registered` | `registry.register_job` → `mqtt_common.init_job_log` |
 | status change (`update_status`, `pick`, publish status sync) | `status_changed` (`from`/`to`) | `mqtt_common.update_job_status` / `pick_pending` |
 | event published | `published` (embeds the exact payload) | `publish_event.py` |
 | event received | `received` | `job_subscriber.py` |
 Helpers live in [`./scripts/mqtt_common.py`](./scripts/mqtt_common.py):
 `LOGS_DIR`, `job_log_path`, `init_job_log`, `append_event` (fcntl-locked,
 concurrent-append safe), `update_logged_status`, and the readers
 `read_logged_meta` / `read_logged_status` / `iter_logged_events` /
 `list_logged_jobs`. Every writer is **best-effort and isolated** — wrapped in
 `try/except` with a `logger.warning`, so an audit-log failure never breaks the
 registry write, the publish, or the subscribe it shadows.
 Read them via the CLI:
 ```bash
 PY=.venv/bin/python
 $PY scripts/registry.py logs <job_id>            # pretty timeline
 $PY scripts/registry.py logs <job_id> --tail 20  # last 20 events
 $PY scripts/registry.py logs <job_id> --json     # raw JSON lines
 $PY scripts/registry.py logs --list              # every job, live status
 ```
@@ -0,0 +1,2 @@
 paho-mqtt>=2.0.0
 pyyaml
@@ -0,0 +1,252 @@
 #!/usr/bin/env python3
 """job_subscriber.py — the single entry point for observing Job events.
 Subscribes to one job's ``<topic_prefix>/events`` (or, with ``--wait-any``, the
 events of every running/pending job in the registry), prints one line to stdout
 per accepted event, and exits on a terminal event or a timeout.
 Design points (all flagged in the PLAN review):
  - terminal state machine: ``completed``/``error`` is acted on exactly once per
    job, so QoS-1 duplicates or an ``error``-after-``completed`` reorder are safe.
  - dual timeouts: a wall-clock ``--timeout`` (total budget, started at
    subscribe time so a cold start can't hang forever) AND an idle
    ``--idle-timeout`` (no new event for N seconds).
  - defensive parsing: undecodable payloads, ``schema_version`` mismatches, and
    ``job_id`` values we did not subscribe for are logged and dropped.
 stdout = event lines only. Diagnostics go to stderr via logging.
 Exit codes:
  0  all watched jobs reached ``completed``
  1  any watched job reached ``error``
  2  timed out (wall-clock or idle) before all jobs finished, or there was
     nothing to watch (job record not found / no pending or running jobs)
 """
 from __future__ import annotations
 import argparse
 import json
 import logging
 import queue
 import sys
 import time
 from typing import Any, Dict, List, Optional, Set, Tuple
 import mqtt_common
 import registry
 from mqtt_common import (
    DEFAULT_REGISTRY_DIR,
    SCHEMA_VERSION,
    broker_config_from_job,
    load_job,
    make_client,
 )
 logger = logging.getLogger("delegate_job.job_subscriber")
 TERMINAL_EVENTS = ("completed", "error")
 def _format_line(topic: str, payload: Dict[str, Any]) -> str:
    return (
        f"{payload.get('timestamp','-')}  "
        f"job={payload.get('job_id','?')}  "
        f"seq={payload.get('seq','?')}  "
        f"{payload.get('event','?'):<20}  "
        f"{payload.get('detail','')}"
    )
 class _Watcher:
    """Holds the shared queue + the set of job_ids we accept events for.
    ``on_message`` is installed as the paho message callback. It validates each
    incoming payload and, when the payload passes every check, records it in the
    audit log and pushes ``(topic, payload)`` onto ``events`` for the main loop.
    """
    def __init__(self, expected_job_ids: Set[str], expected_tokens: Dict[str, Optional[str]]):
        """Args:
            expected_job_ids: job ids whose events are accepted.
            expected_tokens: job_id -> auth token handed to ``verify_hmac``
                (``None`` when the job has no token).
        """
        self.events: "queue.Queue[Tuple[str, Dict[str, Any]]]" = queue.Queue()
        self.expected = set(expected_job_ids)
        self.tokens = expected_tokens  # job_id -> expected auth_token (or None)
        # Highest seq accepted so far per job; 0 means "nothing seen yet".
        self.last_seq: Dict[str, int] = {jid: 0 for jid in expected_job_ids}
    def on_message(self, _client, _userdata, msg) -> None:
        """Validate one MQTT message and enqueue it if it is acceptable.
        Every rejected message is logged at WARNING level and dropped; this
        callback never raises for malformed input.
        """
        # --- defensive parsing -------------------------------------------
        try:
            payload = json.loads(msg.payload.decode("utf-8"))
        except (UnicodeDecodeError, json.JSONDecodeError) as exc:
            logger.warning("drop unparseable payload on %s: %s", msg.topic, exc)
            return
        if not isinstance(payload, dict):
            logger.warning("drop non-object payload on %s", msg.topic)
            return
        if payload.get("schema_version") != SCHEMA_VERSION:
            logger.warning("drop event with schema_version=%r (expected %d)",
                           payload.get("schema_version"), SCHEMA_VERSION)
            return
        jid = payload.get("job_id")
        # Type-check before the set lookup: a JSON array/object is unhashable
        # and ``in self.expected`` would raise TypeError inside the callback.
        if not isinstance(jid, str) or jid not in self.expected:
            logger.warning("drop event for unexpected job_id=%r on %s", jid, msg.topic)
            return
        # --- production auth check: data.auth_token must match if expected ---
        expected_token = self.tokens.get(jid)
        try:
            authentic = mqtt_common.verify_hmac(payload, expected_token)
        except (AttributeError, TypeError, ValueError) as exc:
            # A malformed signature block must count as a failed check, not
            # escape the callback as an exception.
            logger.warning("drop event for job %s: HMAC verify error: %s", jid, exc)
            return
        if not authentic:
            logger.warning("drop event for job %s: HMAC verify failed", jid)
            return
        # --- replay attack defense: check monotonic sequence ---
        seq = payload.get("seq")
        # bool is a subclass of int, so exclude it explicitly.
        if isinstance(seq, bool) or not isinstance(seq, int):
            logger.warning("drop event for job %s: missing or invalid seq", jid)
            return
        if seq <= self.last_seq.get(jid, 0):
            logger.warning("drop event for job %s: seq %d is not monotonically increasing (last %d)",
                           jid, seq, self.last_seq.get(jid, 0))
            return
        self.last_seq[jid] = seq
        # Persistent audit log from the *subscriber's* vantage point: every event
        # that survives defensive parsing is recorded here, including ones a
        # different host published. This is the external-observer record that
        # backstops the publisher's own "published" line if it never wrote one.
        mqtt_common.append_event(jid, {
            "event": "received",
            "source_event": payload.get("event"),
            "seq": payload.get("seq"),
            "topic": msg.topic,
            "timestamp": payload.get("timestamp"),
            "detail": payload.get("detail", ""),
        })
        self.events.put((msg.topic, payload))
 def _collect_jobs(args) -> List[Dict[str, Any]]:
    """Return the job records this run has to observe.
    With ``--job`` the single named record is loaded (``load_job`` raises
    ``FileNotFoundError`` when it is absent). With ``--wait-any`` every registry
    record whose status is ``pending`` or ``running`` is returned; an empty
    result is logged as an error and returned as an empty list.
    """
    if not args.wait_any:
        return [load_job(args.job, args.registry_dir)]  # raises FileNotFoundError
    active: List[Dict[str, Any]] = []
    for record in registry.list_jobs(args.registry_dir):
        if record.get("status") in ("pending", "running"):
            active.append(record)
    if not active:
        logger.error("no pending/running jobs to wait for")
    return active
 def main(argv=None) -> int:
    """CLI entry point: subscribe to job events and wait for terminal events.
    Args:
        argv: argument list for argparse; ``None`` lets argparse read
            ``sys.argv``.
    Returns:
        0 when every watched job ended with ``completed``, 1 when at least one
        ended with ``error``, 2 on a wall-clock or idle timeout, or when there
        is nothing to watch (job record not found / no pending or running job).
    """
    parser = argparse.ArgumentParser(description="Subscribe to Job events on MQTT")
    # Exactly one of --job / --wait-any must be given.
    target = parser.add_mutually_exclusive_group(required=True)
    target.add_argument("--job", help="job id to watch")
    target.add_argument("--wait-any", action="store_true",
                        help="watch every pending/running job in the registry")
    parser.add_argument("--timeout", type=float, default=None,
                        help="wall-clock budget in seconds (default: job.timeout_sec or 3600)")
    parser.add_argument("--idle-timeout", type=float, default=None,
                        help="max seconds with no new event (default: job.idle_timeout_sec or 120)")
    parser.add_argument("--expect-retention", action="store_true",
                        help="warn if no retained terminal event arrives promptly")
    parser.add_argument("--registry-dir", default=DEFAULT_REGISTRY_DIR)
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args(argv)
    mqtt_common.setup_logging(logging.DEBUG if args.verbose else logging.WARNING)
    try:
        jobs = _collect_jobs(args)
    except FileNotFoundError as exc:
        logger.error("%s", exc)
        return 2
    if not jobs:
        # --wait-any found nothing; _collect_jobs already logged the reason.
        return 2
    expected_ids: Set[str] = {j["job_id"] for j in jobs}
    tokens = {j["job_id"]: j.get("auth_token") for j in jobs}
    watcher = _Watcher(expected_ids, tokens)
    # Resolve timeouts from CLI, falling back to the (first) job's settings.
    base_job = jobs[0]
    wall_timeout = args.timeout if args.timeout is not None else float(base_job.get("timeout_sec", 3600))
    idle_timeout = args.idle_timeout if args.idle_timeout is not None else float(base_job.get("idle_timeout_sec", 120))
    # All watched jobs share a broker in practice; connect using the first
    # job's broker and subscribe to each job's events topic.
    config = broker_config_from_job(base_job)
    client = make_client("subscriber", config)
    client.on_message = watcher.on_message
    subscribed_topics = []
    for job in jobs:
        # Prefer the prefix stored in the record; derive it from the id otherwise.
        prefix = job.get("topic_prefix") or mqtt_common.topic_prefix_for(job["job_id"])
        subscribed_topics.append(f"{prefix}/events")
    def on_connect(_c, _u, _flags, reason_code, _props):
        # Subscriptions are issued here, i.e. on every successful connect.
        if mqtt_common.reason_code_value(reason_code) != 0:
            logger.error("broker connection failed: rc=%s", reason_code)
            return
        for topic in subscribed_topics:
            _c.subscribe(topic, qos=1)
            logger.info("subscribed to %s", topic)
    def on_disconnect(_c, _u, _flags, reason_code, _props):
        rc = mqtt_common.reason_code_value(reason_code)
        if rc != 0:
            logger.warning("broker disconnected (rc=%s); will retry reconnect", reason_code)
    client.on_connect = on_connect
    client.on_disconnect = on_disconnect
    client.reconnect_delay_set(min_delay=1, max_delay=16)
    # NOTE(review): with_retry is defined outside this view; it presumably
    # returns a retrying wrapper, which is invoked immediately here — confirm.
    mqtt_common.with_retry(
        lambda: client.connect(config.host, config.port, config.keepalive),
        attempts=5, base_delay=1.0, max_delay=16.0
    )()
    client.loop_start()
    terminal: Dict[str, str] = {}        # job_id -> "completed"/"error"
    pending: Set[str] = set(expected_ids)
    # Both timeouts run on the monotonic clock, starting before any event arrives.
    start = time.monotonic()
    wall_deadline = start + wall_timeout
    last_event = start
    # When --expect-retention is not set the retention warning is disabled up front.
    retention_checked = not args.expect_retention
    try:
        while pending:
            now = time.monotonic()
            if now >= wall_deadline:
                logger.error("wall-clock timeout (%.0fs); still pending: %s",
                             wall_timeout, ", ".join(sorted(pending)))
                return 2
            idle_left = idle_timeout - (now - last_event)
            if idle_left <= 0:
                logger.error("idle timeout (%.0fs, no events); still pending: %s",
                             idle_timeout, ", ".join(sorted(pending)))
                return 2
            # Wake at least once a second so both deadlines are re-checked.
            wait = min(wall_deadline - now, idle_left, 1.0)
            try:
                topic, payload = watcher.events.get(timeout=wait)
            except queue.Empty:
                # Warn once if nothing at all arrived in the first ~3 seconds.
                if not retention_checked and (now - start) > 3.0:
                    logger.warning("--expect-retention set but no retained "
                                   "terminal event observed yet")
                    retention_checked = True
                continue
            # Any accepted event resets the idle timer.
            last_event = time.monotonic()
            retention_checked = True
            print(_format_line(topic, payload), flush=True)
            jid = payload["job_id"]
            event = payload.get("event")
            if event in TERMINAL_EVENTS:
                if jid in terminal:
                    # Already finalised: ignore duplicates / late reorders.
                    logger.info("ignoring duplicate terminal %s for %s", event, jid)
                    continue
                terminal[jid] = event
                pending.discard(jid)
    finally:
        # Runs on every exit path, including the timeout returns above.
        client.loop_stop()
        try:
            client.disconnect()
        except Exception:  # pragma: no cover
            pass
    # All jobs reached a terminal state. error wins over completed.
    if any(state == "error" for state in terminal.values()):
        return 1
    return 0
 if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,616 @@
 """Shared MQTT + registry helpers for the multi-agent-mux-delegate-job skill.
 Single entry point for:
  - broker configuration (env -> dataclass),
  - paho client construction (auth + TLS + unique client id),
  - monotonic per-job sequence counters,
  - retry-with-exponential-backoff,
  - atomic registry record load/update under an fcntl lock.
 Requires paho-mqtt >= 2.0 (uses CallbackAPIVersion.VERSION2).
 This module is the *only* place that talks to the broker config and to the
 raw job record file, so PoC -> production migration touches just env/registry
 values, never code (see references/mqtt-broker-setup.md).
 """
 from __future__ import annotations
 import functools
 import hashlib
 import hmac
 import json
 import logging
 import os
 import tempfile
 import time
 import uuid
 from contextlib import contextmanager
 from dataclasses import asdict, dataclass
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, List, Optional
 import paho.mqtt.client as mqtt
 logger = logging.getLogger("delegate_job.mqtt_common")
 def _load_dotenv(workspace_dir: str = None) -> None:
    """Load .env file from workspace if it exists and env var not already set.
    This ensures Python scripts get the same env vars as the shell wrapper
    scripts that source .env. Only sets vars that are not already in os.environ
    (i.e. OS env takes precedence over .env file).
    """
    import os
    if workspace_dir is None:
        # Walk up from this script to find workspace root
        d = os.path.dirname(os.path.abspath(__file__))
        for _ in range(5):
            if os.path.isfile(os.path.join(d, ".env")):
                break
            d = os.path.dirname(d)
    else:
        d = workspace_dir
    env_path = os.path.join(d, ".env")
    if not os.path.isfile(env_path):
        return
    with open(env_path, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            if "=" in line:
                key, _, val = line.partition("=")
                key = key.strip()
                val = val.strip().strip('"').strip("'")
                if key and key not in os.environ:
                    os.environ[key] = val
 _load_dotenv()
 # --------------------------------------------------------------------------
 # Constants
 # --------------------------------------------------------------------------
 SCHEMA_VERSION = 1
 DEFAULT_REGISTRY_DIR = ".mam/jobs"
 DEFAULT_TOPIC_ROOT = "python/mqtt/jobs"
 LOCK_FILENAME = ".lock"
 # Persistent audit-log layout: .mam/delegate_job_logs/<job_id>/{meta,events,status}.
 # This is a *separate* artifact from the registry: the registry is the live job
 # record (mutated in place), the audit log is an append-only history that
 # survives even if the registry dir is cleaned up.
 META_FILENAME = "meta.json"
 EVENTS_FILENAME = "events.ndjson"
 STATUS_FILENAME = "status.json"
 def _default_logs_dir() -> str:
    """Audit-log root. Overridable with ``DELEGATE_JOB_LOGS_DIR``; otherwise
    ``<cwd>/.mam/delegate_job_logs`` — we keep audit logs next to the
    live registry (``.mam/jobs/``) so the two runtime artifacts sit
    under the same parent dir and follow the same ``.gitignore`` rule.
    The cwd of whichever process emits events (the bash wrapper and
    scripts) is used as the anchor."""
    env = os.environ.get("DELEGATE_JOB_LOGS_DIR")
    if env and env.strip():
        return env
    return os.path.join(os.getcwd(), ".mam", "delegate_job_logs")
 LOGS_DIR = _default_logs_dir()
 # --------------------------------------------------------------------------
 # Broker configuration
 # --------------------------------------------------------------------------
@dataclass
class BrokerConfig:
    """Connection settings for one MQTT broker.
    The defaults point at the public HiveMQ broker (PoC). Other values come from
    environment variables or from the ``broker`` block of a job record (see
    ``broker_config_from_job``).
    """
    # Where to connect.
    host: str = "broker.hivemq.com"
    port: int = 1883
    tls: bool = False
    # Optional credentials.
    username: Optional[str] = None
    password: Optional[str] = None
    # First component of the generated MQTT client id.
    client_id_prefix: str = "hermes"
    # TLS material (only consulted when tls is True).
    ca_certs: Optional[str] = None
    certfile: Optional[str] = None
    keyfile: Optional[str] = None
    # Keepalive value handed to the client's connect call.
    keepalive: int = 60
    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dict."""
        return asdict(self)
    def to_registry_block(self) -> Dict[str, Any]:
        """Return only the fields that are stored in a job record's broker block."""
        persisted = ("host", "port", "tls", "username", "password")
        return {field_name: getattr(self, field_name) for field_name in persisted}
 def _env_bool(name: str, default: bool = False) -> bool:
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() in ("1", "true", "yes", "on")
 def _env_int(name: str, default: int) -> int:
    raw = os.environ.get(name)
    if raw is None or raw.strip() == "":
        return default
    try:
        return int(raw)
    except ValueError:
        logger.warning("invalid int for %s=%r; using default %d", name, raw, default)
        return default
 def broker_config_from_env(overrides: Optional[Dict[str, Any]] = None) -> BrokerConfig:
    """Assemble a :class:`BrokerConfig` from the process environment.
    Environment variables consulted (all optional; default in parentheses):
      MQTT_BROKER (broker.hivemq.com), MQTT_PORT (1883), MQTT_TLS (0),
      MQTT_USERNAME, MQTT_PASSWORD, MQTT_CLIENT_ID_PREFIX (hermes),
      MQTT_CA_CERTS, MQTT_CERTFILE, MQTT_KEYFILE, MQTT_KEEPALIVE (60).
    Empty strings for the username, password and TLS path variables count as
    unset.
    Args:
        overrides: optional mapping (e.g. a job record's broker block). Every
            entry whose value is not ``None`` and whose key names an attribute
            of the config replaces the environment-derived value — note that
            ``False`` and ``0`` are not ``None`` and therefore do override.
    Returns:
        The resolved configuration.
    """
    getenv = os.environ.get
    cfg = BrokerConfig(
        host=getenv("MQTT_BROKER", "broker.hivemq.com"),
        port=_env_int("MQTT_PORT", 1883),
        tls=_env_bool("MQTT_TLS", False),
        username=getenv("MQTT_USERNAME") or None,
        password=getenv("MQTT_PASSWORD") or None,
        client_id_prefix=getenv("MQTT_CLIENT_ID_PREFIX", "hermes"),
        ca_certs=getenv("MQTT_CA_CERTS") or None,
        certfile=getenv("MQTT_CERTFILE") or None,
        keyfile=getenv("MQTT_KEYFILE") or None,
        keepalive=_env_int("MQTT_KEEPALIVE", 60),
    )
    if not overrides:
        return cfg
    for attr, replacement in overrides.items():
        if replacement is None or not hasattr(cfg, attr):
            continue
        setattr(cfg, attr, replacement)
    return cfg
 def broker_config_from_job(job: Dict[str, Any]) -> BrokerConfig:
    """Build the broker config for one job record.
    The environment supplies the baseline and the record's ``broker`` block is
    applied on top of it via ``broker_config_from_env``: every non-``None``
    value in the block replaces the environment-derived one, so environment
    variables are only effective for fields the record leaves ``null`` or omits.
    A missing or empty block means "environment only".
    """
    broker_block = job.get("broker")
    if not broker_block:
        broker_block = {}
    return broker_config_from_env(overrides=broker_block)
 def make_client(role: str, config: Optional[BrokerConfig] = None) -> mqtt.Client:
    """Create a paho ``Client`` that is configured but not yet connected.
    Args:
        role: short label embedded in the client id (e.g. ``"subscriber"``).
        config: broker settings; resolved from the environment when omitted.
    Returns:
        A client whose id is ``<client_id_prefix>-<role>-<8 hex chars>``; the
        random suffix keeps concurrent publishers / subscribers from colliding
        on the broker. Credentials are set when a username is configured and
        TLS is enabled when ``config.tls`` is true.
    """
    if not config:
        config = broker_config_from_env()
    suffix = uuid.uuid4().hex[:8]
    client_id = f"{config.client_id_prefix}-{role}-{suffix}"
    client = mqtt.Client(
        callback_api_version=mqtt.CallbackAPIVersion.VERSION2,
        client_id=client_id,
    )
    if config.username:
        client.username_pw_set(config.username, config.password)
    if config.tls:
        # With ca_certs=None paho falls back to the system trust store (fine for
        # public CAs); a private CA bundle path is handed over as given.
        tls_material = {
            "ca_certs": config.ca_certs,
            "certfile": config.certfile,
            "keyfile": config.keyfile,
        }
        client.tls_set(**tls_material)
    logger.debug("built client id=%s tls=%s host=%s", client_id, config.tls, config.host)
    return client
 def reason_code_value(rc: Any) -> int:
    """Convert a connect/disconnect reason code to a plain int.
    paho-mqtt 2.x passes callbacks a ``ReasonCode`` object, which carries the
    number in ``.value``; other code paths may pass an int directly. 0 means
    success.
    """
    numeric = rc.value if hasattr(rc, "value") else rc
    return int(numeric)
 def verify_hmac(payload: dict, auth_token: Optional[str]) -> bool:
    """Verify the HMAC-SHA256 signature carried in ``payload["data"]["hmac_sig"]``.
    The signed message is the payload without ``hmac_sig``, serialised as compact
    JSON with sorted keys, keyed with ``auth_token``.
    Args:
        payload: decoded event object (untrusted — it comes off the wire).
        auth_token: shared secret for the job; falsy disables the check.
    Returns:
        ``True`` when no token is configured or the signature matches; ``False``
        when the signature is missing, malformed or wrong. Never raises for a
        malformed ``data`` / ``hmac_sig`` value.
    """
    if not auth_token:
        return True  # PoC mode — no auth
    data = payload.get("data")
    # ``data`` may be null, a string, a list … in a hostile or buggy payload.
    if not isinstance(data, dict):
        return False
    sig = data.get("hmac_sig")
    if not sig or not isinstance(sig, str):
        return False
    sign_payload = {k: v for k, v in payload.items() if k != "data"}
    sign_payload["data"] = {k: v for k, v in data.items() if k != "hmac_sig"}
    msg = json.dumps(sign_payload, sort_keys=True, separators=(",", ":")).encode()
    expected = hmac.new(auth_token.encode(), msg, hashlib.sha256).hexdigest()
    # Compare as bytes: compare_digest raises TypeError for str arguments that
    # contain non-ASCII characters, and the signature is attacker-controlled.
    return hmac.compare_digest(sig.encode("utf-8", "replace"), expected.encode("ascii"))
 def topic_prefix_for(job_id: str, root: str = DEFAULT_TOPIC_ROOT) -> str:
    """Return the MQTT topic prefix of a job: ``<root>/<job_id>``."""
    return "{}/{}".format(root, job_id)
 def events_topic_for(job_id: str, root: str = DEFAULT_TOPIC_ROOT) -> str:
    """Return the events topic of a job: ``<root>/<job_id>/events``."""
    prefix = topic_prefix_for(job_id, root)
    return "{}/events".format(prefix)
 # --------------------------------------------------------------------------
 # Registry primitives (single source of truth for raw record I/O)
 # --------------------------------------------------------------------------
 def _job_path(job_id: str, registry_dir: str) -> Path:
    return Path(registry_dir) / f"{job_id}.json"
 def _lock_path(registry_dir: str) -> Path:
    """Return the path of the registry's lock file inside ``registry_dir``."""
    return Path(registry_dir).joinpath(LOCK_FILENAME)
@contextmanager
def registry_lock(registry_dir: str):
    """Hold an exclusive ``fcntl.flock`` on the registry's lock file.
    Context manager: the registry directory is created if needed, the lock file
    is opened and locked with ``LOCK_EX`` (blocking), and on exit the lock is
    released and the file closed. Read-modify-write operations on job records
    run inside this lock so that two sessions never claim the same pending job.
    This is PoC-grade, single-host concurrency control; for multi-host
    delegation move to SQLite WAL (see references/registry.md).
    """
    import fcntl  # POSIX only; imported lazily so import works on Windows.
    Path(registry_dir).mkdir(parents=True, exist_ok=True)
    with open(_lock_path(registry_dir), "a+") as handle:
        try:
            fcntl.flock(handle.fileno(), fcntl.LOCK_EX)
            yield
        finally:
            # Unlock first; leaving the ``with`` block then closes the file
            # even if the unlock call itself raises.
            fcntl.flock(handle.fileno(), fcntl.LOCK_UN)
 def load_job(job_id: str, registry_dir: str = DEFAULT_REGISTRY_DIR) -> Dict[str, Any]:
    """Read a job record from the registry and return it parsed from JSON.
    Raises:
        FileNotFoundError: when ``<registry_dir>/<job_id>.json`` does not exist.
    """
    record_path = _job_path(job_id, registry_dir)
    if record_path.exists():
        with open(record_path, "r", encoding="utf-8") as source:
            return json.load(source)
    raise FileNotFoundError(f"job record not found: {record_path}")
 def _atomic_write_record(job_id: str, registry_dir: str, record: Dict[str, Any]) -> None:
    """Persist ``record`` as ``<registry_dir>/<job_id>.json`` atomically.
    The JSON is written to a temporary file in the same directory, flushed and
    fsynced, and then moved over the target with ``os.replace``. The rename is
    atomic on POSIX, so a reader sees either the old or the new complete file.
    The target is then chmod'ed to ``0o600`` on a best-effort basis. If anything
    fails the temporary file is removed and the exception re-raised.
    Callers MUST already hold ``registry_lock`` for read-modify-write
    correctness.
    """
    Path(registry_dir).mkdir(parents=True, exist_ok=True)
    target = _job_path(job_id, registry_dir)
    handle, scratch = tempfile.mkstemp(
        suffix=".tmp", prefix=f".{job_id}.", dir=str(target.parent)
    )
    try:
        with os.fdopen(handle, "w", encoding="utf-8") as out:
            out.write(json.dumps(record, ensure_ascii=False, indent=2) + "\n")
            out.flush()
            os.fsync(out.fileno())
        os.replace(scratch, target)
        try:
            os.chmod(target, 0o600)
        except Exception:
            # Tightening permissions is best-effort only.
            pass
    except BaseException:
        # Covers KeyboardInterrupt too: never leave a stray temp file behind.
        if os.path.exists(scratch):
            os.unlink(scratch)
        raise
 def update_job_status(job_id: str, registry_dir: str = DEFAULT_REGISTRY_DIR, **fields: Any) -> Dict[str, Any]:
    """Merge ``fields`` into a job record and write it back, under the registry lock.
    ``updated_at`` is always refreshed. When ``status`` is among ``fields`` the
    change is also mirrored into the persistent audit log while the lock is
    still held, which keeps the audit history in the same order as the registry
    writes: the logged status is updated, and a ``status_changed`` event is
    appended if the status value actually differs from the previous one.
    Args:
        job_id: id of the job to update.
        registry_dir: registry directory.
        **fields: record keys to set.
    Returns:
        The updated record.
    Raises:
        FileNotFoundError: when the job record does not exist.
    """
    with registry_lock(registry_dir):
        record = load_job(job_id, registry_dir)
        previous = record.get("status")
        record.update(fields)
        stamp = _utcnow()
        record["updated_at"] = stamp
        _atomic_write_record(job_id, registry_dir, record)
        if "status" not in fields:
            return record
        current = record.get("status")
        update_logged_status(job_id, current, updated_at=stamp)
        if previous != current:
            transition = {
                "event": "status_changed",
                "from": previous,
                "to": current,
                "timestamp": stamp,
            }
            append_event(job_id, transition)
    return record
 def next_seq(job_id: str, registry_dir: str = DEFAULT_REGISTRY_DIR) -> int:
    """Allocate the next sequence number of a job.
    The counter lives in the record's ``last_seq`` field, so it keeps increasing
    across process restarts. The read, increment and write happen under the
    registry lock; ``updated_at`` is refreshed as well. A record without
    ``last_seq`` starts at 0, so the first call returns 1.
    """
    with registry_lock(registry_dir):
        record = load_job(job_id, registry_dir)
        allocated = 1 + int(record.get("last_seq", 0))
        record.update(last_seq=allocated, updated_at=_utcnow())
        _atomic_write_record(job_id, registry_dir, record)
    return allocated
 def _utcnow() -> str:
    """ISO-8601 UTC timestamp with trailing Z (payload `timestamp` field)."""
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
 def _utcnow_precise() -> str:
    """ISO-8601 UTC timestamp with millisecond resolution. Used for the audit
    log's ``logged_at`` so events sort cleanly even within the same second."""
    now = time.time()
    base = time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime(now))
    return f"{base}.{int((now % 1) * 1000):03d}Z"
 # --------------------------------------------------------------------------
 # Persistent audit log (.mam/delegate_job_logs/<job_id>/...)
 #
 # Every function here is idempotent, concurrency-safe, and *best-effort*: a
 # logging failure is swallowed with a logger.warning and never propagated, so it
 # can never break a publish, a subscribe, or a registry write. stdout is never
 # touched (it is reserved for data output).
 # --------------------------------------------------------------------------
 def job_log_dir(job_id: str, logs_dir: Optional[str] = None) -> Path:
    """Return the audit-log directory of a job: ``<logs root>/<job_id>``.
    The root is ``logs_dir`` when given (and non-empty), else the module-level
    ``LOGS_DIR``.
    """
    root = logs_dir if logs_dir else LOGS_DIR
    return Path(root).joinpath(job_id)
 def job_log_path(job_id: str, kind: str, logs_dir: Optional[str] = None) -> Path:
    """Return the path of one audit-log file inside a job's log directory.
    ``kind`` is the file name, e.g. one of the module constants META_FILENAME,
    EVENTS_FILENAME or STATUS_FILENAME.
    """
    directory = job_log_dir(job_id, logs_dir)
    return directory.joinpath(kind)
@contextmanager
 def _file_lock(fh):
    """Best-effort exclusive lock over a single open file via fcntl, so two
    processes appending to events.ndjson never interleave a line. A no-op where
    fcntl is unavailable (Windows); a short append is atomic enough there."""
    try:
        import fcntl
    except ImportError:  # pragma: no cover - non-POSIX
        yield
        return
    fcntl.flock(fh.fileno(), fcntl.LOCK_EX)
    try:
        yield
    finally:
        fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
 def _redact_dict(d: Any) -> Any:
    """Recursively mask sensitive values (passwords, secrets, tokens) inside logs."""
    if isinstance(d, dict):
        redacted = {}
        for k, v in d.items():
            if any(s in k.lower() for s in ("password", "token", "secret", "auth_token", "key")):
                redacted[k] = "[REDACTED]"
            else:
                redacted[k] = _redact_dict(v)
        return redacted
    elif isinstance(d, list):
        return [_redact_dict(item) for item in d]
    return d
 def append_event(job_id: str, event_dict: Dict[str, Any], logs_dir: Optional[str] = None) -> None:
    """Write ``event_dict`` as one JSON line at the end of
    ``<logs>/<job_id>/events.ndjson``.

    The event is copied and passed through ``_redact_dict`` before it is
    serialised, and gets a millisecond ``logged_at`` stamp unless the caller
    already provided one. The write happens under ``_file_lock``. Best-effort:
    any exception is reported with ``logger.warning`` and swallowed.
    """
    try:
        target = job_log_path(job_id, EVENTS_FILENAME, logs_dir)
        target.parent.mkdir(parents=True, exist_ok=True)
        entry = _redact_dict(dict(event_dict))
        stamp = _utcnow_precise()
        if "logged_at" not in entry:
            entry["logged_at"] = stamp
        serialized = json.dumps(entry, ensure_ascii=False) + "\n"
        with open(target, "a", encoding="utf-8") as fh, _file_lock(fh):
            fh.write(serialized)
            # Flush while the lock is still held so the line reaches the file
            # before another writer can take the lock.
            fh.flush()
    except Exception as exc:  # pragma: no cover - best effort
        logger.warning("append_event failed for job %s: %s", job_id, exc)
 def update_logged_status(job_id: str, status: str, logs_dir: Optional[str] = None, **extras: Any) -> None:
    """Atomically rewrite ``<logs>/<job_id>/status.json``, the current-status
    file used for fast point queries.

    The record holds ``job_id``, ``status`` and a fresh ``updated_at``; any
    ``extras`` are merged on top and may override those fields. The JSON is
    written to a temporary sibling file and moved into place with
    ``os.replace``.

    The temporary name includes the process id and thread id. A single shared
    ``status.json.tmp`` would let two concurrent writers truncate each other's
    half-written file. If writing or replacing fails, the temporary file is
    removed rather than left behind.

    Best-effort: any exception is reported with ``logger.warning`` and
    swallowed.
    """
    import threading  # local import: only needed to build a unique temp name

    try:
        path = job_log_path(job_id, STATUS_FILENAME, logs_dir)
        path.parent.mkdir(parents=True, exist_ok=True)
        record: Dict[str, Any] = {"job_id": job_id, "status": status, "updated_at": _utcnow()}
        record.update(extras)
        tmp = path.with_name(f"{path.name}.{os.getpid()}.{threading.get_ident()}.tmp")
        try:
            with open(tmp, "w", encoding="utf-8") as fh:
                json.dump(record, fh, ensure_ascii=False, indent=2)
                fh.write("\n")
            os.replace(tmp, path)
        except BaseException:
            # Do not leave a stale temp file next to status.json.
            try:
                os.unlink(tmp)
            except OSError:
                pass
            raise
    except Exception as exc:  # pragma: no cover - best effort
        logger.warning("update_logged_status failed for job %s: %s", job_id, exc)
 def init_job_log(job_id: str, meta: Dict[str, Any], logs_dir: Optional[str] = None) -> None:
    """Create and seed the audit-log directory of a job.

    Each call (re)writes meta.json with a redacted copy of ``meta`` and
    refreshes status.json from ``meta`` (status defaults to ``"pending"``).
    The initial ``registered`` event is appended only when events.ndjson did
    not exist before the call, so repeated calls do not duplicate it.
    Best-effort: any exception is reported with ``logger.warning`` and
    swallowed.
    """
    try:
        log_dir = job_log_dir(job_id, logs_dir)
        log_dir.mkdir(parents=True, exist_ok=True)

        safe_meta = _redact_dict(meta)
        with open(log_dir / META_FILENAME, "w", encoding="utf-8") as fh:
            json.dump(safe_meta, fh, ensure_ascii=False, indent=2)
            fh.write("\n")

        initial_status = meta.get("status", "pending")
        update_logged_status(
            job_id,
            initial_status,
            logs_dir=logs_dir,
            created_at=meta.get("created_at"),
            prompt=meta.get("prompt"),
        )

        events_file = log_dir / EVENTS_FILENAME
        already_seeded = events_file.exists()
        events_file.touch(exist_ok=True)
        if not already_seeded:
            registered_event = {
                "event": "registered",
                "status": initial_status,
                "agent": meta.get("agent"),
                "agent_session": meta.get("agent_session"),
                "topic_prefix": meta.get("topic_prefix"),
                "timestamp": meta.get("created_at"),
            }
            append_event(job_id, registered_event, logs_dir=logs_dir)
    except Exception as exc:  # pragma: no cover - best effort
        logger.warning("init_job_log failed for job %s: %s", job_id, exc)
 def read_logged_meta(job_id: str, logs_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Load the registration snapshot stored in a job's meta.json.

    Returns the parsed JSON, or None when the file cannot be opened or read
    (``OSError``) or does not contain valid JSON.
    """
    try:
        meta_file = job_log_path(job_id, META_FILENAME, logs_dir)
        with open(meta_file, "r", encoding="utf-8") as fh:
            raw = fh.read()
        return json.loads(raw)
    except (OSError, json.JSONDecodeError):
        return None
 def read_logged_status(job_id: str, logs_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Load a job's status.json, the fast point-query file that holds only the
    current status (as opposed to the registration-time meta.json).

    Returns the parsed JSON, or None when the file cannot be opened or read
    (``OSError``) or does not contain valid JSON.
    """
    try:
        status_file = job_log_path(job_id, STATUS_FILENAME, logs_dir)
        with open(status_file, "r", encoding="utf-8") as fh:
            raw = fh.read()
        return json.loads(raw)
    except (OSError, json.JSONDecodeError):
        return None
 def iter_logged_events(job_id: str, logs_dir: Optional[str] = None):
    """Generate the parsed events of a job's events.ndjson, in file order.

    Yields nothing when the file does not exist. Blank lines are ignored; a
    line that is not valid JSON is skipped after a ``logger.warning``.
    """
    events_file = job_log_path(job_id, EVENTS_FILENAME, logs_dir)
    if events_file.exists():
        with open(events_file, "r", encoding="utf-8") as fh:
            for raw in fh:
                text = raw.strip()
                if text:
                    try:
                        yield json.loads(text)
                    except json.JSONDecodeError:
                        logger.warning("skipping malformed audit line in %s", events_file)
 def list_logged_jobs(logs_dir: Optional[str] = None) -> List[Dict[str, Any]]:
    """Summarise every job directory under the audit-log root.

    Each summary starts from the job's meta.json, or ``{"job_id": <dir name>}``
    when that file is missing or unreadable. When status.json is available, its
    ``status`` and ``updated_at`` replace the registration-time values. The
    result is ordered by ``created_at``, oldest first, with missing values
    sorting as the empty string. A missing logs root gives an empty list.
    """
    root = Path(logs_dir or LOGS_DIR)
    if not root.exists():
        return []

    summaries: List[Dict[str, Any]] = []
    for entry in sorted(root.iterdir()):
        if entry.is_dir():
            summary = read_logged_meta(entry.name, logs_dir) or {"job_id": entry.name}
            live = read_logged_status(entry.name, logs_dir)
            if live:
                # Prefer the live status file over the frozen registration
                # snapshot, falling back to the snapshot per field.
                merged = {**summary}
                merged["status"] = live.get("status", summary.get("status"))
                merged["updated_at"] = live.get("updated_at", summary.get("updated_at"))
                summary = merged
            summaries.append(summary)
    return sorted(summaries, key=lambda item: item.get("created_at") or "")
 # --------------------------------------------------------------------------
 # Retry helper
 # --------------------------------------------------------------------------
 def with_retry(
    fn: Optional[Callable] = None,
    *,
    attempts: int = 3,
    base_delay: float = 0.5,
    factor: float = 2.0,
    max_delay: float = 8.0,
    exceptions: Iterable[type] = (Exception,),
 ) -> Callable:
    """Retry ``fn`` with exponential backoff.

    Usable two ways::

        result = with_retry(do_publish, attempts=3)()      # wrap-and-call

        @with_retry(attempts=5, base_delay=1.0)            # decorator
        def do_publish(): ...

    Parameters:
        fn: the callable to wrap, or None to get a decorator back.
        attempts: maximum number of calls. Values below 1 are treated as 1, so
            the wrapped function always runs at least once.
        base_delay: seconds to sleep before the second attempt.
        factor: multiplier applied to the delay after every failed attempt.
        max_delay: upper bound for the delay between attempts.
        exceptions: exception types that trigger a retry. Anything else
            propagates immediately.

    Returns the wrapped callable (or a decorator when ``fn`` is None). The
    wrapper returns the first successful result and re-raises the last
    exception once the attempts are exhausted.
    """
    exc_tuple = tuple(exceptions)
    # Without the clamp, attempts <= 0 would skip the loop entirely and fail on
    # an internal assertion without ever calling the function.
    total_attempts = max(1, attempts)

    def decorate(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            delay = base_delay
            for attempt in range(1, total_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except exc_tuple as exc:
                    if attempt >= total_attempts:
                        # Out of attempts: surface the final failure unchanged.
                        raise
                    logger.warning(
                        "attempt %d/%d failed: %s; retrying in %.1fs",
                        attempt, total_attempts, exc, delay,
                    )
                    time.sleep(delay)
                    delay = min(delay * factor, max_delay)
            # Defensive only: the loop above always returns or raises.
            raise RuntimeError("with_retry: retry loop exited unexpectedly")
        return wrapper

    if fn is not None:
        return decorate(fn)
    return decorate
 def setup_logging(level: int = logging.WARNING) -> None:
    """Point root logging at stderr with the given ``level``.

    stdout is left alone because it is reserved for data output (subscriber
    event lines, registry ids).
    """
    from sys import stderr

    line_format = "%(asctime)s %(levelname)s %(name)s: %(message)s"
    logging.basicConfig(format=line_format, stream=stderr, level=level)
@@ -0,0 +1,229 @@
 #!/usr/bin/env python3
 """publish_event.py — the single entry point for emitting a Job event.
 Loads the job record from the registry, resolves its broker, assigns the next
 monotonic ``seq``, builds the schema-v1 JSON payload, and publishes it to
 ``<topic_prefix>/events`` over QoS 1 with exponential-backoff retry.
 Silent by design: nothing is printed to stdout. Diagnostics go to stderr via
 logging. Terminal events (``completed``/``error``) publish with retain=True so
 a late subscriber still observes the final state (production hardening).
 Exit codes:
  0  published successfully
  1  parameter / registry error (bad args, unknown job, no pending job)
  2  publish failed after retries (network / broker / ACK timeout)
 Usage:
  publish_event.py --job <id> --event started [--detail "..."] [--data '{...}']
  publish_event.py --pick-pending --agent-session tmux:claude --event completed
  publish_event.py --job <id> --event completed --retained
 """
 from __future__ import annotations
 import argparse
 import hashlib
 import hmac
 import json
 import logging
 import sys
 import time
 from typing import Any, Dict, Optional
 import mqtt_common
 import registry
 from mqtt_common import (
    DEFAULT_REGISTRY_DIR,
    SCHEMA_VERSION,
    broker_config_from_job,
    events_topic_for,
    load_job,
    make_client,
    next_seq,
    with_retry,
 )
 # Module logger; diagnostics go through logging, never stdout.
 logger = logging.getLogger("delegate_job.publish_event")
 # Event names accepted by the --event option.
 VALID_EVENTS = ("started", "permission_required", "progress", "completed", "error")
 # Events that end a job; main() publishes these with retain=True.
 TERMINAL_EVENTS = ("completed", "error")
 # event -> registry status to sync as a best-effort side effect
 # (events not listed here leave the registry status untouched)
 EVENT_TO_STATUS = {
    "started": "running",
    "completed": "completed",
    "error": "error",
 }
 CONNECT_ACK_TIMEOUT = 10  # seconds to wait for CONNACK
 PUBLISH_ACK_TIMEOUT = 5   # seconds to wait for QoS-1 PUBACK
 def build_payload(
    job_id: str,
    seq: int,
    event: str,
    detail: str,
    data: Optional[Dict[str, Any]],
    auth_token: Optional[str],
 ) -> Dict[str, Any]:
    """Assemble the JSON-serialisable event payload for one job event.

    ``data`` is shallow-copied (an empty dict when missing), so the caller's
    dict is never modified. When ``auth_token`` is set, an HMAC-SHA256
    signature is stored in ``data["hmac_sig"]``. The signature covers the
    whole payload, serialised as compact JSON with sorted keys, with any
    ``hmac_sig`` entry left out of ``data``. This lets a subscriber verify the
    publisher without the token itself travelling in the message.
    """
    event_data: Dict[str, Any] = dict(data) if data else {}
    payload: Dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "seq": seq,
        "job_id": job_id,
        "event": event,
        "timestamp": mqtt_common._utcnow(),
        "detail": detail,
        "data": event_data,
    }
    if not auth_token:
        return payload

    # Sign a copy in which `data` carries everything except the signature.
    unsigned_data = {name: value for name, value in event_data.items() if name != "hmac_sig"}
    signable = {**payload, "data": unsigned_data}
    canonical = json.dumps(signable, sort_keys=True, separators=(",", ":")).encode()
    digest = hmac.new(auth_token.encode(), canonical, hashlib.sha256).hexdigest()
    # event_data is the same object as payload["data"].
    event_data["hmac_sig"] = digest
    return payload
 def _publish_once(config, topic: str, body: bytes, retain: bool) -> None:
    """Run one complete publish cycle: connect, wait for CONNACK, publish
    ``body`` to ``topic`` at QoS 1, wait for the acknowledgement, disconnect.

    Raises ``TimeoutError`` when no CONNACK arrives within
    ``CONNECT_ACK_TIMEOUT`` or the publish is not acknowledged within
    ``PUBLISH_ACK_TIMEOUT``, and ``ConnectionError`` when the broker refuses
    the connection. Every failure is raised so that ``with_retry`` can repeat
    the whole cycle on a fresh connection.
    """
    client = make_client("publisher", config)
    connack = {"rc": None}

    def _record_connack(_client, _userdata, _flags, reason_code, _props):
        connack["rc"] = reason_code

    client.on_connect = _record_connack
    client.connect(config.host, config.port, config.keepalive)
    client.loop_start()
    try:
        # Poll for CONNACK so auth/TLS errors fail fast instead of hanging.
        give_up_at = time.monotonic() + CONNECT_ACK_TIMEOUT
        while True:
            if connack["rc"] is not None or time.monotonic() >= give_up_at:
                break
            time.sleep(0.05)
        if connack["rc"] is None:
            raise TimeoutError("no CONNACK from broker")
        if mqtt_common.reason_code_value(connack["rc"]) != 0:
            raise ConnectionError(f"broker refused connection: rc={connack['rc']}")

        delivery = client.publish(topic, payload=body, qos=1, retain=retain)
        delivery.wait_for_publish(timeout=PUBLISH_ACK_TIMEOUT)
        if not delivery.is_published():
            raise TimeoutError("publish not acknowledged within timeout")
    finally:
        client.loop_stop()
        try:
            client.disconnect()
        except Exception:  # pragma: no cover - disconnect best effort
            pass
 def _resolve_job_id(args) -> Optional[str]:
    if args.pick_pending:
        return registry.pick_pending(args.agent_session, args.registry_dir)
    return args.job
 def main(argv=None) -> int:
    """CLI entry point: publish one job event and return the exit code.

    Steps: parse arguments, resolve the job (``--job`` or ``--pick-pending``),
    load its record, take the next ``seq``, build the payload and publish it
    to ``<topic_prefix>/events`` with retry. After a successful publish, the
    payload is recorded in the audit log and the registry event log, and the
    registry status is synced for events listed in ``EVENT_TO_STATUS``. These
    follow-up steps never fail the publish.

    Returns:
        0  published successfully
        1  parameter / registry error (bad ``--data``, ``--attempts`` < 1,
           unknown job, no pending job)
        2  publish failed after retries
    """
    parser = argparse.ArgumentParser(description="Publish a Job event to MQTT")
    target = parser.add_mutually_exclusive_group(required=True)
    target.add_argument("--job", help="job id to publish for")
    target.add_argument("--pick-pending", action="store_true",
                        help="auto-select a pending job for --agent-session")
    parser.add_argument("--agent-session", default="tmux:claude",
                        help="session label used with --pick-pending")
    parser.add_argument("--event", default="progress", choices=VALID_EVENTS)
    parser.add_argument("--detail", default="")
    parser.add_argument("--data", default=None, help="optional JSON object string")
    parser.add_argument("--retained", action="store_true",
                        help="force retain=True (auto for completed/error)")
    parser.add_argument("--registry-dir", default=DEFAULT_REGISTRY_DIR)
    parser.add_argument("--attempts", type=int, default=3)
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args(argv)
    mqtt_common.setup_logging(logging.DEBUG if args.verbose else logging.WARNING)

    # --- validate parameters before touching the registry (exit 1) ---
    # Checked first because --pick-pending claims a job as a side effect.
    if args.attempts < 1:
        logger.error("invalid --attempts: %d (must be >= 1)", args.attempts)
        return 1

    # --- parse optional data JSON (parameter error -> exit 1) ---
    data: Optional[Dict[str, Any]] = None
    if args.data:
        try:
            data = json.loads(args.data)
            if not isinstance(data, dict):
                raise ValueError("--data must be a JSON object")
        except (ValueError, json.JSONDecodeError) as exc:
            logger.error("invalid --data: %s", exc)
            return 1

    job_id = _resolve_job_id(args)
    if not job_id:
        logger.error("no job to publish for (unknown --job or no pending job)")
        return 1
    try:
        job = load_job(job_id, args.registry_dir)
    except FileNotFoundError as exc:
        logger.error("%s", exc)
        return 1

    config = broker_config_from_job(job)
    topic = job.get("topic_prefix")
    topic = f"{topic}/events" if topic else events_topic_for(job_id)
    seq = next_seq(job_id, args.registry_dir)
    payload = build_payload(
        job_id=job_id,
        seq=seq,
        event=args.event,
        detail=args.detail,
        data=data,
        auth_token=job.get("auth_token"),
    )
    body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    retain = args.retained or args.event in TERMINAL_EVENTS

    # RuntimeError is retried too: paho's wait_for_publish() raises it when the
    # message could not be published for a reason other than a full queue
    # (e.g. the client is not connected), which is a transient failure.
    publish = with_retry(
        _publish_once,
        attempts=args.attempts,
        exceptions=(OSError, TimeoutError, ConnectionError, ValueError, RuntimeError),
    )
    try:
        publish(config, topic, body, retain)
    except Exception as exc:
        logger.error("publish failed after %d attempts: %s", args.attempts, exc)
        return 2

    # Persistent audit log: record the payload we put on the wire so the
    # publish can be reconstructed from the log. Best-effort (isolated inside
    # append_event) — never fails the publish.
    mqtt_common.append_event(job_id, {
        "event": "published",
        "source_event": args.event,
        "seq": seq,
        "topic": topic,
        "retain": retain,
        "timestamp": payload["timestamp"],
        "detail": args.detail,
        "payload": payload,
    })

    # Best-effort side effects: registry status sync + (debug) event log. Never
    # fail the publish on these.
    registry.append_event(job_id, args.registry_dir, payload)
    new_status = EVENT_TO_STATUS.get(args.event)
    if new_status:
        try:
            mqtt_common.update_job_status(job_id, args.registry_dir, status=new_status)
        except Exception as exc:  # pragma: no cover - best effort
            logger.warning("status sync failed: %s", exc)
    logger.info("published %s seq=%d job=%s retain=%s", args.event, seq, job_id, retain)
    return 0
 # Script entry point: run main() and use its return value as the exit code.
 if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,334 @@
 """Job registry for the multi-agent-mux-delegate-job skill.
 A job record is the single source of truth for one delegated unit of work:
 its id, prompt, owning agent session, broker connection, timeouts, and status.
 Records live as ``<registry_dir>/<job_id>.json`` with an append-only event log
 ``<registry_dir>/<job_id>.events.log`` and a shared ``<registry_dir>/.lock``.
 Concurrency is handled via the fcntl lock in :mod:`mqtt_common` (PoC). For
 multi-host delegation, migrate to SQLite WAL — see references/registry.md.
 Importable as a library and runnable as a CLI (``register``/``list``/``get``/
 ``status``/``pick``/``logs``) so the ``multi-agent-mux-delegate-job`` bash wrapper can shell out.
 """
 from __future__ import annotations
 import argparse
 import json
 import logging
 import sys
 import uuid
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 import mqtt_common
 from mqtt_common import (
    DEFAULT_REGISTRY_DIR,
    SCHEMA_VERSION,
    _atomic_write_record,
    _utcnow,
    broker_config_from_env,
    load_job,
    registry_lock,
    topic_prefix_for,
 )
 # Module logger for registry operations.
 logger = logging.getLogger("delegate_job.registry")
 # Statuses that mark a job as finished.
 # NOTE(review): not referenced in the visible part of this module — confirm users.
 TERMINAL_STATUSES = ("completed", "error", "cancelled")
 # Every status update_status() accepts; anything else raises ValueError.
 VALID_STATUSES = ("pending", "running", "completed", "error", "cancelled")
 def generate_job_id(bits: int = 32) -> str:
    """Random hexadecimal job id taken from a fresh uuid4.

    ``bits`` selects the length: one hex character per 4 bits, at least one
    character, and the full 32-character uuid4 hex for 128 bits or more.
    PoC default: 32 bits (8 characters); production: 128 bits.
    """
    full_hex = uuid.uuid4().hex
    if bits >= 128:
        return full_hex
    keep = bits // 4
    if keep < 1:
        keep = 1
    return full_hex[:keep]
 def register_job(
    prompt: str,
    agent: str = "claude-code",
    agent_session: str = "tmux:claude",
    broker: Optional[Dict[str, Any]] = None,
    timeout_sec: int = 3600,
    idle_timeout_sec: int = 120,
    registry_dir: str = DEFAULT_REGISTRY_DIR,
    job_id: Optional[str] = None,
    expected_artifacts: Optional[List[str]] = None,
    bits: int = 32,
    auth_token: Optional[str] = None,
 ) -> str:
    """Write a new ``pending`` job record to the registry and return its id.

    Defaults filled in here:
      * ``job_id``: a random id of ``bits`` bits when not supplied.
      * ``broker``: the broker block resolved from the current environment, so
        the record alone is enough for ``publish_event.py`` to connect later.
      * ``auth_token``: generated with ``secrets.token_urlsafe(32)`` when none
        is given and the broker block has a truthy ``tls`` or ``username``.

    Raises ``FileExistsError`` when a record for ``job_id`` already exists.
    After the record is written the persistent audit log is seeded through
    ``mqtt_common.init_job_log``.
    """
    job_id = job_id if job_id else generate_job_id(bits)
    if broker is None:
        broker = broker_config_from_env().to_registry_block()

    # Only secure broker setups (TLS or username auth) get an automatic token.
    if auth_token is None and (broker.get("tls") or broker.get("username")):
        import secrets
        auth_token = secrets.token_urlsafe(32)

    created = _utcnow()
    record: Dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "job_id": job_id,
        "status": "pending",
        "created_at": created,
        "updated_at": created,
        "prompt": prompt,
        "agent": agent,
        "agent_session": agent_session,
        "broker": broker,
        "topic_prefix": topic_prefix_for(job_id),
        "timeout_sec": int(timeout_sec),
        "idle_timeout_sec": int(idle_timeout_sec),
        "expected_artifacts": expected_artifacts or [],
        "last_seq": 0,
        "auth_token": auth_token,
    }

    # Existence check and write share one lock so two registrations of the
    # same id cannot both succeed.
    with registry_lock(registry_dir):
        record_path = mqtt_common._job_path(job_id, registry_dir)
        if record_path.exists():
            raise FileExistsError(f"job already exists: {job_id}")
        _atomic_write_record(job_id, registry_dir, record)

    # Seed meta.json + status.json + the "registered" event. init_job_log is
    # best-effort and never blocks registration.
    mqtt_common.init_job_log(job_id, meta=record)
    logger.info("registered job %s (agent=%s session=%s)", job_id, agent, agent_session)
    return job_id
 def pick_pending(agent_session: str, registry_dir: str = DEFAULT_REGISTRY_DIR) -> Optional[str]:
    """Claim the oldest ``pending`` job that belongs to ``agent_session``.

    Selection and the flip to ``running`` happen under the registry lock, so
    two sessions cannot claim the same job. Returns the claimed job id, or
    None when no pending job matches. The pending->running transition is then
    mirrored into the audit log (best-effort).
    """
    with registry_lock(registry_dir):
        pending = [
            rec for rec in _iter_records(registry_dir)
            if rec.get("status") == "pending" and rec.get("agent_session") == agent_session
        ]
        if not pending:
            return None
        # min() keeps the first of equally old records, as a stable sort would.
        chosen = min(pending, key=lambda rec: rec.get("created_at", ""))
        chosen["status"] = "running"
        chosen["updated_at"] = _utcnow()
        job_id = chosen["job_id"]
        _atomic_write_record(job_id, registry_dir, chosen)
        logger.info("session %s picked job %s", agent_session, job_id)
        updated_at = chosen["updated_at"]

    # The record was written directly (not via update_job_status), so the
    # status change is recorded in the audit log here.
    mqtt_common.update_logged_status(job_id, "running", updated_at=updated_at)
    transition = {
        "event": "status_changed",
        "from": "pending",
        "to": "running",
        "by": agent_session,
        "timestamp": updated_at,
    }
    mqtt_common.append_event(job_id, transition)
    return job_id
 def update_status(job_id: str, registry_dir: str, status: str) -> Dict[str, Any]:
    """Set the status of a job after validating it against ``VALID_STATUSES``.

    Raises ``ValueError`` for an unknown status; otherwise delegates to
    ``mqtt_common.update_job_status`` and returns its result.
    """
    if status in VALID_STATUSES:
        return mqtt_common.update_job_status(job_id, registry_dir, status=status)
    raise ValueError(f"invalid status {status!r}; expected one of {VALID_STATUSES}")
 def list_jobs(registry_dir: str = DEFAULT_REGISTRY_DIR, status: Optional[str] = None) -> List[Dict[str, Any]]:
    """Return the job records in ``registry_dir`` ordered by ``created_at``
    (oldest first). A non-empty ``status`` keeps only records with exactly
    that status."""
    selected = [
        rec for rec in _iter_records(registry_dir)
        if not status or rec.get("status") == status
    ]
    return sorted(selected, key=lambda rec: rec.get("created_at", ""))
 def append_event(job_id: str, registry_dir: str, payload: Dict[str, Any]) -> None:
    """Append ``payload`` as one JSON line to
    ``<registry_dir>/<job_id>.events.log``, creating the directory if needed.

    Debug aid only. An ``OSError`` is logged as a warning and not raised.
    """
    try:
        base = Path(registry_dir)
        base.mkdir(parents=True, exist_ok=True)
        with open(base / f"{job_id}.events.log", "a", encoding="utf-8") as fh:
            serialized = json.dumps(payload, ensure_ascii=False)
            fh.write(serialized + "\n")
    except OSError as exc:  # pragma: no cover - best effort
        logger.warning("could not append event for %s: %s", job_id, exc)
 # Public API of this module. `load_job` is imported from mqtt_common and
 # re-exported here as a convenience so callers can `from registry import load_job`.
 __all__ = [
    "register_job", "pick_pending", "update_status", "load_job",
    "list_jobs", "append_event", "generate_job_id",
 ]
 def _iter_records(registry_dir: str):
    base = Path(registry_dir)
    if not base.exists():
        return
    for path in sorted(base.glob("*.json")):
        try:
            with open(path, "r", encoding="utf-8") as fh:
                yield json.load(fh)
        except (OSError, json.JSONDecodeError) as exc:
            logger.warning("skipping unreadable record %s: %s", path, exc)
 # --------------------------------------------------------------------------
 # CLI (so the bash wrapper can shell out without inline python)
 # --------------------------------------------------------------------------
 def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser of the registry CLI.

    A global ``--registry-dir`` option is followed by one required subcommand:
    ``register``, ``list``, ``get``, ``status``, ``pick`` or ``logs``.
    """
    root = argparse.ArgumentParser(description="multi-agent-mux-delegate-job registry CLI")
    root.add_argument("--registry-dir", default=DEFAULT_REGISTRY_DIR)
    commands = root.add_subparsers(dest="command", required=True)

    # register: create a job record
    register_cmd = commands.add_parser("register", help="create a pending job; prints the job id")
    register_options = (
        (("--prompt",), {"required": True}),
        (("--agent",), {"default": "claude-code"}),
        (("--agent-session",), {"default": "tmux:claude"}),
        (("--timeout",), {"type": int, "default": 3600}),
        (("--idle-timeout",), {"type": int, "default": 120}),
        (("--bits",), {"type": int, "default": 32, "help": "32 (PoC) or 128 (prod)"}),
        (("--artifact",), {"action": "append", "default": [], "dest": "artifacts"}),
        (("--auth-token",), {
            "default": None,
            "help": "HMAC auth token for the job (auto-generated if secure broker is detected)",
        }),
    )
    for flags, options in register_options:
        register_cmd.add_argument(*flags, **options)

    # list: show job records
    list_cmd = commands.add_parser("list", help="list jobs (optionally by status)")
    list_cmd.add_argument("--status", default=None)
    list_cmd.add_argument("--json", action="store_true")

    # get: dump one record
    get_cmd = commands.add_parser("get", help="print one job record as JSON")
    get_cmd.add_argument("--job", required=True)

    # status: change a job's status
    status_cmd = commands.add_parser("status", help="set a job status")
    status_cmd.add_argument("--job", required=True)
    status_cmd.add_argument("--set", required=True, dest="status")

    # pick: claim pending work for a session
    pick_cmd = commands.add_parser("pick", help="claim a pending job for a session; prints id")
    pick_cmd.add_argument("--agent-session", default="tmux:claude")

    # logs: inspect the persistent audit log
    logs_cmd = commands.add_parser(
        "logs",
        help="show the persistent audit log for a job, or --list every logged job",
    )
    logs_cmd.add_argument("job_id", nargs="?", default=None,
                          help="job id whose events.ndjson to print")
    logs_cmd.add_argument("--list", action="store_true", dest="list_all",
                          help="summarise every job under the logs dir instead")
    logs_cmd.add_argument("--logs-dir", default=None,
                          help="override the audit-log root (default: $DELEGATE_JOB_LOGS_DIR "
                               "or <cwd>/.mam/delegate_job_logs)")
    logs_cmd.add_argument("--tail", type=int, default=0,
                          help="show only the last N events (0 = all)")
    logs_cmd.add_argument("--json", action="store_true",
                          help="emit raw JSON lines / records instead of a table")
    return root
 def main(argv: Optional[List[str]] = None) -> int:
    """Run the registry CLI and return its exit code.

    Exit codes: 0 on success; 1 when ``get``/``status`` fail (unknown job,
    invalid status) or the command is not recognised; 3 when ``pick`` finds no
    pending job for the session. ``logs`` returns whatever ``_cmd_logs``
    returns. Results are printed to stdout, error messages to stderr.
    """
    mqtt_common.setup_logging(logging.INFO)
    args = _build_parser().parse_args(argv)
    registry_dir = args.registry_dir
    command = args.command

    if command == "register":
        new_id = register_job(
            prompt=args.prompt,
            agent=args.agent,
            agent_session=args.agent_session,
            timeout_sec=args.timeout,
            idle_timeout_sec=args.idle_timeout,
            registry_dir=registry_dir,
            expected_artifacts=args.artifacts,
            bits=args.bits,
            auth_token=args.auth_token,
        )
        print(new_id)
        return 0

    if command == "list":
        jobs = list_jobs(registry_dir, status=args.status)
        if args.json:
            print(json.dumps(jobs, ensure_ascii=False, indent=2))
            return 0
        if not jobs:
            print("(no jobs)")
        for job in jobs:
            print(f"{job['job_id']}  {job.get('status','?'):10s}  {job.get('agent_session','')}"
                  f"  {job.get('prompt','')[:48]}")
        return 0

    if command == "get":
        try:
            print(json.dumps(load_job(args.job, registry_dir), ensure_ascii=False, indent=2))
        except FileNotFoundError as exc:
            print(str(exc), file=sys.stderr)
            return 1
        return 0

    if command == "status":
        try:
            update_status(args.job, registry_dir, args.status)
        except (FileNotFoundError, ValueError) as exc:
            print(str(exc), file=sys.stderr)
            return 1
        return 0

    if command == "pick":
        claimed = pick_pending(args.agent_session, registry_dir)
        if claimed is None:
            return 3  # no pending job for this session
        print(claimed)
        return 0

    if command == "logs":
        return _cmd_logs(args)

    return 1
 def _cmd_logs(args) -> int:
    """Handle the ``logs`` subcommand.

    With ``--list`` it prints one summary per logged job. Otherwise it prints
    the events of ``args.job_id``, optionally limited to the last ``--tail``
    entries. ``--json`` switches both modes to raw JSON output.

    Returns 0 on success and 1 when no job id was given or the job has no
    audit log.
    """
    logs_dir = args.logs_dir or mqtt_common.LOGS_DIR

    if args.list_all:
        summaries = mqtt_common.list_logged_jobs(logs_dir)
        if args.json:
            print(json.dumps(summaries, ensure_ascii=False, indent=2))
        elif not summaries:
            print(f"(no logged jobs under {logs_dir})")
        else:
            for item in summaries:
                print(f"{item.get('job_id','?')}  {item.get('status','?'):10s}  "
                      f"{item.get('created_at','-'):20s}  {(item.get('prompt') or '')[:48]}")
        return 0

    if not args.job_id:
        print("logs requires a <job_id> or --list", file=sys.stderr)
        return 1

    events = list(mqtt_common.iter_logged_events(args.job_id, logs_dir))
    # An existing but empty log is fine; only a missing directory is an error.
    if not events and not mqtt_common.job_log_dir(args.job_id, logs_dir).exists():
        print(f"no audit log for job {args.job_id} under {logs_dir}", file=sys.stderr)
        return 1

    if args.tail and args.tail > 0:
        events = events[-args.tail:]

    if args.json:
        for entry in events:
            print(json.dumps(entry, ensure_ascii=False))
        return 0

    for entry in events:
        when = entry.get("logged_at") or entry.get("timestamp") or "-"
        summary = entry.get("detail") or entry.get("to") or entry.get("source_event") or ""
        print(f"{when:24s}  {entry.get('event','?'):<16s}  {summary}")
    return 0
 # Script entry point: run the registry CLI and exit with its return code.
 if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,236 @@
 ---
 name: multi-agent-mux-monitor
 description: "Run a long-lived Kanban worker that polls .mam/agent-sessions.yaml against the actual tmux/agent runtime state and reconciles them. Use when you want live visibility into which agent sessions are running, which are dead, which have stale YAML entries, and which have new session ids that haven't been recorded yet. Designed to be dispatched as a Kanban goal_mode task (--goal) so it keeps running until the user stops it."
 version: 1.0.0
 author: godopu
 license: MIT
 platforms: [linux, macos]
 environments: [kanban, terminal, tmux]
 metadata:
  hermes:
    tags: [agent, tmux, claude, antigravity, agy, monitor, kanban, observation, reconciliation]
    related_skills: [multi-agent-mux-create, multi-agent-mux-resume, multi-agent-mux-stop, kanban-orchestrator]
    prereq_skills: [kanban-worker, multi-agent-mux-create]
 ---
 # Agent Sessions Monitor — Live Reconciliation via Kanban Worker
 > **Companion skills**: `multi-agent-mux-create` / `multi-agent-mux-resume` / `multi-agent-mux-stop` (mutators); this skill is the **observer**.
 > **Single source of truth**: `./.mam/agent-sessions.yaml`.
 ## What this skill does
 Dispatch a **Kanban worker** (in `goal_mode`) that:
 1. Every ~30s polls the actual state of:
   - `tmux ls` (which sessions are alive)
   - `tmux list-panes -t <session> ...` (pane cmd, cwd, pid)
   - `~/.claude/projects/<workspace-key>/*.jsonl` mtime + first-line sessionId
   - `~/.gemini/antigravity-cli/cache/last_conversations.json` (agy workspace → conversation mapping)
   - `~/.gemini/antigravity-cli/conversations/<uuid>.db` mtime (agy)
 2. Compares the live state to `agent-sessions.yaml`
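 3. Leaves deliberate end states alone and detects 4 classes of drift (A–D, detailed under "Drift classes" below):
   - **yaml-only terminated/archived/stopped** (not a drift): tmux dead, YAML says `terminated`, `archived`, or `stopped` → OK, left untouched (deliberate end states)
   - **A — yaml-only running, tmux dead**: YAML says `running`, tmux is gone → mark `terminated` with timestamp
   - **B — tmux-only running, not in YAML**: a tmux session named `*-creator-claude` or `*-creator-agy` exists but YAML doesn't know about it → register as a new entry
   - **C — new session id**: YAML's `claude_session_id_own` / `agy_conversation_id_own` is still null but the id has materialized on disk → record it in the YAML
   - **D — stale UUID**: YAML has a UUID, but the on-disk artifact is gone → flag in comment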
 4. Writes a Kanban `kanban_comment` on every drift event with diff details
 5. Heartbeat every 5 minutes
 6. **Goal loop**: judge (auxiliary model) re-checks the card after each turn against the body to decide "is monitoring still wanted?". When the user says "stop monitoring" via comment, the worker blocks with `reason=stop-requested`.
 ## When to use
 - You have multiple workspaces with tmux agent sessions and want a single source of truth
 - You suspect YAML drift after a host reboot / crash
 - You want a notification when a session id was just created (so you can record it before next restart)
 - You're running multi-day work and want to know "what's actually running right now"
 ## When NOT to use
 - One-off interactive session — just check `tmux ls` and read the YAML
 - A single, short session — overhead > benefit
 - You don't have a Kanban dispatcher running
 ## Dispatching the monitor
 ```bash
 # Goal-mode task: keeps running until the user signals stop
 hermes kanban create \
  --title "agent-sessions monitor (live reconcile)" \
  --assignee default \
  --workspace worktree \
  --branch wt/multi-agent-mux-monitor \
  --goal \
  --goal-max-turns 100 \
  --max-runtime 8h \
  --max-retries 1 \
  --skill multi-agent-mux-monitor \
  --body "$(cat <<'EOF'
 You are the agent-sessions monitor. Every 30 seconds, do:
 1. Read .mam/agent-sessions.yaml
 2. Run `tmux ls` and `tmux list-panes -F 'session=#{session_name} pid=#{pane_pid} cmd=#{pane_current_command} cwd=#{pane_current_path}'`
 3. For each session in the YAML, check the corresponding tmux state
 4. For each tmux session matching `*-creator-claude` or `*-creator-agy` that's not in the YAML, register it
 5. For any drift, call `kanban_comment` with the diff
 6. Sleep 30 seconds, then repeat
 If the user comments `stop` or `stop monitoring` on this card, call `kanban_block(reason="stop-requested by user")`.
 If you find that a Claude session's `claude_session_id_own` is null but there's a new *.jsonl in the project dir, read the sessionId from the first line and update the YAML.
 Use the helper script at .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh for the YAML updates — it handles all the merge logic and writes a structured comment to this card.
 EOF
 )"
 ```
 ## Helper script: `reconcile.sh`
 The worker calls this script every 30s. It:
 1. Diffs YAML ↔ tmux ↔ disk artifacts
 2. Updates YAML if needed (only when changes are real, not on every poll — avoids spamming)
 3. Emits a JSON diff to stdout that the worker turns into a `kanban_comment`
 ```bash
 # Reconcile + auto-update YAML (atomic, flock-guarded). Emits JSON drift to stdout.
 bash .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh --once --emit-diff
 # Read-only: compute drift WITHOUT writing the YAML (use for "what's running?" checks).
 bash .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh --once --emit-diff --dry-run
 # Push-based MQTT Monitor: listen to delegated job events on the broker and update the YAML instantly.
 # Bounded run that exits after 5 min idle, or 1 h wall-clock; falls back to polling if the broker is down.
 bash .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh --subscribe --idle-timeout 300 --timeout 3600
 # Persistent monitor (no timeouts): runs until interrupted; still polls if the broker is unreachable.
 bash .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh --subscribe --idle-timeout 0
 ```
 Flags: `--once` (single pass), `--emit-diff` (print JSON), `--dry-run` (P1-E — no mutation), `--subscribe` (push-based MQTT subscription monitoring). `--subscribe` sub-flags: `--timeout N` (exit after N seconds of wall-clock; `0` = no limit, default), `--idle-timeout N` (exit after N seconds with no message; default `3600`, `0` = never idle-out). On a broker connection failure (connect error **or** non-zero CONNACK), `--subscribe` falls back to a polling loop that re-runs `--once --emit-diff` every `RECONCILE_POLL_INTERVAL` (default 15) seconds until `--timeout`. Terminal-event YAML updates are written through `lib.sh::atomic_dump_yaml` (flock + schema-validate + `.bak`). There are **no** `--workspace` / `--agent` / `--comment-card` flags; the worker turns the emitted JSON `drifts[]` into `kanban_comment` calls itself.
 ## Drift classes (what the script handles)
 ### Status Enum
 The `status` and `last_visible_status` fields MUST be one of the following exact strings: `running`, `stopped`, `terminated`, `archived`.
 Any unstructured comments or reasons for the status change should be placed in `last_visible_note` or `termination_mode`.
 ### A. tmux dead, YAML says running → auto-terminate
 ```
 YAML:  status=running, pane.pid=201132, cmd=claude
 tmux:  no session
       → set status=terminated, terminated_at=<now>, termination_mode=auto-detected
       → comment: "lab-landing-page-creator-claude: tmux gone (was pane 201132, cmd claude). Marked terminated."
 ```
 **Skip-set**: the auto-terminate only fires for sessions whose status is `running`.
 Rows already in a deliberate end state — `terminated`, `archived`, or **`stopped`**
 (set by `multi-agent-mux-stop`) — are
 left untouched. This is critical: a `stopped` row keeps its `resumable: true` and
 captured `*_session_id_own`, so the monitor must **not** overwrite it with
 `terminated ("auto-detected")` when its tmux is (expectedly) gone.
 ### B. tmux alive, not in YAML → auto-register
 ```
 tmux:  session=lab-paper-pdf2md-creator-agy, pid=...,
       cmd=agy, cwd=$WORKSPACE_ROOT/paper-pdf2md
 YAML:  no such session
       → register as new entry: status=running, last_visible_status=running, last_visible_note=auto-registered
       → comment: "lab-paper-pdf2md-creator-agy: tmux found but not in YAML. Auto-registered."
 ```
 ### C. New session id materializes (claude first message sent)
 ```
 YAML:  claude_session_id_own=null (placeholder)
 disk:  ~/.claude/projects/.../b3a7...c2f.jsonl exists, mtime=now,
       first line sessionId=b3a7...c2f
       → update claude_session_id_own=b3a7...c2f
       → comment: "lab-landing-page-creator-claude: session id materialized b3a7...c2f"
 ```
 ### D. Stale UUID (artifact gone)
 ```
 YAML:  agent_identities.claude.session_id=87dc548e-...
 disk:  ~/.claude/projects/.../87dc548e-...jsonl: missing
       → flag in comment, but DO NOT delete from YAML
       (the user may have moved the file or the disk may be temporarily unavailable;
        only `--purge-conversation` should remove the id)
 ```
 ## Pitfalls
 - **Don't run the monitor without `--goal`** — without goal mode, a single turn will spawn, do one reconcile, and complete. Goal mode keeps the worker alive across many turns.
 - **The 30s poll is a default** — workers may override if they detect heavy churn. A workspace with 5+ agent sessions should bump to 60s to avoid noise.
 - **`kanban_comment` rate limits** — Kanban may throttle if you comment too fast. Coalesce: only comment when the diff is *new* (not the same drift on every poll). The script tracks a state file at `.cache/multi-agent-mux-monitor/<workspace>.state` in the workspace root for this (overridable via `AGENT_SESSIONS_STATE_DIR`).
 - **Don't fight the user's explicit action** — if `multi-agent-mux-stop` is mid-flight and the monitor sees the same session in two states within 5s, prefer the user's most recent action. The monitor should not auto-revert a fresh `terminated` to `running` because of a stale `tmux has-session` check.
 - **The monitor should never modify the conversation artifacts** (jsonl, db) — only the YAML. If you see a stale UUID, comment about it but don't delete the file.
 - **TUI capture-pane is expensive** — only capture when you need to update `last_visible_status`, not every poll.
 ## Worker body template (for `hermes kanban create --body`)
 The `--body` of the dispatched task IS the worker's behavior spec. Here's a tested template:
 ```markdown
 # agent-sessions monitor
 ## Loop (every 30s)
 1. Read agent-sessions.yaml
 2. Bash: `bash .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh --once --emit-diff`
 3. Parse the JSON diff from stdout
 4. If `drifts` is non-empty:
   - For each drift, call `kanban_comment` with the diff message
 5. Bash: `sleep 30`
 6. Heartbeat every 5 min: `kanban_heartbeat(progress="alive, N drifts detected, last at <time>")`
 ## Stop condition
 If `$HERMES_KANBAN_TASK` card has any comment containing "stop" or "stop monitoring" from a user:
 - Call `kanban_block(reason="stop-requested by user at <timestamp>")`
 ## Drift responses
 - A. tmux dead + YAML running: auto-terminate YAML, comment
 - B. tmux alive not in YAML: auto-register, comment
 - C. New session id from *.jsonl: update YAML, comment
 - D. Stale UUID: comment only, no YAML change
 ## Hard rules
 - Do NOT modify conversation artifacts (jsonl, db, brain/)
 - Do NOT spawn/delete tmux sessions — that's the create/stop skills' job
 - Do NOT call multi-agent-mux-create or multi-agent-mux-stop — only the user initiates those
 - Do NOT call `git commit` / `git push`
 ```
 ## Security: --subscribe on Public Brokers
 When using `--subscribe` with the default PoC public broker
 (`broker.hivemq.com:1883`), be aware that:
 1. **Wildcard subscription** (`python/mqtt/jobs/+/events`) on a public broker means the monitor receives events that *anyone* can publish to your job topics.
 2. **Auto-kill on terminal events** means a spoofed `completed` or `error`
   event from a third party can terminate your agent session.
 3. **Mitigation**: Use `--subscribe` only on private TLS-enabled brokers
   (production mode). For PoC, prefer polling-based monitor (`--once` or
   no `--subscribe`) which reads YAML/tmux state directly without MQTT.
 4. **HMAC verification**: Events are now verified via `verify_hmac()` in
   `mqtt_common.py` (see FW-05). Ensure `auth_token` is set for each job
   to enable signature validation — unauthenticated events will be dropped.
 ## Verification (one-shot)
 ```bash
 # Run reconcile once and inspect output
 bash .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh --emit-diff --once \
  | python3 -m json.tool
 ```
 ## Related skills
 - `kanban-worker` — base lifecycle for the dispatched worker
 - `kanban-orchestrator` — if you want to dispatch this monitor *from* an orchestrator, use this to know how to phrase the body
@@ -0,0 +1,542 @@
 #!/usr/bin/env bash
 # reconcile.sh — helper script for the multi-agent-mux-monitor skill.
 # Detects drift between the YAML, tmux, and on-disk artifacts (and auto-updates the YAML).
 #
 # Usage:
 #   bash reconcile.sh --once --emit-diff            # detect drift + update
 #   bash reconcile.sh --once --emit-diff --dry-run  # compute drift only, no write (P1-E)
 #   bash reconcile.sh --subscribe [--timeout N] [--idle-timeout N]   # MQTT push mode
 #
 # --dry-run: compute drift without writing the YAML. Safe for answering
 #            "what is running right now?". Reused by the multi-agent-mux-status skill.
 #
 # Output (JSON): {timestamp, yaml_path, tmux_sessions_alive, tmux_confirmed, drifts, actions}
 #
 # Exit codes: 0 = ok | 1 = YAML not found | 2 = error
 set -euo pipefail
 # lib.sh lives two directories above this script's directory.
 source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
 # State directory; AGENT_SESSIONS_STATE_DIR overrides the workspace-relative default.
 STATE_DIR="${AGENT_SESSIONS_STATE_DIR:-$WORKSPACE_ROOT/.cache/multi-agent-mux-monitor}"
 ONCE=0
 EMIT_DIFF=0
 DRY_RUN=0
 SUBSCRIBE=0
 # --subscribe controls (review item 4): 0 = no overall timeout; idle default 3600s
 # (raised from 600s to align with job timeout defaults); idle 0 = never idle-out.
 SUB_TIMEOUT=0
 SUB_IDLE_TIMEOUT=3600
 POLL_INTERVAL="${RECONCILE_POLL_INTERVAL:-15}"
 #######################################
 # Validate the value of a seconds-taking flag; exits 2 on a bad value.
 # Arguments: $1 - flag name (for the message)
 #            $2 - number of arguments left, including the flag itself
 #            $3 - candidate value (may be absent)
 #######################################
 _require_seconds() {
   local flag=$1 remaining=$2 value=${3-}
   if [ "$remaining" -lt 2 ]; then
     echo "ERROR: $flag requires a value (seconds)" >&2
     exit 2
   fi
   # The value is later fed to Python int() and to `[ -ge ]`, so it must be
   # a plain non-negative integer.
   case "$value" in
     ''|*[!0-9]*)
       echo "ERROR: $flag expects a non-negative integer, got: $value" >&2
       exit 2
       ;;
   esac
 }
 #######################################
 # Parse command-line flags into the globals declared above.
 # Globals:   ONCE, EMIT_DIFF, DRY_RUN, SUBSCRIBE, SUB_TIMEOUT, SUB_IDLE_TIMEOUT (written)
 # Arguments: the script's positional parameters
 # Returns:   exits 0 after printing usage for -h/--help, exits 2 on a bad argument
 #######################################
 _parse_args() {
   while [ $# -gt 0 ]; do
     case "$1" in
       --once) ONCE=1; shift ;;
       --emit-diff) EMIT_DIFF=1; shift ;;
       --dry-run) DRY_RUN=1; shift ;;
       --subscribe) SUBSCRIBE=1; shift ;;
       --timeout) _require_seconds "$1" "$#" "${2-}"; SUB_TIMEOUT="$2"; shift 2 ;;
       --idle-timeout) _require_seconds "$1" "$#" "${2-}"; SUB_IDLE_TIMEOUT="$2"; shift 2 ;;
       -h|--help) echo "Usage: $0 [--once] [--emit-diff] [--dry-run] [--subscribe [--timeout N] [--idle-timeout N]]"; exit 0 ;;
       *) echo "ERROR: unknown arg: $1" >&2; exit 2 ;;
     esac
   done
 }
 _parse_args "$@"
 # Refuse to run without the sessions YAML (exit 1).
 [ -f "$AGENT_SESSIONS_YAML" ] || { echo "ERROR: $AGENT_SESSIONS_YAML not found" >&2; exit 1; }
 # --subscribe: run the push-based MQTT monitor (embedded Python fed on stdin).
 if [ "$SUBSCRIBE" = "1" ]; then
  # Paths resolved relative to this script (review item 6): skills/ dir + lib.sh.
  SKILLS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
  LIB_SH="$SKILLS_DIR/lib.sh"
  # MQTT client lives in the project venv (has paho). All YAML work is delegated
  # to lib.sh::atomic_dump_yaml, which runs the system python3 (has PyYAML) — so
  # no single interpreter needs both paho and PyYAML (review items 4/5/6).
  PYBIN="$(_delegate_py_bin)"
  # The MQTT subscribe loop exits 3 to signal "broker unavailable → poll instead".
  # errexit is switched off so that status can be captured after the heredoc.
  set +e
  YAML_PATH="$AGENT_SESSIONS_YAML" HOME_DIR="$HOME_DIR" CLAUDE_PROJECT_DIR="$CLAUDE_PROJECT_DIR" LOCAL_BIN="$LOCAL_BIN" \
    WORKSPACE_ROOT="$WORKSPACE_ROOT" SUB_TIMEOUT="$SUB_TIMEOUT" SUB_IDLE_TIMEOUT="$SUB_IDLE_TIMEOUT" \
    SKILLS_DIR="$SKILLS_DIR" LIB_SH="$LIB_SH" \
    "$PYBIN" - <<'PYEOF'
 import os, sys, json, time, subprocess
 # Configuration is handed over through environment variables set by the
 # enclosing shell script.
 lib_sh = os.environ.get('LIB_SH', '')
 skills_dir = os.environ.get('SKILLS_DIR', '')
 yaml_path = os.environ.get('YAML_PATH', '')
 workspace_root = os.environ.get('WORKSPACE_ROOT', '')
 timeout = int(os.environ.get('SUB_TIMEOUT', '0') or '0')          # 0 = no overall timeout
 idle_timeout = int(os.environ.get('SUB_IDLE_TIMEOUT', '3600') or '0')  # 0 = no idle timeout
 # Prevent duplicate wildcard subscribers for this workspace (concurrency race)
 import fcntl
 lock_file_path = os.path.join(workspace_root or '.', '.mam', 'monitor.lock')
 try:
     os.makedirs(os.path.dirname(lock_file_path), exist_ok=True)
     # NOTE(review): the handle is kept in a module-level name, presumably so the
     # flock stays held for the life of the process — confirm.
     lock_file = open(lock_file_path, 'w')
     # Non-blocking exclusive lock: raises BlockingIOError when already held.
     fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB)
 except BlockingIOError:
     # Another subscriber holds the lock: not treated as an error (exit 0).
     print("MQTT Monitor: another subscriber is already running for this workspace. Exiting.", flush=True)
     sys.exit(0)
 except Exception as e:
     # Any other failure while creating or locking the file exits with status 1.
     print(f"MQTT Monitor: failed to acquire monitor lock ({e}). Exiting.", flush=True)
     sys.exit(1)
 # Make the delegate-job scripts directory importable so mqtt_common / registry
 # can be loaded. Preference order: the directory under SKILLS_DIR, then the
 # first match found while walking upward from the current working directory.
 # No hardcoded absolute path (review item 6).
 _DELEGATE_REL = ('multi-agent-mux-delegate-job', 'scripts')
 _primary = os.path.join(skills_dir, *_DELEGATE_REL) if skills_dir else ''
 if _primary and os.path.isdir(_primary):
     sys.path.append(_primary)
 else:
     # Candidate layouts tried, in this order, at every directory level.
     _layouts = (('.agents', 'skills') + _DELEGATE_REL, ('skills',) + _DELEGATE_REL, _DELEGATE_REL)
     _here = os.getcwd()
     while _here and _here != '/':
         _found = next(
             (c for c in (os.path.join(_here, *lay) for lay in _layouts) if os.path.isdir(c)),
             None,
         )
         if _found:
             sys.path.append(_found)
             break
         _here = os.path.dirname(_here)
 import mqtt_common
 import registry
 # Executed INSIDE lib.sh::atomic_dump_yaml (system python3 + PyYAML), under the
 # YAML flock with schema-validate + .bak (review item 5). Marks matching running
 # sessions terminated and kills their tmux (review item 3 behaviour preserved),
 # or aborts the write entirely when nothing matches. The untrusted MQTT job id /
 # event arrive via env (MQTT_JID / MQTT_EVENT) — never spliced into source (P1-B).
 # The snippet expects the loaded document to be bound to the name `d`.
 _MUTATION = r'''
 import os, subprocess
 from datetime import datetime, timezone
 _jid = os.environ['MQTT_JID']
 _event = os.environ['MQTT_EVENT']
 _now = datetime.now(timezone.utc)
 _changed = False
 for s in d.get('tmux_sessions', []):
     if s.get('delegate_job_id') == _jid and s.get('status') == 'running':
         s['status'] = 'terminated'
         s['terminated_at'] = _now.strftime('%Y-%m-%dT%H:%M:%SZ')
         s['terminated_at_epoch'] = int(_now.timestamp())
         s['termination_mode'] = 'auto-detected (MQTT ' + _event + ')'
         _name = s.get('name')
         _srv = s.get('tmux_server') or 'default'
         if _name:
             # '=' makes tmux accept only an exact session-name match. A bare name
             # is also tried as a prefix / fnmatch pattern, which could kill a
             # different session once the intended one is already gone.
             _cmd = ['tmux'] + (['-L', _srv] if _srv != 'default' else []) + ['kill-session', '-t', '=' + str(_name)]
             subprocess.run(_cmd, capture_output=True)
             print('MQTT Monitor: terminated + killed ' + str(_name) + ' on ' + str(_srv), flush=True)
         else:
             # No session name recorded: nothing to kill, but the row is still marked.
             print('MQTT Monitor: terminated unnamed session entry (no tmux kill) on ' + str(_srv), flush=True)
         _changed = True
 if not _changed:
     raise SystemExit(0)  # nothing matched — skip the write entirely
 '''
 def handle_terminal(jid, event):
     """Apply the terminal-event mutation to the YAML via lib.sh::atomic_dump_yaml.
     The job id and event name travel to the child process through the
     MQTT_JID / MQTT_EVENT environment variables; the mutation source is fed
     on stdin. The child's stdout is echoed, and its stderr is echoed only
     when it exited non-zero.
     """
     have_lib = bool(lib_sh) and os.path.isfile(lib_sh)
     if not have_lib:
         print('MQTT Monitor: lib.sh not found, cannot update YAML', flush=True)
         return
     child_env = {**os.environ, 'MQTT_JID': jid, 'MQTT_EVENT': event}
     script = 'source "$LIB_SH"; atomic_dump_yaml "$YAML_PATH" MQTT_JID="$MQTT_JID" MQTT_EVENT="$MQTT_EVENT"'
     proc = subprocess.run(['bash', '-c', script], input=_MUTATION, text=True,
                           env=child_env, capture_output=True)
     out_text = (proc.stdout or '').strip()
     if out_text:
         print(out_text, flush=True)
     err_text = (proc.stderr or '').strip()
     if proc.returncode != 0 and err_text:
         print('MQTT Monitor: atomic_dump_yaml stderr: ' + err_text, flush=True)
 # Shared subscriber state: time of the last received message plus the outcome
 # of the connection attempt (set by on_connect).
 state = {'last_msg': time.time(), 'connected': False, 'failed': False}
 # Highest accepted sequence number per job id (in-memory only).
 last_seqs = {}
 def on_message(_client, _userdata, msg):
     """Validate one MQTT job event, record it in the audit log, act on terminal events.
     Checks, in order: the payload carries string job_id/event fields, the job
     exists in the local registry, the HMAC verifies against the job's
     auth_token, and seq is an integer greater than the last accepted one for
     that job. Accepted events are appended via mqtt_common.append_event;
     "completed" / "error" additionally trigger handle_terminal. Any exception
     is reported on stdout and swallowed so the subscriber loop keeps running.
     """
     state['last_msg'] = time.time()
     try:
         payload = json.loads(msg.payload.decode("utf-8"))
         jid = payload.get("job_id")
         event = payload.get("event")
         if not jid or not event:
             return
         # Both values are attacker-controlled at this point and are used as a
         # registry lookup key and, later, as environment variable values, so
         # anything that is not a plain string is rejected up front.
         if not isinstance(jid, str) or not isinstance(event, str):
             print("MQTT Monitor: drop event: job_id and event must be strings", flush=True)
             return
         if workspace_root:
             registry_dir = os.path.join(workspace_root, '.mam', 'jobs')
         else:
             yaml_dir = os.path.dirname(yaml_path) if yaml_path else ""
             registry_dir = os.path.join(yaml_dir, 'jobs') if yaml_dir else '.mam/jobs'
         try:
             job = registry.load_job(jid, registry_dir)
         except FileNotFoundError:
             # Silently ignore events for jobs not in the local registry
             return
         expected_token = job.get("auth_token")
         if not mqtt_common.verify_hmac(payload, expected_token):
             print(f"MQTT Monitor: drop event for job {jid}: HMAC verify failed", flush=True)
             return
         seq = payload.get("seq")
         # bool is a subclass of int, so JSON true/false must be excluded explicitly.
         if not isinstance(seq, int) or isinstance(seq, bool):
             print(f"MQTT Monitor: drop event for job {jid}: missing or invalid seq", flush=True)
             return
         if seq <= last_seqs.get(jid, 0):
             print(f"MQTT Monitor: drop event for job {jid}: seq {seq} not monotonic (last {last_seqs.get(jid, 0)})", flush=True)
             return
         last_seqs[jid] = seq
         # Append the event to events.ndjson audit trail
         mqtt_common.append_event(jid, {
             "event": "received",
             "source_event": event,
             "seq": seq,
             "topic": msg.topic,
             "timestamp": payload.get("timestamp"),
             "detail": payload.get("detail", ""),
         })
         print(f"MQTT Monitor: recorded event {event} for job {jid} (seq={seq})", flush=True)
         if event in ("completed", "error"):
             print(f"MQTT Monitor: received terminal event {event} for job {jid}", flush=True)
             handle_terminal(jid, event)
     except Exception as e:
         print(f"MQTT Monitor error parsing message: {e}", flush=True)
 def on_connect(_c, _u, _flags, reason_code, _props):
     """Connection callback: subscribe on success, flag the failure otherwise.
     The numeric reason code is obtained through mqtt_common.reason_code_value.
     A zero code marks state['connected'] and subscribes to the job event
     wildcard topic at QoS 1; any other code marks state['failed'].
     """
     code = mqtt_common.reason_code_value(reason_code)
     if code != 0:
         state['failed'] = True
         print(f"MQTT Monitor connection failed: rc={code}", flush=True)
         return
     state['connected'] = True
     _c.subscribe("python/mqtt/jobs/+/events", qos=1)
     print("MQTT Monitor: subscribed to python/mqtt/jobs/+/events", flush=True)
 # Broker settings and the client object both come from mqtt_common helpers.
 cfg = mqtt_common.broker_config_from_env()
 client = mqtt_common.make_client("monitor_sub", cfg)
 client.on_message = on_message
 client.on_connect = on_connect
 print(f"MQTT Monitor: connecting to {cfg.host}:{cfg.port} (TLS={cfg.tls})...", flush=True)
 # Connection failure → fall back to polling (review item 4).
 # Exit status 3 is the signal the enclosing shell script checks for.
 try:
     client.connect(cfg.host, cfg.port, cfg.keepalive)
 except Exception as e:
     print(f"MQTT Monitor: connect failed ({e}); falling back to polling", flush=True)
     sys.exit(3)
 client.loop_start()
 # Wait up to 5 seconds for on_connect to report success or failure.
 _wait = time.time()
 while time.time() - _wait < 5 and not state['connected'] and not state['failed']:
     time.sleep(0.1)
 if not state['connected']:
     # Covers both an explicit failure and no answer within the wait window.
     print("MQTT Monitor: broker did not accept connection; falling back to polling", flush=True)
     client.loop_stop()
     sys.exit(3)
 start = time.time()
 try:
     # Idle in 0.5 s steps until the overall or the idle timeout expires;
     # a timeout value of 0 disables the corresponding check.
     while True:
         now = time.time()
         if timeout and (now - start) >= timeout:
             print(f"MQTT Monitor: --timeout {timeout}s reached, exiting", flush=True)
             break
         if idle_timeout and (now - state['last_msg']) >= idle_timeout:
             print(f"MQTT Monitor: --idle-timeout {idle_timeout}s reached, exiting", flush=True)
             break
         time.sleep(0.5)
 finally:
     client.loop_stop()
     # Best-effort disconnect: errors while disconnecting are ignored.
     try:
         client.disconnect()
     except Exception:
         pass
 sys.exit(0)
 PYEOF
  # Exit status of the embedded subscriber: 0 = clean exit, 3 = broker
  # unavailable (fall back to polling), anything else = failure.
  sub_rc=$?
  set -e
  if [ "$sub_rc" = "3" ]; then
    echo "MQTT Monitor: broker unavailable — falling back to polling (interval ${POLL_INTERVAL}s)" >&2
    _self="$SKILLS_DIR/multi-agent-mux-monitor/scripts/reconcile.sh"
    _start=$(date +%s)
    while :; do
      # Best-effort pass: output is discarded and a failed pass must not end the loop.
      bash "$_self" --once --emit-diff >/dev/null 2>&1 || true
      # SUB_TIMEOUT=0 means poll until interrupted.
      if [ "$SUB_TIMEOUT" != "0" ] && [ "$(( $(date +%s) - _start ))" -ge "$SUB_TIMEOUT" ]; then
        break
      fi
      sleep "$POLL_INTERVAL"
    done
    exit 0
  fi
  # Do not report success when the subscriber itself failed (lock error, import
  # error, uncaught exception): surface it as the documented "2 = error".
  if [ "$sub_rc" != "0" ]; then
    echo "ERROR: MQTT subscriber exited with status $sub_rc" >&2
    exit 2
  fi
  exit 0
 fi
 #######################################
 # Create the monitor state directory unless this is a dry run.
 # --dry-run is meant to be read-only, so it must not create directories.
 # Globals:   DRY_RUN (read), STATE_DIR (read)
 # Returns:   0 when nothing had to be done, otherwise mkdir's status
 #######################################
 _ensure_state_dir() {
   if [ "${DRY_RUN:-0}" = "1" ] || [ -z "${STATE_DIR:-}" ]; then
     return 0
   fi
   mkdir -p "$STATE_DIR"
 }
 _ensure_state_dir
 # All comparison logic lives in one source. A dry run executes it through
 # env_python (read-only); otherwise the same source is executed through
 # atomic_dump_yaml (flock + temp+rename). Under the atomic wrapper, the source
 # raises SystemExit(0) when there are no 'actions', which skips the write
 # (avoids needless reformatting).
 # `read -d ''` returns non-zero once it hits end of input, hence `|| true`.
 read -r -d '' RECON_SRC <<'PYEOF' || true
 import os, json, glob, subprocess, time
 from datetime import datetime, timezone
 import yaml
 # Inputs arrive through the environment.
 yaml_path = os.environ['YAML_PATH']
 home = os.environ['HOME_DIR']
 claude_project_dir = os.environ.get('CLAUDE_PROJECT_DIR', f"{home}/.claude/projects")
 now_iso = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
 # Under the atomic wrapper `d` is already loaded. Under env_python (dry-run) it
 # is loaded here: from the sibling .db file when one exists, else from the YAML.
 try:
     d
 except NameError:
     import sqlite3, sys
     from contextlib import closing
     db_path = os.path.splitext(yaml_path)[0] + '.db'
     d = {}
     try:
         if os.path.exists(db_path):
             # closing() releases the connection even when a query or decode fails.
             with closing(sqlite3.connect(db_path, timeout=10.0)) as conn:
                 row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
                 if row: d = json.loads(row[0])
                 try:
                     db_sessions = [json.loads(s_row[0])
                                    for s_row in conn.execute('SELECT data FROM sessions').fetchall()]
                     d['tmux_sessions'] = db_sessions
                 except sqlite3.OperationalError:
                     # The sessions query failed at the SQLite level: keep what
                     # the state row provided.
                     pass
         elif os.path.exists(yaml_path):
             with open(yaml_path) as f:
                 d = yaml.safe_load(f) or {}
     except Exception as exc:
         # Stay best-effort, but say so: with empty state every live tmux
         # session would otherwise be reported as unregistered with no hint why.
         print(f"reconcile: WARNING: could not load state ({exc}); continuing with what was loaded",
               file=sys.stderr)
     if not isinstance(d, dict):
         # The rest of the script calls d.get(...); a non-mapping document is unusable.
         print("reconcile: WARNING: state document is not a mapping; treating it as empty",
               file=sys.stderr)
         d = {}
 drifts = []
 actions = []
 # === Current tmux state — tell a transient failure apart from 'no sessions' (P1-E) ===
 tmux_sessions = []
 tmux_confirmed = True
 # Collect the distinct tmux_server values recorded in the YAML, plus the
 # TMUX_SERVER_NAME environment variable and the 'default' server.
 unique_servers = {'default'}
 if 'TMUX_SERVER_NAME' in os.environ:
     unique_servers.add(os.environ['TMUX_SERVER_NAME'])
 for s in d.get('tmux_sessions', []):
     srv = s.get('tmux_server') or 'default'
     unique_servers.add(srv)
 def _tmux_reports_no_sessions(stderr_text):
     """True when a failed `tmux ls` only means there is nothing to list."""
     err = (stderr_text or '').lower()
     if ('no server running' in err) or ('no sessions' in err) or ('failed to connect' in err):
         return True
     # tmux prints "error connecting to <socket> (No such file or directory)"
     # when the socket file does not exist, e.g. after a reboot or for a -L
     # server that was never started. That also means no sessions.
     return ('error connecting' in err) and ('no such file or directory' in err)
 try:
     for srv in sorted(unique_servers):
         cmd = ['tmux']
         if srv != 'default':
             cmd += ['-L', srv]
         cmd += ['ls', '-F', '#{session_name}|#{session_created}']
         r = subprocess.run(cmd, capture_output=True, text=True)
         if r.returncode == 0:
             for line in r.stdout.strip().split('\n'):
                 if not line:
                     continue
                 # Split from the right: the creation time is the last field, so
                 # a '|' inside the session name does not break the parse.
                 name, created = line.rsplit('|', 1)
                 tmux_sessions.append({'name': name, 'created': int(created), 'server': srv})
         elif not _tmux_reports_no_sessions(r.stderr):
             # Unrecognised failure: the tmux state is unknown, not empty.
             tmux_confirmed = False
 except Exception:
     tmux_confirmed = False
 def pane_meta(session, srv):
     """Return {'pid', 'cwd', 'cmd'} for the first pane listed for a tmux session.
     `srv` is the tmux server name; anything other than 'default' is passed
     with -L. Returns None when tmux fails or its output cannot be parsed.
     """
     try:
         cmd = ['tmux']
         if srv != 'default':
             cmd += ['-L', srv]
         cmd += ['list-panes', '-t', session, '-F',
                 '#{pane_pid}|#{pane_current_path}|#{pane_current_command}']
         out = subprocess.check_output(cmd, text=True)
         first = out.strip().split('\n')[0]
         # The path sits in the middle and may itself contain '|': peel the pid
         # off the left and the command off the right.
         pid, rest = first.split('|', 1)
         cwd, command = rest.rsplit('|', 1)
         return {'pid': int(pid), 'cwd': cwd, 'cmd': command}
     except Exception:
         return None
 yaml_sessions = d.get('tmux_sessions', [])
 # Names known to the YAML; rows without a name are ignored.
 yaml_session_names = {s['name'] for s in yaml_sessions if s.get('name')}
 # Live sessions keyed by (session name, tmux server).
 alive_set = {(t['name'], t.get('server', 'default')) for t in tmux_sessions}
 # === drift A: tmux dead + YAML running → auto-terminate ===
 # Only when the tmux answer was confirmed. On a transient failure nothing is
 # marked terminated (P1-E).
 if tmux_confirmed:
     for s in yaml_sessions:
         name = s.get('name')
         if not name:
             continue
         # 'stopped' is also a deliberate end state — not treated as drift, left as is.
         # (Otherwise a tmux-dead stopped session would be overwritten with
         # 'terminated' and lose its resumable flag.)
         if s.get('status') in ('terminated', 'archived', 'stopped'):
             continue
         srv = s.get('tmux_server') or 'default'
         if (name, srv) not in alive_set:
             s['status'] = 'terminated'
             s['terminated_at'] = now_iso
             s['terminated_at_epoch'] = int(datetime.now(timezone.utc).timestamp())
             s['termination_mode'] = 'auto-detected (tmux gone)'
             pane = s.get('pane') or {}
             drifts.append({'class': 'A', 'name': name,
                            'msg': f"{name}: tmux gone (was pane {pane.get('pid')}, cmd {pane.get('cmd')}). Marked terminated."})
             actions.append(f"terminated: {name}")
 # === drift B: tmux alive + not in YAML → auto-register ===
 # Only sessions whose name ends in -creator-claude / -creator-agy are considered,
 # and only when the tmux answer was confirmed.
 if tmux_confirmed:
     for t in tmux_sessions:
         name = t['name']
         if name in yaml_session_names:
             continue
         if not (name.endswith('-creator-claude') or name.endswith('-creator-agy')):
             continue
         srv = t.get('server', 'default')
         pm = pane_meta(name, srv)
         # Skip sessions whose pane metadata could not be read.
         if not pm:
             continue
         agent = 'claude' if name.endswith('-creator-claude') else 'agy'
         # NOTE(review): the full command line is assumed from the agent type, not
         # read from the pane — confirm it matches how the session was started.
         cmd_full = 'claude --dangerously-skip-permissions' if agent == 'claude' else 'agy --dangerously-skip-permissions'
         server_opt = f"-L {srv} " if srv != 'default' else ""
         entry = {
             'name': name,
             'status': 'running',
             'tmux_session_created_at': datetime.fromtimestamp(t['created'], tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
             'tmux_session_epoch': t['created'],
             'tmux_server': srv,
             'pane': {'index': 0, 'pid': pm['pid'], 'cmd': agent, 'cmd_full': cmd_full, 'cwd': pm['cwd']},
             # P2: quote the cwd
             'start_command': f'tmux {server_opt}new-session -d -s "{name}" -x 140 -y 40 -c "{pm["cwd"]}" "{cmd_full}"',
             'attach_command': f'tmux {server_opt}attach -t {name}',
             'kill_command': f'tmux {server_opt}kill-session -t {name}',
             'last_visible_status': 'running',
             'last_visible_note': 'auto-registered by monitor',
         }
         # Agent-specific placeholder fields; the own id starts out as None.
         if agent == 'claude':
             entry['tui'] = {'model': '(unknown — capture after first message)', 'provider': 'anthropic',
                             'plan': '(unknown)', 'account': '(unknown)', 'version': '(unknown)'}
             entry['claude_session_id_own'] = None
         else:
             entry['child_pid'] = 0
             entry['agy_conversation_id_own'] = None
             entry['mcp_attachments'] = [
                 {
                     'name': 'stitch',
                     'transport': 'mcp-remote',
                     'endpoint': 'https://stitch.googleapis.com/mcp'
                 }
             ]
         d.setdefault('tmux_sessions', []).append(entry)
         # Remember the name so the same session is not registered twice in this pass.
         yaml_session_names.add(name)
         drifts.append({'class': 'B', 'name': name,
                        'msg': f"{name}: tmux found but not in YAML. Auto-registered (pane {pm['pid']}, cmd {pm['cmd']}, cwd {pm['cwd']})."})
         actions.append(f"registered: {name}")
 # === drift C: a new claude session id materializes (per-row own id) ===
 import re
 def _claude_session_id(jsonl_path, max_lines=50):
     """Return the first non-empty sessionId among the leading records, else None.
     Scans up to `max_lines` lines instead of only the first one, so a leading
     record without a sessionId does not block detection. Unparseable and
     blank lines are skipped.
     """
     with open(jsonl_path) as f:
         for _, raw in zip(range(max_lines), f):
             raw = raw.strip()
             if not raw:
                 continue
             try:
                 rec = json.loads(raw)
             except ValueError:
                 continue
             if isinstance(rec, dict) and rec.get('sessionId'):
                 return rec['sessionId']
     return None
 for s in d.get('tmux_sessions', []):
     if not s.get('name', '').endswith('-creator-claude'):
         continue
     if s.get('status') != 'running':
         continue
     if s.get('claude_session_id_own'):
         continue
     cwd = (s.get('pane') or {}).get('cwd', '')
     if not cwd:
         continue
     proj_key = cwd.replace('/', '-').replace('_', '-')
     proj_dir = f"{claude_project_dir}/{proj_key}"
     if not os.path.isdir(proj_dir):
         # NOTE(review): Claude Code appears to map every non-alphanumeric
         # character (e.g. '.') to '-' in the project directory name — confirm.
         # Tried only as a fallback so existing lookups are unaffected.
         proj_dir = f"{claude_project_dir}/{re.sub(r'[^A-Za-z0-9]', '-', cwd)}"
         if not os.path.isdir(proj_dir):
             continue
     jsonls = sorted(glob.glob(f"{proj_dir}/*.jsonl"), key=os.path.getmtime, reverse=True)
     if not jsonls:
         continue
     latest = jsonls[0]
     # Only a transcript modified within the last 5 minutes is considered.
     if time.time() - os.path.getmtime(latest) > 300:
         continue
     try:
         sid = _claude_session_id(latest)
         if not sid:
             continue
     except Exception:
         continue
     s['claude_session_id_own'] = sid
     drifts.append({'class': 'C', 'name': s['name'], 'msg': f"{s['name']}: session id materialized: {sid}"})
     actions.append(f"updated session id: {sid}")
 # === drift C (agy): a new agy conversation id materializes (per-row own id) ===
 # For each running -creator-agy row without an own id, look its cwd up in agy's
 # last_conversations.json and adopt the id when the matching .db file exists.
 for row in d.get('tmux_sessions', []):
     is_candidate = (
         row.get('name', '').endswith('-creator-agy')
         and row.get('status') == 'running'
         and not row.get('agy_conversation_id_own')
     )
     if not is_candidate:
         continue
     workdir = (row.get('pane') or {}).get('cwd', '')
     if not workdir:
         continue
     cache_file = f"{home}/.gemini/antigravity-cli/cache/last_conversations.json"
     if not os.path.exists(cache_file):
         continue
     try:
         with open(cache_file) as fh:
             mapping = json.load(fh)
         conv_id = mapping.get(workdir)
         if not conv_id:
             continue
         if not os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{conv_id}.db"):
             continue
         row['agy_conversation_id_own'] = conv_id
         drifts.append({'class': 'C', 'name': row['name'], 'msg': f"{row['name']}: conversation id materialized: {conv_id}"})
         actions.append(f"updated conversation id: {conv_id}")
     except Exception:
         # Best effort: an unreadable or malformed cache file is ignored.
         continue
 # === drift D: stale UUID (the cached id's artifact is gone) — report only, no change ===
 identities = d.get('agent_identities', {}) or {}
 claude_sid = (identities.get('claude') or {}).get('session_id')
 # The claude transcript may sit under any project directory, hence the glob.
 if claude_sid and not glob.glob(f"{claude_project_dir}/*/{claude_sid}.jsonl"):
     drifts.append({'class': 'D', 'name': '(claude identity cache)',
                    'msg': f"stale UUID in agent_identities.claude.session_id: {claude_sid} (jsonl missing)"})
 agy_cid = (identities.get('agy') or {}).get('conversation_id')
 if agy_cid and not os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{agy_cid}.db"):
     drifts.append({'class': 'D', 'name': '(agy identity cache)',
                    'msg': f"stale UUID in agent_identities.agy.conversation_id: {agy_cid} (.db missing)"})
 # JSON report printed on stdout; live sessions are listed as "name|server".
 result = {
     'timestamp': now_iso,
     'yaml_path': yaml_path,
     'tmux_sessions_alive': sorted(f"{t['name']}|{t.get('server', 'default')}" for t in tmux_sessions),
     'tmux_confirmed': tmux_confirmed,
     'drifts': drifts,
     'actions': actions,
 }
 print(json.dumps(result, indent=2, ensure_ascii=False))
 # Atomic wrapper: skip the write when there are no actions. Harmless under
 # env_python (dry-run).
 if not actions:
     raise SystemExit(0)
 PYEOF
 # Feed the reconcile source on stdin to the read-only runner (dry run) or to
 # the atomic YAML writer (normal run).
 if [ "$DRY_RUN" = "1" ]; then
  printf '%s' "$RECON_SRC" | env_python "$AGENT_SESSIONS_YAML"
 else
  printf '%s' "$RECON_SRC" | atomic_dump_yaml "$AGENT_SESSIONS_YAML"
 fi
@@ -0,0 +1,151 @@
 ---
 name: multi-agent-mux-resume
 description: "Resume an existing agent (claude, antigravity/agy) conversation by UUID into a tmux session. Reads .mam/agent-sessions.yaml for the saved session/conversation id, spawns (or reuses) a tmux session of the matching name, and runs `claude -r <id>` or `agy --conversation <id>` inside. Use when you want to reattach to a previous session's context, or revive a session whose tmux died but the agent's conversation is still on disk."
 version: 1.0.0
 author: godopu
 license: MIT
 platforms: [linux, macos]
 environments: [terminal, tmux]
 metadata:
  hermes:
    tags: [agent, tmux, claude, antigravity, agy, multi-agent, context, resume, session-id]
    related_skills: [multi-agent-mux-create, multi-agent-mux-stop, multi-agent-mux-monitor, claude-code]
    prereq_skills: [multi-agent-mux-create]
 ---
 # Multi-Agent Resume — Reattach to a Saved Conversation
 > **Companion skills**: `multi-agent-mux-create` (start a fresh agent), `multi-agent-mux-stop` (terminate), `multi-agent-mux-monitor` (live status).
 > **Tmux Isolation**: `TMUX_SERVER_NAME` env var를 create에서 설정한 경우, 동일 서버에서 동작합니다. 자세한 격리 패턴은 [multi-agent-mux-create/SKILL.md](../multi-agent-mux-create/SKILL.md) 참조.
 > **Single source of truth**: `./.mam/agent-sessions.yaml`.
 ## What this skill does
 **Container + data reconstruction**: spawn a tmux session (the container), then run the agent inside with a specific session id (the data) so the previous conversation's context is restored.
 Three cases this skill handles:
 1. **tmux is dead, conversation lives** — `agent-sessions.yaml` has the UUID. The JSONL/db is on disk. Re-spawn the tmux session + run `claude -r <id>` / `agy --conversation <id>`.
 2. **tmux is alive but empty** — You started a session with `multi-agent-mux-create` but haven't sent a message yet (so no session id was assigned). The user can either send their first message (and the id is auto-assigned), or you can read the *workspace's* most recent conversation from `$HOME_DIR/.gemini/antigravity-cli/cache/last_conversations.json` (defaults to `~/.gemini/...`) for agy, or the latest `*.jsonl` in `$CLAUDE_PROJECT_DIR/<workspace-key>/` (defaults to `~/.claude/projects/`) for claude.
 3. **tmux is alive AND the agent inside is already running** — Just attach. No re-spawn needed.
 ### Resuming a `stopped` session (`stopped → running`)
 When a session was ended via `multi-agent-mux-stop` (which captures the ID and gracefully stops by default),
 its row is `status: stopped` with `resumable: true` and the conversation id
 already recorded in `claude_session_id_own` / `agy_conversation_id_own`. This is the
 ideal resume path:
 - **tier-1, race-free**: because the stop command wrote the id into the row at stop
  time, `resolve_session_id.sh` resolves it via `find_workspace_uuid` tier-1 (the
  per-row own id) — no reliance on the mtime-based disk scan, so a concurrent
  session in another workspace can never shadow it.
 - On resume, `update_yaml_resumed.sh` transitions `stopped → running` and **clears
  the stop metadata** (`stopped_at`, `stopped_at_epoch`, `stop_reason`, `resumable`)
  along with the usual `terminated_at*` / `termination_mode` / `archived_at`, so the
  row reflects a clean running state with no stale end-of-session fields.
 ## UUID resolution order
 `agent-sessions.yaml` is the *primary* source. `resolve_session_id.sh` (via `lib.sh::find_workspace_uuid`) resolves the id in this workspace-scoped order and never returns another workspace's id:
 1. **`agent-sessions.yaml` → the session row's own id: `claude_session_id_own` (claude) / `agy_conversation_id_own` (agy)** — explicit saved value (tier-1, race-free)
 2. **Fallback: scan disk for the workspace's most recent conversation** (Note: `CLAUDE_PROJECT_DIR` overrides the default `~/.claude/projects/` path, and `HOME_DIR` overrides the `~` path) —
   - claude: `ls -t $CLAUDE_PROJECT_DIR/<workspace-key>/*.jsonl | head -1` and parse the `sessionId` from the first line
   - agy: `jq -r '."<workspace>"' $HOME_DIR/.gemini/antigravity-cli/cache/last_conversations.json`
 3. **`agent-sessions.yaml` → the `agent_identities.<agent>` cache: `session_id` / `session_jsonl` (claude), `conversation_id` / `conversation_db` (agy)** — used only when the cached entry's cwd matches the workspace; it is a cache, not the primary source
 If all three are empty → the workspace has no conversation yet. Fall back to `multi-agent-mux-create`.
 ## Workflow
 ```bash
 WORKSPACE=/path/to/project
 AGENT=claude  # or agy or hermes
 SESSION_NAME=<workspace>-creator-<agent>  # same convention as multi-agent-mux-create
 # 1. Resolve the session id
 UUID=$(bash .agents/skills/multi-agent-mux-resume/scripts/resolve_session_id.sh \
  --workspace "$WORKSPACE" --agent "$AGENT")
 if [ -z "$UUID" ]; then
  echo "No saved session for $WORKSPACE ($AGENT). Use multi-agent-mux-create first."
  exit 1
 fi
 # Resolve the isolated tmux server name
 source .agents/skills/lib.sh
 export TMUX_SERVER_NAME="$(resolve_tmux_server "$SESSION_NAME")"
 # 2. If tmux is alive, attach. Done.
 if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
  echo "tmux '$SESSION_NAME' already running. Attaching..."
  exec tmux attach -t "$SESSION_NAME"
 fi
 # 3. Spawn new tmux session + run agent with the saved id
 case "$AGENT" in
  claude)
    tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" \
      "claude --dangerously-skip-permissions -r $UUID"
    # auto-handle trust / bypass dialogs
    sleep 5
    tmux send-keys -t "$SESSION_NAME" Enter 2>/dev/null || true
    sleep 3
    tmux send-keys -t "$SESSION_NAME" Down 2>/dev/null || true
    sleep 0.3
    tmux send-keys -t "$SESSION_NAME" Enter 2>/dev/null || true
    ;;
  agy)
    tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" \
      "agy --dangerously-skip-permissions --conversation $UUID"
    ;;
  hermes)
    tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" \
      "hermes --resume $UUID"
    ;;
 esac
 # 4. Update agent-sessions.yaml: status running, last_visible_status
 # (Also automatically publishes a `progress --detail "resumed"` event to the multi-agent-mux-delegate-job registry if a delegate_job_id exists)
 bash .agents/skills/multi-agent-mux-resume/scripts/update_yaml_resumed.sh \
  --session "$SESSION_NAME" --uuid "$UUID"
 # 5. Attach
 tmux attach -t "$SESSION_NAME"
 ```
 ## Pitfalls
 - **`claude -r` requires the SAME project directory** — if the workspace path differs from when the session was created, claude will create a new project dir key (`-home-...-different-name`) and put the resume in a different location. Always `-c` (cd to workspace) before running.
 - **agy's `--conversation` flag name varies by version** — older versions used `--resume` or `-r`. Check `agy --help | grep -E "conversation|resume"` and use the right flag. v1.0.x: `--conversation`.
 - **The first message after resume might re-trigger TUI dialogs** — if the original session was created with `--dangerously-skip-permissions`, those flags are NOT persisted; you must re-apply them on resume. The script above re-passes them.
 - **Don't resume if the session is brand new and empty** — `multi-agent-mux-create` already set up an empty container; sending a probe message ("init") is the right way to materialize a session id, NOT `claude -r` with a placeholder.
 - **`agy --conversation <id>` will fail if the conversation was deleted from disk** — check `~/.gemini/antigravity-cli/conversations/<uuid>.db` exists before attempting resume. If missing, the conversation is gone; you need a fresh session via `multi-agent-mux-create`.
 ## Verification
 ```bash
 # 1. tmux alive with the right cmd
 tmux list-panes -t "$SESSION_NAME" -F 'cmd=#{pane_current_command} cwd=#{pane_current_path}'
 # 2. agent-sessions.yaml updated
 python3 -c "
 import yaml
 d = yaml.safe_load(open('.mam/agent-sessions.yaml'))
 s = [s for s in d['tmux_sessions'] if s['name'] == '$SESSION_NAME'][0]
 print(f'  status: {s[\"status\"]}')
 print(f'  pane.cmd_full: {s[\"pane\"][\"cmd_full\"]}')
 "
 # 3. TUI shows resumed conversation (capture-pane to verify)
 sleep 5
 tmux capture-pane -t "$SESSION_NAME" -p -S -30
 # look for the previous message at top of the buffer (claude) or last_visible_status set (agy)
 ```
 ## When NOT to use this skill
 - **No saved session yet** → `multi-agent-mux-create`
 - **Killing an existing session** → `multi-agent-mux-stop`
 - **Just attaching** → `tmux attach -t <name>` (no skill needed)
@@ -0,0 +1,40 @@
 #!/usr/bin/env bash
 # resolve_session_id.sh — multi-agent-mux-resume 의 부속 스크립트
 # Usage:
 #   bash resolve_session_id.sh --workspace <path> --agent <claude|agy|hermes>
 # 출력: stdout 으로 UUID 한 줄 (없으면 빈 줄 + exit 0)
 #
 # P0-C: 전역 agent_identities 를 즉시 반환하지 않는다. lib.sh::find_workspace_uuid
 # 가 워크스페이스 격리된 해결 경로(per-row own id -> 디스크 스캔 -> cwd 일치하는
 # cache)만 사용. 다른 워크스페이스의 UUID 를 절대 반환하지 않음.
 set -euo pipefail
 source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
 # usage — print the command-line synopsis to stdout.
 # The agent list matches what the --agent validation in this script accepts
 # (claude, agy, hermes); the previous text omitted hermes.
 usage() {
  printf 'Usage: %s --workspace <path> --agent <claude|agy|hermes>\n' "$0"
  printf 'Outputs the resolved UUID on stdout (empty if not found).\n'
 }
 WORKSPACE=""
 AGENT=""
 # Parse flags. A value-taking flag must be followed by its value: under
 # `set -u`, reading a missing "$2" would otherwise abort with a bare
 # "unbound variable" message instead of the documented usage error (exit 2).
 while [ $# -gt 0 ]; do
  case "$1" in
    --workspace|--agent)
      [ $# -ge 2 ] || { echo "ERROR: $1 requires a value" >&2; exit 2; }
      case "$1" in
        --workspace) WORKSPACE="$2" ;;
        --agent)     AGENT="$2" ;;
      esac
      shift 2
      ;;
    -h|--help)   usage; exit 0 ;;
    *) echo "ERROR: unknown arg: $1" >&2; exit 2 ;;
  esac
 done
 # Both flags are mandatory.
 [ -n "$WORKSPACE" ] || { echo "ERROR: --workspace required" >&2; exit 2; }
 [ -n "$AGENT" ]    || { echo "ERROR: --agent required" >&2; exit 2; }
 # Reject agents this skill does not know how to resume.
 case "$AGENT" in
  claude|agy|hermes) ;;
  *) echo "ERROR: --agent must be claude or agy or hermes" >&2; exit 2 ;;
 esac
 # Delegate to the shared workspace-scoped resolver; its stdout (the UUID, or
 # nothing) is this script's output.
 find_workspace_uuid "$WORKSPACE" "$AGENT"
@@ -0,0 +1,156 @@
 #!/usr/bin/env bash
 # update_yaml_resumed.sh — multi-agent-mux-resume 의 부속 스크립트
 # Resume 한 세션의 agent-sessions.yaml 엔트리를 status=running + resume 메타로 갱신.
 # resume UUID 를 per-row own id (claude_session_id_own / agy_conversation_id_own)
 # 에 박는다 — agent_identities 전역은 더 이상 primary 아님 (cache 로 강등, P0-C/단계 e).
 #
 # Usage: bash update_yaml_resumed.sh --session <name> --uuid <id> [--agent claude|agy|hermes]
 set -euo pipefail
 source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
 # usage — print the command-line synopsis to stdout.
 # hermes is listed because this script both infers it from the session name
 # and handles it when writing the row; the previous text omitted it.
 usage() {
  printf 'Usage: %s --session <name> --uuid <id> [--agent claude|agy|hermes]\n' "$0"
 }
 SESSION_NAME=""
 UUID=""
 AGENT=""
 # Parse flags. A value-taking flag must be followed by its value: under
 # `set -u`, reading a missing "$2" would otherwise abort with a bare
 # "unbound variable" message instead of a usage error (exit 2).
 while [ $# -gt 0 ]; do
  case "$1" in
    --session|--uuid|--agent)
      [ $# -ge 2 ] || { echo "ERROR: $1 requires a value" >&2; exit 2; }
      case "$1" in
        --session) SESSION_NAME="$2" ;;
        --uuid)    UUID="$2" ;;
        --agent)   AGENT="$2" ;;
      esac
      shift 2
      ;;
    -h|--help) usage; exit 0 ;;
    *) echo "ERROR: unknown arg: $1" >&2; exit 2 ;;
  esac
 done
 [ -n "$SESSION_NAME" ] || { echo "ERROR: --session required" >&2; exit 2; }
 [ -n "$UUID" ]        || { echo "ERROR: --uuid required" >&2; exit 2; }
 [ -f "$AGENT_SESSIONS_YAML" ] || { echo "ERROR: $AGENT_SESSIONS_YAML not found" >&2; exit 1; }
 # Target the tmux server this session is registered on for the tmux calls below.
 export TMUX_SERVER_NAME="$(resolve_tmux_server "$SESSION_NAME")"
 # When --agent is not given, fall back to the session-name suffix
 # (P1-F: pass --agent explicitly whenever possible).
 if [ -z "$AGENT" ]; then
  case "$SESSION_NAME" in
    *-creator-claude) AGENT=claude ;;
    *-creator-agy)    AGENT=agy ;;
    *-creator-hermes) AGENT=hermes ;;
    *) echo "ERROR: cannot infer agent from '$SESSION_NAME'; pass --agent" >&2; exit 2 ;;
  esac
 fi
 # An explicitly passed agent must be one the row update knows how to record;
 # otherwise the row would be flipped to running without any command or id.
 case "$AGENT" in
  claude|agy|hermes) ;;
  *) echo "ERROR: --agent must be claude or agy or hermes" >&2; exit 2 ;;
 esac
 NOW_ISO=$(date -u +'%Y-%m-%dT%H:%M:%SZ')
 # Capture the new tmux pane pid / child pid in bash (passed on via env, P1-B).
 # Lookup failures are tolerated: PANE_PID stays empty and CHILD_PID stays 0.
 PANE_PID=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}' 2>/dev/null | head -1 || true)
 PANE_PID="${PANE_PID:-}"
 CHILD_PID=0
 # Only agy and hermes get a child pid: the direct child of the pane process
 # whose name is exactly the agent name.
 if { [ "$AGENT" = "agy" ] || [ "$AGENT" = "hermes" ]; } && [ -n "$PANE_PID" ]; then
  CHILD_PID=$(pgrep -P "$PANE_PID" -x "$AGENT" 2>/dev/null | head -1 || true)
  CHILD_PID="${CHILD_PID:-0}"
 fi
 # Look up the delegate_job_id recorded on this session's row; the captured
 # output is empty when none is found. The snippet below only reads — it never
 # writes the YAML or the db.
 # NOTE(review): YAML_PATH is presumably exported by env_python from its first
 # argument — confirm in lib.sh.
 DELEGATE_JOB_ID=$(env_python "$AGENT_SESSIONS_YAML" SESSION_NAME="$SESSION_NAME" <<'PYEOF'
 import os, sys, sqlite3, json, yaml
 name = os.environ['SESSION_NAME']
 yaml_path = os.environ['YAML_PATH']
 # The SQLite store sits next to the YAML file: same path with a .db extension.
 db_path = os.path.splitext(yaml_path)[0] + '.db'
 d = {}
 try:
    if os.path.exists(db_path):
        conn = sqlite3.connect(db_path, timeout=10.0)
        try:
            # Preferred source: this session's own row in the `sessions` table.
            row = conn.execute('SELECT data FROM sessions WHERE name=?', (name,)).fetchone()
            if row:
                s = json.loads(row[0])
                print(s.get('delegate_job_id', '') or '')
                # SystemExit is not an Exception subclass, so this early exit
                # passes straight through the broad `except Exception` below.
                raise SystemExit(0)
        except sqlite3.OperationalError:
            # The sessions query failed (presumably the table is absent) —
            # fall through to the whole-document blob in `state`.
            pass
        # Reached when the sessions query failed or had no row for this name.
        row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
        if row:
            d = json.loads(row[0])
        conn.close()
    elif os.path.exists(yaml_path):
        # No db file: read the YAML document directly.
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
 except Exception:
    # Best effort: any load failure leaves d as loaded so far and prints nothing.
    pass
 # Search the loaded document for the row with this session name.
 for s in d.get('tmux_sessions', []):
    if s.get('name') == name:
        print(s.get('delegate_job_id', '') or '')
        raise SystemExit(0)
 # Not found: exit 0 with no output.
 raise SystemExit(0)
 PYEOF
 )
 # Rewrite the session row through atomic_dump_yaml (the only sanctioned writer).
 # All inputs reach the Python snippet through environment variables.
 atomic_dump_yaml "$AGENT_SESSIONS_YAML" \
  SESSION_NAME="$SESSION_NAME" \
  UUID="$UUID" \
  AGENT="$AGENT" \
  NOW_ISO="$NOW_ISO" \
  PANE_PID="$PANE_PID" \
  CHILD_PID="$CHILD_PID" <<'PYEOF'
 sess_name = os.environ['SESSION_NAME']
 resume_id = os.environ['UUID']
 agent_kind = os.environ['AGENT']
 resumed_at = os.environ['NOW_ISO']
 pane_pid_raw = os.environ.get('PANE_PID', '')
 # Locate the first row registered under this session name.
 entry = next((cand for cand in d.get('tmux_sessions', []) if cand.get('name') == sess_name), None)
 if entry is None:
    print(f"ERROR: session not in YAML: {sess_name}", flush=True)
    raise SystemExit(1)
 entry['status'] = 'running'
 # A resumed session is neither terminated/archived nor stopped any more, so
 # drop every end-of-session field left behind by a previous stop/terminate.
 for stale_field in ('terminated_at', 'terminated_at_epoch', 'termination_mode', 'archived_at',
                     'stopped_at', 'stopped_at_epoch', 'stop_reason', 'resumable'):
    entry.pop(stale_field, None)
 entry['last_visible_status'] = f'resumed conversation {resume_id} at {resumed_at}'
 pane_info = entry.setdefault('pane', {})
 if pane_pid_raw.isdigit():
    pane_info['pid'] = int(pane_pid_raw)
 # agent -> (full resume command, per-row own-id key, record child pid?)
 resume_specs = {
    'claude': (f'claude --dangerously-skip-permissions -r {resume_id}', 'claude_session_id_own', False),
    'agy': (f'agy --dangerously-skip-permissions --conversation {resume_id}', 'agy_conversation_id_own', True),
    'hermes': (f'hermes --resume {resume_id}', 'hermes_conversation_id_own', True),
 }
 if agent_kind in resume_specs:
    resume_cmd, own_id_key, records_child = resume_specs[agent_kind]
    pane_info['cmd'] = agent_kind
    pane_info['cmd_full'] = resume_cmd
    # The resume id is pinned on the row itself (per-row own id).
    entry[own_id_key] = resume_id
    if records_child:
        child_raw = os.environ.get('CHILD_PID', '0')
        if child_raw.isdigit() and int(child_raw) > 0:
            entry['child_pid'] = int(child_raw)
 # Refresh the document-level snapshot stamp and clear its termination marks.
 snapshot_meta = d.setdefault('snapshot', {})
 snapshot_meta['taken_at'] = resumed_at
 for stale_field in ('terminated_at', 'terminated_at_epoch'):
    snapshot_meta.pop(stale_field, None)
 print(f"updated: {sess_name} status=running (resume id -> per-row own id)", flush=True)
 PYEOF
 # Publish a "resumed" progress event for the job id looked up earlier.
 delegate_publish_event "$DELEGATE_JOB_ID" progress "resumed"
@@ -0,0 +1,124 @@
 ---
 name: multi-agent-mux-status
 description: "Read-only instant snapshot of all agent tmux sessions — name, YAML status, tmux alive, pane cmd/cwd, resume UUID on disk, and any drift. No Kanban, no mutation. Reuses reconcile.sh --dry-run for the diff logic. Use when you want to know 'what's running RIGHT NOW' without spinning up a Kanban monitor worker."
 version: 1.0.0
 author: godopu
 license: MIT
 platforms: [linux, macos]
 environments: [terminal, tmux]
 metadata:
  hermes:
    tags: [agent, tmux, claude, antigravity, agy, status, read-only, snapshot]
    related_skills: [multi-agent-mux-create, multi-agent-mux-resume, multi-agent-mux-stop, multi-agent-mux-monitor]
    prereq_skills: [multi-agent-mux-create, multi-agent-mux-monitor]
 ---
 # Multi-Agent Status — Read-Only Instant Snapshot
 > **Companion skills**: `multi-agent-mux-create` (start), `multi-agent-mux-resume` (re-attach), `multi-agent-mux-stop` (terminate), `multi-agent-mux-monitor` (live polling).
 > **Tmux Isolation**: `status` 명령은 YAML에 등록된 모든 세션의 격리 서버(`tmux_server` 필드)를 자동으로 조회하여 상태를 확인하므로, `TMUX_SERVER_NAME` 환경변수를 수동으로 지정하지 않아도 모든 격리 서버의 세션 상태를 통합 조회합니다.
 > **Single source of truth**: `./.mam/agent-sessions.yaml`.
 ## What this skill does
 Print a single table of every agent tmux session, comparing YAML state to actual tmux state. **No mutation. No Kanban. No polling loop.**
 This is the "what's running right now?" answer — faster than dispatching `multi-agent-mux-monitor` (which polls every 30s) and safer than `reconcile.sh --once --emit-diff` (which mutates as a side effect).
 ## Pre-flight
 ```bash
 command -v tmux
 command -v python3
 test -f .mam/agent-sessions.yaml
 ```
 If `agent-sessions.yaml` doesn't exist or is malformed → print clear error, exit 1. **Do not create it.** (Use `multi-agent-mux-create` first.)
 ## Workflow
 ```bash
 bash .agents/skills/multi-agent-mux-status/scripts/status.sh [--json]
 ```
 The script:
 1. Calls `reconcile.sh --once --emit-diff --dry-run` (read-only; no YAML mutation) for the drift snapshot
 2. Loads `agent-sessions.yaml` (read-only) to enrich the table
 3. For each row in `tmux_sessions[]`:
   - tmux alive? (via `tmux has-session -t <name>`)
   - pane cmd, cwd (via `tmux list-panes`)
   - resume UUID on disk? (claude: `$CLAUDE_PROJECT_DIR/<key>/<uuid>.jsonl` with default `~/.claude/projects/`; agy: `$HOME_DIR/.gemini/antigravity-cli/conversations/<uuid>.db` with default `~/.gemini/...`)
 4. For each tmux session matching `*-creator-*` not in YAML → flag as "unregistered"
 5. Prints a table (default) or JSON (with `--json`)
 ## Output format (default = aligned table)
 ```
 agent-sessions status — 2026-06-19T14:20:00Z  (tmux_confirmed=True)
 ========================================================================================================================================
 NAME                                         SERVER       YAML       TMUX   CMD    RESUME   JOB_ID    JOB_STATUS   DRIFT
 ----------------------------------------------------------------------------------------------------------------------------------------
 lab-landing-page-creator-claude              default      running    alive  claude yes      -         -            -
 lab-landing-page-creator-agy                 default      terminated dead   agy    yes      5fe09ba8  completed    -
 lab-paper-pdf2md-creator-claude              default      running    alive  claude scan     -         -            -
 ========================================================================================================================================
 ```
 ## Output format (`--json`)
 ```json
 {
  "yaml_path": "...",
  "tmux_sessions_alive": ["..."],
  "yaml_entries": [...],
  "rows": [
    {
      "name": "lab-landing-page-creator-claude",
      "yaml_status": "running",
      "tmux_alive": true,
      "pane_cmd": "claude",
      "pane_cwd": "/home/.../refer_landing_page",
      "resume_uuid_on_disk": true,
      "drift": null
    },
    {
      "name": "lab-landing-page-creator-agy",
      "yaml_status": "terminated",
      "tmux_alive": false,
      "drift": "yaml-says-terminated-but-disk-uuid-still-present"
    }
  ],
  "unregistered": [],
  "drifts": []
 }
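 ```
 > **Note**: the current `status.sh --json` prints the `reconcile.sh --once --emit-diff --dry-run` snapshot verbatim, so the keys actually emitted are `timestamp`, `yaml_path`, `tmux_sessions_alive` (entries formatted `<name>|<server>`), `tmux_confirmed`, `drifts` and `actions`. The per-row `rows` / `yaml_entries` / `unregistered` fields shown above are not produced by the script yet.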
 ## Drift classes (read-only — never mutates)
 | Class | Detection | Meaning |
 |---|---|---|
 | `A` | YAML `running`, tmux dead | session died without going through `multi-agent-mux-stop`. *Could* auto-terminate but won't — that's `multi-agent-mux-monitor`'s job. |
 | `B` | tmux alive, not in YAML | ad-hoc session someone started without `multi-agent-mux-create`. Suggest: "use multi-agent-mux-create to register, or tmux kill-session to clean up." |
 | `C` | YAML has `claude_session_id_own: null` AND a new *.jsonl exists | new session id materialized; suggest: "run multi-agent-mux-resume or reconcile to register it." |
 | `D` | YAML has UUID in `agent_identities`, but the on-disk artifact is gone | stale UUID; user should `multi-agent-mux-stop --purge-conversation` to clean up. |
 ## Pitfalls
 - **Do NOT use this skill to drive mutations** — the output is a snapshot, not a call to action. If you need to fix drifts, dispatch `multi-agent-mux-monitor` (Kanban worker) or run `multi-agent-mux-resume` / `multi-agent-mux-stop` manually.
 - **Read-only is enforced by script** — `status.sh` opens the YAML with `open(path)` (no `'w'`), never calls `tmux kill-session`, never writes anywhere. The `reconcile.sh --dry-run` mode is the same path.
 - **If `agent-sessions.yaml` is malformed** — print the YAML error verbatim and exit 1. Do NOT attempt recovery (that's `multi-agent-mux-stop --purge-conversation` or manual edit's job).
 - **Sessions outside the `<workspace>-creator-*` naming convention** are still shown but tagged `ad-hoc` — they didn't go through `multi-agent-mux-create` and aren't tracked in YAML.
 ## When to use
 - "Is the claude session still running?" → this skill, not the monitor
 - "What UUID does this workspace have?" → this skill
 - "Is there drift between YAML and reality?" → this skill, then dispatch monitor or fix manually
 - Quick sanity check before dispatching a long Kanban task
 ## When NOT to use
 - Continuous live tracking → `multi-agent-mux-monitor` (Kanban worker)
 - Recovering from corruption → manual edit + `.bak` restore
 - Polling more than once a minute → `multi-agent-mux-monitor` (it dedupes)
@@ -0,0 +1,140 @@
 #!/usr/bin/env bash
 # status.sh — helper script for multi-agent-mux-status (READ-ONLY)
 # Prints the current agent-session status table in a single call; no side effects.
 # Drift is computed by reusing reconcile.sh --dry-run (P1-E); the table is then
 # enriched from the YAML / on-disk state. The YAML is never modified.
 #
 # Usage: bash status.sh [--json]
 set -euo pipefail
 source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
 RECONCILE="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/multi-agent-mux-monitor/scripts/reconcile.sh"
 # --json is only recognized as the first argument.
 JSON=0
 if [[ "${1:-}" == "--json" ]]; then
  JSON=1
 fi
 if [[ ! -f "$AGENT_SESSIONS_YAML" ]]; then
  echo "ERROR: $AGENT_SESSIONS_YAML not found. Run multi-agent-mux-create first." >&2
  exit 1
 fi
 # Read-only drift snapshot: reconcile.sh in --dry-run mode (no side effects).
 DRIFT_JSON="$(bash "$RECONCILE" --once --emit-diff --dry-run)"
 # JSON mode emits the reconcile snapshot as-is and stops here.
 if [[ "$JSON" == "1" ]]; then
  printf '%s\n' "$DRIFT_JSON"
  exit 0
 fi
 # Project root (parent of .agents/) holds the multi-agent-mux-delegate-job .mam registry.
 # Resolved relative to this script — no hardcoded absolute path (review item 6).
 PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../" && pwd)"
 # Render the table in Python. Inputs arrive via environment variables: the
 # reconcile snapshot (DRIFT_JSON) and the project root (PROJECT_ROOT).
 DRIFT_JSON="$DRIFT_JSON" env_python "$AGENT_SESSIONS_YAML" PROJECT_ROOT="$PROJECT_ROOT" <<'PYEOF'
 import os, sys, json, glob, sqlite3
 import yaml
 yaml_path = os.environ['YAML_PATH']
 home = os.environ['HOME_DIR']
 claude_project_dir = os.environ.get('CLAUDE_PROJECT_DIR', f"{home}/.claude/projects")
 drift = json.loads(os.environ['DRIFT_JSON'])
 # The SQLite store sits next to the YAML file: same path with a .db extension.
 db_path = os.path.splitext(yaml_path)[0] + '.db'
 d = {}
 try:
    if os.path.exists(db_path):
        conn = sqlite3.connect(db_path, timeout=10.0)
        try:
            # Whole-document blob first, then per-session rows override its list.
            row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
            if row: d = json.loads(row[0])
            try:
                db_sessions = [json.loads(s_row[0])
                               for s_row in conn.execute('SELECT data FROM sessions').fetchall()]
                d['tmux_sessions'] = db_sessions
            except sqlite3.OperationalError:
                # The sessions query failed; keep whatever the state blob provided.
                pass
        finally:
            # Always release the connection, including when a query or decode fails.
            conn.close()
    elif os.path.exists(yaml_path):
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
 except Exception as load_err:
    # Still best effort (the table is rendered from whatever was loaded), but
    # say so on stderr: a silent failure is indistinguishable from an empty
    # registry in the "(no sessions registered)" output.
    failed_src = db_path if os.path.exists(db_path) else yaml_path
    print(f"WARNING: could not load session state from {failed_src}: {load_err}", file=sys.stderr)
 # Alive entries are "<name>|<server>" strings produced by reconcile.sh.
 alive = set(drift.get('tmux_sessions_alive', []))
 # Drift classes grouped by the name each drift entry refers to.
 drift_by_name = {}
 for dr in drift.get('drifts', []):
    drift_by_name.setdefault(dr['name'], []).append(dr['class'])
 def resume_on_disk(s):
    """Report whether a resumable conversation artifact exists for row `s`.

    Workspace-scoped check only: it looks at the row's own id and never at a
    global identity (P0-C). The agent is taken from the session-name suffix.

    Returns one of:
      'yes'     - the row's own id has its artifact on disk
      'MISSING' - the row has an own id but the artifact is not on disk
      'scan'    - claude only: no own id, but some *.jsonl exists for the cwd key
      'no'      - no own id and nothing to scan
      '?'       - the name matches neither the claude nor the agy suffix
    """
    session = s.get('name', '')
    workdir = (s.get('pane') or {}).get('cwd', '')
    if session.endswith('-creator-claude'):
        # Project-directory key: the cwd with '/' and '_' both turned into '-'.
        project_path = f"{claude_project_dir}/{workdir.replace('/', '-').replace('_', '-')}"
        own_id = s.get('claude_session_id_own')
        if not own_id:
            return 'scan' if glob.glob(f"{project_path}/*.jsonl") else 'no'
        return 'yes' if os.path.exists(f"{project_path}/{own_id}.jsonl") else 'MISSING'
    if session.endswith('-creator-agy'):
        own_id = s.get('agy_conversation_id_own')
        if not own_id:
            return 'no'
        db_file = f"{home}/.gemini/antigravity-cli/conversations/{own_id}.db"
        return 'yes' if os.path.exists(db_file) else 'MISSING'
    return '?'
 def get_job_status(s):
    """Return (job_id, status) for the delegate job attached to row `s`.

    Rows without a delegate_job_id yield ('-', '-'). Otherwise the first
    candidate file that exists and parses supplies the status; a parsed file
    lacking a 'status' key, or no usable file at all, yields 'unknown'.
    """
    job_id = s.get('delegate_job_id')
    if not job_id:
        return ('-', '-')
    root = os.environ.get('PROJECT_ROOT', '.')
    registry_file = f"{job_id}.json"
    # Candidate locations (review item 6: project-root-relative, no hardcoded abs paths):
    #   1) cwd-relative registry  2) project-root registry  3) project-root audit log
    search_paths = (
        os.path.join('.mam', 'jobs', registry_file),
        os.path.join(root, '.mam', 'jobs', registry_file),
        os.path.join(root, '.mam', 'delegate_job_logs', job_id, 'status.json'),
    )
    for candidate in search_paths:
        if not os.path.exists(candidate):
            continue
        try:
            with open(candidate) as handle:
                payload = json.load(handle)
            return (job_id, payload.get('status', 'unknown'))
        except Exception:
            # Unreadable or malformed file: try the next location.
            continue
    return (job_id, 'unknown')
 # --- table rendering -------------------------------------------------------
 # One shared layout for the header and every data row keeps the columns aligned.
 RULE_WIDTH = 136
 ROW_FMT = "{:<44} {:<12} {:<10} {:<6} {:<6} {:<8} {:<10} {:<12} {}"
 sessions = d.get('tmux_sessions', [])
 print(f"agent-sessions status — {drift['timestamp']}  (tmux_confirmed={drift['tmux_confirmed']})")
 print("=" * RULE_WIDTH)
 print(ROW_FMT.format('NAME', 'SERVER', 'YAML', 'TMUX', 'CMD', 'RESUME', 'JOB_ID', 'JOB_STATUS', 'DRIFT'))
 print("-" * RULE_WIDTH)
 if not sessions:
    print("(no sessions registered)")
 for sess in sessions:
    sess_name = sess.get('name', '?')
    sess_server = sess.get('tmux_server') or 'default'
    yaml_state = sess.get('status', '?')
    # Liveness is matched on the same "<name>|<server>" key reconcile.sh emits.
    liveness = 'alive' if f"{sess_name}|{sess_server}" in alive else 'dead'
    pane_cmd = (sess.get('pane') or {}).get('cmd', '?')
    resume_state = resume_on_disk(sess)
    job_id, job_state = get_job_status(sess)
    drift_classes = ','.join(drift_by_name.get(sess_name, [])) or '-'
    print(ROW_FMT.format(sess_name, sess_server, yaml_state, liveness, pane_cmd,
                         resume_state, job_id, job_state, drift_classes))
 # drifts not tied to a registered row (e.g. class B unregistered, class D cache)
 registered = {sess.get('name') for sess in sessions}
 orphan_drifts = [item for item in drift.get('drifts', []) if item['name'] not in registered]
 if orphan_drifts:
    print("-" * RULE_WIDTH)
    for item in orphan_drifts:
        print(f"  [{item['class']}] {item['msg']}")
 print("=" * RULE_WIDTH)
 print(f"alive tmux: {sorted(alive)}")
 PYEOF
@@ -0,0 +1,136 @@
 ---
 name: multi-agent-mux-stop
 description: "Stop an agent tmux session (claude, antigravity/agy) and update .mam/agent-sessions.yaml. Default stops gracefully and marks status=stopped with conversation preserved for resume. Does NOT delete on-disk conversation artifacts (jsonl/db) — those are preserved unless --purge-conversation is passed. Use when ending a work session, switching to a different one, or cleaning up before a fresh start."
 version: 1.0.0
 author: godopu
 license: MIT
 platforms: [linux, macos]
 environments: [terminal, tmux]
 metadata:
  hermes:
    tags: [agent, tmux, claude, antigravity, agy, multi-agent, stop, terminate, cleanup]
    related_skills: [multi-agent-mux-create, multi-agent-mux-resume, multi-agent-mux-monitor]
    prereq_skills: [multi-agent-mux-create, multi-agent-mux-resume]
 ---
 # Multi-Agent Stop — Stop an Agent tmux Session
 > **Companion skills**: `multi-agent-mux-create` (start), `multi-agent-mux-resume` (re-attach), `multi-agent-mux-monitor` (live status).
 > **Tmux Isolation**: `stop` 명령은 YAML의 `tmux_server` 필드를 자동으로 파싱하여 해당 격리 서버의 세션을 안전하게 종료(kill)하므로, `TMUX_SERVER_NAME` 환경변수를 수동으로 지정할 필요가 없습니다.
 > **Single source of truth**: `./.mam/agent-sessions.yaml`.
 ## What this skill does
 Stop an agent's tmux session gracefully, resolve and store the conversation ID, and **mark the YAML entry (status=stopped)**. Preserves:
 - The tmux session's recorded `pane.pid / cmd / cwd / mcp_attachments` for audit
 - The agent's on-disk conversation (claude `*.jsonl`, agy `conversations/*.db`) — so the user can `multi-agent-mux-resume` later
 - The `start_command` so a future `multi-agent-mux-create --session <name>` reproduces the same tmux spec
 The stop command is always **graceful by default**:
 1. Sends exit keys to the agent TUI (`/exit` for Claude, `Exit` for Agy) and waits 3 seconds.
 2. If still alive, issues `tmux kill-session` (SIGTERM) and waits 5 seconds.
 3. If still alive, kills the pane PID via SIGKILL (`kill -9`) as a last resort.
 4. Auto-captures the conversation ID into the row (`claude_session_id_own`/`agy_conversation_id_own`) before killing, ensuring the next resume uses a race-free tier-1 lookup.
 ## Pre-flight
 ```bash
 SESSION_NAME=<workspace>-creator-<agent>  # convention
 AGENT_SESSIONS_YAML=.mam/agent-sessions.yaml
 # 1) Session is registered?
 python3 -c "
 import yaml
 d = yaml.safe_load(open('$AGENT_SESSIONS_YAML'))
 names = [s['name'] for s in d.get('tmux_sessions', [])]
 if '$SESSION_NAME' not in names:
    print('NOT in YAML — refusing to stop (no audit trail). Use multi-agent-mux-create first, or pass --force-no-yaml.')
    raise SystemExit(1)
 "
 # 2) Already stopped?
 ALREADY=$(python3 -c "
 import yaml
 d = yaml.safe_load(open('$AGENT_SESSIONS_YAML'))
 s = [x for x in d['tmux_sessions'] if x['name']=='$SESSION_NAME'][0]
 print(s.get('status', 'unknown'))
 ")
 if [ "$ALREADY" = "stopped" ]; then
  echo "Already stopped."
 fi
 ```
 ## Workflow
 ```bash
 # 1. Stop gracefully (default — captures ID, shuts down safely, status=stopped)
 bash .agents/skills/multi-agent-mux-stop/scripts/stop_session.sh \
  --session "$SESSION_NAME"
 # 2. Stop gracefully + record a custom stop reason
 bash .agents/skills/multi-agent-mux-stop/scripts/stop_session.sh \
  --session "$SESSION_NAME" --reason api_error
 # 3. Stop gracefully + clean up on-disk conversation (DANGEROUS)
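 #    — this prevents any future resume (status=terminated, resumable=false).
 #    Add --yes to confirm the purge up front; stop_session.sh documents exit
 #    code 3 when the confirmation (--yes) is missing.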
 bash .agents/skills/multi-agent-mux-stop/scripts/stop_session.sh \
  --session "$SESSION_NAME" --purge-conversation
 ```
 **Idempotency**: if the row is already `status: stopped`, the script prints `already stopped (...)` and exits 0 — re-running is a safe no-op.
 ### State machine
 ```
 running ──(stop default / --reason)────────► stopped      (resumable:true, conv preserved)
 running ──(stop --purge-conversation --yes)► terminated   (resumable:false, conv deleted)
 stopped ──(stop default … again)───────────► stopped      (idempotent no-op)
 ```
 Fields written in STOP mode: `status: stopped`, `stopped_at`, `stopped_at_epoch`, `stop_reason`, `termination_mode: graceful`, `claude_session_id_own`/`agy_conversation_id_own` and `resumable: true`.
 If `--purge-conversation` is used: `status: terminated`, `terminated_at`, `terminated_at_epoch`, `termination_mode: purge` and `resumable: false`.
 The script:
 1. Verifies the session is in agent-sessions.yaml
 2. If `delegate_job_id` is set, automatically publishes a `progress --detail "terminating"` event to the multi-agent-mux-delegate-job registry
 3. Captures the `last_visible_status` from `tmux capture-pane` (so we have a final TUI snapshot for audit)
 4. Attempts graceful exit keys → SIGTERM kill-session → SIGKILL fallback
 5. For `purge-conversation`: deletes `~/.claude/projects/.../jsonl` (claude) or `~/.gemini/antigravity-cli/conversations/...db` + `brain/...` (agy)
 6. Updates the YAML entry and SQLite database atomically
 7. If `delegate_job_id` is set, publishes a `completed` event to the multi-agent-mux-delegate-job registry
 ## Pitfalls
 - **Don't delete on-disk artifacts by default** — the agent's `*.jsonl` / `conversations/*.db` is the data that `multi-agent-mux-resume` needs. `--purge-conversation` is for when the user is genuinely done with the conversation and wants zero recovery chance.
 - **YAML is append-only until you write a stop** — if a previous run left the entry as `running` but tmux is actually dead (crash, host reboot), the YAML is stale. Running `multi-agent-mux-stop` will detect "tmux already dead, just update YAML" and proceed.
 - **Don't delete the `claude_session_id_own: null` placeholder** — when the user creates a fresh session with `multi-agent-mux-create` and never sent a message, the entry has `claude_session_id_own: null`. Stopping must preserve that field.
 - **Monitor skill may still be tracking** — if `multi-agent-mux-monitor` is running a heartbeat loop, stopping a session while it watches will trigger its `tmux ls != yaml` reconciliation. That's expected — let the monitor run, it will mark the entry as `terminated` on its own.
 ## Verification
 ```bash
 # 1. tmux gone
 tmux has-session -t "$SESSION_NAME" 2>/dev/null && echo "STILL ALIVE" || echo "OK: tmux gone"
 # 2. YAML has stopped entry
 python3 -c "
 import yaml
 d = yaml.safe_load(open('$AGENT_SESSIONS_YAML'))
 s = [x for x in d['tmux_sessions'] if x['name']=='$SESSION_NAME'][0]
 assert s['status'] == 'stopped', f'expected stopped, got {s[\"status\"]}'
 assert s.get('stopped_at'), 'missing stopped_at'
 print(f'OK: stopped at {s[\"stopped_at\"]}')
 print(f'  preserved: pane.pid={s[\"pane\"][\"pid\"]}, cmd={s[\"pane\"][\"cmd\"]}, cwd={s[\"pane\"][\"cwd\"]}')
 "
 # 3. (if --purge-conversation) disk artifacts gone
 [ -f "${CLAUDE_PROJECT_DIR:-$HOME/.claude/projects}/<projkey>/<uuid>.jsonl" ] && echo "WARN: jsonl still exists" || echo "OK: jsonl purged"
 ```
 ## When NOT to use this skill
 - **Just detaching** → `tmux detach` (Ctrl-B d) or just close the terminal. The tmux session keeps running.
 - **Stopping the agent inside but keeping tmux** → send `Ctrl-C` or `/exit` (claude) / `Ctrl-D` (agy) via `tmux send-keys`. The tmux session stays but the agent process is gone.
 - **Replacing an existing session with a new one** → `multi-agent-mux-stop` first, then `multi-agent-mux-create`.
@@ -0,0 +1,341 @@
 #!/usr/bin/env bash
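 # stop_session.sh — helper script for the multi-agent-mux-stop skill
 # Usage:
 #   bash stop_session.sh --session <name> [--agent claude|agy|hermes] \
 #       [--reason <reason>] [--purge-conversation] [--yes]
 #
 # Current behavior: every invocation is a STOP-mode stop. The conversation id is
 # captured, the agent is asked to exit gracefully, and the row transitions
 # running -> stopped (not terminated). Idempotent: if the session is already
 # stopped the script is a no-op and exits 0.
 #
 # Options:
 #   --reason R     — reason for the state transition (stop_reason). Default: manual_stop.
 #   --purge-conversation — delete only the conversation artifacts that are
 #                    *isolated to the target session's workspace* (P0-C). The
 #                    purge target is never taken from the global
 #                    agent_identities. The row becomes status=terminated and
 #                    can no longer be resumed. Requires --yes.
 #   --yes          — confirm --purge-conversation non-interactively.
 #
 # Stop extension (Option A — stop semantics absorbed into this skill instead of
 # adding a sixth skill). These used to be opt-in flags; they are now always on,
 # and passing any of the flags below is rejected with exit code 2:
 #   --capture-id   — just before the kill, record this workspace's conversation id
 #                    on the row (claude_session_id_own / agy_conversation_id_own) so
 #                    that the next resume restores via tier-1 (race-free). Reuses
 #                    find_workspace_uuid (per-row -> workspace-scoped disk scan -> cache).
 #   --graceful     — instead of an immediate kill-session, use send-keys to request a
 #                    clean exit → wait 3 s → if still alive kill-session (SIGTERM)
 #                    → wait 5 s → SIGKILL.
 #   --mode soft|hard — removed. Historically: soft marked the YAML row
 #                    status=archived and left the tmux session running (P1-A:
 #                    terminated is used only when tmux is actually dead); hard ran
 #                    tmux kill-session and set status=terminated.
 #
 # Exit codes:
 #   0 = success (or already-stopped no-op) | 1 = YAML not found / not registered
 #   2 = invalid args | 3 = interactive confirmation required (--yes missing)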
 # Strict mode: abort on a failing command, on use of an unset variable, and on a
 # failure in any stage of a pipeline.
 set -o errexit -o nounset -o pipefail
 # Load the shared skills library, located two directories above this script.
 source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
 # usage — print the command-line synopsis.
 # Globals:   none (reads $0 for the script name)
 # Arguments: none
 # Outputs:   usage text on stdout; the caller chooses the exit code.
 # The synopsis lists every agent the script handles (claude, agy, hermes) and
 # every option the parser accepts.
 usage() {
  printf '%s\n' \
    "Usage: $0 --session <name> [--agent claude|agy|hermes] [--purge-conversation] [--yes] [--reason <reason>]" \
    "Stop arguments:" \
    " --session <name> — tmux session to stop (required)" \
    " --agent <agent> — claude, agy or hermes (default: inferred from the session-name suffix)" \
    " --reason <reason> — stop_reason field (default: manual_stop)" \
    " --purge-conversation — also delete this workspace's on-disk conversation (requires --yes)" \
    " --yes — confirm --purge-conversation" \
    " (idempotent: stopping an already-stopped session is a no-op with exit 0)"
 }
 # Option defaults.
 SESSION_NAME=""
 AGENT=""
 PURGE=0
 YES=0
 # Capture-id, graceful shutdown and STOP mode are no longer selectable from the
 # command line; they are fixed to "on" here and consulted further down.
 CAPTURE_ID=1
 GRACEFUL=1
 REASON="manual_stop"
 STOP_MODE=1
 # Parse the command line. Invalid input always ends with exit code 2.
 while [ $# -gt 0 ]; do
  case "$1" in
    --session|--agent|--reason)
      # Value-taking options: without this guard a trailing "--session" would
      # dereference an unset $2 and abort with bash's "unbound variable" error
      # (set -u) instead of the documented invalid-args exit code.
      if [ $# -lt 2 ]; then
        echo "ERROR: $1 requires a value" >&2
        usage
        exit 2
      fi
      case "$1" in
        --session) SESSION_NAME="$2" ;;
        --agent)   AGENT="$2" ;;
        --reason)  REASON="$2" ;;
      esac
      shift 2
      ;;
    --purge-conversation) PURGE=1; shift ;;
    --yes)     YES=1; shift ;;
    --mode|--capture-id|--graceful)
      echo "ERROR: $1 option is deprecated. Stop now always stops gracefully and captures IDs." >&2
      exit 2
      ;;
    -h|--help) usage; exit 0 ;;
    *) echo "ERROR: unknown arg: $1" >&2; usage; exit 2 ;;
  esac
 done
 # A session name is mandatory, and the registry file must already exist.
 [ -n "$SESSION_NAME" ] || { echo "ERROR: --session required" >&2; usage; exit 2; }
 [ -f "$AGENT_SESSIONS_YAML" ] || { echo "ERROR: $AGENT_SESSIONS_YAML not found" >&2; exit 1; }
 # Export the tmux server name returned by resolve_tmux_server for this session.
 # NOTE(review): `export VAR="$(cmd)"` hides a non-zero status from
 # resolve_tmux_server; kept as-is so a failing lookup does not abort the stop.
 export TMUX_SERVER_NAME="$(resolve_tmux_server "$SESSION_NAME")"
 # When --agent is not given, fall back to the session-name suffix (P1-F).
 if [ -z "$AGENT" ]; then
  case "$SESSION_NAME" in
    *-creator-claude) AGENT=claude ;;
    *-creator-agy)    AGENT=agy ;;
    *-creator-hermes) AGENT=hermes ;;
    *) echo "ERROR: cannot infer agent from '$SESSION_NAME'; pass --agent" >&2; exit 2 ;;
  esac
 fi
 # Check that the session is registered and extract its row's workspace cwd and
 # delegate_job_id.
 # The result is emitted as JSON, so a cwd containing '|' is safe (review item 7;
 # replaces the former cwd|jid parser).
 #
 # Lookup order inside the embedded Python:
 #   1. If a SQLite file with the YAML's base name and a .db extension exists,
 #      query its `sessions` table by name; on sqlite3.OperationalError fall
 #      through to the JSON blob stored in `state` (id=1).
 #   2. Otherwise, if the YAML file exists, load it with yaml.safe_load.
 #   3. Scan the loaded document's tmux_sessions list for the name.
 # Any other exception while loading is swallowed and treated as "no data".
 # The Python exits 0 after printing the JSON, or 7 when the session is not
 # found; any non-zero exit is reported below as "not in <yaml>" with exit 1.
 # NOTE(review): YAML_PATH is read from the environment; presumably env_python
 # sets it from its first argument — confirm in lib.sh.
 MAPPED_DATA=$(env_python "$AGENT_SESSIONS_YAML" SESSION_NAME="$SESSION_NAME" <<'PYEOF'
 import os, sys, json, yaml, sqlite3
 name = os.environ['SESSION_NAME']
 yaml_path = os.environ['YAML_PATH']
 db_path = os.path.splitext(yaml_path)[0] + '.db'
 d = {}
 try:
    if os.path.exists(db_path):
        conn = sqlite3.connect(db_path, timeout=10.0)
        try:
            row = conn.execute('SELECT data FROM sessions WHERE name=?', (name,)).fetchone()
            if row:
                s = json.loads(row[0])
                cwd = (s.get('pane') or {}).get('cwd', '')
                jid = s.get('delegate_job_id', '') or ''
                print(json.dumps({"cwd": cwd, "job_id": jid}))
                raise SystemExit(0)
        except sqlite3.OperationalError:
            pass
        row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
        if row:
            d = json.loads(row[0])
        conn.close()
    elif os.path.exists(yaml_path):
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
 except Exception:
    pass
 for s in d.get('tmux_sessions', []):
    if s.get('name') == name:
        cwd = (s.get('pane') or {}).get('cwd', '')
        jid = s.get('delegate_job_id', '') or ''
        print(json.dumps({"cwd": cwd, "job_id": jid}))
        raise SystemExit(0)
 raise SystemExit(7)
 PYEOF
 ) || {
  echo "ERROR: session '$SESSION_NAME' not in $AGENT_SESSIONS_YAML" >&2
  exit 1
 }
 # Split the JSON record into the two shell variables used by the rest of the script.
 TARGET_CWD=$(printf '%s' "$MAPPED_DATA" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("cwd",""))')
 DELEGATE_JOB_ID=$(printf '%s' "$MAPPED_DATA" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("job_id",""))')
 # Idempotency: in STOP mode a session that is already stopped is a no-op, exit 0.
 if [[ $STOP_MODE == 1 ]] && STOPPED_INFO=$(is_already_stopped "$SESSION_NAME"); then
  echo "already stopped (status=stopped, $STOPPED_INFO) — no-op"
  exit 0
 fi
 # Purge confirmation: refuse to purge unless --yes was given.
 if [[ $PURGE == 1 && $YES != 1 ]]; then
  printf '%s\n' \
    "DANGER: --purge-conversation will DELETE this workspace's on-disk conversation." \
    "  workspace: ${TARGET_CWD:-<unknown>}" \
    "  This means: no future multi-agent-mux-resume for this session." \
    "  Re-run with --yes to confirm."
  exit 3
 fi
 # Resolve the UUID to purge in a workspace-scoped way (P0-C — no global lookup).
 PURGE_UUID=""
 if [[ $PURGE == 1 && -n $TARGET_CWD ]]; then
  PURGE_UUID=$(find_workspace_uuid "$TARGET_CWD" "$AGENT" || true)
 fi
 # Timestamps recorded on the row: UTC ISO-8601 plus epoch seconds.
 NOW_ISO=$(date -u +'%Y-%m-%dT%H:%M:%SZ')
 NOW_EPOCH=$(date +%s)
 # tmux liveness plus a last TUI snapshot (only while alive; the captured pane
 # text is handed on exclusively through the environment).
 TMUX_ALIVE=0
 LAST_STATUS=""
 if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
  TMUX_ALIVE=1
  # Last lines of the pane, flattened to one line and capped at 500 bytes.
  LAST_STATUS=$(tmux capture-pane -t "$SESSION_NAME" -p -S -10 2>/dev/null | tr '\n' ' ' | head -c 500 || true)
 fi
 # Capture the conversation id right before the kill, while the process and its
 # jsonl may still be live. find_workspace_uuid tries tier-1 (row) -> tier-2
 # (workspace-scoped disk scan) -> tier-3 (cache) on its own, so this works
 # whether or not tmux is alive.
 CAPTURED_UUID=""
 if [[ $CAPTURE_ID == 1 && -n $TARGET_CWD ]]; then
  CAPTURED_UUID=$(capture_conversation_id "$AGENT" "$TARGET_CWD" || true)
  if [[ -z $CAPTURED_UUID ]]; then
    echo "WARN: --capture-id requested but no conversation id resolved (nothing on disk yet)"
  else
    echo "captured conversation id: $CAPTURED_UUID"
  fi
 fi
 # Announce the start of termination on the delegate job channel.
 delegate_publish_event "$DELEGATE_JOB_ID" progress "terminating"
 # graceful_stop — ask the agent to quit via send-keys, then escalate.
 # Globals:   SESSION_NAME (read), AGENT (read)
 # Arguments: none
 # Outputs:   progress lines on stdout
 # Returns:   0 on every path
 # Sequence: send the exit command and wait 3 s; if the session still exists,
 # run tmux kill-session and wait 5 s; if it still exists, kill -9 the pane pid
 # that was recorded before anything was sent.
 graceful_stop() {
  local first_pane_pid quit_cmd
  first_pane_pid=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}' 2>/dev/null | head -1 || true)
  # Only agy is sent a bare "Exit"; every other agent gets "/exit".
  if [[ $AGENT == agy ]]; then
    quit_cmd="Exit"
  else
    quit_cmd="/exit"
  fi
  echo "graceful: send-keys '$quit_cmd' to $SESSION_NAME"
  tmux send-keys -t "$SESSION_NAME" "$quit_cmd" Enter 2>/dev/null || true
  sleep 3
  if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
    echo "graceful: still alive → kill-session (SIGTERM)"
    tmux kill-session -t "$SESSION_NAME" 2>/dev/null || true
    sleep 5
    if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
      echo "graceful: STILL alive → SIGKILL fallback (pane pid $first_pane_pid)"
      # Last resort; skipped when no pane pid could be read up front.
      if [[ -n $first_pane_pid ]]; then
        kill -9 "$first_pane_pid" 2>/dev/null || true
      fi
    else
      echo "graceful: terminated after kill-session"
    fi
  else
    echo "graceful: exited cleanly"
  fi
 }
 # Terminate tmux: nothing to do when the session is already gone; otherwise use
 # the graceful fallback chain when enabled, or the plain hard kill.
 if [[ $TMUX_ALIVE != 1 ]]; then
  echo "tmux already dead, just updating YAML"
 elif [[ $GRACEFUL == 1 ]]; then
  graceful_stop
 else
  tmux kill-session -t "$SESSION_NAME"
  echo "killed tmux: $SESSION_NAME"
 fi
 # Persist the state transition (and the optional purge) through atomic_dump_yaml.
 # All inputs reach the embedded Python as environment variables.
 # NOTE(review): the Python below uses `os` and `d` without defining them;
 # presumably atomic_dump_yaml's preamble imports os and loads the document
 # into `d` — confirm in lib.sh.
 atomic_dump_yaml "$AGENT_SESSIONS_YAML" \
  SESSION_NAME="$SESSION_NAME" AGENT="$AGENT" PURGE="$PURGE" \
  NOW_ISO="$NOW_ISO" NOW_EPOCH="$NOW_EPOCH" LAST_STATUS="$LAST_STATUS" \
  PURGE_UUID="$PURGE_UUID" TARGET_CWD="$TARGET_CWD" \
  REASON="$REASON" CAPTURED_UUID="$CAPTURED_UUID" <<'PYEOF'
 import shutil
 name = os.environ['SESSION_NAME']
 agent = os.environ['AGENT']
 purge = os.environ['PURGE'] == '1'
 now = os.environ['NOW_ISO']
 now_epoch = int(os.environ['NOW_EPOCH'])
 home = os.environ['HOME_DIR']
 # LAST_STATUS was cut with `head -c 500`, i.e. by bytes, so it can end in the
 # middle of a multi-byte UTF-8 character. Recover the raw bytes and drop any
 # undecodable fragment so the stored value is always valid text.
 last_status = os.fsencode(os.environ.get('LAST_STATUS', '')).decode('utf-8', 'ignore')
 purge_uuid = os.environ.get('PURGE_UUID', '').strip()
 ws = os.environ.get('TARGET_CWD', '')
 reason = os.environ.get('REASON', '') or 'manual_stop'
 captured = os.environ.get('CAPTURED_UUID', '').strip()
 # Locate the row for this session.
 target = None
 for s in d.get('tmux_sessions', []):
    if s.get('name') == name:
        target = s
        break
 if target is None:
    print(f"ERROR: disappeared during script: {name}", flush=True)
    raise SystemExit(1)
 # State transition: a purge ends in 'terminated', a plain stop in 'stopped'.
 if purge:
    target['status'] = 'terminated'
    target['terminated_at'] = now
    target['terminated_at_epoch'] = now_epoch
    target['termination_mode'] = 'purge'
 else:
    target['status'] = 'stopped'
    target['stopped_at'] = now
    target['stopped_at_epoch'] = now_epoch
    target['stop_reason'] = reason
    target['termination_mode'] = 'graceful'
 if last_status:
    target['last_visible_status_at_termination'] = last_status
 # Capture-id: always record the captured UUID on the row (only when not purging).
 if captured and not purge:
    if agent == 'claude':
        target['claude_session_id_own'] = captured
    elif agent == 'agy':
        target['agy_conversation_id_own'] = captured
    elif agent == 'hermes':
        target['hermes_conversation_id_own'] = captured
    target['resumable'] = True
 # --purge-conversation: delete only the on-disk artifacts of the
 # workspace-scoped UUID (P0-C).
 if purge and purge_uuid:
    if agent == 'claude':
        # NOTE(review): only '/' and '_' are mapped to '-'; confirm this matches
        # the project-directory naming used when the conversation was written.
        key = ws.replace('/', '-').replace('_', '-')
        claude_project_dir = os.environ.get('CLAUDE_PROJECT_DIR', f"{home}/.claude/projects")
        jsonl = f"{claude_project_dir}/{key}/{purge_uuid}.jsonl"
        if os.path.exists(jsonl):
            os.remove(jsonl)
            print(f"purged: {jsonl}", flush=True)
        target['claude_session_id_own'] = None
    elif agent == 'agy':
        db = f"{home}/.gemini/antigravity-cli/conversations/{purge_uuid}.db"
        if os.path.exists(db):
            os.remove(db)
            print(f"purged: {db}", flush=True)
        brain = f"{home}/.gemini/antigravity-cli/brain/{purge_uuid}"
        if os.path.isdir(brain):
            shutil.rmtree(brain)
            print(f"purged: {brain}", flush=True)
        target['agy_conversation_id_own'] = None
    elif agent == 'hermes':
        json_file = f"{home}/.mam/sessions/session_{purge_uuid}.json"
        if os.path.exists(json_file):
            os.remove(json_file)
            print(f"purged: {json_file}", flush=True)
        hdb = f"{home}/.mam/state.db"
        if os.path.exists(hdb):
            conn = None
            try:
                import sqlite3
                conn = sqlite3.connect(hdb)
                # `with conn` commits both deletes together, or rolls both back
                # if either statement fails.
                with conn:
                    conn.execute("DELETE FROM sessions WHERE id=?", (purge_uuid,))
                    conn.execute("DELETE FROM messages WHERE session_id=?", (purge_uuid,))
                print(f"purged db records for session: {purge_uuid}", flush=True)
            except Exception as e:
                # Best effort: a failed DB purge must not abort the YAML update.
                print(f"WARN: purge hermes db records failed: {e}", flush=True)
            finally:
                # Close the handle on the failure path as well.
                if conn is not None:
                    conn.close()
        target['hermes_conversation_id_own'] = None
    # agent_identities is a cache — clear it only when it belongs to this workspace.
    ai = (d.get('agent_identities') or {}).get(agent) or {}
    if ai.get('project_cwd') == ws:
        if agent == 'claude' and ai.get('session_id') == purge_uuid:
            ai['session_id'] = None
            ai['session_jsonl'] = None
            ai.pop('session_size_bytes', None)
            ai.pop('session_lines', None)
        elif agent == 'agy' and ai.get('conversation_id') == purge_uuid:
            ai['conversation_id'] = None
            ai['conversation_db'] = None
            ai['conversation_brain_dir'] = None
        elif agent == 'hermes' and ai.get('session_id') == purge_uuid:
            ai['session_id'] = None
 elif purge and not purge_uuid:
    print("WARN: --purge-conversation requested but no workspace-scoped UUID resolved; nothing purged", flush=True)
 # A purged session is marked non-resumable, whether or not a UUID was resolved.
 if purge:
    target['resumable'] = False
 print(f"updated: {name} status={target['status']}", flush=True)
 PYEOF
 # Report completion on the delegate job channel, then print a summary.
 delegate_publish_event "$DELEGATE_JOB_ID" completed "session terminated"
 printf '%s\n' \
  "" \
  "=== stop complete ===" \
  "  session:  $SESSION_NAME" \
  "  agent:    $AGENT" \
  "  reason:   $REASON" \
  "  captured: ${CAPTURED_UUID:-<none>}" \
  "  purge:    $PURGE${PURGE_UUID:+ (uuid $PURGE_UUID)}" \
  "  time:     $NOW_ISO" \
  "" \
  "Recovery: multi-agent-mux-create + multi-agent-mux-resume 로 동일 컨텍스트 복원 가능" \
  "  (단 --purge-conversation 사용 시 복원 불가)"
@@ -0,0 +1 @@
 tmux_sessions: []
@@ -0,0 +1,33 @@
 .agents/skills/multi-agent-mux-stop/scripts/stop_session.sh
 .agents/skills/multi-agent-mux-stop/SKILL.md
 .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh
 .agents/skills/multi-agent-mux-monitor/SKILL.md
 .agents/skills/multi-agent-mux-delegate-job/mqtt-broker-setup.md
 .agents/skills/multi-agent-mux-delegate-job/requirements.txt
 .agents/skills/multi-agent-mux-delegate-job/multi-agent-mux-delegate-job
 .agents/skills/multi-agent-mux-delegate-job/README.md
 .agents/skills/multi-agent-mux-delegate-job/scripts/publish_event.py
 .agents/skills/multi-agent-mux-delegate-job/scripts/registry.py
 .agents/skills/multi-agent-mux-delegate-job/scripts/mqtt_common.py
 .agents/skills/multi-agent-mux-delegate-job/scripts/job_subscriber.py
 .agents/skills/multi-agent-mux-delegate-job/job-protocol.md
 .agents/skills/multi-agent-mux-delegate-job/SKILL.md
 .agents/skills/multi-agent-mux-delegate-job/registry.md
 .agents/skills/multi-agent-mux-create/scripts/create_session.sh
 .agents/skills/multi-agent-mux-create/SKILL.md
 .agents/skills/lib.sh
 .agents/skills/multi-agent-mux-resume/scripts/resolve_session_id.sh
 .agents/skills/multi-agent-mux-resume/scripts/update_yaml_resumed.sh
 .agents/skills/multi-agent-mux-resume/SKILL.md
 .agents/skills/multi-agent-mux-status/scripts/status.sh
 .agents/skills/multi-agent-mux-status/SKILL.md
 AGENT.md
 AGENT.ko.md
 MESSAGING.md
 BOOTSTRAP.md
 BOOTSTRAP.ko.md
 INSTRUCTION.md
 remove.sh
 update.sh
 .env.example
 .env