mam 설치

This commit is contained in:
2026-06-25 12:19:24 +09:00
parent 06a95a6d5b
commit b76249a2a6
25 changed files with 5780 additions and 0 deletions
+787
View File
@@ -0,0 +1,787 @@
#!/usr/bin/env bash
# lib.sh — shared library for the multi-agent-mux-* skills.
#
# Single source of truth for the four things that were inconsistently
# re-implemented across create/resume/delete/monitor (REVIEW.md §4.1):
# - derive_session_name : the tmux session slug (P0-A)
# - atomic_dump_yaml : SQLite db transaction + temp+rename + .bak + validate (P0-B)
# - env_python : env-safe Python (no heredoc injection) (P0-B / P1-B)
# - find_workspace_uuid : workspace-SCOPED resume id lookup (P0-C)
#
# Source it from each script with a path computed from the script location:
# source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
#
# HARD RULE: the agent-sessions.yaml file is only ever written through
# atomic_dump_yaml. Never `open(yaml_path, 'w')` anywhere else.
# Directory holding this library, and the workspace root two levels above it.
SKILL_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
WORKSPACE_ROOT="$(cd "${SKILL_DIR}/../.." && pwd)"

# Location of the session registry; a non-empty AGENT_SESSIONS_YAML already
# present in the caller's environment wins over the workspace default.
: "${AGENT_SESSIONS_YAML:=${WORKSPACE_ROOT}/.mam/agent-sessions.yaml}"

# Workspace-relative defaults with environment overrides (Phase Z).
# NOTE(review): HOME_DIR defaults to the workspace root while the two paths
# below default under $HOME — confirm this asymmetry is intended.
: "${HOME_DIR:=${WORKSPACE_ROOT}}"
: "${CLAUDE_PROJECT_DIR:=${HOME}/.claude/projects}"
: "${LOCAL_BIN:=${HOME}/.local/bin}"
# ---------------------------------------------------------------------------
# Tmux Server Isolation support
# ---------------------------------------------------------------------------
# Substrings that identify shim/wrapper directories on PATH; directories whose
# path contains one of them are skipped when resolving the real tmux binary.
: "${_TMUX_SHIM_DIR_PATTERN:=/multi-agent-tmux-shim/}"
: "${_TMUX_SKILLS_BIN_PATTERN:=/.agents/skills/.bin}"
# Name of the tmux server to talk to; "default" means tmux is invoked without -L.
: "${TMUX_SERVER_NAME:=default}"
# _resolve_real_tmux_path
#
# Locate the real tmux executable and cache it in the exported variable
# _REAL_TMUX_PATH.
#
# A cached value is reused unless it is empty or points into a shim/wrapper
# directory (a path containing _TMUX_SHIM_DIR_PATTERN or
# _TMUX_SKILLS_BIN_PATTERN). Otherwise PATH is scanned left to right, skipping
# such directories, and the first executable regular file named "tmux" wins.
# When nothing is found the bare name "tmux" is stored.
#
# PATH is split with parameter expansion instead of an unquoted $PATH, so
# entries containing glob characters are taken literally rather than being
# pathname-expanded into unrelated directories.
_resolve_real_tmux_path() {
  if [ -n "${_REAL_TMUX_PATH:-}" ] \
    && [[ "$_REAL_TMUX_PATH" != *"${_TMUX_SHIM_DIR_PATTERN}"* ]] \
    && [[ "$_REAL_TMUX_PATH" != *"${_TMUX_SKILLS_BIN_PATTERN}"* ]]; then
    return 0
  fi

  local remaining="$PATH" dir
  _REAL_TMUX_PATH=""
  while [ -n "$remaining" ]; do
    # Peel off the leading PATH entry (everything before the first ':').
    dir="${remaining%%:*}"
    if [[ "$remaining" == *:* ]]; then
      remaining="${remaining#*:}"
    else
      remaining=""
    fi
    if [[ "$dir" == *"${_TMUX_SHIM_DIR_PATTERN}"* ]] \
      || [[ "$dir" == *"${_TMUX_SKILLS_BIN_PATTERN}"* ]]; then
      continue
    fi
    # -f excludes a directory that merely happens to be called "tmux".
    if [ -f "$dir/tmux" ] && [ -x "$dir/tmux" ]; then
      _REAL_TMUX_PATH="$dir/tmux"
      break
    fi
  done

  if [ -z "$_REAL_TMUX_PATH" ]; then
    _REAL_TMUX_PATH="tmux"
  fi
  export _REAL_TMUX_PATH
}
# _init_tmux_isolation
#
# Bring PATH in line with the current TMUX_SERVER_NAME.
#   - Isolation on (name set and not "default"): make sure a wrapper script
#     named "tmux" exists under
#     ${TMPDIR:-/tmp}${_TMUX_SHIM_DIR_PATTERN}${TMUX_SERVER_NAME} and that this
#     directory is the first PATH entry. The wrapper execs the real tmux,
#     adding -L "$TMUX_SERVER_NAME" unless that variable is empty or "default"
#     when the wrapper runs. Nothing is rewritten if the directory is already
#     on PATH.
#   - Isolation off: drop every PATH entry that contains the shim or
#     skills-bin pattern.
_init_tmux_isolation() {
  _resolve_real_tmux_path

  local entry saved_ifs="$IFS"

  if [ -z "${TMUX_SERVER_NAME:-}" ] || [ "$TMUX_SERVER_NAME" = "default" ]; then
    # Isolation disabled: automatically clean up the shim (remove it from PATH).
    local kept=""
    IFS=:
    for entry in $PATH; do
      if [[ "$entry" == *"${_TMUX_SHIM_DIR_PATTERN}"* ]] \
        || [[ "$entry" == *"${_TMUX_SKILLS_BIN_PATTERN}"* ]]; then
        continue
      fi
      kept="${kept:+$kept:}$entry"
    done
    IFS="$saved_ifs"
    export PATH="$kept"
    return 0
  fi

  local shim_dir="${TMPDIR:-/tmp}${_TMUX_SHIM_DIR_PATTERN}${TMUX_SERVER_NAME}"
  if [[ ":$PATH:" == *":$shim_dir:"* ]]; then
    return 0
  fi

  mkdir -p "$shim_dir"
  # The real tmux path is baked in now; TMUX_SERVER_NAME and "$@" are left for
  # the wrapper to expand at run time.
  printf '%s\n' \
    '#!/usr/bin/env bash' \
    'if [ -z "${TMUX_SERVER_NAME:-}" ] || [ "$TMUX_SERVER_NAME" = "default" ]; then' \
    "exec \"$_REAL_TMUX_PATH\" \"\$@\"" \
    'else' \
    "exec \"$_REAL_TMUX_PATH\" -L \"\$TMUX_SERVER_NAME\" \"\$@\"" \
    'fi' > "$shim_dir/tmux"
  chmod +x "$shim_dir/tmux"
  export PATH="$shim_dir:$PATH"
}
# _tmux <tmux args...>
#
# Run the real tmux binary against the configured server. After refreshing
# the isolation setup, tmux is invoked with -L "$TMUX_SERVER_NAME" unless that
# variable is empty or "default". The exit status is tmux's own.
#
# `command` is required here: when no tmux binary was found,
# _REAL_TMUX_PATH holds the bare name "tmux", which would otherwise resolve to
# the tmux() shell function below and recurse without end.
_tmux() {
  _init_tmux_isolation
  if [ -z "${TMUX_SERVER_NAME:-}" ] || [ "$TMUX_SERVER_NAME" = "default" ]; then
    command "$_REAL_TMUX_PATH" "$@"
  else
    command "$_REAL_TMUX_PATH" -L "$TMUX_SERVER_NAME" "$@"
  fi
}

# tmux <tmux args...>
#
# Shadows the tmux command inside any shell that sources this library, so
# plain `tmux ...` calls are routed through _tmux.
tmux() {
  _tmux "$@"
}
# ---------------------------------------------------------------------------
# resolve_tmux_server <session_name>
#
# Print the tmux server name recorded for a session.
#
# Lookup order:
#   1. the `sessions` table of the SQLite database that sits next to
#      agent-sessions.yaml (same path, `.db` extension), by name;
#   2. the `tmux_sessions` list stored in that database's `state` row;
#   3. the YAML file itself, but only when no database file exists.
# If no non-empty `tmux_server` value is found, or any lookup raises, the
# function prints TMUX_SERVER_NAME, or "default" when that is empty/unset.
#
# TMUX_SERVER_NAME is handed to Python explicitly because lib.sh sets it as a
# plain shell variable; without this the fallback would only see it when the
# caller happened to export it.
# ---------------------------------------------------------------------------
resolve_tmux_server() {
  local session_name="$1"
  SESSION_NAME="$session_name" TMUX_SERVER_NAME="${TMUX_SERVER_NAME:-default}" \
    env_python "$AGENT_SESSIONS_YAML" <<'PYEOF'
import json
import os
import sqlite3
import sys

name = os.environ['SESSION_NAME']
yaml_path = os.environ['YAML_PATH']
db_path = os.path.splitext(yaml_path)[0] + '.db'


def emit_server(session):
    """Print the session's tmux_server and exit 0 when it is set."""
    server = session.get('tmux_server')
    if server:
        print(server)
        sys.exit(0)


def scan(entries):
    """Emit the server of every list entry whose name matches."""
    for s in entries or []:
        if isinstance(s, dict) and s.get('name') == name:
            emit_server(s)


try:
    if os.path.exists(db_path):
        conn = sqlite3.connect(db_path, timeout=10.0)
        try:
            try:
                row = conn.execute('SELECT data FROM sessions WHERE name=?', (name,)).fetchone()
                if row:
                    emit_server(json.loads(row[0]))
            except sqlite3.OperationalError:
                # No usable sessions table; fall through to the state row.
                pass
            row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
            if row:
                scan(json.loads(row[0]).get('tmux_sessions'))
        finally:
            conn.close()
    elif os.path.exists(yaml_path):
        # Imported lazily: PyYAML is only needed on this fallback path, and an
        # ImportError here degrades to the default instead of a traceback.
        import yaml
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
        scan(d.get('tmux_sessions'))
except Exception:
    pass
# Fallback
print(os.environ.get('TMUX_SERVER_NAME') or 'default')
PYEOF
}
# ---------------------------------------------------------------------------
# derive_session_name <workspace> <agent>
#
# THE single source of truth for the tmux session name. Rule:
#   slug = the two trailing path components of the absolute workspace,
#          joined with '-', lowercased, '_' -> '-', and stripped of every
#          character outside [a-zA-Z0-9-]
#   name = "<slug>-creator-<agent>"
#
# The workspace is resolved to an absolute path when it can be entered;
# otherwise the argument is used as given. A parent component that is empty,
# "/" or "." becomes "workspace"; a last component that is empty, "/" or "."
# becomes "root". Example:
#   $WORKSPACE_ROOT/landing_page/refer_landing_page + claude
#     -> landing-page-refer-landing-page-creator-claude
#
# Decision (REVIEW P0-A): the actual workspace basename (refer_landing_page)
# IS included. The hand-written historical entry that dropped it
# (lab-landing-page-creator-claude) was the bug, not the convention.
# Every script and SKILL.md must use exactly this rule.
# ---------------------------------------------------------------------------
derive_session_name() {
  local workspace="$1" agent="$2"
  local resolved parent_name leaf_name

  if ! resolved="$(cd "$workspace" 2>/dev/null && pwd)"; then
    resolved="$workspace"
  fi
  parent_name="$(basename "$(dirname "$resolved")" 2>/dev/null || echo "")"
  leaf_name="$(basename "$resolved" 2>/dev/null || echo "root")"

  case "$parent_name" in
    '' | / | .) parent_name="workspace" ;;
  esac
  case "$leaf_name" in
    '' | / | .) leaf_name="root" ;;
  esac

  printf '%s-creator-%s' \
    "$(printf '%s-%s' "$parent_name" "$leaf_name" \
      | tr '[:upper:]' '[:lower:]' \
      | tr '_' '-' \
      | tr -cd 'a-zA-Z0-9-')" \
    "$agent"
}
# ---------------------------------------------------------------------------
# env_python <yaml_path> [KEY=VALUE ...] (Python source read from stdin)
#
# Run python3 with the source supplied on stdin via a *quoted* heredoc, so the
# shell never interpolates the source. All values are passed through the
# environment (YAML_PATH plus any KEY=VALUE pairs). Untrusted data (workspace
# paths, capture-pane text) must travel as env vars and be read via os.environ
# inside the script — never spliced into the source. Read-only by convention;
# use atomic_dump_yaml when you need to write the YAML.
# ---------------------------------------------------------------------------
# _validate_env_key <name>
#
# Return 0 when <name> may be forwarded as an environment variable, 1 (with a
# message on stderr) otherwise. A name must look like a shell identifier and
# must not be one of the loader/interpreter variables listed below.
_validate_env_key() {
  local key="$1"
  local identifier_re='^[a-zA-Z_][a-zA-Z0-9_]*$'
  local blocked

  if ! [[ "$key" =~ $identifier_re ]]; then
    echo "ERROR: Invalid environment variable name: $key" >&2
    return 1
  fi

  for blocked in LD_PRELOAD LD_LIBRARY_PATH PYTHONPATH PYTHONHOME PYTHONINSPECT PYTHONSTARTUP; do
    if [ "$key" = "$blocked" ]; then
      echo "ERROR: Blocked environment variable: $key" >&2
      return 1
    fi
  done
  return 0
}
# env_python <yaml_path> [KEY=VALUE ...] [python args ...]
#
# Run `python3 -` (program text on stdin) with YAML_PATH, HOME_DIR,
# CLAUDE_PROJECT_DIR and LOCAL_BIN in its environment. Leading KEY=VALUE
# arguments are checked with _validate_env_key and added to that environment;
# the first argument without '=' and everything after it are passed to the
# Python program unchanged. Returns 1 without starting Python when a key is
# rejected; otherwise returns Python's exit status.
env_python() {
  local yaml_path="$1"
  shift

  local -a forwarded=(
    "YAML_PATH=$yaml_path"
    "HOME_DIR=$HOME_DIR"
    "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR"
    "LOCAL_BIN=$LOCAL_BIN"
  )

  local pair
  while [ $# -gt 0 ] && [[ "$1" == *=* ]]; do
    pair="$1"
    _validate_env_key "${pair%%=*}" || return 1
    forwarded+=("$pair")
    shift
  done

  env "${forwarded[@]}" python3 - "$@"
}
# ---------------------------------------------------------------------------
# atomic_dump_yaml <yaml_path> [KEY=VALUE ...] (mutation source from stdin)
#
# The ONLY sanctioned way to write agent-sessions.yaml. It:
# 1. takes an exclusive SQLite BEGIN IMMEDIATE transaction lock on
# agent-sessions.db (serialises all writers)
# 2. loads the current state into `d` (seeds from YAML if DB is empty)
# 3. exec()s the caller's mutation source (sees d, yaml, os, datetime,
# timezone, glob, subprocess; reads values via os.environ). The mutation
# may print and may `raise SystemExit(n)` to abort *without* writing.
# 4. validates the resulting schema
# 5. commits the new state to SQLite, then, only when the set of sessions in
# a finished state (stopped/terminated/archived) changed, backs up the YAML
# to <yaml_path>.bak and rewrites it atomically (temp + os.replace).
#
# The mutation source is passed via env and exec()'d — it is never string
# spliced and untrusted data never lands in Python source (P0-B / P1-B).
# ---------------------------------------------------------------------------
# _check_is_nfs <path>
#
# Return 0 when <path> lives on a network filesystem (NFS, CIFS/SMB3 or
# SSHFS), 1 otherwise. SQLite locking behaves differently on such filesystems,
# so atomic_dump_yaml uses the answer to pick the journal mode.
#
# The filesystem type is asked from df for the path itself. Matching the mount
# point as an unanchored regex against `mount` output is unreliable: a mount
# point of "/" matches every line that mentions nfs anywhere after a slash.
# `df --output` is a GNU coreutils option; where it is unavailable or df
# fails, the path is reported as not-network (return 1).
_check_is_nfs() {
  local f="$1"
  local fstype
  fstype="$(df --output=fstype -- "$f" 2>/dev/null | tail -n 1)"
  case "$fstype" in
    nfs | nfs[0-9]* | cifs | smb3 | fuse.sshfs)
      return 0 # network filesystem
      ;;
  esac
  return 1 # local filesystem, or type unknown
}
atomic_dump_yaml() {
# Usage: atomic_dump_yaml <yaml_path> [KEY=VALUE ...]   (mutation source on stdin)
#
# Shell side: probes for a network filesystem once, assembles the environment
# for the Python driver, reads the caller's mutation source from stdin and
# runs the embedded driver with python3. Returns 1 if a KEY=VALUE name is
# rejected by _validate_env_key; otherwise the function's status is that of
# the python3 process.
local yaml_path="$1"; shift
# MAM_IS_NFS doubles as a cache: the probe runs only while it is empty/unset,
# and the result is exported so child processes inherit it.
if [ -z "${MAM_IS_NFS:-}" ]; then
if _check_is_nfs "$(dirname "$yaml_path")"; then
export MAM_IS_NFS="true"
echo "WARNING: $(dirname "$yaml_path") appears to be a network filesystem (NFS/CIFS/SSHFS)." >&2
echo "WARNING: SQLite journal_mode automatically falls back to DELETE." >&2
else
export MAM_IS_NFS="false"
fi
fi
# Variables always handed to the driver; validated leading KEY=VALUE arguments
# are appended, and parsing stops at the first argument without '='.
local -a envs=("YAML_PATH=$yaml_path" "HOME_DIR=$HOME_DIR" "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR" "LOCAL_BIN=$LOCAL_BIN" "MAM_IS_NFS=$MAM_IS_NFS")
while [ $# -gt 0 ]; do
case "$1" in
*=*)
local key="${1%%=*}"
_validate_env_key "$key" || return 1
envs+=("$1")
shift
;;
*)
break
;;
esac
done
# stdin carries the mutation source; it is captured here because the driver
# below occupies python3's stdin, and is forwarded via AGENT_SESSIONS_MUTATION.
local mutation; mutation="$(cat)"
env "${envs[@]}" AGENT_SESSIONS_MUTATION="$mutation" python3 - <<'PYEOF'
import os, sys, tempfile, shutil, glob, subprocess, json, sqlite3
from datetime import datetime, timezone
import yaml
yaml_path = os.environ['YAML_PATH']
db_path = os.path.splitext(yaml_path)[0] + '.db'
def _validate(d):
if not isinstance(d, dict):
raise SystemExit("VALIDATE: top-level is not a mapping")
sessions = d.get('tmux_sessions', [])
if not isinstance(sessions, list):
raise SystemExit("VALIDATE: tmux_sessions is not a list")
valid = {'running', 'terminated', 'archived', 'stopped'}
for i, s in enumerate(sessions):
if not isinstance(s, dict):
raise SystemExit(f"VALIDATE: tmux_sessions[{i}] not a mapping")
if not s.get('name') or not s.get('status'):
raise SystemExit(f"VALIDATE: tmux_sessions[{i}] missing name/status")
if s['status'] not in valid:
raise SystemExit(f"VALIDATE: tmux_sessions[{i}] {s.get('name')!r} bad status {s['status']!r}")
if not isinstance(s.get('pane'), dict):
raise SystemExit(f"VALIDATE: tmux_sessions[{i}] {s.get('name')!r} missing pane")
def get_terminal_set(d):
return {s.get('name'): s.get('status') for s in d.get('tmux_sessions', []) if s.get('status') in ('stopped', 'terminated', 'archived')}
os.makedirs(os.path.dirname(db_path) or '.', exist_ok=True)
conn = sqlite3.connect(db_path, timeout=60.0)
for f in [db_path, db_path + '-wal', db_path + '-shm']:
if os.path.exists(f):
try:
os.chmod(f, 0o600)
except Exception:
pass
is_nfs = os.environ.get('MAM_IS_NFS') == 'true'
if is_nfs:
conn.execute('PRAGMA journal_mode=DELETE')
else:
conn.execute('PRAGMA journal_mode=WAL')
try:
# Disable auto-commit by explicitly starting a transaction with BEGIN IMMEDIATE
# This prevents the read-modify-write lost update race condition.
conn.execute('BEGIN IMMEDIATE')
conn.execute('CREATE TABLE IF NOT EXISTS state (id INTEGER PRIMARY KEY, data TEXT)')
conn.execute('CREATE TABLE IF NOT EXISTS sessions (name TEXT PRIMARY KEY, status TEXT, pane_cwd TEXT, data JSON)')
conn.execute('CREATE INDEX IF NOT EXISTS idx_sessions_pane_cwd ON sessions(pane_cwd)')
row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
if row:
d = json.loads(row[0])
else:
# Seed from YAML
if os.path.exists(yaml_path):
with open(yaml_path) as f:
d = yaml.safe_load(f) or {}
else:
d = {}
# Assemble d['tmux_sessions'] from sessions table if table contains data
db_sessions = []
cursor = conn.execute('SELECT name, status, pane_cwd, data FROM sessions')
for s_row in cursor.fetchall():
s_data = json.loads(s_row[3])
s_data['name'] = s_row[0]
s_data['status'] = s_row[1]
if 'pane' not in s_data:
s_data['pane'] = {}
s_data['pane']['cwd'] = s_row[2]
db_sessions.append(s_data)
if db_sessions:
d['tmux_sessions'] = db_sessions
elif 'tmux_sessions' not in d:
d['tmux_sessions'] = []
old_terminals = get_terminal_set(d)
# --- caller mutation (module scope: sees d, yaml, os, glob, subprocess) ---
exec(compile(os.environ['AGENT_SESSIONS_MUTATION'], '<mutation>', 'exec'), globals())
_validate(d)
# Separate globals and sessions for normalization
d_state = {k: v for k, v in d.items() if k != 'tmux_sessions'}
conn.execute('REPLACE INTO state (id, data) VALUES (1, ?)', (json.dumps(d_state),))
current_names = []
for s in d.get('tmux_sessions', []):
name = s.get('name')
status = s.get('status')
pane_cwd = (s.get('pane') or {}).get('cwd', '')
conn.execute('REPLACE INTO sessions (name, status, pane_cwd, data) VALUES (?, ?, ?, ?)',
(name, status, pane_cwd, json.dumps(s)))
current_names.append(name)
if current_names:
placeholders = ','.join('?' for _ in current_names)
conn.execute(f'DELETE FROM sessions WHERE name NOT IN ({placeholders})', current_names)
else:
conn.execute('DELETE FROM sessions')
new_terminals = get_terminal_set(d)
conn.commit()
# Write to YAML ONLY when a session transitions to a finished state
# (Moved after conn.commit() per Claude's feedback)
if new_terminals != old_terminals:
if os.path.exists(yaml_path):
try:
shutil.copy2(yaml_path, yaml_path + '.bak')
except Exception:
pass
dir_ = os.path.dirname(yaml_path) or '.'
fd, tmp = tempfile.mkstemp(dir=dir_, prefix='.agent-sessions.', suffix='.tmp')
try:
with os.fdopen(fd, 'w') as f:
yaml.safe_dump(d, f, default_flow_style=False, sort_keys=False,
allow_unicode=True, width=4096)
os.replace(tmp, yaml_path)
except Exception:
if os.path.exists(tmp):
os.remove(tmp)
raise
try:
conn.execute('PRAGMA wal_checkpoint(TRUNCATE)')
except Exception:
pass
except Exception:
conn.rollback()
raise
finally:
conn.close()
# H3: Re-apply chmod 0600 after close to cover newly created -wal / -shm files
try:
os.chmod(db_path, 0o600)
wal = db_path + '-wal'
if os.path.exists(wal): os.chmod(wal, 0o600)
shm = db_path + '-shm'
if os.path.exists(shm): os.chmod(shm, 0o600)
except Exception:
pass
PYEOF
}
# ---------------------------------------------------------------------------
# find_workspace_uuid <workspace> <agent>
#
# Workspace-SCOPED resolution of the resume UUID (P0-C). <agent> is one of
# claude, agy or hermes; any other value yields an empty line.
#
# Resolution order (first hit wins, and every candidate must still exist on
# disk / in the agent's store before it is returned):
#   1) session rows whose pane.cwd == this workspace and whose name ends in
#      "-creator-<agent>" -> the row's own id
#      (claude_session_id_own / agy_conversation_id_own /
#       hermes_conversation_id_own)
#   2) on-disk scan scoped to this workspace
#      (claude: newest <CLAUDE_PROJECT_DIR>/<key>/*.jsonl ;
#       agy: last_conversations.json[cwd] ;
#       hermes: newest row for this cwd in <HOME_DIR>/.mam/state.db)
#   3) agent_identities[<agent>], ONLY when its project_cwd == this workspace.
#      The id is read from that same per-agent entry, never from the top level
#      of agent_identities, so an id recorded for another workspace cannot
#      leak through.
# Session rows and agent_identities come from the SQLite database next to the
# YAML file when it exists, otherwise from the YAML file.
# Prints the UUID on stdout (empty line if none).
# ---------------------------------------------------------------------------
find_workspace_uuid() {
  local workspace="$1" agent="$2"
  local abs
  abs="$(cd "$workspace" 2>/dev/null && pwd)" || abs="$workspace"
  WS_ABS="$abs" AGENT="$agent" env_python "$AGENT_SESSIONS_YAML" <<'PYEOF'
import glob
import json
import os
import sqlite3

ws = os.environ['WS_ABS']
agent = os.environ['AGENT']
home = os.environ['HOME_DIR']
yaml_path = os.environ['YAML_PATH']
db_path = os.path.splitext(yaml_path)[0] + '.db'
claude_project_dir = os.environ.get('CLAUDE_PROJECT_DIR', f"{home}/.claude/projects")
claude_key = ws.replace('/', '-').replace('_', '-')
hermes_db = f"{home}/.mam/state.db"


def load_yaml():
    # Imported lazily: PyYAML is only needed when no SQLite database exists.
    import yaml
    with open(yaml_path) as f:
        return yaml.safe_load(f) or {}


def jsonl_exists(uuid):
    return os.path.exists(f"{claude_project_dir}/{claude_key}/{uuid}.jsonl")


def db_exists(uuid):
    return os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{uuid}.db")


def hermes_exists(uuid):
    if not os.path.exists(hermes_db):
        return False
    try:
        conn = sqlite3.connect(hermes_db)
        try:
            r = conn.execute("SELECT 1 FROM sessions WHERE id=?", (uuid,)).fetchone()
        finally:
            conn.close()
        return r is not None
    except Exception:
        return False


def emit(u):
    print(u)
    raise SystemExit(0)


def rows_for_workspace(entries):
    return [s for s in (entries or [])
            if isinstance(s, dict) and (s.get('pane') or {}).get('cwd') == ws]


def mtime(path):
    # A file may disappear between glob() and stat(); sort it last.
    try:
        return os.path.getmtime(path)
    except OSError:
        return 0.0


# 1) per-row own id for THIS workspace (direct sqlite query if the db exists)
sessions = []
try:
    if os.path.exists(db_path):
        conn = sqlite3.connect(db_path, timeout=10.0)
        try:
            has_sessions_table = False
            try:
                cursor = conn.execute('SELECT data FROM sessions WHERE pane_cwd=?', (ws,))
                for row in cursor.fetchall():
                    sessions.append(json.loads(row[0]))
                has_sessions_table = True
            except sqlite3.OperationalError:
                pass
            if not has_sessions_table or not sessions:
                row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
                if row:
                    sessions.extend(rows_for_workspace(json.loads(row[0]).get('tmux_sessions')))
        finally:
            conn.close()
    elif os.path.exists(yaml_path):
        sessions.extend(rows_for_workspace(load_yaml().get('tmux_sessions')))
except Exception:
    pass

OWN_ID = {
    'claude': ('claude_session_id_own', jsonl_exists),
    'agy': ('agy_conversation_id_own', db_exists),
    'hermes': ('hermes_conversation_id_own', hermes_exists),
}
if agent in OWN_ID:
    field, still_exists = OWN_ID[agent]
    for s in sessions:
        if not isinstance(s, dict):
            continue
        name = s.get('name') or ''
        if name.endswith(f'-creator-{agent}'):
            cand = s.get(field)
            if cand and still_exists(cand):
                emit(cand)

# 2) disk scan scoped to THIS workspace
if agent == 'claude':
    proj = f"{claude_project_dir}/{claude_key}"
    if os.path.isdir(proj):
        for j in sorted(glob.glob(f"{proj}/*.jsonl"), key=mtime, reverse=True):
            sid = None
            try:
                with open(j) as f:
                    first = f.readline().strip()
                if first:
                    sid = json.loads(first).get('sessionId')
            except Exception:
                sid = None
            # Fall back to the file name without its ".jsonl" suffix.
            cand = sid or os.path.basename(j)[:-6]
            if cand and jsonl_exists(cand):
                emit(cand)
elif agent == 'agy':
    lc = f"{home}/.gemini/antigravity-cli/cache/last_conversations.json"
    if os.path.exists(lc):
        cand = None
        try:
            with open(lc) as f:
                cand = json.load(f).get(ws)
        except Exception:
            cand = None
        if cand and db_exists(cand):
            emit(cand)
elif agent == 'hermes':
    if os.path.exists(hermes_db):
        cand = None
        try:
            conn = sqlite3.connect(hermes_db)
            try:
                r = conn.execute(
                    "SELECT id FROM sessions WHERE cwd=? ORDER BY started_at DESC LIMIT 1",
                    (ws,)).fetchone()
            finally:
                conn.close()
            if r:
                cand = r[0]
        except Exception:
            cand = None
        if cand:
            emit(cand)

# 3) agent_identities cache, ONLY when its project_cwd == this workspace
identities = {}
try:
    if os.path.exists(db_path):
        conn = sqlite3.connect(db_path, timeout=10.0)
        try:
            row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
        finally:
            conn.close()
        if row:
            identities = json.loads(row[0]).get('agent_identities') or {}
    elif os.path.exists(yaml_path):
        identities = load_yaml().get('agent_identities') or {}
except Exception:
    identities = {}

entry = identities.get(agent) if isinstance(identities, dict) else None
if isinstance(entry, dict) and entry.get('project_cwd') == ws:
    if agent == 'claude':
        cand = entry.get('session_id')
        if cand and jsonl_exists(cand):
            emit(cand)
    elif agent == 'agy':
        cand = entry.get('conversation_id')
        if cand and db_exists(cand):
            emit(cand)
    elif agent == 'hermes':
        cand = entry.get('session_id') or entry.get('conversation_id')
        if cand and hermes_exists(cand):
            emit(cand)
print('')
PYEOF
}
# ---------------------------------------------------------------------------
# capture_conversation_id <agent> <workdir>
#
# Convenience wrapper around find_workspace_uuid with the argument order
# swapped (agent first). Prints whatever find_workspace_uuid prints for
# <workdir>/<agent> — the conversation id, or an empty line if none — and
# returns its status. Intended to be called before a session is killed so the
# id can be recorded in the session row for the next resume.
# ---------------------------------------------------------------------------
capture_conversation_id() {
  local agent_kind="$1"
  local workspace_dir="$2"
  find_workspace_uuid "$workspace_dir" "$agent_kind"
}
# ---------------------------------------------------------------------------
# is_already_stopped <session_name>
#
# Exits 0 if the row's status is 'stopped' (printing "stopped_at=<ts>" on
# stdout), 1 otherwise (including not-found). Used for idempotency: a second
# stop on an already-stopped session is a no-op.
# ---------------------------------------------------------------------------
is_already_stopped() {
# Usage: is_already_stopped <session_name>
#
# Runs the embedded Python check through env_python; this function's exit
# status is that of the Python process (0 = row found with status 'stopped',
# 1 = anything else). On success "stopped_at=<value>" is printed, with '?'
# when the row has no stopped_at field.
# The database next to the YAML file is consulted when it exists; the YAML
# file is read only when there is no database file.
local session_name="$1"
# SESSION_NAME travels through the environment; the heredoc is quoted so the
# shell never interpolates the Python source.
SESSION_NAME="$session_name" env_python "$AGENT_SESSIONS_YAML" <<'PYEOF'
import os, yaml, sqlite3, json
name = os.environ['SESSION_NAME']
yaml_path = os.environ['YAML_PATH']
db_path = os.path.splitext(yaml_path)[0] + '.db'
try:
if os.path.exists(db_path):
conn = sqlite3.connect(db_path, timeout=10.0)
has_sessions_table = False
try:
row = conn.execute('SELECT status, data FROM sessions WHERE name=?', (name,)).fetchone()
if row:
status, s_data_str = row[0], row[1]
if status == 'stopped':
s = json.loads(s_data_str)
print(f"stopped_at={s.get('stopped_at', '?')}")
raise SystemExit(0)
has_sessions_table = True
except sqlite3.OperationalError:
pass
if not has_sessions_table:
row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
if row:
d = json.loads(row[0])
for s in d.get('tmux_sessions', []):
if s.get('name') == name and s.get('status') == 'stopped':
print(f"stopped_at={s.get('stopped_at', '?')}")
raise SystemExit(0)
conn.close()
raise SystemExit(1)
elif os.path.exists(yaml_path):
with open(yaml_path) as f:
d = yaml.safe_load(f) or {}
for s in d.get('tmux_sessions', []):
if s.get('name') == name and s.get('status') == 'stopped':
print(f"stopped_at={s.get('stopped_at', '?')}")
raise SystemExit(0)
except Exception:
pass
raise SystemExit(1)
PYEOF
}
# ---------------------------------------------------------------------------
# multi-agent-mux-delegate-job integration helpers
#
# All paths are resolved relative to lib.sh's own location (BASH_SOURCE), so the
# skill tree is relocatable — no hardcoded absolute paths (review item 6).
# ---------------------------------------------------------------------------
# _delegate_py_bin — print the Python interpreter to use for delegate-job scripts.
#
# Preference order: an already-set, executable AGENT_PYTHON_BIN; the first
# .venv/bin/python found while walking from this file's directory up towards
# (but not including) "/"; finally python3 from PATH, or the literal "python3"
# when it is not found. The result is stored in the shell variable
# AGENT_PYTHON_BIN (not exported, to avoid cross-workspace pollution); note
# that the assignment is lost when the function is called inside $(...).
_delegate_py_bin() {
  if [ -n "${AGENT_PYTHON_BIN:-}" ] && [ -x "$AGENT_PYTHON_BIN" ]; then
    printf '%s\n' "$AGENT_PYTHON_BIN"
    return 0
  fi

  local dir
  dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  until [ "$dir" = "/" ] || [ -z "$dir" ]; do
    if [ -x "$dir/.venv/bin/python" ]; then
      AGENT_PYTHON_BIN="$dir/.venv/bin/python"
      printf '%s\n' "$AGENT_PYTHON_BIN"
      return 0
    fi
    dir="$(dirname "$dir")"
  done

  AGENT_PYTHON_BIN="$(command -v python3 || echo python3)"
  printf '%s\n' "$AGENT_PYTHON_BIN"
}
# _delegate_script <name> — print the path of a multi-agent-mux-delegate-job script.
#
# Looks first at <lib dir>/multi-agent-mux-delegate-job/scripts/<name>. If that
# is not a regular file, prints the first match of `find <lib dir> -name <name>`
# instead, or an empty line when nothing matches.
_delegate_script() {
  local name="$1" base_dir direct
  base_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  direct="$base_dir/multi-agent-mux-delegate-job/scripts/$name"

  if [ ! -f "$direct" ]; then
    printf '%s\n' "$(find "$base_dir" -name "$name" 2>/dev/null | head -n 1 || true)"
    return
  fi

  printf '%s\n' "$direct"
  return 0
}
# delegate_submit_job <prompt> <agent> <agent_session>
#
# Register a job by running the delegate-job registry.py with the `register`
# subcommand. The script's stdout (the new JID, per the registry's contract)
# and exit status are passed through. Returns 1 with an error on stderr when
# registry.py cannot be located.
delegate_submit_job() {
  local prompt="$1" agent="$2" session="$3"
  local interpreter registry
  interpreter="$(_delegate_py_bin)"
  registry="$(_delegate_script registry.py)"

  if [ -n "$registry" ] && [ -f "$registry" ]; then
    "$interpreter" "$registry" register \
      --prompt "$prompt" \
      --agent "$agent" \
      --agent-session "$session"
    return $?
  fi

  echo "ERROR: multi-agent-mux-delegate-job registry.py not found under .agents/skills/" >&2
  return 1
}
# delegate_publish_event <job_id> <event> [detail]
#
# Publish a lifecycle event by running the delegate-job publish_event.py with
# --job/--event/--detail. Non-fatal by contract: the function returns 0 when
# the job id is empty, when publish_event.py cannot be found, and when the
# script itself fails.
delegate_publish_event() {
  local job_id="$1" event="$2" detail="${3:-}"
  if [ -z "$job_id" ]; then
    return 0
  fi

  local interpreter publisher
  interpreter="$(_delegate_py_bin)"
  publisher="$(_delegate_script publish_event.py)"
  if [ -z "$publisher" ] || [ ! -f "$publisher" ]; then
    return 0
  fi

  "$interpreter" "$publisher" --job "$job_id" --event "$event" --detail "$detail" || true
}
# start_watchdog <job_id> [workdir]
#
# Make sure the monitor subscriber for a workspace is running and print its
# PID on stdout.
#
#   <job_id>   accepted for interface compatibility; the subscriber started
#              here is a wildcard monitor, so the id is not passed on.
#   [workdir]  workspace root (default: current directory). It is resolved to
#              an absolute path so the script and log paths stay valid after
#              the child changes directory.
#
# If `bash <workdir>/.agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh
# --subscribe` is already running, its PID (the first one if several match) is
# reused. Otherwise the script is started in the background under nohup with
# --idle-timeout 0, with <workdir> as its working directory and its output
# appended to <workdir>/.mam/multi-agent-mux-monitor.log.
#
# Returns 1 when the monitor script is missing or the log directory cannot be
# created. The caller's working directory is never changed.
start_watchdog() {
  local job_id="$1"
  local workdir="${2:-$PWD}"
  local abs_workdir
  if abs_workdir="$(cd "$workdir" 2>/dev/null && pwd)"; then
    workdir="$abs_workdir"
  fi
  local monitor_script="$workdir/.agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh"
  local log_file="$workdir/.mam/multi-agent-mux-monitor.log"
  if [ ! -f "$monitor_script" ]; then
    echo "ERROR: monitor script not found: $monitor_script" >&2
    return 1
  fi

  # Check if reconcile.sh --subscribe is already running on this workspace.
  local pid
  pid="$(pgrep -f "bash $monitor_script --subscribe" | head -n 1 || true)"
  if [ -z "$pid" ]; then
    # The log redirect fails (and nothing is started) if .mam does not exist yet.
    if ! mkdir -p "$(dirname "$log_file")"; then
      echo "ERROR: cannot create log directory for: $log_file" >&2
      return 1
    fi
    # Start the wildcard monitor subscriber with --idle-timeout 0 (never idle
    # out). The subshell keeps the cd local, and exec makes the background PID
    # the PID of the monitor itself.
    (
      cd "$workdir" || exit 1
      exec nohup bash "$monitor_script" --subscribe --idle-timeout 0 >> "$log_file" 2>&1
    ) &
    pid=$!
  fi
  echo "$pid"
}
@@ -0,0 +1,220 @@
---
name: multi-agent-mux-create
description: "Create a new agent session (claude, antigravity/agy) in a dedicated tmux session for context-preserving long-running work. Always creates a tmux session — never backgrounds with nohup/disown. Writes the new session to .mam/agent-sessions.yaml. Use when you want to start a fresh agent (no prior UUID) for a new project workspace."
version: 1.0.0
author: godopu
license: MIT
platforms: [linux, macos]
environments: [terminal, tmux]
metadata:
hermes:
tags: [agent, tmux, claude, antigravity, agy, multi-agent, context, session]
related_skills: [multi-agent-mux-resume, multi-agent-mux-stop, multi-agent-mux-monitor, claude-code]
prereq_skills: [claude-code]
---
# Multi-Agent Create — Start a Fresh Agent in a tmux Session
> **Companion skills**: `multi-agent-mux-resume` (resume an existing UUID), `multi-agent-mux-stop` (terminate), `multi-agent-mux-monitor` (live status).
> **Single source of truth**: `./.mam/agent-sessions.yaml` (this skill writes to it; never read it ad-hoc — go through this skill).
## What this skill does
Spawn a new agent (`claude` or `agy`/antigravity-cli) in a **dedicated tmux session** for context-preserving long-running work. The tmux session is the *container*; the agent's session ID is *data* inside the container. **This skill creates the container + starts the agent — but does not resume an old conversation** (use `multi-agent-mux-resume` for that).
For all agents: the tmux session name is produced by **`lib.sh::derive_session_name`** — the single source of truth shared by create/resume/stop/status/monitor (P0-A). The rule (verbatim from the function):
> slug = the **two trailing path components** of the absolute workspace, `_`→`-`, lowercased, joined with `-`; name = `<slug>-creator-<agent>`.
So `$WORKSPACE_ROOT/landing_page/refer_landing_page` + `claude` → `landing-page-refer-landing-page-creator-claude`. The workspace basename (`refer_landing_page`) **is** included; the hand-written historical entry that dropped it (`lab-landing-page-creator-claude`) was the bug, not the convention.
## Pre-flight checks
Before doing anything, verify the environment:
```bash
# 1) tmux available and isolated server status
command -v tmux || { echo "ERROR: tmux not installed"; exit 1; }
echo "Tmux server name: ${TMUX_SERVER_NAME:-default}"
# 2) claude / agy available
command -v claude # required for --agent claude
command -v agy # required for --agent agy
# 3) claude auth (if --agent claude)
claude auth status 2>&1 | python3 -c "import json,sys; d=json.load(sys.stdin); assert d.get('loggedIn'), 'claude not logged in'"
# 4) target workspace exists
test -d "$WORKSPACE" || { echo "ERROR: workspace $WORKSPACE not a directory"; exit 1; }
```
If any check fails → `kanban_block(reason="...")` (worker path) or report to user (interactive path). Do not proceed with a half-broken setup.
## Standard names
- **tmux session name**: `derive_session_name <workspace> <agent>` (lib.sh)
- `<workspace-slug>` = `basename $(dirname $WORKSPACE)` `-` `basename $WORKSPACE` (lowercase, `_` → `-`)
- examples: `landing-page-refer-landing-page-creator-claude`, `paper-pdf2md-creator-agy`
- never re-derive this by hand — source lib.sh and call the function
- **wrapper script** (claude only): `~/.local/bin/<workspace-slug>-creator-claude`
- contents: tmux new-session with `claude` inside, auto-handles trust/bypass dialogs
- see `<workdir>/agent_sessions.md` for the canonical wrapper template
## Tmux Server Isolation (격리 서버)
When running multiple agent sessions alongside other workflows (e.g., cmux, Kanban workers, manual tmux sessions), sharing the default tmux server can lead to session name conflicts, monitoring clutter, and accidental destruction of user sessions via global commands.
To prevent this, you can run this skill inside an **isolated tmux server** using the `TMUX_SERVER_NAME` environment variable or the `--tmux-server <name>` flag (opt-in).
### How to use
1. **Via Environment Variable**:
```bash
export TMUX_SERVER_NAME=multi-agent-canary
# All subsequent commands (create, status, stop, etc.) will run in the isolated 'multi-agent-canary' tmux server.
```
2. **Via Option Flag**:
```bash
bash scripts/create_session.sh --workspace /path/to/project --agent claude --tmux-server multi-agent-canary
```
3. **Submit Job Integration**:
You can automatically register a delegated job with a prompt when creating a session:
```bash
bash scripts/create_session.sh --workspace /path/to/project --agent claude --submit-job "Task prompt here"
```
### Recommended Alias
You can set an alias in your shell to easily query sessions on the isolated server:
```bash
alias tmc='tmux -L multi-agent-canary'
tmc ls # Lists only your multi-agent sessions
```
### Safety Rules (Pitfall 29 Summary)
- Never use global server termination commands like `tmux kill-server` or `tmux kill-session -a` as they will destroy all sessions on that server (including your own workspace sessions if they share the server).
- By using an isolated server via `TMUX_SERVER_NAME`, your agent sessions are completely separated from your default user workspace, ensuring 0% interference.
## Workflow
```bash
WORKSPACE=/path/to/project
AGENT=claude # or agy
source .agents/skills/lib.sh
SESSION_NAME="$(derive_session_name "$WORKSPACE" "$AGENT")"
# 1. If session already alive, fail fast
tmux has-session -t "$SESSION_NAME" 2>/dev/null && {
echo "ERROR: tmux session '$SESSION_NAME' already exists. Use multi-agent-mux-resume to attach or multi-agent-mux-stop first."
exit 1
}
# 2. Spawn the tmux session with the agent inside
case "$AGENT" in
claude)
    # Use the wrapper if it exists, else inline tmux new-session
    # (the LOCAL_BIN env var overrides the default $HOME/.local/bin)
local_bin="${LOCAL_BIN:-$HOME/.local/bin}"
if [ -x "$local_bin/$SESSION_NAME" ]; then
nohup "$local_bin/$SESSION_NAME" >/dev/null 2>&1 &
else
tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "claude"
fi
;;
agy)
tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "agy --dangerously-skip-permissions"
;;
*) echo "ERROR: --agent must be claude or agy, got: $AGENT"; exit 2 ;;
esac
# 3. Wait for agent TUI to be ready (varies: claude ~5s, agy ~3s)
sleep 6
# 4. Capture pane metadata
PANE_PID=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}')
PANE_CWD=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_current_path}')
PANE_CMD=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_current_command}')
TMUX_EPOCH=$(tmux display-message -p -t "$SESSION_NAME" '#{session_created}' 2>/dev/null)
```
## Registering the session in agent-sessions.yaml
After spawn, append a new `tmux_sessions[]` entry to `.mam/agent-sessions.yaml`:
```yaml
- name: <SESSION_NAME>
status: running
tmux_session_created_at: 2026-06-17T...Z # ISO 8601 UTC
tmux_session_epoch: <TMUX_EPOCH>
tmux_server: <TMUX_SERVER_NAME> # Isolated server name (default: 'default')
pane:
index: 0
pid: <PANE_PID>
cmd: <AGENT> # 'claude' or 'agy'
cmd_full: <full command line, see table below>
cwd: <PANE_CWD>
tui: # only for claude
model: <from TUI status>
provider: <from TUI status>
plan: <from TUI status>
account: <from TUI status>
version: <from TUI status>
start_command: <the exact tmux new-session command used>
attach_command: "tmux attach -t <SESSION_NAME>"
kill_command: "tmux kill-session -t <SESSION_NAME>"
```
`cmd_full` per agent (this is the actual command line in the pane, not the resume command):
| agent | cmd_full |
|---|---|
| claude (interactive) | `claude` |
| agy (interactive) | `agy --dangerously-skip-permissions` |
Use the `create_session.sh` script in `scripts/` to safely append the entry (preserves comments + format):
```bash
bash .agents/skills/multi-agent-mux-create/scripts/create_session.sh \
--workspace "$WORKSPACE" --agent "$AGENT" --session "$SESSION_NAME"
```
The script handles the YAML append, pane capture, and the `last_visible_status` placeholder.
## Pitfalls
- **Don't use `nohup`/`disown`/`setsid` for the agent itself** — those background the agent outside tmux. The whole point of this skill is *the tmux session is the supervisor*. `nohup` is OK only for *launching the wrapper* (which itself creates the tmux session via `tmux new-session -d`).
- **Don't trust `--session-id <uuid>` flags blindly** — claude/agy may not accept a fixed session id on first spawn. The session id is *assigned* on first user message; you can read it back from `~/.claude/projects/.../session.jsonl` headers or `~/.gemini/.../cache/last_conversations.json` AFTER the first message.
- **Wrapper script MUST NOT be created via `hermes profile alias`** — that command writes a `hermes -p <profile>` wrapper that destroys the tmux behavior. Create wrappers manually (see `lab-landing-page-creator-claude` template).
- **Always pass an absolute workspace path** as the tmux `cwd` (`-c`) — relative paths break when tmux respawns in a different shell context.
- **The first `claude` message generates the session id** — `multi-agent-mux-create` only sets up the *container*. If you need a known session id for later resume, send a placeholder message (e.g. "init") and read it back, then call `multi-agent-mux-resume` later.
## Verification
After spawn + YAML append:
```bash
# 1. tmux session is alive
tmux has-session -t "$SESSION_NAME" && echo OK || echo MISSING
# 2. pane has the expected cmd + cwd
tmux list-panes -t "$SESSION_NAME" -F 'cmd=#{pane_current_command} cwd=#{pane_current_path}'
# 3. agent-sessions.yaml has the new entry
python3 -c "
import yaml
d = yaml.safe_load(open('.mam/agent-sessions.yaml'))
names = [s['name'] for s in d['tmux_sessions']]
assert '$SESSION_NAME' in names, 'session not registered'
print('OK:', names)
"
# 4. Optional: send a probe via tmux send-keys and capture-pane
tmux send-keys -t "$SESSION_NAME" "" Enter
sleep 2
tmux capture-pane -t "$SESSION_NAME" -p -S -20
```
## When NOT to use this skill
- **Resuming an old conversation** → `multi-agent-mux-resume`
- **Killing an existing session** → `multi-agent-mux-stop`
- **Just attaching to an existing session** → `tmux attach -t <name>` (no skill needed)
- **One-shot print mode (claude -p "...")** → no tmux needed; use `claude-code` skill's print mode
@@ -0,0 +1,294 @@
#!/usr/bin/env bash
# create_session.sh — helper script for the multi-agent-mux-create skill
# Usage:
#   bash create_session.sh --workspace <path> --agent <claude|agy|hermes> [--session <name>] [--wrapper]
#                          [--dry-run] [--tmux-server <name>] [--submit-job <prompt>]
#
# Steps:
#   1) preflight: tmux / agent CLI availability, workspace exists
#   2) decide the tmux session name (derived automatically when --session is absent)
#   3) start the tmux session (claude prefers the wrapper; agy and hermes run inline)
#   4) capture pane metadata (pid, cmd, cwd)
#   5) append a tmux_sessions[] entry to agent-sessions.yaml
#   6) print a verification summary
#
# Exit codes:
#   0 = success
#   1 = preflight failure
#   2 = invalid args
#   3 = tmux session already exists (use multi-agent-mux-resume, or multi-agent-mux-stop first)
#   4 = agent-sessions.yaml append failure
set -euo pipefail
source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
# Print the command-line help text to stdout.
# The first line embeds $0 so the help shows the name the script was invoked as.
usage() {
  printf '%s\n' \
    "Usage: $0 --workspace <path> --agent <claude|agy|hermes> [options]" \
    'Options:' \
    '--workspace PATH project directory (required)' \
    '--agent AGENT claude | agy | hermes (required)' \
    '--session NAME tmux session name (default: derived from workspace)' \
    '--wrapper force use of ~/.local/bin/<session> wrapper even if not present' \
    '--dry-run print commands without executing' \
    '--tmux-server NAME specify isolated tmux server name' \
    '--submit-job PROMPT submit a job to multi-agent-mux-delegate-job registry with the given prompt' \
    '-h, --help this help'
}
# Option defaults; each one can be overridden on the command line.
WORKSPACE=""
AGENT=""
SESSION_NAME=""
USE_WRAPPER=0
DRY_RUN=0
TMUX_SERVER_OPT=""
SUBMIT_JOB_PROMPT=""

# Abort with exit code 2 (invalid args) when a value-taking option is the last
# word on the command line. Without this guard "$2" is unset and, under
# `set -u`, the script dies with an "unbound variable" message and exit code 1.
# Arguments: $1 - option name, $2 - number of words left (the option included)
_require_value() {
  if [ "$2" -lt 2 ]; then
    echo "ERROR: $1 requires a value" >&2
    usage
    exit 2
  fi
}

# Parse the command line into the option globals above.
# Globals written: WORKSPACE, AGENT, SESSION_NAME, USE_WRAPPER, DRY_RUN,
#                  TMUX_SERVER_OPT, SUBMIT_JOB_PROMPT, TMUX_SERVER_NAME (exported
#                  only when --tmux-server was given)
# Exits: 0 after printing help for -h/--help, 2 on an unknown option or a
#        missing option value.
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --workspace) _require_value "$1" "$#"; WORKSPACE="$2"; shift 2 ;;
      --agent) _require_value "$1" "$#"; AGENT="$2"; shift 2 ;;
      --session) _require_value "$1" "$#"; SESSION_NAME="$2"; shift 2 ;;
      --wrapper) USE_WRAPPER=1; shift ;;
      --dry-run) DRY_RUN=1; shift ;;
      --tmux-server) _require_value "$1" "$#"; TMUX_SERVER_OPT="$2"; shift 2 ;;
      --submit-job) _require_value "$1" "$#"; SUBMIT_JOB_PROMPT="$2"; shift 2 ;;
      -h|--help) usage; exit 0 ;;
      *) echo "ERROR: unknown arg: $1" >&2; usage; exit 2 ;;
    esac
  done

  # The flag wins over a TMUX_SERVER_NAME inherited from the environment.
  if [ -n "$TMUX_SERVER_OPT" ]; then
    export TMUX_SERVER_NAME="$TMUX_SERVER_OPT"
  fi
}

parse_args "$@"
# Preflight: required arguments, a supported agent, the workspace directory and
# the CLIs. Argument problems exit 2 (with usage); environment problems exit 1.
[ -n "$WORKSPACE" ] || { echo "ERROR: --workspace required" >&2; usage; exit 2; }
[ -n "$AGENT" ] || { echo "ERROR: --agent required" >&2; usage; exit 2; }
# Reject unsupported agents before probing PATH: otherwise any existing command
# name (e.g. `--agent ls`) passes the `command -v` check and a --dry-run reports
# success, and a typo is reported as a missing CLI (exit 1) instead of bad args.
case "$AGENT" in
  claude|agy|hermes) ;;
  *) echo "ERROR: --agent must be claude, agy or hermes, got: $AGENT" >&2; usage; exit 2 ;;
esac
[ -d "$WORKSPACE" ] || { echo "ERROR: workspace $WORKSPACE not a directory" >&2; exit 1; }
command -v tmux >/dev/null || { echo "ERROR: tmux not installed" >&2; exit 1; }
command -v "$AGENT" >/dev/null || { echo "ERROR: $AGENT CLI not in PATH" >&2; exit 1; }
# Succeed when `claude auth status` prints a "loggedIn" key whose value is true.
# Uses the POSIX [[:space:]] class with grep -E; `\s` is a GNU extension and is
# not guaranteed to work with every grep. stderr of the CLI is discarded.
_claude_logged_in() {
  claude auth status 2>/dev/null | grep -Eq '"loggedIn":[[:space:]]*true'
}

# Auth check: loggedIn for claude, a successful `agy models` for agy, and a
# successful `hermes status` for hermes. Any failure is a preflight failure (exit 1).
if [ "$AGENT" = "claude" ]; then
  if ! _claude_logged_in; then
    echo "ERROR: claude not logged in. Run 'claude auth login' first." >&2
    exit 1
  fi
elif [ "$AGENT" = "agy" ]; then
  if ! agy models >/dev/null 2>&1; then
    echo "ERROR: agy is not authenticated. Please log in first." >&2
    exit 1
  fi
elif [ "$AGENT" = "hermes" ]; then
  if ! hermes status >/dev/null 2>&1; then
    echo "ERROR: hermes is not functional. Run 'hermes setup' first." >&2
    exit 1
  fi
fi
# Session name: an explicit --session wins; otherwise lib.sh::derive_session_name
# is the single source of truth (P0-A).
[ -n "$SESSION_NAME" ] || SESSION_NAME="$(derive_session_name "$WORKSPACE" "$AGENT")"

# Refuse to continue when a session with this name is already alive.
if _tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
  printf '%s\n' "ERROR: tmux session '$SESSION_NAME' already exists. Use multi-agent-mux-resume to attach, or multi-agent-mux-stop first." >&2
  exit 3
fi

# Where a per-session launcher wrapper would live (LOCAL_BIN may come from the environment).
: "${LOCAL_BIN:=$HOME/.local/bin}"
WRAPPER="${LOCAL_BIN}/${SESSION_NAME}"
# Start the agent inside a new detached tmux session named $SESSION_NAME.
# Globals read: AGENT, SESSION_NAME, WORKSPACE, WRAPPER, USE_WRAPPER
#   claude - runs $WRAPPER when it is executable and not itself called "claude",
#            or when --wrapper (USE_WRAPPER=1) forces it; otherwise starts
#            `claude --dangerously-skip-permissions` through _tmux directly.
#   agy / hermes - always started through _tmux directly.
# Exits: 1 when the wrapper was requested but is missing or not executable,
#        2 for an unsupported agent.
spawn() {
  case "$AGENT" in
    claude)
      if { [ -x "$WRAPPER" ] && [ "${WRAPPER##*/}" != "claude" ]; } || [ "$USE_WRAPPER" = "1" ]; then
        # A forced but absent wrapper used to be backgrounded with its output
        # discarded, so the failure was invisible and a dead session got
        # registered afterwards. Fail loudly instead.
        if [ ! -x "$WRAPPER" ]; then
          echo "ERROR: wrapper $WRAPPER is missing or not executable" >&2
          exit 1
        fi
        # nohup/disown only detach the launcher; the wrapper is expected to
        # create the tmux session itself.
        nohup "$WRAPPER" >/dev/null 2>&1 &
        disown
      else
        _tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "claude --dangerously-skip-permissions"
      fi
      ;;
    agy)
      _tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "agy --dangerously-skip-permissions"
      ;;
    hermes)
      _tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "hermes"
      ;;
    *) echo "ERROR: --agent must be claude, agy or hermes, got: $AGENT" >&2; exit 2 ;;
  esac
}
# --dry-run: report what would happen and stop before touching tmux.
case "$DRY_RUN" in
  1)
    printf '%s\n' "[dry-run] would spawn: tmux session '$SESSION_NAME' in $WORKSPACE (agent=$AGENT)"
    exit 0
    ;;
esac

spawn

# Give the agent TUI time to come up.
sleep 6

# Print one list-panes format field for the new session, or the given fallback
# when tmux cannot answer.
# Arguments: $1 - tmux format string, $2 - fallback value
_pane_field() {
  _tmux list-panes -t "$SESSION_NAME" -F "$1" 2>/dev/null || echo "$2"
}

# Pane metadata recorded in agent-sessions.yaml.
PANE_PID=$(_pane_field '#{pane_pid}' "")
PANE_CWD=$(_pane_field '#{pane_current_path}' "$WORKSPACE")
PANE_CMD=$(_pane_field '#{pane_current_command}' "$AGENT")
TMUX_EPOCH=$(date +%s)
NOW_ISO=$(date -u +'%Y-%m-%dT%H:%M:%SZ')
# Compute the two command strings recorded in agent-sessions.yaml.
# Globals read:    AGENT, SESSION_NAME, WORKSPACE, WRAPPER, USE_WRAPPER,
#                  TMUX_SERVER_NAME (optional)
# Globals written: CMD_FULL  - command line running inside the pane
#                  START_CMD - command that recreates the session
# For claude the wrapper test mirrors the one spawn() uses, so START_CMD names
# the launcher that was actually run. The previous `[ -x "$WRAPPER" ]` test
# disagreed with spawn() when the wrapper path was the claude binary itself or
# when --wrapper forced the wrapper.
resolve_launch_commands() {
  # Sessions on an isolated server need `-L <name>` to be reachable again.
  local tmux_cmd="tmux"
  if [ -n "${TMUX_SERVER_NAME:-}" ] && [ "$TMUX_SERVER_NAME" != "default" ]; then
    tmux_cmd="tmux -L $TMUX_SERVER_NAME"
  fi

  case "$AGENT" in
    claude) CMD_FULL='claude --dangerously-skip-permissions' ;;
    agy) CMD_FULL='agy --dangerously-skip-permissions' ;;
    hermes) CMD_FULL='hermes' ;;
  esac

  case "$AGENT" in
    claude)
      if { [ -x "$WRAPPER" ] && [ "${WRAPPER##*/}" != "claude" ]; } || [ "$USE_WRAPPER" = "1" ]; then
        START_CMD="$WRAPPER # ~/.local/bin 의 래퍼"
      else
        START_CMD="$tmux_cmd new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"claude --dangerously-skip-permissions\""
      fi
      ;;
    agy|hermes)
      START_CMD="$tmux_cmd new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"$CMD_FULL\""
      ;;
  esac
}

resolve_launch_commands
# Optional --submit-job: register the prompt as a delegated job first so its id
# can be stored in the session entry appended to agent-sessions.yaml.
DELEGATE_JOB_ID=""
if [ -n "$SUBMIT_JOB_PROMPT" ]; then
  # Translate the CLI name into the agent label passed to delegate_submit_job.
  case "$AGENT" in
    claude) delegate_agent="claude-code" ;;
    hermes) delegate_agent="hermes-agent" ;;
    *) delegate_agent="antigravity-cli" ;;
  esac
  agent_session="tmux:$SESSION_NAME"
  DELEGATE_JOB_ID=$(delegate_submit_job "$SUBMIT_JOB_PROMPT" "$delegate_agent" "$agent_session")
  printf '%s\n' "Submitted delegated job: $DELEGATE_JOB_ID"
fi

# First run: seed an empty registry file for the append below.
if [ ! -f "$AGENT_SESSIONS_YAML" ]; then
  mkdir -p "$(dirname "$AGENT_SESSIONS_YAML")"
  printf '%s\n' "tmux_sessions: []" > "$AGENT_SESSIONS_YAML"
fi
# Append the new session entry to agent-sessions.yaml through atomic_dump_yaml.
# atomic_dump_yaml: flock + temp+rename + .bak + schema validate (P0-B).
# NOTE(review): the lib.sh header describes it as "SQLite db transaction +
# temp+rename + .bak + validate" — confirm which locking description is current.
# All values are passed as environment variables — no heredoc interpolation (P1-B);
# the heredoc delimiter is quoted, so the shell expands nothing inside it.
# The Python snippet uses `d` and `os` without defining them — presumably
# atomic_dump_yaml supplies the loaded document and the import; verify in lib.sh.
# The snippet raises SystemExit(4) when an entry with the same name is still
# marked running.
# The child pid is resolved up front in bash with pgrep (P2: filter by tool name).
# It is looked up only for agy and hermes, and only when the pane pid is known;
# 0 means "not found".
CHILD_PID=0
if { [ "$AGENT" = "agy" ] || [ "$AGENT" = "hermes" ]; } && [ -n "$PANE_PID" ]; then
  CHILD_PID=$(pgrep -P "$PANE_PID" -x "$AGENT" 2>/dev/null | head -1 || true)
  CHILD_PID="${CHILD_PID:-0}"
fi
atomic_dump_yaml "$AGENT_SESSIONS_YAML" \
  SESSION_NAME="$SESSION_NAME" AGENT="$AGENT" NOW_ISO="$NOW_ISO" \
  TMUX_EPOCH="$TMUX_EPOCH" PANE_PID="$PANE_PID" PANE_CWD="$PANE_CWD" \
  CMD_FULL="$CMD_FULL" START_CMD="$START_CMD" CHILD_PID="$CHILD_PID" \
  TMUX_SERVER_NAME="${TMUX_SERVER_NAME:-default}" \
  DELEGATE_JOB_ID="$DELEGATE_JOB_ID" <<'PYEOF'
name = os.environ['SESSION_NAME']
agent = os.environ['AGENT']
pid = os.environ.get('PANE_PID', '')
epoch = os.environ.get('TMUX_EPOCH', '')
server_name = os.environ.get('TMUX_SERVER_NAME', 'default')
server_opt = f"-L {server_name} " if server_name and server_name != 'default' else ""
sessions = d.setdefault('tmux_sessions', [])
# P0-D: 같은 이름 엔트리가 status=running 이면만 거부. terminated/archived 는
# 재사용 가능 — 낡은 엔트리를 제거하고 새로 append (create -> delete -> create).
running_same = [s for s in sessions if s.get('name') == name and s.get('status') == 'running']
if running_same:
    print(f"ERROR: {name} already running in agent-sessions.yaml", flush=True)
    raise SystemExit(4)
sessions[:] = [s for s in sessions if s.get('name') != name]
entry = {
    'name': name,
    'status': 'running',
    'tmux_session_created_at': os.environ['NOW_ISO'],
    'tmux_session_epoch': int(epoch) if epoch.isdigit() else 0,
    'tmux_server': server_name,
    'delegate_job_id': os.environ.get('DELEGATE_JOB_ID', '') or None,
    'pane': {
        'index': 0,
        'pid': int(pid) if pid.isdigit() else 0,
        'cmd': agent,
        'cmd_full': os.environ['CMD_FULL'],
        'cwd': os.environ['PANE_CWD'],
    },
    'start_command': os.environ['START_CMD'],
    'attach_command': f'tmux {server_opt}attach -t {name}',
    'kill_command': f'tmux {server_opt}kill-session -t {name}',
}
if agent == 'claude':
    entry['tui'] = {
        'model': '(unknown — capture after first message)',
        'provider': 'anthropic',
        'plan': '(unknown)',
        'account': '(unknown — read from claude auth status)',
        'version': '(unknown — read from TUI)',
    }
    entry['claude_session_id_own'] = None
    entry['last_visible_status'] = "TUI started; awaiting first user message"
elif agent == 'agy':
    cp = os.environ.get('CHILD_PID', '0')
    entry['child_pid'] = int(cp) if cp.isdigit() else 0
    entry['agy_conversation_id_own'] = None
    entry['mcp_attachments'] = [
        {
            'name': 'stitch',
            'transport': 'mcp-remote',
            'endpoint': 'https://stitch.googleapis.com/mcp'
        }
    ]
    entry['last_visible_status'] = "TUI started; awaiting first user message"
elif agent == 'hermes':
    cp = os.environ.get('CHILD_PID', '0')
    entry['child_pid'] = int(cp) if cp.isdigit() else 0
    entry['hermes_conversation_id_own'] = None
    entry['last_visible_status'] = "TUI started; awaiting first user message"
sessions.append(entry)
snap = d.setdefault('snapshot', {})
snap['taken_at'] = os.environ['NOW_ISO']
snap['cwd'] = os.environ['PANE_CWD']
print(f"appended: {name}", flush=True)
PYEOF
# Final report: what was created and how to reach it.
printf '\n%s\n' "=== created ==="
printf 'tmux session: %s (pane pid %s, cmd %s, cwd %s)\n' \
  "$SESSION_NAME" "$PANE_PID" "$PANE_CMD" "$PANE_CWD"

# With --submit-job: announce the job as started and launch its watchdog.
if [ -n "$DELEGATE_JOB_ID" ]; then
  printf 'delegate job: %s\n' "$DELEGATE_JOB_ID"
  delegate_publish_event "$DELEGATE_JOB_ID" started "multi-agent-mux session created"
  WD_PID=$(start_watchdog "$DELEGATE_JOB_ID" "$WORKSPACE")
  printf 'watchdog PID: %s\n' "$WD_PID"
fi

printf '%s\n\n' "agent-sessions.yaml updated"

# Sessions on an isolated server are only reachable with `-L <name>`.
server_flag=""
if [ -n "${TMUX_SERVER_NAME:-}" ] && [ "$TMUX_SERVER_NAME" != "default" ]; then
  server_flag="-L $TMUX_SERVER_NAME "
fi
printf 'Attach: tmux %sattach -t %s\n' "$server_flag" "$SESSION_NAME"
printf '%s\n' "Delete: use multi-agent-mux-stop skill"
printf '%s\n' "Resume: use multi-agent-mux-resume skill (after first message creates a session id)"
@@ -0,0 +1,11 @@
# multi-agent-mux-delegate-job 스킬
작업(Job)을 자율 에이전트(claude-code/codex/opencode/human)에게 위임하고 MQTT
이벤트 채널로 비동기 관찰하는 Hermes 스킬. **시작점은 [`SKILL.md`](./SKILL.md).**
- 프로토콜/스키마: [`job-protocol.md`](./job-protocol.md)
- 브로커 PoC→운영 전환: [`mqtt-broker-setup.md`](./mqtt-broker-setup.md)
- 레지스트리 포맷/동시성: [`registry.md`](./registry.md)
- 참조 구현: [`multi-agent-mux-delegate-job`](./multi-agent-mux-delegate-job) (bash wrapper), [`scripts/publish_event.py`](./scripts/publish_event.py), [`scripts/job_subscriber.py`](./scripts/job_subscriber.py), [`scripts/registry.py`](./scripts/registry.py), [`scripts/mqtt_common.py`](./scripts/mqtt_common.py)
- 영구 감사 로그: `.mam/delegate_job_logs/<job_id>/` (`meta.json`·`events.ndjson`·`status.json`)
`multi-agent-mux-delegate-job logs <id>` 또는 `multi-agent-mux-delegate-job logs --list`로 조회 (SKILL.md "Audit Logs" 참조)
@@ -0,0 +1,385 @@
---
name: multi-agent-mux-delegate-job
description: "Delegate a unit of work to any autonomous agent (claude-code, codex, opencode, or a human) and observe it asynchronously over an MQTT event channel. Each job gets a unique id, a registry record (prompt, broker, status, timeouts), and a single per-job topic that carries started/permission_required/progress/completed/error events as schema-versioned JSON. The delegator starts a subscriber first, runs the agent, and treats a completed/error event or a timeout as the job's terminal state. Ships a working reference implementation (publish_event.py, job_subscriber.py, registry.py, mqtt_common.py, multi-agent-mux-delegate-job wrapper) plus a PoC-to-production path: validate on a public broker, then move to an authenticated TLS broker by changing config only — no code change. Use when you need fire-and-observe delegation, multi-job fan-out across tmux sessions, or a uniform completion-signal protocol shared by several agent types."
version: 1.0.0
author: Hermes Agent
license: MIT
platforms: [linux, macos, windows]
metadata:
hermes:
tags: [agent-delegation, mqtt, jobs, orchestration, async-completion]
related_skills: [claude-code, codex, opencode, hermes-agent-skill-authoring]
---
# multi-agent-mux-delegate-job — Async Job Delegation over MQTT
Delegate a unit of work to an autonomous agent, then **observe** it instead of
blocking on it. Every job gets a unique id and a registry record; the agent
publishes lifecycle events (`started`, `permission_required`, `progress`,
`completed`, `error`) to a per-job MQTT topic; the delegator subscribes and
treats `completed`/`error` — or a timeout — as the terminal state.
This skill is a **reference implementation**: copy the files in this directory
into your project and customise. The `communication_over_mqtt` project is the
canonical concrete instance.
## Overview
The model is deliberately small. A **job** is one delegated task. An **agent**
is a worker (a claude-code tmux session, a codex run, a human). The **registry**
(`.mam/jobs/<id>.json`) holds everything about a job so nothing important
lives in environment variables — which means one tmux session can process many
jobs sequentially, and many sessions can fan out in parallel, with no env
collisions. The **event channel** is one MQTT topic per job carrying JSON
payloads; `event` discriminates the type.
Responsibility is split into exactly one entry point each:
[`publish_event.py`](./scripts/publish_event.py) emits events (registry lookup,
monotonic `seq`, retry+backoff) and [`job_subscriber.py`](./scripts/job_subscriber.py)
observes them (timeouts, terminal state machine, defensive parsing). Shared
logic lives in [`mqtt_common.py`](./scripts/mqtt_common.py); registry I/O in
[`registry.py`](./scripts/registry.py). The demo `publisher.py`/`subscriber.py`
in the host project stay frozen.
Two stages, same code. **PoC** runs on the public `broker.hivemq.com` to wire up
the protocol. **Production** moves to your own authenticated TLS broker — the
switch is **config only** (env vars + the registry `broker.*` block), never a
code change. See [`mqtt-broker-setup.md`](./mqtt-broker-setup.md).
## When to Use / When NOT to Use
**Use when:**
- you want **fire-and-observe** delegation — kick off work and get a completion
signal rather than blocking a terminal;
- several agent types (claude-code, codex, opencode, human) must follow **one**
completion protocol;
- you need **multi-job fan-out** across tmux sessions with safe job claiming;
- you want a clean PoC → authenticated-broker upgrade path.
**Do NOT use when:**
- a one-shot `claude -p '…'` that returns inline is enough (no async signal
needed) — just use the [claude-code](../claude-code/SKILL.md) skill directly;
- you need request/response RPC or large artifact transfer (this is a
one-direction event stream, not a data bus);
- the payload would carry secrets and you're still on the public broker — move
to the own-broker stage first.
## Quick Start
The one-line wrapper handles register + subscriber-first + agent launch. If
you're new, **start here** and only fall back to the manual 5-step flow when
you need finer control.
```bash
# 1) one line: register → start subscriber → launch agent in tmux
# (uses public broker by default; last stdout line is the audit-log dir)
multi-agent-mux-delegate-job submit \
--agent claude-code \
--prompt "정렬 문제 10개를 만들어 sort_problems.md로 저장" \
--workdir /path/to/project \
--agent-session tmux:demo \
--timeout 3600 --idle-timeout 120
# → stdout: registered job: <JID>
# subscriber pid: …
# agent launched in tmux session: demo
# subscriber output: <one line per event>
# /path/to/project/.mam/delegate_job_logs/<JID> ← audit log dir
# 2) at any time, query the job or its audit log
multi-agent-mux-delegate-job status --job <JID>
multi-agent-mux-delegate-job logs <JID> # pretty timeline
multi-agent-mux-delegate-job logs --list # every job, live status
# 3) run a user-supplied validator against the job's artifacts
multi-agent-mux-delegate-job verify --job <JID> --validate ./validate.sh
```
The wrapper enforces the **subscribe-before-publish** ordering and **forwards
the freshly-minted `JOB_ID` into the agent's prompt** (so the agent calls
`publish_event.py --job <JID>` with the right id — see Pitfall §"Wrong job_id
propagated to the agent"). When you need finer control, the manual flow is:
```bash
# Manual 5-step (same outcome, more knobs)
PY=.venv/bin/python
SKILL=./.agents/skills/multi-agent-mux-delegate-job/scripts
# 1) register
JID=$($PY "$SKILL/registry.py" register \
--prompt "…" --agent claude-code --agent-session tmux:demo \
--timeout 3600 --idle-timeout 120)
# 2) START THE SUBSCRIBER FIRST (MQTT does not queue non-retained msgs)
$PY "$SKILL/job_subscriber.py" --job "$JID" --timeout 3600 --idle-timeout 120 &
# 3) pass JID to the agent and instruct it to publish events with --job "$JID"
# (don't hard-code a job id you saw earlier — see Pitfall §"Wrong job_id")
# 4) on completion the subscriber prints events and exits 0/1/2
# 5) inspect any time
$PY "$SKILL/registry.py" get --job "$JID"
$PY "$SKILL/registry.py" logs "$JID" # positional job id
$PY "$SKILL/registry.py" logs --list
```
## Job Protocol
One topic per job: `python/mqtt/jobs/<job_id>/events`. Payload (JSON, UTF-8,
`schema_version=1`):
```json
{ "schema_version": 1, "seq": 7, "job_id": "abc12345",
"event": "started|permission_required|progress|completed|error",
"timestamp": "2026-06-19T09:32:00Z", "detail": "generalised text",
"data": { "optional": "metadata" } }
```
- `seq` is monotonic per job (first = 1); the subscriber uses it to spot
reorder/duplication.
- `timestamp` is advisory — timeouts are measured from **receive** time.
- `detail`/`data` carry **no** secrets or absolute paths.
- A `schema_version` or `job_id` mismatch is **dropped** (defensive parsing).
`started` and `completed`/`error` are the mandatory bookends; `completed`→exit 0,
`error`→exit 1. Full catalogue + production `auth_token` handling:
[`job-protocol.md`](./job-protocol.md).
## Registry Format
```
.mam/jobs/<id>.json # metadata record (single source of truth)
.mam/jobs/<id>.events.log # append-only JSON-lines log (debug, optional)
.mam/jobs/.lock # fcntl advisory lock for the registry
```
The record holds `status`, `prompt`, `agent`, `agent_session`, a `broker` block,
`topic_prefix`, `timeout_sec`/`idle_timeout_sec`, `expected_artifacts`,
`last_seq`, and (production) `auth_token`. Because the `broker` block lives in
the record, `publish_event.py` connects from the registry alone. Concurrency,
the atomic rename trick, and multi-session job claiming are in
[`registry.md`](./registry.md).
## Audit Logs
Every job's lifecycle is mirrored to a **persistent, append-only audit log**
under `.mam/delegate_job_logs/` (override with `DELEGATE_JOB_LOGS_DIR`;
default `<cwd>/.mam/delegate_job_logs`). Unlike the registry — live state
mutated in place and liable to be cleaned up — the audit log is durable
history you can replay after the fact. It is git-ignored.
```
.mam/delegate_job_logs/<job_id>/
meta.json # registration snapshot: prompt, agent, broker, timeouts, …
events.ndjson # append-only, one JSON event per line, in time order
status.json # current status only (fast point-query)
```
**What is logged, automatically:**
| When | `events.ndjson` line | Written by |
|------|----------------------|------------|
| job registered | `registered` (also seeds meta.json + status.json) | `registry.register_job` |
| any status change | `status_changed` (`from`/`to`; also rewrites status.json) | `update_job_status`, `pick_pending` |
| event published | `published` (carries the exact payload — reproducible) | `publish_event.py` |
| event received | `received` (subscriber's external view) | `job_subscriber.py` |
Both the emitter side (`published`) and the observer side (`received`) are
recorded, so a dropped publish or a missed receive is still visible from the
other. Every write is **best-effort and isolated** — an fcntl-locked append
guarded by `try/except` that only ever emits a `logger.warning`, so a logging
failure can never break a publish, a subscribe, or a registry write. stdout is
never touched.
**Reading them:**
```bash
multi-agent-mux-delegate-job logs <job_id> # pretty-print one job's timeline
multi-agent-mux-delegate-job logs --list # summarise every logged job (with live status)
# or directly via the registry CLI:
$PY scripts/registry.py logs <job_id> [--tail N] [--json]
$PY scripts/registry.py logs --list [--json]
```
`submit` prints the job's audit-log directory as its last stdout line, so a
caller can `tail -n1` to locate it.
## Broker Setup
| Stage | Broker | Auth | Transport |
|-------|--------|------|-----------|
| PoC | `broker.hivemq.com` | none | 1883 plaintext |
| Production | self-hosted Mosquitto/EMQX | user/pass + ACL | 8883 TLS |
All connection settings come from env (`MQTT_BROKER`, `MQTT_PORT`, `MQTT_TLS`,
`MQTT_USERNAME`/`MQTT_PASSWORD`, `MQTT_CA_CERTS`, …) resolved by
`broker_config_from_env()`, with the registry `broker.*` block overriding per
job. Moving to your own broker is **config only**: install Mosquitto, set
`persistence true` + `acl_file` + `password_file` + a TLS `listener 8883`, grant
the worker `write python/mqtt/jobs/+/events` and Hermes `read`, then flip
`MQTT_TLS=1` and fill the registry `broker.*`. Step-by-step (conf, ACL,
`mosquitto_passwd`, self-signed/private-CA certs, cut-over verification):
[`mqtt-broker-setup.md`](./mqtt-broker-setup.md).
## Agent Adapters
Each agent voluntarily follows the contract: receive a `JOB_ID` (or registry
path), call `publish_event.py` at lifecycle points, exit 0/1/2. **The contract
in one line**: every event call uses `--job "$JOB_ID"` where `$JOB_ID` is the
**freshly-issued id from the registry record for *this* delegation** — never a
job_id you saw in an earlier session (Pitfall §"Wrong job_id propagated to the
agent").
- **claude-code** — Claude Code calls `publish_event.py` via its Bash tool at
lifecycle points. `submit --mode tmux` injects a prompt that already names
`$JOB_ID`; if you drive claude manually, hand it the id explicitly. Reference
instruction block (the wrapper injects something equivalent):
```text
Your job_id is "$JOB_ID" (read it from the registry record for this delegation —
do not reuse any job_id you saw before).
On start: $PY multi-agent-mux-delegate-job/scripts/publish_event.py --job "$JOB_ID" --event started
On permission: $PY … --job "$JOB_ID" --event permission_required --detail "<tool>:<what>"
On progress: $PY … --job "$JOB_ID" --event progress --detail "<short status>"
On success: $PY … --job "$JOB_ID" --event completed --detail "<one-line summary>"
On failure: $PY … --job "$JOB_ID" --event error --detail "<one-line reason>"
Task: <the user's prompt>
The subscriber for "$JOB_ID" is already running; your completed/error event
ends the job. Exit codes: 0 completed, 1 error, 2 publish failure.
```
See [claude-code](../claude-code/SKILL.md) for tmux orchestration patterns.
- **codex** — same contract. Invoke `codex exec "<instruction-block-above>"` or
wire `publish_event.py` as an MCP tool so the agent can call it directly.
- **opencode** — wire `publish_event.py` as a tool/command the agent can call;
identical event points.
- **human** — a person does the work, reads the registry record, then runs
`publish_event.py --job <id> --event completed` (or `error`) by hand.
## User Interface
The [`multi-agent-mux-delegate-job`](./multi-agent-mux-delegate-job) bash wrapper bundles register +
subscribe-first + run-agent + validate:
```bash
multi-agent-mux-delegate-job submit --agent claude-code \
--prompt "정렬 문제 10개를 만들어 sort_problems.md로 저장" \
--workdir /path/to/project --timeout 3600 [--validate ./validate.sh]
multi-agent-mux-delegate-job status --job <id> # one record, pretty-printed
multi-agent-mux-delegate-job list # all jobs, one line each
multi-agent-mux-delegate-job verify --job <id> --validate ./validate.sh # runs it, reports exit code
multi-agent-mux-delegate-job wait [--job <id>] # block until terminal (else --wait-any)
```
`submit` **always starts the subscriber before the agent** (the ordering
dependency), runs the agent in `--mode print` (one-shot) or `--mode tmux`, and
calls `--validate` afterward if given. The skill automates job-id generation,
registry creation, broker resolution, subscriber-first ordering, agent launch,
and completion detection; it does **not** automate the agent's internals or your
business-logic validation — those are hooks you fill (`validate.sh` reads
`$JOB_ID`/`$REGISTRY_DIR`).
## Common Pitfalls
- **Publishing before subscribing** — MQTT does not queue non-retained messages
for absent subscribers. Start `job_subscriber.py` *before* the agent, or rely
on retained terminal events (production). `submit` enforces this.
- **Wrong job_id propagated to the agent** — the wrapper prints a fresh `JOB_ID`
on every `submit`. If your agent instruction (or the wrapper's prompt template)
hard-codes an old job_id, the agent calls `publish_event.py --job <wrong>`,
the subscriber's defensive parser drops it as a `job_id` mismatch, and the
delegator waits until idle timeout (exit 2). Fix: instruct the agent to
**read the job_id from the registry record for *this* delegation** (or pass it
in via env / `--prompt` interpolation), never from prior runs. `submit`'s
default prompt template interpolates `$JOB_ID` for you — if you build a custom
prompt, do the same.
- **tmux session name collision** — `submit --mode tmux` derives the session
name from `--agent-session tmux:<name>` (default `tmux:claude`). If a session
  with that name already exists (e.g. you ran the demo and the previous
session is still open), `tmux new-session -d -s <name>` fails and the agent
never launches. Pick a unique `--agent-session` per concurrent delegation
(e.g. `tmux:demo`, `tmux:claude-a`, `tmux:claude-b`) or kill the stale one
(`tmux kill-session -t claude`) before re-running.
- **Timeout before `started`** — a cold-starting agent may not emit `started`
for a while; the wall-clock timeout starts at subscribe time so a stuck agent
still terminates. Don't set `--timeout` so low you false-positive a slow start.
- **No retry on publish** — a dropped `completed` would hang the delegator
forever; `publish_event.py` retries with exponential backoff and exits 2 if it
still fails, so the delegator is never left waiting silently.
- **QoS-1 duplicates / reorders** — a terminal event can arrive twice, or
`error` can trail `completed`; the subscriber's terminal state machine
finalises each job once and ignores the rest.
- **Trusting the public broker** — anyone can publish there; never make a real
decision on a PoC signal. Add `auth_token` + an authenticated broker first.
- **Secrets in `detail`/`data`** — keep payloads generalised; no paths, keys, or
  tokens. The only production addition is `data.hmac_sig`, a signature derived
  from the per-job `auth_token` — never the raw token itself.
## Subagent Orchestration Pattern
When using this skill from a Hermes `delegate_task` subagent to dispatch work to
a coding-agent CLI (agy/claude) running in a tmux session, the following pattern
has been verified (2026-06-21, 6-batch refactoring sprint):
### Roles
- **Main worker** (implementation): one agent session (e.g. `agy-new`) receives
brief files and executes code changes.
- **Reviewers** (spec compliance + code quality): two other agent sessions
(e.g. `agy-existing`, `claude-existing`) review the diff in parallel.
- **Hermes** (orchestrator): dispatches subagents, verifies diffs, commits,
and falls back to direct fixes when reviewers find issues.
### Key lessons learned
1. **Brief delivery via file path** — don't paste long briefs inline via
`tmux send-keys`; the TUI may swallow them. Instead, send a short instruction
like "follow /tmp/batch1-brief.md" and let the agent read the file.
2. **Polling vs MQTT subscriber** — for short tasks (<5min), pane polling
(`capture-pane` + grep for completion markers) is simpler and more reliable
than registering a job via `registry.py` + `job_subscriber.py`. Use MQTT
subscriber only for long-running jobs (>5min) where push notification matters.
3. **Reviewers catch different bugs** — in practice, agy (Flash) caught
semantic issues (slash matching, export scope), while claude (Opus) caught
API signature mismatches (paho v2 5-arg vs 4-arg `on_disconnect`). Two
reviewers with different models provide complementary coverage.
4. **Hermes fallback fix** — when reviewers find a small, well-defined issue
(wrong argument count, missing slash), Hermes should fix it directly rather
than re-dispatching the implementer. This saves a full round-trip.
5. **Batch grouping** — group 2-3 FW items per batch when they touch different
files (no file overlap). This amortises the dispatch overhead. Items touching
the same file must be in separate batches to avoid conflicts.
6. **Pane Snapshots & Truncation Prevention** — to prevent long agent responses from being scrolled out and truncated due to TUI viewport limitations, enforce the following snapshotting pattern:
- Immediately after dispatching a brief, capture the pre-brief pane buffer via `capture-pane -S -200`.
- During long execution, run a background loop taking incremental snapshots (e.g. every 30 seconds `>> /tmp/pane-snap.txt`).
- Immediately after job termination, capture the entire final pane state to ensure no terminal logs are lost.
## Verification Checklist
- [ ] `started` → `completed` over the public broker: subscriber prints the
lines and exits **0**.
- [ ] `error` path: subscriber exits **1**.
- [ ] timeout path: no terminal event within `--timeout`/`--idle-timeout` →
exit **2**.
- [ ] polluted payload (bad JSON, wrong `schema_version`, wrong `job_id`) is
dropped with a warning, not crashed on.
- [ ] one tmux session processes two registry jobs in sequence; a second
session with a different `agent_session` claims only its own.
- [ ] broker cut-over: same scripts reach an authenticated TLS broker with env
changes only; a credential without write ACL is rejected; a late
subscriber still receives the retained terminal event.
- [ ] `publisher.py`/`subscriber.py`/`README.md` demo on `python/mqtt/sample`
still works unchanged (regression).
- [ ] **audit log integrity** — for a completed job,
`.mam/delegate_job_logs/<JID>/events.ndjson` contains `registered` →
`received started` → `published completed` (in that order), and
`status.json.status == "completed"` matches the registry record. A
logging failure (e.g. read-only log dir) does not break the publish or
subscribe path — only a `logger.warning` is emitted.
- [ ] **end-to-end demo smoke** — run
`multi-agent-mux-delegate-job submit --agent claude-code --agent-session tmux:demo-smoke
--prompt "echo hello and call publish_event.py --job <JID>
--event completed" --timeout 120` and confirm
(a) registered job id echoed, (b) subscriber pid echoed, (c) tmux session
name printed, (d) `events.ndjson` grows as the agent runs, (e) final
stdout line is the audit-log dir.
@@ -0,0 +1,114 @@
# Job Event Protocol
The wire contract every multi-agent-mux-delegate-job agent (claude-code, codex, opencode,
human, …) speaks. One job → one MQTT topic → JSON event payloads. Stable across
the PoC (public broker) and production (own broker) stages; only transport
hardening changes, never the payload shape.
Reference implementation: [`./scripts/publish_event.py`](./scripts/publish_event.py)
(emit) and [`./scripts/job_subscriber.py`](./scripts/job_subscriber.py) (observe).
---
## 1. Topic design
| Topic | Purpose |
|-------|---------|
| `python/mqtt/sample` | Legacy demo topic — **never changed** (README compat). |
| `python/mqtt/jobs/<job_id>/events` | Per-job event stream (this protocol). |
- One topic per job, JSON payload, `event` field discriminates the type.
- Single-direction publish only (worker → observer). No request/response.
- Future split is reserved but not required:
`<job_id>/events`, `<job_id>/logs`, `<job_id>/artifacts`.
- `topic_prefix` is stored in the job record so publishers resolve the topic
from the registry alone (`<topic_prefix>/events`).
---
## 2. Payload schema (JSON, UTF-8, `schema_version = 1`)
```json
{
"schema_version": 1,
"seq": 7,
"job_id": "abc12345",
"event": "started | permission_required | progress | completed | error",
"timestamp": "2026-06-19T09:32:00Z",
"detail": "generalised, whitelisted human-readable string",
"data": { "optional": "metadata" }
}
```
| Field | Rule |
|-------|------|
| `schema_version` | If publisher/subscriber disagree, the subscriber **drops** the event with a warning (defensive parsing). |
| `seq` | Monotonic **per `job_id`**, first publish = 1. Lets the subscriber detect reorder/duplication. Persisted in the registry (`last_seq`) so it survives restarts. |
| `job_id` | Subscriber drops any event whose `job_id` it did not subscribe for. |
| `timestamp` | Publisher host clock, **advisory only**. The delegator's timeout is measured from *receive* time, not this field. |
| `detail` | Generalised text only. **No absolute paths, keys, or tokens.** |
| `data` | Optional metadata. Production may add `hmac_sig`, `build_id`, etc. |
---
## 3. Event catalogue
| event | When emitted | `detail` example | seq |
|-------|--------------|------------------|-----|
| `started` | Agent first picks up the job | `"Job a1b2c3d4 started"` | 1 |
| `permission_required` | Agent needs a tool/permission grant | `"needs to write sort_problems.md"` | as it happens |
| `progress` | Optional intermediate checkpoint | `"creating problem 5/10"` | as it happens |
| `completed` | Successful terminal state | `"saved to sort_problems.md"` | last |
| `error` | Failure / exception terminal state | `"internal error, see logs"` | last |
`started` and `completed`/`error` are mandatory bookends; `permission_required`
and `progress` are optional. `detail` must stay on the whitelist of generalised
phrasings — never leak secrets through it.
### Terminal semantics
- `completed` → subscriber exits 0; `error` → exits 1.
- The subscriber runs a **terminal state machine**: it finalises a job on the
first `completed`/`error` it sees and ignores any later terminal event for
that job (QoS-1 duplicate, or an `error`-after-`completed` reorder). When all
watched jobs are finalised it exits.
- Wall-clock timeout *or* idle timeout before a terminal event → exit 2.
---
## 4. Production hardening (own broker stage)
The payload shape is unchanged; the transport and trust model tighten. See
[`mqtt-broker-setup.md`](./mqtt-broker-setup.md) for the broker side.
- **Auth / ACL** — username/password + per-topic ACL. `jobs/+/events` publish is
granted to the worker credential, subscribe to the Hermes credential.
- **HMAC Signature Verification (`data.hmac_sig`)** — to authenticate the publisher and verify message integrity without exposing the raw secret token over the wire, each job record contains a per-job `auth_token` (`secrets.token_urlsafe(32)`). The publisher computes an HMAC-SHA256 signature over the serialized payload (excluding `data.hmac_sig` itself) using the `auth_token` as the key, and appends it to **`data.hmac_sig`**. The subscriber reconstructs this signature and **drops any message that does not match or lacks a valid signature**.
```json
{ "...": "...", "data": { "hmac_sig": "d2f3...", "build_id": "42" } }
```
- **TLS** — port 8883 + private CA. Toggled with `MQTT_TLS=1` (+ `MQTT_CA_CERTS`);
no code change.
- **Retained terminal events** — `completed`/`error` publish with `retain=True`
so a subscriber that joins late immediately receives the last terminal state
instead of a stale view. The reference publisher auto-retains terminal events;
`--retained` forces it for any event.
- **Dual timeouts** — total wall-clock budget + last-activity idle detection,
both measured from receive time.
- **Clock trust** — never trust the payload `timestamp` for timeout decisions.
---
## 5. Why a public broker is PoC-only
On `broker.hivemq.com` anyone can publish/subscribe the same topic. Therefore:
- No secret data in payloads.
- `started`/`completed`/`error` are *signals*, never a basis for a security
decision.
- Non-retained messages are **not queued** for absent subscribers — start the
subscriber **before** the agent (ordering dependency), or rely on retained
terminal events in production.
- Real operational decisions belong to the own-broker stage with auth + ACL.
@@ -0,0 +1,176 @@
# MQTT Broker Setup — PoC → Production
The multi-agent-mux-delegate-job scripts read **all** broker settings from environment
variables (or a job record's `broker.*` block) through a single helper,
`broker_config_from_env()` in
[`./scripts/mqtt_common.py`](./scripts/mqtt_common.py). The design goal:
**switch from the public PoC broker to your own broker with config only — no
code change.**
| Env var | Meaning | PoC default | Production |
|---------|---------|-------------|-----------|
| `MQTT_BROKER` | host | `broker.hivemq.com` | internal hostname/IP |
| `MQTT_PORT` | port | `1883` | `8883` (TLS) |
| `MQTT_TLS` | TLS on/off (`1`/`0`) | `0` | `1` |
| `MQTT_USERNAME` / `MQTT_PASSWORD` | auth | (none) | broker-issued |
| `MQTT_CA_CERTS` | CA bundle path | (none) | private CA path |
| `MQTT_CERTFILE` / `MQTT_KEYFILE` | client cert (optional mTLS) | (none) | per-client |
| `MQTT_CLIENT_ID_PREFIX` | client id prefix | `hermes` | per-environment |
---
## 1. PoC: public broker (`broker.hivemq.com`)
**Pros** — zero setup, reachable from anywhere, perfect for wiring up the
publish/subscribe loop and the timeout/state-machine logic.
**Cons / accepted assumptions** — no auth, no integrity, shared with the world:
- no secrets in payloads;
- `started`/`completed`/`error` are advisory signals only;
- non-retained messages are **not queued** for absent subscribers, so the
subscriber must start before the agent;
- a re-subscribing client cannot recover past (non-retained) events.
Use it only to validate the protocol, never for real decisions.
---
## 2. Production: self-hosted Mosquitto (or EMQX)
Both support MQTT 5 + ACL + TLS. Mosquitto shown below; EMQX is a drop-in for
the same env vars.
### 2.1 Install
```bash
# macOS
brew install mosquitto
# Debian/Ubuntu
sudo apt-get update && sudo apt-get install -y mosquitto mosquitto-clients
# Docker
docker run -d --name mosquitto -p 8883:8883 \
-v "$PWD/mosquitto.conf:/mosquitto/config/mosquitto.conf" \
-v "$PWD/certs:/mosquitto/certs" \
-v "$PWD/auth:/mosquitto/auth" \
eclipse-mosquitto:2
```
### 2.2 `mosquitto.conf` (key lines)
```conf
persistence true
persistence_location /mosquitto/data/
password_file /mosquitto/auth/passwd
acl_file /mosquitto/auth/acl
allow_anonymous false
listener 8883
cafile /mosquitto/certs/ca.crt
certfile /mosquitto/certs/server.crt
keyfile /mosquitto/certs/server.key
```
`persistence true` + QoS 1 + retained terminal events means a subscriber that
joins after a job finished still sees the final `completed`/`error`.
### 2.3 Users (username/password)
```bash
# create the file with the first user (-c), then add more users without -c
mosquitto_passwd -c /mosquitto/auth/passwd hermes # subscriber/delegator
mosquitto_passwd /mosquitto/auth/passwd claude-worker # publisher/agent
# (omit -c after the first; -c truncates the file)
```
### 2.4 ACL — least privilege
The worker only **publishes** events; Hermes only **subscribes**:
```conf
# /mosquitto/auth/acl
# claude-worker: may publish job events, may not read others' streams
user claude-worker
topic write python/mqtt/jobs/+/events
# hermes: observes every job's events
user hermes
topic read python/mqtt/jobs/+/events
# keep the legacy demo topic usable for both, if desired
pattern readwrite python/mqtt/sample
```
### 2.5 TLS certificates
**Quick self-signed (single host, internal only):**
```bash
mkdir -p certs && cd certs
openssl req -x509 -newkey rsa:2048 -nodes -days 825 \
-keyout server.key -out server.crt \
-subj "/CN=mqtt.internal"
cp server.crt ca.crt # clients trust this as the CA bundle
```
**Private CA (recommended — separate CA from server cert):**
```bash
# 1) CA
openssl genrsa -out ca.key 4096
openssl req -x509 -new -nodes -key ca.key -days 3650 -out ca.crt -subj "/CN=Hermes-CA"
# 2) server cert signed by the CA
openssl genrsa -out server.key 2048
openssl req -new -key server.key -out server.csr -subj "/CN=mqtt.internal"
openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial \
-out server.crt -days 825
```
Clients trust `ca.crt` via `MQTT_CA_CERTS=/path/to/ca.crt`.
---
## 3. Cut-over verification (config-only, no code change)
Goal: prove the **same scripts** talk to your broker by changing only env/registry.
```bash
# 1) point the env at the new broker
export MQTT_BROKER=mqtt.internal
export MQTT_PORT=8883
export MQTT_TLS=1
export MQTT_CA_CERTS=$PWD/certs/ca.crt
export MQTT_USERNAME=hermes
export MQTT_PASSWORD='<hermes-password>'   # subscriber side
# (publisher side uses claude-worker creds via the job record's broker block)
# 2) sanity-check with the mosquitto CLI first
mosquitto_sub -h "$MQTT_BROKER" -p 8883 --cafile "$MQTT_CA_CERTS" \
-u hermes -P "$MQTT_PASSWORD" -t 'python/mqtt/jobs/+/events' -v &
# 3) run the unchanged multi-agent-mux-delegate-job loop
PY=.venv/bin/python
JID=$($PY scripts/registry.py register --prompt "broker cutover smoke")
$PY scripts/job_subscriber.py --job "$JID" --timeout 30 &
sleep 3
$PY scripts/publish_event.py --job "$JID" --event started
$PY scripts/publish_event.py --job "$JID" --event completed # auto-retained
```
Expected:
- subscriber prints the `started` and `completed` lines and exits 0;
- `mosquitto_sub` shows the same events (ACL allows `hermes` to read);
- publishing as a credential **without** write ACL is rejected by the broker;
- a subscriber started *after* `completed` still receives it (retained).
If all four hold, the migration is config-only. Persist the broker block into
each job record so `publish_event.py` connects from the registry alone:
```json
"broker": { "host": "mqtt.internal", "port": 8883, "tls": true,
"username": "claude-worker", "password": "…" }
```
@@ -0,0 +1,277 @@
#!/usr/bin/env bash
# multi-agent-mux-delegate-job — user-facing orchestrator for the multi-agent-mux-delegate-job skill.
#
# Subcommands:
# submit register a job, start the subscriber FIRST, then run the agent,
# then (optionally) run a validation script.
# status show one job record.
# list list all jobs.
# verify run a user-supplied --validate script against a job's artifacts.
# wait block until all running/pending jobs reach a terminal state.
#
# This is a reference wrapper: it shells out to the python scripts that live
# next to it. Copy it into your project and customise as needed. With --dry-run
# it only prints the instructions it would send; otherwise a missing `tmux` or a
# missing agent session is a hard error.
# Strict mode: abort on unhandled errors, unset variables, and failed pipeline
# stages.
set -euo pipefail
# Absolute directory containing this script; the helper scripts and
# requirements.txt are referenced relative to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
#######################################
# Pick an interpreter: $DELEGATE_JOB_PYTHON if set, else a project .venv
# (under $WORKDIR, then under the current directory), else python3.
# Globals:   DELEGATE_JOB_PYTHON (read), WORKDIR (read), SCRIPT_DIR (read)
# Arguments: none
# Outputs:   the chosen interpreter on stdout; an install hint on stderr when
#            that interpreter cannot import paho.mqtt
# Returns:   exits 1 when paho-mqtt is missing. Callers run this inside
#            $(...), so the exit ends the substitution's subshell.
#######################################
pick_python() {
  local py_bin
  # Resolve the work dir once so the -x probe and the chosen path cannot
  # disagree when WORKDIR is unset or empty.
  local workdir="${WORKDIR:-.}"
  if [[ -n "${DELEGATE_JOB_PYTHON:-}" ]]; then
    py_bin="$DELEGATE_JOB_PYTHON"
  elif [[ -x "$workdir/.venv/bin/python" ]]; then
    # Return an absolute path: the caller may `cd` into the work dir before
    # running the interpreter, which would invalidate a relative one.
    py_bin="$(cd "$workdir" && pwd)/.venv/bin/python"
  elif [[ -x ".venv/bin/python" ]]; then
    py_bin="$(pwd)/.venv/bin/python"
  else
    py_bin="python3"
  fi
  if ! "$py_bin" -c "import paho.mqtt" 2>/dev/null; then
    echo "ERROR: paho-mqtt package is missing for $py_bin." >&2
    echo " Please create a virtual environment and install it:" >&2
    echo " python3 -m venv .venv && .venv/bin/pip install -r \"$SCRIPT_DIR/requirements.txt\"" >&2
    exit 1
  fi
  echo "$py_bin"
}
# Default job-registry directory; it is a relative path and can be overridden
# per invocation with --registry-dir.
REGISTRY_DIR_DEFAULT=".mam/jobs"
# Print the command synopsis to stdout. The heredoc delimiter is quoted, so the
# text is emitted literally with no variable or command expansion.
usage() {
cat <<'EOF'
multi-agent-mux-delegate-job <command> [options]
submit --agent <name> --prompt <text> [--workdir <dir>] [--agent-session <label>]
[--timeout <sec>] [--idle-timeout <sec>] [--validate <script>]
[--registry-dir <dir>] [--dry-run]
# The skill is tmux-interactive only; --mode print was removed.
status --job <id> [--registry-dir <dir>]
list [--registry-dir <dir>]
verify --job <id> --validate <script> [--registry-dir <dir>]
wait [--job <id>] [--timeout <sec>] [--registry-dir <dir>]
logs <job_id> | --list # persistent audit log (delegate_job_logs/)
EOF
}
# ---- arg parsing helpers --------------------------------------------------
# Defaults for the settings parse_opts fills in. They are plain globals, so
# each cmd_* function reads whatever its own command line set.
AGENT="claude-code"; PROMPT=""; WORKDIR="$(pwd)"; AGENT_SESSION="tmux:claude"
# TIMEOUT / IDLE_TIMEOUT are passed to the subscriber as --timeout /
# --idle-timeout (seconds per usage); VALIDATE is an optional script path;
# DRY_RUN=1 makes run_agent print the instructions instead of dispatching them.
TIMEOUT=3600; IDLE_TIMEOUT=120; VALIDATE=""; DRY_RUN=0
JOB_ID=""; REGISTRY_DIR="$REGISTRY_DIR_DEFAULT"
#######################################
# Parse the option flags shared by every subcommand into the global settings.
# Globals:   AGENT, PROMPT, WORKDIR, AGENT_SESSION, TIMEOUT, IDLE_TIMEOUT,
#            VALIDATE, JOB_ID, REGISTRY_DIR, DRY_RUN (written)
# Arguments: the subcommand's remaining command-line words
# Outputs:   an error on stderr plus the usage text for a bad command line
# Returns:   exits 1 on an unknown option, or when a value-taking option is
#            the last word on the command line
#######################################
parse_opts() {
  while [[ $# -gt 0 ]]; do
    # Every option except --dry-run consumes the following word. Check it is
    # there first: otherwise "$2" trips `set -u` with a cryptic "unbound
    # variable", and without -u a failing `shift 2` would never advance.
    case "$1" in
      --agent|--prompt|--workdir|--agent-session|--timeout|--idle-timeout|--validate|--job|--registry-dir)
        if [[ $# -lt 2 ]]; then
          echo "option $1 requires a value" >&2
          usage
          exit 1
        fi
        ;;
    esac
    case "$1" in
      --agent) AGENT="$2"; shift 2 ;;
      --prompt) PROMPT="$2"; shift 2 ;;
      --workdir) WORKDIR="$2"; shift 2 ;;
      --agent-session) AGENT_SESSION="$2"; shift 2 ;;
      --timeout) TIMEOUT="$2"; shift 2 ;;
      --idle-timeout) IDLE_TIMEOUT="$2"; shift 2 ;;
      --validate) VALIDATE="$2"; shift 2 ;;
      --job) JOB_ID="$2"; shift 2 ;;
      --registry-dir) REGISTRY_DIR="$2"; shift 2 ;;
      --dry-run) DRY_RUN=1; shift ;;
      *) echo "unknown option: $1" >&2; usage; exit 1 ;;
    esac
  done
}
#######################################
# submit: register a job, start its subscriber, hand the instructions to the
# agent, wait for the job's terminal event, then optionally validate.
# Globals:   the parse_opts settings, SCRIPT_DIR, DELEGATE_JOB_LOGS_DIR (read);
#            PY, JOB_ID, WORKDIR (written)
# Arguments: submit's command-line options (see usage)
# Outputs:   progress lines on stdout; the LAST stdout line is the audit-log
#            directory for the job, so callers can scrape `tail -n1`
# Returns:   0 once the subscriber has finished (its exit status is not
#            propagated); the script exits non-zero if registration or
#            dispatch fails
#######################################
cmd_submit() {
  parse_opts "$@"
  [[ -n "$PROMPT" ]] || { echo "submit requires --prompt" >&2; exit 1; }
  PY="$(pick_python)"
  # A relative interpreter path (e.g. sub/.venv/bin/python) would stop
  # resolving after the cd below, so anchor it to the current directory first.
  if [[ "$PY" == */* && "$PY" != /* ]]; then
    PY="$(pwd)/$PY"
  fi
  cd "$WORKDIR"
  # Pin WORKDIR to the absolute path now that we are inside it, so the
  # audit-log path printed at the end stays valid for a relative --workdir.
  WORKDIR="$(pwd)"
  mkdir -p "$REGISTRY_DIR"
  local registry_abs
  registry_abs="$(cd "$REGISTRY_DIR" && pwd)"

  # NOTE(review): other docs in this skill describe the audit-log default as
  # <cwd>/.mam/delegate_job_logs; confirm this default against the logging
  # helper before relying on the printed path.
  local logs_root="${DELEGATE_JOB_LOGS_DIR:-$WORKDIR/delegate_job_logs}"

  # 1) register job (prints the new job id). This also happens in dry-run,
  #    because the instructions need a concrete id.
  JOB_ID="$("$PY" "$SCRIPT_DIR/scripts/registry.py" --registry-dir "$REGISTRY_DIR" register \
    --prompt "$PROMPT" --agent "$AGENT" --agent-session "$AGENT_SESSION" \
    --timeout "$TIMEOUT" --idle-timeout "$IDLE_TIMEOUT")"
  echo "registered job: $JOB_ID"

  # 2) START THE SUBSCRIBER FIRST (ordering dependency — MQTT does not queue
  #    non-retained messages for absent subscribers). Skipped in dry-run:
  #    nothing will ever publish, so a subscriber would only linger in the
  #    background until its timeout.
  local logf="$REGISTRY_DIR/$JOB_ID.subscriber.out"
  local sub_pid=""
  if [[ "$DRY_RUN" != "1" ]]; then
    "$PY" "$SCRIPT_DIR/scripts/job_subscriber.py" --registry-dir "$REGISTRY_DIR" \
      --job "$JOB_ID" --timeout "$TIMEOUT" --idle-timeout "$IDLE_TIMEOUT" \
      >"$logf" 2>&1 &
    sub_pid=$!
    echo "subscriber pid: $sub_pid (log: $logf)"
    # If dispatch aborts before the agent receives the job (tmux or the
    # session is missing), `set -e` ends the script; reap the subscriber
    # instead of orphaning it. run_agent may replace this trap with its own.
    trap "kill $sub_pid 2>/dev/null || true" EXIT
    sleep 1 # give the subscriber time to CONNACK + SUBSCRIBE before the agent runs
  fi

  # 3) run the agent (or print the command for dry-run).
  # The publish command is pasted into the agent's shell, whose cwd may differ
  # from ours: use the absolute registry dir and shell-quote every word.
  local pub
  printf -v pub '%q ' "$PY" "$SCRIPT_DIR/scripts/publish_event.py" \
    --registry-dir "$registry_abs" --job "$JOB_ID"
  pub="${pub% }"
  # NOTE: the agent MUST use --job "$JOB_ID" (the one we just minted). A job_id
  # hard-coded from an earlier session leaves the delegated job idle until it
  # times out, so the instruction header makes the freshness explicit.
  local instructions="Your job_id is \"$JOB_ID\" (the one just registered for THIS delegation — read it from the registry record, do NOT reuse any job_id you saw in earlier runs).
On start run: $pub --event started.
On permission/tool prompt run: $pub --event permission_required --detail '<tool>:<what>'.
On progress (optional): $pub --event progress --detail '<short status>'.
On success run: $pub --event completed --detail '<one-line summary>'.
On failure run: $pub --event error --detail '<one-line reason>'.
The subscriber for this job_id is already running; your completed/error event ends the job. Exit codes: 0 completed, 1 error, 2 publish failure.
Task: $PROMPT"
  run_agent "$JOB_ID" "$instructions"
  # Dispatch succeeded (or AGENT=human returned early): drop the reaper trap.
  trap - EXIT

  if [[ "$DRY_RUN" == "1" ]]; then
    # Nothing ran: no subscriber to join and no artifacts to validate. Only
    # report where the audit log for this job id would live.
    echo "$logs_root/$JOB_ID"
    return 0
  fi

  # 4) wait for the terminal event. The dispatch above is non-blocking (the
  #    instructions were pasted into the agent's tmux pane), so this wait is
  #    what tells us the job has finished.
  local sub_rc=0
  wait "$sub_pid" || sub_rc=$?
  echo "subscriber output:"; cat "$logf" || true

  # 5) optional validation hook — only after the subscriber exited 0, so the
  #    hook never inspects artifacts of a job that is still running or failed.
  if [[ -n "$VALIDATE" ]]; then
    if [[ "$sub_rc" -eq 0 ]]; then
      echo "running validation: $VALIDATE"
      if JOB_ID="$JOB_ID" REGISTRY_DIR="$REGISTRY_DIR" bash "$VALIDATE"; then
        echo "validation: PASS"
      else
        local rc=$?
        echo "validation: FAIL (exit $rc)"
      fi
    else
      echo "validation: SKIPPED (subscriber exit $sub_rc)"
    fi
  fi

  # Last stdout line: the persistent audit-log dir for this job.
  echo "$logs_root/$JOB_ID"
}
#######################################
# Hand the instructions to an agent that is ALREADY running in a tmux session.
# The skill is interactive-only: no one-shot print mode is invoked and no
# session is created here. The text is loaded into a named tmux buffer, pasted
# into the session, and submitted with Enter; the user can `tmux attach` to
# follow along.
# Globals:   AGENT, AGENT_SESSION, DRY_RUN, TMUX_SERVER_NAME, PY, SCRIPT_DIR,
#            REGISTRY_DIR (read)
# Arguments: $1 - job id, $2 - instruction text for the agent
# Outputs:   status messages on stdout, errors on stderr
# Returns:   0 on dispatch (also for AGENT=human and dry-run, which only
#            print); 1 when tmux or the target session is missing
#######################################
run_agent() {
  local job_id="$1"
  local instructions="$2"

  if [[ "$AGENT" == "human" ]]; then
    echo "[human agent] complete the task, then run publish_event.py --event completed"
    return
  fi

  # AGENT_SESSION is a label like "tmux:claude"; the tmux session is the part
  # after the prefix.
  local sess="${AGENT_SESSION#tmux:}"
  if [[ "$DRY_RUN" == "1" ]]; then
    echo "[dry-run] would delegate task to running agent '$AGENT' in tmux session '$sess' with instructions:"
    echo "----"; echo "$instructions"; echo "----"
    return
  fi

  if ! command -v tmux >/dev/null 2>&1; then
    echo "ERROR: this skill requires tmux (interactive agent sessions)." >&2
    echo " Install with: brew install tmux (or your package manager)" >&2
    return 1
  fi

  # Keep the tmux invocation in an array so a server name containing spaces or
  # glob characters stays a single argument.
  local -a tmux_cmd=(tmux)
  if [[ -n "${TMUX_SERVER_NAME:-}" ]]; then
    tmux_cmd+=(-L "$TMUX_SERVER_NAME")
  fi

  if ! "${tmux_cmd[@]}" has-session -t "$sess" 2>/dev/null; then
    echo "ERROR: 에이전트 세션 '$sess'이 존재하지 않습니다. 작업을 위임하기 전에 먼저 에이전트 세션을 기동해 주세요." >&2
    echo " 팁: 'multi-agent-mux-resume' 또는 'multi-agent-mux-create'를 통해 에이전트를 먼저 생성할 수 있습니다." >&2
    return 1
  fi

  # If one of the tmux calls below fails, `set -e` ends the script; publish an
  # `error` event on the way out so the subscriber does not wait for its
  # timeout. --registry-dir matches the publish command given to the agent, so
  # a custom registry location is honoured here too.
  if [[ -n "${job_id:-}" && -n "${PY:-}" ]]; then
    local pub_script="$SCRIPT_DIR/scripts/publish_event.py"
    trap 'rc=$?; if [ $rc -ne 0 ]; then "$PY" "$pub_script" --registry-dir "$REGISTRY_DIR" --job "$job_id" --event error --detail "agent bootstrap failed (exit $rc)"; fi' EXIT
  fi

  echo "살아있는 에이전트 세션 '$sess'에 작업을 위임합니다..."
  local buf="job_buf_$job_id"
  "${tmux_cmd[@]}" set-buffer -b "$buf" "$instructions"
  # -p wraps the paste in bracketed-paste codes when the application asked for
  # them, so a multi-line instruction arrives as one paste rather than one
  # submission per line; -d deletes the buffer as part of the paste.
  "${tmux_cmd[@]}" paste-buffer -p -d -b "$buf" -t "$sess"
  "${tmux_cmd[@]}" send-keys -t "$sess" C-m
  echo "작업이 세션 '$sess'에 전송되었습니다. (연결하려면: ${tmux_cmd[*]} attach -t $sess)"
  trap - EXIT
}
# status: print one job record via `registry.py get`.
# Requires --job; honours --registry-dir. Sets the global PY.
cmd_status() {
  parse_opts "$@"
  if [[ -z "$JOB_ID" ]]; then
    echo "status requires --job" >&2
    exit 1
  fi
  PY="$(pick_python)"
  local registry_cli="$SCRIPT_DIR/scripts/registry.py"
  "$PY" "$registry_cli" --registry-dir "$REGISTRY_DIR" get --job "$JOB_ID"
}
# list: print every job in the registry via `registry.py list`.
# Honours --registry-dir. Sets the global PY.
cmd_list() {
  parse_opts "$@"
  PY="$(pick_python)"
  local registry_cli="$SCRIPT_DIR/scripts/registry.py"
  "$PY" "$registry_cli" --registry-dir "$REGISTRY_DIR" list
}
# verify: run the user-supplied --validate script for a job.
# The script is run with bash and receives JOB_ID and REGISTRY_DIR in its
# environment. This command always exits: 0 on PASS, otherwise the script's
# own exit status (1 when --job or --validate is missing).
cmd_verify() {
  parse_opts "$@"
  if [[ -z "$JOB_ID" ]]; then
    echo "verify requires --job" >&2
    exit 1
  fi
  if [[ -z "$VALIDATE" ]]; then
    echo "verify requires --validate <script>" >&2
    exit 1
  fi
  echo "verifying job $JOB_ID with $VALIDATE"
  # Capture the hook's status without letting `set -e` abort on failure.
  rc=0
  JOB_ID="$JOB_ID" REGISTRY_DIR="$REGISTRY_DIR" bash "$VALIDATE" || rc=$?
  if [[ "$rc" -eq 0 ]]; then
    echo "verify: PASS (exit 0)"
    exit 0
  fi
  echo "verify: FAIL (exit $rc)"
  exit "$rc"
}
# logs <job_id> | logs --list — forwards to the `logs` subcommand of
# registry.py, which reads the persistent audit log.
# NOTE(review): the audit-log location is decided by the python side
# (presumably $DELEGATE_JOB_LOGS_DIR or a default under the current
# directory) — confirm there; run this from your project dir.
# Sets the global PY. Exits 1 when neither a job id nor --list is given.
cmd_logs() {
  PY="$(pick_python)"
  local target="${1:-}"
  local registry_cli="$SCRIPT_DIR/scripts/registry.py"
  case "$target" in
    --list)
      "$PY" "$registry_cli" logs --list
      ;;
    "")
      echo "logs requires <job_id> or --list" >&2
      exit 1
      ;;
    *)
      "$PY" "$registry_cli" logs "$target"
      ;;
  esac
}
# wait: run the subscriber in the foreground — for one job when --job is
# given, otherwise with --wait-any. Only --timeout is forwarded;
# --idle-timeout is not passed on. Sets the global PY.
cmd_wait() {
  parse_opts "$@"
  PY="$(pick_python)"
  local -a watch_args
  if [[ -z "$JOB_ID" ]]; then
    watch_args=(--wait-any)
  else
    watch_args=(--job "$JOB_ID")
  fi
  "$PY" "$SCRIPT_DIR/scripts/job_subscriber.py" --registry-dir "$REGISTRY_DIR" \
    "${watch_args[@]}" --timeout "$TIMEOUT"
}
# Entry point: the first word selects the subcommand and the remaining words
# are passed to its cmd_* handler. No subcommand (or -h/--help/help) prints
# usage; anything else is an error that prints usage and exits 1.
main() {
  local subcommand=""
  if [[ $# -gt 0 ]]; then
    subcommand="$1"
    shift
  fi
  case "$subcommand" in
    submit|status|list|verify|wait|logs)
      "cmd_$subcommand" "$@"
      ;;
    ""|-h|--help|help)
      usage
      ;;
    *)
      echo "unknown command: $subcommand" >&2
      usage
      exit 1
      ;;
  esac
}
# Run the dispatcher with the script's own command-line arguments.
main "$@"
@@ -0,0 +1,183 @@
# Job Registry
The registry is the **single source of truth** for delegated work. Job metadata
(id, prompt, broker, status, timeouts) lives in files, **not** environment
variables — so one tmux session can handle many jobs sequentially or in
parallel without collisions, and `publish_event.py` / `job_subscriber.py` can
reconstruct everything they need from the registry alone.
Reference implementation: [`./scripts/registry.py`](./scripts/registry.py)
(library + CLI) over the primitives in
[`./scripts/mqtt_common.py`](./scripts/mqtt_common.py).
---
## 1. Directory layout
```
.mam/jobs/
<job_id>.json # job metadata record (schema below)
<job_id>.events.log # append-only JSON-lines event log (debug, optional)
.lock # shared advisory lock (fcntl) for the whole registry
```
`registry_dir` defaults to `.mam/jobs` and is overridable everywhere via
`--registry-dir`.
---
## 2. Job record schema
```json
{
"schema_version": 1,
"job_id": "abc12345",
"status": "pending | running | completed | error | cancelled",
"created_at": "2026-06-19T09:30:00Z",
"updated_at": "2026-06-19T09:32:00Z",
"prompt": "정렬 문제 10개를 만들어 sort_problems.md로 저장…",
"agent": "claude-code",
"agent_session": "tmux:claude",
"broker": {
"host": "broker.hivemq.com",
"port": 1883,
"tls": false,
"username": null,
"password": null
},
"topic_prefix": "python/mqtt/jobs/abc12345",
"timeout_sec": 3600,
"idle_timeout_sec": 120,
"expected_artifacts": ["sort_problems.md"],
"last_seq": 0,
"auth_token": null
}
```
- `broker` lets `publish_event.py` connect from the record alone (env still
overrides toggles like `MQTT_TLS`).
- `topic_prefix` → the events topic is `<topic_prefix>/events`.
- `last_seq` backs the monotonic `seq` counter so it survives process restarts.
- `expected_artifacts` is the hook a user `validate.sh` checks (existence/content).
- `auth_token` is `null` in PoC; production sets `secrets.token_urlsafe(32)`.
---
## 3. Concurrency rules
### PoC — fcntl advisory lock
Every read-modify-write (`register_job`, `pick_pending`, `update_status`,
`next_seq`) runs inside `registry_lock(registry_dir)`, an exclusive
`fcntl.flock` over `.lock`. Single-host, good enough for many tmux sessions on
one machine.
### Production — SQLite WAL
When delegation spans **multiple hosts**, the file lock no longer serialises
across machines. Migrate the same operations to a SQLite database in WAL mode
(`PRAGMA journal_mode=WAL`) with a transaction per claim. The function
signatures stay identical; only the storage backend changes.
---
## 4. How multiple sessions take only their own work
Each tmux session carries an `agent_session` label (`tmux:claude`,
`tmux:claude-a`, `tmux:claude-b`, …). `pick_pending(agent_session)`:
1. acquires the registry lock,
2. scans for the **oldest** record with `status == "pending"` **and**
matching `agent_session`,
3. flips it to `running` and writes it back **atomically**,
4. releases the lock and returns the `job_id` (or `None`).
Because the scan + flip happen under one lock, two sessions can never claim the
same job. Sessions with distinct labels naturally partition the work; sessions
sharing a label compete safely — first to acquire the lock wins, the other sees
the job already `running` and moves on.
```bash
# session A only ever runs its own pending jobs
$PY scripts/registry.py pick --agent-session tmux:claude-a # prints id or exits 3
```
---
## 5. Atomic status updates
All writes use a temp-file + `os.replace` rename, which is atomic on POSIX:
1. take the registry lock,
2. load the current record,
3. mutate fields + refresh `updated_at` (and `last_seq` for `next_seq`),
4. write to `.<job_id>.<rand>.tmp` in the **same directory**, `fsync`,
5. `os.replace(tmp, <job_id>.json)`,
6. release the lock.
A reader therefore always sees either the old or the new complete record, never
a half-written file. This is the file-based equivalent of the rename trick
(`pending.<session>` → `running.<session>`) and maps cleanly onto a single
SQLite transaction when you migrate.
---
## 6. CLI quick reference
```bash
PY=.venv/bin/python
$PY scripts/registry.py register --prompt "…" --agent claude-code \
--agent-session tmux:claude --timeout 3600 --idle-timeout 120 # → prints job_id
$PY scripts/registry.py list # human table
$PY scripts/registry.py list --json # full records
$PY scripts/registry.py get --job <id> # one record
$PY scripts/registry.py status --job <id> --set completed # set status
$PY scripts/registry.py pick --agent-session tmux:claude # claim → running
```
Exit codes: `0` ok, `1` not found / bad status, `3` (`pick`) no pending job for
that session.
---
## 7. Persistent audit log
Separate from the registry, every job is also mirrored to a durable append-only
audit log at `.mam/delegate_job_logs/<job_id>/` (override with
`DELEGATE_JOB_LOGS_DIR`, default `<cwd>/.mam/delegate_job_logs`). The registry
is **live state** mutated in place; the audit log is **history** that survives
even after the registry dir is cleaned up. It is git-ignored.
```
.mam/delegate_job_logs/<job_id>/
meta.json # registration snapshot (the full job record at register time)
events.ndjson # append-only, one JSON event per line, time-ordered
status.json # current status only (fast point-query)
```
`events.ndjson` lines are written automatically at four points:
| Trigger | line `event` | Source |
|---------|-------------|--------|
| `register_job` | `registered` | `registry.register_job` → `mqtt_common.init_job_log` |
| status change (`update_status`, `pick`, publish status sync) | `status_changed` (`from`/`to`) | `mqtt_common.update_job_status` / `pick_pending` |
| event published | `published` (embeds the exact payload) | `publish_event.py` |
| event received | `received` | `job_subscriber.py` |
Helpers live in [`./scripts/mqtt_common.py`](./scripts/mqtt_common.py):
`LOGS_DIR`, `job_log_path`, `init_job_log`, `append_event` (fcntl-locked,
concurrent-append safe), `update_logged_status`, and the readers
`read_logged_meta` / `read_logged_status` / `iter_logged_events` /
`list_logged_jobs`. Every writer is **best-effort and isolated** — wrapped in
`try/except` with a `logger.warning`, so an audit-log failure never breaks the
registry write, the publish, or the subscribe it shadows.
Read them via the CLI:
```bash
PY=.venv/bin/python
$PY scripts/registry.py logs <job_id> # pretty timeline
$PY scripts/registry.py logs <job_id> --tail 20 # last 20 events
$PY scripts/registry.py logs <job_id> --json # raw JSON lines
$PY scripts/registry.py logs --list # every job, live status
```
@@ -0,0 +1,2 @@
paho-mqtt>=2.0.0
pyyaml
@@ -0,0 +1,252 @@
#!/usr/bin/env python3
"""job_subscriber.py — the single entry point for observing Job events.
Subscribes to one job's ``<topic_prefix>/events`` (or, with ``--wait-any``, the
events of every running/pending job in the registry), prints one line to stdout
per accepted event, and exits on a terminal event or a timeout.
Design points (all flagged in the PLAN review):
- terminal state machine: ``completed``/``error`` is acted on exactly once per
job, so QoS-1 duplicates or an ``error``-after-``completed`` reorder are safe.
- dual timeouts: a wall-clock ``--timeout`` (total budget, started at
subscribe time so a cold start can't hang forever) AND an idle
``--idle-timeout`` (no new event for N seconds).
- defensive parsing: undecodable payloads, ``schema_version`` mismatches, and
``job_id`` values we did not subscribe for are logged and dropped.
stdout = event lines only. Diagnostics go to stderr via logging.
Exit codes:
0 all watched jobs reached ``completed``
1 any watched job reached ``error``
2 timed out (wall-clock or idle) before all jobs finished
"""
from __future__ import annotations
import argparse
import json
import logging
import queue
import sys
import time
from typing import Any, Dict, List, Optional, Set, Tuple
import mqtt_common
import registry
from mqtt_common import (
DEFAULT_REGISTRY_DIR,
SCHEMA_VERSION,
broker_config_from_job,
load_job,
make_client,
)
logger = logging.getLogger("delegate_job.job_subscriber")
# Event names that finalise a job in main(): the first one seen for a job is
# recorded as its terminal state and the job is removed from the pending set.
TERMINAL_EVENTS = ("completed", "error")
def _format_line(topic: str, payload: Dict[str, Any]) -> str:
return (
f"{payload.get('timestamp','-')} "
f"job={payload.get('job_id','?')} "
f"seq={payload.get('seq','?')} "
f"{payload.get('event','?'):<20} "
f"{payload.get('detail','')}"
)
class _Watcher:
    """Holds the shared queue + the set of job_ids we accept events for.

    ``on_message`` is installed as the paho message callback. It filters every
    incoming message and only enqueues events that pass all checks, so the
    consumer of ``events`` never sees a malformed or unexpected payload.
    """

    def __init__(self, expected_job_ids: Set[str], expected_tokens: Dict[str, Optional[str]]):
        """
        Args:
            expected_job_ids: job ids whose events are accepted.
            expected_tokens: job_id -> auth token used for HMAC verification
                (``None`` disables verification for that job).
        """
        # (topic, payload) pairs handed from the callback to the consumer.
        self.events: "queue.Queue[Tuple[str, Dict[str, Any]]]" = queue.Queue()
        self.expected = set(expected_job_ids)
        self.tokens = expected_tokens  # job_id -> expected auth_token (or None)
        # Highest sequence number accepted so far per job (0 = none yet).
        self.last_seq: Dict[str, int] = {jid: 0 for jid in expected_job_ids}

    def on_message(self, _client, _userdata, msg) -> None:
        """Validate one MQTT message and enqueue it if it is acceptable.

        Dropped (with a warning) when the payload is not decodable JSON, is not
        an object, has a different ``schema_version``, names a job we are not
        watching, fails HMAC verification, or carries a missing / non-integer /
        non-increasing ``seq``. Accepted events are mirrored to the audit log
        as a ``received`` line and then put on ``self.events``.
        """
        # --- defensive parsing -------------------------------------------
        try:
            payload = json.loads(msg.payload.decode("utf-8"))
        except (UnicodeDecodeError, json.JSONDecodeError) as exc:
            logger.warning("drop unparseable payload on %s: %s", msg.topic, exc)
            return
        if not isinstance(payload, dict):
            logger.warning("drop non-object payload on %s", msg.topic)
            return
        if payload.get("schema_version") != SCHEMA_VERSION:
            logger.warning("drop event with schema_version=%r (expected %d)",
                           payload.get("schema_version"), SCHEMA_VERSION)
            return
        jid = payload.get("job_id")
        # A JSON list/dict job_id is unhashable; a set membership test on it
        # raises TypeError, which must not escape the network callback.
        try:
            known = jid in self.expected
        except TypeError:
            known = False
        if not known:
            logger.warning("drop event for unexpected job_id=%r on %s", jid, msg.topic)
            return
        # --- production auth check: data.auth_token must match if expected ---
        expected_token = self.tokens.get(jid)
        if not mqtt_common.verify_hmac(payload, expected_token):
            logger.warning("drop event for job %s: HMAC verify failed", jid)
            return
        # --- replay attack defense: check monotonic sequence ---
        seq = payload.get("seq")
        # bool is a subclass of int in Python; JSON true/false is not a sequence number.
        if seq is None or isinstance(seq, bool) or not isinstance(seq, int):
            logger.warning("drop event for job %s: missing or invalid seq", jid)
            return
        if seq <= self.last_seq.get(jid, 0):
            logger.warning("drop event for job %s: seq %d is not monotonically increasing (last %d)",
                           jid, seq, self.last_seq.get(jid, 0))
            return
        self.last_seq[jid] = seq
        # Persistent audit log from the *subscriber's* vantage point: every event
        # that survives defensive parsing is recorded here, including ones a
        # different host published. This is the external-observer record that
        # backstops the publisher's own "published" line if it never wrote one.
        mqtt_common.append_event(jid, {
            "event": "received",
            "source_event": payload.get("event"),
            "seq": payload.get("seq"),
            "topic": msg.topic,
            "timestamp": payload.get("timestamp"),
            "detail": payload.get("detail", ""),
        })
        self.events.put((msg.topic, payload))
def _collect_jobs(args) -> List[Dict[str, Any]]:
    """Work out which job records this run has to observe.

    With ``--job`` the single named record is loaded (``load_job`` raises
    ``FileNotFoundError`` when it does not exist). With ``--wait-any`` every
    registry record whose status is ``pending`` or ``running`` is returned; an
    empty result is logged as an error and returned as an empty list.
    """
    if not args.wait_any:
        return [load_job(args.job, args.registry_dir)]  # raises FileNotFoundError

    active: List[Dict[str, Any]] = []
    for record in registry.list_jobs(args.registry_dir):
        if record.get("status") in ("pending", "running"):
            active.append(record)
    if not active:
        logger.error("no pending/running jobs to wait for")
    return active
def main(argv=None) -> int:
    """CLI entry point: watch one job (``--job``) or all live jobs (``--wait-any``).

    Prints one line per accepted event to stdout and stops when every watched
    job has reached a terminal event or a timeout expires.

    Args:
        argv: argument list for argparse (``None`` means ``sys.argv[1:]``).

    Returns:
        0 if every watched job completed, 1 if any watched job ended in
        ``error``, 2 if nothing could be watched (unknown job, no live jobs,
        broker unreachable after retries) or a wall-clock / idle timeout hit.
    """
    parser = argparse.ArgumentParser(description="Subscribe to Job events on MQTT")
    target = parser.add_mutually_exclusive_group(required=True)
    target.add_argument("--job", help="job id to watch")
    target.add_argument("--wait-any", action="store_true",
                        help="watch every pending/running job in the registry")
    parser.add_argument("--timeout", type=float, default=None,
                        help="wall-clock budget in seconds (default: job.timeout_sec or 3600)")
    parser.add_argument("--idle-timeout", type=float, default=None,
                        help="max seconds with no new event (default: job.idle_timeout_sec or 120)")
    parser.add_argument("--expect-retention", action="store_true",
                        help="warn if no retained terminal event arrives promptly")
    parser.add_argument("--registry-dir", default=DEFAULT_REGISTRY_DIR)
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args(argv)
    mqtt_common.setup_logging(logging.DEBUG if args.verbose else logging.WARNING)

    try:
        jobs = _collect_jobs(args)
    except FileNotFoundError as exc:
        logger.error("%s", exc)
        return 2
    if not jobs:
        return 2

    expected_ids: Set[str] = {j["job_id"] for j in jobs}
    tokens = {j["job_id"]: j.get("auth_token") for j in jobs}
    watcher = _Watcher(expected_ids, tokens)

    # Resolve timeouts from CLI, falling back to the (first) job's settings.
    base_job = jobs[0]
    wall_timeout = args.timeout if args.timeout is not None else float(base_job.get("timeout_sec", 3600))
    idle_timeout = args.idle_timeout if args.idle_timeout is not None else float(base_job.get("idle_timeout_sec", 120))

    # All watched jobs share a broker in practice; connect using the first
    # job's broker and subscribe to each job's events topic.
    config = broker_config_from_job(base_job)
    client = make_client("subscriber", config)
    client.on_message = watcher.on_message
    subscribed_topics = []
    for job in jobs:
        prefix = job.get("topic_prefix") or mqtt_common.topic_prefix_for(job["job_id"])
        subscribed_topics.append(f"{prefix}/events")

    def on_connect(_c, _u, _flags, reason_code, _props):
        # Subscribing inside on_connect means the subscriptions are issued
        # again on every successful (re)connect.
        if mqtt_common.reason_code_value(reason_code) != 0:
            logger.error("broker connection failed: rc=%s", reason_code)
            return
        for topic in subscribed_topics:
            _c.subscribe(topic, qos=1)
            logger.info("subscribed to %s", topic)

    def on_disconnect(_c, _u, _flags, reason_code, _props):
        rc = mqtt_common.reason_code_value(reason_code)
        if rc != 0:
            logger.warning("broker disconnected (rc=%s); will retry reconnect", reason_code)

    client.on_connect = on_connect
    client.on_disconnect = on_disconnect
    client.reconnect_delay_set(min_delay=1, max_delay=16)
    try:
        mqtt_common.with_retry(
            lambda: client.connect(config.host, config.port, config.keepalive),
            attempts=5, base_delay=1.0, max_delay=16.0
        )()
    except Exception as exc:
        # Without this the exception escapes as a traceback with exit status 1,
        # which callers would read as "a watched job reached error".
        logger.error("could not connect to broker %s:%s after retries: %s",
                     config.host, config.port, exc)
        return 2
    client.loop_start()

    terminal: Dict[str, str] = {}  # job_id -> "completed"/"error"
    pending: Set[str] = set(expected_ids)
    start = time.monotonic()
    wall_deadline = start + wall_timeout
    last_event = start
    retention_checked = not args.expect_retention
    try:
        while pending:
            now = time.monotonic()
            if now >= wall_deadline:
                logger.error("wall-clock timeout (%.0fs); still pending: %s",
                             wall_timeout, ", ".join(sorted(pending)))
                return 2
            idle_left = idle_timeout - (now - last_event)
            if idle_left <= 0:
                logger.error("idle timeout (%.0fs, no events); still pending: %s",
                             idle_timeout, ", ".join(sorted(pending)))
                return 2
            # Wake at least once per second so both deadlines are re-checked.
            wait = min(wall_deadline - now, idle_left, 1.0)
            try:
                topic, payload = watcher.events.get(timeout=wait)
            except queue.Empty:
                if not retention_checked and (now - start) > 3.0:
                    logger.warning("--expect-retention set but no retained "
                                   "terminal event observed yet")
                    retention_checked = True
                continue
            last_event = time.monotonic()
            retention_checked = True
            print(_format_line(topic, payload), flush=True)
            jid = payload["job_id"]
            event = payload.get("event")
            if event in TERMINAL_EVENTS:
                if jid in terminal:
                    # Already finalised: ignore duplicates / late reorders.
                    logger.info("ignoring duplicate terminal %s for %s", event, jid)
                    continue
                terminal[jid] = event
                pending.discard(jid)
    finally:
        client.loop_stop()
        try:
            client.disconnect()
        except Exception:  # pragma: no cover
            pass

    # All jobs reached a terminal state. error wins over completed.
    if any(state == "error" for state in terminal.values()):
        return 1
    return 0
if __name__ == "__main__":
sys.exit(main())
@@ -0,0 +1,616 @@
"""Shared MQTT + registry helpers for the multi-agent-mux-delegate-job skill.
Single entry point for:
- broker configuration (env -> dataclass),
- paho client construction (auth + TLS + unique client id),
- monotonic per-job sequence counters,
- retry-with-exponential-backoff,
- atomic registry record load/update under an fcntl lock.
Requires paho-mqtt >= 2.0 (uses CallbackAPIVersion.VERSION2).
This module is the *only* place that talks to the broker config and to the
raw job record file, so PoC -> production migration touches just env/registry
values, never code (see references/mqtt-broker-setup.md).
"""
from __future__ import annotations
import functools
import hashlib
import hmac
import json
import logging
import os
import tempfile
import time
import uuid
from contextlib import contextmanager
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional
import paho.mqtt.client as mqtt
logger = logging.getLogger("delegate_job.mqtt_common")
def _load_dotenv(workspace_dir: str = None) -> None:
"""Load .env file from workspace if it exists and env var not already set.
This ensures Python scripts get the same env vars as the shell wrapper
scripts that source .env. Only sets vars that are not already in os.environ
(i.e. OS env takes precedence over .env file).
"""
import os
if workspace_dir is None:
# Walk up from this script to find workspace root
d = os.path.dirname(os.path.abspath(__file__))
for _ in range(5):
if os.path.isfile(os.path.join(d, ".env")):
break
d = os.path.dirname(d)
else:
d = workspace_dir
env_path = os.path.join(d, ".env")
if not os.path.isfile(env_path):
return
with open(env_path, "r") as f:
for line in f:
line = line.strip()
if not line or line.startswith("#"):
continue
if "=" in line:
key, _, val = line.partition("=")
key = key.strip()
val = val.strip().strip('"').strip("'")
if key and key not in os.environ:
os.environ[key] = val
_load_dotenv()
# --------------------------------------------------------------------------
# Constants
# --------------------------------------------------------------------------
# Payload schema version. publish_event.build_payload writes it into every
# event as "schema_version"; job_subscriber drops events whose value differs.
SCHEMA_VERSION = 1
# Default value of the ``registry_dir`` parameters in this module (a relative path).
DEFAULT_REGISTRY_DIR = ".mam/jobs"
# Default ``root`` for topic_prefix_for / events_topic_for: "<root>/<job_id>[/events]".
DEFAULT_TOPIC_ROOT = "python/mqtt/jobs"
# Name of the lock file inside the registry dir (see _lock_path / registry_lock).
LOCK_FILENAME = ".lock"
# Persistent audit-log layout: .mam/delegate_job_logs/<job_id>/{meta,events,status}.
# This is a *separate* artifact from the registry: the registry is the live job
# record (mutated in place), the audit log is an append-only history that
# survives even if the registry dir is cleaned up.
META_FILENAME = "meta.json"
EVENTS_FILENAME = "events.ndjson"
STATUS_FILENAME = "status.json"
def _default_logs_dir() -> str:
"""Audit-log root. Overridable with ``DELEGATE_JOB_LOGS_DIR``; otherwise
``<cwd>/.mam/delegate_job_logs`` — we keep audit logs next to the
live registry (``.mam/jobs/``) so the two runtime artifacts sit
under the same parent dir and follow the same ``.gitignore`` rule.
The cwd of whichever process emits events (the bash wrapper and
scripts) is used as the anchor."""
env = os.environ.get("DELEGATE_JOB_LOGS_DIR")
if env and env.strip():
return env
return os.path.join(os.getcwd(), ".mam", "delegate_job_logs")
# Resolved once, when this module is imported: the environment override and
# the working directory are read at import time, not on each later call.
LOGS_DIR = _default_logs_dir()
# --------------------------------------------------------------------------
# Broker configuration
# --------------------------------------------------------------------------
@dataclass
class BrokerConfig:
    """Connection settings for one MQTT broker, after all defaults are applied.

    The defaults point at the public HiveMQ broker (PoC). Production values
    come from environment variables or from the ``broker.*`` block of a job
    record (see ``broker_config_from_job``).
    """

    host: str = "broker.hivemq.com"
    port: int = 1883
    tls: bool = False
    username: Optional[str] = None
    password: Optional[str] = None
    client_id_prefix: str = "hermes"
    # TLS material (only consulted when tls is True).
    ca_certs: Optional[str] = None
    certfile: Optional[str] = None
    keyfile: Optional[str] = None
    keepalive: int = 60

    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dict."""
        return asdict(self)

    def to_registry_block(self) -> Dict[str, Any]:
        """Return only the fields that are stored in a job record's broker block."""
        persisted = ("host", "port", "tls", "username", "password")
        return {field: getattr(self, field) for field in persisted}
def _env_bool(name: str, default: bool = False) -> bool:
raw = os.environ.get(name)
if raw is None:
return default
return raw.strip().lower() in ("1", "true", "yes", "on")
def _env_int(name: str, default: int) -> int:
raw = os.environ.get(name)
if raw is None or raw.strip() == "":
return default
try:
return int(raw)
except ValueError:
logger.warning("invalid int for %s=%r; using default %d", name, raw, default)
return default
def broker_config_from_env(overrides: Optional[Dict[str, Any]] = None) -> BrokerConfig:
    """Assemble a :class:`BrokerConfig` from the process environment.

    Variables read (all optional; PoC default in parentheses):
        MQTT_BROKER (broker.hivemq.com), MQTT_PORT (1883), MQTT_TLS (0),
        MQTT_USERNAME, MQTT_PASSWORD, MQTT_CLIENT_ID_PREFIX (hermes),
        MQTT_CA_CERTS, MQTT_CERTFILE, MQTT_KEYFILE, MQTT_KEEPALIVE (60).

    Empty-string credentials and certificate paths are treated as unset.

    Args:
        overrides: optional mapping (e.g. a job record's broker block). Each
            entry whose value is not ``None`` and whose key is an attribute of
            the config replaces the environment-derived value.
    """
    env = os.environ
    settings = {
        "host": env.get("MQTT_BROKER", "broker.hivemq.com"),
        "port": _env_int("MQTT_PORT", 1883),
        "tls": _env_bool("MQTT_TLS", False),
        "username": env.get("MQTT_USERNAME") or None,
        "password": env.get("MQTT_PASSWORD") or None,
        "client_id_prefix": env.get("MQTT_CLIENT_ID_PREFIX", "hermes"),
        "ca_certs": env.get("MQTT_CA_CERTS") or None,
        "certfile": env.get("MQTT_CERTFILE") or None,
        "keyfile": env.get("MQTT_KEYFILE") or None,
        "keepalive": _env_int("MQTT_KEEPALIVE", 60),
    }
    cfg = BrokerConfig(**settings)
    for key, value in (overrides or {}).items():
        if value is None or not hasattr(cfg, key):
            continue
        setattr(cfg, key, value)
    return cfg
def broker_config_from_job(job: Dict[str, Any]) -> BrokerConfig:
    """Broker settings for one job record.

    Starts from the environment defaults and lets the record's ``broker`` block
    (if present and non-empty) override them, so a caller can connect using
    only the registry while environment toggles such as MQTT_TLS=1 still apply.
    """
    broker_block = job.get("broker") or {}
    return broker_config_from_env(overrides=broker_block)
def make_client(role: str, config: Optional[BrokerConfig] = None) -> mqtt.Client:
    """Build a paho ``Client`` for ``config``; the caller still has to connect it.

    The client id has the form ``<prefix>-<role>-<8 hex chars>`` with a fresh
    random suffix per call, so parallel publishers and subscribers get distinct
    ids. Username/password and TLS are configured only when the config asks for
    them. Falls back to ``broker_config_from_env()`` when no config is given.
    """
    cfg = config or broker_config_from_env()
    suffix = uuid.uuid4().hex[:8]
    client_id = "{}-{}-{}".format(cfg.client_id_prefix, role, suffix)
    client = mqtt.Client(
        client_id=client_id,
        callback_api_version=mqtt.CallbackAPIVersion.VERSION2,
    )
    if cfg.username:
        client.username_pw_set(cfg.username, cfg.password)
    if cfg.tls:
        # If ca_certs is None paho uses the system trust store (good enough for
        # public CAs); a private CA bundle path is passed through unchanged.
        tls_material = {
            "ca_certs": cfg.ca_certs,
            "certfile": cfg.certfile,
            "keyfile": cfg.keyfile,
        }
        client.tls_set(**tls_material)
    logger.debug("built client id=%s tls=%s host=%s", client_id, cfg.tls, cfg.host)
    return client
def reason_code_value(rc: Any) -> int:
    """Turn a connect/disconnect reason code into a plain ``int``.

    Callbacks under paho-mqtt 2.x receive a ``ReasonCode`` object whose numeric
    code is in ``.value``; other call paths may hand over an int directly.
    Either form is accepted. 0 means success.
    """
    raw = getattr(rc, "value", rc)
    return int(raw)
def verify_hmac(payload: dict, auth_token: Optional[str]) -> bool:
    """Check the HMAC-SHA256 signature carried in ``payload["data"]["hmac_sig"]``.

    The signature covers the whole payload with ``data.hmac_sig`` removed,
    serialised as compact JSON with sorted keys and keyed with ``auth_token``.

    Args:
        payload: decoded event object as received from the broker (untrusted).
        auth_token: per-job secret, or a falsy value to disable verification.

    Returns:
        ``True`` when no token is configured (PoC mode) or the signature
        matches; ``False`` for a missing, malformed, or wrong signature. Never
        raises on a malformed ``data`` / ``hmac_sig`` value.
    """
    if not auth_token:
        return True  # PoC mode — no auth
    # The payload comes off the wire: "data" may be any JSON value, not only an
    # object, and must not crash the subscriber callback.
    data = payload.get("data")
    if not isinstance(data, dict):
        return False
    sig = data.get("hmac_sig")
    if not isinstance(sig, str) or not sig:
        return False
    sign_payload = {k: v for k, v in payload.items() if k != "data"}
    sign_payload["data"] = {k: v for k, v in data.items() if k != "hmac_sig"}
    msg = json.dumps(sign_payload, sort_keys=True, separators=(",", ":")).encode()
    expected = hmac.new(auth_token.encode(), msg, hashlib.sha256).hexdigest()
    # Compare as bytes: compare_digest raises TypeError for str arguments that
    # contain non-ASCII characters, which an attacker-supplied signature may.
    return hmac.compare_digest(sig.encode("utf-8", "replace"), expected.encode("ascii"))
def topic_prefix_for(job_id: str, root: str = DEFAULT_TOPIC_ROOT) -> str:
    """Return the topic prefix of a job: ``<root>/<job_id>``."""
    return "{}/{}".format(root, job_id)
def events_topic_for(job_id: str, root: str = DEFAULT_TOPIC_ROOT) -> str:
    """Return the events topic of a job: ``<root>/<job_id>/events``."""
    prefix = topic_prefix_for(job_id, root)
    return prefix + "/events"
# --------------------------------------------------------------------------
# Registry primitives (single source of truth for raw record I/O)
# --------------------------------------------------------------------------
def _job_path(job_id: str, registry_dir: str) -> Path:
return Path(registry_dir) / f"{job_id}.json"
def _lock_path(registry_dir: str) -> Path:
    """Path of the registry-wide lock file inside ``registry_dir``."""
    return Path(registry_dir).joinpath(LOCK_FILENAME)
@contextmanager
def registry_lock(registry_dir: str):
    """Context manager holding an exclusive ``fcntl.flock`` on the registry's lock file.

    The registry directory is created if needed. Scripts and tmux sessions on
    the same host wrap their read-modify-write of job records in this lock so
    that two of them cannot claim the same pending job. This is PoC-grade,
    single-host coordination; for multi-host delegation move to SQLite WAL
    (see references/registry.md).
    """
    import fcntl  # POSIX only; imported lazily so import works on Windows.

    Path(registry_dir).mkdir(parents=True, exist_ok=True)
    with open(_lock_path(registry_dir), "a+") as handle:
        descriptor = handle.fileno()
        try:
            fcntl.flock(descriptor, fcntl.LOCK_EX)
            yield
        finally:
            # Release explicitly; the file is closed when the ``with`` exits.
            fcntl.flock(descriptor, fcntl.LOCK_UN)
def load_job(job_id: str, registry_dir: str = DEFAULT_REGISTRY_DIR) -> Dict[str, Any]:
    """Read one job record from the registry and return it as parsed JSON.

    Raises:
        FileNotFoundError: no record file exists for ``job_id``.
    """
    record_path = _job_path(job_id, registry_dir)
    if record_path.exists():
        with open(record_path, "r", encoding="utf-8") as fh:
            return json.load(fh)
    raise FileNotFoundError(f"job record not found: {record_path}")
def _atomic_write_record(job_id: str, registry_dir: str, record: Dict[str, Any]) -> None:
    """Persist ``record`` as the job's record file without exposing partial writes.

    The JSON is written and fsynced to a temp file in the same directory, then
    moved over the destination with ``os.replace`` (an atomic rename on POSIX),
    so a reader sees either the old or the new file. The result is chmod'ed to
    0600 on a best-effort basis. On any failure the temp file is removed and
    the exception re-raised.

    Callers MUST already hold ``registry_lock`` for read-modify-write
    correctness.
    """
    Path(registry_dir).mkdir(parents=True, exist_ok=True)
    target = _job_path(job_id, registry_dir)
    handle, scratch = tempfile.mkstemp(
        suffix=".tmp", prefix=f".{job_id}.", dir=str(target.parent)
    )
    try:
        with os.fdopen(handle, "w", encoding="utf-8") as out:
            json.dump(record, out, ensure_ascii=False, indent=2)
            out.write("\n")
            out.flush()
            os.fsync(out.fileno())
        os.replace(scratch, target)
        try:
            os.chmod(target, 0o600)
        except Exception:
            # Tightening permissions is optional; the record is already in place.
            pass
    except BaseException:
        if os.path.exists(scratch):
            os.unlink(scratch)
        raise
def update_job_status(job_id: str, registry_dir: str = DEFAULT_REGISTRY_DIR, **fields: Any) -> Dict[str, Any]:
    """Merge ``fields`` into a job record while holding the registry lock.

    ``updated_at`` is always set to the current time. When ``fields`` includes
    ``status``, the audit log's status file is rewritten and, if the status
    actually changed, a ``status_changed`` event (``from``/``to``) is appended.
    Both audit writes happen while the lock is still held, so the audit history
    is ordered the same way as the registry writes.

    This is the single chokepoint for status writes: ``registry.update_status``
    and ``publish_event.py``'s status sync both route through here.

    Returns:
        The updated record.

    Raises:
        FileNotFoundError: the job does not exist.
    """
    with registry_lock(registry_dir):
        record = load_job(job_id, registry_dir)
        previous = record.get("status")
        record.update(fields)
        stamp = _utcnow()
        record["updated_at"] = stamp
        _atomic_write_record(job_id, registry_dir, record)
        if "status" in fields:
            current = record.get("status")
            update_logged_status(job_id, current, updated_at=stamp)
            if previous != current:
                transition = {
                    "event": "status_changed",
                    "from": previous,
                    "to": current,
                    "timestamp": stamp,
                }
                append_event(job_id, transition)
    return record
def next_seq(job_id: str, registry_dir: str = DEFAULT_REGISTRY_DIR) -> int:
    """Allocate the next sequence number for a job.

    The counter is kept in the record's ``last_seq`` field and written back
    under the registry lock, so numbering continues across process restarts.
    A record without ``last_seq`` yields 1.
    """
    with registry_lock(registry_dir):
        record = load_job(job_id, registry_dir)
        allocated = 1 + int(record.get("last_seq", 0))
        record.update(last_seq=allocated, updated_at=_utcnow())
        _atomic_write_record(job_id, registry_dir, record)
    return allocated
def _utcnow() -> str:
"""ISO-8601 UTC timestamp with trailing Z (payload `timestamp` field)."""
return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
def _utcnow_precise() -> str:
"""ISO-8601 UTC timestamp with millisecond resolution. Used for the audit
log's ``logged_at`` so events sort cleanly even within the same second."""
now = time.time()
base = time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime(now))
return f"{base}.{int((now % 1) * 1000):03d}Z"
# --------------------------------------------------------------------------
# Persistent audit log (.mam/delegate_job_logs/<job_id>/...)
#
# Every function here is idempotent, concurrency-safe, and *best-effort*: a
# logging failure is swallowed with a logger.warning and never propagated, so it
# can never break a publish, a subscribe, or a registry write. stdout is never
# touched (it is reserved for data output).
# --------------------------------------------------------------------------
def job_log_dir(job_id: str, logs_dir: Optional[str] = None) -> Path:
    """Directory holding one job's audit log: ``<logs_dir or LOGS_DIR>/<job_id>``."""
    root = logs_dir or LOGS_DIR
    return Path(root).joinpath(job_id)
def job_log_path(job_id: str, kind: str, logs_dir: Optional[str] = None) -> Path:
    """Location of a single audit-log file belonging to a job.

    ``kind`` is the file name inside the job's log directory, typically one of
    META_FILENAME / EVENTS_FILENAME / STATUS_FILENAME.
    """
    directory = job_log_dir(job_id, logs_dir)
    return directory.joinpath(kind)
@contextmanager
def _file_lock(fh):
"""Best-effort exclusive lock over a single open file via fcntl, so two
processes appending to events.ndjson never interleave a line. A no-op where
fcntl is unavailable (Windows); a short append is atomic enough there."""
try:
import fcntl
except ImportError: # pragma: no cover - non-POSIX
yield
return
fcntl.flock(fh.fileno(), fcntl.LOCK_EX)
try:
yield
finally:
fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
def _redact_dict(d: Any) -> Any:
"""Recursively mask sensitive values (passwords, secrets, tokens) inside logs."""
if isinstance(d, dict):
redacted = {}
for k, v in d.items():
if any(s in k.lower() for s in ("password", "token", "secret", "auth_token", "key")):
redacted[k] = "[REDACTED]"
else:
redacted[k] = _redact_dict(v)
return redacted
elif isinstance(d, list):
return [_redact_dict(item) for item in d]
return d
def append_event(job_id: str, event_dict: Dict[str, Any], logs_dir: Optional[str] = None) -> None:
    """Write one event as a JSON line at the end of ``<logs>/<job_id>/events.ndjson``.

    The event is copied and passed through ``_redact_dict`` first. If it has no
    ``logged_at`` field, a millisecond-resolution timestamp is added. The write
    happens under ``_file_lock``. Best-effort: any exception is logged as a
    warning and swallowed.
    """
    try:
        target = job_log_path(job_id, EVENTS_FILENAME, logs_dir)
        target.parent.mkdir(parents=True, exist_ok=True)
        entry = _redact_dict(dict(event_dict))
        if "logged_at" not in entry:
            entry["logged_at"] = _utcnow_precise()
        serialized = json.dumps(entry, ensure_ascii=False) + "\n"
        with open(target, "a", encoding="utf-8") as fh, _file_lock(fh):
            fh.write(serialized)
            fh.flush()
    except Exception as exc:  # pragma: no cover - best effort
        logger.warning("append_event failed for job %s: %s", job_id, exc)
def update_logged_status(job_id: str, status: str, logs_dir: Optional[str] = None, **extras: Any) -> None:
    """Rewrite ``<logs>/<job_id>/status.json`` (current status for fast point
    queries) atomically.

    The file holds ``job_id``, ``status`` and ``updated_at`` plus any
    ``extras`` (which override those keys when they collide). Best-effort: any
    exception is logged as a warning and swallowed.

    Args:
        job_id: job whose status file is rewritten.
        status: new status value.
        logs_dir: audit-log root; defaults to the module-level ``LOGS_DIR``.
        **extras: additional fields merged into the written record.
    """
    tmp: Optional[Path] = None
    try:
        path = job_log_path(job_id, STATUS_FILENAME, logs_dir)
        path.parent.mkdir(parents=True, exist_ok=True)
        record: Dict[str, Any] = {"job_id": job_id, "status": status, "updated_at": _utcnow()}
        record.update(extras)
        # Per-writer temp name: with a shared "<name>.tmp", two concurrent
        # writers truncate each other's half-written file and one os.replace
        # can publish the other's partial content.
        tmp = path.with_name(f"{path.name}.{os.getpid()}.{uuid.uuid4().hex[:8]}.tmp")
        with open(tmp, "w", encoding="utf-8") as fh:
            json.dump(record, fh, ensure_ascii=False, indent=2)
            fh.write("\n")
        os.replace(tmp, path)
        tmp = None  # renamed into place; nothing left to clean up
    except Exception as exc:  # pragma: no cover - best effort
        logger.warning("update_logged_status failed for job %s: %s", job_id, exc)
    finally:
        # Do not leave a stray temp file behind when the write or rename failed.
        if tmp is not None:
            try:
                os.unlink(tmp)
            except OSError:
                pass
def init_job_log(job_id: str, meta: Dict[str, Any], logs_dir: Optional[str] = None) -> None:
    """Create a job's audit-log directory and its initial files.

    Writes a redacted copy of ``meta`` to meta.json, writes status.json (status
    defaults to ``pending``; ``created_at`` and ``prompt`` are included), and
    makes sure events.ndjson exists. The ``registered`` event is appended only
    if events.ndjson did not exist before this call, so repeating the call does
    not duplicate it. Best-effort: any exception is logged as a warning and
    swallowed.
    """
    try:
        log_dir = job_log_dir(job_id, logs_dir)
        log_dir.mkdir(parents=True, exist_ok=True)

        snapshot = _redact_dict(meta)
        with open(log_dir / META_FILENAME, "w", encoding="utf-8") as fh:
            json.dump(snapshot, fh, ensure_ascii=False, indent=2)
            fh.write("\n")

        status = meta.get("status", "pending")
        update_logged_status(
            job_id,
            status,
            logs_dir=logs_dir,
            created_at=meta.get("created_at"),
            prompt=meta.get("prompt"),
        )

        events_file = log_dir / EVENTS_FILENAME
        already_seeded = events_file.exists()
        events_file.touch(exist_ok=True)
        if not already_seeded:
            registered = {
                "event": "registered",
                "status": status,
                "agent": meta.get("agent"),
                "agent_session": meta.get("agent_session"),
                "topic_prefix": meta.get("topic_prefix"),
                "timestamp": meta.get("created_at"),
            }
            append_event(job_id, registered, logs_dir=logs_dir)
    except Exception as exc:  # pragma: no cover - best effort
        logger.warning("init_job_log failed for job %s: %s", job_id, exc)
def read_logged_meta(job_id: str, logs_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Load a job's audit meta.json (the registration snapshot).

    Returns ``None`` when the file cannot be opened or is not valid JSON.
    """
    meta_file = job_log_path(job_id, META_FILENAME, logs_dir)
    try:
        with open(meta_file, "r", encoding="utf-8") as fh:
            parsed = json.load(fh)
    except (OSError, json.JSONDecodeError):
        return None
    return parsed
def read_logged_status(job_id: str, logs_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Load a job's status.json (current status only, for fast point queries).

    This file is distinct from meta.json, which is frozen at registration.
    Returns ``None`` when the file cannot be opened or is not valid JSON.
    """
    status_file = job_log_path(job_id, STATUS_FILENAME, logs_dir)
    try:
        with open(status_file, "r", encoding="utf-8") as fh:
            parsed = json.load(fh)
    except (OSError, json.JSONDecodeError):
        return None
    return parsed
def iter_logged_events(job_id: str, logs_dir: Optional[str] = None):
    """Generate the parsed events of a job's events.ndjson, in file order.

    Yields nothing when the file does not exist. Blank lines are ignored;
    lines that are not valid JSON are skipped with a warning.
    """
    events_file = job_log_path(job_id, EVENTS_FILENAME, logs_dir)
    if events_file.exists():
        with open(events_file, "r", encoding="utf-8") as fh:
            for raw in fh:
                text = raw.strip()
                if text:
                    try:
                        yield json.loads(text)
                    except json.JSONDecodeError:
                        logger.warning("skipping malformed audit line in %s", events_file)
def list_logged_jobs(logs_dir: Optional[str] = None) -> List[Dict[str, Any]]:
    """Summarise every job directory under the audit-log root.

    Each entry is the job's meta.json, or ``{"job_id": <dir name>}`` when that
    file is missing or unreadable. If status.json is available, its ``status``
    and ``updated_at`` replace the registration-time values so the summary
    shows the current state. Entries are sorted by ``created_at`` (missing
    values sort first). Returns an empty list if the root does not exist.
    """
    root = Path(logs_dir or LOGS_DIR)
    summaries: List[Dict[str, Any]] = []
    if not root.exists():
        return summaries
    for entry in sorted(root.iterdir()):
        if entry.is_dir():
            name = entry.name
            summary = read_logged_meta(name, logs_dir) or {"job_id": name}
            live = read_logged_status(name, logs_dir)
            if live:
                # Copy before overlaying so the loaded meta dict is not mutated.
                fallback_status = summary.get("status")
                fallback_updated = summary.get("updated_at")
                summary = dict(summary)
                summary["status"] = live.get("status", fallback_status)
                summary["updated_at"] = live.get("updated_at", fallback_updated)
            summaries.append(summary)
    summaries.sort(key=lambda item: item.get("created_at") or "")
    return summaries
# --------------------------------------------------------------------------
# Retry helper
# --------------------------------------------------------------------------
def with_retry(
    fn: Optional[Callable] = None,
    *,
    attempts: int = 3,
    base_delay: float = 0.5,
    factor: float = 2.0,
    max_delay: float = 8.0,
    exceptions: Iterable[type] = (Exception,),
) -> Callable:
    """Retry ``fn`` with exponential backoff.

    Usable two ways::

        result = with_retry(do_publish, attempts=3)()   # wrap-and-call

        @with_retry(attempts=5, base_delay=1.0)         # decorator
        def do_publish(): ...

    Args:
        fn: function to wrap; omit to get a decorator.
        attempts: total number of calls allowed (must be >= 1).
        base_delay: sleep in seconds before the second attempt.
        factor: multiplier applied to the delay after each failure.
        max_delay: upper bound for the delay.
        exceptions: exception types that trigger a retry; others propagate
            immediately.

    Returns:
        The wrapped function, or a decorator when ``fn`` is ``None``.

    Raises:
        ValueError: ``attempts`` is smaller than 1.
        The wrapped call re-raises the last exception once ``attempts`` is
        exhausted.
    """
    # With attempts < 1 the loop below would never call the function and there
    # would be no exception to re-raise; reject it up front with a clear error.
    if attempts < 1:
        raise ValueError(f"attempts must be >= 1, got {attempts!r}")
    exc_tuple = tuple(exceptions)

    def decorate(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            delay = base_delay
            for attempt in range(1, attempts + 1):
                try:
                    return func(*args, **kwargs)
                except exc_tuple as exc:
                    if attempt >= attempts:
                        # Out of attempts: propagate the failure unchanged.
                        raise
                    logger.warning(
                        "attempt %d/%d failed: %s; retrying in %.1fs",
                        attempt, attempts, exc, delay,
                    )
                    time.sleep(delay)
                    delay = min(delay * factor, max_delay)

        return wrapper

    if fn is not None:
        return decorate(fn)
    return decorate
def setup_logging(level: int = logging.WARNING) -> None:
    """Set up root logging on stderr at ``level``.

    stdout stays free for data output (subscriber event lines, registry ids).
    """
    import sys

    options = {
        "level": level,
        "stream": sys.stderr,
        "format": "%(asctime)s %(levelname)s %(name)s: %(message)s",
    }
    logging.basicConfig(**options)
@@ -0,0 +1,229 @@
#!/usr/bin/env python3
"""publish_event.py — the single entry point for emitting a Job event.
Loads the job record from the registry, resolves its broker, assigns the next
monotonic ``seq``, builds the schema-v1 JSON payload, and publishes it to
``<topic_prefix>/events`` over QoS 1 with exponential-backoff retry.
Silent by design: nothing is printed to stdout. Diagnostics go to stderr via
logging. Terminal events (``completed``/``error``) publish with retain=True so
a late subscriber still observes the final state (production hardening).
Exit codes:
0 published successfully
1 parameter / registry error (bad args, unknown job, no pending job)
2 publish failed after retries (network / broker / ACK timeout)
Usage:
publish_event.py --job <id> --event started [--detail "..."] [--data '{...}']
publish_event.py --pick-pending --agent-session tmux:claude --event completed
publish_event.py --job <id> --event completed --retained
"""
from __future__ import annotations
import argparse
import hashlib
import hmac
import json
import logging
import sys
import time
from typing import Any, Dict, Optional
import mqtt_common
import registry
from mqtt_common import (
DEFAULT_REGISTRY_DIR,
SCHEMA_VERSION,
broker_config_from_job,
events_topic_for,
load_job,
make_client,
next_seq,
with_retry,
)
logger = logging.getLogger("delegate_job.publish_event")
# Event names accepted by the --event CLI option (used as its argparse choices).
VALID_EVENTS = ("started", "permission_required", "progress", "completed", "error")
# Events that end a job; per the module docstring these are published with
# retain=True so a late subscriber still observes the final state.
TERMINAL_EVENTS = ("completed", "error")
# event -> registry status to sync as a best-effort side effect
# (events not listed here have no status mapping).
EVENT_TO_STATUS = {
    "started": "running",
    "completed": "completed",
    "error": "error",
}
CONNECT_ACK_TIMEOUT = 10  # seconds to wait for CONNACK
PUBLISH_ACK_TIMEOUT = 5  # seconds to wait for QoS-1 PUBACK
def build_payload(
    job_id: str,
    seq: int,
    event: str,
    detail: str,
    data: Optional[Dict[str, Any]],
    auth_token: Optional[str],
) -> Dict[str, Any]:
    """Assemble the event payload for one job event, signing it when a token is set.

    The payload carries ``schema_version``, ``seq``, ``job_id``, ``event``, a
    UTC ``timestamp``, ``detail`` and a shallow copy of ``data`` (``{}`` when
    ``data`` is empty or ``None``).

    With a truthy ``auth_token`` an HMAC-SHA256 hex digest is stored in
    ``data.hmac_sig``. It is computed over the whole payload, minus
    ``data.hmac_sig`` itself, serialised as compact JSON with sorted keys, so a
    subscriber can verify the publisher without the token appearing in the message.
    """
    body: Dict[str, Any] = dict(data) if data else {}
    payload: Dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "seq": seq,
        "job_id": job_id,
        "event": event,
        "timestamp": mqtt_common._utcnow(),
        "detail": detail,
        "data": body,
    }
    if not auth_token:
        return payload

    unsigned = {field: value for field, value in payload.items() if field != "data"}
    unsigned["data"] = {field: value for field, value in body.items() if field != "hmac_sig"}
    canonical = json.dumps(unsigned, sort_keys=True, separators=(",", ":")).encode()
    # ``body`` is the same dict object as payload["data"], so this signs in place.
    body["hmac_sig"] = hmac.new(auth_token.encode(), canonical, hashlib.sha256).hexdigest()
    return payload
def _publish_once(config, topic: str, body: bytes, retain: bool) -> None:
    """Run one full connect -> publish (QoS 1) -> wait-for-ACK -> disconnect cycle.

    Every failure is raised so that ``with_retry`` can repeat the whole
    sequence on a brand-new connection.

    Raises:
        TimeoutError: no CONNACK within ``CONNECT_ACK_TIMEOUT`` seconds, or the
            publish was not acknowledged within ``PUBLISH_ACK_TIMEOUT`` seconds.
        ConnectionError: the broker answered with a non-zero reason code.
    """
    client = make_client("publisher", config)
    connack = {"rc": None}

    def _remember_connack(_client, _userdata, _flags, reason_code, _props):
        connack["rc"] = reason_code

    client.on_connect = _remember_connack
    client.connect(config.host, config.port, config.keepalive)
    client.loop_start()
    try:
        # Block until the CONNACK arrives so auth/TLS problems surface quickly.
        give_up_at = time.monotonic() + CONNECT_ACK_TIMEOUT
        while True:
            if connack["rc"] is not None:
                break
            if not time.monotonic() < give_up_at:
                break
            time.sleep(0.05)
        if connack["rc"] is None:
            raise TimeoutError("no CONNACK from broker")
        if mqtt_common.reason_code_value(connack["rc"]) != 0:
            raise ConnectionError(f"broker refused connection: rc={connack['rc']}")

        receipt = client.publish(topic, payload=body, qos=1, retain=retain)
        receipt.wait_for_publish(timeout=PUBLISH_ACK_TIMEOUT)
        if not receipt.is_published():
            raise TimeoutError("publish not acknowledged within timeout")
    finally:
        client.loop_stop()
        try:
            client.disconnect()
        except Exception:  # pragma: no cover - disconnect best effort
            pass
def _resolve_job_id(args) -> Optional[str]:
if args.pick_pending:
return registry.pick_pending(args.agent_session, args.registry_dir)
return args.job
def main(argv=None) -> int:
    """CLI entry point: publish one job event to the job's MQTT events topic.

    Args:
        argv: argument list (defaults to ``sys.argv[1:]`` via argparse).

    Returns:
        Process exit code:
          0  event published (audit log / registry sync are best-effort)
          1  parameter error (bad --data or --attempts, unknown job, no pending job)
          2  publish failed after retries
    """
    parser = argparse.ArgumentParser(description="Publish a Job event to MQTT")
    target = parser.add_mutually_exclusive_group(required=True)
    target.add_argument("--job", help="job id to publish for")
    target.add_argument("--pick-pending", action="store_true",
                        help="auto-select a pending job for --agent-session")
    parser.add_argument("--agent-session", default="tmux:claude",
                        help="session label used with --pick-pending")
    parser.add_argument("--event", default="progress", choices=VALID_EVENTS)
    parser.add_argument("--detail", default="")
    parser.add_argument("--data", default=None, help="optional JSON object string")
    parser.add_argument("--retained", action="store_true",
                        help="force retain=True (auto for completed/error)")
    parser.add_argument("--registry-dir", default=DEFAULT_REGISTRY_DIR)
    parser.add_argument("--attempts", type=int, default=3)
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args(argv)
    mqtt_common.setup_logging(logging.DEBUG if args.verbose else logging.WARNING)

    # A retry budget below one can never publish anything; reject it up front
    # as a parameter error instead of handing it to with_retry.
    if args.attempts < 1:
        logger.error("--attempts must be >= 1 (got %d)", args.attempts)
        return 1

    # --- parse optional data JSON (parameter error -> exit 1) ---
    data: Optional[Dict[str, Any]] = None
    if args.data:
        try:
            data = json.loads(args.data)
            if not isinstance(data, dict):
                raise ValueError("--data must be a JSON object")
        except ValueError as exc:  # json.JSONDecodeError is a ValueError subclass
            logger.error("invalid --data: %s", exc)
            return 1

    job_id = _resolve_job_id(args)
    if not job_id:
        logger.error("no job to publish for (unknown --job or no pending job)")
        return 1
    try:
        job = load_job(job_id, args.registry_dir)
    except FileNotFoundError as exc:
        logger.error("%s", exc)
        return 1

    config = broker_config_from_job(job)
    topic = job.get("topic_prefix")
    topic = f"{topic}/events" if topic else events_topic_for(job_id)
    seq = next_seq(job_id, args.registry_dir)
    payload = build_payload(
        job_id=job_id,
        seq=seq,
        event=args.event,
        detail=args.detail,
        data=data,
        auth_token=job.get("auth_token"),
    )
    body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    # Terminal events are always retained; --retained forces it for the rest.
    retain = args.retained or args.event in TERMINAL_EVENTS

    # RuntimeError is part of the retry set because paho's
    # MQTTMessageInfo.wait_for_publish() raises it when the message could not
    # be sent (e.g. the connection dropped). Without it such a failure would
    # skip the remaining attempts yet still be reported as "failed after N
    # attempts" below.
    publish = with_retry(
        _publish_once,
        attempts=args.attempts,
        exceptions=(OSError, TimeoutError, ConnectionError, ValueError, RuntimeError),
    )
    try:
        publish(config, topic, body, retain)
    except Exception as exc:
        logger.error("publish failed after %d attempts: %s", args.attempts, exc)
        return 2

    # Persistent audit log: record the exact payload we put on the wire so the
    # publish is reproducible from the log alone. Best-effort (isolated inside
    # append_event) — never fails the publish.
    mqtt_common.append_event(job_id, {
        "event": "published",
        "source_event": args.event,
        "seq": seq,
        "topic": topic,
        "retain": retain,
        "timestamp": payload["timestamp"],
        "detail": args.detail,
        "payload": payload,
    })

    # Best-effort side effects: registry status sync + (debug) event log. Never
    # fail the publish on these.
    registry.append_event(job_id, args.registry_dir, payload)
    new_status = EVENT_TO_STATUS.get(args.event)
    if new_status:
        try:
            mqtt_common.update_job_status(job_id, args.registry_dir, status=new_status)
        except Exception as exc:  # pragma: no cover - best effort
            logger.warning("status sync failed: %s", exc)

    logger.info("published %s seq=%d job=%s retain=%s", args.event, seq, job_id, retain)
    return 0
if __name__ == "__main__":
sys.exit(main())
@@ -0,0 +1,334 @@
"""Job registry for the multi-agent-mux-delegate-job skill.
A job record is the single source of truth for one delegated unit of work:
its id, prompt, owning agent session, broker connection, timeouts, and status.
Records live as ``<registry_dir>/<job_id>.json`` with an append-only event log
``<registry_dir>/<job_id>.events.log`` and a shared ``<registry_dir>/.lock``.
Concurrency is handled via the fcntl lock in :mod:`mqtt_common` (PoC). For
multi-host delegation, migrate to SQLite WAL — see references/registry.md.
Importable as a library and runnable as a CLI (``register``/``list``/``get``/
``status``/``pick``/``logs``) so the ``multi-agent-mux-delegate-job`` bash wrapper can shell out.
"""
from __future__ import annotations
import argparse
import json
import logging
import sys
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional
import mqtt_common
from mqtt_common import (
DEFAULT_REGISTRY_DIR,
SCHEMA_VERSION,
_atomic_write_record,
_utcnow,
broker_config_from_env,
load_job,
registry_lock,
topic_prefix_for,
)
# Module logger for registry operations.
logger = logging.getLogger("delegate_job.registry")
# Statuses that represent a finished job.
# NOTE(review): not referenced in the visible part of this module — presumably
# consumed by importers; confirm before removing.
TERMINAL_STATUSES = ("completed", "error", "cancelled")
# Every status update_status() accepts; any other value raises ValueError there.
VALID_STATUSES = ("pending", "running", "completed", "error", "cancelled")
def generate_job_id(bits: int = 32) -> str:
    """Return a random lowercase-hex job id taken from a fresh uuid4.

    PoC default is 32 bits (8 hex chars); production should use 128 bits (the
    full uuid4 hex). ``bits`` is rounded down to whole hex digits, with a
    minimum of one digit; values of 128 or more return all 32 digits.
    """
    digits = uuid.uuid4().hex
    if bits >= 128:
        return digits
    wanted = max(1, bits // 4)
    return digits[:wanted]
def register_job(
    prompt: str,
    agent: str = "claude-code",
    agent_session: str = "tmux:claude",
    broker: Optional[Dict[str, Any]] = None,
    timeout_sec: int = 3600,
    idle_timeout_sec: int = 120,
    registry_dir: str = DEFAULT_REGISTRY_DIR,
    job_id: Optional[str] = None,
    expected_artifacts: Optional[List[str]] = None,
    bits: int = 32,
    auth_token: Optional[str] = None,
) -> str:
    """Write a fresh ``pending`` job record and hand back its id.

    When ``broker`` is omitted, the broker block resolved from the current
    environment is stored in the record, so ``publish_event.py`` can connect
    later using nothing but the registry. When ``auth_token`` is omitted and
    the broker block has ``tls`` or ``username`` set, a random token is
    generated.

    Raises:
        FileExistsError: a record with this job id is already registered.
    """
    if not job_id:
        job_id = generate_job_id(bits)
    if broker is None:
        broker = broker_config_from_env().to_registry_block()
    # Auto-generate a token only for a secure broker configuration (TLS or username).
    if auth_token is None and (broker.get("tls") or broker.get("username")):
        import secrets
        auth_token = secrets.token_urlsafe(32)

    stamp = _utcnow()
    record: Dict[str, Any] = dict(
        schema_version=SCHEMA_VERSION,
        job_id=job_id,
        status="pending",
        created_at=stamp,
        updated_at=stamp,
        prompt=prompt,
        agent=agent,
        agent_session=agent_session,
        broker=broker,
        topic_prefix=topic_prefix_for(job_id),
        timeout_sec=int(timeout_sec),
        idle_timeout_sec=int(idle_timeout_sec),
        expected_artifacts=expected_artifacts or [],
        last_seq=0,
        auth_token=auth_token,
    )

    # Existence check and write happen under the same lock so two registrations
    # of the same id cannot both succeed.
    with registry_lock(registry_dir):
        already_there = mqtt_common._job_path(job_id, registry_dir).exists()
        if already_there:
            raise FileExistsError(f"job already exists: {job_id}")
        _atomic_write_record(job_id, registry_dir, record)

    # Seed the persistent audit log (meta.json + status.json + a "registered"
    # event). Best-effort inside init_job_log — never blocks registration.
    mqtt_common.init_job_log(job_id, meta=record)
    logger.info("registered job %s (agent=%s session=%s)", job_id, agent, agent_session)
    return job_id
def pick_pending(agent_session: str, registry_dir: str = DEFAULT_REGISTRY_DIR) -> Optional[str]:
    """Claim the oldest ``pending`` job owned by ``agent_session``.

    The record is flipped to ``running`` while the registry lock is held, so
    two sessions can never grab the same job and each tmux session only takes
    its own work.

    Returns:
        The claimed job id, or None when no pending job matches.
    """
    with registry_lock(registry_dir):
        waiting = [
            rec
            for rec in _iter_records(registry_dir)
            if rec.get("status") == "pending" and rec.get("agent_session") == agent_session
        ]
        if not waiting:
            return None
        oldest = sorted(waiting, key=lambda rec: rec.get("created_at", ""))[0]
        oldest["status"] = "running"
        oldest["updated_at"] = _utcnow()
        _atomic_write_record(oldest["job_id"], registry_dir, oldest)
        logger.info("session %s picked job %s", agent_session, oldest["job_id"])
        job_id, updated_at = oldest["job_id"], oldest["updated_at"]

    # The record was written directly rather than via update_job_status, so the
    # pending->running transition is mirrored into the audit log here. Best-effort.
    mqtt_common.update_logged_status(job_id, "running", updated_at=updated_at)
    transition = {
        "event": "status_changed",
        "from": "pending",
        "to": "running",
        "by": agent_session,
        "timestamp": updated_at,
    }
    mqtt_common.append_event(job_id, transition)
    return job_id
def update_status(job_id: str, registry_dir: str, status: str) -> Dict[str, Any]:
    """Validate ``status`` and delegate the update to ``mqtt_common.update_job_status``.

    Returns:
        Whatever ``mqtt_common.update_job_status`` returns.

    Raises:
        ValueError: ``status`` is not one of ``VALID_STATUSES``.
    """
    if status in VALID_STATUSES:
        return mqtt_common.update_job_status(job_id, registry_dir, status=status)
    raise ValueError(f"invalid status {status!r}; expected one of {VALID_STATUSES}")
def list_jobs(registry_dir: str = DEFAULT_REGISTRY_DIR, status: Optional[str] = None) -> List[Dict[str, Any]]:
    """Return the job records under ``registry_dir`` ordered by ``created_at``.

    When ``status`` is truthy, only records with exactly that status are kept.
    """
    selected = [
        rec
        for rec in _iter_records(registry_dir)
        if not status or rec.get("status") == status
    ]
    return sorted(selected, key=lambda rec: rec.get("created_at", ""))
def append_event(job_id: str, registry_dir: str, payload: Dict[str, Any]) -> None:
    """Append ``payload`` as one JSON line to ``<registry_dir>/<job_id>.events.log``.

    The registry directory is created on demand. This log is debug-only and
    best-effort: an OSError is logged as a warning and never reaches the caller.
    """
    try:
        directory = Path(registry_dir)
        directory.mkdir(parents=True, exist_ok=True)
        target = directory / f"{job_id}.events.log"
        with target.open("a", encoding="utf-8") as sink:
            sink.write(f"{json.dumps(payload, ensure_ascii=False)}\n")
    except OSError as exc:  # pragma: no cover - best effort
        logger.warning("could not append event for %s: %s", job_id, exc)
# Public API of this module. ``load_job`` is imported from mqtt_common above and
# listed here as a convenience re-export so callers can `from registry import load_job`.
__all__ = [
"register_job", "pick_pending", "update_status", "load_job",
"list_jobs", "append_event", "generate_job_id",
]
def _iter_records(registry_dir: str):
base = Path(registry_dir)
if not base.exists():
return
for path in sorted(base.glob("*.json")):
try:
with open(path, "r", encoding="utf-8") as fh:
yield json.load(fh)
except (OSError, json.JSONDecodeError) as exc:
logger.warning("skipping unreadable record %s: %s", path, exc)
# --------------------------------------------------------------------------
# CLI (so the bash wrapper can shell out without inline python)
# --------------------------------------------------------------------------
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the argparse tree for the registry CLI.

    Global option: ``--registry-dir``. Subcommands (stored in ``args.command``):
    ``register``, ``list``, ``get``, ``status``, ``pick`` and ``logs``.
    """
    root = argparse.ArgumentParser(description="multi-agent-mux-delegate-job registry CLI")
    root.add_argument("--registry-dir", default=DEFAULT_REGISTRY_DIR)
    commands = root.add_subparsers(dest="command", required=True)

    # register -------------------------------------------------------------
    register_cmd = commands.add_parser("register", help="create a pending job; prints the job id")
    register_cmd.add_argument("--prompt", required=True)
    for flag, fallback in (("--agent", "claude-code"), ("--agent-session", "tmux:claude")):
        register_cmd.add_argument(flag, default=fallback)
    for flag, fallback in (("--timeout", 3600), ("--idle-timeout", 120)):
        register_cmd.add_argument(flag, type=int, default=fallback)
    register_cmd.add_argument("--bits", type=int, default=32, help="32 (PoC) or 128 (prod)")
    register_cmd.add_argument("--artifact", action="append", default=[], dest="artifacts")
    register_cmd.add_argument("--auth-token", default=None, help="HMAC auth token for the job (auto-generated if secure broker is detected)")

    # list -----------------------------------------------------------------
    list_cmd = commands.add_parser("list", help="list jobs (optionally by status)")
    list_cmd.add_argument("--status", default=None)
    list_cmd.add_argument("--json", action="store_true")

    # get ------------------------------------------------------------------
    get_cmd = commands.add_parser("get", help="print one job record as JSON")
    get_cmd.add_argument("--job", required=True)

    # status ---------------------------------------------------------------
    status_cmd = commands.add_parser("status", help="set a job status")
    status_cmd.add_argument("--job", required=True)
    status_cmd.add_argument("--set", required=True, dest="status")

    # pick -----------------------------------------------------------------
    pick_cmd = commands.add_parser("pick", help="claim a pending job for a session; prints id")
    pick_cmd.add_argument("--agent-session", default="tmux:claude")

    # logs -----------------------------------------------------------------
    logs_cmd = commands.add_parser(
        "logs",
        help="show the persistent audit log for a job, or --list every logged job",
    )
    logs_cmd.add_argument(
        "job_id", nargs="?", default=None,
        help="job id whose events.ndjson to print",
    )
    logs_cmd.add_argument(
        "--list", action="store_true", dest="list_all",
        help="summarise every job under the logs dir instead",
    )
    logs_cmd.add_argument(
        "--logs-dir", default=None,
        help="override the audit-log root (default: $DELEGATE_JOB_LOGS_DIR "
             "or <cwd>/.mam/delegate_job_logs)",
    )
    logs_cmd.add_argument(
        "--tail", type=int, default=0,
        help="show only the last N events (0 = all)",
    )
    logs_cmd.add_argument(
        "--json", action="store_true",
        help="emit raw JSON lines / records instead of a table",
    )
    return root
def main(argv: Optional[List[str]] = None) -> int:
    """Registry CLI entry point; dispatches on the chosen subcommand.

    Args:
        argv: argument list (defaults to ``sys.argv[1:]`` via argparse).

    Returns:
        Process exit code:
          0  success
          1  error (unknown job, invalid status, unknown command)
          3  ``pick`` found no pending job for the session
    """
    mqtt_common.setup_logging(logging.INFO)
    args = _build_parser().parse_args(argv)
    rd = args.registry_dir

    if args.command == "register":
        job_id = register_job(
            prompt=args.prompt,
            agent=args.agent,
            agent_session=args.agent_session,
            timeout_sec=args.timeout,
            idle_timeout_sec=args.idle_timeout,
            registry_dir=rd,
            expected_artifacts=args.artifacts,
            bits=args.bits,
            auth_token=args.auth_token,
        )
        print(job_id)
        return 0

    if args.command == "list":
        records = list_jobs(rd, status=args.status)
        if args.json:
            print(json.dumps(records, ensure_ascii=False, indent=2))
        else:
            if not records:
                print("(no jobs)")
            for r in records:
                # Null-safe lookups, matching _cmd_logs: one incomplete record
                # (missing job_id, null status/prompt) must not abort the
                # whole listing with a KeyError/TypeError.
                print(f"{r.get('job_id') or '?'} {str(r.get('status') or '?'):10s} {r.get('agent_session') or ''}"
                      f" {str(r.get('prompt') or '')[:48]}")
        return 0

    if args.command == "get":
        try:
            print(json.dumps(load_job(args.job, rd), ensure_ascii=False, indent=2))
        except FileNotFoundError as exc:
            print(str(exc), file=sys.stderr)
            return 1
        return 0

    if args.command == "status":
        try:
            update_status(args.job, rd, args.status)
        except (FileNotFoundError, ValueError) as exc:
            print(str(exc), file=sys.stderr)
            return 1
        return 0

    if args.command == "pick":
        job_id = pick_pending(args.agent_session, rd)
        if job_id is None:
            return 3  # no pending job for this session
        print(job_id)
        return 0

    if args.command == "logs":
        return _cmd_logs(args)

    return 1
def _cmd_logs(args) -> int:
    """Implement the ``logs`` subcommand.

    With ``--list`` every logged job under the logs root is summarised;
    otherwise the events of ``args.job_id`` are printed, optionally limited to
    the last ``--tail`` entries. ``--json`` switches both modes to raw JSON.

    Returns:
        0 on success; 1 when no job id was given or the job has no audit log.
    """
    root = args.logs_dir or mqtt_common.LOGS_DIR

    if args.list_all:
        summaries = mqtt_common.list_logged_jobs(root)
        if args.json:
            print(json.dumps(summaries, ensure_ascii=False, indent=2))
        elif not summaries:
            print(f"(no logged jobs under {root})")
        else:
            for meta in summaries:
                print(f"{meta.get('job_id','?')} {meta.get('status','?'):10s} "
                      f"{meta.get('created_at','-'):20s} {(meta.get('prompt') or '')[:48]}")
        return 0

    if not args.job_id:
        print("logs requires a <job_id> or --list", file=sys.stderr)
        return 1

    entries = list(mqtt_common.iter_logged_events(args.job_id, root))
    # An empty event list is an error only when the job's log directory is missing too.
    if not entries and not mqtt_common.job_log_dir(args.job_id, root).exists():
        print(f"no audit log for job {args.job_id} under {root}", file=sys.stderr)
        return 1

    if args.tail and args.tail > 0:
        entries = entries[-args.tail:]

    if args.json:
        for entry in entries:
            print(json.dumps(entry, ensure_ascii=False))
        return 0

    for entry in entries:
        when = entry.get("logged_at") or entry.get("timestamp") or "-"
        summary = entry.get("detail") or entry.get("to") or entry.get("source_event") or ""
        print(f"{when:24s} {entry.get('event','?'):<16s} {summary}")
    return 0
if __name__ == "__main__":
sys.exit(main())
@@ -0,0 +1,236 @@
---
name: multi-agent-mux-monitor
description: "Run a long-lived Kanban worker that polls .mam/agent-sessions.yaml against the actual tmux/agent runtime state and reconciles them. Use when you want live visibility into which agent sessions are running, which are dead, which have stale YAML entries, and which have new session ids that haven't been recorded yet. Designed to be dispatched as a Kanban goal_mode task (--goal) so it keeps running until the user stops it."
version: 1.0.0
author: godopu
license: MIT
platforms: [linux, macos]
environments: [kanban, terminal, tmux]
metadata:
hermes:
tags: [agent, tmux, claude, antigravity, agy, monitor, kanban, observation, reconciliation]
related_skills: [multi-agent-mux-create, multi-agent-mux-resume, multi-agent-mux-stop, kanban-orchestrator]
prereq_skills: [kanban-worker, multi-agent-mux-create]
---
# Agent Sessions Monitor — Live Reconciliation via Kanban Worker
> **Companion skills**: `multi-agent-mux-create` / `multi-agent-mux-resume` / `multi-agent-mux-stop` (mutators); this skill is the **observer**.
> **Single source of truth**: `./.mam/agent-sessions.yaml`.
## What this skill does
Dispatch a **Kanban worker** (in `goal_mode`) that:
1. Every ~30s polls the actual state of:
- `tmux ls` (which sessions are alive)
- `tmux list-panes -t <session> ...` (pane cmd, cwd, pid)
- `~/.claude/projects/<workspace-key>/*.jsonl` mtime + first-line sessionId
- `~/.gemini/antigravity-cli/cache/last_conversations.json` (agy workspace → conversation mapping)
- `~/.gemini/antigravity-cli/conversations/<uuid>.db` mtime (agy)
2. Compares the live state to `agent-sessions.yaml`
3. Detects 4 classes of drift (detailed under "Drift classes" below):
- **yaml-only running, tmux dead**: YAML says `running`, tmux is gone → mark `terminated` with timestamp
- **tmux-only running, not in YAML**: tmux session exists with `<workspace>-creator-*` naming but YAML doesn't know about it → register as a new entry
- **new session id**: YAML has a null `claude_session_id_own`, but a new `*.jsonl` has appeared on disk → record the session id in the YAML
- **stale UUID**: YAML has a UUID, but the on-disk artifact is gone → flag in comment
Rows where tmux is dead and YAML already says `terminated`, `archived`, or `stopped` are **not** drift: they are deliberate end states and are left untouched.
4. Writes a Kanban `kanban_comment` on every drift event with diff details
5. Heartbeat every 5 minutes
6. **Goal loop**: after each turn, the judge (an auxiliary model) re-checks the card against its body to decide whether monitoring is still wanted. When the user says "stop monitoring" via comment, the worker blocks with `reason=stop-requested`.
## When to use
- You have multiple workspaces with tmux agent sessions and want a single source of truth
- You suspect YAML drift after a host reboot / crash
- You want a notification when a session id was just created (so you can record it before next restart)
- You're running multi-day work and want to know "what's actually running right now"
## When NOT to use
- One-off interactive session — just check `tmux ls` and read the YAML
- A single, short session — overhead > benefit
- You don't have a Kanban dispatcher running
## Dispatching the monitor
```bash
# Goal-mode task: keeps running until the user signals stop
hermes kanban create \
--title "agent-sessions monitor (live reconcile)" \
--assignee default \
--workspace worktree \
--branch wt/multi-agent-mux-monitor \
--goal \
--goal-max-turns 100 \
--max-runtime 8h \
--max-retries 1 \
--skill multi-agent-mux-monitor \
--body "$(cat <<'EOF'
You are the agent-sessions monitor. Every 30 seconds, do:
1. Read .mam/agent-sessions.yaml
2. Run `tmux ls` and `tmux list-panes -F 'session=#{session_name} pid=#{pane_pid} cmd=#{pane_current_command} cwd=#{pane_current_path}'`
3. For each session in the YAML, check the corresponding tmux state
4. For each tmux session matching `*-creator-claude` or `*-creator-agy` that's not in the YAML, register it
5. For any drift, call `kanban_comment` with the diff
6. Sleep 30 seconds, then repeat
If the user comments `stop` or `stop monitoring` on this card, call `kanban_block(reason="stop-requested by user")`.
If you find that a Claude session's `claude_session_id_own` is null but there's a new *.jsonl in the project dir, read the sessionId from the first line and update the YAML.
Use the helper script at .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh for the YAML updates — it handles all the merge logic and writes a structured comment to this card.
EOF
)"
```
## Helper script: `reconcile.sh`
The worker calls this script every 30s. It:
1. Diffs YAML ↔ tmux ↔ disk artifacts
2. Updates YAML if needed (only when changes are real, not on every poll — avoids spamming)
3. Emits a JSON diff to stdout that the worker turns into a `kanban_comment`
```bash
# Reconcile + auto-update YAML (atomic, flock-guarded). Emits JSON drift to stdout.
bash .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh --once --emit-diff
# Read-only: compute drift WITHOUT writing the YAML (use for "what's running?" checks).
bash .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh --once --emit-diff --dry-run
# Push-based MQTT Monitor: listen to delegated job events on the broker and update the YAML instantly.
# Bounded run that exits after 5 min idle, or 1 h wall-clock; falls back to polling if the broker is down.
bash .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh --subscribe --idle-timeout 300 --timeout 3600
# Persistent monitor (no timeouts): runs until interrupted; still polls if the broker is unreachable.
bash .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh --subscribe --idle-timeout 0
```
Flags: `--once` (single pass), `--emit-diff` (print JSON), `--dry-run` (P1-E — no mutation), `--subscribe` (push-based MQTT subscription monitoring). `--subscribe` sub-flags: `--timeout N` (exit after N seconds of wall-clock; `0` = no limit, default), `--idle-timeout N` (exit after N seconds with no message; default `3600`, `0` = never idle-out). On a broker connection failure (connect error **or** non-zero CONNACK), `--subscribe` falls back to a polling loop that re-runs `--once --emit-diff` every `RECONCILE_POLL_INTERVAL` (default 15) seconds until `--timeout`. Terminal-event YAML updates are written through `lib.sh::atomic_dump_yaml` (flock + schema-validate + `.bak`). There are **no** `--workspace` / `--agent` / `--comment-card` flags; the worker turns the emitted JSON `drifts[]` into `kanban_comment` calls itself.
## Drift classes (what the script handles)
### Status Enum
The `status` and `last_visible_status` fields MUST be one of the following exact strings: `running`, `stopped`, `terminated`, `archived`.
Any unstructured comments or reasons for the status change should be placed in `last_visible_note` or `termination_mode`.
### A. tmux dead, YAML says running → auto-terminate
```
YAML: status=running, pane.pid=201132, cmd=claude
tmux: no session
→ set status=terminated, terminated_at=<now>, termination_mode=auto-detected
→ comment: "lab-landing-page-creator-claude: tmux gone (was pane 201132, cmd claude). Marked terminated."
```
**Skip-set**: the auto-terminate only fires for sessions whose status is `running`.
Rows already in a deliberate end state — `terminated`, `archived`, or **`stopped`**
(set by `multi-agent-mux-stop`) — are
left untouched. This is critical: a `stopped` row keeps its `resumable: true` and
captured `*_session_id_own`, so the monitor must **not** overwrite it with
`terminated ("auto-detected")` when its tmux is (expectedly) gone.
### B. tmux alive, not in YAML → auto-register
```
tmux: session=lab-paper-pdf2md-creator-agy, pid=...,
cmd=agy, cwd=$WORKSPACE_ROOT/paper-pdf2md
YAML: no such session
→ register as new entry: status=running, last_visible_status=running, last_visible_note=auto-registered
→ comment: "lab-paper-pdf2md-creator-agy: tmux found but not in YAML. Auto-registered."
```
### C. New session id materializes (claude first message sent)
```
YAML: claude_session_id_own=null (placeholder)
disk: ~/.claude/projects/.../b3a7...c2f.jsonl exists, mtime=now,
first line sessionId=b3a7...c2f
→ update claude_session_id_own=b3a7...c2f
→ comment: "lab-landing-page-creator-claude: session id materialized b3a7...c2f"
```
### D. Stale UUID (artifact gone)
```
YAML: agent_identities.claude.session_id=87dc548e-...
disk: ~/.claude/projects/.../87dc548e-...jsonl: missing
→ flag in comment, but DO NOT delete from YAML
(the user may have moved the file or the disk may be temporarily unavailable;
only `--purge-conversation` should remove the id)
```
## Pitfalls
- **Don't run the monitor without `--goal`** — without goal mode, the worker spawns, runs a single reconcile turn, and completes. Goal mode keeps the worker alive across many turns.
- **The 30s poll is a default** — workers may override if they detect heavy churn. A workspace with 5+ agent sessions should bump to 60s to avoid noise.
- **`kanban_comment` rate limits** — Kanban may throttle if you comment too fast. Coalesce: only comment when the diff is *new* (not the same drift on every poll). The script tracks a state file at `.cache/multi-agent-mux-monitor/<workspace>.state` in the workspace root for this (overridable via `AGENT_SESSIONS_STATE_DIR`).
- **Don't fight the user's explicit action** — if `multi-agent-mux-stop` is mid-flight and the monitor sees the same session in two states within 5s, prefer the user's most recent action. The monitor should not auto-revert a fresh `terminated` to `running` because of a stale `tmux has-session` check.
- **The monitor should never modify the conversation artifacts** (jsonl, db) — only the YAML. If you see a stale UUID, comment about it but don't delete the file.
- **TUI capture-pane is expensive** — only capture when you need to update `last_visible_status`, not every poll.
## Worker body template (for `hermes kanban create --body`)
The `--body` of the dispatched task IS the worker's behavior spec. Here's a tested template:
```markdown
# agent-sessions monitor
## Loop (every 30s)
1. Read agent-sessions.yaml
2. Bash: `bash .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh --emit-diff`
3. Parse the JSON diff from stdout
4. If `drifts` is non-empty:
- For each drift, call `kanban_comment` with the diff message
5. Bash: `sleep 30`
6. Heartbeat every 5 min: `kanban_heartbeat(progress="alive, N drifts detected, last at <time>")`
## Stop condition
If `$HERMES_KANBAN_TASK` card has any comment containing "stop" or "stop monitoring" from a user:
- Call `kanban_block(reason="stop-requested by user at <timestamp>")`
## Drift responses
- A. tmux dead + YAML running: auto-terminate YAML, comment
- B. tmux alive not in YAML: auto-register, comment
- C. New session id from *.jsonl: update YAML, comment
- D. Stale UUID: comment only, no YAML change
## Hard rules
- Do NOT modify conversation artifacts (jsonl, db, brain/)
- Do NOT spawn/delete tmux sessions — that's the create/delete skills' job
- Do NOT call multi-agent-mux-create or multi-agent-mux-stop — only the user initiates those
- Do NOT call `git commit` / `git push`
```
## Security: --subscribe on Public Brokers
When using `--subscribe` with the default PoC public broker
(`broker.hivemq.com:1883`), be aware that:
1. **Wildcard subscription** means anyone can publish events to your job topics.
2. **Auto-kill on terminal events** means a spoofed `completed` or `error`
event from a third party can terminate your agent session.
3. **Mitigation**: Use `--subscribe` only on private TLS-enabled brokers
(production mode). For PoC, prefer polling-based monitor (`--once` or
no `--subscribe`) which reads YAML/tmux state directly without MQTT.
4. **HMAC verification**: Events are now verified via `verify_hmac()` in
`mqtt_common.py` (see FW-05). Ensure `auth_token` is set for each job
to enable signature validation — unauthenticated events will be dropped.
## Verification (one-shot)
```bash
# Run reconcile once and inspect output
bash .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh --emit-diff --once \
| python3 -m json.tool
```
## Related skills
- `kanban-worker` — base lifecycle for the dispatched worker
- `kanban-orchestrator` — if you want to dispatch this monitor *from* an orchestrator, use this to know how to phrase the body
+542
View File
@@ -0,0 +1,542 @@
#!/usr/bin/env bash
# reconcile.sh — helper script for the multi-agent-mux-monitor skill.
# Detects drift between the YAML, tmux, and on-disk artifacts (and
# auto-updates the YAML).
#
# Usage:
# bash reconcile.sh --once --emit-diff # detect drift + update the YAML
# bash reconcile.sh --once --emit-diff --dry-run # compute drift only, no writes (P1-E)
# bash reconcile.sh --subscribe [--timeout N] [--idle-timeout N]
#
# --dry-run: read-only, no side effects. Safe for "what's running right now?"
# questions. Reused by the multi-agent-mux-status skill.
#
# Output (JSON): {timestamp, yaml_path, tmux_sessions_alive, tmux_confirmed, drifts, actions}
#
# Exit codes: 0 = ok | 1 = YAML not found | 2 = error
set -euo pipefail
# Shared library two directories above this script; it defines WORKSPACE_ROOT
# and AGENT_SESSIONS_YAML, which are used below.
source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
# State directory, overridable via AGENT_SESSIONS_STATE_DIR.
STATE_DIR="${AGENT_SESSIONS_STATE_DIR:-$WORKSPACE_ROOT/.cache/multi-agent-mux-monitor}"
# Flag defaults (each is 0/1, set by the matching CLI flag).
ONCE=0
EMIT_DIFF=0
DRY_RUN=0
SUBSCRIBE=0
# --subscribe controls (review item 4): 0 = no overall timeout; idle default 3600s
# (raised from 600s to align with job timeout defaults); idle 0 = never idle-out.
SUB_TIMEOUT=0
SUB_IDLE_TIMEOUT=3600
POLL_INTERVAL="${RECONCILE_POLL_INTERVAL:-15}"

#######################################
# Validate that a flag is followed by a non-negative integer value.
# Without this, a trailing `--timeout` trips `set -u` (exit 1, which is the
# documented "YAML not found" code) and a non-numeric value only fails later
# inside the Python subscriber's int() conversion.
# Arguments: $1 - flag name, $2 - its value (may be absent)
# Outputs:   error message on stderr when the value is missing or invalid
# Returns:   0 when valid; exits the script with 2 otherwise
#######################################
_require_seconds() {
  if [ $# -lt 2 ] || ! [[ "$2" =~ ^[0-9]+$ ]]; then
    echo "ERROR: $1 requires a non-negative integer (seconds)" >&2
    exit 2
  fi
}

#######################################
# Parse the command line into the flag variables above.
# Globals:   ONCE, EMIT_DIFF, DRY_RUN, SUBSCRIBE, SUB_TIMEOUT,
#            SUB_IDLE_TIMEOUT (written)
# Arguments: the script's command-line arguments
# Outputs:   usage on stdout for -h/--help; errors on stderr
# Returns:   0 on success; exits 0 after --help, 2 on an invalid argument
#######################################
_parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --once) ONCE=1; shift ;;
      --emit-diff) EMIT_DIFF=1; shift ;;
      --dry-run) DRY_RUN=1; shift ;;
      --subscribe) SUBSCRIBE=1; shift ;;
      --timeout) _require_seconds "$@"; SUB_TIMEOUT="$2"; shift 2 ;;
      --idle-timeout) _require_seconds "$@"; SUB_IDLE_TIMEOUT="$2"; shift 2 ;;
      -h|--help) echo "Usage: $0 [--once] [--emit-diff] [--dry-run] [--subscribe [--timeout N] [--idle-timeout N]]"; exit 0 ;;
      *) echo "ERROR: unknown arg: $1" >&2; exit 2 ;;
    esac
  done
}

_parse_args "$@"
# The original inline loop consumed every positional parameter; keep that
# post-condition for any code that runs after parsing.
set --
[ -f "$AGENT_SESSIONS_YAML" ] || { echo "ERROR: $AGENT_SESSIONS_YAML not found" >&2; exit 1; }
if [ "$SUBSCRIBE" = "1" ]; then
# Paths resolved relative to this script (review item 6): skills/ dir + lib.sh.
SKILLS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
LIB_SH="$SKILLS_DIR/lib.sh"
# MQTT client lives in the project venv (has paho). All YAML work is delegated
# to lib.sh::atomic_dump_yaml, which runs the system python3 (has PyYAML) — so
# no single interpreter needs both paho and PyYAML (review items 4/5/6).
PYBIN="$(_delegate_py_bin)"
# The MQTT subscribe loop exits 3 to signal "broker unavailable → poll instead".
set +e
YAML_PATH="$AGENT_SESSIONS_YAML" HOME_DIR="$HOME_DIR" CLAUDE_PROJECT_DIR="$CLAUDE_PROJECT_DIR" LOCAL_BIN="$LOCAL_BIN" \
WORKSPACE_ROOT="$WORKSPACE_ROOT" SUB_TIMEOUT="$SUB_TIMEOUT" SUB_IDLE_TIMEOUT="$SUB_IDLE_TIMEOUT" \
SKILLS_DIR="$SKILLS_DIR" LIB_SH="$LIB_SH" \
"$PYBIN" - <<'PYEOF'
import os, sys, json, time, subprocess
lib_sh = os.environ.get('LIB_SH', '')
skills_dir = os.environ.get('SKILLS_DIR', '')
yaml_path = os.environ.get('YAML_PATH', '')
workspace_root = os.environ.get('WORKSPACE_ROOT', '')
timeout = int(os.environ.get('SUB_TIMEOUT', '0') or '0') # 0 = no overall timeout
idle_timeout = int(os.environ.get('SUB_IDLE_TIMEOUT', '3600') or '0') # 0 = no idle timeout
# Prevent duplicate wildcard subscribers for this workspace (concurrency race)
import fcntl
lock_file_path = os.path.join(workspace_root or '.', '.mam', 'monitor.lock')
try:
os.makedirs(os.path.dirname(lock_file_path), exist_ok=True)
lock_file = open(lock_file_path, 'w')
fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB)
except BlockingIOError:
print("MQTT Monitor: another subscriber is already running for this workspace. Exiting.", flush=True)
sys.exit(0)
except Exception as e:
print(f"MQTT Monitor: failed to acquire monitor lock ({e}). Exiting.", flush=True)
sys.exit(1)
# Locate skills/multi-agent-mux-delegate-job/scripts to import mqtt_common — relative first, then
# an upward walk from cwd. No hardcoded absolute path (review item 6).
cand = os.path.join(skills_dir, 'multi-agent-mux-delegate-job', 'scripts') if skills_dir else ''
if cand and os.path.isdir(cand):
sys.path.append(cand)
else:
d = os.getcwd()
while d and d != '/':
hit = None
for sub in (('.agents', 'skills', 'multi-agent-mux-delegate-job', 'scripts'), ('skills', 'multi-agent-mux-delegate-job', 'scripts'), ('multi-agent-mux-delegate-job', 'scripts')):
p = os.path.join(d, *sub)
if os.path.isdir(p):
hit = p
break
if hit:
sys.path.append(hit)
break
d = os.path.dirname(d)
import mqtt_common
import registry
# Mutation source executed INSIDE lib.sh::atomic_dump_yaml (system python3 +
# PyYAML), under the YAML flock with schema-validate + .bak (review item 5).
# Marks matching running sessions terminated and kills their tmux (review item 3
# behaviour preserved), or aborts the write entirely when nothing matches. The
# untrusted MQTT job id / event arrive via env (MQTT_JID / MQTT_EVENT) — never
# spliced into source (P1-B).
# A matching row without a usable name is still marked terminated, but its tmux
# kill is skipped: a None element in argv makes subprocess.run raise, which would
# abort the whole mutation and leave every other matching row untouched.
_MUTATION = r'''
import os, subprocess
from datetime import datetime, timezone
_jid = os.environ['MQTT_JID']
_event = os.environ['MQTT_EVENT']
_now = datetime.now(timezone.utc)
_changed = False
for s in d.get('tmux_sessions', []):
    if s.get('delegate_job_id') == _jid and s.get('status') == 'running':
        s['status'] = 'terminated'
        s['terminated_at'] = _now.strftime('%Y-%m-%dT%H:%M:%SZ')
        s['terminated_at_epoch'] = int(_now.timestamp())
        s['termination_mode'] = 'auto-detected (MQTT ' + _event + ')'
        _name = s.get('name')
        _srv = str(s.get('tmux_server') or 'default')
        if _name:
            _cmd = ['tmux'] + (['-L', _srv] if _srv != 'default' else []) + ['kill-session', '-t', str(_name)]
            subprocess.run(_cmd, capture_output=True)
            print('MQTT Monitor: terminated + killed ' + str(_name) + ' on ' + str(_srv), flush=True)
        else:
            print('MQTT Monitor: terminated a session row without a name (no tmux kill)', flush=True)
        _changed = True
if not _changed:
    raise SystemExit(0)  # nothing matched — skip the write entirely
'''
def handle_terminal(jid, event):
    """Apply the terminal-event mutation for one delegate job through lib.sh.

    Spawns ``bash`` to source ``$LIB_SH`` and run ``atomic_dump_yaml`` on
    ``$YAML_PATH``, feeding ``_MUTATION`` on stdin. ``jid`` and ``event`` travel
    only through the environment (MQTT_JID / MQTT_EVENT), never through the
    command string. Returns None; the outcome is reported on stdout.
    """
    if not lib_sh or not os.path.isfile(lib_sh):
        print('MQTT Monitor: lib.sh not found, cannot update YAML', flush=True)
        return
    env = dict(os.environ)
    env['MQTT_JID'] = jid
    env['MQTT_EVENT'] = event
    cmd = ['bash', '-c',
           'source "$LIB_SH"; atomic_dump_yaml "$YAML_PATH" MQTT_JID="$MQTT_JID" MQTT_EVENT="$MQTT_EVENT"']
    r = subprocess.run(cmd, input=_MUTATION, text=True, env=env, capture_output=True)
    out = (r.stdout or '').strip()
    if out:
        print(out, flush=True)
    if r.returncode != 0:
        err = (r.stderr or '').strip()
        if err:
            print('MQTT Monitor: atomic_dump_yaml stderr: ' + err, flush=True)
        else:
            # A failure with nothing on stderr used to go completely unreported.
            print('MQTT Monitor: atomic_dump_yaml failed with exit code '
                  + str(r.returncode) + ' (no stderr)', flush=True)
# Subscriber state shared between the MQTT callbacks and the main loop:
# last_msg feeds the idle timeout, connected/failed record the CONNACK outcome.
state = {'last_msg': time.time(), 'connected': False, 'failed': False}
# Highest accepted sequence number per job id (in memory only).
last_seqs = {}


def _jobs_registry_dir():
    """Directory handed to registry.load_job: <workspace>/.mam/jobs when the
    workspace root is known, else a ``jobs`` dir beside the YAML, else '.mam/jobs'."""
    if workspace_root:
        return os.path.join(workspace_root, '.mam', 'jobs')
    parent = os.path.dirname(yaml_path) if yaml_path else ""
    return os.path.join(parent, 'jobs') if parent else '.mam/jobs'


def on_message(_client, _userdata, msg):
    """MQTT message callback.

    Refreshes the idle timer, then accepts an event only if its job is in the
    local registry, mqtt_common.verify_hmac accepts it against the job's
    auth_token, and its integer ``seq`` is strictly greater than the last one
    accepted for that job. Accepted events are appended to the audit trail;
    "completed" / "error" additionally trigger handle_terminal. Any exception is
    reported on stdout and swallowed so one bad message does not end the callback.
    """
    state['last_msg'] = time.time()
    try:
        body = json.loads(msg.payload.decode("utf-8"))
        job_id, kind = body.get("job_id"), body.get("event")
        if not (job_id and kind):
            return
        registry_dir = _jobs_registry_dir()
        try:
            job = registry.load_job(job_id, registry_dir)
        except FileNotFoundError:
            # Silently ignore events for jobs not in the local registry
            return
        if not mqtt_common.verify_hmac(body, job.get("auth_token")):
            print(f"MQTT Monitor: drop event for job {job_id}: HMAC verify failed", flush=True)
            return
        seq = body.get("seq")
        if not isinstance(seq, int):
            print(f"MQTT Monitor: drop event for job {job_id}: missing or invalid seq", flush=True)
            return
        previous = last_seqs.get(job_id, 0)
        if seq <= previous:
            print(f"MQTT Monitor: drop event for job {job_id}: seq {seq} not monotonic (last {previous})", flush=True)
            return
        last_seqs[job_id] = seq
        # Append the event to events.ndjson audit trail
        record = {
            "event": "received",
            "source_event": kind,
            "seq": seq,
            "topic": msg.topic,
            "timestamp": body.get("timestamp"),
            "detail": body.get("detail", ""),
        }
        mqtt_common.append_event(job_id, record)
        print(f"MQTT Monitor: recorded event {kind} for job {job_id} (seq={seq})", flush=True)
        if kind in ("completed", "error"):
            print(f"MQTT Monitor: received terminal event {kind} for job {job_id}", flush=True)
            handle_terminal(job_id, kind)
    except Exception as e:
        print(f"MQTT Monitor error parsing message: {e}", flush=True)
def on_connect(_c, _u, _flags, reason_code, _props):
    """MQTT connect callback: subscribe on success, flag the failure otherwise.

    The reason code is normalised with mqtt_common.reason_code_value; 0 is
    treated as success. The outcome is recorded in ``state`` for the main loop.
    """
    rc = mqtt_common.reason_code_value(reason_code)
    if rc != 0:
        state['failed'] = True
        print(f"MQTT Monitor connection failed: rc={rc}", flush=True)
        return
    state['connected'] = True
    topic = "python/mqtt/jobs/+/events"
    _c.subscribe(topic, qos=1)
    print(f"MQTT Monitor: subscribed to {topic}", flush=True)
cfg = mqtt_common.broker_config_from_env()
client = mqtt_common.make_client("monitor_sub", cfg)
client.on_message = on_message
client.on_connect = on_connect
print(f"MQTT Monitor: connecting to {cfg.host}:{cfg.port} (TLS={cfg.tls})...", flush=True)
# Connection failure → fall back to polling (review item 4).
try:
client.connect(cfg.host, cfg.port, cfg.keepalive)
except Exception as e:
print(f"MQTT Monitor: connect failed ({e}); falling back to polling", flush=True)
sys.exit(3)
client.loop_start()
_wait = time.time()
while time.time() - _wait < 5 and not state['connected'] and not state['failed']:
time.sleep(0.1)
if not state['connected']:
print("MQTT Monitor: broker did not accept connection; falling back to polling", flush=True)
client.loop_stop()
sys.exit(3)
start = time.time()
try:
while True:
now = time.time()
if timeout and (now - start) >= timeout:
print(f"MQTT Monitor: --timeout {timeout}s reached, exiting", flush=True)
break
if idle_timeout and (now - state['last_msg']) >= idle_timeout:
print(f"MQTT Monitor: --idle-timeout {idle_timeout}s reached, exiting", flush=True)
break
time.sleep(0.5)
finally:
client.loop_stop()
try:
client.disconnect()
except Exception:
pass
sys.exit(0)
PYEOF
sub_rc=$?
set -e
if [ "$sub_rc" = "3" ]; then
echo "MQTT Monitor: broker unavailable — falling back to polling (interval ${POLL_INTERVAL}s)" >&2
_self="$SKILLS_DIR/multi-agent-mux-monitor/scripts/reconcile.sh"
_start=$(date +%s)
while :; do
bash "$_self" --once --emit-diff >/dev/null 2>&1 || true
if [ "$SUB_TIMEOUT" != "0" ] && [ "$(( $(date +%s) - _start ))" -ge "$SUB_TIMEOUT" ]; then
break
fi
sleep "$POLL_INTERVAL"
done
fi
exit 0
fi
mkdir -p "$STATE_DIR"
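# All comparison logic lives in a single source. A dry run executes it through
# env_python (read-only); otherwise the same source runs through atomic_dump_yaml
# (flock + temp+rename). Under the atomic wrapper, an empty 'actions' list raises
# SystemExit(0) so the write is skipped (avoids needless reformatting).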
read -r -d '' RECON_SRC <<'PYEOF' || true
import os, json, glob, subprocess, time
from datetime import datetime, timezone
import yaml
yaml_path = os.environ['YAML_PATH']
home = os.environ['HOME_DIR']
claude_project_dir = os.environ.get('CLAUDE_PROJECT_DIR', f"{home}/.claude/projects")
now_iso = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
# Under the atomic wrapper ``d`` is already loaded. Under env_python (dry-run) it
# is not, so load it here: from the SQLite db beside the YAML when that file
# exists, otherwise from the YAML itself.
try:
    d
except NameError:
    import sqlite3
    import sys
    db_path = os.path.splitext(yaml_path)[0] + '.db'
    d = {}
    try:
        if os.path.exists(db_path):
            conn = sqlite3.connect(db_path, timeout=10.0)
            try:
                row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
                if row:
                    d = json.loads(row[0])
                try:
                    db_sessions = []
                    cursor = conn.execute('SELECT data FROM sessions')
                    for s_row in cursor.fetchall():
                        db_sessions.append(json.loads(s_row[0]))
                    d['tmux_sessions'] = db_sessions
                except sqlite3.OperationalError:
                    # presumably the per-session table does not exist yet; keep
                    # whatever tmux_sessions the state row carried
                    pass
            finally:
                # close even when a query or JSON decode fails
                conn.close()
        elif os.path.exists(yaml_path):
            with open(yaml_path) as f:
                d = yaml.safe_load(f) or {}
    except Exception as exc:
        # Best effort: continue with whatever was loaded, but say so on stderr
        # (stdout carries the JSON report).
        print(f"reconcile: could not fully load state ({exc})", file=sys.stderr)
drifts = []
actions = []
# === current tmux state — tell a transient failure apart from 'no sessions' (P1-E) ===
tmux_sessions = []
tmux_confirmed = True
# Collect every distinct tmux_server registered in the YAML, plus the
# TMUX_SERVER_NAME environment variable when it is set to a non-empty value.
unique_servers = {'default'}
if os.environ.get('TMUX_SERVER_NAME'):
    unique_servers.add(os.environ['TMUX_SERVER_NAME'])
for s in d.get('tmux_sessions', []):
    unique_servers.add(s.get('tmux_server') or 'default')
try:
    for srv in sorted(unique_servers):
        cmd = ['tmux']
        if srv != 'default':
            cmd += ['-L', srv]
        cmd += ['ls', '-F', '#{session_name}|#{session_created}']
        r = subprocess.run(cmd, capture_output=True, text=True)
        if r.returncode == 0:
            for line in r.stdout.strip().split('\n'):
                if not line:
                    continue
                # The creation epoch is the last field; split from the right so a
                # session name that itself contains '|' stays intact.
                name, created = line.rsplit('|', 1)
                tmux_sessions.append({'name': name, 'created': int(created), 'server': srv})
        else:
            err = (r.stderr or '').lower()
            # tmux reports a missing socket file as
            # "error connecting to <path> (No such file or directory)"; that means
            # no server exists, same as "no server running".
            socket_missing = ('error connecting to' in err) and ('no such file or directory' in err)
            is_empty = (('no server running' in err) or ('no sessions' in err)
                        or ('failed to connect' in err) or socket_missing)
            if not is_empty:
                tmux_confirmed = False
except Exception:
    tmux_confirmed = False
def pane_meta(session, srv):
    """Return {'pid', 'cwd', 'cmd'} for the first pane of a tmux session.

    ``srv`` selects the tmux server: 'default' uses the default socket, any
    other value is passed as ``-L <srv>``. Returns None when tmux fails or its
    output cannot be parsed.
    """
    try:
        cmd = ['tmux']
        if srv != 'default':
            cmd += ['-L', srv]
        cmd += ['list-panes', '-t', session, '-F',
                '#{pane_pid}|#{pane_current_path}|#{pane_current_command}']
        out = subprocess.check_output(cmd, text=True)
        first = out.strip().split('\n')[0]
        # pid is the first field and the command the last; everything between is
        # the path, which may itself contain '|'.
        pid, rest = first.split('|', 1)
        cwd, command = rest.rsplit('|', 1)
        return {'pid': int(pid), 'cwd': cwd, 'cmd': command}
    except Exception:
        return None
yaml_sessions = d.get('tmux_sessions', [])
yaml_session_names = {row['name'] for row in yaml_sessions if row.get('name')}
alive_set = {(live['name'], live.get('server', 'default')) for live in tmux_sessions}
# === drift A: tmux dead + YAML running → auto-terminate ===
# Only when the tmux answer was conclusive. After a transient failure nothing is
# marked terminated (P1-E).
if tmux_confirmed:
    for s in yaml_sessions:
        name = s.get('name')
        if not name:
            continue
        # 'stopped' is a deliberate end state too — it is not drift and is left
        # alone. (Otherwise a stopped session whose tmux is gone would be
        # overwritten with 'terminated' and lose its resumable flag.)
        if s.get('status') in ('terminated', 'archived', 'stopped'):
            continue
        srv = s.get('tmux_server') or 'default'
        if (name, srv) in alive_set:
            continue
        s['status'] = 'terminated'
        s['terminated_at'] = now_iso
        s['terminated_at_epoch'] = int(datetime.now(timezone.utc).timestamp())
        s['termination_mode'] = 'auto-detected (tmux gone)'
        pane = s.get('pane') or {}
        drifts.append({
            'class': 'A',
            'name': name,
            'msg': f"{name}: tmux gone (was pane {pane.get('pid')}, cmd {pane.get('cmd')}). Marked terminated.",
        })
        actions.append(f"terminated: {name}")
# === drift B: tmux alive + not in YAML → auto-register ===
if tmux_confirmed:
for t in tmux_sessions:
name = t['name']
if name in yaml_session_names:
continue
if not (name.endswith('-creator-claude') or name.endswith('-creator-agy')):
continue
srv = t.get('server', 'default')
pm = pane_meta(name, srv)
if not pm:
continue
agent = 'claude' if name.endswith('-creator-claude') else 'agy'
cmd_full = 'claude --dangerously-skip-permissions' if agent == 'claude' else 'agy --dangerously-skip-permissions'
server_opt = f"-L {srv} " if srv != 'default' else ""
entry = {
'name': name,
'status': 'running',
'tmux_session_created_at': datetime.fromtimestamp(t['created'], tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
'tmux_session_epoch': t['created'],
'tmux_server': srv,
'pane': {'index': 0, 'pid': pm['pid'], 'cmd': agent, 'cmd_full': cmd_full, 'cwd': pm['cwd']},
# P2: cwd 인용
'start_command': f'tmux {server_opt}new-session -d -s "{name}" -x 140 -y 40 -c "{pm["cwd"]}" "{cmd_full}"',
'attach_command': f'tmux {server_opt}attach -t {name}',
'kill_command': f'tmux {server_opt}kill-session -t {name}',
'last_visible_status': 'running',
'last_visible_note': 'auto-registered by monitor',
}
if agent == 'claude':
entry['tui'] = {'model': '(unknown — capture after first message)', 'provider': 'anthropic',
'plan': '(unknown)', 'account': '(unknown)', 'version': '(unknown)'}
entry['claude_session_id_own'] = None
else:
entry['child_pid'] = 0
entry['agy_conversation_id_own'] = None
entry['mcp_attachments'] = [
{
'name': 'stitch',
'transport': 'mcp-remote',
'endpoint': 'https://stitch.googleapis.com/mcp'
}
]
d.setdefault('tmux_sessions', []).append(entry)
yaml_session_names.add(name)
drifts.append({'class': 'B', 'name': name,
'msg': f"{name}: tmux found but not in YAML. Auto-registered (pane {pm['pid']}, cmd {pm['cmd']}, cwd {pm['cwd']})."})
actions.append(f"registered: {name}")
# === drift C: materialize a new claude session id (per-row own id) ===
# For every running *-creator-claude row without an own id: look at the newest
# *.jsonl in the workspace's Claude project dir and, if it was modified within the
# last 300 seconds, take the sessionId from its first line.
import re
for s in d.get('tmux_sessions', []):
    # ``name`` may be present but null — treat that like a missing name.
    if not (s.get('name') or '').endswith('-creator-claude'):
        continue
    if s.get('status') != 'running':
        continue
    if s.get('claude_session_id_own'):
        continue
    cwd = (s.get('pane') or {}).get('cwd', '')
    if not cwd:
        continue
    # The original key only maps '/' and '_' to '-'. Claude Code appears to map
    # every non-alphanumeric character (e.g. '.') to '-'
    # — NOTE(review): confirm against the installed version. The original key is
    # tried first so existing behaviour is unchanged when its directory exists.
    legacy_key = cwd.replace('/', '-').replace('_', '-')
    sanitized_key = re.sub(r'[^A-Za-z0-9]', '-', cwd)
    proj_dir = None
    for proj_key in dict.fromkeys((legacy_key, sanitized_key)):
        candidate = f"{claude_project_dir}/{proj_key}"
        if os.path.isdir(candidate):
            proj_dir = candidate
            break
    if proj_dir is None:
        continue
    jsonls = sorted(glob.glob(f"{proj_dir}/*.jsonl"), key=os.path.getmtime, reverse=True)
    if not jsonls:
        continue
    latest = jsonls[0]
    if time.time() - os.path.getmtime(latest) > 300:
        continue
    try:
        with open(latest) as f:
            first = f.readline().strip()
        if not first:
            continue
        sid = json.loads(first).get('sessionId')
        if not sid:
            continue
    except Exception:
        continue
    s['claude_session_id_own'] = sid
    drifts.append({'class': 'C', 'name': s['name'], 'msg': f"{s['name']}: session id materialized: {sid}"})
    actions.append(f"updated session id: {sid}")
# === drift C (agy): materialize a new agy conversation id (per-row own id) ===
# For every running *-creator-agy row without an own id: look the pane cwd up in
# agy's last_conversations.json and accept the id only if its conversation .db
# exists on disk.
for s in d.get('tmux_sessions', []):
    # ``name`` may be present but null — treat that like a missing name.
    if not (s.get('name') or '').endswith('-creator-agy'):
        continue
    if s.get('status') != 'running':
        continue
    if s.get('agy_conversation_id_own'):
        continue
    cwd = (s.get('pane') or {}).get('cwd', '')
    if not cwd:
        continue
    lc = f"{home}/.gemini/antigravity-cli/cache/last_conversations.json"
    if not os.path.exists(lc):
        continue
    try:
        with open(lc) as f:
            lc_data = json.load(f)
        cid = lc_data.get(cwd)
        if cid and os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{cid}.db"):
            s['agy_conversation_id_own'] = cid
            drifts.append({'class': 'C', 'name': s['name'], 'msg': f"{s['name']}: conversation id materialized: {cid}"})
            actions.append(f"updated conversation id: {cid}")
    except Exception:
        # unreadable or unexpected cache content: leave the row unchanged
        pass
# === drift D: stale UUID (the cached id's artifact is gone) — report only, no change ===
ai = d.get('agent_identities', {}) or {}

cl = (ai.get('claude') or {})
sid = cl.get('session_id')
if sid and not glob.glob(f"{claude_project_dir}/*/{sid}.jsonl"):
    drifts.append({
        'class': 'D',
        'name': '(claude identity cache)',
        'msg': f"stale UUID in agent_identities.claude.session_id: {sid} (jsonl missing)",
    })

ag = (ai.get('agy') or {})
cid = ag.get('conversation_id')
if cid and not os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{cid}.db"):
    drifts.append({
        'class': 'D',
        'name': '(agy identity cache)',
        'msg': f"stale UUID in agent_identities.agy.conversation_id: {cid} (.db missing)",
    })
# Report printed as JSON on stdout: the alive tmux sessions as "name|server"
# strings, whether the tmux query was conclusive, the drifts found and the
# actions applied to ``d``.
result = {
    'timestamp': now_iso,
    'yaml_path': yaml_path,
    'tmux_sessions_alive': sorted(f"{t['name']}|{t.get('server', 'default')}" for t in tmux_sessions),
    'tmux_confirmed': tmux_confirmed,
    'drifts': drifts,
    'actions': actions,
}
print(json.dumps(result, indent=2, ensure_ascii=False))
# Atomic wrapper: skip the write when there are no actions. Harmless under
# env_python (dry-run).
if not actions:
    raise SystemExit(0)
PYEOF
if [ "$DRY_RUN" = "1" ]; then
printf '%s' "$RECON_SRC" | env_python "$AGENT_SESSIONS_YAML"
else
printf '%s' "$RECON_SRC" | atomic_dump_yaml "$AGENT_SESSIONS_YAML"
fi
@@ -0,0 +1,151 @@
---
name: multi-agent-mux-resume
description: "Resume an existing agent (claude, antigravity/agy) conversation by UUID into a tmux session. Reads .mam/agent-sessions.yaml for the saved session/conversation id, spawns (or reuses) a tmux session of the matching name, and runs `claude -r <id>` or `agy --conversation <id>` inside. Use when you want to reattach to a previous session's context, or revive a session whose tmux died but the agent's conversation is still on disk."
version: 1.0.0
author: godopu
license: MIT
platforms: [linux, macos]
environments: [terminal, tmux]
metadata:
hermes:
tags: [agent, tmux, claude, antigravity, agy, multi-agent, context, resume, session-id]
related_skills: [multi-agent-mux-create, multi-agent-mux-stop, multi-agent-mux-monitor, claude-code]
prereq_skills: [multi-agent-mux-create]
---
# Multi-Agent Resume — Reattach to a Saved Conversation
> **Companion skills**: `multi-agent-mux-create` (start a fresh agent), `multi-agent-mux-stop` (terminate), `multi-agent-mux-monitor` (live status).
> **Tmux Isolation**: `TMUX_SERVER_NAME` env var를 create에서 설정한 경우, 동일 서버에서 동작합니다. 자세한 격리 패턴은 [multi-agent-mux-create/SKILL.md](../multi-agent-mux-create/SKILL.md) 참조.
> **Single source of truth**: `./.mam/agent-sessions.yaml`.
## What this skill does
**Container + data reconstruction**: spawn a tmux session (the container), then run the agent inside with a specific session id (the data) so the previous conversation's context is restored.
Three cases this skill handles:
1. **tmux is dead, conversation lives** — `agent-sessions.yaml` has the UUID. The JSONL/db is on disk. Re-spawn the tmux session + run `claude -r <id>` / `agy --conversation <id>`.
2. **tmux is alive but empty** — You started a session with `multi-agent-mux-create` but haven't sent a message yet (so no session id was assigned). The user can either send their first message (and the id is auto-assigned), or you can read the *workspace's* most recent conversation from `$HOME_DIR/.gemini/antigravity-cli/cache/last_conversations.json` (defaults to `~/.gemini/...`) for agy, or the latest `*.jsonl` in `$CLAUDE_PROJECT_DIR/<workspace-key>/` (defaults to `~/.claude/projects/`) for claude.
3. **tmux is alive AND the agent inside is already running** — Just attach. No re-spawn needed.
### Resuming a `stopped` session (`stopped → running`)
When a session was ended via `multi-agent-mux-stop` (which captures the ID and gracefully stops by default),
its row is `status: stopped` with `resumable: true` and the conversation id
already recorded in `claude_session_id_own` / `agy_conversation_id_own`. This is the
ideal resume path:
- **tier-1, race-free**: because the stop command wrote the id into the row at stop
time, `resolve_session_id.sh` resolves it via `find_workspace_uuid` tier-1 (the
per-row own id) — no reliance on the mtime-based disk scan, so a concurrent
session in another workspace can never shadow it.
- On resume, `update_yaml_resumed.sh` transitions `stopped → running` and **clears
the stop metadata** (`stopped_at`, `stopped_at_epoch`, `stop_reason`, `resumable`)
along with the usual `terminated_at*` / `termination_mode` / `archived_at`, so the
row reflects a clean running state with no stale end-of-session fields.
## UUID resolution order
`agent-sessions.yaml` is the *primary* source. The skill reads in this order:
1. **`agent-sessions.yaml` → `agent_identities.<agent>.session_id` (claude) / `conversation_id` (agy)** — explicit saved value
2. **`agent-sessions.yaml` → `agent_identities.<agent>.session_jsonl` (claude) / `conversation_db` (agy)** — the on-disk artifact
3. **Fallback: scan disk for the workspace's most recent conversation** (Note: `CLAUDE_PROJECT_DIR` overrides the default `~/.claude/projects/` path, and `HOME_DIR` overrides the `~` path) —
- claude: `ls -t $CLAUDE_PROJECT_DIR/<workspace-key>/*.jsonl | head -1` and parse the `sessionId` from the first line
- agy: `jq -r '."<workspace>"' $HOME_DIR/.gemini/antigravity-cli/cache/last_conversations.json`
If all three are empty → the workspace has no conversation yet. Fall back to `multi-agent-mux-create`.
## Workflow
```bash
WORKSPACE=/path/to/project
AGENT=claude # or agy or hermes
SESSION_NAME=<workspace>-creator-<agent> # same convention as multi-agent-mux-create
# 1. Resolve the session id
UUID=$(bash .agents/skills/multi-agent-mux-resume/scripts/resolve_session_id.sh \
--workspace "$WORKSPACE" --agent "$AGENT")
if [ -z "$UUID" ]; then
echo "No saved session for $WORKSPACE ($AGENT). Use multi-agent-mux-create first."
exit 1
fi
# Resolve the isolated tmux server name
source .agents/skills/lib.sh
export TMUX_SERVER_NAME="$(resolve_tmux_server "$SESSION_NAME")"
# 2. If tmux is alive, attach. Done.
if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
echo "tmux '$SESSION_NAME' already running. Attaching..."
exec tmux attach -t "$SESSION_NAME"
fi
# 3. Spawn new tmux session + run agent with the saved id
case "$AGENT" in
claude)
tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" \
"claude --dangerously-skip-permissions -r $UUID"
# auto-handle trust / bypass dialogs
sleep 5
tmux send-keys -t "$SESSION_NAME" Enter 2>/dev/null || true
sleep 3
tmux send-keys -t "$SESSION_NAME" Down 2>/dev/null || true
sleep 0.3
tmux send-keys -t "$SESSION_NAME" Enter 2>/dev/null || true
;;
agy)
tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" \
"agy --dangerously-skip-permissions --conversation $UUID"
;;
hermes)
tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" \
"hermes --resume $UUID"
;;
esac
# 4. Update agent-sessions.yaml: status running, last_visible_status
# (Also automatically publishes a `progress --detail "resumed"` event to the multi-agent-mux-delegate-job registry if a delegate_job_id exists)
bash .agents/skills/multi-agent-mux-resume/scripts/update_yaml_resumed.sh \
--session "$SESSION_NAME" --uuid "$UUID"
# 5. Attach
tmux attach -t "$SESSION_NAME"
```
## Pitfalls
- **`claude -r` requires the SAME project directory** — if the workspace path differs from when the session was created, claude will create a new project dir key (`-home-...-different-name`) and put the resume in a different location. Always `-c` (cd to workspace) before running.
- **agy's `--conversation` flag name varies by version** — older versions used `--resume` or `-r`. Check `agy --help | grep -E "conversation|resume"` and use the right flag. v1.0.x: `--conversation`.
- **The first message after resume might re-trigger TUI dialogs** — if the original session was created with `--dangerously-skip-permissions`, those flags are NOT persisted; you must re-apply them on resume. The script above re-passes them.
- **Don't resume if the session is brand new and empty** — `multi-agent-mux-create` already set up an empty container; sending a probe message ("init") is the right way to materialize a session id, NOT `claude -r` with a placeholder.
- **`agy --conversation <id>` will fail if the conversation was deleted from disk** — check `~/.gemini/antigravity-cli/conversations/<uuid>.db` exists before attempting resume. If missing, the conversation is gone; you need a fresh session via `multi-agent-mux-create`.
## Verification
```bash
# 1. tmux alive with the right cmd
tmux list-panes -t "$SESSION_NAME" -F 'cmd=#{pane_current_command} cwd=#{pane_current_path}'
# 2. agent-sessions.yaml updated
python3 -c "
import yaml
d = yaml.safe_load(open('.mam/agent-sessions.yaml'))
s = [s for s in d['tmux_sessions'] if s['name'] == '$SESSION_NAME'][0]
print(f' status: {s[\"status\"]}')
print(f' pane.cmd_full: {s[\"pane\"][\"cmd_full\"]}')
"
# 3. TUI shows resumed conversation (capture-pane to verify)
sleep 5
tmux capture-pane -t "$SESSION_NAME" -p -S -30
# look for the previous message at top of the buffer (claude) or last_visible_status set (agy)
```
## When NOT to use this skill
- **No saved session yet** → `multi-agent-mux-create`
- **Killing an existing session** → `multi-agent-mux-stop`
- **Just attaching** → `tmux attach -t <name>` (no skill needed)
@@ -0,0 +1,40 @@
#!/usr/bin/env bash
# resolve_session_id.sh — helper script for multi-agent-mux-resume
# Usage:
#   bash resolve_session_id.sh --workspace <path> --agent <claude|agy|hermes>
# Output: the UUID as one line on stdout (an empty line + exit 0 when none is found)
#
# P0-C: the global agent_identities is never returned straight away.
# lib.sh::find_workspace_uuid only uses the workspace-isolated resolution path
# (per-row own id -> disk scan -> cache whose cwd matches). It never returns
# another workspace's UUID.
set -euo pipefail
source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"

usage() {
  cat <<EOF
Usage: $0 --workspace <path> --agent <claude|agy|hermes>
Outputs the resolved UUID on stdout (empty if not found).
EOF
}

WORKSPACE=""
AGENT=""
while [ $# -gt 0 ]; do
  case "$1" in
    --workspace|--agent)
      # Without this check a trailing flag dies with bash's "unbound variable"
      # message (set -u) instead of a usable error.
      [ $# -ge 2 ] || { echo "ERROR: $1 requires a value" >&2; exit 2; }
      if [ "$1" = "--workspace" ]; then
        WORKSPACE="$2"
      else
        AGENT="$2"
      fi
      shift 2
      ;;
    -h|--help) usage; exit 0 ;;
    *) echo "ERROR: unknown arg: $1" >&2; exit 2 ;;
  esac
done

[ -n "$WORKSPACE" ] || { echo "ERROR: --workspace required" >&2; exit 2; }
[ -n "$AGENT" ] || { echo "ERROR: --agent required" >&2; exit 2; }
case "$AGENT" in
  claude|agy|hermes) ;;
  *) echo "ERROR: --agent must be claude or agy or hermes" >&2; exit 2 ;;
esac

find_workspace_uuid "$WORKSPACE" "$AGENT"
@@ -0,0 +1,156 @@
#!/usr/bin/env bash
# update_yaml_resumed.sh — helper script for multi-agent-mux-resume
# Updates the agent-sessions.yaml entry of a resumed session to status=running
# plus resume metadata. The resume UUID is stored in the per-row own id
# (claude_session_id_own / agy_conversation_id_own / hermes_conversation_id_own) —
# the global agent_identities is no longer primary (demoted to a cache,
# P0-C / step e).
#
# Usage: bash update_yaml_resumed.sh --session <name> --uuid <id> [--agent claude|agy|hermes]
set -euo pipefail
source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"

usage() {
  cat <<EOF
Usage: $0 --session <name> --uuid <id> [--agent claude|agy|hermes]
EOF
}

SESSION_NAME=""
UUID=""
AGENT=""
while [ $# -gt 0 ]; do
  case "$1" in
    --session|--uuid|--agent)
      # Without this check a trailing flag dies with bash's "unbound variable"
      # message (set -u) instead of a usable error.
      [ $# -ge 2 ] || { echo "ERROR: $1 requires a value" >&2; exit 2; }
      case "$1" in
        --session) SESSION_NAME="$2" ;;
        --uuid) UUID="$2" ;;
        --agent) AGENT="$2" ;;
      esac
      shift 2
      ;;
    -h|--help) usage; exit 0 ;;
    *) echo "ERROR: unknown arg: $1" >&2; exit 2 ;;
  esac
done

[ -n "$SESSION_NAME" ] || { echo "ERROR: --session required" >&2; exit 2; }
[ -n "$UUID" ] || { echo "ERROR: --uuid required" >&2; exit 2; }
[ -f "$AGENT_SESSIONS_YAML" ] || { echo "ERROR: $AGENT_SESSIONS_YAML not found" >&2; exit 1; }

export TMUX_SERVER_NAME="$(resolve_tmux_server "$SESSION_NAME")"

# When --agent is not given, fall back to the session-name suffix
# (P1-F: pass --agent explicitly whenever possible).
if [ -z "$AGENT" ]; then
  case "$SESSION_NAME" in
    *-creator-claude) AGENT=claude ;;
    *-creator-agy) AGENT=agy ;;
    *-creator-hermes) AGENT=hermes ;;
    *) echo "ERROR: cannot infer agent from '$SESSION_NAME'; pass --agent" >&2; exit 2 ;;
  esac
fi
# Reject an unknown agent up front: the mutation below only knows these three and
# would otherwise mark the row running without recording any resume command/id.
case "$AGENT" in
  claude|agy|hermes) ;;
  *) echo "ERROR: --agent must be claude or agy or hermes" >&2; exit 2 ;;
esac

NOW_ISO=$(date -u +'%Y-%m-%dT%H:%M:%SZ')

# Capture the new tmux pane pid / child pid in bash (handed over via env, P1-B).
PANE_PID=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}' 2>/dev/null | head -1 || true)
PANE_PID="${PANE_PID:-}"
CHILD_PID=0
if { [ "$AGENT" = "agy" ] || [ "$AGENT" = "hermes" ]; } && [ -n "$PANE_PID" ]; then
  CHILD_PID=$(pgrep -P "$PANE_PID" -x "$AGENT" 2>/dev/null | head -1 || true)
  CHILD_PID="${CHILD_PID:-0}"
fi

# Read-only lookup of the row's delegate_job_id (empty when the row has none).
# The per-session SQLite table is tried first, then the state blob, then the YAML.
DELEGATE_JOB_ID=$(env_python "$AGENT_SESSIONS_YAML" SESSION_NAME="$SESSION_NAME" <<'PYEOF'
import os, sys, sqlite3, json, yaml
name = os.environ['SESSION_NAME']
yaml_path = os.environ['YAML_PATH']
db_path = os.path.splitext(yaml_path)[0] + '.db'
d = {}
try:
    if os.path.exists(db_path):
        conn = sqlite3.connect(db_path, timeout=10.0)
        try:
            row = conn.execute('SELECT data FROM sessions WHERE name=?', (name,)).fetchone()
            if row:
                s = json.loads(row[0])
                print(s.get('delegate_job_id', '') or '')
                raise SystemExit(0)
        except sqlite3.OperationalError:
            pass
        row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
        if row:
            d = json.loads(row[0])
        conn.close()
    elif os.path.exists(yaml_path):
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
except Exception:
    pass
for s in d.get('tmux_sessions', []):
    if s.get('name') == name:
        print(s.get('delegate_job_id', '') or '')
        raise SystemExit(0)
raise SystemExit(0)
PYEOF
)

atomic_dump_yaml "$AGENT_SESSIONS_YAML" \
  SESSION_NAME="$SESSION_NAME" UUID="$UUID" AGENT="$AGENT" NOW_ISO="$NOW_ISO" \
  PANE_PID="$PANE_PID" CHILD_PID="$CHILD_PID" <<'PYEOF'
name = os.environ['SESSION_NAME']
uuid = os.environ['UUID']
agent = os.environ['AGENT']
now = os.environ['NOW_ISO']
pane_pid = os.environ.get('PANE_PID', '')
target = None
for s in d.get('tmux_sessions', []):
    if s.get('name') == name:
        target = s
        break
if target is None:
    print(f"ERROR: session not in YAML: {name}", flush=True)
    raise SystemExit(1)
target['status'] = 'running'
target.pop('terminated_at', None)
target.pop('terminated_at_epoch', None)
target.pop('termination_mode', None)
target.pop('archived_at', None)
# Clear the stop metadata as well — once resumed the row is no longer stopped,
# so leftover fields are removed.
target.pop('stopped_at', None)
target.pop('stopped_at_epoch', None)
target.pop('stop_reason', None)
target.pop('resumable', None)
target['last_visible_status'] = f'resumed conversation {uuid} at {now}'
# setdefault alone would keep an existing ``pane: null`` and crash on the item
# assignments below.
if not isinstance(target.get('pane'), dict):
    target['pane'] = {}
if pane_pid.isdigit():
    target['pane']['pid'] = int(pane_pid)
if agent == 'claude':
    target['pane']['cmd'] = 'claude'
    target['pane']['cmd_full'] = f'claude --dangerously-skip-permissions -r {uuid}'
    target['claude_session_id_own'] = uuid
elif agent == 'agy':
    target['pane']['cmd'] = 'agy'
    target['pane']['cmd_full'] = f'agy --dangerously-skip-permissions --conversation {uuid}'
    target['agy_conversation_id_own'] = uuid
    cp = os.environ.get('CHILD_PID', '0')
    if cp.isdigit() and int(cp) > 0:
        target['child_pid'] = int(cp)
elif agent == 'hermes':
    target['pane']['cmd'] = 'hermes'
    target['pane']['cmd_full'] = f'hermes --resume {uuid}'
    target['hermes_conversation_id_own'] = uuid
    cp = os.environ.get('CHILD_PID', '0')
    if cp.isdigit() and int(cp) > 0:
        target['child_pid'] = int(cp)
# Same null-safety for the snapshot mapping.
if not isinstance(d.get('snapshot'), dict):
    d['snapshot'] = {}
snap = d['snapshot']
snap['taken_at'] = now
snap.pop('terminated_at', None)
snap.pop('terminated_at_epoch', None)
print(f"updated: {name} status=running (resume id -> per-row own id)", flush=True)
PYEOF

delegate_publish_event "$DELEGATE_JOB_ID" progress "resumed"
@@ -0,0 +1,124 @@
---
name: multi-agent-mux-status
description: "Read-only instant snapshot of all agent tmux sessions — name, YAML status, tmux alive, pane cmd/cwd, resume UUID on disk, and any drift. No Kanban, no mutation. Reuses reconcile.sh --dry-run for the diff logic. Use when you want to know 'what's running RIGHT NOW' without spinning up a Kanban monitor worker."
version: 1.0.0
author: godopu
license: MIT
platforms: [linux, macos]
environments: [terminal, tmux]
metadata:
hermes:
tags: [agent, tmux, claude, antigravity, agy, status, read-only, snapshot]
related_skills: [multi-agent-mux-create, multi-agent-mux-resume, multi-agent-mux-stop, multi-agent-mux-monitor]
prereq_skills: [multi-agent-mux-create, multi-agent-mux-monitor]
---
# Multi-Agent Status — Read-Only Instant Snapshot
> **Companion skills**: `multi-agent-mux-create` (start), `multi-agent-mux-resume` (re-attach), `multi-agent-mux-stop` (terminate), `multi-agent-mux-monitor` (live polling).
> **Tmux Isolation**: `status` 명령은 YAML에 등록된 모든 세션의 격리 서버(`tmux_server` 필드)를 자동으로 조회하여 상태를 확인하므로, `TMUX_SERVER_NAME` 환경변수를 수동으로 지정하지 않아도 모든 격리 서버의 세션 상태를 통합 조회합니다.
> **Single source of truth**: `./.mam/agent-sessions.yaml`.
## What this skill does
Print a single table of every agent tmux session, comparing YAML state to actual tmux state. **No mutation. No Kanban. No polling loop.**
This is the "what's running right now?" answer — faster than dispatching `multi-agent-mux-monitor` (which polls every 30s) and safer than `reconcile.sh --once --emit-diff` (which mutates as a side effect).
## Pre-flight
```bash
command -v tmux
command -v python3
test -f .mam/agent-sessions.yaml
```
If `agent-sessions.yaml` doesn't exist or is malformed → print clear error, exit 1. **Do not create it.** (Use `multi-agent-mux-create` first.)
## Workflow
```bash
bash .agents/skills/multi-agent-mux-status/scripts/status.sh [--json]
```
The script:
1. Calls `reconcile.sh --once --emit-diff --dry-run` (read-only; no YAML mutation) for the drift snapshot
2. Loads `agent-sessions.yaml` (read-only) to enrich the table
3. For each row in `tmux_sessions[]`:
- tmux alive? (via `tmux has-session -t <name>`)
- pane cmd, cwd (via `tmux list-panes`)
- resume UUID on disk? (claude: `$CLAUDE_PROJECT_DIR/<key>/<uuid>.jsonl` with default `~/.claude/projects/`; agy: `$HOME_DIR/.gemini/antigravity-cli/conversations/<uuid>.db` with default `~/.gemini/...`)
4. For each tmux session matching `*-creator-*` not in YAML → flag as "unregistered"
5. Prints a table (default) or JSON (with `--json`)
## Output format (default = aligned table)
```
agent-sessions status — 2026-06-19T14:20:00Z (tmux_confirmed=True)
========================================================================================================================================
NAME SERVER YAML TMUX CMD RESUME JOB_ID JOB_STATUS DRIFT
----------------------------------------------------------------------------------------------------------------------------------------
lab-landing-page-creator-claude default running alive claude yes - - -
lab-landing-page-creator-agy default terminated dead agy yes 5fe09ba8 completed -
lab-paper-pdf2md-creator-claude default running alive claude scan - - -
========================================================================================================================================
```
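## Output format (`--json`)
> **Note**: with `--json`, `status.sh` prints the raw `reconcile.sh --once --emit-diff --dry-run` result, whose top-level keys are `timestamp`, `yaml_path`, `tmux_sessions_alive` (entries formatted `name|server`), `tmux_confirmed`, `drifts` and `actions`. The enriched shape shown below (`yaml_entries`, `rows`, `unregistered`) is not what that code path emits.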
```json
{
"yaml_path": "...",
"tmux_sessions_alive": ["..."],
"yaml_entries": [...],
"rows": [
{
"name": "lab-landing-page-creator-claude",
"yaml_status": "running",
"tmux_alive": true,
"pane_cmd": "claude",
"pane_cwd": "/home/.../refer_landing_page",
"resume_uuid_on_disk": true,
"drift": null
},
{
"name": "lab-landing-page-creator-agy",
"yaml_status": "terminated",
"tmux_alive": false,
"drift": "yaml-says-terminated-but-disk-uuid-still-present"
}
],
"unregistered": [],
"drifts": []
}
```
## Drift classes (read-only — never mutates)
| Class | Detection | Meaning |
|---|---|---|
| `A` | YAML `running`, tmux dead | session died without going through `multi-agent-mux-stop`. *Could* auto-terminate but won't — that's `multi-agent-mux-monitor`'s job. |
| `B` | tmux alive, not in YAML | ad-hoc session someone started without `multi-agent-mux-create`. Suggest: "use multi-agent-mux-create to register, or tmux kill-session to clean up." |
| `C` | YAML has `claude_session_id_own: null` AND a new *.jsonl exists | new session id materialized; suggest: "run multi-agent-mux-resume or reconcile to register it." |
| `D` | YAML has UUID in `agent_identities`, but the on-disk artifact is gone | stale UUID; user should `multi-agent-mux-stop --purge-conversation` to clean up. |
## Pitfalls
- **Do NOT use this skill to drive mutations** — the output is a snapshot, not a call to action. If you need to fix drifts, dispatch `multi-agent-mux-monitor` (Kanban worker) or run `multi-agent-mux-resume` / `multi-agent-mux-stop` manually.
- **Read-only is enforced by script** — `status.sh` opens the YAML with `open(path)` (no `'w'`), never calls `tmux kill-session`, never writes anywhere. The `reconcile.sh --dry-run` mode is the same path.
- **If `agent-sessions.yaml` is malformed** — print the YAML error verbatim and exit 1. Do NOT attempt recovery (that's `multi-agent-mux-stop --purge-conversation` or manual edit's job).
- **Sessions outside the `<workspace>-creator-*` naming convention** are still shown but tagged `ad-hoc` — they didn't go through `multi-agent-mux-create` and aren't tracked in YAML.
## When to use
- "Is the claude session still running?" → this skill, not the monitor
- "What UUID does this workspace have?" → this skill
- "Is there drift between YAML and reality?" → this skill, then dispatch monitor or fix manually
- Quick sanity check before dispatching a long Kanban task
## When NOT to use
- Continuous live tracking → `multi-agent-mux-monitor` (Kanban worker)
- Recovering from corruption → manual edit + `.bak` restore
- Polling more than once a minute → `multi-agent-mux-monitor` (it dedupes)
+140
View File
@@ -0,0 +1,140 @@
#!/usr/bin/env bash
# status.sh — helper script for the multi-agent-mux-status skill (READ-ONLY).
# Prints the current agent-session status table in a single invocation and
# has no side effects. Drift is computed by reusing `reconcile.sh --dry-run`
# (P1-E); the table is then enriched from the YAML / on-disk state.
# This script never modifies the YAML.
#
# Usage: bash status.sh [--json]
set -euo pipefail

# Skills root (two levels above this script): hosts lib.sh and the sibling skills.
_status_skills_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
source "${_status_skills_root}/lib.sh"
RECONCILE="${_status_skills_root}/multi-agent-mux-monitor/scripts/reconcile.sh"

# Only the first positional argument is inspected; anything but --json is ignored.
JSON=0
case "${1:-}" in
  --json) JSON=1 ;;
esac

if [[ ! -f "$AGENT_SESSIONS_YAML" ]]; then
  echo "ERROR: $AGENT_SESSIONS_YAML not found. Run multi-agent-mux-create first." >&2
  exit 1
fi

# Read-only drift snapshot — reconcile.sh is invoked with --dry-run (no side effects).
DRIFT_JSON="$(bash "$RECONCILE" --once --emit-diff --dry-run)"

# --json: hand the raw drift document to the caller and stop here.
if [[ "$JSON" == "1" ]]; then
  printf '%s\n' "$DRIFT_JSON"
  exit 0
fi

# Project root (parent of .agents/) holds the multi-agent-mux-delegate-job .mam registry.
# Resolved relative to this script — no hardcoded absolute path (review item 6).
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../" && pwd)"

# The table itself is rendered by the Python program fed on stdin below.
DRIFT_JSON="$DRIFT_JSON" env_python "$AGENT_SESSIONS_YAML" PROJECT_ROOT="$PROJECT_ROOT" <<'PYEOF'
import os, sys, json, glob
import sqlite3
import yaml

# Inputs handed over by the shell wrapper through the environment.
# NOTE(review): YAML_PATH and HOME_DIR are not set in status.sh itself —
# presumably exported by env_python in lib.sh; confirm there.
yaml_path = os.environ['YAML_PATH']
home = os.environ['HOME_DIR']
claude_project_dir = os.environ.get('CLAUDE_PROJECT_DIR', f"{home}/.claude/projects")
drift = json.loads(os.environ['DRIFT_JSON'])
# The SQLite database sits next to the YAML, same basename with a .db suffix.
db_path = os.path.splitext(yaml_path)[0] + '.db'


def _load_state_from_db(path):
    """Return the state dict stored in the SQLite database at `path`.

    The `state` row (id=1) supplies the base document; when a `sessions`
    table exists its rows replace `tmux_sessions`. Only SELECT statements
    are issued. The connection is always closed, also on error.
    """
    conn = sqlite3.connect(path, timeout=10.0)
    try:
        state = {}
        row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
        if row:
            state = json.loads(row[0])
        try:
            session_rows = conn.execute('SELECT data FROM sessions').fetchall()
        except sqlite3.OperationalError:
            # No usable `sessions` table: keep whatever the state row carried.
            pass
        else:
            state['tmux_sessions'] = [json.loads(r[0]) for r in session_rows]
        return state
    finally:
        conn.close()


# Load the registry: the database wins when present, otherwise the YAML.
# A registry that cannot be read is reported and aborts the run instead of
# being rendered as a misleading empty table.
d = {}
try:
    if os.path.exists(db_path):
        d = _load_state_from_db(db_path)
    elif os.path.exists(yaml_path):
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
except Exception as exc:
    print(f"ERROR: cannot read agent session state: {exc}", file=sys.stderr)
    raise SystemExit(1)
if not isinstance(d, dict):
    print(f"ERROR: agent session state is not a mapping (got {type(d).__name__})", file=sys.stderr)
    raise SystemExit(1)

# Live tmux sessions and per-session drift classes, both taken from the drift report.
alive = set(drift.get('tmux_sessions_alive', []))
drift_by_name = {}
for dr in drift.get('drifts', []):
    drift_by_name.setdefault(dr['name'], []).append(dr['class'])
def resume_on_disk(s):
    """Report whether the resume artifact for session row `s` exists on disk.

    Workspace-SCOPED check only — uses the row's own id, never a global
    identity (P0-C).

    Returns one of:
      'yes'     - the row carries an id and its artifact exists
      'MISSING' - the row carries an id but the artifact is gone
      'scan'    - claude row without an id, but *.jsonl files exist for the cwd
      'no'      - no id recorded and nothing found
      '?'       - session name matches neither the claude nor the agy suffix

    Null `name` / `pane.cwd` values are treated as empty strings so that a
    partially-filled row cannot raise.
    """
    name = str(s.get('name') or '')
    cwd = str((s.get('pane') or {}).get('cwd') or '')
    if name.endswith('-creator-claude'):
        # Project key is the cwd with '/' and '_' both mapped to '-'.
        project_dir = f"{claude_project_dir}/{cwd.replace('/', '-').replace('_', '-')}"
        u = s.get('claude_session_id_own')
        if u:
            return 'yes' if os.path.exists(f"{project_dir}/{u}.jsonl") else 'MISSING'
        # Escape the directory part so glob metacharacters in the path are literal.
        return 'scan' if glob.glob(f"{glob.escape(project_dir)}/*.jsonl") else 'no'
    if name.endswith('-creator-agy'):
        u = s.get('agy_conversation_id_own')
        if u:
            return 'yes' if os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{u}.db") else 'MISSING'
        return 'no'
    return '?'
def get_job_status(s):
    """Return `(job_id, status)` for the delegate job attached to session row `s`.

    Both elements are always strings, so the caller can pad them with
    format specs such as `:<12`.

    Returns ('-', '-') when the row has no `delegate_job_id`. Otherwise the
    first readable JSON object among the candidate files decides the status;
    a missing or null `status` key, or no usable file at all, yields 'unknown'.
    """
    jid = s.get('delegate_job_id')
    if not jid:
        return ('-', '-')
    jid = str(jid)
    project_root = os.environ.get('PROJECT_ROOT', '.')
    # Candidate locations (review item 6: project-root-relative, no hardcoded abs paths):
    #   1) cwd-relative registry  2) project-root registry  3) project-root audit log
    candidates = [
        os.path.join('.mam', 'jobs', f"{jid}.json"),
        os.path.join(project_root, '.mam', 'jobs', f"{jid}.json"),
        os.path.join(project_root, '.mam', 'delegate_job_logs', jid, 'status.json'),
    ]
    for path in candidates:
        if not os.path.exists(path):
            continue
        try:
            with open(path) as jf:
                job_data = json.load(jf)
        except (OSError, ValueError):
            # Unreadable or malformed file: fall through to the next candidate.
            continue
        if not isinstance(job_data, dict):
            # Not a JSON object, so there is no status to read here.
            continue
        status = job_data.get('status')
        return (jid, 'unknown' if status is None else str(status))
    return (jid, 'unknown')
def _cell(value, default='?'):
    """Return `value` as text for a padded table cell; None becomes `default`.

    Format specs such as `:<10` raise TypeError on None, so every cell is
    coerced to str before it reaches the f-string.
    """
    return default if value is None else str(value)


# A null `tmux_sessions` entry is treated the same as an empty list.
sessions = d.get('tmux_sessions') or []
print(f"agent-sessions status — {drift.get('timestamp', '?')} (tmux_confirmed={drift.get('tmux_confirmed', '?')})")
print("=" * 136)
print(f"{'NAME':<44} {'SERVER':<12} {'YAML':<10} {'TMUX':<6} {'CMD':<6} {'RESUME':<8} {'JOB_ID':<10} {'JOB_STATUS':<12} DRIFT")
print("-" * 136)
if not sessions:
    print("(no sessions registered)")
for s in sessions:
    name = _cell(s.get('name'))
    server = _cell(s.get('tmux_server') or 'default')
    status = _cell(s.get('status'))
    # Entries in the alive set are matched as "<session name>|<tmux server>".
    tmux = 'alive' if f"{name}|{server}" in alive else 'dead'
    cmd = _cell((s.get('pane') or {}).get('cmd'))
    res = _cell(resume_on_disk(s))
    jid, jstatus = get_job_status(s)
    jid, jstatus = _cell(jid, '-'), _cell(jstatus, 'unknown')
    drs = ','.join(str(c) for c in drift_by_name.get(name, [])) or '-'
    print(f"{name:<44} {server:<12} {status:<10} {tmux:<6} {cmd:<6} {res:<8} {jid:<10} {jstatus:<12} {drs}")
# Drifts not tied to a registered row (e.g. class B unregistered, class D cache).
known = {s.get('name') for s in sessions}
extra = [dr for dr in drift.get('drifts', []) if dr['name'] not in known]
if extra:
    print("-" * 136)
    for dr in extra:
        print(f" [{dr['class']}] {dr['msg']}")
print("=" * 136)
print(f"alive tmux: {sorted(alive)}")
PYEOF
@@ -0,0 +1,136 @@
---
name: multi-agent-mux-stop
description: "Stop an agent tmux session (claude, antigravity/agy) and update .mam/agent-sessions.yaml. Default stops gracefully and marks status=stopped with conversation preserved for resume. Does NOT delete on-disk conversation artifacts (jsonl/db) — those are preserved unless --purge-conversation is passed. Use when ending a work session, switching to a different one, or cleaning up before a fresh start."
version: 1.0.0
author: godopu
license: MIT
platforms: [linux, macos]
environments: [terminal, tmux]
metadata:
hermes:
tags: [agent, tmux, claude, antigravity, agy, multi-agent, stop, terminate, cleanup]
related_skills: [multi-agent-mux-create, multi-agent-mux-resume, multi-agent-mux-monitor]
prereq_skills: [multi-agent-mux-create, multi-agent-mux-resume]
---
# Multi-Agent Stop — Stop an Agent tmux Session
> **Companion skills**: `multi-agent-mux-create` (start), `multi-agent-mux-resume` (re-attach), `multi-agent-mux-monitor` (live status).
> **Tmux Isolation**: `stop` 명령은 YAML의 `tmux_server` 필드를 자동으로 파싱하여 해당 격리 서버의 세션을 안전하게 종료(kill)하므로, `TMUX_SERVER_NAME` 환경변수를 수동으로 지정할 필요가 없습니다.
> **Single source of truth**: `./.mam/agent-sessions.yaml`.
## What this skill does
Stop an agent's tmux session gracefully, resolve and store the conversation ID, and **mark the YAML entry (status=stopped)**. Preserves:
- The tmux session's recorded `pane.pid / cmd / cwd / mcp_attachments` for audit
- The agent's on-disk conversation (claude `*.jsonl`, agy `conversations/*.db`) — so the user can `multi-agent-mux-resume` later
- The `start_command` so a future `multi-agent-mux-create --session <name>` reproduces the same tmux spec
The stop command is always **graceful by default**:
1. Sends exit keys to the agent TUI (`/exit` for Claude, `Exit` for Agy) and waits 3 seconds.
2. If still alive, issues `tmux kill-session` (SIGTERM) and waits 5 seconds.
3. If still alive, kills the pane PID via SIGKILL (`kill -9`) as a last resort.
4. Before any of the kill steps above run, the script auto-captures the conversation ID into the row (`claude_session_id_own`/`agy_conversation_id_own`), ensuring the next resume uses a race-free tier-1 lookup.
## Pre-flight
```bash
SESSION_NAME=<workspace>-creator-<agent> # convention
AGENT_SESSIONS_YAML=.mam/agent-sessions.yaml
# 1) Session is registered?
python3 -c "
import yaml
d = yaml.safe_load(open('$AGENT_SESSIONS_YAML'))
names = [s['name'] for s in d.get('tmux_sessions', [])]
if '$SESSION_NAME' not in names:
print('NOT in YAML — refusing to stop (no audit trail). Use multi-agent-mux-create first, or pass --force-no-yaml.')
raise SystemExit(1)
"
# 2) Already stopped?
ALREADY=$(python3 -c "
import yaml
d = yaml.safe_load(open('$AGENT_SESSIONS_YAML'))
s = [x for x in d['tmux_sessions'] if x['name']=='$SESSION_NAME'][0]
print(s.get('status', 'unknown'))
")
if [ "$ALREADY" = "stopped" ]; then
echo "Already stopped."
fi
```
## Workflow
```bash
# 1. Stop gracefully (default — captures ID, shuts down safely, status=stopped)
bash .agents/skills/multi-agent-mux-stop/scripts/stop_session.sh \
--session "$SESSION_NAME"
# 2. Stop gracefully + record a custom stop reason
bash .agents/skills/multi-agent-mux-stop/scripts/stop_session.sh \
--session "$SESSION_NAME" --reason api_error
# 3. Stop gracefully + clean up on-disk conversation (DANGEROUS)
# — this prevents any future resume (status=terminated, resumable=false).
# — `--yes` is required: without it the script only prints the warning and exits 3.
bash .agents/skills/multi-agent-mux-stop/scripts/stop_session.sh \
--session "$SESSION_NAME" --purge-conversation --yes
```
**Idempotency**: if the row is already `status: stopped`, the script prints `already stopped (...)` and exits 0 — re-running is a safe no-op.
### State machine
```
running ──(stop default / --reason)────────► stopped (resumable:true, conv preserved)
running ──(stop --purge-conversation --yes)► terminated (resumable:false, conv deleted)
stopped ──(stop default … again)───────────► stopped (idempotent no-op)
```
Fields written in STOP mode: `status: stopped`, `stopped_at`, `stopped_at_epoch`, `stop_reason`, `termination_mode: graceful`, `claude_session_id_own`/`agy_conversation_id_own` and `resumable: true`.
If `--purge-conversation` is used: `status: terminated`, `terminated_at`, `terminated_at_epoch`, `termination_mode: purge` and `resumable: false`.
The script:
1. Verifies the session is in agent-sessions.yaml
2. If `delegate_job_id` is set, automatically publishes a `progress --detail "terminating"` event to the multi-agent-mux-delegate-job registry
3. Captures the `last_visible_status` from `tmux capture-pane` (so we have a final TUI snapshot for audit)
4. Attempts graceful exit keys → SIGTERM kill-session → SIGKILL fallback
5. For `purge-conversation`: deletes `~/.claude/projects/.../jsonl` (claude) or `~/.gemini/antigravity-cli/conversations/...db` + `brain/...` (agy)
6. Updates the YAML entry and SQLite database atomically
7. If `delegate_job_id` is set, publishes a `completed` event to the multi-agent-mux-delegate-job registry
## Pitfalls
- **Don't delete on-disk artifacts by default** — the agent's `*.jsonl` / `conversations/*.db` is the data that `multi-agent-mux-resume` needs. `--purge-conversation` is for when the user is genuinely done with the conversation and wants zero recovery chance.
- **YAML is append-only until you write a stop** — if a previous run left the entry as `running` but tmux is actually dead (crash, host reboot), the YAML is stale. Running `multi-agent-mux-stop` will detect "tmux already dead, just update YAML" and proceed.
- **Don't delete the `claude_session_id_own: null` placeholder** — when the user creates a fresh session with `multi-agent-mux-create` and never sent a message, the entry has `claude_session_id_own: null`. Stopping must preserve that field.
- **Monitor skill may still be tracking** — if `multi-agent-mux-monitor` is running a heartbeat loop, stopping a session while it watches will trigger its `tmux ls != yaml` reconciliation. That's expected — let the monitor run, it will mark the entry as `terminated` on its own.
## Verification
```bash
# 1. tmux gone
tmux has-session -t "$SESSION_NAME" 2>/dev/null && echo "STILL ALIVE" || echo "OK: tmux gone"
# 2. YAML has stopped entry
python3 -c "
import yaml
d = yaml.safe_load(open('$AGENT_SESSIONS_YAML'))
s = [x for x in d['tmux_sessions'] if x['name']=='$SESSION_NAME'][0]
assert s['status'] == 'stopped', f'expected stopped, got {s[\"status\"]}'
assert s.get('stopped_at'), 'missing stopped_at'
print(f'OK: stopped at {s[\"stopped_at\"]}')
print(f' preserved: pane.pid={s[\"pane\"][\"pid\"]}, cmd={s[\"pane\"][\"cmd\"]}, cwd={s[\"pane\"][\"cwd\"]}')
"
# 3. (if --purge-conversation) disk artifacts gone
[ -f "${CLAUDE_PROJECT_DIR:-$HOME/.claude/projects}/<projkey>/<uuid>.jsonl" ] && echo "WARN: jsonl still exists" || echo "OK: jsonl purged"
```
## When NOT to use this skill
- **Just detaching** → `tmux detach` (Ctrl-B d) or just close the terminal. The tmux session keeps running.
- **Stopping the agent inside but keeping tmux** → send `Ctrl-C` or `/exit` (claude) / `Ctrl-D` (agy) via `tmux send-keys`. The tmux session stays but the agent process is gone.
- **Replacing an existing session with a new one** → `multi-agent-mux-stop` first, then `multi-agent-mux-create`.
+341
View File
@@ -0,0 +1,341 @@
#!/usr/bin/env bash
# stop_session.sh — helper script for the multi-agent-mux-stop skill.
#
# Usage:
# bash stop_session.sh --session <name> [--agent <agent>] \
# [--reason <reason>] [--purge-conversation] [--yes]
#
# Options accepted by the parser below:
# --session <name> — session to stop (required).
# --agent <agent> — agent kind; when omitted it is inferred from the
# session-name suffix (-creator-claude / -creator-agy /
# -creator-hermes).
# --reason <reason> — reason for the state transition (stop_reason).
# Defaults to manual_stop.
# --purge-conversation — additionally delete the conversation artifacts that
# are scoped to the target session's workspace (P0-C).
# The global agent_identities entry is not used to pick
# the artifact. The row becomes status=terminated and
# can no longer be resumed.
# --yes — confirms --purge-conversation; without it the script
# prints a warning and exits 3.
#
# Legacy options: --mode, --capture-id and --graceful are rejected with exit 2.
# Their behaviour is now always on (CAPTURE_ID=1, GRACEFUL=1, STOP_MODE=1 below):
# - the workspace's conversation id is resolved just before the kill and stored
# in the row (claude_session_id_own / agy_conversation_id_own) so the next
# resume can restore it via the race-free tier-1 lookup;
# - shutdown is graceful: send-keys with the agent's exit command, wait 3 s,
# then kill-session, wait 5 s, then kill -9 on the pane pid;
# - a non-purge stop moves the row to status=stopped (not terminated).
# Re-running a default stop on an already-stopped session is a no-op (exit 0).
#
# Exit codes:
# 0 = success (or already-stopped no-op) | 1 = YAML not found / not registered
# 2 = invalid args | 3 = interactive confirmation required (--yes missing)
set -euo pipefail
# lib.sh lives two directories above this script.
source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
# Print the command-line synopsis to stdout.
usage() {
cat <<EOF
Usage: $0 --session <name> [--agent claude|agy] [--purge-conversation] [--yes] [--reason <reason>]
Stop arguments:
--reason <reason> — stop_reason field (default: manual_stop)
(idempotent: stopping an already-stopped session is a no-op with exit 0)
EOF
}
# Defaults. CAPTURE_ID, GRACEFUL and STOP_MODE are fixed at 1: the options
# that used to toggle them are rejected by the parser below.
SESSION_NAME=""
AGENT=""
PURGE=0
YES=0
CAPTURE_ID=1
GRACEFUL=1
REASON="manual_stop"
STOP_MODE=1

# Abort with exit 2 when a value-taking option is the last argument.
# Without this guard, "$2" trips `set -u` with an "unbound variable" error
# and exit status 1 instead of the documented invalid-args status.
# Arguments: the remaining command line ("$@"), option name first.
_require_value() {
  if [ "$#" -lt 2 ]; then
    echo "ERROR: $1 requires a value" >&2
    usage
    exit 2
  fi
}

while [ $# -gt 0 ]; do
  case "$1" in
    --session)
      _require_value "$@"
      SESSION_NAME="$2"
      shift 2
      ;;
    --agent)
      _require_value "$@"
      AGENT="$2"
      shift 2
      ;;
    --purge-conversation)
      PURGE=1
      shift
      ;;
    --yes)
      YES=1
      shift
      ;;
    --reason)
      _require_value "$@"
      REASON="$2"
      shift 2
      ;;
    --mode|--capture-id|--graceful)
      echo "ERROR: $1 option is deprecated. Stop now always stops gracefully and captures IDs." >&2
      exit 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "ERROR: unknown arg: $1" >&2
      usage
      exit 2
      ;;
  esac
done

[ -n "$SESSION_NAME" ] || { echo "ERROR: --session required" >&2; usage; exit 2; }
[ -f "$AGENT_SESSIONS_YAML" ] || { echo "ERROR: $AGENT_SESSIONS_YAML not found" >&2; exit 1; }

# Export the tmux server resolved for this session so later tmux calls see it.
# NOTE(review): combining `export` with the command substitution hides a
# non-zero status from resolve_tmux_server; kept as-is because the helper's
# failure contract is not visible from this script.
export TMUX_SERVER_NAME="$(resolve_tmux_server "$SESSION_NAME")"

# Without --agent, fall back to the session-name suffix (P1-F).
if [ -z "$AGENT" ]; then
  case "$SESSION_NAME" in
    *-creator-claude) AGENT=claude ;;
    *-creator-agy) AGENT=agy ;;
    *-creator-hermes) AGENT=hermes ;;
    *)
      echo "ERROR: cannot infer agent from '$SESSION_NAME'; pass --agent" >&2
      exit 2
      ;;
  esac
fi
# Check that the session is registered and extract its workspace cwd and
# delegate_job_id. The result is emitted as JSON, so a '|' inside the cwd is
# safe (review item 7; replaces the former cwd|jid parser).
# The lookup prefers the per-session table of the SQLite database, then the
# database's state document, and only reads the YAML when no database exists.
# Read failures are reported on stderr (stdout is captured into MAPPED_DATA);
# the Python program exits 7 when the session cannot be found.
MAPPED_DATA=$(env_python "$AGENT_SESSIONS_YAML" SESSION_NAME="$SESSION_NAME" <<'PYEOF'
import os, sys, json, yaml, sqlite3

name = os.environ['SESSION_NAME']
yaml_path = os.environ['YAML_PATH']
db_path = os.path.splitext(yaml_path)[0] + '.db'


def emit_and_exit(session):
    """Print the row's cwd / job id as one JSON object and exit 0.

    Null values are normalised to '' so the shell never receives the text
    'None' as a workspace path or job id.
    """
    cwd = (session.get('pane') or {}).get('cwd') or ''
    jid = session.get('delegate_job_id') or ''
    print(json.dumps({"cwd": cwd, "job_id": jid}))
    raise SystemExit(0)


d = {}
try:
    if os.path.exists(db_path):
        conn = sqlite3.connect(db_path, timeout=10.0)
        try:
            try:
                row = conn.execute('SELECT data FROM sessions WHERE name=?', (name,)).fetchone()
                if row:
                    emit_and_exit(json.loads(row[0]))
            except sqlite3.OperationalError:
                # No usable `sessions` table: fall back to the state document.
                pass
            row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
            if row:
                d = json.loads(row[0])
        finally:
            # Runs on every path, including the SystemExit raised by emit_and_exit.
            conn.close()
    elif os.path.exists(yaml_path):
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
except Exception as exc:
    # SystemExit is not an Exception subclass, so a successful emit passes through.
    print(f"WARN: could not read session registry: {exc}", file=sys.stderr)

rows = d.get('tmux_sessions') if isinstance(d, dict) else None
for s in rows or []:
    if s.get('name') == name:
        emit_and_exit(s)
raise SystemExit(7)
PYEOF
) || {
  echo "ERROR: session '$SESSION_NAME' not in $AGENT_SESSIONS_YAML" >&2
  exit 1
}
# `or ""` keeps a JSON null from being printed as the literal text "None".
TARGET_CWD=$(printf '%s' "$MAPPED_DATA" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("cwd") or "")')
DELEGATE_JOB_ID=$(printf '%s' "$MAPPED_DATA" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("job_id") or "")')
# Idempotency: a plain stop of an already-stopped session is a no-op (exit 0).
# The shortcut is skipped for --purge-conversation, so the conversation of a
# session that was stopped earlier can still be purged instead of the request
# being silently dropped.
if [ "$STOP_MODE" = "1" ] && [ "$PURGE" != "1" ]; then
  if STOPPED_INFO=$(is_already_stopped "$SESSION_NAME"); then
    echo "already stopped (status=stopped, $STOPPED_INFO) — no-op"
    exit 0
  fi
fi

# Purge confirmation: refuse to delete anything unless --yes was given.
if [ "$PURGE" = "1" ] && [ "$YES" != "1" ]; then
  echo "DANGER: --purge-conversation will DELETE this workspace's on-disk conversation."
  echo " workspace: ${TARGET_CWD:-<unknown>}"
  echo " This means: no future multi-agent-mux-resume for this session."
  echo " Re-run with --yes to confirm."
  exit 3
fi

# Resolve the UUID to purge within this workspace only (P0-C — no global lookup).
PURGE_UUID=""
if [ "$PURGE" = "1" ] && [ -n "$TARGET_CWD" ]; then
  PURGE_UUID=$(find_workspace_uuid "$TARGET_CWD" "$AGENT" || true)
fi

NOW_ISO=$(date -u +'%Y-%m-%dT%H:%M:%SZ')
NOW_EPOCH=$(date +%s)

# tmux liveness plus a final TUI snapshot (taken only while the session is
# alive; the captured text is later passed on through the environment only).
TMUX_ALIVE=0
LAST_STATUS=""
if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
  TMUX_ALIVE=1
  # Newlines become spaces and the snapshot is capped at 500 bytes. Decoding
  # with errors="ignore" drops a multi-byte UTF-8 character cut in half by the
  # byte cap (and any other invalid bytes), so the stored text is valid UTF-8.
  LAST_STATUS=$(
    tmux capture-pane -t "$SESSION_NAME" -p -S -10 2>/dev/null \
      | python3 -c 'import sys
raw = sys.stdin.buffer.read().replace(b"\n", b" ")[:500]
sys.stdout.buffer.write(raw.decode("utf-8", "ignore").encode("utf-8"))' \
      || true
  )
fi

# Conversation-id capture: resolve the id just before the kill, while the
# process / jsonl may still be there.
# NOTE(review): per the original comment the lookup tries tier-1 (row) ->
# tier-2 (workspace-scoped disk scan) -> tier-3 (cache) and therefore works
# whether or not tmux is alive; confirm against capture_conversation_id in lib.sh.
CAPTURED_UUID=""
if [ "$CAPTURE_ID" = "1" ] && [ -n "$TARGET_CWD" ]; then
  CAPTURED_UUID=$(capture_conversation_id "$AGENT" "$TARGET_CWD" || true)
  if [ -n "$CAPTURED_UUID" ]; then
    echo "captured conversation id: $CAPTURED_UUID"
  else
    echo "WARN: --capture-id requested but no conversation id resolved (nothing on disk yet)"
  fi
fi

delegate_publish_event "$DELEGATE_JOB_ID" progress "terminating"
#######################################
# Graceful shutdown with a fallback chain: ask the agent to quit via
# send-keys, then kill-session, then SIGKILL on the pane's pid.
# Globals:   SESSION_NAME (read), AGENT (read)
# Arguments: none
# Outputs:   progress lines on stdout
# Returns:   0 in every case
#######################################
graceful_stop() {
  local root_pid quit_cmd

  # Remember the first pane's pid up front; it is the SIGKILL target of last resort.
  root_pid=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}' 2>/dev/null | head -1 || true)

  # agy is typed "Exit"; every other agent gets "/exit".
  if [ "$AGENT" = "agy" ]; then
    quit_cmd="Exit"
  else
    quit_cmd="/exit"
  fi

  echo "graceful: send-keys '$quit_cmd' to $SESSION_NAME"
  tmux send-keys -t "$SESSION_NAME" "$quit_cmd" Enter 2>/dev/null || true
  sleep 3

  if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
    echo "graceful: still alive → kill-session (SIGTERM)"
    tmux kill-session -t "$SESSION_NAME" 2>/dev/null || true
    sleep 5

    if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
      echo "graceful: STILL alive → SIGKILL fallback (pane pid $root_pid)"
      if [ -n "$root_pid" ]; then
        kill -9 "$root_pid" 2>/dev/null || true
      fi
    else
      echo "graceful: terminated after kill-session"
    fi
  else
    echo "graceful: exited cleanly"
  fi

  return 0
}
# Tear down tmux. A live session goes through the graceful fallback chain when
# GRACEFUL is 1 and through a plain kill-session otherwise; a dead session
# only needs its registry entry updated.
case "${TMUX_ALIVE}/${GRACEFUL}" in
  1/1)
    graceful_stop
    ;;
  1/*)
    tmux kill-session -t "$SESSION_NAME"
    echo "killed tmux: $SESSION_NAME"
    ;;
  *)
    echo "tmux already dead, just updating YAML"
    ;;
esac
# Persist the outcome through atomic_dump_yaml. The KEY=VALUE arguments are
# read back from os.environ by the Python program on stdin, which mutates the
# registry document `d` in place.
# NOTE(review): the program uses `os`, `d` and HOME_DIR without importing or
# defining them — presumably atomic_dump_yaml provides them; confirm in lib.sh.
atomic_dump_yaml "$AGENT_SESSIONS_YAML" \
SESSION_NAME="$SESSION_NAME" AGENT="$AGENT" PURGE="$PURGE" \
NOW_ISO="$NOW_ISO" NOW_EPOCH="$NOW_EPOCH" LAST_STATUS="$LAST_STATUS" \
PURGE_UUID="$PURGE_UUID" TARGET_CWD="$TARGET_CWD" \
REASON="$REASON" CAPTURED_UUID="$CAPTURED_UUID" <<'PYEOF'
import shutil
# Inputs passed by the shell wrapper through the environment.
name = os.environ['SESSION_NAME']
agent = os.environ['AGENT']
purge = os.environ['PURGE'] == '1'
now = os.environ['NOW_ISO']
home = os.environ['HOME_DIR']
last_status = os.environ.get('LAST_STATUS', '')
purge_uuid = os.environ.get('PURGE_UUID', '').strip()
ws = os.environ.get('TARGET_CWD', '')
reason = os.environ.get('REASON', '') or 'manual_stop'
captured = os.environ.get('CAPTURED_UUID', '').strip()
# Locate the row for this session; it was present during the earlier lookup.
target = None
for s in d.get('tmux_sessions', []):
if s.get('name') == name:
target = s
break
if target is None:
print(f"ERROR: disappeared during script: {name}", flush=True)
raise SystemExit(1)
# State transition: purge -> terminated, otherwise -> stopped.
if purge:
target['status'] = 'terminated'
target['terminated_at'] = now
target['terminated_at_epoch'] = int(os.environ['NOW_EPOCH'])
target['termination_mode'] = 'purge'
else:
target['status'] = 'stopped'
target['stopped_at'] = now
target['stopped_at_epoch'] = int(os.environ['NOW_EPOCH'])
target['stop_reason'] = reason
target['termination_mode'] = 'graceful'
# Keep the last TUI snapshot for audit, when one was captured.
if last_status:
target['last_visible_status_at_termination'] = last_status
# Conversation-id capture: record the captured UUID in the row (only when not purging).
if captured and not purge:
if agent == 'claude':
target['claude_session_id_own'] = captured
elif agent == 'agy':
target['agy_conversation_id_own'] = captured
elif agent == 'hermes':
target['hermes_conversation_id_own'] = captured
target['resumable'] = True
# --purge-conversation: delete only the on-disk artifacts of the workspace-scoped UUID (P0-C).
if purge and purge_uuid:
if agent == 'claude':
# Project key: workspace path with '/' and '_' both replaced by '-'.
key = ws.replace('/', '-').replace('_', '-')
claude_project_dir = os.environ.get('CLAUDE_PROJECT_DIR', f"{home}/.claude/projects")
jsonl = f"{claude_project_dir}/{key}/{purge_uuid}.jsonl"
if os.path.exists(jsonl):
os.remove(jsonl)
print(f"purged: {jsonl}", flush=True)
target['claude_session_id_own'] = None
elif agent == 'agy':
# agy keeps a conversation database plus a per-conversation brain directory.
db = f"{home}/.gemini/antigravity-cli/conversations/{purge_uuid}.db"
if os.path.exists(db):
os.remove(db)
print(f"purged: {db}", flush=True)
brain = f"{home}/.gemini/antigravity-cli/brain/{purge_uuid}"
if os.path.isdir(brain):
shutil.rmtree(brain)
print(f"purged: {brain}", flush=True)
target['agy_conversation_id_own'] = None
elif agent == 'hermes':
# hermes: a session JSON file plus rows in the sessions / messages tables of state.db.
json_file = f"{home}/.mam/sessions/session_{purge_uuid}.json"
if os.path.exists(json_file):
os.remove(json_file)
print(f"purged: {json_file}", flush=True)
hdb = f"{home}/.mam/state.db"
if os.path.exists(hdb):
try:
import sqlite3
conn = sqlite3.connect(hdb)
conn.execute("DELETE FROM sessions WHERE id=?", (purge_uuid,))
conn.execute("DELETE FROM messages WHERE session_id=?", (purge_uuid,))
conn.commit()
conn.close()
print(f"purged db records for session: {purge_uuid}", flush=True)
except Exception as e:
# Database cleanup is best-effort: warn and continue.
print(f"WARN: purge hermes db records failed: {e}", flush=True)
target['hermes_conversation_id_own'] = None
# agent_identities is a cache — clear it only when it belongs to this workspace
# and points at the UUID that was just purged.
ai = (d.get('agent_identities') or {}).get(agent) or {}
if ai.get('project_cwd') == ws:
if agent == 'claude' and ai.get('session_id') == purge_uuid:
ai['session_id'] = None
ai['session_jsonl'] = None
ai.pop('session_size_bytes', None)
ai.pop('session_lines', None)
elif agent == 'agy' and ai.get('conversation_id') == purge_uuid:
ai['conversation_id'] = None
ai['conversation_db'] = None
ai['conversation_brain_dir'] = None
elif agent == 'hermes' and ai.get('session_id') == purge_uuid:
ai['session_id'] = None
elif purge and not purge_uuid:
print("WARN: --purge-conversation requested but no workspace-scoped UUID resolved; nothing purged", flush=True)
# A purged session can never be resumed.
if purge:
target['resumable'] = False
print(f"updated: {name} status={target['status']}", flush=True)
PYEOF
# Tell the delegate-job registry that the session is finished, then print a
# human-readable summary of what was done.
delegate_publish_event "$DELEGATE_JOB_ID" completed "session terminated"

printf '\n'
printf '%s\n' "=== stop complete ==="
printf ' session: %s\n' "$SESSION_NAME"
printf ' agent: %s\n' "$AGENT"
printf ' reason: %s\n' "$REASON"
printf ' captured: %s\n' "${CAPTURED_UUID:-<none>}"
printf ' purge: %s\n' "$PURGE${PURGE_UUID:+ (uuid $PURGE_UUID)}"
printf ' time: %s\n' "$NOW_ISO"
printf '\n'
printf '%s\n' "Recovery: multi-agent-mux-create + multi-agent-mux-resume 로 동일 컨텍스트 복원 가능"
printf '%s\n' " (단 --purge-conversation 사용 시 복원 불가)"
+1
View File
@@ -0,0 +1 @@
tmux_sessions: []
+33
View File
@@ -0,0 +1,33 @@
.agents/skills/multi-agent-mux-stop/scripts/stop_session.sh
.agents/skills/multi-agent-mux-stop/SKILL.md
.agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh
.agents/skills/multi-agent-mux-monitor/SKILL.md
.agents/skills/multi-agent-mux-delegate-job/mqtt-broker-setup.md
.agents/skills/multi-agent-mux-delegate-job/requirements.txt
.agents/skills/multi-agent-mux-delegate-job/multi-agent-mux-delegate-job
.agents/skills/multi-agent-mux-delegate-job/README.md
.agents/skills/multi-agent-mux-delegate-job/scripts/publish_event.py
.agents/skills/multi-agent-mux-delegate-job/scripts/registry.py
.agents/skills/multi-agent-mux-delegate-job/scripts/mqtt_common.py
.agents/skills/multi-agent-mux-delegate-job/scripts/job_subscriber.py
.agents/skills/multi-agent-mux-delegate-job/job-protocol.md
.agents/skills/multi-agent-mux-delegate-job/SKILL.md
.agents/skills/multi-agent-mux-delegate-job/registry.md
.agents/skills/multi-agent-mux-create/scripts/create_session.sh
.agents/skills/multi-agent-mux-create/SKILL.md
.agents/skills/lib.sh
.agents/skills/multi-agent-mux-resume/scripts/resolve_session_id.sh
.agents/skills/multi-agent-mux-resume/scripts/update_yaml_resumed.sh
.agents/skills/multi-agent-mux-resume/SKILL.md
.agents/skills/multi-agent-mux-status/scripts/status.sh
.agents/skills/multi-agent-mux-status/SKILL.md
AGENT.md
AGENT.ko.md
MESSAGING.md
BOOTSTRAP.md
BOOTSTRAP.ko.md
INSTRUCTION.md
remove.sh
update.sh
.env.example
.env