From b76249a2a634b88e9c3bd9bbf0a3ead43a9400a1 Mon Sep 17 00:00:00 2001 From: Godopu Date: Thu, 25 Jun 2026 12:19:24 +0900 Subject: [PATCH] =?UTF-8?q?mam=20=EC=84=A4=EC=B9=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .agents/skills/lib.sh | 787 ++++++++++++++++++ .../skills/multi-agent-mux-create/SKILL.md | 220 +++++ .../scripts/create_session.sh | 294 +++++++ .../multi-agent-mux-delegate-job/README.md | 11 + .../multi-agent-mux-delegate-job/SKILL.md | 385 +++++++++ .../job-protocol.md | 114 +++ .../mqtt-broker-setup.md | 176 ++++ .../multi-agent-mux-delegate-job | 277 ++++++ .../multi-agent-mux-delegate-job/registry.md | 183 ++++ .../requirements.txt | 2 + .../scripts/job_subscriber.py | 252 ++++++ .../scripts/mqtt_common.py | 616 ++++++++++++++ .../scripts/publish_event.py | 229 +++++ .../scripts/registry.py | 334 ++++++++ .../skills/multi-agent-mux-monitor/SKILL.md | 236 ++++++ .../scripts/reconcile.sh | 542 ++++++++++++ .../skills/multi-agent-mux-resume/SKILL.md | 151 ++++ .../scripts/resolve_session_id.sh | 40 + .../scripts/update_yaml_resumed.sh | 156 ++++ .../skills/multi-agent-mux-status/SKILL.md | 124 +++ .../multi-agent-mux-status/scripts/status.sh | 140 ++++ .agents/skills/multi-agent-mux-stop/SKILL.md | 136 +++ .../scripts/stop_session.sh | 341 ++++++++ .mam/agent-sessions.yaml | 1 + .mam/install_manifest.txt | 33 + 25 files changed, 5780 insertions(+) create mode 100644 .agents/skills/lib.sh create mode 100644 .agents/skills/multi-agent-mux-create/SKILL.md create mode 100755 .agents/skills/multi-agent-mux-create/scripts/create_session.sh create mode 100644 .agents/skills/multi-agent-mux-delegate-job/README.md create mode 100644 .agents/skills/multi-agent-mux-delegate-job/SKILL.md create mode 100644 .agents/skills/multi-agent-mux-delegate-job/job-protocol.md create mode 100644 .agents/skills/multi-agent-mux-delegate-job/mqtt-broker-setup.md create mode 100755 .agents/skills/multi-agent-mux-delegate-job/multi-agent-mux-delegate-job create mode 100644 .agents/skills/multi-agent-mux-delegate-job/registry.md create mode 100644 .agents/skills/multi-agent-mux-delegate-job/requirements.txt create mode 100755 .agents/skills/multi-agent-mux-delegate-job/scripts/job_subscriber.py create mode 100644 .agents/skills/multi-agent-mux-delegate-job/scripts/mqtt_common.py create mode 100755 .agents/skills/multi-agent-mux-delegate-job/scripts/publish_event.py create mode 100755 .agents/skills/multi-agent-mux-delegate-job/scripts/registry.py create mode 100644 .agents/skills/multi-agent-mux-monitor/SKILL.md create mode 100755 .agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh create mode 100644 .agents/skills/multi-agent-mux-resume/SKILL.md create mode 100755 .agents/skills/multi-agent-mux-resume/scripts/resolve_session_id.sh create mode 100755 .agents/skills/multi-agent-mux-resume/scripts/update_yaml_resumed.sh create mode 100644 .agents/skills/multi-agent-mux-status/SKILL.md create mode 100755 .agents/skills/multi-agent-mux-status/scripts/status.sh create mode 100644 .agents/skills/multi-agent-mux-stop/SKILL.md create mode 100755 .agents/skills/multi-agent-mux-stop/scripts/stop_session.sh create mode 100644 .mam/agent-sessions.yaml create mode 100644 .mam/install_manifest.txt diff --git a/.agents/skills/lib.sh b/.agents/skills/lib.sh new file mode 100644 index 0000000..ee93964 --- /dev/null +++ b/.agents/skills/lib.sh @@ -0,0 +1,787 @@ +#!/usr/bin/env bash +# lib.sh — shared library for the multi-agent-mux-* skills. +# +# Single source of truth for the four things that were inconsistently +# re-implemented across create/resume/delete/monitor (REVIEW.md §4.1): +# - derive_session_name : the tmux session slug (P0-A) +# - atomic_dump_yaml : SQLite db transaction + temp+rename + .bak + validate (P0-B) +# - env_python : env-safe Python (no heredoc injection) (P0-B / P1-B) +# - find_workspace_uuid : workspace-SCOPED resume id lookup (P0-C) +# +# Source it from each script with a path computed from the script location: +# source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh" +# +# HARD RULE: the agent-sessions.yaml file is only ever written through +# atomic_dump_yaml. Never `open(yaml_path, 'w')` anywhere else. + +SKILL_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +WORKSPACE_ROOT="$(cd "$SKILL_DIR/../.." && pwd)" +AGENT_SESSIONS_YAML="${AGENT_SESSIONS_YAML:-$WORKSPACE_ROOT/.mam/agent-sessions.yaml}" + +# Workspace-relative defaults with environment overrides (Phase Z) +HOME_DIR="${HOME_DIR:-$WORKSPACE_ROOT}" +CLAUDE_PROJECT_DIR="${CLAUDE_PROJECT_DIR:-$HOME/.claude/projects}" +LOCAL_BIN="${LOCAL_BIN:-$HOME/.local/bin}" + +# --------------------------------------------------------------------------- +# Tmux Server Isolation support +# --------------------------------------------------------------------------- +# Paths to exclude when resolving the real tmux binary (shim/wrapper dirs). +_TMUX_SHIM_DIR_PATTERN="${_TMUX_SHIM_DIR_PATTERN:-/multi-agent-tmux-shim/}" +_TMUX_SKILLS_BIN_PATTERN="${_TMUX_SKILLS_BIN_PATTERN:-/.agents/skills/.bin}" + +TMUX_SERVER_NAME="${TMUX_SERVER_NAME:-default}" + +_resolve_real_tmux_path() { + if [ -z "${_REAL_TMUX_PATH:-}" ] || [[ "$_REAL_TMUX_PATH" == *"${_TMUX_SHIM_DIR_PATTERN}"* ]] || [[ "$_REAL_TMUX_PATH" == *"${_TMUX_SKILLS_BIN_PATTERN}"* ]]; then + local dir save_ifs="$IFS" + _REAL_TMUX_PATH="" + IFS=: + for dir in $PATH; do + if [[ "$dir" != *"${_TMUX_SHIM_DIR_PATTERN}"* ]] && [[ "$dir" != *"${_TMUX_SKILLS_BIN_PATTERN}"* ]] && [ -x "$dir/tmux" ]; then + _REAL_TMUX_PATH="$dir/tmux" + break + fi + done + IFS="$save_ifs" + if [ -z "$_REAL_TMUX_PATH" ]; then + _REAL_TMUX_PATH="tmux" + fi + export _REAL_TMUX_PATH + fi +} + +_init_tmux_isolation() { + _resolve_real_tmux_path + if [ -n "${TMUX_SERVER_NAME:-}" ] && [ "$TMUX_SERVER_NAME" != "default" ]; then + local wrapper_dir="${TMPDIR:-/tmp}${_TMUX_SHIM_DIR_PATTERN}${TMUX_SERVER_NAME}" + if [[ ":$PATH:" != *":$wrapper_dir:"* ]]; then + mkdir -p "$wrapper_dir" + cat < "$wrapper_dir/tmux" +#!/usr/bin/env bash +if [ -z "\${TMUX_SERVER_NAME:-}" ] || [ "\$TMUX_SERVER_NAME" = "default" ]; then + exec "$_REAL_TMUX_PATH" "\$@" +else + exec "$_REAL_TMUX_PATH" -L "\$TMUX_SERVER_NAME" "\$@" +fi +EOF + chmod +x "$wrapper_dir/tmux" + export PATH="$wrapper_dir:$PATH" + fi + else + # 격리 비활성화 시 shim 자동 cleanup (PATH에서 제거) + local new_path="" dir save_ifs="$IFS" + IFS=: + for dir in $PATH; do + if [[ "$dir" != *"${_TMUX_SHIM_DIR_PATTERN}"* ]] && [[ "$dir" != *"${_TMUX_SKILLS_BIN_PATTERN}"* ]]; then + if [ -z "$new_path" ]; then + new_path="$dir" + else + new_path="$new_path:$dir" + fi + fi + done + IFS="$save_ifs" + export PATH="$new_path" + fi +} + +_tmux() { + _init_tmux_isolation + if [ -z "${TMUX_SERVER_NAME:-}" ] || [ "$TMUX_SERVER_NAME" = "default" ]; then + "$_REAL_TMUX_PATH" "$@" + else + "$_REAL_TMUX_PATH" -L "$TMUX_SERVER_NAME" "$@" + fi +} + +tmux() { + _tmux "$@" +} + +# --------------------------------------------------------------------------- +# resolve_tmux_server +# +# Query agent-sessions.yaml to find the tmux_server associated with a session. +# Fallback to TMUX_SERVER_NAME or 'default' if not registered or field is missing. +# Prints the resolved server name on stdout. +# --------------------------------------------------------------------------- +resolve_tmux_server() { + local session_name="$1" + SESSION_NAME="$session_name" env_python "$AGENT_SESSIONS_YAML" <<'PYEOF' +import os, sys, sqlite3, json, yaml +name = os.environ['SESSION_NAME'] +yaml_path = os.environ['YAML_PATH'] +db_path = os.path.splitext(yaml_path)[0] + '.db' +try: + if os.path.exists(db_path): + conn = sqlite3.connect(db_path, timeout=10.0) + try: + row = conn.execute('SELECT data FROM sessions WHERE name=?', (name,)).fetchone() + if row: + s = json.loads(row[0]) + server = s.get('tmux_server') + if server: + print(server) + sys.exit(0) + except sqlite3.OperationalError: + pass + row = conn.execute('SELECT data FROM state WHERE id=1').fetchone() + if row: + d = json.loads(row[0]) + for s in d.get('tmux_sessions', []): + if s.get('name') == name: + server = s.get('tmux_server') + if server: + print(server) + sys.exit(0) + conn.close() + elif os.path.exists(yaml_path): + with open(yaml_path) as f: + d = yaml.safe_load(f) or {} + for s in d.get('tmux_sessions', []): + if s.get('name') == name: + server = s.get('tmux_server') + if server: + print(server) + sys.exit(0) +except Exception: + pass +# Fallback +print(os.environ.get('TMUX_SERVER_NAME', 'default')) +PYEOF +} + +# --------------------------------------------------------------------------- +# derive_session_name +# +# THE single source of truth for the tmux session name. Rule: +# slug = the two trailing path components of the absolute workspace, +# '_' -> '-', lowercased, joined with '-' +# name = "-creator-" +# +# Workspace root 기준 상대 해석. 예: +# $WORKSPACE_ROOT/landing_page/refer_landing_page + claude +# -> landing-page-refer-landing-page-creator-claude +# +# Decision (REVIEW P0-A): the actual workspace basename (refer_landing_page) +# IS included. The hand-written historical entry that dropped it +# (lab-landing-page-creator-claude) was the bug, not the convention. +# Every script and SKILL.md must use exactly this rule. +# --------------------------------------------------------------------------- +derive_session_name() { + local workspace="$1" agent="$2" + local abs parent work slug + abs="$(cd "$workspace" 2>/dev/null && pwd)" || abs="$workspace" + parent="$(basename "$(dirname "$abs")" 2>/dev/null || echo "")" + work="$(basename "$abs" 2>/dev/null || echo "root")" + if [ -z "$parent" ] || [ "$parent" = "/" ] || [ "$parent" = "." ]; then + parent="workspace" + fi + if [ -z "$work" ] || [ "$work" = "/" ] || [ "$work" = "." ]; then + work="root" + fi + slug="$(printf '%s-%s' "$parent" "$work" | tr '[:upper:]' '[:lower:]' | tr '_' '-')" + slug="$(printf '%s' "$slug" | tr -cd 'a-zA-Z0-9-')" + printf '%s-creator-%s' "$slug" "$agent" +} + +# --------------------------------------------------------------------------- +# env_python [KEY=VALUE ...] (Python source read from stdin) +# +# Run python3 with the source supplied on stdin via a *quoted* heredoc, so the +# shell never interpolates the source. All values are passed through the +# environment (YAML_PATH plus any KEY=VALUE pairs). Untrusted data (workspace +# paths, capture-pane text) must travel as env vars and be read via os.environ +# inside the script — never spliced into the source. Read-only by convention; +# use atomic_dump_yaml when you need to write the YAML. +# --------------------------------------------------------------------------- +_validate_env_key() { + local key="$1" + if [[ ! "$key" =~ ^[a-zA-Z_][a-zA-Z0-9_]*$ ]]; then + echo "ERROR: Invalid environment variable name: $key" >&2 + return 1 + fi + case "$key" in + LD_PRELOAD|LD_LIBRARY_PATH|PYTHONPATH|PYTHONHOME|PYTHONINSPECT|PYTHONSTARTUP) + echo "ERROR: Blocked environment variable: $key" >&2 + return 1 + ;; + esac + return 0 +} + +env_python() { + local yaml_path="$1"; shift + local -a envs=("YAML_PATH=$yaml_path" "HOME_DIR=$HOME_DIR" "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR" "LOCAL_BIN=$LOCAL_BIN") + while [ $# -gt 0 ]; do + case "$1" in + *=*) + local key="${1%%=*}" + _validate_env_key "$key" || return 1 + envs+=("$1") + shift + ;; + *) + break + ;; + esac + done + env "${envs[@]}" python3 - "$@" +} + +# --------------------------------------------------------------------------- +# atomic_dump_yaml [KEY=VALUE ...] (mutation source from stdin) +# +# The ONLY sanctioned way to write agent-sessions.yaml. It: +# 1. takes an exclusive SQLite BEGIN IMMEDIATE transaction lock on +# agent-sessions.db (serialises all writers) +# 2. loads the current state into `d` (seeds from YAML if DB is empty) +# 3. exec()s the caller's mutation source (sees d, yaml, os, datetime, +# timezone, glob, subprocess; reads values via os.environ). The mutation +# may print and may `raise SystemExit(n)` to abort *without* writing. +# 4. validates the resulting schema +# 5. backs up to .bak, then writes YAML atomically (temp + os.replace) +# when a session transitions to a finished state. +# +# The mutation source is passed via env and exec()'d — it is never string +# spliced and untrusted data never lands in Python source (P0-B / P1-B). +# --------------------------------------------------------------------------- +# Check if the workspace is on NFS — locking behaves differently on NFS +_check_is_nfs() { + local f="$1" + local mountpoint + mountpoint="$(df --output=target "$f" 2>/dev/null | tail -1)" || return 1 + if mount | grep -q "$mountpoint.*nfs\|$mountpoint.*cifs\|$mountpoint.*fuse.sshfs"; then + return 0 # is NFS + fi + return 1 # not NFS +} + +atomic_dump_yaml() { + local yaml_path="$1"; shift + if [ -z "${MAM_IS_NFS:-}" ]; then + if _check_is_nfs "$(dirname "$yaml_path")"; then + export MAM_IS_NFS="true" + echo "WARNING: $(dirname "$yaml_path") appears to be a network filesystem (NFS/CIFS/SSHFS)." >&2 + echo "WARNING: SQLite journal_mode automatically falls back to DELETE." >&2 + else + export MAM_IS_NFS="false" + fi + fi + + local -a envs=("YAML_PATH=$yaml_path" "HOME_DIR=$HOME_DIR" "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR" "LOCAL_BIN=$LOCAL_BIN" "MAM_IS_NFS=$MAM_IS_NFS") + while [ $# -gt 0 ]; do + case "$1" in + *=*) + local key="${1%%=*}" + _validate_env_key "$key" || return 1 + envs+=("$1") + shift + ;; + *) + break + ;; + esac + done + local mutation; mutation="$(cat)" + env "${envs[@]}" AGENT_SESSIONS_MUTATION="$mutation" python3 - <<'PYEOF' +import os, sys, tempfile, shutil, glob, subprocess, json, sqlite3 +from datetime import datetime, timezone +import yaml + +yaml_path = os.environ['YAML_PATH'] +db_path = os.path.splitext(yaml_path)[0] + '.db' + +def _validate(d): + if not isinstance(d, dict): + raise SystemExit("VALIDATE: top-level is not a mapping") + sessions = d.get('tmux_sessions', []) + if not isinstance(sessions, list): + raise SystemExit("VALIDATE: tmux_sessions is not a list") + valid = {'running', 'terminated', 'archived', 'stopped'} + for i, s in enumerate(sessions): + if not isinstance(s, dict): + raise SystemExit(f"VALIDATE: tmux_sessions[{i}] not a mapping") + if not s.get('name') or not s.get('status'): + raise SystemExit(f"VALIDATE: tmux_sessions[{i}] missing name/status") + if s['status'] not in valid: + raise SystemExit(f"VALIDATE: tmux_sessions[{i}] {s.get('name')!r} bad status {s['status']!r}") + if not isinstance(s.get('pane'), dict): + raise SystemExit(f"VALIDATE: tmux_sessions[{i}] {s.get('name')!r} missing pane") + +def get_terminal_set(d): + return {s.get('name'): s.get('status') for s in d.get('tmux_sessions', []) if s.get('status') in ('stopped', 'terminated', 'archived')} + +os.makedirs(os.path.dirname(db_path) or '.', exist_ok=True) +conn = sqlite3.connect(db_path, timeout=60.0) + +for f in [db_path, db_path + '-wal', db_path + '-shm']: + if os.path.exists(f): + try: + os.chmod(f, 0o600) + except Exception: + pass + +is_nfs = os.environ.get('MAM_IS_NFS') == 'true' +if is_nfs: + conn.execute('PRAGMA journal_mode=DELETE') +else: + conn.execute('PRAGMA journal_mode=WAL') + +try: + # Disable auto-commit by explicitly starting a transaction with BEGIN IMMEDIATE + # This prevents the read-modify-write lost update race condition. + conn.execute('BEGIN IMMEDIATE') + conn.execute('CREATE TABLE IF NOT EXISTS state (id INTEGER PRIMARY KEY, data TEXT)') + conn.execute('CREATE TABLE IF NOT EXISTS sessions (name TEXT PRIMARY KEY, status TEXT, pane_cwd TEXT, data JSON)') + conn.execute('CREATE INDEX IF NOT EXISTS idx_sessions_pane_cwd ON sessions(pane_cwd)') + + row = conn.execute('SELECT data FROM state WHERE id=1').fetchone() + if row: + d = json.loads(row[0]) + else: + # Seed from YAML + if os.path.exists(yaml_path): + with open(yaml_path) as f: + d = yaml.safe_load(f) or {} + else: + d = {} + + # Assemble d['tmux_sessions'] from sessions table if table contains data + db_sessions = [] + cursor = conn.execute('SELECT name, status, pane_cwd, data FROM sessions') + for s_row in cursor.fetchall(): + s_data = json.loads(s_row[3]) + s_data['name'] = s_row[0] + s_data['status'] = s_row[1] + if 'pane' not in s_data: + s_data['pane'] = {} + s_data['pane']['cwd'] = s_row[2] + db_sessions.append(s_data) + + if db_sessions: + d['tmux_sessions'] = db_sessions + elif 'tmux_sessions' not in d: + d['tmux_sessions'] = [] + + old_terminals = get_terminal_set(d) + + # --- caller mutation (module scope: sees d, yaml, os, glob, subprocess) --- + exec(compile(os.environ['AGENT_SESSIONS_MUTATION'], '', 'exec'), globals()) + + _validate(d) + + # Separate globals and sessions for normalization + d_state = {k: v for k, v in d.items() if k != 'tmux_sessions'} + conn.execute('REPLACE INTO state (id, data) VALUES (1, ?)', (json.dumps(d_state),)) + + current_names = [] + for s in d.get('tmux_sessions', []): + name = s.get('name') + status = s.get('status') + pane_cwd = (s.get('pane') or {}).get('cwd', '') + conn.execute('REPLACE INTO sessions (name, status, pane_cwd, data) VALUES (?, ?, ?, ?)', + (name, status, pane_cwd, json.dumps(s))) + current_names.append(name) + + if current_names: + placeholders = ','.join('?' for _ in current_names) + conn.execute(f'DELETE FROM sessions WHERE name NOT IN ({placeholders})', current_names) + else: + conn.execute('DELETE FROM sessions') + + new_terminals = get_terminal_set(d) + + conn.commit() + + # Write to YAML ONLY when a session transitions to a finished state + # (Moved after conn.commit() per Claude's feedback) + if new_terminals != old_terminals: + if os.path.exists(yaml_path): + try: + shutil.copy2(yaml_path, yaml_path + '.bak') + except Exception: + pass + dir_ = os.path.dirname(yaml_path) or '.' + fd, tmp = tempfile.mkstemp(dir=dir_, prefix='.agent-sessions.', suffix='.tmp') + try: + with os.fdopen(fd, 'w') as f: + yaml.safe_dump(d, f, default_flow_style=False, sort_keys=False, + allow_unicode=True, width=4096) + os.replace(tmp, yaml_path) + except Exception: + if os.path.exists(tmp): + os.remove(tmp) + raise + + try: + conn.execute('PRAGMA wal_checkpoint(TRUNCATE)') + except Exception: + pass +except Exception: + conn.rollback() + raise +finally: + conn.close() + + # H3: Re-apply chmod 0600 after close to cover newly created -wal / -shm files + try: + os.chmod(db_path, 0o600) + wal = db_path + '-wal' + if os.path.exists(wal): os.chmod(wal, 0o600) + shm = db_path + '-shm' + if os.path.exists(shm): os.chmod(shm, 0o600) + except Exception: + pass +PYEOF +} + + +# --------------------------------------------------------------------------- +# find_workspace_uuid +# +# Workspace-SCOPED resolution of the resume UUID (P0-C). It NEVER returns a +# global agent_identities id unless that id's project_cwd matches THIS +# workspace. Resolution order: +# 1) tmux_sessions[] row whose pane.cwd == this workspace -> per-row own id +# (claude_session_id_own / agy_conversation_id_own) +# 2) on-disk scan scoped to this workspace +# (claude: ~/.claude/projects//*.jsonl ; agy: last_conversations.json[cwd]) +# 3) agent_identities cache, ONLY when its project_cwd == this workspace +# Prints the UUID on stdout (empty line if none). Always exits 0. +# --------------------------------------------------------------------------- +find_workspace_uuid() { + local workspace="$1" agent="$2" + local abs; abs="$(cd "$workspace" 2>/dev/null && pwd)" || abs="$workspace" + WS_ABS="$abs" AGENT="$agent" env_python "$AGENT_SESSIONS_YAML" <<'PYEOF' +import os, json, glob, sqlite3 +import yaml + +ws = os.environ['WS_ABS'] +agent = os.environ['AGENT'] +home = os.environ['HOME_DIR'] +yaml_path = os.environ['YAML_PATH'] +db_path = os.path.splitext(yaml_path)[0] + '.db' +claude_project_dir = os.environ.get('CLAUDE_PROJECT_DIR', f"{home}/.claude/projects") + +def jsonl_exists(uuid): + key = ws.replace('/', '-').replace('_', '-') + return os.path.exists(f"{claude_project_dir}/{key}/{uuid}.jsonl") + + +def db_exists(uuid): + return os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{uuid}.db") + + +def hermes_exists(uuid): + hdb = f"{home}/.mam/state.db" + if not os.path.exists(hdb): + return False + try: + conn = sqlite3.connect(hdb) + r = conn.execute("SELECT 1 FROM sessions WHERE id=?", (uuid,)).fetchone() + conn.close() + return r is not None + except Exception: + return False + + +def emit(u): + print(u) + raise SystemExit(0) + + +# 1) per-row own id for THIS workspace (optimized with direct sqlite query if db exists) +sessions = [] +try: + if os.path.exists(db_path): + conn = sqlite3.connect(db_path, timeout=10.0) + has_sessions_table = False + try: + cursor = conn.execute('SELECT data FROM sessions WHERE pane_cwd=?', (ws,)) + for row in cursor.fetchall(): + sessions.append(json.loads(row[0])) + has_sessions_table = True + except sqlite3.OperationalError: + pass + if not has_sessions_table or not sessions: + row = conn.execute('SELECT data FROM state WHERE id=1').fetchone() + if row: + d = json.loads(row[0]) + for s in d.get('tmux_sessions', []): + if isinstance(s, dict) and (s.get('pane') or {}).get('cwd') == ws: + sessions.append(s) + conn.close() + elif os.path.exists(yaml_path): + with open(yaml_path) as f: + d = yaml.safe_load(f) or {} + for s in d.get('tmux_sessions', []): + if isinstance(s, dict) and (s.get('pane') or {}).get('cwd') == ws: + sessions.append(s) +except Exception: + pass + +for s in sessions: + name = s.get('name', '') + if agent == 'claude' and name.endswith('-creator-claude'): + cand = s.get('claude_session_id_own') + if cand and jsonl_exists(cand): + emit(cand) + if agent == 'agy' and name.endswith('-creator-agy'): + cand = s.get('agy_conversation_id_own') + if cand and db_exists(cand): + emit(cand) + if agent == 'hermes' and name.endswith('-creator-hermes'): + cand = s.get('hermes_conversation_id_own') + if cand and hermes_exists(cand): + emit(cand) + +# 2) disk scan scoped to THIS workspace +if agent == 'claude': + key = ws.replace('/', '-').replace('_', '-') + proj = f"{claude_project_dir}/{key}" + if os.path.isdir(proj): + for j in sorted(glob.glob(f"{proj}/*.jsonl"), key=os.path.getmtime, reverse=True): + sid = None + try: + with open(j) as f: + first = f.readline().strip() + if first: + sid = json.loads(first).get('sessionId') + except Exception: + sid = None + cand = sid or os.path.basename(j)[:-6] + if cand and jsonl_exists(cand): + emit(cand) +elif agent == 'agy': + lc = f"{home}/.gemini/antigravity-cli/cache/last_conversations.json" + if os.path.exists(lc): + cand = None + try: + cand = json.load(open(lc)).get(ws) + except Exception: + cand = None + if cand and db_exists(cand): + emit(cand) +elif agent == 'hermes': + hdb = f"{home}/.mam/state.db" + if os.path.exists(hdb): + cand = None + try: + conn = sqlite3.connect(hdb) + r = conn.execute("SELECT id FROM sessions WHERE cwd=? ORDER BY started_at DESC LIMIT 1", (ws,)).fetchone() + conn.close() + if r: + cand = r[0] + except Exception: + cand = None + if cand: + emit(cand) + +# 3) agent_identities cache, ONLY when its project_cwd == this workspace +ai = {} +try: + if os.path.exists(db_path): + conn = sqlite3.connect(db_path, timeout=10.0) + row = conn.execute('SELECT data FROM state WHERE id=1').fetchone() + if row: + ai = json.loads(row[0]).get('agent_identities', {}) + conn.close() + elif os.path.exists(yaml_path): + with open(yaml_path) as f: + d = yaml.safe_load(f) or {} + ai = d.get('agent_identities', {}) +except Exception: + pass + +ai_agent = ai.get(agent) or {} +if ai_agent.get('project_cwd') == ws: + if agent == 'claude': + cand = ai_agent.get('session_id') + if cand and jsonl_exists(cand): + emit(cand) + elif agent == 'agy': + cand = ai.get('conversation_id') + if cand and db_exists(cand): + emit(cand) + elif agent == 'hermes': + cand = ai_agent.get('session_id') or ai.get('conversation_id') + if cand and hermes_exists(cand): + emit(cand) + +print('') +PYEOF +} + +# --------------------------------------------------------------------------- +# capture_conversation_id +# +# Thin wrapper over find_workspace_uuid: resolves THIS workspace's conversation +# id (claude jsonl sessionId / agy db uuid) and prints it on stdout (empty line +# if none). find_workspace_uuid is already a workspace-scoped, 3-tier, race-free +# resolver (per-row own id -> workspace-scoped disk scan -> cwd-matched cache), +# so recording its result into the row before kill guarantees tier-1 on the next +# resume. Always exits 0. +# --------------------------------------------------------------------------- +capture_conversation_id() { + local agent="$1" workdir="$2" + find_workspace_uuid "$workdir" "$agent" +} + +# --------------------------------------------------------------------------- +# is_already_stopped +# +# Exits 0 if the row's status is 'stopped' (printing "stopped_at=" on +# stdout), 1 otherwise (including not-found). Used for idempotency: a second +# stop on an already-stopped session is a no-op. +# --------------------------------------------------------------------------- +is_already_stopped() { + local session_name="$1" + SESSION_NAME="$session_name" env_python "$AGENT_SESSIONS_YAML" <<'PYEOF' +import os, yaml, sqlite3, json +name = os.environ['SESSION_NAME'] +yaml_path = os.environ['YAML_PATH'] +db_path = os.path.splitext(yaml_path)[0] + '.db' +try: + if os.path.exists(db_path): + conn = sqlite3.connect(db_path, timeout=10.0) + has_sessions_table = False + try: + row = conn.execute('SELECT status, data FROM sessions WHERE name=?', (name,)).fetchone() + if row: + status, s_data_str = row[0], row[1] + if status == 'stopped': + s = json.loads(s_data_str) + print(f"stopped_at={s.get('stopped_at', '?')}") + raise SystemExit(0) + has_sessions_table = True + except sqlite3.OperationalError: + pass + if not has_sessions_table: + row = conn.execute('SELECT data FROM state WHERE id=1').fetchone() + if row: + d = json.loads(row[0]) + for s in d.get('tmux_sessions', []): + if s.get('name') == name and s.get('status') == 'stopped': + print(f"stopped_at={s.get('stopped_at', '?')}") + raise SystemExit(0) + conn.close() + raise SystemExit(1) + elif os.path.exists(yaml_path): + with open(yaml_path) as f: + d = yaml.safe_load(f) or {} + for s in d.get('tmux_sessions', []): + if s.get('name') == name and s.get('status') == 'stopped': + print(f"stopped_at={s.get('stopped_at', '?')}") + raise SystemExit(0) +except Exception: + pass +raise SystemExit(1) +PYEOF +} + +# --------------------------------------------------------------------------- +# multi-agent-mux-delegate-job integration helpers +# +# All paths are resolved relative to lib.sh's own location (BASH_SOURCE), so the +# skill tree is relocatable — no hardcoded absolute paths (review item 6). +# --------------------------------------------------------------------------- + +# _delegate_py_bin — echo the virtualenv python (walk up from .agents/skills/), else python3. +_delegate_py_bin() { + # Return cached result if available (shell variable, not exported — avoids cross-workspace pollution) + if [ -n "${AGENT_PYTHON_BIN:-}" ] && [ -x "$AGENT_PYTHON_BIN" ]; then + printf '%s\n' "$AGENT_PYTHON_BIN"; return 0 + fi + local d + d="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + while [ "$d" != "/" ] && [ -n "$d" ]; do + if [ -x "$d/.venv/bin/python" ]; then + AGENT_PYTHON_BIN="$d/.venv/bin/python" + printf '%s\n' "$AGENT_PYTHON_BIN"; return 0 + fi + d="$(dirname "$d")" + done + AGENT_PYTHON_BIN="$(command -v python3 || echo python3)" + printf '%s\n' "$AGENT_PYTHON_BIN" +} + +# _delegate_script — echo the path to a multi-agent-mux-delegate-job script, resolved +# relative to .agents/skills/ (lib.sh dir). Empty if not found. +_delegate_script() { + local name="$1" skill_dir cand + skill_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + cand="$skill_dir/multi-agent-mux-delegate-job/scripts/$name" + if [ -f "$cand" ]; then printf '%s\n' "$cand"; return 0; fi + printf '%s\n' "$(find "$skill_dir" -name "$name" 2>/dev/null | head -n 1 || true)" +} + +# delegate_submit_job +# +# Register a job in the multi-agent-mux-delegate-job registry. Prints the new JID on stdout. +delegate_submit_job() { + local prompt="$1" agent="$2" session="$3" + local py_bin registry_py + py_bin="$(_delegate_py_bin)" + registry_py="$(_delegate_script registry.py)" + if [ -z "$registry_py" ] || [ ! -f "$registry_py" ]; then + echo "ERROR: multi-agent-mux-delegate-job registry.py not found under .agents/skills/" >&2 + return 1 + fi + "$py_bin" "$registry_py" register \ + --prompt "$prompt" \ + --agent "$agent" \ + --agent-session "$session" +} + +# delegate_publish_event [detail] +# +# Publish a lifecycle event to the multi-agent-mux-delegate-job registry. Consolidates the +# inline .venv-walk + publish_event.py blocks that were duplicated across +# create/delete/resume (review item 7). Non-fatal by contract: an empty job id, +# a missing script, or a broker failure never aborts the caller. +delegate_publish_event() { + local job_id="$1" event="$2" detail="${3:-}" + [ -n "$job_id" ] || return 0 + local py_bin pub + py_bin="$(_delegate_py_bin)" + pub="$(_delegate_script publish_event.py)" + [ -n "$pub" ] && [ -f "$pub" ] || return 0 + "$py_bin" "$pub" --job "$job_id" --event "$event" --detail "$detail" || true +} + +# start_watchdog [workdir] +# Spawns a watchdog process to monitor a delegate-job JOB in the background. +# The watchdog re-spawns the subscriber every 2 minutes (or whatever hard +# limit we set) and exits automatically when the JOB reaches terminal state. +# Returns the watchdog PID via stdout. +start_watchdog() { + local job_id="$1" + local workdir="${2:-$PWD}" + local monitor_script="$workdir/.agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh" + local log_file="$workdir/.mam/multi-agent-mux-monitor.log" + + if [ ! -f "$monitor_script" ]; then + echo "ERROR: monitor script not found: $monitor_script" >&2 + return 1 + fi + + # Check if reconcile.sh --subscribe is already running on this workspace + local pid + pid=$(pgrep -f "bash $monitor_script --subscribe" || true) + + if [ -z "$pid" ]; then + # Start the wildcard monitor subscriber daemon with --idle-timeout 0 (never idle out) + # and ensure it runs with $workdir as cwd to anchor relative log paths. + local orig_pwd="$PWD" + cd "$workdir" + nohup bash "$monitor_script" --subscribe --idle-timeout 0 >> "$log_file" 2>&1 & + pid=$! + cd "$orig_pwd" + fi + + echo "$pid" +} + + diff --git a/.agents/skills/multi-agent-mux-create/SKILL.md b/.agents/skills/multi-agent-mux-create/SKILL.md new file mode 100644 index 0000000..70983e9 --- /dev/null +++ b/.agents/skills/multi-agent-mux-create/SKILL.md @@ -0,0 +1,220 @@ +--- +name: multi-agent-mux-create +description: "Create a new agent session (claude, antigravity/agy) in a dedicated tmux session for context-preserving long-running work. Always creates a tmux session — never backgrounds with nohup/disown. Writes the new session to .mam/agent-sessions.yaml. Use when you want to start a fresh agent (no prior UUID) for a new project workspace." +version: 1.0.0 +author: godopu +license: MIT +platforms: [linux, macos] +environments: [terminal, tmux] +metadata: + hermes: + tags: [agent, tmux, claude, antigravity, agy, multi-agent, context, session] + related_skills: [multi-agent-mux-resume, multi-agent-mux-stop, multi-agent-mux-monitor, claude-code] + prereq_skills: [claude-code] +--- + +# Multi-Agent Create — Start a Fresh Agent in a tmux Session + +> **Companion skills**: `multi-agent-mux-resume` (resume an existing UUID), `multi-agent-mux-stop` (terminate), `multi-agent-mux-monitor` (live status). +> **Single source of truth**: `./.mam/agent-sessions.yaml` (this skill writes to it; never read it ad-hoc — go through this skill). + +## What this skill does + +Spawn a new agent (`claude` or `agy`/antigravity-cli) in a **dedicated tmux session** for context-preserving long-running work. The tmux session is the *container*; the agent's session ID is *data* inside the container. **This skill creates the container + starts the agent — but does not resume an old conversation** (use `multi-agent-mux-resume` for that). + +For all agents: the tmux session name is produced by **`lib.sh::derive_session_name`** — the single source of truth shared by create/resume/stop/status/monitor (P0-A). The rule (verbatim from the function): + +> slug = the **two trailing path components** of the absolute workspace, `_`→`-`, lowercased, joined with `-`; name = `-creator-`. + +So `$WORKSPACE_ROOT/landing_page/refer_landing_page` + `claude` → `landing-page-refer-landing-page-creator-claude`. The workspace basename (`refer_landing_page`) **is** included; the hand-written historical entry that dropped it (`lab-landing-page-creator-claude`) was the bug, not the convention. + +## Pre-flight checks + +Before doing anything, verify the environment: + +```bash +# 1) tmux available and isolated server status +command -v tmux || { echo "ERROR: tmux not installed"; exit 1; } +echo "Tmux server name: ${TMUX_SERVER_NAME:-default}" + +# 2) claude / agy available +command -v claude # required for --agent claude +command -v agy # required for --agent agy + +# 3) claude auth (if --agent claude) +claude auth status 2>&1 | python3 -c "import json,sys; d=json.load(sys.stdin); assert d.get('loggedIn'), 'claude not logged in'" + +# 4) target workspace exists +test -d "$WORKSPACE" || { echo "ERROR: workspace $WORKSPACE not a directory"; exit 1; } +``` + +If any check fails → `kanban_block(reason="...")` (worker path) or report to user (interactive path). Do not proceed with a half-broken setup. + +## Standard names + +- **tmux session name**: `derive_session_name ` (lib.sh) + - `` = `basename $(dirname $WORKSPACE)` `-` `basename $WORKSPACE` (lowercase, `_`→`-`) + - examples: `landing-page-refer-landing-page-creator-claude`, `paper-pdf2md-creator-agy` + - never re-derive this by hand — source lib.sh and call the function +- **wrapper script** (claude only): `~/.local/bin/-creator-claude` + - contents: tmux new-session with `claude` inside, auto-handles trust/bypass dialogs + - see `/agent_sessions.md` for the canonical wrapper template + +## Tmux Server Isolation (격리 서버) + +When running multiple agent sessions alongside other workflows (e.g., cmux, Kanban workers, manual tmux sessions), sharing the default tmux server can lead to session name conflicts, monitoring clutter, and accidental destruction of user sessions via global commands. + +To prevent this, you can run this skill inside an **isolated tmux server** using the `TMUX_SERVER_NAME` environment variable or the `--tmux-server ` flag (opt-in). + +### How to use +1. **Via Environment Variable**: + ```bash + export TMUX_SERVER_NAME=multi-agent-canary + # All subsequent commands (create, status, stop, etc.) will run in the isolated 'multi-agent-canary' tmux server. + ``` +2. **Via Option Flag**: + ```bash + bash scripts/create_session.sh --workspace /path/to/project --agent claude --tmux-server multi-agent-canary + ``` +3. **Submit Job Integration**: + You can automatically register a delegated job with a prompt when creating a session: + ```bash + bash scripts/create_session.sh --workspace /path/to/project --agent claude --submit-job "Task prompt here" + ``` + +### Recommended Alias +You can set an alias in your shell to easily query sessions on the isolated server: +```bash +alias tmc='tmux -L multi-agent-canary' +tmc ls # Lists only your multi-agent sessions +``` + +### Safety Rules (Pitfall 29 Summary) +- Never use global server termination commands like `tmux kill-server` or `tmux kill-session -a` as they will destroy all sessions on that server (including your own workspace sessions if they share the server). +- By using an isolated server via `TMUX_SERVER_NAME`, your agent sessions are completely separated from your default user workspace, ensuring 0% interference. + +## Workflow + +```bash +WORKSPACE=/path/to/project +AGENT=claude # or agy +source .agents/skills/lib.sh +SESSION_NAME="$(derive_session_name "$WORKSPACE" "$AGENT")" + +# 1. If session already alive, fail fast +tmux has-session -t "$SESSION_NAME" 2>/dev/null && { + echo "ERROR: tmux session '$SESSION_NAME' already exists. Use multi-agent-mux-resume to attach or multi-agent-mux-stop first." + exit 1 +} + +# 2. Spawn the tmux session with the agent inside +case "$AGENT" in + claude) + # Use the wrapper if it exists, else inline tmux new-session + # Use the wrapper if it exists (LOCAL_BIN env var overrides default $HOME/.local/bin) + local_bin="${LOCAL_BIN:-$HOME/.local/bin}" + if [ -x "$local_bin/$SESSION_NAME" ]; then + nohup "$local_bin/$SESSION_NAME" >/dev/null 2>&1 & + else + tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "claude" + fi + ;; + agy) + tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "agy --dangerously-skip-permissions" + ;; + *) echo "ERROR: --agent must be claude or agy, got: $AGENT"; exit 2 ;; +esac + +# 3. Wait for agent TUI to be ready (varies: claude ~5s, agy ~3s) +sleep 6 + +# 4. Capture pane metadata +PANE_PID=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}') +PANE_CWD=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_current_path}') +PANE_CMD=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_current_command}') +TMUX_EPOCH=$(tmux list-sessions -F '#{session_created}' -t "$SESSION_NAME" 2>/dev/null | head -1) +``` + +## Registering the session in agent-sessions.yaml + +After spawn, append a new `tmux_sessions[]` entry to `.mam/agent-sessions.yaml`: + +```yaml +- name: + status: running + tmux_session_created_at: 2026-06-17T...Z # ISO 8601 UTC + tmux_session_epoch: + tmux_server: # Isolated server name (default: 'default') + pane: + index: 0 + pid: + cmd: # 'claude' or 'agy' + cmd_full: + cwd: + tui: # only for claude + model: + provider: + plan: + account: + version: + start_command: + attach_command: "tmux attach -t " + kill_command: "tmux kill-session -t " +``` + +`cmd_full` per agent (this is the actual command line in the pane, not the resume command): + +| agent | cmd_full | +|---|---| +| claude (interactive) | `claude` | +| agy (interactive) | `agy --dangerously-skip-permissions` | + +Use the `agent-sessions-yaml-edit` script in `scripts/` to safely append (preserves comments + format): + +```bash +bash .agents/skills/multi-agent-mux-create/scripts/create_session.sh \ + --workspace "$WORKSPACE" --agent "$AGENT" --session "$SESSION_NAME" +``` + +The script handles the YAML append, pane capture, and the `last_visible_status` placeholder. + +## Pitfalls + +- **Don't use `nohup`/`disown`/`setsid` for the agent itself** — those background the agent outside tmux. The whole point of this skill is *the tmux session is the supervisor*. `nohup` is OK only for *launching the wrapper* (which itself creates the tmux session via `tmux new-session -d`). +- **Don't trust `--session-id ` flags blindly** — claude/agy may not accept a fixed session id on first spawn. The session id is *assigned* on first user message; you can read it back from `~/.claude/projects/.../session.jsonl` headers or `~/.gemini/.../cache/last_conversations.json` AFTER the first message. +- **Wrapper script MUST NOT be created via `hermes profile alias`** — that command writes a `hermes -p ` wrapper that destroys the tmux behavior. Create wrappers manually (see `lab-landing-page-creator-claude` template). +- **Always use the workspace-relative path** in tmux `cwd` — relative paths break when tmux respawns in a different shell context. +- **The first `claude` message generates the session id** — `multi-agent-mux-create` only sets up the *container*. If you need a known session id for later resume, send a placeholder message (e.g. "init") and read it back, then call `multi-agent-mux-resume` later. + +## Verification + +After spawn + YAML append: + +```bash +# 1. tmux session is alive +tmux has-session -t "$SESSION_NAME" && echo OK || echo MISSING + +# 2. pane has the expected cmd + cwd +tmux list-panes -t "$SESSION_NAME" -F 'cmd=#{pane_current_command} cwd=#{pane_current_path}' + +# 3. agent-sessions.yaml has the new entry +python3 -c " +import yaml +d = yaml.safe_load(open('.mam/agent-sessions.yaml')) +names = [s['name'] for s in d['tmux_sessions']] +assert '$SESSION_NAME' in names, 'session not registered' +print('OK:', names) +" + +# 4. Optional: send a probe via tmux send-keys and capture-pane +tmux send-keys -t "$SESSION_NAME" "" Enter +sleep 2 +tmux capture-pane -t "$SESSION_NAME" -p -S -20 +``` + +## When NOT to use this skill + +- **Resuming an old conversation** → `multi-agent-mux-resume` +- **Killing an existing session** → `multi-agent-mux-stop` +- **Just attaching to an existing session** → `tmux attach -t ` (no skill needed) +- **One-shot print mode (claude -p "...")** → no tmux needed; use `claude-code` skill's print mode diff --git a/.agents/skills/multi-agent-mux-create/scripts/create_session.sh b/.agents/skills/multi-agent-mux-create/scripts/create_session.sh new file mode 100755 index 0000000..72b5e11 --- /dev/null +++ b/.agents/skills/multi-agent-mux-create/scripts/create_session.sh @@ -0,0 +1,294 @@ +#!/usr/bin/env bash +# create_session.sh — multi-agent-mux-create 의 부속 스크립트 +# Usage: +# bash create_session.sh --workspace --agent [--session ] [--wrapper] +# +# 동작: +# 1) preflight: tmux/claude/agy 가용성, workspace 존재 +# 2) tmux 세션 이름 결정 (--session 없으면 자동) +# 3) tmux 세션 시작 (claude 는 wrapper 우선, agy 는 인라인) +# 4) pane 메타 캡처 (pid, cmd, cwd) +# 5) agent-sessions.yaml 에 tmux_sessions[] 엔트리 append +# 6) 검증 출력 +# +# Exit codes: +# 0 = success +# 1 = preflight failure +# 2 = invalid args +# 3 = tmux session already exists (use multi-agent-mux-resume or delete first) +# 4 = agent-sessions.yaml append failure +set -euo pipefail + +source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh" + +usage() { + cat < --agent [options] + +Options: + --workspace PATH project directory (required) + --agent AGENT claude | agy | hermes (required) + --session NAME tmux session name (default: derived from workspace) + --wrapper force use of ~/.local/bin/ wrapper even if not present + --dry-run print commands without executing + --tmux-server NAME specify isolated tmux server name + --submit-job PROMPT submit a job to multi-agent-mux-delegate-job registry with the given prompt + -h, --help this help +EOF +} + +WORKSPACE="" +AGENT="" +SESSION_NAME="" +USE_WRAPPER=0 +DRY_RUN=0 +TMUX_SERVER_OPT="" +SUBMIT_JOB_PROMPT="" + +while [ $# -gt 0 ]; do + case "$1" in + --workspace) WORKSPACE="$2"; shift 2 ;; + --agent) AGENT="$2"; shift 2 ;; + --session) SESSION_NAME="$2"; shift 2 ;; + --wrapper) USE_WRAPPER=1; shift ;; + --dry-run) DRY_RUN=1; shift ;; + --tmux-server) TMUX_SERVER_OPT="$2"; shift 2 ;; + --submit-job) SUBMIT_JOB_PROMPT="$2"; shift 2 ;; + -h|--help) usage; exit 0 ;; + *) echo "ERROR: unknown arg: $1" >&2; usage; exit 2 ;; + esac +done + +if [ -n "$TMUX_SERVER_OPT" ]; then + export TMUX_SERVER_NAME="$TMUX_SERVER_OPT" +fi + +# Preflight +[ -n "$WORKSPACE" ] || { echo "ERROR: --workspace required" >&2; usage; exit 2; } +[ -n "$AGENT" ] || { echo "ERROR: --agent required" >&2; usage; exit 2; } +[ -d "$WORKSPACE" ] || { echo "ERROR: workspace $WORKSPACE not a directory" >&2; exit 1; } +command -v tmux >/dev/null || { echo "ERROR: tmux not installed" >&2; exit 1; } +command -v "$AGENT" >/dev/null || { echo "ERROR: $AGENT CLI not in PATH" >&2; exit 1; } + +# Auth Check (OAuth check for agy, loggedIn check for claude, status for hermes) +if [ "$AGENT" = "claude" ]; then + if ! claude auth status 2>/dev/null | grep -q '"loggedIn":\s*true'; then + echo "ERROR: claude not logged in. Run 'claude auth login' first." >&2 + exit 1 + fi +elif [ "$AGENT" = "agy" ]; then + if ! agy models >/dev/null 2>&1; then + echo "ERROR: agy is not authenticated. Please log in first." >&2 + exit 1 + fi +elif [ "$AGENT" = "hermes" ]; then + if ! hermes status >/dev/null 2>&1; then + echo "ERROR: hermes is not functional. Run 'hermes setup' first." >&2 + exit 1 + fi +fi + +# 세션 이름 — lib.sh::derive_session_name 이 단일 소스 (P0-A) +if [ -z "$SESSION_NAME" ]; then + SESSION_NAME="$(derive_session_name "$WORKSPACE" "$AGENT")" +fi + +# 이미 살아있으면 실패 +if _tmux has-session -t "$SESSION_NAME" 2>/dev/null; then + echo "ERROR: tmux session '$SESSION_NAME' already exists. Use multi-agent-mux-resume to attach, or multi-agent-mux-stop first." >&2 + exit 3 +fi + +# tmux 세션 띄우기 +LOCAL_BIN="${LOCAL_BIN:-$HOME/.local/bin}" +WRAPPER="$LOCAL_BIN/$SESSION_NAME" + +spawn() { + case "$AGENT" in + claude) + if { [ -x "$WRAPPER" ] && [ "$(basename "$WRAPPER")" != "claude" ]; } || [ "$USE_WRAPPER" = "1" ]; then + nohup "$WRAPPER" >/dev/null 2>&1 & + disown + else + _tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "claude --dangerously-skip-permissions" + fi + ;; + agy) + _tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "agy --dangerously-skip-permissions" + ;; + hermes) + _tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "hermes" + ;; + *) echo "ERROR: --agent must be claude, agy or hermes, got: $AGENT" >&2; exit 2 ;; + esac +} + +if [ "$DRY_RUN" = "1" ]; then + echo "[dry-run] would spawn: tmux session '$SESSION_NAME' in $WORKSPACE (agent=$AGENT)" + exit 0 +fi + +spawn + +# TUI 준비 대기 +sleep 6 + +# pane 메타 캡처 +PANE_PID=$(_tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}' 2>/dev/null || echo "") +PANE_CWD=$(_tmux list-panes -t "$SESSION_NAME" -F '#{pane_current_path}' 2>/dev/null || echo "$WORKSPACE") +PANE_CMD=$(_tmux list-panes -t "$SESSION_NAME" -F '#{pane_current_command}' 2>/dev/null || echo "$AGENT") +TMUX_EPOCH=$(date +%s) +NOW_ISO=$(date -u +'%Y-%m-%dT%H:%M:%SZ') + +# cmd_full 결정 +case "$AGENT" in + claude) CMD_FULL='claude --dangerously-skip-permissions' ;; + agy) CMD_FULL='agy --dangerously-skip-permissions' ;; + hermes) CMD_FULL='hermes' ;; +esac + +# 시작 명령 +local_tmux="tmux" +if [ -n "${TMUX_SERVER_NAME:-}" ] && [ "$TMUX_SERVER_NAME" != "default" ]; then + local_tmux="tmux -L $TMUX_SERVER_NAME" +fi + +case "$AGENT" in + claude) + if [ -x "$WRAPPER" ]; then + START_CMD="$WRAPPER # ~/.local/bin 의 래퍼" + else + START_CMD="$local_tmux new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"claude --dangerously-skip-permissions\"" + fi + ;; + agy|hermes) + START_CMD="$local_tmux new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"$CMD_FULL\"" + ;; +esac + +# agent-sessions.yaml 에 append +DELEGATE_JOB_ID="" +if [ -n "$SUBMIT_JOB_PROMPT" ]; then + delegate_agent="" + if [ "$AGENT" = "claude" ]; then + delegate_agent="claude-code" + elif [ "$AGENT" = "hermes" ]; then + delegate_agent="hermes-agent" + else + delegate_agent="antigravity-cli" + fi + agent_session="tmux:$SESSION_NAME" + DELEGATE_JOB_ID=$(delegate_submit_job "$SUBMIT_JOB_PROMPT" "$delegate_agent" "$agent_session") + echo "Submitted delegated job: $DELEGATE_JOB_ID" +fi + +if [ ! -f "$AGENT_SESSIONS_YAML" ]; then + mkdir -p "$(dirname "$AGENT_SESSIONS_YAML")" + echo "tmux_sessions: []" > "$AGENT_SESSIONS_YAML" +fi + +# atomic_dump_yaml: flock + temp+rename + .bak + schema validate (P0-B). +# 모든 값은 환경변수로 전달 — heredoc interpolation 없음 (P1-B). +# 자식 pid 는 bash 에서 pgrep 으로 미리 구함 (P2: 도구명 필터). +CHILD_PID=0 +if { [ "$AGENT" = "agy" ] || [ "$AGENT" = "hermes" ]; } && [ -n "$PANE_PID" ]; then + CHILD_PID=$(pgrep -P "$PANE_PID" -x "$AGENT" 2>/dev/null | head -1 || true) + CHILD_PID="${CHILD_PID:-0}" +fi + +atomic_dump_yaml "$AGENT_SESSIONS_YAML" \ + SESSION_NAME="$SESSION_NAME" AGENT="$AGENT" NOW_ISO="$NOW_ISO" \ + TMUX_EPOCH="$TMUX_EPOCH" PANE_PID="$PANE_PID" PANE_CWD="$PANE_CWD" \ + CMD_FULL="$CMD_FULL" START_CMD="$START_CMD" CHILD_PID="$CHILD_PID" \ + TMUX_SERVER_NAME="${TMUX_SERVER_NAME:-default}" \ + DELEGATE_JOB_ID="$DELEGATE_JOB_ID" <<'PYEOF' +name = os.environ['SESSION_NAME'] +agent = os.environ['AGENT'] +pid = os.environ.get('PANE_PID', '') +epoch = os.environ.get('TMUX_EPOCH', '') +server_name = os.environ.get('TMUX_SERVER_NAME', 'default') +server_opt = f"-L {server_name} " if server_name and server_name != 'default' else "" + +sessions = d.setdefault('tmux_sessions', []) + +# P0-D: 같은 이름 엔트리가 status=running 이면만 거부. terminated/archived 는 +# 재사용 가능 — 낡은 엔트리를 제거하고 새로 append (create -> delete -> create). +running_same = [s for s in sessions if s.get('name') == name and s.get('status') == 'running'] +if running_same: + print(f"ERROR: {name} already running in agent-sessions.yaml", flush=True) + raise SystemExit(4) +sessions[:] = [s for s in sessions if s.get('name') != name] + +entry = { + 'name': name, + 'status': 'running', + 'tmux_session_created_at': os.environ['NOW_ISO'], + 'tmux_session_epoch': int(epoch) if epoch.isdigit() else 0, + 'tmux_server': server_name, + 'delegate_job_id': os.environ.get('DELEGATE_JOB_ID', '') or None, + 'pane': { + 'index': 0, + 'pid': int(pid) if pid.isdigit() else 0, + 'cmd': agent, + 'cmd_full': os.environ['CMD_FULL'], + 'cwd': os.environ['PANE_CWD'], + }, + 'start_command': os.environ['START_CMD'], + 'attach_command': f'tmux {server_opt}attach -t {name}', + 'kill_command': f'tmux {server_opt}kill-session -t {name}', +} + +if agent == 'claude': + entry['tui'] = { + 'model': '(unknown — capture after first message)', + 'provider': 'anthropic', + 'plan': '(unknown)', + 'account': '(unknown — read from claude auth status)', + 'version': '(unknown — read from TUI)', + } + entry['claude_session_id_own'] = None + entry['last_visible_status'] = "TUI started; awaiting first user message" +elif agent == 'agy': + cp = os.environ.get('CHILD_PID', '0') + entry['child_pid'] = int(cp) if cp.isdigit() else 0 + entry['agy_conversation_id_own'] = None + entry['mcp_attachments'] = [ + { + 'name': 'stitch', + 'transport': 'mcp-remote', + 'endpoint': 'https://stitch.googleapis.com/mcp' + } + ] + entry['last_visible_status'] = "TUI started; awaiting first user message" +elif agent == 'hermes': + cp = os.environ.get('CHILD_PID', '0') + entry['child_pid'] = int(cp) if cp.isdigit() else 0 + entry['hermes_conversation_id_own'] = None + entry['last_visible_status'] = "TUI started; awaiting first user message" + +sessions.append(entry) + +snap = d.setdefault('snapshot', {}) +snap['taken_at'] = os.environ['NOW_ISO'] +snap['cwd'] = os.environ['PANE_CWD'] +print(f"appended: {name}", flush=True) +PYEOF + +echo +echo "=== created ===" +echo "tmux session: $SESSION_NAME (pane pid $PANE_PID, cmd $PANE_CMD, cwd $PANE_CWD)" +if [ -n "$DELEGATE_JOB_ID" ]; then + echo "delegate job: $DELEGATE_JOB_ID" + delegate_publish_event "$DELEGATE_JOB_ID" started "multi-agent-mux session created" + WD_PID=$(start_watchdog "$DELEGATE_JOB_ID" "$WORKSPACE") + echo "watchdog PID: $WD_PID" +fi +echo "agent-sessions.yaml updated" +echo +if [ -n "${TMUX_SERVER_NAME:-}" ] && [ "$TMUX_SERVER_NAME" != "default" ]; then + echo "Attach: tmux -L $TMUX_SERVER_NAME attach -t $SESSION_NAME" +else + echo "Attach: tmux attach -t $SESSION_NAME" +fi +echo "Delete: use multi-agent-mux-stop skill" +echo "Resume: use multi-agent-mux-resume skill (after first message creates a session id)" diff --git a/.agents/skills/multi-agent-mux-delegate-job/README.md b/.agents/skills/multi-agent-mux-delegate-job/README.md new file mode 100644 index 0000000..7fb2bd8 --- /dev/null +++ b/.agents/skills/multi-agent-mux-delegate-job/README.md @@ -0,0 +1,11 @@ +# multi-agent-mux-delegate-job 스킬 + +작업(Job)을 자율 에이전트(claude-code/codex/opencode/human)에게 위임하고 MQTT +이벤트 채널로 비동기 관찰하는 Hermes 스킬. **시작점은 [`SKILL.md`](./SKILL.md).** + +- 프로토콜/스키마: [`job-protocol.md`](./job-protocol.md) +- 브로커 PoC→운영 전환: [`mqtt-broker-setup.md`](./mqtt-broker-setup.md) +- 레지스트리 포맷/동시성: [`registry.md`](./registry.md) +- 참조 구현: [`multi-agent-mux-delegate-job`](./multi-agent-mux-delegate-job) (bash wrapper), [`scripts/publish_event.py`](./scripts/publish_event.py), [`scripts/job_subscriber.py`](./scripts/job_subscriber.py), [`scripts/registry.py`](./scripts/registry.py), [`scripts/mqtt_common.py`](./scripts/mqtt_common.py) +- 영구 감사 로그: `.mam/delegate_job_logs//` (`meta.json`·`events.ndjson`·`status.json`) + — `multi-agent-mux-delegate-job logs ` 또는 `multi-agent-mux-delegate-job logs --list`로 조회 (SKILL.md "Audit Logs" 참조) diff --git a/.agents/skills/multi-agent-mux-delegate-job/SKILL.md b/.agents/skills/multi-agent-mux-delegate-job/SKILL.md new file mode 100644 index 0000000..4c3d324 --- /dev/null +++ b/.agents/skills/multi-agent-mux-delegate-job/SKILL.md @@ -0,0 +1,385 @@ +--- +name: multi-agent-mux-delegate-job +description: "Delegate a unit of work to any autonomous agent (claude-code, codex, opencode, or a human) and observe it asynchronously over an MQTT event channel. Each job gets a unique id, a registry record (prompt, broker, status, timeouts), and a single per-job topic that carries started/permission_required/progress/completed/error events as schema-versioned JSON. The delegator starts a subscriber first, runs the agent, and treats a completed/error event or a timeout as the job's terminal state. Ships a working reference implementation (publish_event.py, job_subscriber.py, registry.py, mqtt_common.py, multi-agent-mux-delegate-job wrapper) plus a PoC-to-production path: validate on a public broker, then move to an authenticated TLS broker by changing config only — no code change. Use when you need fire-and-observe delegation, multi-job fan-out across tmux sessions, or a uniform completion-signal protocol shared by several agent types." +version: 1.0.0 +author: Hermes Agent +license: MIT +platforms: [linux, macos, windows] +metadata: + hermes: + tags: [agent-delegation, mqtt, jobs, orchestration, async-completion] + related_skills: [claude-code, codex, opencode, hermes-agent-skill-authoring] +--- + +# multi-agent-mux-delegate-job — Async Job Delegation over MQTT + +Delegate a unit of work to an autonomous agent, then **observe** it instead of +blocking on it. Every job gets a unique id and a registry record; the agent +publishes lifecycle events (`started`, `permission_required`, `progress`, +`completed`, `error`) to a per-job MQTT topic; the delegator subscribes and +treats `completed`/`error` — or a timeout — as the terminal state. + +This skill is a **reference implementation**: copy the files in this directory +into your project and customise. The `communication_over_mqtt` project is the +canonical concrete instance. + +## Overview + +The model is deliberately small. A **job** is one delegated task. An **agent** +is a worker (a claude-code tmux session, a codex run, a human). The **registry** +(`.mam/jobs/.json`) holds everything about a job so nothing important +lives in environment variables — which means one tmux session can process many +jobs sequentially, and many sessions can fan out in parallel, with no env +collisions. The **event channel** is one MQTT topic per job carrying JSON +payloads; `event` discriminates the type. + +Responsibility is split into exactly one entry point each: +[`publish_event.py`](./scripts/publish_event.py) emits events (registry lookup, +monotonic `seq`, retry+backoff) and [`job_subscriber.py`](./scripts/job_subscriber.py) +observes them (timeouts, terminal state machine, defensive parsing). Shared +logic lives in [`mqtt_common.py`](./scripts/mqtt_common.py); registry I/O in +[`registry.py`](./scripts/registry.py). The demo `publisher.py`/`subscriber.py` +in the host project stay frozen. + +Two stages, same code. **PoC** runs on the public `broker.hivemq.com` to wire up +the protocol. **Production** moves to your own authenticated TLS broker — the +switch is **config only** (env vars + the registry `broker.*` block), never a +code change. See [`mqtt-broker-setup.md`](./mqtt-broker-setup.md). + +## When to Use / When NOT to Use + +**Use when:** +- you want **fire-and-observe** delegation — kick off work and get a completion + signal rather than blocking a terminal; +- several agent types (claude-code, codex, opencode, human) must follow **one** + completion protocol; +- you need **multi-job fan-out** across tmux sessions with safe job claiming; +- you want a clean PoC → authenticated-broker upgrade path. + +**Do NOT use when:** +- a one-shot `claude -p '…'` that returns inline is enough (no async signal + needed) — just use the [claude-code](../claude-code/SKILL.md) skill directly; +- you need request/response RPC or large artifact transfer (this is a + one-direction event stream, not a data bus); +- the payload would carry secrets and you're still on the public broker — move + to the own-broker stage first. + +## Quick Start + +The one-line wrapper handles register + subscriber-first + agent launch. If +you're new, **start here** and only fall back to the manual 5-step flow when +you need finer control. + +```bash +# 1) one line: register → start subscriber → launch agent in tmux +# (uses public broker by default; last stdout line is the audit-log dir) +multi-agent-mux-delegate-job submit \ + --agent claude-code \ + --prompt "정렬 문제 10개를 만들어 sort_problems.md로 저장" \ + --workdir /path/to/project \ + --agent-session tmux:demo \ + --timeout 3600 --idle-timeout 120 +# → stdout: registered job: +# subscriber pid: … +# agent launched in tmux session: demo +# subscriber output: +# /path/to/project/.mam/delegate_job_logs/ ← audit log dir + +# 2) at any time, query the job or its audit log +multi-agent-mux-delegate-job status --job +multi-agent-mux-delegate-job logs # pretty timeline +multi-agent-mux-delegate-job logs --list # every job, live status + +# 3) run a user-supplied validator against the job's artifacts +multi-agent-mux-delegate-job verify --job --validate ./validate.sh +``` + +The wrapper enforces the **subscribe-before-publish** ordering and **forwards +the freshly-minted `JOB_ID` into the agent's prompt** (so the agent calls +`publish_event.py --job ` with the right id — see Pitfall §"Wrong job_id +propagated to the agent"). When you need finer control, the manual flow is: + +```bash +# Manual 5-step (same outcome, more knobs) +PY=.venv/bin/python +SKILL=./.agents/skills/multi-agent-mux-delegate-job/scripts + +# 1) register +JID=$($PY "$SKILL/registry.py" register \ + --prompt "…" --agent claude-code --agent-session tmux:demo \ + --timeout 3600 --idle-timeout 120) + +# 2) START THE SUBSCRIBER FIRST (MQTT does not queue non-retained msgs) +$PY "$SKILL/job_subscriber.py" --job "$JID" --timeout 3600 --idle-timeout 120 & + +# 3) pass JID to the agent and instruct it to publish events with --job "$JID" +# (don't hard-code a job id you saw earlier — see Pitfall §"Wrong job_id") + +# 4) on completion the subscriber prints events and exits 0/1/2 + +# 5) inspect any time +$PY "$SKILL/registry.py" get --job "$JID" +$PY "$SKILL/registry.py" logs "$JID" # positional job id +$PY "$SKILL/registry.py" logs --list +``` + +## Job Protocol + +One topic per job: `python/mqtt/jobs//events`. Payload (JSON, UTF-8, +`schema_version=1`): + +```json +{ "schema_version": 1, "seq": 7, "job_id": "abc12345", + "event": "started|permission_required|progress|completed|error", + "timestamp": "2026-06-19T09:32:00Z", "detail": "generalised text", + "data": { "optional": "metadata" } } +``` + +- `seq` is monotonic per job (first = 1); the subscriber uses it to spot + reorder/duplication. +- `timestamp` is advisory — timeouts are measured from **receive** time. +- `detail`/`data` carry **no** secrets or absolute paths. +- A `schema_version` or `job_id` mismatch is **dropped** (defensive parsing). + +`started` and `completed`/`error` are the mandatory bookends; `completed`→exit 0, +`error`→exit 1. Full catalogue + production `auth_token` handling: +[`job-protocol.md`](./job-protocol.md). + +## Registry Format + +``` +.mam/jobs/.json # metadata record (single source of truth) +.mam/jobs/.events.log # append-only JSON-lines log (debug, optional) +.mam/jobs/.lock # fcntl advisory lock for the registry +``` + +The record holds `status`, `prompt`, `agent`, `agent_session`, a `broker` block, +`topic_prefix`, `timeout_sec`/`idle_timeout_sec`, `expected_artifacts`, +`last_seq`, and (production) `auth_token`. Because the `broker` block lives in +the record, `publish_event.py` connects from the registry alone. Concurrency, +the atomic rename trick, and multi-session job claiming are in +[`registry.md`](./registry.md). + +## Audit Logs + +Every job's lifecycle is mirrored to a **persistent, append-only audit log** +under `.mam/delegate_job_logs/` (override with `DELEGATE_JOB_LOGS_DIR`; +default `/.mam/delegate_job_logs`). Unlike the registry — live state +mutated in place and liable to be cleaned up — the audit log is durable +history you can replay after the fact. It is git-ignored. + +``` +.mam/delegate_job_logs// + meta.json # registration snapshot: prompt, agent, broker, timeouts, … + events.ndjson # append-only, one JSON event per line, in time order + status.json # current status only (fast point-query) +``` + +**What is logged, automatically:** + +| When | `events.ndjson` line | Written by | +|------|----------------------|------------| +| job registered | `registered` (also seeds meta.json + status.json) | `registry.register_job` | +| any status change | `status_changed` (`from`/`to`; also rewrites status.json) | `update_job_status`, `pick_pending` | +| event published | `published` (carries the exact payload — reproducible) | `publish_event.py` | +| event received | `received` (subscriber's external view) | `job_subscriber.py` | + +Both the emitter side (`published`) and the observer side (`received`) are +recorded, so a dropped publish or a missed receive is still visible from the +other. Every write is **best-effort and isolated** — an fcntl-locked append +guarded by `try/except` that only ever emits a `logger.warning`, so a logging +failure can never break a publish, a subscribe, or a registry write. stdout is +never touched. + +**Reading them:** + +```bash +multi-agent-mux-delegate-job logs # pretty-print one job's timeline +multi-agent-mux-delegate-job logs --list # summarise every logged job (with live status) +# or directly via the registry CLI: +$PY scripts/registry.py logs [--tail N] [--json] +$PY scripts/registry.py logs --list [--json] +``` + +`submit` prints the job's audit-log directory as its last stdout line, so a +caller can `tail -n1` to locate it. + +## Broker Setup + +| Stage | Broker | Auth | Transport | +|-------|--------|------|-----------| +| PoC | `broker.hivemq.com` | none | 1883 plaintext | +| Production | self-hosted Mosquitto/EMQX | user/pass + ACL | 8883 TLS | + +All connection settings come from env (`MQTT_BROKER`, `MQTT_PORT`, `MQTT_TLS`, +`MQTT_USERNAME`/`MQTT_PASSWORD`, `MQTT_CA_CERTS`, …) resolved by +`broker_config_from_env()`, with the registry `broker.*` block overriding per +job. Moving to your own broker is **config only**: install Mosquitto, set +`persistence true` + `acl_file` + `password_file` + a TLS `listener 8883`, grant +the worker `write python/mqtt/jobs/+/events` and Hermes `read`, then flip +`MQTT_TLS=1` and fill the registry `broker.*`. Step-by-step (conf, ACL, +`mosquitto_passwd`, self-signed/private-CA certs, cut-over verification): +[`mqtt-broker-setup.md`](./mqtt-broker-setup.md). + +## Agent Adapters + +Each agent voluntarily follows the contract: receive a `JOB_ID` (or registry +path), call `publish_event.py` at lifecycle points, exit 0/1/2. **The contract +in one line**: every event call uses `--job "$JOB_ID"` where `$JOB_ID` is the +**freshly-issued id from the registry record for *this* delegation** — never a +job_id you saw in an earlier session (Pitfall §"Wrong job_id propagated to the +agent"). + +- **claude-code** — Claude Code calls `publish_event.py` via its Bash tool at + lifecycle points. `submit --mode tmux` injects a prompt that already names + `$JOB_ID`; if you drive claude manually, hand it the id explicitly. Reference + instruction block (the wrapper injects something equivalent): + + ```text + Your job_id is "$JOB_ID" (read it from the registry record for this delegation — + do not reuse any job_id you saw before). + + On start: $PY multi-agent-mux-delegate-job/scripts/publish_event.py --job "$JOB_ID" --event started + On permission: $PY … --job "$JOB_ID" --event permission_required --detail ":" + On progress: $PY … --job "$JOB_ID" --event progress --detail "" + On success: $PY … --job "$JOB_ID" --event completed --detail "" + On failure: $PY … --job "$JOB_ID" --event error --detail "" + + Task: + + The subscriber for "$JOB_ID" is already running; your completed/error event + ends the job. Exit codes: 0 completed, 1 error, 2 publish failure. + ``` + + See [claude-code](../claude-code/SKILL.md) for tmux orchestration patterns. +- **codex** — same contract. Invoke `codex exec ""` or + wire `publish_event.py` as an MCP tool so the agent can call it directly. +- **opencode** — wire `publish_event.py` as a tool/command the agent can call; + identical event points. +- **human** — a person does the work, reads the registry record, then runs + `publish_event.py --job --event completed` (or `error`) by hand. + +## User Interface + +The [`multi-agent-mux-delegate-job`](./multi-agent-mux-delegate-job) bash wrapper bundles register + +subscribe-first + run-agent + validate: + +```bash +multi-agent-mux-delegate-job submit --agent claude-code \ + --prompt "정렬 문제 10개를 만들어 sort_problems.md로 저장" \ + --workdir /path/to/project --timeout 3600 [--validate ./validate.sh] +multi-agent-mux-delegate-job status --job # one record, pretty-printed +multi-agent-mux-delegate-job list # all jobs, one line each +multi-agent-mux-delegate-job verify --job --validate ./validate.sh # runs it, reports exit code +multi-agent-mux-delegate-job wait [--job ] # block until terminal (else --wait-any) +``` + +`submit` **always starts the subscriber before the agent** (the ordering +dependency), runs the agent in `--mode print` (one-shot) or `--mode tmux`, and +calls `--validate` afterward if given. The skill automates job-id generation, +registry creation, broker resolution, subscriber-first ordering, agent launch, +and completion detection; it does **not** automate the agent's internals or your +business-logic validation — those are hooks you fill (`validate.sh` reads +`$JOB_ID`/`$REGISTRY_DIR`). + +## Common Pitfalls + +- **Publishing before subscribing** — MQTT does not queue non-retained messages + for absent subscribers. Start `job_subscriber.py` *before* the agent, or rely + on retained terminal events (production). `submit` enforces this. +- **Wrong job_id propagated to the agent** — the wrapper prints a fresh `JOB_ID` + on every `submit`. If your agent instruction (or the wrapper's prompt template) + hard-codes an old job_id, the agent calls `publish_event.py --job `, + the subscriber's defensive parser drops it as a `job_id` mismatch, and the + delegator waits until idle timeout (exit 2). Fix: instruct the agent to + **read the job_id from the registry record for *this* delegation** (or pass it + in via env / `--prompt` interpolation), never from prior runs. `submit`'s + default prompt template interpolates `$JOB_ID` for you — if you build a custom + prompt, do the same. +- **tmux session name collision** — `submit --mode tmux` derives the session + name from `--agent-session tmux:` (default `tmux:claude`). If a session + with that name is already attached (e.g. you ran the demo and the previous + session is still open), `tmux new-session -d -s ` fails and the agent + never launches. Pick a unique `--agent-session` per concurrent delegation + (e.g. `tmux:demo`, `tmux:claude-a`, `tmux:claude-b`) or kill the stale one + (`tmux kill-session -t claude`) before re-running. +- **Timeout before `started`** — a cold-starting agent may not emit `started` + for a while; the wall-clock timeout starts at subscribe time so a stuck agent + still terminates. Don't set `--timeout` so low you false-positive a slow start. +- **No retry on publish** — a dropped `completed` would hang the delegator + forever; `publish_event.py` retries with exponential backoff and exits 2 if it + still fails, so the delegator is never left waiting silently. +- **QoS-1 duplicates / reorders** — a terminal event can arrive twice, or + `error` can trail `completed`; the subscriber's terminal state machine + finalises each job once and ignores the rest. +- **Trusting the public broker** — anyone can publish there; never make a real + decision on a PoC signal. Add `auth_token` + an authenticated broker first. +- **Secrets in `detail`/`data`** — keep payloads generalised; no paths, keys, or + tokens (except the production `auth_token` in `data`). + +## Subagent Orchestration Pattern + +When using this skill from a Hermes `delegate_task` subagent to dispatch work to +a coding-agent CLI (agy/claude) running in a tmux session, the following pattern +has been verified (2026-06-21, 6-batch refactoring sprint): + +### Roles +- **Main worker** (implementation): one agent session (e.g. `agy-new`) receives + brief files and executes code changes. +- **Reviewers** (spec compliance + code quality): two other agent sessions + (e.g. `agy-existing`, `claude-existing`) review the diff in parallel. +- **Hermes** (orchestrator): dispatches subagents, verifies diffs, commits, + and falls back to direct fixes when reviewers find issues. + +### Key lessons learned +1. **Brief delivery via file path** — don't paste long briefs inline via + `tmux send-keys`; the TUI may swallow them. Instead, send a short instruction + like "follow /tmp/batch1-brief.md" and let the agent read the file. +2. **Polling vs MQTT subscriber** — for short tasks (<5min), pane polling + (`capture-pane` + grep for completion markers) is simpler and more reliable + than registering a job via `registry.py` + `job_subscriber.py`. Use MQTT + subscriber only for long-running jobs (>5min) where push notification matters. +3. **Reviewers catch different bugs** — in practice, agy (Flash) caught + semantic issues (slash matching, export scope), while claude (Opus) caught + API signature mismatches (paho v2 5-arg vs 4-arg `on_disconnect`). Two + reviewers with different models provide complementary coverage. +4. **Hermes fallback fix** — when reviewers find a small, well-defined issue + (wrong argument count, missing slash), Hermes should fix it directly rather + than re-dispatching the implementer. This saves a full round-trip. +5. **Batch grouping** — group 2-3 FW items per batch when they touch different + files (no file overlap). This amortises the dispatch overhead. Items touching + the same file must be in separate batches to avoid conflicts. +6. **Pane Snapshots & Truncation Prevention** — to prevent long agent responses from being scrolled out and truncated due to TUI viewport limitations, enforce the following snapshotting pattern: + - Immediately after dispatching a brief, capture the pre-brief pane buffer via `capture-pane -S -200`. + - During long execution, run a background loop taking incremental snapshots (e.g. every 30 seconds `>> /tmp/pane-snap.txt`). + - Immediately after job termination, capture the entire final pane state to ensure no terminal logs are lost. + +## Verification Checklist + +- [ ] `started` → `completed` over the public broker: subscriber prints the + lines and exits **0**. +- [ ] `error` path: subscriber exits **1**. +- [ ] timeout path: no terminal event within `--timeout`/`--idle-timeout` → + exit **2**. +- [ ] polluted payload (bad JSON, wrong `schema_version`, wrong `job_id`) is + dropped with a warning, not crashed on. +- [ ] one tmux session processes two registry jobs in sequence; a second + session with a different `agent_session` claims only its own. +- [ ] broker cut-over: same scripts reach an authenticated TLS broker with env + changes only; a credential without write ACL is rejected; a late + subscriber still receives the retained terminal event. +- [ ] `publisher.py`/`subscriber.py`/`README.md` demo on `python/mqtt/sample` + still works unchanged (regression). +- [ ] **audit log integrity** — for a completed job, + `.mam/delegate_job_logs//events.ndjson` contains `registered` → + `received started` → `published completed` (in that order), and + `status.json.status == "completed"` matches the registry record. A + logging failure (e.g. read-only log dir) does not break the publish or + subscribe path — only a `logger.warning` is emitted. +- [ ] **end-to-end demo smoke** — run + `multi-agent-mux-delegate-job submit --agent claude-code --agent-session tmux:demo-smoke + --prompt "echo hello and call publish_event.py --job + --event completed" --timeout 120` and confirm + (a) registered job id echoed, (b) subscriber pid echoed, (c) tmux session + name printed, (d) `events.ndjson` grows as the agent runs, (e) final + stdout line is the audit-log dir. diff --git a/.agents/skills/multi-agent-mux-delegate-job/job-protocol.md b/.agents/skills/multi-agent-mux-delegate-job/job-protocol.md new file mode 100644 index 0000000..50e7365 --- /dev/null +++ b/.agents/skills/multi-agent-mux-delegate-job/job-protocol.md @@ -0,0 +1,114 @@ +# Job Event Protocol + +The wire contract every multi-agent-mux-delegate-job agent (claude-code, codex, opencode, +human, …) speaks. One job → one MQTT topic → JSON event payloads. Stable across +the PoC (public broker) and production (own broker) stages; only transport +hardening changes, never the payload shape. + +Reference implementation: [`./scripts/publish_event.py`](./scripts/publish_event.py) +(emit) and [`./scripts/job_subscriber.py`](./scripts/job_subscriber.py) (observe). + +--- + +## 1. Topic design + +| Topic | Purpose | +|-------|---------| +| `python/mqtt/sample` | Legacy demo topic — **never changed** (README compat). | +| `python/mqtt/jobs//events` | Per-job event stream (this protocol). | + +- One topic per job, JSON payload, `event` field discriminates the type. +- Single-direction publish only (worker → observer). No request/response. +- Future split is reserved but not required: + `/events`, `/logs`, `/artifacts`. +- `topic_prefix` is stored in the job record so publishers resolve the topic + from the registry alone (`/events`). + +--- + +## 2. Payload schema (JSON, UTF-8, `schema_version = 1`) + +```json +{ + "schema_version": 1, + "seq": 7, + "job_id": "abc12345", + "event": "started | permission_required | progress | completed | error", + "timestamp": "2026-06-19T09:32:00Z", + "detail": "generalised, whitelisted human-readable string", + "data": { "optional": "metadata" } +} +``` + +| Field | Rule | +|-------|------| +| `schema_version` | If publisher/subscriber disagree, the subscriber **drops** the event with a warning (defensive parsing). | +| `seq` | Monotonic **per `job_id`**, first publish = 1. Lets the subscriber detect reorder/duplication. Persisted in the registry (`last_seq`) so it survives restarts. | +| `job_id` | Subscriber drops any event whose `job_id` it did not subscribe for. | +| `timestamp` | Publisher host clock, **advisory only**. The delegator's timeout is measured from *receive* time, not this field. | +| `detail` | Generalised text only. **No absolute paths, keys, or tokens.** | +| `data` | Optional metadata. Production may add `hmac_sig`, `build_id`, etc. | + +--- + +## 3. Event catalogue + +| event | When emitted | `detail` example | seq | +|-------|--------------|------------------|-----| +| `started` | Agent first picks up the job | `"Job a1b2c3d4 started"` | 1 | +| `permission_required` | Agent needs a tool/permission grant | `"needs to write sort_problems.md"` | as it happens | +| `progress` | Optional intermediate checkpoint | `"creating problem 5/10"` | as it happens | +| `completed` | Successful terminal state | `"saved to sort_problems.md"` | last | +| `error` | Failure / exception terminal state | `"internal error, see logs"` | last | + +`started` and `completed`/`error` are mandatory bookends; `permission_required` +and `progress` are optional. `detail` must stay on the whitelist of generalised +phrasings — never leak secrets through it. + +### Terminal semantics + +- `completed` → subscriber exits 0; `error` → exits 1. +- The subscriber runs a **terminal state machine**: it finalises a job on the + first `completed`/`error` it sees and ignores any later terminal event for + that job (QoS-1 duplicate, or an `error`-after-`completed` reorder). When all + watched jobs are finalised it exits. +- Wall-clock timeout *or* idle timeout before a terminal event → exit 2. + +--- + +## 4. Production hardening (own broker stage) + +The payload shape is unchanged; the transport and trust model tighten. See +[`mqtt-broker-setup.md`](./mqtt-broker-setup.md) for the broker side. + +- **Auth / ACL** — username/password + per-topic ACL. `jobs/+/events` publish is + granted to the worker credential, subscribe to the Hermes credential. +- **HMAC Signature Verification (`data.hmac_sig`)** — to authenticate the publisher and verify message integrity without exposing the raw secret token over the wire, each job record contains a per-job `auth_token` (`secrets.token_urlsafe(32)`). The publisher computes an HMAC-SHA256 signature over the serialized payload (excluding `data.hmac_sig` itself) using the `auth_token` as the key, and appends it to **`data.hmac_sig`**. The subscriber reconstructs this signature and **drops any message that does not match or lacks a valid signature**. + + ```json + { "...": "...", "data": { "hmac_sig": "d2f3...", "build_id": "42" } } + ``` + +- **TLS** — port 8883 + private CA. Toggled with `MQTT_TLS=1` (+ `MQTT_CA_CERTS`); + no code change. +- **Retained terminal events** — `completed`/`error` publish with `retain=True` + so a subscriber that joins late immediately receives the last terminal state + instead of a stale view. The reference publisher auto-retains terminal events; + `--retained` forces it for any event. +- **Dual timeouts** — total wall-clock budget + last-activity idle detection, + both measured from receive time. +- **Clock trust** — never trust the payload `timestamp` for timeout decisions. + +--- + +## 5. Why a public broker is PoC-only + +On `broker.hivemq.com` anyone can publish/subscribe the same topic. Therefore: + +- No secret data in payloads. +- `started`/`completed`/`error` are *signals*, never a basis for a security + decision. +- Non-retained messages are **not queued** for absent subscribers — start the + subscriber **before** the agent (ordering dependency), or rely on retained + terminal events in production. +- Real operational decisions belong to the own-broker stage with auth + ACL. diff --git a/.agents/skills/multi-agent-mux-delegate-job/mqtt-broker-setup.md b/.agents/skills/multi-agent-mux-delegate-job/mqtt-broker-setup.md new file mode 100644 index 0000000..685f62e --- /dev/null +++ b/.agents/skills/multi-agent-mux-delegate-job/mqtt-broker-setup.md @@ -0,0 +1,176 @@ +# MQTT Broker Setup — PoC → Production + +The multi-agent-mux-delegate-job scripts read **all** broker settings from environment +variables (or a job record's `broker.*` block) through a single helper, +`broker_config_from_env()` in +[`./scripts/mqtt_common.py`](./scripts/mqtt_common.py). The design goal: +**switch from the public PoC broker to your own broker with config only — no +code change.** + +| Env var | Meaning | PoC default | Production | +|---------|---------|-------------|-----------| +| `MQTT_BROKER` | host | `broker.hivemq.com` | internal hostname/IP | +| `MQTT_PORT` | port | `1883` | `8883` (TLS) | +| `MQTT_TLS` | TLS on/off (`1`/`0`) | `0` | `1` | +| `MQTT_USERNAME` / `MQTT_PASSWORD` | auth | (none) | broker-issued | +| `MQTT_CA_CERTS` | CA bundle path | (none) | private CA path | +| `MQTT_CERTFILE` / `MQTT_KEYFILE` | client cert (optional mTLS) | (none) | per-client | +| `MQTT_CLIENT_ID_PREFIX` | client id prefix | `hermes` | per-environment | + +--- + +## 1. PoC: public broker (`broker.hivemq.com`) + +**Pros** — zero setup, reachable from anywhere, perfect for wiring up the +publish/subscribe loop and the timeout/state-machine logic. + +**Cons / accepted assumptions** — no auth, no integrity, shared with the world: + +- no secrets in payloads; +- `started`/`completed`/`error` are advisory signals only; +- non-retained messages are **not queued** for absent subscribers, so the + subscriber must start before the agent; +- a re-subscribing client cannot recover past (non-retained) events. + +Use it only to validate the protocol, never for real decisions. + +--- + +## 2. Production: self-hosted Mosquitto (or EMQX) + +Both support MQTT 5 + ACL + TLS. Mosquitto shown below; EMQX is a drop-in for +the same env vars. + +### 2.1 Install + +```bash +# macOS +brew install mosquitto + +# Debian/Ubuntu +sudo apt-get update && sudo apt-get install -y mosquitto mosquitto-clients + +# Docker +docker run -d --name mosquitto -p 8883:8883 \ + -v "$PWD/mosquitto.conf:/mosquitto/config/mosquitto.conf" \ + -v "$PWD/certs:/mosquitto/certs" \ + -v "$PWD/auth:/mosquitto/auth" \ + eclipse-mosquitto:2 +``` + +### 2.2 `mosquitto.conf` (key lines) + +```conf +persistence true +persistence_location /mosquitto/data/ + +password_file /mosquitto/auth/passwd +acl_file /mosquitto/auth/acl +allow_anonymous false + +listener 8883 +cafile /mosquitto/certs/ca.crt +certfile /mosquitto/certs/server.crt +keyfile /mosquitto/certs/server.key +``` + +`persistence true` + QoS 1 + retained terminal events means a subscriber that +joins after a job finished still sees the final `completed`/`error`. + +### 2.3 Users (username/password) + +```bash +# create the file with the first user, then add more with -b +mosquitto_passwd -c /mosquitto/auth/passwd hermes # subscriber/delegator +mosquitto_passwd /mosquitto/auth/passwd claude-worker # publisher/agent +# (omit -c after the first; -c truncates the file) +``` + +### 2.4 ACL — least privilege + +The worker only **publishes** events; Hermes only **subscribes**: + +```conf +# /mosquitto/auth/acl + +# claude-worker: may publish job events, may not read others' streams +user claude-worker +topic write python/mqtt/jobs/+/events + +# hermes: observes every job's events +user hermes +topic read python/mqtt/jobs/+/events + +# keep the legacy demo topic usable for both, if desired +pattern readwrite python/mqtt/sample +``` + +### 2.5 TLS certificates + +**Quick self-signed (single host, internal only):** + +```bash +mkdir -p certs && cd certs +openssl req -x509 -newkey rsa:2048 -nodes -days 825 \ + -keyout server.key -out server.crt \ + -subj "/CN=mqtt.internal" +cp server.crt ca.crt # clients trust this as the CA bundle +``` + +**Private CA (recommended — separate CA from server cert):** + +```bash +# 1) CA +openssl genrsa -out ca.key 4096 +openssl req -x509 -new -nodes -key ca.key -days 3650 -out ca.crt -subj "/CN=Hermes-CA" +# 2) server cert signed by the CA +openssl genrsa -out server.key 2048 +openssl req -new -key server.key -out server.csr -subj "/CN=mqtt.internal" +openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial \ + -out server.crt -days 825 +``` + +Clients trust `ca.crt` via `MQTT_CA_CERTS=/path/to/ca.crt`. + +--- + +## 3. Cut-over verification (config-only, no code change) + +Goal: prove the **same scripts** talk to your broker by changing only env/registry. + +```bash +# 1) point the env at the new broker +export MQTT_BROKER=mqtt.internal +export MQTT_PORT=8883 +export MQTT_TLS=1 +export MQTT_CA_CERTS=$PWD/certs/ca.crt +export MQTT_USERNAME=hermes +export MQTT_PASSWORD=… # subscriber side +# (publisher side uses claude-worker creds via the job record's broker block) + +# 2) sanity-check with the mosquitto CLI first +mosquitto_sub -h "$MQTT_BROKER" -p 8883 --cafile "$MQTT_CA_CERTS" \ + -u hermes -P "$MQTT_PASSWORD" -t 'python/mqtt/jobs/+/events' -v & + +# 3) run the unchanged multi-agent-mux-delegate-job loop +PY=.venv/bin/python +JID=$($PY scripts/registry.py register --prompt "broker cutover smoke") +$PY scripts/job_subscriber.py --job "$JID" --timeout 30 & +sleep 3 +$PY scripts/publish_event.py --job "$JID" --event started +$PY scripts/publish_event.py --job "$JID" --event completed # auto-retained +``` + +Expected: +- subscriber prints the `started` and `completed` lines and exits 0; +- `mosquitto_sub` shows the same events (ACL allows `hermes` to read); +- publishing as a credential **without** write ACL is rejected by the broker; +- a subscriber started *after* `completed` still receives it (retained). + +If all four hold, the migration is config-only. Persist the broker block into +each job record so `publish_event.py` connects from the registry alone: + +```json +"broker": { "host": "mqtt.internal", "port": 8883, "tls": true, + "username": "claude-worker", "password": "…" } +``` diff --git a/.agents/skills/multi-agent-mux-delegate-job/multi-agent-mux-delegate-job b/.agents/skills/multi-agent-mux-delegate-job/multi-agent-mux-delegate-job new file mode 100755 index 0000000..8b32861 --- /dev/null +++ b/.agents/skills/multi-agent-mux-delegate-job/multi-agent-mux-delegate-job @@ -0,0 +1,277 @@ +#!/usr/bin/env bash +# multi-agent-mux-delegate-job — user-facing orchestrator for the multi-agent-mux-delegate-job skill. +# +# Subcommands: +# submit register a job, start the subscriber FIRST, then run the agent, +# then (optionally) run a validation script. +# status show one job record. +# list list all jobs. +# verify run a user-supplied --validate script against a job's artifacts. +# wait block until all running/pending jobs reach a terminal state. +# +# This is a reference wrapper: it shells out to the python scripts that live +# next to it. Copy it into your project and customise as needed. It never hard +# fails if `claude`/`codex`/`tmux` are missing — it prints what it would run. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Pick an interpreter: prefer a project .venv, else python3. +pick_python() { + local py_bin + if [[ -n "${DELEGATE_JOB_PYTHON:-}" ]]; then + py_bin="$DELEGATE_JOB_PYTHON" + elif [[ -x "${WORKDIR:-.}/.venv/bin/python" ]]; then + py_bin="${WORKDIR}/.venv/bin/python" + elif [[ -x ".venv/bin/python" ]]; then + py_bin="$(pwd)/.venv/bin/python" + else + py_bin="python3" + fi + if ! "$py_bin" -c "import paho.mqtt" 2>/dev/null; then + echo "ERROR: paho-mqtt package is missing for $py_bin." >&2 + echo " Please create a virtual environment and install it:" >&2 + echo " python3 -m venv .venv && .venv/bin/pip install -r \"$SCRIPT_DIR/requirements.txt\"" >&2 + exit 1 + fi + echo "$py_bin" +} + +REGISTRY_DIR_DEFAULT=".mam/jobs" + +usage() { + cat <<'EOF' +multi-agent-mux-delegate-job [options] + + submit --agent --prompt [--workdir ] [--agent-session