refactor: migrate skills/ directory to .agents/skills/
This commit is contained in:
@@ -0,0 +1,739 @@
|
||||
#!/usr/bin/env bash
|
||||
# lib.sh — shared library for the tmux-agent-orchestrate-* skills.
|
||||
#
|
||||
# Single source of truth for the four things that were inconsistently
|
||||
# re-implemented across create/resume/delete/monitor (REVIEW.md §4.1):
|
||||
# - derive_session_name : the tmux session slug (P0-A)
|
||||
# - atomic_dump_yaml : SQLite BEGIN IMMEDIATE + validate; YAML via temp+rename + .bak (P0-B)
|
||||
# - env_python : env-safe Python (no heredoc injection) (P0-B / P1-B)
|
||||
# - find_workspace_uuid : workspace-SCOPED resume id lookup (P0-C)
|
||||
#
|
||||
# Source it from each script with a path computed from the script location:
|
||||
# source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
|
||||
#
|
||||
# HARD RULE: the agent-sessions.yaml file is only ever written through
|
||||
# atomic_dump_yaml. Never `open(yaml_path, 'w')` anywhere else.
|
||||
|
||||
# Directory holding this file, and the workspace root two levels above it.
SKILL_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
WORKSPACE_ROOT="$(cd "${SKILL_DIR}/../.." && pwd)"

# Session registry location; an existing non-empty value wins.
: "${AGENT_SESSIONS_YAML:=${WORKSPACE_ROOT}/.hermes/agent-sessions.yaml}"

# Workspace-relative defaults with environment overrides (Phase Z)
: "${HOME_DIR:=${WORKSPACE_ROOT}}"
: "${CLAUDE_PROJECT_DIR:=${HOME}/.claude/projects}"
: "${LOCAL_BIN:=${HOME}/.local/bin}"

# ---------------------------------------------------------------------------
# Tmux Server Isolation support
# ---------------------------------------------------------------------------
# Path fragments that identify shim/wrapper directories; any PATH entry or
# cached tmux path containing one of them is not treated as the real tmux.
: "${_TMUX_SHIM_DIR_PATTERN:=/multi-agent-tmux-shim/}"
: "${_TMUX_SKILLS_BIN_PATTERN:=/.agents/skills/.bin}"

# Name of the tmux server (-L); "default" means no isolation.
: "${TMUX_SERVER_NAME:=default}"
|
||||
|
||||
_resolve_real_tmux_path() {
  # Resolve the real tmux binary into the exported global _REAL_TMUX_PATH.
  #
  # Globals: _REAL_TMUX_PATH (read/written, exported),
  #          _TMUX_SHIM_DIR_PATTERN, _TMUX_SKILLS_BIN_PATTERN, PATH (read)
  # Returns: 0
  #
  # A cached value is kept as-is unless it is empty or points into a
  # shim/wrapper directory; otherwise PATH is scanned for the first executable
  # `tmux` that does not live in such a directory. Falls back to the bare name
  # "tmux" when nothing is found.
  if [[ -n "${_REAL_TMUX_PATH:-}" \
    && "$_REAL_TMUX_PATH" != *"${_TMUX_SHIM_DIR_PATTERN}"* \
    && "$_REAL_TMUX_PATH" != *"${_TMUX_SKILLS_BIN_PATTERN}"* ]]; then
    return 0
  fi

  local dir
  local -a path_dirs=()
  # Split PATH into an array instead of relying on unquoted word splitting,
  # so entries containing glob characters are never pathname-expanded.
  IFS=: read -r -a path_dirs <<< "${PATH:-}"

  _REAL_TMUX_PATH=""
  for dir in ${path_dirs[@]+"${path_dirs[@]}"}; do
    # An empty PATH entry would otherwise be probed as "/tmux".
    [[ -n "$dir" ]] || continue
    if [[ "$dir" != *"${_TMUX_SHIM_DIR_PATTERN}"* \
      && "$dir" != *"${_TMUX_SKILLS_BIN_PATTERN}"* \
      && -f "$dir/tmux" && -x "$dir/tmux" ]]; then
      _REAL_TMUX_PATH="$dir/tmux"
      break
    fi
  done

  if [[ -z "$_REAL_TMUX_PATH" ]]; then
    _REAL_TMUX_PATH="tmux"
  fi
  export _REAL_TMUX_PATH
}
|
||||
|
||||
_init_tmux_isolation() {
  # Install or remove the per-server tmux shim on PATH.
  #
  # Globals: TMUX_SERVER_NAME, TMPDIR, _TMUX_SHIM_DIR_PATTERN,
  #          _TMUX_SKILLS_BIN_PATTERN, _REAL_TMUX_PATH (read); PATH (written)
  #
  # Isolation on (TMUX_SERVER_NAME set and not "default"): make sure an
  # executable wrapper named `tmux` exists under
  # ${TMPDIR:-/tmp}<shim pattern><server name> and that this directory is at
  # the front of PATH, so child processes calling plain `tmux` get `-L <name>`.
  # Isolation off: strip every shim / skills-bin directory from PATH.
  _resolve_real_tmux_path

  local dir
  local -a path_dirs=()

  if [[ -n "${TMUX_SERVER_NAME:-}" && "$TMUX_SERVER_NAME" != "default" ]]; then
    # With only the bare-name fallback ("tmux") the wrapper would find itself
    # first on PATH and exec itself forever, so no shim is installed then.
    [[ "$_REAL_TMUX_PATH" == */* ]] || return 0

    local wrapper_dir="${TMPDIR:-/tmp}${_TMUX_SHIM_DIR_PATTERN}${TMUX_SERVER_NAME}"
    local real_quoted
    # Shell-quote the path so quotes, spaces or `$` in it cannot corrupt the
    # generated script.
    printf -v real_quoted '%q' "$_REAL_TMUX_PATH"

    # (Re)write the wrapper when it is not yet on PATH, and also when PATH was
    # inherited but the file itself has disappeared (e.g. tmp cleanup).
    if [[ ":$PATH:" != *":$wrapper_dir:"* || ! -x "$wrapper_dir/tmux" ]]; then
      mkdir -p "$wrapper_dir"
      cat > "$wrapper_dir/tmux" <<EOF
#!/usr/bin/env bash
if [ -z "\${TMUX_SERVER_NAME:-}" ] || [ "\$TMUX_SERVER_NAME" = "default" ]; then
  exec $real_quoted "\$@"
else
  exec $real_quoted -L "\$TMUX_SERVER_NAME" "\$@"
fi
EOF
      chmod +x "$wrapper_dir/tmux"
    fi

    if [[ ":$PATH:" != *":$wrapper_dir:"* ]]; then
      export PATH="$wrapper_dir:$PATH"
    fi
  else
    # Isolation disabled: clean the shim up automatically by removing every
    # shim / skills-bin entry from PATH.
    local new_path=""
    IFS=: read -r -a path_dirs <<< "${PATH:-}"
    for dir in ${path_dirs[@]+"${path_dirs[@]}"}; do
      if [[ "$dir" != *"${_TMUX_SHIM_DIR_PATTERN}"* \
        && "$dir" != *"${_TMUX_SKILLS_BIN_PATTERN}"* ]]; then
        if [[ -z "$new_path" ]]; then
          new_path="$dir"
        else
          new_path="$new_path:$dir"
        fi
      fi
    done
    export PATH="$new_path"
  fi
}
|
||||
|
||||
_tmux() {
  # Run the real tmux binary, adding `-L <server>` whenever a non-default
  # TMUX_SERVER_NAME is in effect. The isolation shim state is refreshed on
  # every call.
  _init_tmux_isolation
  case "${TMUX_SERVER_NAME:-}" in
    '' | default)
      "$_REAL_TMUX_PATH" "$@"
      ;;
    *)
      "$_REAL_TMUX_PATH" -L "$TMUX_SERVER_NAME" "$@"
      ;;
  esac
}
|
||||
|
||||
# Shadow the external `tmux` command inside the sourcing shell so that every
# plain `tmux ...` call is routed through _tmux.
tmux() { _tmux "$@"; }
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# resolve_tmux_server <session_name>
|
||||
#
|
||||
# Query agent-sessions.yaml to find the tmux_server associated with a session.
|
||||
# Fallback to TMUX_SERVER_NAME or 'default' if not registered or field is missing.
|
||||
# Prints the resolved server name on stdout.
|
||||
# ---------------------------------------------------------------------------
|
||||
resolve_tmux_server() {
  # Print the tmux server recorded for <session_name>.
  #
  # Arguments: $1 - tmux session name
  # Outputs:   the `tmux_server` field of the matching session row, or the
  #            fallback "${TMUX_SERVER_NAME:-default}" when the session is not
  #            registered, has no such field, or the lookup fails.
  #
  # Lookup order: the SQLite store next to the YAML (sessions table, then the
  # legacy state row); the YAML file only when no store exists.
  local session_name="$1"
  # TMUX_SERVER_NAME is forwarded explicitly: it is normally an unexported
  # shell variable, so the Python fallback would otherwise never see it.
  env_python "$AGENT_SESSIONS_YAML" \
    "SESSION_NAME=$session_name" \
    "TMUX_SERVER_NAME=${TMUX_SERVER_NAME:-default}" <<'PYEOF'
import json
import os
import sqlite3
from contextlib import closing

name = os.environ['SESSION_NAME']
yaml_path = os.environ['YAML_PATH']
db_path = os.path.splitext(yaml_path)[0] + '.db'


def server_from_rows(rows):
    """Return tmux_server of the first row named `name` that has one."""
    for s in rows or []:
        if isinstance(s, dict) and s.get('name') == name:
            server = s.get('tmux_server')
            if server:
                return server
    return None


def lookup():
    if os.path.exists(db_path):
        with closing(sqlite3.connect(db_path, timeout=10.0)) as conn:
            try:
                row = conn.execute('SELECT data FROM sessions WHERE name=?', (name,)).fetchone()
                if row:
                    server = json.loads(row[0]).get('tmux_server')
                    if server:
                        return server
            except sqlite3.OperationalError:
                # No sessions table: fall through to the state row.
                pass
            row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
            if row:
                return server_from_rows(json.loads(row[0]).get('tmux_sessions', []))
        return None
    if os.path.exists(yaml_path):
        # Imported lazily so the SQLite path does not depend on PyYAML.
        import yaml
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
        return server_from_rows(d.get('tmux_sessions', []))
    return None


try:
    server = lookup()
except Exception:
    # Best effort: any lookup failure degrades to the fallback below.
    server = None

print(server or os.environ.get('TMUX_SERVER_NAME') or 'default')
PYEOF
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# derive_session_name <workspace> <agent>
|
||||
#
|
||||
# THE single source of truth for the tmux session name. Rule:
|
||||
# slug = the two trailing path components of the absolute workspace,
|
||||
# '_' -> '-', lowercased, joined with '-'
|
||||
# name = "<slug>-creator-<agent>"
|
||||
#
|
||||
# Workspace root 기준 상대 해석. 예:
|
||||
# $WORKSPACE_ROOT/landing_page/refer_landing_page + claude
|
||||
# -> landing-page-refer-landing-page-creator-claude
|
||||
#
|
||||
# Decision (REVIEW P0-A): the actual workspace basename (refer_landing_page)
|
||||
# IS included. The hand-written historical entry that dropped it
|
||||
# (lab-landing-page-creator-claude) was the bug, not the convention.
|
||||
# Every script and SKILL.md must use exactly this rule.
|
||||
# ---------------------------------------------------------------------------
|
||||
derive_session_name() {
  # Print "<parent>-<leaf>-creator-<agent>", where <parent>/<leaf> are the two
  # trailing components of the resolved workspace path, lowercased and with
  # '_' replaced by '-'. The agent name is appended unchanged.
  local workspace="$1" agent="$2"
  local resolved leaf branch slug

  # Use the physical location when the directory exists; otherwise work on
  # the string exactly as given.
  if ! resolved="$(cd "$workspace" 2>/dev/null && pwd)"; then
    resolved="$workspace"
  fi

  leaf="$(basename "$resolved")"
  branch="$(basename "$(dirname "$resolved")")"
  slug="$(printf '%s-%s' "$branch" "$leaf" | tr '_' '-' | tr '[:upper:]' '[:lower:]')"

  printf '%s-creator-%s' "$slug" "$agent"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# env_python <yaml_path> [KEY=VALUE ...] (Python source read from stdin)
|
||||
#
|
||||
# Run python3 with the source supplied on stdin via a *quoted* heredoc, so the
|
||||
# shell never interpolates the source. All values are passed through the
|
||||
# environment (YAML_PATH plus any KEY=VALUE pairs). Untrusted data (workspace
|
||||
# paths, capture-pane text) must travel as env vars and be read via os.environ
|
||||
# inside the script — never spliced into the source. Read-only by convention;
|
||||
# use atomic_dump_yaml when you need to write the YAML.
|
||||
# ---------------------------------------------------------------------------
|
||||
env_python() {
  # Run `python3 -` (program text on stdin) with YAML_PATH, HOME_DIR,
  # CLAUDE_PROJECT_DIR and LOCAL_BIN in its environment, plus every leading
  # KEY=VALUE argument. The first argument that is not KEY=VALUE, and all
  # arguments after it, are handed to the Python program as sys.argv[1:].
  local yaml_path="$1"
  shift

  local -a exported=(
    "YAML_PATH=$yaml_path"
    "HOME_DIR=$HOME_DIR"
    "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR"
    "LOCAL_BIN=$LOCAL_BIN"
  )

  while (( $# > 0 )) && [[ "$1" == *=* ]]; do
    exported+=("$1")
    shift
  done

  env "${exported[@]}" python3 - "$@"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# atomic_dump_yaml <yaml_path> [KEY=VALUE ...] (mutation source from stdin)
|
||||
#
|
||||
# The ONLY sanctioned way to write agent-sessions.yaml. It:
|
||||
# 1. opens the SQLite store next to the YAML (<yaml_path> with a .db suffix) and starts BEGIN IMMEDIATE (serialises all writers)
|
||||
# 2. loads the state into `d` (from the SQLite store; seeded from the YAML when the store has no state row)
|
||||
# 3. exec()s the caller's mutation source (sees d, yaml, os, datetime,
|
||||
# timezone, glob, subprocess; reads values via os.environ). The mutation
|
||||
# may print and may `raise SystemExit(n)` to abort *without* writing.
|
||||
# 4. validates the resulting schema
|
||||
# 5. commits to SQLite; only when the set of stopped/terminated/archived sessions changed, backs up to <yaml_path>.bak and rewrites the YAML atomically (temp + os.replace)
|
||||
#
|
||||
# The mutation source is passed via env and exec()'d — it is never string
|
||||
# spliced and untrusted data never lands in Python source (P0-B / P1-B).
|
||||
# ---------------------------------------------------------------------------
|
||||
# Check if the workspace is on NFS — flock is unreliable on NFS
|
||||
_atomic_dump_yaml_check_nfs() {
  # Warn on stderr when <path> lives on an NFS / CIFS / SSHFS mount.
  #
  # Arguments: $1 - file or directory to inspect
  # Returns:   0 always; silent when the mount point cannot be determined
  #            (e.g. `df --output` is unsupported).
  local f="$1"
  local mountpoint line fstype

  # Capture df on its own so a df failure is actually detected; a pipeline
  # would report the status of its last stage instead.
  mountpoint="$(df --output=target "$f" 2>/dev/null)" || return 0
  mountpoint="${mountpoint##*$'\n'}"   # last line = the mount target
  [ -n "$mountpoint" ] || return 0

  # Match "<dev> on <mountpoint> type <fstype> ..." literally. A regex built
  # from the mount point would make "/" match every line.
  while IFS= read -r line; do
    case "$line" in
      *" on $mountpoint type "*) ;;
      *) continue ;;
    esac
    fstype="${line##* on "$mountpoint" type }"
    fstype="${fstype%% *}"
    case "$fstype" in
      nfs | nfs[0-9]* | cifs | fuse.sshfs)
        echo "WARNING: $mountpoint appears to be a network filesystem (NFS/CIFS/SSHFS)." >&2
        echo "WARNING: fcntl.flock-based atomic writes are unreliable on network filesystems." >&2
        echo "WARNING: SQLite journal_mode automatically falls back to DELETE." >&2
        return 0
        ;;
    esac
  done < <(mount 2>/dev/null)
  return 0
}
|
||||
|
||||
atomic_dump_yaml() {
  # Apply a caller-supplied Python mutation to the session registry.
  #
  # Arguments: $1    - path of agent-sessions.yaml
  #            $2... - optional leading KEY=VALUE pairs exported to Python
  # Stdin:     Python source of the mutation; it is handed to Python through
  #            the AGENT_SESSIONS_MUTATION environment variable and exec()'d
  #            at module scope (it sees `d`, yaml, os, glob, subprocess, ...).
  # Returns:   python3's exit status.
  #
  # State lives in the SQLite store next to the YAML (<yaml_path> with a .db
  # suffix). The YAML itself is rewritten (with a .bak copy) only when the set
  # of stopped/terminated/archived sessions changes.
  local yaml_path="$1"; shift
  local -a envs=("YAML_PATH=$yaml_path" "HOME_DIR=$HOME_DIR" "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR" "LOCAL_BIN=$LOCAL_BIN")
  while [ $# -gt 0 ]; do
    case "$1" in
      *=*) envs+=("$1"); shift ;;
      *) break ;;
    esac
  done
  local mutation
  mutation="$(cat)"
  env "${envs[@]}" AGENT_SESSIONS_MUTATION="$mutation" python3 - <<'PYEOF'
import os, sys, tempfile, shutil, glob, subprocess, json, sqlite3
from datetime import datetime, timezone
import yaml

yaml_path = os.environ['YAML_PATH']
db_path = os.path.splitext(yaml_path)[0] + '.db'


def _validate(d):
    """Abort (SystemExit) unless `d` has the expected registry schema."""
    if not isinstance(d, dict):
        raise SystemExit("VALIDATE: top-level is not a mapping")
    sessions = d.get('tmux_sessions', [])
    if not isinstance(sessions, list):
        raise SystemExit("VALIDATE: tmux_sessions is not a list")
    valid = {'running', 'terminated', 'archived', 'stopped'}
    for i, s in enumerate(sessions):
        if not isinstance(s, dict):
            raise SystemExit(f"VALIDATE: tmux_sessions[{i}] not a mapping")
        if not s.get('name') or not s.get('status'):
            raise SystemExit(f"VALIDATE: tmux_sessions[{i}] missing name/status")
        if s['status'] not in valid:
            raise SystemExit(f"VALIDATE: tmux_sessions[{i}] {s.get('name')!r} bad status {s['status']!r}")
        if not isinstance(s.get('pane'), dict):
            raise SystemExit(f"VALIDATE: tmux_sessions[{i}] {s.get('name')!r} missing pane")


def get_terminal_set(d):
    """Map name -> status for every session in a finished state."""
    return {s.get('name'): s.get('status') for s in d.get('tmux_sessions', []) if s.get('status') in ('stopped', 'terminated', 'archived')}


os.makedirs(os.path.dirname(db_path) or '.', exist_ok=True)
conn = sqlite3.connect(db_path, timeout=60.0)

# Keep the store private to the owner (best effort).
for f in [db_path, db_path + '-wal', db_path + '-shm']:
    if os.path.exists(f):
        try:
            os.chmod(f, 0o600)
        except Exception:
            pass


def is_nfs(path):
    """True when `path` is on an NFS (any version), CIFS or SSHFS mount.

    Relies on `df --output=target` and on `mount` printing
    "<dev> on <target> type <fstype> ..."; any failure counts as "not a
    network filesystem".
    """
    try:
        df_out = subprocess.check_output(['df', '--output=target', path], text=True, stderr=subprocess.DEVNULL)
        target = df_out.strip().split('\n')[-1].strip()
        mount_out = subprocess.check_output(['mount'], text=True)
        marker = f" on {target} type "
        for line in mount_out.split('\n'):
            if marker not in line:
                continue
            fields = line.split(marker, 1)[1].split()
            fstype = fields[0] if fields else ''
            # NFSv4 mounts are reported as "nfs4", not "nfs".
            if fstype in ('nfs', 'nfs3', 'nfs4', 'cifs', 'fuse.sshfs'):
                return True
    except Exception:
        pass
    return False


# WAL is avoided on network filesystems.
if is_nfs(os.path.dirname(db_path) or '.'):
    conn.execute('PRAGMA journal_mode=DELETE')
else:
    conn.execute('PRAGMA journal_mode=WAL')

try:
    # Start the transaction explicitly with BEGIN IMMEDIATE so the whole
    # read-modify-write cycle is serialised (no lost updates).
    conn.execute('BEGIN IMMEDIATE')
    conn.execute('CREATE TABLE IF NOT EXISTS state (id INTEGER PRIMARY KEY, data TEXT)')
    conn.execute('CREATE TABLE IF NOT EXISTS sessions (name TEXT PRIMARY KEY, status TEXT, pane_cwd TEXT, data JSON)')
    conn.execute('CREATE INDEX IF NOT EXISTS idx_sessions_pane_cwd ON sessions(pane_cwd)')

    row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
    if row:
        d = json.loads(row[0])
    else:
        # Seed from YAML
        if os.path.exists(yaml_path):
            with open(yaml_path) as f:
                d = yaml.safe_load(f) or {}
        else:
            d = {}

    # Assemble d['tmux_sessions'] from the sessions table if it contains data;
    # the name/status/pane.cwd columns override the JSON payload.
    db_sessions = []
    cursor = conn.execute('SELECT name, status, pane_cwd, data FROM sessions')
    for s_row in cursor.fetchall():
        s_data = json.loads(s_row[3])
        s_data['name'] = s_row[0]
        s_data['status'] = s_row[1]
        if 'pane' not in s_data:
            s_data['pane'] = {}
        s_data['pane']['cwd'] = s_row[2]
        db_sessions.append(s_data)

    if db_sessions:
        d['tmux_sessions'] = db_sessions
    elif 'tmux_sessions' not in d:
        d['tmux_sessions'] = []

    old_terminals = get_terminal_set(d)

    # --- caller mutation (module scope: sees d, yaml, os, glob, subprocess) ---
    # SystemExit raised here (or by _validate) skips the commit; closing the
    # connection in `finally` discards the open transaction.
    exec(compile(os.environ['AGENT_SESSIONS_MUTATION'], '<mutation>', 'exec'), globals())

    _validate(d)

    # Separate globals and sessions for normalization
    d_state = {k: v for k, v in d.items() if k != 'tmux_sessions'}
    conn.execute('REPLACE INTO state (id, data) VALUES (1, ?)', (json.dumps(d_state),))

    current_names = []
    for s in d.get('tmux_sessions', []):
        name = s.get('name')
        status = s.get('status')
        pane_cwd = (s.get('pane') or {}).get('cwd', '')
        conn.execute('REPLACE INTO sessions (name, status, pane_cwd, data) VALUES (?, ?, ?, ?)',
                     (name, status, pane_cwd, json.dumps(s)))
        current_names.append(name)

    # Drop rows for sessions the mutation removed.
    if current_names:
        placeholders = ','.join('?' for _ in current_names)
        conn.execute(f'DELETE FROM sessions WHERE name NOT IN ({placeholders})', current_names)
    else:
        conn.execute('DELETE FROM sessions')

    new_terminals = get_terminal_set(d)

    conn.commit()

    # Write to YAML ONLY when a session transitions to a finished state.
    # This runs after conn.commit(), so the YAML never gets ahead of the store.
    if new_terminals != old_terminals:
        if os.path.exists(yaml_path):
            try:
                shutil.copy2(yaml_path, yaml_path + '.bak')
            except Exception:
                pass
        dir_ = os.path.dirname(yaml_path) or '.'
        fd, tmp = tempfile.mkstemp(dir=dir_, prefix='.agent-sessions.', suffix='.tmp')
        try:
            with os.fdopen(fd, 'w') as f:
                yaml.safe_dump(d, f, default_flow_style=False, sort_keys=False,
                               allow_unicode=True, width=4096)
            os.replace(tmp, yaml_path)
        except Exception:
            if os.path.exists(tmp):
                os.remove(tmp)
            raise

    try:
        conn.execute('PRAGMA wal_checkpoint(TRUNCATE)')
    except Exception:
        pass
except Exception:
    conn.rollback()
    raise
finally:
    conn.close()

# H3: Re-apply chmod 0600 after close to cover newly created -wal / -shm files
try:
    os.chmod(db_path, 0o600)
    wal = db_path + '-wal'
    if os.path.exists(wal): os.chmod(wal, 0o600)
    shm = db_path + '-shm'
    if os.path.exists(shm): os.chmod(shm, 0o600)
except Exception:
    pass
PYEOF
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# find_workspace_uuid <workspace> <agent>
|
||||
#
|
||||
# Workspace-SCOPED resolution of the resume UUID (P0-C). It NEVER returns a
|
||||
# global agent_identities id unless that id's project_cwd matches THIS
|
||||
# workspace. Resolution order:
|
||||
# 1) tmux_sessions[] row whose pane.cwd == this workspace -> per-row own id
|
||||
# (claude_session_id_own / agy_conversation_id_own)
|
||||
# 2) on-disk scan scoped to this workspace
|
||||
# (claude: ~/.claude/projects/<key>/*.jsonl ; agy: last_conversations.json[cwd])
|
||||
# 3) agent_identities cache, ONLY when its project_cwd == this workspace
|
||||
# Prints the UUID on stdout (empty line if none). Always exits 0.
|
||||
# ---------------------------------------------------------------------------
|
||||
find_workspace_uuid() {
  # Print the resume id for <agent> that belongs to <workspace>.
  #
  # Arguments: $1 - workspace directory, $2 - agent (claude | agy | hermes)
  # Outputs:   the id on stdout, or an empty line when none is found.
  #
  # Resolution order (all tiers are scoped to this workspace):
  #   1) session rows whose pane.cwd equals the workspace (per-row own id)
  #   2) on-disk scan of the agent's own storage for this workspace
  #   3) agent_identities[<agent>], only when its project_cwd matches
  local workspace="$1" agent="$2"
  local abs
  abs="$(cd "$workspace" 2>/dev/null && pwd)" || abs="$workspace"
  WS_ABS="$abs" AGENT="$agent" env_python "$AGENT_SESSIONS_YAML" <<'PYEOF'
import glob
import json
import os
import sqlite3
from contextlib import closing

ws = os.environ['WS_ABS']
agent = os.environ['AGENT']
home = os.environ['HOME_DIR']
yaml_path = os.environ['YAML_PATH']
db_path = os.path.splitext(yaml_path)[0] + '.db'
claude_project_dir = os.environ.get('CLAUDE_PROJECT_DIR', f"{home}/.claude/projects")
hermes_db = f"{home}/.hermes/state.db"


def load_yaml():
    # Imported lazily so the SQLite-backed paths do not require PyYAML.
    import yaml
    with open(yaml_path) as f:
        return yaml.safe_load(f) or {}


def jsonl_exists(uuid):
    key = ws.replace('/', '-').replace('_', '-')
    return os.path.exists(f"{claude_project_dir}/{key}/{uuid}.jsonl")


def db_exists(uuid):
    return os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{uuid}.db")


def hermes_exists(uuid):
    if not os.path.exists(hermes_db):
        return False
    try:
        with closing(sqlite3.connect(hermes_db)) as conn:
            r = conn.execute("SELECT 1 FROM sessions WHERE id=?", (uuid,)).fetchone()
        return r is not None
    except Exception:
        return False


def emit(u):
    print(u)
    raise SystemExit(0)


# 1) per-row own id for THIS workspace (direct sqlite query if the db exists)
sessions = []


def add_matching(rows):
    for s in rows or []:
        if isinstance(s, dict) and (s.get('pane') or {}).get('cwd') == ws:
            sessions.append(s)


try:
    if os.path.exists(db_path):
        with closing(sqlite3.connect(db_path, timeout=10.0)) as conn:
            has_sessions_table = False
            try:
                cursor = conn.execute('SELECT data FROM sessions WHERE pane_cwd=?', (ws,))
                for row in cursor.fetchall():
                    sessions.append(json.loads(row[0]))
                has_sessions_table = True
            except sqlite3.OperationalError:
                pass
            if not has_sessions_table or not sessions:
                # Legacy layout: sessions embedded in the state row.
                row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
                if row:
                    add_matching(json.loads(row[0]).get('tmux_sessions', []))
    elif os.path.exists(yaml_path):
        add_matching(load_yaml().get('tmux_sessions', []))
except Exception:
    pass

own_id = {
    'claude': ('claude_session_id_own', jsonl_exists),
    'agy': ('agy_conversation_id_own', db_exists),
    'hermes': ('hermes_conversation_id_own', hermes_exists),
}
if agent in own_id:
    own_key, own_exists = own_id[agent]
    for s in sessions:
        if not isinstance(s, dict):
            continue
        # `name` may be missing or null in hand-edited data.
        name = s.get('name') or ''
        if isinstance(name, str) and name.endswith(f'-creator-{agent}'):
            cand = s.get(own_key)
            if cand and own_exists(cand):
                emit(cand)

# 2) disk scan scoped to THIS workspace
if agent == 'claude':
    key = ws.replace('/', '-').replace('_', '-')
    proj = f"{claude_project_dir}/{key}"
    if os.path.isdir(proj):
        # Newest transcript first.
        for j in sorted(glob.glob(f"{proj}/*.jsonl"), key=os.path.getmtime, reverse=True):
            sid = None
            try:
                with open(j) as f:
                    first = f.readline().strip()
                if first:
                    sid = json.loads(first).get('sessionId')
            except Exception:
                sid = None
            # Fall back to the file name without its ".jsonl" suffix.
            cand = sid or os.path.basename(j)[:-6]
            if cand and jsonl_exists(cand):
                emit(cand)
elif agent == 'agy':
    lc = f"{home}/.gemini/antigravity-cli/cache/last_conversations.json"
    if os.path.exists(lc):
        cand = None
        try:
            with open(lc) as f:
                cand = json.load(f).get(ws)
        except Exception:
            cand = None
        if cand and db_exists(cand):
            emit(cand)
elif agent == 'hermes':
    if os.path.exists(hermes_db):
        cand = None
        try:
            with closing(sqlite3.connect(hermes_db)) as conn:
                r = conn.execute("SELECT id FROM sessions WHERE cwd=? ORDER BY started_at DESC LIMIT 1", (ws,)).fetchone()
            if r:
                cand = r[0]
        except Exception:
            cand = None
        if cand:
            emit(cand)

# 3) agent_identities cache, ONLY when its project_cwd == this workspace
ai = {}
try:
    if os.path.exists(db_path):
        with closing(sqlite3.connect(db_path, timeout=10.0)) as conn:
            row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
        if row:
            ai = json.loads(row[0]).get('agent_identities', {})
    elif os.path.exists(yaml_path):
        ai = load_yaml().get('agent_identities', {})
except Exception:
    pass

# Tolerate `agent_identities: null` and malformed entries.
if not isinstance(ai, dict):
    ai = {}
ai_agent = ai.get(agent)
if not isinstance(ai_agent, dict):
    ai_agent = {}

if ai_agent.get('project_cwd') == ws:
    # Ids are read from the per-agent entry whose project_cwd was just
    # checked, never from the top-level agent_identities map.
    if agent == 'claude':
        cand = ai_agent.get('session_id')
        if cand and jsonl_exists(cand):
            emit(cand)
    elif agent == 'agy':
        cand = ai_agent.get('conversation_id')
        if cand and db_exists(cand):
            emit(cand)
    elif agent == 'hermes':
        cand = ai_agent.get('session_id') or ai_agent.get('conversation_id')
        if cand and hermes_exists(cand):
            emit(cand)

print('')
PYEOF
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# capture_conversation_id <agent> <workdir>
|
||||
#
|
||||
# Thin wrapper over find_workspace_uuid: resolves THIS workspace's conversation
|
||||
# id (claude jsonl sessionId / agy db uuid) and prints it on stdout (empty line
|
||||
# if none). find_workspace_uuid is already a workspace-scoped, 3-tier, race-free
|
||||
# resolver (per-row own id -> workspace-scoped disk scan -> cwd-matched cache),
|
||||
# so recording its result into the row before kill guarantees tier-1 on the next
|
||||
# resume. Always exits 0.
|
||||
# ---------------------------------------------------------------------------
|
||||
capture_conversation_id() {
  # Same lookup as find_workspace_uuid, with the arguments in
  # <agent> <workdir> order.
  local agent="$1"
  local workdir="$2"
  find_workspace_uuid "$workdir" "$agent"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# is_already_stopped <session_name>
|
||||
#
|
||||
# Exits 0 if the row's status is 'stopped' (printing "stopped_at=<ts>" on
|
||||
# stdout), 1 otherwise (including not-found). Used for idempotency: a second
|
||||
# stop on an already-stopped session is a no-op.
|
||||
# ---------------------------------------------------------------------------
|
||||
is_already_stopped() {
  # Exit 0 and print "stopped_at=<ts>" when the named session is recorded as
  # 'stopped'; exit 1 in every other case (unknown session, other status,
  # lookup error).
  local session_name="$1"
  SESSION_NAME="$session_name" env_python "$AGENT_SESSIONS_YAML" <<'PYEOF'
import os, yaml, sqlite3, json

name = os.environ['SESSION_NAME']
yaml_path = os.environ['YAML_PATH']
db_path = os.path.splitext(yaml_path)[0] + '.db'


def report(entry):
    # Success path: print the timestamp ('?' when absent) and exit 0.
    print(f"stopped_at={entry.get('stopped_at', '?')}")
    raise SystemExit(0)


def scan(rows):
    for s in rows:
        if s.get('name') == name and s.get('status') == 'stopped':
            report(s)


try:
    if os.path.exists(db_path):
        conn = sqlite3.connect(db_path, timeout=10.0)
        sessions_table_ok = True
        try:
            row = conn.execute('SELECT status, data FROM sessions WHERE name=?', (name,)).fetchone()
            if row and row[0] == 'stopped':
                report(json.loads(row[1]))
        except sqlite3.OperationalError:
            sessions_table_ok = False
        if not sessions_table_ok:
            # Only consulted when the sessions table could not be queried.
            row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
            if row:
                scan(json.loads(row[0]).get('tmux_sessions', []))
        conn.close()
        raise SystemExit(1)
    elif os.path.exists(yaml_path):
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
        scan(d.get('tmux_sessions', []))
except Exception:
    # SystemExit is not an Exception, so the exits above pass through.
    pass

raise SystemExit(1)
PYEOF
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# tmux-agent-orchestrate-delegate-job integration helpers
|
||||
#
|
||||
# All paths are resolved relative to lib.sh's own location (BASH_SOURCE), so the
|
||||
# skill tree is relocatable — no hardcoded absolute paths (review item 6).
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# _delegate_py_bin — echo the virtualenv python (walk up from .agents/skills/), else python3.
|
||||
_delegate_py_bin() {
  # Print the Python interpreter to use for delegate-job scripts.
  # Preference order: an already-set, executable AGENT_PYTHON_BIN; the nearest
  # .venv/bin/python found walking up from this file's directory; python3.
  # The choice is remembered in the (unexported) AGENT_PYTHON_BIN variable.
  if [[ -n "${AGENT_PYTHON_BIN:-}" && -x "$AGENT_PYTHON_BIN" ]]; then
    printf '%s\n' "$AGENT_PYTHON_BIN"
    return 0
  fi

  local probe
  probe="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  until [[ "$probe" == "/" || -z "$probe" ]]; do
    if [[ -x "$probe/.venv/bin/python" ]]; then
      AGENT_PYTHON_BIN="$probe/.venv/bin/python"
      printf '%s\n' "$AGENT_PYTHON_BIN"
      return 0
    fi
    probe="$(dirname "$probe")"
  done

  # No virtualenv on the way up: use python3 from PATH (or the bare name).
  AGENT_PYTHON_BIN="$(command -v python3 || echo python3)"
  printf '%s\n' "$AGENT_PYTHON_BIN"
}
|
||||
|
||||
# _delegate_script <name> — echo the path to a tmux-agent-orchestrate-delegate-job script, resolved
|
||||
# relative to .agents/skills/ (lib.sh dir). Empty if not found.
|
||||
_delegate_script() {
  # Print the path of delegate-job script <name>: the canonical location under
  # this file's directory if present, otherwise the first `find` hit anywhere
  # below that directory (an empty line when there is none).
  local name="$1"
  local base direct hit

  base="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  direct="${base}/tmux-agent-orchestrate-delegate-job/scripts/${name}"
  if [[ -f "$direct" ]]; then
    printf '%s\n' "$direct"
    return 0
  fi

  hit="$(find "$base" -name "$name" 2>/dev/null | head -n 1 || true)"
  printf '%s\n' "$hit"
}
|
||||
|
||||
# delegate_submit_job <prompt> <agent> <agent_session>
|
||||
#
|
||||
# Register a job in the tmux-agent-orchestrate-delegate-job registry. Prints the new JID on stdout.
|
||||
delegate_submit_job() {
  # Run `registry.py register` with the given prompt, agent and agent session;
  # whatever the script prints goes to stdout. Returns 1 (with a message on
  # stderr) when registry.py cannot be located.
  local prompt="$1" agent="$2" session="$3"
  local interpreter registry

  interpreter="$(_delegate_py_bin)"
  registry="$(_delegate_script registry.py)"

  if [[ -z "$registry" || ! -f "$registry" ]]; then
    echo "ERROR: tmux-agent-orchestrate-delegate-job registry.py not found under .agents/skills/" >&2
    return 1
  fi

  local -a cmd=(
    "$interpreter" "$registry" register
    --prompt "$prompt"
    --agent "$agent"
    --agent-session "$session"
  )
  "${cmd[@]}"
}
|
||||
|
||||
# delegate_publish_event <job_id> <event> [detail]
|
||||
#
|
||||
# Publish a lifecycle event to the tmux-agent-orchestrate-delegate-job registry. Consolidates the
|
||||
# inline .venv-walk + publish_event.py blocks that were duplicated across
|
||||
# create/delete/resume (review item 7). Non-fatal by contract: an empty job id,
|
||||
# a missing script, or a broker failure never aborts the caller.
|
||||
delegate_publish_event() {
  # Run publish_event.py for <job_id>/<event> with an optional detail string.
  # Never fails the caller: an empty job id, a missing script and a failing
  # publisher all end in status 0.
  local job_id="$1" event="$2" detail="${3:-}"
  if [[ -z "$job_id" ]]; then
    return 0
  fi

  local interpreter publisher
  interpreter="$(_delegate_py_bin)"
  publisher="$(_delegate_script publish_event.py)"
  if [[ -z "$publisher" || ! -f "$publisher" ]]; then
    return 0
  fi

  "$interpreter" "$publisher" --job "$job_id" --event "$event" --detail "$detail" || true
}
|
||||
|
||||
# start_watchdog <job_id> [workdir]
|
||||
# Spawns a watchdog process to monitor a delegate-job JOB in the background.
|
||||
# The watchdog re-spawns the subscriber every 2 minutes (or whatever hard
|
||||
# limit we set) and exits automatically when the JOB reaches terminal state.
|
||||
# Returns the watchdog PID via stdout.
|
||||
start_watchdog() {
  # Launch the monitor skill's watchdog.sh for <job_id> in the background.
  #
  # Arguments: $1 - job id
  #            $2 - workspace directory (default: $PWD)
  # Outputs:   PID of the background process on stdout; the watchdog's own
  #            output goes to <workdir>/.hermes/jobs/<job_id>.watchdog.log
  # Returns:   1 when the watchdog script is missing / not executable or the
  #            log directory cannot be created, 0 otherwise.
  local job_id="$1"
  local workdir="${2:-$PWD}"
  local watchdog_script="$workdir/.agents/skills/tmux-agent-orchestrate-monitor/scripts/watchdog.sh"
  local log_dir="$workdir/.hermes/jobs"
  local log_file="$log_dir/${job_id}.watchdog.log"

  if [ ! -x "$watchdog_script" ]; then
    echo "ERROR: watchdog not found or not executable: $watchdog_script" >&2
    return 1
  fi

  # Without the directory the log redirection fails inside the background
  # child, and the PID of an already-dead process would be reported as success.
  if ! mkdir -p "$log_dir"; then
    echo "ERROR: cannot create watchdog log directory: $log_dir" >&2
    return 1
  fi

  nohup "$watchdog_script" "$job_id" "$workdir" > "$log_file" 2>&1 &
  local pid=$!
  echo "$pid"
}
|
||||
|
||||
|
||||
@@ -0,0 +1,220 @@
|
||||
---
|
||||
name: tmux-agent-orchestrate-create
|
||||
description: "Create a new agent session (claude, antigravity/agy) in a dedicated tmux session for context-preserving long-running work. Always creates a tmux session — never backgrounds with nohup/disown. Writes the new session to .hermes/agent-sessions.yaml. Use when you want to start a fresh agent (no prior UUID) for a new project workspace."
|
||||
version: 1.0.0
|
||||
author: godopu
|
||||
license: MIT
|
||||
platforms: [linux, macos]
|
||||
environments: [terminal, tmux]
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [agent, tmux, claude, antigravity, agy, multi-agent, context, session]
|
||||
related_skills: [tmux-agent-orchestrate-resume, tmux-agent-orchestrate-stop, tmux-agent-orchestrate-monitor, claude-code]
|
||||
prereq_skills: [claude-code]
|
||||
---
|
||||
|
||||
# Multi-Agent Create — Start a Fresh Agent in a tmux Session
|
||||
|
||||
> **Companion skills**: `tmux-agent-orchestrate-resume` (resume an existing UUID), `tmux-agent-orchestrate-stop` (terminate), `tmux-agent-orchestrate-monitor` (live status).
|
||||
> **Single source of truth**: `./.hermes/agent-sessions.yaml` (this skill writes to it; never read it ad-hoc — go through this skill).
|
||||
|
||||
## What this skill does
|
||||
|
||||
Spawn a new agent (`claude` or `agy`/antigravity-cli) in a **dedicated tmux session** for context-preserving long-running work. The tmux session is the *container*; the agent's session ID is *data* inside the container. **This skill creates the container + starts the agent — but does not resume an old conversation** (use `tmux-agent-orchestrate-resume` for that).
|
||||
|
||||
For all agents: the tmux session name is produced by **`lib.sh::derive_session_name`** — the single source of truth shared by create/resume/stop/status/monitor (P0-A). The rule (verbatim from the function):
|
||||
|
||||
> slug = the **two trailing path components** of the absolute workspace, `_`→`-`, lowercased, joined with `-`; name = `<slug>-creator-<agent>`.
|
||||
|
||||
So `$WORKSPACE_ROOT/landing_page/refer_landing_page` + `claude` → `landing-page-refer-landing-page-creator-claude`. The workspace basename (`refer_landing_page`) **is** included; the hand-written historical entry that dropped it (`lab-landing-page-creator-claude`) was the bug, not the convention.
|
||||
|
||||
## Pre-flight checks
|
||||
|
||||
Before doing anything, verify the environment:
|
||||
|
||||
```bash
|
||||
# 1) tmux available and isolated server status
|
||||
command -v tmux || { echo "ERROR: tmux not installed"; exit 1; }
|
||||
echo "Tmux server name: ${TMUX_SERVER_NAME:-default}"
|
||||
|
||||
# 2) claude / agy available
|
||||
command -v claude # required for --agent claude
|
||||
command -v agy # required for --agent agy
|
||||
|
||||
# 3) claude auth (if --agent claude)
|
||||
claude auth status 2>&1 | python3 -c "import json,sys; d=json.load(sys.stdin); assert d.get('loggedIn'), 'claude not logged in'"
|
||||
|
||||
# 4) target workspace exists
|
||||
test -d "$WORKSPACE" || { echo "ERROR: workspace $WORKSPACE not a directory"; exit 1; }
|
||||
```
|
||||
|
||||
If any check fails → `kanban_block(reason="...")` (worker path) or report to user (interactive path). Do not proceed with a half-broken setup.
|
||||
|
||||
## Standard names
|
||||
|
||||
- **tmux session name**: `derive_session_name <workspace> <agent>` (lib.sh)
|
||||
- `<workspace-slug>` = `basename $(dirname $WORKSPACE)` `-` `basename $WORKSPACE` (lowercase, `_`→`-`)
|
||||
- examples: `landing-page-refer-landing-page-creator-claude`, `paper-pdf2md-creator-agy`
|
||||
- never re-derive this by hand — source lib.sh and call the function
|
||||
- **wrapper script** (claude only): `~/.local/bin/<workspace-slug>-creator-claude`
|
||||
- contents: tmux new-session with `claude` inside, auto-handles trust/bypass dialogs
|
||||
- see `<workdir>/agent_sessions.md` for the canonical wrapper template
|
||||
|
||||
## Tmux Server Isolation (격리 서버)
|
||||
|
||||
When running multiple agent sessions alongside other workflows (e.g., cmux, Kanban workers, manual tmux sessions), sharing the default tmux server can lead to session name conflicts, monitoring clutter, and accidental destruction of user sessions via global commands.
|
||||
|
||||
To prevent this, you can run this skill inside an **isolated tmux server** using the `TMUX_SERVER_NAME` environment variable or the `--tmux-server <name>` flag (opt-in).
|
||||
|
||||
### How to use
|
||||
1. **Via Environment Variable**:
|
||||
```bash
|
||||
export TMUX_SERVER_NAME=multi-agent-canary
|
||||
# All subsequent commands (create, status, stop, etc.) will run in the isolated 'multi-agent-canary' tmux server.
|
||||
```
|
||||
2. **Via Option Flag**:
|
||||
```bash
|
||||
bash scripts/create_session.sh --workspace /path/to/project --agent claude --tmux-server multi-agent-canary
|
||||
```
|
||||
3. **Submit Job Integration**:
|
||||
You can automatically register a delegated job with a prompt when creating a session:
|
||||
```bash
|
||||
bash scripts/create_session.sh --workspace /path/to/project --agent claude --submit-job "Task prompt here"
|
||||
```
|
||||
|
||||
### Recommended Alias
|
||||
You can set an alias in your shell to easily query sessions on the isolated server:
|
||||
```bash
|
||||
alias tmc='tmux -L multi-agent-canary'
|
||||
tmc ls # Lists only your multi-agent sessions
|
||||
```
|
||||
|
||||
### Safety Rules (Pitfall 29 Summary)
|
||||
- Never use global server termination commands like `tmux kill-server` or `tmux kill-session -a` as they will destroy all sessions on that server (including your own workspace sessions if they share the server).
|
||||
- By using an isolated server via `TMUX_SERVER_NAME`, your agent sessions are completely separated from your default user workspace, ensuring 0% interference.
|
||||
|
||||
## Workflow
|
||||
|
||||
```bash
|
||||
WORKSPACE=/path/to/project
|
||||
AGENT=claude # or agy
|
||||
source .agents/skills/lib.sh
|
||||
SESSION_NAME="$(derive_session_name "$WORKSPACE" "$AGENT")"
|
||||
|
||||
# 1. If session already alive, fail fast
|
||||
tmux has-session -t "$SESSION_NAME" 2>/dev/null && {
|
||||
echo "ERROR: tmux session '$SESSION_NAME' already exists. Use tmux-agent-orchestrate-resume to attach or tmux-agent-orchestrate-stop first."
|
||||
exit 1
|
||||
}
|
||||
|
||||
# 2. Spawn the tmux session with the agent inside
|
||||
case "$AGENT" in
|
||||
claude)
|
||||
# Use the wrapper if it exists (LOCAL_BIN overrides the default $HOME/.local/bin), else inline tmux new-session
|
||||
local_bin="${LOCAL_BIN:-$HOME/.local/bin}"
|
||||
if [ -x "$local_bin/$SESSION_NAME" ]; then
|
||||
nohup "$local_bin/$SESSION_NAME" >/dev/null 2>&1 &
|
||||
else
|
||||
tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "claude"
|
||||
fi
|
||||
;;
|
||||
agy)
|
||||
tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "agy --dangerously-skip-permissions"
|
||||
;;
|
||||
*) echo "ERROR: --agent must be claude or agy, got: $AGENT"; exit 2 ;;
|
||||
esac
|
||||
|
||||
# 3. Wait for agent TUI to be ready (varies: claude ~5s, agy ~3s)
|
||||
sleep 6
|
||||
|
||||
# 4. Capture pane metadata
|
||||
PANE_PID=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}')
|
||||
PANE_CWD=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_current_path}')
|
||||
PANE_CMD=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_current_command}')
|
||||
TMUX_EPOCH=$(tmux display-message -p -t "$SESSION_NAME" '#{session_created}' 2>/dev/null)
|
||||
```
|
||||
|
||||
## Registering the session in agent-sessions.yaml
|
||||
|
||||
After spawn, append a new `tmux_sessions[]` entry to `.hermes/agent-sessions.yaml`:
|
||||
|
||||
```yaml
|
||||
- name: <SESSION_NAME>
|
||||
status: running
|
||||
tmux_session_created_at: 2026-06-17T...Z # ISO 8601 UTC
|
||||
tmux_session_epoch: <TMUX_EPOCH>
|
||||
tmux_server: <TMUX_SERVER_NAME> # Isolated server name (default: 'default')
|
||||
pane:
|
||||
index: 0
|
||||
pid: <PANE_PID>
|
||||
cmd: <AGENT> # 'claude' or 'agy'
|
||||
cmd_full: <full command line, see table below>
|
||||
cwd: <PANE_CWD>
|
||||
tui: # only for claude
|
||||
model: <from TUI status>
|
||||
provider: <from TUI status>
|
||||
plan: <from TUI status>
|
||||
account: <from TUI status>
|
||||
version: <from TUI status>
|
||||
start_command: <the exact tmux new-session command used>
|
||||
attach_command: "tmux attach -t <SESSION_NAME>"
|
||||
kill_command: "tmux kill-session -t <SESSION_NAME>"
|
||||
```
|
||||
|
||||
`cmd_full` per agent (this is the actual command line in the pane, not the resume command):
|
||||
|
||||
| agent | cmd_full |
|
||||
|---|---|
|
||||
| claude (interactive) | `claude` |
|
||||
| agy (interactive) | `agy --dangerously-skip-permissions` |
|
||||
|
||||
Use the `create_session.sh` script in `scripts/` to append safely (it writes through `lib.sh::atomic_dump_yaml` rather than editing the file by hand):
|
||||
|
||||
```bash
|
||||
bash .agents/skills/tmux-agent-orchestrate-create/scripts/create_session.sh \
|
||||
--workspace "$WORKSPACE" --agent "$AGENT" --session "$SESSION_NAME"
|
||||
```
|
||||
|
||||
The script handles the YAML append, pane capture, and the `last_visible_status` placeholder.
|
||||
|
||||
## Pitfalls
|
||||
|
||||
- **Don't use `nohup`/`disown`/`setsid` for the agent itself** — those background the agent outside tmux. The whole point of this skill is *the tmux session is the supervisor*. `nohup` is OK only for *launching the wrapper* (which itself creates the tmux session via `tmux new-session -d`).
|
||||
- **Don't trust `--session-id <uuid>` flags blindly** — claude/agy may not accept a fixed session id on first spawn. The session id is *assigned* on first user message; you can read it back from `~/.claude/projects/.../session.jsonl` headers or `~/.gemini/.../cache/last_conversations.json` AFTER the first message.
|
||||
- **Wrapper script MUST NOT be created via `hermes profile alias`** — that command writes a `hermes -p <profile>` wrapper that destroys the tmux behavior. Create wrappers manually (see `lab-landing-page-creator-claude` template).
|
||||
- **Always use the absolute workspace path** for the tmux `cwd` (`-c`) — relative paths break when tmux respawns in a different shell context.
|
||||
- **The first `claude` message generates the session id** — `tmux-agent-orchestrate-create` only sets up the *container*. If you need a known session id for later resume, send a placeholder message (e.g. "init") and read it back, then call `tmux-agent-orchestrate-resume` later.
|
||||
|
||||
## Verification
|
||||
|
||||
After spawn + YAML append:
|
||||
|
||||
```bash
|
||||
# 1. tmux session is alive
|
||||
tmux has-session -t "$SESSION_NAME" && echo OK || echo MISSING
|
||||
|
||||
# 2. pane has the expected cmd + cwd
|
||||
tmux list-panes -t "$SESSION_NAME" -F 'cmd=#{pane_current_command} cwd=#{pane_current_path}'
|
||||
|
||||
# 3. agent-sessions.yaml has the new entry
|
||||
python3 -c "
|
||||
import yaml
|
||||
d = yaml.safe_load(open('.hermes/agent-sessions.yaml'))
|
||||
names = [s['name'] for s in d['tmux_sessions']]
|
||||
assert '$SESSION_NAME' in names, 'session not registered'
|
||||
print('OK:', names)
|
||||
"
|
||||
|
||||
# 4. Optional: send a probe via tmux send-keys and capture-pane
|
||||
tmux send-keys -t "$SESSION_NAME" "" Enter
|
||||
sleep 2
|
||||
tmux capture-pane -t "$SESSION_NAME" -p -S -20
|
||||
```
|
||||
|
||||
## When NOT to use this skill
|
||||
|
||||
- **Resuming an old conversation** → `tmux-agent-orchestrate-resume`
|
||||
- **Killing an existing session** → `tmux-agent-orchestrate-stop`
|
||||
- **Just attaching to an existing session** → `tmux attach -t <name>` (no skill needed)
|
||||
- **One-shot print mode (claude -p "...")** → no tmux needed; use `claude-code` skill's print mode
|
||||
@@ -0,0 +1,294 @@
|
||||
#!/usr/bin/env bash
|
||||
# create_session.sh — tmux-agent-orchestrate-create 의 부속 스크립트
|
||||
# Usage:
|
||||
# bash create_session.sh --workspace <path> --agent <claude|agy|hermes> [--session <name>] [--wrapper] [--dry-run] [--tmux-server <name>] [--submit-job <prompt>]
|
||||
#
|
||||
# 동작:
|
||||
# 1) preflight: tmux/claude/agy 가용성, workspace 존재
|
||||
# 2) tmux 세션 이름 결정 (--session 없으면 자동)
|
||||
# 3) tmux 세션 시작 (claude 는 wrapper 우선, agy 는 인라인)
|
||||
# 4) pane 메타 캡처 (pid, cmd, cwd)
|
||||
# 5) agent-sessions.yaml 에 tmux_sessions[] 엔트리 append
|
||||
# 6) 검증 출력
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 = success
|
||||
# 1 = preflight failure
|
||||
# 2 = invalid args
|
||||
# 3 = tmux session already exists (use tmux-agent-orchestrate-resume, or tmux-agent-orchestrate-stop first)
|
||||
# 4 = agent-sessions.yaml append failure
|
||||
set -euo pipefail
|
||||
|
||||
source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
|
||||
|
||||
# usage
#
# Print the command-line help for this script to stdout. The heredoc is
# unquoted on purpose so that $0 expands to the invoked script path.
usage() {
  cat <<EOF
Usage: $0 --workspace <path> --agent <claude|agy|hermes> [options]

Options:
  --workspace PATH      project directory (required)
  --agent AGENT         claude | agy | hermes (required)
  --session NAME        tmux session name (default: derived from workspace)
  --wrapper             force use of ~/.local/bin/<session> wrapper even if not present
  --dry-run             print commands without executing
  --tmux-server NAME    specify isolated tmux server name
  --submit-job PROMPT   submit a job to tmux-agent-orchestrate-delegate-job registry with the given prompt
  -h, --help            this help
EOF
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------
# Defaults for every option; overwritten by the parser below.
WORKSPACE=""
AGENT=""
SESSION_NAME=""
USE_WRAPPER=0
DRY_RUN=0
TMUX_SERVER_OPT=""
SUBMIT_JOB_PROMPT=""

# _require_value <option> [value...]
# Called as `_require_value "$@"` from the parser: when fewer than two
# arguments remain, the option has no value. Exits 2 (invalid args) with a
# readable message instead of letting `set -u` die on an unbound "$2".
_require_value() {
  if [ $# -lt 2 ]; then
    echo "ERROR: $1 requires a value" >&2
    usage
    exit 2
  fi
}

while [ $# -gt 0 ]; do
  case "$1" in
    --workspace)   _require_value "$@"; WORKSPACE="$2"; shift 2 ;;
    --agent)       _require_value "$@"; AGENT="$2"; shift 2 ;;
    --session)     _require_value "$@"; SESSION_NAME="$2"; shift 2 ;;
    --wrapper)     USE_WRAPPER=1; shift ;;
    --dry-run)     DRY_RUN=1; shift ;;
    --tmux-server) _require_value "$@"; TMUX_SERVER_OPT="$2"; shift 2 ;;
    --submit-job)  _require_value "$@"; SUBMIT_JOB_PROMPT="$2"; shift 2 ;;
    -h|--help)     usage; exit 0 ;;
    *)             echo "ERROR: unknown arg: $1" >&2; usage; exit 2 ;;
  esac
done

# --tmux-server overrides whatever TMUX_SERVER_NAME was set when lib.sh was
# sourced; exported so child processes see the same server name.
if [ -n "$TMUX_SERVER_OPT" ]; then
  export TMUX_SERVER_NAME="$TMUX_SERVER_OPT"
fi

# ---------------------------------------------------------------------------
# Preflight
# ---------------------------------------------------------------------------
[ -n "$WORKSPACE" ] || { echo "ERROR: --workspace required" >&2; usage; exit 2; }
[ -n "$AGENT" ] || { echo "ERROR: --agent required" >&2; usage; exit 2; }

# Reject unsupported agents up front (exit 2 = invalid args) so that
# --dry-run cannot "succeed" for them and `command -v` below only ever sees
# one of the three known CLI names.
case "$AGENT" in
  claude|agy|hermes) ;;
  *) echo "ERROR: --agent must be claude, agy or hermes, got: $AGENT" >&2; exit 2 ;;
esac

[ -d "$WORKSPACE" ] || { echo "ERROR: workspace $WORKSPACE not a directory" >&2; exit 1; }
command -v tmux >/dev/null || { echo "ERROR: tmux not installed" >&2; exit 1; }
command -v "$AGENT" >/dev/null || { echo "ERROR: $AGENT CLI not in PATH" >&2; exit 1; }

# Auth check: loggedIn flag for claude, a successful `agy models` call for
# agy, a successful `hermes status` call for hermes.
case "$AGENT" in
  claude)
    # [[:space:]] instead of \s: \s is a GNU extension and this skill also
    # targets macOS.
    if ! claude auth status 2>/dev/null | grep -q '"loggedIn":[[:space:]]*true'; then
      echo "ERROR: claude not logged in. Run 'claude auth login' first." >&2
      exit 1
    fi
    ;;
  agy)
    if ! agy models >/dev/null 2>&1; then
      echo "ERROR: agy is not authenticated. Please log in first." >&2
      exit 1
    fi
    ;;
  hermes)
    if ! hermes status >/dev/null 2>&1; then
      echo "ERROR: hermes is not functional. Run 'hermes setup' first." >&2
      exit 1
    fi
    ;;
esac

# Session name: lib.sh::derive_session_name is the single source (P0-A).
if [ -z "$SESSION_NAME" ]; then
  SESSION_NAME="$(derive_session_name "$WORKSPACE" "$AGENT")"
fi

# Fail if the session is already alive (exit 3).
if _tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
  echo "ERROR: tmux session '$SESSION_NAME' already exists. Use tmux-agent-orchestrate-resume to attach, or tmux-agent-orchestrate-stop first." >&2
  exit 3
fi

# Wrapper location used by spawn() for the claude agent.
LOCAL_BIN="${LOCAL_BIN:-$HOME/.local/bin}"
WRAPPER="$LOCAL_BIN/$SESSION_NAME"
|
||||
|
||||
# spawn
#
# Start the agent for this run. Reads the globals AGENT, SESSION_NAME,
# WORKSPACE, WRAPPER and USE_WRAPPER.
#
#   claude - run the executable wrapper at $WRAPPER when present (launched
#            with nohup and disowned); otherwise start a detached tmux
#            session running `claude`.
#   agy    - detached tmux session running `agy --dangerously-skip-permissions`.
#   hermes - detached tmux session running `hermes`.
#
# Exits the script with:
#   1 - --wrapper was requested but $WRAPPER is missing or not executable
#       (previously the launch failed silently in the background and the
#       script went on to register a session that never started)
#   2 - unsupported agent name
spawn() {
  case "$AGENT" in
    claude)
      if [ -x "$WRAPPER" ]; then
        nohup "$WRAPPER" >/dev/null 2>&1 &
        disown
      elif [ "$USE_WRAPPER" = "1" ]; then
        echo "ERROR: --wrapper given but wrapper not found or not executable: $WRAPPER" >&2
        exit 1
      else
        _tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "claude"
      fi
      ;;
    agy)
      _tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "agy --dangerously-skip-permissions"
      ;;
    hermes)
      _tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "hermes"
      ;;
    *)
      echo "ERROR: --agent must be claude, agy or hermes, got: $AGENT" >&2
      exit 2
      ;;
  esac
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Spawn, capture pane metadata, register in agent-sessions.yaml, report.
# ---------------------------------------------------------------------------
if [ "$DRY_RUN" = "1" ]; then
  echo "[dry-run] would spawn: tmux session '$SESSION_NAME' in $WORKSPACE (agent=$AGENT)"
  exit 0
fi

# Set to 1 when an auxiliary step (job submission, watchdog) fails AFTER the
# tmux session is already running. The session is still registered and the
# attach hints are still printed; the script then exits 1 at the very end.
# Aborting mid-way would leave a live but unregistered (orphan) session.
DEFERRED_FAILURE=0

spawn

# Wait for the agent TUI to come up.
sleep 6

if ! _tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
  echo "WARNING: tmux session '$SESSION_NAME' is not visible after spawn; pane metadata falls back to defaults" >&2
fi

# Capture pane metadata (each query falls back to a safe default).
PANE_PID=$(_tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}' 2>/dev/null || echo "")
PANE_CWD=$(_tmux list-panes -t "$SESSION_NAME" -F '#{pane_current_path}' 2>/dev/null || echo "$WORKSPACE")
PANE_CMD=$(_tmux list-panes -t "$SESSION_NAME" -F '#{pane_current_command}' 2>/dev/null || echo "$AGENT")

# Prefer tmux's own creation time for the session; fall back to "now" when
# tmux cannot answer or returns something non-numeric.
TMUX_EPOCH=$(_tmux display-message -p -t "$SESSION_NAME" '#{session_created}' 2>/dev/null || true)
case "$TMUX_EPOCH" in
  ''|*[!0-9]*) TMUX_EPOCH=$(date +%s) ;;
esac
NOW_ISO=$(date -u +'%Y-%m-%dT%H:%M:%SZ')

# cmd_full: the command line running inside the pane.
case "$AGENT" in
  claude) CMD_FULL='claude' ;;
  agy)    CMD_FULL='agy --dangerously-skip-permissions' ;;
  hermes) CMD_FULL='hermes' ;;
esac

# start_command: the command that was used to start the session.
local_tmux="tmux"
if [ -n "${TMUX_SERVER_NAME:-}" ] && [ "$TMUX_SERVER_NAME" != "default" ]; then
  local_tmux="tmux -L $TMUX_SERVER_NAME"
fi

case "$AGENT" in
  claude)
    # Same condition spawn() uses to pick the wrapper, so the recorded
    # command matches what was actually attempted.
    if [ -x "$WRAPPER" ] || [ "$USE_WRAPPER" = "1" ]; then
      START_CMD="$WRAPPER # ~/.local/bin 의 래퍼"
    else
      START_CMD="$local_tmux new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"claude\""
    fi
    ;;
  agy|hermes)
    START_CMD="$local_tmux new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"$CMD_FULL\""
    ;;
esac

# Optional: register a delegated job for this session.
DELEGATE_JOB_ID=""
if [ -n "$SUBMIT_JOB_PROMPT" ]; then
  case "$AGENT" in
    claude) delegate_agent="claude-code" ;;
    hermes) delegate_agent="hermes-agent" ;;
    *)      delegate_agent="antigravity-cli" ;;
  esac
  agent_session="tmux:$SESSION_NAME"
  if DELEGATE_JOB_ID=$(delegate_submit_job "$SUBMIT_JOB_PROMPT" "$delegate_agent" "$agent_session"); then
    echo "Submitted delegated job: $DELEGATE_JOB_ID"
  else
    echo "WARNING: delegated job submission failed; registering the session without a job" >&2
    DELEGATE_JOB_ID=""
    DEFERRED_FAILURE=1
  fi
fi

# Bootstrap an empty registry file on first use.
if [ ! -f "$AGENT_SESSIONS_YAML" ]; then
  mkdir -p "$(dirname "$AGENT_SESSIONS_YAML")"
  echo "tmux_sessions: []" > "$AGENT_SESSIONS_YAML"
fi

# Child pid is resolved here in bash with pgrep, filtered by tool name (P2).
CHILD_PID=0
if { [ "$AGENT" = "agy" ] || [ "$AGENT" = "hermes" ]; } && [ -n "$PANE_PID" ]; then
  CHILD_PID=$(pgrep -P "$PANE_PID" -x "$AGENT" 2>/dev/null | head -1 || true)
  CHILD_PID="${CHILD_PID:-0}"
fi

# atomic_dump_yaml: flock + temp+rename + .bak + schema validate (P0-B).
# Every value is handed over as a NAME=value argument and read back through
# os.environ in the snippet -- no heredoc interpolation (P1-B).
# The snippet relies on `d` (the loaded YAML document) and `os` being in
# scope; presumably atomic_dump_yaml provides both -- see lib.sh.
# NOTE: intentionally NOT wrapped in `if`/`||`: bash ignores `set -e` inside a
# function that runs in such a context, which would let atomic_dump_yaml
# carry on past a failed internal step.
atomic_dump_yaml "$AGENT_SESSIONS_YAML" \
  SESSION_NAME="$SESSION_NAME" AGENT="$AGENT" NOW_ISO="$NOW_ISO" \
  TMUX_EPOCH="$TMUX_EPOCH" PANE_PID="$PANE_PID" PANE_CWD="$PANE_CWD" \
  CMD_FULL="$CMD_FULL" START_CMD="$START_CMD" CHILD_PID="$CHILD_PID" \
  TMUX_SERVER_NAME="${TMUX_SERVER_NAME:-default}" \
  DELEGATE_JOB_ID="$DELEGATE_JOB_ID" <<'PYEOF'
name = os.environ['SESSION_NAME']
agent = os.environ['AGENT']
pid = os.environ.get('PANE_PID', '')
epoch = os.environ.get('TMUX_EPOCH', '')
server_name = os.environ.get('TMUX_SERVER_NAME', 'default')
server_opt = f"-L {server_name} " if server_name and server_name != 'default' else ""

sessions = d.setdefault('tmux_sessions', [])

# P0-D: refuse only when an entry with the same name has status=running.
# terminated/archived entries are reusable -- drop the stale entry and append
# a fresh one (create -> delete -> create).
running_same = [s for s in sessions if s.get('name') == name and s.get('status') == 'running']
if running_same:
    print(f"ERROR: {name} already running in agent-sessions.yaml", flush=True)
    raise SystemExit(4)
sessions[:] = [s for s in sessions if s.get('name') != name]

entry = {
    'name': name,
    'status': 'running',
    'tmux_session_created_at': os.environ['NOW_ISO'],
    'tmux_session_epoch': int(epoch) if epoch.isdigit() else 0,
    'tmux_server': server_name,
    'delegate_job_id': os.environ.get('DELEGATE_JOB_ID', '') or None,
    'pane': {
        'index': 0,
        'pid': int(pid) if pid.isdigit() else 0,
        'cmd': agent,
        'cmd_full': os.environ['CMD_FULL'],
        'cwd': os.environ['PANE_CWD'],
    },
    'start_command': os.environ['START_CMD'],
    'attach_command': f'tmux {server_opt}attach -t {name}',
    'kill_command': f'tmux {server_opt}kill-session -t {name}',
}

if agent == 'claude':
    entry['tui'] = {
        'model': '(unknown — capture after first message)',
        'provider': 'anthropic',
        'plan': '(unknown)',
        'account': '(unknown — read from claude auth status)',
        'version': '(unknown — read from TUI)',
    }
    entry['claude_session_id_own'] = None
    entry['last_visible_status'] = "TUI started; awaiting first user message"
elif agent == 'agy':
    cp = os.environ.get('CHILD_PID', '0')
    entry['child_pid'] = int(cp) if cp.isdigit() else 0
    entry['agy_conversation_id_own'] = None
    entry['mcp_attachments'] = [
        {
            'name': 'stitch',
            'transport': 'mcp-remote',
            'endpoint': 'https://stitch.googleapis.com/mcp'
        }
    ]
    entry['last_visible_status'] = "TUI started; awaiting first user message"
elif agent == 'hermes':
    cp = os.environ.get('CHILD_PID', '0')
    entry['child_pid'] = int(cp) if cp.isdigit() else 0
    entry['hermes_conversation_id_own'] = None
    entry['last_visible_status'] = "TUI started; awaiting first user message"

sessions.append(entry)

snap = d.setdefault('snapshot', {})
snap['taken_at'] = os.environ['NOW_ISO']
snap['cwd'] = os.environ['PANE_CWD']
print(f"appended: {name}", flush=True)
PYEOF

echo
echo "=== created ==="
echo "tmux session: $SESSION_NAME (pane pid $PANE_PID, cmd $PANE_CMD, cwd $PANE_CWD)"
if [ -n "$DELEGATE_JOB_ID" ]; then
  echo "delegate job: $DELEGATE_JOB_ID"
  delegate_publish_event "$DELEGATE_JOB_ID" started "tmux-agent-orchestrate session created"
  # NOTE(review): start_watchdog looks for watchdog.sh under
  # "$WORKSPACE/.agents/skills/..." -- confirm the project workspace (rather
  # than the skills checkout) is the intended root.
  if WD_PID=$(start_watchdog "$DELEGATE_JOB_ID" "$WORKSPACE"); then
    echo "watchdog PID: $WD_PID"
  else
    echo "WARNING: watchdog could not be started for job $DELEGATE_JOB_ID" >&2
    DEFERRED_FAILURE=1
  fi
fi
echo "agent-sessions.yaml updated"
echo
if [ -n "${TMUX_SERVER_NAME:-}" ] && [ "$TMUX_SERVER_NAME" != "default" ]; then
  echo "Attach: tmux -L $TMUX_SERVER_NAME attach -t $SESSION_NAME"
else
  echo "Attach: tmux attach -t $SESSION_NAME"
fi
echo "Delete: use tmux-agent-orchestrate-stop skill"
echo "Resume: use tmux-agent-orchestrate-resume skill (after first message creates a session id)"

# Surface auxiliary failures to the caller now that the session is registered.
if [ "$DEFERRED_FAILURE" != "0" ]; then
  echo "ERROR: session created and registered, but job submission or watchdog start failed (see warnings above)" >&2
  exit 1
fi
|
||||
@@ -0,0 +1,11 @@
|
||||
# tmux-agent-orchestrate-delegate-job 스킬
|
||||
|
||||
작업(Job)을 자율 에이전트(claude-code/codex/opencode/human)에게 위임하고 MQTT
|
||||
이벤트 채널로 비동기 관찰하는 Hermes 스킬. **시작점은 [`SKILL.md`](./SKILL.md).**
|
||||
|
||||
- 프로토콜/스키마: [`job-protocol.md`](./job-protocol.md)
|
||||
- 브로커 PoC→운영 전환: [`mqtt-broker-setup.md`](./mqtt-broker-setup.md)
|
||||
- 레지스트리 포맷/동시성: [`registry.md`](./registry.md)
|
||||
- 참조 구현: [`tmux-agent-orchestrate-delegate-job`](./tmux-agent-orchestrate-delegate-job) (bash wrapper), [`scripts/publish_event.py`](./scripts/publish_event.py), [`scripts/job_subscriber.py`](./scripts/job_subscriber.py), [`scripts/registry.py`](./scripts/registry.py), [`scripts/mqtt_common.py`](./scripts/mqtt_common.py)
|
||||
- 영구 감사 로그: `.hermes/delegate_job_logs/<job_id>/` (`meta.json`·`events.ndjson`·`status.json`)
|
||||
— `tmux-agent-orchestrate-delegate-job logs <id>` 또는 `tmux-agent-orchestrate-delegate-job logs --list`로 조회 (SKILL.md "Audit Logs" 참조)
|
||||
@@ -0,0 +1,385 @@
|
||||
---
|
||||
name: tmux-agent-orchestrate-delegate-job
|
||||
description: "Delegate a unit of work to any autonomous agent (claude-code, codex, opencode, or a human) and observe it asynchronously over an MQTT event channel. Each job gets a unique id, a registry record (prompt, broker, status, timeouts), and a single per-job topic that carries started/permission_required/progress/completed/error events as schema-versioned JSON. The delegator starts a subscriber first, runs the agent, and treats a completed/error event or a timeout as the job's terminal state. Ships a working reference implementation (publish_event.py, job_subscriber.py, registry.py, mqtt_common.py, tmux-agent-orchestrate-delegate-job wrapper) plus a PoC-to-production path: validate on a public broker, then move to an authenticated TLS broker by changing config only — no code change. Use when you need fire-and-observe delegation, multi-job fan-out across tmux sessions, or a uniform completion-signal protocol shared by several agent types."
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
platforms: [linux, macos, windows]
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [agent-delegation, mqtt, jobs, orchestration, async-completion]
|
||||
related_skills: [claude-code, codex, opencode, hermes-agent-skill-authoring]
|
||||
---
|
||||
|
||||
# tmux-agent-orchestrate-delegate-job — Async Job Delegation over MQTT
|
||||
|
||||
Delegate a unit of work to an autonomous agent, then **observe** it instead of
|
||||
blocking on it. Every job gets a unique id and a registry record; the agent
|
||||
publishes lifecycle events (`started`, `permission_required`, `progress`,
|
||||
`completed`, `error`) to a per-job MQTT topic; the delegator subscribes and
|
||||
treats `completed`/`error` — or a timeout — as the terminal state.
|
||||
|
||||
This skill is a **reference implementation**: copy the files in this directory
|
||||
into your project and customise. The `communication_over_mqtt` project is the
|
||||
canonical concrete instance.
|
||||
|
||||
## Overview
|
||||
|
||||
The model is deliberately small. A **job** is one delegated task. An **agent**
|
||||
is a worker (a claude-code tmux session, a codex run, a human). The **registry**
|
||||
(`.hermes/jobs/<id>.json`) holds everything about a job so nothing important
|
||||
lives in environment variables — which means one tmux session can process many
|
||||
jobs sequentially, and many sessions can fan out in parallel, with no env
|
||||
collisions. The **event channel** is one MQTT topic per job carrying JSON
|
||||
payloads; `event` discriminates the type.
|
||||
|
||||
Responsibility is split into exactly one entry point each:
|
||||
[`publish_event.py`](./scripts/publish_event.py) emits events (registry lookup,
|
||||
monotonic `seq`, retry+backoff) and [`job_subscriber.py`](./scripts/job_subscriber.py)
|
||||
observes them (timeouts, terminal state machine, defensive parsing). Shared
|
||||
logic lives in [`mqtt_common.py`](./scripts/mqtt_common.py); registry I/O in
|
||||
[`registry.py`](./scripts/registry.py). The demo `publisher.py`/`subscriber.py`
|
||||
in the host project stay frozen.
|
||||
|
||||
Two stages, same code. **PoC** runs on the public `broker.hivemq.com` to wire up
|
||||
the protocol. **Production** moves to your own authenticated TLS broker — the
|
||||
switch is **config only** (env vars + the registry `broker.*` block), never a
|
||||
code change. See [`mqtt-broker-setup.md`](./mqtt-broker-setup.md).
|
||||
|
||||
## When to Use / When NOT to Use
|
||||
|
||||
**Use when:**
|
||||
- you want **fire-and-observe** delegation — kick off work and get a completion
|
||||
signal rather than blocking a terminal;
|
||||
- several agent types (claude-code, codex, opencode, human) must follow **one**
|
||||
completion protocol;
|
||||
- you need **multi-job fan-out** across tmux sessions with safe job claiming;
|
||||
- you want a clean PoC → authenticated-broker upgrade path.
|
||||
|
||||
**Do NOT use when:**
|
||||
- a one-shot `claude -p '…'` that returns inline is enough (no async signal
|
||||
needed) — just use the [claude-code](../claude-code/SKILL.md) skill directly;
|
||||
- you need request/response RPC or large artifact transfer (this is a
|
||||
one-direction event stream, not a data bus);
|
||||
- the payload would carry secrets and you're still on the public broker — move
|
||||
to the own-broker stage first.
|
||||
|
||||
## Quick Start
|
||||
|
||||
The one-line wrapper handles register + subscriber-first + agent launch. If
|
||||
you're new, **start here** and only fall back to the manual 5-step flow when
|
||||
you need finer control.
|
||||
|
||||
```bash
|
||||
# 1) one line: register → start subscriber → launch agent in tmux
|
||||
# (uses public broker by default; last stdout line is the audit-log dir)
|
||||
tmux-agent-orchestrate-delegate-job submit \
|
||||
--agent claude-code \
|
||||
--prompt "정렬 문제 10개를 만들어 sort_problems.md로 저장" \
|
||||
--workdir /path/to/project \
|
||||
--agent-session tmux:demo \
|
||||
--timeout 3600 --idle-timeout 120
|
||||
# → stdout: registered job: <JID>
|
||||
# subscriber pid: …
|
||||
# agent launched in tmux session: demo
|
||||
# subscriber output: <one line per event>
|
||||
# /path/to/project/.hermes/delegate_job_logs/<JID> ← audit log dir
|
||||
|
||||
# 2) at any time, query the job or its audit log
|
||||
tmux-agent-orchestrate-delegate-job status --job <JID>
|
||||
tmux-agent-orchestrate-delegate-job logs <JID> # pretty timeline
|
||||
tmux-agent-orchestrate-delegate-job logs --list # every job, live status
|
||||
|
||||
# 3) run a user-supplied validator against the job's artifacts
|
||||
tmux-agent-orchestrate-delegate-job verify --job <JID> --validate ./validate.sh
|
||||
```
|
||||
|
||||
The wrapper enforces the **subscribe-before-publish** ordering and **forwards
|
||||
the freshly-minted `JOB_ID` into the agent's prompt** (so the agent calls
|
||||
`publish_event.py --job <JID>` with the right id — see Pitfall §"Wrong job_id
|
||||
propagated to the agent"). When you need finer control, the manual flow is:
|
||||
|
||||
```bash
|
||||
# Manual 5-step (same outcome, more knobs)
|
||||
PY=.venv/bin/python
|
||||
SKILL=./.agents/skills/tmux-agent-orchestrate-delegate-job/scripts
|
||||
|
||||
# 1) register
|
||||
JID=$($PY "$SKILL/registry.py" register \
|
||||
--prompt "…" --agent claude-code --agent-session tmux:demo \
|
||||
--timeout 3600 --idle-timeout 120)
|
||||
|
||||
# 2) START THE SUBSCRIBER FIRST (MQTT does not queue non-retained msgs)
|
||||
$PY "$SKILL/job_subscriber.py" --job "$JID" --timeout 3600 --idle-timeout 120 &
|
||||
|
||||
# 3) pass JID to the agent and instruct it to publish events with --job "$JID"
|
||||
# (don't hard-code a job id you saw earlier — see Pitfall §"Wrong job_id")
|
||||
|
||||
# 4) on completion the subscriber prints events and exits 0/1/2
|
||||
|
||||
# 5) inspect any time
|
||||
$PY "$SKILL/registry.py" get --job "$JID"
|
||||
$PY "$SKILL/registry.py" logs "$JID" # positional job id
|
||||
$PY "$SKILL/registry.py" logs --list
|
||||
```
|
||||
|
||||
## Job Protocol
|
||||
|
||||
One topic per job: `python/mqtt/jobs/<job_id>/events`. Payload (JSON, UTF-8,
|
||||
`schema_version=1`):
|
||||
|
||||
```json
|
||||
{ "schema_version": 1, "seq": 7, "job_id": "abc12345",
|
||||
"event": "started|permission_required|progress|completed|error",
|
||||
"timestamp": "2026-06-19T09:32:00Z", "detail": "generalised text",
|
||||
"data": { "optional": "metadata" } }
|
||||
```
|
||||
|
||||
- `seq` is monotonic per job (first = 1); the subscriber uses it to spot
|
||||
reorder/duplication.
|
||||
- `timestamp` is advisory — timeouts are measured from **receive** time.
|
||||
- `detail`/`data` carry **no** secrets or absolute paths.
|
||||
- An event whose `schema_version` or `job_id` does not match is **dropped** (defensive parsing).
|
||||
|
||||
`started` and `completed`/`error` are the mandatory bookends; `completed`→exit 0,
|
||||
`error`→exit 1. Full catalogue + production `auth_token` handling:
|
||||
[`job-protocol.md`](./job-protocol.md).
|
||||
|
||||
## Registry Format
|
||||
|
||||
```
|
||||
.hermes/jobs/<id>.json # metadata record (single source of truth)
|
||||
.hermes/jobs/<id>.events.log # append-only JSON-lines log (debug, optional)
|
||||
.hermes/jobs/.lock # fcntl advisory lock for the registry
|
||||
```
|
||||
|
||||
The record holds `status`, `prompt`, `agent`, `agent_session`, a `broker` block,
|
||||
`topic_prefix`, `timeout_sec`/`idle_timeout_sec`, `expected_artifacts`,
|
||||
`last_seq`, and (production) `auth_token`. Because the `broker` block lives in
|
||||
the record, `publish_event.py` connects from the registry alone. Concurrency,
|
||||
the atomic rename trick, and multi-session job claiming are in
|
||||
[`registry.md`](./registry.md).
|
||||
|
||||
## Audit Logs
|
||||
|
||||
Every job's lifecycle is mirrored to a **persistent, append-only audit log**
|
||||
under `.hermes/delegate_job_logs/` (override with `DELEGATE_JOB_LOGS_DIR`;
|
||||
default `<cwd>/.hermes/delegate_job_logs`). Unlike the registry — live state
|
||||
mutated in place and liable to be cleaned up — the audit log is durable
|
||||
history you can replay after the fact. It is git-ignored.
|
||||
|
||||
```
|
||||
.hermes/delegate_job_logs/<job_id>/
|
||||
meta.json # registration snapshot: prompt, agent, broker, timeouts, …
|
||||
events.ndjson # append-only, one JSON event per line, in time order
|
||||
status.json # current status only (fast point-query)
|
||||
```
|
||||
|
||||
**What is logged, automatically:**
|
||||
|
||||
| When | `events.ndjson` line | Written by |
|
||||
|------|----------------------|------------|
|
||||
| job registered | `registered` (also seeds meta.json + status.json) | `registry.register_job` |
|
||||
| any status change | `status_changed` (`from`/`to`; also rewrites status.json) | `update_job_status`, `pick_pending` |
|
||||
| event published | `published` (carries the exact payload — reproducible) | `publish_event.py` |
|
||||
| event received | `received` (subscriber's external view) | `job_subscriber.py` |
|
||||
|
||||
Both the emitter side (`published`) and the observer side (`received`) are
|
||||
recorded, so a dropped publish or a missed receive is still visible from the
|
||||
other. Every write is **best-effort and isolated** — an fcntl-locked append
|
||||
guarded by `try/except` that only ever emits a `logger.warning`, so a logging
|
||||
failure can never break a publish, a subscribe, or a registry write. stdout is
|
||||
never touched.
|
||||
|
||||
**Reading them:**
|
||||
|
||||
```bash
|
||||
tmux-agent-orchestrate-delegate-job logs <job_id> # pretty-print one job's timeline
|
||||
tmux-agent-orchestrate-delegate-job logs --list # summarise every logged job (with live status)
|
||||
# or directly via the registry CLI:
|
||||
$PY scripts/registry.py logs <job_id> [--tail N] [--json]
|
||||
$PY scripts/registry.py logs --list [--json]
|
||||
```
|
||||
|
||||
`submit` prints the job's audit-log directory as its last stdout line, so a
|
||||
caller can `tail -n1` to locate it.
|
||||
|
||||
## Broker Setup
|
||||
|
||||
| Stage | Broker | Auth | Transport |
|
||||
|-------|--------|------|-----------|
|
||||
| PoC | `broker.hivemq.com` | none | 1883 plaintext |
|
||||
| Production | self-hosted Mosquitto/EMQX | user/pass + ACL | 8883 TLS |
|
||||
|
||||
All connection settings come from env (`MQTT_BROKER`, `MQTT_PORT`, `MQTT_TLS`,
|
||||
`MQTT_USERNAME`/`MQTT_PASSWORD`, `MQTT_CA_CERTS`, …) resolved by
|
||||
`broker_config_from_env()`, with the registry `broker.*` block overriding per
|
||||
job. Moving to your own broker is **config only**: install Mosquitto, set
|
||||
`persistence true` + `acl_file` + `password_file` + a TLS `listener 8883`, grant
|
||||
the worker `write python/mqtt/jobs/+/events` and Hermes `read`, then flip
|
||||
`MQTT_TLS=1` and fill the registry `broker.*`. Step-by-step (conf, ACL,
|
||||
`mosquitto_passwd`, self-signed/private-CA certs, cut-over verification):
|
||||
[`mqtt-broker-setup.md`](./mqtt-broker-setup.md).
|
||||
|
||||
## Agent Adapters
|
||||
|
||||
Each agent voluntarily follows the contract: receive a `JOB_ID` (or registry
|
||||
path), call `publish_event.py` at lifecycle points, exit 0/1/2. **The contract
|
||||
in one line**: every event call uses `--job "$JOB_ID"` where `$JOB_ID` is the
|
||||
**freshly-issued id from the registry record for *this* delegation** — never a
|
||||
job_id you saw in an earlier session (Pitfall §"Wrong job_id propagated to the
|
||||
agent").
|
||||
|
||||
- **claude-code** — Claude Code calls `publish_event.py` via its Bash tool at
|
||||
lifecycle points. `submit --mode tmux` injects a prompt that already names
|
||||
`$JOB_ID`; if you drive claude manually, hand it the id explicitly. Reference
|
||||
instruction block (the wrapper injects something equivalent):
|
||||
|
||||
```text
|
||||
Your job_id is "$JOB_ID" (read it from the registry record for this delegation —
|
||||
do not reuse any job_id you saw before).
|
||||
|
||||
On start:      $PY .agents/skills/tmux-agent-orchestrate-delegate-job/scripts/publish_event.py --job "$JOB_ID" --event started
|
||||
On permission: $PY … --job "$JOB_ID" --event permission_required --detail "<tool>:<what>"
|
||||
On progress: $PY … --job "$JOB_ID" --event progress --detail "<short status>"
|
||||
On success: $PY … --job "$JOB_ID" --event completed --detail "<one-line summary>"
|
||||
On failure: $PY … --job "$JOB_ID" --event error --detail "<one-line reason>"
|
||||
|
||||
Task: <the user's prompt>
|
||||
|
||||
The subscriber for "$JOB_ID" is already running; your completed/error event
|
||||
ends the job. Exit codes: 0 completed, 1 error, 2 publish failure.
|
||||
```
|
||||
|
||||
See [claude-code](../claude-code/SKILL.md) for tmux orchestration patterns.
|
||||
- **codex** — same contract. Invoke `codex exec "<instruction-block-above>"` or
|
||||
wire `publish_event.py` as an MCP tool so the agent can call it directly.
|
||||
- **opencode** — wire `publish_event.py` as a tool/command the agent can call;
|
||||
identical event points.
|
||||
- **human** — a person does the work, reads the registry record, then runs
|
||||
`publish_event.py --job <id> --event completed` (or `error`) by hand.
|
||||
|
||||
## User Interface
|
||||
|
||||
The [`tmux-agent-orchestrate-delegate-job`](./tmux-agent-orchestrate-delegate-job) bash wrapper bundles register +
|
||||
subscribe-first + run-agent + validate:
|
||||
|
||||
```bash
|
||||
tmux-agent-orchestrate-delegate-job submit --agent claude-code \
|
||||
--prompt "정렬 문제 10개를 만들어 sort_problems.md로 저장" \
|
||||
--workdir /path/to/project --timeout 3600 [--validate ./validate.sh]
|
||||
tmux-agent-orchestrate-delegate-job status --job <id> # one record, pretty-printed
|
||||
tmux-agent-orchestrate-delegate-job list # all jobs, one line each
|
||||
tmux-agent-orchestrate-delegate-job verify --job <id> --validate ./validate.sh # runs it, reports exit code
|
||||
tmux-agent-orchestrate-delegate-job wait [--job <id>] # block until terminal (else --wait-any)
|
||||
```
|
||||
|
||||
`submit` **always starts the subscriber before the agent** (the ordering
|
||||
dependency), runs the agent in `--mode print` (one-shot) or `--mode tmux`, and
|
||||
calls `--validate` afterward if given. The skill automates job-id generation,
|
||||
registry creation, broker resolution, subscriber-first ordering, agent launch,
|
||||
and completion detection; it does **not** automate the agent's internals or your
|
||||
business-logic validation — those are hooks you fill (`validate.sh` reads
|
||||
`$JOB_ID`/`$REGISTRY_DIR`).
|
||||
|
||||
## Common Pitfalls
|
||||
|
||||
- **Publishing before subscribing** — MQTT does not queue non-retained messages
|
||||
for absent subscribers. Start `job_subscriber.py` *before* the agent, or rely
|
||||
on retained terminal events (production). `submit` enforces this.
|
||||
- **Wrong job_id propagated to the agent** — the wrapper prints a fresh `JOB_ID`
|
||||
on every `submit`. If your agent instruction (or the wrapper's prompt template)
|
||||
hard-codes an old job_id, the agent calls `publish_event.py --job <wrong>`,
|
||||
the subscriber's defensive parser drops it as a `job_id` mismatch, and the
|
||||
delegator waits until idle timeout (exit 2). Fix: instruct the agent to
|
||||
**read the job_id from the registry record for *this* delegation** (or pass it
|
||||
in via env / `--prompt` interpolation), never from prior runs. `submit`'s
|
||||
default prompt template interpolates `$JOB_ID` for you — if you build a custom
|
||||
prompt, do the same.
|
||||
- **tmux session name collision** — `submit --mode tmux` derives the session
|
||||
name from `--agent-session tmux:<name>` (default `tmux:claude`). If a session
|
||||
with that name already exists (e.g. you ran the demo and the previous
|
||||
session is still open), `tmux new-session -d -s <name>` fails and the agent
|
||||
never launches. Pick a unique `--agent-session` per concurrent delegation
|
||||
(e.g. `tmux:demo`, `tmux:claude-a`, `tmux:claude-b`) or kill the stale one
|
||||
(`tmux kill-session -t claude`) before re-running.
|
||||
- **Timeout before `started`** — a cold-starting agent may not emit `started`
|
||||
for a while; the wall-clock timeout starts at subscribe time so a stuck agent
|
||||
still terminates. Don't set `--timeout` so low you false-positive a slow start.
|
||||
- **No retry on publish** — a dropped `completed` would hang the delegator
|
||||
until it times out; `publish_event.py` retries with exponential backoff and exits 2 if it
|
||||
still fails, so the delegator is never left waiting silently.
|
||||
- **QoS-1 duplicates / reorders** — a terminal event can arrive twice, or
|
||||
`error` can trail `completed`; the subscriber's terminal state machine
|
||||
finalises each job once and ignores the rest.
|
||||
- **Trusting the public broker** — anyone can publish there; never make a real
|
||||
decision on a PoC signal. Add `auth_token` + an authenticated broker first.
|
||||
- **Secrets in `detail`/`data`** — keep payloads generalised; no paths, keys, or
|
||||
tokens (production adds only the derived `hmac_sig` to `data` — never the raw `auth_token`).
|
||||
|
||||
## Subagent Orchestration Pattern
|
||||
|
||||
When using this skill from a Hermes `delegate_task` subagent to dispatch work to
|
||||
a coding-agent CLI (agy/claude) running in a tmux session, the following pattern
|
||||
has been verified (2026-06-21, 6-batch refactoring sprint):
|
||||
|
||||
### Roles
|
||||
- **Main worker** (implementation): one agent session (e.g. `agy-new`) receives
|
||||
brief files and executes code changes.
|
||||
- **Reviewers** (spec compliance + code quality): two other agent sessions
|
||||
(e.g. `agy-existing`, `claude-existing`) review the diff in parallel.
|
||||
- **Hermes** (orchestrator): dispatches subagents, verifies diffs, commits,
|
||||
and falls back to direct fixes when reviewers find issues.
|
||||
|
||||
### Key lessons learned
|
||||
1. **Brief delivery via file path** — don't paste long briefs inline via
|
||||
`tmux send-keys`; the TUI may swallow them. Instead, send a short instruction
|
||||
like "follow /tmp/batch1-brief.md" and let the agent read the file.
|
||||
2. **Polling vs MQTT subscriber** — for short tasks (<5min), pane polling
|
||||
(`capture-pane` + grep for completion markers) is simpler and more reliable
|
||||
than registering a job via `registry.py` + `job_subscriber.py`. Use MQTT
|
||||
subscriber only for long-running jobs (>5min) where push notification matters.
|
||||
3. **Reviewers catch different bugs** — in practice, agy (Flash) caught
|
||||
semantic issues (slash matching, export scope), while claude (Opus) caught
|
||||
API signature mismatches (paho v2 5-arg vs 4-arg `on_disconnect`). Two
|
||||
reviewers with different models provide complementary coverage.
|
||||
4. **Hermes fallback fix** — when reviewers find a small, well-defined issue
|
||||
(wrong argument count, missing slash), Hermes should fix it directly rather
|
||||
than re-dispatching the implementer. This saves a full round-trip.
|
||||
5. **Batch grouping** — group 2-3 FW items per batch when they touch different
|
||||
files (no file overlap). This amortises the dispatch overhead. Items touching
|
||||
the same file must be in separate batches to avoid conflicts.
|
||||
6. **Pane Snapshots & Truncation Prevention** — to prevent long agent responses from being scrolled out and truncated due to TUI viewport limitations, enforce the following snapshotting pattern:
|
||||
- When dispatching a brief, first capture the existing (pre-brief) pane buffer via `tmux capture-pane -p -S -200`.
|
||||
- During long execution, run a background loop taking incremental snapshots (e.g. every 30 seconds `>> /tmp/pane-snap.txt`).
|
||||
- Immediately after job termination, capture the entire final pane state to ensure no terminal logs are lost.
|
||||
|
||||
## Verification Checklist
|
||||
|
||||
- [ ] `started` → `completed` over the public broker: subscriber prints the
|
||||
lines and exits **0**.
|
||||
- [ ] `error` path: subscriber exits **1**.
|
||||
- [ ] timeout path: no terminal event within `--timeout`/`--idle-timeout` →
|
||||
exit **2**.
|
||||
- [ ] polluted payload (bad JSON, wrong `schema_version`, wrong `job_id`) is
|
||||
dropped with a warning, not crashed on.
|
||||
- [ ] one tmux session processes two registry jobs in sequence; a second
|
||||
session with a different `agent_session` claims only its own.
|
||||
- [ ] broker cut-over: same scripts reach an authenticated TLS broker with env
|
||||
changes only; a credential without write ACL is rejected; a late
|
||||
subscriber still receives the retained terminal event.
|
||||
- [ ] `publisher.py`/`subscriber.py`/`README.md` demo on `python/mqtt/sample`
|
||||
still works unchanged (regression).
|
||||
- [ ] **audit log integrity** — for a completed job,
|
||||
`.hermes/delegate_job_logs/<JID>/events.ndjson` contains `registered` →
|
||||
`received started` → `published completed` (in that order), and
|
||||
`status.json.status == "completed"` matches the registry record. A
|
||||
logging failure (e.g. read-only log dir) does not break the publish or
|
||||
subscribe path — only a `logger.warning` is emitted.
|
||||
- [ ] **end-to-end demo smoke** — run
|
||||
`tmux-agent-orchestrate-delegate-job submit --agent claude-code --agent-session tmux:demo-smoke
|
||||
--prompt "echo hello and call publish_event.py --job <JID>
|
||||
--event completed" --timeout 120` and confirm
|
||||
(a) registered job id echoed, (b) subscriber pid echoed, (c) tmux session
|
||||
name printed, (d) `events.ndjson` grows as the agent runs, (e) final
|
||||
stdout line is the audit-log dir.
|
||||
@@ -0,0 +1,114 @@
|
||||
# Job Event Protocol
|
||||
|
||||
The wire contract every tmux-agent-orchestrate-delegate-job agent (claude-code, codex, opencode,
|
||||
human, …) speaks. One job → one MQTT topic → JSON event payloads. Stable across
|
||||
the PoC (public broker) and production (own broker) stages; only transport
|
||||
hardening changes, never the payload shape.
|
||||
|
||||
Reference implementation: [`./scripts/publish_event.py`](./scripts/publish_event.py)
|
||||
(emit) and [`./scripts/job_subscriber.py`](./scripts/job_subscriber.py) (observe).
|
||||
|
||||
---
|
||||
|
||||
## 1. Topic design
|
||||
|
||||
| Topic | Purpose |
|
||||
|-------|---------|
|
||||
| `python/mqtt/sample` | Legacy demo topic — **never changed** (README compat). |
|
||||
| `python/mqtt/jobs/<job_id>/events` | Per-job event stream (this protocol). |
|
||||
|
||||
- One topic per job, JSON payload, `event` field discriminates the type.
|
||||
- Single-direction publish only (worker → observer). No request/response.
|
||||
- Future split is reserved but not required:
|
||||
`<job_id>/events`, `<job_id>/logs`, `<job_id>/artifacts`.
|
||||
- `topic_prefix` is stored in the job record so publishers resolve the topic
|
||||
from the registry alone (`<topic_prefix>/events`).
|
||||
|
||||
---
|
||||
|
||||
## 2. Payload schema (JSON, UTF-8, `schema_version = 1`)
|
||||
|
||||
```json
|
||||
{
|
||||
"schema_version": 1,
|
||||
"seq": 7,
|
||||
"job_id": "abc12345",
|
||||
"event": "started | permission_required | progress | completed | error",
|
||||
"timestamp": "2026-06-19T09:32:00Z",
|
||||
"detail": "generalised, whitelisted human-readable string",
|
||||
"data": { "optional": "metadata" }
|
||||
}
|
||||
```
|
||||
|
||||
| Field | Rule |
|
||||
|-------|------|
|
||||
| `schema_version` | If publisher/subscriber disagree, the subscriber **drops** the event with a warning (defensive parsing). |
|
||||
| `seq` | Monotonic **per `job_id`**, first publish = 1. Lets the subscriber detect reorder/duplication. Persisted in the registry (`last_seq`) so it survives restarts. |
|
||||
| `job_id` | Subscriber drops any event whose `job_id` it did not subscribe for. |
|
||||
| `timestamp` | Publisher host clock, **advisory only**. The delegator's timeout is measured from *receive* time, not this field. |
|
||||
| `detail` | Generalised text only. **No absolute paths, keys, or tokens.** |
|
||||
| `data` | Optional metadata. Production may add `hmac_sig`, `build_id`, etc. |
|
||||
|
||||
---
|
||||
|
||||
## 3. Event catalogue
|
||||
|
||||
| event | When emitted | `detail` example | seq |
|
||||
|-------|--------------|------------------|-----|
|
||||
| `started` | Agent first picks up the job | `"Job a1b2c3d4 started"` | 1 |
|
||||
| `permission_required` | Agent needs a tool/permission grant | `"needs to write sort_problems.md"` | as it happens |
|
||||
| `progress` | Optional intermediate checkpoint | `"creating problem 5/10"` | as it happens |
|
||||
| `completed` | Successful terminal state | `"saved to sort_problems.md"` | last |
|
||||
| `error` | Failure / exception terminal state | `"internal error, see logs"` | last |
|
||||
|
||||
`started` and `completed`/`error` are mandatory bookends; `permission_required`
|
||||
and `progress` are optional. `detail` must stay on the whitelist of generalised
|
||||
phrasings — never leak secrets through it.
|
||||
|
||||
### Terminal semantics
|
||||
|
||||
- `completed` → subscriber exits 0; `error` → exits 1.
|
||||
- The subscriber runs a **terminal state machine**: it finalises a job on the
|
||||
first `completed`/`error` it sees and ignores any later terminal event for
|
||||
that job (QoS-1 duplicate, or an `error`-after-`completed` reorder). When all
|
||||
watched jobs are finalised it exits.
|
||||
- Wall-clock timeout *or* idle timeout before a terminal event → exit 2.
|
||||
|
||||
---
|
||||
|
||||
## 4. Production hardening (own broker stage)
|
||||
|
||||
The payload shape is unchanged; the transport and trust model tighten. See
|
||||
[`mqtt-broker-setup.md`](./mqtt-broker-setup.md) for the broker side.
|
||||
|
||||
- **Auth / ACL** — username/password + per-topic ACL. `jobs/+/events` publish is
|
||||
granted to the worker credential, subscribe to the Hermes credential.
|
||||
- **HMAC Signature Verification (`data.hmac_sig`)** — to authenticate the publisher and verify message integrity without exposing the raw secret token over the wire, each job record contains a per-job `auth_token` (`secrets.token_urlsafe(32)`). The publisher computes an HMAC-SHA256 signature over the serialized payload (excluding `data.hmac_sig` itself) using the `auth_token` as the key, and appends it to **`data.hmac_sig`**. The subscriber reconstructs this signature and **drops any message that does not match or lacks a valid signature**.
|
||||
|
||||
```json
|
||||
{ "...": "...", "data": { "hmac_sig": "d2f3...", "build_id": "42" } }
|
||||
```
|
||||
|
||||
- **TLS** — port 8883 + private CA. Toggled with `MQTT_TLS=1` (+ `MQTT_CA_CERTS`);
|
||||
no code change.
|
||||
- **Retained terminal events** — `completed`/`error` publish with `retain=True`
|
||||
so a subscriber that joins late immediately receives the last terminal state
|
||||
instead of a stale view. The reference publisher auto-retains terminal events;
|
||||
`--retained` forces it for any event.
|
||||
- **Dual timeouts** — total wall-clock budget + last-activity idle detection,
|
||||
both measured from receive time.
|
||||
- **Clock trust** — never trust the payload `timestamp` for timeout decisions.
|
||||
|
||||
---
|
||||
|
||||
## 5. Why a public broker is PoC-only
|
||||
|
||||
On `broker.hivemq.com` anyone can publish/subscribe to the same topic. Therefore:
|
||||
|
||||
- No secret data in payloads.
|
||||
- `started`/`completed`/`error` are *signals*, never a basis for a security
|
||||
decision.
|
||||
- Non-retained messages are **not queued** for absent subscribers — start the
|
||||
subscriber **before** the agent (ordering dependency), or rely on retained
|
||||
terminal events in production.
|
||||
- Real operational decisions belong to the own-broker stage with auth + ACL.
|
||||
@@ -0,0 +1,176 @@
|
||||
# MQTT Broker Setup — PoC → Production
|
||||
|
||||
The tmux-agent-orchestrate-delegate-job scripts read **all** broker settings from environment
|
||||
variables (or a job record's `broker.*` block) through a single helper,
|
||||
`broker_config_from_env()` in
|
||||
[`./scripts/mqtt_common.py`](./scripts/mqtt_common.py). The design goal:
|
||||
**switch from the public PoC broker to your own broker with config only — no
|
||||
code change.**
|
||||
|
||||
| Env var | Meaning | PoC default | Production |
|
||||
|---------|---------|-------------|-----------|
|
||||
| `MQTT_BROKER` | host | `broker.hivemq.com` | internal hostname/IP |
|
||||
| `MQTT_PORT` | port | `1883` | `8883` (TLS) |
|
||||
| `MQTT_TLS` | TLS on/off (`1`/`0`) | `0` | `1` |
|
||||
| `MQTT_USERNAME` / `MQTT_PASSWORD` | auth | (none) | broker-issued |
|
||||
| `MQTT_CA_CERTS` | CA bundle path | (none) | private CA path |
|
||||
| `MQTT_CERTFILE` / `MQTT_KEYFILE` | client cert (optional mTLS) | (none) | per-client |
|
||||
| `MQTT_CLIENT_ID_PREFIX` | client id prefix | `hermes` | per-environment |
|
||||
|
||||
---
|
||||
|
||||
## 1. PoC: public broker (`broker.hivemq.com`)
|
||||
|
||||
**Pros** — zero setup, reachable from anywhere, perfect for wiring up the
|
||||
publish/subscribe loop and the timeout/state-machine logic.
|
||||
|
||||
**Cons / accepted assumptions** — no auth, no integrity, shared with the world:
|
||||
|
||||
- no secrets in payloads;
|
||||
- `started`/`completed`/`error` are advisory signals only;
|
||||
- non-retained messages are **not queued** for absent subscribers, so the
|
||||
subscriber must start before the agent;
|
||||
- a re-subscribing client cannot recover past (non-retained) events.
|
||||
|
||||
Use it only to validate the protocol, never for real decisions.
|
||||
|
||||
---
|
||||
|
||||
## 2. Production: self-hosted Mosquitto (or EMQX)
|
||||
|
||||
Both support MQTT 5 + ACL + TLS. Mosquitto shown below; EMQX is a drop-in for
|
||||
the same env vars.
|
||||
|
||||
### 2.1 Install
|
||||
|
||||
```bash
|
||||
# macOS
|
||||
brew install mosquitto
|
||||
|
||||
# Debian/Ubuntu
|
||||
sudo apt-get update && sudo apt-get install -y mosquitto mosquitto-clients
|
||||
|
||||
# Docker
|
||||
docker run -d --name mosquitto -p 8883:8883 \
|
||||
-v "$PWD/mosquitto.conf:/mosquitto/config/mosquitto.conf" \
|
||||
-v "$PWD/certs:/mosquitto/certs" \
|
||||
-v "$PWD/auth:/mosquitto/auth" \
|
||||
eclipse-mosquitto:2
|
||||
```
|
||||
|
||||
### 2.2 `mosquitto.conf` (key lines)
|
||||
|
||||
```conf
|
||||
persistence true
|
||||
persistence_location /mosquitto/data/
|
||||
|
||||
password_file /mosquitto/auth/passwd
|
||||
acl_file /mosquitto/auth/acl
|
||||
allow_anonymous false
|
||||
|
||||
listener 8883
|
||||
cafile /mosquitto/certs/ca.crt
|
||||
certfile /mosquitto/certs/server.crt
|
||||
keyfile /mosquitto/certs/server.key
|
||||
```
|
||||
|
||||
`persistence true` + QoS 1 + retained terminal events means a subscriber that
|
||||
joins after a job finished still sees the final `completed`/`error`.
|
||||
|
||||
### 2.3 Users (username/password)
|
||||
|
||||
```bash
|
||||
# create the file with the first user (-c), then add more users without -c
|
||||
mosquitto_passwd -c /mosquitto/auth/passwd hermes # subscriber/delegator
|
||||
mosquitto_passwd /mosquitto/auth/passwd claude-worker # publisher/agent
|
||||
# (omit -c after the first; -c truncates the file)
|
||||
```
|
||||
|
||||
### 2.4 ACL — least privilege
|
||||
|
||||
The worker only **publishes** events; Hermes only **subscribes**:
|
||||
|
||||
```conf
|
||||
# /mosquitto/auth/acl
|
||||
|
||||
# claude-worker: may publish job events, may not read others' streams
|
||||
user claude-worker
|
||||
topic write python/mqtt/jobs/+/events
|
||||
|
||||
# hermes: observes every job's events
|
||||
user hermes
|
||||
topic read python/mqtt/jobs/+/events
|
||||
|
||||
# keep the legacy demo topic usable for both, if desired
|
||||
pattern readwrite python/mqtt/sample
|
||||
```
|
||||
|
||||
### 2.5 TLS certificates
|
||||
|
||||
**Quick self-signed (single host, internal only):**
|
||||
|
||||
```bash
|
||||
mkdir -p certs && cd certs
|
||||
openssl req -x509 -newkey rsa:2048 -nodes -days 825 \
|
||||
-keyout server.key -out server.crt \
|
||||
-subj "/CN=mqtt.internal"
|
||||
cp server.crt ca.crt # clients trust this as the CA bundle
|
||||
```
|
||||
|
||||
**Private CA (recommended — separate CA from server cert):**
|
||||
|
||||
```bash
|
||||
# 1) CA
|
||||
openssl genrsa -out ca.key 4096
|
||||
openssl req -x509 -new -nodes -key ca.key -days 3650 -out ca.crt -subj "/CN=Hermes-CA"
|
||||
# 2) server cert signed by the CA
|
||||
openssl genrsa -out server.key 2048
|
||||
openssl req -new -key server.key -out server.csr -subj "/CN=mqtt.internal"
|
||||
openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial \
|
||||
-out server.crt -days 825
|
||||
```
|
||||
|
||||
Clients trust `ca.crt` via `MQTT_CA_CERTS=/path/to/ca.crt`.
|
||||
|
||||
---
|
||||
|
||||
## 3. Cut-over verification (config-only, no code change)
|
||||
|
||||
Goal: prove the **same scripts** talk to your broker by changing only env/registry.
|
||||
|
||||
```bash
|
||||
# 1) point the env at the new broker
|
||||
export MQTT_BROKER=mqtt.internal
|
||||
export MQTT_PORT=8883
|
||||
export MQTT_TLS=1
|
||||
export MQTT_CA_CERTS=$PWD/certs/ca.crt
|
||||
export MQTT_USERNAME=hermes
|
||||
export MQTT_PASSWORD=… # subscriber side
|
||||
# (publisher side uses claude-worker creds via the job record's broker block)
|
||||
|
||||
# 2) sanity-check with the mosquitto CLI first
|
||||
mosquitto_sub -h "$MQTT_BROKER" -p 8883 --cafile "$MQTT_CA_CERTS" \
|
||||
-u hermes -P "$MQTT_PASSWORD" -t 'python/mqtt/jobs/+/events' -v &
|
||||
|
||||
# 3) run the unchanged tmux-agent-orchestrate-delegate-job loop
|
||||
PY=.venv/bin/python
|
||||
JID=$($PY scripts/registry.py register --prompt "broker cutover smoke")
|
||||
$PY scripts/job_subscriber.py --job "$JID" --timeout 30 &
|
||||
sleep 3
|
||||
$PY scripts/publish_event.py --job "$JID" --event started
|
||||
$PY scripts/publish_event.py --job "$JID" --event completed # auto-retained
|
||||
```
|
||||
|
||||
Expected:
|
||||
- subscriber prints the `started` and `completed` lines and exits 0;
|
||||
- `mosquitto_sub` shows the same events (ACL allows `hermes` to read);
|
||||
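- publishing as a credential **without** write ACL is denied by the broker — the message never reaches subscribers (MQTT 3.1.1 clients see no error; the publish is silently dropped);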
|
||||
- a subscriber started *after* `completed` still receives it (retained).
|
||||
|
||||
If all four hold, the migration is config-only. Persist the broker block into
|
||||
each job record so `publish_event.py` connects from the registry alone:
|
||||
|
||||
```json
|
||||
"broker": { "host": "mqtt.internal", "port": 8883, "tls": true,
|
||||
"username": "claude-worker", "password": "…" }
|
||||
```
|
||||
@@ -0,0 +1,183 @@
|
||||
# Job Registry
|
||||
|
||||
The registry is the **single source of truth** for delegated work. Job metadata
|
||||
(id, prompt, broker, status, timeouts) lives in files, **not** environment
|
||||
variables — so one tmux session can handle many jobs sequentially or in
|
||||
parallel without collisions, and `publish_event.py` / `job_subscriber.py` can
|
||||
reconstruct everything they need from the registry alone.
|
||||
|
||||
Reference implementation: [`./scripts/registry.py`](./scripts/registry.py)
|
||||
(library + CLI) over the primitives in
|
||||
[`./scripts/mqtt_common.py`](./scripts/mqtt_common.py).
|
||||
|
||||
---
|
||||
|
||||
## 1. Directory layout
|
||||
|
||||
```
|
||||
.hermes/jobs/
|
||||
<job_id>.json # job metadata record (schema below)
|
||||
<job_id>.events.log # append-only JSON-lines event log (debug, optional)
|
||||
.lock # shared advisory lock (fcntl) for the whole registry
|
||||
```
|
||||
|
||||
`registry_dir` defaults to `.hermes/jobs` and is overridable everywhere via
|
||||
`--registry-dir`.
|
||||
|
||||
---
|
||||
|
||||
## 2. Job record schema
|
||||
|
||||
```json
|
||||
{
|
||||
"schema_version": 1,
|
||||
"job_id": "abc12345",
|
||||
"status": "pending | running | completed | error | cancelled",
|
||||
"created_at": "2026-06-19T09:30:00Z",
|
||||
"updated_at": "2026-06-19T09:32:00Z",
|
||||
"prompt": "정렬 문제 10개를 만들어 sort_problems.md로 저장…",
|
||||
"agent": "claude-code",
|
||||
"agent_session": "tmux:claude",
|
||||
"broker": {
|
||||
"host": "broker.hivemq.com",
|
||||
"port": 1883,
|
||||
"tls": false,
|
||||
"username": null,
|
||||
"password": null
|
||||
},
|
||||
"topic_prefix": "python/mqtt/jobs/abc12345",
|
||||
"timeout_sec": 3600,
|
||||
"idle_timeout_sec": 120,
|
||||
"expected_artifacts": ["sort_problems.md"],
|
||||
"last_seq": 0,
|
||||
"auth_token": null
|
||||
}
|
||||
```
|
||||
|
||||
- `broker` lets `publish_event.py` connect from the record alone (env still
|
||||
overrides toggles like `MQTT_TLS`).
|
||||
- `topic_prefix` → the events topic is `<topic_prefix>/events`.
|
||||
- `last_seq` backs the monotonic `seq` counter so it survives process restarts.
|
||||
- `expected_artifacts` is the hook a user `validate.sh` checks (existence/content).
|
||||
- `auth_token` is `null` in PoC; production sets `secrets.token_urlsafe(32)`.
|
||||
|
||||
---
|
||||
|
||||
## 3. Concurrency rules
|
||||
|
||||
### PoC — fcntl advisory lock
|
||||
|
||||
Every read-modify-write (`register_job`, `pick_pending`, `update_status`,
|
||||
`next_seq`) runs inside `registry_lock(registry_dir)`, an exclusive
|
||||
`fcntl.flock` over `.lock`. Single-host, good enough for many tmux sessions on
|
||||
one machine.
|
||||
|
||||
### Production — SQLite WAL
|
||||
|
||||
When delegation spans **multiple hosts**, the file lock no longer serialises
|
||||
across machines. Migrate the same operations to a SQLite database in WAL mode
|
||||
(`PRAGMA journal_mode=WAL`) with a transaction per claim. The function
|
||||
signatures stay identical; only the storage backend changes.
|
||||
|
||||
---
|
||||
|
||||
## 4. How multiple sessions take only their own work
|
||||
|
||||
Each tmux session carries an `agent_session` label (`tmux:claude`,
|
||||
`tmux:claude-a`, `tmux:claude-b`, …). `pick_pending(agent_session)`:
|
||||
|
||||
1. acquires the registry lock,
|
||||
2. scans for the **oldest** record with `status == "pending"` **and**
|
||||
matching `agent_session`,
|
||||
3. flips it to `running` and writes it back **atomically**,
|
||||
4. releases the lock and returns the `job_id` (or `None`).
|
||||
|
||||
Because the scan + flip happen under one lock, two sessions can never claim the
|
||||
same job. Sessions with distinct labels naturally partition the work; sessions
|
||||
sharing a label compete safely — first to acquire the lock wins, the other sees
|
||||
the job already `running` and moves on.
|
||||
|
||||
```bash
|
||||
# session A only ever runs its own pending jobs
|
||||
$PY scripts/registry.py pick --agent-session tmux:claude-a # prints id or exits 3
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Atomic status updates
|
||||
|
||||
All writes use a temp-file + `os.replace` rename, which is atomic on POSIX:
|
||||
|
||||
1. take the registry lock,
|
||||
2. load the current record,
|
||||
3. mutate fields + refresh `updated_at` (and `last_seq` for `next_seq`),
|
||||
4. write to `.<job_id>.<rand>.tmp` in the **same directory**, `fsync`,
|
||||
5. `os.replace(tmp, <job_id>.json)`,
|
||||
6. release the lock.
|
||||
|
||||
A reader therefore always sees either the old or the new complete record, never
|
||||
a half-written file. This is the file-based equivalent of the rename trick
|
||||
(`pending.<session>` → `running.<session>`) and maps cleanly onto a single
|
||||
SQLite transaction when you migrate.
|
||||
|
||||
---
|
||||
|
||||
## 6. CLI quick reference
|
||||
|
||||
```bash
|
||||
PY=.venv/bin/python
|
||||
$PY scripts/registry.py register --prompt "…" --agent claude-code \
|
||||
--agent-session tmux:claude --timeout 3600 --idle-timeout 120 # → prints job_id
|
||||
$PY scripts/registry.py list # human table
|
||||
$PY scripts/registry.py list --json # full records
|
||||
$PY scripts/registry.py get --job <id> # one record
|
||||
$PY scripts/registry.py status --job <id> --set completed # set status
|
||||
$PY scripts/registry.py pick --agent-session tmux:claude # claim → running
|
||||
```
|
||||
|
||||
Exit codes: `0` ok, `1` not found / bad status, `3` (`pick`) no pending job for
|
||||
that session.
|
||||
|
||||
---
|
||||
|
||||
## 7. Persistent audit log
|
||||
|
||||
Separate from the registry, every job is also mirrored to a durable append-only
|
||||
audit log at `.hermes/delegate_job_logs/<job_id>/` (override with
|
||||
`DELEGATE_JOB_LOGS_DIR`, default `<cwd>/.hermes/delegate_job_logs`). The registry
|
||||
is **live state** mutated in place; the audit log is **history** that survives
|
||||
even after the registry dir is cleaned up. It is git-ignored.
|
||||
|
||||
```
|
||||
.hermes/delegate_job_logs/<job_id>/
|
||||
meta.json # registration snapshot (the full job record at register time)
|
||||
events.ndjson # append-only, one JSON event per line, time-ordered
|
||||
status.json # current status only (fast point-query)
|
||||
```
|
||||
|
||||
`events.ndjson` lines are written automatically at four points:
|
||||
|
||||
| Trigger | line `event` | Source |
|
||||
|---------|-------------|--------|
|
||||
| `register_job` | `registered` | `registry.register_job` → `mqtt_common.init_job_log` |
|
||||
| status change (`update_status`, `pick`, publish status sync) | `status_changed` (`from`/`to`) | `mqtt_common.update_job_status` / `pick_pending` |
|
||||
| event published | `published` (embeds the exact payload) | `publish_event.py` |
|
||||
| event received | `received` | `job_subscriber.py` |
|
||||
|
||||
Helpers live in [`./scripts/mqtt_common.py`](./scripts/mqtt_common.py):
|
||||
`LOGS_DIR`, `job_log_path`, `init_job_log`, `append_event` (fcntl-locked,
|
||||
concurrent-append safe), `update_logged_status`, and the readers
|
||||
`read_logged_meta` / `read_logged_status` / `iter_logged_events` /
|
||||
`list_logged_jobs`. Every writer is **best-effort and isolated** — wrapped in
|
||||
`try/except` with a `logger.warning`, so an audit-log failure never breaks the
|
||||
registry write, the publish, or the subscribe it shadows.
|
||||
|
||||
Read them via the CLI:
|
||||
|
||||
```bash
|
||||
PY=.venv/bin/python
|
||||
$PY scripts/registry.py logs <job_id> # pretty timeline
|
||||
$PY scripts/registry.py logs <job_id> --tail 20 # last 20 events
|
||||
$PY scripts/registry.py logs <job_id> --json # raw JSON lines
|
||||
$PY scripts/registry.py logs --list # every job, live status
|
||||
```
|
||||
@@ -0,0 +1,2 @@
|
||||
paho-mqtt>=2.0.0
|
||||
pyyaml
|
||||
+252
@@ -0,0 +1,252 @@
|
||||
#!/usr/bin/env python3
|
||||
"""job_subscriber.py — the single entry point for observing Job events.
|
||||
|
||||
Subscribes to one job's ``<topic_prefix>/events`` (or, with ``--wait-any``, the
|
||||
events of every running/pending job in the registry), prints one line to stdout
|
||||
per accepted event, and exits on a terminal event or a timeout.
|
||||
|
||||
Design points (all flagged in the PLAN review):
|
||||
- terminal state machine: ``completed``/``error`` is acted on exactly once per
|
||||
job, so QoS-1 duplicates or an ``error``-after-``completed`` reorder are safe.
|
||||
- dual timeouts: a wall-clock ``--timeout`` (total budget, started at
|
||||
subscribe time so a cold start can't hang forever) AND an idle
|
||||
``--idle-timeout`` (no new event for N seconds).
|
||||
- defensive parsing: undecodable payloads, ``schema_version`` mismatches, and
|
||||
``job_id`` values we did not subscribe for are logged and dropped.
|
||||
|
||||
stdout = event lines only. Diagnostics go to stderr via logging.
|
||||
|
||||
Exit codes:
|
||||
0 all watched jobs reached ``completed``
|
||||
1 any watched job reached ``error``
|
||||
2 timed out (wall-clock or idle) before all jobs finished, or nothing to watch (unknown --job id / no pending or running jobs)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import queue
|
||||
import sys
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple
|
||||
|
||||
import mqtt_common
|
||||
import registry
|
||||
from mqtt_common import (
|
||||
DEFAULT_REGISTRY_DIR,
|
||||
SCHEMA_VERSION,
|
||||
broker_config_from_job,
|
||||
load_job,
|
||||
make_client,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("delegate_job.job_subscriber")
|
||||
|
||||
TERMINAL_EVENTS = ("completed", "error")
|
||||
|
||||
|
||||
def _format_line(topic: str, payload: Dict[str, Any]) -> str:
|
||||
return (
|
||||
f"{payload.get('timestamp','-')} "
|
||||
f"job={payload.get('job_id','?')} "
|
||||
f"seq={payload.get('seq','?')} "
|
||||
f"{payload.get('event','?'):<20} "
|
||||
f"{payload.get('detail','')}"
|
||||
)
|
||||
|
||||
|
||||
class _Watcher:
    """Validates incoming MQTT messages and queues the accepted ones.

    Holds the shared queue plus the set of job_ids we accept events for.
    ``on_message`` is meant to be installed as the paho message callback; the
    consumer drains ``events``.
    """

    def __init__(self, expected_job_ids: Set[str], expected_tokens: Dict[str, Optional[str]]):
        # (topic, payload) pairs that passed every check in on_message.
        self.events: "queue.Queue[Tuple[str, Dict[str, Any]]]" = queue.Queue()
        self.expected = set(expected_job_ids)
        self.tokens = expected_tokens  # job_id -> expected auth_token (or None)
        # Highest seq accepted so far per job; 0 means nothing accepted yet.
        self.last_seq: Dict[str, int] = {jid: 0 for jid in expected_job_ids}

    def on_message(self, _client, _userdata, msg) -> None:
        """Validate one message; log and drop it, or enqueue it.

        Checks, in order: UTF-8 JSON object, matching ``schema_version``,
        a ``job_id`` we watch, HMAC signature (when the job has a token), and
        a strictly increasing integer ``seq``. Accepted events are mirrored to
        the audit log and put on ``self.events``.
        """
        # --- defensive parsing -------------------------------------------
        try:
            payload = json.loads(msg.payload.decode("utf-8"))
        except (UnicodeDecodeError, json.JSONDecodeError) as exc:
            logger.warning("drop unparseable payload on %s: %s", msg.topic, exc)
            return
        if not isinstance(payload, dict):
            logger.warning("drop non-object payload on %s", msg.topic)
            return
        if payload.get("schema_version") != SCHEMA_VERSION:
            logger.warning("drop event with schema_version=%r (expected %d)",
                           payload.get("schema_version"), SCHEMA_VERSION)
            return
        jid = payload.get("job_id")
        # A JSON list/object job_id is unhashable and would raise TypeError in
        # the set membership test; treat it as "not one of ours" instead of
        # letting the exception escape the broker callback.
        try:
            is_expected = jid in self.expected
        except TypeError:
            is_expected = False
        if not is_expected:
            logger.warning("drop event for unexpected job_id=%r on %s", jid, msg.topic)
            return
        # --- production auth check: the payload's HMAC signature must verify
        # against the job's auth_token (verify_hmac accepts everything when
        # the job has no token) ---
        expected_token = self.tokens.get(jid)
        if not mqtt_common.verify_hmac(payload, expected_token):
            logger.warning("drop event for job %s: HMAC verify failed", jid)
            return
        # --- replay attack defense: check monotonic sequence ---
        seq = payload.get("seq")
        # bool is a subclass of int, so JSON true/false must be excluded
        # explicitly or it would be accepted as seq 1/0.
        if isinstance(seq, bool) or not isinstance(seq, int):
            logger.warning("drop event for job %s: missing or invalid seq", jid)
            return
        if seq <= self.last_seq.get(jid, 0):
            logger.warning("drop event for job %s: seq %d is not monotonically increasing (last %d)",
                           jid, seq, self.last_seq.get(jid, 0))
            return
        self.last_seq[jid] = seq
        # Persistent audit log from the *subscriber's* vantage point: every event
        # that survives defensive parsing is recorded here, including ones a
        # different host published. This is the external-observer record that
        # backstops the publisher's own "published" line if it never wrote one.
        mqtt_common.append_event(jid, {
            "event": "received",
            "source_event": payload.get("event"),
            "seq": payload.get("seq"),
            "topic": msg.topic,
            "timestamp": payload.get("timestamp"),
            "detail": payload.get("detail", ""),
        })
        self.events.put((msg.topic, payload))
|
||||
|
||||
|
||||
def _collect_jobs(args) -> List[Dict[str, Any]]:
    """Work out which job records this run has to observe.

    Without ``--wait-any`` the single record named by ``args.job`` is loaded
    (``load_job`` raises FileNotFoundError for an unknown id). With
    ``--wait-any`` every registry record whose status is ``pending`` or
    ``running`` is returned; an empty result is logged as an error and handed
    back to the caller as an empty list.
    """
    if not args.wait_any:
        return [load_job(args.job, args.registry_dir)]  # raises FileNotFoundError

    active: List[Dict[str, Any]] = []
    for record in registry.list_jobs(args.registry_dir):
        if record.get("status") in ("pending", "running"):
            active.append(record)
    if not active:
        logger.error("no pending/running jobs to wait for")
    return active
|
||||
|
||||
|
||||
def main(argv=None) -> int:
    """CLI entry point: watch one job (or every active job) until it finishes.

    Args:
        argv: argument list for argparse; ``None`` means ``sys.argv[1:]``.

    Returns:
        0 when every watched job reached ``completed``, 1 when any reached
        ``error``, 2 on wall-clock/idle timeout or when there was nothing to
        watch (unknown ``--job`` id, or no pending/running job).
    """
    parser = argparse.ArgumentParser(description="Subscribe to Job events on MQTT")
    target = parser.add_mutually_exclusive_group(required=True)
    target.add_argument("--job", help="job id to watch")
    target.add_argument("--wait-any", action="store_true",
                        help="watch every pending/running job in the registry")
    parser.add_argument("--timeout", type=float, default=None,
                        help="wall-clock budget in seconds (default: job.timeout_sec or 3600)")
    parser.add_argument("--idle-timeout", type=float, default=None,
                        help="max seconds with no new event (default: job.idle_timeout_sec or 120)")
    parser.add_argument("--expect-retention", action="store_true",
                        help="warn if no retained terminal event arrives promptly")
    parser.add_argument("--registry-dir", default=DEFAULT_REGISTRY_DIR)
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args(argv)

    mqtt_common.setup_logging(logging.DEBUG if args.verbose else logging.WARNING)

    try:
        jobs = _collect_jobs(args)
    except FileNotFoundError as exc:
        logger.error("%s", exc)
        return 2
    if not jobs:
        return 2

    expected_ids: Set[str] = {j["job_id"] for j in jobs}
    tokens = {j["job_id"]: j.get("auth_token") for j in jobs}
    watcher = _Watcher(expected_ids, tokens)

    # Resolve timeouts from CLI, falling back to the (first) job's settings.
    base_job = jobs[0]

    def _resolve_timeout(cli_value: Optional[float], key: str, fallback: float) -> float:
        # A record may carry an explicit JSON null for the field; float(None)
        # would raise TypeError, so treat null like a missing key.
        if cli_value is not None:
            return cli_value
        stored = base_job.get(key)
        return float(fallback if stored is None else stored)

    wall_timeout = _resolve_timeout(args.timeout, "timeout_sec", 3600)
    idle_timeout = _resolve_timeout(args.idle_timeout, "idle_timeout_sec", 120)

    # All watched jobs share a broker in practice; connect using the first
    # job's broker and subscribe to each job's events topic.
    config = broker_config_from_job(base_job)
    client = make_client("subscriber", config)
    client.on_message = watcher.on_message

    subscribed_topics = []
    for job in jobs:
        prefix = job.get("topic_prefix") or mqtt_common.topic_prefix_for(job["job_id"])
        subscribed_topics.append(f"{prefix}/events")

    def on_connect(_c, _u, _flags, reason_code, _props):
        # Subscribing from the connect callback means the subscriptions are
        # re-issued every time this callback fires.
        if mqtt_common.reason_code_value(reason_code) != 0:
            logger.error("broker connection failed: rc=%s", reason_code)
            return
        for topic in subscribed_topics:
            _c.subscribe(topic, qos=1)
            logger.info("subscribed to %s", topic)

    def on_disconnect(_c, _u, _flags, reason_code, _props):
        rc = mqtt_common.reason_code_value(reason_code)
        if rc != 0:
            logger.warning("broker disconnected (rc=%s); will retry reconnect", reason_code)

    client.on_connect = on_connect
    client.on_disconnect = on_disconnect
    client.reconnect_delay_set(min_delay=1, max_delay=16)
    mqtt_common.with_retry(
        lambda: client.connect(config.host, config.port, config.keepalive),
        attempts=5, base_delay=1.0, max_delay=16.0
    )()
    client.loop_start()

    terminal: Dict[str, str] = {}  # job_id -> "completed"/"error"
    pending: Set[str] = set(expected_ids)
    start = time.monotonic()
    wall_deadline = start + wall_timeout
    last_event = start
    # When --expect-retention is off there is nothing to check, so the flag
    # starts out already satisfied.
    retention_checked = not args.expect_retention

    try:
        while pending:
            now = time.monotonic()
            if now >= wall_deadline:
                logger.error("wall-clock timeout (%.0fs); still pending: %s",
                             wall_timeout, ", ".join(sorted(pending)))
                return 2
            idle_left = idle_timeout - (now - last_event)
            if idle_left <= 0:
                logger.error("idle timeout (%.0fs, no events); still pending: %s",
                             idle_timeout, ", ".join(sorted(pending)))
                return 2
            # Wake at least once a second so both deadlines are re-evaluated.
            wait = min(wall_deadline - now, idle_left, 1.0)
            try:
                topic, payload = watcher.events.get(timeout=wait)
            except queue.Empty:
                if not retention_checked and (now - start) > 3.0:
                    logger.warning("--expect-retention set but no retained "
                                   "terminal event observed yet")
                    retention_checked = True
                continue

            last_event = time.monotonic()
            retention_checked = True
            print(_format_line(topic, payload), flush=True)

            jid = payload["job_id"]
            event = payload.get("event")
            if event in TERMINAL_EVENTS:
                if jid in terminal:
                    # Already finalised: ignore duplicates / late reorders.
                    logger.info("ignoring duplicate terminal %s for %s", event, jid)
                    continue
                terminal[jid] = event
                pending.discard(jid)
    finally:
        client.loop_stop()
        try:
            client.disconnect()
        except Exception:  # pragma: no cover
            pass

    # All jobs reached a terminal state. error wins over completed.
    if any(state == "error" for state in terminal.values()):
        return 1
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,600 @@
|
||||
"""Shared MQTT + registry helpers for the tmux-agent-orchestrate-delegate-job skill.
|
||||
|
||||
Single entry point for:
|
||||
- broker configuration (env -> dataclass),
|
||||
- paho client construction (auth + TLS + unique client id),
|
||||
- monotonic per-job sequence counters,
|
||||
- retry-with-exponential-backoff,
|
||||
- atomic registry record load/update under an fcntl lock.
|
||||
|
||||
Requires paho-mqtt >= 2.0 (uses CallbackAPIVersion.VERSION2).
|
||||
|
||||
This module is the *only* place that talks to the broker config and to the
|
||||
raw job record file, so PoC -> production migration touches just env/registry
|
||||
values, never code (see references/mqtt-broker-setup.md).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import functools
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
import time
|
||||
import uuid
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional
|
||||
|
||||
import paho.mqtt.client as mqtt
|
||||
|
||||
logger = logging.getLogger("delegate_job.mqtt_common")
|
||||
|
||||
def _load_dotenv(workspace_dir: str = None) -> None:
|
||||
"""Load .env file from workspace if it exists and env var not already set.
|
||||
|
||||
This ensures Python scripts get the same env vars as the shell wrapper
|
||||
scripts that source .env. Only sets vars that are not already in os.environ
|
||||
(i.e. OS env takes precedence over .env file).
|
||||
"""
|
||||
import os
|
||||
if workspace_dir is None:
|
||||
# Walk up from this script to find workspace root
|
||||
d = os.path.dirname(os.path.abspath(__file__))
|
||||
for _ in range(5):
|
||||
if os.path.isfile(os.path.join(d, ".env")):
|
||||
break
|
||||
d = os.path.dirname(d)
|
||||
else:
|
||||
d = workspace_dir
|
||||
env_path = os.path.join(d, ".env")
|
||||
if not os.path.isfile(env_path):
|
||||
return
|
||||
with open(env_path, "r") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
if "=" in line:
|
||||
key, _, val = line.partition("=")
|
||||
key = key.strip()
|
||||
val = val.strip().strip('"').strip("'")
|
||||
if key and key not in os.environ:
|
||||
os.environ[key] = val
|
||||
|
||||
_load_dotenv()
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Constants
|
||||
# --------------------------------------------------------------------------
|
||||
SCHEMA_VERSION = 1
|
||||
DEFAULT_REGISTRY_DIR = ".hermes/jobs"
|
||||
DEFAULT_TOPIC_ROOT = "python/mqtt/jobs"
|
||||
LOCK_FILENAME = ".lock"
|
||||
|
||||
# Persistent audit-log layout: .hermes/delegate_job_logs/<job_id>/{meta,events,status}.
|
||||
# This is a *separate* artifact from the registry: the registry is the live job
|
||||
# record (mutated in place), the audit log is an append-only history that
|
||||
# survives even if the registry dir is cleaned up.
|
||||
META_FILENAME = "meta.json"
|
||||
EVENTS_FILENAME = "events.ndjson"
|
||||
STATUS_FILENAME = "status.json"
|
||||
|
||||
|
||||
def _default_logs_dir() -> str:
|
||||
"""Audit-log root. Overridable with ``DELEGATE_JOB_LOGS_DIR``; otherwise
|
||||
``<cwd>/.hermes/delegate_job_logs`` — we keep audit logs next to the
|
||||
live registry (``.hermes/jobs/``) so the two runtime artifacts sit
|
||||
under the same parent dir and follow the same ``.gitignore`` rule.
|
||||
The cwd of whichever process emits events (the bash wrapper and
|
||||
scripts) is used as the anchor."""
|
||||
env = os.environ.get("DELEGATE_JOB_LOGS_DIR")
|
||||
if env and env.strip():
|
||||
return env
|
||||
return os.path.join(os.getcwd(), ".hermes", "delegate_job_logs")
|
||||
|
||||
|
||||
LOGS_DIR = _default_logs_dir()
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Broker configuration
|
||||
# --------------------------------------------------------------------------
|
||||
@dataclass
class BrokerConfig:
    """Connection settings for one MQTT broker, fully resolved.

    The defaults point at the public HiveMQ broker used by the PoC. Production
    values come from environment variables or from the ``broker.*`` block of a
    job record (see ``broker_config_from_job``).
    """

    host: str = "broker.hivemq.com"
    port: int = 1883
    tls: bool = False
    username: Optional[str] = None
    password: Optional[str] = None
    client_id_prefix: str = "hermes"
    # TLS material (only consulted when tls is True).
    ca_certs: Optional[str] = None
    certfile: Optional[str] = None
    keyfile: Optional[str] = None
    keepalive: int = 60

    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dict."""
        return asdict(self)

    def to_registry_block(self) -> Dict[str, Any]:
        """Return only the fields that are stored in a job record's broker block."""
        persisted = ("host", "port", "tls", "username", "password")
        return {field_name: getattr(self, field_name) for field_name in persisted}
|
||||
|
||||
|
||||
def _env_bool(name: str, default: bool = False) -> bool:
|
||||
raw = os.environ.get(name)
|
||||
if raw is None:
|
||||
return default
|
||||
return raw.strip().lower() in ("1", "true", "yes", "on")
|
||||
|
||||
|
||||
def _env_int(name: str, default: int) -> int:
|
||||
raw = os.environ.get(name)
|
||||
if raw is None or raw.strip() == "":
|
||||
return default
|
||||
try:
|
||||
return int(raw)
|
||||
except ValueError:
|
||||
logger.warning("invalid int for %s=%r; using default %d", name, raw, default)
|
||||
return default
|
||||
|
||||
|
||||
def broker_config_from_env(overrides: Optional[Dict[str, Any]] = None) -> BrokerConfig:
    """Assemble a :class:`BrokerConfig` from the process environment.

    Variables consulted (all optional; PoC default in parentheses):
      MQTT_BROKER (broker.hivemq.com), MQTT_PORT (1883), MQTT_TLS (0),
      MQTT_USERNAME, MQTT_PASSWORD, MQTT_CLIENT_ID_PREFIX (hermes),
      MQTT_CA_CERTS, MQTT_CERTFILE, MQTT_KEYFILE, MQTT_KEEPALIVE (60).

    Empty-string credentials and certificate paths are normalised to ``None``.

    ``overrides`` (for example a job record's broker block) is applied last:
    every entry whose value is not ``None`` and whose key names an existing
    attribute of the config replaces the environment-derived value.
    """
    env = os.environ.get
    cfg = BrokerConfig(
        host=env("MQTT_BROKER", "broker.hivemq.com"),
        port=_env_int("MQTT_PORT", 1883),
        tls=_env_bool("MQTT_TLS", False),
        username=env("MQTT_USERNAME") or None,
        password=env("MQTT_PASSWORD") or None,
        client_id_prefix=env("MQTT_CLIENT_ID_PREFIX", "hermes"),
        ca_certs=env("MQTT_CA_CERTS") or None,
        certfile=env("MQTT_CERTFILE") or None,
        keyfile=env("MQTT_KEYFILE") or None,
        keepalive=_env_int("MQTT_KEEPALIVE", 60),
    )
    for key, value in (overrides or {}).items():
        if value is None or not hasattr(cfg, key):
            continue
        setattr(cfg, key, value)
    return cfg
|
||||
|
||||
|
||||
def broker_config_from_job(job: Dict[str, Any]) -> BrokerConfig:
    """Resolve the broker config for one job record.

    Starts from the environment-derived config and layers the record's
    ``broker`` block on top via ``broker_config_from_env(overrides=...)``.
    A missing or empty block leaves the environment values untouched.
    """
    broker_block = job.get("broker") or {}
    return broker_config_from_env(overrides=broker_block)
|
||||
|
||||
|
||||
def make_client(role: str, config: Optional[BrokerConfig] = None) -> mqtt.Client:
    """Build a paho ``Client`` for ``role`` without connecting it.

    The client id has the form ``<prefix>-<role>-<8 hex chars>``, with the
    hex part taken from a fresh UUID4 so that concurrent publishers and
    subscribers get distinct ids. Username/password and TLS are configured
    only when the config provides them; with no config the environment-derived
    one is used.
    """
    if not config:
        config = broker_config_from_env()
    suffix = uuid.uuid4().hex[:8]
    client_id = f"{config.client_id_prefix}-{role}-{suffix}"
    client = mqtt.Client(
        callback_api_version=mqtt.CallbackAPIVersion.VERSION2,
        client_id=client_id,
    )
    if config.username:
        client.username_pw_set(config.username, config.password)
    if config.tls:
        # If ca_certs is None paho uses the system trust store (good enough for
        # public CAs); a private CA bundle path is passed through unchanged.
        tls_material = {
            "ca_certs": config.ca_certs,
            "certfile": config.certfile,
            "keyfile": config.keyfile,
        }
        client.tls_set(**tls_material)
    logger.debug("built client id=%s tls=%s host=%s", client_id, config.tls, config.host)
    return client
|
||||
|
||||
|
||||
def reason_code_value(rc: Any) -> int:
    """Turn a paho connect/disconnect reason code into a plain int.

    paho-mqtt 2.x passes callbacks a ``ReasonCode`` object exposing
    ``.value``; other paths may pass a bare int. Either way the numeric value
    is returned, where 0 means success.
    """
    try:
        raw = rc.value
    except AttributeError:
        raw = rc
    return int(raw)
|
||||
|
||||
|
||||
def verify_hmac(payload: dict, auth_token: Optional[str]) -> bool:
    """Check the HMAC-SHA256 signature carried in ``payload["data"]["hmac_sig"]``.

    The signed message is the payload with ``hmac_sig`` removed from ``data``,
    serialised as compact JSON with sorted keys, keyed with ``auth_token``.

    Args:
        payload: decoded event object (untrusted; it comes off the broker).
        auth_token: the job's shared secret, or ``None``/empty in PoC mode.

    Returns:
        True when no token is configured or the signature matches; False when
        the signature is missing, malformed, or does not match. Malformed
        input never raises.
    """
    if not auth_token:
        return True  # PoC mode — no auth
    data = payload.get("data")
    # ``data`` may be null or a non-object in a hostile/garbled payload;
    # calling .get()/.items() on it would raise instead of failing closed.
    if not isinstance(data, dict):
        return False
    sig = data.get("hmac_sig")
    # compare_digest rejects non-str and non-ASCII str arguments with a
    # TypeError; a hex digest is always ASCII, so such a value cannot match.
    if not isinstance(sig, str) or not sig or not sig.isascii():
        return False
    unsigned = {k: v for k, v in payload.items() if k != "data"}
    unsigned["data"] = {k: v for k, v in data.items() if k != "hmac_sig"}
    msg = json.dumps(unsigned, sort_keys=True, separators=(",", ":")).encode()
    expected = hmac.new(auth_token.encode(), msg, hashlib.sha256).hexdigest()
    # Constant-time comparison to avoid leaking the digest via timing.
    return hmac.compare_digest(sig, expected)
|
||||
|
||||
|
||||
def topic_prefix_for(job_id: str, root: str = DEFAULT_TOPIC_ROOT) -> str:
    """Return the MQTT topic prefix of a job: ``<root>/<job_id>``."""
    return "{}/{}".format(root, job_id)
|
||||
|
||||
|
||||
def events_topic_for(job_id: str, root: str = DEFAULT_TOPIC_ROOT) -> str:
    """Return the events topic of a job: ``<root>/<job_id>/events``."""
    prefix = topic_prefix_for(job_id, root)
    return "{}/events".format(prefix)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Registry primitives (single source of truth for raw record I/O)
|
||||
# --------------------------------------------------------------------------
|
||||
def _job_path(job_id: str, registry_dir: str) -> Path:
|
||||
return Path(registry_dir) / f"{job_id}.json"
|
||||
|
||||
|
||||
def _lock_path(registry_dir: str) -> Path:
    """Return the path of the registry-wide lock file inside ``registry_dir``."""
    return Path(registry_dir).joinpath(LOCK_FILENAME)
|
||||
|
||||
|
||||
@contextmanager
def registry_lock(registry_dir: str):
    """Context manager holding an exclusive ``fcntl.flock`` on the registry lock file.

    The registry directory is created if needed. The lock is advisory and is
    meant for single-host use: scripts wrap their read-modify-write of job
    records in it so that two sessions never claim the same pending job. For
    multi-host delegation move to SQLite WAL (see references/registry.md).
    On exit the lock is released and then the file handle is closed, even if
    the body raised.
    """
    import fcntl  # POSIX only; imported lazily so import works on Windows.

    directory = Path(registry_dir)
    directory.mkdir(parents=True, exist_ok=True)
    with open(_lock_path(registry_dir), "a+") as handle:
        try:
            fcntl.flock(handle.fileno(), fcntl.LOCK_EX)
            yield
        finally:
            fcntl.flock(handle.fileno(), fcntl.LOCK_UN)
|
||||
|
||||
|
||||
def load_job(job_id: str, registry_dir: str = DEFAULT_REGISTRY_DIR) -> Dict[str, Any]:
    """Read and JSON-decode the record of ``job_id``.

    Raises:
        FileNotFoundError: when the record file does not exist.
    """
    record_file = _job_path(job_id, registry_dir)
    if record_file.exists():
        with open(record_file, "r", encoding="utf-8") as fh:
            return json.load(fh)
    raise FileNotFoundError(f"job record not found: {record_file}")
|
||||
|
||||
|
||||
def _atomic_write_record(job_id: str, registry_dir: str, record: Dict[str, Any]) -> None:
    """Persist ``record`` as the job's JSON file without exposing partial writes.

    The record is serialised to a temporary file in the same directory,
    flushed and fsynced, then moved over the target with ``os.replace``. The
    rename is atomic on POSIX, so readers never observe a half-written file.
    The final file is set to mode 0600 on a best-effort basis. On any failure
    the temporary file is removed and the exception re-raised.

    Callers MUST already hold ``registry_lock`` for read-modify-write
    correctness.
    """
    Path(registry_dir).mkdir(parents=True, exist_ok=True)
    target = _job_path(job_id, registry_dir)
    handle_fd, scratch = tempfile.mkstemp(
        dir=str(target.parent), prefix=f".{job_id}.", suffix=".tmp"
    )
    try:
        with os.fdopen(handle_fd, "w", encoding="utf-8") as out:
            json.dump(record, out, ensure_ascii=False, indent=2)
            out.write("\n")
            out.flush()
            os.fsync(out.fileno())
        os.replace(scratch, target)
        try:
            os.chmod(target, 0o600)
        except Exception:
            # Permission tightening is best-effort; the write already succeeded.
            pass
    except BaseException:
        # Covers KeyboardInterrupt too: never leave a stray temp file behind.
        if os.path.exists(scratch):
            os.unlink(scratch)
        raise
|
||||
|
||||
|
||||
def update_job_status(job_id: str, registry_dir: str = DEFAULT_REGISTRY_DIR, **fields: Any) -> Dict[str, Any]:
    """Merge ``fields`` into a job record under the registry lock.

    ``updated_at`` is always refreshed. Returns the new record. Raises
    FileNotFoundError if the job does not exist.

    This is the single chokepoint for status writes (both
    ``registry.update_status`` and ``publish_event.py``'s status sync route
    through here). When ``fields`` contains ``status`` it therefore also
    mirrors the status into the persistent audit log, and appends a
    ``status_changed`` event if the value actually changed. The mirroring is
    best-effort and runs after the registry lock is released, so a slow or
    failed log write never blocks the record.
    """
    with registry_lock(registry_dir):
        record = load_job(job_id, registry_dir)
        previous = record.get("status")
        record.update(fields)
        record["updated_at"] = _utcnow()
        _atomic_write_record(job_id, registry_dir, record)

    if "status" not in fields:
        return record

    current = record.get("status")
    stamp = record["updated_at"]
    update_logged_status(job_id, current, updated_at=stamp)
    if previous != current:
        transition = {
            "event": "status_changed",
            "from": previous,
            "to": current,
            "timestamp": stamp,
        }
        append_event(job_id, transition)
    return record
|
||||
|
||||
|
||||
def next_seq(job_id: str, registry_dir: str = DEFAULT_REGISTRY_DIR) -> int:
    """Allocate and return the next sequence number for a job.

    The counter lives in the record's ``last_seq`` field, so it stays
    consistent across process restarts. A record without ``last_seq`` starts
    at 0, so the first call returns 1. ``updated_at`` is refreshed as well.
    """
    with registry_lock(registry_dir):
        record = load_job(job_id, registry_dir)
        allocated = 1 + int(record.get("last_seq", 0))
        record["last_seq"] = allocated
        record["updated_at"] = _utcnow()
        _atomic_write_record(job_id, registry_dir, record)
    return allocated
|
||||
|
||||
|
||||
def _utcnow() -> str:
    """Current UTC time as ISO-8601 with second resolution and a trailing ``Z``
    (used for the payload `timestamp` field)."""
    layout = "%Y-%m-%dT%H:%M:%SZ"
    return time.strftime(layout, time.gmtime())
|
||||
|
||||
|
||||
def _utcnow_precise() -> str:
    """Current UTC time as ISO-8601 with millisecond resolution and a trailing
    ``Z``. Used for the audit log's ``logged_at`` so events sort cleanly even
    within the same second."""
    now = time.time()
    millis = int((now % 1) * 1000)  # fractional second, truncated to ms
    seconds_part = time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime(now))
    return "%s.%03dZ" % (seconds_part, millis)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Persistent audit log (.hermes/delegate_job_logs/<job_id>/...)
|
||||
#
|
||||
# Every function here is idempotent, concurrency-safe, and *best-effort*: a
|
||||
# logging failure is swallowed with a logger.warning and never propagated, so it
|
||||
# can never break a publish, a subscribe, or a registry write. stdout is never
|
||||
# touched (it is reserved for data output).
|
||||
# --------------------------------------------------------------------------
|
||||
def job_log_dir(job_id: str, logs_dir: Optional[str] = None) -> Path:
    """Return the audit-log directory of a job: ``<logs root>/<job_id>``.

    The root is ``logs_dir`` when given (and non-empty), else ``LOGS_DIR``."""
    root = logs_dir or LOGS_DIR
    return Path(root).joinpath(job_id)
|
||||
|
||||
|
||||
def job_log_path(job_id: str, kind: str, logs_dir: Optional[str] = None) -> Path:
    """Return the path of one audit-log file inside a job's log directory.

    ``kind`` is the filename, e.g. one of the module constants META_FILENAME /
    EVENTS_FILENAME / STATUS_FILENAME."""
    return job_log_dir(job_id, logs_dir).joinpath(kind)
|
||||
|
||||
|
||||
@contextmanager
def _file_lock(fh):
    """Hold an exclusive ``fcntl.flock`` on the open file ``fh`` for the body.

    Used so two processes appending to events.ndjson never interleave a line.
    Where fcntl cannot be imported (Windows) this is a no-op; a short append is
    atomic enough there."""
    try:
        import fcntl
    except ImportError:  # pragma: no cover - non-POSIX
        fcntl = None

    if fcntl is None:  # pragma: no cover - non-POSIX
        yield
        return

    fcntl.flock(fh.fileno(), fcntl.LOCK_EX)
    try:
        yield
    finally:
        fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
|
||||
|
||||
|
||||
def append_event(job_id: str, event_dict: Dict[str, Any], logs_dir: Optional[str] = None) -> None:
    """Append one event as a JSON line to ``<logs>/<job_id>/events.ndjson``.

    The caller's dict is copied, not mutated. A millisecond ``logged_at`` is
    stamped when the caller did not supply one. The write happens under an
    fcntl lock on the file. Best-effort: any failure is logged as a warning
    and swallowed."""
    try:
        target = job_log_path(job_id, EVENTS_FILENAME, logs_dir)
        target.parent.mkdir(parents=True, exist_ok=True)
        entry = dict(event_dict)
        if "logged_at" not in entry:
            entry["logged_at"] = _utcnow_precise()
        serialized = json.dumps(entry, ensure_ascii=False) + "\n"
        # The lock is released before the file is closed (reverse order).
        with open(target, "a", encoding="utf-8") as fh, _file_lock(fh):
            fh.write(serialized)
            fh.flush()
    except Exception as exc:  # pragma: no cover - best effort
        logger.warning("append_event failed for job %s: %s", job_id, exc)
|
||||
|
||||
|
||||
def update_logged_status(job_id: str, status: str, logs_dir: Optional[str] = None, **extras: Any) -> None:
    """Rewrite ``<logs>/<job_id>/status.json`` (current status for fast point
    queries) atomically.

    The record is ``{"job_id", "status", "updated_at"}`` with ``extras`` merged
    on top (so a caller-supplied ``updated_at`` wins). It is written to a temp
    file in the same directory and moved into place with ``os.replace``.

    The temp name is unique per process and thread. A single shared temp name
    would let two concurrent writers truncate each other's file, and make the
    slower one's ``os.replace`` fail.

    Best-effort: any failure is logged as a warning and swallowed, and a
    leftover temp file is removed.
    """
    tmp: Optional[Path] = None
    try:
        import threading  # local: only needed to build a unique temp name

        path = job_log_path(job_id, STATUS_FILENAME, logs_dir)
        path.parent.mkdir(parents=True, exist_ok=True)
        record: Dict[str, Any] = {"job_id": job_id, "status": status, "updated_at": _utcnow()}
        record.update(extras)
        tmp = path.with_name(f"{path.name}.{os.getpid()}.{threading.get_ident()}.tmp")
        with open(tmp, "w", encoding="utf-8") as fh:
            json.dump(record, fh, ensure_ascii=False, indent=2)
            fh.write("\n")
        os.replace(tmp, path)
        tmp = None  # moved into place; nothing left to clean up
    except Exception as exc:  # pragma: no cover - best effort
        logger.warning("update_logged_status failed for job %s: %s", job_id, exc)
    finally:
        if tmp is not None:
            try:
                os.unlink(tmp)
            except OSError:
                pass  # temp file was never created or is already gone
|
||||
|
||||
|
||||
def init_job_log(job_id: str, meta: Dict[str, Any], logs_dir: Optional[str] = None) -> None:
    """Seed the per-job audit-log dir: write meta.json, status.json, and a first
    ``registered`` line in events.ndjson.

    meta.json and status.json are rewritten on every call; the ``registered``
    line is written only when events.ndjson does not yet exist. Best-effort:
    any failure is logged as a warning and swallowed.

    ``meta`` is stored verbatim. When it carries a non-empty ``auth_token``,
    meta.json is restricted to mode 0600 before the content is written, to
    match the 0600 the registry applies to its own copy of the same record.
    """
    try:
        d = job_log_dir(job_id, logs_dir)
        d.mkdir(parents=True, exist_ok=True)
        meta_path = d / META_FILENAME
        with open(meta_path, "w", encoding="utf-8") as fh:
            if meta.get("auth_token"):
                # Tighten permissions while the file is still empty so the
                # secret is never readable by other users.
                try:
                    os.chmod(meta_path, 0o600)
                except OSError as exc:
                    logger.warning("could not restrict %s: %s", meta_path, exc)
            json.dump(meta, fh, ensure_ascii=False, indent=2)
            fh.write("\n")
        status = meta.get("status", "pending")
        update_logged_status(
            job_id, status, logs_dir=logs_dir,
            created_at=meta.get("created_at"), prompt=meta.get("prompt"),
        )
        events_path = d / EVENTS_FILENAME
        first_time = not events_path.exists()
        events_path.touch(exist_ok=True)
        if first_time:
            append_event(job_id, {
                "event": "registered",
                "status": status,
                "agent": meta.get("agent"),
                "agent_session": meta.get("agent_session"),
                "topic_prefix": meta.get("topic_prefix"),
                "timestamp": meta.get("created_at"),
            }, logs_dir=logs_dir)
    except Exception as exc:  # pragma: no cover - best effort
        logger.warning("init_job_log failed for job %s: %s", job_id, exc)
|
||||
|
||||
|
||||
def read_logged_meta(job_id: str, logs_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Load a job's audit meta.json (the registration snapshot).

    Returns None when the file cannot be opened or is not valid JSON."""
    try:
        meta_file = job_log_path(job_id, META_FILENAME, logs_dir)
        with meta_file.open("r", encoding="utf-8") as handle:
            return json.load(handle)
    except (OSError, json.JSONDecodeError):
        return None
|
||||
|
||||
|
||||
def read_logged_status(job_id: str, logs_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Load a job's current status.json.

    This is the fast point-query file (current status only), separate from
    the registration-time meta.json. Returns None when the file cannot be
    opened or is not valid JSON."""
    try:
        status_file = job_log_path(job_id, STATUS_FILENAME, logs_dir)
        with status_file.open("r", encoding="utf-8") as handle:
            return json.load(handle)
    except (OSError, json.JSONDecodeError):
        return None
|
||||
|
||||
|
||||
def iter_logged_events(job_id: str, logs_dir: Optional[str] = None):
    """Generate the parsed events of a job's events.ndjson in file order.

    Yields nothing when the file does not exist. Blank lines are ignored;
    lines that are not valid JSON are skipped with a warning."""
    events_file = job_log_path(job_id, EVENTS_FILENAME, logs_dir)
    if events_file.exists():
        with open(events_file, "r", encoding="utf-8") as stream:
            for raw in stream:
                text = raw.strip()
                if text:
                    try:
                        yield json.loads(text)
                    except json.JSONDecodeError:
                        logger.warning("skipping malformed audit line in %s", events_file)
|
||||
|
||||
|
||||
def list_logged_jobs(logs_dir: Optional[str] = None) -> List[Dict[str, Any]]:
    """Summarise every job directory under the logs root, sorted by
    ``created_at`` (oldest first; jobs without one sort first).

    Each entry is the job's meta.json, or ``{"job_id": <dir>}`` when that is
    missing. If a status.json exists, its ``status`` and ``updated_at`` are
    overlaid so the summary shows the current state rather than the
    registration-time snapshot. Returns an empty list when the logs root does
    not exist."""
    root = Path(logs_dir or LOGS_DIR)
    summaries: List[Dict[str, Any]] = []
    if not root.exists():
        return summaries

    for entry in sorted(root.iterdir()):
        if entry.is_dir():
            name = entry.name
            summary = read_logged_meta(name, logs_dir) or {"job_id": name}
            live = read_logged_status(name, logs_dir)
            if live:
                summary = {**summary,
                           "status": live.get("status", summary.get("status")),
                           "updated_at": live.get("updated_at", summary.get("updated_at"))}
            summaries.append(summary)

    return sorted(summaries, key=lambda item: item.get("created_at") or "")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Retry helper
|
||||
# --------------------------------------------------------------------------
|
||||
def with_retry(
    fn: Optional[Callable] = None,
    *,
    attempts: int = 3,
    base_delay: float = 0.5,
    factor: float = 2.0,
    max_delay: float = 8.0,
    exceptions: Iterable[type] = (Exception,),
) -> Callable:
    """Retry ``fn`` with exponential backoff.

    Usable two ways::

        result = with_retry(do_publish, attempts=3)()   # wrap-and-call
        @with_retry(attempts=5, base_delay=1.0)          # decorator
        def do_publish(): ...

    Args:
        fn: function to wrap; omit to get a decorator back.
        attempts: maximum number of calls. Values below 1 are treated as 1, so
            the wrapped function is always called at least once.
        base_delay: seconds slept after the first failure.
        factor: multiplier applied to the delay after each failure.
        max_delay: upper bound for the delay.
        exceptions: exception types that trigger a retry; anything else
            propagates immediately.

    Re-raises the last exception once the attempts are exhausted.
    """
    exc_tuple = tuple(exceptions)
    # attempts < 1 used to skip the call entirely and die on an internal
    # assertion; one attempt is the smallest meaningful budget.
    total = max(1, attempts)

    def decorate(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            delay = base_delay
            for attempt in range(1, total + 1):
                try:
                    return func(*args, **kwargs)
                except exc_tuple as exc:
                    if attempt >= total:
                        raise  # budget exhausted: surface the last failure
                    logger.warning(
                        "attempt %d/%d failed: %s; retrying in %.1fs",
                        attempt, total, exc, delay,
                    )
                    time.sleep(delay)
                    delay = min(delay * factor, max_delay)
            # Not reached: total >= 1, so the loop always returns or raises.

        return wrapper

    if fn is not None:
        return decorate(fn)
    return decorate
|
||||
|
||||
|
||||
def setup_logging(level: int = logging.WARNING) -> None:
    """Configure root logging via ``logging.basicConfig`` to write to stderr.

    stdout is reserved for data output (subscriber event lines, registry
    ids)."""
    import sys

    options = {
        "level": level,
        "stream": sys.stderr,
        "format": "%(asctime)s %(levelname)s %(name)s: %(message)s",
    }
    logging.basicConfig(**options)
|
||||
+229
@@ -0,0 +1,229 @@
|
||||
#!/usr/bin/env python3
|
||||
"""publish_event.py — the single entry point for emitting a Job event.
|
||||
|
||||
Loads the job record from the registry, resolves its broker, assigns the next
|
||||
monotonic ``seq``, builds the schema-v1 JSON payload, and publishes it to
|
||||
``<topic_prefix>/events`` over QoS 1 with exponential-backoff retry.
|
||||
|
||||
Silent by design: nothing is printed to stdout. Diagnostics go to stderr via
|
||||
logging. Terminal events (``completed``/``error``) publish with retain=True so
|
||||
a late subscriber still observes the final state (production hardening).
|
||||
|
||||
Exit codes:
|
||||
0 published successfully
|
||||
1 parameter / registry error (bad args, unknown job, no pending job)
|
||||
2 publish failed after retries (network / broker / ACK timeout)
|
||||
|
||||
Usage:
|
||||
publish_event.py --job <id> --event started [--detail "..."] [--data '{...}']
|
||||
publish_event.py --pick-pending --agent-session tmux:claude --event completed
|
||||
publish_event.py --job <id> --event completed --retained
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import mqtt_common
|
||||
import registry
|
||||
from mqtt_common import (
|
||||
DEFAULT_REGISTRY_DIR,
|
||||
SCHEMA_VERSION,
|
||||
broker_config_from_job,
|
||||
events_topic_for,
|
||||
load_job,
|
||||
make_client,
|
||||
next_seq,
|
||||
with_retry,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("delegate_job.publish_event")

# Event names accepted on the command line.
VALID_EVENTS = (
    "started",
    "permission_required",
    "progress",
    "completed",
    "error",
)
# Events that end a job; these are always published with retain=True.
TERMINAL_EVENTS = ("completed", "error")
# event -> registry status to sync as a best-effort side effect
EVENT_TO_STATUS = dict(
    started="running",
    completed="completed",
    error="error",
)

CONNECT_ACK_TIMEOUT = 10  # seconds to wait for CONNACK
PUBLISH_ACK_TIMEOUT = 5  # seconds to wait for QoS-1 PUBACK
|
||||
|
||||
|
||||
def build_payload(
    job_id: str,
    seq: int,
    event: str,
    detail: str,
    data: Optional[Dict[str, Any]],
    auth_token: Optional[str],
) -> Dict[str, Any]:
    """Assemble the event payload dict that is published for a job.

    ``data`` is shallow-copied (an empty dict is used when it is None or
    empty). The timestamp is taken from ``mqtt_common._utcnow()``.

    When ``auth_token`` is truthy, an HMAC-SHA256 hex digest keyed with the
    token is stored in ``data["hmac_sig"]``. It is computed over the payload
    serialised as compact JSON with sorted keys, with any ``hmac_sig`` entry
    of ``data`` left out, so the subscriber can verify the publisher without
    the secret token appearing in the message.
    """
    event_data: Dict[str, Any] = dict(data) if data else {}
    payload: Dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "seq": seq,
        "job_id": job_id,
        "event": event,
        "timestamp": mqtt_common._utcnow(),
        "detail": detail,
        "data": event_data,
    }
    if not auth_token:
        return payload

    # Sign a copy whose ``data`` lacks the signature field itself.
    signable = dict(payload)
    signable["data"] = {
        key: value for key, value in event_data.items() if key != "hmac_sig"
    }
    canonical = json.dumps(signable, sort_keys=True, separators=(",", ":")).encode()
    digest = hmac.new(auth_token.encode(), canonical, hashlib.sha256).hexdigest()
    event_data["hmac_sig"] = digest
    return payload
|
||||
|
||||
|
||||
def _publish_once(config, topic: str, body: bytes, retain: bool) -> None:
    """Connect, publish one QoS-1 message, wait for the broker ACK, disconnect.

    Raises TimeoutError when no CONNACK arrives within CONNECT_ACK_TIMEOUT or
    the publish is not acknowledged within PUBLISH_ACK_TIMEOUT, and
    ConnectionError when the broker answers with a non-zero reason code.
    Every failure raises so ``with_retry`` can re-run the whole sequence (a
    fresh connection per attempt is the robust choice for a PoC)."""
    client = make_client("publisher", config)
    connack_rc = None  # set by the on_connect callback

    def on_connect(_c, _u, _flags, reason_code, _props):
        nonlocal connack_rc
        connack_rc = reason_code

    client.on_connect = on_connect
    client.connect(config.host, config.port, config.keepalive)
    client.loop_start()
    try:
        # Wait for CONNACK so we fail fast on auth/TLS errors.
        give_up_at = time.monotonic() + CONNECT_ACK_TIMEOUT
        while connack_rc is None and time.monotonic() < give_up_at:
            time.sleep(0.05)
        if connack_rc is None:
            raise TimeoutError("no CONNACK from broker")
        if mqtt_common.reason_code_value(connack_rc) != 0:
            raise ConnectionError(f"broker refused connection: rc={connack_rc}")

        ticket = client.publish(topic, payload=body, qos=1, retain=retain)
        ticket.wait_for_publish(timeout=PUBLISH_ACK_TIMEOUT)
        if not ticket.is_published():
            raise TimeoutError("publish not acknowledged within timeout")
    finally:
        client.loop_stop()
        try:
            client.disconnect()
        except Exception:  # pragma: no cover - disconnect best effort
            pass
|
||||
|
||||
|
||||
def _resolve_job_id(args) -> Optional[str]:
    """Return the job id to publish for.

    Without ``--pick-pending`` this is simply ``args.job``. With it, a pending
    job is claimed for ``args.agent_session`` via ``registry.pick_pending``,
    whose result is returned unchanged."""
    if not args.pick_pending:
        return args.job
    return registry.pick_pending(args.agent_session, args.registry_dir)
|
||||
|
||||
|
||||
def main(argv=None) -> int:
    """CLI entry point: publish one event for a job.

    Returns the process exit code:
        0  published successfully
        1  parameter / registry error (bad ``--data`` or ``--attempts``,
           unknown job, no pending job, unreadable or corrupt job record)
        2  publish failed after retries

    After a successful publish, the audit-log entry, the registry debug event
    log and the status sync are written; the status sync is best-effort and
    cannot change the exit code.
    """
    parser = argparse.ArgumentParser(description="Publish a Job event to MQTT")
    target = parser.add_mutually_exclusive_group(required=True)
    target.add_argument("--job", help="job id to publish for")
    target.add_argument("--pick-pending", action="store_true",
                        help="auto-select a pending job for --agent-session")
    parser.add_argument("--agent-session", default="tmux:claude",
                        help="session label used with --pick-pending")
    parser.add_argument("--event", default="progress", choices=VALID_EVENTS)
    parser.add_argument("--detail", default="")
    parser.add_argument("--data", default=None, help="optional JSON object string")
    parser.add_argument("--retained", action="store_true",
                        help="force retain=True (auto for completed/error)")
    parser.add_argument("--registry-dir", default=DEFAULT_REGISTRY_DIR)
    parser.add_argument("--attempts", type=int, default=3)
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args(argv)

    mqtt_common.setup_logging(logging.DEBUG if args.verbose else logging.WARNING)

    # Reject a zero/negative retry budget up front. It is a parameter error,
    # not a publish failure, and must not claim a job or consume a seq.
    if args.attempts < 1:
        logger.error("--attempts must be >= 1 (got %d)", args.attempts)
        return 1

    # --- parse optional data JSON (parameter error -> exit 1) ---
    data: Optional[Dict[str, Any]] = None
    if args.data:
        try:
            data = json.loads(args.data)
            if not isinstance(data, dict):
                raise ValueError("--data must be a JSON object")
        except (ValueError, json.JSONDecodeError) as exc:
            logger.error("invalid --data: %s", exc)
            return 1

    job_id = _resolve_job_id(args)
    if not job_id:
        logger.error("no job to publish for (unknown --job or no pending job)")
        return 1

    try:
        job = load_job(job_id, args.registry_dir)
    except FileNotFoundError as exc:
        logger.error("%s", exc)
        return 1
    except json.JSONDecodeError as exc:
        # A corrupt record is a registry error, not an unhandled crash.
        logger.error("corrupt job record for %s: %s", job_id, exc)
        return 1

    config = broker_config_from_job(job)
    topic = job.get("topic_prefix")
    topic = f"{topic}/events" if topic else events_topic_for(job_id)
    try:
        seq = next_seq(job_id, args.registry_dir)
    except (OSError, ValueError, TypeError) as exc:
        # The record vanished, became unreadable, or holds a bad last_seq
        # between load_job and here.
        logger.error("could not allocate seq for job %s: %s", job_id, exc)
        return 1
    payload = build_payload(
        job_id=job_id,
        seq=seq,
        event=args.event,
        detail=args.detail,
        data=data,
        auth_token=job.get("auth_token"),
    )
    body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    retain = args.retained or args.event in TERMINAL_EVENTS

    publish = with_retry(
        _publish_once,
        attempts=args.attempts,
        exceptions=(OSError, TimeoutError, ConnectionError, ValueError),
    )
    try:
        publish(config, topic, body, retain)
    except Exception as exc:
        logger.error("publish failed after %d attempts: %s", args.attempts, exc)
        return 2

    # Persistent audit log: record the exact payload we put on the wire so the
    # publish is reproducible from the log alone. Best-effort (isolated inside
    # append_event) — never fails the publish.
    mqtt_common.append_event(job_id, {
        "event": "published",
        "source_event": args.event,
        "seq": seq,
        "topic": topic,
        "retain": retain,
        "timestamp": payload["timestamp"],
        "detail": args.detail,
        "payload": payload,
    })

    # Best-effort side effects: registry status sync + (debug) event log. Never
    # fail the publish on these.
    registry.append_event(job_id, args.registry_dir, payload)
    new_status = EVENT_TO_STATUS.get(args.event)
    if new_status:
        try:
            mqtt_common.update_job_status(job_id, args.registry_dir, status=new_status)
        except Exception as exc:  # pragma: no cover - best effort
            logger.warning("status sync failed: %s", exc)

    logger.info("published %s seq=%d job=%s retain=%s", args.event, seq, job_id, retain)
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the CLI and use main()'s return value as the process exit status.
    raise SystemExit(main())
|
||||
@@ -0,0 +1,334 @@
|
||||
"""Job registry for the tmux-agent-orchestrate-delegate-job skill.
|
||||
|
||||
A job record is the single source of truth for one delegated unit of work:
|
||||
its id, prompt, owning agent session, broker connection, timeouts, and status.
|
||||
Records live as ``<registry_dir>/<job_id>.json`` with an append-only event log
|
||||
``<registry_dir>/<job_id>.events.log`` and a shared ``<registry_dir>/.lock``.
|
||||
|
||||
Concurrency is handled via the fcntl lock in :mod:`mqtt_common` (PoC). For
|
||||
multi-host delegation, migrate to SQLite WAL — see references/registry.md.
|
||||
|
||||
Importable as a library and runnable as a CLI (``register``/``list``/``get``/
|
||||
``status``/``pick``) so the ``tmux-agent-orchestrate-delegate-job`` bash wrapper can shell out.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import mqtt_common
|
||||
from mqtt_common import (
|
||||
DEFAULT_REGISTRY_DIR,
|
||||
SCHEMA_VERSION,
|
||||
_atomic_write_record,
|
||||
_utcnow,
|
||||
broker_config_from_env,
|
||||
load_job,
|
||||
registry_lock,
|
||||
topic_prefix_for,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("delegate_job.registry")

# Statuses in which a job is finished.
TERMINAL_STATUSES = ("completed", "error", "cancelled")
# Every status a record may be set to through update_status.
VALID_STATUSES = (
    "pending",
    "running",
    "completed",
    "error",
    "cancelled",
)
|
||||
|
||||
|
||||
def generate_job_id(bits: int = 32) -> str:
    """Return a random hex job id taken from a fresh uuid4.

    ``bits >= 128`` yields the full 32-character uuid4 hex (production);
    smaller values yield the first ``bits // 4`` hex characters, never fewer
    than one. The default of 32 bits (8 characters) is the PoC setting."""
    full_hex = uuid.uuid4().hex
    if bits >= 128:
        return full_hex
    return full_hex[: max(1, bits // 4)]
|
||||
|
||||
|
||||
def register_job(
    prompt: str,
    agent: str = "claude-code",
    agent_session: str = "tmux:claude",
    broker: Optional[Dict[str, Any]] = None,
    timeout_sec: int = 3600,
    idle_timeout_sec: int = 120,
    registry_dir: str = DEFAULT_REGISTRY_DIR,
    job_id: Optional[str] = None,
    expected_artifacts: Optional[List[str]] = None,
    bits: int = 32,
    auth_token: Optional[str] = None,
) -> str:
    """Write a new job record with status ``pending`` and return its id.

    A missing ``job_id`` is generated with ``generate_job_id(bits)``.
    ``broker`` defaults to the current environment's resolved broker block, so
    the registry alone is enough for ``publish_event.py`` to connect later.
    When no ``auth_token`` is given and the broker block has a truthy ``tls``
    or ``username``, a random token is generated.

    After the record is written, the persistent audit log is seeded through
    ``mqtt_common.init_job_log``.

    Raises:
        FileExistsError: if a record for ``job_id`` already exists.
    """
    if not job_id:
        job_id = generate_job_id(bits)
    if broker is None:
        broker = broker_config_from_env().to_registry_block()
    # Auto-generate token if secure broker configuration (TLS or username) is detected
    if auth_token is None and (broker.get("tls") or broker.get("username")):
        import secrets
        auth_token = secrets.token_urlsafe(32)

    created = _utcnow()
    record: Dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "job_id": job_id,
        "status": "pending",
        "created_at": created,
        "updated_at": created,
        "prompt": prompt,
        "agent": agent,
        "agent_session": agent_session,
        "broker": broker,
        "topic_prefix": topic_prefix_for(job_id),
        "timeout_sec": int(timeout_sec),
        "idle_timeout_sec": int(idle_timeout_sec),
        "expected_artifacts": expected_artifacts or [],
        "last_seq": 0,
        "auth_token": auth_token,
    }

    # Existence check and write share one lock so two registrations of the
    # same id cannot both succeed.
    with registry_lock(registry_dir):
        if mqtt_common._job_path(job_id, registry_dir).exists():
            raise FileExistsError(f"job already exists: {job_id}")
        _atomic_write_record(job_id, registry_dir, record)

    # Seed the persistent audit log (meta.json + status.json + a "registered"
    # event). Best-effort inside init_job_log — never blocks registration.
    mqtt_common.init_job_log(job_id, meta=record)
    logger.info("registered job %s (agent=%s session=%s)", job_id, agent, agent_session)
    return job_id
|
||||
|
||||
|
||||
def pick_pending(agent_session: str, registry_dir: str = DEFAULT_REGISTRY_DIR) -> Optional[str]:
    """Claim the oldest ``pending`` job of ``agent_session``.

    Under the registry lock, the pending record with the smallest
    ``created_at`` is switched to ``running`` and written back, so two
    sessions cannot grab the same job. Returns the claimed job id, or None
    when no pending job matches. The pending->running transition is then
    mirrored into the audit log, outside the lock."""
    with registry_lock(registry_dir):
        pending = [
            rec for rec in _iter_records(registry_dir)
            if rec.get("status") == "pending" and rec.get("agent_session") == agent_session
        ]
        if not pending:
            return None
        # min() returns the first of equally old records, like a stable sort.
        chosen = min(pending, key=lambda rec: rec.get("created_at", ""))
        chosen["status"] = "running"
        chosen["updated_at"] = _utcnow()
        _atomic_write_record(chosen["job_id"], registry_dir, chosen)
        logger.info("session %s picked job %s", agent_session, chosen["job_id"])
        job_id = chosen["job_id"]
        claimed_at = chosen["updated_at"]

    # pick_pending writes the record directly (not via update_job_status), so it
    # mirrors the pending->running transition into the audit log here. Best-effort.
    mqtt_common.update_logged_status(job_id, "running", updated_at=claimed_at)
    transition = {
        "event": "status_changed",
        "from": "pending",
        "to": "running",
        "by": agent_session,
        "timestamp": claimed_at,
    }
    mqtt_common.append_event(job_id, transition)
    return job_id
|
||||
|
||||
|
||||
def update_status(job_id: str, registry_dir: str, status: str) -> Dict[str, Any]:
    """Set a job's status after checking it against ``VALID_STATUSES``.

    Delegates the write to ``mqtt_common.update_job_status`` and returns its
    result. Raises ValueError for a status outside ``VALID_STATUSES``."""
    if status in VALID_STATUSES:
        return mqtt_common.update_job_status(job_id, registry_dir, status=status)
    raise ValueError(f"invalid status {status!r}; expected one of {VALID_STATUSES}")
|
||||
|
||||
|
||||
def list_jobs(registry_dir: str = DEFAULT_REGISTRY_DIR, status: Optional[str] = None) -> List[Dict[str, Any]]:
    """Return the registry's job records sorted by ``created_at`` ascending.

    When ``status`` is given (and non-empty), only records with exactly that
    status are returned."""
    selected = (
        rec for rec in _iter_records(registry_dir)
        if not status or rec.get("status") == status
    )
    return sorted(selected, key=lambda rec: rec.get("created_at", ""))
|
||||
|
||||
|
||||
def append_event(job_id: str, registry_dir: str, payload: Dict[str, Any]) -> None:
    """Append one event payload as a JSON line to
    ``<registry_dir>/<job_id>.events.log``.

    The registry directory is created if missing. Best effort, debug-only:
    failures are logged as a warning and never raised to the caller. That
    covers filesystem errors and payloads that cannot be serialised to JSON.
    """
    try:
        Path(registry_dir).mkdir(parents=True, exist_ok=True)
        log_path = Path(registry_dir) / f"{job_id}.events.log"
        # Serialise before opening so a bad payload never leaves a partial line.
        line = json.dumps(payload, ensure_ascii=False) + "\n"
        with open(log_path, "a", encoding="utf-8") as fh:
            fh.write(line)
    except (OSError, TypeError, ValueError) as exc:  # pragma: no cover - best effort
        logger.warning("could not append event for %s: %s", job_id, exc)
|
||||
|
||||
|
||||
# convenience re-export so callers can `from registry import load_job`
|
||||
__all__ = [
    "register_job",
    "pick_pending",
    "update_status",
    "load_job",
    "list_jobs",
    "append_event",
    "generate_job_id",
]
|
||||
|
||||
|
||||
def _iter_records(registry_dir: str):
    """Generate the parsed content of every ``*.json`` file in ``registry_dir``
    in sorted path order.

    Yields nothing when the directory does not exist. Files that cannot be
    read or parsed are skipped with a warning."""
    root = Path(registry_dir)
    if root.exists():
        for record_file in sorted(root.glob("*.json")):
            try:
                with record_file.open("r", encoding="utf-8") as handle:
                    yield json.load(handle)
            except (OSError, json.JSONDecodeError) as exc:
                logger.warning("skipping unreadable record %s: %s", record_file, exc)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# CLI (so the bash wrapper can shell out without inline python)
|
||||
# --------------------------------------------------------------------------
|
||||
def _build_parser() -> argparse.ArgumentParser:
    """Build the registry CLI parser with its ``register``, ``list``, ``get``,
    ``status``, ``pick`` and ``logs`` sub-commands.

    ``--registry-dir`` is a top-level option; the chosen sub-command is stored
    in ``args.command`` and is required."""
    parser = argparse.ArgumentParser(description="tmux-agent-orchestrate-delegate-job registry CLI")
    parser.add_argument("--registry-dir", default=DEFAULT_REGISTRY_DIR)
    commands = parser.add_subparsers(dest="command", required=True)

    # register -----------------------------------------------------------
    register = commands.add_parser("register", help="create a pending job; prints the job id")
    register_options = (
        ("--prompt", {"required": True}),
        ("--agent", {"default": "claude-code"}),
        ("--agent-session", {"default": "tmux:claude"}),
        ("--timeout", {"type": int, "default": 3600}),
        ("--idle-timeout", {"type": int, "default": 120}),
        ("--bits", {"type": int, "default": 32, "help": "32 (PoC) or 128 (prod)"}),
        ("--artifact", {"action": "append", "default": [], "dest": "artifacts"}),
        ("--auth-token", {
            "default": None,
            "help": "HMAC auth token for the job (auto-generated if secure broker is detected)",
        }),
    )
    for flag, options in register_options:
        register.add_argument(flag, **options)

    # list ---------------------------------------------------------------
    listing = commands.add_parser("list", help="list jobs (optionally by status)")
    listing.add_argument("--status", default=None)
    listing.add_argument("--json", action="store_true")

    # get ----------------------------------------------------------------
    getter = commands.add_parser("get", help="print one job record as JSON")
    getter.add_argument("--job", required=True)

    # status -------------------------------------------------------------
    setter = commands.add_parser("status", help="set a job status")
    setter.add_argument("--job", required=True)
    setter.add_argument("--set", required=True, dest="status")

    # pick ---------------------------------------------------------------
    picker = commands.add_parser("pick", help="claim a pending job for a session; prints id")
    picker.add_argument("--agent-session", default="tmux:claude")

    # logs ---------------------------------------------------------------
    logs = commands.add_parser(
        "logs",
        help="show the persistent audit log for a job, or --list every logged job",
    )
    logs.add_argument("job_id", nargs="?", default=None,
                      help="job id whose events.ndjson to print")
    logs.add_argument("--list", action="store_true", dest="list_all",
                      help="summarise every job under the logs dir instead")
    logs.add_argument("--logs-dir", default=None,
                      help="override the audit-log root (default: $DELEGATE_JOB_LOGS_DIR "
                           "or <cwd>/.hermes/delegate_job_logs)")
    logs.add_argument("--tail", type=int, default=0,
                      help="show only the last N events (0 = all)")
    logs.add_argument("--json", action="store_true",
                      help="emit raw JSON lines / records instead of a table")

    return parser
|
||||
|
||||
|
||||
def main(argv: Optional[List[str]] = None) -> int:
    """Parse ``argv`` and run the selected registry subcommand.

    Args:
        argv: Argument list handed to argparse; ``None`` lets argparse fall
            back to the process arguments.

    Returns:
        Process exit code: 0 on success; 1 when ``get``/``status`` report an
        error or no known subcommand was selected; 3 when ``pick`` finds no
        pending job for the session; for ``logs``, whatever ``_cmd_logs``
        returns.
    """
    mqtt_common.setup_logging(logging.INFO)
    args = _build_parser().parse_args(argv)
    rd = args.registry_dir

    if args.command == "register":
        job_id = register_job(
            prompt=args.prompt,
            agent=args.agent,
            agent_session=args.agent_session,
            timeout_sec=args.timeout,
            idle_timeout_sec=args.idle_timeout,
            registry_dir=rd,
            expected_artifacts=args.artifacts,
            bits=args.bits,
            auth_token=args.auth_token,
        )
        # The bare id is the only stdout output so callers can capture it.
        print(job_id)
        return 0

    if args.command == "list":
        records = list_jobs(rd, status=args.status)
        if args.json:
            print(json.dumps(records, ensure_ascii=False, indent=2))
        else:
            if not records:
                print("(no jobs)")
            for r in records:
                # Use `or` instead of a .get() default: a key that is present
                # but None would otherwise crash the `:10s` format spec or the
                # slice (same guard _cmd_logs applies to `prompt`).
                status = str(r.get('status') or '?')
                session = r.get('agent_session') or ''
                prompt = str(r.get('prompt') or '')
                print(f"{r['job_id']} {status:10s} {session}"
                      f" {prompt[:48]}")
        return 0

    if args.command == "get":
        try:
            print(json.dumps(load_job(args.job, rd), ensure_ascii=False, indent=2))
        except FileNotFoundError as exc:
            print(str(exc), file=sys.stderr)
            return 1
        return 0

    if args.command == "status":
        try:
            update_status(args.job, rd, args.status)
        except (FileNotFoundError, ValueError) as exc:
            print(str(exc), file=sys.stderr)
            return 1
        return 0

    if args.command == "pick":
        job_id = pick_pending(args.agent_session, rd)
        if job_id is None:
            return 3  # no pending job for this session
        print(job_id)
        return 0

    if args.command == "logs":
        return _cmd_logs(args)

    # Reached only when no known subcommand was selected.
    return 1
|
||||
|
||||
|
||||
def _cmd_logs(args) -> int:
    """Pretty-print one job's events.ndjson, or summarise all logged jobs.

    Reads ``args.logs_dir`` (falling back to ``mqtt_common.LOGS_DIR``),
    ``args.list_all``, ``args.json``, ``args.job_id`` and ``args.tail``.

    Returns:
        0 on success; 1 when neither a job id nor ``--list`` was given, or when
        the job has no events and no log directory.
    """
    logs_dir = args.logs_dir or mqtt_common.LOGS_DIR

    if args.list_all:
        jobs = mqtt_common.list_logged_jobs(logs_dir)
        if args.json:
            print(json.dumps(jobs, ensure_ascii=False, indent=2))
            return 0
        if not jobs:
            print(f"(no logged jobs under {logs_dir})")
            return 0
        for m in jobs:
            # Coerce every padded cell to str: the `s` format spec raises on
            # None and on numeric values, and a stored null bypasses .get()'s
            # default.
            status = str(m.get('status') or '?')
            created = str(m.get('created_at') or '-')
            prompt = str(m.get('prompt') or '')
            print(f"{m.get('job_id','?')} {status:10s} "
                  f"{created:20s} {prompt[:48]}")
        return 0

    if not args.job_id:
        print("logs requires a <job_id> or --list", file=sys.stderr)
        return 1

    events = list(mqtt_common.iter_logged_events(args.job_id, logs_dir))
    # An existing-but-empty log directory is not an error; a missing one is.
    if not events and not mqtt_common.job_log_dir(args.job_id, logs_dir).exists():
        print(f"no audit log for job {args.job_id} under {logs_dir}", file=sys.stderr)
        return 1
    if args.tail and args.tail > 0:
        events = events[-args.tail:]
    if args.json:
        for e in events:
            print(json.dumps(e, ensure_ascii=False))
        return 0
    for e in events:
        # str() keeps numeric timestamps / event codes from breaking the
        # fixed-width `s` format specs below.
        ts = str(e.get("logged_at") or e.get("timestamp") or "-")
        event = str(e.get("event") or "?")
        extra = e.get("detail") or e.get("to") or e.get("source_event") or ""
        print(f"{ts:24s} {event:<16s} {extra}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
+280
@@ -0,0 +1,280 @@
|
||||
#!/usr/bin/env bash
|
||||
# tmux-agent-orchestrate-delegate-job — user-facing orchestrator for the tmux-agent-orchestrate-delegate-job skill.
|
||||
#
|
||||
# Subcommands:
|
||||
# submit register a job, start the subscriber FIRST, then run the agent,
|
||||
# then (optionally) run a validation script.
|
||||
# status show one job record.
|
||||
# list list all jobs.
|
||||
# verify run a user-supplied --validate script against a job's artifacts.
|
||||
# wait block until all running/pending jobs reach a terminal state.
|
||||
#
|
||||
# This is a reference wrapper: it shells out to the python scripts that live
|
||||
# next to it. Copy it into your project and customise as needed. With --dry-run
# it prints the agent instructions instead of launching anything; without it, a
# missing `tmux` or agent binary is reported on stderr and the launch fails.
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
# Pick an interpreter: $DELEGATE_JOB_PYTHON if set, else a project .venv (under
# $WORKDIR, then under the current directory), else python3.
#
# Globals:  DELEGATE_JOB_PYTHON (read, optional), WORKDIR (read, optional),
#           SCRIPT_DIR (read, only for the install hint)
# Outputs:  the chosen interpreter on stdout; diagnostics on stderr
# Exits:    1 when the chosen interpreter cannot `import paho.mqtt`. Callers use
#           PY="$(pick_python)", so the exit ends the command substitution and
#           the failed assignment aborts the script under `set -e`.
pick_python() {
  local py_bin workdir_abs
  if [[ -n "${DELEGATE_JOB_PYTHON:-}" ]]; then
    py_bin="$DELEGATE_JOB_PYTHON"
  elif [[ -x "${WORKDIR:-.}/.venv/bin/python" ]]; then
    # Resolve to an absolute path: cmd_submit does `cd "$WORKDIR"` after picking
    # the interpreter, which would invalidate a path built from a relative
    # --workdir.
    workdir_abs="$(cd "${WORKDIR:-.}" && pwd)"
    py_bin="$workdir_abs/.venv/bin/python"
  elif [[ -x ".venv/bin/python" ]]; then
    py_bin="$(pwd)/.venv/bin/python"
  else
    py_bin="python3"
  fi
  if ! "$py_bin" -c "import paho.mqtt" 2>/dev/null; then
    echo "ERROR: paho-mqtt package is missing for $py_bin." >&2
    echo " Please create a virtual environment and install it:" >&2
    echo " python3 -m venv .venv && .venv/bin/pip install -r \"$SCRIPT_DIR/requirements.txt\"" >&2
    exit 1
  fi
  echo "$py_bin"
}
|
||||
|
||||
REGISTRY_DIR_DEFAULT=".hermes/jobs"
|
||||
|
||||
# Print the command/option summary to stdout (quoted heredoc: no expansion).
usage() {
cat <<'EOF'
tmux-agent-orchestrate-delegate-job <command> [options]

submit --agent <name> --prompt <text> [--workdir <dir>] [--agent-session <label>]
[--timeout <sec>] [--idle-timeout <sec>] [--validate <script>]
[--registry-dir <dir>] [--dry-run]
# The skill is tmux-interactive only; --mode print was removed.
status --job <id> [--registry-dir <dir>]
list [--registry-dir <dir>]
verify --job <id> --validate <script> [--registry-dir <dir>]
wait [--job <id>] [--timeout <sec>] [--registry-dir <dir>]
logs <job_id> | --list # persistent audit log (delegate_job_logs/)
EOF
}
|
||||
|
||||
# ---- arg parsing helpers --------------------------------------------------
|
||||
AGENT="claude-code"; PROMPT=""; WORKDIR="$(pwd)"; AGENT_SESSION="tmux:claude"
|
||||
TIMEOUT=3600; IDLE_TIMEOUT=120; VALIDATE=""; DRY_RUN=0
|
||||
JOB_ID=""; REGISTRY_DIR="$REGISTRY_DIR_DEFAULT"
|
||||
|
||||
# Parse the shared option set into the script-level option globals.
#
# Globals:   AGENT, PROMPT, WORKDIR, AGENT_SESSION, TIMEOUT, IDLE_TIMEOUT,
#            VALIDATE, JOB_ID, REGISTRY_DIR, DRY_RUN (written)
# Arguments: the subcommand's remaining command-line words
# Exits:     1 (after printing usage) on an unknown option or on a value option
#            that is missing its value.
parse_opts() {
  local opt
  while [[ $# -gt 0 ]]; do
    opt="$1"
    case "$opt" in
      --agent|--prompt|--workdir|--agent-session|--timeout|--idle-timeout|--validate|--job|--registry-dir)
        # Without this check a trailing value option dies with a cryptic
        # "unbound variable" under `set -u` (and `shift 2` never consumes it
        # otherwise).
        if [[ $# -lt 2 ]]; then
          echo "option $opt requires a value" >&2
          usage
          exit 1
        fi
        case "$opt" in
          --agent) AGENT="$2";;
          --prompt) PROMPT="$2";;
          --workdir) WORKDIR="$2";;
          --agent-session) AGENT_SESSION="$2";;
          --timeout) TIMEOUT="$2";;
          --idle-timeout) IDLE_TIMEOUT="$2";;
          --validate) VALIDATE="$2";;
          --job) JOB_ID="$2";;
          --registry-dir) REGISTRY_DIR="$2";;
        esac
        shift 2;;
      --dry-run) DRY_RUN=1; shift;;
      *) echo "unknown option: $1" >&2; usage; exit 1;;
    esac
  done
}
|
||||
|
||||
# Run the optional --validate script for the current job and report PASS/FAIL
# on stdout. Never fails the caller.
_submit_validate() {
  [[ -n "$VALIDATE" ]] || return 0
  echo "running validation: $VALIDATE"
  local rc=0
  JOB_ID="$JOB_ID" REGISTRY_DIR="$REGISTRY_DIR" bash "$VALIDATE" || rc=$?
  if (( rc == 0 )); then
    echo "validation: PASS"
  else
    echo "validation: FAIL (exit $rc)"
  fi
}

# submit: register a job, start the subscriber, launch the agent, wait for the
# subscriber to finish, then run the optional validation script.
#
# Globals:   option globals set by parse_opts (read); PY, JOB_ID, WORKDIR
#            (written); DELEGATE_JOB_LOGS_DIR (read, optional)
# Arguments: submit options (see usage)
# Outputs:   progress lines on stdout; the LAST stdout line is the audit-log
#            directory for the job.
cmd_submit() {
  parse_opts "$@"
  [[ -n "$PROMPT" ]] || { echo "submit requires --prompt" >&2; exit 1; }
  # Pin WORKDIR to an absolute path before anything derives paths from it: we
  # `cd` into it below, after which a relative value would point at
  # $WORKDIR/$WORKDIR (interpreter lookup, tmux -c, logs path).
  WORKDIR="$(cd "$WORKDIR" && pwd)"
  PY="$(pick_python)"
  cd "$WORKDIR"
  mkdir -p "$REGISTRY_DIR"

  # 1) register job (prints the new job id)
  JOB_ID="$("$PY" "$SCRIPT_DIR/scripts/registry.py" --registry-dir "$REGISTRY_DIR" register \
    --prompt "$PROMPT" --agent "$AGENT" --agent-session "$AGENT_SESSION" \
    --timeout "$TIMEOUT" --idle-timeout "$IDLE_TIMEOUT")"
  echo "registered job: $JOB_ID"

  # Persistent audit-log dir for this job (see SKILL.md "Audit Logs").
  # NOTE(review): registry.py's `logs --logs-dir` help names
  # <cwd>/.hermes/delegate_job_logs as the default root — confirm which is right.
  local logs_root="${DELEGATE_JOB_LOGS_DIR:-$WORKDIR/delegate_job_logs}"

  # 2) START THE SUBSCRIBER FIRST (ordering dependency — MQTT does not queue
  #    non-retained messages for absent subscribers). Skipped in dry-run: no
  #    agent is launched, so a subscriber would only linger in the background
  #    until its timeout.
  local logf="$REGISTRY_DIR/$JOB_ID.subscriber.out"
  local sub_pid=""
  if [[ "$DRY_RUN" != "1" ]]; then
    "$PY" "$SCRIPT_DIR/scripts/job_subscriber.py" --registry-dir "$REGISTRY_DIR" \
      --job "$JOB_ID" --timeout "$TIMEOUT" --idle-timeout "$IDLE_TIMEOUT" \
      >"$logf" 2>&1 &
    sub_pid=$!
    echo "subscriber pid: $sub_pid (log: $logf)"
    sleep 1 # give the subscriber time to CONNACK + SUBSCRIBE before the agent runs
  fi

  # 3) run the agent (or print the command for dry-run)
  local pub="$PY $SCRIPT_DIR/scripts/publish_event.py --registry-dir $REGISTRY_DIR --job $JOB_ID"
  # NOTE: the agent MUST use --job "$JOB_ID" (the one we just minted). Hard-coding
  # an id from an earlier session is the #1 reason a delegated job sits idle and
  # times out (see SKILL.md "Wrong job_id propagated to the agent"). We make the
  # freshness explicit in the instruction header.
  local instructions="Your job_id is \"$JOB_ID\" (the one just registered for THIS delegation — read it from the registry record, do NOT reuse any job_id you saw in earlier runs).

On start run: $pub --event started.
On permission/tool prompt run: $pub --event permission_required --detail '<tool>:<what>'.
On progress (optional): $pub --event progress --detail '<short status>'.
On success run: $pub --event completed --detail '<one-line summary>'.
On failure run: $pub --event error --detail '<one-line reason>'.

The subscriber for this job_id is already running; your completed/error event ends the job. Exit codes: 0 completed, 1 error, 2 publish failure.

Task: $PROMPT"

  run_agent "$JOB_ID" "$instructions"

  if [[ "$DRY_RUN" == "1" ]]; then
    # Nothing was launched, so there is no subscriber to join or dump; the user
    # just wants to see the instructions that would have run.
    _submit_validate
    echo "$logs_root/$JOB_ID"
    return 0
  fi

  # run_agent only launches a detached tmux session and returns immediately;
  # the subscriber exiting is the signal that the job reached a terminal state.
  wait "$sub_pid" || true
  echo "subscriber output:"; cat "$logf" || true

  # 4) optional validation hook — after the job has finished, so the script
  #    checks the artifacts the agent actually produced.
  _submit_validate

  # Last stdout line: callers can scrape `tail -n1` to find the audit-log dir.
  echo "$logs_root/$JOB_ID"
}
|
||||
|
||||
# Launch the agent for a job in a fresh, detached tmux session.
#
# Globals:   AGENT, DRY_RUN, AGENT_SESSION, WORKDIR, PY, SCRIPT_DIR,
#            REGISTRY_DIR (read)
# Arguments: $1 - job id; $2 - instruction text fed to the agent on stdin
# Outputs:   status lines on stdout; errors on stderr
# Returns:   0 when launched (or for dry-run / the human agent); non-zero when
#            tmux or the agent binary is missing, the session name is taken, or
#            tmux fails to create the session.
run_agent() {
  local job_id="$1" instructions="$2"
  local bin
  # The skill is INTERACTIVE-ONLY. We never invoke `claude -p` or any other
  # one-shot print mode, because:
  #  - claude -p exits the moment stdin is drained, so there's nothing to
  #    `tmux attach` to afterwards.
  #  - fire-and-forget via wrapper defeats the whole point of the audit log
  #    (you can't tell what happened if the agent crashes mid-turn).
  #  - the job registry already gives us an authoritative completion signal,
  #    so we don't need a wrapper-side exit code to know "done".
  # The user attaches with `tmux attach -t <session>` and types follow-up
  # prompts themselves. We pre-load the first prompt via stdin and `read`
  # keeps the pane open after the agent exits so the user can review.
  case "$AGENT" in
    claude-code) bin="claude";;
    codex) bin="codex";;
    human) echo "[human agent] complete the task, then run publish_event.py --event completed"; return;;
    *) bin="$AGENT";;
  esac

  if [[ "$DRY_RUN" == "1" ]]; then
    echo "[dry-run] would launch agent '$AGENT' in a fresh tmux session with instructions:"
    echo "----"; echo "$instructions"; echo "----"
    return
  fi

  if ! command -v tmux >/dev/null 2>&1; then
    echo "ERROR: this skill requires tmux (interactive agent sessions)." >&2
    echo " Install with: brew install tmux (or your package manager)" >&2
    return 1
  fi
  if ! command -v "$bin" >/dev/null 2>&1; then
    echo "ERROR: agent binary '$bin' not found in PATH." >&2
    return 1
  fi

  local sess="${AGENT_SESSION#tmux:}"
  # Detect a stale session with the same name (e.g. the user is still attached
  # from an earlier run, or a previous wrapper died without cleanup) and fail
  # loudly instead of letting new-session collide with it.
  if tmux has-session -t "$sess" 2>/dev/null; then
    local attached
    # Best-effort count; must not abort under `set -e -o pipefail`.
    attached=$(tmux list-clients -t "$sess" 2>/dev/null | wc -l | tr -d ' ') || attached="?"
    echo "ERROR: tmux session '$sess' already exists (clients attached: $attached)." >&2
    echo " Pick a unique --agent-session (e.g. tmux:demo, tmux:claude-a) or" >&2
    echo " kill the stale one first: tmux kill-session -t $sess" >&2
    return 1
  fi

  # The instructions (which embed the user's prompt) become part of a shell
  # command line run inside the pane. Single-quote them, escaping embedded
  # single quotes as '\'', so quotes, `$` and backticks reach the agent
  # verbatim instead of being stripped or evaluated. This form is plain POSIX
  # quoting, so it does not depend on which shell tmux uses.
  local sq="'" sq_escaped="'\\''"
  local quoted_instructions="'${instructions//$sq/$sq_escaped}'"

  # NOTE(review): --dangerously-skip-permissions is passed to every agent
  # binary, not only `claude` — confirm codex / custom agents accept it.
  # `read -r _` (rather than a bare `read`) also works when the pane shell is a
  # strict POSIX sh.
  local rc=0
  tmux new-session -d -s "$sess" -c "$WORKDIR" \
    "printf '%s' $quoted_instructions | $bin --dangerously-skip-permissions; echo; echo '--- agent exited (job $job_id); press enter to close ---'; read -r _" \
    || rc=$?
  if (( rc != 0 )); then
    # Tell the subscriber right away instead of letting the job idle out. Uses
    # the same --registry-dir as the publish command handed to the agent.
    if [[ -n "${job_id:-}" && -n "${PY:-}" ]]; then
      "$PY" "$SCRIPT_DIR/scripts/publish_event.py" --registry-dir "$REGISTRY_DIR" \
        --job "$job_id" --event error --detail "agent bootstrap failed (exit $rc)" || true
    fi
    return "$rc"
  fi
  echo "agent launched in tmux session: $sess (attach with: tmux attach -t $sess)"
}
|
||||
|
||||
# status: print one job record (delegates to `registry.py get`).
# Requires --job; exits 1 without it.
cmd_status() {
  parse_opts "$@"
  if [[ -z "$JOB_ID" ]]; then
    echo "status requires --job" >&2
    exit 1
  fi
  PY="$(pick_python)"
  local -a registry_cmd=("$PY" "$SCRIPT_DIR/scripts/registry.py" --registry-dir "$REGISTRY_DIR")
  "${registry_cmd[@]}" get --job "$JOB_ID"
}
|
||||
|
||||
# list: print all jobs in the registry (delegates to `registry.py list`).
cmd_list() {
  parse_opts "$@"
  PY="$(pick_python)"
  local registry_py="$SCRIPT_DIR/scripts/registry.py"
  "$PY" "$registry_py" --registry-dir "$REGISTRY_DIR" list
}
|
||||
|
||||
# verify: run the user-supplied --validate script for a job, with JOB_ID and
# REGISTRY_DIR exported to it, and exit with the script's status.
# Requires --job and --validate; exits 1 when either is missing.
cmd_verify() {
  parse_opts "$@"
  if [[ -z "$JOB_ID" ]]; then
    echo "verify requires --job" >&2
    exit 1
  fi
  if [[ -z "$VALIDATE" ]]; then
    echo "verify requires --validate <script>" >&2
    exit 1
  fi
  echo "verifying job $JOB_ID with $VALIDATE"
  local status=0
  JOB_ID="$JOB_ID" REGISTRY_DIR="$REGISTRY_DIR" bash "$VALIDATE" || status=$?
  if (( status == 0 )); then
    echo "verify: PASS (exit 0)"
  else
    echo "verify: FAIL (exit $status)"
  fi
  exit "$status"
}
|
||||
|
||||
# logs <job_id> [flags] | logs --list [flags] — delegates to registry.py's
# `logs` CLI, which reads the persistent audit log. All arguments are forwarded
# unchanged, so the flags registry.py accepts for `logs` (--tail N, --json,
# --logs-dir DIR) work here too. Run from your project dir so the default log
# root resolves.
#
# Globals:   PY (written), SCRIPT_DIR (read)
# Exits:     1 when neither a job id nor --list is given.
cmd_logs() {
  # Validate before picking an interpreter so a usage error is reported even
  # when the Python environment is not set up.
  if [[ $# -eq 0 || -z "${1:-}" ]]; then
    echo "logs requires <job_id> or --list" >&2
    exit 1
  fi
  PY="$(pick_python)"
  "$PY" "$SCRIPT_DIR/scripts/registry.py" logs "$@"
}
|
||||
|
||||
# wait: run job_subscriber.py in the foreground, either for one job (--job) or
# with --wait-any when no job id was given, passing --timeout through.
cmd_wait() {
  parse_opts "$@"
  PY="$(pick_python)"
  local -a target
  if [[ -z "$JOB_ID" ]]; then
    target=(--wait-any)
  else
    target=(--job "$JOB_ID")
  fi
  "$PY" "$SCRIPT_DIR/scripts/job_subscriber.py" --registry-dir "$REGISTRY_DIR" \
    "${target[@]}" --timeout "$TIMEOUT"
}
|
||||
|
||||
# Entry point: dispatch the first word to the matching cmd_* handler with the
# remaining arguments. No command or a help flag prints usage; an unknown
# command prints usage and exits 1.
main() {
  local sub="${1:-}"
  if (( $# > 0 )); then
    shift
  fi
  case "$sub" in
    submit|status|list|verify|wait|logs)
      "cmd_$sub" "$@"
      ;;
    ""|-h|--help|help)
      usage
      ;;
    *)
      echo "unknown command: $sub" >&2
      usage
      exit 1
      ;;
  esac
}
|
||||
|
||||
main "$@"
|
||||
@@ -0,0 +1,236 @@
|
||||
---
|
||||
name: tmux-agent-orchestrate-monitor
|
||||
description: "Run a long-lived Kanban worker that polls .hermes/agent-sessions.yaml against the actual tmux/agent runtime state and reconciles them. Use when you want live visibility into which agent sessions are running, which are dead, which have stale YAML entries, and which have new session ids that haven't been recorded yet. Designed to be dispatched as a Kanban goal_mode task (--goal) so it keeps running until the user stops it."
|
||||
version: 1.0.0
|
||||
author: godopu
|
||||
license: MIT
|
||||
platforms: [linux, macos]
|
||||
environments: [kanban, terminal, tmux]
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [agent, tmux, claude, antigravity, agy, monitor, kanban, observation, reconciliation]
|
||||
related_skills: [tmux-agent-orchestrate-create, tmux-agent-orchestrate-resume, tmux-agent-orchestrate-stop, kanban-orchestrator]
|
||||
prereq_skills: [kanban-worker, tmux-agent-orchestrate-create]
|
||||
---
|
||||
|
||||
# Agent Sessions Monitor — Live Reconciliation via Kanban Worker
|
||||
|
||||
> **Companion skills**: `tmux-agent-orchestrate-create` / `tmux-agent-orchestrate-resume` / `tmux-agent-orchestrate-stop` (mutators); this skill is the **observer**.
|
||||
> **Single source of truth**: `./.hermes/agent-sessions.yaml`.
|
||||
|
||||
## What this skill does
|
||||
|
||||
Dispatch a **Kanban worker** (in `goal_mode`) that:
|
||||
|
||||
1. Every ~30s polls the actual state of:
|
||||
- `tmux ls` (which sessions are alive)
|
||||
- `tmux list-panes -t <session> ...` (pane cmd, cwd, pid)
|
||||
- `~/.claude/projects/<workspace-key>/*.jsonl` mtime + first-line sessionId
|
||||
- `~/.gemini/antigravity-cli/cache/last_conversations.json` (agy workspace → conversation mapping)
|
||||
- `~/.gemini/antigravity-cli/conversations/<uuid>.db` mtime (agy)
|
||||
2. Compares the live state to `agent-sessions.yaml`
|
||||
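3. Classifies each session against the YAML. The first case below is a deliberate end state rather than drift; the other three are drift. A newly materialized session id is also recorded (class C under "Drift classes"):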
|
||||
- **yaml-only terminated/archived/stopped**: tmux dead, YAML says `terminated`, `archived`, or `stopped` → OK, left untouched (deliberate end states)
|
||||
- **yaml-only running, tmux dead**: YAML says `running`, tmux is gone → mark `terminated` with timestamp
|
||||
- **tmux-only running, not in YAML**: tmux session exists with `<workspace>-creator-*` naming but YAML doesn't know about it → register as a new entry
|
||||
- **stale UUID**: YAML has a UUID, but the on-disk artifact is gone → flag in comment
|
||||
4. Writes a Kanban `kanban_comment` on every drift event with diff details
|
||||
5. Heartbeat every 5 minutes
|
||||
6. **Goal loop**: judge (auxiliary model) re-checks the card after each turn against the body to decide "is monitoring still wanted?". When the user says "stop monitoring" via comment, the worker blocks with `reason=stop-requested`.
|
||||
|
||||
## When to use
|
||||
|
||||
- You have multiple workspaces with tmux agent sessions and want a single source of truth
|
||||
- You suspect YAML drift after a host reboot / crash
|
||||
- You want a notification when a session id was just created (so you can record it before next restart)
|
||||
- You're running multi-day work and want to know "what's actually running right now"
|
||||
|
||||
## When NOT to use
|
||||
|
||||
- One-off interactive session — just check `tmux ls` and read the YAML
|
||||
- A single, short session — overhead > benefit
|
||||
- You don't have a Kanban dispatcher running
|
||||
|
||||
## Dispatching the monitor
|
||||
|
||||
```bash
|
||||
# Goal-mode task: keeps running until the user signals stop
|
||||
hermes kanban create \
|
||||
--title "agent-sessions monitor (live reconcile)" \
|
||||
--assignee default \
|
||||
--workspace worktree \
|
||||
--branch wt/tmux-agent-orchestrate-monitor \
|
||||
--goal \
|
||||
--goal-max-turns 100 \
|
||||
--max-runtime 8h \
|
||||
--max-retries 1 \
|
||||
--skill tmux-agent-orchestrate-monitor \
|
||||
--body "$(cat <<'EOF'
|
||||
You are the agent-sessions monitor. Every 30 seconds, do:
|
||||
|
||||
1. Read .hermes/agent-sessions.yaml
|
||||
2. Run `tmux ls` and `tmux list-panes -F 'session=#{session_name} pid=#{pane_pid} cmd=#{pane_current_command} cwd=#{pane_current_path}'`
|
||||
3. For each session in the YAML, check the corresponding tmux state
|
||||
4. For each tmux session matching `*-creator-claude` or `*-creator-agy` that's not in the YAML, register it
|
||||
5. For any drift, call `kanban_comment` with the diff
|
||||
6. Sleep 30 seconds, then repeat
|
||||
|
||||
If the user comments `stop` or `stop monitoring` on this card, call `kanban_block(reason="stop-requested by user")`.
|
||||
|
||||
If you find that a Claude session's `claude_session_id_own` is null but there's a new *.jsonl in the project dir, read the sessionId from the first line and update the YAML.
|
||||
|
||||
Use the helper script at .agents/skills/tmux-agent-orchestrate-monitor/scripts/reconcile.sh for the YAML updates — it handles all the merge logic and writes a structured comment to this card.
|
||||
EOF
|
||||
)"
|
||||
```
|
||||
|
||||
## Helper script: `reconcile.sh`
|
||||
|
||||
The worker calls this script every 30s. It:
|
||||
|
||||
1. Diffs YAML ↔ tmux ↔ disk artifacts
|
||||
2. Updates YAML if needed (only when changes are real, not on every poll — avoids spamming)
|
||||
3. Emits a JSON diff to stdout that the worker turns into a `kanban_comment`
|
||||
|
||||
```bash
|
||||
# Reconcile + auto-update YAML (atomic, flock-guarded). Emits JSON drift to stdout.
|
||||
bash .agents/skills/tmux-agent-orchestrate-monitor/scripts/reconcile.sh --once --emit-diff
|
||||
|
||||
# Read-only: compute drift WITHOUT writing the YAML (use for "what's running?" checks).
|
||||
bash .agents/skills/tmux-agent-orchestrate-monitor/scripts/reconcile.sh --once --emit-diff --dry-run
|
||||
|
||||
# Push-based MQTT Monitor: listen to delegated job events on the broker and update the YAML instantly.
|
||||
# Bounded run that exits after 5 min idle, or 1 h wall-clock; falls back to polling if the broker is down.
|
||||
bash .agents/skills/tmux-agent-orchestrate-monitor/scripts/reconcile.sh --subscribe --idle-timeout 300 --timeout 3600
|
||||
|
||||
# Persistent monitor (no timeouts): runs until interrupted; still polls if the broker is unreachable.
|
||||
bash .agents/skills/tmux-agent-orchestrate-monitor/scripts/reconcile.sh --subscribe --idle-timeout 0
|
||||
```
|
||||
|
||||
Flags: `--once` (single pass), `--emit-diff` (print JSON), `--dry-run` (P1-E — no mutation), `--subscribe` (push-based MQTT subscription monitoring). `--subscribe` sub-flags: `--timeout N` (exit after N seconds of wall-clock; `0` = no limit, default), `--idle-timeout N` (exit after N seconds with no message; default `3600`, `0` = never idle-out). On a broker connection failure (connect error **or** non-zero CONNACK), `--subscribe` falls back to a polling loop that re-runs `--once --emit-diff` every `RECONCILE_POLL_INTERVAL` (default 15) seconds until `--timeout`. Terminal-event YAML updates are written through `lib.sh::atomic_dump_yaml` (flock + schema-validate + `.bak`). There are **no** `--workspace` / `--agent` / `--comment-card` flags; the worker turns the emitted JSON `drifts[]` into `kanban_comment` calls itself.
|
||||
|
||||
## Drift classes (what the script handles)
|
||||
|
||||
### Status Enum
|
||||
The `status` and `last_visible_status` fields MUST be one of the following exact strings: `running`, `stopped`, `terminated`, `archived`.
|
||||
Any unstructured comments or reasons for the status change should be placed in `last_visible_note` or `termination_mode`.
|
||||
|
||||
### A. tmux dead, YAML says running → auto-terminate
|
||||
|
||||
```
|
||||
YAML: status=running, pane.pid=201132, cmd=claude
|
||||
tmux: no session
|
||||
→ set status=terminated, terminated_at=<now>, termination_mode=auto-detected
|
||||
→ comment: "lab-landing-page-creator-claude: tmux gone (was pane 201132, cmd claude). Marked terminated."
|
||||
```
|
||||
|
||||
**Skip-set**: the auto-terminate only fires for sessions whose status is `running`.
|
||||
Rows already in a deliberate end state — `terminated`, `archived`, or **`stopped`**
|
||||
(set by `tmux-agent-orchestrate-stop`) — are
|
||||
left untouched. This is critical: a `stopped` row keeps its `resumable: true` and
|
||||
captured `*_session_id_own`, so the monitor must **not** overwrite it with
|
||||
`terminated ("auto-detected")` when its tmux is (expectedly) gone.
|
||||
|
||||
### B. tmux alive, not in YAML → auto-register
|
||||
|
||||
```
|
||||
tmux: session=lab-paper-pdf2md-creator-agy, pid=...,
|
||||
cmd=agy, cwd=$WORKSPACE_ROOT/paper-pdf2md
|
||||
YAML: no such session
|
||||
→ register as new entry: status=running, last_visible_status=running, last_visible_note=auto-registered
|
||||
→ comment: "lab-paper-pdf2md-creator-agy: tmux found but not in YAML. Auto-registered."
|
||||
```
|
||||
|
||||
### C. New session id materializes (claude first message sent)
|
||||
|
||||
```
|
||||
YAML: claude_session_id_own=null (placeholder)
|
||||
disk: ~/.claude/projects/.../b3a7...c2f.jsonl exists, mtime=now,
|
||||
first line sessionId=b3a7...c2f
|
||||
→ update claude_session_id_own=b3a7...c2f
|
||||
→ comment: "lab-landing-page-creator-claude: session id materialized b3a7...c2f"
|
||||
```
|
||||
|
||||
### D. Stale UUID (artifact gone)
|
||||
|
||||
```
|
||||
YAML: agent_identities.claude.session_id=87dc548e-...
|
||||
disk: ~/.claude/projects/.../87dc548e-...jsonl: missing
|
||||
→ flag in comment, but DO NOT delete from YAML
|
||||
(the user may have moved the file or the disk may be temporarily unavailable;
|
||||
only `--purge-conversation` should remove the id)
|
||||
```
|
||||
|
||||
## Pitfalls
|
||||
|
||||
- **Don't run the monitor without `--goal`** — without goal mode, a single turn will spawn, do one reconcile, and complete. Goal mode keeps the worker alive across many turns.
|
||||
- **The 30s poll is a default** — workers may override if they detect heavy churn. A workspace with 5+ agent sessions should bump to 60s to avoid noise.
|
||||
- **`kanban_comment` rate limits** — Kanban may throttle if you comment too fast. Coalesce: only comment when the diff is *new* (not the same drift on every poll). The script tracks a state file at `.cache/tmux-agent-orchestrate-monitor/<workspace>.state` in the workspace root for this (overridable via `AGENT_SESSIONS_STATE_DIR`).
|
||||
- **Don't fight the user's explicit action** — if `tmux-agent-orchestrate-stop` is mid-flight and the monitor sees the same session in two states within 5s, prefer the user's most recent action. The monitor should not auto-revert a fresh `terminated` to `running` because of a stale `tmux has-session` check.
|
||||
- **The monitor should never modify the conversation artifacts** (jsonl, db) — only the YAML. If you see a stale UUID, comment about it but don't delete the file.
|
||||
- **TUI capture-pane is expensive** — only capture when you need to update `last_visible_status`, not every poll.
|
||||
|
||||
## Worker body template (for `hermes kanban create --body`)
|
||||
|
||||
The `--body` of the dispatched task IS the worker's behavior spec. Here's a tested template:
|
||||
|
||||
```markdown
|
||||
# agent-sessions monitor
|
||||
|
||||
## Loop (every 30s)
|
||||
|
||||
1. Read agent-sessions.yaml
|
||||
2. Bash: `bash .agents/skills/tmux-agent-orchestrate-monitor/scripts/reconcile.sh --once --emit-diff`
|
||||
3. Parse the JSON diff from stdout
|
||||
4. If `drifts` is non-empty:
|
||||
- For each drift, call `kanban_comment` with the diff message
|
||||
5. Bash: `sleep 30`
|
||||
6. Heartbeat every 5 min: `kanban_heartbeat(progress="alive, N drifts detected, last at <time>")`
|
||||
|
||||
## Stop condition
|
||||
|
||||
If `$HERMES_KANBAN_TASK` card has any comment containing "stop" or "stop monitoring" from a user:
|
||||
- Call `kanban_block(reason="stop-requested by user at <timestamp>")`
|
||||
|
||||
## Drift responses
|
||||
|
||||
- A. tmux dead + YAML running: auto-terminate YAML, comment
|
||||
- B. tmux alive not in YAML: auto-register, comment
|
||||
- C. New session id from *.jsonl: update YAML, comment
|
||||
- D. Stale UUID: comment only, no YAML change
|
||||
|
||||
## Hard rules
|
||||
|
||||
- Do NOT modify conversation artifacts (jsonl, db, brain/)
|
||||
- Do NOT spawn/delete tmux sessions — that's the create/delete skills' job
|
||||
- Do NOT call tmux-agent-orchestrate-create or tmux-agent-orchestrate-stop — only the user initiates those
|
||||
- Do NOT call `git commit` / `git push`
|
||||
```
|
||||
|
||||
## Security: --subscribe on Public Brokers
|
||||
|
||||
When using `--subscribe` with the default PoC public broker
|
||||
(`broker.hivemq.com:1883`), be aware that:
|
||||
|
||||
1. **Wildcard subscription** means anyone can publish events to your job topics.
|
||||
2. **Auto-kill on terminal events** means a spoofed `completed` or `error`
|
||||
event from a third party can terminate your agent session.
|
||||
3. **Mitigation**: Use `--subscribe` only on private TLS-enabled brokers
|
||||
(production mode). For PoC, prefer polling-based monitor (`--once` or
|
||||
no `--subscribe`) which reads YAML/tmux state directly without MQTT.
|
||||
4. **HMAC verification**: Events are now verified via `verify_hmac()` in
|
||||
`mqtt_common.py` (see FW-05). Ensure `auth_token` is set for each job
|
||||
to enable signature validation — unauthenticated events will be dropped.
|
||||
|
||||
## Verification (one-shot)
|
||||
|
||||
```bash
|
||||
# Run reconcile once and inspect output
|
||||
bash .agents/skills/tmux-agent-orchestrate-monitor/scripts/reconcile.sh --emit-diff --once \
|
||||
| python3 -m json.tool
|
||||
```
|
||||
|
||||
## Related skills
|
||||
|
||||
- `kanban-worker` — base lifecycle for the dispatched worker
|
||||
- `kanban-orchestrator` — if you want to dispatch this monitor *from* an orchestrator, use this to know how to phrase the body
|
||||
@@ -0,0 +1,483 @@
|
||||
#!/usr/bin/env bash
|
||||
# reconcile.sh — helper script for tmux-agent-orchestrate-monitor.
# Detects drift between the YAML, tmux and on-disk artifacts (and updates the
# YAML automatically).
#
# Usage:
#   bash reconcile.sh --once --emit-diff            # detect drift + update
#   bash reconcile.sh --once --emit-diff --dry-run  # compute drift only, no writes (P1-E)
#
# --dry-run: read-only, no side effects. Safe for "what is running right now?"
#   checks. The tmux-agent-orchestrate-status skill reuses it.
#
# Output (JSON): {timestamp, yaml_path, tmux_sessions_alive, tmux_confirmed, drifts, actions}
#
# Exit codes: 0 = ok | 1 = YAML not found | 2 = error
|
||||
set -euo pipefail
|
||||
|
||||
source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
|
||||
|
||||
STATE_DIR="${AGENT_SESSIONS_STATE_DIR:-$WORKSPACE_ROOT/.cache/tmux-agent-orchestrate-monitor}"
|
||||
|
||||
# Option defaults; overwritten by the argument loop below.
ONCE=0
EMIT_DIFF=0
DRY_RUN=0
SUBSCRIBE=0
# --subscribe controls (review item 4): 0 = no overall timeout; idle default 3600s
# (raised from 600s to align with job timeout defaults); idle 0 = never idle-out.
SUB_TIMEOUT=0
SUB_IDLE_TIMEOUT=3600
POLL_INTERVAL="${RECONCILE_POLL_INTERVAL:-15}"

while [ $# -gt 0 ]; do
  case "$1" in
    --once) ONCE=1; shift ;;
    --emit-diff) EMIT_DIFF=1; shift ;;
    --dry-run) DRY_RUN=1; shift ;;
    --subscribe) SUBSCRIBE=1; shift ;;
    --timeout|--idle-timeout)
      # Both values are later parsed with int() by the subscribe loop. Reject a
      # missing or non-numeric value here with a clear message instead of an
      # "unbound variable" abort or a Python traceback.
      case "${2:-}" in
        ''|*[!0-9]*)
          echo "ERROR: $1 requires a non-negative integer (seconds)" >&2
          exit 2
          ;;
      esac
      if [ "$1" = "--timeout" ]; then
        SUB_TIMEOUT="$2"
      else
        SUB_IDLE_TIMEOUT="$2"
      fi
      shift 2
      ;;
    -h|--help) echo "Usage: $0 [--once] [--emit-diff] [--dry-run] [--subscribe [--timeout N] [--idle-timeout N]]"; exit 0 ;;
    *) echo "ERROR: unknown arg: $1" >&2; exit 2 ;;
  esac
done
|
||||
|
||||
[ -f "$AGENT_SESSIONS_YAML" ] || { echo "ERROR: $AGENT_SESSIONS_YAML not found" >&2; exit 1; }
|
||||
|
||||
if [ "$SUBSCRIBE" = "1" ]; then
|
||||
# Paths resolved relative to this script (review item 6): skills/ dir + lib.sh.
|
||||
SKILLS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
||||
LIB_SH="$SKILLS_DIR/lib.sh"
|
||||
# MQTT client lives in the project venv (has paho). All YAML work is delegated
|
||||
# to lib.sh::atomic_dump_yaml, which runs the system python3 (has PyYAML) — so
|
||||
# no single interpreter needs both paho and PyYAML (review items 4/5/6).
|
||||
PYBIN="$(_delegate_py_bin)"
|
||||
|
||||
# The MQTT subscribe loop exits 3 to signal "broker unavailable → poll instead".
|
||||
set +e
|
||||
YAML_PATH="$AGENT_SESSIONS_YAML" HOME_DIR="$HOME_DIR" CLAUDE_PROJECT_DIR="$CLAUDE_PROJECT_DIR" LOCAL_BIN="$LOCAL_BIN" \
|
||||
SUB_TIMEOUT="$SUB_TIMEOUT" SUB_IDLE_TIMEOUT="$SUB_IDLE_TIMEOUT" \
|
||||
SKILLS_DIR="$SKILLS_DIR" LIB_SH="$LIB_SH" \
|
||||
"$PYBIN" - <<'PYEOF'
|
||||
import os, sys, json, time, subprocess
|
||||
|
||||
lib_sh = os.environ.get('LIB_SH', '')
|
||||
skills_dir = os.environ.get('SKILLS_DIR', '')
|
||||
timeout = int(os.environ.get('SUB_TIMEOUT', '0') or '0') # 0 = no overall timeout
|
||||
idle_timeout = int(os.environ.get('SUB_IDLE_TIMEOUT', '3600') or '0') # 0 = no idle timeout
|
||||
|
||||
# Locate skills/tmux-agent-orchestrate-delegate-job/scripts to import mqtt_common — relative first, then
|
||||
# an upward walk from cwd. No hardcoded absolute path (review item 6).
|
||||
cand = os.path.join(skills_dir, 'tmux-agent-orchestrate-delegate-job', 'scripts') if skills_dir else ''
|
||||
if cand and os.path.isdir(cand):
|
||||
sys.path.append(cand)
|
||||
else:
|
||||
d = os.getcwd()
|
||||
while d and d != '/':
|
||||
hit = None
|
||||
for sub in (('.agents', 'skills', 'tmux-agent-orchestrate-delegate-job', 'scripts'), ('skills', 'tmux-agent-orchestrate-delegate-job', 'scripts'), ('tmux-agent-orchestrate-delegate-job', 'scripts')):
|
||||
p = os.path.join(d, *sub)
|
||||
if os.path.isdir(p):
|
||||
hit = p
|
||||
break
|
||||
if hit:
|
||||
sys.path.append(hit)
|
||||
break
|
||||
d = os.path.dirname(d)
|
||||
|
||||
import mqtt_common
|
||||
|
||||
# Executed INSIDE lib.sh::atomic_dump_yaml (system python3 + PyYAML), under the
|
||||
# YAML flock with schema-validate + .bak (review item 5). Marks matching running
|
||||
# sessions terminated and kills their tmux (review item 3 behaviour preserved),
|
||||
# or aborts the write entirely when nothing matches. The untrusted MQTT job id /
|
||||
# event arrive via env (MQTT_JID / MQTT_EVENT) — never spliced into source (P1-B).
|
||||
_MUTATION = r'''
|
||||
import os, subprocess
|
||||
from datetime import datetime, timezone
|
||||
_jid = os.environ['MQTT_JID']
|
||||
_event = os.environ['MQTT_EVENT']
|
||||
_now = datetime.now(timezone.utc)
|
||||
_changed = False
|
||||
for s in d.get('tmux_sessions', []):
|
||||
if s.get('delegate_job_id') == _jid and s.get('status') == 'running':
|
||||
s['status'] = 'terminated'
|
||||
s['terminated_at'] = _now.strftime('%Y-%m-%dT%H:%M:%SZ')
|
||||
s['terminated_at_epoch'] = int(_now.timestamp())
|
||||
s['termination_mode'] = 'auto-detected (MQTT ' + _event + ')'
|
||||
_name = s.get('name')
|
||||
_srv = s.get('tmux_server') or 'default'
|
||||
_cmd = ['tmux'] + (['-L', _srv] if _srv != 'default' else []) + ['kill-session', '-t', _name]
|
||||
subprocess.run(_cmd, capture_output=True)
|
||||
print('MQTT Monitor: terminated + killed ' + str(_name) + ' on ' + str(_srv), flush=True)
|
||||
_changed = True
|
||||
if not _changed:
|
||||
raise SystemExit(0) # nothing matched — skip the write entirely
|
||||
'''
|
||||
|
||||
|
||||
def handle_terminal(jid, event):
    """Apply a terminal MQTT event to the sessions YAML through lib.sh.

    Sources ``$LIB_SH`` in a bash child and pipes ``_MUTATION`` to
    ``atomic_dump_yaml`` on stdin. The job id and event name travel only as
    the ``MQTT_JID`` / ``MQTT_EVENT`` environment variables, never as source
    text. Whatever the child prints on stdout is echoed; its stderr is echoed
    only when the child exits non-zero. Returns nothing.
    """
    have_lib = bool(lib_sh) and os.path.isfile(lib_sh)
    if not have_lib:
        print('MQTT Monitor: lib.sh not found, cannot update YAML', flush=True)
        return

    # Child environment = ours plus the two values the mutation snippet reads.
    child_env = {**os.environ, 'MQTT_JID': jid, 'MQTT_EVENT': event}
    script = 'source "$LIB_SH"; atomic_dump_yaml "$YAML_PATH" MQTT_JID="$MQTT_JID" MQTT_EVENT="$MQTT_EVENT"'
    proc = subprocess.run(
        ['bash', '-c', script],
        input=_MUTATION,
        text=True,
        env=child_env,
        capture_output=True,
    )

    out_text = (proc.stdout or '').strip()
    if out_text:
        print(out_text, flush=True)

    err_text = (proc.stderr or '').strip()
    if proc.returncode != 0 and err_text:
        print('MQTT Monitor: atomic_dump_yaml stderr: ' + err_text, flush=True)
|
||||
|
||||
|
||||
state = {'last_msg': time.time(), 'connected': False, 'failed': False}
|
||||
|
||||
|
||||
def on_message(_client, _userdata, msg):
    """MQTT message callback.

    Records the arrival time in ``state['last_msg']`` (which drives the idle
    timeout), decodes the payload as UTF-8 JSON and, when it carries a
    ``job_id`` together with a ``completed`` or ``error`` event, forwards both
    to ``handle_terminal``. Any exception is reported on stdout and swallowed.
    """
    state['last_msg'] = time.time()
    try:
        body = json.loads(msg.payload.decode("utf-8"))
        job = body.get("job_id")
        kind = body.get("event")
        if not (job and kind in ("completed", "error")):
            return
        print(f"MQTT Monitor: received terminal event {kind} for job {job}", flush=True)
        handle_terminal(job, kind)
    except Exception as e:
        print(f"MQTT Monitor error parsing message: {e}", flush=True)
|
||||
|
||||
|
||||
def on_connect(_c, _u, _flags, reason_code, _props):
    """MQTT connect callback.

    Converts the reason code with ``mqtt_common.reason_code_value``. A value
    of 0 marks ``state['connected']`` and subscribes to the job-events topic
    at QoS 1; any other value marks ``state['failed']``.
    """
    code = mqtt_common.reason_code_value(reason_code)
    accepted = code == 0
    if not accepted:
        state['failed'] = True
        print(f"MQTT Monitor connection failed: rc={code}", flush=True)
        return

    state['connected'] = True
    topic = "python/mqtt/jobs/+/events"
    _c.subscribe(topic, qos=1)
    print(f"MQTT Monitor: subscribed to {topic}", flush=True)
|
||||
|
||||
|
||||
cfg = mqtt_common.broker_config_from_env()
|
||||
client = mqtt_common.make_client("monitor_sub", cfg)
|
||||
client.on_message = on_message
|
||||
client.on_connect = on_connect
|
||||
print(f"MQTT Monitor: connecting to {cfg.host}:{cfg.port} (TLS={cfg.tls})...", flush=True)
|
||||
|
||||
# Connection failure → fall back to polling (review item 4).
|
||||
try:
|
||||
client.connect(cfg.host, cfg.port, cfg.keepalive)
|
||||
except Exception as e:
|
||||
print(f"MQTT Monitor: connect failed ({e}); falling back to polling", flush=True)
|
||||
sys.exit(3)
|
||||
|
||||
client.loop_start()
|
||||
_wait = time.time()
|
||||
while time.time() - _wait < 5 and not state['connected'] and not state['failed']:
|
||||
time.sleep(0.1)
|
||||
if not state['connected']:
|
||||
print("MQTT Monitor: broker did not accept connection; falling back to polling", flush=True)
|
||||
client.loop_stop()
|
||||
sys.exit(3)
|
||||
|
||||
start = time.time()
|
||||
try:
|
||||
while True:
|
||||
now = time.time()
|
||||
if timeout and (now - start) >= timeout:
|
||||
print(f"MQTT Monitor: --timeout {timeout}s reached, exiting", flush=True)
|
||||
break
|
||||
if idle_timeout and (now - state['last_msg']) >= idle_timeout:
|
||||
print(f"MQTT Monitor: --idle-timeout {idle_timeout}s reached, exiting", flush=True)
|
||||
break
|
||||
time.sleep(0.5)
|
||||
finally:
|
||||
client.loop_stop()
|
||||
try:
|
||||
client.disconnect()
|
||||
except Exception:
|
||||
pass
|
||||
sys.exit(0)
|
||||
PYEOF
|
||||
sub_rc=$?
set -e

# Exit status of the subscribe loop above:
#   0     → it ran and stopped on one of its timeouts
#   3     → broker unavailable; degrade to periodic `--once` reconcile passes
#   other → the loop itself failed (bad argument, import error, crash, …).
#           That used to be hidden behind the unconditional `exit 0` below;
#           it is now reported and propagated so the caller can see it.
if [ "$sub_rc" = "3" ]; then
  echo "MQTT Monitor: broker unavailable — falling back to polling (interval ${POLL_INTERVAL}s)" >&2
  _self="$SKILLS_DIR/tmux-agent-orchestrate-monitor/scripts/reconcile.sh"
  _start=$(date +%s)
  while :; do
    # Best effort: one failed pass must not end the fallback loop.
    bash "$_self" --once --emit-diff >/dev/null 2>&1 || true
    if [ "$SUB_TIMEOUT" != "0" ] && [ "$(( $(date +%s) - _start ))" -ge "$SUB_TIMEOUT" ]; then
      break
    fi
    sleep "$POLL_INTERVAL"
  done
elif [ "$sub_rc" != "0" ]; then
  echo "ERROR: MQTT subscribe loop exited with status $sub_rc" >&2
  exit "$sub_rc"
fi
exit 0
|
||||
fi
|
||||
|
||||
mkdir -p "$STATE_DIR"
|
||||
|
||||
# 모든 비교 로직을 단일 소스로 둔다. dry-run 은 env_python(읽기전용), 그 외엔
|
||||
# atomic_dump_yaml(flock + temp+rename) 로 같은 소스를 돌린다. atomic 래퍼에서는
|
||||
# 'actions' 가 없으면 SystemExit(0) 으로 쓰기를 건너뛴다 (불필요한 재포맷 방지).
|
||||
read -r -d '' RECON_SRC <<'PYEOF' || true
|
||||
import os, json, glob, subprocess, time
|
||||
from datetime import datetime, timezone
|
||||
import yaml
|
||||
|
||||
yaml_path = os.environ['YAML_PATH']
|
||||
home = os.environ['HOME_DIR']
|
||||
claude_project_dir = os.environ.get('CLAUDE_PROJECT_DIR', f"{home}/.claude/projects")
|
||||
|
||||
now_iso = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
|
||||
|
||||
# atomic 래퍼에서는 d 가 이미 로드돼 있음. env_python(dry-run)에서는 여기서 로드.
|
||||
try:
|
||||
d
|
||||
except NameError:
|
||||
import sqlite3
|
||||
db_path = os.path.splitext(yaml_path)[0] + '.db'
|
||||
d = {}
|
||||
try:
|
||||
if os.path.exists(db_path):
|
||||
conn = sqlite3.connect(db_path, timeout=10.0)
|
||||
row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
|
||||
if row: d = json.loads(row[0])
|
||||
|
||||
try:
|
||||
db_sessions = []
|
||||
cursor = conn.execute('SELECT data FROM sessions')
|
||||
for s_row in cursor.fetchall():
|
||||
db_sessions.append(json.loads(s_row[0]))
|
||||
d['tmux_sessions'] = db_sessions
|
||||
except sqlite3.OperationalError:
|
||||
pass
|
||||
conn.close()
|
||||
elif os.path.exists(yaml_path):
|
||||
with open(yaml_path) as f:
|
||||
d = yaml.safe_load(f) or {}
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
drifts = []
|
||||
actions = []
|
||||
|
||||
# === 현재 tmux 상태 — transient 실패를 'no sessions' 와 구분 (P1-E) ===
|
||||
tmux_sessions = []
|
||||
tmux_confirmed = True
|
||||
|
||||
# YAML 에 등록된 고유한 tmux_server 목록 수집 + 환경변수 TMUX_SERVER_NAME 포함
|
||||
unique_servers = {'default'}
|
||||
if 'TMUX_SERVER_NAME' in os.environ:
|
||||
unique_servers.add(os.environ['TMUX_SERVER_NAME'])
|
||||
for s in d.get('tmux_sessions', []):
|
||||
srv = s.get('tmux_server') or 'default'
|
||||
unique_servers.add(srv)
|
||||
|
||||
# Ask every known tmux server for its sessions. A server that is simply not
# running (or has no sessions) counts as "confirmed empty"; any other failure
# clears tmux_confirmed so the drift passes that depend on it are skipped.
try:
    for srv in sorted(unique_servers):
        cmd = ['tmux']
        if srv != 'default':
            cmd += ['-L', srv]
        cmd += ['ls', '-F', '#{session_name}|#{session_created}']
        r = subprocess.run(cmd, capture_output=True, text=True)
        if r.returncode == 0:
            for line in r.stdout.strip().split('\n'):
                if not line:
                    continue
                # Split from the right: the creation time is always the last
                # field, while the session name itself may contain '|'.
                name, created = line.rsplit('|', 1)
                tmux_sessions.append({'name': name, 'created': int(created), 'server': srv})
        else:
            err = (r.stderr or '').lower()
            is_empty = ('no server running' in err) or ('no sessions' in err) or ('failed to connect' in err)
            if not is_empty:
                tmux_confirmed = False
except Exception:
    tmux_confirmed = False
|
||||
|
||||
|
||||
def pane_meta(session, srv):
    """Describe the first pane of a tmux session.

    Args:
        session: tmux session name passed to ``list-panes -t``.
        srv: tmux server name; ``'default'`` means no ``-L`` option is added.

    Returns:
        ``{'pid': int, 'cwd': str, 'cmd': str}`` for the first pane listed, or
        ``None`` when tmux fails or its output cannot be parsed.
    """
    try:
        cmd = ['tmux']
        if srv != 'default':
            cmd += ['-L', srv]
        cmd += ['list-panes', '-t', session, '-F',
                '#{pane_pid}|#{pane_current_path}|#{pane_current_command}']
        out = subprocess.check_output(cmd, text=True)
        first = out.strip().split('\n')[0]
        # The path sits in the middle and may itself contain '|': take the pid
        # from the left and the command from the right, the rest is the cwd.
        pid, rest = first.split('|', 1)
        cwd, command = rest.rsplit('|', 1)
        return {'pid': int(pid), 'cwd': cwd, 'cmd': command}
    except Exception:
        return None
|
||||
|
||||
|
||||
yaml_sessions = d.get('tmux_sessions', [])
|
||||
yaml_session_names = {s['name'] for s in yaml_sessions if s.get('name')}
|
||||
alive_set = {(t['name'], t.get('server', 'default')) for t in tmux_sessions}
|
||||
|
||||
# === drift A: tmux dead + YAML running → auto-terminate ===
|
||||
# tmux 응답을 확정했을 때만. transient 실패 시 모두 terminated 로 마크하지 않음 (P1-E)
|
||||
if tmux_confirmed:
|
||||
for s in yaml_sessions:
|
||||
name = s.get('name')
|
||||
if not name:
|
||||
continue
|
||||
# 'stopped' 도 deliberate한 종료 상태 — drift 로 보지 않고 그대로 둔다.
|
||||
# (없으면 tmux-dead stopped 세션을 'terminated' 로 덮어써 resumable 플래그가 소실됨)
|
||||
if s.get('status') in ('terminated', 'archived', 'stopped'):
|
||||
continue
|
||||
srv = s.get('tmux_server') or 'default'
|
||||
if (name, srv) not in alive_set:
|
||||
s['status'] = 'terminated'
|
||||
s['terminated_at'] = now_iso
|
||||
s['terminated_at_epoch'] = int(datetime.now(timezone.utc).timestamp())
|
||||
s['termination_mode'] = 'auto-detected (tmux gone)'
|
||||
pane = s.get('pane') or {}
|
||||
drifts.append({'class': 'A', 'name': name,
|
||||
'msg': f"{name}: tmux gone (was pane {pane.get('pid')}, cmd {pane.get('cmd')}). Marked terminated."})
|
||||
actions.append(f"terminated: {name}")
|
||||
|
||||
# === drift B: tmux alive + not in YAML → auto-register ===
|
||||
if tmux_confirmed:
|
||||
for t in tmux_sessions:
|
||||
name = t['name']
|
||||
if name in yaml_session_names:
|
||||
continue
|
||||
if not (name.endswith('-creator-claude') or name.endswith('-creator-agy')):
|
||||
continue
|
||||
srv = t.get('server', 'default')
|
||||
pm = pane_meta(name, srv)
|
||||
if not pm:
|
||||
continue
|
||||
agent = 'claude' if name.endswith('-creator-claude') else 'agy'
|
||||
cmd_full = 'claude' if agent == 'claude' else 'agy --dangerously-skip-permissions'
|
||||
server_opt = f"-L {srv} " if srv != 'default' else ""
|
||||
entry = {
|
||||
'name': name,
|
||||
'status': 'running',
|
||||
'tmux_session_created_at': datetime.fromtimestamp(t['created'], tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
|
||||
'tmux_session_epoch': t['created'],
|
||||
'tmux_server': srv,
|
||||
'pane': {'index': 0, 'pid': pm['pid'], 'cmd': agent, 'cmd_full': cmd_full, 'cwd': pm['cwd']},
|
||||
# P2: cwd 인용
|
||||
'start_command': f'tmux {server_opt}new-session -d -s "{name}" -x 140 -y 40 -c "{pm["cwd"]}" "{cmd_full}"',
|
||||
'attach_command': f'tmux {server_opt}attach -t {name}',
|
||||
'kill_command': f'tmux {server_opt}kill-session -t {name}',
|
||||
'last_visible_status': 'running',
|
||||
'last_visible_note': 'auto-registered by monitor',
|
||||
}
|
||||
if agent == 'claude':
|
||||
entry['tui'] = {'model': '(unknown — capture after first message)', 'provider': 'anthropic',
|
||||
'plan': '(unknown)', 'account': '(unknown)', 'version': '(unknown)'}
|
||||
entry['claude_session_id_own'] = None
|
||||
else:
|
||||
entry['child_pid'] = 0
|
||||
entry['agy_conversation_id_own'] = None
|
||||
entry['mcp_attachments'] = [
|
||||
{
|
||||
'name': 'stitch',
|
||||
'transport': 'mcp-remote',
|
||||
'endpoint': 'https://stitch.googleapis.com/mcp'
|
||||
}
|
||||
]
|
||||
d.setdefault('tmux_sessions', []).append(entry)
|
||||
yaml_session_names.add(name)
|
||||
drifts.append({'class': 'B', 'name': name,
|
||||
'msg': f"{name}: tmux found but not in YAML. Auto-registered (pane {pm['pid']}, cmd {pm['cmd']}, cwd {pm['cwd']})."})
|
||||
actions.append(f"registered: {name}")
|
||||
|
||||
# === drift C: claude 새 session id materialize (per-row own id) ===
|
||||
for s in d.get('tmux_sessions', []):
|
||||
if not s.get('name', '').endswith('-creator-claude'):
|
||||
continue
|
||||
if s.get('status') != 'running':
|
||||
continue
|
||||
if s.get('claude_session_id_own'):
|
||||
continue
|
||||
cwd = (s.get('pane') or {}).get('cwd', '')
|
||||
if not cwd:
|
||||
continue
|
||||
proj_key = cwd.replace('/', '-').replace('_', '-')
|
||||
proj_dir = f"{claude_project_dir}/{proj_key}"
|
||||
if not os.path.isdir(proj_dir):
|
||||
continue
|
||||
jsonls = sorted(glob.glob(f"{proj_dir}/*.jsonl"), key=os.path.getmtime, reverse=True)
|
||||
if not jsonls:
|
||||
continue
|
||||
latest = jsonls[0]
|
||||
if time.time() - os.path.getmtime(latest) > 300:
|
||||
continue
|
||||
try:
|
||||
with open(latest) as f:
|
||||
first = f.readline().strip()
|
||||
if not first:
|
||||
continue
|
||||
sid = json.loads(first).get('sessionId')
|
||||
if not sid:
|
||||
continue
|
||||
except Exception:
|
||||
continue
|
||||
s['claude_session_id_own'] = sid
|
||||
drifts.append({'class': 'C', 'name': s['name'], 'msg': f"{s['name']}: session id materialized: {sid}"})
|
||||
actions.append(f"updated session id: {sid}")
|
||||
|
||||
# === drift C (agy): agy 새 session id materialize (per-row own id) ===
|
||||
for s in d.get('tmux_sessions', []):
|
||||
if not s.get('name', '').endswith('-creator-agy'):
|
||||
continue
|
||||
if s.get('status') != 'running':
|
||||
continue
|
||||
if s.get('agy_conversation_id_own'):
|
||||
continue
|
||||
cwd = (s.get('pane') or {}).get('cwd', '')
|
||||
if not cwd:
|
||||
continue
|
||||
lc = f"{home}/.gemini/antigravity-cli/cache/last_conversations.json"
|
||||
if os.path.exists(lc):
|
||||
try:
|
||||
with open(lc) as f:
|
||||
lc_data = json.load(f)
|
||||
cid = lc_data.get(cwd)
|
||||
if cid and os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{cid}.db"):
|
||||
s['agy_conversation_id_own'] = cid
|
||||
drifts.append({'class': 'C', 'name': s['name'], 'msg': f"{s['name']}: conversation id materialized: {cid}"})
|
||||
actions.append(f"updated conversation id: {cid}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# === drift D: stale UUID (cache 의 artifact 가 사라짐) — 보고만, 변경 없음 ===
|
||||
ai = d.get('agent_identities', {}) or {}
|
||||
cl = (ai.get('claude') or {})
|
||||
if cl.get('session_id'):
|
||||
sid = cl['session_id']
|
||||
if not glob.glob(f"{claude_project_dir}/*/{sid}.jsonl"):
|
||||
drifts.append({'class': 'D', 'name': '(claude identity cache)',
|
||||
'msg': f"stale UUID in agent_identities.claude.session_id: {sid} (jsonl missing)"})
|
||||
ag = (ai.get('agy') or {})
|
||||
if ag.get('conversation_id'):
|
||||
cid = ag['conversation_id']
|
||||
if not os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{cid}.db"):
|
||||
drifts.append({'class': 'D', 'name': '(agy identity cache)',
|
||||
'msg': f"stale UUID in agent_identities.agy.conversation_id: {cid} (.db missing)"})
|
||||
|
||||
result = {
|
||||
'timestamp': now_iso,
|
||||
'yaml_path': yaml_path,
|
||||
'tmux_sessions_alive': sorted(f"{t['name']}|{t.get('server', 'default')}" for t in tmux_sessions),
|
||||
'tmux_confirmed': tmux_confirmed,
|
||||
'drifts': drifts,
|
||||
'actions': actions,
|
||||
}
|
||||
print(json.dumps(result, indent=2, ensure_ascii=False))
|
||||
|
||||
# atomic 래퍼: actions 가 없으면 쓰기를 건너뛴다. env_python(dry-run)에선 무해.
|
||||
if not actions:
|
||||
raise SystemExit(0)
|
||||
PYEOF
|
||||
|
||||
if [ "$DRY_RUN" = "1" ]; then
|
||||
printf '%s' "$RECON_SRC" | env_python "$AGENT_SESSIONS_YAML"
|
||||
else
|
||||
printf '%s' "$RECON_SRC" | atomic_dump_yaml "$AGENT_SESSIONS_YAML"
|
||||
fi
|
||||
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env bash
|
||||
# watchdog.sh — tmux-agent-orchestrate-monitor 의 부속 스크립트
|
||||
#
|
||||
# Metadata for SKILL.md:
|
||||
# description: "Watchdog helper that keeps subscriber alive and exits when JOB is done"
|
||||
# usage: "watchdog.sh <job_id> <workdir> [--help]"
|
||||
|
||||
# Argument handling. An explicit help request prints usage and succeeds;
# missing arguments are a usage error (stderr, exit 2).
case "${1:-}" in
  -h | --help)
    echo "Usage: $0 <job_id> <workdir>"
    exit 0
    ;;
esac
if [ $# -lt 2 ]; then
  echo "Usage: $0 <job_id> <workdir>" >&2
  exit 2
fi

JOB_ID="$1"
# Resolve the workdir to an absolute path and fail fast when it does not exist:
# paths derived from it must stay valid after a later `cd`, and `mkdir -p`
# below must not invent a directory tree for a mistyped path.
if ! WORKDIR="$(CDPATH= cd -- "$2" >/dev/null 2>&1 && pwd)"; then
  echo "ERROR: workdir not found: $2" >&2
  exit 1
fi
LOG_DIR="$WORKDIR/.hermes/jobs"

mkdir -p "$LOG_DIR"
|
||||
|
||||
# Write one line to stdout: "[<UTC timestamp>] <all arguments joined>".
log() {
  local stamp
  stamp=$(date -u +'%Y-%m-%dT%H:%M:%SZ')
  printf '[%s] %s\n' "$stamp" "$*"
}
|
||||
|
||||
log "watchdog started for JOB=$JOB_ID workdir=$WORKDIR"
|
||||
|
||||
# Main loop: read the job status, stop once it is terminal, otherwise run one
# time-boxed subscriber and go round again.
while true; do
  # 1) Get current job status with robust Python parsing
  #    (any failure in the pipeline degrades to "unknown").
  STATUS=$(cd "$WORKDIR" && .venv/bin/python .agents/skills/tmux-agent-orchestrate-delegate-job/scripts/registry.py get --job "$JOB_ID" 2>/dev/null | python3 -c '
import sys, json
try:
    data = json.load(sys.stdin)
    print(data.get("status", "unknown"))
except Exception:
    print("unknown")
' 2>/dev/null || echo "unknown")

  log "JOB status: $STATUS"

  # 2) Terminal check
  case "$STATUS" in
    completed|error|permission_required)
      log "JOB reached terminal state ($STATUS), watchdog exiting"
      exit 0
      ;;
  esac

  # 3) Start subscriber (2min hard limit)
  LOG_FILE="$LOG_DIR/subscriber-${JOB_ID}-$(date +%s).log"
  log "starting subscriber (2min hard limit, log: $LOG_FILE)"

  (
    # Open the log before changing directory so a relative WORKDIR still
    # resolves to the intended file.
    exec >"$LOG_FILE" 2>&1
    sub_rc=0
    {
      cd "$WORKDIR" &&
        timeout 120 .venv/bin/python .agents/skills/tmux-agent-orchestrate-delegate-job/scripts/job_subscriber.py \
          --job "$JOB_ID" --timeout 120 --idle-timeout 999999 --registry-dir .hermes/jobs
    } || sub_rc=$?
    echo "[$(date -u +'%Y-%m-%dT%H:%M:%SZ')] subscriber exited"
    # Hand the subscriber's own status to `wait`; without this the subshell
    # would always report the status of the echo above.
    exit "$sub_rc"
  ) &

  SUB_PID=$!
  log "subscriber PID=$SUB_PID"

  # 4) Wait for subscriber to exit or timeout
  wait "$SUB_PID" 2>/dev/null
  EXIT_CODE=$?
  log "subscriber exited code=$EXIT_CODE"

  sleep 1
done
|
||||
@@ -0,0 +1,151 @@
|
||||
---
|
||||
name: tmux-agent-orchestrate-resume
|
||||
description: "Resume an existing agent (claude, antigravity/agy) conversation by UUID into a tmux session. Reads .hermes/agent-sessions.yaml for the saved session/conversation id, spawns (or reuses) a tmux session of the matching name, and runs `claude -r <id>` or `agy --conversation <id>` inside. Use when you want to reattach to a previous session's context, or revive a session whose tmux died but the agent's conversation is still on disk."
|
||||
version: 1.0.0
|
||||
author: godopu
|
||||
license: MIT
|
||||
platforms: [linux, macos]
|
||||
environments: [terminal, tmux]
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [agent, tmux, claude, antigravity, agy, multi-agent, context, resume, session-id]
|
||||
related_skills: [tmux-agent-orchestrate-create, tmux-agent-orchestrate-stop, tmux-agent-orchestrate-monitor, claude-code]
|
||||
prereq_skills: [tmux-agent-orchestrate-create]
|
||||
---
|
||||
|
||||
# Multi-Agent Resume — Reattach to a Saved Conversation
|
||||
|
||||
> **Companion skills**: `tmux-agent-orchestrate-create` (start a fresh agent), `tmux-agent-orchestrate-stop` (terminate), `tmux-agent-orchestrate-monitor` (live status).
|
||||
> **Tmux Isolation**: If the `TMUX_SERVER_NAME` env var was set when the session was created, this skill operates on that same tmux server. See [tmux-agent-orchestrate-create/SKILL.md](../tmux-agent-orchestrate-create/SKILL.md) for the isolation patterns in detail.
|
||||
> **Single source of truth**: `./.hermes/agent-sessions.yaml`.
|
||||
|
||||
## What this skill does
|
||||
|
||||
**Container + data reconstruction**: spawn a tmux session (the container), then run the agent inside with a specific session id (the data) so the previous conversation's context is restored.
|
||||
|
||||
Three cases this skill handles:
|
||||
|
||||
1. **tmux is dead, conversation lives** — `agent-sessions.yaml` has the UUID. The JSONL/db is on disk. Re-spawn the tmux session + run `claude -r <id>` / `agy --conversation <id>`.
|
||||
2. **tmux is alive but empty** — You started a session with `tmux-agent-orchestrate-create` but haven't sent a message yet (so no session id was assigned). The user can either send their first message (and the id is auto-assigned), or you can read the *workspace's* most recent conversation from `$HOME_DIR/.gemini/antigravity-cli/cache/last_conversations.json` (defaults to `~/.gemini/...`) for agy, or the latest `*.jsonl` in `$CLAUDE_PROJECT_DIR/<workspace-key>/` (defaults to `~/.claude/projects/`) for claude.
|
||||
3. **tmux is alive AND the agent inside is already running** — Just attach. No re-spawn needed.
|
||||
|
||||
### Resuming a `stopped` session (`stopped → running`)
|
||||
|
||||
When a session was ended via `tmux-agent-orchestrate-stop` (which captures the ID and gracefully stops by default),
|
||||
its row is `status: stopped` with `resumable: true` and the conversation id
|
||||
already recorded in `claude_session_id_own` / `agy_conversation_id_own`. This is the
|
||||
ideal resume path:
|
||||
|
||||
- **tier-1, race-free**: because the stop command wrote the id into the row at stop
|
||||
time, `resolve_session_id.sh` resolves it via `find_workspace_uuid` tier-1 (the
|
||||
per-row own id) — no reliance on the mtime-based disk scan, so a concurrent
|
||||
session in another workspace can never shadow it.
|
||||
- On resume, `update_yaml_resumed.sh` transitions `stopped → running` and **clears
|
||||
the stop metadata** (`stopped_at`, `stopped_at_epoch`, `stop_reason`, `resumable`)
|
||||
along with the usual `terminated_at*` / `termination_mode` / `archived_at`, so the
|
||||
row reflects a clean running state with no stale end-of-session fields.
|
||||
|
||||
## UUID resolution order
|
||||
|
||||
`agent-sessions.yaml` is the *primary* source. The skill reads in this order:
|
||||
|
||||
1. **`agent-sessions.yaml` → `agent_identities.<agent>.session_id` (claude) / `conversation_id` (agy)** — explicit saved value
|
||||
2. **`agent-sessions.yaml` → `agent_identities.<agent>.session_jsonl` (claude) / `conversation_db` (agy)** — the on-disk artifact
|
||||
3. **Fallback: scan disk for the workspace's most recent conversation** (Note: `CLAUDE_PROJECT_DIR` overrides the default `~/.claude/projects/` path, and `HOME_DIR` overrides the `~` path) —
|
||||
- claude: `ls -t $CLAUDE_PROJECT_DIR/<workspace-key>/*.jsonl | head -1` and parse the `sessionId` from the first line
|
||||
- agy: `jq -r '."<workspace>"' $HOME_DIR/.gemini/antigravity-cli/cache/last_conversations.json`
|
||||
|
||||
If all three are empty → the workspace has no conversation yet. Fall back to `tmux-agent-orchestrate-create`.
|
||||
|
||||
## Workflow
|
||||
|
||||
```bash
|
||||
WORKSPACE=/path/to/project
|
||||
AGENT=claude # or agy or hermes
|
||||
SESSION_NAME=<workspace>-creator-<agent> # same convention as tmux-agent-orchestrate-create
|
||||
|
||||
# 1. Resolve the session id
|
||||
UUID=$(bash .agents/skills/tmux-agent-orchestrate-resume/scripts/resolve_session_id.sh \
|
||||
--workspace "$WORKSPACE" --agent "$AGENT")
|
||||
|
||||
if [ -z "$UUID" ]; then
|
||||
echo "No saved session for $WORKSPACE ($AGENT). Use tmux-agent-orchestrate-create first."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Resolve the isolated tmux server name
|
||||
source .agents/skills/lib.sh
|
||||
export TMUX_SERVER_NAME="$(resolve_tmux_server "$SESSION_NAME")"
|
||||
|
||||
# 2. If tmux is alive, attach. Done.
|
||||
if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
|
||||
echo "tmux '$SESSION_NAME' already running. Attaching..."
|
||||
exec tmux attach -t "$SESSION_NAME"
|
||||
fi
|
||||
|
||||
# 3. Spawn new tmux session + run agent with the saved id
|
||||
case "$AGENT" in
|
||||
claude)
|
||||
tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" \
|
||||
"claude --dangerously-skip-permissions -r $UUID"
|
||||
# auto-handle trust / bypass dialogs
|
||||
sleep 5
|
||||
tmux send-keys -t "$SESSION_NAME" Enter 2>/dev/null || true
|
||||
sleep 3
|
||||
tmux send-keys -t "$SESSION_NAME" Down 2>/dev/null || true
|
||||
sleep 0.3
|
||||
tmux send-keys -t "$SESSION_NAME" Enter 2>/dev/null || true
|
||||
;;
|
||||
agy)
|
||||
tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" \
|
||||
"agy --dangerously-skip-permissions --conversation $UUID"
|
||||
;;
|
||||
hermes)
|
||||
tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" \
|
||||
"hermes --resume $UUID"
|
||||
;;
|
||||
esac
|
||||
|
||||
# 4. Update agent-sessions.yaml: status running, last_visible_status
|
||||
# (Also automatically publishes a `progress --detail "resumed"` event to the tmux-agent-orchestrate-delegate-job registry if a delegate_job_id exists)
|
||||
bash .agents/skills/tmux-agent-orchestrate-resume/scripts/update_yaml_resumed.sh \
|
||||
--session "$SESSION_NAME" --uuid "$UUID"
|
||||
|
||||
# 5. Attach
|
||||
tmux attach -t "$SESSION_NAME"
|
||||
```
|
||||
|
||||
## Pitfalls
|
||||
|
||||
- **`claude -r` requires the SAME project directory** — if the workspace path differs from when the session was created, claude will create a new project dir key (`-home-...-different-name`) and put the resume in a different location. Always start the tmux session in the workspace (`tmux new-session -c "$WORKSPACE"`, as in the workflow above) before running `claude -r`.
|
||||
- **agy's `--conversation` flag name varies by version** — older versions used `--resume` or `-r`. Check `agy --help | grep -E "conversation|resume"` and use the right flag. v1.0.x: `--conversation`.
|
||||
- **The first message after resume might re-trigger TUI dialogs** — if the original session was created with `--dangerously-skip-permissions`, that flag is NOT persisted; you must pass it again on resume. The workflow above re-passes it for claude and agy.
|
||||
- **Don't resume if the session is brand new and empty** — `tmux-agent-orchestrate-create` already set up an empty container; sending a probe message ("init") is the right way to materialize a session id, NOT `claude -r` with a placeholder.
|
||||
- **`agy --conversation <id>` will fail if the conversation was deleted from disk** — check `~/.gemini/antigravity-cli/conversations/<uuid>.db` exists before attempting resume. If missing, the conversation is gone; you need a fresh session via `tmux-agent-orchestrate-create`.
|
||||
|
||||
## Verification
|
||||
|
||||
```bash
|
||||
# 1. tmux alive with the right cmd
|
||||
tmux list-panes -t "$SESSION_NAME" -F 'cmd=#{pane_current_command} cwd=#{pane_current_path}'
|
||||
|
||||
# 2. agent-sessions.yaml updated
|
||||
python3 -c "
|
||||
import yaml
|
||||
d = yaml.safe_load(open('.hermes/agent-sessions.yaml'))
|
||||
s = [s for s in d['tmux_sessions'] if s['name'] == '$SESSION_NAME'][0]
|
||||
print(f' status: {s[\"status\"]}')
|
||||
print(f' pane.cmd_full: {s[\"pane\"][\"cmd_full\"]}')
|
||||
"
|
||||
|
||||
# 3. TUI shows resumed conversation (capture-pane to verify)
|
||||
sleep 5
|
||||
tmux capture-pane -t "$SESSION_NAME" -p -S -30
|
||||
# look for the previous message at top of the buffer (claude) or last_visible_status set (agy)
|
||||
```
|
||||
|
||||
## When NOT to use this skill
|
||||
|
||||
- **No saved session yet** → `tmux-agent-orchestrate-create`
|
||||
- **Killing an existing session** → `tmux-agent-orchestrate-stop`
|
||||
- **Just attaching** → `tmux attach -t <name>` (no skill needed)
|
||||
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env bash
|
||||
# resolve_session_id.sh — helper script for tmux-agent-orchestrate-resume
# Usage:
#   bash resolve_session_id.sh --workspace <path> --agent <claude|agy|hermes>
# Output: the UUID as one line on stdout (an empty line + exit 0 when none is found)
#
# P0-C: the global agent_identities value is not returned right away.
# lib.sh::find_workspace_uuid only uses the workspace-isolated resolution path
# (per-row own id -> disk scan -> cache whose cwd matches), so a UUID that
# belongs to another workspace is never returned.
|
||||
set -euo pipefail
|
||||
|
||||
source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
|
||||
|
||||
# Print the command-line synopsis on stdout. The agent list mirrors the values
# accepted by the --agent validation in this script.
usage() {
  cat <<EOF
Usage: $0 --workspace <path> --agent <claude|agy|hermes>
Outputs the resolved UUID on stdout (empty if not found).
EOF
}
|
||||
|
||||
WORKSPACE=""
AGENT=""

# Parse options. Value-taking options are checked for a following argument
# first: under `set -u` a bare "$2" would otherwise abort the script with an
# opaque "unbound variable" message instead of a usage error.
while [ $# -gt 0 ]; do
  case "$1" in
    --workspace)
      [ $# -ge 2 ] || { echo "ERROR: $1 requires a value" >&2; exit 2; }
      WORKSPACE="$2"
      shift 2
      ;;
    --agent)
      [ $# -ge 2 ] || { echo "ERROR: $1 requires a value" >&2; exit 2; }
      AGENT="$2"
      shift 2
      ;;
    -h|--help) usage; exit 0 ;;
    *) echo "ERROR: unknown arg: $1" >&2; exit 2 ;;
  esac
done

# Both options are mandatory and the agent must be one of the known kinds.
[ -n "$WORKSPACE" ] || { echo "ERROR: --workspace required" >&2; exit 2; }
[ -n "$AGENT" ] || { echo "ERROR: --agent required" >&2; exit 2; }
case "$AGENT" in
  claude|agy|hermes) ;;
  *) echo "ERROR: --agent must be claude or agy or hermes" >&2; exit 2 ;;
esac

# The lookup itself is delegated to lib.sh; its stdout is this script's output.
find_workspace_uuid "$WORKSPACE" "$AGENT"
|
||||
+156
@@ -0,0 +1,156 @@
|
||||
#!/usr/bin/env bash
|
||||
# update_yaml_resumed.sh — helper script for tmux-agent-orchestrate-resume
# Updates the resumed session's agent-sessions.yaml entry to status=running plus resume metadata.
# The resume UUID is written into the per-row own id (claude_session_id_own / agy_conversation_id_own)
# — the global agent_identities is no longer primary (demoted to a cache, P0-C / step e).
#
# Usage: bash update_yaml_resumed.sh --session <name> --uuid <id> [--agent claude|agy|hermes]
|
||||
set -euo pipefail
|
||||
|
||||
source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
|
||||
|
||||
# Print the command-line synopsis on stdout. hermes is listed because this
# script both infers it from a *-creator-hermes session name and handles it.
usage() {
  cat <<EOF
Usage: $0 --session <name> --uuid <id> [--agent claude|agy|hermes]
EOF
}
|
||||
|
||||
SESSION_NAME=""
UUID=""
AGENT=""

# Parse options. Each value-taking option is checked for a following argument
# first: under `set -u` a bare "$2" would otherwise abort the script with an
# opaque "unbound variable" message instead of a usage error.
while [ $# -gt 0 ]; do
  case "$1" in
    --session | --uuid | --agent)
      [ $# -ge 2 ] || { echo "ERROR: $1 requires a value" >&2; exit 2; }
      case "$1" in
        --session) SESSION_NAME="$2" ;;
        --uuid) UUID="$2" ;;
        --agent) AGENT="$2" ;;
      esac
      shift 2
      ;;
    -h|--help) usage; exit 0 ;;
    *) echo "ERROR: unknown arg: $1" >&2; exit 2 ;;
  esac
done

# --session and --uuid are mandatory; the sessions file must already exist.
[ -n "$SESSION_NAME" ] || { echo "ERROR: --session required" >&2; exit 2; }
[ -n "$UUID" ] || { echo "ERROR: --uuid required" >&2; exit 2; }
[ -f "$AGENT_SESSIONS_YAML" ] || { echo "ERROR: $AGENT_SESSIONS_YAML not found" >&2; exit 1; }
|
||||
|
||||
export TMUX_SERVER_NAME="$(resolve_tmux_server "$SESSION_NAME")"
|
||||
|
||||
# --agent 미지정 시 이름 suffix 로 fallback (P1-F: 가능하면 --agent 명시)
|
||||
if [ -z "$AGENT" ]; then
|
||||
case "$SESSION_NAME" in
|
||||
*-creator-claude) AGENT=claude ;;
|
||||
*-creator-agy) AGENT=agy ;;
|
||||
*-creator-hermes) AGENT=hermes ;;
|
||||
*) echo "ERROR: cannot infer agent from '$SESSION_NAME'; pass --agent" >&2; exit 2 ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
NOW_ISO=$(date -u +'%Y-%m-%dT%H:%M:%SZ')
|
||||
|
||||
# 새 tmux pane pid / 자식 pid 를 bash 에서 캡처 (env 로 전달, P1-B)
|
||||
PANE_PID=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}' 2>/dev/null | head -1 || true)
|
||||
PANE_PID="${PANE_PID:-}"
|
||||
CHILD_PID=0
|
||||
if { [ "$AGENT" = "agy" ] || [ "$AGENT" = "hermes" ]; } && [ -n "$PANE_PID" ]; then
|
||||
CHILD_PID=$(pgrep -P "$PANE_PID" -x "$AGENT" 2>/dev/null | head -1 || true)
|
||||
CHILD_PID="${CHILD_PID:-0}"
|
||||
fi
|
||||
|
||||
DELEGATE_JOB_ID=$(env_python "$AGENT_SESSIONS_YAML" SESSION_NAME="$SESSION_NAME" <<'PYEOF'
|
||||
import os, sys, sqlite3, json, yaml
|
||||
name = os.environ['SESSION_NAME']
|
||||
yaml_path = os.environ['YAML_PATH']
|
||||
db_path = os.path.splitext(yaml_path)[0] + '.db'
|
||||
d = {}
|
||||
try:
|
||||
if os.path.exists(db_path):
|
||||
conn = sqlite3.connect(db_path, timeout=10.0)
|
||||
try:
|
||||
row = conn.execute('SELECT data FROM sessions WHERE name=?', (name,)).fetchone()
|
||||
if row:
|
||||
s = json.loads(row[0])
|
||||
print(s.get('delegate_job_id', '') or '')
|
||||
raise SystemExit(0)
|
||||
except sqlite3.OperationalError:
|
||||
pass
|
||||
row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
|
||||
if row:
|
||||
d = json.loads(row[0])
|
||||
conn.close()
|
||||
elif os.path.exists(yaml_path):
|
||||
with open(yaml_path) as f:
|
||||
d = yaml.safe_load(f) or {}
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
for s in d.get('tmux_sessions', []):
|
||||
if s.get('name') == name:
|
||||
print(s.get('delegate_job_id', '') or '')
|
||||
raise SystemExit(0)
|
||||
raise SystemExit(0)
|
||||
PYEOF
|
||||
)
|
||||
|
||||
# Flip the registry row for SESSION_NAME back to status=running and record the
# resumed conversation id. All inputs reach the Python snippet through the
# environment; the heredoc is quoted so bash expands nothing inside it.
atomic_dump_yaml "$AGENT_SESSIONS_YAML" \
  SESSION_NAME="$SESSION_NAME" UUID="$UUID" AGENT="$AGENT" NOW_ISO="$NOW_ISO" \
  PANE_PID="$PANE_PID" CHILD_PID="$CHILD_PID" <<'PYEOF'
# NOTE(review): `os` and the loaded registry dict `d` are used without being
# defined here — presumably atomic_dump_yaml supplies them; confirm in lib.sh.
# The original top-level names are kept on purpose because this snippet runs
# inside that wrapper's namespace.
name = os.environ['SESSION_NAME']
uuid = os.environ['UUID']
agent = os.environ['AGENT']
now = os.environ['NOW_ISO']
pane_pid = os.environ.get('PANE_PID', '')

# First row whose name matches, or None.
target = next(
    (s for s in d.get('tmux_sessions', []) if s.get('name') == name),
    None,
)
if target is None:
    print(f"ERROR: session not in YAML: {name}", flush=True)
    raise SystemExit(1)

target['status'] = 'running'
# A resumed session is neither terminated, archived nor stopped any more, so
# drop every leftover field from those states.
for _stale_key in (
    'terminated_at', 'terminated_at_epoch', 'termination_mode', 'archived_at',
    'stopped_at', 'stopped_at_epoch', 'stop_reason', 'resumable',
):
    target.pop(_stale_key, None)
target['last_visible_status'] = f'resumed conversation {uuid} at {now}'

target.setdefault('pane', {})
if pane_pid.isdigit():
    target['pane']['pid'] = int(pane_pid)

# agent -> (row key holding the resume id, full command line, record child pid?)
_resume_specs = {
    'claude': ('claude_session_id_own',
               f'claude --dangerously-skip-permissions -r {uuid}', False),
    'agy': ('agy_conversation_id_own',
            f'agy --dangerously-skip-permissions --conversation {uuid}', True),
    'hermes': ('hermes_conversation_id_own',
               f'hermes --resume {uuid}', True),
}
if agent in _resume_specs:
    _id_key, _cmd_full, _track_child = _resume_specs[agent]
    target['pane']['cmd'] = agent
    target['pane']['cmd_full'] = _cmd_full
    target[_id_key] = uuid
    if _track_child:
        cp = os.environ.get('CHILD_PID', '0')
        if cp.isdigit() and int(cp) > 0:
            target['child_pid'] = int(cp)

snap = d.setdefault('snapshot', {})
snap['taken_at'] = now
snap.pop('terminated_at', None)
snap.pop('terminated_at_epoch', None)

print(f"updated: {name} status=running (resume id -> per-row own id)", flush=True)
PYEOF
|
||||
|
||||
delegate_publish_event "$DELEGATE_JOB_ID" progress "resumed"
|
||||
@@ -0,0 +1,124 @@
|
||||
---
|
||||
name: tmux-agent-orchestrate-status
|
||||
description: "Read-only instant snapshot of all agent tmux sessions — name, YAML status, tmux alive, pane cmd/cwd, resume UUID on disk, and any drift. No Kanban, no mutation. Reuses reconcile.sh --dry-run for the diff logic. Use when you want to know 'what's running RIGHT NOW' without spinning up a Kanban monitor worker."
|
||||
version: 1.0.0
|
||||
author: godopu
|
||||
license: MIT
|
||||
platforms: [linux, macos]
|
||||
environments: [terminal, tmux]
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [agent, tmux, claude, antigravity, agy, status, read-only, snapshot]
|
||||
related_skills: [tmux-agent-orchestrate-create, tmux-agent-orchestrate-resume, tmux-agent-orchestrate-stop, tmux-agent-orchestrate-monitor]
|
||||
prereq_skills: [tmux-agent-orchestrate-create, tmux-agent-orchestrate-monitor]
|
||||
---
|
||||
|
||||
# Multi-Agent Status — Read-Only Instant Snapshot
|
||||
|
||||
> **Companion skills**: `tmux-agent-orchestrate-create` (start), `tmux-agent-orchestrate-resume` (re-attach), `tmux-agent-orchestrate-stop` (terminate), `tmux-agent-orchestrate-monitor` (live polling).
|
||||
> **Tmux Isolation**: `status` 명령은 YAML에 등록된 모든 세션의 격리 서버(`tmux_server` 필드)를 자동으로 조회하여 상태를 확인하므로, `TMUX_SERVER_NAME` 환경변수를 수동으로 지정하지 않아도 모든 격리 서버의 세션 상태를 통합 조회합니다.
|
||||
> **Single source of truth**: `./.hermes/agent-sessions.yaml`.
|
||||
|
||||
## What this skill does
|
||||
|
||||
Print a single table of every agent tmux session, comparing YAML state to actual tmux state. **No mutation. No Kanban. No polling loop.**
|
||||
|
||||
This is the "what's running right now?" answer — faster than dispatching `tmux-agent-orchestrate-monitor` (which polls every 30s) and safer than `reconcile.sh --once --emit-diff` (which mutates as a side effect).
|
||||
|
||||
## Pre-flight
|
||||
|
||||
```bash
|
||||
command -v tmux
|
||||
command -v python3
|
||||
test -f .hermes/agent-sessions.yaml
|
||||
```
|
||||
|
||||
If `agent-sessions.yaml` doesn't exist or is malformed → print clear error, exit 1. **Do not create it.** (Use `tmux-agent-orchestrate-create` first.)
|
||||
|
||||
## Workflow
|
||||
|
||||
```bash
|
||||
bash .agents/skills/tmux-agent-orchestrate-status/scripts/status.sh [--json]
|
||||
```
|
||||
|
||||
The script:
|
||||
|
||||
1. Calls `reconcile.sh --once --emit-diff --dry-run` (read-only; no YAML mutation) for the drift snapshot
|
||||
2. Loads `agent-sessions.yaml` (read-only) to enrich the table
|
||||
3. For each row in `tmux_sessions[]`:
|
||||
- tmux alive? (via `tmux has-session -t <name>`)
|
||||
- pane cmd, cwd (via `tmux list-panes`)
|
||||
- resume UUID on disk? (claude: `$CLAUDE_PROJECT_DIR/<key>/<uuid>.jsonl`, default `~/.claude/projects/`; agy: `$HOME_DIR/.gemini/antigravity-cli/conversations/<uuid>.db`, where `HOME_DIR` defaults to the workspace root unless overridden in the environment)
|
||||
4. For each tmux session matching `*-creator-*` not in YAML → flag as "unregistered"
|
||||
5. Prints a table (default) or JSON (with `--json`)
|
||||
|
||||
## Output format (default = aligned table)
|
||||
|
||||
```
|
||||
agent-sessions status — 2026-06-19T14:20:00Z (tmux_confirmed=True)
|
||||
========================================================================================================================================
|
||||
NAME SERVER YAML TMUX CMD RESUME JOB_ID JOB_STATUS DRIFT
|
||||
----------------------------------------------------------------------------------------------------------------------------------------
|
||||
lab-landing-page-creator-claude default running alive claude yes - - -
|
||||
lab-landing-page-creator-agy default terminated dead agy yes 5fe09ba8 completed -
|
||||
lab-paper-pdf2md-creator-claude default running alive claude scan - - -
|
||||
========================================================================================================================================
|
||||
```
|
||||
|
||||
## Output format (`--json`)
|
||||
|
||||
```json
|
||||
{
|
||||
"yaml_path": "...",
|
||||
"tmux_sessions_alive": ["..."],
|
||||
"yaml_entries": [...],
|
||||
"rows": [
|
||||
{
|
||||
"name": "lab-landing-page-creator-claude",
|
||||
"yaml_status": "running",
|
||||
"tmux_alive": true,
|
||||
"pane_cmd": "claude",
|
||||
"pane_cwd": "/home/.../refer_landing_page",
|
||||
"resume_uuid_on_disk": true,
|
||||
"drift": null
|
||||
},
|
||||
{
|
||||
"name": "lab-landing-page-creator-agy",
|
||||
"yaml_status": "terminated",
|
||||
"tmux_alive": false,
|
||||
"drift": "yaml-says-terminated-but-disk-uuid-still-present"
|
||||
}
|
||||
],
|
||||
"unregistered": [],
|
||||
"drifts": []
|
||||
}
|
||||
```
|
||||
|
||||
## Drift classes (read-only — never mutates)
|
||||
|
||||
| Class | Detection | Meaning |
|
||||
|---|---|---|
|
||||
| `A` | YAML `running`, tmux dead | session died without going through `tmux-agent-orchestrate-stop`. *Could* auto-terminate but won't — that's `tmux-agent-orchestrate-monitor`'s job. |
|
||||
| `B` | tmux alive, not in YAML | ad-hoc session someone started without `tmux-agent-orchestrate-create`. Suggest: "use tmux-agent-orchestrate-create to register, or tmux kill-session to clean up." |
|
||||
| `C` | YAML has `claude_session_id_own: null` AND a new *.jsonl exists | new session id materialized; suggest: "run tmux-agent-orchestrate-resume or reconcile to register it." |
|
||||
| `D` | YAML has UUID in `agent_identities`, but the on-disk artifact is gone | stale UUID; user should `tmux-agent-orchestrate-stop --purge-conversation` to clean up. |
|
||||
|
||||
## Pitfalls
|
||||
|
||||
- **Do NOT use this skill to drive mutations** — the output is a snapshot, not a call to action. If you need to fix drifts, dispatch `tmux-agent-orchestrate-monitor` (Kanban worker) or run `tmux-agent-orchestrate-resume` / `tmux-agent-orchestrate-stop` manually.
|
||||
- **Read-only is enforced by script** — `status.sh` opens the YAML with `open(path)` (no `'w'`), never calls `tmux kill-session`, never writes anywhere. The `reconcile.sh --dry-run` mode is the same path.
|
||||
- **If `agent-sessions.yaml` is malformed** — print the YAML error verbatim and exit 1. Do NOT attempt recovery (that's `tmux-agent-orchestrate-stop --purge-conversation` or manual edit's job).
|
||||
- **Sessions outside the `<workspace>-creator-*` naming convention** are still shown but tagged `ad-hoc` — they didn't go through `tmux-agent-orchestrate-create` and aren't tracked in YAML.
|
||||
|
||||
## When to use
|
||||
|
||||
- "Is the claude session still running?" → this skill, not the monitor
|
||||
- "What UUID does this workspace have?" → this skill
|
||||
- "Is there drift between YAML and reality?" → this skill, then dispatch monitor or fix manually
|
||||
- Quick sanity check before dispatching a long Kanban task
|
||||
|
||||
## When NOT to use
|
||||
|
||||
- Continuous live tracking → `tmux-agent-orchestrate-monitor` (Kanban worker)
|
||||
- Recovering from corruption → manual edit + `.bak` restore
|
||||
- Polling more than once a minute → `tmux-agent-orchestrate-monitor` (it dedupes)
|
||||
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env bash
|
||||
# status.sh — helper script for the tmux-agent-orchestrate-status skill (READ-ONLY)
# Prints the current agent-session status table in a single call. No side effects.
# Reuses reconcile.sh --dry-run to compute drift (P1-E) and renders a table
# enriched from the YAML and on-disk state. Never modifies the YAML.
|
||||
#
|
||||
# Usage: bash status.sh [--json]
|
||||
set -euo pipefail
|
||||
|
||||
source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
|
||||
|
||||
RECONCILE="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/tmux-agent-orchestrate-monitor/scripts/reconcile.sh"
|
||||
|
||||
JSON=0
|
||||
[ "${1:-}" = "--json" ] && JSON=1
|
||||
|
||||
[ -f "$AGENT_SESSIONS_YAML" ] || { echo "ERROR: $AGENT_SESSIONS_YAML not found. Run tmux-agent-orchestrate-create first." >&2; exit 1; }
|
||||
|
||||
# read-only drift snapshot — reconcile.sh --dry-run (no side effects)
|
||||
DRIFT_JSON="$(bash "$RECONCILE" --once --emit-diff --dry-run)"
|
||||
|
||||
if [ "$JSON" = "1" ]; then
|
||||
printf '%s\n' "$DRIFT_JSON"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Project root (parent of .agents/) holds the tmux-agent-orchestrate-delegate-job .hermes registry.
|
||||
# Resolved relative to this script — no hardcoded absolute path (review item 6).
|
||||
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../" && pwd)"
|
||||
|
||||
# Render the aligned status table from the session registry plus the drift
# snapshot already captured in DRIFT_JSON. Read-only: this only opens files for
# reading and prints.
# NOTE(review): YAML_PATH and HOME_DIR are read from the environment and are
# presumably exported by env_python / lib.sh — confirm there.
DRIFT_JSON="$DRIFT_JSON" env_python "$AGENT_SESSIONS_YAML" PROJECT_ROOT="$PROJECT_ROOT" <<'PYEOF'
import glob
import json
import os
import sqlite3
import sys

import yaml

yaml_path = os.environ['YAML_PATH']
home = os.environ['HOME_DIR']
claude_project_dir = os.environ.get('CLAUDE_PROJECT_DIR', f"{home}/.claude/projects")
drift = json.loads(os.environ['DRIFT_JSON'])

# A SQLite file next to the YAML (same stem, .db) takes precedence when present.
db_path = os.path.splitext(yaml_path)[0] + '.db'
d = {}
if os.path.exists(db_path):
    try:
        conn = sqlite3.connect(db_path, timeout=10.0)
        try:
            row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
            if row:
                d = json.loads(row[0])
            try:
                # Per-session rows, when the table exists, replace the list
                # carried in the state blob.
                d['tmux_sessions'] = [
                    json.loads(r[0])
                    for r in conn.execute('SELECT data FROM sessions').fetchall()
                ]
            except sqlite3.OperationalError:
                pass  # no sessions table: keep what the state blob provided
        finally:
            conn.close()
    except Exception as exc:
        # Still best effort, but say why the table below may be incomplete.
        print(f"WARN: could not read {db_path}: {exc}", file=sys.stderr)
elif os.path.exists(yaml_path):
    try:
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
    except (OSError, yaml.YAMLError) as exc:
        # Documented contract: a malformed registry is reported verbatim and
        # the command fails, instead of showing an empty table.
        print(f"ERROR: cannot load {yaml_path}: {exc}", file=sys.stderr)
        raise SystemExit(1)

if not isinstance(d, dict):
    print("ERROR: session registry top level is not a mapping", file=sys.stderr)
    raise SystemExit(1)

alive = set(drift.get('tmux_sessions_alive', []))
drift_by_name = {}
for dr in drift.get('drifts', []):
    drift_by_name.setdefault(dr['name'], []).append(dr['class'])


def text(value, default='?'):
    """Return value as a string; None (e.g. a YAML null) becomes default."""
    return default if value is None else str(value)


def resume_on_disk(s):
    """Tell whether the row's own conversation artifact exists on disk.

    Workspace-scoped check only — per-row own id, never a global identity (P0-C).
    Returns 'yes', 'MISSING', 'scan', 'no' or '?' (agent not recognised).
    """
    name = s.get('name') or ''
    cwd = (s.get('pane') or {}).get('cwd') or ''
    if name.endswith('-creator-claude'):
        key = cwd.replace('/', '-').replace('_', '-')
        u = s.get('claude_session_id_own')
        if u:
            return 'yes' if os.path.exists(f"{claude_project_dir}/{key}/{u}.jsonl") else 'MISSING'
        # No id recorded yet: report whether any transcript exists for the workspace.
        return 'scan' if glob.glob(f"{claude_project_dir}/{key}/*.jsonl") else 'no'
    if name.endswith('-creator-agy'):
        u = s.get('agy_conversation_id_own')
        if u:
            return 'yes' if os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{u}.db") else 'MISSING'
        return 'no'
    return '?'


def get_job_status(s):
    """Return (job_id, status) for the row's delegate job, or ('-', '-')."""
    jid = s.get('delegate_job_id')
    if not jid:
        return ('-', '-')

    project_root = os.environ.get('PROJECT_ROOT', '.')
    # Candidate locations (review item 6: project-root-relative, no hardcoded abs paths):
    #   1) cwd-relative registry  2) project-root registry  3) project-root audit log
    candidates = [
        os.path.join('.hermes', 'jobs', f"{jid}.json"),
        os.path.join(project_root, '.hermes', 'jobs', f"{jid}.json"),
        os.path.join(project_root, '.hermes', 'delegate_job_logs', jid, 'status.json'),
    ]
    for path in candidates:
        if os.path.exists(path):
            try:
                with open(path) as jf:
                    job_data = json.load(jf)
                return (jid, job_data.get('status', 'unknown'))
            except Exception:
                pass  # unreadable candidate: try the next location

    return (jid, 'unknown')


# `or []` also covers an explicit `tmux_sessions:` null in the registry.
sessions = d.get('tmux_sessions') or []
print(f"agent-sessions status — {drift['timestamp']} (tmux_confirmed={drift['tmux_confirmed']})")
print("=" * 136)
print(f"{'NAME':<44} {'SERVER':<12} {'YAML':<10} {'TMUX':<6} {'CMD':<6} {'RESUME':<8} {'JOB_ID':<10} {'JOB_STATUS':<12} DRIFT")
print("-" * 136)
if not sessions:
    print("(no sessions registered)")
for s in sessions:
    # Every cell goes through text(): formatting None with a width spec raises TypeError.
    name = text(s.get('name'))
    server = text(s.get('tmux_server') or 'default')
    status = text(s.get('status'))
    tmux = 'alive' if f"{name}|{server}" in alive else 'dead'
    cmd = text((s.get('pane') or {}).get('cmd'))
    res = resume_on_disk(s)
    jid, jstatus = get_job_status(s)
    drs = ','.join(str(c) for c in drift_by_name.get(name, [])) or '-'
    print(f"{name:<44} {server:<12} {status:<10} {tmux:<6} {cmd:<6} {res:<8} {text(jid):<10} {text(jstatus):<12} {drs}")
# drifts not tied to a registered row (e.g. class B unregistered, class D cache)
known = {s.get('name') for s in sessions}
extra = [dr for dr in drift.get('drifts', []) if dr['name'] not in known]
if extra:
    print("-" * 136)
    for dr in extra:
        print(f"  [{dr['class']}] {dr['msg']}")
print("=" * 136)
print(f"alive tmux: {sorted(alive)}")
PYEOF
|
||||
@@ -0,0 +1,136 @@
|
||||
---
|
||||
name: tmux-agent-orchestrate-stop
|
||||
description: "Stop an agent tmux session (claude, antigravity/agy) and update .hermes/agent-sessions.yaml. Default stops gracefully and marks status=stopped with conversation preserved for resume. Does NOT delete on-disk conversation artifacts (jsonl/db) — those are preserved unless --purge-conversation is passed. Use when ending a work session, switching to a different one, or cleaning up before a fresh start."
|
||||
version: 1.0.0
|
||||
author: godopu
|
||||
license: MIT
|
||||
platforms: [linux, macos]
|
||||
environments: [terminal, tmux]
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [agent, tmux, claude, antigravity, agy, multi-agent, stop, terminate, cleanup]
|
||||
related_skills: [tmux-agent-orchestrate-create, tmux-agent-orchestrate-resume, tmux-agent-orchestrate-monitor]
|
||||
prereq_skills: [tmux-agent-orchestrate-create, tmux-agent-orchestrate-resume]
|
||||
---
|
||||
|
||||
# Multi-Agent Stop — Stop an Agent tmux Session
|
||||
|
||||
> **Companion skills**: `tmux-agent-orchestrate-create` (start), `tmux-agent-orchestrate-resume` (re-attach), `tmux-agent-orchestrate-monitor` (live status).
|
||||
> **Tmux Isolation**: `stop` 명령은 YAML의 `tmux_server` 필드를 자동으로 파싱하여 해당 격리 서버의 세션을 안전하게 종료(kill)하므로, `TMUX_SERVER_NAME` 환경변수를 수동으로 지정할 필요가 없습니다.
|
||||
> **Single source of truth**: `./.hermes/agent-sessions.yaml`.
|
||||
|
||||
## What this skill does
|
||||
|
||||
Stop an agent's tmux session gracefully, resolve and store the conversation ID, and **mark the YAML entry (status=stopped)**. Preserves:
|
||||
|
||||
- The tmux session's recorded `pane.pid / cmd / cwd / mcp_attachments` for audit
|
||||
- The agent's on-disk conversation (claude `*.jsonl`, agy `conversations/*.db`) — so the user can `tmux-agent-orchestrate-resume` later
|
||||
- The `start_command` so a future `tmux-agent-orchestrate-create --session <name>` reproduces the same tmux spec
|
||||
|
||||
The stop command is always **graceful by default**:
|
||||
1. Sends exit keys to the agent TUI (`/exit` for Claude, `Exit` for Agy) and waits 3 seconds.
|
||||
2. If still alive, issues `tmux kill-session` (SIGTERM) and waits 5 seconds.
|
||||
3. If still alive, kills the pane PID via SIGKILL (`kill -9`) as a last resort.
|
||||
4. Auto-captures the conversation ID into the row (`claude_session_id_own`/`agy_conversation_id_own`) before killing, ensuring the next resume uses a race-free tier-1 lookup.
|
||||
|
||||
## Pre-flight
|
||||
|
||||
```bash
|
||||
SESSION_NAME=<workspace>-creator-<agent> # convention
|
||||
AGENT_SESSIONS_YAML=.hermes/agent-sessions.yaml
|
||||
|
||||
# 1) Session is registered?
|
||||
python3 -c "
|
||||
import yaml
|
||||
d = yaml.safe_load(open('$AGENT_SESSIONS_YAML'))
|
||||
names = [s['name'] for s in d.get('tmux_sessions', [])]
|
||||
if '$SESSION_NAME' not in names:
|
||||
print('NOT in YAML — refusing to stop (no audit trail). Register the session with tmux-agent-orchestrate-create first.')
|
||||
raise SystemExit(1)
|
||||
"
|
||||
|
||||
# 2) Already stopped?
|
||||
ALREADY=$(python3 -c "
|
||||
import yaml
|
||||
d = yaml.safe_load(open('$AGENT_SESSIONS_YAML'))
|
||||
s = [x for x in d['tmux_sessions'] if x['name']=='$SESSION_NAME'][0]
|
||||
print(s.get('status', 'unknown'))
|
||||
")
|
||||
if [ "$ALREADY" = "stopped" ]; then
|
||||
echo "Already stopped."
|
||||
fi
|
||||
```
|
||||
|
||||
## Workflow
|
||||
|
||||
```bash
|
||||
# 1. Stop gracefully (default — captures ID, shuts down safely, status=stopped)
|
||||
bash .agents/skills/tmux-agent-orchestrate-stop/scripts/stop_session.sh \
|
||||
--session "$SESSION_NAME"
|
||||
|
||||
# 2. Stop gracefully + record a custom stop reason
|
||||
bash .agents/skills/tmux-agent-orchestrate-stop/scripts/stop_session.sh \
|
||||
--session "$SESSION_NAME" --reason api_error
|
||||
|
||||
# 3. Stop gracefully + clean up on-disk conversation (DANGEROUS)
|
||||
# — this prevents any future resume (status=terminated, resumable=false).
|
||||
bash .agents/skills/tmux-agent-orchestrate-stop/scripts/stop_session.sh \
|
||||
--session "$SESSION_NAME" --purge-conversation --yes
|
||||
```
|
||||
|
||||
**Idempotency**: if the row is already `status: stopped`, the script prints `already stopped (...)` and exits 0 — re-running is a safe no-op.
|
||||
|
||||
### State machine
|
||||
|
||||
```
|
||||
running ──(stop default / --reason)────────► stopped (resumable:true, conv preserved)
|
||||
running ──(stop --purge-conversation --yes)► terminated (resumable:false, conv deleted)
|
||||
stopped ──(stop default … again)───────────► stopped (idempotent no-op)
|
||||
```
|
||||
|
||||
Fields written in STOP mode: `status: stopped`, `stopped_at`, `stopped_at_epoch`, `stop_reason`, `termination_mode: graceful`, `claude_session_id_own`/`agy_conversation_id_own` and `resumable: true`.
|
||||
|
||||
If `--purge-conversation` is used: `status: terminated`, `terminated_at`, `terminated_at_epoch`, `termination_mode: purge` and `resumable: false`.
|
||||
|
||||
The script:
|
||||
1. Verifies the session is in agent-sessions.yaml
|
||||
2. If `delegate_job_id` is set, automatically publishes a `progress --detail "terminating"` event to the tmux-agent-orchestrate-delegate-job registry
|
||||
3. Captures the `last_visible_status` from `tmux capture-pane` (so we have a final TUI snapshot for audit)
|
||||
4. Attempts graceful exit keys → SIGTERM kill-session → SIGKILL fallback
|
||||
5. For `purge-conversation`: deletes `~/.claude/projects/.../jsonl` (claude) or `~/.gemini/antigravity-cli/conversations/...db` + `brain/...` (agy)
|
||||
6. Updates the YAML entry and SQLite database atomically
|
||||
7. If `delegate_job_id` is set, publishes a `completed` event to the tmux-agent-orchestrate-delegate-job registry
|
||||
|
||||
## Pitfalls
|
||||
|
||||
- **Don't delete on-disk artifacts by default** — the agent's `*.jsonl` / `conversations/*.db` is the data that `tmux-agent-orchestrate-resume` needs. `--purge-conversation` is for when the user is genuinely done with the conversation and wants zero recovery chance.
|
||||
- **YAML is append-only until you write a stop** — if a previous run left the entry as `running` but tmux is actually dead (crash, host reboot), the YAML is stale. Running `tmux-agent-orchestrate-stop` will detect "tmux already dead, just update YAML" and proceed.
|
||||
- **Don't delete the `claude_session_id_own: null` placeholder** — when the user creates a fresh session with `tmux-agent-orchestrate-create` and never sent a message, the entry has `claude_session_id_own: null`. Stopping must preserve that field.
|
||||
- **Monitor skill may still be tracking** — if `tmux-agent-orchestrate-monitor` is running a heartbeat loop, stopping a session while it watches will trigger its `tmux ls != yaml` reconciliation. That's expected — let the monitor run, it will mark the entry as `terminated` on its own.
|
||||
|
||||
## Verification
|
||||
|
||||
```bash
|
||||
# 1. tmux gone
|
||||
tmux has-session -t "$SESSION_NAME" 2>/dev/null && echo "STILL ALIVE" || echo "OK: tmux gone"
|
||||
|
||||
# 2. YAML has stopped entry
|
||||
python3 -c "
|
||||
import yaml
|
||||
d = yaml.safe_load(open('$AGENT_SESSIONS_YAML'))
|
||||
s = [x for x in d['tmux_sessions'] if x['name']=='$SESSION_NAME'][0]
|
||||
assert s['status'] == 'stopped', f'expected stopped, got {s[\"status\"]}'
|
||||
assert s.get('stopped_at'), 'missing stopped_at'
|
||||
print(f'OK: stopped at {s[\"stopped_at\"]}')
|
||||
print(f' preserved: pane.pid={s[\"pane\"][\"pid\"]}, cmd={s[\"pane\"][\"cmd\"]}, cwd={s[\"pane\"][\"cwd\"]}')
|
||||
"
|
||||
|
||||
# 3. (if --purge-conversation) disk artifacts gone
|
||||
[ -f "${CLAUDE_PROJECT_DIR:-$HOME/.claude/projects}/<projkey>/<uuid>.jsonl" ] && echo "WARN: jsonl still exists" || echo "OK: jsonl purged"
|
||||
```
|
||||
|
||||
## When NOT to use this skill
|
||||
|
||||
- **Just detaching** → `tmux detach` (Ctrl-B d) or just close the terminal. The tmux session keeps running.
|
||||
- **Stopping the agent inside but keeping tmux** → send `Ctrl-C` or `/exit` (claude) / `Ctrl-D` (agy) via `tmux send-keys`. The tmux session stays but the agent process is gone.
|
||||
- **Replacing an existing session with a new one** → `tmux-agent-orchestrate-stop` first, then `tmux-agent-orchestrate-create`.
|
||||
@@ -0,0 +1,341 @@
|
||||
#!/usr/bin/env bash
|
||||
# stop_session.sh — helper script for the tmux-agent-orchestrate-stop skill
# Usage:
#   bash stop_session.sh --session <name> [--agent claude|agy|hermes] \
#        [--reason <reason>] [--purge-conversation --yes]
#
# Behavior (always graceful, always captures the conversation id):
#   1. Right before the kill, resolve this workspace's conversation id and
#      record it on the row (claude_session_id_own / agy_conversation_id_own /
#      hermes_conversation_id_own) so the next resume restores via the
#      tier-1 (race-free) lookup.
#   2. Send the agent's exit key with send-keys and wait 3s; if the session is
#      still alive run kill-session (SIGTERM) and wait 5s; as a last resort
#      SIGKILL the pane pid.
#   3. Mark the row status=stopped (running -> stopped). Idempotent: an
#      already-stopped session is a no-op with exit 0.
#
# Options:
#   --reason R            reason for the state transition (stop_reason).
#                         Default: manual_stop.
#   --purge-conversation  additionally delete only the conversation artifact
#                         that is scoped to this session's workspace (P0-C; the
#                         global agent_identities is never consulted) and mark
#                         the row status=terminated. Resume becomes impossible.
#                         Requires --yes.
#   --mode, --capture-id, --graceful
#                         deprecated; rejected with exit 2.
#
# Exit codes:
#   0 = success (or already-stopped no-op) | 1 = YAML not found / not registered
#   2 = invalid args | 3 = interactive confirmation required (--yes missing)
|
||||
set -euo pipefail
|
||||
|
||||
source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
|
||||
|
||||
# usage — print the command synopsis and option notes to stdout.
# Lists hermes (inferred from the session-name suffix and handled by the
# script) and states that --purge-conversation must be confirmed with --yes.
usage() {
  cat <<EOF
Usage: $0 --session <name> [--agent claude|agy|hermes] [--purge-conversation] [--yes] [--reason <reason>]

Stop arguments:
  --reason <reason>     — stop_reason field (default: manual_stop)
  --purge-conversation  — also delete this workspace's on-disk conversation (requires --yes)
  (idempotent: stopping an already-stopped session is a no-op with exit 0)
EOF
}
|
||||
|
||||
SESSION_NAME=""
|
||||
AGENT=""
|
||||
PURGE=0
|
||||
YES=0
|
||||
CAPTURE_ID=1
|
||||
GRACEFUL=1
|
||||
REASON="manual_stop"
|
||||
STOP_MODE=1
|
||||
|
||||
# Abort with exit 2 when an option that takes a value is the last argument.
# Without this guard, `set -u` makes the "$2" expansion die with a cryptic
# "unbound variable" message instead of an argument error.
#   $1 - option name (for the message)
#   $2 - number of arguments still unparsed, including the option itself
_require_opt_value() {
  if [ "$2" -lt 2 ]; then
    echo "ERROR: $1 requires a value" >&2
    usage
    exit 2
  fi
}

# Parse command-line options. The legacy --mode / --capture-id / --graceful
# switches are rejected: stopping is now always graceful and always captures ids.
while [ $# -gt 0 ]; do
  case "$1" in
    --session) _require_opt_value "$1" "$#"; SESSION_NAME="$2"; shift 2 ;;
    --agent)   _require_opt_value "$1" "$#"; AGENT="$2"; shift 2 ;;
    --purge-conversation) PURGE=1; shift ;;
    --yes) YES=1; shift ;;
    --reason)  _require_opt_value "$1" "$#"; REASON="$2"; shift 2 ;;
    --mode|--capture-id|--graceful)
      echo "ERROR: $1 option is deprecated. Stop now always stops gracefully and captures IDs." >&2
      exit 2
      ;;
    -h|--help) usage; exit 0 ;;
    *) echo "ERROR: unknown arg: $1" >&2; usage; exit 2 ;;
  esac
done
|
||||
[ -n "$SESSION_NAME" ] || { echo "ERROR: --session required" >&2; usage; exit 2; }
|
||||
[ -f "$AGENT_SESSIONS_YAML" ] || { echo "ERROR: $AGENT_SESSIONS_YAML not found" >&2; exit 1; }
|
||||
|
||||
export TMUX_SERVER_NAME="$(resolve_tmux_server "$SESSION_NAME")"
|
||||
|
||||
# When --agent is omitted, fall back to the session-name suffix (P1-F)
|
||||
if [ -z "$AGENT" ]; then
|
||||
case "$SESSION_NAME" in
|
||||
*-creator-claude) AGENT=claude ;;
|
||||
*-creator-agy) AGENT=agy ;;
|
||||
*-creator-hermes) AGENT=hermes ;;
|
||||
*) echo "ERROR: cannot infer agent from '$SESSION_NAME'; pass --agent" >&2; exit 2 ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
# Check that the session is in the YAML and extract that row's workspace cwd and delegate_job_id.
# Emitted as JSON — safe even when cwd contains '|' (review item 7; replaces the old cwd|jid parser).
|
||||
MAPPED_DATA=$(env_python "$AGENT_SESSIONS_YAML" SESSION_NAME="$SESSION_NAME" <<'PYEOF'
|
||||
import os, sys, json, yaml, sqlite3
|
||||
name = os.environ['SESSION_NAME']
|
||||
yaml_path = os.environ['YAML_PATH']
|
||||
db_path = os.path.splitext(yaml_path)[0] + '.db'
|
||||
d = {}
|
||||
try:
|
||||
if os.path.exists(db_path):
|
||||
conn = sqlite3.connect(db_path, timeout=10.0)
|
||||
try:
|
||||
row = conn.execute('SELECT data FROM sessions WHERE name=?', (name,)).fetchone()
|
||||
if row:
|
||||
s = json.loads(row[0])
|
||||
cwd = (s.get('pane') or {}).get('cwd', '')
|
||||
jid = s.get('delegate_job_id', '') or ''
|
||||
print(json.dumps({"cwd": cwd, "job_id": jid}))
|
||||
raise SystemExit(0)
|
||||
except sqlite3.OperationalError:
|
||||
pass
|
||||
row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
|
||||
if row:
|
||||
d = json.loads(row[0])
|
||||
conn.close()
|
||||
elif os.path.exists(yaml_path):
|
||||
with open(yaml_path) as f:
|
||||
d = yaml.safe_load(f) or {}
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
for s in d.get('tmux_sessions', []):
|
||||
if s.get('name') == name:
|
||||
cwd = (s.get('pane') or {}).get('cwd', '')
|
||||
jid = s.get('delegate_job_id', '') or ''
|
||||
print(json.dumps({"cwd": cwd, "job_id": jid}))
|
||||
raise SystemExit(0)
|
||||
raise SystemExit(7)
|
||||
PYEOF
|
||||
) || {
|
||||
echo "ERROR: session '$SESSION_NAME' not in $AGENT_SESSIONS_YAML" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
TARGET_CWD=$(printf '%s' "$MAPPED_DATA" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("cwd",""))')
|
||||
DELEGATE_JOB_ID=$(printf '%s' "$MAPPED_DATA" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("job_id",""))')
|
||||
|
||||
# Idempotency: in STOP mode an already-stopped session is a no-op + exit 0
|
||||
if [ "$STOP_MODE" = "1" ]; then
|
||||
if STOPPED_INFO=$(is_already_stopped "$SESSION_NAME"); then
|
||||
echo "already stopped (status=stopped, $STOPPED_INFO) — no-op"
|
||||
exit 0
|
||||
fi
|
||||
fi
|
||||
|
||||
# Purge confirmation
|
||||
if [ "$PURGE" = "1" ] && [ "$YES" != "1" ]; then
|
||||
echo "DANGER: --purge-conversation will DELETE this workspace's on-disk conversation."
|
||||
echo " workspace: ${TARGET_CWD:-<unknown>}"
|
||||
echo " This means: no future tmux-agent-orchestrate-resume for this session."
|
||||
echo " Re-run with --yes to confirm."
|
||||
exit 3
|
||||
fi
|
||||
|
||||
# Resolve the purge-target UUID scoped to the workspace (P0-C — no global lookups)
|
||||
PURGE_UUID=""
|
||||
if [ "$PURGE" = "1" ] && [ -n "$TARGET_CWD" ]; then
|
||||
PURGE_UUID=$(find_workspace_uuid "$TARGET_CWD" "$AGENT" || true)
|
||||
fi
|
||||
|
||||
NOW_ISO=$(date -u +'%Y-%m-%dT%H:%M:%SZ')
|
||||
NOW_EPOCH=$(date +%s)
|
||||
|
||||
# tmux liveness + last TUI snapshot (only while alive; capture-pane output is passed via env only)
|
||||
TMUX_ALIVE=0
|
||||
LAST_STATUS=""
|
||||
if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
|
||||
TMUX_ALIVE=1
|
||||
LAST_STATUS=$(tmux capture-pane -t "$SESSION_NAME" -p -S -10 2>/dev/null | tr '\n' ' ' | head -c 500 || true)
|
||||
fi
|
||||
|
||||
# Resolve the conversation id right before the kill, while the agent process
# and its on-disk transcript may still be live. Per the original note the lookup
# goes row -> workspace-scoped disk scan -> cache (find_workspace_uuid), so it
# works whether or not tmux is still alive.
# NOTE(review): capture_conversation_id is defined outside this script;
# confirm that it delegates to find_workspace_uuid.
CAPTURED_UUID=""
if [ "$CAPTURE_ID" = "1" ] && [ -n "$TARGET_CWD" ]; then
  # Best effort: a failed lookup must not abort the stop.
  CAPTURED_UUID=$(capture_conversation_id "$AGENT" "$TARGET_CWD" || true)
  if [ -n "$CAPTURED_UUID" ]; then
    echo "captured conversation id: $CAPTURED_UUID"
  else
    # Capturing is always on now (the --capture-id flag itself is rejected),
    # so the warning no longer names the flag; diagnostics go to stderr.
    echo "WARN: no conversation id resolved (nothing on disk yet)" >&2
  fi
fi
|
||||
|
||||
delegate_publish_event "$DELEGATE_JOB_ID" progress "terminating"
|
||||
|
||||
# graceful_stop — shut the agent session down through an escalating chain:
# exit key via send-keys -> kill-session -> SIGKILL on the pane pid.
# Globals:  SESSION_NAME (read) — tmux session to stop
#           AGENT (read)        — selects the exit key typed into the TUI
# Outputs:  progress lines on stdout; a warning on stderr when the SIGKILL
#           fallback is needed but no pane pid could be determined
# Returns:  0 in every case (best effort), so the caller proceeds to the
#           registry update
graceful_stop() {
  local pane_pid exitkey
  # Grab the pane pid up front: once the session is gone it can no longer be queried.
  pane_pid=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}' 2>/dev/null | head -1 || true)
  case "$AGENT" in
    agy) exitkey="Exit" ;;
    *)   exitkey="/exit" ;;  # claude, hermes and any other agent
  esac

  echo "graceful: send-keys '$exitkey' to $SESSION_NAME"
  tmux send-keys -t "$SESSION_NAME" "$exitkey" Enter 2>/dev/null || true
  sleep 3
  if ! tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
    echo "graceful: exited cleanly"
    return 0
  fi

  echo "graceful: still alive → kill-session (SIGTERM)"
  tmux kill-session -t "$SESSION_NAME" 2>/dev/null || true
  sleep 5
  if ! tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
    echo "graceful: terminated after kill-session"
    return 0
  fi

  # Last resort. Do not claim a SIGKILL when there is no pid to signal.
  if [ -z "$pane_pid" ]; then
    echo "graceful: STILL alive but pane pid is unknown — cannot SIGKILL" >&2
    return 0
  fi
  echo "graceful: STILL alive → SIGKILL fallback (pane pid $pane_pid)"
  kill -9 "$pane_pid" 2>/dev/null || true
}
|
||||
|
||||
# Terminate tmux: use the fallback chain when graceful, otherwise the legacy hard kill.
|
||||
if [ "$GRACEFUL" = "1" ] && [ "$TMUX_ALIVE" = "1" ]; then
|
||||
graceful_stop
|
||||
elif [ "$TMUX_ALIVE" = "1" ]; then
|
||||
tmux kill-session -t "$SESSION_NAME"
|
||||
echo "killed tmux: $SESSION_NAME"
|
||||
else
|
||||
echo "tmux already dead, just updating YAML"
|
||||
fi
|
||||
|
||||
# Record the outcome of the stop in agent-sessions.yaml. The Python body below
# is handed to atomic_dump_yaml on stdin and receives its inputs through the
# KEY=VALUE arguments, which it reads from os.environ.
#
# HOME_DIR and CLAUDE_PROJECT_DIR are passed explicitly: the Python body reads
# both from os.environ, and the visible part of lib.sh assigns them without
# exporting them.
#
# NOTE(review): `os` and `d` are used but never defined in this body; they are
# presumably supplied by atomic_dump_yaml's Python prelude — confirm in lib.sh.
atomic_dump_yaml "$AGENT_SESSIONS_YAML" \
  SESSION_NAME="$SESSION_NAME" AGENT="$AGENT" PURGE="$PURGE" \
  NOW_ISO="$NOW_ISO" NOW_EPOCH="$NOW_EPOCH" LAST_STATUS="$LAST_STATUS" \
  PURGE_UUID="$PURGE_UUID" TARGET_CWD="$TARGET_CWD" \
  REASON="$REASON" CAPTURED_UUID="$CAPTURED_UUID" \
  HOME_DIR="$HOME_DIR" CLAUDE_PROJECT_DIR="$CLAUDE_PROJECT_DIR" <<'PYEOF'
import shutil

# Inputs handed over from the shell via the environment.
name = os.environ['SESSION_NAME']
agent = os.environ['AGENT']
purge = os.environ['PURGE'] == '1'
now = os.environ['NOW_ISO']
now_epoch = int(os.environ['NOW_EPOCH'])
home = os.environ['HOME_DIR']
last_status = os.environ.get('LAST_STATUS', '')
purge_uuid = os.environ.get('PURGE_UUID', '').strip()
ws = os.environ.get('TARGET_CWD', '')
reason = os.environ.get('REASON', '') or 'manual_stop'
captured = os.environ.get('CAPTURED_UUID', '').strip()

# Locate this session's record; abort if it is no longer listed.
target = None
for s in d.get('tmux_sessions', []):
    if s.get('name') == name:
        target = s
        break
if target is None:
    print(f"ERROR: disappeared during script: {name}", flush=True)
    raise SystemExit(1)

# Stamp the record as terminated (purge) or stopped (everything else).
if purge:
    target['status'] = 'terminated'
    target['terminated_at'] = now
    target['terminated_at_epoch'] = now_epoch
    target['termination_mode'] = 'purge'
else:
    target['status'] = 'stopped'
    target['stopped_at'] = now
    target['stopped_at_epoch'] = now_epoch
    target['stop_reason'] = reason
    target['termination_mode'] = 'graceful'

if last_status:
    target['last_visible_status_at_termination'] = last_status

# --capture-id: always record the captured UUID (only when not purging).
if captured and not purge:
    if agent == 'claude':
        target['claude_session_id_own'] = captured
    elif agent == 'agy':
        target['agy_conversation_id_own'] = captured
    elif agent == 'hermes':
        target['hermes_conversation_id_own'] = captured
    target['resumable'] = True

# --purge-conversation: delete only the on-disk artifacts that belong to the
# workspace-scoped UUID (P0-C).
if purge and purge_uuid:
    if agent == 'claude':
        # Project directory key: the workspace path with '/' and '_' turned
        # into '-'.
        key = ws.replace('/', '-').replace('_', '-')
        claude_project_dir = os.environ.get('CLAUDE_PROJECT_DIR', f"{home}/.claude/projects")
        jsonl = f"{claude_project_dir}/{key}/{purge_uuid}.jsonl"
        if os.path.exists(jsonl):
            os.remove(jsonl)
            print(f"purged: {jsonl}", flush=True)
        target['claude_session_id_own'] = None
    elif agent == 'agy':
        db = f"{home}/.gemini/antigravity-cli/conversations/{purge_uuid}.db"
        if os.path.exists(db):
            os.remove(db)
            print(f"purged: {db}", flush=True)
        brain = f"{home}/.gemini/antigravity-cli/brain/{purge_uuid}"
        if os.path.isdir(brain):
            shutil.rmtree(brain)
            print(f"purged: {brain}", flush=True)
        target['agy_conversation_id_own'] = None
    elif agent == 'hermes':
        json_file = f"{home}/.hermes/sessions/session_{purge_uuid}.json"
        if os.path.exists(json_file):
            os.remove(json_file)
            print(f"purged: {json_file}", flush=True)
        hdb = f"{home}/.hermes/state.db"
        if os.path.exists(hdb):
            conn = None
            try:
                import sqlite3
                conn = sqlite3.connect(hdb)
                conn.execute("DELETE FROM sessions WHERE id=?", (purge_uuid,))
                conn.execute("DELETE FROM messages WHERE session_id=?", (purge_uuid,))
                conn.commit()
                print(f"purged db records for session: {purge_uuid}", flush=True)
            except Exception as e:
                # Best-effort: a failed DB cleanup must not block the YAML update.
                print(f"WARN: purge hermes db records failed: {e}", flush=True)
            finally:
                # Release the database handle even when a statement failed.
                if conn is not None:
                    conn.close()
        target['hermes_conversation_id_own'] = None

    # agent_identities is a cache: clear it only when it refers to this
    # workspace and to the UUID that was just purged.
    ai = (d.get('agent_identities') or {}).get(agent) or {}
    if ai.get('project_cwd') == ws:
        if agent == 'claude' and ai.get('session_id') == purge_uuid:
            ai['session_id'] = None
            ai['session_jsonl'] = None
            ai.pop('session_size_bytes', None)
            ai.pop('session_lines', None)
        elif agent == 'agy' and ai.get('conversation_id') == purge_uuid:
            ai['conversation_id'] = None
            ai['conversation_db'] = None
            ai['conversation_brain_dir'] = None
        elif agent == 'hermes' and ai.get('session_id') == purge_uuid:
            ai['session_id'] = None
elif purge and not purge_uuid:
    print("WARN: --purge-conversation requested but no workspace-scoped UUID resolved; nothing purged", flush=True)

# A purged session is marked as not resumable.
if purge:
    target['resumable'] = False

print(f"updated: {name} status={target['status']}", flush=True)
PYEOF
|
||||
|
||||
# Publish the final "completed" event for this job.
# (delegate_publish_event is defined outside this excerpt.)
delegate_publish_event "${DELEGATE_JOB_ID}" completed 'session terminated'

# Human-readable summary of what was stopped, followed by a recovery hint.
# One printf emits the whole banner, one argument per output line.
printf '%s\n' \
  '' \
  '=== stop complete ===' \
  " session: ${SESSION_NAME}" \
  " agent: ${AGENT}" \
  " reason: ${REASON}" \
  " captured: ${CAPTURED_UUID:-<none>}" \
  " purge: ${PURGE}${PURGE_UUID:+ (uuid ${PURGE_UUID})}" \
  " time: ${NOW_ISO}" \
  '' \
  'Recovery: tmux-agent-orchestrate-create + tmux-agent-orchestrate-resume 로 동일 컨텍스트 복원 가능' \
  ' (단 --purge-conversation 사용 시 복원 불가)'
|
||||
Reference in New Issue
Block a user