Files
multi-agent-mux/skills/lib.sh
T
Godopu e8eebe5eb1 feat(tmux-agent-orchestrate-monitor): integrate watchdog pattern as skill
Moved /tmp/subscriber-watchdog.sh → skills/tmux-agent-orchestrate-monitor/scripts/watchdog.sh
(skill-managed lifecycle, no longer lives outside workspace).

Added lib.sh::start_watchdog() helper:
- Spawns watchdog as background nohup process
- Writes watchdog log to .hermes/jobs/<JID>.watchdog.log
- Returns watchdog PID via stdout

Wired create_session.sh --submit-job to auto-start watchdog after JOB registration.

Fixes:
- Bug: registry.py get first-line parse was fragile (empty status → infinite loop)
  → Now uses python3 json.load for robust parsing
- Bug: old path skills/delegate-job/scripts/job_subscriber.py hardcoded
  → Now uses skills/tmux-agent-orchestrate-delegate-job/scripts/job_subscriber.py

Verified on isolated server -L agy-watchdog-skill-test (kill-server after):
- Syntax check PASS
- E2E: register job → start watchdog → publish completed → watchdog exits
- Global skill non-interference verified
- Main isolated server -L multi-agent-canary untouched
2026-06-19 23:33:46 +00:00

450 lines
16 KiB
Bash

#!/usr/bin/env bash
# lib.sh — shared library for the tmux-agent-orchestrate-* skills.
#
# Single source of truth for the four things that were inconsistently
# re-implemented across create/resume/delete/monitor (REVIEW.md §4.1):
# - derive_session_name : the tmux session slug (P0-A)
# - atomic_dump_yaml : flock + temp+rename + .bak + validate (P0-B)
# - env_python : env-safe Python (no heredoc injection) (P0-B / P1-B)
# - find_workspace_uuid : workspace-SCOPED resume id lookup (P0-C)
#
# Source it from each script with a path computed from the script location:
# source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
#
# HARD RULE: the agent-sessions.yaml file is only ever written through
# atomic_dump_yaml. Never `open(yaml_path, 'w')` anywhere else.
# Registry file location. A non-empty AGENT_SESSIONS_YAML already present in
# the environment wins over the built-in default below.
: "${AGENT_SESSIONS_YAML:=$HOME/PuKi/lab/agent_sessions/agent-sessions.yaml}"
# ---------------------------------------------------------------------------
# Tmux Server Isolation support
# ---------------------------------------------------------------------------
# Name of the tmux server to talk to. Unset or empty becomes "default", which
# the helpers below treat as "do not pass -L".
: "${TMUX_SERVER_NAME:=default}"
# _resolve_real_tmux_path
#
# Make sure the exported _REAL_TMUX_PATH names a tmux binary that is not one
# of this project's own wrappers. A cached value is kept unless it is empty or
# lives in a wrapper directory (".../multi-agent-tmux-shim/..." or
# ".../skills/.bin"). Otherwise PATH is scanned left to right for the first
# executable "tmux" outside those directories; if none exists the bare command
# name "tmux" is used.
#
# PATH is split with parameter expansion rather than IFS word splitting, so
# the caller's IFS is never modified (an unset IFS stays unset) and PATH
# entries containing glob characters are taken literally.
_resolve_real_tmux_path() {
  local cached="${_REAL_TMUX_PATH:-}"
  if [[ -n "$cached" \
        && "$cached" != *"/multi-agent-tmux-shim/"* \
        && "$cached" != *"/skills/.bin"* ]]; then
    return 0
  fi

  local remaining="${PATH:-}" dir
  _REAL_TMUX_PATH=""
  while [[ -n "$remaining" ]]; do
    # Peel the first ':'-separated entry off the front of the list.
    dir="${remaining%%:*}"
    if [[ "$remaining" == *:* ]]; then
      remaining="${remaining#*:}"
    else
      remaining=""
    fi
    # An empty entry would make us probe "/tmux"; skip it.
    [[ -n "$dir" ]] || continue
    if [[ "$dir" != *"/multi-agent-tmux-shim/"* \
          && "$dir" != *"/skills/.bin"* \
          && -x "$dir/tmux" ]]; then
      _REAL_TMUX_PATH="$dir/tmux"
      break
    fi
  done

  # Nothing suitable on PATH: fall back to the plain command name.
  if [[ -z "$_REAL_TMUX_PATH" ]]; then
    _REAL_TMUX_PATH="tmux"
  fi
  export _REAL_TMUX_PATH
}
# _init_tmux_isolation
#
# Align PATH with the current TMUX_SERVER_NAME.
#   * Named server (non-empty and not "default"): write a small "tmux" wrapper
#     into ${TMPDIR:-/tmp}/multi-agent-tmux-shim/<server>/ and prepend that
#     directory to PATH, unless it is already on PATH. The wrapper execs the
#     real tmux, adding "-L <server>" whenever TMUX_SERVER_NAME is set to a
#     non-default value in the wrapper's own environment.
#   * Otherwise: remove every wrapper directory (".../multi-agent-tmux-shim/..."
#     or ".../skills/.bin") from PATH. All other entries, including empty
#     ones, are kept in their original order.
#
# The real tmux path is embedded in the wrapper via printf %q so that a path
# containing spaces, quotes or '$' still works. PATH is split with parameter
# expansion, so the caller's IFS is never touched and no globbing happens.
_init_tmux_isolation() {
  _resolve_real_tmux_path
  if [[ -n "${TMUX_SERVER_NAME:-}" && "$TMUX_SERVER_NAME" != "default" ]]; then
    local wrapper_dir="${TMPDIR:-/tmp}/multi-agent-tmux-shim/${TMUX_SERVER_NAME}"
    if [[ ":$PATH:" != *":$wrapper_dir:"* ]]; then
      local real_quoted
      printf -v real_quoted '%q' "$_REAL_TMUX_PATH"
      mkdir -p "$wrapper_dir"
      cat <<EOF > "$wrapper_dir/tmux"
#!/usr/bin/env bash
if [ -z "\${TMUX_SERVER_NAME:-}" ] || [ "\$TMUX_SERVER_NAME" = "default" ]; then
  exec ${real_quoted} "\$@"
else
  exec ${real_quoted} -L "\$TMUX_SERVER_NAME" "\$@"
fi
EOF
      chmod +x "$wrapper_dir/tmux"
      export PATH="$wrapper_dir:$PATH"
    fi
  else
    # Isolation is disabled: clean up automatically by dropping every shim
    # directory from PATH.
    local remaining="$PATH" entry kept="" have_kept=0
    while :; do
      entry="${remaining%%:*}"
      if [[ "$entry" != *"/multi-agent-tmux-shim/"* \
            && "$entry" != *"/skills/.bin"* ]]; then
        if (( have_kept )); then
          kept+=":$entry"
        else
          kept="$entry"
          have_kept=1
        fi
      fi
      # No ':' left means the entry just handled was the last one.
      [[ "$remaining" == *:* ]] || break
      remaining="${remaining#*:}"
    done
    export PATH="$kept"
  fi
}
# _tmux <tmux args...>
#
# Invoke the real tmux binary (_REAL_TMUX_PATH) after refreshing the isolation
# setup. When TMUX_SERVER_NAME is non-empty and not "default", "-L <name>" is
# placed in front of the caller's arguments. The exit status is tmux's own.
_tmux() {
  _init_tmux_isolation
  if [[ -n "${TMUX_SERVER_NAME:-}" && "$TMUX_SERVER_NAME" != "default" ]]; then
    # Only this function's positional parameters are rewritten.
    set -- -L "$TMUX_SERVER_NAME" "$@"
  fi
  "$_REAL_TMUX_PATH" "$@"
}
# tmux <args...> — shell function that shadows the tmux command for any shell
# sourcing this library, forwarding every argument unchanged to _tmux.
tmux() { _tmux "$@"; }
# ---------------------------------------------------------------------------
# resolve_tmux_server <session_name>
#
# Query agent-sessions.yaml to find the tmux_server associated with a session.
# Fallback to TMUX_SERVER_NAME or 'default' if not registered or field is missing.
# Prints the resolved server name on stdout.
# ---------------------------------------------------------------------------
resolve_tmux_server() {
  local session_name="$1"
  # Both values are handed over through env_python's KEY=VALUE mechanism.
  # TMUX_SERVER_NAME is forwarded explicitly so the fallback also honours a
  # value that is set in this shell but was never exported.
  env_python "$AGENT_SESSIONS_YAML" \
    "SESSION_NAME=$session_name" \
    "TMUX_SERVER_NAME=${TMUX_SERVER_NAME:-default}" <<'PYEOF'
import os, sys, yaml

name = os.environ['SESSION_NAME']
yaml_path = os.environ['YAML_PATH']

if os.path.exists(yaml_path):
    try:
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
        # `or []` covers an explicit `tmux_sessions: null`.
        for s in d.get('tmux_sessions') or []:
            # A malformed row must not hide a valid row that follows it.
            if not isinstance(s, dict):
                continue
            if s.get('name') == name:
                server = s.get('tmux_server')
                if server:
                    print(server)
                    sys.exit(0)
    except Exception:
        # Best effort by design: an unreadable registry means "use the fallback".
        pass

# Fallback: the caller's server name, or 'default' when that is empty.
print(os.environ.get('TMUX_SERVER_NAME') or 'default')
PYEOF
}
# ---------------------------------------------------------------------------
# derive_session_name <workspace> <agent>
#
# THE single source of truth for the tmux session name. Rule:
# slug = the two trailing path components of the absolute workspace,
# '_' -> '-', lowercased, joined with '-'
# name = "<slug>-creator-<agent>"
#
# /home/godopu16/PuKi/lab/landing_page/refer_landing_page + claude
# -> landing-page-refer-landing-page-creator-claude
#
# Decision (REVIEW P0-A): the actual workspace basename (refer_landing_page)
# IS included. The hand-written historical entry that dropped it
# (lab-landing-page-creator-claude) was the bug, not the convention.
# Every script and SKILL.md must use exactly this rule.
# ---------------------------------------------------------------------------
derive_session_name() {
  local workspace="$1" agent="$2"
  local abs parent work slug
  # Resolve to an absolute path when the directory exists; otherwise use the
  # argument as given. CDPATH is blanked for the lookup: with CDPATH set, `cd`
  # can land in a different directory and echo its name, which would end up in
  # the captured output. `--` keeps a leading '-' from being read as an option.
  abs="$(CDPATH='' cd -- "$workspace" 2>/dev/null && pwd)" || abs="$workspace"
  parent="$(basename "$(dirname "$abs")")"
  work="$(basename "$abs")"
  # Slug = "<parent>-<work>", lowercased, with '_' turned into '-'.
  slug="$(printf '%s-%s' "$parent" "$work" | tr '[:upper:]' '[:lower:]' | tr '_' '-')"
  printf '%s-creator-%s' "$slug" "$agent"
}
# ---------------------------------------------------------------------------
# env_python <yaml_path> [KEY=VALUE ...] (Python source read from stdin)
#
# Run python3 with the source supplied on stdin via a *quoted* heredoc, so the
# shell never interpolates the source. All values are passed through the
# environment (YAML_PATH plus any KEY=VALUE pairs). Untrusted data (workspace
# paths, capture-pane text) must travel as env vars and be read via os.environ
# inside the script — never spliced into the source. Read-only by convention;
# use atomic_dump_yaml when you need to write the YAML.
# ---------------------------------------------------------------------------
env_python() {
  local yaml_file="$1"
  shift
  # YAML_PATH and HOME_DIR are always provided to the Python program.
  local -a exported=("YAML_PATH=$yaml_file" "HOME_DIR=$HOME")
  # Consume the leading run of KEY=VALUE words; the first word without '='
  # ends the run and, with everything after it, is passed on to Python.
  while (( $# > 0 )) && [[ "$1" == *=* ]]; do
    exported+=("$1")
    shift
  done
  # "-" tells python3 to read its program from stdin (the caller's heredoc).
  env "${exported[@]}" python3 - "$@"
}
# ---------------------------------------------------------------------------
# atomic_dump_yaml <yaml_path> [KEY=VALUE ...] (mutation source from stdin)
#
# The ONLY sanctioned way to write agent-sessions.yaml. It:
# 1. takes an exclusive flock on <yaml_path>.lock (serialises all writers)
# 2. loads the YAML into `d`
# 3. exec()s the caller's mutation source (sees d, yaml, os, datetime,
# timezone, glob, subprocess; reads values via os.environ). The mutation
# may print and may `raise SystemExit(n)` to abort *without* writing.
# 4. validates the resulting schema
# 5. backs up to <yaml_path>.bak, then writes atomically (temp + os.replace)
#
# The mutation source is passed via env and exec()'d — it is never string
# spliced and untrusted data never lands in Python source (P0-B / P1-B).
# ---------------------------------------------------------------------------
atomic_dump_yaml() {
  local yaml_path="$1"; shift
  local -a envs=("YAML_PATH=$yaml_path" "HOME_DIR=$HOME")
  # Leading KEY=VALUE arguments become environment variables that the
  # mutation can read through os.environ.
  while [ $# -gt 0 ]; do
    case "$1" in
      *=*) envs+=("$1"); shift ;;
      *) break ;;
    esac
  done
  # The caller's mutation source arrives on this function's stdin. It is
  # passed to Python through the environment because Python's own stdin
  # carries the driver program below.
  local mutation; mutation="$(cat)"
  env "${envs[@]}" AGENT_SESSIONS_MUTATION="$mutation" python3 - <<'PYEOF'
import os, sys, fcntl, tempfile, shutil, glob, subprocess, json
from datetime import datetime, timezone
import yaml

yaml_path = os.environ['YAML_PATH']
lock_path = yaml_path + '.lock'


def _validate(d):
    """Abort (SystemExit with a VALIDATE message) unless d has the expected shape."""
    if not isinstance(d, dict):
        raise SystemExit("VALIDATE: top-level is not a mapping")
    sessions = d.get('tmux_sessions', [])
    if not isinstance(sessions, list):
        raise SystemExit("VALIDATE: tmux_sessions is not a list")
    valid = {'running', 'terminated', 'archived'}
    for i, s in enumerate(sessions):
        if not isinstance(s, dict):
            raise SystemExit(f"VALIDATE: tmux_sessions[{i}] not a mapping")
        if not s.get('name') or not s.get('status'):
            raise SystemExit(f"VALIDATE: tmux_sessions[{i}] missing name/status")
        if s['status'] not in valid:
            raise SystemExit(f"VALIDATE: tmux_sessions[{i}] {s.get('name')!r} bad status {s['status']!r}")
        if not isinstance(s.get('pane'), dict):
            raise SystemExit(f"VALIDATE: tmux_sessions[{i}] {s.get('name')!r} missing pane")


# Exclusive lock on <yaml_path>.lock for the whole load/mutate/write cycle.
lock_fh = open(lock_path, 'w')
fcntl.flock(lock_fh, fcntl.LOCK_EX)
try:
    if os.path.exists(yaml_path):
        # Explicit UTF-8: the file is written with allow_unicode=True, so it
        # must not depend on the locale's default encoding.
        with open(yaml_path, encoding='utf-8') as f:
            d = yaml.safe_load(f) or {}
    else:
        d = {}
    # --- caller mutation (module scope: sees d, yaml, os, glob, subprocess) ---
    # A SystemExit raised here propagates past the write below, so nothing is
    # written; the finally clause still releases the lock.
    exec(compile(os.environ['AGENT_SESSIONS_MUTATION'], '<mutation>', 'exec'), globals())
    _validate(d)
    if os.path.exists(yaml_path):
        try:
            shutil.copy2(yaml_path, yaml_path + '.bak')
        except Exception as exc:
            # The backup stays best effort, but a failure is now reported.
            print(f"WARNING: could not back up {yaml_path}: {exc}", file=sys.stderr)
    dir_ = os.path.dirname(yaml_path) or '.'
    # The temp file lives in the target directory so os.replace stays a
    # same-filesystem rename.
    fd, tmp = tempfile.mkstemp(dir=dir_, prefix='.agent-sessions.', suffix='.tmp')
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            yaml.safe_dump(d, f, default_flow_style=False, sort_keys=False,
                           allow_unicode=True, width=4096)
            # Push the data to disk before the rename makes it visible.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp, yaml_path)
    except BaseException:
        # BaseException, so an interrupt or SystemExit does not leave the
        # temp file behind either.
        if os.path.exists(tmp):
            os.remove(tmp)
        raise
finally:
    fcntl.flock(lock_fh, fcntl.LOCK_UN)
    lock_fh.close()
PYEOF
}
# ---------------------------------------------------------------------------
# find_workspace_uuid <workspace> <agent>
#
# Workspace-SCOPED resolution of the resume UUID (P0-C). It NEVER returns a
# global agent_identities id unless that id's project_cwd matches THIS
# workspace. Resolution order:
# 1) tmux_sessions[] row whose pane.cwd == this workspace -> per-row own id
# (claude_session_id_own / agy_conversation_id_own)
# 2) on-disk scan scoped to this workspace
# (claude: ~/.claude/projects/<key>/*.jsonl ; agy: last_conversations.json[cwd])
# 3) agent_identities cache, ONLY when its project_cwd == this workspace
# Prints the UUID on stdout (empty line if none). Always exits 0.
# ---------------------------------------------------------------------------
find_workspace_uuid() {
  local workspace="$1" agent="$2"
  local abs
  # CDPATH is blanked so `cd` cannot resolve the workspace against another
  # directory (and echo it into the captured output); `--` guards a leading '-'.
  abs="$(CDPATH='' cd -- "$workspace" 2>/dev/null && pwd)" || abs="$workspace"
  env_python "$AGENT_SESSIONS_YAML" "WS_ABS=$abs" "AGENT=$agent" <<'PYEOF'
import os, sys, json, glob
import yaml

ws = os.environ['WS_ABS']
agent = os.environ['AGENT']
home = os.environ['HOME_DIR']
yaml_path = os.environ['YAML_PATH']

# Load the registry. A missing, unreadable or malformed file is treated as
# empty so the on-disk scan (step 2) can still answer.
d = {}
if os.path.exists(yaml_path):
    try:
        with open(yaml_path) as f:
            d = yaml.safe_load(f) or {}
    except Exception as exc:
        print(f"WARNING: cannot read {yaml_path}: {exc}", file=sys.stderr)
        d = {}
if not isinstance(d, dict):
    d = {}

# Directory name under ~/.claude/projects for this workspace.
claude_key = ws.replace('/', '-').replace('_', '-')


def jsonl_exists(uuid):
    return os.path.exists(f"{home}/.claude/projects/{claude_key}/{uuid}.jsonl")


def db_exists(uuid):
    return os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{uuid}.db")


def emit(u):
    print(u)
    raise SystemExit(0)


def mtime(path):
    # A file may disappear between glob() and the sort; rank it last.
    try:
        return os.path.getmtime(path)
    except OSError:
        return 0


# 1) per-row own id for THIS workspace
rows = d.get('tmux_sessions')
if not isinstance(rows, list):
    rows = []
for s in rows:
    if not isinstance(s, dict):
        continue
    pane = s.get('pane')
    if not isinstance(pane, dict) or pane.get('cwd') != ws:
        continue
    name = s.get('name') or ''
    if not isinstance(name, str):
        continue
    if agent == 'claude' and name.endswith('-creator-claude'):
        cand = s.get('claude_session_id_own')
        if cand and jsonl_exists(cand):
            emit(cand)
    if agent == 'agy' and name.endswith('-creator-agy'):
        cand = s.get('agy_conversation_id_own')
        if cand and db_exists(cand):
            emit(cand)

# 2) disk scan scoped to THIS workspace
if agent == 'claude':
    proj = f"{home}/.claude/projects/{claude_key}"
    if os.path.isdir(proj):
        # Newest transcript first.
        for j in sorted(glob.glob(f"{proj}/*.jsonl"), key=mtime, reverse=True):
            sid = None
            try:
                with open(j) as f:
                    first = f.readline().strip()
                if first:
                    sid = json.loads(first).get('sessionId')
            except Exception:
                sid = None
            # Fall back to the file name without its ".jsonl" suffix.
            cand = sid or os.path.basename(j)[:-6]
            if cand and jsonl_exists(cand):
                emit(cand)
elif agent == 'agy':
    lc = f"{home}/.gemini/antigravity-cli/cache/last_conversations.json"
    if os.path.exists(lc):
        cand = None
        try:
            with open(lc) as f:
                cand = json.load(f).get(ws)
        except Exception:
            cand = None
        if cand and db_exists(cand):
            emit(cand)

# 3) agent_identities cache, workspace-checked only
identities = d.get('agent_identities')
ai = identities.get(agent) if isinstance(identities, dict) else None
if isinstance(ai, dict) and ai.get('project_cwd') == ws:
    if agent == 'claude':
        cand = ai.get('session_id')
        if cand and jsonl_exists(cand):
            emit(cand)
    elif agent == 'agy':
        cand = ai.get('conversation_id')
        if cand and db_exists(cand):
            emit(cand)

# Nothing matched: an empty line, exit status 0.
print('')
PYEOF
}
# ---------------------------------------------------------------------------
# tmux-agent-orchestrate-delegate-job integration helpers
#
# All paths are resolved relative to lib.sh's own location (BASH_SOURCE), so the
# skill tree is relocatable — no hardcoded absolute paths (review item 6).
# ---------------------------------------------------------------------------
# _delegate_py_bin — echo the virtualenv python (walk up from skills/), else python3.
_delegate_py_bin() {
  local here candidate
  here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  # Climb from this file's directory towards the filesystem root and stop at
  # the first level that carries an executable .venv/bin/python. The root
  # directory itself is never inspected.
  until [ -z "$here" ] || [ "$here" = "/" ]; do
    candidate="$here/.venv/bin/python"
    if [ -x "$candidate" ]; then
      printf '%s\n' "$candidate"
      return 0
    fi
    here="$(dirname "$here")"
  done
  # No virtualenv on the way up: rely on whatever python3 PATH provides.
  printf '%s\n' "python3"
}
# _delegate_script <name> — echo the path to a tmux-agent-orchestrate-delegate-job script, resolved
# relative to skills/ (lib.sh dir). Empty if not found.
_delegate_script() {
  local name="$1" root preferred fallback
  root="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  # First choice: the canonical location inside the delegate-job skill.
  preferred="$root/tmux-agent-orchestrate-delegate-job/scripts/$name"
  if [ -f "$preferred" ]; then
    printf '%s\n' "$preferred"
    return 0
  fi
  # Otherwise take the first entry of that name anywhere below this file's
  # directory. When nothing matches, an empty line is printed.
  fallback="$(find "$root" -name "$name" 2>/dev/null | head -n 1 || true)"
  printf '%s\n' "$fallback"
}
# delegate_submit_job <prompt> <agent> <agent_session>
#
# Register a job in the tmux-agent-orchestrate-delegate-job registry. Prints the new JID on stdout.
delegate_submit_job() {
  local prompt="$1" agent="$2" session="$3"
  local interpreter registry
  interpreter="$(_delegate_py_bin)"
  registry="$(_delegate_script registry.py)"
  # Without the registry CLI there is nothing to call: report and fail.
  if [ -z "$registry" ] || [ ! -f "$registry" ]; then
    echo "ERROR: tmux-agent-orchestrate-delegate-job registry.py not found under skills/" >&2
    return 1
  fi
  # Assemble the command as an array so each value stays a single argument.
  local -a register_cmd=(
    "$interpreter" "$registry" register
    --prompt "$prompt"
    --agent "$agent"
    --agent-session "$session"
  )
  # Whatever the registry prints goes to stdout; its exit status is returned.
  "${register_cmd[@]}"
}
# delegate_publish_event <job_id> <event> [detail]
#
# Publish a lifecycle event to the tmux-agent-orchestrate-delegate-job registry. Consolidates the
# inline .venv-walk + publish_event.py blocks that were duplicated across
# create/delete/resume (review item 7). Non-fatal by contract: an empty job id,
# a missing script, or a broker failure never aborts the caller.
delegate_publish_event() {
  local job_id="$1" event="$2" detail="${3:-}"
  # No job id means there is nothing to publish.
  if [ -z "$job_id" ]; then
    return 0
  fi
  local interpreter publisher
  interpreter="$(_delegate_py_bin)"
  publisher="$(_delegate_script publish_event.py)"
  # A missing publisher script is tolerated silently.
  if [ -z "$publisher" ] || [ ! -f "$publisher" ]; then
    return 0
  fi
  if ! "$interpreter" "$publisher" --job "$job_id" --event "$event" --detail "$detail"; then
    : # a failing publisher must never abort the caller
  fi
  return 0
}
# start_watchdog <job_id> [workdir]
# Spawns a watchdog process to monitor a delegate-job JOB in the background.
# The watchdog re-spawns the subscriber every 2 minutes (or whatever hard
# limit we set) and exits automatically when the JOB reaches terminal state.
# Returns the watchdog PID via stdout.
start_watchdog() {
  local job_id="${1:-}"
  local workdir="${2:-$PWD}"
  # The job id names the log file and is passed to the watchdog; refuse to
  # launch without one.
  if [ -z "$job_id" ]; then
    echo "ERROR: start_watchdog requires a job id" >&2
    return 1
  fi

  # Look for the watchdog under the workspace first. If it is not there, fall
  # back to the copy next to this library, so the skill tree also works when
  # the workspace does not contain it.
  local rel="tmux-agent-orchestrate-monitor/scripts/watchdog.sh"
  local watchdog_script="$workdir/skills/$rel"
  if [ ! -x "$watchdog_script" ]; then
    local lib_dir
    lib_dir="$(CDPATH='' cd -- "$(dirname "${BASH_SOURCE[0]}")" 2>/dev/null && pwd)" || lib_dir=""
    if [ -n "$lib_dir" ] && [ -x "$lib_dir/$rel" ]; then
      watchdog_script="$lib_dir/$rel"
    fi
  fi
  if [ ! -x "$watchdog_script" ]; then
    echo "ERROR: watchdog not found or not executable: $watchdog_script" >&2
    return 1
  fi

  # Create the log directory up front. Otherwise the redirection fails inside
  # the background job, and a PID would still be printed for a process that
  # never started.
  local log_dir="$workdir/.hermes/jobs"
  local log_file="$log_dir/${job_id}.watchdog.log"
  if ! mkdir -p -- "$log_dir"; then
    echo "ERROR: cannot create watchdog log directory: $log_dir" >&2
    return 1
  fi

  # Fully detached: output goes to the log file and stdin is /dev/null, so a
  # caller capturing the PID with $(...) is not kept waiting.
  nohup "$watchdog_script" "$job_id" "$workdir" > "$log_file" 2>&1 < /dev/null &
  local pid=$!
  echo "$pid"
}