multi-agent-mux/.agents/skills/tmux-agent-orchestrate-delegate-job/tmux-agent-orchestrate-delegate-job

#!/usr/bin/env bash
# tmux-agent-orchestrate-delegate-job — user-facing orchestrator for the tmux-agent-orchestrate-delegate-job skill.
#
# Subcommands:
#   submit   register a job, start the subscriber FIRST, then run the agent,
#            then (optionally) run a validation script.
#   status   show one job record.
#   list     list all jobs.
#   verify   run a user-supplied --validate script against a job's artifacts.
#   wait     block until a job reaches a terminal state: the one named by
#            --job, or (without --job) whatever job_subscriber.py --wait-any
#            waits for.
#   logs     show the persistent audit log of one job, or list them (--list).
#
# This is a reference wrapper: it shells out to the python scripts in the
# scripts/ directory next to it. Copy it into your project and customise as
# needed. With --dry-run, submit only prints the instructions it would hand to
# the agent; otherwise a missing `tmux` or agent binary (`claude`/`codex`) is
# reported as an error and the command fails.
# Strict mode: stop on an unhandled command failure (errexit), on expansion of
# an unset variable (nounset), and when any stage of a pipeline fails
# (pipefail).
set -o errexit -o nounset -o pipefail

# Absolute path of the directory that holds this script; the python helpers
# are invoked as "$SCRIPT_DIR/scripts/<name>.py".
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"

# Pick the Python interpreter used to run the helper scripts and print it on
# stdout.
#
# Resolution order:
#   1. $DELEGATE_JOB_PYTHON, when set and non-empty (used verbatim).
#   2. <WORKDIR>/.venv/bin/python, when executable (WORKDIR defaults to ".").
#   3. ./.venv/bin/python in the current directory, when executable.
#   4. python3 from PATH.
# Virtualenv candidates are reported as absolute paths so the result stays
# usable after the caller changes directory (cmd_submit does `cd "$WORKDIR"`
# after picking the interpreter).
#
# The chosen interpreter must be able to `import paho.mqtt`; otherwise an
# install hint goes to stderr and the function exits with status 1. Callers
# use it through command substitution (PY="$(pick_python)"), so that exit ends
# the substitution's subshell and shows up as a failed assignment.
pick_python() {
  local py_bin venv_root
  local workdir="${WORKDIR:-.}"
  if [[ -n "${DELEGATE_JOB_PYTHON:-}" ]]; then
    py_bin="$DELEGATE_JOB_PYTHON"
  elif [[ -x "$workdir/.venv/bin/python" ]]; then
    # Absolutize: a relative --workdir would otherwise yield a path that stops
    # resolving as soon as the caller cd's into the workdir. The redirect
    # keeps a CDPATH-induced `cd` echo out of the captured value.
    venv_root="$(cd "$workdir" >/dev/null 2>&1 && pwd)" || venv_root="$workdir"
    py_bin="$venv_root/.venv/bin/python"
  elif [[ -x ".venv/bin/python" ]]; then
    py_bin="$(pwd)/.venv/bin/python"
  else
    py_bin="python3"
  fi
  if ! "$py_bin" -c "import paho.mqtt" 2>/dev/null; then
    echo "ERROR: paho-mqtt package is missing for $py_bin." >&2
    echo "       Please create a virtual environment and install it:" >&2
    echo "       python3 -m venv .venv && .venv/bin/pip install -r \"$SCRIPT_DIR/requirements.txt\"" >&2
    exit 1
  fi
  echo "$py_bin"
}

# Registry directory used when --registry-dir is not given. The path is
# relative, so it resolves against whatever the working directory is when the
# registry is accessed.
REGISTRY_DIR_DEFAULT='.mam/jobs'

# Print the command synopsis to stdout, one line per printf argument.
usage() {
  printf '%s\n' \
    'tmux-agent-orchestrate-delegate-job <command> [options]' \
    '' \
    '  submit  --agent <name> --prompt <text> [--workdir <dir>] [--agent-session <label>]' \
    '          [--timeout <sec>] [--idle-timeout <sec>] [--validate <script>]' \
    '          [--registry-dir <dir>] [--dry-run]' \
    '          # The skill is tmux-interactive only; --mode print was removed.' \
    '  status  --job <id> [--registry-dir <dir>]' \
    '  list    [--registry-dir <dir>]' \
    '  verify  --job <id> --validate <script> [--registry-dir <dir>]' \
    '  wait    [--job <id>] [--timeout <sec>] [--registry-dir <dir>]' \
    '  logs    <job_id> | --list      # persistent audit log (delegate_job_logs/)'
}

# ---- arg parsing helpers --------------------------------------------------
# Option state shared by every subcommand. These are the defaults; parse_opts
# overwrites them from the command line.
AGENT="claude-code"            # --agent
PROMPT=""                      # --prompt
WORKDIR="$(pwd)"               # --workdir (defaults to the invocation directory)
AGENT_SESSION="tmux:claude"    # --agent-session
TIMEOUT=3600                   # --timeout, seconds
IDLE_TIMEOUT=120               # --idle-timeout, seconds
VALIDATE=""                    # --validate
DRY_RUN=0                      # --dry-run sets this to 1
JOB_ID=""                      # --job
REGISTRY_DIR="$REGISTRY_DIR_DEFAULT"   # --registry-dir

# Parse the long options shared by all subcommands into the global option
# variables (AGENT, PROMPT, WORKDIR, AGENT_SESSION, TIMEOUT, IDLE_TIMEOUT,
# VALIDATE, JOB_ID, REGISTRY_DIR, DRY_RUN).
#
# Arguments: the subcommand's remaining command-line words.
# Exits 1 (after an error on stderr and the usage text on stdout) on an
# unknown option or when a value-taking option is the last word. The latter
# used to surface as a bare "unbound variable" abort from `$2`.
parse_opts() {
  local opt
  while (( $# > 0 )); do
    opt="$1"

    # First pass: classify the option and validate its arity.
    case "$opt" in
      --dry-run)
        DRY_RUN=1
        shift
        continue
        ;;
      --agent|--prompt|--workdir|--agent-session|--timeout|--idle-timeout|--validate|--job|--registry-dir)
        if (( $# < 2 )); then
          echo "option $opt requires a value" >&2
          usage
          exit 1
        fi
        ;;
      *)
        echo "unknown option: $opt" >&2
        usage
        exit 1
        ;;
    esac

    # Second pass: store the value. Empty strings and values that start with
    # dashes are accepted as-is (a prompt may legitimately begin with "--").
    case "$opt" in
      --agent)         AGENT="$2";;
      --prompt)        PROMPT="$2";;
      --workdir)       WORKDIR="$2";;
      --agent-session) AGENT_SESSION="$2";;
      --timeout)       TIMEOUT="$2";;
      --idle-timeout)  IDLE_TIMEOUT="$2";;
      --validate)      VALIDATE="$2";;
      --job)           JOB_ID="$2";;
      --registry-dir)  REGISTRY_DIR="$2";;
    esac
    shift 2
  done
}

# submit: register a job, start its subscriber, launch the agent, wait for the
# job to reach a terminal state, then optionally validate.
#
# Arguments: the submit options (see usage); --prompt is required.
# Globals:   reads the option variables set by parse_opts, SCRIPT_DIR and
#            DELEGATE_JOB_LOGS_DIR; writes WORKDIR (made absolute), PY, JOB_ID.
# Outputs:   progress lines on stdout. The LAST stdout line is always the
#            audit-log directory for the job, so callers can scrape it with
#            `tail -n1`.
# Exits:     1 when --prompt is missing, the workdir cannot be entered, or
#            the registry returns no job id.
#
# With --dry-run the job is still registered (its id appears in the printed
# instructions) but no subscriber is started and nothing is waited for.
cmd_submit() {
  parse_opts "$@"
  [[ -n "$PROMPT" ]] || { echo "submit requires --prompt" >&2; exit 1; }

  # Make --workdir absolute before anything is derived from it: the function
  # cd's into it below, after which a relative value would no longer resolve
  # (interpreter path from pick_python, tmux start directory, log path).
  local abs_workdir
  abs_workdir="$(cd "$WORKDIR" >/dev/null && pwd)" \
    || { echo "submit: cannot enter workdir: $WORKDIR" >&2; exit 1; }
  WORKDIR="$abs_workdir"

  PY="$(pick_python)"
  cd "$WORKDIR"
  mkdir -p "$REGISTRY_DIR"

  # 1) register job (prints the new job id)
  JOB_ID="$("$PY" "$SCRIPT_DIR/scripts/registry.py" --registry-dir "$REGISTRY_DIR" register \
      --prompt "$PROMPT" --agent "$AGENT" --agent-session "$AGENT_SESSION" \
      --timeout "$TIMEOUT" --idle-timeout "$IDLE_TIMEOUT")"
  [[ -n "$JOB_ID" ]] || { echo "submit: registry.py did not return a job id" >&2; exit 1; }
  echo "registered job: $JOB_ID"

  # 2) START THE SUBSCRIBER FIRST (ordering dependency — MQTT does not queue
  #    non-retained messages for absent subscribers). Skipped for --dry-run:
  #    no agent will ever publish, so a subscriber would only linger in the
  #    background until its timeout.
  local logf="$REGISTRY_DIR/$JOB_ID.subscriber.out"
  local sub_pid=""
  if [[ "$DRY_RUN" != "1" ]]; then
    "$PY" "$SCRIPT_DIR/scripts/job_subscriber.py" --registry-dir "$REGISTRY_DIR" \
        --job "$JOB_ID" --timeout "$TIMEOUT" --idle-timeout "$IDLE_TIMEOUT" \
        >"$logf" 2>&1 &
    sub_pid=$!
    echo "subscriber pid: $sub_pid (log: $logf)"
    sleep 1  # give the subscriber time to CONNACK + SUBSCRIBE before the agent runs
  fi

  # 3) run the agent (or print the command for dry-run / missing binary)
  # The publish command is pasted into the agent's instructions and typed into
  # a shell by the agent, so every path is shell-quoted; for paths without
  # special characters %q leaves the text unchanged.
  local pub
  printf -v pub '%q %q --registry-dir %q --job %q' \
    "$PY" "$SCRIPT_DIR/scripts/publish_event.py" "$REGISTRY_DIR" "$JOB_ID"
  # NOTE: the agent MUST use --job "$JOB_ID" (the one we just minted). Hard-coding
  # an id from an earlier session is the #1 reason a delegated job sits idle and
  # times out (see SKILL.md "Wrong job_id propagated to the agent"). We make the
  # freshness explicit in the instruction header.
  local instructions="Your job_id is \"$JOB_ID\" (the one just registered for THIS delegation — read it from the registry record, do NOT reuse any job_id you saw in earlier runs).

On start run:        $pub --event started.
On permission/tool prompt run: $pub --event permission_required --detail '<tool>:<what>'.
On progress (optional): $pub --event progress --detail '<short status>'.
On success run:      $pub --event completed --detail '<one-line summary>'.
On failure run:      $pub --event error     --detail '<one-line reason>'.

The subscriber for this job_id is already running; your completed/error event ends the job. Exit codes: 0 completed, 1 error, 2 publish failure.

Task: $PROMPT"

  # Called as a plain statement on purpose: invoking it under `if`, `||` or
  # `&&` would suspend errexit for everything that runs inside run_agent.
  run_agent "$JOB_ID" "$instructions"

  # 4) wait for the subscriber. The agent runs detached, so run_agent returns
  #    as soon as the session is launched; the subscriber exiting is what
  #    marks the end of the job. Its non-zero exit is tolerated so the log
  #    dump and the final path line are still printed.
  if [[ -n "$sub_pid" ]]; then
    wait "$sub_pid" || true
    echo "subscriber output:"; cat "$logf" || true
  fi

  # 5) optional validation hook — run after the wait so the script inspects
  #    the finished job rather than one that has only just been launched.
  #    A failing validation is reported but does not change the exit status.
  if [[ -n "$VALIDATE" ]]; then
    echo "running validation: $VALIDATE"
    if JOB_ID="$JOB_ID" REGISTRY_DIR="$REGISTRY_DIR" bash "$VALIDATE"; then
      echo "validation: PASS"
    else
      local rc=$?
      echo "validation: FAIL (exit $rc)"
    fi
  fi

  # Last stdout line: the persistent audit-log dir for this job (see SKILL.md
  # "Audit Logs"). Callers can scrape `tail -n1` to find it.
  local logs_root="${DELEGATE_JOB_LOGS_DIR:-$WORKDIR/delegate_job_logs}"
  echo "$logs_root/$JOB_ID"
}

# Wrap $1 in single quotes so a POSIX-style shell reads it back as exactly one
# literal word; each embedded single quote is emitted as '\''.
_delegate_job_sh_quote() {
  local q="'" rest=$1 quoted=""
  while [[ $rest == *"$q"* ]]; do
    quoted+=${rest%%"$q"*}"'\\''"
    rest=${rest#*"$q"}
  done
  printf "'%s'" "$quoted$rest"
}

# Launch the agent for one job in a fresh, detached tmux session.
#
# Arguments: $1 - job id (used in the pane banner and in the error event)
#            $2 - instruction text fed to the agent on stdin
# Globals:   AGENT, AGENT_SESSION, DRY_RUN, WORKDIR, REGISTRY_DIR, SCRIPT_DIR,
#            PY (all read)
# Returns:   0 once the session is launched (or nothing needed launching:
#            human agent / dry-run); 1 when tmux or the agent binary is
#            missing or the session name is taken; tmux's exit status when
#            `tmux new-session` itself fails.
run_agent() {
  local job_id="$1" instructions="$2"
  local bin
  # The skill is INTERACTIVE-ONLY. We never invoke `claude -p` or any other
  # one-shot print mode, because:
  #   - claude -p exits the moment stdin is drained, so there's nothing to
  #     `tmux attach` to afterwards.
  #   - fire-and-forget via wrapper defeats the whole point of the audit log
  #     (you can't tell what happened if the agent crashes mid-turn).
  #   - the job registry already gives us an authoritative completion signal,
  #     so we don't need a wrapper-side exit code to know "done".
  # The user attaches with `tmux attach -t <session>` and types follow-up
  # prompts themselves. We pre-load the first prompt via stdin and `read`
  # keeps the pane open after the agent exits so the user can review.
  case "$AGENT" in
    claude-code) bin="claude";;
    codex)       bin="codex";;
    human)       echo "[human agent] complete the task, then run publish_event.py --event completed"; return;;
    *)           bin="$AGENT";;
  esac

  if [[ "$DRY_RUN" == "1" ]]; then
    echo "[dry-run] would launch agent '$AGENT' in a fresh tmux session with instructions:"
    echo "----"; echo "$instructions"; echo "----"
    return
  fi

  if ! command -v tmux >/dev/null 2>&1; then
    echo "ERROR: this skill requires tmux (interactive agent sessions)." >&2
    echo "       Install with: brew install tmux   (or your package manager)" >&2
    return 1
  fi
  if ! command -v "$bin" >/dev/null 2>&1; then
    echo "ERROR: agent binary '$bin' not found in PATH." >&2
    return 1
  fi

  local sess="${AGENT_SESSION#tmux:}"
  # Refuse to reuse an existing session name (e.g. the user is still attached
  # from an earlier run, or a previous wrapper died without cleanup) and say
  # how to recover.
  if tmux has-session -t "$sess" 2>/dev/null; then
    local attached
    # The client count is informational only; if it cannot be read (the
    # session vanished in between) report "unknown" rather than aborting
    # silently under errexit/pipefail.
    attached="$(tmux list-clients -t "$sess" 2>/dev/null | wc -l | tr -d ' ')" \
      || attached="unknown"
    echo "ERROR: tmux session '$sess' already exists (clients attached: $attached)." >&2
    echo "       Pick a unique --agent-session (e.g. tmux:demo, tmux:claude-a) or" >&2
    echo "       kill the stale one first:  tmux kill-session -t $sess" >&2
    return 1
  fi

  # Build the pane command. It is a single string that the pane's shell parses
  # again, so every interpolated value is single-quoted first: the instruction
  # text contains the user's prompt, and quotes, `$(...)` or backticks in it
  # must reach the agent verbatim instead of being evaluated by that shell.
  local q_instructions q_bin q_banner pane_cmd
  q_instructions="$(_delegate_job_sh_quote "$instructions")"
  q_bin="$(_delegate_job_sh_quote "$bin")"
  q_banner="$(_delegate_job_sh_quote "--- agent exited (job $job_id); press enter to close ---")"
  pane_cmd="printf '%s' $q_instructions | $q_bin --dangerously-skip-permissions; echo; echo $q_banner; read"

  # Launch and check the status explicitly. This works whether or not the
  # caller has errexit active and leaves any EXIT trap of the caller alone.
  local rc=0
  tmux new-session -d -s "$sess" -c "$WORKDIR" "$pane_cmd" || rc=$?
  if (( rc != 0 )); then
    echo "ERROR: could not start tmux session '$sess' (exit $rc)." >&2
    if [[ -n "$job_id" && -n "${PY:-}" ]]; then
      # Record the failure on the job, addressing the same registry the
      # agent's own publish commands use. Best effort: a publish failure must
      # not hide the launch failure being returned.
      "$PY" "$SCRIPT_DIR/scripts/publish_event.py" --registry-dir "$REGISTRY_DIR" \
        --job "$job_id" --event error --detail "agent bootstrap failed (exit $rc)" \
        || echo "WARNING: could not publish the error event for job $job_id." >&2
    fi
    return "$rc"
  fi
  echo "agent launched in tmux session: $sess (attach with: tmux attach -t $sess)"
}

# status: print the registry record of the job named by --job.
# Exits 1 when --job was not given.
cmd_status() {
  parse_opts "$@"
  if [[ -z "$JOB_ID" ]]; then
    echo "status requires --job" >&2
    exit 1
  fi
  PY="$(pick_python)"
  local registry_cmd=("$PY" "$SCRIPT_DIR/scripts/registry.py" --registry-dir "$REGISTRY_DIR")
  "${registry_cmd[@]}" get --job "$JOB_ID"
}

# list: print every job known to the registry in --registry-dir.
cmd_list() {
  parse_opts "$@"
  PY="$(pick_python)"
  local registry_py="$SCRIPT_DIR/scripts/registry.py"
  "$PY" "$registry_py" --registry-dir "$REGISTRY_DIR" list
}

# verify: run the --validate script with bash, exposing JOB_ID and
# REGISTRY_DIR to it through the environment, and exit with the script's
# status (0 = PASS). Exits 1 when --job or --validate is missing.
cmd_verify() {
  parse_opts "$@"
  if [[ -z "$JOB_ID" ]]; then
    echo "verify requires --job" >&2
    exit 1
  fi
  if [[ -z "$VALIDATE" ]]; then
    echo "verify requires --validate <script>" >&2
    exit 1
  fi

  echo "verifying job $JOB_ID with $VALIDATE"
  local status=0
  JOB_ID="$JOB_ID" REGISTRY_DIR="$REGISTRY_DIR" bash "$VALIDATE" || status=$?
  if (( status == 0 )); then
    echo "verify: PASS (exit 0)"
  else
    echo "verify: FAIL (exit $status)"
  fi
  exit "$status"
}

# logs <job_id> | logs --list — hand off to the `logs` CLI of registry.py,
# which reads the persistent audit log under $DELEGATE_JOB_LOGS_DIR (or
# <cwd>/delegate_job_logs). Run from your project dir so the default resolves.
# Exits 1 when neither a job id nor --list is given.
cmd_logs() {
  PY="$(pick_python)"
  local registry_py="$SCRIPT_DIR/scripts/registry.py"
  local target="${1:-}"
  case "$target" in
    --list)
      "$PY" "$registry_py" logs --list
      ;;
    "")
      echo "logs requires <job_id> or --list" >&2
      exit 1
      ;;
    *)
      "$PY" "$registry_py" logs "$target"
      ;;
  esac
}

# wait: run job_subscriber.py in the foreground, either for the job named by
# --job or, when no job is given, in its --wait-any mode. --timeout is passed
# through in both cases.
cmd_wait() {
  parse_opts "$@"
  PY="$(pick_python)"

  local target=(--wait-any)
  if [[ -n "$JOB_ID" ]]; then
    target=(--job "$JOB_ID")
  fi

  "$PY" "$SCRIPT_DIR/scripts/job_subscriber.py" --registry-dir "$REGISTRY_DIR" \
    "${target[@]}" --timeout "$TIMEOUT"
}

# Dispatch on the first argument (the subcommand) and pass the remaining
# arguments to the matching cmd_<name> function. No subcommand or a help flag
# prints the usage; anything else is an error (exit 1).
main() {
  local sub="${1:-}"
  if (( $# > 0 )); then
    shift
  fi

  case "$sub" in
    submit|status|list|verify|wait|logs)
      "cmd_$sub" "$@"
      ;;
    ""|-h|--help|help)
      usage
      ;;
    *)
      echo "unknown command: $sub" >&2
      usage
      exit 1
      ;;
  esac
}

# Entry point: hand the script's command-line arguments to main unchanged.
main "$@"