#!/usr/bin/env bash
# multi-agent-mux-delegate-job — user-facing orchestrator for the multi-agent-mux-delegate-job skill.
#
# Subcommands:
#   submit   register a job, start the subscriber FIRST, then run the agent,
#            then (optionally) run a validation script.
#   status   show one job record.
#   list     list all jobs.
#   verify   run a user-supplied --validate script against a job's artifacts.
#   wait     block until all running/pending jobs reach a terminal state.
#   logs     show the persistent audit log of one job, or list all of them.
#
# This is a reference wrapper: it shells out to the Python scripts that live
# next to it. Copy it into your project and customise as needed. With --dry-run
# it only prints what it would send to the agent; otherwise a missing `tmux` or
# a missing agent session is reported as an error.
# Strict mode: abort on errors, on unset variables and on failures anywhere in
# a pipeline.
set -o errexit -o nounset -o pipefail

# Absolute directory that contains this script; helper scripts are resolved
# relative to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Source the first .env that exists — the current directory wins over the one
# two levels above this script — exporting every variable it defines.
for _env_candidate in .env "$SCRIPT_DIR/../../.env"; do
  if [[ -f "$_env_candidate" ]]; then
    set -a
    source "$_env_candidate"
    set +a
    break
  fi
done
unset _env_candidate

#######################################
# Pick the Python interpreter used to run the helper scripts.
# Precedence: $DELEGATE_JOB_PYTHON, then <WORKDIR>/.venv/bin/python, then
# ./.venv/bin/python, then plain `python3`.
# Globals:   DELEGATE_JOB_PYTHON (read, optional), WORKDIR (read, optional),
#            SCRIPT_DIR (read, only for the install hint)
# Outputs:   the interpreter path/name on stdout; diagnostics on stderr
# Returns:   exits 1 when the chosen interpreter cannot `import paho.mqtt`
#######################################
pick_python() {
  local py_bin
  local workdir="${WORKDIR:-.}"
  if [[ -n "${DELEGATE_JOB_PYTHON:-}" ]]; then
    py_bin="$DELEGATE_JOB_PYTHON"
  elif [[ -x "$workdir/.venv/bin/python" ]]; then
    # Return an absolute path: callers `cd` into the workdir afterwards, which
    # would invalidate a path that is relative to the current directory.
    if [[ "$workdir" != /* ]]; then
      workdir="$(pwd)/$workdir"
    fi
    py_bin="$workdir/.venv/bin/python"
  elif [[ -x ".venv/bin/python" ]]; then
    py_bin="$(pwd)/.venv/bin/python"
  else
    py_bin="python3"
  fi
  # The helper scripts need paho-mqtt; fail early with an actionable hint.
  if ! "$py_bin" -c "import paho.mqtt" 2>/dev/null; then
    echo "ERROR: paho-mqtt package is missing for $py_bin." >&2
    echo "       Please create a virtual environment and install it:" >&2
    echo "       python3 -m venv .venv && .venv/bin/pip install -r \"$SCRIPT_DIR/requirements.txt\"" >&2
    exit 1
  fi
  echo "$py_bin"
}

# Registry directory used when --registry-dir is not given.
REGISTRY_DIR_DEFAULT='.mam/jobs'

# Print the command-line synopsis for every subcommand to stdout.
usage() {
  local -a help_lines=(
    'multi-agent-mux-delegate-job <command> [options]'
    ''
    '  submit  --agent <name> --prompt <text> [--workdir <dir>] [--agent-session <label>]'
    '          [--timeout <sec>] [--idle-timeout <sec>] [--validate <script>]'
    '          [--registry-dir <dir>] [--dry-run]'
    '          [--type <direct|loop|discuss>] [--reviewer <reviewer_agent>]'
    '          [--reviewer-session <reviewer_session>] [--max-iterations <count>]'
    '          # The skill is tmux-interactive only; --mode print was removed.'
    '  status  --job <id> [--registry-dir <dir>]'
    '  list    [--registry-dir <dir>]'
    '  verify  --job <id> --validate <script> [--registry-dir <dir>]'
    '  wait    [--job <id>] [--timeout <sec>] [--registry-dir <dir>]'
    '  logs    <job_id> | --list      # persistent audit log (delegate_job_logs/)'
  )
  printf '%s\n' "${help_lines[@]}"
}

# ---- arg parsing helpers --------------------------------------------------
# Defaults for every option; parse_opts overwrites them from the command line.
AGENT="claude-code"
PROMPT=""
WORKDIR="$(pwd)"
AGENT_SESSION="tmux:claude"

TIMEOUT=3600
IDLE_TIMEOUT=120
VALIDATE=""
DRY_RUN=0

JOB_ID=""
REGISTRY_DIR="$REGISTRY_DIR_DEFAULT"

TYPE="direct"
REVIEWER="hermes"
REVIEWER_SESSION="tmux:hermes"
MAX_ITERATIONS=5

#######################################
# Parse long options into the global option variables.
# Globals:   AGENT, PROMPT, WORKDIR, AGENT_SESSION, TIMEOUT, IDLE_TIMEOUT,
#            VALIDATE, JOB_ID, REGISTRY_DIR, DRY_RUN, TYPE, REVIEWER,
#            REVIEWER_SESSION, MAX_ITERATIONS (written)
# Arguments: the option words of a subcommand
# Outputs:   on error, a message on stderr followed by the usage text
# Returns:   exits 1 on an unknown option or when a value-taking option is
#            given without its value
#######################################
parse_opts() {
  local opt
  while [[ $# -gt 0 ]]; do
    opt="$1"

    # First pass: handle flags and make sure value-taking options really have
    # a value, so "$2" is never dereferenced when it does not exist.
    case "$opt" in
      --dry-run)
        DRY_RUN=1
        shift
        continue
        ;;
      --agent|--prompt|--workdir|--agent-session|--timeout|--idle-timeout|--validate|--job|--registry-dir|--type|--reviewer|--reviewer-session|--max-iterations)
        if [[ $# -lt 2 ]]; then
          echo "option $opt requires a value" >&2
          usage
          exit 1
        fi
        ;;
      *)
        echo "unknown option: $opt" >&2
        usage
        exit 1
        ;;
    esac

    # Second pass: store the value in the matching global.
    case "$opt" in
      --agent) AGENT="$2";;
      --prompt) PROMPT="$2";;
      --workdir) WORKDIR="$2";;
      --agent-session) AGENT_SESSION="$2";;
      --timeout) TIMEOUT="$2";;
      --idle-timeout) IDLE_TIMEOUT="$2";;
      --validate) VALIDATE="$2";;
      --job) JOB_ID="$2";;
      --registry-dir) REGISTRY_DIR="$2";;
      --type) TYPE="$2";;
      --reviewer) REVIEWER="$2";;
      --reviewer-session) REVIEWER_SESSION="$2";;
      --max-iterations) MAX_ITERATIONS="$2";;
    esac
    shift 2
  done
}

# Store the instruction block handed to an agent in the caller's variable
# named by $1; $2 is the task text appended after "Task:".
# Reads PY, SCRIPT_DIR, REGISTRY_DIR and JOB_ID.
_submit_build_instructions() {
  local _out_var="$1" _task="$2"
  local _pub="$PY $SCRIPT_DIR/scripts/publish_event.py --registry-dir $REGISTRY_DIR --job $JOB_ID"
  # NOTE: the agent MUST use --job "$JOB_ID" (the one we just minted). Hard-coding
  # an id from an earlier session is the #1 reason a delegated job sits idle and
  # times out (see SKILL.md "Wrong job_id propagated to the agent"). We make the
  # freshness explicit in the instruction header.
  local _text="Your job_id is \"$JOB_ID\" (the one just registered for THIS delegation — read it from the registry record, do NOT reuse any job_id you saw in earlier runs).

On start run:        $_pub --event started.
On permission/tool prompt run: $_pub --event permission_required --detail '<tool>:<what>'.
On progress (optional): $_pub --event progress --detail '<short status>'.
On success run:      $_pub --event completed --detail '<one-line summary>'.
On failure run:      $_pub --event error     --detail '<one-line reason>'.

The subscriber for this job_id is already running; your completed/error event ends the job. Exit codes: 0 completed, 1 error, 2 publish failure.

Task: $_task"
  # printf -v (rather than command substitution) keeps the text byte-exact,
  # including any trailing newlines in the task.
  printf -v "$_out_var" '%s' "$_text"
}

# Run the optional --validate script with JOB_ID/REGISTRY_DIR in its
# environment and report PASS/FAIL. A failing validation is reported only; it
# does not change the exit status of the submit command.
_submit_run_validation() {
  [[ -n "$VALIDATE" ]] || return 0
  echo "running validation: $VALIDATE"
  local rc=0
  JOB_ID="$JOB_ID" REGISTRY_DIR="$REGISTRY_DIR" bash "$VALIDATE" || rc=$?
  if [[ "$rc" -eq 0 ]]; then
    echo "validation: PASS"
  else
    echo "validation: FAIL (exit $rc)"
  fi
}

# Last stdout line of submit: the persistent audit-log dir for this job (see
# SKILL.md "Audit Logs"). Callers can scrape `tail -n1` to find it.
_submit_print_logs_dir() {
  echo "${DELEGATE_JOB_LOGS_DIR:-$WORKDIR/delegate_job_logs}/$JOB_ID"
}

# Direct job: one subscriber, one agent run, optional validation.
_submit_direct() {
  local logf="$REGISTRY_DIR/$JOB_ID.subscriber.out"
  local sub_pid=""

  if [[ "$DRY_RUN" != "1" ]]; then
    # START THE SUBSCRIBER FIRST (ordering dependency — MQTT does not queue
    # non-retained messages for absent subscribers). A dry-run delegates
    # nothing, so no subscriber is launched for it: one started here would be
    # left running in the background with nobody waiting on it.
    "$PY" "$SCRIPT_DIR/scripts/job_subscriber.py" --registry-dir "$REGISTRY_DIR" \
        --job "$JOB_ID" --timeout "$TIMEOUT" --idle-timeout "$IDLE_TIMEOUT" \
        >"$logf" 2>&1 &
    sub_pid=$!
    echo "subscriber pid: $sub_pid (log: $logf)"
    sleep 1  # give the subscriber time to CONNACK + SUBSCRIBE before the agent runs
  fi

  # Run the agent (run_agent only prints the instructions in dry-run).
  local instructions
  _submit_build_instructions instructions "$PROMPT"
  run_agent "$JOB_ID" "$instructions"

  _submit_run_validation

  if [[ -n "$sub_pid" ]]; then
    wait "$sub_pid" || true
    echo "subscriber output:"; cat "$logf" || true
  fi

  _submit_print_logs_dir
}

# loop/discuss job: alternate worker and reviewer turns until the reviewer
# approves, a turn fails, or --max-iterations is exhausted.
_submit_review_loop() {
  local iteration=1
  local current_prompt="$PROMPT"
  local current_session="$AGENT_SESSION"
  local current_role="worker"
  local logf sub_pid sub_rc job_status feedback instructions
  local reviewer_rejected success

  if [[ "$DRY_RUN" == "1" ]]; then
    echo "[dry-run] orchestrator loop would start for job: $JOB_ID type: $TYPE"
    echo "worker session: $AGENT_SESSION, reviewer session: $REVIEWER_SESSION"
    _submit_print_logs_dir
    return 0
  fi

  while true; do
    echo "=================================================="
    echo "Iteration $iteration - Role: $current_role"
    echo "Session: $current_session"
    echo "=================================================="

    # Point the registry record at the current turn and reset it to pending.
    "$PY" "$SCRIPT_DIR/scripts/registry.py" --registry-dir "$REGISTRY_DIR" update \
        --job "$JOB_ID" \
        --agent-session "$current_session" \
        --prompt "$current_prompt" \
        --iteration "$iteration" \
        --status "pending"

    # Start the subscriber before the agent, one log file per turn.
    logf="$REGISTRY_DIR/${JOB_ID}.iter_${iteration}_${current_role}.subscriber.out"
    "$PY" "$SCRIPT_DIR/scripts/job_subscriber.py" --registry-dir "$REGISTRY_DIR" \
        --job "$JOB_ID" --timeout "$TIMEOUT" --idle-timeout "$IDLE_TIMEOUT" \
        >"$logf" 2>&1 &
    sub_pid=$!
    echo "subscriber pid: $sub_pid (log: $logf)"
    sleep 1

    _submit_build_instructions instructions "$current_prompt"
    run_agent "$JOB_ID" "$instructions" "$current_session"

    sub_rc=0
    wait "$sub_pid" || sub_rc=$?
    echo "subscriber output:"; cat "$logf" || true

    # Map the subscriber exit code onto a job status.
    case "$sub_rc" in
      0) job_status="completed" ;;
      1) job_status="error" ;;
      *) job_status="timeout" ;;
    esac

    echo "Job role $current_role finished with status: $job_status"

    # Retrieve feedback from the last event
    feedback="$("$PY" "$SCRIPT_DIR/scripts/registry.py" --registry-dir "$REGISTRY_DIR" get-feedback --job "$JOB_ID")"
    echo "Feedback/Detail: $feedback"

    if [[ "$current_role" == "worker" ]]; then
      if [[ "$job_status" != "completed" ]]; then
        echo "Worker did not complete successfully (status: $job_status). Terminating workflow."
        break
      fi

      # Worker completed successfully, now hand over to the reviewer.
      current_role="reviewer"
      current_session="$REVIEWER_SESSION"
      if [[ "$TYPE" == "loop" ]]; then
        current_prompt="Review the changes/artifacts generated for job $JOB_ID. Check if they meet the requirements. If correct, publish completed event with 'PASS'. If there are issues, publish error event with detailed feedback/nits. CRITICAL: When raising issues or giving a review, you MUST include the exact reason for the issue and a clear direction for improvement (문제 제시에 대한 이유와 확실한 개선 방향을 반드시 포함해야 합니다)."
      elif [[ "$TYPE" == "discuss" ]]; then
        current_prompt="Read draft/documents generated for job $JOB_ID. Review the feasibility and content. Write your feedback/objections. If you agree with the plan, reply with 'AGREE'."
      fi
    else
      # The loop-mode reviewer prompt asks for an `error` event when it finds
      # issues, so in loop mode an error status is a rejection that carries
      # feedback, not a workflow failure. Timeouts (and errors in discuss
      # mode, whose prompt does not ask for error events) still abort.
      reviewer_rejected=0
      if [[ "$job_status" == "error" && "$TYPE" == "loop" ]]; then
        reviewer_rejected=1
      elif [[ "$job_status" != "completed" ]]; then
        echo "Reviewer did not complete successfully (status: $job_status). Terminating workflow."
        break
      fi

      # Reviewer finished. Approval is a case-insensitive keyword in the feedback.
      success=0
      if [[ "$reviewer_rejected" == "0" ]]; then
        if [[ "$TYPE" == "loop" ]]; then
          if [[ "${feedback,,}" == *"pass"* ]]; then
            success=1
          fi
        elif [[ "$TYPE" == "discuss" ]]; then
          if [[ "${feedback,,}" == *"agree"* ]]; then
            success=1
          fi
        fi
      fi

      if [[ "$success" == "1" ]]; then
        echo "Reviewer approved the work. Finalizing job as completed."
        "$PY" "$SCRIPT_DIR/scripts/registry.py" --registry-dir "$REGISTRY_DIR" status --job "$JOB_ID" --set "completed"
        break
      fi

      # Reviewer rejected/provided feedback: stop at the iteration budget,
      # otherwise send the feedback back to the worker.
      if [[ "$iteration" -ge "$MAX_ITERATIONS" ]]; then
        echo "Max iterations ($MAX_ITERATIONS) reached without approval. Terminating workflow."
        "$PY" "$SCRIPT_DIR/scripts/registry.py" --registry-dir "$REGISTRY_DIR" status --job "$JOB_ID" --set "error"
        break
      fi

      iteration=$((iteration + 1))
      current_role="worker"
      current_session="$AGENT_SESSION"
      current_prompt="The reviewer provided the following feedback for job $JOB_ID: $feedback. Please modify the code/artifacts to address these comments. CRITICAL: As the Developer Team Leader, you must thoroughly review the suggested modifications, verify their validity, adopt/implement them if valid, and if you judge any recommendation to be invalid, do NOT implement it but instead explain your reasons clearly in your response and send it back to the reviewer (수정안을 최대한 꼼꼼히 검토하여 타당성을 검증하고, 타당하다면 수렴하여 수정을 진행하되, 타당하지 않다고 판단되는 부분이 있다면 그 이유를 명확히 밝혀 리뷰어에게 전달하십시오)."
    fi
  done

  _submit_run_validation
  _submit_print_logs_dir
}

#######################################
# submit: register a job, start its subscriber, delegate the prompt to the
# agent session and optionally run a validation script.
# Globals:   all option globals (read, via parse_opts); PY and JOB_ID (written)
# Arguments: submit options, see usage
# Outputs:   progress on stdout; the last stdout line is the audit-log
#            directory of the job
# Returns:   exits 1 when --prompt is missing or --type is not one of
#            direct|loop|discuss
#######################################
cmd_submit() {
  parse_opts "$@"
  [[ -n "$PROMPT" ]] || { echo "submit requires --prompt" >&2; exit 1; }
  # Reject unknown job types up front; anything but "direct" would otherwise
  # enter the review loop without a reviewer prompt for it.
  case "$TYPE" in
    direct|loop|discuss) ;;
    *) echo "submit: invalid --type '$TYPE' (expected direct, loop or discuss)" >&2; exit 1;;
  esac
  PY="$(pick_python)"
  # From here on, relative paths (registry dir, validation script) resolve
  # against the work directory.
  cd "$WORKDIR"
  mkdir -p "$REGISTRY_DIR"

  # Register the job (prints the new job id).
  JOB_ID="$("$PY" "$SCRIPT_DIR/scripts/registry.py" --registry-dir "$REGISTRY_DIR" register \
      --prompt "$PROMPT" --agent "$AGENT" --agent-session "$AGENT_SESSION" \
      --timeout "$TIMEOUT" --idle-timeout "$IDLE_TIMEOUT" \
      --job-type "$TYPE" --reviewer "$REVIEWER" --reviewer-session "$REVIEWER_SESSION" \
      --max-iterations "$MAX_ITERATIONS")"
  echo "registered job: $JOB_ID"

  if [[ "$TYPE" == "direct" ]]; then
    _submit_direct
  else
    _submit_review_loop
  fi
}

#######################################
# Hand an instruction block to an agent that is already running in a tmux
# session.
#
# The skill is INTERACTIVE-ONLY. We never invoke `claude -p` or any other
# one-shot print mode, because:
#   - claude -p exits the moment stdin is drained, so there's nothing to
#     `tmux attach` to afterwards.
#   - fire-and-forget via wrapper defeats the whole point of the audit log
#     (you can't tell what happened if the agent crashes mid-turn).
#   - the job registry already gives us an authoritative completion signal,
#     so we don't need a wrapper-side exit code to know "done".
# The instructions are pasted into the existing session through a named tmux
# buffer and submitted with Enter; the user can `tmux attach -t <session>`
# to follow along or type follow-up prompts.
#
# Globals:   AGENT, AGENT_SESSION, DRY_RUN, SCRIPT_DIR (read);
#            PY, REGISTRY_DIR, TMUX_SERVER_NAME (read, optional)
# Arguments: $1 - job id
#            $2 - instruction text to paste
#            $3 - target session (optional, default $AGENT_SESSION); a
#                 leading "tmux:" is stripped
# Returns:   0 on success, for the "human" agent and in dry-run;
#            1 when tmux or the target session is missing
#######################################
run_agent() {
  local job_id="$1"; local instructions="$2"; local target_session="${3:-$AGENT_SESSION}"
  if [[ "$AGENT" == "human" ]]; then
    echo "[human agent] complete the task, then run publish_event.py --event completed"
    return
  fi
  local sess="${target_session#tmux:}"

  if [[ "$DRY_RUN" == "1" ]]; then
    echo "[dry-run] would delegate task to running agent '$AGENT' in tmux session '$sess' with instructions:"
    echo "----"; echo "$instructions"; echo "----"
    return
  fi

  if ! command -v tmux >/dev/null 2>&1; then
    echo "ERROR: this skill requires tmux (interactive agent sessions)." >&2
    echo "       Install with: brew install tmux   (or your package manager)" >&2
    return 1
  fi

  # Keep the tmux command as an array so a server name containing whitespace
  # stays a single argument.
  local -a tmux_cmd=(tmux)
  if [[ -n "${TMUX_SERVER_NAME:-}" ]]; then
    tmux_cmd+=(-L "$TMUX_SERVER_NAME")
  fi

  if ! "${tmux_cmd[@]}" has-session -t "$sess" 2>/dev/null; then
    echo "ERROR: 에이전트 세션 '$sess'이 존재하지 않습니다. 작업을 위임하기 전에 먼저 에이전트 세션을 기동해 주세요." >&2
    echo "       팁: 'multi-agent-mux-resume' 또는 'multi-agent-mux-create'를 통해 에이전트를 먼저 생성할 수 있습니다." >&2
    return 1
  fi

  # While the instructions are being delivered, publish an error event if the
  # shell exits non-zero (e.g. a tmux command fails under `set -e`), so the
  # waiting subscriber is told instead of running into its timeout.
  if [[ -n "${job_id:-}" && -n "${PY:-}" ]]; then
    local -a pub_cmd=("$PY" "$SCRIPT_DIR/scripts/publish_event.py")
    # Publish against the same registry the job was registered in, like the
    # publish commands given to the agent do.
    if [[ -n "${REGISTRY_DIR:-}" ]]; then
      pub_cmd+=(--registry-dir "$REGISTRY_DIR")
    fi
    pub_cmd+=(--job "$job_id" --event error)
    # Bake shell-quoted values into the trap text so it does not depend on
    # this function's locals still being in scope when the trap fires.
    local pub_quoted
    printf -v pub_quoted '%q ' "${pub_cmd[@]}"
    trap "rc=\$?; if [ \"\$rc\" -ne 0 ]; then ${pub_quoted}--detail \"agent bootstrap failed (exit \$rc)\"; fi" EXIT
  fi

  echo "살아있는 에이전트 세션 '$sess'에 작업을 위임합니다..."
  # Paste through a per-job named buffer, then press Enter to submit.
  "${tmux_cmd[@]}" set-buffer -b "job_buf_$job_id" "$instructions"
  "${tmux_cmd[@]}" paste-buffer -b "job_buf_$job_id" -t "$sess"
  sleep 0.5
  "${tmux_cmd[@]}" send-keys -t "$sess" C-m
  "${tmux_cmd[@]}" delete-buffer -b "job_buf_$job_id"

  echo "작업이 세션 '$sess'에 전송되었습니다. (연결하려면: ${tmux_cmd[*]} attach -t $sess)"
  # Delivery succeeded: the agent reports its own events from here on.
  trap - EXIT
}

# status: print the registry record of the job given with --job.
# Exits 1 when --job is missing.
cmd_status() {
  parse_opts "$@"
  if [[ -z "$JOB_ID" ]]; then
    echo "status requires --job" >&2
    exit 1
  fi
  PY="$(pick_python)"
  local -a registry_cmd=(
    "$PY" "$SCRIPT_DIR/scripts/registry.py"
    --registry-dir "$REGISTRY_DIR"
    get --job "$JOB_ID"
  )
  "${registry_cmd[@]}"
}

# list: print every job known to the registry directory.
cmd_list() {
  parse_opts "$@"
  PY="$(pick_python)"
  local -a registry_cmd=(
    "$PY" "$SCRIPT_DIR/scripts/registry.py"
    --registry-dir "$REGISTRY_DIR"
    list
  )
  "${registry_cmd[@]}"
}

# verify: run the --validate script for the job given with --job, passing
# JOB_ID and REGISTRY_DIR through its environment. Exits with the script's
# status, or 1 when --job / --validate is missing.
cmd_verify() {
  parse_opts "$@"
  if [[ -z "$JOB_ID" ]]; then
    echo "verify requires --job" >&2
    exit 1
  fi
  if [[ -z "$VALIDATE" ]]; then
    echo "verify requires --validate <script>" >&2
    exit 1
  fi

  echo "verifying job $JOB_ID with $VALIDATE"
  local verdict=0
  JOB_ID="$JOB_ID" REGISTRY_DIR="$REGISTRY_DIR" bash "$VALIDATE" || verdict=$?
  if [[ "$verdict" -ne 0 ]]; then
    echo "verify: FAIL (exit $verdict)"
    exit "$verdict"
  fi
  echo "verify: PASS (exit 0)"
  exit 0
}

# logs <job_id> | logs --list — forwards to the `logs` subcommand of
# registry.py. No --registry-dir is passed here; per the usage text the audit
# log lives under delegate_job_logs/, so run this from your project directory.
# Exits 1 when neither a job id nor --list is given.
cmd_logs() {
  PY="$(pick_python)"
  local registry_cli="$SCRIPT_DIR/scripts/registry.py"
  case "${1:-}" in
    --list)
      "$PY" "$registry_cli" logs --list
      ;;
    "")
      echo "logs requires <job_id> or --list" >&2
      exit 1
      ;;
    *)
      "$PY" "$registry_cli" logs "$1"
      ;;
  esac
}

# wait: run the subscriber in the foreground — for one job when --job is
# given, otherwise with --wait-any — bounded by --timeout.
cmd_wait() {
  parse_opts "$@"
  PY="$(pick_python)"

  # Either a single job or the subscriber's --wait-any mode.
  local -a target
  if [[ -z "$JOB_ID" ]]; then
    target=(--wait-any)
  else
    target=(--job "$JOB_ID")
  fi

  "$PY" "$SCRIPT_DIR/scripts/job_subscriber.py" --registry-dir "$REGISTRY_DIR" \
      "${target[@]}" --timeout "$TIMEOUT"
}

# Dispatch the first argument to its subcommand handler, forwarding the
# remaining arguments. No argument or a help flag prints the usage; an unknown
# command prints an error plus the usage and exits 1.
main() {
  local sub=""
  if (( $# > 0 )); then
    sub="$1"
    shift
  fi

  case "$sub" in
    submit | status | list | verify | wait | logs)
      "cmd_$sub" "$@"
      ;;
    "" | -h | --help | help)
      usage
      ;;
    *)
      echo "unknown command: $sub" >&2
      usage
      exit 1
      ;;
  esac
}

# Entry point: forward the script's command-line arguments to main.
main "$@"
