feat(multi-agent-mux): integrate cline agent support, fix sqlite3 naming collision, simplify delegation docs, and add SKILL_FEATURES.md
This commit is contained in:
@@ -487,6 +487,10 @@ def hermes_exists(uuid):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def cline_exists(uuid):
    """Return True when a Cline session file for *uuid* is present on disk.

    The path checked is ``<home>/.cline/data/sessions/<uuid>/<uuid>.json``;
    ``home`` is a name defined elsewhere in this script.
    """
    session_dir = f"{home}/.cline/data/sessions/{uuid}"
    session_file = f"{session_dir}/{uuid}.json"
    return os.path.exists(session_file)
|
||||||
|
|
||||||
|
|
||||||
def emit(u):
|
def emit(u):
|
||||||
print(u)
|
print(u)
|
||||||
raise SystemExit(0)
|
raise SystemExit(0)
|
||||||
@@ -536,6 +540,10 @@ for s in sessions:
|
|||||||
cand = s.get('hermes_conversation_id_own')
|
cand = s.get('hermes_conversation_id_own')
|
||||||
if cand and hermes_exists(cand):
|
if cand and hermes_exists(cand):
|
||||||
emit(cand)
|
emit(cand)
|
||||||
|
if agent == 'cline' and name.endswith('-creator-cline'):
|
||||||
|
cand = s.get('cline_conversation_id_own')
|
||||||
|
if cand and cline_exists(cand):
|
||||||
|
emit(cand)
|
||||||
|
|
||||||
# 2) disk scan scoped to THIS workspace
|
# 2) disk scan scoped to THIS workspace
|
||||||
if agent == 'claude':
|
if agent == 'claude':
|
||||||
@@ -578,6 +586,27 @@ elif agent == 'hermes':
|
|||||||
cand = None
|
cand = None
|
||||||
if cand:
|
if cand:
|
||||||
emit(cand)
|
emit(cand)
|
||||||
|
elif agent == 'cline':
|
||||||
|
sessions_dir = f"{home}/.cline/data/sessions"
|
||||||
|
if os.path.isdir(sessions_dir):
|
||||||
|
candidates = []
|
||||||
|
for session_folder in glob.glob(f"{sessions_dir}/*"):
|
||||||
|
if os.path.isdir(session_folder):
|
||||||
|
folder_name = os.path.basename(session_folder)
|
||||||
|
json_file = f"{session_folder}/{folder_name}.json"
|
||||||
|
if os.path.exists(json_file):
|
||||||
|
candidates.append(json_file)
|
||||||
|
candidates.sort(key=os.path.getmtime, reverse=True)
|
||||||
|
for j in candidates:
|
||||||
|
try:
|
||||||
|
with open(j) as f:
|
||||||
|
sdata = json.load(f)
|
||||||
|
if sdata.get('cwd') == ws or sdata.get('workspace_root') == ws:
|
||||||
|
sid = sdata.get('session_id')
|
||||||
|
if sid:
|
||||||
|
emit(sid)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# 3) agent_identities cache, ONLY when its project_cwd == this workspace
|
# 3) agent_identities cache, ONLY when its project_cwd == this workspace
|
||||||
ai = {}
|
ai = {}
|
||||||
@@ -609,6 +638,10 @@ if ai_agent.get('project_cwd') == ws:
|
|||||||
cand = ai_agent.get('session_id') or ai.get('conversation_id')
|
cand = ai_agent.get('session_id') or ai.get('conversation_id')
|
||||||
if cand and hermes_exists(cand):
|
if cand and hermes_exists(cand):
|
||||||
emit(cand)
|
emit(cand)
|
||||||
|
elif agent == 'cline':
|
||||||
|
cand = ai_agent.get('session_id') or ai.get('conversation_id')
|
||||||
|
if cand and cline_exists(cand):
|
||||||
|
emit(cand)
|
||||||
|
|
||||||
print('')
|
print('')
|
||||||
PYEOF
|
PYEOF
|
||||||
|
|||||||
@@ -23,11 +23,11 @@ source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
|
|||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
cat <<EOF
|
cat <<EOF
|
||||||
Usage: $0 --workspace <path> --agent <claude|agy|hermes> [options]
|
Usage: $0 --workspace <path> --agent <claude|agy|hermes|cline> [options]
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
--workspace PATH project directory (required)
|
--workspace PATH project directory (required)
|
||||||
--agent AGENT claude | agy | hermes (required)
|
--agent AGENT claude | agy | hermes | cline (required)
|
||||||
--session NAME tmux session name (default: derived from workspace)
|
--session NAME tmux session name (default: derived from workspace)
|
||||||
--wrapper force use of ~/.local/bin/<session> wrapper even if not present
|
--wrapper force use of ~/.local/bin/<session> wrapper even if not present
|
||||||
--dry-run print commands without executing
|
--dry-run print commands without executing
|
||||||
@@ -86,6 +86,11 @@ elif [ "$AGENT" = "hermes" ]; then
|
|||||||
echo "ERROR: hermes is not functional. Run 'hermes setup' first." >&2
|
echo "ERROR: hermes is not functional. Run 'hermes setup' first." >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
elif [ "$AGENT" = "cline" ]; then
|
||||||
|
if ! cline history --json >/dev/null 2>&1; then
|
||||||
|
echo "ERROR: cline is not functional or configured." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# 세션 이름 — lib.sh::derive_session_name 이 단일 소스 (P0-A)
|
# 세션 이름 — lib.sh::derive_session_name 이 단일 소스 (P0-A)
|
||||||
@@ -119,7 +124,10 @@ spawn() {
|
|||||||
hermes)
|
hermes)
|
||||||
_tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "hermes"
|
_tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "hermes"
|
||||||
;;
|
;;
|
||||||
*) echo "ERROR: --agent must be claude, agy or hermes, got: $AGENT" >&2; exit 2 ;;
|
cline)
|
||||||
|
_tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "cline -i"
|
||||||
|
;;
|
||||||
|
*) echo "ERROR: --agent must be claude, agy, hermes or cline, got: $AGENT" >&2; exit 2 ;;
|
||||||
esac
|
esac
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -145,6 +153,7 @@ case "$AGENT" in
|
|||||||
claude) CMD_FULL='claude --dangerously-skip-permissions' ;;
|
claude) CMD_FULL='claude --dangerously-skip-permissions' ;;
|
||||||
agy) CMD_FULL='agy --dangerously-skip-permissions' ;;
|
agy) CMD_FULL='agy --dangerously-skip-permissions' ;;
|
||||||
hermes) CMD_FULL='hermes' ;;
|
hermes) CMD_FULL='hermes' ;;
|
||||||
|
cline) CMD_FULL='cline -i' ;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
# 시작 명령
|
# 시작 명령
|
||||||
@@ -161,7 +170,7 @@ case "$AGENT" in
|
|||||||
START_CMD="$local_tmux new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"claude --dangerously-skip-permissions\""
|
START_CMD="$local_tmux new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"claude --dangerously-skip-permissions\""
|
||||||
fi
|
fi
|
||||||
;;
|
;;
|
||||||
agy|hermes)
|
agy|hermes|cline)
|
||||||
START_CMD="$local_tmux new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"$CMD_FULL\""
|
START_CMD="$local_tmux new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"$CMD_FULL\""
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
@@ -174,6 +183,8 @@ if [ -n "$SUBMIT_JOB_PROMPT" ]; then
|
|||||||
delegate_agent="claude-code"
|
delegate_agent="claude-code"
|
||||||
elif [ "$AGENT" = "hermes" ]; then
|
elif [ "$AGENT" = "hermes" ]; then
|
||||||
delegate_agent="hermes-agent"
|
delegate_agent="hermes-agent"
|
||||||
|
elif [ "$AGENT" = "cline" ]; then
|
||||||
|
delegate_agent="cline-agent"
|
||||||
else
|
else
|
||||||
delegate_agent="antigravity-cli"
|
delegate_agent="antigravity-cli"
|
||||||
fi
|
fi
|
||||||
@@ -191,7 +202,7 @@ fi
|
|||||||
# 모든 값은 환경변수로 전달 — heredoc interpolation 없음 (P1-B).
|
# 모든 값은 환경변수로 전달 — heredoc interpolation 없음 (P1-B).
|
||||||
# 자식 pid 는 bash 에서 pgrep 으로 미리 구함 (P2: 도구명 필터).
|
# 자식 pid 는 bash 에서 pgrep 으로 미리 구함 (P2: 도구명 필터).
|
||||||
CHILD_PID=0
|
CHILD_PID=0
|
||||||
if { [ "$AGENT" = "agy" ] || [ "$AGENT" = "hermes" ]; } && [ -n "$PANE_PID" ]; then
|
if { [ "$AGENT" = "agy" ] || [ "$AGENT" = "hermes" ] || [ "$AGENT" = "cline" ]; } && [ -n "$PANE_PID" ]; then
|
||||||
CHILD_PID=$(pgrep -P "$PANE_PID" -x "$AGENT" 2>/dev/null | head -1 || true)
|
CHILD_PID=$(pgrep -P "$PANE_PID" -x "$AGENT" 2>/dev/null | head -1 || true)
|
||||||
CHILD_PID="${CHILD_PID:-0}"
|
CHILD_PID="${CHILD_PID:-0}"
|
||||||
fi
|
fi
|
||||||
@@ -265,6 +276,11 @@ elif agent == 'hermes':
|
|||||||
entry['child_pid'] = int(cp) if cp.isdigit() else 0
|
entry['child_pid'] = int(cp) if cp.isdigit() else 0
|
||||||
entry['hermes_conversation_id_own'] = None
|
entry['hermes_conversation_id_own'] = None
|
||||||
entry['last_visible_status'] = "TUI started; awaiting first user message"
|
entry['last_visible_status'] = "TUI started; awaiting first user message"
|
||||||
|
elif agent == 'cline':
|
||||||
|
cp = os.environ.get('CHILD_PID', '0')
|
||||||
|
entry['child_pid'] = int(cp) if cp.isdigit() else 0
|
||||||
|
entry['cline_conversation_id_own'] = None
|
||||||
|
entry['last_visible_status'] = "TUI started; awaiting first user message"
|
||||||
|
|
||||||
sessions.append(entry)
|
sessions.append(entry)
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
# multi-agent-mux-delegate-job 스킬
|
# multi-agent-mux-delegate-job 스킬
|
||||||
|
|
||||||
작업(Job)을 자율 에이전트(claude-code/codex/opencode/human)에게 위임하고 MQTT
|
작업(Job)을 자율 에이전트(claude-code/hermes/agy/cline/codex/opencode/human)에게 위임하고 MQTT
|
||||||
이벤트 채널로 비동기 관찰하는 Hermes 스킬. **시작점은 [`SKILL.md`](./SKILL.md).**
|
이벤트 채널로 비동기 관찰하는 범용 에이전트 협업 스킬. **시작점은 [`SKILL.md`](./SKILL.md).**
|
||||||
|
|
||||||
- 프로토콜/스키마: [`job-protocol.md`](./job-protocol.md)
|
- 프로토콜/스키마: [`job-protocol.md`](./job-protocol.md)
|
||||||
- 브로커 PoC→운영 전환: [`mqtt-broker-setup.md`](./mqtt-broker-setup.md)
|
- 브로커 PoC→운영 전환: [`mqtt-broker-setup.md`](./mqtt-broker-setup.md)
|
||||||
|
|||||||
@@ -1,385 +1,94 @@
|
|||||||
---
|
---
|
||||||
name: multi-agent-mux-delegate-job
|
name: multi-agent-mux-delegate-job
|
||||||
description: "Delegate a unit of work to any autonomous agent (claude-code, codex, opencode, or a human) and observe it asynchronously over an MQTT event channel. Each job gets a unique id, a registry record (prompt, broker, status, timeouts), and a single per-job topic that carries started/permission_required/progress/completed/error events as schema-versioned JSON. The delegator starts a subscriber first, runs the agent, and treats a completed/error event or a timeout as the job's terminal state. Ships a working reference implementation (publish_event.py, job_subscriber.py, registry.py, mqtt_common.py, multi-agent-mux-delegate-job wrapper) plus a PoC-to-production path: validate on a public broker, then move to an authenticated TLS broker by changing config only — no code change. Use when you need fire-and-observe delegation, multi-job fan-out across tmux sessions, or a uniform completion-signal protocol shared by several agent types."
|
description: "Delegate a unit of work to any autonomous agent (claude-code, hermes, agy, cline, codex, or a human) and observe it asynchronously over an MQTT event channel. Supported roles include orchestrator, worker, and reviewer."
|
||||||
version: 1.0.0
|
version: 1.1.0
|
||||||
author: Hermes Agent
|
author: Multi-Agent System
|
||||||
license: MIT
|
license: MIT
|
||||||
platforms: [linux, macos, windows]
|
platforms: [linux, macos, windows]
|
||||||
metadata:
|
|
||||||
hermes:
|
|
||||||
tags: [agent-delegation, mqtt, jobs, orchestration, async-completion]
|
|
||||||
related_skills: [claude-code, codex, opencode, hermes-agent-skill-authoring]
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# multi-agent-mux-delegate-job — Async Job Delegation over MQTT
|
# multi-agent-mux-delegate-job — Async Job Delegation over MQTT
|
||||||
|
|
||||||
Delegate a unit of work to an autonomous agent, then **observe** it instead of
|
Delegate a unit of work to any autonomous agent, then **observe** it asynchronously instead of blocking. Every job gets a unique ID and a registry record. The worker agent publishes lifecycle events (`started`, `permission_required`, `progress`, `completed`, `error`) to a per-job MQTT topic, and the delegator/orchestrator subscribes to verify the final state.
|
||||||
blocking on it. Every job gets a unique id and a registry record; the agent
|
|
||||||
publishes lifecycle events (`started`, `permission_required`, `progress`,
|
|
||||||
`completed`, `error`) to a per-job MQTT topic; the delegator subscribes and
|
|
||||||
treats `completed`/`error` — or a timeout — as the terminal state.
|
|
||||||
|
|
||||||
This skill is a **reference implementation**: copy the files in this directory
|
This skill allows any agent (`claude-code`, `hermes`, `agy`, `cline`, etc.) to play any role: **Orchestrator/Delegator**, **Worker/Implementer**, or **Reviewer**.
|
||||||
into your project and customise. The `communication_over_mqtt` project is the
|
|
||||||
canonical concrete instance.
|
|
||||||
|
|
||||||
## Overview
|
---
|
||||||
|
|
||||||
The model is deliberately small. A **job** is one delegated task. An **agent**
|
## Roles in Multi-Agent Mux
|
||||||
is a worker (a claude-code tmux session, a codex run, a human). The **registry**
|
|
||||||
(`.mam/jobs/<id>.json`) holds everything about a job so nothing important
|
|
||||||
lives in environment variables — which means one tmux session can process many
|
|
||||||
jobs sequentially, and many sessions can fan out in parallel, with no env
|
|
||||||
collisions. The **event channel** is one MQTT topic per job carrying JSON
|
|
||||||
payloads; `event` discriminates the type.
|
|
||||||
|
|
||||||
Responsibility is split into exactly one entry point each:
|
- **Orchestrator (Delegator)**: Initiates the job, coordinates other agents, handles loops and reviews, and commits final changes.
|
||||||
[`publish_event.py`](./scripts/publish_event.py) emits events (registry lookup,
|
- **Worker (Implementer)**: Receives the brief file or task prompt, performs the implementation, and emits started/completed/error events.
|
||||||
monotonic `seq`, retry+backoff) and [`job_subscriber.py`](./scripts/job_subscriber.py)
|
- **Reviewer**: Evaluates git diffs or artifacts produced by the worker, and responds with a `completed` event containing `"PASS"` or feedback.
|
||||||
observes them (timeouts, terminal state machine, defensive parsing). Shared
|
|
||||||
logic lives in [`mqtt_common.py`](./scripts/mqtt_common.py); registry I/O in
|
|
||||||
[`registry.py`](./scripts/registry.py). The demo `publisher.py`/`subscriber.py`
|
|
||||||
in the host project stay frozen.
|
|
||||||
|
|
||||||
Two stages, same code. **PoC** runs on the public `broker.hivemq.com` to wire up
|
---
|
||||||
the protocol. **Production** moves to your own authenticated TLS broker — the
|
|
||||||
switch is **config only** (env vars + the registry `broker.*` block), never a
|
|
||||||
code change. See [`mqtt-broker-setup.md`](./mqtt-broker-setup.md).
|
|
||||||
|
|
||||||
## When to Use / When NOT to Use
|
## Core Commands (CLI)
|
||||||
|
|
||||||
**Use when:**
|
The `multi-agent-mux-delegate-job` bash wrapper handles job registration, subscriber management, agent session targeting, and validation hooks:
|
||||||
- you want **fire-and-observe** delegation — kick off work and get a completion
|
|
||||||
signal rather than blocking a terminal;
|
|
||||||
- several agent types (claude-code, codex, opencode, human) must follow **one**
|
|
||||||
completion protocol;
|
|
||||||
- you need **multi-job fan-out** across tmux sessions with safe job claiming;
|
|
||||||
- you want a clean PoC → authenticated-broker upgrade path.
|
|
||||||
|
|
||||||
**Do NOT use when:**
|
|
||||||
- a one-shot `claude -p '…'` that returns inline is enough (no async signal
|
|
||||||
needed) — just use the [claude-code](../claude-code/SKILL.md) skill directly;
|
|
||||||
- you need request/response RPC or large artifact transfer (this is a
|
|
||||||
one-direction event stream, not a data bus);
|
|
||||||
- the payload would carry secrets and you're still on the public broker — move
|
|
||||||
to the own-broker stage first.
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
|
|
||||||
The one-line wrapper handles register + subscriber-first + agent launch. If
|
|
||||||
you're new, **start here** and only fall back to the manual 5-step flow when
|
|
||||||
you need finer control.
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 1) one line: register → start subscriber → launch agent in tmux
|
# 1) Submit a new job to a targeted agent session (e.g. tmux session name 'demo')
|
||||||
# (uses public broker by default; last stdout line is the audit-log dir)
|
|
||||||
multi-agent-mux-delegate-job submit \
|
multi-agent-mux-delegate-job submit \
|
||||||
--agent claude-code \
|
--agent <claude-code|hermes-agent|antigravity-cli|cline-agent|human> \
|
||||||
--prompt "정렬 문제 10개를 만들어 sort_problems.md로 저장" \
|
--agent-session tmux:<session_name> \
|
||||||
--workdir /path/to/project \
|
--prompt "Task description or instructions here" \
|
||||||
--agent-session tmux:demo \
|
|
||||||
--timeout 3600 --idle-timeout 120
|
--timeout 3600 --idle-timeout 120
|
||||||
# → stdout: registered job: <JID>
|
|
||||||
# subscriber pid: …
|
|
||||||
# agent launched in tmux session: demo
|
|
||||||
# subscriber output: <one line per event>
|
|
||||||
# /path/to/project/.mam/delegate_job_logs/<JID> ← audit log dir
|
|
||||||
|
|
||||||
# 2) at any time, query the job or its audit log
|
# 2) Submit a job with a feedback loop (Worker-Reviewer Loop)
|
||||||
multi-agent-mux-delegate-job status --job <JID>
|
multi-agent-mux-delegate-job submit \
|
||||||
multi-agent-mux-delegate-job logs <JID> # pretty timeline
|
--agent <worker_agent> --agent-session tmux:<worker_session> \
|
||||||
multi-agent-mux-delegate-job logs --list # every job, live status
|
--type loop --reviewer <reviewer_agent> --reviewer-session tmux:<reviewer_session> \
|
||||||
|
--prompt "Task description"
|
||||||
|
|
||||||
# 3) run a user-supplied validator against the job's artifacts
|
# 3) Check job status and audit logs
|
||||||
multi-agent-mux-delegate-job verify --job <JID> --validate ./validate.sh
|
multi-agent-mux-delegate-job status --job <JOB_ID>
|
||||||
|
multi-agent-mux-delegate-job logs <JOB_ID> # Chronological log of events
|
||||||
|
multi-agent-mux-delegate-job list # Summary of all registered jobs
|
||||||
|
|
||||||
|
# 4) Verify job artifacts with a validation script
|
||||||
|
multi-agent-mux-delegate-job verify --job <JOB_ID> --validate ./validate.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
The wrapper enforces the **subscribe-before-publish** ordering and **forwards
|
---
|
||||||
the freshly-minted `JOB_ID` into the agent's prompt** (so the agent calls
|
|
||||||
`publish_event.py --job <JID>` with the right id — see Pitfall §"Wrong job_id
|
|
||||||
propagated to the agent"). When you need finer control, the manual flow is:
|
|
||||||
|
|
||||||
```bash
|
## Task Delegation Types
|
||||||
# Manual 5-step (same outcome, more knobs)
|
|
||||||
PY=.venv/bin/python
|
|
||||||
SKILL=./.agents/skills/multi-agent-mux-delegate-job/scripts
|
|
||||||
|
|
||||||
# 1) register
|
Supported job types include:
|
||||||
JID=$($PY "$SKILL/registry.py" register \
|
- `direct` (default): Single agent execution (direct tasking).
|
||||||
--prompt "…" --agent claude-code --agent-session tmux:demo \
|
- `loop` (Worker-Reviewer Loop): Alternates worker execution and reviewer evaluation until the reviewer approves (`PASS`) or the iteration limit is reached.
|
||||||
--timeout 3600 --idle-timeout 120)
|
- `discuss` (Research & Discussion): Collaboration between two agents to reach a consensus (e.g., agreeing on a design or plan).
|
||||||
|
|
||||||
# 2) START THE SUBSCRIBER FIRST (MQTT does not queue non-retained msgs)
|
For detailed state machine diagrams and configurations, see [DELEGATION_TYPES.md](./DELEGATION_TYPES.md).
|
||||||
$PY "$SKILL/job_subscriber.py" --job "$JID" --timeout 3600 --idle-timeout 120 &
|
|
||||||
|
|
||||||
# 3) pass JID to the agent and instruct it to publish events with --job "$JID"
|
---
|
||||||
# (don't hard-code a job id you saw earlier — see Pitfall §"Wrong job_id")
|
|
||||||
|
|
||||||
# 4) on completion the subscriber prints events and exits 0/1/2
|
## The Event Protocol Contract
|
||||||
|
|
||||||
# 5) inspect any time
|
Every agent participating in the delegation contract must follow the same lifecycle publishing protocol using `publish_event.py`:
|
||||||
$PY "$SKILL/registry.py" get --job "$JID"
|
|
||||||
$PY "$SKILL/registry.py" logs "$JID" # positional job id
|
|
||||||
$PY "$SKILL/registry.py" logs --list
|
|
||||||
```
|
|
||||||
|
|
||||||
## Job Protocol
|
1. **On Start**: Publish `started` event.
|
||||||
|
`python3 .agents/skills/multi-agent-mux-delegate-job/scripts/publish_event.py --job "$JOB_ID" --event started`
|
||||||
|
2. **On Tool/Permission Prompt**: Publish `permission_required` event.
|
||||||
|
`python3 ... --job "$JOB_ID" --event permission_required --detail "<tool>:<reason>"`
|
||||||
|
3. **On Progress Update (Optional)**: Publish `progress` event.
|
||||||
|
`python3 ... --job "$JOB_ID" --event progress --detail "<status_update>"`
|
||||||
|
4. **On Success**: Publish `completed` event.
|
||||||
|
`python3 ... --job "$JOB_ID" --event completed --detail "<summary>"` (Reviewer should include `"PASS"` in the detail to approve).
|
||||||
|
5. **On Failure/Feedback**: Publish `error` event.
|
||||||
|
`python3 ... --job "$JOB_ID" --event error --detail "<reason_or_feedback>"`
|
||||||
|
|
||||||
One topic per job: `python/mqtt/jobs/<job_id>/events`. Payload (JSON, UTF-8,
|
---
|
||||||
`schema_version=1`):
|
|
||||||
|
|
||||||
```json
|
|
||||||
{ "schema_version": 1, "seq": 7, "job_id": "abc12345",
|
|
||||||
"event": "started|permission_required|progress|completed|error",
|
|
||||||
"timestamp": "2026-06-19T09:32:00Z", "detail": "generalised text",
|
|
||||||
"data": { "optional": "metadata" } }
|
|
||||||
```
|
|
||||||
|
|
||||||
- `seq` is monotonic per job (first = 1); the subscriber uses it to spot
|
|
||||||
reorder/duplication.
|
|
||||||
- `timestamp` is advisory — timeouts are measured from **receive** time.
|
|
||||||
- `detail`/`data` carry **no** secrets or absolute paths.
|
|
||||||
- A `schema_version` or `job_id` mismatch is **dropped** (defensive parsing).
|
|
||||||
|
|
||||||
`started` and `completed`/`error` are the mandatory bookends; `completed`→exit 0,
|
|
||||||
`error`→exit 1. Full catalogue + production `auth_token` handling:
|
|
||||||
[`job-protocol.md`](./job-protocol.md).
|
|
||||||
|
|
||||||
## Registry Format
|
|
||||||
|
|
||||||
```
|
|
||||||
.mam/jobs/<id>.json # metadata record (single source of truth)
|
|
||||||
.mam/jobs/<id>.events.log # append-only JSON-lines log (debug, optional)
|
|
||||||
.mam/jobs/.lock # fcntl advisory lock for the registry
|
|
||||||
```
|
|
||||||
|
|
||||||
The record holds `status`, `prompt`, `agent`, `agent_session`, a `broker` block,
|
|
||||||
`topic_prefix`, `timeout_sec`/`idle_timeout_sec`, `expected_artifacts`,
|
|
||||||
`last_seq`, and (production) `auth_token`. Because the `broker` block lives in
|
|
||||||
the record, `publish_event.py` connects from the registry alone. Concurrency,
|
|
||||||
the atomic rename trick, and multi-session job claiming are in
|
|
||||||
[`registry.md`](./registry.md).
|
|
||||||
|
|
||||||
## Audit Logs
|
## Audit Logs
|
||||||
|
|
||||||
Every job's lifecycle is mirrored to a **persistent, append-only audit log**
|
Each job's lifecycle events are mirrored to a persistent, append-only audit log under `.mam/delegate_job_logs/<job_id>/` (containing `meta.json`, `events.ndjson`, and `status.json`). Use `multi-agent-mux-delegate-job logs <job_id>` to view the timeline.
|
||||||
under `.mam/delegate_job_logs/` (override with `DELEGATE_JOB_LOGS_DIR`;
|
|
||||||
default `<cwd>/.mam/delegate_job_logs`). Unlike the registry — live state
|
|
||||||
mutated in place and liable to be cleaned up — the audit log is durable
|
|
||||||
history you can replay after the fact. It is git-ignored.
|
|
||||||
|
|
||||||
```
|
---
|
||||||
.mam/delegate_job_logs/<job_id>/
|
|
||||||
meta.json # registration snapshot: prompt, agent, broker, timeouts, …
|
|
||||||
events.ndjson # append-only, one JSON event per line, in time order
|
|
||||||
status.json # current status only (fast point-query)
|
|
||||||
```
|
|
||||||
|
|
||||||
**What is logged, automatically:**
|
## Best Practices and Pitfalls
|
||||||
|
|
||||||
| When | `events.ndjson` line | Written by |
|
- **Subscribe-Before-Publish**: The subscriber must be running before the agent starts publishing. The `submit` command handles this automatically by launching the subscriber in the background first.
|
||||||
|------|----------------------|------------|
|
- **Fresh job_id Propagation**: Make sure the worker agent receives the correct `JOB_ID` generated for the current run, rather than reusing stale IDs from previous sessions.
|
||||||
| job registered | `registered` (also seeds meta.json + status.json) | `registry.register_job` |
|
- **Brief delivery via file path**: For long or complex prompts, write the instructions to a file (e.g. `/tmp/task-brief.md`) and pass a short prompt pointing to the file path to prevent terminal buffer overflows.
|
||||||
| any status change | `status_changed` (`from`/`to`; also rewrites status.json) | `update_job_status`, `pick_pending` |
|
- **Batch Grouping**: Group non-overlapping tasks into batches to parallelize execution across multiple agent sessions, reducing overhead.
|
||||||
| event published | `published` (carries the exact payload — reproducible) | `publish_event.py` |
|
|
||||||
| event received | `received` (subscriber's external view) | `job_subscriber.py` |
|
|
||||||
|
|
||||||
Both the emitter side (`published`) and the observer side (`received`) are
|
|
||||||
recorded, so a dropped publish or a missed receive is still visible from the
|
|
||||||
other. Every write is **best-effort and isolated** — an fcntl-locked append
|
|
||||||
guarded by `try/except` that only ever emits a `logger.warning`, so a logging
|
|
||||||
failure can never break a publish, a subscribe, or a registry write. stdout is
|
|
||||||
never touched.
|
|
||||||
|
|
||||||
**Reading them:**
|
|
||||||
|
|
||||||
```bash
|
|
||||||
multi-agent-mux-delegate-job logs <job_id> # pretty-print one job's timeline
|
|
||||||
multi-agent-mux-delegate-job logs --list # summarise every logged job (with live status)
|
|
||||||
# or directly via the registry CLI:
|
|
||||||
$PY scripts/registry.py logs <job_id> [--tail N] [--json]
|
|
||||||
$PY scripts/registry.py logs --list [--json]
|
|
||||||
```
|
|
||||||
|
|
||||||
`submit` prints the job's audit-log directory as its last stdout line, so a
|
|
||||||
caller can `tail -n1` to locate it.
|
|
||||||
|
|
||||||
## Broker Setup
|
|
||||||
|
|
||||||
| Stage | Broker | Auth | Transport |
|
|
||||||
|-------|--------|------|-----------|
|
|
||||||
| PoC | `broker.hivemq.com` | none | 1883 plaintext |
|
|
||||||
| Production | self-hosted Mosquitto/EMQX | user/pass + ACL | 8883 TLS |
|
|
||||||
|
|
||||||
All connection settings come from env (`MQTT_BROKER`, `MQTT_PORT`, `MQTT_TLS`,
|
|
||||||
`MQTT_USERNAME`/`MQTT_PASSWORD`, `MQTT_CA_CERTS`, …) resolved by
|
|
||||||
`broker_config_from_env()`, with the registry `broker.*` block overriding per
|
|
||||||
job. Moving to your own broker is **config only**: install Mosquitto, set
|
|
||||||
`persistence true` + `acl_file` + `password_file` + a TLS `listener 8883`, grant
|
|
||||||
the worker `write python/mqtt/jobs/+/events` and Hermes `read`, then flip
|
|
||||||
`MQTT_TLS=1` and fill the registry `broker.*`. Step-by-step (conf, ACL,
|
|
||||||
`mosquitto_passwd`, self-signed/private-CA certs, cut-over verification):
|
|
||||||
[`mqtt-broker-setup.md`](./mqtt-broker-setup.md).
|
|
||||||
|
|
||||||
## Agent Adapters
|
|
||||||
|
|
||||||
Each agent voluntarily follows the contract: receive a `JOB_ID` (or registry
|
|
||||||
path), call `publish_event.py` at lifecycle points, exit 0/1/2. **The contract
|
|
||||||
in one line**: every event call uses `--job "$JOB_ID"` where `$JOB_ID` is the
|
|
||||||
**freshly-issued id from the registry record for *this* delegation** — never a
|
|
||||||
job_id you saw in an earlier session (Pitfall §"Wrong job_id propagated to the
|
|
||||||
agent").
|
|
||||||
|
|
||||||
- **claude-code** — Claude Code calls `publish_event.py` via its Bash tool at
|
|
||||||
lifecycle points. `submit --mode tmux` injects a prompt that already names
|
|
||||||
`$JOB_ID`; if you drive claude manually, hand it the id explicitly. Reference
|
|
||||||
instruction block (the wrapper injects something equivalent):
|
|
||||||
|
|
||||||
```text
|
|
||||||
Your job_id is "$JOB_ID" (read it from the registry record for this delegation —
|
|
||||||
do not reuse any job_id you saw before).
|
|
||||||
|
|
||||||
On start: $PY multi-agent-mux-delegate-job/scripts/publish_event.py --job "$JOB_ID" --event started
|
|
||||||
On permission: $PY … --job "$JOB_ID" --event permission_required --detail "<tool>:<what>"
|
|
||||||
On progress: $PY … --job "$JOB_ID" --event progress --detail "<short status>"
|
|
||||||
On success: $PY … --job "$JOB_ID" --event completed --detail "<one-line summary>"
|
|
||||||
On failure: $PY … --job "$JOB_ID" --event error --detail "<one-line reason>"
|
|
||||||
|
|
||||||
Task: <the user's prompt>
|
|
||||||
|
|
||||||
The subscriber for "$JOB_ID" is already running; your completed/error event
|
|
||||||
ends the job. Exit codes: 0 completed, 1 error, 2 publish failure.
|
|
||||||
```
|
|
||||||
|
|
||||||
See [claude-code](../claude-code/SKILL.md) for tmux orchestration patterns.
|
|
||||||
- **codex** — same contract. Invoke `codex exec "<instruction-block-above>"` or
|
|
||||||
wire `publish_event.py` as an MCP tool so the agent can call it directly.
|
|
||||||
- **opencode** — wire `publish_event.py` as a tool/command the agent can call;
|
|
||||||
identical event points.
|
|
||||||
- **human** — a person does the work, reads the registry record, then runs
|
|
||||||
`publish_event.py --job <id> --event completed` (or `error`) by hand.
|
|
||||||
|
|
||||||
## User Interface
|
|
||||||
|
|
||||||
The [`multi-agent-mux-delegate-job`](./multi-agent-mux-delegate-job) bash wrapper bundles register +
|
|
||||||
subscribe-first + run-agent + validate:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
multi-agent-mux-delegate-job submit --agent claude-code \
|
|
||||||
--prompt "정렬 문제 10개를 만들어 sort_problems.md로 저장" \
|
|
||||||
--workdir /path/to/project --timeout 3600 [--validate ./validate.sh]
|
|
||||||
multi-agent-mux-delegate-job status --job <id> # one record, pretty-printed
|
|
||||||
multi-agent-mux-delegate-job list # all jobs, one line each
|
|
||||||
multi-agent-mux-delegate-job verify --job <id> --validate ./validate.sh # runs it, reports exit code
|
|
||||||
multi-agent-mux-delegate-job wait [--job <id>] # block until terminal (else --wait-any)
|
|
||||||
```
|
|
||||||
|
|
||||||
`submit` **always starts the subscriber before the agent** (the ordering
|
|
||||||
dependency), runs the agent in `--mode print` (one-shot) or `--mode tmux`, and
|
|
||||||
calls `--validate` afterward if given. The skill automates job-id generation,
|
|
||||||
registry creation, broker resolution, subscriber-first ordering, agent launch,
|
|
||||||
and completion detection; it does **not** automate the agent's internals or your
|
|
||||||
business-logic validation — those are hooks you fill (`validate.sh` reads
|
|
||||||
`$JOB_ID`/`$REGISTRY_DIR`).
|
|
||||||
|
|
||||||
## Common Pitfalls
|
|
||||||
|
|
||||||
- **Publishing before subscribing** — MQTT does not queue non-retained messages
|
|
||||||
for absent subscribers. Start `job_subscriber.py` *before* the agent, or rely
|
|
||||||
on retained terminal events (production). `submit` enforces this.
|
|
||||||
- **Wrong job_id propagated to the agent** — the wrapper prints a fresh `JOB_ID`
|
|
||||||
on every `submit`. If your agent instruction (or the wrapper's prompt template)
|
|
||||||
hard-codes an old job_id, the agent calls `publish_event.py --job <wrong>`,
|
|
||||||
the subscriber's defensive parser drops it as a `job_id` mismatch, and the
|
|
||||||
delegator waits until idle timeout (exit 2). Fix: instruct the agent to
|
|
||||||
**read the job_id from the registry record for *this* delegation** (or pass it
|
|
||||||
in via env / `--prompt` interpolation), never from prior runs. `submit`'s
|
|
||||||
default prompt template interpolates `$JOB_ID` for you — if you build a custom
|
|
||||||
prompt, do the same.
|
|
||||||
- **tmux session name collision** — `submit --mode tmux` derives the session
|
|
||||||
name from `--agent-session tmux:<name>` (default `tmux:claude`). If a session
|
|
||||||
with that name is already attached (e.g. you ran the demo and the previous
|
|
||||||
session is still open), `tmux new-session -d -s <name>` fails and the agent
|
|
||||||
never launches. Pick a unique `--agent-session` per concurrent delegation
|
|
||||||
(e.g. `tmux:demo`, `tmux:claude-a`, `tmux:claude-b`) or kill the stale one
|
|
||||||
(`tmux kill-session -t claude`) before re-running.
|
|
||||||
- **Timeout before `started`** — a cold-starting agent may not emit `started`
|
|
||||||
for a while; the wall-clock timeout starts at subscribe time so a stuck agent
|
|
||||||
still terminates. Don't set `--timeout` so low you false-positive a slow start.
|
|
||||||
- **No retry on publish** — a dropped `completed` would hang the delegator
|
|
||||||
forever; `publish_event.py` retries with exponential backoff and exits 2 if it
|
|
||||||
still fails, so the delegator is never left waiting silently.
|
|
||||||
- **QoS-1 duplicates / reorders** — a terminal event can arrive twice, or
|
|
||||||
`error` can trail `completed`; the subscriber's terminal state machine
|
|
||||||
finalises each job once and ignores the rest.
|
|
||||||
- **Trusting the public broker** — anyone can publish there; never make a real
|
|
||||||
decision on a PoC signal. Add `auth_token` + an authenticated broker first.
|
|
||||||
- **Secrets in `detail`/`data`** — keep payloads generalised; no paths, keys, or
|
|
||||||
tokens (except the production `auth_token` in `data`).
|
|
||||||
|
|
||||||
## Subagent Orchestration Pattern
|
|
||||||
|
|
||||||
When using this skill from a Hermes `delegate_task` subagent to dispatch work to
|
|
||||||
a coding-agent CLI (agy/claude) running in a tmux session, the following pattern
|
|
||||||
has been verified (2026-06-21, 6-batch refactoring sprint):
|
|
||||||
|
|
||||||
### Roles
|
|
||||||
- **Main worker** (implementation): one agent session (e.g. `agy-new`) receives
|
|
||||||
brief files and executes code changes.
|
|
||||||
- **Reviewers** (spec compliance + code quality): two other agent sessions
|
|
||||||
(e.g. `agy-existing`, `claude-existing`) review the diff in parallel.
|
|
||||||
- **Hermes** (orchestrator): dispatches subagents, verifies diffs, commits,
|
|
||||||
and falls back to direct fixes when reviewers find issues.
|
|
||||||
|
|
||||||
### Key lessons learned
|
|
||||||
1. **Brief delivery via file path** — don't paste long briefs inline via
|
|
||||||
`tmux send-keys`; the TUI may swallow them. Instead, send a short instruction
|
|
||||||
like "follow /tmp/batch1-brief.md" and let the agent read the file.
|
|
||||||
2. **Polling vs MQTT subscriber** — for short tasks (<5min), pane polling
|
|
||||||
(`capture-pane` + grep for completion markers) is simpler and more reliable
|
|
||||||
than registering a job via `registry.py` + `job_subscriber.py`. Use MQTT
|
|
||||||
subscriber only for long-running jobs (>5min) where push notification matters.
|
|
||||||
3. **Reviewers catch different bugs** — in practice, agy (Flash) caught
|
|
||||||
semantic issues (slash matching, export scope), while claude (Opus) caught
|
|
||||||
API signature mismatches (paho v2 5-arg vs 4-arg `on_disconnect`). Two
|
|
||||||
reviewers with different models provide complementary coverage.
|
|
||||||
4. **Hermes fallback fix** — when reviewers find a small, well-defined issue
|
|
||||||
(wrong argument count, missing slash), Hermes should fix it directly rather
|
|
||||||
than re-dispatching the implementer. This saves a full round-trip.
|
|
||||||
5. **Batch grouping** — group 2-3 FW items per batch when they touch different
|
|
||||||
files (no file overlap). This amortises the dispatch overhead. Items touching
|
|
||||||
the same file must be in separate batches to avoid conflicts.
|
|
||||||
6. **Pane Snapshots & Truncation Prevention** — to prevent long agent responses from being scrolled out and truncated due to TUI viewport limitations, enforce the following snapshotting pattern:
|
|
||||||
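- Just before dispatching a brief (or, at the latest, immediately afterwards — before the agent has produced any output), capture the pre-brief pane buffer via `capture-pane -S -200` so there is a baseline to compare later snapshots against.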
|
|
||||||
- During long execution, run a background loop taking incremental snapshots (e.g. every 30 seconds `>> /tmp/pane-snap.txt`).
|
|
||||||
- Immediately after job termination, capture the entire final pane state to ensure no terminal logs are lost.
|
|
||||||
|
|
||||||
## Verification Checklist
|
|
||||||
|
|
||||||
- [ ] `started` → `completed` over the public broker: subscriber prints the
|
|
||||||
lines and exits **0**.
|
|
||||||
- [ ] `error` path: subscriber exits **1**.
|
|
||||||
- [ ] timeout path: no terminal event within `--timeout`/`--idle-timeout` →
|
|
||||||
exit **2**.
|
|
||||||
- [ ] polluted payload (bad JSON, wrong `schema_version`, wrong `job_id`) is
|
|
||||||
dropped with a warning, not crashed on.
|
|
||||||
- [ ] one tmux session processes two registry jobs in sequence; a second
|
|
||||||
session with a different `agent_session` claims only its own.
|
|
||||||
- [ ] broker cut-over: same scripts reach an authenticated TLS broker with env
|
|
||||||
changes only; a credential without write ACL is rejected; a late
|
|
||||||
subscriber still receives the retained terminal event.
|
|
||||||
- [ ] `publisher.py`/`subscriber.py`/`README.md` demo on `python/mqtt/sample`
|
|
||||||
still works unchanged (regression).
|
|
||||||
- [ ] **audit log integrity** — for a completed job,
|
|
||||||
`.mam/delegate_job_logs/<JID>/events.ndjson` contains `registered` →
|
|
||||||
`received started` → `published completed` (in that order), and
|
|
||||||
`status.json.status == "completed"` matches the registry record. A
|
|
||||||
logging failure (e.g. read-only log dir) does not break the publish or
|
|
||||||
subscribe path — only a `logger.warning` is emitted.
|
|
||||||
- [ ] **end-to-end demo smoke** — run
|
|
||||||
`multi-agent-mux-delegate-job submit --agent claude-code --agent-session tmux:demo-smoke
|
|
||||||
--prompt "echo hello and call publish_event.py --job <JID>
|
|
||||||
--event completed" --timeout 120` and confirm
|
|
||||||
(a) registered job id echoed, (b) subscriber pid echoed, (c) tmux session
|
|
||||||
name printed, (d) `events.ndjson` grows as the agent runs, (e) final
|
|
||||||
stdout line is the audit-log dir.
|
|
||||||
|
|||||||
@@ -221,7 +221,6 @@ Task: $current_prompt"
|
|||||||
# Trigger agent
|
# Trigger agent
|
||||||
run_agent "$JOB_ID" "$instructions" "$current_session"
|
run_agent "$JOB_ID" "$instructions" "$current_session"
|
||||||
|
|
||||||
# Wait for subscriber
|
|
||||||
# Wait for subscriber
|
# Wait for subscriber
|
||||||
local sub_rc=0
|
local sub_rc=0
|
||||||
wait "$sub_pid" || sub_rc=$?
|
wait "$sub_pid" || sub_rc=$?
|
||||||
|
|||||||
@@ -282,7 +282,7 @@ mkdir -p "$STATE_DIR"
|
|||||||
# atomic_dump_yaml(flock + temp+rename) 로 같은 소스를 돌린다. atomic 래퍼에서는
|
# atomic_dump_yaml(flock + temp+rename) 로 같은 소스를 돌린다. atomic 래퍼에서는
|
||||||
# 'actions' 가 없으면 SystemExit(0) 으로 쓰기를 건너뛴다 (불필요한 재포맷 방지).
|
# 'actions' 가 없으면 SystemExit(0) 으로 쓰기를 건너뛴다 (불필요한 재포맷 방지).
|
||||||
read -r -d '' RECON_SRC <<'PYEOF' || true
|
read -r -d '' RECON_SRC <<'PYEOF' || true
|
||||||
import os, json, glob, subprocess, time
|
import os, json, glob, subprocess, time, sqlite3
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
@@ -403,14 +403,28 @@ if tmux_confirmed:
|
|||||||
name = t['name']
|
name = t['name']
|
||||||
if name in yaml_session_names:
|
if name in yaml_session_names:
|
||||||
continue
|
continue
|
||||||
if not (name.endswith('-creator-claude') or name.endswith('-creator-agy')):
|
if name.endswith('-creator-claude'):
|
||||||
|
agent = 'claude'
|
||||||
|
elif name.endswith('-creator-agy'):
|
||||||
|
agent = 'agy'
|
||||||
|
elif name.endswith('-creator-hermes'):
|
||||||
|
agent = 'hermes'
|
||||||
|
elif name.endswith('-creator-cline'):
|
||||||
|
agent = 'cline'
|
||||||
|
else:
|
||||||
continue
|
continue
|
||||||
srv = t.get('server', 'default')
|
srv = t.get('server', 'default')
|
||||||
pm = pane_meta(name, srv)
|
pm = pane_meta(name, srv)
|
||||||
if not pm:
|
if not pm:
|
||||||
continue
|
continue
|
||||||
agent = 'claude' if name.endswith('-creator-claude') else 'agy'
|
if agent == 'claude':
|
||||||
cmd_full = 'claude --dangerously-skip-permissions' if agent == 'claude' else 'agy --dangerously-skip-permissions'
|
cmd_full = 'claude --dangerously-skip-permissions'
|
||||||
|
elif agent == 'agy':
|
||||||
|
cmd_full = 'agy --dangerously-skip-permissions'
|
||||||
|
elif agent == 'hermes':
|
||||||
|
cmd_full = 'hermes'
|
||||||
|
elif agent == 'cline':
|
||||||
|
cmd_full = 'cline -i'
|
||||||
server_opt = f"-L {srv} " if srv != 'default' else ""
|
server_opt = f"-L {srv} " if srv != 'default' else ""
|
||||||
entry = {
|
entry = {
|
||||||
'name': name,
|
'name': name,
|
||||||
@@ -430,7 +444,7 @@ if tmux_confirmed:
|
|||||||
entry['tui'] = {'model': '(unknown — capture after first message)', 'provider': 'anthropic',
|
entry['tui'] = {'model': '(unknown — capture after first message)', 'provider': 'anthropic',
|
||||||
'plan': '(unknown)', 'account': '(unknown)', 'version': '(unknown)'}
|
'plan': '(unknown)', 'account': '(unknown)', 'version': '(unknown)'}
|
||||||
entry['claude_session_id_own'] = None
|
entry['claude_session_id_own'] = None
|
||||||
else:
|
elif agent == 'agy':
|
||||||
entry['child_pid'] = 0
|
entry['child_pid'] = 0
|
||||||
entry['agy_conversation_id_own'] = None
|
entry['agy_conversation_id_own'] = None
|
||||||
entry['mcp_attachments'] = [
|
entry['mcp_attachments'] = [
|
||||||
@@ -440,6 +454,12 @@ if tmux_confirmed:
|
|||||||
'endpoint': 'https://stitch.googleapis.com/mcp'
|
'endpoint': 'https://stitch.googleapis.com/mcp'
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
elif agent == 'hermes':
|
||||||
|
entry['child_pid'] = 0
|
||||||
|
entry['hermes_conversation_id_own'] = None
|
||||||
|
elif agent == 'cline':
|
||||||
|
entry['child_pid'] = 0
|
||||||
|
entry['cline_conversation_id_own'] = None
|
||||||
d.setdefault('tmux_sessions', []).append(entry)
|
d.setdefault('tmux_sessions', []).append(entry)
|
||||||
yaml_session_names.add(name)
|
yaml_session_names.add(name)
|
||||||
drifts.append({'class': 'B', 'name': name,
|
drifts.append({'class': 'B', 'name': name,
|
||||||
@@ -505,6 +525,66 @@ for s in d.get('tmux_sessions', []):
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# === drift C (hermes): materialize a newly created hermes session id (per-row own id) ===
# For every running '*-creator-hermes' row that has no own conversation id yet,
# look up the most recently started session recorded for the pane's cwd in
# {home}/.hermes/state.db and store it on the row.
for s in d.get('tmux_sessions', []):
    if not s.get('name', '').endswith('-creator-hermes'):
        continue
    if s.get('status') != 'running':
        continue
    if s.get('hermes_conversation_id_own'):
        continue  # already materialized; never overwrite an existing own id
    cwd = (s.get('pane') or {}).get('cwd', '')
    if not cwd:
        continue
    hdb = f"{home}/.hermes/state.db"
    if not os.path.exists(hdb):
        continue
    r = None
    try:
        # Deliberately named `hconn`, not `conn`: this source is also executed
        # inside the atomic YAML wrapper, and the stop script's purge path uses
        # `hconn` for the same database to avoid a `conn` name collision.
        # NOTE(review): presumably the wrapper owns a global `conn` — confirm.
        hconn = sqlite3.connect(hdb)
        try:
            r = hconn.execute(
                "SELECT id FROM sessions WHERE cwd=? ORDER BY started_at DESC LIMIT 1",
                (cwd,),
            ).fetchone()
        finally:
            # Close even when the query raises (e.g. missing table, locked db).
            hconn.close()
    except Exception:
        # Best-effort: an unreadable hermes db must not abort the reconcile run.
        r = None
    if r:
        cid = r[0]
        s['hermes_conversation_id_own'] = cid
        drifts.append({'class': 'C', 'name': s['name'], 'msg': f"{s['name']}: conversation id materialized: {cid}"})
        actions.append(f"updated conversation id: {cid}")
|
||||||
|
|
||||||
|
# === drift C (cline): materialize a newly created cline session id (per-row own id) ===
# For every running '*-creator-cline' row that has no own session id yet, scan
# {home}/.cline/data/sessions/<id>/<id>.json newest-first and adopt the first
# session whose 'cwd' or 'workspace_root' equals the pane's cwd.
for s in d.get('tmux_sessions', []):
    if not s.get('name', '').endswith('-creator-cline'):
        continue
    if s.get('status') != 'running':
        continue
    if s.get('cline_conversation_id_own'):
        continue  # already materialized; never overwrite an existing own id
    cwd = (s.get('pane') or {}).get('cwd', '')
    if not cwd:
        continue
    sessions_dir = f"{home}/.cline/data/sessions"
    if not os.path.isdir(sessions_dir):
        continue
    # Read each mtime exactly once, tolerating files that vanish mid-scan;
    # an unguarded os.path.getmtime sort key would raise and abort the run.
    mtimes = {}
    for session_folder in glob.glob(f"{sessions_dir}/*"):
        if not os.path.isdir(session_folder):
            continue
        folder_name = os.path.basename(session_folder)
        json_file = f"{session_folder}/{folder_name}.json"
        try:
            mtimes[json_file] = os.path.getmtime(json_file)
        except OSError:
            continue  # missing or unreadable session file: not a candidate
    candidates = sorted(mtimes, key=mtimes.get, reverse=True)
    for j in candidates:
        try:
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(j, encoding='utf-8') as f:
                sdata = json.load(f)
        except Exception:
            continue  # best-effort: skip corrupt or unreadable session files
        if not isinstance(sdata, dict):
            continue
        if sdata.get('cwd') == cwd or sdata.get('workspace_root') == cwd:
            cid = sdata.get('session_id')
            if cid:
                s['cline_conversation_id_own'] = cid
                drifts.append({'class': 'C', 'name': s['name'], 'msg': f"{s['name']}: session id materialized: {cid}"})
                actions.append(f"updated session id: {cid}")
                break
|
||||||
|
|
||||||
# === drift D: stale UUID (cache 의 artifact 가 사라짐) — 보고만, 변경 없음 ===
|
# === drift D: stale UUID (cache 의 artifact 가 사라짐) — 보고만, 변경 없음 ===
|
||||||
ai = d.get('agent_identities', {}) or {}
|
ai = d.get('agent_identities', {}) or {}
|
||||||
cl = (ai.get('claude') or {})
|
cl = (ai.get('claude') or {})
|
||||||
@@ -519,6 +599,28 @@ if ag.get('conversation_id'):
|
|||||||
if not os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{cid}.db"):
|
if not os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{cid}.db"):
|
||||||
drifts.append({'class': 'D', 'name': '(agy identity cache)',
|
drifts.append({'class': 'D', 'name': '(agy identity cache)',
|
||||||
'msg': f"stale UUID in agent_identities.agy.conversation_id: {cid} (.db missing)"})
|
'msg': f"stale UUID in agent_identities.agy.conversation_id: {cid} (.db missing)"})
|
||||||
|
# drift D (hermes): report (never modify) a cached hermes session id whose row
# is no longer present in the `sessions` table of {home}/.hermes/state.db.
hr = (ai.get('hermes') or {})
if hr.get('session_id'):
    sid = hr['session_id']
    hdb = f"{home}/.hermes/state.db"
    has_session = False
    if os.path.exists(hdb):
        try:
            # Deliberately named `hconn`, not `conn`: the stop script's purge
            # path uses `hconn` for the same database to avoid a `conn` name
            # collision. NOTE(review): presumably the atomic YAML wrapper that
            # runs this source owns a global `conn` — confirm.
            hconn = sqlite3.connect(hdb)
            try:
                r = hconn.execute("SELECT 1 FROM sessions WHERE id=?", (sid,)).fetchone()
            finally:
                # Close even when the query raises.
                hconn.close()
            has_session = r is not None
        except Exception:
            # Best-effort: a failed lookup leaves has_session False, so the id
            # is reported as stale rather than aborting the reconcile run.
            pass
    if not has_session:
        drifts.append({'class': 'D', 'name': '(hermes identity cache)',
                       'msg': f"stale UUID in agent_identities.hermes.session_id: {sid} (session missing from db)"})
|
||||||
|
# drift D (cline): report (never modify) a cached cline session id whose
# session file {home}/.cline/data/sessions/<id>/<id>.json no longer exists.
cn = ai.get('cline') or {}
if cn.get('session_id'):
    sid = cn['session_id']
    session_file_present = os.path.exists(f"{home}/.cline/data/sessions/{sid}/{sid}.json")
    if not session_file_present:
        stale_msg = f"stale UUID in agent_identities.cline.session_id: {sid} (session file missing)"
        drifts.append({
            'class': 'D',
            'name': '(cline identity cache)',
            'msg': stale_msg,
        })
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
'timestamp': now_iso,
|
'timestamp': now_iso,
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ if [ -z "$AGENT" ]; then
|
|||||||
*-creator-claude) AGENT=claude ;;
|
*-creator-claude) AGENT=claude ;;
|
||||||
*-creator-agy) AGENT=agy ;;
|
*-creator-agy) AGENT=agy ;;
|
||||||
*-creator-hermes) AGENT=hermes ;;
|
*-creator-hermes) AGENT=hermes ;;
|
||||||
|
*-creator-cline) AGENT=cline ;;
|
||||||
*) echo "ERROR: cannot infer agent from '$SESSION_NAME'; pass --agent" >&2; exit 2 ;;
|
*) echo "ERROR: cannot infer agent from '$SESSION_NAME'; pass --agent" >&2; exit 2 ;;
|
||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
@@ -51,7 +52,7 @@ NOW_ISO=$(date -u +'%Y-%m-%dT%H:%M:%SZ')
|
|||||||
PANE_PID=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}' 2>/dev/null | head -1 || true)
|
PANE_PID=$(tmux list-panes -t "$SESSION_NAME" -F '#{pane_pid}' 2>/dev/null | head -1 || true)
|
||||||
PANE_PID="${PANE_PID:-}"
|
PANE_PID="${PANE_PID:-}"
|
||||||
CHILD_PID=0
|
CHILD_PID=0
|
||||||
if { [ "$AGENT" = "agy" ] || [ "$AGENT" = "hermes" ]; } && [ -n "$PANE_PID" ]; then
|
if { [ "$AGENT" = "agy" ] || [ "$AGENT" = "hermes" ] || [ "$AGENT" = "cline" ]; } && [ -n "$PANE_PID" ]; then
|
||||||
CHILD_PID=$(pgrep -P "$PANE_PID" -x "$AGENT" 2>/dev/null | head -1 || true)
|
CHILD_PID=$(pgrep -P "$PANE_PID" -x "$AGENT" 2>/dev/null | head -1 || true)
|
||||||
CHILD_PID="${CHILD_PID:-0}"
|
CHILD_PID="${CHILD_PID:-0}"
|
||||||
fi
|
fi
|
||||||
@@ -144,6 +145,13 @@ elif agent == 'hermes':
|
|||||||
cp = os.environ.get('CHILD_PID', '0')
|
cp = os.environ.get('CHILD_PID', '0')
|
||||||
if cp.isdigit() and int(cp) > 0:
|
if cp.isdigit() and int(cp) > 0:
|
||||||
target['child_pid'] = int(cp)
|
target['child_pid'] = int(cp)
|
||||||
|
elif agent == 'cline':
|
||||||
|
target['pane']['cmd'] = 'cline'
|
||||||
|
target['pane']['cmd_full'] = f'cline -i --id {uuid}'
|
||||||
|
target['cline_conversation_id_own'] = uuid
|
||||||
|
cp = os.environ.get('CHILD_PID', '0')
|
||||||
|
if cp.isdigit() and int(cp) > 0:
|
||||||
|
target['child_pid'] = int(cp)
|
||||||
|
|
||||||
snap = d.setdefault('snapshot', {})
|
snap = d.setdefault('snapshot', {})
|
||||||
snap['taken_at'] = now
|
snap['taken_at'] = now
|
||||||
|
|||||||
@@ -76,6 +76,7 @@ if [ -z "$AGENT" ]; then
|
|||||||
*-creator-claude) AGENT=claude ;;
|
*-creator-claude) AGENT=claude ;;
|
||||||
*-creator-agy) AGENT=agy ;;
|
*-creator-agy) AGENT=agy ;;
|
||||||
*-creator-hermes) AGENT=hermes ;;
|
*-creator-hermes) AGENT=hermes ;;
|
||||||
|
*-creator-cline) AGENT=cline ;;
|
||||||
*) echo "ERROR: cannot infer agent from '$SESSION_NAME'; pass --agent" >&2; exit 2 ;;
|
*) echo "ERROR: cannot infer agent from '$SESSION_NAME'; pass --agent" >&2; exit 2 ;;
|
||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
@@ -184,6 +185,7 @@ graceful_stop() {
|
|||||||
claude) exitkey="/exit" ;;
|
claude) exitkey="/exit" ;;
|
||||||
agy) exitkey="Exit" ;;
|
agy) exitkey="Exit" ;;
|
||||||
hermes) exitkey="/exit" ;;
|
hermes) exitkey="/exit" ;;
|
||||||
|
cline) exitkey="/exit" ;;
|
||||||
*) exitkey="/exit" ;;
|
*) exitkey="/exit" ;;
|
||||||
esac
|
esac
|
||||||
echo "graceful: send-keys '$exitkey' to $SESSION_NAME"
|
echo "graceful: send-keys '$exitkey' to $SESSION_NAME"
|
||||||
@@ -263,6 +265,8 @@ if captured and not purge:
|
|||||||
target['agy_conversation_id_own'] = captured
|
target['agy_conversation_id_own'] = captured
|
||||||
elif agent == 'hermes':
|
elif agent == 'hermes':
|
||||||
target['hermes_conversation_id_own'] = captured
|
target['hermes_conversation_id_own'] = captured
|
||||||
|
elif agent == 'cline':
|
||||||
|
target['cline_conversation_id_own'] = captured
|
||||||
target['resumable'] = True
|
target['resumable'] = True
|
||||||
|
|
||||||
# --purge-conversation: 워크스페이스 격리된 UUID 의 디스크 artifact 만 삭제 (P0-C)
|
# --purge-conversation: 워크스페이스 격리된 UUID 의 디스크 artifact 만 삭제 (P0-C)
|
||||||
@@ -294,15 +298,21 @@ if purge and purge_uuid:
|
|||||||
if os.path.exists(hdb):
|
if os.path.exists(hdb):
|
||||||
try:
|
try:
|
||||||
import sqlite3
|
import sqlite3
|
||||||
conn = sqlite3.connect(hdb)
|
hconn = sqlite3.connect(hdb)
|
||||||
conn.execute("DELETE FROM sessions WHERE id=?", (purge_uuid,))
|
hconn.execute("DELETE FROM sessions WHERE id=?", (purge_uuid,))
|
||||||
conn.execute("DELETE FROM messages WHERE session_id=?", (purge_uuid,))
|
hconn.execute("DELETE FROM messages WHERE session_id=?", (purge_uuid,))
|
||||||
conn.commit()
|
hconn.commit()
|
||||||
conn.close()
|
hconn.close()
|
||||||
print(f"purged db records for session: {purge_uuid}", flush=True)
|
print(f"purged db records for session: {purge_uuid}", flush=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"WARN: purge hermes db records failed: {e}", flush=True)
|
print(f"WARN: purge hermes db records failed: {e}", flush=True)
|
||||||
target['hermes_conversation_id_own'] = None
|
target['hermes_conversation_id_own'] = None
|
||||||
|
elif agent == 'cline':
|
||||||
|
sessions_dir = f"{home}/.cline/data/sessions/{purge_uuid}"
|
||||||
|
if os.path.isdir(sessions_dir):
|
||||||
|
shutil.rmtree(sessions_dir)
|
||||||
|
print(f"purged: {sessions_dir}", flush=True)
|
||||||
|
target['cline_conversation_id_own'] = None
|
||||||
# agent_identities 는 cache — 이 워크스페이스 것일 때만 비운다
|
# agent_identities 는 cache — 이 워크스페이스 것일 때만 비운다
|
||||||
ai = (d.get('agent_identities') or {}).get(agent) or {}
|
ai = (d.get('agent_identities') or {}).get(agent) or {}
|
||||||
if ai.get('project_cwd') == ws:
|
if ai.get('project_cwd') == ws:
|
||||||
@@ -317,6 +327,8 @@ if purge and purge_uuid:
|
|||||||
ai['conversation_brain_dir'] = None
|
ai['conversation_brain_dir'] = None
|
||||||
elif agent == 'hermes' and ai.get('session_id') == purge_uuid:
|
elif agent == 'hermes' and ai.get('session_id') == purge_uuid:
|
||||||
ai['session_id'] = None
|
ai['session_id'] = None
|
||||||
|
elif agent == 'cline' and ai.get('session_id') == purge_uuid:
|
||||||
|
ai['session_id'] = None
|
||||||
elif purge and not purge_uuid:
|
elif purge and not purge_uuid:
|
||||||
print("WARN: --purge-conversation requested but no workspace-scoped UUID resolved; nothing purged", flush=True)
|
print("WARN: --purge-conversation requested but no workspace-scoped UUID resolved; nothing purged", flush=True)
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,121 @@
|
|||||||
|
# Multi-Agent Mux: Skill Features and Architecture
|
||||||
|
|
||||||
|
이 문서는 `multi-agent-mux` 워크스페이스 내에 구현된 6개의 개별 스킬 및 공통 라이브러리의 핵심 기능, 상태 머신, CLI 사양, 그리고 상호 연동 방식을 종합 정리한 명세입니다. 스킬 최적화 및 리팩토링 작업의 기준서로 사용됩니다.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. 아키텍처 개요 (Architecture Overview)
|
||||||
|
|
||||||
|
`multi-agent-mux`는 다중 자율 에이전트(Claude, Agy, Cline, Hermes 등)를 격리된 Tmux 세션 환경에서 관리하고 상호 통신할 수 있게 돕는 시스템입니다.
|
||||||
|
* **중앙 상태 레지스트리**: `.mam/agent-sessions.yaml` 및 동기화된 `.mam/agent-sessions.db` (SQLite3)
|
||||||
|
* **격리 소켓**: 독립된 tmux 서버 소켓 지정 구동 가능 (예: `multi-agent-mux` 서버)
|
||||||
|
* **이벤트 버스**: MQTT 프로토콜 기반의 실시간 작업 상태 비동기 관찰 (`multi-agent-mux-delegate-job`)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. 공통 라이브러리: `lib.sh` (Common Library)
|
||||||
|
|
||||||
|
모든 스킬 스크립트가 로드하여 사용하는 핵심 공유 헬퍼 라이브러리입니다.
|
||||||
|
|
||||||
|
* **상태 파일 원자적 덤프 (`atomic_dump_yaml`)**:
|
||||||
|
* NFS(네트워크 파일 시스템) 감지 시 SQLite `PRAGMA journal_mode=DELETE` 폴백, 로컬 환경에서는 `PRAGMA journal_mode=WAL` 설정.
|
||||||
|
* 독점 잠금(`BEGIN IMMEDIATE`)을 활성화해 멀티프로세스 환경에서 Read-Modify-Write 데이터 유실(lost update race condition) 방지.
|
||||||
|
* 트랜잭션 커밋 완료 후 `.bak` 백업 파일 생성 및 임시파일 생성 후 `os.replace` 원자적 대체 기법 적용.
|
||||||
|
* **에이전트 세션 실재성 판단 (`*_exists` 함수군)**:
|
||||||
|
* `claude`: 프로젝트 디렉터리 하위 `<uuid>.jsonl` 존재성
|
||||||
|
* `agy`: `.gemini/antigravity-cli/conversations/<uuid>.db` 존재성
|
||||||
|
* `hermes`: `~/.hermes/state.db`의 `sessions` 테이블 내 존재성 (SQLite 쿼리 검증)
|
||||||
|
* `cline`: `.cline/data/sessions/<uuid>/<uuid>.json` 존재성
|
||||||
|
* **세션 ID 해석 엔진 (`find_workspace_uuid` 분기 구조)**:
|
||||||
|
* **Tier 1 (YAML 직접 조회)**: YAML 내 기록된 에이전트별 전용 필드(`claude_session_id_own` 등) 조회.
|
||||||
|
* **Tier 2 (디스크 잔해 스캔)**: 워크스페이스 디렉터리(`cwd` / `workspace_root`)와 매칭되는 디스크 상의 세션 로그 중 가장 최근 수정일(`mtime`) 기준 정렬 후 최신 UUID 반환.
|
||||||
|
* **Tier 3 (아이덴티티 캐시)**: 레지스트리 상단 `agent_identities` 캐시 데이터 연동.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. 스킬별 상세 핵심 기능 (Skill Specifications)
|
||||||
|
|
||||||
|
### 3.1. `multi-agent-mux-create` (생성 스킬)
|
||||||
|
* **용도**: 신규 에이전트 동작용 격리된 Tmux 컨테이너 생성 및 레지스트리 신규 등록.
|
||||||
|
* **핵심 기능**:
|
||||||
|
* **사전 기능 검증 (Preflight Check)**:
|
||||||
|
* `claude`: `claude auth status`를 통한 로그인 상태(`"loggedIn": true`) 검증
|
||||||
|
* `agy`: `agy models`를 통한 API 연동 정상 상태 검증
|
||||||
|
* `hermes`: `hermes status`를 통한 연동 상태 검증
|
||||||
|
* `cline`: `cline history --json` 동작 및 설정 상태 사전 검증
|
||||||
|
* **Tmux 세션 생성 및 초기화**: 에이전트별 최적화된 화면 크기(`-x 140 -y 40`) 및 작업 디렉터리(`-c`)를 적용해 세션 백그라운드 생성.
|
||||||
|
* **초기 상태 YAML 등록**: `status: running`, `pane` 세부정보(인덱스, PID, CWD, CMD_FULL), 시작 명령 및 `mcp_attachments` 기록.
|
||||||
|
|
||||||
|
### 3.2. `multi-agent-mux-resume` (재개 스킬)
|
||||||
|
* **용도**: 중지되었거나 유실된 에이전트의 이전 컨텍스트 그대로 Tmux 세션 및 TUI 연결 복원.
|
||||||
|
* **핵심 기능**:
|
||||||
|
* **세션 ID 해석 위임**: `lib.sh::find_workspace_uuid`을 구동하여 대상 워크스페이스의 UUID 확인.
|
||||||
|
* **세션 복원 기동**:
|
||||||
|
* `claude`: `claude --dangerously-skip-permissions -r <UUID>`
|
||||||
|
* `agy`: `agy --dangerously-skip-permissions --conversation <UUID>`
|
||||||
|
* `hermes`: `hermes --resume <UUID>`
|
||||||
|
* `cline`: `cline -i --id <UUID>`
|
||||||
|
* **TUI 바이패스 자동화 (Claude)**: 기동 직후 백그라운드에서 `Enter` ➔ `Down` ➔ `Enter` 키스트로크를 주입하여 권한 우회 및 복구 확인 대화상자 자동 수락.
|
||||||
|
* **동기화**: `update_yaml_resumed.sh`를 구동해 상태를 `running`으로 전이하고 기동 시점에 맞춘 하위 자식 PID 갱신 및 기존 종료 메타데이터 제거.
|
||||||
|
|
||||||
|
### 3.3. `multi-agent-mux-stop` (종료 스킬)
|
||||||
|
* **용도**: 세션을 안전하게 정리하고, 상태 및 UUID를 안전하게 저장 및 동기화.
|
||||||
|
* **핵심 기능**:
|
||||||
|
* **종료 전 TUI 스냅숏 저장**: `tmux capture-pane`을 수행해 최종 화면 상태를 `last_visible_status_at_termination` 필드에 보존.
|
||||||
|
* **다단계 Graceful 종료 프로토콜**:
|
||||||
|
1. TUI 안전 종료 키스트로크 주입 (`/exit` 또는 `Exit`) 후 3초 대기.
|
||||||
|
2. 생존 시 `tmux kill-session` 전송 및 5초 대기.
|
||||||
|
3. 최후 수단으로 감지된 자식 PID에 `kill -9` 전송.
|
||||||
|
* **디스크 소거 (--purge-conversation)**:
|
||||||
|
* `resumable`을 `false`로 설정하고 상태를 `terminated`로 기록.
|
||||||
|
* 에이전트별 데이터 경로에 접근해 해당 세션 파일 파쇄.
|
||||||
|
* `claude`: `<proj-key>/<uuid>.jsonl` 삭제
|
||||||
|
* `agy`: `conversations/<uuid>.db` 및 `brain/<uuid>` 폴더 삭제
|
||||||
|
* `hermes`: `sessions/session_<uuid>.json` 삭제 및 `state.db` 내 이력 삭제 (내부 독자 커넥션 `hconn` 사용으로 상위 YAML DB 충돌 차단)
|
||||||
|
* `cline`: `~/.cline/data/sessions/<uuid>` 폴더 소거
|
||||||
|
|
||||||
|
### 3.4. `multi-agent-mux-delegate-job` (위임 스킬)
|
||||||
|
* **용도**: 타 에이전트에게 비동기적으로 작업을 위임하고, MQTT 이벤트로 실행 상태 관찰.
|
||||||
|
* **핵심 기능**:
|
||||||
|
* **작업 지시 유형 (Delegation Types)**:
|
||||||
|
* `direct` (기본값): 단일 타겟 세션 기동 후 작업 전달 및 대기.
|
||||||
|
* `loop` (협업 루프): 구현자(Worker)의 작업 완료 후 검토자(Reviewer)가 코드 검수를 수행하여 `"PASS"` 의견이 나올 때까지 작업 수정을 자동 반복 지시.
|
||||||
|
* `discuss` (토론/합의): 두 에이전트 간 공동 토론을 추진하여 최종 기획 및 계획 합의 도출.
|
||||||
|
* **MQTT 이벤트 규격**: `publish_event.py`와 `job_subscriber.py`를 매핑하여 `started` ➔ `permission_required` ➔ `progress` ➔ `completed`/`error` 상태 전이 추적 및 자동 이중 타임아웃 검사 (전체 실행 예산 3600초 + 120초 유휴 타임아웃).
|
||||||
|
* **감사 로그 기록**: `.mam/delegate_job_logs/<job_id>/`에 `meta.json`, `status.json` 및 원시 NDJSON 형식의 `events.ndjson`을 영속 기록.
|
||||||
|
|
||||||
|
### 3.5. `multi-agent-mux-status` (현황 스킬)
|
||||||
|
* **용도**: 레지스트리를 읽어와 실행 중인 모든 에이전트의 구동 세션 현황을 즉시 표기.
|
||||||
|
* **핵심 기능**:
|
||||||
|
* **읽기 전용 안정성**: DB 수정이나 상태 전이 유발 없이 순수 조회만 수행.
|
||||||
|
* 실시간 tmux 프로세스 상태 정보와 YAML 간의 이름 매핑 정합성을 검증하여 콘솔에 요약 출력.
|
||||||
|
|
||||||
|
### 3.6. `multi-agent-mux-monitor` (정합성 조정 스킬)
|
||||||
|
* **용도**: 운영체제 Tmux 런타임과 YAML 레지스트리 간의 데이터 불일치를 백그라운드 루프로 감지해 자동으로 정합성을 조정(Reconciliation)합니다.
|
||||||
|
* **핵심 기능**:
|
||||||
|
* **Drift 감지 및 복구 매뉴얼**:
|
||||||
|
* **Drift A (Crash/죽은 세션)**: YAML 상 `running`이나 실제 tmux 프로세스가 죽은 경우 감지 ➔ 상태를 `terminated`로 격하 조정.
|
||||||
|
* **Drift B (새 세션 감지)**: YAML에 없으나 tmux 상에 임의로 떠 있는 `*-creator-*` 세션을 레지스트리에 자동 등록 및 자식 PID 정보 갱신.
|
||||||
|
* **Drift C (실시간 UUID 갱신)**: 새로 시작된 에이전트가 첫 명령을 받아 세션 ID를 생성했을 때, 디스크 상의 세션 기록 중 워크스페이스 경로(`cwd`)가 일치하는 가장 최근 세션의 UUID를 찾아 해당 행의 에이전트별 전용 필드(`claude_session_id_own`, `*_conversation_id_own`)에 주입.
|
||||||
|
* **Drift D (캐시 정합성 점검)**: 레지스트리 및 캐시 상의 세션 UUID가 실제 디스크에 존재하는지 검사하여 소거된 세션을 리포트.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. 에이전트 상태 머신 (Agent State Machine)
|
||||||
|
|
||||||
|
시스템 전반에 걸쳐 에이전트 세션은 아래 흐름을 따라 전이됩니다.
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
stateDiagram-v2
|
||||||
|
[*] --> running : multi-agent-mux-create / Drift B
|
||||||
|
running --> stopped : multi-agent-mux-stop (default)
|
||||||
|
running --> terminated : multi-agent-mux-stop (--purge-conversation) / Drift A
|
||||||
|
stopped --> running : multi-agent-mux-resume
|
||||||
|
terminated --> [*]
|
||||||
|
```
|
||||||
|
|
||||||
|
## 5. 최적화 및 리팩토링 작업 시 주의 사항
|
||||||
|
|
||||||
|
1. **원자적 쓰기 무력화 금지**: `lib.sh`에 설정된 `atomic_dump_yaml`은 다중 에이전트 병렬 기동 시 데이터 꼬임을 막는 중추 역할을 합니다. DB 잠금 및 트랜잭션 흐름을 훼손하지 않아야 합니다.
|
||||||
|
2. **Cline 및 Claude의 TUI 입력 바인딩 유지**: 세션 재개나 중지 시, 각 에이전트가 내부적으로 사용하는 프롬프트 제어 명령어(예: `/exit`, `--id <session>`)의 세세한 차이를 유지해야 예외 없이 동작합니다.
|
||||||
|
3. **데이터베이스 변수 충돌 주의**: 서브셸 또는 인라인 Python 스크립트 실행 시 전역 SQLite 커넥션(`conn`)의 이름 공간을 절대 오염시키지 마십시오. (예: `stop_session.sh` 버그 재발 방지).
|
||||||
Reference in New Issue
Block a user