diff --git a/.agents/skills/lib.sh b/.agents/skills/lib.sh index 1a5cba0..6942a07 100644 --- a/.agents/skills/lib.sh +++ b/.agents/skills/lib.sh @@ -173,9 +173,16 @@ derive_session_name() { local workspace="$1" agent="$2" local abs parent work slug abs="$(cd "$workspace" 2>/dev/null && pwd)" || abs="$workspace" - parent="$(basename "$(dirname "$abs")")" - work="$(basename "$abs")" + parent="$(basename "$(dirname "$abs")" 2>/dev/null || echo "")" + work="$(basename "$abs" 2>/dev/null || echo "root")" + if [ -z "$parent" ] || [ "$parent" = "/" ] || [ "$parent" = "." ]; then + parent="workspace" + fi + if [ -z "$work" ] || [ "$work" = "/" ] || [ "$work" = "." ]; then + work="root" + fi slug="$(printf '%s-%s' "$parent" "$work" | tr '[:upper:]' '[:lower:]' | tr '_' '-')" + slug="$(printf '%s' "$slug" | tr -cd 'a-zA-Z0-9-')" printf '%s-creator-%s' "$slug" "$agent" } @@ -189,13 +196,35 @@ derive_session_name() { # inside the script — never spliced into the source. Read-only by convention; # use atomic_dump_yaml when you need to write the YAML. # --------------------------------------------------------------------------- +_validate_env_key() { + local key="$1" + if [[ ! 
"$key" =~ ^[a-zA-Z_][a-zA-Z0-9_]*$ ]]; then + echo "ERROR: Invalid environment variable name: $key" >&2 + return 1 + fi + case "$key" in + LD_PRELOAD|LD_LIBRARY_PATH|PYTHONPATH|PYTHONHOME|PYTHONINSPECT|PYTHONSTARTUP) + echo "ERROR: Blocked environment variable: $key" >&2 + return 1 + ;; + esac + return 0 +} + env_python() { local yaml_path="$1"; shift local -a envs=("YAML_PATH=$yaml_path" "HOME_DIR=$HOME_DIR" "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR" "LOCAL_BIN=$LOCAL_BIN") while [ $# -gt 0 ]; do case "$1" in - *=*) envs+=("$1"); shift ;; - *) break ;; + *=*) + local key="${1%%=*}" + _validate_env_key "$key" || return 1 + envs+=("$1") + shift + ;; + *) + break + ;; esac done env "${envs[@]}" python3 - "$@" @@ -233,8 +262,15 @@ atomic_dump_yaml() { local -a envs=("YAML_PATH=$yaml_path" "HOME_DIR=$HOME_DIR" "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR" "LOCAL_BIN=$LOCAL_BIN") while [ $# -gt 0 ]; do case "$1" in - *=*) envs+=("$1"); shift ;; - *) break ;; + *=*) + local key="${1%%=*}" + _validate_env_key "$key" || return 1 + envs+=("$1") + shift + ;; + *) + break + ;; esac done local mutation; mutation="$(cat)" diff --git a/.agents/skills/multi-agent-mux-create/scripts/create_session.sh b/.agents/skills/multi-agent-mux-create/scripts/create_session.sh index 7fa0c55..5e47882 100755 --- a/.agents/skills/multi-agent-mux-create/scripts/create_session.sh +++ b/.agents/skills/multi-agent-mux-create/scripts/create_session.sh @@ -110,7 +110,7 @@ spawn() { nohup "$WRAPPER" >/dev/null 2>&1 & disown else - _tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "claude" + _tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "claude --dangerously-skip-permissions" fi ;; agy) @@ -142,7 +142,7 @@ NOW_ISO=$(date -u +'%Y-%m-%dT%H:%M:%SZ') # cmd_full 결정 case "$AGENT" in - claude) CMD_FULL='claude' ;; + claude) CMD_FULL='claude --dangerously-skip-permissions' ;; agy) CMD_FULL='agy --dangerously-skip-permissions' ;; hermes) CMD_FULL='hermes' ;; esac @@ -158,7 
+158,7 @@ case "$AGENT" in if [ -x "$WRAPPER" ]; then START_CMD="$WRAPPER # ~/.local/bin 의 래퍼" else - START_CMD="$local_tmux new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"claude\"" + START_CMD="$local_tmux new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"claude --dangerously-skip-permissions\"" fi ;; agy|hermes) diff --git a/.agents/skills/multi-agent-mux-delegate-job/multi-agent-mux-delegate-job b/.agents/skills/multi-agent-mux-delegate-job/multi-agent-mux-delegate-job index 378d931..72f655c 100755 --- a/.agents/skills/multi-agent-mux-delegate-job/multi-agent-mux-delegate-job +++ b/.agents/skills/multi-agent-mux-delegate-job/multi-agent-mux-delegate-job @@ -164,12 +164,10 @@ run_agent() { # The user attaches with `tmux attach -t ` and types follow-up # prompts themselves. We pre-load the first prompt via stdin and `read` # keeps the pane open after the agent exits so the user can review. - case "$AGENT" in - claude-code) bin="claude";; - codex) bin="codex";; - human) echo "[human agent] complete the task, then run publish_event.py --event completed"; return;; - *) bin="$AGENT";; - esac + if [ "$AGENT" = "human" ]; then + echo "[human agent] complete the task, then run publish_event.py --event completed" + return + fi if [[ "$DRY_RUN" == "1" ]]; then echo "[dry-run] would launch agent '$AGENT' in a fresh tmux session with instructions:" @@ -182,21 +180,17 @@ run_agent() { echo " Install with: brew install tmux (or your package manager)" >&2 return 1 fi - if ! command -v "$bin" >/dev/null 2>&1; then - echo "ERROR: agent binary '$bin' not found in PATH." >&2 - return 1 + + local _tmux="tmux" + if [ -n "${TMUX_SERVER_NAME:-}" ]; then + _tmux="tmux -L $TMUX_SERVER_NAME" fi local sess="${AGENT_SESSION#tmux:}" - # Detect a stale session with the same name (e.g. the user is still attached - # from an earlier run, or a previous wrapper died without cleanup). 
tmux - # new-session on an existing name fails silently; check first and fail loud. - if tmux has-session -t "$sess" 2>/dev/null; then - local attached - attached=$(tmux list-clients -t "$sess" 2>/dev/null | wc -l | tr -d ' ') - echo "ERROR: tmux session '$sess' already exists (clients attached: $attached)." >&2 - echo " Pick a unique --agent-session (e.g. tmux:demo, tmux:claude-a) or" >&2 - echo " kill the stale one first: tmux kill-session -t $sess" >&2 + + if ! $_tmux has-session -t "$sess" 2>/dev/null; then + echo "ERROR: 에이전트 세션 '$sess'이 존재하지 않습니다. 작업을 위임하기 전에 먼저 에이전트 세션을 기동해 주세요." >&2 + echo " 팁: 'multi-agent-mux-resume' 또는 'multi-agent-mux-create'를 통해 에이전트를 먼저 생성할 수 있습니다." >&2 return 1 fi @@ -206,9 +200,13 @@ run_agent() { trap 'rc=$?; if [ $rc -ne 0 ]; then "$PY" "$pub_script" --job "$job_id" --event error --detail "agent bootstrap failed (exit $rc)"; fi' EXIT fi - tmux new-session -d -s "$sess" -c "$WORKDIR" \ - "printf '%s' \"$instructions\" | $bin --dangerously-skip-permissions; echo; echo '--- agent exited (job $job_id); press enter to close ---'; read" - echo "agent launched in tmux session: $sess (attach with: tmux attach -t $sess)" + echo "살아있는 에이전트 세션 '$sess'에 작업을 위임합니다..." + $_tmux set-buffer -b "job_buf_$job_id" "$instructions" + $_tmux paste-buffer -b "job_buf_$job_id" -t "$sess" + $_tmux send-keys -t "$sess" C-m + $_tmux delete-buffer -b "job_buf_$job_id" + + echo "작업이 세션 '$sess'에 전송되었습니다. 
(연결하려면: $_tmux attach -t $sess)" trap - EXIT } diff --git a/.agents/skills/multi-agent-mux-delegate-job/scripts/mqtt_common.py b/.agents/skills/multi-agent-mux-delegate-job/scripts/mqtt_common.py index 5fb4bf0..eeb583e 100644 --- a/.agents/skills/multi-agent-mux-delegate-job/scripts/mqtt_common.py +++ b/.agents/skills/multi-agent-mux-delegate-job/scripts/mqtt_common.py @@ -328,24 +328,24 @@ def update_job_status(job_id: str, registry_dir: str = DEFAULT_REGISTRY_DIR, **f This is the single chokepoint for status writes (both ``registry.update_status`` and ``publish_event.py``'s status sync route through here), so it also mirrors - any ``status`` change into the persistent audit log — best-effort, after the - registry lock is released so a slow/failed log write never blocks the record.""" + any ``status`` change into the persistent audit log. We perform the log mirror + under the lock to guarantee sequential consistency in audit history.""" with registry_lock(registry_dir): record = load_job(job_id, registry_dir) old_status = record.get("status") record.update(fields) record["updated_at"] = _utcnow() _atomic_write_record(job_id, registry_dir, record) - if "status" in fields: - new_status = record.get("status") - update_logged_status(job_id, new_status, updated_at=record["updated_at"]) - if old_status != new_status: - append_event(job_id, { - "event": "status_changed", - "from": old_status, - "to": new_status, - "timestamp": record["updated_at"], - }) + if "status" in fields: + new_status = record.get("status") + update_logged_status(job_id, new_status, updated_at=record["updated_at"]) + if old_status != new_status: + append_event(job_id, { + "event": "status_changed", + "from": old_status, + "to": new_status, + "timestamp": record["updated_at"], + }) return record @@ -410,6 +410,21 @@ def _file_lock(fh): fcntl.flock(fh.fileno(), fcntl.LOCK_UN) +def _redact_dict(d: Any) -> Any: + """Recursively mask sensitive values (passwords, secrets, tokens) inside logs.""" + if 
isinstance(d, dict): + redacted = {} + for k, v in d.items(): + if any(s in k.lower() for s in ("password", "token", "secret", "auth_token", "key")): + redacted[k] = "[REDACTED]" + else: + redacted[k] = _redact_dict(v) + return redacted + elif isinstance(d, list): + return [_redact_dict(item) for item in d] + return d + + def append_event(job_id: str, event_dict: Dict[str, Any], logs_dir: Optional[str] = None) -> None: """Append one event as a JSON line to ``//events.ndjson``. @@ -418,7 +433,7 @@ def append_event(job_id: str, event_dict: Dict[str, Any], logs_dir: Optional[str try: path = job_log_path(job_id, EVENTS_FILENAME, logs_dir) path.parent.mkdir(parents=True, exist_ok=True) - record = dict(event_dict) + record = _redact_dict(dict(event_dict)) record.setdefault("logged_at", _utcnow_precise()) line = json.dumps(record, ensure_ascii=False) + "\n" with open(path, "a", encoding="utf-8") as fh: @@ -453,8 +468,9 @@ def init_job_log(job_id: str, meta: Dict[str, Any], logs_dir: Optional[str] = No try: d = job_log_dir(job_id, logs_dir) d.mkdir(parents=True, exist_ok=True) + meta_redacted = _redact_dict(meta) with open(d / META_FILENAME, "w", encoding="utf-8") as fh: - json.dump(meta, fh, ensure_ascii=False, indent=2) + json.dump(meta_redacted, fh, ensure_ascii=False, indent=2) fh.write("\n") status = meta.get("status", "pending") update_logged_status( diff --git a/.agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh b/.agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh index 4d6cd11..19cec98 100755 --- a/.agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh +++ b/.agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh @@ -410,7 +410,7 @@ if tmux_confirmed: if not pm: continue agent = 'claude' if name.endswith('-creator-claude') else 'agy' - cmd_full = 'claude' if agent == 'claude' else 'agy --dangerously-skip-permissions' + cmd_full = 'claude --dangerously-skip-permissions' if agent == 'claude' else 'agy --dangerously-skip-permissions' 
server_opt = f"-L {srv} " if srv != 'default' else "" entry = { 'name': name, diff --git a/README.ko.md b/README.ko.md index cbec3e3..0929740 100644 --- a/README.ko.md +++ b/README.ko.md @@ -33,6 +33,63 @@ Tmux와 MQTT 브로커를 기반으로 구축된 고신뢰성 **다중 에이전 --- +## 📐 전체 아키텍처 구성 (Big-Picture Architecture) + +이 시스템은 크게 두 가지 계층(Layer)을 통해 다중 워크스페이스에서 작동하는 LLM 에이전트들을 조율합니다: + +1. **Layer A — Tmux 오케스트레이션 (lib.sh + status/resume/stop/create)**: 워크스페이스별 에이전트 세션을 독립된 tmux 인스턴스로 분리 실행하고, `.mam/agent-sessions.yaml` 및 SQLite 데이터베이스(`.mam/agent-sessions.db`)를 통해 에이전트 세션 메타데이터의 단일 참조 지점(Single Source of Truth)을 유지합니다. +2. **Layer B — 비동기 잡 위임 (delegate-job)**: 에이전트에 특정 태스크를 전송하고 비동기 이벤트 채널(MQTT)을 통해 진행 상황과 완료 여부를 모니터링합니다. + +두 레이어는 파일 I/O 처리를 위한 하나의 핵심 관문인 `lib.sh::atomic_dump_yaml`을 공유합니다. 모든 YAML/DB 쓰기 작업은 독점 파일 락(`flock`)과 데이터 스키마 유효성 검증을 거칩니다. + +### 데이터 흐름 개요 (Data Flow) + +```text + +-----------+ register_job +-------------------+ + | delegator | ---------------> | .mam/jobs/.json| <-- 실시간 잡 정보 + +-----------+ +---------+---------+ + | + | atomic rename + fsync + v + +-----------------+ + | audit log | <-- 추가 전용 + | .mam/delegate_ | events.ndjson + | job_logs// | + +--------+--------+ + ^ + | (최선 노력 미러링) + | + +-----------+ publish_event +-----+-----+ +---------+ + | agent | ---------------> | MQTT broker | <--- | monitor | + | (claude) | +-------------+ +----+----+ + +-----------+ | + ^ v + | 구독자(subscriber) atomic_dump_yaml + | (job_subscriber.py) (.mam/agent-sessions.yaml) + | ^ + +-------- 위임 대기 영역 -----------------+ | + +---+---+ + | reconcil| + | e.sh | + +--------+ +``` + +### 🔒 Tmux 서버 격리 (Tmux Server Isolation) + +에이전트 세션 간의 충돌 및 시스템 전역 tmux 프로세스와의 혼선을 막기 위해 독립된 서버 소켓 환경을 보장합니다: +* **워크스페이스별 심(Shim):** `_init_tmux_isolation` 및 `_resolve_real_tmux_path` 함수가 `/tmp/multi-agent-tmux-shim//tmux` 경로에 독립된 심 디렉터리를 구성하고, 일반 tmux 명령 실행 시 자동으로 `tmux -L ` 형태의 독립 소켓 서버를 사용하게 만듭니다. +* **PATH 환경변수 변조:** 자식 프로세스를 생성할 때 `PATH` 변수 맨 앞에 심 디렉터리 경로를 삽입합니다. 
이로 인해 에이전트의 내부 셸에서 수행되는 모든 `tmux` 명령어는 해당 격리 서버 소켓으로 강제 제약됩니다. +* **환경 복구:** `TMUX_SERVER_NAME`을 `default`로 설정하는 경우 PATH 오버라이드가 정리되고 기본 전역 tmux 서버를 사용하게 됩니다. + +### 🛡️ 동시성 설계 및 쓰기 직렬화 + +여러 에이전트가 동시에 실행될 때의 레이스 컨디션을 방지하기 위해 락 기반의 실행 패턴을 고수합니다: +* **POSIX 파일 락 (`flock`):** `agent-sessions.yaml` 또는 SQLite 레지스트리에 쓰기 연산을 진행할 때, 반드시 `lib.sh` 내부의 `atomic_dump_yaml` 함수를 거쳐 `.mam/agent-sessions.yaml.lock` 파일에 독점 락(`flock`)을 획득하도록 직렬화합니다. +* **이중 인터프리터 분리 구조:** 라이브러리 간 의존성 충돌과 실행 도구의 안정성을 보장하기 위해 환경을 이원화했습니다. MQTT 및 비동기 작업 통신에는 가상환경 `.venv` (paho-mqtt 필요)의 Python을 사용하고, YAML 직렬화 쓰기 및 유효성 검증을 담당하는 `atomic_dump_yaml`은 시스템 전역 `python3` (시스템 PyYAML 필요)을 호출합니다. +* **NFS 및 네트워크 파일시스템 대응:** 네트워크 디바이스(NFS, CIFS, SSHFS)에서는 `flock`이 무력화되는 특성이 있습니다. `lib.sh`는 쓰기 대상 파일시스템 경로의 마운트 타입을 체크하여, 네트워크 파일시스템 감지 시 경고 로그를 출력하고 SQLite의 저널 모드를 `WAL`에서 `DELETE`로 자동 전환해 동시성 안전을 강화합니다. + +--- + ## 📐 아키텍처 및 조정 루프 (Review Loop) Project Manager(PM), Worker, Reviewer 역할 간의 협업 구조는 엄격한 교차 검증 루프를 따릅니다: diff --git a/README.md b/README.md index a929c5b..d8bd097 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,63 @@ All orchestration functionalities are structured under the `.agents/skills/` dir --- +## 📐 Big-Picture Architecture + +The system coordinates LLM agents across multiple workspaces through two core layers: + +1. **Layer A — Tmux Orchestration (lib.sh + status/resume/stop/create)**: Runs the agents (one tmux session per agent-workspace combination) and maintains an authoritative registry in `.mam/agent-sessions.yaml` (+ `.mam/agent-sessions.db`). +2. **Layer B — Async Job Delegation (delegate-job)**: Dispatches a task to an agent and observes progress and completion via an event channel. + +These two layers share one lock-guarded chokepoint for file I/O: `lib.sh::atomic_dump_yaml`. Every write is protected by an exclusive `flock` and schema validation. 
+ +### Data Flow Overview + +```text + +-----------+ register_job +-------------------+ + | delegator | ---------------> | .mam/jobs/.json| <-- live record + +-----------+ +---------+---------+ + | + | atomic rename + fsync + v + +-----------------+ + | audit log | <-- append-only + | .mam/delegate_ | events.ndjson + | job_logs// | + +--------+--------+ + ^ + | (best-effort mirrors) + | + +-----------+ publish_event +-----+-----+ +---------+ + | agent | ---------------> | MQTT broker | <--- | monitor | + | (claude) | +-------------+ +----+----+ + +-----------+ | + ^ v + | subscriber atomic_dump_yaml + | (job_subscriber.py) (.mam/agent-sessions.yaml) + | ^ + +-------- delegator waits here ----------+ | + +---+---+ + | reconcil| + | e.sh | + +--------+ +``` + +### 🔒 Tmux Server Isolation + +To prevent workspace tmux processes from interfering with each other or with system tmux servers, the framework enforces isolated tmux environments: +* **Per-Workspace Shim:** `_init_tmux_isolation` and `_resolve_real_tmux_path` instantiate a per-workspace shim directory under `/tmp/multi-agent-tmux-shim//tmux` that intercepts tmux commands and wraps them in `tmux -L `. +* **PATH Rewriting:** The `PATH` environment variable is dynamically prepended with the shim path in all child processes. This ensures any `tmux` invocation within the agent's process tree is restricted to its isolated socket server. +* **Environment Restoration:** If `TMUX_SERVER_NAME` is set to `default`, the PATH override is removed, reverting to the default global tmux server. + +### 🛡️ Concurrency Design & Write Serialization + +The framework implements lock-guarded execution pathways to prevent race conditions during parallel agent operations: +* **POSIX File Locks (`flock`):** Every mutation of `agent-sessions.yaml` and the SQLite registry runs through `atomic_dump_yaml` inside `lib.sh`, which serializes writes via an exclusive `flock` on `.mam/agent-sessions.yaml.lock`. 
+* **Dual-Interpreter Strategy:** To minimize dependency bloat and guarantee stability, the backplane splits execution environments: the virtual environment `.venv` handles MQTT communication and async jobs (requiring `paho-mqtt`), while the system `python3` executes `atomic_dump_yaml` (relying on system-wide `PyYAML`). +* **NFS and Network FS Safeguards:** Since `flock` behaves unreliably over network protocols (NFS, CIFS, SSHFS), `lib.sh` performs filesystem detection. If a network mount is identified, it outputs a safety warning and automatically switches SQLite's journal mode from `WAL` to `DELETE`. + +--- + ## 📐 Architecture & Coordination Loop The interaction between roles (Project Manager, Worker, and Reviewer) is structured as a strict iterative loop: