refactor(security,concurrency): resolve structural issues, enforce Claude permission skip, update docs

This commit is contained in:
2026-06-23 08:03:43 +09:00
parent 12dceb14b2
commit 99ac8b3ce4
7 changed files with 209 additions and 45 deletions
+42 -6
View File
@@ -173,9 +173,16 @@ derive_session_name() {
local workspace="$1" agent="$2" local workspace="$1" agent="$2"
local abs parent work slug local abs parent work slug
abs="$(cd "$workspace" 2>/dev/null && pwd)" || abs="$workspace" abs="$(cd "$workspace" 2>/dev/null && pwd)" || abs="$workspace"
parent="$(basename "$(dirname "$abs")")" parent="$(basename "$(dirname "$abs")" 2>/dev/null || echo "")"
work="$(basename "$abs")" work="$(basename "$abs" 2>/dev/null || echo "root")"
if [ -z "$parent" ] || [ "$parent" = "/" ] || [ "$parent" = "." ]; then
parent="workspace"
fi
if [ -z "$work" ] || [ "$work" = "/" ] || [ "$work" = "." ]; then
work="root"
fi
slug="$(printf '%s-%s' "$parent" "$work" | tr '[:upper:]' '[:lower:]' | tr '_' '-')" slug="$(printf '%s-%s' "$parent" "$work" | tr '[:upper:]' '[:lower:]' | tr '_' '-')"
slug="$(printf '%s' "$slug" | tr -cd 'a-zA-Z0-9-')"
printf '%s-creator-%s' "$slug" "$agent" printf '%s-creator-%s' "$slug" "$agent"
} }
@@ -189,13 +196,35 @@ derive_session_name() {
# inside the script — never spliced into the source. Read-only by convention; # inside the script — never spliced into the source. Read-only by convention;
# use atomic_dump_yaml when you need to write the YAML. # use atomic_dump_yaml when you need to write the YAML.
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
#######################################
# Validate the NAME part of a caller-supplied NAME=VALUE environment override
# before it is handed to `env`.
#
# A name is accepted only if it is a plain ASCII identifier
# ([A-Za-z_][A-Za-z0-9_]*) and is not one of the variables that let the
# caller inject code into the child interpreter: the dynamic-loader hooks of
# glibc (LD_*) and macOS dyld (DYLD_*), and the Python start-up/search-path
# variables.
#
# Arguments: $1 - candidate environment variable name
# Outputs:   one "ERROR: ..." line on stderr when the name is rejected
# Returns:   0 if the name is acceptable, 1 if it is malformed or blocked
#######################################
_validate_env_key() {
  # ${1-} keeps a missing argument from aborting under `set -u`; an empty
  # name simply fails the identifier check below.
  local key="${1-}"
  # Bracket ranges such as a-z follow the current locale's collation rules,
  # so pin the C locale for the match to accept ASCII letters only. Being
  # `local`, the override is undone when the function returns.
  local LC_ALL=C
  if [[ ! "$key" =~ ^[a-zA-Z_][a-zA-Z0-9_]*$ ]]; then
    printf 'ERROR: Invalid environment variable name: %s\n' "$key" >&2
    return 1
  fi
  case "$key" in
    LD_PRELOAD | LD_LIBRARY_PATH | LD_AUDIT | \
      DYLD_INSERT_LIBRARIES | DYLD_LIBRARY_PATH | DYLD_FRAMEWORK_PATH | \
      PYTHONPATH | PYTHONHOME | PYTHONINSPECT | PYTHONSTARTUP)
      printf 'ERROR: Blocked environment variable: %s\n' "$key" >&2
      return 1
      ;;
  esac
  return 0
}
env_python() { env_python() {
local yaml_path="$1"; shift local yaml_path="$1"; shift
local -a envs=("YAML_PATH=$yaml_path" "HOME_DIR=$HOME_DIR" "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR" "LOCAL_BIN=$LOCAL_BIN") local -a envs=("YAML_PATH=$yaml_path" "HOME_DIR=$HOME_DIR" "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR" "LOCAL_BIN=$LOCAL_BIN")
while [ $# -gt 0 ]; do while [ $# -gt 0 ]; do
case "$1" in case "$1" in
*=*) envs+=("$1"); shift ;; *=*)
*) break ;; local key="${1%%=*}"
_validate_env_key "$key" || return 1
envs+=("$1")
shift
;;
*)
break
;;
esac esac
done done
env "${envs[@]}" python3 - "$@" env "${envs[@]}" python3 - "$@"
@@ -233,8 +262,15 @@ atomic_dump_yaml() {
local -a envs=("YAML_PATH=$yaml_path" "HOME_DIR=$HOME_DIR" "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR" "LOCAL_BIN=$LOCAL_BIN") local -a envs=("YAML_PATH=$yaml_path" "HOME_DIR=$HOME_DIR" "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR" "LOCAL_BIN=$LOCAL_BIN")
while [ $# -gt 0 ]; do while [ $# -gt 0 ]; do
case "$1" in case "$1" in
*=*) envs+=("$1"); shift ;; *=*)
*) break ;; local key="${1%%=*}"
_validate_env_key "$key" || return 1
envs+=("$1")
shift
;;
*)
break
;;
esac esac
done done
local mutation; mutation="$(cat)" local mutation; mutation="$(cat)"
@@ -110,7 +110,7 @@ spawn() {
nohup "$WRAPPER" >/dev/null 2>&1 & nohup "$WRAPPER" >/dev/null 2>&1 &
disown disown
else else
_tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "claude" _tmux new-session -d -s "$SESSION_NAME" -x 140 -y 40 -c "$WORKSPACE" "claude --dangerously-skip-permissions"
fi fi
;; ;;
agy) agy)
@@ -142,7 +142,7 @@ NOW_ISO=$(date -u +'%Y-%m-%dT%H:%M:%SZ')
# cmd_full 결정 # cmd_full 결정
case "$AGENT" in case "$AGENT" in
claude) CMD_FULL='claude' ;; claude) CMD_FULL='claude --dangerously-skip-permissions' ;;
agy) CMD_FULL='agy --dangerously-skip-permissions' ;; agy) CMD_FULL='agy --dangerously-skip-permissions' ;;
hermes) CMD_FULL='hermes' ;; hermes) CMD_FULL='hermes' ;;
esac esac
@@ -158,7 +158,7 @@ case "$AGENT" in
if [ -x "$WRAPPER" ]; then if [ -x "$WRAPPER" ]; then
START_CMD="$WRAPPER # ~/.local/bin 의 래퍼" START_CMD="$WRAPPER # ~/.local/bin 의 래퍼"
else else
START_CMD="$local_tmux new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"claude\"" START_CMD="$local_tmux new-session -d -s \"$SESSION_NAME\" -x 140 -y 40 -c \"$WORKSPACE\" \"claude --dangerously-skip-permissions\""
fi fi
;; ;;
agy|hermes) agy|hermes)
@@ -164,12 +164,10 @@ run_agent() {
# The user attaches with `tmux attach -t <session>` and types follow-up # The user attaches with `tmux attach -t <session>` and types follow-up
# prompts themselves. We pre-load the first prompt via stdin and `read` # prompts themselves. We pre-load the first prompt via stdin and `read`
# keeps the pane open after the agent exits so the user can review. # keeps the pane open after the agent exits so the user can review.
case "$AGENT" in if [ "$AGENT" = "human" ]; then
claude-code) bin="claude";; echo "[human agent] complete the task, then run publish_event.py --event completed"
codex) bin="codex";; return
human) echo "[human agent] complete the task, then run publish_event.py --event completed"; return;; fi
*) bin="$AGENT";;
esac
if [[ "$DRY_RUN" == "1" ]]; then if [[ "$DRY_RUN" == "1" ]]; then
echo "[dry-run] would launch agent '$AGENT' in a fresh tmux session with instructions:" echo "[dry-run] would launch agent '$AGENT' in a fresh tmux session with instructions:"
@@ -182,21 +180,17 @@ run_agent() {
echo " Install with: brew install tmux (or your package manager)" >&2 echo " Install with: brew install tmux (or your package manager)" >&2
return 1 return 1
fi fi
if ! command -v "$bin" >/dev/null 2>&1; then
echo "ERROR: agent binary '$bin' not found in PATH." >&2 local _tmux="tmux"
return 1 if [ -n "${TMUX_SERVER_NAME:-}" ]; then
_tmux="tmux -L $TMUX_SERVER_NAME"
fi fi
local sess="${AGENT_SESSION#tmux:}" local sess="${AGENT_SESSION#tmux:}"
# Detect a stale session with the same name (e.g. the user is still attached
# from an earlier run, or a previous wrapper died without cleanup). tmux if ! $_tmux has-session -t "$sess" 2>/dev/null; then
# new-session on an existing name fails silently; check first and fail loud. echo "ERROR: 에이전트 세션 '$sess'이 존재하지 않습니다. 작업을 위임하기 전에 먼저 에이전트 세션을 기동해 주세요." >&2
if tmux has-session -t "$sess" 2>/dev/null; then echo " 팁: 'multi-agent-mux-resume' 또는 'multi-agent-mux-create'를 통해 에이전트를 먼저 생성할 수 있습니다." >&2
local attached
attached=$(tmux list-clients -t "$sess" 2>/dev/null | wc -l | tr -d ' ')
echo "ERROR: tmux session '$sess' already exists (clients attached: $attached)." >&2
echo " Pick a unique --agent-session (e.g. tmux:demo, tmux:claude-a) or" >&2
echo " kill the stale one first: tmux kill-session -t $sess" >&2
return 1 return 1
fi fi
@@ -206,9 +200,13 @@ run_agent() {
trap 'rc=$?; if [ $rc -ne 0 ]; then "$PY" "$pub_script" --job "$job_id" --event error --detail "agent bootstrap failed (exit $rc)"; fi' EXIT trap 'rc=$?; if [ $rc -ne 0 ]; then "$PY" "$pub_script" --job "$job_id" --event error --detail "agent bootstrap failed (exit $rc)"; fi' EXIT
fi fi
tmux new-session -d -s "$sess" -c "$WORKDIR" \ echo "살아있는 에이전트 세션 '$sess'에 작업을 위임합니다..."
"printf '%s' \"$instructions\" | $bin --dangerously-skip-permissions; echo; echo '--- agent exited (job $job_id); press enter to close ---'; read" $_tmux set-buffer -b "job_buf_$job_id" "$instructions"
echo "agent launched in tmux session: $sess (attach with: tmux attach -t $sess)" $_tmux paste-buffer -b "job_buf_$job_id" -t "$sess"
$_tmux send-keys -t "$sess" C-m
$_tmux delete-buffer -b "job_buf_$job_id"
echo "작업이 세션 '$sess'에 전송되었습니다. (연결하려면: $_tmux attach -t $sess)"
trap - EXIT trap - EXIT
} }
@@ -328,8 +328,8 @@ def update_job_status(job_id: str, registry_dir: str = DEFAULT_REGISTRY_DIR, **f
This is the single chokepoint for status writes (both ``registry.update_status`` This is the single chokepoint for status writes (both ``registry.update_status``
and ``publish_event.py``'s status sync route through here), so it also mirrors and ``publish_event.py``'s status sync route through here), so it also mirrors
any ``status`` change into the persistent audit log — best-effort, after the any ``status`` change into the persistent audit log. We perform the log mirror
registry lock is released so a slow/failed log write never blocks the record.""" under the lock to guarantee sequential consistency in audit history."""
with registry_lock(registry_dir): with registry_lock(registry_dir):
record = load_job(job_id, registry_dir) record = load_job(job_id, registry_dir)
old_status = record.get("status") old_status = record.get("status")
@@ -410,6 +410,21 @@ def _file_lock(fh):
fcntl.flock(fh.fileno(), fcntl.LOCK_UN) fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
def _redact_dict(d: Any) -> Any:
"""Recursively mask sensitive values (passwords, secrets, tokens) inside logs."""
if isinstance(d, dict):
redacted = {}
for k, v in d.items():
if any(s in k.lower() for s in ("password", "token", "secret", "auth_token", "key")):
redacted[k] = "[REDACTED]"
else:
redacted[k] = _redact_dict(v)
return redacted
elif isinstance(d, list):
return [_redact_dict(item) for item in d]
return d
def append_event(job_id: str, event_dict: Dict[str, Any], logs_dir: Optional[str] = None) -> None: def append_event(job_id: str, event_dict: Dict[str, Any], logs_dir: Optional[str] = None) -> None:
"""Append one event as a JSON line to ``<logs>/<job_id>/events.ndjson``. """Append one event as a JSON line to ``<logs>/<job_id>/events.ndjson``.
@@ -418,7 +433,7 @@ def append_event(job_id: str, event_dict: Dict[str, Any], logs_dir: Optional[str
try: try:
path = job_log_path(job_id, EVENTS_FILENAME, logs_dir) path = job_log_path(job_id, EVENTS_FILENAME, logs_dir)
path.parent.mkdir(parents=True, exist_ok=True) path.parent.mkdir(parents=True, exist_ok=True)
record = dict(event_dict) record = _redact_dict(dict(event_dict))
record.setdefault("logged_at", _utcnow_precise()) record.setdefault("logged_at", _utcnow_precise())
line = json.dumps(record, ensure_ascii=False) + "\n" line = json.dumps(record, ensure_ascii=False) + "\n"
with open(path, "a", encoding="utf-8") as fh: with open(path, "a", encoding="utf-8") as fh:
@@ -453,8 +468,9 @@ def init_job_log(job_id: str, meta: Dict[str, Any], logs_dir: Optional[str] = No
try: try:
d = job_log_dir(job_id, logs_dir) d = job_log_dir(job_id, logs_dir)
d.mkdir(parents=True, exist_ok=True) d.mkdir(parents=True, exist_ok=True)
meta_redacted = _redact_dict(meta)
with open(d / META_FILENAME, "w", encoding="utf-8") as fh: with open(d / META_FILENAME, "w", encoding="utf-8") as fh:
json.dump(meta, fh, ensure_ascii=False, indent=2) json.dump(meta_redacted, fh, ensure_ascii=False, indent=2)
fh.write("\n") fh.write("\n")
status = meta.get("status", "pending") status = meta.get("status", "pending")
update_logged_status( update_logged_status(
@@ -410,7 +410,7 @@ if tmux_confirmed:
if not pm: if not pm:
continue continue
agent = 'claude' if name.endswith('-creator-claude') else 'agy' agent = 'claude' if name.endswith('-creator-claude') else 'agy'
cmd_full = 'claude' if agent == 'claude' else 'agy --dangerously-skip-permissions' cmd_full = 'claude --dangerously-skip-permissions' if agent == 'claude' else 'agy --dangerously-skip-permissions'
server_opt = f"-L {srv} " if srv != 'default' else "" server_opt = f"-L {srv} " if srv != 'default' else ""
entry = { entry = {
'name': name, 'name': name,
+57
View File
@@ -33,6 +33,63 @@ Tmux와 MQTT 브로커를 기반으로 구축된 고신뢰성 **다중 에이전
--- ---
## 📐 전체 아키텍처 구성 (Big-Picture Architecture)
이 시스템은 크게 두 가지 계층(Layer)을 통해 다중 워크스페이스에서 작동하는 LLM 에이전트들을 조율합니다:
1. **Layer A — Tmux 오케스트레이션 (lib.sh + status/resume/stop/create)**: 워크스페이스별 에이전트 세션을 독립된 tmux 인스턴스로 분리 실행하고, `.mam/agent-sessions.yaml` 및 SQLite 데이터베이스(`.mam/agent-sessions.db`)를 통해 에이전트 세션 메타데이터의 단일 참조 지점(Single Source of Truth)을 유지합니다.
2. **Layer B — 비동기 잡 위임 (delegate-job)**: 에이전트에 특정 태스크를 전송하고 비동기 이벤트 채널(MQTT)을 통해 진행 상황과 완료 여부를 모니터링합니다.
두 레이어는 파일 I/O 처리를 위한 하나의 핵심 관문인 `lib.sh::atomic_dump_yaml`을 공유합니다. 모든 YAML/DB 쓰기 작업은 독점 파일 락(`flock`)과 데이터 스키마 유효성 검증을 거칩니다.
### 데이터 흐름 개요 (Data Flow)
```text
+-----------+ register_job +-------------------+
| delegator | ---------------> | .mam/jobs/<id>.json| <-- 실시간 잡 정보
+-----------+ +---------+---------+
|
| atomic rename + fsync
v
+-----------------+
| audit log | <-- 추가 전용
| .mam/delegate_ | events.ndjson
| job_logs/<id>/ |
+--------+--------+
^
| (최선 노력 미러링)
|
+-----------+ publish_event +-----+-----+ +---------+
| agent | ---------------> | MQTT broker | <--- | monitor |
| (claude) | +-------------+ +----+----+
+-----------+ |
^ v
| 구독자(subscriber) atomic_dump_yaml
| (job_subscriber.py) (.mam/agent-sessions.yaml)
| ^
+-------- 위임 대기 영역 -----------------+ |
+---+---+
| reconcil|
| e.sh |
+--------+
```
### 🔒 Tmux 서버 격리 (Tmux Server Isolation)
에이전트 세션 간의 충돌 및 시스템 전역 tmux 프로세스와의 혼선을 막기 위해 독립된 서버 소켓 환경을 보장합니다:
* **워크스페이스별 심(Shim):** `_init_tmux_isolation` 및 `_resolve_real_tmux_path` 함수가 `/tmp/multi-agent-tmux-shim/<TMUX_SERVER_NAME>/tmux` 경로에 독립된 심 디렉터리를 구성하고, 일반 tmux 명령 실행 시 자동으로 `tmux -L <server>` 형태의 독립 소켓 서버를 사용하게 만듭니다.
* **PATH 환경변수 변조:** 자식 프로세스를 생성할 때 `PATH` 변수 맨 앞에 심 디렉터리 경로를 삽입합니다. 이로 인해 에이전트의 내부 셸에서 수행되는 모든 `tmux` 명령어는 해당 격리 서버 소켓으로 강제 제약됩니다.
* **환경 복구:** `TMUX_SERVER_NAME`을 `default`로 설정하는 경우 PATH 오버라이드가 정리되고 기본 전역 tmux 서버를 사용하게 됩니다.
### 🛡️ 동시성 설계 및 쓰기 직렬화
여러 에이전트가 동시에 실행될 때의 레이스 컨디션을 방지하기 위해 락 기반의 실행 패턴을 고수합니다:
* **POSIX 파일 락 (`flock`):** `agent-sessions.yaml` 또는 SQLite 레지스트리에 쓰기 연산을 진행할 때, 반드시 `lib.sh` 내부의 `atomic_dump_yaml` 함수를 거쳐 `.mam/agent-sessions.yaml.lock` 파일에 독점 락(`flock`)을 획득하도록 직렬화합니다.
* **이중 인터프리터 분리 구조:** 라이브러리 간 의존성 충돌과 실행 도구의 안정성을 보장하기 위해 환경을 이원화했습니다. MQTT 및 비동기 작업 통신에는 가상환경 `.venv` (paho-mqtt 필요)의 Python을 사용하고, YAML 직렬화 쓰기 및 유효성 검증을 담당하는 `atomic_dump_yaml`은 시스템 전역 `python3` (시스템 PyYAML 필요)을 호출합니다.
* **NFS 및 네트워크 파일시스템 대응:** 네트워크 디바이스(NFS, CIFS, SSHFS)에서는 `flock`이 무력화되는 특성이 있습니다. `lib.sh`는 쓰기 대상 파일시스템 경로의 마운트 타입을 체크하여, 네트워크 파일시스템 감지 시 경고 로그를 출력하고 SQLite의 저널 모드를 `WAL`에서 `DELETE`로 자동 전환해 동시성 안전을 강화합니다.
---
## 📐 아키텍처 및 조정 루프 (Review Loop) ## 📐 아키텍처 및 조정 루프 (Review Loop)
Project Manager(PM), Worker, Reviewer 역할 간의 협업 구조는 엄격한 교차 검증 루프를 따릅니다: Project Manager(PM), Worker, Reviewer 역할 간의 협업 구조는 엄격한 교차 검증 루프를 따릅니다:
+57
View File
@@ -33,6 +33,63 @@ All orchestration functionalities are structured under the `.agents/skills/` dir
--- ---
## 📐 Big-Picture Architecture
The system coordinates LLM agents across multiple workspaces through two core layers:
1. **Layer A — Tmux Orchestration (lib.sh + status/resume/stop/create)**: Runs the agents (one tmux session per agent-workspace combination) and maintains an authoritative registry in `.mam/agent-sessions.yaml` (+ `.mam/agent-sessions.db`).
2. **Layer B — Async Job Delegation (delegate-job)**: Dispatches a task to an agent and observes progress and completion via an event channel.
These two layers share one lock-guarded chokepoint for file I/O: `lib.sh::atomic_dump_yaml`. Every write is protected by an exclusive `flock` and schema validation.
### Data Flow Overview
```text
+-----------+ register_job +-------------------+
| delegator | ---------------> | .mam/jobs/<id>.json| <-- live record
+-----------+ +---------+---------+
|
| atomic rename + fsync
v
+-----------------+
| audit log | <-- append-only
| .mam/delegate_ | events.ndjson
| job_logs/<id>/ |
+--------+--------+
^
| (best-effort mirrors)
|
+-----------+ publish_event +-----+-----+ +---------+
| agent | ---------------> | MQTT broker | <--- | monitor |
| (claude) | +-------------+ +----+----+
+-----------+ |
^ v
| subscriber atomic_dump_yaml
| (job_subscriber.py) (.mam/agent-sessions.yaml)
| ^
+-------- delegator waits here ----------+ |
+---+---+
| reconcil|
| e.sh |
+--------+
```
### 🔒 Tmux Server Isolation
To prevent workspace tmux processes from interfering with each other or with system tmux servers, the framework enforces isolated tmux environments:
* **Per-Workspace Shim:** `_init_tmux_isolation` and `_resolve_real_tmux_path` instantiate a per-workspace shim directory under `/tmp/multi-agent-tmux-shim/<TMUX_SERVER_NAME>/tmux` that intercepts tmux commands and wraps them in `tmux -L <server>`.
* **PATH Rewriting:** The `PATH` environment variable is dynamically prepended with the shim path in all child processes. This ensures any `tmux` invocation within the agent's process tree is restricted to its isolated socket server.
* **Environment Restoration:** If `TMUX_SERVER_NAME` is set to `default`, the PATH override is removed, reverting to the default global tmux server.
### 🛡️ Concurrency Design & Write Serialization
The framework implements lock-guarded execution pathways to prevent race conditions during parallel agent operations:
* **POSIX File Locks (`flock`):** Every mutation of `agent-sessions.yaml` and the SQLite registry runs through `atomic_dump_yaml` inside `lib.sh`, which serializes writes via an exclusive `flock` on `.mam/agent-sessions.yaml.lock`.
* **Dual-Interpreter Strategy:** To minimize dependency bloat and guarantee stability, the backplane splits execution environments: the virtual environment `.venv` handles MQTT communication and async jobs (requiring `paho-mqtt`), while the system `python3` executes `atomic_dump_yaml` (relying on system-wide `PyYAML`).
* **NFS and Network FS Safeguards:** Because `flock` is unreliable on network filesystems (NFS, CIFS, SSHFS), `lib.sh` checks the mount type of the target path. If a network mount is detected, it prints a warning and switches SQLite's journal mode from `WAL` to `DELETE`.
---
## 📐 Architecture & Coordination Loop ## 📐 Architecture & Coordination Loop
The interaction between roles (Project Manager, Worker, and Reviewer) is structured as a strict iterative loop: The interaction between roles (Project Manager, Worker, and Reviewer) is structured as a strict iterative loop: