Compare commits

..

3 Commits

9 changed files with 68 additions and 38 deletions
+27 -27
View File
@@ -3,10 +3,10 @@
#
# Single source of truth for the four things that were inconsistently
# re-implemented across create/resume/delete/monitor (REVIEW.md §4.1):
# - derive_session_name : the tmux session slug (P0-A)
# - atomic_dump_yaml : flock + temp+rename + .bak + validate (P0-B)
# - env_python : env-safe Python (no heredoc injection) (P0-B / P1-B)
# - find_workspace_uuid : workspace-SCOPED resume id lookup (P0-C)
# - derive_session_name : the tmux session slug (P0-A)
# - atomic_dump_yaml : SQLite db transaction + temp+rename + .bak + validate (P0-B)
# - env_python : env-safe Python (no heredoc injection) (P0-B / P1-B)
# - find_workspace_uuid : workspace-SCOPED resume id lookup (P0-C)
#
# Source it from each script with a path computed from the script location:
# source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
@@ -234,32 +234,43 @@ env_python() {
# atomic_dump_yaml <yaml_path> [KEY=VALUE ...] (mutation source from stdin)
#
# The ONLY sanctioned way to write agent-sessions.yaml. It:
# 1. takes an exclusive flock on <yaml_path>.lock (serialises all writers)
# 2. loads the YAML into `d`
# 1. takes an exclusive SQLite BEGIN IMMEDIATE transaction lock on
# agent-sessions.db (serialises all writers)
# 2. loads the current state into `d` (seeds from YAML if DB is empty)
# 3. exec()s the caller's mutation source (sees d, yaml, os, datetime,
# timezone, glob, subprocess; reads values via os.environ). The mutation
# may print and may `raise SystemExit(n)` to abort *without* writing.
# 4. validates the resulting schema
# 5. backs up to <yaml_path>.bak, then writes atomically (temp + os.replace)
# 5. backs up to <yaml_path>.bak, then writes YAML atomically (temp + os.replace)
# when a session transitions to a finished state.
#
# The mutation source is passed via env and exec()'d — it is never string
# spliced and untrusted data never lands in Python source (P0-B / P1-B).
# ---------------------------------------------------------------------------
# Check if the workspace is on NFS — flock is unreliable on NFS
_atomic_dump_yaml_check_nfs() {
# Check if the workspace is on NFS — locking behaves differently on NFS
_check_is_nfs() {
local f="$1"
local mountpoint
mountpoint="$(df --output=target "$f" 2>/dev/null | tail -1)" || return 0
mountpoint="$(df --output=target "$f" 2>/dev/null | tail -1)" || return 1
if mount | grep -q "$mountpoint.*nfs\|$mountpoint.*cifs\|$mountpoint.*fuse.sshfs"; then
echo "WARNING: $mountpoint appears to be a network filesystem (NFS/CIFS/SSHFS)." >&2
echo "WARNING: fcntl.flock-based atomic writes are unreliable on network filesystems." >&2
echo "WARNING: SQLite journal_mode automatically falls back to DELETE." >&2
return 0 # is NFS
fi
return 1 # not NFS
}
atomic_dump_yaml() {
local yaml_path="$1"; shift
local -a envs=("YAML_PATH=$yaml_path" "HOME_DIR=$HOME_DIR" "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR" "LOCAL_BIN=$LOCAL_BIN")
if [ -z "${MAM_IS_NFS:-}" ]; then
if _check_is_nfs "$(dirname "$yaml_path")"; then
export MAM_IS_NFS="true"
echo "WARNING: $(dirname "$yaml_path") appears to be a network filesystem (NFS/CIFS/SSHFS)." >&2
echo "WARNING: SQLite journal_mode automatically falls back to DELETE." >&2
else
export MAM_IS_NFS="false"
fi
fi
local -a envs=("YAML_PATH=$yaml_path" "HOME_DIR=$HOME_DIR" "CLAUDE_PROJECT_DIR=$CLAUDE_PROJECT_DIR" "LOCAL_BIN=$LOCAL_BIN" "MAM_IS_NFS=$MAM_IS_NFS")
while [ $# -gt 0 ]; do
case "$1" in
*=*)
@@ -312,19 +323,8 @@ for f in [db_path, db_path + '-wal', db_path + '-shm']:
except Exception:
pass
def is_nfs(path):
try:
df_out = subprocess.check_output(['df', '--output=target', path], text=True, stderr=subprocess.DEVNULL)
target = df_out.strip().split('\n')[-1].strip()
mount_out = subprocess.check_output(['mount'], text=True)
for line in mount_out.split('\n'):
if f" on {target} " in line and (' type nfs ' in line or ' type cifs ' in line or ' fuse.sshfs ' in line):
return True
except Exception:
pass
return False
if is_nfs(os.path.dirname(db_path) or '.'):
is_nfs = os.environ.get('MAM_IS_NFS') == 'true'
if is_nfs:
conn.execute('PRAGMA journal_mode=DELETE')
else:
conn.execute('PRAGMA journal_mode=WAL')
@@ -106,7 +106,7 @@ WRAPPER="$LOCAL_BIN/$SESSION_NAME"
spawn() {
case "$AGENT" in
claude)
if [ -x "$WRAPPER" ] || [ "$USE_WRAPPER" = "1" ]; then
if { [ -x "$WRAPPER" ] && [ "$(basename "$WRAPPER")" != "claude" ]; } || [ "$USE_WRAPPER" = "1" ]; then
nohup "$WRAPPER" >/dev/null 2>&1 &
disown
else
+1
View File
@@ -25,6 +25,7 @@
| **FW-W4** | 구독자 시퀀스 번호(last_seq)의 디스크 영속화 | P1 (High) | 중 | **워크플로우 / 보안**: 와치독 재기동 시 시퀀스 카운터가 리셋되는 구조적 취약을 방지하기 위해 `subscriber.last_seq`를 디스크/DB에 기록하여 잡 라이프타임 전체를 커버하는 Replay 방어선 유지 | 없음 |
| **FW-W5** | 리뷰어 판정을 위한 구조적 메시지 스키마 정의 | P2 (Medium) | 중 | **워크플로우**: PM 에이전트가 터미널 스크롤백 문자열을 무가공 grep 파싱하는 대신, 전용 리뷰 피드백 토픽(예: `reviews/<job_id>/verdicts`) 및 정형화된 JSON 포맷(`PASS`/`NOT_PASS` + 차단 요인) 도입 | 없음 |
| **FW-W6** | 모니터링 복구 루프의 Hermes 에이전트 지원 확장 | P2 (Medium) | 중 | **워크플로우 / 일관성**: `reconcile.sh` 내 자동 등록(drift-B) 및 ID 동기화(drift-C) 로직에 `hermes` 세션을 완전 편입시켜 Claude/Agy 세션과 동일한 모니터링 및 복구 수준 지원 | 없음 |
| **FW-W7** | derive_session_name 내 디렉터리 경로 슬러그 이름 충돌 해결 | P2 (Medium) | 소 | **워크플로우 / 충돌 방지**: 마지막 2개 디렉터리만 슬러그화할 때 발생하는 동일 이름의 중첩 디렉터리 세션 이름 충돌(예: `/projectA/src``/projectB/src` 가 동일한 세션명으로 슬러그화됨)을 해결하기 위해 워크스페이스 범위 해시 값을 포함하는 세션명 명명 규칙 적용 | 없음 |
---
+1
View File
@@ -25,6 +25,7 @@ Below is the list of pending future work items. These items were proposed based
| **FW-W4** | Persist subscriber sequence numbers alongside job records | P1 (High) | Medium | **Workflow / Security**: Persist `subscriber.last_seq` to disk or SQLite to prevent sequence counter reset on subscriber restart, locking down the replay defense window for the full job lifetime. | None |
| **FW-W5** | Define structured message schema for reviewer verdicts | P2 (Medium) | Medium | **Workflow**: Create a dedicated reviewer topic (e.g., `reviews/<job_id>/verdicts`) emitting structured JSON verdicts (`PASS` / `NOT_PASS` + details) to eliminate raw text grepping by the PM. | None |
| **FW-W6** | Expand monitor reconciliation support to Hermes agent | P2 (Medium) | Medium | **Workflow / Consistency**: Fully integrate `hermes` sessions into auto-registration (drift-B) and ID materialization (drift-C) under `reconcile.sh` to match Claude/Agy monitoring coverage. | None |
| **FW-W7** | Resolve path slug collisions in derive_session_name | P2 (Medium) | Small | **Workflow / Collision Avoidance**: Update `derive_session_name` to handle same-name nested directories (e.g. `/projectA/src` and `/projectB/src` both slugify to identical session names) by incorporating workspace-scoped identifiers or hash digests. | None |
---
+2 -2
View File
@@ -219,9 +219,9 @@ stateDiagram-v2
Two concurrency control schemes co-exist in this workspace to coordinate state modification:
1. **`lib.sh::atomic_dump_yaml()`**: Used for workspace-wide tmux session inventory (`agent-sessions.yaml`).
* **Locking**: Uses POSIX advisory locking via python's `fcntl.flock(lock_fh, fcntl.LOCK_EX)` over a sidecar lock file `<yaml_path>.lock`.
* **Locking**: Uses SQLite database transaction serialization via `BEGIN IMMEDIATE` on `agent-sessions.db`.
* **Safe Mutation**: The mutation source code is passed in an environment variable `AGENT_SESSIONS_MUTATION` and executed dynamically using `exec(compile(..., 'exec'), globals())`. This isolates the execution and avoids command-injection vectors.
* **Atomicity**: Writes to a temp file in the same directory using `tempfile.mkstemp()`, then performs an `os.replace()` rename. POSIX guarantees the replacement is atomic, preventing half-written YAML reads. A `.bak` backup copy is also preserved.
* **Atomicity**: Updates the SQLite tables and then, if a session transitions to a finished state, writes to a temp file in the same directory using `tempfile.mkstemp()` and performs an `os.replace()` rename. POSIX guarantees the replacement is atomic, preventing half-written YAML reads. A `.bak` backup copy is also preserved.
2. **`registry.py::register_job() / pick_pending() / _atomic_write_record()`**: Used for job-level metadata JSON files (`<job_id>.json`).
* **Locking**: Wraps operations in a `registry_lock(registry_dir)` context manager, implementing an advisory exclusive lock on `.lock` via `fcntl.flock`.
* **Atomicity**: In `_atomic_write_record()`, it uses `tempfile.mkstemp` inside the parent registry folder, serializes the updated job record to the temp file, flushes it, triggers a physical disk sync via `os.fsync(fh.fileno())`, and executes `os.replace` to replace the main JSON record file. The file permission is restricted to `0o600` immediately.
+3 -3
View File
@@ -40,7 +40,7 @@ Tmux와 MQTT 브로커를 기반으로 구축된 고신뢰성 **다중 에이전
1. **Layer A — Tmux 오케스트레이션 (lib.sh + status/resume/stop/create)**: 워크스페이스별 에이전트 세션을 독립된 tmux 인스턴스로 분리 실행하고, `.mam/agent-sessions.yaml` 및 SQLite 데이터베이스(`.mam/agent-sessions.db`)를 통해 에이전트 세션 메타데이터의 단일 참조 지점(Single Source of Truth)을 유지합니다.
2. **Layer B — 비동기 잡 위임 (delegate-job)**: 에이전트에 특정 태스크를 전송하고 비동기 이벤트 채널(MQTT)을 통해 진행 상황과 완료 여부를 모니터링합니다.
두 레이어는 파일 I/O 처리를 위한 하나의 핵심 관문인 `lib.sh::atomic_dump_yaml`을 공유합니다. 모든 YAML/DB 쓰기 작업은 독점 파일 락(`flock`)과 데이터 스키마 유효성 검증을 거칩니다.
두 레이어는 파일 I/O 처리를 위한 하나의 핵심 관문인 `lib.sh::atomic_dump_yaml`을 공유합니다. 모든 YAML/DB 쓰기 작업은 SQLite 데이터베이스 트랜잭션 락과 데이터 스키마 유효성 검증을 거칩니다.
### 데이터 흐름 개요 (Data Flow)
@@ -84,9 +84,9 @@ Tmux와 MQTT 브로커를 기반으로 구축된 고신뢰성 **다중 에이전
### 🛡️ 동시성 설계 및 쓰기 직렬화
여러 에이전트가 동시에 실행될 때의 레이스 컨디션을 방지하기 위해 락 기반의 실행 패턴을 고수합니다:
* **POSIX 파일 락 (`flock`):** `agent-sessions.yaml` 또는 SQLite 레지스트리에 쓰기 연산을 진행할 때, 반드시 `lib.sh` 내부의 `atomic_dump_yaml` 함수를 거쳐 `.mam/agent-sessions.yaml.lock` 파일에 독점 락(`flock`)을 획득하도록 직렬화합니다.
* **SQLite 데이터베이스 락 (`BEGIN IMMEDIATE`):** `agent-sessions.yaml` 또는 SQLite 레지스트리에 쓰기 연산을 진행할 때, 반드시 `lib.sh` 내부의 `atomic_dump_yaml` 함수를 거쳐 SQLite 데이터베이스 `.mam/agent-sessions.db``BEGIN IMMEDIATE` 트랜잭션 락을 획득하도록 직렬화합니다.
* **이중 인터프리터 분리 구조:** 라이브러리 간 의존성 충돌과 실행 도구의 안정성을 보장하기 위해 환경을 이원화했습니다. MQTT 및 비동기 작업 통신에는 가상환경 `.venv` (paho-mqtt 필요)의 Python을 사용하고, YAML 직렬화 쓰기 및 유효성 검증을 담당하는 `atomic_dump_yaml`은 시스템 전역 `python3` (시스템 PyYAML 필요)을 호출합니다.
* **NFS 및 네트워크 파일시스템 대응:** 네트워크 디바이스(NFS, CIFS, SSHFS)에서는 `flock`이 무력화되는 특성이 있습니다. `lib.sh`는 쓰기 대상 파일시스템 경로의 마운트 타입을 체크하여, 네트워크 파일시스템 감지 시 경고 로그를 출력하고 SQLite의 저널 모드를 `WAL`에서 `DELETE`로 자동 전환해 동시성 안전을 강화합니다.
* **NFS 및 네트워크 파일시스템 대응:** 네트워크 디바이스(NFS, CIFS, SSHFS)에서는 파일 락(`flock`) 및 SQLite WAL 기능이 오작동할 수 있습니다. `lib.sh`는 쓰기 대상 파일시스템 경로의 마운트 타입을 체크하여, 네트워크 파일시스템 감지 시 경고 로그를 출력하고 SQLite의 저널 모드를 `WAL`에서 `DELETE`로 자동 전환해 동시성 안전을 강화합니다.
---
+3 -3
View File
@@ -40,7 +40,7 @@ The system coordinates LLM agents across multiple workspaces through two core la
1. **Layer A — Tmux Orchestration (lib.sh + status/resume/stop/create)**: Runs the agents (one tmux session per agent-workspace combination) and maintains an authoritative registry in `.mam/agent-sessions.yaml` (+ `.mam/agent-sessions.db`).
2. **Layer B — Async Job Delegation (delegate-job)**: Dispatches a task to an agent and observes progress and completion via an event channel.
These two layers share one lock-guarded chokepoint for file I/O: `lib.sh::atomic_dump_yaml`. Every write is protected by an exclusive `flock` and schema validation.
These two layers share one lock-guarded chokepoint for file I/O: `lib.sh::atomic_dump_yaml`. Every write is protected by an exclusive SQLite database transaction lock and schema validation.
### Data Flow Overview
@@ -84,9 +84,9 @@ To prevent workspace tmux processes from interfering with each other or with sys
### 🛡️ Concurrency Design & Write Serialization
The framework implements lock-guarded execution pathways to prevent race conditions during parallel agent operations:
* **POSIX File Locks (`flock`):** Every mutation of `agent-sessions.yaml` and the SQLite registry runs through `atomic_dump_yaml` inside `lib.sh`, which serializes writes via an exclusive `flock` on `.mam/agent-sessions.yaml.lock`.
* **SQLite Database Locks (`BEGIN IMMEDIATE`):** Every mutation of `agent-sessions.yaml` and the SQLite registry runs through `atomic_dump_yaml` inside `lib.sh`, which serializes writes via an exclusive `BEGIN IMMEDIATE` transaction lock on the SQLite database `.mam/agent-sessions.db`.
* **Dual-Interpreter Strategy:** To minimize dependency bloat and guarantee stability, the backplane splits execution environments: the virtual environment `.venv` handles MQTT communication and async jobs (requiring `paho-mqtt`), while the system `python3` executes `atomic_dump_yaml` (relying on system-wide `PyYAML`).
* **NFS and Network FS Safeguards:** Since `flock` behaves unreliably over network protocols (NFS, CIFS, SSHFS), `lib.sh` performs filesystem detection. If a network mount is identified, it outputs a safety warning and SQLite automatically switches its journaling mode from `WAL` to `DELETE`.
* **NFS and Network FS Safeguards:** Since file locking (`flock`) and SQLite WAL behave unreliably over network protocols (NFS, CIFS, SSHFS), `lib.sh` performs filesystem detection. If a network mount is identified, it outputs a safety warning and SQLite automatically switches its journaling mode from `WAL` to `DELETE`.
---
+1 -1
View File
@@ -18,7 +18,7 @@ This directory contains packaging templates and installation scripts to deploy t
Once you push this repository to your Gitea instance, users can install it in their local workspace directory by running:
```bash
curl -fsSL https://<your-gitea-domain>/<username>/multi-agent-mux/raw/branch/main/deploy/install.sh | bash
curl -fsSL https://git.godopu.com/tmpl/multi-agent-mux/raw/branch/main/deploy/install.sh | bash
```
Alternatively, if they have cloned the repository, they can execute:
+29 -1
View File
@@ -42,8 +42,36 @@ else
exit 1
fi
# Verify PyYAML (needed by system python3 for atomic state writes)
if ! python3 -c "import yaml" &>/dev/null; then
echo "❌ Error: 'PyYAML' is not installed in the system python3. Please install it first" >&2
echo " (e.g., 'pip3 install PyYAML' or 'sudo apt-get install python3-yaml')." >&2
exit 1
fi
echo "✅ PyYAML (system dependency) detected."
# --- 2. Workspace Setup ---
mkdir -p "$TARGET_DIR"
cd "$TARGET_DIR"
# Download .agents/skills if missing (for curl one-liner installs)
if [ ! -d ".agents/skills" ]; then
echo "📥 .agents/skills not found. Fetching from Gitea repository..."
if command -v git &>/dev/null && [ -z "$(ls -A 2>/dev/null || echo "")" ]; then
echo "🌐 Cloning Gitea repository..."
git clone "https://git.godopu.com/tmpl/multi-agent-mux.git" .
else
echo "🌐 Downloading and extracting skills archive..."
curl -fsSL "https://git.godopu.com/tmpl/multi-agent-mux/archive/main.tar.gz" | tar -xz --strip-components=1
fi
if [ ! -d ".agents/skills" ]; then
echo "❌ Error: Fetch completed but '.agents/skills' is still missing. Target layout might be invalid." >&2
exit 1
fi
echo "✅ Skills downloaded successfully."
fi
echo "📂 Ensuring metadata directory structure (.mam/)..."
mkdir -p .mam/jobs .mam/delegate_job_logs
@@ -59,7 +87,7 @@ if command -v df &>/dev/null && command -v mount &>/dev/null; then
if [ -n "$MOUNTPOINT" ]; then
if mount | grep -q "$MOUNTPOINT.*nfs\|$MOUNTPOINT.*cifs\|$MOUNTPOINT.*fuse.sshfs"; then
echo "⚠️ WARNING: Target directory is on a network filesystem (NFS/CIFS/SSHFS)."
echo " File locks (fcntl.flock) are UNRELIABLE on network storage."
echo " SQLite WAL journaling and file locks are UNRELIABLE on network storage."
echo " The sqlite3 registry will fall back to 'DELETE' journaling instead of WAL."
else
echo "✅ File system supports WAL (Local storage detected)."