feat(lib): implement FW-N1~FW-N4 items and pane snapshot guidelines

This commit is contained in:
2026-06-21 09:19:46 +00:00
parent 8097df0cbe
commit 5258b5013c
6 changed files with 18 additions and 14 deletions
+5 -3
View File
@@ -153,11 +153,13 @@ Every event payload must adhere to the following schema structure:
--- ---
### 2.4 Integrity and Authentication Verification (Bearer Auth) ### 2.4 Integrity and Authentication Verification (HMAC-SHA256 Signatures)
To prevent unauthorized users from hijacking or spoofing events on public brokers: To prevent unauthorized users from hijacking or spoofing events on public brokers:
1. When a job is registered, a cryptographic token (`auth_token`) is generated (`secrets.token_urlsafe(32)`). 1. When a job is registered, a cryptographic token (`auth_token`) is generated (`secrets.token_urlsafe(32)`).
2. The publisher reads this token from the local job file and injects it into `data.auth_token` for all outgoing messages. 2. The publisher reads this token and signs the JSON payload. Specifically, the publisher calculates an HMAC-SHA256 signature using the `auth_token` as the secret key over the serialized payload (with the `hmac_sig` field excluded).
3. The subscriber (`job_subscriber.py`) reads the expected `auth_token` from the local registry and performs a plaintext bearer-token check on all received messages. Mismatched or missing tokens are discarded immediately. 3. The signature is attached as `data.hmac_sig` on the wire.
4. The subscriber (`job_subscriber.py`) reads the expected `auth_token` from the local registry and verifies the HMAC signature. Any message with a missing, invalid, or mismatched signature is discarded immediately with an "HMAC verify failed" log.
5. The plaintext `auth_token` is never transmitted on the wire, which prevents token interception. As a consequence, all publishers and subscribers must be updated simultaneously during deployment rollout; otherwise events from mismatched versions will be dropped.
--- ---
@@ -349,6 +349,10 @@ has been verified (2026-06-21, 6-batch refactoring sprint):
5. **Batch grouping** — group 2-3 FW items per batch when they touch different 5. **Batch grouping** — group 2-3 FW items per batch when they touch different
files (no file overlap). This amortises the dispatch overhead. Items touching files (no file overlap). This amortises the dispatch overhead. Items touching
the same file must be in separate batches to avoid conflicts. the same file must be in separate batches to avoid conflicts.
6. **Pane Snapshots & Truncation Prevention** — to prevent long agent responses from being scrolled out and truncated due to TUI viewport limitations, enforce the following snapshotting pattern:
- Immediately after dispatching a brief, capture the pre-brief pane buffer via `capture-pane -S -200`.
- During long execution, run a background loop that takes incremental snapshots (e.g. every 30 seconds, appending each one with `>> /tmp/pane-snap.txt`).
- Immediately after job termination, capture the entire final pane state to ensure no terminal logs are lost.
## Verification Checklist ## Verification Checklist
@@ -85,7 +85,7 @@ class _Watcher:
# --- production auth check: data.hmac_sig must verify against auth_token if expected --- # --- production auth check: data.hmac_sig must verify against auth_token if expected ---
expected_token = self.tokens.get(jid) expected_token = self.tokens.get(jid)
if not mqtt_common.verify_hmac(payload, expected_token): if not mqtt_common.verify_hmac(payload, expected_token):
logger.warning("drop event for job %s: auth_token mismatch", jid) logger.warning("drop event for job %s: HMAC verify failed", jid)
return return
# Persistent audit log from the *subscriber's* vantage point: every event # Persistent audit log from the *subscriber's* vantage point: every event
# that survives defensive parsing is recorded here, including ones a # that survives defensive parsing is recorded here, including ones a
@@ -75,11 +75,9 @@ def build_payload(
"detail": detail, "detail": detail,
"data": dict(data) if data else {}, "data": dict(data) if data else {},
} }
# Production: carry the per-job auth token so the subscriber can verify the # Production: carry the per-job HMAC-SHA256 signature in `data.hmac_sig` so
# publisher. The token is compared in plain text (bearer-token style) by the # the subscriber can verify the publisher without exposing the secret token.
# subscriber — NOT an HMAC. See SKILL.md "Auth token" and PLAN 8.2. The # The signature is calculated over the entire payload (with `data.hmac_sig` excluded).
# registry stores the per-job token in `auth_token`; only include it on the
# wire when set so the public broker (no auth) doesn't leak anything.
if auth_token: if auth_token:
sign_payload = {k: v for k, v in payload.items() if k != "data"} sign_payload = {k: v for k, v in payload.items() if k != "data"}
sign_payload["data"] = {k: v for k, v in payload.get("data", {}).items() if k != "hmac_sig"} sign_payload["data"] = {k: v for k, v in payload.get("data", {}).items() if k != "hmac_sig"}
@@ -107,7 +107,7 @@ bash skills/tmux-agent-orchestrate-monitor/scripts/reconcile.sh --subscribe --id
bash skills/tmux-agent-orchestrate-monitor/scripts/reconcile.sh --subscribe --idle-timeout 0 bash skills/tmux-agent-orchestrate-monitor/scripts/reconcile.sh --subscribe --idle-timeout 0
``` ```
Flags: `--once` (single pass), `--emit-diff` (print JSON), `--dry-run` (P1-E — no mutation), `--subscribe` (push-based MQTT subscription monitoring). `--subscribe` sub-flags: `--timeout N` (exit after N seconds of wall-clock; `0` = no limit, default), `--idle-timeout N` (exit after N seconds with no message; default `600`, `0` = never idle-out). On a broker connection failure (connect error **or** non-zero CONNACK), `--subscribe` falls back to a polling loop that re-runs `--once --emit-diff` every `RECONCILE_POLL_INTERVAL` (default 15) seconds until `--timeout`. Terminal-event YAML updates are written through `lib.sh::atomic_dump_yaml` (flock + schema-validate + `.bak`). There are **no** `--workspace` / `--agent` / `--comment-card` flags; the worker turns the emitted JSON `drifts[]` into `kanban_comment` calls itself. Flags: `--once` (single pass), `--emit-diff` (print JSON), `--dry-run` (P1-E — no mutation), `--subscribe` (push-based MQTT subscription monitoring). `--subscribe` sub-flags: `--timeout N` (exit after N seconds of wall-clock; `0` = no limit, default), `--idle-timeout N` (exit after N seconds with no message; default `3600`, `0` = never idle-out). On a broker connection failure (connect error **or** non-zero CONNACK), `--subscribe` falls back to a polling loop that re-runs `--once --emit-diff` every `RECONCILE_POLL_INTERVAL` (default 15) seconds until `--timeout`. Terminal-event YAML updates are written through `lib.sh::atomic_dump_yaml` (flock + schema-validate + `.bak`). There are **no** `--workspace` / `--agent` / `--comment-card` flags; the worker turns the emitted JSON `drifts[]` into `kanban_comment` calls itself.
## Drift classes (what the script handles) ## Drift classes (what the script handles)
@@ -22,10 +22,10 @@ ONCE=0
EMIT_DIFF=0 EMIT_DIFF=0
DRY_RUN=0 DRY_RUN=0
SUBSCRIBE=0 SUBSCRIBE=0
# --subscribe controls (review item 4): 0 = no overall timeout; idle default 600s # --subscribe controls (review item 4): 0 = no overall timeout; idle default 3600s
# (raised from the old hardcoded 120s); idle 0 = never idle-out. # (raised from 600s to align with job timeout defaults); idle 0 = never idle-out.
SUB_TIMEOUT=0 SUB_TIMEOUT=0
SUB_IDLE_TIMEOUT=600 SUB_IDLE_TIMEOUT=3600
POLL_INTERVAL="${RECONCILE_POLL_INTERVAL:-15}" POLL_INTERVAL="${RECONCILE_POLL_INTERVAL:-15}"
while [ $# -gt 0 ]; do while [ $# -gt 0 ]; do
@@ -63,7 +63,7 @@ import os, sys, json, time, subprocess
lib_sh = os.environ.get('LIB_SH', '') lib_sh = os.environ.get('LIB_SH', '')
skills_dir = os.environ.get('SKILLS_DIR', '') skills_dir = os.environ.get('SKILLS_DIR', '')
timeout = int(os.environ.get('SUB_TIMEOUT', '0') or '0') # 0 = no overall timeout timeout = int(os.environ.get('SUB_TIMEOUT', '0') or '0') # 0 = no overall timeout
idle_timeout = int(os.environ.get('SUB_IDLE_TIMEOUT', '600') or '0') # 0 = no idle timeout idle_timeout = int(os.environ.get('SUB_IDLE_TIMEOUT', '3600') or '0') # 0 = no idle timeout
# Locate skills/tmux-agent-orchestrate-delegate-job/scripts to import mqtt_common — relative first, then # Locate skills/tmux-agent-orchestrate-delegate-job/scripts to import mqtt_common — relative first, then
# an upward walk from cwd. No hardcoded absolute path (review item 6). # an upward walk from cwd. No hardcoded absolute path (review item 6).