Files
multi-agent-mux/.agents/skills/multi-agent-mux-monitor/scripts/reconcile.sh
T

484 lines
19 KiB
Bash
Executable File

#!/usr/bin/env bash
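# reconcile.sh — helper script for multi-agent-mux-monitor
# Detects drift between YAML ↔ tmux ↔ on-disk artifacts (+ automatically updates the YAML).
#
# Usage:
# bash reconcile.sh --once --emit-diff # detect drift + update
# bash reconcile.sh --once --emit-diff --dry-run # compute drift only, no writes (P1-E)
# bash reconcile.sh --subscribe [--timeout N] [--idle-timeout N] # react to MQTT job events; falls back to polling when the broker is unavailable
#
# --dry-run: read-only, no side effects. Safe for "what is running right now?" queries.
# The multi-agent-mux-status skill reuses this mode.
#
# Output (JSON): {timestamp, yaml_path, tmux_sessions_alive, tmux_confirmed, drifts, actions}
#
# Exit codes: 0 = ok | 1 = YAML not found | 2 = error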
# Strict mode: abort on command failure, on unset variables, and on failed pipeline stages.
set -euo pipefail
# Load the shared helper library that sits two directories above this script.
# NOTE(review): AGENT_SESSIONS_YAML, WORKSPACE_ROOT, HOME_DIR, CLAUDE_PROJECT_DIR, LOCAL_BIN,
# env_python, atomic_dump_yaml and _delegate_py_bin are used in this file but never defined
# here — presumably lib.sh provides them; confirm.
source "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib.sh"
# Monitor state directory; AGENT_SESSIONS_STATE_DIR overrides the default under WORKSPACE_ROOT.
STATE_DIR="${AGENT_SESSIONS_STATE_DIR:-$WORKSPACE_ROOT/.cache/multi-agent-mux-monitor}"
# Flag defaults; the argument parser sets each to 1 when the matching option is given.
ONCE=0
EMIT_DIFF=0
DRY_RUN=0
SUBSCRIBE=0
# --subscribe controls (review item 4): 0 = no overall timeout; idle default 3600s
# (raised from 600s to align with job timeout defaults); idle 0 = never idle-out.
SUB_TIMEOUT=0
SUB_IDLE_TIMEOUT=3600
# Seconds to sleep between reconcile passes in the polling fallback of --subscribe;
# RECONCILE_POLL_INTERVAL overrides the default of 15.
POLL_INTERVAL="${RECONCILE_POLL_INTERVAL:-15}"
#######################################
# Validate the value supplied to a seconds-valued option.
# Arguments: $1 - option name (for the error message)
#            $2 - number of arguments left, including the option itself
#            $3 - candidate value (may be empty when missing)
# Outputs:   error message on stderr when the value is missing or not a
#            non-negative integer
# Returns:   exits the script with status 2 on invalid input
#######################################
reconcile_require_seconds() {
  local flag=$1 argc=$2 value=$3
  if [ "$argc" -lt 2 ]; then
    echo "ERROR: $flag requires a value (seconds)" >&2
    exit 2
  fi
  case "$value" in
    '' | *[!0-9]*)
      echo "ERROR: $flag expects a non-negative integer, got: $value" >&2
      exit 2
      ;;
  esac
}

#######################################
# Parse command-line options.
# Globals:   ONCE, EMIT_DIFF, DRY_RUN, SUBSCRIBE, SUB_TIMEOUT,
#            SUB_IDLE_TIMEOUT (written)
# Arguments: the script's positional parameters
# Returns:   exits 0 after printing usage for -h/--help; exits 2 on an
#            unknown option or an invalid/missing option value
#######################################
reconcile_parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --once) ONCE=1; shift ;;
      --emit-diff) EMIT_DIFF=1; shift ;;
      --dry-run) DRY_RUN=1; shift ;;
      --subscribe) SUBSCRIBE=1; shift ;;
      --timeout)
        # "${2-}" keeps `set -u` from aborting with an opaque "unbound variable"
        # error when the option is the last argument.
        reconcile_require_seconds "$1" "$#" "${2-}"
        SUB_TIMEOUT="$2"; shift 2 ;;
      --idle-timeout)
        reconcile_require_seconds "$1" "$#" "${2-}"
        SUB_IDLE_TIMEOUT="$2"; shift 2 ;;
      -h|--help) echo "Usage: $0 [--once] [--emit-diff] [--dry-run] [--subscribe [--timeout N] [--idle-timeout N]]"; exit 0 ;;
      *) echo "ERROR: unknown arg: $1" >&2; exit 2 ;;
    esac
  done
}
reconcile_parse_args "$@"
# Refuse to run without the sessions YAML; status 1 is the "YAML not found" exit code.
if [ ! -f "$AGENT_SESSIONS_YAML" ]; then
  echo "ERROR: $AGENT_SESSIONS_YAML not found" >&2
  exit 1
fi
# --subscribe mode: run an embedded Python MQTT client fed on stdin (heredoc below).
# Everything up to the matching `fi` belongs to this mode, which always ends with `exit`.
if [ "$SUBSCRIBE" = "1" ]; then
# Paths resolved relative to this script (review item 6): skills/ dir + lib.sh.
SKILLS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
LIB_SH="$SKILLS_DIR/lib.sh"
# MQTT client lives in the project venv (has paho). All YAML work is delegated
# to lib.sh::atomic_dump_yaml, which runs the system python3 (has PyYAML) — so
# no single interpreter needs both paho and PyYAML (review items 4/5/6).
# NOTE(review): _delegate_py_bin is not defined in this file — presumably it prints
# the venv interpreter path; confirm in lib.sh.
PYBIN="$(_delegate_py_bin)"
# The MQTT subscribe loop exits 3 to signal "broker unavailable → poll instead".
# errexit is switched off so the interpreter's exit status can be captured afterwards
# instead of aborting the script.
set +e
# Configuration is handed to the Python process through its environment; the quoted
# heredoc delimiter keeps the shell from expanding anything inside the Python source.
YAML_PATH="$AGENT_SESSIONS_YAML" HOME_DIR="$HOME_DIR" CLAUDE_PROJECT_DIR="$CLAUDE_PROJECT_DIR" LOCAL_BIN="$LOCAL_BIN" \
SUB_TIMEOUT="$SUB_TIMEOUT" SUB_IDLE_TIMEOUT="$SUB_IDLE_TIMEOUT" \
SKILLS_DIR="$SKILLS_DIR" LIB_SH="$LIB_SH" \
"$PYBIN" - <<'PYEOF'
import os, sys, json, time, subprocess

# Settings handed over by the shell wrapper through the environment.
lib_sh = os.environ.get('LIB_SH', '')
skills_dir = os.environ.get('SKILLS_DIR', '')
# For both values 0 disables the respective timeout; an empty string also maps to 0.
timeout = int(os.environ.get('SUB_TIMEOUT', '0') or '0')
idle_timeout = int(os.environ.get('SUB_IDLE_TIMEOUT', '3600') or '0')

# Relative locations of the delegate-job scripts directory, tried in this order at
# every level of the upward walk. No hardcoded absolute path (review item 6).
_SCRIPT_DIR_LAYOUTS = (
    ('.agents', 'skills', 'multi-agent-mux-delegate-job', 'scripts'),
    ('skills', 'multi-agent-mux-delegate-job', 'scripts'),
    ('multi-agent-mux-delegate-job', 'scripts'),
)


def _locate_delegate_scripts():
    """Return the directory holding mqtt_common, or None when it cannot be found.

    The directory next to this skill (under SKILLS_DIR) wins; otherwise each
    ancestor of the current working directory is probed, stopping before '/'.
    """
    if skills_dir:
        sibling = os.path.join(skills_dir, 'multi-agent-mux-delegate-job', 'scripts')
        if os.path.isdir(sibling):
            return sibling
    here = os.getcwd()
    while here and here != '/':
        for layout in _SCRIPT_DIR_LAYOUTS:
            probe = os.path.join(here, *layout)
            if os.path.isdir(probe):
                return probe
        here = os.path.dirname(here)
    return None


_scripts_dir = _locate_delegate_scripts()
if _scripts_dir:
    sys.path.append(_scripts_dir)
# Raises ImportError when no candidate directory was found.
import mqtt_common
# Executed INSIDE lib.sh::atomic_dump_yaml (system python3 + PyYAML), under the
# YAML flock with schema-validate + .bak (review item 5). Marks matching running
# sessions terminated and kills their tmux (review item 3 behaviour preserved),
# or aborts the write entirely when nothing matches. The untrusted MQTT job id /
# event arrive via env (MQTT_JID / MQTT_EVENT) — never spliced into source (P1-B).
#
# The snippet expects the loaded document to be bound to `d` by the caller.
# Two safeguards around the tmux kill:
#   * a row without a name is still marked terminated but no kill is attempted
#     (a None argv element would raise TypeError and abort the whole mutation);
#   * the target is passed as '=<name>' so tmux only accepts an exact session-name
#     match — a bare name falls back to prefix/pattern matching and could hit a
#     different live session once the intended one is already gone.
_MUTATION = r'''
import os, subprocess
from datetime import datetime, timezone
_jid = os.environ['MQTT_JID']
_event = os.environ['MQTT_EVENT']
_now = datetime.now(timezone.utc)
_changed = False
for s in d.get('tmux_sessions', []):
    if s.get('delegate_job_id') == _jid and s.get('status') == 'running':
        s['status'] = 'terminated'
        s['terminated_at'] = _now.strftime('%Y-%m-%dT%H:%M:%SZ')
        s['terminated_at_epoch'] = int(_now.timestamp())
        s['termination_mode'] = 'auto-detected (MQTT ' + _event + ')'
        _name = s.get('name')
        _srv = str(s.get('tmux_server') or 'default')
        if _name:
            _cmd = ['tmux'] + (['-L', _srv] if _srv != 'default' else []) + ['kill-session', '-t', '=' + str(_name)]
            subprocess.run(_cmd, capture_output=True)
            print('MQTT Monitor: terminated + killed ' + str(_name) + ' on ' + str(_srv), flush=True)
        else:
            print('MQTT Monitor: terminated unnamed session on ' + str(_srv) + ' (no tmux kill attempted)', flush=True)
        _changed = True
if not _changed:
    raise SystemExit(0)  # nothing matched — skip the write entirely
'''
def handle_terminal(jid, event):
    """Apply the terminal-event mutation to the sessions YAML for job ``jid``.

    Sources lib.sh in a bash child and pipes ``_MUTATION`` into
    ``atomic_dump_yaml``; the job id and event travel via the child's
    environment. The child's stdout is echoed; its stderr is echoed only when
    it exited non-zero. Does nothing beyond a log line when lib.sh is missing.
    """
    if not (lib_sh and os.path.isfile(lib_sh)):
        print('MQTT Monitor: lib.sh not found, cannot update YAML', flush=True)
        return
    child_env = {**os.environ, 'MQTT_JID': jid, 'MQTT_EVENT': event}
    shell_line = 'source "$LIB_SH"; atomic_dump_yaml "$YAML_PATH" MQTT_JID="$MQTT_JID" MQTT_EVENT="$MQTT_EVENT"'
    proc = subprocess.run(
        ['bash', '-c', shell_line],
        input=_MUTATION,
        text=True,
        env=child_env,
        capture_output=True,
    )
    out_text = (proc.stdout or '').strip()
    if out_text:
        print(out_text, flush=True)
    err_text = (proc.stderr or '').strip()
    if proc.returncode != 0 and err_text:
        print('MQTT Monitor: atomic_dump_yaml stderr: ' + err_text, flush=True)
# Shared between the MQTT callbacks and the main wait loop:
#   last_msg  - time of the most recent message (drives the idle timeout)
#   connected - set once the broker accepted the connection
#   failed    - set when the broker rejected the connection
state = {'last_msg': time.time(), 'connected': False, 'failed': False}


def on_message(_client, _userdata, msg):
    """MQTT message callback: act on 'completed' / 'error' job events.

    Every message refreshes the idle timer. Any exception raised while decoding
    or handling the message is logged and swallowed.
    """
    state['last_msg'] = time.time()
    try:
        body = json.loads(msg.payload.decode("utf-8"))
        jid = body.get("job_id")
        event = body.get("event")
        if not jid or event not in ("completed", "error"):
            return
        print(f"MQTT Monitor: received terminal event {event} for job {jid}", flush=True)
        handle_terminal(jid, event)
    except Exception as e:
        print(f"MQTT Monitor error parsing message: {e}", flush=True)
def on_connect(_c, _u, _flags, reason_code, _props):
    """MQTT connect callback: subscribe to job events on success, flag failure otherwise."""
    rc = mqtt_common.reason_code_value(reason_code)
    if rc != 0:
        state['failed'] = True
        print(f"MQTT Monitor connection failed: rc={rc}", flush=True)
        return
    state['connected'] = True
    events_topic = "python/mqtt/jobs/+/events"
    _c.subscribe(events_topic, qos=1)
    print("MQTT Monitor: subscribed to python/mqtt/jobs/+/events", flush=True)
# Build the client from environment-derived broker settings and attach the callbacks.
# NOTE(review): broker_config_from_env / make_client live in mqtt_common, which is not
# visible here; this code relies on cfg exposing host, port, tls and keepalive.
cfg = mqtt_common.broker_config_from_env()
client = mqtt_common.make_client("monitor_sub", cfg)
client.on_message = on_message
client.on_connect = on_connect
print(f"MQTT Monitor: connecting to {cfg.host}:{cfg.port} (TLS={cfg.tls})...", flush=True)
# Connection failure → fall back to polling (review item 4).
# Exit status 3 is the signal the shell wrapper checks to start its polling loop.
try:
client.connect(cfg.host, cfg.port, cfg.keepalive)
except Exception as e:
print(f"MQTT Monitor: connect failed ({e}); falling back to polling", flush=True)
sys.exit(3)
# Start the client's loop, then wait up to 5 seconds for on_connect to record either
# acceptance (state['connected']) or rejection (state['failed']).
client.loop_start()
_wait = time.time()
while time.time() - _wait < 5 and not state['connected'] and not state['failed']:
time.sleep(0.1)
# Neither accepted nor (necessarily) rejected within the window counts as unavailable.
if not state['connected']:
print("MQTT Monitor: broker did not accept connection; falling back to polling", flush=True)
client.loop_stop()
sys.exit(3)
start = time.time()


def _expiry_notice(now):
    """Return the exit message for whichever timeout has elapsed at ``now``, else None.

    The overall timeout is checked before the idle timeout; a value of 0
    disables the respective check.
    """
    if timeout and (now - start) >= timeout:
        return f"MQTT Monitor: --timeout {timeout}s reached, exiting"
    if idle_timeout and (now - state['last_msg']) >= idle_timeout:
        return f"MQTT Monitor: --idle-timeout {idle_timeout}s reached, exiting"
    return None


# Stay subscribed, checking every 0.5 s, until one of the timeouts fires. The client is
# always stopped and disconnected on the way out, even when interrupted.
try:
    notice = _expiry_notice(time.time())
    while notice is None:
        time.sleep(0.5)
        notice = _expiry_notice(time.time())
    print(notice, flush=True)
finally:
    client.loop_stop()
    try:
        client.disconnect()
    except Exception:
        pass
sys.exit(0)
PYEOF
# Exit status of the Python subscriber: 0 = finished normally (timeout reached),
# 3 = broker unavailable, anything else = the subscriber crashed.
sub_rc=$?
set -e
if [ "$sub_rc" = "3" ]; then
  echo "MQTT Monitor: broker unavailable — falling back to polling (interval ${POLL_INTERVAL}s)" >&2
  _self="$SKILLS_DIR/multi-agent-mux-monitor/scripts/reconcile.sh"
  _start=$(date +%s)
  # Polling fallback: run one reconcile pass per interval. Only --timeout bounds this
  # loop; --idle-timeout is not consulted here, so with --timeout 0 it runs until killed.
  while :; do
    # A failed pass must not end the loop; the next pass retries.
    bash "$_self" --once --emit-diff >/dev/null 2>&1 || true
    if [ "$SUB_TIMEOUT" != "0" ] && [ "$(( $(date +%s) - _start ))" -ge "$SUB_TIMEOUT" ]; then
      break
    fi
    sleep "$POLL_INTERVAL"
  done
elif [ "$sub_rc" != "0" ]; then
  # Any other failure (e.g. the interpreter or an import is missing) is a real error;
  # report it with the documented error status instead of pretending success.
  echo "ERROR: MQTT monitor exited unexpectedly (status $sub_rc)" >&2
  exit 2
fi
exit 0
fi
# Make sure the monitor state directory exists.
mkdir -p "$STATE_DIR"
# All comparison logic lives in a single source. --dry-run runs it through env_python
# (read-only); otherwise the same source runs through atomic_dump_yaml (flock + temp+rename).
# Under the atomic wrapper, an empty 'actions' list ends in SystemExit(0), which skips the
# write (avoids needless reformatting).
# `read -d ''` reads the whole heredoc into RECON_SRC and returns non-zero at end of input,
# hence the `|| true` under `set -e`.
read -r -d '' RECON_SRC <<'PYEOF' || true
import os, json, glob, subprocess, time
from datetime import datetime, timezone
import yaml

# Inputs supplied through the environment by the calling wrapper.
yaml_path = os.environ['YAML_PATH']
home = os.environ['HOME_DIR']
claude_project_dir = os.environ.get('CLAUDE_PROJECT_DIR', f"{home}/.claude/projects")
now_iso = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

# Under the atomic wrapper `d` is already loaded. Under env_python (dry-run) it is not,
# so load it here: prefer the sibling '<yaml stem>.db' SQLite file, else the YAML itself.
try:
    d
except NameError:
    import sqlite3
    db_path = os.path.splitext(yaml_path)[0] + '.db'
    d = {}
    # Best-effort load: any failure leaves whatever was loaded so far (possibly {}).
    try:
        if os.path.exists(db_path):
            conn = sqlite3.connect(db_path, timeout=10.0)
            try:
                row = conn.execute('SELECT data FROM state WHERE id=1').fetchone()
                if row:
                    d = json.loads(row[0])
                try:
                    db_sessions = []
                    cursor = conn.execute('SELECT data FROM sessions')
                    for s_row in cursor.fetchall():
                        db_sessions.append(json.loads(s_row[0]))
                    d['tmux_sessions'] = db_sessions
                except sqlite3.OperationalError:
                    # No usable sessions table — keep the sessions from the state row.
                    pass
            finally:
                # Release the connection even when a query or JSON decode fails.
                conn.close()
        elif os.path.exists(yaml_path):
            with open(yaml_path) as f:
                d = yaml.safe_load(f) or {}
    except Exception:
        pass
    # A stored document that is not a mapping (e.g. JSON null or a list) would crash
    # every later d.get(...); treat it as empty instead.
    if not isinstance(d, dict):
        d = {}
# Accumulators for the JSON report: detected drifts and the changes applied to `d`.
drifts = []
actions = []

# === Current tmux state — a transient failure is kept distinct from 'no sessions' (P1-E) ===
tmux_sessions = []
tmux_confirmed = True

# Collect every distinct tmux server registered in the YAML, plus TMUX_SERVER_NAME
# from the environment. An empty TMUX_SERVER_NAME is ignored rather than probed.
unique_servers = {'default'}
_env_server = os.environ.get('TMUX_SERVER_NAME')
if _env_server:
    unique_servers.add(_env_server)
for s in d.get('tmux_sessions', []):
    srv = s.get('tmux_server') or 'default'
    unique_servers.add(srv)

try:
    for srv in sorted(unique_servers):
        cmd = ['tmux']
        if srv != 'default':
            cmd += ['-L', srv]
        cmd += ['ls', '-F', '#{session_name}|#{session_created}']
        r = subprocess.run(cmd, capture_output=True, text=True)
        if r.returncode == 0:
            for line in r.stdout.splitlines():
                if not line:
                    continue
                # Split on the LAST '|': the creation time is numeric, while a session
                # name may itself contain '|'. Splitting on the first one would feed
                # part of the name to int() and mark the whole listing unconfirmed.
                name, created = line.rsplit('|', 1)
                tmux_sessions.append({'name': name, 'created': int(created), 'server': srv})
        else:
            # These messages mean "nothing is running" — a confirmed empty answer.
            # Any other failure leaves the tmux state unknown.
            err = (r.stderr or '').lower()
            is_empty = ('no server running' in err) or ('no sessions' in err) or ('failed to connect' in err)
            if not is_empty:
                tmux_confirmed = False
except Exception:
    # tmux missing, unparsable output, etc.: do not trust the listing.
    tmux_confirmed = False
def pane_meta(session, srv):
    """Return {'pid', 'cwd', 'cmd'} for the first pane listed for a tmux session.

    session: tmux session name passed to ``list-panes -t``.
    srv:     tmux server name; 'default' means no ``-L`` option is passed.

    Returns None when tmux fails or its output cannot be parsed.
    """
    try:
        cmd = ['tmux']
        if srv != 'default':
            cmd += ['-L', srv]
        cmd += ['list-panes', '-t', session, '-F',
                '#{pane_pid}|#{pane_current_path}|#{pane_current_command}']
        out = subprocess.check_output(cmd, text=True)
        first = out.strip().split('\n')[0]
        # The pid is the first field and the command the last; everything between is the
        # working directory, which may itself contain '|'. Peel the outer fields off so
        # such a path is kept intact instead of shifting the fields.
        pid, rest = first.split('|', 1)
        cwd, command = rest.rsplit('|', 1)
        return {'pid': int(pid), 'cwd': cwd, 'cmd': command}
    except Exception:
        return None
yaml_sessions = d.get('tmux_sessions', [])

# Names already present in the YAML (rows without a name are skipped).
yaml_session_names = set()
for _row in yaml_sessions:
    if _row.get('name'):
        yaml_session_names.add(_row['name'])

# (name, server) pairs that tmux reported as alive.
alive_set = set()
for _live in tmux_sessions:
    alive_set.add((_live['name'], _live.get('server', 'default')))

# === drift A: tmux dead + YAML running → auto-terminate ===
# Only when the tmux answer was confirmed; on a transient failure nothing is marked
# terminated (P1-E).
# 'stopped' is a deliberate end state too — it is not drift and is left untouched
# (otherwise a tmux-dead stopped session would be overwritten as 'terminated' and lose
# its resumable flag).
_SETTLED_STATUSES = ('terminated', 'archived', 'stopped')
for s in (yaml_sessions if tmux_confirmed else ()):
    name = s.get('name')
    if not name or s.get('status') in _SETTLED_STATUSES:
        continue
    srv = s.get('tmux_server') or 'default'
    if (name, srv) in alive_set:
        continue
    s['status'] = 'terminated'
    s['terminated_at'] = now_iso
    s['terminated_at_epoch'] = int(datetime.now(timezone.utc).timestamp())
    s['termination_mode'] = 'auto-detected (tmux gone)'
    pane = s.get('pane') or {}
    drifts.append({
        'class': 'A',
        'name': name,
        'msg': f"{name}: tmux gone (was pane {pane.get('pid')}, cmd {pane.get('cmd')}). Marked terminated.",
    })
    actions.append(f"terminated: {name}")
# === drift B: tmux alive + not in YAML → auto-register ===
# Runs only when the tmux listing was confirmed, and only considers sessions whose name
# ends in '-creator-claude' or '-creator-agy'.
if tmux_confirmed:
for t in tmux_sessions:
name = t['name']
if name in yaml_session_names:
continue
if not (name.endswith('-creator-claude') or name.endswith('-creator-agy')):
continue
srv = t.get('server', 'default')
pm = pane_meta(name, srv)
# pane_meta returns None when the pane query fails; such sessions are not registered.
if not pm:
continue
# The agent kind comes from the session-name suffix, not from the pane's actual command.
agent = 'claude' if name.endswith('-creator-claude') else 'agy'
cmd_full = 'claude' if agent == 'claude' else 'agy --dangerously-skip-permissions'
# '-L <server> ' fragment spliced into the recorded tmux commands for non-default servers.
server_opt = f"-L {srv} " if srv != 'default' else ""
entry = {
'name': name,
'status': 'running',
'tmux_session_created_at': datetime.fromtimestamp(t['created'], tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
'tmux_session_epoch': t['created'],
'tmux_server': srv,
'pane': {'index': 0, 'pid': pm['pid'], 'cmd': agent, 'cmd_full': cmd_full, 'cwd': pm['cwd']},
# P2: quote cwd
'start_command': f'tmux {server_opt}new-session -d -s "{name}" -x 140 -y 40 -c "{pm["cwd"]}" "{cmd_full}"',
'attach_command': f'tmux {server_opt}attach -t {name}',
'kill_command': f'tmux {server_opt}kill-session -t {name}',
'last_visible_status': 'running',
'last_visible_note': 'auto-registered by monitor',
}
# Agent-specific fields start as placeholders; the own session / conversation id is
# filled in later by the drift C passes.
if agent == 'claude':
entry['tui'] = {'model': '(unknown — capture after first message)', 'provider': 'anthropic',
'plan': '(unknown)', 'account': '(unknown)', 'version': '(unknown)'}
entry['claude_session_id_own'] = None
else:
entry['child_pid'] = 0
entry['agy_conversation_id_own'] = None
entry['mcp_attachments'] = [
{
'name': 'stitch',
'transport': 'mcp-remote',
'endpoint': 'https://stitch.googleapis.com/mcp'
}
]
# Record the new row and remember its name so a duplicate tmux listing entry
# is not registered twice.
d.setdefault('tmux_sessions', []).append(entry)
yaml_session_names.add(name)
drifts.append({'class': 'B', 'name': name,
'msg': f"{name}: tmux found but not in YAML. Auto-registered (pane {pm['pid']}, cmd {pm['cmd']}, cwd {pm['cwd']})."})
actions.append(f"registered: {name}")
# === drift C: materialize a new claude session id (per-row own id) ===
# For every running '-creator-claude' row that has no own session id yet, look in the
# Claude project directory derived from the pane cwd, take the most recently modified
# *.jsonl (only if touched within the last 300 seconds) and read 'sessionId' from its
# first line.
for s in d.get('tmux_sessions', []):
    needs_id = (
        s.get('name', '').endswith('-creator-claude')
        and s.get('status') == 'running'
        and not s.get('claude_session_id_own')
    )
    if not needs_id:
        continue
    cwd = (s.get('pane') or {}).get('cwd', '')
    if not cwd:
        continue
    # Project directory key: the cwd with '/' and '_' both replaced by '-'.
    proj_dir = f"{claude_project_dir}/{cwd.replace('/', '-').replace('_', '-')}"
    if not os.path.isdir(proj_dir):
        continue
    transcripts = glob.glob(f"{proj_dir}/*.jsonl")
    transcripts.sort(key=os.path.getmtime, reverse=True)
    if not transcripts:
        continue
    newest = transcripts[0]
    age_seconds = time.time() - os.path.getmtime(newest)
    if age_seconds > 300:
        continue
    sid = None
    try:
        with open(newest) as fh:
            head = fh.readline().strip()
        if head:
            sid = json.loads(head).get('sessionId')
    except Exception:
        # Unreadable or malformed transcript: leave the row for a later pass.
        sid = None
    if not sid:
        continue
    s['claude_session_id_own'] = sid
    drifts.append({'class': 'C', 'name': s['name'],
                   'msg': f"{s['name']}: session id materialized: {sid}"})
    actions.append(f"updated session id: {sid}")
# === drift C (agy): materialize a new agy conversation id (per-row own id) ===
# For every running '-creator-agy' row without an own conversation id, look the pane cwd
# up in last_conversations.json and accept the id only when its conversation .db exists.
lc = f"{home}/.gemini/antigravity-cli/cache/last_conversations.json"
for s in d.get('tmux_sessions', []):
    needs_id = (
        s.get('name', '').endswith('-creator-agy')
        and s.get('status') == 'running'
        and not s.get('agy_conversation_id_own')
    )
    if not needs_id:
        continue
    cwd = (s.get('pane') or {}).get('cwd', '')
    if not cwd or not os.path.exists(lc):
        continue
    try:
        with open(lc) as fh:
            lc_data = json.load(fh)
        cid = lc_data.get(cwd)
        if cid and os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{cid}.db"):
            s['agy_conversation_id_own'] = cid
            drifts.append({'class': 'C', 'name': s['name'],
                           'msg': f"{s['name']}: conversation id materialized: {cid}"})
            actions.append(f"updated conversation id: {cid}")
    except Exception:
        # Unreadable or malformed cache: skip silently, a later pass may succeed.
        pass
# === drift D: stale UUID (the cached artifact has disappeared) — report only, no change ===
identities = d.get('agent_identities', {}) or {}

# Cached claude session id: stale when no project directory holds '<id>.jsonl'.
claude_sid = (identities.get('claude') or {}).get('session_id')
if claude_sid and not glob.glob(f"{claude_project_dir}/*/{claude_sid}.jsonl"):
    drifts.append({
        'class': 'D',
        'name': '(claude identity cache)',
        'msg': f"stale UUID in agent_identities.claude.session_id: {claude_sid} (jsonl missing)",
    })

# Cached agy conversation id: stale when its conversation .db file is missing.
agy_cid = (identities.get('agy') or {}).get('conversation_id')
if agy_cid and not os.path.exists(f"{home}/.gemini/antigravity-cli/conversations/{agy_cid}.db"):
    drifts.append({
        'class': 'D',
        'name': '(agy identity cache)',
        'msg': f"stale UUID in agent_identities.agy.conversation_id: {agy_cid} (.db missing)",
    })
# Report: live sessions are listed as sorted 'name|server' strings.
alive_labels = []
for t in tmux_sessions:
    alive_labels.append(f"{t['name']}|{t.get('server', 'default')}")
alive_labels.sort()

result = {
    'timestamp': now_iso,
    'yaml_path': yaml_path,
    'tmux_sessions_alive': alive_labels,
    'tmux_confirmed': tmux_confirmed,
    'drifts': drifts,
    'actions': actions,
}
print(json.dumps(result, indent=2, ensure_ascii=False))

# Atomic wrapper: with no actions the write is skipped. Harmless under env_python (dry-run).
if len(actions) == 0:
    raise SystemExit(0)
PYEOF
# Feed the reconcile source to the matching runner: env_python for the read-only
# --dry-run, atomic_dump_yaml for a real (writing) pass.
case "$DRY_RUN" in
  1) printf '%s' "$RECON_SRC" | env_python "$AGENT_SESSIONS_YAML" ;;
  *) printf '%s' "$RECON_SRC" | atomic_dump_yaml "$AGENT_SESSIONS_YAML" ;;
esac