perf(collector/watcher): 로컬 데이터 전송 성능 최적화 — mtime 프리체크, 프로젝트 캐시, re-forward 수정, 폴링 간격 조정

This commit is contained in:
Variet Worker
2026-03-13 23:13:18 +09:00
parent 1835166c5a
commit d4a2016d06
2 changed files with 89 additions and 5 deletions

View File

@@ -13,6 +13,7 @@ Flow:
import asyncio
import hashlib
import json
import os
import time
import logging
from pathlib import Path
@@ -39,7 +40,7 @@ class CollectorBridge:
self.remote = remote
self.project_name = project_name
self.event_queue = event_queue
self._poll_interval = 3 # seconds
self._poll_interval = 5 # seconds (was 3 — reduced I/O frequency)
self._running = False
# Pre-populate with existing pending files → skip on startup (prevents 만료됨 spam)
@@ -47,7 +48,13 @@ class CollectorBridge:
self._forwarded_pending: set[str] = set()
self._forwarded_timestamps: dict[str, float] = {} # rid → when forwarded
self._pending_hashes: dict[str, str] = {} # rid → content hash (for MERGE/status detection)
self._pending_mtimes: dict[str, float] = {} # rid → last known file mtime
self._RESPONSE_POLL_TTL = 300 # 5 min — stop polling responses for old pending
# Project discovery cache (avoid re-reading register/ every cycle)
self._cached_projects: set[str] | None = None
self._projects_cache_ts: float = 0
self._PROJECTS_CACHE_TTL = 60.0 # seconds
for fname in self.local.list_json_files("pending"):
rid = fname.replace(".json", "")
self._startup_pending.add(rid)
@@ -58,9 +65,33 @@ class CollectorBridge:
self._pending_hashes[rid] = hashlib.md5(
json.dumps(data, sort_keys=True).encode()
).hexdigest()
# Pre-cache mtime
try:
fpath = self.local.bridge_dir / "pending" / fname
self._pending_mtimes[rid] = fpath.stat().st_mtime
except OSError:
pass
if self._startup_pending:
logger.info(f"[COLLECTOR] skipping {len(self._startup_pending)} existing pending files")
# Startup cleanup: remove stale response files (> 5 min)
self._cleanup_stale_responses()
def _cleanup_stale_responses(self, max_age: int = 300):
"""Remove stale response files (> max_age seconds) on startup."""
now = time.time()
cleaned = 0
for fname in self.local.list_json_files("response"):
try:
fpath = self.local.bridge_dir / "response" / fname
if now - fpath.stat().st_mtime > max_age:
self.local.delete_file("response", fname)
cleaned += 1
except OSError:
pass
if cleaned:
logger.info(f"[COLLECTOR] startup cleanup: removed {cleaned} stale response files")
async def start(self):
"""Start the Collector polling loops with staggered offsets.
@@ -115,6 +146,17 @@ class CollectorBridge:
rid = fname.replace(".json", "")
current_files.add(rid)
# mtime pre-check: skip read+hash if file hasn't been modified
try:
fpath = self.local.bridge_dir / "pending" / fname
current_mtime = fpath.stat().st_mtime
except OSError:
continue
prev_mtime = self._pending_mtimes.get(rid)
if prev_mtime is not None and current_mtime == prev_mtime:
continue # File untouched since last check — skip read+hash
self._pending_mtimes[rid] = current_mtime
data = self.local.read_json("pending", fname)
if data is None:
continue
@@ -153,6 +195,12 @@ class CollectorBridge:
if rid not in current_files and rid not in self._startup_pending:
self._forwarded_pending.discard(rid)
self._pending_hashes.pop(rid, None)
self._pending_mtimes.pop(rid, None)
# Also clean up orphaned hashes/mtimes for files no longer on disk
for rid in list(self._pending_hashes):
if rid not in current_files:
self._pending_hashes.pop(rid, None)
self._pending_mtimes.pop(rid, None)
except Exception as e:
logger.error(f"[COLLECTOR] forward_pending error: {e}")
@@ -183,7 +231,10 @@ class CollectorBridge:
for rid in expired:
self._forwarded_pending.discard(rid)
self._forwarded_timestamps.pop(rid, None)
self._pending_hashes.pop(rid, None)
# NOTE: intentionally keep _pending_hashes[rid] to prevent
# re-forward cycle (expired pending would be re-detected as
# "new" if hash is cleared). Hash is cleaned up when file
# is actually deleted from disk (see _forward_pending_loop).
if expired:
logger.info(f"[COLLECTOR] expired {len(expired)} stale forwarded pending (>{self._RESPONSE_POLL_TTL}s)")
@@ -223,10 +274,19 @@ class CollectorBridge:
Reads bridge/register/*.json files, which are written by each AG window's
Extension with {conversation_id, project_name}. Returns unique project names
found, always including self.project_name as a fallback.
Results are cached for _PROJECTS_CACHE_TTL seconds to avoid re-reading
22+ register files every polling cycle.
"""
now = time.time()
if self._cached_projects is not None and now - self._projects_cache_ts < self._PROJECTS_CACHE_TTL:
return self._cached_projects
projects = {self.project_name}
register_dir = self.local.bridge_dir / "register"
if not register_dir.exists():
self._cached_projects = projects
self._projects_cache_ts = now
return projects
for f in register_dir.glob("*.json"):
try:
@@ -236,6 +296,8 @@ class CollectorBridge:
projects.add(p)
except (json.JSONDecodeError, OSError):
pass
self._cached_projects = projects
self._projects_cache_ts = now
return projects
async def _poll_commands_loop(self):
@@ -319,7 +381,7 @@ class CollectorBridge:
except Exception as e:
logger.error(f"[COLLECTOR] forward_chat_snapshots error: {e}")
await asyncio.sleep(self._poll_interval)
await asyncio.sleep(10) # Chat snapshots: less urgent, 10s interval
# ─── Forward session registrations → Gateway ───
@@ -341,12 +403,13 @@ class CollectorBridge:
await self.remote.aregister_session(conv_id, project)
forwarded_regs.add(f.name)
logger.info(f"[COLLECTOR] → Gateway: register {conv_id[:8]}{project}")
await asyncio.sleep(0.3) # Spread startup burst
except (json.JSONDecodeError, OSError) as e:
logger.warning(f"[COLLECTOR] bad register {f.name}: {e}")
except Exception as e:
logger.error(f"[COLLECTOR] forward_registrations error: {e}")
await asyncio.sleep(self._poll_interval * 3) # Less frequent
await asyncio.sleep(30) # Registration changes rarely — 30s interval
# ─── Forward brain events → Gateway ───
async def _forward_events_loop(self):
@@ -394,4 +457,4 @@ class CollectorBridge:
await self.remote.flush_retry_queue()
except Exception as e:
logger.error(f"[COLLECTOR] retry flush error: {e}")
await asyncio.sleep(10)
await asyncio.sleep(30) # Retry flush: 30s interval (was 10s)