fix(collector/bridge/gateway): rate limit 구조적 수정 — 점진적 백오프 + adaptive 폴링 + burst-friendly 윈도우

This commit is contained in:
Variet Worker
2026-03-12 22:33:49 +09:00
parent 5cdf7777a5
commit 56de71470d
3 changed files with 110 additions and 36 deletions

View File

@@ -45,7 +45,9 @@ class CollectorBridge:
# Pre-populate with existing pending files → skip on startup (prevents "만료됨" ("expired") spam)
self._startup_pending: set[str] = set()
self._forwarded_pending: set[str] = set()
self._forwarded_timestamps: dict[str, float] = {} # rid → when forwarded
self._pending_hashes: dict[str, str] = {} # rid → content hash (for MERGE/status detection)
self._RESPONSE_POLL_TTL = 300 # 5 min — stop polling responses for old pending
for fname in self.local.list_json_files("pending"):
rid = fname.replace(".json", "")
self._startup_pending.add(rid)
@@ -60,20 +62,29 @@ class CollectorBridge:
logger.info(f"[COLLECTOR] skipping {len(self._startup_pending)} existing pending files")
async def start(self):
"""Start the Collector polling loops."""
"""Start the Collector polling loops with staggered offsets.
Each loop starts with a different delay to prevent all loops from waking
up at the same time and causing burst requests to Gateway.
"""
self._running = True
logger.info(f"[COLLECTOR] started for project={self.project_name}")
async def _staggered(coro, offset: float):
await asyncio.sleep(offset)
await coro()
tasks = [
self._forward_pending_loop(),
self._poll_responses_loop(),
self._poll_commands_loop(),
self._forward_chat_snapshots_loop(),
self._forward_registrations_loop(),
self._health_check_loop(),
self._retry_flush_loop(),
_staggered(self._forward_pending_loop, 0.0),
_staggered(self._poll_responses_loop, 0.5),
_staggered(self._poll_commands_loop, 1.0),
_staggered(self._forward_chat_snapshots_loop, 1.5),
_staggered(self._forward_registrations_loop, 2.0),
_staggered(self._health_check_loop, 2.5),
_staggered(self._retry_flush_loop, 3.0),
]
if self.event_queue:
tasks.append(self._forward_events_loop())
tasks.append(_staggered(self._forward_events_loop, 3.5))
await asyncio.gather(*tasks)
async def stop(self):
@@ -128,6 +139,7 @@ class CollectorBridge:
# Forward to Gateway (new or updated)
await self.remote.awrite_json("pending", fname, data)
self._forwarded_pending.add(rid)
self._forwarded_timestamps[rid] = time.time()
self._pending_hashes[rid] = content_hash
if is_new:
@@ -150,7 +162,11 @@ class CollectorBridge:
# ─── Poll Gateway responses → local ───
async def _poll_responses_loop(self):
"""Poll Gateway for responses and write them locally for Extension."""
"""Poll Gateway for responses and write them locally for Extension.
Only polls responses for recently-forwarded pending (within _RESPONSE_POLL_TTL).
Expired entries are removed from tracking to prevent request accumulation.
"""
while self._running:
try:
# Skip cycle if rate-limited
@@ -158,16 +174,31 @@ class CollectorBridge:
await asyncio.sleep(self._poll_interval)
continue
# Check each forwarded pending for a response
for rid in list(self._forwarded_pending):
if rid in self._startup_pending:
continue # Don't poll responses for pre-startup files
now = time.time()
# Clean up expired forwarded pending (stop polling responses for old ones)
expired = [
rid for rid, ts in self._forwarded_timestamps.items()
if now - ts > self._RESPONSE_POLL_TTL
]
for rid in expired:
self._forwarded_pending.discard(rid)
self._forwarded_timestamps.pop(rid, None)
self._pending_hashes.pop(rid, None)
if expired:
logger.info(f"[COLLECTOR] expired {len(expired)} stale forwarded pending (>{self._RESPONSE_POLL_TTL}s)")
# Check each active forwarded pending for a response
active_rids = [
rid for rid in self._forwarded_pending
if rid not in self._startup_pending
]
for rid in active_rids:
# Rate-limit guard: stop polling if we got rate-limited mid-cycle
if self.remote.is_rate_limited:
break
data = await self.remote.aread_json("response", f"{rid}.json")
if data is None or data.get("waiting"):
await asyncio.sleep(0.2) # Throttle between individual response polls
await asyncio.sleep(0.3) # Throttle between individual response polls
continue
# Write response locally for Extension to pick up
@@ -175,6 +206,7 @@ class CollectorBridge:
# Also delete local pending file (Extension expects this)
self.local.delete_file("pending", f"{rid}.json")
self._forwarded_pending.discard(rid)
self._forwarded_timestamps.pop(rid, None)
approved = data.get("approved", "?")
logger.info(f"[COLLECTOR] ← Gateway: response {rid[:12]} approved={approved}")
@@ -207,27 +239,56 @@ class CollectorBridge:
return projects
async def _poll_commands_loop(self):
"""Poll Gateway for commands for ALL local projects.
"""Poll Gateway for commands with adaptive per-project intervals.
Discovers projects from bridge/register/ (written by each AG Extension)
and polls commands for each. Extension-side filtering (project_name check)
ensures each AG window only processes its own commands.
When a project returns empty commands repeatedly, its poll interval
increases (3s → 10s → 30s → 60s). On receiving a command, interval
resets to base. This prevents idle projects from wasting requests.
"""
# Per-project adaptive state
project_intervals: dict[str, float] = {} # project → current interval
project_last_poll: dict[str, float] = {} # project → last poll timestamp
_BASE_INTERVAL = 3.0
_IDLE_STEPS = [10.0, 30.0, 60.0] # progressive idle intervals
project_empty_streak: dict[str, int] = {} # project → consecutive empty polls
while self._running:
try:
# Skip cycle if rate-limited
if not self.remote.is_rate_limited:
projects = self._discover_local_projects()
now = time.time()
for project in projects:
if self.remote.is_rate_limited:
break # Stop mid-cycle if rate-limited
break
# Check if this project's interval has elapsed
interval = project_intervals.get(project, _BASE_INTERVAL)
last = project_last_poll.get(project, 0)
if now - last < interval:
continue # Not time yet for this project
project_last_poll[project] = now
commands = await self.remote.apoll_commands(project)
for cmd in commands:
cmd_id = cmd.get("id", str(int(time.time() * 1000)))
fname = f"{cmd_id}.json"
self.local.write_json("commands", fname, cmd)
logger.info(f"[COLLECTOR] ← Gateway: command [{project}] {cmd.get('text', '?')[:30]}")
await asyncio.sleep(0.3) # Throttle between projects to avoid rate limit bursts
if commands:
# Got commands → reset to base interval
project_intervals[project] = _BASE_INTERVAL
project_empty_streak[project] = 0
for cmd in commands:
cmd_id = cmd.get("id", str(int(time.time() * 1000)))
fname = f"{cmd_id}.json"
self.local.write_json("commands", fname, cmd)
logger.info(f"[COLLECTOR] ← Gateway: command [{project}] {cmd.get('text', '?')[:30]}")
else:
# Empty → increase interval progressively
streak = project_empty_streak.get(project, 0) + 1
project_empty_streak[project] = streak
if streak <= len(_IDLE_STEPS):
project_intervals[project] = _IDLE_STEPS[streak - 1]
# else stays at max (60s)
await asyncio.sleep(0.3) # Throttle between projects
except Exception as e:
logger.error(f"[COLLECTOR] poll_commands error: {e}")