From 52c9526fdb3336b3e2ac9612a62afeaca8957198 Mon Sep 17 00:00:00 2001 From: Variet Worker Date: Thu, 12 Mar 2026 00:49:37 +0900 Subject: [PATCH] =?UTF-8?q?fix(bridge):=20429=20Rate=20Limit=20=EB=AC=B4?= =?UTF-8?q?=ED=95=9C=20=EB=A3=A8=ED=94=84=20=EB=B0=A9=EC=A7=80=20=E2=80=94?= =?UTF-8?q?=20=EC=A7=80=EC=88=98=20=EB=B0=B1=EC=98=A4=ED=94=84=20+=20Colle?= =?UTF-8?q?ctor=20=ED=8F=B4=EB=A7=81=20=EB=B3=B4=ED=98=B8=20+=20rate=20lim?= =?UTF-8?q?it=20=EC=99=84=ED=99=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .agents/references/known-issues.md | 6 +++++ bridge.py | 40 +++++++++++++++++++++++++++++- collector.py | 28 ++++++++++++++++----- docs/devlog/2026-03-11.md | 2 +- docs/devlog/2026-03-12.md | 5 ++++ gateway.py | 3 ++- 6 files changed, 75 insertions(+), 9 deletions(-) create mode 100644 docs/devlog/2026-03-12.md diff --git a/.agents/references/known-issues.md b/.agents/references/known-issues.md index 1b8c192..f0efef9 100644 --- a/.agents/references/known-issues.md +++ b/.agents/references/known-issues.md @@ -405,4 +405,10 @@ - **해결**: **미해결** — AG가 실제 등록하는 커맨드 목록 조사 후 올바른 커맨드로 교체 필요 - **주의**: `[2026-03-09] VS Code Accept Commands` 이슈와 같은 근본 원인 (AG 커맨드 미등록) +### [2026-03-12] RemoteTransport 429 무한 루프 — Extension 크래시 + AG 먹통 +- **증상**: Collector→Gateway HTTP 요청 시 `429 Rate limited` 로그가 초당 수십 건 무한 반복. Extension 꺼지고 AG 재시작 시 화면 먹통 +- **원인**: 3가지 복합: (1) `RemoteTransport._arequest()`가 429 수신 시 백오프 없이 즉시 리턴 → 3초 후 다시 전체 재시도, (2) `_poll_responses_loop()`가 모든 forwarded pending에 개별 HTTP 요청 → pending 쌓이면 1초에 10개 초과, (3) Gateway rate limit이 10 req/1초로 너무 공격적 +- **해결**: (1) `bridge.py` — 지수 백오프 추가 (1s→2s→4s…60s) + `Retry-After` 헤더 지원 + `is_rate_limited` 프로퍼티, (2) `gateway.py` — rate limit 10→30으로 완화 + `Retry-After` 헤더 응답, (3) `collector.py` — 모든 루프에서 `is_rate_limited` 체크 + response 폴링에 0.2초 인터-리퀘스트 딜레이 +- **주의**: AG 먹통은 봇 자체가 유발한 문제. Extension이나 AG 내부를 건드린 것이 아님. 봇을 끄고 AG를 재시작하면 정상 복구 가능 + diff --git a/bridge.py b/bridge.py index 7e1116b..33ce977 100644 --- a/bridge.py +++ b/bridge.py @@ -173,6 +173,12 @@ class RemoteTransport(BridgeTransport): self._consecutive_failures = 0 self._max_failures_before_warning = 3 + # Rate limit backoff + self._rate_limited_until = 0.0 # timestamp until which we should not send requests + self._backoff_seconds = 0.0 # current backoff duration (exponential) + self._BACKOFF_BASE = 1.0 + self._BACKOFF_MAX = 60.0 + # Retry queue: list of (method, path, data) tuples self._retry_queue: list[tuple[str, str, dict | None]] = [] self._retry_queue_max = 100 @@ -194,8 +200,35 @@ class RemoteTransport(BridgeTransport): if self._session and not self._session.closed: await self._session.close() + @property + def is_rate_limited(self) -> bool: + """Check if we are currently in a rate-limit backoff period.""" + return time.time() < self._rate_limited_until + + def _apply_backoff(self, retry_after: float = 0): + """Apply exponential backoff for rate limiting.""" + if retry_after > 0: + self._backoff_seconds = min(retry_after, self._BACKOFF_MAX) + else: + if self._backoff_seconds == 0: + self._backoff_seconds = self._BACKOFF_BASE + else: + self._backoff_seconds = min(self._backoff_seconds * 2, self._BACKOFF_MAX) + self._rate_limited_until = time.time() + self._backoff_seconds + logger.warning(f"RemoteTransport: backing off {self._backoff_seconds:.0f}s (until +{self._backoff_seconds:.0f}s)") + + def _reset_backoff(self): + """Reset backoff after a successful request.""" + if self._backoff_seconds > 0: + self._backoff_seconds = 0 + self._rate_limited_until = 0 + async def _arequest(self, method: str, path: str, data: dict | None = None) -> dict | None: """Async non-blocking HTTP request to Gateway API.""" + # Skip if in backoff period (except health checks) + if self.is_rate_limited and path != "/health": + return None + session = await self._get_session() url = f"{self.base_url}{path}" try: @@ -207,7 +240,8 @@ class RemoteTransport(BridgeTransport): if resp.status == 401: logger.error("RemoteTransport: 401 Unauthorized — check GATEWAY_API_KEY") elif resp.status == 429: - logger.warning("RemoteTransport: 429 Rate limited") + retry_after = float(resp.headers.get("Retry-After", 0)) + self._apply_backoff(retry_after) else: logger.warning(f"RemoteTransport: {method} {path} → {resp.status}") return None @@ -216,6 +250,7 @@ class RemoteTransport(BridgeTransport): logger.info("RemoteTransport: ✅ Gateway connected") self.connected = True self._consecutive_failures = 0 + self._reset_backoff() return result except Exception as e: self._consecutive_failures += 1 @@ -224,6 +259,9 @@ class RemoteTransport(BridgeTransport): elif self._consecutive_failures < self._max_failures_before_warning: logger.warning(f"RemoteTransport: {method} {path} → {e}") self.connected = False + # Apply backoff on connection failures too + if self._consecutive_failures >= self._max_failures_before_warning: + self._apply_backoff() return None async def _arequest_retry(self, method: str, path: str, data: dict | None = None) -> dict | None: diff --git a/collector.py b/collector.py index f0fc6c3..a4ced9f 100644 --- a/collector.py +++ b/collector.py @@ -94,6 +94,11 @@ class CollectorBridge: """ while self._running: try: + # Skip cycle if rate-limited + if self.remote.is_rate_limited: + await asyncio.sleep(self._poll_interval) + continue + current_files = set() for fname in self.local.list_json_files("pending"): rid = fname.replace(".json", "") @@ -148,12 +153,21 @@ class CollectorBridge: """Poll Gateway for responses and write them locally for Extension.""" while self._running: try: + # Skip cycle if rate-limited + if self.remote.is_rate_limited: + await asyncio.sleep(self._poll_interval) + continue + # Check each forwarded pending for a response for rid in list(self._forwarded_pending): if rid in self._startup_pending: continue # Don't poll responses for pre-startup files + # Rate-limit guard: stop polling if we got rate-limited mid-cycle + if self.remote.is_rate_limited: + break data = await self.remote.aread_json("response", f"{rid}.json") if data is None or data.get("waiting"): + await asyncio.sleep(0.2) # Throttle between individual response polls continue # Write response locally for Extension to pick up @@ -175,12 +189,14 @@ class CollectorBridge: """Poll Gateway for commands and write them locally for Extension.""" while self._running: try: - commands = await self.remote.apoll_commands(self.project_name) - for cmd in commands: - cmd_id = cmd.get("id", str(int(time.time() * 1000))) - fname = f"{cmd_id}.json" - self.local.write_json("commands", fname, cmd) - logger.info(f"[COLLECTOR] ← Gateway: command {cmd.get('text', '?')[:30]}") + # Skip cycle if rate-limited + if not self.remote.is_rate_limited: + commands = await self.remote.apoll_commands(self.project_name) + for cmd in commands: + cmd_id = cmd.get("id", str(int(time.time() * 1000))) + fname = f"{cmd_id}.json" + self.local.write_json("commands", fname, cmd) + logger.info(f"[COLLECTOR] ← Gateway: command {cmd.get('text', '?')[:30]}") except Exception as e: logger.error(f"[COLLECTOR] poll_commands error: {e}") diff --git a/docs/devlog/2026-03-11.md b/docs/devlog/2026-03-11.md index 222364f..f77b256 100644 --- a/docs/devlog/2026-03-11.md +++ b/docs/devlog/2026-03-11.md @@ -11,4 +11,4 @@ | 007 | 19:28~19:35 | Gateway HTTP API + Docker (Dockerfile, docker-compose, Caddyfile) | `6dbbb57` | ✅ | | 008 | 19:35~19:50 | Gateway 보안: API Key 인증 미들웨어 + Caddy HTTPS + .env.example | `95da3e9` | ✅ | | 009 | 19:50~20:10 | RemoteTransport + CollectorBridge 구현 — Collector↔Gateway HTTP 통신 | `95c2905` | ✅ | -| 010 | 21:30~23:48 | 아키텍처 감사: aiohttp 전환 + 보안 + 기능 누락 수정 + 나노 검증 | `d7ed454` | 🔧 | +| 010 | 21:30~23:48 | 아키텍처 감사: aiohttp 전환 + 보안 + 기능 누락 수정 + 나노 검증 | `d7ed454` | ✅ | diff --git a/docs/devlog/2026-03-12.md b/docs/devlog/2026-03-12.md new file mode 100644 index 0000000..990a8b0 --- /dev/null +++ b/docs/devlog/2026-03-12.md @@ -0,0 +1,5 @@ +# 2026-03-12 Devlog + +| # | 시간 | 작업 설명 | 커밋 | 상태 | +|---|------|----------|------|------| +| 001 | 00:34~00:47 | 429 Rate Limit 무한 루프 디버깅 — 지수 백오프 + rate limit 완화 + Collector 폴링 보호 | `d9b36cf` | ✅ | diff --git a/gateway.py b/gateway.py index 999a9d6..547fb8e 100644 --- a/gateway.py +++ b/gateway.py @@ -28,7 +28,7 @@ logger = logging.getLogger(__name__) # Rate limiting RATE_LIMIT_WINDOW = 1.0 # seconds -RATE_LIMIT_MAX = 10 # max requests per window per IP +RATE_LIMIT_MAX = 30 # max requests per window per IP (Collector needs ~15-20/cycle) COMMAND_TTL = 1800 # 30 min — stale commands auto-deleted @@ -89,6 +89,7 @@ class GatewayAPI: return web.json_response( {"error": "Too Many Requests"}, status=429, + headers={"Retry-After": str(int(RATE_LIMIT_WINDOW * 2))}, ) window.append(now) self._rate_limits[ip] = window