fix(bridge): 429 Rate Limit 무한 루프 방지 — 지수 백오프 + Collector 폴링 보호 + rate limit 완화

This commit is contained in:
Variet Worker
2026-03-12 00:49:37 +09:00
parent feb8c05a73
commit 52c9526fdb
6 changed files with 75 additions and 9 deletions

View File

@@ -405,4 +405,10 @@
- **해결**: **미해결** — AG가 실제 등록하는 커맨드 목록 조사 후 올바른 커맨드로 교체 필요 - **해결**: **미해결** — AG가 실제 등록하는 커맨드 목록 조사 후 올바른 커맨드로 교체 필요
- **주의**: `[2026-03-09] VS Code Accept Commands` 이슈와 같은 근본 원인 (AG 커맨드 미등록) - **주의**: `[2026-03-09] VS Code Accept Commands` 이슈와 같은 근본 원인 (AG 커맨드 미등록)
### [2026-03-12] RemoteTransport 429 무한 루프 — Extension 크래시 + AG 먹통
- **증상**: Collector→Gateway HTTP 요청 시 `429 Rate limited` 로그가 초당 수십 건 무한 반복. Extension 꺼지고 AG 재시작 시 화면 먹통
- **원인**: 3가지 복합: (1) `RemoteTransport._arequest()`가 429 수신 시 백오프 없이 즉시 리턴 → 3초 후 다시 전체 재시도, (2) `_poll_responses_loop()`가 모든 forwarded pending에 개별 HTTP 요청 → pending 쌓이면 1초에 10개 초과, (3) Gateway rate limit이 10 req/1초로 너무 공격적
- **해결**: (1) `bridge.py` — 지수 백오프 추가 (1s→2s→4s…최대 60s) + `Retry-After` 헤더 지원 + `is_rate_limited` 프로퍼티, (2) `gateway.py` — rate limit을 윈도우(1초)당 10→30 req로 완화 + 429 응답에 `Retry-After` 헤더 추가, (3) `collector.py` — 모든 루프에서 `is_rate_limited` 체크 + response 폴링에 0.2초 인터-리퀘스트 딜레이
- **주의**: AG 먹통은 봇 자체가 유발한 문제. Extension이나 AG 내부를 건드린 것이 아님. 봇을 끄고 AG를 재시작하면 정상 복구 가능

View File

@@ -173,6 +173,12 @@ class RemoteTransport(BridgeTransport):
self._consecutive_failures = 0 self._consecutive_failures = 0
self._max_failures_before_warning = 3 self._max_failures_before_warning = 3
# Rate limit backoff
self._rate_limited_until = 0.0 # timestamp until which we should not send requests
self._backoff_seconds = 0.0 # current backoff duration (exponential)
self._BACKOFF_BASE = 1.0
self._BACKOFF_MAX = 60.0
# Retry queue: list of (method, path, data) tuples # Retry queue: list of (method, path, data) tuples
self._retry_queue: list[tuple[str, str, dict | None]] = [] self._retry_queue: list[tuple[str, str, dict | None]] = []
self._retry_queue_max = 100 self._retry_queue_max = 100
@@ -194,8 +200,35 @@ class RemoteTransport(BridgeTransport):
if self._session and not self._session.closed: if self._session and not self._session.closed:
await self._session.close() await self._session.close()
@property
def is_rate_limited(self) -> bool:
    """Report whether the rate-limit backoff window is still open.

    Returns:
        True while the current ``time.time()`` value is earlier than
        ``self._rate_limited_until``; False otherwise.
    """
    now = time.time()
    return self._rate_limited_until > now
def _apply_backoff(self, retry_after: float = 0):
"""Apply exponential backoff for rate limiting."""
if retry_after > 0:
self._backoff_seconds = min(retry_after, self._BACKOFF_MAX)
else:
if self._backoff_seconds == 0:
self._backoff_seconds = self._BACKOFF_BASE
else:
self._backoff_seconds = min(self._backoff_seconds * 2, self._BACKOFF_MAX)
self._rate_limited_until = time.time() + self._backoff_seconds
logger.warning(f"RemoteTransport: backing off {self._backoff_seconds:.0f}s (until +{self._backoff_seconds:.0f}s)")
def _reset_backoff(self):
"""Reset backoff after a successful request."""
if self._backoff_seconds > 0:
self._backoff_seconds = 0
self._rate_limited_until = 0
async def _arequest(self, method: str, path: str, data: dict | None = None) -> dict | None: async def _arequest(self, method: str, path: str, data: dict | None = None) -> dict | None:
"""Async non-blocking HTTP request to Gateway API.""" """Async non-blocking HTTP request to Gateway API."""
# Skip if in backoff period (except health checks)
if self.is_rate_limited and path != "/health":
return None
session = await self._get_session() session = await self._get_session()
url = f"{self.base_url}{path}" url = f"{self.base_url}{path}"
try: try:
@@ -207,7 +240,8 @@ class RemoteTransport(BridgeTransport):
if resp.status == 401: if resp.status == 401:
logger.error("RemoteTransport: 401 Unauthorized — check GATEWAY_API_KEY") logger.error("RemoteTransport: 401 Unauthorized — check GATEWAY_API_KEY")
elif resp.status == 429: elif resp.status == 429:
logger.warning("RemoteTransport: 429 Rate limited") retry_after = float(resp.headers.get("Retry-After", 0))
self._apply_backoff(retry_after)
else: else:
logger.warning(f"RemoteTransport: {method} {path}{resp.status}") logger.warning(f"RemoteTransport: {method} {path}{resp.status}")
return None return None
@@ -216,6 +250,7 @@ class RemoteTransport(BridgeTransport):
logger.info("RemoteTransport: ✅ Gateway connected") logger.info("RemoteTransport: ✅ Gateway connected")
self.connected = True self.connected = True
self._consecutive_failures = 0 self._consecutive_failures = 0
self._reset_backoff()
return result return result
except Exception as e: except Exception as e:
self._consecutive_failures += 1 self._consecutive_failures += 1
@@ -224,6 +259,9 @@ class RemoteTransport(BridgeTransport):
elif self._consecutive_failures < self._max_failures_before_warning: elif self._consecutive_failures < self._max_failures_before_warning:
logger.warning(f"RemoteTransport: {method} {path}{e}") logger.warning(f"RemoteTransport: {method} {path}{e}")
self.connected = False self.connected = False
# Apply backoff on connection failures too
if self._consecutive_failures >= self._max_failures_before_warning:
self._apply_backoff()
return None return None
async def _arequest_retry(self, method: str, path: str, data: dict | None = None) -> dict | None: async def _arequest_retry(self, method: str, path: str, data: dict | None = None) -> dict | None:

View File

@@ -94,6 +94,11 @@ class CollectorBridge:
""" """
while self._running: while self._running:
try: try:
# Skip cycle if rate-limited
if self.remote.is_rate_limited:
await asyncio.sleep(self._poll_interval)
continue
current_files = set() current_files = set()
for fname in self.local.list_json_files("pending"): for fname in self.local.list_json_files("pending"):
rid = fname.replace(".json", "") rid = fname.replace(".json", "")
@@ -148,12 +153,21 @@ class CollectorBridge:
"""Poll Gateway for responses and write them locally for Extension.""" """Poll Gateway for responses and write them locally for Extension."""
while self._running: while self._running:
try: try:
# Skip cycle if rate-limited
if self.remote.is_rate_limited:
await asyncio.sleep(self._poll_interval)
continue
# Check each forwarded pending for a response # Check each forwarded pending for a response
for rid in list(self._forwarded_pending): for rid in list(self._forwarded_pending):
if rid in self._startup_pending: if rid in self._startup_pending:
continue # Don't poll responses for pre-startup files continue # Don't poll responses for pre-startup files
# Rate-limit guard: stop polling if we got rate-limited mid-cycle
if self.remote.is_rate_limited:
break
data = await self.remote.aread_json("response", f"{rid}.json") data = await self.remote.aread_json("response", f"{rid}.json")
if data is None or data.get("waiting"): if data is None or data.get("waiting"):
await asyncio.sleep(0.2) # Throttle between individual response polls
continue continue
# Write response locally for Extension to pick up # Write response locally for Extension to pick up
@@ -175,12 +189,14 @@ class CollectorBridge:
"""Poll Gateway for commands and write them locally for Extension.""" """Poll Gateway for commands and write them locally for Extension."""
while self._running: while self._running:
try: try:
commands = await self.remote.apoll_commands(self.project_name) # Skip cycle if rate-limited
for cmd in commands: if not self.remote.is_rate_limited:
cmd_id = cmd.get("id", str(int(time.time() * 1000))) commands = await self.remote.apoll_commands(self.project_name)
fname = f"{cmd_id}.json" for cmd in commands:
self.local.write_json("commands", fname, cmd) cmd_id = cmd.get("id", str(int(time.time() * 1000)))
logger.info(f"[COLLECTOR] ← Gateway: command {cmd.get('text', '?')[:30]}") fname = f"{cmd_id}.json"
self.local.write_json("commands", fname, cmd)
logger.info(f"[COLLECTOR] ← Gateway: command {cmd.get('text', '?')[:30]}")
except Exception as e: except Exception as e:
logger.error(f"[COLLECTOR] poll_commands error: {e}") logger.error(f"[COLLECTOR] poll_commands error: {e}")

View File

@@ -11,4 +11,4 @@
| 007 | 19:28~19:35 | Gateway HTTP API + Docker (Dockerfile, docker-compose, Caddyfile) | `6dbbb57` | ✅ | | 007 | 19:28~19:35 | Gateway HTTP API + Docker (Dockerfile, docker-compose, Caddyfile) | `6dbbb57` | ✅ |
| 008 | 19:35~19:50 | Gateway 보안: API Key 인증 미들웨어 + Caddy HTTPS + .env.example | `95da3e9` | ✅ | | 008 | 19:35~19:50 | Gateway 보안: API Key 인증 미들웨어 + Caddy HTTPS + .env.example | `95da3e9` | ✅ |
| 009 | 19:50~20:10 | RemoteTransport + CollectorBridge 구현 — Collector↔Gateway HTTP 통신 | `95c2905` | ✅ | | 009 | 19:50~20:10 | RemoteTransport + CollectorBridge 구현 — Collector↔Gateway HTTP 통신 | `95c2905` | ✅ |
| 010 | 21:30~23:48 | 아키텍처 감사: aiohttp 전환 + 보안 + 기능 누락 수정 + 나노 검증 | `d7ed454` | 🔧 | | 010 | 21:30~23:48 | 아키텍처 감사: aiohttp 전환 + 보안 + 기능 누락 수정 + 나노 검증 | `d7ed454` | |

View File

@@ -0,0 +1,5 @@
# 2026-03-12 Devlog
| # | 시간 | 작업 설명 | 커밋 | 상태 |
|---|------|----------|------|------|
| 001 | 00:34~00:47 | 429 Rate Limit 무한 루프 디버깅 — 지수 백오프 + rate limit 완화 + Collector 폴링 보호 | `d9b36cf` | ✅ |

View File

@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
# Rate limiting # Rate limiting
RATE_LIMIT_WINDOW = 1.0 # seconds RATE_LIMIT_WINDOW = 1.0 # seconds
RATE_LIMIT_MAX = 10 # max requests per window per IP RATE_LIMIT_MAX = 30 # max requests per window per IP (Collector needs ~15-20/cycle)
COMMAND_TTL = 1800 # 30 min — stale commands auto-deleted COMMAND_TTL = 1800 # 30 min — stale commands auto-deleted
@@ -89,6 +89,7 @@ class GatewayAPI:
return web.json_response( return web.json_response(
{"error": "Too Many Requests"}, {"error": "Too Many Requests"},
status=429, status=429,
headers={"Retry-After": str(int(RATE_LIMIT_WINDOW * 2))},
) )
window.append(now) window.append(now)
self._rate_limits[ip] = window self._rate_limits[ip] = window