fix(bot): multi-project signal freeze — cache-only _get_channel + per-tick scanner cap

Root cause: When 3+ projects generated pending requests simultaneously, the
bot's pending_approval_scanner made 20-40 Discord API calls in one tick
(sequentially awaited), triggering Discord 429 rate limits that blocked
the entire scanner for 10-30s and froze ALL signal delivery.

Two fixes:
1. _get_channel(): Replace guild.fetch_channels() (API call) with
   discord.utils.get(guild.channels) (in-memory cache). Eliminates
   redundant API calls and lock contention when multiple projects arrive.
2. pending_approval_scanner: Per-tick caps (5 new pending requests +
   5 status changes) prevent one tick from monopolizing the Discord API
   quota. Excess items are processed in subsequent 3-second ticks.
This commit is contained in:
Variet Worker
2026-03-16 07:06:51 +09:00
parent 64f80212c3
commit 37c0aae41c
2 changed files with 46 additions and 20 deletions

60
bot.py
View File

@@ -356,32 +356,34 @@ class GravityBot(commands.Bot):
logger.info(f"Discovered {len(self.project_channels)} project channels")
async def _get_channel(self, project_name: str) -> discord.TextChannel:
"""Get or create a channel for a project. Lock-protected."""
"""Get or create a channel for a project.
Uses guild.channels cache first (NO API call), only locks + creates
if channel truly doesn't exist. This prevents O(N) fetch_channels()
API calls when multiple projects arrive simultaneously.
"""
if project_name in self.project_channels:
return self.project_channels[project_name]
channel_name = self._make_channel_name(project_name)
# 1. Check guild channel cache (NO API call — instant)
existing = discord.utils.get(
self.guild.channels, name=channel_name,
category_id=self.session_category.id,
)
if existing and isinstance(existing, discord.TextChannel):
self.project_channels[project_name] = existing
self.channel_to_project[existing.id] = project_name
logger.info(f"Found channel (cache): #{channel_name}")
return existing
# 2. Only lock + API call if truly creating new channel
async with self._channel_lock:
# Double-check after lock
# Double-check after lock (another coroutine may have created it)
if project_name in self.project_channels:
return self.project_channels[project_name]
channel_name = self._make_channel_name(project_name)
# Search existing channels FIRST (prevents duplicates)
try:
all_channels = await self.guild.fetch_channels()
for ch in all_channels:
if (isinstance(ch, discord.TextChannel)
and ch.name == channel_name
and ch.category_id == self.session_category.id):
self.project_channels[project_name] = ch
self.channel_to_project[ch.id] = project_name
logger.info(f"Found existing channel: #{channel_name}")
return ch
except Exception as e:
logger.warning(f"fetch_channels failed: {e}")
# No existing channel — create new
try:
ch = await self.guild.create_text_channel(
name=channel_name,
@@ -543,7 +545,11 @@ class GravityBot(commands.Bot):
@tasks.loop(seconds=3)
async def pending_approval_scanner(self):
"""Scan bridge/pending/ for new approval requests + reload registrations."""
"""Scan bridge/pending/ for new approval requests + reload registrations.
Per-tick caps prevent Discord API rate limit cascade when multiple
projects generate pending files simultaneously.
"""
try:
# Reload conv→project registrations each cycle
self._load_registrations()
@@ -551,8 +557,14 @@ class GravityBot(commands.Bot):
# Channels are created on-demand when actual signals arrive
# (via _get_channel in snapshot scanner / approval sender)
MAX_NEW_PER_TICK = 5 # Phase 1: max new pending to process per tick
MAX_STATUS_PER_TICK = 5 # Phase 2: max status changes to process per tick
phase1_processed = 0
requests = self.bridge.get_pending_requests()
for req in requests:
if phase1_processed >= MAX_NEW_PER_TICK:
break
if req.request_id in self._sent_approval_ids:
continue
if req.discord_message_id != 0:
@@ -571,6 +583,7 @@ class GravityBot(commands.Bot):
if req.command.strip().lower() in reject_commands:
logger.warning(f"Auto-approve BLOCKED: command='{req.command}' is reject-word — skipping")
self._sent_approval_ids.add(req.request_id)
phase1_processed += 1
continue
self._sent_approval_ids.add(req.request_id)
@@ -614,6 +627,7 @@ class GravityBot(commands.Bot):
embed.set_footer(text=f"auto-approve | {req.request_id[:12]}")
await channel.send(embed=embed)
logger.info(f"Auto-approved: {req.request_id[:12]} project={project} btn_idx={approve_btn_index}")
phase1_processed += 1
continue
# Defer short-command pendings (e.g. "Run") by 4 cycles (~12s)
@@ -640,9 +654,13 @@ class GravityBot(commands.Bot):
self._sent_approval_ids.add(req.request_id)
self._sent_commands[req.request_id] = req.command
await self._send_approval_request(channel, req)
phase1_processed += 1
# ── Single-pass: handle auto_resolved, expired, and MERGE in one glob ──
phase2_processed = 0
for f in self.bridge.pending_dir.glob("*.json"):
if phase2_processed >= MAX_STATUS_PER_TICK:
break
try:
data = json.loads(f.read_text(encoding="utf-8-sig"))
status = data.get("status", "pending")
@@ -675,6 +693,7 @@ class GravityBot(commands.Bot):
self._sent_commands.pop(rid, None)
self._approval_messages.pop(rid, None)
self._sent_approval_ids.discard(rid)
phase2_processed += 1
elif status == "expired":
msg_id = data.get("discord_message_id", 0)
@@ -697,6 +716,7 @@ class GravityBot(commands.Bot):
self._deferred_ids.pop(rid, None)
self._sent_commands.pop(rid, None)
self._sent_approval_ids.discard(rid)
phase2_processed += 1
elif status == "pending":
# MERGE check: step_probe updated command in already-sent pending