refactor: 전면 재설계 - 시작 시 채널 스팸 제거, content hash 중복 방지, 단일 이벤트 경로

This commit is contained in:
2026-03-07 11:42:11 +09:00
parent 52fed8c1d3
commit e32be6b2f3
3 changed files with 155 additions and 312 deletions

View File

@@ -2,9 +2,12 @@
Uses watchdog to detect file creation/modification events in the brain directory.
Emits events to an asyncio queue for the Discord bot to consume.
Key design: ONLY emits events for meaningful content changes using hash dedup.
"""
import asyncio
import hashlib
import time
import logging
from pathlib import Path
@@ -21,8 +24,7 @@ logger = logging.getLogger(__name__)
class EventType(Enum):
    """Kinds of events the brain watcher can emit.

    Each member's value is a short string identifier (it is what appears
    in log messages via ``event_type.value``).
    """

    # Each member name is defined exactly once: Enum rejects a reused name
    # with a TypeError at class-creation time.
    SESSION_START = "session_start"  # New conversation directory created
    SESSION_END = "session_end"  # Conversation directory removed (or program exit)
    FILE_CHANGED = "file_changed"  # Watched file modified
    FILE_CREATED = "file_created"  # Watched file first created
@@ -38,18 +40,19 @@ class BrainEvent:
class BrainEventHandler(FileSystemEventHandler):
"""Watchdog handler that filters and debounces brain events."""
"""Watchdog handler that filters, debounces, and deduplicates brain events."""
def __init__(self, event_queue: asyncio.Queue, loop: asyncio.AbstractEventLoop):
    """Store the queue and loop, and set up per-path bookkeeping.

    Args:
        event_queue: Queue that emitted events are put onto.
        loop: Event loop on which the queue puts are scheduled.
    """
    super().__init__()
    self.event_queue = event_queue
    self.loop = loop
    self._last_events: dict[str, float] = {}  # path -> timestamp (debounce)
    self._content_hashes: dict[str, str] = {}  # path -> md5 hash (dedup)
    self._known_sessions: set[str] = set()
    # Record the sessions that already exist at startup as the baseline.
    self._initialize_known_sessions()
def _initialize_known_sessions(self):
"""Scan existing brain directories to establish baseline."""
"""Scan existing brain directories to establish baseline (no events emitted)."""
brain_path = Config.BRAIN_PATH
if brain_path.exists():
for entry in brain_path.iterdir():
@@ -58,12 +61,10 @@ class BrainEventHandler(FileSystemEventHandler):
logger.info(f"Found {len(self._known_sessions)} existing sessions at startup")
def _is_conversation_id(self, name: str) -> bool:
"""Check if directory name looks like a UUID conversation ID."""
parts = name.split("-")
return len(parts) == 5 and all(len(p) >= 4 for p in parts)
def _get_conversation_id(self, path: Path) -> str | None:
"""Extract conversation ID from file path."""
brain_path = Config.BRAIN_PATH
try:
relative = path.relative_to(brain_path)
@@ -75,7 +76,6 @@ class BrainEventHandler(FileSystemEventHandler):
return None
def _should_debounce(self, path_str: str) -> bool:
"""Check if this event should be debounced."""
now = time.time()
last = self._last_events.get(path_str, 0)
if now - last < Config.DEBOUNCE_SECONDS:
@@ -83,8 +83,20 @@ class BrainEventHandler(FileSystemEventHandler):
self._last_events[path_str] = now
return False
def _content_changed(self, path_str: str, content: str) -> bool:
"""Check if content actually changed using MD5 hash."""
new_hash = hashlib.md5(content.encode()).hexdigest()
old_hash = self._content_hashes.get(path_str)
if old_hash == new_hash:
return False
self._content_hashes[path_str] = new_hash
return True
def _is_watched_file(self, file_name: str) -> bool:
    """Return True only if *file_name* is listed in ``Config.WATCHED_FILES``.

    This is an exact-name membership test; no suffix or pattern matching
    is applied.
    """
    watched_names = Config.WATCHED_FILES
    return file_name in watched_names
def _emit(self, event: BrainEvent):
    """Hand *event* to the asyncio queue via the event loop.

    The put is scheduled with ``call_soon_threadsafe`` rather than being
    executed directly by the caller.
    """
    enqueue = self.event_queue.put_nowait
    self.loop.call_soon_threadsafe(enqueue, event)
def on_created(self, event: FileSystemEvent):
@@ -98,10 +110,8 @@ class BrainEventHandler(FileSystemEventHandler):
self._handle_file_event(Path(event.src_path), EventType.FILE_CHANGED)
def _handle_directory_created(self, path: Path):
"""Detect new session directories."""
conv_id = self._get_conversation_id(path)
if conv_id and conv_id not in self._known_sessions:
# Check if this is a direct child of brain/
if path.parent == Config.BRAIN_PATH:
self._known_sessions.add(conv_id)
logger.info(f"New session detected: {conv_id}")
@@ -111,17 +121,17 @@ class BrainEventHandler(FileSystemEventHandler):
))
def _handle_file_event(self, path: Path, event_type: EventType):
"""Process file creation/modification events."""
conv_id = self._get_conversation_id(path)
if not conv_id:
return
file_name = path.name
if file_name not in Config.WATCHED_FILES:
# Check suffix patterns
if not any(file_name.endswith(s) for s in Config.WATCHED_SUFFIXES):
return
# STRICT filter: only primary artifacts
if not self._is_watched_file(file_name):
return
# Debounce: skip rapid-fire events for same file
if self._should_debounce(str(path)):
return
@@ -132,7 +142,11 @@ class BrainEventHandler(FileSystemEventHandler):
logger.warning(f"Failed to read {path}: {e}")
return
logger.info(f"File event: {event_type.value} {conv_id}/{file_name}")
# Content hash dedup: skip if content hasn't actually changed
if not self._content_changed(str(path), content):
return
logger.info(f"File event: {event_type.value} {conv_id[:8]}/{file_name}")
self._emit(BrainEvent(
event_type=event_type,
conversation_id=conv_id,
@@ -152,7 +166,6 @@ class BrainWatcher:
self.handler = BrainEventHandler(event_queue, loop)
def start(self):
"""Start watching the brain directory."""
brain_path = Config.BRAIN_PATH
if not brain_path.exists():
logger.error(f"Brain path does not exist: {brain_path}")
@@ -163,7 +176,6 @@ class BrainWatcher:
logger.info(f"Watching brain directory: {brain_path}")
def stop(self):
    """Stop the observer, join it, and log that the watcher has stopped."""
    observer = self.observer
    observer.stop()
    observer.join()
    logger.info("Brain watcher stopped")