fix(tools): 애니 파이프라인 버그 4건 수정 — Unicode SyntaxError, 카타카나 변환, Nyaa 검색 전략 + Python 환경 설정

2026-03-08 17:43:54 +09:00
parent a3a083f6da
commit 1dab257401
12 changed files with 109 additions and 30 deletions
--- a/tools/anime_pipeline.py
+++ b/tools/anime_pipeline.py
@@ -81,16 +81,37 @@ class AnimePipeline:
        except Exception as e:
            result.errors.append(f"자막 조회 오류: {e}")

-        # 3. Nyaa 토렌트 검색 (원제 로마자로)
+        # 3. Nyaa 토렌트 검색 (다중 전략 — suffix 있는/없는 조합)
        try:
            from tools.title_matcher import japanese_to_romaji
-            romaji_title = japanese_to_romaji(anime.original_subject)
+            import re as _re

-            # 먼저 로마자로 검색
-            torrents = await self.nyaa.search(romaji_title)
-            if not torrents:
-                # 원제 그대로 검색
-                torrents = await self.nyaa.search(anime.original_subject)
+            romaji_full = japanese_to_romaji(anime.original_subject)
+            # 한자 등 비ASCII 잔류 문자를 공백으로 치환 → ASCII 문자(로마자·숫자·기호)만 남김
+            romaji_clean = _re.sub(r'[^\x00-\x7F]+', ' ', romaji_full).strip()
+            romaji_clean = _re.sub(r'\s+', ' ', romaji_clean)
+
+            # 검색 전략 (query, use_default_suffix) 순서
+            strategies: list[tuple[str, bool]] = []
+            if romaji_clean and len(romaji_clean) >= 3:
+                strategies.append((romaji_clean, True))       # romaji + ASW HEVC
+                strategies.append((romaji_clean, False))      # romaji only
+            strategies.append((anime.original_subject, True))   # 원제 + suffix
+            strategies.append((anime.original_subject, False))  # 원제 only
+            strategies.append((anime.subject, True))            # 한글 + suffix
+            strategies.append((anime.subject, False))           # 한글 only
+
+            torrents = []
+            for query, use_suffix in strategies:
+                torrents = await self.nyaa.search(
+                    query, use_default_suffix=use_suffix,
+                )
+                if torrents:
+                    suffix_label = " +suffix" if use_suffix else ""
+                    logger.info(
+                        f"Nyaa 검색 성공: '{query}'{suffix_label} → {len(torrents)}건"
+                    )
+                    break

            # 제목 매칭 필터링
            matched = match_titles(
--- a/tools/nas_scanner.py
+++ b/tools/nas_scanner.py
@@ -1,4 +1,4 @@
-"""NAS 폴더 스캐너 — 다운로드된 애니 목록 + 파일 정보 조회.
+r"""NAS 폴더 스캐너 — 다운로드된 애니 목록 + 파일 정보 조회.

 NAS Animation 폴더 구조:
  \\192.168.10.10\NasData\Video\Animation\
--- a/tools/title_matcher.py
+++ b/tools/title_matcher.py
@@ -53,11 +53,12 @@ _KATA_OFFSET = ord('ア') - ord('あ')


 def _kata_to_hira(text: str) -> str:
-    """카타카나를 히라가나로 변환."""
+    """카타카나를 히라가나로 변환 (ー 장음기호는 유지)."""
    result = []
    for ch in text:
        cp = ord(ch)
-        if 0x30A0 <= cp <= 0x30FF:  # 카타카나 범위
+        # 카타카나 범위이되, ー(U+30FC), ・(U+30FB) 등 기호는 제외
+        if 0x30A1 <= cp <= 0x30F6:  # ァ~ヶ (실제 카타카나 문자만)
            result.append(chr(cp - _KATA_OFFSET))
        else:
            result.append(ch)
@@ -71,6 +72,11 @@ def japanese_to_romaji(text: str) -> str:
    result = []
    i = 0
    while i < len(text):
+        # 장음 기호 (ー U+30FC, ー가 히라가나로 안 변환되므로 여기서 처리)
+        if text[i] == '\u30FC':  # ー
+            # 장음: 원칙적으로는 이전 모음을 반복해야 하지만, 간략화를 위해 건너뜀
+            i += 1
+            continue
        # 2글자 매칭 우선 (きゃ 등)
        if i + 1 < len(text) and text[i:i+2] in _KANA_ROMAJI:
            result.append(_KANA_ROMAJI[text[i:i+2]])
@@ -85,8 +91,6 @@ def japanese_to_romaji(text: str) -> str:
            else:
                result.append(romaji)
            i += 1
-        elif text[i] == 'ー':  # 장음
-            i += 1
        else:
            # 한자, 영어, 숫자 등 → 그대로
            result.append(text[i])