feat(tools): 애니메이션 자동화 파이프라인 구현

- tools/anissia_client.py: Anissia API 클라이언트 (편성표/자막)
- tools/nyaa_client.py: Nyaa.si RSS 토렌트 검색
- tools/qbit_client.py: qBittorrent Web API 클라이언트
- tools/subtitle_downloader.py: Google Drive/Tistory/Naver 자막 파서
- tools/title_matcher.py: 제목 매칭 + NAS 폴더명 생성
- tools/anime_pipeline.py: 전체 파이프라인 오케스트레이터
- tools/nas_scanner.py: NAS 폴더/파일 스캔
- prompts/unified.md: anime 모드 추가 (AI 평문 의도 분류)
- api/discord_bot.py: AI 평문 anime 핸들러 + /anime 슬래시 커맨드
- config.py: qBittorrent/NAS 설정 추가
- .agents/: agent_guide 워크플로우 통합
- docs/devlog: 세션 기록
2026-03-08 16:07:16 +09:00
parent 49ee5f397c
commit c92433b0b1
36 changed files with 3663 additions and 128 deletions
--- a/tools/subtitle_downloader.py
+++ b/tools/subtitle_downloader.py
@@ -0,0 +1,260 @@
+"""자막 파일 다운로더 — 3개 플랫폼 파서.
+
+지원 플랫폼:
+1. Google Drive (Blogspot 제작자 대부분)
+2. Tistory (Kakao CDN 직접 다운로드)
+3. Naver Blog (네이티브 첨부파일)
+"""
+
+import httpx
+import logging
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+from urllib.parse import unquote
+
+logger = logging.getLogger("variet.tools.subtitle")
+
+
+@dataclass
+class SubtitleFile:
+    """Metadata for a subtitle file that was discovered and, optionally, downloaded."""
+    # File name to save under; parsers derive it from the link label or URL.
+    filename: str
+    # URL the file content is fetched from.
+    download_url: str
+    platform: str       # google_drive, tistory, naver (or "generic" for direct links)
+    # Episode number parsed from the label/file name; None when not detected.
+    episode: Optional[int] = None
+    local_path: Optional[str] = None  # local path, set once the file has been downloaded
+
+
+def _extract_episode_from_text(text: str) -> Optional[int]:
+    """Return the episode number mentioned in *text*, or ``None`` if absent.
+
+    Notations are tried in priority order and the first one that matches
+    decides the result:
+
+    1. Korean counter: ``9화``, ``09화``, ``9 화``
+    2. Dash-separated number: ``- 09`` (followed by whitespace, ``.`` or end)
+    3. English prefix, case-insensitive: ``Episode 9``, ``EP09``
+    """
+    notations = (
+        (r'(\d{1,4})\s*화', 0),
+        (r'[-–]\s*(\d{1,4})(?:\s|$|\.)', 0),
+        (r'(?:EP|Episode)\s*(\d{1,4})', re.IGNORECASE),
+    )
+    for regex, flags in notations:
+        hit = re.search(regex, text, flags)
+        if hit is not None:
+            return int(hit.group(1))
+    return None
+
+
+# ──────────────────────────────────────────────
+#  1. Google Drive 파서
+# ──────────────────────────────────────────────
+
+def parse_google_drive_links(html: str) -> list[SubtitleFile]:
+    """Extract Google Drive file links from *html* as direct-download entries.
+
+    Share links of the form ``drive.google.com/file/d/{fileId}/...`` are
+    rewritten to ``drive.google.com/uc?id={fileId}&export=download``.
+
+    Anchors are scanned first so that the link label can supply the file name
+    and the episode number. The label may be wrapped in inline markup
+    (``<a ...><span>09화</span></a>``) and may contain HTML entities: tags are
+    stripped and entities decoded before the label is used. ``/view`` URLs
+    that appear outside such an anchor are appended afterwards with a
+    generated name and no episode.
+
+    Args:
+        html: Raw HTML of the page to scan.
+
+    Returns:
+        One ``SubtitleFile`` per distinct Drive file id: labelled anchors
+        first (document order), then bare ``/view`` URLs.
+    """
+    # Function-scope import: the ``html`` parameter shadows the module name.
+    from html import unescape
+
+    # Accepts either quote style around href and any markup inside the anchor.
+    anchor_re = re.compile(
+        r'<a\b[^>]*?href=(?P<q>["\'])'
+        r'https://drive\.google\.com/file/d/(?P<id>[a-zA-Z0-9_-]+)[^"\']*(?P=q)'
+        r'[^>]*>(?P<inner>.*?)</a>',
+        re.IGNORECASE | re.DOTALL,
+    )
+    bare_re = re.compile(
+        r'https://drive\.google\.com/file/d/([a-zA-Z0-9_-]+)/view'
+    )
+
+    results: list[SubtitleFile] = []
+    seen_ids: set[str] = set()
+
+    for anchor in anchor_re.finditer(html):
+        file_id = anchor.group("id")
+        if file_id in seen_ids:
+            continue
+        seen_ids.add(file_id)
+
+        # Drop nested tags, decode entities, and collapse whitespace so the
+        # label is usable both for episode detection and as a file name.
+        without_tags = re.sub(r'<[^>]+>', ' ', anchor.group("inner"))
+        label = " ".join(unescape(without_tags).split())
+
+        results.append(SubtitleFile(
+            filename=label or f"subtitle_{file_id}",
+            download_url=f"https://drive.google.com/uc?id={file_id}&export=download",
+            platform="google_drive",
+            episode=_extract_episode_from_text(label),
+        ))
+
+    # Drive URLs that were not part of a recognised anchor.
+    for file_id in bare_re.findall(html):
+        if file_id in seen_ids:
+            continue
+        seen_ids.add(file_id)
+        results.append(SubtitleFile(
+            filename=f"subtitle_{file_id}",
+            download_url=f"https://drive.google.com/uc?id={file_id}&export=download",
+            platform="google_drive",
+        ))
+
+    return results
+
+
+# ──────────────────────────────────────────────
+#  2. Tistory 파서
+# ──────────────────────────────────────────────
+
+def parse_tistory_links(html: str) -> list[SubtitleFile]:
+    """Extract Tistory (Kakao CDN) subtitle download links from *html*.
+
+    Matches ``https://blog.kakaocdn.net/...`` URLs that mention a subtitle or
+    archive extension (zip, ass, srt, ssa, sub), e.g.
+    ``blog.kakaocdn.net/dna/.../filename.zip?...``.
+
+    Args:
+        html: Raw HTML of the page to scan.
+
+    Returns:
+        One ``SubtitleFile`` per distinct URL, in document order. The
+        ``download_url`` has HTML entities decoded (``&amp;`` -> ``&``) so the
+        query string is sent to the CDN intact.
+    """
+    # Function-scope imports: the ``html`` parameter shadows the module name,
+    # and ``urlsplit`` is only needed here.
+    from html import unescape
+    from urllib.parse import urlsplit
+
+    url_re = re.compile(
+        r'https://blog\.kakaocdn\.net/[^"]+\.(?:zip|ass|srt|ssa|sub)[^"]*',
+        re.IGNORECASE,
+    )
+    ext_re = re.compile(r'\.(zip|ass|srt|ssa|sub)', re.IGNORECASE)
+    known_suffixes = (".zip", ".ass", ".srt", ".ssa", ".sub")
+
+    results: list[SubtitleFile] = []
+    seen_urls: set[str] = set()
+
+    for raw_url in url_re.findall(html):
+        url = unescape(raw_url)
+        if url in seen_urls:
+            continue
+        seen_urls.add(url)
+
+        # Prefer the last path segment: split off the query *before*
+        # percent-decoding so an encoded "?" or "/" cannot confuse parsing.
+        leaf = unquote(urlsplit(url).path.rsplit("/", 1)[-1])
+        leaf = leaf.replace("\\", "/").rsplit("/", 1)[-1]
+
+        if leaf.lower().endswith(known_suffixes):
+            filename = leaf
+        else:
+            # The extension only shows up later in the URL (e.g. in the
+            # query); fall back to searching the decoded URL, then to a
+            # generic name carrying the last extension seen.
+            ext = ext_re.findall(raw_url)[-1]
+            found = re.search(r'/([^/?]+\.' + ext + r')', unquote(url))
+            filename = found.group(1) if found else f"subtitle.{ext}"
+
+        results.append(SubtitleFile(
+            filename=filename,
+            download_url=url,
+            platform="tistory",
+            episode=_extract_episode_from_text(filename),
+        ))
+
+    return results
+
+
+# ──────────────────────────────────────────────
+#  3. Naver Blog 파서
+# ──────────────────────────────────────────────
+
+def parse_naver_links(html: str) -> list[SubtitleFile]:
+    """Collect Naver Blog attachment links found in *html*.
+
+    Every double-quoted ``href`` that points at ``download.blog.naver.com`` or
+    ``blogfiles.pstatic.net`` becomes one entry, e.g. the target of
+    ``<a class="se-file-save-button" href="https://download.blog.naver.com/...">``.
+    The file name is read from the percent-decoded URL when it carries a known
+    extension; otherwise the placeholder ``subtitle_naver`` is used.
+    """
+    href_re = r'href="(https://(?:download\.blog\.naver\.com|blogfiles\.pstatic\.net)/[^"]+)"'
+    name_re = r'/([^/?]+\.(?:zip|ass|srt|ssa|sub|7z))'
+
+    def _describe(link: str) -> SubtitleFile:
+        # Look for the file name in the decoded URL; keep the raw URL for download.
+        found = re.search(name_re, unquote(link), re.IGNORECASE)
+        label = "subtitle_naver" if found is None else found.group(1)
+        return SubtitleFile(
+            filename=label,
+            download_url=link,
+            platform="naver",
+            episode=_extract_episode_from_text(label),
+        )
+
+    return [_describe(link) for link in re.findall(href_re, html)]
+
+
+# ──────────────────────────────────────────────
+#  통합 다운로더
+# ──────────────────────────────────────────────
+
+class SubtitleDownloader:
+    """Find subtitle attachments on a web page and download them.
+
+    Files are saved under ``download_dir``; when none is given,
+    ``<current working directory>/subtitles`` is used.
+    """
+
+    def __init__(self, download_dir: str = ""):
+        self.download_dir = Path(download_dir) if download_dir else Path.cwd() / "subtitles"
+
+    @staticmethod
+    def _safe_filename(name: str, fallback: str = "subtitle") -> str:
+        """Turn *name* into a single path component that is safe to join.
+
+        Path separators and NUL bytes are replaced with ``_`` so a name taken
+        from a link label or a response header can never escape the target
+        directory. Empty names and ``.``/``..`` yield *fallback*.
+        """
+        cleaned = re.sub(r'[\\/\x00]', '_', name).strip()
+        return fallback if cleaned in ("", ".", "..") else cleaned
+
+    @staticmethod
+    def _filename_from_content_disposition(header: str) -> Optional[str]:
+        """Return the file name announced by a Content-Disposition header.
+
+        The extended ``filename*=charset'lang'value`` form is preferred over
+        plain ``filename=``; parameter names and the charset are matched
+        case-insensitively. Returns ``None`` when no name is present.
+        """
+        extended = re.search(
+            r'filename\*\s*=\s*"?([^\'";\s]*)\'[^\']*\'([^";\r\n]+)',
+            header,
+            re.IGNORECASE,
+        )
+        if extended:
+            charset = extended.group(1) or "utf-8"
+            value = extended.group(2).strip()
+            try:
+                return unquote(value, encoding=charset) or None
+            except LookupError:
+                # Unknown charset label: decode as UTF-8 instead of failing.
+                return unquote(value) or None
+
+        plain = re.search(
+            r'filename\s*=\s*(?:"([^"]*)"|([^;\r\n]+))',
+            header,
+            re.IGNORECASE,
+        )
+        if plain:
+            value = plain.group(1) if plain.group(1) is not None else plain.group(2)
+            # Some servers percent-encode the plain form as well.
+            return unquote(value.strip().strip("'")) or None
+        return None
+
+    async def fetch_page(self, url: str) -> str:
+        """Fetch *url* and return the response body as text.
+
+        ``blog.naver.com/{blogId}/{logNo}`` URLs are rewritten to the
+        ``PostView.naver`` URL first, because the regular post URL only
+        serves an iframe wrapper.
+
+        Raises:
+            httpx.HTTPStatusError: via ``raise_for_status()`` on a 4xx/5xx reply.
+        """
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+            "Accept-Language": "ko-KR,ko;q=0.9",
+        }
+
+        # Naver Blog iframe bypass
+        if "blog.naver.com" in url and "PostView" not in url:
+            # blog.naver.com/{blogId}/{logNo} -> PostView URL
+            m = re.search(r'blog\.naver\.com/([^/]+)/(\d+)', url)
+            if m:
+                url = f"https://blog.naver.com/PostView.naver?blogId={m.group(1)}&logNo={m.group(2)}"
+
+        async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
+            resp = await client.get(url, headers=headers)
+            resp.raise_for_status()
+            return resp.text
+
+    async def find_subtitles(self, url: str) -> list[SubtitleFile]:
+        """Detect subtitle file links on the page at *url*.
+
+        Runs each platform parser whose host name appears in the page, then
+        adds any remaining ``href`` that ends in a subtitle/archive extension
+        as a ``generic`` entry. Generic links are entity-decoded and resolved
+        against *url*, so relative links become downloadable absolute URLs.
+
+        Returns:
+            The discovered files: platform-specific entries first, then
+            generic ones, without duplicate download URLs among the latter.
+        """
+        # Function-scope imports keep the module's import block untouched.
+        from html import unescape
+        from urllib.parse import urljoin
+
+        page = await self.fetch_page(url)
+
+        results: list[SubtitleFile] = []
+
+        # Auto-detect the hosting platform(s), then parse.
+        if "drive.google.com" in page:
+            results.extend(parse_google_drive_links(page))
+
+        if "blog.kakaocdn.net" in page:
+            results.extend(parse_tistory_links(page))
+
+        if "download.blog.naver.com" in page or "blogfiles.pstatic.net" in page:
+            results.extend(parse_naver_links(page))
+
+        # Generic: direct links to subtitle files.
+        generic_pattern = r'href="([^"]+\.(?:ass|srt|ssa|sub|zip|7z))"'
+        seen_urls = {r.download_url for r in results}
+        for raw_href in re.findall(generic_pattern, page, re.IGNORECASE):
+            href = unescape(raw_href)
+            absolute = urljoin(url, href)
+            # Compare every spelling, since a platform parser may have stored
+            # the link either raw or entity-decoded.
+            if not seen_urls.isdisjoint((raw_href, href, absolute)):
+                continue
+            seen_urls.add(absolute)
+
+            # Decode before episode detection: "%20-%2009" or "9%ED%99%94"
+            # only match once percent-decoded.
+            filename = unquote(href.split("/")[-1].split("?")[0])
+            results.append(SubtitleFile(
+                filename=filename,
+                download_url=absolute,
+                platform="generic",
+                episode=_extract_episode_from_text(filename),
+            ))
+
+        logger.info(f"자막 {len(results)}건 발견: {url}")
+        return results
+
+    async def download_file(
+        self,
+        sub: SubtitleFile,
+        save_dir: Optional[str] = None,
+    ) -> str:
+        """Download *sub* and store it locally.
+
+        The file name announced in the response's Content-Disposition header
+        takes precedence over ``sub.filename``. Whichever name is used is
+        reduced to a single safe path component before it is joined to the
+        target directory, so neither a remote server nor a link label can
+        make the write land outside that directory.
+
+        Args:
+            sub: Entry to download; ``filename`` and ``local_path`` are updated.
+            save_dir: Directory to save into; defaults to ``self.download_dir``.
+
+        Returns:
+            The path of the saved file, as a string.
+
+        Raises:
+            httpx.HTTPStatusError: via ``raise_for_status()`` on a 4xx/5xx reply.
+        """
+        target_dir = Path(save_dir) if save_dir else self.download_dir
+        target_dir.mkdir(parents=True, exist_ok=True)
+
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+        }
+
+        # Naver downloads are sent with a blog Referer header.
+        if sub.platform == "naver":
+            headers["Referer"] = "https://blog.naver.com/"
+
+        async with httpx.AsyncClient(
+            timeout=60, follow_redirects=True, max_redirects=5
+        ) as client:
+            resp = await client.get(sub.download_url, headers=headers)
+            resp.raise_for_status()
+            payload = resp.content
+            # The real file name, if the server announces one.
+            served_name = self._filename_from_content_disposition(
+                resp.headers.get("content-disposition", "")
+            )
+
+        sub.filename = self._safe_filename(served_name or sub.filename)
+        filepath = target_dir / sub.filename
+        filepath.write_bytes(payload)
+
+        sub.local_path = str(filepath)
+        logger.info(f"자막 다운로드 완료: {filepath}")
+        return str(filepath)