"""자막 파일 다운로더 — 3개 플랫폼 파서.
지원 플랫폼:
1. Google Drive (Blogspot 제작자 대부분)
2. Tistory (Kakao CDN 직접 다운로드)
3. Naver Blog (네이티브 첨부파일)
"""
import httpx
import logging
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
from urllib.parse import unquote
# Module-level logger; records are emitted under the "variet.tools.subtitle" name.
logger = logging.getLogger("variet.tools.subtitle")
@dataclass
class SubtitleFile:
    """Describe one subtitle file that was discovered on a page or downloaded.

    Attributes:
        filename: Name of the subtitle file (or a label standing in for it).
        download_url: URL the file is fetched from.
        platform: Tag of the source the link came from; the parsers in this
            module use ``google_drive``, ``tistory`` and ``naver``.
        episode: Episode number when one could be determined, else ``None``.
        local_path: Location on disk once the file has been downloaded,
            else ``None``.
    """

    # Required identification of the remote file.
    filename: str
    download_url: str
    platform: str

    # Optional details filled in when known.
    episode: Optional[int] = None
    local_path: Optional[str] = None
def _extract_episode_from_text(text: str) -> Optional[int]:
"""텍스트에서 화수 추출."""
# "9화", "09화", "9 화"
m = re.search(r'(\d{1,4})\s*화', text)
if m:
return int(m.group(1))
# "- 09"
m = re.search(r'[-–]\s*(\d{1,4})(?:\s|$|\.)', text)
if m:
return int(m.group(1))
# "Episode 9", "EP09"
m = re.search(r'(?:EP|Episode)\s*(\d{1,4})', text, re.IGNORECASE)
if m:
return int(m.group(1))
return None
# ──────────────────────────────────────────────
# 1. Google Drive 파서
# ──────────────────────────────────────────────
def parse_google_drive_links(html: str) -> list[SubtitleFile]:
    """Extract Google Drive file links from *html* as direct-download entries.

    Share links of the form ``drive.google.com/file/d/{fileId}/...`` are
    rewritten to ``drive.google.com/uc?id={fileId}&export=download``.

    Two passes are made over the markup:

    1. ``href="..."`` attributes pointing at a Drive file. The text that
       follows the tag is used as the file name and as the source for the
       episode number.
    2. Any remaining ``.../file/d/{fileId}/view`` URL that was not already
       seen; these get a generated ``subtitle_{fileId}`` name and no episode.

    Args:
        html: Raw HTML of the page to scan.

    Returns:
        One ``SubtitleFile`` per distinct file id, with ``platform`` set to
        ``"google_drive"``. Linked entries come first, in document order.
    """
    drive_file_prefix = r'https://drive\.google\.com/file/d/'
    id_chars = r'[a-zA-Z0-9_-]+'

    anchor_re = re.compile(
        r'href="(' + drive_file_prefix + r'[^"]+)"[^>]*>([^<]*)'
    )
    # The id ends at a path/query/fragment delimiter or at the end of the URL,
    # so a link such as ".../file/d/<id>?usp=sharing" is accepted as well.
    id_re = re.compile(r'/d/(' + id_chars + r')(?=[/?#]|$)')
    # No greedy tail after "/view": consuming up to the next double quote
    # would hide further links that appear in plain text.
    view_re = re.compile(drive_file_prefix + r'(' + id_chars + r')/view')

    results = []
    seen_ids = set()

    # Pass 1: links with surrounding text we can mine for a name and episode.
    for url, text in anchor_re.findall(html):
        id_match = id_re.search(url)
        if id_match is None:
            continue
        file_id = id_match.group(1)
        if file_id in seen_ids:
            continue
        seen_ids.add(file_id)
        results.append(SubtitleFile(
            filename=text.strip() or f"subtitle_{file_id}",
            download_url=f"https://drive.google.com/uc?id={file_id}&export=download",
            platform="google_drive",
            episode=_extract_episode_from_text(text),
        ))

    # Pass 2: bare "/view" URLs that did not come with link text.
    for file_id in view_re.findall(html):
        if file_id in seen_ids:
            continue
        seen_ids.add(file_id)
        results.append(SubtitleFile(
            filename=f"subtitle_{file_id}",
            download_url=f"https://drive.google.com/uc?id={file_id}&export=download",
            platform="google_drive",
        ))

    return results
# ──────────────────────────────────────────────
# 2. Tistory 파서
# ──────────────────────────────────────────────
def parse_tistory_links(html: str) -> list[SubtitleFile]:
    """Extract Tistory (Kakao CDN) subtitle download links from *html*.

    Matches ``https://blog.kakaocdn.net/...`` URLs that contain a
    ``.zip``, ``.ass``, ``.srt``, ``.ssa`` or ``.sub`` extension
    (case-insensitive), e.g. ``blog.kakaocdn.net/dna/.../filename.zip?...``.

    Args:
        html: Raw HTML of the page to scan.

    Returns:
        One ``SubtitleFile`` per distinct URL, in document order, with
        ``platform`` set to ``"tistory"``. The file name is taken from the
        percent-decoded URL; if none can be isolated, ``subtitle.<ext>`` is
        used. The episode number is derived from that file name.
    """
    url_re = re.compile(
        r'(https://blog\.kakaocdn\.net/[^"]+\.(zip|ass|srt|ssa|sub)[^"]*)',
        re.IGNORECASE,
    )

    results = []
    seen_urls = set()
    for url, ext in url_re.findall(html):
        # The same attachment URL can occur several times in the markup;
        # report each download URL only once.
        if url in seen_urls:
            continue
        seen_urls.add(url)

        # Last path-like component that ends with the matched extension.
        name_match = re.search(
            r'/([^/?]+\.' + re.escape(ext) + r')', unquote(url)
        )
        filename = name_match.group(1) if name_match else f"subtitle.{ext}"
        results.append(SubtitleFile(
            filename=filename,
            download_url=url,
            platform="tistory",
            episode=_extract_episode_from_text(filename),
        ))
    return results
# ──────────────────────────────────────────────
# 3. Naver Blog 파서
# ──────────────────────────────────────────────
def parse_naver_links(html: str) -> list[SubtitleFile]:
    """Extract Naver Blog attachment download links from *html*.

    Matches ``href`` attributes pointing at ``download.blog.naver.com`` or
    ``blogfiles.pstatic.net``.

    Args:
        html: Raw HTML of the page to scan.

    Returns:
        One ``SubtitleFile`` per distinct URL, in document order, with
        ``platform`` set to ``"naver"``. The file name is taken from the
        percent-decoded URL when it contains a component ending in ``.zip``,
        ``.ass``, ``.srt``, ``.ssa``, ``.sub`` or ``.7z`` (case-insensitive);
        otherwise the placeholder ``subtitle_naver`` is used. The episode
        number is derived from that file name.
    """
    href_re = re.compile(
        r'href="(https://(?:download\.blog\.naver\.com|blogfiles\.pstatic\.net)/[^"]+)"'
    )
    name_re = re.compile(
        r'/([^/?]+\.(?:zip|ass|srt|ssa|sub|7z))', re.IGNORECASE
    )

    results = []
    seen_urls = set()
    for url in href_re.findall(html):
        # The same attachment can be linked more than once on a page;
        # report each download URL only once.
        if url in seen_urls:
            continue
        seen_urls.add(url)

        name_match = name_re.search(unquote(url))
        filename = name_match.group(1) if name_match else "subtitle_naver"
        results.append(SubtitleFile(
            filename=filename,
            download_url=url,
            platform="naver",
            episode=_extract_episode_from_text(filename),
        ))
    return results
# ──────────────────────────────────────────────
# 통합 다운로더
# ──────────────────────────────────────────────
class SubtitleDownloader:
    """Find subtitle file links on a web page and download them to disk.

    Attributes:
        download_dir: Directory files are saved to when ``download_file`` is
            not given an explicit ``save_dir``. Defaults to ``./subtitles``
            under the current working directory.
    """

    _USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

    def __init__(self, download_dir: str = ""):
        """Store the default download directory (empty string selects ``./subtitles``)."""
        self.download_dir = Path(download_dir) if download_dir else Path.cwd() / "subtitles"

    @staticmethod
    def _safe_filename(name: str, fallback: str = "subtitle") -> str:
        """Reduce *name* to a single path component that is safe to join onto a directory.

        File names come from page markup and response headers, so they are
        untrusted: directory parts (``/`` or ``\\``), non-printable characters
        and the special names ``.`` / ``..`` are removed. *fallback* is
        returned when nothing usable remains.
        """
        # Treat both separator styles alike and keep only the last component.
        base = name.replace("\\", "/").rsplit("/", 1)[-1]
        base = "".join(ch for ch in base if ch.isprintable()).strip()
        if base in ("", ".", ".."):
            return fallback
        return base

    async def fetch_page(self, url: str) -> str:
        """Fetch *url* and return the response body as text.

        Naver Blog post URLs of the form ``blog.naver.com/{blogId}/{logNo}``
        are rewritten to the ``PostView.naver`` URL first, because the plain
        post URL only serves an iframe wrapper.

        Raises:
            Whatever ``httpx`` raises for transport failures, and for error
            status codes via ``raise_for_status()``.
        """
        headers = {
            "User-Agent": self._USER_AGENT,
            "Accept-Language": "ko-KR,ko;q=0.9",
        }

        # Naver Blog iframe bypass: blog.naver.com/{blogId}/{logNo} -> PostView URL.
        if "blog.naver.com" in url and "PostView" not in url:
            m = re.search(r'blog\.naver\.com/([^/]+)/(\d+)', url)
            if m:
                url = f"https://blog.naver.com/PostView.naver?blogId={m.group(1)}&logNo={m.group(2)}"

        async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
            resp = await client.get(url, headers=headers)
            resp.raise_for_status()
            return resp.text

    async def find_subtitles(self, url: str) -> "list[SubtitleFile]":
        """Fetch *url* and return every subtitle link detected in the page.

        The platform parsers run for each host name that occurs in the HTML
        (Google Drive, Tistory/Kakao CDN, Naver). Afterwards a generic pass
        adds any ``href`` ending in ``.ass``, ``.srt``, ``.ssa``, ``.sub``,
        ``.zip`` or ``.7z`` whose URL has not been reported yet, tagged with
        ``platform="generic"``.

        Returns:
            The detected files; download URLs are unique within the generic
            pass and against the platform results.
        """
        from urllib.parse import urljoin

        html = await self.fetch_page(url)
        results = []

        # Run each platform parser whose host appears in the page.
        if "drive.google.com" in html:
            results.extend(parse_google_drive_links(html))
        if "blog.kakaocdn.net" in html:
            results.extend(parse_tistory_links(html))
        if "download.blog.naver.com" in html or "blogfiles.pstatic.net" in html:
            results.extend(parse_naver_links(html))

        # Generic pass: direct links to subtitle/archive files.
        generic_pattern = r'href="([^"]+\.(?:ass|srt|ssa|sub|zip|7z))"'
        seen_urls = {r.download_url for r in results}
        for href in re.findall(generic_pattern, html, re.IGNORECASE):
            # Relative and protocol-relative hrefs cannot be requested as-is;
            # resolve them against the page URL that was asked for. Absolute
            # URLs are left untouched so they compare equal to parser output.
            gurl = href if "://" in href else urljoin(url, href)
            if gurl in seen_urls:
                continue
            seen_urls.add(gurl)
            # Decode before episode detection so percent-encoded text such as
            # the Korean episode counter can be recognised.
            filename = unquote(gurl.split("/")[-1].split("?")[0])
            results.append(SubtitleFile(
                filename=filename,
                download_url=gurl,
                platform="generic",
                episode=_extract_episode_from_text(filename),
            ))

        logger.info("자막 %d건 발견: %s", len(results), url)
        return results

    async def download_file(
        self,
        sub: "SubtitleFile",
        save_dir: Optional[str] = None,
    ) -> str:
        """Download *sub* and save it locally; return the saved path.

        The file is written into *save_dir* when given, otherwise into
        ``self.download_dir``; the directory is created if needed. When the
        response carries a ``Content-Disposition`` file name, it replaces
        ``sub.filename``. The name is reduced to a bare file name before it
        is joined to the directory, so it cannot point outside of it.
        ``sub.local_path`` is set to the saved path.

        Raises:
            Whatever ``httpx`` raises for transport failures, and for error
            status codes via ``raise_for_status()``.
        """
        target_dir = Path(save_dir) if save_dir else self.download_dir
        target_dir.mkdir(parents=True, exist_ok=True)

        headers = {"User-Agent": self._USER_AGENT}
        # Naver downloads are requested with a blog Referer header.
        if sub.platform == "naver":
            headers["Referer"] = "https://blog.naver.com/"

        async with httpx.AsyncClient(
            timeout=60, follow_redirects=True, max_redirects=5
        ) as client:
            resp = await client.get(sub.download_url, headers=headers)
            resp.raise_for_status()

            # Prefer the real file name announced by the server. The match is
            # case-insensitive so a lower-case "utf-8''" prefix is skipped too.
            cd = resp.headers.get("content-disposition", "")
            m = re.search(
                r'filename[*]?=["\']?(?:UTF-8\'\')?([^"\';\n]+)',
                cd,
                re.IGNORECASE,
            )
            if m:
                sub.filename = self._safe_filename(unquote(m.group(1).strip()))

            # sub.filename may also stem from link text; never trust it as a path.
            filepath = target_dir / self._safe_filename(sub.filename)
            filepath.write_bytes(resp.content)

        sub.local_path = str(filepath)
        logger.info("자막 다운로드 완료: %s", filepath)
        return str(filepath)