fix(anime): 파이프라인 5건 수정 — 에피소드 정규식(v2/S01E), 릴리스 그룹 필터, 자막 보호, 배치 다운로드, 타임아웃

This commit is contained in:
2026-03-15 08:27:08 +09:00
parent 63818999d9
commit 9f74812710
40 changed files with 2759 additions and 815 deletions

View File

@@ -51,20 +51,16 @@ def _extract_episode_from_text(text: str) -> Optional[int]:
def parse_google_drive_links(html: str) -> list[SubtitleFile]:
"""HTML에서 Google Drive 다운로드 링크 추출.
패턴: drive.google.com/file/d/{fileId}/view
→ 직접 다운로드: drive.google.com/uc?id={fileId}&export=download
지원 패턴:
1. drive.google.com/file/d/{fileId}/view
2. drive.google.com/uc?id={fileId}&export=download (직접 다운로드)
"""
pattern = r'https://drive\.google\.com/file/d/([a-zA-Z0-9_-]+)/view[^"]*'
matches = re.findall(pattern, html)
# 링크 주변 텍스트에서 에피소드 정보 추출
link_pattern = r'<a[^>]*href="(https://drive\.google\.com/file/d/[^"]+)"[^>]*>([^<]*)</a>'
link_matches = re.findall(link_pattern, html)
results = []
seen_ids = set()
for url, text in link_matches:
# 패턴 1: /file/d/{id}/view — HTML <a> 태그
link_pattern = r'<a[^>]*href="(https://drive\.google\.com/file/d/[^"]+)"[^>]*>([^<]*)</a>'
for url, text in re.findall(link_pattern, html):
m = re.search(r'/d/([a-zA-Z0-9_-]+)/', url)
if not m:
continue
@@ -72,19 +68,15 @@ def parse_google_drive_links(html: str) -> list[SubtitleFile]:
if file_id in seen_ids:
continue
seen_ids.add(file_id)
episode = _extract_episode_from_text(text)
download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
results.append(SubtitleFile(
filename=text.strip() or f"subtitle_{file_id}",
download_url=download_url,
download_url=f"https://drive.google.com/uc?id={file_id}&export=download",
platform="google_drive",
episode=episode,
episode=_extract_episode_from_text(text),
))
# 매칭되지 않은 bare ID도 추가
for file_id in matches:
# 패턴 1: bare ID (태그 밖)
for file_id in re.findall(r'drive\.google\.com/file/d/([a-zA-Z0-9_-]+)/', html):
if file_id not in seen_ids:
seen_ids.add(file_id)
results.append(SubtitleFile(
@@ -93,6 +85,23 @@ def parse_google_drive_links(html: str) -> list[SubtitleFile]:
platform="google_drive",
))
# 패턴 2: uc?id={id} 직접 다운로드 URL (Blogspot 등)
uc_pattern = r'drive\.google\.com/uc\?[^"\s\)]*id=([a-zA-Z0-9_-]+)[^"\s\)]*'
for file_id in re.findall(uc_pattern, html):
if file_id in seen_ids:
continue
seen_ids.add(file_id)
# 주변 텍스트에서 파일명 추출 시도 (마크다운: [파일명](url))
md_pattern = r'\[([^\]]+)\]\([^)]*' + re.escape(file_id) + r'[^)]*\)'
md_match = re.search(md_pattern, html)
filename = md_match.group(1).strip() if md_match else f"subtitle_{file_id}"
results.append(SubtitleFile(
filename=filename,
download_url=f"https://drive.google.com/uc?id={file_id}&export=download",
platform="google_drive",
episode=_extract_episode_from_text(filename),
))
return results
@@ -205,20 +214,32 @@ class SubtitleDownloader:
if "download.blog.naver.com" in html or "blogfiles.pstatic.net" in html:
results.extend(parse_naver_links(html))
# 범용: 직접 자막 파일 링크 탐지
generic_pattern = r'href="([^"]+\.(?:ass|srt|ssa|sub|zip|7z))"'
generic = re.findall(generic_pattern, html, re.IGNORECASE)
# 범용: 직접 자막 파일 링크 탐지 (HTML href + 마크다운)
seen_urls = {r.download_url for r in results}
for gurl in generic:
# HTML <a href="...">
for gurl in re.findall(r'href="([^"]+\.(?:ass|srt|ssa|sub|zip|7z)(?:\?[^"]*)?)"', html, re.IGNORECASE):
if gurl not in seen_urls:
filename = gurl.split("/")[-1].split("?")[0]
seen_urls.add(gurl)
filename = unquote(gurl.split("/")[-1].split("?")[0])
results.append(SubtitleFile(
filename=unquote(filename),
filename=filename,
download_url=gurl,
platform="generic",
episode=_extract_episode_from_text(filename),
))
# 마크다운 [텍스트](url) — Blogspot 등
for text, gurl in re.findall(r'\[([^\]]+)\]\((https?://[^)]+\.(?:ass|srt|ssa|sub|zip|7z)[^)]*)\)', html, re.IGNORECASE):
if gurl not in seen_urls:
seen_urls.add(gurl)
results.append(SubtitleFile(
filename=text.strip(),
download_url=gurl,
platform="generic",
episode=_extract_episode_from_text(text),
))
logger.info(f"자막 {len(results)}건 발견: {url}")
return results
@@ -227,7 +248,7 @@ class SubtitleDownloader:
sub: SubtitleFile,
save_dir: Optional[str] = None,
) -> str:
"""자막 파일 다운로드 → 로컬 저장. 저장 경로 반환."""
"""자막 파일 다운로드 → 로컬 저장. ZIP이면 자동 해제. 저장 경로 반환."""
target_dir = Path(save_dir) if save_dir else self.download_dir
target_dir.mkdir(parents=True, exist_ok=True)
@@ -248,13 +269,55 @@ class SubtitleDownloader:
# Content-Disposition에서 실제 파일명 추출
cd = resp.headers.get("content-disposition", "")
if "filename" in cd:
m = re.search(r'filename[*]?=["\']?(?:UTF-8\'\')?([^"\';\n]+)', cd)
m = re.search(r'filename[*]?=["\']?(?:UTF-8\'\')?([^"\';\\n]+)', cd)
if m:
sub.filename = unquote(m.group(1).strip())
filepath = target_dir / sub.filename
filepath.write_bytes(resp.content)
# ZIP/7z 자동 해제
extracted = self._extract_archive(filepath, target_dir)
if extracted:
sub.local_path = extracted[0] # 첫 번째 자막 파일
sub.filename = Path(extracted[0]).name
logger.info(f"자막 ZIP 해제 완료: {len(extracted)}건 → {target_dir}")
return extracted[0]
sub.local_path = str(filepath)
logger.info(f"자막 다운로드 완료: {filepath}")
return str(filepath)
@staticmethod
def _extract_archive(filepath: Path, target_dir: Path) -> list[str]:
    """Unpack subtitle files from a downloaded archive.

    Only members ending in .ass/.srt/.ssa/.sub are extracted. Folder
    structure inside the archive is discarded: every file is written
    directly into ``target_dir``.

    Args:
        filepath: Path of the downloaded file.
        target_dir: Existing directory that receives the extracted files.

    Returns:
        Paths (as strings) of the extracted subtitle files, in archive
        order. An empty list is returned when ``filepath`` is not a ZIP,
        when the ZIP holds no subtitle file, or when it could not be read.
        ``.7z`` files are recognised but not unpacked here (no handler is
        implemented), so they also yield an empty list.

    The source archive is deleted only after at least one subtitle was
    extracted; otherwise it is left in place so that a caller falling back
    to the archive path still points at an existing file.
    """
    import shutil
    import zipfile

    subtitle_exts = (".ass", ".srt", ".ssa", ".sub")

    # ".7z" is a known archive suffix but there is no extractor for it,
    # so everything except ".zip" ends up with "nothing extracted".
    if filepath.suffix.lower() != ".zip":
        return []

    extracted: list[str] = []
    used_names: set[str] = set()
    try:
        with zipfile.ZipFile(filepath, "r") as zf:
            for info in zf.infolist():
                if info.is_dir():
                    continue
                member = Path(info.filename)
                if member.suffix.lower() not in subtitle_exts:
                    continue

                # Using only the basename drops nested folders and also
                # keeps "../" style member names from escaping target_dir.
                out_name = member.name
                counter = 1
                # Flattening can make two members collide (a/x.ass and
                # b/x.ass); give later ones a numeric suffix instead of
                # silently overwriting the earlier file.
                while out_name in used_names:
                    out_name = f"{member.stem}_{counter}{member.suffix}"
                    counter += 1
                used_names.add(out_name)

                out_path = target_dir / out_name
                with zf.open(info) as src, open(out_path, "wb") as dst:
                    # Stream the member instead of loading it into memory.
                    shutil.copyfileobj(src, dst)
                extracted.append(str(out_path))

        # Remove the archive only once it is closed and only if it actually
        # produced subtitle files.
        if extracted:
            filepath.unlink(missing_ok=True)
    except Exception as e:
        # Best-effort: a corrupt/encrypted/unreadable archive must not
        # abort the download; report it and return what was extracted.
        logger.warning(f"ZIP 해제 실패: {filepath} - {e}")
    return extracted