# guitar_score/youtube_tab_to_pdf.py

#!/usr/bin/env python3
"""
YouTube Tab → PDF 캡처 파이프라인
YouTube 기타 TAB 영상에서 Tab 프레임을 추출하여 깔끔한 A4 PDF 악보로 만듭니다.

사용법:
    python youtube_tab_to_pdf.py "https://youtu.be/VIDEO_ID"
    python youtube_tab_to_pdf.py "https://youtu.be/VIDEO_ID" -o output.pdf --debug
"""

import argparse
import os
import sys
import subprocess
import shutil
import re
from pathlib import Path
from typing import List, Tuple, Optional

import cv2
import numpy as np
import img2pdf
from PIL import Image

# Module-level cache for the EasyOCR reader; populated on first successful load.
_ocr_reader = None


def _get_ocr_reader():
    """Return the shared EasyOCR reader, creating it on first use.

    The reader is built for English (``['en']``) and cached in the
    module-level ``_ocr_reader``. ``easyocr`` is imported lazily so the rest
    of the script works without it.

    Returns:
        The cached reader, or ``None`` when an ``ImportError`` is raised while
        importing ``easyocr`` or constructing the reader (a warning is printed
        and nothing is cached, so the next call retries).
    """
    global _ocr_reader
    if _ocr_reader is not None:
        return _ocr_reader

    print("  → EasyOCR 모델 로딩 중 (초회 1번)...")
    try:
        import easyocr
        _ocr_reader = easyocr.Reader(['en'])
    except ImportError:
        print("  [경고] easyocr 라이브러리가 없습니다. OCR 중복 검증을 건너뜁니다.")
        return None
    return _ocr_reader

def _dedup_by_measure_number(frames: List[np.ndarray]) -> List[np.ndarray]:
    """Drop consecutive frames whose OCR-read measure number repeats.

    For every frame a small patch in the left 8% of the image is OCR'd for
    digits. The patch sits just above the first detected staff line (a dark
    row spanning more than half the width); when no such line is found the
    top 25% of the frame is used instead. A frame is discarded only when a
    number was recognised and it equals the last recognised number; frames
    without a recognised number are always kept.

    Args:
        frames: Tab images (BGR or single-channel).

    Returns:
        The filtered list, or ``frames`` unchanged when no OCR reader is
        available.
    """
    ocr = _get_ocr_reader()
    if not ocr:
        return frames

    print(f"  → 마디번호 기반 3차 중복 검증 시작 ({len(frames)} 프레임)")
    kept = []
    previous_label = None

    for position, image in enumerate(frames, start=1):
        height, width = image.shape[:2]
        if len(image.shape) == 3:
            mono = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            mono = image
        # Only the left 8% is inspected so chord diagrams are excluded.
        left_edge = int(width * 0.08)

        # Row projection of dark pixels: a row that is dark across more than
        # half of the width is treated as a staff line.
        _, ink = cv2.threshold(mono, 200, 255, cv2.THRESH_BINARY_INV)
        ink_per_row = np.sum(ink, axis=1) / 255
        line_rows = np.where(ink_per_row > width * 0.5)[0]

        if len(line_rows) > 0:
            first_line = line_rows[0]
            # From 45 px above the first staff line down to 2 px above it.
            patch_top = max(0, first_line - 45)
            patch_bottom = max(10, first_line - 2)
            patch = mono[patch_top:patch_bottom, :left_edge]
        else:
            # Fallback when no staff line was found: fixed top quarter.
            patch = mono[:int(height * 0.25), :left_edge]

        # Enlarge 3x and binarise before OCR (the digits are small).
        enlarged = cv2.resize(patch, (0, 0), fx=3.0, fy=3.0, interpolation=cv2.INTER_CUBIC)
        _, enlarged_ink = cv2.threshold(enlarged, 150, 255, cv2.THRESH_BINARY_INV)
        detections = ocr.readtext(enlarged_ink, allowlist='0123456789')

        # First detection with confidence > 0.4 that is a 1-3 digit number.
        label = next(
            (hit[1] for hit in detections
             if hit[2] > 0.4 and hit[1].isdigit() and len(hit[1]) <= 3),
            None,
        )

        if label is None:
            print(f"    - 프레임 {position}: 마디번호 미검출 (유지)")
        elif label == previous_label:
            print(f"    - 프레임 {position}: 마디번호 [{label}] 중복 감지 (삭제)")
            continue
        else:
            previous_label = label
            print(f"    - 프레임 {position}: 마디번호 [{label}] (유지)")

        kept.append(image)

    print(f"  → OCR 3차: {len(kept)}개 고유 Tab 프레임")
    return kept

# Windows console encoding: switch stdout/stderr to UTF-8 (replacing
# unencodable characters) so the Korean status messages printed by this
# script do not raise encoding errors on win32.
if sys.platform == "win32":
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
    sys.stderr.reconfigure(encoding="utf-8", errors="replace")


# ─── 설정 ─────────────────────────────────────────────────────────────────

# Default sampling rate (frames per second) used by extract_frames / --fps.
DEFAULT_FPS = 2
# Default similarity threshold for scroll-style de-duplication.
SIMILARITY_THRESHOLD = 0.95
# Default similarity threshold for overlay-style de-duplication.
OVERLAY_SIMILARITY_THRESHOLD = 0.55

# Bounding-box area of an overlay candidate, as a fraction of the frame area,
# must lie strictly between these two values (see _detect_tab_overlay).
OVERLAY_MIN_AREA_RATIO = 0.05
OVERLAY_MAX_AREA_RATIO = 0.6
# NOTE(review): not referenced in the visible code — confirm whether it is still needed.
MIN_TAB_LINES = 4

# Maximum frame width kept by extract_frames; wider frames are downscaled to
# this width to save memory.
MAX_FRAME_WIDTH = 1280
# Upscale width for detection (original note: 360p → 960 px, 1.5x, makes tab
# lines thicker). NOTE(review): not referenced in the visible code.
DETECT_WIDTH = 960

# PDF layout: resolution and A4 page geometry in millimetres.
PDF_DPI = 150
PDF_PAGE_WIDTH_MM = 210
PDF_PAGE_HEIGHT_MM = 297
PDF_MARGIN_MM = 10
# Vertical gap between consecutive tab rows on a page.
TAB_GAP_MM = 3


# ─── Step 1: 다운로드 ─────────────────────────────────────────────────────

def _find_yt_dlp() -> str:
    yt_dlp = shutil.which("yt-dlp")
    if yt_dlp:
        return yt_dlp
    for pyver in ["Python312", "Python311", "Python310"]:
        p = Path(os.environ.get("APPDATA", "")) / "Python" / pyver / "Scripts" / "yt-dlp.exe"
        if p.exists():
            return str(p)
    p = Path(sys.executable).parent / "Scripts" / "yt-dlp.exe"
    if p.exists():
        return str(p)
    raise RuntimeError("yt-dlp를 찾을 수 없습니다. pip install yt-dlp")


def download_video(url: str, output_dir: Path) -> Tuple[Path, str]:
    """Download the video at *url* into *output_dir* as an MP4 (720p or lower preferred).

    The title is queried with ``yt-dlp --get-title``; characters that are
    invalid in file names are replaced with ``_`` and the result is cut to 80
    characters. When the title query yields no output, ``"untitled"`` is
    used. If ``<title>.mp4`` already exists the download is skipped.

    Args:
        url: Video URL handed to yt-dlp.
        output_dir: Directory that receives the MP4 file.

    Returns:
        ``(video_path, safe_title)``.

    Raises:
        RuntimeError: If yt-dlp cannot be located.
        subprocess.CalledProcessError: If the download command exits non-zero.
    """
    print("[1/5] 영상 다운로드 중...")
    yt_dlp = _find_yt_dlp()

    # The exit status of the title query is not checked; an empty result
    # simply falls back to "untitled".
    title_query = subprocess.run(
        [yt_dlp, "--get-title", "--encoding", "utf-8", url],
        capture_output=True, encoding="utf-8", errors="replace"
    )
    raw_title = (title_query.stdout or "").strip()
    if not raw_title:
        raw_title = "untitled"
    safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', raw_title)[:80]
    video_path = output_dir / f"{safe_title}.mp4"

    if video_path.exists():
        print(f"  → 이미 다운로드됨: {video_path.name}")
        return video_path, safe_title

    # Prefer streams of at most 720p so no heavy downscaling is needed later.
    format_selector = ("bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/"
                       "best[height<=720]/best")
    download_cmd = [
        yt_dlp,
        "-f", format_selector,
        "--merge-output-format", "mp4",
        "-o", str(video_path), url,
    ]
    subprocess.run(download_cmd, encoding="utf-8", errors="replace", check=True)
    print(f"  → 다운로드 완료: {video_path.name}")
    return video_path, safe_title


# ─── Step 2: 프레임 추출 ──────────────────────────────────────────────────

def extract_frames(video_path: Path, fps: float = DEFAULT_FPS) -> List[np.ndarray]:
    """Sample frames from a video at roughly *fps* frames per second.

    Every ``interval``-th decoded frame is kept, where ``interval`` is the
    container frame rate divided by *fps* (at least 1). Frames wider than
    ``MAX_FRAME_WIDTH`` are downscaled to that width, keeping the aspect
    ratio, to limit memory use.

    Args:
        video_path: Path of the video file to open.
        fps: Target sampling rate; must be greater than 0.

    Returns:
        The sampled frames in playback order (as returned by OpenCV).

    Raises:
        ValueError: If *fps* is not greater than 0.
        RuntimeError: If the video cannot be opened.
    """
    # `not fps > 0` also rejects NaN.
    if not fps > 0:
        raise ValueError(f"fps는 0보다 커야 합니다: {fps}")

    print(f"[2/5] 프레임 추출 중 (fps={fps})...")
    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            raise RuntimeError(f"영상을 열 수 없습니다: {video_path}")

        video_fps = cap.get(cv2.CAP_PROP_FPS)
        # Some containers report 0 or NaN; fall back to keeping every frame.
        if 0 < video_fps < float("inf"):
            interval = max(1, int(video_fps / fps))
        else:
            interval = 1
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Downscale anything wider than MAX_FRAME_WIDTH (guards against OOM).
        need_resize = w > MAX_FRAME_WIDTH
        target_size = None
        if need_resize:
            scale = MAX_FRAME_WIDTH / w
            target_size = (MAX_FRAME_WIDTH, int(h * scale))
            print(f"  → {w}x{h} → {target_size[0]}x{target_size[1]} 다운스케일")

        frames = []
        idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if idx % interval == 0:
                if need_resize:
                    frame = cv2.resize(frame, target_size, interpolation=cv2.INTER_AREA)
                frames.append(frame)
                if len(frames) % 50 == 0:
                    print(f"    ... {len(frames)}번째 프레임 추출 진행 중...", flush=True)
            idx += 1
    finally:
        # Always release the capture handle, even if decoding raised.
        cap.release()

    print(f"  → {len(frames)}개 프레임 추출 ({w}x{h}, 원본 {video_fps:.0f}fps)")
    return frames


# ─── 핵심: 흰색 배경 Tab 영역 검출 ───────────────────────────────────────

def _find_white_tab_strip(frame: np.ndarray, min_strip_ratio: float = 0.10) -> Optional[Tuple[int, int]]:
    """Return the Y range ``(top, bottom)`` of the tallest white tab strip.

    Strategy: in HSV space a pixel is "tab-like" when it is pure white
    (V > 180, S < 40) or a bright pastel (V > 200, S < 100, e.g. a highlight
    box). Only the central 80% of the width is inspected. A row is bright
    when more than half of its inspected pixels are tab-like. Runs of bright
    rows are merged across gaps of up to 2% of the frame height (dark tab
    lines and notes), and runs shorter than ``min_strip_ratio`` of the frame
    height are discarded.

    Args:
        frame: BGR image.
        min_strip_ratio: Minimum strip height as a fraction of frame height.

    Returns:
        ``(top, bottom)`` of the tallest strip, with ``bottom`` exclusive and
        both ends padded by 3% of the frame height (clamped to the frame),
        or ``None`` when no strip qualifies.
    """
    h, w = frame.shape[:2]
    margin_x = int(w * 0.1)

    # Saturation and value are used together so coloured backgrounds are rejected.
    hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
    _, s_ch, v_ch = cv2.split(hsv)
    roi_v = v_ch[:, margin_x:w - margin_x]
    roi_s = s_ch[:, margin_x:w - margin_x]

    pure_white = (roi_v > 180) & (roi_s < 40)
    bright_pastel = (roi_v > 200) & (roi_s < 100)
    row_tab_ratio = np.mean(pure_white | bright_pastel, axis=1)
    bright_mask = row_tab_ratio > 0.5

    max_gap = int(h * 0.02)
    min_len = h * min_strip_ratio
    regions = []
    start = None
    last_bright = -1
    for i in range(h):
        if bright_mask[i]:
            if start is None:
                start = i
            last_bright = i
        elif start is not None and i - last_bright > max_gap:
            # Close the run. The end is exclusive (last bright row + 1) so the
            # final bright row is kept, matching the end-of-frame case below.
            end = last_bright + 1
            if end - start >= min_len:
                regions.append((start, end))
            start = None
    if start is not None:
        end = last_bright + 1
        if end - start >= min_len:
            regions.append((start, end))

    if not regions:
        return None

    # Tallest strip wins; pad slightly so the bottom is not cut off.
    best = max(regions, key=lambda r: r[1] - r[0])
    pad = int(h * 0.03)
    top = max(0, best[0] - pad)
    bottom = min(h, best[1] + pad)
    return (top, bottom)


def _trim_to_content(crop: np.ndarray, margin_px: int = 6) -> np.ndarray:
    """넓게 크롭된 Tab 이미지에서 Tab 콘텐츠 영역만 정밀 트림.

    전략: HSV 기반으로 각 행의 '흰색 배경 비율'을 계산.
    - Tab 영역: 30~95%가 흰색 (흰 배경 + Tab 라인/숫자)
    - 기타 영상: 흰색 < 20% (어두운 배경)
    - 순수 여백: 흰색 > 97%
    이를 통해 상/하단의 기타 영상과 빈 여백 모두 제거."""
    h, w = crop.shape[:2]
    if h < 15 or w < 50:
        return crop

    hsv = cv2.cvtColor(crop, cv2.COLOR_BGR2HSV)
    _, s_ch, v_ch = cv2.split(hsv)

    # 흰색/밝은 파스텔 픽셀 비율 (Tab 배경 감지)
    white_mask = ((v_ch > 180) & (s_ch < 40)) | ((v_ch > 200) & (s_ch < 100))
    row_white = np.mean(white_mask, axis=1)

    # Tab 행 = 흰색 비율 30~97% (라인/숫자 + 흰 배경)
    tab_rows = (row_white > 0.30) & (row_white < 0.97)

    # 콘텐츠 존재 확인 (어두운 픽셀 > 0.2%) - 마디번호 같이 아주 작은 숫자도 보존하기 위해 스레스홀드 극단적 하향
    gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) if len(crop.shape) == 3 else crop
    row_dark = np.mean(gray < 180, axis=1)
    content_rows = row_dark > 0.002

    # Tab 행 OR 콘텐츠 행
    valid_rows = tab_rows | content_rows

    # 상단: 첫 번째 유효 행
    top = 0
    for i in range(h):
        if valid_rows[i] and row_white[i] > 0.20:
            top = max(0, i - 120)  # 상단 마디번호 보존을 위해 압도적인 120px 강제 보호 (숫자가 꽤 높이 떠있음)
            break

    # 하단: 마지막 유효 행
    bottom = h
    for i in range(h - 1, -1, -1):
        if valid_rows[i] and row_white[i] > 0.20:
            bottom = min(h, i + margin_px)
            break

    if bottom - top < 15:
        return crop

    return crop[top:bottom, :]


def _has_tab_content(region: np.ndarray) -> bool:
    """흰색 영역 내에 실제 Tab 내용이 있는지 검증.
    방법: 흰색 배경 위의 어두운 픽셀(Tab 라인, 숫자, 코드명) 비율을 확인.
    Tab 영역은 일반적으로 3~25%의 어두운 콘텐츠를 포함."""
    if region is None or region.size == 0:
        return False

    gray = cv2.cvtColor(region, cv2.COLOR_BGR2GRAY) if len(region.shape) == 3 else region
    h, w = gray.shape
    if h < 15 or w < 50:
        return False

    # 어두운 픽셀 비율 (< 180 = 라인/숫자/코드 등)
    dark_pixels = np.sum(gray < 180)
    dark_ratio = dark_pixels / gray.size

    # Tab 영역: 3~25%가 어두운 콘텐츠 (순수 흰 배경이면 < 1%, 기타 영상이면 > 30%)
    return 0.02 < dark_ratio < 0.30


# ─── Step 3: 패턴 감지 ────────────────────────────────────────────────────

def _detect_tab_overlay(frame: np.ndarray) -> Optional[Tuple[int, int, int, int]]:
    """Find the largest bright overlay box that contains tab notation.

    The frame is thresholded at gray level 220, closed with a 15x15
    rectangular kernel, and the external contours are examined. A contour's
    bounding box qualifies when its area share of the frame lies strictly
    between ``OVERLAY_MIN_AREA_RATIO`` and ``OVERLAY_MAX_AREA_RATIO``, it is
    narrower than 85% of the frame (full-width strips are treated as scroll
    layout), its width exceeds half its height, and ``_has_tab_content``
    accepts the pixels inside it.

    Args:
        frame: BGR image.

    Returns:
        ``(x, y, width, height)`` of the largest qualifying box, or ``None``.
    """
    mono = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    frame_h, frame_w = mono.shape

    _, bright = cv2.threshold(mono, 220, 255, cv2.THRESH_BINARY)
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15))
    filled = cv2.morphologyEx(bright, cv2.MORPH_CLOSE, closing_kernel)
    contours, _ = cv2.findContours(filled, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    frame_area = frame_h * frame_w
    winner = None
    winner_area = 0

    for contour in contours:
        x, y, box_w, box_h = cv2.boundingRect(contour)
        box_area = box_w * box_h
        area_share = box_area / frame_area

        if not (OVERLAY_MIN_AREA_RATIO < area_share < OVERLAY_MAX_AREA_RATIO):
            continue
        if not (box_w / frame_w < 0.85):
            continue
        if not (box_w > box_h * 0.5):
            continue
        if not (box_area > winner_area):
            continue

        # Only boxes that actually hold tab-like content may win.
        if _has_tab_content(frame[y:y + box_h, x:x + box_w]):
            winner = (x, y, box_w, box_h)
            winner_area = box_area

    return winner


def detect_pattern(frames: List[np.ndarray], sample_count: int = 20) -> str:
    """Classify the video layout as ``"scroll"`` (preferred) or ``"overlay"``.

    Up to ``sample_count`` evenly spaced frames are inspected:

    1. If a white tab strip is found in at least 60% of them → ``"scroll"``.
    2. Otherwise, if an overlay box is found in more than 20% → ``"overlay"``.
    3. Otherwise ``"scroll"`` is returned as the fallback.

    Args:
        frames: Sampled video frames (BGR).
        sample_count: Maximum number of frames to inspect.

    Returns:
        ``"scroll"`` or ``"overlay"``. With no frames (or a non-positive
        ``sample_count``) the fallback ``"scroll"`` is returned instead of
        failing on a division by zero.
    """
    print("[3/5] 영상 패턴 분석 중...")

    sample_count = min(sample_count, len(frames))
    if sample_count <= 0:
        # Nothing to sample: use the default pattern rather than dividing by zero.
        print("  → 패턴: scroll (fallback, 샘플 프레임 없음)")
        return "scroll"

    indices = np.linspace(0, len(frames) - 1, sample_count, dtype=int)
    sample_frames = [frames[i] for i in indices]

    # 1) White tab strip (scroll layout) is checked first.
    tab_top_count = 0
    tab_bottom_count = 0
    for f in sample_frames:
        strip = _find_white_tab_strip(f)
        if strip is None:
            continue
        top, bottom = strip
        # Track whether the strip centre sits in the upper or lower half.
        if (top + bottom) / 2 < f.shape[0] * 0.5:
            tab_top_count += 1
        else:
            tab_bottom_count += 1

    tab_ratio = (tab_top_count + tab_bottom_count) / sample_count
    position = "상단" if tab_top_count > tab_bottom_count else "하단"

    if tab_ratio >= 0.6:
        print(f"  → 패턴: scroll (Tab {position}, 감지율: {tab_ratio:.0%})")
        return "scroll"

    # 2) Low strip detection rate: look for overlay boxes instead.
    overlay_count = sum(1 for f in sample_frames if _detect_tab_overlay(f) is not None)
    overlay_ratio = overlay_count / sample_count
    if overlay_ratio > 0.2:
        print(f"  → 패턴: overlay (감지율: {overlay_ratio:.0%})")
        return "overlay"

    # 3) Neither matched: default to scroll.
    print(f"  → 패턴: scroll (fallback, Tab {position}, 감지율: {tab_ratio:.0%})")
    return "scroll"


# ─── Step 4: 고유 Tab 프레임 추출 ─────────────────────────────────────────

def compare_frames(frame1: np.ndarray, frame2: np.ndarray) -> float:
    """Return an MSE-based similarity in [0, 1], where 1 means identical.

    Both images are converted to grayscale, ``frame2`` is resized to
    ``frame1``'s size if they differ, and both are shrunk to 480 px width
    when wider. The mean squared error of the 0-1 normalised pixels is
    multiplied by 8, capped at 1, and subtracted from 1.

    Args:
        frame1: BGR or single-channel image.
        frame2: BGR or single-channel image.

    Returns:
        Similarity score between 0.0 and 1.0.
    """
    def _to_gray(image):
        if len(image.shape) == 3:
            return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        return image

    first = _to_gray(frame1)
    second = _to_gray(frame2)

    if first.shape != second.shape:
        second = cv2.resize(second, (first.shape[1], first.shape[0]))

    compare_w = 480
    if first.shape[1] > compare_w:
        shrink = compare_w / first.shape[1]
        small_size = (compare_w, int(first.shape[0] * shrink))
        first = cv2.resize(first, small_size)
        second = cv2.resize(second, small_size)

    delta = (first.astype(np.float32) - second.astype(np.float32)) / 255.0
    mse = np.mean(delta ** 2)
    penalty = min(mse * 8.0, 1.0)
    return max(0.0, 1.0 - penalty)


def _dhash(image: np.ndarray, hash_size: int = 32) -> np.ndarray:
    """Compute a difference hash (dHash) of *image*.

    The image is converted to grayscale and resized to
    ``(hash_size + 1) x hash_size`` (width x height); each bit records
    whether a pixel is brighter than its left neighbour.

    Args:
        image: BGR or single-channel image.
        hash_size: Side length of the bit grid (default 32 → 1024 bits).

    Returns:
        Flat boolean array of ``hash_size * hash_size`` bits.
    """
    if len(image.shape) == 3:
        mono = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        mono = image
    shrunk = cv2.resize(mono, (hash_size + 1, hash_size), interpolation=cv2.INTER_AREA)
    left_cols = shrunk[:, :-1]
    right_cols = shrunk[:, 1:]
    return (right_cols > left_cols).flatten()


def _dedup_by_hash(frames: List[np.ndarray],
                   max_hamming: int = 20) -> List[np.ndarray]:
    """pHash 기반 클러스터 중복 제거.
    유사 프레임을 그룹핑하고, 각 그룹에서 가장 선명한(Laplacian 분산 최대) 1장만 선택.
    → 스크롤 중복 + 반복 연습 구간 모두 제거."""
    if not frames:
        return []

    hashes = [_dhash(f) for f in frames]
    n = len(frames)
    used = [False] * n
    clusters = []

    for i in range(n):
        if used[i]:
            continue
        cluster = [i]
        used[i] = True
        for j in range(i + 1, n):
            if used[j]:
                continue
            dist = int(np.sum(hashes[i] != hashes[j]))
            if dist <= max_hamming:
                cluster.append(j)
                used[j] = True
        clusters.append(cluster)

    # 각 클러스터에서 최고 선명도 프레임 선택
    result = []
    for cluster in clusters:
        best_idx = max(cluster, key=lambda idx: cv2.Laplacian(
            cv2.cvtColor(frames[idx], cv2.COLOR_BGR2GRAY)
            if len(frames[idx].shape) == 3 else frames[idx],
            cv2.CV_64F).var())
        result.append(frames[best_idx])

    return result


def _extract_print_channel(frame: np.ndarray) -> np.ndarray:
    """PDF 출력용 채널 (Red 채널): 노란색을 투명(White)하게 만듦"""
    if len(frame.shape) != 3: return frame
    return frame[:, :, 2]

def _extract_tracking_channel(frame: np.ndarray) -> np.ndarray:
    """트래킹 전용 채널 (Blue 채널): 노란색을 거대한 검은색 마커로 만들어 반복적인 마디점프 시각적 오류를 영구차단"""
    if len(frame.shape) != 3: return frame
    return frame[:, :, 0]

def _detect_scroll_offset(frame_a: np.ndarray, frame_b: np.ndarray, min_confidence: float = 0.1) -> Tuple[int, float]:
    """Estimate the horizontal scroll between a previous frame (A) and the current one (B).

    The right half of A's tracking channel is searched for in B's tracking
    channel with normalised cross-correlation. The scroll is the distance
    the template moved to the left.

    Args:
        frame_a: Previous frame.
        frame_b: Current frame.
        min_confidence: Minimum match score for the scroll to be reported.

    Returns:
        ``(scroll_px, score)``; ``scroll_px`` is 0 when the score is below
        ``min_confidence`` or the computed shift is not positive.
    """
    width = frame_a.shape[1]
    template_w = int(width * 0.5)
    template_x = width - template_w

    needle = _extract_tracking_channel(frame_a)[:, template_x:]
    haystack = _extract_tracking_channel(frame_b)

    scores = cv2.matchTemplate(haystack, needle, cv2.TM_CCOEFF_NORMED)
    _, best_score, _, best_loc = cv2.minMaxLoc(scores)

    shift = template_x - best_loc[0]
    if best_score < min_confidence or shift <= 0:
        shift = 0
    return (shift, best_score)

def _detect_measure_bars(gray_pano: np.ndarray) -> List[int]:
    """Return the X coordinates of bar lines that span the staff band.

    Staff rows are rows that are dark (gray <= 200) across more than half of
    the width. The band runs from the first staff row to the last staff row
    within 100 px below it. A column belongs to a bar line when at least 80%
    of the band height is dark; bar-line columns less than 10 px apart are
    merged and reported by their truncated mean X.

    Args:
        gray_pano: Single-channel panorama image.

    Returns:
        Bar-line X coordinates in ascending order; empty when fewer than two
        staff rows are found or the band is less than 10 px tall.
    """
    _, ink = cv2.threshold(gray_pano, 200, 255, cv2.THRESH_BINARY_INV)
    height, width = ink.shape

    ink_per_row = np.sum(ink, axis=1) / 255
    line_rows = np.where(ink_per_row > width * 0.5)[0]
    if len(line_rows) < 2:
        return []

    band_top = line_rows[0]
    # line_rows is ascending, so the last row within 100 px closes the band.
    band_bottom = line_rows[line_rows - band_top <= 100][-1]
    band_height = band_bottom - band_top + 1
    if band_height < 10:
        return []

    band = ink[band_top:band_bottom + 1, :]
    ink_per_col = np.sum(band, axis=0) / 255
    bar_cols = np.where(ink_per_col >= band_height * 0.8)[0]
    if bar_cols.size == 0:
        return []

    # Start a new group wherever neighbouring columns are 10 px or more apart.
    group_starts = np.where(np.diff(bar_cols) >= 10)[0] + 1
    return [int(np.mean(group)) for group in np.split(bar_cols, group_starts)]

def _stamp_measure_number(measure_bgr: np.ndarray, num: int) -> np.ndarray:
    """Draw ``[num]`` in blue near the top-left corner of a measure image.

    The image is modified in place and also returned.

    Args:
        measure_bgr: BGR image of one measure.
        num: Sequential measure number to stamp.

    Returns:
        The same array object with the label drawn on it.
    """
    label = f"[{num}]"
    blue_bgr = (200, 0, 0)
    cv2.putText(measure_bgr, label, (15, 30), cv2.FONT_HERSHEY_SIMPLEX,
                0.7, blue_bgr, 2, cv2.LINE_AA)
    return measure_bgr

def _stitch_scroll_segment(segment: List[np.ndarray]) -> np.ndarray:
    if len(segment) == 1: return segment[0]
    min_h = min(f.shape[0] for f in segment)
    panorama = segment[0][:min_h, :]
    for i in range(1, len(segment)):
        curr = segment[i][:min_h, :]
        scroll_px, conf = _detect_scroll_offset(segment[i-1][:min_h, :], curr, min_confidence=0.1)
        if scroll_px > 0 and conf > 0.15:
            new_strip = curr[:, curr.shape[1] - scroll_px:]
            panorama = np.hstack([panorama, new_strip])
        else:
            panorama = np.hstack([panorama, curr])
    return panorama

def _merge_scroll_candidates(candidates: List[np.ndarray], min_scroll: int = 5, min_segment_len: int = 2) -> List[np.ndarray]:
    if len(candidates) <= 1: return candidates
    result = []
    current_segment = [candidates[0]]
    prev_s_px = 0
    prev_conf = 1.0

    for i in range(1, len(candidates)):
        prev_frame = candidates[i-1]
        curr_frame = candidates[i]
        s_px, conf = _detect_scroll_offset(prev_frame, curr_frame, min_confidence=0.1)

        # 씬 전환 조건: conf 폭락, 가속도>100, 노란마커 증발(>0.4)
        is_cut = (conf <= 0.15) or (abs(s_px - prev_s_px) > 100) or (prev_conf - conf > 0.4)

        if not is_cut:
            current_segment.append(curr_frame)
        else:
            if len(current_segment) >= min_segment_len:
                result.append(_stitch_scroll_segment(current_segment))
            else:
                result.extend(current_segment)
            current_segment = [curr_frame]

        prev_s_px = s_px
        prev_conf = conf

    if len(current_segment) >= min_segment_len:
        result.append(_stitch_scroll_segment(current_segment))
    else:
        result.extend(current_segment)

    return result

def merge_panoramas_list(panoramas):
    """Join successive panoramas that overlap with the tail of the previous one.

    The first up-to-800 columns of each incoming panorama are searched for in
    the last up-to-1500 columns of the running master (tracking channel,
    normalised cross-correlation). On a score above 0.60 only the part of the
    incoming panorama beyond the overlap is appended. Otherwise the master is
    emitted and the incoming panorama becomes the new master.

    The wide probe and short search window are deliberate: with repetitive
    accompaniment, a short probe or a long history could match an older
    identical passage and cause skipped measures.

    Args:
        panoramas: Panorama images in playback order.

    Returns:
        List of merged panoramas; empty for empty input.
    """
    if not panoramas:
        return []

    finished = []
    master = panoramas[0].copy()

    for incoming in panoramas[1:]:
        incoming = incoming.copy()

        probe_w = min(800, incoming.shape[1])
        window_w = min(1500, master.shape[1])
        probe = _extract_tracking_channel(incoming[:, :probe_w])
        window = _extract_tracking_channel(master[:, -window_w:])

        overlap = None
        if probe.shape[1] <= window.shape[1] and probe.shape[0] == window.shape[0]:
            scores = cv2.matchTemplate(window, probe, cv2.TM_CCOEFF_NORMED)
            _, best_score, _, best_loc = cv2.minMaxLoc(scores)
            if best_score > 0.60:
                # Columns of `incoming` already present at the end of master.
                overlap = window_w - best_loc[0]

        if overlap is None:
            finished.append(master)
            master = incoming
            continue

        if overlap < incoming.shape[1]:
            fresh = incoming[:, overlap:]
            if fresh.shape[1] > 0:
                master = np.hstack([master, fresh])

    finished.append(master)
    return finished

def _pad_row_to_width(row: np.ndarray, width: int) -> np.ndarray:
    """Right-pad a 3-channel *row* with white up to *width* columns (no-op if wide enough)."""
    missing = width - row.shape[1]
    if missing <= 0:
        return row
    filler = np.full((row.shape[0], missing, 3), 255, dtype=np.uint8)
    return np.hstack([row, filler])


def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
    """Build page-width tab rows from a scrolling-tab video.

    Steps:
      1. Find the white tab strip in every frame and use the median top and
         bottom as one fixed crop band.
      2. Crop each frame to that band, drop crops without tab content, and
         drop crops whose 480x120 thumbnail matches an earlier one with
         similarity >= ``threshold``.
      3. Stitch scrolling runs into panoramas and merge overlapping ones.
      4. Cut each panorama at detected bar lines, stamp a running measure
         number on every measure at least 50 px wide, and pack measures into
         rows no wider than one original frame. Panoramas without bar lines
         are cut into fixed frame-width chunks instead.

    Rows are emitted strictly in playback order: a partially filled row is
    flushed before the fixed-width chunks of a bar-less panorama are added.

    Args:
        frames: Sampled video frames (BGR).
        threshold: Similarity at or above which a crop is a duplicate.

    Returns:
        BGR row images, right-padded with white to the frame width; empty
        when no tab strip or no tab content is found.
    """
    print(f"[4/5] 스크롤형 Tab 추출 중 (threshold={threshold})...")

    strips = [s for s in (_find_white_tab_strip(f) for f in frames) if s]
    if not strips:
        return []
    median_top = int(np.median([s[0] for s in strips]))
    median_bottom = int(np.median([s[1] for s in strips]))

    candidates, seen_thumbs = [], []
    for frame in frames:
        tab_crop = frame[max(0, median_top):min(frame.shape[0], median_bottom), :]
        if not _has_tab_content(tab_crop):
            continue
        thumb = cv2.resize(tab_crop, (480, 120), interpolation=cv2.INTER_AREA)
        if any(compare_frames(thumb, ref) >= threshold for ref in seen_thumbs):
            continue
        candidates.append(tab_crop)
        seen_thumbs.append(thumb)

    if not candidates:
        return []

    panoramas = merge_panoramas_list(_merge_scroll_candidates(candidates))

    chunk_width = candidates[0].shape[1]
    final_chunks = []
    measure_counter = 1
    current_row = None

    for pano in panoramas:
        bar_coords = _detect_measure_bars(_extract_print_channel(pano))

        if not bar_coords:
            # Flush the pending row first so earlier measures stay ahead of
            # this panorama's chunks in the output.
            if current_row is not None:
                final_chunks.append(_pad_row_to_width(current_row, chunk_width))
                current_row = None
            for start_x in range(0, pano.shape[1], chunk_width):
                chunk = _pad_row_to_width(pano[:, start_x:start_x + chunk_width], chunk_width)
                final_chunks.append(
                    cv2.cvtColor(_extract_print_channel(chunk), cv2.COLOR_GRAY2BGR))
            continue

        edges = sorted({0, *bar_coords, pano.shape[1]})
        for x_start, x_end in zip(edges, edges[1:]):
            # Slivers narrower than 50 px are not treated as measures.
            if x_end - x_start < 50:
                continue

            gray_measure = _extract_print_channel(pano[:, x_start:x_end])
            measure = cv2.cvtColor(gray_measure, cv2.COLOR_GRAY2BGR)
            measure = _stamp_measure_number(measure, measure_counter)
            measure_counter += 1

            if current_row is None:
                current_row = measure
            elif current_row.shape[1] + measure.shape[1] > chunk_width:
                # Row is full: emit it and start a new one with this measure.
                final_chunks.append(_pad_row_to_width(current_row, chunk_width))
                current_row = measure
            else:
                current_row = np.hstack([current_row, measure])

    if current_row is not None:
        final_chunks.append(_pad_row_to_width(current_row, chunk_width))

    return final_chunks

def extract_unique_overlay(frames: List[np.ndarray],
                           threshold: float = OVERLAY_SIMILARITY_THRESHOLD) -> List[np.ndarray]:
    """Collect distinct tab overlay boxes from overlay-style videos.

    For each frame the overlay box is detected, enlarged by 10 px per side
    (clamped to the frame), and skipped when it is smaller than 100x40
    before padding or its mean gray level is below 120. The crop is
    normalised to 480x180 and compared with every previously kept crop; it
    is kept only when no similarity reaches ``threshold``. The survivors are
    finally filtered by OCR-read measure number.

    Args:
        frames: Sampled video frames (BGR).
        threshold: Similarity at or above which a crop is a duplicate.

    Returns:
        Unique overlay crops in playback order.
    """
    print("[4/5] 오버레이형 Tab 추출 중...")

    kept_crops = []
    history = []
    margin = 10

    for frame in frames:
        box = _detect_tab_overlay(frame)
        if box is None:
            continue

        box_x, box_y, box_w, box_h = box
        if box_h < 40 or box_w < 100:
            continue

        left = max(0, box_x - margin)
        top = max(0, box_y - margin)
        width = min(frame.shape[1] - left, box_w + 2 * margin)
        height = min(frame.shape[0] - top, box_h + 2 * margin)
        crop = frame[top:top + height, left:left + width]

        # Brightness filter: dark crops are not tab overlays.
        if np.mean(cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)) < 120:
            continue

        # Normalise onto a fixed-size white canvas for comparison.
        thumb = cv2.resize(crop, (480, 180), interpolation=cv2.INTER_AREA)
        canvas = np.full((180, 480, 3), 255, dtype=np.uint8)
        canvas[:thumb.shape[0], :thumb.shape[1]] = thumb

        # Compare against the full history, not just the previous crop.
        if any(compare_frames(canvas, ref) >= threshold for ref in history):
            continue

        kept_crops.append(crop)
        history.append(canvas)

    # Phase 2: final de-duplication by OCR'd measure number.
    if kept_crops:
        kept_crops = _dedup_by_measure_number(kept_crops)

    print(f"  → 최종: {len(kept_crops)}개 고유 Tab 오버레이")
    return kept_crops


# ─── Step 5: A4 PDF 생성 ─────────────────────────────────────────────────

def generate_pdf(frames: List[np.ndarray], output_path: Path,
                 debug_dir: Optional[Path] = None) -> None:
    """Lay out tab images as stacked rows on A4 pages and save a PDF.

    Each frame is scaled to the printable width. A frame that would then be
    taller than the printable height is shrunk further to fit, so it is
    neither clipped nor preceded by a blank page. Rows are stacked top to
    bottom with a fixed gap, and a new page starts when the next row does
    not fit.

    Args:
        frames: BGR images, one per output row.
        output_path: Destination PDF file.
        debug_dir: When given, every unscaled frame is also saved there as
            ``frame_NNNN.png``.

    Returns:
        None. Nothing is written when *frames* is empty.
    """
    print("[5/5] A4 PDF 생성 중...")
    if not frames:
        print("  ⚠ 프레임 없음!")
        return

    # Page geometry in pixels at PDF_DPI.
    page_w = int(PDF_PAGE_WIDTH_MM / 25.4 * PDF_DPI)
    page_h = int(PDF_PAGE_HEIGHT_MM / 25.4 * PDF_DPI)
    margin = int(PDF_MARGIN_MM / 25.4 * PDF_DPI)
    gap = int(TAB_GAP_MM / 25.4 * PDF_DPI)
    content_w = page_w - 2 * margin
    content_h = page_h - 2 * margin

    resized = []
    for i, frame in enumerate(frames):
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = Image.fromarray(rgb)
        if debug_dir:
            img.save(debug_dir / f"frame_{i:04d}.png")

        scale = content_w / img.width
        target_w = content_w
        # max(1, ...) guards against a zero-height resize for very wide, thin frames.
        target_h = max(1, int(img.height * scale))
        if target_h > content_h:
            # Too tall for one page: shrink to the printable height instead of clipping.
            target_w = max(1, int(target_w * content_h / target_h))
            target_h = content_h
        resized.append(img.resize((target_w, target_h), Image.LANCZOS))

    pages = []
    cur_y = margin
    page = Image.new('RGB', (page_w, page_h), (255, 255, 255))

    for img in resized:
        # Break only when the current page already holds a row; this avoids
        # emitting an empty page.
        if cur_y > margin and cur_y + img.height > page_h - margin:
            pages.append(page)
            page = Image.new('RGB', (page_w, page_h), (255, 255, 255))
            cur_y = margin
        page.paste(img, (margin, cur_y))
        cur_y += img.height + gap

    # The last page always holds at least one row at this point.
    pages.append(page)

    pages[0].save(str(output_path), save_all=True,
                  append_images=pages[1:], resolution=PDF_DPI)
    print(f"  → PDF: {len(resized)} Tab → {len(pages)} 페이지, {output_path.stat().st_size // 1024} KB")


def generate_long_image(frames: List[np.ndarray], output_path: Path) -> None:
    """Stack all tab images vertically into one tall image and save it.

    Frames narrower or wider than the widest frame are resized to that
    width, keeping their aspect ratio, before stacking.

    Args:
        frames: BGR images.
        output_path: Destination image file.

    Returns:
        None. Nothing is written when *frames* is empty.
    """
    if not frames:
        return

    target_w = max(f.shape[1] for f in frames)

    def _fit_width(image):
        if image.shape[1] == target_w:
            return image
        scale = target_w / image.shape[1]
        return cv2.resize(image, (target_w, int(image.shape[0] * scale)))

    stacked = np.vstack([_fit_width(f) for f in frames])
    rgb = cv2.cvtColor(stacked, cv2.COLOR_BGR2RGB)
    Image.fromarray(rgb).save(str(output_path))
    print(f"  → 롱 이미지: {target_w}x{stacked.shape[0]}")


# ─── Main ─────────────────────────────────────────────────────────────────

def main():
    """Command-line entry point: download, sample, detect, extract, and export.

    Pipeline: download the video into ``output/``, sample frames, determine
    the layout pattern (or use ``--pattern``), extract unique tab images, and
    write a PDF plus a long PNG next to it. Exits with status 1 when no
    frames or no unique tab images are obtained.
    """
    parser = argparse.ArgumentParser(description="YouTube TAB → A4 PDF")
    parser.add_argument("url", help="YouTube URL")
    parser.add_argument("-o", "--output", help="출력 PDF 경로")
    parser.add_argument("--fps", type=float, default=DEFAULT_FPS)
    parser.add_argument("--similarity", type=float, default=None)
    parser.add_argument("--pattern", choices=["auto", "scroll", "overlay"],
                        default="auto")
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    output_dir = Path("output")
    output_dir.mkdir(exist_ok=True)
    debug_dir = None
    if args.debug:
        debug_dir = output_dir / "debug_frames"
        debug_dir.mkdir(exist_ok=True)

    video_path, safe_title = download_video(args.url, output_dir)
    frames = extract_frames(video_path, fps=args.fps)
    if not frames:
        print("❌ 프레임 추출 실패")
        sys.exit(1)

    pattern = detect_pattern(frames) if args.pattern == "auto" else args.pattern

    # Compare against None so an explicit `--similarity 0` is honoured
    # instead of silently falling back to the default.
    if pattern == "scroll":
        sim = args.similarity if args.similarity is not None else SIMILARITY_THRESHOLD
        unique = extract_unique_scroll(frames, threshold=sim)
    else:
        sim = args.similarity if args.similarity is not None else OVERLAY_SIMILARITY_THRESHOLD
        unique = extract_unique_overlay(frames, threshold=sim)

    if not unique:
        print("❌ 고유 Tab 프레임 없음. --similarity를 낮추거나 --pattern을 수동 지정하세요.")
        sys.exit(1)

    pdf_path = Path(args.output) if args.output else output_dir / f"{safe_title}.pdf"
    # A user-supplied -o path may point into a directory that does not exist yet.
    pdf_path.parent.mkdir(parents=True, exist_ok=True)
    generate_pdf(unique, pdf_path, debug_dir=debug_dir)
    generate_long_image(unique, pdf_path.with_suffix(".png"))

    print(f"\n✅ 완료! PDF: {pdf_path}")


if __name__ == "__main__":
    main()