From cd159c2a99dfb0c7784f7565d6a44418cf2a34d8 Mon Sep 17 00:00:00 2001
From: CD <variet@variet.net>
Date: Sun, 29 Mar 2026 00:06:38 +0900
Subject: [PATCH] fix(cv): resolve measure duplication by isolating playhead
 and enforcing 1D sliding correlations

---
 .agent/references/known-issues.md   |   5 ++
 docs/devlog/2026-03-29.md           |   5 ++
 docs/devlog/entries/20260329-001.md |  24 ++++++
 video_cv_tracker.py                 | 119 ++++++++++++++++++++++++++++
 youtube_tab_to_pdf.py               | 112 +++++++++++++-------------
 5 files changed, 207 insertions(+), 58 deletions(-)
 create mode 100644 docs/devlog/2026-03-29.md
 create mode 100644 docs/devlog/entries/20260329-001.md
 create mode 100644 video_cv_tracker.py

diff --git a/.agent/references/known-issues.md b/.agent/references/known-issues.md
index 6f50728..92e3983 100644
--- a/.agent/references/known-issues.md
+++ b/.agent/references/known-issues.md
@@ -109,3 +109,8 @@
 - **해결**: OCR-First 방식 폐기. 파노라마 스티칭 후 물리적 마디구분선(|) 탐지 방식 유지
 - **주의**: 스크롤 영상에서 마디번호 기반 중복제거는 파노라마를 완성한 뒤 적용해야 의미가 있음
 
+### [2026-03-28] ORB 특징점 패턴 매칭 실패 — 마디 무단 복제
+- **증상**: Continuous Scroll 뷰에서 12마디와 29마디 같은 특정 구간이 통째로 복제되어 (12 뒤에 12, 29 뒤에 29) 삽입되는 현상.
+- **원인**: 영상 내 플레이헤드의 옅은 회색 잔상(200~220)이 씬 전환을 오탐, 이후 이중 병합 시도. ORB/SIFT 기반의 특징점 추출기는 반복 화성이 많은 기타 탭 악보 특성상 "11마디와 12마디"를 시각적으로 같은 곳이라 착각하여 다른 마디 위치로 강제 Overlap 시킴.
+- **해결**: `cv2.threshold(THRESH_BINARY_INV)`로 플레이헤드를 물리적 삭제하여 씬오탐 근절. Canny Edge 기반 1D Morphological `matchTemplate` 스티칭으로 롤백. 스크롤 탭에서 불필요한 Full-Page 덮어쓰기 로직 원천 차단.
+- **주의**: 단순 배경/글자 매칭이 아닌 *반복적 패턴*이 생명인 악보에서는 부분 특징점 매칭(ORB) 알고리즘이 픽셀의 시계열 순서(Monotonicity)를 완전히 망가뜨림. 1D Correlation 윈도우 스티칭이 음악의 선형 복원에는 더 정교함.
diff --git a/docs/devlog/2026-03-29.md b/docs/devlog/2026-03-29.md
new file mode 100644
index 0000000..c65f545
--- /dev/null
+++ b/docs/devlog/2026-03-29.md
@@ -0,0 +1,5 @@
+# 2026-03-29
+
+| NNN | HH:MM | 작업 설명 | `커밋해시` | ✅ 또는 🔧 |
+|---|---|---|---|---|
+| 001 | 00:00 | 스크롤/페이징 복합 패턴 완벽 추적 및 ORB 마디 중복 파이프라인 버그 해결 | `TBD` | ✅ |
diff --git a/docs/devlog/entries/20260329-001.md b/docs/devlog/entries/20260329-001.md
new file mode 100644
index 0000000..ec59a69
--- /dev/null
+++ b/docs/devlog/entries/20260329-001.md
@@ -0,0 +1,24 @@
+# 스크롤 탭뷰에서의 ORB 특징점 추적기 한계와 1D Template Matching의 최적성 증명
+
+- **시간**: 2026-03-28 23:00~2026-03-29 00:05
+- **Commit**: `TBD`
+- **Vikunja**: #TBD → done
+
+## 설계 결정 및 분석 내역
+유저가 제기한 '12마디 앞의 12마디 붙음(중복 복제)' 및 '마디 번호 건너뜀' 현상을 조사하는 과정에서 ORB 기반 특징점 매칭의 한계와, 영상 편집자의 복합 패턴(페이지 넘김 + 스크롤)의 실체를 파악함.
+
+### 원인 1: 플레이헤드로 인한 Optical Flow 오판
+- 영상이 스크롤되는 와중 노란색 플레이헤드가 악보 위를 가로지름.
+- 기존의 Red/Print 채널 추출(`np.max`)은 노란색을 완벽히 지우지 못하고 옅은 회색 잔상(200~220)을 남겼음.
+- 회색 박스가 움직이는 픽셀 변화는 Correlation(`cv2.matchTemplate(..., TM_CCOEFF_NORMED)`)의 `max_val`을 0.85 미만으로 깎아먹었고, 이는 Tracker가 "영상이 페이드/전환(Transition) 중이다"라고 오해하게 만듦.
+
+### 원인 2: 특징점 기반 매칭(ORB)의 악보 매칭 불가
+- 스크롤 중 발생한 일시적 `in_transition`이 해제(Recovered)될 때, ORB가 이전 영상(`search_region`)과 새로운 영상의 중복을 찾으려 시도함.
+- 기타 악보는 같은 음표, 같은 파워코드, 같은 8분음표 줄기 등 반복적인 요소가 매우 강함.
+- `검색 윈도우(search_region)` 안에 "11마디"와 "12마디"가 들어있고, `새로운 페이지`가 "11마디"부터 시작할 경우, ORB 매칭은 11마디의 특징점을 11마디와 매칭하는 대신 12마디의 특징점과 가장 많이 매칭(강한 일치표, 12표 이상)할 수 있음.
+- 이는 11마디의 위치를 12마디 위에 덧붙여 마디가 무단 복제(12마디 2개)되는 기현상을 유발함. ORB는 형태만 볼 뿐, 음악적 순서(Monotonic sequence)를 물리적으로 이해하지 못하기 때문임.
+
+### 해결 전략: 절대 이진화 + 마디 단위 1D 슬라이딩 스티칭 부활
+1. **절대 이진화 (Absolute Binarization):** `cv2.threshold(gray, 120, 255, THRESH_BINARY_INV)`를 써서 플레이헤드를 픽셀 단위에서 아예 삭제해버림. 이로써 `conf`는 스크롤 중 무조건 `1.0`을 유지하며, 씬전환(Transition) 오탐이 완전히 소멸됨.
+2. **Page-Turn 1D Template 부활:** ORB는 폐기하고, Morphological Filter(`MORPH_OPEN, (1, 7)`)를 통해 가로선(오선지)을 지운 `Canny Edge` 뷰를 생성. 이후 직전 500px 구역에 대해서만 1D `matchTemplate`을 돌려 슬라이딩 중첩 검사를 수행함.
+3. 이 두 가지를 결합함으로써, "연속 스크롤 영상"에서는 `dx` 파트만 누적시키고 페이징을 차단하며, "페이드 아웃 & 페이지 넘김" 패턴에서는 완벽한 픽셀 단위 Overlap 병합을 자동으로 판단해 수행하는 **차세대 통합 Tracker**로 진화됨.
diff --git a/video_cv_tracker.py b/video_cv_tracker.py
new file mode 100644
index 0000000..aea5fcc
--- /dev/null
+++ b/video_cv_tracker.py
@@ -0,0 +1,119 @@
+import cv2
+import numpy as np
+from typing import List, Tuple, Optional
+
+class TemporalTracker:  # Incrementally stitches successive frames into one horizontal panorama.
+    def __init__(self, min_confidence: float = 0.15):  # min_confidence: correlation floor for accepting a shift
+        self.min_confidence = min_confidence
+        self.last_clean_frame = None  # previous frame handed to process_frame
+        self.last_conf = 1.0  # match confidence computed for the previous frame
+        self.panorama = None  # stitched image so far; None until the first frame arrives
+        self.total_frames_processed = 0
+        self.in_transition = False  # True while a scene cut / page turn is suspected
+
+    def _extract_tracking_channel(self, bgr: np.ndarray) -> np.ndarray:  # returns channel 0; not called inside this class
+        return bgr[:, :, 0]
+        
+    def _extract_print_channel(self, bgr: np.ndarray) -> np.ndarray:
+        # Take the per-pixel brightest channel so a colored (yellow/blue) playhead blends into the white background
+        gray = np.max(bgr, axis=2)
+        # Keep only pixels with value <= 120 (dark note ink) as foreground (255); this removes the playhead entirely
+        _, binary = cv2.threshold(gray, 120, 255, cv2.THRESH_BINARY_INV)
+        return binary
+
+    def _calculate_pixel_shift(self, prev_img: np.ndarray, curr_img: np.ndarray) -> Tuple[int, float]:
+        h, w = prev_img.shape[:2]
+        
+        # Match on the print channel (notes only, highlight removed) so the playhead cannot disturb the correlation
+        prev_chan = self._extract_print_channel(prev_img)
+        curr_chan = self._extract_print_channel(curr_img)
+        
+        # Template: the 60%~90% horizontal band (right side) of the PREV frame
+        template_w = int(w * 0.3)
+        start_x = int(w * 0.6)
+        template = prev_chan[:, start_x:start_x + template_w]
+        
+        res = cv2.matchTemplate(curr_chan, template, cv2.TM_CCOEFF_NORMED)
+        _, max_val, _, max_loc = cv2.minMaxLoc(res)
+        
+        curr_x = max_loc[0]
+        scroll_dx = start_x - curr_x
+        
+        if max_val < self.min_confidence or scroll_dx <= 0:
+            return 0, max_val
+            
+        # Physical limit on tab scroll speed: maximum movement per frame, assuming 2 fps sampling (author's assumption)
+        # One 1280px page usually takes 4~10 s to scroll past, so the movement per 0.5 s stays under 150px.
+        # A far larger jump (e.g. 500px) means the matcher locked onto a look-alike but different measure (false match).
+        # Shifts above the allowed limit are therefore ignored (dx=0) so the measure order cannot get scrambled.
+        max_dx = w * 0.15
+        
+        if scroll_dx > max_dx:
+            return 0, max_val
+            
+        return scroll_dx, max_val
+
+    def process_frame(self, frame: np.ndarray) -> None:
+        self.total_frames_processed += 1
+        
+        if self.panorama is None:
+            self.panorama = frame.copy()
+            self.last_clean_frame = frame.copy()
+            return
+            
+        dx, conf = self._calculate_pixel_shift(self.last_clean_frame, frame)
+        
+        # Scene-cut entry condition: low confidence, or a sudden confidence drop versus the previous frame
+        if (conf < 0.45) or (self.last_conf - conf > 0.3):
+            self.in_transition = True
+        
+        # In transition and the view has only now become fully stable (no motion) == right after a page turn "finished"
+        elif self.in_transition and conf > 0.85 and dx == 0:
+            self.in_transition = False
+            
+            # Attach the clean frame seen after the transition (fade/slide) ended, overlapping it with the panorama tail
+            if self.panorama is not None and self.panorama.shape[1] > 0:
+                h = self.panorama.shape[0]
+                new_page = cv2.resize(frame, (frame.shape[1], h))
+                
+                head_w = min(500, new_page.shape[1])
+                head = self._extract_print_channel(new_page[:, 50:50+head_w])  # template starts 50px into the new page
+                
+                search_w = min(head_w + 500, self.panorama.shape[1])
+                search_region = self._extract_print_channel(self.panorama[:, -search_w:])
+                
+                if head.shape[1] > 0 and search_region.shape[1] >= head.shape[1]:
+                    edge_search = cv2.Canny(cv2.GaussianBlur(search_region, (3,3), 0), 30, 100)
+                    edge_head = cv2.Canny(cv2.GaussianBlur(head, (3,3), 0), 30, 100)
+                    
+                    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 7))  # 1 wide x 7 tall: opening keeps vertical edges only
+                    edge_search = cv2.morphologyEx(edge_search, cv2.MORPH_OPEN, kernel)
+                    edge_head = cv2.morphologyEx(edge_head, cv2.MORPH_OPEN, kernel)
+                    
+                    if np.count_nonzero(edge_head) > 50:
+                        res = cv2.matchTemplate(edge_search, edge_head, cv2.TM_CCOEFF_NORMED)
+                        _, max_val, _, max_loc = cv2.minMaxLoc(res)
+                        
+                        if max_val > 0.25:
+                            overlap_px = search_w - max_loc[0] + 50  # new_page columns already in the panorama (+50 undoes the head offset)
+                            if overlap_px < new_page.shape[1] - 50:
+                                self.panorama = np.hstack([self.panorama, new_page[:, overlap_px:]])
+                        else:
+                            self.panorama = np.hstack([self.panorama, new_page])
+                    else:
+                        self.panorama = np.hstack([self.panorama, new_page])
+                else:
+                    self.panorama = np.hstack([self.panorama, new_page])
+
+        # Scroll at normal speed (handled only while not in a transition cooldown)
+        elif dx > 0 and dx < frame.shape[1] and not self.in_transition:
+            new_strip = frame[:, frame.shape[1] - dx:, :]
+            if new_strip.shape[0] != self.panorama.shape[0]:
+                new_strip = cv2.resize(new_strip, (dx, self.panorama.shape[0]))
+            self.panorama = np.hstack([self.panorama, new_strip])
+
+        self.last_conf = conf
+        self.last_clean_frame = frame.copy()
+
+    def get_final_panorama(self) -> Optional[np.ndarray]:  # returns the stitched image, or None if no frame was processed
+        return self.panorama
diff --git a/youtube_tab_to_pdf.py b/youtube_tab_to_pdf.py
index a989b12..1398ca4 100644
--- a/youtube_tab_to_pdf.py
+++ b/youtube_tab_to_pdf.py
@@ -18,6 +18,7 @@ from pathlib import Path
 from typing import List, Tuple, Optional
 
 import cv2
+from video_cv_tracker import TemporalTracker
 import numpy as np
 import img2pdf
 from PIL import Image
@@ -156,12 +157,11 @@ def download_video(url: str, output_dir: Path) -> Tuple[Path, str]:
         print(f"  → 이미 다운로드됨: {video_path.name}")
         return video_path, safe_title
 
-    # 720p 우선 (다운스케일링 부하 원천 차단)
+    # 영상 추출 처리(CV)만 필요하므로, ffmpeg 병합이 불필요한 video-only 고화질 포맷(720p)을 직접 요청하여 360p 강등을 방지
     subprocess.run(
         [yt_dlp,
-         "-f", "bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/"
-               "best[height<=720]/best",
-         "--merge-output-format", "mp4",
+         "-f", "bestvideo[ext=mp4]",
+         "-S", "res:720",
          "-o", str(video_path), url],
         encoding="utf-8", errors="replace", check=True
     )
@@ -659,90 +659,86 @@ def merge_panoramas_list(panoramas):
     return merged_list
 
 def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
-    print(f"[4/5] 스크롤형 Tab 추출 중 (threshold={threshold})...")
+    print(f"[4/5] 스크롤형 Tab 시계열 추적 추출 중...")
+    
     strip_tops, strip_bottoms = [], []
-    for frame in frames:
+    for frame in frames[:50]:
         strip = _find_white_tab_strip(frame)
         if strip:
             strip_tops.append(strip[0])
             strip_bottoms.append(strip[1])
-    if not strip_tops: return []
+            
+    if not strip_tops:
+        return []
+        
     median_top = int(np.median(strip_tops))
     median_bottom = int(np.median(strip_bottoms))
     
-    candidates, all_compared = [], []
+    tracker = TemporalTracker()
+    
     for frame in frames:
         h = frame.shape[0]
         tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
-        if not _has_tab_content(tab_crop): continue
-        compare_img = cv2.resize(tab_crop, (480, 120), interpolation=cv2.INTER_AREA)
-        is_dup = False
-        for ref in all_compared:
-            if compare_frames(compare_img, ref) >= threshold:
-                is_dup = True
-                break
-        if not is_dup:
-            candidates.append(tab_crop)
-            all_compared.append(compare_img)
-
-    stitched = _merge_scroll_candidates(candidates)
-    merged_panoramas = merge_panoramas_list(stitched)
-    
-    chunk_width = candidates[0].shape[1] if candidates else 1280
-    final_chunks = []
-    global_measure_counter = 1
-    current_row = None
-    
-    for pano in merged_panoramas:
-        gray_pano = _extract_print_channel(pano)
-        bar_coords = _detect_measure_bars(gray_pano)
-        
-        if not bar_coords:
-            w = pano.shape[1]
-            start_x = 0
-            while start_x < w:
-                chunk = pano[:, start_x:min(w, start_x + chunk_width)]
-                if chunk.shape[1] < chunk_width:
-                    pad = np.full((chunk.shape[0], chunk_width - chunk.shape[1], 3), 255, dtype=np.uint8)
-                    chunk = np.hstack([chunk, pad])
-                gray_c = _extract_print_channel(chunk)
-                final_chunks.append(cv2.cvtColor(gray_c, cv2.COLOR_GRAY2BGR))
-                start_x += chunk_width
+        if not _has_tab_content(tab_crop): 
             continue
-            
-        coords = [0] + bar_coords + [pano.shape[1]]
+        tracker.process_frame(tab_crop)
+
+    panorama = tracker.get_final_panorama()
+    if panorama is None:
+        return []
+        
+    print(f"  -> 생성된 파노라마 길이: {panorama.shape[1]}px")
+    
+    chunk_width = 1280
+    final_chunks = []
+    
+    gray_pano = _extract_print_channel(panorama)
+    bar_coords = _detect_measure_bars(gray_pano)
+    
+    if not bar_coords:
+        w = panorama.shape[1]
+        start_x = 0
+        while start_x < w:
+            chunk = panorama[:, start_x:min(w, start_x + chunk_width)]
+            if chunk.shape[1] < chunk_width:
+                pad = np.full((chunk.shape[0], chunk_width - chunk.shape[1], 3), 255, dtype=np.uint8)
+                chunk = np.hstack([chunk, pad])
+            final_chunks.append(chunk)
+            start_x += chunk_width
+    else:
+        coords = [0] + bar_coords + [panorama.shape[1]]
         coords = sorted(list(set(coords)))
         
+        current_row = None
         for i in range(len(coords) - 1):
             x_start = coords[i]
             x_end = coords[i+1]
             if x_end - x_start < 50:
                 continue
                 
-            measure_img = pano[:, x_start:x_end]
-            gray_m = _extract_print_channel(measure_img)
-            bgr_m = cv2.cvtColor(gray_m, cv2.COLOR_GRAY2BGR)
+            measure_img = panorama[:, x_start:x_end]
             
             if current_row is None:
-                current_row = bgr_m
+                current_row = measure_img
             else:
-                if current_row.shape[1] + bgr_m.shape[1] > chunk_width:
+                if current_row.shape[1] + measure_img.shape[1] > chunk_width:
                     pad_w = chunk_width - current_row.shape[1]
                     if pad_w > 0:
                         pad_img = np.full((current_row.shape[0], pad_w, 3), 255, dtype=np.uint8)
                         current_row = np.hstack([current_row, pad_img])
                     final_chunks.append(current_row)
-                    current_row = bgr_m
+                    current_row = measure_img
                 else:
-                    current_row = np.hstack([current_row, bgr_m])
+                    current_row = np.hstack([current_row, measure_img])
                     
-    if current_row is not None:
-        pad_w = chunk_width - current_row.shape[1]
-        if pad_w > 0:
-            pad_img = np.full((current_row.shape[0], pad_w, 3), 255, dtype=np.uint8)
-            current_row = np.hstack([current_row, pad_img])
-        final_chunks.append(current_row)
-
+        if current_row is not None:
+            pad_w = chunk_width - current_row.shape[1]
+            if pad_w > 0:
+                pad_img = np.full((current_row.shape[0], pad_w, 3), 255, dtype=np.uint8)
+                current_row = np.hstack([current_row, pad_img])
+            final_chunks.append(current_row)
+            
+    print(f"  -> A4 분할 컷: {len(final_chunks)}개")
     return final_chunks
 
 def extract_unique_overlay(frames: List[np.ndarray],