chore(docs): document ScoreExtractor tiling and refactor debug scripts (#563)

2026-03-29 17:57:40 +09:00
parent 39b55f2e9f
commit ac0c098259
698 changed files with 141180 additions and 195 deletions
--- a/youtube_tab_to_pdf.py
+++ b/youtube_tab_to_pdf.py
@@ -268,10 +268,11 @@ def _find_white_tab_strip(frame: np.ndarray, min_strip_ratio: float = 0.10) -> O
    # 가장 넓은 흰색 스트립 반환
    best = max(regions, key=lambda r: r[1] - r[0])

-    # 약간의 패딩 추가 (하단 짤림 방지)
-    pad = int(h * 0.03)
-    top = max(0, best[0] - pad)
-    bottom = min(h, best[1] + pad)
+    # 추가 패딩: 상단은 반복선 브래킷(┌─ 1.) 보존을 위해 크게 잡음
+    pad_top = int(h * 0.15)
+    pad_bottom = int(h * 0.03)
+    top = max(0, best[0] - pad_top)
+    bottom = min(h, best[1] + pad_bottom)

    return (top, bottom)

@@ -658,88 +659,55 @@ def merge_panoramas_list(panoramas):
    merged_list.append(current_master)
    return merged_list

-def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
-    print(f"[4/5] 스크롤형 Tab 시계열 추적 추출 중...")
+def extract_unique_scroll(frames: List[np.ndarray], scan_dist: int = 4) -> List[np.ndarray]:
+    """
+    Extract tab rows from video frames; `scan_dist` is accepted but unused here.
+    Each frame is cropped to the detected tab strip and fed to TemporalTracker;
+    ScoreExtractor.tile_to_a4 then turns the unique pages into the returned rows.
+    """
+    from video_cv_tracker import TemporalTracker
+    from score_extractor import ScoreExtractor
    
-    strip_tops, strip_bottoms = [], []
-    for frame in frames[:50]:
-        strip = _find_white_tab_strip(frame)
-        if strip:
-            strip_tops.append(strip[0])
-            strip_bottoms.append(strip[1])
+    print("[Pipeline] Isolating static structures via TemporalTracker")
+    # Tracker handles Temporal Median to isolate sheet music overlays
+    tracker = TemporalTracker(diff_threshold=0.05)
+    
+    # Sample every 30th frame and use the first detected white tab strip as the crop bounds
+    tab_bounds = None
+    for f in frames[::30]:
+        bounds = _find_white_tab_strip(f)
+        if bounds:
+            tab_bounds = bounds
+            break
            
-    if not strip_tops:
-        return []
-        
-    median_top = int(np.median(strip_tops))
-    median_bottom = int(np.median(strip_bottoms))
-    
-    tracker = TemporalTracker()
+    if tab_bounds:
+        top, bottom = tab_bounds
+        print(f"  -> Found precise sheet music bounds: Y={top} to Y={bottom}")
+    else:
+        top, bottom = 0, frames[0].shape[0]
+        print(f"  -> Bounding box not found, fallback to full frame: Y={top} to Y={bottom}")
    
    for frame in frames:
-        h = frame.shape[0]
-        tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
-        if not _has_tab_content(tab_crop): 
-            continue
-        tracker.process_frame(tab_crop)
-
-    panorama = tracker.get_final_panorama()
-    if panorama is None:
-        return []
+        # Crop each frame to the detected tab strip so the tracker only sees the
+        # sheet-music rows (this is the full frame when no strip was found).
+        roi = frame[top:bottom, :]
+        tracker.process_frame(roi)
        
-    print(f"  -> 생성된 파노라마 길이: {panorama.shape[1]}px")
+    unique_pages = tracker.get_unique_pages()
+    print(f"[Pipeline] Reduced down to {len(unique_pages)} static structural median pages.")
    
-    chunk_width = 1280
-    final_chunks = []
+    # State Machine extraction
+    extractor = ScoreExtractor()
+    extractor.process_pages(unique_pages)
+    tiled_rows = extractor.tile_to_a4(chunk_width=1800)
    
-    gray_pano = _extract_print_channel(panorama)
-    bar_coords = _detect_measure_bars(gray_pano)
-    
-    if not bar_coords:
-        w = panorama.shape[1]
-        start_x = 0
-        while start_x < w:
-            chunk = panorama[:, start_x:min(w, start_x + chunk_width)]
-            if chunk.shape[1] < chunk_width:
-                pad = np.full((chunk.shape[0], chunk_width - chunk.shape[1], 3), 255, dtype=np.uint8)
-                chunk = np.hstack([chunk, pad])
-            final_chunks.append(chunk)
-            start_x += chunk_width
-    else:
-        coords = [0] + bar_coords + [panorama.shape[1]]
-        coords = sorted(list(set(coords)))
+    # Thresholding already yields black text (0) on a white background (255),
+    # so the tiled rows are used as-is without inversion.
+    final_a4_chunks = []
+    for row in tiled_rows:
+        final_a4_chunks.append(row)
        
-        current_row = None
-        for i in range(len(coords) - 1):
-            x_start = coords[i]
-            x_end = coords[i+1]
-            if x_end - x_start < 50:
-                continue
-                
-            measure_img = panorama[:, x_start:x_end]
-            
-            if current_row is None:
-                current_row = measure_img
-            else:
-                if current_row.shape[1] + measure_img.shape[1] > chunk_width:
-                    pad_w = chunk_width - current_row.shape[1]
-                    if pad_w > 0:
-                        pad_img = np.full((current_row.shape[0], pad_w, 3), 255, dtype=np.uint8)
-                        current_row = np.hstack([current_row, pad_img])
-                    final_chunks.append(current_row)
-                    current_row = measure_img
-                else:
-                    current_row = np.hstack([current_row, measure_img])
-                    
-        if current_row is not None:
-            pad_w = chunk_width - current_row.shape[1]
-            if pad_w > 0:
-                pad_img = np.full((current_row.shape[0], pad_w, 3), 255, dtype=np.uint8)
-                current_row = np.hstack([current_row, pad_img])
-            final_chunks.append(current_row)
-            
-    print(f"  -> A4 분할 컷: {len(final_chunks)}개")
-    return final_chunks
+    return final_a4_chunks

 def extract_unique_overlay(frames: List[np.ndarray],
                           threshold: float = OVERLAY_SIMILARITY_THRESHOLD) -> List[np.ndarray]:
@@ -804,8 +772,8 @@ def generate_pdf(frames: List[np.ndarray], output_path: Path,
        print("  ⚠ 프레임 없음!")
        return

-    page_w = int(PDF_PAGE_WIDTH_MM / 25.4 * PDF_DPI)
-    page_h = int(PDF_PAGE_HEIGHT_MM / 25.4 * PDF_DPI)
+    page_w = int(PDF_PAGE_HEIGHT_MM / 25.4 * PDF_DPI)  # Landscape width
+    page_h = int(PDF_PAGE_WIDTH_MM / 25.4 * PDF_DPI)   # Landscape height
    margin = int(PDF_MARGIN_MM / 25.4 * PDF_DPI)
    gap = int(TAB_GAP_MM / 25.4 * PDF_DPI)
    content_w = page_w - 2 * margin
@@ -843,20 +811,34 @@ def generate_pdf(frames: List[np.ndarray], output_path: Path,
    print(f"  → PDF: {len(resized)} Tab → {len(pages)} 페이지, {output_path.stat().st_size // 1024} KB")


-def generate_long_image(frames: List[np.ndarray], output_path: Path) -> None:
-    """Tab을 하나의 긴 이미지로"""
-    if not frames:
+def generate_long_image(chunks: List[np.ndarray], output_path: str):
+    if not chunks:
        return
-    max_w = max(f.shape[1] for f in frames)
-    imgs = []
-    for f in frames:
-        if f.shape[1] != max_w:
-            scale = max_w / f.shape[1]
-            f = cv2.resize(f, (max_w, int(f.shape[0] * scale)))
-        imgs.append(f)
-    concat = np.vstack(imgs)
-    Image.fromarray(cv2.cvtColor(concat, cv2.COLOR_BGR2RGB)).save(str(output_path))
-    print(f"  → 롱 이미지: {max_w}x{concat.shape[0]}")
+        
+    print(f"DEBUG: First chunk shape = {chunks[0].shape}, dtype = {chunks[0].dtype}")
+    # Calculate exact total height required
+    total_h = sum(chunk.shape[0] for chunk in chunks)
+    max_w = max(chunk.shape[1] for chunk in chunks)
+    
+    # Match the canvas channel layout (3-channel vs. single-channel) to the first chunk
+    if len(chunks[0].shape) == 3:
+        canvas = np.full((total_h, max_w, 3), 255, dtype=np.uint8)
+    else:
+        canvas = np.full((total_h, max_w), 255, dtype=np.uint8)
+        
+    y_offset = 0
+    for chunk in chunks:
+        h, w = chunk.shape[:2]
+        if len(chunk.shape) == 3 and len(canvas.shape) == 2:
+            canvas[y_offset:y_offset+h, :w] = cv2.cvtColor(chunk, cv2.COLOR_BGR2GRAY)
+        elif len(chunk.shape) == 2 and len(canvas.shape) == 3:
+            canvas[y_offset:y_offset+h, :w] = cv2.cvtColor(chunk, cv2.COLOR_GRAY2BGR)
+        else:
+            canvas[y_offset:y_offset+h, :w] = chunk
+            
+        y_offset += h
+        
+    cv2.imwrite(str(output_path), canvas)


 # ─── Main ─────────────────────────────────────────────────────────────────