wip: [01-stabilize] paused at task 1/1 - OCR-hallucination-immune logic via semantic delta window and fret isolation

2026-03-29 22:08:40 +09:00
parent aca7bf592a
commit 2507de45d3
4289 changed files with 732689 additions and 28672 deletions
--- a/youtube_tab_to_pdf.py
+++ b/youtube_tab_to_pdf.py
@@ -37,64 +37,56 @@ def _get_ocr_reader():
            return None
    return _ocr_reader

-def _dedup_by_measure_number(frames: List[np.ndarray]) -> List[np.ndarray]:
-    """OCR을 이용해 Tab 좌측 상단의 마디 번호를 읽고,
-       연속으로 동일한 번호가 검출되면 중복으로 간주하고 제거합니다."""
+def _extract_number_above_bars(crop: np.ndarray) -> Optional[int]:
+    """OCR the measure number printed just above the staff (left margin or over a bar line); return the smallest hit, or None."""
     reader = _get_ocr_reader()
-    if not reader:
-        return frames
-
-    print(f"  → 마디번호 기반 3차 중복 검증 시작 ({len(frames)} 프레임)")
-    unique = []
-    last_measure_num = None
-
-    for i, frame in enumerate(frames):
-        h, w = frame.shape[:2]
-        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) if len(frame.shape) == 3 else frame
+    if not reader: return None
+    
+    gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) if len(crop.shape) == 3 else crop.copy()
+    h, w = gray.shape
+    
+    # 1. Locate the top staff line: the first row whose dark pixels span more than 40% of the width.
+    _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
+    row_sums = np.sum(thresh, axis=1) / 255.0
+    staff_rows = np.where(row_sums > w * 0.4)[0]
+    
+    if len(staff_rows) == 0: return None
+    staff_top = staff_rows[0]
+    
+    # 2. Detect structural bar lines (|) using the existing helper.
+    bars = _detect_measure_bars(gray)
+    
+    candidates = []
+    # Measure numbers usually sit at the far left (5-10% of the width) or right above a bar line (|).
+    scan_x_anchors = [int(w * 0.05), int(w * 0.1)] + bars
+    
+    for x in scan_x_anchors:
+        # Crop only a narrow window: up to 80 px above the staff and 40 px either side of the anchor.
+        # The window must end ABOVE the staff (staff_top - 5): ending at staff_top + 10 picked up fret
+        # numbers on the first string as measure numbers. y2 is clamped so a negative value cannot wrap the slice.
+        y1 = max(0, staff_top - 80)
+        y2 = max(0, staff_top - 5)
+        x1 = max(0, x - 40)
+        x2 = min(w, x + 40)
         
-        # 동적 투영(Projection)을 통해 첫 번째 오선지(Staff line)의 Y좌표 스캔
-        _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
-        row_sums = np.sum(thresh, axis=1) / 255
-        
-        # 폭의 50% 이상을 차지하는 검은 가로선을 오선지로 간주
-        staff_lines = np.where(row_sums > w * 0.5)[0]
-        
-        if len(staff_lines) > 0:
-            first_line_y = staff_lines[0]
-            # 오선지 바로 위 영역 45px ~ 오선지 까지 (여유공간 2px) + 좌측 8% 너비만 추출 (기타 코드 다이어그램 제외)
-            crop_y_start = max(0, first_line_y - 45)
-            crop_y_end = max(10, first_line_y - 2)
-            crop = gray[crop_y_start:crop_y_end, :int(w * 0.08)]
-        else:
-            # 안전 장치: 오선지를 못 찾았을 경우 기존 하드코딩 비율 사용
-            crop = gray[:int(h * 0.25), :int(w * 0.08)]
+        region = gray[y1:y2, x1:x2]
+        if region.size == 0 or region.shape[0] < 15 or region.shape[1] < 15:
+            continue
             
-        # 작은 마디번호의 인식률 극대화를 위해 3배 업스케일링 및 이진화 처리
-        upscaled = cv2.resize(crop, (0, 0), fx=3.0, fy=3.0, interpolation=cv2.INTER_CUBIC)
+        # Upscale 3x and binarize to maximize the OCR recognition rate.
+        upscaled = cv2.resize(region, (0, 0), fx=3.0, fy=3.0, interpolation=cv2.INTER_CUBIC)
         _, upscaled_thresh = cv2.threshold(upscaled, 150, 255, cv2.THRESH_BINARY_INV)
         
         results = reader.readtext(upscaled_thresh, allowlist='0123456789')
-        
-        measure_num = None
-        if results:
-            # conf > 0.4 이면서 1~3자리의 숫자로만 이루어진 텍스트를 마디 번호로 간주
-            valid_results = [res[1] for res in results if res[2] > 0.4 and res[1].isdigit() and len(res[1]) <= 3]
-            if valid_results:
-                measure_num = valid_results[0]
-        
-        if measure_num is not None:
-            if measure_num == last_measure_num:
-                print(f"    - 프레임 {i+1}: 마디번호 [{measure_num}] 중복 감지 (삭제)")
-                continue
-            last_measure_num = measure_num
-            print(f"    - 프레임 {i+1}: 마디번호 [{measure_num}] (유지)")
-        else:
-            print(f"    - 프레임 {i+1}: 마디번호 미검출 (유지)")
-        
-        unique.append(frame)
-
-    print(f"  → OCR 3차: {len(unique)}개 고유 Tab 프레임")
-    return unique
+        for _, text, conf in results:
+            if conf > 0.35 and text.isdigit():
+                num = int(text)
+                if num < 500: # songs with more than 500 measures are rare, so larger values are dropped as outliers
+                    candidates.append(num)
+                    
+    if candidates:
+        return min(candidates) # if several measure numbers are read on one page, the smallest (first) represents it
+    return None

 # Windows 콘솔 인코딩
 if sys.platform == "win32":
@@ -305,7 +297,7 @@ def _trim_to_content(crop: np.ndarray, margin_px: int = 6) -> np.ndarray:
    top = 0
    for i in range(h):
        if valid_rows[i] and row_white[i] > 0.20:
-            top = max(0, i - 120)  # 상단 마디번호 보존을 위해 압도적인 120px 강제 보호 (숫자가 꽤 높이 떠있음)
+            top = max(0, i - 140)  # 마디번호 및 ┌─1 (도돌이표) 보존을 위해 역대급으로 넉넉한 140px 보호 여백 부여
            break

    # 하단: 마지막 유효 행
@@ -869,34 +861,56 @@ def extract_unique_overlay(frames: List[np.ndarray],
    print(f"[Tracker] {len(pages)}개의 최초 구분 페이지 추출됨. 전역 중복 페이지 병합 심사 중...")

    unique = []
+    last_measure_num = -1
    
-    for crop in pages:
-        if np.mean(cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)) < 80:
-            continue
-
-        is_dup = False
-        crop_gray = _extract_tracking_channel(crop)
+    def _is_duplicate_cv(new_crop: np.ndarray, past_crops: List[np.ndarray]) -> bool:
+        crop_gray = _extract_tracking_channel(new_crop)
        h_c, w_c = crop_gray.shape
        crop_gray[:int(h_c * 0.20), :] = 255
        crop_gray[int(h_c * 0.80):, :] = 255
        
-        for past_crop in unique:
-            past_gray = _extract_tracking_channel(past_crop)
+        for p_crop in past_crops:
+            past_gray = _extract_tracking_channel(p_crop)
            past_gray[:int(h_c * 0.20), :] = 255
            past_gray[int(h_c * 0.80):, :] = 255
            
-            # 약간의 위치 이동(+/- 10픽셀)을 탐색하기 위해 템플릿 사이즈를 줄임
            template = crop_gray[10:h_c-10, 10:w_c-10]
-            res = cv2.matchTemplate(past_gray, template, cv2.TM_CCOEFF_NORMED)
-            _, max_val, _, _ = cv2.minMaxLoc(res)
-            
-            # 90% 이상의 강한 상관계수를 가지면 인간의 눈에는 완벽히 똑같은 악보(도돌이표)임.
-            if max_val > 0.90:
-                is_dup = True
-                break
-                
-        if not is_dup:
-            unique.append(crop)
+            if template.shape[0] > 0 and past_gray.shape[0] >= template.shape[0] and past_gray.shape[1] >= template.shape[1]:
+                res = cv2.matchTemplate(past_gray, template, cv2.TM_CCOEFF_NORMED)
+                _, max_val, _, _ = cv2.minMaxLoc(res)
+                if max_val > 0.90:
+                    return True
+        return False
+
+    for crop in pages:
+        if np.mean(cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)) < 80:
+            continue
+
+        m_num = _extract_number_above_bars(crop)
+        
+        if m_num is not None:
+            # 1페이지당 통상 마디수 허용 범위 (+1 ~ +25 이내의 점진적 상승만 신뢰)
+            # 영상에서 마디번호가 10, 11, 12 ... 씩 1단위로 증가하거나, 1, 17, 33 처럼 페이지 단위로 증가하는 것을 상정
+            if last_measure_num < m_num <= last_measure_num + 25 or last_measure_num == -1:
+                print(f"    > [Keep] 마디번호 {m_num} (안정적인 순차 상승 패턴)")
+                last_measure_num = m_num
+                unique.append(crop)
+            else:
+                # 역행하거나(+0 이하), 너무 크게 점프한 경우(+25 초과) => 반복 코러스이거나 OCR 환각(32, 1017 등)임
+                # CV 보완 추론을 통해 진짜 새로운 악보인지 확인!
+                if _is_duplicate_cv(crop, unique):
+                    print(f"    > [Skip] 측정 번호({m_num}) 이상 & CV 분석 결과 이전 페이지와 동일(반복 코러스). 버림.")
+                    continue
+                else:
+                    print(f"    > [Keep] 측정 번호({m_num})는 OCR 환각이지만 CV 분석 결과 순수 새로운 페이지임! 번호 무시하고 채택.")
+                    # OCR 값이 환각이므로 last_measure_num은 갱신하지 않고 징검다리를 연결
+                    unique.append(crop)
+        else:
+            if not _is_duplicate_cv(crop, unique):
+                print(f"    > [Keep] OCR 무효/실패. Pixel 변화량에 따른 완전 새로운 페이지 채택")
+                unique.append(crop)
+            else:
+                print(f"    > [Skip] OCR 무효/실패 & Pixel 분석 결과 도돌이표 코러스로 판명. 버림.")

    print(f"  → 임시: {len(unique)}개 고유 오버레이 페이지 추출 성공. 상하단 여백 및 제목 정리 중...")
    
@@ -910,8 +924,8 @@ def extract_unique_overlay(frames: List[np.ndarray],
        h_c, w_c = crop.shape[:2]
        staff_rows = np.where(row_sums > w_c * 0.4)[0]
        if len(staff_rows) > 0:
-            # 상단 여백 60px (코드, 기호 등), 하단 여백 30px
-            top_y = max(0, staff_rows[0] - 60)
+            # ┌─1 (반복 기호) 및 마디 번호 등을 충분히 보존할 수 있도록 파격적으로 130px 여유를 둡니다.
+            top_y = max(0, staff_rows[0] - 130)
            bottom_y = min(h_c, staff_rows[-1] + 30)
            trimmed_unique.append(crop[top_y:bottom_y, :])
        else: