# guitar_score/scripts/debug/test_ocr_crop.py

import cv2
import easyocr
import numpy as np
from youtube_tab_to_pdf import _extract_print_channel, _detect_measure_bars

# Grab one frame from the tab video to experiment on.
cap = cv2.VideoCapture(r"output/サカナクション／新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")
try:
    # Seek to frame index 50 (about 1.6 s in, assuming ~30 fps -- TODO confirm).
    cap.set(cv2.CAP_PROP_POS_FRAMES, 50)
    ret, frame = cap.read()
finally:
    # Only a single frame is needed, so free the capture handle right away
    # (also on error) instead of leaking it for the rest of the script.
    cap.release()

if not ret:
    # Abort with a message and a non-zero status; `exit()` is a `site` helper
    # meant for the interactive shell and would have quit silently.
    raise SystemExit("Could not read frame 50 from the input video")

# Collapse the colour channels to the per-pixel maximum, then keep only the
# bright pixels (> 200) as a 0/255 mask.
channel_max = frame.max(axis=2)
thresh = cv2.threshold(channel_max, 200, 255, cv2.THRESH_BINARY)[1]

# Count bright pixels per row; rows where more than half of the frame width
# is bright are treated as staff-line candidates.
bright_per_row = thresh.sum(axis=1) / 255
staff_rows = np.flatnonzero(bright_per_row > frame.shape[1] * 0.5)

# Use the top-most candidate row, or fall back to y=100 when none was found.
if staff_rows.size > 0:
    y_staff = staff_rows[0]
else:
    y_staff = 100

# Bar detection is delegated to the project helper, fed the binary mask.
bar_coords = _detect_measure_bars(thresh)
print(f"Detected Bars at X: {bar_coords}")

# English-only EasyOCR reader running on the CPU.
reader = easyocr.Reader(['en'], gpu=False)

# The vertical band searched for a number is the same for every bar: from 25 px
# to 2 px above the staff row, clamped to the top of the frame.
band_top = max(0, y_staff - 25)
band_bottom = max(0, y_staff - 2)
frame_width = frame.shape[1]

for idx, x_bar in enumerate(bar_coords):
    # Horizontal window: 5 px left to 25 px right of the bar, clamped to the frame.
    left = max(0, x_bar - 5)
    right = min(frame_width, x_bar + 25)

    # Nothing to crop if either extent collapsed to zero (or negative) size.
    if not (band_bottom > band_top and right > left):
        continue

    # Dump the raw crop so it can be inspected by eye.
    sprite = frame[band_top:band_bottom, left:right]
    cv2.imwrite(f"debug_sprite_{idx}.png", sprite)

    # Enlarge 3x (bicubic) before OCR, then convert to grayscale.
    scaled = cv2.resize(
        sprite,
        (0, 0),
        fx=3,
        fy=3,
        interpolation=cv2.INTER_CUBIC,
    )
    gray_sprite = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)

    # Inverted binary version (pixels > 180 become 0, the rest 255), saved for
    # inspection.
    binary_sprite = cv2.threshold(gray_sprite, 180, 255, cv2.THRESH_BINARY_INV)[1]
    cv2.imwrite(f"debug_sprite_bin_{idx}.png", binary_sprite)

    # NOTE(review): OCR runs on the grayscale sprite; the binarised one is only
    # written to disk -- confirm this is intentional.
    res = reader.readtext(gray_sprite, allowlist='0123456789')
    print(f"Bar {idx} X={x_bar} OCR: {res}")