Files
guitar_score/scripts/debug/test_ocr_crop.py

45 lines
1.6 KiB
Python

import cv2
import easyocr
import numpy as np
from youtube_tab_to_pdf import _extract_print_channel, _detect_measure_bars
# Debug script: grab a single frame from a tab video, locate the first bright
# horizontal line (taken to be the top of the staff) and the measure bars,
# then OCR the small strip just above each bar and print what was read.
# Intermediate crops are written to the current directory as PNG files.

VIDEO_PATH = r"output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4"
# Frame to inspect; the original note says "1.6 seconds in" (the exact time
# depends on the video's frame rate).
FRAME_INDEX = 50

cap = cv2.VideoCapture(VIDEO_PATH)
try:
    if not cap.isOpened():
        raise SystemExit(f"Could not open video: {VIDEO_PATH}")
    cap.set(cv2.CAP_PROP_POS_FRAMES, FRAME_INDEX)
    ret, frame = cap.read()
finally:
    # Only one frame is needed, so free the decoder right away instead of
    # holding the file handle for the whole OCR run.
    cap.release()

if not ret:
    # Fail loudly with a non-zero status rather than exiting silently.
    raise SystemExit(f"Could not read frame {FRAME_INDEX} from: {VIDEO_PATH}")

# Per-pixel maximum over the colour channels, then a hard threshold at 200.
gray = np.max(frame, axis=2)
_, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)

# Number of white pixels in each row; a row counts as a staff line when more
# than half of the frame width is white.
row_sums = np.sum(thresh, axis=1) / 255
staff_lines = np.where(row_sums > frame.shape[1] * 0.5)[0]
# Topmost matching row, or a fixed fallback of 100 when none is found.
y_staff = staff_lines[0] if len(staff_lines) > 0 else 100

# presumably returns the x coordinates of the measure bars — helper is defined
# in youtube_tab_to_pdf and not visible here; confirm against its definition.
bar_coords = _detect_measure_bars(thresh)
print(f"Detected Bars at X: {bar_coords}")

reader = easyocr.Reader(['en'], gpu=False)

# The vertical extent of the crop depends only on the staff position, so it
# is computed once outside the loop.
crop_y1 = max(0, y_staff - 25)
crop_y2 = max(0, y_staff - 2)

for idx, x_bar in enumerate(bar_coords):
    # Crop the tiny region above the bar where the number should be
    crop_x1 = max(0, x_bar - 5)
    crop_x2 = min(frame.shape[1], x_bar + 25)
    if crop_y2 <= crop_y1 or crop_x2 <= crop_x1:
        # Empty crop (staff too close to the top, or bar outside the frame).
        continue

    sprite = frame[crop_y1:crop_y2, crop_x1:crop_x2]
    cv2.imwrite(f"debug_sprite_{idx}.png", sprite)

    # Scale up for better OCR
    scaled = cv2.resize(sprite, (0, 0), fx=3, fy=3, interpolation=cv2.INTER_CUBIC)
    gray_sprite = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
    # Inverted binarisation: pixels brighter than 180 become 0, the rest 255.
    _, binary_sprite = cv2.threshold(gray_sprite, 180, 255, cv2.THRESH_BINARY_INV)
    cv2.imwrite(f"debug_sprite_bin_{idx}.png", binary_sprite)

    # NOTE(review): OCR runs on the grayscale sprite; the binarised one is only
    # saved for inspection. Confirm whether binary_sprite was meant to be read.
    res = reader.readtext(gray_sprite, allowlist='0123456789')
    print(f"Bar {idx} X={x_bar} OCR: {res}")