# guitar_score/verify_pdf.py

import fitz
import cv2
import numpy as np

def _get_ocr_reader():
    """Build and return an EasyOCR reader for English text.

    The ``easyocr`` import happens inside the function, so merely importing
    this module does not import ``easyocr``.
    """
    from easyocr import Reader

    languages = ['en']
    return Reader(languages)

def _pixmap_to_gray(pix):
    """Return the pixels of a rendered pixmap as a 2-D uint8 grayscale array.

    Args:
        pix: A pixmap exposing ``samples``, ``height``, ``width`` and ``n``
            (channels per pixel), as returned by ``page.get_pixmap``.

    Returns:
        A ``(height, width)`` uint8 array.
    """
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)

    if pix.n in (1, 2):
        # Gray (optionally with alpha): the first channel is already the
        # gray value, so no colour conversion is needed.
        return np.ascontiguousarray(img[:, :, 0])
    if pix.n == 4:
        # NOTE(review): assumes 4 channels means RGB + alpha, which is what
        # a default-colorspace render with alpha produces -- confirm if a
        # CMYK colorspace is ever requested.
        return cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)
    # PyMuPDF stores samples in RGB order, so use the RGB (not BGR) code to
    # get the correct luminance weights.
    return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)


def _group_staff_rows(rows, max_gap=10):
    """Group sorted row indices into runs whose neighbours differ by < max_gap.

    Args:
        rows: Ascending row indices.
        max_gap: A row starts a new group when it is at least this many
            pixels below the previous row.

    Returns:
        A list of groups, each a list of row indices in ascending order.
    """
    blocks = []
    for row in rows:
        if blocks and row - blocks[-1][-1] < max_gap:
            blocks[-1].append(row)
        else:
            blocks.append([row])
    return blocks


def verify_pdf_with_ocr(pdf_path):
    """Print a per-page staff-line count and OCR'd measure-number candidates.

    Each page is rendered at 150 dpi. Pixel rows in which more than 40% of
    the pixels are dark are treated as staff-line rows and grouped into
    blocks. A strip just above the first block, limited to the left 15% of
    the page, is upscaled and passed to OCR restricted to digits.

    Args:
        pdf_path: Path of the PDF to open.

    Returns:
        None. All results are written to standard output.
    """
    print(f"Opening PDF for OCR Verification: {pdf_path}")
    # The context manager guarantees the document is closed even when
    # rendering or OCR raises part-way through.
    with fitz.open(pdf_path) as doc:
        print(f"Total Pages: {len(doc)}")

        reader = _get_ocr_reader()

        for page_number, page in enumerate(doc, start=1):
            gray = _pixmap_to_gray(page.get_pixmap(dpi=150))
            h, w = gray.shape

            # Count staff lines: after inverse thresholding, dark pixels are
            # non-zero, so the per-row non-zero count is the dark-pixel count.
            _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
            dark_per_row = np.count_nonzero(thresh, axis=1)
            staff_rows = np.where(dark_per_row > w * 0.4)[0]
            staff_blocks = _group_staff_rows(staff_rows)

            print(f"\n[Page {page_number}] (Shape: {w}x{h})")
            print(f" - Found {len(staff_blocks)} horizontal staff lines/blocks.")

            # Read measure number using OCR from top left; fall back to a
            # fixed offset when no staff rows were found.
            top_y = staff_blocks[0][0] if staff_blocks else 100

            crop_y1 = max(0, top_y - 60)
            crop_y2 = top_y + 10
            crop_x2 = int(w * 0.15)
            crop = gray[crop_y1:crop_y2, :crop_x2]

            if crop.size == 0:
                # Tiny pages can yield an empty strip, which cv2.resize
                # rejects; report it as "nothing detected" instead.
                results = []
            else:
                upscaled = cv2.resize(crop, (0, 0), fx=3.0, fy=3.0, interpolation=cv2.INTER_CUBIC)
                _, upscaled_thresh = cv2.threshold(upscaled, 150, 255, cv2.THRESH_BINARY_INV)
                results = reader.readtext(upscaled_thresh, allowlist='0123456789')

            if results:
                print(f" - OCR Candidate Measure Numbers: {[r[1] for r in results]}")
            else:
                print(" - No Measure Number Detected.")

if __name__ == "__main__":
    import sys

    # Verify the PDF named as the first command-line argument; with no
    # argument, fall back to the original default path.
    verify_pdf_with_ocr(sys.argv[1] if len(sys.argv) > 1 else "output/shintakarajima_perfect.pdf")