71 lines
2.3 KiB
Python
71 lines
2.3 KiB
Python
import fitz
|
|
import cv2
|
|
import numpy as np
|
|
|
|
def _get_ocr_reader():
    """Build and return an EasyOCR reader for English text.

    ``easyocr`` is imported inside the function, so the package is only
    loaded when a reader is actually requested.
    """
    from easyocr import Reader

    languages = ["en"]
    return Reader(languages)
|
|
|
|
def _pixmap_to_gray(pix):
    """Return the pixels of a rendered page as a 2-D ``uint8`` grayscale array.

    The sample buffer is interpreted as ``(height, width, n)``. PyMuPDF
    stores colour samples in RGB(A) order, so the RGB conversion codes are
    used here; the BGR codes would swap the red and blue luminance weights.
    """
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
        pix.height, pix.width, pix.n
    )
    if pix.n == 1:
        # Already single-channel: just drop the trailing axis.
        return img.reshape(pix.height, pix.width)
    if pix.n == 4:
        return cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)
    return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)


def _group_rows(rows, max_gap=10):
    """Split ascending row indices into runs of neighbours < ``max_gap`` apart.

    Returns a list of index arrays, or an empty list when ``rows`` is empty.
    """
    if len(rows) == 0:
        return []
    # A new group starts wherever the distance to the previous row is too big.
    breaks = np.flatnonzero(np.diff(rows) >= max_gap) + 1
    return np.split(rows, breaks)


def verify_pdf_with_ocr(pdf_path):
    """Print a per-page staff-line count and OCR'd measure-number candidates.

    Each page of the PDF is rendered at 150 dpi and converted to grayscale.
    Rows in which more than 40% of the pixels are dark (value <= 200) are
    treated as staff-line rows and grouped into blocks. A strip in the
    left 15% of the page, from 60 px above to 10 px below the first
    staff row (or around y=100 when no staff row is found), is upscaled
    3x, binarised and passed to the OCR reader with a digits-only allowlist.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        None. All findings are written to standard output.
    """
    print(f"Opening PDF for OCR Verification: {pdf_path}")
    # The context manager closes the document even if rendering or OCR fails.
    with fitz.open(pdf_path) as doc:
        print(f"Total Pages: {len(doc)}")

        reader = _get_ocr_reader()

        for page_number, page in enumerate(doc, start=1):
            gray = _pixmap_to_gray(page.get_pixmap(dpi=150))
            h, w = gray.shape

            # Count staff lines: inverted threshold makes dark pixels 255,
            # so each row sum / 255 is the number of dark pixels in that row.
            _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
            row_sums = np.sum(thresh, axis=1) / 255.0
            staff_rows = np.where(row_sums > w * 0.4)[0]

            # Group rows that lie close together into one block.
            staff_blocks = _group_rows(staff_rows)

            print(f"\n[Page {page_number}] (Shape: {w}x{h})")
            print(f" - Found {len(staff_blocks)} horizontal staff lines/blocks.")

            # Read measure number using OCR from top left, just above the
            # first detected staff row (fallback: y=100).
            top_y = int(staff_blocks[0][0]) if staff_blocks else 100

            crop_y1 = max(0, top_y - 60)
            crop_y2 = top_y + 10
            crop_x2 = int(w * 0.15)

            crop = gray[crop_y1:crop_y2, :crop_x2]
            if crop.size == 0:
                # Page too small for the search window; cv2.resize would
                # raise on an empty image, so report "nothing found".
                print(" - No Measure Number Detected.")
                continue

            upscaled = cv2.resize(
                crop, (0, 0), fx=3.0, fy=3.0, interpolation=cv2.INTER_CUBIC
            )
            _, upscaled_thresh = cv2.threshold(
                upscaled, 150, 255, cv2.THRESH_BINARY_INV
            )

            results = reader.readtext(upscaled_thresh, allowlist='0123456789')
            if results:
                # Element 1 of each OCR result is printed as the candidate.
                candidates = [hit[1] for hit in results]
                print(f" - OCR Candidate Measure Numbers: {candidates}")
            else:
                print(" - No Measure Number Detected.")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the OCR verification on a hard-coded PDF path.
    verify_pdf_with_ocr(pdf_path="output/shintakarajima_perfect.pdf")
|