chore(docs): document ScoreExtractor tiling and refactor debug scripts (#563)

This commit is contained in:
2026-03-29 17:57:40 +09:00
parent 39b55f2e9f
commit ac0c098259
698 changed files with 141180 additions and 195 deletions

View File

@@ -0,0 +1,31 @@
import cv2
import numpy as np
def img_to_ascii(image, max_w=120):
    """Print an ASCII-art rendering of an image to stdout.

    Args:
        image: Either a path accepted by ``cv2.imread`` or an already loaded
            image array. 3-D arrays are converted with ``COLOR_BGR2GRAY``;
            2-D arrays are used as-is.
        max_w: Width of the rendered output, in characters.

    Returns:
        None. Nothing is printed when ``image`` is ``None`` or the path
        cannot be read.
    """
    if isinstance(image, str):
        image = cv2.imread(image)
    if image is None:
        return
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) == 3 else image
    h, w = gray.shape
    scale = max_w / w
    # Clamp to one row: a very wide, short input would otherwise request a
    # zero-height output from cv2.resize, which raises.
    out_h = max(1, int(h * scale))
    resized = cv2.resize(gray, (max_w, out_h))
    chars = " .:-=+*#%@"
    # Map every pixel to a ramp index in one vectorised step. astype(int)
    # truncates exactly like int() does for these non-negative values.
    ramp = np.array(list(chars))
    indices = (resized / 255.0 * (len(chars) - 1)).astype(int)
    for row in ramp[indices]:
        print("".join(row))
if __name__ == "__main__":
    # Export the top-left 200x50 px corner of the first panorama chunk.
    src_path = "C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/verify_pano_chunk_00.png"
    dst_path = "C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/verify_first_measure.png"
    img = cv2.imread(src_path)
    if img is None:
        # cv2.imread reports an unreadable file by returning None; slicing
        # None would otherwise fail with an unhelpful TypeError.
        raise SystemExit(f"Could not read {src_path}")
    if not cv2.imwrite(dst_path, img[:50, :200]):
        raise SystemExit(f"Could not write {dst_path}")
    # Only the message naming the file that was actually read is kept; the
    # second message referred to raw_frame_1920.png, which is never opened.
    print("Exported verify_first_measure.png from verify_pano_chunk_00.png")

BIN
scripts/debug/debug_121.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 546 KiB

BIN
scripts/debug/debug_38.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 550 KiB

View File

@@ -0,0 +1,19 @@
# Research Script for Debugging process_pages
import cv2
import pickle
import os
# Folder that receives one PNG per final sheet chunk.
block_dump_dir = r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\debug_blocks"
os.makedirs(block_dump_dir, exist_ok=True)

# Cached pages written by a sibling debug script. pickle.load executes
# arbitrary code, so only ever point this at a locally produced file.
with open('unique_pages.pkl', 'rb') as pkl_file:
    cached_pages = pickle.load(pkl_file)

from score_extractor import ScoreExtractor

sheet_builder = ScoreExtractor()
sheet_builder.process_pages(cached_pages)

# Write every chunk produced by process_pages under a zero-padded name.
for idx, chunk in enumerate(sheet_builder.final_sheet_chunks):
    chunk_path = os.path.join(block_dump_dir, f"block_{idx:02d}.png")
    cv2.imwrite(chunk_path, chunk)
print("Dumped blocks!")

BIN
scripts/debug/debug_c1.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

BIN
scripts/debug/debug_c3.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 59 KiB

View File

@@ -0,0 +1,48 @@
import cv2
import pickle
import traceback
# Load the cached tracker pages, or rebuild (and re-cache) them from the video.
# NOTE: pickle.load executes arbitrary code; only use a locally produced cache.
try:
    with open('unique_pages.pkl', 'rb') as f:
        unique_pages = pickle.load(f)
except Exception:
    # Any failure to load the cache falls back to re-running the tracker on
    # the source video (deliberate best-effort, hence the broad except).
    import video_cv_tracker as tracker_lib
    from youtube_tab_to_pdf import _find_white_tab_strip

    tracker = tracker_lib.TemporalTracker(diff_threshold=0.05)
    video = cv2.VideoCapture("output/shintakarajima.mp4")
    # Sample roughly 4 frames per second and stop after 150 sampled frames.
    frames = []
    fps_orig = video.get(cv2.CAP_PROP_FPS)
    stride = max(1, int(fps_orig / 4.0))
    count = 0
    while len(frames) < 150:
        ret, f = video.read()
        if not ret:
            break
        if count % stride == 0:
            frames.append(f)
        count += 1
    video.release()

    if not frames:
        # frames[0] below would raise IndexError for a missing/unreadable video.
        raise SystemExit("No frames could be read from output/shintakarajima.mp4")

    # Default to the full frame height; narrow to the tab strip found in the
    # first sampled frame (checking every 10th) that has one.
    top, bottom = 0, frames[0].shape[0]
    for f in frames[::10]:
        b = _find_white_tab_strip(f)
        if b:
            top, bottom = b
            break
    for f in frames:
        tracker.process_frame(f[top:bottom, :])
    unique_pages = tracker.get_unique_pages()
    with open('unique_pages.pkl', 'wb') as f:
        pickle.dump(unique_pages, f)

from score_extractor import ScoreExtractor

ex = ScoreExtractor()
try:
    print(f"Running ScoreExtractor on {len(unique_pages)} pages...")
    ex.process_pages(unique_pages)
    print("Success!")
except Exception:
    # Debug harness: report the crash with its traceback instead of aborting.
    print("CRASHED:")
    traceback.print_exc()

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 621 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 609 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

View File

@@ -0,0 +1,61 @@
import cv2
import numpy as np
import easyocr
import os
from pathlib import Path
from youtube_tab_to_pdf import _find_white_tab_strip, _has_tab_content, _extract_print_channel, _detect_measure_bars
def main():
    """Dump and OCR the area above the staff for each measure of the first frame.

    Reads the first frame of the video, crops the tab strip, splits it at the
    detected measure bars and, for every measure at least 30 px wide, saves
    the thresholded region above the first mostly-dark row as
    ``debug_sprite_<i>.png`` and prints the digits EasyOCR reads from it.
    """
    video_path = Path("output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")
    artifact_dir = Path(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6")

    # Only one frame is needed, so the capture is released right after the read.
    cap = cv2.VideoCapture(str(video_path))
    try:
        ret, frame = cap.read()
    finally:
        cap.release()
    if not ret:
        print(f"Could not read a frame from {video_path}")
        return

    strip = _find_white_tab_strip(frame)
    if not strip:
        print("No white tab strip found in the first frame.")
        return
    top, bottom = strip[0], strip[1]
    tab_crop = frame[max(0, top):min(frame.shape[0], bottom), :]

    gray_page = _extract_print_channel(tab_crop)
    bar_coords = _detect_measure_bars(gray_page)
    # Measure boundaries: both image edges plus every detected bar, de-duplicated.
    coords = sorted(set([0] + bar_coords + [tab_crop.shape[1]]))

    reader = easyocr.Reader(['en'], verbose=False)
    for i, (x_start, x_end) in enumerate(zip(coords, coords[1:])):
        if x_end - x_start < 30:
            continue
        m_img = tab_crop[:, x_start:x_end]

        # Extract the number sprite: dark pixels become 255 after inversion.
        gray = cv2.cvtColor(m_img, cv2.COLOR_BGR2GRAY)
        _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
        row_sums = np.sum(thresh, axis=1) / 255
        # Rows that are dark over more than half the measure width.
        staff_lines = np.where(row_sums > m_img.shape[1] * 0.5)[0]
        if len(staff_lines) == 0:
            continue
        y_staff = staff_lines[0]

        # Crop up to 45 px above that row and up to 70 px from the left edge.
        crop_y1 = max(0, y_staff - 45)
        crop_y2 = y_staff
        sprite = thresh[crop_y1:crop_y2, 0:min(70, m_img.shape[1])]
        if sprite.size == 0:
            # The row sits at the very top of the crop: nothing above it to
            # save, and cv2.imwrite / cv2.resize reject empty images.
            continue

        out_file = artifact_dir / f"debug_sprite_{i}.png"
        cv2.imwrite(str(out_file), sprite)

        # OCR on a 4x upscaled sprite, digits only.
        upscaled = cv2.resize(sprite, (0, 0), fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC)
        res = reader.readtext(upscaled, allowlist='0123456789', detail=0)
        print(f"Measure {i}: Found text = {res}")


if __name__ == "__main__":
    main()

View File

View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 323 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 621 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 332 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 556 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 872 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 328 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 630 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 323 B

View File

@@ -0,0 +1,102 @@
import cv2
import numpy as np
import sys
from pathlib import Path
# Load tracker directly to inspect ORB
sys.path.append(str(Path(".").resolve()))
from video_cv_tracker import TemporalTracker
def main():
    """Scan the video for one recovered transition and report ORB overlap votes.

    Starting after frame 500, a fixed strip (rows 111-389) of every frame is
    passed to the tracker's ``_calculate_pixel_shift``. At the first frame
    where a transition is considered recovered, ORB features of the panorama
    tail and of the new strip are matched and the most popular horizontal
    offset (5 px bins) is printed, after which the scan stops.
    """
    print("Testing ORB matcher...")
    # Load test frames from video 1
    cap = cv2.VideoCapture("output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")
    try:
        # Fast forward to transition frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
        succ, _ = cap.read()
        if not succ:
            print("Could not read frame 500; check the video path.")
            return

        tracker = TemporalTracker()
        # Frame 500 was consumed by the probe read above, so the first frame
        # returned inside the loop is number 501.
        frame_idx = 501
        transitions_found = 0
        while True:
            succ, frame = cap.read()
            if not succ:
                break
            if frame_idx % 100 == 0:
                print(f"Reading frame {frame_idx}...", flush=True)

            # We need the strip, like youtube_tab_to_pdf.py does
            strip = frame[111:390]  # Approximate Region
            reference = tracker.last_clean_frame if tracker.last_clean_frame is not None else strip
            dx, conf = tracker._calculate_pixel_shift(reference, strip)

            if tracker.panorama is None:
                # First strip seeds both the panorama and the reference frame.
                tracker.panorama = strip.copy()
                tracker.last_clean_frame = strip.copy()
                frame_idx += 1
                continue

            if (conf < 0.45) or (tracker.last_conf - conf > 0.3):
                tracker.in_transition = True
            elif tracker.in_transition and conf > 0.85 and dx == 0:
                tracker.in_transition = False
                print(f"[{frame_idx}] Transition Recovered! Testing ORB...")

                # RUN ORB on the last <=1500 px of the panorama vs. the new strip.
                search_w = min(1500, tracker.panorama.shape[1])
                search_region = tracker._extract_print_channel(tracker.panorama[:, -search_w:])
                head = tracker._extract_print_channel(strip)
                orb = cv2.ORB_create(1000)
                kp1, des1 = orb.detectAndCompute(search_region, None)
                kp2, des2 = orb.detectAndCompute(head, None)
                print(f" kp1: {len(kp1) if kp1 else 0}, kp2: {len(kp2) if kp2 else 0}")

                if des1 is not None and des2 is not None and len(des1) > 10 and len(des2) > 10:
                    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
                    matches = bf.match(des1, des2)
                    # Keep only matches on (nearly) the same row; each one
                    # votes for a horizontal offset.
                    dx_votes = []
                    for m in matches:
                        x1, y1 = kp1[m.queryIdx].pt
                        x2, y2 = kp2[m.trainIdx].pt
                        if abs(y1 - y2) < 10:
                            dx_votes.append(x1 - x2)
                    if dx_votes:
                        hist, bins = np.histogram(dx_votes, bins=np.arange(min(dx_votes)-5, max(dx_votes)+5, 5))
                        best_bin_idx = np.argmax(hist)
                        print(f" Max Vote Count: {hist[best_bin_idx]} at dx={bins[best_bin_idx]}")
                        if hist[best_bin_idx] < 12:
                            print(" => FAILED! Overlap not found (too few ORB matches). Will append complete new page.")
                        else:
                            print(" => SUCCESS! Overlap found.")
                    else:
                        print(" => FAILED! No dx votes.")
                else:
                    print(" => FAILED! des1 or des2 is None or less than 10!")

                tracker.panorama = np.hstack([tracker.panorama, strip])
                transitions_found += 1
                # One inspected transition is enough for this experiment.
                if transitions_found > 0:
                    break
            elif dx > 0 and not tracker.in_transition:
                # Ordinary scroll: append only the newly revealed right edge.
                tracker.panorama = np.hstack([tracker.panorama, strip[:, strip.shape[1] - int(dx):, :]])

            tracker.last_conf = conf
            tracker.last_clean_frame = strip.copy()
            frame_idx += 1
    finally:
        # Release the capture even when the tracker or OpenCV raises.
        cap.release()


if __name__ == '__main__':
    main()

Binary file not shown.

View File

@@ -0,0 +1,98 @@
import cv2
import numpy as np
import sys
import glob
# Test matching between two chunks to see what the score was!
# Wait, the chunks are the output of the slicing!
# The tracker works on the original FRAMES!
# Let's test the tracker on the original frames!
# I will supply the exact logic used in the tracker.
def test_tracker():
    """Replay the tracker's shift/overlap logic on raw frames and print match scores.

    Every 15th frame is resized to 1280x720. The horizontal shift against the
    previous kept frame is estimated with template matching on channel 0.
    Each time a transition is considered recovered, the head of the new frame
    is matched against the panorama tail and the score is printed. The run
    stops after three such page turns.
    """
    video_file = r"C:\Users\Certes\Desktop\guitar_score\output\サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4"
    cap = cv2.VideoCapture(video_file)
    panorama = None
    last_clean_frame = None
    in_transition = False
    last_conf = 1.0
    count = 0
    saved_matches = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            count += 1
            # Keep every 15th frame (about 2 fps, presumably for a 30 fps source).
            if count % 15 != 0:
                continue
            frame = cv2.resize(frame, (1280, 720))
            if panorama is None:
                panorama = frame.copy()
                last_clean_frame = frame.copy()
                continue

            # calculate shift: find a 30%-wide template taken at 60% of the
            # previous frame inside the current frame.
            prev_chan = last_clean_frame[:, :, 0]
            curr_chan = frame[:, :, 0]
            w = 1280
            template_w = int(w * 0.3)
            start_x = int(w * 0.6)
            template = prev_chan[:, start_x:start_x + template_w]
            res = cv2.matchTemplate(curr_chan, template, cv2.TM_CCOEFF_NORMED)
            _, conf, _, max_loc = cv2.minMaxLoc(res)
            dx = start_x - max_loc[0]
            # Discard weak matches, non-positive shifts and shifts over 15% of the width.
            if conf < 0.15 or dx <= 0:
                dx = 0
            if dx > w * 0.15:
                dx = 0

            if (conf < 0.45) or (last_conf - conf > 0.3):
                in_transition = True
            elif in_transition and conf > 0.85 and dx == 0:
                in_transition = False
                # overlap logic: look for the new page's head (400 px taken
                # 50 px in from the left) inside the panorama tail.
                new_page = frame.copy()
                search_w = min(1500, panorama.shape[1])
                search_region = panorama[:, -search_w:, 0]
                head_w = min(400, new_page.shape[1])
                head = new_page[:, 50:50+head_w, 0]
                res2 = cv2.matchTemplate(search_region, head, cv2.TM_CCOEFF_NORMED)
                _, max_val, _, matched_loc = cv2.minMaxLoc(res2)
                saved_matches.append(max_val)
                print(f"Page turn detected! Overlap match score: {max_val:.4f} at {matched_loc}")
                if max_val > 0.65:
                    overlap_px = search_w - matched_loc[0] + 50
                    # Append only when something new remains after the overlap.
                    if overlap_px < new_page.shape[1] - 50:
                        panorama = np.hstack([panorama, new_page[:, overlap_px:]])
                else:
                    panorama = np.hstack([panorama, new_page])
            elif dx > 0 and dx < w and not in_transition:
                new_strip = frame[:, w - dx:, :]
                panorama = np.hstack([panorama, new_strip])

            last_conf = conf
            last_clean_frame = frame.copy()
            if len(saved_matches) >= 3:
                break
    finally:
        # Release the capture even if OpenCV raises mid-run.
        cap.release()
    print("Test complete.")


if __name__ == "__main__":
    test_tracker()

View File

@@ -0,0 +1,73 @@
import cv2
import numpy as np
from score_extractor import ScoreExtractor
from youtube_tab_to_pdf import extract_unique_scroll, _detect_tab_overlay
# Simplified run script to dump all macro blocks and ignored pages
# Step 1: sample the video at roughly 4 frames per second. Every sampled frame
# is kept in memory.
frames = []
video = cv2.VideoCapture("sakanaction shintakarajima.mp4")
fps_orig = video.get(cv2.CAP_PROP_FPS)
stride = max(1, int(fps_orig / 4.0))
count = 0
while True:
ret, frame = video.read()
if not ret: break
if count % stride == 0:
frames.append(frame)
count += 1
video.release()
from video_cv_tracker import TemporalTracker
from youtube_tab_to_pdf import _find_white_tab_strip
# Step 2: find the tab strip on every 30th sampled frame, crop all frames to
# it (full height when none is found) and feed them to the tracker.
# NOTE(review): frames[0] raises IndexError if the video could not be read.
tracker = TemporalTracker(diff_threshold=0.05)
tab_bounds = None
for f in frames[::30]:
b = _find_white_tab_strip(f)
if b:
tab_bounds = b
break
top, bottom = tab_bounds if tab_bounds else (0, frames[0].shape[0])
for f in frames:
tracker.process_frame(f[top:bottom, :])
unique = tracker.get_unique_pages()
ex = ScoreExtractor()
# Manually process them and print verbose output
# NOTE(review): unique[0] raises IndexError when the tracker returns no pages.
ex.macro_blocks = [unique[0].copy()]
ex.history_pages = [unique[0]]
for i, page in enumerate(unique[1:], 1):
# Match the head of the page (up to 800 px) against the tail of the current
# macro block (up to 1500 px) on grayscale images.
current = ex.macro_blocks[-1]
head_w = min(800, page.shape[1])
search_w = min(1500, current.shape[1])
h_gray = cv2.cvtColor(page[:, :head_w], cv2.COLOR_BGR2GRAY)
s_gray = cv2.cvtColor(current[:, -search_w:], cv2.COLOR_BGR2GRAY)
res = cv2.matchTemplate(s_gray, h_gray, cv2.TM_CCOEFF_NORMED)
_, max_val, _, max_loc = cv2.minMaxLoc(res)
# A score above 0.50 is treated as a continuation and stitched on.
if max_val > 0.50:
print(f"[Page {i}] Stitched! max_val={max_val:.2f}")
absolute_match_x = current.shape[1] - search_w + max_loc[0]
next_start_idx = current.shape[1] - absolute_match_x
if next_start_idx < page.shape[1]:
append_part = page[:, next_start_idx:]
ex.macro_blocks[-1] = np.hstack([ex.macro_blocks[-1], append_part])
ex.history_pages.append(append_part)
else:
# Check repeat
is_repeat = ex._is_historical_repeat(page)
print(f"[Page {i}] Jump! max_val={max_val:.2f}, repeat={is_repeat}")
if is_repeat:
# We will save the rejected page to see if it was 22-29
cv2.imwrite(f"rejected_page_{i}.png", page)
else:
ex.macro_blocks.append(page.copy())
# NOTE(review): indentation is lost in this view, so it is unclear whether the
# next append runs only for non-repeats or for every jump; confirm against the
# original file.
ex.history_pages.append(page)
# Dump the starts of the blocks
for j, b in enumerate(ex.macro_blocks):
cv2.imwrite(f"macro_block_{j}_start.png", b[:, :1800])

View File

@@ -0,0 +1,21 @@
import cv2
import numpy as np
# Grab one raw frame at 30 s and one at 35 s for structural analysis.
cap = cv2.VideoCapture("output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")
# Fall back to 30 fps when the container reports 0.
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
saved_paths = []
for seconds in (30, 35):
    # Skip to the requested timestamp.
    cap.set(cv2.CAP_PROP_POS_FRAMES, seconds * fps)
    ret, raw_frame = cap.read()
    if ret:
        out_path = f"C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/raw_frame_{seconds}s.png"
        cv2.imwrite(out_path, raw_frame)
        saved_paths.append(out_path)
cap.release()
# Only claim success when at least one frame was actually read.
if saved_paths:
    print("Saved raw frames for structural analysis.")
else:
    print("No frames could be read; nothing was saved.")

Binary file not shown.

After

Width:  |  Height:  |  Size: 901 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 608 KiB

View File

View File

@@ -0,0 +1,35 @@
import cv2
import os
import shutil
# Dump one 1280x720 JPEG roughly every 10 seconds of video (31 frames at most).
video_file = r"C:\Users\Certes\Desktop\guitar_score\output\サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4"
debug_dir = r"C:\Users\Certes\Desktop\guitar_score\output\debug_video1"
# Start from an empty output directory on every run.
if os.path.exists(debug_dir):
    shutil.rmtree(debug_dir)
os.makedirs(debug_dir)

cap = cv2.VideoCapture(video_file)
fps_orig = cap.get(cv2.CAP_PROP_FPS)
# The interval must be a positive integer. With a fractional frame rate such
# as 29.97, `count % (fps_orig * 10)` is almost never exactly 0, so only
# frame 0 was saved; a reported rate of 0 raised ZeroDivisionError.
frame_interval = max(1, int(round(fps_orig * 10)))
count = 0
saved = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break
    if count % frame_interval == 0:
        frame = cv2.resize(frame, (1280, 720))
        cv2.imwrite(os.path.join(debug_dir, f"frame_{count:05d}.jpg"), frame)
        saved += 1
        if saved > 30:
            break
    count += 1
cap.release()
print(f"Extraction complete. {saved} frames saved.")

View File

@@ -0,0 +1,33 @@
"""원본 프레임 덤프 — 각 영상에서 5개 프레임을 랜덤 추출"""
import sys
if sys.platform == "win32":
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
import cv2
import numpy as np
from pathlib import Path
output = Path("output")
dump_dir = output / "raw_dump"
# parents=True so the script does not fail when output/ does not exist yet.
dump_dir.mkdir(parents=True, exist_ok=True)
mp4s = sorted(output.glob("*.mp4"))
for vi, mp4 in enumerate(mp4s):
    cap = cv2.VideoCapture(str(mp4))
    try:
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        print(f"Video {vi+1}: {mp4.name[:30]}... ({w}x{h}, {fps:.0f}fps, {total} frames)")
        if total <= 0:
            # Without a frame count every sample index would collapse to 0.
            print("  no frames reported; skipping")
            continue
        # Five frames at even intervals between 10% and 90% of the video.
        indices = np.linspace(total * 0.1, total * 0.9, 5, dtype=int)
        for i, idx in enumerate(indices):
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if ret:
                path = dump_dir / f"v{vi+1}_raw_{i}.png"
                cv2.imwrite(str(path), frame)
                # Separator added: index and file name used to run together.
                print(f" frame {idx} -> {path.name} ({frame.shape})")
    finally:
        # Always release the capture, including on the early `continue`.
        cap.release()
print(f"\n덤프 완료: {dump_dir}")

View File

@@ -0,0 +1,25 @@
import cv2
import pickle
import os
# NOTE: pickle.load executes arbitrary code; only use a locally produced cache.
with open('unique_pages.pkl', 'rb') as f:
    unique_pages = pickle.load(f)

# Save the pages around the jump cuts to see what happened near measures 21
# and 45, so they can be reviewed visually.
out_dir = r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6"
# cv2.imwrite returns False without raising when the folder is missing, so
# make sure it exists before dumping.
os.makedirs(out_dir, exist_ok=True)

# Pages known from verify_log.txt to cause issues:
#   pages 18-24 (around the measure 21 problem)
#   pages 40-50 (around the measure 45 problem)
# The two index windows dumped below are 16-25 and 43-52.
for start, stop in ((16, 26), (43, 53)):
    # Clamp to the pages that actually exist.
    for i in range(start, min(stop, len(unique_pages))):
        cv2.imwrite(os.path.join(out_dir, f"jump_cut_inspection_page_{i}.png"), unique_pages[i])
print("Dumped inspection frames to Artifact Directory.")

View File

@@ -0,0 +1,68 @@
import cv2
import pickle
from pathlib import Path
# TemporalTracker already saved the video chunks? No.
# I will use fast_verify.py's frames but run process_pages directly and print all its output.
import fast_verify
from youtube_tab_to_pdf import extract_unique_scroll
# Actually, I will just write a wrapper around ScoreExtractor to print to file
import sys
def main():
    """Run ScoreExtractor on tracker pages while teeing every print to score_log.txt.

    Frames are resized to 1280 px wide and cropped to the tab strip detected
    on frame 500 (full height when none is found). Every 4th frame is fed to
    ``TemporalTracker``. While ``ScoreExtractor.process_pages`` runs,
    ``builtins.print`` is temporarily replaced so its output also lands in
    the log file.
    """
    import builtins

    from score_extractor import ScoreExtractor
    from video_cv_tracker import TemporalTracker
    from youtube_tab_to_pdf import _find_white_tab_strip

    cap = cv2.VideoCapture("output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")
    try:
        # Quick dynamic crop
        ret, initial = cap.read()
        if not ret:
            print("Could not read the first frame; check the video path.")
            return
        scale = 1280 / initial.shape[1]
        crop_top, crop_bottom = 0, int(initial.shape[0] * scale)
        cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
        ret, check_frame = cap.read()
        if ret:
            resized_check = cv2.resize(check_frame, (1280, int(check_frame.shape[0] * scale)))
            bounds = _find_white_tab_strip(resized_check)
            if bounds:
                crop_top, crop_bottom = bounds

        # We don't want to load ALL 15000 frames into memory. Use TemporalTracker directly!
        tracker = TemporalTracker(diff_threshold=0.05)
        count = 0
        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if count % 4 == 0:
                resized = cv2.resize(frame, (1280, int(frame.shape[0] * scale)))
                tracker.process_frame(resized[crop_top:crop_bottom, :])
            count += 1
    finally:
        cap.release()

    unique_pages = tracker.get_unique_pages()
    print(f"Got {len(unique_pages)} unique pages from tracker.")
    extractor = ScoreExtractor()

    # Hook print: everything process_pages prints is also written to the log.
    original_print = print
    # utf-8 so non-ASCII log lines do not fail on a legacy default code page.
    with open("score_log.txt", "w", encoding="utf-8") as f:
        def my_print(*args, **kwargs):
            text = " ".join(map(str, args))
            f.write(text + "\n")
            original_print(*args, **kwargs)

        builtins.print = my_print
        try:
            extractor.process_pages(unique_pages)
        finally:
            # Restore the real print even when process_pages raises; otherwise
            # later prints would write to a closed file.
            builtins.print = original_print


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,14 @@
import cv2
import pickle
# Load the cached tracker pages. pickle.load executes arbitrary code, so only
# use a locally produced cache.
with open('unique_pages.pkl', 'rb') as pkl_file:
    unique_pages = pickle.load(pkl_file)

import os

# One PNG per page, named page_000.png, page_001.png, ...
out_dir = r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\pages"
os.makedirs(out_dir, exist_ok=True)
for page_no, page_img in enumerate(unique_pages):
    page_path = os.path.join(out_dir, f"page_{page_no:03d}.png")
    cv2.imwrite(page_path, page_img)
print(f"Saved {len(unique_pages)} pages to {out_dir}")

View File

@@ -0,0 +1,21 @@
import cv2
import os
img = cv2.imread(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\final_check_100_sec.png")
if img is None:
    print("Image not found!")
    exit(1)

out_dir = r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\slices"
os.makedirs(out_dir, exist_ok=True)

# The final check image is a single very long row (panorama); cut it into
# 2000 px wide pieces. NumPy clamps the right edge of the last slice.
pano_w = img.shape[1]
slice_starts = range(0, pano_w, 2000)
for idx, left in enumerate(slice_starts):
    slice_path = os.path.join(out_dir, f"pano_slice_{idx:02d}.png")
    cv2.imwrite(slice_path, img[:, left:left + 2000])
print(f"Generated {len(slice_starts)} slices.")

View File

@@ -0,0 +1,64 @@
import cv2
import numpy as np
import os
from glob import glob
# Use the first .mp4 that glob returns from output/.
_mp4_candidates = glob('output/*.mp4')
if not _mp4_candidates:
    # Fail with a clear message instead of a bare IndexError.
    raise SystemExit("No .mp4 files found under output/")
video_path = _mp4_candidates[0]
cap = cv2.VideoCapture(video_path)
def _find_white_tab_strip(frame):
h, w = frame.shape[:2]
gray = np.max(frame, axis=2)
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
row_white_counts = np.sum(binary > 0, axis=1)
threshold = w * 0.1
white_rows = np.where(row_white_counts > threshold)[0]
if len(white_rows) < 5: return None
return white_rows[0], white_rows[-1]
def get_number_sprite(m_img):
    """Crop the thresholded region just above the first mostly-white row.

    The measure image is reduced to its brightest channel and thresholded at
    200. The first row whose white-pixel count exceeds half the image width is
    used as the reference row (row 50 when no row qualifies).

    Args:
        m_img: Colour image of one measure, indexed ``[row, col, channel]``.

    Returns:
        The binary crop from 35 px to 2 px above the reference row and at most
        60 px wide, or ``None`` when that window is empty.
    """
    brightest = np.max(m_img, axis=2)
    _, mask = cv2.threshold(brightest, 200, 255, cv2.THRESH_BINARY)
    white_per_row = np.sum(mask, axis=1) / 255
    half_width = m_img.shape[1] * 0.5
    candidate_rows = np.where(white_per_row > half_width)[0]
    if len(candidate_rows) > 0:
        y_staff = candidate_rows[0]
    else:
        y_staff = 50
    top = max(0, y_staff - 35)
    bottom = max(0, y_staff - 2)
    right = min(60, m_img.shape[1])
    if bottom <= top or right <= 0:
        return None
    return mask[top:bottom, 0:right]
def _dump_first_measure_sprite(frame, frame_no):
    """Write the sprite of the frame's first measure; return True if one was dumped."""
    bounds = _find_white_tab_strip(frame)
    if not bounds:
        return False
    tab_crop = frame[max(0, bounds[0]):min(frame.shape[0], bounds[1]), :]
    # Inline stand-in for _detect_measure_bars: columns that are brighter than
    # 180 over more than 80% of the crop height.
    brightest = np.max(tab_crop, axis=2)
    _, col_mask = cv2.threshold(brightest, 180, 255, cv2.THRESH_BINARY)
    white_per_col = np.sum(col_mask, axis=0) / 255
    bar_cols = np.where(white_per_col > tab_crop.shape[0] * 0.8)[0]
    if len(bar_cols) <= 1:
        return False
    left, right = bar_cols[0], bar_cols[1]
    # The first two bar columns must be more than 40 px apart.
    if right - left <= 40:
        return False
    sprite = get_number_sprite(tab_crop[:, left:right])
    if sprite is None:
        return False
    pixels = np.count_nonzero(sprite > 127)
    cv2.imwrite(f"C:/Users/Certes/Desktop/guitar_score/debug_s_{frame_no}_{pixels}.png", sprite)
    print(f"Dumped sprite frame {frame_no} with {pixels} pixels")
    return True


# Inspect every 30th frame and stop once 16 sprites have been dumped.
frame_count = 0
found = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break
    if frame_count % 30 == 0 and _dump_first_measure_sprite(frame, frame_count):
        found += 1
        if found > 15:
            break
    frame_count += 1
cap.release()

View File

@@ -0,0 +1,78 @@
import cv2
from video_cv_tracker import TemporalTracker
from youtube_tab_to_pdf import extract_unique_scroll, generate_long_image, generate_pdf, download_video, extract_frames
import sys
import os
from pathlib import Path
# Run verification specifically on Shintakarajima
# Run verification specifically on Shintakarajima
video_path = Path("output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")
print("Extracting full video for final 142-measure verification...")
cap = cv2.VideoCapture(str(video_path))

# PRE-CALCULATE Dynamic Crop
# Just like extract_unique_scroll does automatically, we detect the white band.
ret, initial = cap.read()
if not ret:
    # Without a first frame there is no scale to compute; fail clearly.
    cap.release()
    raise SystemExit(f"Could not read the first frame of {video_path}")
scale = 1280 / initial.shape[1]
resized_init = cv2.resize(initial, (1280, int(initial.shape[0] * scale)))
from youtube_tab_to_pdf import _find_white_tab_strip
crop_top = 0
crop_bottom = resized_init.shape[0]
cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
ret, check_frame = cap.read()
if ret:
    resized_check = cv2.resize(check_frame, (1280, int(check_frame.shape[0] * scale)))
    bounds = _find_white_tab_strip(resized_check)
    if bounds:
        crop_top, crop_bottom = bounds
# Preserve D.S. al Coda, ┌─ 1., ┌─ 2., and measure numbers drawn in the black abyss!
# (A no-op when no strip was found, because crop_top is still 0.)
crop_top = max(0, crop_top - 60)
print(f"Dynamically Cropping to: Y={crop_top} to {crop_bottom}")

# Re-read the whole video; every resized, cropped frame is kept in memory.
cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
frames = []
while True:
    ret, frame = cap.read()
    if not ret:
        break
    frame_resized = cv2.resize(frame, (1280, int(frame.shape[0] * scale)))
    clean_ribbon = frame_resized[crop_top:crop_bottom, :]
    frames.append(clean_ribbon)
cap.release()

# Sanity-check image; skipped for clips shorter than 31 frames, where
# frames[30] would raise IndexError.
if len(frames) > 30:
    cv2.imwrite("C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/raw_frame_check.png", frames[30])
print(f"Extracted {len(frames)} frames. Running sequential page extraction...")

try:
    final_chunks = extract_unique_scroll(frames)
    print("DEBUG: final_chunks len =", len(final_chunks))
    if final_chunks:
        print("DEBUG: final_chunks[0].shape =", final_chunks[0].shape)
        cv2.imwrite("C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/debug_chunk_0.png", final_chunks[0])
        # Save the chunks to the artifact directory so they can be inspected.
        artifact_path = Path(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6")
        output_png = artifact_path / "final_check_100_sec.png"
        generate_long_image(final_chunks, output_png)
        print(f"Saved successful verification image to: {output_png}")
        generate_pdf(final_chunks, Path("output/shintakarajima_perfect.pdf"))
        print("✨ Successfully generated output/shintakarajima_perfect.pdf ✨")
    else:
        # Nothing to render: do not call the generators or claim success.
        print("Failed to produce rows.")
except Exception:
    # Debug harness: print the traceback instead of aborting.
    import traceback
    traceback.print_exc()

View File

@@ -0,0 +1,39 @@
import cv2
import numpy as np
img = cv2.imread(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\raw_super_block.png")
if img is None:
    print("Image not found")
    exit()

# Dark pixels (gray <= 200) become 255 in the inverted mask.
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
_, bin_inv = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)

# The staff occupies rows 76..151. A vertical bar line shows up as a column
# that is dark over nearly that whole band, so count dark pixels per column
# and keep the columns with more than 70 of the 76 rows dark.
staff_band = bin_inv[76:152, :]
col_sums = np.sum(staff_band, axis=0) / 255.0
bar_xs = np.where(col_sums > 70)[0]

# Merge columns that are at most 5 px from their left neighbour into one bar,
# represented by the truncated mean x of the run.
grouped_bars = []
if len(bar_xs) > 0:
    run_breaks = np.where(np.diff(bar_xs) > 5)[0] + 1
    grouped_bars = [int(np.mean(run)) for run in np.split(bar_xs, run_breaks)]

print(f"Found {len(grouped_bars)} vertical barlines:")
print(grouped_bars)

# Draw every detected bar as a red, 2 px wide, full-height line.
out = img.copy()
full_height = out.shape[0]
for bar_x in grouped_bars:
    cv2.line(out, (bar_x, 0), (bar_x, full_height), (0, 0, 255), 2)
cv2.imwrite(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\test_barlines.png", out)

Binary file not shown.

After

Width:  |  Height:  |  Size: 10 KiB

View File

@@ -0,0 +1,139 @@
import re
# Read the current source of youtube_tab_to_pdf.py; it is rewritten in place
# further down in this script.
with open('youtube_tab_to_pdf.py', 'r', encoding='utf-8') as f:
code = f.read()
# Replacement source text for extract_unique_scroll. This literal is runtime
# data written into youtube_tab_to_pdf.py, so its text must not be edited here.
# NOTE(review): the literal appears without indentation in this view; confirm
# the indentation inside the string against the original file before running.
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
print(f"[4/5] 순차 1FPS 타임라인 기반 마디 추출 중...")
strip_tops, strip_bottoms = [], []
for frame in frames[:50]:
strip = _find_white_tab_strip(frame)
if strip:
strip_tops.append(strip[0])
strip_bottoms.append(strip[1])
if not strip_tops: return []
median_top = int(np.median(strip_tops))
median_bottom = int(np.median(strip_bottoms))
unique_measures = []
chunk_width = 1280
def get_clean_binary(img):
gray = np.max(img, axis=2)
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
return binary
for frame_idx, frame in enumerate(frames):
h = frame.shape[0]
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
if not _has_tab_content(tab_crop):
continue
gray_page = _extract_print_channel(tab_crop)
bar_coords = _detect_measure_bars(gray_page)
if not bar_coords:
continue
coords = [0] + bar_coords + [tab_crop.shape[1]]
coords = sorted(list(set(coords)))
page_measures = []
for i in range(len(coords) - 1):
x_start = coords[i]
x_end = coords[i+1]
if x_end - x_start < 40: continue
page_measures.append(tab_crop[:, x_start:x_end])
if not page_measures:
continue
if not unique_measures:
unique_measures.extend(page_measures)
continue
first_m = page_measures[0]
bin_first = get_clean_binary(first_m)
best_error = 1.0
best_offset = 0
anchored = False
for scan_dist in range(1, min(10, len(unique_measures) + 1)):
past_idx = len(unique_measures) - scan_dist
past_m = unique_measures[past_idx]
bin_past = get_clean_binary(past_m)
if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
hs = min(bin_first.shape[0], bin_past.shape[0])
ws = min(bin_first.shape[1], bin_past.shape[1])
s1 = bin_first[:hs, :ws]
s2 = bin_past[:hs, :ws]
diff = cv2.absdiff(s1, s2)
error_ratio = np.sum(diff > 0) / s1.size
if error_ratio < best_error:
best_error = error_ratio
best_offset = len(unique_measures) - past_idx
# Error ratio < 20% confirms identity for sparse structures
if best_error < 0.20:
new_start_offset = best_offset
anchored = True
print(f" [Anchor] Frame {frame_idx} -> PDF offset {best_offset} (Best Error: {best_error:.4f})")
else:
print(f" [New] Frame {frame_idx} -> No Match (Best Error was {best_error:.4f})")
if anchored and new_start_offset < len(page_measures):
unique_measures.extend(page_measures[new_start_offset:])
elif not anchored:
unique_measures.extend(page_measures)
print(f" -> 동기화 중복 제거 완료: 무손실 타임라인 기반 {len(unique_measures)}개 연속 마디 보존")
final_chunks = []
current_row_measures = []
current_row_width = 0
for measure_img in unique_measures:
measure_w = measure_img.shape[1]
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
row_img = np.hstack(current_row_measures)
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
current_row_measures = [measure_img]
current_row_width = measure_w
else:
current_row_measures.append(measure_img)
current_row_width += measure_w
if current_row_measures:
row_img = np.hstack(current_row_measures)
if row_img.shape[1] > chunk_width:
row_img = row_img[:, :chunk_width]
else:
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
return final_chunks
"""
# Match the existing extract_unique_scroll definition, from its signature
# through its `return final_chunks` line (non-greedy, DOTALL).
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'
# A callable replacement inserts new_func verbatim; a plain string replacement
# would make re.sub interpret backslash escapes and group references in it.
new_code, n_subs = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if n_subs == 0:
    # Do not rewrite the file or report success when nothing matched.
    raise SystemExit("extract_unique_scroll not found in youtube_tab_to_pdf.py; nothing patched.")
with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Patched.")

View File

@@ -0,0 +1,182 @@
import re
with open('youtube_tab_to_pdf.py', 'r', encoding='utf-8') as f:
code = f.read()
# Replacement source for extract_unique_scroll ("Number Sprite" anchor variant),
# kept as a string so it can be spliced into youtube_tab_to_pdf.py by regex.
# The embedded function:
#   * takes the median white-strip top/bottom over the first 50 frames and crops
#     every frame to that band;
#   * cuts each crop at the detected bar x-coordinates, dropping slices narrower
#     than 40 px;
#   * compares the binarized "number sprite" (a region up to 60 px wide just
#     above the first detected staff row) of the page's first measure with up to
#     the 14 most recent kept measures, anchoring when the mismatch ratio is
#     below 0.15;
#   * otherwise falls back to a whole-measure binary diff against up to the 4
#     most recent measures whose width is within 25 px, with the same 0.15 limit;
#   * when anchored, appends only page_measures[offset:]; when not, appends all;
#   * finally packs the kept measures into rows of chunk_width (1280) px, padding
#     short rows with white.
# NOTE(review): leading whitespace of the embedded source is not visible in this
# view, so the nesting described above is inferred from statement order — confirm
# against the real file.
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
print(f"[4/5] 순차 Number Sprite 앵커 기반 마디 추출 중...")
strip_tops, strip_bottoms = [], []
for frame in frames[:50]:
strip = _find_white_tab_strip(frame)
if strip:
strip_tops.append(strip[0])
strip_bottoms.append(strip[1])
if not strip_tops: return []
median_top = int(np.median(strip_tops))
median_bottom = int(np.median(strip_bottoms))
unique_measures = []
chunk_width = 1280
def get_clean_binary(img):
gray = np.max(img, axis=2)
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
return binary
def get_number_sprite(m_img):
# We find the top-left region where the number is displayed
gray = np.max(m_img, axis=2)
_, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
row_sums = np.sum(thresh, axis=1) / 255
staff_lines = np.where(row_sums > m_img.shape[1] * 0.5)[0]
y_staff = staff_lines[0] if len(staff_lines) > 0 else 50
crop_y1 = max(0, y_staff - 35)
crop_y2 = max(0, y_staff - 2)
crop_x1 = 0
crop_x2 = min(60, m_img.shape[1])
if crop_y2 <= crop_y1 or crop_x2 <= crop_x1: return None
sprite = thresh[crop_y1:crop_y2, crop_x1:crop_x2]
# if there are no white pixels, it's a blank space, not a number
if np.count_nonzero(sprite > 127) < 5: return None
return sprite
for frame_idx, frame in enumerate(frames):
h = frame.shape[0]
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
if not _has_tab_content(tab_crop):
continue
gray_page = _extract_print_channel(tab_crop)
bar_coords = _detect_measure_bars(gray_page)
if not bar_coords: continue
coords = [0] + bar_coords + [tab_crop.shape[1]]
coords = sorted(list(set(coords)))
page_measures = []
for i in range(len(coords) - 1):
x_start = coords[i]
x_end = coords[i+1]
if x_end - x_start < 40: continue
page_measures.append(tab_crop[:, x_start:x_end])
if not page_measures: continue
if not unique_measures:
unique_measures.extend(page_measures)
continue
first_m = page_measures[0]
first_sprite = get_number_sprite(first_m)
best_error = 1.0
best_offset = 0
anchored = False
# Only anchor if we explicitly see a printed number in the top left
if first_sprite is not None:
# We can scan further back safely because different numbers won't mathematically match
for scan_dist in range(1, min(15, len(unique_measures) + 1)):
past_idx = len(unique_measures) - scan_dist
past_m = unique_measures[past_idx]
past_sprite = get_number_sprite(past_m)
if past_sprite is not None:
hs = min(first_sprite.shape[0], past_sprite.shape[0])
ws = min(first_sprite.shape[1], past_sprite.shape[1])
s1 = first_sprite[:hs, :ws]
s2 = past_sprite[:hs, :ws]
diff = cv2.absdiff(s1, s2)
error_ratio = np.sum(diff > 0) / s1.size
if error_ratio < best_error:
best_error = error_ratio
best_offset = len(unique_measures) - past_idx
# If the literal printed number matches perfectly, we securely anchor Here!
if best_error < 0.15:
new_start_offset = best_offset
anchored = True
# Fallback for pages entirely devoid of explicit numbering
if not anchored:
bin_first = get_clean_binary(first_m)
for scan_dist in range(1, min(5, len(unique_measures) + 1)):
past_idx = len(unique_measures) - scan_dist
past_m = unique_measures[past_idx]
bin_past = get_clean_binary(past_m)
if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
hs = min(bin_first.shape[0], bin_past.shape[0])
ws = min(bin_first.shape[1], bin_past.shape[1])
s1 = bin_first[:hs, :ws]
s2 = bin_past[:hs, :ws]
diff = cv2.absdiff(s1, s2)
error_ratio = np.sum(diff > 0) / s1.size
if error_ratio < best_error:
best_error = error_ratio
best_offset = len(unique_measures) - past_idx
if best_error < 0.15:
new_start_offset = best_offset
anchored = True
if anchored and new_start_offset < len(page_measures):
# Middle append
unique_measures.extend(page_measures[new_start_offset:])
elif not anchored:
unique_measures.extend(page_measures)
print(f" -> 동기화 중복 제거 완료: Number Sprite 타임라인 기반 {len(unique_measures)}개 마디 보존")
final_chunks = []
current_row_measures = []
current_row_width = 0
for measure_img in unique_measures:
measure_w = measure_img.shape[1]
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
row_img = np.hstack(current_row_measures)
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
current_row_measures = [measure_img]
current_row_width = measure_w
else:
current_row_measures.append(measure_img)
current_row_width += measure_w
if current_row_measures:
row_img = np.hstack(current_row_measures)
if row_img.shape[1] > chunk_width:
row_img = row_img[:, :chunk_width]
else:
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
return final_chunks
"""
# Regex covering the current extract_unique_scroll definition: from its exact
# signature up to the first following "return final_chunks" (non-greedy, DOTALL).
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'
# A callable replacement inserts new_func verbatim. A plain string would be
# parsed as a replacement template, so any backslash in the new source would be
# treated as an escape or group reference.
new_code, replaced = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if not replaced:
    # Fail loudly and leave the file untouched instead of reporting a patch
    # that never happened.
    raise SystemExit("extract_unique_scroll not found in youtube_tab_to_pdf.py; nothing patched.")
with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Patched.")

View File

@@ -0,0 +1,145 @@
import re
# Load the full current text of the module this script rewrites in place.
with open('youtube_tab_to_pdf.py', mode='r', encoding='utf-8') as source_file:
    code = source_file.read()
# Replacement source for extract_unique_scroll (TemporalTracker page variant),
# kept as a string so it can be spliced into youtube_tab_to_pdf.py by regex.
# The embedded function:
#   * takes the median white-strip top/bottom over the first 50 frames, crops
#     every frame to that band and feeds the crops that pass _has_tab_content to
#     TemporalTracker(diff_threshold=0.05), then works on
#     tracker.get_unique_pages() (TemporalTracker is defined elsewhere;
#     presumably it keeps one still per page — verify);
#   * cuts each page at the detected bar x-coordinates, dropping slices narrower
#     than 40 px;
#   * compares the 200-threshold binary of the page's first measure with up to
#     the 9 most recent kept measures whose width is within 25 px, anchoring when
#     the best mismatch ratio is below 0.20;
#   * when anchored, appends only page_measures[offset:]; when not, appends all;
#   * finally packs the kept measures into rows of chunk_width (1280) px, padding
#     short rows with white.
# NOTE(review): leading whitespace of the embedded source is not visible in this
# view, so the nesting described above is inferred from statement order — confirm
# against the real file.
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
print(f"[4/5] 순차 페이지 분할 기반 추출 중...")
strip_tops, strip_bottoms = [], []
for frame in frames[:50]:
strip = _find_white_tab_strip(frame)
if strip:
strip_tops.append(strip[0])
strip_bottoms.append(strip[1])
if not strip_tops: return []
median_top = int(np.median(strip_tops))
median_bottom = int(np.median(strip_bottoms))
# 5% 픽셀 변화를 통해 페이지가 넘어가는 장면(Scene)만 정지 화면으로 추출 (모션 블러 프레임 제거)
tracker = TemporalTracker(diff_threshold=0.05)
for frame in frames:
h = frame.shape[0]
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
if not _has_tab_content(tab_crop):
continue
tracker.process_frame(tab_crop)
unique_pages = tracker.get_unique_pages()
if not unique_pages: return []
unique_measures = []
chunk_width = 1280
def get_clean_binary(img):
gray = np.max(img, axis=2)
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
return binary
for page_idx, page in enumerate(unique_pages):
gray_page = _extract_print_channel(page)
bar_coords = _detect_measure_bars(gray_page)
if not bar_coords: continue
coords = [0] + bar_coords + [page.shape[1]]
coords = sorted(list(set(coords)))
page_measures = []
for i in range(len(coords) - 1):
x_start = coords[i]
x_end = coords[i+1]
if x_end - x_start < 40: continue
page_measures.append(page[:, x_start:x_end])
if not page_measures: continue
if not unique_measures:
unique_measures.extend(page_measures)
continue
first_m = page_measures[0]
bin_first = get_clean_binary(first_m)
best_error = 1.0
best_offset = 0
anchored = False
for scan_dist in range(1, min(10, len(unique_measures) + 1)):
past_idx = len(unique_measures) - scan_dist
past_m = unique_measures[past_idx]
bin_past = get_clean_binary(past_m)
if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
hs = min(bin_first.shape[0], bin_past.shape[0])
ws = min(bin_first.shape[1], bin_past.shape[1])
s1 = bin_first[:hs, :ws]
s2 = bin_past[:hs, :ws]
diff = cv2.absdiff(s1, s2)
error_ratio = np.sum(diff > 0) / s1.size
if error_ratio < best_error:
best_error = error_ratio
best_offset = len(unique_measures) - past_idx
if best_error < 0.20:
new_start_offset = best_offset
anchored = True
print(f" [Anchor] Page {page_idx} -> PDF offset {best_offset} (Best Error: {best_error:.4f})")
else:
print(f" [New Page] Page {page_idx} -> No Overlap (Best Error: {best_error:.4f})")
if anchored and new_start_offset < len(page_measures):
# 중복된 오프셋만큼 건너뛰고 나머지 새 마디만 추가
unique_measures.extend(page_measures[new_start_offset:])
elif not anchored:
# 겹침이 전혀 없으므로 전체 마디 추가
unique_measures.extend(page_measures)
print(f" -> 동기화 중복 제거 완료: 무손실 정적 페이지 기반 {len(unique_measures)}개 연속 마디 보존")
final_chunks = []
current_row_measures = []
current_row_width = 0
for measure_img in unique_measures:
measure_w = measure_img.shape[1]
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
row_img = np.hstack(current_row_measures)
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
current_row_measures = [measure_img]
current_row_width = measure_w
else:
current_row_measures.append(measure_img)
current_row_width += measure_w
if current_row_measures:
row_img = np.hstack(current_row_measures)
if row_img.shape[1] > chunk_width:
row_img = row_img[:, :chunk_width]
else:
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
return final_chunks
"""
# Regex covering the current extract_unique_scroll definition: from its exact
# signature up to the first following "return final_chunks" (non-greedy, DOTALL).
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'
# A callable replacement inserts new_func verbatim. A plain string would be
# parsed as a replacement template, so any backslash in the new source would be
# treated as an escape or group reference.
new_code, replaced = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if not replaced:
    # Fail loudly and leave the file untouched instead of reporting a patch
    # that never happened.
    raise SystemExit("extract_unique_scroll not found in youtube_tab_to_pdf.py; nothing patched.")
with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Patched completely back to optimal tracking.")

View File

@@ -0,0 +1,168 @@
import re
# Load the full current text of the module this script rewrites in place.
with open('youtube_tab_to_pdf.py', mode='r', encoding='utf-8') as source_file:
    code = source_file.read()
# Replacement source for extract_unique_scroll (binarized tracker variant),
# kept as a string so it can be spliced into youtube_tab_to_pdf.py by regex.
# The embedded function:
#   * takes the median white-strip top/bottom over the first 50 frames and crops
#     every frame to that band;
#   * binarizes each crop (channel max, threshold 200) and keeps the BGR crop as
#     a new page when the share of differing pixels against tracker.last_frame
#     exceeds tracker.diff_threshold (constructed as 0.015), or when no frame
#     has been stored yet;
#   * cuts each page at the detected bar x-coordinates, dropping slices narrower
#     than 40 px;
#   * compares the binary of the page's first measure with up to the 3 most
#     recent kept measures whose width is within 25 px, anchoring when the best
#     mismatch ratio is below 0.20;
#   * when anchored, appends only page_measures[offset:]; when not, appends all;
#   * finally packs the kept measures into rows of chunk_width (1280) px, padding
#     short rows with white.
# NOTE(review): `diff = 0.0` in the frame loop is never read before `diff` is
# rebound in the measure comparison — looks like a leftover.
# NOTE(review): the code reads and writes TemporalTracker attributes
# (last_frame, unique_pages, diff_threshold) directly instead of calling
# process_frame — confirm those attributes exist with these names.
# NOTE(review): leading whitespace of the embedded source is not visible in this
# view, so the nesting described above is inferred from statement order — confirm
# against the real file.
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
print(f"[4/5] 순차 Binarized-Tracker 정밀 추출 중...")
strip_tops, strip_bottoms = [], []
for frame in frames[:50]:
strip = _find_white_tab_strip(frame)
if strip:
strip_tops.append(strip[0])
strip_bottoms.append(strip[1])
if not strip_tops: return []
median_top = int(np.median(strip_tops))
median_bottom = int(np.median(strip_bottoms))
def get_clean_binary(img):
gray = np.max(img, axis=2)
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
return binary
# The Holy Grail Tracker: Feed it ONLY the pure 200-threshold binary mask.
# The hand is gone. Only the white staff lines and notes exist.
# When the page flips, the notes change position, creating a very small but undeniable structural pixel diff.
# We use a highly sensitive 0.015 (1.5%) threshold to perfectly catch thin notes transitioning!
tracker = TemporalTracker(diff_threshold=0.015)
# Store associations so we can retrieve the original BGR page later
clean_to_bgr = []
for frame in frames:
h = frame.shape[0]
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
if not _has_tab_content(tab_crop):
continue
clean_bin = get_clean_binary(tab_crop)
# tracker will process the pure binary structural image
diff = 0.0
if tracker.last_frame is not None:
raw_diff = cv2.absdiff(clean_bin, tracker.last_frame)
non_zero_ratio = np.count_nonzero(raw_diff) / clean_bin.size
if non_zero_ratio > tracker.diff_threshold:
tracker.unique_pages.append(clean_bin)
clean_to_bgr.append(tab_crop)
tracker.last_frame = clean_bin.copy()
else:
tracker.unique_pages.append(clean_bin)
clean_to_bgr.append(tab_crop)
tracker.last_frame = clean_bin.copy()
unique_pages = clean_to_bgr
if not unique_pages: return []
print(f" -> {len(unique_pages)}개의 고유 정적 페이지 캡처 완료. 3-마디 역탐색 동기화 시작...")
unique_measures = []
chunk_width = 1280
for page_idx, page in enumerate(unique_pages):
gray_page = _extract_print_channel(page)
bar_coords = _detect_measure_bars(gray_page)
if not bar_coords: continue
coords = [0] + bar_coords + [page.shape[1]]
coords = sorted(list(set(coords)))
page_measures = []
for i in range(len(coords) - 1):
x_start = coords[i]
x_end = coords[i+1]
if x_end - x_start < 40: continue
page_measures.append(page[:, x_start:x_end])
if not page_measures: continue
if not unique_measures:
unique_measures.extend(page_measures)
continue
first_m = page_measures[0]
bin_first = get_clean_binary(first_m)
best_error = 1.0
best_offset = 0
anchored = False
# We limit the search distance to EXACTLY 3 measures.
# This completely cures Time-Traveling overlaps caused by M10 matching identical M2.
# A page flip overlap can NEVER be further back than the immediately previous page's length.
for scan_dist in range(1, min(4, len(unique_measures) + 1)):
past_idx = len(unique_measures) - scan_dist
past_m = unique_measures[past_idx]
bin_past = get_clean_binary(past_m)
if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
hs = min(bin_first.shape[0], bin_past.shape[0])
ws = min(bin_first.shape[1], bin_past.shape[1])
s1 = bin_first[:hs, :ws]
s2 = bin_past[:hs, :ws]
diff = cv2.absdiff(s1, s2)
error_ratio = np.sum(diff > 0) / s1.size
if error_ratio < best_error:
best_error = error_ratio
best_offset = len(unique_measures) - past_idx
if best_error < 0.20:
new_start_offset = best_offset
anchored = True
print(f" [Anchor] Page {page_idx} -> PDF offset {best_offset} (Best Error: {best_error:.4f})")
else:
print(f" [New Page] Page {page_idx} -> No Overlap (Best Error: {best_error:.4f})")
if anchored and new_start_offset < len(page_measures):
unique_measures.extend(page_measures[new_start_offset:])
elif not anchored:
unique_measures.extend(page_measures)
print(f" -> 동기화 중복 제거 완료: 무손실 정적 페이지 기반 {len(unique_measures)}개 연속 마디 보존")
final_chunks = []
current_row_measures = []
current_row_width = 0
for measure_img in unique_measures:
measure_w = measure_img.shape[1]
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
row_img = np.hstack(current_row_measures)
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
current_row_measures = [measure_img]
current_row_width = measure_w
else:
current_row_measures.append(measure_img)
current_row_width += measure_w
if current_row_measures:
row_img = np.hstack(current_row_measures)
if row_img.shape[1] > chunk_width:
row_img = row_img[:, :chunk_width]
else:
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
return final_chunks
"""
# Regex covering the current extract_unique_scroll definition: from its exact
# signature up to the first following "return final_chunks" (non-greedy, DOTALL).
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'
# A callable replacement inserts new_func verbatim. A plain string would be
# parsed as a replacement template, so any backslash in the new source would be
# treated as an escape or group reference.
new_code, replaced = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if not replaced:
    # Fail loudly and leave the file untouched instead of reporting a patch
    # that never happened.
    raise SystemExit("extract_unique_scroll not found in youtube_tab_to_pdf.py; nothing patched.")
with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Holy Grail Pipeline Embedded.")

View File

@@ -0,0 +1,160 @@
import re
# Load the full current text of the module this script rewrites in place.
with open('youtube_tab_to_pdf.py', mode='r', encoding='utf-8') as source_file:
    code = source_file.read()
# Replacement source for extract_unique_scroll ("Stable-Blurred-Matrix"
# variant), kept as a string so it can be spliced into youtube_tab_to_pdf.py by
# regex. The embedded function:
#   * takes the median white-strip top/bottom over the first 50 frames and crops
#     every frame to that band;
#   * binarizes each crop (channel max, threshold 200) and treats a frame as a
#     new stable page when it differs from the previous frame by less than 5% of
#     pixels and from the last accepted page by at least 5%;
#   * cuts the stable crop at the detected bar x-coordinates, dropping slices
#     narrower than 40 px;
#   * Gaussian-blurs (7x7) the binary of the page's first measure and of up to
#     the 3 most recent kept measures (width within 30 px), template-matches the
#     blurred first measure with a 10 px border removed (TM_CCOEFF_NORMED) and
#     anchors when the best peak exceeds 0.85;
#   * when anchored, appends only page_measures[offset:]; when not, appends all;
#   * finally packs the kept measures into rows of chunk_width (1280) px, padding
#     short rows with white.
# NOTE(review): leading whitespace of the embedded source is not visible in this
# view, so the nesting described above is inferred from statement order — confirm
# against the real file.
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
print(f"[4/5] 순차 Stable-Blurred-Matrix 추출 중...")
strip_tops, strip_bottoms = [], []
for frame in frames[:50]:
strip = _find_white_tab_strip(frame)
if strip:
strip_tops.append(strip[0])
strip_bottoms.append(strip[1])
if not strip_tops: return []
median_top = int(np.median(strip_tops))
median_bottom = int(np.median(strip_bottoms))
def get_clean_binary(img):
gray = np.max(img, axis=2)
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
return binary
unique_measures = []
chunk_width = 1280
last_1fps_bin = None
last_solid_page = None
for frame_idx, frame in enumerate(frames):
h = frame.shape[0]
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
if not _has_tab_content(tab_crop):
continue
clean_bin = get_clean_binary(tab_crop)
if last_1fps_bin is not None:
diff = cv2.absdiff(clean_bin, last_1fps_bin)
error = np.count_nonzero(diff) / clean_bin.size
if error < 0.05:
has_changed_since_last_solid = True
if last_solid_page is not None:
s_diff = cv2.absdiff(clean_bin, last_solid_page)
s_err = np.count_nonzero(s_diff) / clean_bin.size
if s_err < 0.05:
has_changed_since_last_solid = False
if has_changed_since_last_solid:
last_solid_page = clean_bin.copy()
gray_page = _extract_print_channel(tab_crop)
bar_coords = _detect_measure_bars(gray_page)
if bar_coords:
coords = [0] + bar_coords + [tab_crop.shape[1]]
coords = sorted(list(set(coords)))
page_measures = []
for i in range(len(coords) - 1):
x_start = coords[i]
x_end = coords[i+1]
if x_end - x_start < 40: continue
page_measures.append(tab_crop[:, x_start:x_end])
if page_measures:
if not unique_measures:
unique_measures.extend(page_measures)
else:
first_m = page_measures[0]
bin_first = get_clean_binary(first_m)
blurred_first = cv2.GaussianBlur(bin_first, (7, 7), 0)
best_val = 0.0
best_offset = 0
anchored = False
# Deep Scan Deduplication explicitly disabled to prevent repeating choruses wiping out the PDF timeline!
# scan_dist=4 ensures we only match the immediately preceding page-flip overlap.
for scan_dist in range(1, min(4, len(unique_measures) + 1)):
past_idx = len(unique_measures) - scan_dist
past_m = unique_measures[past_idx]
bin_past = get_clean_binary(past_m)
blurred_past = cv2.GaussianBlur(bin_past, (7, 7), 0)
if abs(blurred_first.shape[1] - blurred_past.shape[1]) <= 30:
hs = min(blurred_first.shape[0], blurred_past.shape[0])
ws = min(blurred_first.shape[1], blurred_past.shape[1])
s1 = blurred_first[:hs, :ws]
s2 = blurred_past[:hs, :ws]
template = s1[10:-10, 10:-10]
if template.shape[0] >= 10 and template.shape[1] >= 10:
res = cv2.matchTemplate(s2, template, cv2.TM_CCOEFF_NORMED)
# Using cv2.minMaxLoc inside the result matrix to find any peak (subpixel shifting tolerance)
_, max_val, _, _ = cv2.minMaxLoc(res)
if max_val > best_val:
best_val = max_val
best_offset = len(unique_measures) - past_idx
if best_val > 0.85:
print(f" [Anchor] Page Matched -> PDF offset {best_offset} (Confidence: {best_val:.2f})")
new_start_offset = best_offset
anchored = True
if anchored and new_start_offset < len(page_measures):
unique_measures.extend(page_measures[new_start_offset:])
elif not anchored:
print(f" [New Page] No recent overlap (Confidence: {best_val:.2f})")
unique_measures.extend(page_measures)
last_1fps_bin = clean_bin.copy()
print(f" -> 동기화 중복 제거 완료: Stability-Blur 기반 {len(unique_measures)}개 마디 보존")
final_chunks = []
current_row_measures = []
current_row_width = 0
for measure_img in unique_measures:
measure_w = measure_img.shape[1]
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
row_img = np.hstack(current_row_measures)
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
current_row_measures = [measure_img]
current_row_width = measure_w
else:
current_row_measures.append(measure_img)
current_row_width += measure_w
if current_row_measures:
row_img = np.hstack(current_row_measures)
if row_img.shape[1] > chunk_width:
row_img = row_img[:, :chunk_width]
else:
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
return final_chunks
"""
# Regex covering the current extract_unique_scroll definition: from its exact
# signature up to the first following "return final_chunks" (non-greedy, DOTALL).
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'
# A callable replacement inserts new_func verbatim. A plain string would be
# parsed as a replacement template, so any backslash in the new source would be
# treated as an escape or group reference.
new_code, replaced = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if not replaced:
    # Fail loudly and leave the file untouched instead of reporting a patch
    # that never happened.
    raise SystemExit("extract_unique_scroll not found in youtube_tab_to_pdf.py; nothing patched.")
with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Stable-Blurred-Matrix Patched.")

View File

@@ -0,0 +1,198 @@
import re
# Load the full current text of the module this script rewrites in place.
with open('youtube_tab_to_pdf.py', mode='r', encoding='utf-8') as source_file:
    code = source_file.read()
# Replacement source for extract_unique_scroll ("Stable Content Trigger + Number
# Sprite" variant), kept as a string so it can be spliced into
# youtube_tab_to_pdf.py by regex. The embedded function:
#   * takes the median white-strip top/bottom over the first 50 frames and crops
#     every frame to that band;
#   * binarizes each crop (channel max, threshold 200) and treats a frame as a
#     new stable page when it differs from the previous frame by less than 5% of
#     pixels and from the last accepted page by at least 5%;
#   * cuts the stable crop at the detected bar x-coordinates, dropping slices
#     narrower than 40 px;
#   * extracts a "number sprite" (a region up to 60 px wide just above the first
#     detected staff row, rejected when it has fewer than 8 lit pixels) from the
#     page's first measure and template-matches it (2 px border removed,
#     TM_CCOEFF_NORMED) against the sprite of every kept measure, anchoring when
#     the best score exceeds 0.85;
#   * otherwise falls back to a whole-measure binary diff against up to the 3
#     most recent measures whose width is within 25 px, anchoring below 0.15;
#   * when anchored, appends only page_measures[offset:]; when not, appends all;
#   * finally packs the kept measures into rows of chunk_width (1280) px, padding
#     short rows with white.
# NOTE(review): the sprite score is read from res[0][0], i.e. a single corner
# cell of the match map rather than its peak (e.g. via cv2.minMaxLoc) — confirm
# this is intended.
# NOTE(review): leading whitespace of the embedded source is not visible in this
# view, so the nesting described above is inferred from statement order — confirm
# against the real file.
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
print(f"[4/5] 순차 Stable Content Trigger + Number Sprite 추출 중...")
strip_tops, strip_bottoms = [], []
for frame in frames[:50]:
strip = _find_white_tab_strip(frame)
if strip:
strip_tops.append(strip[0])
strip_bottoms.append(strip[1])
if not strip_tops: return []
median_top = int(np.median(strip_tops))
median_bottom = int(np.median(strip_bottoms))
def get_clean_binary(img):
gray = np.max(img, axis=2)
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
return binary
def get_number_sprite(m_img):
gray = np.max(m_img, axis=2)
_, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
row_sums = np.sum(thresh, axis=1) / 255
staff_lines = np.where(row_sums > m_img.shape[1] * 0.5)[0]
y_staff = staff_lines[0] if len(staff_lines) > 0 else 50
crop_y1 = max(0, y_staff - 35)
crop_y2 = max(0, y_staff - 2)
crop_x1 = 0
crop_x2 = min(60, m_img.shape[1])
if crop_y2 <= crop_y1 or crop_x2 <= crop_x1: return None
sprite = thresh[crop_y1:crop_y2, crop_x1:crop_x2]
if np.count_nonzero(sprite > 127) < 8: return None
return sprite
unique_measures = []
chunk_width = 1280
last_1fps_bin = None
last_solid_page = None
for frame_idx, frame in enumerate(frames):
h = frame.shape[0]
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
if not _has_tab_content(tab_crop):
continue
clean_bin = get_clean_binary(tab_crop)
if last_1fps_bin is not None:
diff = cv2.absdiff(clean_bin, last_1fps_bin)
error = np.count_nonzero(diff) / clean_bin.size
if error < 0.05:
has_changed_since_last_solid = True
if last_solid_page is not None:
s_diff = cv2.absdiff(clean_bin, last_solid_page)
s_err = np.count_nonzero(s_diff) / clean_bin.size
if s_err < 0.05:
has_changed_since_last_solid = False
if has_changed_since_last_solid:
last_solid_page = clean_bin.copy()
gray_page = _extract_print_channel(tab_crop)
bar_coords = _detect_measure_bars(gray_page)
if bar_coords:
coords = [0] + bar_coords + [tab_crop.shape[1]]
coords = sorted(list(set(coords)))
page_measures = []
for i in range(len(coords) - 1):
x_start = coords[i]
x_end = coords[i+1]
if x_end - x_start < 40: continue
page_measures.append(tab_crop[:, x_start:x_end])
if page_measures:
if not unique_measures:
unique_measures.extend(page_measures)
else:
first_m = page_measures[0]
first_sprite = get_number_sprite(first_m)
best_val = 0.0
best_offset = 0
anchored = False
# Deep Scan Deduplication (find exact Number Sprite match)
if first_sprite is not None:
for scan_dist in range(1, len(unique_measures) + 1):
past_idx = len(unique_measures) - scan_dist
past_m = unique_measures[past_idx]
past_sprite = get_number_sprite(past_m)
if past_sprite is not None:
hs = min(first_sprite.shape[0], past_sprite.shape[0])
ws = min(first_sprite.shape[1], past_sprite.shape[1])
if hs > 5 and ws > 5:
s1 = first_sprite[:hs, :ws]
s2 = past_sprite[:hs, :ws]
template = s1[2:-2, 2:-2]
if template.shape[0] >= 5 and template.shape[1] >= 5:
res = cv2.matchTemplate(s2, template, cv2.TM_CCOEFF_NORMED)
max_val = res[0][0]
if max_val > best_val:
best_val = max_val
best_offset = len(unique_measures) - past_idx
if best_val > 0.85:
print(f" [Sprite Anchor] Detected Measure {best_offset}! Ignoring duplicates.")
new_start_offset = best_offset
anchored = True
# Fallback geometric anchor for unlabeled pages (restricted back-scan)
if not anchored:
bin_first = get_clean_binary(first_m)
best_err = 1.0
for scan_dist in range(1, min(4, len(unique_measures) + 1)):
past_idx = len(unique_measures) - scan_dist
past_m = unique_measures[past_idx]
bin_past = get_clean_binary(past_m)
if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
hs = min(bin_first.shape[0], bin_past.shape[0])
ws = min(bin_first.shape[1], bin_past.shape[1])
s1 = bin_first[:hs, :ws]
s2 = bin_past[:hs, :ws]
m_diff = cv2.absdiff(s1, s2)
error_ratio = np.sum(m_diff > 0) / s1.size
if error_ratio < best_err:
best_err = error_ratio
best_offset = len(unique_measures) - past_idx
if best_err < 0.15:
new_start_offset = best_offset
anchored = True
if anchored and new_start_offset < len(page_measures):
unique_measures.extend(page_measures[new_start_offset:])
elif not anchored:
unique_measures.extend(page_measures)
last_1fps_bin = clean_bin.copy()
print(f" -> 동기화 중복 제거 완료: Stability 기반 {len(unique_measures)}개 마디 보존")
final_chunks = []
current_row_measures = []
current_row_width = 0
for measure_img in unique_measures:
measure_w = measure_img.shape[1]
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
row_img = np.hstack(current_row_measures)
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
current_row_measures = [measure_img]
current_row_width = measure_w
else:
current_row_measures.append(measure_img)
current_row_width += measure_w
if current_row_measures:
row_img = np.hstack(current_row_measures)
if row_img.shape[1] > chunk_width:
row_img = row_img[:, :chunk_width]
else:
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
return final_chunks
"""
# Regex covering the current extract_unique_scroll definition: from its exact
# signature up to the first following "return final_chunks" (non-greedy, DOTALL).
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'
# A callable replacement inserts new_func verbatim. A plain string would be
# parsed as a replacement template, so any backslash in the new source would be
# treated as an escape or group reference.
new_code, replaced = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if not replaced:
    # Fail loudly and leave the file untouched instead of reporting a patch
    # that never happened.
    raise SystemExit("extract_unique_scroll not found in youtube_tab_to_pdf.py; nothing patched.")
with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Stable Sprite Anchor Patched.")

View File

View File

@@ -0,0 +1,145 @@
import re
# Load the full current text of the module this script rewrites in place.
with open('youtube_tab_to_pdf.py', mode='r', encoding='utf-8') as source_file:
    code = source_file.read()
# Replacement source for extract_unique_scroll (keyframe page variant), kept as
# a string so it can be spliced into youtube_tab_to_pdf.py by regex.
# The embedded function:
#   * takes the median white-strip top/bottom over the first 50 frames, crops
#     every frame to that band and feeds the crops that pass _has_tab_content to
#     TemporalTracker(diff_threshold=0.05), then works on
#     tracker.get_unique_pages() (TemporalTracker is defined elsewhere;
#     presumably it keeps one still per page — verify);
#   * cuts each page at the detected bar x-coordinates, dropping slices narrower
#     than 40 px;
#   * compares the 200-threshold binary of the page's first measure with up to
#     the 3 most recent kept measures whose width is within 25 px, anchoring when
#     the best mismatch ratio is below 0.20;
#   * when anchored, appends only page_measures[offset:]; when not, appends all;
#   * finally packs the kept measures into rows of chunk_width (1280) px, padding
#     short rows with white.
# NOTE(review): leading whitespace of the embedded source is not visible in this
# view, so the nesting described above is inferred from statement order — confirm
# against the real file.
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
print(f"[4/5] 순차 Keyframe 페이지 추출 중...")
strip_tops, strip_bottoms = [], []
for frame in frames[:50]:
strip = _find_white_tab_strip(frame)
if strip:
strip_tops.append(strip[0])
strip_bottoms.append(strip[1])
if not strip_tops: return []
median_top = int(np.median(strip_tops))
median_bottom = int(np.median(strip_bottoms))
# 1. 0.05 threshold Tracker to completely ignore all fade/blur frames and extract EXACTLY 13 keyframes
tracker = TemporalTracker(diff_threshold=0.05)
for frame in frames:
h = frame.shape[0]
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
if not _has_tab_content(tab_crop):
continue
tracker.process_frame(tab_crop)
unique_pages = tracker.get_unique_pages()
if not unique_pages: return []
unique_measures = []
chunk_width = 1280
def get_clean_binary(img):
gray = np.max(img, axis=2)
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
return binary
for page_idx, page in enumerate(unique_pages):
gray_page = _extract_print_channel(page)
bar_coords = _detect_measure_bars(gray_page)
if not bar_coords: continue
coords = [0] + bar_coords + [page.shape[1]]
coords = sorted(list(set(coords)))
page_measures = []
for i in range(len(coords) - 1):
x_start = coords[i]
x_end = coords[i+1]
if x_end - x_start < 40: continue
page_measures.append(page[:, x_start:x_end])
if not page_measures: continue
if not unique_measures:
unique_measures.extend(page_measures)
continue
first_m = page_measures[0]
bin_first = get_clean_binary(first_m)
best_error = 1.0
best_offset = 0
anchored = False
# 3. CRUCIAL FIX: scan_dist limited to exactly 3.
# Preventing M40 from visually matching M9 because Chorus repeats.
for scan_dist in range(1, min(4, len(unique_measures) + 1)):
past_idx = len(unique_measures) - scan_dist
past_m = unique_measures[past_idx]
bin_past = get_clean_binary(past_m)
# 2. Binary Absdiff error < 0.20 for subpixel-immune, noise-immune math overlap matching
if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
hs = min(bin_first.shape[0], bin_past.shape[0])
ws = min(bin_first.shape[1], bin_past.shape[1])
s1 = bin_first[:hs, :ws]
s2 = bin_past[:hs, :ws]
diff = cv2.absdiff(s1, s2)
error_ratio = np.sum(diff > 0) / s1.size
if error_ratio < best_error:
best_error = error_ratio
best_offset = len(unique_measures) - past_idx
if best_error < 0.20:
new_start_offset = best_offset
anchored = True
if anchored and new_start_offset < len(page_measures):
# Overlapped exactly at this point, only append the truly NEW measures
unique_measures.extend(page_measures[new_start_offset:])
elif not anchored:
# Completely discrete page flip with no overlap, append all measures
unique_measures.extend(page_measures)
print(f" -> 동기화 중복 제거 완료: 무손실 정적 페이지 기반 {len(unique_measures)}개 연속 마디 보존")
final_chunks = []
current_row_measures = []
current_row_width = 0
for measure_img in unique_measures:
measure_w = measure_img.shape[1]
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
row_img = np.hstack(current_row_measures)
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
current_row_measures = [measure_img]
current_row_width = measure_w
else:
current_row_measures.append(measure_img)
current_row_width += measure_w
if current_row_measures:
row_img = np.hstack(current_row_measures)
if row_img.shape[1] > chunk_width:
row_img = row_img[:, :chunk_width]
else:
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
return final_chunks
"""
# Regex covering the current extract_unique_scroll definition: from its exact
# signature up to the first following "return final_chunks" (non-greedy, DOTALL).
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'
# A callable replacement inserts new_func verbatim. A plain string would be
# parsed as a replacement template, so any backslash in the new source would be
# treated as an escape or group reference.
new_code, replaced = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if not replaced:
    # Fail loudly and leave the file untouched instead of reporting a patch
    # that never happened.
    raise SystemExit("extract_unique_scroll not found in youtube_tab_to_pdf.py; nothing patched.")
with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Final Truth Pipeline Patched.")

View File

@@ -0,0 +1,156 @@
import re
# Load the full current text of the module this script rewrites in place.
with open('youtube_tab_to_pdf.py', mode='r', encoding='utf-8') as source_file:
    code = source_file.read()
# Replacement source for extract_unique_scroll (inline binarized tracker
# variant, no TemporalTracker), kept as a string so it can be spliced into
# youtube_tab_to_pdf.py by regex. The embedded function:
#   * takes the median white-strip top/bottom over the first 50 frames and crops
#     every frame to that band;
#   * binarizes each crop (channel max, threshold 200) and keeps the BGR crop as
#     a new page when the share of differing pixels against the last kept binary
#     exceeds diff_threshold (0.015), or when nothing has been kept yet;
#   * cuts each page at the detected bar x-coordinates, dropping slices narrower
#     than 40 px;
#   * compares the binary of the page's first measure with up to the 3 most
#     recent kept measures whose width is within 25 px, anchoring when the best
#     mismatch ratio is below 0.20;
#   * when anchored, appends only page_measures[offset:]; when not, appends all;
#   * finally packs the kept measures into rows of chunk_width (1280) px, padding
#     short rows with white.
# NOTE(review): leading whitespace of the embedded source is not visible in this
# view, so the nesting described above is inferred from statement order — confirm
# against the real file.
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
print(f"[4/5] 순차 Binarized-Tracker 정밀 추출 중...")
strip_tops, strip_bottoms = [], []
for frame in frames[:50]:
strip = _find_white_tab_strip(frame)
if strip:
strip_tops.append(strip[0])
strip_bottoms.append(strip[1])
if not strip_tops: return []
median_top = int(np.median(strip_tops))
median_bottom = int(np.median(strip_bottoms))
def get_clean_binary(img):
gray = np.max(img, axis=2)
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
return binary
diff_threshold = 0.015
clean_to_bgr = []
last_clean_bin = None
for frame in frames:
h = frame.shape[0]
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
if not _has_tab_content(tab_crop):
continue
clean_bin = get_clean_binary(tab_crop)
if last_clean_bin is not None:
raw_diff = cv2.absdiff(clean_bin, last_clean_bin)
non_zero_ratio = np.count_nonzero(raw_diff) / clean_bin.size
if non_zero_ratio > diff_threshold:
clean_to_bgr.append(tab_crop)
last_clean_bin = clean_bin.copy()
else:
clean_to_bgr.append(tab_crop)
last_clean_bin = clean_bin.copy()
unique_pages = clean_to_bgr
if not unique_pages: return []
print(f" -> {len(unique_pages)}개의 고유 정적 페이지 캡처 완료. 3-마디 역탐색 동기화 시작...")
unique_measures = []
chunk_width = 1280
for page_idx, page in enumerate(unique_pages):
gray_page = _extract_print_channel(page)
bar_coords = _detect_measure_bars(gray_page)
if not bar_coords: continue
coords = [0] + bar_coords + [page.shape[1]]
coords = sorted(list(set(coords)))
page_measures = []
for i in range(len(coords) - 1):
x_start = coords[i]
x_end = coords[i+1]
if x_end - x_start < 40: continue
page_measures.append(page[:, x_start:x_end])
if not page_measures: continue
if not unique_measures:
unique_measures.extend(page_measures)
continue
first_m = page_measures[0]
bin_first = get_clean_binary(first_m)
best_error = 1.0
best_offset = 0
anchored = False
for scan_dist in range(1, min(4, len(unique_measures) + 1)):
past_idx = len(unique_measures) - scan_dist
past_m = unique_measures[past_idx]
bin_past = get_clean_binary(past_m)
if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
hs = min(bin_first.shape[0], bin_past.shape[0])
ws = min(bin_first.shape[1], bin_past.shape[1])
s1 = bin_first[:hs, :ws]
s2 = bin_past[:hs, :ws]
diff = cv2.absdiff(s1, s2)
error_ratio = np.sum(diff > 0) / s1.size
if error_ratio < best_error:
best_error = error_ratio
best_offset = len(unique_measures) - past_idx
if best_error < 0.20:
new_start_offset = best_offset
anchored = True
print(f" [Anchor] Page {page_idx} -> PDF offset {best_offset} (Best Error: {best_error:.4f})")
else:
print(f" [New Page] Page {page_idx} -> No Overlap (Best Error: {best_error:.4f})")
if anchored and new_start_offset < len(page_measures):
unique_measures.extend(page_measures[new_start_offset:])
elif not anchored:
unique_measures.extend(page_measures)
print(f" -> 동기화 중복 제거 완료: 무손실 정적 페이지 기반 {len(unique_measures)}개 연속 마디 보존")
final_chunks = []
current_row_measures = []
current_row_width = 0
for measure_img in unique_measures:
measure_w = measure_img.shape[1]
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
row_img = np.hstack(current_row_measures)
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
current_row_measures = [measure_img]
current_row_width = measure_w
else:
current_row_measures.append(measure_img)
current_row_width += measure_w
if current_row_measures:
row_img = np.hstack(current_row_measures)
if row_img.shape[1] > chunk_width:
row_img = row_img[:, :chunk_width]
else:
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
return final_chunks
"""
# Regex covering the current extract_unique_scroll definition: from its exact
# signature up to the first following "return final_chunks" (non-greedy, DOTALL).
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'
# A callable replacement inserts new_func verbatim. A plain string would be
# parsed as a replacement template, so any backslash in the new source would be
# treated as an escape or group reference.
new_code, replaced = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if not replaced:
    # Fail loudly and leave the file untouched instead of reporting a patch
    # that never happened.
    raise SystemExit("extract_unique_scroll not found in youtube_tab_to_pdf.py; nothing patched.")
with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Holy Grail Pipeline Embedded Inline successfully!")

View File

@@ -0,0 +1,180 @@
import re
with open('youtube_tab_to_pdf.py', 'r', encoding='utf-8') as f:
code = f.read()
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
print(f"[4/5] 순차 Number Sprite Template 앵커 기반 마디 추출 중...")
strip_tops, strip_bottoms = [], []
for frame in frames[:50]:
strip = _find_white_tab_strip(frame)
if strip:
strip_tops.append(strip[0])
strip_bottoms.append(strip[1])
if not strip_tops: return []
median_top = int(np.median(strip_tops))
median_bottom = int(np.median(strip_bottoms))
unique_measures = []
chunk_width = 1280
def get_number_sprite(m_img):
# We explicitly use inverse thresholding to capture the tiny white number on black background
gray = np.max(m_img, axis=2)
_, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
row_sums = np.sum(thresh, axis=1) / 255
staff_lines = np.where(row_sums > m_img.shape[1] * 0.5)[0]
y_staff = staff_lines[0] if len(staff_lines) > 0 else 50
crop_y1 = max(0, y_staff - 35)
crop_y2 = max(0, y_staff - 2)
crop_x1 = 0
crop_x2 = min(60, m_img.shape[1])
if crop_y2 <= crop_y1 or crop_x2 <= crop_x1: return None
sprite = thresh[crop_y1:crop_y2, crop_x1:crop_x2]
# MUST BE STRICT: If there are fewer than 8 white pixels, it's a BLANK SPRITE.
# Blank sprites caused the catastrophic 1->36 time-travel deletion!
if np.count_nonzero(sprite > 127) < 8: return None
return sprite
for frame_idx, frame in enumerate(frames):
h = frame.shape[0]
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
if not _has_tab_content(tab_crop):
continue
gray_page = _extract_print_channel(tab_crop)
bar_coords = _detect_measure_bars(gray_page)
if not bar_coords: continue
coords = [0] + bar_coords + [tab_crop.shape[1]]
coords = sorted(list(set(coords)))
page_measures = []
for i in range(len(coords) - 1):
x_start = coords[i]
x_end = coords[i+1]
if x_end - x_start < 40: continue
page_measures.append(tab_crop[:, x_start:x_end])
if not page_measures: continue
if not unique_measures:
unique_measures.extend(page_measures)
first_sprite = get_number_sprite(page_measures[0])
has_pixels = np.count_nonzero(first_sprite > 127) if first_sprite is not None else 0
print(f" -> [초기화] 첫 프레임 배열 등록: {len(page_measures)}개 마디 (Sprite Pixels: {has_pixels})")
continue
first_m = page_measures[0]
first_sprite = get_number_sprite(first_m)
anchored = False
new_start_offset = 0
best_val = 0.0
# Only attempt anchor if the first measure explicitly displays a sequence number.
# If it's blank, we DO NOT blindly match it to other blank measures!
if first_sprite is not None:
# We can scan backwards up to 15 measures because clear Number Sprites are completely unique IDs.
for scan_dist in range(1, min(15, len(unique_measures) + 1)):
past_idx = len(unique_measures) - scan_dist
past_m = unique_measures[past_idx]
past_sprite = get_number_sprite(past_m)
if past_sprite is not None:
hs = min(first_sprite.shape[0], past_sprite.shape[0])
ws = min(first_sprite.shape[1], past_sprite.shape[1])
s1 = first_sprite[:hs, :ws]
s2 = past_sprite[:hs, :ws]
template = s1[2:-2, 2:-2]
if template.shape[0] >= 5 and template.shape[1] >= 5:
res = cv2.matchTemplate(s2, template, cv2.TM_CCOEFF_NORMED)
max_val = res[0][0]
if max_val > best_val:
best_val = max_val
new_start_offset = len(unique_measures) - past_idx
if best_val > 0.85:
anchored = True
# If we failed to anchor via Sprite (maybe this page has no numbers at all),
# we fallback to strict whole-measure Template Matching (TM_CCOEFF_NORMED on greyscale prints to survive subpixel scroll drift)
if not anchored:
bin_first = _extract_print_channel(first_m) # greyscale thresholded
for scan_dist in range(1, min(4, len(unique_measures) + 1)): # strictly limit to 4 to prevent musical loops
past_idx = len(unique_measures) - scan_dist
past_m = unique_measures[past_idx]
bin_past = _extract_print_channel(past_m)
if abs(bin_first.shape[1] - bin_past.shape[1]) <= 30:
hs = min(bin_first.shape[0], bin_past.shape[0])
ws = min(bin_first.shape[1], bin_past.shape[1])
s1 = bin_first[:hs, :ws]
s2 = bin_past[:hs, :ws]
template = s1[10:-10, 10:-10]
if template.shape[0] >= 10 and template.shape[1] >= 10:
res = cv2.matchTemplate(s2, template, cv2.TM_CCOEFF_NORMED)
max_val = res[0][0]
if max_val > 0.85:
new_start_offset = len(unique_measures) - past_idx
anchored = True
break
if anchored and new_start_offset < len(page_measures):
unique_measures.extend(page_measures[new_start_offset:])
elif not anchored:
unique_measures.extend(page_measures)
print(f" -> 동기화 중복 제거 완료: Number Sprite 시계열 기반 {len(unique_measures)}개 마디 보존")
final_chunks = []
current_row_measures = []
current_row_width = 0
for measure_img in unique_measures:
measure_w = measure_img.shape[1]
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
row_img = np.hstack(current_row_measures)
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
current_row_measures = [measure_img]
current_row_width = measure_w
else:
current_row_measures.append(measure_img)
current_row_width += measure_w
if current_row_measures:
row_img = np.hstack(current_row_measures)
if row_img.shape[1] > chunk_width:
row_img = row_img[:, :chunk_width]
else:
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
return final_chunks
"""
# Match the current extract_unique_scroll definition, from its signature up to
# the first "return final_chunks" (non-greedy, DOTALL so "." spans newlines).
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'
# A callable replacement inserts new_func verbatim; a plain replacement string
# would have any backslashes in it interpreted as template escapes.
new_code, replaced = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if replaced == 0:
    # Nothing matched: leave the target file untouched rather than rewriting it
    # unchanged and reporting a success that never happened.
    raise SystemExit("extract_unique_scroll was not found in youtube_tab_to_pdf.py; nothing patched.")
with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Supreme Logic Embedded.")

View File

@@ -0,0 +1,153 @@
import re
with open('youtube_tab_to_pdf.py', 'r', encoding='utf-8') as f:
code = f.read()
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
print(f"[4/5] 순차 Stable Content Trigger 방식 추출 중...")
strip_tops, strip_bottoms = [], []
for frame in frames[:50]:
strip = _find_white_tab_strip(frame)
if strip:
strip_tops.append(strip[0])
strip_bottoms.append(strip[1])
if not strip_tops: return []
median_top = int(np.median(strip_tops))
median_bottom = int(np.median(strip_bottoms))
def get_clean_binary(img):
gray = np.max(img, axis=2)
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
return binary
unique_measures = []
chunk_width = 1280
last_1fps_bin = None
last_solid_page = None
for frame_idx, frame in enumerate(frames):
h = frame.shape[0]
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
if not _has_tab_content(tab_crop):
continue
clean_bin = get_clean_binary(tab_crop)
if last_1fps_bin is not None:
# Check stability compared to 1 second ago
diff = cv2.absdiff(clean_bin, last_1fps_bin)
error = np.count_nonzero(diff) / clean_bin.size
if error < 0.05: # Page is fully stabilized (not a fading transition)
has_changed_since_last_solid = True
if last_solid_page is not None:
s_diff = cv2.absdiff(clean_bin, last_solid_page)
s_err = np.count_nonzero(s_diff) / clean_bin.size
if s_err < 0.05:
has_changed_since_last_solid = False
# We only process this page if it's securely stable AND we haven't already processed it
if has_changed_since_last_solid:
last_solid_page = clean_bin.copy()
# Extract measures
gray_page = _extract_print_channel(tab_crop)
bar_coords = _detect_measure_bars(gray_page)
if bar_coords:
coords = [0] + bar_coords + [tab_crop.shape[1]]
coords = sorted(list(set(coords)))
page_measures = []
for i in range(len(coords) - 1):
x_start = coords[i]
x_end = coords[i+1]
if x_end - x_start < 40: continue
page_measures.append(tab_crop[:, x_start:x_end])
if page_measures:
if not unique_measures:
unique_measures.extend(page_measures)
else:
first_m = page_measures[0]
bin_first = get_clean_binary(first_m)
best_error = 1.0
best_offset = 0
anchored = False
# scan_dist=4 ensures we never loop back to identical repeating choruses from 10 seconds ago!
for scan_dist in range(1, min(4, len(unique_measures) + 1)):
past_idx = len(unique_measures) - scan_dist
past_m = unique_measures[past_idx]
bin_past = get_clean_binary(past_m)
if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
hs = min(bin_first.shape[0], bin_past.shape[0])
ws = min(bin_first.shape[1], bin_past.shape[1])
s1 = bin_first[:hs, :ws]
s2 = bin_past[:hs, :ws]
m_diff = cv2.absdiff(s1, s2)
error_ratio = np.sum(m_diff > 0) / s1.size
if error_ratio < best_error:
best_error = error_ratio
best_offset = len(unique_measures) - past_idx
if best_error < 0.15:
new_start_offset = best_offset
if new_start_offset < len(page_measures):
unique_measures.extend(page_measures[new_start_offset:])
else:
unique_measures.extend(page_measures)
last_1fps_bin = clean_bin.copy()
print(f" -> 동기화 중복 제거 완료: Stability 기반 {len(unique_measures)}개 마디 보존")
final_chunks = []
current_row_measures = []
current_row_width = 0
for measure_img in unique_measures:
measure_w = measure_img.shape[1]
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
row_img = np.hstack(current_row_measures)
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
current_row_measures = [measure_img]
current_row_width = measure_w
else:
current_row_measures.append(measure_img)
current_row_width += measure_w
if current_row_measures:
row_img = np.hstack(current_row_measures)
if row_img.shape[1] > chunk_width:
row_img = row_img[:, :chunk_width]
else:
pad_w = chunk_width - row_img.shape[1]
if pad_w > 0:
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
row_img = np.hstack([row_img, pad_img])
final_chunks.append(row_img)
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
return final_chunks
"""
# Match the current extract_unique_scroll definition, from its signature up to
# the first "return final_chunks" (non-greedy, DOTALL so "." spans newlines).
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'
# A callable replacement inserts new_func verbatim; a plain replacement string
# would have any backslashes in it interpreted as template escapes.
new_code, replaced = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if replaced == 0:
    # Nothing matched: leave the target file untouched rather than rewriting it
    # unchanged and reporting a success that never happened.
    raise SystemExit("extract_unique_scroll was not found in youtube_tab_to_pdf.py; nothing patched.")
with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Stable Content Trigger Patched.")

View File

@@ -0,0 +1,80 @@
import sys
with open('youtube_tab_to_pdf.py', 'r', encoding='utf-8') as f:
lines = f.readlines()
new_lines = []
skip = False
import_added = False
for line in lines:
if line.startswith('import cv2') and not import_added:
new_lines.append(line)
new_lines.append('from video_cv_tracker import TemporalTracker\n')
import_added = True
continue
if line.startswith('def extract_unique_scroll(frames:'):
skip = True
new_lines.append('''def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
print(f"[4/5] 스크롤형 Tab 시계열 추적 추출 중...")
strip_tops, strip_bottoms = [], []
for frame in frames[:50]:
strip = _find_white_tab_strip(frame)
if strip:
strip_tops.append(strip[0])
strip_bottoms.append(strip[1])
if not strip_tops:
return []
median_top = int(np.median(strip_tops))
median_bottom = int(np.median(strip_bottoms))
tracker = TemporalTracker()
for frame in frames:
h = frame.shape[0]
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
if not _has_tab_content(tab_crop):
continue
tracker.process_frame(tab_crop)
panorama = tracker.get_final_panorama()
if panorama is None:
return []
print(f" -> 생성된 파노라마 길이: {panorama.shape[1]}px")
chunk_width = 1280
final_chunks = []
w = panorama.shape[1]
start_x = 0
while start_x < w:
chunk = panorama[:, start_x:min(w, start_x + chunk_width)]
if chunk.shape[1] < chunk_width:
pad = np.full((chunk.shape[0], chunk_width - chunk.shape[1], 3), 255, dtype=np.uint8)
chunk = np.hstack([chunk, pad])
final_chunks.append(chunk)
start_x += chunk_width
print(f" -> A4 분할 컷: {len(final_chunks)}개")
return final_chunks
''')
continue
if skip and line.startswith('def extract_unique_overlay('):
skip = False
if not skip:
new_lines.append(line)
# Skipping only stops when the loop meets 'def extract_unique_overlay('.
# If that marker never appeared, every line after extract_unique_scroll was
# dropped, and writing new_lines would truncate the target file.
if skip:
    raise SystemExit("Marker 'def extract_unique_overlay(' not found; refusing to truncate youtube_tab_to_pdf.py.")
if not import_added:
    # The replacement body uses TemporalTracker, so a missing import matters.
    print("Warning: no 'import cv2' line found; the TemporalTracker import was not added.")
with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.writelines(new_lines)
print("Patched youtube_tab_to_pdf.py successfully.")

View File

View File

@@ -0,0 +1,41 @@
import cv2
import numpy as np
import time
# Estimate how many pixel columns two consecutive chunks share by minimising
# the mean absolute difference (MAD) between the right edge of chunk 0 and the
# left edge of chunk 1, then write the stitched result for visual inspection.
img0 = cv2.imread(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\raw_chunk_00.png")
img1 = cv2.imread(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\raw_chunk_01.png")
# cv2.imread reports failure by returning None instead of raising.
if img0 is None or img1 is None:
    raise SystemExit("Could not read raw_chunk_00.png and/or raw_chunk_01.png.")
gray0 = cv2.cvtColor(img0, cv2.COLOR_BGR2GRAY)
gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
w = gray0.shape[1]
best_ov = 0
min_mad = float('inf')
start_time = time.time()
# Downsample by 2 horizontally & vertically for speed; both images are resized
# to the same width (half of chunk 0's width).
small0 = cv2.resize(gray0, (w//2, gray0.shape[0]//2))
small1 = cv2.resize(gray1, (w//2, gray1.shape[0]//2))
sw = small0.shape[1]
# Try candidate overlap widths from widest to narrowest (downsampled pixels).
for ov in range(sw-2, 10, -1):
    diff = cv2.absdiff(small0[:, -ov:], small1[:, :ov])
    mad = np.mean(diff)
    if mad < min_mad:
        min_mad = mad
        best_ov = ov * 2  # map back to original scale
        if min_mad < 3.0:  # effectively a perfect match: stop searching
            break
end_time = time.time()
print(f"MSE MAD found overlap {best_ov}px with MAD {min_mad:.2f} in {(end_time-start_time)*1000:.1f}ms")
# Verify: drop the overlapping columns from chunk 1 and stitch side by side.
stitched = np.hstack([img0, img1[:, best_ov:]])
cv2.imwrite(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\test_mse_stitch.png", stitched)
print("Exported test_mse_stitch.png")

View File

@@ -0,0 +1,47 @@
import cv2
import numpy as np
import glob
# Grab a single sample frame (index 500) from the first video under output/.
videos = glob.glob('output/*.mp4')
if not videos:
    raise SystemExit("No .mp4 files found under output/.")
cap = cv2.VideoCapture(videos[0])
cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
ret, frame = cap.read()
cap.release()
if not ret:
    # Without a decoded frame the analysis below has nothing to work on.
    raise SystemExit("Cannot read video frame.")
def _find_white_tab_strip(bgr: np.ndarray):
    """Return the first and last row index of the bright band in *bgr*.

    A row counts as part of the band when more than 10% of its pixels are
    brighter than 200 after grayscale conversion. Returns ``None`` when
    fewer than two such rows exist.
    """
    grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    mask = cv2.threshold(grayscale, 200, 255, cv2.THRESH_BINARY)[1]
    _, width = mask.shape
    # Mask values are 0 or 255, so dividing the row sum by 255 counts pixels.
    whites_per_row = np.sum(mask, axis=1) / 255
    bright_rows = np.where(whites_per_row > width * 0.1)[0]
    if len(bright_rows) < 2:
        return None
    return bright_rows[0], bright_rows[-1]
# Locate the tab strip in the sample frame, then report the x positions of
# columns that are bright over more than 40% of the strip height.
strip = _find_white_tab_strip(frame)
if not strip:
    print("Could not find tab strip.")
else:
    y1, y2 = strip
    h_roi = y2 - y1
    # Brightest colour channel per pixel, reduced to a 0/255 mask.
    brightest = np.max(frame[y1:y2, :], axis=2)
    binary = cv2.threshold(brightest, 200, 255, cv2.THRESH_BINARY)[1]
    whites_per_column = np.sum(binary, axis=0) / 255
    # Relaxed to 40% to survive hand occlusions. Note stems max out at ~20-30%.
    candidate_xs = np.where(whites_per_column > h_roi * 0.4)[0]
    clean_bars = []
    for col in candidate_xs:
        # Collapse columns within 20px of the last kept bar into that bar.
        if clean_bars and col - clean_bars[-1] <= 20:
            continue
        clean_bars.append(int(col))
    # Add the frame edges as boundaries when no bar lies within 50px of them.
    strip_width = binary.shape[1]
    if not clean_bars or clean_bars[0] > 50:
        clean_bars.insert(0, 0)
    if clean_bars[-1] < strip_width - 50:
        clean_bars.append(strip_width)
    print(f"Top: {y1}, Bottom: {y2}, Height: {h_roi}")
    print(f"Detected Clean Measure Bars: {clean_bars}")

View File

@@ -0,0 +1,39 @@
import cv2
import numpy as np
from video_cv_tracker import TemporalTracker
import time
def extract_cropped_pages(video_path, limit_frames=3000):
    """Feed the cropped sheet-music ribbon of a video into a TemporalTracker.

    Args:
        video_path: Path of the video file to open.
        limit_frames: Maximum number of frames to read from the start.

    Returns:
        Whatever ``TemporalTracker.get_unique_pages()`` returns after the
        frames have been processed.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        tracker = TemporalTracker(diff_threshold=0.20)
        frames_processed = 0
        while frames_processed < limit_frames:
            ret, frame = cap.read()
            if not ret:
                break
            # Normalise every frame to a width of 1280, keeping aspect ratio.
            scale = 1280 / frame.shape[1]
            frame = cv2.resize(frame, (1280, int(frame.shape[0] * scale)))
            # Fixed crop of rows 103:280; per the original author's note this
            # keeps only the sheet music and stops above the guitarist's head.
            ribbon = frame[103:280, :]
            tracker.process_frame(ribbon)
            frames_processed += 1
        return tracker.get_unique_pages()
    finally:
        # Release the capture even when the tracker or OpenCV raises.
        cap.release()
if __name__ == "__main__":
video_path = "output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4"
pages = extract_cropped_pages(video_path)
print(f"Extracted {len(pages)} perfectly cropped median pages.")
if pages:
# Stack vertically
final_img = np.vstack(pages)
cv2.imwrite("C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/restored_perfect_crop.png", final_img)
print("Saved cleanly cropped vertical stack.")

View File

@@ -0,0 +1,17 @@
import cv2
import numpy as np
# Export two fixed row crops of a saved raw frame for visual comparison.
frame = cv2.imread("C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/raw_frame_30s.png")
# cv2.imread reports failure by returning None instead of raising.
if frame is None:
    raise SystemExit("Could not read raw_frame_30s.png.")
# Row ranges noted by the original author (from an ASCII dump of the frame):
# Top black letterbox is 0:100
# White sheet music is 100:280
# Guitarist is 280:720
crop1 = frame[103:280, :]
cv2.imwrite("C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/crop_103_280.png", crop1)
crop2 = frame[0:180, :]
cv2.imwrite("C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/crop_0_180.png", crop2)
print("Saved crop_103_280.png and crop_0_180.png")

View File

@@ -0,0 +1,31 @@
import cv2
import numpy as np
import easyocr
import time
reader = easyocr.Reader(['en'], gpu=False)
def test_ocr(image_text, img_data):
    """Run the module-level easyocr ``reader`` on *img_data* and print the result.

    *image_text* is only a label for the printed line. The image is enlarged
    3x and surrounded by a 50px white border before digit-only recognition.
    """
    # Upscale so the text detector has more pixels to work with.
    enlarged = cv2.resize(img_data, None, fx=3, fy=3, interpolation=cv2.INTER_CUBIC)
    # White margin on all four sides.
    framed = cv2.copyMakeBorder(
        enlarged, 50, 50, 50, 50, cv2.BORDER_CONSTANT, value=[255, 255, 255]
    )
    t0 = time.time()
    results = reader.readtext(framed, allowlist="0123456789")
    tf = time.time()
    print(f"[{image_text}] Result: {results} (took {tf-t0:.2f}s)")
# Generate a tiny "37" (white on black)
img_37 = np.zeros((30, 40), dtype=np.uint8)
img_37[5:10, 10:20] = 255 # Top of "3"
img_37[12:15, 10:20] = 255 # Mid of "3"
img_37[20:25, 10:20] = 255 # Bot of "3"
img_37[5:10, 25:35] = 255 # Top of "7"
img_37[5:25, 30:35] = 255 # Right of "7"
# Invert it so it's black text on white background (what OCR expects)
img_37_inv = cv2.bitwise_not(img_37)
test_ocr("Tiny 37 Synth", img_37_inv)

View File

@@ -0,0 +1,44 @@
import cv2
import numpy as np
def test_page_flip_diff():
    """Print frame-to-frame change ratios of the cropped ribbon for up to 2000 frames.

    Uses the first ``output/*.mp4`` (or a fallback path), resizes each frame
    to width 1280, crops rows 103:280 and measures the fraction of pixels
    whose grayscale value changed by more than 50 since the previous frame.
    Ratios above 1% are printed, followed by the largest ratio seen.
    """
    import glob
    videos = glob.glob("output/*.mp4")
    cap = cv2.VideoCapture(videos[0] if videos else "output/shintakarajima.mp4")
    try:
        ret, prev_frame = cap.read()
        if not ret:
            return

        # The scale from the first frame is reused for every later frame.
        scale = 1280 / prev_frame.shape[1]
        prev = cv2.resize(prev_frame, (1280, int(prev_frame.shape[0] * scale)))[103:280, :]
        prev_gray = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY)

        idx = 1
        max_diff = 0
        max_diff_idx = -1
        print("Scanning first 2000 frames for diff_ratio spikes...")
        while idx < 2000:
            ret, frame = cap.read()
            if not ret:
                break
            # Every frame is compared against its immediate predecessor.
            curr = cv2.resize(frame, (1280, int(frame.shape[0] * scale)))[103:280, :]
            curr_gray = cv2.cvtColor(curr, cv2.COLOR_BGR2GRAY)
            diff = cv2.absdiff(prev_gray, curr_gray)
            _, thresh = cv2.threshold(diff, 50, 255, cv2.THRESH_BINARY)
            ratio = np.sum(thresh > 0) / thresh.size
            if ratio > 0.01:
                print(f"Frame {idx}: diff_ratio = {ratio:.4f}")
            if ratio > max_diff:
                max_diff = ratio
                max_diff_idx = idx
            prev_gray = curr_gray
            idx += 1
        print(f"\nMax diff spike: {max_diff:.4f} at frame {max_diff_idx}")
    finally:
        # The capture was previously never released, even on the early return.
        cap.release()
if __name__ == "__main__":
test_page_flip_diff()

View File

@@ -0,0 +1,61 @@
import cv2
import numpy as np
import glob
videos = glob.glob('output/*.mp4')
cap = cv2.VideoCapture(videos[0])
# Collect 30 continuous frames
frames = []
cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
for _ in range(30):
ret, frame = cap.read()
if not ret: break
frames.append(frame)
cap.release()
if len(frames) == 30:
median_frame = np.median(frames, axis=0).astype(np.uint8)
gray = np.max(median_frame, axis=2)
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
row_sums = np.sum(binary, axis=1) / 255
y_staff = np.where(row_sums > binary.shape[1] * 0.4)[0]
if len(y_staff) > 0:
y_top = y_staff[0]
y_bottom = y_staff[-1]
staff_h = y_bottom - y_top
roi = binary[y_top:y_bottom, :]
# 1. Bridge vertical gaps (like the gap between standard notation and tab)
# kernel of 20px will bridge gaps up to 19px without increasing horizontal width
bridge_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 20))
bridged = cv2.dilate(roi, bridge_kernel)
# 2. Erase everything that isn't a continuous vertical line of at least 80% staff height
# Note stems are short, so they get erased even after bridging!
open_height = int(staff_h * 0.8)
open_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, open_height))
isolated_bars = cv2.morphologyEx(bridged, cv2.MORPH_OPEN, open_kernel)
# 3. The isolated_bars image now contains ONLY thick, pure measure bars. Get their X coords.
col_sums = np.sum(isolated_bars, axis=0) / 255
# Even 1 pixel of the filtered bar is enough, but let's use a tiny threshold
bars = np.where(col_sums > open_height * 0.5)[0]
clean_bars = []
for x in bars:
if not clean_bars or x - clean_bars[-1] > 20:
clean_bars.append(int(x))
# Inject edges
if not clean_bars or clean_bars[0] > 50: clean_bars.insert(0, 0)
if clean_bars[-1] < binary.shape[1] - 50: clean_bars.append(binary.shape[1])
print(f"Gap-Bridged Morphology Measure Boundaries: {clean_bars}")
cv2.imwrite("C:/Users/Certes/Desktop/guitar_score/debug_gap_bridged.png", isolated_bars)
else:
print("Not enough frames.")

View File

@@ -0,0 +1,60 @@
import cv2
import numpy as np
# Simulate a thin "1" and "2"
img_12 = np.zeros((60, 100), dtype=np.uint8)
img_12[10:50, 40:45] = 255 # The "1"
img_12[10:15, 60:80] = 255 # Top of "2"
img_12[15:45, 75:80] = 255 # Right of "2"
img_12[45:50, 60:80] = 255 # Bottom of "2"
# Simulate a thin "3" and "7"
img_37 = np.zeros((60, 100), dtype=np.uint8)
img_37[10:15, 30:50] = 255 # Top of "3"
img_37[25:30, 30:50] = 255 # Mid of "3"
img_37[45:50, 30:50] = 255 # Bot of "3"
img_37[10:15, 60:80] = 255 # Top of "7"
img_37[15:50, 75:80] = 255 # Right of "7"
# Simulate the SAME "12" but shifted by 2 pixels (due to video wobble)
img_12_shifted = np.zeros((60, 100), dtype=np.uint8)
img_12_shifted[12:52, 42:47] = 255
img_12_shifted[12:17, 62:82] = 255
img_12_shifted[17:47, 77:82] = 255
img_12_shifted[47:52, 62:82] = 255
def compute_iou(s1, s2):
    """Return intersection-over-union of the non-zero pixels of *s1* and *s2*.

    The denominator is clamped to at least 1, so two all-zero inputs give 0.
    """
    lit1 = s1 > 0
    lit2 = s2 > 0
    shared = np.count_nonzero(np.logical_and(lit1, lit2))
    covered = np.count_nonzero(np.logical_or(lit1, lit2))
    return shared / max(1, covered)
def robust_match(s1, s2):
    """Return the best IoU between dilated *s1* and shifted, dilated *s2*.

    Both masks are dilated with a 5x5 kernel, then *s2* is translated by
    every combination of -2, 0 and +2 pixels in x and y; the highest
    ``compute_iou`` score over those nine placements is returned.
    """
    # Thicken strokes so slightly misaligned lines can still overlap.
    kernel = np.ones((5, 5), np.uint8)
    thick1 = cv2.dilate(s1, kernel, iterations=1)
    thick2 = cv2.dilate(s2, kernel, iterations=1)
    out_size = (s2.shape[1], s2.shape[0])  # warpAffine expects (width, height)
    offsets = [(dy, dx) for dy in (-2, 0, 2) for dx in (-2, 0, 2)]
    best_iou = 0
    for dy, dx in offsets:
        # Pure translation by (dx, dy) whole pixels.
        translation = np.float32([[1, 0, dx], [0, 1, dy]])
        moved = cv2.warpAffine(thick2, translation, out_size)
        best_iou = max(best_iou, compute_iou(thick1, moved))
    return best_iou
print("IoU (12 vs 37):", robust_match(img_12, img_37))
print("IoU (12 vs 12_shifted):", robust_match(img_12, img_12_shifted))
# Let's see what TM_CCOEFF_NORMED would have done:
res = cv2.matchTemplate(img_37, img_12[5:-5, 5:-5], cv2.TM_CCOEFF_NORMED)
_, max_val_diff, _, _ = cv2.minMaxLoc(res)
res2 = cv2.matchTemplate(img_12_shifted, img_12[5:-5, 5:-5], cv2.TM_CCOEFF_NORMED)
_, max_val_same, _, _ = cv2.minMaxLoc(res2)
print("\nTM_CCOEFF_NORMED (12 vs 37):", max_val_diff)
print("TM_CCOEFF_NORMED (12 vs 12_shifted):", max_val_same)

View File

@@ -0,0 +1,75 @@
import cv2
import numpy as np
import easyocr
import re
from youtube_tab_to_pdf import TemporalTracker
cap = cv2.VideoCapture(r"C:\Users\Certes\Desktop\guitar_score\output\shintakarajima.mp4")
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = cap.get(cv2.CAP_PROP_FPS)
tracker = TemporalTracker()
cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
ret, check_frame = cap.read()
from youtube_tab_to_pdf import _find_white_tab_strip
bounds = _find_white_tab_strip(cv2.resize(check_frame, (1280, int(check_frame.shape[0] * (1280/check_frame.shape[1])))))
if bounds:
crop_top = max(0, bounds[0] - 60)
crop_bottom = bounds[1]
tracker.set_crop(crop_top, crop_bottom)
# Process only first 95 seconds to get unique pages
print("Extracting unique pages from first 95 seconds...")
tracker.process_video(cap, start_sec=0, end_sec=95)
unique_pages = tracker.get_unique_pages()
print(f"Extracted {len(unique_pages)} unique pages.")
# Try easyOCR
reader = easyocr.Reader(['en'], gpu=False)
def extract_measure_number(page_bgr):
    """OCR the number printed just above the first bar line of a page.

    Only the left-most 1000 columns are analysed. Uses the module-level
    easyocr ``reader``.

    Returns:
        The first run of digits in the first OCR result as an int, or -1 when
        no staff, bar line, number box or digits could be found.
    """
    cw = min(page_bgr.shape[1], 1000)
    page_gray = cv2.cvtColor(page_bgr[:, :cw], cv2.COLOR_BGR2GRAY)
    # Inverted mask: pixels at or below 200 become 255.
    _, bin_inv = cv2.threshold(page_gray, 200, 255, cv2.THRESH_BINARY_INV)

    # Staff rows: rows where more than 40% of the width is marked.
    row_sums = np.sum(bin_inv, axis=1) / 255.0
    staff_rows = np.where(row_sums > cw * 0.4)[0]
    if len(staff_rows) < 6:
        return -1
    staff_y_top, staff_y_bottom = staff_rows[0], staff_rows[-1]
    # Keep only staff rows within 100px of the top one.
    for r in staff_rows:
        if r - staff_y_top > 100:
            break
        staff_y_bottom = r

    # Bar lines: columns marked over at least 80% of the staff height.
    expected_h = max(10, staff_y_bottom - staff_y_top + 1)
    staff_region = bin_inv[staff_y_top:staff_y_bottom+1, :]
    col_sums = np.sum(staff_region, axis=0) / 255.0
    bar_xs = np.where(col_sums >= expected_h * 0.8)[0]
    if len(bar_xs) == 0:
        return -1
    x_bar = bar_xs[0]

    # Number box: up to 25 rows above the staff, 35 columns from the bar.
    box_y1 = max(0, staff_y_top - 25)
    box_y2 = staff_y_top
    box_x1 = x_bar
    box_x2 = min(page_gray.shape[1], x_bar + 35)
    num_box = page_gray[box_y1:box_y2, box_x1:box_x2]
    if num_box.size == 0:
        # Staff starts at the very top row: there is no room for a number,
        # and the OpenCV calls below cannot take an empty image.
        return -1

    # Plain binary threshold; identical to inverting a THRESH_BINARY_INV mask.
    _, num_for_ocr = cv2.threshold(num_box, 200, 255, cv2.THRESH_BINARY)
    upscaled = cv2.resize(num_for_ocr, None, fx=4, fy=4, interpolation=cv2.INTER_CUBIC)
    padded = cv2.copyMakeBorder(upscaled, 20, 20, 20, 20, cv2.BORDER_CONSTANT, value=[255, 255, 255])
    results = reader.readtext(padded, allowlist="0123456789")
    if not results:
        return -1
    digits = re.findall(r'\d+', results[0][1])
    return int(digits[0]) if digits else -1
for i, page in enumerate(unique_pages):
num = extract_measure_number(page)
print(f"Page {i:02d}: {num}")

BIN
scripts/debug/test_m1.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

BIN
scripts/debug/test_m2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

View File

@@ -0,0 +1,43 @@
import cv2
import numpy as np
from youtube_tab_to_pdf import _find_white_tab_strip, _detect_measure_bars, _extract_print_channel
def get_clean_binary(img):
    """Return a 0/255 mask of pixels whose brightest channel exceeds 200."""
    brightest_channel = np.max(img, axis=2)
    return cv2.threshold(brightest_channel, 200, 255, cv2.THRESH_BINARY)[1]
cap = cv2.VideoCapture(r"output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")
cap.set(cv2.CAP_PROP_POS_FRAMES, 50)
ret, f1 = cap.read()
cap.set(cv2.CAP_PROP_POS_FRAMES, 65) # Next second
ret, f2 = cap.read()
cap.release()
def process(frame):
    """Crop the second bar-delimited segment of the tab strip in *frame*.

    Raises:
        ValueError: if the frame is missing, no tab strip is found, or no
            measure bar is detected (so a second segment does not exist).
    """
    if frame is None:
        raise ValueError("frame is None (the video read probably failed)")
    s = _find_white_tab_strip(frame)
    if not s:
        raise ValueError("no tab strip found in frame")
    crop = frame[s[0]:s[1], :]
    gray = _extract_print_channel(crop)
    bars = _detect_measure_bars(gray)
    if not bars:
        raise ValueError("no measure bars detected; cannot isolate a measure")
    coords = [0] + bars + [crop.shape[1]]
    # Get M2 just in case M1 is a clef
    return crop[:, coords[1]:coords[2]]
m1 = process(f1)
m2 = process(f2)
cv2.imwrite("test_m1.png", m1)
cv2.imwrite("test_m2.png", m2)
bin1 = get_clean_binary(m1)
bin2 = get_clean_binary(m2)
h = min(bin1.shape[0], bin2.shape[0])
w = min(bin1.shape[1], bin2.shape[1])
s1 = bin1[:h, :w]
s2 = bin2[:h, :w]
diff = cv2.absdiff(s1, s2)
error_ratio = np.sum(diff > 0) / s1.size
print(f"Error Ratio: {error_ratio:.4f}")

View File

@@ -0,0 +1,25 @@
import cv2
import numpy as np
def find_measure_boundaries(img_bgr, max_width=1280):
    """Return the x position at which to cut *img_bgr* after its last bar line.

    Looks for vertical strokes at least 40px tall within rows 50:160 and
    returns the right-most qualifying column at or below ``max_width - 15``,
    plus 10. Falls back to ``max_width`` when there is none.
    """
    grayscale = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    ink_mask = cv2.threshold(grayscale, 180, 255, cv2.THRESH_BINARY_INV)[1]
    # Opening with a 1x40 kernel keeps only tall vertical runs.
    tall_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
    tall_strokes = cv2.morphologyEx(ink_mask[50:160, :], cv2.MORPH_OPEN, tall_kernel)
    ink_per_column = np.sum(tall_strokes, axis=0) / 255
    bar_columns = np.where(ink_per_column > 30)[0]
    # np.where returns ascending indices, so the last match is the right-most.
    usable = bar_columns[bar_columns <= max_width - 15]
    if usable.size == 0:
        return max_width
    return usable[-1] + 10
if __name__ == "__main__":
img = cv2.imread(r'C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\raw_super_block.png')
for w_cap in [1280, 2000, 2560]:
cw = min(w_cap, img.shape[1])
cut_x = find_measure_boundaries(img[:, :cw], cw)
print(f"Max {cw} => Cut at {cut_x}")
out = img[:, :cw].copy()
cv2.line(out, (cut_x, 0), (cut_x, out.shape[0]), (0, 0, 255), 2)
cv2.imwrite(r'C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\slice_'+str(w_cap)+'.png', out)

View File

@@ -0,0 +1,82 @@
import cv2
import numpy as np
import glob
from dataclasses import dataclass
from typing import List, Tuple
@dataclass
class MeasureBound:
    """Pixel rectangle of one measure: horizontal span plus staff rows."""

    # Horizontal extent, taken from neighbouring bar positions.
    x_start: int
    x_end: int
    # Vertical extent, shared by every measure found in the same frame.
    y_top: int
    y_bottom: int
class GridParser:
    """Find staff rows and measure boundaries in a single BGR frame."""

    def __init__(self, frame: np.ndarray):
        """Store *frame* and a 0/255 mask of its pixels brighter than 200."""
        self.frame = frame
        grayscale = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        self.binary = cv2.threshold(grayscale, 200, 255, cv2.THRESH_BINARY)[1]
        self.h, self.w = self.binary.shape

    def find_staff_y_bounds(self) -> Tuple[int, int]:
        """Return (top, bottom) rows of the first staff group, padded by 5px.

        Returns ``(0, 0)`` when no row has horizontal runs covering more than
        40% of the frame width.
        """
        # Opening with a 100x1 kernel keeps only long horizontal runs.
        wide_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (100, 1))
        long_runs = cv2.morphologyEx(self.binary, cv2.MORPH_OPEN, wide_kernel)
        marked_per_row = np.sum(long_runs, axis=1) / 255
        staff_rows = np.where(marked_per_row > self.w * 0.4)[0]
        if len(staff_rows) == 0:
            return 0, 0

        y_top = int(staff_rows[0])
        y_bottom = y_top
        # Extend downwards until the next staff row is more than 150px away.
        for row in staff_rows:
            if row - y_bottom > 150:
                break
            y_bottom = int(row)
        return max(0, y_top - 5), min(self.h, y_bottom + 5)

    def find_measure_bounds(self) -> List[MeasureBound]:
        """Return one MeasureBound per gap of at least 40px between bars."""
        y_top, y_bottom = self.find_staff_y_bounds()
        if y_bottom - y_top < 20:
            return []
        staff_height = y_bottom - y_top

        # Opening with a 1x30 kernel keeps only vertical runs at least 30px tall.
        tall_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 30))
        staff_mask = self.binary[y_top:y_bottom, :]
        tall_runs = cv2.morphologyEx(staff_mask, cv2.MORPH_OPEN, tall_kernel, iterations=1)

        # A column counts as a bar when more than 40% of the staff height is marked.
        marked_per_column = np.sum(tall_runs, axis=0) / 255
        bar_columns = np.where(marked_per_column > staff_height * 0.4)[0]

        clean_bars = []
        for col in bar_columns:
            # Columns within 20px of the last kept bar belong to the same bar.
            if clean_bars and col - clean_bars[-1] <= 20:
                continue
            clean_bars.append(int(col))

        # Treat the frame edges as boundaries when no bar is within 50px of them.
        if not clean_bars or clean_bars[0] > 50:
            clean_bars.insert(0, 0)
        if clean_bars[-1] < self.w - 50:
            clean_bars.append(self.w)

        return [
            MeasureBound(left, right, y_top, y_bottom)
            for left, right in zip(clean_bars, clean_bars[1:])
            if right - left >= 40
        ]
if __name__ == "__main__":
videos = glob.glob('output/*.mp4')
cap = cv2.VideoCapture(videos[0])
cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
ret, frame = cap.read()
cap.release()
parser = GridParser(frame)
measures = parser.find_measure_bounds()
print(f"Measures: {[(m.x_start, m.x_end) for m in measures]}")

View File

@@ -0,0 +1,48 @@
import cv2
import numpy as np
import glob
video_path = glob.glob('output/*.mp4')[0]
cap = cv2.VideoCapture(video_path)
cap.set(cv2.CAP_PROP_POS_FRAMES, 500) # jump to a frame with chords and hand
ret, frame = cap.read()
cap.release()
if not ret:
print("Cannot read video frame.")
exit()
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY) # White text, black background
# Morphological horizontal line detection
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
detect_horizontal = cv2.morphologyEx(binary, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
# Morphological vertical line detection
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 30))
detect_vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
# Find staves
row_sums = np.sum(detect_horizontal, axis=1) / 255
y_staves = np.where(row_sums > binary.shape[1] * 0.4)[0]
if len(y_staves) > 0:
print(f"Top staff line Y: {y_staves[0]}")
print(f"Bottom staff line Y: {y_staves[-1]}")
# Restrict vertical detection to within the staff lines
staff_crop = detect_vertical[y_staves[0]:y_staves[-1], :]
col_sums = np.sum(staff_crop, axis=0) / 255
bars = np.where(col_sums > (y_staves[-1] - y_staves[0]) * 0.6)[0]
# Filter bars that are too close (thickness)
clean_bars = []
for x in bars:
if not clean_bars or x - clean_bars[-1] > 10:
clean_bars.append(x)
print(f"Measure bars X: {clean_bars}")
else:
print("No staves detected.")
cv2.imwrite("C:/Users/Certes/Desktop/guitar_score/debug_morph_horiz.png", detect_horizontal)
cv2.imwrite("C:/Users/Certes/Desktop/guitar_score/debug_morph_vert.png", detect_vertical)

View File

@@ -0,0 +1,33 @@
import cv2
import numpy as np
from youtube_tab_to_pdf import extract_unique_scroll
# Debug script: crop the 25 px band directly above the first staff line of
# fast_test_pano.jpg (where the measure numbers are expected) and count the
# columns that contain ink.
img = cv2.imread('fast_test_pano.jpg', cv2.IMREAD_GRAYSCALE)
if img is None:
    # cv2.imread signals failure by returning None instead of raising.
    print("Could not read fast_test_pano.jpg")
    raise SystemExit(1)

# Dark ink becomes 255 so that row sums count ink pixels.
_, bin_inv = cv2.threshold(img, 200, 255, cv2.THRESH_BINARY_INV)
row_sums = np.sum(bin_inv, axis=1) / 255.0
# Staff rows: more than 40% of the row is ink.
staff_rows = np.where(row_sums > img.shape[1] * 0.4)[0]
if len(staff_rows) >= 6:
    staff_y_top = staff_rows[0]
else:
    # Fallback guess when no full staff was detected.
    staff_y_top = int(img.shape[0] * 0.3)

# Number band: the 25 rows directly above the top staff line.
band_y_top = max(0, staff_y_top - 25)
band_y_bottom = staff_y_top
band = img[band_y_top:band_y_bottom, :]
if band.size == 0:
    # The staff starts at row 0, so nothing lies above it; imwrite would fail.
    print("Number band is empty (staff touches the top edge).")
    raise SystemExit(1)

# Save it to see if it correctly contains the numbers
cv2.imwrite('debug_band.png', band)
print(f"Band shape: {band.shape}")

# Columns whose inverted intensity adds up to more than ~5 fully dark pixels
# are treated as containing part of a number.
band_inv = cv2.bitwise_not(band)
col_sums = np.sum(band_inv, axis=0) / 255.0
number_xs = np.where(col_sums > 5)[0]  # at least 5 pixels of ink vertically
print(f"Pixels with numbers: {len(number_xs)}")

View File

@@ -0,0 +1,44 @@
import cv2
import easyocr
import numpy as np
from youtube_tab_to_pdf import _extract_print_channel, _detect_measure_bars
# Debug script: detect measure bars in one frame and OCR the small region
# above each bar where the measure number is expected.
cap = cv2.VideoCapture(r"output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")
cap.set(cv2.CAP_PROP_POS_FRAMES, 50)  # 1.6 seconds in
ret, frame = cap.read()
# Only one frame is needed, so free the decoder right away.
cap.release()
if not ret:
    print("Cannot read video frame.")
    raise SystemExit(1)

# Per-pixel channel maximum, then keep only bright pixels.
gray = np.max(frame, axis=2)
_, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)

# First row in which more than half of the pixels are bright; falls back to
# y=100 when no such row exists.
row_sums = np.sum(thresh, axis=1) / 255
staff_lines = np.where(row_sums > frame.shape[1] * 0.5)[0]
y_staff = staff_lines[0] if len(staff_lines) > 0 else 100

bar_coords = _detect_measure_bars(thresh)
print(f"Detected Bars at X: {bar_coords}")

reader = easyocr.Reader(['en'], gpu=False)
for idx, x_bar in enumerate(bar_coords):
    # Crop the tiny region above the bar where the number should be
    crop_y1 = max(0, y_staff - 25)
    crop_y2 = max(0, y_staff - 2)
    crop_x1 = max(0, x_bar - 5)
    crop_x2 = min(frame.shape[1], x_bar + 25)
    if crop_y2 <= crop_y1 or crop_x2 <= crop_x1:
        continue
    sprite = frame[crop_y1:crop_y2, crop_x1:crop_x2]
    cv2.imwrite(f"debug_sprite_{idx}.png", sprite)

    # Scale up for better OCR
    scaled = cv2.resize(sprite, (0, 0), fx=3, fy=3, interpolation=cv2.INTER_CUBIC)
    gray_sprite = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
    # The binarised sprite is only dumped for inspection; OCR runs on the
    # grayscale sprite.
    _, binary_sprite = cv2.threshold(gray_sprite, 180, 255, cv2.THRESH_BINARY_INV)
    cv2.imwrite(f"debug_sprite_bin_{idx}.png", binary_sprite)

    res = reader.readtext(gray_sprite, allowlist='0123456789')
    print(f"Bar {idx} X={x_bar} OCR: {res}")

View File

@@ -0,0 +1,74 @@
import cv2
import pickle
import numpy as np
import easyocr
import time
import re
# CPU-only EasyOCR reader; extract_measure_number() below reads it as a global.
reader = easyocr.Reader(['en'], gpu=False)
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# load a unique_pages.pkl that was produced locally by this project.
with open('unique_pages.pkl', 'rb') as f:
    unique_pages = pickle.load(f)
print(f"Loaded {len(unique_pages)} chunks. Running OCR on jump-cut boundaries...")
def extract_measure_number(page_bgr):
    """Return the measure number printed above the first bar line, or -1.

    Only the left part of the page (at most 1000 px wide) is inspected. The
    staff is located from dark horizontal rows, the first bar line from dark
    columns inside the staff, and the box directly above that bar line (25 px
    high, 35 px wide) is passed to the module-level EasyOCR ``reader``.

    Args:
        page_bgr: BGR image of one page (dark ink on a light background).

    Returns:
        The first run of digits recognised in the box as an int, or -1 when
        no staff, no bar line, no box or no digits could be found.
    """
    cw = min(page_bgr.shape[1], 1000)
    page_gray = cv2.cvtColor(page_bgr[:, :cw], cv2.COLOR_BGR2GRAY)
    # Dark ink becomes 255 so that sums count ink pixels.
    _, bin_inv = cv2.threshold(page_gray, 200, 255, cv2.THRESH_BINARY_INV)

    # Staff rows: more than 40% of the inspected width is ink.
    row_sums = np.sum(bin_inv, axis=1) / 255.0
    staff_rows = np.where(row_sums > cw * 0.4)[0]
    if len(staff_rows) < 6:
        return -1
    staff_y_top = staff_rows[0]
    # Limit the staff to rows within 100 px of its top so that a second staff
    # further down the page is not merged into this one.
    staff_y_bottom = staff_rows[staff_rows - staff_y_top <= 100][-1]

    # A bar line is a column that is ink for at least 80% of the staff height.
    expected_h = max(10, staff_y_bottom - staff_y_top + 1)
    staff_region = bin_inv[staff_y_top:staff_y_bottom + 1, :]
    col_sums = np.sum(staff_region, axis=0) / 255.0
    bar_xs = np.where(col_sums >= expected_h * 0.8)[0]
    if len(bar_xs) == 0:
        return -1
    x_bar = bar_xs[0]

    box_y1 = max(0, staff_y_top - 25)
    box_y2 = staff_y_top
    box_x1 = x_bar
    box_x2 = min(page_gray.shape[1], x_bar + 35)
    num_box = page_gray[box_y1:box_y2, box_x1:box_x2]
    if num_box.size == 0:
        # The staff touches the top edge, so there is no room for a number;
        # cv2.threshold would fail on an empty image.
        return -1

    # Preprocess for OCR: binarise, then restore black text on a white
    # background, which is what EasyOCR is given here.
    _, num_inv = cv2.threshold(num_box, 200, 255, cv2.THRESH_BINARY_INV)
    num_for_ocr = cv2.bitwise_not(num_inv)
    upscaled = cv2.resize(num_for_ocr, None, fx=4, fy=4, interpolation=cv2.INTER_CUBIC)
    padded = cv2.copyMakeBorder(upscaled, 20, 20, 20, 20, cv2.BORDER_CONSTANT, value=[255, 255, 255])

    # Take the first detection that actually contains digits instead of
    # giving up when the first detection happens to be empty.
    for detection in reader.readtext(padded, allowlist="0123456789"):
        digits = re.findall(r'\d+', detection[1])
        if digits:
            return int(digits[0])
    return -1
# Run the measure-number OCR on every page, timing each call, and collect
# the detected numbers in page order.
results = []
for page_idx, page_img in enumerate(unique_pages):
    started = time.time()
    measure_no = extract_measure_number(page_img)
    elapsed = time.time() - started
    print(f"Page {page_idx:02d}: {measure_no} (took {elapsed:.2f}s)")
    results.append(measure_no)
print(f"Sequential Detections: {results}")

View File

@@ -0,0 +1,137 @@
import cv2
import numpy as np
import time
from pathlib import Path
def stitch_scrolling_video(video_path, start_sec=0, duration_sec=100, fps_sample_rate=15,
                           y_start=103, y_end=435):
    """Stitch a horizontally scrolling tab ribbon into one wide panorama.

    Frames are sampled at roughly ``fps_sample_rate`` frames per second,
    resized to 1280 px wide and cropped to rows ``y_start:y_end``. The
    horizontal shift between consecutive sampled frames is estimated by
    template matching on vertical-edge maps, and the columns that newly
    entered from the right are appended to the panorama.

    Args:
        video_path: Path of the video file.
        start_sec: Position at which to start reading, in seconds.
        duration_sec: Length of video to process, in seconds.
        fps_sample_rate: Approximate number of frames per second to process.
        y_start: Top row of the ribbon in the 1280 px wide frame.
        y_end: Bottom row (exclusive) of the ribbon in the 1280 px wide frame.

    Returns:
        The stitched BGR panorama, or None when no frame could be read.
    """
    # Matching geometry: a 100 px wide template taken at x=600 from the
    # previous frame is searched in x=550..710 of the current frame, so a
    # match at offset 50 inside the search region means "no movement".
    template_x0, template_x1 = 600, 700
    search_x0, search_x1 = 550, 710
    zero_offset = template_x0 - search_x0

    cap = cv2.VideoCapture(video_path)
    strips = []
    prev_edges = None
    try:
        video_fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        frame_skip = max(1, int(video_fps / fps_sample_rate))
        start_frame = int(start_sec * video_fps)
        max_frames = int(duration_sec * video_fps)
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        # `count` numbers every frame that is read. The original only advanced
        # it on skipped frames, so it stayed at 0 and neither the sampling
        # rate nor the duration limit ever took effect.
        for count in range(max_frames):
            ret, frame = cap.read()
            if not ret:
                break
            # We only process every `frame_skip` frames
            if count % frame_skip != 0:
                continue

            scale = 1280 / frame.shape[1]
            frame_resized = cv2.resize(frame, (1280, int(frame.shape[0] * scale)))
            # Crop to the ribbon
            ribbon = frame_resized[y_start:y_end, :]
            gray = cv2.cvtColor(ribbon, cv2.COLOR_BGR2GRAY)

            # dx=1, dy=0 computes the horizontal gradient, which highlights
            # vertical edges (note stems, bar lines) and suppresses the
            # horizontal staff lines that would otherwise alias the match.
            edges = np.abs(cv2.Sobel(gray, cv2.CV_32F, 1, 0, ksize=3))

            if prev_edges is None:
                # First frame is the initial panorama
                strips.append(ribbon.copy())
                prev_edges = edges
                continue

            template = prev_edges[:, template_x0:template_x1]
            search_region = edges[:, search_x0:search_x1]
            res = cv2.matchTemplate(search_region, template, cv2.TM_CCOEFF_NORMED)
            _, max_val, _, max_loc = cv2.minMaxLoc(res)

            # A match left of the zero offset means the content moved left,
            # i.e. shift_x > 0 new columns entered on the right.
            shift_x = zero_offset - max_loc[0]
            if count < 30:  # Print first few shifts
                print(f"Frame {count}: dx={shift_x}, max_val={max_val:.3f}")

            if 0 < shift_x < 300:  # Sanity check to ignore massive glitches
                # The new content is the rightmost shift_x columns.
                strips.append(ribbon[:, -shift_x:])

            # NOTE(review): the flattened source does not show whether this
            # update sat inside the acceptance branch above; it is applied to
            # every processed frame here.
            prev_edges = edges
    finally:
        cap.release()

    if not strips:
        return None
    # Concatenate once at the end instead of re-copying the panorama per frame.
    return np.hstack(strips)
def slice_panorama_to_a4(panorama, slice_width=1280):
    """Cut a wide panorama into fixed-width strips and stack them vertically.

    The last strip is padded on the right with white (255) when it is
    narrower than ``slice_width``. Both colour (H, W, C) and grayscale (H, W)
    panoramas are accepted, and the padding uses the panorama's own dtype.

    Args:
        panorama: Image array of shape (H, W) or (H, W, C).
        slice_width: Width of every output row in pixels.

    Returns:
        Array with ``ceil(W / slice_width) * H`` rows and ``slice_width``
        columns.

    Raises:
        ValueError: If ``slice_width`` is not positive, or if the panorama
            has zero width (nothing to stack).
    """
    if slice_width <= 0:
        raise ValueError("slice_width must be positive")

    total_width = panorama.shape[1]
    rows = []
    for start_x in range(0, total_width, slice_width):
        chunk = panorama[:, start_x:start_x + slice_width]
        missing = slice_width - chunk.shape[1]
        if missing > 0:
            # Pad the last chunk with white; the trailing shape keeps the
            # channel axis when there is one.
            pad_shape = (chunk.shape[0], missing) + chunk.shape[2:]
            pad = np.full(pad_shape, 255, dtype=panorama.dtype)
            chunk = np.hstack([chunk, pad])
        rows.append(chunk)
    return np.vstack(rows)
if __name__ == "__main__":
    video_path = "output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4"
    if not Path(video_path).exists():
        # Fall back to the first mp4 directly under output/, if there is one.
        fallback = next(Path("output").glob("*.mp4"), None)
        if fallback is not None:
            video_path = str(fallback)

    print("Stitching...")
    start_t = time.time()
    panorama = stitch_scrolling_video(video_path, start_sec=0, duration_sec=100, fps_sample_rate=15)
    elapsed = time.time() - start_t

    # Check for None before touching .shape: the stitcher returns None when
    # no frame could be read.
    if panorama is not None:
        print(f"Extraction took {elapsed:.2f}s. Panorama shape: {panorama.shape}")
        final_sheet = slice_panorama_to_a4(panorama, slice_width=1280)
        out_path = "C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/verify_panorama.png"
        cv2.imwrite(out_path, final_sheet)
        print(f"Saved stacked result to {out_path} with shape {final_sheet.shape}")
    else:
        print("Failed to generate panorama.")

View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""Test the pipeline with locally cached mp4 files (download skipped).

1080p download mode: python test_pipeline.py --download
"""
import sys
import os
from pathlib import Path
import importlib.util
import argparse
import gc
# Load youtube_tab_to_pdf.py from the directory of this script and expose it
# under the name `pipeline`.
spec = importlib.util.spec_from_file_location(
    "pipeline", str(Path(__file__).parent / "youtube_tab_to_pdf.py"))
pipeline = importlib.util.module_from_spec(spec)
spec.loader.exec_module(pipeline)
# YouTube URLs used for testing (trailing comments are the song titles)
TEST_URLS = {
    "video_1": "https://www.youtube.com/watch?v=x76IMSvWR0o", # 晴る
    "video_2": "https://www.youtube.com/watch?v=90BWvJY6KbE", # 新宝島
    "video_3": "https://www.youtube.com/watch?v=Ri9g4lwnrJQ", # 空奏列車
}
def test_video(mp4_path: Path, label: str):
    """Run the pipeline on one local video file, without downloading.

    Extracts frames, detects the layout pattern, extracts the unique frames
    with the matching extractor and writes ``output/test_<label>.pdf``.

    Returns:
        Tuple ``(pattern, number_of_unique_frames)``.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"테스트: {label}")
    print(f"파일: {mp4_path.name}")
    print(f"{banner}")

    out_root = Path("output")
    frame_dump_dir = out_root / "debug_frames" / label
    frame_dump_dir.mkdir(parents=True, exist_ok=True)

    # Step 2: frame extraction
    frames = pipeline.extract_frames(mp4_path)

    # Step 3: pattern detection
    pattern = pipeline.detect_pattern(frames)

    # Step 4: unique-frame extraction; anything other than scroll/split is
    # handled by the overlay extractor.
    extractor_name = (
        "extract_unique_scroll" if pattern == "scroll"
        else "extract_unique_split" if pattern == "split"
        else "extract_unique_overlay"
    )
    unique = getattr(pipeline, extractor_name)(frames)

    # Step 5: PDF generation
    pipeline.generate_pdf(unique, out_root / f"test_{label}.pdf", debug_dir=frame_dump_dir)

    unique_count = len(unique)
    print(f"\n결과: {pattern} / {unique_count}개 고유 프레임")
    return pattern, unique_count
def download_test_videos():
    """Download every entry of TEST_URLS into ``output/``.

    A failure for one video is reported and does not stop the remaining
    downloads.
    """
    target_dir = Path("output")
    target_dir.mkdir(exist_ok=True)
    for label in TEST_URLS:
        print(f"\n--- {label} 다운로드 ---")
        try:
            downloaded = pipeline.download_video(TEST_URLS[label], target_dir)
            saved_path, _title = downloaded
            print(f" → 완료: {saved_path.name}")
        except Exception as err:
            print(f" → 실패: {err}")
def main():
    """Command-line entry point.

    With ``--download`` the test videos are downloaded and the program
    returns. Otherwise every ``output/*.mp4`` is run through test_video()
    and a summary table is printed; exits with status 1 when no mp4 exists.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--download", action="store_true",
                            help="1080p로 테스트 영상 다운로드")
    if arg_parser.parse_args().download:
        download_test_videos()
        return

    cached = sorted(Path("output").glob("*.mp4"))
    if not cached:
        print("테스트할 영상(mp4)이 output 폴더에 없습니다.")
        print(" → python test_pipeline.py --download 로 영상 다운로드")
        sys.exit(1)

    print(f"캐시된 영상 {len(cached)}개 발견:")
    for video in cached:
        size_mb = video.stat().st_size / 1024 / 1024
        print(f" - {video.name} ({size_mb:.1f} MB)")

    results = {}
    for number, video in enumerate(cached, start=1):
        label = f"video_{number}"
        pattern, count = test_video(video, label)
        results[label] = (video.name, pattern, count)
        gc.collect()  # release the memory held by the 1080p frames

    rule = '=' * 60
    print(f"\n{rule}")
    print("전체 결과 요약:")
    print(f"{rule}")
    for label, summary in results.items():
        name, pattern, count = summary
        print(f" {label}: {pattern:8s}{count:4d}개 프레임 | {name[:40]}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,57 @@
import cv2
import numpy as np
import time
import glob
from video_cv_tracker import TemporalTracker
from score_extractor import ScoreExtractor
def test_pipeline():
    """Run tracker + score extractor on the first cached video and dump a PNG.

    Feeds up to 3000 frames (cropped to the lower 70% of the picture) into a
    TemporalTracker, passes the resulting unique pages to a ScoreExtractor,
    tiles the result into 1280 px wide rows and writes the inverted image to
    debug_final_state_machine.png. Returns silently when no video is cached.
    """
    found = glob.glob('output/*.mp4')
    if not found:
        return

    capture = cv2.VideoCapture(found[0])

    # 1. The tracker collects the unique pages.
    tracker = TemporalTracker(diff_threshold=0.05)
    frame_budget = 3000
    tracking_started = time.time()
    for processed in range(1, frame_budget + 1):
        ok, frame = capture.read()
        if not ok:
            break
        # Drop the top 30% of the frame; only the lower part is handed to
        # the tracker, which also reduces the processing load.
        height = frame.shape[0]
        tracker.process_frame(frame[int(height * 0.3):height, :])
        if processed % 300 == 0:
            print(f"Processed {processed} frames...")
    capture.release()

    unique_pages = tracker.get_unique_pages()
    print(f"Tracker returned {len(unique_pages)} unique structural median pages. Took {time.time()-tracking_started:.2f}s")

    # 2. The score extractor turns the pages into tiled rows.
    extraction_started = time.time()
    extractor = ScoreExtractor()
    extractor.process_pages(unique_pages)
    tiled_rows = extractor.tile_to_a4(chunk_width=1280)
    print(f"Extraction & Tiling took {time.time()-extraction_started:.2f}s")

    if not tiled_rows:
        print("Failed to produce rows.")
        return

    # Stack the rows and invert back to black-on-white PDF format.
    sheet = cv2.bitwise_not(np.vstack(tiled_rows))
    cv2.imwrite("C:/Users/Certes/Desktop/guitar_score/debug_final_state_machine.png", sheet)
    print("Wrote debug_final_state_machine.png")


if __name__ == "__main__":
    test_pipeline()

View File

@@ -0,0 +1,36 @@
import cv2
import numpy as np
# Debug script: stitch raw_chunk_01 onto raw_chunk_00 by locating the left
# edge of chunk 01 inside chunk 00, matching on the top 60 rows only.
img0 = cv2.imread(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\raw_chunk_00.png")
img1 = cv2.imread(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\raw_chunk_01.png")
if img0 is None or img1 is None:
    # cv2.imread signals failure by returning None instead of raising.
    print("Could not read raw_chunk_00.png / raw_chunk_01.png")
    raise SystemExit(1)

gray0 = cv2.cvtColor(img0, cv2.COLOR_BGR2GRAY)
gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
w = gray0.shape[1]

# The first 400 px of img1 is our template
template_w = 400
template = gray1[:60, :template_w]  # ONLY TOP 60 PIXELS
ref = gray0[:60, :]  # ONLY TOP 60 PIXELS

# Find where 'template' is in 'gray0'
res = cv2.matchTemplate(ref, template, cv2.TM_CCOEFF_NORMED)
_, max_val, _, max_loc = cv2.minMaxLoc(res)
print(f"Match value (Top 60px): {max_val:.3f}")

if max_val > 0.8:
    # Column of img0 at which img1 begins; everything from there to the right
    # edge of img0 is shared by both chunks.
    match_x_in_last = max_loc[0]
    overlap_len = w - match_x_in_last
    print(f"Overlap starts in last_chunk at x={match_x_in_last}.")
    print(f"Length of overlap is {overlap_len}px.")
    if overlap_len < w:
        # Append only the part of img1 that lies beyond the overlap.
        new_slice = img1[:, overlap_len:]
        stitched = np.hstack([img0, new_slice])
        cv2.imwrite(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\test_stitched_top60.png", stitched)
        print("Exported test_stitched_top60.png")
else:
    print("No valid overlap found.")

View File

@@ -0,0 +1,50 @@
import cv2
import numpy as np
import glob
# Debug script: take the temporal median of 30 consecutive frames and detect
# staff lines and measure bars on it.
videos = glob.glob('output/*.mp4')
if not videos:
    # Fail with a clear message instead of an IndexError on videos[0].
    print("No mp4 files found under output/.")
    raise SystemExit(1)
cap = cv2.VideoCapture(videos[0])

# Collect 30 continuous frames (about 1 second of video)
frames = []
cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
for _ in range(30):
    ret, frame = cap.read()
    if not ret:
        break
    frames.append(frame)
cap.release()

if len(frames) == 30:
    # 1. Temporal median: pixels that change between frames are replaced by
    #    their per-pixel median, which suppresses moving content.
    median_frame = np.median(frames, axis=0).astype(np.uint8)
    gray = np.max(median_frame, axis=2)
    _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)

    # 2. Staff lines: rows in which more than 40% of the pixels are bright.
    row_sums = np.sum(binary, axis=1) / 255
    y_staff = np.where(row_sums > binary.shape[1] * 0.4)[0]
    if len(y_staff) > 0:
        print(f"Pristine staff lines detected at: {y_staff}")
        y_top = y_staff[0]
        y_bottom = y_staff[-1]

        # 3. Vertical bars: columns bright over more than half the staff height.
        roi = binary[y_top:y_bottom, :]
        col_sums = np.sum(roi, axis=0) / 255
        staff_h = y_bottom - y_top
        bars = np.where(col_sums > staff_h * 0.5)[0]

        # Merge columns closer than 20 px into a single boundary.
        clean_bars = []
        for x in bars:
            if not clean_bars or x - clean_bars[-1] > 20:
                clean_bars.append(int(x))
        print(f"Pristine Measure Boundaries: {clean_bars}")

    cv2.imwrite("C:/Users/Certes/Desktop/guitar_score/debug_temporal_median.png", median_frame)
    cv2.imwrite("C:/Users/Certes/Desktop/guitar_score/debug_temporal_binary.png", binary)
else:
    print("Not enough frames.")

View File

@@ -0,0 +1,52 @@
import cv2
import numpy as np
def find_white_tab_bounds(video_path):
    """Locate the tallest band of bright rows in one frame of the video.

    The frame at index ``30 * fps`` is read, resized to 1280 px wide and
    converted to grayscale. Rows whose mean brightness exceeds 180 count as
    bright; the longest contiguous run of such rows is reported.

    Returns:
        ``(start_row, end_row, frame_height)`` with ``end_row`` exclusive
        (``(0, 0, frame_height)`` when no row is bright), or None when the
        frame cannot be read.
    """
    capture = cv2.VideoCapture(video_path)
    capture.set(cv2.CAP_PROP_POS_FRAMES, 30 * capture.get(cv2.CAP_PROP_FPS))
    ok, frame = capture.read()
    capture.release()
    if not ok:
        return None

    resize_factor = 1280 / frame.shape[1]
    frame = cv2.resize(frame, (1280, int(frame.shape[0] * resize_factor)))
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Staff lines and notes darken a row's mean, hence the fairly low
    # threshold of 180 rather than something close to 255.
    bright_rows = np.mean(gray, axis=1) > 180

    # Pad with False on both sides so that runs touching the image border
    # still produce a rising and a falling edge.
    edges = np.diff(np.concatenate(([False], bright_rows, [False])).astype(int))
    run_starts = np.where(edges == 1)[0]
    run_ends = np.where(edges == -1)[0]

    best_start, best_end = 0, 0
    if run_starts.size:
        # argmax returns the first longest run, like a strict ">" scan.
        longest = int(np.argmax(run_ends - run_starts))
        best_start, best_end = run_starts[longest], run_ends[longest]
    return best_start, best_end, frame.shape[0]
if __name__ == "__main__":
    # Report the bright band of the hard-coded sample video.
    video_path = "output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4"
    bounds = find_white_tab_bounds(video_path)
    if not bounds:
        print("Failed to find bound")
    else:
        s, e, h = bounds
        print(f"Mathematically found White Tab Strip: Y_START={s}, Y_END={e}. Total Height={h}")

Binary file not shown.

After

Width:  |  Height:  |  Size: 63 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 65 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 67 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 59 KiB

View File

@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""
Re-run simulation that verifies the three fixed bugs actually work.
It imports and uses the fixed functions from youtube_tab_to_pdf.py directly.
"""
import sys
from pathlib import Path
import cv2
import numpy as np
# On Windows, switch stdout/stderr to UTF-8 and replace unencodable characters.
# Presumably this keeps the Korean text and emoji printed by this script from
# raising UnicodeEncodeError on legacy console code pages.
if sys.platform == "win32":
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
    sys.stderr.reconfigure(encoding="utf-8", errors="replace")
# Import the main module (uses the fixed code)
sys.path.insert(0, str(Path(__file__).parent))
from youtube_tab_to_pdf import (
_find_white_tab_strip, _has_tab_content,
_detect_scroll_offset, _extract_tracking_channel,
_merge_scroll_candidates, merge_panoramas_list,
_detect_measure_bars, compare_frames
)
# Input frames are read from FRAME_DIR; verification images go to OUT_DIR.
FRAME_DIR = Path("output/temp_frames")
OUT_DIR = Path("output/sim_verify")
# parents=True so that a missing "output" folder does not abort the script
# with FileNotFoundError before main() can report that no frames exist.
OUT_DIR.mkdir(parents=True, exist_ok=True)
def main():
    """Re-run the stitching steps on cached frames and report three checks.

    Reads ``f_0*.png`` from FRAME_DIR, estimates the tab strip rows from the
    first 30 frames, de-duplicates the cropped strips, and then runs
    _merge_scroll_candidates, merge_panoramas_list and _detect_measure_bars.
    Intermediate panoramas are written to OUT_DIR and a pass/fail summary for
    BUG1-BUG3 is printed.
    """
    paths = sorted(FRAME_DIR.glob("f_0*.png"))
    if not paths:
        print("❌ 프레임 없음"); return

    print(f"[VERIFY] {len(paths)}개 프레임 — 수정된 코드로 재검증")

    # Strip Y range: median of the detections over the first 30 frames.
    tops, bots = [], []
    for p in paths[:30]:
        f = cv2.imread(str(p))
        if f is None: continue
        s = _find_white_tab_strip(f)
        if s: tops.append(s[0]); bots.append(s[1])
    if not tops:
        # np.median([]) is NaN and int(NaN) raises ValueError, so stop with a
        # message when no strip was detected in any sampled frame.
        print("❌ 탭 스트립 감지 실패"); return
    med_top = int(np.median(tops))
    med_bot = int(np.median(bots))
    print(f" 스트립 Y: {med_top}~{med_bot}")

    # De-duplication: a crop is dropped when its similarity to any crop kept
    # so far reaches THRESHOLD.
    THRESHOLD = 0.95
    candidates, compared = [], []
    for p in paths:
        f = cv2.imread(str(p))
        if f is None: continue
        h = f.shape[0]
        crop = f[max(0, med_top):min(h, med_bot), :]
        if not _has_tab_content(crop): continue
        # Comparison runs on a small fixed-size thumbnail.
        cmp_img = cv2.resize(crop, (480, 120), interpolation=cv2.INTER_AREA)
        if any(compare_frames(cmp_img, ref) >= THRESHOLD for ref in compared):
            continue
        candidates.append(crop)
        compared.append(cmp_img)
    print(f"\n[1] MSE 중복제거 후: {len(candidates)}개 후보")

    # ── BUG1 check: number of detected scene changes ─────────────────────
    print(f"\n[2] BUG1 검증 — 씬전환 감지 횟수 (기대: 1~3)")
    stitched = _merge_scroll_candidates(candidates)
    print(f" _merge_scroll_candidates 결과: {len(stitched)}개 세그먼트 → 파노라마")
    for i, s in enumerate(stitched):
        print(f" 세그먼트 파노라마 {i}: {s.shape[1]}px")
        cv2.imwrite(str(OUT_DIR / f"seg_pano_{i:02d}.png"), s)

    # ── BUG2 check: panorama merging ─────────────────────────────────────
    print(f"\n[3] BUG2 검증 — 파노라마 병합 (기대: 1~2개)")
    merged = merge_panoramas_list(stitched)
    print(f" merge_panoramas_list 결과: {len(merged)}개 최종 파노라마")
    for i, m in enumerate(merged):
        print(f" 최종 파노라마 {i}: {m.shape[1]}x{m.shape[0]}px")
        cv2.imwrite(str(OUT_DIR / f"final_pano_{i:02d}.png"), m)

    # ── BUG3 check: measure bar detection ────────────────────────────────
    print(f"\n[4] BUG3 검증 — 마디 구분선 탐지 (기대: 간격 모두 ≥100px)")
    total_measures = 0
    all_ok = True
    for i, m in enumerate(merged):
        gray = m[:, :, 2]  # red channel (index 2 of a BGR image)
        bars = _detect_measure_bars(gray)
        total_measures += max(0, len(bars) - 1)  # measures lie between bars
        print(f" 파노라마 {i}: {len(bars)}개 구분선 탐지", end="")
        if bars:
            gaps = [bars[j+1]-bars[j] for j in range(len(bars)-1)]
            min_gap = min(gaps) if gaps else 0
            ok = min_gap >= 100
            if not ok: all_ok = False
            print(f" | 최소간격: {min_gap}px {'' if ok else '❌ (오탐 여전히 존재)'}")
            print(f" 첫5개 좌표: {bars[:5]}")
        else:
            print()

    # ── Final verdict ────────────────────────────────────────────────────
    print(f"\n{'='*60}")
    print("[검증 결과]")
    seg_ok = len(stitched) <= 5  # at most 5 segments
    merge_ok = len(merged) <= 2  # at most 2 panoramas
    bar_ok = all_ok  # every bar gap is at least 100 px
    print(f" BUG1 씬전환 오탐: {'✅ 개선됨' if seg_ok else '❌ 여전히 과다'} ({len(stitched)}개 세그먼트, 이전 9개)")
    print(f" BUG2 파노라마 분리: {'✅ 개선됨' if merge_ok else '❌ 여전히 분리'} ({len(merged)}개, 이전 3개)")
    print(f" BUG3 마디선 오탐: {'✅ 개선됨' if bar_ok else '❌ 여전히 오탐'}")
    print(f" 탐지된 총 마디 수: {total_measures}")
    print(f"{'='*60}")
    if seg_ok and merge_ok and bar_ok:
        print("\n✅ 모든 버그 수정 확인 — 실제 파이프라인 실행 가능")
    else:
        print("\n⚠ 일부 문제 잔존 — 추가 파라미터 조정 필요")


if __name__ == "__main__":
    main()

Binary file not shown.

View File

@@ -0,0 +1,41 @@
import sys
sys.path.append(r"C:\Users\Certes\Desktop\guitar_score")
import cv2
import easyocr
import numpy as np
import os
from youtube_tab_to_pdf import extract_frames, extract_unique_scroll
# Debug script: run the scroll pipeline on one video, OCR the top of the
# first few chunks for measure numbers and save each inspected chunk.
video_file = r"C:\Users\Certes\Desktop\guitar_score\output\サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4"

print("Extracting frames...")
frames = extract_frames(video_file, fps=2)

print("Running pipeline extraction...")
unique = extract_unique_scroll(frames, threshold=0.95)

print("Initializing OCR...")
reader = easyocr.Reader(['en'])

print(f"Generated {len(unique)} chunks.")
detect_log = []
for i, page in enumerate(unique):
    # OCR only the top 200 rows of the chunk (fewer if the chunk is shorter).
    h, w = page.shape[:2]
    top_area = page[:min(200, h), :]
    results = reader.readtext(top_area)

    # Keep the digits of every detection; detections without digits are dropped.
    digit_strings = (''.join(filter(str.isdigit, text)) for _bbox, text, _prob in results)
    nums = [int(digits) for digits in digit_strings if digits]

    print(f"Page {i} measure numbers detected: {nums}")
    detect_log.append(nums)
    cv2.imwrite(f"output/verify_chunk_{i}.jpg", page)

    # Stop once the chunk with index 6 has been handled.
    if i > 5:
        break

View File

@@ -0,0 +1,55 @@
import cv2
import numpy as np
import glob
def get_number_sprite(m_img):
    """Crop the binarised region above the staff at the left edge of an image.

    The image is reduced to its per-pixel channel maximum and thresholded at
    200. The first row in which more than half of the pixels are bright is
    taken as the staff position (row 50 when there is none). The crop spans
    the rows from 60 px to 5 px above that position and the first 60 columns.

    Returns:
        The binary crop, or None when the crop is empty or contains fewer
        than 8 bright pixels.
    """
    brightest = m_img.max(axis=2)
    _, mask = cv2.threshold(brightest, 200, 255, cv2.THRESH_BINARY)

    bright_per_row = mask.sum(axis=1) / 255
    staff_candidates = np.where(bright_per_row > m_img.shape[1] * 0.5)[0]
    if len(staff_candidates) > 0:
        y_staff = staff_candidates[0]
    else:
        y_staff = 50

    top = max(0, y_staff - 60)
    bottom = max(0, y_staff - 5)
    left = 0
    right = min(60, m_img.shape[1])
    if not (bottom > top and right > left):
        return None

    sprite = mask[top:bottom, left:right]
    # Fewer than 8 bright pixels is treated as "no number here".
    if np.count_nonzero(sprite > 127) < 8:
        return None
    return sprite
# Debug script: split the final stacked sheet into 320 px high rows and save
# the region above the staff at the left edge of every row.
img_path = r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\final_check_100_sec.png"
img = cv2.imread(img_path)
if img is None:
    # cv2.imread signals failure by returning None instead of raising.
    print("Could not load image:", img_path)
    raise SystemExit(1)
h, w = img.shape[:2]

# Keep only the rows that contain at least one bright pixel.
rows = []
for y in range(0, h, 320):  # assuming chunk height is around 320
    chunk = img[y:y+320, :]
    if np.max(chunk) > 200:
        rows.append(chunk)
print(f"Detected {len(rows)} A4 rows in final image.")

for i, row in enumerate(rows):
    gray_row = np.max(row, axis=2)
    _, binary = cv2.threshold(gray_row, 200, 255, cv2.THRESH_BINARY)

    # Staff position: first image row in which more than 40% of the pixels
    # are bright.
    row_sums = np.sum(binary, axis=1) / 255
    staff_lines = np.where(row_sums > w * 0.4)[0]
    if len(staff_lines) > 0:
        y_staff = staff_lines[0]
        # Save the box above the staff (columns 10..80) where the number
        # sprite usually is.
        crop_y1 = max(0, y_staff - 60)
        crop_y2 = max(0, y_staff - 5)
        sprite = binary[crop_y1:crop_y2, 10:80]
        cv2.imwrite(f"C:/Users/Certes/Desktop/guitar_score/debug_ocr_measure_{i}.png", sprite)
        pixels = np.count_nonzero(sprite > 127)
        print(f"Row {i} parsed. Sprite white pixels: {pixels}")

View File

@@ -0,0 +1,32 @@
import cv2
import numpy as np
def img_to_ascii(img_path, target_width=120):
    """Print an image as ASCII art.

    The image is loaded as grayscale, resized to ``target_width`` columns
    (height halved to compensate for tall terminal characters) and every
    pixel is mapped to one of ten characters running from dark to light.

    Args:
        img_path: Path of the image file.
        target_width: Number of characters per output line.

    Returns:
        None. A message is printed when the image cannot be loaded.
    """
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        print("Could not load image:", img_path)
        return

    h, w = img.shape
    aspect_ratio = h / w
    # Terminal characters are roughly 2:1 height:width, so adjust aspect.
    # Clamp to one row: a very wide image would otherwise round down to a
    # height of 0, which cv2.resize rejects.
    target_height = max(1, int(target_width * aspect_ratio * 0.5))
    resized = cv2.resize(img, (target_width, target_height))

    # ASCII characters gradient from dark to light
    chars = np.array(["@", "%", "#", "*", "+", "=", "-", ":", ".", " "])
    # Map 0-255 to 0-9 for the whole image at once (truncating like int()).
    char_idx = (resized / 255.0 * 9).astype(int)
    for row in chars[char_idx]:
        print("".join(row))
if __name__ == "__main__":
print("=== debug_chunk_0.png ===")
img_to_ascii("C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/debug_chunk_0.png", 120)