chore(docs): document ScoreExtractor tiling and refactor debug scripts (#563)
31
scripts/debug/check_top.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
def img_to_ascii(image, max_w=120):
    """Print a grayscale ASCII-art rendering of an image to stdout.

    Args:
        image: Either a path string (loaded with ``cv2.imread``) or an
            already-decoded image array. Arrays with three dimensions are
            converted with ``cv2.COLOR_BGR2GRAY``; anything else is used as-is.
        max_w: Width of the rendering, in characters.

    Returns:
        None. Nothing is printed when the image is ``None`` (for example when
        ``cv2.imread`` could not read the given path).
    """
    if isinstance(image, str):
        image = cv2.imread(image)
    if image is None:
        return

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) == 3 else image
    h, w = gray.shape
    scale = max_w / w
    # Clamp to one row: a very wide, short image would otherwise request a
    # zero-height output, which cv2.resize rejects.
    out_h = max(1, int(h * scale))
    resized = cv2.resize(gray, (max_w, out_h))

    chars = " .:-=+*#%@"
    palette = np.array(list(chars))
    # Vectorised brightness -> palette index. The integer cast truncates
    # exactly like the per-pixel int() it replaces.
    levels = (resized / 255.0 * (len(chars) - 1)).astype(np.intp)
    for row in palette[levels]:
        print("".join(row))
|
||||
|
||||
if __name__ == "__main__":
    # Export the top-left 200x50 px corner of the first panorama chunk.
    src_path = "C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/verify_pano_chunk_00.png"
    dst_path = "C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/verify_first_measure.png"

    img = cv2.imread(src_path)
    if img is None:
        # cv2.imread returns None instead of raising when the file is unreadable.
        raise SystemExit(f"Could not read {src_path}")
    cv2.imwrite(dst_path, img[:50, :200])

    # The former second message named raw_frame_1920.png, which this script
    # never reads, so only the accurate message is printed.
    print("Exported verify_first_measure.png from verify_pano_chunk_00.png")
|
||||
|
||||
|
||||
BIN
scripts/debug/debug_121.png
Normal file
|
After Width: | Height: | Size: 546 KiB |
BIN
scripts/debug/debug_38.png
Normal file
|
After Width: | Height: | Size: 550 KiB |
19
scripts/debug/debug_blocks.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# Research Script for Debugging process_pages
|
||||
import cv2
|
||||
import pickle
|
||||
import os
|
||||
|
||||
# Folder that receives one PNG per extracted block.
out_dir = r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\debug_blocks"
os.makedirs(out_dir, exist_ok=True)

# NOTE: pickle can execute arbitrary code while loading; only load cache
# files that were produced locally.
with open('unique_pages.pkl', 'rb') as cache_file:
    unique_pages = pickle.load(cache_file)

from score_extractor import ScoreExtractor

# Run the extractor on the cached pages, then dump whatever it left in
# final_sheet_chunks.
extractor = ScoreExtractor()
extractor.process_pages(unique_pages)

block_no = 0
for block_img in extractor.final_sheet_chunks:
    target = os.path.join(out_dir, "block_%02d.png" % block_no)
    cv2.imwrite(target, block_img)
    block_no += 1

print("Dumped blocks!")
|
||||
BIN
scripts/debug/debug_c1.png
Normal file
|
After Width: | Height: | Size: 53 KiB |
BIN
scripts/debug/debug_c3.png
Normal file
|
After Width: | Height: | Size: 59 KiB |
48
scripts/debug/debug_crash.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import cv2
|
||||
import pickle
|
||||
import traceback
|
||||
|
||||
# Reuse the cached tracker output when possible; rebuilding it means decoding
# the video again.
# NOTE: pickle can execute arbitrary code while loading; only load cache
# files that were produced locally.
try:
    with open('unique_pages.pkl', 'rb') as f:
        unique_pages = pickle.load(f)
except Exception as cache_err:
    # Best-effort cache: any load failure falls back to a rebuild, but say why
    # instead of swallowing the error silently.
    print(f"Could not load unique_pages.pkl ({cache_err!r}); rebuilding from video...")

    import video_cv_tracker as tracker_lib
    from youtube_tab_to_pdf import _find_white_tab_strip

    tracker = tracker_lib.TemporalTracker(diff_threshold=0.05)
    video = cv2.VideoCapture("output/shintakarajima.mp4")

    # Keep every `stride`-th frame (about four per second), at most 150 frames.
    frames = []
    fps_orig = video.get(cv2.CAP_PROP_FPS)
    stride = max(1, int(fps_orig / 4.0))
    count = 0
    while len(frames) < 150:
        ret, f = video.read()
        if not ret:
            break
        if count % stride == 0:
            frames.append(f)
        count += 1
    video.release()

    if not frames:
        # Without this guard frames[0] below fails with a bare IndexError.
        raise SystemExit("No frames could be read from output/shintakarajima.mp4")

    # Default to the full frame height; narrow to the tab strip when one is
    # detected on any of the probed frames.
    top, bottom = 0, frames[0].shape[0]
    for f in frames[::10]:
        b = _find_white_tab_strip(f)
        if b:
            top, bottom = b
            break

    for f in frames:
        tracker.process_frame(f[top:bottom, :])

    unique_pages = tracker.get_unique_pages()
    with open('unique_pages.pkl', 'wb') as f:
        pickle.dump(unique_pages, f)

from score_extractor import ScoreExtractor

ex = ScoreExtractor()
try:
    print(f"Running ScoreExtractor on {len(unique_pages)} pages...")
    ex.process_pages(unique_pages)
    print("Success!")
except Exception:
    # Top-level boundary of a crash-repro script: report the traceback.
    print("CRASHED:")
    traceback.print_exc()
|
||||
BIN
scripts/debug/debug_final_state_machine.png
Normal file
|
After Width: | Height: | Size: 16 KiB |
BIN
scripts/debug/debug_gap_bridged.png
Normal file
|
After Width: | Height: | Size: 16 KiB |
BIN
scripts/debug/debug_last_frame.png
Normal file
|
After Width: | Height: | Size: 621 KiB |
BIN
scripts/debug/debug_morph_grid.png
Normal file
|
After Width: | Height: | Size: 609 KiB |
BIN
scripts/debug/debug_morph_horiz.png
Normal file
|
After Width: | Height: | Size: 5.5 KiB |
BIN
scripts/debug/debug_morph_vert.png
Normal file
|
After Width: | Height: | Size: 11 KiB |
61
scripts/debug/debug_numbers.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import easyocr
|
||||
import os
|
||||
from pathlib import Path
|
||||
from youtube_tab_to_pdf import _find_white_tab_strip, _has_tab_content, _extract_print_channel, _detect_measure_bars
|
||||
|
||||
def main():
    """OCR the measure-number sprite of every measure in the video's first frame.

    The first frame is cropped to the tab strip and split at the detected bar
    lines. For each measure at least 30 px wide, the area above the first staff
    line (up to 45 px high, up to 70 px wide) is saved as
    ``debug_sprite_<i>.png`` in the artifact directory and passed to EasyOCR
    restricted to digits; the recognised text is printed.
    """
    video_path = Path("output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")
    artifact_dir = Path(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6")

    # Only the first frame is needed, so release the capture immediately.
    cap = cv2.VideoCapture(str(video_path))
    try:
        ret, frame = cap.read()
    finally:
        cap.release()
    if not ret:
        print(f"Could not read a frame from {video_path}")
        return

    strip = _find_white_tab_strip(frame)
    if not strip:
        print("No white tab strip found in the first frame.")
        return
    top, bottom = strip[0], strip[1]

    tab_crop = frame[max(0, top):min(frame.shape[0], bottom), :]

    gray_page = _extract_print_channel(tab_crop)
    bar_coords = _detect_measure_bars(gray_page)

    # Measure boundaries: left edge, every detected bar line, right edge.
    coords = sorted({0, *bar_coords, tab_crop.shape[1]})

    reader = easyocr.Reader(['en'], verbose=False)

    for i in range(len(coords) - 1):
        x_start = coords[i]
        x_end = coords[i + 1]
        if x_end - x_start < 30:
            continue

        m_img = tab_crop[:, x_start:x_end]

        # Dark pixels become 255 so that row sums count "ink".
        gray = cv2.cvtColor(m_img, cv2.COLOR_BGR2GRAY)
        _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
        row_sums = np.sum(thresh, axis=1) / 255
        # A staff line is a row that is dark across more than half the measure.
        staff_lines = np.where(row_sums > m_img.shape[1] * 0.5)[0]
        if len(staff_lines) == 0:
            continue

        y_staff = int(staff_lines[0])
        # Crop up to 45 px above the first staff line, leftmost 70 px.
        crop_y1 = max(0, y_staff - 45)
        crop_y2 = y_staff
        if crop_y2 <= crop_y1:
            # Staff line on the top row: the sprite would be empty and both
            # cv2.imwrite and cv2.resize fail on empty images.
            continue
        sprite = thresh[crop_y1:crop_y2, 0:min(70, m_img.shape[1])]

        out_file = artifact_dir / f"debug_sprite_{i}.png"
        cv2.imwrite(str(out_file), sprite)

        # Upscale 4x before OCR.
        upscaled = cv2.resize(sprite, (0, 0), fx=4.0, fy=4.0, interpolation=cv2.INTER_CUBIC)
        res = reader.readtext(upscaled, allowlist='0123456789', detail=0)
        print(f"Measure {i}: Found text = {res}")


if __name__ == "__main__":
    main()
|
||||
0
scripts/debug/debug_ocr_chunks.py
Normal file
0
scripts/debug/debug_ocr_image.py
Normal file
BIN
scripts/debug/debug_ocr_measure_2.png
Normal file
|
After Width: | Height: | Size: 323 B |
BIN
scripts/debug/debug_ocr_measure_3.png
Normal file
|
After Width: | Height: | Size: 621 B |
BIN
scripts/debug/debug_ocr_measure_4.png
Normal file
|
After Width: | Height: | Size: 332 B |
BIN
scripts/debug/debug_ocr_measure_5.png
Normal file
|
After Width: | Height: | Size: 556 B |
BIN
scripts/debug/debug_ocr_measure_6.png
Normal file
|
After Width: | Height: | Size: 872 B |
BIN
scripts/debug/debug_ocr_measure_7.png
Normal file
|
After Width: | Height: | Size: 328 B |
BIN
scripts/debug/debug_ocr_measure_8.png
Normal file
|
After Width: | Height: | Size: 630 B |
BIN
scripts/debug/debug_ocr_measure_9.png
Normal file
|
After Width: | Height: | Size: 323 B |
102
scripts/debug/debug_orb_failures.py
Normal file
@@ -0,0 +1,102 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Load tracker directly to inspect ORB
|
||||
sys.path.append(str(Path(".").resolve()))
|
||||
from video_cv_tracker import TemporalTracker
|
||||
|
||||
def _report_orb_overlap(tracker, strip):
    """Print ORB matching diagnostics between the panorama tail and ``strip``."""
    search_w = min(1500, tracker.panorama.shape[1])
    search_region = tracker._extract_print_channel(tracker.panorama[:, -search_w:])
    head = tracker._extract_print_channel(strip)

    orb = cv2.ORB_create(1000)
    kp1, des1 = orb.detectAndCompute(search_region, None)
    kp2, des2 = orb.detectAndCompute(head, None)

    print(f" kp1: {len(kp1) if kp1 else 0}, kp2: {len(kp2) if kp2 else 0}")

    if des1 is None or des2 is None or len(des1) <= 10 or len(des2) <= 10:
        print(" => FAILED! des1 or des2 is None or less than 10!")
        return

    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(des1, des2)

    # Each match on roughly the same row votes for a horizontal offset.
    dx_votes = []
    for m in matches:
        x1, y1 = kp1[m.queryIdx].pt
        x2, y2 = kp2[m.trainIdx].pt
        if abs(y1 - y2) < 10:
            dx_votes.append(x1 - x2)

    if not dx_votes:
        print(" => FAILED! No dx votes.")
        return

    # 5 px wide histogram bins; the fullest bin is the consensus offset.
    hist, bins = np.histogram(dx_votes, bins=np.arange(min(dx_votes) - 5, max(dx_votes) + 5, 5))
    best_bin_idx = np.argmax(hist)
    print(f" Max Vote Count: {hist[best_bin_idx]} at dx={bins[best_bin_idx]}")
    if hist[best_bin_idx] < 12:
        print(" => FAILED! Overlap not found (too few ORB matches). Will append complete new page.")
    else:
        print(" => SUCCESS! Overlap found.")


def main():
    """Scan the video from frame 500 and report ORB overlap at the first page turn.

    Each frame is cropped to rows 111-389 (an approximate tab region) and fed
    to the tracker's pixel-shift estimator. When confidence recovers after a
    transition, ORB diagnostics for the panorama tail versus the new strip are
    printed and the scan stops.
    """
    print("Testing ORB matcher...")
    cap = cv2.VideoCapture("output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")
    try:
        # Fast forward to the region where a transition is expected.
        cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
        succ, _ = cap.read()
        if not succ:
            print("Could not read frame 500; is the video path correct?")
            return

        tracker = TemporalTracker()
        # The read above consumed frame 500, so the loop starts at frame 501.
        frame_idx = 501
        while True:
            succ, frame = cap.read()
            if not succ:
                break
            if frame_idx % 100 == 0:
                print(f"Reading frame {frame_idx}...", flush=True)

            strip = frame[111:390]  # Approximate tab region

            reference = tracker.last_clean_frame if tracker.last_clean_frame is not None else strip
            dx, conf = tracker._calculate_pixel_shift(reference, strip)

            if tracker.panorama is None:
                # First strip seeds the panorama.
                tracker.panorama = strip.copy()
                tracker.last_clean_frame = strip.copy()
                frame_idx += 1
                continue

            if (conf < 0.45) or (tracker.last_conf - conf > 0.3):
                tracker.in_transition = True
            elif tracker.in_transition and conf > 0.85 and dx == 0:
                tracker.in_transition = False
                print(f"[{frame_idx}] Transition Recovered! Testing ORB...")
                _report_orb_overlap(tracker, strip)
                tracker.panorama = np.hstack([tracker.panorama, strip])
                # Only the first recovered transition is inspected.
                break
            elif dx > 0 and not tracker.in_transition:
                # Plain scroll: append only the newly revealed columns.
                tracker.panorama = np.hstack([tracker.panorama, strip[:, strip.shape[1] - int(dx):, :]])

            tracker.last_conf = conf
            tracker.last_clean_frame = strip.copy()
            frame_idx += 1
    finally:
        cap.release()


if __name__ == '__main__':
    main()
|
||||
BIN
scripts/debug/debug_output.txt
Normal file
98
scripts/debug/debug_overlap.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import sys
|
||||
import glob
|
||||
|
||||
# Test matching between two chunks to see what the score was!
|
||||
# Wait, the chunks are the output of the slicing!
|
||||
# The tracker works on the original FRAMES!
|
||||
# Let's test the tracker on the original frames!
|
||||
# I will supply the exact logic used in the tracker.
|
||||
|
||||
def test_tracker():
    """Replay the shift and page-turn logic on the raw video and print overlap scores.

    Every 15th frame is resized to 1280x720 and compared with the previous
    sampled frame by template matching on channel 0. When a transition ends,
    the head of the new page is matched against the tail of the panorama and
    the score is printed. Stops after three such matches or at end of video.
    """
    frame_w = 1280
    tmpl_x0 = int(frame_w * 0.6)
    tmpl_x1 = tmpl_x0 + int(frame_w * 0.3)

    cap = cv2.VideoCapture(
        r"C:\Users\Certes\Desktop\guitar_score\output\サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4"
    )

    panorama = None
    prev_frame = None
    transitioning = False
    prev_conf = 1.0
    frames_read = 0
    overlap_scores = []

    while len(overlap_scores) < 3:
        ok, frame = cap.read()
        if not ok:
            break

        frames_read += 1
        if frames_read % 15:
            # Only every 15th frame is analysed.
            continue

        frame = cv2.resize(frame, (frame_w, 720))

        if panorama is None:
            panorama = frame.copy()
            prev_frame = frame.copy()
            continue

        # Horizontal shift: find where a slice of the previous frame now sits.
        template = prev_frame[:, tmpl_x0:tmpl_x1, 0]
        shift_scores = cv2.matchTemplate(frame[:, :, 0], template, cv2.TM_CCOEFF_NORMED)
        _, conf, _, best_loc = cv2.minMaxLoc(shift_scores)
        dx = tmpl_x0 - best_loc[0]
        if conf < 0.15 or dx <= 0 or dx > frame_w * 0.15:
            dx = 0

        confidence_lost = conf < 0.45 or prev_conf - conf > 0.3
        if confidence_lost:
            transitioning = True
        elif transitioning and conf > 0.85 and dx == 0:
            transitioning = False

            # Page turn finished: look for the new page's head in the panorama tail.
            search_w = min(1500, panorama.shape[1])
            tail = panorama[:, -search_w:, 0]
            head_w = min(400, frame.shape[1])
            head = frame[:, 50:50 + head_w, 0]

            overlap_map = cv2.matchTemplate(tail, head, cv2.TM_CCOEFF_NORMED)
            _, max_val, _, matched_loc = cv2.minMaxLoc(overlap_map)

            overlap_scores.append(max_val)
            print(f"Page turn detected! Overlap match score: {max_val:.4f} at {matched_loc}")

            if max_val <= 0.65:
                # No trustworthy overlap: append the whole page.
                panorama = np.hstack([panorama, frame.copy()])
            else:
                overlap_px = search_w - matched_loc[0] + 50
                if overlap_px < frame.shape[1] - 50:
                    panorama = np.hstack([panorama, frame[:, overlap_px:]])
        elif 0 < dx < frame_w and not transitioning:
            # Plain scroll: append only the newly revealed columns.
            panorama = np.hstack([panorama, frame[:, frame_w - dx:, :]])

        prev_conf = conf
        prev_frame = frame.copy()

    cap.release()
    print("Test complete.")


if __name__ == "__main__":
    test_tracker()
|
||||
73
scripts/debug/debug_sequence.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
from score_extractor import ScoreExtractor
|
||||
from youtube_tab_to_pdf import extract_unique_scroll, _detect_tab_overlay
|
||||
|
||||
# Simplified run script to dump all macro blocks and ignored pages
|
||||
# Simplified run script: sample about four frames per second from the video.
frames = []
video = cv2.VideoCapture("sakanaction shintakarajima.mp4")
fps_orig = video.get(cv2.CAP_PROP_FPS)
stride = max(1, int(fps_orig / 4.0))
count = 0
while True:
    ret, frame = video.read()
    if not ret:
        break
    if count % stride == 0:
        frames.append(frame)
    count += 1
video.release()

if not frames:
    # Without this guard frames[0] below fails with a bare IndexError.
    raise SystemExit("No frames could be read from 'sakanaction shintakarajima.mp4'")

from video_cv_tracker import TemporalTracker
from youtube_tab_to_pdf import _find_white_tab_strip

tracker = TemporalTracker(diff_threshold=0.05)

# Probe every 30th sampled frame for the tab strip; fall back to full height.
tab_bounds = None
for f in frames[::30]:
    b = _find_white_tab_strip(f)
    if b:
        tab_bounds = b
        break
top, bottom = tab_bounds if tab_bounds else (0, frames[0].shape[0])

for f in frames:
    tracker.process_frame(f[top:bottom, :])

unique = tracker.get_unique_pages()
if len(unique) == 0:
    raise SystemExit("Tracker produced no unique pages; nothing to stitch.")

ex = ScoreExtractor()
# Manually process the pages and print verbose output.
ex.macro_blocks = [unique[0].copy()]
ex.history_pages = [unique[0]]

for i, page in enumerate(unique[1:], 1):
    current = ex.macro_blocks[-1]
    head_w = min(800, page.shape[1])
    search_w = min(1500, current.shape[1])

    h_gray = cv2.cvtColor(page[:, :head_w], cv2.COLOR_BGR2GRAY)
    s_gray = cv2.cvtColor(current[:, -search_w:], cv2.COLOR_BGR2GRAY)

    # Look for the head of the new page inside the tail of the current block.
    res = cv2.matchTemplate(s_gray, h_gray, cv2.TM_CCOEFF_NORMED)
    _, max_val, _, max_loc = cv2.minMaxLoc(res)

    if max_val > 0.50:
        print(f"[Page {i}] Stitched! max_val={max_val:.2f}")
        absolute_match_x = current.shape[1] - search_w + max_loc[0]
        # Columns of `page` already present in the current block.
        next_start_idx = current.shape[1] - absolute_match_x
        if next_start_idx < page.shape[1]:
            append_part = page[:, next_start_idx:]
            ex.macro_blocks[-1] = np.hstack([ex.macro_blocks[-1], append_part])
            ex.history_pages.append(append_part)
    else:
        # No overlap with the current block: either a repeat or a new block.
        is_repeat = ex._is_historical_repeat(page)
        print(f"[Page {i}] Jump! max_val={max_val:.2f}, repeat={is_repeat}")
        if is_repeat:
            # Save the rejected page so it can be inspected afterwards.
            cv2.imwrite(f"rejected_page_{i}.png", page)
        else:
            ex.macro_blocks.append(page.copy())
            ex.history_pages.append(page)

# Dump the first 1800 columns of every macro block.
for j, b in enumerate(ex.macro_blocks):
    cv2.imwrite(f"macro_block_{j}_start.png", b[:, :1800])
|
||||
21
scripts/debug/debug_stitch.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
cap = cv2.VideoCapture("output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")

# Fall back to 30 fps when the container reports none.
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0

# (timestamp in seconds, destination file) for each frame to export.
snapshots = (
    (30, "C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/raw_frame_30s.png"),
    (35, "C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/raw_frame_35s.png"),
)
for seconds, target in snapshots:
    # Seek to the timestamp and write the frame only if the read succeeded.
    cap.set(cv2.CAP_PROP_POS_FRAMES, seconds * fps)
    ret, snapshot = cap.read()
    if ret:
        cv2.imwrite(target, snapshot)

cap.release()
print("Saved raw frames for structural analysis.")
|
||||
BIN
scripts/debug/debug_super_scale.png
Normal file
|
After Width: | Height: | Size: 901 B |
BIN
scripts/debug/debug_temporal_binary.png
Normal file
|
After Width: | Height: | Size: 21 KiB |
BIN
scripts/debug/debug_temporal_median.png
Normal file
|
After Width: | Height: | Size: 608 KiB |
0
scripts/debug/debug_test_m1_sprite.py
Normal file
35
scripts/debug/debug_video1.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import cv2
|
||||
import os
|
||||
import shutil
|
||||
|
||||
video_file = r"C:\Users\Certes\Desktop\guitar_score\output\サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4"
debug_dir = r"C:\Users\Certes\Desktop\guitar_score\output\debug_video1"

# Start from an empty output folder on every run.
if os.path.exists(debug_dir):
    shutil.rmtree(debug_dir)
os.makedirs(debug_dir)

cap = cv2.VideoCapture(video_file)
fps_orig = cap.get(cv2.CAP_PROP_FPS)
# Save one frame roughly every 10 seconds. The interval has to be a positive
# integer: with a fractional rate such as 29.97 fps the former float modulus
# (count % 299.7) matched only frame 0.
frame_interval = max(1, int(round(fps_orig * 10)))

count = 0
saved = 0

while True:
    ret, frame = cap.read()
    if not ret:
        break

    if count % frame_interval == 0:
        frame = cv2.resize(frame, (1280, 720))
        cv2.imwrite(os.path.join(debug_dir, f"frame_{count:05d}.jpg"), frame)
        saved += 1
        if saved > 30:
            break

    count += 1

cap.release()
print(f"Extraction complete. {saved} frames saved.")
|
||||
33
scripts/debug/dump_frames.py
Normal file
@@ -0,0 +1,33 @@
|
||||
"""원본 프레임 덤프 — 각 영상에서 5개 프레임을 랜덤 추출"""
|
||||
import sys
|
||||
if sys.platform == "win32":
|
||||
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
|
||||
import cv2
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
output = Path("output")
dump_dir = output / "raw_dump"
dump_dir.mkdir(exist_ok=True)

# Visit every .mp4 directly under output/ in sorted order, numbered from 1.
for video_no, mp4 in enumerate(sorted(output.glob("*.mp4")), start=1):
    cap = cv2.VideoCapture(str(mp4))
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    print(f"Video {video_no}: {mp4.name[:30]}... ({w}x{h}, {fps:.0f}fps, {total} frames)")

    # Five evenly spaced frame indices between 10% and 90% of the video.
    sample_positions = np.linspace(total * 0.1, total * 0.9, 5, dtype=int)
    for slot, frame_no in enumerate(sample_positions):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)
        ret, frame = cap.read()
        if not ret:
            continue
        path = dump_dir / f"v{video_no}_raw_{slot}.png"
        cv2.imwrite(str(path), frame)
        print(f" frame {frame_no} → {path.name} ({frame.shape})")
    cap.release()

print(f"\n덤프 완료: {dump_dir}")
|
||||
25
scripts/debug/dump_inspection_frames.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import cv2
|
||||
import pickle
|
||||
import os
|
||||
|
||||
# NOTE: pickle can execute arbitrary code while loading; only load cache
# files that were produced locally.
with open('unique_pages.pkl', 'rb') as cache_file:
    unique_pages = pickle.load(cache_file)

# Save the pages around the jump-cut boundaries so they can be reviewed by eye.
out_dir = r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6"

# Index windows (start inclusive, stop exclusive) around the two problem spots.
# The original notes cite pages 18-24 (measure 21) and 40-50 (measure 45) from
# verify_log.txt.
# NOTE(review): the second window starts at 43, not 40 - confirm intended range.
page_total = len(unique_pages)
for first, stop in ((16, 26), (43, 53)):
    # Clamp the window so pages beyond the end of the cache are skipped.
    for page_no in range(first, min(stop, page_total)):
        target = os.path.join(out_dir, f"jump_cut_inspection_page_{page_no}.png")
        cv2.imwrite(target, unique_pages[page_no])

print("Dumped inspection frames to Artifact Directory.")
|
||||
68
scripts/debug/dump_logs.py
Normal file
@@ -0,0 +1,68 @@
|
||||
import cv2
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
# TemporalTracker already saved the video chunks? No.
|
||||
# I will use fast_verify.py's frames but run process_pages directly and print all its output.
|
||||
import fast_verify
|
||||
from youtube_tab_to_pdf import extract_unique_scroll
|
||||
|
||||
# Actually, I will just write a wrapper around ScoreExtractor to print to file
|
||||
import sys
|
||||
|
||||
def main():
    """Run the tracker and ScoreExtractor on the video, teeing prints to score_log.txt.

    Every 4th frame is resized to 1280 px wide, cropped to the detected tab
    strip (full height when none is found) and fed to ``TemporalTracker``. The
    resulting pages go through ``ScoreExtractor.process_pages`` while
    ``builtins.print`` is temporarily replaced by a wrapper that also appends
    each message to ``score_log.txt``.
    """
    import builtins

    from score_extractor import ScoreExtractor
    from video_cv_tracker import TemporalTracker
    from youtube_tab_to_pdf import _find_white_tab_strip

    cap = cv2.VideoCapture("output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")
    try:
        # Quick dynamic crop, derived from the first frame and frame 500.
        ret, initial = cap.read()
        if not ret:
            print("Could not read the first frame of the video.")
            return
        scale = 1280 / initial.shape[1]
        crop_top, crop_bottom = 0, int(initial.shape[0] * scale)

        cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
        ret, check_frame = cap.read()
        if ret:
            resized_check = cv2.resize(check_frame, (1280, int(check_frame.shape[0] * scale)))
            bounds = _find_white_tab_strip(resized_check)
            if bounds:
                crop_top, crop_bottom = bounds

        # Stream frames straight into the tracker instead of holding the
        # whole video in memory.
        tracker = TemporalTracker(diff_threshold=0.05)
        count = 0
        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if count % 4 == 0:
                resized = cv2.resize(frame, (1280, int(frame.shape[0] * scale)))
                tracker.process_frame(resized[crop_top:crop_bottom, :])
            count += 1
    finally:
        cap.release()

    unique_pages = tracker.get_unique_pages()
    print(f"Got {len(unique_pages)} unique pages from tracker.")

    extractor = ScoreExtractor()

    # Hook print so everything process_pages prints is also written to the log.
    original_print = print
    # UTF-8 so non-ASCII log text cannot fail on a narrow default code page.
    with open("score_log.txt", "w", encoding="utf-8") as f:
        def my_print(*args, **kwargs):
            text = " ".join(map(str, args))
            f.write(text + "\n")
            original_print(*args, **kwargs)

        builtins.print = my_print
        try:
            extractor.process_pages(unique_pages)
        finally:
            # Always restore the real print, even when process_pages raises.
            builtins.print = original_print


if __name__ == "__main__":
    main()
|
||||
14
scripts/debug/dump_pages.py
Normal file
@@ -0,0 +1,14 @@
|
||||
import cv2
|
||||
import pickle
|
||||
|
||||
# NOTE: pickle can execute arbitrary code while loading; only load cache
# files that were produced locally.
with open('unique_pages.pkl', 'rb') as cache_file:
    unique_pages = pickle.load(cache_file)

import os

# One PNG per cached page, numbered in list order.
out_dir = r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\pages"
os.makedirs(out_dir, exist_ok=True)

page_no = 0
for page_img in unique_pages:
    cv2.imwrite(os.path.join(out_dir, "page_%03d.png" % page_no), page_img)
    page_no += 1

print(f"Saved {len(unique_pages)} pages to {out_dir}")
|
||||
21
scripts/debug/dump_slices.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import cv2
|
||||
import os
|
||||
|
||||
img = cv2.imread(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\final_check_100_sec.png")
if img is None:
    print("Image not found!")
    raise SystemExit(1)

out_dir = r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\slices"
os.makedirs(out_dir, exist_ok=True)

# The input is a single long panorama row; cut it into chunks of at most
# 2000 px width, left to right.
pano_w = img.shape[1]
slice_starts = range(0, pano_w, 2000)
for slice_no, left in enumerate(slice_starts):
    right = min(left + 2000, pano_w)
    cv2.imwrite(os.path.join(out_dir, f"pano_slice_{slice_no:02d}.png"), img[:, left:right])

slice_total = len(slice_starts)
print(f"Generated {slice_total} slices.")
|
||||
64
scripts/debug/dump_sprites.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import os
|
||||
from glob import glob
|
||||
|
||||
# Use the first .mp4 that glob reports under output/; glob order is arbitrary.
_video_candidates = glob('output/*.mp4')
if not _video_candidates:
    # Indexing an empty list here would fail with a bare IndexError.
    raise SystemExit("No .mp4 files found under output/")
video_path = _video_candidates[0]
cap = cv2.VideoCapture(video_path)
|
||||
|
||||
def _find_white_tab_strip(frame):
|
||||
h, w = frame.shape[:2]
|
||||
gray = np.max(frame, axis=2)
|
||||
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
|
||||
row_white_counts = np.sum(binary > 0, axis=1)
|
||||
|
||||
threshold = w * 0.1
|
||||
white_rows = np.where(row_white_counts > threshold)[0]
|
||||
if len(white_rows) < 5: return None
|
||||
return white_rows[0], white_rows[-1]
|
||||
|
||||
def get_number_sprite(m_img):
    """Cut out the binarised region above the first staff line of a measure.

    Pixels whose brightest channel exceeds 200 become 255, all others 0. The
    first row that is white across more than half the measure width is taken
    as the staff line; row 50 is assumed when no such row exists. The crop
    spans rows ``y_staff - 35`` to ``y_staff - 2`` (exclusive) and the leftmost
    60 columns.

    Args:
        m_img: Measure image of shape (H, W, C), or (H, W) for grayscale.

    Returns:
        A ``uint8`` array containing only 0 and 255, or ``None`` when the crop
        window is empty (staff line too close to the top, or window entirely
        below the image).
    """
    height, width = m_img.shape[:2]
    brightness = m_img.max(axis=2) if m_img.ndim == 3 else m_img
    thresh = np.where(brightness > 200, 255, 0).astype(np.uint8)

    white_per_row = np.count_nonzero(thresh, axis=1)
    staff_rows = np.flatnonzero(white_per_row > width * 0.5)
    y_staff = int(staff_rows[0]) if staff_rows.size else 50

    crop_y1 = max(0, y_staff - 35)
    # Clamp to the image height: with the fallback row on a short image the
    # window would otherwise slice to an empty array that cv2.imwrite rejects.
    crop_y2 = min(height, max(0, y_staff - 2))
    crop_x2 = min(60, width)

    if crop_y2 <= crop_y1 or crop_x2 <= 0:
        return None
    return thresh[crop_y1:crop_y2, 0:crop_x2]
|
||||
|
||||
# Scan the video, probing every 30th frame, and dump the number sprite of the
# first measure found. Stops once more than 15 sprites have been written.
frame_count = 0
found = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break

    current_no = frame_count
    frame_count += 1
    if current_no % 30 != 0:
        continue

    strip = _find_white_tab_strip(frame)
    if not strip:
        continue
    tab_crop = frame[max(0, strip[0]):min(frame.shape[0], strip[1]), :]

    # Inline bar detection: columns that are bright over more than 80% of the
    # crop height.
    brightest = np.max(tab_crop, axis=2)
    _, bright_mask = cv2.threshold(brightest, 180, 255, cv2.THRESH_BINARY)
    bright_per_col = np.sum(bright_mask, axis=0) / 255
    bars = np.where(bright_per_col > tab_crop.shape[0] * 0.8)[0]
    if len(bars) <= 1:
        continue

    # NOTE(review): bars holds raw column indices, so two adjacent columns of
    # one thick bar line would fail the > 40 px check below - confirm intended.
    x_start, x_end = bars[0], bars[1]
    if x_end - x_start <= 40:
        continue

    sprite = get_number_sprite(tab_crop[:, x_start:x_end])
    if sprite is None:
        continue

    pixels = np.count_nonzero(sprite > 127)
    cv2.imwrite(f"C:/Users/Certes/Desktop/guitar_score/debug_s_{current_no}_{pixels}.png", sprite)
    print(f"Dumped sprite frame {current_no} with {pixels} pixels")
    found += 1
    if found > 15:
        break

cap.release()
|
||||
78
scripts/debug/fast_verify.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import cv2
|
||||
from video_cv_tracker import TemporalTracker
|
||||
from youtube_tab_to_pdf import extract_unique_scroll, generate_long_image, generate_pdf, download_video, extract_frames
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Run verification specifically on Shintakarajima
|
||||
# Run verification specifically on Shintakarajima.
url = "https://youtu.be/tJq1n8TofM0"
video_path = Path("output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")

print("Extracting full video for final 142-measure verification...")
cap = cv2.VideoCapture(str(video_path))

# Pre-calculate the dynamic crop: detect the white band on a resized frame.
ret, initial = cap.read()
if not ret:
    # Without this guard initial.shape fails with an opaque AttributeError.
    raise SystemExit(f"Could not read a frame from {video_path}")
scale = 1280 / initial.shape[1]
resized_init = cv2.resize(initial, (1280, int(initial.shape[0] * scale)))

from youtube_tab_to_pdf import _find_white_tab_strip

crop_top = 0
crop_bottom = resized_init.shape[0]

cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
ret, check_frame = cap.read()
if ret:
    resized_check = cv2.resize(check_frame, (1280, int(check_frame.shape[0] * scale)))
    bounds = _find_white_tab_strip(resized_check)
    if bounds:
        crop_top, crop_bottom = bounds
        # Keep 60 px above the strip so D.S. al Coda, volta brackets and
        # measure numbers drawn on the dark background are preserved.
        crop_top = max(0, crop_top - 60)

print(f"Dynamically Cropping to: Y={crop_top} to {crop_bottom}")

cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
frames = []
# Not used below; kept because this module is imported by dump_logs.py.
tracker = TemporalTracker(diff_threshold=0.05)

# NOTE: every cropped frame of the video is held in memory.
while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame_resized = cv2.resize(frame, (1280, int(frame.shape[0] * scale)))
    clean_ribbon = frame_resized[crop_top:crop_bottom, :]
    frames.append(clean_ribbon)

cap.release()

# Spot-check image; skipped for clips too short to have a frame 30.
if len(frames) > 30:
    cv2.imwrite("C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/raw_frame_check.png", frames[30])

print(f"Extracted {len(frames)} frames. Running sequential page extraction...")
try:
    final_chunks = extract_unique_scroll(frames)
    print("DEBUG: final_chunks len =", len(final_chunks))
    if final_chunks:
        print("DEBUG: final_chunks[0].shape =", final_chunks[0].shape)
        cv2.imwrite("C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/debug_chunk_0.png", final_chunks[0])

    # Save the stitched result to the artifact directory for visual review.
    artifact_path = Path(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6")
    output_png = artifact_path / "final_check_100_sec.png"

    generate_long_image(final_chunks, output_png)
    print(f"Saved successful verification image to: {output_png}")

    if final_chunks:
        generate_pdf(final_chunks, Path("output/shintakarajima_perfect.pdf"))
        print("✨ Successfully generated output/shintakarajima_perfect.pdf ✨")
    else:
        print("Failed to produce rows.")
except Exception:
    # Top-level boundary of a verification script: report the traceback.
    import traceback
    traceback.print_exc()
|
||||
39
scripts/debug/find_staff_lines.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
# Input frame to analyse and the annotated image written for visual inspection.
IMAGE_PATH = r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\raw_super_block.png"
OUTPUT_PATH = r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\test_barlines.png"

# Row band [STAFF_TOP, STAFF_BOTTOM) that is scanned; the staff lines of this
# particular image run from y=76 to y=152.
STAFF_TOP = 76
STAFF_BOTTOM = 152
# A column counts as a solid vertical line when more than this many of the
# band's 76 rows are dark.
MIN_BAR_PIXELS = 70
# Columns at most this many pixels apart are merged into one barline.
MAX_COLUMN_GAP = 5


def group_adjacent_columns(xs, max_gap=MAX_COLUMN_GAP):
    """Merge runs of nearby column indices into one x position per run.

    Args:
        xs: Ascending column indices (list or 1-D integer array).
        max_gap: Largest distance between consecutive indices that still
            belong to the same run.

    Returns:
        A list with the truncated mean column of each run, in input order.
        Empty input yields an empty list.
    """
    centers = []
    run = []
    for x in xs:
        x = int(x)
        # A jump larger than max_gap closes the current run.
        if run and x - run[-1] > max_gap:
            centers.append(int(np.mean(run)))
            run = []
        run.append(x)
    if run:
        centers.append(int(np.mean(run)))
    return centers


def find_barlines(img):
    """Return the x positions of vertical barlines found in a BGR image.

    The image is binarised (pixels darker than or equal to 200 become 255),
    the dark pixels of every column inside the staff band are counted, and
    columns exceeding MIN_BAR_PIXELS are grouped into single lines.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, bin_inv = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)

    # Sum down the columns; dividing by 255 turns the sum into a pixel count.
    col_sums = np.sum(bin_inv[STAFF_TOP:STAFF_BOTTOM, :], axis=0) / 255.0
    bar_xs = np.where(col_sums > MIN_BAR_PIXELS)[0]
    return group_adjacent_columns(bar_xs)


def main():
    """Detect barlines in IMAGE_PATH, print them and save an annotated copy."""
    img = cv2.imread(IMAGE_PATH)
    if img is None:
        print("Image not found")
        # Exit with a failure status; the site-provided exit() helper is meant
        # for interactive use and is not guaranteed to exist in every runtime.
        raise SystemExit(1)

    grouped_bars = find_barlines(img)

    print(f"Found {len(grouped_bars)} vertical barlines:")
    print(grouped_bars)

    # Draw each detected barline in red over a copy of the input.
    out = img.copy()
    for x in grouped_bars:
        cv2.line(out, (x, 0), (x, out.shape[0]), (0, 0, 255), 2)

    # cv2.imwrite reports failure through its return value, not an exception.
    if not cv2.imwrite(OUTPUT_PATH, out):
        raise SystemExit(f"Could not write {OUTPUT_PATH}")


if __name__ == "__main__":
    main()
|
||||
BIN
scripts/debug/measure_1_crop.png
Normal file
|
After Width: | Height: | Size: 10 KiB |
139
scripts/debug/patch_extractor.py
Normal file
@@ -0,0 +1,139 @@
|
||||
import re
|
||||
|
||||
# Load the current source of the module that is about to be patched.
with open('youtube_tab_to_pdf.py', mode='r', encoding='utf-8') as src_file:
    code = src_file.read()
|
||||
|
||||
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
|
||||
print(f"[4/5] 순차 1FPS 타임라인 기반 마디 추출 중...")
|
||||
|
||||
strip_tops, strip_bottoms = [], []
|
||||
for frame in frames[:50]:
|
||||
strip = _find_white_tab_strip(frame)
|
||||
if strip:
|
||||
strip_tops.append(strip[0])
|
||||
strip_bottoms.append(strip[1])
|
||||
|
||||
if not strip_tops: return []
|
||||
|
||||
median_top = int(np.median(strip_tops))
|
||||
median_bottom = int(np.median(strip_bottoms))
|
||||
|
||||
unique_measures = []
|
||||
chunk_width = 1280
|
||||
|
||||
def get_clean_binary(img):
|
||||
gray = np.max(img, axis=2)
|
||||
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
|
||||
return binary
|
||||
|
||||
for frame_idx, frame in enumerate(frames):
|
||||
h = frame.shape[0]
|
||||
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
|
||||
if not _has_tab_content(tab_crop):
|
||||
continue
|
||||
|
||||
gray_page = _extract_print_channel(tab_crop)
|
||||
bar_coords = _detect_measure_bars(gray_page)
|
||||
|
||||
if not bar_coords:
|
||||
continue
|
||||
|
||||
coords = [0] + bar_coords + [tab_crop.shape[1]]
|
||||
coords = sorted(list(set(coords)))
|
||||
|
||||
page_measures = []
|
||||
for i in range(len(coords) - 1):
|
||||
x_start = coords[i]
|
||||
x_end = coords[i+1]
|
||||
if x_end - x_start < 40: continue
|
||||
page_measures.append(tab_crop[:, x_start:x_end])
|
||||
|
||||
if not page_measures:
|
||||
continue
|
||||
|
||||
if not unique_measures:
|
||||
unique_measures.extend(page_measures)
|
||||
continue
|
||||
|
||||
first_m = page_measures[0]
|
||||
bin_first = get_clean_binary(first_m)
|
||||
|
||||
best_error = 1.0
|
||||
best_offset = 0
|
||||
anchored = False
|
||||
|
||||
for scan_dist in range(1, min(10, len(unique_measures) + 1)):
|
||||
past_idx = len(unique_measures) - scan_dist
|
||||
past_m = unique_measures[past_idx]
|
||||
|
||||
bin_past = get_clean_binary(past_m)
|
||||
|
||||
if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
|
||||
hs = min(bin_first.shape[0], bin_past.shape[0])
|
||||
ws = min(bin_first.shape[1], bin_past.shape[1])
|
||||
s1 = bin_first[:hs, :ws]
|
||||
s2 = bin_past[:hs, :ws]
|
||||
|
||||
diff = cv2.absdiff(s1, s2)
|
||||
error_ratio = np.sum(diff > 0) / s1.size
|
||||
|
||||
if error_ratio < best_error:
|
||||
best_error = error_ratio
|
||||
best_offset = len(unique_measures) - past_idx
|
||||
|
||||
# Error ratio < 20% confirms identity for sparse structures
|
||||
if best_error < 0.20:
|
||||
new_start_offset = best_offset
|
||||
anchored = True
|
||||
print(f" [Anchor] Frame {frame_idx} -> PDF offset {best_offset} (Best Error: {best_error:.4f})")
|
||||
else:
|
||||
print(f" [New] Frame {frame_idx} -> No Match (Best Error was {best_error:.4f})")
|
||||
|
||||
if anchored and new_start_offset < len(page_measures):
|
||||
unique_measures.extend(page_measures[new_start_offset:])
|
||||
elif not anchored:
|
||||
unique_measures.extend(page_measures)
|
||||
|
||||
print(f" -> 동기화 중복 제거 완료: 무손실 타임라인 기반 {len(unique_measures)}개 연속 마디 보존")
|
||||
|
||||
final_chunks = []
|
||||
current_row_measures = []
|
||||
current_row_width = 0
|
||||
|
||||
for measure_img in unique_measures:
|
||||
measure_w = measure_img.shape[1]
|
||||
|
||||
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
|
||||
row_img = np.hstack(current_row_measures)
|
||||
pad_w = chunk_width - row_img.shape[1]
|
||||
if pad_w > 0:
|
||||
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
|
||||
row_img = np.hstack([row_img, pad_img])
|
||||
final_chunks.append(row_img)
|
||||
current_row_measures = [measure_img]
|
||||
current_row_width = measure_w
|
||||
else:
|
||||
current_row_measures.append(measure_img)
|
||||
current_row_width += measure_w
|
||||
|
||||
if current_row_measures:
|
||||
row_img = np.hstack(current_row_measures)
|
||||
if row_img.shape[1] > chunk_width:
|
||||
row_img = row_img[:, :chunk_width]
|
||||
else:
|
||||
pad_w = chunk_width - row_img.shape[1]
|
||||
if pad_w > 0:
|
||||
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
|
||||
row_img = np.hstack([row_img, pad_img])
|
||||
final_chunks.append(row_img)
|
||||
|
||||
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
|
||||
return final_chunks
|
||||
"""
|
||||
|
||||
# Regex locating the current extract_unique_scroll definition: from its exact
# signature, lazily (.*?) up to the first following "return final_chunks".
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'

# The replacement is passed as a callable so new_func is inserted verbatim; a
# plain replacement string would have any backslash sequences interpreted by re.
# re.subn also returns the number of replacements, so a signature that no longer
# matches is reported instead of silently rewriting the file unchanged.
new_code, replaced = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if replaced == 0:
    raise SystemExit("extract_unique_scroll not found in youtube_tab_to_pdf.py; nothing patched.")

with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Patched.")
|
||||
182
scripts/debug/patch_extractor_with_sprite.py
Normal file
@@ -0,0 +1,182 @@
|
||||
import re
|
||||
|
||||
# Load the current source of the module that is about to be patched.
with open('youtube_tab_to_pdf.py', mode='r', encoding='utf-8') as src_file:
    code = src_file.read()
|
||||
|
||||
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
|
||||
print(f"[4/5] 순차 Number Sprite 앵커 기반 마디 추출 중...")
|
||||
|
||||
strip_tops, strip_bottoms = [], []
|
||||
for frame in frames[:50]:
|
||||
strip = _find_white_tab_strip(frame)
|
||||
if strip:
|
||||
strip_tops.append(strip[0])
|
||||
strip_bottoms.append(strip[1])
|
||||
|
||||
if not strip_tops: return []
|
||||
|
||||
median_top = int(np.median(strip_tops))
|
||||
median_bottom = int(np.median(strip_bottoms))
|
||||
|
||||
unique_measures = []
|
||||
chunk_width = 1280
|
||||
|
||||
def get_clean_binary(img):
|
||||
gray = np.max(img, axis=2)
|
||||
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
|
||||
return binary
|
||||
|
||||
def get_number_sprite(m_img):
|
||||
# We find the top-left region where the number is displayed
|
||||
gray = np.max(m_img, axis=2)
|
||||
_, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
|
||||
row_sums = np.sum(thresh, axis=1) / 255
|
||||
staff_lines = np.where(row_sums > m_img.shape[1] * 0.5)[0]
|
||||
y_staff = staff_lines[0] if len(staff_lines) > 0 else 50
|
||||
|
||||
crop_y1 = max(0, y_staff - 35)
|
||||
crop_y2 = max(0, y_staff - 2)
|
||||
crop_x1 = 0
|
||||
crop_x2 = min(60, m_img.shape[1])
|
||||
|
||||
if crop_y2 <= crop_y1 or crop_x2 <= crop_x1: return None
|
||||
sprite = thresh[crop_y1:crop_y2, crop_x1:crop_x2]
|
||||
|
||||
# if there are no white pixels, it's a blank space, not a number
|
||||
if np.count_nonzero(sprite > 127) < 5: return None
|
||||
return sprite
|
||||
|
||||
for frame_idx, frame in enumerate(frames):
|
||||
h = frame.shape[0]
|
||||
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
|
||||
if not _has_tab_content(tab_crop):
|
||||
continue
|
||||
|
||||
gray_page = _extract_print_channel(tab_crop)
|
||||
bar_coords = _detect_measure_bars(gray_page)
|
||||
|
||||
if not bar_coords: continue
|
||||
|
||||
coords = [0] + bar_coords + [tab_crop.shape[1]]
|
||||
coords = sorted(list(set(coords)))
|
||||
|
||||
page_measures = []
|
||||
for i in range(len(coords) - 1):
|
||||
x_start = coords[i]
|
||||
x_end = coords[i+1]
|
||||
if x_end - x_start < 40: continue
|
||||
page_measures.append(tab_crop[:, x_start:x_end])
|
||||
|
||||
if not page_measures: continue
|
||||
|
||||
if not unique_measures:
|
||||
unique_measures.extend(page_measures)
|
||||
continue
|
||||
|
||||
first_m = page_measures[0]
|
||||
first_sprite = get_number_sprite(first_m)
|
||||
|
||||
best_error = 1.0
|
||||
best_offset = 0
|
||||
anchored = False
|
||||
|
||||
# Only anchor if we explicitly see a printed number in the top left
|
||||
if first_sprite is not None:
|
||||
# We can scan further back safely because different numbers won't mathematically match
|
||||
for scan_dist in range(1, min(15, len(unique_measures) + 1)):
|
||||
past_idx = len(unique_measures) - scan_dist
|
||||
past_m = unique_measures[past_idx]
|
||||
past_sprite = get_number_sprite(past_m)
|
||||
|
||||
if past_sprite is not None:
|
||||
hs = min(first_sprite.shape[0], past_sprite.shape[0])
|
||||
ws = min(first_sprite.shape[1], past_sprite.shape[1])
|
||||
s1 = first_sprite[:hs, :ws]
|
||||
s2 = past_sprite[:hs, :ws]
|
||||
|
||||
diff = cv2.absdiff(s1, s2)
|
||||
error_ratio = np.sum(diff > 0) / s1.size
|
||||
|
||||
if error_ratio < best_error:
|
||||
best_error = error_ratio
|
||||
best_offset = len(unique_measures) - past_idx
|
||||
|
||||
# If the literal printed number matches perfectly, we securely anchor Here!
|
||||
if best_error < 0.15:
|
||||
new_start_offset = best_offset
|
||||
anchored = True
|
||||
|
||||
# Fallback for pages entirely devoid of explicit numbering
|
||||
if not anchored:
|
||||
bin_first = get_clean_binary(first_m)
|
||||
for scan_dist in range(1, min(5, len(unique_measures) + 1)):
|
||||
past_idx = len(unique_measures) - scan_dist
|
||||
past_m = unique_measures[past_idx]
|
||||
bin_past = get_clean_binary(past_m)
|
||||
|
||||
if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
|
||||
hs = min(bin_first.shape[0], bin_past.shape[0])
|
||||
ws = min(bin_first.shape[1], bin_past.shape[1])
|
||||
s1 = bin_first[:hs, :ws]
|
||||
s2 = bin_past[:hs, :ws]
|
||||
|
||||
diff = cv2.absdiff(s1, s2)
|
||||
error_ratio = np.sum(diff > 0) / s1.size
|
||||
|
||||
if error_ratio < best_error:
|
||||
best_error = error_ratio
|
||||
best_offset = len(unique_measures) - past_idx
|
||||
|
||||
if best_error < 0.15:
|
||||
new_start_offset = best_offset
|
||||
anchored = True
|
||||
|
||||
if anchored and new_start_offset < len(page_measures):
|
||||
# Middle append
|
||||
unique_measures.extend(page_measures[new_start_offset:])
|
||||
elif not anchored:
|
||||
unique_measures.extend(page_measures)
|
||||
|
||||
print(f" -> 동기화 중복 제거 완료: Number Sprite 타임라인 기반 {len(unique_measures)}개 마디 보존")
|
||||
|
||||
final_chunks = []
|
||||
current_row_measures = []
|
||||
current_row_width = 0
|
||||
|
||||
for measure_img in unique_measures:
|
||||
measure_w = measure_img.shape[1]
|
||||
|
||||
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
|
||||
row_img = np.hstack(current_row_measures)
|
||||
pad_w = chunk_width - row_img.shape[1]
|
||||
if pad_w > 0:
|
||||
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
|
||||
row_img = np.hstack([row_img, pad_img])
|
||||
final_chunks.append(row_img)
|
||||
current_row_measures = [measure_img]
|
||||
current_row_width = measure_w
|
||||
else:
|
||||
current_row_measures.append(measure_img)
|
||||
current_row_width += measure_w
|
||||
|
||||
if current_row_measures:
|
||||
row_img = np.hstack(current_row_measures)
|
||||
if row_img.shape[1] > chunk_width:
|
||||
row_img = row_img[:, :chunk_width]
|
||||
else:
|
||||
pad_w = chunk_width - row_img.shape[1]
|
||||
if pad_w > 0:
|
||||
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
|
||||
row_img = np.hstack([row_img, pad_img])
|
||||
final_chunks.append(row_img)
|
||||
|
||||
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
|
||||
return final_chunks
|
||||
"""
|
||||
|
||||
# Regex locating the current extract_unique_scroll definition: from its exact
# signature, lazily (.*?) up to the first following "return final_chunks".
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'

# The replacement is passed as a callable so new_func is inserted verbatim; a
# plain replacement string would have any backslash sequences interpreted by re.
# re.subn also returns the number of replacements, so a signature that no longer
# matches is reported instead of silently rewriting the file unchanged.
new_code, replaced = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if replaced == 0:
    raise SystemExit("extract_unique_scroll not found in youtube_tab_to_pdf.py; nothing patched.")

with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Patched.")
|
||||
145
scripts/debug/patch_extractor_with_tracker.py
Normal file
@@ -0,0 +1,145 @@
|
||||
import re
|
||||
|
||||
# Load the current source of the module that is about to be patched.
with open('youtube_tab_to_pdf.py', mode='r', encoding='utf-8') as src_file:
    code = src_file.read()
|
||||
|
||||
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
|
||||
print(f"[4/5] 순차 페이지 분할 기반 추출 중...")
|
||||
|
||||
strip_tops, strip_bottoms = [], []
|
||||
for frame in frames[:50]:
|
||||
strip = _find_white_tab_strip(frame)
|
||||
if strip:
|
||||
strip_tops.append(strip[0])
|
||||
strip_bottoms.append(strip[1])
|
||||
|
||||
if not strip_tops: return []
|
||||
|
||||
median_top = int(np.median(strip_tops))
|
||||
median_bottom = int(np.median(strip_bottoms))
|
||||
|
||||
# 5% 픽셀 변화를 통해 페이지가 넘어가는 장면(Scene)만 정지 화면으로 추출 (모션 블러 프레임 제거)
|
||||
tracker = TemporalTracker(diff_threshold=0.05)
|
||||
|
||||
for frame in frames:
|
||||
h = frame.shape[0]
|
||||
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
|
||||
if not _has_tab_content(tab_crop):
|
||||
continue
|
||||
tracker.process_frame(tab_crop)
|
||||
|
||||
unique_pages = tracker.get_unique_pages()
|
||||
if not unique_pages: return []
|
||||
|
||||
unique_measures = []
|
||||
chunk_width = 1280
|
||||
|
||||
def get_clean_binary(img):
|
||||
gray = np.max(img, axis=2)
|
||||
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
|
||||
return binary
|
||||
|
||||
for page_idx, page in enumerate(unique_pages):
|
||||
gray_page = _extract_print_channel(page)
|
||||
bar_coords = _detect_measure_bars(gray_page)
|
||||
|
||||
if not bar_coords: continue
|
||||
|
||||
coords = [0] + bar_coords + [page.shape[1]]
|
||||
coords = sorted(list(set(coords)))
|
||||
|
||||
page_measures = []
|
||||
for i in range(len(coords) - 1):
|
||||
x_start = coords[i]
|
||||
x_end = coords[i+1]
|
||||
if x_end - x_start < 40: continue
|
||||
page_measures.append(page[:, x_start:x_end])
|
||||
|
||||
if not page_measures: continue
|
||||
|
||||
if not unique_measures:
|
||||
unique_measures.extend(page_measures)
|
||||
continue
|
||||
|
||||
first_m = page_measures[0]
|
||||
bin_first = get_clean_binary(first_m)
|
||||
|
||||
best_error = 1.0
|
||||
best_offset = 0
|
||||
anchored = False
|
||||
|
||||
for scan_dist in range(1, min(10, len(unique_measures) + 1)):
|
||||
past_idx = len(unique_measures) - scan_dist
|
||||
past_m = unique_measures[past_idx]
|
||||
bin_past = get_clean_binary(past_m)
|
||||
|
||||
if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
|
||||
hs = min(bin_first.shape[0], bin_past.shape[0])
|
||||
ws = min(bin_first.shape[1], bin_past.shape[1])
|
||||
s1 = bin_first[:hs, :ws]
|
||||
s2 = bin_past[:hs, :ws]
|
||||
|
||||
diff = cv2.absdiff(s1, s2)
|
||||
error_ratio = np.sum(diff > 0) / s1.size
|
||||
|
||||
if error_ratio < best_error:
|
||||
best_error = error_ratio
|
||||
best_offset = len(unique_measures) - past_idx
|
||||
|
||||
if best_error < 0.20:
|
||||
new_start_offset = best_offset
|
||||
anchored = True
|
||||
print(f" [Anchor] Page {page_idx} -> PDF offset {best_offset} (Best Error: {best_error:.4f})")
|
||||
else:
|
||||
print(f" [New Page] Page {page_idx} -> No Overlap (Best Error: {best_error:.4f})")
|
||||
|
||||
if anchored and new_start_offset < len(page_measures):
|
||||
# 중복된 오프셋만큼 건너뛰고 나머지 새 마디만 추가
|
||||
unique_measures.extend(page_measures[new_start_offset:])
|
||||
elif not anchored:
|
||||
# 겹침이 전혀 없으므로 전체 마디 추가
|
||||
unique_measures.extend(page_measures)
|
||||
|
||||
print(f" -> 동기화 중복 제거 완료: 무손실 정적 페이지 기반 {len(unique_measures)}개 연속 마디 보존")
|
||||
|
||||
final_chunks = []
|
||||
current_row_measures = []
|
||||
current_row_width = 0
|
||||
|
||||
for measure_img in unique_measures:
|
||||
measure_w = measure_img.shape[1]
|
||||
|
||||
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
|
||||
row_img = np.hstack(current_row_measures)
|
||||
pad_w = chunk_width - row_img.shape[1]
|
||||
if pad_w > 0:
|
||||
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
|
||||
row_img = np.hstack([row_img, pad_img])
|
||||
final_chunks.append(row_img)
|
||||
current_row_measures = [measure_img]
|
||||
current_row_width = measure_w
|
||||
else:
|
||||
current_row_measures.append(measure_img)
|
||||
current_row_width += measure_w
|
||||
|
||||
if current_row_measures:
|
||||
row_img = np.hstack(current_row_measures)
|
||||
if row_img.shape[1] > chunk_width:
|
||||
row_img = row_img[:, :chunk_width]
|
||||
else:
|
||||
pad_w = chunk_width - row_img.shape[1]
|
||||
if pad_w > 0:
|
||||
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
|
||||
row_img = np.hstack([row_img, pad_img])
|
||||
final_chunks.append(row_img)
|
||||
|
||||
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
|
||||
return final_chunks
|
||||
"""
|
||||
|
||||
# Regex locating the current extract_unique_scroll definition: from its exact
# signature, lazily (.*?) up to the first following "return final_chunks".
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'

# The replacement is passed as a callable so new_func is inserted verbatim; a
# plain replacement string would have any backslash sequences interpreted by re.
# re.subn also returns the number of replacements, so a signature that no longer
# matches is reported instead of silently rewriting the file unchanged.
new_code, replaced = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if replaced == 0:
    raise SystemExit("extract_unique_scroll not found in youtube_tab_to_pdf.py; nothing patched.")

with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Patched completely back to optimal tracking.")
|
||||
168
scripts/debug/patch_final_holy_grail.py
Normal file
@@ -0,0 +1,168 @@
|
||||
import re
|
||||
|
||||
# Load the current source of the module that is about to be patched.
with open('youtube_tab_to_pdf.py', mode='r', encoding='utf-8') as src_file:
    code = src_file.read()
|
||||
|
||||
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
|
||||
print(f"[4/5] 순차 Binarized-Tracker 정밀 추출 중...")
|
||||
|
||||
strip_tops, strip_bottoms = [], []
|
||||
for frame in frames[:50]:
|
||||
strip = _find_white_tab_strip(frame)
|
||||
if strip:
|
||||
strip_tops.append(strip[0])
|
||||
strip_bottoms.append(strip[1])
|
||||
|
||||
if not strip_tops: return []
|
||||
|
||||
median_top = int(np.median(strip_tops))
|
||||
median_bottom = int(np.median(strip_bottoms))
|
||||
|
||||
def get_clean_binary(img):
|
||||
gray = np.max(img, axis=2)
|
||||
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
|
||||
return binary
|
||||
|
||||
# The Holy Grail Tracker: Feed it ONLY the pure 200-threshold binary mask.
|
||||
# The hand is gone. Only the white staff lines and notes exist.
|
||||
# When the page flips, the notes change position, creating a very small but undeniable structural pixel diff.
|
||||
# We use a highly sensitive 0.015 (1.5%) threshold to perfectly catch thin notes transitioning!
|
||||
tracker = TemporalTracker(diff_threshold=0.015)
|
||||
|
||||
# Store associations so we can retrieve the original BGR page later
|
||||
clean_to_bgr = []
|
||||
|
||||
for frame in frames:
|
||||
h = frame.shape[0]
|
||||
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
|
||||
if not _has_tab_content(tab_crop):
|
||||
continue
|
||||
|
||||
clean_bin = get_clean_binary(tab_crop)
|
||||
# tracker will process the pure binary structural image
|
||||
diff = 0.0
|
||||
if tracker.last_frame is not None:
|
||||
raw_diff = cv2.absdiff(clean_bin, tracker.last_frame)
|
||||
non_zero_ratio = np.count_nonzero(raw_diff) / clean_bin.size
|
||||
if non_zero_ratio > tracker.diff_threshold:
|
||||
tracker.unique_pages.append(clean_bin)
|
||||
clean_to_bgr.append(tab_crop)
|
||||
tracker.last_frame = clean_bin.copy()
|
||||
else:
|
||||
tracker.unique_pages.append(clean_bin)
|
||||
clean_to_bgr.append(tab_crop)
|
||||
tracker.last_frame = clean_bin.copy()
|
||||
|
||||
unique_pages = clean_to_bgr
|
||||
if not unique_pages: return []
|
||||
|
||||
print(f" -> {len(unique_pages)}개의 고유 정적 페이지 캡처 완료. 3-마디 역탐색 동기화 시작...")
|
||||
|
||||
unique_measures = []
|
||||
chunk_width = 1280
|
||||
|
||||
for page_idx, page in enumerate(unique_pages):
|
||||
gray_page = _extract_print_channel(page)
|
||||
bar_coords = _detect_measure_bars(gray_page)
|
||||
|
||||
if not bar_coords: continue
|
||||
|
||||
coords = [0] + bar_coords + [page.shape[1]]
|
||||
coords = sorted(list(set(coords)))
|
||||
|
||||
page_measures = []
|
||||
for i in range(len(coords) - 1):
|
||||
x_start = coords[i]
|
||||
x_end = coords[i+1]
|
||||
if x_end - x_start < 40: continue
|
||||
page_measures.append(page[:, x_start:x_end])
|
||||
|
||||
if not page_measures: continue
|
||||
|
||||
if not unique_measures:
|
||||
unique_measures.extend(page_measures)
|
||||
continue
|
||||
|
||||
first_m = page_measures[0]
|
||||
bin_first = get_clean_binary(first_m)
|
||||
|
||||
best_error = 1.0
|
||||
best_offset = 0
|
||||
anchored = False
|
||||
|
||||
# We limit the search distance to EXACTLY 3 measures.
|
||||
# This completely cures Time-Traveling overlaps caused by M10 matching identical M2.
|
||||
# A page flip overlap can NEVER be further back than the immediately previous page's length.
|
||||
for scan_dist in range(1, min(4, len(unique_measures) + 1)):
|
||||
past_idx = len(unique_measures) - scan_dist
|
||||
past_m = unique_measures[past_idx]
|
||||
bin_past = get_clean_binary(past_m)
|
||||
|
||||
if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
|
||||
hs = min(bin_first.shape[0], bin_past.shape[0])
|
||||
ws = min(bin_first.shape[1], bin_past.shape[1])
|
||||
s1 = bin_first[:hs, :ws]
|
||||
s2 = bin_past[:hs, :ws]
|
||||
|
||||
diff = cv2.absdiff(s1, s2)
|
||||
error_ratio = np.sum(diff > 0) / s1.size
|
||||
|
||||
if error_ratio < best_error:
|
||||
best_error = error_ratio
|
||||
best_offset = len(unique_measures) - past_idx
|
||||
|
||||
if best_error < 0.20:
|
||||
new_start_offset = best_offset
|
||||
anchored = True
|
||||
print(f" [Anchor] Page {page_idx} -> PDF offset {best_offset} (Best Error: {best_error:.4f})")
|
||||
else:
|
||||
print(f" [New Page] Page {page_idx} -> No Overlap (Best Error: {best_error:.4f})")
|
||||
|
||||
if anchored and new_start_offset < len(page_measures):
|
||||
unique_measures.extend(page_measures[new_start_offset:])
|
||||
elif not anchored:
|
||||
unique_measures.extend(page_measures)
|
||||
|
||||
print(f" -> 동기화 중복 제거 완료: 무손실 정적 페이지 기반 {len(unique_measures)}개 연속 마디 보존")
|
||||
|
||||
final_chunks = []
|
||||
current_row_measures = []
|
||||
current_row_width = 0
|
||||
|
||||
for measure_img in unique_measures:
|
||||
measure_w = measure_img.shape[1]
|
||||
|
||||
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
|
||||
row_img = np.hstack(current_row_measures)
|
||||
pad_w = chunk_width - row_img.shape[1]
|
||||
if pad_w > 0:
|
||||
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
|
||||
row_img = np.hstack([row_img, pad_img])
|
||||
final_chunks.append(row_img)
|
||||
current_row_measures = [measure_img]
|
||||
current_row_width = measure_w
|
||||
else:
|
||||
current_row_measures.append(measure_img)
|
||||
current_row_width += measure_w
|
||||
|
||||
if current_row_measures:
|
||||
row_img = np.hstack(current_row_measures)
|
||||
if row_img.shape[1] > chunk_width:
|
||||
row_img = row_img[:, :chunk_width]
|
||||
else:
|
||||
pad_w = chunk_width - row_img.shape[1]
|
||||
if pad_w > 0:
|
||||
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
|
||||
row_img = np.hstack([row_img, pad_img])
|
||||
final_chunks.append(row_img)
|
||||
|
||||
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
|
||||
return final_chunks
|
||||
"""
|
||||
|
||||
# Regex locating the current extract_unique_scroll definition: from its exact
# signature, lazily (.*?) up to the first following "return final_chunks".
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'

# The replacement is passed as a callable so new_func is inserted verbatim; a
# plain replacement string would have any backslash sequences interpreted by re.
# re.subn also returns the number of replacements, so a signature that no longer
# matches is reported instead of silently rewriting the file unchanged.
new_code, replaced = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if replaced == 0:
    raise SystemExit("extract_unique_scroll not found in youtube_tab_to_pdf.py; nothing patched.")

with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Holy Grail Pipeline Embedded.")
|
||||
160
scripts/debug/patch_final_holy_matrix.py
Normal file
@@ -0,0 +1,160 @@
|
||||
import re
|
||||
|
||||
# Load the current source of the module that is about to be patched.
with open('youtube_tab_to_pdf.py', mode='r', encoding='utf-8') as src_file:
    code = src_file.read()
|
||||
|
||||
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
|
||||
print(f"[4/5] 순차 Stable-Blurred-Matrix 추출 중...")
|
||||
|
||||
strip_tops, strip_bottoms = [], []
|
||||
for frame in frames[:50]:
|
||||
strip = _find_white_tab_strip(frame)
|
||||
if strip:
|
||||
strip_tops.append(strip[0])
|
||||
strip_bottoms.append(strip[1])
|
||||
|
||||
if not strip_tops: return []
|
||||
|
||||
median_top = int(np.median(strip_tops))
|
||||
median_bottom = int(np.median(strip_bottoms))
|
||||
|
||||
def get_clean_binary(img):
|
||||
gray = np.max(img, axis=2)
|
||||
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
|
||||
return binary
|
||||
|
||||
unique_measures = []
|
||||
chunk_width = 1280
|
||||
|
||||
last_1fps_bin = None
|
||||
last_solid_page = None
|
||||
|
||||
for frame_idx, frame in enumerate(frames):
|
||||
h = frame.shape[0]
|
||||
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
|
||||
if not _has_tab_content(tab_crop):
|
||||
continue
|
||||
|
||||
clean_bin = get_clean_binary(tab_crop)
|
||||
|
||||
if last_1fps_bin is not None:
|
||||
diff = cv2.absdiff(clean_bin, last_1fps_bin)
|
||||
error = np.count_nonzero(diff) / clean_bin.size
|
||||
if error < 0.05:
|
||||
has_changed_since_last_solid = True
|
||||
|
||||
if last_solid_page is not None:
|
||||
s_diff = cv2.absdiff(clean_bin, last_solid_page)
|
||||
s_err = np.count_nonzero(s_diff) / clean_bin.size
|
||||
if s_err < 0.05:
|
||||
has_changed_since_last_solid = False
|
||||
|
||||
if has_changed_since_last_solid:
|
||||
last_solid_page = clean_bin.copy()
|
||||
|
||||
gray_page = _extract_print_channel(tab_crop)
|
||||
bar_coords = _detect_measure_bars(gray_page)
|
||||
|
||||
if bar_coords:
|
||||
coords = [0] + bar_coords + [tab_crop.shape[1]]
|
||||
coords = sorted(list(set(coords)))
|
||||
|
||||
page_measures = []
|
||||
for i in range(len(coords) - 1):
|
||||
x_start = coords[i]
|
||||
x_end = coords[i+1]
|
||||
if x_end - x_start < 40: continue
|
||||
page_measures.append(tab_crop[:, x_start:x_end])
|
||||
|
||||
if page_measures:
|
||||
if not unique_measures:
|
||||
unique_measures.extend(page_measures)
|
||||
else:
|
||||
first_m = page_measures[0]
|
||||
bin_first = get_clean_binary(first_m)
|
||||
blurred_first = cv2.GaussianBlur(bin_first, (7, 7), 0)
|
||||
|
||||
best_val = 0.0
|
||||
best_offset = 0
|
||||
anchored = False
|
||||
|
||||
# Deep Scan Deduplication explicitly disabled to prevent repeating choruses wiping out the PDF timeline!
|
||||
# scan_dist=4 ensures we only match the immediately preceding page-flip overlap.
|
||||
for scan_dist in range(1, min(4, len(unique_measures) + 1)):
|
||||
past_idx = len(unique_measures) - scan_dist
|
||||
past_m = unique_measures[past_idx]
|
||||
bin_past = get_clean_binary(past_m)
|
||||
blurred_past = cv2.GaussianBlur(bin_past, (7, 7), 0)
|
||||
|
||||
if abs(blurred_first.shape[1] - blurred_past.shape[1]) <= 30:
|
||||
hs = min(blurred_first.shape[0], blurred_past.shape[0])
|
||||
ws = min(blurred_first.shape[1], blurred_past.shape[1])
|
||||
s1 = blurred_first[:hs, :ws]
|
||||
s2 = blurred_past[:hs, :ws]
|
||||
|
||||
template = s1[10:-10, 10:-10]
|
||||
if template.shape[0] >= 10 and template.shape[1] >= 10:
|
||||
res = cv2.matchTemplate(s2, template, cv2.TM_CCOEFF_NORMED)
|
||||
# Using cv2.minMaxLoc inside the result matrix to find any peak (subpixel shifting tolerance)
|
||||
_, max_val, _, _ = cv2.minMaxLoc(res)
|
||||
|
||||
if max_val > best_val:
|
||||
best_val = max_val
|
||||
best_offset = len(unique_measures) - past_idx
|
||||
|
||||
if best_val > 0.85:
|
||||
print(f" [Anchor] Page Matched -> PDF offset {best_offset} (Confidence: {best_val:.2f})")
|
||||
new_start_offset = best_offset
|
||||
anchored = True
|
||||
|
||||
if anchored and new_start_offset < len(page_measures):
|
||||
unique_measures.extend(page_measures[new_start_offset:])
|
||||
elif not anchored:
|
||||
print(f" [New Page] No recent overlap (Confidence: {best_val:.2f})")
|
||||
unique_measures.extend(page_measures)
|
||||
|
||||
last_1fps_bin = clean_bin.copy()
|
||||
|
||||
print(f" -> 동기화 중복 제거 완료: Stability-Blur 기반 {len(unique_measures)}개 마디 보존")
|
||||
|
||||
final_chunks = []
|
||||
current_row_measures = []
|
||||
current_row_width = 0
|
||||
|
||||
for measure_img in unique_measures:
|
||||
measure_w = measure_img.shape[1]
|
||||
|
||||
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
|
||||
row_img = np.hstack(current_row_measures)
|
||||
pad_w = chunk_width - row_img.shape[1]
|
||||
if pad_w > 0:
|
||||
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
|
||||
row_img = np.hstack([row_img, pad_img])
|
||||
final_chunks.append(row_img)
|
||||
current_row_measures = [measure_img]
|
||||
current_row_width = measure_w
|
||||
else:
|
||||
current_row_measures.append(measure_img)
|
||||
current_row_width += measure_w
|
||||
|
||||
if current_row_measures:
|
||||
row_img = np.hstack(current_row_measures)
|
||||
if row_img.shape[1] > chunk_width:
|
||||
row_img = row_img[:, :chunk_width]
|
||||
else:
|
||||
pad_w = chunk_width - row_img.shape[1]
|
||||
if pad_w > 0:
|
||||
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
|
||||
row_img = np.hstack([row_img, pad_img])
|
||||
final_chunks.append(row_img)
|
||||
|
||||
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
|
||||
return final_chunks
|
||||
"""
|
||||
|
||||
# Regex locating the current extract_unique_scroll definition: from its exact
# signature, lazily (.*?) up to the first following "return final_chunks".
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'

# The replacement is passed as a callable so new_func is inserted verbatim; a
# plain replacement string would have any backslash sequences interpreted by re.
# re.subn also returns the number of replacements, so a signature that no longer
# matches is reported instead of silently rewriting the file unchanged.
new_code, replaced = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if replaced == 0:
    raise SystemExit("extract_unique_scroll not found in youtube_tab_to_pdf.py; nothing patched.")

with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)
print("Stable-Blurred-Matrix Patched.")
|
||||
198
scripts/debug/patch_final_holy_sprite.py
Normal file
@@ -0,0 +1,198 @@
|
||||
import re
|
||||
|
||||
# Load the current source of the module that is about to be patched.
with open('youtube_tab_to_pdf.py', mode='r', encoding='utf-8') as src_file:
    code = src_file.read()
|
||||
|
||||
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
|
||||
print(f"[4/5] 순차 Stable Content Trigger + Number Sprite 추출 중...")
|
||||
|
||||
strip_tops, strip_bottoms = [], []
|
||||
for frame in frames[:50]:
|
||||
strip = _find_white_tab_strip(frame)
|
||||
if strip:
|
||||
strip_tops.append(strip[0])
|
||||
strip_bottoms.append(strip[1])
|
||||
|
||||
if not strip_tops: return []
|
||||
|
||||
median_top = int(np.median(strip_tops))
|
||||
median_bottom = int(np.median(strip_bottoms))
|
||||
|
||||
def get_clean_binary(img):
|
||||
gray = np.max(img, axis=2)
|
||||
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
|
||||
return binary
|
||||
|
||||
def get_number_sprite(m_img):
|
||||
gray = np.max(m_img, axis=2)
|
||||
_, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
|
||||
row_sums = np.sum(thresh, axis=1) / 255
|
||||
staff_lines = np.where(row_sums > m_img.shape[1] * 0.5)[0]
|
||||
y_staff = staff_lines[0] if len(staff_lines) > 0 else 50
|
||||
|
||||
crop_y1 = max(0, y_staff - 35)
|
||||
crop_y2 = max(0, y_staff - 2)
|
||||
crop_x1 = 0
|
||||
crop_x2 = min(60, m_img.shape[1])
|
||||
|
||||
if crop_y2 <= crop_y1 or crop_x2 <= crop_x1: return None
|
||||
sprite = thresh[crop_y1:crop_y2, crop_x1:crop_x2]
|
||||
if np.count_nonzero(sprite > 127) < 8: return None
|
||||
return sprite
|
||||
|
||||
unique_measures = []
|
||||
chunk_width = 1280
|
||||
|
||||
last_1fps_bin = None
|
||||
last_solid_page = None
|
||||
|
||||
for frame_idx, frame in enumerate(frames):
|
||||
h = frame.shape[0]
|
||||
tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
|
||||
if not _has_tab_content(tab_crop):
|
||||
continue
|
||||
|
||||
clean_bin = get_clean_binary(tab_crop)
|
||||
|
||||
if last_1fps_bin is not None:
|
||||
diff = cv2.absdiff(clean_bin, last_1fps_bin)
|
||||
error = np.count_nonzero(diff) / clean_bin.size
|
||||
if error < 0.05:
|
||||
has_changed_since_last_solid = True
|
||||
|
||||
if last_solid_page is not None:
|
||||
s_diff = cv2.absdiff(clean_bin, last_solid_page)
|
||||
s_err = np.count_nonzero(s_diff) / clean_bin.size
|
||||
if s_err < 0.05:
|
||||
has_changed_since_last_solid = False
|
||||
|
||||
if has_changed_since_last_solid:
|
||||
last_solid_page = clean_bin.copy()
|
||||
|
||||
gray_page = _extract_print_channel(tab_crop)
|
||||
bar_coords = _detect_measure_bars(gray_page)
|
||||
|
||||
if bar_coords:
|
||||
coords = [0] + bar_coords + [tab_crop.shape[1]]
|
||||
coords = sorted(list(set(coords)))
|
||||
|
||||
page_measures = []
|
||||
for i in range(len(coords) - 1):
|
||||
x_start = coords[i]
|
||||
x_end = coords[i+1]
|
||||
if x_end - x_start < 40: continue
|
||||
page_measures.append(tab_crop[:, x_start:x_end])
|
||||
|
||||
if page_measures:
|
||||
if not unique_measures:
|
||||
unique_measures.extend(page_measures)
|
||||
else:
|
||||
first_m = page_measures[0]
|
||||
first_sprite = get_number_sprite(first_m)
|
||||
|
||||
best_val = 0.0
|
||||
best_offset = 0
|
||||
anchored = False
|
||||
|
||||
# Deep Scan Deduplication (find exact Number Sprite match)
|
||||
if first_sprite is not None:
|
||||
for scan_dist in range(1, len(unique_measures) + 1):
|
||||
past_idx = len(unique_measures) - scan_dist
|
||||
past_m = unique_measures[past_idx]
|
||||
past_sprite = get_number_sprite(past_m)
|
||||
|
||||
if past_sprite is not None:
|
||||
hs = min(first_sprite.shape[0], past_sprite.shape[0])
|
||||
ws = min(first_sprite.shape[1], past_sprite.shape[1])
|
||||
if hs > 5 and ws > 5:
|
||||
s1 = first_sprite[:hs, :ws]
|
||||
s2 = past_sprite[:hs, :ws]
|
||||
|
||||
template = s1[2:-2, 2:-2]
|
||||
if template.shape[0] >= 5 and template.shape[1] >= 5:
|
||||
res = cv2.matchTemplate(s2, template, cv2.TM_CCOEFF_NORMED)
|
||||
max_val = res[0][0]
|
||||
if max_val > best_val:
|
||||
best_val = max_val
|
||||
best_offset = len(unique_measures) - past_idx
|
||||
|
||||
if best_val > 0.85:
|
||||
print(f" [Sprite Anchor] Detected Measure {best_offset}! Ignoring duplicates.")
|
||||
new_start_offset = best_offset
|
||||
anchored = True
|
||||
|
||||
# Fallback geometric anchor for unlabeled pages (restricted back-scan)
|
||||
if not anchored:
|
||||
bin_first = get_clean_binary(first_m)
|
||||
best_err = 1.0
|
||||
for scan_dist in range(1, min(4, len(unique_measures) + 1)):
|
||||
past_idx = len(unique_measures) - scan_dist
|
||||
past_m = unique_measures[past_idx]
|
||||
bin_past = get_clean_binary(past_m)
|
||||
|
||||
if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
|
||||
hs = min(bin_first.shape[0], bin_past.shape[0])
|
||||
ws = min(bin_first.shape[1], bin_past.shape[1])
|
||||
s1 = bin_first[:hs, :ws]
|
||||
s2 = bin_past[:hs, :ws]
|
||||
|
||||
m_diff = cv2.absdiff(s1, s2)
|
||||
error_ratio = np.sum(m_diff > 0) / s1.size
|
||||
if error_ratio < best_err:
|
||||
best_err = error_ratio
|
||||
best_offset = len(unique_measures) - past_idx
|
||||
|
||||
if best_err < 0.15:
|
||||
new_start_offset = best_offset
|
||||
anchored = True
|
||||
|
||||
if anchored and new_start_offset < len(page_measures):
|
||||
unique_measures.extend(page_measures[new_start_offset:])
|
||||
elif not anchored:
|
||||
unique_measures.extend(page_measures)
|
||||
|
||||
last_1fps_bin = clean_bin.copy()
|
||||
|
||||
print(f" -> 동기화 중복 제거 완료: Stability 기반 {len(unique_measures)}개 마디 보존")
|
||||
|
||||
final_chunks = []
|
||||
current_row_measures = []
|
||||
current_row_width = 0
|
||||
|
||||
for measure_img in unique_measures:
|
||||
measure_w = measure_img.shape[1]
|
||||
|
||||
if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
|
||||
row_img = np.hstack(current_row_measures)
|
||||
pad_w = chunk_width - row_img.shape[1]
|
||||
if pad_w > 0:
|
||||
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
|
||||
row_img = np.hstack([row_img, pad_img])
|
||||
final_chunks.append(row_img)
|
||||
current_row_measures = [measure_img]
|
||||
current_row_width = measure_w
|
||||
else:
|
||||
current_row_measures.append(measure_img)
|
||||
current_row_width += measure_w
|
||||
|
||||
if current_row_measures:
|
||||
row_img = np.hstack(current_row_measures)
|
||||
if row_img.shape[1] > chunk_width:
|
||||
row_img = row_img[:, :chunk_width]
|
||||
else:
|
||||
pad_w = chunk_width - row_img.shape[1]
|
||||
if pad_w > 0:
|
||||
pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
|
||||
row_img = np.hstack([row_img, pad_img])
|
||||
final_chunks.append(row_img)
|
||||
|
||||
print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
|
||||
return final_chunks
|
||||
"""
|
||||
|
||||
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'
|
||||
new_code = re.sub(pattern, new_func, code, flags=re.DOTALL)
|
||||
|
||||
with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
|
||||
f.write(new_code)
|
||||
print("Stable Sprite Anchor Patched.")
|
||||
0
scripts/debug/patch_final_monotonic.py
Normal file
145
scripts/debug/patch_final_truth.py
Normal file
@@ -0,0 +1,145 @@
|
||||
import re
|
||||
|
||||
# One-off patch script: replaces extract_unique_scroll() in youtube_tab_to_pdf.py
# with the TemporalTracker keyframe implementation stored in new_func.
with open('youtube_tab_to_pdf.py', 'r', encoding='utf-8') as f:
    code = f.read()

# Replacement source. It relies on names that must already exist in the target
# module: List, np, cv2, SIMILARITY_THRESHOLD, TemporalTracker,
# _find_white_tab_strip, _has_tab_content, _extract_print_channel and
# _detect_measure_bars.
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
    print(f"[4/5] 순차 Keyframe 페이지 추출 중...")

    strip_tops, strip_bottoms = [], []
    for frame in frames[:50]:
        strip = _find_white_tab_strip(frame)
        if strip:
            strip_tops.append(strip[0])
            strip_bottoms.append(strip[1])

    if not strip_tops: return []

    median_top = int(np.median(strip_tops))
    median_bottom = int(np.median(strip_bottoms))

    # 1. 0.05 threshold Tracker to completely ignore all fade/blur frames and extract EXACTLY 13 keyframes
    tracker = TemporalTracker(diff_threshold=0.05)

    for frame in frames:
        h = frame.shape[0]
        tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
        if not _has_tab_content(tab_crop):
            continue
        tracker.process_frame(tab_crop)

    unique_pages = tracker.get_unique_pages()
    if not unique_pages: return []

    unique_measures = []
    chunk_width = 1280

    def get_clean_binary(img):
        gray = np.max(img, axis=2)
        _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
        return binary

    for page_idx, page in enumerate(unique_pages):
        gray_page = _extract_print_channel(page)
        bar_coords = _detect_measure_bars(gray_page)

        if not bar_coords: continue

        coords = [0] + bar_coords + [page.shape[1]]
        coords = sorted(list(set(coords)))

        page_measures = []
        for i in range(len(coords) - 1):
            x_start = coords[i]
            x_end = coords[i+1]
            if x_end - x_start < 40: continue
            page_measures.append(page[:, x_start:x_end])

        if not page_measures: continue

        if not unique_measures:
            unique_measures.extend(page_measures)
            continue

        first_m = page_measures[0]
        bin_first = get_clean_binary(first_m)

        best_error = 1.0
        best_offset = 0
        anchored = False

        # 3. CRUCIAL FIX: scan_dist limited to exactly 3.
        # Preventing M40 from visually matching M9 because Chorus repeats.
        for scan_dist in range(1, min(4, len(unique_measures) + 1)):
            past_idx = len(unique_measures) - scan_dist
            past_m = unique_measures[past_idx]
            bin_past = get_clean_binary(past_m)

            # 2. Binary Absdiff error < 0.20 for subpixel-immune, noise-immune math overlap matching
            if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
                hs = min(bin_first.shape[0], bin_past.shape[0])
                ws = min(bin_first.shape[1], bin_past.shape[1])
                s1 = bin_first[:hs, :ws]
                s2 = bin_past[:hs, :ws]

                diff = cv2.absdiff(s1, s2)
                error_ratio = np.sum(diff > 0) / s1.size

                if error_ratio < best_error:
                    best_error = error_ratio
                    best_offset = len(unique_measures) - past_idx

        if best_error < 0.20:
            new_start_offset = best_offset
            anchored = True

        if anchored and new_start_offset < len(page_measures):
            # Overlapped exactly at this point, only append the truly NEW measures
            unique_measures.extend(page_measures[new_start_offset:])
        elif not anchored:
            # Completely discrete page flip with no overlap, append all measures
            unique_measures.extend(page_measures)

    print(f" -> 동기화 중복 제거 완료: 무손실 정적 페이지 기반 {len(unique_measures)}개 연속 마디 보존")

    final_chunks = []
    current_row_measures = []
    current_row_width = 0

    for measure_img in unique_measures:
        measure_w = measure_img.shape[1]

        if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
            row_img = np.hstack(current_row_measures)
            pad_w = chunk_width - row_img.shape[1]
            if pad_w > 0:
                pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
                row_img = np.hstack([row_img, pad_img])
            final_chunks.append(row_img)
            current_row_measures = [measure_img]
            current_row_width = measure_w
        else:
            current_row_measures.append(measure_img)
            current_row_width += measure_w

    if current_row_measures:
        row_img = np.hstack(current_row_measures)
        if row_img.shape[1] > chunk_width:
            row_img = row_img[:, :chunk_width]
        else:
            pad_w = chunk_width - row_img.shape[1]
            if pad_w > 0:
                pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
                row_img = np.hstack([row_img, pad_img])
        final_chunks.append(row_img)

    print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
    return final_chunks
"""

# Matches the existing definition from its signature up to the first following
# "return final_chunks" (non-greedy, DOTALL spans newlines).
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'

# A callable replacement inserts new_func verbatim (a plain string would have its
# backslashes interpreted as escapes); subn also reports how many matches were replaced.
new_code, n_subs = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if n_subs == 0:
    # Do not rewrite the file or claim success when the target was not found.
    raise SystemExit("extract_unique_scroll() not found in youtube_tab_to_pdf.py; nothing patched.")

with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)

print("Final Truth Pipeline Patched.")
|
||||
156
scripts/debug/patch_holy_grail_fix.py
Normal file
@@ -0,0 +1,156 @@
|
||||
import re
|
||||
|
||||
# One-off patch script: replaces extract_unique_scroll() in youtube_tab_to_pdf.py
# with the inline binarized frame-difference implementation stored in new_func.
with open('youtube_tab_to_pdf.py', 'r', encoding='utf-8') as f:
    code = f.read()

# Replacement source. It relies on names that must already exist in the target
# module: List, np, cv2, SIMILARITY_THRESHOLD, _find_white_tab_strip,
# _has_tab_content, _extract_print_channel and _detect_measure_bars.
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
    print(f"[4/5] 순차 Binarized-Tracker 정밀 추출 중...")

    strip_tops, strip_bottoms = [], []
    for frame in frames[:50]:
        strip = _find_white_tab_strip(frame)
        if strip:
            strip_tops.append(strip[0])
            strip_bottoms.append(strip[1])

    if not strip_tops: return []

    median_top = int(np.median(strip_tops))
    median_bottom = int(np.median(strip_bottoms))

    def get_clean_binary(img):
        gray = np.max(img, axis=2)
        _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
        return binary

    diff_threshold = 0.015
    clean_to_bgr = []
    last_clean_bin = None

    for frame in frames:
        h = frame.shape[0]
        tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
        if not _has_tab_content(tab_crop):
            continue

        clean_bin = get_clean_binary(tab_crop)
        if last_clean_bin is not None:
            raw_diff = cv2.absdiff(clean_bin, last_clean_bin)
            non_zero_ratio = np.count_nonzero(raw_diff) / clean_bin.size
            if non_zero_ratio > diff_threshold:
                clean_to_bgr.append(tab_crop)
                last_clean_bin = clean_bin.copy()
        else:
            clean_to_bgr.append(tab_crop)
            last_clean_bin = clean_bin.copy()

    unique_pages = clean_to_bgr
    if not unique_pages: return []

    print(f" -> {len(unique_pages)}개의 고유 정적 페이지 캡처 완료. 3-마디 역탐색 동기화 시작...")

    unique_measures = []
    chunk_width = 1280

    for page_idx, page in enumerate(unique_pages):
        gray_page = _extract_print_channel(page)
        bar_coords = _detect_measure_bars(gray_page)

        if not bar_coords: continue

        coords = [0] + bar_coords + [page.shape[1]]
        coords = sorted(list(set(coords)))

        page_measures = []
        for i in range(len(coords) - 1):
            x_start = coords[i]
            x_end = coords[i+1]
            if x_end - x_start < 40: continue
            page_measures.append(page[:, x_start:x_end])

        if not page_measures: continue

        if not unique_measures:
            unique_measures.extend(page_measures)
            continue

        first_m = page_measures[0]
        bin_first = get_clean_binary(first_m)

        best_error = 1.0
        best_offset = 0
        anchored = False

        for scan_dist in range(1, min(4, len(unique_measures) + 1)):
            past_idx = len(unique_measures) - scan_dist
            past_m = unique_measures[past_idx]
            bin_past = get_clean_binary(past_m)

            if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
                hs = min(bin_first.shape[0], bin_past.shape[0])
                ws = min(bin_first.shape[1], bin_past.shape[1])
                s1 = bin_first[:hs, :ws]
                s2 = bin_past[:hs, :ws]

                diff = cv2.absdiff(s1, s2)
                error_ratio = np.sum(diff > 0) / s1.size

                if error_ratio < best_error:
                    best_error = error_ratio
                    best_offset = len(unique_measures) - past_idx

        if best_error < 0.20:
            new_start_offset = best_offset
            anchored = True
            print(f" [Anchor] Page {page_idx} -> PDF offset {best_offset} (Best Error: {best_error:.4f})")
        else:
            print(f" [New Page] Page {page_idx} -> No Overlap (Best Error: {best_error:.4f})")

        if anchored and new_start_offset < len(page_measures):
            unique_measures.extend(page_measures[new_start_offset:])
        elif not anchored:
            unique_measures.extend(page_measures)

    print(f" -> 동기화 중복 제거 완료: 무손실 정적 페이지 기반 {len(unique_measures)}개 연속 마디 보존")

    final_chunks = []
    current_row_measures = []
    current_row_width = 0

    for measure_img in unique_measures:
        measure_w = measure_img.shape[1]

        if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
            row_img = np.hstack(current_row_measures)
            pad_w = chunk_width - row_img.shape[1]
            if pad_w > 0:
                pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
                row_img = np.hstack([row_img, pad_img])
            final_chunks.append(row_img)
            current_row_measures = [measure_img]
            current_row_width = measure_w
        else:
            current_row_measures.append(measure_img)
            current_row_width += measure_w

    if current_row_measures:
        row_img = np.hstack(current_row_measures)
        if row_img.shape[1] > chunk_width:
            row_img = row_img[:, :chunk_width]
        else:
            pad_w = chunk_width - row_img.shape[1]
            if pad_w > 0:
                pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
                row_img = np.hstack([row_img, pad_img])
        final_chunks.append(row_img)

    print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
    return final_chunks
"""

# Matches the existing definition from its signature up to the first following
# "return final_chunks" (non-greedy, DOTALL spans newlines).
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'

# A callable replacement inserts new_func verbatim (a plain string would have its
# backslashes interpreted as escapes); subn also reports how many matches were replaced.
new_code, n_subs = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if n_subs == 0:
    # Do not rewrite the file or claim success when the target was not found.
    raise SystemExit("extract_unique_scroll() not found in youtube_tab_to_pdf.py; nothing patched.")

with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)

print("Holy Grail Pipeline Embedded Inline successfully!")
|
||||
180
scripts/debug/patch_ocr_sprite.py
Normal file
@@ -0,0 +1,180 @@
|
||||
import re
|
||||
|
||||
# One-off patch script: replaces extract_unique_scroll() in youtube_tab_to_pdf.py
# with the number-sprite template-anchor implementation stored in new_func.
with open('youtube_tab_to_pdf.py', 'r', encoding='utf-8') as f:
    code = f.read()

# Replacement source. It relies on names that must already exist in the target
# module: List, np, cv2, SIMILARITY_THRESHOLD, _find_white_tab_strip,
# _has_tab_content, _extract_print_channel and _detect_measure_bars.
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
    print(f"[4/5] 순차 Number Sprite Template 앵커 기반 마디 추출 중...")

    strip_tops, strip_bottoms = [], []
    for frame in frames[:50]:
        strip = _find_white_tab_strip(frame)
        if strip:
            strip_tops.append(strip[0])
            strip_bottoms.append(strip[1])

    if not strip_tops: return []

    median_top = int(np.median(strip_tops))
    median_bottom = int(np.median(strip_bottoms))

    unique_measures = []
    chunk_width = 1280

    def get_number_sprite(m_img):
        # Plain (non-inverted) binary threshold: pixels brighter than 200 become white (255)
        gray = np.max(m_img, axis=2)
        _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
        row_sums = np.sum(thresh, axis=1) / 255
        staff_lines = np.where(row_sums > m_img.shape[1] * 0.5)[0]
        y_staff = staff_lines[0] if len(staff_lines) > 0 else 50

        crop_y1 = max(0, y_staff - 35)
        crop_y2 = max(0, y_staff - 2)
        crop_x1 = 0
        crop_x2 = min(60, m_img.shape[1])

        if crop_y2 <= crop_y1 or crop_x2 <= crop_x1: return None
        sprite = thresh[crop_y1:crop_y2, crop_x1:crop_x2]

        # MUST BE STRICT: If there are fewer than 8 white pixels, it's a BLANK SPRITE.
        # Blank sprites caused the catastrophic 1->36 time-travel deletion!
        if np.count_nonzero(sprite > 127) < 8: return None
        return sprite

    for frame_idx, frame in enumerate(frames):
        h = frame.shape[0]
        tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
        if not _has_tab_content(tab_crop):
            continue

        gray_page = _extract_print_channel(tab_crop)
        bar_coords = _detect_measure_bars(gray_page)

        if not bar_coords: continue

        coords = [0] + bar_coords + [tab_crop.shape[1]]
        coords = sorted(list(set(coords)))

        page_measures = []
        for i in range(len(coords) - 1):
            x_start = coords[i]
            x_end = coords[i+1]
            if x_end - x_start < 40: continue
            page_measures.append(tab_crop[:, x_start:x_end])

        if not page_measures: continue

        if not unique_measures:
            unique_measures.extend(page_measures)
            first_sprite = get_number_sprite(page_measures[0])
            has_pixels = np.count_nonzero(first_sprite > 127) if first_sprite is not None else 0
            print(f" -> [초기화] 첫 프레임 배열 등록: {len(page_measures)}개 마디 (Sprite Pixels: {has_pixels})")
            continue

        first_m = page_measures[0]
        first_sprite = get_number_sprite(first_m)

        anchored = False
        new_start_offset = 0
        best_val = 0.0

        # Only attempt anchor if the first measure explicitly displays a sequence number.
        # If it's blank, we DO NOT blindly match it to other blank measures!
        if first_sprite is not None:
            # We can scan backwards up to 14 measures because clear Number Sprites are completely unique IDs.
            for scan_dist in range(1, min(15, len(unique_measures) + 1)):
                past_idx = len(unique_measures) - scan_dist
                past_m = unique_measures[past_idx]
                past_sprite = get_number_sprite(past_m)

                if past_sprite is not None:
                    hs = min(first_sprite.shape[0], past_sprite.shape[0])
                    ws = min(first_sprite.shape[1], past_sprite.shape[1])
                    s1 = first_sprite[:hs, :ws]
                    s2 = past_sprite[:hs, :ws]

                    template = s1[2:-2, 2:-2]
                    if template.shape[0] >= 5 and template.shape[1] >= 5:
                        res = cv2.matchTemplate(s2, template, cv2.TM_CCOEFF_NORMED)
                        # Best score over every placement of the trimmed template;
                        # res[0][0] only scored the top-left placement.
                        max_val = float(res.max())

                        if max_val > best_val:
                            best_val = max_val
                            new_start_offset = len(unique_measures) - past_idx

            if best_val > 0.85:
                anchored = True

        # If we failed to anchor via Sprite (maybe this page has no numbers at all),
        # we fallback to strict whole-measure Template Matching (TM_CCOEFF_NORMED on greyscale prints to survive subpixel scroll drift)
        if not anchored:
            bin_first = _extract_print_channel(first_m) # greyscale thresholded
            for scan_dist in range(1, min(4, len(unique_measures) + 1)): # at most 3 measures back, to prevent musical loops
                past_idx = len(unique_measures) - scan_dist
                past_m = unique_measures[past_idx]
                bin_past = _extract_print_channel(past_m)

                if abs(bin_first.shape[1] - bin_past.shape[1]) <= 30:
                    hs = min(bin_first.shape[0], bin_past.shape[0])
                    ws = min(bin_first.shape[1], bin_past.shape[1])
                    s1 = bin_first[:hs, :ws]
                    s2 = bin_past[:hs, :ws]

                    template = s1[10:-10, 10:-10]
                    if template.shape[0] >= 10 and template.shape[1] >= 10:
                        res = cv2.matchTemplate(s2, template, cv2.TM_CCOEFF_NORMED)
                        # Best score over every placement of the trimmed template;
                        # res[0][0] only scored the top-left placement.
                        max_val = float(res.max())
                        if max_val > 0.85:
                            new_start_offset = len(unique_measures) - past_idx
                            anchored = True
                            break

        if anchored and new_start_offset < len(page_measures):
            unique_measures.extend(page_measures[new_start_offset:])
        elif not anchored:
            unique_measures.extend(page_measures)

    print(f" -> 동기화 중복 제거 완료: Number Sprite 시계열 기반 {len(unique_measures)}개 마디 보존")

    final_chunks = []
    current_row_measures = []
    current_row_width = 0

    for measure_img in unique_measures:
        measure_w = measure_img.shape[1]

        if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
            row_img = np.hstack(current_row_measures)
            pad_w = chunk_width - row_img.shape[1]
            if pad_w > 0:
                pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
                row_img = np.hstack([row_img, pad_img])
            final_chunks.append(row_img)
            current_row_measures = [measure_img]
            current_row_width = measure_w
        else:
            current_row_measures.append(measure_img)
            current_row_width += measure_w

    if current_row_measures:
        row_img = np.hstack(current_row_measures)
        if row_img.shape[1] > chunk_width:
            row_img = row_img[:, :chunk_width]
        else:
            pad_w = chunk_width - row_img.shape[1]
            if pad_w > 0:
                pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
                row_img = np.hstack([row_img, pad_img])
        final_chunks.append(row_img)

    print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
    return final_chunks
"""

# Matches the existing definition from its signature up to the first following
# "return final_chunks" (non-greedy, DOTALL spans newlines).
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'

# A callable replacement inserts new_func verbatim (a plain string would have its
# backslashes interpreted as escapes); subn also reports how many matches were replaced.
new_code, n_subs = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if n_subs == 0:
    # Do not rewrite the file or claim success when the target was not found.
    raise SystemExit("extract_unique_scroll() not found in youtube_tab_to_pdf.py; nothing patched.")

with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)

print("Supreme Logic Embedded.")
|
||||
153
scripts/debug/patch_stable_trigger.py
Normal file
@@ -0,0 +1,153 @@
|
||||
import re
|
||||
|
||||
# One-off patch script: replaces extract_unique_scroll() in youtube_tab_to_pdf.py
# with the stable-content-trigger implementation stored in new_func.
with open('youtube_tab_to_pdf.py', 'r', encoding='utf-8') as f:
    code = f.read()

# Replacement source. It relies on names that must already exist in the target
# module: List, np, cv2, SIMILARITY_THRESHOLD, _find_white_tab_strip,
# _has_tab_content, _extract_print_channel and _detect_measure_bars.
new_func = """def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
    print(f"[4/5] 순차 Stable Content Trigger 방식 추출 중...")

    strip_tops, strip_bottoms = [], []
    for frame in frames[:50]:
        strip = _find_white_tab_strip(frame)
        if strip:
            strip_tops.append(strip[0])
            strip_bottoms.append(strip[1])

    if not strip_tops: return []

    median_top = int(np.median(strip_tops))
    median_bottom = int(np.median(strip_bottoms))

    def get_clean_binary(img):
        gray = np.max(img, axis=2)
        _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
        return binary

    unique_measures = []
    chunk_width = 1280

    last_1fps_bin = None
    last_solid_page = None

    for frame_idx, frame in enumerate(frames):
        h = frame.shape[0]
        tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
        if not _has_tab_content(tab_crop):
            continue

        clean_bin = get_clean_binary(tab_crop)

        if last_1fps_bin is not None:
            # Check stability compared to 1 second ago
            diff = cv2.absdiff(clean_bin, last_1fps_bin)
            error = np.count_nonzero(diff) / clean_bin.size
            if error < 0.05: # Page is fully stabilized (not a fading transition)
                has_changed_since_last_solid = True

                if last_solid_page is not None:
                    s_diff = cv2.absdiff(clean_bin, last_solid_page)
                    s_err = np.count_nonzero(s_diff) / clean_bin.size
                    if s_err < 0.05:
                        has_changed_since_last_solid = False

                # We only process this page if it's securely stable AND we haven't already processed it
                if has_changed_since_last_solid:
                    last_solid_page = clean_bin.copy()

                    # Extract measures
                    gray_page = _extract_print_channel(tab_crop)
                    bar_coords = _detect_measure_bars(gray_page)

                    if bar_coords:
                        coords = [0] + bar_coords + [tab_crop.shape[1]]
                        coords = sorted(list(set(coords)))

                        page_measures = []
                        for i in range(len(coords) - 1):
                            x_start = coords[i]
                            x_end = coords[i+1]
                            if x_end - x_start < 40: continue
                            page_measures.append(tab_crop[:, x_start:x_end])

                        if page_measures:
                            if not unique_measures:
                                unique_measures.extend(page_measures)
                            else:
                                first_m = page_measures[0]
                                bin_first = get_clean_binary(first_m)

                                best_error = 1.0
                                best_offset = 0
                                anchored = False

                                # scan_dist=4 ensures we never loop back to identical repeating choruses from 10 seconds ago!
                                for scan_dist in range(1, min(4, len(unique_measures) + 1)):
                                    past_idx = len(unique_measures) - scan_dist
                                    past_m = unique_measures[past_idx]
                                    bin_past = get_clean_binary(past_m)

                                    if abs(bin_first.shape[1] - bin_past.shape[1]) <= 25:
                                        hs = min(bin_first.shape[0], bin_past.shape[0])
                                        ws = min(bin_first.shape[1], bin_past.shape[1])
                                        s1 = bin_first[:hs, :ws]
                                        s2 = bin_past[:hs, :ws]

                                        m_diff = cv2.absdiff(s1, s2)
                                        error_ratio = np.sum(m_diff > 0) / s1.size

                                        if error_ratio < best_error:
                                            best_error = error_ratio
                                            best_offset = len(unique_measures) - past_idx

                                if best_error < 0.15:
                                    new_start_offset = best_offset
                                    if new_start_offset < len(page_measures):
                                        unique_measures.extend(page_measures[new_start_offset:])
                                else:
                                    unique_measures.extend(page_measures)

        last_1fps_bin = clean_bin.copy()

    print(f" -> 동기화 중복 제거 완료: Stability 기반 {len(unique_measures)}개 마디 보존")

    final_chunks = []
    current_row_measures = []
    current_row_width = 0

    for measure_img in unique_measures:
        measure_w = measure_img.shape[1]

        if current_row_width + measure_w > chunk_width and len(current_row_measures) > 0:
            row_img = np.hstack(current_row_measures)
            pad_w = chunk_width - row_img.shape[1]
            if pad_w > 0:
                pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
                row_img = np.hstack([row_img, pad_img])
            final_chunks.append(row_img)
            current_row_measures = [measure_img]
            current_row_width = measure_w
        else:
            current_row_measures.append(measure_img)
            current_row_width += measure_w

    if current_row_measures:
        row_img = np.hstack(current_row_measures)
        if row_img.shape[1] > chunk_width:
            row_img = row_img[:, :chunk_width]
        else:
            pad_w = chunk_width - row_img.shape[1]
            if pad_w > 0:
                pad_img = np.full((row_img.shape[0], pad_w, 3), 255, dtype=np.uint8)
                row_img = np.hstack([row_img, pad_img])
        final_chunks.append(row_img)

    print(f" -> A4 분할 컷: {len(final_chunks)}개 줄(Row)")
    return final_chunks
"""

# Matches the existing definition from its signature up to the first following
# "return final_chunks" (non-greedy, DOTALL spans newlines).
pattern = r'def extract_unique_scroll\(frames: List\[np\.ndarray\], threshold: float = SIMILARITY_THRESHOLD\) -> List\[np\.ndarray\]:.*?return final_chunks'

# A callable replacement inserts new_func verbatim (a plain string would have its
# backslashes interpreted as escapes); subn also reports how many matches were replaced.
new_code, n_subs = re.subn(pattern, lambda _match: new_func, code, flags=re.DOTALL)
if n_subs == 0:
    # Do not rewrite the file or claim success when the target was not found.
    raise SystemExit("extract_unique_scroll() not found in youtube_tab_to_pdf.py; nothing patched.")

with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.write(new_code)

print("Stable Content Trigger Patched.")
|
||||
80
scripts/debug/patch_tracker.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import sys
|
||||
|
||||
# One-off patch script: rewrites youtube_tab_to_pdf.py line by line so that
# extract_unique_scroll() uses TemporalTracker, and adds the matching import.
with open('youtube_tab_to_pdf.py', 'r', encoding='utf-8') as f:
    lines = f.readlines()

new_lines = []
skip = False            # True while the old extract_unique_scroll body is being dropped
import_added = False    # True once the TemporalTracker import has been inserted
func_replaced = False   # True once the replacement function has been emitted

for line in lines:
    # Insert the tracker import right after the first 'import cv2' line.
    if line.startswith('import cv2') and not import_added:
        new_lines.append(line)
        new_lines.append('from video_cv_tracker import TemporalTracker\n')
        import_added = True
        continue

    # Emit the replacement and start skipping the old function body.
    if line.startswith('def extract_unique_scroll(frames:'):
        skip = True
        func_replaced = True
        new_lines.append('''def extract_unique_scroll(frames: List[np.ndarray], threshold: float = SIMILARITY_THRESHOLD) -> List[np.ndarray]:
    print(f"[4/5] 스크롤형 Tab 시계열 추적 추출 중...")

    strip_tops, strip_bottoms = [], []
    for frame in frames[:50]:
        strip = _find_white_tab_strip(frame)
        if strip:
            strip_tops.append(strip[0])
            strip_bottoms.append(strip[1])

    if not strip_tops:
        return []

    median_top = int(np.median(strip_tops))
    median_bottom = int(np.median(strip_bottoms))

    tracker = TemporalTracker()

    for frame in frames:
        h = frame.shape[0]
        tab_crop = frame[max(0, median_top):min(h, median_bottom), :]
        if not _has_tab_content(tab_crop):
            continue
        tracker.process_frame(tab_crop)

    panorama = tracker.get_final_panorama()
    if panorama is None:
        return []

    print(f" -> 생성된 파노라마 길이: {panorama.shape[1]}px")

    chunk_width = 1280
    final_chunks = []

    w = panorama.shape[1]
    start_x = 0

    while start_x < w:
        chunk = panorama[:, start_x:min(w, start_x + chunk_width)]
        if chunk.shape[1] < chunk_width:
            pad = np.full((chunk.shape[0], chunk_width - chunk.shape[1], 3), 255, dtype=np.uint8)
            chunk = np.hstack([chunk, pad])
        final_chunks.append(chunk)
        start_x += chunk_width

    print(f" -> A4 분할 컷: {len(final_chunks)}개")
    return final_chunks

''')
        continue

    # The old body ends where the next known top-level function begins.
    if skip and line.startswith('def extract_unique_overlay('):
        skip = False

    if not skip:
        new_lines.append(line)

# Refuse to write a half-patched file: every anchor the loop depends on must have been seen.
problems = []
if not import_added:
    problems.append("no line starting with 'import cv2' (TemporalTracker import not added)")
if not func_replaced:
    problems.append("no 'def extract_unique_scroll(frames:' line (nothing replaced)")
if skip:
    # Still skipping at EOF means everything after the old function was dropped.
    problems.append("no 'def extract_unique_overlay(' line after the replaced function")
if problems:
    raise SystemExit("youtube_tab_to_pdf.py not patched: " + "; ".join(problems))

with open('youtube_tab_to_pdf.py', 'w', encoding='utf-8') as f:
    f.writelines(new_lines)

print("Patched youtube_tab_to_pdf.py successfully.")
|
||||
0
scripts/debug/score_log.txt
Normal file
41
scripts/debug/test_blur_match.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import time
|
||||
|
||||
# Overlap search between two adjacent panorama chunks.
#
# Finds the horizontal overlap between the right edge of chunk 0 and the left
# edge of chunk 1 by minimising the mean absolute difference (MAD) on
# half-resolution grayscale copies, then writes a stitched image for review.
img0 = cv2.imread(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\raw_chunk_00.png")
img1 = cv2.imread(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\raw_chunk_01.png")

# cv2.imread signals failure by returning None instead of raising.
if img0 is None or img1 is None:
    raise SystemExit("Could not read raw_chunk_00.png / raw_chunk_01.png")

gray0 = cv2.cvtColor(img0, cv2.COLOR_BGR2GRAY)
gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)

w = gray0.shape[1]

best_ov = 0
min_mad = float('inf')

start_time = time.perf_counter()
# Downsample by 2 horizontally & vertically for speed.
small0 = cv2.resize(gray0, (w//2, gray0.shape[0]//2))
small1 = cv2.resize(gray1, (w//2, gray1.shape[0]//2))
sw = small0.shape[1]

# Try overlap widths from widest to narrowest (in half-resolution pixels).
for ov in range(sw-2, 10, -1):
    diff = cv2.absdiff(small0[:, -ov:], small1[:, :ov])
    mad = np.mean(diff)

    if mad < min_mad:
        min_mad = mad
        best_ov = ov * 2  # map back to original scale

    if min_mad < 3.0:  # Stop early once the match is effectively perfect.
        best_ov = ov * 2
        break

end_time = time.perf_counter()
print(f"MSE MAD found overlap {best_ov}px with MAD {min_mad:.2f} in {(end_time-start_time)*1000:.1f}ms")

# Verify visually: drop the overlapping columns from chunk 1 and concatenate.
stitched = np.hstack([img0, img1[:, best_ov:]])
if cv2.imwrite(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\test_mse_stitch.png", stitched):
    print("Exported test_mse_stitch.png")
else:
    print("Failed to write test_mse_stitch.png")
|
||||
47
scripts/debug/test_col_sums.py
Normal file
@@ -0,0 +1,47 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import glob
|
||||
|
||||
# Grab a single frame (index 500) from the first video under output/.
videos = glob.glob('output/*.mp4')
if not videos:
    raise SystemExit("No .mp4 files found under output/.")

cap = cv2.VideoCapture(videos[0])
cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
ret, frame = cap.read()
cap.release()

# A failed read leaves frame as None, which would crash later in cvtColor.
if not ret:
    raise SystemExit(f"Could not read frame 500 from {videos[0]}")
|
||||
|
||||
def _find_white_tab_strip(bgr: np.ndarray):
    """Return the first and last row that contain enough bright pixels.

    The frame is converted to grayscale and thresholded at 200. A row
    qualifies when more than 10% of its pixels are above the threshold.

    Returns:
        ``(top, bottom)`` as plain ints (row indices of the first and last
        qualifying rows), or ``None`` when fewer than two rows qualify.
    """
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
    w = binary.shape[1]
    # binary holds only 0 or 255, so counting non-zero pixels equals sum / 255.
    row_white_counts = np.count_nonzero(binary, axis=1)
    threshold = w * 0.1
    white_rows = np.where(row_white_counts > threshold)[0]
    if len(white_rows) < 2:
        return None
    return int(white_rows[0]), int(white_rows[-1])
|
||||
|
||||
# Locate the tab strip in the sampled frame, then report the columns whose
# count of above-threshold pixels exceeds 40% of the strip height.
strip = _find_white_tab_strip(frame)
if strip:
    y1, y2 = strip
    roi = frame[y1:y2, :]

    # Per-pixel maximum over the colour channels, then a fixed threshold.
    gray_roi = np.max(roi, axis=2)
    _, binary = cv2.threshold(gray_roi, 200, 255, cv2.THRESH_BINARY)

    col_sums = np.sum(binary, axis=0) / 255
    h_roi = y2 - y1

    # Relaxed to 40% to survive hand occlusions. Note stems max out at ~20-30%.
    bars = np.where(col_sums > h_roi * 0.4)[0]

    # Collapse neighbouring columns: keep a column only if it is more than
    # 20px to the right of the previously kept one.
    clean_bars = []
    for x in bars:
        if not clean_bars or x - clean_bars[-1] > 20:
            clean_bars.append(int(x))

    # Include the image edges as implicit boundaries.
    if not clean_bars or clean_bars[0] > 50:
        clean_bars.insert(0, 0)
    if clean_bars[-1] < binary.shape[1] - 50:
        clean_bars.append(binary.shape[1])

    print(f"Top: {y1}, Bottom: {y2}, Height: {h_roi}")
    print(f"Detected Clean Measure Bars: {clean_bars}")
else:
    print("Could not find tab strip.")
|
||||
39
scripts/debug/test_crop_pipeline.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
from video_cv_tracker import TemporalTracker
|
||||
import time
|
||||
|
||||
def extract_cropped_pages(video_path, limit_frames=3000):
    """Feed a fixed crop of each video frame to a TemporalTracker.

    Each frame is resized to 1280px wide (aspect ratio kept), rows 103:280 are
    cropped out, and the crop is passed to ``tracker.process_frame``.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        limit_frames: Maximum number of frames to read.

    Returns:
        Whatever ``TemporalTracker.get_unique_pages()`` returns.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        tracker = TemporalTracker(diff_threshold=0.20)

        frames_processed = 0
        while frames_processed < limit_frames:
            ret, frame = cap.read()
            if not ret:
                break

            scale = 1280 / frame.shape[1]
            frame = cv2.resize(frame, (1280, int(frame.shape[0] * scale)))

            # Rows 103:280 were chosen by hand for this video: the band
            # above the guitarist that holds only sheet music.
            ribbon = frame[103:280, :]

            tracker.process_frame(ribbon)
            frames_processed += 1

        return tracker.get_unique_pages()
    finally:
        # Release the capture even if resizing or tracking raises.
        cap.release()
|
||||
|
||||
if __name__ == "__main__":
    video_path = "output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4"
    pages = extract_cropped_pages(video_path)

    print(f"Extracted {len(pages)} perfectly cropped median pages.")

    if pages:
        # Stack the pages vertically into a single review image.
        final_img = np.vstack(pages)
        # cv2.imwrite returns False on failure; only report success if it wrote.
        if cv2.imwrite("C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/restored_perfect_crop.png", final_img):
            print("Saved cleanly cropped vertical stack.")
        else:
            print("Failed to write restored_perfect_crop.png")
|
||||
17
scripts/debug/test_crop_raw.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
frame = cv2.imread("C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/raw_frame_30s.png")

# cv2.imread returns None on failure; slicing None would raise a TypeError.
if frame is None:
    raise SystemExit("Could not read raw_frame_30s.png")

# Hand-derived row bands for this frame:
#   0:100   top black letterbox
#   100:280 white sheet music
#   280:720 guitarist

crop1 = frame[103:280, :]
cv2.imwrite("C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/crop_103_280.png", crop1)

crop2 = frame[0:180, :]
cv2.imwrite("C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/crop_0_180.png", crop2)

print("Saved crop_103_280.png and crop_0_180.png")
|
||||
31
scripts/debug/test_easyocr.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import easyocr
|
||||
import time
|
||||
|
||||
reader = easyocr.Reader(['en'], gpu=False)
|
||||
|
||||
def test_ocr(image_text, img_data):
    """Run digit-only EasyOCR on an upscaled, padded copy of ``img_data``.

    Args:
        image_text: Label printed alongside the result.
        img_data: Image array passed to ``cv2.resize``.

    Returns:
        The list returned by ``reader.readtext`` (also printed).
    """
    # Upscale 3x to give the text detector more spatial resolution.
    upscaled = cv2.resize(img_data, None, fx=3, fy=3, interpolation=cv2.INTER_CUBIC)
    # Add a 50px white border so the glyphs do not touch the image edge.
    padded = cv2.copyMakeBorder(upscaled, 50, 50, 50, 50, cv2.BORDER_CONSTANT, value=[255, 255, 255])

    # perf_counter is monotonic, so the elapsed time cannot go negative.
    t0 = time.perf_counter()
    results = reader.readtext(padded, allowlist="0123456789")
    tf = time.perf_counter()

    print(f"[{image_text}] Result: {results} (took {tf-t0:.2f}s)")
    return results
|
||||
|
||||
# Generate a tiny "37" (white on black) out of filled rectangles.
img_37 = np.zeros((30, 40), dtype=np.uint8)
img_37[5:10, 10:20] = 255   # Top of "3"
img_37[12:15, 10:20] = 255  # Mid of "3"
img_37[20:25, 10:20] = 255  # Bot of "3"
img_37[5:10, 25:35] = 255   # Top of "7"
img_37[5:25, 30:35] = 255   # Right of "7"

# Invert so the glyphs are black on a white background before OCR.
img_37_inv = cv2.bitwise_not(img_37)

test_ocr("Tiny 37 Synth", img_37_inv)
|
||||
44
scripts/debug/test_flip.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
def test_page_flip_diff():
    """Print frame-to-frame change ratios for the first 2000 frames.

    Each frame is resized to 1280px wide and cropped to rows 103:280. The
    ratio is the fraction of pixels whose absolute grayscale difference from
    the previous frame exceeds 50. Ratios above 0.01 are printed, and the
    largest ratio with its frame index is printed at the end.
    """
    import glob
    videos = glob.glob("output/*.mp4")
    cap = cv2.VideoCapture(videos[0] if videos else "output/shintakarajima.mp4")
    try:
        ret, prev_frame = cap.read()
        if not ret:
            return
        scale = 1280 / prev_frame.shape[1]
        prev = cv2.resize(prev_frame, (1280, int(prev_frame.shape[0] * scale)))[103:280, :]
        prev_gray = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY)

        idx = 1
        max_diff = 0
        max_diff_idx = -1

        print("Scanning first 2000 frames for diff_ratio spikes...")
        while idx < 2000:
            ret, frame = cap.read()
            if not ret:
                break

            # Every frame is compared with its predecessor (no skipping).
            curr = cv2.resize(frame, (1280, int(frame.shape[0] * scale)))[103:280, :]
            curr_gray = cv2.cvtColor(curr, cv2.COLOR_BGR2GRAY)

            diff = cv2.absdiff(prev_gray, curr_gray)
            _, thresh = cv2.threshold(diff, 50, 255, cv2.THRESH_BINARY)
            ratio = np.count_nonzero(thresh) / thresh.size

            if ratio > 0.01:
                print(f"Frame {idx}: diff_ratio = {ratio:.4f}")

            if ratio > max_diff:
                max_diff = ratio
                max_diff_idx = idx

            prev_gray = curr_gray
            idx += 1

        print(f"\nMax diff spike: {max_diff:.4f} at frame {max_diff_idx}")
    finally:
        # Also covers the early return and any exception.
        cap.release()


if __name__ == "__main__":
    test_page_flip_diff()
|
||||
61
scripts/debug/test_gap_morphology.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import glob
|
||||
|
||||
videos = glob.glob('output/*.mp4')
if not videos:
    raise SystemExit("No .mp4 files found under output/.")
cap = cv2.VideoCapture(videos[0])

# Collect 30 consecutive frames starting at frame 500.
frames = []
cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
for _ in range(30):
    ret, frame = cap.read()
    if not ret:
        break
    frames.append(frame)
cap.release()

if len(frames) == 30:
    # Per-pixel temporal median over the 30 frames.
    median_frame = np.median(frames, axis=0).astype(np.uint8)
    gray = np.max(median_frame, axis=2)
    _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)

    row_sums = np.sum(binary, axis=1) / 255
    y_staff = np.where(row_sums > binary.shape[1] * 0.4)[0]

    if len(y_staff) > 0:
        y_top = y_staff[0]
        y_bottom = y_staff[-1]
        staff_h = y_bottom - y_top

        roi = binary[y_top:y_bottom, :]

        # 1. Bridge vertical gaps (like the gap between standard notation and tab).
        #    A 1x20 kernel closes vertical gaps up to 19px without widening lines.
        bridge_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 20))
        bridged = cv2.dilate(roi, bridge_kernel)

        # 2. Erase everything that isn't a continuous vertical run of at least
        #    80% of the staff height. The kernel height is clamped to 1 because
        #    getStructuringElement rejects a zero-sized kernel.
        open_height = max(1, int(staff_h * 0.8))
        open_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, open_height))
        isolated_bars = cv2.morphologyEx(bridged, cv2.MORPH_OPEN, open_kernel)

        # 3. Collect the X coordinates of what survived the opening.
        col_sums = np.sum(isolated_bars, axis=0) / 255

        # Keep columns where the surviving run covers more than half of open_height.
        bars = np.where(col_sums > open_height * 0.5)[0]

        # Collapse neighbouring columns (20px minimum spacing).
        clean_bars = []
        for x in bars:
            if not clean_bars or x - clean_bars[-1] > 20:
                clean_bars.append(int(x))

        # Inject the image edges as boundaries.
        if not clean_bars or clean_bars[0] > 50:
            clean_bars.insert(0, 0)
        if clean_bars[-1] < binary.shape[1] - 50:
            clean_bars.append(binary.shape[1])

        print(f"Gap-Bridged Morphology Measure Boundaries: {clean_bars}")
        cv2.imwrite("C:/Users/Certes/Desktop/guitar_score/debug_gap_bridged.png", isolated_bars)
else:
    print("Not enough frames.")
|
||||
60
scripts/debug/test_iou_math.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
# Simulate a thin "1" and "2" (white strokes on black).
img_12 = np.zeros((60, 100), dtype=np.uint8)
img_12[10:50, 40:45] = 255  # The "1"
img_12[10:15, 60:80] = 255  # Top of "2"
img_12[15:45, 75:80] = 255  # Right of "2"
img_12[45:50, 60:80] = 255  # Bottom of "2"

# Simulate a thin "3" and "7".
img_37 = np.zeros((60, 100), dtype=np.uint8)
img_37[10:15, 30:50] = 255  # Top of "3"
img_37[25:30, 30:50] = 255  # Mid of "3"
img_37[45:50, 30:50] = 255  # Bot of "3"
img_37[10:15, 60:80] = 255  # Top of "7"
img_37[15:50, 75:80] = 255  # Right of "7"

# The SAME "12" shifted by 2 pixels down and right (video wobble). It is
# derived from img_12 so the two can never drift apart. np.roll wraps, but
# the last two rows and columns of img_12 are empty, so only zeros wrap.
img_12_shifted = np.roll(img_12, shift=(2, 2), axis=(0, 1))
|
||||
|
||||
def compute_iou(s1, s2):
    """Return the intersection-over-union of the positive pixels of two masks.

    Args:
        s1, s2: Arrays (or array-likes) of the same shape; values > 0 count
            as foreground.

    Returns:
        A float in [0, 1]. Two empty masks yield 0.0 because the denominator
        is clamped to at least 1.
    """
    fg1 = np.asarray(s1) > 0
    fg2 = np.asarray(s2) > 0
    intersection = np.count_nonzero(np.logical_and(fg1, fg2))
    union = np.count_nonzero(np.logical_or(fg1, fg2))
    return intersection / max(1, union)
|
||||
|
||||
def robust_match(s1, s2):
    """Return the best IoU between two masks under small translations.

    Both masks are dilated with a 5x5 kernel, then ``s2`` is translated by
    every combination of -2, 0 and +2 pixels in x and y. The highest IoU
    against the dilated ``s1`` is returned.
    """
    # A 5x5 dilation thickens strokes by 2px on each side, so thin lines
    # still overlap after a 2px shift.
    kernel = np.ones((5, 5), np.uint8)
    d1 = cv2.dilate(s1, kernel, iterations=1)
    d2 = cv2.dilate(s2, kernel, iterations=1)

    # Try whole-pixel shifts of -2/0/+2 on each axis and keep the best IoU.
    best_iou = 0.0
    for dy in [-2, 0, 2]:
        for dx in [-2, 0, 2]:
            M = np.float32([[1, 0, dx], [0, 1, dy]])
            shifted_d2 = cv2.warpAffine(d2, M, (s2.shape[1], s2.shape[0]))
            iou = compute_iou(d1, shifted_d2)
            if iou > best_iou:
                best_iou = iou

    return best_iou
|
||||
|
||||
print("IoU (12 vs 37):", robust_match(img_12, img_37))
print("IoU (12 vs 12_shifted):", robust_match(img_12, img_12_shifted))

# For comparison: normalised cross-correlation template matching, using
# img_12 with a 5px margin removed as the template.
res = cv2.matchTemplate(img_37, img_12[5:-5, 5:-5], cv2.TM_CCOEFF_NORMED)
_, max_val_diff, _, _ = cv2.minMaxLoc(res)

res2 = cv2.matchTemplate(img_12_shifted, img_12[5:-5, 5:-5], cv2.TM_CCOEFF_NORMED)
_, max_val_same, _, _ = cv2.minMaxLoc(res2)

print("\nTM_CCOEFF_NORMED (12 vs 37):", max_val_diff)
print("TM_CCOEFF_NORMED (12 vs 12_shifted):", max_val_same)
|
||||
75
scripts/debug/test_live_ocr.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import easyocr
|
||||
import re
|
||||
from youtube_tab_to_pdf import TemporalTracker
|
||||
|
||||
cap = cv2.VideoCapture(r"C:\Users\Certes\Desktop\guitar_score\output\shintakarajima.mp4")
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = cap.get(cv2.CAP_PROP_FPS)

tracker = TemporalTracker()
cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
ret, check_frame = cap.read()
# A failed read leaves check_frame as None, which would crash in cv2.resize.
if not ret:
    cap.release()
    raise SystemExit("Could not read frame 500 from shintakarajima.mp4")

from youtube_tab_to_pdf import _find_white_tab_strip
# Detect the tab strip on a 1280px-wide copy of the probe frame.
bounds = _find_white_tab_strip(cv2.resize(check_frame, (1280, int(check_frame.shape[0] * (1280/check_frame.shape[1])))))
if bounds:
    # Extend the crop 60px above the detected strip.
    crop_top = max(0, bounds[0] - 60)
    crop_bottom = bounds[1]
    tracker.set_crop(crop_top, crop_bottom)

# Process only first 95 seconds to get unique pages
print("Extracting unique pages from first 95 seconds...")
try:
    tracker.process_video(cap, start_sec=0, end_sec=95)
finally:
    # The capture is not used after this point.
    cap.release()
unique_pages = tracker.get_unique_pages()

print(f"Extracted {len(unique_pages)} unique pages.")

# Try easyOCR
reader = easyocr.Reader(['en'], gpu=False)
|
||||
|
||||
def extract_measure_number(page_bgr):
    """OCR the number printed just above the first bar line of a page.

    Steps: binarise the left (at most 1000px) part of the page, find rows
    that are dark across more than 40% of the width, find the first column
    that is dark over at least 80% of the staff height, then OCR the box up
    to 25px tall and up to 35px wide directly above the staff at that column.

    Returns:
        The first run of digits as an int, or -1 when no staff, bar line,
        number box or digits could be found.
    """
    cw = min(page_bgr.shape[1], 1000)
    page_gray = cv2.cvtColor(page_bgr[:, :cw], cv2.COLOR_BGR2GRAY)
    _, bin_inv = cv2.threshold(page_gray, 200, 255, cv2.THRESH_BINARY_INV)
    row_sums = np.sum(bin_inv, axis=1) / 255.0
    staff_rows = np.where(row_sums > cw * 0.4)[0]

    if len(staff_rows) < 6:
        return -1

    # Keep only the rows within 100px of the first staff row.
    staff_y_top, staff_y_bottom = staff_rows[0], staff_rows[-1]
    for r in staff_rows:
        if r - staff_y_top > 100:
            break
        staff_y_bottom = r

    expected_h = max(10, staff_y_bottom - staff_y_top + 1)
    staff_region = bin_inv[staff_y_top:staff_y_bottom+1, :]
    col_sums = np.sum(staff_region, axis=0) / 255.0
    bar_xs = np.where(col_sums >= expected_h * 0.8)[0]

    if len(bar_xs) == 0:
        return -1
    x_bar = bar_xs[0]

    box_y1 = max(0, staff_y_top - 25)
    box_y2 = staff_y_top
    box_x1 = x_bar
    box_x2 = min(page_gray.shape[1], x_bar + 35)

    num_box = page_gray[box_y1:box_y2, box_x1:box_x2]
    # When the staff starts at row 0 there is no room above it; an empty
    # array would make cv2.threshold / cv2.resize raise.
    if num_box.size == 0:
        return -1

    _, num_inv = cv2.threshold(num_box, 200, 255, cv2.THRESH_BINARY_INV)
    # Invert back so the OCR input is black text on white.
    num_for_ocr = cv2.bitwise_not(num_inv)

    upscaled = cv2.resize(num_for_ocr, None, fx=4, fy=4, interpolation=cv2.INTER_CUBIC)
    padded = cv2.copyMakeBorder(upscaled, 20, 20, 20, 20, cv2.BORDER_CONSTANT, value=[255, 255, 255])

    results = reader.readtext(padded, allowlist="0123456789")
    if not results:
        return -1

    # results[0][1] is the recognised text of the first detection.
    digits = re.findall(r'\d+', results[0][1])
    return int(digits[0]) if digits else -1


for i, page in enumerate(unique_pages):
    num = extract_measure_number(page)
    print(f"Page {i:02d}: {num}")
|
||||
BIN
scripts/debug/test_m1.png
Normal file
|
After Width: | Height: | Size: 31 KiB |
BIN
scripts/debug/test_m2.png
Normal file
|
After Width: | Height: | Size: 30 KiB |
43
scripts/debug/test_math.py
Normal file
@@ -0,0 +1,43 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
from youtube_tab_to_pdf import _find_white_tab_strip, _detect_measure_bars, _extract_print_channel
|
||||
|
||||
def get_clean_binary(img):
    """Return a binary image: 255 where max(B, G, R) exceeds 200, else 0."""
    # The per-pixel channel maximum keeps coloured-but-bright pixels bright.
    gray = np.max(img, axis=2)
    _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
    return binary
|
||||
|
||||
cap = cv2.VideoCapture(r"output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")
cap.set(cv2.CAP_PROP_POS_FRAMES, 50)
ret1, f1 = cap.read()
cap.set(cv2.CAP_PROP_POS_FRAMES, 65)  # 15 frames later
ret2, f2 = cap.read()
cap.release()

# A failed read leaves the frame as None; stop here with a clear message.
ret = ret1 and ret2
if not ret:
    raise SystemExit("Could not read frames 50 and 65 from the video")
|
||||
|
||||
def process(frame):
    """Crop the second measure out of the tab strip of ``frame``.

    The strip is located with ``_find_white_tab_strip``, bar positions come
    from ``_detect_measure_bars`` on the print channel, and the slice between
    the first and second detected boundary is returned.

    Raises:
        ValueError: if no tab strip is found in the frame.
    """
    s = _find_white_tab_strip(frame)
    if s is None:
        raise ValueError("no white tab strip found in frame")
    crop = frame[s[0]:s[1], :]
    gray = _extract_print_channel(crop)
    bars = _detect_measure_bars(gray)
    # list() keeps this a concatenation even if bars is a numpy array.
    coords = [0] + list(bars) + [crop.shape[1]]
    if len(coords) < 3:
        # No bar detected: the strip is a single measure, so return all of it.
        return crop
    # Take the second measure in case the first one only holds a clef.
    m = crop[:, coords[1]:coords[2]]
    return m
|
||||
|
||||
m1 = process(f1)
m2 = process(f2)

cv2.imwrite("test_m1.png", m1)
cv2.imwrite("test_m2.png", m2)

bin1 = get_clean_binary(m1)
bin2 = get_clean_binary(m2)

# Compare only the region both crops share (top-left aligned).
h = min(bin1.shape[0], bin2.shape[0])
w = min(bin1.shape[1], bin2.shape[1])
s1 = bin1[:h, :w]
s2 = bin2[:h, :w]

diff = cv2.absdiff(s1, s2)
# Fraction of differing pixels; max(1, ...) avoids dividing by zero when
# one of the crops is empty.
error_ratio = np.sum(diff > 0) / max(1, s1.size)

print(f"Error Ratio: {error_ratio:.4f}")
|
||||
25
scripts/debug/test_measure_slice.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
def find_measure_boundaries(img_bgr, max_width=1280):
    """Return the X coordinate at which to cut ``img_bgr`` on a bar line.

    Dark pixels (gray <= 180) in rows 50:160 are opened with a 1x40 vertical
    kernel. Columns where more than 30 pixels survive are bar-line
    candidates. The rightmost candidate at or left of ``max_width - 15`` is
    chosen, and the cut is placed 10px to its right.

    Returns:
        The cut position as a plain int, or ``max_width`` when no candidate
        exists.
    """
    img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    _, bin_inv = cv2.threshold(img_gray, 180, 255, cv2.THRESH_BINARY_INV)
    staff_region = bin_inv[50:160, :]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
    vertical_lines = cv2.morphologyEx(staff_region, cv2.MORPH_OPEN, kernel)
    proj = np.sum(vertical_lines, axis=0) / 255
    peaks = np.where(proj > 30)[0]

    valid_peaks = peaks[peaks <= max_width - 15]
    if valid_peaks.size == 0:
        return max_width
    # Convert to a plain int so callers can pass the value to cv2 drawing
    # functions as a point coordinate.
    return int(valid_peaks[-1]) + 10
|
||||
|
||||
if __name__ == "__main__":
    img = cv2.imread(r'C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\raw_super_block.png')
    # cv2.imread returns None on failure instead of raising.
    if img is None:
        raise SystemExit("Could not read raw_super_block.png")

    # Try several width caps and draw the chosen cut line on each.
    for w_cap in [1280, 2000, 2560]:
        cw = min(w_cap, img.shape[1])
        cut_x = int(find_measure_boundaries(img[:, :cw], cw))
        print(f"Max {cw} => Cut at {cut_x}")
        out = img[:, :cw].copy()
        cv2.line(out, (cut_x, 0), (cut_x, out.shape[0]), (0, 0, 255), 2)
        cv2.imwrite(r'C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\slice_'+str(w_cap)+'.png', out)
|
||||
82
scripts/debug/test_morph_grid.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import glob
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Tuple
|
||||
|
||||
@dataclass
class MeasureBound:
    """Pixel rectangle of one measure: X range plus the staff's Y range."""

    x_start: int   # left edge (column index)
    x_end: int     # right edge (column index)
    y_top: int     # top row of the staff region
    y_bottom: int  # bottom row of the staff region
|
||||
|
||||
class GridParser:
    """Find staff and measure boundaries in one BGR frame via morphology."""

    def __init__(self, frame: np.ndarray):
        """Store the frame and its binary image (gray > 200 -> 255)."""
        self.frame = frame
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        _, self.binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
        self.h, self.w = self.binary.shape

    def find_staff_y_bounds(self) -> Tuple[int, int]:
        """Return ``(y_top, y_bottom)`` of the first staff group, padded by 5px.

        Rows qualify when, after a 100x1 horizontal opening, more than 40% of
        the row is set. Rows are accepted until a gap of more than 150px
        appears. Returns ``(0, 0)`` when no row qualifies.
        """
        h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (100, 1))
        h_lines = cv2.morphologyEx(self.binary, cv2.MORPH_OPEN, h_kernel)
        row_sums = np.sum(h_lines, axis=1) / 255

        staff_rows = np.where(row_sums > self.w * 0.4)[0]
        if len(staff_rows) == 0:
            return 0, 0

        y_top = int(staff_rows[0])
        y_bottom = y_top

        for y in staff_rows:
            # A jump of more than 150px ends the current staff group.
            if y - y_bottom > 150:
                break
            y_bottom = int(y)

        return max(0, y_top - 5), min(self.h, y_bottom + 5)

    def find_measure_bounds(self) -> List[MeasureBound]:
        """Return one ``MeasureBound`` per detected measure, left to right.

        Returns an empty list when the staff region is under 20px tall.
        Measures narrower than 40px are dropped.
        """
        y_top, y_bottom = self.find_staff_y_bounds()
        if y_bottom - y_top < 20:
            return []
        staff_height = y_bottom - y_top

        # Keep only vertical structures at least 30px tall.
        v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 30))
        roi = self.binary[y_top:y_bottom, :]
        v_lines = cv2.morphologyEx(roi, cv2.MORPH_OPEN, v_kernel, iterations=1)

        # Column density of the surviving vertical structures.
        col_sums = np.sum(v_lines, axis=0) / 255

        # A bar candidate must cover more than 40% of the staff height.
        bar_cols = np.where(col_sums > staff_height * 0.4)[0]

        # Collapse neighbouring columns (20px minimum spacing).
        clean_bars = []
        for x in bar_cols:
            if not clean_bars or x - clean_bars[-1] > 20:
                clean_bars.append(int(x))

        # Add the image edges as boundaries when no bar is within 50px of them.
        if not clean_bars or clean_bars[0] > 50:
            clean_bars.insert(0, 0)
        if clean_bars[-1] < self.w - 50:
            clean_bars.append(self.w)

        measures = []
        for i in range(len(clean_bars) - 1):
            x1 = clean_bars[i]
            x2 = clean_bars[i+1]
            if x2 - x1 < 40:
                continue
            measures.append(MeasureBound(x1, x2, y_top, y_bottom))

        return measures
|
||||
|
||||
if __name__ == "__main__":
    videos = glob.glob('output/*.mp4')
    if not videos:
        raise SystemExit("No .mp4 files found under output/.")

    cap = cv2.VideoCapture(videos[0])
    cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
    ret, frame = cap.read()
    cap.release()
    # A failed read leaves frame as None, which GridParser cannot handle.
    if not ret:
        raise SystemExit(f"Could not read frame 500 from {videos[0]}")

    parser = GridParser(frame)
    measures = parser.find_measure_bounds()
    print(f"Measures: {[(m.x_start, m.x_end) for m in measures]}")
|
||||
48
scripts/debug/test_morphology.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import glob
|
||||
|
||||
_videos = glob.glob('output/*.mp4')
if not _videos:
    raise SystemExit("No .mp4 files found under output/.")
video_path = _videos[0]
cap = cv2.VideoCapture(video_path)
cap.set(cv2.CAP_PROP_POS_FRAMES, 500)  # jump to a frame with chords and hand
ret, frame = cap.read()
cap.release()

if not ret:
    print("Cannot read video frame.")
    # SystemExit works without relying on the interactive exit() helper.
    raise SystemExit

gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
# Pixels brighter than 200 become 255, everything else 0.
_, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)

# Morphological horizontal line detection
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
detect_horizontal = cv2.morphologyEx(binary, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)

# Morphological vertical line detection
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 30))
detect_vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel, iterations=2)

# Find staves: rows where the horizontal-line image covers >40% of the width.
row_sums = np.sum(detect_horizontal, axis=1) / 255
y_staves = np.where(row_sums > binary.shape[1] * 0.4)[0]
if len(y_staves) > 0:
    print(f"Top staff line Y: {y_staves[0]}")
    print(f"Bottom staff line Y: {y_staves[-1]}")

    # Restrict vertical detection to within the staff lines
    staff_crop = detect_vertical[y_staves[0]:y_staves[-1], :]
    col_sums = np.sum(staff_crop, axis=0) / 255
    bars = np.where(col_sums > (y_staves[-1] - y_staves[0]) * 0.6)[0]

    # Collapse columns that belong to the same thick bar (10px spacing).
    clean_bars = []
    for x in bars:
        if not clean_bars or x - clean_bars[-1] > 10:
            # Plain ints keep the printed list free of numpy scalar reprs.
            clean_bars.append(int(x))
    print(f"Measure bars X: {clean_bars}")
else:
    print("No staves detected.")

cv2.imwrite("C:/Users/Certes/Desktop/guitar_score/debug_morph_horiz.png", detect_horizontal)
cv2.imwrite("C:/Users/Certes/Desktop/guitar_score/debug_morph_vert.png", detect_vertical)
|
||||
33
scripts/debug/test_number_band.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
from youtube_tab_to_pdf import extract_unique_scroll
|
||||
|
||||
# Read fast_test_pano.jpg as grayscale.
img = cv2.imread('fast_test_pano.jpg', cv2.IMREAD_GRAYSCALE)
# cv2.imread returns None on failure instead of raising.
if img is None:
    raise SystemExit("Could not read fast_test_pano.jpg")

# Find staff rows: rows where dark pixels cover more than 40% of the width.
_, bin_inv = cv2.threshold(img, 200, 255, cv2.THRESH_BINARY_INV)
row_sums = np.sum(bin_inv, axis=1) / 255.0
staff_rows = np.where(row_sums > img.shape[1] * 0.4)[0]

if len(staff_rows) >= 6:
    staff_y_top = staff_rows[0]
else:
    # Fallback guess: staff starts 30% down the image.
    staff_y_top = int(img.shape[0] * 0.3)

# Number band: up to 25px directly above the top staff row.
band_y_top = max(0, staff_y_top - 25)
band_y_bottom = staff_y_top

band = img[band_y_top:band_y_bottom, :]
print(f"Band shape: {band.shape}")

if band.size == 0:
    # The staff starts at row 0, so there is nothing above it to inspect.
    raise SystemExit("Number band is empty.")

# Save it to check by eye that it contains the numbers.
cv2.imwrite('debug_band.png', band)

# Count, per column, the inverted intensity in units of one fully dark pixel.
band_inv = cv2.bitwise_not(band)
col_sums = np.sum(band_inv, axis=0) / 255.0
number_xs = np.where(col_sums > 5)[0]  # more than ~5 pixels of ink vertically

print(f"Pixels with numbers: {len(number_xs)}")
|
||||
44
scripts/debug/test_ocr_crop.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import cv2
|
||||
import easyocr
|
||||
import numpy as np
|
||||
from youtube_tab_to_pdf import _extract_print_channel, _detect_measure_bars
|
||||
|
||||
cap = cv2.VideoCapture(r"output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4")
cap.set(cv2.CAP_PROP_POS_FRAMES, 50)  # frame index 50, early in the video
ret, frame = cap.read()
# Only one frame is needed, so release the capture right away.
cap.release()
if not ret:
    raise SystemExit

gray = np.max(frame, axis=2)
_, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
row_sums = np.sum(thresh, axis=1) / 255
staff_lines = np.where(row_sums > frame.shape[1] * 0.5)[0]

# Fall back to row 100 when no row qualifies.
y_staff = staff_lines[0] if len(staff_lines) > 0 else 100

bar_coords = _detect_measure_bars(thresh)
print(f"Detected Bars at X: {bar_coords}")

reader = easyocr.Reader(['en'], gpu=False)

for idx, x_bar in enumerate(bar_coords):
    # Crop the tiny region above the bar where the number should be
    crop_y1 = max(0, y_staff - 25)
    crop_y2 = max(0, y_staff - 2)
    crop_x1 = max(0, x_bar - 5)
    crop_x2 = min(frame.shape[1], x_bar + 25)

    # Skip degenerate (empty) crops.
    if crop_y2 <= crop_y1 or crop_x2 <= crop_x1:
        continue

    sprite = frame[crop_y1:crop_y2, crop_x1:crop_x2]
    cv2.imwrite(f"debug_sprite_{idx}.png", sprite)

    # Scale up for better OCR
    scaled = cv2.resize(sprite, (0,0), fx=3, fy=3, interpolation=cv2.INTER_CUBIC)

    gray_sprite = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
    # The binary version is written for inspection only; OCR below reads
    # the grayscale sprite.
    _, binary_sprite = cv2.threshold(gray_sprite, 180, 255, cv2.THRESH_BINARY_INV)
    cv2.imwrite(f"debug_sprite_bin_{idx}.png", binary_sprite)

    res = reader.readtext(gray_sprite, allowlist='0123456789')
    print(f"Bar {idx} X={x_bar} OCR: {res}")
|
||||
74
scripts/debug/test_ocr_on_real_boxes.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import cv2
|
||||
import pickle
|
||||
import numpy as np
|
||||
import easyocr
|
||||
import time
|
||||
import re
|
||||
|
||||
reader = easyocr.Reader(['en'], gpu=False)

# SECURITY NOTE: pickle.load can execute arbitrary code. Only run this on a
# unique_pages.pkl that this project produced itself.
with open('unique_pages.pkl', 'rb') as f:
    unique_pages = pickle.load(f)

print(f"Loaded {len(unique_pages)} chunks. Running OCR on jump-cut boundaries...")
|
||||
|
||||
def extract_measure_number(page_bgr):
    """OCR the measure number printed above the first bar line of a page.

    Only the leftmost (at most 1000px) part of the page is examined. Staff
    rows are rows that are dark over more than 40% of that width. The first
    bar line is the first column that is dark over at least 80% of the staff
    height. The OCR box sits directly above the staff at that column.

    Returns:
        The recognised number as an int, or -1 when the staff, bar line,
        number box or digits cannot be found.
    """
    cw = min(page_bgr.shape[1], 1000)
    page_gray = cv2.cvtColor(page_bgr[:, :cw], cv2.COLOR_BGR2GRAY)
    _, bin_inv = cv2.threshold(page_gray, 200, 255, cv2.THRESH_BINARY_INV)

    row_sums = np.sum(bin_inv, axis=1) / 255.0
    staff_rows = np.where(row_sums > cw * 0.4)[0]

    if len(staff_rows) < 6:
        return -1

    # Limit the staff to rows within 100px of its first row.
    staff_y_top, staff_y_bottom = staff_rows[0], staff_rows[-1]
    for r in staff_rows:
        if r - staff_y_top > 100:
            break
        staff_y_bottom = r

    expected_h = max(10, staff_y_bottom - staff_y_top + 1)
    staff_region = bin_inv[staff_y_top:staff_y_bottom+1, :]
    col_sums = np.sum(staff_region, axis=0) / 255.0
    bar_xs = np.where(col_sums >= expected_h * 0.8)[0]

    if len(bar_xs) == 0:
        return -1
    x_bar = bar_xs[0]

    box_y1 = max(0, staff_y_top - 25)
    box_y2 = staff_y_top
    box_x1 = x_bar
    box_x2 = min(page_gray.shape[1], x_bar + 35)

    num_box = page_gray[box_y1:box_y2, box_x1:box_x2]
    # A staff that starts at row 0 leaves an empty box, which would make the
    # cv2 calls below raise.
    if num_box.size == 0:
        return -1

    # Preprocess for OCR: binarise, then invert back to black text on white.
    _, num_inv = cv2.threshold(num_box, 200, 255, cv2.THRESH_BINARY_INV)
    num_for_ocr = cv2.bitwise_not(num_inv)

    upscaled = cv2.resize(num_for_ocr, None, fx=4, fy=4, interpolation=cv2.INTER_CUBIC)
    padded = cv2.copyMakeBorder(upscaled, 20, 20, 20, 20, cv2.BORDER_CONSTANT, value=[255, 255, 255])

    results = reader.readtext(padded, allowlist="0123456789")
    if not results:
        return -1

    # results[0][1] is the recognised text of the first detection.
    text = results[0][1]

    digits = re.findall(r'\d+', text)
    if digits:
        return int(digits[0])
    return -1


# Run the extractor over every page and record the per-page result.
results = []
for i, page in enumerate(unique_pages):
    t0 = time.time()
    num = extract_measure_number(page)
    tf = time.time()
    print(f"Page {i:02d}: {num} (took {tf-t0:.2f}s)")
    results.append(num)

print(f"Sequential Detections: {results}")
|
||||
137
scripts/debug/test_panorama.py
Normal file
@@ -0,0 +1,137 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
def stitch_scrolling_video(video_path, start_sec=0, duration_sec=100, fps_sample_rate=15):
    """Stitch a horizontally scrolling tab ribbon into one wide panorama.

    Frames are sampled at roughly ``fps_sample_rate`` per second, resized to
    1280px wide and cropped to rows 103:435. The horizontal shift between
    consecutive samples is estimated by template matching on the absolute
    horizontal Sobel gradient, and the newly revealed right-hand columns are
    appended to the panorama.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        start_sec: Offset in seconds at which to start reading.
        duration_sec: Length in seconds of the span to read.
        fps_sample_rate: Target number of processed frames per second.

    Returns:
        The panorama as a BGR array, or ``None`` if no frame could be read.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        video_fps = cap.get(cv2.CAP_PROP_FPS) or 30.0

        # Process one frame out of every frame_skip frames (at least every frame).
        frame_skip = max(1, int(video_fps / fps_sample_rate))

        start_frame = int(start_sec * video_fps)
        max_frames = int(duration_sec * video_fps)

        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        # Hand-measured rows of the white tablature bar in the 1280px-wide frame.
        y_start = 103
        y_end = 435

        panorama = None
        prev_gray = None

        # count advances once per frame READ. The earlier version only
        # incremented it on skipped frames, so it stayed at 0 and neither
        # frame_skip nor max_frames ever took effect.
        for count in range(max_frames):
            ret, frame = cap.read()
            if not ret:
                break

            # We only process every `frame_skip` frames
            if count % frame_skip != 0:
                continue

            scale = 1280 / frame.shape[1]
            frame_resized = cv2.resize(frame, (1280, int(frame.shape[0] * scale)))

            # Crop to the white ribbon.
            ribbon = frame_resized[y_start:y_end, :]
            gray = cv2.cvtColor(ribbon, cv2.COLOR_BGR2GRAY)

            # dx=1, dy=0 gives the horizontal gradient, which highlights
            # VERTICAL edges (stems, bar lines) and suppresses the staff lines.
            sobelx = cv2.Sobel(gray, cv2.CV_32F, 1, 0, ksize=3)
            bin_float = np.abs(sobelx)

            if panorama is None:
                # The first processed frame seeds the panorama.
                panorama = ribbon.copy()
                prev_gray = bin_float
                continue

            # Template: the 100px-wide slice x=600..700 of the previous sample.
            template = prev_gray[:, 600:700]

            # Search window: x=550..710 of the current sample.
            search_region = bin_float[:, 550:710]

            res = cv2.matchTemplate(search_region, template, cv2.TM_CCOEFF_NORMED)
            min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res)

            # Inside the window the template's original position is index 50.
            # A best match left of 50 means the content moved left (dx > 0).
            dx = 50 - max_loc[0]

            if count < 30:  # Print first few shifts
                print(f"Frame {count}: dx={dx}, max_val={max_val:.3f}")

            shift_x = int(round(dx))

            if shift_x > 0 and shift_x < 300:  # Sanity check to ignore massive glitches
                # The new content is the rightmost shift_x columns of this ribbon.
                new_pixels = ribbon[:, -shift_x:]
                panorama = np.hstack([panorama, new_pixels])
                # NOTE(review): the original indentation was lost, so it is
                # unclear whether the reference was updated on every sample or
                # only after an append. It is updated only after an append so
                # that zero or negative shifts accumulate against the last
                # stitched frame; please confirm.
                prev_gray = bin_float

        return panorama
    finally:
        # Release the capture on every exit path, including exceptions.
        cap.release()
|
||||
|
||||
def slice_panorama_to_a4(panorama, slice_width=1280):
    """Cut a wide panorama into fixed-width chunks and stack them vertically.

    Args:
        panorama: Image array of shape ``(h, w, c)`` or, for grayscale
            input, ``(h, w)``.
        slice_width: Width in pixels of every stacked row.

    Returns:
        Array of shape ``(rows * h, slice_width[, c])``. The last chunk is
        padded on the right with 255 when it is narrower than ``slice_width``.

    Raises:
        ValueError: Raised by ``np.vstack`` when the panorama has zero width.
    """
    width = panorama.shape[1]
    rows = []

    for start_x in range(0, width, slice_width):
        chunk = panorama[:, start_x:start_x + slice_width]

        missing = slice_width - chunk.shape[1]
        if missing > 0:
            # Pad the short last chunk with white. The pad shape is derived
            # from the chunk so 2-D (grayscale) panoramas work as well.
            pad_shape = (chunk.shape[0], missing) + chunk.shape[2:]
            pad = np.full(pad_shape, 255, dtype=panorama.dtype)
            chunk = np.hstack([chunk, pad])

        rows.append(chunk)

    return np.vstack(rows)
|
||||
|
||||
if __name__ == "__main__":
    video_path = "output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4"
    if not Path(video_path).exists():
        # Fall back to the first mp4 found directly under output/, if any.
        fallback = next(Path("output").glob("*.mp4"), None)
        if fallback is not None:
            video_path = str(fallback)

    print("Stitching...")

    start_t = time.time()
    panorama = stitch_scrolling_video(video_path, start_sec=0, duration_sec=100, fps_sample_rate=15)
    elapsed = time.time() - start_t

    # The shape may only be reported once we know stitching produced an image;
    # touching `panorama.shape` before this check raised AttributeError on failure.
    if panorama is not None:
        print(f"Extraction took {elapsed:.2f}s. Panorama shape: {panorama.shape}")
        final_sheet = slice_panorama_to_a4(panorama, slice_width=1280)
        out_path = "C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/verify_panorama.png"
        cv2.imwrite(out_path, final_sheet)
        print(f"Saved stacked result to {out_path} with shape {final_sheet.shape}")
    else:
        print("Failed to generate panorama.")
|
||||
109
scripts/debug/test_pipeline.py
Normal file
@@ -0,0 +1,109 @@
|
||||
#!/usr/bin/env python3
|
||||
"""로컬 캐시된 mp4 파일로 파이프라인 테스트 (다운로드 스킵)
|
||||
1080p 다운로드 모드: python test_pipeline.py --download
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
import importlib.util
|
||||
import argparse
|
||||
import gc
|
||||
|
||||
# Load youtube_tab_to_pdf.py as the module object `pipeline`.
# NOTE(review): the file is looked up in this script's own directory —
# confirm youtube_tab_to_pdf.py actually lives next to this script.
_pipeline_file = Path(__file__).parent / "youtube_tab_to_pdf.py"
spec = importlib.util.spec_from_file_location("pipeline", str(_pipeline_file))
pipeline = importlib.util.module_from_spec(spec)
spec.loader.exec_module(pipeline)

# YouTube URLs used for testing
TEST_URLS = {
    "video_1": "https://www.youtube.com/watch?v=x76IMSvWR0o",  # 晴る
    "video_2": "https://www.youtube.com/watch?v=90BWvJY6KbE",  # 新宝島
    "video_3": "https://www.youtube.com/watch?v=Ri9g4lwnrJQ",  # 空奏列車
}
|
||||
|
||||
|
||||
def test_video(mp4_path: Path, label: str):
    """Run the pipeline on one local video file, without downloading.

    Args:
        mp4_path: Local mp4 file to process.
        label: Short name used for the debug-frame folder and the PDF name.

    Returns:
        Tuple ``(pattern, number_of_unique_frames)``.
    """
    print(f"\n{'='*60}")
    print(f"테스트: {label}")
    print(f"파일: {mp4_path.name}")
    print(f"{'='*60}")

    output_dir = Path("output")
    debug_dir = output_dir / "debug_frames" / label
    debug_dir.mkdir(parents=True, exist_ok=True)

    # Step 2: frame extraction
    frames = pipeline.extract_frames(mp4_path)

    # Step 3: pattern detection
    pattern = pipeline.detect_pattern(frames)

    # Step 4: unique-frame extraction; any pattern other than
    # "scroll"/"split" is handled by the overlay extractor.
    extractor_name = {
        "scroll": "extract_unique_scroll",
        "split": "extract_unique_split",
    }.get(pattern, "extract_unique_overlay")
    unique = getattr(pipeline, extractor_name)(frames)

    # Step 5: PDF generation
    pdf_path = output_dir / f"test_{label}.pdf"
    pipeline.generate_pdf(unique, pdf_path, debug_dir=debug_dir)

    unique_count = len(unique)
    print(f"\n결과: {pattern} / {unique_count}개 고유 프레임")
    return pattern, unique_count
|
||||
|
||||
|
||||
def download_test_videos():
    """Download every video listed in TEST_URLS into ``output/``.

    Failures are reported per video and do not stop the remaining downloads.
    The original note says the downloads are 1080p; the resolution itself is
    decided by ``pipeline.download_video``.
    """
    target_dir = Path("output")
    target_dir.mkdir(exist_ok=True)

    for label in TEST_URLS:
        url = TEST_URLS[label]
        print(f"\n--- {label} 다운로드 ---")
        try:
            # The second element of the returned pair (the title) is not needed here.
            video_path, _title = pipeline.download_video(url, target_dir)
            print(f" → 완료: {video_path.name}")
        except Exception as e:
            print(f" → 실패: {e}")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    With ``--download`` the test videos are downloaded and the program
    returns. Otherwise every ``output/*.mp4`` is run through ``test_video``
    and a summary table is printed; the process exits with status 1 when no
    mp4 file is present.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--download", action="store_true",
                        help="1080p로 테스트 영상 다운로드")
    if parser.parse_args().download:
        download_test_videos()
        return

    mp4_files = sorted(Path("output").glob("*.mp4"))
    if not mp4_files:
        print("테스트할 영상(mp4)이 output 폴더에 없습니다.")
        print(" → python test_pipeline.py --download 로 영상 다운로드")
        sys.exit(1)

    print(f"캐시된 영상 {len(mp4_files)}개 발견:")
    for f in mp4_files:
        print(f" - {f.name} ({f.stat().st_size / 1024 / 1024:.1f} MB)")

    results = {}
    for number, mp4 in enumerate(mp4_files, start=1):
        label = f"video_{number}"
        pattern, count = test_video(mp4, label)
        results[label] = (mp4.name, pattern, count)
        gc.collect()  # free the memory held by the decoded frames

    print(f"\n{'='*60}")
    print("전체 결과 요약:")
    print(f"{'='*60}")
    for label, (name, pattern, count) in results.items():
        print(f" {label}: {pattern:8s} → {count:4d}개 프레임 | {name[:40]}")


if __name__ == "__main__":
    main()
|
||||
57
scripts/debug/test_score_extractor.py
Normal file
@@ -0,0 +1,57 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import time
|
||||
import glob
|
||||
from video_cv_tracker import TemporalTracker
|
||||
from score_extractor import ScoreExtractor
|
||||
|
||||
def test_pipeline():
    """Run TemporalTracker + ScoreExtractor on the first ``output/*.mp4``.

    Up to 3000 frames are fed to the tracker, the resulting pages are tiled
    by the extractor, and the stacked, inverted image is written to a fixed
    debug PNG. Returns silently when no mp4 file exists.
    """
    videos = glob.glob('output/*.mp4')
    if not videos:
        return
    cap = cv2.VideoCapture(videos[0])

    # 1. The tracker collects the unique pages.
    tracker = TemporalTracker(diff_threshold=0.05)

    # Frame budget (the original note calls this "100 seconds").
    limit_frames = 3000

    t0 = time.time()
    for count in range(1, limit_frames + 1):
        ret, frame = cap.read()
        if not ret:
            break

        # Only the lower 70% of the frame is passed on, which is where the
        # tab is expected to be; this also reduces the processing load.
        h = frame.shape[0]
        tracker.process_frame(frame[int(h*0.3):h, :])

        if count % 300 == 0:
            print(f"Processed {count} frames...")

    cap.release()
    unique_pages = tracker.get_unique_pages()
    print(f"Tracker returned {len(unique_pages)} unique structural median pages. Took {time.time()-t0:.2f}s")

    # 2. The score extractor processes the pages and tiles them into rows.
    t1 = time.time()
    extractor = ScoreExtractor()
    extractor.process_pages(unique_pages)
    tiled_rows = extractor.tile_to_a4(chunk_width=1280)
    print(f"Extraction & Tiling took {time.time()-t1:.2f}s")

    if not tiled_rows:
        print("Failed to produce rows.")
    else:
        # Invert the stacked rows back to black-on-white before saving.
        pdf_img = cv2.bitwise_not(np.vstack(tiled_rows))
        cv2.imwrite("C:/Users/Certes/Desktop/guitar_score/debug_final_state_machine.png", pdf_img)
        print("Wrote debug_final_state_machine.png")


if __name__ == "__main__":
    test_pipeline()
|
||||
36
scripts/debug/test_stitch.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
img0 = cv2.imread(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\raw_chunk_00.png")
img1 = cv2.imread(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\raw_chunk_01.png")

# cv2.imread returns None for a missing/unreadable file; stop with a clear
# message instead of failing inside cvtColor.
if img0 is None or img1 is None:
    raise SystemExit("Could not read raw_chunk_00.png / raw_chunk_01.png")

gray0 = cv2.cvtColor(img0, cv2.COLOR_BGR2GRAY)
gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)

h, w = gray0.shape

# The first `template_w` (400) px of img1 is our template
template_w = 400
template = gray1[:60, :template_w]  # ONLY TOP 60 PIXELS
ref = gray0[:60, :]  # ONLY TOP 60 PIXELS

# Find where 'template' is in 'gray0'
res = cv2.matchTemplate(ref, template, cv2.TM_CCOEFF_NORMED)
_, max_val, _, max_loc = cv2.minMaxLoc(res)

print(f"Match value (Top 60px): {max_val:.3f}")

stitched = None
if max_val > 0.8:
    match_x_in_last = max_loc[0]
    overlap_len = w - match_x_in_last
    print(f"Overlap starts in last_chunk at x={match_x_in_last}.")
    print(f"Length of overlap is {overlap_len}px.")

    if overlap_len < w:
        # Append only the part of img1 that lies beyond the overlap.
        new_slice = img1[:, overlap_len:]
        stitched = np.hstack([img0, new_slice])
        cv2.imwrite(r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\test_stitched_top60.png", stitched)
        print("Exported test_stitched_top60.png")

# NOTE(review): the source lost its indentation, so the owner of the original
# `else` is ambiguous; the message is printed whenever nothing was stitched
# (weak match or full-width overlap).
if stitched is None:
    print("No valid overlap found.")
|
||||
|
||||
|
||||
50
scripts/debug/test_temporal_median.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import glob
|
||||
|
||||
videos = glob.glob('output/*.mp4')
# Without this guard `videos[0]` raises IndexError when no video is cached.
if not videos:
    raise SystemExit("No mp4 files found in output/")
cap = cv2.VideoCapture(videos[0])

# Collect 30 consecutive frames starting at frame 500
# (about 1 second of video at 30 fps).
frames = []
cap.set(cv2.CAP_PROP_POS_FRAMES, 500)
for _ in range(30):
    ret, frame = cap.read()
    if not ret:
        break
    frames.append(frame)
cap.release()

if len(frames) == 30:
    # 1. Per-pixel temporal median across the 30 frames, intended to suppress
    #    moving content (the guitarist, the background) and keep the static overlay.
    median_frame = np.median(frames, axis=0).astype(np.uint8)

    # Brightest channel per pixel, then keep only near-white pixels.
    gray = np.max(median_frame, axis=2)
    _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)

    # 2. Staff lines: rows in which more than 40% of the pixels are white.
    row_sums = np.sum(binary, axis=1) / 255
    y_staff = np.where(row_sums > binary.shape[1] * 0.4)[0]

    if len(y_staff) > 0:
        print(f"Pristine staff lines detected at: {y_staff}")
        y_top = y_staff[0]
        y_bottom = y_staff[-1]

        # 3. Bar lines: columns that are white over more than half of the
        #    staff height.
        roi = binary[y_top:y_bottom, :]
        col_sums = np.sum(roi, axis=0) / 255

        staff_h = y_bottom - y_top
        bars = np.where(col_sums > staff_h * 0.5)[0]

        # Collapse hits closer than 20 px to the previously kept bar.
        clean_bars = []
        for x in bars:
            if not clean_bars or x - clean_bars[-1] > 20:
                clean_bars.append(int(x))

        print(f"Pristine Measure Boundaries: {clean_bars}")

    # NOTE(review): indentation was lost in the source; the debug images are
    # written whenever 30 frames were read, regardless of staff detection.
    cv2.imwrite("C:/Users/Certes/Desktop/guitar_score/debug_temporal_median.png", median_frame)
    cv2.imwrite("C:/Users/Certes/Desktop/guitar_score/debug_temporal_binary.png", binary)
else:
    print("Not enough frames.")
|
||||
52
scripts/debug/test_y_crop.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
def find_white_tab_bounds(video_path):
    """Locate the tallest band of bright rows in the frame at ~30 seconds.

    The frame is resized to 1280 px wide and converted to grayscale. A row
    counts as "white" when its mean brightness exceeds 180, and the longest
    contiguous run of such rows is reported.

    Args:
        video_path: Path of the video, passed to ``cv2.VideoCapture``.

    Returns:
        ``(y_start, y_end, frame_height)`` with ``y_end`` exclusive, or
        ``None`` when the frame could not be read. ``(0, 0, frame_height)``
        is returned when no row qualifies.
    """
    cap = cv2.VideoCapture(video_path)
    # Frame index = 30 seconds * frames per second.
    cap.set(cv2.CAP_PROP_POS_FRAMES, 30 * cap.get(cv2.CAP_PROP_FPS))
    ok, frame = cap.read()
    cap.release()
    if not ok:
        return None

    scale = 1280 / frame.shape[1]
    frame = cv2.resize(frame, (1280, int(frame.shape[0] * scale)))
    row_means = np.mean(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY), axis=1)

    # Dark staff lines and notes pull a row's mean down from 255, so the
    # threshold is kept well below pure white.
    is_white_row = row_means > 180

    # Pad with False on both sides so runs touching the image border still
    # produce a rising and a falling edge.
    edges = np.diff(np.concatenate(([False], is_white_row, [False])).astype(int))
    starts = np.flatnonzero(edges == 1)
    ends = np.flatnonzero(edges == -1)

    best_start, best_end = 0, 0
    if starts.size:
        # argmax returns the first of equally long runs, matching a strict
        # "longer than the best so far" scan.
        longest = np.argmax(ends - starts)
        best_start, best_end = starts[longest], ends[longest]

    return best_start, best_end, frame.shape[0]
|
||||
|
||||
if __name__ == "__main__":
    # Report the detected tab strip for the hard-coded sample video.
    video_path = "output/サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4"
    bounds = find_white_tab_bounds(video_path)
    if not bounds:
        print("Failed to find bound")
    else:
        s, e, h = bounds
        print(f"Mathematically found White Tab Strip: Y_START={s}, Y_END={e}. Total Height={h}")
|
||||
BIN
scripts/debug/verify_chunk_0.jpg
Normal file
|
After Width: | Height: | Size: 63 KiB |
BIN
scripts/debug/verify_chunk_1.jpg
Normal file
|
After Width: | Height: | Size: 65 KiB |
BIN
scripts/debug/verify_chunk_2.jpg
Normal file
|
After Width: | Height: | Size: 41 KiB |
BIN
scripts/debug/verify_chunk_3.jpg
Normal file
|
After Width: | Height: | Size: 46 KiB |
BIN
scripts/debug/verify_chunk_4.jpg
Normal file
|
After Width: | Height: | Size: 48 KiB |
BIN
scripts/debug/verify_chunk_5.jpg
Normal file
|
After Width: | Height: | Size: 67 KiB |
BIN
scripts/debug/verify_chunk_6.jpg
Normal file
|
After Width: | Height: | Size: 59 KiB |
116
scripts/debug/verify_fixes.py
Normal file
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
수정된 버그 3개가 실제로 동작하는지 검증하는 재실행 시뮬레이션.
|
||||
youtube_tab_to_pdf.py의 수정된 함수들을 직접 임포트하여 사용합니다.
|
||||
"""
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
# On Windows, switch both standard streams to UTF-8 and replace characters
# that cannot be encoded instead of raising.
if sys.platform == "win32":
    for _stream in (sys.stdout, sys.stderr):
        _stream.reconfigure(encoding="utf-8", errors="replace")
|
||||
|
||||
# 메인 모듈 임포트 (수정된 코드 사용)
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from youtube_tab_to_pdf import (
|
||||
_find_white_tab_strip, _has_tab_content,
|
||||
_detect_scroll_offset, _extract_tracking_channel,
|
||||
_merge_scroll_candidates, merge_panoramas_list,
|
||||
_detect_measure_bars, compare_frames
|
||||
)
|
||||
|
||||
# Input frames and the folder that receives the verification images.
FRAME_DIR = Path("output/temp_frames")
OUT_DIR = Path("output/sim_verify")
# parents=True: creating OUT_DIR must not fail at import time just because
# the parent "output" folder does not exist yet.
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def main():
    """Re-run the stitching stages on cached frames and report three checks.

    Reads ``f_0*.png`` from FRAME_DIR, crops every frame to the median tab
    strip found in the first 30 frames, removes near-duplicate crops, then
    calls ``_merge_scroll_candidates``, ``merge_panoramas_list`` and
    ``_detect_measure_bars`` and prints whether the results stay within the
    expected limits (BUG1..BUG3). Intermediate panoramas are written to OUT_DIR.
    """
    paths = sorted(FRAME_DIR.glob("f_0*.png"))
    if not paths:
        print("❌ 프레임 없음")
        return

    print(f"[VERIFY] {len(paths)}개 프레임 — 수정된 코드로 재검증")

    # Strip Y range: median of the detections over the first 30 frames
    tops, bots = [], []
    for p in paths[:30]:
        f = cv2.imread(str(p))
        if f is None:
            continue
        s = _find_white_tab_strip(f)
        if s:
            tops.append(s[0])
            bots.append(s[1])

    # np.median([]) is NaN and int(NaN) raises ValueError, so stop early
    # when no strip was detected at all.
    if not tops:
        print("❌ 탭 스트립 감지 실패")
        return

    med_top = int(np.median(tops))
    med_bot = int(np.median(bots))
    print(f" 스트립 Y: {med_top}~{med_bot}")

    # De-duplication: drop crops that compare_frames rates >= THRESHOLD
    # against any crop already kept.
    THRESHOLD = 0.95
    candidates, compared = [], []
    for p in paths:
        f = cv2.imread(str(p))
        if f is None:
            continue
        h = f.shape[0]
        crop = f[max(0, med_top):min(h, med_bot), :]
        if not _has_tab_content(crop):
            continue
        # Comparison runs on a small fixed-size thumbnail.
        cmp_img = cv2.resize(crop, (480, 120), interpolation=cv2.INTER_AREA)
        if any(compare_frames(cmp_img, ref) >= THRESHOLD for ref in compared):
            continue
        candidates.append(crop)
        compared.append(cmp_img)

    print(f"\n[1] MSE 중복제거 후: {len(candidates)}개 후보")

    # -- BUG1 check: number of detected scene changes --
    print(f"\n[2] BUG1 검증 — 씬전환 감지 횟수 (기대: 1~3)")
    stitched = _merge_scroll_candidates(candidates)
    print(f" _merge_scroll_candidates 결과: {len(stitched)}개 세그먼트 → 파노라마")
    for i, s in enumerate(stitched):
        print(f" 세그먼트 파노라마 {i}: {s.shape[1]}px")
        cv2.imwrite(str(OUT_DIR / f"seg_pano_{i:02d}.png"), s)

    # -- BUG2 check: panorama merging --
    print(f"\n[3] BUG2 검증 — 파노라마 병합 (기대: 1~2개)")
    merged = merge_panoramas_list(stitched)
    print(f" merge_panoramas_list 결과: {len(merged)}개 최종 파노라마")
    for i, m in enumerate(merged):
        print(f" 최종 파노라마 {i}: {m.shape[1]}x{m.shape[0]}px")
        cv2.imwrite(str(OUT_DIR / f"final_pano_{i:02d}.png"), m)

    # -- BUG3 check: measure bar detection --
    print(f"\n[4] BUG3 검증 — 마디 구분선 탐지 (기대: 간격 모두 ≥100px)")
    total_measures = 0
    all_ok = True
    for i, m in enumerate(merged):
        gray = m[:, :, 2]  # red channel (index 2 of a BGR image)
        bars = _detect_measure_bars(gray)
        total_measures += max(0, len(bars) - 1)  # measures lie between bars
        print(f" 파노라마 {i}: {len(bars)}개 구분선 탐지", end="")
        if bars:
            gaps = [bars[j+1] - bars[j] for j in range(len(bars) - 1)]
            min_gap = min(gaps) if gaps else 0
            ok = min_gap >= 100
            if not ok:
                all_ok = False
            print(f" | 최소간격: {min_gap}px {'✅' if ok else '❌ (오탐 여전히 존재)'}")
            print(f" 첫5개 좌표: {bars[:5]}")
        else:
            # Terminate the line that was left open by end="".
            print()

    # -- Final verdict --
    print(f"\n{'='*60}")
    print("[검증 결과]")
    # NOTE(review): the original comment here says "previously 8" while the
    # printed message below says "previously 9" — confirm which is right.
    seg_ok = len(stitched) <= 5  # at most 5 segments
    merge_ok = len(merged) <= 2  # at most 2 panoramas (previously 3)
    bar_ok = all_ok  # every bar gap is >= 100 px
    print(f" BUG1 씬전환 오탐: {'✅ 개선됨' if seg_ok else '❌ 여전히 과다'} ({len(stitched)}개 세그먼트, 이전 9개)")
    print(f" BUG2 파노라마 분리: {'✅ 개선됨' if merge_ok else '❌ 여전히 분리'} ({len(merged)}개, 이전 3개)")
    print(f" BUG3 마디선 오탐: {'✅ 개선됨' if bar_ok else '❌ 여전히 오탐'}")
    print(f" 탐지된 총 마디 수: {total_measures}개")
    print(f"{'='*60}")

    if seg_ok and merge_ok and bar_ok:
        print("\n✅ 모든 버그 수정 확인 — 실제 파이프라인 실행 가능")
    else:
        print("\n⚠ 일부 문제 잔존 — 추가 파라미터 조정 필요")


if __name__ == "__main__":
    main()
|
||||
BIN
scripts/debug/verify_log.txt
Normal file
41
scripts/debug/verify_monotonic.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import sys
|
||||
sys.path.append(r"C:\Users\Certes\Desktop\guitar_score")
|
||||
import cv2
|
||||
import easyocr
|
||||
import numpy as np
|
||||
import os
|
||||
from youtube_tab_to_pdf import extract_frames, extract_unique_scroll
|
||||
|
||||
# Run the scroll pipeline on the sample video, OCR the top band of the first
# seven chunks, and print the numbers found there.
video_file = r"C:\Users\Certes\Desktop\guitar_score\output\サカナクション/新宝島(エレキギターTAB) 難易度★★★ sakanaction shintakarajima.mp4"
print("Extracting frames...")
frames = extract_frames(video_file, fps=2)

print("Running pipeline extraction...")
unique = extract_unique_scroll(frames, threshold=0.95)

print("Initializing OCR...")
reader = easyocr.Reader(['en'])

print(f"Generated {len(unique)} chunks.")
detect_log = []

for i, page in enumerate(unique):
    # OCR only the top band (at most 200 px) of the chunk, where the measure
    # numbers are expected.
    h, w = page.shape[:2]
    top_area = page[:min(200, h), :]

    results = reader.readtext(top_area)

    # Keep the digits of each recognised text; texts without digits are dropped.
    digit_runs = (''.join(filter(str.isdigit, text)) for (bbox, text, prob) in results)
    nums = [int(run) for run in digit_runs if run]

    print(f"Page {i} measure numbers detected: {nums}")
    detect_log.append(nums)

    cv2.imwrite(f"output/verify_chunk_{i}.jpg", page)
    # Stop after chunk index 6.
    if i > 5:
        break
|
||||
55
scripts/debug/verify_ocr.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import glob
|
||||
|
||||
def get_number_sprite(m_img):
    """Cut out the binarised patch just above the first staff line.

    The image is reduced to its brightest channel and thresholded at 200.
    The first row in which more than half of the pixels are white is taken
    as the staff position (50 when none is found); the patch covers the rows
    from 60 px to 5 px above it and the left-most 60 columns.

    Args:
        m_img: 3-channel image of one measure.

    Returns:
        The thresholded patch, or ``None`` when the crop is empty or holds
        fewer than 8 white pixels.
    """
    width = m_img.shape[1]
    brightest = np.max(m_img, axis=2)
    _, thresh = cv2.threshold(brightest, 200, 255, cv2.THRESH_BINARY)

    white_per_row = np.sum(thresh, axis=1) / 255
    staff_rows = np.where(white_per_row > width * 0.5)[0]
    y_staff = staff_rows[0] if len(staff_rows) > 0 else 50

    top = max(0, y_staff - 60)
    bottom = max(0, y_staff - 5)
    left = 0
    right = min(60, width)
    if bottom <= top or right <= left:
        return None

    sprite = thresh[top:bottom, left:right]
    # Fewer than 8 lit pixels is treated as "no number present".
    if np.count_nonzero(sprite > 127) < 8:
        return None
    return sprite
|
||||
|
||||
img_path = r"C:\Users\Certes\.gemini\antigravity\brain\975cea00-dd68-4689-9ee3-f1a2408b4ee6\final_check_100_sec.png"
img = cv2.imread(img_path)
# cv2.imread returns None for a missing/unreadable file.
if img is None:
    raise SystemExit(f"Could not load image: {img_path}")

h, w = img.shape[:2]

# Split the stacked image into rows, assuming each row is about 320 px tall,
# and keep only rows that contain at least one bright pixel.
rows = []
for y in range(0, h, 320):
    chunk = img[y:y+320, :]
    if np.max(chunk) > 200:
        rows.append(chunk)

print(f"Detected {len(rows)} A4 rows in final image.")

for i, row in enumerate(rows):
    gray_row = np.max(row, axis=2)
    _, binary = cv2.threshold(gray_row, 200, 255, cv2.THRESH_BINARY)

    # Staff lines: rows in which more than 40% of the pixels are white.
    row_sums = np.sum(binary, axis=1) / 255
    staff_lines = np.where(row_sums > w * 0.4)[0]
    if len(staff_lines) == 0:
        continue

    # Save the box just above the first staff line, where the measure number
    # sprite usually is.
    y_staff = staff_lines[0]
    crop_y1 = max(0, y_staff - 60)
    crop_y2 = max(0, y_staff - 5)
    sprite = binary[crop_y1:crop_y2, 10:80]

    # A staff line within 5 px of the row top leaves an empty crop, which
    # cv2.imwrite cannot encode.
    if sprite.size == 0:
        print(f"Row {i} skipped: no room above the staff for a sprite")
        continue

    cv2.imwrite(f"C:/Users/Certes/Desktop/guitar_score/debug_ocr_measure_{i}.png", sprite)
    pixels = np.count_nonzero(sprite > 127)
    print(f"Row {i} parsed. Sprite white pixels: {pixels}")
|
||||
|
||||
32
scripts/debug/view_ascii.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
def img_to_ascii(img_path, target_width=120):
    """Print a grayscale image as ASCII art.

    Args:
        img_path: Path of the image; it is loaded as grayscale.
        target_width: Number of characters per printed row.

    Returns:
        None. A message is printed when the image cannot be loaded.
    """
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        print("Could not load image:", img_path)
        return

    h, w = img.shape
    aspect_ratio = h / w
    # Terminal characters are roughly 2:1 height:width, so halve the height.
    # Clamp to one row: a very wide, short strip would otherwise round down
    # to a height of 0, which cv2.resize rejects.
    target_height = max(1, int(target_width * aspect_ratio * 0.5))

    resized = cv2.resize(img, (target_width, target_height))

    # ASCII gradient from dark (index 0) to light (index 9)
    chars = "@%#*+=-:. "

    for row in resized:
        # Map 0-255 to 0-9 and build the line in one pass.
        print("".join(chars[int((pixel / 255.0) * 9)] for pixel in row))
|
||||
|
||||
if __name__ == "__main__":
    # Dump the first debug chunk to the terminal, 120 characters wide.
    chunk_path = "C:/Users/Certes/.gemini/antigravity/brain/975cea00-dd68-4689-9ee3-f1a2408b4ee6/debug_chunk_0.png"
    print("=== debug_chunk_0.png ===")
    img_to_ascii(chunk_path, 120)
|
||||