From 64ecc12d35825396431e726160898b89fe091a85 Mon Sep 17 00:00:00 2001 From: CD Date: Sat, 28 Mar 2026 22:17:48 +0900 Subject: [PATCH] docs: initialize project --- .planning/PROJECT.md | 36 ++++++++++++++++++++++++++++++++++++ .planning/REQUIREMENTS.md | 20 ++++++++++++++++++++ .planning/ROADMAP.md | 29 +++++++++++++++++++++++++++++ .planning/STATE.md | 19 +++++++++++++++++++ .planning/config.json | 16 ++++++++++++++++ 5 files changed, 120 insertions(+) create mode 100644 .planning/PROJECT.md create mode 100644 .planning/REQUIREMENTS.md create mode 100644 .planning/ROADMAP.md create mode 100644 .planning/STATE.md create mode 100644 .planning/config.json diff --git a/.planning/PROJECT.md b/.planning/PROJECT.md new file mode 100644 index 0000000..255b486 --- /dev/null +++ b/.planning/PROJECT.md @@ -0,0 +1,36 @@ +# Guitar Score Extraction Pipeline (youtube_tab_to_pdf v2) + +## What This Is +This project aims to automate the extraction of guitar tablature from YouTube videos into clean, readable PDFs. The current objective (v2) is to completely rebuild the OpenCV-based "scroll" and "overlay" extraction pipeline from scratch (zero-based) to solve the chronic issue of missing/discontinuous measure numbers. + +## Target Users +- Guitarists wanting to practice songs from YouTube covers without manually transcribing or struggling to pause the video. + +## Core Value +100% reliable measure extraction without overlaps, repetitions, or jumps, resulting in a perfectly sequenced PDF score. + +## Context +The previous implementation (`merge_panoramas_list` and `cv2.matchTemplate`) relied on horizontal scrolling offset matches which completely failed when similar-looking choruses or repeating measures appeared, leading to entire sections of the song being overwritten and skipped. Additionally, the OCR-based measure duplicate detection was too unstable due to video compression noise and differing fonts. 
+ +## Existing Capabilities (Brownfield) +- ✓ YouTube `yt-dlp` integration and 1080p -> 720p scaling. +- ✓ Memory-efficient frame extraction loop (`DEFAULT_FPS=2`). +- ✓ Target Tab color isolation (`_find_white_tab_strip`). +- ✓ PDF generation via `img2pdf`. + +## Active Requirements +- [ ] Implement Temporal Tracking to measure pixel shift velocity ($v_x$) across frames instead of purely matching past panoramic bounds. +- [ ] Implement Time-Median Filter to erase moving playheads and animated cursors cleanly. +- [ ] Robustly detect the horizontal rows of the tab staff lines. +- [ ] Slice the continuous stream by strictly calculating the distance accumulated from $v_x$ rather than relying on unreliable OCR text or thin measure bars. +- [ ] Create rigorous test suite asserting 0 missing frames across reference videos (`video_1`, `video_2`, `video_3`). + +## Key Decisions +| Decision | Rationale | Outcome | +|----------|-----------|---------| +| **Zero-Based Rebuild** | Legacy horizontal stitching math was fundamentally flawed for repeating melodies. | — Pending | +| **Separation of CV tracking** | `youtube_tab_to_pdf.py` is too heavy (914 lines); move CV logic to `video_cv_tracker.py`. | — Pending | +| **Time-Median Filter** | Necessary to remove the playhead cursor, which interferes with continuous sequence matching. | — Pending | + +--- +*Last updated: 2026-03-28 after initialization* diff --git a/.planning/REQUIREMENTS.md b/.planning/REQUIREMENTS.md new file mode 100644 index 0000000..65f660f --- /dev/null +++ b/.planning/REQUIREMENTS.md @@ -0,0 +1,20 @@ +# Requirements + +## Objective +Rebuild the `youtube_tab_to_pdf.py` Computer Vision pipeline from the ground up to achieve 100% continuous measure extraction without skips or overwrites, primarily resolving the "discontinuous measure numbers" issue in YouTube guitar tabs. + +## Scenarios +- **SCN-1: The Playhead Problem.** Videos often contain a vertical red/blue bar tracking the current play position. 
+This cursor moves across the screen and disrupts image matching. +- **SCN-2: The Repeating Chorus Problem.** In music, measure 50 might look identical to measure 10. The system must not confuse current frame context with a previous frame 40 measures ago and overwrite the timeline. +- **SCN-3: Sub-optimal measure bars.** Video compression degrades measure bar lines, making them hard to detect accurately, so the system must rely on chronological time-shift tracking. + +## Acceptance Criteria +- [ ] `test_pipeline.py` passes for all 3 sample URLs, showing no missing sections between start and end. +- [ ] Output panoramas/chunks are continuously ordered from start to finish without jumping back to an earlier identical part of the song. +- [ ] The moving playhead indicator is fully removed in the final PDF chunks. +- [ ] CV Logic is moved out of the main wrapper into a concise, easily testable `video_cv_tracker.py`. + +## Out of Scope +- Building a UI/Frontend. +- Changing `yt-dlp` download logic. +- Supporting instruments other than 6-string Guitar Tabs (e.g., Piano/Drums). diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md new file mode 100644 index 0000000..844d7a1 --- /dev/null +++ b/.planning/ROADMAP.md @@ -0,0 +1,29 @@ +# Roadmap + +## Phase 1: CV Core Refactor (video_cv_tracker.py) +**Goal**: Isolate and establish the core computer vision algorithms needed for temporally continuous extraction of guitar tabs without OCR. +- **Dependencies**: None. +- **Plans**: + - `01-create-tracker.md`: Build `video_cv_tracker.py` introducing exactly three core functions: `extract_roi_median()` (playhead killer), `compute_pixel_shift()` (1D phase correlation tracking), and `stitch_temporally()` (append newly shifted columns only). + - `02-unit-tests.md`: Write minimal unit tests feeding dummy arrays simulating guitar chord lines scrolling to verify that exactly the expected $v_x$ shift is returned. 
+ +## Phase 2: Refactoring youtube_tab_to_pdf.py +**Goal**: Tear down the old logic and integrate the new temporal tracking mechanism. +- **Dependencies**: Phase 1. +- **Plans**: + - `01-remove-legacy.md`: Delete `cv2.matchTemplate`-heavy sprawling logic, the fragile `_merge_scroll_candidates()`, and unpredictable `_detect_measure_bars()`. + - `02-integrate-tracker.md`: Hook `extract_unique_scroll` directly to the `video_cv_tracker` generator and loop frames across time, returning one continuous panoramic image. + +## Phase 3: Slicing & PDF Integration +**Goal**: Reliably chop the massive horizontal panoramic tab into A4 width segments. +- **Dependencies**: Phase 2. +- **Plans**: + - `01-robust-measure-chop.md`: Given a complete panorama, cut it blindly into fixed max chunk widths (simulating A4 line breaks) OR slice correctly using the tab color projection. Measure jumps are natively prevented by Phase 1. + - `02-pdf-export.md`: Hand off arrays back to the existing `img2pdf` PDF generation stack. + +## Phase 4: Final Acceptance Testing +**Goal**: Execute test suite against `video_1` (晴る), `video_2` (新宝島), `video_3` (空奏列車). +- **Dependencies**: Phase 3. +- **Plans**: + - `01-execute-end-to-end.md`: Run `python test_pipeline.py`. + - `02-verify-output.md`: Visually inspect the `output/debug_frames/` panoramas to prove zero overlaps and strict chronological transcription. diff --git a/.planning/STATE.md b/.planning/STATE.md new file mode 100644 index 0000000..e48678d --- /dev/null +++ b/.planning/STATE.md @@ -0,0 +1,19 @@ +{ + "project_started": "2026-03-28", + "decisions": [ + { + "date": "2026-03-28", + "decision": "Abandon `cv2.matchTemplate` panoramic horizon matching.", + "rationale": "Repeating music patterns inherently break matching systems without temporal (time-based) anchors. 
If measures look identical, the algorithm maps current video frames back to old identical measures causing jump-cuts and data loss.", + "context": "YouTube guitar tabs almost always follow a linear chronological left-to-right scrolling pattern. Only velocity tracking is needed." + }, + { + "date": "2026-03-28", + "decision": "Median filtering for playhead removal.", + "rationale": "Animated cursors traversing horizontal staves distort matching distances and visual clarity. Using a Time-Median filter over 5-7 frames permanently erases any rapid-moving objects (like a red cursor) leaving only the static or slowly scrolling background tab notes untouched.", + "context": "The user approved the RAM overhead of taking multiple frames." + } + ], + "blockers": [], + "current_focus": "Phase 1: CV Core Refactor (video_cv_tracker.py)" +} diff --git a/.planning/config.json b/.planning/config.json new file mode 100644 index 0000000..7135861 --- /dev/null +++ b/.planning/config.json @@ -0,0 +1,16 @@ +{ + "mode": "yolo", + "granularity": "standard", + "model_profile": "inherit", + "commit_docs": true, + "sub_repos": [], + "parallelization": true, + "workflow": { + "research": false, + "plan_check": false, + "verifier": true, + "nyquist_validation": true, + "auto_advance": true, + "_auto_chain_active": true + } +} \ No newline at end of file