Files
variet_llm/.planning/HANDOFF.json
2026-04-06 21:17:39 +09:00

35 lines
2.0 KiB
JSON

{
"version": "1.0",
"timestamp": "2026-04-06T21:18:00+09:00",
"phase": "01",
"phase_name": "llm-tuning",
"phase_dir": ".planning/phases/01-llm-tuning",
"plan": 1,
"task": 4,
"total_tasks": 5,
"status": "paused",
"completed_tasks": [
{"id": 1, "name": "Evaluate 122B Single GPU", "status": "done", "commit": ""},
{"id": 2, "name": "Evaluate 122B Dual GPU memory geometric splitting", "status": "done", "commit": ""},
{"id": 3, "name": "Calculate theoretical limits of DDR4 MoE fetching", "status": "done", "commit": ""},
{"id": 4, "name": "Test Qwen 27B Dense context limits", "status": "in_progress", "progress": "Confirmed -c 262144 boots successfully"}
],
"remaining_tasks": [
{"id": 5, "name": "Evaluate Gemma 4 31B max context and speed", "status": "not_started"}
],
"blockers": [
{"description": "122B Q4_K_M cannot reach the 20 t/s generation speed target", "type": "technical", "workaround": "None for the 122B model: this is a physical limitation of DDR4 RAM bandwidth (50 GB/s) against 4+ GB of active weights (50 GB/s ÷ 4+ GB ≈ 12 t/s at best). Cannot be bypassed. Shifted focus to smaller Dense models that fit completely into VRAM."}
],
"human_actions_pending": [],
"decisions": [
{"decision": "Stop forcing Dual GPU symmetric utilization on MoE with n-cpu-moe", "rationale": "Model asymmetry forces OOM on one GPU and underutilization on the other.", "phase": "01"},
{"decision": "Shift focus to Qwen 27B / Gemma 4 31B dense models", "rationale": "They fit 100% into VRAM, bypassing WDDM/PCIe/DDR4 bottlenecks, guaranteeing 20+ t/s generation speeds.", "phase": "01"}
],
"uncommitted_files": [
"scripts/find_max_dense.mjs",
"scripts/tune_122b_20ts.mjs"
],
"next_action": "Complete speed benchmark for Qwen 27B and find max context for Gemma 4 31B",
"context_notes": "We successfully shifted the user's focus away from the physically unattainable 122B Q4_K_M speed target by laying out the concrete VRAM/RAM bandwidth math. We are now pivoting to dense models (27B/31B) to guarantee speed and context size."
}