wip: [01-llm-tuning] paused at task 3/5

2026-04-06 21:17:39 +09:00
parent a09736e930
commit 626a089b6b
1 changed files with 18 additions and 17 deletions
--- a/.planning/HANDOFF.json
+++ b/.planning/HANDOFF.json
@@ -1,33 +1,34 @@
 {
  "version": "1.0",
-  "timestamp": "2026-04-05T13:54:58.707Z",
+  "timestamp": "2026-04-06T21:18:00+09:00",
  "phase": "01",
-  "phase_name": "01-llm-tuning",
+  "phase_name": "llm-tuning",
  "phase_dir": ".planning/phases/01-llm-tuning",
  "plan": 1,
-  "task": 1,
+  "task": 3,
-  "total_tasks": 3,
+  "total_tasks": 5,
  "status": "paused",
  "completed_tasks": [
-    {"id": 1, "name": "Gemma4 26B performance tuning at 256K context", "status": "done", "commit": "none"}
+    {"id": 1, "name": "Evaluate 122B Single GPU", "status": "done", "commit": ""},
+    {"id": 2, "name": "Evaluate 122B Dual GPU memory geometric splitting", "status": "done", "commit": ""},
+    {"id": 3, "name": "Calculate theoretical limits of DDR4 MoE fetching", "status": "done", "commit": ""},
+    {"id": 4, "name": "Test Qwen 27B Dense context bounds limits", "status": "in_progress", "progress": "Confirmed -c 262144 boots successfully"}
  ],
  "remaining_tasks": [
-    {"id": 2, "name": "Proceed with extensions frontend UI integration", "status": "not_started"},
+    {"id": 5, "name": "Evaluate Gemma-4 31B max context and speed", "status": "not_started"}
-    {"id": 3, "name": "Add 2nd RTX 3060 to verify 45-60 t/s MoE performance", "status": "not_started"}
  ],
-  "blockers": [],
+  "blockers": [
-  "human_actions_pending": [
+    {"description": "122B Q4_K_M 20t/s Generation Speed Limit", "type": "technical", "workaround": "Physical limitation of DDR4 RAM bandwidth (50GB/s) against 4+ GB of active weights. Cannot be bypassed. Shifted focus to smaller Dense models that fit completely into VRAM."}
-    {"action": "Decide next step: integration of Extension frontend streaming or adding second GPU for Qwen/Gemma4 full evaluation", "context": "Server is fully optimized for 1 GPU, further improvements in speed require hardware upgrade", "blocking": false}
  ],
+  "human_actions_pending": [],
  "decisions": [
-    {"decision": "Used --n-cpu-moe 10 for Gemma4 26B instead of --cpu-moe", "rationale": "Applying --cpu-moe globally to Gemma4 resulted in severe instability and crashes (graph splits 62) due to SWA+MoE entanglement. Targeted offload (10 layers) prevents VRAM swap and stabilizes split at 2, achieving 30.9 t/s on 1 GPU.", "phase": "01"},
+    {"decision": "Stop forcing Dual GPU symmetric utilization on MoE with n-cpu-moe", "rationale": "Model asymmetry forces OOM on one GPU and underutilization on the other.", "phase": "01"},
-    {"decision": "Verified Qwen3.5 35B-A3B speed capabilities", "rationale": "Tested Qwen 35B limits on 12GB. Found it causes heavy WDDM swap without MoE offload. Confirmed its smaller active parameters (3B vs Gemma4's 4B) will likely make it significantly faster than Gemma4 on a dual 3060 24GB setup up to 64K context.", "phase": "01"}
+    {"decision": "Shift focus to Qwen 27B / Gemma 4 31B dense models", "rationale": "They fit 100% into VRAM, bypassing WDDM/PCIe/DDR4 bottlenecks, guaranteeing ~20+ t/s generation speeds.", "phase": "01"}
  ],
  "uncommitted_files": [
-    "start_gemma4_26b_api.bat",
+    "scripts/find_max_dense.mjs",
-    "scripts/auto_tune_gemma4_256k.py",
+    "scripts/tune_122b_20ts.mjs"
-    "scripts/auto_tune_gemma4_ncpumoe.py"
  ],
-  "next_action": "Resume development on OpenClaude integration (Extension frontend UI) or configure Dual-GPU testing.",
+  "next_action": "Complete speed benchmark for Qwen 27B and find max context for Gemma 4 31B",
-  "context_notes": "We've successfully proven the 1 GPU tuning threshold for Gemma4 (30.9 t/s). We also understood why OpenClaude needs large contexts (200K default scaling) and mapped out exact expectations for Qwen VS Gemma on 2x GPUs."
+  "context_notes": "We successfully shifted the user's focus away from physically impossible 122B Q4_K_M constraints by laying down concrete mathematical logic about VRAM/RAM bandwidth. We are now pivoting to dense models (27B/31B) to guarantee speed and context size."
 }