wip: [01-llm-tuning] paused at task 1/3

2026-04-05 22:59:01 +09:00
parent 66778b750d
commit a09736e930
12 changed files with 53 additions and 1094 deletions
--- a/.planning/HANDOFF.json
+++ b/.planning/HANDOFF.json
@@ -1,25 +1,33 @@
 {
  "version": "1.0",
-  "timestamp": "2026-04-05T00:51:15+09:00",
-  "phase": "00-initialization",
-  "phase_name": "Project Initialization",
-  "phase_dir": ".planning",
-  "plan": 0,
-  "task": 0,
-  "total_tasks": 0,
+  "timestamp": "2026-04-05T13:54:58.707Z",
+  "phase": "01",
+  "phase_name": "01-llm-tuning",
+  "phase_dir": ".planning/phases/01-llm-tuning",
+  "plan": 1,
+  "task": 1,
+  "total_tasks": 3,
  "status": "paused",
  "completed_tasks": [
-    {"id": 1, "name": "Initialize Project & Repo", "status": "done", "commit": "e37f65a"}
+    {"id": 1, "name": "Gemma4 26B performance tuning at 256K context", "status": "done", "commit": "none"}
  ],
  "remaining_tasks": [
-    {"id": 2, "name": "Run /gsd-plan-phase 1 to start planning Phase 1", "status": "not_started"}
+    {"id": 2, "name": "Proceed with Extension frontend UI integration", "status": "not_started"},
+    {"id": 3, "name": "Add 2nd RTX 3060 to verify 45-60 t/s MoE performance", "status": "not_started"}
  ],
  "blockers": [],
-  "human_actions_pending": [],
-  "decisions": [
-    {"decision": "2+0 GPU Architecture (Machine A API Server, Machine B tools)", "rationale": "Prioritize coding speed (50-80 t/s) and separate logic cleanly", "phase": "00"}
+  "human_actions_pending": [
+    {"action": "Decide next step: integrate Extension frontend streaming, or add a second GPU for full Qwen/Gemma4 evaluation", "context": "Server is fully optimized for 1 GPU; further speed improvements require a hardware upgrade", "blocking": false}
  ],
-  "uncommitted_files": [],
-  "next_action": "Run /gsd-plan-phase 1 to plan the Machine A server setup and hot-swap script.",
-  "context_notes": "We just finalized the initial architecture plan for Variet LLM involving Dual GPUs on Machine A for pure API inference, and Machine B as the workstation for VS Code Extension, Discord Bot, and Search/MCP tools."
+  "decisions": [
+    {"decision": "Used --n-cpu-moe 10 for Gemma4 26B instead of --cpu-moe", "rationale": "Applying --cpu-moe globally to Gemma4 resulted in severe instability and crashes (graph splits 62) due to SWA+MoE entanglement. Targeted offload (10 layers) prevents VRAM swap and stabilizes split at 2, achieving 30.9 t/s on 1 GPU.", "phase": "01"},
+    {"decision": "Verified Qwen3.5 35B-A3B speed capabilities", "rationale": "Tested Qwen 35B limits on 12GB. Found it causes heavy WDDM swap without MoE offload. Confirmed its smaller active parameters (3B vs Gemma4's 4B) will likely make it significantly faster than Gemma4 on a dual 3060 24GB setup up to 64K context.", "phase": "01"}
+  ],
+  "uncommitted_files": [
+    "start_gemma4_26b_api.bat",
+    "scripts/auto_tune_gemma4_256k.py",
+    "scripts/auto_tune_gemma4_ncpumoe.py"
+  ],
+  "next_action": "Resume development on OpenClaude integration (Extension frontend UI) or configure Dual-GPU testing.",
+  "context_notes": "We've successfully proven the 1 GPU tuning threshold for Gemma4 (30.9 t/s). We also now understand why OpenClaude needs large contexts (200K default scaling) and have mapped out exact expectations for Qwen vs. Gemma on 2x GPUs."
 }