variet_llm/.planning/HANDOFF.json

{
  "version": "1.0",
  "timestamp": "2026-04-05T13:54:58.707Z",
  "phase": "01",
  "phase_name": "01-llm-tuning",
  "phase_dir": ".planning/phases/01-llm-tuning",
  "plan": 1,
  "task": 1,
  "total_tasks": 3,
  "status": "paused",
  "completed_tasks": [
    {"id": 1, "name": "Gemma4 26B performance tuning at 256K context", "status": "done", "commit": "none"}
  ],
  "remaining_tasks": [
    {"id": 2, "name": "Proceed with extensions frontend UI integration", "status": "not_started"},
    {"id": 3, "name": "Add 2nd RTX 3060 to verify 45-60 t/s MoE performance", "status": "not_started"}
  ],
  "blockers": [],
  "human_actions_pending": [
    {"action": "Decide next step: integration of Extension frontend streaming or adding second GPU for Qwen/Gemma4 full evaluation", "context": "Server is fully optimized for 1 GPU, further improvements in speed require hardware upgrade", "blocking": false}
  ],
  "decisions": [
    {"decision": "Used --n-cpu-moe 10 for Gemma4 26B instead of --cpu-moe", "rationale": "Applying --cpu-moe globally to Gemma4 resulted in severe instability and crashes (graph splits 62) due to SWA+MoE entanglement. Targeted offload (10 layers) prevents VRAM swap and stabilizes split at 2, achieving 30.9 t/s on 1 GPU.", "phase": "01"},
    {"decision": "Verified Qwen3.5 35B-A3B speed capabilities", "rationale": "Tested Qwen 35B limits on 12GB. Found it causes heavy WDDM swap without MoE offload. Confirmed its smaller active parameters (3B vs Gemma4's 4B) will likely make it significantly faster than Gemma4 on a dual 3060 24GB setup up to 64K context.", "phase": "01"}
  ],
  "uncommitted_files": [
    "start_gemma4_26b_api.bat",
    "scripts/auto_tune_gemma4_256k.py",
    "scripts/auto_tune_gemma4_ncpumoe.py"
  ],
  "next_action": "Resume development on OpenClaude integration (Extension frontend UI) or configure Dual-GPU testing.",
  "context_notes": "We've successfully proven the 1 GPU tuning threshold for Gemma4 (30.9 t/s). We also understood why OpenClaude needs large contexts (200K default scaling) and mapped out exact expectations for Qwen VS Gemma on 2x GPUs."
}