variet_llm/.planning/HANDOFF.json

{
  "version": "1.0",
  "timestamp": "2026-04-06T21:18:00+09:00",
  "phase": "01",
  "phase_name": "llm-tuning",
  "phase_dir": ".planning/phases/01-llm-tuning",
  "plan": 1,
  "task": 3,
  "total_tasks": 5,
  "status": "paused",
  "completed_tasks": [
    {"id": 1, "name": "Evaluate 122B Single GPU", "status": "done", "commit": ""},
    {"id": 2, "name": "Evaluate 122B Dual GPU memory geometric splitting", "status": "done", "commit": ""},
    {"id": 3, "name": "Calculate theoretical limits of DDR4 MoE fetching", "status": "done", "commit": ""},
    {"id": 4, "name": "Test Qwen 27B Dense context limits", "status": "in_progress", "progress": "Confirmed -c 262144 boots successfully"}
  ],
  "remaining_tasks": [
    {"id": 5, "name": "Evaluate Gemma 4 31B max context and speed", "status": "not_started"}
  ],
  "blockers": [
    {"description": "122B Q4_K_M 20t/s Generation Speed Limit", "type": "technical", "workaround": "This is a physical limitation: DDR4 RAM bandwidth (50 GB/s) is the bottleneck when fetching 4+ GB of active weights, so it cannot be bypassed. Focus has shifted to smaller dense models that fit completely into VRAM."}
  ],
  "human_actions_pending": [],
  "decisions": [
    {"decision": "Stop forcing Dual GPU symmetric utilization on MoE with n-cpu-moe", "rationale": "Model asymmetry forces OOM on one GPU and underutilization on the other.", "phase": "01"},
    {"decision": "Shift focus to Qwen 27B / Gemma 4 31B dense models", "rationale": "They fit 100% into VRAM, bypassing WDDM/PCIe/DDR4 bottlenecks, guaranteeing ~20+ t/s generation speeds.", "phase": "01"}
  ],
  "uncommitted_files": [
    "scripts/find_max_dense.mjs",
    "scripts/tune_122b_20ts.mjs"
  ],
  "next_action": "Complete speed benchmark for Qwen 27B and find max context for Gemma 4 31B",
  "context_notes": "We moved the user's focus away from the 122B Q4_K_M setup, whose generation speed is physically capped, by walking through the VRAM/RAM bandwidth math. We are now pivoting to dense models (27B/31B) to guarantee speed and context size."
}