{ "version": "1.0", "timestamp": "2026-04-11T16:30:00+09:00", "session_type": "maintenance", "phase": "01", "phase_name": "llm-tuning", "phase_dir": ".planning/phases/01-llm-tuning", "plan": 1, "task": null, "total_tasks": null, "status": "paused", "summary": "Maintenance session — Phase 01 LLM tuning re-verification + Hermes Agent v0.8.0 update + Qwen promoted to primary role", "completed_work": [ {"id": 1, "name": "Balanced role (Qwen 3.5 35B-A3B) v3 retuning", "status": "done", "commit": "0dee779"}, {"id": 2, "name": "Fast role (Gemma 4 26B-A4B) v3 retuning + mmproj GPU", "status": "done", "commit": "0dee779"}, {"id": 3, "name": "Speculative decoding experiment (E2B draft) + rejection decision", "status": "done", "commit": "0dee779"}, {"id": 4, "name": "Benchmark utilities (bench_short/bench_long/test_ts_ratios)", "status": "done", "commit": "0dee779"}, {"id": 5, "name": "E2B model files removed (3.8 GB disk recovery)", "status": "done", "commit": "0dee779"}, {"id": 6, "name": "llama.cpp b8660 -> b8757 update + regression analysis", "status": "done", "commit": "pending"}, {"id": 7, "name": "llama.cpp b8660 retained (Gemma 4 9% regression vs bug fixes)", "status": "done", "commit": "pending"}, {"id": 8, "name": "Qwen 3.5 35B promoted to primary role (default_role=balanced)", "status": "done", "commit": "pending"}, {"id": 9, "name": "Hermes Agent update fff237e1 -> e902e55b (v0.8.0, 340 commits)", "status": "done", "commit": "pending"}, {"id": 10, "name": "Local Hermes patches preserved via stash/pop auto-merge (8 files)", "status": "done", "commit": "pending"}, {"id": 11, "name": "Hermes config.yaml updated: Qwen default + DISCORD_HOME_CHANNEL fix", "status": "done", "commit": "pending"} ], "decisions": [ { "decision": "Qwen 3.5 35B (balanced) promoted to primary role", "rationale": "Speed differential vs Gemma 4 26B is 1.25 t/s (negligible). Qwen larger (35B vs 26B), thinking mode, better Korean/coding, architecture mature. Vision CPU offload (6.4s/image) acceptable for Hermes Agent text-dominant workload.", "phase": "01" }, { "decision": "llama.cpp kept at b8660 (rolled back from b8757)", "rationale": "b8757 has Gemma 4 correctness fixes but introduces ~9% speed regression on Gemma 4. b8757 binary retained in llama_bin_run.backup_b8660/ for rollback. Re-evaluate on next llama.cpp release.", "phase": "01" }, { "decision": "Speculative Decoding rejected", "rationale": "+14% avg gen speedup vs -31% cold start penalty + tokenizer mismatch (E2B vs 26B) + mmproj conflict + 5+ additional flags complexity.", "phase": "01" }, { "decision": "KV cache q8_0 for fast, q4_0 for balanced", "rationale": "fast needs mmproj GPU capacity (q8_0 saves 2.5 GB). balanced uses q4_0 for maximum VRAM headroom.", "phase": "01" }, { "decision": "Removed --mlock/--poll/--prio/-t/-tb from all roles", "rationale": "Measured impact 0.04 t/s. Dedicated inference machine has no CPU contention.", "phase": "01" } ], "performance_baselines": { "balanced_qwen3.5_35b": { "gen_short_tps": 64.16, "prefill_long_tps": 1157, "gen_long_tps": 62.00, "vram_free_mb": 3246, "mmproj_mode": "CPU", "vision_encode_sec": 6.4 }, "fast_gemma4_26b": { "gen_short_tps": 71.89, "prefill_long_tps": 1672, "gen_long_tps": 66.67, "vram_free_mb": 2073, "mmproj_mode": "GPU", "vision_encode_sec": 1.0 } }, "hardware_constraints": { "gpu0_pcie": "Gen3 x4 (3.94 GB/s)", "gpu1_pcie": "Gen4 x16 (31.5 GB/s)", "asymmetric_bottleneck": "GPU0 PCIe x4 limits inter-GPU transfers", "total_vram_mb": 24576 }, "blockers": [], "human_actions_pending": [ { "action": "Run run_hermes_agent.bat to start Hermes Agent", "context": "All config ready. Qwen 3.5 35B loaded, DISCORD_HOME_CHANNEL set, v0.8.0 active.", "blocking": false } ], "uncommitted_files": [ "config/engine_models.json", "agents/hermes-agent (submodule)", "~/.hermes/config.yaml (outside repo)", "agents/hermes-agent/cli-config.yaml" ], "next_action": "Commit remaining changes + git push + start Hermes Agent", "context_notes": "Maintenance session for Phase 01 and Hermes Agent (Phase 06 outcome). Phase 05 VS Code Extension Packaging still planning. Next real development session determined by user priority." }