From e02626fda8c41610017bf2220f9700a94025a15a Mon Sep 17 00:00:00 2001 From: Variet-Worker Date: Sat, 11 Apr 2026 18:14:41 +0900 Subject: [PATCH] =?UTF-8?q?chore(session):=20pause=20work=20=E2=80=94=20Qw?= =?UTF-8?q?en=20promoted=20to=20primary=20+=20Hermes=20v0.8.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Engine: - default_role: fast → balanced (Qwen 3.5 35B primary) - balanced: remove -t/-tb (no impact with -ngl 999) Hermes Agent submodule: - Update fff237e1 → e902e55b (v0.8.0, 340 commits merged) - Local 8 file patches auto-merged (stash/pop, 0 conflicts) - mmproj CPU offload, DISCORD_HOME_CHANNEL fix via external ~/.hermes/config.yaml Decisions: - Speculative decoding experiment rejected (+14% gen vs -31% cold start) - llama.cpp kept at b8660 (b8757 has 9% Gemma 4 regression) - Qwen superior for thinking/Korean/coding; speed diff negligible Session handoff: .planning/HANDOFF.json Co-Authored-By: Claude Opus 4.6 (1M context) --- .planning/HANDOFF.json | 105 ++++++++++++++++++++++++++++++-------- agents/hermes-agent | 2 +- config/engine_models.json | 6 +-- 3 files changed, 85 insertions(+), 28 deletions(-) diff --git a/.planning/HANDOFF.json b/.planning/HANDOFF.json index 67ad1e9..1dde1e3 100644 --- a/.planning/HANDOFF.json +++ b/.planning/HANDOFF.json @@ -1,32 +1,93 @@ { "version": "1.0", - "timestamp": "2026-04-09T21:54:17+09:00", - "phase": "06", - "phase_name": "install-and-evaluate-hermes-agent", - "phase_dir": ".planning/phases/06-install-and-evaluate-hermes-agent", + "timestamp": "2026-04-11T16:30:00+09:00", + "session_type": "maintenance", + "phase": "01", + "phase_name": "llm-tuning", + "phase_dir": ".planning/phases/01-llm-tuning", "plan": 1, - "task": 5, - "total_tasks": 5, + "task": null, + "total_tasks": null, "status": "paused", - "completed_tasks": [ - {"id": 1, "name": "Hermes Agent Repository Clone & Config", "status": "done", "commit": "none"}, - {"id": 2, "name": "Variet Engine 로컬 연결 및 .env 세팅", "status": "done", "commit": "none"}, - {"id": 3, "name": "윈도우 플랫폼 버그 패치", "status": "done", "commit": "none"}, - {"id": 4, "name": "GSD Phase 06 완료 처리", "status": "done", "commit": "none"}, - {"id": 5, "name": "Gemma-4-A4B 무한루프 및 타임아웃 방어 옵션 주입", "status": "done", "commit": "none"} - ], - "remaining_tasks": [], - "blockers": [], - "human_actions_pending": [ - {"action": "현재 실행중인 콘솔에서 run_hermes_agent.bat 를 재실행하여 파이썬 변경 사항 메모리 적재하기", "context": "방금 전 코드 패치가 적용되려면 반드시 새로 구동해야 합니다", "blocking": true} + "summary": "Maintenance session — Phase 01 LLM tuning re-verification + Hermes Agent v0.8.0 update + Qwen promoted to primary role", + "completed_work": [ + {"id": 1, "name": "Balanced role (Qwen 3.5 35B-A3B) v3 retuning", "status": "done", "commit": "0dee779"}, + {"id": 2, "name": "Fast role (Gemma 4 26B-A4B) v3 retuning + mmproj GPU", "status": "done", "commit": "0dee779"}, + {"id": 3, "name": "Speculative decoding experiment (E2B draft) + rejection decision", "status": "done", "commit": "0dee779"}, + {"id": 4, "name": "Benchmark utilities (bench_short/bench_long/test_ts_ratios)", "status": "done", "commit": "0dee779"}, + {"id": 5, "name": "E2B model files removed (3.8 GB disk recovery)", "status": "done", "commit": "0dee779"}, + {"id": 6, "name": "llama.cpp b8660 -> b8757 update + regression analysis", "status": "done", "commit": "pending"}, + {"id": 7, "name": "llama.cpp b8660 retained (Gemma 4 9% regression vs bug fixes)", "status": "done", "commit": "pending"}, + {"id": 8, "name": "Qwen 3.5 35B promoted to primary role (default_role=balanced)", "status": "done", "commit": "pending"}, + {"id": 9, "name": "Hermes Agent update fff237e1 -> e902e55b (v0.8.0, 340 commits)", "status": "done", "commit": "pending"}, + {"id": 10, "name": "Local Hermes patches preserved via stash/pop auto-merge (8 files)", "status": "done", "commit": "pending"}, + {"id": 11, "name": "Hermes config.yaml updated: Qwen default + DISCORD_HOME_CHANNEL fix", "status": "done", "commit": "pending"} ], "decisions": [ - {"decision": "추론형(Reasoning) 모델의 SWA 관련 무한 루프 늪을 방지하기 위해 run_agent.py에 min_p(0.05)와 repeat_penalty(1.05) 직접 강제 주입", "rationale": "가장 확실하고 공식적인 로컬 커뮤니티 국룰 세팅으로, 온도를 높이는 것보다 환각이 덜 발생", "phase": "06"} + { + "decision": "Qwen 3.5 35B (balanced) promoted to primary role", + "rationale": "Speed differential vs Gemma 4 26B is 1.25 t/s (negligible). Qwen larger (35B vs 26B), thinking mode, better Korean/coding, architecture mature. Vision CPU offload (6.4s/image) acceptable for Hermes Agent text-dominant workload.", + "phase": "01" + }, + { + "decision": "llama.cpp kept at b8660 (rolled back from b8757)", + "rationale": "b8757 has Gemma 4 correctness fixes but introduces ~9% speed regression on Gemma 4. b8757 binary retained in llama_bin_run.backup_b8660/ for rollback. Re-evaluate on next llama.cpp release.", + "phase": "01" + }, + { + "decision": "Speculative Decoding rejected", + "rationale": "+14% avg gen speedup vs -31% cold start penalty + tokenizer mismatch (E2B vs 26B) + mmproj conflict + 5+ additional flags complexity.", + "phase": "01" + }, + { + "decision": "KV cache q8_0 for fast, q4_0 for balanced", + "rationale": "fast needs mmproj GPU capacity (q8_0 saves 2.5 GB). balanced uses q4_0 for maximum VRAM headroom.", + "phase": "01" + }, + { + "decision": "Removed --mlock/--poll/--prio/-t/-tb from all roles", + "rationale": "Measured impact 0.04 t/s. Dedicated inference machine has no CPU contention.", + "phase": "01" + } + ], + "performance_baselines": { + "balanced_qwen3.5_35b": { + "gen_short_tps": 64.16, + "prefill_long_tps": 1157, + "gen_long_tps": 62.00, + "vram_free_mb": 3246, + "mmproj_mode": "CPU", + "vision_encode_sec": 6.4 + }, + "fast_gemma4_26b": { + "gen_short_tps": 71.89, + "prefill_long_tps": 1672, + "gen_long_tps": 66.67, + "vram_free_mb": 2073, + "mmproj_mode": "GPU", + "vision_encode_sec": 1.0 + } + }, + "hardware_constraints": { + "gpu0_pcie": "Gen3 x4 (3.94 GB/s)", + "gpu1_pcie": "Gen4 x16 (31.5 GB/s)", + "asymmetric_bottleneck": "GPU0 PCIe x4 limits inter-GPU transfers", + "total_vram_mb": 24576 + }, + "blockers": [], + "human_actions_pending": [ + { + "action": "Run run_hermes_agent.bat to start Hermes Agent", + "context": "All config ready. Qwen 3.5 35B loaded, DISCORD_HOME_CHANNEL set, v0.8.0 active.", + "blocking": false + } ], "uncommitted_files": [ - "agents/hermes-agent/run_agent.py", - "agents/hermes-agent/agent/auxiliary_client.py" + "config/engine_models.json", + "agents/hermes-agent (submodule)", + "~/.hermes/config.yaml (outside repo)", + "agents/hermes-agent/cli-config.yaml" ], - "next_action": "에이전트 재시작 후 봇과 정상 대화 검증하기", - "context_notes": "모델의 30분 무한루프를 해결하고 휴식에 들어갑니다. 해결 완료 상태이며 사용자님의 로컬 256K A4B 모델은 성능에 문제가 없음을 확인했습니다." + "next_action": "Commit remaining changes + git push + start Hermes Agent", + "context_notes": "Maintenance session for Phase 01 and Hermes Agent (Phase 06 outcome). Phase 05 VS Code Extension Packaging still planning. Next real development session determined by user priority." } diff --git a/agents/hermes-agent b/agents/hermes-agent index fff237e..e902e55 160000 --- a/agents/hermes-agent +++ b/agents/hermes-agent @@ -1 +1 @@ -Subproject commit fff237e11198a8918086bc4a2f53300a0a48dfcf +Subproject commit e902e55b26aab4658debab070fc1048b22517158 diff --git a/config/engine_models.json b/config/engine_models.json index 3986e4b..3bd0b64 100644 --- a/config/engine_models.json +++ b/config/engine_models.json @@ -1,6 +1,6 @@ { "version": "1.0", - "default_role": "fast", + "default_role": "balanced", "llama_server": { "path": "llama_bin_run/llama-server.exe", "internal_port": 8080, @@ -62,10 +62,6 @@ "256", "-b", "512", - "-t", - "6", - "-tb", - "6", "-ts", "0.48,0.52" ]