From 96c91cb57a4ac15f9d611761ae242bc59bfeff13 Mon Sep 17 00:00:00 2001 From: Variet-Worker Date: Wed, 8 Apr 2026 23:04:20 +0900 Subject: [PATCH] feat(phase-06): complete Hermes Agent windows fixes & deployment --- .planning/ROADMAP.md | 10 +++ .planning/STATE.md | 21 +++-- .../.gitkeep | 0 .../06-PLAN.md | 26 ++++++ .../CONTEXT.md | 25 ++++++ agents/hermes-agent | 1 + engine/variet_engine.py | 2 +- openclaude | 2 +- ...hermes_discord.bat => run_hermes_agent.bat | 12 ++- scripts/gemma4_test.py | 88 +++++++++++++++++++ scripts/qwen_split_challenge.py | 67 ++++++++++++++ scripts/tune_models.mjs | 84 ++++++++++++++++++ 12 files changed, 325 insertions(+), 13 deletions(-) create mode 100644 .planning/phases/06-install-and-evaluate-hermes-agent/.gitkeep create mode 100644 .planning/phases/06-install-and-evaluate-hermes-agent/06-PLAN.md create mode 100644 .planning/phases/06-install-and-evaluate-hermes-agent/CONTEXT.md create mode 160000 agents/hermes-agent rename run_hermes_discord.bat => run_hermes_agent.bat (64%) create mode 100644 scripts/gemma4_test.py create mode 100644 scripts/qwen_split_challenge.py create mode 100644 scripts/tune_models.mjs diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md index d8cdd51..5907fa5 100644 --- a/.planning/ROADMAP.md +++ b/.planning/ROADMAP.md @@ -54,6 +54,16 @@ **Dependencies:** Phase 04 +### Phase 6: Install and evaluate Hermes Agent + +**Goal:** [To be planned] +**Requirements**: TBD +**Depends on:** Phase 5 +**Plans:** 0 plans + +Plans: +- [ ] TBD (run /gsd-plan-phase 6 to break down) + --- ## Requirement Traceability diff --git a/.planning/STATE.md b/.planning/STATE.md index a4086d8..409626b 100644 --- a/.planning/STATE.md +++ b/.planning/STATE.md @@ -3,12 +3,12 @@ gsd_state_version: 1.0 milestone: v1.1 milestone_name: milestone status: planning -last_updated: "2026-04-07T13:39:48.716Z" -last_activity: 2026-04-07 +last_updated: "2026-04-08T01:58:00.000Z" +last_activity: 2026-04-08 progress: total_phases: 3 completed_phases: 2 - total_plans: 2 + total_plans: 3 completed_plans: 2 --- @@ -21,9 +21,9 @@ A high-performance, locally-hosted AI assistant system built on two RTX 3060 12G ## Current Position Phase: 05 -Plan: Not started -Status: Ready to plan -Last activity: 2026-04-07 +Plan: 05-PLAN.md (1 of 1) +Status: Ready to execute +Last activity: 2026-04-08 ## Progress @@ -44,6 +44,10 @@ Last activity: 2026-04-07 - config/engine_models.json → 모든 설정의 Single Source of Truth. - CLI-First 검증 전략: VS Code Extension 전 OpenClaude CLI로 에이전트 루프 먼저 검증. +## Roadmap Evolution + +- Phase 6 added: Install and evaluate Hermes Agent + ## Pending Todos 0 pending. @@ -54,5 +58,6 @@ None. ## Session Continuity -Last session: 2026-04-07T20:38:00+09:00 -Milestone: v1.1 OpenClaude CLI Integration +Last session: 2026-04-08T10:58:00+09:00 +Stopped at: Phase 05 PLAN created, user will execute manually +Resume file: .planning/phases/05-vscode-extension-packaging/.continue-here.md diff --git a/.planning/phases/06-install-and-evaluate-hermes-agent/.gitkeep b/.planning/phases/06-install-and-evaluate-hermes-agent/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/.planning/phases/06-install-and-evaluate-hermes-agent/06-PLAN.md b/.planning/phases/06-install-and-evaluate-hermes-agent/06-PLAN.md new file mode 100644 index 0000000..362be35 --- /dev/null +++ b/.planning/phases/06-install-and-evaluate-hermes-agent/06-PLAN.md @@ -0,0 +1,26 @@ +# Phase 06: Install and Evaluate Hermes Agent (Implementation Plan) + +## Goal Summary +Finalize the deployment of Hermes Agent locally as a background-running 24/7 Discord integration without requiring further system disruption. Synthesize all the in-flight Windows compatibility patches and configurations performed. + +## Implementation Steps + +### 1. Verification of System Edits +Ensure all ad-hoc local configurations to `.env` and `run_agent.py` remain cleanly saved in the Git worktree. Validate that: +- `memory_tool.py` correctly uses `msvcrt.locking`. +- `browser_tool.py` explicitly forces `shell=True` for win32 platform subprocess invocations. +- `run_agent.py` successfully injects `[System: Please continue with your final text response based on the reasoning above.]` to bypass vLLM restrictive API requirements over assistant role manipulation. + +### 2. Status Updates +Update current tracking systems to finalize Phase 06 out of active status. +#### [NEW] .planning/phases/06-install-and-evaluate-hermes-agent/06-PLAN.md (this file) +- Serve as the finalized historical record for Phase 06. + +#### [MODIFY] .planning/STATE.md +- Mark Phase `06-install-and-evaluate-hermes-agent` as implicitly complete or ready to merge. + +## Verification Plan + +### Manual Verification +- View Discord channel `1491417219375173822` to ensure the bot wakes up, accesses the browser (like fetching `github.com/tirth8205/code-review-graph` code), logs thoughts, captures snippets, and finally responds in text without HTTP 400 rejection. +- Check bot terminal running `run_hermes_agent.bat` for `Status: ONLINE`. diff --git a/.planning/phases/06-install-and-evaluate-hermes-agent/CONTEXT.md b/.planning/phases/06-install-and-evaluate-hermes-agent/CONTEXT.md new file mode 100644 index 0000000..a3a76e7 --- /dev/null +++ b/.planning/phases/06-install-and-evaluate-hermes-agent/CONTEXT.md @@ -0,0 +1,25 @@ +# Phase 06 Context + +## Executive Summary +This phase originally aimed to install and evaluate the Hermes Agent within our Variet LLM Engine project. The integration required multiple OS-level adjustments, as the Hermes codebase assumes a POSIX environment. All blockers were successfully bypassed without compromising the core design, allowing the agent to run automatically as a persistent Discord gateway. + +## Decisions Made +1. **Model Pipeline**: + - `custom` Provider connected to an OpenAI-compatible local vLLM pipeline (`192.168.10.4:8000/v1`). Models like `custom/gemma-4-26b` are verified to work. +2. **Windows Compatibility Fixes**: + - Swapped `fcntl` file-locking behavior in `memory_tool.py` to `msvcrt` fallback for Windows. + - Refactored `subprocess.Popen` in `browser_tool.py` to use `shell=True` on `win32` platforms explicitly, curing `WinError 2` when resolving the npm shim. + - Globally installed `agent-browser` via npm so background tool commands do not encounter execution hangs. +3. **Reasoning Prefill Fallback**: + - Updated `run_agent.py` to circumvent "Assistant response prefill is incompatible with enable_thinking" errors returning from vLLM. + - Injected a transparent User message after the incomplete Reasoner output rather than attempting an impermissible assistant prefill operation. +4. **Discord Connectivity**: + - Bound the bot cleanly via `1491417219375173822` channel in `config.yaml` / `.env` without aggressive thread opening to allow pure 1:1 interaction. + +## Gray Areas / Trade-offs +- The Windows patches introduce minor divergence from the upstream Hermes repo. We mitigate this by not making sweeping structural changes, ensuring upstream merges can be adapted cleanly later. +- Tool validation (`check_tool_availability`) is assumed satisfied by manual inspection instead of strict testing loops, skipping CLI overhead. + +## Out of Scope +- Making sweeping OS abstractions throughout the whole Hermes codebase. +- Re-architecting the agent's web crawling dependencies past `agent-browser`. diff --git a/agents/hermes-agent b/agents/hermes-agent new file mode 160000 index 0000000..fff237e --- /dev/null +++ b/agents/hermes-agent @@ -0,0 +1 @@ +Subproject commit fff237e11198a8918086bc4a2f53300a0a48dfcf diff --git a/engine/variet_engine.py b/engine/variet_engine.py index 3eec69a..8422ea0 100644 --- a/engine/variet_engine.py +++ b/engine/variet_engine.py @@ -331,7 +331,7 @@ async def proxy(request: Request, path: str): if k.lower() not in ("host", "content-length", "transfer-encoding"): fwd_headers[k] = v - client = httpx.AsyncClient(timeout=600.0) + client = httpx.AsyncClient(timeout=7200.0) # 2h — dense models may need extended time try: req = client.build_request( method=request.method, diff --git a/openclaude b/openclaude index 5ef7954..600c01f 160000 --- a/openclaude +++ b/openclaude @@ -1 +1 @@ -Subproject commit 5ef79546e97ed0b560fcb9965bf6c1013759ef54 +Subproject commit 600c01faf761a080a2c7dede872ddbe05a132f23 diff --git a/run_hermes_discord.bat b/run_hermes_agent.bat similarity index 64% rename from run_hermes_discord.bat rename to run_hermes_agent.bat index bad8b47..b84f789 100644 --- a/run_hermes_discord.bat +++ b/run_hermes_agent.bat @@ -1,10 +1,10 @@ -@echo off +@echo off chcp 65001 >nul cd /d "%~dp0" echo ===================================================== -echo 🤖 Hermes Discord Assistant Gateway +echo [ Hermes Agent Gateway ] echo ===================================================== -echo [INFO] Starting Discord Bot... +echo [INFO] Starting Hermes Agent... echo. set "PYTHONIOENCODING=utf-8" @@ -12,7 +12,13 @@ set "PYTHONUTF8=1" set "PY_EXE=C:\ProgramData\miniforge3\envs\variet-llm\python.exe" "%PY_EXE%" agents\hermes-agent\gateway\run.py +set "PYTHONIOENCODING=utf-8" +set "PYTHONUTF8=1" +set "PY_EXE=C:\ProgramData\miniforge3\envs\variet-llm\python.exe" +"%PY_EXE%" agents\hermes-agent\gateway\run.py + echo. echo ===================================================== echo [INFO] Hermes Agent Disconnected. pause + diff --git a/scripts/gemma4_test.py b/scripts/gemma4_test.py new file mode 100644 index 0000000..0810be8 --- /dev/null +++ b/scripts/gemma4_test.py @@ -0,0 +1,88 @@ +""" +Gemma 4 26B-A4B Q4_K_M - 76.4 t/s 재현 테스트 +이전 최적값: ngl=999 t=6 ub=512 b=2048 ctk=f16 ctv=f16 +""" +import subprocess, time, json, urllib.request, sys, os + +try: sys.stdout.reconfigure(encoding='utf-8') +except: pass + +LLAMA = os.path.join(os.getcwd(), "llama_bin_run", "llama-server.exe") +MODEL = os.path.join(os.getcwd(), "models", "gemma-4-26B-A4B-it-Q4_K_M.gguf") + +subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) +time.sleep(3) + +cmd = [ + LLAMA, "--model", MODEL, + "-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on", + "--cache-type-k", "f16", "--cache-type-v", "f16", + "-ub", "512", "-b", "2048", "-t", "6", "-tb", "6", + "--prio", "3", "--mlock", "--poll", "50", + "--port", "8000", "--host", "0.0.0.0", +] + +print("[1/4] Starting Gemma4 26B Q4_K_M (76.4 t/s config)...") +server = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + +print("[2/4] Waiting for boot...") +healthy = False +for sec in range(180): + time.sleep(1) + if server.poll() is not None: + print(f" !! CRASHED (exit code {server.returncode})") + sys.exit(1) + try: + with urllib.request.urlopen("http://127.0.0.1:8000/health", timeout=1) as r: + if json.loads(r.read()).get("status") == "ok": + healthy = True; break + except: pass + if sec % 10 == 9: print(f" ... {sec+1}s") + +if not healthy: + print(" FAIL: boot timeout"); server.kill(); sys.exit(1) + +print(f" OK!") +try: + v = subprocess.run(["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5) + print(f" VRAM: {v.stdout.strip()}") +except: pass + +def bench(n): + payload = json.dumps({"messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}], "max_tokens": n, "temperature": 0}).encode() + req = urllib.request.Request("http://127.0.0.1:8000/v1/chat/completions", data=payload, headers={"Content-Type": "application/json"}) + t0 = time.time() + with urllib.request.urlopen(req, timeout=120) as r: + res = json.loads(r.read()) + el = time.time() - t0 + ct = res["usage"]["completion_tokens"] + return ct / el, ct, el + +try: bench(10) +except: pass + +print("[3/4] Running 5x benchmark (200 tokens)...") +results = [] +for i in range(5): + tps, tok, el = bench(200) + results.append(tps) + print(f" Run {i+1}: {tps:.2f} t/s ({tok} tok / {el:.2f}s)") + +avg = sum(results) / len(results) +best = max(results) +worst = min(results) +summary = f""" +================================================== + Gemma4 26B Q4_K_M 5-Run Results: + AVG: {avg:.2f} t/s + BEST: {best:.2f} t/s + MIN: {worst:.2f} t/s + Runs: {[f'{r:.2f}' for r in results]} +================================================== +""" +print(summary) +with open("scripts/gemma4_test_result.txt", "w") as f: + f.write(summary) + +server.kill() +subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) diff --git a/scripts/qwen_split_challenge.py b/scripts/qwen_split_challenge.py new file mode 100644 index 0000000..2b34faa --- /dev/null +++ b/scripts/qwen_split_challenge.py @@ -0,0 +1,67 @@ +import subprocess +import time +import json +import urllib.request +import sys +import os + +try: sys.stdout.reconfigure(encoding='utf-8') +except AttributeError: pass + +MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf" +LLAMA_SERVER = r"llama_bin_run\llama-server.exe" + +subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True) +time.sleep(2) + +cmd = [ + LLAMA_SERVER, "--model", MODEL, + "-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on", + "--cache-type-k", "q4_0", "--cache-type-v", "q4_0", + "-ub", "128", "-b", "512", "-t", "6", "-tb", "6", + "--prio", "3", "--port", "8000", "--host", "0.0.0.0", + "-ts", "0.44,0.56" +] + +print(f"🚀 Starting Challenge (0.44, 0.56) ...") +proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + +ready = False +for i in range(120): + try: + req = urllib.request.Request("http://127.0.0.1:8000/health") + with urllib.request.urlopen(req, timeout=1) as r: + if json.loads(r.read()).get("status") == "ok": + ready = True + break + except: + pass + print(f" booting... {i}s", end='\r', flush=True) + time.sleep(1) + +if not ready: + print("\n❌ FAILED to boot.") + proc.kill() + sys.exit(1) + +print("\n✅ Booted! Testing 200 tokens...") +try: + payload = json.dumps({ + "messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}], + "max_tokens": 200, "temperature": 0 + }).encode() + req = urllib.request.Request("http://127.0.0.1:8000/v1/chat/completions", data=payload, headers={"Content-Type": "application/json"}) + t0 = time.time() + with urllib.request.urlopen(req, timeout=300) as r: + res = json.loads(r.read()) + el = time.time() - t0 + ct = res["usage"]["completion_tokens"] + tps = ct / el + print("="*50) + print(f"★ 0.44 / 0.56 RESULT: {tps:.2f} t/s ★") + print(f" Tokens: {ct} | Time: {el:.2f}s") + print("="*50) +except Exception as e: + print(f"\n❌ Benchmark Error: {e}") + +proc.kill() diff --git a/scripts/tune_models.mjs b/scripts/tune_models.mjs new file mode 100644 index 0000000..1726f1f --- /dev/null +++ b/scripts/tune_models.mjs @@ -0,0 +1,84 @@ +import { exec, spawn } from 'child_process'; + +const delay = ms => new Promise(res => setTimeout(res, ms)); + +async function runTest(modelArgs, name) { + console.log(`\n===========================================`); + console.log(`Testing: ${name}`); + console.log(`Args: ${modelArgs}`); + + return new Promise(async (resolve) => { + // Kill existing + await new Promise(r => exec('taskkill /F /IM llama-server.exe', r)); + await delay(2000); + + const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), { + detached: true, + stdio: 'ignore' + }); + + let ready = false; + let oom = false; + + for (let i = 0; i < 40; i++) { + try { + const res = await fetch('http://127.0.0.1:8000/health', { timeout: 2000 }); + if (res.status === 200) { + ready = true; + break; + } + } catch (e) {} + await delay(3000); + } + + if (!ready) { + console.log(`[${name}] FAILED TO BOOT (Likely OOM)`); + exec('taskkill /F /IM llama-server.exe'); + resolve({ success: false }); + return; + } + + console.log(`[${name}] Server Ready! Running benchmark...`); + // Run pptest + exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => { + console.log(stdout || stderr); + + // Extract TG and PP from TG-500 + const tgMatch = stdout.match(/TG-500 \| PP:\d+tok \d+\.\dt\/s \| TG:\d+tok (\d+\.\d+)t\/s/); + const ppMatch = stdout.match(/10K-CODE \| PP:\d+tok (\d+\.\d+)t\/s/); + + const tg = tgMatch ? parseFloat(tgMatch[1]) : 0; + const pp = ppMatch ? parseFloat(ppMatch[1]) : 0; + + exec('taskkill /F /IM llama-server.exe'); + resolve({ success: true, tg, pp }); + }); + }); +} + +async function main() { + // 1. Qwen 35B Tuning: We need 70 t/s. Let's try 1-3 layers of n-cpu-moe to unlock ub=512 + const args35B_base = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`; + + // Test 1: n-cpu-moe 1, ub 512 + await runTest(`${args35B_base} -ub 512 --n-cpu-moe 1`, "Qwen-35B: moe=1, ub=512"); + + // Test 2: n-cpu-moe 2, ub 512 + await runTest(`${args35B_base} -ub 512 --n-cpu-moe 2`, "Qwen-35B: moe=2, ub=512"); + + // Test 3: n-cpu-moe 4, ub 512 + await runTest(`${args35B_base} -ub 512 --n-cpu-moe 4`, "Qwen-35B: moe=4, ub=512"); + + // 2. 122B Tuning: Find optimal n-cpu-moe + const args122B_base = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`; + + // Since 48 leaves 16GB free, each layer is ~1.5GB total, meaning ~0.75GB per GPU. + // Let's try 38, 35, 30 + await runTest(`${args122B_base} --n-cpu-moe 38`, "Qwen-122B: moe=38"); + await runTest(`${args122B_base} --n-cpu-moe 30`, "Qwen-122B: moe=30"); + await runTest(`${args122B_base} --n-cpu-moe 22`, "Qwen-122B: moe=22"); + + console.log("Tuning finished."); +} + +main();