feat: Variet Engine v1.0 + 5-model tuning complete

Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
This commit is contained in:
Variet-Worker
2026-04-07 18:08:58 +09:00
parent 7c7a899fd5
commit c111b3a9b0
414 changed files with 3402 additions and 68598 deletions

View File

@@ -0,0 +1,163 @@
"""
Gemma4 26B --n-cpu-moe sweep + secondary param tuning at 256K context.
Gemma4 has 30 layers. Sweep n-cpu-moe from 0 to 30.
"""
import subprocess, time, json, urllib.request, sys, os
# Switch stdout to UTF-8 so non-ASCII characters printed by main() (e.g. the
# star marker) can be written; presumably needed for consoles whose default
# encoding is not UTF-8.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError, OSError):
    # Best effort only: stdout may have been replaced by an object that has
    # no reconfigure() or that refuses the change. Unlike a bare ``except``,
    # this does not swallow KeyboardInterrupt/SystemExit.
    pass

# Address polled by wait_ready() and benchmarked by bench(). The port must
# match the "--port" value that start() passes to the server.
BASE_URL = "http://127.0.0.1:8000"
# Server binary and model file, relative to the current working directory
# (start() launches the process with cwd=os.getcwd()).
SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"
CTX = 262144  # context size passed via "-c" (256 * 1024)
RUNS = 3  # number of timed bench() calls per configuration in test()
def kill():
    """Force-kill every ``llama-server.exe`` process, then pause 4 seconds.

    Uses the Windows ``taskkill`` tool; its output is captured and discarded,
    and its exit status is not checked (so "no such process" is not an error).
    The pause presumably gives the OS time to release the server's resources
    before the next launch.
    """
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill_cmd, capture_output=True)
    time.sleep(4)
def start(ncpumoe, t=4, ub=512, b=2048, ctk="q4_0", ctv="q4_0", prio=3, nommap=False):
    """Launch llama-server with the given tuning parameters.

    Args:
        ncpumoe: Value for ``--n-cpu-moe``; the flag is omitted when this is
            0 or negative.
        t: Thread count, passed to both ``-t`` and ``-tb``.
        ub: Value for ``-ub``.
        b: Value for ``-b``.
        ctk: Value for ``--cache-type-k``.
        ctv: Value for ``--cache-type-v``.
        prio: Value for ``--prio``.
        nommap: When true, ``--no-mmap`` is appended.

    Returns:
        The ``subprocess.Popen`` handle of the started server process.
    """
    # NOTE: "--port 8000" must stay in sync with BASE_URL; "--host 0.0.0.0"
    # makes the server listen on all interfaces.
    cmd = [SERVER, "--model", MODEL, "-ngl", "999",
           "-c", str(CTX), "-np", "1", "-fa", "on",
           "--cache-type-k", ctk, "--cache-type-v", ctv,
           "-ub", str(ub), "-b", str(b), "-t", str(t), "-tb", str(t),
           "--prio", str(prio), "--poll", "50",
           "--mlock", "--port", "8000", "--host", "0.0.0.0"]
    if ncpumoe > 0:
        cmd.extend(["--n-cpu-moe", str(ncpumoe)])
    if nommap:
        cmd.append("--no-mmap")
    # Nothing in this script reads the server's output. Routing it into a
    # pipe that is never drained can block the child once the OS pipe buffer
    # fills, so the output (stdout and stderr) is discarded instead.
    return subprocess.Popen(cmd, stdout=subprocess.DEVNULL,
                            stderr=subprocess.STDOUT, cwd=os.getcwd())
def wait_ready(timeout=240):
    """Poll the server's ``/health`` endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as ``/health`` answers with JSON whose ``"status"`` is
        ``"ok"``; False if that has not happened when ``timeout`` elapses.
    """
    # monotonic() is immune to system clock adjustments during the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=3) as resp:
                if json.loads(resp.read()).get("status") == "ok":
                    return True
        except Exception:
            # Connection refused, HTTP error, timeout or unparsable body all
            # mean "not ready yet" -> retry. KeyboardInterrupt/SystemExit are
            # deliberately NOT caught so the wait can be aborted with Ctrl-C.
            pass
        time.sleep(2)
    return False
def bench(n=200):
    """Time one chat completion and return its throughput in tokens/second.

    Args:
        n: ``max_tokens`` limit sent with the request.

    Returns:
        ``usage.completion_tokens`` from the response divided by the elapsed
        time of the whole HTTP request (so prompt processing and transport
        overhead are included). Returns 0 when the response carries no
        completion-token count or the elapsed time is not positive.

    Raises:
        Whatever ``urllib.request.urlopen`` / ``json.loads`` raise on a
        failed request or malformed response; callers are expected to handle
        these.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user",
                      "content": "Count from 1 to 50, each number on new line."}],
        "max_tokens": n,
        "temperature": 0.0,
    }).encode()
    req = urllib.request.Request(f"{BASE_URL}/v1/chat/completions", data=payload,
                                 headers={"Content-Type": "application/json"})
    # perf_counter() is the high-resolution, monotonic clock intended for
    # measuring intervals; time.time() can jump if the system clock is
    # adjusted mid-benchmark.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=300) as resp:
        res = json.loads(resp.read())
    dt = time.perf_counter() - t0
    ct = res.get("usage", {}).get("completion_tokens", 0)
    return ct / dt if dt > 0 else 0
def vram():
    """Return ``(used, total)`` GPU memory as reported by ``nvidia-smi``.

    ``nvidia-smi`` prints one ``used, total`` line per GPU. The values of all
    GPUs are summed, so the result is also meaningful on multi-GPU hosts
    (previously more than one output line made parsing fail and the function
    reported 0/0). Values are in the unit nvidia-smi emits with ``nounits``
    (MiB according to its documentation).

    Returns:
        A tuple of two ints. ``(0, 0)`` if ``nvidia-smi`` is missing, times
        out, or produces output that cannot be parsed.
    """
    try:
        r = subprocess.run(["nvidia-smi", "--query-gpu=memory.used,memory.total",
                            "--format=csv,noheader,nounits"],
                           capture_output=True, text=True, timeout=5)
        used = total = 0
        for line in r.stdout.splitlines():
            if not line.strip():
                continue
            a, b = line.split(",")  # ValueError if not exactly two fields
            used += int(a.strip())
            total += int(b.strip())
        return used, total
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: nvidia-smi not installed; SubprocessError: timeout;
        # ValueError: unexpected output (e.g. an error message or "[N/A]").
        return 0, 0
def test(label, ncpumoe, **kw):
    """Benchmark one server configuration and return its measurements.

    Kills any running server, starts a new one with the given parameters,
    waits for it to become ready, records VRAM usage, runs one short warm-up
    request and then ``RUNS`` timed requests.

    Args:
        label: Human-readable name printed in the progress line and stored in
            the result.
        ncpumoe: ``--n-cpu-moe`` value forwarded to ``start()``.
        **kw: Extra keyword arguments forwarded to ``start()``; they are also
            copied into the result dict.

    Returns:
        A dict with ``label``, ``ncpumoe``, ``avg`` and ``best`` (tokens/s),
        ``vram`` (used memory from ``vram()``) plus the ``kw`` entries, or
        None if the server never became ready or every timed run failed.
    """
    kill()
    print(f" [{label}] Starting...", end=" ", flush=True)
    p = start(ncpumoe, **kw)
    speeds = []
    try:
        if not wait_ready():
            print("FAILED")
            return None
        vu, vt = vram()
        print(f"VRAM:{vu}/{vt} | ", end="", flush=True)
        # Warm-up request; its result and any failure are intentionally
        # ignored. ``except Exception`` (not bare ``except``) keeps Ctrl-C
        # working during long benchmark sessions.
        try:
            bench(20)
        except Exception:
            pass
        for _ in range(RUNS):
            try:
                speeds.append(bench())
            except Exception:
                pass  # a failed run is simply left out of the statistics
    finally:
        # Always stop and reap the server, including on KeyboardInterrupt or
        # an unexpected error, so no stray process keeps the GPU busy.
        p.kill()
        try:
            p.wait(timeout=10)
        except subprocess.TimeoutExpired:
            pass
        if p.stdout is not None:
            p.stdout.close()
    if not speeds:
        print("BENCH FAILED")
        return None
    avg, best = sum(speeds) / len(speeds), max(speeds)
    print(f"AVG:{avg:.1f} BEST:{best:.1f} t/s")
    return {"label": label, "ncpumoe": ncpumoe, "avg": avg, "best": best,
            "vram": vu, **kw}
def _best_result(results):
    """Return the result dict with the highest average tokens/second."""
    return max(results, key=lambda x: x["avg"])


def _record(results, label, ncpumoe, **kw):
    """Run test() for one configuration and append its result on success."""
    r = test(label, ncpumoe, **kw)
    if r:
        results.append(r)


def main():
    """Run the full tuning session and save all results as JSON.

    Phases:
        1.  Coarse ``--n-cpu-moe`` sweep (0..30 in steps of 5).
        1b. Fine sweep around the best coarse value (skipped when it is 0).
        2.  Thread-count sweep at the best ``--n-cpu-moe``.
        3.  Batch-size (``ub``/``b``) sweep with the best thread count.
        4.  KV-cache type sweep with the best thread count.

    ``--no-mmap`` is enabled whenever ``--n-cpu-moe`` exceeds 15. All
    successful results are written to
    ``scripts/tune_results_gemma4_ncpumoe.json``.
    """
    max_layers = 30  # upper bound of the sweep (model layer count per module docstring)

    print("=" * 60)
    print(" Gemma4 26B 256K | --n-cpu-moe Sweep + Param Tune")
    print("=" * 60)
    results = []
    attempted = set()  # n-cpu-moe values already tried, successful or not

    # Phase 1: n-cpu-moe sweep (0, 5, 10, 15, 20, 25, 30)
    print("\n--- Phase 1: --n-cpu-moe sweep ---")
    for n in [0, 5, 10, 15, 20, 25, 30]:
        attempted.add(n)
        # use --no-mmap when heavy CPU offload
        _record(results, f"ncpumoe={n}", n, nommap=n > 15)

    if not results:
        # Without this guard max() below would raise ValueError on [].
        print("\n No configuration could be benchmarked; aborting.")
        return

    # Find best n-cpu-moe
    best_r = _best_result(results)
    best_n = best_r["ncpumoe"]
    print(f"\n ★ Best n-cpu-moe: {best_n} -> {best_r['avg']:.1f} t/s")

    # Fine-tune around best
    if best_n > 0:
        print(f"\n--- Phase 1b: Fine-tune around ncpumoe={best_n} ---")
        for offset in (-3, -1, 1, 3):
            # Clamp into the valid range and skip values that were already
            # tried, so we never request more layers than exist and never
            # benchmark the same value twice.
            n = min(max_layers, max(0, best_n + offset))
            if n in attempted:
                continue
            attempted.add(n)
            _record(results, f"ncpumoe={n}", n, nommap=n > 15)
        best_r = _best_result(results)
        best_n = best_r["ncpumoe"]
        print(f"\n ★ Refined n-cpu-moe: {best_n} -> {best_r['avg']:.1f} t/s")

    # Phase 2: Thread sweep at best n-cpu-moe
    nm = best_n > 15
    print(f"\n--- Phase 2: Thread sweep (ncpumoe={best_n}) ---")
    for t in [2, 4, 6, 8, 10]:
        _record(results, f"t={t}", best_n, t=t, nommap=nm)
    best_t = _best_result([x for x in results if x["ncpumoe"] == best_n])
    # Results from phase 1 carry no "t" key; they ran with start()'s default of 4.
    bt = best_t.get("t", 4)
    print(f"\n ★ Best threads: {bt}")

    # Phase 3: Batch sweep
    print("\n--- Phase 3: Batch sweep ---")
    for ub, b in [(256, 1024), (512, 2048), (512, 4096), (1024, 2048)]:
        _record(results, f"ub={ub},b={b}", best_n, t=bt, ub=ub, b=b, nommap=nm)

    # Phase 4: KV cache type
    print("\n--- Phase 4: KV cache type ---")
    for ctk, ctv in [("q4_0", "q4_0"), ("q8_0", "q8_0")]:
        _record(results, f"kv={ctk}", best_n, t=bt, ctk=ctk, ctv=ctv, nommap=nm)

    # Final report
    best_all = _best_result(results)
    print(f"\n{'='*60}")
    print(f" FINAL BEST: {best_all['label']} -> {best_all['avg']:.1f} t/s (VRAM: {best_all['vram']})")
    print(f"{'='*60}")

    out_path = "scripts/tune_results_gemma4_ncpumoe.json"
    # Make sure the target directory exists so a long session is not lost to
    # a FileNotFoundError at the very end.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)
    print(" Saved: scripts/tune_results_gemma4_ncpumoe.json")


if __name__ == "__main__":
    main()