feat: Variet Engine v1.0 + 5-model tuning complete
Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as the single source of truth
- Replaced 4 individual .bat files with a unified engine

File cleanup:
- scripts/: 85 files -> 9 + _archive/
- Root .bat files -> _archive/
This commit is contained in:
163
scripts/_archive/tuning/auto_tune_gemma4_ncpumoe.py
Normal file
163
scripts/_archive/tuning/auto_tune_gemma4_ncpumoe.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""
|
||||
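Gemma4 26B --n-cpu-moe sweep + secondary parameter tuning at 256K context.

Gemma4 has 30 layers, so --n-cpu-moe is swept from 0 to 30 (coarse steps of 5,
then refined around the best value) before threads, batch sizes and KV cache
types are tuned.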
|
||||
"""
|
||||
import subprocess, time, json, urllib.request, sys, os
|
||||
|
||||
# Best-effort: switch stdout to UTF-8 so the non-ASCII progress markers printed
# by main() ("★", "→") can be written on consoles whose default encoding
# cannot represent them.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    # stdout may have been replaced by an object without reconfigure(), or the
    # stream may refuse the change; keep whatever encoding it already has.
    # Catching Exception (not a bare ``except:``) lets KeyboardInterrupt and
    # SystemExit propagate.
    pass
|
||||
|
||||
# --- Benchmark configuration -------------------------------------------------
# Address used for the /health poll and the chat-completion requests; start()
# launches the server with "--port 8000".
BASE_URL = "http://127.0.0.1:8000"

# Relative Windows paths; start() runs the server with the current working
# directory as cwd, so the script must be launched from the directory that
# contains these folders.
SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"

CTX = 256 * 1024  # value passed to "-c" (262144, i.e. the 256K context)
RUNS = 3          # number of timed bench() calls per configuration
|
||||
|
||||
|
||||
def kill():
    """Force-terminate every ``llama-server.exe`` process, then pause.

    Runs the Windows ``taskkill`` tool with its output captured (and
    discarded); the exit status is not checked, so it is not an error when no
    server is running.
    """
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill_cmd, capture_output=True)
    # Fixed 4 s pause before the caller continues -- presumably to let the OS
    # release the port and memory before the next launch (TODO confirm).
    time.sleep(4)
|
||||
|
||||
|
||||
def start(ncpumoe, t=4, ub=512, b=2048, ctk="q4_0", ctv="q4_0", prio=3, nommap=False):
    """Launch llama-server for one benchmark configuration.

    Args:
        ncpumoe: Value for ``--n-cpu-moe``; the flag is omitted when this is
            0 or less.
        t: Thread count, passed to both ``-t`` and ``-tb``.
        ub: Value for ``-ub``.
        b: Value for ``-b``.
        ctk: Value for ``--cache-type-k``.
        ctv: Value for ``--cache-type-v``.
        prio: Value for ``--prio``.
        nommap: When true, ``--no-mmap`` is appended.

    Returns:
        The ``subprocess.Popen`` handle of the started server. The server's
        output is discarded, so the handle's ``stdout``/``stderr`` are None.
    """
    cmd = [
        SERVER, "--model", MODEL, "-ngl", "999",
        "-c", str(CTX), "-np", "1", "-fa", "on",
        "--cache-type-k", ctk, "--cache-type-v", ctv,
        "-ub", str(ub), "-b", str(b), "-t", str(t), "-tb", str(t),
        "--prio", str(prio), "--poll", "50",
        "--mlock", "--port", "8000", "--host", "0.0.0.0",
    ]
    if ncpumoe > 0:
        cmd.extend(["--n-cpu-moe", str(ncpumoe)])
    if nommap:
        cmd.append("--no-mmap")
    # Nothing in this script reads the server's log. Sending it to a PIPE that
    # is never drained can fill the OS pipe buffer and stall the child, so the
    # output goes to DEVNULL instead.
    return subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                            cwd=os.getcwd())
|
||||
|
||||
|
||||
def wait_ready(timeout=240):
    """Poll the server's ``/health`` endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as the endpoint answers with a JSON object whose
        ``"status"`` is ``"ok"``; False if that has not happened within
        ``timeout`` seconds (immediately for a timeout of 0 or less).
    """
    # Monotonic clock: a wall-clock adjustment must not stretch or cut the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=3) as resp:
                if json.loads(resp.read()).get("status") == "ok":
                    return True
        except Exception:
            # Connection refused, HTTP error while the model loads, malformed
            # body: all mean "not ready yet". Unlike a bare ``except:`` this
            # does not swallow KeyboardInterrupt, so Ctrl+C aborts the wait.
            pass
        time.sleep(2)
    return False
|
||||
|
||||
|
||||
def bench(n=200):
    """Send one chat-completion request and return its generation throughput.

    The elapsed time covers the whole HTTP round trip (request, generation,
    reading and decoding the response), not just token generation.

    Args:
        n: ``max_tokens`` for the request.

    Returns:
        ``completion_tokens`` reported in the response's ``usage`` divided by
        the elapsed seconds; 0 when no token count is reported or no time
        elapsed.

    Raises:
        Whatever ``urllib``/``json`` raise on connection, HTTP or decoding
        errors; callers treat those as a failed run.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user",
                      "content": "Count from 1 to 50, each number on new line."}],
        "max_tokens": n,
        "temperature": 0.0,
    }).encode()
    req = urllib.request.Request(f"{BASE_URL}/v1/chat/completions", data=payload,
                                 headers={"Content-Type": "application/json"})
    # perf_counter() is the high-resolution monotonic clock intended for
    # measuring intervals; time.time() can jump when the system clock changes.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=300) as resp:
        res = json.loads(resp.read())
    dt = time.perf_counter() - t0
    # ``usage`` may be missing or null; both count as zero tokens.
    ct = (res.get("usage") or {}).get("completion_tokens", 0)
    return ct / dt if dt > 0 else 0
|
||||
|
||||
|
||||
def vram():
    """Return ``(used, total)`` GPU memory as reported by ``nvidia-smi``.

    ``nvidia-smi`` prints one ``used, total`` CSV row per GPU. The rows are
    summed, so a multi-GPU host yields the combined figures instead of failing
    to parse; on a single-GPU host the result is that GPU's row. Values are in
    the unit nvidia-smi reports with ``nounits`` (MiB per its documentation).

    Returns:
        A tuple of two ints. ``(0, 0)`` when nvidia-smi is missing, times out,
        prints nothing, or prints something that cannot be parsed.
    """
    query = ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits"]
    try:
        r = subprocess.run(query, capture_output=True, text=True, timeout=5)
        used = total = 0
        for row in r.stdout.splitlines():
            if not row.strip():
                continue
            a, b = row.split(",")
            used += int(a.strip())
            total += int(b.strip())
        return used, total
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: nvidia-smi not installed; SubprocessError: timeout;
        # ValueError: unexpected row shape or non-numeric field (e.g. "[N/A]").
        return 0, 0
|
||||
|
||||
|
||||
def test(label, ncpumoe, **kw):
    """Benchmark one server configuration and return its measurements.

    Kills any running server, starts a new one with the given settings, waits
    for it to become ready, records VRAM usage, performs one short warm-up
    request and then ``RUNS`` timed requests.

    Args:
        label: Name printed in the progress line and stored in the result.
        ncpumoe: ``--n-cpu-moe`` value forwarded to ``start``.
        **kw: Further keyword arguments forwarded to ``start``; they are also
            copied into the result dict.

    Returns:
        A dict with ``label``, ``ncpumoe``, ``avg`` and ``best`` (tokens/s),
        ``vram`` (used memory) plus the entries of ``kw``; None when the
        server never became ready or every timed request failed.
    """
    kill()
    print(f" [{label}] Starting...", end=" ", flush=True)
    p = start(ncpumoe, **kw)
    try:
        if not wait_ready():
            print("FAILED")
            return None
        vu, vt = vram()
        print(f"VRAM:{vu}/{vt} | ", end="", flush=True)
        try:
            bench(20)  # warm-up request; its result is discarded
        except Exception:
            pass
        speeds = []
        for _ in range(RUNS):
            try:
                speeds.append(bench())
            except Exception:
                # A failed request just does not contribute a sample.
                # KeyboardInterrupt is not caught, so Ctrl+C stops the sweep.
                pass
    finally:
        # Always stop the server, including on Ctrl+C or an unexpected error,
        # and reap the process so no handle is left behind.
        p.kill()
        try:
            p.wait(timeout=10)
        except subprocess.TimeoutExpired:
            pass
    if not speeds:
        print("BENCH FAILED")
        return None
    avg, best = sum(speeds) / len(speeds), max(speeds)
    print(f"AVG:{avg:.1f} BEST:{best:.1f} t/s")
    return {"label": label, "ncpumoe": ncpumoe, "avg": avg, "best": best,
            "vram": vu, **kw}
|
||||
|
||||
|
||||
def main():
    """Run the staged tuning sweep and save every measurement as JSON.

    Stages: a coarse ``--n-cpu-moe`` sweep, a refinement around the best
    value, a thread sweep, a batch-size sweep and a KV-cache-type comparison.
    Each stage calls ``test`` and appends successful results to one list; the
    overall best average is printed and the full list is written to
    ``scripts/tune_results_gemma4_ncpumoe.json`` (relative to the current
    working directory).

    If the coarse sweep yields no result at all, a message is printed and the
    function returns without writing a file.
    """
    max_layers = 30  # Gemma4 layer count per the module docstring; cap for --n-cpu-moe

    def by_avg(result):
        return result["avg"]

    print("=" * 60)
    print(" Gemma4 26B 256K | --n-cpu-moe Sweep + Param Tune")
    print("=" * 60)
    results = []

    # Phase 1: n-cpu-moe sweep (0, 5, 10, 15, 20, 25, 30)
    print("\n--- Phase 1: --n-cpu-moe sweep ---")
    for n in [0, 5, 10, 15, 20, 25, 30]:
        nm = n > 15  # use --no-mmap when heavy CPU offload
        r = test(f"ncpumoe={n}", n, nommap=nm)
        if r:
            results.append(r)

    # Every later phase builds on the best Phase 1 result; without one,
    # max() below would raise ValueError on the empty list.
    if not results:
        print("\n No configuration produced a result; aborting.")
        return

    # Find best n-cpu-moe
    best_r = max(results, key=by_avg)
    best_n = best_r["ncpumoe"]
    print(f"\n ★ Best n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")

    # Fine-tune around best
    if best_n > 0:
        print(f"\n--- Phase 1b: Fine-tune around ncpumoe={best_n} ---")
        already_tested = {r["ncpumoe"] for r in results}
        candidates = []
        for n in (best_n - 3, best_n - 1, best_n + 1, best_n + 3):
            # Clamp every candidate to the valid range (the upper neighbour
            # used to escape it), then drop values already measured.
            n = min(max_layers, max(0, n))
            if n not in already_tested and n not in candidates:
                candidates.append(n)
        for n in candidates:
            nm = n > 15
            r = test(f"ncpumoe={n}", n, nommap=nm)
            if r:
                results.append(r)
        best_r = max(results, key=by_avg)
        best_n = best_r["ncpumoe"]
        print(f"\n ★ Refined n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")

    # Phase 2: Thread sweep at best n-cpu-moe
    nm = best_n > 15
    print(f"\n--- Phase 2: Thread sweep (ncpumoe={best_n}) ---")
    for t in [2, 4, 6, 8, 10]:
        r = test(f"t={t}", best_n, t=t, nommap=nm)
        if r:
            results.append(r)
    # Never empty: the result that defined best_n is always among these.
    best_t = max((x for x in results if x["ncpumoe"] == best_n), key=by_avg)
    # Results from Phase 1/1b carry no "t" entry; they ran with start()'s default of 4.
    bt = best_t.get("t", 4)
    print(f"\n ★ Best threads: {bt}")

    # Phase 3: Batch sweep
    print(f"\n--- Phase 3: Batch sweep ---")
    for ub, b in [(256, 1024), (512, 2048), (512, 4096), (1024, 2048)]:
        r = test(f"ub={ub},b={b}", best_n, t=bt, ub=ub, b=b, nommap=nm)
        if r:
            results.append(r)

    # Phase 4: KV cache type
    print(f"\n--- Phase 4: KV cache type ---")
    for ctk, ctv in [("q4_0", "q4_0"), ("q8_0", "q8_0")]:
        r = test(f"kv={ctk}", best_n, t=bt, ctk=ctk, ctv=ctv, nommap=nm)
        if r:
            results.append(r)

    # Final report
    best_all = max(results, key=by_avg)
    print(f"\n{'='*60}")
    print(f" FINAL BEST: {best_all['label']} → {best_all['avg']:.1f} t/s (VRAM: {best_all['vram']})")
    print(f"{'='*60}")

    with open("scripts/tune_results_gemma4_ncpumoe.json", "w") as f:
        json.dump(results, f, indent=2, default=str)
    print(" Saved: scripts/tune_results_gemma4_ncpumoe.json")
|
||||
|
||||
|
||||
# Run the tuning sweep only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user