feat: Variet Engine v1.0 + 5-model tuning complete

Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
2026-04-07 18:08:58 +09:00
parent 7c7a899fd5
commit c111b3a9b0
414 changed files with 3402 additions and 68598 deletions
--- a/scripts/_archive/tuning/auto_tune_gemma4_ncpumoe.py
+++ b/scripts/_archive/tuning/auto_tune_gemma4_ncpumoe.py
@@ -0,0 +1,163 @@
+"""
+Gemma4 26B --n-cpu-moe sweep + secondary param tuning at 256K context.
+Gemma4 has 30 layers. Sweep n-cpu-moe from 0 to 30.
+"""
+import subprocess, time, json, urllib.request, sys, os
+
+# Best-effort: switch stdout to UTF-8. This script prints non-ASCII characters
+# ("★", "→"); presumably this is here so they do not fail on consoles that use
+# a legacy code page. A stdout object that cannot be reconfigured is left as is.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except Exception:
+    # Exception (rather than a bare except) keeps the fallback silent while
+    # still letting KeyboardInterrupt / SystemExit propagate.
+    pass
+
+# Base URL of the llama-server instance under test; the port matches the
+# "--port" value passed in start().
+BASE_URL = "http://127.0.0.1:8000"
+# Server executable and model file, given as relative paths. start() launches
+# the process with cwd=os.getcwd(), so they resolve against the directory the
+# script is run from.
+SERVER = r"llama_bin_run\llama-server.exe"
+MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"
+# Context size passed to the server via "-c": 262144 = 256 * 1024 (256K).
+CTX = 262144
+# Number of timed bench() runs per configuration in test().
+RUNS = 3
+
+
+def kill():
+    """Force-terminate llama-server.exe processes, then pause for 4 seconds.
+
+    Runs the Windows ``taskkill`` utility with ``/F /IM llama-server.exe``.
+    Its output is captured and discarded, and its exit status is not checked.
+    The fixed sleep presumably gives the OS time to release the process's
+    resources before the next launch.
+    """
+    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
+    subprocess.run(taskkill_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    time.sleep(4)
+
+
+def start(ncpumoe, t=4, ub=512, b=2048, ctk="q4_0", ctv="q4_0", prio=3, nommap=False):
+    """Launch llama-server with the given tuning parameters.
+
+    The server is started for MODEL with context size CTX, listening on
+    port 8000.
+
+    Args:
+        ncpumoe: Value for ``--n-cpu-moe``; the flag is omitted when this is
+            0 or negative.
+        t: Thread count, passed to both ``-t`` and ``-tb``.
+        ub: Value for ``-ub``.
+        b: Value for ``-b``.
+        ctk: Value for ``--cache-type-k``.
+        ctv: Value for ``--cache-type-v``.
+        prio: Value for ``--prio``.
+        nommap: When true, ``--no-mmap`` is appended.
+
+    Returns:
+        The ``subprocess.Popen`` handle of the started server. The caller is
+        responsible for terminating it.
+    """
+    cmd = [SERVER, "--model", MODEL, "-ngl", "999",
+           "-c", str(CTX), "-np", "1", "-fa", "on",
+           "--cache-type-k", ctk, "--cache-type-v", ctv,
+           "-ub", str(ub), "-b", str(b), "-t", str(t), "-tb", str(t),
+           "--prio", str(prio), "--poll", "50",
+           "--mlock", "--port", "8000", "--host", "0.0.0.0"]
+    if ncpumoe > 0:
+        cmd.extend(["--n-cpu-moe", str(ncpumoe)])
+    if nommap:
+        cmd.append("--no-mmap")
+    # The server's log output is discarded rather than piped: nothing in this
+    # script reads it, and an undrained pipe can stall the child process once
+    # the OS pipe buffer fills up.
+    return subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+                            cwd=os.getcwd())
+
+
+def wait_ready(timeout=240):
+    """Poll the server's /health endpoint until it reports ready.
+
+    Args:
+        timeout: Maximum number of seconds to keep polling.
+
+    Returns:
+        True as soon as ``GET {BASE_URL}/health`` returns a JSON body whose
+        "status" is "ok"; False if that has not happened within ``timeout``
+        seconds.
+    """
+    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        try:
+            with urllib.request.urlopen(urllib.request.Request(f"{BASE_URL}/health"), timeout=3) as r:
+                if json.loads(r.read()).get("status") == "ok":
+                    return True
+        except Exception:
+            # Any failure of the request or of parsing its body is treated as
+            # "not ready yet". Catching Exception instead of using a bare
+            # except lets Ctrl+C interrupt the wait.
+            pass
+        time.sleep(2)
+    return False
+
+
+def bench(n=200):
+    """Run one chat completion and return its speed in tokens per second.
+
+    Posts a fixed counting prompt to ``{BASE_URL}/v1/chat/completions`` with
+    ``max_tokens=n`` and temperature 0.0, timing the whole request.
+
+    Returns:
+        ``usage.completion_tokens`` from the response (0 if absent) divided by
+        the elapsed wall-clock seconds, or 0 when no time elapsed. Errors
+        from the HTTP call or from JSON parsing propagate to the caller.
+    """
+    body = {
+        "model": "m",
+        "messages": [
+            {"role": "user",
+             "content": "Count from 1 to 50, each number on new line."},
+        ],
+        "max_tokens": n,
+        "temperature": 0.0,
+    }
+    request = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=json.dumps(body).encode(),
+        headers={"Content-Type": "application/json"},
+    )
+    started = time.time()
+    with urllib.request.urlopen(request, timeout=300) as resp:
+        reply = json.loads(resp.read())
+    elapsed = time.time() - started
+    tokens = reply.get("usage", {}).get("completion_tokens", 0)
+    if elapsed > 0:
+        return tokens / elapsed
+    return 0
+
+
+def vram():
+    """Return GPU memory usage as ``(used, total)`` reported by nvidia-smi.
+
+    nvidia-smi prints one "used, total" line per GPU. The figures of all
+    GPUs are summed, so on a single-GPU machine the result is simply that
+    GPU's values (MiB, per nvidia-smi's documentation of these fields).
+
+    Returns:
+        A tuple of two ints. ``(0, 0)`` when nvidia-smi cannot be run, times
+        out, or prints output that cannot be parsed.
+    """
+    try:
+        r = subprocess.run(["nvidia-smi", "--query-gpu=memory.used,memory.total",
+                            "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5)
+        used = total = 0
+        for line in r.stdout.splitlines():
+            if not line.strip():
+                continue
+            a, b = line.split(",")
+            used += int(a.strip())
+            total += int(b.strip())
+        return used, total
+    except (OSError, subprocess.SubprocessError, ValueError):
+        # OSError: nvidia-smi missing; SubprocessError: timeout expired;
+        # ValueError: unexpected field count or non-numeric value.
+        return 0, 0
+
+
+def test(label, ncpumoe, **kw):
+    """Benchmark one server configuration and return its result record.
+
+    Kills any running server, starts a new one with ``ncpumoe`` and ``kw``
+    (both forwarded to start()), waits for it to become ready, samples VRAM,
+    sends one short warm-up request and then RUNS timed bench() requests.
+    Progress is printed to stdout. The started server is killed before
+    returning, including when an unexpected exception propagates.
+
+    Args:
+        label: Name of this configuration, used in the output and the record.
+        ncpumoe: ``--n-cpu-moe`` value for the server.
+        **kw: Extra keyword arguments for start(); also copied into the record.
+
+    Returns:
+        A dict with "label", "ncpumoe", "avg", "best", "vram" and the entries
+        of ``kw``; or None if the server never became ready or every timed
+        run failed.
+    """
+    kill()
+    print(f"  [{label}] Starting...", end=" ", flush=True)
+    p = start(ncpumoe, **kw)
+    try:
+        if not wait_ready():
+            print("FAILED")
+            return None
+        vu, vt = vram()
+        print(f"VRAM:{vu}/{vt} | ", end="", flush=True)
+        # Warm-up request; its result and any failure are deliberately ignored.
+        try:
+            bench(20)
+        except Exception:
+            pass
+        speeds = []
+        for _ in range(RUNS):
+            # A failed run is skipped so the remaining runs still count.
+            # Exception (not a bare except) keeps Ctrl+C working.
+            try:
+                speeds.append(bench())
+            except Exception:
+                pass
+    finally:
+        # Always stop the server started above, even on an unexpected error.
+        p.kill()
+    if not speeds:
+        print("BENCH FAILED")
+        return None
+    avg, best = sum(speeds)/len(speeds), max(speeds)
+    print(f"AVG:{avg:.1f} BEST:{best:.1f} t/s")
+    return {"label": label, "ncpumoe": ncpumoe, "avg": avg, "best": best,
+            "vram": vu, **kw}
+
+
+def main():
+    """Run the full tuning sweep and save every result record as JSON.
+
+    Phases:
+      1.  Coarse ``--n-cpu-moe`` sweep (0..30 in steps of 5); ``--no-mmap`` is
+          used whenever the value is above 15.
+      1b. Fine sweep around the best value (skipped when the best is 0).
+      2.  Thread-count sweep at the best ``--n-cpu-moe``.
+      3.  Batch-size sweep using the best thread count.
+      4.  KV-cache type sweep using the best thread count.
+
+    The configuration with the highest average speed is printed and all
+    records are written to ``scripts/tune_results_gemma4_ncpumoe.json``
+    (relative to the current working directory). If Phase 1 produces no
+    result at all, the function reports that and returns without writing.
+    """
+    print("=" * 60)
+    print("  Gemma4 26B 256K | --n-cpu-moe Sweep + Param Tune")
+    print("=" * 60)
+    results = []
+    max_ncpumoe = 30  # upper bound of the sweep (the model has 30 layers)
+
+    def by_avg(entry):
+        return entry["avg"]
+
+    # Phase 1: n-cpu-moe sweep (0, 5, 10, 15, 20, 25, 30)
+    print("\n--- Phase 1: --n-cpu-moe sweep ---")
+    for n in [0, 5, 10, 15, 20, 25, 30]:
+        nm = n > 15  # use --no-mmap when heavy CPU offload
+        r = test(f"ncpumoe={n}", n, nommap=nm)
+        if r:
+            results.append(r)
+
+    # Without a single successful run there is nothing to rank or refine;
+    # max() on an empty list would raise ValueError.
+    if not results:
+        print("\n  No configuration produced a result; aborting.")
+        return
+
+    # Find best n-cpu-moe
+    best_r = max(results, key=by_avg)
+    best_n = best_r["ncpumoe"]
+    print(f"\n  ★ Best n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")
+
+    # Fine-tune around best
+    if best_n > 0:
+        print(f"\n--- Phase 1b: Fine-tune around ncpumoe={best_n} ---")
+        # Clamp every neighbour to the valid range and drop duplicates, so
+        # that e.g. best_n=30 does not test 31.
+        candidates = []
+        for n in (best_n - 3, best_n - 1, best_n + 1, best_n + 3):
+            n = min(max_ncpumoe, max(0, n))
+            if n != best_n and n not in candidates:
+                candidates.append(n)
+        for n in candidates:
+            nm = n > 15
+            r = test(f"ncpumoe={n}", n, nommap=nm)
+            if r:
+                results.append(r)
+        best_r = max(results, key=by_avg)
+        best_n = best_r["ncpumoe"]
+        print(f"\n  ★ Refined n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")
+
+    # Phase 2: Thread sweep at best n-cpu-moe
+    nm = best_n > 15
+    print(f"\n--- Phase 2: Thread sweep (ncpumoe={best_n}) ---")
+    for t in [2, 4, 6, 8, 10]:
+        r = test(f"t={t}", best_n, t=t, nommap=nm)
+        if r:
+            results.append(r)
+    # Records from the n-cpu-moe sweeps carry no "t" key; they ran with
+    # start()'s default of 4 threads, hence the fallback below.
+    best_t = max([x for x in results if x["ncpumoe"] == best_n], key=by_avg)
+    bt = best_t.get("t", 4)
+    print(f"\n  ★ Best threads: {bt}")
+
+    # Phase 3: Batch sweep
+    print(f"\n--- Phase 3: Batch sweep ---")
+    for ub, b in [(256, 1024), (512, 2048), (512, 4096), (1024, 2048)]:
+        r = test(f"ub={ub},b={b}", best_n, t=bt, ub=ub, b=b, nommap=nm)
+        if r:
+            results.append(r)
+
+    # Phase 4: KV cache type
+    print(f"\n--- Phase 4: KV cache type ---")
+    for ctk, ctv in [("q4_0", "q4_0"), ("q8_0", "q8_0")]:
+        r = test(f"kv={ctk}", best_n, t=bt, ctk=ctk, ctv=ctv, nommap=nm)
+        if r:
+            results.append(r)
+
+    # Final report
+    best_all = max(results, key=by_avg)
+    print(f"\n{'='*60}")
+    print(f"  FINAL BEST: {best_all['label']} → {best_all['avg']:.1f} t/s (VRAM: {best_all['vram']})")
+    print(f"{'='*60}")
+
+    with open("scripts/tune_results_gemma4_ncpumoe.json", "w") as f:
+        json.dump(results, f, indent=2, default=str)
+    print("  Saved: scripts/tune_results_gemma4_ncpumoe.json")
+
+
+# Run the sweep only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()