""" Gemma4 26B --n-cpu-moe sweep + secondary param tuning at 256K context. Gemma4 has 30 layers. Sweep n-cpu-moe from 0 to 30. """ import subprocess, time, json, urllib.request, sys, os try: sys.stdout.reconfigure(encoding='utf-8') except: pass BASE_URL = "http://127.0.0.1:8000" SERVER = r"llama_bin_run\llama-server.exe" MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf" CTX = 262144 RUNS = 3 def kill(): subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True) time.sleep(4) def start(ncpumoe, t=4, ub=512, b=2048, ctk="q4_0", ctv="q4_0", prio=3, nommap=False): cmd = [SERVER, "--model", MODEL, "-ngl", "999", "-c", str(CTX), "-np", "1", "-fa", "on", "--cache-type-k", ctk, "--cache-type-v", ctv, "-ub", str(ub), "-b", str(b), "-t", str(t), "-tb", str(t), "--prio", str(prio), "--poll", "50", "--mlock", "--port", "8000", "--host", "0.0.0.0"] if ncpumoe > 0: cmd.extend(["--n-cpu-moe", str(ncpumoe)]) if nommap: cmd.append("--no-mmap") return subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=os.getcwd(), text=True, encoding='utf-8', errors='replace') def wait_ready(timeout=240): t0 = time.time() while time.time() - t0 < timeout: try: with urllib.request.urlopen(urllib.request.Request(f"{BASE_URL}/health"), timeout=3) as r: if json.loads(r.read()).get("status") == "ok": return True except: pass time.sleep(2) return False def bench(n=200): p = json.dumps({"model": "m", "messages": [{"role": "user", "content": "Count from 1 to 50, each number on new line."}], "max_tokens": n, "temperature": 0.0}).encode() r = urllib.request.Request(f"{BASE_URL}/v1/chat/completions", data=p, headers={"Content-Type": "application/json"}) t0 = time.time() with urllib.request.urlopen(r, timeout=300) as resp: res = json.loads(resp.read()) dt = time.time() - t0 ct = res.get("usage", {}).get("completion_tokens", 0) return ct / dt if dt > 0 else 0 def vram(): try: r = subprocess.run(["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5) a, b = r.stdout.strip().split(",") return int(a.strip()), int(b.strip()) except: return 0, 0 def test(label, ncpumoe, **kw): kill() print(f" [{label}] Starting...", end=" ", flush=True) p = start(ncpumoe, **kw) if not wait_ready(): print("FAILED"); p.kill(); return None vu, vt = vram() print(f"VRAM:{vu}/{vt} | ", end="", flush=True) try: bench(20) except: pass speeds = [] for _ in range(RUNS): try: speeds.append(bench()) except: pass p.kill() if not speeds: print("BENCH FAILED"); return None avg, best = sum(speeds)/len(speeds), max(speeds) print(f"AVG:{avg:.1f} BEST:{best:.1f} t/s") return {"label": label, "ncpumoe": ncpumoe, "avg": avg, "best": best, "vram": vu, **kw} def main(): print("=" * 60) print(" Gemma4 26B 256K | --n-cpu-moe Sweep + Param Tune") print("=" * 60) results = [] # Phase 1: n-cpu-moe sweep (0, 5, 10, 15, 20, 25, 30) print("\n--- Phase 1: --n-cpu-moe sweep ---") for n in [0, 5, 10, 15, 20, 25, 30]: nm = n > 15 # use --no-mmap when heavy CPU offload r = test(f"ncpumoe={n}", n, nommap=nm) if r: results.append(r) # Find best n-cpu-moe best_r = max(results, key=lambda x: x["avg"]) best_n = best_r["ncpumoe"] print(f"\n ★ Best n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s") # Fine-tune around best if best_n > 0: print(f"\n--- Phase 1b: Fine-tune around ncpumoe={best_n} ---") for n in [max(0, best_n-3), max(0, best_n-1), best_n+1, min(30, best_n+3)]: if n == best_n: continue nm = n > 15 r = test(f"ncpumoe={n}", n, nommap=nm) if r: results.append(r) best_r = max(results, key=lambda x: x["avg"]) best_n = best_r["ncpumoe"] print(f"\n ★ Refined n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s") # Phase 2: Thread sweep at best n-cpu-moe nm = best_n > 15 print(f"\n--- Phase 2: Thread sweep (ncpumoe={best_n}) ---") for t in [2, 4, 6, 8, 10]: r = test(f"t={t}", best_n, t=t, nommap=nm) if r: results.append(r) best_t = max([x for x in results if x["ncpumoe"]==best_n], key=lambda x: x["avg"]) bt = best_t.get("t", 4) print(f"\n ★ Best threads: {bt}") # Phase 3: Batch sweep print(f"\n--- Phase 3: Batch sweep ---") for ub, b in [(256, 1024), (512, 2048), (512, 4096), (1024, 2048)]: r = test(f"ub={ub},b={b}", best_n, t=bt, ub=ub, b=b, nommap=nm) if r: results.append(r) # Phase 4: KV cache type print(f"\n--- Phase 4: KV cache type ---") for ctk, ctv in [("q4_0","q4_0"), ("q8_0","q8_0")]: r = test(f"kv={ctk}", best_n, t=bt, ctk=ctk, ctv=ctv, nommap=nm) if r: results.append(r) # Final report best_all = max(results, key=lambda x: x["avg"]) print(f"\n{'='*60}") print(f" FINAL BEST: {best_all['label']} → {best_all['avg']:.1f} t/s (VRAM: {best_all['vram']})") print(f"{'='*60}") with open("scripts/tune_results_gemma4_ncpumoe.json", "w") as f: json.dump(results, f, indent=2, default=str) print(" Saved: scripts/tune_results_gemma4_ncpumoe.json") if __name__ == "__main__": main()