""" Gemma4 26B-A4B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB Phase 1: -ngl sweep (GPU layers) Phase 2: -t / -tb sweep (CPU threads) Phase 3: -ub / -b sweep (batch sizes) Phase 4: --cache-type-k/v sweep (KV cache precision) Phase 5: --no-mmap, --poll, --prio sweep (misc) Each phase fixes the best from previous phases. """ import subprocess import time import json import urllib.request import sys import os import itertools try: sys.stdout.reconfigure(encoding='utf-8') except AttributeError: pass BASE_URL = "http://127.0.0.1:8000" LLAMA_SERVER = r"llama_bin_run\llama-server.exe" MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf" CONTEXT = 262144 BENCHMARK_RUNS = 3 BENCHMARK_TOKENS = 200 # ─── Baseline (from previous tuning at -c 4096) ─── BEST = { "ngl": 22, "t": 8, "tb": 8, "ub": 512, "b": 2048, "ctk": "q4_0", "ctv": "q4_0", "fa": "on", "mlock": True, "mmap": True, "prio": 2, "poll": 50, } ALL_RESULTS = [] def kill_server(): subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True) time.sleep(4) def build_cmd(cfg): cmd = [LLAMA_SERVER, "--model", MODEL, "-ngl", str(cfg["ngl"]), "-c", str(CONTEXT), "-np", "1", "-fa", cfg["fa"], "--cache-type-k", cfg["ctk"], "--cache-type-v", cfg["ctv"], "-ub", str(cfg["ub"]), "-b", str(cfg["b"]), "-t", str(cfg["t"]), "-tb", str(cfg["tb"]), "--prio", str(cfg["prio"]), "--poll", str(cfg["poll"]), "--port", "8000", "--host", "0.0.0.0"] if cfg["mlock"]: cmd.append("--mlock") if not cfg["mmap"]: cmd.append("--no-mmap") return cmd def start_server(cfg): cmd = build_cmd(cfg) proc = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=os.getcwd(), text=True, encoding='utf-8', errors='replace' ) return proc def wait_for_server(timeout=180): start = time.time() while time.time() - start < timeout: try: req = urllib.request.Request(f"{BASE_URL}/health") with urllib.request.urlopen(req, timeout=3) as resp: data = json.loads(resp.read()) if data.get("status") == "ok": return True except: pass time.sleep(2) return False def run_benchmark(max_tokens=BENCHMARK_TOKENS): payload = json.dumps({ "model": "local-model", "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}], "max_tokens": max_tokens, "temperature": 0.0 }).encode("utf-8") req = urllib.request.Request( f"{BASE_URL}/v1/chat/completions", data=payload, headers={"Content-Type": "application/json"} ) start = time.time() with urllib.request.urlopen(req, timeout=300) as resp: result = json.loads(resp.read()) elapsed = time.time() - start usage = result.get("usage", {}) ct = usage.get("completion_tokens", 0) return ct / elapsed if elapsed > 0 else 0 def get_vram(): try: r = subprocess.run( ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5 ) parts = r.stdout.strip().split(",") return int(parts[0].strip()), int(parts[1].strip()) except: return 0, 0 def test_config(cfg, label=""): kill_server() desc = label or str(cfg) print(f" [{desc}] Starting server...") proc = start_server(cfg) if not wait_for_server(): print(f" [{desc}] FAILED to start") proc.kill() return None vram_used, vram_total = get_vram() print(f" [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True) # Warmup try: run_benchmark(max_tokens=20) except: pass # Benchmark speeds = [] for i in range(BENCHMARK_RUNS): try: tps = run_benchmark() speeds.append(tps) except Exception as e: print(f"ERR({e}) ", end="", flush=True) proc.kill() if not speeds: print("ALL FAILED") return None avg = sum(speeds) / len(speeds) best = max(speeds) print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s") result = {**cfg, "avg_tps": avg, "best_tps": best, "vram_used": vram_used, "vram_total": vram_total, "label": label} ALL_RESULTS.append(result) return result def phase_sweep(phase_name, param_name, values, base_cfg): print(f"\n{'='*70}") print(f" PHASE: {phase_name}") print(f" Sweeping: {param_name} = {values}") print(f"{'='*70}") best_result = None for val in values: cfg = {**base_cfg} if isinstance(param_name, list): for p, v in zip(param_name, val): cfg[p] = v label = " | ".join(f"{p}={v}" for p, v in zip(param_name, val)) else: cfg[param_name] = val label = f"{param_name}={val}" r = test_config(cfg, label) if r and (best_result is None or r["avg_tps"] > best_result["avg_tps"]): best_result = r if best_result: print(f"\n ★ Phase winner: {best_result['label']} → {best_result['avg_tps']:.2f} t/s") return best_result def main(): print("=" * 70) print(" Gemma4 26B-A4B COMPREHENSIVE Auto-Tuner") print(" 256K Context | RTX 3060 12GB") print("=" * 70) print() cfg = dict(BEST) # ─── Phase 1: -ngl (already done, quick verify top 3) ─── r = phase_sweep("GPU Layers (-ngl)", "ngl", [22, 21, 20], cfg) if r: cfg["ngl"] = r["ngl"] # ─── Phase 2: CPU threads (-t, -tb) ─── thread_combos = [ (2, 2), (4, 4), (4, 8), (6, 6), (6, 8), (8, 8), (8, 12), (10, 10), (12, 12), (16, 16) ] r = phase_sweep("CPU Threads (-t, -tb)", ["t", "tb"], thread_combos, cfg) if r: cfg["t"] = r["t"] cfg["tb"] = r["tb"] # ─── Phase 3: Batch sizes (-ub, -b) ─── batch_combos = [ (128, 512), (256, 1024), (256, 2048), (512, 1024), (512, 2048), (512, 4096), (1024, 2048), (1024, 4096) ] r = phase_sweep("Batch Sizes (-ub, -b)", ["ub", "b"], batch_combos, cfg) if r: cfg["ub"] = r["ub"] cfg["b"] = r["b"] # ─── Phase 4: KV cache precision ─── kv_combos = [ ("q4_0", "q4_0"), ("q8_0", "q8_0"), ("q4_0", "q8_0"), ("f16", "f16"), ] r = phase_sweep("KV Cache Type (-ctk, -ctv)", ["ctk", "ctv"], kv_combos, cfg) if r: cfg["ctk"] = r["ctk"] cfg["ctv"] = r["ctv"] # ─── Phase 5: Misc (mmap, poll, prio) ─── misc_combos = [ (True, 50, 2), # baseline (False, 50, 2), # no-mmap (True, 0, 2), # no polling (True, 100, 2), # max polling (True, 50, 3), # realtime priority (False, 0, 3), # no-mmap + no-poll + realtime ] r = phase_sweep("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], misc_combos, cfg) if r: cfg["mmap"] = r["mmap"] cfg["poll"] = r["poll"] cfg["prio"] = r["prio"] # ─── Final Report ─── print() print("=" * 70) print(" FINAL OPTIMAL CONFIGURATION") print("=" * 70) print(f" ngl: {cfg['ngl']}") print(f" threads: -t {cfg['t']} -tb {cfg['tb']}") print(f" batch: -ub {cfg['ub']} -b {cfg['b']}") print(f" kv cache: -ctk {cfg['ctk']} -ctv {cfg['ctv']}") print(f" flash: -fa {cfg['fa']}") print(f" mlock: {'yes' if cfg['mlock'] else 'no'}") print(f" mmap: {'yes' if cfg['mmap'] else 'no (--no-mmap)'}") print(f" prio: {cfg['prio']}") print(f" poll: {cfg['poll']}") print() # Final verification run print(" Running final verification (5 runs)...") kill_server() proc = start_server(cfg) wait_for_server() try: run_benchmark(max_tokens=20) except: pass final_speeds = [] for i in range(5): try: tps = run_benchmark() final_speeds.append(tps) print(f" Run {i+1}: {tps:.2f} t/s") except: pass proc.kill() if final_speeds: avg = sum(final_speeds) / len(final_speeds) best = max(final_speeds) print(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s") print() cmd_parts = [ f"llama-server --model {MODEL}", f"-ngl {cfg['ngl']} -c {CONTEXT}", f"-t {cfg['t']} -tb {cfg['tb']}", f"-ub {cfg['ub']} -b {cfg['b']}", f"-fa {cfg['fa']}", f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}", f"--prio {cfg['prio']} --poll {cfg['poll']}", ] if cfg["mlock"]: cmd_parts.append("--mlock") if not cfg["mmap"]: cmd_parts.append("--no-mmap") cmd_parts.append("--port 8000 --host 0.0.0.0") print(" Recommended command:") print(f" {' '.join(cmd_parts)}") print("=" * 70) # Dump all results to JSON with open("scripts/tune_results_gemma4_256k.json", "w") as f: json.dump(ALL_RESULTS, f, indent=2, default=str) print(f"\n Full results saved: scripts/tune_results_gemma4_256k.json") if __name__ == "__main__": main()