"""
Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
=====================================================================
Tests 4 models across multiple parameter configurations to find
the absolute best model + settings for 256K context coding agent.

Models:
  1. Qwen3.5-35B-A3B  Q4_K_M     (~20.5 GB)
  2. Qwen3.5-35B-A3B  MXFP4_MOE  (~20.1 GB)
  3. Gemma4 26B-A4B   Q4_K_M     (~15.6 GB)
  4. Gemma4 26B-A4B   MXFP4_MOE  (~15.5 GB)

Test Phases (per model):
  Phase 0: Basic dual-GPU startup test (can it even boot at 256K?)
  Phase 1: GPU layer + MoE offload strategy sweep
  Phase 2: CPU thread sweep (carry best from P1)
  Phase 3: Batch size sweep (carry best from P1+P2)
  Phase 4: KV cache type sweep (carry best from P1+P2+P3)
  Phase 5: Final verification (5 runs)

Output:
  scripts/dual_gpu_results.json  (all raw data)
  scripts/dual_gpu_summary.txt   (human-readable winner)
"""

import subprocess
import time
import json
import urllib.request
import sys
import os
import datetime

# Best effort only: some stdout replacements (redirects, test harnesses) do not
# support reconfigure(), and the benchmark must still run in that case.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    pass

# ─── Configuration ───────────────────────────────────────────────
BASE_URL = "http://127.0.0.1:8000"
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
CONTEXT = 262144  # 256K
BENCHMARK_RUNS = 3
BENCHMARK_TOKENS = 200
SERVER_TIMEOUT = 300  # seconds to wait for server startup
FINAL_RUNS = 5  # timed runs used by the Phase 5 verification

RESULTS_PATH = "scripts/dual_gpu_results.json"
SUMMARY_PATH = "scripts/dual_gpu_summary.txt"

MODELS = [
    {
        "name": "Qwen3.5-35B-A3B Q4_K_M",
        "path": r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf",
        "type": "qwen",
        "quant": "Q4_K_M",
        "is_mxfp4": False,
        "total_layers": 64,  # Qwen3.5 35B has 64 layers
    },
    {
        "name": "Qwen3.5-35B-A3B MXFP4_MOE",
        "path": r"models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf",
        "type": "qwen",
        "quant": "MXFP4_MOE",
        "is_mxfp4": True,
        "total_layers": 64,
    },
    {
        "name": "Gemma4 26B-A4B Q4_K_M",
        "path": r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf",
        "type": "gemma4",
        "quant": "Q4_K_M",
        "is_mxfp4": False,
        "total_layers": 30,  # Gemma4 26B has 30 layers
    },
    {
        "name": "Gemma4 26B-A4B MXFP4_MOE",
        "path": r"models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf",
        "type": "gemma4",
        "quant": "MXFP4_MOE",
        "is_mxfp4": True,
        "total_layers": 30,
    },
]

# Every successfully benchmarked configuration, in the order it was tested.
ALL_RESULTS = []


# ─── Utility Functions ──────────────────────────────────────────
def log(msg):
    """Print *msg* prefixed with the current wall-clock time, flushing immediately."""
    ts = datetime.datetime.now().strftime("%H:%M:%S")
    print(f"[{ts}] {msg}", flush=True)


def kill_server():
    """Force-kill every llama-server.exe via taskkill, then pause 5 seconds.

    The taskkill exit status is ignored, so this is safe to call when no
    server is running.
    """
    subprocess.run(
        ["taskkill", "/F", "/IM", "llama-server.exe"],
        capture_output=True,
        check=False,
    )
    time.sleep(5)


def get_vram_all():
    """Return one ``{"gpu", "used", "total"}`` dict per GPU reported by nvidia-smi.

    Values are the integers nvidia-smi prints for index, memory.used and
    memory.total (queried with ``nounits``). Returns an empty list when
    nvidia-smi cannot be run, times out, or prints something unparsable.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5,
        )
        gpus = []
        for line in r.stdout.strip().split("\n"):
            parts = [p.strip() for p in line.split(",")]
            if len(parts) >= 3:
                gpus.append({
                    "gpu": int(parts[0]),
                    "used": int(parts[1]),
                    "total": int(parts[2]),
                })
        return gpus
    except (OSError, subprocess.SubprocessError, ValueError):
        return []


def build_cmd(model_path, ngl, t, ub, b, ctk, ctv,
              cpu_moe=False, n_cpu_moe=0, prio=3, nommap=False):
    """Build the llama-server argument list for one configuration.

    ``cpu_moe`` takes precedence over ``n_cpu_moe``: ``--n-cpu-moe`` is only
    emitted when ``cpu_moe`` is false and ``n_cpu_moe`` is positive.
    ``t`` is used for both ``-t`` and ``-tb``.
    """
    cmd = [
        LLAMA_SERVER,
        "--model", model_path,
        "-ngl", str(ngl),
        "-c", str(CONTEXT),
        "-np", "1",
        "-fa", "on",
        "--cache-type-k", ctk,
        "--cache-type-v", ctv,
        "-ub", str(ub),
        "-b", str(b),
        "-t", str(t),
        "-tb", str(t),
        "--prio", str(prio),
        "--poll", "50",
        "--mlock",
        "--port", "8000",
        "--host", "0.0.0.0",
    ]
    # MoE offloading options
    if cpu_moe:
        cmd.append("--cpu-moe")
    elif n_cpu_moe > 0:
        cmd.extend(["--n-cpu-moe", str(n_cpu_moe)])
    if nommap:
        cmd.append("--no-mmap")
    return cmd


def start_server(model_path, **kwargs):
    """Launch llama-server for *model_path* and return the ``Popen`` handle.

    ``kwargs`` are forwarded to :func:`build_cmd`.
    """
    cmd = build_cmd(model_path, **kwargs)
    log(f" CMD: {' '.join(cmd[-20:])}")  # show last 20 args
    # Nothing in this script reads the server's output. An unread PIPE can fill
    # its OS buffer and block the child, so the output is discarded instead.
    return subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
        cwd=os.getcwd(),
    )


def wait_for_server(timeout=SERVER_TIMEOUT, proc=None):
    """Poll ``/health`` every 3 seconds until the server reports ``status == "ok"``.

    Args:
        timeout: Maximum number of seconds to keep polling.
        proc: Optional ``Popen`` handle of the server. When given, polling
            stops as soon as the process has exited instead of waiting out
            the full timeout.

    Returns:
        ``(True, seconds_until_ready)`` on success, ``(False, timeout)`` on
        timeout, or ``(False, seconds_elapsed)`` when *proc* exited early.
    """
    start = time.time()
    while time.time() - start < timeout:
        if proc is not None and proc.poll() is not None:
            return False, time.time() - start
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=3) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True, time.time() - start
        except Exception:
            # Any failure here (refused connection, HTTP error while loading,
            # malformed body) just means "not ready yet".
            pass
        time.sleep(3)
    return False, timeout


def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Send one chat completion request and measure its throughput.

    Returns a dict with ``tps`` (completion tokens divided by the wall-clock
    time of the whole request), ``completion_tokens``, ``prompt_tokens`` and
    ``elapsed``. Token counts come from the response's ``usage`` object and
    default to 0 when absent. Network and HTTP errors propagate to the caller.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode("utf-8")
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    start = time.time()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.time() - start
    usage = result.get("usage", {})
    ct = usage.get("completion_tokens", 0)
    pt = usage.get("prompt_tokens", 0)
    return {
        "tps": ct / elapsed if elapsed > 0 else 0,
        "completion_tokens": ct,
        "prompt_tokens": pt,
        "elapsed": elapsed,
    }


def _warmup_and_measure(runs, run_prefix="Run"):
    """Do one 20-token warmup request, then return the t/s of each successful timed run."""
    try:
        run_benchmark(max_tokens=20)
    except Exception as e:
        # The warmup result is never used, so a failure is only worth a note.
        log(f" Warmup failed ({e}); continuing with timed runs")
    speeds = []
    for i in range(runs):
        try:
            r = run_benchmark()
        except Exception as e:
            log(f" {run_prefix} {i+1}: ERROR ({e})")
        else:
            speeds.append(r["tps"])
            log(f" {run_prefix} {i+1}: {r['tps']:.2f} t/s")
    return speeds


def _carried_params(params):
    """Return the server parameters that later sweep phases carry forward from a winner."""
    return {
        "ngl": params["ngl"],
        "t": params["t"],
        "ub": params["ub"],
        "b": params["b"],
        "ctk": params["ctk"],
        "ctv": params["ctv"],
        "cpu_moe": params.get("cpu_moe", False),
        "n_cpu_moe": params.get("n_cpu_moe", 0),
    }


def test_config(model_info, label, **kwargs):
    """Benchmark a single server configuration.

    Kills any running server, starts a new one with ``kwargs`` (forwarded to
    :func:`build_cmd`), waits for it to become healthy, then runs one warmup
    and ``BENCHMARK_RUNS`` timed requests. The server process is always
    killed before returning.

    Returns:
        The result dict (also appended to ``ALL_RESULTS``), or ``None`` when
        the server failed to start or every timed run failed.
    """
    kill_server()
    log(f" [{label}] Starting server...")
    proc = start_server(model_info["path"], **kwargs)
    try:
        ok, boot_time = wait_for_server(proc=proc)
        if not ok:
            if proc.poll() is not None:
                log(f" [{label}] FAILED to start (server exited with code {proc.returncode})")
            else:
                log(f" [{label}] FAILED to start (timeout {SERVER_TIMEOUT}s)")
            return None

        # VRAM is sampled right after boot, before any request is served.
        vram = get_vram_all()
        vram_str = " | ".join(f"GPU{g['gpu']}:{g['used']}/{g['total']}MiB" for g in vram)
        log(f" [{label}] Boot: {boot_time:.0f}s | VRAM: {vram_str}")

        speeds = _warmup_and_measure(BENCHMARK_RUNS)
    finally:
        # Runs on every exit path, including Ctrl+C, so no server is left behind.
        proc.kill()

    if not speeds:
        log(f" [{label}] ALL BENCHMARK RUNS FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    log(f" [{label}] => AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
    result = {
        "model": model_info["name"],
        "quant": model_info["quant"],
        "label": label,
        "avg_tps": round(avg, 2),
        "best_tps": round(best, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": kwargs,
    }
    ALL_RESULTS.append(result)
    return result


# The name starts with "test_" but this is a benchmark helper; keep pytest from
# collecting it when the module is imported by a test suite.
test_config.__test__ = False


# ─── Phase Runners ───────────────────────────────────────────────
def phase0_boot_test(model):
    """Quick test: can the model even boot with 256K on dual GPU?

    Tries full GPU offload, then ``--cpu-moe``, then half of the model's
    layers on GPU. Returns the first result that works, or ``None``.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 0: Boot Test — {model['name']}")
    log(f"{'='*70}")

    common = {"t": 6, "ub": 512, "b": 2048, "ctk": "q4_0", "ctv": "q4_0"}

    # Try -ngl 999 (all layers to GPU) as baseline
    r = test_config(model, "boot-ngl999", ngl=999, **common)
    if r:
        return r

    # If full GPU fails, try with cpu-moe
    log(" Full GPU failed, trying with --cpu-moe...")
    r = test_config(model, "boot-cpumoe", ngl=999, **common, cpu_moe=True)
    if r:
        return r

    # Extreme fallback: fewer layers
    log(" --cpu-moe also failed, trying reduced layers...")
    return test_config(model, "boot-ngl-half", ngl=model["total_layers"] // 2, **common)


def phase1_gpu_offload(model, baseline):
    """Find the best MoE offload strategy with all layers requested on GPU.

    ``baseline`` (the Phase 0 result) competes with the new measurements.
    Returns the result with the highest ``avg_tps``, or ``None`` when nothing
    worked.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 1: GPU Offload Strategy — {model['name']}")
    log(f"{'='*70}")

    results = []
    if baseline:
        results.append(baseline)
    total = model["total_layers"]
    common = {"ngl": 999, "t": 6, "ub": 512, "b": 2048, "ctk": "q4_0", "ctv": "q4_0"}

    # Strategy A: All GPU + cpu-moe variations
    for cpu_moe in [True, False]:
        label = f"ngl=999 cpu_moe={cpu_moe}"
        # Skip if already tested in baseline
        if baseline and baseline["label"] in ["boot-ngl999", "boot-cpumoe"] and \
                baseline["params"].get("cpu_moe", False) == cpu_moe:
            continue
        r = test_config(model, label, **common, cpu_moe=cpu_moe)
        if r:
            results.append(r)

    # Strategy B: n-cpu-moe sweep (selective expert offload).
    # n=0 emits no --n-cpu-moe flag at all (see build_cmd).
    for n in [0, 5, 10, 15, 20]:
        if n > total:
            continue
        r = test_config(model, f"n-cpu-moe={n}", **common, n_cpu_moe=n)
        if r:
            results.append(r)

    if not results:
        log(" PHASE 1: No configuration worked!")
        return None

    best = max(results, key=lambda x: x["avg_tps"])
    log(f"\n ★ Phase 1 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best


def phase2_threads(model, prev_best):
    """Sweep CPU thread counts with the best offload config locked.

    Returns the highest-``avg_tps`` result among ``prev_best`` and the sweep.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 2: CPU Thread Sweep — {model['name']}")
    log(f"{'='*70}")

    p = prev_best["params"]
    base = _carried_params(p)
    results = [prev_best]
    for t in [2, 4, 6, 8, 10, 12]:
        if t == p.get("t", 6):
            continue  # already measured as prev_best
        r = test_config(model, f"t={t}", **{**base, "t": t})
        if r:
            results.append(r)

    best = max(results, key=lambda x: x["avg_tps"])
    log(f"\n ★ Phase 2 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best


def phase3_batch(model, prev_best):
    """Sweep (ubatch, batch) size pairs with everything else carried forward.

    Returns the highest-``avg_tps`` result among ``prev_best`` and the sweep.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 3: Batch Size Sweep — {model['name']}")
    log(f"{'='*70}")

    p = prev_best["params"]
    base = _carried_params(p)
    results = [prev_best]
    for ub, b in [(128, 512), (256, 1024), (256, 2048), (512, 1024),
                  (512, 2048), (512, 4096), (1024, 2048), (1024, 4096)]:
        if ub == p["ub"] and b == p["b"]:
            continue  # already measured as prev_best
        r = test_config(model, f"ub={ub} b={b}", **{**base, "ub": ub, "b": b})
        if r:
            results.append(r)

    best = max(results, key=lambda x: x["avg_tps"])
    log(f"\n ★ Phase 3 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best


def phase4_kvcache(model, prev_best):
    """Sweep KV cache types with everything else carried forward.

    Returns the highest-``avg_tps`` result among ``prev_best`` and the sweep.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 4: KV Cache Type Sweep — {model['name']}")
    log(f"{'='*70}")

    p = prev_best["params"]
    base = _carried_params(p)
    results = [prev_best]
    for ctk, ctv in [("q4_0", "q4_0"), ("q8_0", "q8_0"), ("q4_0", "q8_0"), ("f16", "f16")]:
        if ctk == p["ctk"] and ctv == p["ctv"]:
            continue  # already measured as prev_best
        r = test_config(model, f"kv={ctk}/{ctv}", **{**base, "ctk": ctk, "ctv": ctv})
        if r:
            results.append(r)

    best = max(results, key=lambda x: x["avg_tps"])
    log(f"\n ★ Phase 4 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best


def phase5_final(model, prev_best):
    """Re-run the winning configuration with 5 timed runs.

    Returns the new ``FINAL-…`` result (also appended to ``ALL_RESULTS``), or
    ``prev_best`` unchanged when the server fails to start or every run fails.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 5: Final Verification (5 runs) — {model['name']}")
    log(f"{'='*70}")

    p = prev_best["params"]
    kill_server()
    proc = start_server(model["path"], **p)
    try:
        ok, boot_time = wait_for_server(proc=proc)
        if not ok:
            log(" FAILED to start for final verification!")
            return prev_best
        vram = get_vram_all()
        speeds = _warmup_and_measure(FINAL_RUNS, run_prefix="Final Run")
    finally:
        # Runs on every exit path, including Ctrl+C, so no server is left behind.
        proc.kill()

    if not speeds:
        return prev_best

    avg = sum(speeds) / len(speeds)
    best_tps = max(speeds)
    log(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best_tps:.2f} t/s")
    final = {
        "model": model["name"],
        "quant": model["quant"],
        "label": f"FINAL-{model['name']}",
        "avg_tps": round(avg, 2),
        "best_tps": round(best_tps, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": p,
    }
    ALL_RESULTS.append(final)
    return final


# ─── Main ────────────────────────────────────────────────────────
def run_full_benchmark_for_model(model):
    """Run all phases for a single model.

    Returns the final winning result dict, the Phase 0 baseline when Phase 1
    finds nothing, or ``None`` when the model file is missing or cannot boot.
    """
    # Check existence BEFORE touching the file: os.path.getsize() raises on a
    # missing path, which would abort the whole run instead of skipping.
    exists = os.path.exists(model["path"])

    log(f"\n{'#'*70}")
    log(f" MODEL: {model['name']}")
    log(f" File: {model['path']}")
    if exists:
        log(f" Size: {os.path.getsize(model['path'])/1024**3:.2f} GB")
    log(f"{'#'*70}")

    if not exists:
        log(" SKIP: Model file not found!")
        return None

    # Phase 0: Can it boot?
    baseline = phase0_boot_test(model)
    if not baseline:
        log(f" SKIP: {model['name']} cannot boot at 256K context!")
        return None

    # Phase 1: GPU offload strategy
    best = phase1_gpu_offload(model, baseline)
    if not best:
        return baseline

    # Phase 2: CPU threads
    best = phase2_threads(model, best)
    # Phase 3: Batch sizes
    best = phase3_batch(model, best)
    # Phase 4: KV cache
    best = phase4_kvcache(model, best)
    # Phase 5: Final verification
    return phase5_final(model, best)


def _save_results():
    """Write ``ALL_RESULTS`` to ``RESULTS_PATH``, creating the output directory if needed."""
    os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)
    with open(RESULTS_PATH, "w", encoding="utf-8") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)


def _format_summary(model_winners, elapsed):
    """Build the human-readable ranking text for winners already sorted best-first."""
    lines = [
        f"Dual-GPU Benchmark Results — {datetime.datetime.now().isoformat()}",
        "Hardware: 2x RTX 3060 12GB | Context: 256K",
        f"Total configs tested: {len(ALL_RESULTS)}",
        f"Total time: {elapsed:.1f} minutes",
        "",
        "=" * 60,
        " RANKING (by AVG t/s)",
        "=" * 60,
    ]

    for rank, w in enumerate(model_winners, 1):
        medal = {1: "🥇", 2: "🥈", 3: "🥉", 4: " "}.get(rank, " ")
        lines.append(f"\n {medal} #{rank}: {w['model']}")
        lines.append(f" AVG: {w['avg_tps']:.2f} t/s | BEST: {w['best_tps']:.2f} t/s")
        lines.append(f" Boot: {w['boot_time']:.0f}s")
        p = w["params"]
        lines.append(f" ngl={p['ngl']} t={p['t']} ub={p['ub']} b={p['b']}")
        lines.append(f" ctk={p['ctk']} ctv={p['ctv']}")
        if p.get("cpu_moe"):
            lines.append(" --cpu-moe")
        elif p.get("n_cpu_moe", 0) > 0:
            lines.append(f" --n-cpu-moe {p['n_cpu_moe']}")

    champion = model_winners[0]
    lines.append(f"\n{'='*60}")
    lines.append(f" ★ CHAMPION: {champion['model']}")
    lines.append(f" {champion['avg_tps']:.2f} t/s average")
    lines.append(f"{'='*60}")

    # Build recommended command
    p = champion["params"]
    champion_path = next(m["path"] for m in MODELS if m["name"] == champion["model"])
    cmd_parts = [
        f"llama-server --model {champion_path}",
        f"-ngl {p['ngl']} -c {CONTEXT}",
        f"-t {p['t']} -tb {p['t']}",
        f"-ub {p['ub']} -b {p['b']}",
        "-fa on",
        f"--cache-type-k {p['ctk']} --cache-type-v {p['ctv']}",
        f"--prio {p.get('prio', 3)} --poll 50",
        "--mlock",
    ]
    if p.get("cpu_moe"):
        cmd_parts.append("--cpu-moe")
    elif p.get("n_cpu_moe", 0) > 0:
        cmd_parts.append(f"--n-cpu-moe {p['n_cpu_moe']}")
    if p.get("nommap"):
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")

    lines.append("\n Recommended command:")
    lines.append(f" {' '.join(cmd_parts)}")
    return "\n".join(lines)


def main():
    """Benchmark every model in ``MODELS``, then print and save the ranking.

    Raw results are saved after each model so a crash late in the run does
    not lose earlier measurements.
    """
    start_time = time.time()
    log("=" * 70)
    log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK")
    log(" 2x RTX 3060 (24GB Total) | 256K Context")
    log(f" Models: {len(MODELS)}")
    log(f" Started: {datetime.datetime.now().isoformat()}")
    log("=" * 70)

    # Show GPU info
    for g in get_vram_all():
        log(f" GPU {g['gpu']}: {g['used']}/{g['total']} MiB used")

    # Run benchmarks for each model
    model_winners = []
    for i, model in enumerate(MODELS):
        log(f"\n{'='*70}")
        log(f" STARTING MODEL {i+1}/{len(MODELS)}: {model['name']}")
        log(f"{'='*70}")

        winner = run_full_benchmark_for_model(model)
        if winner:
            model_winners.append(winner)

        # Save intermediate results
        _save_results()
        log(f" Intermediate results saved ({len(ALL_RESULTS)} configs tested)")

    # ─── Grand Final Comparison ──────────────────────────────────
    elapsed = (time.time() - start_time) / 60
    log(f"\n{'='*70}")
    log(" GRAND FINAL COMPARISON")
    log(f" Total time: {elapsed:.1f} minutes")
    log(f" Configs tested: {len(ALL_RESULTS)}")
    log(f"{'='*70}")

    if not model_winners:
        log(" No models were able to run at 256K context!")
        return

    # Sort by avg t/s
    model_winners.sort(key=lambda x: x["avg_tps"], reverse=True)

    summary = _format_summary(model_winners, elapsed)
    print(summary)

    os.makedirs(os.path.dirname(SUMMARY_PATH), exist_ok=True)
    with open(SUMMARY_PATH, "w", encoding="utf-8") as f:
        f.write(summary)
    _save_results()

    log(f"\n Results: {RESULTS_PATH}")
    log(f" Summary: {SUMMARY_PATH}")
    log(" DONE!")
    kill_server()


if __name__ == "__main__":
    main()