# Source file: variet_llm/scripts/dual_gpu_benchmark.py (Python, 645 lines, 19 KiB)

"""
Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
=====================================================================
Tests 4 models across multiple parameter configurations to find
the absolute best model + settings for 256K context coding agent.
Models:
1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB)
2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB)
3. Gemma4 26B-A4B Q4_K_M (~15.6 GB)
4. Gemma4 26B-A4B MXFP4_MOE (~15.5 GB)
Test Phases (per model):
Phase 0: Basic dual-GPU startup test (can it even boot at 256K?)
Phase 1: GPU layer + MoE offload strategy sweep
Phase 2: CPU thread sweep (carry best from P1)
Phase 3: Batch size sweep (carry best from P1+P2)
Phase 4: KV cache type sweep (carry best from P1+P2+P3)
Phase 5: Final verification (5 runs)
Output: scripts/dual_gpu_results.json (all raw data)
scripts/dual_gpu_summary.txt (human-readable winner)
"""
import subprocess
import time
import json
import urllib.request
import sys
import os
import datetime
# Switch stdout to UTF-8 because this script prints non-ASCII characters
# (box-drawing rules, the star marker, medal emoji). This is best effort:
# if stdout cannot be reconfigured, the failure is ignored and stdout is
# left as it was.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    pass
# ─── Configuration ───────────────────────────────────────────────
BASE_URL = "http://127.0.0.1:8000"  # address the benchmark client talks to
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"  # relative to the working dir
CONTEXT = 262144  # 256K
BENCHMARK_RUNS = 3  # timed generations per tested configuration
BENCHMARK_TOKENS = 200  # max_tokens requested for each timed generation
SERVER_TIMEOUT = 300  # seconds to wait for server startup


def _model_entry(name, path, family, quant, total_layers):
    """Build one MODELS record; ``is_mxfp4`` follows from the quant name."""
    return {
        "name": name,
        "path": path,
        "type": family,
        "quant": quant,
        "is_mxfp4": quant == "MXFP4_MOE",
        "total_layers": total_layers,
    }


# Models under test. Layer counts are the values recorded by the author:
# 64 for Qwen3.5 35B and 30 for Gemma4 26B.
MODELS = [
    _model_entry(
        "Qwen3.5-35B-A3B Q4_K_M",
        r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf",
        "qwen", "Q4_K_M", 64,
    ),
    _model_entry(
        "Qwen3.5-35B-A3B MXFP4_MOE",
        r"models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf",
        "qwen", "MXFP4_MOE", 64,
    ),
    _model_entry(
        "Gemma4 26B-A4B Q4_K_M",
        r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf",
        "gemma4", "Q4_K_M", 30,
    ),
    _model_entry(
        "Gemma4 26B-A4B MXFP4_MOE",
        r"models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf",
        "gemma4", "MXFP4_MOE", 30,
    ),
]

# Accumulates the result dict of every configuration that produced numbers.
ALL_RESULTS = []
# ─── Utility Functions ──────────────────────────────────────────
def log(msg):
    """Print ``msg`` prefixed with the current local time as ``[HH:MM:SS]``.

    The line is flushed immediately so progress stays visible during long
    benchmark runs.
    """
    print(f"[{datetime.datetime.now():%H:%M:%S}] {msg}", flush=True)
def kill_server():
    """Force-kill every ``llama-server.exe`` process, then pause 5 seconds.

    Uses the Windows ``taskkill`` command; its output and exit status are
    captured and ignored, so calling this when no server is running is
    harmless. The pause presumably gives the system time to release the
    port and GPU memory before the next launch.
    """
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill_cmd, capture_output=True)
    time.sleep(5)
def get_vram_all():
    """Query ``nvidia-smi`` for the memory usage of every GPU.

    Returns:
        A list with one dict per GPU, each holding the integer fields
        ``gpu`` (index), ``used`` and ``total`` (memory figures as printed
        by nvidia-smi with ``nounits``). Returns an empty list if the query
        fails for any reason (tool missing, timeout, unparsable output).
    """
    query = [
        "nvidia-smi",
        "--query-gpu=index,memory.used,memory.total",
        "--format=csv,noheader,nounits",
    ]
    try:
        completed = subprocess.run(
            query, capture_output=True, text=True, timeout=5
        )
        rows = (
            [field.strip() for field in row.split(",")]
            for row in completed.stdout.strip().split("\n")
        )
        # Rows with fewer than three fields (e.g. an empty output) are skipped.
        return [
            {"gpu": int(row[0]), "used": int(row[1]), "total": int(row[2])}
            for row in rows
            if len(row) >= 3
        ]
    except Exception:
        # VRAM figures are informational only; never let them abort a run.
        return []
def build_cmd(model_path, ngl, t, ub, b, ctk, ctv,
              cpu_moe=False, n_cpu_moe=0, prio=3, nommap=False):
    """Assemble the llama-server argument list for one configuration.

    Args:
        model_path: Value for ``--model``.
        ngl: Value for ``-ngl``.
        t: Value used for both ``-t`` and ``-tb``.
        ub: Value for ``-ub``.
        b: Value for ``-b``.
        ctk: Value for ``--cache-type-k``.
        ctv: Value for ``--cache-type-v``.
        cpu_moe: When true, append ``--cpu-moe`` (takes precedence over
            ``n_cpu_moe``).
        n_cpu_moe: When positive and ``cpu_moe`` is false, append
            ``--n-cpu-moe <n>``.
        prio: Value for ``--prio``.
        nommap: When true, append ``--no-mmap``.

    Returns:
        The command as a list of strings, starting with ``LLAMA_SERVER``.
        Context size (``CONTEXT``), ``-np 1``, ``-fa on``, ``--poll 50``,
        ``--mlock``, port 8000 and host 0.0.0.0 are fixed.
    """
    base_args = [
        LLAMA_SERVER,
        "--model", model_path,
        "-ngl", str(ngl),
        "-c", str(CONTEXT),
        "-np", "1",
        "-fa", "on",
        "--cache-type-k", ctk,
        "--cache-type-v", ctv,
        "-ub", str(ub),
        "-b", str(b),
        "-t", str(t),
        "-tb", str(t),
        "--prio", str(prio),
        "--poll", "50",
        "--mlock",
        "--port", "8000",
        "--host", "0.0.0.0",
    ]

    # MoE offloading: the blanket flag wins over the per-layer count.
    if cpu_moe:
        moe_args = ["--cpu-moe"]
    elif n_cpu_moe > 0:
        moe_args = ["--n-cpu-moe", str(n_cpu_moe)]
    else:
        moe_args = []

    mmap_args = ["--no-mmap"] if nommap else []
    return base_args + moe_args + mmap_args
def start_server(model_path, **kwargs):
    """Launch llama-server in the background for one configuration.

    Args:
        model_path: Path of the model file, forwarded to ``build_cmd``.
        **kwargs: Remaining ``build_cmd`` keyword arguments.

    Returns:
        The ``subprocess.Popen`` handle of the started server. Its output
        is discarded, so ``stdout``/``stderr`` on the handle are ``None``.
    """
    cmd = build_cmd(model_path, **kwargs)
    log(f" CMD: {' '.join(cmd[-20:])}")  # show last 20 args
    # Nothing in this script reads the server's output. Connecting it to a
    # pipe that is never drained lets the OS pipe buffer fill up, at which
    # point the server blocks on its next write and appears hung. Discard
    # the output instead.
    return subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        cwd=os.getcwd(),
    )
def wait_for_server(timeout=SERVER_TIMEOUT):
    """Poll the server's ``/health`` endpoint until it reports ready.

    A request is made every 3 seconds (each with a 3 second timeout). Any
    error while requesting or parsing counts as "not ready yet".

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        ``(True, seconds_waited)`` once the endpoint answers with
        ``{"status": "ok"}``, otherwise ``(False, timeout)``.
    """
    began = time.time()
    health_url = f"{BASE_URL}/health"
    while True:
        if not (time.time() - began < timeout):
            return False, timeout
        try:
            request = urllib.request.Request(health_url)
            with urllib.request.urlopen(request, timeout=3) as response:
                body = json.loads(response.read())
            if body.get("status") == "ok":
                return True, time.time() - began
        except Exception:
            # Connection refused, still loading, bad JSON: just retry.
            pass
        time.sleep(3)
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Send one chat completion request and measure generation throughput.

    Args:
        max_tokens: Upper bound on generated tokens for this request.

    Returns:
        A dict with ``tps`` (completion tokens divided by the full request
        duration, so prompt processing and HTTP overhead are included),
        ``completion_tokens``, ``prompt_tokens`` and ``elapsed`` (seconds).

    Raises:
        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise when the
        request fails, times out (600 s) or returns invalid JSON.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user",
                      "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    # perf_counter is monotonic and high resolution; time.time() can jump
    # when the system clock is adjusted, which would corrupt the result.
    started = time.perf_counter()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - started

    # Tolerate both a missing and an explicit null "usage" object.
    usage = result.get("usage") or {}
    ct = usage.get("completion_tokens", 0)
    pt = usage.get("prompt_tokens", 0)
    return {
        "tps": ct / elapsed if elapsed > 0 else 0,
        "completion_tokens": ct,
        "prompt_tokens": pt,
        "elapsed": elapsed,
    }
def test_config(model_info, label, **kwargs):
    """Start the server with one configuration and benchmark it.

    Any running server is killed first. After a short warmup request,
    ``BENCHMARK_RUNS`` timed generations are performed. The server process
    is always terminated before returning, including when an exception
    (for example Ctrl+C) interrupts the benchmark.

    Args:
        model_info: One entry of ``MODELS``.
        label: Short name of this configuration for logs and results.
        **kwargs: ``build_cmd`` keyword arguments; stored as ``params``.

    Returns:
        The result dict (also appended to ``ALL_RESULTS``) with keys
        ``model``, ``quant``, ``label``, ``avg_tps``, ``best_tps``,
        ``boot_time``, ``vram`` and ``params``; ``None`` if the server did
        not come up or every timed run failed.
    """
    kill_server()
    log(f" [{label}] Starting server...")
    proc = start_server(model_info["path"], **kwargs)
    try:
        ok, boot_time = wait_for_server()
        if not ok:
            log(f" [{label}] FAILED to start (timeout {SERVER_TIMEOUT}s)")
            return None
        vram = get_vram_all()
        vram_str = " | ".join(
            f"GPU{g['gpu']}:{g['used']}/{g['total']}MiB" for g in vram
        )
        log(f" [{label}] Boot: {boot_time:.0f}s | VRAM: {vram_str}")

        # Warmup: its result is discarded and a failure is not fatal.
        try:
            run_benchmark(max_tokens=20)
        except Exception as e:
            log(f" [{label}] Warmup failed ({e})")

        # Benchmark runs
        speeds = []
        for i in range(BENCHMARK_RUNS):
            try:
                r = run_benchmark()
                speeds.append(r["tps"])
                log(f" Run {i+1}: {r['tps']:.2f} t/s")
            except Exception as e:
                log(f" Run {i+1}: ERROR ({e})")
    finally:
        # Never leave a server behind holding VRAM, and reap the process so
        # it is really gone before the next configuration starts.
        proc.kill()
        try:
            proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            log(f" [{label}] WARNING: server did not exit within 30s")

    if not speeds:
        log(f" [{label}] ALL BENCHMARK RUNS FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    log(f" [{label}] => AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
    result = {
        "model": model_info["name"],
        "quant": model_info["quant"],
        "label": label,
        "avg_tps": round(avg, 2),
        "best_tps": round(best, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": kwargs,
    }
    ALL_RESULTS.append(result)
    return result
# ─── Phase Runners ───────────────────────────────────────────────
def phase0_boot_test(model):
    """Find a configuration with which ``model`` boots at full context.

    Three attempts are made in order, stopping at the first that works:
    all layers on GPU, all layers on GPU with ``--cpu-moe``, and finally
    half of the model's layers on GPU.

    Args:
        model: One entry of ``MODELS``.

    Returns:
        The ``test_config`` result of the first working attempt, or the
        (falsy) result of the last attempt if none worked.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 0: Boot Test — {model['name']}")
    log(f"{rule}")

    # Settings shared by every boot attempt.
    shared = dict(t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0")

    # Baseline: -ngl 999 puts every layer on the GPUs.
    outcome = test_config(model, "boot-ngl999", ngl=999, **shared)
    if outcome:
        return outcome

    # Second try: keep all layers on GPU but add --cpu-moe.
    log(" Full GPU failed, trying with --cpu-moe...")
    outcome = test_config(model, "boot-cpumoe", ngl=999, **shared, cpu_moe=True)
    if outcome:
        return outcome

    # Last resort: only half of the layers on GPU.
    log(" --cpu-moe also failed, trying reduced layers...")
    return test_config(
        model, "boot-ngl-half", ngl=model["total_layers"] // 2, **shared
    )
def phase1_gpu_offload(model, baseline):
    """Find the fastest MoE offload strategy with all layers on GPU.

    Args:
        model: One entry of ``MODELS``.
        baseline: Result of ``phase0_boot_test`` (may be falsy); it takes
            part in the comparison and is not measured again when it
            already covers one of the ``--cpu-moe`` on/off variants.

    Returns:
        The result dict with the highest ``avg_tps``, or ``None`` if no
        configuration produced a result.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 1: GPU Offload Strategy — {model['name']}")
    log(f"{rule}")
    results = []
    if baseline:
        results.append(baseline)
    total = model["total_layers"]

    # Strategy A: All GPU + cpu-moe variations
    for cpu_moe in [True, False]:
        label = f"ngl=999 cpu_moe={cpu_moe}"
        # Skip if already tested in baseline
        if baseline and baseline["label"] in ("boot-ngl999", "boot-cpumoe") and \
                baseline["params"].get("cpu_moe", False) == cpu_moe:
            continue
        r = test_config(
            model, label,
            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            cpu_moe=cpu_moe,
        )
        if r:
            results.append(r)

    # Strategy B: n-cpu-moe sweep (selective expert offload).
    # n=0 is intentionally not in the list: build_cmd adds no flag for it,
    # so it would re-run exactly the cpu_moe=False command covered above.
    for n in [5, 10, 15, 20]:
        if n > total:
            continue
        r = test_config(
            model, f"n-cpu-moe={n}",
            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            n_cpu_moe=n,
        )
        if r:
            results.append(r)

    if not results:
        log(" PHASE 1: No configuration worked!")
        return None
    best = max(results, key=lambda x: x["avg_tps"])
    # Separator between label and speed; without it "n-cpu-moe=5" followed
    # by "12.34" reads as "n-cpu-moe=512.34".
    log(f"\n ★ Phase 1 winner: {best['label']} — {best['avg_tps']:.2f} t/s")
    return best
def phase2_threads(model, prev_best):
    """Sweep the CPU thread count with the best offload settings locked.

    Args:
        model: One entry of ``MODELS``.
        prev_best: Winning result of the previous phase; its ``params``
            supply every setting except the thread count.

    Returns:
        The result dict with the highest ``avg_tps`` among ``prev_best``
        and the thread counts tried (ties keep the earlier result).
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 2: CPU Thread Sweep — {model['name']}")
    log(f"{rule}")
    p = prev_best["params"]
    results = [prev_best]
    for t in [2, 4, 6, 8, 10, 12]:
        if t == p.get("t", 6):
            continue  # already measured as prev_best
        r = test_config(
            model, f"t={t}",
            ngl=p["ngl"], t=t, ub=p["ub"], b=p["b"],
            ctk=p["ctk"], ctv=p["ctv"],
            cpu_moe=p.get("cpu_moe", False),
            n_cpu_moe=p.get("n_cpu_moe", 0),
        )
        if r:
            results.append(r)
    best = max(results, key=lambda x: x["avg_tps"])
    # Separator between label and speed; without it "t=4" followed by
    # "12.34" reads as "t=412.34".
    log(f"\n ★ Phase 2 winner: {best['label']} — {best['avg_tps']:.2f} t/s")
    return best
def phase3_batch(model, prev_best):
    """Sweep (ubatch, batch) size pairs with all other settings locked.

    Args:
        model: One entry of ``MODELS``.
        prev_best: Winning result of the previous phase; its ``params``
            supply every setting except ``ub`` and ``b``.

    Returns:
        The result dict with the highest ``avg_tps`` among ``prev_best``
        and the size pairs tried (ties keep the earlier result).
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 3: Batch Size Sweep — {model['name']}")
    log(f"{rule}")
    p = prev_best["params"]
    results = [prev_best]
    size_pairs = [(128, 512), (256, 1024), (256, 2048),
                  (512, 1024), (512, 2048), (512, 4096),
                  (1024, 2048), (1024, 4096)]
    for ub, b in size_pairs:
        if ub == p["ub"] and b == p["b"]:
            continue  # already measured as prev_best
        r = test_config(
            model, f"ub={ub} b={b}",
            ngl=p["ngl"], t=p["t"], ub=ub, b=b,
            ctk=p["ctk"], ctv=p["ctv"],
            cpu_moe=p.get("cpu_moe", False),
            n_cpu_moe=p.get("n_cpu_moe", 0),
        )
        if r:
            results.append(r)
    best = max(results, key=lambda x: x["avg_tps"])
    # Separator between label and speed; without it "b=2048" followed by
    # "12.34" reads as "b=204812.34".
    log(f"\n ★ Phase 3 winner: {best['label']} — {best['avg_tps']:.2f} t/s")
    return best
def phase4_kvcache(model, prev_best):
    """Sweep KV cache types with all other settings locked.

    Args:
        model: One entry of ``MODELS``.
        prev_best: Winning result of the previous phase; its ``params``
            supply every setting except ``ctk`` and ``ctv``.

    Returns:
        The result dict with the highest ``avg_tps`` among ``prev_best``
        and the cache type pairs tried (ties keep the earlier result).
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 4: KV Cache Type Sweep — {model['name']}")
    log(f"{rule}")
    p = prev_best["params"]
    results = [prev_best]
    cache_pairs = [("q4_0", "q4_0"), ("q8_0", "q8_0"),
                   ("q4_0", "q8_0"), ("f16", "f16")]
    for ctk, ctv in cache_pairs:
        if ctk == p["ctk"] and ctv == p["ctv"]:
            continue  # already measured as prev_best
        r = test_config(
            model, f"kv={ctk}/{ctv}",
            ngl=p["ngl"], t=p["t"], ub=p["ub"], b=p["b"],
            ctk=ctk, ctv=ctv,
            cpu_moe=p.get("cpu_moe", False),
            n_cpu_moe=p.get("n_cpu_moe", 0),
        )
        if r:
            results.append(r)
    best = max(results, key=lambda x: x["avg_tps"])
    # Separator between label and speed; without it "kv=q4_0/q8_0" followed
    # by "12.34" reads as "kv=q4_0/q8_012.34".
    log(f"\n ★ Phase 4 winner: {best['label']} — {best['avg_tps']:.2f} t/s")
    return best
def phase5_final(model, prev_best):
    """Re-measure the winning configuration with 5 timed runs.

    The server process is always terminated before returning, including
    when an exception (for example Ctrl+C) interrupts the runs.

    Args:
        model: One entry of ``MODELS``.
        prev_best: Winning result of the previous phases; its ``params``
            are used unchanged to start the server.

    Returns:
        A new result dict labelled ``FINAL-<model name>`` (also appended to
        ``ALL_RESULTS``), or ``prev_best`` if the server did not start or
        every run failed.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 5: Final Verification (5 runs) — {model['name']}")
    log(f"{rule}")
    p = prev_best["params"]
    kill_server()
    proc = start_server(model["path"], **p)
    try:
        ok, boot_time = wait_for_server()
        if not ok:
            log(" FAILED to start for final verification!")
            return prev_best
        vram = get_vram_all()

        # Warmup: its result is discarded and a failure is not fatal.
        try:
            run_benchmark(max_tokens=20)
        except Exception as e:
            log(f" Final warmup failed ({e})")

        speeds = []
        for i in range(5):
            try:
                r = run_benchmark()
                speeds.append(r["tps"])
                log(f" Final Run {i+1}: {r['tps']:.2f} t/s")
            except Exception as e:
                log(f" Final Run {i+1}: ERROR ({e})")
    finally:
        # Never leave a server behind holding VRAM, and reap the process.
        proc.kill()
        try:
            proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            log(" WARNING: server did not exit within 30s")

    if not speeds:
        return prev_best

    avg = sum(speeds) / len(speeds)
    best_tps = max(speeds)
    log(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best_tps:.2f} t/s")
    final = {
        "model": model["name"],
        "quant": model["quant"],
        "label": f"FINAL-{model['name']}",
        "avg_tps": round(avg, 2),
        "best_tps": round(best_tps, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": p,
    }
    ALL_RESULTS.append(final)
    return final
# ─── Main ────────────────────────────────────────────────────────
def run_full_benchmark_for_model(model):
    """Run all benchmark phases for a single model.

    Args:
        model: One entry of ``MODELS``.

    Returns:
        The final result dict for the model; the phase 0 baseline if
        phase 1 found nothing; ``None`` if the model file is missing or the
        model cannot boot at all.
    """
    rule = "#" * 70
    # Check existence first: os.path.getsize raises on a missing file, which
    # would abort the whole benchmark instead of skipping this model.
    model_exists = os.path.exists(model["path"])
    log(f"\n{rule}")
    log(f" MODEL: {model['name']}")
    log(f" File: {model['path']}")
    if model_exists:
        log(f" Size: {os.path.getsize(model['path'])/1024**3:.2f} GB")
    log(f"{rule}")
    if not model_exists:
        log(" SKIP: Model file not found!")
        return None

    # Phase 0: Can it boot?
    baseline = phase0_boot_test(model)
    if not baseline:
        log(f" SKIP: {model['name']} cannot boot at 256K context!")
        return None

    # Phase 1: GPU offload strategy
    best = phase1_gpu_offload(model, baseline)
    if not best:
        return baseline

    # Phases 2-4 each lock in the winner of the previous phase.
    best = phase2_threads(model, best)
    best = phase3_batch(model, best)
    best = phase4_kvcache(model, best)

    # Phase 5: Final verification
    return phase5_final(model, best)
def _save_all_results():
    """Write ALL_RESULTS to scripts/dual_gpu_results.json."""
    os.makedirs("scripts", exist_ok=True)
    with open("scripts/dual_gpu_results.json", "w", encoding="utf-8") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)


def _recommended_command(champion):
    """Return the llama-server command line for the champion's settings."""
    p = champion["params"]
    model_path = next(
        m["path"] for m in MODELS if m["name"] == champion["model"]
    )
    cmd_parts = [
        f"llama-server --model {model_path}",
        f"-ngl {p['ngl']} -c {CONTEXT}",
        f"-t {p['t']} -tb {p['t']}",
        f"-ub {p['ub']} -b {p['b']}",
        "-fa on",
        f"--cache-type-k {p['ctk']} --cache-type-v {p['ctv']}",
        f"--prio {p.get('prio', 3)} --poll 50",
        "--mlock",
    ]
    if p.get("cpu_moe"):
        cmd_parts.append("--cpu-moe")
    elif p.get("n_cpu_moe", 0) > 0:
        cmd_parts.append(f"--n-cpu-moe {p['n_cpu_moe']}")
    if p.get("nommap"):
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")
    return " ".join(cmd_parts)


def _build_summary(model_winners, elapsed):
    """Render the ranking text; ``model_winners`` must be sorted best first."""
    lines = [
        f"Dual-GPU Benchmark Results — {datetime.datetime.now().isoformat()}",
        "Hardware: 2x RTX 3060 12GB | Context: 256K",
        f"Total configs tested: {len(ALL_RESULTS)}",
        f"Total time: {elapsed:.1f} minutes",
        "",
        "=" * 60,
        " RANKING (by AVG t/s)",
        "=" * 60,
    ]
    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    for rank, w in enumerate(model_winners, 1):
        medal = medals.get(rank, " ")
        p = w["params"]
        lines.append(f"\n {medal} #{rank}: {w['model']}")
        lines.append(f" AVG: {w['avg_tps']:.2f} t/s | BEST: {w['best_tps']:.2f} t/s")
        lines.append(f" Boot: {w['boot_time']:.0f}s")
        lines.append(f" ngl={p['ngl']} t={p['t']} ub={p['ub']} b={p['b']}")
        lines.append(f" ctk={p['ctk']} ctv={p['ctv']}")
        if p.get("cpu_moe"):
            lines.append(" --cpu-moe")
        elif p.get("n_cpu_moe", 0) > 0:
            lines.append(f" --n-cpu-moe {p['n_cpu_moe']}")

    champion = model_winners[0]
    lines.append(f"\n{'=' * 60}")
    lines.append(f" ★ CHAMPION: {champion['model']}")
    lines.append(f" {champion['avg_tps']:.2f} t/s average")
    lines.append("=" * 60)
    lines.append("\n Recommended command:")
    lines.append(f" {_recommended_command(champion)}")
    return "\n".join(lines)


def main():
    """Benchmark every model in MODELS and report the overall winner.

    Results are saved to ``scripts/dual_gpu_results.json`` after each model
    and again at the end; the human-readable ranking is printed and written
    to ``scripts/dual_gpu_summary.txt``. Nothing beyond the intermediate
    JSON is written if no model could run.
    """
    start_time = time.time()
    rule = "=" * 70
    # Create the output directory up front: discovering that it is missing
    # only after the first model's benchmark would throw that work away.
    os.makedirs("scripts", exist_ok=True)

    log(rule)
    log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK")
    log(" 2x RTX 3060 (24GB Total) | 256K Context")
    log(f" Models: {len(MODELS)}")
    log(f" Started: {datetime.datetime.now().isoformat()}")
    log(rule)

    # Show GPU info
    for g in get_vram_all():
        log(f" GPU {g['gpu']}: {g['used']}/{g['total']} MiB used")

    # Run benchmarks for each model
    model_winners = []
    for i, model in enumerate(MODELS):
        log(f"\n{rule}")
        log(f" STARTING MODEL {i+1}/{len(MODELS)}: {model['name']}")
        log(rule)
        winner = run_full_benchmark_for_model(model)
        if winner:
            model_winners.append(winner)
        # Save intermediate results so an aborted run keeps its data.
        _save_all_results()
        log(f" Intermediate results saved ({len(ALL_RESULTS)} configs tested)")

    # ─── Grand Final Comparison ──────────────────────────────────
    elapsed = (time.time() - start_time) / 60
    log(f"\n{rule}")
    log(" GRAND FINAL COMPARISON")
    log(f" Total time: {elapsed:.1f} minutes")
    log(f" Configs tested: {len(ALL_RESULTS)}")
    log(rule)
    if not model_winners:
        log(" No models were able to run at 256K context!")
        return

    # Sort by avg t/s, best first
    model_winners.sort(key=lambda x: x["avg_tps"], reverse=True)
    summary = _build_summary(model_winners, elapsed)
    print(summary)
    with open("scripts/dual_gpu_summary.txt", "w", encoding="utf-8") as f:
        f.write(summary)
    _save_all_results()
    log("\n Results: scripts/dual_gpu_results.json")
    log(" Summary: scripts/dual_gpu_summary.txt")
    log(" DONE!")
    kill_server()
# Run the benchmark only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()