331 lines
12 KiB
JavaScript
331 lines
12 KiB
JavaScript
/**
 * Dual-GPU (2x RTX 3060, 24 GB combined VRAM) Smart Model Benchmark v2
 * =====================================================
 * Informed by VRAM analysis — tests models in optimal order.
 *
 * Key insights applied:
 *   - Gemma4 fits entirely in the 24 GB of GPU memory (KV cache ~0.18 GB with SWA)
 *   - Qwen3.5 is tight (~22.5-22.9 GB needed) — try full GPU first
 *   - Skip configs known to fail, minimize wasted time
 *
 * Run:     node scripts/dual_gpu_benchmark_v2.mjs
 * Results: scripts/dual_gpu_results.json + scripts/dual_gpu_summary.txt
 */
|
|
|
|
import { spawn, execSync } from "child_process";
|
|
import { writeFileSync, existsSync, statSync } from "fs";
|
|
|
|
/** Base URL the benchmark client uses to reach the llama-server it launches. */
const BASE_URL = "http://127.0.0.1:8000";

/** Path of the llama-server executable, resolved against the working directory at spawn time. */
const LLAMA = String.raw`llama_bin_run\llama-server.exe`;

/** Context size passed to the server with `-c` (262144 = 256 * 1024). */
const CTX = 262144;

/** Number of timed generation runs per configuration. */
const RUNS = 3;

/** Default `max_tokens` requested in each timed run. */
const TOKENS = 200;

/** Longest time to wait for the server health check to report ready, in milliseconds. */
const BOOT_TIMEOUT = 300_000;
|
|
|
|
// Models ordered: smallest first (most likely to succeed fully on GPU).
// Each entry carries:
//   name    - display name used in logs and results
//   path    - GGUF file path, relative to the working directory
//   quant   - quantization label recorded with every result
//   fitsGPU - true or "maybe"; the trailing comment on each entry shows the
//             VRAM arithmetic behind that estimate
const MODELS = [
  {
    name: "Gemma4-26B MXFP4_MOE",
    path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
    quant: "MXFP4_MOE",
    fitsGPU: true, // 15.5 + 0.18 + 1 = 16.72 GB << 23 GB
  },
  {
    name: "Gemma4-26B Q4_K_M",
    path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
    quant: "Q4_K_M",
    fitsGPU: true, // 15.6 + 0.18 + 1 = 16.82 GB << 23 GB
  },
  {
    name: "Qwen3.5-35B MXFP4_MOE",
    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
    quant: "MXFP4_MOE",
    fitsGPU: "maybe", // 20.1 + 1.41 + 1 = 22.51 GB — tight
  },
  {
    name: "Qwen3.5-35B Q4_K_M",
    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
    quant: "Q4_K_M",
    fitsGPU: "maybe", // 20.5 + 1.41 + 1 = 22.91 GB — very tight
  },
];
|
|
|
|
// Every successful measurement, in the order it was taken; this array is what
// gets serialized to the results JSON file.
const ALL = [];

// Handle of the llama-server child process started most recently, or null
// when none is being tracked.
let currentProc = null;
|
|
|
|
// ─── Utilities ─────────────────────────────────────────────────
|
|
const log = (m) => console.log(`[${new Date().toLocaleTimeString("ko-KR",{hour12:false})}] ${m}`);
|
|
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
|
|
|
|
/**
 * Stops the tracked llama-server process and any other `llama-server.exe`
 * instances, then pauses for five seconds before returning.
 *
 * Both termination attempts are deliberately best-effort: failures are
 * ignored so that calling this when nothing is running is harmless.
 *
 * @returns {Promise<void>}
 */
async function kill() {
  if (currentProc) {
    // The child may already be gone; a failed signal is not an error here.
    try { currentProc.kill("SIGKILL"); } catch {}
    currentProc = null;
  }

  // Sweep up stray servers by image name (Windows `taskkill`). Any failure,
  // such as no matching process, is ignored on purpose.
  try { execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" }); } catch {}

  // Fixed pause before the caller starts the next server —
  // NOTE(review): presumably to let VRAM and the port be released; confirm.
  await sleep(5000);
}
|
|
|
|
/**
 * Reads per-GPU memory usage by running `nvidia-smi`.
 *
 * @returns {{gpu: number, used: number, total: number}[]} One record per GPU
 *   line reported, with the numbers as printed by nvidia-smi (queried with
 *   `nounits`). Returns an empty array when the command fails or times out.
 */
function vram() {
  let output;
  try {
    output = execSync(
      "nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits",
      { encoding: "utf-8", timeout: 5000 },
    );
  } catch {
    // nvidia-smi missing, failing, or too slow: report "no data" instead of throwing.
    return [];
  }

  return output
    .split("\n")
    .map((line) => line.trim())
    // Blank lines (e.g. empty output) would otherwise become all-NaN records.
    .filter((line) => line !== "")
    .map((line) => {
      const [gpu, used, total] = line.split(",").map((field) => Number.parseInt(field, 10));
      return { gpu, used, total };
    })
    // Drop lines that do not start with a GPU index at all.
    .filter((record) => Number.isInteger(record.gpu));
}
|
|
|
|
/**
 * Launches llama-server for one model/parameter combination and records the
 * child process in `currentProc`.
 *
 * @param {string} modelPath - Path of the GGUF model file.
 * @param {object} p - Tuning parameters.
 * @param {number} p.ngl - Value for `-ngl`.
 * @param {string} p.ctk - Value for `--cache-type-k`.
 * @param {string} p.ctv - Value for `--cache-type-v`.
 * @param {number} p.ub - Value for `-ub`.
 * @param {number} p.b - Value for `-b`.
 * @param {number} p.t - Thread count, used for both `-t` and `-tb`.
 * @param {number} [p.prio=3] - Value for `--prio`; an explicit 0 is honoured.
 * @param {boolean} [p.cpuMoe] - Adds `--cpu-moe`; takes precedence over `nCpuMoe`.
 * @param {number} [p.nCpuMoe] - When positive, adds `--n-cpu-moe <n>`.
 * @param {boolean} [p.nommap] - Adds `--no-mmap`.
 * @returns {import("child_process").ChildProcess} The spawned process.
 */
function startServer(modelPath, p) {
  const args = [
    "--model", modelPath, "-ngl", String(p.ngl),
    "-c", String(CTX), "-np", "1", "-fa", "on",
    "--cache-type-k", p.ctk, "--cache-type-v", p.ctv,
    "-ub", String(p.ub), "-b", String(p.b),
    "-t", String(p.t), "-tb", String(p.t),
    // `??` rather than `||`: a caller-supplied priority of 0 must not become 3.
    "--prio", String(p.prio ?? 3), "--poll", "50", "--mlock",
    // NOTE(review): 0.0.0.0 listens on every interface although the benchmark
    // itself only talks to 127.0.0.1 — confirm this exposure is intended.
    "--port", "8000", "--host", "0.0.0.0",
  ];
  if (p.cpuMoe) args.push("--cpu-moe");
  else if ((p.nCpuMoe || 0) > 0) args.push("--n-cpu-moe", String(p.nCpuMoe));
  if (p.nommap) args.push("--no-mmap");

  const proc = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });

  // Without a listener, a spawn failure (e.g. missing executable) is emitted as
  // an unhandled 'error' event and takes the whole benchmark down.
  proc.on("error", (err) => log(` llama-server process error: ${err.message}`));

  // Nothing consumes the server's output; keep the pipes flowing so the child
  // cannot stall on a full pipe buffer. The streams can be absent when the
  // spawn itself failed.
  proc.stdout?.resume();
  proc.stderr?.resume();

  currentProc = proc;
  return proc;
}
|
|
|
|
/**
 * Polls the server's `/health` endpoint until it reports `status: "ok"`.
 *
 * Polling happens every 3 seconds with a 3-second timeout per request. The
 * wait ends early when the tracked server process has already exited, because
 * a dead process can never become healthy and waiting out the full timeout
 * only wastes benchmark time.
 *
 * @param {number} [timeout=BOOT_TIMEOUT] - Maximum total wait in milliseconds.
 * @returns {Promise<{ok: boolean, boot: number}>} `ok` tells whether the server
 *   became ready; `boot` is the seconds waited (the full timeout, in seconds,
 *   when the wait timed out).
 */
async function waitReady(timeout = BOOT_TIMEOUT) {
  const t0 = Date.now();
  while (Date.now() - t0 < timeout) {
    // exitCode / signalCode stay null for as long as the child is running.
    if (currentProc && (currentProc.exitCode !== null || currentProc.signalCode !== null)) {
      return { ok: false, boot: (Date.now() - t0) / 1000 };
    }

    try {
      const r = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const d = await r.json();
      if (d.status === "ok") return { ok: true, boot: (Date.now() - t0) / 1000 };
    } catch {
      // Connection refused, request timeout or a non-JSON body: not ready yet.
    }
    await sleep(3000);
  }
  return { ok: false, boot: timeout / 1000 };
}
|
|
|
|
/**
 * Sends one chat-completion request and measures generation throughput.
 *
 * The elapsed time spans the whole request, from just before it is sent until
 * the JSON body has been parsed, so it includes request overhead as well as
 * generation.
 *
 * @param {number} [n=TOKENS] - `max_tokens` for the request.
 * @returns {Promise<{tps: number, ct: number, dt: number}>} Tokens per second,
 *   completion token count, and elapsed seconds.
 * @throws {Error} When the request fails or times out (10 minutes), the server
 *   answers with a non-2xx status, or the response reports no completion
 *   tokens. Previously those last two cases were returned as 0 t/s and
 *   silently dragged averages down.
 */
async function bench(n = TOKENS) {
  const t0 = Date.now();
  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "m",
      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
      max_tokens: n, temperature: 0,
    }),
    signal: AbortSignal.timeout(600_000),
  });
  if (!r.ok) {
    throw new Error(`HTTP ${r.status} ${r.statusText}`.trim());
  }

  const d = await r.json();
  const dt = (Date.now() - t0) / 1000;
  const ct = d.usage?.completion_tokens ?? 0;
  if (!(ct > 0)) {
    throw new Error("response reported no completion tokens");
  }
  return { tps: ct / dt, ct, dt };
}
|
|
|
|
/**
 * Benchmarks a single server configuration end to end: restart the server,
 * wait for it to become ready, sample VRAM, run one warm-up request, then time
 * `runs` requests and record the outcome in `ALL`.
 *
 * @param {{name: string, path: string, quant: string}} model - Model to load.
 * @param {string} label - Short description of the configuration, used in logs
 *   and stored with the result.
 * @param {object} params - Parameters forwarded to `startServer`.
 * @param {number} [runs=RUNS] - Number of timed runs.
 * @returns {Promise<object|null>} The recorded result (also pushed to `ALL`),
 *   or null when the server did not become ready or every timed run failed.
 */
async function testConfig(model, label, params, runs = RUNS) {
  await kill();
  log(` [${label}] Starting...`);
  startServer(model.path, params);
  const { ok, boot } = await waitReady();
  if (!ok) {
    log(` [${label}] ✗ FAILED (timeout)`);
    await kill();
    return null;
  }

  const v = vram();
  const vs = v.map((g) => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | ");
  log(` [${label}] Boot:${boot.toFixed(0)}s | VRAM: ${vs}`);

  // Warm-up request; its result and any failure are intentionally discarded.
  try { await bench(20); } catch {}

  const speeds = [];
  for (let i = 0; i < runs; i++) {
    try {
      const r = await bench();
      speeds.push(r.tps);
      log(` Run${i+1}: ${r.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run${i+1}: ERR ${e.message}`);
    }
  }
  await kill();

  if (!speeds.length) {
    log(` [${label}] ✗ ALL RUNS FAILED`);
    return null;
  }
  const avg = speeds.reduce((sum, s) => sum + s, 0) / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);

  const res = {
    model: model.name, quant: model.quant, label,
    avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2),
    boot: +boot.toFixed(1), vram: v, params,
  };
  ALL.push(res);
  return res;
}
|
|
|
|
/**
 * Saves the results gathered so far (`ALL`) as pretty-printed JSON.
 *
 * Called after every measurement so a crash loses little. A failed write
 * (for example a locked file) is logged and skipped instead of aborting a
 * benchmark that may already have run for a long time.
 */
function saveIntermediate() {
  try {
    writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL, null, 2));
  } catch (err) {
    log(` ! Could not save intermediate results: ${err.message}`);
  }
}
|
|
|
|
// ─── Smart Phase Runner ────────────────────────────────────────
|
|
|
|
/**
 * Searches for the fastest server configuration for one model.
 *
 * Steps, each keeping the best result seen so far:
 *   1. Find an offload mode that boots: pure GPU, then `--n-cpu-moe` with
 *      5/10/15/20, then `--cpu-moe`. If pure GPU booted, `--cpu-moe` is also
 *      measured for comparison.
 *   2. Thread-count sweep.
 *   3. Batch-size sweep (`-ub` / `-b`).
 *   4. KV-cache type sweep.
 *   5. Re-run the best configuration with five timed runs and record it
 *      under the label `FINAL`.
 * Results are saved after every measurement.
 *
 * @param {{name: string, path: string, quant: string, fitsGPU: (boolean|string)}} model
 * @returns {Promise<object|null>} The `FINAL` result; the best sweep result
 *   when the final verification could not be completed; or null when the
 *   model file is missing or no configuration boots.
 */
async function tuneModel(model) {
  log(`\n${"#".repeat(65)}`);
  log(` ${model.name} (${model.quant})`);
  if (!existsSync(model.path)) {
    log(" ✗ File not found, SKIP");
    return null;
  }
  const sz = (statSync(model.path).size / 1024**3).toFixed(2);
  log(` Size: ${sz} GB | Fits GPU: ${model.fitsGPU}`);
  log(`${"#".repeat(65)}`);

  // Starting parameters shared by every Step 1 attempt.
  const start = { ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };

  // ── Step 1: Find working GPU config ──
  log(`\n ── Step 1: Find optimal GPU offload ──`);
  let baseline = null;

  if (model.fitsGPU === true || model.fitsGPU === "maybe") {
    // Try full GPU, no CPU offload
    baseline = await testConfig(model, "ngl=999 pure-GPU", { ...start });
    saveIntermediate();
  }

  if (!baseline) {
    // Try n-cpu-moe values (ascending — find minimum needed)
    for (const n of [5, 10, 15, 20]) {
      baseline = await testConfig(model, `n-cpu-moe=${n}`, { ...start, nCpuMoe: n });
      saveIntermediate();
      if (baseline) break; // found minimum working offload
    }
  }

  if (!baseline) {
    // Last resort: full cpu-moe
    baseline = await testConfig(model, "cpu-moe", { ...start, cpuMoe: true });
    saveIntermediate();
  }

  if (!baseline) {
    log(` ✗ ${model.name} cannot boot at 256K!`);
    return null;
  }

  const bp = baseline.params; // carry forward best params

  // If pure GPU worked, also test cpu-moe to compare (it might be faster due to memory)
  if (!bp.cpuMoe && !bp.nCpuMoe) {
    const alt = await testConfig(model, "compare: cpu-moe", { ...bp, cpuMoe: true });
    saveIntermediate();
    if (alt && alt.avg_tps > baseline.avg_tps) baseline = alt;
  }

  let best = baseline;

  // ── Step 2: Thread sweep ──
  log(`\n ── Step 2: Thread sweep ──`);
  for (const t of [2, 4, 8, 10, 12]) {
    if (t === best.params.t) continue; // already measured as the current best
    const r = await testConfig(model, `t=${t}`, { ...best.params, t });
    saveIntermediate();
    if (r && r.avg_tps > best.avg_tps) best = r;
  }

  // ── Step 3: Batch sweep ──
  log(`\n ── Step 3: Batch sweep ──`);
  for (const [ub, b] of [[256, 1024], [256, 2048], [512, 2048], [512, 4096], [1024, 2048], [1024, 4096]]) {
    if (ub === best.params.ub && b === best.params.b) continue;
    const r = await testConfig(model, `ub=${ub} b=${b}`, { ...best.params, ub, b });
    saveIntermediate();
    if (r && r.avg_tps > best.avg_tps) best = r;
  }

  // ── Step 4: KV cache sweep ──
  log(`\n ── Step 4: KV cache type ──`);
  for (const [ctk, ctv] of [["q8_0", "q8_0"], ["q4_0", "q8_0"], ["f16", "f16"]]) {
    if (ctk === best.params.ctk && ctv === best.params.ctv) continue;
    const r = await testConfig(model, `kv=${ctk}/${ctv}`, { ...best.params, ctk, ctv });
    saveIntermediate();
    if (r && r.avg_tps > best.avg_tps) best = r;
  }

  // ── Step 5: Final verification (5 runs) ──
  log(`\n ── Step 5: Final verification ──`);
  await kill();
  startServer(model.path, best.params);
  const { ok, boot } = await waitReady();
  if (!ok) {
    // The winner failed to boot this time; fall back to its sweep result.
    await kill();
    return best;
  }
  const v = vram();
  // Warm-up request; its result and any failure are intentionally discarded.
  try { await bench(20); } catch {}

  const finals = [];
  for (let i = 0; i < 5; i++) {
    try {
      const r = await bench();
      finals.push(r.tps);
      log(` Final ${i+1}: ${r.tps.toFixed(2)} t/s`);
    } catch (e) {
      // Include the reason, matching how testConfig reports failed runs.
      log(` Final ${i+1}: ERR ${e.message}`);
    }
  }
  await kill();

  if (finals.length > 0) {
    const avg = finals.reduce((sum, s) => sum + s, 0) / finals.length;
    const bst = Math.max(...finals);
    log(` ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${bst.toFixed(2)} t/s`);
    const final = {
      model: model.name, quant: model.quant, label: `FINAL`,
      avg_tps: +avg.toFixed(2), best_tps: +bst.toFixed(2),
      boot: +boot.toFixed(1), vram: v, params: best.params,
    };
    ALL.push(final);
    saveIntermediate();
    return final;
  }
  return best;
}
|
|
|
|
// ─── Main ──────────────────────────────────────────────────────
|
|
/**
 * Entry point: tunes every model in `MODELS`, ranks the winners by average
 * tokens per second, prints the summary and writes it together with the full
 * results to the `scripts/` directory.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const t0 = Date.now();
  log("=".repeat(65));
  log(" DUAL-GPU BENCHMARK v2 — Smart Strategy");
  log(" 2x RTX 3060 (24GB) | 256K Context");
  log(" " + new Date().toISOString());
  log("=".repeat(65));
  vram().forEach((g) => log(` GPU${g.gpu}: ${g.used}/${g.total} MiB`));

  const winners = [];
  for (let i = 0; i < MODELS.length; i++) {
    log(`\n${"=".repeat(65)}`);
    log(` MODEL ${i+1}/${MODELS.length}: ${MODELS[i].name}`);
    log("=".repeat(65));
    const w = await tuneModel(MODELS[i]);
    if (w) winners.push(w);
    saveIntermediate();
  }

  // ─── Summary ──────────────────────────────────────────────
  const elapsed = ((Date.now() - t0) / 60000).toFixed(1);
  winners.sort((a, b) => b.avg_tps - a.avg_tps); // fastest first
  const medals = ["🥇", "🥈", "🥉", " "];

  const lines = [
    `Dual-GPU Benchmark v2 — ${new Date().toISOString()}`,
    `2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`,
    "", "=".repeat(55), " RANKING", "=".repeat(55),
  ];
  for (let i = 0; i < winners.length; i++) {
    const w = winners[i];
    const p = w.params;
    lines.push("", ` ${medals[i]||" "} #${i+1}: ${w.model}`);
    lines.push(` AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`);
    lines.push(` ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b} ctk=${p.ctk} ctv=${p.ctv}`);
    if (p.cpuMoe) lines.push(` --cpu-moe`);
    else if (p.nCpuMoe) lines.push(` --n-cpu-moe ${p.nCpuMoe}`);
  }

  if (winners.length > 0) {
    const c = winners[0];
    const cp = c.params;
    lines.push("", "=".repeat(55), ` ★ CHAMPION: ${c.model} — ${c.avg_tps} t/s`, "=".repeat(55));
    // Mirror the flags startServer passes so the recommendation reproduces the
    // configuration that was actually measured (including `-np 1` and, when
    // set, `--no-mmap`).
    const cmd = [
      `llama-server --model ${MODELS.find((m) => m.name === c.model).path}`,
      `-ngl ${cp.ngl} -c ${CTX} -np 1 -t ${cp.t} -tb ${cp.t}`,
      `-ub ${cp.ub} -b ${cp.b} -fa on`,
      `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
      `--prio ${cp.prio ?? 3} --poll 50 --mlock`,
      cp.cpuMoe ? "--cpu-moe" : cp.nCpuMoe ? `--n-cpu-moe ${cp.nCpuMoe}` : "",
      cp.nommap ? "--no-mmap" : "",
      "--port 8000 --host 0.0.0.0",
    ].filter(Boolean).join(" ");
    lines.push("", " Recommended:", ` ${cmd}`);
  }

  const summary = lines.join("\n");
  console.log("\n" + summary);
  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL, null, 2));
  log(`\n Saved: dual_gpu_results.json + dual_gpu_summary.txt`);
  log(" DONE!");
  await kill();
}
|
|
|
|
// Run the benchmark. On a fatal error, report it and stop any llama-server
// still running before exiting, so a crashed run does not leave the server
// process behind.
main().catch(async (e) => {
  console.error("FATAL:", e);
  try {
    await kill();
  } catch {
    // Cleanup is best-effort; the original failure is what matters.
  }
  process.exit(1);
});
|