/**
 * Dual-GPU (2x RTX 3060, 12GB each / 24GB total) Comprehensive Model Benchmark
 * ===========================================================================
 * Tests 4 models across multiple parameter configurations to find
 * the best model + settings for a 256K-context coding agent.
 *
 * Models:
 *   1. Qwen3.5-35B-A3B  Q4_K_M     (~20.5 GB)
 *   2. Qwen3.5-35B-A3B  MXFP4_MOE  (~20.1 GB)
 *   3. Gemma4 26B-A4B   Q4_K_M     (~15.6 GB)
 *   4. Gemma4 26B-A4B   MXFP4_MOE  (~15.5 GB)
 *
 * Run: node scripts/dual_gpu_benchmark.mjs
 */

import { spawn, execSync } from "child_process";
import { writeFileSync, statSync, existsSync, mkdirSync } from "fs";
import { resolve } from "path";

// ─── Configuration ─────────────────────────────────────────────

// Address that health checks and benchmark requests are sent to. The port
// matches the "--port" value buildCmd passes to the server.
const BASE_URL = "http://127.0.0.1:8000";
// Server executable, relative to the directory the script is run from.
const LLAMA_SERVER = String.raw`llama_bin_run\llama-server.exe`;
// Context size handed to the server via "-c".
const CONTEXT = 262144; // 256K
// Number of timed completion requests per configuration.
const BENCHMARK_RUNS = 3;
// max_tokens used for each timed completion request.
const BENCHMARK_TOKENS = 200;
// How long waitForServer polls the health endpoint before giving up.
const SERVER_TIMEOUT = 300_000; // ms

// Models under test. `totalLayers` sizes the reduced-layer boot fallback and
// caps the n-cpu-moe sweep; `type` and `quant` are descriptive metadata.
const MODELS = [
  {
    name: "Qwen3.5-35B-A3B Q4_K_M",
    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
    type: "qwen",
    quant: "Q4_K_M",
    totalLayers: 64,
  },
  {
    name: "Qwen3.5-35B-A3B MXFP4_MOE",
    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
    type: "qwen",
    quant: "MXFP4_MOE",
    totalLayers: 64,
  },
  {
    name: "Gemma4 26B-A4B Q4_K_M",
    path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
    type: "gemma4",
    quant: "Q4_K_M",
    totalLayers: 30,
  },
  {
    name: "Gemma4 26B-A4B MXFP4_MOE",
    path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
    type: "gemma4",
    quant: "MXFP4_MOE",
    totalLayers: 30,
  },
];

// Every successfully benchmarked configuration is appended here and later
// serialized to the results JSON file.
const ALL_RESULTS = [];

// ─── Utility ───────────────────────────────────────────────────

/**
 * Prints one line to stdout, prefixed with the current local time in
 * square brackets (formatted with the ko-KR locale, 12-hour clock disabled).
 *
 * @param {string} msg - Text to print after the timestamp prefix.
 */
function log(msg) {
  const stamp = new Date().toLocaleTimeString("ko-KR", { hour12: false });
  console.log(`[${stamp}] ${msg}`);
}

/**
 * Returns a promise that resolves (with no value) once a timer of `ms`
 * milliseconds fires.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Force-kills every process named llama-server.exe via `taskkill`, then
 * waits five seconds before resolving.
 *
 * @returns {Promise<void>} Resolves after the 5000 ms pause.
 */
function killServer() {
  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // Deliberately best-effort: execSync throws whenever the command fails
    // (for example when there is nothing to kill), and that is not an error
    // for the benchmark.
  }
  return sleep(5000);
}

/**
 * Queries `nvidia-smi` for per-GPU memory usage.
 *
 * The query asks for `index,memory.used,memory.total` as header-less,
 * unit-less CSV; callers print the numbers with a "MiB" suffix.
 *
 * @returns {{gpu: number, used: number, total: number}[]} One entry per
 *   non-empty output line, or an empty array when the command fails or
 *   does not finish within 5 seconds.
 */
function getVramAll() {
  let csv;
  try {
    csv = execSync(
      "nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits",
      { encoding: "utf-8", timeout: 5000 }
    );
  } catch {
    // nvidia-smi missing, failing or timing out: report "no GPU data".
    return [];
  }

  return csv
    .split(/\r?\n/)
    // Blank lines (including completely empty output) would otherwise turn
    // into an entry made of NaN values.
    .filter((row) => row.trim() !== "")
    .map((row) => {
      const [gpu, used, total] = row
        .split(",")
        .map((field) => Number.parseInt(field.trim(), 10));
      return { gpu, used, total };
    });
}

/**
 * Builds the argv array used to launch the server for one configuration.
 *
 * @param {string} modelPath - Value passed to "--model".
 * @param {object} params - Tunables for this run.
 * @param {number} params.ngl - Value for "-ngl".
 * @param {number} params.t - Thread count, used for both "-t" and "-tb".
 * @param {number} params.ub - Value for "-ub".
 * @param {number} params.b - Value for "-b".
 * @param {string} params.ctk - Value for "--cache-type-k".
 * @param {string} params.ctv - Value for "--cache-type-v".
 * @param {boolean} [params.cpuMoe=false] - Append "--cpu-moe".
 * @param {number} [params.nCpuMoe=0] - Append "--n-cpu-moe <n>" when > 0;
 *   ignored when `cpuMoe` is set.
 * @param {number} [params.prio=3] - Value for "--prio".
 * @param {boolean} [params.nommap=false] - Append "--no-mmap".
 * @returns {string[]} Executable path followed by its arguments.
 */
function buildCmd(modelPath, params) {
  const {
    ngl,
    t,
    ub,
    b,
    ctk,
    ctv,
    cpuMoe = false,
    nCpuMoe = 0,
    prio = 3,
    nommap = false,
  } = params;

  const threads = String(t);
  const argv = [
    LLAMA_SERVER,
    "--model", modelPath,
    "-ngl", String(ngl),
    "-c", String(CONTEXT),
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", ctk,
    "--cache-type-v", ctv,
    "-ub", String(ub),
    "-b", String(b),
    "-t", threads,
    "-tb", threads,
    "--prio", String(prio),
    "--poll", "50",
    "--mlock",
    "--port", "8000",
    "--host", "0.0.0.0",
  ];

  // The two MoE offload switches are mutually exclusive; --cpu-moe wins.
  if (cpuMoe) {
    argv.push("--cpu-moe");
  } else if (nCpuMoe > 0) {
    argv.push("--n-cpu-moe", String(nCpuMoe));
  }
  if (nommap) {
    argv.push("--no-mmap");
  }

  return argv;
}

/**
 * Launches the server for one configuration and returns the child process.
 *
 * stdout/stderr stay piped (so callers can still attach listeners), but both
 * streams are put into flowing mode: nothing in this script reads them, and
 * an unread pipe eventually fills up and blocks the child on its next write.
 * A spawn failure is logged instead of surfacing as an unhandled 'error'
 * event, which would terminate the whole benchmark.
 *
 * @param {string} modelPath - Model file handed to buildCmd.
 * @param {object} params - Configuration handed to buildCmd.
 * @returns {import("child_process").ChildProcess} The spawned process.
 */
function startServer(modelPath, params) {
  const [exe, ...args] = buildCmd(modelPath, params);
  // Only the last 12 arguments are echoed to keep the log line short.
  log(` CMD: ${exe} ${args.slice(-12).join(" ")} ...`);

  const child = spawn(exe, args, {
    cwd: process.cwd(),
    stdio: ["ignore", "pipe", "pipe"],
  });

  child.on("error", (err) => {
    log(` Server process error (${exe}): ${err.message}`);
  });
  // Discard output so the child can never stall on a full pipe.
  child.stdout?.resume();
  child.stderr?.resume();

  return child;
}

/**
 * Polls `${BASE_URL}/health` until it answers with `{ status: "ok" }`.
 *
 * Each probe is aborted after 3 seconds and probes are spaced 3 seconds
 * apart. Any probe failure (connection refused, non-JSON body, abort) just
 * means "not ready yet".
 *
 * @param {number} [timeoutMs=SERVER_TIMEOUT] - Give up after this many ms.
 * @param {{exitCode?: number|null, signalCode?: string|null}|null} [proc=null]
 *   Optional child process of the server. When given and it has already
 *   terminated, polling stops immediately instead of running out the full
 *   timeout against a server that can never come up.
 * @returns {Promise<{ok: boolean, bootTime: number}>} `bootTime` is in
 *   seconds: time until healthy on success, time until the process was seen
 *   dead on early exit, or `timeoutMs / 1000` on timeout.
 */
async function waitForServer(timeoutMs = SERVER_TIMEOUT, proc = null) {
  const startedAt = Date.now();
  const secondsSinceStart = () => (Date.now() - startedAt) / 1000;

  while (Date.now() - startedAt < timeoutMs) {
    if (proc != null && (proc.exitCode != null || proc.signalCode != null)) {
      return { ok: false, bootTime: secondsSinceStart() };
    }

    try {
      const resp = await fetch(`${BASE_URL}/health`, {
        signal: AbortSignal.timeout(3000),
      });
      const health = await resp.json();
      if (health.status === "ok") {
        return { ok: true, bootTime: secondsSinceStart() };
      }
    } catch {
      // Not listening yet or still loading; try again after the pause.
    }

    await sleep(3000);
  }

  return { ok: false, bootTime: timeoutMs / 1000 };
}

/**
 * Sends one chat-completion request and measures generation throughput.
 *
 * Throughput is completion tokens divided by the wall-clock time of the whole
 * request (from just before the request is sent until the body is parsed).
 *
 * @param {number} [maxTokens=BENCHMARK_TOKENS] - `max_tokens` for the request.
 * @returns {Promise<{tps: number, completionTokens: number,
 *   promptTokens: number, elapsed: number}>} `elapsed` is in seconds.
 * @throws {Error} When the request fails or times out (10 minutes), the
 *   server answers with a non-2xx status, or the response carries no numeric
 *   `usage.completion_tokens`. Failing loudly keeps a broken request from
 *   being recorded as a legitimate 0 t/s measurement.
 */
async function runBenchmark(maxTokens = BENCHMARK_TOKENS) {
  const body = JSON.stringify({
    model: "local-model",
    messages: [
      { role: "user", content: "Count from 1 to 50, writing each number on a new line." },
    ],
    max_tokens: maxTokens,
    temperature: 0.0,
  });

  const startedAt = Date.now();
  const resp = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body,
    signal: AbortSignal.timeout(600_000),
  });

  if (!resp.ok) {
    const detail = await resp.text().catch(() => "");
    const suffix = detail ? `: ${detail.slice(0, 200)}` : "";
    throw new Error(`HTTP ${resp.status} from /v1/chat/completions${suffix}`);
  }

  const result = await resp.json();
  const elapsed = (Date.now() - startedAt) / 1000;

  const usage = result?.usage;
  const completionTokens = usage?.completion_tokens;
  if (!Number.isFinite(completionTokens)) {
    throw new Error("response has no numeric usage.completion_tokens");
  }

  return {
    tps: elapsed > 0 ? completionTokens / elapsed : 0,
    completionTokens,
    promptTokens: usage.prompt_tokens || 0,
    elapsed,
  };
}

/**
 * Benchmarks one server configuration end to end: kill any running server,
 * start a new one, wait until it is healthy, do one short warmup request,
 * then take BENCHMARK_RUNS timed measurements.
 *
 * The spawned server is always killed before this function returns, including
 * when something in between throws.
 *
 * @param {{name: string, quant: string, path: string}} model - Model entry.
 * @param {string} label - Short name for this configuration in logs/results.
 * @param {object} params - Configuration forwarded to startServer.
 * @returns {Promise<object|null>} The result record (also appended to
 *   ALL_RESULTS), or null when the server did not come up or every timed
 *   run failed.
 */
async function testConfig(model, label, params) {
  await killServer();
  log(` [${label}] Starting server...`);

  const proc = startServer(model.path, params);
  const speeds = [];
  let bootTime;
  let vram;

  try {
    // Passing the process lets the wait end early if the server has died;
    // an implementation that only takes the timeout ignores the extra arg.
    const boot = await waitForServer(SERVER_TIMEOUT, proc);
    bootTime = boot.bootTime;
    if (!boot.ok) {
      log(` [${label}] FAILED to start (gave up after ${bootTime.toFixed(0)}s)`);
      return null;
    }

    vram = getVramAll();
    const vramStr = vram
      .map((g) => `GPU${g.gpu}:${g.used}/${g.total}MiB`)
      .join(" | ");
    log(` [${label}] Boot: ${bootTime.toFixed(0)}s | VRAM: ${vramStr}`);

    // Warmup: result is discarded, and a failure here is not fatal.
    try {
      await runBenchmark(20);
    } catch (e) {
      log(` [${label}] Warmup failed (${e.message})`);
    }

    // Timed runs: a failing run is logged and simply contributes no sample.
    for (let run = 1; run <= BENCHMARK_RUNS; run++) {
      try {
        const { tps } = await runBenchmark();
        speeds.push(tps);
        log(` Run ${run}: ${tps.toFixed(2)} t/s`);
      } catch (e) {
        log(` Run ${run}: ERROR (${e.message})`);
      }
    }
  } finally {
    proc.kill("SIGKILL");
  }

  if (speeds.length === 0) {
    log(` [${label}] ALL BENCHMARK RUNS FAILED`);
    return null;
  }

  const avg = speeds.reduce((sum, v) => sum + v, 0) / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] => AVG: ${avg.toFixed(2)} t/s | BEST: ${best.toFixed(2)} t/s`);

  const result = {
    model: model.name,
    quant: model.quant,
    label,
    avg_tps: +avg.toFixed(2),
    best_tps: +best.toFixed(2),
    boot_time: +bootTime.toFixed(1),
    vram,
    params,
  };
  ALL_RESULTS.push(result);
  return result;
}

// ─── Phase Runners ─────────────────────────────────────────────

/**
 * Phase 0: finds a configuration that boots at all.
 *
 * Tries three fallbacks in order and stops at the first one that yields a
 * result: everything on GPU (ngl=999), ngl=999 with --cpu-moe, and finally
 * half of the model's layers on GPU.
 *
 * @param {{name: string, totalLayers: number}} model - Model entry.
 * @returns {Promise<object|null>} Result of the first working attempt, or
 *   the (null) result of the last attempt when none worked.
 */
async function phase0_bootTest(model) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 0: Boot Test — ${model.name}`);
  log(rule);

  const common = { t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };
  const attempts = [
    {
      label: "boot-ngl999",
      params: { ngl: 999, ...common },
    },
    {
      notice: " Full GPU failed, trying with --cpu-moe...",
      label: "boot-cpumoe",
      params: { ngl: 999, ...common, cpuMoe: true },
    },
    {
      notice: " --cpu-moe also failed, trying reduced layers...",
      label: "boot-ngl-half",
      params: { ngl: Math.floor(model.totalLayers / 2), ...common },
    },
  ];

  let outcome = null;
  for (const { notice, label, params } of attempts) {
    if (notice) log(notice);
    outcome = await testConfig(model, label, params);
    if (outcome) break;
  }
  return outcome;
}

/**
 * Phase 1: compares MoE offload strategies at ngl=999.
 *
 * Candidates are --cpu-moe on/off followed by an --n-cpu-moe sweep
 * (0, 5, 10, 15, 20, limited to the model's layer count). Candidates that
 * produce the same server command as one already measured are skipped:
 * buildCmd emits no offload flag for both `cpuMoe: false` and `nCpuMoe: 0`,
 * and the baseline from phase 0 may already be one of the candidates.
 *
 * @param {{name: string, totalLayers: number}} model - Model entry.
 * @param {object|null} baseline - Result from phase 0; competes with the
 *   candidates when present.
 * @returns {Promise<object|null>} Result with the highest avg_tps (the later
 *   one on ties), or null when there is no result at all.
 */
async function phase1_gpuOffload(model, baseline) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 1: GPU Offload Strategy — ${model.name}`);
  log(rule);

  const shared = { ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };

  // Collapses the two offload knobs into the flag buildCmd would emit.
  const offloadMode = ({ cpuMoe = false, nCpuMoe = 0 }) => {
    if (cpuMoe) return "cpu-moe";
    return nCpuMoe > 0 ? `n-cpu-moe=${nCpuMoe}` : "none";
  };

  const results = baseline ? [baseline] : [];
  const measured = new Set();
  const baseParams = baseline?.params;
  if (baseParams && Object.keys(shared).every((k) => baseParams[k] === shared[k])) {
    measured.add(offloadMode(baseParams));
  }

  const candidates = [
    ...[true, false].map((cpuMoe) => ({
      label: `ngl=999 cpuMoe=${cpuMoe}`,
      extra: { cpuMoe },
    })),
    ...[0, 5, 10, 15, 20]
      .filter((n) => n <= model.totalLayers)
      .map((n) => ({ label: `n-cpu-moe=${n}`, extra: { nCpuMoe: n } })),
  ];

  for (const { label, extra } of candidates) {
    const mode = offloadMode(extra);
    if (measured.has(mode)) continue;
    measured.add(mode);

    const r = await testConfig(model, label, { ...shared, ...extra });
    if (r) results.push(r);
  }

  if (results.length === 0) {
    log(" PHASE 1: No config worked!");
    return null;
  }

  const best = results.reduce((a, b) => (a.avg_tps > b.avg_tps ? a : b));
  log(`\n ★ Phase 1 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}

/**
 * Phase 2: sweeps the thread count (2-12 in steps of 2) on top of the
 * previous winner's parameters, skipping the value that winner already used.
 *
 * @param {{name: string}} model - Model entry.
 * @param {{params: object, avg_tps: number, label: string}} prev - Winner of
 *   the previous phase; it competes with the new measurements.
 * @returns {Promise<object>} Result with the highest avg_tps (the later one
 *   on ties).
 */
async function phase2_threads(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 2: CPU Thread Sweep — ${model.name}`);
  log(rule);

  const base = prev.params;
  const threadCounts = [2, 4, 6, 8, 10, 12].filter((t) => t !== base.t);

  const contenders = [prev];
  for (const t of threadCounts) {
    const outcome = await testConfig(model, `t=${t}`, { ...base, t });
    if (outcome) contenders.push(outcome);
  }

  // Keep the current leader only when it is strictly faster.
  let winner = contenders[0];
  for (const challenger of contenders.slice(1)) {
    if (!(winner.avg_tps > challenger.avg_tps)) winner = challenger;
  }

  log(`\n ★ Phase 2 winner: ${winner.label} → ${winner.avg_tps.toFixed(2)} t/s`);
  return winner;
}

/**
 * Phase 3: sweeps (ub, b) batch-size pairs on top of the previous winner's
 * parameters, skipping the pair that winner already used.
 *
 * @param {{name: string}} model - Model entry.
 * @param {{params: object, avg_tps: number, label: string}} prev - Winner of
 *   the previous phase; it competes with the new measurements.
 * @returns {Promise<object>} Result with the highest avg_tps (the later one
 *   on ties).
 */
async function phase3_batch(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 3: Batch Size Sweep — ${model.name}`);
  log(rule);

  const base = prev.params;
  const batchPairs = [
    [128, 512],
    [256, 1024],
    [256, 2048],
    [512, 1024],
    [512, 2048],
    [512, 4096],
    [1024, 2048],
    [1024, 4096],
  ].filter(([ub, b]) => !(ub === base.ub && b === base.b));

  const contenders = [prev];
  for (const [ub, b] of batchPairs) {
    const outcome = await testConfig(model, `ub=${ub} b=${b}`, { ...base, ub, b });
    if (outcome) contenders.push(outcome);
  }

  // Keep the current leader only when it is strictly faster.
  let winner = contenders[0];
  for (const challenger of contenders.slice(1)) {
    if (!(winner.avg_tps > challenger.avg_tps)) winner = challenger;
  }

  log(`\n ★ Phase 3 winner: ${winner.label} → ${winner.avg_tps.toFixed(2)} t/s`);
  return winner;
}

/**
 * Phase 4: sweeps KV-cache type pairs (K type / V type) on top of the
 * previous winner's parameters, skipping the pair that winner already used.
 *
 * @param {{name: string}} model - Model entry.
 * @param {{params: object, avg_tps: number, label: string}} prev - Winner of
 *   the previous phase; it competes with the new measurements.
 * @returns {Promise<object>} Result with the highest avg_tps (the later one
 *   on ties).
 */
async function phase4_kvcache(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 4: KV Cache Type Sweep — ${model.name}`);
  log(rule);

  const base = prev.params;
  const cachePairs = [
    ["q4_0", "q4_0"],
    ["q8_0", "q8_0"],
    ["q4_0", "q8_0"],
    ["f16", "f16"],
  ].filter(([ctk, ctv]) => !(ctk === base.ctk && ctv === base.ctv));

  const contenders = [prev];
  for (const [ctk, ctv] of cachePairs) {
    const outcome = await testConfig(model, `kv=${ctk}/${ctv}`, { ...base, ctk, ctv });
    if (outcome) contenders.push(outcome);
  }

  // Keep the current leader only when it is strictly faster.
  let winner = contenders[0];
  for (const challenger of contenders.slice(1)) {
    if (!(winner.avg_tps > challenger.avg_tps)) winner = challenger;
  }

  log(`\n ★ Phase 4 winner: ${winner.label} → ${winner.avg_tps.toFixed(2)} t/s`);
  return winner;
}

/**
 * Phase 5: re-measures the winning configuration with five timed runs.
 *
 * The spawned server is always killed before this function returns, including
 * when something in between throws.
 *
 * @param {{name: string, quant: string, path: string}} model - Model entry.
 * @param {{params: object}} prev - Winner of the previous phase.
 * @returns {Promise<object>} A "FINAL-<model name>" result record (also
 *   appended to ALL_RESULTS), or `prev` unchanged when the server did not
 *   start or every timed run failed.
 */
async function phase5_final(model, prev) {
  const FINAL_RUNS = 5;
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 5: Final Verification (${FINAL_RUNS} runs) — ${model.name}`);
  log(rule);

  await killServer();
  const proc = startServer(model.path, prev.params);
  const speeds = [];
  let bootTime;
  let vram;

  try {
    // Default timeout; the process lets the wait end early if the server has
    // died (an implementation that only takes the timeout ignores it).
    const boot = await waitForServer(undefined, proc);
    if (!boot.ok) {
      log(" FAILED to start!");
      return prev;
    }
    bootTime = boot.bootTime;
    vram = getVramAll();

    // Warmup: result is discarded, and a failure here is not fatal.
    try {
      await runBenchmark(20);
    } catch (e) {
      log(` Warmup failed (${e.message})`);
    }

    for (let run = 1; run <= FINAL_RUNS; run++) {
      try {
        const { tps } = await runBenchmark();
        speeds.push(tps);
        log(` Final Run ${run}: ${tps.toFixed(2)} t/s`);
      } catch (e) {
        log(` Final Run ${run}: ERROR (${e.message})`);
      }
    }
  } finally {
    proc.kill("SIGKILL");
  }

  if (speeds.length === 0) {
    return prev;
  }

  const avg = speeds.reduce((sum, v) => sum + v, 0) / speeds.length;
  const best = Math.max(...speeds);
  log(`\n ★ FINAL: AVG ${avg.toFixed(2)} t/s | BEST ${best.toFixed(2)} t/s`);

  const finalResult = {
    model: model.name,
    quant: model.quant,
    label: `FINAL-${model.name}`,
    avg_tps: +avg.toFixed(2),
    best_tps: +best.toFixed(2),
    boot_time: +bootTime.toFixed(1),
    vram,
    params: prev.params,
  };
  ALL_RESULTS.push(finalResult);
  return finalResult;
}

// ─── Main ──────────────────────────────────────────────────────

/**
 * Runs the full tuning pipeline (phases 0-5) for one model.
 *
 * @param {{name: string, path: string}} model - Model entry.
 * @returns {Promise<object|null>} The final winning result, or null when the
 *   model file is missing or no configuration boots.
 */
async function runModelBenchmark(model) {
  const rule = "#".repeat(70);
  log(`\n${rule}`);
  log(` MODEL: ${model.name}`);
  log(` File: ${model.path}`);

  let sizeLine = ` Size: unknown`;
  try {
    const gib = statSync(model.path).size / 1024 ** 3;
    sizeLine = ` Size: ${gib.toFixed(2)} GB`;
  } catch {
    // File cannot be inspected; the "unknown" line is printed instead.
  }
  log(sizeLine);
  log(rule);

  if (!existsSync(model.path)) {
    log(` SKIP: Model file not found!`);
    return null;
  }

  const baseline = await phase0_bootTest(model);
  if (!baseline) {
    log(` SKIP: Cannot boot at 256K!`);
    return null;
  }

  const offloadWinner = await phase1_gpuOffload(model, baseline);
  if (!offloadWinner) {
    return baseline;
  }

  // Each phase starts from the winner of the one before it.
  const tuningPhases = [phase2_threads, phase3_batch, phase4_kvcache, phase5_final];
  let best = offloadWinner;
  for (const phase of tuningPhases) {
    best = await phase(model, best);
  }
  return best;
}

/**
 * Entry point: benchmarks every model in MODELS, saves all measurements as
 * JSON after each model, then ranks the per-model winners by average
 * throughput and writes a text summary with a recommended server command.
 *
 * Output files (relative to the working directory):
 *   scripts/dual_gpu_results.json  - every recorded configuration
 *   scripts/dual_gpu_summary.txt   - ranking and recommended command
 *
 * @returns {Promise<void>}
 */
async function main() {
  const OUTPUT_DIR = "scripts";
  const RESULTS_PATH = "scripts/dual_gpu_results.json";
  const SUMMARY_PATH = "scripts/dual_gpu_summary.txt";
  const rule70 = "=".repeat(70);
  const rule60 = "=".repeat(60);
  const saveResults = () =>
    writeFileSync(RESULTS_PATH, JSON.stringify(ALL_RESULTS, null, 2));

  const startTime = Date.now();

  log(rule70);
  log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK");
  log(" 2x RTX 3060 (24GB Total) | 256K Context");
  log(` Models: ${MODELS.length}`);
  log(` Started: ${new Date().toISOString()}`);
  log(rule70);

  for (const g of getVramAll()) {
    log(` GPU ${g.gpu}: ${g.used}/${g.total} MiB used`);
  }

  // Make sure the first intermediate save cannot fail on a missing directory
  // after a long benchmark run.
  mkdirSync(OUTPUT_DIR, { recursive: true });

  const winners = [];
  for (const [idx, model] of MODELS.entries()) {
    log(`\n${rule70}`);
    log(` STARTING MODEL ${idx + 1}/${MODELS.length}: ${model.name}`);
    log(rule70);

    const winner = await runModelBenchmark(model);
    if (winner) winners.push(winner);

    // Save intermediate results so a later crash does not lose them.
    saveResults();
    log(` Intermediate saved (${ALL_RESULTS.length} configs tested)`);
  }

  // ─── Grand Final ───────────────────────────────────────────
  const elapsed = (Date.now() - startTime) / 60000;

  log(`\n${rule70}`);
  log(` GRAND FINAL COMPARISON`);
  log(` Total time: ${elapsed.toFixed(1)} minutes`);
  log(` Configs tested: ${ALL_RESULTS.length}`);
  log(rule70);

  if (winners.length === 0) {
    log(" No models ran at 256K!");
    return;
  }

  winners.sort((a, b) => b.avg_tps - a.avg_tps);
  const medals = ["🥇", "🥈", "🥉", " "];

  const lines = [
    `Dual-GPU Benchmark Results — ${new Date().toISOString()}`,
    `Hardware: 2x RTX 3060 12GB | Context: 256K`,
    `Configs tested: ${ALL_RESULTS.length} | Time: ${elapsed.toFixed(1)} min`,
    "",
    rule60,
    " RANKING (by AVG t/s)",
    rule60,
  ];

  winners.forEach((w, rank) => {
    const p = w.params;
    lines.push(
      "",
      ` ${medals[rank] || " "} #${rank + 1}: ${w.model}`,
      ` AVG: ${w.avg_tps.toFixed(2)} t/s | BEST: ${w.best_tps.toFixed(2)} t/s`,
      ` Boot: ${w.boot_time.toFixed(0)}s`,
      ` ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b}`,
      ` ctk=${p.ctk} ctv=${p.ctv}`
    );
    if (p.cpuMoe) lines.push(` --cpu-moe`);
    else if ((p.nCpuMoe || 0) > 0) lines.push(` --n-cpu-moe ${p.nCpuMoe}`);
  });

  const champ = winners[0];
  const cp = champ.params;
  lines.push("", rule60);
  lines.push(` ★ CHAMPION: ${champ.model}`);
  lines.push(` ${champ.avg_tps.toFixed(2)} t/s average`);
  lines.push(rule60);

  // Build recommended command
  const champPath = MODELS.find((m) => m.name === champ.model).path;
  const cmdParts = [
    `llama-server --model ${champPath}`,
    `-ngl ${cp.ngl} -c ${CONTEXT}`,
    `-t ${cp.t} -tb ${cp.t}`,
    `-ub ${cp.ub} -b ${cp.b}`,
    `-fa on`,
    `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
    // `??` rather than `||`: a priority of 0 is passed to the server as-is
    // during the benchmark, so it must be shown as 0 here too.
    `--prio ${cp.prio ?? 3} --poll 50`,
    `--mlock`,
  ];
  if (cp.cpuMoe) cmdParts.push("--cpu-moe");
  else if ((cp.nCpuMoe || 0) > 0) cmdParts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
  if (cp.nommap) cmdParts.push("--no-mmap");
  cmdParts.push("--port 8000 --host 0.0.0.0");

  lines.push("", " Recommended command:");
  lines.push(` ${cmdParts.join(" ")}`);

  const summary = lines.join("\n");
  console.log(summary);
  writeFileSync(SUMMARY_PATH, summary, "utf-8");
  saveResults();

  log(`\n Results: ${RESULTS_PATH}`);
  log(` Summary: ${SUMMARY_PATH}`);
  log(` DONE!`);

  await killServer();
}

// Script entry: run the benchmark; any error that escapes main() is printed
// and the process exits with status 1.
main().catch((err) => {
  console.error("Fatal error:", err);
  process.exit(1);
});