/**
 * Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
 * =====================================================================
 * Tests 4 models across multiple parameter configurations to find
 * the best model + settings for a 256K-context coding agent.
 *
 * Models:
 *   1. Qwen3.5-35B-A3B Q4_K_M     (~20.5 GB)
 *   2. Qwen3.5-35B-A3B MXFP4_MOE  (~20.1 GB)
 *   3. Gemma4 26B-A4B Q4_K_M      (~15.6 GB)
 *   4. Gemma4 26B-A4B MXFP4_MOE   (~15.5 GB)
 *
 * Run: node scripts/dual_gpu_benchmark.mjs
 */

import { spawn, execSync } from "child_process";
import { writeFileSync, statSync, existsSync } from "fs";
import { resolve } from "path";

// ─── Configuration ─────────────────────────────────────────────
const BASE_URL = "http://127.0.0.1:8000";
const LLAMA_SERVER = String.raw`llama_bin_run\llama-server.exe`;
const CONTEXT = 262144; // 256K
const BENCHMARK_RUNS = 3;
const FINAL_RUNS = 5;
const BENCHMARK_TOKENS = 200;
const SERVER_TIMEOUT = 300_000; // ms

/** Starting parameters shared by the boot test and the offload sweep. */
const BASE_PARAMS = Object.freeze({
  ngl: 999,
  t: 6,
  ub: 512,
  b: 2048,
  ctk: "q4_0",
  ctv: "q4_0",
});

const MODELS = [
  {
    name: "Qwen3.5-35B-A3B Q4_K_M",
    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
    type: "qwen",
    quant: "Q4_K_M",
    totalLayers: 64,
  },
  {
    name: "Qwen3.5-35B-A3B MXFP4_MOE",
    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
    type: "qwen",
    quant: "MXFP4_MOE",
    totalLayers: 64,
  },
  {
    name: "Gemma4 26B-A4B Q4_K_M",
    path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
    type: "gemma4",
    quant: "Q4_K_M",
    totalLayers: 30,
  },
  {
    name: "Gemma4 26B-A4B MXFP4_MOE",
    path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
    type: "gemma4",
    quant: "MXFP4_MOE",
    totalLayers: 30,
  },
];

/** Every successfully measured configuration, in the order it was tested. */
const ALL_RESULTS = [];

// ─── Utility ───────────────────────────────────────────────────

/** Prints `msg` to stdout prefixed with a 24-hour wall-clock timestamp. */
function log(msg) {
  const ts = new Date().toLocaleTimeString("ko-KR", { hour12: false });
  console.log(`[${ts}] ${msg}`);
}

/** Returns a promise that resolves after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((r) => setTimeout(r, ms));
}

/** Logs a three-line banner: a blank line plus rule, the title, and a closing rule. */
function banner(title) {
  log(`\n${"=".repeat(70)}`);
  log(title);
  log(`${"=".repeat(70)}`);
}

/**
 * Force-kills every llama-server.exe process, then waits 5 seconds.
 * @returns {Promise<void>} resolves once the 5 second pause has elapsed
 */
function killServer() {
  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // Best effort: execSync throws whenever taskkill exits non-zero
    // (for example when there is nothing to kill). Nothing to recover.
  }
  return sleep(5000);
}

/**
 * Parses `nvidia-smi --format=csv,noheader,nounits` output for the columns
 * index, memory.used, memory.total.
 * @param {string} csv - raw command output
 * @returns {{gpu: number, used: number, total: number}[]} one entry per non-empty line
 */
function parseVramCsv(csv) {
  return csv
    .split("\n")
    .map((line) => line.trim())
    // Skip blank lines so empty output yields [] rather than a row of NaNs.
    .filter((line) => line.length > 0)
    .map((line) => {
      const [gpu, used, total] = line
        .split(",")
        .map((field) => Number.parseInt(field.trim(), 10));
      return { gpu, used, total };
    });
}

/**
 * Queries per-GPU memory usage through nvidia-smi.
 * @returns {{gpu: number, used: number, total: number}[]} empty when the query fails
 */
function getVramAll() {
  try {
    const out = execSync(
      "nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits",
      { encoding: "utf-8", timeout: 5000 }
    );
    return parseVramCsv(out);
  } catch {
    // nvidia-smi missing, failing, or timing out: report "no data".
    return [];
  }
}

/**
 * Builds the llama-server argv for one configuration.
 * @param {string} modelPath - path passed to --model
 * @param {object} params - ngl, t, ub, b, ctk, ctv plus optional cpuMoe, nCpuMoe, prio, nommap
 * @returns {string[]} executable followed by its arguments
 */
function buildCmd(modelPath, params) {
  const {
    ngl,
    t,
    ub,
    b,
    ctk,
    ctv,
    cpuMoe = false,
    nCpuMoe = 0,
    prio = 3,
    nommap = false,
  } = params;
  const cmd = [
    LLAMA_SERVER,
    "--model", modelPath,
    "-ngl", String(ngl),
    "-c", String(CONTEXT),
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", ctk,
    "--cache-type-v", ctv,
    "-ub", String(ub),
    "-b", String(b),
    "-t", String(t),
    "-tb", String(t),
    "--prio", String(prio),
    "--poll", "50",
    "--mlock",
    "--port", "8000",
    "--host", "0.0.0.0",
  ];
  // --cpu-moe takes precedence over --n-cpu-moe.
  if (cpuMoe) cmd.push("--cpu-moe");
  else if (nCpuMoe > 0) cmd.push("--n-cpu-moe", String(nCpuMoe));
  if (nommap) cmd.push("--no-mmap");
  return cmd;
}

/**
 * Identifies the offload part of a configuration exactly as buildCmd renders it,
 * so that parameter objects producing the same command line compare equal
 * (e.g. `cpuMoe: false`, `nCpuMoe: 0`, and both omitted).
 * @param {object} params - configuration parameters
 * @returns {string} key of the form "<ngl>:<all|n>"
 */
function offloadKey({ ngl, cpuMoe = false, nCpuMoe = 0 }) {
  const moe = cpuMoe ? "all" : String(nCpuMoe > 0 ? nCpuMoe : 0);
  return `${ngl}:${moe}`;
}

/**
 * Spawns llama-server for the given model and parameters.
 * @param {string} modelPath - model file to load
 * @param {object} params - see buildCmd
 * @returns {import("child_process").ChildProcess} the spawned server process
 */
function startServer(modelPath, params) {
  const [exe, ...args] = buildCmd(modelPath, params);
  log(` CMD: ${exe} ${args.slice(-12).join(" ")} ...`);
  const proc = spawn(exe, args, {
    cwd: process.cwd(),
    stdio: ["ignore", "pipe", "pipe"],
  });
  // The output is piped but never consumed by this script; drain it so the
  // server cannot block on a full pipe buffer. The streams may be missing
  // when the spawn itself failed, hence the optional chaining.
  proc.stdout?.resume();
  proc.stderr?.resume();
  // Without a listener a failed spawn (e.g. missing executable) would be an
  // uncaught 'error' event and abort the whole benchmark.
  proc.on("error", (err) => log(` Server process error: ${err.message}`));
  return proc;
}

/**
 * Polls GET /health every 3 seconds until the server reports status "ok".
 * @param {number} [timeoutMs=SERVER_TIMEOUT] - how long to keep polling
 * @param {import("child_process").ChildProcess|null} [proc=null] - when given,
 *   polling stops as soon as this process is no longer running
 * @returns {Promise<{ok: boolean, bootTime: number, reason?: string}>} bootTime
 *   is in seconds; on failure `reason` is "exited" or "timeout"
 */
async function waitForServer(timeoutMs = SERVER_TIMEOUT, proc = null) {
  const start = Date.now();
  const elapsedSec = () => (Date.now() - start) / 1000;
  while (Date.now() - start < timeoutMs) {
    // exitCode/signalCode stay null while the child is running.
    if (proc && (proc.exitCode !== null || proc.signalCode !== null)) {
      return { ok: false, bootTime: elapsedSec(), reason: "exited" };
    }
    try {
      const resp = await fetch(`${BASE_URL}/health`, {
        signal: AbortSignal.timeout(3000),
      });
      const data = await resp.json();
      if (data.status === "ok") return { ok: true, bootTime: elapsedSec() };
    } catch {
      // Connection refused, request timeout, or a non-JSON body: keep polling.
    }
    await sleep(3000);
  }
  return { ok: false, bootTime: timeoutMs / 1000, reason: "timeout" };
}

/**
 * Converts a completion response's `usage` object into throughput figures.
 * @param {object|undefined} usage - `usage` field of the completion response
 * @param {number} elapsed - request duration in seconds
 * @returns {{tps: number, completionTokens: number, promptTokens: number, elapsed: number}}
 * @throws {Error} when no completion tokens were reported, so that a failed
 *   request is never recorded as a 0 t/s measurement
 */
function toThroughput(usage, elapsed) {
  const completionTokens = usage?.completion_tokens ?? 0;
  if (!(completionTokens > 0)) {
    throw new Error("server reported no completion tokens");
  }
  return {
    tps: elapsed > 0 ? completionTokens / elapsed : 0,
    completionTokens,
    promptTokens: usage.prompt_tokens ?? 0,
    elapsed,
  };
}

/**
 * Sends one chat completion request and measures end-to-end tokens per second.
 * @param {number} [maxTokens=BENCHMARK_TOKENS] - max_tokens for the request
 * @returns {Promise<{tps: number, completionTokens: number, promptTokens: number, elapsed: number}>}
 * @throws {Error} on network failure, non-2xx status, or a response without completion tokens
 */
async function runBenchmark(maxTokens = BENCHMARK_TOKENS) {
  const payload = JSON.stringify({
    model: "local-model",
    messages: [
      { role: "user", content: "Count from 1 to 50, writing each number on a new line." },
    ],
    max_tokens: maxTokens,
    temperature: 0.0,
  });
  const start = Date.now();
  const resp = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
    signal: AbortSignal.timeout(600_000),
  });
  if (!resp.ok) {
    throw new Error(`HTTP ${resp.status} from /v1/chat/completions`);
  }
  const result = await resp.json();
  const elapsed = (Date.now() - start) / 1000;
  return toThroughput(result.usage, elapsed);
}

/**
 * Averages a non-empty list of speeds and finds the maximum.
 * @param {number[]} speeds - tokens/second samples (must not be empty)
 * @returns {{avg: number, best: number}}
 */
function summarizeSpeeds(speeds) {
  const total = speeds.reduce((sum, s) => sum + s, 0);
  return { avg: total / speeds.length, best: Math.max(...speeds) };
}

/**
 * Returns the result with the highest avg_tps; on a tie the later entry wins.
 * @param {object[]} results - non-empty list of result records
 * @returns {object} the fastest record
 */
function pickFastest(results) {
  return results.reduce((a, b) => (a.avg_tps > b.avg_tps ? a : b));
}

/** Builds the record stored in ALL_RESULTS for one measured configuration. */
function makeResult(model, label, speeds, bootTime, vram, params) {
  const { avg, best } = summarizeSpeeds(speeds);
  return {
    model: model.name,
    quant: model.quant,
    label,
    avg_tps: +avg.toFixed(2),
    best_tps: +best.toFixed(2),
    boot_time: +bootTime.toFixed(1),
    vram,
    params,
  };
}

/**
 * Runs `count` benchmark requests against the running server.
 * @param {number} count - number of requests
 * @param {string} runLabel - prefix used in the per-run log lines
 * @returns {Promise<number[]>} tokens/second of every run that succeeded
 */
async function runTrials(count, runLabel) {
  const speeds = [];
  for (let i = 0; i < count; i++) {
    try {
      const r = await runBenchmark();
      speeds.push(r.tps);
      log(` ${runLabel} ${i + 1}: ${r.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` ${runLabel} ${i + 1}: ERROR (${e.message})`);
    }
  }
  return speeds;
}

/**
 * Boots a fresh server for `params`, warms it up, measures it, and always
 * kills it again.
 * @param {object} model - entry of MODELS
 * @param {object} params - see buildCmd
 * @param {{runs: number, runLabel: string, tag?: string|null}} options -
 *   `tag`, when set, enables the "[tag] Starting server..." and boot/VRAM log lines
 * @returns {Promise<{booted: boolean, reason?: string, bootTime: number, vram: object[], speeds: number[]}>}
 */
async function measureConfig(model, params, { runs, runLabel, tag = null }) {
  await killServer();
  if (tag) log(` [${tag}] Starting server...`);
  const proc = startServer(model.path, params);
  try {
    const boot = await waitForServer(SERVER_TIMEOUT, proc);
    if (!boot.ok) {
      return { booted: false, reason: boot.reason, bootTime: boot.bootTime, vram: [], speeds: [] };
    }
    const vram = getVramAll();
    if (tag) {
      const vramStr = vram.map((g) => `GPU${g.gpu}:${g.used}/${g.total}MiB`).join(" | ");
      log(` [${tag}] Boot: ${boot.bootTime.toFixed(0)}s | VRAM: ${vramStr}`);
    }
    // Warmup: the result is discarded and a failure here is not fatal.
    try {
      await runBenchmark(20);
    } catch (e) {
      log(` Warmup failed (${e.message})`);
    }
    const speeds = await runTrials(runs, runLabel);
    return { booted: true, bootTime: boot.bootTime, vram, speeds };
  } finally {
    // Runs on every path so a server never outlives its measurement.
    proc.kill("SIGKILL");
  }
}

/**
 * Measures one configuration with BENCHMARK_RUNS runs and records it in ALL_RESULTS.
 * @param {object} model - entry of MODELS
 * @param {string} label - short name used in logs and in the result record
 * @param {object} params - see buildCmd
 * @returns {Promise<object|null>} the result record, or null when the server
 *   did not start or every run failed
 */
async function testConfig(model, label, params) {
  const run = await measureConfig(model, params, {
    runs: BENCHMARK_RUNS,
    runLabel: "Run",
    tag: label,
  });
  if (!run.booted) {
    const why =
      run.reason === "exited"
        ? "server process exited"
        : `timeout ${SERVER_TIMEOUT / 1000}s`;
    log(` [${label}] FAILED to start (${why})`);
    return null;
  }
  if (run.speeds.length === 0) {
    log(` [${label}] ALL BENCHMARK RUNS FAILED`);
    return null;
  }
  const result = makeResult(model, label, run.speeds, run.bootTime, run.vram, params);
  log(
    ` [${label}] => AVG: ${result.avg_tps.toFixed(2)} t/s | BEST: ${result.best_tps.toFixed(2)} t/s`
  );
  ALL_RESULTS.push(result);
  return result;
}

// ─── Phase Runners ─────────────────────────────────────────────

/**
 * Phase 0: finds any configuration that boots at the configured context size.
 * Tries full GPU offload, then --cpu-moe, then half of the model's layers.
 * @returns {Promise<object|null>} the first working result, or null
 */
async function phase0_bootTest(model) {
  banner(` PHASE 0: Boot Test — ${model.name}`);

  // Try full GPU first
  let r = await testConfig(model, "boot-ngl999", { ...BASE_PARAMS });
  if (r) return r;

  // Try with cpu-moe
  log(" Full GPU failed, trying with --cpu-moe...");
  r = await testConfig(model, "boot-cpumoe", { ...BASE_PARAMS, cpuMoe: true });
  if (r) return r;

  // Reduced layers
  log(" --cpu-moe also failed, trying reduced layers...");
  return testConfig(model, "boot-ngl-half", {
    ...BASE_PARAMS,
    ngl: Math.floor(model.totalLayers / 2),
  });
}

/**
 * Phase 1: compares offload strategies (--cpu-moe on/off, --n-cpu-moe sweep).
 * Candidates that would produce the same command line as the baseline or as
 * an earlier candidate are skipped instead of being measured again.
 * @param {object} model - entry of MODELS
 * @param {object|null} baseline - result of phase 0, included in the comparison
 * @returns {Promise<object|null>} fastest result, or null when nothing worked
 */
async function phase1_gpuOffload(model, baseline) {
  banner(` PHASE 1: GPU Offload Strategy — ${model.name}`);

  const results = baseline ? [baseline] : [];
  const candidates = [
    // Test --cpu-moe on/off
    ...[true, false].map((cpuMoe) => ({
      label: `ngl=999 cpuMoe=${cpuMoe}`,
      params: { ...BASE_PARAMS, cpuMoe },
    })),
    // n-cpu-moe sweep
    ...[0, 5, 10, 15, 20]
      .filter((n) => n <= model.totalLayers)
      .map((n) => ({
        label: `n-cpu-moe=${n}`,
        params: { ...BASE_PARAMS, nCpuMoe: n },
      })),
  ];

  const seen = new Set(baseline ? [offloadKey(baseline.params)] : []);
  for (const { label, params } of candidates) {
    const key = offloadKey(params);
    if (seen.has(key)) continue;
    seen.add(key);
    const r = await testConfig(model, label, params);
    if (r) results.push(r);
  }

  if (results.length === 0) {
    log(" PHASE 1: No config worked!");
    return null;
  }
  const best = pickFastest(results);
  log(`\n ★ Phase 1 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}

/**
 * Phase 2: sweeps the CPU thread count on top of the previous winner.
 * @returns {Promise<object>} fastest result, `prev` included
 */
async function phase2_threads(model, prev) {
  banner(` PHASE 2: CPU Thread Sweep — ${model.name}`);

  const p = prev.params;
  const results = [prev];
  for (const t of [2, 4, 6, 8, 10, 12]) {
    if (t === p.t) continue; // already measured as `prev`
    const r = await testConfig(model, `t=${t}`, { ...p, t });
    if (r) results.push(r);
  }
  const best = pickFastest(results);
  log(`\n ★ Phase 2 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}

/**
 * Phase 3: sweeps (-ub, -b) batch size pairs on top of the previous winner.
 * @returns {Promise<object>} fastest result, `prev` included
 */
async function phase3_batch(model, prev) {
  banner(` PHASE 3: Batch Size Sweep — ${model.name}`);

  const p = prev.params;
  const results = [prev];
  const pairs = [
    [128, 512],
    [256, 1024],
    [256, 2048],
    [512, 1024],
    [512, 2048],
    [512, 4096],
    [1024, 2048],
    [1024, 4096],
  ];
  for (const [ub, b] of pairs) {
    if (ub === p.ub && b === p.b) continue; // already measured as `prev`
    const r = await testConfig(model, `ub=${ub} b=${b}`, { ...p, ub, b });
    if (r) results.push(r);
  }
  const best = pickFastest(results);
  log(`\n ★ Phase 3 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}

/**
 * Phase 4: sweeps KV cache types on top of the previous winner.
 * @returns {Promise<object>} fastest result, `prev` included
 */
async function phase4_kvcache(model, prev) {
  banner(` PHASE 4: KV Cache Type Sweep — ${model.name}`);

  const p = prev.params;
  const results = [prev];
  const pairs = [
    ["q4_0", "q4_0"],
    ["q8_0", "q8_0"],
    ["q4_0", "q8_0"],
    ["f16", "f16"],
  ];
  for (const [ctk, ctv] of pairs) {
    if (ctk === p.ctk && ctv === p.ctv) continue; // already measured as `prev`
    const r = await testConfig(model, `kv=${ctk}/${ctv}`, { ...p, ctk, ctv });
    if (r) results.push(r);
  }
  const best = pickFastest(results);
  log(`\n ★ Phase 4 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}

/**
 * Phase 5: re-measures the winning parameters with FINAL_RUNS runs.
 * @returns {Promise<object>} the final record, or `prev` unchanged when the
 *   server did not start or every run failed
 */
async function phase5_final(model, prev) {
  banner(` PHASE 5: Final Verification (${FINAL_RUNS} runs) — ${model.name}`);

  const run = await measureConfig(model, prev.params, {
    runs: FINAL_RUNS,
    runLabel: "Final Run",
  });
  if (!run.booted) {
    log(" FAILED to start!");
    return prev;
  }
  if (run.speeds.length === 0) return prev;

  const final_ = makeResult(
    model,
    `FINAL-${model.name}`,
    run.speeds,
    run.bootTime,
    run.vram,
    prev.params
  );
  log(
    `\n ★ FINAL: AVG ${final_.avg_tps.toFixed(2)} t/s | BEST ${final_.best_tps.toFixed(2)} t/s`
  );
  ALL_RESULTS.push(final_);
  return final_;
}

// ─── Main ──────────────────────────────────────────────────────

/**
 * Runs all phases for one model.
 * @param {object} model - entry of MODELS
 * @returns {Promise<object|null>} the model's best result, or null when the
 *   model file is missing or no configuration boots
 */
async function runModelBenchmark(model) {
  log(`\n${"#".repeat(70)}`);
  log(` MODEL: ${model.name}`);
  log(` File: ${model.path}`);
  try {
    const sz = statSync(model.path).size / 1024 ** 3;
    log(` Size: ${sz.toFixed(2)} GB`);
  } catch {
    // statSync throws when the file cannot be read; existence is checked below.
    log(` Size: unknown`);
  }
  log(`${"#".repeat(70)}`);

  if (!existsSync(model.path)) {
    log(` SKIP: Model file not found!`);
    return null;
  }

  const baseline = await phase0_bootTest(model);
  if (!baseline) {
    log(` SKIP: Cannot boot at 256K!`);
    return null;
  }

  let best = await phase1_gpuOffload(model, baseline);
  if (!best) return baseline;
  best = await phase2_threads(model, best);
  best = await phase3_batch(model, best);
  best = await phase4_kvcache(model, best);
  best = await phase5_final(model, best);
  return best;
}

/**
 * Benchmarks every model, saves intermediate JSON after each one, then prints
 * and writes the final ranking and a recommended llama-server command.
 */
async function main() {
  const startTime = Date.now();
  log("=".repeat(70));
  log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK");
  log(" 2x RTX 3060 (24GB Total) | 256K Context");
  log(` Models: ${MODELS.length}`);
  log(` Started: ${new Date().toISOString()}`);
  log("=".repeat(70));

  const gpus = getVramAll();
  gpus.forEach((g) => log(` GPU ${g.gpu}: ${g.used}/${g.total} MiB used`));

  const winners = [];
  for (let i = 0; i < MODELS.length; i++) {
    banner(` STARTING MODEL ${i + 1}/${MODELS.length}: ${MODELS[i].name}`);

    const winner = await runModelBenchmark(MODELS[i]);
    if (winner) winners.push(winner);

    // Save intermediate
    writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL_RESULTS, null, 2));
    log(` Intermediate saved (${ALL_RESULTS.length} configs tested)`);
  }

  // ─── Grand Final ───────────────────────────────────────────
  const elapsed = (Date.now() - startTime) / 60000;
  log(`\n${"=".repeat(70)}`);
  log(` GRAND FINAL COMPARISON`);
  log(` Total time: ${elapsed.toFixed(1)} minutes`);
  log(` Configs tested: ${ALL_RESULTS.length}`);
  log(`${"=".repeat(70)}`);

  if (winners.length === 0) {
    log(" No models ran at 256K!");
    return;
  }

  winners.sort((a, b) => b.avg_tps - a.avg_tps);
  const medals = ["🥇", "🥈", "🥉", " "];
  const lines = [
    `Dual-GPU Benchmark Results — ${new Date().toISOString()}`,
    `Hardware: 2x RTX 3060 12GB | Context: 256K`,
    `Configs tested: ${ALL_RESULTS.length} | Time: ${elapsed.toFixed(1)} min`,
    "",
    "=".repeat(60),
    " RANKING (by AVG t/s)",
    "=".repeat(60),
  ];

  for (let i = 0; i < winners.length; i++) {
    const w = winners[i];
    const p = w.params;
    lines.push("");
    lines.push(` ${medals[i] || " "} #${i + 1}: ${w.model}`);
    lines.push(` AVG: ${w.avg_tps.toFixed(2)} t/s | BEST: ${w.best_tps.toFixed(2)} t/s`);
    lines.push(` Boot: ${w.boot_time.toFixed(0)}s`);
    lines.push(` ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b}`);
    lines.push(` ctk=${p.ctk} ctv=${p.ctv}`);
    if (p.cpuMoe) lines.push(` --cpu-moe`);
    else if ((p.nCpuMoe || 0) > 0) lines.push(` --n-cpu-moe ${p.nCpuMoe}`);
  }

  const champ = winners[0];
  const cp = champ.params;
  lines.push("", "=".repeat(60));
  lines.push(` ★ CHAMPION: ${champ.model}`);
  lines.push(` ${champ.avg_tps.toFixed(2)} t/s average`);
  lines.push("=".repeat(60));

  // Build recommended command
  const cmdParts = [
    `llama-server --model ${MODELS.find((m) => m.name === champ.model).path}`,
    `-ngl ${cp.ngl} -c ${CONTEXT}`,
    `-t ${cp.t} -tb ${cp.t}`,
    `-ub ${cp.ub} -b ${cp.b}`,
    `-fa on`,
    `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
    // `??` mirrors buildCmd, which only falls back to 3 when prio is absent.
    `--prio ${cp.prio ?? 3} --poll 50`,
    `--mlock`,
  ];
  if (cp.cpuMoe) cmdParts.push("--cpu-moe");
  else if ((cp.nCpuMoe || 0) > 0) cmdParts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
  if (cp.nommap) cmdParts.push("--no-mmap");
  cmdParts.push("--port 8000 --host 0.0.0.0");

  lines.push("", " Recommended command:");
  lines.push(` ${cmdParts.join(" ")}`);

  const summary = lines.join("\n");
  console.log(summary);

  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL_RESULTS, null, 2));
  log(`\n Results: scripts/dual_gpu_results.json`);
  log(` Summary: scripts/dual_gpu_summary.txt`);
  log(` DONE!`);

  await killServer();
}

main().catch((e) => {
  console.error("Fatal error:", e);
  process.exit(1);
});