// variet_llm/scripts/dual_gpu_benchmark.mjs

/**
 * Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
 * ======================================================================
 * Tests 4 models across multiple parameter configurations to find
 * the absolute best model + settings for 256K context coding agent.
 *
 * Models:
 *   1. Qwen3.5-35B-A3B  Q4_K_M     (~20.5 GB)
 *   2. Qwen3.5-35B-A3B  MXFP4_MOE  (~20.1 GB)
 *   3. Gemma4  26B-A4B   Q4_K_M     (~15.6 GB)
 *   4. Gemma4  26B-A4B   MXFP4_MOE  (~15.5 GB)
 *
 * Run: node scripts/dual_gpu_benchmark.mjs
 */

import { spawn, execSync } from "child_process";
import { writeFileSync, statSync, existsSync } from "fs";
import { resolve } from "path";

// ─── Configuration ─────────────────────────────────────────────
const BASE_URL = "http://127.0.0.1:8000";
const LLAMA_SERVER = String.raw`llama_bin_run\llama-server.exe`;
const CONTEXT = 262144; // 256K
const BENCHMARK_RUNS = 3;
const BENCHMARK_TOKENS = 200;
const SERVER_TIMEOUT = 300_000; // ms

const MODELS = [
  {
    name: "Qwen3.5-35B-A3B Q4_K_M",
    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
    type: "qwen", quant: "Q4_K_M", totalLayers: 64,
  },
  {
    name: "Qwen3.5-35B-A3B MXFP4_MOE",
    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
    type: "qwen", quant: "MXFP4_MOE", totalLayers: 64,
  },
  {
    name: "Gemma4 26B-A4B Q4_K_M",
    path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
    type: "gemma4", quant: "Q4_K_M", totalLayers: 30,
  },
  {
    name: "Gemma4 26B-A4B MXFP4_MOE",
    path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
    type: "gemma4", quant: "MXFP4_MOE", totalLayers: 30,
  },
];

const ALL_RESULTS = [];

// ─── Utility ───────────────────────────────────────────────────

function log(msg) {
  const ts = new Date().toLocaleTimeString("ko-KR", { hour12: false });
  console.log(`[${ts}] ${msg}`);
}

function sleep(ms) {
  return new Promise((r) => setTimeout(r, ms));
}

function killServer() {
  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {}
  return sleep(5000);
}

function getVramAll() {
  try {
    const out = execSync(
      'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits',
      { encoding: "utf-8", timeout: 5000 }
    );
    return out.trim().split("\n").map((line) => {
      const [gpu, used, total] = line.split(",").map((s) => parseInt(s.trim()));
      return { gpu, used, total };
    });
  } catch {
    return [];
  }
}

function buildCmd(modelPath, params) {
  const {
    ngl, t, ub, b, ctk, ctv,
    cpuMoe = false, nCpuMoe = 0,
    prio = 3, nommap = false
  } = params;

  const cmd = [
    LLAMA_SERVER,
    "--model", modelPath,
    "-ngl", String(ngl),
    "-c", String(CONTEXT),
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", ctk,
    "--cache-type-v", ctv,
    "-ub", String(ub),
    "-b", String(b),
    "-t", String(t),
    "-tb", String(t),
    "--prio", String(prio),
    "--poll", "50",
    "--mlock",
    "--port", "8000",
    "--host", "0.0.0.0",
  ];

  if (cpuMoe) cmd.push("--cpu-moe");
  else if (nCpuMoe > 0) cmd.push("--n-cpu-moe", String(nCpuMoe));
  if (nommap) cmd.push("--no-mmap");

  return cmd;
}

function startServer(modelPath, params) {
  const args = buildCmd(modelPath, params);
  const exe = args.shift();
  log(`  CMD: ${exe} ${args.slice(-12).join(" ")} ...`);
  return spawn(exe, args, {
    cwd: process.cwd(),
    stdio: ["ignore", "pipe", "pipe"],
  });
}

async function waitForServer(timeoutMs = SERVER_TIMEOUT) {
  const start = Date.now();
  while (Date.now() - start < timeoutMs) {
    try {
      const resp = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const data = await resp.json();
      if (data.status === "ok") return { ok: true, bootTime: (Date.now() - start) / 1000 };
    } catch {}
    await sleep(3000);
  }
  return { ok: false, bootTime: timeoutMs / 1000 };
}

async function runBenchmark(maxTokens = BENCHMARK_TOKENS) {
  const payload = JSON.stringify({
    model: "local-model",
    messages: [{ role: "user", content: "Count from 1 to 50, writing each number on a new line." }],
    max_tokens: maxTokens,
    temperature: 0.0,
  });

  const start = Date.now();
  const resp = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
    signal: AbortSignal.timeout(600_000),
  });
  const result = await resp.json();
  const elapsed = (Date.now() - start) / 1000;

  const usage = result.usage || {};
  const ct = usage.completion_tokens || 0;
  return {
    tps: elapsed > 0 ? ct / elapsed : 0,
    completionTokens: ct,
    promptTokens: usage.prompt_tokens || 0,
    elapsed,
  };
}

async function testConfig(model, label, params) {
  await killServer();
  log(`  [${label}] Starting server...`);

  const proc = startServer(model.path, params);
  const { ok, bootTime } = await waitForServer();

  if (!ok) {
    log(`  [${label}] FAILED to start (timeout ${SERVER_TIMEOUT / 1000}s)`);
    proc.kill("SIGKILL");
    return null;
  }

  const vram = getVramAll();
  const vramStr = vram.map((g) => `GPU${g.gpu}:${g.used}/${g.total}MiB`).join(" | ");
  log(`  [${label}] Boot: ${bootTime.toFixed(0)}s | VRAM: ${vramStr}`);

  // Warmup
  try { await runBenchmark(20); } catch {}

  // Benchmark
  const speeds = [];
  for (let i = 0; i < BENCHMARK_RUNS; i++) {
    try {
      const r = await runBenchmark();
      speeds.push(r.tps);
      log(`    Run ${i + 1}: ${r.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(`    Run ${i + 1}: ERROR (${e.message})`);
    }
  }

  proc.kill("SIGKILL");

  if (speeds.length === 0) {
    log(`  [${label}] ALL BENCHMARK RUNS FAILED`);
    return null;
  }

  const avg = speeds.reduce((a, b) => a + b) / speeds.length;
  const best = Math.max(...speeds);
  log(`  [${label}] => AVG: ${avg.toFixed(2)} t/s | BEST: ${best.toFixed(2)} t/s`);

  const result = {
    model: model.name, quant: model.quant, label,
    avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2),
    boot_time: +bootTime.toFixed(1), vram, params,
  };
  ALL_RESULTS.push(result);
  return result;
}

// ─── Phase Runners ─────────────────────────────────────────────

async function phase0_bootTest(model) {
  log(`\n${"=".repeat(70)}`);
  log(`  PHASE 0: Boot Test — ${model.name}`);
  log(`${"=".repeat(70)}`);

  // Try full GPU first
  let r = await testConfig(model, "boot-ngl999", {
    ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0",
  });
  if (r) return r;

  // Try with cpu-moe
  log("  Full GPU failed, trying with --cpu-moe...");
  r = await testConfig(model, "boot-cpumoe", {
    ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe: true,
  });
  if (r) return r;

  // Reduced layers
  log("  --cpu-moe also failed, trying reduced layers...");
  r = await testConfig(model, "boot-ngl-half", {
    ngl: Math.floor(model.totalLayers / 2), t: 6, ub: 512, b: 2048,
    ctk: "q4_0", ctv: "q4_0",
  });
  return r;
}

async function phase1_gpuOffload(model, baseline) {
  log(`\n${"=".repeat(70)}`);
  log(`  PHASE 1: GPU Offload Strategy — ${model.name}`);
  log(`${"=".repeat(70)}`);

  const results = baseline ? [baseline] : [];

  // Test --cpu-moe on/off
  for (const cpuMoe of [true, false]) {
    const lbl = `ngl=999 cpuMoe=${cpuMoe}`;
    if (baseline?.params?.cpuMoe === cpuMoe && baseline.params.ngl === 999) continue;
    const r = await testConfig(model, lbl, {
      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe,
    });
    if (r) results.push(r);
  }

  // n-cpu-moe sweep
  for (const n of [0, 5, 10, 15, 20]) {
    if (n > model.totalLayers) continue;
    const r = await testConfig(model, `n-cpu-moe=${n}`, {
      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", nCpuMoe: n,
    });
    if (r) results.push(r);
  }

  if (results.length === 0) { log("  PHASE 1: No config worked!"); return null; }
  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
  log(`\n  ★ Phase 1 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}

async function phase2_threads(model, prev) {
  log(`\n${"=".repeat(70)}`);
  log(`  PHASE 2: CPU Thread Sweep — ${model.name}`);
  log(`${"=".repeat(70)}`);

  const p = prev.params;
  const results = [prev];

  for (const t of [2, 4, 6, 8, 10, 12]) {
    if (t === p.t) continue;
    const r = await testConfig(model, `t=${t}`, {
      ...p, t,
    });
    if (r) results.push(r);
  }

  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
  log(`\n  ★ Phase 2 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}

async function phase3_batch(model, prev) {
  log(`\n${"=".repeat(70)}`);
  log(`  PHASE 3: Batch Size Sweep — ${model.name}`);
  log(`${"=".repeat(70)}`);

  const p = prev.params;
  const results = [prev];

  for (const [ub, b] of [
    [128, 512], [256, 1024], [256, 2048],
    [512, 1024], [512, 2048], [512, 4096],
    [1024, 2048], [1024, 4096],
  ]) {
    if (ub === p.ub && b === p.b) continue;
    const r = await testConfig(model, `ub=${ub} b=${b}`, { ...p, ub, b });
    if (r) results.push(r);
  }

  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
  log(`\n  ★ Phase 3 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}

async function phase4_kvcache(model, prev) {
  log(`\n${"=".repeat(70)}`);
  log(`  PHASE 4: KV Cache Type Sweep — ${model.name}`);
  log(`${"=".repeat(70)}`);

  const p = prev.params;
  const results = [prev];

  for (const [ctk, ctv] of [
    ["q4_0", "q4_0"], ["q8_0", "q8_0"],
    ["q4_0", "q8_0"], ["f16", "f16"],
  ]) {
    if (ctk === p.ctk && ctv === p.ctv) continue;
    const r = await testConfig(model, `kv=${ctk}/${ctv}`, { ...p, ctk, ctv });
    if (r) results.push(r);
  }

  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
  log(`\n  ★ Phase 4 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}

async function phase5_final(model, prev) {
  log(`\n${"=".repeat(70)}`);
  log(`  PHASE 5: Final Verification (5 runs) — ${model.name}`);
  log(`${"=".repeat(70)}`);

  await killServer();
  const proc = startServer(model.path, prev.params);
  const { ok, bootTime } = await waitForServer();
  if (!ok) { log("  FAILED to start!"); proc.kill("SIGKILL"); return prev; }

  const vram = getVramAll();
  try { await runBenchmark(20); } catch {}

  const speeds = [];
  for (let i = 0; i < 5; i++) {
    try {
      const r = await runBenchmark();
      speeds.push(r.tps);
      log(`    Final Run ${i + 1}: ${r.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(`    Final Run ${i + 1}: ERROR (${e.message})`);
    }
  }
  proc.kill("SIGKILL");

  if (speeds.length > 0) {
    const avg = speeds.reduce((a, b) => a + b) / speeds.length;
    const best = Math.max(...speeds);
    log(`\n  ★ FINAL: AVG ${avg.toFixed(2)} t/s | BEST ${best.toFixed(2)} t/s`);

    const final_ = {
      model: model.name, quant: model.quant,
      label: `FINAL-${model.name}`,
      avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2),
      boot_time: +bootTime.toFixed(1), vram, params: prev.params,
    };
    ALL_RESULTS.push(final_);
    return final_;
  }
  return prev;
}

// ─── Main ──────────────────────────────────────────────────────

async function runModelBenchmark(model) {
  log(`\n${"#".repeat(70)}`);
  log(`  MODEL: ${model.name}`);
  log(`  File:  ${model.path}`);
  try {
    const sz = statSync(model.path).size / 1024 ** 3;
    log(`  Size:  ${sz.toFixed(2)} GB`);
  } catch { log(`  Size:  unknown`); }
  log(`${"#".repeat(70)}`);

  if (!existsSync(model.path)) {
    log(`  SKIP: Model file not found!`);
    return null;
  }

  const baseline = await phase0_bootTest(model);
  if (!baseline) { log(`  SKIP: Cannot boot at 256K!`); return null; }

  let best = await phase1_gpuOffload(model, baseline);
  if (!best) return baseline;

  best = await phase2_threads(model, best);
  best = await phase3_batch(model, best);
  best = await phase4_kvcache(model, best);
  best = await phase5_final(model, best);

  return best;
}

async function main() {
  const startTime = Date.now();

  log("=".repeat(70));
  log("  DUAL-GPU COMPREHENSIVE MODEL BENCHMARK");
  log("  2x RTX 3060 (24GB Total) | 256K Context");
  log(`  Models: ${MODELS.length}`);
  log(`  Started: ${new Date().toISOString()}`);
  log("=".repeat(70));

  const gpus = getVramAll();
  gpus.forEach((g) => log(`  GPU ${g.gpu}: ${g.used}/${g.total} MiB used`));

  const winners = [];

  for (let i = 0; i < MODELS.length; i++) {
    log(`\n${"=".repeat(70)}`);
    log(`  STARTING MODEL ${i + 1}/${MODELS.length}: ${MODELS[i].name}`);
    log(`${"=".repeat(70)}`);

    const winner = await runModelBenchmark(MODELS[i]);
    if (winner) winners.push(winner);

    // Save intermediate
    writeFileSync("scripts/dual_gpu_results.json",
      JSON.stringify(ALL_RESULTS, null, 2));
    log(`  Intermediate saved (${ALL_RESULTS.length} configs tested)`);
  }

  // ─── Grand Final ───────────────────────────────────────────
  const elapsed = (Date.now() - startTime) / 60000;

  log(`\n${"=".repeat(70)}`);
  log(`  GRAND FINAL COMPARISON`);
  log(`  Total time: ${elapsed.toFixed(1)} minutes`);
  log(`  Configs tested: ${ALL_RESULTS.length}`);
  log(`${"=".repeat(70)}`);

  if (winners.length === 0) {
    log("  No models ran at 256K!");
    return;
  }

  winners.sort((a, b) => b.avg_tps - a.avg_tps);
  const medals = ["🥇", "🥈", "🥉", "  "];

  const lines = [
    `Dual-GPU Benchmark Results — ${new Date().toISOString()}`,
    `Hardware: 2x RTX 3060 12GB | Context: 256K`,
    `Configs tested: ${ALL_RESULTS.length} | Time: ${elapsed.toFixed(1)} min`,
    "", "=".repeat(60), "  RANKING (by AVG t/s)", "=".repeat(60),
  ];

  for (let i = 0; i < winners.length; i++) {
    const w = winners[i];
    const p = w.params;
    lines.push("");
    lines.push(`  ${medals[i] || "  "} #${i + 1}: ${w.model}`);
    lines.push(`      AVG: ${w.avg_tps.toFixed(2)} t/s | BEST: ${w.best_tps.toFixed(2)} t/s`);
    lines.push(`      Boot: ${w.boot_time.toFixed(0)}s`);
    lines.push(`      ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b}`);
    lines.push(`      ctk=${p.ctk} ctv=${p.ctv}`);
    if (p.cpuMoe) lines.push(`      --cpu-moe`);
    else if ((p.nCpuMoe || 0) > 0) lines.push(`      --n-cpu-moe ${p.nCpuMoe}`);
  }

  const champ = winners[0];
  const cp = champ.params;
  lines.push("", "=".repeat(60));
  lines.push(`  ★ CHAMPION: ${champ.model}`);
  lines.push(`    ${champ.avg_tps.toFixed(2)} t/s average`);
  lines.push("=".repeat(60));

  // Build recommended command
  const cmdParts = [
    `llama-server --model ${MODELS.find((m) => m.name === champ.model).path}`,
    `-ngl ${cp.ngl} -c ${CONTEXT}`,
    `-t ${cp.t} -tb ${cp.t}`,
    `-ub ${cp.ub} -b ${cp.b}`,
    `-fa on`,
    `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
    `--prio ${cp.prio || 3} --poll 50`,
    `--mlock`,
  ];
  if (cp.cpuMoe) cmdParts.push("--cpu-moe");
  else if ((cp.nCpuMoe || 0) > 0) cmdParts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
  if (cp.nommap) cmdParts.push("--no-mmap");
  cmdParts.push("--port 8000 --host 0.0.0.0");

  lines.push("", "  Recommended command:");
  lines.push(`    ${cmdParts.join(" ")}`);

  const summary = lines.join("\n");
  console.log(summary);
  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json",
    JSON.stringify(ALL_RESULTS, null, 2));

  log(`\n  Results: scripts/dual_gpu_results.json`);
  log(`  Summary: scripts/dual_gpu_summary.txt`);
  log(`  DONE!`);

  await killServer();
}

main().catch((e) => {
  console.error("Fatal error:", e);
  process.exit(1);
});