Update tuning scripts and add task creation to sync_vikunja.js

2026-04-06 21:49:56 +09:00
parent 626a089b6b
commit 7c7a899fd5
61 changed files with 8705 additions and 1566 deletions
--- a/scripts/dual_gpu_benchmark.mjs
+++ b/scripts/dual_gpu_benchmark.mjs
@@ -0,0 +1,531 @@
+/**
+ * Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
+ * ===========================================================
+ * Tests 4 models across multiple parameter configurations to find
+ * the best model and settings for a 256K-context coding agent.
+ *
+ * Models:
+ *   1. Qwen3.5-35B-A3B  Q4_K_M     (~20.5 GB)
+ *   2. Qwen3.5-35B-A3B  MXFP4_MOE  (~20.1 GB)
+ *   3. Gemma4  26B-A4B   Q4_K_M     (~15.6 GB)
+ *   4. Gemma4  26B-A4B   MXFP4_MOE  (~15.5 GB)
+ *
+ * Run: node scripts/dual_gpu_benchmark.mjs
+ */
+
+import { spawn, execSync } from "child_process";
+import { writeFileSync, statSync, existsSync } from "fs";
+import { resolve } from "path";
+
+// ─── Configuration ─────────────────────────────────────────────
+const BASE_URL = "http://127.0.0.1:8000";
+const LLAMA_SERVER = String.raw`llama_bin_run\llama-server.exe`;
+const CONTEXT = 262144; // 256K
+const BENCHMARK_RUNS = 3;
+const BENCHMARK_TOKENS = 200;
+const SERVER_TIMEOUT = 300_000; // ms
+
+const MODELS = [
+  {
+    name: "Qwen3.5-35B-A3B Q4_K_M",
+    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
+    type: "qwen", quant: "Q4_K_M", totalLayers: 64,
+  },
+  {
+    name: "Qwen3.5-35B-A3B MXFP4_MOE",
+    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
+    type: "qwen", quant: "MXFP4_MOE", totalLayers: 64,
+  },
+  {
+    name: "Gemma4 26B-A4B Q4_K_M",
+    path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
+    type: "gemma4", quant: "Q4_K_M", totalLayers: 30,
+  },
+  {
+    name: "Gemma4 26B-A4B MXFP4_MOE",
+    path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
+    type: "gemma4", quant: "MXFP4_MOE", totalLayers: 30,
+  },
+];
+
+const ALL_RESULTS = [];
+
+// ─── Utility ───────────────────────────────────────────────────
+
+function log(msg) {
+  const ts = new Date().toLocaleTimeString("ko-KR", { hour12: false });
+  console.log(`[${ts}] ${msg}`);
+}
+
+function sleep(ms) {
+  return new Promise((r) => setTimeout(r, ms));
+}
+
+function killServer() {
+  try {
+    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
+  } catch {}
+  return sleep(5000);
+}
+
+function getVramAll() {
+  try {
+    const out = execSync(
+      'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits',
+      { encoding: "utf-8", timeout: 5000 }
+    );
+    return out.trim().split("\n").map((line) => {
+      const [gpu, used, total] = line.split(",").map((s) => parseInt(s.trim()));
+      return { gpu, used, total };
+    });
+  } catch {
+    return [];
+  }
+}
+
+function buildCmd(modelPath, params) {
+  const {
+    ngl, t, ub, b, ctk, ctv,
+    cpuMoe = false, nCpuMoe = 0,
+    prio = 3, nommap = false
+  } = params;
+
+  const cmd = [
+    LLAMA_SERVER,
+    "--model", modelPath,
+    "-ngl", String(ngl),
+    "-c", String(CONTEXT),
+    "-np", "1",
+    "-fa", "on",
+    "--cache-type-k", ctk,
+    "--cache-type-v", ctv,
+    "-ub", String(ub),
+    "-b", String(b),
+    "-t", String(t),
+    "-tb", String(t),
+    "--prio", String(prio),
+    "--poll", "50",
+    "--mlock",
+    "--port", "8000",
+    "--host", "0.0.0.0",
+  ];
+
+  if (cpuMoe) cmd.push("--cpu-moe");
+  else if (nCpuMoe > 0) cmd.push("--n-cpu-moe", String(nCpuMoe));
+  if (nommap) cmd.push("--no-mmap");
+
+  return cmd;
+}
+
+function startServer(modelPath, params) {
+  const args = buildCmd(modelPath, params);
+  const exe = args.shift();
+  log(`  CMD: ${exe} ${args.slice(-12).join(" ")} ...`);
+  return spawn(exe, args, {
+    cwd: process.cwd(),
+    stdio: ["ignore", "pipe", "pipe"],
+  });
+}
+
+async function waitForServer(timeoutMs = SERVER_TIMEOUT) {
+  const start = Date.now();
+  while (Date.now() - start < timeoutMs) {
+    try {
+      const resp = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
+      const data = await resp.json();
+      if (data.status === "ok") return { ok: true, bootTime: (Date.now() - start) / 1000 };
+    } catch {}
+    await sleep(3000);
+  }
+  return { ok: false, bootTime: timeoutMs / 1000 };
+}
+
+async function runBenchmark(maxTokens = BENCHMARK_TOKENS) {
+  const payload = JSON.stringify({
+    model: "local-model",
+    messages: [{ role: "user", content: "Count from 1 to 50, writing each number on a new line." }],
+    max_tokens: maxTokens,
+    temperature: 0.0,
+  });
+
+  const start = Date.now();
+  const resp = await fetch(`${BASE_URL}/v1/chat/completions`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: payload,
+    signal: AbortSignal.timeout(600_000),
+  });
+  const result = await resp.json();
+  const elapsed = (Date.now() - start) / 1000;
+
+  const usage = result.usage || {};
+  const ct = usage.completion_tokens || 0;
+  return {
+    tps: elapsed > 0 ? ct / elapsed : 0,
+    completionTokens: ct,
+    promptTokens: usage.prompt_tokens || 0,
+    elapsed,
+  };
+}
+
+async function testConfig(model, label, params) {
+  await killServer();
+  log(`  [${label}] Starting server...`);
+
+  const proc = startServer(model.path, params);
+  const { ok, bootTime } = await waitForServer();
+
+  if (!ok) {
+    log(`  [${label}] FAILED to start (timeout ${SERVER_TIMEOUT / 1000}s)`);
+    proc.kill("SIGKILL");
+    return null;
+  }
+
+  const vram = getVramAll();
+  const vramStr = vram.map((g) => `GPU${g.gpu}:${g.used}/${g.total}MiB`).join(" | ");
+  log(`  [${label}] Boot: ${bootTime.toFixed(0)}s | VRAM: ${vramStr}`);
+
+  // Warmup
+  try { await runBenchmark(20); } catch {}
+
+  // Benchmark
+  const speeds = [];
+  for (let i = 0; i < BENCHMARK_RUNS; i++) {
+    try {
+      const r = await runBenchmark();
+      speeds.push(r.tps);
+      log(`    Run ${i + 1}: ${r.tps.toFixed(2)} t/s`);
+    } catch (e) {
+      log(`    Run ${i + 1}: ERROR (${e.message})`);
+    }
+  }
+
+  proc.kill("SIGKILL");
+
+  if (speeds.length === 0) {
+    log(`  [${label}] ALL BENCHMARK RUNS FAILED`);
+    return null;
+  }
+
+  const avg = speeds.reduce((a, b) => a + b) / speeds.length;
+  const best = Math.max(...speeds);
+  log(`  [${label}] => AVG: ${avg.toFixed(2)} t/s | BEST: ${best.toFixed(2)} t/s`);
+
+  const result = {
+    model: model.name, quant: model.quant, label,
+    avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2),
+    boot_time: +bootTime.toFixed(1), vram, params,
+  };
+  ALL_RESULTS.push(result);
+  return result;
+}
+
+// ─── Phase Runners ─────────────────────────────────────────────
+
+async function phase0_bootTest(model) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 0: Boot Test — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  // Try full GPU first
+  let r = await testConfig(model, "boot-ngl999", {
+    ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0",
+  });
+  if (r) return r;
+
+  // Try with cpu-moe
+  log("  Full GPU failed, trying with --cpu-moe...");
+  r = await testConfig(model, "boot-cpumoe", {
+    ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe: true,
+  });
+  if (r) return r;
+
+  // Reduced layers
+  log("  --cpu-moe also failed, trying reduced layers...");
+  r = await testConfig(model, "boot-ngl-half", {
+    ngl: Math.floor(model.totalLayers / 2), t: 6, ub: 512, b: 2048,
+    ctk: "q4_0", ctv: "q4_0",
+  });
+  return r;
+}
+
+async function phase1_gpuOffload(model, baseline) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 1: GPU Offload Strategy — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  const results = baseline ? [baseline] : [];
+
+  // Test --cpu-moe on/off
+  for (const cpuMoe of [true, false]) {
+    const lbl = `ngl=999 cpuMoe=${cpuMoe}`;
+    if (baseline?.params?.cpuMoe === cpuMoe && baseline.params.ngl === 999) continue;
+    const r = await testConfig(model, lbl, {
+      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe,
+    });
+    if (r) results.push(r);
+  }
+
+  // n-cpu-moe sweep
+  for (const n of [0, 5, 10, 15, 20]) {
+    if (n > model.totalLayers) continue;
+    const r = await testConfig(model, `n-cpu-moe=${n}`, {
+      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", nCpuMoe: n,
+    });
+    if (r) results.push(r);
+  }
+
+  if (results.length === 0) { log("  PHASE 1: No config worked!"); return null; }
+  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
+  log(`\n  ★ Phase 1 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
+  return best;
+}
+
+async function phase2_threads(model, prev) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 2: CPU Thread Sweep — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  const p = prev.params;
+  const results = [prev];
+
+  for (const t of [2, 4, 6, 8, 10, 12]) {
+    if (t === p.t) continue;
+    const r = await testConfig(model, `t=${t}`, {
+      ...p, t,
+    });
+    if (r) results.push(r);
+  }
+
+  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
+  log(`\n  ★ Phase 2 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
+  return best;
+}
+
+async function phase3_batch(model, prev) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 3: Batch Size Sweep — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  const p = prev.params;
+  const results = [prev];
+
+  for (const [ub, b] of [
+    [128, 512], [256, 1024], [256, 2048],
+    [512, 1024], [512, 2048], [512, 4096],
+    [1024, 2048], [1024, 4096],
+  ]) {
+    if (ub === p.ub && b === p.b) continue;
+    const r = await testConfig(model, `ub=${ub} b=${b}`, { ...p, ub, b });
+    if (r) results.push(r);
+  }
+
+  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
+  log(`\n  ★ Phase 3 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
+  return best;
+}
+
+async function phase4_kvcache(model, prev) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 4: KV Cache Type Sweep — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  const p = prev.params;
+  const results = [prev];
+
+  for (const [ctk, ctv] of [
+    ["q4_0", "q4_0"], ["q8_0", "q8_0"],
+    ["q4_0", "q8_0"], ["f16", "f16"],
+  ]) {
+    if (ctk === p.ctk && ctv === p.ctv) continue;
+    const r = await testConfig(model, `kv=${ctk}/${ctv}`, { ...p, ctk, ctv });
+    if (r) results.push(r);
+  }
+
+  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
+  log(`\n  ★ Phase 4 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
+  return best;
+}
+
+async function phase5_final(model, prev) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 5: Final Verification (5 runs) — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  await killServer();
+  const proc = startServer(model.path, prev.params);
+  const { ok, bootTime } = await waitForServer();
+  if (!ok) { log("  FAILED to start!"); proc.kill("SIGKILL"); return prev; }
+
+  const vram = getVramAll();
+  try { await runBenchmark(20); } catch {}
+
+  const speeds = [];
+  for (let i = 0; i < 5; i++) {
+    try {
+      const r = await runBenchmark();
+      speeds.push(r.tps);
+      log(`    Final Run ${i + 1}: ${r.tps.toFixed(2)} t/s`);
+    } catch (e) {
+      log(`    Final Run ${i + 1}: ERROR (${e.message})`);
+    }
+  }
+  proc.kill("SIGKILL");
+
+  if (speeds.length > 0) {
+    const avg = speeds.reduce((a, b) => a + b) / speeds.length;
+    const best = Math.max(...speeds);
+    log(`\n  ★ FINAL: AVG ${avg.toFixed(2)} t/s | BEST ${best.toFixed(2)} t/s`);
+
+    const final_ = {
+      model: model.name, quant: model.quant,
+      label: `FINAL-${model.name}`,
+      avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2),
+      boot_time: +bootTime.toFixed(1), vram, params: prev.params,
+    };
+    ALL_RESULTS.push(final_);
+    return final_;
+  }
+  return prev;
+}
+
+// ─── Main ──────────────────────────────────────────────────────
+
+async function runModelBenchmark(model) {
+  log(`\n${"#".repeat(70)}`);
+  log(`  MODEL: ${model.name}`);
+  log(`  File:  ${model.path}`);
+  try {
+    const sz = statSync(model.path).size / 1024 ** 3;
+    log(`  Size:  ${sz.toFixed(2)} GB`);
+  } catch { log(`  Size:  unknown`); }
+  log(`${"#".repeat(70)}`);
+
+  if (!existsSync(model.path)) {
+    log(`  SKIP: Model file not found!`);
+    return null;
+  }
+
+  const baseline = await phase0_bootTest(model);
+  if (!baseline) { log(`  SKIP: Cannot boot at 256K!`); return null; }
+
+  let best = await phase1_gpuOffload(model, baseline);
+  if (!best) return baseline;
+
+  best = await phase2_threads(model, best);
+  best = await phase3_batch(model, best);
+  best = await phase4_kvcache(model, best);
+  best = await phase5_final(model, best);
+
+  return best;
+}
+
+async function main() {
+  const startTime = Date.now();
+
+  log("=".repeat(70));
+  log("  DUAL-GPU COMPREHENSIVE MODEL BENCHMARK");
+  log("  2x RTX 3060 (24GB Total) | 256K Context");
+  log(`  Models: ${MODELS.length}`);
+  log(`  Started: ${new Date().toISOString()}`);
+  log("=".repeat(70));
+
+  const gpus = getVramAll();
+  gpus.forEach((g) => log(`  GPU ${g.gpu}: ${g.used}/${g.total} MiB used`));
+
+  const winners = [];
+
+  for (let i = 0; i < MODELS.length; i++) {
+    log(`\n${"=".repeat(70)}`);
+    log(`  STARTING MODEL ${i + 1}/${MODELS.length}: ${MODELS[i].name}`);
+    log(`${"=".repeat(70)}`);
+
+    const winner = await runModelBenchmark(MODELS[i]);
+    if (winner) winners.push(winner);
+
+    // Save intermediate
+    writeFileSync("scripts/dual_gpu_results.json",
+      JSON.stringify(ALL_RESULTS, null, 2));
+    log(`  Intermediate saved (${ALL_RESULTS.length} configs tested)`);
+  }
+
+  // ─── Grand Final ───────────────────────────────────────────
+  const elapsed = (Date.now() - startTime) / 60000;
+
+  log(`\n${"=".repeat(70)}`);
+  log(`  GRAND FINAL COMPARISON`);
+  log(`  Total time: ${elapsed.toFixed(1)} minutes`);
+  log(`  Configs tested: ${ALL_RESULTS.length}`);
+  log(`${"=".repeat(70)}`);
+
+  if (winners.length === 0) {
+    log("  No models ran at 256K!");
+    return;
+  }
+
+  winners.sort((a, b) => b.avg_tps - a.avg_tps);
+  const medals = ["🥇", "🥈", "🥉", "  "];
+
+  const lines = [
+    `Dual-GPU Benchmark Results — ${new Date().toISOString()}`,
+    `Hardware: 2x RTX 3060 12GB | Context: 256K`,
+    `Configs tested: ${ALL_RESULTS.length} | Time: ${elapsed.toFixed(1)} min`,
+    "", "=".repeat(60), "  RANKING (by AVG t/s)", "=".repeat(60),
+  ];
+
+  for (let i = 0; i < winners.length; i++) {
+    const w = winners[i];
+    const p = w.params;
+    lines.push("");
+    lines.push(`  ${medals[i] || "  "} #${i + 1}: ${w.model}`);
+    lines.push(`      AVG: ${w.avg_tps.toFixed(2)} t/s | BEST: ${w.best_tps.toFixed(2)} t/s`);
+    lines.push(`      Boot: ${w.boot_time.toFixed(0)}s`);
+    lines.push(`      ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b}`);
+    lines.push(`      ctk=${p.ctk} ctv=${p.ctv}`);
+    if (p.cpuMoe) lines.push(`      --cpu-moe`);
+    else if ((p.nCpuMoe || 0) > 0) lines.push(`      --n-cpu-moe ${p.nCpuMoe}`);
+  }
+
+  const champ = winners[0];
+  const cp = champ.params;
+  lines.push("", "=".repeat(60));
+  lines.push(`  ★ CHAMPION: ${champ.model}`);
+  lines.push(`    ${champ.avg_tps.toFixed(2)} t/s average`);
+  lines.push("=".repeat(60));
+
+  // Build recommended command
+  const cmdParts = [
+    `llama-server --model ${MODELS.find((m) => m.name === champ.model).path}`,
+    `-ngl ${cp.ngl} -c ${CONTEXT}`,
+    `-t ${cp.t} -tb ${cp.t}`,
+    `-ub ${cp.ub} -b ${cp.b}`,
+    `-fa on`,
+    `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
+    `--prio ${cp.prio || 3} --poll 50`,
+    `--mlock`,
+  ];
+  if (cp.cpuMoe) cmdParts.push("--cpu-moe");
+  else if ((cp.nCpuMoe || 0) > 0) cmdParts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
+  if (cp.nommap) cmdParts.push("--no-mmap");
+  cmdParts.push("--port 8000 --host 0.0.0.0");
+
+  lines.push("", "  Recommended command:");
+  lines.push(`    ${cmdParts.join(" ")}`);
+
+  const summary = lines.join("\n");
+  console.log(summary);
+  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
+  writeFileSync("scripts/dual_gpu_results.json",
+    JSON.stringify(ALL_RESULTS, null, 2));
+
+  log(`\n  Results: scripts/dual_gpu_results.json`);
+  log(`  Summary: scripts/dual_gpu_summary.txt`);
+  log(`  DONE!`);
+
+  await killServer();
+}
+
+main().catch((e) => {
+  console.error("Fatal error:", e);
+  process.exit(1);
+});