Update tuning scripts and add task creation to sync_vikunja.js

This commit is contained in:
Variet-Worker
2026-04-06 21:49:56 +09:00
parent 626a089b6b
commit 7c7a899fd5
61 changed files with 8705 additions and 1566 deletions

View File

@@ -0,0 +1,531 @@
/**
* Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
* ===========================================================
* Tests 4 models across multiple parameter configurations to find
* the absolute best model + settings for 256K context coding agent.
*
* Models:
* 1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB)
* 2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB)
* 3. Gemma4 26B-A4B Q4_K_M (~15.6 GB)
* 4. Gemma4 26B-A4B MXFP4_MOE (~15.5 GB)
*
* Run: node scripts/dual_gpu_benchmark.mjs
*/
import { spawn, execSync } from "child_process";
import { writeFileSync, statSync, existsSync } from "fs";
import { resolve } from "path";
// ─── Configuration ─────────────────────────────────────────────
// Address the health check and completion requests are sent to. The port
// has to match the "--port" value that buildCmd hard-codes separately.
const BASE_URL = "http://127.0.0.1:8000";
// Relative path of the server executable; startServer spawns it with
// cwd set to process.cwd().
const LLAMA_SERVER = String.raw`llama_bin_run\llama-server.exe`;
// Context size passed to the server via "-c" for every configuration.
const CONTEXT = 262144; // 256K
// Number of timed runs testConfig performs per configuration.
const BENCHMARK_RUNS = 3;
// Default max_tokens for one runBenchmark request.
const BENCHMARK_TOKENS = 200;
// Default time waitForServer keeps polling before giving up.
const SERVER_TIMEOUT = 300_000; // ms
// Models to benchmark, in the order they are run.
//   name        - used in logs, result records, and to look up the champion's path
//   path        - passed to the server as "--model"
//   quant       - copied into each result record
//   totalLayers - used for the half-layer boot fallback and to bound the n-cpu-moe sweep
//   type        - not read by any code visible in this file
const MODELS = [
{
name: "Qwen3.5-35B-A3B Q4_K_M",
path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
type: "qwen", quant: "Q4_K_M", totalLayers: 64,
},
{
name: "Qwen3.5-35B-A3B MXFP4_MOE",
path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
type: "qwen", quant: "MXFP4_MOE", totalLayers: 64,
},
{
name: "Gemma4 26B-A4B Q4_K_M",
path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
type: "gemma4", quant: "Q4_K_M", totalLayers: 30,
},
{
name: "Gemma4 26B-A4B MXFP4_MOE",
path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
type: "gemma4", quant: "MXFP4_MOE", totalLayers: 30,
},
];
// Every successful testConfig record and every phase-5 final record is
// appended here; main serializes it to scripts/dual_gpu_results.json.
const ALL_RESULTS = [];
// ─── Utility ───────────────────────────────────────────────────
/**
 * Print a message to stdout prefixed with the current time of day.
 *
 * @param {string} msg - Text to print after the bracketed timestamp.
 */
function log(msg) {
  const now = new Date();
  const stamp = now.toLocaleTimeString("ko-KR", { hour12: false });
  const line = `[${stamp}] ${msg}`;
  console.log(line);
}
/**
 * Return a promise that resolves (with no value) once a timer of `ms`
 * milliseconds fires.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Force-terminate every llama-server.exe process via taskkill, then wait
 * five seconds before letting the caller continue.
 *
 * @returns {Promise<void>} The promise returned by sleep(5000).
 */
function killServer() {
  const settleDelayMs = 5000;
  const killCommand = "taskkill /F /IM llama-server.exe";
  try {
    execSync(killCommand, { stdio: "ignore" });
  } catch {
    // Best effort: any taskkill failure is ignored on purpose
    // (presumably the usual case is that no server was running).
  }
  return sleep(settleDelayMs);
}
/**
 * Ask nvidia-smi for the memory usage of every GPU.
 *
 * The query requests `index,memory.used,memory.total` as headerless CSV
 * without units, so each output line is expected to hold three integers.
 * Lines that do not parse into three integers (blank lines, "[N/A]"
 * fields, stray text) are dropped instead of surfacing as NaN rows.
 *
 * @returns {{gpu: number, used: number, total: number}[]} One entry per
 *   parsed line, with the numbers as printed by nvidia-smi; an empty
 *   array when the command fails or times out (5 s).
 */
function getVramAll() {
  let out;
  try {
    out = execSync(
      'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits',
      { encoding: "utf-8", timeout: 5000 }
    );
  } catch {
    // No nvidia-smi, no driver, or timeout: report "no GPUs" rather than fail.
    return [];
  }
  return out
    .split(/\r?\n/)
    .map((line) => line.split(",").map((field) => Number.parseInt(field.trim(), 10)))
    .filter((fields) => fields.length >= 3 && fields.slice(0, 3).every(Number.isInteger))
    .map(([gpu, used, total]) => ({ gpu, used, total }));
}
/**
 * Assemble the llama-server command line for one configuration.
 *
 * @param {string} modelPath - Value for "--model".
 * @param {object} params - Launch parameters: ngl, t, ub, b, ctk, ctv, plus
 *   the optional cpuMoe (default false), nCpuMoe (default 0), prio
 *   (default 3) and nommap (default false).
 * @returns {string[]} The executable (LLAMA_SERVER) followed by its arguments.
 */
function buildCmd(modelPath, params) {
  const { ngl, t, ub, b, ctk, ctv } = params;
  const { cpuMoe = false, nCpuMoe = 0, prio = 3, nommap = false } = params;
  const threads = String(t);

  // Each entry is one flag with its value (if any); flattened at the end.
  const flags = [
    ["--model", modelPath],
    ["-ngl", String(ngl)],
    ["-c", String(CONTEXT)],
    ["-np", "1"],
    ["-fa", "on"],
    ["--cache-type-k", ctk],
    ["--cache-type-v", ctv],
    ["-ub", String(ub)],
    ["-b", String(b)],
    ["-t", threads],
    ["-tb", threads],
    ["--prio", String(prio)],
    ["--poll", "50"],
    ["--mlock"],
    ["--port", "8000"],
    ["--host", "0.0.0.0"],
  ];

  // --cpu-moe takes precedence; --n-cpu-moe is only added without it.
  if (cpuMoe) {
    flags.push(["--cpu-moe"]);
  } else if (nCpuMoe > 0) {
    flags.push(["--n-cpu-moe", String(nCpuMoe)]);
  }
  if (nommap) {
    flags.push(["--no-mmap"]);
  }

  return [LLAMA_SERVER, ...flags.flat()];
}
/**
 * Spawn llama-server for the given model and parameters.
 *
 * The child's stdout and stderr stay piped (so the returned ChildProcess
 * looks the same to callers), but both streams are switched to flowing
 * mode so their data is read and discarded. Nothing in this file consumes
 * that output, and an unread pipe has a limited capacity: once it fills,
 * the child blocks on its next write.
 *
 * @param {string} modelPath - Model file passed to buildCmd.
 * @param {object} params - Launch parameters passed to buildCmd.
 * @returns {import("child_process").ChildProcess} The spawned process.
 */
function startServer(modelPath, params) {
  const [exe, ...args] = buildCmd(modelPath, params);
  log(` CMD: ${exe} ${args.slice(-12).join(" ")} ...`);
  const proc = spawn(exe, args, {
    cwd: process.cwd(),
    stdio: ["ignore", "pipe", "pipe"],
  });
  // The streams can be missing when the spawn itself failed, hence `?.`.
  proc.stdout?.resume();
  proc.stderr?.resume();
  return proc;
}
/**
 * Poll the server's /health endpoint until it reports status "ok" or the
 * time budget is used up.
 *
 * Each probe is aborted after 3 s; any probe failure (connection error,
 * abort, unparseable body) is ignored and retried 3 s later.
 *
 * @param {number} [timeoutMs=SERVER_TIMEOUT] - Total time budget in ms.
 * @returns {Promise<{ok: boolean, bootTime: number}>} bootTime is in
 *   seconds: the elapsed time on success, the full budget on timeout.
 */
async function waitForServer(timeoutMs = SERVER_TIMEOUT) {
  const startedAt = Date.now();
  const elapsedMs = () => Date.now() - startedAt;

  while (elapsedMs() < timeoutMs) {
    try {
      const probe = await fetch(`${BASE_URL}/health`, {
        signal: AbortSignal.timeout(3000),
      });
      const body = await probe.json();
      if (body.status === "ok") {
        return { ok: true, bootTime: elapsedMs() / 1000 };
      }
    } catch {
      // Server not reachable / not ready yet; fall through to the retry delay.
    }
    await sleep(3000);
  }

  return { ok: false, bootTime: timeoutMs / 1000 };
}
/**
 * Send one chat-completion request and measure its throughput.
 *
 * Throughput is the reported completion token count divided by the
 * wall-clock time of the whole request, so it includes prompt processing.
 *
 * @param {number} [maxTokens=BENCHMARK_TOKENS] - Value for `max_tokens`.
 * @returns {Promise<{tps: number, completionTokens: number,
 *   promptTokens: number, elapsed: number}>} elapsed is in seconds.
 * @throws {Error} When the request fails, times out (600 s), or the
 *   server answers with a non-2xx status. An error response carries no
 *   usage data and would otherwise be scored as a 0 t/s run.
 */
async function runBenchmark(maxTokens = BENCHMARK_TOKENS) {
  const payload = JSON.stringify({
    model: "local-model",
    messages: [{ role: "user", content: "Count from 1 to 50, writing each number on a new line." }],
    max_tokens: maxTokens,
    temperature: 0.0,
  });
  const start = Date.now();
  const resp = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
    signal: AbortSignal.timeout(600_000),
  });
  if (!resp.ok) {
    // Include a short excerpt of the body so the caller's ERROR log is useful.
    const detail = await resp.text().catch(() => "");
    const suffix = detail ? `: ${detail.slice(0, 200)}` : "";
    throw new Error(`HTTP ${resp.status} from /v1/chat/completions${suffix}`);
  }
  const result = await resp.json();
  const elapsed = (Date.now() - start) / 1000;
  const usage = result.usage ?? {};
  const completionTokens = usage.completion_tokens ?? 0;
  return {
    tps: elapsed > 0 ? completionTokens / elapsed : 0,
    completionTokens,
    promptTokens: usage.prompt_tokens ?? 0,
    elapsed,
  };
}
/**
 * Boot the server with one parameter set, benchmark it, and record the
 * outcome.
 *
 * Sequence: kill any running server, start a new one, wait for it to
 * become healthy, snapshot VRAM, do one untimed warm-up request, then
 * BENCHMARK_RUNS timed requests. The server is killed afterwards.
 *
 * @param {{name: string, quant: string, path: string}} model - Model entry.
 * @param {string} label - Tag for this configuration (logs and record).
 * @param {object} params - Launch parameters forwarded to startServer.
 * @returns {Promise<object|null>} The record that was appended to
 *   ALL_RESULTS, or null when the server never became healthy or every
 *   timed run failed.
 */
async function testConfig(model, label, params) {
  await killServer();
  log(` [${label}] Starting server...`);
  const server = startServer(model.path, params);

  const boot = await waitForServer();
  if (!boot.ok) {
    log(` [${label}] FAILED to start (timeout ${SERVER_TIMEOUT / 1000}s)`);
    server.kill("SIGKILL");
    return null;
  }
  const bootTime = boot.bootTime;

  const vram = getVramAll();
  const vramParts = [];
  for (const g of vram) {
    vramParts.push(`GPU${g.gpu}:${g.used}/${g.total}MiB`);
  }
  log(` [${label}] Boot: ${bootTime.toFixed(0)}s | VRAM: ${vramParts.join(" | ")}`);

  // Untimed warm-up request; a failure here is ignored on purpose.
  try {
    await runBenchmark(20);
  } catch {}

  // Timed runs: failed runs are logged and simply contribute no sample.
  const speeds = [];
  for (let i = 0; i < BENCHMARK_RUNS; i += 1) {
    const runNo = i + 1;
    try {
      const sample = await runBenchmark();
      speeds.push(sample.tps);
      log(` Run ${runNo}: ${sample.tps.toFixed(2)} t/s`);
    } catch (err) {
      log(` Run ${runNo}: ERROR (${err.message})`);
    }
  }
  server.kill("SIGKILL");

  if (speeds.length === 0) {
    log(` [${label}] ALL BENCHMARK RUNS FAILED`);
    return null;
  }

  const total = speeds.reduce((sum, s) => sum + s);
  const avg = total / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] => AVG: ${avg.toFixed(2)} t/s | BEST: ${best.toFixed(2)} t/s`);

  // Key order is kept stable because the record is serialized to JSON.
  const record = {
    model: model.name,
    quant: model.quant,
    label,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(best.toFixed(2)),
    boot_time: Number(bootTime.toFixed(1)),
    vram,
    params,
  };
  ALL_RESULTS.push(record);
  return record;
}
// ─── Phase Runners ─────────────────────────────────────────────
/**
 * Phase 0: find any configuration that boots the model at all.
 *
 * Tries, in order: all layers on GPU, all layers on GPU with --cpu-moe,
 * and finally half of the model's layers on GPU. Stops at the first
 * attempt for which testConfig returns a result.
 *
 * @param {{name: string, totalLayers: number}} model - Model entry.
 * @returns {Promise<object|null>} The first successful record, or the
 *   (falsy) outcome of the last attempt when nothing booted.
 */
async function phase0_bootTest(model) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 0: Boot Test — ${model.name}`);
  log(rule);

  const common = { t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };
  const attempts = [
    {
      notice: null,
      label: "boot-ngl999",
      params: { ngl: 999, ...common },
    },
    {
      notice: " Full GPU failed, trying with --cpu-moe...",
      label: "boot-cpumoe",
      params: { ngl: 999, ...common, cpuMoe: true },
    },
    {
      notice: " --cpu-moe also failed, trying reduced layers...",
      label: "boot-ngl-half",
      params: { ngl: Math.floor(model.totalLayers / 2), ...common },
    },
  ];

  let outcome = null;
  for (const { notice, label, params } of attempts) {
    if (notice !== null) {
      log(notice);
    }
    outcome = await testConfig(model, label, params);
    if (outcome) {
      return outcome;
    }
  }
  return outcome;
}
/**
 * Phase 1: compare MoE offload strategies with all layers on the GPU.
 *
 * Tests --cpu-moe on and off, then sweeps --n-cpu-moe. Configurations
 * that produce the same command line as an earlier measurement are not
 * run again:
 *  - The baseline is skipped when it already is the ngl=999 run with the
 *    same cpuMoe setting. A baseline without a cpuMoe key counts as
 *    cpuMoe=false, which is what buildCmd's default makes of it.
 *  - The sweep starts at 5. n-cpu-moe=0 adds no flag, so it is the same
 *    command as the cpuMoe=false run above.
 *
 * @param {{name: string, totalLayers: number}} model - Model entry.
 * @param {object|null} baseline - Record from the boot test, if any.
 * @returns {Promise<object|null>} The record with the highest avg_tps
 *   (the later one on ties), or null when nothing produced a result.
 */
async function phase1_gpuOffload(model, baseline) {
  log(`\n${"=".repeat(70)}`);
  log(` PHASE 1: GPU Offload Strategy — ${model.name}`);
  log(`${"=".repeat(70)}`);

  const shared = { ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };
  const results = baseline ? [baseline] : [];
  const base = baseline?.params;

  // True when the baseline already measured ngl=999 with this cpuMoe setting.
  const baselineCovers = (cpuMoe) => {
    if (base?.ngl !== 999) return false;
    if (cpuMoe) return base.cpuMoe === true;
    return !base.cpuMoe && (base.nCpuMoe ?? 0) === 0;
  };

  // Test --cpu-moe on/off
  for (const cpuMoe of [true, false]) {
    if (baselineCovers(cpuMoe)) continue;
    const r = await testConfig(model, `ngl=999 cpuMoe=${cpuMoe}`, { ...shared, cpuMoe });
    if (r) results.push(r);
  }

  // n-cpu-moe sweep
  for (const n of [5, 10, 15, 20]) {
    if (n > model.totalLayers) continue;
    const r = await testConfig(model, `n-cpu-moe=${n}`, { ...shared, nCpuMoe: n });
    if (r) results.push(r);
  }

  if (results.length === 0) {
    log(" PHASE 1: No config worked!");
    return null;
  }
  const best = results.reduce((a, b) => (a.avg_tps > b.avg_tps ? a : b));
  // The separator keeps labels ending in a digit readable
  // ("n-cpu-moe=5" followed by "12.34" used to print as "n-cpu-moe=512.34").
  log(`\n ★ Phase 1 winner: ${best.label} — ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
/**
 * Phase 2: sweep the CPU thread count on top of the previous winner.
 *
 * Every other parameter is taken from `prev.params`; the thread count
 * `prev` already used is not measured again.
 *
 * @param {{name: string}} model - Model entry.
 * @param {{label: string, avg_tps: number, params: object}} prev -
 *   Winner of the previous phase.
 * @returns {Promise<object>} The record with the highest avg_tps among
 *   `prev` and the new measurements (the later one on ties).
 */
async function phase2_threads(model, prev) {
  log(`\n${"=".repeat(70)}`);
  log(` PHASE 2: CPU Thread Sweep — ${model.name}`);
  log(`${"=".repeat(70)}`);
  const p = prev.params;
  const results = [prev];
  for (const t of [2, 4, 6, 8, 10, 12]) {
    if (t === p.t) continue;
    const r = await testConfig(model, `t=${t}`, { ...p, t });
    if (r) results.push(r);
  }
  const best = results.reduce((a, b) => (a.avg_tps > b.avg_tps ? a : b));
  // The separator keeps labels ending in a digit readable
  // ("t=6" followed by "23.45" used to print as "t=623.45").
  log(`\n ★ Phase 2 winner: ${best.label} — ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
/**
 * Phase 3: sweep micro-batch / batch size pairs (-ub / -b) on top of the
 * previous winner.
 *
 * Every other parameter is taken from `prev.params`; the pair `prev`
 * already used is not measured again.
 *
 * @param {{name: string}} model - Model entry.
 * @param {{label: string, avg_tps: number, params: object}} prev -
 *   Winner of the previous phase.
 * @returns {Promise<object>} The record with the highest avg_tps among
 *   `prev` and the new measurements (the later one on ties).
 */
async function phase3_batch(model, prev) {
  log(`\n${"=".repeat(70)}`);
  log(` PHASE 3: Batch Size Sweep — ${model.name}`);
  log(`${"=".repeat(70)}`);
  const p = prev.params;
  const results = [prev];
  const pairs = [
    [128, 512], [256, 1024], [256, 2048],
    [512, 1024], [512, 2048], [512, 4096],
    [1024, 2048], [1024, 4096],
  ];
  for (const [ub, b] of pairs) {
    if (ub === p.ub && b === p.b) continue;
    const r = await testConfig(model, `ub=${ub} b=${b}`, { ...p, ub, b });
    if (r) results.push(r);
  }
  const best = results.reduce((x, y) => (x.avg_tps > y.avg_tps ? x : y));
  // The separator keeps labels ending in a digit readable
  // ("ub=512 b=2048" followed by "23.45" used to print as "b=204823.45").
  log(`\n ★ Phase 3 winner: ${best.label} — ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
/**
 * Phase 4: sweep KV cache types (--cache-type-k / --cache-type-v) on top
 * of the previous winner.
 *
 * Every other parameter is taken from `prev.params`; the combination
 * `prev` already used is not measured again.
 *
 * @param {{name: string}} model - Model entry.
 * @param {{label: string, avg_tps: number, params: object}} prev -
 *   Winner of the previous phase.
 * @returns {Promise<object>} The record with the highest avg_tps among
 *   `prev` and the new measurements (the later one on ties).
 */
async function phase4_kvcache(model, prev) {
  log(`\n${"=".repeat(70)}`);
  log(` PHASE 4: KV Cache Type Sweep — ${model.name}`);
  log(`${"=".repeat(70)}`);
  const p = prev.params;
  const results = [prev];
  const combos = [
    ["q4_0", "q4_0"], ["q8_0", "q8_0"],
    ["q4_0", "q8_0"], ["f16", "f16"],
  ];
  for (const [ctk, ctv] of combos) {
    if (ctk === p.ctk && ctv === p.ctv) continue;
    const r = await testConfig(model, `kv=${ctk}/${ctv}`, { ...p, ctk, ctv });
    if (r) results.push(r);
  }
  const best = results.reduce((a, b) => (a.avg_tps > b.avg_tps ? a : b));
  // The separator keeps labels ending in a digit readable
  // ("kv=q8_0/q8_0" followed by "23.45" used to print as "q8_023.45").
  log(`\n ★ Phase 4 winner: ${best.label} — ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
/**
 * Phase 5: re-measure the winning configuration with five timed runs.
 *
 * @param {{name: string, quant: string, path: string}} model - Model entry.
 * @param {{params: object}} prev - Winner of the previous phase.
 * @returns {Promise<object>} A new "FINAL-<model name>" record (also
 *   appended to ALL_RESULTS), or `prev` unchanged when the server did
 *   not start or no timed run succeeded.
 */
async function phase5_final(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 5: Final Verification (5 runs) — ${model.name}`);
  log(rule);

  await killServer();
  const server = startServer(model.path, prev.params);
  const boot = await waitForServer();
  if (!boot.ok) {
    log(" FAILED to start!");
    server.kill("SIGKILL");
    return prev;
  }
  const bootTime = boot.bootTime;
  const vram = getVramAll();

  // Untimed warm-up request; a failure here is ignored on purpose.
  try {
    await runBenchmark(20);
  } catch {}

  const speeds = [];
  for (let i = 0; i < 5; i += 1) {
    const runNo = i + 1;
    try {
      const sample = await runBenchmark();
      speeds.push(sample.tps);
      log(` Final Run ${runNo}: ${sample.tps.toFixed(2)} t/s`);
    } catch (err) {
      log(` Final Run ${runNo}: ERROR (${err.message})`);
    }
  }
  server.kill("SIGKILL");

  if (speeds.length === 0) {
    return prev;
  }

  const total = speeds.reduce((sum, s) => sum + s);
  const avg = total / speeds.length;
  const best = Math.max(...speeds);
  log(`\n ★ FINAL: AVG ${avg.toFixed(2)} t/s | BEST ${best.toFixed(2)} t/s`);

  // Key order is kept stable because the record is serialized to JSON.
  const finalRecord = {
    model: model.name,
    quant: model.quant,
    label: `FINAL-${model.name}`,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(best.toFixed(2)),
    boot_time: Number(bootTime.toFixed(1)),
    vram,
    params: prev.params,
  };
  ALL_RESULTS.push(finalRecord);
  return finalRecord;
}
// ─── Main ──────────────────────────────────────────────────────
/**
 * Run the full tuning pipeline (phases 0 through 5) for one model.
 *
 * @param {{name: string, path: string}} model - Model entry.
 * @returns {Promise<object|null>} The best record found; the boot-test
 *   baseline when phase 1 yields nothing; null when the model file is
 *   missing or the model cannot be booted.
 */
async function runModelBenchmark(model) {
  const banner = "#".repeat(70);
  log(`\n${banner}`);
  log(` MODEL: ${model.name}`);
  log(` File: ${model.path}`);

  // File size in GiB-based "GB"; any stat failure is reported as unknown.
  let sizeLine;
  try {
    const gigabytes = statSync(model.path).size / 1024 ** 3;
    sizeLine = ` Size: ${gigabytes.toFixed(2)} GB`;
  } catch {
    sizeLine = ` Size: unknown`;
  }
  log(sizeLine);
  log(banner);

  if (!existsSync(model.path)) {
    log(` SKIP: Model file not found!`);
    return null;
  }

  const baseline = await phase0_bootTest(model);
  if (!baseline) {
    log(` SKIP: Cannot boot at 256K!`);
    return null;
  }

  const offloadWinner = await phase1_gpuOffload(model, baseline);
  if (!offloadWinner) {
    return baseline;
  }

  // Each remaining phase refines the winner of the one before it.
  let best = offloadWinner;
  for (const phase of [phase2_threads, phase3_batch, phase4_kvcache, phase5_final]) {
    best = await phase(model, best);
  }
  return best;
}
/**
 * Render the ranking, the champion, and a recommended launch command as
 * one text block.
 *
 * @param {object[]} winners - Per-model best records, already sorted by
 *   avg_tps descending; must not be empty.
 * @param {number} elapsedMinutes - Total run time in minutes.
 * @returns {string} The summary, lines joined with "\n".
 */
function buildSummary(winners, elapsedMinutes) {
  const medals = ["🥇", "🥈", "🥉", " "];
  const lines = [
    `Dual-GPU Benchmark Results — ${new Date().toISOString()}`,
    `Hardware: 2x RTX 3060 12GB | Context: 256K`,
    `Configs tested: ${ALL_RESULTS.length} | Time: ${elapsedMinutes.toFixed(1)} min`,
    "", "=".repeat(60), " RANKING (by AVG t/s)", "=".repeat(60),
  ];

  winners.forEach((w, i) => {
    const p = w.params;
    lines.push("");
    lines.push(` ${medals[i] || " "} #${i + 1}: ${w.model}`);
    lines.push(` AVG: ${w.avg_tps.toFixed(2)} t/s | BEST: ${w.best_tps.toFixed(2)} t/s`);
    lines.push(` Boot: ${w.boot_time.toFixed(0)}s`);
    lines.push(` ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b}`);
    lines.push(` ctk=${p.ctk} ctv=${p.ctv}`);
    if (p.cpuMoe) lines.push(` --cpu-moe`);
    else if ((p.nCpuMoe ?? 0) > 0) lines.push(` --n-cpu-moe ${p.nCpuMoe}`);
  });

  const champ = winners[0];
  const cp = champ.params;
  lines.push("", "=".repeat(60));
  lines.push(` ★ CHAMPION: ${champ.model}`);
  lines.push(` ${champ.avg_tps.toFixed(2)} t/s average`);
  lines.push("=".repeat(60));

  // Build recommended command
  const cmdParts = [
    `llama-server --model ${MODELS.find((m) => m.name === champ.model).path}`,
    `-ngl ${cp.ngl} -c ${CONTEXT}`,
    `-t ${cp.t} -tb ${cp.t}`,
    `-ub ${cp.ub} -b ${cp.b}`,
    `-fa on`,
    `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
    // `??` rather than `||`: buildCmd only falls back to 3 when prio is
    // undefined, so an explicit prio of 0 has to be echoed as 0 here.
    `--prio ${cp.prio ?? 3} --poll 50`,
    `--mlock`,
  ];
  if (cp.cpuMoe) cmdParts.push("--cpu-moe");
  else if ((cp.nCpuMoe ?? 0) > 0) cmdParts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
  if (cp.nommap) cmdParts.push("--no-mmap");
  cmdParts.push("--port 8000 --host 0.0.0.0");
  lines.push("", " Recommended command:");
  lines.push(` ${cmdParts.join(" ")}`);

  return lines.join("\n");
}

/**
 * Entry point: benchmark every model in MODELS, then print and save the
 * final comparison.
 *
 * ALL_RESULTS is written to scripts/dual_gpu_results.json after each
 * model, so partial data survives an aborted run. When at least one
 * model produced a winner, the summary goes to stdout and to
 * scripts/dual_gpu_summary.txt. Both paths are relative to the current
 * working directory.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const startTime = Date.now();
  log("=".repeat(70));
  log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK");
  log(" 2x RTX 3060 (24GB Total) | 256K Context");
  log(` Models: ${MODELS.length}`);
  log(` Started: ${new Date().toISOString()}`);
  log("=".repeat(70));
  const gpus = getVramAll();
  gpus.forEach((g) => log(` GPU ${g.gpu}: ${g.used}/${g.total} MiB used`));

  const winners = [];
  for (let i = 0; i < MODELS.length; i++) {
    log(`\n${"=".repeat(70)}`);
    log(` STARTING MODEL ${i + 1}/${MODELS.length}: ${MODELS[i].name}`);
    log(`${"=".repeat(70)}`);
    const winner = await runModelBenchmark(MODELS[i]);
    if (winner) winners.push(winner);
    // Save intermediate
    writeFileSync("scripts/dual_gpu_results.json",
      JSON.stringify(ALL_RESULTS, null, 2));
    log(` Intermediate saved (${ALL_RESULTS.length} configs tested)`);
  }

  // ─── Grand Final ───────────────────────────────────────────
  const elapsed = (Date.now() - startTime) / 60000;
  log(`\n${"=".repeat(70)}`);
  log(` GRAND FINAL COMPARISON`);
  log(` Total time: ${elapsed.toFixed(1)} minutes`);
  log(` Configs tested: ${ALL_RESULTS.length}`);
  log(`${"=".repeat(70)}`);
  if (winners.length === 0) {
    log(" No models ran at 256K!");
    return;
  }
  winners.sort((a, b) => b.avg_tps - a.avg_tps);

  const summary = buildSummary(winners, elapsed);
  console.log(summary);
  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json",
    JSON.stringify(ALL_RESULTS, null, 2));
  log(`\n Results: scripts/dual_gpu_results.json`);
  log(` Summary: scripts/dual_gpu_summary.txt`);
  log(` DONE!`);
  await killServer();
}
// Script entry: run the benchmark; on any uncaught failure, report it on
// stderr and exit with status 1.
const reportFatal = (err) => {
  console.error("Fatal error:", err);
  process.exit(1);
};
main().catch(reportFatal);