feat: Variet Engine v1.0 + 5-model tuning complete
Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)
Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine
File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
This commit is contained in:
345
scripts/_archive/tuning/qwen_fullgpu_challenge.mjs
Normal file
345
scripts/_archive/tuning/qwen_fullgpu_challenge.mjs
Normal file
@@ -0,0 +1,345 @@
|
||||
/**
|
||||
* Qwen3.5 Full-GPU Challenge — VRAM 극한 최적화 벤치마크
|
||||
* =====================================================
|
||||
* 목표: Qwen3.5-35B-A3B를 24GB 듀얼 3060에 100% GPU로 올리기
|
||||
*
|
||||
* 테스트 모델:
|
||||
* 1. UD-IQ4_NL (16.6 GB) — 확실히 올라감, 기준선
|
||||
* 2. MXFP4_MOE (20.1 GB) — 도전! VRAM 극한 최적화
|
||||
* 3. Q4_K_M (20.5 GB) — 대조군 (n-cpu-moe=5)
|
||||
*
|
||||
* VRAM 절감 전략:
|
||||
* A. 배치 최소화: -ub 64 -b 256 (computation buffer 축소)
|
||||
* B. split-mode row (GPU간 더 균등한 분배)
|
||||
* C. tensor-split 수동 밸런싱
|
||||
* D. no-mmap (메모리 관리 최적화)
|
||||
* E. defrag-thold (KV 캐시 파편화 방지)
|
||||
*
|
||||
* Run: node scripts/qwen_fullgpu_challenge.mjs
|
||||
*/
|
||||
|
||||
import { spawn, execSync } from "child_process";
|
||||
import { writeFileSync, existsSync, statSync } from "fs";
|
||||
|
||||
// Base address used for the /health and /v1/chat/completions requests.
const BASE_URL = "http://127.0.0.1:8000";
// Server executable, resolved relative to the directory the script is run from.
const LLAMA = "llama_bin_run\\llama-server.exe";
// Context size handed to the server with -c (256 * 1024 = 262144 tokens).
const CTX = 256 * 1024;
// Number of timed benchmark requests per configuration.
const RUNS = 3;
// Default max_tokens for a benchmark request.
const TOKENS = 200;
// Maximum time to wait for the server to report healthy, in milliseconds (5 minutes).
const BOOT_TIMEOUT = 5 * 60 * 1000;
|
||||
|
||||
/**
 * Quantizations of Qwen3.5-35B-A3B to benchmark, in test order.
 * Each entry carries a display name, the GGUF path (relative, Windows-style
 * separators) and the file size in GB, which is only printed in the log header.
 */
const MODELS = [
  ["UD-IQ4_NL", 16.6],
  ["MXFP4_MOE", 20.11],
  ["Q4_K_M", 20.5],
].map(([quant, sizeGB]) => ({
  name: `Qwen3.5 ${quant}`,
  path: `models\\Qwen3.5-35B-A3B-${quant}.gguf`,
  sizeGB,
}));
|
||||
|
||||
// Every recorded benchmark result, in the order it was produced; dumped to the results JSON.
const ALL = [];
// Handle of the llama-server child process currently running, or null when none is.
let proc = null;

/** Print a message prefixed with the current wall-clock time (24-hour, ko-KR formatting). */
const log = (message) => {
  const stamp = new Date().toLocaleTimeString("ko-KR", { hour12: false });
  console.log(`[${stamp}] ${message}`);
};

/** Resolve after `ms` milliseconds. */
const sleep = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
|
||||
|
||||
/**
 * Stop the benchmark server and wait for the system to settle.
 *
 * Kills the tracked child process (if any), then force-kills every
 * llama-server.exe via taskkill, and finally pauses for five seconds.
 * Both kill attempts are best effort: any failure is ignored.
 *
 * @returns {Promise<void>}
 */
async function kill() {
  const child = proc;
  if (child) {
    proc = null;
    try {
      child.kill("SIGKILL");
    } catch {
      // Best effort: a failure to signal the child is ignored.
    }
  }

  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // Best effort: a failing taskkill is ignored.
  }

  await sleep(5000);
}
|
||||
|
||||
/**
 * Take a snapshot of per-GPU memory usage by querying nvidia-smi.
 *
 * Rows that are blank or whose fields are not integers (for example a field
 * reported as "[N/A]") are skipped, so callers that sum `used` never see NaN.
 *
 * @returns {{gpu: number, used: number, total: number}[]} One entry per GPU row,
 *   with the index and the memory.used / memory.total values as printed by
 *   nvidia-smi with `nounits`. Empty when nvidia-smi cannot be run or times out.
 */
function vram() {
  let csv;
  try {
    csv = execSync(
      "nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits",
      { encoding: "utf-8", timeout: 5000 },
    );
  } catch {
    // No nvidia-smi (or it failed / timed out): report no GPUs instead of aborting.
    return [];
  }

  const rows = [];
  for (const line of csv.split(/\r?\n/)) {
    if (line.trim() === "") continue;
    const [gpu, used, total] = line
      .split(",")
      .map((field) => Number.parseInt(field, 10));
    // Drop malformed rows rather than letting NaN/undefined leak into totals.
    if (![gpu, used, total].every((value) => Number.isInteger(value))) continue;
    rows.push({ gpu, used, total });
  }
  return rows;
}
|
||||
|
||||
/**
 * Launch llama-server for one model/configuration and remember it in `proc`.
 *
 * All layers are requested on GPU (-ngl 999) with the script-wide context size,
 * a single slot and flash attention on; the server listens on port 8000.
 *
 * @param {string} modelPath Path to the GGUF file.
 * @param {object} p Tunables. Recognised keys:
 *   `ctk` / `ctv` (KV cache types, default "q4_0"), `ub` (default 512),
 *   `b` (default 2048), `t` (threads, also used for -tb, default 4),
 *   `cpuMoe` (flag; takes precedence over `nCpuMoe`), `nCpuMoe` (number),
 *   `splitMode`, `tensorSplit`, `noMmap`, `defragThold`, `noKvOffload`.
 *   Unknown keys are ignored.
 * @returns {import("child_process").ChildProcess} The spawned server process.
 */
function startServer(modelPath, p) {
  const args = [
    "--model", modelPath, "-ngl", "999",
    "-c", String(CTX), "-np", "1", "-fa", "on",
    "--cache-type-k", p.ctk || "q4_0",
    "--cache-type-v", p.ctv || "q4_0",
    "-ub", String(p.ub || 512), "-b", String(p.b || 2048),
    "-t", String(p.t || 4), "-tb", String(p.t || 4),
    "--prio", "3", "--poll", "50", "--mlock",
    "--port", "8000", "--host", "0.0.0.0",
  ];

  // GPU offload strategy
  if (p.cpuMoe) args.push("--cpu-moe");
  else if (p.nCpuMoe) args.push("--n-cpu-moe", String(p.nCpuMoe));

  // VRAM saving options
  if (p.splitMode) args.push("--split-mode", p.splitMode);
  if (p.tensorSplit) args.push("--tensor-split", p.tensorSplit);
  if (p.noMmap) args.push("--no-mmap");
  if (p.defragThold) args.push("--defrag-thold", String(p.defragThold));
  if (p.noKvOffload) args.push("--no-kv-offload");

  const cmdStr = args.join(" ");
  log(` CMD: ...${cmdStr.slice(-80)}`);
  proc = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });

  // The server's output is piped but nothing reads it. Left unread, the OS pipe
  // buffer eventually fills and the child blocks on write, stalling the benchmark.
  // Put both streams in flowing mode so the output is discarded as it arrives.
  proc.stdout?.resume();
  proc.stderr?.resume();

  return proc;
}
|
||||
|
||||
/**
 * Poll the server's /health endpoint until it reports `status: "ok"`.
 *
 * Polls every 3 seconds (each request itself limited to 3 seconds). Gives up
 * when `timeout` elapses, or immediately once the tracked server process has
 * already exited (for example because the model did not fit), since a dead
 * process can never become healthy and waiting out the full timeout would only
 * waste time.
 *
 * @param {number} [timeout=BOOT_TIMEOUT] Maximum wait in milliseconds.
 * @returns {Promise<{ok: boolean, boot: number}>} `ok` tells whether the server
 *   became healthy; `boot` is the elapsed time in seconds (on a timeout it is
 *   `timeout / 1000`).
 */
async function waitReady(timeout = BOOT_TIMEOUT) {
  const startedAt = Date.now();
  const elapsedSec = () => (Date.now() - startedAt) / 1000;

  while (Date.now() - startedAt < timeout) {
    // exitCode / signalCode stay null while the child is still running.
    if (proc && (proc.exitCode != null || proc.signalCode != null)) {
      return { ok: false, boot: elapsedSec() };
    }

    try {
      const res = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const body = await res.json();
      if (body.status === "ok") return { ok: true, boot: elapsedSec() };
    } catch {
      // Not reachable / not ready yet: keep polling.
    }
    await sleep(3000);
  }
  return { ok: false, boot: timeout / 1000 };
}
|
||||
|
||||
/**
 * Send one chat-completion request and measure generation throughput.
 *
 * Throughput is completion tokens divided by the wall-clock time of the whole
 * request (deterministic sampling, temperature 0).
 *
 * @param {number} [n=TOKENS] Value sent as `max_tokens`.
 * @returns {Promise<{tps: number, ct: number, dt: number}>} Tokens per second,
 *   completion token count and elapsed seconds.
 * @throws {Error} If the request fails or times out (10 minutes), the server
 *   answers with a non-2xx status, or the response reports no completion tokens.
 *   Without these checks an error response would be recorded as a valid 0 t/s
 *   sample and silently drag the averages down.
 */
async function bench(n = TOKENS) {
  const t0 = Date.now();
  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "m",
      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
      max_tokens: n, temperature: 0,
    }),
    signal: AbortSignal.timeout(600_000),
  });
  if (!r.ok) {
    throw new Error(`HTTP ${r.status} from /v1/chat/completions`);
  }
  const d = await r.json();
  const dt = (Date.now() - t0) / 1000;
  const ct = d.usage?.completion_tokens ?? 0;
  if (!(ct > 0)) {
    throw new Error("response reported no completion tokens");
  }
  return { tps: ct / dt, ct, dt };
}
|
||||
|
||||
/**
 * Benchmark a single server configuration from cold start to shutdown.
 *
 * Restarts the server with `params`, waits for it to become healthy, records
 * GPU memory usage, runs one warm-up request followed by RUNS timed requests,
 * then stops the server. A successful result is appended to ALL and the whole
 * list is written to scripts/qwen_fullgpu_results.json.
 *
 * @param {{name: string, path: string}} model Model being tested.
 * @param {string} label Short name of this configuration, used in logs and results.
 * @param {object} params Options forwarded to startServer.
 * @returns {Promise<object|null>} The recorded result, or null when the server
 *   never became ready or every timed request failed.
 */
async function testConfig(model, label, params) {
  await kill();
  log(` [${label}] Starting...`);
  startServer(model.path, params);

  const ready = await waitReady();
  if (!ready.ok) {
    log(` [${label}] ✗ FAILED (timeout ${BOOT_TIMEOUT / 1000}s)`);
    await kill();
    return null;
  }
  const bootSec = ready.boot;

  // Memory snapshot right after load, before any request is served.
  const gpus = vram();
  let usedTotal = 0;
  const perGpu = [];
  for (const g of gpus) {
    usedTotal += g.used;
    perGpu.push(`GPU${g.gpu}:${g.used}/${g.total}`);
  }
  log(` [${label}] ✓ Boot:${bootSec.toFixed(0)}s | VRAM: ${perGpu.join(" | ")} (total: ${usedTotal} MiB)`);

  // Warm-up request; its outcome is deliberately ignored.
  try {
    await bench(20);
  } catch {
    // ignored
  }

  const samples = [];
  for (let run = 1; run <= RUNS; run += 1) {
    try {
      const outcome = await bench();
      samples.push(outcome.tps);
      log(` Run${run}: ${outcome.tps.toFixed(2)} t/s`);
    } catch (err) {
      log(` Run${run}: ERR ${err.message}`);
    }
  }
  await kill();

  if (samples.length === 0) return null;

  const mean = samples.reduce((sum, tps) => sum + tps) / samples.length;
  const peak = Math.max(...samples);
  log(` [${label}] ⇒ AVG:${mean.toFixed(2)} BEST:${peak.toFixed(2)} t/s`);

  const record = {
    model: model.name,
    label,
    avg_tps: Number(mean.toFixed(2)),
    best_tps: Number(peak.toFixed(2)),
    boot: Number(bootSec.toFixed(1)),
    vram_total: usedTotal,
    vram: gpus,
    // Store the effective settings, including the defaults startServer applies.
    params: { ...params, ngl: 999, ctk: params.ctk || "q4_0", ctv: params.ctv || "q4_0" },
    gpu_only: !(params.cpuMoe || params.nCpuMoe),
  };
  ALL.push(record);
  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
  return record;
}
|
||||
|
||||
// ─── Test Strategies ───────────────────────────────────────────
|
||||
|
||||
/**
 * Run the full tuning procedure for one model and return its best result.
 *
 * Steps, in order:
 *   1. six "pure GPU" strategies with different batch / memory options;
 *   2. an n-cpu-moe=5 fallback when no pure-GPU result is currently best;
 *   3. when a pure-GPU result is best, sweeps over threads, batch sizes and
 *      KV cache types based on that result's parameters;
 *   4. a five-run verification of the best configuration, recorded as "FINAL".
 *
 * @param {{name: string, path: string, sizeGB: number}} model Model to test.
 * @returns {Promise<object|null>} The FINAL record when verification produced
 *   at least one sample, otherwise the best record found so far, or null when
 *   the model file is missing or no configuration worked.
 */
async function testModel(model) {
  const rule = "#".repeat(65);
  log(`\n${rule}`);
  log(` ${model.name} (${model.sizeGB} GB)`);
  if (!existsSync(model.path)) {
    log(" ✗ File not found!");
    return null;
  }
  log(`${rule}`);

  let best = null;
  // Keep whichever successful result has the highest average throughput.
  const keepIfBetter = (candidate) => {
    if (!candidate) return;
    if (!best || candidate.avg_tps > best.avg_tps) best = candidate;
  };
  const tryConfig = async (label, params) => {
    keepIfBetter(await testConfig(model, label, params));
  };

  // Pure-GPU strategies, tried in this order.
  const strategies = [
    { title: "Strategy 1: Pure GPU (default)", label: "pure-GPU default",
      params: { t: 4, ub: 512, b: 2048 } },
    { title: "Strategy 2: Pure GPU, minimal batch", label: "pure-GPU minbatch",
      params: { t: 4, ub: 64, b: 256 } },
    { title: "Strategy 3: Pure GPU + no-mmap + small batch", label: "pure-GPU nommap small",
      params: { t: 4, ub: 128, b: 512, noMmap: true } },
    { title: "Strategy 4: Pure GPU + split-mode row", label: "pure-GPU row-split",
      params: { t: 4, ub: 128, b: 512, splitMode: "row" } },
    { title: "Strategy 5: Pure GPU + tensor-split 0.5,0.5", label: "pure-GPU ts=0.5,0.5",
      params: { t: 4, ub: 128, b: 512, tensorSplit: "0.5,0.5" } },
    { title: "Strategy 6: Pure GPU ALL tricks", label: "pure-GPU all-tricks",
      params: { t: 4, ub: 64, b: 256, noMmap: true, defragThold: 0.1 } },
  ];
  for (const { title, label, params } of strategies) {
    log(`\n ── ${title} ──`);
    await tryConfig(label, params);
  }

  // Fallback: partial CPU offload of the MoE layers.
  if (!(best && best.gpu_only)) {
    log(`\n ── Fallback: n-cpu-moe=5 ──`);
    await tryConfig("n-cpu-moe=5 baseline", { t: 4, ub: 256, b: 1024, nCpuMoe: 5 });
  }

  // Fine-tuning around the best pure-GPU configuration.
  if (best?.gpu_only) {
    log(`\n ── Pure GPU succeeded! Fine-tuning... ──`);
    // Every sweep varies the parameters of the result that was best at this point.
    const base = best.params;

    for (const t of [2, 6, 8]) {
      if (t !== base.t) {
        await tryConfig(`tune t=${t}`, { ...base, t });
      }
    }

    for (const [ub, b] of [[256, 1024], [512, 2048], [256, 2048]]) {
      const sameAsBase = ub === base.ub && b === base.b;
      if (!sameAsBase) {
        await tryConfig(`tune ub=${ub} b=${b}`, { ...base, ub, b });
      }
    }

    // Higher-precision KV cache, in case there is VRAM to spare.
    for (const [ctk, ctv] of [["q8_0", "q8_0"], ["f16", "f16"]]) {
      await tryConfig(`tune kv=${ctk}/${ctv}`, { ...base, ctk, ctv });
    }
  }

  // Final verification of the winner.
  if (!best) return best;

  log(`\n ── Final verification (5 runs) ──`);
  await kill();
  startServer(model.path, best.params);
  const ready = await waitReady();
  if (!ready.ok) {
    await kill();
    return best;
  }

  const gpus = vram();
  // Warm-up request; its outcome is deliberately ignored.
  try {
    await bench(20);
  } catch {
    // ignored
  }

  const finals = [];
  for (let run = 1; run <= 5; run += 1) {
    try {
      const outcome = await bench();
      finals.push(outcome.tps);
      log(` Final ${run}: ${outcome.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Final ${run}: ERR`);
    }
  }
  await kill();

  if (finals.length === 0) {
    await kill();
    return best;
  }

  const mean = finals.reduce((sum, tps) => sum + tps) / finals.length;
  const peak = Math.max(...finals);
  log(` ★ FINAL: AVG ${mean.toFixed(2)} | BEST ${peak.toFixed(2)} t/s`);

  const final = {
    model: model.name,
    label: "FINAL",
    avg_tps: Number(mean.toFixed(2)),
    best_tps: Number(peak.toFixed(2)),
    boot: Number(ready.boot.toFixed(1)),
    vram_total: gpus.reduce((sum, g) => sum + g.used, 0),
    vram: gpus,
    params: best.params,
    gpu_only: best.gpu_only,
  };
  ALL.push(final);
  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
  return final;
}
|
||||
|
||||
// ─── Main ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Script entry point: benchmark every model in MODELS, rank the per-model
 * winners by average throughput, and write the summary text and results JSON
 * under scripts/.
 *
 * The per-winner settings line lists every option startServer acts on
 * (including defrag-thold, no-kv-offload and cpu-moe), so a winning
 * configuration can be reproduced from the summary alone.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const t0 = Date.now();
  log("=".repeat(65));
  log(" QWEN3.5 FULL-GPU CHALLENGE — 70 t/s TARGET");
  log(" 2x RTX 3060 (24GB) | 256K Context");
  log(" " + new Date().toISOString());
  log("=".repeat(65));
  vram().forEach(g => log(` GPU${g.gpu}: ${g.used}/${g.total} MiB`));

  const winners = [];
  for (const model of MODELS) {
    const w = await testModel(model);
    if (w) winners.push(w);
  }

  // ─── Summary ──────────────────────────────────────────────
  const elapsed = ((Date.now() - t0) / 60000).toFixed(1);
  winners.sort((a, b) => b.avg_tps - a.avg_tps);

  // Optional switches of a configuration, mirroring what startServer passes on.
  const describeFlags = (p) => {
    const flags = [];
    if (p.splitMode) flags.push(`split=${p.splitMode}`);
    if (p.tensorSplit) flags.push(`ts=${p.tensorSplit}`);
    if (p.noMmap) flags.push("no-mmap");
    if (p.defragThold) flags.push(`defrag-thold=${p.defragThold}`);
    if (p.noKvOffload) flags.push("no-kv-offload");
    // Same precedence as startServer: cpuMoe wins over nCpuMoe.
    if (p.cpuMoe) flags.push("cpu-moe");
    else if (p.nCpuMoe) flags.push(`n-cpu-moe=${p.nCpuMoe}`);
    return flags.join(" ");
  };

  const lines = [
    `Qwen3.5 Full-GPU Challenge — ${new Date().toISOString()}`,
    `2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`,
    "", "=".repeat(55), " RANKING", "=".repeat(55),
  ];
  for (let i = 0; i < winners.length; i++) {
    const w = winners[i];
    const p = w.params;
    const gpu = w.gpu_only ? "★ FULL GPU" : "⚠ CPU offload";
    lines.push("", ` #${i+1}: ${w.model} [${gpu}]`);
    lines.push(` AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`);
    lines.push(` VRAM: ${w.vram_total} MiB total`);
    lines.push(` t=${p.t} ub=${p.ub} b=${p.b} kv=${p.ctk}/${p.ctv} ${describeFlags(p)}`);
  }

  if (winners.length > 0) {
    const c = winners[0];
    lines.push("", "=".repeat(55));
    lines.push(` ★ CHAMPION: ${c.model} — ${c.avg_tps} t/s [${c.gpu_only?"FULL GPU":"CPU offload"}]`);
    lines.push("=".repeat(55));
  }

  const summary = lines.join("\n");
  console.log("\n" + summary);
  writeFileSync("scripts/qwen_fullgpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
  log(`\n Saved: qwen_fullgpu_results.json + qwen_fullgpu_summary.txt`);
  log(" DONE!");
  await kill();
}
|
||||
|
||||
// Run the benchmark. On a fatal error, stop any llama-server that was started
// before exiting, so it does not stay resident and keep holding GPU memory.
main().catch(async (e) => {
  console.error("FATAL:", e);
  try {
    await kill();
  } finally {
    process.exit(1);
  }
});
|
||||
Reference in New Issue
Block a user