/**
 * Qwen3.5 Full-GPU Challenge — extreme VRAM optimization benchmark
 * ================================================================
 * Goal: run Qwen3.5-35B-A3B 100% on GPU on dual RTX 3060s (24 GB total).
 *
 * Test models:
 *   1. UD-IQ4_NL (16.6 GB) — definitely fits, baseline
 *   2. MXFP4_MOE (20.1 GB) — the challenge: extreme VRAM optimization
 *   3. Q4_K_M    (20.5 GB) — control group (n-cpu-moe=5)
 *
 * VRAM-saving strategies:
 *   A. Minimal batch: -ub 64 -b 256 (smaller computation buffer)
 *   B. split-mode row (more even distribution across GPUs)
 *   C. Manual tensor-split balancing
 *   D. no-mmap (memory-management optimization)
 *   E. defrag-thold (prevents KV cache fragmentation)
 *
 * Run: node scripts/qwen_fullgpu_challenge.mjs
 */

import { spawn, execSync } from "child_process";
import { writeFileSync, existsSync, statSync } from "fs";

const BASE_URL = "http://127.0.0.1:8000";
const LLAMA = String.raw`llama_bin_run\llama-server.exe`;
const CTX = 262144;
const RUNS = 3;
const FINAL_RUNS = 5;
const TOKENS = 200;
const BOOT_TIMEOUT = 300_000;
const RESULTS_PATH = "scripts/qwen_fullgpu_results.json";
const SUMMARY_PATH = "scripts/qwen_fullgpu_summary.txt";

const MODELS = [
  {
    name: "Qwen3.5 UD-IQ4_NL",
    path: String.raw`models\Qwen3.5-35B-A3B-UD-IQ4_NL.gguf`,
    sizeGB: 16.6,
  },
  {
    name: "Qwen3.5 MXFP4_MOE",
    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
    sizeGB: 20.11,
  },
  {
    name: "Qwen3.5 Q4_K_M",
    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
    sizeGB: 20.5,
  },
];

// Pure-GPU configurations tried, in order, for every model.
const PURE_GPU_STRATEGIES = [
  {
    title: "Strategy 1: Pure GPU (default)",
    label: "pure-GPU default",
    params: { t: 4, ub: 512, b: 2048 },
  },
  {
    title: "Strategy 2: Pure GPU, minimal batch",
    label: "pure-GPU minbatch",
    params: { t: 4, ub: 64, b: 256 },
  },
  {
    title: "Strategy 3: Pure GPU + no-mmap + small batch",
    label: "pure-GPU nommap small",
    params: { t: 4, ub: 128, b: 512, noMmap: true },
  },
  {
    title: "Strategy 4: Pure GPU + split-mode row",
    label: "pure-GPU row-split",
    params: { t: 4, ub: 128, b: 512, splitMode: "row" },
  },
  {
    title: "Strategy 5: Pure GPU + tensor-split 0.5,0.5",
    label: "pure-GPU ts=0.5,0.5",
    params: { t: 4, ub: 128, b: 512, tensorSplit: "0.5,0.5" },
  },
  {
    title: "Strategy 6: Pure GPU ALL tricks",
    label: "pure-GPU all-tricks",
    params: { t: 4, ub: 64, b: 256, noMmap: true, defragThold: 0.1 },
  },
];

/** Every result recorded so far; rewritten to RESULTS_PATH after each config. */
const ALL = [];

/** Handle of the llama-server child started by startServer(), or null. */
let proc = null;

/** Print a message prefixed with the current 24-hour wall-clock time. */
const log = (m) =>
  console.log(`[${new Date().toLocaleTimeString("ko-KR", { hour12: false })}] ${m}`);

/** Resolve after `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/** Write the accumulated results array to RESULTS_PATH as pretty-printed JSON. */
function saveResults() {
  writeFileSync(RESULTS_PATH, JSON.stringify(ALL, null, 2));
}

/**
 * Stop the tracked server process and any other llama-server.exe instance,
 * then pause for five seconds before returning.
 *
 * Never throws: both kill attempts are best-effort.
 */
async function kill() {
  if (proc) {
    try {
      proc.kill("SIGKILL");
    } catch {
      // Best-effort: the process may already be gone.
    }
    proc = null;
  }
  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // Best-effort: ignore failures (e.g. no matching process).
  }
  // NOTE(review): presumably gives the GPUs time to release memory before the
  // next launch — confirm the required delay.
  await sleep(5000);
}

/**
 * Query per-GPU memory usage through nvidia-smi.
 *
 * @returns {{gpu: number, used: number, total: number}[]} One entry per output
 *   line (values are logged as MiB elsewhere in this script); an empty array
 *   when nvidia-smi fails or times out.
 */
function vram() {
  try {
    const out = execSync(
      "nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits",
      { encoding: "utf-8", timeout: 5000 },
    );
    return out
      .trim()
      .split("\n")
      .map((line) => {
        const [gpu, used, total] = line.split(",").map((s) => Number.parseInt(s, 10));
        return { gpu, used, total };
      });
  } catch {
    return [];
  }
}

/**
 * Launch llama-server for `modelPath` with the given tuning parameters and
 * store the child handle in the module-level `proc`.
 *
 * @param {string} modelPath - Path to the GGUF file.
 * @param {object} p - Tuning parameters: `ctk`, `ctv`, `ub`, `b`, `t`, `cpuMoe`,
 *   `nCpuMoe`, `splitMode`, `tensorSplit`, `noMmap`, `defragThold`,
 *   `noKvOffload`. Unknown keys are ignored.
 * @returns {import("child_process").ChildProcess} The spawned process.
 */
function startServer(modelPath, p) {
  const args = [
    "--model", modelPath,
    "-ngl", "999",
    "-c", String(CTX),
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", p.ctk || "q4_0",
    "--cache-type-v", p.ctv || "q4_0",
    "-ub", String(p.ub || 512),
    "-b", String(p.b || 2048),
    "-t", String(p.t || 4),
    "-tb", String(p.t || 4),
    "--prio", "3",
    "--poll", "50",
    "--mlock",
    "--port", "8000",
    "--host", "0.0.0.0",
  ];

  // GPU offload strategy
  if (p.cpuMoe) args.push("--cpu-moe");
  else if (p.nCpuMoe) args.push("--n-cpu-moe", String(p.nCpuMoe));

  // VRAM saving options
  if (p.splitMode) args.push("--split-mode", p.splitMode);
  if (p.tensorSplit) args.push("--tensor-split", p.tensorSplit);
  if (p.noMmap) args.push("--no-mmap");
  if (p.defragThold) args.push("--defrag-thold", String(p.defragThold));
  if (p.noKvOffload) args.push("--no-kv-offload");

  const cmdStr = args.join(" ");
  log(` CMD: ...${cmdStr.slice(-80)}`);

  proc = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });
  // The output is piped but never read here. Discard it continuously so the
  // child cannot stall on a full pipe buffer while it logs.
  proc.stdout?.resume();
  proc.stderr?.resume();
  return proc;
}

/**
 * Poll the server's /health endpoint every three seconds until it reports
 * `status === "ok"`, the tracked process exits, or `timeout` elapses.
 *
 * @param {number} [timeout=BOOT_TIMEOUT] - Maximum wait in milliseconds.
 * @returns {Promise<{ok: boolean, boot: number, reason?: string}>} `boot` is
 *   the elapsed time in seconds (the full timeout on a timeout). On failure
 *   `reason` is `"exited"` or `"timeout"`.
 */
async function waitReady(timeout = BOOT_TIMEOUT) {
  const t0 = Date.now();
  const elapsed = () => (Date.now() - t0) / 1000;

  while (Date.now() - t0 < timeout) {
    // A server that already died will never become healthy; stop waiting.
    if (proc && (proc.exitCode !== null || proc.signalCode !== null)) {
      return { ok: false, boot: elapsed(), reason: "exited" };
    }
    try {
      const r = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const d = await r.json();
      if (d.status === "ok") return { ok: true, boot: elapsed() };
    } catch {
      // Not accepting connections yet (or still loading) — keep polling.
    }
    await sleep(3000);
  }
  return { ok: false, boot: timeout / 1000, reason: "timeout" };
}

/**
 * Send one chat-completion request and measure end-to-end generation speed.
 *
 * @param {number} [n=TOKENS] - `max_tokens` for the request.
 * @returns {Promise<{tps: number, ct: number, dt: number}>} Tokens per second,
 *   completion token count, and wall-clock seconds for the whole request.
 * @throws {Error} On a non-2xx response, a request timeout, or when the
 *   response reports no completion tokens (such a run carries no timing
 *   information and must not be averaged in as 0 t/s).
 */
async function bench(n = TOKENS) {
  const t0 = Date.now();
  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "m",
      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
      max_tokens: n,
      temperature: 0,
    }),
    signal: AbortSignal.timeout(600_000),
  });
  if (!r.ok) throw new Error(`HTTP ${r.status} ${r.statusText}`);

  const d = await r.json();
  const dt = (Date.now() - t0) / 1000;
  const ct = d.usage?.completion_tokens || 0;
  if (ct === 0) throw new Error("no completion tokens reported");
  return { tps: ct / dt, ct, dt };
}

/**
 * Run `count` benchmark requests against the running server, logging each one.
 *
 * @param {number} count - Number of requests to attempt.
 * @param {string} tag - Log prefix placed directly before the 1-based index.
 * @returns {Promise<number[]>} Tokens-per-second of the successful runs only.
 */
async function collectSpeeds(count, tag) {
  const speeds = [];
  for (let i = 0; i < count; i++) {
    const name = `${tag}${i + 1}`;
    try {
      const { tps } = await bench();
      speeds.push(tps);
      log(` ${name}: ${tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` ${name}: ERR ${e.message}`);
    }
  }
  return speeds;
}

/** Average and maximum of a non-empty list of speeds. */
function summarize(speeds) {
  const avg = speeds.reduce((a, b) => a + b, 0) / speeds.length;
  return { avg, best: Math.max(...speeds) };
}

/**
 * Benchmark one model/parameter combination: restart the server, wait for it
 * to become healthy, sample VRAM, warm up, run RUNS timed requests, and record
 * the result in ALL and on disk.
 *
 * @param {{name: string, path: string}} model - Entry from MODELS.
 * @param {string} label - Human-readable configuration name.
 * @param {object} params - Parameters forwarded to startServer().
 * @returns {Promise<object|null>} The recorded result, or null when the server
 *   never became ready or every timed run failed.
 */
async function testConfig(model, label, params) {
  await kill();
  log(` [${label}] Starting...`);
  startServer(model.path, params);

  const { ok, boot, reason } = await waitReady();
  if (!ok) {
    const why =
      reason === "exited"
        ? `server exited after ${boot.toFixed(0)}s`
        : `timeout ${BOOT_TIMEOUT / 1000}s`;
    log(` [${label}] ✗ FAILED (${why})`);
    await kill();
    return null;
  }

  const v = vram();
  const totalUsed = v.reduce((a, g) => a + g.used, 0);
  const vs = v.map((g) => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | ");
  log(` [${label}] ✓ Boot:${boot.toFixed(0)}s | VRAM: ${vs} (total: ${totalUsed} MiB)`);

  try {
    await bench(20); // warmup
  } catch {
    // A failed warmup is not fatal; the timed runs report their own errors.
  }

  const speeds = await collectSpeeds(RUNS, "Run");
  await kill();
  if (!speeds.length) return null;

  const { avg, best } = summarize(speeds);
  log(` [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);

  const res = {
    model: model.name,
    label,
    avg_tps: +avg.toFixed(2),
    best_tps: +best.toFixed(2),
    boot: +boot.toFixed(1),
    vram_total: totalUsed,
    vram: v,
    params: { ...params, ngl: 999, ctk: params.ctk || "q4_0", ctv: params.ctv || "q4_0" },
    gpu_only: !params.cpuMoe && !params.nCpuMoe,
  };
  ALL.push(res);
  saveResults();
  return res;
}

/**
 * Re-run the winning configuration with FINAL_RUNS timed requests and record
 * it under the label "FINAL".
 *
 * @param {{name: string, path: string}} model - Entry from MODELS.
 * @param {object} best - Best result produced by testConfig().
 * @returns {Promise<object|null>} The FINAL record, or null when the server did
 *   not come up or no run succeeded.
 */
async function verifyBest(model, best) {
  log(`\n ── Final verification (${FINAL_RUNS} runs) ──`);
  await kill();
  startServer(model.path, best.params);

  const { ok, boot } = await waitReady();
  if (!ok) {
    await kill();
    return null;
  }

  const v = vram();
  try {
    await bench(20); // warmup
  } catch {
    // A failed warmup is not fatal; the timed runs report their own errors.
  }
  const finals = await collectSpeeds(FINAL_RUNS, "Final ");
  await kill();
  if (!finals.length) return null;

  const { avg, best: bst } = summarize(finals);
  log(` ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${bst.toFixed(2)} t/s`);

  const final = {
    model: model.name,
    label: "FINAL",
    avg_tps: +avg.toFixed(2),
    best_tps: +bst.toFixed(2),
    boot: +boot.toFixed(1),
    vram_total: v.reduce((a, g) => a + g.used, 0),
    vram: v,
    params: best.params,
    gpu_only: best.gpu_only,
  };
  ALL.push(final);
  saveResults();
  return final;
}

// ─── Test Strategies ───────────────────────────────────────────

/**
 * Run the full strategy ladder for one model: the pure-GPU strategies, a
 * CPU-offload fallback when none of them worked, parameter sweeps around the
 * best pure-GPU configuration, and a final verification pass.
 *
 * @param {{name: string, path: string, sizeGB: number}} model - Entry from MODELS.
 * @returns {Promise<object|null>} The FINAL record when verification succeeds,
 *   otherwise the best sweep result, or null when the model file is missing or
 *   no configuration produced a result.
 */
async function testModel(model) {
  log(`\n${"#".repeat(65)}`);
  log(` ${model.name} (${model.sizeGB} GB)`);
  if (!existsSync(model.path)) {
    log(" ✗ File not found!");
    return null;
  }
  log(`${"#".repeat(65)}`);

  let best = null;
  const update = (r) => {
    if (r && (!best || r.avg_tps > best.avg_tps)) best = r;
  };

  for (const { title, label, params } of PURE_GPU_STRATEGIES) {
    log(`\n ── ${title} ──`);
    update(await testConfig(model, label, params));
  }

  // ── Fallback: n-cpu-moe=5 baseline ──
  if (!best || !best.gpu_only) {
    log(`\n ── Fallback: n-cpu-moe=5 ──`);
    update(
      await testConfig(model, "n-cpu-moe=5 baseline", { t: 4, ub: 256, b: 1024, nCpuMoe: 5 }),
    );
  }

  // ── If pure GPU worked, tune batch/thread/kv ──
  if (best && best.gpu_only) {
    log(`\n ── Pure GPU succeeded! Fine-tuning... ──`);
    // Every sweep varies one dimension around this snapshot, even if an
    // earlier sweep step has already replaced `best`.
    const bp = best.params;

    // Thread sweep
    for (const t of [2, 6, 8]) {
      if (t === bp.t) continue;
      update(await testConfig(model, `tune t=${t}`, { ...bp, t }));
    }

    // Batch sweep
    for (const [ub, b] of [[256, 1024], [512, 2048], [256, 2048]]) {
      if (ub === bp.ub && b === bp.b) continue;
      update(await testConfig(model, `tune ub=${ub} b=${b}`, { ...bp, ub, b }));
    }

    // KV cache upgrade (extra VRAM available?)
    for (const [ctk, ctv] of [["q8_0", "q8_0"], ["f16", "f16"]]) {
      update(await testConfig(model, `tune kv=${ctk}/${ctv}`, { ...bp, ctk, ctv }));
    }
  }

  // ── Final verification ──
  if (best) {
    const final = await verifyBest(model, best);
    if (final) return final;
  }
  return best;
}

// ─── Main ──────────────────────────────────────────────────────

/**
 * Benchmark every model in MODELS, then print and save a ranking of the
 * per-model winners.
 */
async function main() {
  const t0 = Date.now();
  log("=".repeat(65));
  log(" QWEN3.5 FULL-GPU CHALLENGE — 70 t/s TARGET");
  log(" 2x RTX 3060 (24GB) | 256K Context");
  log(" " + new Date().toISOString());
  log("=".repeat(65));
  vram().forEach((g) => log(` GPU${g.gpu}: ${g.used}/${g.total} MiB`));

  const winners = [];
  for (const model of MODELS) {
    const w = await testModel(model);
    if (w) winners.push(w);
  }

  // ─── Summary ──────────────────────────────────────────────
  const elapsed = ((Date.now() - t0) / 60000).toFixed(1);
  winners.sort((a, b) => b.avg_tps - a.avg_tps);

  const lines = [
    `Qwen3.5 Full-GPU Challenge — ${new Date().toISOString()}`,
    `2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`,
    "",
    "=".repeat(55),
    " RANKING",
    "=".repeat(55),
  ];

  winners.forEach((w, i) => {
    const p = w.params;
    const gpu = w.gpu_only ? "★ FULL GPU" : "⚠ CPU offload";
    lines.push("", ` #${i + 1}: ${w.model} [${gpu}]`);
    lines.push(` AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`);
    lines.push(` VRAM: ${w.vram_total} MiB total`);

    const flags = [];
    if (p.splitMode) flags.push(`split=${p.splitMode}`);
    if (p.tensorSplit) flags.push(`ts=${p.tensorSplit}`);
    if (p.noMmap) flags.push("no-mmap");
    if (p.defragThold) flags.push(`defrag=${p.defragThold}`);
    if (p.nCpuMoe) flags.push(`n-cpu-moe=${p.nCpuMoe}`);
    lines.push(` t=${p.t} ub=${p.ub} b=${p.b} kv=${p.ctk}/${p.ctv} ${flags.join(" ")}`);
  });

  if (winners.length > 0) {
    const c = winners[0];
    lines.push("", "=".repeat(55));
    lines.push(
      ` ★ CHAMPION: ${c.model} — ${c.avg_tps} t/s [${c.gpu_only ? "FULL GPU" : "CPU offload"}]`,
    );
    lines.push("=".repeat(55));
  }

  const summary = lines.join("\n");
  console.log("\n" + summary);
  writeFileSync(SUMMARY_PATH, summary, "utf-8");
  saveResults();
  log(`\n Saved: qwen_fullgpu_results.json + qwen_fullgpu_summary.txt`);
  log(" DONE!");
  await kill();
}

main().catch(async (e) => {
  console.error("FATAL:", e);
  // Do not leave a server running behind a crashed benchmark.
  await kill();
  process.exit(1);
});