import { exec, spawn } from 'child_process';

/**
 * Resolves after `ms` milliseconds.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const delay = ms => new Promise(res => setTimeout(res, ms));

/**
 * Force-kills every running llama-server.exe through `taskkill`.
 * Always resolves: taskkill reports an error when no such process exists,
 * and that is an expected situation here, so the error is ignored on purpose.
 * @returns {Promise<void>}
 */
const killServer = () =>
  new Promise(done => exec('taskkill /F /IM llama-server.exe', () => done()));

/**
 * Runs a shell command and collects its output.
 * Never rejects; a failure is reported through the `err` field.
 * @param {string} command - Command line passed to `exec`.
 * @returns {Promise<{err: Error|null, stdout: string, stderr: string}>}
 */
const runCommand = command =>
  new Promise(done =>
    exec(command, (err, stdout, stderr) => done({ err, stdout, stderr })),
  );

/**
 * Polls the server health endpoint until it answers with HTTP 200.
 * Makes at most 40 attempts, each limited to 2 seconds, with a 3 second
 * pause after every unsuccessful attempt.
 * @param {() => boolean} hasServerStopped - Returns true once the spawned
 *   server can no longer become ready (failed to start or already exited).
 * @returns {Promise<boolean>} true when the server reported healthy.
 */
async function waitForServer(hasServerStopped) {
  for (let attempt = 0; attempt < 40; attempt++) {
    // No point in polling a process that is already gone.
    if (hasServerStopped()) {
      return false;
    }
    try {
      // fetch() has no `timeout` option; an abort signal enforces the limit.
      const res = await fetch('http://127.0.0.1:8000/health', {
        signal: AbortSignal.timeout(2000),
      });
      if (res.status === 200) {
        return true;
      }
    } catch (e) {
      // Connection refused / timed out: expected while the server is still
      // starting, so keep polling.
    }
    await delay(3000);
  }
  return false;
}

/**
 * Starts llama-server.exe with the given arguments, waits until it is
 * healthy, runs `scripts/quick_pptest.mjs` against it and stops the server.
 *
 * @param {string} modelArgs - Whitespace-separated server arguments.
 *   Individual arguments containing spaces are not supported.
 * @param {Object<string, string>} envVars - Extra environment variables,
 *   layered on top of `process.env`.
 * @param {string} name - Label used in the log output.
 * @returns {Promise<{success: boolean}>} `success` is false when the server
 *   did not become ready or the benchmark command reported an error.
 */
async function runTest(modelArgs, envVars, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Args: ${modelArgs}`);

  // Make sure no server left over from a previous run is still holding the port.
  await killServer();
  await delay(2000);

  const env = { ...process.env, ...envVars };
  // Split on whitespace runs so doubled spaces do not create empty arguments.
  const args = modelArgs.split(/\s+/).filter(Boolean);
  const server = spawn('llama_bin_run\\llama-server.exe', args, {
    detached: true,
    stdio: 'ignore',
    env,
  });

  let serverStopped = false;
  // Without an 'error' listener a failed spawn (e.g. missing binary) would
  // surface as an uncaught exception and abort the whole script.
  server.on('error', err => {
    serverStopped = true;
    console.log(`[${name}] Could not start server: ${err.message}`);
  });
  server.on('exit', () => {
    serverStopped = true;
  });

  const ready = await waitForServer(() => serverStopped);
  if (!ready) {
    console.log(`[${name}] FAILED TO BOOT`);
    await killServer();
    return { success: false };
  }

  console.log(`[${name}] Server Ready! Running benchmark...`);
  const { err, stdout, stderr } = await runCommand('node scripts/quick_pptest.mjs');
  console.log(stdout || stderr);
  if (err) {
    console.log(`[${name}] Benchmark reported an error: ${err.message}`);
  }

  await killServer();
  return { success: !err };
}

/**
 * Runs the benchmark sequentially for each `--n-cpu-moe` configuration.
 * @returns {Promise<void>}
 */
async function main() {
  const baseLineArgs = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 8192 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 256 -b 1024 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // 1. Dual GPU, Split Mode Layer, Maximize VRAM usage (estimate 32 layers offloaded out of 48)
  await runTest(`${baseLineArgs} --split-mode layer --n-cpu-moe 32`, {}, "122B: n-cpu-moe 32 + sm layer");

  // 2. Dual GPU, Split Mode Layer, even more aggressive GPU usage
  await runTest(`${baseLineArgs} --split-mode layer --n-cpu-moe 28`, {}, "122B: n-cpu-moe 28 + sm layer");

  // 3. Fallback to 36 if OOM happens on 32/28
  await runTest(`${baseLineArgs} --split-mode layer --n-cpu-moe 36`, {}, "122B: n-cpu-moe 36 + sm layer");

  console.log("\nALL TESTS COMPLETED");
}

main().catch(err => {
  // Surface unexpected failures instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});