import { exec, spawn } from 'child_process';

// Paths, commands and timings shared by every benchmark run.
const SERVER_BINARY = 'llama_bin_run\\llama-server.exe';
const KILL_SERVER_COMMAND = 'taskkill /F /IM llama-server.exe';
const BENCHMARK_COMMAND = 'node scripts/quick_pptest.mjs';
const HEALTH_URL = 'http://127.0.0.1:8000/health';
const HEALTH_TIMEOUT_MS = 2000;
const HEALTH_ATTEMPTS = 40;
const HEALTH_INTERVAL_MS = 3000;
const SETTLE_DELAY_MS = 2000;

/**
 * Resolve after `ms` milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
const delay = (ms) => new Promise((res) => setTimeout(res, ms));

/**
 * Run a shell command and always resolve (never reject) with its outcome, so
 * best-effort commands such as `taskkill` cannot abort the tuning run.
 * @param {string} command
 * @returns {Promise<{err: Error|null, stdout: string, stderr: string}>}
 */
function run(command) {
  return new Promise((resolve) => {
    exec(command, (err, stdout, stderr) => resolve({ err, stdout, stderr }));
  });
}

/**
 * Split a space-separated argument string into an argv array.
 * Runs of whitespace never produce empty arguments. Quoting is not
 * interpreted, so individual arguments must not contain spaces.
 * @param {string} modelArgs
 * @returns {string[]}
 */
function splitArgs(modelArgs) {
  return String(modelArgs).split(/\s+/).filter(Boolean);
}

/**
 * Extract throughput figures from the benchmark script's output.
 * `tg` is the TG rate on the `TG-500` line, `pp` is the PP rate on the
 * `10K-CODE` line; either is 0 when its line is not found.
 * @param {string} stdout
 * @returns {{tg: number, pp: number}}
 */
function parseBenchmarkOutput(stdout) {
  const text = String(stdout ?? '');
  // The PP rate on the TG-500 line accepts one or more decimals, same as the captured TG rate.
  const tgMatch = text.match(/TG-500 \| PP:\d+tok \d+\.\d+t\/s \| TG:\d+tok (\d+\.\d+)t\/s/);
  const ppMatch = text.match(/10K-CODE \| PP:\d+tok (\d+\.\d+)t\/s/);
  return {
    tg: tgMatch ? Number.parseFloat(tgMatch[1]) : 0,
    pp: ppMatch ? Number.parseFloat(ppMatch[1]) : 0,
  };
}

/**
 * Poll the health endpoint until it answers HTTP 200.
 * Gives up after HEALTH_ATTEMPTS polls, or as soon as `hasExited()` reports
 * that the server process is gone.
 * @param {() => boolean} hasExited
 * @returns {Promise<boolean>} true when the server became ready
 */
async function waitForHealthy(hasExited) {
  for (let attempt = 0; attempt < HEALTH_ATTEMPTS; attempt++) {
    if (hasExited()) {
      return false;
    }
    try {
      // fetch has no `timeout` option; an abort signal is what enforces the limit.
      const res = await fetch(HEALTH_URL, { signal: AbortSignal.timeout(HEALTH_TIMEOUT_MS) });
      if (res.status === 200) {
        return true;
      }
    } catch {
      // Expected while the server is still starting: refused connection or timeout.
    }
    await delay(HEALTH_INTERVAL_MS);
  }
  return false;
}

/**
 * Start the server with the given arguments, wait for it to become healthy,
 * run the benchmark script and report the measured throughput.
 * Any running llama-server.exe is killed before the run and again afterwards.
 *
 * @param {string} modelArgs - Space-separated server command-line arguments.
 * @param {string} name - Label used in log output.
 * @returns {Promise<{success: boolean, tg?: number, pp?: number}>}
 *   `success` is false when the server never became ready or the benchmark
 *   command failed; `tg`/`pp` are present whenever the benchmark ran.
 */
async function runTest(modelArgs, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Args: ${modelArgs}`);

  // Kill any existing server and give it time to go away.
  await run(KILL_SERVER_COMMAND);
  await delay(SETTLE_DELAY_MS);

  const server = spawn(SERVER_BINARY, splitArgs(modelArgs), { detached: true, stdio: 'ignore' });
  let serverGone = false;
  // Without an 'error' listener a missing binary would crash the whole script.
  server.once('error', (err) => {
    serverGone = true;
    console.error(`[${name}] Could not start server: ${err.message}`);
  });
  server.once('exit', () => {
    serverGone = true;
  });

  try {
    const ready = await waitForHealthy(() => serverGone);
    if (!ready) {
      console.log(`[${name}] FAILED TO BOOT (Likely OOM)`);
      return { success: false };
    }

    console.log(`[${name}] Server Ready! Running benchmark...`);
    const { err, stdout, stderr } = await run(BENCHMARK_COMMAND);
    console.log(stdout || stderr);
    if (err) {
      console.error(`[${name}] Benchmark command failed: ${err.message}`);
    }
    const { tg, pp } = parseBenchmarkOutput(stdout);
    return { success: !err, tg, pp };
  } finally {
    // Always stop the server, even when polling or the benchmark throws.
    await run(KILL_SERVER_COMMAND);
  }
}

/**
 * Run every tuning configuration sequentially and print a summary.
 * @returns {Promise<void>}
 */
async function main() {
  // taskkill and the .exe server binary are Windows-only; elsewhere every
  // configuration would just time out, so stop before doing any work.
  if (process.platform !== 'win32') {
    console.error('This tuning script only runs on Windows (uses taskkill and llama-server.exe).');
    return;
  }

  // 1. Qwen 35B tuning: the target is 70 t/s.
  // Sweep --n-cpu-moe over 1, 2 and 4 to see whether that unlocks -ub 512.
  const args35B_base = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // 2. Qwen 122B tuning: find the optimal --n-cpu-moe.
  // Author's estimate: since 48 leaves 16GB free, each layer is ~1.5GB total,
  // meaning ~0.75GB per GPU. Sweep 38, 30 and 22.
  const args122B_base = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  const cases = [
    { args: `${args35B_base} -ub 512 --n-cpu-moe 1`, name: 'Qwen-35B: moe=1, ub=512' },
    { args: `${args35B_base} -ub 512 --n-cpu-moe 2`, name: 'Qwen-35B: moe=2, ub=512' },
    { args: `${args35B_base} -ub 512 --n-cpu-moe 4`, name: 'Qwen-35B: moe=4, ub=512' },
    { args: `${args122B_base} --n-cpu-moe 38`, name: 'Qwen-122B: moe=38' },
    { args: `${args122B_base} --n-cpu-moe 30`, name: 'Qwen-122B: moe=30' },
    { args: `${args122B_base} --n-cpu-moe 22`, name: 'Qwen-122B: moe=22' },
  ];

  // The runs share one port and one GPU setup, so they must stay sequential.
  const results = [];
  for (const { args, name } of cases) {
    const outcome = await runTest(args, name);
    results.push({ name, ...outcome });
  }

  console.log('\nSummary:');
  for (const { name, success, tg, pp } of results) {
    const detail = success ? `TG ${tg} t/s | PP ${pp} t/s` : 'FAILED';
    console.log(`  ${name}: ${detail}`);
  }
  console.log("Tuning finished.");
}

main().catch((err) => {
  console.error('Tuning aborted:', err);
  process.exitCode = 1;
});