import { spawn, execSync } from "child_process";

/**
 * One-shot llama-server benchmark.
 *
 * Flow: kill any running llama-server.exe, start a fresh one with the
 * arguments below, poll /health until it answers 200, print boot time and
 * (best-effort) nvidia-smi memory usage, send one warm-up completion, then
 * time a 200-token completion and print tokens/second. The server is killed
 * again before the script exits.
 *
 * Exit code: 0 when the timed request succeeded, 1 when the server did not
 * become healthy or the timed request failed.
 */

const BASE_URL = "http://127.0.0.1:8000";
const KILL_SERVER_CMD = "taskkill /F /IM llama-server.exe";

const HEALTH_POLL_ATTEMPTS = 60;
const HEALTH_POLL_INTERVAL_MS = 3000;
// Upper bound for a single /health probe so one stuck connection cannot stall the loop.
const HEALTH_REQUEST_TIMEOUT_MS = 5000;
// Generous upper bound for one completion request; only guards against a hung server.
const COMPLETION_TIMEOUT_MS = 600000;

// args[0] is the executable; the rest is passed to it verbatim.
// "-ts 0.3,0.7" is the split this run measures (see the result banner below).
// NOTE(review): "--host 0.0.0.0" listens on every interface although this script
// only talks to 127.0.0.1 - confirm that exposure is intended.
const args = [
  "llama_bin_run\\llama-server.exe",
  "--model", "models\\Qwen3.5-35B-A3B-Q4_K_M.gguf",
  "-ngl", "999",
  "-c", "262144",
  "-np", "1",
  "-fa", "on",
  "--cache-type-k", "q4_0",
  "--cache-type-v", "q4_0",
  "-ub", "128",
  "-b", "512",
  "-t", "6",
  "-tb", "6",
  "--prio", "3",
  "--port", "8000",
  "--host", "0.0.0.0",
  "-ts", "0.3,0.7",
];

/** Resolve after `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/** Force-kill every llama-server.exe process; a failing taskkill is ignored on purpose. */
function killServer() {
  try {
    execSync(KILL_SERVER_CMD, { stdio: "ignore" });
  } catch {
    // Best-effort cleanup: execSync throws whenever taskkill exits non-zero,
    // and there is nothing useful to do about that here.
  }
}

/** POST a chat completion request and return the raw Response. */
function postCompletion(content, maxTokens) {
  return fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      messages: [{ role: "user", content }],
      max_tokens: maxTokens,
      temperature: 0,
    }),
    signal: AbortSignal.timeout(COMPLETION_TIMEOUT_MS),
  });
}

console.log(`Starting server with args: \n${args.join(" ")}\n`);

killServer();
// Pause before starting the new instance (presumably so the old process can
// release the port and GPU memory - TODO confirm).
await sleep(2000);

const server = spawn(args[0], args.slice(1), { stdio: "ignore" });

// Without an 'error' listener a failed spawn (e.g. missing executable) is thrown
// as an unhandled event; record it instead so the boot loop can stop right away.
let serverFailure = null;
server.on("error", (err) => {
  serverFailure = `could not be started: ${err.message}`;
});
server.on("exit", (code, signal) => {
  if (serverFailure === null) {
    serverFailure = `exited before becoming healthy (code ${code}, signal ${signal})`;
  }
});

let ready = false;
const bootStart = Date.now();
for (let attempt = 0; attempt < HEALTH_POLL_ATTEMPTS && serverFailure === null; attempt++) {
  try {
    const res = await fetch(`${BASE_URL}/health`, {
      signal: AbortSignal.timeout(HEALTH_REQUEST_TIMEOUT_MS),
    });
    if (res.status === 200) {
      ready = true;
      break;
    }
  } catch {
    // The probe failed (connection error or timeout); try again after the pause.
  }
  await sleep(HEALTH_POLL_INTERVAL_MS);
}

if (!ready) {
  console.log(
    serverFailure !== null
      ? `Server ${serverFailure}.`
      : "Server failed to boot within 3 mins.",
  );
  killServer();
  process.exit(1);
}

const bootElapsed = (Date.now() - bootStart) / 1000;
console.log(`\n===========================================`);
console.log(`Server booted in ${bootElapsed.toFixed(1)}s.`);
try {
  const vram = execSync(
    "nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits",
    { encoding: "utf-8" },
  );
  console.log(`VRAM USAGE:\n${vram.trim()}`);
} catch {
  // Memory report is optional; skip it when nvidia-smi cannot be run.
}
console.log(`===========================================\n`);

// Warm-up request so the timed run below is not the first request the server handles.
try {
  const warmup = await postCompletion("hi", 10);
  // Read the body to the end so the warm-up is fully finished before timing starts.
  await warmup.text();
} catch (err) {
  // Not fatal, but worth knowing: the timed run then includes first-request cost.
  console.log(`Warm-up request failed: ${err.message}`);
}

console.log("Running speed test (200 tokens)...");
let exitCode = 0;
const t0 = Date.now();
try {
  const res = await postCompletion("Count from 1 to 50, each on new line.", 200);
  if (!res.ok) {
    // An error response would otherwise be reported as a successful "0.00 t/s" run.
    throw new Error(`HTTP ${res.status} ${res.statusText}`);
  }
  const result = await res.json();
  // Wall-clock time for the whole request, not only token generation.
  const elapsed = (Date.now() - t0) / 1000;
  const ct = result?.usage?.completion_tokens ?? 0;
  const tps = ct / elapsed;
  console.log(`\n===========================================`);
  console.log(`★ 0.3/0.7 SPLIT RESULT: ${tps.toFixed(2)} t/s ★`);
  console.log(` Tokens: ${ct}`);
  console.log(` Time: ${elapsed.toFixed(2)}s\n===========================================\n`);
} catch (err) {
  console.log("ERROR during benchmark:", err.message);
  exitCode = 1;
}

killServer();
process.exit(exitCode);