import { spawn, execSync } from "child_process";

// Benchmark driver: (re)starts llama-server, waits for its /health endpoint,
// prints GPU memory usage, sends one warm-up request, then times a
// 200-token chat completion and reports tokens per second.

const BASE_URL = "http://127.0.0.1:8000";

// Total time the server is given to report healthy (matches the "3 mins" message).
const BOOT_TIMEOUT_MS = 180_000;
// Pause between health probes.
const HEALTH_POLL_INTERVAL_MS = 3000;
// Upper bound for a single health probe so one hung request cannot eat the boot budget.
const HEALTH_REQUEST_TIMEOUT_MS = 5000;

// Server command line: args[0] is the executable, the remainder is passed to it verbatim.
// The --port value must agree with BASE_URL above.
// NOTE(review): --host 0.0.0.0 presumably exposes the server beyond localhost while this
// script only ever talks to 127.0.0.1 — confirm that is intended.
const args = [
  "llama_bin_run\\llama-server.exe",
  "--model", "models\\Qwen3.5-35B-A3B-Q4_K_M.gguf",
  "-ngl", "999",
  "-c", "262144",
  "-np", "1",
  "-fa", "on",
  "--cache-type-k", "q4_0",
  "--cache-type-v", "q4_0",
  "-ub", "128",
  "-b", "512",
  "-t", "6",
  "-tb", "6",
  "--prio", "3",
  "--port", "8000",
  "--host", "0.0.0.0",
  "-ts", "0.3,0.7"
];

// Label for the result banner, derived from the "-ts" argument so the two cannot drift apart.
const tensorSplitIndex = args.indexOf("-ts");
const splitLabel = tensorSplitIndex === -1
  ? "default"
  : args[tensorSplitIndex + 1].replaceAll(",", "/");

/** Resolves after `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Force-kills every process named llama-server.exe via taskkill.
 * Best-effort: taskkill failing (e.g. nothing to kill) is deliberately ignored.
 */
function killServer() {
  try {
    execSync('taskkill /F /IM llama-server.exe', { stdio: 'ignore' });
  } catch {
    // Nothing to clean up, or taskkill unavailable; either way there is no recovery step.
  }
}

/**
 * POSTs a single-message chat completion with temperature 0 and returns the parsed JSON body.
 *
 * @param {string} content - Text of the user message.
 * @param {number} maxTokens - Value sent as `max_tokens`.
 * @returns {Promise<any>} Parsed response body.
 * @throws {Error} When the request fails or the server answers with a non-2xx status.
 */
async function requestCompletion(content, maxTokens) {
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST", headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ messages: [{ role: "user", content }], max_tokens: maxTokens, temperature: 0 })
  });
  if (!res.ok) {
    throw new Error(`HTTP ${res.status} ${res.statusText} from /v1/chat/completions`);
  }
  return res.json();
}

console.log(`Starting server with args: \n${args.join(" ")}\n`);
// Remove any instance left over from a previous run, then give the OS a moment to release it.
killServer();
await sleep(2000);

const server = spawn(args[0], args.slice(1), { stdio: 'ignore' });

// Record launch failures and early exits so the boot loop can stop immediately.
// Without an 'error' listener a failed spawn would crash this script with an unhandled event.
let serverFailure = null;
server.once("error", (err) => {
  serverFailure = `failed to start: ${err.message}`;
});
server.once("exit", (code, signal) => {
  serverFailure ??= `exited during boot (code ${code}, signal ${signal})`;
});

const bootStart = Date.now();
const bootDeadline = bootStart + BOOT_TIMEOUT_MS;
let ready = false;
while (!ready && serverFailure === null && Date.now() < bootDeadline) {
  try {
    const res = await fetch(`${BASE_URL}/health`, {
      signal: AbortSignal.timeout(HEALTH_REQUEST_TIMEOUT_MS)
    });
    ready = res.status === 200;
  } catch {
    // Connection refused or probe timed out: the server is not up yet, keep polling.
  }
  if (!ready) {
    await sleep(HEALTH_POLL_INTERVAL_MS);
  }
}

if (!ready) {
  if (serverFailure !== null) {
    console.log(`Server ${serverFailure}.`);
  } else {
    console.log("Server failed to boot within 3 mins.");
  }
  killServer();
  process.exit(1);
}

const bootElapsed = (Date.now() - bootStart) / 1000;
console.log(`\n===========================================`);
console.log(`Server booted in ${bootElapsed.toFixed(1)}s.`);

try {
  const vram = execSync('nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits', { encoding: 'utf-8' });
  console.log(`VRAM USAGE:\n${vram.trim()}`);
} catch (err) {
  // VRAM reporting is informational only; the benchmark proceeds without it.
  console.log(`VRAM USAGE: unavailable (${err.message})`);
}
console.log(`===========================================\n`);

let exitCode = 0;
try {
  // Warm-up request: its result is discarded, and a failure here must not abort the timed run.
  try {
    await requestCompletion("hi", 10);
  } catch (err) {
    console.log("Warm-up request failed:", err.message);
  }

  console.log("Running speed test (200 tokens)...");
  const t0 = Date.now();
  const result = await requestCompletion("Count from 1 to 50, each on new line.", 200);
  const elapsed = (Date.now() - t0) / 1000;
  const ct = result?.usage?.completion_tokens ?? 0;
  if (ct === 0) {
    // A zero rate would otherwise look like a legitimate (terrible) measurement.
    console.log("WARNING: response reported no completion tokens; the rate below is not meaningful.");
  }
  const tps = ct / elapsed;

  console.log(`\n===========================================`);
  console.log(`★ ${splitLabel} SPLIT RESULT: ${tps.toFixed(2)} t/s ★`);
  console.log(`   Tokens: ${ct}`);
  console.log(`   Time: ${elapsed.toFixed(2)}s\n===========================================\n`);
} catch (err) {
  console.log("ERROR during benchmark:", err.message);
  exitCode = 1;
} finally {
  // Always stop the server, whether the benchmark succeeded or not.
  killServer();
}
// Explicit exit: the child process handle would otherwise keep the event loop alive.
process.exit(exitCode);