feat: Variet Engine v1.0 + 5-model tuning complete
Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/: 85 files -> 9 + _archive/
- Root .bat files -> _archive/
This commit is contained in:
84
scripts/_archive/benchmarks/test_split_03_07.mjs
Normal file
84
scripts/_archive/benchmarks/test_split_03_07.mjs
Normal file
@@ -0,0 +1,84 @@
|
||||
import { spawn, execSync } from "child_process";
|
||||
|
||||
/*
 * Benchmark script: boots llama-server with the "-ts 0.3,0.7" split, waits for
 * /health to answer 200, prints VRAM usage from nvidia-smi, sends one warm-up
 * request, then times a single 200-token chat completion and prints tokens/s.
 *
 * Exit code: 0 when a throughput figure was produced, 1 when the server did
 * not come up or the benchmark request failed.
 */

const BASE_URL = "http://127.0.0.1:8000";
const SERVER_IMAGE = "llama-server.exe";

// Boot budget matches the "3 mins" quoted in the failure message below.
const BOOT_TIMEOUT_MS = 3 * 60 * 1000;
const BOOT_POLL_INTERVAL_MS = 3000;
// A single hung health probe must not be able to eat the whole boot budget.
const HEALTH_TIMEOUT_MS = 5000;
// Generous upper bound so a wedged server cannot hang the script forever.
const COMPLETION_TIMEOUT_MS = 10 * 60 * 1000;

// args[0] is the executable; the rest is passed to it verbatim.
// Flag meanings are defined by llama-server itself (see the llama.cpp docs).
const args = [
  "llama_bin_run\\llama-server.exe",
  "--model", "models\\Qwen3.5-35B-A3B-Q4_K_M.gguf",
  "-ngl", "999",
  "-c", "262144",
  "-np", "1",
  "-fa", "on",
  "--cache-type-k", "q4_0",
  "--cache-type-v", "q4_0",
  "-ub", "128",
  "-b", "512",
  "-t", "6",
  "-tb", "6",
  "--prio", "3",
  "--port", "8000",
  "--host", "0.0.0.0",
  // The split under test; reported as "0.3/0.7 SPLIT" in the result banner.
  "-ts", "0.3,0.7",
];

/** Resolves after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/** Best-effort kill of every running llama-server.exe (stale or ours). */
function killAllServers() {
  try {
    execSync(`taskkill /F /IM ${SERVER_IMAGE}`, { stdio: "ignore" });
  } catch {
    // Deliberately ignored: the command fails when there is nothing to kill.
  }
}

/**
 * POSTs a single-turn chat completion request.
 * @param {string} content - user message text
 * @param {number} maxTokens - value sent as `max_tokens`
 * @returns {Promise<Response>} the raw fetch response
 */
function postChat(content, maxTokens) {
  return fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      messages: [{ role: "user", content }],
      max_tokens: maxTokens,
      temperature: 0,
    }),
    signal: AbortSignal.timeout(COMPLETION_TIMEOUT_MS),
  });
}

console.log(`Starting server with args: \n${args.join(" ")}\n`);

// Clear out any leftover instance, then give the OS a moment to release the port.
killAllServers();
await sleep(2000);

const server = spawn(args[0], args.slice(1), { stdio: "ignore" });

// Without an 'error' listener a failed spawn (e.g. missing binary) would crash
// the script with an unhandled event; without an 'exit' listener a server that
// dies during start-up would be polled pointlessly for the full boot budget.
let serverFailure = null;
server.on("error", (err) => {
  serverFailure = `failed to start: ${err.message}`;
});
server.on("exit", (code, signal) => {
  serverFailure ??= `exited early (code ${code}, signal ${signal})`;
});

let ready = false;
const bootStart = Date.now();
while (serverFailure === null && Date.now() - bootStart < BOOT_TIMEOUT_MS) {
  try {
    const res = await fetch(`${BASE_URL}/health`, {
      signal: AbortSignal.timeout(HEALTH_TIMEOUT_MS),
    });
    if (res.status === 200) {
      ready = true;
      break;
    }
  } catch {
    // Connection refused / probe timeout while the server is still booting: retry.
  }
  await sleep(BOOT_POLL_INTERVAL_MS);
}

// A 200 only counts if our own child is still alive; otherwise something else
// answered on the port and the numbers would describe the wrong process.
if (!ready || serverFailure !== null) {
  console.log(
    serverFailure !== null
      ? `Server ${serverFailure}.`
      : "Server failed to boot within 3 mins.",
  );
  server.kill();
  killAllServers();
  process.exit(1);
}

const bootElapsed = (Date.now() - bootStart) / 1000;
console.log(`\n===========================================`);
console.log(`Server booted in ${bootElapsed.toFixed(1)}s.`);

try {
  const vram = execSync(
    "nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits",
    { encoding: "utf-8" },
  );
  console.log(`VRAM USAGE:\n${vram.trim()}`);
} catch (err) {
  // Informational only; the benchmark itself does not depend on it.
  console.log(`VRAM USAGE: unavailable (${err.message})`);
}
console.log(`===========================================\n`);

// Warm-up request so one-time first-request costs are not billed to the timed run.
try {
  const warmup = await postChat("hi", 10);
  await warmup.text(); // drain the body so the connection is released
} catch (err) {
  console.log("Warm-up request failed:", err.message);
}

let exitCode = 0;
console.log("Running speed test (200 tokens)...");
const t0 = Date.now();
try {
  const res = await postChat("Count from 1 to 50, each on new line.", 200);
  if (!res.ok) {
    throw new Error(`HTTP ${res.status} ${res.statusText}`);
  }
  const result = await res.json();
  // Wall-clock time of the whole request, so it includes prompt processing
  // and HTTP overhead in addition to token generation.
  const elapsed = (Date.now() - t0) / 1000;

  const ct = result?.usage?.completion_tokens;
  if (!Number.isFinite(ct) || ct <= 0) {
    // Refuse to print a bogus "0.00 t/s" for a malformed or error payload.
    throw new Error("response did not report any completion tokens");
  }
  const tps = ct / elapsed;

  console.log(`\n===========================================`);
  console.log(`★ 0.3/0.7 SPLIT RESULT: ${tps.toFixed(2)} t/s ★`);
  console.log(`  Tokens: ${ct}`);
  console.log(`  Time: ${elapsed.toFixed(2)}s\n===========================================\n`);
} catch (e) {
  console.log("ERROR during benchmark:", e.message);
  exitCode = 1;
}

// Stop our own child first, then sweep any instance that survived.
server.kill();
killAllServers();
process.exit(exitCode);
|
||||
Reference in New Issue
Block a user