Phase 01 (LLM Tuning): - Gemma4 26B: 74.65 t/s (fast) - Qwen 35B: 61.62 t/s (balanced) - Gemma4 31B: 16.0 t/s (deep-coder) - Qwen 27B: 16.7 t/s (deep-logic) - Qwen 122B: 8.95 t/s (ultra, GPU 1 only) Phase 02 (API Engine): - FastAPI reverse proxy on port 8000 - /engine/switch hot-swap with 503 protection - config/engine_models.json as single source of truth - Replaced 4 individual .bat files with unified engine File cleanup: - scripts/ 85 files -> 9 + _archive/ - Root .bat files -> _archive/
102 lines
2.8 KiB
JavaScript
102 lines
2.8 KiB
JavaScript
import { spawn, exec } from 'child_process';
|
|
|
|
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param {number} ms - Time to wait, in milliseconds.
 * @returns {Promise<void>} Promise that fulfils (with no value) once the timer fires.
 */
const delay = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
|
|
|
|
/**
 * Force-kill every running `llama-server.exe` and then wait two seconds.
 *
 * The outcome of `taskkill` is deliberately ignored: the command reports an
 * error when no matching process exists, which is fine for a best-effort
 * cleanup. The pause happens regardless of the outcome.
 *
 * @returns {Promise<void>} Fulfils two seconds after `taskkill` has finished.
 */
async function killServer() {
  // Step 1: run taskkill and wait for its callback, whatever the result.
  await new Promise((finished) => {
    exec('taskkill /F /IM llama-server.exe', () => finished());
  });

  // Step 2: fixed settle time before the caller continues.
  await new Promise((settled) => {
    setTimeout(settled, 2000);
  });
}
|
|
|
|
/**
 * Start llama-server for one model at one context size, wait for it to become
 * healthy, run the quick benchmark script, and shut the server down again.
 *
 * @param {string} modelPath - Model file name, resolved relative to `models\`.
 * @param {number} contextSize - Value passed to the server's `-c` flag.
 * @returns {Promise<boolean>} `true` when the server answered `/health` with
 *   HTTP 200 and the benchmark was run; `false` when an out-of-memory message
 *   was seen on stderr, the process failed to start or exited early, or the
 *   server never became healthy within the polling window.
 */
async function testContextSize(modelPath, contextSize) {
  console.log(`\nTesting ${modelPath} with -c ${contextSize}...`);
  // Make sure no previous instance is still holding the port or VRAM.
  await killServer();

  const args = [
    '--model', `models\\${modelPath}`,
    '-ngl', '999',
    '-c', contextSize.toString(),
    '-fa', 'on',
    '--cache-type-k', 'q4_0',
    '--cache-type-v', 'q4_0',
    '-ub', '512',
    '-b', '2048',
    '-t', '6',
    '-tb', '6',
    '--split-mode', 'row',
    '--prio', '3',
    '--fit', 'off',
    '--port', '8000',
    '--host', '0.0.0.0',
  ];

  const server = spawn('llama_bin_run\\llama-server.exe', args, { stdio: 'pipe' });

  let booted = false;
  let oomed = false;
  let exited = false;
  let spawnError = null;

  // Without an 'error' listener a missing/unlaunchable executable would be
  // thrown as an unhandled 'error' event and take the whole script down.
  server.on('error', (err) => {
    spawnError = err;
    exited = true;
  });
  // Remember an early exit so we stop polling a server that is already gone.
  server.on('exit', () => {
    exited = true;
  });

  // stdout is piped but never inspected; keep it flowing so the child cannot
  // block on a full pipe buffer.
  server.stdout.resume();

  // Keep a rolling tail of stderr so a marker split across two chunks is
  // still detected. Both markers are matched case-insensitively.
  const stderrTailLimit = 4096;
  let stderrTail = '';
  server.stderr.on('data', (chunk) => {
    stderrTail = (stderrTail + chunk.toString()).slice(-stderrTailLimit);
    const haystack = stderrTail.toLowerCase();
    if (haystack.includes('out of memory') || haystack.includes('failed to allocate')) {
      oomed = true;
    }
  });

  // Poll the health endpoint: up to 20 attempts, 2 s apart.
  for (let attempt = 0; attempt < 20; attempt++) {
    if (oomed || exited) break;
    try {
      // fetch() has no `timeout` option; an AbortSignal is the supported way
      // to bound a request so a stalled connection cannot hang the loop.
      const res = await fetch('http://127.0.0.1:8000/health', {
        signal: AbortSignal.timeout(2000),
      });
      if (res.status === 200) {
        booted = true;
        break;
      }
    } catch (err) {
      // Expected while the server is still starting (connection refused or
      // probe timeout); simply try again on the next iteration.
    }
    await delay(2000);
  }

  if (oomed || !booted) {
    // Report the actual reason instead of labelling every failure as OOM.
    if (oomed) {
      console.log(`❌ Failed: Out of Memory at -c ${contextSize}`);
    } else if (spawnError) {
      console.log(`❌ Failed: could not start llama-server at -c ${contextSize}: ${spawnError.message}`);
    } else if (exited) {
      console.log(`❌ Failed: llama-server exited before becoming healthy at -c ${contextSize}`);
    } else {
      console.log(`❌ Failed: llama-server did not become healthy in time at -c ${contextSize}`);
    }
    server.kill('SIGKILL');
    await killServer();
    return false;
  }

  console.log(`✅ Booted! Running Benchmark...`);

  // Benchmark: run the helper script and print whatever it produced. The
  // promise always resolves; a failing script surfaces through its stderr.
  const bench = await new Promise((resolve) => {
    exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
      resolve(stdout || stderr);
    });
  });

  console.log(bench);
  await killServer();
  return true;
}
|
|
|
|
/**
 * Try a fixed list of context sizes from largest to smallest and report the
 * first one for which `testContextSize` succeeds.
 *
 * @param {string} modelName - Model file name forwarded to `testContextSize`.
 * @returns {Promise<void>} Fulfils after the result has been logged.
 */
async function findMaxContext(modelName) {
  // Descending order: the first success is the largest working size.
  const candidateSizes = [262144, 131072, 65536, 32768, 16384, 8192];

  for (const size of candidateSizes) {
    if (await testContextSize(modelName, size)) {
      console.log(`\n🎉 MAX STABLE CONTEXT FOR ${modelName}: ${size}`);
      return;
    }
  }

  // Reached only when every candidate failed.
  console.log(`\n❌ Failed to find any working context size for ${modelName}`);
}
|
|
|
|
/**
 * Entry point: search for the largest working context size for each of the
 * two hard-coded models, one after the other.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // `exec('set CUDA_VISIBLE_DEVICES=')` only changed the environment of a
  // short-lived shell. Removing the variable from this process's environment
  // is what child processes spawned later actually inherit.
  delete process.env.CUDA_VISIBLE_DEVICES;

  console.log("============= QWEN 27B Q4_K_M =============");
  await findMaxContext('Qwen3.5-27B-Q4_K_M.gguf');

  console.log("\n============= GEMMA 4 31B Q4_K_M =============");
  await findMaxContext('gemma-4-31B-it-Q4_K_M.gguf');
}

// Do not leave the promise floating: report the failure and exit non-zero.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|