feat: Variet Engine v1.0 + 5-model tuning complete
Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
This commit is contained in:
101
scripts/_archive/tuning/find_max_dense.mjs
Normal file
101
scripts/_archive/tuning/find_max_dense.mjs
Normal file
@@ -0,0 +1,101 @@
|
||||
import { spawn, exec } from 'child_process';
|
||||
|
||||
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Time to wait, in milliseconds.
 * @returns {Promise<void>}
 */
const delay = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
|
||||
|
||||
/**
 * Force-terminates every `llama-server.exe` process through `taskkill` and
 * then pauses for two seconds before resolving.
 *
 * The outcome of `taskkill` is deliberately ignored, so this also resolves
 * when no server was running. The pause is presumably there to let the OS
 * release the process's resources before the next launch.
 *
 * @returns {Promise<void>}
 */
async function killServer() {
  await new Promise((resolve) => {
    exec('taskkill /F /IM llama-server.exe', () => resolve());
  });
  await new Promise((resolve) => {
    setTimeout(resolve, 2000);
  });
}
|
||||
|
||||
/**
 * Boots llama-server with the given model and context size, waits for its
 * `/health` endpoint to answer 200, runs the quick benchmark script, and shuts
 * the server down again.
 *
 * @param {string} modelPath - Model file name; passed to the server as `models\<modelPath>`.
 * @param {number} contextSize - Context length handed to the server's `-c` flag.
 * @returns {Promise<boolean>} `true` when the server became healthy and the
 *   benchmark command was run, `false` when the server failed to start.
 */
async function testContextSize(modelPath, contextSize) {
  console.log(`\nTesting ${modelPath} with -c ${contextSize}...`);
  // Start from a clean slate: terminate any llama-server left from an earlier run.
  await killServer();

  const args = [
    '--model', `models\\${modelPath}`,
    '-ngl', '999',
    '-c', contextSize.toString(),
    '-fa', 'on',
    '--cache-type-k', 'q4_0',
    '--cache-type-v', 'q4_0',
    '-ub', '512',
    '-b', '2048',
    '-t', '6',
    '-tb', '6',
    '--split-mode', 'row',
    '--prio', '3',
    '--fit', 'off',
    '--port', '8000',
    '--host', '0.0.0.0',
  ];

  const server = spawn('llama_bin_run\\llama-server.exe', args, { stdio: 'pipe' });

  let booted = false;
  let oomed = false;
  let exited = false;
  let spawnError = null;

  // Without an 'error' listener a failed spawn (e.g. missing executable) is
  // raised as an uncaught exception and aborts the whole run.
  server.on('error', (err) => {
    spawnError = err;
    exited = true;
  });
  // 'close' fires after the process has ended and its stdio streams are
  // closed, so all stderr output has been inspected by the time it is seen.
  server.on('close', () => {
    exited = true;
  });

  // stdout is piped but never read; keep it flowing so the child cannot stall
  // on a full pipe buffer. The streams may be absent if the spawn failed.
  server.stdout?.resume();

  let stderrTail = '';
  server.stderr?.on('data', (d) => {
    // Prepend the tail of the previous chunk so a marker that was split across
    // two chunks is still detected.
    const text = stderrTail + d.toString();
    if (text.toLowerCase().includes('out of memory') || text.includes('failed to allocate')) {
      oomed = true;
    }
    stderrTail = text.slice(-32);
  });

  // Poll the health endpoint up to 20 times, 2 s apart.
  for (let i = 0; i < 20; i++) {
    // Stop early once the server has reported OOM or is no longer running.
    if (oomed || exited) break;
    try {
      // fetch() has no `timeout` option; an abort signal is what actually
      // bounds a hung request to 2 s.
      const res = await fetch('http://127.0.0.1:8000/health', {
        signal: AbortSignal.timeout(2000),
      });
      if (res.status === 200) {
        booted = true;
        break;
      }
    } catch (e) {
      // Connection refused / timed out while the server is still loading:
      // expected, retry on the next iteration.
    }
    await delay(2000);
  }

  if (oomed || !booted) {
    // Report the actual cause instead of labelling every failure as OOM.
    if (spawnError) {
      console.log(`❌ Failed: could not start llama-server (${spawnError.message})`);
    } else if (oomed) {
      console.log(`❌ Failed: Out of Memory at -c ${contextSize}`);
    } else if (exited) {
      console.log(`❌ Failed: llama-server exited before becoming healthy at -c ${contextSize}`);
    } else {
      console.log(`❌ Failed: server did not become healthy in time at -c ${contextSize}`);
    }
    server.kill('SIGKILL');
    await killServer();
    return false;
  }

  console.log(`✅ Booted! Running Benchmark...`);

  // Benchmark: run the helper script and print whatever it produced
  // (stdout, or stderr when stdout is empty).
  const bench = await new Promise((resolve) => {
    exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
      resolve(stdout || stderr);
    });
  });

  console.log(bench);
  await killServer();
  return true;
}
|
||||
|
||||
/**
 * Tries a fixed list of context sizes from largest to smallest and reports the
 * first one for which `testContextSize` succeeds.
 *
 * @param {string} modelName - Model file name forwarded to `testContextSize`.
 * @returns {Promise<void>} Resolves once a working size was logged or every
 *   candidate failed.
 */
async function findMaxContext(modelName) {
  const candidateSizes = [262144, 131072, 65536, 32768, 16384, 8192];

  for (let idx = 0; idx < candidateSizes.length; idx += 1) {
    const size = candidateSizes[idx];
    const worked = await testContextSize(modelName, size);
    if (!worked) {
      continue;
    }
    // Sizes are tried in descending order, so the first success is the largest.
    console.log(`\n🎉 MAX STABLE CONTEXT FOR ${modelName}: ${size}`);
    return;
  }

  console.log(`\n❌ Failed to find any working context size for ${modelName}`);
}
|
||||
|
||||
/**
 * Runs the max-context search for each model in turn, printing a banner before
 * each one.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Remove the variable from this process's environment so that the
  // llama-server children spawned later inherit it unset. Running
  // `set CUDA_VISIBLE_DEVICES=` through exec() would only change a short-lived
  // subshell and leave this process and its children untouched.
  delete process.env.CUDA_VISIBLE_DEVICES;

  console.log("============= QWEN 27B Q4_K_M =============");
  await findMaxContext('Qwen3.5-27B-Q4_K_M.gguf');

  console.log("\n============= GEMMA 4 31B Q4_K_M =============");
  await findMaxContext('gemma-4-31B-it-Q4_K_M.gguf');
}
|
||||
|
||||
// Entry point: surface any failure explicitly and exit non-zero instead of
// leaving the returned promise floating.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user