feat(phase-06): complete Hermes Agent windows fixes & deployment
This commit is contained in:
84
scripts/tune_models.mjs
Normal file
84
scripts/tune_models.mjs
Normal file
@@ -0,0 +1,84 @@
|
||||
import { exec, spawn } from 'child_process';
|
||||
|
||||
/**
 * Pause helper for async code.
 *
 * @param {number} ms - How long to wait, in milliseconds.
 * @returns {Promise<void>} Promise that fulfils (with no value) once the timer fires.
 */
function delay(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||
|
||||
/**
 * Pull the two throughput figures out of the text printed by the benchmark script.
 *
 * Looks for a `TG-500 | PP:<n>tok <rate>t/s | TG:<n>tok <rate>t/s` line (token
 * generation rate) and a `10K-CODE | PP:<n>tok <rate>t/s` line (prompt processing
 * rate). Rates may be printed with any number of decimals, or none.
 *
 * @param {string | null | undefined} output - Captured benchmark stdout.
 * @returns {{ tg: number, pp: number }} Parsed rates; a rate that is not found is 0.
 */
function parseBenchmarkOutput(output) {
  const text = output ?? '';
  const tgMatch = text.match(
    /TG-500 \| PP:\d+tok \d+(?:\.\d+)?t\/s \| TG:\d+tok (\d+(?:\.\d+)?)t\/s/,
  );
  const ppMatch = text.match(/10K-CODE \| PP:\d+tok (\d+(?:\.\d+)?)t\/s/);

  return {
    tg: tgMatch ? Number.parseFloat(tgMatch[1]) : 0,
    pp: ppMatch ? Number.parseFloat(ppMatch[1]) : 0,
  };
}

/**
 * Boot llama-server with the given arguments, wait for it to become healthy,
 * run the quick benchmark against it and report the measured throughput.
 *
 * Uses `taskkill` and a `.exe` path, so it only works on Windows. The health
 * probe targets http://127.0.0.1:8000, so `modelArgs` is expected to make the
 * server listen on port 8000.
 *
 * @param {string} modelArgs - Whitespace-separated llama-server command-line
 *   arguments (individual arguments must not contain spaces).
 * @param {string} name - Human-readable label used in the log output.
 * @returns {Promise<{ success: false } | { success: true, tg: number, pp: number }>}
 *   `success: false` when the server never answered the health check; otherwise
 *   the parsed token-generation (`tg`) and prompt-processing (`pp`) rates, each 0
 *   when the corresponding line was not found in the benchmark output.
 */
async function runTest(modelArgs, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Args: ${modelArgs}`);

  // taskkill reports an error when no server is running; that is expected, so
  // the callback's error argument is deliberately ignored.
  const killServer = () =>
    new Promise((done) => {
      exec('taskkill /F /IM llama-server.exe', () => done());
    });

  // Make sure a server left over from a previous run is gone, then pause
  // before starting the next one.
  await killServer();
  await delay(2000);

  const server = spawn(
    'llama_bin_run\\llama-server.exe',
    modelArgs.trim().split(/\s+/),
    { detached: true, stdio: 'ignore' },
  );

  // Track early termination so we do not keep polling a dead process. The
  // 'error' listener also prevents a failed spawn (e.g. missing binary) from
  // surfacing as an unhandled 'error' event that would crash the whole run.
  let exited = false;
  server.once('error', (err) => {
    exited = true;
    console.log(`[${name}] Could not start server: ${err.message}`);
  });
  server.once('exit', () => {
    exited = true;
  });

  try {
    let ready = false;
    for (let attempt = 0; attempt < 40 && !exited; attempt++) {
      try {
        // fetch() has no `timeout` option; an abort signal enforces the 2 s limit.
        const res = await fetch('http://127.0.0.1:8000/health', {
          signal: AbortSignal.timeout(2000),
        });
        if (res.status === 200) {
          ready = true;
          break;
        }
      } catch {
        // Connection refused or timed out while the server is still starting:
        // this is the normal "not ready yet" signal, so keep polling.
      }
      await delay(3000);
    }

    if (!ready) {
      console.log(`[${name}] FAILED TO BOOT (Likely OOM)`);
      return { success: false };
    }

    console.log(`[${name}] Server Ready! Running benchmark...`);
    const benchmarkOutput = await new Promise((done) => {
      exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
        if (err) {
          console.log(`[${name}] Benchmark exited with an error: ${err.message}`);
        }
        console.log(stdout || stderr);
        done(stdout);
      });
    });

    const { tg, pp } = parseBenchmarkOutput(benchmarkOutput);
    return { success: true, tg, pp };
  } finally {
    // Always stop the server, whichever way this run ended.
    await killServer();
  }
}
|
||||
|
||||
/**
 * Run the tuning sweep: three Qwen 35B configurations followed by three
 * Qwen 122B configurations, strictly one after another, each through runTest.
 *
 * @returns {Promise<void>} Fulfils after every configuration has been tried.
 */
async function main() {
  // 35B sweep. Author's goal: reach 70 t/s by offloading a few MoE layers to
  // the CPU (--n-cpu-moe) so that -ub 512 fits.
  const base35B = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // 122B sweep: look for the best --n-cpu-moe value. Author's note: 48 leaves
  // 16GB free and each layer is ~1.5GB total (~0.75GB per GPU), hence the
  // lower values 38, 30 and 22 tried here.
  const base122B = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // [server arguments, label] pairs, in execution order.
  const queue = [
    [`${base35B} -ub 512 --n-cpu-moe 1`, 'Qwen-35B: moe=1, ub=512'],
    [`${base35B} -ub 512 --n-cpu-moe 2`, 'Qwen-35B: moe=2, ub=512'],
    [`${base35B} -ub 512 --n-cpu-moe 4`, 'Qwen-35B: moe=4, ub=512'],
    [`${base122B} --n-cpu-moe 38`, 'Qwen-122B: moe=38'],
    [`${base122B} --n-cpu-moe 30`, 'Qwen-122B: moe=30'],
    [`${base122B} --n-cpu-moe 22`, 'Qwen-122B: moe=22'],
  ];

  // Sequential on purpose: each run must finish before the next one starts.
  for (const [serverArgs, label] of queue) {
    await runTest(serverArgs, label);
  }

  console.log('Tuning finished.');
}
|
||||
|
||||
// Entry point. Report any failure explicitly and exit non-zero instead of
// leaving the promise returned by main() floating.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user