// variet_llm/scripts/_archive/tuning/find_max_dense.mjs

import { spawn, exec } from 'child_process';

/**
 * Promise-based sleep.
 *
 * @param {number} ms - Delay passed to `setTimeout`, in milliseconds.
 * @returns {Promise<undefined>} Settles once the timer fires.
 */
const delay = (ms) => new Promise((resolve) => {
    setTimeout(resolve, ms);
});

/**
 * Runs `taskkill /F /IM llama-server.exe` and then waits a fixed two seconds
 * before resolving.
 *
 * Best-effort: the exec callback ignores its error/stdout/stderr arguments,
 * so the promise resolves the same way whether or not taskkill succeeded.
 *
 * @returns {Promise<undefined>}
 */
async function killServer() {
    await new Promise((resolve) => {
        exec('taskkill /F /IM llama-server.exe', () => {
            // Fixed pause after taskkill returns, before the caller continues.
            setTimeout(resolve, 2000);
        });
    });
}

/**
 * Boots llama-server for one model at one context size and, if the server
 * becomes healthy, runs the quick benchmark script against it.
 *
 * Any previously running server is killed first, and the server is killed
 * again before returning, so each call starts and ends with no server running.
 *
 * @param {string} modelPath - Model file name, resolved under `models\`.
 * @param {number} contextSize - Value passed to the server's `-c` flag.
 * @returns {Promise<boolean>} `true` when `/health` answered 200 within the
 *   polling window and no allocation failure was seen on stderr (the benchmark
 *   output is then printed); `false` otherwise.
 */
async function testContextSize(modelPath, contextSize) {
    console.log(`\nTesting ${modelPath} with -c ${contextSize}...`);
    await killServer();

    // Only the model path and `-c` vary between runs; everything else is fixed.
    const args = [
        '--model', `models\\${modelPath}`,
        '-ngl', '999',
        '-c', contextSize.toString(),
        '-fa', 'on',
        '--cache-type-k', 'q4_0',
        '--cache-type-v', 'q4_0',
        '-ub', '512',
        '-b', '2048',
        '-t', '6',
        '-tb', '6',
        '--split-mode', 'row',
        '--prio', '3',
        '--fit', 'off',
        '--port', '8000',
        '--host', '0.0.0.0'
    ];

    const server = spawn('llama_bin_run\\llama-server.exe', args, { stdio: 'pipe' });

    let booted = false;
    let oomed = false;
    let exited = false;

    // Without an 'error' listener a failed spawn (e.g. missing binary) would
    // surface as an unhandled 'error' event and crash the whole script.
    server.on('error', (err) => {
        exited = true;
        console.log(`❌ Could not start llama-server: ${err.message}`);
    });
    // Lets the polling loop stop early instead of probing a dead process.
    server.on('exit', () => {
        exited = true;
    });

    // stdout is piped but never read here; drain it so the child cannot stall
    // on a full pipe buffer.
    server.stdout.resume();

    server.stderr.on('data', (d) => {
        // Lower-case once so both phrases are matched case-insensitively.
        const text = d.toString().toLowerCase();
        if (text.includes('out of memory') || text.includes('failed to allocate')) {
            oomed = true;
        }
    });

    // Poll the health endpoint: up to 20 attempts, 2 s apart.
    for (let i = 0; i < 20; i++) {
        if (oomed || exited) break;
        try {
            // fetch() has no `timeout` option; an abort signal is what actually
            // bounds how long a single probe may hang.
            const res = await fetch('http://127.0.0.1:8000/health', {
                signal: AbortSignal.timeout(2000),
            });
            if (res.status === 200) {
                booted = true;
                break;
            }
        } catch (e) {
            // Connection refused / probe timeout is expected while the server
            // is still loading; just retry on the next iteration.
        }
        await delay(2000);
    }

    if (oomed || !booted) {
        let reason = 'server did not become healthy in time';
        if (oomed) {
            reason = 'Out of Memory';
        } else if (exited) {
            reason = 'server exited before becoming healthy';
        }
        console.log(`❌ Failed: ${reason} at -c ${contextSize}`);
        server.kill('SIGKILL');
        await killServer();
        return false;
    }

    console.log(`✅ Booted! Running Benchmark...`);

    // Benchmark: print whatever the script produced; if it produced nothing
    // but failed, report the failure instead of printing an empty line.
    const bench = await new Promise((resolve) => {
        exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
            resolve(stdout || stderr || (err ? `Benchmark failed: ${err.message}` : ''));
        });
    });

    console.log(bench);
    await killServer();
    return true;
}

/**
 * Tries a fixed list of context sizes from largest to smallest and reports
 * the first one for which `testContextSize` returns a truthy result.
 *
 * Nothing is returned; the outcome is only logged to the console.
 *
 * @param {string} modelName - Model file name forwarded to `testContextSize`.
 * @returns {Promise<void>}
 */
async function findMaxContext(modelName) {
    // Descending order, so the first success is the largest working size.
    const candidateSizes = [262144, 131072, 65536, 32768, 16384, 8192];

    for (const size of candidateSizes) {
        const works = await testContextSize(modelName, size);
        if (!works) {
            continue;
        }
        console.log(`\n🎉 MAX STABLE CONTEXT FOR ${modelName}: ${size}`);
        return;
    }

    console.log(`\n❌ Failed to find any working context size for ${modelName}`);
}

/**
 * Script entry point: runs the max-context search for each model in turn.
 *
 * @returns {Promise<void>}
 */
async function main() {
    // `exec('set CUDA_VISIBLE_DEVICES=')` only changed the environment of a
    // throwaway shell. Removing the variable from this process's environment
    // is what child processes spawned later actually inherit.
    delete process.env.CUDA_VISIBLE_DEVICES;

    console.log("============= QWEN 27B Q4_K_M =============");
    await findMaxContext('Qwen3.5-27B-Q4_K_M.gguf');

    console.log("\n============= GEMMA 4 31B Q4_K_M =============");
    await findMaxContext('gemma-4-31B-it-Q4_K_M.gguf');
}

// Do not leave the top-level promise floating: report the failure and make
// the process exit with a non-zero status.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});