feat: Variet Engine v1.0 + 5-model tuning complete

Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
2026-04-07 18:08:58 +09:00
parent 7c7a899fd5
commit c111b3a9b0
414 changed files with 3402 additions and 68598 deletions
--- a/scripts/_archive/tuning/find_max_dense.mjs
+++ b/scripts/_archive/tuning/find_max_dense.mjs
@@ -0,0 +1,101 @@
+import { spawn, exec } from 'child_process';
+
+const delay = ms => new Promise(res => setTimeout(res, ms));
+
+async function killServer() {
+    return new Promise(r => exec('taskkill /F /IM llama-server.exe', () => { setTimeout(r, 2000); }));
+}
+
+async function testContextSize(modelPath, contextSize) {
+    console.log(`\nTesting ${modelPath} with -c ${contextSize}...`);
+    await killServer();
+
+    const args = [
+        '--model', `models\\${modelPath}`,
+        '-ngl', '999',
+        '-c', contextSize.toString(),
+        '-fa', 'on',
+        '--cache-type-k', 'q4_0',
+        '--cache-type-v', 'q4_0',
+        '-ub', '512',
+        '-b', '2048',
+        '-t', '6',
+        '-tb', '6',
+        '--split-mode', 'row',
+        '--prio', '3',
+        '--fit', 'off',
+        '--port', '8000',
+        '--host', '0.0.0.0'
+    ];
+
+    const server = spawn('llama_bin_run\\llama-server.exe', args, { stdio: 'pipe' });
+    
+    let booted = false;
+    let oomed = false;
+
+    server.stderr.on('data', (d) => {
+        const text = d.toString();
+        if (text.toLowerCase().includes('out of memory') || text.includes('failed to allocate')) {
+            oomed = true;
+        }
+    });
+
+    for (let i = 0; i < 20; i++) {
+        if (oomed) break;
+        try {
+            const res = await fetch('http://127.0.0.1:8000/health', { timeout: 2000 });
+            if (res.status === 200) {
+                booted = true;
+                break;
+            }
+        } catch(e) {}
+        await delay(2000);
+    }
+
+    if (oomed || !booted) {
+        console.log(`❌ Failed: Out of Memory at -c ${contextSize}`);
+        server.kill('SIGKILL');
+        await killServer();
+        return false;
+    }
+
+    console.log(`✅ Booted! Running Benchmark...`);
+    
+    // Benchmark
+    const bench = await new Promise(r => exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
+        r(stdout || stderr);
+    }));
+    
+    console.log(bench);
+    await killServer();
+    return true;
+}
+
+async function findMaxContext(modelName) {
+    const contexts = [262144, 131072, 65536, 32768, 16384, 8192];
+    
+    let maxFound = false;
+    for (const c of contexts) {
+        const success = await testContextSize(modelName, c);
+        if (success) {
+            maxFound = true;
+            console.log(`\n🎉 MAX STABLE CONTEXT FOR ${modelName}: ${c}`);
+            break;
+        }
+    }
+    
+    if (!maxFound) {
+        console.log(`\n❌ Failed to find any working context size for ${modelName}`);
+    }
+}
+
+async function main() {
+    exec('set CUDA_VISIBLE_DEVICES=');
+    console.log("============= QWEN 27B Q4_K_M =============");
+    await findMaxContext('Qwen3.5-27B-Q4_K_M.gguf');
+    
+    console.log("\n============= GEMMA 4 31B Q4_K_M =============");
+    await findMaxContext('gemma-4-31B-it-Q4_K_M.gguf');
+}
+
+main();