feat: Variet Engine v1.0 + 5-model tuning complete

Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
2026-04-07 18:08:58 +09:00
parent 7c7a899fd5
commit c111b3a9b0
414 changed files with 3402 additions and 68598 deletions
--- a/scripts/_archive/benchmarks/qwen_split_challenge.py
+++ b/scripts/_archive/benchmarks/qwen_split_challenge.py
@@ -0,0 +1,67 @@
+import subprocess
+import time
+import json
+import urllib.request
+import sys
+import os
+
+try: sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError: pass
+
+MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
+LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
+
+subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True)
+time.sleep(2)
+
+cmd = [
+    LLAMA_SERVER, "--model", MODEL,
+    "-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
+    "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
+    "-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
+    "--prio", "3", "--port", "8000", "--host", "0.0.0.0",
+    "-ts", "0.44,0.56"
+]
+
+print(f"🚀 Starting Challenge (0.44, 0.56) ...")
+proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+ready = False
+for i in range(120):
+    try:
+        req = urllib.request.Request("http://127.0.0.1:8000/health")
+        with urllib.request.urlopen(req, timeout=1) as r:
+            if json.loads(r.read()).get("status") == "ok":
+                ready = True
+                break
+    except:
+        pass
+    print(f" booting... {i}s", end='\r', flush=True)
+    time.sleep(1)
+
+if not ready:
+    print("\n❌ FAILED to boot.")
+    proc.kill()
+    sys.exit(1)
+
+print("\n✅ Booted! Testing 200 tokens...")
+try:
+    payload = json.dumps({
+        "messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}],
+        "max_tokens": 200, "temperature": 0
+    }).encode()
+    req = urllib.request.Request("http://127.0.0.1:8000/v1/chat/completions", data=payload, headers={"Content-Type": "application/json"})
+    t0 = time.time()
+    with urllib.request.urlopen(req, timeout=300) as r:
+        res = json.loads(r.read())
+        el = time.time() - t0
+        ct = res["usage"]["completion_tokens"]
+        tps = ct / el
+        print("="*50)
+        print(f"★ 0.44 / 0.56 RESULT: {tps:.2f} t/s ★")
+        print(f"   Tokens: {ct} | Time: {el:.2f}s")
+        print("="*50)
+except Exception as e:
+    print(f"\n❌ Benchmark Error: {e}")
+
+proc.kill()