feat: Variet Engine v1.0 + 5-model tuning complete

Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
2026-04-07 18:08:58 +09:00
parent 7c7a899fd5
commit c111b3a9b0
414 changed files with 3402 additions and 68598 deletions
--- a/scripts/test_hotswap.py
+++ b/scripts/test_hotswap.py
@@ -0,0 +1,136 @@
+"""Smoke test for the engine hot-swap endpoint (fast -> balanced).
+
+Run against a live engine listening on B. The script asks the engine to
+switch to the "balanced" role, checks that chat requests get a 503 while
+the switch is in progress, waits for the engine to report "ready", and then
+sends one chat request. The exit status is 0 only when every check passed.
+"""
+import json
+import sys
+import time
+import urllib.error
+import urllib.request
+
+B = "http://127.0.0.1:8000"
+POLL_INTERVAL_S = 2
+SWITCH_TIMEOUT_S = 120  # how long to wait for state == "ready"
+
+
+def _fetch(path, payload=None, method=None, timeout=10):
+    """Send one request to the engine and return the raw response body.
+
+    A non-None payload is JSON-encoded and sent as the request body.
+    Raises urllib.error.HTTPError for HTTP error statuses and OSError
+    (URLError, timeouts) when no HTTP response was received.
+    """
+    data = None
+    headers = {}
+    if payload is not None:
+        data = json.dumps(payload).encode()
+        headers["Content-Type"] = "application/json"
+    req = urllib.request.Request(f"{B}{path}", data=data, headers=headers, method=method)
+    # The context manager closes the connection even if read() fails.
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return resp.read()
+
+
+def _fetch_json(path, **kwargs):
+    """Like _fetch(), but decode the response body as JSON."""
+    return json.loads(_fetch(path, **kwargs))
+
+
+def _wait_until_ready():
+    """Poll /engine/status until it reports state "ready".
+
+    Returns True once ready, False if SWITCH_TIMEOUT_S elapses first.
+    """
+    start = time.monotonic()
+    while True:
+        elapsed = time.monotonic() - start
+        if elapsed > SWITCH_TIMEOUT_S:
+            return False
+        try:
+            s = _fetch_json("/engine/status", timeout=3)
+            print(f"  [{elapsed:.0f}s] state={s['state']}, role={s['role']}")
+            if s["state"] == "ready":
+                return True
+        except (OSError, ValueError, KeyError) as e:
+            # A failed poll is not fatal: keep polling until the deadline,
+            # but report the error instead of hiding it.
+            print(f"  [{elapsed:.0f}s] status unavailable: {e!r}")
+        time.sleep(POLL_INTERVAL_S)
+
+
+def main():
+    """Run tests 4-6 in order and return the process exit status."""
+    failures = []
+
+    # Test 4: Hot-swap to balanced
+    print("=== TEST 4: HOT-SWAP fast -> balanced ===")
+    r = _fetch_json("/engine/switch/balanced", method="POST", timeout=10)
+    print(f"  Switch response: {json.dumps(r)}")
+
+    # Test 5: During loading, /v1 should return 503
+    time.sleep(3)
+    print("\n=== TEST 5: 503 during loading ===")
+    probe = {"model": "m", "messages": [{"role": "user", "content": "hi"}], "max_tokens": 5}
+    try:
+        _fetch("/v1/chat/completions", payload=probe, timeout=5)
+    except urllib.error.HTTPError as e:
+        print(f"  Status: {e.code} (expected 503)")
+        retry = e.headers.get("Retry-After", "N/A")
+        print(f"  Retry-After: {retry}")
+        if e.code != 503:
+            failures.append(f"TEST 5: got HTTP {e.code}, expected 503")
+    except OSError as e:
+        # URLError and socket timeouts: no HTTP status was received at all.
+        print(f"  ERROR: request failed: {e}")
+        failures.append(f"TEST 5: request failed: {e}")
+    else:
+        print("  ERROR: Got 200 during loading!")
+        failures.append("TEST 5: request succeeded during loading")
+
+    # Wait for switch to complete
+    print("\n=== Waiting for switch to complete... ===")
+    if _wait_until_ready():
+        # Test 6: Verify new model works
+        print("\n=== TEST 6: Verify balanced model ===")
+        chat = {
+            "model": "m",
+            "messages": [
+                {"role": "system", "content": "You are helpful."},
+                {"role": "user", "content": "Say hello"},
+            ],
+            "max_tokens": 15,
+            "temperature": 0,
+        }
+        t0 = time.perf_counter()
+        try:
+            r = _fetch_json("/v1/chat/completions", payload=chat, timeout=60)
+        except (OSError, ValueError) as e:
+            print(f"  ERROR: chat request failed: {e}")
+            failures.append(f"TEST 6: chat request failed: {e}")
+        else:
+            dt = time.perf_counter() - t0
+            tk = (r.get("usage") or {}).get("completion_tokens") or 0
+            # Guard the division: dt can be 0 on a coarse clock.
+            speed = tk / dt if dt > 0 else 0.0
+            print(f"  Speed: {speed:.1f} t/s")
+    else:
+        print(f"  ERROR: engine not ready after {SWITCH_TIMEOUT_S}s")
+        failures.append("switch did not reach state 'ready' in time")
+
+    s = _fetch_json("/engine/status", timeout=10)
+    print(f"  Current model: {s['display_name']}")
+
+    if failures:
+        print("\nHOT-SWAP TESTS FAILED:")
+        for failure in failures:
+            print(f"  - {failure}")
+        return 1
+    print("\nALL HOT-SWAP TESTS PASSED")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())