feat: Variet Engine v1.0 + 5-model tuning complete

Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
This commit is contained in:
Variet-Worker
2026-04-07 18:08:58 +09:00
parent 7c7a899fd5
commit c111b3a9b0
414 changed files with 3402 additions and 68598 deletions

46
scripts/test_hotswap.py Normal file
View File

@@ -0,0 +1,46 @@
import urllib.request, json, time
B = "http://127.0.0.1:8000"  # base URL of the engine proxy under test

# Descriptions of every check that did not behave as expected.  Consulted at
# the end so the script cannot report success after a failed check.
failures = []


def _fetch_json(target, timeout):
    """Open *target* (a URL string or a Request) and return its JSON body.

    The response is closed before returning.  HTTP and connection errors
    raised by ``urlopen`` propagate to the caller.
    """
    with urllib.request.urlopen(target, timeout=timeout) as resp:
        return json.loads(resp.read())


def _chat_request(payload):
    """Build a JSON POST request for the chat-completions endpoint."""
    return urllib.request.Request(
        f"{B}/v1/chat/completions",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )


# Test 4: Hot-swap to balanced
print("=== TEST 4: HOT-SWAP fast -> balanced ===")
req = urllib.request.Request(f"{B}/engine/switch/balanced", method="POST")
r = _fetch_json(req, timeout=10)
print(f" Switch response: {json.dumps(r)}")

# Test 5: During loading, /v1 should return 503
time.sleep(3)
print("\n=== TEST 5: 503 during loading ===")
probe = _chat_request(
    {"model": "m", "messages": [{"role": "user", "content": "hi"}], "max_tokens": 5}
)
try:
    # Only the status matters here; the context manager closes the response.
    with urllib.request.urlopen(probe, timeout=5):
        pass
except urllib.error.HTTPError as e:
    print(f" Status: {e.code} (expected 503)")
    retry = e.headers.get("Retry-After", "N/A")
    print(f" Retry-After: {retry}")
    if e.code != 503:
        failures.append(f"TEST 5: got HTTP {e.code} during loading, expected 503")
else:
    print(" ERROR: Got 200 during loading!")
    failures.append("TEST 5: request succeeded while the new model was loading")

# Wait for switch to complete
print("\n=== Waiting for switch to complete... ===")
ready = False
for i in range(60):
    try:
        s = _fetch_json(f"{B}/engine/status", timeout=3)
        print(f" [{i*2}s] state={s['state']}, role={s['role']}")
        if s["state"] == "ready":
            ready = True
            break
    except (OSError, ValueError, KeyError) as exc:
        # Polling is best-effort: keep retrying on connection errors, bad
        # JSON, or missing keys, but report them instead of hiding them.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        print(f" [{i*2}s] status poll failed: {exc!r}")
    time.sleep(2)

if ready:
    # Test 6: Verify new model works
    print("\n=== TEST 6: Verify balanced model ===")
    verify = _chat_request(
        {
            "model": "m",
            "messages": [
                {"role": "system", "content": "You are helpful."},
                {"role": "user", "content": "Say hello"},
            ],
            "max_tokens": 15,
            "temperature": 0,
        }
    )
    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments during the request.
    t0 = time.perf_counter()
    r = _fetch_json(verify, timeout=60)
    dt = time.perf_counter() - t0
    tk = r.get("usage", {}).get("completion_tokens", 0)
    print(f" Speed: {tk/dt:.1f} t/s")
    s = _fetch_json(f"{B}/engine/status", timeout=3)
    print(f" Current model: {s['display_name']}")
else:
    print(" ERROR: engine never reported state=ready; skipping TEST 6")
    failures.append("switch did not reach state=ready within the polling window")

if failures:
    print("\nHOT-SWAP TESTS FAILED")
    for reason in failures:
        print(f" - {reason}")
    # Non-zero exit status so callers and CI can detect the failure.
    raise SystemExit(1)
print("\nALL HOT-SWAP TESTS PASSED")