Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/: 85 files -> 9 + _archive/
- Root .bat files -> _archive/
47 lines
1.9 KiB
Python
47 lines
1.9 KiB
Python
"""Hot-swap smoke test (tests 4-6) against the engine proxy at ``B``.

Sequence:
  4. POST /engine/switch/balanced to start a hot-swap.
  5. While the new model loads, /v1/chat/completions must answer 503.
  -  Poll /engine/status until it reports state == "ready".
  6. Send one chat completion, print the measured token rate, then print
     the display name reported by /engine/status.

The process exits with status 1 when a check fails, 0 otherwise.
"""
import json
import sys
import time
import urllib.error
import urllib.request

B = "http://127.0.0.1:8000"


def fetch_json(url, payload=None, method=None, timeout=10):
    """Send one HTTP request and return the JSON-decoded response body.

    Args:
        url: Absolute URL to request.
        payload: Optional object; when given it is JSON-encoded and sent as
            the request body with a JSON content type.
        method: Optional explicit HTTP method (urllib picks GET/POST
            from the presence of a body when this is None).
        timeout: Socket timeout in seconds.

    Raises:
        urllib.error.HTTPError: The server answered with an error status.
        OSError: Connection failure or timeout (URLError is a subclass).
        ValueError: The body is not valid JSON.
    """
    body = None
    headers = {}
    if payload is not None:
        body = json.dumps(payload).encode()
        headers = {"Content-Type": "application/json"}
    request = urllib.request.Request(url, data=body, headers=headers, method=method)
    # The context manager closes the response (and its socket) deterministically.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return json.loads(response.read())


def tokens_per_second(response, elapsed):
    """Return completion tokens per second for a chat-completion response.

    Args:
        response: Decoded response dict; ``usage.completion_tokens`` is read
            and treated as 0 when missing or null.
        elapsed: Wall-clock seconds the request took.

    Returns:
        The token rate, or 0.0 when ``elapsed`` is not positive.
    """
    usage = response.get("usage") or {}
    tokens = usage.get("completion_tokens") or 0
    return tokens / elapsed if elapsed > 0 else 0.0


def probe_rejected_while_loading():
    """Return True only if a chat request is refused with HTTP 503."""
    payload = {"model": "m", "messages": [{"role": "user", "content": "hi"}], "max_tokens": 5}
    request = urllib.request.Request(
        f"{B}/v1/chat/completions",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=5):
            pass
    except urllib.error.HTTPError as e:
        print(f" Status: {e.code} (expected 503)")
        retry = e.headers.get("Retry-After", "N/A")
        print(f" Retry-After: {retry}")
        return e.code == 503
    except OSError as e:
        # Timeouts and refused connections are a failed check, not a crash,
        # so the script can still wait for the switch and report a summary.
        print(f" ERROR: request failed: {e}")
        return False
    print(" ERROR: Got 200 during loading!")
    return False


def wait_until_ready(attempts=60, interval=2):
    """Poll /engine/status until state == "ready".

    Args:
        attempts: Maximum number of polls.
        interval: Seconds to sleep between polls.

    Returns:
        True once the engine reports ready, False if every attempt was used.
    """
    for i in range(attempts):
        try:
            s = fetch_json(f"{B}/engine/status", timeout=3)
            print(f" [{i * interval}s] state={s['state']}, role={s['role']}")
            if s["state"] == "ready":
                return True
        except (OSError, ValueError, KeyError, TypeError) as e:
            # The status endpoint may be briefly unreachable or return an
            # unexpected body mid-switch; keep polling, but say so instead of
            # swallowing everything (a bare except also hid Ctrl-C).
            print(f" [{i * interval}s] status unavailable: {e}")
        time.sleep(interval)
    return False


def verify_balanced_model():
    """Run one chat completion, print its token rate and the active model."""
    payload = {
        "model": "m",
        "messages": [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "Say hello"},
        ],
        "max_tokens": 15,
        "temperature": 0,
    }
    # perf_counter is monotonic, so the interval cannot be skewed by clock changes.
    t0 = time.perf_counter()
    r = fetch_json(f"{B}/v1/chat/completions", payload=payload, timeout=60)
    dt = time.perf_counter() - t0
    print(f" Speed: {tokens_per_second(r, dt):.1f} t/s")

    s = fetch_json(f"{B}/engine/status", timeout=10)
    print(f" Current model: {s['display_name']}")


def main():
    """Run tests 4-6 in order; return 0 if every check passed, else 1."""
    failures = []

    # Test 4: Hot-swap to balanced
    # An error here propagates: nothing below is meaningful unless the
    # switch request was accepted.
    print("=== TEST 4: HOT-SWAP fast -> balanced ===")
    r = fetch_json(f"{B}/engine/switch/balanced", method="POST", timeout=10)
    print(f" Switch response: {json.dumps(r)}")

    # Test 5: During loading, /v1 should return 503
    # NOTE(review): the 3 s pause presumably lets the switch get under way
    # before probing - confirm against the engine's switch latency.
    time.sleep(3)
    print("\n=== TEST 5: 503 during loading ===")
    if not probe_rejected_while_loading():
        failures.append("TEST 5: did not get 503 during loading")

    # Wait for switch to complete
    print("\n=== Waiting for switch to complete... ===")
    if wait_until_ready():
        # Test 6: Verify new model works
        print("\n=== TEST 6: Verify balanced model ===")
        verify_balanced_model()
    else:
        failures.append("engine never reported state=ready")

    if failures:
        print("\nHOT-SWAP TESTS FAILED")
        for failure in failures:
            print(f" - {failure}")
        return 1

    print("\nALL HOT-SWAP TESTS PASSED")
    return 0


if __name__ == "__main__":
    sys.exit(main())