"""Smoke test for hot-swapping the engine from the fast to the balanced model.

Runs tests 4-6 against the server at ``B``:

* Test 4 - POST ``/engine/switch/balanced`` to start the swap.
* Test 5 - shortly afterwards ``/v1/chat/completions`` is expected to answer
  503; the ``Retry-After`` header is printed when present.
* Test 6 - once ``/engine/status`` reports ``state == "ready"``, a chat
  completion is issued and its throughput and the current model are printed.

The success banner is printed only when every check passed; otherwise the
process exits with status 1.
"""
import json
import sys
import time
import urllib.error
import urllib.request

B = "http://127.0.0.1:8000"


def fetch_json(target, timeout=None):
    """Open *target* and return its body decoded as JSON.

    Args:
        target: URL string or ``urllib.request.Request``.
        timeout: Seconds to wait; ``None`` leaves urllib's default untouched.

    Raises:
        urllib.error.URLError: On HTTP or connection errors (an ``OSError``).
        ValueError: If the body is not valid JSON.
    """
    options = {} if timeout is None else {"timeout": timeout}
    # ``with`` closes the response even if reading or decoding fails.
    with urllib.request.urlopen(target, **options) as response:
        return json.loads(response.read())


def chat_request(body):
    """Build the JSON POST request for ``/v1/chat/completions`` from *body*."""
    return urllib.request.Request(
        f"{B}/v1/chat/completions",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )


def tokens_per_second(tokens, elapsed):
    """Return ``tokens / elapsed``, or 0.0 when *elapsed* is not positive."""
    return tokens / elapsed if elapsed > 0 else 0.0


def switch_to_balanced():
    """Test 4: ask the engine to switch to the balanced model.

    Request errors propagate: without a successful switch the remaining
    tests are meaningless.
    """
    print("=== TEST 4: HOT-SWAP fast -> balanced ===")
    request = urllib.request.Request(f"{B}/engine/switch/balanced", method="POST")
    reply = fetch_json(request, timeout=10)
    print(f" Switch response: {json.dumps(reply)}")


def check_503_while_loading():
    """Test 5: return True only if the chat endpoint answers 503."""
    print("\n=== TEST 5: 503 during loading ===")
    request = chat_request(
        {
            "model": "m",
            "messages": [{"role": "user", "content": "hi"}],
            "max_tokens": 5,
        }
    )
    try:
        with urllib.request.urlopen(request, timeout=5):
            pass
    except urllib.error.HTTPError as e:
        print(f" Status: {e.code} (expected 503)")
        retry = e.headers.get("Retry-After", "N/A")
        print(f" Retry-After: {retry}")
        e.close()  # HTTPError wraps the open response
        return e.code == 503
    except OSError as e:
        # Connection refused, socket timeout, ...: not the 503 we expect.
        print(f" ERROR: request failed: {e!r}")
        return False
    print(" ERROR: Got 200 during loading!")
    return False


def wait_until_ready(attempts=60, interval=2):
    """Poll ``/engine/status`` until ``state`` is ``"ready"``.

    Args:
        attempts: Maximum number of polls.
        interval: Seconds slept between polls.

    Returns:
        True once the engine reports ready, False if *attempts* ran out.
    """
    print("\n=== Waiting for switch to complete... ===")
    for i in range(attempts):
        try:
            status = fetch_json(f"{B}/engine/status", timeout=3)
            print(f" [{i * interval}s] state={status['state']}, role={status['role']}")
            if status["state"] == "ready":
                return True
        except (OSError, ValueError, KeyError, TypeError) as e:
            # A poll may fail transiently; keep polling, but say why
            # instead of swallowing every exception silently.
            print(f" [{i * interval}s] status poll failed: {e!r}")
        time.sleep(interval)
    return False


def verify_balanced():
    """Test 6: run one chat completion and report speed and current model.

    Request errors propagate so a broken model fails the run.
    """
    print("\n=== TEST 6: Verify balanced model ===")
    request = chat_request(
        {
            "model": "m",
            "messages": [
                {"role": "system", "content": "You are helpful."},
                {"role": "user", "content": "Say hello"},
            ],
            "max_tokens": 15,
            "temperature": 0,
        }
    )
    # perf_counter is monotonic, so the measured duration cannot go negative.
    started = time.perf_counter()
    reply = fetch_json(request, timeout=60)
    elapsed = time.perf_counter() - started
    tokens = reply.get("usage", {}).get("completion_tokens", 0)
    print(f" Speed: {tokens_per_second(tokens, elapsed):.1f} t/s")
    status = fetch_json(f"{B}/engine/status")
    print(f" Current model: {status['display_name']}")


def main():
    """Run tests 4-6 in order; return 0 if all passed, 1 otherwise."""
    switch_to_balanced()

    # Pause before probing; presumably this lets the server enter its
    # loading state - TODO confirm against the server's behavior.
    time.sleep(3)
    saw_503 = check_503_while_loading()

    if not wait_until_ready():
        print(" ERROR: engine did not report state=ready in time")
        print("\nHOT-SWAP TESTS FAILED")
        return 1

    verify_balanced()

    if not saw_503:
        print("\nHOT-SWAP TESTS FAILED (TEST 5)")
        return 1
    print("\nALL HOT-SWAP TESTS PASSED")
    return 0


if __name__ == "__main__":
    sys.exit(main())