# variet_llm/scripts/test_hotswap.py

import json
import time
import urllib.error
import urllib.request

# Manual smoke test for the engine hot-swap endpoints of a locally running
# server. It runs top to bottom as a script: request a switch to "balanced",
# check that the chat endpoint answers 503 while loading, poll
# /engine/status until the state is "ready", then send one chat completion.
# Every failed expectation is recorded so the final verdict is honest and
# the process exit code is non-zero on failure.

B = "http://127.0.0.1:8000"

POLL_ATTEMPTS = 60      # maximum number of /engine/status polls
POLL_INTERVAL_S = 2     # seconds to sleep between polls

# Errors a request/parse step can reasonably raise: network failures and
# timeouts (OSError covers URLError, HTTPError and socket timeouts),
# malformed JSON (ValueError) and unexpected response shapes.
_REQUEST_ERRORS = (OSError, ValueError, KeyError, TypeError, AttributeError)


def _http(url, payload=None, method=None, timeout=10):
    """Send one HTTP request and return the raw response body as bytes.

    Args:
        url: Absolute URL to call.
        payload: Optional JSON-serialisable object; when given it is sent as
            the request body with a JSON content type (urllib then defaults
            the method to POST).
        method: Optional explicit HTTP method (e.g. "POST" without a body).
        timeout: Socket timeout in seconds.

    Raises:
        urllib.error.HTTPError: The server answered with an error status.
        OSError: Connection failure or timeout.
    """
    body = None
    headers = {}
    if payload is not None:
        body = json.dumps(payload).encode()
        headers["Content-Type"] = "application/json"
    request = urllib.request.Request(url, data=body, headers=headers, method=method)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read()


failures = []  # human-readable descriptions of every failed expectation
chat_url = f"{B}/v1/chat/completions"
status_url = f"{B}/engine/status"

# Test 4: Hot-swap to balanced
# A failure here is left to propagate: nothing below is meaningful if the
# switch request itself could not be made.
print("=== TEST 4: HOT-SWAP fast -> balanced ===")
r = json.loads(_http(f"{B}/engine/switch/balanced", method="POST", timeout=10))
print(f"  Switch response: {json.dumps(r)}")

# Test 5: During loading, /v1 should return 503
time.sleep(3)
print("\n=== TEST 5: 503 during loading ===")
try:
    _http(
        chat_url,
        payload={"model": "m", "messages": [{"role": "user", "content": "hi"}], "max_tokens": 5},
        timeout=5,
    )
except urllib.error.HTTPError as e:
    # HTTPError must be handled before OSError, which is its base class.
    print(f"  Status: {e.code} (expected 503)")
    retry = e.headers.get("Retry-After", "N/A")
    print(f"  Retry-After: {retry}")
    if e.code != 503:
        failures.append(f"TEST 5: expected HTTP 503 during loading, got {e.code}")
    e.close()
except OSError as e:
    print(f"  ERROR: request failed: {e!r}")
    failures.append(f"TEST 5: request failed instead of returning 503: {e!r}")
else:
    print("  ERROR: Got 200 during loading!")
    failures.append("TEST 5: request succeeded while the engine should be loading")

# Wait for switch to complete
print("\n=== Waiting for switch to complete... ===")
for i in range(POLL_ATTEMPTS):
    try:
        s = json.loads(_http(status_url, timeout=3))
        print(f"  [{i * POLL_INTERVAL_S}s] state={s['state']}, role={s['role']}")
        if s["state"] == "ready":
            break
    except _REQUEST_ERRORS as e:
        # The status endpoint may be briefly unavailable during the swap;
        # report it and keep polling rather than silently swallowing it.
        print(f"  [{i * POLL_INTERVAL_S}s] status unavailable: {e!r}")
    time.sleep(POLL_INTERVAL_S)
else:
    # Loop ran out without hitting `break`: the engine never became ready.
    print("  ERROR: engine did not become ready in time")
    failures.append(
        f"engine did not reach state 'ready' within {POLL_ATTEMPTS * POLL_INTERVAL_S}s"
    )

# Test 6: Verify new model works
print("\n=== TEST 6: Verify balanced model ===")
try:
    p = {
        "model": "m",
        "messages": [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "Say hello"},
        ],
        "max_tokens": 15,
        "temperature": 0,
    }
    # perf_counter is monotonic, so the interval cannot go negative if the
    # wall clock is adjusted mid-request.
    t0 = time.perf_counter()
    r = json.loads(_http(chat_url, payload=p, timeout=60))
    dt = time.perf_counter() - t0
    # `usage` may be missing or null in the response; treat both as no tokens.
    tk = (r.get("usage") or {}).get("completion_tokens", 0)
    speed = tk / dt if dt > 0 else 0.0
    print(f"  Speed: {speed:.1f} t/s")

    s = json.loads(_http(status_url, timeout=10))
    print(f"  Current model: {s['display_name']}")
except _REQUEST_ERRORS as e:
    print(f"  ERROR: {e!r}")
    failures.append(f"TEST 6: {e!r}")

if failures:
    print(f"\n{len(failures)} HOT-SWAP CHECK(S) FAILED")
    for message in failures:
        print(f"  - {message}")
    raise SystemExit(1)
print("\nALL HOT-SWAP TESTS PASSED")