# variet_llm/scripts/qwen_split_challenge.py

import subprocess
import time
import json
import urllib.request
import sys
import os

# Switch stdout to UTF-8 so the emoji status lines printed by this script
# do not fail on consoles using a narrower default encoding.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except AttributeError:
    # stdout has no reconfigure() (e.g. it was replaced by another object);
    # keep whatever encoding it already has.
    pass

# Model file and server binary, given as Windows-style paths relative to the
# directory the script is started from.
MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"

# Clear out any llama-server.exe left over from an earlier run. Output is
# captured and the return code is not checked, so "nothing to kill" is silent.
subprocess.run(
    ["taskkill", "/F", "/IM", "llama-server.exe"],
    capture_output=True,
)
# Short pause before relaunching -- presumably to let the killed process
# release its port and device memory (TODO confirm).
time.sleep(2)

# Server command line, assembled group by group. The flag meanings are not
# defined in this script; check llama-server's own help for the bundled build.
cmd = [LLAMA_SERVER, "--model", MODEL]
cmd += ["-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on"]
cmd += ["--cache-type-k", "q4_0", "--cache-type-v", "q4_0"]
cmd += ["-ub", "128", "-b", "512", "-t", "6", "-tb", "6"]
# Port 8000 is the one the health check and benchmark requests target.
cmd += ["--prio", "3", "--port", "8000", "--host", "0.0.0.0"]
# The split under test; the banner below echoes the same two values.
cmd += ["-ts", "0.44,0.56"]

print("🚀 Starting Challenge (0.44, 0.56) ...")
# Server stdout/stderr are discarded; readiness is detected over HTTP instead.
proc = subprocess.Popen(
    cmd,
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)

# Poll the server's /health endpoint until it answers {"status": "ok"}.
# Up to 120 attempts are made. Each attempt can block for the 1 s request
# timeout plus the 1 s sleep, so the counter printed below is an attempt
# index rather than exact wall-clock seconds.
ready = False
for i in range(120):
    # A server process that has already exited will never become healthy;
    # stop polling instead of burning the remaining attempts.
    if proc.poll() is not None:
        break
    try:
        req = urllib.request.Request("http://127.0.0.1:8000/health")
        with urllib.request.urlopen(req, timeout=1) as r:
            if json.loads(r.read()).get("status") == "ok":
                ready = True
                break
    except Exception:
        # Refused connections, timeouts, HTTP error responses and malformed
        # bodies are all expected while the server is still starting, so
        # they are ignored and the poll is retried. Catching Exception
        # rather than using a bare except keeps Ctrl+C and sys.exit() working.
        pass
    print(f" booting... {i}s", end='\r', flush=True)
    time.sleep(1)

if not ready:
    print("\n❌ FAILED to boot.")
    # Make sure the child is gone (and reaped) before exiting with an error.
    proc.kill()
    proc.wait()
    sys.exit(1)

# Run a single chat-completion request against the freshly booted server and
# report generation throughput (completion tokens / wall-clock seconds).
print("\n✅ Booted! Testing 200 tokens...")
try:
    payload = json.dumps({
        "messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}],
        "max_tokens": 200, "temperature": 0
    }).encode()
    req = urllib.request.Request(
        "http://127.0.0.1:8000/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=300) as r:
        res = json.loads(r.read())
        el = time.perf_counter() - t0
    ct = res["usage"]["completion_tokens"]
    # Guard the division so a zero-length interval reports 0 instead of
    # raising ZeroDivisionError.
    tps = ct / el if el > 0 else 0.0
    print("="*50)
    print(f"★ 0.44 / 0.56 RESULT: {tps:.2f} t/s ★")
    print(f"   Tokens: {ct} | Time: {el:.2f}s")
    print("="*50)
except Exception as e:
    print(f"\n❌ Benchmark Error: {e}")
finally:
    # Always stop the server -- including on Ctrl+C, which `except Exception`
    # does not catch -- and wait for it so the child process is reaped.
    proc.kill()
    proc.wait()