"""Benchmark llama-server generation speed with a 0.44/0.56 tensor split.

The script kills any running ``llama-server.exe``, starts a fresh server,
polls its ``/health`` endpoint until it reports ``ok`` (up to
``BOOT_ATTEMPTS`` tries, one second apart), sends a single chat completion
request capped at 200 tokens and prints the measured tokens per second.
The server process is always terminated before the script exits.
"""
import http.client
import json
import os
import subprocess
import sys
import time
import urllib.error
import urllib.request

MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"

# Single source of truth for the port used by both the server command line
# and the client URLs.
PORT = 8000
BASE_URL = f"http://127.0.0.1:{PORT}"
BOOT_ATTEMPTS = 120


def build_server_command(model=MODEL, server=LLAMA_SERVER):
    """Return the llama-server argument list used for this benchmark run."""
    return [
        server,
        "--model", model,
        "-ngl", "999",
        "-c", "262144",
        "-np", "1",
        "-fa", "on",
        "--cache-type-k", "q4_0",
        "--cache-type-v", "q4_0",
        "-ub", "128",
        "-b", "512",
        "-t", "6",
        "-tb", "6",
        "--prio", "3",
        "--port", str(PORT),
        "--host", "0.0.0.0",
        "-ts", "0.44,0.56",
    ]


def is_healthy(body):
    """Return True when *body* is a JSON object whose ``status`` is ``"ok"``.

    Anything that is not valid JSON, or is valid JSON but not an object,
    counts as "not healthy" instead of raising.
    """
    try:
        parsed = json.loads(body)
    except ValueError:  # JSONDecodeError and UnicodeDecodeError both derive from it
        return False
    return isinstance(parsed, dict) and parsed.get("status") == "ok"


def wait_until_ready(proc, attempts=BOOT_ATTEMPTS):
    """Poll ``/health`` once per second until the server reports ready.

    Returns True as soon as the health check succeeds, and False when the
    attempts are exhausted or *proc* has already exited.
    """
    for i in range(attempts):
        # No point polling a server that has already died.
        if proc.poll() is not None:
            return False
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=1) as r:
                if is_healthy(r.read()):
                    return True
        except (OSError, http.client.HTTPException):
            # Connection refused, timeouts and HTTP error statuses (URLError
            # derives from OSError) just mean "not up yet". Ctrl+C is
            # deliberately NOT swallowed here.
            pass
        print(f" booting... {i}s", end='\r', flush=True)
        time.sleep(1)
    return False


def run_benchmark():
    """Send one chat completion request and print the measured throughput."""
    payload = json.dumps({
        "messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}],
        "max_tokens": 200,
        "temperature": 0
    }).encode()
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )

    # perf_counter is monotonic, so the measurement is immune to wall-clock
    # adjustments during the request.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=300) as r:
        res = json.loads(r.read())
    el = time.perf_counter() - t0

    ct = res["usage"]["completion_tokens"]
    tps = ct / el

    print("="*50)
    print(f"ā˜… 0.44 / 0.56 RESULT: {tps:.2f} t/s ā˜…")
    print(f" Tokens: {ct} | Time: {el:.2f}s")
    print("="*50)


def main():
    """Run the whole benchmark; return the process exit code."""
    try:
        sys.stdout.reconfigure(encoding='utf-8')
    except AttributeError:
        # stdout has been replaced by an object without reconfigure().
        pass

    # Make sure no previous server instance is still holding the port.
    subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True)
    time.sleep(2)

    # NOTE(review): the status glyphs in the messages below look mis-encoded
    # (mojibake); they are left untouched here - confirm the file's encoding.
    print("šŸš€ Starting Challenge (0.44, 0.56) ...")
    proc = subprocess.Popen(
        build_server_command(),
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    try:
        if not wait_until_ready(proc):
            print("\nāŒ FAILED to boot.")
            return 1

        print("\nāœ… Booted! Testing 200 tokens...")
        try:
            run_benchmark()
        except Exception as e:
            # Best-effort benchmark: report the failure, still shut down cleanly.
            print(f"\nāŒ Benchmark Error: {e}")
        return 0
    finally:
        # Always stop the server, even on Ctrl+C or an unexpected error,
        # and reap it so no zombie/handle is left behind.
        proc.kill()
        proc.wait()


if __name__ == "__main__":
    sys.exit(main())