"""Benchmark llama-server generation speed for one GPU tensor split.

Running this file as a script:

1. force-kills any running ``llama-server.exe``,
2. starts a fresh server with the configured tensor split,
3. polls ``/health`` until the server reports ``ok``,
4. sends a short warm-up request followed by a timed 200-token request,
5. prints the measured tokens/second, and
6. force-kills the server again (also on error or Ctrl+C).
"""

import http.client
import json
import os
import subprocess
import sys
import time
import urllib.request

PORT = 8000
BASE_URL = f"http://127.0.0.1:{PORT}"
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
CONTEXT = 262144
# Single source of truth for the split: passed to the server via ``-ts`` and
# echoed in the result banner so the printed label always matches the run.
TENSOR_SPLIT = "0.45,0.55"

BOOT_ATTEMPTS = 30  # number of /health polls before giving up
BOOT_POLL_INTERVAL = 3  # seconds to sleep between failed polls


def use_utf8_stdout() -> None:
    """Switch stdout to UTF-8 so the non-ASCII banner characters can print."""
    try:
        sys.stdout.reconfigure(encoding='utf-8')
    except AttributeError:
        # stdout was replaced by an object that has no reconfigure().
        pass


def kill_server() -> None:
    """Force-kill every ``llama-server.exe`` process, then pause 3 seconds.

    Best effort: a missing or unlaunchable ``taskkill`` is ignored.
    """
    try:
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            capture_output=True,
        )
    except OSError:
        # taskkill could not be started; nothing more we can do here.
        pass
    time.sleep(3)


def run_benchmark(max_tokens=200):
    """Send one chat completion request and time it.

    Args:
        max_tokens: Value sent as ``max_tokens`` in the request payload.

    Returns:
        A tuple ``(tokens_per_second, completion_tokens, elapsed_seconds)``.
        ``completion_tokens`` is read from the response's ``usage`` object and
        defaults to 0 when absent; ``tokens_per_second`` is 0 when no time
        elapsed.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [
            {"role": "user", "content": "Count from 1 to 50, each on new line."}
        ],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode("utf-8")
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments during the request.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=300) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    usage = result.get("usage", {})
    ct = usage.get("completion_tokens", 0)
    return ct / elapsed if elapsed > 0 else 0, ct, elapsed


def get_vram() -> str:
    """Return ``nvidia-smi`` per-GPU memory usage as CSV text.

    Each output line is ``index, memory.used, memory.total`` without header or
    units. Returns ``"Unknown"`` if ``nvidia-smi`` cannot be run or does not
    finish within 5 seconds.
    """
    try:
        r = subprocess.run(
            [
                "nvidia-smi",
                "--query-gpu=index,memory.used,memory.total",
                "--format=csv,noheader,nounits",
            ],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (OSError, subprocess.SubprocessError):
        return "Unknown"
    return r.stdout.strip()


def build_server_command() -> list:
    """Return the argv list used to launch llama-server for this benchmark."""
    return [
        LLAMA_SERVER,
        "--model", MODEL,
        "-ngl", "999",
        "-c", str(CONTEXT),
        "-np", "1",
        "-fa", "on",
        "--cache-type-k", "q4_0",
        "--cache-type-v", "q4_0",
        "-ub", "128",
        "-b", "512",
        "-t", "6",
        "-tb", "6",
        "--prio", "3",
        "--port", str(PORT),
        # The server binds to all interfaces; this script connects via loopback.
        "--host", "0.0.0.0",
        "-ts", TENSOR_SPLIT,
    ]


def server_is_healthy() -> bool:
    """Return True if ``GET /health`` answers with JSON ``{"status": "ok"}``.

    Connection errors, HTTP error statuses, timeouts and malformed bodies all
    count as "not healthy" rather than raising.
    """
    try:
        with urllib.request.urlopen(f"{BASE_URL}/health", timeout=2) as resp:
            data = json.loads(resp.read())
    except (OSError, ValueError, http.client.HTTPException):
        return False
    return isinstance(data, dict) and data.get("status") == "ok"


def wait_for_server(proc, attempts=BOOT_ATTEMPTS, interval=BOOT_POLL_INTERVAL) -> bool:
    """Poll the health endpoint until the server is ready.

    Args:
        proc: The ``subprocess.Popen`` handle of the launched server.
        attempts: Maximum number of health polls.
        interval: Seconds to sleep after each unsuccessful poll.

    Returns:
        True once the server reports healthy; False if the attempts run out or
        the server process has already exited.
    """
    for _ in range(attempts):
        if proc.poll() is not None:
            # The server process died; further polling cannot succeed.
            return False
        if server_is_healthy():
            return True
        time.sleep(interval)
    return False


def format_result(tps, ct, el) -> str:
    """Return the multi-line result banner for one benchmark run.

    Args:
        tps: Measured tokens per second.
        ct: Number of completion tokens generated.
        el: Elapsed time of the request in seconds.
    """
    rule = "=" * 50
    split_label = TENSOR_SPLIT.replace(",", "/")
    return "\n".join([
        rule,
        f"★ {split_label} SPLIT RESULT: {tps:.2f} t/s ★",
        f" Tokens: {ct} / Time: {el:.2f}s",
        rule,
    ])


def main() -> int:
    """Run the full benchmark; return the process exit code (0 ok, 1 boot failure)."""
    use_utf8_stdout()
    kill_server()

    print(f"Starting server with tensorSplit {TENSOR_SPLIT}")
    proc = subprocess.Popen(
        build_server_command(),
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        cwd=os.getcwd(),
    )

    # From here on the server may be running: always kill it on the way out,
    # including on Ctrl+C or an unexpected error.
    try:
        boot_start = time.perf_counter()
        if not wait_for_server(proc):
            print("Server failed to boot.")
            return 1
        boot_time = time.perf_counter() - boot_start
        print(f"Booted in {boot_time:.1f}s")
        print(f"VRAM:\n{get_vram()}")

        try:
            print("Warming up...")
            run_benchmark(10)
            print("Benchmarking (200 tokens)...")
            tps, ct, el = run_benchmark(200)
            print(format_result(tps, ct, el))
        except Exception as e:
            # Top-level boundary: report the failure and still clean up.
            print(f"Error benchmark: {e}")
        return 0
    finally:
        kill_server()


if __name__ == "__main__":
    sys.exit(main())