# variet_llm/scripts/_archive/benchmarks/test_split_03_07.py

import subprocess
import time
import json
import urllib.request
import sys
import os

# Switch stdout to UTF-8 so the non-ASCII "★" characters printed with the
# benchmark result can be emitted. Stream objects that lack a ``reconfigure``
# method raise AttributeError, in which case the existing encoding is kept.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

# Base URL used for the health check and the chat-completions request; its
# port must match the "--port" value passed to the server on launch.
BASE_URL = "http://127.0.0.1:8000"
# Server executable and model file, written as relative Windows-style paths
# (the server process is started with cwd=os.getcwd()).
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
# Value passed to the server's "-c" option.
CONTEXT = 262144

def kill_server():
    """Force-kill every ``llama-server.exe`` process, then pause briefly.

    Runs the Windows ``taskkill`` utility against the image name, so it
    terminates *all* matching processes, not only one started by this script.
    The call is best-effort: ``taskkill``'s exit status is not inspected, and
    a failure to launch it at all (e.g. the tool is not available on this
    host) is ignored.

    Always sleeps for 3 seconds before returning, presumably to give the OS
    time to release the port and GPU memory (NOTE(review): confirm intent).
    """
    try:
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            capture_output=True,
        )
    except OSError:
        # taskkill could not be started; nothing to clean up from our side.
        # Catching OSError (instead of a bare except) keeps Ctrl+C working.
        pass
    time.sleep(3)

def run_benchmark(max_tokens=200):
    """Send one chat-completion request and measure generation throughput.

    Posts a fixed counting prompt to ``{BASE_URL}/v1/chat/completions`` with
    ``temperature`` 0.0 and times the whole round trip (request, generation
    and reading the response body).

    Args:
        max_tokens: Value sent as the request's ``max_tokens`` field.

    Returns:
        A ``(tokens_per_second, completion_tokens, elapsed_seconds)`` tuple.
        ``completion_tokens`` is read from the response's ``usage`` object and
        is 0 when that information is missing; ``tokens_per_second`` is 0 when
        no time elapsed.

    Raises:
        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise on a
        connection failure, HTTP error, timeout (300 s) or malformed body.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": "Count from 1 to 50, each on new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    # perf_counter is monotonic and high-resolution, so the measurement is not
    # disturbed by wall-clock adjustments the way time.time() can be.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=300) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    # "usage" may be absent or explicitly null; treat both as "no data".
    usage = result.get("usage") or {}
    ct = usage.get("completion_tokens", 0)
    return ct / elapsed if elapsed > 0 else 0, ct, elapsed

def get_vram():
    """Return GPU memory usage as reported by ``nvidia-smi``.

    Queries ``index,memory.used,memory.total`` in header-less, unit-less CSV
    form and returns the tool's stdout with surrounding whitespace stripped.
    The exit status is not checked, so a failing ``nvidia-smi`` run yields
    whatever it wrote to stdout (possibly an empty string).

    Returns:
        The stripped stdout text, or the string ``"Unknown"`` when
        ``nvidia-smi`` cannot be started or does not finish within 5 seconds.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
    except (OSError, subprocess.SubprocessError):
        # OSError: executable missing / not runnable.
        # SubprocessError: includes TimeoutExpired from the 5 s limit.
        return "Unknown"
    return r.stdout.strip()

# --- Benchmark driver -------------------------------------------------------
# 1. kill any running llama-server, 2. launch a fresh one with the tensor
# split below, 3. poll /health until it reports "ok", 4. run a short warm-up
# request followed by the timed 200-token request, 5. kill the server again.

kill_server()

# Single source of truth for the split so the launch flag and every printed
# label agree. NOTE(review): the original result banner said "0.3/0.7" while
# the server was actually started with 0.45,0.55; the banner now reports the
# value that is really used. If 0.3/0.7 was the intended configuration,
# change this constant instead.
TENSOR_SPLIT = "0.45,0.55"

cmd = [
    LLAMA_SERVER, "--model", MODEL,
    "-ngl", "999", "-c", str(CONTEXT), "-np", "1", "-fa", "on",
    "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
    "-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
    "--prio", "3", "--port", "8000", "--host", "0.0.0.0",
    "-ts", TENSOR_SPLIT
]

print(f"Starting server with tensorSplit {TENSOR_SPLIT}")
proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=os.getcwd())

try:
    # Poll the health endpoint: up to 30 attempts, 3 s apart.
    ready = False
    boot_start = time.perf_counter()
    for _ in range(30):
        if proc.poll() is not None:
            # The server process is already gone; further polling is pointless.
            print(f"Server process exited early (code {proc.returncode}).")
            break
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=2) as resp:
                data = json.loads(resp.read())
                if data.get("status") == "ok":
                    ready = True
                    break
        except Exception:
            # Not reachable / not ready yet. ``Exception`` rather than a bare
            # except so Ctrl+C can still interrupt the wait.
            pass
        time.sleep(3)

    if not ready:
        print("Server failed to boot.")
        # SystemExit propagates through the ``finally`` below, so the server
        # is still killed before the interpreter exits.
        sys.exit(1)

    boot_time = time.perf_counter() - boot_start
    print(f"Booted in {boot_time:.1f}s")
    print(f"VRAM:\n{get_vram()}")

    try:
        print("Warming up...")
        run_benchmark(10)

        print("Benchmarking (200 tokens)...")
        tps, ct, el = run_benchmark(200)
        split_label = TENSOR_SPLIT.replace(",", "/")
        print("=" * 50)
        print(f"★ {split_label} SPLIT RESULT: {tps:.2f} t/s ★")
        print(f"   Tokens: {ct} / Time: {el:.2f}s")
        print("=" * 50)
    except Exception as e:
        print(f"Error benchmark: {e}")
finally:
    # Runs on success, on failure and on Ctrl+C, so no server is left behind.
    kill_server()