Files
variet_llm/scripts/_archive/benchmarks/test_split_03_07.py
Variet-Worker c111b3a9b0 feat: Variet Engine v1.0 + 5-model tuning complete
Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
2026-04-07 18:08:58 +09:00

109 lines
2.8 KiB
Python

import subprocess
import time
import json
import urllib.request
import sys
import os
# Switch stdout to UTF-8 before anything is printed. The result banner printed
# at the end of this script contains non-ASCII characters; presumably this is
# here so they print correctly on consoles with a non-UTF-8 default encoding.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    # stdout has no reconfigure() method; keep whatever encoding it has.
    pass

# Base URL used for both the /health readiness probe and the
# /v1/chat/completions benchmark request. The port must match the "--port"
# value the server is launched with further down.
BASE_URL = "http://127.0.0.1:8000"

# Server executable and model file. Both are relative paths, and the server is
# started with cwd=os.getcwd(), so they resolve against the directory the
# script is run from.
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"

# Value passed to the server's "-c" option.
CONTEXT = 262144
def kill_server(settle_seconds=3):
    """Force-kill every running ``llama-server.exe`` and wait briefly.

    Runs ``taskkill /F /IM llama-server.exe`` and ignores its exit status, so
    calling this when no server is running is harmless. The kill is best
    effort: if ``taskkill`` cannot be launched at all (for example on a
    non-Windows host) the error is swallowed and the function still returns.

    Args:
        settle_seconds: How long to sleep after the kill attempt before
            returning. Defaults to 3, matching the original fixed delay.
            Values <= 0 skip the sleep.
    """
    try:
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            capture_output=True,
        )
    except OSError:
        # taskkill missing or not executable. Only OSError is swallowed so
        # that Ctrl-C (KeyboardInterrupt) and SystemExit still propagate.
        pass

    if settle_seconds > 0:
        time.sleep(settle_seconds)
def run_benchmark(max_tokens=200):
    """Send one chat-completion request and measure generation throughput.

    Posts a fixed counting prompt to ``{BASE_URL}/v1/chat/completions`` with
    temperature 0.0 and times the whole round trip, from just before the
    request is sent until the response body has been fully read. The reported
    rate therefore includes everything the server does for the request plus
    HTTP overhead, not only token generation.

    Args:
        max_tokens: Value sent as ``max_tokens`` in the request payload.

    Returns:
        A tuple ``(tokens_per_second, completion_tokens, elapsed_seconds)``.
        ``completion_tokens`` comes from the response's ``usage`` object and
        is 0 when that field is missing or null. ``tokens_per_second`` is 0
        when the elapsed time is not positive.

    Raises:
        urllib.error.URLError: If the request fails or times out (300 s).
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": "Count from 1 to 50, each on new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode("utf-8")
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    # perf_counter is monotonic; time.time() can jump when the system clock
    # is adjusted, which would corrupt (or even negate) the measurement.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=300) as resp:
        raw = resp.read()
    elapsed = time.perf_counter() - start

    result = json.loads(raw)
    # "usage" or "completion_tokens" may be absent or null in the response.
    usage = result.get("usage") or {}
    ct = usage.get("completion_tokens") or 0
    return ct / elapsed if elapsed > 0 else 0, ct, elapsed
def get_vram():
    """Return per-GPU memory usage as reported by ``nvidia-smi``.

    Queries ``index,memory.used,memory.total`` in headerless, unit-less CSV
    form and returns the stripped output (one line per GPU).

    Returns:
        The CSV text from ``nvidia-smi``, or the string ``"Unknown"`` when the
        tool is unavailable, does not finish within 5 seconds, exits with a
        non-zero status, or prints nothing.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
    except (OSError, subprocess.SubprocessError):
        # OSError: nvidia-smi not installed / not executable.
        # SubprocessError: covers TimeoutExpired from the 5 s timeout.
        return "Unknown"

    output = r.stdout.strip()
    # A failed query yields no usable data; report it the same way as a
    # missing tool instead of returning an empty string.
    if r.returncode != 0 or not output:
        return "Unknown"
    return output
# ---------------------------------------------------------------------------
# Script body: restart llama-server with a fixed tensor split, wait for it to
# report healthy, run one warm-up and one measured request, then shut it down.
# ---------------------------------------------------------------------------

# Single source of truth for the GPU tensor split, used for the "-ts" argument
# and for every message that reports it.
# NOTE(review): the file name and the original result banner say 0.3/0.7, but
# the server has always been launched with 0.45,0.55. The launch value is kept
# and the banner now reports what is actually run; confirm the intended split.
TENSOR_SPLIT = "0.45,0.55"

# Make sure no stale server instance is holding the port or VRAM.
kill_server()

cmd = [
    LLAMA_SERVER, "--model", MODEL,
    "-ngl", "999", "-c", str(CONTEXT), "-np", "1", "-fa", "on",
    "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
    "-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
    "--prio", "3", "--port", "8000", "--host", "0.0.0.0",
    "-ts", TENSOR_SPLIT
]
print(f"Starting server with tensorSplit {TENSOR_SPLIT}")
proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=os.getcwd())

try:
    ready = False
    boot_start = time.perf_counter()
    # Poll /health up to 30 times, 3 s apart, until the server reports "ok".
    for _ in range(30):
        if proc.poll() is not None:
            # The server process already exited; further polling cannot succeed.
            print(f"Server process exited early with code {proc.returncode}.")
            break
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=2) as resp:
                data = json.loads(resp.read())
                if data.get("status") == "ok":
                    ready = True
                    break
        except Exception:
            # Connection refused, an HTTP error status or a malformed body
            # all mean "not ready yet". Catching Exception (rather than a bare
            # except) keeps Ctrl-C working during the boot wait.
            pass
        time.sleep(3)

    if not ready:
        print("Server failed to boot.")
        # The finally clause below still kills the server before exiting.
        sys.exit(1)

    boot_time = time.perf_counter() - boot_start
    print(f"Booted in {boot_time:.1f}s")
    print(f"VRAM:\n{get_vram()}")

    try:
        print("Warming up...")
        run_benchmark(10)
        print("Benchmarking (200 tokens)...")
        tps, ct, el = run_benchmark(200)
        print("=" * 50)
        print(f"★ {TENSOR_SPLIT.replace(',', '/')} SPLIT RESULT: {tps:.2f} t/s ★")
        print(f" Tokens: {ct} / Time: {el:.2f}s")
        print("=" * 50)
    except Exception as e:
        print(f"Error benchmark: {e}")
finally:
    # Always stop the server, including on Ctrl-C or sys.exit above.
    kill_server()