Files
variet_llm/scripts/_archive/benchmarks/deep_tier_extreme_test.py
Variet-Worker c111b3a9b0 feat: Variet Engine v1.0 + 5-model tuning complete
Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
2026-04-07 18:08:58 +09:00

172 lines
5.9 KiB
Python

import subprocess
import time
import urllib.request
import json
import sys
# Best-effort switch of stdout to UTF-8: this script prints non-ASCII text
# (Korean model names, emoji status markers) that a console with a narrower
# default encoding may be unable to encode.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except Exception:
    # stdout may have been replaced by an object without reconfigure(), or
    # may refuse to be reconfigured. Keep going with the default encoding.
    # `except Exception` (rather than a bare `except:`) lets
    # KeyboardInterrupt and SystemExit propagate.
    pass

# Base URL that every HTTP request in this script is sent to.
BASE_URL = "http://127.0.0.1:8000"

# Path (relative to the current working directory) of the JSON results file.
RESULTS_FILE = "scripts/deep_tier_extreme_results.json"
def _server_command(model_path, ctx, ubatch, batch, extra=()):
    """Build the llama-server argv shared by every benchmark configuration.

    Only the model file, the ``-c`` / ``-ub`` / ``-b`` values and an optional
    run of extra arguments differ between configurations; everything else is
    identical. All values are passed as strings, exactly as they appear on
    the command line.

    Args:
        model_path: Value for ``--model``.
        ctx: Value for ``-c``.
        ubatch: Value for ``-ub``.
        batch: Value for ``-b``.
        extra: Additional arguments inserted just before ``--port``.

    Returns:
        A new list of command-line arguments.
    """
    argv = [r"llama_bin_run\llama-server.exe", "--model", model_path]
    argv += ["-ngl", "999", "-c", ctx, "-np", "1", "-fa", "on"]
    argv += ["--cache-type-k", "q4_0", "--cache-type-v", "q4_0"]
    argv += ["-ub", ubatch, "-b", batch, "-t", "6", "-tb", "6"]
    argv += ["--prio", "3", "--mlock", "--poll", "50"]
    argv += list(extra)
    argv += ["--port", "8000", "--host", "0.0.0.0"]
    return argv


# Server configurations to benchmark, in the order they are run. Each entry
# has a display "name" and the "cmd" argv used to launch the server.
MODELS = [
    {
        "name": "Qwen 27B - 256K 극한 (q4_0, ub=512)",
        "cmd": _server_command(
            r"models\Qwen3.5-27B-Q4_K_M.gguf",
            ctx="262144",
            ubatch="512",
            batch="1024",
            # NOTE(review): presumably an even tensor split across two GPUs;
            # confirm against the llama-server documentation.
            extra=("-ts", "0.5,0.5"),
        ),
    },
    {
        "name": "Gemma 31B - 128K 확장 (q4_0)",
        "cmd": _server_command(
            r"models\gemma-4-31B-it-Q4_K_M.gguf",
            ctx="131072",
            ubatch="256",
            batch="1024",
        ),
    },
    {
        "name": "Gemma 31B - 192K 극한 (q4_0)",
        "cmd": _server_command(
            r"models\gemma-4-31B-it-Q4_K_M.gguf",
            ctx="196608",
            ubatch="128",
            batch="512",
        ),
    },
]
# (id, prompt) pairs sent to every model. The id becomes the key under which
# that prompt's measurements are stored in the results.
_PROMPT_TABLE = (
    (
        "code",
        "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code.",
    ),
    (
        "logical",
        "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step.",
    ),
)

# Same data in the list-of-dicts shape the rest of the script consumes.
TEST_PROMPTS = [
    {"id": prompt_id, "prompt": prompt_text}
    for prompt_id, prompt_text in _PROMPT_TABLE
]
def check_server(timeout=300, interval=5):
    """Poll ``{BASE_URL}/health`` until the server reports it is ready.

    Args:
        timeout: Maximum number of seconds to keep polling. A value of zero
            or less returns False without issuing any request.
        interval: Seconds to wait after an unsuccessful poll. The last wait
            is shortened so the function never runs past *timeout*.

    Returns:
        True as soon as the endpoint answers with a JSON object whose
        ``status`` is ``"ok"`` or ``"ready"``; False if that does not happen
        before the deadline.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            # The context manager closes the connection after every poll.
            with urllib.request.urlopen(req, timeout=2) as http_resp:
                body = json.loads(http_resp.read())
            if isinstance(body, dict) and body.get("status") in ("ok", "ready"):
                return True
        except Exception:
            # Any failure (connection refused, HTTP error status, timeout,
            # non-JSON body) just means "not ready yet". `except Exception`
            # rather than a bare `except:` so Ctrl-C still aborts the wait.
            pass
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(interval, remaining))
    return False
def get_vram_usage():
    """Return per-GPU memory usage as reported by ``nvidia-smi``.

    Runs ``nvidia-smi`` asking for ``index,memory.used,memory.total`` in
    header-less CSV form and returns its non-empty output lines.

    Returns:
        A list of stripped output lines, or the single-element list
        ``["Failed to get VRAM info"]`` when ``nvidia-smi`` cannot be run,
        exits with an error, times out, or produces undecodable output.
    """
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader"],
            text=True,
            # A hung nvidia-smi must not stall the whole benchmark run.
            timeout=15,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: binary missing or not executable.
        # SubprocessError: non-zero exit (CalledProcessError) or timeout.
        # ValueError: output could not be decoded as text.
        return ["Failed to get VRAM info"]
    return [line.strip() for line in out.splitlines() if line.strip()]
def ask(prompt, max_tokens=300, timeout=120):
    """Send one chat-completion request and measure how long it takes.

    POSTs a single-user-message request (temperature 0.0) to
    ``{BASE_URL}/v1/chat/completions`` and times the full round trip.

    Args:
        prompt: Text of the user message.
        max_tokens: Value sent as ``max_tokens`` in the request.
        timeout: Socket timeout, in seconds, for the HTTP request.

    Returns:
        A dict with:
            ``time``   - round-trip time in seconds, rounded to 2 decimals;
            ``tokens`` - ``usage.completion_tokens`` from the response
                         (0 when absent);
            ``tps``    - ``tokens / time`` rounded to 2 decimals. The timing
                         covers the whole request, so prompt processing and
                         HTTP overhead are included;
            ``res``    - the first 150 characters of the reply, followed by
                         ``"..."`` only when the reply was actually cut.

    Raises:
        urllib.error.URLError, OSError: the request failed or timed out.
        ValueError: the response body is not valid JSON.
        KeyError, IndexError: the response lacks ``choices[0].message``.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode()
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    t0 = time.perf_counter()
    # The context manager closes the connection even if read() or parsing fails.
    with urllib.request.urlopen(req, timeout=timeout) as http_resp:
        resp = json.loads(http_resp.read())
    dt = time.perf_counter() - t0

    usage = resp.get("usage") or {}
    # "content" may be missing or null; treat both as an empty reply rather
    # than crashing on the slice below.
    content = resp["choices"][0]["message"].get("content") or ""
    tokens = usage.get("completion_tokens") or 0
    tps = round(tokens / dt, 2) if dt > 0 else 0

    preview = content[:150]
    if len(content) > 150:
        preview += "..."
    return {"time": round(dt, 2), "tokens": tokens, "tps": tps, "res": preview}
def _kill_stray_servers():
    """Force-kill every ``llama-server.exe`` process via ``taskkill`` (best effort)."""
    try:
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except OSError:
        # taskkill itself could not be started (e.g. not on Windows); there
        # is nothing more this sweep can do.
        pass


def _stop_server(proc, settle=5):
    """Terminate *proc*, reap it, sweep leftover servers, then pause *settle* seconds."""
    proc.terminate()
    try:
        proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()
    _kill_stray_servers()
    # Pause before the next launch, as the original flow did.
    time.sleep(settle)


def _save_results(results):
    """Write *results* to RESULTS_FILE as indented UTF-8 JSON, creating its directory."""
    import os  # local import: `os` is not among the file-level imports

    parent = os.path.dirname(RESULTS_FILE)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(RESULTS_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


def _run_prompts():
    """Send every TEST_PROMPTS entry to the running server; return results keyed by prompt id."""
    test_data = {}
    for p in TEST_PROMPTS:
        print(f" Testing {p['id']}...", end="", flush=True)
        try:
            res = ask(p["prompt"])
        except Exception as e:
            # One failing prompt must not abort the remaining measurements.
            test_data[p["id"]] = {"error": str(e)}
            print(f" ERROR: {e}")
        else:
            test_data[p["id"]] = res
            print(f" {res['tps']} t/s")
    return test_data


def _benchmark_model(cfg):
    """Wait for the already-launched server for *cfg* and return its result record."""
    print("Waiting for server to boot (up to 5 mins)...")
    if not check_server(300):
        print(f"❌ Failed to boot {cfg['name']}.")
        return {"name": cfg["name"], "status": "Failed to boot (OOM or Crash)"}

    print("✅ Server Ready!")
    vram = get_vram_usage()
    print(f"VRAM: {vram}")

    # Warm-up request so the first measured prompt is not the server's very
    # first request. A failure here is reported but is not fatal.
    try:
        ask("Hello", max_tokens=10)
    except Exception as e:
        print(f" Warmup failed: {e}")

    return {
        "name": cfg["name"],
        "status": "Success",
        "vram": vram,
        "tests": _run_prompts(),
    }


def main():
    """Benchmark every configuration in MODELS and record the results.

    For each configuration: launch the server, wait for it to become ready,
    capture VRAM usage, run a warm-up request and every test prompt, then
    shut the server down. RESULTS_FILE is rewritten after every
    configuration, including ones that failed to launch or boot, so partial
    results survive an interrupted run.
    """
    results = []

    # Clean init: make sure no earlier server is still holding the port/VRAM.
    _kill_stray_servers()
    time.sleep(3)

    for cfg in MODELS:
        print(f"\n[{time.strftime('%H:%M:%S')}] Starting {cfg['name']}...")
        try:
            proc = subprocess.Popen(
                cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
            )
        except OSError as e:
            # The server binary could not be started at all; record it and
            # move on to the next configuration.
            print(f"❌ Failed to launch {cfg['name']}: {e}")
            results.append({"name": cfg["name"], "status": f"Failed to launch: {e}"})
            _save_results(results)
            continue

        try:
            results.append(_benchmark_model(cfg))
            _save_results(results)
        finally:
            # Always stop the server, even on Ctrl-C or an unexpected error,
            # so it does not stay alive holding GPU memory.
            print("Shutting down server...")
            _stop_server(proc)

    print("\n✅ Extreme testing complete! Check results in", RESULTS_FILE)


if __name__ == "__main__":
    main()