feat: Variet Engine v1.0 + 5-model tuning complete
Phase 01 (LLM Tuning): - Gemma4 26B: 74.65 t/s (fast) - Qwen 35B: 61.62 t/s (balanced) - Gemma4 31B: 16.0 t/s (deep-coder) - Qwen 27B: 16.7 t/s (deep-logic) - Qwen 122B: 8.95 t/s (ultra, GPU 1 only) Phase 02 (API Engine): - FastAPI reverse proxy on port 8000 - /engine/switch hot-swap with 503 protection - config/engine_models.json as single source of truth - Replaced 4 individual .bat files with unified engine File cleanup: - scripts/ 85 files -> 9 + _archive/ - Root .bat files -> _archive/
This commit is contained in:
177
scripts/_archive/benchmarks/deep_tier_auto_test.py
Normal file
177
scripts/_archive/benchmarks/deep_tier_auto_test.py
Normal file
@@ -0,0 +1,177 @@
|
||||
import subprocess
|
||||
import time
|
||||
import urllib.request
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Best-effort: switch stdout to UTF-8 so the emoji status lines printed by
# main() can be written (presumably needed on consoles whose default encoding
# cannot represent them -- confirm on the target machine).
try:
    sys.stdout.reconfigure(encoding="utf-8")
except (AttributeError, ValueError, OSError):
    # stdout may be a replacement object without reconfigure(), or a stream
    # that refuses reconfiguration; keep whatever encoding it already has.
    # Unlike a bare ``except:``, this does not swallow KeyboardInterrupt.
    pass

# Base address used for the /health polls and /v1/chat/completions requests.
BASE_URL = "http://127.0.0.1:8000"

# Where the benchmark results are written (relative to the working directory).
RESULTS_FILE = "scripts/deep_tier_auto_results.json"
|
||||
|
||||
def _llama_server_cmd(model_path, ctx_size, ubatch, batch, extra=()):
    """Build the llama-server argument list for one benchmark configuration.

    Only the model path, the ``-c`` / ``-ub`` / ``-b`` values and any extra
    flags (inserted right after ``--poll 50``) differ between configurations;
    every other argument is shared.
    """
    return [
        r"llama_bin_run\llama-server.exe",
        "--model", model_path,
        "-ngl", "999", "-c", ctx_size, "-np", "1", "-fa", "on",
        "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
        "-ub", ubatch, "-b", batch, "-t", "6", "-tb", "6",
        "--prio", "3", "--mlock", "--poll", "50", *extra,
        "--port", "8000", "--host", "0.0.0.0",
    ]


# Server configurations to benchmark, in run order. Each entry carries a
# display name and the exact command line used to launch the server.
MODELS = [
    {
        "name": "Qwen 27B - 256K (q4_0)",
        "cmd": _llama_server_cmd(
            r"models\Qwen3.5-27B-Q4_K_M.gguf", "262144", "128", "512",
            extra=("-ts", "0.5,0.5"),
        ),
    },
    {
        "name": "Gemma 31B - 32K (q4_0)",
        "cmd": _llama_server_cmd(
            r"models\gemma-4-31B-it-Q4_K_M.gguf", "32768", "256", "1024",
        ),
    },
    {
        "name": "Gemma 31B - 64K (q4_0)",
        "cmd": _llama_server_cmd(
            r"models\gemma-4-31B-it-Q4_K_M.gguf", "65536", "256", "1024",
        ),
    },
]
|
||||
|
||||
# (id, prompt) pairs sent to every model: one coding task and one
# step-by-step arithmetic word problem.
_PROMPT_TABLE = (
    (
        "code",
        "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code.",
    ),
    (
        "logical",
        "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step.",
    ),
)

# Same shape the benchmark loop consumes: a list of {"id", "prompt"} dicts.
TEST_PROMPTS = [
    {"id": prompt_id, "prompt": prompt_text}
    for prompt_id, prompt_text in _PROMPT_TABLE
]
|
||||
|
||||
def check_server(timeout=300, interval=5):
    """Poll ``{BASE_URL}/health`` until the server reports it is ready.

    Args:
        timeout: Maximum number of seconds to keep polling.
        interval: Seconds to wait between two polls.

    Returns:
        True as soon as the JSON health payload has ``"status"`` equal to
        ``"ok"`` or ``"ready"``; False if that does not happen within
        ``timeout`` seconds.
    """
    import http.client  # local import: only needed for the exception type

    # monotonic() is immune to wall-clock adjustments during a long wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            # The context manager closes the connection after every poll.
            with urllib.request.urlopen(req, timeout=2) as resp:
                payload = json.loads(resp.read())
            if isinstance(payload, dict) and payload.get("status") in ("ok", "ready"):
                return True
        except (OSError, ValueError, http.client.HTTPException):
            # Not ready yet: connection/HTTP error (URLError and timeouts are
            # OSError subclasses) or an unparsable body (JSONDecodeError is a
            # ValueError). KeyboardInterrupt is deliberately NOT caught so the
            # wait can be aborted with Ctrl+C.
            pass

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(interval, remaining))
    return False
|
||||
|
||||
def get_vram_usage():
    """Return per-GPU memory usage as reported by ``nvidia-smi``.

    Returns:
        A list of the CSV output lines (fields requested: index, memory.used,
        memory.total; no header), or ``["Failed to get VRAM info"]`` when
        nvidia-smi cannot be run, exits with an error, times out, or produces
        undecodable output.
    """
    query = [
        "nvidia-smi",
        "--query-gpu=index,memory.used,memory.total",
        "--format=csv,noheader",
    ]
    try:
        # The timeout keeps a hung nvidia-smi from stalling the whole run.
        out = subprocess.check_output(query, text=True, timeout=15)
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: executable missing / not runnable.
        # SubprocessError: non-zero exit (CalledProcessError) or timeout.
        # ValueError: output could not be decoded as text.
        return ["Failed to get VRAM info"]
    return out.strip().split("\n")
|
||||
|
||||
def ask(prompt, max_tokens=300):
    """Send one chat-completion request and measure generation throughput.

    Posts a single user message to ``{BASE_URL}/v1/chat/completions`` with
    temperature 0.0 and a 120-second timeout.

    Args:
        prompt: Text of the user message.
        max_tokens: Value sent as the request's ``max_tokens``.

    Returns:
        A dict with:
            "time":   elapsed seconds for the request, rounded to 2 places;
            "tokens": ``usage.completion_tokens`` from the response (0 if absent);
            "tps":    tokens / elapsed seconds, rounded to 2 places;
            "res":    first 150 characters of the reply followed by "...".

    Raises:
        urllib.error.URLError / OSError: on connection or HTTP failure.
        KeyError, IndexError: if the response lacks the expected
            ``choices[0].message.content`` structure.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode()
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )

    # perf_counter() is the high-resolution clock intended for intervals.
    t0 = time.perf_counter()
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(req, timeout=120) as http_resp:
        resp = json.loads(http_resp.read())
    dt = time.perf_counter() - t0

    # Tolerate an explicit JSON null for "usage" or the token count.
    usage = resp.get("usage") or {}
    tokens = usage.get("completion_tokens") or 0

    content = resp["choices"][0]["message"]["content"]
    if content is None:
        # A null content field would make the slice below raise TypeError.
        content = ""

    tps = round(tokens / dt, 2) if dt > 0 else 0
    return {"time": round(dt, 2), "tokens": tokens, "tps": tps, "res": content[:150] + "..."}
|
||||
|
||||
def _kill_llama_servers():
    """Force-kill every running llama-server.exe via taskkill (best effort)."""
    try:
        # Argument list instead of a shell string: no shell is involved.
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except OSError:
        # taskkill is unavailable; there is nothing more to do here.
        pass


def _stop_server(proc):
    """Terminate the server started for one configuration and let it settle."""
    try:
        proc.terminate()
    except OSError:
        pass
    _kill_llama_servers()
    try:
        # Reap the child so no process handle is left behind.
        proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        pass
    # Same settle delay the script has always used before the next launch.
    time.sleep(5)


def _save_results(results):
    """Write all results collected so far to RESULTS_FILE as JSON."""
    out_dir = os.path.dirname(RESULTS_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(RESULTS_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


def _benchmark_model(cfg):
    """Wait for the running server, run every test prompt, return the result entry."""
    print("Waiting for server to boot (up to 5 mins)...")
    if not check_server(300):
        print(f"❌ Failed to boot {cfg['name']}.")
        return {"name": cfg["name"], "status": "Failed to boot (OOM or Crash)"}

    print("✅ Server Ready!")
    vram = get_vram_usage()

    # Warmup request; its result is discarded, a failure is reported but not fatal.
    try:
        ask("Hello", max_tokens=10)
    except Exception as e:
        print(f"  Warmup request failed: {e}")

    test_data = {}
    for p in TEST_PROMPTS:
        print(f"  Testing {p['id']}...", end="", flush=True)
        try:
            res = ask(p["prompt"])
        except Exception as e:
            # Record the failure for this prompt and keep going with the rest.
            test_data[p["id"]] = {"error": str(e)}
            print(f" ERROR: {e}")
        else:
            test_data[p["id"]] = res
            print(f" {res['tps']} t/s")

    return {
        "name": cfg["name"],
        "status": "Success",
        "vram": vram,
        "tests": test_data,
    }


def main():
    """Benchmark every configuration in MODELS, one server at a time.

    For each entry: launch the server command, wait for it to become healthy,
    record VRAM usage, run the test prompts, then shut the server down. The
    accumulated results are rewritten to RESULTS_FILE after every model --
    including models that failed to boot -- so a partial run still leaves a
    complete record of what was attempted.
    """
    results = []

    # Kill any existing llama-server
    _kill_llama_servers()
    time.sleep(3)

    for cfg in MODELS:
        print(f"\n[{time.strftime('%H:%M:%S')}] Starting {cfg['name']}...")

        # Start server
        proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        try:
            entry = _benchmark_model(cfg)
        finally:
            # Runs on errors and Ctrl+C too, so no server is left running.
            print("Shutting down server...")
            _stop_server(proc)

        results.append(entry)
        # Save incremental
        _save_results(results)

    print("\n✅ All tests complete!")
    print(f"Results saved to {RESULTS_FILE}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user