Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
172 lines
5.9 KiB
Python
172 lines
5.9 KiB
Python
import http.client
import json
import os
import subprocess
import sys
import time
import urllib.request
|
|
|
|
# Switch console output to UTF-8 so the emoji and Korean model names printed
# by this script can be encoded regardless of the console's default encoding.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError, OSError):
    # Best effort only: stdout may be missing or replaced by an object that
    # cannot be reconfigured. Keep whatever encoding it already has.
    pass
|
|
|
|
# Address the benchmark client talks to; the port matches the "--port" value
# passed to every server command line in MODELS.
BASE_URL = "http://127.0.0.1:8000"
# JSON report written by main(); relative to the current working directory.
RESULTS_FILE = "scripts/deep_tier_extreme_results.json"
|
|
|
|
MODELS = [
|
|
{
|
|
"name": "Qwen 27B - 256K 극한 (q4_0, ub=512)",
|
|
"cmd": [
|
|
r"llama_bin_run\llama-server.exe",
|
|
"--model", r"models\Qwen3.5-27B-Q4_K_M.gguf",
|
|
"-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
|
|
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
|
|
"-ub", "512", "-b", "1024", "-t", "6", "-tb", "6",
|
|
"--prio", "3", "--mlock", "--poll", "50", "-ts", "0.5,0.5",
|
|
"--port", "8000", "--host", "0.0.0.0"
|
|
]
|
|
},
|
|
{
|
|
"name": "Gemma 31B - 128K 확장 (q4_0)",
|
|
"cmd": [
|
|
r"llama_bin_run\llama-server.exe",
|
|
"--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
|
|
"-ngl", "999", "-c", "131072", "-np", "1", "-fa", "on",
|
|
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
|
|
"-ub", "256", "-b", "1024", "-t", "6", "-tb", "6",
|
|
"--prio", "3", "--mlock", "--poll", "50",
|
|
"--port", "8000", "--host", "0.0.0.0"
|
|
]
|
|
},
|
|
{
|
|
"name": "Gemma 31B - 192K 극한 (q4_0)",
|
|
"cmd": [
|
|
r"llama_bin_run\llama-server.exe",
|
|
"--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
|
|
"-ngl", "999", "-c", "196608", "-np", "1", "-fa", "on",
|
|
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
|
|
"-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
|
|
"--prio", "3", "--mlock", "--poll", "50",
|
|
"--port", "8000", "--host", "0.0.0.0"
|
|
]
|
|
}
|
|
]
|
|
|
|
# Prompts sent to every model.  "id" becomes the key under "tests" in the
# results file; "prompt" is sent verbatim as the user message.
TEST_PROMPTS = [
    {
        "id": "code",
        "prompt": "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code.",
    },
    {
        "id": "logical",
        "prompt": "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step.",
    },
]
|
|
|
|
def check_server(timeout=300, poll_interval=5):
    """Poll the server's ``/health`` endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to wait between two attempts.

    Returns:
        True as soon as the endpoint answers with a JSON object whose
        ``"status"`` is ``"ok"`` or ``"ready"``; False if that does not
        happen within ``timeout`` seconds (immediately for ``timeout <= 0``).
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        body = None
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            # The context manager closes the connection on every attempt.
            with urllib.request.urlopen(req, timeout=2) as http_resp:
                body = json.loads(http_resp.read())
        except (OSError, ValueError, http.client.HTTPException):
            # Connection refused / HTTP error status / timeout (OSError),
            # malformed JSON (ValueError) or a broken HTTP exchange: the
            # server is simply not ready yet, so try again.
            pass

        if isinstance(body, dict) and body.get("status") in ("ok", "ready"):
            return True

        # Never sleep past the deadline.
        remaining = deadline - time.monotonic()
        if remaining > 0:
            time.sleep(min(poll_interval, remaining))
    return False
|
|
|
|
def get_vram_usage(timeout=10):
    """Return per-GPU memory usage as reported by ``nvidia-smi``.

    Args:
        timeout: Seconds to wait for ``nvidia-smi`` before giving up.

    Returns:
        A list with one CSV line per GPU (``index, memory.used,
        memory.total``), or ``["Failed to get VRAM info"]`` when the tool is
        missing, exits with an error, or does not answer in time.
    """
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader"],
            text=True,
            timeout=timeout,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: executable not found / not runnable.
        # SubprocessError: non-zero exit status or timeout.
        # ValueError: output could not be decoded as text.
        return ["Failed to get VRAM info"]
    return [line.strip() for line in out.strip().splitlines()]
|
|
|
|
def ask(prompt, max_tokens=300, timeout=120):
    """Send one chat-completion request and measure its throughput.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Value of the request's ``max_tokens`` field.
        timeout: Socket timeout in seconds for the HTTP request.

    Returns:
        A dict with ``"time"`` (elapsed seconds for the whole request,
        rounded to 2 places), ``"tokens"`` (``usage.completion_tokens`` from
        the response, 0 if absent), ``"tps"`` (tokens divided by elapsed
        time) and ``"res"`` (first 150 characters of the reply followed by
        ``"..."``).

    Raises:
        OSError: The request failed or timed out (includes
            ``urllib.error.URLError``).
        ValueError: The response body is not valid JSON.
        KeyError, IndexError: The response lacks
            ``choices[0].message.content``.
    """
    payload = json.dumps({
        # NOTE(review): placeholder model name; presumably the server ignores
        # it because it serves a single model - confirm.
        "model": "m",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode()
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )

    # perf_counter is the appropriate clock for measuring an interval.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as http_resp:
        resp = json.loads(http_resp.read())
    dt = time.perf_counter() - t0

    usage = resp.get("usage") or {}
    # A null "content" would otherwise raise TypeError on slicing and throw
    # away an otherwise valid measurement.
    content = resp["choices"][0]["message"]["content"] or ""

    tokens = usage.get("completion_tokens") or 0
    tps = round(tokens / dt, 2) if dt > 0 else 0
    # The "..." suffix is always appended, truncated or not, to keep the
    # results-file format stable.
    return {"time": round(dt, 2), "tokens": tokens, "tps": tps, "res": content[:150] + "..."}
|
|
|
|
def _kill_llama_servers():
    """Force-kill every running llama-server.exe via ``taskkill`` (best effort)."""
    try:
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # taskkill itself is not available on this host; nothing to sweep.
        pass


def _stop_server(proc):
    """Terminate *proc*, reap it, and sweep any leftover server processes."""
    print("Shutting down server...")
    proc.terminate()
    try:
        # Reap the child so its handle is released before the next launch.
        proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        pass  # the taskkill sweep below force-kills it
    _kill_llama_servers()
    # NOTE(review): presumably gives the GPU driver time to release memory
    # before the next model is loaded - confirm.
    time.sleep(5)


def _save_results(results):
    """Write *results* to RESULTS_FILE as UTF-8 JSON, creating its directory."""
    out_dir = os.path.dirname(RESULTS_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(RESULTS_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


def _benchmark_model(cfg):
    """Wait for the launched server of *cfg* and return its result entry."""
    print("Waiting for server to boot (up to 5 mins)...")
    if not check_server(300):
        print(f"❌ Failed to boot {cfg['name']}.")
        return {"name": cfg["name"], "status": "Failed to boot (OOM or Crash)"}

    print("✅ Server Ready!")
    vram = get_vram_usage()
    print(f"VRAM: {vram}")

    # Warmup: one short request before timing; a failure here must not
    # abort the measurements that follow.
    try:
        ask("Hello", max_tokens=10)
    except Exception as e:
        print(f" Warmup failed (continuing): {e}")

    test_data = {}
    for p in TEST_PROMPTS:
        print(f" Testing {p['id']}...", end="", flush=True)
        try:
            res = ask(p["prompt"])
        except Exception as e:
            # Record the failure per prompt and keep benchmarking.
            test_data[p["id"]] = {"error": str(e)}
            print(f" ERROR: {e}")
        else:
            test_data[p["id"]] = res
            print(f" {res['tps']} t/s")

    return {
        "name": cfg["name"],
        "status": "Success",
        "vram": vram,
        "tests": test_data,
    }


def main():
    """Benchmark every configuration in MODELS and save the results.

    For each entry: launch the server, wait until it is healthy, record VRAM
    usage, run every prompt in TEST_PROMPTS, then shut the server down.  The
    results file is rewritten after every model - including models that fail
    to start - so a failure on the last model or an interrupted run still
    leaves a complete report of everything measured so far.
    """
    results = []

    # Clean init: make sure no stale server is holding the port or the GPU.
    _kill_llama_servers()
    time.sleep(3)

    for cfg in MODELS:
        print(f"\n[{time.strftime('%H:%M:%S')}] Starting {cfg['name']}...")

        try:
            proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except OSError as e:
            # Executable missing or not runnable: record it and move on
            # instead of aborting the whole run.
            print(f"❌ Failed to launch {cfg['name']}: {e}")
            results.append({"name": cfg["name"], "status": f"Failed to launch ({e})"})
            _save_results(results)
            continue

        try:
            results.append(_benchmark_model(cfg))
            _save_results(results)
        finally:
            # Always stop the server, even on Ctrl+C or an unexpected error.
            _stop_server(proc)

    print("\n✅ Extreme testing complete! Check results in", RESULTS_FILE)
|
|
|
|
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|