feat: Variet Engine v1.0 + 5-model tuning complete

Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
This commit is contained in:
Variet-Worker
2026-04-07 18:08:58 +09:00
parent 7c7a899fd5
commit c111b3a9b0
414 changed files with 3402 additions and 68598 deletions

View File

@@ -0,0 +1,38 @@
import urllib.request, json, time, sys
# One-shot throughput check: send a single chat completion to the server at
# BASE and print wall-clock time, completion tokens and tokens per second.
try:
    # Switch stdout to UTF-8 when the stream supports reconfigure().
    sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    # Best effort only; unlike a bare ``except:`` this lets Ctrl+C through.
    pass

BASE = "http://127.0.0.1:8000"
prompt = "Write a Python function to calculate fibonacci numbers efficiently using memoization. Include type hints and docstring."
payload = json.dumps({
    "model": "m",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ],
    "max_tokens": 500,
    "temperature": 0.0
}).encode('utf-8')
req = urllib.request.Request(
    f"{BASE}/v1/chat/completions",
    data=payload,
    headers={"Content-Type": "application/json"}
)

print("Sending request...")
t0 = time.time()
# The context manager closes the HTTP response even if reading/parsing fails.
with urllib.request.urlopen(req, timeout=300) as http_resp:
    resp = json.loads(http_resp.read())
dt = time.time() - t0  # whole request round trip, not just token generation

u = resp.get("usage") or {}  # tolerate a missing or null "usage" object
tokens = u.get("completion_tokens", 0)
speed = tokens / dt if dt > 0 else 0

print("\n=== 122B Benchmark ===")
print(f"Time: {dt:.1f}s")
print(f"Completion Tokens: {tokens}")
print(f"Speed: {speed:.2f} t/s")
print("\n--- Response Preview ---")
# ``content`` may be null in the JSON reply; slicing None would raise.
print((resp["choices"][0]["message"]["content"] or "")[:300])

View File

@@ -0,0 +1,177 @@
import subprocess
import time
import urllib.request
import json
import sys
import os
try:
sys.stdout.reconfigure(encoding='utf-8')
except:
pass
BASE_URL = "http://127.0.0.1:8000"
RESULTS_FILE = "scripts/deep_tier_auto_results.json"
MODELS = [
{
"name": "Qwen 27B - 256K (q4_0)",
"cmd": [
r"llama_bin_run\llama-server.exe",
"--model", r"models\Qwen3.5-27B-Q4_K_M.gguf",
"-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
"-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
"--prio", "3", "--mlock", "--poll", "50", "-ts", "0.5,0.5",
"--port", "8000", "--host", "0.0.0.0"
]
},
{
"name": "Gemma 31B - 32K (q4_0)",
"cmd": [
r"llama_bin_run\llama-server.exe",
"--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
"-ngl", "999", "-c", "32768", "-np", "1", "-fa", "on",
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
"-ub", "256", "-b", "1024", "-t", "6", "-tb", "6",
"--prio", "3", "--mlock", "--poll", "50",
"--port", "8000", "--host", "0.0.0.0"
]
},
{
"name": "Gemma 31B - 64K (q4_0)",
"cmd": [
r"llama_bin_run\llama-server.exe",
"--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
"-ngl", "999", "-c", "65536", "-np", "1", "-fa", "on",
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
"-ub", "256", "-b", "1024", "-t", "6", "-tb", "6",
"--prio", "3", "--mlock", "--poll", "50",
"--port", "8000", "--host", "0.0.0.0"
]
}
]
TEST_PROMPTS = [
{
"id": "code",
"prompt": "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code."
},
{
"id": "logical",
"prompt": "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step."
}
]
def check_server(timeout=300):
    """Poll ``{BASE_URL}/health`` until the server reports it is up.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as the health endpoint answers with a JSON object whose
        ``status`` is ``"ok"`` or ``"ready"``; False if that has not happened
        once *timeout* seconds have elapsed.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=2) as resp:
                body = json.loads(resp.read())
            if body.get("status") in ("ok", "ready"):
                return True
        except Exception:
            # Connection errors, HTTP errors and malformed bodies all mean
            # "not ready yet".  Unlike a bare ``except:`` this does not
            # swallow KeyboardInterrupt / SystemExit.
            pass
        time.sleep(5)
    return False
def get_vram_usage():
    """Return GPU memory usage as reported by ``nvidia-smi``.

    Returns:
        The lines of ``nvidia-smi --query-gpu=index,memory.used,memory.total
        --format=csv,noheader`` as a list of strings, or
        ``["Failed to get VRAM info"]`` when the tool cannot be started,
        exits with an error, produces undecodable output or does not finish
        within the timeout.
    """
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader"],
            text=True,
            timeout=10,  # never let a hung nvidia-smi stall the whole test run
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        return ["Failed to get VRAM info"]
    return out.strip().split("\n")
def ask(prompt, max_tokens=300):
    """Send *prompt* as a single user message and time the completion.

    Args:
        prompt: Text of the user message.
        max_tokens: Value forwarded as ``max_tokens`` in the request.

    Returns:
        A dict with ``time`` (request wall-clock seconds, rounded to 2
        places), ``tokens`` (``usage.completion_tokens`` from the reply, 0 if
        absent), ``tps`` (tokens / time) and ``res`` (first 150 characters of
        the answer followed by ``"..."``).

    Raises:
        urllib.error.URLError: The request failed or timed out (120 s).
        KeyError, IndexError: The reply lacks ``choices[0].message.content``.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode()
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    t0 = time.time()
    # Close the HTTP response deterministically, even when parsing fails.
    with urllib.request.urlopen(req, timeout=120) as http_resp:
        resp = json.loads(http_resp.read())
    dt = time.time() - t0  # covers the whole round trip, not only decoding
    usage = resp.get("usage") or {}
    # A null ``content`` would otherwise raise on slicing below.
    content = resp["choices"][0]["message"]["content"] or ""
    tokens = usage.get("completion_tokens", 0)
    tps = round(tokens / dt, 2) if dt > 0 else 0
    return {"time": round(dt, 2), "tokens": tokens, "tps": tps, "res": content[:150] + "..."}
def _stop_llama_server(proc):
    """Terminate *proc*, force-kill every llama-server.exe, then wait 5 s."""
    proc.terminate()
    subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    # NOTE(review): the pause presumably lets the port/VRAM be released
    # before the next model starts - confirm.
    time.sleep(5)


def _save_results(results):
    """Overwrite RESULTS_FILE with everything collected so far."""
    with open(RESULTS_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


def main():
    """Boot each configuration in MODELS, run TEST_PROMPTS, record results.

    For every entry the server is started, polled via ``check_server``,
    warmed up with one short request and then asked each test prompt.
    Results are rewritten to RESULTS_FILE after every model - including
    models that failed to boot - and the server is always stopped before
    moving on, even if something raises in between.
    """
    results = []
    # Kill any existing llama-server
    subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    time.sleep(3)

    for cfg in MODELS:
        print(f"\n[{time.strftime('%H:%M:%S')}] Starting {cfg['name']}...")
        # Start server
        proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        try:
            # Wait for boot
            print("Waiting for server to boot (up to 5 mins)...")
            if not check_server(300):
                print(f"❌ Failed to boot {cfg['name']}.")
                results.append({"name": cfg["name"], "status": "Failed to boot (OOM or Crash)"})
                # Persist the failure too; otherwise it is lost unless a
                # later model succeeds.
                _save_results(results)
                continue
            print("✅ Server Ready!")
            vram = get_vram_usage()

            # Warmup: the result is discarded and a failure here is not fatal.
            try:
                ask("Hello", max_tokens=10)
            except Exception:
                pass

            test_data = {}
            for p in TEST_PROMPTS:
                print(f" Testing {p['id']}...", end="", flush=True)
                try:
                    res = ask(p["prompt"])
                    test_data[p["id"]] = res
                    print(f" {res['tps']} t/s")
                except Exception as e:
                    test_data[p["id"]] = {"error": str(e)}
                    print(f" ERROR: {e}")

            results.append({
                "name": cfg["name"],
                "status": "Success",
                "vram": vram,
                "tests": test_data
            })
            # Save incremental
            _save_results(results)
            # Shutdown
            print("Shutting down server...")
        finally:
            # Runs on success, on boot failure (``continue``) and on any
            # exception, so a server process is never left behind.
            _stop_llama_server(proc)

    print("\n✅ All tests complete!")
    print(f"Results saved to {RESULTS_FILE}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,171 @@
import subprocess
import time
import urllib.request
import json
import sys
try:
sys.stdout.reconfigure(encoding='utf-8')
except:
pass
BASE_URL = "http://127.0.0.1:8000"
RESULTS_FILE = "scripts/deep_tier_extreme_results.json"
MODELS = [
{
"name": "Qwen 27B - 256K 극한 (q4_0, ub=512)",
"cmd": [
r"llama_bin_run\llama-server.exe",
"--model", r"models\Qwen3.5-27B-Q4_K_M.gguf",
"-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
"-ub", "512", "-b", "1024", "-t", "6", "-tb", "6",
"--prio", "3", "--mlock", "--poll", "50", "-ts", "0.5,0.5",
"--port", "8000", "--host", "0.0.0.0"
]
},
{
"name": "Gemma 31B - 128K 확장 (q4_0)",
"cmd": [
r"llama_bin_run\llama-server.exe",
"--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
"-ngl", "999", "-c", "131072", "-np", "1", "-fa", "on",
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
"-ub", "256", "-b", "1024", "-t", "6", "-tb", "6",
"--prio", "3", "--mlock", "--poll", "50",
"--port", "8000", "--host", "0.0.0.0"
]
},
{
"name": "Gemma 31B - 192K 극한 (q4_0)",
"cmd": [
r"llama_bin_run\llama-server.exe",
"--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
"-ngl", "999", "-c", "196608", "-np", "1", "-fa", "on",
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
"-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
"--prio", "3", "--mlock", "--poll", "50",
"--port", "8000", "--host", "0.0.0.0"
]
}
]
TEST_PROMPTS = [
{
"id": "code",
"prompt": "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code."
},
{
"id": "logical",
"prompt": "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step."
}
]
def check_server(timeout=300):
    """Wait for the server behind BASE_URL to report a healthy status.

    Args:
        timeout: Upper bound, in seconds, on how long to keep polling.

    Returns:
        True once ``GET {BASE_URL}/health`` yields JSON with ``status`` equal
        to ``"ok"`` or ``"ready"``; False when the timeout expires first.
    """
    # Monotonic time cannot jump when the system clock is adjusted.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=2) as resp:
                body = json.loads(resp.read())
            if body.get("status") in ("ok", "ready"):
                return True
        except Exception:
            # Any request/parse failure means "not ready yet"; Ctrl+C and
            # SystemExit are no longer swallowed as with a bare ``except:``.
            pass
        time.sleep(5)
    return False
def get_vram_usage():
    """Query ``nvidia-smi`` for GPU index, used memory and total memory.

    Returns:
        A list with one string per output line of the CSV query, or
        ``["Failed to get VRAM info"]`` if nvidia-smi is unavailable, fails,
        emits undecodable output or exceeds the timeout.
    """
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader"],
            text=True,
            timeout=10,  # bound the call so a stuck tool cannot hang the run
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        return ["Failed to get VRAM info"]
    return out.strip().split("\n")
def ask(prompt, max_tokens=300):
    """Run one chat completion for *prompt* and measure its throughput.

    Args:
        prompt: Text of the single user message.
        max_tokens: Value sent as ``max_tokens`` in the request body.

    Returns:
        ``{"time", "tokens", "tps", "res"}`` where ``time`` is the rounded
        wall-clock duration of the request in seconds, ``tokens`` is
        ``usage.completion_tokens`` (0 if missing), ``tps`` is their ratio
        and ``res`` is the first 150 characters of the answer plus ``"..."``.

    Raises:
        urllib.error.URLError: The request failed or timed out (120 s).
        KeyError, IndexError: The reply lacks ``choices[0].message.content``.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode()
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    t0 = time.time()
    # ``with`` guarantees the HTTP response is closed.
    with urllib.request.urlopen(req, timeout=120) as http_resp:
        resp = json.loads(http_resp.read())
    dt = time.time() - t0
    usage = resp.get("usage") or {}
    # Guard against a null ``content`` field, which cannot be sliced.
    content = resp["choices"][0]["message"]["content"] or ""
    tokens = usage.get("completion_tokens", 0)
    tps = round(tokens / dt, 2) if dt > 0 else 0
    return {"time": round(dt, 2), "tokens": tokens, "tps": tps, "res": content[:150] + "..."}
def _stop_llama_server(proc):
    """Terminate *proc*, force-kill every llama-server.exe, then wait 5 s."""
    proc.terminate()
    subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    time.sleep(5)


def _save_results(results):
    """Overwrite RESULTS_FILE with everything collected so far."""
    with open(RESULTS_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


def main():
    """Run the test prompts against every configuration in MODELS.

    Each model is booted, health-checked via ``check_server``, warmed up and
    then asked every entry of TEST_PROMPTS.  RESULTS_FILE is rewritten after
    each model (boot failures included) and the server is stopped in a
    ``finally`` so that an unexpected error cannot leave it running.
    """
    results = []
    # Clean init
    subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    time.sleep(3)

    for cfg in MODELS:
        print(f"\n[{time.strftime('%H:%M:%S')}] Starting {cfg['name']}...")
        proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        try:
            print("Waiting for server to boot (up to 5 mins)...")
            if not check_server(300):
                print(f"❌ Failed to boot {cfg['name']}.")
                results.append({"name": cfg["name"], "status": "Failed to boot (OOM or Crash)"})
                # Record the failure on disk right away instead of relying
                # on a later successful model to trigger the save.
                _save_results(results)
                continue
            print("✅ Server Ready!")
            vram = get_vram_usage()
            print(f"VRAM: {vram}")

            # Warmup: result discarded, failure tolerated.
            try:
                ask("Hello", max_tokens=10)
            except Exception:
                pass

            test_data = {}
            for p in TEST_PROMPTS:
                print(f" Testing {p['id']}...", end="", flush=True)
                try:
                    res = ask(p["prompt"])
                    test_data[p["id"]] = res
                    print(f" {res['tps']} t/s")
                except Exception as e:
                    test_data[p["id"]] = {"error": str(e)}
                    print(f" ERROR: {e}")

            results.append({
                "name": cfg["name"],
                "status": "Success",
                "vram": vram,
                "tests": test_data
            })
            _save_results(results)
            print("Shutting down server...")
        finally:
            # Executed for success, boot failure and exceptions alike.
            _stop_llama_server(proc)

    print("\n✅ Extreme testing complete! Check results in", RESULTS_FILE)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,88 @@
"""
Gemma 4 26B-A4B Q4_K_M - 76.4 t/s 재현 테스트
이전 최적값: ngl=999 t=6 ub=512 b=2048 ctk=f16 ctv=f16
"""
import subprocess, time, json, urllib.request, sys, os
try:
    # Switch stdout to UTF-8 when the stream supports reconfigure().
    sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    pass

LLAMA = os.path.join(os.getcwd(), "llama_bin_run", "llama-server.exe")
MODEL = os.path.join(os.getcwd(), "models", "gemma-4-26B-A4B-it-Q4_K_M.gguf")

# Start from a clean slate: remove any llama-server that is still running.
subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
time.sleep(3)

cmd = [
    LLAMA, "--model", MODEL,
    "-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
    "--cache-type-k", "f16", "--cache-type-v", "f16",
    "-ub", "512", "-b", "2048", "-t", "6", "-tb", "6",
    "--prio", "3", "--mlock", "--poll", "50",
    "--port", "8000", "--host", "0.0.0.0",
]
print("[1/4] Starting Gemma4 26B Q4_K_M (76.4 t/s config)...")
server = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

print("[2/4] Waiting for boot...")
healthy = False
for sec in range(180):
    time.sleep(1)
    # Stop immediately if the server process has already exited.
    if server.poll() is not None:
        print(f" !! CRASHED (exit code {server.returncode})")
        sys.exit(1)
    try:
        with urllib.request.urlopen("http://127.0.0.1:8000/health", timeout=1) as r:
            if json.loads(r.read()).get("status") == "ok":
                healthy = True
                break
    except Exception:
        # Not reachable / not ready yet.  ``Exception`` instead of a bare
        # ``except:`` keeps Ctrl+C working while waiting.
        pass
    if sec % 10 == 9:
        print(f" ... {sec+1}s")

if not healthy:
    print(" FAIL: boot timeout")
    server.kill()
    # Also remove any llama-server.exe that kill() on our handle missed.
    subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    sys.exit(1)
print(" OK!")

# Report GPU memory after load; purely informational, so failures are ignored.
try:
    v = subprocess.run(["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5)
    print(f" VRAM: {v.stdout.strip()}")
except Exception:
    pass
def bench(n):
    """Time one counting completion capped at *n* tokens.

    Returns:
        A tuple ``(tokens_per_second, completion_tokens, elapsed_seconds)``
        where ``completion_tokens`` comes from the reply's ``usage`` object
        and ``elapsed_seconds`` is the wall-clock duration of the request.
    """
    body = {
        "messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}],
        "max_tokens": n,
        "temperature": 0,
    }
    request = urllib.request.Request(
        "http://127.0.0.1:8000/v1/chat/completions",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    started = time.time()
    with urllib.request.urlopen(request, timeout=120) as reply:
        parsed = json.loads(reply.read())
    elapsed = time.time() - started
    completion_tokens = parsed["usage"]["completion_tokens"]
    return completion_tokens / elapsed, completion_tokens, elapsed
# Everything that talks to the server runs inside try/finally so that the
# llama-server process is stopped even when a benchmark request raises.
try:
    # Warm-up request; its result is discarded and a failure is not fatal.
    try:
        bench(10)
    except Exception:
        pass

    print("[3/4] Running 5x benchmark (200 tokens)...")
    results = []
    for i in range(5):
        tps, tok, el = bench(200)
        results.append(tps)
        print(f" Run {i+1}: {tps:.2f} t/s ({tok} tok / {el:.2f}s)")

    avg = sum(results) / len(results)
    best = max(results)
    worst = min(results)
    summary = f"""
==================================================
Gemma4 26B Q4_K_M 5-Run Results:
AVG: {avg:.2f} t/s
BEST: {best:.2f} t/s
MIN: {worst:.2f} t/s
Runs: {[f'{r:.2f}' for r in results]}
==================================================
"""
    print(summary)
    with open("scripts/gemma4_test_result.txt", "w") as f:
        f.write(summary)
finally:
    server.kill()
    subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

View File

@@ -0,0 +1,134 @@
import subprocess
import time
import urllib.request
import json
import sys
import traceback
try:
sys.stdout.reconfigure(encoding='utf-8')
except:
pass
BASE_URL = "http://127.0.0.1:8000"
RESULTS_FILE = "scripts/llm_judge_answers.json"
MODELS = [
{
"name": "Qwen 27B",
"cmd": [
r"llama_bin_run\llama-server.exe",
"--model", r"models\Qwen3.5-27B-Q4_K_M.gguf",
"-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
"-ub", "512", "-b", "1024", "-t", "6", "-tb", "6",
"--prio", "3", "--mlock", "--poll", "50", "-ts", "0.5,0.5",
"--port", "8000", "--host", "0.0.0.0"
]
},
{
"name": "Gemma 31B",
"cmd": [
r"llama_bin_run\llama-server.exe",
"--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
"-ngl", "999", "-c", "196608", "-np", "1", "-fa", "on",
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
"-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
"--prio", "3", "--mlock", "--poll", "50",
"--port", "8000", "--host", "0.0.0.0"
]
}
]
QUESTIONS = [
{
"id": "architecture",
"prompt": "수백만 건의 실시간 주식 틱 데이터를 수집하고, 이를 가공하여 초당 수천 명의 클라이언트에게 웹소켓으로 지연 없이 브로드캐스팅하는 시스템을 설계해야 합니다. 언어, 메시지 큐, 데이터베이스, 캐싱 전략 등을 포함해 구체적인 아키텍처를 제안하고, 병목 현상에 대비한 해결책을 설명하세요."
},
{
"id": "logic",
"prompt": "논리 문제: 방 안에 5명의 사람(A, B, C, D, E)이 있습니다. A는 B를 제외한 모든 사람과 악수했습니다. B는 C와만 악수했습니다. C는 D와 악수하지 않았습니다. 그렇다면 E는 총 몇 명과 악수했을까요? 당신의 논리적 사고 과정을 한 단계씩 명확히 설명해주세요."
},
{
"id": "coding",
"prompt": "파이썬에서 데코레이터를 작성하세요. 이 데코레이터는 함수의 실행을 최대 3번까지 재시도하며, 각 재시도 간에 지수 백오프(Exponential Backoff)를 적용해야 합니다. 로깅 처리가 포함되어야 하며, 어떤 예외 타입(Exception type)이 발생했을 때만 재시도할지 인자로 받을 수 있어야 합니다."
}
]
def check_server(timeout=300):
    """Block until the health endpoint reports the server as usable.

    Args:
        timeout: Maximum time to wait, in seconds.

    Returns:
        True when ``GET {BASE_URL}/health`` returns JSON whose ``status`` is
        ``"ok"`` or ``"ready"``; False if the timeout is reached first.
    """
    # Use a monotonic deadline so system clock changes cannot affect it.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=2) as resp:
                body = json.loads(resp.read())
            if body.get("status") in ("ok", "ready"):
                return True
        except Exception:
            # Treat every request/parse error as "not ready yet" while still
            # letting KeyboardInterrupt / SystemExit propagate.
            pass
        time.sleep(5)
    return False
def ask(prompt, max_tokens=4096):
    """Ask the model *prompt* (with a Korean-output system prompt).

    Args:
        prompt: Text of the user message.
        max_tokens: Value forwarded as ``max_tokens`` in the request.  The
            previous implementation ignored this argument and always sent
            ``-1``; pass ``-1`` explicitly to reproduce that request.

    Returns:
        The ``content`` string of the first choice in the reply.

    Raises:
        urllib.error.URLError: The request failed or timed out (1800 s).
        KeyError, IndexError: The reply lacks ``choices[0].message.content``.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [
            {"role": "system", "content": "You are a world-class IT system architect and developer. Please output your response in Korean."},
            {"role": "user", "content": prompt}
        ],
        # Honour the caller's limit (e.g. the 10-token warm-up request).
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode('utf-8')
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    # Close the HTTP response deterministically.
    with urllib.request.urlopen(req, timeout=1800) as http_resp:
        resp = json.loads(http_resp.read())
    return resp["choices"][0]["message"]["content"]
def main():
    """Collect each model's answers to QUESTIONS into RESULTS_FILE.

    For every entry of MODELS the server is started, polled with
    ``check_server``, warmed up and asked every question.  Answers (or an
    ``"ERROR: ..."`` string) are stored under ``results[model][question_id]``
    and the file is rewritten after each model.  The server is always
    stopped - including on boot failure - before the next model starts.
    """
    results = {}
    subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    time.sleep(3)

    for cfg in MODELS:
        print(f"\n[{time.strftime('%H:%M:%S')}] Booting {cfg['name']}...")
        proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        try:
            if not check_server(300):
                print(f"Failed to boot {cfg['name']}.")
                continue

            print(f"{cfg['name']} is ready! Asking questions...")
            # Warm-up; result discarded, failure tolerated.
            try:
                ask("Hi", max_tokens=10)
            except Exception:
                pass

            results[cfg['name']] = {}
            for q in QUESTIONS:
                print(f" -> Asking: {q['id']}")
                try:
                    # -1 is the value the request has always carried for the
                    # real questions; pass it explicitly so that stays true.
                    ans = ask(q['prompt'], max_tokens=-1)
                    results[cfg['name']][q['id']] = ans
                    print(" (Done)")
                except Exception as e:
                    results[cfg['name']][q['id']] = f"ERROR: {e}"
                    print(" (Error)")

            with open(RESULTS_FILE, "w", encoding="utf-8") as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
        finally:
            # Same cleanup on every path.  Previously a boot failure only
            # called terminate(), so the next model could start while the
            # old process was still shutting down.
            proc.terminate()
            subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            time.sleep(5)

    print("\n✅ All questions answered! Results saved to", RESULTS_FILE)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,241 @@
"""
Quality A/B Test — Gemma 4 26B vs Qwen 3.5 35B
실제 서비스 시나리오 기반 품질 비교
"""
import urllib.request, json, time, sys, os
try:
sys.stdout.reconfigure(encoding='utf-8')
except:
pass
BASE = "http://127.0.0.1:8000"
MODEL_NAME = sys.argv[1] if len(sys.argv) > 1 else "unknown"
OUTPUT_FILE = f"scripts/quality_result_{MODEL_NAME}.json"
SCENARIOS = [
# ═══ 1. 코딩 에이전트 (VS Code) ═══
{
"id": "code_generate",
"category": "coding",
"name": "Python 함수 생성",
"prompt": "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code.",
"eval_criteria": ["correctness", "type_hints", "docstring", "edge_cases"]
},
{
"id": "code_debug",
"category": "coding",
"name": "버그 찾기 & 수정",
"prompt": """Find and fix the bug in this code:
```python
def find_duplicates(arr):
seen = {}
duplicates = []
for item in arr:
if item in seen:
duplicates.append(item)
seen[item] = True
return list(set(duplicates))
# Bug: find_duplicates([1,2,2,3,3,3]) returns [2,3] but
# find_duplicates([]) crashes with unexpected behavior
# Also it should return count of each duplicate
```
Fix it to return a dict like {2: 2, 3: 3} (value=count of occurrences).""",
"eval_criteria": ["bug_identified", "correct_fix", "clean_code"]
},
{
"id": "code_refactor",
"category": "coding",
"name": "TypeScript 리팩토링",
"prompt": """Refactor this messy TypeScript into clean, typed code:
```typescript
async function getData(url, retry, timeout) {
let result = null
for (let i = 0; i < retry; i++) {
try {
const r = await fetch(url, {signal: AbortSignal.timeout(timeout)})
if (r.ok) {
result = await r.json()
break
}
} catch(e) {
if (i === retry - 1) throw e
await new Promise(r => setTimeout(r, 1000 * (i+1)))
}
}
return result
}
```
Add proper types, error handling, configurable backoff, and make it production-ready.""",
"eval_criteria": ["types", "error_handling", "backoff", "production_quality"]
},
# ═══ 2. 개인 비서 (Discord Bot) — 한국어 ═══
{
"id": "korean_schedule",
"category": "assistant_kr",
"name": "한국어 일정 관리",
"prompt": "내일 오후 2시에 팀 미팅이 있고, 3시에 치과 예약이 있어. 그리고 저녁 7시에 친구랑 홍대에서 만나기로 했어. 이 일정들을 정리해주고, 이동 시간을 고려해서 현실적으로 가능한지 알려줘. 서울 기준으로.",
"eval_criteria": ["korean_fluency", "schedule_analysis", "practical_advice"]
},
{
"id": "korean_email",
"category": "assistant_kr",
"name": "한국어 이메일 요약",
"prompt": """다음 이메일을 3줄로 요약하고, 필요한 액션을 정리해줘:
안녕하세요 김팀장님,
지난 주 논의했던 Q2 마케팅 예산 관련하여 연락드립니다.
본부장님께서 기존 제안 대비 15% 삭감을 요청하셨습니다.
이에 따라 디지털 마케팅 채널 중 ROI가 낮은 채널을 우선 정리해야 할 것 같습니다.
리서치팀에서는 네이버 검색광고 대비 인스타그램 광고의 전환율이
0.3%로 가장 낮다는 분석 결과를 공유했습니다.
수요일까지 수정안을 제출해야 하니, 화요일 오전까지
각 채널별 삭감 우선순위를 정리해서 회신 부탁드립니다.
감사합니다.
마케팅팀 박과장 드림""",
"eval_criteria": ["korean_summary", "action_items", "conciseness"]
},
# ═══ 3. MCP 도구 (Function Calling) ═══
{
"id": "tool_calling",
"category": "tool_use",
"name": "Function Calling (JSON)",
"prompt": """You have access to these tools:
- search_web(query: string) -> string
- get_calendar(date: string) -> list[Event]
- send_email(to: string, subject: string, body: string) -> bool
User says: "Check my calendar for tomorrow, and if I have a meeting with John, search for the latest quarterly report and email him a summary."
Respond with the exact sequence of tool calls as JSON array. Use this format:
[{"tool": "name", "args": {...}}, ...]""",
"eval_criteria": ["correct_sequence", "valid_json", "complete_args"]
},
{
"id": "structured_output",
"category": "tool_use",
"name": "구조화 출력 (JSON)",
"prompt": """Parse this unstructured text into a JSON object:
"삼성전자가 2026년 1분기 실적을 발표했다. 매출은 79조원으로 전년 동기 대비 12% 증가했고, 영업이익은 15.2조원을 기록했다. 반도체 부문이 전체 이익의 65%를 차지했으며, 특히 HBM4 수요 증가로 인해 메모리 사업부 매출이 전 분기 대비 23% 성장했다."
Output format:
{
"company": "",
"period": "",
"revenue": {"amount": "", "unit": "", "yoy_change": ""},
"operating_profit": {"amount": "", "unit": ""},
"segments": [{"name": "", "profit_share": "", "highlights": ""}]
}""",
"eval_criteria": ["correct_parsing", "valid_json", "completeness"]
},
# ═══ 4. 일반 추론 ═══
{
"id": "reasoning",
"category": "reasoning",
"name": "논리 추론",
"prompt": "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step.",
"eval_criteria": ["correct_answer", "clear_steps", "math_accuracy"]
},
]
def ask(prompt, max_tokens=800):
    """Send *prompt* as a single user message and return answer plus timing.

    Args:
        prompt: Text of the user message.
        max_tokens: Value forwarded as ``max_tokens`` in the request.

    Returns:
        A dict with ``content`` (the full answer), ``tokens``
        (``usage.completion_tokens``, 0 if absent), ``time`` (request
        wall-clock seconds rounded to 2 places) and ``tps`` (tokens / time).

    Raises:
        urllib.error.URLError: The request failed or timed out (120 s).
        KeyError, IndexError: The reply lacks ``choices[0].message.content``.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0
    }).encode()
    req = urllib.request.Request(
        f"{BASE}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    t0 = time.time()
    # ``with`` closes the HTTP response even if reading or parsing fails.
    with urllib.request.urlopen(req, timeout=120) as http_resp:
        resp = json.loads(http_resp.read())
    dt = time.time() - t0
    usage = resp.get("usage") or {}  # tolerate a missing or null "usage"
    tokens = usage.get("completion_tokens", 0)
    content = resp["choices"][0]["message"]["content"]
    return {
        "content": content,
        "tokens": tokens,
        "time": round(dt, 2),
        "tps": round(tokens / dt, 2) if dt > 0 else 0
    }
def main():
    """Run every scenario in SCENARIOS against the server and save results.

    The server at BASE must already be running: the script exits with
    status 1 if ``/health`` is unreachable or does not report ``"ok"``.
    One result row per scenario is written to OUTPUT_FILE; rows for failed
    scenarios carry an ``"ERROR: ..."`` response and zeroed metrics but
    otherwise the same keys as successful rows.
    """
    print(f"{'='*60}")
    print(f" Quality A/B Test — Model: {MODEL_NAME}")
    print(f" {len(SCENARIOS)} scenarios | {time.strftime('%Y-%m-%d %H:%M')}")
    print(f"{'='*60}\n")

    # Health check
    try:
        req = urllib.request.Request(f"{BASE}/health")
        with urllib.request.urlopen(req, timeout=5) as http_resp:
            resp = json.loads(http_resp.read())
        status = resp.get("status")
    except Exception as e:
        print(f"Server not reachable: {e}")
        sys.exit(1)
    if status != "ok":
        print("Server not ready!")
        sys.exit(1)

    # Warmup
    print("Warmup...", flush=True)
    ask("Hello", max_tokens=10)
    print("Done\n", flush=True)

    results = []
    for i, sc in enumerate(SCENARIOS):
        # Category and name were printed back to back with no separator.
        print(f"[{i+1}/{len(SCENARIOS)}] {sc['category']} | {sc['name']}")
        print(f" Prompt: {sc['prompt'][:80]}...", flush=True)
        try:
            resp = ask(sc["prompt"])
            print(f"{resp['tokens']} tokens | {resp['tps']:.1f} t/s | {resp['time']}s")
            print(f" Response preview: {resp['content'][:120]}...\n")
            results.append({
                "id": sc["id"],
                "category": sc["category"],
                "name": sc["name"],
                "model": MODEL_NAME,
                "response": resp["content"],
                "tokens": resp["tokens"],
                "time": resp["time"],
                "tps": resp["tps"],
                "eval_criteria": sc["eval_criteria"]
            })
        except Exception as e:
            print(f" ❌ Error: {e}\n")
            results.append({
                "id": sc["id"],
                "category": sc["category"],
                "name": sc["name"],
                "model": MODEL_NAME,
                "response": f"ERROR: {e}",
                "tokens": 0,
                "time": 0,
                "tps": 0,
                # Keep the row schema identical to successful rows.
                "eval_criteria": sc["eval_criteria"]
            })

    # Save
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\n{'='*60}")
    print(f" Results saved: {OUTPUT_FILE}")
    print(f" Total scenarios: {len(results)}")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,45 @@
"""Quick benchmark for running llama-server instance"""
import urllib.request, json, time, sys
BASE = "http://127.0.0.1:8000"
RUNS = 5
TOKENS = 200
def bench(max_tokens=TOKENS):
    """Time one counting completion against the running server.

    Args:
        max_tokens: Value forwarded as ``max_tokens`` in the request.

    Returns:
        A tuple ``(tokens_per_second, completion_tokens, elapsed_seconds)``.
        ``completion_tokens`` is 0 when the reply carries no usage data, and
        the rate is 0 when no time elapsed.

    Raises:
        urllib.error.URLError: The request failed or timed out (300 s).
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user", "content": "Count from 1 to 100, each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0
    }).encode()
    req = urllib.request.Request(
        f"{BASE}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    t0 = time.time()
    # Close the HTTP response deterministically.
    with urllib.request.urlopen(req, timeout=300) as http_resp:
        resp = json.loads(http_resp.read())
    dt = time.time() - t0
    ct = (resp.get("usage") or {}).get("completion_tokens", 0)
    return ct / dt if dt > 0 else 0, ct, dt
# Warm the server up with a short request; abort if it cannot be reached.
print("Warmup...", flush=True)
try:
    bench(20)
except Exception as warmup_error:
    print(f"Warmup failed: {warmup_error}")
    sys.exit(1)
print("Warmup done\n", flush=True)

# Timed runs: collect the tokens-per-second figure of each one.
speeds = []
for run_no in range(1, RUNS + 1):
    rate, n_tokens, seconds = bench()
    speeds.append(rate)
    print(f" Run {run_no}: {rate:.2f} t/s (tokens={n_tokens}, time={seconds:.2f}s)", flush=True)

# Summary line with mean, fastest and slowest run.
rule = "=" * 50
mean_rate = sum(speeds) / len(speeds)
fastest = max(speeds)
slowest = min(speeds)
print("\n" + rule)
print(f" RESULT: AVG {mean_rate:.2f} / BEST {fastest:.2f} / MIN {slowest:.2f} t/s")
print(rule)

View File

@@ -0,0 +1,67 @@
import subprocess
import time
import json
import urllib.request
import sys
import os
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"

# Remove any llama-server that is still running before starting a new one.
subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True)
time.sleep(2)

cmd = [
    LLAMA_SERVER, "--model", MODEL,
    "-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
    "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
    "-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
    "--prio", "3", "--port", "8000", "--host", "0.0.0.0",
    "-ts", "0.44,0.56"
]
print("🚀 Starting Challenge (0.44, 0.56) ...")
proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# try/finally guarantees the server is killed on every exit path, including
# sys.exit() and Ctrl+C.
try:
    ready = False
    for i in range(120):
        # Fail fast when the server process has already exited instead of
        # polling a dead process for the full two minutes.
        if proc.poll() is not None:
            print(f"\n❌ Server process exited during boot (exit code {proc.returncode}).")
            sys.exit(1)
        try:
            req = urllib.request.Request("http://127.0.0.1:8000/health")
            with urllib.request.urlopen(req, timeout=1) as r:
                if json.loads(r.read()).get("status") == "ok":
                    ready = True
                    break
        except Exception:
            # Not reachable / not ready yet; Ctrl+C is no longer swallowed.
            pass
        print(f" booting... {i}s", end='\r', flush=True)
        time.sleep(1)

    if not ready:
        print("\n❌ FAILED to boot.")
        sys.exit(1)

    print("\n✅ Booted! Testing 200 tokens...")
    try:
        payload = json.dumps({
            "messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}],
            "max_tokens": 200, "temperature": 0
        }).encode()
        req = urllib.request.Request("http://127.0.0.1:8000/v1/chat/completions", data=payload, headers={"Content-Type": "application/json"})
        t0 = time.time()
        with urllib.request.urlopen(req, timeout=300) as r:
            res = json.loads(r.read())
        el = time.time() - t0
        ct = res["usage"]["completion_tokens"]
        tps = ct / el
        print("="*50)
        print(f"★ 0.44 / 0.56 RESULT: {tps:.2f} t/s ★")
        print(f" Tokens: {ct} | Time: {el:.2f}s")
        print("="*50)
    except Exception as e:
        print(f"\n❌ Benchmark Error: {e}")
finally:
    proc.kill()

View File

@@ -0,0 +1,141 @@
"""
Qwen 3.5 35B-A3B Q4_K_M - Tensor Split Speed Test (ratio is set by TS below, currently 0.55/0.45)
64 t/s 달성 설정 기반, 스플릿 비율만 변경
"""
import subprocess, time, json, urllib.request, sys, os
PYTHON = sys.executable
LLAMA = os.path.join(os.getcwd(), "llama_bin_run", "llama-server.exe")
MODEL = os.path.join(os.getcwd(), "models", "Qwen3.5-35B-A3B-Q4_K_M.gguf")
TS = "0.55,0.45"  # value passed to the server's -ts option

# 1. Kill any existing server
print("[1/4] Killing existing llama-server...")
subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"],
               stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
time.sleep(3)

# 2. Start server with 64t/s config + custom split
args = [
    LLAMA, "--model", MODEL,
    "-ngl", "999",
    "-c", "262144",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "128",
    "-b", "512",
    "-t", "6",
    "-tb", "6",
    "--prio", "3",
    "--mlock",
    "--poll", "50",
    "--port", "8000",
    "--host", "0.0.0.0",
    "-ts", TS,
]
print(f"[2/4] Starting server with -ts {TS}")
print(f" CMD: {' '.join(args[-6:])}")
server = subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# 3. Wait for health
print("[3/4] Waiting for server to become healthy...")
t_boot = time.time()
healthy = False
for sec in range(180):  # max 3 min
    time.sleep(1)
    # Check if process crashed
    if server.poll() is not None:
        print(f" !! Server process CRASHED (exit code {server.returncode})")
        sys.exit(1)
    try:
        # ``with`` closes the response; it was previously left open.
        with urllib.request.urlopen("http://127.0.0.1:8000/health", timeout=1) as r:
            body = json.loads(r.read())
        if body.get("status") == "ok":
            healthy = True
            break
    except Exception:
        pass
    if sec % 10 == 9:
        print(f" ... {sec+1}s elapsed")

if not healthy:
    print(" FAIL: Server not healthy after 180 seconds")
    server.kill()
    # Also remove any llama-server.exe that kill() on our handle missed.
    subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"],
                   stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    sys.exit(1)

boot_secs = time.time() - t_boot
print(f" OK: Booted in {boot_secs:.1f}s")

# VRAM check (informational only, so failures are ignored)
try:
    v = subprocess.run(
        ["nvidia-smi", "--query-gpu=index,memory.used,memory.total",
         "--format=csv,noheader,nounits"],
        capture_output=True, text=True, timeout=5)
    print(f" VRAM: {v.stdout.strip()}")
except Exception:
    # ``Exception`` instead of a bare ``except:`` keeps Ctrl+C working.
    pass

# 4. Benchmark
print("[4/4] Running token speed benchmark (200 tokens)...")
def do_bench(max_tok):
    """Time one counting completion capped at *max_tok* tokens.

    Returns:
        A tuple ``(tokens_per_second, completion_tokens, elapsed_seconds)``
        where ``completion_tokens`` is read from the reply's ``usage`` object
        and ``elapsed_seconds`` is the wall-clock duration of the request.
    """
    body = {
        "messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}],
        "max_tokens": max_tok,
        "temperature": 0,
    }
    request = urllib.request.Request(
        "http://127.0.0.1:8000/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    started = time.time()
    with urllib.request.urlopen(request, timeout=120) as reply:
        parsed = json.loads(reply.read())
    duration = time.time() - started
    completion_tokens = parsed["usage"]["completion_tokens"]
    return completion_tokens / duration, completion_tokens, duration
# The whole benchmark runs inside try/finally so the llama-server process is
# stopped even when a request raises or the run is interrupted.
try:
    # warmup (result discarded, failure tolerated)
    try:
        do_bench(10)
    except Exception:
        pass

    # real runs - 5 iterations
    print("[4/4] Running 5x benchmark (200 tokens each)...")
    results = []
    for i in range(5):
        tps, tokens, elapsed = do_bench(200)
        results.append(tps)
        # VRAM check per run
        try:
            v = subprocess.run(["nvidia-smi", "--query-gpu=index,memory.used,memory.total,memory.free", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5)
            vram_info = v.stdout.strip()
        except Exception:
            vram_info = "?"
        print(f" Run {i+1}: {tps:.2f} t/s ({tokens} tok / {elapsed:.2f}s) | VRAM: {vram_info}")

    avg = sum(results) / len(results)
    best = max(results)
    worst = min(results)
    summary = f"""
==================================================
TS={TS} 5-Run Results (with --mlock --poll 50):
AVG: {avg:.2f} t/s
BEST: {best:.2f} t/s
MIN: {worst:.2f} t/s
Runs: {[f'{r:.2f}' for r in results]}
==================================================
"""
    print(summary)
    with open("scripts/split_test_result.txt", "w") as f:
        f.write(summary)
finally:
    # cleanup
    server.kill()
    subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"],
                   stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

View File

@@ -0,0 +1,36 @@
import urllib.request
import json
import traceback
BASE_URL = "http://127.0.0.1:8000"
prompt = "수백만 건의 실시간 주식 틱 데이터를 수집하고, 이를 가공하여 초당 수천 명의 클라이언트에게 웹소켓으로 지연 없이 브로드캐스팅하는 시스템을 설계해야 합니다. 언어, 메시지 큐, 데이터베이스, 캐싱 전략 등을 포함해 구체적인 아키텍처를 제안하고, 병목 현상에 대비한 해결책을 설명하세요."
def test():
    """Send the module-level ``prompt`` to the chat endpoint and print the reply.

    Any exception raised while building, sending or parsing the request is
    caught; the error and its traceback are printed instead of propagating.
    """
    try:
        conversation = [
            {"role": "system", "content": "You are a world-class IT system architect and developer. Please output your response in Korean."},
            {"role": "user", "content": prompt}
        ]
        body = json.dumps({
            "model": "m",
            "messages": conversation,
            "max_tokens": 4096,
            "temperature": 0.1
        }).encode('utf-8')
        endpoint = f"{BASE_URL}/v1/chat/completions"
        request = urllib.request.Request(
            endpoint,
            data=body,
            headers={"Content-Type": "application/json"}
        )
        print("전송 중... (타임아웃 300초)")
        raw_reply = urllib.request.urlopen(request, timeout=300).read()
        reply = json.loads(raw_reply)
        print("\n=== 결과 ===")
        print(reply["choices"][0]["message"]["content"])
    except Exception as err:
        print("\n=== 에러 발생 ===")
        print(err)
        traceback.print_exc()
if __name__ == "__main__":
test()

View File

@@ -0,0 +1,84 @@
// Benchmark script: starts llama-server with the tensor split "-ts 0.3,0.7",
// waits for /health to answer 200, then times one 200-token chat completion
// and prints tokens per second. Uses top-level await.
import { spawn, execSync } from "child_process";
const BASE_URL = "http://127.0.0.1:8000";
// args[0] is the server executable; the rest are its command-line arguments.
const args = [
"llama_bin_run\\llama-server.exe",
"--model", "models\\Qwen3.5-35B-A3B-Q4_K_M.gguf",
"-ngl", "999",
"-c", "262144",
"-np", "1",
"-fa", "on",
"--cache-type-k", "q4_0",
"--cache-type-v", "q4_0",
"-ub", "128",
"-b", "512",
"-t", "6",
"-tb", "6",
"--prio", "3",
"--port", "8000",
"--host", "0.0.0.0",
"-ts", "0.3,0.7"
];
console.log(`Starting server with args: \n${args.join(" ")}\n`);
// Force-kill any llama-server.exe already running; taskkill failures are ignored.
try { execSync('taskkill /F /IM llama-server.exe', { stdio: 'ignore' }); } catch(e) {}
// Pause 2 s before spawning the new server.
await new Promise(r => setTimeout(r, 2000));
// The server's output is discarded (stdio: 'ignore'); the handle is not used
// again below — shutdown is done with taskkill.
const server = spawn(args[0], args.slice(1), { stdio: 'ignore' });
let ready = false;
let bootStart = Date.now();
// Poll /health up to 60 times, sleeping 3 s after each unsuccessful attempt.
for (let i = 0; i < 60; i++) {
try {
const res = await fetch(`${BASE_URL}/health`);
if (res.status === 200) { ready = true; break; }
} catch(e) {}
await new Promise(r => setTimeout(r, 3000));
}
// Give up: kill the server and exit with a non-zero code.
if (!ready) {
console.log("Server failed to boot within 3 mins.");
try { execSync('taskkill /F /IM llama-server.exe', { stdio: 'ignore' }); } catch(e) {}
process.exit(1);
}
const bootElapsed = (Date.now() - bootStart) / 1000;
console.log(`\n===========================================`);
console.log(`Server booted in ${bootElapsed.toFixed(1)}s.`);
// Print per-GPU memory usage; skipped silently if nvidia-smi fails.
try {
const vram = execSync('nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits', { encoding: 'utf-8' });
console.log(`VRAM USAGE:\n${vram.trim()}`);
} catch(e) {}
console.log(`===========================================\n`);
// Warm-up request (10 tokens); the response is not read and errors are ignored.
try {
await fetch(`${BASE_URL}/v1/chat/completions`, {
method: "POST", headers: { "Content-Type": "application/json" },
body: JSON.stringify({ messages: [{ role: "user", content: "hi" }], max_tokens: 10, temperature: 0 })
});
} catch(e) {}
console.log("Running speed test (200 tokens)...");
// Timed request: elapsed covers the whole round trip, from before fetch()
// until the JSON body has been parsed.
const t0 = Date.now();
try {
const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
method: "POST", headers: { "Content-Type": "application/json" },
body: JSON.stringify({ messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }], max_tokens: 200, temperature: 0 })
});
const result = await res.json();
const elapsed = (Date.now() - t0) / 1000;
// Falls back to 0 tokens (and therefore 0 t/s) when the response has no usage data.
const ct = result?.usage?.completion_tokens || 0;
const tps = ct / elapsed;
console.log(`\n===========================================`);
console.log(`★ 0.3/0.7 SPLIT RESULT: ${tps.toFixed(2)} t/s ★`);
console.log(` Tokens: ${ct}`);
console.log(` Time: ${elapsed.toFixed(2)}s\n===========================================\n`);
} catch(e) {
console.log("ERROR during benchmark:", e.message);
}
// Always stop the server before exiting; exit code is 0 even if the benchmark failed.
try { execSync('taskkill /F /IM llama-server.exe', { stdio: 'ignore' }); } catch(e) {}
process.exit(0);

View File

@@ -0,0 +1,108 @@
import subprocess
import time
import json
import urllib.request
import sys
import os
try:
sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
pass
BASE_URL = "http://127.0.0.1:8000"
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
CONTEXT = 262144
def kill_server():
    """Force-kill every running ``llama-server.exe``, then wait 3 seconds.

    The ``taskkill`` exit status is ignored, so calling this when no server
    is running is harmless.
    """
    try:
        subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True)
    except OSError:
        # taskkill itself could not be launched; there is nothing we can stop.
        # Only OSError is swallowed so Ctrl+C still interrupts the script.
        pass
    time.sleep(3)
def run_benchmark(max_tokens=200):
    """Time a single chat completion request.

    Args:
        max_tokens: Value sent as ``max_tokens`` in the request body.

    Returns:
        A ``(tokens_per_second, completion_tokens, elapsed_seconds)`` tuple.
        ``completion_tokens`` is 0 when the response carries no usage data,
        and the rate is 0 when no time elapsed.
    """
    request_body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": "Count from 1 to 50, each on new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(request_body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    began = time.time()
    with urllib.request.urlopen(request, timeout=300) as response:
        reply = json.loads(response.read())
    duration = time.time() - began
    completion_tokens = reply.get("usage", {}).get("completion_tokens", 0)
    if duration > 0:
        rate = completion_tokens / duration
    else:
        rate = 0
    return rate, completion_tokens, duration
def get_vram():
    """Return per-GPU memory usage as reported by ``nvidia-smi``.

    Returns:
        The stripped CSV output of ``nvidia-smi`` (index, memory.used,
        memory.total per line), or ``"Unknown"`` when ``nvidia-smi`` cannot
        be launched or does not finish within 5 seconds.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
    except (OSError, subprocess.SubprocessError):
        # OSError: executable missing; SubprocessError covers TimeoutExpired.
        return "Unknown"
    return r.stdout.strip()
# Make sure no earlier llama-server instance is still holding the port.
kill_server()

# Single source of truth for the tensor split, so the command line and the
# printed labels cannot drift apart.
TENSOR_SPLIT = "0.45,0.55"
cmd = [
    LLAMA_SERVER, "--model", MODEL,
    "-ngl", "999", "-c", str(CONTEXT), "-np", "1", "-fa", "on",
    "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
    "-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
    "--prio", "3", "--port", "8000", "--host", "0.0.0.0",
    "-ts", TENSOR_SPLIT
]
print(f"Starting server with tensorSplit {TENSOR_SPLIT}")
proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=os.getcwd())
try:
    ready = False
    boot_start = time.time()
    # Poll /health up to 30 times, sleeping 3 s after each unsuccessful attempt.
    for _ in range(30):
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=2) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                ready = True
                break
        except Exception:
            # Any request or parse failure means "not ready yet"; retry.
            # Exception (not a bare except) keeps Ctrl+C working.
            pass
        time.sleep(3)
    if not ready:
        print("Server failed to boot.")
        # The finally clause below stops the server before the exit.
        sys.exit(1)
    boot_time = time.time() - boot_start
    print(f"Booted in {boot_time:.1f}s")
    print(f"VRAM:\n{get_vram()}")
    try:
        print("Warming up...")
        run_benchmark(10)
        print("Benchmarking (200 tokens)...")
        tps, ct, el = run_benchmark(200)
        print("=" * 50)
        # Label reflects the split actually passed to the server.
        print(f"★ {TENSOR_SPLIT.replace(',', '/')} SPLIT RESULT: {tps:.2f} t/s ★")
        print(f" Tokens: {ct} / Time: {el:.2f}s")
        print("=" * 50)
    except Exception as e:
        print(f"Error benchmark: {e}")
finally:
    # Always stop the server, including on Ctrl+C or sys.exit().
    kill_server()

View File

@@ -0,0 +1,47 @@
[
{
"name": "Baseline: all expert CPU",
"avg_tps": 8.72,
"best_tps": 8.74,
"vram_gpu0": 620,
"vram_gpu1": 6493,
"vram_total": 7113,
"status": "OK"
},
{
"name": "n-cpu-moe=60 (4 layers expert GPU)",
"avg_tps": 8.72,
"best_tps": 8.77,
"vram_gpu0": 638,
"vram_gpu1": 6493,
"vram_total": 7131,
"status": "OK"
},
{
"name": "n-cpu-moe=56 (8 layers expert GPU)",
"avg_tps": 8.72,
"best_tps": 8.8,
"vram_gpu0": 624,
"vram_gpu1": 6493,
"vram_total": 7117,
"status": "OK"
},
{
"name": "n-cpu-moe=52 (12 layers expert GPU)",
"avg_tps": 8.76,
"best_tps": 8.79,
"vram_gpu0": 634,
"vram_gpu1": 6493,
"vram_total": 7127,
"status": "OK"
},
{
"name": "n-cpu-moe=48 (16 layers expert GPU)",
"avg_tps": 8.81,
"best_tps": 8.95,
"vram_gpu0": 632,
"vram_gpu1": 6493,
"vram_total": 7125,
"status": "OK"
}
]

View File

@@ -0,0 +1,52 @@
[
{
"name": "GPU1 only + Expert CPU + 8t",
"avg_tps": 8.74,
"best_tps": 8.75,
"vram_gpu0": 618,
"vram_gpu1": 6493,
"vram_total": 7111,
"pcie": "1, 4 | 4, 16",
"status": "OK"
},
{
"name": "GPU1 only + Expert CPU + 16t",
"avg_tps": 8.0,
"best_tps": 8.02,
"vram_gpu0": 619,
"vram_gpu1": 6493,
"vram_total": 7112,
"pcie": "1, 4 | 4, 16",
"status": "OK"
},
{
"name": "Both GPU (main=1) + Expert CPU + 8t",
"avg_tps": 4.71,
"best_tps": 4.75,
"vram_gpu0": 4220,
"vram_gpu1": 3779,
"vram_total": 7999,
"pcie": "3, 4 | 4, 16",
"status": "OK"
},
{
"name": "Both GPU (ts 0.2,0.8) + Expert CPU + 8t",
"avg_tps": 4.53,
"best_tps": 4.6,
"vram_gpu0": 2666,
"vram_gpu1": 5333,
"vram_total": 7999,
"pcie": "2, 4 | 4, 16",
"status": "OK"
},
{
"name": "GPU1 only + Expert CPU + 8t + b4096",
"avg_tps": 8.73,
"best_tps": 8.77,
"vram_gpu0": 615,
"vram_gpu1": 6895,
"vram_total": 7510,
"pcie": "1, 4 | 4, 16",
"status": "OK"
}
]

View File

@@ -0,0 +1,37 @@
[
{
"name": "n-cpu-moe=64 (all CPU)",
"n_cpu_moe": 64,
"speed_tps": 4.87,
"vram_gpu0": 4257,
"vram_gpu1": 3793,
"vram_total": 8050,
"status": "OK"
},
{
"name": "n-cpu-moe=56 (8 layers GPU expert)",
"n_cpu_moe": 56,
"speed_tps": 4.78,
"vram_gpu0": 4233,
"vram_gpu1": 3793,
"vram_total": 8026,
"status": "OK"
},
{
"name": "n-cpu-moe=48 (16 layers GPU expert)",
"n_cpu_moe": 48,
"speed_tps": 4.82,
"vram_gpu0": 4233,
"vram_gpu1": 3793,
"vram_total": 8026,
"status": "OK"
},
{
"name": "n-cpu-moe=40 (24 layers GPU expert)",
"status": "BOOT_FAIL"
},
{
"name": "n-cpu-moe=32 (32 layers GPU expert)",
"status": "BOOT_FAIL"
}
]

View File

@@ -0,0 +1,43 @@
[
{
"name": "ngl=999 + expert CPU + no-mmap",
"ngl": 999,
"avg_tps": 4.8,
"best_tps": 4.84,
"vram_gpu0": 4225,
"vram_gpu1": 3779,
"vram_total": 8004,
"pcie": "3, 4\r | 4, 16",
"status": "OK"
},
{
"name": "ngl=10 (pure, no expert override)",
"ngl": 10,
"avg_tps": 2.52,
"best_tps": 2.56,
"vram_gpu0": 10309,
"vram_gpu1": 5871,
"vram_total": 16180,
"pcie": "1, 4\r | 1, 16",
"status": "OK"
},
{
"name": "ngl=12 (pure)",
"ngl": 12,
"avg_tps": 2.86,
"best_tps": 2.86,
"vram_gpu0": 11807,
"vram_gpu1": 7377,
"vram_total": 19184,
"pcie": "2, 4\r | 2, 16",
"status": "OK"
},
{
"name": "ngl=14 (pure)",
"status": "BOOT_FAIL"
},
{
"name": "ngl=999 + upper expert CPU (blk 32-63)",
"status": "BOOT_FAIL"
}
]

View File

@@ -0,0 +1,68 @@
[
{
"name": "Qwen 27B - 256K (q4_0)",
"status": "Success",
"vram": [
"0, 10853 MiB, 12288 MiB",
"1, 10951 MiB, 12288 MiB"
],
"tests": {
"code": {
"time": 17.89,
"tokens": 300,
"tps": 16.77,
"res": "..."
},
"logical": {
"time": 17.96,
"tokens": 300,
"tps": 16.71,
"res": "..."
}
}
},
{
"name": "Gemma 31B - 32K (q4_0)",
"status": "Success",
"vram": [
"0, 9834 MiB, 12288 MiB",
"1, 9963 MiB, 12288 MiB"
],
"tests": {
"code": {
"time": 18.75,
"tokens": 300,
"tps": 16.0,
"res": "```python\nfrom typing import List, Any\n\ndef merge_sorted_lists(list1: List[Any], list2: List[Any]) -> List[Any]:\n \"\"\"\n Merges two sorted lists i..."
},
"logical": {
"time": 18.82,
"tokens": 300,
"tps": 15.94,
"res": "..."
}
}
},
{
"name": "Gemma 31B - 64K (q4_0)",
"status": "Success",
"vram": [
"0, 10346 MiB, 12288 MiB",
"1, 10387 MiB, 12288 MiB"
],
"tests": {
"code": {
"time": 18.75,
"tokens": 300,
"tps": 16.0,
"res": "```python\nfrom typing import List, Any\n\ndef merge_sorted_lists(list1: List[Any], list2: List[Any]) -> List[Any]:\n \"\"\"\n Merges two sorted lists i..."
},
"logical": {
"time": 18.83,
"tokens": 300,
"tps": 15.93,
"res": "..."
}
}
}
]

View File

@@ -0,0 +1,68 @@
[
{
"name": "Qwen 27B - 256K 극한 (q4_0, ub=512)",
"status": "Success",
"vram": [
"0, 11120 MiB, 12288 MiB",
"1, 11081 MiB, 12288 MiB"
],
"tests": {
"code": {
"time": 17.97,
"tokens": 300,
"tps": 16.7,
"res": "..."
},
"logical": {
"time": 18.01,
"tokens": 300,
"tps": 16.65,
"res": "..."
}
}
},
{
"name": "Gemma 31B - 128K 확장 (q4_0)",
"status": "Success",
"vram": [
"0, 11437 MiB, 12288 MiB",
"1, 11259 MiB, 12288 MiB"
],
"tests": {
"code": {
"time": 18.75,
"tokens": 300,
"tps": 16.0,
"res": "```python\nfrom typing import List, Any\n\ndef merge_sorted_lists(list1: List[Any], list2: List[Any]) -> List[Any]:\n \"\"\"\n Merges two sorted lists i..."
},
"logical": {
"time": 18.79,
"tokens": 300,
"tps": 15.97,
"res": "..."
}
}
},
{
"name": "Gemma 31B - 192K 극한 (q4_0)",
"status": "Success",
"vram": [
"0, 11888 MiB, 12288 MiB",
"1, 11754 MiB, 12288 MiB"
],
"tests": {
"code": {
"time": 18.69,
"tokens": 300,
"tps": 16.05,
"res": "```python\nfrom typing import List, Any\n\ndef merge_sorted_lists(list1: List[Any], list2: List[Any]) -> List[Any]:\n \"\"\"\n Merges two sorted lists i..."
},
"logical": {
"time": 18.77,
"tokens": 300,
"tps": 15.98,
"res": "..."
}
}
}
]

View File

@@ -0,0 +1,8 @@
==================================================
Gemma4 26B Q4_K_M 5-Run Results:
AVG: 74.65 t/s
BEST: 75.07 t/s
MIN: 74.27 t/s
Runs: ['74.59', '74.68', '74.65', '75.07', '74.27']
==================================================

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,124 @@
[
{
"id": "code_generate",
"category": "coding",
"name": "Python 함수 생성",
"model": "gemma4",
"response": "```python\nfrom typing import List\n\ndef merge_sorted",
"tokens": 800,
"time": 11.21,
"tps": 71.34,
"eval_criteria": [
"correctness",
"type_hints",
"docstring",
"edge_cases"
]
},
{
"id": "code_debug",
"category": "coding",
"name": "버그 찾기 & 수정",
"model": "gemma4",
"response": "",
"tokens": 800,
"time": 11.2,
"tps": 71.4,
"eval_criteria": [
"bug_identified",
"correct_fix",
"clean_code"
]
},
{
"id": "code_refactor",
"category": "coding",
"name": "TypeScript 리팩토링",
"model": "gemma4",
"response": "",
"tokens": 800,
"time": 11.23,
"tps": 71.26,
"eval_criteria": [
"types",
"error_handling",
"backoff",
"production_quality"
]
},
{
"id": "korean_schedule",
"category": "assistant_kr",
"name": "한국어 일정 관리",
"model": "gemma4",
"response": "요청하신 내일 일정을 정리하고, 서울 시내 이동 시간을 고려하여 현실적인 가능성을 분석해",
"tokens": 800,
"time": 11.2,
"tps": 71.43,
"eval_criteria": [
"korean_fluency",
"schedule_analysis",
"practical_advice"
]
},
{
"id": "korean_email",
"category": "assistant_kr",
"name": "한국어 이메일 요약",
"model": "gemma4",
"response": "요청하신 내용을 다음과 같이 요약 및 정리해 드립니다.\n\n**[3줄 요약]**\n1. 본부장님 지시로 Q2 마케팅 예산이 기존 대비 15% 삭감되었습니다.\n2. 이에 따라 ROI가 낮은 채널(인스타그램 등)을 중심으로 예산 조정이 필요합니다.\n3. 수요일 수정안 제출을 위해 채널별 삭감 우선순위 결정이 시급합니다.\n\n**[필요 액션]**\n* **채널별 삭감 우선순위 정리 및 회신** (기한: **화요일 오전까지**)",
"tokens": 686,
"time": 9.67,
"tps": 70.95,
"eval_criteria": [
"korean_summary",
"action_items",
"conciseness"
]
},
{
"id": "tool_calling",
"category": "tool_use",
"name": "Function Calling (JSON)",
"model": "gemma4",
"response": "",
"tokens": 800,
"time": 11.19,
"tps": 71.49,
"eval_criteria": [
"correct_sequence",
"valid_json",
"complete_args"
]
},
{
"id": "structured_output",
"category": "tool_use",
"name": "구조화 출력 (JSON)",
"model": "gemma4",
"response": "",
"tokens": 800,
"time": 11.41,
"tps": 70.12,
"eval_criteria": [
"correct_parsing",
"valid_json",
"completeness"
]
},
{
"id": "reasoning",
"category": "reasoning",
"name": "논리 추론",
"model": "gemma4",
"response": "To find the amount of wheat produced by each field, we can use algebra. We will express the production of all fields in terms of one variable.\n\n### Step 1: Define the variables\nLet **$B$** be the amount of wheat produced by **Field B**.\n\nBased on the problem description, we can define the other fields relative to $B$:\n* **Field A** produces 20% more than Field B:\n $A = B + 0.20B = 1.2B$\n* **Field C** produces ",
"tokens": 800,
"time": 11.2,
"tps": 71.44,
"eval_criteria": [
"correct_answer",
"clear_steps",
"math_accuracy"
]
}
]

View File

@@ -0,0 +1,124 @@
[
{
"id": "code_generate",
"category": "coding",
"name": "Python 함수 생성",
"model": "qwen27b",
"response": "",
"tokens": 800,
"time": 47.56,
"tps": 16.82,
"eval_criteria": [
"correctness",
"type_hints",
"docstring",
"edge_cases"
]
},
{
"id": "code_debug",
"category": "coding",
"name": "버그 찾기 & 수정",
"model": "qwen27b",
"response": "# Fixed Code\n\n```python\ndef find_duplicates(arr):\n count = {}\n for item in arr:\n count[item] = count.get(item, 0) + 1\n \n # Filter to only include items that appear more than once\n duplicates = {item: count for item, count in count.items() if count > 1}\n return duplicates\n```\n\n## What Was Fixed\n\n1. **Changed return type**: Now returns a dictionary `{value: count}` instead of a list\n2. **Fixed counting logic**: Counts all occurrences of each item, not just marks as seen\n3. **Handles empty lists**: Returns `{}` instead of crashing\n4. **Removed unnecessary set conversion**: No need to convert to set since we're returning a dict\n\n## Test Cases\n\n```python\nprint(find_duplicates([1, 2, 2, 3, 3, 3])) # {2: 2, 3: 3}\nprint(find_duplicates([])) # {}\nprint(find_duplicates([1, 1, 1])) # {1: 3}\nprint(find_duplicates([1, 2, 3])) # {}\nprint(find_duplicates([5])) # {}\n```\n\n## Alternative (More Pythonic)\n\n```python\nfrom collections import Counter\n\ndef find_duplicates(arr):\n count = Counter(arr)\n return {item: count for item, count in count.items() if count > 1}\n```\n\nThis version uses `Counter` from the `collections`",
"tokens": 800,
"time": 48.01,
"tps": 16.66,
"eval_criteria": [
"bug_identified",
"correct_fix",
"clean_code"
]
},
{
"id": "code_refactor",
"category": "coding",
"name": "TypeScript 리팩토링",
"model": "qwen27b",
"response": "",
"tokens": 800,
"time": 47.99,
"tps": 16.67,
"eval_criteria": [
"types",
"error_handling",
"backoff",
"production_quality"
]
},
{
"id": "korean_schedule",
"category": "assistant_kr",
"name": "한국어 일정 관리",
"model": "qwen27b",
"response": "",
"tokens": 800,
"time": 47.75,
"tps": 16.75,
"eval_criteria": [
"korean_fluency",
"schedule_analysis",
"practical_advice"
]
},
{
"id": "korean_email",
"category": "assistant_kr",
"name": "한국어 이메일 요약",
"model": "qwen27b",
"response": "",
"tokens": 800,
"time": 48.05,
"tps": 16.65,
"eval_criteria": [
"korean_summary",
"action_items",
"conciseness"
]
},
{
"id": "tool_calling",
"category": "tool_use",
"name": "Function Calling (JSON)",
"model": "qwen27b",
"response": "[{\"tool\": \"get_calendar\", \"args\": {\"date\": \"tomorrow\"}}, {\"tool\": \"search_web\", \"args\": {\"query\": \"latest quarterly report\"}}, {\"tool\": \"send_email\", \"args\": {\"to\": \"john@example.com\", \"subject\": \"Quarterly Report Summary\", \"body\": \"Summary of the latest quarterly report attached for your review.\"}}]",
"tokens": 719,
"time": 43.06,
"tps": 16.7,
"eval_criteria": [
"correct_sequence",
"valid_json",
"complete_args"
]
},
{
"id": "structured_output",
"category": "tool_use",
"name": "구조화 출력 (JSON)",
"model": "qwen27b",
"response": "",
"tokens": 800,
"time": 48.01,
"tps": 16.66,
"eval_criteria": [
"correct_parsing",
"valid_json",
"completeness"
]
},
{
"id": "reasoning",
"category": "reasoning",
"name": "논리 추론",
"model": "qwen27b",
"response": "",
"tokens": 800,
"time": 47.67,
"tps": 16.78,
"eval_criteria": [
"correct_answer",
"clear_steps",
"math_accuracy"
]
}
]

View File

@@ -0,0 +1,124 @@
[
{
"id": "code_generate",
"category": "coding",
"name": "Python 함수 생성",
"model": "qwen35b",
"response": "",
"tokens": 800,
"time": 12.7,
"tps": 62.97,
"eval_criteria": [
"correctness",
"type_hints",
"docstring",
"edge_cases"
]
},
{
"id": "code_debug",
"category": "coding",
"name": "버그 찾기 & 수정",
"model": "qwen35b",
"response": "Here's the corrected version of the function that returns a dictionary of duplicate values with their occurrence counts:\n\n```python\ndef find_duplicates(arr):\n count = {}\n for item in arr:\n count[item] = count.get(item, 0) + 1\n \n return {item: cnt for item, cnt in count.items() if cnt > 1}\n```\n\n### Explanation of Fixes:\n\n1. **Correct Output Format:** The function now returns a dictionary where each key is a duplicate value and the corresponding value is its total count of occurrences.\n2. **Handles Empty Lists:** The function works correctly with an empty list, returning an empty dictionary `{}`.\n3. **Accurate Duplicate Detection:** Only values that appear more than once are included in the result.\n\n### Example Usage:\n\n```python\nprint(find_duplicates([1, 2, 2, 3, 3, 3])) # Output: {2: 2, 3: 3}\nprint(find_duplicates([])) # Output: {}\nprint(find_duplicates([1, 1, 1])) # Output: {1: 3}\nprint(find_duplicates([1, 2, 3])) # Output: {}\n```\n\nThis implementation is efficient, readable, and meets the specified requirements.",
"tokens": 788,
"time": 12.8,
"tps": 61.58,
"eval_criteria": [
"bug_identified",
"correct_fix",
"clean_code"
]
},
{
"id": "code_refactor",
"category": "coding",
"name": "TypeScript 리팩토링",
"model": "qwen35b",
"response": "",
"tokens": 800,
"time": 13.05,
"tps": 61.3,
"eval_criteria": [
"types",
"error_handling",
"backoff",
"production_quality"
]
},
{
"id": "korean_schedule",
"category": "assistant_kr",
"name": "한국어 일정 관리",
"model": "qwen35b",
"response": "",
"tokens": 800,
"time": 12.89,
"tps": 62.08,
"eval_criteria": [
"korean_fluency",
"schedule_analysis",
"practical_advice"
]
},
{
"id": "korean_email",
"category": "assistant_kr",
"name": "한국어 이메일 요약",
"model": "qwen35b",
"response": "",
"tokens": 800,
"time": 13.06,
"tps": 61.27,
"eval_criteria": [
"korean_summary",
"action_items",
"conciseness"
]
},
{
"id": "tool_calling",
"category": "tool_use",
"name": "Function Calling (JSON)",
"model": "qwen35b",
"response": "",
"tokens": 800,
"time": 12.92,
"tps": 61.9,
"eval_criteria": [
"correct_sequence",
"valid_json",
"complete_args"
]
},
{
"id": "structured_output",
"category": "tool_use",
"name": "구조화 출력 (JSON)",
"model": "qwen35b",
"response": "",
"tokens": 800,
"time": 13.04,
"tps": 61.34,
"eval_criteria": [
"correct_parsing",
"valid_json",
"completeness"
]
},
{
"id": "reasoning",
"category": "reasoning",
"name": "논리 추론",
"model": "qwen35b",
"response": "",
"tokens": 800,
"time": 12.86,
"tps": 62.21,
"eval_criteria": [
"correct_answer",
"clear_steps",
"math_accuracy"
]
}
]

View File

@@ -0,0 +1,8 @@
==================================================
TS=0.5,0.5 5-Run Results (with --mlock --poll 50):
AVG: 61.94 t/s
BEST: 62.06 t/s
MIN: 61.74 t/s
Runs: ['62.06', '61.74', '61.92', '62.00', '61.96']
==================================================

View File

@@ -0,0 +1,129 @@
import subprocess, time, urllib.request, json, sys
try: sys.stdout.reconfigure(encoding='utf-8')
except: pass
MODEL = "C:/Users/Variet-Worker/Desktop/variet-llm/models/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
BASE = "http://127.0.0.1:8000"
# BEST SO FAR: GPU1 only + Expert CPU + 8t = 8.75 t/s (6.5GB / 12GB used)
# 5.5GB VRAM remaining on GPU 1. Let's use it!
# Strategy: keep some experts on GPU 1 using -ncmoe (n-cpu-moe)
# n-cpu-moe = number of layers whose experts stay on CPU
# Lower = more experts on GPU = more VRAM used = potentially faster
# llama-server command line shared by every configuration; the sweep loop
# appends each config's "extra" list to it before launching.
BASE_CMD = [
r"llama_bin_run\llama-server.exe",
"--model", MODEL,
"-ngl", "999",
"-sm", "none", "--main-gpu", "1",
"-c", "4096", "-np", "1", "-fa", "on",
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
"-ub", "512", "-b", "2048",
"-t", "8", "-tb", "8",
"--prio", "3", "--poll", "50",
"--no-mmap",
"--port", "8000", "--host", "0.0.0.0"
]
# Configurations benchmarked in order. "name" labels the run in the console
# output and the results file; "extra" holds the additional server arguments.
CONFIGS = [
# Baseline: all experts CPU (confirmed 8.75 t/s)
{"name": "Baseline: all expert CPU", "extra": ["-ot", ".*ffn_.*_exps.*=CPU"]},
# Try n-cpu-moe with GPU1 only: keep some experts on GPU
{"name": "n-cpu-moe=60 (4 layers expert GPU)", "extra": ["-ncmoe", "60"]},
{"name": "n-cpu-moe=56 (8 layers expert GPU)", "extra": ["-ncmoe", "56"]},
{"name": "n-cpu-moe=52 (12 layers expert GPU)", "extra": ["-ncmoe", "52"]},
{"name": "n-cpu-moe=48 (16 layers expert GPU)", "extra": ["-ncmoe", "48"]},
]
def kill():
    """Force-kill all ``llama-server.exe`` processes, then wait 4 seconds."""
    silence = {"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL}
    subprocess.run("taskkill /F /IM llama-server.exe", shell=True, **silence)
    time.sleep(4)
def check_server(timeout=900):
    """Poll the server's ``/health`` endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as ``/health`` returns JSON whose ``"status"`` is
        ``"ok"`` or ``"ready"``; False once ``timeout`` seconds have passed
        without that happening.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            req = urllib.request.Request(f"{BASE}/health")
            # Context manager closes the connection after every probe.
            with urllib.request.urlopen(req, timeout=2) as resp:
                status = json.loads(resp.read()).get("status")
            if status in ("ok", "ready"):
                return True
        except Exception:
            # Any request or parse failure means "not ready yet". Catching
            # Exception rather than everything lets Ctrl+C abort the wait.
            pass
        time.sleep(5)
    return False
def bench(runs=3):
    """Run the fibonacci prompt ``runs`` times and report generation speed.

    Each run prints its own tokens/second line.

    Args:
        runs: Number of timed requests to send.

    Returns:
        A ``(average_tps, best_tps)`` tuple over all runs.
    """
    body = json.dumps({
        "model": "m",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Write a Python fibonacci function with memoization."}
        ],
        "max_tokens": 200,
        "temperature": 0.0
    }).encode('utf-8')
    endpoint = f"{BASE}/v1/chat/completions"
    speeds = []
    for run_no in range(1, runs + 1):
        request = urllib.request.Request(endpoint, data=body, headers={"Content-Type": "application/json"})
        started = time.time()
        raw = urllib.request.urlopen(request, timeout=600).read()
        reply = json.loads(raw)
        took = time.time() - started
        tokens = reply.get("usage", {}).get("completion_tokens", 0)
        if took > 0:
            rate = tokens / took
        else:
            rate = 0
        speeds.append(rate)
        print(f" Run {run_no}: {rate:.2f} t/s ({tokens} tok / {took:.1f}s)")
    return sum(speeds)/len(speeds), max(speeds)
def vram():
    """Return the used memory of each GPU as a list of integers.

    The values are parsed from ``nvidia-smi --query-gpu=memory.used`` with
    units suppressed, one entry per output line.

    Returns:
        One int per GPU line, or ``[0, 0]`` when ``nvidia-smi`` cannot be
        run, exits with an error, or prints something that is not a number.
    """
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"]
        ).decode().strip()
        return [int(line.strip()) for line in out.splitlines()]
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: executable missing; SubprocessError: non-zero exit;
        # ValueError: unparsable output. Anything else is a real bug.
        return [0, 0]
# Benchmark every configuration in turn. Each server process is terminated in
# a finally clause, and a failed benchmark is recorded instead of aborting the
# sweep, so results collected so far are still printed and saved.
results = []
for cfg in CONFIGS:
    kill()
    print(f"\n{'='*60}")
    print(f"Testing: {cfg['name']}")
    print(f"{'='*60}")
    cmd = BASE_CMD + cfg["extra"]
    proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    try:
        if not check_server(900):
            print(" FAILED TO BOOT")
            results.append({"name": cfg["name"], "status": "BOOT_FAIL"})
            continue
        print(" Server ready! Warming up...")
        try:
            p = json.dumps({"model":"m","messages":[{"role":"system","content":"Hi"},{"role":"user","content":"Hi"}],"max_tokens":5}).encode()
            urllib.request.urlopen(urllib.request.Request(f"{BASE}/v1/chat/completions",data=p,headers={"Content-Type":"application/json"}), timeout=120)
        except Exception:
            # Warm-up is best-effort; the timed runs below still proceed.
            pass
        v = vram()
        print(f" VRAM: GPU0={v[0]}MB, GPU1={v[1]}MB, Total={sum(v)}MB")
        try:
            avg, best = bench(runs=3)
        except Exception as exc:
            # Record the failure and move on to the next configuration.
            print(f" BENCH FAILED: {exc}")
            results.append({"name": cfg["name"], "status": "BENCH_FAIL"})
            continue
        print(f" >>> AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
        results.append({
            "name": cfg["name"], "avg_tps": round(avg,2), "best_tps": round(best,2),
            "vram_gpu0": v[0], "vram_gpu1": v[1], "vram_total": sum(v), "status": "OK"
        })
    finally:
        # Runs on every path (success, continue, exception, Ctrl+C).
        proc.terminate()
kill()
print(f"\n\n{'='*60}")
print("FINAL RESULTS - GPU1 Expert Balance (Target: 10+ t/s)")
print(f"{'='*60}")
print(f"{'Config':<48} {'AVG':>6} {'BEST':>6} {'GPU1':>7}")
print("-" * 72)
for r in results:
    if r["status"] == "OK":
        print(f" {r['name']:<46} {r['avg_tps']:>5} {r['best_tps']:>5} {r['vram_gpu1']:>5}MB")
    else:
        # Any non-OK status (boot or benchmark failure) is shown as FAIL.
        print(f" {r['name']:<46} {'FAIL':>5}")
with open("scripts/122b_final_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print("\nSaved to scripts/122b_final_results.json")

View File

@@ -0,0 +1,107 @@
import subprocess, time, urllib.request, json, sys
try: sys.stdout.reconfigure(encoding='utf-8')
except: pass
MODEL = "C:/Users/Variet-Worker/Desktop/variet-llm/models/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
BASE = "http://127.0.0.1:8000"
# Goal: See if decreasing n-cpu-moe increases VRAM usage on GPU 1 and improves speed
# Now that we know GPU 1 is isolated and has ~3.5GB free with 256K context
# llama-server command line shared by every configuration; the sweep loop
# appends each config's "extra" list to it before launching.
BASE_CMD = [
r"llama_bin_run\llama-server.exe",
"--model", MODEL,
"-ngl", "999",
"-sm", "none", "--main-gpu", "1",
"-c", "4096", "-np", "1", "-fa", "on", # use a small context for fast boot/testing
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
"-ub", "512", "-b", "2048",
"-t", "8", "-tb", "8",
"--prio", "3", "--poll", "50",
"--no-mmap",
"--port", "8000", "--host", "0.0.0.0"
]
# Configurations benchmarked in order: decreasing "-ncmoe" values, starting
# from 48 as the baseline. "name" labels the run in the printed results.
CONFIGS = [
{"name": "n-cpu-moe=48 (baseline)", "extra": ["-ncmoe", "48"]},
{"name": "n-cpu-moe=40", "extra": ["-ncmoe", "40"]},
{"name": "n-cpu-moe=32", "extra": ["-ncmoe", "32"]},
{"name": "n-cpu-moe=24", "extra": ["-ncmoe", "24"]},
]
def kill():
    """Force-kill all ``llama-server.exe`` processes and pause for 4 seconds."""
    devnull = subprocess.DEVNULL
    subprocess.run(
        "taskkill /F /IM llama-server.exe",
        shell=True,
        stdout=devnull,
        stderr=devnull,
    )
    time.sleep(4)
def check_server(timeout=900):
    """Wait for the server's ``/health`` endpoint to report ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as ``/health`` returns JSON whose ``"status"`` is
        ``"ok"`` or ``"ready"``; False once ``timeout`` seconds have passed
        without that happening.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            req = urllib.request.Request(f"{BASE}/health")
            # Context manager closes the connection after every probe.
            with urllib.request.urlopen(req, timeout=2) as resp:
                status = json.loads(resp.read()).get("status")
            if status in ("ok", "ready"):
                return True
        except Exception:
            # Any request or parse failure means "not ready yet". Catching
            # Exception rather than everything lets Ctrl+C abort the wait.
            pass
        time.sleep(5)
    return False
def bench(runs=2):
    """Send the short-script prompt ``runs`` times and measure speed.

    Args:
        runs: Number of timed requests to send.

    Returns:
        A ``(average_tps, best_tps)`` tuple over all runs.
    """
    def timed_request():
        # One request; returns completion tokens per second (0 if no time elapsed).
        body = json.dumps({
            "model": "m",
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Write a short Python script."}
            ],
            "max_tokens": 100,
            "temperature": 0.0
        }).encode('utf-8')
        request = urllib.request.Request(
            f"{BASE}/v1/chat/completions",
            data=body,
            headers={"Content-Type": "application/json"},
        )
        began = time.time()
        reply = json.loads(urllib.request.urlopen(request, timeout=600).read())
        took = time.time() - began
        tokens = reply.get("usage", {}).get("completion_tokens", 0)
        return tokens / took if took > 0 else 0

    speeds = [timed_request() for _ in range(runs)]
    return sum(speeds)/len(speeds), max(speeds)
def vram():
    """Return the used memory of each GPU as a list of integers.

    The values are parsed from ``nvidia-smi --query-gpu=memory.used`` with
    units suppressed, one entry per output line.

    Returns:
        One int per GPU line, or ``[0, 0]`` when ``nvidia-smi`` cannot be
        run, exits with an error, or prints something that is not a number.
    """
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"]
        ).decode().strip()
        return [int(line.strip()) for line in out.splitlines()]
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: executable missing; SubprocessError: non-zero exit;
        # ValueError: unparsable output. Anything else is a real bug.
        return [0, 0]
# Benchmark every configuration in turn. The server process is terminated in
# a finally clause so it is not left running if vram() or bench() raises.
results = []
for cfg in CONFIGS:
    kill()
    print(f"\n{'='*60}\nTesting: {cfg['name']}\n{'='*60}")
    cmd = BASE_CMD + cfg["extra"]
    proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    try:
        if not check_server(300):
            print(" FAILED TO BOOT (OOM?)")
            results.append({"name": cfg["name"], "status": "BOOT_FAIL"})
            continue
        print(" Server ready! Warming up...")
        # No warm-up request is sent here; the script only pauses briefly.
        time.sleep(2)
        v = vram()
        avg, best = bench(runs=2)
        print(f" >>> AVG: {avg:.2f} t/s | VRAM GPU1: {v[1]}MB")
        results.append({
            "name": cfg["name"], "avg_tps": round(avg,2),
            "vram_gpu1": v[1], "status": "OK"
        })
    finally:
        # Runs on every path (success, continue, exception, Ctrl+C).
        proc.terminate()
kill()
print("\nFINAL RESULTS:")
for r in results:
    if r["status"] == "OK":
        print(f" {r['name']:<25} {r['avg_tps']:>5} t/s | {r['vram_gpu1']:>5}MB")
    else:
        print(f" {r['name']:<25} FAIL (OOM)")

View File

@@ -1,58 +0,0 @@
0|Gemma4-26B MXFP4_MOE|ngl=999 pure-GPU|63.21|63.78|G0:11770|G1:10411|t=6|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
1|Gemma4-26B MXFP4_MOE|compare: cpu-moe|12.92|14.21|G0:3096|G1:3497|t=6|ub=512 b=2048|kv=q4_0/q4_0|cpu-moe
2|Gemma4-26B MXFP4_MOE|t=2|64.1|64.27|G0:11728|G1:10411|t=2|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
3|Gemma4-26B MXFP4_MOE|t=4|64|64.39|G0:11728|G1:10411|t=4|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
4|Gemma4-26B MXFP4_MOE|t=8|63.75|63.9|G0:11728|G1:10411|t=8|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
5|Gemma4-26B MXFP4_MOE|t=10|64.01|64.14|G0:11728|G1:10411|t=10|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
6|Gemma4-26B MXFP4_MOE|t=12|63.86|63.98|G0:11728|G1:10411|t=12|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
7|Gemma4-26B MXFP4_MOE|ub=256 b=1024|63.8|64.12|G0:10504|G1:9619|t=2|ub=256 b=1024|kv=q4_0/q4_0|pure-GPU
8|Gemma4-26B MXFP4_MOE|ub=256 b=2048|63.88|64.04|G0:10504|G1:9619|t=2|ub=256 b=2048|kv=q4_0/q4_0|pure-GPU
9|Gemma4-26B MXFP4_MOE|ub=512 b=4096|63.91|64.18|G0:11728|G1:10411|t=2|ub=512 b=4096|kv=q4_0/q4_0|pure-GPU
10|Gemma4-26B MXFP4_MOE|ub=1024 b=2048|63.86|64.1|G0:10956|G1:9907|t=2|ub=1024 b=2048|kv=q4_0/q4_0|pure-GPU
11|Gemma4-26B MXFP4_MOE|ub=1024 b=4096|63.85|64.06|G0:10956|G1:9907|t=2|ub=1024 b=4096|kv=q4_0/q4_0|pure-GPU
12|Gemma4-26B MXFP4_MOE|kv=q8_0/q8_0|64.14|64.39|G0:10670|G1:10169|t=2|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU
13|Gemma4-26B MXFP4_MOE|kv=q4_0/q8_0|37.52|37.86|G0:10394|G1:9753|t=2|ub=512 b=2048|kv=q4_0/q8_0|pure-GPU
14|Gemma4-26B MXFP4_MOE|kv=f16/f16|63.48|64.31|G0:11700|G1:11667|t=2|ub=512 b=2048|kv=f16/f16|pure-GPU
15|Gemma4-26B MXFP4_MOE|FINAL|64.05|64.29|G0:10667|G1:10169|t=2|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU
16|Gemma4-26B Q4_K_M|ngl=999 pure-GPU|76.01|76.31|G0:11784|G1:10454|t=6|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
17|Gemma4-26B Q4_K_M|compare: cpu-moe|10.19|10.49|G0:2652|G1:2982|t=6|ub=512 b=2048|kv=q4_0/q4_0|cpu-moe
18|Gemma4-26B Q4_K_M|t=2|75.67|75.87|G0:11783|G1:10454|t=2|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
19|Gemma4-26B Q4_K_M|t=4|75.61|75.87|G0:11783|G1:10454|t=4|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
20|Gemma4-26B Q4_K_M|t=8|75.42|75.59|G0:11783|G1:10454|t=8|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
21|Gemma4-26B Q4_K_M|t=10|75.71|75.82|G0:11783|G1:10454|t=10|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
22|Gemma4-26B Q4_K_M|t=12|75.08|75.7|G0:11783|G1:10454|t=12|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
23|Gemma4-26B Q4_K_M|ub=256 b=1024|75.16|75.64|G0:10559|G1:9662|t=6|ub=256 b=1024|kv=q4_0/q4_0|pure-GPU
24|Gemma4-26B Q4_K_M|ub=256 b=2048|75.68|76.05|G0:10559|G1:9662|t=6|ub=256 b=2048|kv=q4_0/q4_0|pure-GPU
25|Gemma4-26B Q4_K_M|ub=512 b=4096|75.92|76.16|G0:11784|G1:10454|t=6|ub=512 b=4096|kv=q4_0/q4_0|pure-GPU
26|Gemma4-26B Q4_K_M|ub=1024 b=2048|75.7|75.9|G0:11012|G1:9950|t=6|ub=1024 b=2048|kv=q4_0/q4_0|pure-GPU
27|Gemma4-26B Q4_K_M|ub=1024 b=4096|75.77|75.99|G0:11011|G1:9950|t=6|ub=1024 b=4096|kv=q4_0/q4_0|pure-GPU
28|Gemma4-26B Q4_K_M|kv=q8_0/q8_0|76.3|76.69|G0:10725|G1:10212|t=6|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU
29|Gemma4-26B Q4_K_M|kv=q4_0/q8_0|42.88|44.58|G0:10439|G1:9796|t=6|ub=512 b=2048|kv=q4_0/q8_0|pure-GPU
30|Gemma4-26B Q4_K_M|kv=f16/f16|76.36|76.78|G0:11761|G1:11710|t=6|ub=512 b=2048|kv=f16/f16|pure-GPU
31|Gemma4-26B Q4_K_M|FINAL|76.4|76.75|G0:11761|G1:11710|t=6|ub=512 b=2048|kv=f16/f16|pure-GPU
32|Qwen3.5-35B MXFP4_MOE|n-cpu-moe=5|51.43|52.07|G0:10365|G1:11152|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
33|Qwen3.5-35B MXFP4_MOE|t=2|43.8|46.4|G0:10365|G1:11152|t=2|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
34|Qwen3.5-35B MXFP4_MOE|t=4|49.21|52.78|G0:10353|G1:11152|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
35|Qwen3.5-35B MXFP4_MOE|t=8|46.43|50.49|G0:10397|G1:11152|t=8|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
36|Qwen3.5-35B MXFP4_MOE|t=10|46.12|50.06|G0:10351|G1:11152|t=10|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
37|Qwen3.5-35B MXFP4_MOE|t=12|45.23|47.1|G0:10337|G1:11152|t=12|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
38|Qwen3.5-35B MXFP4_MOE|ub=256 b=1024|48.9|52.3|G0:9834|G1:10906|t=6|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5
39|Qwen3.5-35B MXFP4_MOE|ub=256 b=2048|49.62|52.52|G0:9833|G1:10906|t=6|ub=256 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
40|Qwen3.5-35B MXFP4_MOE|ub=512 b=4096|48.78|52.14|G0:10337|G1:11152|t=6|ub=512 b=4096|kv=q4_0/q4_0|n-cpu-moe=5
41|Qwen3.5-35B MXFP4_MOE|ub=1024 b=2048|49.95|52.53|G0:11124|G1:11644|t=6|ub=1024 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
42|Qwen3.5-35B MXFP4_MOE|ub=1024 b=4096|48.75|52.06|G0:11123|G1:11644|t=6|ub=1024 b=4096|kv=q4_0/q4_0|n-cpu-moe=5
43|Qwen3.5-35B MXFP4_MOE|kv=q4_0/q8_0|42.81|44.14|G0:10681|G1:11472|t=6|ub=512 b=2048|kv=q4_0/q8_0|n-cpu-moe=5
44|Qwen3.5-35B MXFP4_MOE|FINAL|46.66|47.09|G0:10476|G1:11152|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
45|Qwen3.5-35B Q4_K_M|n-cpu-moe=5|49.01|53.09|G0:10606|G1:11338|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
46|Qwen3.5-35B Q4_K_M|t=2|45.73|47.87|G0:10599|G1:11338|t=2|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
47|Qwen3.5-35B Q4_K_M|t=4|50.98|54.33|G0:10601|G1:11338|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
48|Qwen3.5-35B Q4_K_M|t=8|48.45|52.1|G0:10596|G1:11338|t=8|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
49|Qwen3.5-35B Q4_K_M|t=10|47.83|51.45|G0:10595|G1:11338|t=10|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
50|Qwen3.5-35B Q4_K_M|t=12|43.77|46.79|G0:10589|G1:11338|t=12|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
51|Qwen3.5-35B Q4_K_M|ub=256 b=1024|52.14|53.82|G0:10089|G1:11092|t=4|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5
52|Qwen3.5-35B Q4_K_M|ub=256 b=2048|50.23|53.66|G0:10091|G1:11092|t=4|ub=256 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
53|Qwen3.5-35B Q4_K_M|ub=512 b=2048|49.89|53.89|G0:10595|G1:11338|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
54|Qwen3.5-35B Q4_K_M|ub=512 b=4096|50.4|54.19|G0:10564|G1:11338|t=4|ub=512 b=4096|kv=q4_0/q4_0|n-cpu-moe=5
55|Qwen3.5-35B Q4_K_M|kv=q8_0/q8_0|51.84|53.53|G0:10726|G1:11732|t=4|ub=256 b=1024|kv=q8_0/q8_0|n-cpu-moe=5
56|Qwen3.5-35B Q4_K_M|kv=q4_0/q8_0|43.22|45.99|G0:10410|G1:11412|t=4|ub=256 b=1024|kv=q4_0/q8_0|n-cpu-moe=5
57|Qwen3.5-35B Q4_K_M|FINAL|52.05|54.48|G0:10062|G1:11092|t=4|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

184
scripts/optimal_configs.py Normal file
View File

@@ -0,0 +1,184 @@
# 🏆 2x RTX 3060 (24GB) 최적 추론 설정 — 실측 확정값
# ⚠️ [DEPRECATED] ⚠️
# 이 파일은 참조용으로만 보존됩니다.
# 현재 시스템의 실제 운영 설정(Single Source of Truth)은 `config/engine_models.json`을 참조하세요.
# 테스트 일시: 2026-04-06
# 컨텍스트: 256K (262144)
# 하드웨어: 2x RTX 3060 12GB (Machine A)
## ═══════════════════════════════════════════════════
## 1. Gemma 4 26B-A4B (Q4_K_M) — 74.65 t/s
## ═══════════════════════════════════════════════════
# Measured: AVG 74.65 / BEST 75.07 / MIN 74.27 t/s
# VRAM: ~16.8 GB (ample headroom)
# Previous record: 76.4 t/s (same settings)
# llama-server launch command:
# llama-server --model models\gemma-4-26B-A4B-it-Q4_K_M.gguf \
# -ngl 999 -c 262144 -np 1 -fa on \
# --cache-type-k f16 --cache-type-v f16 \
# -ub 512 -b 2048 -t 6 -tb 6 \
# --prio 3 --mlock --poll 50 \
# --port 8000 --host 0.0.0.0
# Dict form of the launch command above, plus the measured throughput.
GEMMA4_CONFIG = {
"model": "models\\gemma-4-26B-A4B-it-Q4_K_M.gguf",
"ngl": 999,
"context": 262144,
"np": 1,
"fa": True,
"cache_type_k": "f16",
"cache_type_v": "f16",
"ub": 512,
"b": 2048,
"t": 6,
"tb": 6,
"prio": 3,
"mlock": True,
"poll": 50,
"measured_avg_tps": 74.65,
"measured_best_tps": 75.07,
}
## ═══════════════════════════════════════════════════
## 2. Qwen 3.5 35B-A3B (Q4_K_M) — 61.62 t/s
## ═══════════════════════════════════════════════════
# Measured: AVG 61.62 / BEST 62.12 / MIN 61.02 t/s
# VRAM: ~23.0 GB (GPU 0: 12038, GPU 1: 10942 — nearly at the limit)
# Previous record: 64.18 t/s (same settings, average of 3 runs)
# ⚠️ An asymmetric split (0.49/0.51 or below) costs 12+ t/s or crashes
# ⚠️ Do not use UD-IQ4_NL (stability problems)
# llama-server launch command:
# llama-server --model models\Qwen3.5-35B-A3B-Q4_K_M.gguf \
# -ngl 999 -c 262144 -np 1 -fa on \
# --cache-type-k q4_0 --cache-type-v q4_0 \
# -ub 128 -b 512 -t 6 -tb 6 \
# --prio 3 --mlock --poll 50 \
# -ts 0.5,0.5 \
# --port 8000 --host 0.0.0.0
# Dict form of the launch command above, plus the measured throughput.
QWEN35B_CONFIG = {
"model": "models\\Qwen3.5-35B-A3B-Q4_K_M.gguf",
"ngl": 999,
"context": 262144,
"np": 1,
"fa": True,
"cache_type_k": "q4_0",
"cache_type_v": "q4_0",
"ub": 128,
"b": 512,
"t": 6,
"tb": 6,
"prio": 3,
"mlock": True,
"poll": 50,
"tensor_split": "0.5,0.5",
"measured_avg_tps": 61.62,
"measured_best_tps": 62.12,
}
## ═══════════════════════════════════════════════════
## Split test results (Qwen 3.5 35B Q4_K_M)
## ═══════════════════════════════════════════════════
# 0.3 / 0.7 → failed to boot ❌
## ═══════════════════════════════════════════════════
## 3. Deep Tier - dedicated to coding and system design (Gemma 4 31B Q4_K_M)
## ═══════════════════════════════════════════════════
# Test date: 2026-04-07
# Measured: 16.0 t/s (with the extreme 192K context setting)
# Use [Primary Coder]: dedicated to "tasks that demand senior-level engineering ability under constraints", such as complex Python coding, framework architecture design, algorithm optimization and writing mock test cases
# Notes: with the single model fully loaded on 24GB of VRAM it supports up to 192K context (passed with fine-grained ub=128 tuning).
# Relatively tolerant of a missing System Prompt and strong at creative problem solving.
GEMMA4_31B_DEEP_CONFIG = {
"model": "models\\gemma-4-31B-it-Q4_K_M.gguf",
"ngl": 999,
"context": 196608, # 192K Limit
"np": 1,
"fa": True,
"cache_type_k": "q4_0",
"cache_type_v": "q4_0",
"ub": 128,
"b": 512,
"t": 6,
"tb": 6,
"prio": 3,
"mlock": True,
"poll": 50,
"measured_avg_tps": 16.0,
"role_assignment": "Primary Coder & Architect",
}
## ═══════════════════════════════════════════════════
## 4. Deep Tier - complex logic and very large document analysis (Qwen 3.5 27B Q4_K_M)
## ═══════════════════════════════════════════════════
# Test date: 2026-04-07
# Measured: 16.7 t/s (with the extreme 256K context setting)
# Use [Logic Analyst]: composed engineering reasoning under ambiguous or limited information, mathematical problem solving, reading massive documents that fill the whole 256K window and extracting their key rules
# Notes: stably supports the full 256K context (ub=512), coping with extreme memory buffers.
# ⚠️ Caution: API requests must explicitly include a System Prompt ("You are a...") to avoid the refused-response (Empty Response) bug.
QWEN35_27B_DEEP_CONFIG = {
"model": "models\\Qwen3.5-27B-Q4_K_M.gguf",
"ngl": 999,
"context": 262144, # 256K Full
"np": 1,
"fa": True,
"cache_type_k": "q4_0",
"cache_type_v": "q4_0",
"ub": 512,
"b": 1024,
"t": 6,
"tb": 6,
"prio": 3,
"mlock": True,
"poll": 50,
"tensor_split": "0.5,0.5",
"measured_avg_tps": 16.7,
"role_assignment": "Logic Analyst & Huge Context Reader",
}
## ═══════════════════════════════════════════════════
## 5. Qwen 3.5 122B-A10B MoE (Q4_K_M) — 8.95 t/s (best; 8.81 avg)
## ═══════════════════════════════════════════════════
# Test date: 2026-04-07
# Hardware issue background: GPU0 is limited to PCIe 3.0 x4, so using both GPUs (split) causes a severe bottleneck.
# Solution: use GPU1 (Gen4 x16) alone and offload the experts to the CPU.
# Measured: AVG 8.81 / BEST 8.95 t/s
# VRAM: stays at 6.5GB on the single GPU
# Use [Ultra-Heavy Analyst]: highest-difficulty reasoning and agent workflows that need the knowledge pool of up to 122B parameters
QWEN35_122B_MOE_CONFIG = {
"model": "models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf",
"ngl": 999,
"n_cpu_moe": 48, # 16 layers expert on GPU, rest on CPU
"context": 4096, # be careful when enlarging the context: physical memory limit
"np": 1,
"fa": True,
"cache_type_k": "q4_0",
"cache_type_v": "q4_0",
"ub": 512,
"b": 2048,
"t": 8, # matches the number of physical CPU cores
"tb": 8,
"prio": 3,
"poll": 50,
"main_gpu": 1,
"split_mode": "none",
"no_mmap": True,
"measured_avg_tps": 8.81,
"measured_best_tps": 8.95,
"role_assignment": "Ultra-Heavy Reasoning Agent",
}

Binary file not shown.

46
scripts/test_hotswap.py Normal file
View File

@@ -0,0 +1,46 @@
import urllib.request, json, time
import urllib.error  # explicit: HTTPError is caught below

# Manual smoke test for the engine's hot-swap endpoint. It talks to a live
# engine at B, records every failed check in `failures`, and exits with
# status 1 instead of claiming success when any check failed.
B = "http://127.0.0.1:8000"
failures = []

# Test 4: Hot-swap to balanced
print("=== TEST 4: HOT-SWAP fast -> balanced ===")
req = urllib.request.Request(f"{B}/engine/switch/balanced", method="POST")
with urllib.request.urlopen(req, timeout=10) as http_resp:
    r = json.loads(http_resp.read())
print(f" Switch response: {json.dumps(r)}")

# Test 5: During loading, /v1 should return 503
time.sleep(3)
print("\n=== TEST 5: 503 during loading ===")
try:
    p = json.dumps({"model":"m","messages":[{"role":"user","content":"hi"}],"max_tokens":5}).encode()
    urllib.request.urlopen(urllib.request.Request(f"{B}/v1/chat/completions", data=p, headers={"Content-Type":"application/json"}), timeout=5).close()
    print(" ERROR: Got 200 during loading!")
    failures.append("TEST 5: got 200 during loading (expected 503)")
except urllib.error.HTTPError as e:
    print(f" Status: {e.code} (expected 503)")
    retry = e.headers.get("Retry-After", "N/A")
    print(f" Retry-After: {retry}")
    if e.code != 503:
        failures.append(f"TEST 5: got status {e.code} (expected 503)")
except OSError as e:
    # URLError and socket timeouts: record the failure instead of crashing,
    # so the remaining checks still run.
    print(f" ERROR: request failed: {e}")
    failures.append(f"TEST 5: request failed: {e}")

# Wait for switch to complete
print("\n=== Waiting for switch to complete... ===")
for i in range(60):
    try:
        with urllib.request.urlopen(f"{B}/engine/status", timeout=3) as http_resp:
            s = json.loads(http_resp.read())
        print(f" [{i*2}s] state={s['state']}, role={s['role']}")
        if s["state"] == "ready":
            break
    except Exception:
        # The status endpoint may be briefly unreachable mid-switch; keep
        # polling. Ctrl-C still interrupts because this is not a bare except.
        pass
    time.sleep(2)
else:
    # Loop ended without `break`: the engine never reported "ready".
    failures.append("switch did not reach state 'ready' after 60 polls")

# Test 6: Verify new model works
print("\n=== TEST 6: Verify balanced model ===")
p = json.dumps({"model":"m","messages":[{"role":"system","content":"You are helpful."},{"role":"user","content":"Say hello"}],"max_tokens":15,"temperature":0}).encode()
t0 = time.time()
with urllib.request.urlopen(urllib.request.Request(f"{B}/v1/chat/completions", data=p, headers={"Content-Type":"application/json"}), timeout=60) as http_resp:
    r = json.loads(http_resp.read())
dt = time.time() - t0
tk = r.get("usage", {}).get("completion_tokens", 0)
print(f" Speed: {(tk/dt if dt > 0 else 0):.1f} t/s")
with urllib.request.urlopen(f"{B}/engine/status") as http_resp:
    s = json.loads(http_resp.read())
print(f" Current model: {s['display_name']}")

if failures:
    print("\nHOT-SWAP TESTS FAILED:")
    for msg in failures:
        print(f" - {msg}")
    raise SystemExit(1)
print("\nALL HOT-SWAP TESTS PASSED")