feat: Variet Engine v1.0 + 5-model tuning complete
Phase 01 (LLM Tuning): - Gemma4 26B: 74.65 t/s (fast) - Qwen 35B: 61.62 t/s (balanced) - Gemma4 31B: 16.0 t/s (deep-coder) - Qwen 27B: 16.7 t/s (deep-logic) - Qwen 122B: 8.95 t/s (ultra, GPU 1 only) Phase 02 (API Engine): - FastAPI reverse proxy on port 8000 - /engine/switch hot-swap with 503 protection - config/engine_models.json as single source of truth - Replaced 4 individual .bat files with unified engine File cleanup: - scripts/ 85 files -> 9 + _archive/ - Root .bat files -> _archive/
This commit is contained in:
38
scripts/_archive/benchmarks/bench_122b.py
Normal file
38
scripts/_archive/benchmarks/bench_122b.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import urllib.request, json, time, sys
|
||||
# Switch console output to UTF-8 so non-ASCII model output can be printed.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError):
    # stdout has no reconfigure() (replaced or missing stream) or refuses the
    # change; keep whatever encoding is already in place.
    pass

BASE = "http://127.0.0.1:8000"
prompt = "Write a Python function to calculate fibonacci numbers efficiently using memoization. Include type hints and docstring."

# Chat-completions request body: one system + one user message, greedy decoding.
payload = json.dumps({
    "model": "m",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ],
    "max_tokens": 500,
    "temperature": 0.0
}).encode('utf-8')

req = urllib.request.Request(
    f"{BASE}/v1/chat/completions",
    data=payload,
    headers={"Content-Type": "application/json"}
)

print("Sending request...")
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments.
t0 = time.perf_counter()
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(req, timeout=300) as http_resp:
    resp = json.loads(http_resp.read())
dt = time.perf_counter() - t0

u = resp.get("usage", {})
tokens = u.get("completion_tokens", 0)
# Wall-time throughput: the interval covers the whole request, not only decoding.
speed = tokens / dt if dt > 0 else 0

print(f"\n=== 122B Benchmark ===")
print(f"Time: {dt:.1f}s")
print(f"Completion Tokens: {tokens}")
print(f"Speed: {speed:.2f} t/s")
print(f"\n--- Response Preview ---")
# A missing or null "content" is shown as an empty preview instead of crashing.
print((resp["choices"][0]["message"].get("content") or "")[:300])
|
||||
177
scripts/_archive/benchmarks/deep_tier_auto_test.py
Normal file
177
scripts/_archive/benchmarks/deep_tier_auto_test.py
Normal file
@@ -0,0 +1,177 @@
|
||||
import subprocess
|
||||
import time
|
||||
import urllib.request
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Switch console output to UTF-8 so the emoji / non-ASCII text printed by this
# script does not fail on consoles with a narrower default encoding.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError):
    # stdout has no reconfigure() (replaced or missing stream) or refuses the
    # change; keep the existing encoding. Narrowed from a bare except so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    pass
|
||||
|
||||
BASE_URL = "http://127.0.0.1:8000"
|
||||
RESULTS_FILE = "scripts/deep_tier_auto_results.json"
|
||||
|
||||
MODELS = [
|
||||
{
|
||||
"name": "Qwen 27B - 256K (q4_0)",
|
||||
"cmd": [
|
||||
r"llama_bin_run\llama-server.exe",
|
||||
"--model", r"models\Qwen3.5-27B-Q4_K_M.gguf",
|
||||
"-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
|
||||
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
|
||||
"-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
|
||||
"--prio", "3", "--mlock", "--poll", "50", "-ts", "0.5,0.5",
|
||||
"--port", "8000", "--host", "0.0.0.0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Gemma 31B - 32K (q4_0)",
|
||||
"cmd": [
|
||||
r"llama_bin_run\llama-server.exe",
|
||||
"--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
|
||||
"-ngl", "999", "-c", "32768", "-np", "1", "-fa", "on",
|
||||
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
|
||||
"-ub", "256", "-b", "1024", "-t", "6", "-tb", "6",
|
||||
"--prio", "3", "--mlock", "--poll", "50",
|
||||
"--port", "8000", "--host", "0.0.0.0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Gemma 31B - 64K (q4_0)",
|
||||
"cmd": [
|
||||
r"llama_bin_run\llama-server.exe",
|
||||
"--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
|
||||
"-ngl", "999", "-c", "65536", "-np", "1", "-fa", "on",
|
||||
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
|
||||
"-ub", "256", "-b", "1024", "-t", "6", "-tb", "6",
|
||||
"--prio", "3", "--mlock", "--poll", "50",
|
||||
"--port", "8000", "--host", "0.0.0.0"
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
TEST_PROMPTS = [
|
||||
{
|
||||
"id": "code",
|
||||
"prompt": "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code."
|
||||
},
|
||||
{
|
||||
"id": "logical",
|
||||
"prompt": "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step."
|
||||
}
|
||||
]
|
||||
|
||||
def check_server(timeout=300):
    """Poll ``{BASE_URL}/health`` until the server reports that it is up.

    Args:
        timeout: Maximum number of seconds to keep polling. A value <= 0
            returns False without sending any request.

    Returns:
        True as soon as the health endpoint answers with a JSON object whose
        ``status`` is ``"ok"`` or ``"ready"``; False if that does not happen
        before the timeout elapses.
    """
    # Monotonic clock: the deadline is unaffected by system clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            # Context manager closes the HTTP response after each probe.
            with urllib.request.urlopen(req, timeout=2) as http_resp:
                resp = json.loads(http_resp.read())
            if isinstance(resp, dict) and resp.get("status") in ("ok", "ready"):
                return True
        except Exception:
            # Any failed probe (connection refused, HTTP error, bad JSON, ...)
            # just means "not ready yet". Unlike a bare except, this lets
            # KeyboardInterrupt / SystemExit abort the wait.
            pass
        time.sleep(5)
    return False
|
||||
|
||||
def get_vram_usage():
    """Return the per-GPU memory usage lines reported by ``nvidia-smi``.

    Returns:
        A list of strings, one per line of the ``nvidia-smi`` CSV output
        (queried fields: index, memory.used, memory.total). If the command
        cannot be started, fails, or does not finish within 10 seconds, the
        single-element list ``["Failed to get VRAM info"]``.
    """
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader"],
            text=True,
            # Do not let a hung nvidia-smi stall the whole benchmark run.
            timeout=10,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: nvidia-smi missing / not executable; SubprocessError:
        # non-zero exit or timeout; ValueError: undecodable output.
        return ["Failed to get VRAM info"]
    return out.strip().split("\n")
|
||||
|
||||
def ask(prompt, max_tokens=300):
    """Send one chat-completion request and measure its throughput.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Completion token limit placed in the request.

    Returns:
        A dict with:
            "time":   request duration in seconds, rounded to 2 decimals
            "tokens": ``usage.completion_tokens`` from the response (0 if absent)
            "tps":    tokens / time rounded to 2 decimals (0 if time is 0)
            "res":    first 150 characters of the reply followed by "..."

    Raises:
        OSError: the request fails or exceeds the 120 s timeout
            (``urllib.error.URLError`` is an OSError subclass).
        KeyError, IndexError: the response has no ``choices[0].message.content``.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode()
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    # perf_counter is monotonic, so the interval is immune to clock adjustments.
    t0 = time.perf_counter()
    # Context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(req, timeout=120) as http_resp:
        resp = json.loads(http_resp.read())
    dt = time.perf_counter() - t0

    usage = resp.get("usage", {})
    # A null "content" would make the slice below raise TypeError; treat it
    # as an empty reply instead.
    content = resp["choices"][0]["message"]["content"] or ""

    tokens = usage.get("completion_tokens", 0)
    # The interval covers the whole request, so this is a wall-time figure.
    tps = round(tokens / dt, 2) if dt > 0 else 0
    return {"time": round(dt, 2), "tokens": tokens, "tps": tps, "res": content[:150] + "..."}
|
||||
|
||||
def _kill_llama_servers():
    """Force-kill every running llama-server.exe via ``taskkill``; best effort."""
    try:
        # Argument list instead of a shell string: a fixed command needs no shell.
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # taskkill could not be started; the shell-string form it replaces
        # failed silently in that case too.
        pass


def _run_test_prompts():
    """Run every TEST_PROMPTS entry through ask(); map prompt id -> result or error dict."""
    test_data = {}
    for p in TEST_PROMPTS:
        print(f" Testing {p['id']}...", end="", flush=True)
        try:
            res = ask(p["prompt"])
            test_data[p["id"]] = res
            print(f" {res['tps']} t/s")
        except Exception as e:
            test_data[p["id"]] = {"error": str(e)}
            print(f" ERROR: {e}")
    return test_data


def main():
    """Benchmark every configuration in MODELS, one llama-server at a time.

    For each entry the server is started from ``cfg["cmd"]``, given up to
    300 s to pass the health check, probed for VRAM usage, warmed up with one
    short request and then run against every prompt in TEST_PROMPTS. The
    accumulated result list is rewritten to RESULTS_FILE after every model,
    including models that failed to boot, so a failure of the last model is
    not lost. The server is always stopped, even if an exception escapes.
    """
    results = []

    # Kill any existing llama-server
    _kill_llama_servers()
    time.sleep(3)

    for cfg in MODELS:
        print(f"\n[{time.strftime('%H:%M:%S')}] Starting {cfg['name']}...")

        # Start server
        proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        try:
            # Wait for boot
            print(f"Waiting for server to boot (up to 5 mins)...")
            is_ready = check_server(300)

            if is_ready:
                print(f"✅ Server Ready!")
                vram = get_vram_usage()

                # Warmup: best effort, a failure here must not abort the run.
                try:
                    ask("Hello", max_tokens=10)
                except Exception as e:
                    print(f" Warmup failed: {e}")

                results.append({
                    "name": cfg["name"],
                    "status": "Success",
                    "vram": vram,
                    "tests": _run_test_prompts()
                })
            else:
                print(f"❌ Failed to boot {cfg['name']}.")
                results.append({"name": cfg["name"], "status": "Failed to boot (OOM or Crash)"})

            # Save incremental (also after a boot failure)
            with open(RESULTS_FILE, "w", encoding="utf-8") as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

            if is_ready:
                # Shutdown
                print("Shutting down server...")
        finally:
            # Runs on every path, so no server is left behind on an error.
            proc.terminate()
            _kill_llama_servers()
        # Pause before the next model starts (presumably to let the process
        # release the GPU and the port; confirm).
        time.sleep(5)

    print("\n✅ All tests complete!")
    print(f"Results saved to {RESULTS_FILE}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
171
scripts/_archive/benchmarks/deep_tier_extreme_test.py
Normal file
171
scripts/_archive/benchmarks/deep_tier_extreme_test.py
Normal file
@@ -0,0 +1,171 @@
|
||||
import subprocess
|
||||
import time
|
||||
import urllib.request
|
||||
import json
|
||||
import sys
|
||||
|
||||
# Switch console output to UTF-8 so the emoji / non-ASCII text printed by this
# script does not fail on consoles with a narrower default encoding.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError):
    # stdout has no reconfigure() (replaced or missing stream) or refuses the
    # change; keep the existing encoding. Narrowed from a bare except so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    pass
|
||||
|
||||
BASE_URL = "http://127.0.0.1:8000"
|
||||
RESULTS_FILE = "scripts/deep_tier_extreme_results.json"
|
||||
|
||||
MODELS = [
|
||||
{
|
||||
"name": "Qwen 27B - 256K 극한 (q4_0, ub=512)",
|
||||
"cmd": [
|
||||
r"llama_bin_run\llama-server.exe",
|
||||
"--model", r"models\Qwen3.5-27B-Q4_K_M.gguf",
|
||||
"-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
|
||||
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
|
||||
"-ub", "512", "-b", "1024", "-t", "6", "-tb", "6",
|
||||
"--prio", "3", "--mlock", "--poll", "50", "-ts", "0.5,0.5",
|
||||
"--port", "8000", "--host", "0.0.0.0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Gemma 31B - 128K 확장 (q4_0)",
|
||||
"cmd": [
|
||||
r"llama_bin_run\llama-server.exe",
|
||||
"--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
|
||||
"-ngl", "999", "-c", "131072", "-np", "1", "-fa", "on",
|
||||
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
|
||||
"-ub", "256", "-b", "1024", "-t", "6", "-tb", "6",
|
||||
"--prio", "3", "--mlock", "--poll", "50",
|
||||
"--port", "8000", "--host", "0.0.0.0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Gemma 31B - 192K 극한 (q4_0)",
|
||||
"cmd": [
|
||||
r"llama_bin_run\llama-server.exe",
|
||||
"--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
|
||||
"-ngl", "999", "-c", "196608", "-np", "1", "-fa", "on",
|
||||
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
|
||||
"-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
|
||||
"--prio", "3", "--mlock", "--poll", "50",
|
||||
"--port", "8000", "--host", "0.0.0.0"
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
TEST_PROMPTS = [
|
||||
{
|
||||
"id": "code",
|
||||
"prompt": "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code."
|
||||
},
|
||||
{
|
||||
"id": "logical",
|
||||
"prompt": "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step."
|
||||
}
|
||||
]
|
||||
|
||||
def check_server(timeout=300):
    """Poll ``{BASE_URL}/health`` until the server reports that it is up.

    Args:
        timeout: Maximum number of seconds to keep polling. A value <= 0
            returns False without sending any request.

    Returns:
        True as soon as the health endpoint answers with a JSON object whose
        ``status`` is ``"ok"`` or ``"ready"``; False if that does not happen
        before the timeout elapses.
    """
    # Monotonic clock: the deadline is unaffected by system clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            # Context manager closes the HTTP response after each probe.
            with urllib.request.urlopen(req, timeout=2) as http_resp:
                resp = json.loads(http_resp.read())
            if isinstance(resp, dict) and resp.get("status") in ("ok", "ready"):
                return True
        except Exception:
            # Any failed probe (connection refused, HTTP error, bad JSON, ...)
            # just means "not ready yet". Unlike a bare except, this lets
            # KeyboardInterrupt / SystemExit abort the wait.
            pass
        time.sleep(5)
    return False
|
||||
|
||||
def get_vram_usage():
    """Return the per-GPU memory usage lines reported by ``nvidia-smi``.

    Returns:
        A list of strings, one per line of the ``nvidia-smi`` CSV output
        (queried fields: index, memory.used, memory.total). If the command
        cannot be started, fails, or does not finish within 10 seconds, the
        single-element list ``["Failed to get VRAM info"]``.
    """
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader"],
            text=True,
            # Do not let a hung nvidia-smi stall the whole benchmark run.
            timeout=10,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: nvidia-smi missing / not executable; SubprocessError:
        # non-zero exit or timeout; ValueError: undecodable output.
        return ["Failed to get VRAM info"]
    return out.strip().split("\n")
|
||||
|
||||
def ask(prompt, max_tokens=300):
    """Send one chat-completion request and measure its throughput.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Completion token limit placed in the request.

    Returns:
        A dict with:
            "time":   request duration in seconds, rounded to 2 decimals
            "tokens": ``usage.completion_tokens`` from the response (0 if absent)
            "tps":    tokens / time rounded to 2 decimals (0 if time is 0)
            "res":    first 150 characters of the reply followed by "..."

    Raises:
        OSError: the request fails or exceeds the 120 s timeout
            (``urllib.error.URLError`` is an OSError subclass).
        KeyError, IndexError: the response has no ``choices[0].message.content``.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode()
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    # perf_counter is monotonic, so the interval is immune to clock adjustments.
    t0 = time.perf_counter()
    # Context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(req, timeout=120) as http_resp:
        resp = json.loads(http_resp.read())
    dt = time.perf_counter() - t0

    usage = resp.get("usage", {})
    # A null "content" would make the slice below raise TypeError; treat it
    # as an empty reply instead.
    content = resp["choices"][0]["message"]["content"] or ""

    tokens = usage.get("completion_tokens", 0)
    # The interval covers the whole request, so this is a wall-time figure.
    tps = round(tokens / dt, 2) if dt > 0 else 0
    return {"time": round(dt, 2), "tokens": tokens, "tps": tps, "res": content[:150] + "..."}
|
||||
|
||||
def _kill_llama_servers():
    """Force-kill every running llama-server.exe via ``taskkill``; best effort."""
    try:
        # Argument list instead of a shell string: a fixed command needs no shell.
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # taskkill could not be started; the shell-string form it replaces
        # failed silently in that case too.
        pass


def _run_test_prompts():
    """Run every TEST_PROMPTS entry through ask(); map prompt id -> result or error dict."""
    test_data = {}
    for p in TEST_PROMPTS:
        print(f" Testing {p['id']}...", end="", flush=True)
        try:
            res = ask(p["prompt"])
            test_data[p["id"]] = res
            print(f" {res['tps']} t/s")
        except Exception as e:
            test_data[p["id"]] = {"error": str(e)}
            print(f" ERROR: {e}")
    return test_data


def main():
    """Benchmark every large-context configuration in MODELS, one server at a time.

    For each entry the server is started from ``cfg["cmd"]``, given up to
    300 s to pass the health check, probed for VRAM usage (which is also
    printed), warmed up with one short request and then run against every
    prompt in TEST_PROMPTS. The accumulated result list is rewritten to
    RESULTS_FILE after every model, including models that failed to boot, so
    a failure of the last model is not lost. The server is always stopped,
    even if an exception escapes.
    """
    results = []

    # Clean init
    _kill_llama_servers()
    time.sleep(3)

    for cfg in MODELS:
        print(f"\n[{time.strftime('%H:%M:%S')}] Starting {cfg['name']}...")

        proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        try:
            print(f"Waiting for server to boot (up to 5 mins)...")
            is_ready = check_server(300)

            if is_ready:
                print(f"✅ Server Ready!")
                vram = get_vram_usage()
                print(f"VRAM: {vram}")

                # Warmup: best effort, a failure here must not abort the run.
                try:
                    ask("Hello", max_tokens=10)
                except Exception as e:
                    print(f" Warmup failed: {e}")

                results.append({
                    "name": cfg["name"],
                    "status": "Success",
                    "vram": vram,
                    "tests": _run_test_prompts()
                })
            else:
                print(f"❌ Failed to boot {cfg['name']}.")
                results.append({"name": cfg["name"], "status": "Failed to boot (OOM or Crash)"})

            # Persist after every model, boot failures included.
            with open(RESULTS_FILE, "w", encoding="utf-8") as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

            if is_ready:
                print("Shutting down server...")
        finally:
            # Runs on every path, so no server is left behind on an error.
            proc.terminate()
            _kill_llama_servers()
        # Pause before the next model starts (presumably to let the process
        # release the GPU and the port; confirm).
        time.sleep(5)

    print("\n✅ Extreme testing complete! Check results in", RESULTS_FILE)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
88
scripts/_archive/benchmarks/gemma4_test.py
Normal file
88
scripts/_archive/benchmarks/gemma4_test.py
Normal file
@@ -0,0 +1,88 @@
|
||||
"""
|
||||
Gemma 4 26B-A4B Q4_K_M - test to reproduce the 76.4 t/s result
|
||||
Previous best settings: ngl=999 t=6 ub=512 b=2048 ctk=f16 ctv=f16
|
||||
"""
|
||||
import subprocess, time, json, urllib.request, sys, os
|
||||
|
||||
# Switch console output to UTF-8 where the stream supports it.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError):
    # stdout has no reconfigure() or refuses the change; keep the default.
    pass

LLAMA = os.path.join(os.getcwd(), "llama_bin_run", "llama-server.exe")
MODEL = os.path.join(os.getcwd(), "models", "gemma-4-26B-A4B-it-Q4_K_M.gguf")

# Start from a clean slate: stop any llama-server that is already running.
subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
time.sleep(3)

cmd = [
    LLAMA, "--model", MODEL,
    "-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
    "--cache-type-k", "f16", "--cache-type-v", "f16",
    "-ub", "512", "-b", "2048", "-t", "6", "-tb", "6",
    "--prio", "3", "--mlock", "--poll", "50",
    "--port", "8000", "--host", "0.0.0.0",
]


def bench(n):
    """Run one fixed-prompt completion limited to ``n`` tokens.

    Args:
        n: Value sent as ``max_tokens``.

    Returns:
        Tuple ``(tokens_per_second, completion_tokens, elapsed_seconds)``,
        where elapsed time covers the whole HTTP request.

    Raises:
        OSError: the request fails or exceeds the 120 s timeout.
        KeyError: the response carries no ``usage.completion_tokens``.
    """
    payload = json.dumps({"messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}], "max_tokens": n, "temperature": 0}).encode()
    req = urllib.request.Request("http://127.0.0.1:8000/v1/chat/completions", data=payload, headers={"Content-Type": "application/json"})
    t0 = time.time()
    with urllib.request.urlopen(req, timeout=120) as r:
        res = json.loads(r.read())
    el = time.time() - t0
    ct = res["usage"]["completion_tokens"]
    return ct / el, ct, el


print("[1/4] Starting Gemma4 26B Q4_K_M (76.4 t/s config)...")
server = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# Everything that talks to the server runs under try/finally so the server is
# stopped on every exit path (boot failure, benchmark error, Ctrl-C).
try:
    print("[2/4] Waiting for boot...")
    healthy = False
    for sec in range(180):
        time.sleep(1)
        # Stop waiting immediately if the server process has already exited.
        if server.poll() is not None:
            print(f" !! CRASHED (exit code {server.returncode})")
            sys.exit(1)
        try:
            with urllib.request.urlopen("http://127.0.0.1:8000/health", timeout=1) as r:
                if json.loads(r.read()).get("status") == "ok":
                    healthy = True
                    break
        except Exception:
            # Not reachable / not ready yet; keep polling.
            pass
        if sec % 10 == 9:
            print(f" ... {sec+1}s")

    if not healthy:
        print(" FAIL: boot timeout")
        sys.exit(1)

    print(f" OK!")
    try:
        v = subprocess.run(["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5)
        print(f" VRAM: {v.stdout.strip()}")
    except (OSError, subprocess.SubprocessError):
        # VRAM reporting is informational only.
        pass

    # Warm-up request; its result is discarded and a failure is not fatal.
    try:
        bench(10)
    except Exception:
        pass

    print("[3/4] Running 5x benchmark (200 tokens)...")
    results = []
    for i in range(5):
        tps, tok, el = bench(200)
        results.append(tps)
        print(f" Run {i+1}: {tps:.2f} t/s ({tok} tok / {el:.2f}s)")

    avg = sum(results) / len(results)
    best = max(results)
    worst = min(results)
    summary = f"""
==================================================
Gemma4 26B Q4_K_M 5-Run Results:
AVG: {avg:.2f} t/s
BEST: {best:.2f} t/s
MIN: {worst:.2f} t/s
Runs: {[f'{r:.2f}' for r in results]}
==================================================
"""
    print(summary)
    # Explicit encoding so the file does not depend on the console code page.
    with open("scripts/gemma4_test_result.txt", "w", encoding="utf-8") as f:
        f.write(summary)
finally:
    server.kill()
    subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
134
scripts/_archive/benchmarks/llm_judge_test.py
Normal file
134
scripts/_archive/benchmarks/llm_judge_test.py
Normal file
@@ -0,0 +1,134 @@
|
||||
import subprocess
|
||||
import time
|
||||
import urllib.request
|
||||
import json
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
# Switch console output to UTF-8 so the emoji / non-ASCII text printed by this
# script does not fail on consoles with a narrower default encoding.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError):
    # stdout has no reconfigure() (replaced or missing stream) or refuses the
    # change; keep the existing encoding. Narrowed from a bare except so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    pass
|
||||
|
||||
BASE_URL = "http://127.0.0.1:8000"
|
||||
RESULTS_FILE = "scripts/llm_judge_answers.json"
|
||||
|
||||
MODELS = [
|
||||
{
|
||||
"name": "Qwen 27B",
|
||||
"cmd": [
|
||||
r"llama_bin_run\llama-server.exe",
|
||||
"--model", r"models\Qwen3.5-27B-Q4_K_M.gguf",
|
||||
"-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
|
||||
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
|
||||
"-ub", "512", "-b", "1024", "-t", "6", "-tb", "6",
|
||||
"--prio", "3", "--mlock", "--poll", "50", "-ts", "0.5,0.5",
|
||||
"--port", "8000", "--host", "0.0.0.0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Gemma 31B",
|
||||
"cmd": [
|
||||
r"llama_bin_run\llama-server.exe",
|
||||
"--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
|
||||
"-ngl", "999", "-c", "196608", "-np", "1", "-fa", "on",
|
||||
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
|
||||
"-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
|
||||
"--prio", "3", "--mlock", "--poll", "50",
|
||||
"--port", "8000", "--host", "0.0.0.0"
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
QUESTIONS = [
|
||||
{
|
||||
"id": "architecture",
|
||||
"prompt": "수백만 건의 실시간 주식 틱 데이터를 수집하고, 이를 가공하여 초당 수천 명의 클라이언트에게 웹소켓으로 지연 없이 브로드캐스팅하는 시스템을 설계해야 합니다. 언어, 메시지 큐, 데이터베이스, 캐싱 전략 등을 포함해 구체적인 아키텍처를 제안하고, 병목 현상에 대비한 해결책을 설명하세요."
|
||||
},
|
||||
{
|
||||
"id": "logic",
|
||||
"prompt": "논리 문제: 방 안에 5명의 사람(A, B, C, D, E)이 있습니다. A는 B를 제외한 모든 사람과 악수했습니다. B는 C와만 악수했습니다. C는 D와 악수하지 않았습니다. 그렇다면 E는 총 몇 명과 악수했을까요? 당신의 논리적 사고 과정을 한 단계씩 명확히 설명해주세요."
|
||||
},
|
||||
{
|
||||
"id": "coding",
|
||||
"prompt": "파이썬에서 데코레이터를 작성하세요. 이 데코레이터는 함수의 실행을 최대 3번까지 재시도하며, 각 재시도 간에 지수 백오프(Exponential Backoff)를 적용해야 합니다. 로깅 처리가 포함되어야 하며, 어떤 예외 타입(Exception type)이 발생했을 때만 재시도할지 인자로 받을 수 있어야 합니다."
|
||||
}
|
||||
]
|
||||
|
||||
def check_server(timeout=300):
    """Poll ``{BASE_URL}/health`` until the server reports that it is up.

    Args:
        timeout: Maximum number of seconds to keep polling. A value <= 0
            returns False without sending any request.

    Returns:
        True as soon as the health endpoint answers with a JSON object whose
        ``status`` is ``"ok"`` or ``"ready"``; False if that does not happen
        before the timeout elapses.
    """
    # Monotonic clock: the deadline is unaffected by system clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            # Context manager closes the HTTP response after each probe.
            with urllib.request.urlopen(req, timeout=2) as http_resp:
                resp = json.loads(http_resp.read())
            if isinstance(resp, dict) and resp.get("status") in ("ok", "ready"):
                return True
        except Exception:
            # Any failed probe (connection refused, HTTP error, bad JSON, ...)
            # just means "not ready yet". Unlike a bare except, this lets
            # KeyboardInterrupt / SystemExit abort the wait.
            pass
        time.sleep(5)
    return False
|
||||
|
||||
def ask(prompt, max_tokens=-1):
    """Ask the model one question and return the full text of its reply.

    The request carries a fixed system message asking for answers in Korean.

    Args:
        prompt: Text sent as the user message.
        max_tokens: Completion token limit placed in the request. The default
            of -1 is the value this function has always sent (the previous
            default of 4096 was never used because the payload hard-coded
            -1); presumably the server treats -1 as "no limit" (confirm
            against the llama-server documentation). An explicit value, such
            as the warm-up call's 10, is now honoured.

    Returns:
        The ``choices[0].message.content`` value of the response.

    Raises:
        OSError: the request fails or exceeds the 1800 s timeout.
        KeyError, IndexError: the response has no ``choices[0].message.content``.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [
            {"role": "system", "content": "You are a world-class IT system architect and developer. Please output your response in Korean."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode('utf-8')

    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    # Context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(req, timeout=1800) as http_resp:
        resp = json.loads(http_resp.read())
    return resp["choices"][0]["message"]["content"]
|
||||
|
||||
def _kill_llama_servers():
    """Force-kill every running llama-server.exe via ``taskkill``; best effort."""
    try:
        # Argument list instead of a shell string: a fixed command needs no shell.
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # taskkill could not be started; the shell-string form it replaces
        # failed silently in that case too.
        pass


def main():
    """Collect every model's answers to QUESTIONS and write them to RESULTS_FILE.

    For each entry in MODELS the server is started from ``cfg["cmd"]`` and
    given up to 300 s to pass the health check. A model that boots gets one
    warm-up request, then each question; answers (or ``"ERROR: ..."`` strings)
    are stored under ``results[model name][question id]`` and the file is
    rewritten. A model that fails to boot is skipped. In both cases the
    server is force-stopped and the script pauses before the next model.
    """
    results = {}

    _kill_llama_servers()
    time.sleep(3)

    for cfg in MODELS:
        print(f"\n[{time.strftime('%H:%M:%S')}] Booting {cfg['name']}...")
        proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        try:
            if check_server(300):
                print(f"✅ {cfg['name']} is ready! Asking questions...")

                # Warm-up request; its reply is discarded and a failure is not fatal.
                try:
                    ask("Hi", max_tokens=10)
                except Exception:
                    pass

                results[cfg['name']] = {}
                for q in QUESTIONS:
                    print(f" -> Asking: {q['id']}")
                    try:
                        ans = ask(q['prompt'])
                        results[cfg['name']][q['id']] = ans
                        print(" (Done)")
                    except Exception as e:
                        results[cfg['name']][q['id']] = f"ERROR: {e}"
                        print(" (Error)")

                with open(RESULTS_FILE, "w", encoding="utf-8") as f:
                    json.dump(results, f, ensure_ascii=False, indent=2)
            else:
                print(f"Failed to boot {cfg['name']}.")
        finally:
            # Same cleanup on every path. The failed-boot path used to call
            # only proc.terminate(), so the next model could be launched on
            # the same port while the old process was still going down.
            proc.terminate()
            _kill_llama_servers()
        # Pause before the next model starts (presumably to let the process
        # release the GPU and the port; confirm).
        time.sleep(5)

    print("\n✅ All questions answered! Results saved to", RESULTS_FILE)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
169
scripts/_archive/benchmarks/perf_test_122b.py
Normal file
169
scripts/_archive/benchmarks/perf_test_122b.py
Normal file
@@ -0,0 +1,169 @@
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
|
||||
try:
|
||||
sys.stdout.reconfigure(encoding='utf-8')
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
BASE_URL = "http://127.0.0.1:8000"
|
||||
|
||||
def check_server():
    """Check if server is up.

    Sends one GET to ``{BASE_URL}/health`` with a 5 s timeout.

    Returns:
        True if the response is a JSON object whose ``status`` is ``"ok"``;
        False on any request, HTTP or parsing failure.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
    except Exception:
        # Unreachable or not ready yet. Narrowed from a bare except so that
        # Ctrl-C during the long model-load wait is not swallowed.
        return False
    return isinstance(data, dict) and data.get("status") == "ok"
|
||||
|
||||
def check_slots():
    """Fetch the server's ``/slots`` endpoint.

    Returns:
        The decoded JSON body of ``{BASE_URL}/slots``, or None if the request
        or the JSON decoding fails (5 s timeout).
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/slots")
        with urllib.request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read())
    except Exception:
        # Slot info is optional. Narrowed from a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        return None
|
||||
|
||||
def run_benchmark(prompt, max_tokens=300, label="Test"):
    """Run a single benchmark request and return results.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Completion token limit placed in the request.
        label: Free-form name copied into the result.

    Returns:
        A dict with "label", "prompt_tokens", "completion_tokens" (both 0 when
        the response has no usage data), "elapsed" (seconds for the whole
        request), "gen_tps_approx" (completion_tokens / elapsed, 0 when
        elapsed is 0) and "content_preview" (first 150 characters of the reply).

    Raises:
        OSError: the request fails or exceeds the 600 s timeout.
        KeyError, IndexError: the response has no ``choices[0].message``.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    # perf_counter is monotonic, so the interval is immune to clock adjustments.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    # .get("content", "") still returns None when the key is present but null,
    # which would make the preview slice raise TypeError; fall back to "".
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage", {})
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)

    # Approximate: elapsed covers the whole request, not only token generation.
    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:150]
    }
|
||||
|
||||
def main():
    """Benchmark an already-running Qwen3.5 122B-A10B server and print a report.

    Steps: wait up to 600 s for check_server() to succeed (returning early if
    it never does), print the slot count from check_slots() when available,
    send one warm-up request, then run one 300-token request per test prompt
    and print average / min / max approximate tokens-per-second together with
    a verdict against the 10 t/s target. If every run fails, an explicit error
    line is printed instead of ending silently.
    """
    print("=" * 70)
    print(" Qwen3.5 122B-A10B Performance Benchmark")
    print(" Target: 10+ t/s generation speed")
    print("=" * 70)
    print()

    # Wait for server (model loading takes 3-5 min for 71 GB)
    print("[1/4] Waiting for server (122B model load takes 3-5 min)...")
    max_wait = 600  # 10 minutes max
    for i in range(max_wait // 5):
        if check_server():
            print(f" -> Server is ready! (waited {i*5}s)")
            break
        if i % 6 == 0:
            print(f" -> Loading model... ({i*5}s / {max_wait}s)")
        time.sleep(5)
    else:
        # for/else: reached only when the loop finished without a break.
        print(f" -> ERROR: Server not responding after {max_wait}s")
        return

    # Check server info
    print()
    print("[2/4] Checking server status...")
    slots = check_slots()
    if slots:
        print(f" -> Slots available: {len(slots)}")

    # Warmup
    print()
    print("[3/4] Warmup run (short, pre-heating GPU caches)...")
    try:
        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f" -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
        print(f" -> Warmup speed: {warmup['gen_tps_approx']:.2f} t/s (includes prompt eval)")
    except Exception as e:
        print(f" -> Warmup failed: {e}")

    test_prompts = [
        "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.",
        "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.",
        "Describe the complete process of photosynthesis in plants. Include both light-dependent and light-independent reactions.",
        "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.",
        "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.",
    ]
    # One run per prompt; every "N runs" / "x/N" message is derived from this
    # so the counts cannot drift apart from the prompt list.
    total_runs = len(test_prompts)

    # Main benchmark - several runs for statistical reliability
    print()
    print(f"[4/4] Running main benchmark ({total_runs} runs x 300 tokens)...")
    print("-" * 70)

    results = []
    for run_no, prompt in enumerate(test_prompts, start=1):
        print(f"\n Run {run_no}/{total_runs}: {prompt[:50]}...")
        try:
            r = run_benchmark(prompt, max_tokens=300, label=f"Run {run_no}")
            results.append(r)
            print(f" Completion tokens: {r['completion_tokens']}")
            print(f" Total time: {r['elapsed']:.2f}s")
            print(f" Approx speed: {r['gen_tps_approx']:.2f} t/s (includes prompt eval)")
        except Exception as e:
            print(f" ERROR: {e}")

    if not results:
        # Previously the script ended here without any summary at all.
        print()
        print(f" -> ERROR: all {total_runs} benchmark runs failed; no results to summarize")
        return

    print()
    print("=" * 70)
    print(" RESULTS SUMMARY - Qwen3.5 122B-A10B")
    print("=" * 70)
    speeds = [r["gen_tps_approx"] for r in results]
    avg_tps = sum(speeds) / len(speeds)
    max_tps = max(speeds)
    min_tps = min(speeds)
    total_tokens = sum(r["completion_tokens"] for r in results)
    total_time = sum(r["elapsed"] for r in results)

    print(f" Runs completed: {len(results)}/{total_runs}")
    print(f" Total tokens: {total_tokens}")
    print(f" Total time: {total_time:.1f}s")
    print()
    print(f" Approx TPS (avg): {avg_tps:.2f} t/s")
    print(f" Approx TPS (min): {min_tps:.2f} t/s")
    print(f" Approx TPS (max): {max_tps:.2f} t/s")
    print()

    # Verdict
    if avg_tps >= 10:
        print(" ✅ TARGET ACHIEVED: 10+ t/s!")
    elif avg_tps >= 8:
        print(" ⚠️ CLOSE TO TARGET: Consider further tuning")
    else:
        print(f" ❌ BELOW TARGET: {avg_tps:.1f} t/s < 10 t/s")

    print()
    print(" ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.")
    print(" ⚡ Check the server console/log for exact 'eval time' t/s value,")
    print(" ⚡ which shows pure token generation speed (always higher).")
    print("=" * 70)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
241
scripts/_archive/benchmarks/quality_ab_test.py
Normal file
241
scripts/_archive/benchmarks/quality_ab_test.py
Normal file
@@ -0,0 +1,241 @@
|
||||
"""
|
||||
Quality A/B Test — Gemma 4 26B vs Qwen 3.5 35B
|
||||
Quality comparison based on real-world service scenarios
|
||||
"""
|
||||
import urllib.request, json, time, sys, os
|
||||
|
||||
# Switch console output to UTF-8 so non-ASCII text printed by this script does
# not fail on consoles with a narrower default encoding.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError):
    # stdout has no reconfigure() (replaced or missing stream) or refuses the
    # change; keep the existing encoding. Narrowed from a bare except so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    pass
|
||||
|
||||
BASE = "http://127.0.0.1:8000"
|
||||
MODEL_NAME = sys.argv[1] if len(sys.argv) > 1 else "unknown"
|
||||
OUTPUT_FILE = f"scripts/quality_result_{MODEL_NAME}.json"
|
||||
|
||||
SCENARIOS = [
|
||||
# ═══ 1. 코딩 에이전트 (VS Code) ═══
|
||||
{
|
||||
"id": "code_generate",
|
||||
"category": "coding",
|
||||
"name": "Python 함수 생성",
|
||||
"prompt": "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code.",
|
||||
"eval_criteria": ["correctness", "type_hints", "docstring", "edge_cases"]
|
||||
},
|
||||
{
|
||||
"id": "code_debug",
|
||||
"category": "coding",
|
||||
"name": "버그 찾기 & 수정",
|
||||
"prompt": """Find and fix the bug in this code:
|
||||
```python
|
||||
def find_duplicates(arr):
|
||||
seen = {}
|
||||
duplicates = []
|
||||
for item in arr:
|
||||
if item in seen:
|
||||
duplicates.append(item)
|
||||
seen[item] = True
|
||||
return list(set(duplicates))
|
||||
|
||||
# Bug: find_duplicates([1,2,2,3,3,3]) returns [2,3] but
|
||||
# find_duplicates([]) crashes with unexpected behavior
|
||||
# Also it should return count of each duplicate
|
||||
```
|
||||
Fix it to return a dict like {2: 2, 3: 3} (value=count of occurrences).""",
|
||||
"eval_criteria": ["bug_identified", "correct_fix", "clean_code"]
|
||||
},
|
||||
{
|
||||
"id": "code_refactor",
|
||||
"category": "coding",
|
||||
"name": "TypeScript 리팩토링",
|
||||
"prompt": """Refactor this messy TypeScript into clean, typed code:
|
||||
```typescript
|
||||
async function getData(url, retry, timeout) {
|
||||
let result = null
|
||||
for (let i = 0; i < retry; i++) {
|
||||
try {
|
||||
const r = await fetch(url, {signal: AbortSignal.timeout(timeout)})
|
||||
if (r.ok) {
|
||||
result = await r.json()
|
||||
break
|
||||
}
|
||||
} catch(e) {
|
||||
if (i === retry - 1) throw e
|
||||
await new Promise(r => setTimeout(r, 1000 * (i+1)))
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
```
|
||||
Add proper types, error handling, configurable backoff, and make it production-ready.""",
|
||||
"eval_criteria": ["types", "error_handling", "backoff", "production_quality"]
|
||||
},
|
||||
|
||||
# ═══ 2. 개인 비서 (Discord Bot) — 한국어 ═══
|
||||
{
|
||||
"id": "korean_schedule",
|
||||
"category": "assistant_kr",
|
||||
"name": "한국어 일정 관리",
|
||||
"prompt": "내일 오후 2시에 팀 미팅이 있고, 3시에 치과 예약이 있어. 그리고 저녁 7시에 친구랑 홍대에서 만나기로 했어. 이 일정들을 정리해주고, 이동 시간을 고려해서 현실적으로 가능한지 알려줘. 서울 기준으로.",
|
||||
"eval_criteria": ["korean_fluency", "schedule_analysis", "practical_advice"]
|
||||
},
|
||||
{
|
||||
"id": "korean_email",
|
||||
"category": "assistant_kr",
|
||||
"name": "한국어 이메일 요약",
|
||||
"prompt": """다음 이메일을 3줄로 요약하고, 필요한 액션을 정리해줘:
|
||||
|
||||
안녕하세요 김팀장님,
|
||||
|
||||
지난 주 논의했던 Q2 마케팅 예산 관련하여 연락드립니다.
|
||||
본부장님께서 기존 제안 대비 15% 삭감을 요청하셨습니다.
|
||||
이에 따라 디지털 마케팅 채널 중 ROI가 낮은 채널을 우선 정리해야 할 것 같습니다.
|
||||
|
||||
리서치팀에서는 네이버 검색광고 대비 인스타그램 광고의 전환율이
|
||||
0.3%로 가장 낮다는 분석 결과를 공유했습니다.
|
||||
수요일까지 수정안을 제출해야 하니, 화요일 오전까지
|
||||
각 채널별 삭감 우선순위를 정리해서 회신 부탁드립니다.
|
||||
|
||||
감사합니다.
|
||||
마케팅팀 박과장 드림""",
|
||||
"eval_criteria": ["korean_summary", "action_items", "conciseness"]
|
||||
},
|
||||
|
||||
# ═══ 3. MCP 도구 (Function Calling) ═══
|
||||
{
|
||||
"id": "tool_calling",
|
||||
"category": "tool_use",
|
||||
"name": "Function Calling (JSON)",
|
||||
"prompt": """You have access to these tools:
|
||||
- search_web(query: string) -> string
|
||||
- get_calendar(date: string) -> list[Event]
|
||||
- send_email(to: string, subject: string, body: string) -> bool
|
||||
|
||||
User says: "Check my calendar for tomorrow, and if I have a meeting with John, search for the latest quarterly report and email him a summary."
|
||||
|
||||
Respond with the exact sequence of tool calls as JSON array. Use this format:
|
||||
[{"tool": "name", "args": {...}}, ...]""",
|
||||
"eval_criteria": ["correct_sequence", "valid_json", "complete_args"]
|
||||
},
|
||||
{
|
||||
"id": "structured_output",
|
||||
"category": "tool_use",
|
||||
"name": "구조화 출력 (JSON)",
|
||||
"prompt": """Parse this unstructured text into a JSON object:
|
||||
|
||||
"삼성전자가 2026년 1분기 실적을 발표했다. 매출은 79조원으로 전년 동기 대비 12% 증가했고, 영업이익은 15.2조원을 기록했다. 반도체 부문이 전체 이익의 65%를 차지했으며, 특히 HBM4 수요 증가로 인해 메모리 사업부 매출이 전 분기 대비 23% 성장했다."
|
||||
|
||||
Output format:
|
||||
{
|
||||
"company": "",
|
||||
"period": "",
|
||||
"revenue": {"amount": "", "unit": "", "yoy_change": ""},
|
||||
"operating_profit": {"amount": "", "unit": ""},
|
||||
"segments": [{"name": "", "profit_share": "", "highlights": ""}]
|
||||
}""",
|
||||
"eval_criteria": ["correct_parsing", "valid_json", "completeness"]
|
||||
},
|
||||
|
||||
# ═══ 4. 일반 추론 ═══
|
||||
{
|
||||
"id": "reasoning",
|
||||
"category": "reasoning",
|
||||
"name": "논리 추론",
|
||||
"prompt": "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step.",
|
||||
"eval_criteria": ["correct_answer", "clear_steps", "math_accuracy"]
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def ask(prompt, max_tokens=800):
    """Send one user prompt to the chat-completions endpoint and time it.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Generation limit forwarded to the server.

    Returns:
        A dict with:
          "content": text of the first choice's message,
          "tokens":  usage.completion_tokens (0 when the server omits it),
          "time":    wall-clock seconds for the whole round trip, 2 decimals,
          "tps":     tokens / unrounded time, 2 decimals (0 if no time elapsed).

    Raises:
        OSError (including urllib.error.URLError): connection failure, HTTP
            error status, or the 120 s timeout.
        KeyError / IndexError: the response has no choices[0].message.content.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0
    }).encode()
    req = urllib.request.Request(
        f"{BASE}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    t0 = time.time()
    # The context manager closes the HTTP response; previously it was left
    # open until garbage collection.
    with urllib.request.urlopen(req, timeout=120) as http_resp:
        resp = json.loads(http_resp.read())
    dt = time.time() - t0
    usage = resp.get("usage", {})
    tokens = usage.get("completion_tokens", 0)
    content = resp["choices"][0]["message"]["content"]
    return {
        "content": content,
        "tokens": tokens,
        "time": round(dt, 2),
        "tps": round(tokens / dt, 2) if dt > 0 else 0
    }
|
||||
|
||||
|
||||
def main():
    """Run every scenario in SCENARIOS against the server and save the answers.

    Steps: print a banner, verify GET {BASE}/health reports status "ok",
    send one short warmup request, run each scenario through ask(), and dump
    all records to OUTPUT_FILE as JSON.

    Every record carries the same keys whether the request succeeded or
    failed, so downstream evaluation can treat them uniformly. A failed
    scenario is recorded with response "ERROR: ..." and zeroed metrics.

    Exits with status 1 if the server is unreachable, not ready, or the
    warmup request fails.
    """
    print(f"{'='*60}")
    print(f" Quality A/B Test — Model: {MODEL_NAME}")
    print(f" {len(SCENARIOS)} scenarios | {time.strftime('%Y-%m-%d %H:%M')}")
    print(f"{'='*60}\n")

    # Health check
    try:
        req = urllib.request.Request(f"{BASE}/health")
        with urllib.request.urlopen(req, timeout=5) as health:
            status = json.loads(health.read()).get("status")
    except Exception as e:
        print(f"Server not reachable: {e}")
        sys.exit(1)
    if status != "ok":
        print("Server not ready!")
        sys.exit(1)

    # Warmup: a failure here means no scenario can run, so stop with a
    # readable message instead of an uncaught traceback.
    print("Warmup...", flush=True)
    try:
        ask("Hello", max_tokens=10)
    except Exception as e:
        print(f"Warmup failed: {e}")
        sys.exit(1)
    print("Done\n", flush=True)

    results = []
    for i, sc in enumerate(SCENARIOS):
        print(f"[{i+1}/{len(SCENARIOS)}] {sc['category']} — {sc['name']}")
        print(f" Prompt: {sc['prompt'][:80]}...", flush=True)

        record = {
            "id": sc["id"],
            "category": sc["category"],
            "name": sc["name"],
            "model": MODEL_NAME,
        }
        try:
            resp = ask(sc["prompt"])
            print(f" ✅ {resp['tokens']} tokens | {resp['tps']:.1f} t/s | {resp['time']}s")
            print(f" Response preview: {resp['content'][:120]}...\n")
            record.update({
                "response": resp["content"],
                "tokens": resp["tokens"],
                "time": resp["time"],
                "tps": resp["tps"],
            })
        except Exception as e:
            print(f" ❌ Error: {e}\n")
            record.update({
                "response": f"ERROR: {e}",
                "tokens": 0,
                "time": 0,
                "tps": 0,
            })
        # Previously missing from error records; kept last to preserve the
        # key order of successful records.
        record["eval_criteria"] = sc["eval_criteria"]
        results.append(record)

    # Save. Create the target directory first so a missing "scripts/" folder
    # does not discard a finished run.
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\n{'='*60}")
    print(f" Results saved: {OUTPUT_FILE}")
    print(f" Total scenarios: {len(results)}")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()
|
||||
45
scripts/_archive/benchmarks/quick_bench.py
Normal file
45
scripts/_archive/benchmarks/quick_bench.py
Normal file
@@ -0,0 +1,45 @@
|
||||
"""Quick benchmark for running llama-server instance"""
|
||||
import urllib.request, json, time, sys
|
||||
|
||||
BASE = "http://127.0.0.1:8000"
|
||||
RUNS = 5
|
||||
TOKENS = 200
|
||||
|
||||
def bench(max_tokens=TOKENS):
    """Run one fixed counting prompt and measure generation throughput.

    Args:
        max_tokens: Generation limit forwarded to the server.

    Returns:
        Tuple (tokens_per_second, completion_tokens, elapsed_seconds).
        completion_tokens is 0 when the response has no usage data, and the
        rate is 0 if no time elapsed. The elapsed time is the full HTTP round
        trip, so it includes prompt processing as well as generation.

    Raises:
        OSError (including urllib.error.URLError): connection failure, HTTP
            error status, or the 300 s timeout.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user", "content": "Count from 1 to 100, each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0
    }).encode()
    req = urllib.request.Request(
        f"{BASE}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    t0 = time.time()
    # Close the response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(req, timeout=300) as http_resp:
        resp = json.loads(http_resp.read())
    dt = time.time() - t0
    ct = resp.get("usage", {}).get("completion_tokens", 0)
    return ct / dt if dt > 0 else 0, ct, dt
|
||||
|
||||
# One short request first, so the measured runs below do not include
# first-request overhead. If even this fails, stop with exit status 1.
print("Warmup...", flush=True)
try:
    bench(20)
except Exception as warmup_error:
    print(f"Warmup failed: {warmup_error}")
    sys.exit(1)
else:
    print("Warmup done\n", flush=True)

# Measured runs: keep each run's rate for the summary.
speeds = []
for run_no in range(1, RUNS + 1):
    rate, n_tokens, seconds = bench()
    speeds.append(rate)
    print(f" Run {run_no}: {rate:.2f} t/s (tokens={n_tokens}, time={seconds:.2f}s)", flush=True)

# Summary over all runs.
mean_rate = sum(speeds) / len(speeds)
fastest = max(speeds)
slowest = min(speeds)
rule = "=" * 50
print(f"\n{rule}")
print(f" RESULT: AVG {mean_rate:.2f} / BEST {fastest:.2f} / MIN {slowest:.2f} t/s")
print(f"{rule}")
|
||||
31
scripts/_archive/benchmarks/quick_pptest.mjs
Normal file
31
scripts/_archive/benchmarks/quick_pptest.mjs
Normal file
@@ -0,0 +1,31 @@
|
||||
// Quick PP+TG speed test
|
||||
const BASE = "http://127.0.0.1:8000";
|
||||
|
||||
/**
 * Send one chat-completion request and print prompt-processing (PP) and
 * token-generation (TG) statistics on a single line.
 *
 * @param {string} label   Tag printed at the start of the result line.
 * @param {string} prompt  User message content.
 * @param {number} maxTok  max_tokens forwarded to the server.
 * @returns {Promise<void>}
 *
 * Rates: when the response carries a `timings` object with per-phase rates
 * (llama-server reports `prompt_per_second` / `predicted_per_second`), those
 * are printed. Otherwise the rate falls back to tokens / total wall time,
 * which mixes both phases and is only a rough figure.
 * NOTE(review): confirm the proxy behind BASE forwards `timings` unchanged.
 */
async function test(label, prompt, maxTok) {
  const t0 = Date.now();
  const r = await fetch(`${BASE}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ model: "m", messages: [{ role: "user", content: prompt }], max_tokens: maxTok, temperature: 0 }),
    signal: AbortSignal.timeout(600000),
  });
  if (!r.ok) {
    // Report the failure instead of printing a misleading 0-token result.
    console.log(`${label} | HTTP ${r.status} ${r.statusText}`);
    return;
  }
  const d = await r.json();
  const dt = (Date.now() - t0) / 1000;
  const u = d.usage || {};
  const timings = d.timings || {};
  const pp = u.prompt_tokens || 0;
  const tg = u.completion_tokens || 0;

  // Prefer the server-measured per-phase rate; fall back to wall-clock.
  const rate = (count, serverRate) => {
    if (count <= 0) return "?";
    if (typeof serverRate === "number" && Number.isFinite(serverRate)) return serverRate.toFixed(1);
    return (count / dt).toFixed(1);
  };
  const ppSpeed = rate(pp, timings.prompt_per_second);
  const tgSpeed = rate(tg, timings.predicted_per_second);
  console.log(`${label} | PP:${pp}tok ${ppSpeed}t/s | TG:${tg}tok ${tgSpeed}t/s | ${dt.toFixed(1)}s`);
}
|
||||
|
||||
// Prompts: a tiny one, ~3000 characters of filler, and 200 repeated lines of code.
const countPrompt = "Count 1 to 20.";
const fillerPrompt = "x".repeat(3000) + " Summarize above in 3 words.";
const codePrompt =
  Array.from({ length: 200 }, () => "function foo(x) { return x * 2 + Math.random(); }").join("\n") +
  "\n\nRefactor above to arrow functions. Show first 5 lines.";

// [label, prompt, max_tokens] — executed strictly one after another.
const plan = [
  ["warmup", countPrompt, 20],
  ["SHORT", countPrompt, 200],
  ["3K-PP", fillerPrompt, 100],
  ["10K-CODE", codePrompt, 100],
  ["TG-500", countPrompt, 500],
];

for (const [label, prompt, maxTok] of plan) {
  await test(label, prompt, maxTok);
}
console.log("DONE");
|
||||
67
scripts/_archive/benchmarks/qwen_split_challenge.py
Normal file
67
scripts/_archive/benchmarks/qwen_split_challenge.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Use UTF-8 on stdout so the emoji/star characters printed below work on
# consoles with a legacy encoding.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

# Paths are relative to the current working directory.
MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"

# Make sure no previous llama-server instance is still holding the port.
subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True)
time.sleep(2)

cmd = [
    LLAMA_SERVER, "--model", MODEL,
    "-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
    "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
    "-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
    "--prio", "3", "--port", "8000", "--host", "0.0.0.0",
    "-ts", "0.44,0.56"
]

print(f"🚀 Starting Challenge (0.44, 0.56) ...")
proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# Everything after the launch runs under try/finally so the server is
# terminated on every exit path, including Ctrl+C and unexpected errors.
try:
    ready = False
    for i in range(120):
        # Stop waiting as soon as the server process has died.
        if proc.poll() is not None:
            break
        try:
            req = urllib.request.Request("http://127.0.0.1:8000/health")
            with urllib.request.urlopen(req, timeout=1) as r:
                if json.loads(r.read()).get("status") == "ok":
                    ready = True
                    break
        except Exception:
            # Not up yet (refused, still loading, unparsable body): retry.
            pass
        print(f" booting... {i}s", end='\r', flush=True)
        time.sleep(1)

    if not ready:
        print("\n❌ FAILED to boot.")
        sys.exit(1)

    print("\n✅ Booted! Testing 200 tokens...")
    try:
        payload = json.dumps({
            "messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}],
            "max_tokens": 200, "temperature": 0
        }).encode()
        req = urllib.request.Request("http://127.0.0.1:8000/v1/chat/completions", data=payload, headers={"Content-Type": "application/json"})
        t0 = time.time()
        with urllib.request.urlopen(req, timeout=300) as r:
            res = json.loads(r.read())
        el = time.time() - t0
        ct = res["usage"]["completion_tokens"]
        tps = ct / el
        print("="*50)
        print(f"★ 0.44 / 0.56 RESULT: {tps:.2f} t/s ★")
        print(f" Tokens: {ct} | Time: {el:.2f}s")
        print("="*50)
    except Exception as e:
        print(f"\n❌ Benchmark Error: {e}")
finally:
    proc.kill()
|
||||
141
scripts/_archive/benchmarks/qwen_split_test.py
Normal file
141
scripts/_archive/benchmarks/qwen_split_test.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""
|
||||
Qwen 3.5 35B-A3B Q4_K_M - Tensor Split Speed Test (split ratio is taken from the TS constant below)
|
||||
Based on the configuration that reached 64 t/s; only the split ratio is changed
|
||||
"""
|
||||
import subprocess, time, json, urllib.request, sys, os
|
||||
|
||||
PYTHON = sys.executable
|
||||
LLAMA = os.path.join(os.getcwd(), "llama_bin_run", "llama-server.exe")
|
||||
MODEL = os.path.join(os.getcwd(), "models", "Qwen3.5-35B-A3B-Q4_K_M.gguf")
|
||||
TS = "0.55,0.45"
|
||||
|
||||
# 1. Kill any existing server
print("[1/4] Killing existing llama-server...")
subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"],
               stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
time.sleep(3)

# 2. Start server with 64t/s config + custom split
args = [
    LLAMA, "--model", MODEL,
    "-ngl", "999",
    "-c", "262144",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "128",
    "-b", "512",
    "-t", "6",
    "-tb", "6",
    "--prio", "3",
    "--mlock",
    "--poll", "50",
    "--port", "8000",
    "--host", "0.0.0.0",
    "-ts", TS,
]
print(f"[2/4] Starting server with -ts {TS}")
# Only the last six arguments (port, host, tensor split) are echoed.
print(f" CMD: {' '.join(args[-6:])}")
server = subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# 3. Wait for health
print("[3/4] Waiting for server to become healthy...")
t_boot = time.time()
healthy = False
for sec in range(180):  # max 3 min
    time.sleep(1)
    # Check if process crashed
    if server.poll() is not None:
        print(f" !! Server process CRASHED (exit code {server.returncode})")
        sys.exit(1)
    try:
        # Context manager closes each probe response instead of leaking it.
        with urllib.request.urlopen("http://127.0.0.1:8000/health", timeout=1) as r:
            body = json.loads(r.read())
        if body.get("status") == "ok":
            healthy = True
            break
    except Exception:
        # Not reachable / still loading: keep polling.
        pass
    if sec % 10 == 9:
        print(f" ... {sec+1}s elapsed")

if not healthy:
    print(f" FAIL: Server not healthy after 180 seconds")
    server.kill()
    sys.exit(1)

boot_secs = time.time() - t_boot
print(f" OK: Booted in {boot_secs:.1f}s")

# VRAM check (informational only; a missing or slow nvidia-smi must not
# abort the test, but Ctrl+C is no longer swallowed here)
try:
    v = subprocess.run(
        ["nvidia-smi", "--query-gpu=index,memory.used,memory.total",
         "--format=csv,noheader,nounits"],
        capture_output=True, text=True, timeout=5)
    print(f" VRAM: {v.stdout.strip()}")
except Exception as exc:
    print(f" VRAM: unavailable ({exc})")

# 4. Benchmark
print("[4/4] Running token speed benchmark (200 tokens)...")
|
||||
|
||||
def do_bench(max_tok):
    """Request one completion from the local server and time it.

    Args:
        max_tok: Value sent as ``max_tokens``.

    Returns:
        Tuple (tokens_per_second, completion_tokens, elapsed_seconds), where
        the token count comes from the response's ``usage`` block and the
        elapsed time covers the whole HTTP round trip.

    Raises:
        KeyError: the response has no ``usage.completion_tokens``.
        OSError (including urllib.error.URLError): the request failed or
            exceeded the 120 s timeout.
    """
    request_body = {
        "messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}],
        "max_tokens": max_tok,
        "temperature": 0,
    }
    http_request = urllib.request.Request(
        "http://127.0.0.1:8000/v1/chat/completions",
        data=json.dumps(request_body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    started = time.time()
    with urllib.request.urlopen(http_request, timeout=120) as http_response:
        parsed = json.load(http_response)
    seconds = time.time() - started

    generated = parsed["usage"]["completion_tokens"]
    return generated / seconds, generated, seconds
|
||||
|
||||
# The benchmark runs under try/finally so the server started earlier is
# always shut down, even when a run raises or the user presses Ctrl+C.
try:
    # warmup (best effort: a failed warmup is reported, not fatal)
    try:
        do_bench(10)
    except Exception as exc:
        print(f" Warmup failed (continuing): {exc}")

    # real runs - 5 iterations
    print("[4/4] Running 5x benchmark (200 tokens each)...")
    results = []
    for i in range(5):
        tps, tokens, elapsed = do_bench(200)
        results.append(tps)
        # VRAM check per run (informational only)
        try:
            v = subprocess.run(["nvidia-smi", "--query-gpu=index,memory.used,memory.total,memory.free", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5)
            vram_info = v.stdout.strip()
        except Exception:
            vram_info = "?"
        print(f" Run {i+1}: {tps:.2f} t/s ({tokens} tok / {elapsed:.2f}s) | VRAM: {vram_info}")

    avg = sum(results) / len(results)
    best = max(results)
    worst = min(results)
    summary = (
        "\n"
        "==================================================\n"
        f"TS={TS} 5-Run Results (with --mlock --poll 50):\n"
        f"AVG: {avg:.2f} t/s\n"
        f"BEST: {best:.2f} t/s\n"
        f"MIN: {worst:.2f} t/s\n"
        f"Runs: {[f'{r:.2f}' for r in results]}\n"
        "==================================================\n"
    )
    print(summary)
    # Create the folder first so a missing "scripts/" does not lose the result.
    os.makedirs("scripts", exist_ok=True)
    with open("scripts/split_test_result.txt", "w", encoding="utf-8") as f:
        f.write(summary)
finally:
    # cleanup: kill our child, then sweep any remaining instance by name
    server.kill()
    subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"],
                   stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
|
||||
36
scripts/_archive/benchmarks/test_qwen.py
Normal file
36
scripts/_archive/benchmarks/test_qwen.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import urllib.request
|
||||
import json
|
||||
import traceback
|
||||
|
||||
BASE_URL = "http://127.0.0.1:8000"
|
||||
prompt = "수백만 건의 실시간 주식 틱 데이터를 수집하고, 이를 가공하여 초당 수천 명의 클라이언트에게 웹소켓으로 지연 없이 브로드캐스팅하는 시스템을 설계해야 합니다. 언어, 메시지 큐, 데이터베이스, 캐싱 전략 등을 포함해 구체적인 아키텍처를 제안하고, 병목 현상에 대비한 해결책을 설명하세요."
|
||||
|
||||
def test():
    """Send the architecture-design prompt to the server and print the answer.

    Posts a system + user message pair to {BASE_URL}/v1/chat/completions with
    max_tokens=4096 and temperature=0.1, then prints the content of the first
    choice. Any failure (network, timeout, unexpected response shape) is
    caught and printed together with its traceback; nothing is raised.
    """
    try:
        payload = json.dumps({
            "model": "m",
            "messages": [
                {"role": "system", "content": "You are a world-class IT system architect and developer. Please output your response in Korean."},
                {"role": "user", "content": prompt}
            ],
            "max_tokens": 4096,
            "temperature": 0.1
        }).encode('utf-8')

        req = urllib.request.Request(
            f"{BASE_URL}/v1/chat/completions",
            data=payload,
            headers={"Content-Type": "application/json"}
        )
        print("전송 중... (타임아웃 300초)")
        # Close the HTTP response deterministically once the body is read.
        with urllib.request.urlopen(req, timeout=300) as http_resp:
            resp = http_resp.read()
        res_json = json.loads(resp)
        print("\n=== 결과 ===")
        print(res_json["choices"][0]["message"]["content"])
    except Exception as e:
        print("\n=== 에러 발생 ===")
        print(e)
        traceback.print_exc()


if __name__ == "__main__":
    test()
|
||||
84
scripts/_archive/benchmarks/test_split_03_07.mjs
Normal file
84
scripts/_archive/benchmarks/test_split_03_07.mjs
Normal file
@@ -0,0 +1,84 @@
|
||||
import { spawn, execSync } from "child_process";
|
||||
|
||||
// Launches llama-server with a 0.3/0.7 tensor split, waits for /health,
// runs a warmup plus one 200-token request, prints tokens/second, and
// shuts the server down again.
const BASE_URL = "http://127.0.0.1:8000";
const args = [
  "llama_bin_run\\llama-server.exe",
  "--model", "models\\Qwen3.5-35B-A3B-Q4_K_M.gguf",
  "-ngl", "999",
  "-c", "262144",
  "-np", "1",
  "-fa", "on",
  "--cache-type-k", "q4_0",
  "--cache-type-v", "q4_0",
  "-ub", "128",
  "-b", "512",
  "-t", "6",
  "-tb", "6",
  "--prio", "3",
  "--port", "8000",
  "--host", "0.0.0.0",
  "-ts", "0.3,0.7"
];

// Best-effort kill by image name; taskkill exits non-zero when nothing matches.
function killServer() {
  try { execSync('taskkill /F /IM llama-server.exe', { stdio: 'ignore' }); } catch (e) {}
}

const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

console.log(`Starting server with args: \n${args.join(" ")}\n`);
killServer();
await sleep(2000);

const server = spawn(args[0], args.slice(1), { stdio: 'ignore' });

// Track early death of the child. Without an 'error' listener a failed
// spawn (e.g. executable not found) would crash this script.
let serverGone = false;
server.on('error', (err) => {
  serverGone = true;
  console.log(`Failed to start server: ${err.message}`);
});
server.on('exit', () => { serverGone = true; });

let ready = false;
const bootStart = Date.now();
for (let i = 0; i < 60 && !serverGone; i++) {
  try {
    // Bound each probe so one hung connection cannot stall the boot loop.
    const res = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(2000) });
    if (res.status === 200) { ready = true; break; }
  } catch (e) {}
  await sleep(3000);
}

if (!ready) {
  console.log(serverGone
    ? "Server process exited before becoming healthy."
    : "Server failed to boot within 3 mins.");
  killServer();
  process.exit(1);
}

const bootElapsed = (Date.now() - bootStart) / 1000;
console.log(`\n===========================================`);
console.log(`Server booted in ${bootElapsed.toFixed(1)}s.`);

// VRAM usage is informational; ignore a missing nvidia-smi.
try {
  const vram = execSync('nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits', { encoding: 'utf-8' });
  console.log(`VRAM USAGE:\n${vram.trim()}`);
} catch (e) {}
console.log(`===========================================\n`);

// Warmup request; its result is irrelevant, but the body is drained so the
// connection is released before the timed request.
try {
  const warm = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST", headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ messages: [{ role: "user", content: "hi" }], max_tokens: 10, temperature: 0 })
  });
  await warm.arrayBuffer();
} catch (e) {}

console.log("Running speed test (200 tokens)...");
const t0 = Date.now();
try {
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST", headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }], max_tokens: 200, temperature: 0 })
  });
  // An HTTP error must not be reported as a 0 t/s measurement.
  if (!res.ok) throw new Error(`HTTP ${res.status} ${res.statusText}`);
  const result = await res.json();
  const elapsed = (Date.now() - t0) / 1000;
  const ct = result?.usage?.completion_tokens || 0;
  const tps = ct / elapsed;

  console.log(`\n===========================================`);
  console.log(`★ 0.3/0.7 SPLIT RESULT: ${tps.toFixed(2)} t/s ★`);
  console.log(` Tokens: ${ct}`);
  console.log(` Time: ${elapsed.toFixed(2)}s\n===========================================\n`);

} catch (e) {
  console.log("ERROR during benchmark:", e.message);
}

killServer();
process.exit(0);
|
||||
108
scripts/_archive/benchmarks/test_split_03_07.py
Normal file
108
scripts/_archive/benchmarks/test_split_03_07.py
Normal file
@@ -0,0 +1,108 @@
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
|
||||
try:
|
||||
sys.stdout.reconfigure(encoding='utf-8')
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
BASE_URL = "http://127.0.0.1:8000"
|
||||
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
|
||||
MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
|
||||
CONTEXT = 262144
|
||||
|
||||
def kill_server():
    """Force-kill every llama-server.exe process, then wait 3 seconds.

    Best effort: taskkill's exit status is ignored (it is non-zero when no
    such process exists), and a missing ``taskkill`` executable is tolerated.
    Only OSError is suppressed, so Ctrl+C is no longer swallowed here. The
    pause gives the OS time to release the port and GPU memory before a new
    server is started.
    """
    try:
        subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True)
    except OSError:
        # taskkill is unavailable (e.g. not on Windows); nothing to kill.
        pass
    time.sleep(3)
|
||||
|
||||
def run_benchmark(max_tokens=200):
    """Time one chat completion and report generation throughput.

    Args:
        max_tokens: Generation limit forwarded to the server.

    Returns:
        Tuple (tokens_per_second, completion_tokens, elapsed_seconds).
        The token count falls back to 0 when the response has no usage data,
        and the rate is 0 if no time elapsed.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": "Count from 1 to 50, each on new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    http_request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    began = time.time()
    with urllib.request.urlopen(http_request, timeout=300) as http_response:
        parsed = json.load(http_response)
    seconds = time.time() - began

    generated = parsed.get("usage", {}).get("completion_tokens", 0)
    if seconds > 0:
        rate = generated / seconds
    else:
        rate = 0
    return rate, generated, seconds
|
||||
|
||||
def get_vram():
    """Return per-GPU memory usage as reported by nvidia-smi.

    Returns:
        The stripped stdout of ``nvidia-smi --query-gpu=index,memory.used,
        memory.total --format=csv,noheader,nounits``, or the string
        "Unknown" if the tool is missing, times out (5 s), or fails in any
        other ordinary way. Ctrl+C / SystemExit are not suppressed.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        return r.stdout.strip()
    except Exception:
        return "Unknown"
|
||||
|
||||
# Tensor split passed to llama-server via -ts. The printed labels are derived
# from this value so they always match the configuration actually measured
# (the result line previously said "0.3/0.7" while the server ran 0.45,0.55).
TENSOR_SPLIT = "0.45,0.55"

kill_server()

cmd = [
    LLAMA_SERVER, "--model", MODEL,
    "-ngl", "999", "-c", str(CONTEXT), "-np", "1", "-fa", "on",
    "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
    "-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
    "--prio", "3", "--port", "8000", "--host", "0.0.0.0",
    "-ts", TENSOR_SPLIT
]

print(f"Starting server with tensorSplit {TENSOR_SPLIT}")
proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=os.getcwd())

# try/finally guarantees the server is killed on every exit path,
# including Ctrl+C and unexpected exceptions.
try:
    ready = False
    boot_start = time.time()
    for _ in range(30):
        # Stop polling if the server process has already exited.
        if proc.poll() is not None:
            break
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=2) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                ready = True
                break
        except Exception:
            # Not reachable or still loading: retry after the pause below.
            pass
        time.sleep(3)

    if not ready:
        print("Server failed to boot.")
        sys.exit(1)

    boot_time = time.time() - boot_start
    print(f"Booted in {boot_time:.1f}s")
    print(f"VRAM:\n{get_vram()}")

    try:
        print("Warming up...")
        run_benchmark(10)

        print("Benchmarking (200 tokens)...")
        tps, ct, el = run_benchmark(200)
        print("=" * 50)
        print(f"★ {TENSOR_SPLIT.replace(',', '/')} SPLIT RESULT: {tps:.2f} t/s ★")
        print(f" Tokens: {ct} / Time: {el:.2f}s")
        print("=" * 50)
    except Exception as e:
        print(f"Error benchmark: {e}")
finally:
    kill_server()
|
||||
562
scripts/_archive/help_full.txt
Normal file
562
scripts/_archive/help_full.txt
Normal file
@@ -0,0 +1,562 @@
|
||||
----- common params -----
|
||||
|
||||
-h, --help, --usage print usage and exit
|
||||
--version show version and build info
|
||||
--license show source code license and dependencies
|
||||
-cl, --cache-list show list of models in cache
|
||||
--completion-bash print source-able bash completion script for llama.cpp
|
||||
-t, --threads N number of CPU threads to use during generation (default: -1)
|
||||
(env: LLAMA_ARG_THREADS)
|
||||
-tb, --threads-batch N number of threads to use during batch and prompt processing (default:
|
||||
same as --threads)
|
||||
-C, --cpu-mask M CPU affinity mask: arbitrarily long hex. Complements cpu-range
|
||||
(default: "")
|
||||
-Cr, --cpu-range lo-hi range of CPUs for affinity. Complements --cpu-mask
|
||||
--cpu-strict <0|1> use strict CPU placement (default: 0)
|
||||
--prio N set process/thread priority : low(-1), normal(0), medium(1), high(2),
|
||||
realtime(3) (default: 0)
|
||||
--poll <0...100> use polling level to wait for work (0 - no polling, default: 50)
|
||||
-Cb, --cpu-mask-batch M CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch
|
||||
(default: same as --cpu-mask)
|
||||
-Crb, --cpu-range-batch lo-hi ranges of CPUs for affinity. Complements --cpu-mask-batch
|
||||
--cpu-strict-batch <0|1> use strict CPU placement (default: same as --cpu-strict)
|
||||
--prio-batch N set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime
|
||||
(default: 0)
|
||||
--poll-batch <0|1> use polling to wait for work (default: same as --poll)
|
||||
-c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model)
|
||||
(env: LLAMA_ARG_CTX_SIZE)
|
||||
-n, --predict, --n-predict N number of tokens to predict (default: -1, -1 = infinity)
|
||||
(env: LLAMA_ARG_N_PREDICT)
|
||||
-b, --batch-size N logical maximum batch size (default: 2048)
|
||||
(env: LLAMA_ARG_BATCH)
|
||||
-ub, --ubatch-size N physical maximum batch size (default: 512)
|
||||
(env: LLAMA_ARG_UBATCH)
|
||||
--keep N number of tokens to keep from the initial prompt (default: 0, -1 =
|
||||
all)
|
||||
--swa-full use full-size SWA cache (default: false)
|
||||
[(more
|
||||
info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
||||
(env: LLAMA_ARG_SWA_FULL)
|
||||
-fa, --flash-attn [on|off|auto] set Flash Attention use ('on', 'off', or 'auto', default: 'auto')
|
||||
(env: LLAMA_ARG_FLASH_ATTN)
|
||||
--perf, --no-perf whether to enable internal libllama performance timings (default:
|
||||
false)
|
||||
(env: LLAMA_ARG_PERF)
|
||||
-e, --escape, --no-escape whether to process escapes sequences (\n, \r, \t, \', \", \\)
|
||||
(default: true)
|
||||
--rope-scaling {none,linear,yarn} RoPE frequency scaling method, defaults to linear unless specified by
|
||||
the model
|
||||
(env: LLAMA_ARG_ROPE_SCALING_TYPE)
|
||||
--rope-scale N RoPE context scaling factor, expands context by a factor of N
|
||||
(env: LLAMA_ARG_ROPE_SCALE)
|
||||
--rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from
|
||||
model)
|
||||
(env: LLAMA_ARG_ROPE_FREQ_BASE)
|
||||
--rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N
|
||||
(env: LLAMA_ARG_ROPE_FREQ_SCALE)
|
||||
--yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training
|
||||
context size)
|
||||
(env: LLAMA_ARG_YARN_ORIG_CTX)
|
||||
--yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.00, 0.0 = full
|
||||
interpolation)
|
||||
(env: LLAMA_ARG_YARN_EXT_FACTOR)
|
||||
--yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: -1.00)
|
||||
(env: LLAMA_ARG_YARN_ATTN_FACTOR)
|
||||
--yarn-beta-slow N YaRN: high correction dim or alpha (default: -1.00)
|
||||
(env: LLAMA_ARG_YARN_BETA_SLOW)
|
||||
--yarn-beta-fast N YaRN: low correction dim or beta (default: -1.00)
|
||||
(env: LLAMA_ARG_YARN_BETA_FAST)
|
||||
-kvo, --kv-offload, -nkvo, --no-kv-offload
|
||||
whether to enable KV cache offloading (default: enabled)
|
||||
(env: LLAMA_ARG_KV_OFFLOAD)
|
||||
--repack, -nr, --no-repack whether to enable weight repacking (default: enabled)
|
||||
(env: LLAMA_ARG_REPACK)
|
||||
--no-host bypass host buffer allowing extra buffers to be used
|
||||
(env: LLAMA_ARG_NO_HOST)
|
||||
-ctk, --cache-type-k TYPE KV cache data type for K
|
||||
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||
(default: f16)
|
||||
(env: LLAMA_ARG_CACHE_TYPE_K)
|
||||
-ctv, --cache-type-v TYPE KV cache data type for V
|
||||
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||
(default: f16)
|
||||
(env: LLAMA_ARG_CACHE_TYPE_V)
|
||||
-dt, --defrag-thold N KV cache defragmentation threshold (DEPRECATED)
|
||||
(env: LLAMA_ARG_DEFRAG_THOLD)
|
||||
--rpc SERVERS comma separated list of RPC servers (host:port)
|
||||
(env: LLAMA_ARG_RPC)
|
||||
--mlock force system to keep model in RAM rather than swapping or compressing
|
||||
(env: LLAMA_ARG_MLOCK)
|
||||
--mmap, --no-mmap whether to memory-map model. (if mmap disabled, slower load but may
|
||||
reduce pageouts if not using mlock) (default: enabled)
|
||||
(env: LLAMA_ARG_MMAP)
|
||||
-dio, --direct-io, -ndio, --no-direct-io
|
||||
use DirectIO if available. (default: disabled)
|
||||
(env: LLAMA_ARG_DIO)
|
||||
--numa TYPE attempt optimizations that help on some NUMA systems
|
||||
- distribute: spread execution evenly over all nodes
|
||||
- isolate: only spawn threads on CPUs on the node that execution
|
||||
started on
|
||||
- numactl: use the CPU map provided by numactl
|
||||
if run without this previously, it is recommended to drop the system
|
||||
page cache before using this
|
||||
see https://github.com/ggml-org/llama.cpp/issues/1437
|
||||
(env: LLAMA_ARG_NUMA)
|
||||
-dev, --device <dev1,dev2,..> comma-separated list of devices to use for offloading (none = don't
|
||||
offload)
|
||||
use --list-devices to see a list of available devices
|
||||
(env: LLAMA_ARG_DEVICE)
|
||||
--list-devices print list of available devices and exit
|
||||
-ot, --override-tensor <tensor name pattern>=<buffer type>,...
|
||||
override tensor buffer type
|
||||
(env: LLAMA_ARG_OVERRIDE_TENSOR)
|
||||
-cmoe, --cpu-moe keep all Mixture of Experts (MoE) weights in the CPU
|
||||
(env: LLAMA_ARG_CPU_MOE)
|
||||
-ncmoe, --n-cpu-moe N keep the Mixture of Experts (MoE) weights of the first N layers in the
|
||||
CPU
|
||||
(env: LLAMA_ARG_N_CPU_MOE)
|
||||
-ngl, --gpu-layers, --n-gpu-layers N max. number of layers to store in VRAM, either an exact number,
|
||||
'auto', or 'all' (default: auto)
|
||||
(env: LLAMA_ARG_N_GPU_LAYERS)
|
||||
-sm, --split-mode {none,layer,row} how to split the model across multiple GPUs, one of:
|
||||
- none: use one GPU only
|
||||
- layer (default): split layers and KV across GPUs
|
||||
- row: split rows across GPUs
|
||||
(env: LLAMA_ARG_SPLIT_MODE)
|
||||
-ts, --tensor-split N0,N1,N2,... fraction of the model to offload to each GPU, comma-separated list of
|
||||
proportions, e.g. 3,1
|
||||
(env: LLAMA_ARG_TENSOR_SPLIT)
|
||||
-mg, --main-gpu INDEX the GPU to use for the model (with split-mode = none), or for
|
||||
intermediate results and KV (with split-mode = row) (default: 0)
|
||||
(env: LLAMA_ARG_MAIN_GPU)
|
||||
-fit, --fit [on|off] whether to adjust unset arguments to fit in device memory ('on' or
|
||||
'off', default: 'on')
|
||||
(env: LLAMA_ARG_FIT)
|
||||
-fitt, --fit-target MiB0,MiB1,MiB2,...
|
||||
target margin per device for --fit, comma-separated list of values,
|
||||
single value is broadcast across all devices, default: 1024
|
||||
(env: LLAMA_ARG_FIT_TARGET)
|
||||
-fitc, --fit-ctx N minimum ctx size that can be set by --fit option, default: 4096
|
||||
(env: LLAMA_ARG_FIT_CTX)
|
||||
--check-tensors check model tensor data for invalid values (default: false)
|
||||
--override-kv KEY=TYPE:VALUE,... advanced option to override model metadata by key. to specify multiple
|
||||
overrides, use comma-separated values.
|
||||
types: int, float, bool, str. example: --override-kv
|
||||
tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false
|
||||
--op-offload, --no-op-offload whether to offload host tensor operations to device (default: true)
|
||||
--lora FNAME path to LoRA adapter (use comma-separated values to load multiple
|
||||
adapters)
|
||||
--lora-scaled FNAME:SCALE,... path to LoRA adapter with user defined scaling (format:
|
||||
FNAME:SCALE,...)
|
||||
note: use comma-separated values
|
||||
--control-vector FNAME add a control vector
|
||||
note: use comma-separated values to add multiple control vectors
|
||||
--control-vector-scaled FNAME:SCALE,...
|
||||
add a control vector with user defined scaling SCALE
|
||||
note: use comma-separated values (format: FNAME:SCALE,...)
|
||||
--control-vector-layer-range START END
|
||||
layer range to apply the control vector(s) to, start and end inclusive
|
||||
-m, --model FNAME model path to load
|
||||
(env: LLAMA_ARG_MODEL)
|
||||
-mu, --model-url MODEL_URL model download url (default: unused)
|
||||
(env: LLAMA_ARG_MODEL_URL)
|
||||
-dr, --docker-repo [<repo>/]<model>[:quant]
|
||||
Docker Hub model repository. repo is optional, default to ai/. quant
|
||||
is optional, default to :latest.
|
||||
example: gemma3
|
||||
(default: unused)
|
||||
(env: LLAMA_ARG_DOCKER_REPO)
|
||||
-hf, -hfr, --hf-repo <user>/<model>[:quant]
|
||||
Hugging Face model repository; quant is optional, case-insensitive,
|
||||
default to Q4_K_M, or falls back to the first file in the repo if
|
||||
Q4_K_M doesn't exist.
|
||||
mmproj is also downloaded automatically if available. to disable, add
|
||||
--no-mmproj
|
||||
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
|
||||
(default: unused)
|
||||
(env: LLAMA_ARG_HF_REPO)
|
||||
-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]
|
||||
Same as --hf-repo, but for the draft model (default: unused)
|
||||
(env: LLAMA_ARG_HFD_REPO)
|
||||
-hff, --hf-file FILE Hugging Face model file. If specified, it will override the quant in
|
||||
--hf-repo (default: unused)
|
||||
(env: LLAMA_ARG_HF_FILE)
|
||||
-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]
|
||||
Hugging Face model repository for the vocoder model (default: unused)
|
||||
(env: LLAMA_ARG_HF_REPO_V)
|
||||
-hffv, --hf-file-v FILE Hugging Face model file for the vocoder model (default: unused)
|
||||
(env: LLAMA_ARG_HF_FILE_V)
|
||||
-hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment
|
||||
variable)
|
||||
(env: HF_TOKEN)
|
||||
--log-disable Log disable
|
||||
--log-file FNAME Log to file
|
||||
(env: LLAMA_LOG_FILE)
|
||||
--log-colors [on|off|auto] Set colored logging ('on', 'off', or 'auto', default: 'auto')
|
||||
'auto' enables colors when output is to a terminal
|
||||
(env: LLAMA_LOG_COLORS)
|
||||
-v, --verbose, --log-verbose Set verbosity level to infinity (i.e. log all messages, useful for
|
||||
debugging)
|
||||
--offline Offline mode: forces use of cache, prevents network access
|
||||
(env: LLAMA_OFFLINE)
|
||||
-lv, --verbosity, --log-verbosity N Set the verbosity threshold. Messages with a higher verbosity will be
|
||||
ignored. Values:
|
||||
- 0: generic output
|
||||
- 1: error
|
||||
- 2: warning
|
||||
- 3: info
|
||||
- 4: debug
|
||||
(default: 3)
|
||||
|
||||
(env: LLAMA_LOG_VERBOSITY)
|
||||
--log-prefix Enable prefix in log messages
|
||||
(env: LLAMA_LOG_PREFIX)
|
||||
--log-timestamps Enable timestamps in log messages
|
||||
(env: LLAMA_LOG_TIMESTAMPS)
|
||||
-ctkd, --cache-type-k-draft TYPE KV cache data type for K for the draft model
|
||||
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||
(default: f16)
|
||||
(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT)
|
||||
-ctvd, --cache-type-v-draft TYPE KV cache data type for V for the draft model
|
||||
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||
(default: f16)
|
||||
(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT)
|
||||
|
||||
|
||||
----- sampling params -----
|
||||
|
||||
--samplers SAMPLERS samplers that will be used for generation in the order, separated by
|
||||
';'
|
||||
(default:
|
||||
penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature)
|
||||
-s, --seed SEED RNG seed (default: -1, use random seed for -1)
|
||||
--sampler-seq, --sampling-seq SEQUENCE
|
||||
simplified sequence for samplers that will be used (default:
|
||||
edskypmxt)
|
||||
--ignore-eos ignore end of stream token and continue generating (implies
|
||||
--logit-bias EOS-inf)
|
||||
--temp, --temperature N temperature (default: 0.80)
|
||||
--top-k N top-k sampling (default: 40, 0 = disabled)
|
||||
(env: LLAMA_ARG_TOP_K)
|
||||
--top-p N top-p sampling (default: 0.95, 1.0 = disabled)
|
||||
--min-p N min-p sampling (default: 0.05, 0.0 = disabled)
|
||||
--top-nsigma, --top-n-sigma N top-n-sigma sampling (default: -1.00, -1.0 = disabled)
|
||||
--xtc-probability N xtc probability (default: 0.00, 0.0 = disabled)
|
||||
--xtc-threshold N xtc threshold (default: 0.10, 1.0 = disabled)
|
||||
--typical, --typical-p N locally typical sampling, parameter p (default: 1.00, 1.0 = disabled)
|
||||
--repeat-last-n N last n tokens to consider for penalization (default: 64, 0 = disabled, -1
|
||||
= ctx_size)
|
||||
--repeat-penalty N penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled)
|
||||
--presence-penalty N repeat alpha presence penalty (default: 0.00, 0.0 = disabled)
|
||||
--frequency-penalty N repeat alpha frequency penalty (default: 0.00, 0.0 = disabled)
|
||||
--dry-multiplier N set DRY sampling multiplier (default: 0.00, 0.0 = disabled)
|
||||
--dry-base N set DRY sampling base value (default: 1.75)
|
||||
--dry-allowed-length N set allowed length for DRY sampling (default: 2)
|
||||
--dry-penalty-last-n N set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 =
|
||||
context size)
|
||||
--dry-sequence-breaker STRING add sequence breaker for DRY sampling, clearing out default breakers
|
||||
('\n', ':', '"', '*') in the process; use "none" to not use any
|
||||
sequence breakers
|
||||
--adaptive-target N adaptive-p: select tokens near this probability (valid range 0.0 to
|
||||
1.0; negative = disabled) (default: -1.00)
|
||||
[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)
|
||||
--adaptive-decay N adaptive-p: decay rate for target adaptation over time. lower values
|
||||
are more reactive, higher values are more stable.
|
||||
(valid range 0.0 to 0.99) (default: 0.90)
|
||||
--dynatemp-range N dynamic temperature range (default: 0.00, 0.0 = disabled)
|
||||
--dynatemp-exp N dynamic temperature exponent (default: 1.00)
|
||||
--mirostat N use Mirostat sampling.
|
||||
Top K, Nucleus and Locally Typical samplers are ignored if used.
|
||||
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
|
||||
--mirostat-lr N Mirostat learning rate, parameter eta (default: 0.10)
|
||||
--mirostat-ent N Mirostat target entropy, parameter tau (default: 5.00)
|
||||
-l, --logit-bias TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion,
|
||||
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
|
||||
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
|
||||
--grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/
|
||||
dir)
|
||||
--grammar-file FNAME file to read grammar from
|
||||
-j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g.
|
||||
`{}` for any JSON object
|
||||
For schemas w/ external $refs, use --grammar +
|
||||
example/json_schema_to_grammar.py instead
|
||||
-jf, --json-schema-file FILE File containing a JSON schema to constrain generations
|
||||
(https://json-schema.org/), e.g. `{}` for any JSON object
|
||||
For schemas w/ external $refs, use --grammar +
|
||||
example/json_schema_to_grammar.py instead
|
||||
-bs, --backend-sampling enable backend sampling (experimental) (default: disabled)
|
||||
(env: LLAMA_ARG_BACKEND_SAMPLING)
|
||||
|
||||
|
||||
----- example-specific params -----
|
||||
|
||||
-lcs, --lookup-cache-static FNAME path to static lookup cache to use for lookup decoding (not updated by
|
||||
generation)
|
||||
-lcd, --lookup-cache-dynamic FNAME path to dynamic lookup cache to use for lookup decoding (updated by
|
||||
generation)
|
||||
-ctxcp, --ctx-checkpoints, --swa-checkpoints N
|
||||
max number of context checkpoints to create per slot (default:
|
||||
32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
|
||||
(env: LLAMA_ARG_CTX_CHECKPOINTS)
|
||||
-cpent, --checkpoint-every-n-tokens N create a checkpoint every n tokens during prefill (processing), -1 to
|
||||
disable (default: 8192)
|
||||
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT)
|
||||
-cram, --cache-ram N set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 -
|
||||
disable)[(more
|
||||
info)](https://github.com/ggml-org/llama.cpp/pull/16391)
|
||||
(env: LLAMA_ARG_CACHE_RAM)
|
||||
-kvu, --kv-unified, -no-kvu, --no-kv-unified
|
||||
use single unified KV buffer shared across all sequences (default:
|
||||
enabled if number of slots is auto)
|
||||
(env: LLAMA_ARG_KV_UNIFIED)
|
||||
--clear-idle, --no-clear-idle save and clear idle slots on new task (default: enabled, requires
|
||||
unified KV and cache-ram)
|
||||
(env: LLAMA_ARG_CLEAR_IDLE)
|
||||
--context-shift, --no-context-shift whether to use context shift on infinite text generation (default:
|
||||
disabled)
|
||||
(env: LLAMA_ARG_CONTEXT_SHIFT)
|
||||
-r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode
|
||||
-sp, --special special tokens output enabled (default: false)
|
||||
--warmup, --no-warmup whether to perform warmup with an empty run (default: enabled)
|
||||
--spm-infill use Suffix/Prefix/Middle pattern for infill (instead of
|
||||
Prefix/Suffix/Middle) as some models prefer this. (default: disabled)
|
||||
--pooling {none,mean,cls,last,rank} pooling type for embeddings, use model default if unspecified
|
||||
(env: LLAMA_ARG_POOLING)
|
||||
-np, --parallel N number of server slots (default: -1, -1 = auto)
|
||||
(env: LLAMA_ARG_N_PARALLEL)
|
||||
-cb, --cont-batching, -nocb, --no-cont-batching
|
||||
whether to enable continuous batching (a.k.a dynamic batching)
|
||||
(default: enabled)
|
||||
(env: LLAMA_ARG_CONT_BATCHING)
|
||||
-mm, --mmproj FILE path to a multimodal projector file. see tools/mtmd/README.md
|
||||
note: if -hf is used, this argument can be omitted
|
||||
(env: LLAMA_ARG_MMPROJ)
|
||||
-mmu, --mmproj-url URL URL to a multimodal projector file. see tools/mtmd/README.md
|
||||
(env: LLAMA_ARG_MMPROJ_URL)
|
||||
--mmproj-auto, --no-mmproj, --no-mmproj-auto
|
||||
whether to use multimodal projector file (if available), useful when
|
||||
using -hf (default: enabled)
|
||||
(env: LLAMA_ARG_MMPROJ_AUTO)
|
||||
--mmproj-offload, --no-mmproj-offload whether to enable GPU offloading for multimodal projector (default:
|
||||
enabled)
|
||||
(env: LLAMA_ARG_MMPROJ_OFFLOAD)
|
||||
--image-min-tokens N minimum number of tokens each image can take, only used by vision
|
||||
models with dynamic resolution (default: read from model)
|
||||
(env: LLAMA_ARG_IMAGE_MIN_TOKENS)
|
||||
--image-max-tokens N maximum number of tokens each image can take, only used by vision
|
||||
models with dynamic resolution (default: read from model)
|
||||
(env: LLAMA_ARG_IMAGE_MAX_TOKENS)
|
||||
-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...
|
||||
override tensor buffer type for draft model
|
||||
-cmoed, --cpu-moe-draft keep all Mixture of Experts (MoE) weights in the CPU for the draft
|
||||
model
|
||||
(env: LLAMA_ARG_CPU_MOE_DRAFT)
|
||||
-ncmoed, --n-cpu-moe-draft N keep the Mixture of Experts (MoE) weights of the first N layers in the
|
||||
CPU for the draft model
|
||||
(env: LLAMA_ARG_N_CPU_MOE_DRAFT)
|
||||
-a, --alias STRING set model name aliases, comma-separated (to be used by API)
|
||||
(env: LLAMA_ARG_ALIAS)
|
||||
--tags STRING set model tags, comma-separated (informational, not used for routing)
|
||||
(env: LLAMA_ARG_TAGS)
|
||||
--host HOST IP address to listen on, or bind to a UNIX socket if the address ends
|
||||
with .sock (default: 127.0.0.1)
|
||||
(env: LLAMA_ARG_HOST)
|
||||
--port PORT port to listen on (default: 8080)
|
||||
(env: LLAMA_ARG_PORT)
|
||||
--reuse-port allow multiple sockets to bind to the same port (default: disabled)
|
||||
(env: LLAMA_ARG_REUSE_PORT)
|
||||
--path PATH path to serve static files from (default: )
|
||||
(env: LLAMA_ARG_STATIC_PATH)
|
||||
--api-prefix PREFIX prefix path the server serves from, without the trailing slash
|
||||
(default: )
|
||||
(env: LLAMA_ARG_API_PREFIX)
|
||||
--webui-config JSON JSON that provides default WebUI settings (overrides WebUI defaults)
|
||||
(env: LLAMA_ARG_WEBUI_CONFIG)
|
||||
--webui-config-file PATH JSON file that provides default WebUI settings (overrides WebUI
|
||||
defaults)
|
||||
(env: LLAMA_ARG_WEBUI_CONFIG_FILE)
|
||||
--webui-mcp-proxy, --no-webui-mcp-proxy
|
||||
experimental: whether to enable MCP CORS proxy - do not enable in
|
||||
untrusted environments (default: disabled)
|
||||
(env: LLAMA_ARG_WEBUI_MCP_PROXY)
|
||||
--tools TOOL1,TOOL2,... experimental: whether to enable built-in tools for AI agents - do not
|
||||
enable in untrusted environments (default: no tools)
|
||||
specify "all" to enable all tools
|
||||
available tools: read_file, file_glob_search, grep_search,
|
||||
exec_shell_command, write_file, edit_file, apply_diff
|
||||
(env: LLAMA_ARG_TOOLS)
|
||||
--webui, --no-webui whether to enable the Web UI (default: enabled)
|
||||
(env: LLAMA_ARG_WEBUI)
|
||||
--embedding, --embeddings restrict to only support embedding use case; use only with dedicated
|
||||
embedding models (default: disabled)
|
||||
(env: LLAMA_ARG_EMBEDDINGS)
|
||||
--rerank, --reranking enable reranking endpoint on server (default: disabled)
|
||||
(env: LLAMA_ARG_RERANKING)
|
||||
--api-key KEY API key to use for authentication, multiple keys can be provided as a
|
||||
comma-separated list (default: none)
|
||||
(env: LLAMA_API_KEY)
|
||||
--api-key-file FNAME path to file containing API keys (default: none)
|
||||
--ssl-key-file FNAME path to a file containing a PEM-encoded SSL private key
|
||||
(env: LLAMA_ARG_SSL_KEY_FILE)
|
||||
--ssl-cert-file FNAME path to a file containing a PEM-encoded SSL certificate
|
||||
(env: LLAMA_ARG_SSL_CERT_FILE)
|
||||
--chat-template-kwargs STRING sets additional params for the json template parser, must be a valid
|
||||
json object string, e.g. '{"key1":"value1","key2":"value2"}'
|
||||
(env: LLAMA_CHAT_TEMPLATE_KWARGS)
|
||||
-to, --timeout N server read/write timeout in seconds (default: 600)
|
||||
(env: LLAMA_ARG_TIMEOUT)
|
||||
--threads-http N number of threads used to process HTTP requests (default: -1)
|
||||
(env: LLAMA_ARG_THREADS_HTTP)
|
||||
--cache-prompt, --no-cache-prompt whether to enable prompt caching (default: enabled)
|
||||
(env: LLAMA_ARG_CACHE_PROMPT)
|
||||
--cache-reuse N min chunk size to attempt reusing from the cache via KV shifting,
|
||||
requires prompt caching to be enabled (default: 0)
|
||||
[(card)](https://ggml.ai/f0.png)
|
||||
(env: LLAMA_ARG_CACHE_REUSE)
|
||||
--metrics enable prometheus compatible metrics endpoint (default: disabled)
|
||||
(env: LLAMA_ARG_ENDPOINT_METRICS)
|
||||
--props enable changing global properties via POST /props (default: disabled)
|
||||
(env: LLAMA_ARG_ENDPOINT_PROPS)
|
||||
--slots, --no-slots expose slots monitoring endpoint (default: enabled)
|
||||
(env: LLAMA_ARG_ENDPOINT_SLOTS)
|
||||
--slot-save-path PATH path to save slot kv cache (default: disabled)
|
||||
--media-path PATH directory for loading local media files; files can be accessed via
|
||||
file:// URLs using relative paths (default: disabled)
|
||||
--models-dir PATH directory containing models for the router server (default: disabled)
|
||||
(env: LLAMA_ARG_MODELS_DIR)
|
||||
--models-preset PATH path to INI file containing model presets for the router server
|
||||
(default: disabled)
|
||||
(env: LLAMA_ARG_MODELS_PRESET)
|
||||
--models-max N for router server, maximum number of models to load simultaneously
|
||||
(default: 4, 0 = unlimited)
|
||||
(env: LLAMA_ARG_MODELS_MAX)
|
||||
--models-autoload, --no-models-autoload
|
||||
for router server, whether to automatically load models (default:
|
||||
enabled)
|
||||
(env: LLAMA_ARG_MODELS_AUTOLOAD)
|
||||
--jinja, --no-jinja whether to use jinja template engine for chat (default: enabled)
|
||||
(env: LLAMA_ARG_JINJA)
|
||||
--reasoning-format FORMAT controls whether thought tags are allowed and/or extracted from the
|
||||
response, and in which format they're returned; one of:
|
||||
- none: leaves thoughts unparsed in `message.content`
|
||||
- deepseek: puts thoughts in `message.reasoning_content`
|
||||
- deepseek-legacy: keeps `<think>` tags in `message.content` while
|
||||
also populating `message.reasoning_content`
|
||||
(default: auto)
|
||||
(env: LLAMA_ARG_THINK)
|
||||
-rea, --reasoning [on|off|auto] Use reasoning/thinking in the chat ('on', 'off', or 'auto', default:
|
||||
'auto' (detect from template))
|
||||
(env: LLAMA_ARG_REASONING)
|
||||
--reasoning-budget N token budget for thinking: -1 for unrestricted, 0 for immediate end,
|
||||
N>0 for token budget (default: -1)
|
||||
(env: LLAMA_ARG_THINK_BUDGET)
|
||||
--reasoning-budget-message MESSAGE message injected before the end-of-thinking tag when reasoning budget
|
||||
is exhausted (default: none)
|
||||
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE)
|
||||
--chat-template JINJA_TEMPLATE set custom jinja chat template (default: template taken from model's
|
||||
metadata)
|
||||
if suffix/prefix are specified, template will be disabled
|
||||
only commonly used templates are accepted (unless --jinja is set
|
||||
before this flag):
|
||||
list of built-in templates:
|
||||
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml,
|
||||
command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe,
|
||||
exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite,
|
||||
granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2,
|
||||
llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez,
|
||||
minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7,
|
||||
mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3,
|
||||
phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca,
|
||||
yandex, zephyr
|
||||
(env: LLAMA_ARG_CHAT_TEMPLATE)
|
||||
--chat-template-file JINJA_TEMPLATE_FILE
|
||||
set custom jinja chat template file (default: template taken from
|
||||
model's metadata)
|
||||
if suffix/prefix are specified, template will be disabled
|
||||
only commonly used templates are accepted (unless --jinja is set
|
||||
before this flag):
|
||||
list of built-in templates:
|
||||
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml,
|
||||
command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe,
|
||||
exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite,
|
||||
granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2,
|
||||
llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez,
|
||||
minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7,
|
||||
mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3,
|
||||
phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca,
|
||||
yandex, zephyr
|
||||
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE)
|
||||
--skip-chat-parsing, --no-skip-chat-parsing
|
||||
force a pure content parser, even if a Jinja template is specified;
|
||||
model will output everything in the content section, including any
|
||||
reasoning and/or tool calls (default: disabled)
|
||||
(env: LLAMA_ARG_SKIP_CHAT_PARSING)
|
||||
--prefill-assistant, --no-prefill-assistant
|
||||
whether to prefill the assistant's response if the last message is an
|
||||
assistant message (default: prefill enabled)
|
||||
when this flag is set, if the last message is an assistant message
|
||||
then it will be treated as a full message and not prefilled
|
||||
|
||||
(env: LLAMA_ARG_PREFILL_ASSISTANT)
|
||||
-sps, --slot-prompt-similarity SIMILARITY
|
||||
how much the prompt of a request must match the prompt of a slot in
|
||||
order to use that slot (default: 0.10, 0.0 = disabled)
|
||||
--lora-init-without-apply load LoRA adapters without applying them (apply later via POST
|
||||
/lora-adapters) (default: disabled)
|
||||
--sleep-idle-seconds SECONDS number of seconds of idleness after which the server will sleep
|
||||
(default: -1; -1 = disabled)
|
||||
-td, --threads-draft N number of threads to use during generation (default: same as
|
||||
--threads)
|
||||
-tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default:
|
||||
same as --threads-draft)
|
||||
--draft, --draft-n, --draft-max N number of tokens to draft for speculative decoding (default: 16)
|
||||
(env: LLAMA_ARG_DRAFT_MAX)
|
||||
--draft-min, --draft-n-min N minimum number of draft tokens to use for speculative decoding
|
||||
(default: 0)
|
||||
(env: LLAMA_ARG_DRAFT_MIN)
|
||||
--draft-p-min P minimum speculative decoding probability (greedy) (default: 0.75)
|
||||
(env: LLAMA_ARG_DRAFT_P_MIN)
|
||||
-cd, --ctx-size-draft N size of the prompt context for the draft model (default: 0, 0 = loaded
|
||||
from model)
|
||||
(env: LLAMA_ARG_CTX_SIZE_DRAFT)
|
||||
-devd, --device-draft <dev1,dev2,..> comma-separated list of devices to use for offloading the draft model
|
||||
(none = don't offload)
|
||||
use --list-devices to see a list of available devices
|
||||
-ngld, --gpu-layers-draft, --n-gpu-layers-draft N
|
||||
max. number of draft model layers to store in VRAM, either an exact
|
||||
number, 'auto', or 'all' (default: auto)
|
||||
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT)
|
||||
-md, --model-draft FNAME draft model for speculative decoding (default: unused)
|
||||
(env: LLAMA_ARG_MODEL_DRAFT)
|
||||
--spec-replace TARGET DRAFT translate the string in TARGET into DRAFT if the draft model and main
|
||||
model are not compatible
|
||||
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
|
||||
type of speculative decoding to use when no draft model is provided
|
||||
(default: none)
|
||||
|
||||
(env: LLAMA_ARG_SPEC_TYPE)
|
||||
--spec-ngram-size-n N ngram size N for ngram-simple/ngram-map speculative decoding, length
|
||||
of lookup n-gram (default: 12)
|
||||
--spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length
|
||||
of draft m-gram (default: 48)
|
||||
--spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1)
|
||||
-mv, --model-vocoder FNAME vocoder model for audio generation (default: unused)
|
||||
--tts-use-guide-tokens Use guide tokens to improve TTS word recall
|
||||
--embd-gemma-default use default EmbeddingGemma model (note: can download weights from the
|
||||
internet)
|
||||
--fim-qwen-1.5b-default use default Qwen 2.5 Coder 1.5B (note: can download weights from the
|
||||
internet)
|
||||
--fim-qwen-3b-default use default Qwen 2.5 Coder 3B (note: can download weights from the
|
||||
internet)
|
||||
--fim-qwen-7b-default use default Qwen 2.5 Coder 7B (note: can download weights from the
|
||||
internet)
|
||||
--fim-qwen-7b-spec use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can
|
||||
download weights from the internet)
|
||||
--fim-qwen-14b-spec use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note:
|
||||
can download weights from the internet)
|
||||
--fim-qwen-30b-default use default Qwen 3 Coder 30B A3B Instruct (note: can download weights
|
||||
from the internet)
|
||||
--gpt-oss-20b-default use gpt-oss-20b (note: can download weights from the internet)
|
||||
--gpt-oss-120b-default use gpt-oss-120b (note: can download weights from the internet)
|
||||
--vision-gemma-4b-default use Gemma 3 4B QAT (note: can download weights from the internet)
|
||||
--vision-gemma-12b-default use Gemma 3 12B QAT (note: can download weights from the internet)
|
||||
31
scripts/_archive/help_gpu_flags.txt
Normal file
31
scripts/_archive/help_gpu_flags.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
ggml_cuda_init: found 2 CUDA devices (Total VRAM: 24575 MiB):
|
||||
Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes, VRAM: 12287 MiB
|
||||
Device 1: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes, VRAM: 12287 MiB
|
||||
-dev, --device <dev1,dev2,..> comma-separated list of devices to use for offloading (none = don't
|
||||
use --list-devices to see a list of available devices
|
||||
(env: LLAMA_ARG_DEVICE)
|
||||
--list-devices print list of available devices and exit
|
||||
-ot, --override-tensor <tensor name pattern>=<buffer type>,...
|
||||
override tensor buffer type
|
||||
(env: LLAMA_ARG_OVERRIDE_TENSOR)
|
||||
-cmoe, --cpu-moe keep all Mixture of Experts (MoE) weights in the CPU
|
||||
-ncmoe, --n-cpu-moe N keep the Mixture of Experts (MoE) weights of the first N layers in the
|
||||
-sm, --split-mode {none,layer,row} how to split the model across multiple GPUs, one of:
|
||||
- layer (default): split layers and KV across GPUs
|
||||
- row: split rows across GPUs
|
||||
(env: LLAMA_ARG_SPLIT_MODE)
|
||||
-ts, --tensor-split N0,N1,N2,... fraction of the model to offload to each GPU, comma-separated list of
|
||||
(env: LLAMA_ARG_TENSOR_SPLIT)
|
||||
-mg, --main-gpu INDEX the GPU to use for the model (with split-mode = none), or for
|
||||
intermediate results and KV (with split-mode = row) (default: 0)
|
||||
-fit, --fit [on|off] whether to adjust unset arguments to fit in device memory ('on' or
|
||||
target margin per device for --fit, comma-separated list of values,
|
||||
single value is broadcast across all devices, default: 1024
|
||||
--check-tensors check model tensor data for invalid values (default: false)
|
||||
--op-offload, --no-op-offload whether to offload host tensor operations to device (default: true)
|
||||
-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...
|
||||
override tensor buffer type for draft model
|
||||
-cmoed, --cpu-moe-draft keep all Mixture of Experts (MoE) weights in the CPU for the draft
|
||||
-ncmoed, --n-cpu-moe-draft N keep the Mixture of Experts (MoE) weights of the first N layers in the
|
||||
-devd, --device-draft <dev1,dev2,..> comma-separated list of devices to use for offloading the draft model
|
||||
use --list-devices to see a list of available devices
|
||||
5
scripts/_archive/q4km_latest.txt
Normal file
5
scripts/_archive/q4km_latest.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
pure-GPU nommap small | 62.29 | GPU | VRAM:22975 | ub=128 b=512 t=4
|
||||
pure-GPU ts=0.5,0.5 | 63.89 | GPU | VRAM:23002 | ub=128 b=512 t=4
|
||||
tune t=2 | 64.1 | GPU | VRAM:22980 | ub=128 b=512 t=2
|
||||
tune t=6 | 64.18 | GPU | VRAM:22982 | ub=128 b=512 t=6
|
||||
tune t=8 | 63.11 | GPU | VRAM:22980 | ub=128 b=512 t=8
|
||||
24
scripts/_archive/qwen_latest.txt
Normal file
24
scripts/_archive/qwen_latest.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
UD-IQ4_NL | pure-GPU minbatch | 65.11 | GPU | 19177
|
||||
UD-IQ4_NL | pure-GPU nommap small | 65.01 | GPU | 19672
|
||||
UD-IQ4_NL | pure-GPU row-split | 13.65 | GPU | 19427
|
||||
UD-IQ4_NL | pure-GPU ts=0.5,0.5 | 64.92 | GPU | 19664
|
||||
UD-IQ4_NL | pure-GPU all-tricks | 64.72 | GPU | 19171
|
||||
UD-IQ4_NL | tune t=2 | 64.87 | GPU | 19170
|
||||
UD-IQ4_NL | tune t=6 | 64.88 | GPU | 19168
|
||||
UD-IQ4_NL | tune t=8 | 64.5 | GPU | 19168
|
||||
UD-IQ4_NL | tune ub=256 b=1024 | 64.73 | GPU | 20640
|
||||
UD-IQ4_NL | tune ub=256 b=2048 | 63.69 | GPU | 20614
|
||||
UD-IQ4_NL | tune kv=q8_0/q8_0 | 64.78 | GPU | 20422
|
||||
UD-IQ4_NL | tune kv=f16/f16 | 65.53 | GPU | 22812
|
||||
UD-IQ4_NL | FINAL | 66.31 | GPU | 22811
|
||||
MXFP4_MOE | pure-GPU minbatch | 63.06 | GPU | 22747
|
||||
MXFP4_MOE | pure-GPU nommap small | 63.75 | GPU | 22579
|
||||
MXFP4_MOE | pure-GPU ts=0.5,0.5 | 62.88 | GPU | 22578
|
||||
MXFP4_MOE | pure-GPU all-tricks | 62.55 | GPU | 22743
|
||||
MXFP4_MOE | tune t=2 | 63.07 | GPU | 22601
|
||||
MXFP4_MOE | tune t=6 | 63.58 | GPU | 22583
|
||||
MXFP4_MOE | tune t=8 | 62.92 | GPU | 22536
|
||||
MXFP4_MOE | tune ub=256 b=1024 | 62.76 | GPU | 22874
|
||||
MXFP4_MOE | tune ub=256 b=2048 | 62.74 | GPU | 22912
|
||||
MXFP4_MOE | FINAL | 63.71 | GPU | 22566
|
||||
Q4_K_M | pure-GPU nommap small | 62.29 | GPU | 22975
|
||||
47
scripts/_archive/results/122b_final_results.json
Normal file
47
scripts/_archive/results/122b_final_results.json
Normal file
@@ -0,0 +1,47 @@
|
||||
[
|
||||
{
|
||||
"name": "Baseline: all expert CPU",
|
||||
"avg_tps": 8.72,
|
||||
"best_tps": 8.74,
|
||||
"vram_gpu0": 620,
|
||||
"vram_gpu1": 6493,
|
||||
"vram_total": 7113,
|
||||
"status": "OK"
|
||||
},
|
||||
{
|
||||
"name": "n-cpu-moe=60 (4 layers expert GPU)",
|
||||
"avg_tps": 8.72,
|
||||
"best_tps": 8.77,
|
||||
"vram_gpu0": 638,
|
||||
"vram_gpu1": 6493,
|
||||
"vram_total": 7131,
|
||||
"status": "OK"
|
||||
},
|
||||
{
|
||||
"name": "n-cpu-moe=56 (8 layers expert GPU)",
|
||||
"avg_tps": 8.72,
|
||||
"best_tps": 8.8,
|
||||
"vram_gpu0": 624,
|
||||
"vram_gpu1": 6493,
|
||||
"vram_total": 7117,
|
||||
"status": "OK"
|
||||
},
|
||||
{
|
||||
"name": "n-cpu-moe=52 (12 layers expert GPU)",
|
||||
"avg_tps": 8.76,
|
||||
"best_tps": 8.79,
|
||||
"vram_gpu0": 634,
|
||||
"vram_gpu1": 6493,
|
||||
"vram_total": 7127,
|
||||
"status": "OK"
|
||||
},
|
||||
{
|
||||
"name": "n-cpu-moe=48 (16 layers expert GPU)",
|
||||
"avg_tps": 8.81,
|
||||
"best_tps": 8.95,
|
||||
"vram_gpu0": 632,
|
||||
"vram_gpu1": 6493,
|
||||
"vram_total": 7125,
|
||||
"status": "OK"
|
||||
}
|
||||
]
|
||||
52
scripts/_archive/results/122b_gpu1_results.json
Normal file
52
scripts/_archive/results/122b_gpu1_results.json
Normal file
@@ -0,0 +1,52 @@
|
||||
[
|
||||
{
|
||||
"name": "GPU1 only + Expert CPU + 8t",
|
||||
"avg_tps": 8.74,
|
||||
"best_tps": 8.75,
|
||||
"vram_gpu0": 618,
|
||||
"vram_gpu1": 6493,
|
||||
"vram_total": 7111,
|
||||
"pcie": "1, 4 | 4, 16",
|
||||
"status": "OK"
|
||||
},
|
||||
{
|
||||
"name": "GPU1 only + Expert CPU + 16t",
|
||||
"avg_tps": 8.0,
|
||||
"best_tps": 8.02,
|
||||
"vram_gpu0": 619,
|
||||
"vram_gpu1": 6493,
|
||||
"vram_total": 7112,
|
||||
"pcie": "1, 4 | 4, 16",
|
||||
"status": "OK"
|
||||
},
|
||||
{
|
||||
"name": "Both GPU (main=1) + Expert CPU + 8t",
|
||||
"avg_tps": 4.71,
|
||||
"best_tps": 4.75,
|
||||
"vram_gpu0": 4220,
|
||||
"vram_gpu1": 3779,
|
||||
"vram_total": 7999,
|
||||
"pcie": "3, 4 | 4, 16",
|
||||
"status": "OK"
|
||||
},
|
||||
{
|
||||
"name": "Both GPU (ts 0.2,0.8) + Expert CPU + 8t",
|
||||
"avg_tps": 4.53,
|
||||
"best_tps": 4.6,
|
||||
"vram_gpu0": 2666,
|
||||
"vram_gpu1": 5333,
|
||||
"vram_total": 7999,
|
||||
"pcie": "2, 4 | 4, 16",
|
||||
"status": "OK"
|
||||
},
|
||||
{
|
||||
"name": "GPU1 only + Expert CPU + 8t + b4096",
|
||||
"avg_tps": 8.73,
|
||||
"best_tps": 8.77,
|
||||
"vram_gpu0": 615,
|
||||
"vram_gpu1": 6895,
|
||||
"vram_total": 7510,
|
||||
"pcie": "1, 4 | 4, 16",
|
||||
"status": "OK"
|
||||
}
|
||||
]
|
||||
37
scripts/_archive/results/122b_ncpumoe_results.json
Normal file
37
scripts/_archive/results/122b_ncpumoe_results.json
Normal file
@@ -0,0 +1,37 @@
|
||||
[
|
||||
{
|
||||
"name": "n-cpu-moe=64 (all CPU)",
|
||||
"n_cpu_moe": 64,
|
||||
"speed_tps": 4.87,
|
||||
"vram_gpu0": 4257,
|
||||
"vram_gpu1": 3793,
|
||||
"vram_total": 8050,
|
||||
"status": "OK"
|
||||
},
|
||||
{
|
||||
"name": "n-cpu-moe=56 (8 layers GPU expert)",
|
||||
"n_cpu_moe": 56,
|
||||
"speed_tps": 4.78,
|
||||
"vram_gpu0": 4233,
|
||||
"vram_gpu1": 3793,
|
||||
"vram_total": 8026,
|
||||
"status": "OK"
|
||||
},
|
||||
{
|
||||
"name": "n-cpu-moe=48 (16 layers GPU expert)",
|
||||
"n_cpu_moe": 48,
|
||||
"speed_tps": 4.82,
|
||||
"vram_gpu0": 4233,
|
||||
"vram_gpu1": 3793,
|
||||
"vram_total": 8026,
|
||||
"status": "OK"
|
||||
},
|
||||
{
|
||||
"name": "n-cpu-moe=40 (24 layers GPU expert)",
|
||||
"status": "BOOT_FAIL"
|
||||
},
|
||||
{
|
||||
"name": "n-cpu-moe=32 (32 layers GPU expert)",
|
||||
"status": "BOOT_FAIL"
|
||||
}
|
||||
]
|
||||
43
scripts/_archive/results/122b_optimization_results.json
Normal file
43
scripts/_archive/results/122b_optimization_results.json
Normal file
@@ -0,0 +1,43 @@
|
||||
[
|
||||
{
|
||||
"name": "ngl=999 + expert CPU + no-mmap",
|
||||
"ngl": 999,
|
||||
"avg_tps": 4.8,
|
||||
"best_tps": 4.84,
|
||||
"vram_gpu0": 4225,
|
||||
"vram_gpu1": 3779,
|
||||
"vram_total": 8004,
|
||||
"pcie": "3, 4\r | 4, 16",
|
||||
"status": "OK"
|
||||
},
|
||||
{
|
||||
"name": "ngl=10 (pure, no expert override)",
|
||||
"ngl": 10,
|
||||
"avg_tps": 2.52,
|
||||
"best_tps": 2.56,
|
||||
"vram_gpu0": 10309,
|
||||
"vram_gpu1": 5871,
|
||||
"vram_total": 16180,
|
||||
"pcie": "1, 4\r | 1, 16",
|
||||
"status": "OK"
|
||||
},
|
||||
{
|
||||
"name": "ngl=12 (pure)",
|
||||
"ngl": 12,
|
||||
"avg_tps": 2.86,
|
||||
"best_tps": 2.86,
|
||||
"vram_gpu0": 11807,
|
||||
"vram_gpu1": 7377,
|
||||
"vram_total": 19184,
|
||||
"pcie": "2, 4\r | 2, 16",
|
||||
"status": "OK"
|
||||
},
|
||||
{
|
||||
"name": "ngl=14 (pure)",
|
||||
"status": "BOOT_FAIL"
|
||||
},
|
||||
{
|
||||
"name": "ngl=999 + upper expert CPU (blk 32-63)",
|
||||
"status": "BOOT_FAIL"
|
||||
}
|
||||
]
|
||||
68
scripts/_archive/results/deep_tier_auto_results.json
Normal file
68
scripts/_archive/results/deep_tier_auto_results.json
Normal file
@@ -0,0 +1,68 @@
|
||||
[
|
||||
{
|
||||
"name": "Qwen 27B - 256K (q4_0)",
|
||||
"status": "Success",
|
||||
"vram": [
|
||||
"0, 10853 MiB, 12288 MiB",
|
||||
"1, 10951 MiB, 12288 MiB"
|
||||
],
|
||||
"tests": {
|
||||
"code": {
|
||||
"time": 17.89,
|
||||
"tokens": 300,
|
||||
"tps": 16.77,
|
||||
"res": "..."
|
||||
},
|
||||
"logical": {
|
||||
"time": 17.96,
|
||||
"tokens": 300,
|
||||
"tps": 16.71,
|
||||
"res": "..."
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Gemma 31B - 32K (q4_0)",
|
||||
"status": "Success",
|
||||
"vram": [
|
||||
"0, 9834 MiB, 12288 MiB",
|
||||
"1, 9963 MiB, 12288 MiB"
|
||||
],
|
||||
"tests": {
|
||||
"code": {
|
||||
"time": 18.75,
|
||||
"tokens": 300,
|
||||
"tps": 16.0,
|
||||
"res": "```python\nfrom typing import List, Any\n\ndef merge_sorted_lists(list1: List[Any], list2: List[Any]) -> List[Any]:\n \"\"\"\n Merges two sorted lists i..."
|
||||
},
|
||||
"logical": {
|
||||
"time": 18.82,
|
||||
"tokens": 300,
|
||||
"tps": 15.94,
|
||||
"res": "..."
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Gemma 31B - 64K (q4_0)",
|
||||
"status": "Success",
|
||||
"vram": [
|
||||
"0, 10346 MiB, 12288 MiB",
|
||||
"1, 10387 MiB, 12288 MiB"
|
||||
],
|
||||
"tests": {
|
||||
"code": {
|
||||
"time": 18.75,
|
||||
"tokens": 300,
|
||||
"tps": 16.0,
|
||||
"res": "```python\nfrom typing import List, Any\n\ndef merge_sorted_lists(list1: List[Any], list2: List[Any]) -> List[Any]:\n \"\"\"\n Merges two sorted lists i..."
|
||||
},
|
||||
"logical": {
|
||||
"time": 18.83,
|
||||
"tokens": 300,
|
||||
"tps": 15.93,
|
||||
"res": "..."
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
68
scripts/_archive/results/deep_tier_extreme_results.json
Normal file
68
scripts/_archive/results/deep_tier_extreme_results.json
Normal file
@@ -0,0 +1,68 @@
|
||||
[
|
||||
{
|
||||
"name": "Qwen 27B - 256K 극한 (q4_0, ub=512)",
|
||||
"status": "Success",
|
||||
"vram": [
|
||||
"0, 11120 MiB, 12288 MiB",
|
||||
"1, 11081 MiB, 12288 MiB"
|
||||
],
|
||||
"tests": {
|
||||
"code": {
|
||||
"time": 17.97,
|
||||
"tokens": 300,
|
||||
"tps": 16.7,
|
||||
"res": "..."
|
||||
},
|
||||
"logical": {
|
||||
"time": 18.01,
|
||||
"tokens": 300,
|
||||
"tps": 16.65,
|
||||
"res": "..."
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Gemma 31B - 128K 확장 (q4_0)",
|
||||
"status": "Success",
|
||||
"vram": [
|
||||
"0, 11437 MiB, 12288 MiB",
|
||||
"1, 11259 MiB, 12288 MiB"
|
||||
],
|
||||
"tests": {
|
||||
"code": {
|
||||
"time": 18.75,
|
||||
"tokens": 300,
|
||||
"tps": 16.0,
|
||||
"res": "```python\nfrom typing import List, Any\n\ndef merge_sorted_lists(list1: List[Any], list2: List[Any]) -> List[Any]:\n \"\"\"\n Merges two sorted lists i..."
|
||||
},
|
||||
"logical": {
|
||||
"time": 18.79,
|
||||
"tokens": 300,
|
||||
"tps": 15.97,
|
||||
"res": "..."
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Gemma 31B - 192K 극한 (q4_0)",
|
||||
"status": "Success",
|
||||
"vram": [
|
||||
"0, 11888 MiB, 12288 MiB",
|
||||
"1, 11754 MiB, 12288 MiB"
|
||||
],
|
||||
"tests": {
|
||||
"code": {
|
||||
"time": 18.69,
|
||||
"tokens": 300,
|
||||
"tps": 16.05,
|
||||
"res": "```python\nfrom typing import List, Any\n\ndef merge_sorted_lists(list1: List[Any], list2: List[Any]) -> List[Any]:\n \"\"\"\n Merges two sorted lists i..."
|
||||
},
|
||||
"logical": {
|
||||
"time": 18.77,
|
||||
"tokens": 300,
|
||||
"tps": 15.98,
|
||||
"res": "..."
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
1654
scripts/_archive/results/dual_gpu_results.json
Normal file
1654
scripts/_archive/results/dual_gpu_results.json
Normal file
File diff suppressed because it is too large
Load Diff
31
scripts/_archive/results/dual_gpu_summary.txt
Normal file
31
scripts/_archive/results/dual_gpu_summary.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
Dual-GPU Benchmark v2 — 2026-04-06T06:52:08.868Z
|
||||
2x RTX 3060 12GB | 256K Context | 58 configs | 69.4 min
|
||||
|
||||
=======================================================
|
||||
RANKING
|
||||
=======================================================
|
||||
|
||||
🥇 #1: Gemma4-26B Q4_K_M
|
||||
AVG: 76.4 t/s | BEST: 76.75 t/s | Boot: 9s
|
||||
ngl=999 t=6 ub=512 b=2048 ctk=f16 ctv=f16
|
||||
|
||||
🥈 #2: Gemma4-26B MXFP4_MOE
|
||||
AVG: 64.05 t/s | BEST: 64.29 t/s | Boot: 9s
|
||||
ngl=999 t=2 ub=512 b=2048 ctk=q8_0 ctv=q8_0
|
||||
|
||||
🥉 #3: Qwen3.5-35B Q4_K_M
|
||||
AVG: 52.05 t/s | BEST: 54.48 t/s | Boot: 12.1s
|
||||
ngl=999 t=4 ub=256 b=1024 ctk=q4_0 ctv=q4_0
|
||||
--n-cpu-moe 5
|
||||
|
||||
#4: Qwen3.5-35B MXFP4_MOE
|
||||
AVG: 46.66 t/s | BEST: 47.09 t/s | Boot: 15s
|
||||
ngl=999 t=6 ub=512 b=2048 ctk=q4_0 ctv=q4_0
|
||||
--n-cpu-moe 5
|
||||
|
||||
=======================================================
|
||||
★ CHAMPION: Gemma4-26B Q4_K_M — 76.4 t/s
|
||||
=======================================================
|
||||
|
||||
Recommended:
|
||||
llama-server --model models\gemma-4-26B-A4B-it-Q4_K_M.gguf -ngl 999 -c 262144 -t 6 -tb 6 -ub 512 -b 2048 -fa on --cache-type-k f16 --cache-type-v f16 --prio 3 --poll 50 --mlock --port 8000 --host 0.0.0.0
|
||||
8
scripts/_archive/results/gemma4_test_result.txt
Normal file
8
scripts/_archive/results/gemma4_test_result.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
==================================================
|
||||
Gemma4 26B Q4_K_M 5-Run Results:
|
||||
AVG: 74.65 t/s
|
||||
BEST: 75.07 t/s
|
||||
MIN: 74.27 t/s
|
||||
Runs: ['74.59', '74.68', '74.65', '75.07', '74.27']
|
||||
==================================================
|
||||
12
scripts/_archive/results/llm_judge_answers.json
Normal file
12
scripts/_archive/results/llm_judge_answers.json
Normal file
File diff suppressed because one or more lines are too long
124
scripts/_archive/results/quality_result_gemma4.json
Normal file
124
scripts/_archive/results/quality_result_gemma4.json
Normal file
@@ -0,0 +1,124 @@
|
||||
[
|
||||
{
|
||||
"id": "code_generate",
|
||||
"category": "coding",
|
||||
"name": "Python 함수 생성",
|
||||
"model": "gemma4",
|
||||
"response": "```python\nfrom typing import List\n\ndef merge_sorted",
|
||||
"tokens": 800,
|
||||
"time": 11.21,
|
||||
"tps": 71.34,
|
||||
"eval_criteria": [
|
||||
"correctness",
|
||||
"type_hints",
|
||||
"docstring",
|
||||
"edge_cases"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "code_debug",
|
||||
"category": "coding",
|
||||
"name": "버그 찾기 & 수정",
|
||||
"model": "gemma4",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 11.2,
|
||||
"tps": 71.4,
|
||||
"eval_criteria": [
|
||||
"bug_identified",
|
||||
"correct_fix",
|
||||
"clean_code"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "code_refactor",
|
||||
"category": "coding",
|
||||
"name": "TypeScript 리팩토링",
|
||||
"model": "gemma4",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 11.23,
|
||||
"tps": 71.26,
|
||||
"eval_criteria": [
|
||||
"types",
|
||||
"error_handling",
|
||||
"backoff",
|
||||
"production_quality"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "korean_schedule",
|
||||
"category": "assistant_kr",
|
||||
"name": "한국어 일정 관리",
|
||||
"model": "gemma4",
|
||||
"response": "요청하신 내일 일정을 정리하고, 서울 시내 이동 시간을 고려하여 현실적인 가능성을 분석해",
|
||||
"tokens": 800,
|
||||
"time": 11.2,
|
||||
"tps": 71.43,
|
||||
"eval_criteria": [
|
||||
"korean_fluency",
|
||||
"schedule_analysis",
|
||||
"practical_advice"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "korean_email",
|
||||
"category": "assistant_kr",
|
||||
"name": "한국어 이메일 요약",
|
||||
"model": "gemma4",
|
||||
"response": "요청하신 내용을 다음과 같이 요약 및 정리해 드립니다.\n\n**[3줄 요약]**\n1. 본부장님 지시로 Q2 마케팅 예산이 기존 대비 15% 삭감되었습니다.\n2. 이에 따라 ROI가 낮은 채널(인스타그램 등)을 중심으로 예산 조정이 필요합니다.\n3. 수요일 수정안 제출을 위해 채널별 삭감 우선순위 결정이 시급합니다.\n\n**[필요 액션]**\n* **채널별 삭감 우선순위 정리 및 회신** (기한: **화요일 오전까지**)",
|
||||
"tokens": 686,
|
||||
"time": 9.67,
|
||||
"tps": 70.95,
|
||||
"eval_criteria": [
|
||||
"korean_summary",
|
||||
"action_items",
|
||||
"conciseness"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "tool_calling",
|
||||
"category": "tool_use",
|
||||
"name": "Function Calling (JSON)",
|
||||
"model": "gemma4",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 11.19,
|
||||
"tps": 71.49,
|
||||
"eval_criteria": [
|
||||
"correct_sequence",
|
||||
"valid_json",
|
||||
"complete_args"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "structured_output",
|
||||
"category": "tool_use",
|
||||
"name": "구조화 출력 (JSON)",
|
||||
"model": "gemma4",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 11.41,
|
||||
"tps": 70.12,
|
||||
"eval_criteria": [
|
||||
"correct_parsing",
|
||||
"valid_json",
|
||||
"completeness"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "reasoning",
|
||||
"category": "reasoning",
|
||||
"name": "논리 추론",
|
||||
"model": "gemma4",
|
||||
"response": "To find the amount of wheat produced by each field, we can use algebra. We will express the production of all fields in terms of one variable.\n\n### Step 1: Define the variables\nLet **$B$** be the amount of wheat produced by **Field B**.\n\nBased on the problem description, we can define the other fields relative to $B$:\n* **Field A** produces 20% more than Field B:\n $A = B + 0.20B = 1.2B$\n* **Field C** produces ",
|
||||
"tokens": 800,
|
||||
"time": 11.2,
|
||||
"tps": 71.44,
|
||||
"eval_criteria": [
|
||||
"correct_answer",
|
||||
"clear_steps",
|
||||
"math_accuracy"
|
||||
]
|
||||
}
|
||||
]
|
||||
124
scripts/_archive/results/quality_result_qwen27b.json
Normal file
124
scripts/_archive/results/quality_result_qwen27b.json
Normal file
@@ -0,0 +1,124 @@
|
||||
[
|
||||
{
|
||||
"id": "code_generate",
|
||||
"category": "coding",
|
||||
"name": "Python 함수 생성",
|
||||
"model": "qwen27b",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 47.56,
|
||||
"tps": 16.82,
|
||||
"eval_criteria": [
|
||||
"correctness",
|
||||
"type_hints",
|
||||
"docstring",
|
||||
"edge_cases"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "code_debug",
|
||||
"category": "coding",
|
||||
"name": "버그 찾기 & 수정",
|
||||
"model": "qwen27b",
|
||||
"response": "# Fixed Code\n\n```python\ndef find_duplicates(arr):\n count = {}\n for item in arr:\n count[item] = count.get(item, 0) + 1\n \n # Filter to only include items that appear more than once\n duplicates = {item: count for item, count in count.items() if count > 1}\n return duplicates\n```\n\n## What Was Fixed\n\n1. **Changed return type**: Now returns a dictionary `{value: count}` instead of a list\n2. **Fixed counting logic**: Counts all occurrences of each item, not just marks as seen\n3. **Handles empty lists**: Returns `{}` instead of crashing\n4. **Removed unnecessary set conversion**: No need to convert to set since we're returning a dict\n\n## Test Cases\n\n```python\nprint(find_duplicates([1, 2, 2, 3, 3, 3])) # {2: 2, 3: 3}\nprint(find_duplicates([])) # {}\nprint(find_duplicates([1, 1, 1])) # {1: 3}\nprint(find_duplicates([1, 2, 3])) # {}\nprint(find_duplicates([5])) # {}\n```\n\n## Alternative (More Pythonic)\n\n```python\nfrom collections import Counter\n\ndef find_duplicates(arr):\n count = Counter(arr)\n return {item: count for item, count in count.items() if count > 1}\n```\n\nThis version uses `Counter` from the `collections`",
|
||||
"tokens": 800,
|
||||
"time": 48.01,
|
||||
"tps": 16.66,
|
||||
"eval_criteria": [
|
||||
"bug_identified",
|
||||
"correct_fix",
|
||||
"clean_code"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "code_refactor",
|
||||
"category": "coding",
|
||||
"name": "TypeScript 리팩토링",
|
||||
"model": "qwen27b",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 47.99,
|
||||
"tps": 16.67,
|
||||
"eval_criteria": [
|
||||
"types",
|
||||
"error_handling",
|
||||
"backoff",
|
||||
"production_quality"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "korean_schedule",
|
||||
"category": "assistant_kr",
|
||||
"name": "한국어 일정 관리",
|
||||
"model": "qwen27b",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 47.75,
|
||||
"tps": 16.75,
|
||||
"eval_criteria": [
|
||||
"korean_fluency",
|
||||
"schedule_analysis",
|
||||
"practical_advice"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "korean_email",
|
||||
"category": "assistant_kr",
|
||||
"name": "한국어 이메일 요약",
|
||||
"model": "qwen27b",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 48.05,
|
||||
"tps": 16.65,
|
||||
"eval_criteria": [
|
||||
"korean_summary",
|
||||
"action_items",
|
||||
"conciseness"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "tool_calling",
|
||||
"category": "tool_use",
|
||||
"name": "Function Calling (JSON)",
|
||||
"model": "qwen27b",
|
||||
"response": "[{\"tool\": \"get_calendar\", \"args\": {\"date\": \"tomorrow\"}}, {\"tool\": \"search_web\", \"args\": {\"query\": \"latest quarterly report\"}}, {\"tool\": \"send_email\", \"args\": {\"to\": \"john@example.com\", \"subject\": \"Quarterly Report Summary\", \"body\": \"Summary of the latest quarterly report attached for your review.\"}}]",
|
||||
"tokens": 719,
|
||||
"time": 43.06,
|
||||
"tps": 16.7,
|
||||
"eval_criteria": [
|
||||
"correct_sequence",
|
||||
"valid_json",
|
||||
"complete_args"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "structured_output",
|
||||
"category": "tool_use",
|
||||
"name": "구조화 출력 (JSON)",
|
||||
"model": "qwen27b",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 48.01,
|
||||
"tps": 16.66,
|
||||
"eval_criteria": [
|
||||
"correct_parsing",
|
||||
"valid_json",
|
||||
"completeness"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "reasoning",
|
||||
"category": "reasoning",
|
||||
"name": "논리 추론",
|
||||
"model": "qwen27b",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 47.67,
|
||||
"tps": 16.78,
|
||||
"eval_criteria": [
|
||||
"correct_answer",
|
||||
"clear_steps",
|
||||
"math_accuracy"
|
||||
]
|
||||
}
|
||||
]
|
||||
124
scripts/_archive/results/quality_result_qwen35b.json
Normal file
124
scripts/_archive/results/quality_result_qwen35b.json
Normal file
@@ -0,0 +1,124 @@
|
||||
[
|
||||
{
|
||||
"id": "code_generate",
|
||||
"category": "coding",
|
||||
"name": "Python 함수 생성",
|
||||
"model": "qwen35b",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 12.7,
|
||||
"tps": 62.97,
|
||||
"eval_criteria": [
|
||||
"correctness",
|
||||
"type_hints",
|
||||
"docstring",
|
||||
"edge_cases"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "code_debug",
|
||||
"category": "coding",
|
||||
"name": "버그 찾기 & 수정",
|
||||
"model": "qwen35b",
|
||||
"response": "Here's the corrected version of the function that returns a dictionary of duplicate values with their occurrence counts:\n\n```python\ndef find_duplicates(arr):\n count = {}\n for item in arr:\n count[item] = count.get(item, 0) + 1\n \n return {item: cnt for item, cnt in count.items() if cnt > 1}\n```\n\n### Explanation of Fixes:\n\n1. **Correct Output Format:** The function now returns a dictionary where each key is a duplicate value and the corresponding value is its total count of occurrences.\n2. **Handles Empty Lists:** The function works correctly with an empty list, returning an empty dictionary `{}`.\n3. **Accurate Duplicate Detection:** Only values that appear more than once are included in the result.\n\n### Example Usage:\n\n```python\nprint(find_duplicates([1, 2, 2, 3, 3, 3])) # Output: {2: 2, 3: 3}\nprint(find_duplicates([])) # Output: {}\nprint(find_duplicates([1, 1, 1])) # Output: {1: 3}\nprint(find_duplicates([1, 2, 3])) # Output: {}\n```\n\nThis implementation is efficient, readable, and meets the specified requirements.",
|
||||
"tokens": 788,
|
||||
"time": 12.8,
|
||||
"tps": 61.58,
|
||||
"eval_criteria": [
|
||||
"bug_identified",
|
||||
"correct_fix",
|
||||
"clean_code"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "code_refactor",
|
||||
"category": "coding",
|
||||
"name": "TypeScript 리팩토링",
|
||||
"model": "qwen35b",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 13.05,
|
||||
"tps": 61.3,
|
||||
"eval_criteria": [
|
||||
"types",
|
||||
"error_handling",
|
||||
"backoff",
|
||||
"production_quality"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "korean_schedule",
|
||||
"category": "assistant_kr",
|
||||
"name": "한국어 일정 관리",
|
||||
"model": "qwen35b",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 12.89,
|
||||
"tps": 62.08,
|
||||
"eval_criteria": [
|
||||
"korean_fluency",
|
||||
"schedule_analysis",
|
||||
"practical_advice"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "korean_email",
|
||||
"category": "assistant_kr",
|
||||
"name": "한국어 이메일 요약",
|
||||
"model": "qwen35b",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 13.06,
|
||||
"tps": 61.27,
|
||||
"eval_criteria": [
|
||||
"korean_summary",
|
||||
"action_items",
|
||||
"conciseness"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "tool_calling",
|
||||
"category": "tool_use",
|
||||
"name": "Function Calling (JSON)",
|
||||
"model": "qwen35b",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 12.92,
|
||||
"tps": 61.9,
|
||||
"eval_criteria": [
|
||||
"correct_sequence",
|
||||
"valid_json",
|
||||
"complete_args"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "structured_output",
|
||||
"category": "tool_use",
|
||||
"name": "구조화 출력 (JSON)",
|
||||
"model": "qwen35b",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 13.04,
|
||||
"tps": 61.34,
|
||||
"eval_criteria": [
|
||||
"correct_parsing",
|
||||
"valid_json",
|
||||
"completeness"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "reasoning",
|
||||
"category": "reasoning",
|
||||
"name": "논리 추론",
|
||||
"model": "qwen35b",
|
||||
"response": "",
|
||||
"tokens": 800,
|
||||
"time": 12.86,
|
||||
"tps": 62.21,
|
||||
"eval_criteria": [
|
||||
"correct_answer",
|
||||
"clear_steps",
|
||||
"math_accuracy"
|
||||
]
|
||||
}
|
||||
]
|
||||
834
scripts/_archive/results/qwen_fullgpu_results.json
Normal file
834
scripts/_archive/results/qwen_fullgpu_results.json
Normal file
@@ -0,0 +1,834 @@
|
||||
[
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "pure-GPU minbatch",
|
||||
"avg_tps": 65.11,
|
||||
"best_tps": 65.49,
|
||||
"boot": 9,
|
||||
"vram_total": 19177,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10039,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9138,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "pure-GPU nommap small",
|
||||
"avg_tps": 65.01,
|
||||
"best_tps": 65.36,
|
||||
"boot": 6,
|
||||
"vram_total": 19672,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10342,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9330,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "pure-GPU row-split",
|
||||
"avg_tps": 13.65,
|
||||
"best_tps": 14.82,
|
||||
"boot": 9,
|
||||
"vram_total": 19427,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10311,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9116,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"splitMode": "row",
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "pure-GPU ts=0.5,0.5",
|
||||
"avg_tps": 64.92,
|
||||
"best_tps": 65.23,
|
||||
"boot": 9,
|
||||
"vram_total": 19664,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10334,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9330,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"tensorSplit": "0.5,0.5",
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "pure-GPU all-tricks",
|
||||
"avg_tps": 64.72,
|
||||
"best_tps": 64.89,
|
||||
"boot": 6,
|
||||
"vram_total": 19171,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10033,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9138,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"noMmap": true,
|
||||
"defragThold": 0.1,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "tune t=2",
|
||||
"avg_tps": 64.87,
|
||||
"best_tps": 65.13,
|
||||
"boot": 9,
|
||||
"vram_total": 19170,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10032,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9138,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 2,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "tune t=6",
|
||||
"avg_tps": 64.88,
|
||||
"best_tps": 65.17,
|
||||
"boot": 9,
|
||||
"vram_total": 19168,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10030,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9138,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 6,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "tune t=8",
|
||||
"avg_tps": 64.5,
|
||||
"best_tps": 64.77,
|
||||
"boot": 9,
|
||||
"vram_total": 19168,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10030,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9138,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 8,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "tune ub=256 b=1024",
|
||||
"avg_tps": 64.73,
|
||||
"best_tps": 64.98,
|
||||
"boot": 9,
|
||||
"vram_total": 20640,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10928,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9712,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 256,
|
||||
"b": 1024,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "tune ub=256 b=2048",
|
||||
"avg_tps": 63.69,
|
||||
"best_tps": 64.94,
|
||||
"boot": 12,
|
||||
"vram_total": 20614,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10902,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9712,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 256,
|
||||
"b": 2048,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "tune kv=q8_0/q8_0",
|
||||
"avg_tps": 64.78,
|
||||
"best_tps": 65.08,
|
||||
"boot": 9,
|
||||
"vram_total": 20422,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10644,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9778,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "tune kv=f16/f16",
|
||||
"avg_tps": 65.53,
|
||||
"best_tps": 65.81,
|
||||
"boot": 9,
|
||||
"vram_total": 22812,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11846,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10966,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "f16",
|
||||
"ctv": "f16"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "FINAL",
|
||||
"avg_tps": 66.31,
|
||||
"best_tps": 66.53,
|
||||
"boot": 9,
|
||||
"vram_total": 22811,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11845,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10966,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "f16",
|
||||
"ctv": "f16"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "pure-GPU minbatch",
|
||||
"avg_tps": 63.06,
|
||||
"best_tps": 64.16,
|
||||
"boot": 12,
|
||||
"vram_total": 22747,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11895,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10852,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "pure-GPU nommap small",
|
||||
"avg_tps": 63.75,
|
||||
"best_tps": 63.98,
|
||||
"boot": 9,
|
||||
"vram_total": 22579,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11797,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10782,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "pure-GPU ts=0.5,0.5",
|
||||
"avg_tps": 62.88,
|
||||
"best_tps": 63.9,
|
||||
"boot": 12,
|
||||
"vram_total": 22578,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11796,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10782,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"tensorSplit": "0.5,0.5",
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "pure-GPU all-tricks",
|
||||
"avg_tps": 62.55,
|
||||
"best_tps": 63.71,
|
||||
"boot": 9,
|
||||
"vram_total": 22743,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11891,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10852,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"noMmap": true,
|
||||
"defragThold": 0.1,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "tune t=2",
|
||||
"avg_tps": 63.07,
|
||||
"best_tps": 64.08,
|
||||
"boot": 9,
|
||||
"vram_total": 22601,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11819,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10782,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 2,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "tune t=6",
|
||||
"avg_tps": 63.58,
|
||||
"best_tps": 64.04,
|
||||
"boot": 9,
|
||||
"vram_total": 22583,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11801,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10782,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 6,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "tune t=8",
|
||||
"avg_tps": 62.92,
|
||||
"best_tps": 63.73,
|
||||
"boot": 9,
|
||||
"vram_total": 22536,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11754,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10782,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 8,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "tune ub=256 b=1024",
|
||||
"avg_tps": 62.76,
|
||||
"best_tps": 63.86,
|
||||
"boot": 9,
|
||||
"vram_total": 22874,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11968,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10906,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 256,
|
||||
"b": 1024,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "tune ub=256 b=2048",
|
||||
"avg_tps": 62.74,
|
||||
"best_tps": 63.9,
|
||||
"boot": 9,
|
||||
"vram_total": 22912,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 12006,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10906,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 256,
|
||||
"b": 2048,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "FINAL",
|
||||
"avg_tps": 63.71,
|
||||
"best_tps": 64.39,
|
||||
"boot": 9,
|
||||
"vram_total": 22566,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11784,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10782,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 Q4_K_M",
|
||||
"label": "pure-GPU nommap small",
|
||||
"avg_tps": 62.29,
|
||||
"best_tps": 63.03,
|
||||
"boot": 9,
|
||||
"vram_total": 22975,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 12007,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10968,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 Q4_K_M",
|
||||
"label": "pure-GPU ts=0.5,0.5",
|
||||
"avg_tps": 63.89,
|
||||
"best_tps": 64.91,
|
||||
"boot": 12,
|
||||
"vram_total": 23002,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 12034,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10968,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"tensorSplit": "0.5,0.5",
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 Q4_K_M",
|
||||
"label": "tune t=2",
|
||||
"avg_tps": 64.1,
|
||||
"best_tps": 64.54,
|
||||
"boot": 12,
|
||||
"vram_total": 22980,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 12012,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10968,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 2,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"tensorSplit": "0.5,0.5",
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 Q4_K_M",
|
||||
"label": "tune t=6",
|
||||
"avg_tps": 64.18,
|
||||
"best_tps": 64.72,
|
||||
"boot": 12,
|
||||
"vram_total": 22982,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 12014,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10968,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 6,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"tensorSplit": "0.5,0.5",
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 Q4_K_M",
|
||||
"label": "tune t=8",
|
||||
"avg_tps": 63.11,
|
||||
"best_tps": 64.02,
|
||||
"boot": 12,
|
||||
"vram_total": 22980,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 12012,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10968,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 8,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"tensorSplit": "0.5,0.5",
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
}
|
||||
]
|
||||
12
scripts/_archive/results/qwen_intermediate.csv
Normal file
12
scripts/_archive/results/qwen_intermediate.csv
Normal file
@@ -0,0 +1,12 @@
|
||||
model,label,avg,best,mode,vram,t,ub,b,kv,split,mmap
|
||||
UD-IQ4_NL,pure-GPU minbatch,65.11,65.49,GPU,19177,t4,ub64,b256,q4_0/q4_0,,
|
||||
UD-IQ4_NL,pure-GPU nommap small,65.01,65.36,GPU,19672,t4,ub128,b512,q4_0/q4_0,,nommap
|
||||
UD-IQ4_NL,pure-GPU row-split,13.65,14.82,GPU,19427,t4,ub128,b512,q4_0/q4_0,row,
|
||||
UD-IQ4_NL,"pure-GPU ts=0.5,0.5",64.92,65.23,GPU,19664,t4,ub128,b512,q4_0/q4_0,,
|
||||
UD-IQ4_NL,pure-GPU all-tricks,64.72,64.89,GPU,19171,t4,ub64,b256,q4_0/q4_0,,nommap
|
||||
UD-IQ4_NL,tune t=2,64.87,65.13,GPU,19170,t2,ub64,b256,q4_0/q4_0,,
|
||||
UD-IQ4_NL,tune t=6,64.88,65.17,GPU,19168,t6,ub64,b256,q4_0/q4_0,,
|
||||
UD-IQ4_NL,tune t=8,64.5,64.77,GPU,19168,t8,ub64,b256,q4_0/q4_0,,
|
||||
UD-IQ4_NL,tune ub=256 b=1024,64.73,64.98,GPU,20640,t4,ub256,b1024,q4_0/q4_0,,
|
||||
UD-IQ4_NL,tune ub=256 b=2048,63.69,64.94,GPU,20614,t4,ub256,b2048,q4_0/q4_0,,
|
||||
UD-IQ4_NL,tune kv=q8_0/q8_0,64.78,65.08,GPU,20422,t4,ub64,b256,q8_0/q8_0,,
|
||||
|
8
scripts/_archive/results/split_test_result.txt
Normal file
8
scripts/_archive/results/split_test_result.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
==================================================
|
||||
TS=0.5,0.5 5-Run Results (with --mlock --poll 50):
|
||||
AVG: 61.94 t/s
|
||||
BEST: 62.06 t/s
|
||||
MIN: 61.74 t/s
|
||||
Runs: ['62.06', '61.74', '61.92', '62.00', '61.96']
|
||||
==================================================
|
||||
591
scripts/_archive/results/tune_results_gemma4_256k.json
Normal file
591
scripts/_archive/results/tune_results_gemma4_256k.json
Normal file
@@ -0,0 +1,591 @@
|
||||
[
|
||||
{
|
||||
"ngl": 22,
|
||||
"t": 8,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.22049935826915,
|
||||
"best_tps": 25.971732307567606,
|
||||
"vram_used": 11953,
|
||||
"vram_total": 12288,
|
||||
"label": "ngl=22"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 8,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.805518952775174,
|
||||
"best_tps": 25.953896683689454,
|
||||
"vram_used": 11942,
|
||||
"vram_total": 12288,
|
||||
"label": "ngl=21"
|
||||
},
|
||||
{
|
||||
"ngl": 20,
|
||||
"t": 8,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 23.537353232262834,
|
||||
"best_tps": 24.32109262330477,
|
||||
"vram_used": 11972,
|
||||
"vram_total": 12288,
|
||||
"label": "ngl=20"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 2,
|
||||
"tb": 2,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 20.167581352340264,
|
||||
"best_tps": 20.701192443418005,
|
||||
"vram_used": 11969,
|
||||
"vram_total": 12288,
|
||||
"label": "t=2 | tb=2"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.689104997668554,
|
||||
"best_tps": 26.328541632880874,
|
||||
"vram_used": 11975,
|
||||
"vram_total": 12288,
|
||||
"label": "t=4 | tb=4"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.294470150452725,
|
||||
"best_tps": 26.541251363470614,
|
||||
"vram_used": 11984,
|
||||
"vram_total": 12288,
|
||||
"label": "t=4 | tb=8"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 6,
|
||||
"tb": 6,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.307859289404675,
|
||||
"best_tps": 26.292208504543133,
|
||||
"vram_used": 11984,
|
||||
"vram_total": 12288,
|
||||
"label": "t=6 | tb=6"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 6,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.230599923243314,
|
||||
"best_tps": 26.366065850165732,
|
||||
"vram_used": 11983,
|
||||
"vram_total": 12288,
|
||||
"label": "t=6 | tb=8"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 8,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.113108026759278,
|
||||
"best_tps": 26.123872617669583,
|
||||
"vram_used": 11984,
|
||||
"vram_total": 12288,
|
||||
"label": "t=8 | tb=8"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 8,
|
||||
"tb": 12,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.05545428888364,
|
||||
"best_tps": 26.06377500079152,
|
||||
"vram_used": 11983,
|
||||
"vram_total": 12288,
|
||||
"label": "t=8 | tb=12"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 10,
|
||||
"tb": 10,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 24.706926870374986,
|
||||
"best_tps": 25.03033604251865,
|
||||
"vram_used": 11984,
|
||||
"vram_total": 12288,
|
||||
"label": "t=10 | tb=10"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 12,
|
||||
"tb": 12,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 22.468055564001904,
|
||||
"best_tps": 23.425983251691825,
|
||||
"vram_used": 11989,
|
||||
"vram_total": 12288,
|
||||
"label": "t=12 | tb=12"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 16,
|
||||
"tb": 16,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 21.176973905195442,
|
||||
"best_tps": 21.482429642395456,
|
||||
"vram_used": 12021,
|
||||
"vram_total": 12288,
|
||||
"label": "t=16 | tb=16"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.545748810106186,
|
||||
"best_tps": 26.344547829145817,
|
||||
"vram_used": 11986,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=128 | b=512"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 256,
|
||||
"b": 1024,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.503875205368377,
|
||||
"best_tps": 26.393548686102108,
|
||||
"vram_used": 11981,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=256 | b=1024"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 256,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.46500292415627,
|
||||
"best_tps": 26.2726382287537,
|
||||
"vram_used": 11981,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=256 | b=2048"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 1024,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.50982209452459,
|
||||
"best_tps": 26.292282671074723,
|
||||
"vram_used": 12020,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=512 | b=1024"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.39646674356899,
|
||||
"best_tps": 26.28106356028714,
|
||||
"vram_used": 12020,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=512 | b=2048"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.471945933724726,
|
||||
"best_tps": 26.268422652962233,
|
||||
"vram_used": 12021,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=512 | b=4096"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.722119623856702,
|
||||
"best_tps": 26.497264927416403,
|
||||
"vram_used": 12019,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=1024 | b=2048"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.665819493145943,
|
||||
"best_tps": 26.301163428594148,
|
||||
"vram_used": 12019,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=1024 | b=4096"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.464915272955533,
|
||||
"best_tps": 26.40667691713752,
|
||||
"vram_used": 12019,
|
||||
"vram_total": 12288,
|
||||
"label": "ctk=q4_0 | ctv=q4_0"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.489715990281564,
|
||||
"best_tps": 25.884133821146627,
|
||||
"vram_used": 12011,
|
||||
"vram_total": 12288,
|
||||
"label": "ctk=q8_0 | ctv=q8_0"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 22.751034104721082,
|
||||
"best_tps": 22.91250972782414,
|
||||
"vram_used": 12017,
|
||||
"vram_total": 12288,
|
||||
"label": "ctk=q4_0 | ctv=q8_0"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "f16",
|
||||
"ctv": "f16",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 24.745831571513975,
|
||||
"best_tps": 25.53926086004382,
|
||||
"vram_used": 11985,
|
||||
"vram_total": 12288,
|
||||
"label": "ctk=f16 | ctv=f16"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.21575943186602,
|
||||
"best_tps": 25.796865637378264,
|
||||
"vram_used": 12013,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=50 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": false,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 23.88172807693179,
|
||||
"best_tps": 24.803356430302312,
|
||||
"vram_used": 12016,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=False | poll=50 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 0,
|
||||
"avg_tps": 25.041321207287698,
|
||||
"best_tps": 25.88479834694897,
|
||||
"vram_used": 12017,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=0 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 100,
|
||||
"avg_tps": 25.27990666474703,
|
||||
"best_tps": 26.034861156695197,
|
||||
"vram_used": 12017,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=100 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 3,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.360977804679788,
|
||||
"best_tps": 26.0705565191107,
|
||||
"vram_used": 12022,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=50 | prio=3"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": false,
|
||||
"prio": 3,
|
||||
"poll": 0,
|
||||
"avg_tps": 24.156893523381967,
|
||||
"best_tps": 24.840307911026144,
|
||||
"vram_used": 12021,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=False | poll=0 | prio=3"
|
||||
}
|
||||
]
|
||||
201
scripts/_archive/results/tune_results_gemma4_ncpumoe.json
Normal file
201
scripts/_archive/results/tune_results_gemma4_ncpumoe.json
Normal file
@@ -0,0 +1,201 @@
|
||||
[
|
||||
{
|
||||
"label": "ncpumoe=0",
|
||||
"ncpumoe": 0,
|
||||
"avg": 15.396949591766335,
|
||||
"best": 20.220093309883133,
|
||||
"vram": 12011,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=5",
|
||||
"ncpumoe": 5,
|
||||
"avg": 4.853957926040404,
|
||||
"best": 4.9029479257524216,
|
||||
"vram": 11945,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=10",
|
||||
"ncpumoe": 10,
|
||||
"avg": 20.64137159193706,
|
||||
"best": 26.474940718957154,
|
||||
"vram": 12020,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=15",
|
||||
"ncpumoe": 15,
|
||||
"avg": 13.424368433101165,
|
||||
"best": 13.698684361880598,
|
||||
"vram": 12018,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=20",
|
||||
"ncpumoe": 20,
|
||||
"avg": 10.338449574838693,
|
||||
"best": 13.495275411319872,
|
||||
"vram": 11530,
|
||||
"nommap": true
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=25",
|
||||
"ncpumoe": 25,
|
||||
"avg": 12.920348175328435,
|
||||
"best": 12.99923042323437,
|
||||
"vram": 11625,
|
||||
"nommap": true
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=30",
|
||||
"ncpumoe": 30,
|
||||
"avg": 13.251690836275145,
|
||||
"best": 13.253697466971921,
|
||||
"vram": 9064,
|
||||
"nommap": true
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=7",
|
||||
"ncpumoe": 7,
|
||||
"avg": 16.31796299658782,
|
||||
"best": 23.160760806218782,
|
||||
"vram": 11994,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=9",
|
||||
"ncpumoe": 9,
|
||||
"avg": 7.469651892205037,
|
||||
"best": 10.875064047449284,
|
||||
"vram": 11941,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=11",
|
||||
"ncpumoe": 11,
|
||||
"avg": 14.814740144776437,
|
||||
"best": 15.199641279675724,
|
||||
"vram": 11984,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=13",
|
||||
"ncpumoe": 13,
|
||||
"avg": 14.183175252947136,
|
||||
"best": 14.427257794639086,
|
||||
"vram": 12003,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "t=2",
|
||||
"ncpumoe": 10,
|
||||
"avg": 28.551811207068425,
|
||||
"best": 28.688565545389164,
|
||||
"vram": 11968,
|
||||
"t": 2,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "t=4",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.8619310622166,
|
||||
"best": 31.17677746690393,
|
||||
"vram": 11972,
|
||||
"t": 4,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "t=6",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.578454576249854,
|
||||
"best": 30.971792125516313,
|
||||
"vram": 11983,
|
||||
"t": 6,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "t=8",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.529393512116172,
|
||||
"best": 30.954830478128166,
|
||||
"vram": 11982,
|
||||
"t": 8,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "t=10",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.773041112229503,
|
||||
"best": 31.00899077264753,
|
||||
"vram": 11972,
|
||||
"t": 10,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ub=256,b=1024",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.49319055490045,
|
||||
"best": 30.691055921541377,
|
||||
"vram": 11993,
|
||||
"t": 4,
|
||||
"ub": 256,
|
||||
"b": 1024,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ub=512,b=2048",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.923573731331718,
|
||||
"best": 31.902272031660825,
|
||||
"vram": 11995,
|
||||
"t": 4,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ub=512,b=4096",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.723820162954862,
|
||||
"best": 31.065476003548053,
|
||||
"vram": 11966,
|
||||
"t": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ub=1024,b=2048",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.489888387093156,
|
||||
"best": 30.982074615885946,
|
||||
"vram": 11964,
|
||||
"t": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "kv=q4_0",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.63156129571348,
|
||||
"best": 31.088674795634944,
|
||||
"vram": 11988,
|
||||
"t": 4,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "kv=q8_0",
|
||||
"ncpumoe": 10,
|
||||
"avg": 29.6114222576863,
|
||||
"best": 30.580427895917573,
|
||||
"vram": 11980,
|
||||
"t": 4,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"nommap": false
|
||||
}
|
||||
]
|
||||
522
scripts/_archive/results/tune_results_qwen35b_256k.json
Normal file
522
scripts/_archive/results/tune_results_qwen35b_256k.json
Normal file
@@ -0,0 +1,522 @@
|
||||
[
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 6,
|
||||
"tb": 6,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.169961832638464,
|
||||
"best_tps": 26.533887071573073,
|
||||
"vram_used": 4994,
|
||||
"vram_total": 12288,
|
||||
"label": "cpu_moe=True"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": false,
|
||||
"t": 6,
|
||||
"tb": 6,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 11.065030380022206,
|
||||
"best_tps": 11.083028272674314,
|
||||
"vram_used": 11949,
|
||||
"vram_total": 12288,
|
||||
"label": "cpu_moe=False"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 2,
|
||||
"tb": 2,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 21.473286428302767,
|
||||
"best_tps": 21.746637577851104,
|
||||
"vram_used": 4994,
|
||||
"vram_total": 12288,
|
||||
"label": "t=2 | tb=2"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.552358479030676,
|
||||
"best_tps": 27.314237654089343,
|
||||
"vram_used": 4991,
|
||||
"vram_total": 12288,
|
||||
"label": "t=4 | tb=4"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 6,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.347068485327956,
|
||||
"best_tps": 26.87924726131441,
|
||||
"vram_used": 4993,
|
||||
"vram_total": 12288,
|
||||
"label": "t=4 | tb=6"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 6,
|
||||
"tb": 6,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.331286039513458,
|
||||
"best_tps": 26.81427299445741,
|
||||
"vram_used": 5001,
|
||||
"vram_total": 12288,
|
||||
"label": "t=6 | tb=6"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 6,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.391160513711274,
|
||||
"best_tps": 26.735573238878736,
|
||||
"vram_used": 5001,
|
||||
"vram_total": 12288,
|
||||
"label": "t=6 | tb=8"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 8,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.32340666199144,
|
||||
"best_tps": 25.87949347494079,
|
||||
"vram_used": 4995,
|
||||
"vram_total": 12288,
|
||||
"label": "t=8 | tb=8"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 10,
|
||||
"tb": 10,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 23.752277317850815,
|
||||
"best_tps": 24.98242898809555,
|
||||
"vram_used": 5011,
|
||||
"vram_total": 12288,
|
||||
"label": "t=10 | tb=10"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 12,
|
||||
"tb": 12,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 21.75032196383532,
|
||||
"best_tps": 23.18963400077116,
|
||||
"vram_used": 5104,
|
||||
"vram_total": 12288,
|
||||
"label": "t=12 | tb=12"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 13.27593572827031,
|
||||
"best_tps": 13.337407402920235,
|
||||
"vram_used": 4391,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=128 | b=512"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 256,
|
||||
"b": 1024,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.638687188233188,
|
||||
"best_tps": 27.361082444434413,
|
||||
"vram_used": 4495,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=256 | b=1024"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 256,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.29069503392877,
|
||||
"best_tps": 26.63368832924803,
|
||||
"vram_used": 4490,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=256 | b=2048"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 1024,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.518331831441134,
|
||||
"best_tps": 26.972021321271527,
|
||||
"vram_used": 4984,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=512 | b=1024"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.401541912276873,
|
||||
"best_tps": 26.46530849236633,
|
||||
"vram_used": 4990,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=512 | b=2048"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.892711500590455,
|
||||
"best_tps": 26.892711500590455,
|
||||
"vram_used": 5006,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=512 | b=4096"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 12.600209659679201,
|
||||
"best_tps": 12.759356030807627,
|
||||
"vram_used": 12020,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=1024 | b=2048"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 6.023959262370547,
|
||||
"best_tps": 8.284882268188156,
|
||||
"vram_used": 11931,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=1024 | b=4096"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 12.96992950856374,
|
||||
"best_tps": 12.96992950856374,
|
||||
"vram_used": 12022,
|
||||
"vram_total": 12288,
|
||||
"label": "ctk=q4_0 | ctv=q4_0"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 11.420078920350697,
|
||||
"best_tps": 13.524778595767653,
|
||||
"vram_used": 12030,
|
||||
"vram_total": 12288,
|
||||
"label": "ctk=q8_0 | ctv=q8_0"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "f16",
|
||||
"ctv": "f16",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 11.978106511464183,
|
||||
"best_tps": 13.729190013094977,
|
||||
"vram_used": 11518,
|
||||
"vram_total": 12288,
|
||||
"label": "ctk=f16 | ctv=f16"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 16.164278220452957,
|
||||
"best_tps": 22.645890325274323,
|
||||
"vram_used": 11623,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=50 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": false,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 16.555542780023114,
|
||||
"best_tps": 23.333815015033892,
|
||||
"vram_used": 9062,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=False | poll=50 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 0,
|
||||
"avg_tps": 13.003619379106329,
|
||||
"best_tps": 13.031594557134142,
|
||||
"vram_used": 11994,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=0 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 100,
|
||||
"avg_tps": 5.7762452690702935,
|
||||
"best_tps": 5.795560155803046,
|
||||
"vram_used": 11953,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=100 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 3,
|
||||
"poll": 50,
|
||||
"avg_tps": 12.59406799687573,
|
||||
"best_tps": 14.966737641114795,
|
||||
"vram_used": 11996,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=50 | prio=3"
|
||||
}
|
||||
]
|
||||
372
scripts/_archive/tuning/auto_tune_122b.py
Normal file
372
scripts/_archive/tuning/auto_tune_122b.py
Normal file
@@ -0,0 +1,372 @@
|
||||
"""
|
||||
Qwen3.5 122B-A10B 자동 정밀 튜닝 스크립트
|
||||
===========================================
|
||||
각 설정 조합으로 서버를 재시작하고 벤치마크를 자동 수행합니다.
|
||||
서버 로그에서 순수 eval time (t/s)를 파싱하여 정확한 비교 테이블을 출력합니다.
|
||||
|
||||
예상 소요 시간: 약 30-40분 (5개 설정 × ~6-7분/설정)
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import datetime
|
||||
|
||||
try:
    # Switch stdout to UTF-8 so the non-ASCII status text printed by this
    # script can be encoded.
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    # stdout object without reconfigure(): leave it untouched.
    pass

BASE_URL = "http://127.0.0.1:8000"
MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
SERVER_EXE = r"llama_bin_run\llama-server.exe"

# ============================================================
# Settings under test
# ============================================================
# Arguments shared by every run (never varied).
COMMON_ARGS = [
    "--model", MODEL_PATH,
    "-ngl", "999",
    "--cpu-moe",
    "-c", "2048",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "256",
    "-b", "1024",
    "--mlock",
    "--port", "8000",
    "--host", "0.0.0.0",
    "--no-warmup",  # warm-up is issued by this benchmark script itself
]

# Per-run variations as (name, description, extra CLI args); each becomes a
# dict with the keys "name", "desc" and "extra".
CONFIGS = [
    {"name": name, "desc": desc, "extra": extra}
    for name, desc, extra in (
        (
            "A) --no-mmap -t 8",
            "서버 권장: mmap 비활성화 (baseline 대비)",
            ["--no-mmap", "-t", "8", "--prio", "2"],
        ),
        (
            "B) --no-mmap -t 6",
            "스레드 감소 (캐시 경합 회피)",
            ["--no-mmap", "-t", "6", "--prio", "2"],
        ),
        (
            "C) --no-mmap -t 10",
            "스레드 증가 (RAM 대역폭 포화)",
            ["--no-mmap", "-t", "10", "--prio", "2"],
        ),
        (
            "D) --no-mmap -t 12",
            "더 많은 스레드",
            ["--no-mmap", "-t", "12", "--prio", "2"],
        ),
        (
            "E) --no-mmap -t 10 --prio 3 --poll 100",
            "최적 스레드 + 리얼타임 우선순위 + 폴링",
            ["--no-mmap", "-t", "10", "--prio", "3", "--poll", "100"],
        ),
    )
]
|
||||
|
||||
# ============================================================
|
||||
# 유틸리티 함수
|
||||
# ============================================================
|
||||
|
||||
def kill_server():
    """Force-terminate llama-server processes, then pause for three seconds.

    The ``taskkill`` command is run through the shell with its output
    redirected to ``nul``.
    """
    taskkill_cmd = "taskkill /F /IM llama-server.exe >nul 2>&1"
    os.system(taskkill_cmd)
    time.sleep(3)
|
||||
|
||||
def start_server(config, log_path):
    """Launch llama-server for one configuration, logging to ``log_path``.

    Args:
        config: Mapping whose ``"extra"`` entry lists the CLI arguments
            appended after ``COMMON_ARGS``.
        log_path: File that receives the server's combined stdout/stderr;
            it is truncated on open.

    Returns:
        ``(proc, log_file)``: the ``subprocess.Popen`` handle and the open
        log file object.  The caller must close ``log_file``.

    Raises:
        OSError: If the log file cannot be opened or the server cannot be
            started.
    """
    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            cwd=os.getcwd(),
        )
    except BaseException:
        # The caller never receives the handle when the launch fails, so
        # close it here instead of leaking it.
        log_file.close()
        raise
    return proc, log_file
|
||||
|
||||
def wait_for_server(timeout=600):
    """Poll the server's ``/health`` endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as the endpoint answers with JSON whose ``status`` is
        ``"ok"``; False if that has not happened once ``timeout`` seconds
        have passed.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Any failure to connect or to decode the reply means "not ready
            # yet".  Unlike a bare ``except``, this still lets
            # KeyboardInterrupt/SystemExit abort the wait.
            pass
        time.sleep(5)
    return False
|
||||
|
||||
def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and time it.

    Args:
        prompt: Text of the single user message.
        max_tokens: Generation cap forwarded as ``max_tokens``.

    Returns:
        ``(completion_tokens, elapsed)``: ``usage.completion_tokens`` from
        the reply (0 when absent) and the wall-clock seconds spent on the
        request and on reading the full response.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    began = time.time()
    with urllib.request.urlopen(request, timeout=600) as response:
        reply = json.loads(response.read())
    elapsed = time.time() - began

    return reply.get("usage", {}).get("completion_tokens", 0), elapsed
|
||||
|
||||
def parse_eval_times(log_path):
    """Extract generation ("eval time") timings from a server log.

    Only lines where ``eval time`` is preceded by whitespace at the start
    of the line match, which keeps ``prompt eval time`` lines out.

    Args:
        log_path: Path of the server log file.

    Returns:
        One dict per matched line, in log order, with the keys
        ``total_ms``, ``tokens``, ``ms_per_token`` and ``tps``.  An empty
        list when the log cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Missing or unreadable log: report "no data" rather than aborting
        # the run.  Narrowed from a bare ``except`` so that Ctrl+C and
        # programming errors are no longer swallowed.
        return []

    # "eval time = XXXXX.XX ms / NNN tokens (XXX.XX ms per token, XX.XX tokens per second)"
    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    return [
        {
            "total_ms": float(total_ms),
            "tokens": int(tokens),
            "ms_per_token": float(ms_per_token),
            "tps": float(tps),
        }
        for total_ms, tokens, ms_per_token, tps
        in re.findall(pattern, content, re.MULTILINE)
    ]
|
||||
|
||||
def parse_prompt_eval_times(log_path):
    """Extract prompt-processing ("prompt eval time") timings from a log.

    Args:
        log_path: Path of the server log file.

    Returns:
        One dict per matched line, in log order, with the keys
        ``total_ms``, ``tokens``, ``ms_per_token`` and ``tps``.  An empty
        list when the log cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Missing or unreadable log means "no data"; anything else (Ctrl+C,
        # programming errors) is allowed to propagate.
        return []

    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    return [
        {
            "total_ms": float(total_ms),
            "tokens": int(tokens),
            "ms_per_token": float(ms_per_token),
            "tps": float(tps),
        }
        for total_ms, tokens, ms_per_token, tps
        in re.findall(pattern, content, re.MULTILINE)
    ]
|
||||
|
||||
def parse_vram_usage(log_path):
    """Read the ``CUDA0 model buffer size`` figure from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        The first matching size rounded to whole MiB, formatted like
        ``"5392 MiB"``; ``"N/A"`` when the log cannot be read or contains
        no such line.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Only I/O failures mean "no data"; a bare ``except`` here used to
        # hide Ctrl+C and programming errors as well.
        return "N/A"

    match = re.search(r'CUDA0 model buffer size\s*=\s*([\d.]+)\s*MiB', content)
    if match is None:
        return "N/A"
    return f"{float(match.group(1)):.0f} MiB"
|
||||
|
||||
# ============================================================
|
||||
# 메인 튜닝 루프
|
||||
# ============================================================
|
||||
|
||||
def _benchmark_config(idx, config):
    """Start the server for ``config``, benchmark it, and return its result.

    The result dict has the keys ``config``, ``status`` (``"OK"`` or
    ``"FAILED"``), ``eval_tps``, ``prompt_tps`` and ``vram``.  The server is
    killed and the log handle closed even if the benchmark is interrupted.
    """
    config_start = time.time()
    log_path = os.path.join(os.getcwd(), f"tune_log_{idx}.txt")

    print(f"\n{'='*70}")
    print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}")
    print(f" {config['desc']}")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f"{'='*70}")

    # 1. Stop any server that is still running.
    print(" [1/4] 서버 종료 중...")
    kill_server()

    # 2. Launch the server with this configuration.
    print(f" [2/4] 서버 시작 중... (모델 로딩 ~3-5분)")
    _proc, log_file = start_server(config, log_path)

    try:
        # 3. Wait until the server reports ready.
        ready = wait_for_server(timeout=600)
        if not ready:
            print(" ❌ 서버 시작 실패! 다음 설정으로 넘어갑니다.")
        else:
            load_time = time.time() - config_start
            print(f" [3/4] 서버 준비 완료! (로딩 {load_time:.0f}초)")

            # 4. Benchmark: one warm-up request, then three timed prompts.
            print(" [4/4] 벤치마크 실행 중...")
            try:
                run_single_benchmark("Say hello.", max_tokens=20)
                print(" 워밍업 완료")
            except Exception as e:
                print(f" 워밍업 실패: {e}")

            prompts = [
                "Write a detailed explanation of how neural networks learn through backpropagation and gradient descent.",
                "Explain the complete process of photosynthesis including light and dark reactions in detail.",
                "Describe the differences between SQL and NoSQL databases with examples and performance characteristics.",
            ]
            for i, prompt in enumerate(prompts):
                try:
                    tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                    approx_tps = tokens / elapsed if elapsed > 0 else 0
                    print(f" Run {i+1}/3: {tokens} tokens, {elapsed:.1f}s, ~{approx_tps:.2f} t/s (approx)")
                except Exception as e:
                    print(f" Run {i+1}/3: ERROR - {e}")

            # Give the server a moment to flush its log before it is killed.
            time.sleep(2)
    finally:
        # Always stop the server and release the log handle, including on
        # Ctrl+C or an unexpected error, so nothing is left running.
        kill_server()
        log_file.close()

    if not ready:
        return {
            "config": config["name"],
            "status": "FAILED",
            "eval_tps": [],
            "prompt_tps": [],
            "vram": "N/A"
        }

    time.sleep(2)

    # Parse the timings the server wrote to its log.
    eval_times = parse_eval_times(log_path)
    prompt_times = parse_prompt_eval_times(log_path)
    vram = parse_vram_usage(log_path)

    # The first entry belongs to the warm-up request; drop it when there is
    # more than one.
    bench_evals = eval_times[1:] if len(eval_times) > 1 else eval_times
    bench_prompts = prompt_times[1:] if len(prompt_times) > 1 else prompt_times

    eval_speeds = [e["tps"] for e in bench_evals]
    prompt_speeds = [p["tps"] for p in bench_prompts]

    config_elapsed = time.time() - config_start
    print(f"\n 완료! 소요: {config_elapsed:.0f}초")
    if eval_speeds:
        avg_eval = sum(eval_speeds) / len(eval_speeds)
        max_eval = max(eval_speeds)
        print(f" 📊 Eval TPS: avg={avg_eval:.2f}, max={max_eval:.2f}")

    return {
        "config": config["name"],
        "status": "OK",
        "eval_tps": eval_speeds,
        "prompt_tps": prompt_speeds,
        "vram": vram,
    }


def _print_comparison_table(all_results):
    """Print the final comparison table against the hard-coded baseline row."""
    # Figures of the earlier "mmap on, -t 8, --prio 2" run, as shown in the
    # baseline row below.
    baseline_avg = 10.02
    baseline_max = 10.06

    print("\n")
    print("=" * 80)
    print(" 🏆 최종 결과 비교 테이블")
    print("=" * 80)
    print()

    print(f" {'설정':<45} {'Eval t/s':>10} {'최대':>8} {'Prompt t/s':>12} {'VRAM':>12}")
    print(f" {'-'*45} {'-'*10} {'-'*8} {'-'*12} {'-'*12}")

    # Baseline (previous result)
    print(f" {'[기준] mmap on, -t 8, --prio 2':<45} {'10.02':>10} {'10.06':>8} {'29.52':>12} {'5392 MiB':>12}")

    best_avg = 0
    best_config = ""

    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f" {r['config']:<45} {'FAILED':>10} {'':>8} {'':>12} {r['vram']:>12}")
            continue

        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0

        if avg_e > best_avg:
            best_avg = avg_e
            best_config = r["config"]

        # A star marks runs whose average beats the baseline's best run.
        marker = " ⭐" if avg_e > baseline_max else ""
        print(f" {r['config']:<45} {avg_e:>10.2f} {max_e:>8.2f} {avg_p:>12.2f} {r['vram']:>12}{marker}")

    print()
    if best_avg > 0:
        improvement = ((best_avg - baseline_avg) / baseline_avg) * 100
        print(f" 🏆 최고 성능: {best_config}")
        print(f" → {best_avg:.2f} t/s (기준 10.02 t/s 대비 {improvement:+.1f}%)")

    print()
    print(f" 완료 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 80)


def _save_results(all_results, timestamp):
    """Write one summary line per configuration to a timestamped text file."""
    result_path = os.path.join(os.getcwd(), f"tune_results_{timestamp}.txt")
    with open(result_path, "w", encoding="utf-8") as f:
        f.write("Qwen3.5 122B-A10B Fine Tuning Results\n")
        f.write(f"Date: {timestamp}\n\n")
        for r in all_results:
            f.write(f"{r['config']}: {r['eval_tps']} (VRAM: {r['vram']})\n")
    print(f" 결과 저장: {result_path}")


def main():
    """Benchmark every entry of ``CONFIGS`` and report the comparison.

    For each configuration the server is restarted, a warm-up and three
    timed prompts are sent, and the eval speeds are parsed from the server
    log.  A comparison table is printed and a summary file written to the
    current working directory.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    print("=" * 70)
    print(" Qwen3.5 122B-A10B 자동 정밀 튜닝")
    print(f" 시작 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f" 테스트 설정: {len(CONFIGS)}개")
    print(f" 예상 소요: ~{len(CONFIGS) * 7}분")
    print("=" * 70)
    print()
    print(" 기존 Baseline: mmap on, -t 8, --prio 2 → 10.06 t/s (eval)")
    print()

    all_results = [
        _benchmark_config(idx, config) for idx, config in enumerate(CONFIGS)
    ]

    _print_comparison_table(all_results)
    _save_results(all_results, timestamp)


if __name__ == "__main__":
    main()
|
||||
257
scripts/_archive/tuning/auto_tune_122b_r2.py
Normal file
257
scripts/_archive/tuning/auto_tune_122b_r2.py
Normal file
@@ -0,0 +1,257 @@
|
||||
"""
|
||||
Qwen3.5 122B-A10B 정밀 튜닝 2라운드
|
||||
====================================
|
||||
1라운드 결과: mmap on이 더 빠르고, 스레드 수가 적을수록 빠름
|
||||
→ mmap on 상태에서 스레드 수 4~8 범위를 정밀 탐색
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import datetime
|
||||
|
||||
try:
    # Switch stdout to UTF-8 so the non-ASCII status text printed by this
    # script can be encoded.
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    # stdout object without reconfigure(): leave it untouched.
    pass

BASE_URL = "http://127.0.0.1:8000"
MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
SERVER_EXE = r"llama_bin_run\llama-server.exe"

# Arguments shared by every round-2 run.
COMMON_ARGS = [
    "--model", MODEL_PATH,
    "-ngl", "999",
    "--cpu-moe",
    "-c", "2048",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "256",
    "-b", "1024",
    "--mlock",
    "--port", "8000",
    "--host", "0.0.0.0",
    "--no-warmup",
]

# Per-run variations as (name, description, extra CLI args); each becomes a
# dict with the keys "name", "desc" and "extra".
CONFIGS = [
    {"name": name, "desc": desc, "extra": extra}
    for name, desc, extra in (
        (
            "F) mmap on, -t 4",
            "최소 스레드 (4개, 물리코어 절반)",
            ["-t", "4", "--prio", "2"],
        ),
        (
            "G) mmap on, -t 5",
            "스레드 5개",
            ["-t", "5", "--prio", "2"],
        ),
        (
            "H) mmap on, -t 6",
            "스레드 6개 (--no-mmap에서 최고였음)",
            ["-t", "6", "--prio", "2"],
        ),
        (
            "I) mmap on, -t 7",
            "스레드 7개",
            ["-t", "7", "--prio", "2"],
        ),
        (
            "J) mmap on, -t 6, --prio 3",
            "최적 스레드 + 리얼타임 우선순위",
            ["-t", "6", "--prio", "3"],
        ),
    )
]
|
||||
|
||||
def kill_server():
    """Force-terminate llama-server processes, then pause for three seconds.

    The ``taskkill`` command is run through the shell with its output
    redirected to ``nul``.
    """
    taskkill_cmd = "taskkill /F /IM llama-server.exe >nul 2>&1"
    os.system(taskkill_cmd)
    time.sleep(3)
|
||||
|
||||
def start_server(config, log_path):
    """Launch llama-server for one configuration, logging to ``log_path``.

    Args:
        config: Mapping whose ``"extra"`` entry lists the CLI arguments
            appended after ``COMMON_ARGS``.
        log_path: File that receives the server's combined stdout/stderr;
            it is truncated on open.

    Returns:
        ``(proc, log_file)``: the ``subprocess.Popen`` handle and the open
        log file object.  The caller must close ``log_file``.

    Raises:
        OSError: If the log file cannot be opened or the server cannot be
            started.
    """
    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, cwd=os.getcwd())
    except BaseException:
        # The caller never receives the handle when the launch fails, so
        # close it here instead of leaking it.
        log_file.close()
        raise
    return proc, log_file
|
||||
|
||||
def wait_for_server(timeout=600):
    """Poll the server's ``/health`` endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as the endpoint answers with JSON whose ``status`` is
        ``"ok"``; False if that has not happened once ``timeout`` seconds
        have passed.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Any failure to connect or to decode the reply means "not ready
            # yet".  Unlike a bare ``except``, this still lets
            # KeyboardInterrupt/SystemExit abort the wait.
            pass
        time.sleep(5)
    return False
|
||||
|
||||
def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and time it.

    Args:
        prompt: Text of the single user message.
        max_tokens: Generation cap forwarded as ``max_tokens``.

    Returns:
        ``(completion_tokens, elapsed)``: ``usage.completion_tokens`` from
        the reply (0 when absent) and the wall-clock seconds spent on the
        request and on reading the full response.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    began = time.time()
    with urllib.request.urlopen(request, timeout=600) as response:
        reply = json.loads(response.read())
    elapsed = time.time() - began
    completion_tokens = reply.get("usage", {}).get("completion_tokens", 0)
    return completion_tokens, elapsed
|
||||
|
||||
def parse_eval_times(log_path):
    """Extract generation ("eval time") speeds from a server log.

    Only lines where ``eval time`` is preceded by whitespace at the start
    of the line match, which keeps ``prompt eval time`` lines out.

    Args:
        log_path: Path of the server log file.

    Returns:
        One ``{"tps": float, "tokens": int}`` dict per matched line, in log
        order.  An empty list when the log cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Missing or unreadable log means "no data"; a bare ``except`` here
        # used to hide Ctrl+C and programming errors as well.
        return []
    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    matches = re.findall(pattern, content, re.MULTILINE)
    return [{"tps": float(m[3]), "tokens": int(m[1])} for m in matches]
|
||||
|
||||
def parse_prompt_eval_times(log_path):
    """Extract prompt-processing ("prompt eval time") speeds from a log.

    Args:
        log_path: Path of the server log file.

    Returns:
        One ``{"tps": float}`` dict per matched line, in log order.  An
        empty list when the log cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Missing or unreadable log means "no data"; anything else (Ctrl+C,
        # programming errors) is allowed to propagate.
        return []
    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    matches = re.findall(pattern, content, re.MULTILINE)
    return [{"tps": float(m[3])} for m in matches]
|
||||
|
||||
def _run_round2_config(idx, config):
    """Start the server for ``config``, benchmark it, and return its result.

    The result dict has the keys ``config``, ``status`` and ``eval_tps``,
    plus ``prompt_tps`` on success.  The server is killed and the log handle
    closed even if the benchmark is interrupted.
    """
    config_start = time.time()
    log_path = os.path.join(os.getcwd(), f"tune_r2_log_{idx}.txt")

    print(f"\n{'='*70}")
    print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}")
    print(f" {config['desc']}")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f"{'='*70}")

    kill_server()
    print(f" [1/3] 서버 시작 중...")
    _proc, log_file = start_server(config, log_path)

    try:
        ready = wait_for_server(timeout=600)
        if not ready:
            print(" ❌ 서버 시작 실패!")
        else:
            load_time = time.time() - config_start
            print(f" [2/3] 서버 준비 완료! ({load_time:.0f}초)")

            # Warm-up request; a failure here is deliberately ignored.
            try:
                run_single_benchmark("Say hello.", max_tokens=20)
            except Exception:
                pass

            print(" [3/3] 벤치마크 3회...")
            prompts = [
                "Write a detailed explanation of how neural networks learn through backpropagation.",
                "Explain the complete process of photosynthesis including light and dark reactions.",
                "Describe the differences between SQL and NoSQL databases with examples.",
            ]
            for i, prompt in enumerate(prompts):
                try:
                    tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                    print(f" Run {i+1}: {tokens}tok, {elapsed:.1f}s, ~{tokens/elapsed:.2f} t/s")
                except Exception as e:
                    print(f" Run {i+1}: ERROR - {e}")

            # Pause before the server is killed, as in the original flow.
            time.sleep(2)
    finally:
        # Always stop the server and release the log handle, including on
        # Ctrl+C or an unexpected error.
        kill_server()
        log_file.close()

    if not ready:
        return {"config": config["name"], "status": "FAILED", "eval_tps": []}

    time.sleep(2)

    eval_times = parse_eval_times(log_path)
    prompt_times = parse_prompt_eval_times(log_path)
    # The first entry belongs to the warm-up request; drop it when there is
    # more than one.
    bench_evals = eval_times[1:] if len(eval_times) > 1 else eval_times
    bench_prompts = prompt_times[1:] if len(prompt_times) > 1 else prompt_times

    eval_speeds = [e["tps"] for e in bench_evals]
    prompt_speeds = [p["tps"] for p in bench_prompts]

    if eval_speeds:
        print(f" 📊 Eval: avg={sum(eval_speeds)/len(eval_speeds):.2f}, max={max(eval_speeds):.2f}")

    return {
        "config": config["name"],
        "status": "OK",
        "eval_tps": eval_speeds,
        "prompt_tps": prompt_speeds,
    }


def main():
    """Run the round-2 sweep and print it next to the round-1 figures.

    Every entry of ``CONFIGS`` is benchmarked in turn; afterwards a table
    combining the hard-coded round-1 results with the new measurements is
    printed, followed by the configuration with the highest single-run
    eval speed.
    """
    print("=" * 70)
    print(" Qwen3.5 122B-A10B 정밀 튜닝 - 2라운드")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f" 테스트: {len(CONFIGS)}개 설정 (mmap on + 스레드 4~7 정밀 탐색)")
    print("=" * 70)
    print()

    all_results = [
        _run_round2_config(idx, config) for idx, config in enumerate(CONFIGS)
    ]

    # Final report
    print("\n")
    print("=" * 85)
    print(" 🏆 전체 튜닝 결과 (1라운드 + 2라운드 통합)")
    print("=" * 85)
    print()
    print(f" {'설정':<48} {'Avg t/s':>8} {'Max t/s':>8} {'Prompt':>8}")
    print(f" {'-'*48} {'-'*8} {'-'*8} {'-'*8}")

    # Round-1 results (hard-coded): name, avg t/s, max t/s, prompt t/s
    r1 = [
        ("[기준] mmap on, -t 8, --prio 2", 10.02, 10.06, 29.52),
        ("A) --no-mmap -t 8", 9.66, 9.70, 28.26),
        ("B) --no-mmap -t 6", 10.02, 10.18, 26.73),
        ("C) --no-mmap -t 10", 9.42, 9.46, 27.31),
        ("D) --no-mmap -t 12", 9.04, 9.11, 27.92),
        ("E) --no-mmap -t 10 --prio 3 --poll 100", 9.41, 9.45, 28.37),
    ]
    for name, avg, mx, pp in r1:
        marker = " ⭐" if avg >= 10.0 else ""
        print(f" {name:<48} {avg:>8.2f} {mx:>8.2f} {pp:>8.2f}{marker}")

    print(f" {'--- 2라운드 ---':<48}")

    # The winner is chosen by peak (max) eval speed, seeded with the
    # baseline's max of 10.06 t/s.
    best_peak = 10.06
    best_config = "[기준] mmap on, -t 8"

    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f" {r['config']:<48} {'FAIL':>8}")
            continue
        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
        if max_e > best_peak:
            best_peak = max_e
            best_config = r["config"]
        marker = " ⭐" if avg_e >= 10.0 else ""
        print(f" {r['config']:<48} {avg_e:>8.2f} {max_e:>8.2f} {avg_p:>8.2f}{marker}")

    print()
    print(f" 🏆 최고 성능: {best_config} → {best_peak:.2f} t/s")
    print(f" 완료: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 85)


if __name__ == "__main__":
    main()
|
||||
339
scripts/_archive/tuning/auto_tune_gemma4_256k.py
Normal file
339
scripts/_archive/tuning/auto_tune_gemma4_256k.py
Normal file
@@ -0,0 +1,339 @@
|
||||
"""
|
||||
Gemma4 26B-A4B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
|
||||
Phase 1: -ngl sweep (GPU layers)
|
||||
Phase 2: -t / -tb sweep (CPU threads)
|
||||
Phase 3: -ub / -b sweep (batch sizes)
|
||||
Phase 4: --cache-type-k/v sweep (KV cache precision)
|
||||
Phase 5: --no-mmap, --poll, --prio sweep (misc)
|
||||
Each phase fixes the best from previous phases.
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
import itertools
|
||||
|
||||
try:
    # Switch stdout to UTF-8 so the non-ASCII characters printed by this
    # script can be encoded.
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    # stdout object without reconfigure(): leave it untouched.
    pass

BASE_URL = "http://127.0.0.1:8000"
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"
CONTEXT = 262144
BENCHMARK_RUNS = 3
BENCHMARK_TOKENS = 200

# Baseline (from previous tuning at -c 4096); used as the starting point.
BEST = dict(
    ngl=22,
    t=8,
    tb=8,
    ub=512,
    b=2048,
    ctk="q4_0",
    ctv="q4_0",
    fa="on",
    mlock=True,
    mmap=True,
    prio=2,
    poll=50,
)

# Collects the result dict of every successfully benchmarked configuration.
ALL_RESULTS = []
|
||||
|
||||
|
||||
def kill_server():
    """Force-terminate llama-server processes, then pause for four seconds.

    ``taskkill`` is run with its output captured and discarded; its exit
    status is not checked.
    """
    taskkill = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill, capture_output=True)
    time.sleep(4)
|
||||
|
||||
|
||||
def build_cmd(cfg):
    """Turn a tuning configuration into a llama-server argument list.

    Args:
        cfg: Dict with the keys ``ngl``, ``fa``, ``ctk``, ``ctv``, ``ub``,
            ``b``, ``t``, ``tb``, ``prio``, ``poll``, ``mlock`` and ``mmap``.

    Returns:
        The argv list, starting with ``LLAMA_SERVER``.  ``--mlock`` is
        appended when ``cfg["mlock"]`` is truthy and ``--no-mmap`` when
        ``cfg["mmap"]`` is falsy.
    """
    argv = [LLAMA_SERVER, "--model", MODEL]
    argv += ["-ngl", str(cfg["ngl"]), "-c", str(CONTEXT), "-np", "1"]
    argv += ["-fa", cfg["fa"]]
    argv += ["--cache-type-k", cfg["ctk"], "--cache-type-v", cfg["ctv"]]
    argv += ["-ub", str(cfg["ub"]), "-b", str(cfg["b"])]
    argv += ["-t", str(cfg["t"]), "-tb", str(cfg["tb"])]
    argv += ["--prio", str(cfg["prio"]), "--poll", str(cfg["poll"])]
    argv += ["--port", "8000", "--host", "0.0.0.0"]

    # Optional boolean switches.
    if cfg["mlock"]:
        argv.append("--mlock")
    if not cfg["mmap"]:
        argv.append("--no-mmap")
    return argv
|
||||
|
||||
|
||||
def start_server(cfg):
    """Start llama-server for ``cfg`` and return its process handle.

    Args:
        cfg: Configuration dict accepted by ``build_cmd``.

    Returns:
        The ``subprocess.Popen`` object of the launched server.  Its output
        is discarded, so ``proc.stdout`` is ``None``.
    """
    return subprocess.Popen(
        build_cmd(cfg),
        # Nothing in this script reads the child's output.  An undrained
        # PIPE can stall the child once the OS pipe buffer fills, so the
        # output is sent to the null device instead.
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
        cwd=os.getcwd(),
    )
|
||||
|
||||
|
||||
def wait_for_server(timeout=180):
    """Poll the server's ``/health`` endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as the endpoint answers with JSON whose ``status`` is
        ``"ok"``; False if that has not happened once ``timeout`` seconds
        have passed.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=3) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Any failure to connect or to decode the reply means "not ready
            # yet".  Unlike a bare ``except``, this still lets
            # KeyboardInterrupt/SystemExit abort the wait.
            pass
        time.sleep(2)
    return False
|
||||
|
||||
|
||||
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Send the fixed counting prompt once and return the measured speed.

    Args:
        max_tokens: Generation cap forwarded as ``max_tokens``.

    Returns:
        ``usage.completion_tokens`` from the reply divided by the wall-clock
        seconds of the request, or 0 when no time elapsed.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    began = time.time()
    with urllib.request.urlopen(request, timeout=300) as response:
        reply = json.loads(response.read())
    elapsed = time.time() - began

    generated = reply.get("usage", {}).get("completion_tokens", 0)
    if elapsed > 0:
        return generated / elapsed
    return 0
|
||||
|
||||
|
||||
def get_vram():
    """Query GPU memory usage through ``nvidia-smi``.

    Returns:
        ``(used, total)`` as integers parsed from the first output row
        (presumably GPU 0; confirm on multi-GPU hosts), or ``(0, 0)`` when
        ``nvidia-smi`` cannot be run, times out, or prints something
        unexpected.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        # Parse only the first row: with several rows of output, splitting
        # the whole text on "," used to break the integer conversion and
        # silently yield (0, 0).
        first_row = r.stdout.strip().splitlines()[0]
        used, total = first_row.split(",")[:2]
        return int(used.strip()), int(total.strip())
    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
        # Tool missing, timeout, or unparsable output: report "unknown".
        return 0, 0
|
||||
|
||||
|
||||
def test_config(cfg, label=""):
    """Benchmark one configuration and record its result.

    Restarts the server with ``cfg``, performs a warm-up request, then
    ``BENCHMARK_RUNS`` timed requests.

    Args:
        cfg: Configuration dict accepted by ``build_cmd``.
        label: Short description used in progress output; defaults to
            ``str(cfg)`` when empty.

    Returns:
        ``cfg`` extended with ``avg_tps``, ``best_tps``, ``vram_used``,
        ``vram_total`` and ``label`` (also appended to ``ALL_RESULTS``), or
        ``None`` when the server did not start or every run failed.
    """
    kill_server()
    desc = label or str(cfg)
    print(f" [{desc}] Starting server...")
    proc = start_server(cfg)

    try:
        if not wait_for_server():
            print(f" [{desc}] FAILED to start")
            return None

        vram_used, vram_total = get_vram()
        print(f" [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)

        # Warmup; a failure here is deliberately ignored.
        try:
            run_benchmark(max_tokens=20)
        except Exception:
            pass

        # Benchmark
        speeds = []
        for _ in range(BENCHMARK_RUNS):
            try:
                speeds.append(run_benchmark())
            except Exception as e:
                print(f"ERR({e}) ", end="", flush=True)
    finally:
        # Kill the server on every exit path, including Ctrl+C.
        proc.kill()

    if not speeds:
        print("ALL FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

    result = {**cfg, "avg_tps": avg, "best_tps": best,
              "vram_used": vram_used, "vram_total": vram_total, "label": label}
    ALL_RESULTS.append(result)
    return result
|
||||
|
||||
|
||||
def phase_sweep(phase_name, param_name, values, base_cfg):
    """Benchmark every candidate of one phase and return the fastest.

    Args:
        phase_name: Title shown in the phase banner.
        param_name: A single config key, or a list of keys varied together.
        values: Candidates to try; when ``param_name`` is a list, each
            candidate is a sequence aligned with it.
        base_cfg: Configuration that each candidate is layered on top of;
            it is copied, not modified.

    Returns:
        The ``test_config`` result with the highest ``avg_tps``, or ``None``
        when no candidate produced a result.
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print(f" PHASE: {phase_name}")
    print(f" Sweeping: {param_name} = {values}")
    print(f"{banner}")

    grouped = isinstance(param_name, list)
    winner = None
    for candidate in values:
        if grouped:
            overrides = dict(zip(param_name, candidate))
            label = " | ".join(f"{p}={v}" for p, v in zip(param_name, candidate))
        else:
            overrides = {param_name: candidate}
            label = f"{param_name}={candidate}"

        outcome = test_config({**base_cfg, **overrides}, label)
        if not outcome:
            continue
        if winner is None or outcome["avg_tps"] > winner["avg_tps"]:
            winner = outcome

    if winner:
        print(f"\n ★ Phase winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
|
||||
|
||||
|
||||
def main():
    """Run the five tuning phases, verify the winner and save all results.

    Each phase sweeps one group of parameters on top of the best values
    found so far.  The final configuration is printed, benchmarked five
    more times, and every recorded result is dumped to
    ``scripts/tune_results_gemma4_256k.json``.
    """
    print("=" * 70)
    print(" Gemma4 26B-A4B COMPREHENSIVE Auto-Tuner")
    print(" 256K Context | RTX 3060 12GB")
    print("=" * 70)
    print()

    cfg = dict(BEST)

    # (title, swept key or keys, candidate values), run in this order.
    phases = [
        # Phase 1: -ngl (already done, quick verify top 3)
        ("GPU Layers (-ngl)", "ngl", [22, 21, 20]),
        # Phase 2: CPU threads (-t, -tb)
        ("CPU Threads (-t, -tb)", ["t", "tb"], [
            (2, 2), (4, 4), (4, 8), (6, 6), (6, 8),
            (8, 8), (8, 12), (10, 10), (12, 12), (16, 16)
        ]),
        # Phase 3: Batch sizes (-ub, -b)
        ("Batch Sizes (-ub, -b)", ["ub", "b"], [
            (128, 512), (256, 1024), (256, 2048),
            (512, 1024), (512, 2048), (512, 4096),
            (1024, 2048), (1024, 4096)
        ]),
        # Phase 4: KV cache precision
        ("KV Cache Type (-ctk, -ctv)", ["ctk", "ctv"], [
            ("q4_0", "q4_0"),
            ("q8_0", "q8_0"),
            ("q4_0", "q8_0"),
            ("f16", "f16"),
        ]),
        # Phase 5: Misc (mmap, poll, prio)
        ("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], [
            (True, 50, 2),    # baseline
            (False, 50, 2),   # no-mmap
            (True, 0, 2),     # no polling
            (True, 100, 2),   # max polling
            (True, 50, 3),    # realtime priority
            (False, 0, 3),    # no-mmap + no-poll + realtime
        ]),
    ]
    for title, params, candidates in phases:
        winner = phase_sweep(title, params, candidates, cfg)
        if winner:
            # Carry the winning values into every later phase.
            for key in ([params] if isinstance(params, str) else params):
                cfg[key] = winner[key]

    # Final report
    print()
    print("=" * 70)
    print(" FINAL OPTIMAL CONFIGURATION")
    print("=" * 70)
    print(f" ngl: {cfg['ngl']}")
    print(f" threads: -t {cfg['t']} -tb {cfg['tb']}")
    print(f" batch: -ub {cfg['ub']} -b {cfg['b']}")
    print(f" kv cache: -ctk {cfg['ctk']} -ctv {cfg['ctv']}")
    print(f" flash: -fa {cfg['fa']}")
    print(f" mlock: {'yes' if cfg['mlock'] else 'no'}")
    print(f" mmap: {'yes' if cfg['mmap'] else 'no (--no-mmap)'}")
    print(f" prio: {cfg['prio']}")
    print(f" poll: {cfg['poll']}")
    print()

    # Final verification run
    print(" Running final verification (5 runs)...")
    kill_server()
    proc = start_server(cfg)
    final_speeds = []
    try:
        if wait_for_server():
            # Warm-up; a failure here is deliberately ignored.
            try:
                run_benchmark(max_tokens=20)
            except Exception:
                pass
            for i in range(5):
                try:
                    tps = run_benchmark()
                except Exception as e:
                    # Report the failed run instead of dropping it silently.
                    print(f" Run {i+1}: ERROR - {e}")
                    continue
                final_speeds.append(tps)
                print(f" Run {i+1}: {tps:.2f} t/s")
        else:
            # Readiness used to be ignored here, so every run then failed
            # without any message.
            print(" Final verification skipped: server did not become ready")
    finally:
        # Kill the server on every exit path, including Ctrl+C.
        proc.kill()

    if final_speeds:
        avg = sum(final_speeds) / len(final_speeds)
        best = max(final_speeds)
        print(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")

    print()
    cmd_parts = [
        f"llama-server --model {MODEL}",
        f"-ngl {cfg['ngl']} -c {CONTEXT}",
        f"-t {cfg['t']} -tb {cfg['tb']}",
        f"-ub {cfg['ub']} -b {cfg['b']}",
        f"-fa {cfg['fa']}",
        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
        f"--prio {cfg['prio']} --poll {cfg['poll']}",
    ]
    if cfg["mlock"]:
        cmd_parts.append("--mlock")
    if not cfg["mmap"]:
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")

    print(" Recommended command:")
    print(f" {' '.join(cmd_parts)}")
    print("=" * 70)

    # Dump all results to JSON
    with open("scripts/tune_results_gemma4_256k.json", "w") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)
    print(f"\n Full results saved: scripts/tune_results_gemma4_256k.json")


if __name__ == "__main__":
    main()
|
||||
163
scripts/_archive/tuning/auto_tune_gemma4_ncpumoe.py
Normal file
163
scripts/_archive/tuning/auto_tune_gemma4_ncpumoe.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""
|
||||
Gemma4 26B --n-cpu-moe sweep + secondary param tuning at 256K context.
|
||||
Gemma4 has 30 layers. Sweep n-cpu-moe from 0 to 30.
|
||||
"""
|
||||
import subprocess, time, json, urllib.request, sys, os
|
||||
|
||||
try:
    # Switch stdout to UTF-8 on a best-effort basis.
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, OSError, ValueError):
    # stdout without reconfigure(), or one that refuses the change: keep
    # going with the default encoding.  Narrowed from a bare ``except`` so
    # that KeyboardInterrupt/SystemExit are not swallowed.
    pass

BASE_URL = "http://127.0.0.1:8000"
SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"
CTX = 262144  # context size passed to the server via -c
RUNS = 3
|
||||
|
||||
|
||||
def kill():
    """Force-terminate llama-server processes, then pause for four seconds.

    ``taskkill`` is run with its output captured and discarded; its exit
    status is not checked.
    """
    taskkill = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill, capture_output=True)
    time.sleep(4)
|
||||
|
||||
|
||||
def start(ncpumoe, t=4, ub=512, b=2048, ctk="q4_0", ctv="q4_0", prio=3, nommap=False):
    """Launch llama-server with the given tuning parameters.

    Args:
        ncpumoe: Value for ``--n-cpu-moe``; the flag is omitted unless > 0.
        t: Thread count, passed to both ``-t`` and ``-tb``.
        ub: Value for ``-ub``.
        b: Value for ``-b``.
        ctk: Value for ``--cache-type-k``.
        ctv: Value for ``--cache-type-v``.
        prio: Value for ``--prio``.
        nommap: When true, ``--no-mmap`` is appended.

    Returns:
        The ``subprocess.Popen`` handle of the started server process.
    """
    threads = str(t)
    cmd = [
        SERVER, "--model", MODEL,
        "-ngl", "999",
        "-c", str(CTX),
        "-np", "1",
        "-fa", "on",
        "--cache-type-k", ctk,
        "--cache-type-v", ctv,
        "-ub", str(ub),
        "-b", str(b),
        "-t", threads,
        "-tb", threads,
        "--prio", str(prio),
        "--poll", "50",
        "--mlock",
        "--port", "8000",
        "--host", "0.0.0.0",
    ]
    if ncpumoe > 0:
        cmd.extend(["--n-cpu-moe", str(ncpumoe)])
    if nommap:
        cmd.append("--no-mmap")

    # Nothing in this script reads the server's output.  With stdout=PIPE the
    # child blocks as soon as the unread pipe buffer fills up, so the output
    # is discarded instead.
    return subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        cwd=os.getcwd(),
    )
|
||||
|
||||
|
||||
def wait_ready(timeout=240):
    """Poll ``{BASE_URL}/health`` until it reports ``status == "ok"``.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as the endpoint answers with JSON whose ``status`` is
        ``"ok"``; False if that has not happened after ``timeout`` seconds.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(f"{BASE_URL}/health", timeout=3) as resp:
                if json.loads(resp.read()).get("status") == "ok":
                    return True
        except Exception:
            # Server not reachable / not ready yet: keep polling.  Unlike a
            # bare ``except:``, this lets Ctrl+C (KeyboardInterrupt) through.
            pass
        time.sleep(2)
    return False
|
||||
|
||||
|
||||
def bench(n=200):
    """Send one chat completion request and return its tokens-per-second rate.

    Args:
        n: Value sent as ``max_tokens``.

    Returns:
        ``usage.completion_tokens`` of the response (0 when absent) divided by
        the wall-clock seconds the request took, or 0 when that duration is
        not positive.
    """
    body = {
        "model": "m",
        "messages": [
            {"role": "user", "content": "Count from 1 to 50, each number on new line."},
        ],
        "max_tokens": n,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )

    started = time.time()
    with urllib.request.urlopen(request, timeout=300) as resp:
        reply = json.loads(resp.read())
    elapsed = time.time() - started

    generated = reply.get("usage", {}).get("completion_tokens", 0)
    if elapsed > 0:
        return generated / elapsed
    return 0
|
||||
|
||||
|
||||
def vram():
    """Return ``(used, total)`` GPU memory as reported by ``nvidia-smi``.

    ``nvidia-smi`` prints one ``used, total`` CSV line per GPU; the values of
    all GPUs are summed.  The previous single-line parse failed on multi-GPU
    machines and silently reported ``(0, 0)``.

    Returns:
        A tuple of two ints.  ``(0, 0)`` when ``nvidia-smi`` cannot be run,
        times out, or prints something that cannot be parsed.
    """
    query = [
        "nvidia-smi",
        "--query-gpu=memory.used,memory.total",
        "--format=csv,noheader,nounits",
    ]
    try:
        out = subprocess.run(query, capture_output=True, text=True, timeout=5).stdout
        used = total = 0
        for line in out.splitlines():
            if not line.strip():
                continue
            gpu_used, gpu_total = line.split(",")  # ValueError on odd rows
            used += int(gpu_used)
            total += int(gpu_total)
        return used, total
    except (OSError, subprocess.SubprocessError, ValueError):
        # Missing binary, timeout, or unparsable output: report "unknown".
        return 0, 0
|
||||
|
||||
|
||||
def test(label, ncpumoe, **kw):
    """Benchmark one server configuration and return its result record.

    Kills any running server, starts a new one with ``start(ncpumoe, **kw)``,
    waits until it is ready, sends one 20-token warm-up request and then
    ``RUNS`` timed ``bench()`` requests.

    Args:
        label: Short text used in progress output and stored in the result.
        ncpumoe: Forwarded to ``start()`` as its first argument.
        **kw: Extra keyword arguments forwarded to ``start()``; they are also
            copied into the returned record.

    Returns:
        A dict with ``label``, ``ncpumoe``, ``avg``, ``best``, ``vram`` plus
        the entries of ``kw``; ``None`` when the server never became ready or
        every timed request failed.
    """
    kill()
    print(f" [{label}] Starting...", end=" ", flush=True)
    p = start(ncpumoe, **kw)
    try:
        if not wait_ready():
            print("FAILED")
            return None

        vu, vt = vram()
        print(f"VRAM:{vu}/{vt} | ", end="", flush=True)

        # Warm-up request; its result and any failure are deliberately ignored.
        try:
            bench(20)
        except Exception:
            pass

        speeds = []
        for _ in range(RUNS):
            try:
                speeds.append(bench())
            except Exception:
                # A failed run is simply left out of the statistics.
                pass
    finally:
        # Always stop the server, including on Ctrl+C, so it is not leaked.
        p.kill()

    if not speeds:
        print("BENCH FAILED")
        return None

    avg, best = sum(speeds) / len(speeds), max(speeds)
    print(f"AVG:{avg:.1f} BEST:{best:.1f} t/s")
    return {"label": label, "ncpumoe": ncpumoe, "avg": avg, "best": best,
            "vram": vu, **kw}
|
||||
|
||||
|
||||
def main():
    """Run the staged Gemma4 26B tuning sweep and save all results as JSON.

    Every configuration is benchmarked through ``test()``:

    1.  Coarse ``--n-cpu-moe`` sweep (0 to 30 in steps of 5).
    1b. Fine-tune at +/-1 and +/-3 around the best value (skipped when the
        best value is 0), clamped to the 0..30 layer range.
    2.  Thread-count sweep at the best ``--n-cpu-moe``.
    3.  ``-ub`` / ``-b`` batch-size sweep.
    4.  KV-cache type sweep.

    All successful results are written to
    ``scripts/tune_results_gemma4_ncpumoe.json``.  If no phase-1 configuration
    can be benchmarked, the function reports that and returns without writing
    a file (previously this crashed in ``max()`` on an empty list).
    """
    max_layers = 30  # upper bound of the --n-cpu-moe sweep

    def by_avg(record):
        return record["avg"]

    print("=" * 60)
    print(" Gemma4 26B 256K | --n-cpu-moe Sweep + Param Tune")
    print("=" * 60)
    results = []

    # Phase 1: n-cpu-moe sweep (0, 5, 10, 15, 20, 25, 30)
    print("\n--- Phase 1: --n-cpu-moe sweep ---")
    for n in [0, 5, 10, 15, 20, 25, 30]:
        nm = n > 15  # use --no-mmap when heavy CPU offload
        r = test(f"ncpumoe={n}", n, nommap=nm)
        if r:
            results.append(r)

    if not results:
        print("\n No configuration could be benchmarked - aborting.")
        return

    # Find best n-cpu-moe
    best_r = max(results, key=by_avg)
    best_n = best_r["ncpumoe"]
    print(f"\n ★ Best n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")

    # Fine-tune around best
    if best_n > 0:
        print(f"\n--- Phase 1b: Fine-tune around ncpumoe={best_n} ---")
        candidates = []
        for delta in (-3, -1, 1, 3):
            # Clamp so the sweep never asks for more layers than exist.
            n = min(max_layers, max(0, best_n + delta))
            if n != best_n and n not in candidates:
                candidates.append(n)
        for n in candidates:
            nm = n > 15
            r = test(f"ncpumoe={n}", n, nommap=nm)
            if r:
                results.append(r)
        best_r = max(results, key=by_avg)
        best_n = best_r["ncpumoe"]
        print(f"\n ★ Refined n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")

    # Phase 2: Thread sweep at best n-cpu-moe
    nm = best_n > 15
    print(f"\n--- Phase 2: Thread sweep (ncpumoe={best_n}) ---")
    for t in [2, 4, 6, 8, 10]:
        r = test(f"t={t}", best_n, t=t, nommap=nm)
        if r:
            results.append(r)
    best_t = max((x for x in results if x["ncpumoe"] == best_n), key=by_avg)
    # Records from phase 1 carry no "t" key; they ran with start()'s default 4.
    bt = best_t.get("t", 4)
    print(f"\n ★ Best threads: {bt}")

    # Phase 3: Batch sweep
    print(f"\n--- Phase 3: Batch sweep ---")
    for ub, b in [(256, 1024), (512, 2048), (512, 4096), (1024, 2048)]:
        r = test(f"ub={ub},b={b}", best_n, t=bt, ub=ub, b=b, nommap=nm)
        if r:
            results.append(r)

    # Phase 4: KV cache type
    print(f"\n--- Phase 4: KV cache type ---")
    for ctk, ctv in [("q4_0", "q4_0"), ("q8_0", "q8_0")]:
        r = test(f"kv={ctk}", best_n, t=bt, ctk=ctk, ctv=ctv, nommap=nm)
        if r:
            results.append(r)

    # Final report
    best_all = max(results, key=by_avg)
    print(f"\n{'='*60}")
    print(f" FINAL BEST: {best_all['label']} → {best_all['avg']:.1f} t/s (VRAM: {best_all['vram']})")
    print(f"{'='*60}")

    with open("scripts/tune_results_gemma4_ncpumoe.json", "w") as f:
        json.dump(results, f, indent=2, default=str)
    print(" Saved: scripts/tune_results_gemma4_ncpumoe.json")


if __name__ == "__main__":
    main()
|
||||
335
scripts/_archive/tuning/auto_tune_qwen35b_256k.py
Normal file
335
scripts/_archive/tuning/auto_tune_qwen35b_256k.py
Normal file
@@ -0,0 +1,335 @@
|
||||
"""
|
||||
Qwen3.5 35B-A3B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
|
||||
Based on existing optimized setting: --cpu-moe -ngl 999 -c 4096 -t 6 (35 t/s)
|
||||
Now tuning for -c 262144 (256K context).
|
||||
|
||||
Phase 1: --cpu-moe vs no --cpu-moe baseline
|
||||
Phase 2: -t / -tb sweep
|
||||
Phase 3: -ub / -b sweep
|
||||
Phase 4: --cache-type-k/v sweep
|
||||
Phase 5: Misc (mmap, poll, prio)
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Best-effort switch of stdout to UTF-8; a stdout object without
# reconfigure() is left unchanged.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except AttributeError:
    pass

# Base URL of the llama-server instance started by start_server().
BASE_URL = "http://127.0.0.1:8000"
# Server executable and model file, relative to the current working directory.
LLAMA_SERVER = "llama_bin_run\\llama-server.exe"
MODEL = "models\\Qwen3.5-35B-A3B-Q4_K_M.gguf"
# Context size passed via -c (256K tokens).
CONTEXT = 256 * 1024
# Timed requests per configuration and max_tokens per request.
BENCHMARK_RUNS = 3
BENCHMARK_TOKENS = 200

# Starting configuration; main() copies it and overwrites entries phase by phase.
BEST = dict(
    ngl=999,
    cpu_moe=True,
    t=6,
    tb=6,
    ub=512,
    b=2048,
    ctk="q4_0",
    ctv="q4_0",
    fa="on",
    mlock=True,
    mmap=True,
    prio=2,
    poll=50,
)

# Every successful test_config() result is appended here and dumped by main().
ALL_RESULTS = []
|
||||
|
||||
|
||||
def kill_server():
    """Force-terminate every ``llama-server.exe`` process, then pause 4 seconds.

    Uses the Windows ``taskkill`` utility.  Its output is captured and
    discarded and its exit status is not checked.
    """
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill_cmd, capture_output=True)
    # Fixed pause before the caller launches the next server instance.
    time.sleep(4)
|
||||
|
||||
|
||||
def build_cmd(cfg):
    """Translate a configuration dict into the llama-server argument list.

    Args:
        cfg: Mapping with the keys ``ngl``, ``fa``, ``ctk``, ``ctv``, ``ub``,
            ``b``, ``t``, ``tb``, ``prio``, ``poll``, ``mlock`` and ``mmap``;
            ``cpu_moe`` is optional.

    Returns:
        A list starting with ``LLAMA_SERVER`` followed by its arguments.
    """
    argv = [LLAMA_SERVER, "--model", MODEL]
    argv += ["-ngl", str(cfg["ngl"])]
    argv += ["-c", str(CONTEXT)]
    argv += ["-np", "1"]
    argv += ["-fa", cfg["fa"]]
    argv += ["--cache-type-k", cfg["ctk"]]
    argv += ["--cache-type-v", cfg["ctv"]]
    argv += ["-ub", str(cfg["ub"])]
    argv += ["-b", str(cfg["b"])]
    argv += ["-t", str(cfg["t"])]
    argv += ["-tb", str(cfg["tb"])]
    argv += ["--prio", str(cfg["prio"])]
    argv += ["--poll", str(cfg["poll"])]
    argv += ["--port", "8000"]
    argv += ["--host", "0.0.0.0"]

    # Boolean switches, appended only when enabled (mmap is inverted: the
    # flag is emitted when memory-mapping is turned off).
    switches = [
        (cfg.get("cpu_moe"), "--cpu-moe"),
        (cfg["mlock"], "--mlock"),
        (not cfg["mmap"], "--no-mmap"),
    ]
    argv.extend(flag for enabled, flag in switches if enabled)
    return argv
|
||||
|
||||
|
||||
def start_server(cfg):
    """Start llama-server with the command line built from ``cfg``.

    Args:
        cfg: Configuration dict accepted by ``build_cmd()``.

    Returns:
        The ``subprocess.Popen`` handle of the started server process.
    """
    # Nothing in this script reads the server's output.  With stdout=PIPE the
    # child blocks as soon as the unread pipe buffer fills up, so the output
    # is discarded instead.
    return subprocess.Popen(
        build_cmd(cfg),
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        cwd=os.getcwd(),
    )
|
||||
|
||||
|
||||
def wait_for_server(timeout=240):
    """Poll ``{BASE_URL}/health`` until it reports ``status == "ok"``.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as the endpoint answers with JSON whose ``status`` is
        ``"ok"``; False if that has not happened after ``timeout`` seconds.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(f"{BASE_URL}/health", timeout=3) as resp:
                data = json.loads(resp.read())
                if data.get("status") == "ok":
                    return True
        except Exception:
            # Server not reachable / not ready yet: keep polling.  Unlike a
            # bare ``except:``, this lets Ctrl+C (KeyboardInterrupt) through.
            pass
        time.sleep(2)
    return False
|
||||
|
||||
|
||||
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Send one chat completion request and return its tokens-per-second rate.

    Args:
        max_tokens: Value sent as ``max_tokens``.

    Returns:
        ``usage.completion_tokens`` of the response (0 when absent) divided by
        the wall-clock seconds the request took, or 0 when that duration is
        not positive.
    """
    body = {
        "model": "local-model",
        "messages": [
            {"role": "user", "content": "Count from 1 to 50, writing each number on a new line."},
        ],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    began = time.time()
    with urllib.request.urlopen(request, timeout=300) as resp:
        reply = json.loads(resp.read())
    elapsed = time.time() - began

    generated = reply.get("usage", {}).get("completion_tokens", 0)
    if elapsed > 0:
        return generated / elapsed
    return 0
|
||||
|
||||
|
||||
def get_vram():
    """Return ``(used, total)`` GPU memory as reported by ``nvidia-smi``.

    ``nvidia-smi`` prints one ``used, total`` CSV line per GPU; the values of
    all GPUs are summed.  The previous single-line parse failed on multi-GPU
    machines and silently reported ``(0, 0)``.

    Returns:
        A tuple of two ints.  ``(0, 0)`` when ``nvidia-smi`` cannot be run,
        times out, or prints something that cannot be parsed.
    """
    query = [
        "nvidia-smi",
        "--query-gpu=memory.used,memory.total",
        "--format=csv,noheader,nounits",
    ]
    try:
        out = subprocess.run(query, capture_output=True, text=True, timeout=5).stdout
        used = total = 0
        for line in out.splitlines():
            if not line.strip():
                continue
            gpu_used, gpu_total = line.split(",")  # ValueError on odd rows
            used += int(gpu_used)
            total += int(gpu_total)
        return used, total
    except (OSError, subprocess.SubprocessError, ValueError):
        # Missing binary, timeout, or unparsable output: report "unknown".
        return 0, 0
|
||||
|
||||
|
||||
def test_config(cfg, label=""):
    """Benchmark one configuration and record the result in ``ALL_RESULTS``.

    Kills any running server, starts a new one for ``cfg``, waits until it is
    ready, sends one 20-token warm-up request and then ``BENCHMARK_RUNS``
    timed requests.

    Args:
        cfg: Configuration dict accepted by ``build_cmd()``.
        label: Text used in progress output; ``str(cfg)`` is shown when empty.

    Returns:
        A dict holding every entry of ``cfg`` plus ``avg_tps``, ``best_tps``,
        ``vram_used``, ``vram_total`` and ``label``; ``None`` when the server
        never became ready or every timed request failed.
    """
    kill_server()
    desc = label or str(cfg)
    print(f" [{desc}] Starting server...", flush=True)
    proc = start_server(cfg)
    try:
        if not wait_for_server():
            print(f" [{desc}] FAILED to start")
            return None

        vram_used, vram_total = get_vram()
        print(f" [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)

        # Warmup: result and any failure are deliberately ignored.
        try:
            run_benchmark(max_tokens=20)
        except Exception:
            pass

        speeds = []
        for _ in range(BENCHMARK_RUNS):
            try:
                speeds.append(run_benchmark())
            except Exception as e:
                print(f"ERR({e}) ", end="", flush=True)
    finally:
        # Always stop the server, including on Ctrl+C, so it is not leaked.
        proc.kill()

    if not speeds:
        print("ALL FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

    result = {**cfg, "avg_tps": avg, "best_tps": best,
              "vram_used": vram_used, "vram_total": vram_total, "label": label}
    ALL_RESULTS.append(result)
    return result
|
||||
|
||||
|
||||
def phase_sweep(phase_name, param_name, values, base_cfg):
    """Benchmark each candidate value of one tuning phase and return the winner.

    Args:
        phase_name: Title printed in the phase banner.
        param_name: Either a single config key, or a list of keys that are
            varied together.
        values: Candidate values; when ``param_name`` is a list, each
            candidate is a sequence aligned with those keys.
        base_cfg: Configuration that every candidate is applied on top of; it
            is copied, not modified.

    Returns:
        The ``test_config()`` result with the highest ``avg_tps`` (the first
        one on ties), or ``None`` when no candidate produced a result.
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print(f" PHASE: {phase_name}")
    print(f" Sweeping: {param_name} = {values}")
    print(banner)

    grouped = isinstance(param_name, list)
    winner = None
    for candidate in values:
        # Normalise both forms to a list of (key, value) pairs.
        pairs = list(zip(param_name, candidate)) if grouped else [(param_name, candidate)]
        cfg = dict(base_cfg)
        cfg.update(pairs)
        label = " | ".join(f"{key}={value}" for key, value in pairs)

        outcome = test_config(cfg, label)
        if not outcome:
            continue
        if winner is None or outcome["avg_tps"] > winner["avg_tps"]:
            winner = outcome

    if winner:
        print(f"\n ★ Phase winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
|
||||
|
||||
|
||||
def main():
    """Tune Qwen3.5 35B-A3B in five phases, verify the winner and save results.

    Starting from a copy of ``BEST``, each phase sweeps one group of settings
    with ``phase_sweep()`` and, when it yields a winner, copies the winning
    values into the working configuration:

    1. ``cpu_moe`` on/off
    2. ``t`` / ``tb`` thread counts
    3. ``ub`` / ``b`` batch sizes
    4. ``ctk`` / ``ctv`` KV-cache types
    5. ``mmap`` / ``poll`` / ``prio``

    The final configuration is printed, re-benchmarked five times, shown as a
    recommended command line, and ``ALL_RESULTS`` is written to
    ``scripts/tune_results_qwen35b_256k.json``.
    """
    print("=" * 70)
    print(" Qwen3.5 35B-A3B COMPREHENSIVE Auto-Tuner")
    print(" 256K Context | RTX 3060 12GB")
    print(" Baseline: --cpu-moe -ngl 999 -c 4096 -t 6 → 35 t/s")
    print("=" * 70)
    print()

    cfg = dict(BEST)

    # ─── Phase 1: --cpu-moe critical test ───
    r = phase_sweep("CPU-MoE Mode", "cpu_moe", [True, False], cfg)
    if r:
        cfg["cpu_moe"] = r["cpu_moe"]

    # ─── Phase 2: CPU threads ───
    thread_combos = [
        (2, 2), (4, 4), (4, 6), (6, 6), (6, 8),
        (8, 8), (8, 12), (10, 10), (12, 12),
    ]
    r = phase_sweep("CPU Threads (-t, -tb)", ["t", "tb"], thread_combos, cfg)
    if r:
        cfg["t"] = r["t"]
        cfg["tb"] = r["tb"]

    # ─── Phase 3: Batch sizes ───
    batch_combos = [
        (128, 512), (256, 1024), (256, 2048),
        (512, 1024), (512, 2048), (512, 4096),
        (1024, 2048), (1024, 4096),
    ]
    r = phase_sweep("Batch Sizes (-ub, -b)", ["ub", "b"], batch_combos, cfg)
    if r:
        cfg["ub"] = r["ub"]
        cfg["b"] = r["b"]

    # ─── Phase 4: KV cache ───
    kv_combos = [
        ("q4_0", "q4_0"),
        ("q8_0", "q8_0"),
        ("f16", "f16"),
    ]
    r = phase_sweep("KV Cache Type", ["ctk", "ctv"], kv_combos, cfg)
    if r:
        cfg["ctk"] = r["ctk"]
        cfg["ctv"] = r["ctv"]

    # ─── Phase 5: Misc ───
    misc_combos = [
        (True, 50, 2),
        (False, 50, 2),
        (True, 0, 2),
        (True, 100, 2),
        (True, 50, 3),
    ]
    r = phase_sweep("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], misc_combos, cfg)
    if r:
        cfg["mmap"] = r["mmap"]
        cfg["poll"] = r["poll"]
        cfg["prio"] = r["prio"]

    # ─── Final Report ───
    print()
    print("=" * 70)
    print(" FINAL OPTIMAL CONFIGURATION")
    print("=" * 70)
    for k, v in cfg.items():
        print(f" {k:>12}: {v}")
    print()

    # Final verification
    print(" Running final verification (5 runs)...")
    kill_server()
    proc = start_server(cfg)
    final_speeds = []
    try:
        # Only benchmark when the server actually came up; the readiness
        # result used to be ignored, which made every run fail silently.
        if wait_for_server():
            try:
                run_benchmark(max_tokens=20)  # warm-up, result discarded
            except Exception:
                pass
            for i in range(5):
                try:
                    tps = run_benchmark()
                except Exception as e:
                    print(f" Run {i+1}: ERR({e})")
                    continue
                final_speeds.append(tps)
                print(f" Run {i+1}: {tps:.2f} t/s")
        else:
            print(" Final verification: server FAILED to start")
    finally:
        # Always stop the server, including on Ctrl+C.
        proc.kill()

    if final_speeds:
        avg = sum(final_speeds) / len(final_speeds)
        best = max(final_speeds)
        print(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")

    print()
    cmd_parts = [
        f"llama-server --model {MODEL}",
        f"-ngl {cfg['ngl']} -c {CONTEXT}",
    ]
    if cfg.get("cpu_moe"):
        cmd_parts.append("--cpu-moe")
    cmd_parts.extend([
        f"-t {cfg['t']} -tb {cfg['tb']}",
        f"-ub {cfg['ub']} -b {cfg['b']}",
        f"-fa {cfg['fa']}",
        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
        f"--prio {cfg['prio']} --poll {cfg['poll']}",
    ])
    if cfg["mlock"]:
        cmd_parts.append("--mlock")
    if not cfg["mmap"]:
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")

    print(" Recommended command:")
    print(f" {' '.join(cmd_parts)}")
    print("=" * 70)

    with open("scripts/tune_results_qwen35b_256k.json", "w") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)
    print(f"\n Full results saved: scripts/tune_results_qwen35b_256k.json")


if __name__ == "__main__":
    main()
|
||||
531
scripts/_archive/tuning/dual_gpu_benchmark.mjs
Normal file
531
scripts/_archive/tuning/dual_gpu_benchmark.mjs
Normal file
@@ -0,0 +1,531 @@
|
||||
/**
|
||||
 * Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
|
||||
* ===========================================================
|
||||
* Tests 4 models across multiple parameter configurations to find
|
||||
* the absolute best model + settings for 256K context coding agent.
|
||||
*
|
||||
* Models:
|
||||
* 1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB)
|
||||
* 2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB)
|
||||
* 3. Gemma4 26B-A4B Q4_K_M (~15.6 GB)
|
||||
* 4. Gemma4 26B-A4B MXFP4_MOE (~15.5 GB)
|
||||
*
|
||||
* Run: node scripts/dual_gpu_benchmark.mjs
|
||||
*/
|
||||
|
||||
import { spawn, execSync } from "child_process";
|
||||
import { writeFileSync, statSync, existsSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
|
||||
// ─── Configuration ─────────────────────────────────────────────
|
||||
// Base URL of the llama-server instance started by startServer().
const BASE_URL = "http://127.0.0.1:8000";
// Server executable, relative to the current working directory.
const LLAMA_SERVER = "llama_bin_run\\llama-server.exe";
const CONTEXT = 256 * 1024; // 256K
// Timed requests per configuration and max_tokens per request.
const BENCHMARK_RUNS = 3;
const BENCHMARK_TOKENS = 200;
const SERVER_TIMEOUT = 5 * 60 * 1000; // ms

// Models under test, as [display family, file stem, type, quant, totalLayers].
const MODELS = [
  ["Qwen3.5-35B-A3B", "Qwen3.5-35B-A3B", "qwen", "Q4_K_M", 64],
  ["Qwen3.5-35B-A3B", "Qwen3.5-35B-A3B", "qwen", "MXFP4_MOE", 64],
  ["Gemma4 26B-A4B", "gemma-4-26B-A4B-it", "gemma4", "Q4_K_M", 30],
  ["Gemma4 26B-A4B", "gemma-4-26B-A4B-it", "gemma4", "MXFP4_MOE", 30],
].map(([family, stem, type, quant, totalLayers]) => ({
  name: `${family} ${quant}`,
  path: `models\\${stem}-${quant}.gguf`,
  type,
  quant,
  totalLayers,
}));

// Every benchmark result record is appended here and written out by main().
const ALL_RESULTS = [];
|
||||
|
||||
// ─── Utility ───────────────────────────────────────────────────
|
||||
|
||||
/**
 * Print a message to stdout, prefixed with the current wall-clock time
 * (24-hour format, "ko-KR" locale) in square brackets.
 * @param {string} msg Text to print.
 */
function log(msg) {
  const stamp = new Date().toLocaleTimeString("ko-KR", { hour12: false });
  const line = `[${stamp}] ${msg}`;
  console.log(line);
}
|
||||
|
||||
/**
 * Return a promise that resolves after the given delay.
 * @param {number} ms Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||
|
||||
/**
 * Force-terminate every llama-server.exe process via the Windows `taskkill`
 * utility, then wait five seconds.
 * @returns {Promise<void>} Resolves once the five-second pause has elapsed.
 */
function killServer() {
  const command = "taskkill /F /IM llama-server.exe";
  try {
    execSync(command, { stdio: "ignore" });
  } catch {
    // Any taskkill failure is ignored.
  }
  return sleep(5000);
}
|
||||
|
||||
/**
 * Query `nvidia-smi` for the memory usage of every GPU.
 * @returns {{gpu: number, used: number, total: number}[]} One entry per
 *   output line (index, memory.used, memory.total), or an empty array when
 *   the query fails.
 */
function getVramAll() {
  const query =
    'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits';
  try {
    const raw = execSync(query, { encoding: "utf-8", timeout: 5000 });
    const rows = [];
    for (const line of raw.trim().split("\n")) {
      const fields = line.split(",").map((s) => parseInt(s.trim()));
      rows.push({ gpu: fields[0], used: fields[1], total: fields[2] });
    }
    return rows;
  } catch {
    return [];
  }
}
|
||||
|
||||
/**
 * Build the llama-server command line for one configuration.
 * @param {string} modelPath Path passed to `--model`.
 * @param {object} params Tuning parameters: `ngl`, `t`, `ub`, `b`, `ctk`,
 *   `ctv`, plus optional `cpuMoe` (false), `nCpuMoe` (0), `prio` (3) and
 *   `nommap` (false).
 * @returns {string[]} Executable path followed by its arguments.
 */
function buildCmd(modelPath, params) {
  const { ngl, t, ub, b, ctk, ctv } = params;
  const { cpuMoe = false, nCpuMoe = 0, prio = 3, nommap = false } = params;

  const argv = [LLAMA_SERVER, "--model", modelPath];
  argv.push("-ngl", String(ngl), "-c", String(CONTEXT), "-np", "1", "-fa", "on");
  argv.push("--cache-type-k", ctk, "--cache-type-v", ctv);
  argv.push("-ub", String(ub), "-b", String(b));
  // The same thread count is used for -t and -tb.
  argv.push("-t", String(t), "-tb", String(t));
  argv.push("--prio", String(prio), "--poll", "50", "--mlock");
  argv.push("--port", "8000", "--host", "0.0.0.0");

  // --cpu-moe takes precedence over --n-cpu-moe.
  if (cpuMoe) {
    argv.push("--cpu-moe");
  } else if (nCpuMoe > 0) {
    argv.push("--n-cpu-moe", String(nCpuMoe));
  }
  if (nommap) {
    argv.push("--no-mmap");
  }

  return argv;
}
|
||||
|
||||
/**
 * Spawn llama-server for the given model and parameters.
 * @param {string} modelPath Path passed to `--model`.
 * @param {object} params Tuning parameters accepted by buildCmd().
 * @returns {import("child_process").ChildProcess} Handle of the spawned server.
 */
function startServer(modelPath, params) {
  const [exe, ...args] = buildCmd(modelPath, params);
  // Only the last 12 arguments are logged to keep the line short.
  log(` CMD: ${exe} ${args.slice(-12).join(" ")} ...`);
  return spawn(exe, args, {
    cwd: process.cwd(),
    // Nothing consumes the server's output.  Unread "pipe" streams make the
    // child block once the pipe buffer is full, so the output is discarded.
    stdio: "ignore",
  });
}
|
||||
|
||||
/**
 * Poll `${BASE_URL}/health` every three seconds until it reports
 * `status === "ok"` or the timeout expires.
 * @param {number} [timeoutMs=SERVER_TIMEOUT] Maximum time to wait, in ms.
 * @returns {Promise<{ok: boolean, bootTime: number}>} `bootTime` is in
 *   seconds: the time until the server answered, or the full timeout on
 *   failure.
 */
async function waitForServer(timeoutMs = SERVER_TIMEOUT) {
  const began = Date.now();
  const elapsedMs = () => Date.now() - began;

  while (elapsedMs() < timeoutMs) {
    try {
      const resp = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const health = await resp.json();
      if (health.status === "ok") {
        return { ok: true, bootTime: elapsedMs() / 1000 };
      }
    } catch {
      // Request failed or body was not usable: fall through and retry.
    }
    await sleep(3000);
  }
  return { ok: false, bootTime: timeoutMs / 1000 };
}
|
||||
|
||||
/**
 * Send one chat completion request and measure its throughput.
 * @param {number} [maxTokens=BENCHMARK_TOKENS] Value sent as `max_tokens`.
 * @returns {Promise<{tps: number, completionTokens: number, promptTokens: number, elapsed: number}>}
 *   `elapsed` is the wall-clock duration in seconds; `tps` is
 *   `completionTokens / elapsed` (0 when `elapsed` is not positive).
 * @throws {Error} When the request fails or the server answers with a
 *   non-2xx status.
 */
async function runBenchmark(maxTokens = BENCHMARK_TOKENS) {
  const payload = JSON.stringify({
    model: "local-model",
    messages: [{ role: "user", content: "Count from 1 to 50, writing each number on a new line." }],
    max_tokens: maxTokens,
    temperature: 0.0,
  });

  const start = Date.now();
  const resp = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
    signal: AbortSignal.timeout(600_000),
  });
  // fetch() does not reject on HTTP errors.  Without this check an error
  // response (no `usage` field) was recorded as a valid 0 t/s run and
  // dragged the averages down; callers already log thrown errors.
  if (!resp.ok) {
    throw new Error(`HTTP ${resp.status} from /v1/chat/completions`);
  }
  const result = await resp.json();
  const elapsed = (Date.now() - start) / 1000;

  const usage = result.usage || {};
  const ct = usage.completion_tokens || 0;
  return {
    tps: elapsed > 0 ? ct / elapsed : 0,
    completionTokens: ct,
    promptTokens: usage.prompt_tokens || 0,
    elapsed,
  };
}
|
||||
|
||||
/**
 * Benchmark one configuration of one model and record the result.
 *
 * Kills any running server, starts a new one, waits for it to become ready,
 * sends a 20-token warm-up request and then BENCHMARK_RUNS timed requests.
 * @param {object} model Entry of MODELS.
 * @param {string} label Text used in log output and stored in the result.
 * @param {object} params Tuning parameters accepted by buildCmd().
 * @returns {Promise<object|null>} The result record (also pushed onto
 *   ALL_RESULTS), or null when the server did not start or every run failed.
 */
async function testConfig(model, label, params) {
  await killServer();
  log(` [${label}] Starting server...`);

  const server = startServer(model.path, params);
  const { ok, bootTime } = await waitForServer();
  if (!ok) {
    log(` [${label}] FAILED to start (timeout ${SERVER_TIMEOUT / 1000}s)`);
    server.kill("SIGKILL");
    return null;
  }

  const vram = getVramAll();
  const vramStr = vram.map((g) => `GPU${g.gpu}:${g.used}/${g.total}MiB`).join(" | ");
  log(` [${label}] Boot: ${bootTime.toFixed(0)}s | VRAM: ${vramStr}`);

  // Warmup: result and any failure are ignored.
  try {
    await runBenchmark(20);
  } catch {}

  // Benchmark
  const speeds = [];
  let run = 0;
  while (run < BENCHMARK_RUNS) {
    run += 1;
    try {
      const sample = await runBenchmark();
      speeds.push(sample.tps);
      log(` Run ${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run ${run}: ERROR (${e.message})`);
    }
  }

  server.kill("SIGKILL");

  if (speeds.length === 0) {
    log(` [${label}] ALL BENCHMARK RUNS FAILED`);
    return null;
  }

  let total = 0;
  for (const s of speeds) total += s;
  const avg = total / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] => AVG: ${avg.toFixed(2)} t/s | BEST: ${best.toFixed(2)} t/s`);

  // Speeds are stored rounded to two decimals, boot time to one.
  const result = {
    model: model.name,
    quant: model.quant,
    label,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(best.toFixed(2)),
    boot_time: Number(bootTime.toFixed(1)),
    vram,
    params,
  };
  ALL_RESULTS.push(result);
  return result;
}
|
||||
|
||||
// ─── Phase Runners ─────────────────────────────────────────────
|
||||
|
||||
/**
 * Phase 0: find a configuration with which the model boots at all.
 *
 * Tries, in order: all layers on GPU, all layers on GPU with --cpu-moe, and
 * half of the model's layers on GPU. Stops at the first that benchmarks.
 * @param {object} model Entry of MODELS.
 * @returns {Promise<object|null>} Result of the first working attempt, or
 *   null when all three fail.
 */
async function phase0_bootTest(model) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 0: Boot Test — ${model.name}`);
  log(rule);

  const common = { t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };
  const attempts = [
    {
      // Try full GPU first
      label: "boot-ngl999",
      params: { ngl: 999, ...common },
      onFail: " Full GPU failed, trying with --cpu-moe...",
    },
    {
      // Try with cpu-moe
      label: "boot-cpumoe",
      params: { ngl: 999, ...common, cpuMoe: true },
      onFail: " --cpu-moe also failed, trying reduced layers...",
    },
    {
      // Reduced layers
      label: "boot-ngl-half",
      params: { ngl: Math.floor(model.totalLayers / 2), ...common },
      onFail: null,
    },
  ];

  let outcome = null;
  for (const attempt of attempts) {
    outcome = await testConfig(model, attempt.label, attempt.params);
    if (outcome) break;
    if (attempt.onFail) log(attempt.onFail);
  }
  return outcome;
}
|
||||
|
||||
/**
 * Phase 1: compare GPU offload strategies (--cpu-moe on/off and a
 * --n-cpu-moe sweep) with all layers requested on the GPU.
 * @param {object} model Entry of MODELS.
 * @param {object|null} baseline Result from phase 0; it takes part in the
 *   comparison and its configuration is not benchmarked again.
 * @returns {Promise<object|null>} Result with the highest avg_tps (the later
 *   one on ties), or null when nothing worked.
 */
async function phase1_gpuOffload(model, baseline) {
  log(`\n${"=".repeat(70)}`);
  log(` PHASE 1: GPU Offload Strategy — ${model.name}`);
  log(`${"=".repeat(70)}`);

  const results = baseline ? [baseline] : [];
  const common = { ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };

  // Test --cpu-moe on/off
  for (const cpuMoe of [true, false]) {
    // The full-GPU baseline has no `cpuMoe` key at all; treat a missing key
    // as false so that configuration is recognised and not run twice.
    const sameAsBaseline =
      Boolean(baseline) &&
      Boolean(baseline.params.cpuMoe) === cpuMoe &&
      baseline.params.ngl === 999;
    if (sameAsBaseline) continue;

    const r = await testConfig(model, `ngl=999 cpuMoe=${cpuMoe}`, { ...common, cpuMoe });
    if (r) results.push(r);
  }

  // n-cpu-moe sweep.  n = 0 is left out: buildCmd() emits no MoE flag for it,
  // which is exactly the cpuMoe=false configuration handled above.
  for (const n of [5, 10, 15, 20]) {
    if (n > model.totalLayers) continue;
    const r = await testConfig(model, `n-cpu-moe=${n}`, { ...common, nCpuMoe: n });
    if (r) results.push(r);
  }

  if (results.length === 0) {
    log(" PHASE 1: No config worked!");
    return null;
  }
  const best = results.reduce((a, b) => (a.avg_tps > b.avg_tps ? a : b));
  log(`\n ★ Phase 1 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
|
||||
|
||||
/**
 * Phase 2: sweep the thread count on top of the previous winner's parameters.
 * @param {object} model Entry of MODELS.
 * @param {object} prev Winning result of the previous phase.
 * @returns {Promise<object>} Result with the highest avg_tps among `prev` and
 *   the new runs (the later one on ties).
 */
async function phase2_threads(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 2: CPU Thread Sweep — ${model.name}`);
  log(rule);

  const base = prev.params;
  let winner = prev;

  for (const t of [2, 4, 6, 8, 10, 12]) {
    // The previous winner already covers its own thread count.
    if (t !== base.t) {
      const outcome = await testConfig(model, `t=${t}`, { ...base, t });
      if (outcome) {
        winner = winner.avg_tps > outcome.avg_tps ? winner : outcome;
      }
    }
  }

  log(`\n ★ Phase 2 winner: ${winner.label} → ${winner.avg_tps.toFixed(2)} t/s`);
  return winner;
}
|
||||
|
||||
/**
 * Phase 3: sweep (-ub, -b) batch sizes on top of the previous winner's
 * parameters.
 * @param {object} model Entry of MODELS.
 * @param {object} prev Winning result of the previous phase.
 * @returns {Promise<object>} Result with the highest avg_tps among `prev` and
 *   the new runs (the later one on ties).
 */
async function phase3_batch(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 3: Batch Size Sweep — ${model.name}`);
  log(rule);

  const base = prev.params;
  const combos = [
    [128, 512], [256, 1024], [256, 2048],
    [512, 1024], [512, 2048], [512, 4096],
    [1024, 2048], [1024, 4096],
  ];
  let winner = prev;

  for (const [ub, b] of combos) {
    // The previous winner already covers its own batch sizes.
    const alreadyMeasured = ub === base.ub && b === base.b;
    if (!alreadyMeasured) {
      const outcome = await testConfig(model, `ub=${ub} b=${b}`, { ...base, ub, b });
      if (outcome) {
        winner = winner.avg_tps > outcome.avg_tps ? winner : outcome;
      }
    }
  }

  log(`\n ★ Phase 3 winner: ${winner.label} → ${winner.avg_tps.toFixed(2)} t/s`);
  return winner;
}
|
||||
|
||||
/**
 * Phase 4: sweep KV cache types (--cache-type-k / --cache-type-v) on top of
 * the previous winner's parameters.
 * @param {object} model Entry of MODELS.
 * @param {object} prev Winning result of the previous phase.
 * @returns {Promise<object>} Result with the highest avg_tps among `prev` and
 *   the new runs (the later one on ties).
 */
async function phase4_kvcache(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 4: KV Cache Type Sweep — ${model.name}`);
  log(rule);

  const base = prev.params;
  const combos = [
    ["q4_0", "q4_0"], ["q8_0", "q8_0"],
    ["q4_0", "q8_0"], ["f16", "f16"],
  ];
  let winner = prev;

  for (const [ctk, ctv] of combos) {
    // The previous winner already covers its own cache types.
    const alreadyMeasured = ctk === base.ctk && ctv === base.ctv;
    if (!alreadyMeasured) {
      const outcome = await testConfig(model, `kv=${ctk}/${ctv}`, { ...base, ctk, ctv });
      if (outcome) {
        winner = winner.avg_tps > outcome.avg_tps ? winner : outcome;
      }
    }
  }

  log(`\n ★ Phase 4 winner: ${winner.label} → ${winner.avg_tps.toFixed(2)} t/s`);
  return winner;
}
|
||||
|
||||
/**
 * Phase 5: re-run the winning configuration five times as a final check.
 * @param {object} model Entry of MODELS.
 * @param {object} prev Winning result of the previous phase.
 * @returns {Promise<object>} A new "FINAL-…" result record (also pushed onto
 *   ALL_RESULTS), or `prev` unchanged when the server does not start or all
 *   five runs fail.
 */
async function phase5_final(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 5: Final Verification (5 runs) — ${model.name}`);
  log(rule);

  await killServer();
  const server = startServer(model.path, prev.params);
  const { ok, bootTime } = await waitForServer();
  if (!ok) {
    log(" FAILED to start!");
    server.kill("SIGKILL");
    return prev;
  }

  const vram = getVramAll();
  // Warm-up: result and any failure are ignored.
  try {
    await runBenchmark(20);
  } catch {}

  const speeds = [];
  for (let run = 1; run <= 5; run++) {
    try {
      const sample = await runBenchmark();
      speeds.push(sample.tps);
      log(` Final Run ${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Final Run ${run}: ERROR (${e.message})`);
    }
  }
  server.kill("SIGKILL");

  if (speeds.length === 0) {
    return prev;
  }

  let total = 0;
  for (const s of speeds) total += s;
  const avg = total / speeds.length;
  const best = Math.max(...speeds);
  log(`\n ★ FINAL: AVG ${avg.toFixed(2)} t/s | BEST ${best.toFixed(2)} t/s`);

  const final_ = {
    model: model.name,
    quant: model.quant,
    label: `FINAL-${model.name}`,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(best.toFixed(2)),
    boot_time: Number(bootTime.toFixed(1)),
    vram,
    params: prev.params,
  };
  ALL_RESULTS.push(final_);
  return final_;
}
|
||||
|
||||
// ─── Main ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Run all benchmark phases for one model.
 * @param {object} model Entry of MODELS.
 * @returns {Promise<object|null>} The final winning result; the phase-0
 *   baseline when phase 1 yields nothing; null when the model file is missing
 *   or the model cannot boot.
 */
async function runModelBenchmark(model) {
  const hashes = "#".repeat(70);
  log(`\n${hashes}`);
  log(` MODEL: ${model.name}`);
  log(` File: ${model.path}`);

  let sizeLine = ` Size: unknown`;
  try {
    const gib = statSync(model.path).size / 1024 ** 3;
    sizeLine = ` Size: ${gib.toFixed(2)} GB`;
  } catch {
    // File not readable: keep the "unknown" line.
  }
  log(sizeLine);
  log(hashes);

  if (!existsSync(model.path)) {
    log(` SKIP: Model file not found!`);
    return null;
  }

  const baseline = await phase0_bootTest(model);
  if (!baseline) {
    log(` SKIP: Cannot boot at 256K!`);
    return null;
  }

  const offload = await phase1_gpuOffload(model, baseline);
  if (!offload) return baseline;

  // Each remaining phase starts from the winner of the one before it.
  let best = offload;
  for (const phase of [phase2_threads, phase3_batch, phase4_kvcache, phase5_final]) {
    best = await phase(model, best);
  }
  return best;
}
|
||||
|
||||
/**
 * Benchmark every model in MODELS, rank the per-model winners by average
 * tokens/second, and write the raw results (scripts/dual_gpu_results.json)
 * and a text summary (scripts/dual_gpu_summary.txt).
 * @returns {Promise<void>}
 */
async function main() {
  const startTime = Date.now();
  const rule70 = "=".repeat(70);
  const rule60 = "=".repeat(60);

  log(rule70);
  log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK");
  log(" 2x RTX 3060 (24GB Total) | 256K Context");
  log(` Models: ${MODELS.length}`);
  log(` Started: ${new Date().toISOString()}`);
  log(rule70);

  for (const g of getVramAll()) {
    log(` GPU ${g.gpu}: ${g.used}/${g.total} MiB used`);
  }

  const winners = [];
  for (const [index, model] of MODELS.entries()) {
    log(`\n${rule70}`);
    log(` STARTING MODEL ${index + 1}/${MODELS.length}: ${model.name}`);
    log(rule70);

    const winner = await runModelBenchmark(model);
    if (winner) winners.push(winner);

    // Save intermediate results after every model.
    writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL_RESULTS, null, 2));
    log(` Intermediate saved (${ALL_RESULTS.length} configs tested)`);
  }

  // ─── Grand Final ───────────────────────────────────────────
  const elapsed = (Date.now() - startTime) / 60000; // minutes

  log(`\n${rule70}`);
  log(` GRAND FINAL COMPARISON`);
  log(` Total time: ${elapsed.toFixed(1)} minutes`);
  log(` Configs tested: ${ALL_RESULTS.length}`);
  log(rule70);

  if (winners.length === 0) {
    log(" No models ran at 256K!");
    return;
  }

  // Highest average throughput first.
  winners.sort((a, b) => b.avg_tps - a.avg_tps);
  const medals = ["🥇", "🥈", "🥉", " "];

  const lines = [];
  lines.push(`Dual-GPU Benchmark Results — ${new Date().toISOString()}`);
  lines.push(`Hardware: 2x RTX 3060 12GB | Context: 256K`);
  lines.push(`Configs tested: ${ALL_RESULTS.length} | Time: ${elapsed.toFixed(1)} min`);
  lines.push("", rule60, " RANKING (by AVG t/s)", rule60);

  winners.forEach((w, rank) => {
    const p = w.params;
    lines.push("");
    lines.push(` ${medals[rank] || " "} #${rank + 1}: ${w.model}`);
    lines.push(` AVG: ${w.avg_tps.toFixed(2)} t/s | BEST: ${w.best_tps.toFixed(2)} t/s`);
    lines.push(` Boot: ${w.boot_time.toFixed(0)}s`);
    lines.push(` ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b}`);
    lines.push(` ctk=${p.ctk} ctv=${p.ctv}`);
    if (p.cpuMoe) {
      lines.push(` --cpu-moe`);
    } else if ((p.nCpuMoe || 0) > 0) {
      lines.push(` --n-cpu-moe ${p.nCpuMoe}`);
    }
  });

  const champ = winners[0];
  const cp = champ.params;
  lines.push("", rule60);
  lines.push(` ★ CHAMPION: ${champ.model}`);
  lines.push(` ${champ.avg_tps.toFixed(2)} t/s average`);
  lines.push(rule60);

  // Build recommended command
  const champModel = MODELS.find((m) => m.name === champ.model);
  const cmdParts = [];
  cmdParts.push(`llama-server --model ${champModel.path}`);
  cmdParts.push(`-ngl ${cp.ngl} -c ${CONTEXT}`);
  cmdParts.push(`-t ${cp.t} -tb ${cp.t}`);
  cmdParts.push(`-ub ${cp.ub} -b ${cp.b}`);
  cmdParts.push(`-fa on`);
  cmdParts.push(`--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`);
  cmdParts.push(`--prio ${cp.prio || 3} --poll 50`);
  cmdParts.push(`--mlock`);
  if (cp.cpuMoe) {
    cmdParts.push("--cpu-moe");
  } else if ((cp.nCpuMoe || 0) > 0) {
    cmdParts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
  }
  if (cp.nommap) {
    cmdParts.push("--no-mmap");
  }
  cmdParts.push("--port 8000 --host 0.0.0.0");

  lines.push("", " Recommended command:");
  lines.push(` ${cmdParts.join(" ")}`);

  const summary = lines.join("\n");
  console.log(summary);
  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL_RESULTS, null, 2));

  log(`\n Results: scripts/dual_gpu_results.json`);
  log(` Summary: scripts/dual_gpu_summary.txt`);
  log(` DONE!`);

  await killServer();
}
|
||||
|
||||
// Script entry point: run main() and convert any rejection into a
// logged error plus a non-zero exit status.
main().catch(function onFatal(err) {
  console.error("Fatal error:", err);
  process.exit(1);
});
|
||||
644
scripts/_archive/tuning/dual_gpu_benchmark.py
Normal file
644
scripts/_archive/tuning/dual_gpu_benchmark.py
Normal file
@@ -0,0 +1,644 @@
|
||||
"""
|
||||
Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
|
||||
==========================================================
|
||||
Tests 4 models across multiple parameter configurations to find
|
||||
the absolute best model + settings for 256K context coding agent.
|
||||
|
||||
Models:
|
||||
1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB)
|
||||
2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB)
|
||||
3. Gemma4 26B-A4B Q4_K_M (~15.6 GB)
|
||||
4. Gemma4 26B-A4B MXFP4_MOE (~15.5 GB)
|
||||
|
||||
Test Phases (per model):
|
||||
Phase 0: Basic dual-GPU startup test (can it even boot at 256K?)
|
||||
Phase 1: GPU layer + MoE offload strategy sweep
|
||||
Phase 2: CPU thread sweep (carry best from P1)
|
||||
Phase 3: Batch size sweep (carry best from P1+P2)
|
||||
Phase 4: KV cache type sweep (carry best from P1+P2+P3)
|
||||
Phase 5: Final verification (5 runs)
|
||||
|
||||
Output: scripts/dual_gpu_results.json (all raw data)
|
||||
scripts/dual_gpu_summary.txt (human-readable winner)
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
import datetime
|
||||
|
||||
# Switch stdout to UTF-8 when the stream supports it; the script prints
# non-ASCII characters. Streams without reconfigure() are left untouched.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except Exception:
    pass

# ─── Configuration ───────────────────────────────────────────────
BASE_URL = "http://127.0.0.1:8000"
LLAMA_SERVER = "llama_bin_run\\llama-server.exe"
CONTEXT = 256 * 1024  # 256K tokens
BENCHMARK_RUNS = 3
BENCHMARK_TOKENS = 200
SERVER_TIMEOUT = 300  # seconds to wait for server startup

# One entry per model under test. Each row below is
# (name, path, type, quant, total_layers); is_mxfp4 is derived from quant.
MODELS = [
    {
        "name": name,
        "path": path,
        "type": family,
        "quant": quant,
        "is_mxfp4": quant == "MXFP4_MOE",
        "total_layers": layers,
    }
    for name, path, family, quant, layers in (
        ("Qwen3.5-35B-A3B Q4_K_M",
         "models\\Qwen3.5-35B-A3B-Q4_K_M.gguf", "qwen", "Q4_K_M", 64),
        ("Qwen3.5-35B-A3B MXFP4_MOE",
         "models\\Qwen3.5-35B-A3B-MXFP4_MOE.gguf", "qwen", "MXFP4_MOE", 64),
        ("Gemma4 26B-A4B Q4_K_M",
         "models\\gemma-4-26B-A4B-it-Q4_K_M.gguf", "gemma4", "Q4_K_M", 30),
        ("Gemma4 26B-A4B MXFP4_MOE",
         "models\\gemma-4-26B-A4B-it-MXFP4_MOE.gguf", "gemma4", "MXFP4_MOE", 30),
    )
]

# Every result dict produced by the benchmark is appended here and later
# dumped to scripts/dual_gpu_results.json.
ALL_RESULTS = []
|
||||
|
||||
|
||||
# ─── Utility Functions ──────────────────────────────────────────
|
||||
def log(msg):
    """Print *msg* to stdout prefixed with the local time as ``[HH:MM:SS]``.

    The line is flushed immediately.
    """
    now = datetime.datetime.now()
    print(f"[{now:%H:%M:%S}] {msg}", flush=True)
|
||||
|
||||
|
||||
def kill_server():
    """Force-kill every ``llama-server.exe`` process via ``taskkill``, then sleep 5 seconds.

    taskkill's output is captured (and discarded) and its exit status is
    ignored, so calling this when no server is running is harmless.
    """
    taskkill = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    time.sleep(5)
|
||||
|
||||
|
||||
def get_vram_all():
    """Query per-GPU memory usage through ``nvidia-smi``.

    Returns:
        A list with one dict per GPU row reported by ``nvidia-smi``, each of
        the form ``{"gpu": index, "used": used, "total": total}`` holding the
        integers printed for ``index,memory.used,memory.total`` with
        ``--format=csv,noheader,nounits``. The list is empty when
        ``nvidia-smi`` cannot be run, times out, or prints nothing usable.
    """
    try:
        proc = subprocess.run(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # Missing binary, timeout, or undecodable output: this is a
        # best-effort probe, so report "no data" instead of failing the run.
        return []

    gpus = []
    for row in proc.stdout.strip().splitlines():
        fields = [field.strip() for field in row.split(",")]
        if len(fields) < 3:
            continue
        try:
            index, used, total = (int(value) for value in fields[:3])
        except ValueError:
            # A non-numeric field (e.g. "[N/A]") invalidates only this row;
            # the remaining GPUs are still reported.
            continue
        gpus.append({"gpu": index, "used": used, "total": total})
    return gpus
|
||||
|
||||
|
||||
def build_cmd(model_path, ngl, t, ub, b, ctk, ctv,
              cpu_moe=False, n_cpu_moe=0, prio=3, nommap=False):
    """Assemble the llama-server argument list for one benchmark configuration.

    Args:
        model_path: Value passed to ``--model``.
        ngl: Value for ``-ngl``.
        t: Value used for both ``-t`` and ``-tb``.
        ub: Value for ``-ub``.
        b: Value for ``-b``.
        ctk: Value for ``--cache-type-k``.
        ctv: Value for ``--cache-type-v``.
        cpu_moe: When true, append ``--cpu-moe``.
        n_cpu_moe: When ``cpu_moe`` is false and this is > 0, append
            ``--n-cpu-moe <n_cpu_moe>``.
        prio: Value for ``--prio``.
        nommap: When true, append ``--no-mmap``.

    Returns:
        The command as a list of strings, starting with ``LLAMA_SERVER``.
        Context size (``CONTEXT``), ``-np 1``, ``-fa on``, ``--poll 50``,
        ``--mlock``, port 8000 and host 0.0.0.0 are fixed.
    """
    argv = [LLAMA_SERVER, "--model", model_path]
    argv += ["-ngl", str(ngl), "-c", str(CONTEXT), "-np", "1", "-fa", "on"]
    argv += ["--cache-type-k", ctk, "--cache-type-v", ctv]
    argv += ["-ub", str(ub), "-b", str(b)]
    argv += ["-t", str(t), "-tb", str(t)]
    argv += ["--prio", str(prio), "--poll", "50", "--mlock"]
    argv += ["--port", "8000", "--host", "0.0.0.0"]

    # MoE offloading: --cpu-moe takes precedence over --n-cpu-moe.
    if cpu_moe:
        argv.append("--cpu-moe")
    elif n_cpu_moe > 0:
        argv += ["--n-cpu-moe", str(n_cpu_moe)]

    if nommap:
        argv.append("--no-mmap")
    return argv
|
||||
|
||||
|
||||
def start_server(model_path, **kwargs):
    """Launch llama-server for *model_path* and return the ``Popen`` handle.

    Args:
        model_path: Model file forwarded to ``build_cmd``.
        **kwargs: Remaining ``build_cmd`` parameters (ngl, t, ub, b, ctk,
            ctv, cpu_moe, n_cpu_moe, prio, nommap).

    Returns:
        The ``subprocess.Popen`` object of the started server. The process
        is started in the current working directory.
    """
    cmd = build_cmd(model_path, **kwargs)
    log(f" CMD: {' '.join(cmd[-20:])}")  # show last 20 args
    # Nothing in this script ever reads the server's output. Routing it to a
    # pipe that is never drained can block the child once the pipe buffer
    # fills, so the output is discarded instead.
    return subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
        cwd=os.getcwd(),
    )
|
||||
|
||||
|
||||
def wait_for_server(timeout=SERVER_TIMEOUT):
    """Poll ``{BASE_URL}/health`` until it reports ``{"status": "ok"}``.

    The endpoint is requested with a 3-second timeout; any error counts as
    "not ready yet" and is followed by a 3-second sleep.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        ``(True, seconds_waited)`` once the server is healthy, otherwise
        ``(False, timeout)`` after the deadline passes.
    """
    health_url = f"{BASE_URL}/health"
    began = time.time()
    while True:
        if not (time.time() - began < timeout):
            return False, timeout
        try:
            with urllib.request.urlopen(health_url, timeout=3) as reply:
                payload = json.loads(reply.read())
            ready = payload.get("status") == "ok"
        except Exception:
            ready = False
        if ready:
            return True, time.time() - began
        time.sleep(3)
|
||||
|
||||
|
||||
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Send one chat-completion request and measure generation throughput.

    Args:
        max_tokens: ``max_tokens`` value sent with the request.

    Returns:
        A dict with ``tps`` (completion tokens divided by wall-clock request
        time, 0 when no time elapsed), ``completion_tokens``,
        ``prompt_tokens`` (both taken from the response's ``usage`` object,
        defaulting to 0) and ``elapsed`` in seconds.

    Raises:
        Whatever ``urllib.request.urlopen`` / ``json.loads`` raise on a
        failed or malformed response (600-second request timeout).
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user",
                      "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    t0 = time.time()
    with urllib.request.urlopen(request, timeout=600) as resp:
        reply = json.loads(resp.read())
    duration = time.time() - t0

    usage = reply.get("usage", {})
    generated = usage.get("completion_tokens", 0)
    if duration > 0:
        rate = generated / duration
    else:
        rate = 0
    return {
        "tps": rate,
        "completion_tokens": generated,
        "prompt_tokens": usage.get("prompt_tokens", 0),
        "elapsed": duration,
    }
|
||||
|
||||
|
||||
def test_config(model_info, label, **kwargs):
    """Boot the server with one configuration and benchmark it.

    Kills any running server, starts a new one for ``model_info["path"]``
    with *kwargs*, waits for it to become healthy, performs one warmup
    request and then ``BENCHMARK_RUNS`` timed requests.

    Args:
        model_info: Model dict providing ``path``, ``name`` and ``quant``.
        label: Short description used in log lines and stored in the result.
        **kwargs: Server parameters forwarded to ``start_server`` and stored
            verbatim as the result's ``params``.

    Returns:
        The result dict (also appended to ``ALL_RESULTS``) with ``model``,
        ``quant``, ``label``, ``avg_tps``, ``best_tps``, ``boot_time``,
        ``vram`` and ``params``; or ``None`` if the server did not start in
        time or every timed run failed.
    """
    kill_server()
    log(f" [{label}] Starting server...")

    proc = start_server(model_info["path"], **kwargs)
    try:
        ok, boot_time = wait_for_server()
        if not ok:
            log(f" [{label}] FAILED to start (timeout {SERVER_TIMEOUT}s)")
            return None

        vram = get_vram_all()
        vram_str = " | ".join(f"GPU{g['gpu']}:{g['used']}/{g['total']}MiB" for g in vram)
        log(f" [{label}] Boot: {boot_time:.0f}s | VRAM: {vram_str}")

        # Warmup: the result is discarded and a failure is not fatal, but it
        # is reported so a broken server is visible in the log.
        try:
            run_benchmark(max_tokens=20)
        except Exception as e:
            log(f" [{label}] Warmup failed (ignored): {e}")

        # Benchmark runs
        speeds = []
        for i in range(BENCHMARK_RUNS):
            try:
                r = run_benchmark()
                speeds.append(r["tps"])
                log(f" Run {i+1}: {r['tps']:.2f} t/s")
            except Exception as e:
                log(f" Run {i+1}: ERROR ({e})")
    finally:
        # Always stop the server, including when something above raises
        # (e.g. Ctrl+C), and reap it so no killed-but-unwaited child remains.
        proc.kill()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            pass

    if not speeds:
        log(f" [{label}] ALL BENCHMARK RUNS FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    log(f" [{label}] => AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

    result = {
        "model": model_info["name"],
        "quant": model_info["quant"],
        "label": label,
        "avg_tps": round(avg, 2),
        "best_tps": round(best, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": kwargs,
    }
    ALL_RESULTS.append(result)
    return result
|
||||
|
||||
|
||||
# ─── Phase Runners ───────────────────────────────────────────────
|
||||
|
||||
def phase0_boot_test(model):
    """Phase 0: find any configuration that boots and benchmarks at all.

    Tries, in order and stopping at the first success: all layers on GPU
    (``ngl=999``), the same with ``cpu_moe=True``, then half of
    ``model["total_layers"]`` on GPU.

    Returns:
        The first successful ``test_config`` result, or ``None`` if all
        three attempts failed.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 0: Boot Test — {model['name']}")
    log(f"{rule}")

    # Settings shared by every boot attempt.
    shared = {"t": 6, "ub": 512, "b": 2048, "ctk": "q4_0", "ctv": "q4_0"}

    # Baseline: -ngl 999 (all layers to GPU).
    outcome = test_config(model, "boot-ngl999", ngl=999, **shared)

    if not outcome:
        log(" Full GPU failed, trying with --cpu-moe...")
        outcome = test_config(model, "boot-cpumoe", ngl=999, **shared, cpu_moe=True)

    if not outcome:
        # Extreme fallback: fewer layers on the GPU.
        log(" --cpu-moe also failed, trying reduced layers...")
        outcome = test_config(
            model, "boot-ngl-half",
            ngl=model["total_layers"] // 2, **shared,
        )

    return outcome
|
||||
|
||||
|
||||
def phase1_gpu_offload(model, baseline):
    """Phase 1: compare MoE offload strategies with all layers on GPU.

    Strategy A tests ``cpu_moe`` True and False (skipping whichever one the
    Phase 0 baseline already covered); Strategy B sweeps ``n_cpu_moe`` over
    0, 5, 10, 15, 20, skipping values above ``model["total_layers"]``.

    Args:
        model: Model dict.
        baseline: Phase 0 result, included among the candidates when truthy.

    Returns:
        The candidate with the highest ``avg_tps``, or ``None`` when no
        configuration produced a result.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 1: GPU Offload Strategy — {model['name']}")
    log(f"{rule}")

    fixed = {"ngl": 999, "t": 6, "ub": 512, "b": 2048, "ctk": "q4_0", "ctv": "q4_0"}
    candidates = [baseline] if baseline else []
    layer_count = model["total_layers"]

    def covered_by_baseline(cpu_moe):
        # True when the Phase 0 boot test already measured this exact setting.
        if not baseline:
            return False
        if baseline["label"] not in ("boot-ngl999", "boot-cpumoe"):
            return False
        return baseline["params"].get("cpu_moe", False) == cpu_moe

    # Strategy A: all GPU, with and without --cpu-moe.
    for cpu_moe in (True, False):
        if covered_by_baseline(cpu_moe):
            continue
        outcome = test_config(
            model, f"ngl=999 cpu_moe={cpu_moe}", **fixed, cpu_moe=cpu_moe,
        )
        if outcome:
            candidates.append(outcome)

    # Strategy B: n-cpu-moe sweep (selective expert offload).
    for n in (0, 5, 10, 15, 20):
        if n > layer_count:
            continue
        outcome = test_config(model, f"n-cpu-moe={n}", **fixed, n_cpu_moe=n)
        if outcome:
            candidates.append(outcome)

    if not candidates:
        log(" PHASE 1: No configuration worked!")
        return None

    best = max(candidates, key=lambda c: c["avg_tps"])
    log(f"\n ★ Phase 1 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
|
||||
|
||||
|
||||
def phase2_threads(model, prev_best):
    """Phase 2: sweep the thread count while keeping every other setting.

    Tests t = 2, 4, 6, 8, 10, 12, skipping the value already used by
    *prev_best* (6 if it has none).

    Returns:
        Whichever of *prev_best* and the new results has the highest
        ``avg_tps``.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 2: CPU Thread Sweep — {model['name']}")
    log(f"{rule}")

    p = prev_best["params"]
    already_tested = p.get("t", 6)
    candidates = [prev_best]

    for threads in (2, 4, 6, 8, 10, 12):
        if threads == already_tested:
            continue
        settings = {
            "ngl": p["ngl"],
            "t": threads,
            "ub": p["ub"],
            "b": p["b"],
            "ctk": p["ctk"],
            "ctv": p["ctv"],
            "cpu_moe": p.get("cpu_moe", False),
            "n_cpu_moe": p.get("n_cpu_moe", 0),
        }
        outcome = test_config(model, f"t={threads}", **settings)
        if outcome:
            candidates.append(outcome)

    best = max(candidates, key=lambda c: c["avg_tps"])
    log(f"\n ★ Phase 2 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
|
||||
|
||||
|
||||
def phase3_batch(model, prev_best):
    """Phase 3: sweep (ub, b) batch-size pairs with the other settings fixed.

    The pair already used by *prev_best* is skipped.

    Returns:
        Whichever of *prev_best* and the new results has the highest
        ``avg_tps``.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 3: Batch Size Sweep — {model['name']}")
    log(f"{rule}")

    p = prev_best["params"]
    best_t = p["t"]
    candidates = [prev_best]

    batch_pairs = (
        (128, 512), (256, 1024), (256, 2048),
        (512, 1024), (512, 2048), (512, 4096),
        (1024, 2048), (1024, 4096),
    )
    for ub, b in batch_pairs:
        if (ub, b) == (p["ub"], p["b"]):
            continue
        settings = {
            "ngl": p["ngl"],
            "t": best_t,
            "ub": ub,
            "b": b,
            "ctk": p["ctk"],
            "ctv": p["ctv"],
            "cpu_moe": p.get("cpu_moe", False),
            "n_cpu_moe": p.get("n_cpu_moe", 0),
        }
        outcome = test_config(model, f"ub={ub} b={b}", **settings)
        if outcome:
            candidates.append(outcome)

    best = max(candidates, key=lambda c: c["avg_tps"])
    log(f"\n ★ Phase 3 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
|
||||
|
||||
|
||||
def phase4_kvcache(model, prev_best):
    """Phase 4: sweep KV cache type pairs (ctk, ctv) with other settings fixed.

    The pair already used by *prev_best* is skipped.

    Returns:
        Whichever of *prev_best* and the new results has the highest
        ``avg_tps``.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 4: KV Cache Type Sweep — {model['name']}")
    log(f"{rule}")

    p = prev_best["params"]
    candidates = [prev_best]

    cache_pairs = (
        ("q4_0", "q4_0"), ("q8_0", "q8_0"),
        ("q4_0", "q8_0"), ("f16", "f16"),
    )
    for ctk, ctv in cache_pairs:
        if (ctk, ctv) == (p["ctk"], p["ctv"]):
            continue
        settings = {
            "ngl": p["ngl"],
            "t": p["t"],
            "ub": p["ub"],
            "b": p["b"],
            "ctk": ctk,
            "ctv": ctv,
            "cpu_moe": p.get("cpu_moe", False),
            "n_cpu_moe": p.get("n_cpu_moe", 0),
        }
        outcome = test_config(model, f"kv={ctk}/{ctv}", **settings)
        if outcome:
            candidates.append(outcome)

    best = max(candidates, key=lambda c: c["avg_tps"])
    log(f"\n ★ Phase 4 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
|
||||
|
||||
|
||||
def phase5_final(model, prev_best):
    """Phase 5: re-run the winning configuration with 5 timed requests.

    Args:
        model: Model dict providing ``path``, ``name`` and ``quant``.
        prev_best: Result whose ``params`` are used to start the server.

    Returns:
        A new result dict labelled ``FINAL-<model name>`` (also appended to
        ``ALL_RESULTS``), or *prev_best* unchanged when the server fails to
        start or every final run fails.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 5: Final Verification (5 runs) — {model['name']}")
    log(f"{'='*70}")

    p = prev_best["params"]
    kill_server()
    proc = start_server(model["path"], **p)
    try:
        ok, boot_time = wait_for_server()
        if not ok:
            log(" FAILED to start for final verification!")
            return prev_best

        vram = get_vram_all()

        # Warmup: result discarded; a failure is reported but not fatal.
        try:
            run_benchmark(max_tokens=20)
        except Exception as e:
            log(f" Warmup failed (ignored): {e}")

        speeds = []
        for i in range(5):
            try:
                r = run_benchmark()
                speeds.append(r["tps"])
                log(f" Final Run {i+1}: {r['tps']:.2f} t/s")
            except Exception as e:
                log(f" Final Run {i+1}: ERROR ({e})")
    finally:
        # Always stop the server, including when something above raises,
        # and reap it so no killed-but-unwaited child remains.
        proc.kill()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            pass

    if not speeds:
        return prev_best

    avg = sum(speeds) / len(speeds)
    best_tps = max(speeds)
    log(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best_tps:.2f} t/s")

    final = {
        "model": model["name"],
        "quant": model["quant"],
        "label": f"FINAL-{model['name']}",
        "avg_tps": round(avg, 2),
        "best_tps": round(best_tps, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": p,
    }
    ALL_RESULTS.append(final)
    return final
|
||||
|
||||
|
||||
# ─── Main ────────────────────────────────────────────────────────
|
||||
|
||||
def run_full_benchmark_for_model(model):
    """Run all phases (0 through 5) for a single model.

    Args:
        model: Model dict providing at least ``name`` and ``path``.

    Returns:
        The Phase 5 result; the Phase 0 baseline if Phase 1 yields nothing;
        or ``None`` when the model file is missing or the model cannot boot.
    """
    banner = "#" * 70
    log(f"\n{banner}")
    log(f" MODEL: {model['name']}")
    log(f" File: {model['path']}")

    # Check the file BEFORE asking for its size: os.path.getsize raises
    # FileNotFoundError on a missing file, which would abort the whole
    # benchmark instead of skipping this one model.
    if not os.path.exists(model["path"]):
        log(f"{banner}")
        log(" SKIP: Model file not found!")
        return None

    log(f" Size: {os.path.getsize(model['path'])/1024**3:.2f} GB")
    log(f"{banner}")

    # Phase 0: Can it boot?
    baseline = phase0_boot_test(model)
    if not baseline:
        log(f" SKIP: {model['name']} cannot boot at 256K context!")
        return None

    # Phase 1: GPU offload strategy
    best = phase1_gpu_offload(model, baseline)
    if not best:
        return baseline

    # Phases 2-4 each carry the previous winner forward.
    best = phase2_threads(model, best)   # CPU threads
    best = phase3_batch(model, best)     # batch sizes
    best = phase4_kvcache(model, best)   # KV cache types

    # Phase 5: Final verification
    return phase5_final(model, best)
|
||||
|
||||
|
||||
def main():
    """Benchmark every model in ``MODELS`` and write the ranking.

    After each model the raw results are saved to
    ``scripts/dual_gpu_results.json``. At the end the per-model winners are
    ranked by ``avg_tps``; the ranking plus a recommended llama-server
    command for the fastest model is printed and written to
    ``scripts/dual_gpu_summary.txt``. Returns early (after logging) when no
    model produced a result.
    """
    began = time.time()
    results_path = "scripts/dual_gpu_results.json"
    summary_path = "scripts/dual_gpu_summary.txt"
    wide_rule = "=" * 70
    narrow_rule = "=" * 60

    def dump_results():
        # Overwrite the JSON file with everything measured so far.
        with open(results_path, "w") as fh:
            json.dump(ALL_RESULTS, fh, indent=2, default=str)

    log(wide_rule)
    log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK")
    log(" 2x RTX 3060 (24GB Total) | 256K Context")
    log(f" Models: {len(MODELS)}")
    log(f" Started: {datetime.datetime.now().isoformat()}")
    log(wide_rule)

    # Show GPU info
    for g in get_vram_all():
        log(f" GPU {g['gpu']}: {g['used']}/{g['total']} MiB used")

    # Run benchmarks for each model
    model_winners = []
    for position, model in enumerate(MODELS, start=1):
        log(f"\n{wide_rule}")
        log(f" STARTING MODEL {position}/{len(MODELS)}: {model['name']}")
        log(f"{wide_rule}")

        winner = run_full_benchmark_for_model(model)
        if winner:
            model_winners.append(winner)

        # Save intermediate results
        dump_results()
        log(f" Intermediate results saved ({len(ALL_RESULTS)} configs tested)")

    # ─── Grand Final Comparison ──────────────────────────────────
    elapsed = (time.time() - began) / 60

    log(f"\n{wide_rule}")
    log(" GRAND FINAL COMPARISON")
    log(f" Total time: {elapsed:.1f} minutes")
    log(f" Configs tested: {len(ALL_RESULTS)}")
    log(f"{wide_rule}")

    if not model_winners:
        log(" No models were able to run at 256K context!")
        return

    # Fastest average first.
    model_winners.sort(key=lambda w: w["avg_tps"], reverse=True)

    report = [
        f"Dual-GPU Benchmark Results — {datetime.datetime.now().isoformat()}",
        "Hardware: 2x RTX 3060 12GB | Context: 256K",
        f"Total configs tested: {len(ALL_RESULTS)}",
        f"Total time: {elapsed:.1f} minutes",
        "",
        narrow_rule,
        " RANKING (by AVG t/s)",
        narrow_rule,
    ]

    podium = ("🥇", "🥈", "🥉")
    for rank, w in enumerate(model_winners, 1):
        medal = podium[rank - 1] if rank <= len(podium) else " "
        p = w["params"]
        report += [
            f"\n {medal} #{rank}: {w['model']}",
            f" AVG: {w['avg_tps']:.2f} t/s | BEST: {w['best_tps']:.2f} t/s",
            f" Boot: {w['boot_time']:.0f}s",
            f" ngl={p['ngl']} t={p['t']} ub={p['ub']} b={p['b']}",
            f" ctk={p['ctk']} ctv={p['ctv']}",
        ]
        if p.get("cpu_moe"):
            report.append(" --cpu-moe")
        elif p.get("n_cpu_moe", 0) > 0:
            report.append(f" --n-cpu-moe {p['n_cpu_moe']}")

    champion = model_winners[0]
    report += [
        f"\n{narrow_rule}",
        f" ★ CHAMPION: {champion['model']}",
        f" {champion['avg_tps']:.2f} t/s average",
        f"{narrow_rule}",
    ]

    # Build recommended command for the champion's model file.
    p = champion["params"]
    model_names = [m["name"] for m in MODELS]
    champion_path = MODELS[model_names.index(champion["model"])]["path"]
    cmd_parts = [
        f"llama-server --model {champion_path}",
        f"-ngl {p['ngl']} -c {CONTEXT}",
        f"-t {p['t']} -tb {p['t']}",
        f"-ub {p['ub']} -b {p['b']}",
        "-fa on",
        f"--cache-type-k {p['ctk']} --cache-type-v {p['ctv']}",
        f"--prio {p.get('prio', 3)} --poll 50",
        "--mlock",
    ]
    if p.get("cpu_moe"):
        cmd_parts.append("--cpu-moe")
    elif p.get("n_cpu_moe", 0) > 0:
        cmd_parts.append(f"--n-cpu-moe {p['n_cpu_moe']}")
    if p.get("nommap"):
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")

    report.append("\n Recommended command:")
    report.append(f" {' '.join(cmd_parts)}")

    summary = "\n".join(report)
    print(summary)

    with open(summary_path, "w", encoding="utf-8") as fh:
        fh.write(summary)
    dump_results()

    log(f"\n Results: {results_path}")
    log(f" Summary: {summary_path}")
    log(" DONE!")

    kill_server()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
330
scripts/_archive/tuning/dual_gpu_benchmark_v2.mjs
Normal file
330
scripts/_archive/tuning/dual_gpu_benchmark_v2.mjs
Normal file
@@ -0,0 +1,330 @@
|
||||
/**
|
||||
* Dual-GPU (2x RTX 3060 12GB, 24GB total) Smart Model Benchmark v2
|
||||
* =====================================================
|
||||
* Informed by VRAM analysis — tests models in optimal order.
|
||||
*
|
||||
* Key insights applied:
|
||||
* - Gemma4 fits entirely in 24GB GPU (KV cache ~0.18 GB with SWA)
|
||||
* - Qwen3.5 is tight (~22.5-22.9 GB needed) — try full GPU first
|
||||
* - Skip configs known to fail, minimize wasted time
|
||||
*
|
||||
* Run: node scripts/dual_gpu_benchmark_v2.mjs
|
||||
* Results: scripts/dual_gpu_results.json + scripts/dual_gpu_summary.txt
|
||||
*/
|
||||
|
||||
import { spawn, execSync } from "child_process";
|
||||
import { writeFileSync, existsSync, statSync } from "fs";
|
||||
|
||||
// ─── Configuration ─────────────────────────────────────────────
const BASE_URL = "http://127.0.0.1:8000";
const LLAMA = "llama_bin_run\\llama-server.exe";
const CTX = 256 * 1024;           // 256K context
const RUNS = 3;                   // timed requests per configuration
const TOKENS = 200;               // max_tokens per timed request
const BOOT_TIMEOUT = 300 * 1000;  // ms to wait for the server to become healthy

// Models ordered: smallest first (most likely to succeed fully on GPU).
// fitsGPU is `true` or "maybe"; the per-model notes repeat the author's
// VRAM estimates (weights + KV cache + ~1 GB overhead vs ~23 GB usable).
const MODELS = [
  {
    name: "Gemma4-26B MXFP4_MOE",
    path: "models\\gemma-4-26B-A4B-it-MXFP4_MOE.gguf",
    quant: "MXFP4_MOE",
    fitsGPU: true,    // ~15.5 GB weights + ~0.18 GB KV: well under the limit
  },
  {
    name: "Gemma4-26B Q4_K_M",
    path: "models\\gemma-4-26B-A4B-it-Q4_K_M.gguf",
    quant: "Q4_K_M",
    fitsGPU: true,    // ~15.6 GB weights + ~0.18 GB KV: well under the limit
  },
  {
    name: "Qwen3.5-35B MXFP4_MOE",
    path: "models\\Qwen3.5-35B-A3B-MXFP4_MOE.gguf",
    quant: "MXFP4_MOE",
    fitsGPU: "maybe", // ~20.1 GB weights + ~1.41 GB KV: tight
  },
  {
    name: "Qwen3.5-35B Q4_K_M",
    path: "models\\Qwen3.5-35B-A3B-Q4_K_M.gguf",
    quant: "Q4_K_M",
    fitsGPU: "maybe", // ~20.5 GB weights + ~1.41 GB KV: very tight
  },
];

// Every result object produced so far (dumped to dual_gpu_results.json).
const ALL = [];
// Handle of the llama-server child started by startServer(), or null.
let currentProc = null;
|
||||
|
||||
// ─── Utilities ─────────────────────────────────────────────────
|
||||
/** Print `m` to the console prefixed with the current time in ko-KR 24-hour format. */
function log(m) {
  const stamp = new Date().toLocaleTimeString("ko-KR", { hour12: false });
  console.log(`[${stamp}] ${m}`);
}

/** Return a promise that resolves after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||
|
||||
/**
 * Stop the tracked server process (if any), force-kill every
 * llama-server.exe via taskkill, then wait 5 seconds.
 * Errors from either kill attempt are ignored.
 */
async function kill() {
  const tracked = currentProc;
  if (tracked) {
    try {
      tracked.kill("SIGKILL");
    } catch {
      // process may already be gone
    }
    currentProc = null;
  }

  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // taskkill fails when nothing matches; that is fine
  }

  await sleep(5000);
}
|
||||
|
||||
/**
 * Query per-GPU memory usage through nvidia-smi.
 *
 * @returns {{gpu: number, used: number, total: number}[]} One entry per
 *   well-formed row of `index,memory.used,memory.total`; an empty array
 *   when nvidia-smi cannot be run, times out (5 s) or prints nothing usable.
 */
function vram() {
  let out;
  try {
    out = execSync(
      "nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits",
      { encoding: "utf-8", timeout: 5000 },
    );
  } catch {
    return [];
  }

  return out
    .trim()
    .split("\n")
    .map((line) => line.split(",").map((s) => Number.parseInt(s, 10)))
    // Drop blank or non-numeric rows (e.g. "[N/A]") instead of reporting
    // NaN/undefined entries that would later be logged and saved.
    .filter((cols) => cols.length >= 3 && cols.slice(0, 3).every(Number.isFinite))
    .map(([gpu, used, total]) => ({ gpu, used, total }));
}
|
||||
|
||||
/**
 * Spawn llama-server for `modelPath` with the settings in `p` and remember
 * the child in `currentProc`.
 *
 * @param {string} modelPath Value passed to --model.
 * @param {object} p Settings: ngl, t (used for both -t and -tb), ub, b,
 *   ctk, ctv, and optionally prio (default 3), cpuMoe, nCpuMoe, nommap.
 * @returns {import("child_process").ChildProcess} The spawned process.
 */
function startServer(modelPath, p) {
  const args = [
    "--model", modelPath, "-ngl", String(p.ngl),
    "-c", String(CTX), "-np", "1", "-fa", "on",
    "--cache-type-k", p.ctk, "--cache-type-v", p.ctv,
    "-ub", String(p.ub), "-b", String(p.b),
    "-t", String(p.t), "-tb", String(p.t),
    "--prio", String(p.prio || 3), "--poll", "50", "--mlock",
    "--port", "8000", "--host", "0.0.0.0",
  ];
  // --cpu-moe takes precedence over --n-cpu-moe.
  if (p.cpuMoe) args.push("--cpu-moe");
  else if ((p.nCpuMoe || 0) > 0) args.push("--n-cpu-moe", String(p.nCpuMoe));
  if (p.nommap) args.push("--no-mmap");

  currentProc = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });

  // Nothing reads the server's output. An unconsumed pipe applies
  // backpressure once its buffer fills, which can stall the child, so
  // switch both streams to flowing mode and discard the data.
  currentProc.stdout?.resume();
  currentProc.stderr?.resume();

  return currentProc;
}
|
||||
|
||||
/**
 * Poll `${BASE_URL}/health` every 3 seconds until it reports status "ok".
 *
 * @param {number} timeout Maximum time to wait, in milliseconds.
 * @returns {Promise<{ok: boolean, boot: number}>} `ok` tells whether the
 *   server became healthy; `boot` is the time waited in seconds
 *   (`timeout / 1000` when the deadline was reached).
 */
async function waitReady(timeout = BOOT_TIMEOUT) {
  const t0 = Date.now();
  while (Date.now() - t0 < timeout) {
    // If the server process has already exited (for example it ran out of
    // memory while loading), it will never become healthy: fail now rather
    // than polling for the whole timeout.
    if (currentProc && (currentProc.exitCode !== null || currentProc.signalCode !== null)) {
      return { ok: false, boot: (Date.now() - t0) / 1000 };
    }
    try {
      const r = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const d = await r.json();
      if (d.status === "ok") return { ok: true, boot: (Date.now() - t0) / 1000 };
    } catch {
      // not reachable / not ready yet
    }
    await sleep(3000);
  }
  return { ok: false, boot: timeout / 1000 };
}
|
||||
|
||||
/**
 * Send one chat-completion request and measure generation throughput.
 *
 * @param {number} n max_tokens for the request.
 * @returns {Promise<{tps: number, ct: number, dt: number}>} `ct` is
 *   `usage.completion_tokens` from the response (0 if absent), `dt` the
 *   wall-clock request time in seconds and `tps` their ratio.
 * @throws {Error} When the request fails, times out (600 s) or the server
 *   answers with a non-2xx status.
 */
async function bench(n = TOKENS) {
  const t0 = Date.now();
  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "m",
      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
      max_tokens: n, temperature: 0,
    }),
    signal: AbortSignal.timeout(600_000),
  });
  // fetch() resolves even for 4xx/5xx replies. Without this check an error
  // response would be recorded as a valid "0 t/s" sample and drag down the
  // configuration's average instead of being reported as a failed run.
  if (!r.ok) throw new Error(`HTTP ${r.status} from /v1/chat/completions`);
  const d = await r.json();
  const dt = (Date.now() - t0) / 1000;
  const ct = d.usage?.completion_tokens || 0;
  return { tps: dt > 0 ? ct / dt : 0, ct, dt };
}
|
||||
|
||||
/**
 * Boot the server with one configuration and benchmark it.
 *
 * Kills any running server, starts a new one, waits until it is healthy,
 * sends one warmup request and then RUNS timed requests.
 *
 * @param {object} model Model entry (uses path, name, quant).
 * @param {string} label Description used in log lines and in the result.
 * @param {object} params Server settings, stored as-is in the result.
 * @returns {Promise<object|null>} The result (also pushed onto ALL), or
 *   null when the server did not start or every timed run failed.
 */
async function testConfig(model, label, params) {
  await kill();
  log(` [${label}] Starting...`);
  startServer(model.path, params);

  const ready = await waitReady();
  if (!ready.ok) {
    log(` [${label}] ✗ FAILED (timeout)`);
    await kill();
    return null;
  }
  const boot = ready.boot;

  const gpuMem = vram();
  const gpuMemText = gpuMem.map((g) => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | ");
  log(` [${label}] Boot:${boot.toFixed(0)}s | VRAM: ${gpuMemText}`);

  // Warmup request; its outcome is ignored.
  try {
    await bench(20);
  } catch {}

  const speeds = [];
  for (let run = 1; run <= RUNS; run++) {
    try {
      const sample = await bench();
      speeds.push(sample.tps);
      log(` Run${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run${run}: ERR ${e.message}`);
    }
  }
  await kill();

  if (speeds.length === 0) {
    log(` [${label}] ✗ ALL RUNS FAILED`);
    return null;
  }

  let total = 0;
  for (const s of speeds) total += s;
  const avg = total / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);

  const res = {
    model: model.name,
    quant: model.quant,
    label,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(best.toFixed(2)),
    boot: Number(boot.toFixed(1)),
    vram: gpuMem,
    params,
  };
  ALL.push(res);
  return res;
}
|
||||
|
||||
// Save intermediate results after each test
|
||||
/** Overwrite scripts/dual_gpu_results.json with every result collected so far. */
function saveIntermediate() {
  const json = JSON.stringify(ALL, null, 2);
  writeFileSync("scripts/dual_gpu_results.json", json);
}
|
||||
|
||||
// ─── Smart Phase Runner ────────────────────────────────────────
|
||||
|
||||
/**
 * Tune one model: find a bootable offload setting, then sweep threads,
 * batch sizes and KV cache types (each sweep keeps the fastest result so
 * far), and finally re-measure the winner with 5 runs.
 *
 * Results are saved to disk after every tested configuration.
 *
 * @param {object} model Entry from MODELS.
 * @returns {Promise<object|null>} The "FINAL" result; the best sweep result
 *   if the final verification could not run; or null when the file is
 *   missing or no configuration boots.
 */
async function tuneModel(model) {
  const hashes = "#".repeat(65);
  log(`\n${hashes}`);
  log(` ${model.name} (${model.quant})`);
  if (!existsSync(model.path)) {
    log(" ✗ File not found, SKIP");
    return null;
  }
  const sz = (statSync(model.path).size / 1024 ** 3).toFixed(2);
  log(` Size: ${sz} GB | Fits GPU: ${model.fitsGPU}`);
  log(`${hashes}`);

  // Run one configuration and persist the results right away.
  const attempt = async (label, params) => {
    const outcome = await testConfig(model, label, params);
    saveIntermediate();
    return outcome;
  };
  // Settings shared by every Step 1 attempt.
  const base = { ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };

  // ── Step 1: Find working GPU config ──
  log(`\n ── Step 1: Find optimal GPU offload ──`);
  let baseline = null;

  // Try full GPU, no CPU offload.
  if (model.fitsGPU === true || model.fitsGPU === "maybe") {
    baseline = await attempt("ngl=999 pure-GPU", { ...base });
  }

  // Try n-cpu-moe values ascending and stop at the first one that works.
  if (!baseline) {
    for (const n of [5, 10, 15, 20]) {
      baseline = await attempt(`n-cpu-moe=${n}`, { ...base, nCpuMoe: n });
      if (baseline) break;
    }
  }

  // Last resort: full cpu-moe.
  if (!baseline) {
    baseline = await attempt("cpu-moe", { ...base, cpuMoe: true });
  }

  if (!baseline) {
    log(` ✗ ${model.name} cannot boot at 256K!`);
    return null;
  }

  // If pure GPU worked, also measure cpu-moe and keep it when faster.
  const bp = baseline.params;
  if (!bp.cpuMoe && !bp.nCpuMoe) {
    const alt = await attempt("compare: cpu-moe", { ...bp, cpuMoe: true });
    if (alt && alt.avg_tps > baseline.avg_tps) baseline = alt;
  }

  let best = baseline;
  // Replace `best` when the candidate ran and was strictly faster.
  const keepFaster = (candidate) => {
    if (candidate && candidate.avg_tps > best.avg_tps) best = candidate;
  };

  // ── Step 2: Thread sweep ──
  log(`\n ── Step 2: Thread sweep ──`);
  for (const t of [2, 4, 8, 10, 12]) {
    if (t === best.params.t) continue;
    keepFaster(await attempt(`t=${t}`, { ...best.params, t }));
  }

  // ── Step 3: Batch sweep ──
  log(`\n ── Step 3: Batch sweep ──`);
  const batchPairs = [[256, 1024], [256, 2048], [512, 2048], [512, 4096], [1024, 2048], [1024, 4096]];
  for (const [ub, b] of batchPairs) {
    if (ub === best.params.ub && b === best.params.b) continue;
    keepFaster(await attempt(`ub=${ub} b=${b}`, { ...best.params, ub, b }));
  }

  // ── Step 4: KV cache sweep ──
  log(`\n ── Step 4: KV cache type ──`);
  const cachePairs = [["q8_0", "q8_0"], ["q4_0", "q8_0"], ["f16", "f16"]];
  for (const [ctk, ctv] of cachePairs) {
    if (ctk === best.params.ctk && ctv === best.params.ctv) continue;
    keepFaster(await attempt(`kv=${ctk}/${ctv}`, { ...best.params, ctk, ctv }));
  }

  // ── Step 5: Final verification (5 runs) ──
  log(`\n ── Step 5: Final verification ──`);
  await kill();
  startServer(model.path, best.params);
  const ready = await waitReady();
  if (!ready.ok) {
    await kill();
    return best;
  }
  const boot = ready.boot;
  const gpuMem = vram();
  // Warmup request; its outcome is ignored.
  try {
    await bench(20);
  } catch {}

  const finals = [];
  for (let run = 1; run <= 5; run++) {
    try {
      const sample = await bench();
      finals.push(sample.tps);
      log(` Final ${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Final ${run}: ERR`);
    }
  }
  await kill();

  if (finals.length === 0) return best;

  let total = 0;
  for (const s of finals) total += s;
  const avg = total / finals.length;
  const bst = Math.max(...finals);
  log(` ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${bst.toFixed(2)} t/s`);

  const final = {
    model: model.name,
    quant: model.quant,
    label: `FINAL`,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(bst.toFixed(2)),
    boot: Number(boot.toFixed(1)),
    vram: gpuMem,
    params: best.params,
  };
  ALL.push(final);
  saveIntermediate();
  return final;
}
|
||||
|
||||
// ─── Main ──────────────────────────────────────────────────────
|
||||
/**
 * Entry point: tune every model in MODELS, rank the winners by average
 * throughput, and write the text summary plus the raw results under scripts/.
 */
async function main() {
  const startedAt = Date.now();
  const wideRule = "=".repeat(65);
  const narrowRule = "=".repeat(55);

  log(wideRule);
  log(" DUAL-GPU BENCHMARK v2 — Smart Strategy");
  log(" 2x RTX 3060 (24GB) | 256K Context");
  log(" " + new Date().toISOString());
  log(wideRule);
  for (const gpu of vram()) {
    log(` GPU${gpu.gpu}: ${gpu.used}/${gpu.total} MiB`);
  }

  // Tune each model in turn; a falsy result is simply left out of the ranking.
  const winners = [];
  for (const [index, model] of MODELS.entries()) {
    log(`\n${wideRule}`);
    log(` MODEL ${index + 1}/${MODELS.length}: ${model.name}`);
    log(wideRule);
    const winner = await tuneModel(model);
    if (winner) winners.push(winner);
    saveIntermediate();
  }

  // ─── Summary ──────────────────────────────────────────────
  const elapsed = ((Date.now() - startedAt) / 60000).toFixed(1);
  winners.sort((a, b) => b.avg_tps - a.avg_tps);
  const medals = ["🥇", "🥈", "🥉", " "];

  const lines = [
    `Dual-GPU Benchmark v2 — ${new Date().toISOString()}`,
    `2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`,
    "", narrowRule, " RANKING", narrowRule,
  ];

  winners.forEach((entry, rank) => {
    const params = entry.params;
    lines.push("", ` ${medals[rank] || " "} #${rank + 1}: ${entry.model}`);
    lines.push(` AVG: ${entry.avg_tps} t/s | BEST: ${entry.best_tps} t/s | Boot: ${entry.boot}s`);
    lines.push(` ngl=${params.ngl} t=${params.t} ub=${params.ub} b=${params.b} ctk=${params.ctk} ctv=${params.ctv}`);
    if (params.cpuMoe) {
      lines.push(` --cpu-moe`);
    } else if (params.nCpuMoe) {
      lines.push(` --n-cpu-moe ${params.nCpuMoe}`);
    }
  });

  if (winners.length > 0) {
    const champion = winners[0];
    const cp = champion.params;
    lines.push("", narrowRule, ` ★ CHAMPION: ${champion.model} — ${champion.avg_tps} t/s`, narrowRule);

    // Rebuild a copy-pasteable llama-server command line from the winning parameters.
    const championModel = MODELS.find((m) => m.name === champion.model);
    const parts = [
      `llama-server --model ${championModel.path}`,
      `-ngl ${cp.ngl} -c ${CTX} -t ${cp.t} -tb ${cp.t}`,
      `-ub ${cp.ub} -b ${cp.b} -fa on`,
      `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
      `--prio ${cp.prio || 3} --poll 50 --mlock`,
    ];
    if (cp.cpuMoe) {
      parts.push("--cpu-moe");
    } else if (cp.nCpuMoe) {
      parts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
    }
    parts.push("--port 8000 --host 0.0.0.0");
    const cmd = parts.join(" ");
    lines.push("", " Recommended:", ` ${cmd}`);
  }

  const summary = lines.join("\n");
  console.log("\n" + summary);
  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL, null, 2));
  log(`\n Saved: dual_gpu_results.json + dual_gpu_summary.txt`);
  log(" DONE!");
  await kill();
}
|
||||
|
||||
main().catch(e => { console.error("FATAL:", e); process.exit(1); });
|
||||
101
scripts/_archive/tuning/find_max_dense.mjs
Normal file
101
scripts/_archive/tuning/find_max_dense.mjs
Normal file
@@ -0,0 +1,101 @@
|
||||
import { spawn, exec } from 'child_process';
|
||||
|
||||
const delay = ms => new Promise(res => setTimeout(res, ms));
|
||||
|
||||
/**
 * Force-kill every llama-server.exe process, then wait two seconds before
 * resolving. The outcome of taskkill itself is ignored.
 */
async function killServer() {
  await new Promise((done) => {
    exec('taskkill /F /IM llama-server.exe', () => done());
  });
  await new Promise((settled) => setTimeout(settled, 2000));
}
|
||||
|
||||
/**
 * Boot llama-server for one model at one context size and, if it comes up,
 * run the quick benchmark script against it.
 *
 * @param {string} modelPath   Model file name, resolved relative to `models\`.
 * @param {number} contextSize Value passed to `-c`.
 * @returns {Promise<boolean>} true when the server answered /health with 200
 *   and the benchmark was run; false on OOM, early exit, or boot timeout.
 */
async function testContextSize(modelPath, contextSize) {
  console.log(`\nTesting ${modelPath} with -c ${contextSize}...`);
  await killServer();

  const args = [
    '--model', `models\\${modelPath}`,
    '-ngl', '999',
    '-c', contextSize.toString(),
    '-fa', 'on',
    '--cache-type-k', 'q4_0',
    '--cache-type-v', 'q4_0',
    '-ub', '512',
    '-b', '2048',
    '-t', '6',
    '-tb', '6',
    '--split-mode', 'row',
    '--prio', '3',
    '--fit', 'off',
    '--port', '8000',
    '--host', '0.0.0.0'
  ];

  const server = spawn('llama_bin_run\\llama-server.exe', args, { stdio: 'pipe' });

  let booted = false;
  let oomed = false;
  let exited = false;

  // stdout is piped but nothing reads it; discard it so a full pipe buffer
  // can never block the server.
  server.stdout.resume();

  server.stderr.on('data', (d) => {
    const text = d.toString();
    if (text.toLowerCase().includes('out of memory') || text.includes('failed to allocate')) {
      oomed = true;
    }
  });

  // Stop polling as soon as the process is gone. The 'error' listener also
  // keeps a failed spawn from surfacing as an uncaught exception.
  server.on('exit', () => { exited = true; });
  server.on('error', (err) => {
    exited = true;
    console.log(`llama-server process error: ${err.message}`);
  });

  for (let i = 0; i < 20; i++) {
    if (oomed || exited) break;
    try {
      // fetch() has no `timeout` option; an AbortSignal is what bounds the request.
      const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
      if (res.status === 200) {
        booted = true;
        break;
      }
    } catch (e) {
      // Not listening yet (or the probe timed out) — retry after the delay.
    }
    await delay(2000);
  }

  if (oomed || !booted) {
    if (oomed) {
      console.log(`❌ Failed: Out of Memory at -c ${contextSize}`);
    } else if (exited) {
      console.log(`❌ Failed: server exited before becoming healthy at -c ${contextSize}`);
    } else {
      console.log(`❌ Failed: server did not become healthy in time at -c ${contextSize}`);
    }
    server.kill('SIGKILL');
    await killServer();
    return false;
  }

  console.log(`✅ Booted! Running Benchmark...`);

  // Benchmark: run the helper script and echo whatever it printed.
  const bench = await new Promise(r => exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
    r(stdout || stderr);
  }));

  console.log(bench);
  await killServer();
  return true;
}
|
||||
|
||||
/**
 * Try context sizes from largest to smallest and report the first one for
 * which testContextSize() succeeds.
 *
 * @param {string} modelName Model file name handed to testContextSize().
 */
async function findMaxContext(modelName) {
  const candidates = [262144, 131072, 65536, 32768, 16384, 8192];

  for (const size of candidates) {
    const worked = await testContextSize(modelName, size);
    if (!worked) continue;
    console.log(`\n🎉 MAX STABLE CONTEXT FOR ${modelName}: ${size}`);
    return;
  }

  // Fell through the whole list without a single successful boot.
  console.log(`\n❌ Failed to find any working context size for ${modelName}`);
}
|
||||
|
||||
/**
 * Entry point: find the largest working context size for each dense model.
 */
async function main() {
  // `exec('set CUDA_VISIBLE_DEVICES=')` only cleared the variable inside a
  // short-lived shell. Child processes spawned later inherit *this* process's
  // environment, so the variable has to be removed here to take effect.
  delete process.env.CUDA_VISIBLE_DEVICES;

  console.log("============= QWEN 27B Q4_K_M =============");
  await findMaxContext('Qwen3.5-27B-Q4_K_M.gguf');

  console.log("\n============= GEMMA 4 31B Q4_K_M =============");
  await findMaxContext('gemma-4-31B-it-Q4_K_M.gguf');
}

// Report failures and exit non-zero instead of leaving an unhandled rejection.
main().catch(e => { console.error("FATAL:", e); process.exit(1); });
|
||||
345
scripts/_archive/tuning/qwen_fullgpu_challenge.mjs
Normal file
345
scripts/_archive/tuning/qwen_fullgpu_challenge.mjs
Normal file
@@ -0,0 +1,345 @@
|
||||
/**
 * Qwen3.5 Full-GPU Challenge — extreme VRAM optimization benchmark
 * =====================================================
 * Goal: load Qwen3.5-35B-A3B 100% on GPU across dual RTX 3060s (24 GB total)
 *
 * Models under test:
 *   1. UD-IQ4_NL (16.6 GB) — fits for sure; the baseline
 *   2. MXFP4_MOE (20.1 GB) — the challenge! Extreme VRAM optimization
 *   3. Q4_K_M    (20.5 GB) — control group (n-cpu-moe=5)
 *
 * VRAM-saving strategies:
 *   A. Minimal batch: -ub 64 -b 256 (shrinks the computation buffer)
 *   B. split-mode row (more even distribution across the GPUs)
 *   C. Manual tensor-split balancing
 *   D. no-mmap (memory-management optimization)
 *   E. defrag-thold (prevents KV-cache fragmentation)
 *
 * Run: node scripts/qwen_fullgpu_challenge.mjs
 */
|
||||
|
||||
import { spawn, execSync } from "child_process";
|
||||
import { writeFileSync, existsSync, statSync } from "fs";
|
||||
|
||||
const BASE_URL = "http://127.0.0.1:8000";
|
||||
const LLAMA = String.raw`llama_bin_run\llama-server.exe`;
|
||||
const CTX = 262144;
|
||||
const RUNS = 3;
|
||||
const TOKENS = 200;
|
||||
const BOOT_TIMEOUT = 300_000;
|
||||
|
||||
const MODELS = [
|
||||
{
|
||||
name: "Qwen3.5 UD-IQ4_NL",
|
||||
path: String.raw`models\Qwen3.5-35B-A3B-UD-IQ4_NL.gguf`,
|
||||
sizeGB: 16.6,
|
||||
},
|
||||
{
|
||||
name: "Qwen3.5 MXFP4_MOE",
|
||||
path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
|
||||
sizeGB: 20.11,
|
||||
},
|
||||
{
|
||||
name: "Qwen3.5 Q4_K_M",
|
||||
path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
|
||||
sizeGB: 20.5,
|
||||
},
|
||||
];
|
||||
|
||||
const ALL = [];
|
||||
let proc = null;
|
||||
const log = (m) => console.log(`[${new Date().toLocaleTimeString("ko-KR",{hour12:false})}] ${m}`);
|
||||
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
|
||||
|
||||
/**
 * Stop the tracked server process (if any), force-kill any remaining
 * llama-server.exe via taskkill, then pause five seconds before resolving.
 * Every failure along the way is ignored.
 */
async function kill() {
  const tracked = proc;
  proc = null;
  if (tracked) {
    try {
      tracked.kill("SIGKILL");
    } catch {
      // process already gone — nothing to do
    }
  }

  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // taskkill exits non-zero when no matching process exists
  }

  await sleep(5000);
}
|
||||
|
||||
/**
 * Query nvidia-smi for per-GPU memory usage.
 *
 * @returns {{gpu: number, used: number, total: number}[]} One entry per output
 *   line (values as reported with `nounits`), or [] when the query fails.
 */
function vram() {
  let raw;
  try {
    raw = execSync(
      'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits',
      { encoding: "utf-8", timeout: 5000 },
    );
  } catch {
    return [];
  }

  const rows = [];
  for (const line of raw.trim().split("\n")) {
    const fields = line.split(",").map((field) => parseInt(field));
    rows.push({ gpu: fields[0], used: fields[1], total: fields[2] });
  }
  return rows;
}
|
||||
|
||||
/**
 * Spawn llama-server for `modelPath` with the tuning parameters in `p` and
 * remember the child in the module-level `proc`.
 *
 * @param {string} modelPath Path passed to `--model`.
 * @param {object} p Tuning parameters. Read here: ctk, ctv, ub, b, t, cpuMoe,
 *   nCpuMoe, splitMode, tensorSplit, noMmap, defragThold, noKvOffload.
 *   Missing ctk/ctv/ub/b/t fall back to q4_0/q4_0/512/2048/4.
 * @returns {import('child_process').ChildProcess} The spawned server process.
 */
function startServer(modelPath, p) {
  const args = [
    "--model", modelPath, "-ngl", "999",
    "-c", String(CTX), "-np", "1", "-fa", "on",
    "--cache-type-k", p.ctk || "q4_0",
    "--cache-type-v", p.ctv || "q4_0",
    "-ub", String(p.ub || 512), "-b", String(p.b || 2048),
    "-t", String(p.t || 4), "-tb", String(p.t || 4),
    "--prio", "3", "--poll", "50", "--mlock",
    "--port", "8000", "--host", "0.0.0.0",
  ];

  // GPU offload strategy
  if (p.cpuMoe) args.push("--cpu-moe");
  else if (p.nCpuMoe) args.push("--n-cpu-moe", String(p.nCpuMoe));

  // VRAM saving options
  if (p.splitMode) args.push("--split-mode", p.splitMode);
  if (p.tensorSplit) args.push("--tensor-split", p.tensorSplit);
  if (p.noMmap) args.push("--no-mmap");
  if (p.defragThold) args.push("--defrag-thold", String(p.defragThold));
  if (p.noKvOffload) args.push("--no-kv-offload");

  // Only the tail of the command line is logged to keep the log readable.
  const cmdStr = args.join(" ");
  log(` CMD: ...${cmdStr.slice(-80)}`);
  proc = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });

  // stdout/stderr are piped but nothing in this script reads them. Pipes have
  // limited capacity, so an unread pipe can eventually block the server on a
  // log write; put both streams in flowing mode so the output is discarded.
  proc.stdout.resume();
  proc.stderr.resume();

  return proc;
}
|
||||
|
||||
/**
 * Poll the server's /health endpoint until it reports status "ok".
 *
 * @param {number} [timeout=BOOT_TIMEOUT] Maximum time to keep polling, in ms.
 * @returns {Promise<{ok: boolean, boot: number}>} `boot` is the seconds waited;
 *   on failure it is the timeout itself expressed in seconds.
 */
async function waitReady(timeout = BOOT_TIMEOUT) {
  const began = Date.now();
  const elapsedMs = () => Date.now() - began;

  while (elapsedMs() < timeout) {
    let healthy = false;
    try {
      const response = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const body = await response.json();
      healthy = body.status === "ok";
    } catch {
      // Connection refused, probe timeout or non-JSON body: not ready yet.
    }
    if (healthy) {
      return { ok: true, boot: elapsedMs() / 1000 };
    }
    await sleep(3000);
  }

  return { ok: false, boot: timeout / 1000 };
}
|
||||
|
||||
/**
 * Send one chat-completion request and measure generation throughput.
 *
 * @param {number} [n=TOKENS] Value sent as `max_tokens`.
 * @returns {Promise<{tps: number, ct: number, dt: number}>} completion tokens
 *   per wall-clock second, completion token count, and elapsed seconds (the
 *   elapsed time covers the whole request, not only token generation).
 * @throws {Error} When the request fails, times out, or the server answers
 *   with a non-2xx status.
 */
async function bench(n = TOKENS) {
  const t0 = Date.now();
  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "m",
      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
      max_tokens: n, temperature: 0,
    }),
    signal: AbortSignal.timeout(600_000),
  });

  // An error response carries no usage block; without this check it would be
  // recorded as a legitimate 0 t/s run and drag the averages down. Callers
  // already wrap bench() in try/catch and log the failure.
  if (!r.ok) {
    throw new Error(`HTTP ${r.status} from /v1/chat/completions`);
  }

  const d = await r.json();
  const dt = (Date.now() - t0) / 1000;
  const ct = d.usage?.completion_tokens || 0;
  return { tps: ct / dt, ct, dt };
}
|
||||
|
||||
/**
 * Benchmark one server configuration: restart the server with `params`, do a
 * warmup request, time RUNS requests, and record the result.
 *
 * @param {{name: string, path: string}} model Model entry to load.
 * @param {string} label  Human-readable tag used in log lines and the result.
 * @param {object} params Parameters forwarded to startServer().
 * @returns {Promise<object|null>} The recorded result, or null when the server
 *   failed to boot or every timed run failed.
 */
async function testConfig(model, label, params) {
  await kill();
  log(` [${label}] Starting...`);
  startServer(model.path, params);

  const readiness = await waitReady();
  if (!readiness.ok) {
    log(` [${label}] ✗ FAILED (timeout ${BOOT_TIMEOUT/1000}s)`);
    await kill();
    return null;
  }
  const boot = readiness.boot;

  // Snapshot VRAM right after boot, before any request has been served.
  const gpus = vram();
  let totalUsed = 0;
  const perGpu = [];
  for (const gpu of gpus) {
    totalUsed += gpu.used;
    perGpu.push(`GPU${gpu.gpu}:${gpu.used}/${gpu.total}`);
  }
  log(` [${label}] ✓ Boot:${boot.toFixed(0)}s | VRAM: ${perGpu.join(" | ")} (total: ${totalUsed} MiB)`);

  // Warmup request; its outcome is deliberately ignored.
  try {
    await bench(20);
  } catch {}

  const speeds = [];
  for (let run = 1; run <= RUNS; run++) {
    try {
      const sample = await bench();
      speeds.push(sample.tps);
      log(` Run${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run${run}: ERR ${e.message}`);
    }
  }
  await kill();

  if (speeds.length === 0) return null;

  let sum = 0;
  for (const s of speeds) sum += s;
  const avg = sum / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);

  const usesCpuOffload = Boolean(params.cpuMoe) || Boolean(params.nCpuMoe);
  const res = {
    model: model.name,
    label,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(best.toFixed(2)),
    boot: Number(boot.toFixed(1)),
    vram_total: totalUsed,
    vram: gpus,
    // Store the effective values, including the defaults startServer() applies.
    params: { ...params, ngl: 999, ctk: params.ctk || "q4_0", ctv: params.ctv || "q4_0" },
    gpu_only: !usesCpuOffload,
  };
  ALL.push(res);
  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
  return res;
}
|
||||
|
||||
// ─── Test Strategies ───────────────────────────────────────────
|
||||
|
||||
/**
 * Run the whole strategy ladder for one model and return its best result.
 *
 * Order of work: six pure-GPU strategies, an n-cpu-moe fallback when no
 * pure-GPU result is leading, a thread/batch/KV sweep when a pure-GPU result
 * is leading, and finally a five-run verification of the best configuration.
 *
 * @param {{name: string, path: string, sizeGB: number}} model
 * @returns {Promise<object|null>} The FINAL verification record when it
 *   produced numbers, otherwise the best sweep result (null if none).
 */
async function testModel(model) {
  const banner = "#".repeat(65);
  log(`\n${banner}`);
  log(` ${model.name} (${model.sizeGB} GB)`);
  if (!existsSync(model.path)) {
    log(" ✗ File not found!");
    return null;
  }
  log(banner);

  let best = null;
  // Keep `candidate` only if it exists and beats the current best average.
  const consider = (candidate) => {
    if (!candidate) return;
    if (best === null || candidate.avg_tps > best.avg_tps) best = candidate;
  };

  // [log heading, result label, server parameters] — tried in this order.
  const pureGpuLadder = [
    ["Strategy 1: Pure GPU (default)", "pure-GPU default",
      { t: 4, ub: 512, b: 2048 }],
    ["Strategy 2: Pure GPU, minimal batch", "pure-GPU minbatch",
      { t: 4, ub: 64, b: 256 }],
    ["Strategy 3: Pure GPU + no-mmap + small batch", "pure-GPU nommap small",
      { t: 4, ub: 128, b: 512, noMmap: true }],
    ["Strategy 4: Pure GPU + split-mode row", "pure-GPU row-split",
      { t: 4, ub: 128, b: 512, splitMode: "row" }],
    ["Strategy 5: Pure GPU + tensor-split 0.5,0.5", "pure-GPU ts=0.5,0.5",
      { t: 4, ub: 128, b: 512, tensorSplit: "0.5,0.5" }],
    ["Strategy 6: Pure GPU ALL tricks", "pure-GPU all-tricks",
      { t: 4, ub: 64, b: 256, noMmap: true, defragThold: 0.1 }],
  ];
  for (const [heading, label, params] of pureGpuLadder) {
    log(`\n ── ${heading} ──`);
    consider(await testConfig(model, label, params));
  }

  // ── Fallback: n-cpu-moe=5 baseline ──
  if (!(best && best.gpu_only)) {
    log(`\n ── Fallback: n-cpu-moe=5 ──`);
    consider(await testConfig(model, "n-cpu-moe=5 baseline", {
      t: 4, ub: 256, b: 1024, nCpuMoe: 5
    }));
  }

  // ── If pure GPU worked, tune thread / batch / KV cache ──
  if (best && best.gpu_only) {
    log(`\n ── Pure GPU succeeded! Fine-tuning... ──`);
    // All three sweeps branch off this snapshot, not off intermediate winners.
    const bp = best.params;

    const threadCounts = [2, 6, 8].filter((t) => t !== bp.t);
    for (const t of threadCounts) {
      consider(await testConfig(model, `tune t=${t}`, { ...bp, t }));
    }

    const batchPairs = [[256, 1024], [512, 2048], [256, 2048]]
      .filter(([ub, b]) => !(ub === bp.ub && b === bp.b));
    for (const [ub, b] of batchPairs) {
      consider(await testConfig(model, `tune ub=${ub} b=${b}`, { ...bp, ub, b }));
    }

    // KV cache upgrade (is there spare VRAM?)
    for (const [ctk, ctv] of [["q8_0", "q8_0"], ["f16", "f16"]]) {
      consider(await testConfig(model, `tune kv=${ctk}/${ctv}`, { ...bp, ctk, ctv }));
    }
  }

  // ── Final verification ──
  if (!best) return best;

  log(`\n ── Final verification (5 runs) ──`);
  await kill();
  startServer(model.path, best.params);
  const { ok, boot } = await waitReady();
  if (!ok) {
    await kill();
    return best;
  }

  const gpus = vram();
  // Warmup request; its outcome is deliberately ignored.
  try {
    await bench(20);
  } catch {}

  const finals = [];
  for (let run = 1; run <= 5; run++) {
    try {
      const sample = await bench();
      finals.push(sample.tps);
      log(` Final ${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Final ${run}: ERR`);
    }
  }
  await kill();

  if (finals.length === 0) {
    await kill();
    return best;
  }

  const avg = finals.reduce((a, b) => a + b) / finals.length;
  const bst = Math.max(...finals);
  log(` ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${bst.toFixed(2)} t/s`);

  const final = {
    model: model.name,
    label: "FINAL",
    avg_tps: +avg.toFixed(2),
    best_tps: +bst.toFixed(2),
    boot: +boot.toFixed(1),
    vram_total: gpus.reduce((acc, g) => acc + g.used, 0),
    vram: gpus,
    params: best.params,
    gpu_only: best.gpu_only,
  };
  ALL.push(final);
  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
  return final;
}
|
||||
|
||||
// ─── Main ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Entry point: run testModel() for every entry in MODELS, rank the results by
 * average throughput, and write the summary and raw results under scripts/.
 */
async function main() {
  const startedAt = Date.now();
  const wideRule = "=".repeat(65);
  const narrowRule = "=".repeat(55);

  log(wideRule);
  log(" QWEN3.5 FULL-GPU CHALLENGE — 70 t/s TARGET");
  log(" 2x RTX 3060 (24GB) | 256K Context");
  log(" " + new Date().toISOString());
  log(wideRule);
  for (const gpu of vram()) {
    log(` GPU${gpu.gpu}: ${gpu.used}/${gpu.total} MiB`);
  }

  const winners = [];
  for (const model of MODELS) {
    const outcome = await testModel(model);
    if (outcome) winners.push(outcome);
  }

  // ─── Summary ──────────────────────────────────────────────
  const elapsed = ((Date.now() - startedAt) / 60000).toFixed(1);
  winners.sort((a, b) => b.avg_tps - a.avg_tps);

  const lines = [
    `Qwen3.5 Full-GPU Challenge — ${new Date().toISOString()}`,
    `2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`,
    "", narrowRule, " RANKING", narrowRule,
  ];

  winners.forEach((entry, rank) => {
    const p = entry.params;
    const gpu = entry.gpu_only ? "★ FULL GPU" : "⚠ CPU offload";
    lines.push("", ` #${rank + 1}: ${entry.model} [${gpu}]`);
    lines.push(` AVG: ${entry.avg_tps} t/s | BEST: ${entry.best_tps} t/s | Boot: ${entry.boot}s`);
    lines.push(` VRAM: ${entry.vram_total} MiB total`);

    // Only the options that were actually set show up in the flag list.
    const flags = [
      p.splitMode && `split=${p.splitMode}`,
      p.tensorSplit && `ts=${p.tensorSplit}`,
      p.noMmap && "no-mmap",
      p.nCpuMoe && `n-cpu-moe=${p.nCpuMoe}`,
    ].filter(Boolean);
    lines.push(` t=${p.t} ub=${p.ub} b=${p.b} kv=${p.ctk}/${p.ctv} ${flags.join(" ")}`);
  });

  if (winners.length > 0) {
    const champion = winners[0];
    const mode = champion.gpu_only ? "FULL GPU" : "CPU offload";
    lines.push("", narrowRule);
    lines.push(` ★ CHAMPION: ${champion.model} — ${champion.avg_tps} t/s [${mode}]`);
    lines.push(narrowRule);
  }

  const summary = lines.join("\n");
  console.log("\n" + summary);
  writeFileSync("scripts/qwen_fullgpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
  log(`\n Saved: qwen_fullgpu_results.json + qwen_fullgpu_summary.txt`);
  log(" DONE!");
  await kill();
}
|
||||
|
||||
main().catch(e => { console.error("FATAL:", e); process.exit(1); });
|
||||
129
scripts/_archive/tuning/tune_122b.py
Normal file
129
scripts/_archive/tuning/tune_122b.py
Normal file
@@ -0,0 +1,129 @@
|
||||
import subprocess, time, urllib.request, json, sys
|
||||
try: sys.stdout.reconfigure(encoding='utf-8')
|
||||
except: pass
|
||||
|
||||
MODEL = "C:/Users/Variet-Worker/Desktop/variet-llm/models/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
|
||||
BASE = "http://127.0.0.1:8000"
|
||||
|
||||
# BEST SO FAR: GPU1 only + Expert CPU + 8t = 8.75 t/s (6.5GB / 12GB used)
|
||||
# 5.5GB VRAM remaining on GPU 1. Let's use it!
|
||||
# Strategy: keep some experts on GPU 1 using -ncmoe (n-cpu-moe)
|
||||
# n-cpu-moe = number of layers whose experts stay on CPU
|
||||
# Lower = more experts on GPU = more VRAM used = potentially faster
|
||||
|
||||
BASE_CMD = [
|
||||
r"llama_bin_run\llama-server.exe",
|
||||
"--model", MODEL,
|
||||
"-ngl", "999",
|
||||
"-sm", "none", "--main-gpu", "1",
|
||||
"-c", "4096", "-np", "1", "-fa", "on",
|
||||
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
|
||||
"-ub", "512", "-b", "2048",
|
||||
"-t", "8", "-tb", "8",
|
||||
"--prio", "3", "--poll", "50",
|
||||
"--no-mmap",
|
||||
"--port", "8000", "--host", "0.0.0.0"
|
||||
]
|
||||
|
||||
CONFIGS = [
|
||||
# Baseline: all experts CPU (confirmed 8.75 t/s)
|
||||
{"name": "Baseline: all expert CPU", "extra": ["-ot", ".*ffn_.*_exps.*=CPU"]},
|
||||
# Try n-cpu-moe with GPU1 only: keep some experts on GPU
|
||||
{"name": "n-cpu-moe=60 (4 layers expert GPU)", "extra": ["-ncmoe", "60"]},
|
||||
{"name": "n-cpu-moe=56 (8 layers expert GPU)", "extra": ["-ncmoe", "56"]},
|
||||
{"name": "n-cpu-moe=52 (12 layers expert GPU)", "extra": ["-ncmoe", "52"]},
|
||||
{"name": "n-cpu-moe=48 (16 layers expert GPU)", "extra": ["-ncmoe", "48"]},
|
||||
]
|
||||
|
||||
def kill():
    """Force-kill every llama-server.exe process, then pause four seconds.

    taskkill's output is discarded and its exit status is ignored.
    """
    silenced = {"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL}
    subprocess.run("taskkill /F /IM llama-server.exe", shell=True, **silenced)
    time.sleep(4)
|
||||
|
||||
def check_server(timeout=900):
    """Poll the server's /health endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as /health answers with status "ok" or "ready";
        False if that never happens within ``timeout`` seconds.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(f"{BASE}/health", timeout=2) as resp:
                status = json.loads(resp.read()).get("status")
            if status in ("ok", "ready"):
                return True
        except Exception:
            # Server not listening yet, still loading, or sent a non-JSON
            # body: keep polling. Unlike a bare ``except``, this lets
            # KeyboardInterrupt / SystemExit stop the script.
            pass
        time.sleep(5)
    return False
|
||||
|
||||
def bench(runs=3):
    """Time ``runs`` chat-completion requests against the running server.

    Args:
        runs: Number of timed requests to send.

    Returns:
        ``(average, best)`` completion tokens per wall-clock second over the
        runs. ``(0.0, 0.0)`` when ``runs`` is zero or negative.

    Raises:
        urllib.error.URLError: If a request fails or times out.
    """
    speeds = []
    for i in range(runs):
        payload = json.dumps({
            "model": "m",
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Write a Python fibonacci function with memoization."}
            ],
            "max_tokens": 200,
            "temperature": 0.0
        }).encode('utf-8')
        req = urllib.request.Request(
            f"{BASE}/v1/chat/completions",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        t0 = time.time()
        # Close the HTTP response deterministically instead of leaking it.
        with urllib.request.urlopen(req, timeout=600) as http_resp:
            resp = json.loads(http_resp.read())
        dt = time.time() - t0
        tokens = resp.get("usage", {}).get("completion_tokens", 0)
        speed = tokens / dt if dt > 0 else 0
        speeds.append(speed)
        print(f" Run {i+1}: {speed:.2f} t/s ({tokens} tok / {dt:.1f}s)")

    if not speeds:
        # Nothing was measured; avoid dividing by zero / max() of an empty list.
        return 0.0, 0.0
    return sum(speeds) / len(speeds), max(speeds)
|
||||
|
||||
def vram():
    """Return used memory per GPU as reported by nvidia-smi.

    Returns:
        A list of integers, one per GPU line in nvidia-smi's output, padded
        with zeros to at least two entries so callers can always index GPU 0
        and GPU 1. ``[0, 0]`` when nvidia-smi fails or its output cannot be
        parsed.
    """
    try:
        out = subprocess.check_output(
            "nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits",
            shell=True,
        ).decode().strip()
        used = [int(x.strip()) for x in out.split('\n')]
    except (subprocess.SubprocessError, OSError, ValueError):
        # nvidia-smi missing / failed, or printed something non-numeric.
        return [0, 0]
    # A single-GPU host yields one line; pad so v[1] never raises IndexError.
    used.extend([0] * (2 - len(used)))
    return used
|
||||
|
||||
# Sweep driver: boot the server once per entry in CONFIGS, benchmark it, and
# collect one result row per configuration.
results = []
for cfg in CONFIGS:
    kill()
    print(f"\n{'='*60}")
    print(f"Testing: {cfg['name']}")
    print(f"{'='*60}")

    cmd = BASE_CMD + cfg["extra"]
    proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    try:
        if not check_server(900):
            print(f" FAILED TO BOOT")
            results.append({"name": cfg["name"], "status": "BOOT_FAIL"})
            continue

        print(" Server ready! Warming up...")
        try:
            p = json.dumps({"model":"m","messages":[{"role":"system","content":"Hi"},{"role":"user","content":"Hi"}],"max_tokens":5}).encode()
            urllib.request.urlopen(urllib.request.Request(f"{BASE}/v1/chat/completions",data=p,headers={"Content-Type":"application/json"}), timeout=120)
        except Exception:
            # Warmup is best-effort; a failure here must not skip the benchmark.
            pass

        v = vram()
        # Tolerate a host that reports fewer than two GPUs.
        gpu0, gpu1 = (list(v) + [0, 0])[:2]
        print(f" VRAM: GPU0={gpu0}MB, GPU1={gpu1}MB, Total={sum(v)}MB")

        try:
            avg, best = bench(runs=3)
        except Exception as exc:
            # One failing configuration must not abort the sweep and lose the
            # results already collected.
            print(f" BENCH FAILED: {exc}")
            results.append({"name": cfg["name"], "status": "BENCH_FAIL"})
            continue
        print(f" >>> AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

        results.append({
            "name": cfg["name"], "avg_tps": round(avg,2), "best_tps": round(best,2),
            "vram_gpu0": gpu0, "vram_gpu1": gpu1, "vram_total": sum(v), "status": "OK"
        })
    finally:
        # Always stop this configuration's server, including on failure paths.
        proc.terminate()

kill()
print(f"\n\n{'='*60}")
print("FINAL RESULTS - GPU1 Expert Balance (Target: 10+ t/s)")
print(f"{'='*60}")
print(f"{'Config':<48} {'AVG':>6} {'BEST':>6} {'GPU1':>7}")
print("-" * 72)
for r in results:
    if r["status"] == "OK":
        print(f" {r['name']:<46} {r['avg_tps']:>5} {r['best_tps']:>5} {r['vram_gpu1']:>5}MB")
    else:
        print(f" {r['name']:<46} {'FAIL':>5}")

with open("scripts/122b_final_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print("\nSaved to scripts/122b_final_results.json")
|
||||
64
scripts/_archive/tuning/tune_122b_20ts.mjs
Normal file
64
scripts/_archive/tuning/tune_122b_20ts.mjs
Normal file
@@ -0,0 +1,64 @@
|
||||
import { exec, spawn } from 'child_process';
|
||||
|
||||
const delay = ms => new Promise(res => setTimeout(res, ms));
|
||||
|
||||
/**
 * Boot llama-server with the given arguments and, once it is healthy, run
 * the quick benchmark script and print its output.
 *
 * @param {string} modelArgs Space-separated llama-server arguments (split on
 *   single spaces, so individual arguments must not contain spaces).
 * @param {object} envVars   Extra environment variables for the server.
 * @param {string} name      Label used in log lines.
 * @returns {Promise<{success: boolean}>} success is false when the server
 *   never answered /health with 200.
 */
async function runTest(modelArgs, envVars, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Args: ${modelArgs}`);

  // Plain async flow instead of `new Promise(async ...)`: an exception thrown
  // inside an async executor never rejects the outer promise, so the caller
  // would wait forever.
  const killServer = () =>
    new Promise(r => exec('taskkill /F /IM llama-server.exe', () => r()));

  await killServer();
  await delay(2000);

  const env = { ...process.env, ...envVars };
  spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
    detached: true,
    stdio: 'ignore',
    env
  });

  let ready = false;
  for (let i = 0; i < 40; i++) {
    try {
      // fetch() has no `timeout` option; an AbortSignal is what bounds the request.
      const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
      if (res.status === 200) {
        ready = true;
        break;
      }
    } catch (e) {
      // Not listening yet (or the probe timed out) — retry after the delay.
    }
    await delay(3000);
  }

  if (!ready) {
    console.log(`[${name}] FAILED TO BOOT`);
    await killServer();
    return { success: false };
  }

  console.log(`[${name}] Server Ready! Running benchmark...`);
  const output = await new Promise(r => exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
    r(stdout || stderr);
  }));
  console.log(output);
  await killServer();
  return { success: true };
}
|
||||
|
||||
/**
 * Entry point: run the 122B model with three different --n-cpu-moe values on
 * the dual-GPU layer split.
 */
async function main() {
  const baseLineArgs = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 8192 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 256 -b 1024 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // Run order matters:
  //   32 — first attempt at maximizing VRAM use (estimated by the author as
  //        32 of 48 layers' experts left on the CPU),
  //   28 — even more aggressive GPU usage,
  //   36 — fallback in case 32/28 run out of memory.
  const cpuMoeValues = [32, 28, 36];
  for (const count of cpuMoeValues) {
    await runTest(
      `${baseLineArgs} --split-mode layer --n-cpu-moe ${count}`,
      {},
      `122B: n-cpu-moe ${count} + sm layer`,
    );
  }

  console.log("\nALL TESTS COMPLETED");
}
|
||||
|
||||
main();
|
||||
72
scripts/_archive/tuning/tune_exact.mjs
Normal file
72
scripts/_archive/tuning/tune_exact.mjs
Normal file
@@ -0,0 +1,72 @@
|
||||
import { exec, spawn } from 'child_process';
|
||||
|
||||
const delay = ms => new Promise(res => setTimeout(res, ms));
|
||||
|
||||
/**
 * Boot llama-server with the given arguments and environment and, once it is
 * healthy, run the quick speed-test script and print its output.
 *
 * @param {string} modelArgs Space-separated llama-server arguments (split on
 *   single spaces, so individual arguments must not contain spaces).
 * @param {object} envVars   Extra environment variables for the server.
 * @param {string} name      Label used in log lines.
 * @returns {Promise<{success: boolean}>} success is false when the server
 *   never answered /health with 200.
 */
async function runTest(modelArgs, envVars, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Env: ${JSON.stringify(envVars)}`);
  console.log(`Args: ${modelArgs}`);

  // Plain async flow instead of `new Promise(async ...)`: an exception thrown
  // inside an async executor never rejects the outer promise, so the caller
  // would wait forever.
  const killServer = () =>
    new Promise(r => exec('taskkill /F /IM llama-server.exe', () => r()));

  await killServer();
  await delay(2000);

  const env = { ...process.env, ...envVars };
  spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
    detached: true,
    stdio: 'ignore',
    env
  });

  let ready = false;
  for (let i = 0; i < 40; i++) {
    try {
      // fetch() has no `timeout` option; an AbortSignal is what bounds the request.
      const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
      if (res.status === 200) {
        ready = true;
        break;
      }
    } catch (e) {
      // Not listening yet (or the probe timed out) — retry after the delay.
    }
    await delay(3000);
  }

  if (!ready) {
    console.log(`[${name}] FAILED TO BOOT`);
    await killServer();
    return { success: false };
  }

  console.log(`[${name}] Server Ready! Running speed test...`);
  const output = await new Promise(r => exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
    r(stdout || stderr);
  }));
  console.log(output);
  await killServer();
  return { success: true };
}
|
||||
|
||||
/**
 * Entry point: one pure-GPU run of the 122B model, then two runs of the 35B
 * model with GGML_CUDA_FORCE_MMQ set (6 threads, then 12 threads).
 */
async function main() {
  // 122B-A10B: pure GPU offload, no n-cpu-moe at all. Per the author's note,
  // -ngl 999 offloads all 48 layers and relies on the NVIDIA driver's shared
  // memory fallback on Windows.
  const args122B = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // 35B-A3B: pure-GPU tuning aimed at 70 t/s, starting from the configuration
  // of the previous full-GPU run (which the author reports at roughly 64 t/s).
  const args35B = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 128 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // Same 35B configuration with the thread counts raised to 12, just in case.
  const args35BThreads12 = args35B.replace("-t 6 -tb 6", "-t 12 -tb 12");

  // [arguments, environment, label] — executed strictly in this order.
  const plan = [
    [args122B, {}, "122B-A10B: Pure GPU (NVIDIA Shared Memory Fallback)"],
    // Force MMQ, the custom matrix-multiplication path.
    [args35B, { GGML_CUDA_FORCE_MMQ: "1" }, "35B-A3B: Force MMQ = 1"],
    [args35BThreads12, { GGML_CUDA_FORCE_MMQ: "1" }, "35B-A3B: Threads 12 + MMQ"],
  ];
  for (const [args, env, label] of plan) {
    await runTest(args, env, label);
  }

  console.log("\nALL TESTS COMPLETED");
}
|
||||
|
||||
main();
|
||||
84
scripts/_archive/tuning/tune_models.mjs
Normal file
84
scripts/_archive/tuning/tune_models.mjs
Normal file
@@ -0,0 +1,84 @@
|
||||
import { exec, spawn } from 'child_process';
|
||||
|
||||
const delay = ms => new Promise(res => setTimeout(res, ms));
|
||||
|
||||
/**
 * Boot llama-server with the given arguments, run the quick benchmark script
 * against it, and extract the token-generation and prompt-processing speeds
 * from the script's output.
 *
 * @param {string} modelArgs Space-separated llama-server arguments (split on
 *   single spaces, so individual arguments must not contain spaces).
 * @param {string} name      Label used in log lines.
 * @returns {Promise<{success: boolean, tg?: number, pp?: number}>} On success
 *   `tg` comes from the TG-500 line and `pp` from the 10K-CODE line; each is 0
 *   when its line is not found in the output.
 */
async function runTest(modelArgs, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Args: ${modelArgs}`);

  // Plain async flow instead of `new Promise(async ...)`: an exception thrown
  // inside an async executor never rejects the outer promise, so the caller
  // would wait forever.
  const killServer = () =>
    new Promise(r => exec('taskkill /F /IM llama-server.exe', () => r()));

  // Kill existing
  await killServer();
  await delay(2000);

  spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
    detached: true,
    stdio: 'ignore'
  });

  let ready = false;
  for (let i = 0; i < 40; i++) {
    try {
      // fetch() has no `timeout` option; an AbortSignal is what bounds the request.
      const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
      if (res.status === 200) {
        ready = true;
        break;
      }
    } catch (e) {
      // Not listening yet (or the probe timed out) — retry after the delay.
    }
    await delay(3000);
  }

  if (!ready) {
    console.log(`[${name}] FAILED TO BOOT (Likely OOM)`);
    await killServer();
    return { success: false };
  }

  console.log(`[${name}] Server Ready! Running benchmark...`);

  // Run pptest and keep stdout separately: the metrics are parsed from it.
  const { stdout, stderr } = await new Promise(r => exec('node scripts/quick_pptest.mjs', (err, out, errOut) => {
    r({ stdout: out, stderr: errOut });
  }));
  console.log(stdout || stderr);

  // Extract TG from the TG-500 line and PP from the 10K-CODE line.
  const tgMatch = stdout.match(/TG-500 \| PP:\d+tok \d+\.\dt\/s \| TG:\d+tok (\d+\.\d+)t\/s/);
  const ppMatch = stdout.match(/10K-CODE \| PP:\d+tok (\d+\.\d+)t\/s/);

  const tg = tgMatch ? parseFloat(tgMatch[1]) : 0;
  const pp = ppMatch ? parseFloat(ppMatch[1]) : 0;

  await killServer();
  return { success: true, tg, pp };
}
|
||||
|
||||
/**
 * Run the tuning sweep: three --n-cpu-moe settings for Qwen 35B (with
 * ub=512) followed by three for Qwen 122B, one server boot per setting.
 */
async function main() {
    // 1. Qwen 35B tuning: target is 70 t/s. Try a few values of n-cpu-moe
    //    to unlock ub=512.
    const args35B_base = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
    for (const moe of [1, 2, 4]) {
        await runTest(`${args35B_base} -ub 512 --n-cpu-moe ${moe}`, `Qwen-35B: moe=${moe}, ub=512`);
    }

    // 2. 122B tuning: find the optimal n-cpu-moe.
    const args122B_base = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

    // Since 48 leaves 16GB free, each layer is ~1.5GB total, meaning ~0.75GB per GPU.
    // Values swept: 38, 30, 22.
    for (const moe of [38, 30, 22]) {
        await runTest(`${args122B_base} --n-cpu-moe ${moe}`, `Qwen-122B: moe=${moe}`);
    }

    console.log("Tuning finished.");
}
|
||||
|
||||
main();
|
||||
107
scripts/_archive/tuning/tune_n_cpu_moe.py
Normal file
107
scripts/_archive/tuning/tune_n_cpu_moe.py
Normal file
@@ -0,0 +1,107 @@
|
||||
import subprocess, time, urllib.request, json, sys
|
||||
# Best effort: switch stdout to UTF-8 so non-ASCII output does not fail on a
# legacy console code page. Only the errors reconfigure() itself can produce
# are ignored (stream replaced by an object without reconfigure(), or a
# stream that refuses the change) so Ctrl-C / SystemExit still propagate.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError, OSError):
    pass
|
||||
|
||||
# First shard of the Qwen 122B GGUF model to load.
MODEL = "C:/Users/Variet-Worker/Desktop/variet-llm/models/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"

# Address the server is reached at for health checks and benchmark requests.
BASE = "http://127.0.0.1:8000"

# Goal: see whether lowering n-cpu-moe raises VRAM usage on GPU 1 and
# improves speed, now that GPU 1 is known to be isolated with ~3.5GB free
# at 256K context.

# llama-server command line shared by every configuration; the per-config
# "extra" arguments from CONFIGS are appended to it.
BASE_CMD = [
    r"llama_bin_run\llama-server.exe",
    "--model", MODEL,
    *"-ngl 999".split(),
    *"-sm none --main-gpu 1".split(),
    *"-c 4096 -np 1 -fa on".split(),  # small context for fast boot/testing
    *"--cache-type-k q4_0 --cache-type-v q4_0".split(),
    *"-ub 512 -b 2048".split(),
    *"-t 8 -tb 8".split(),
    *"--prio 3 --poll 50".split(),
    "--no-mmap",
    *"--port 8000 --host 0.0.0.0".split(),
]

# One entry per run: display name plus the -ncmoe value under test.
CONFIGS = [
    {"name": f"n-cpu-moe={layers}{tag}", "extra": ["-ncmoe", str(layers)]}
    for layers, tag in ((48, " (baseline)"), (40, ""), (32, ""), (24, ""))
]
|
||||
|
||||
def kill():
    """Force-kill every llama-server.exe process, then pause for 4 seconds.

    Best effort: taskkill's output and exit status are ignored (it fails
    when no server is running), and so is a failure to launch taskkill.
    """
    try:
        # Argument list instead of a shell string: no shell is spawned and
        # nothing is subject to shell parsing.
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        pass
    time.sleep(4)
|
||||
|
||||
def check_server(timeout=900):
    """Poll the server's /health endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling. A value of zero
            or less performs no request at all.

    Returns:
        True as soon as /health returns JSON whose "status" is "ok" or
        "ready"; False if that never happens before the timeout expires.
    """
    # Monotonic clock: the deadline is unaffected by system clock changes.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE}/health")
            with urllib.request.urlopen(req, timeout=2) as http_resp:
                resp = json.loads(http_resp.read())
            if resp.get("status") in ("ok", "ready"):
                return True
        except Exception:
            # Connection refused, HTTP error while loading, malformed body:
            # all mean "not ready yet". KeyboardInterrupt and SystemExit are
            # deliberately not caught so the poll loop can be interrupted.
            pass
        time.sleep(5)
    return False
|
||||
|
||||
def bench(runs=2):
    """Measure generation speed with short chat-completion requests.

    Each run posts the same fixed prompt (max_tokens=100, temperature=0.0)
    to /v1/chat/completions and computes the reported completion_tokens
    divided by the elapsed time of the whole request.

    Args:
        runs: Number of requests to send.

    Returns:
        A tuple (average tokens/s, best tokens/s) over all runs, or
        (0.0, 0.0) when runs is zero or negative.

    Raises:
        Whatever urllib or json raise on a failed request or a malformed
        response; errors are not swallowed here.
    """
    if runs <= 0:
        # Nothing to measure; avoids dividing by an empty sample below.
        return 0.0, 0.0

    # The request body is identical for every run, so build it once.
    payload = json.dumps({
        "model": "m",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Write a short Python script."}
        ],
        "max_tokens": 100,
        "temperature": 0.0
    }).encode('utf-8')

    speeds = []
    for _ in range(runs):
        req = urllib.request.Request(
            f"{BASE}/v1/chat/completions",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        t0 = time.perf_counter()
        with urllib.request.urlopen(req, timeout=600) as http_resp:
            resp = json.loads(http_resp.read())
        dt = time.perf_counter() - t0
        tokens = resp.get("usage", {}).get("completion_tokens", 0)
        speeds.append(tokens / dt if dt > 0 else 0)
    return sum(speeds) / len(speeds), max(speeds)
|
||||
|
||||
def vram():
    """Return the used memory of each GPU as reported by nvidia-smi.

    Returns:
        A list with one integer per output line of
        ``nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits``.
        Falls back to [0, 0] when nvidia-smi cannot be run, exits with an
        error, or prints something that is not a list of integers.
    """
    try:
        out = subprocess.check_output(
            [
                "nvidia-smi",
                "--query-gpu=memory.used",
                "--format=csv,noheader,nounits",
            ]
        ).decode().strip()
        # An empty report is treated like a failed query.
        return [int(line.strip()) for line in out.splitlines()] or [0, 0]
    except (OSError, subprocess.SubprocessError, ValueError):
        return [0, 0]
|
||||
|
||||
# Sweep every entry in CONFIGS: boot the server with that configuration,
# record GPU memory use and benchmark speed, then print a summary table.
results = []
for cfg in CONFIGS:
    kill()
    print(f"\n{'='*60}\nTesting: {cfg['name']}\n{'='*60}")

    cmd = BASE_CMD + cfg["extra"]
    proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    try:
        if not check_server(300):
            print(f" FAILED TO BOOT (OOM?)")
            results.append({"name": cfg["name"], "status": "BOOT_FAIL"})
            continue

        print(" Server ready! Warming up...")
        time.sleep(2)
        v = vram()
        # Report 0 rather than crash when fewer than two GPUs are listed.
        gpu1_mb = v[1] if len(v) > 1 else 0

        try:
            avg, _best = bench(runs=2)
        except Exception as exc:
            # A failed request should not abort the remaining configurations.
            print(f" BENCHMARK FAILED: {exc}")
            results.append({"name": cfg["name"], "status": "BENCH_FAIL"})
            continue

        print(f" >>> AVG: {avg:.2f} t/s | VRAM GPU1: {gpu1_mb}MB")

        results.append({
            "name": cfg["name"], "avg_tps": round(avg,2),
            "vram_gpu1": gpu1_mb, "status": "OK"
        })
    finally:
        # Always stop the server we started, including on Ctrl-C or an
        # unexpected error, so it is not left running.
        proc.terminate()

kill()
print("\nFINAL RESULTS:")
for r in results:
    if r["status"] == "OK":
        print(f" {r['name']:<25} {r['avg_tps']:>5} t/s | {r['vram_gpu1']:>5}MB")
    elif r["status"] == "BOOT_FAIL":
        print(f" {r['name']:<25} FAIL (OOM)")
    else:
        print(f" {r['name']:<25} FAIL ({r['status']})")
|
||||
Reference in New Issue
Block a user