feat: Variet Engine v1.0 + 5-model tuning complete

Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
This commit is contained in:
Variet-Worker
2026-04-07 18:08:58 +09:00
parent 7c7a899fd5
commit c111b3a9b0
414 changed files with 3402 additions and 68598 deletions

View File

@@ -0,0 +1,372 @@
"""
Qwen3.5 122B-A10B 자동 정밀 튜닝 스크립트
===========================================
각 설정 조합으로 서버를 재시작하고 벤치마크를 자동 수행합니다.
서버 로그에서 순수 eval time (t/s)를 파싱하여 정확한 비교 테이블을 출력합니다.
예상 소요 시간: 약 30-40분 (5개 설정 × ~6-7분/설정)
"""
import subprocess
import time
import json
import urllib.request
import os
import re
import sys
import datetime
try:
sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
pass
# Base URL the benchmark client uses for the health check and chat requests.
BASE_URL = "http://127.0.0.1:8000"
# Model file and server executable, given relative to the working directory
# (the server is launched with cwd=os.getcwd()).
MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
SERVER_EXE = r"llama_bin_run\llama-server.exe"
# ============================================================
# List of configurations to test
# ============================================================
# Common parameters (the ones that are never varied)
COMMON_ARGS = [
"--model", MODEL_PATH,
"-ngl", "999",
"--cpu-moe",
"-c", "2048",
"-np", "1",
"-fa", "on",
"--cache-type-k", "q4_0",
"--cache-type-v", "q4_0",
"-ub", "256",
"-b", "1024",
"--mlock",
# The port must stay in sync with BASE_URL above.
"--port", "8000",
# NOTE(review): binds to all interfaces although the client only talks to
# 127.0.0.1 — confirm that exposing the server on the network is intended.
"--host", "0.0.0.0",
"--no-warmup", # warm-up is performed explicitly by the benchmark script
]
# Variable parameter combinations. Each entry has a display "name", a
# human-readable "desc", and the "extra" arguments appended to COMMON_ARGS.
CONFIGS = [
{
"name": "A) --no-mmap -t 8",
"desc": "서버 권장: mmap 비활성화 (baseline 대비)",
"extra": ["--no-mmap", "-t", "8", "--prio", "2"],
},
{
"name": "B) --no-mmap -t 6",
"desc": "스레드 감소 (캐시 경합 회피)",
"extra": ["--no-mmap", "-t", "6", "--prio", "2"],
},
{
"name": "C) --no-mmap -t 10",
"desc": "스레드 증가 (RAM 대역폭 포화)",
"extra": ["--no-mmap", "-t", "10", "--prio", "2"],
},
{
"name": "D) --no-mmap -t 12",
"desc": "더 많은 스레드",
"extra": ["--no-mmap", "-t", "12", "--prio", "2"],
},
{
"name": "E) --no-mmap -t 10 --prio 3 --poll 100",
"desc": "최적 스레드 + 리얼타임 우선순위 + 폴링",
"extra": ["--no-mmap", "-t", "10", "--prio", "3", "--poll", "100"],
},
]
# ============================================================
# 유틸리티 함수
# ============================================================
def kill_server():
    """Force-kill every running llama-server.exe process, then pause.

    The kill is best-effort: a non-zero exit status from ``taskkill`` (for
    example when no server is running) or a failure to launch ``taskkill``
    at all is ignored. The function always sleeps three seconds afterwards.
    """
    try:
        # Argument list instead of a shell command string: no shell is
        # spawned and no output redirection syntax is needed.
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except OSError:
        # taskkill could not be started; keep the kill best-effort.
        pass
    time.sleep(3)
def start_server(config, log_path):
    """Start the server for ``config`` with its output redirected to a file.

    Args:
        config: Configuration dict; its ``"extra"`` arguments are appended
            to ``COMMON_ARGS``.
        log_path: Path of the file that receives the server's stdout and
            stderr (the file is truncated).

    Returns:
        ``(proc, log_file)``: the ``subprocess.Popen`` handle and the open
        log file object. The caller must close ``log_file``.

    Raises:
        OSError: If the log file cannot be opened or the server executable
            cannot be launched.
    """
    cmd = [SERVER_EXE, *COMMON_ARGS, *config["extra"]]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            cwd=os.getcwd(),
        )
    except BaseException:
        # The caller never receives the handle when the launch fails, so it
        # has to be closed here to avoid leaking it.
        log_file.close()
        raise
    return proc, log_file
def wait_for_server(timeout=600):
    """Wait until the server's ``/health`` endpoint reports ``"ok"``.

    The endpoint is polled every five seconds.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as a health response with ``status == "ok"`` is
        received, False if that did not happen within ``timeout`` seconds.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Any request or decoding failure means "not ready yet".
            # Catching Exception instead of using a bare except lets
            # Ctrl-C (KeyboardInterrupt) abort the wait.
            pass
        time.sleep(5)
    return False
def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and time the round trip.

    Args:
        prompt: User message sent as the only chat message.
        max_tokens: Generation limit forwarded in the request.

    Returns:
        ``(completion_tokens, elapsed)``: the ``usage.completion_tokens``
        value from the response (0 when absent) and the wall-clock seconds
        spent on the request.
    """
    request_body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(request_body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    began = time.time()
    with urllib.request.urlopen(request, timeout=600) as response:
        result = json.loads(response.read())
    elapsed = time.time() - began
    token_count = result.get("usage", {}).get("completion_tokens", 0)
    return token_count, elapsed
# Shared tail of both timing lines:
# "= XXXXX.XX ms / NNN tokens (XXX.XX ms per token, XX.XX tokens per second)"
_TIMING_TAIL = (
    r'\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*'
    r'\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
)
# The generation line must start with whitespace; that is what keeps it from
# matching inside a "prompt eval time" line.
_EVAL_TIME_RE = re.compile(r'^\s+eval time' + _TIMING_TAIL, re.MULTILINE)
_PROMPT_EVAL_TIME_RE = re.compile(r'prompt eval time' + _TIMING_TAIL, re.MULTILINE)


def _parse_timing_lines(log_path, pattern):
    """Return one dict per match of ``pattern`` in the log at ``log_path``.

    An unreadable log yields an empty list.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Only I/O problems mean "no data"; other errors should surface.
        return []
    return [
        {
            "total_ms": float(total_ms),
            "tokens": int(tokens),
            "ms_per_token": float(ms_per_token),
            "tps": float(tps),
        }
        for total_ms, tokens, ms_per_token, tps in pattern.findall(content)
    ]


def parse_eval_times(log_path):
    """Parse the "eval time" (generation) lines from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        A list, in log order, of dicts with the keys ``total_ms``,
        ``tokens``, ``ms_per_token`` and ``tps``. Empty if the file cannot
        be read or contains no matching line.
    """
    return _parse_timing_lines(log_path, _EVAL_TIME_RE)


def parse_prompt_eval_times(log_path):
    """Parse the "prompt eval time" lines from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        A list, in log order, of dicts with the keys ``total_ms``,
        ``tokens``, ``ms_per_token`` and ``tps``. Empty if the file cannot
        be read or contains no matching line.
    """
    return _parse_timing_lines(log_path, _PROMPT_EVAL_TIME_RE)
def parse_vram_usage(log_path):
    """Extract the "CUDA0 model buffer size" value from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        The size rounded to a whole number and formatted as ``"<n> MiB"``,
        or ``"N/A"`` when the file cannot be read, the line is missing, or
        the number cannot be parsed.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        return "N/A"
    match = re.search(r'CUDA0 model buffer size\s*=\s*([\d.]+)\s*MiB', content)
    if match is None:
        return "N/A"
    try:
        return f"{float(match.group(1)):.0f} MiB"
    except ValueError:
        # "[\d.]+" also matches strings such as "1.2.3" that are not numbers.
        return "N/A"
# ============================================================
# 메인 튜닝 루프
# ============================================================
def main():
    """Benchmark every entry in CONFIGS and print/save a comparison table.

    For each configuration the server is restarted with its output captured
    in ``tune_log_<idx>.txt``, one warm-up request and three benchmark
    prompts are sent, and the eval / prompt-eval throughput is parsed back
    out of that log. The table compares the results with the hard-coded
    baseline figures, and the raw eval speeds are written to
    ``tune_results_<timestamp>.txt`` in the current working directory.
    """
    # Figures of the earlier baseline run (mmap on, -t 8, --prio 2).
    baseline_avg = 10.02
    baseline_max = 10.06
    baseline_prompt = 29.52
    baseline_vram = "5392 MiB"
    minutes_per_config = 7

    prompts = [
        "Write a detailed explanation of how neural networks learn through backpropagation and gradient descent.",
        "Explain the complete process of photosynthesis including light and dark reactions in detail.",
        "Describe the differences between SQL and NoSQL databases with examples and performance characteristics.",
    ]

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    print("=" * 70)
    print(" Qwen3.5 122B-A10B 자동 정밀 튜닝")
    print(f" 시작 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f" 테스트 설정: {len(CONFIGS)}")
    # Printed with its unit (minutes) so the estimate is not a bare number.
    print(f" 예상 소요: ~{len(CONFIGS) * minutes_per_config}분")
    print("=" * 70)
    print()
    print(" 기존 Baseline: mmap on, -t 8, --prio 2 → 10.06 t/s (eval)")
    print()

    all_results = []
    for idx, config in enumerate(CONFIGS):
        config_start = time.time()
        log_path = os.path.join(os.getcwd(), f"tune_log_{idx}.txt")
        print(f"\n{'='*70}")
        print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}")
        print(f" {config['desc']}")
        print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
        print(f"{'='*70}")

        # 1. Stop any server that is still running.
        print(" [1/4] 서버 종료 중...")
        kill_server()

        # 2. Start the server for this configuration.
        print(" [2/4] 서버 시작 중... (모델 로딩 ~3-5분)")
        proc, log_file = start_server(config, log_path)
        warmup_ok = False
        try:
            # 3. Wait until the server answers the health check.
            ready = wait_for_server(timeout=600)
            if ready:
                load_time = time.time() - config_start
                print(f" [3/4] 서버 준비 완료! (로딩 {load_time:.0f}초)")

                # 4. One warm-up request followed by the measured runs.
                print(" [4/4] 벤치마크 실행 중...")
                try:
                    run_single_benchmark("Say hello.", max_tokens=20)
                    warmup_ok = True
                    print(" 워밍업 완료")
                except Exception as e:
                    print(f" 워밍업 실패: {e}")

                for i, prompt in enumerate(prompts):
                    try:
                        tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                        approx_tps = tokens / elapsed if elapsed > 0 else 0
                        print(f" Run {i+1}/{len(prompts)}: {tokens} tokens, {elapsed:.1f}s, ~{approx_tps:.2f} t/s (approx)")
                    except Exception as e:
                        print(f" Run {i+1}/{len(prompts)}: ERROR - {e}")

                # Give the server a moment to write its last log lines.
                time.sleep(2)
            else:
                print(" ❌ 서버 시작 실패! 다음 설정으로 넘어갑니다.")
        finally:
            # Always stop the server and release the log handle, even when
            # an unexpected error (or Ctrl-C) interrupts the run.
            kill_server()
            try:
                proc.wait(timeout=10)
            except subprocess.TimeoutExpired:
                pass
            log_file.close()

        if not ready:
            all_results.append({
                "config": config["name"],
                "status": "FAILED",
                "eval_tps": [],
                "prompt_tps": [],
                "vram": "N/A",
            })
            continue

        time.sleep(2)

        # Parse the captured server log.
        eval_times = parse_eval_times(log_path)
        prompt_times = parse_prompt_eval_times(log_path)
        vram = parse_vram_usage(log_path)

        # The first log entry belongs to the warm-up request, but only when
        # that request actually completed; otherwise every entry is a
        # measured run and nothing may be dropped.
        skip = 1 if warmup_ok else 0
        eval_speeds = [e["tps"] for e in eval_times[skip:]]
        prompt_speeds = [p["tps"] for p in prompt_times[skip:]]

        all_results.append({
            "config": config["name"],
            "status": "OK",
            "eval_tps": eval_speeds,
            "prompt_tps": prompt_speeds,
            "vram": vram,
        })

        config_elapsed = time.time() - config_start
        print(f"\n 완료! 소요: {config_elapsed:.0f}초")
        if eval_speeds:
            avg_eval = sum(eval_speeds) / len(eval_speeds)
            max_eval = max(eval_speeds)
            print(f" 📊 Eval TPS: avg={avg_eval:.2f}, max={max_eval:.2f}")

    # ============================================================
    # Final comparison table
    # ============================================================
    print("\n")
    print("=" * 80)
    print(" 🏆 최종 결과 비교 테이블")
    print("=" * 80)
    print()
    print(f" {'설정':<45} {'Eval t/s':>10} {'최대':>8} {'Prompt t/s':>12} {'VRAM':>12}")
    print(f" {'-'*45} {'-'*10} {'-'*8} {'-'*12} {'-'*12}")
    # Baseline row (result of the earlier run).
    print(f" {'[기준] mmap on, -t 8, --prio 2':<45} {baseline_avg:>10.2f} {baseline_max:>8.2f} {baseline_prompt:>12.2f} {baseline_vram:>12}")

    best_avg = 0
    best_config = ""
    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f" {r['config']:<45} {'FAILED':>10} {'':>8} {'':>12} {r['vram']:>12}")
            continue
        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
        if avg_e > best_avg:
            best_avg = avg_e
            best_config = r["config"]
        # Flag rows whose average beats the baseline's best single run.
        marker = " ▲" if avg_e > baseline_max else ""
        print(f" {r['config']:<45} {avg_e:>10.2f} {max_e:>8.2f} {avg_p:>12.2f} {r['vram']:>12}{marker}")

    print()
    if best_avg > 0:
        improvement = ((best_avg - baseline_avg) / baseline_avg) * 100
        print(f" 🏆 최고 성능: {best_config}")
        print(f"{best_avg:.2f} t/s (기준 {baseline_avg:.2f} t/s 대비 {improvement:+.1f}%)")
    print()
    print(f" 완료 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 80)

    # Also save the results to a file.
    result_path = os.path.join(os.getcwd(), f"tune_results_{timestamp}.txt")
    with open(result_path, "w", encoding="utf-8") as f:
        f.write("Qwen3.5 122B-A10B Fine Tuning Results\n")
        f.write(f"Date: {timestamp}\n\n")
        for r in all_results:
            f.write(f"{r['config']}: {r['eval_tps']} (VRAM: {r['vram']})\n")
    print(f" 결과 저장: {result_path}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,257 @@
"""
Qwen3.5 122B-A10B 정밀 튜닝 2라운드
====================================
1라운드 결과: mmap on이 더 빠르고, 스레드 수가 적을수록 빠름
→ mmap on 상태에서 스레드 수 4~8 범위를 정밀 탐색
"""
import subprocess
import time
import json
import urllib.request
import os
import re
import sys
import datetime
try:
sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
pass
# Base URL the benchmark client uses for the health check and chat requests.
BASE_URL = "http://127.0.0.1:8000"
# Model file and server executable, given relative to the working directory
# (the server is launched with cwd=os.getcwd()).
MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
SERVER_EXE = r"llama_bin_run\llama-server.exe"
# Arguments shared by every configuration. No "--no-mmap" here: this round
# keeps mmap at the server default, as the "mmap on" config names state.
COMMON_ARGS = [
"--model", MODEL_PATH,
"-ngl", "999",
"--cpu-moe",
"-c", "2048",
"-np", "1",
"-fa", "on",
"--cache-type-k", "q4_0",
"--cache-type-v", "q4_0",
"-ub", "256",
"-b", "1024",
"--mlock",
# The port must stay in sync with BASE_URL above.
"--port", "8000",
# NOTE(review): binds to all interfaces although the client only talks to
# 127.0.0.1 — confirm that exposing the server on the network is intended.
"--host", "0.0.0.0",
# The script sends its own warm-up request before the measured runs.
"--no-warmup",
]
# Round-2 configurations (F-J). Each entry has a display "name", a
# human-readable "desc", and the "extra" arguments appended to COMMON_ARGS.
CONFIGS = [
{
"name": "F) mmap on, -t 4",
"desc": "최소 스레드 (4개, 물리코어 절반)",
"extra": ["-t", "4", "--prio", "2"],
},
{
"name": "G) mmap on, -t 5",
"desc": "스레드 5개",
"extra": ["-t", "5", "--prio", "2"],
},
{
"name": "H) mmap on, -t 6",
"desc": "스레드 6개 (--no-mmap에서 최고였음)",
"extra": ["-t", "6", "--prio", "2"],
},
{
"name": "I) mmap on, -t 7",
"desc": "스레드 7개",
"extra": ["-t", "7", "--prio", "2"],
},
{
"name": "J) mmap on, -t 6, --prio 3",
"desc": "최적 스레드 + 리얼타임 우선순위",
"extra": ["-t", "6", "--prio", "3"],
},
]
def kill_server():
    """Force-kill every running llama-server.exe process, then pause.

    The kill is best-effort: a non-zero exit status from ``taskkill`` (for
    example when no server is running) or a failure to launch ``taskkill``
    at all is ignored. The function always sleeps three seconds afterwards.
    """
    try:
        # Argument list instead of a shell command string: no shell is
        # spawned and no output redirection syntax is needed.
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except OSError:
        # taskkill could not be started; keep the kill best-effort.
        pass
    time.sleep(3)
def start_server(config, log_path):
    """Start the server for ``config`` with its output redirected to a file.

    Args:
        config: Configuration dict; its ``"extra"`` arguments are appended
            to ``COMMON_ARGS``.
        log_path: Path of the file that receives the server's stdout and
            stderr (the file is truncated).

    Returns:
        ``(proc, log_file)``: the ``subprocess.Popen`` handle and the open
        log file object. The caller must close ``log_file``.

    Raises:
        OSError: If the log file cannot be opened or the server executable
            cannot be launched.
    """
    cmd = [SERVER_EXE, *COMMON_ARGS, *config["extra"]]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(
            cmd, stdout=log_file, stderr=subprocess.STDOUT, cwd=os.getcwd()
        )
    except BaseException:
        # Close the handle here: the caller never sees it on failure.
        log_file.close()
        raise
    return proc, log_file
def wait_for_server(timeout=600):
    """Wait until the server's ``/health`` endpoint reports ``"ok"``.

    The endpoint is polled every five seconds.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as a health response with ``status == "ok"`` is
        received, False if that did not happen within ``timeout`` seconds.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Any request or decoding failure means "not ready yet".
            # Exception (not a bare except) keeps Ctrl-C working.
            pass
        time.sleep(5)
    return False
def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and time the round trip.

    Args:
        prompt: User message sent as the only chat message.
        max_tokens: Generation limit forwarded in the request.

    Returns:
        ``(completion_tokens, elapsed)``: the ``usage.completion_tokens``
        value from the response (0 when absent) and the wall-clock seconds
        spent on the request.
    """
    chat_request = {
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    encoded = json.dumps(chat_request).encode("utf-8")
    http_request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=encoded,
        headers={"Content-Type": "application/json"},
    )
    t_begin = time.time()
    with urllib.request.urlopen(http_request, timeout=600) as http_response:
        reply = json.loads(http_response.read())
    duration = time.time() - t_begin
    usage_info = reply.get("usage", {})
    return usage_info.get("completion_tokens", 0), duration
# Shared tail of both timing lines:
# "= XXXXX.XX ms / NNN tokens (XXX.XX ms per token, XX.XX tokens per second)"
_TIMING_TAIL = (
    r'\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*'
    r'\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
)
# The generation line must start with whitespace; that is what keeps it from
# matching inside a "prompt eval time" line.
_EVAL_TIME_RE = re.compile(r'^\s+eval time' + _TIMING_TAIL, re.MULTILINE)
_PROMPT_EVAL_TIME_RE = re.compile(r'prompt eval time' + _TIMING_TAIL, re.MULTILINE)


def _find_timings(log_path, pattern):
    """Return the regex group tuples of ``pattern`` found in the log file.

    An unreadable log yields an empty list.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            return pattern.findall(f.read())
    except OSError:
        # Only I/O problems mean "no data"; other errors should surface.
        return []


def parse_eval_times(log_path):
    """Parse the "eval time" (generation) lines from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        A list, in log order, of ``{"tps": float, "tokens": int}`` dicts.
        Empty if the file cannot be read or has no matching line.
    """
    return [
        {"tps": float(m[3]), "tokens": int(m[1])}
        for m in _find_timings(log_path, _EVAL_TIME_RE)
    ]


def parse_prompt_eval_times(log_path):
    """Parse the "prompt eval time" lines from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        A list, in log order, of ``{"tps": float}`` dicts. Empty if the
        file cannot be read or has no matching line.
    """
    return [
        {"tps": float(m[3])}
        for m in _find_timings(log_path, _PROMPT_EVAL_TIME_RE)
    ]
def main():
    """Run the round-2 configurations and print a table merged with round 1.

    For each entry in CONFIGS the server is restarted with its output
    captured in ``tune_r2_log_<idx>.txt``, one warm-up request and three
    benchmark prompts are sent, and throughput figures are parsed from the
    log. The final table lists the hard-coded round-1 results followed by
    the round-2 measurements and names the configuration with the highest
    single-run eval speed.
    """
    # Round-1 results (hard-coded): (name, avg t/s, max t/s, prompt t/s).
    r1 = [
        ("[기준] mmap on, -t 8, --prio 2", 10.02, 10.06, 29.52),
        ("A) --no-mmap -t 8", 9.66, 9.70, 28.26),
        ("B) --no-mmap -t 6", 10.02, 10.18, 26.73),
        ("C) --no-mmap -t 10", 9.42, 9.46, 27.31),
        ("D) --no-mmap -t 12", 9.04, 9.11, 27.92),
        ("E) --no-mmap -t 10 --prio 3 --poll 100", 9.41, 9.45, 28.37),
    ]
    prompts = [
        "Write a detailed explanation of how neural networks learn through backpropagation.",
        "Explain the complete process of photosynthesis including light and dark reactions.",
        "Describe the differences between SQL and NoSQL databases with examples.",
    ]

    print("=" * 70)
    print(" Qwen3.5 122B-A10B 정밀 튜닝 - 2라운드")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f" 테스트: {len(CONFIGS)}개 설정 (mmap on + 스레드 4~7 정밀 탐색)")
    print("=" * 70)
    print()

    all_results = []
    for idx, config in enumerate(CONFIGS):
        config_start = time.time()
        log_path = os.path.join(os.getcwd(), f"tune_r2_log_{idx}.txt")
        print(f"\n{'='*70}")
        print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}")
        print(f" {config['desc']}")
        print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
        print(f"{'='*70}")

        kill_server()
        print(" [1/3] 서버 시작 중...")
        proc, log_file = start_server(config, log_path)
        warmup_ok = False
        try:
            ready = wait_for_server(timeout=600)
            if ready:
                load_time = time.time() - config_start
                print(f" [2/3] 서버 준비 완료! ({load_time:.0f}초)")

                # Warm-up request; its outcome decides whether the first
                # log entry has to be excluded later.
                try:
                    run_single_benchmark("Say hello.", max_tokens=20)
                    warmup_ok = True
                except Exception:
                    # Best-effort; Exception keeps Ctrl-C working.
                    pass

                print(" [3/3] 벤치마크 3회...")
                for i, prompt in enumerate(prompts):
                    try:
                        tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                        approx = tokens / elapsed if elapsed > 0 else 0
                        print(f" Run {i+1}: {tokens}tok, {elapsed:.1f}s, ~{approx:.2f} t/s")
                    except Exception as e:
                        print(f" Run {i+1}: ERROR - {e}")

                # Give the server a moment to write its last log lines.
                time.sleep(2)
            else:
                print(" ❌ 서버 시작 실패!")
        finally:
            # Always stop the server and release the log handle.
            kill_server()
            try:
                proc.wait(timeout=10)
            except subprocess.TimeoutExpired:
                pass
            log_file.close()

        if not ready:
            all_results.append({"config": config["name"], "status": "FAILED", "eval_tps": []})
            continue

        time.sleep(2)
        eval_times = parse_eval_times(log_path)
        prompt_times = parse_prompt_eval_times(log_path)

        # Drop the first entry only when it really is the warm-up request.
        skip = 1 if warmup_ok else 0
        eval_speeds = [e["tps"] for e in eval_times[skip:]]
        prompt_speeds = [p["tps"] for p in prompt_times[skip:]]
        all_results.append({
            "config": config["name"],
            "status": "OK",
            "eval_tps": eval_speeds,
            "prompt_tps": prompt_speeds,
        })
        if eval_speeds:
            print(f" 📊 Eval: avg={sum(eval_speeds)/len(eval_speeds):.2f}, max={max(eval_speeds):.2f}")

    # Final results
    print("\n")
    print("=" * 85)
    print(" 🏆 전체 튜닝 결과 (1라운드 + 2라운드 통합)")
    print("=" * 85)
    print()
    print(f" {'설정':<48} {'Avg t/s':>8} {'Max t/s':>8} {'Prompt':>8}")
    print(f" {'-'*48} {'-'*8} {'-'*8} {'-'*8}")

    for name, avg, mx, pp in r1:
        # Flag rows averaging at least 10 t/s.
        marker = " ★" if avg >= 10.0 else ""
        print(f" {name:<48} {avg:>8.2f} {mx:>8.2f} {pp:>8.2f}{marker}")
    print(f" {'--- 2라운드 ---':<48}")

    # Seed "best so far" with the fastest single run of round 1 instead of
    # a fixed 10.06, which is lower than the 10.18 listed in the table.
    best_config, _, best_peak, _ = max(r1, key=lambda row: row[2])
    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f" {r['config']:<48} {'FAIL':>8}")
            continue
        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
        if max_e > best_peak:
            best_peak = max_e
            best_config = r["config"]
        marker = " ★" if avg_e >= 10.0 else ""
        print(f" {r['config']:<48} {avg_e:>8.2f} {max_e:>8.2f} {avg_p:>8.2f}{marker}")

    print()
    # Separator added so the config name and the number do not run together.
    print(f" 🏆 최고 성능: {best_config} → {best_peak:.2f} t/s")
    print(f" 완료: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 85)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,339 @@
"""
Gemma4 26B-A4B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
Phase 1: -ngl sweep (GPU layers)
Phase 2: -t / -tb sweep (CPU threads)
Phase 3: -ub / -b sweep (batch sizes)
Phase 4: --cache-type-k/v sweep (KV cache precision)
Phase 5: --no-mmap, --poll, --prio sweep (misc)
Each phase fixes the best from previous phases.
"""
import subprocess
import time
import json
import urllib.request
import sys
import os
import itertools
try:
sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
pass
# Base URL the benchmark client uses for the health check and chat requests.
BASE_URL = "http://127.0.0.1:8000"
# Server executable and model file, relative to the working directory.
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"
# Context size passed to the server via -c (262144 = 256 * 1024).
CONTEXT = 262144
# Measured requests per configuration and the max_tokens of each request.
BENCHMARK_RUNS = 3
BENCHMARK_TOKENS = 200
# ─── Baseline (from previous tuning at -c 4096) ───
# Starting configuration; build_cmd() turns these keys into server arguments.
BEST = {
"ngl": 22,
"t": 8,
"tb": 8,
"ub": 512,
"b": 2048,
"ctk": "q4_0",
"ctv": "q4_0",
"fa": "on",
"mlock": True,
"mmap": True,
"prio": 2,
"poll": 50,
}
# Every successful test_config() result is appended here and dumped to JSON
# at the end of main().
ALL_RESULTS = []
def kill_server():
    """Force-terminate all llama-server.exe processes, then wait four seconds."""
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    # Output is captured only to keep taskkill's messages off the console.
    subprocess.run(taskkill_cmd, capture_output=True)
    time.sleep(4)
def build_cmd(cfg):
    """Translate a configuration dict into the llama-server argument list.

    Args:
        cfg: Mapping with the keys ``ngl``, ``fa``, ``ctk``, ``ctv``,
            ``ub``, ``b``, ``t``, ``tb``, ``prio``, ``poll``, ``mlock``
            and ``mmap``.

    Returns:
        The command as a list of strings, executable first.
    """
    argv = [LLAMA_SERVER, "--model", MODEL]
    argv += ["-ngl", str(cfg["ngl"])]
    argv += ["-c", str(CONTEXT), "-np", "1"]
    argv += ["-fa", cfg["fa"]]
    argv += ["--cache-type-k", cfg["ctk"], "--cache-type-v", cfg["ctv"]]
    # Numeric options are stringified in the same order as their flags.
    numeric_options = (
        ("-ub", "ub"),
        ("-b", "b"),
        ("-t", "t"),
        ("-tb", "tb"),
        ("--prio", "prio"),
        ("--poll", "poll"),
    )
    for flag, key in numeric_options:
        argv += [flag, str(cfg[key])]
    argv += ["--port", "8000", "--host", "0.0.0.0"]
    if cfg["mlock"]:
        argv.append("--mlock")
    # mmap is the default; only its absence needs a flag.
    if not cfg["mmap"]:
        argv.append("--no-mmap")
    return argv
def start_server(cfg):
    """Launch llama-server for ``cfg`` and return its process handle.

    The server's stdout/stderr are discarded. They used to be connected to
    a pipe that nothing ever read; once such a pipe's buffer is full the
    child blocks on its next write, which can stall the server in the
    middle of a benchmark.

    Args:
        cfg: Configuration dict accepted by ``build_cmd``.

    Returns:
        The ``subprocess.Popen`` handle of the started server.
    """
    return subprocess.Popen(
        build_cmd(cfg),
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
        cwd=os.getcwd(),
    )
def wait_for_server(timeout=180):
    """Wait until the server's ``/health`` endpoint reports ``"ok"``.

    The endpoint is polled every two seconds.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as a health response with ``status == "ok"`` is
        received, False if that did not happen within ``timeout`` seconds.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=3) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Any request or decoding failure means "not ready yet".
            # Exception (not a bare except) keeps Ctrl-C working.
            pass
        time.sleep(2)
    return False
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Run one fixed-prompt chat completion and return its throughput.

    Args:
        max_tokens: Generation limit forwarded in the request.

    Returns:
        ``completion_tokens / elapsed_seconds`` for the whole request
        (0 when no time elapsed). The figure covers the full round trip,
        not only token generation.
    """
    question = "Count from 1 to 50, writing each number on a new line."
    body = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": question}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode("utf-8")
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    t_start = time.time()
    with urllib.request.urlopen(request, timeout=300) as response:
        answer = json.loads(response.read())
    duration = time.time() - t_start
    generated = answer.get("usage", {}).get("completion_tokens", 0)
    if duration > 0:
        return generated / duration
    return 0
def get_vram():
    """Return ``(used_mib, total_mib)`` of the first GPU nvidia-smi lists.

    Returns:
        A pair of ints, or ``(0, 0)`` when nvidia-smi cannot be run, times
        out, or prints something that cannot be parsed.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        # nvidia-smi prints one CSV row per GPU. Splitting the whole output
        # on "," glued rows together on multi-GPU machines, so int() failed
        # and (0, 0) was always reported there.
        # NOTE(review): assumes the first row is the GPU the server uses —
        # confirm on multi-GPU setups.
        first_row = r.stdout.strip().splitlines()[0]
        used, total = first_row.split(",")
        return int(used.strip()), int(total.strip())
    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
        return 0, 0
def test_config(cfg, label=""):
    """Start the server with ``cfg``, benchmark it, and record the result.

    Args:
        cfg: Configuration dict accepted by ``start_server``.
        label: Short description used in the output; defaults to
            ``str(cfg)``.

    Returns:
        A dict with the configuration plus ``avg_tps``, ``best_tps``,
        ``vram_used``, ``vram_total`` and ``label`` (also appended to
        ``ALL_RESULTS``), or None when the server did not start or every
        benchmark request failed.
    """
    kill_server()
    desc = label or str(cfg)
    print(f" [{desc}] Starting server...")
    proc = start_server(cfg)
    speeds = []
    try:
        if not wait_for_server():
            print(f" [{desc}] FAILED to start")
            return None
        vram_used, vram_total = get_vram()
        print(f" [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)
        # Warm-up: the result is discarded and a failure is not fatal.
        try:
            run_benchmark(max_tokens=20)
        except Exception:
            # Exception (not a bare except) keeps Ctrl-C working.
            pass
        # Measured runs
        for _ in range(BENCHMARK_RUNS):
            try:
                speeds.append(run_benchmark())
            except Exception as e:
                print(f"ERR({e}) ", end="", flush=True)
    finally:
        # Stop the server on every path, including unexpected exceptions.
        proc.kill()
    if not speeds:
        print("ALL FAILED")
        return None
    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
    result = {**cfg, "avg_tps": avg, "best_tps": best,
              "vram_used": vram_used, "vram_total": vram_total, "label": label}
    ALL_RESULTS.append(result)
    return result
def phase_sweep(phase_name, param_name, values, base_cfg):
    """Benchmark each candidate value of one tuning phase and pick the best.

    Args:
        phase_name: Title printed in the phase banner.
        param_name: A single configuration key, or a list/tuple of keys
            that are varied together.
        values: Candidate values; when several keys are swept, each
            candidate is a sequence aligned with ``param_name``.
        base_cfg: Configuration the candidates are applied on top of. It is
            copied for every candidate and never modified.

    Returns:
        The ``test_config`` result with the highest ``avg_tps``, or None
        when no candidate produced a result.
    """
    print(f"\n{'='*70}")
    print(f" PHASE: {phase_name}")
    print(f" Sweeping: {param_name} = {values}")
    print(f"{'='*70}")
    best_result = None
    for val in values:
        cfg = {**base_cfg}
        if isinstance(param_name, (list, tuple)):
            overrides = dict(zip(param_name, val))
            cfg.update(overrides)
            label = " | ".join(f"{p}={v}" for p, v in overrides.items())
        else:
            cfg[param_name] = val
            label = f"{param_name}={val}"
        r = test_config(cfg, label)
        if r and (best_result is None or r["avg_tps"] > best_result["avg_tps"]):
            best_result = r
    if best_result:
        # Separator added so label and speed do not run together
        # (e.g. "ngl=22" followed directly by "74.65").
        print(f"\n ★ Phase winner: {best_result['label']} → {best_result['avg_tps']:.2f} t/s")
    return best_result
def main():
    """Run all sweep phases, verify the winning configuration, save results.

    Starting from ``BEST``, each phase sweeps one group of parameters with
    ``phase_sweep`` and its winner is fixed for the following phases. The
    final configuration is then benchmarked five more times, a recommended
    command line is printed, and ``ALL_RESULTS`` is written to
    ``scripts/tune_results_gemma4_256k.json``.
    """
    print("=" * 70)
    print(" Gemma4 26B-A4B COMPREHENSIVE Auto-Tuner")
    print(" 256K Context | RTX 3060 12GB")
    print("=" * 70)
    print()
    cfg = dict(BEST)

    # (banner title, parameter key(s), candidate values) for each phase.
    phases = [
        # Phase 1: -ngl (already done, quick verify top 3)
        ("GPU Layers (-ngl)", "ngl", [22, 21, 20]),
        # Phase 2: CPU threads (-t, -tb)
        ("CPU Threads (-t, -tb)", ["t", "tb"], [
            (2, 2), (4, 4), (4, 8), (6, 6), (6, 8),
            (8, 8), (8, 12), (10, 10), (12, 12), (16, 16),
        ]),
        # Phase 3: batch sizes (-ub, -b)
        ("Batch Sizes (-ub, -b)", ["ub", "b"], [
            (128, 512), (256, 1024), (256, 2048),
            (512, 1024), (512, 2048), (512, 4096),
            (1024, 2048), (1024, 4096),
        ]),
        # Phase 4: KV cache precision
        ("KV Cache Type (-ctk, -ctv)", ["ctk", "ctv"], [
            ("q4_0", "q4_0"),
            ("q8_0", "q8_0"),
            ("q4_0", "q8_0"),
            ("f16", "f16"),
        ]),
        # Phase 5: misc (mmap, poll, prio)
        ("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], [
            (True, 50, 2),    # baseline
            (False, 50, 2),   # no-mmap
            (True, 0, 2),     # no polling
            (True, 100, 2),   # max polling
            (True, 50, 3),    # realtime priority
            (False, 0, 3),    # no-mmap + no-poll + realtime
        ]),
    ]
    for title, params, candidates in phases:
        winner = phase_sweep(title, params, candidates, cfg)
        if winner:
            # Fix this phase's winning values for all later phases.
            for key in ([params] if isinstance(params, str) else params):
                cfg[key] = winner[key]

    # Final report
    print()
    print("=" * 70)
    print(" FINAL OPTIMAL CONFIGURATION")
    print("=" * 70)
    print(f" ngl: {cfg['ngl']}")
    print(f" threads: -t {cfg['t']} -tb {cfg['tb']}")
    print(f" batch: -ub {cfg['ub']} -b {cfg['b']}")
    print(f" kv cache: -ctk {cfg['ctk']} -ctv {cfg['ctv']}")
    print(f" flash: -fa {cfg['fa']}")
    print(f" mlock: {'yes' if cfg['mlock'] else 'no'}")
    print(f" mmap: {'yes' if cfg['mmap'] else 'no (--no-mmap)'}")
    print(f" prio: {cfg['prio']}")
    print(f" poll: {cfg['poll']}")
    print()

    # Final verification run
    print(" Running final verification (5 runs)...")
    kill_server()
    proc = start_server(cfg)
    final_speeds = []
    try:
        # Only benchmark when the server actually came up; otherwise every
        # request would just fail.
        if wait_for_server():
            try:
                run_benchmark(max_tokens=20)  # warm-up, result discarded
            except Exception:
                pass
            for i in range(5):
                try:
                    tps = run_benchmark()
                except Exception as e:
                    print(f" Run {i+1}: ERROR - {e}")
                    continue
                final_speeds.append(tps)
                print(f" Run {i+1}: {tps:.2f} t/s")
        else:
            print(" Server did not become ready; verification skipped")
    finally:
        proc.kill()

    if final_speeds:
        avg = sum(final_speeds) / len(final_speeds)
        best = max(final_speeds)
        print(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")
    print()
    cmd_parts = [
        f"llama-server --model {MODEL}",
        f"-ngl {cfg['ngl']} -c {CONTEXT}",
        f"-t {cfg['t']} -tb {cfg['tb']}",
        f"-ub {cfg['ub']} -b {cfg['b']}",
        f"-fa {cfg['fa']}",
        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
        f"--prio {cfg['prio']} --poll {cfg['poll']}",
    ]
    if cfg["mlock"]:
        cmd_parts.append("--mlock")
    if not cfg["mmap"]:
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")
    print(" Recommended command:")
    print(f" {' '.join(cmd_parts)}")
    print("=" * 70)

    # Dump all results to JSON. The directory is created if needed so a long
    # tuning run is not lost to a missing folder.
    results_path = "scripts/tune_results_gemma4_256k.json"
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)
    print(f"\n Full results saved: {results_path}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,163 @@
"""
Gemma4 26B --n-cpu-moe sweep + secondary param tuning at 256K context.
Gemma4 has 30 layers. Sweep n-cpu-moe from 0 to 30.
"""
import subprocess, time, json, urllib.request, sys, os
try:
    # Switch stdout to UTF-8 so the non-ASCII characters this script prints
    # (for example "★") can always be encoded.
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, OSError, ValueError):
    # stdout has no reconfigure() or refuses the change; continue with the
    # existing encoding. Narrowed from a bare except so that unrelated
    # errors and Ctrl-C are not swallowed.
    pass
# Base URL the benchmark client uses for the health check and chat requests.
BASE_URL = "http://127.0.0.1:8000"
# Server executable and model file, relative to the working directory.
SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"
# Context size passed to the server via -c (262144 = 256 * 1024).
CTX = 262144
# Number of measured benchmark requests per configuration.
RUNS = 3
def kill():
    """Force-terminate all llama-server.exe processes, then wait four seconds."""
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    # Output is captured only to keep taskkill's messages off the console.
    subprocess.run(taskkill_cmd, capture_output=True)
    time.sleep(4)
def start(ncpumoe, t=4, ub=512, b=2048, ctk="q4_0", ctv="q4_0", prio=3, nommap=False):
    """Launch llama-server with the given tuning parameters.

    The server's stdout/stderr are discarded. They used to be connected to
    a pipe that nothing ever read; once such a pipe's buffer is full the
    child blocks on its next write, which can stall the server in the
    middle of a benchmark.

    Args:
        ncpumoe: Value for ``--n-cpu-moe``; the flag is omitted when 0.
        t: Thread count, used for both ``-t`` and ``-tb``.
        ub: Value for ``-ub``.
        b: Value for ``-b``.
        ctk: Value for ``--cache-type-k``.
        ctv: Value for ``--cache-type-v``.
        prio: Value for ``--prio``.
        nommap: When true, ``--no-mmap`` is added.

    Returns:
        The ``subprocess.Popen`` handle of the started server.
    """
    cmd = [SERVER, "--model", MODEL, "-ngl", "999",
           "-c", str(CTX), "-np", "1", "-fa", "on",
           "--cache-type-k", ctk, "--cache-type-v", ctv,
           "-ub", str(ub), "-b", str(b), "-t", str(t), "-tb", str(t),
           "--prio", str(prio), "--poll", "50",
           "--mlock", "--port", "8000", "--host", "0.0.0.0"]
    if ncpumoe > 0:
        cmd.extend(["--n-cpu-moe", str(ncpumoe)])
    if nommap:
        cmd.append("--no-mmap")
    return subprocess.Popen(cmd, stdout=subprocess.DEVNULL,
                            stderr=subprocess.STDOUT, cwd=os.getcwd())
def wait_ready(timeout=240):
    """Wait until the server's ``/health`` endpoint reports ``"ok"``.

    The endpoint is polled every two seconds.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as a health response with ``status == "ok"`` is
        received, False if that did not happen within ``timeout`` seconds.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=3) as r:
                if json.loads(r.read()).get("status") == "ok":
                    return True
        except Exception:
            # Any request or decoding failure means "not ready yet".
            # Exception (not a bare except) keeps Ctrl-C working.
            pass
        time.sleep(2)
    return False
def bench(n=200):
    """Run one fixed-prompt chat completion and return its throughput.

    Args:
        n: ``max_tokens`` value forwarded in the request.

    Returns:
        ``completion_tokens / elapsed_seconds`` for the whole request
        (0 when no time elapsed).
    """
    chat = {
        "model": "m",
        "messages": [{
            "role": "user",
            "content": "Count from 1 to 50, each number on new line.",
        }],
        "max_tokens": n,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(chat).encode(),
        headers={"Content-Type": "application/json"},
    )
    began = time.time()
    with urllib.request.urlopen(request, timeout=300) as response:
        reply = json.loads(response.read())
    took = time.time() - began
    generated = reply.get("usage", {}).get("completion_tokens", 0)
    if took > 0:
        return generated / took
    return 0
def vram():
    """Return ``(used_mib, total_mib)`` of the first GPU nvidia-smi lists.

    Returns:
        A pair of ints, or ``(0, 0)`` when nvidia-smi cannot be run, times
        out, or prints something that cannot be parsed.
    """
    try:
        r = subprocess.run(["nvidia-smi", "--query-gpu=memory.used,memory.total",
                            "--format=csv,noheader,nounits"],
                           capture_output=True, text=True, timeout=5)
        # nvidia-smi prints one CSV row per GPU. Unpacking the whole output
        # split on "," failed on multi-GPU machines, so (0, 0) was always
        # reported there.
        # NOTE(review): assumes the first row is the GPU the server uses —
        # confirm on multi-GPU setups.
        first_row = r.stdout.strip().splitlines()[0]
        a, b = first_row.split(",")
        return int(a.strip()), int(b.strip())
    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
        return 0, 0
def test(label, ncpumoe, **kw):
    """Start the server with the given settings and benchmark it.

    Args:
        label: Short description printed with the result.
        ncpumoe: ``--n-cpu-moe`` value passed to ``start``.
        **kw: Further keyword arguments forwarded to ``start``.

    Returns:
        A dict with ``label``, ``ncpumoe``, ``avg``, ``best``, ``vram`` and
        the forwarded keyword arguments, or None when the server did not
        start or every benchmark request failed.
    """
    kill()
    print(f" [{label}] Starting...", end=" ", flush=True)
    p = start(ncpumoe, **kw)
    speeds = []
    try:
        if not wait_ready():
            print("FAILED")
            return None
        vu, vt = vram()
        print(f"VRAM:{vu}/{vt} | ", end="", flush=True)
        # Warm-up: the result is discarded and a failure is not fatal.
        try:
            bench(20)
        except Exception:
            # Exception (not a bare except) keeps Ctrl-C working.
            pass
        for _ in range(RUNS):
            try:
                speeds.append(bench())
            except Exception as e:
                # Report the failure instead of hiding it.
                print(f"ERR({e}) ", end="", flush=True)
    finally:
        # Stop the server on every path, including unexpected exceptions.
        p.kill()
    if not speeds:
        print("BENCH FAILED")
        return None
    avg, best = sum(speeds) / len(speeds), max(speeds)
    print(f"AVG:{avg:.1f} BEST:{best:.1f} t/s")
    return {"label": label, "ncpumoe": ncpumoe, "avg": avg, "best": best,
            "vram": vu, **kw}
def main():
    """Sweep ``--n-cpu-moe`` and then threads, batch sizes and KV cache type.

    Phase 1 tests n-cpu-moe in steps of five and refines around the best
    value; the later phases reuse that value. All successful results are
    written to ``scripts/tune_results_gemma4_ncpumoe.json``.
    """
    max_layers = 30  # upper bound of the n-cpu-moe sweep

    def use_nommap(n):
        # use --no-mmap when heavy CPU offload
        return n > 15

    def by_avg(result):
        return result["avg"]

    print("=" * 60)
    print(" Gemma4 26B 256K | --n-cpu-moe Sweep + Param Tune")
    print("=" * 60)
    results = []

    # Phase 1: n-cpu-moe sweep (0, 5, 10, 15, 20, 25, 30)
    print("\n--- Phase 1: --n-cpu-moe sweep ---")
    for n in [0, 5, 10, 15, 20, 25, 30]:
        r = test(f"ncpumoe={n}", n, nommap=use_nommap(n))
        if r:
            results.append(r)

    # Without a single successful run there is nothing to compare, and
    # max() on an empty list would raise ValueError.
    if not results:
        print("\n No configuration produced a result; aborting.")
        return

    # Find best n-cpu-moe
    best_r = max(results, key=by_avg)
    best_n = best_r["ncpumoe"]
    print(f"\n ★ Best n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")

    # Fine-tune around best
    if best_n > 0:
        print(f"\n--- Phase 1b: Fine-tune around ncpumoe={best_n} ---")
        already_tested = {r["ncpumoe"] for r in results}
        # Clamp every candidate to [0, max_layers]; previously best_n + 1
        # could exceed the upper bound when best_n was already at it.
        candidates = sorted(
            {min(max_layers, max(0, best_n + delta)) for delta in (-3, -1, 1, 3)}
            - already_tested
        )
        for n in candidates:
            r = test(f"ncpumoe={n}", n, nommap=use_nommap(n))
            if r:
                results.append(r)
        best_r = max(results, key=by_avg)
        best_n = best_r["ncpumoe"]
        print(f"\n ★ Refined n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")

    # Phase 2: Thread sweep at best n-cpu-moe
    nm = use_nommap(best_n)
    print(f"\n--- Phase 2: Thread sweep (ncpumoe={best_n}) ---")
    for t in [2, 4, 6, 8, 10]:
        r = test(f"t={t}", best_n, t=t, nommap=nm)
        if r:
            results.append(r)
    best_t = max((x for x in results if x["ncpumoe"] == best_n), key=by_avg)
    # Phase-1 results carry no "t" key; they ran with start()'s default of 4.
    bt = best_t.get("t", 4)
    print(f"\n ★ Best threads: {bt}")

    # Phase 3: Batch sweep
    print("\n--- Phase 3: Batch sweep ---")
    for ub, b in [(256, 1024), (512, 2048), (512, 4096), (1024, 2048)]:
        r = test(f"ub={ub},b={b}", best_n, t=bt, ub=ub, b=b, nommap=nm)
        if r:
            results.append(r)

    # Phase 4: KV cache type
    # NOTE(review): this phase runs with the default batch sizes, not the
    # Phase 3 winner — confirm whether that is intended.
    print("\n--- Phase 4: KV cache type ---")
    for ctk, ctv in [("q4_0", "q4_0"), ("q8_0", "q8_0")]:
        r = test(f"kv={ctk}", best_n, t=bt, ctk=ctk, ctv=ctv, nommap=nm)
        if r:
            results.append(r)

    # Final report
    best_all = max(results, key=by_avg)
    print(f"\n{'='*60}")
    print(f" FINAL BEST: {best_all['label']} → {best_all['avg']:.1f} t/s (VRAM: {best_all['vram']})")
    print(f"{'='*60}")

    # Create the output directory if needed so the results of a long tuning
    # run are not lost to a missing folder.
    results_path = "scripts/tune_results_gemma4_ncpumoe.json"
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)
    print(f" Saved: {results_path}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,335 @@
"""
Qwen3.5 35B-A3B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
Based on existing optimized setting: --cpu-moe -ngl 999 -c 4096 -t 6 (35 t/s)
Now tuning for -c 262144 (256K context).
Phase 1: --cpu-moe vs no --cpu-moe baseline
Phase 2: -t / -tb sweep
Phase 3: -ub / -b sweep
Phase 4: --cache-type-k/v sweep
Phase 5: Misc (mmap, poll, prio)
"""
import subprocess
import time
import json
import urllib.request
import sys
import os
try:
sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
pass
# Base URL the benchmark client uses for the health check and chat requests.
BASE_URL = "http://127.0.0.1:8000"
# Server executable and model file, relative to the working directory.
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
# Context size passed to the server via -c (262144 = 256 * 1024).
CONTEXT = 262144
# Measured requests per configuration and the max_tokens of each request.
BENCHMARK_RUNS = 3
BENCHMARK_TOKENS = 200
# Starting configuration; build_cmd() turns these keys into server arguments
# ("cpu_moe" adds the --cpu-moe flag when true).
BEST = {
"ngl": 999,
"cpu_moe": True,
"t": 6,
"tb": 6,
"ub": 512,
"b": 2048,
"ctk": "q4_0",
"ctv": "q4_0",
"fa": "on",
"mlock": True,
"mmap": True,
"prio": 2,
"poll": 50,
}
# Every successful test_config() result is appended to this list.
ALL_RESULTS = []
def kill_server():
    """Force-terminate all llama-server.exe processes, then wait four seconds."""
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    # Output is captured only to keep taskkill's messages off the console.
    subprocess.run(taskkill_cmd, capture_output=True)
    time.sleep(4)
def build_cmd(cfg):
    """Translate a configuration dict into the llama-server argument list.

    Args:
        cfg: Mapping with the keys ``ngl``, ``fa``, ``ctk``, ``ctv``,
            ``ub``, ``b``, ``t``, ``tb``, ``prio``, ``poll``, ``mlock``
            and ``mmap``; ``cpu_moe`` is optional.

    Returns:
        The command as a list of strings, executable first.
    """
    argv = [LLAMA_SERVER, "--model", MODEL]
    argv += ["-ngl", str(cfg["ngl"])]
    argv += ["-c", str(CONTEXT), "-np", "1"]
    argv += ["-fa", cfg["fa"]]
    argv += ["--cache-type-k", cfg["ctk"], "--cache-type-v", cfg["ctv"]]
    # Numeric options are stringified in the same order as their flags.
    numeric_options = (
        ("-ub", "ub"),
        ("-b", "b"),
        ("-t", "t"),
        ("-tb", "tb"),
        ("--prio", "prio"),
        ("--poll", "poll"),
    )
    for flag, key in numeric_options:
        argv += [flag, str(cfg[key])]
    argv += ["--port", "8000", "--host", "0.0.0.0"]
    # A missing "cpu_moe" key counts as off.
    if cfg.get("cpu_moe"):
        argv.append("--cpu-moe")
    if cfg["mlock"]:
        argv.append("--mlock")
    # mmap is the default; only its absence needs a flag.
    if not cfg["mmap"]:
        argv.append("--no-mmap")
    return argv
def start_server(cfg):
    """Launch llama-server for ``cfg`` and return its process handle.

    The server's stdout/stderr are discarded. They used to be connected to
    a pipe that nothing ever read; once such a pipe's buffer is full the
    child blocks on its next write, which can stall the server in the
    middle of a benchmark.

    Args:
        cfg: Configuration dict accepted by ``build_cmd``.

    Returns:
        The ``subprocess.Popen`` handle of the started server.
    """
    return subprocess.Popen(
        build_cmd(cfg),
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
        cwd=os.getcwd(),
    )
def wait_for_server(timeout=240):
    """Wait until the server's ``/health`` endpoint reports ``"ok"``.

    The endpoint is polled every two seconds.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as a health response with ``status == "ok"`` is
        received, False if that did not happen within ``timeout`` seconds.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=3) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Any request or decoding failure means "not ready yet".
            # Exception (not a bare except) keeps Ctrl-C working.
            pass
        time.sleep(2)
    return False
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Run one fixed-prompt chat completion and return its throughput.

    Args:
        max_tokens: Generation limit forwarded in the request.

    Returns:
        ``completion_tokens / elapsed_seconds`` for the whole request
        (0 when no time elapsed). The figure covers the full round trip,
        not only token generation.
    """
    question = "Count from 1 to 50, writing each number on a new line."
    body = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": question}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode("utf-8")
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    t_start = time.time()
    with urllib.request.urlopen(request, timeout=300) as response:
        answer = json.loads(response.read())
    duration = time.time() - t_start
    generated = answer.get("usage", {}).get("completion_tokens", 0)
    if duration > 0:
        return generated / duration
    return 0
def get_vram():
    """Return ``(used, total)`` GPU memory in MiB as reported by nvidia-smi.

    nvidia-smi prints one CSV row per GPU. The rows are summed so the result
    is meaningful on multi-GPU machines; with a single GPU the result is that
    GPU's own figures.

    Returns:
        A ``(used, total)`` tuple of ints, or ``(0, 0)`` when nvidia-smi is
        missing, times out, or prints something that cannot be parsed.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        used = 0
        total = 0
        for row in r.stdout.splitlines():
            if not row.strip():
                continue
            fields = row.split(",")
            used += int(fields[0].strip())
            total += int(fields[1].strip())
        return used, total
    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
        # Best-effort metric: VRAM figures are informational only.
        return 0, 0
def test_config(cfg, label=""):
    """Benchmark one configuration and record the result.

    Kills any running server, starts a new one with ``cfg``, waits for it to
    become healthy, does one short warm-up request, then times
    ``BENCHMARK_RUNS`` requests. The server is always killed afterwards,
    including when an unexpected exception escapes.

    Args:
        cfg: Configuration dict accepted by ``build_cmd``.
        label: Short description used in log output and stored in the result;
            falls back to ``str(cfg)`` for log output when empty.

    Returns:
        A copy of ``cfg`` extended with ``avg_tps``, ``best_tps``,
        ``vram_used``, ``vram_total`` and ``label`` (also appended to
        ``ALL_RESULTS``), or None if the server did not start or every
        timed run failed.
    """
    kill_server()
    desc = label or str(cfg)
    print(f" [{desc}] Starting server...", flush=True)
    proc = start_server(cfg)
    speeds = []
    try:
        if not wait_for_server():
            print(f" [{desc}] FAILED to start")
            return None
        vram_used, vram_total = get_vram()
        print(f" [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)
        # Warmup: result is irrelevant and a failure here is not fatal.
        try:
            run_benchmark(max_tokens=20)
        except Exception:
            pass
        for _ in range(BENCHMARK_RUNS):
            try:
                speeds.append(run_benchmark())
            except Exception as e:
                print(f"ERR({e}) ", end="", flush=True)
    finally:
        # Always stop and reap the server so it cannot outlive this test.
        proc.kill()
        proc.wait()
    if not speeds:
        print("ALL FAILED")
        return None
    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
    result = {**cfg, "avg_tps": avg, "best_tps": best,
              "vram_used": vram_used, "vram_total": vram_total, "label": label}
    ALL_RESULTS.append(result)
    return result
def phase_sweep(phase_name, param_name, values, base_cfg):
    """Benchmark ``base_cfg`` once per candidate value and return the fastest.

    Args:
        phase_name: Title printed in the phase banner.
        param_name: Either a single config key, or a list of keys that are
            varied together.
        values: Candidate values; when ``param_name`` is a list, each
            candidate is a sequence with one value per key.
        base_cfg: Configuration that every candidate is applied on top of;
            it is copied, never modified.

    Returns:
        The ``test_config`` result with the highest ``avg_tps``, or None if
        no candidate produced a result.
    """
    print(f"\n{'='*70}")
    print(f" PHASE: {phase_name}")
    print(f" Sweeping: {param_name} = {values}")
    print(f"{'='*70}")
    best_result = None
    for val in values:
        cfg = {**base_cfg}
        if isinstance(param_name, list):
            for p, v in zip(param_name, val):
                cfg[p] = v
            label = " | ".join(f"{p}={v}" for p, v in zip(param_name, val))
        else:
            cfg[param_name] = val
            label = f"{param_name}={val}"
        r = test_config(cfg, label)
        if r and (best_result is None or r["avg_tps"] > best_result["avg_tps"]):
            best_result = r
    if best_result:
        # Separator keeps the label's trailing value from running into the
        # speed (e.g. "tb=6" followed by "35.12").
        print(f"\n ★ Phase winner: {best_result['label']} → {best_result['avg_tps']:.2f} t/s")
    return best_result
def main():
    """Run all sweep phases, verify the winning configuration, save results.

    Each phase sweeps one group of parameters on top of the best values found
    so far. The final configuration is then re-benchmarked with five runs, a
    ready-to-paste command line is printed, and every recorded result is
    written to ``scripts/tune_results_qwen35b_256k.json``.
    """
    print("=" * 70)
    print(" Qwen3.5 35B-A3B COMPREHENSIVE Auto-Tuner")
    print(" 256K Context | RTX 3060 12GB")
    print(" Baseline: --cpu-moe -ngl 999 -c 4096 -t 6 → 35 t/s")
    print("=" * 70)
    print()
    cfg = dict(BEST)

    # (phase title, swept key or keys, candidate values), in execution order.
    # A plain string key makes phase_sweep label runs as "key=value".
    phases = [
        ("CPU-MoE Mode", "cpu_moe", [True, False]),
        ("CPU Threads (-t, -tb)", ["t", "tb"], [
            (2, 2), (4, 4), (4, 6), (6, 6), (6, 8),
            (8, 8), (8, 12), (10, 10), (12, 12),
        ]),
        ("Batch Sizes (-ub, -b)", ["ub", "b"], [
            (128, 512), (256, 1024), (256, 2048),
            (512, 1024), (512, 2048), (512, 4096),
            (1024, 2048), (1024, 4096),
        ]),
        ("KV Cache Type", ["ctk", "ctv"], [
            ("q4_0", "q4_0"),
            ("q8_0", "q8_0"),
            ("f16", "f16"),
        ]),
        ("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], [
            (True, 50, 2),
            (False, 50, 2),
            (True, 0, 2),
            (True, 100, 2),
            (True, 50, 3),
        ]),
    ]
    for title, params, candidates in phases:
        r = phase_sweep(title, params, candidates, cfg)
        if r:
            # Carry this phase's winning values into the following phases.
            for key in ([params] if isinstance(params, str) else params):
                cfg[key] = r[key]

    # ─── Final Report ───
    print()
    print("=" * 70)
    print(" FINAL OPTIMAL CONFIGURATION")
    print("=" * 70)
    for k, v in cfg.items():
        print(f" {k:>12}: {v}")
    print()

    # Final verification
    print(" Running final verification (5 runs)...")
    kill_server()
    proc = start_server(cfg)
    final_speeds = []
    try:
        if wait_for_server():
            # Warmup: a failure here is not fatal.
            try:
                run_benchmark(max_tokens=20)
            except Exception:
                pass
            for i in range(5):
                try:
                    tps = run_benchmark()
                except Exception as e:
                    print(f" Run {i+1}: ERROR ({e})")
                    continue
                final_speeds.append(tps)
                print(f" Run {i+1}: {tps:.2f} t/s")
        else:
            print(" FAILED to start for final verification")
    finally:
        # Never leave the server running, whatever happened above.
        proc.kill()

    if final_speeds:
        avg = sum(final_speeds) / len(final_speeds)
        best = max(final_speeds)
        print(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")
    print()

    cmd_parts = [
        f"llama-server --model {MODEL}",
        f"-ngl {cfg['ngl']} -c {CONTEXT}",
    ]
    if cfg.get("cpu_moe"):
        cmd_parts.append("--cpu-moe")
    cmd_parts.extend([
        f"-t {cfg['t']} -tb {cfg['tb']}",
        f"-ub {cfg['ub']} -b {cfg['b']}",
        f"-fa {cfg['fa']}",
        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
        f"--prio {cfg['prio']} --poll {cfg['poll']}",
    ])
    if cfg["mlock"]:
        cmd_parts.append("--mlock")
    if not cfg["mmap"]:
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")
    print(" Recommended command:")
    print(f" {' '.join(cmd_parts)}")
    print("=" * 70)

    with open("scripts/tune_results_qwen35b_256k.json", "w", encoding="utf-8") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)
    print("\n Full results saved: scripts/tune_results_qwen35b_256k.json")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,531 @@
/**
* Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
* ========================================================================
* Tests 4 models across multiple parameter configurations to find
* the absolute best model + settings for 256K context coding agent.
*
* Models:
* 1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB)
* 2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB)
* 3. Gemma4 26B-A4B Q4_K_M (~15.6 GB)
* 4. Gemma4 26B-A4B MXFP4_MOE (~15.5 GB)
*
* Run: node scripts/dual_gpu_benchmark.mjs
*/
import { spawn, execSync } from "child_process";
import { writeFileSync, statSync, existsSync } from "fs";
import { resolve } from "path";
// ─── Configuration ─────────────────────────────────────────────
// Base URL used for the /health and /v1/chat/completions requests; the port
// matches the --port value that buildCmd passes to the server.
const BASE_URL = "http://127.0.0.1:8000";
// Server executable, resolved relative to the process working directory
// (startServer spawns it with cwd = process.cwd()).
const LLAMA_SERVER = String.raw`llama_bin_run\llama-server.exe`;
// Context size passed to the server as -c.
const CONTEXT = 262144; // 256K
// Number of timed requests per configuration in testConfig.
const BENCHMARK_RUNS = 3;
// Default max_tokens for each runBenchmark request.
const BENCHMARK_TOKENS = 200;
// Default time waitForServer polls /health before giving up.
const SERVER_TIMEOUT = 300_000; // ms
// Models to benchmark, in order. `path` is handed to --model; `totalLayers`
// is used to derive the reduced -ngl fallback and to bound the n-cpu-moe
// sweep; `name` and `quant` are copied into each result record.
const MODELS = [
{
name: "Qwen3.5-35B-A3B Q4_K_M",
path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
type: "qwen", quant: "Q4_K_M", totalLayers: 64,
},
{
name: "Qwen3.5-35B-A3B MXFP4_MOE",
path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
type: "qwen", quant: "MXFP4_MOE", totalLayers: 64,
},
{
name: "Gemma4 26B-A4B Q4_K_M",
path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
type: "gemma4", quant: "Q4_K_M", totalLayers: 30,
},
{
name: "Gemma4 26B-A4B MXFP4_MOE",
path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
type: "gemma4", quant: "MXFP4_MOE", totalLayers: 30,
},
];
// Every result record produced by testConfig and phase5_final, across all
// models; written to scripts/dual_gpu_results.json by main.
const ALL_RESULTS = [];
// ─── Utility ───────────────────────────────────────────────────
/**
 * Print a message to stdout prefixed with the current wall-clock time
 * (24-hour format, "ko-KR" locale).
 */
function log(msg) {
  const stamp = new Date().toLocaleTimeString("ko-KR", { hour12: false });
  const line = `[${stamp}] ${msg}`;
  console.log(line);
}
/**
 * Return a promise that resolves after `ms` milliseconds.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Force-kill every llama-server.exe process via Windows `taskkill`, then
 * wait five seconds. A failing taskkill call is ignored.
 *
 * @returns {Promise<void>} resolves once the five-second pause has elapsed.
 */
async function killServer() {
  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // Ignored on purpose (e.g. there may be no server running).
  }
  await sleep(5000);
}
/**
 * Query nvidia-smi for per-GPU memory usage.
 *
 * @returns {{gpu: number, used: number, total: number}[]} one entry per
 *   output row (values as printed by nvidia-smi with `nounits`), or an
 *   empty array when the command fails or times out.
 */
function getVramAll() {
  const query =
    'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits';
  try {
    const raw = execSync(query, { encoding: "utf-8", timeout: 5000 });
    const gpus = [];
    for (const row of raw.trim().split("\n")) {
      const cells = row.split(",").map((cell) => parseInt(cell.trim()));
      gpus.push({ gpu: cells[0], used: cells[1], total: cells[2] });
    }
    return gpus;
  } catch {
    return [];
  }
}
/**
 * Build the llama-server command line for one configuration.
 *
 * @param {string} modelPath value passed to --model.
 * @param {object} params ngl, t, ub, b, ctk, ctv are required; cpuMoe
 *   (default false), nCpuMoe (default 0), prio (default 3) and nommap
 *   (default false) are optional.
 * @returns {string[]} the executable followed by its arguments.
 */
function buildCmd(modelPath, params) {
  const { ngl, t, ub, b, ctk, ctv } = params;
  const { cpuMoe = false, nCpuMoe = 0, prio = 3, nommap = false } = params;

  const threads = String(t); // -t and -tb always get the same value
  const argv = [LLAMA_SERVER, "--model", modelPath];
  argv.push("-ngl", String(ngl));
  argv.push("-c", String(CONTEXT));
  argv.push("-np", "1");
  argv.push("-fa", "on");
  argv.push("--cache-type-k", ctk);
  argv.push("--cache-type-v", ctv);
  argv.push("-ub", String(ub));
  argv.push("-b", String(b));
  argv.push("-t", threads);
  argv.push("-tb", threads);
  argv.push("--prio", String(prio));
  argv.push("--poll", "50");
  argv.push("--mlock");
  argv.push("--port", "8000");
  argv.push("--host", "0.0.0.0");

  // --cpu-moe takes precedence over --n-cpu-moe when both are requested.
  if (cpuMoe) {
    argv.push("--cpu-moe");
  } else if (nCpuMoe > 0) {
    argv.push("--n-cpu-moe", String(nCpuMoe));
  }
  if (nommap) {
    argv.push("--no-mmap");
  }
  return argv;
}
/**
 * Spawn llama-server for the given model and parameters.
 *
 * The child's stdio is ignored: nothing in this script reads the server
 * log, and an unread pipe can stall the child once the pipe buffer fills.
 * A spawn failure (for example a missing executable) is logged instead of
 * surfacing as an unhandled 'error' event, so the caller sees it as a
 * server that never becomes healthy.
 *
 * @param {string} modelPath value passed to --model.
 * @param {object} params configuration accepted by buildCmd.
 * @returns {import("child_process").ChildProcess} the spawned process.
 */
function startServer(modelPath, params) {
  const [exe, ...args] = buildCmd(modelPath, params);
  log(` CMD: ${exe} ${args.slice(-12).join(" ")} ...`);
  const proc = spawn(exe, args, {
    cwd: process.cwd(),
    stdio: "ignore",
  });
  proc.on("error", (err) => {
    log(` Failed to launch ${exe}: ${err.message}`);
  });
  return proc;
}
/**
 * Poll GET /health every three seconds until it answers with JSON whose
 * `status` is "ok", or until the timeout expires. Request and parse
 * failures are treated as "not ready yet".
 *
 * @param {number} [timeoutMs=SERVER_TIMEOUT] maximum time to poll, in ms.
 * @returns {Promise<{ok: boolean, bootTime: number}>} bootTime is in
 *   seconds: elapsed time on success, the full timeout on failure.
 */
async function waitForServer(timeoutMs = SERVER_TIMEOUT) {
  const began = Date.now();
  for (;;) {
    if (!(Date.now() - began < timeoutMs)) {
      return { ok: false, bootTime: timeoutMs / 1000 };
    }
    try {
      const resp = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const health = await resp.json();
      if (health.status === "ok") {
        return { ok: true, bootTime: (Date.now() - began) / 1000 };
      }
    } catch {
      // Server not reachable yet; retry after the pause below.
    }
    await sleep(3000);
  }
}
/**
 * Send one chat-completion request and measure its throughput.
 *
 * @param {number} [maxTokens=BENCHMARK_TOKENS] value sent as max_tokens.
 * @returns {Promise<{tps: number, completionTokens: number,
 *   promptTokens: number, elapsed: number}>} tps is completion tokens
 *   divided by the wall-clock duration of the whole request, in seconds.
 * @throws {Error} on network failure, timeout (600 s), unparsable body, or
 *   a non-2xx HTTP status. Without the status check an error response with
 *   a JSON body would be recorded as a valid 0 t/s run and skew averages.
 */
async function runBenchmark(maxTokens = BENCHMARK_TOKENS) {
  const payload = JSON.stringify({
    model: "local-model",
    messages: [{ role: "user", content: "Count from 1 to 50, writing each number on a new line." }],
    max_tokens: maxTokens,
    temperature: 0.0,
  });
  const start = Date.now();
  const resp = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
    signal: AbortSignal.timeout(600_000),
  });
  if (!resp.ok) {
    throw new Error(`HTTP ${resp.status} from /v1/chat/completions`);
  }
  const result = await resp.json();
  const elapsed = (Date.now() - start) / 1000;
  const usage = result.usage || {};
  const ct = usage.completion_tokens || 0;
  return {
    tps: elapsed > 0 ? ct / elapsed : 0,
    completionTokens: ct,
    promptTokens: usage.prompt_tokens || 0,
    elapsed,
  };
}
/**
 * Benchmark one configuration of one model.
 *
 * Restarts the server with `params`, waits for it to become healthy, sends
 * one short warm-up request, then times BENCHMARK_RUNS requests and kills
 * the server.
 *
 * @param {object} model entry from MODELS.
 * @param {string} label short description used in logs and in the result.
 * @param {object} params configuration accepted by buildCmd.
 * @returns {Promise<object|null>} the result record (also pushed onto
 *   ALL_RESULTS), or null if the server did not start or every run failed.
 */
async function testConfig(model, label, params) {
  await killServer();
  log(` [${label}] Starting server...`);
  const server = startServer(model.path, params);
  const { ok, bootTime } = await waitForServer();
  if (!ok) {
    log(` [${label}] FAILED to start (timeout ${SERVER_TIMEOUT / 1000}s)`);
    server.kill("SIGKILL");
    return null;
  }

  const vram = getVramAll();
  const vramStr = vram
    .map((g) => `GPU${g.gpu}:${g.used}/${g.total}MiB`)
    .join(" | ");
  log(` [${label}] Boot: ${bootTime.toFixed(0)}s | VRAM: ${vramStr}`);

  // Warmup: outcome is ignored.
  try {
    await runBenchmark(20);
  } catch {}

  // Timed runs; a failed run is logged and skipped.
  const speeds = [];
  let run = 0;
  while (run < BENCHMARK_RUNS) {
    run += 1;
    try {
      const { tps } = await runBenchmark();
      speeds.push(tps);
      log(` Run ${run}: ${tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run ${run}: ERROR (${e.message})`);
    }
  }
  server.kill("SIGKILL");

  if (speeds.length === 0) {
    log(` [${label}] ALL BENCHMARK RUNS FAILED`);
    return null;
  }

  let total = 0;
  for (const s of speeds) total += s;
  const avg = total / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] => AVG: ${avg.toFixed(2)} t/s | BEST: ${best.toFixed(2)} t/s`);

  const result = {
    model: model.name,
    quant: model.quant,
    label,
    avg_tps: +avg.toFixed(2),
    best_tps: +best.toFixed(2),
    boot_time: +bootTime.toFixed(1),
    vram,
    params,
  };
  ALL_RESULTS.push(result);
  return result;
}
// ─── Phase Runners ─────────────────────────────────────────────
/**
 * Phase 0: find any configuration with which the model boots and runs.
 *
 * Tries, in order: -ngl 999, then -ngl 999 with --cpu-moe, then -ngl set
 * to half of the model's layer count. Stops at the first success.
 *
 * @param {object} model entry from MODELS.
 * @returns {Promise<object|null>} the first successful testConfig result,
 *   or null if all three attempts failed.
 */
async function phase0_bootTest(model) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 0: Boot Test — ${model.name}`);
  log(`${rule}`);

  // Settings shared by all three attempts.
  const common = { t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };

  // Try full GPU first
  const fullGpu = await testConfig(model, "boot-ngl999", { ngl: 999, ...common });
  if (fullGpu) {
    return fullGpu;
  }

  // Try with cpu-moe
  log(" Full GPU failed, trying with --cpu-moe...");
  const withCpuMoe = await testConfig(model, "boot-cpumoe", {
    ngl: 999, ...common, cpuMoe: true,
  });
  if (withCpuMoe) {
    return withCpuMoe;
  }

  // Reduced layers
  log(" --cpu-moe also failed, trying reduced layers...");
  const halfLayers = Math.floor(model.totalLayers / 2);
  return await testConfig(model, "boot-ngl-half", { ngl: halfLayers, ...common });
}
/**
 * Phase 1: compare MoE offload strategies at -ngl 999.
 *
 * Tests --cpu-moe on and off (skipping whichever one the baseline already
 * measured), then sweeps --n-cpu-moe over 0, 5, 10, 15, 20.
 *
 * @param {object} model entry from MODELS.
 * @param {object|null} baseline result of phase 0; it competes with the
 *   configurations tested here.
 * @returns {Promise<object|null>} the result with the highest avg_tps, or
 *   null if there is no result at all.
 */
async function phase1_gpuOffload(model, baseline) {
  log(`\n${"=".repeat(70)}`);
  log(` PHASE 1: GPU Offload Strategy — ${model.name}`);
  log(`${"=".repeat(70)}`);
  const results = baseline ? [baseline] : [];
  // The baseline may omit cpuMoe entirely; treat that as false so the plain
  // ngl=999 baseline is recognised and not benchmarked a second time.
  const baselineCpuMoe = baseline?.params?.cpuMoe ?? false;
  const baselineIsFullOffload = baseline?.params?.ngl === 999;
  // Test --cpu-moe on/off
  for (const cpuMoe of [true, false]) {
    const lbl = `ngl=999 cpuMoe=${cpuMoe}`;
    if (baselineIsFullOffload && baselineCpuMoe === cpuMoe) continue;
    const r = await testConfig(model, lbl, {
      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe,
    });
    if (r) results.push(r);
  }
  // n-cpu-moe sweep
  for (const n of [0, 5, 10, 15, 20]) {
    if (n > model.totalLayers) continue;
    const r = await testConfig(model, `n-cpu-moe=${n}`, {
      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", nCpuMoe: n,
    });
    if (r) results.push(r);
  }
  if (results.length === 0) { log(" PHASE 1: No config worked!"); return null; }
  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
  // Separator keeps the label from running into the number.
  log(`\n ★ Phase 1 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
/**
 * Phase 2: sweep the CPU thread count with all other parameters taken from
 * the previous winner.
 *
 * @param {object} model entry from MODELS.
 * @param {object} prev previous winner; its thread count is not re-tested
 *   and it competes with the new results.
 * @returns {Promise<object>} the result with the highest avg_tps.
 */
async function phase2_threads(model, prev) {
  log(`\n${"=".repeat(70)}`);
  log(` PHASE 2: CPU Thread Sweep — ${model.name}`);
  log(`${"=".repeat(70)}`);
  const p = prev.params;
  const results = [prev];
  for (const t of [2, 4, 6, 8, 10, 12]) {
    if (t === p.t) continue;
    const r = await testConfig(model, `t=${t}`, {
      ...p, t,
    });
    if (r) results.push(r);
  }
  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
  // Separator keeps e.g. "t=8" from running into the speed ("t=835.20").
  log(`\n ★ Phase 2 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
/**
 * Phase 3: sweep (-ub, -b) batch-size pairs with all other parameters taken
 * from the previous winner.
 *
 * @param {object} model entry from MODELS.
 * @param {object} prev previous winner; its own pair is not re-tested and
 *   it competes with the new results.
 * @returns {Promise<object>} the result with the highest avg_tps.
 */
async function phase3_batch(model, prev) {
  log(`\n${"=".repeat(70)}`);
  log(` PHASE 3: Batch Size Sweep — ${model.name}`);
  log(`${"=".repeat(70)}`);
  const p = prev.params;
  const results = [prev];
  for (const [ub, b] of [
    [128, 512], [256, 1024], [256, 2048],
    [512, 1024], [512, 2048], [512, 4096],
    [1024, 2048], [1024, 4096],
  ]) {
    if (ub === p.ub && b === p.b) continue;
    const r = await testConfig(model, `ub=${ub} b=${b}`, { ...p, ub, b });
    if (r) results.push(r);
  }
  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
  // Separator keeps e.g. "b=2048" from running into the speed.
  log(`\n ★ Phase 3 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
/**
 * Phase 4: sweep KV-cache type pairs (--cache-type-k / --cache-type-v) with
 * all other parameters taken from the previous winner.
 *
 * @param {object} model entry from MODELS.
 * @param {object} prev previous winner; its own pair is not re-tested and
 *   it competes with the new results.
 * @returns {Promise<object>} the result with the highest avg_tps.
 */
async function phase4_kvcache(model, prev) {
  log(`\n${"=".repeat(70)}`);
  log(` PHASE 4: KV Cache Type Sweep — ${model.name}`);
  log(`${"=".repeat(70)}`);
  const p = prev.params;
  const results = [prev];
  for (const [ctk, ctv] of [
    ["q4_0", "q4_0"], ["q8_0", "q8_0"],
    ["q4_0", "q8_0"], ["f16", "f16"],
  ]) {
    if (ctk === p.ctk && ctv === p.ctv) continue;
    const r = await testConfig(model, `kv=${ctk}/${ctv}`, { ...p, ctk, ctv });
    if (r) results.push(r);
  }
  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
  // Separator keeps e.g. "kv=q8_0/q8_0" from running into the speed.
  log(`\n ★ Phase 4 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
/**
 * Phase 5: re-benchmark the winning configuration with five timed runs.
 *
 * @param {object} model entry from MODELS.
 * @param {object} prev winner of the earlier phases.
 * @returns {Promise<object>} a new "FINAL-…" result record (also pushed
 *   onto ALL_RESULTS), or `prev` unchanged if the server did not start or
 *   every run failed.
 */
async function phase5_final(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 5: Final Verification (5 runs) — ${model.name}`);
  log(`${rule}`);

  await killServer();
  const server = startServer(model.path, prev.params);
  const { ok, bootTime } = await waitForServer();
  if (!ok) {
    log(" FAILED to start!");
    server.kill("SIGKILL");
    return prev;
  }

  const vram = getVramAll();
  // Warmup: outcome is ignored.
  try {
    await runBenchmark(20);
  } catch {}

  const speeds = [];
  for (let run = 1; run <= 5; run++) {
    try {
      const { tps } = await runBenchmark();
      speeds.push(tps);
      log(` Final Run ${run}: ${tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Final Run ${run}: ERROR (${e.message})`);
    }
  }
  server.kill("SIGKILL");

  if (speeds.length === 0) {
    return prev;
  }

  let total = 0;
  for (const s of speeds) total += s;
  const avg = total / speeds.length;
  const best = Math.max(...speeds);
  log(`\n ★ FINAL: AVG ${avg.toFixed(2)} t/s | BEST ${best.toFixed(2)} t/s`);

  const final_ = {
    model: model.name,
    quant: model.quant,
    label: `FINAL-${model.name}`,
    avg_tps: +avg.toFixed(2),
    best_tps: +best.toFixed(2),
    boot_time: +bootTime.toFixed(1),
    vram,
    params: prev.params,
  };
  ALL_RESULTS.push(final_);
  return final_;
}
// ─── Main ──────────────────────────────────────────────────────
/**
 * Run phases 0–5 for a single model.
 *
 * @param {object} model entry from MODELS.
 * @returns {Promise<object|null>} the model's final winner; the phase 0
 *   baseline if phase 1 produced nothing; null if the model file is
 *   missing or the model cannot boot.
 */
async function runModelBenchmark(model) {
  const banner = "#".repeat(70);
  log(`\n${banner}`);
  log(` MODEL: ${model.name}`);
  log(` File: ${model.path}`);
  let sizeLine;
  try {
    const gib = statSync(model.path).size / 1024 ** 3;
    sizeLine = ` Size: ${gib.toFixed(2)} GB`;
  } catch {
    sizeLine = ` Size: unknown`;
  }
  log(sizeLine);
  log(`${banner}`);

  if (!existsSync(model.path)) {
    log(` SKIP: Model file not found!`);
    return null;
  }

  const baseline = await phase0_bootTest(model);
  if (!baseline) {
    log(` SKIP: Cannot boot at 256K!`);
    return null;
  }

  const offloadWinner = await phase1_gpuOffload(model, baseline);
  if (!offloadWinner) {
    return baseline;
  }

  // Each later phase starts from the previous phase's winner.
  let best = offloadWinner;
  for (const phase of [phase2_threads, phase3_batch, phase4_kvcache, phase5_final]) {
    best = await phase(model, best);
  }
  return best;
}
/**
 * Entry point: benchmark every model in MODELS, rank the per-model winners
 * by average tokens/second, print and save a summary with a recommended
 * command line, and save all raw results as JSON.
 */
async function main() {
  const startTime = Date.now();
  const rule70 = "=".repeat(70);
  const rule60 = "=".repeat(60);

  log(rule70);
  log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK");
  log(" 2x RTX 3060 (24GB Total) | 256K Context");
  log(` Models: ${MODELS.length}`);
  log(` Started: ${new Date().toISOString()}`);
  log(rule70);
  for (const g of getVramAll()) {
    log(` GPU ${g.gpu}: ${g.used}/${g.total} MiB used`);
  }

  const winners = [];
  for (const [idx, model] of MODELS.entries()) {
    log(`\n${rule70}`);
    log(` STARTING MODEL ${idx + 1}/${MODELS.length}: ${model.name}`);
    log(`${rule70}`);
    const winner = await runModelBenchmark(model);
    if (winner) {
      winners.push(winner);
    }
    // Save intermediate results after every model.
    writeFileSync("scripts/dual_gpu_results.json",
      JSON.stringify(ALL_RESULTS, null, 2));
    log(` Intermediate saved (${ALL_RESULTS.length} configs tested)`);
  }

  // ─── Grand Final ───────────────────────────────────────────
  const elapsed = (Date.now() - startTime) / 60000;
  log(`\n${rule70}`);
  log(` GRAND FINAL COMPARISON`);
  log(` Total time: ${elapsed.toFixed(1)} minutes`);
  log(` Configs tested: ${ALL_RESULTS.length}`);
  log(`${rule70}`);
  if (winners.length === 0) {
    log(" No models ran at 256K!");
    return;
  }

  // Fastest average first.
  winners.sort((a, b) => b.avg_tps - a.avg_tps);
  const medals = ["🥇", "🥈", "🥉", " "];
  const lines = [];
  lines.push(`Dual-GPU Benchmark Results — ${new Date().toISOString()}`);
  lines.push(`Hardware: 2x RTX 3060 12GB | Context: 256K`);
  lines.push(`Configs tested: ${ALL_RESULTS.length} | Time: ${elapsed.toFixed(1)} min`);
  lines.push("", rule60, " RANKING (by AVG t/s)", rule60);

  winners.forEach((w, i) => {
    const p = w.params;
    lines.push("");
    lines.push(` ${medals[i] || " "} #${i + 1}: ${w.model}`);
    lines.push(` AVG: ${w.avg_tps.toFixed(2)} t/s | BEST: ${w.best_tps.toFixed(2)} t/s`);
    lines.push(` Boot: ${w.boot_time.toFixed(0)}s`);
    lines.push(` ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b}`);
    lines.push(` ctk=${p.ctk} ctv=${p.ctv}`);
    if (p.cpuMoe) {
      lines.push(` --cpu-moe`);
    } else if ((p.nCpuMoe || 0) > 0) {
      lines.push(` --n-cpu-moe ${p.nCpuMoe}`);
    }
  });

  const [champ] = winners;
  const cp = champ.params;
  lines.push("", rule60);
  lines.push(` ★ CHAMPION: ${champ.model}`);
  lines.push(` ${champ.avg_tps.toFixed(2)} t/s average`);
  lines.push(rule60);

  // Build recommended command
  const champModel = MODELS.find((m) => m.name === champ.model);
  const cmdParts = [];
  cmdParts.push(`llama-server --model ${champModel.path}`);
  cmdParts.push(`-ngl ${cp.ngl} -c ${CONTEXT}`);
  cmdParts.push(`-t ${cp.t} -tb ${cp.t}`);
  cmdParts.push(`-ub ${cp.ub} -b ${cp.b}`);
  cmdParts.push(`-fa on`);
  cmdParts.push(`--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`);
  cmdParts.push(`--prio ${cp.prio || 3} --poll 50`);
  cmdParts.push(`--mlock`);
  if (cp.cpuMoe) {
    cmdParts.push("--cpu-moe");
  } else if ((cp.nCpuMoe || 0) > 0) {
    cmdParts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
  }
  if (cp.nommap) {
    cmdParts.push("--no-mmap");
  }
  cmdParts.push("--port 8000 --host 0.0.0.0");
  lines.push("", " Recommended command:");
  lines.push(` ${cmdParts.join(" ")}`);

  const summary = lines.join("\n");
  console.log(summary);
  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json",
    JSON.stringify(ALL_RESULTS, null, 2));
  log(`\n Results: scripts/dual_gpu_results.json`);
  log(` Summary: scripts/dual_gpu_summary.txt`);
  log(` DONE!`);
  await killServer();
}

// Any uncaught failure aborts the run with a non-zero exit code.
main().catch((e) => {
  console.error("Fatal error:", e);
  process.exit(1);
});

View File

@@ -0,0 +1,644 @@
"""
Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
======================================================================
Tests 4 models across multiple parameter configurations to find
the absolute best model + settings for 256K context coding agent.
Models:
1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB)
2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB)
3. Gemma4 26B-A4B Q4_K_M (~15.6 GB)
4. Gemma4 26B-A4B MXFP4_MOE (~15.5 GB)
Test Phases (per model):
Phase 0: Basic dual-GPU startup test (can it even boot at 256K?)
Phase 1: GPU layer + MoE offload strategy sweep
Phase 2: CPU thread sweep (carry best from P1)
Phase 3: Batch size sweep (carry best from P1+P2)
Phase 4: KV cache type sweep (carry best from P1+P2+P3)
Phase 5: Final verification (5 runs)
Output: scripts/dual_gpu_results.json (all raw data)
scripts/dual_gpu_summary.txt (human-readable winner)
"""
import subprocess
import time
import json
import urllib.request
import sys
import os
import datetime
# Best-effort: switch stdout to UTF-8 (presumably so the non-ASCII characters
# in the log output can be printed). Any failure is ignored and the default
# encoding stays in effect.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    pass
# ─── Configuration ───────────────────────────────────────────────
# Base URL used for the /health and /v1/chat/completions requests; the port
# matches the --port value that build_cmd() passes to the server.
BASE_URL = "http://127.0.0.1:8000"
# Server executable, resolved relative to the working directory
# (start_server() launches it with cwd=os.getcwd()).
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
# Context size passed to the server as -c.
CONTEXT = 262144 # 256K
# Number of timed requests per configuration in test_config().
BENCHMARK_RUNS = 3
# Default max_tokens for each run_benchmark() request.
BENCHMARK_TOKENS = 200
SERVER_TIMEOUT = 300 # seconds to wait for server startup
# Models to benchmark, in order. "path" is handed to --model, "total_layers"
# is used to derive the reduced -ngl fallback and to bound the n-cpu-moe
# sweep, and "name" / "quant" are copied into each result record.
MODELS = [
{
"name": "Qwen3.5-35B-A3B Q4_K_M",
"path": r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf",
"type": "qwen",
"quant": "Q4_K_M",
"is_mxfp4": False,
"total_layers": 64, # Qwen3.5 35B has 64 layers
},
{
"name": "Qwen3.5-35B-A3B MXFP4_MOE",
"path": r"models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf",
"type": "qwen",
"quant": "MXFP4_MOE",
"is_mxfp4": True,
"total_layers": 64,
},
{
"name": "Gemma4 26B-A4B Q4_K_M",
"path": r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf",
"type": "gemma4",
"quant": "Q4_K_M",
"is_mxfp4": False,
"total_layers": 30, # Gemma4 26B has 30 layers
},
{
"name": "Gemma4 26B-A4B MXFP4_MOE",
"path": r"models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf",
"type": "gemma4",
"quant": "MXFP4_MOE",
"is_mxfp4": True,
"total_layers": 30,
},
]
# Every result record produced by test_config() and phase5_final(), across
# all models; dumped to scripts/dual_gpu_results.json by main().
ALL_RESULTS = []
# ─── Utility Functions ──────────────────────────────────────────
def log(msg):
    """Print ``msg`` prefixed with the current time (HH:MM:SS), flushing stdout."""
    stamp = datetime.datetime.now().strftime("%H:%M:%S")
    line = f"[{stamp}] {msg}"
    print(line, flush=True)
def kill_server():
    """Force-kill every llama-server.exe process, then pause for 5 seconds.

    Runs the Windows ``taskkill`` utility; its output is captured and
    discarded and its exit status is not checked.
    """
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill_cmd, capture_output=True)
    # Fixed pause before the caller launches the next server (presumably to
    # let the killed process release its resources).
    time.sleep(5)
def get_vram_all():
    """Query nvidia-smi for per-GPU memory usage.

    Returns:
        A list with one dict per GPU row, each of the form
        ``{"gpu": index, "used": ..., "total": ...}`` with int values as
        printed by nvidia-smi (``nounits``). Rows with fewer than three
        fields are skipped. Returns an empty list if nvidia-smi cannot be
        run, times out, or prints a non-numeric field.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        gpus = []
        for line in r.stdout.strip().split("\n"):
            parts = [p.strip() for p in line.split(",")]
            if len(parts) >= 3:
                gpus.append({
                    "gpu": int(parts[0]),
                    "used": int(parts[1]),
                    "total": int(parts[2]),
                })
        return gpus
    except Exception:
        # Best-effort metric: VRAM figures are informational only.
        return []
def build_cmd(model_path, ngl, t, ub, b, ctk, ctv,
              cpu_moe=False, n_cpu_moe=0, prio=3, nommap=False):
    """Build the llama-server argv list for one configuration.

    Args:
        model_path: Value passed to ``--model``.
        ngl: Value for ``-ngl``.
        t: Thread count, used for both ``-t`` and ``-tb``.
        ub: Value for ``-ub``.
        b: Value for ``-b``.
        ctk: Value for ``--cache-type-k``.
        ctv: Value for ``--cache-type-v``.
        cpu_moe: Append ``--cpu-moe`` when true.
        n_cpu_moe: Append ``--n-cpu-moe N`` when positive and ``cpu_moe``
            is false.
        prio: Value for ``--prio``.
        nommap: Append ``--no-mmap`` when true.

    Returns:
        The argument list, starting with the server executable.
    """
    threads = str(t)  # -t and -tb always get the same value
    argv = [LLAMA_SERVER, "--model", model_path]
    argv += ["-ngl", str(ngl)]
    argv += ["-c", str(CONTEXT)]
    argv += ["-np", "1"]
    argv += ["-fa", "on"]
    argv += ["--cache-type-k", ctk]
    argv += ["--cache-type-v", ctv]
    argv += ["-ub", str(ub)]
    argv += ["-b", str(b)]
    argv += ["-t", threads]
    argv += ["-tb", threads]
    argv += ["--prio", str(prio)]
    argv += ["--poll", "50"]
    argv += ["--mlock"]
    argv += ["--port", "8000"]
    argv += ["--host", "0.0.0.0"]

    # MoE offloading options: --cpu-moe takes precedence over --n-cpu-moe.
    if cpu_moe:
        argv.append("--cpu-moe")
    elif n_cpu_moe > 0:
        argv += ["--n-cpu-moe", str(n_cpu_moe)]
    if nommap:
        argv.append("--no-mmap")
    return argv
def start_server(model_path, **kwargs):
    """Launch llama-server and return its ``subprocess.Popen`` handle.

    The server's stdout and stderr are discarded. Nothing in this script
    reads the server log, and an unread pipe can block the child once the
    OS pipe buffer fills up.

    Args:
        model_path: Value passed to ``--model``.
        **kwargs: Remaining ``build_cmd`` arguments.

    Returns:
        The running ``subprocess.Popen`` object; the caller must kill it.
    """
    cmd = build_cmd(model_path, **kwargs)
    log(f" CMD: {' '.join(cmd[-20:])}")  # show last 20 args
    return subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,  # follows stdout, i.e. also discarded
        cwd=os.getcwd(),
    )
def wait_for_server(timeout=SERVER_TIMEOUT):
    """Poll ``/health`` every three seconds until the server reports ready.

    Request and parse failures are treated as "not ready yet".

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        ``(True, boot_seconds)`` once ``/health`` returns JSON whose
        ``"status"`` is ``"ok"``; ``(False, timeout)`` otherwise.
    """
    began = time.time()
    health_url = f"{BASE_URL}/health"
    while True:
        if not (time.time() - began < timeout):
            return False, timeout
        try:
            with urllib.request.urlopen(urllib.request.Request(health_url), timeout=3) as resp:
                health = json.loads(resp.read())
                if health.get("status") == "ok":
                    return True, time.time() - began
        except Exception:
            # Server not reachable yet; retry after the pause below.
            pass
        time.sleep(3)
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Send one chat-completion request and measure its throughput.

    Args:
        max_tokens: Value sent as ``max_tokens`` in the request.

    Returns:
        A dict with ``tps`` (completion tokens divided by the wall-clock
        duration of the whole request), ``completion_tokens``,
        ``prompt_tokens`` and ``elapsed`` (seconds).

    Raises:
        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise on a
        failed request or malformed response.
    """
    request_body = {
        "model": "local-model",
        "messages": [{"role": "user",
                      "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(request_body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    t0 = time.time()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.time() - t0

    usage = result.get("usage", {})
    completion_tokens = usage.get("completion_tokens", 0)
    prompt_tokens = usage.get("prompt_tokens", 0)
    if elapsed > 0:
        tps = completion_tokens / elapsed
    else:
        tps = 0
    return {
        "tps": tps,
        "completion_tokens": completion_tokens,
        "prompt_tokens": prompt_tokens,
        "elapsed": elapsed,
    }
def test_config(model_info, label, **kwargs):
    """Benchmark one configuration of one model.

    Restarts the server with ``kwargs``, waits for it to become healthy,
    sends one short warm-up request, then times ``BENCHMARK_RUNS`` requests
    and kills the server.

    Args:
        model_info: Entry from ``MODELS``.
        label: Short description used in logs and stored in the result.
        **kwargs: ``build_cmd`` arguments; stored in the result as "params".

    Returns:
        The result dict (also appended to ``ALL_RESULTS``), or None if the
        server did not start or every timed run failed.
    """
    kill_server()
    log(f" [{label}] Starting server...")
    server = start_server(model_info["path"], **kwargs)
    started, boot_time = wait_for_server()
    if not started:
        log(f" [{label}] FAILED to start (timeout {SERVER_TIMEOUT}s)")
        server.kill()
        return None

    vram = get_vram_all()
    vram_str = " | ".join(
        [f"GPU{g['gpu']}:{g['used']}/{g['total']}MiB" for g in vram]
    )
    log(f" [{label}] Boot: {boot_time:.0f}s | VRAM: {vram_str}")

    # Warmup: outcome is ignored.
    try:
        run_benchmark(max_tokens=20)
    except Exception:
        pass

    # Timed runs; a failed run is logged and skipped.
    speeds = []
    for run_no in range(1, BENCHMARK_RUNS + 1):
        try:
            tps = run_benchmark()["tps"]
            speeds.append(tps)
            log(f" Run {run_no}: {tps:.2f} t/s")
        except Exception as e:
            log(f" Run {run_no}: ERROR ({e})")
    server.kill()

    if len(speeds) == 0:
        log(f" [{label}] ALL BENCHMARK RUNS FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    log(f" [{label}] => AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
    result = dict(
        model=model_info["name"],
        quant=model_info["quant"],
        label=label,
        avg_tps=round(avg, 2),
        best_tps=round(best, 2),
        boot_time=round(boot_time, 1),
        vram=vram,
        params=kwargs,
    )
    ALL_RESULTS.append(result)
    return result
# ─── Phase Runners ───────────────────────────────────────────────
def phase0_boot_test(model):
    """Phase 0: find any configuration with which the model boots and runs.

    Tries, in order: ``-ngl 999``, then ``-ngl 999`` with ``--cpu-moe``,
    then ``-ngl`` set to half of the model's layer count. Stops at the
    first success.

    Args:
        model: Entry from ``MODELS``.

    Returns:
        The first successful ``test_config`` result, or None if all three
        attempts failed.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 0: Boot Test — {model['name']}")
    log(f"{rule}")

    # Settings shared by all three attempts.
    common = dict(t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0")

    # Try -ngl 999 (all layers to GPU) as baseline
    full_gpu = test_config(model, "boot-ngl999", ngl=999, **common)
    if full_gpu:
        return full_gpu

    # If full GPU fails, try with cpu-moe
    log(" Full GPU failed, trying with --cpu-moe...")
    with_cpu_moe = test_config(model, "boot-cpumoe", ngl=999, **common, cpu_moe=True)
    if with_cpu_moe:
        return with_cpu_moe

    # Extreme fallback: fewer layers
    log(" --cpu-moe also failed, trying reduced layers...")
    half_layers = model["total_layers"] // 2
    return test_config(model, "boot-ngl-half", ngl=half_layers, **common)
def phase1_gpu_offload(model, baseline):
    """Phase 1: compare MoE offload strategies at ``-ngl 999``.

    Tests ``--cpu-moe`` on and off (skipping whichever one the phase 0
    baseline already measured), then sweeps ``--n-cpu-moe`` over
    0, 5, 10, 15, 20.

    Args:
        model: Entry from ``MODELS``.
        baseline: Result of phase 0, or None; it competes with the
            configurations tested here.

    Returns:
        The result with the highest ``avg_tps``, or None if there is no
        result at all.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 1: GPU Offload Strategy — {model['name']}")
    log(f"{'='*70}")
    results = []
    if baseline:
        results.append(baseline)
    total = model["total_layers"]
    # Strategy A: All GPU + cpu-moe variations
    for cpu_moe in [True, False]:
        label = f"ngl=999 cpu_moe={cpu_moe}"
        # Skip if already tested in baseline
        if baseline and baseline["label"] in ["boot-ngl999", "boot-cpumoe"] and \
                baseline["params"].get("cpu_moe", False) == cpu_moe:
            continue
        r = test_config(
            model, label,
            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            cpu_moe=cpu_moe,
        )
        if r:
            results.append(r)
    # Strategy B: n-cpu-moe sweep (selective expert offload)
    for n in [0, 5, 10, 15, 20]:
        if n > total:
            continue
        r = test_config(
            model, f"n-cpu-moe={n}",
            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            n_cpu_moe=n,
        )
        if r:
            results.append(r)
    if not results:
        log(" PHASE 1: No configuration worked!")
        return None
    best = max(results, key=lambda x: x["avg_tps"])
    # Separator keeps e.g. "n-cpu-moe=10" from running into the speed.
    log(f"\n ★ Phase 1 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
def phase2_threads(model, prev_best):
    """Phase 2: sweep the CPU thread count with the offload settings fixed.

    Args:
        model: Entry from ``MODELS``.
        prev_best: Previous winner; its thread count is not re-tested and
            it competes with the new results.

    Returns:
        The result with the highest ``avg_tps``.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 2: CPU Thread Sweep — {model['name']}")
    log(f"{'='*70}")
    p = prev_best["params"]
    results = [prev_best]
    for t in [2, 4, 6, 8, 10, 12]:
        if t == p.get("t", 6):
            continue
        r = test_config(
            model, f"t={t}",
            ngl=p["ngl"], t=t, ub=p["ub"], b=p["b"],
            ctk=p["ctk"], ctv=p["ctv"],
            cpu_moe=p.get("cpu_moe", False),
            n_cpu_moe=p.get("n_cpu_moe", 0),
        )
        if r:
            results.append(r)
    best = max(results, key=lambda x: x["avg_tps"])
    # Separator keeps e.g. "t=8" from running into the speed ("t=835.20").
    log(f"\n ★ Phase 2 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
def phase3_batch(model, prev_best):
    """Phase 3: sweep ``(-ub, -b)`` batch-size pairs.

    All other parameters are taken from the previous winner.

    Args:
        model: Entry from ``MODELS``.
        prev_best: Previous winner; its own pair is not re-tested and it
            competes with the new results.

    Returns:
        The result with the highest ``avg_tps``.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 3: Batch Size Sweep — {model['name']}")
    log(f"{'='*70}")
    p = prev_best["params"]
    best_t = p["t"]
    results = [prev_best]
    for ub, b in [(128, 512), (256, 1024), (256, 2048),
                  (512, 1024), (512, 2048), (512, 4096),
                  (1024, 2048), (1024, 4096)]:
        if ub == p["ub"] and b == p["b"]:
            continue
        r = test_config(
            model, f"ub={ub} b={b}",
            ngl=p["ngl"], t=best_t, ub=ub, b=b,
            ctk=p["ctk"], ctv=p["ctv"],
            cpu_moe=p.get("cpu_moe", False),
            n_cpu_moe=p.get("n_cpu_moe", 0),
        )
        if r:
            results.append(r)
    best = max(results, key=lambda x: x["avg_tps"])
    # Separator keeps e.g. "b=2048" from running into the speed.
    log(f"\n ★ Phase 3 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
def phase4_kvcache(model, prev_best):
    """Phase 4: sweep KV-cache type pairs (``--cache-type-k`` / ``-v``).

    All other parameters are taken from the previous winner.

    Args:
        model: Entry from ``MODELS``.
        prev_best: Previous winner; its own pair is not re-tested and it
            competes with the new results.

    Returns:
        The result with the highest ``avg_tps``.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 4: KV Cache Type Sweep — {model['name']}")
    log(f"{'='*70}")
    p = prev_best["params"]
    results = [prev_best]
    for ctk, ctv in [("q4_0", "q4_0"), ("q8_0", "q8_0"),
                     ("q4_0", "q8_0"), ("f16", "f16")]:
        if ctk == p["ctk"] and ctv == p["ctv"]:
            continue
        r = test_config(
            model, f"kv={ctk}/{ctv}",
            ngl=p["ngl"], t=p["t"], ub=p["ub"], b=p["b"],
            ctk=ctk, ctv=ctv,
            cpu_moe=p.get("cpu_moe", False),
            n_cpu_moe=p.get("n_cpu_moe", 0),
        )
        if r:
            results.append(r)
    best = max(results, key=lambda x: x["avg_tps"])
    # Separator keeps e.g. "kv=q8_0/q8_0" from running into the speed.
    log(f"\n ★ Phase 4 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
def phase5_final(model, prev_best):
    """Phase 5: re-benchmark the winning configuration with five timed runs.

    Args:
        model: Entry from ``MODELS``.
        prev_best: Winner of the earlier phases.

    Returns:
        A new ``FINAL-…`` result dict (also appended to ``ALL_RESULTS``),
        or ``prev_best`` unchanged if the server did not start or every run
        failed.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 5: Final Verification (5 runs) — {model['name']}")
    log(f"{rule}")

    params = prev_best["params"]
    kill_server()
    server = start_server(model["path"], **params)
    started, boot_time = wait_for_server()
    if not started:
        log(" FAILED to start for final verification!")
        server.kill()
        return prev_best

    vram = get_vram_all()
    # Warmup: outcome is ignored.
    try:
        run_benchmark(max_tokens=20)
    except Exception:
        pass

    speeds = []
    for run_no in range(1, 6):
        try:
            tps = run_benchmark()["tps"]
            speeds.append(tps)
            log(f" Final Run {run_no}: {tps:.2f} t/s")
        except Exception as e:
            log(f" Final Run {run_no}: ERROR ({e})")
    server.kill()

    if not speeds:
        return prev_best

    avg = sum(speeds) / len(speeds)
    best_tps = max(speeds)
    log(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best_tps:.2f} t/s")
    final = dict(
        model=model["name"],
        quant=model["quant"],
        label=f"FINAL-{model['name']}",
        avg_tps=round(avg, 2),
        best_tps=round(best_tps, 2),
        boot_time=round(boot_time, 1),
        vram=vram,
        params=params,
    )
    ALL_RESULTS.append(final)
    return final
# ─── Main ────────────────────────────────────────────────────────
def run_full_benchmark_for_model(model):
    """Run phases 0–5 for a single model.

    Args:
        model: Entry from ``MODELS``.

    Returns:
        The model's final winner; the phase 0 baseline if phase 1 produced
        nothing; None if the model file is missing or the model cannot boot.
    """
    model_exists = os.path.exists(model["path"])
    log(f"\n{'#'*70}")
    log(f" MODEL: {model['name']}")
    log(f" File: {model['path']}")
    # Only stat the file when it exists: getsize() raises for a missing file,
    # which would abort the whole run before the SKIP branch below.
    if model_exists:
        log(f" Size: {os.path.getsize(model['path'])/1024**3:.2f} GB")
    else:
        log(" Size: unknown")
    log(f"{'#'*70}")
    # Check model exists
    if not model_exists:
        log(" SKIP: Model file not found!")
        return None
    # Phase 0: Can it boot?
    baseline = phase0_boot_test(model)
    if not baseline:
        log(f" SKIP: {model['name']} cannot boot at 256K context!")
        return None
    # Phase 1: GPU offload strategy
    best = phase1_gpu_offload(model, baseline)
    if not best:
        return baseline
    # Phase 2: CPU threads
    best = phase2_threads(model, best)
    # Phase 3: Batch sizes
    best = phase3_batch(model, best)
    # Phase 4: KV cache
    best = phase4_kvcache(model, best)
    # Phase 5: Final verification
    final = phase5_final(model, best)
    return final
def _save_all_results():
    """Write every result in ALL_RESULTS to scripts/dual_gpu_results.json."""
    with open("scripts/dual_gpu_results.json", "w", encoding="utf-8") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)


def _moe_flag(p):
    """Return the MoE CPU-offload flag implied by params *p*, or None."""
    if p.get("cpu_moe"):
        return "--cpu-moe"
    if p.get("n_cpu_moe", 0) > 0:
        return f"--n-cpu-moe {p['n_cpu_moe']}"
    return None


def _recommended_command(champion):
    """Build the llama-server command line for the winning configuration."""
    p = champion["params"]
    model_path = next(m["path"] for m in MODELS if m["name"] == champion["model"])
    cmd_parts = [
        f"llama-server --model {model_path}",
        f"-ngl {p['ngl']} -c {CONTEXT}",
        f"-t {p['t']} -tb {p['t']}",
        f"-ub {p['ub']} -b {p['b']}",
        "-fa on",
        f"--cache-type-k {p['ctk']} --cache-type-v {p['ctv']}",
        f"--prio {p.get('prio', 3)} --poll 50",
        "--mlock",
    ]
    moe = _moe_flag(p)
    if moe:
        cmd_parts.append(moe)
    if p.get("nommap"):
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")
    return " ".join(cmd_parts)


def _build_summary(model_winners, elapsed):
    """Render ranking, champion and recommended command as report text.

    *model_winners* must be non-empty and already sorted best-first;
    *elapsed* is the total run time in minutes.
    """
    lines = [
        f"Dual-GPU Benchmark Results — {datetime.datetime.now().isoformat()}",
        "Hardware: 2x RTX 3060 12GB | Context: 256K",
        f"Total configs tested: {len(ALL_RESULTS)}",
        f"Total time: {elapsed:.1f} minutes",
        "",
        "=" * 60,
        " RANKING (by AVG t/s)",
        "=" * 60,
    ]
    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    for rank, w in enumerate(model_winners, 1):
        p = w["params"]
        lines.append(f"\n {medals.get(rank, ' ')} #{rank}: {w['model']}")
        lines.append(f" AVG: {w['avg_tps']:.2f} t/s | BEST: {w['best_tps']:.2f} t/s")
        lines.append(f" Boot: {w['boot_time']:.0f}s")
        lines.append(f" ngl={p['ngl']} t={p['t']} ub={p['ub']} b={p['b']}")
        lines.append(f" ctk={p['ctk']} ctv={p['ctv']}")
        moe = _moe_flag(p)
        if moe:
            lines.append(f" {moe}")

    champion = model_winners[0]
    lines.append(f"\n{'='*60}")
    lines.append(f" ★ CHAMPION: {champion['model']}")
    lines.append(f" {champion['avg_tps']:.2f} t/s average")
    lines.append(f"{'='*60}")
    lines.append("\n Recommended command:")
    lines.append(f" {_recommended_command(champion)}")
    return "\n".join(lines)


def main():
    """Benchmark every model in MODELS, rank the winners and write the reports.

    Results are saved to scripts/dual_gpu_results.json after each model, and
    the ranking is written to scripts/dual_gpu_summary.txt at the end. The
    server is killed on every exit path, including the early "no winners"
    return and unexpected exceptions.
    """
    start_time = time.time()
    log("=" * 70)
    log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK")
    log(" 2x RTX 3060 (24GB Total) | 256K Context")
    log(f" Models: {len(MODELS)}")
    log(f" Started: {datetime.datetime.now().isoformat()}")
    log("=" * 70)

    # Show GPU info
    for g in get_vram_all():
        log(f" GPU {g['gpu']}: {g['used']}/{g['total']} MiB used")

    try:
        # Run benchmarks for each model
        model_winners = []
        for i, model in enumerate(MODELS, 1):
            log(f"\n{'='*70}")
            log(f" STARTING MODEL {i}/{len(MODELS)}: {model['name']}")
            log(f"{'='*70}")
            winner = run_full_benchmark_for_model(model)
            if winner:
                model_winners.append(winner)
            # Save after every model so a later crash does not lose the data.
            _save_all_results()
            log(f" Intermediate results saved ({len(ALL_RESULTS)} configs tested)")

        # ─── Grand Final Comparison ──────────────────────────────────
        elapsed = (time.time() - start_time) / 60
        log(f"\n{'='*70}")
        log(" GRAND FINAL COMPARISON")
        log(f" Total time: {elapsed:.1f} minutes")
        log(f" Configs tested: {len(ALL_RESULTS)}")
        log(f"{'='*70}")

        if not model_winners:
            log(" No models were able to run at 256K context!")
            return

        # Sort by avg t/s, best first
        model_winners.sort(key=lambda x: x["avg_tps"], reverse=True)

        summary = _build_summary(model_winners, elapsed)
        print(summary)
        with open("scripts/dual_gpu_summary.txt", "w", encoding="utf-8") as f:
            f.write(summary)
        _save_all_results()
        log("\n Results: scripts/dual_gpu_results.json")
        log(" Summary: scripts/dual_gpu_summary.txt")
        log(" DONE!")
    finally:
        kill_server()


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,330 @@
/**
* Dual-GPU (2x RTX 3060 24GB) Smart Model Benchmark v2
* =====================================================
* Informed by VRAM analysis — tests models in optimal order.
*
* Key insights applied:
* - Gemma4 fits entirely in 24GB GPU (KV cache ~0.18 GB with SWA)
* - Qwen3.5 is tight (~22.5-22.9 GB needed) — try full GPU first
* - Skip configs known to fail, minimize wasted time
*
* Run: node scripts/dual_gpu_benchmark_v2.mjs
* Results: scripts/dual_gpu_results.json + scripts/dual_gpu_summary.txt
*/
import { spawn, execSync } from "child_process";
import { writeFileSync, existsSync, statSync } from "fs";
// Address the benchmark requests are sent to; startServer() launches the server on port 8000.
const BASE_URL = "http://127.0.0.1:8000";
// Server executable, given relative to the directory the script is started from.
const LLAMA = String.raw`llama_bin_run\llama-server.exe`;
// Context size passed to the server via -c (262144 = 256 * 1024).
const CTX = 262144;
// Number of timed benchmark runs per configuration (see testConfig).
const RUNS = 3;
// Default max_tokens requested by bench().
const TOKENS = 200;
// Default time in milliseconds that waitReady() polls /health before giving up.
const BOOT_TIMEOUT = 300_000;
// Models ordered: smallest first (most likely to succeed fully on GPU)
// fitsGPU is true or "maybe"; tuneModel() tries a pure-GPU config first for both values.
const MODELS = [
{
name: "Gemma4-26B MXFP4_MOE",
path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
quant: "MXFP4_MOE",
fitsGPU: true, // 15.5 + 0.18 + 1 = 16.72 GB << 23 GB
},
{
name: "Gemma4-26B Q4_K_M",
path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
quant: "Q4_K_M",
fitsGPU: true, // 15.6 + 0.18 + 1 = 16.82 GB << 23 GB
},
{
name: "Qwen3.5-35B MXFP4_MOE",
path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
quant: "MXFP4_MOE",
fitsGPU: "maybe", // 20.1 + 1.41 + 1 = 22.51 GB — tight
},
{
name: "Qwen3.5-35B Q4_K_M",
path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
quant: "Q4_K_M",
fitsGPU: "maybe", // 20.5 + 1.41 + 1 = 22.91 GB — very tight
},
];
// Every result object produced by testConfig()/tuneModel(), in test order.
const ALL = [];
// Handle of the server process most recently spawned by startServer(); null when none is tracked.
let currentProc = null;
// ─── Utilities ─────────────────────────────────────────────────
// Print a message prefixed with the current local time (ko-KR locale, 24-hour clock).
const log = (m) => console.log(`[${new Date().toLocaleTimeString("ko-KR",{hour12:false})}] ${m}`);
// Return a promise that resolves after `ms` milliseconds.
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
/**
 * Stop the tracked server process (if any), force-kill every remaining
 * llama-server.exe via taskkill, then wait five seconds before returning.
 * Failures of either kill attempt are ignored (best-effort cleanup).
 */
async function kill() {
  const tracked = currentProc;
  currentProc = null;
  if (tracked) {
    try {
      tracked.kill("SIGKILL");
    } catch {
      // ignored: best-effort
    }
  }
  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // ignored: best-effort
  }
  await sleep(5000);
}
/**
 * Query nvidia-smi for per-GPU memory usage.
 * @returns {{gpu: number, used: number, total: number}[]} one entry per output
 *   line, or an empty array when the command fails or times out (5 s).
 */
function vram() {
  const query = 'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits';
  let output;
  try {
    output = execSync(query, { encoding: "utf-8", timeout: 5000 });
  } catch {
    return [];
  }
  const rows = [];
  for (const line of output.trim().split("\n")) {
    const [gpu, used, total] = line.split(",").map(s => parseInt(s));
    rows.push({ gpu: gpu, used: used, total: total });
  }
  return rows;
}
/**
 * Launch llama-server for `modelPath` with the tuning parameters in `p`
 * and remember the child process in `currentProc`.
 *
 * @param {string} modelPath path of the GGUF file passed to --model
 * @param {object} p tuning parameters: ngl, ctk, ctv, ub, b, t, and optionally
 *   prio (default 3), cpuMoe, nCpuMoe, nommap
 * @returns {import("child_process").ChildProcess} the spawned server process
 */
function startServer(modelPath, p) {
  const args = [
    "--model", modelPath, "-ngl", String(p.ngl),
    "-c", String(CTX), "-np", "1", "-fa", "on",
    "--cache-type-k", p.ctk, "--cache-type-v", p.ctv,
    "-ub", String(p.ub), "-b", String(p.b),
    "-t", String(p.t), "-tb", String(p.t),
    "--prio", String(p.prio || 3), "--poll", "50", "--mlock",
    "--port", "8000", "--host", "0.0.0.0",
  ];
  // --cpu-moe takes precedence over a partial --n-cpu-moe offload.
  if (p.cpuMoe) args.push("--cpu-moe");
  else if ((p.nCpuMoe || 0) > 0) args.push("--n-cpu-moe", String(p.nCpuMoe));
  if (p.nommap) args.push("--no-mmap");

  const child = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });
  // A failed spawn (e.g. missing executable) is reported through an 'error'
  // event; without a listener it would crash the whole benchmark run.
  child.on("error", (e) => log(` ✗ llama-server could not be started: ${e.message}`));
  // Nothing reads the server's output. Put both pipes into flowing mode so the
  // data is discarded; otherwise the server blocks once the pipe buffer fills.
  child.stdout?.resume();
  child.stderr?.resume();

  currentProc = child;
  return currentProc;
}
/**
 * Poll the server's /health endpoint until it reports status "ok".
 *
 * Gives up when `timeout` elapses, or as soon as the tracked server process
 * has exited (startup crash), instead of polling a dead process for the
 * whole timeout.
 *
 * @param {number} [timeout=BOOT_TIMEOUT] maximum wait in milliseconds
 * @returns {Promise<{ok: boolean, boot: number}>} `boot` is the elapsed time in
 *   seconds (on timeout: the timeout itself, in seconds)
 */
async function waitReady(timeout = BOOT_TIMEOUT) {
  const t0 = Date.now();
  while (Date.now() - t0 < timeout) {
    // exitCode/signalCode stay null while the child process is still running.
    if (currentProc && (currentProc.exitCode !== null || currentProc.signalCode !== null)) {
      return { ok: false, boot: (Date.now() - t0) / 1000 };
    }
    try {
      const r = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const d = await r.json();
      if (d.status === "ok") return { ok: true, boot: (Date.now() - t0) / 1000 };
    } catch {
      // Not reachable yet (or not answering with JSON): keep polling.
    }
    await sleep(3000);
  }
  return { ok: false, boot: timeout / 1000 };
}
/**
 * Send one chat-completion request and measure end-to-end throughput.
 *
 * The rate is completion tokens divided by the wall-clock time of the whole
 * request, so it includes prompt processing and HTTP overhead.
 *
 * @param {number} [n=TOKENS] max_tokens to request
 * @returns {Promise<{tps: number, ct: number, dt: number}>} tokens per second,
 *   completion token count, elapsed seconds
 * @throws {Error} on network failure, timeout (600 s) or a non-2xx response
 */
async function bench(n = TOKENS) {
  const t0 = Date.now();
  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "m",
      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
      max_tokens: n, temperature: 0,
    }),
    signal: AbortSignal.timeout(600_000),
  });
  // An error response carries no usage data; fail loudly instead of recording
  // it as a 0 t/s measurement that would drag the average down.
  if (!r.ok) throw new Error(`HTTP ${r.status} from /v1/chat/completions`);
  const d = await r.json();
  const dt = (Date.now() - t0) / 1000;
  const ct = d.usage?.completion_tokens || 0;
  return { tps: ct / dt, ct, dt };
}
/**
 * Benchmark one server configuration.
 *
 * Restarts the server with `params`, runs one warmup request and RUNS timed
 * requests, stops the server, and appends the result to ALL.
 *
 * @returns {Promise<object|null>} the result record, or null when the server
 *   did not become ready or every run failed
 */
async function testConfig(model, label, params) {
  await kill();
  log(` [${label}] Starting...`);
  startServer(model.path, params);

  const { ok, boot } = await waitReady();
  if (!ok) {
    log(` [${label}] ✗ FAILED (timeout)`);
    await kill();
    return null;
  }

  const v = vram();
  const vs = v.map(g => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | ");
  log(` [${label}] ✓ Boot:${boot.toFixed(0)}s | VRAM: ${vs}`);

  // warmup: result discarded, failure ignored
  try {
    await bench(20);
  } catch {}

  const speeds = [];
  for (let i = 0; i < RUNS; i++) {
    try {
      const r = await bench();
      speeds.push(r.tps);
      log(` Run${i+1}: ${r.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run${i+1}: ERR ${e.message}`);
    }
  }
  await kill();

  if (speeds.length === 0) {
    log(` [${label}] ✗ ALL RUNS FAILED`);
    return null;
  }

  let total = 0;
  for (const s of speeds) total += s;
  const avg = total / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);

  const res = {
    model: model.name,
    quant: model.quant,
    label,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(best.toFixed(2)),
    boot: Number(boot.toFixed(1)),
    vram: v,
    params,
  };
  ALL.push(res);
  return res;
}
// Write everything collected in ALL so far to scripts/dual_gpu_results.json
// (pretty-printed, overwriting the previous file). Callers invoke this after
// each test so an aborted run still leaves the results gathered up to then.
function saveIntermediate() {
writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL, null, 2));
}
// ─── Smart Phase Runner ────────────────────────────────────────
async function tuneModel(model) {
log(`\n${"#".repeat(65)}`);
log(` ${model.name} (${model.quant})`);
if (!existsSync(model.path)) { log(" ✗ File not found, SKIP"); return null; }
const sz = (statSync(model.path).size / 1024**3).toFixed(2);
log(` Size: ${sz} GB | Fits GPU: ${model.fitsGPU}`);
log(`${"#".repeat(65)}`);
// ── Step 1: Find working GPU config ──
log(`\n ── Step 1: Find optimal GPU offload ──`);
let baseline = null;
if (model.fitsGPU === true || model.fitsGPU === "maybe") {
// Try full GPU, no CPU offload
baseline = await testConfig(model, "ngl=999 pure-GPU", {
ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" });
saveIntermediate();
}
if (!baseline) {
// Try n-cpu-moe values (ascending — find minimum needed)
for (const n of [5, 10, 15, 20]) {
baseline = await testConfig(model, `n-cpu-moe=${n}`, {
ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", nCpuMoe: n });
saveIntermediate();
if (baseline) break; // found minimum working offload
}
}
if (!baseline) {
// Last resort: full cpu-moe
baseline = await testConfig(model, "cpu-moe", {
ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe: true });
saveIntermediate();
}
if (!baseline) { log(`${model.name} cannot boot at 256K!`); return null; }
const bp = baseline.params; // carry forward best params
// If pure GPU worked, also test cpu-moe to compare (it might be faster due to memory)
if (!bp.cpuMoe && !bp.nCpuMoe) {
const alt = await testConfig(model, "compare: cpu-moe", {
...bp, cpuMoe: true });
saveIntermediate();
if (alt && alt.avg_tps > baseline.avg_tps) { baseline = alt; }
}
let best = baseline;
// ── Step 2: Thread sweep ──
log(`\n ── Step 2: Thread sweep ──`);
for (const t of [2, 4, 8, 10, 12]) {
if (t === best.params.t) continue;
const r = await testConfig(model, `t=${t}`, { ...best.params, t });
saveIntermediate();
if (r && r.avg_tps > best.avg_tps) best = r;
}
// ── Step 3: Batch sweep ──
log(`\n ── Step 3: Batch sweep ──`);
for (const [ub, b] of [[256, 1024], [256, 2048], [512, 2048], [512, 4096], [1024, 2048], [1024, 4096]]) {
if (ub === best.params.ub && b === best.params.b) continue;
const r = await testConfig(model, `ub=${ub} b=${b}`, { ...best.params, ub, b });
saveIntermediate();
if (r && r.avg_tps > best.avg_tps) best = r;
}
// ── Step 4: KV cache sweep ──
log(`\n ── Step 4: KV cache type ──`);
for (const [ctk, ctv] of [["q8_0","q8_0"], ["q4_0","q8_0"], ["f16","f16"]]) {
if (ctk === best.params.ctk && ctv === best.params.ctv) continue;
const r = await testConfig(model, `kv=${ctk}/${ctv}`, { ...best.params, ctk, ctv });
saveIntermediate();
if (r && r.avg_tps > best.avg_tps) best = r;
}
// ── Step 5: Final verification (5 runs) ──
log(`\n ── Step 5: Final verification ──`);
await kill();
startServer(model.path, best.params);
const { ok, boot } = await waitReady();
if (!ok) { await kill(); return best; }
const v = vram();
try { await bench(20); } catch {}
const finals = [];
for (let i = 0; i < 5; i++) {
try { const r = await bench(); finals.push(r.tps); log(` Final ${i+1}: ${r.tps.toFixed(2)} t/s`);
} catch (e) { log(` Final ${i+1}: ERR`); }
}
await kill();
if (finals.length > 0) {
const avg = finals.reduce((a,b)=>a+b) / finals.length;
const bst = Math.max(...finals);
log(` ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${bst.toFixed(2)} t/s`);
const final = { model: model.name, quant: model.quant, label: `FINAL`,
avg_tps: +avg.toFixed(2), best_tps: +bst.toFixed(2),
boot: +boot.toFixed(1), vram: v, params: best.params };
ALL.push(final);
saveIntermediate();
return final;
}
return best;
}
// ─── Main ──────────────────────────────────────────────────────
async function main() {
const t0 = Date.now();
log("=" .repeat(65));
log(" DUAL-GPU BENCHMARK v2 — Smart Strategy");
log(" 2x RTX 3060 (24GB) | 256K Context");
log(" " + new Date().toISOString());
log("=".repeat(65));
vram().forEach(g => log(` GPU${g.gpu}: ${g.used}/${g.total} MiB`));
const winners = [];
for (let i = 0; i < MODELS.length; i++) {
log(`\n${"=".repeat(65)}`);
log(` MODEL ${i+1}/${MODELS.length}: ${MODELS[i].name}`);
log("=".repeat(65));
const w = await tuneModel(MODELS[i]);
if (w) winners.push(w);
saveIntermediate();
}
// ─── Summary ──────────────────────────────────────────────
const elapsed = ((Date.now() - t0) / 60000).toFixed(1);
winners.sort((a, b) => b.avg_tps - a.avg_tps);
const medals = ["🥇", "🥈", "🥉", " "];
const lines = [
`Dual-GPU Benchmark v2 — ${new Date().toISOString()}`,
`2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`,
"", "=" .repeat(55), " RANKING", "=".repeat(55),
];
for (let i = 0; i < winners.length; i++) {
const w = winners[i], p = w.params;
lines.push("", ` ${medals[i]||" "} #${i+1}: ${w.model}`);
lines.push(` AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`);
lines.push(` ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b} ctk=${p.ctk} ctv=${p.ctv}`);
if (p.cpuMoe) lines.push(` --cpu-moe`);
else if (p.nCpuMoe) lines.push(` --n-cpu-moe ${p.nCpuMoe}`);
}
if (winners.length > 0) {
const c = winners[0], cp = c.params;
lines.push("", "=".repeat(55), ` ★ CHAMPION: ${c.model}${c.avg_tps} t/s`, "=".repeat(55));
const cmd = [`llama-server --model ${MODELS.find(m=>m.name===c.model).path}`,
`-ngl ${cp.ngl} -c ${CTX} -t ${cp.t} -tb ${cp.t}`,
`-ub ${cp.ub} -b ${cp.b} -fa on`,
`--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
`--prio ${cp.prio||3} --poll 50 --mlock`,
cp.cpuMoe ? "--cpu-moe" : cp.nCpuMoe ? `--n-cpu-moe ${cp.nCpuMoe}` : "",
"--port 8000 --host 0.0.0.0"].filter(Boolean).join(" ");
lines.push("", " Recommended:", ` ${cmd}`);
}
const summary = lines.join("\n");
console.log("\n" + summary);
writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL, null, 2));
log(`\n Saved: dual_gpu_results.json + dual_gpu_summary.txt`);
log(" DONE!");
await kill();
}
main().catch(e => { console.error("FATAL:", e); process.exit(1); });

View File

@@ -0,0 +1,101 @@
import { spawn, exec } from 'child_process';
// Resolve after `ms` milliseconds.
const delay = ms => new Promise(res => setTimeout(res, ms));

/**
 * Force-kill every llama-server.exe via taskkill and resolve two seconds
 * after the command has finished, whether or not it succeeded.
 */
async function killServer() {
  await new Promise(finished => {
    exec('taskkill /F /IM llama-server.exe', () => finished());
  });
  await new Promise(settled => setTimeout(settled, 2000));
}
/**
 * Start llama-server with `contextSize` and report whether it boots.
 *
 * The server gets up to 20 health polls, 2 s apart, to become ready. Startup
 * is abandoned early when stderr mentions an allocation failure or the process
 * exits. On success scripts/quick_pptest.mjs is run and its output printed.
 *
 * @param {string} modelPath model file name inside the models\ directory
 * @param {number} contextSize value passed to -c
 * @returns {Promise<boolean>} true when the server booted
 */
async function testContextSize(modelPath, contextSize) {
  console.log(`\nTesting ${modelPath} with -c ${contextSize}...`);
  await killServer();
  const args = [
    '--model', `models\\${modelPath}`,
    '-ngl', '999',
    '-c', contextSize.toString(),
    '-fa', 'on',
    '--cache-type-k', 'q4_0',
    '--cache-type-v', 'q4_0',
    '-ub', '512',
    '-b', '2048',
    '-t', '6',
    '-tb', '6',
    '--split-mode', 'row',
    '--prio', '3',
    '--fit', 'off',
    '--port', '8000',
    '--host', '0.0.0.0'
  ];
  const server = spawn('llama_bin_run\\llama-server.exe', args, { stdio: 'pipe' });
  let booted = false;
  let oomed = false;
  let exited = false;
  // A failed spawn is delivered as an 'error' event; unhandled it would crash the script.
  server.on('error', (e) => { exited = true; console.log(`spawn error: ${e.message}`); });
  server.on('exit', () => { exited = true; });
  // stdout is piped but unused: discard it so a full pipe cannot stall the server.
  server.stdout?.resume();
  server.stderr?.on('data', (d) => {
    const text = d.toString();
    if (text.toLowerCase().includes('out of memory') || text.includes('failed to allocate')) {
      oomed = true;
    }
  });
  for (let i = 0; i < 20; i++) {
    if (oomed || exited) break;
    try {
      // fetch() has no `timeout` option; an AbortSignal is the supported way to bound the request.
      const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
      if (res.status === 200) {
        booted = true;
        break;
      }
    } catch(e) {
      // not reachable yet: keep polling
    }
    await delay(2000);
  }
  if (oomed || !booted) {
    // Report the failure that was actually observed rather than always blaming memory.
    const reason = oomed ? 'Out of Memory'
      : exited ? 'server exited during startup'
      : 'server did not become ready in time';
    console.log(`❌ Failed: ${reason} at -c ${contextSize}`);
    server.kill('SIGKILL');
    await killServer();
    return false;
  }
  console.log(`✅ Booted! Running Benchmark...`);
  // Benchmark
  const bench = await new Promise(r => exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
    r(stdout || stderr);
  }));
  console.log(bench);
  await killServer();
  return true;
}
/**
 * Try context sizes from 262144 down to 8192 and report the first one at
 * which `modelName` boots; print a failure message when none works.
 */
async function findMaxContext(modelName) {
  const contexts = [262144, 131072, 65536, 32768, 16384, 8192];
  for (const c of contexts) {
    const success = await testContextSize(modelName, c);
    if (!success) continue;
    // Sizes are tried largest first, so the first success is the maximum.
    console.log(`\n🎉 MAX STABLE CONTEXT FOR ${modelName}: ${c}`);
    return;
  }
  console.log(`\n❌ Failed to find any working context size for ${modelName}`);
}
/**
 * Find the largest bootable context size for each model, with all GPUs visible.
 */
async function main() {
  // `exec('set CUDA_VISIBLE_DEVICES=')` only changed the environment of a
  // throwaway shell. Spawned servers inherit process.env, so the variable has
  // to be removed here to take effect.
  delete process.env.CUDA_VISIBLE_DEVICES;
  console.log("============= QWEN 27B Q4_K_M =============");
  await findMaxContext('Qwen3.5-27B-Q4_K_M.gguf');
  console.log("\n============= GEMMA 4 31B Q4_K_M =============");
  await findMaxContext('gemma-4-31B-it-Q4_K_M.gguf');
}
// Report failures instead of leaving an unhandled promise rejection.
main().catch(e => { console.error("FATAL:", e); process.exit(1); });

View File

@@ -0,0 +1,345 @@
/**
* Qwen3.5 Full-GPU Challenge — VRAM 극한 최적화 벤치마크
* =====================================================
* 목표: Qwen3.5-35B-A3B를 24GB 듀얼 3060에 100% GPU로 올리기
*
* 테스트 모델:
* 1. UD-IQ4_NL (16.6 GB) — 확실히 올라감, 기준선
* 2. MXFP4_MOE (20.1 GB) — 도전! VRAM 극한 최적화
* 3. Q4_K_M (20.5 GB) — 대조군 (n-cpu-moe=5)
*
* VRAM 절감 전략:
* A. 배치 최소화: -ub 64 -b 256 (computation buffer 축소)
* B. split-mode row (GPU간 더 균등한 분배)
* C. tensor-split 수동 밸런싱
* D. no-mmap (메모리 관리 최적화)
* E. defrag-thold (KV 캐시 파편화 방지)
*
* Run: node scripts/qwen_fullgpu_challenge.mjs
*/
import { spawn, execSync } from "child_process";
import { writeFileSync, existsSync, statSync } from "fs";
// Address the benchmark requests are sent to; startServer() launches the server on port 8000.
const BASE_URL = "http://127.0.0.1:8000";
// Server executable, given relative to the directory the script is started from.
const LLAMA = String.raw`llama_bin_run\llama-server.exe`;
// Context size passed to the server via -c (262144 = 256 * 1024).
const CTX = 262144;
// Number of timed benchmark runs per configuration (see testConfig).
const RUNS = 3;
// Default max_tokens requested by bench().
const TOKENS = 200;
// Default time in milliseconds that waitReady() polls /health before giving up.
const BOOT_TIMEOUT = 300_000;
// Models under test. sizeGB is only used for the log header in testModel().
const MODELS = [
{
name: "Qwen3.5 UD-IQ4_NL",
path: String.raw`models\Qwen3.5-35B-A3B-UD-IQ4_NL.gguf`,
sizeGB: 16.6,
},
{
name: "Qwen3.5 MXFP4_MOE",
path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
sizeGB: 20.11,
},
{
name: "Qwen3.5 Q4_K_M",
path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
sizeGB: 20.5,
},
];
// Every result object produced by testConfig()/testModel(), in test order.
const ALL = [];
// Handle of the server process most recently spawned by startServer(); null when none is tracked.
let proc = null;
// Print a message prefixed with the current local time (ko-KR locale, 24-hour clock).
const log = (m) => console.log(`[${new Date().toLocaleTimeString("ko-KR",{hour12:false})}] ${m}`);
// Return a promise that resolves after `ms` milliseconds.
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
/**
 * Stop the tracked server process (if any), force-kill every remaining
 * llama-server.exe via taskkill, then wait five seconds before returning.
 * Failures of either kill attempt are ignored (best-effort cleanup).
 */
async function kill() {
  const tracked = proc;
  proc = null;
  if (tracked) {
    try {
      tracked.kill("SIGKILL");
    } catch {
      // ignored: best-effort
    }
  }
  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // ignored: best-effort
  }
  await sleep(5000);
}
/**
 * Query nvidia-smi for per-GPU memory usage.
 * @returns {{gpu: number, used: number, total: number}[]} one entry per output
 *   line, or an empty array when the command fails or times out (5 s).
 */
function vram() {
  const query = 'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits';
  let output;
  try {
    output = execSync(query, { encoding: "utf-8", timeout: 5000 });
  } catch {
    return [];
  }
  const rows = [];
  for (const line of output.trim().split("\n")) {
    const [gpu, used, total] = line.split(",").map(s => parseInt(s));
    rows.push({ gpu: gpu, used: used, total: total });
  }
  return rows;
}
/**
 * Launch llama-server for `modelPath` (always -ngl 999) with the options in
 * `p` and remember the child process in `proc`.
 *
 * @param {string} modelPath path of the GGUF file passed to --model
 * @param {object} p optional settings: ctk/ctv (default "q4_0"), ub (512),
 *   b (2048), t (4), cpuMoe, nCpuMoe, splitMode, tensorSplit, noMmap,
 *   defragThold, noKvOffload
 * @returns {import("child_process").ChildProcess} the spawned server process
 */
function startServer(modelPath, p) {
  const args = [
    "--model", modelPath, "-ngl", "999",
    "-c", String(CTX), "-np", "1", "-fa", "on",
    "--cache-type-k", p.ctk || "q4_0",
    "--cache-type-v", p.ctv || "q4_0",
    "-ub", String(p.ub || 512), "-b", String(p.b || 2048),
    "-t", String(p.t || 4), "-tb", String(p.t || 4),
    "--prio", "3", "--poll", "50", "--mlock",
    "--port", "8000", "--host", "0.0.0.0",
  ];
  // GPU offload strategy (--cpu-moe takes precedence over --n-cpu-moe)
  if (p.cpuMoe) args.push("--cpu-moe");
  else if (p.nCpuMoe) args.push("--n-cpu-moe", String(p.nCpuMoe));
  // VRAM saving options
  if (p.splitMode) args.push("--split-mode", p.splitMode);
  if (p.tensorSplit) args.push("--tensor-split", p.tensorSplit);
  if (p.noMmap) args.push("--no-mmap");
  if (p.defragThold) args.push("--defrag-thold", String(p.defragThold));
  if (p.noKvOffload) args.push("--no-kv-offload");
  // Log only the tail of the command line: that is where the options that vary end up.
  const cmdStr = args.join(" ");
  log(` CMD: ...${cmdStr.slice(-80)}`);

  const child = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });
  // A failed spawn (e.g. missing executable) is reported through an 'error'
  // event; without a listener it would crash the whole benchmark run.
  child.on("error", (e) => log(` ✗ llama-server could not be started: ${e.message}`));
  // Nothing reads the server's output. Put both pipes into flowing mode so the
  // data is discarded; otherwise the server blocks once the pipe buffer fills.
  child.stdout?.resume();
  child.stderr?.resume();

  proc = child;
  return proc;
}
/**
 * Poll the server's /health endpoint until it reports status "ok".
 *
 * Gives up when `timeout` elapses, or as soon as the tracked server process
 * has exited (startup crash), instead of polling a dead process for the
 * whole timeout.
 *
 * @param {number} [timeout=BOOT_TIMEOUT] maximum wait in milliseconds
 * @returns {Promise<{ok: boolean, boot: number}>} `boot` is the elapsed time in
 *   seconds (on timeout: the timeout itself, in seconds)
 */
async function waitReady(timeout = BOOT_TIMEOUT) {
  const t0 = Date.now();
  while (Date.now() - t0 < timeout) {
    // exitCode/signalCode stay null while the child process is still running.
    if (proc && (proc.exitCode !== null || proc.signalCode !== null)) {
      return { ok: false, boot: (Date.now() - t0) / 1000 };
    }
    try {
      const r = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const d = await r.json();
      if (d.status === "ok") return { ok: true, boot: (Date.now() - t0) / 1000 };
    } catch {
      // Not reachable yet (or not answering with JSON): keep polling.
    }
    await sleep(3000);
  }
  return { ok: false, boot: timeout / 1000 };
}
/**
 * Send one chat-completion request and measure end-to-end throughput.
 *
 * The rate is completion tokens divided by the wall-clock time of the whole
 * request, so it includes prompt processing and HTTP overhead.
 *
 * @param {number} [n=TOKENS] max_tokens to request
 * @returns {Promise<{tps: number, ct: number, dt: number}>} tokens per second,
 *   completion token count, elapsed seconds
 * @throws {Error} on network failure, timeout (600 s) or a non-2xx response
 */
async function bench(n = TOKENS) {
  const t0 = Date.now();
  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "m",
      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
      max_tokens: n, temperature: 0,
    }),
    signal: AbortSignal.timeout(600_000),
  });
  // An error response carries no usage data; fail loudly instead of recording
  // it as a 0 t/s measurement that would drag the average down.
  if (!r.ok) throw new Error(`HTTP ${r.status} from /v1/chat/completions`);
  const d = await r.json();
  const dt = (Date.now() - t0) / 1000;
  const ct = d.usage?.completion_tokens || 0;
  return { tps: ct / dt, ct, dt };
}
/**
 * Benchmark one server configuration.
 *
 * Restarts the server with `params`, runs one warmup request and RUNS timed
 * requests, stops the server, appends the result to ALL and rewrites
 * scripts/qwen_fullgpu_results.json.
 *
 * @returns {Promise<object|null>} the result record, or null when the server
 *   did not become ready or every run failed
 */
async function testConfig(model, label, params) {
  await kill();
  log(` [${label}] Starting...`);
  startServer(model.path, params);

  const { ok, boot } = await waitReady();
  if (!ok) {
    log(` [${label}] ✗ FAILED (timeout ${BOOT_TIMEOUT/1000}s)`);
    await kill();
    return null;
  }

  const v = vram();
  let totalUsed = 0;
  for (const g of v) totalUsed += g.used;
  const vs = v.map(g => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | ");
  log(` [${label}] ✓ Boot:${boot.toFixed(0)}s | VRAM: ${vs} (total: ${totalUsed} MiB)`);

  // warmup: result discarded, failure ignored
  try {
    await bench(20);
  } catch {}

  const speeds = [];
  for (let i = 0; i < RUNS; i++) {
    try {
      const r = await bench();
      speeds.push(r.tps);
      log(` Run${i+1}: ${r.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run${i+1}: ERR ${e.message}`);
    }
  }
  await kill();
  if (speeds.length === 0) return null;

  let sum = 0;
  for (const s of speeds) sum += s;
  const avg = sum / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);

  // Store the effective settings, including the defaults startServer applied.
  const effectiveParams = {
    ...params,
    ngl: 999,
    ctk: params.ctk||"q4_0",
    ctv: params.ctv||"q4_0",
  };
  const usesCpuOffload = Boolean(params.cpuMoe || params.nCpuMoe);
  const res = {
    model: model.name,
    label,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(best.toFixed(2)),
    boot: Number(boot.toFixed(1)),
    vram_total: totalUsed,
    vram: v,
    params: effectiveParams,
    gpu_only: !usesCpuOffload,
  };
  ALL.push(res);
  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
  return res;
}
// ─── Test Strategies ───────────────────────────────────────────
// Run every VRAM-saving strategy for one model, optionally fine-tune the best
// pure-GPU result, then re-measure the winner with 5 runs.
// Returns the FINAL result, the best earlier result when the final
// verification yields no measurement, or null when the file is missing or no
// strategy worked.
async function testModel(model) {
log(`\n${"#".repeat(65)}`);
log(` ${model.name} (${model.sizeGB} GB)`);
if (!existsSync(model.path)) { log(" ✗ File not found!"); return null; }
log(`${"#".repeat(65)}`);
let best = null;
// Keep `r` as the new best when it exists and beats the current best's average.
const update = (r) => { if (r && (!best || r.avg_tps > best.avg_tps)) best = r; };
// ── Strategy 1: Pure GPU, default settings ──
log(`\n ── Strategy 1: Pure GPU (default) ──`);
update(await testConfig(model, "pure-GPU default", {
t: 4, ub: 512, b: 2048
}));
// ── Strategy 2: Pure GPU, minimal batch (VRAM saver) ──
log(`\n ── Strategy 2: Pure GPU, minimal batch ──`);
update(await testConfig(model, "pure-GPU minbatch", {
t: 4, ub: 64, b: 256
}));
// ── Strategy 3: Pure GPU, small batch + no-mmap ──
log(`\n ── Strategy 3: Pure GPU + no-mmap + small batch ──`);
update(await testConfig(model, "pure-GPU nommap small", {
t: 4, ub: 128, b: 512, noMmap: true
}));
// ── Strategy 4: Pure GPU, split-mode row ──
log(`\n ── Strategy 4: Pure GPU + split-mode row ──`);
update(await testConfig(model, "pure-GPU row-split", {
t: 4, ub: 128, b: 512, splitMode: "row"
}));
// ── Strategy 5: Pure GPU, tensor-split manual balance ──
log(`\n ── Strategy 5: Pure GPU + tensor-split 0.5,0.5 ──`);
update(await testConfig(model, "pure-GPU ts=0.5,0.5", {
t: 4, ub: 128, b: 512, tensorSplit: "0.5,0.5"
}));
// ── Strategy 6: Pure GPU, defrag + all tricks ──
log(`\n ── Strategy 6: Pure GPU ALL tricks ──`);
update(await testConfig(model, "pure-GPU all-tricks", {
t: 4, ub: 64, b: 256, noMmap: true, defragThold: 0.1
}));
// ── Fallback: n-cpu-moe=5 baseline ──
// Only reached when no pure-GPU strategy produced a result.
if (!best || !best.gpu_only) {
log(`\n ── Fallback: n-cpu-moe=5 ──`);
update(await testConfig(model, "n-cpu-moe=5 baseline", {
t: 4, ub: 256, b: 1024, nCpuMoe: 5
}));
}
// ── If pure GPU worked, tune batch/thread/kv ──
if (best && best.gpu_only) {
log(`\n ── Pure GPU succeeded! Fine-tuning... ──`);
// All tuning variants below are derived from this one parameter set; it is
// not refreshed when a variant becomes the new best.
const bp = best.params;
// Thread sweep
for (const t of [2, 6, 8]) {
if (t === bp.t) continue;
update(await testConfig(model, `tune t=${t}`, { ...bp, t }));
}
// Batch sweep
for (const [ub, b] of [[256, 1024], [512, 2048], [256, 2048]]) {
if (ub === bp.ub && b === bp.b) continue;
update(await testConfig(model, `tune ub=${ub} b=${b}`, { ...bp, ub, b }));
}
// KV cache upgrade (extra VRAM available?)
for (const [ctk, ctv] of [["q8_0","q8_0"], ["f16","f16"]]) {
update(await testConfig(model, `tune kv=${ctk}/${ctv}`, { ...bp, ctk, ctv }));
}
}
// ── Final verification ──
// Restart the server with the winning parameters and measure five more times.
if (best) {
log(`\n ── Final verification (5 runs) ──`);
await kill();
startServer(model.path, best.params);
const { ok, boot } = await waitReady();
if (ok) {
const v = vram();
// Warmup request; its result and any error are ignored.
try { await bench(20); } catch {}
const finals = [];
for (let i = 0; i < 5; i++) {
try { const r = await bench(); finals.push(r.tps); log(` Final ${i+1}: ${r.tps.toFixed(2)} t/s`);
} catch (e) { log(` Final ${i+1}: ERR`); }
}
await kill();
if (finals.length > 0) {
const avg = finals.reduce((a,b)=>a+b) / finals.length;
const bst = Math.max(...finals);
log(` ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${bst.toFixed(2)} t/s`);
const final = { model: model.name, label: "FINAL",
avg_tps: +avg.toFixed(2), best_tps: +bst.toFixed(2),
boot: +boot.toFixed(1), vram_total: v.reduce((a,g)=>a+g.used,0),
vram: v, params: best.params, gpu_only: best.gpu_only };
ALL.push(final);
writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
return final;
}
}
// Server did not come up, or every final run failed: clean up and fall through.
await kill();
}
return best;
}
// ─── Main ──────────────────────────────────────────────────────
async function main() {
const t0 = Date.now();
log("=".repeat(65));
log(" QWEN3.5 FULL-GPU CHALLENGE — 70 t/s TARGET");
log(" 2x RTX 3060 (24GB) | 256K Context");
log(" " + new Date().toISOString());
log("=".repeat(65));
vram().forEach(g => log(` GPU${g.gpu}: ${g.used}/${g.total} MiB`));
const winners = [];
for (const model of MODELS) {
const w = await testModel(model);
if (w) winners.push(w);
}
// ─── Summary ──────────────────────────────────────────────
const elapsed = ((Date.now() - t0) / 60000).toFixed(1);
winners.sort((a, b) => b.avg_tps - a.avg_tps);
const lines = [
`Qwen3.5 Full-GPU Challenge — ${new Date().toISOString()}`,
`2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`,
"", "=".repeat(55), " RANKING", "=".repeat(55),
];
for (let i = 0; i < winners.length; i++) {
const w = winners[i], p = w.params;
const gpu = w.gpu_only ? "★ FULL GPU" : "⚠ CPU offload";
lines.push("", ` #${i+1}: ${w.model} [${gpu}]`);
lines.push(` AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`);
lines.push(` VRAM: ${w.vram_total} MiB total`);
const flags = [];
if (p.splitMode) flags.push(`split=${p.splitMode}`);
if (p.tensorSplit) flags.push(`ts=${p.tensorSplit}`);
if (p.noMmap) flags.push("no-mmap");
if (p.nCpuMoe) flags.push(`n-cpu-moe=${p.nCpuMoe}`);
lines.push(` t=${p.t} ub=${p.ub} b=${p.b} kv=${p.ctk}/${p.ctv} ${flags.join(" ")}`);
}
if (winners.length > 0) {
const c = winners[0];
lines.push("", "=".repeat(55));
lines.push(` ★ CHAMPION: ${c.model}${c.avg_tps} t/s [${c.gpu_only?"FULL GPU":"CPU offload"}]`);
lines.push("=".repeat(55));
}
const summary = lines.join("\n");
console.log("\n" + summary);
writeFileSync("scripts/qwen_fullgpu_summary.txt", summary, "utf-8");
writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
log(`\n Saved: qwen_fullgpu_results.json + qwen_fullgpu_summary.txt`);
log(" DONE!");
await kill();
}
main().catch(e => { console.error("FATAL:", e); process.exit(1); });

View File

@@ -0,0 +1,129 @@
import subprocess, time, urllib.request, json, sys
try: sys.stdout.reconfigure(encoding='utf-8')
except: pass
# NOTE(review): absolute, machine-specific path to the first GGUF shard.
MODEL = "C:/Users/Variet-Worker/Desktop/variet-llm/models/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
# Address used for the health check and the benchmark requests (server runs on port 8000).
BASE = "http://127.0.0.1:8000"
# BEST SO FAR: GPU1 only + Expert CPU + 8t = 8.75 t/s (6.5GB / 12GB used)
# 5.5GB VRAM remaining on GPU 1. Let's use it!
# Strategy: keep some experts on GPU 1 using -ncmoe (n-cpu-moe)
# n-cpu-moe = number of layers whose experts stay on CPU
# Lower = more experts on GPU = more VRAM used = potentially faster
# Arguments shared by every run; each config's "extra" list is appended to them.
BASE_CMD = [
r"llama_bin_run\llama-server.exe",
"--model", MODEL,
"-ngl", "999",
"-sm", "none", "--main-gpu", "1",
"-c", "4096", "-np", "1", "-fa", "on",
"--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
"-ub", "512", "-b", "2048",
"-t", "8", "-tb", "8",
"--prio", "3", "--poll", "50",
"--no-mmap",
"--port", "8000", "--host", "0.0.0.0"
]
# One entry per server configuration: "name" is the report label, "extra" the
# additional command-line arguments for that run.
CONFIGS = [
# Baseline: all experts CPU (confirmed 8.75 t/s)
{"name": "Baseline: all expert CPU", "extra": ["-ot", ".*ffn_.*_exps.*=CPU"]},
# Try n-cpu-moe with GPU1 only: keep some experts on GPU
{"name": "n-cpu-moe=60 (4 layers expert GPU)", "extra": ["-ncmoe", "60"]},
{"name": "n-cpu-moe=56 (8 layers expert GPU)", "extra": ["-ncmoe", "56"]},
{"name": "n-cpu-moe=52 (12 layers expert GPU)", "extra": ["-ncmoe", "52"]},
{"name": "n-cpu-moe=48 (16 layers expert GPU)", "extra": ["-ncmoe", "48"]},
]
def kill():
    """Force-kill every llama-server.exe via taskkill, then pause four seconds.

    The command's output is discarded and its exit status is not checked.
    """
    quiet = subprocess.DEVNULL
    subprocess.run(
        "taskkill /F /IM llama-server.exe",
        shell=True,
        stdout=quiet,
        stderr=quiet,
    )
    time.sleep(4)
def check_server(timeout=900):
    """Poll the server's /health endpoint until it reports ready.

    Args:
        timeout: maximum time to wait, in seconds.

    Returns:
        True as soon as the reported status is "ok" or "ready",
        False when *timeout* elapses first.
    """
    start = time.monotonic()
    while time.monotonic() - start < timeout:
        try:
            req = urllib.request.Request(f"{BASE}/health")
            # Close the response explicitly; the loop may run for minutes.
            with urllib.request.urlopen(req, timeout=2) as resp:
                status = json.loads(resp.read()).get("status")
            if status in ("ok", "ready"):
                return True
        except Exception:
            # Not reachable yet, or an error/non-JSON answer: keep polling.
            # Catching Exception rather than using a bare except keeps
            # Ctrl+C working during this potentially very long wait.
            pass
        time.sleep(5)
    return False
def bench(runs=3):
    """Send *runs* chat-completion requests and measure throughput.

    Each speed is completion tokens divided by the wall-clock time of the whole
    request, so it includes prompt processing and HTTP overhead.

    Args:
        runs: number of requests to send.

    Returns:
        ``(average, best)`` tokens per second; ``(0.0, 0.0)`` when *runs* is
        zero or negative.

    Raises:
        Whatever ``urllib.request.urlopen`` or ``json.loads`` raises when a
        request fails; the caller decides how to handle it.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Write a Python fibonacci function with memoization."}
        ],
        "max_tokens": 200,
        "temperature": 0.0
    }).encode('utf-8')

    speeds = []
    for i in range(runs):
        req = urllib.request.Request(
            f"{BASE}/v1/chat/completions",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        t0 = time.time()
        # Context manager closes the connection after every run.
        with urllib.request.urlopen(req, timeout=600) as raw:
            resp = json.loads(raw.read())
        dt = time.time() - t0
        tokens = resp.get("usage", {}).get("completion_tokens", 0)
        speed = tokens / dt if dt > 0 else 0
        speeds.append(speed)
        print(f" Run {i+1}: {speed:.2f} t/s ({tokens} tok / {dt:.1f}s)")

    # Avoid ZeroDivisionError / max() of an empty list when nothing was run.
    if not speeds:
        return 0.0, 0.0
    return sum(speeds) / len(speeds), max(speeds)
def vram():
    """Return the used memory reported by nvidia-smi, one int per output line.

    Returns ``[0, 0]`` when nvidia-smi is missing, fails or prints something
    that is not an integer.
    """
    cmd = ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"]
    try:
        # Argument list instead of a shell string: no shell is needed here.
        out = subprocess.check_output(cmd).decode().strip()
        return [int(x.strip()) for x in out.split('\n')]
    except Exception:
        # Measuring VRAM is best-effort; it must never abort a benchmark run.
        return [0, 0]
# Sweep: start the server once per config, benchmark it, record the outcome.
results = []
for cfg in CONFIGS:
    kill()
    print(f"\n{'='*60}")
    print(f"Testing: {cfg['name']}")
    print(f"{'='*60}")
    cmd = BASE_CMD + cfg["extra"]
    proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    try:
        if not check_server(900):
            print(" FAILED TO BOOT")
            results.append({"name": cfg["name"], "status": "BOOT_FAIL"})
            continue

        print(" Server ready! Warming up...")
        try:
            p = json.dumps({"model":"m","messages":[{"role":"system","content":"Hi"},{"role":"user","content":"Hi"}],"max_tokens":5}).encode()
            warm = urllib.request.Request(f"{BASE}/v1/chat/completions",data=p,headers={"Content-Type":"application/json"})
            with urllib.request.urlopen(warm, timeout=120):
                pass
        except Exception:
            # Warmup is best-effort; the timed runs below decide the outcome.
            pass

        v = vram()
        # Pad so the report also works when fewer than two GPUs are listed.
        gpu0, gpu1 = (v + [0, 0])[:2]
        print(f" VRAM: GPU0={gpu0}MB, GPU1={gpu1}MB, Total={sum(v)}MB")

        try:
            avg, best = bench(runs=3)
        except Exception as e:
            # One failed request must not abort the sweep and lose the results
            # gathered so far; record the failure and move on.
            print(f" BENCHMARK FAILED: {e}")
            results.append({"name": cfg["name"], "status": "BENCH_FAIL"})
            continue

        print(f" >>> AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
        results.append({
            "name": cfg["name"], "avg_tps": round(avg,2), "best_tps": round(best,2),
            "vram_gpu0": gpu0, "vram_gpu1": gpu1, "vram_total": sum(v), "status": "OK"
        })
    finally:
        # Always stop the server, also on Ctrl+C or an unexpected error.
        proc.terminate()
        kill()

print(f"\n\n{'='*60}")
print("FINAL RESULTS - GPU1 Expert Balance (Target: 10+ t/s)")
print(f"{'='*60}")
print(f"{'Config':<48} {'AVG':>6} {'BEST':>6} {'GPU1':>7}")
print("-" * 72)
for r in results:
    if r["status"] == "OK":
        print(f" {r['name']:<46} {r['avg_tps']:>5} {r['best_tps']:>5} {r['vram_gpu1']:>5}MB")
    else:
        print(f" {r['name']:<46} {'FAIL':>5}")
with open("scripts/122b_final_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print("\nSaved to scripts/122b_final_results.json")

View File

@@ -0,0 +1,64 @@
import { exec, spawn } from 'child_process';
/** Return a promise that resolves (with no value) after `ms` milliseconds. */
const delay = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});
/**
 * Restart llama-server with the given command line, wait until its /health
 * endpoint answers 200, run scripts/quick_pptest.mjs and print its output.
 *
 * @param {string} modelArgs Server arguments as one string. It is split on
 *   single spaces, so no individual argument may contain a space.
 * @param {Object} envVars Extra environment variables layered on top of
 *   process.env for the server process.
 * @param {string} name Label used in the log lines.
 * @returns {Promise<{success: boolean}>} `success` is false when the server
 *   never became healthy, true once the benchmark script has finished.
 */
async function runTest(modelArgs, envVars, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Args: ${modelArgs}`);

  // Stop any server left over from a previous run; a taskkill failure
  // (e.g. no such process) is deliberately ignored.
  await new Promise((done) => exec('taskkill /F /IM llama-server.exe', () => done()));
  await delay(2000);

  const env = { ...process.env, ...envVars };
  const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
    detached: true,
    stdio: 'ignore',
    env
  });

  // A ChildProcess 'error' event with no listener is thrown as an uncaught
  // exception; report it as a boot failure instead.
  let spawnFailed = false;
  server.on('error', (err) => {
    spawnFailed = true;
    console.log(`[${name}] spawn error: ${err.message}`);
  });

  // Poll the health endpoint: up to 40 attempts, 3 s apart.
  let ready = false;
  for (let i = 0; i < 40 && !spawnFailed; i++) {
    try {
      // fetch() has no `timeout` option; an abort signal enforces the 2 s
      // limit so a stalled connection cannot hang the loop.
      const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
      if (res.status === 200) {
        ready = true;
        break;
      }
    } catch (e) {
      // Server not reachable yet (or the request timed out): retry.
    }
    await delay(3000);
  }

  if (!ready) {
    console.log(`[${name}] FAILED TO BOOT`);
    exec('taskkill /F /IM llama-server.exe');
    return { success: false };
  }

  console.log(`[${name}] Server Ready! Running benchmark...`);
  // Only the callback-style exec call needs wrapping in a promise; the
  // surrounding function is already async.
  return new Promise((resolve) => {
    exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
      console.log(stdout || stderr);
      exec('taskkill /F /IM llama-server.exe');
      resolve({ success: true });
    });
  });
}
/**
 * Run the 122B dual-GPU sweep: same base command line, `--split-mode layer`,
 * and three `--n-cpu-moe` values tried one after another.
 */
async function main() {
  const baseLineArgs = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 8192 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 256 -b 1024 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // Order matters: 32 first, then the more GPU-heavy 28, then 36.
  // Author's note: 36 was meant as a fallback in case 32/28 run out of
  // memory, but it is always executed.
  const cpuMoeSweep = [32, 28, 36];
  for (const layers of cpuMoeSweep) {
    await runTest(
      `${baseLineArgs} --split-mode layer --n-cpu-moe ${layers}`,
      {},
      `122B: n-cpu-moe ${layers} + sm layer`
    );
  }

  console.log("\nALL TESTS COMPLETED");
}

main();

View File

@@ -0,0 +1,72 @@
import { exec, spawn } from 'child_process';
/** Sleep helper: the returned promise resolves after `ms` milliseconds. */
const delay = (ms) => new Promise((wake) => {
  setTimeout(wake, ms);
});
/**
 * Restart llama-server with the given command line and environment, wait
 * until its /health endpoint answers 200, run scripts/quick_pptest.mjs and
 * print its output.
 *
 * @param {string} modelArgs Server arguments as one string. It is split on
 *   single spaces, so no individual argument may contain a space.
 * @param {Object} envVars Extra environment variables layered on top of
 *   process.env for the server process.
 * @param {string} name Label used in the log lines.
 * @returns {Promise<{success: boolean}>} `success` is false when the server
 *   never became healthy, true once the speed-test script has finished.
 */
async function runTest(modelArgs, envVars, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Env: ${JSON.stringify(envVars)}`);
  console.log(`Args: ${modelArgs}`);

  // Stop any server left over from a previous run; a taskkill failure
  // (e.g. no such process) is deliberately ignored.
  await new Promise((done) => exec('taskkill /F /IM llama-server.exe', () => done()));
  await delay(2000);

  const env = { ...process.env, ...envVars };
  const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
    detached: true,
    stdio: 'ignore',
    env
  });

  // A ChildProcess 'error' event with no listener is thrown as an uncaught
  // exception; report it as a boot failure instead.
  let spawnFailed = false;
  server.on('error', (err) => {
    spawnFailed = true;
    console.log(`[${name}] spawn error: ${err.message}`);
  });

  // Poll the health endpoint: up to 40 attempts, 3 s apart.
  let ready = false;
  for (let i = 0; i < 40 && !spawnFailed; i++) {
    try {
      // fetch() has no `timeout` option; an abort signal enforces the 2 s
      // limit so a stalled connection cannot hang the loop.
      const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
      if (res.status === 200) {
        ready = true;
        break;
      }
    } catch (e) {
      // Server not reachable yet (or the request timed out): retry.
    }
    await delay(3000);
  }

  if (!ready) {
    console.log(`[${name}] FAILED TO BOOT`);
    exec('taskkill /F /IM llama-server.exe');
    return { success: false };
  }

  console.log(`[${name}] Server Ready! Running speed test...`);
  // Only the callback-style exec call needs wrapping in a promise; the
  // surrounding function is already async.
  return new Promise((resolve) => {
    exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
      console.log(stdout || stderr);
      exec('taskkill /F /IM llama-server.exe');
      resolve({ success: true });
    });
  });
}
/**
 * Run three experiments in sequence: the 122B model with no --n-cpu-moe,
 * then the 35B model with GGML_CUDA_FORCE_MMQ=1 at 6 and at 12 threads.
 */
async function main() {
  // 122B-A10B with every layer requested on the GPU (-ngl 999, no
  // --n-cpu-moe). Author's expectation: on Windows the NVIDIA driver falls
  // back to shared memory for what does not fit in VRAM.
  const args122B = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // 35B-A3B full-GPU configuration from an earlier run (author reports
  // roughly 64 t/s; the goal is 70 t/s).
  const args35B = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 128 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // Same 35B command line with the thread counts doubled.
  const args35B_t12 = args35B.replace("-t 6 -tb 6", "-t 12 -tb 12");

  // [server args, extra env, label], executed top to bottom.
  const plan = [
    [args122B, {}, "122B-A10B: Pure GPU (NVIDIA Shared Memory Fallback)"],
    [args35B, { GGML_CUDA_FORCE_MMQ: "1" }, "35B-A3B: Force MMQ = 1"],
    [args35B_t12, { GGML_CUDA_FORCE_MMQ: "1" }, "35B-A3B: Threads 12 + MMQ"],
  ];
  for (const [serverArgs, extraEnv, label] of plan) {
    await runTest(serverArgs, extraEnv, label);
  }

  console.log("\nALL TESTS COMPLETED");
}

main();

View File

@@ -0,0 +1,84 @@
import { exec, spawn } from 'child_process';
/** Promise-based pause: resolves once `ms` milliseconds have elapsed. */
const delay = (ms) => new Promise((finish) => {
  setTimeout(finish, ms);
});
/**
 * Restart llama-server with the given command line, wait until its /health
 * endpoint answers 200, run scripts/quick_pptest.mjs, print its output and
 * extract two throughput figures from it.
 *
 * @param {string} modelArgs Server arguments as one string. It is split on
 *   single spaces, so no individual argument may contain a space.
 * @param {string} name Label used in the log lines.
 * @returns {Promise<{success: boolean, tg?: number, pp?: number}>}
 *   `{ success: false }` when the server never became healthy; otherwise
 *   `success: true` with `tg` (parsed from the TG-500 line) and `pp` (parsed
 *   from the 10K-CODE line), each 0 when the line is not found.
 */
async function runTest(modelArgs, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Args: ${modelArgs}`);

  // Stop any server left over from a previous run; a taskkill failure
  // (e.g. no such process) is deliberately ignored.
  await new Promise((done) => exec('taskkill /F /IM llama-server.exe', () => done()));
  await delay(2000);

  const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
    detached: true,
    stdio: 'ignore'
  });

  // A ChildProcess 'error' event with no listener is thrown as an uncaught
  // exception; report it as a boot failure instead.
  let spawnFailed = false;
  server.on('error', (err) => {
    spawnFailed = true;
    console.log(`[${name}] spawn error: ${err.message}`);
  });

  // Poll the health endpoint: up to 40 attempts, 3 s apart.
  let ready = false;
  for (let i = 0; i < 40 && !spawnFailed; i++) {
    try {
      // fetch() has no `timeout` option; an abort signal enforces the 2 s
      // limit so a stalled connection cannot hang the loop.
      const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
      if (res.status === 200) {
        ready = true;
        break;
      }
    } catch (e) {
      // Server not reachable yet (or the request timed out): retry.
    }
    await delay(3000);
  }

  if (!ready) {
    console.log(`[${name}] FAILED TO BOOT (Likely OOM)`);
    exec('taskkill /F /IM llama-server.exe');
    return { success: false };
  }

  console.log(`[${name}] Server Ready! Running benchmark...`);
  // Only the callback-style exec call needs wrapping in a promise; the
  // surrounding function is already async.
  return new Promise((resolve) => {
    exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
      console.log(stdout || stderr);
      // Token-generation speed comes from the TG-500 line, prompt-processing
      // speed from the 10K-CODE line.
      const tgMatch = stdout.match(/TG-500 \| PP:\d+tok \d+\.\dt\/s \| TG:\d+tok (\d+\.\d+)t\/s/);
      const ppMatch = stdout.match(/10K-CODE \| PP:\d+tok (\d+\.\d+)t\/s/);
      const tg = tgMatch ? parseFloat(tgMatch[1]) : 0;
      const pp = ppMatch ? parseFloat(ppMatch[1]) : 0;
      exec('taskkill /F /IM llama-server.exe');
      resolve({ success: true, tg, pp });
    });
  });
}
/**
 * Sweep --n-cpu-moe for two models, one server start per value:
 * the 35B model with -ub 512 at 1, 2 and 4, then the 122B model at 38, 30
 * and 22. The values returned by runTest are not used.
 */
async function main() {
  // 35B: the target is 70 t/s; the idea is that moving a few MoE layers to
  // the CPU frees enough memory to run with -ub 512.
  const args35B_base = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
  for (const moe of [1, 2, 4]) {
    await runTest(`${args35B_base} -ub 512 --n-cpu-moe ${moe}`, `Qwen-35B: moe=${moe}, ub=512`);
  }

  // 122B: look for the best --n-cpu-moe. Author's estimate: with 48 there
  // is about 16GB free, and each layer costs roughly 1.5GB in total
  // (~0.75GB per GPU).
  const args122B_base = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
  for (const moe of [38, 30, 22]) {
    await runTest(`${args122B_base} --n-cpu-moe ${moe}`, `Qwen-122B: moe=${moe}`);
  }

  console.log("Tuning finished.");
}

main();

View File

@@ -0,0 +1,107 @@
import subprocess, time, urllib.request, json, sys
# Switch console output to UTF-8 when the stream supports it. Only the
# "stream has no reconfigure()" case is ignored, so Ctrl-C and unrelated
# errors are no longer swallowed.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass
# First shard of the Qwen3.5 122B-A10B Q4_K_M GGUF (absolute path on the
# benchmark machine).
MODEL = "C:/Users/Variet-Worker/Desktop/variet-llm/models/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
# Base URL of the llama-server instance started below (same port as --port).
BASE = "http://127.0.0.1:8000"

# Goal: check whether lowering n-cpu-moe raises VRAM use on GPU 1 and
# improves speed. Author's note: GPU 1 is isolated and has ~3.5GB free with
# a 256K context.
BASE_CMD = [
    r"llama_bin_run\llama-server.exe",
    "--model", MODEL,
    "-ngl", "999",
    "-sm", "none",
    "--main-gpu", "1",
    # Small context so the server boots quickly during testing.
    "-c", "4096",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "512",
    "-b", "2048",
    "-t", "8",
    "-tb", "8",
    "--prio", "3",
    "--poll", "50",
    "--no-mmap",
    "--port", "8000",
    "--host", "0.0.0.0",
]

# One entry per run: display name plus the extra arguments appended to
# BASE_CMD. Only the -ncmoe value differs between runs.
CONFIGS = [
    {"name": label, "extra": ["-ncmoe", str(layers)]}
    for layers, label in (
        (48, "n-cpu-moe=48 (baseline)"),
        (40, "n-cpu-moe=40"),
        (32, "n-cpu-moe=32"),
        (24, "n-cpu-moe=24"),
    )
]
def kill():
    """Force-kill every llama-server.exe process, then pause for four seconds.

    The taskkill output is discarded and its exit status is not checked, so
    calling this when no server is running is harmless.
    """
    discard = subprocess.DEVNULL
    subprocess.run(
        "taskkill /F /IM llama-server.exe",
        shell=True,
        stdout=discard,
        stderr=discard,
    )
    time.sleep(4)
def check_server(timeout=900):
    """Poll ``{BASE}/health`` until the server reports it is up.

    Args:
        timeout: Maximum time to keep polling, in seconds. With a value of
            0 or less no request is made.

    Returns:
        True as soon as the endpoint returns a JSON object whose ``status``
        is ``"ok"`` or ``"ready"``; False once ``timeout`` has elapsed.
    """
    # Local import keeps this function self-contained.
    import http.client

    # Monotonic clock: the deadline is unaffected by system clock changes.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE}/health")
            with urllib.request.urlopen(req, timeout=2) as reply:
                body = json.loads(reply.read())
            if isinstance(body, dict) and body.get("status") in ("ok", "ready"):
                return True
        except (OSError, ValueError, http.client.HTTPException):
            # Connection refused, HTTP error status, timeout, or a body that
            # is not valid JSON: the server is not ready yet. Ctrl-C is no
            # longer swallowed.
            pass
        time.sleep(5)
    return False
def bench(runs=2):
    """Measure end-to-end generation speed over ``runs`` chat completions.

    Each run posts the same fixed prompt (``max_tokens`` 100, temperature
    0.0) to ``{BASE}/v1/chat/completions`` and divides the reported
    ``usage.completion_tokens`` by the wall-clock time of the whole request,
    so the figure includes prompt processing and HTTP overhead.

    Args:
        runs: Number of requests to send.

    Returns:
        ``(average, best)`` tokens per second across the runs, or
        ``(0.0, 0.0)`` when ``runs`` is 0 or negative.

    Raises:
        urllib.error.URLError: If a request fails or times out (600 s).
    """
    # The request body is identical for every run, so build it once.
    payload = json.dumps({
        "model": "m",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Write a short Python script."}
        ],
        "max_tokens": 100,
        "temperature": 0.0
    }).encode('utf-8')

    speeds = []
    for _ in range(runs):
        req = urllib.request.Request(
            f"{BASE}/v1/chat/completions",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        t0 = time.perf_counter()
        # Context manager closes the HTTP response after reading it.
        with urllib.request.urlopen(req, timeout=600) as reply:
            resp = json.loads(reply.read())
        dt = time.perf_counter() - t0
        tokens = resp.get("usage", {}).get("completion_tokens", 0)
        speeds.append(tokens / dt if dt > 0 else 0)

    if not speeds:
        # Avoid ZeroDivisionError / max() on an empty list when runs <= 0.
        return 0.0, 0.0
    return sum(speeds) / len(speeds), max(speeds)
def vram():
    """Return the memory in use on each GPU as reported by ``nvidia-smi``.

    Returns:
        A list of integers, one per line printed by
        ``nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits``.
        The list always has at least two entries (missing GPUs read as 0)
        because the caller in this script reads index ``[1]``. ``[0, 0]`` is
        returned when nvidia-smi is unavailable, fails, or prints something
        that is not an integer.
    """
    query = [
        "nvidia-smi",
        "--query-gpu=memory.used",
        "--format=csv,noheader,nounits",
    ]
    try:
        # Argument list instead of a shell string: no shell is needed here.
        out = subprocess.check_output(query).decode().strip()
        used = [int(line.strip()) for line in out.splitlines() if line.strip()]
    except (OSError, subprocess.SubprocessError, ValueError):
        # Best-effort probe: a missing or failing nvidia-smi must not abort
        # the benchmark, but Ctrl-C is no longer swallowed.
        return [0, 0]
    # Pad single-GPU (or empty) readings so v[1] never raises IndexError.
    used.extend([0] * (2 - len(used)))
    return used
# Benchmark every configuration in CONFIGS against a freshly started
# llama-server and print a summary of speed and GPU 1 memory use.
results = []
for cfg in CONFIGS:
    kill()
    print(f"\n{'='*60}\nTesting: {cfg['name']}\n{'='*60}")
    cmd = BASE_CMD + cfg["extra"]
    proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    try:
        if not check_server(300):
            print(f" FAILED TO BOOT (OOM?)")
            results.append({"name": cfg["name"], "status": "BOOT_FAIL"})
            continue
        print(" Server ready! Warming up...")
        # NOTE: only a short pause, no warm-up request is sent, so the first
        # timed run may include cold-start cost.
        time.sleep(2)
        v = vram()
        # Guard the v[1] lookups below against a single-GPU reading.
        v = list(v) + [0] * max(0, 2 - len(v))
        try:
            avg, _best = bench(runs=2)
        except Exception as exc:
            # One failing configuration must not discard the results
            # already collected for the others.
            print(f" BENCH FAILED: {exc}")
            results.append({"name": cfg["name"], "status": "BENCH_FAIL"})
            continue
        print(f" >>> AVG: {avg:.2f} t/s | VRAM GPU1: {v[1]}MB")
        results.append({
            "name": cfg["name"], "avg_tps": round(avg,2),
            "vram_gpu1": v[1], "status": "OK"
        })
    finally:
        # Always stop the server, including on Ctrl-C or an unexpected
        # error, so it does not keep the port and memory occupied.
        proc.terminate()
        kill()

print("\nFINAL RESULTS:")
for r in results:
    if r["status"] == "OK":
        print(f" {r['name']:<25} {r['avg_tps']:>5} t/s | {r['vram_gpu1']:>5}MB")
    elif r["status"] == "BOOT_FAIL":
        print(f" {r['name']:<25} FAIL (OOM)")
    else:
        # Do not label a benchmark-time failure as an out-of-memory boot.
        print(f" {r['name']:<25} FAIL ({r['status']})")