feat: Variet Engine v1.0 + 5-model tuning complete
Phase 01 (LLM Tuning): - Gemma4 26B: 74.65 t/s (fast) - Qwen 35B: 61.62 t/s (balanced) - Gemma4 31B: 16.0 t/s (deep-coder) - Qwen 27B: 16.7 t/s (deep-logic) - Qwen 122B: 8.95 t/s (ultra, GPU 1 only) Phase 02 (API Engine): - FastAPI reverse proxy on port 8000 - /engine/switch hot-swap with 503 protection - config/engine_models.json as single source of truth - Replaced 4 individual .bat files with unified engine File cleanup: - scripts/ 85 files -> 9 + _archive/ - Root .bat files -> _archive/
This commit is contained in:
372
scripts/_archive/tuning/auto_tune_122b.py
Normal file
372
scripts/_archive/tuning/auto_tune_122b.py
Normal file
@@ -0,0 +1,372 @@
|
||||
"""
|
||||
Qwen3.5 122B-A10B 자동 정밀 튜닝 스크립트
|
||||
===========================================
|
||||
각 설정 조합으로 서버를 재시작하고 벤치마크를 자동 수행합니다.
|
||||
서버 로그에서 순수 eval time (t/s)를 파싱하여 정확한 비교 테이블을 출력합니다.
|
||||
|
||||
예상 소요 시간: 약 30-40분 (5개 설정 × ~6-7분/설정)
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import datetime
|
||||
|
||||
# Switch stdout to UTF-8 so the Korean/emoji status lines below can be
# printed; stdout objects without reconfigure() are left untouched.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

# Address the benchmark client talks to; the port matches "--port" below.
BASE_URL = "http://127.0.0.1:8000"
# Relative paths; start_server() launches the process with cwd=os.getcwd().
MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
SERVER_EXE = r"llama_bin_run\llama-server.exe"

# ============================================================
# Configurations to test
# ============================================================
# Common parameters (held constant across every configuration)
COMMON_ARGS = [
    "--model", MODEL_PATH,
    "-ngl", "999",
    "--cpu-moe",
    "-c", "2048",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "256",
    "-b", "1024",
    "--mlock",
    "--port", "8000",
    # NOTE(review): binds on all interfaces although the client only uses
    # 127.0.0.1 - confirm this exposure is intended.
    "--host", "0.0.0.0",
    "--no-warmup",  # warm-up is performed by the benchmark script itself
]

# Variable parameter combinations: "extra" is appended to COMMON_ARGS,
# "name"/"desc" are only used for console and report output.
CONFIGS = [
    {
        "name": "A) --no-mmap -t 8",
        "desc": "서버 권장: mmap 비활성화 (baseline 대비)",
        "extra": ["--no-mmap", "-t", "8", "--prio", "2"],
    },
    {
        "name": "B) --no-mmap -t 6",
        "desc": "스레드 감소 (캐시 경합 회피)",
        "extra": ["--no-mmap", "-t", "6", "--prio", "2"],
    },
    {
        "name": "C) --no-mmap -t 10",
        "desc": "스레드 증가 (RAM 대역폭 포화)",
        "extra": ["--no-mmap", "-t", "10", "--prio", "2"],
    },
    {
        "name": "D) --no-mmap -t 12",
        "desc": "더 많은 스레드",
        "extra": ["--no-mmap", "-t", "12", "--prio", "2"],
    },
    {
        "name": "E) --no-mmap -t 10 --prio 3 --poll 100",
        "desc": "최적 스레드 + 리얼타임 우선순위 + 폴링",
        "extra": ["--no-mmap", "-t", "10", "--prio", "3", "--poll", "100"],
    },
]
|
||||
|
||||
# ============================================================
|
||||
# 유틸리티 함수
|
||||
# ============================================================
|
||||
|
||||
def kill_server():
    """Force-terminate every running llama-server.exe process.

    Runs ``taskkill`` with an argument list (no shell involved) and discards
    its output, so "no such process" is silently ignored. Sleeps 3 seconds
    before returning.
    """
    try:
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except FileNotFoundError:
        # taskkill is unavailable on this host; keep the original
        # best-effort contract of never raising.
        pass
    # Pause before the caller starts the next server instance.
    time.sleep(3)
|
||||
|
||||
def start_server(config, log_path):
    """Start llama-server for one configuration, redirecting output to a file.

    Args:
        config: Entry of CONFIGS; its "extra" list is appended to
            SERVER_EXE + COMMON_ARGS to build the command line.
        log_path: File that receives the server's stdout and stderr
            (truncated on open).

    Returns:
        ``(proc, log_file)``: the ``subprocess.Popen`` handle and the open
        log file object, which the caller is responsible for closing.

    Raises:
        OSError: If the log file cannot be opened or the executable cannot
            be launched.
    """
    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            cwd=os.getcwd(),
        )
    except BaseException:
        # Launch failed: do not leak the log handle, then re-raise unchanged.
        log_file.close()
        raise
    return proc, log_file
|
||||
|
||||
def wait_for_server(timeout=600):
    """Poll the server's /health endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as /health answers with JSON whose "status" is "ok";
        False if that has not happened after ``timeout`` seconds.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
                if data.get("status") == "ok":
                    return True
        except Exception:
            # Any request/parse failure just means "not ready yet": retry.
            # Unlike a bare except, Ctrl-C and SystemExit still propagate.
            pass
        time.sleep(5)
    return False
|
||||
|
||||
def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and time it.

    Args:
        prompt: User message to send.
        max_tokens: Value of the request's "max_tokens" field.

    Returns:
        ``(completion_tokens, elapsed)``: the token count taken from the
        response's "usage" object (0 when absent) and the wall-clock
        duration of the request in seconds.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    t0 = time.time()
    with urllib.request.urlopen(request, timeout=600) as response:
        reply = json.loads(response.read())
    duration = time.time() - t0

    return reply.get("usage", {}).get("completion_tokens", 0), duration
|
||||
|
||||
def parse_eval_times(log_path):
    """Parse the pure generation ("eval time") statistics from a server log.

    Only lines where "eval time" is preceded by nothing but whitespace are
    matched, which keeps "prompt eval time" lines out of the result.

    Args:
        log_path: Path of the server log file.

    Returns:
        One dict per matching line, in file order, with the keys
        "total_ms", "tokens", "ms_per_token" and "tps". An empty list when
        the file cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        return []

    # "eval time = XXXXX.XX ms / NNN tokens (XXX.XX ms per token, XX.XX tokens per second)"
    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    return [
        {
            "total_ms": float(total_ms),
            "tokens": int(tokens),
            "ms_per_token": float(ms_per_token),
            "tps": float(tps),
        }
        for total_ms, tokens, ms_per_token, tps
        in re.findall(pattern, content, re.MULTILINE)
    ]
|
||||
|
||||
def parse_prompt_eval_times(log_path):
    """Parse the "prompt eval time" statistics from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        One dict per matching line, in file order, with the keys
        "total_ms", "tokens", "ms_per_token" and "tps". An empty list when
        the file cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        return []

    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    return [
        {
            "total_ms": float(total_ms),
            "tokens": int(tokens),
            "ms_per_token": float(ms_per_token),
            "tps": float(tps),
        }
        for total_ms, tokens, ms_per_token, tps
        in re.findall(pattern, content, re.MULTILINE)
    ]
|
||||
|
||||
def parse_vram_usage(log_path):
    """Parse the CUDA0 model buffer size from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        The first "CUDA0 model buffer size = N MiB" value formatted as
        ``"<N rounded> MiB"``, or ``"N/A"`` when the file cannot be read or
        contains no such line.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        return "N/A"

    match = re.search(r'CUDA0 model buffer size\s*=\s*([\d.]+)\s*MiB', content)
    if match is None:
        return "N/A"
    return f"{float(match.group(1)):.0f} MiB"
|
||||
|
||||
# ============================================================
|
||||
# 메인 튜닝 루프
|
||||
# ============================================================
|
||||
|
||||
def main():
    """Benchmark every entry of CONFIGS and print a comparison table.

    For each configuration: stop any running server, start a fresh one,
    wait until it is healthy, send one warm-up request plus three benchmark
    prompts, stop the server, then parse throughput figures from its log
    file (``tune_log_<idx>.txt`` in the current directory). Afterwards a
    table is printed against the hard-coded baseline and the raw eval
    speeds are written to ``tune_results_<timestamp>.txt``.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    print("=" * 70)
    print(" Qwen3.5 122B-A10B 자동 정밀 튜닝")
    print(f" 시작 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f" 테스트 설정: {len(CONFIGS)}개")
    print(f" 예상 소요: ~{len(CONFIGS) * 7}분")
    print("=" * 70)
    print()
    print(" 기존 Baseline: mmap on, -t 8, --prio 2 → 10.06 t/s (eval)")
    print()

    # Benchmark prompts; identical for every configuration, so built once.
    prompts = [
        "Write a detailed explanation of how neural networks learn through backpropagation and gradient descent.",
        "Explain the complete process of photosynthesis including light and dark reactions in detail.",
        "Describe the differences between SQL and NoSQL databases with examples and performance characteristics.",
    ]

    # Collected results, one dict per configuration
    all_results = []

    for idx, config in enumerate(CONFIGS):
        config_start = time.time()
        log_path = os.path.join(os.getcwd(), f"tune_log_{idx}.txt")

        print(f"\n{'='*70}")
        print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}")
        print(f" {config['desc']}")
        print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
        print(f"{'='*70}")

        # 1. Stop any existing server
        print(" [1/4] 서버 종료 중...")
        kill_server()

        # 2. Start a new server
        print(f" [2/4] 서버 시작 중... (모델 로딩 ~3-5분)")
        _proc, log_file = start_server(config, log_path)

        # The finally clause guarantees the server is killed and the log
        # handle closed even if a benchmark step raises or is interrupted.
        try:
            # 3. Wait until the server is ready
            if not wait_for_server(timeout=600):
                print(" ❌ 서버 시작 실패! 다음 설정으로 넘어갑니다.")
                all_results.append({
                    "config": config["name"],
                    "status": "FAILED",
                    "eval_tps": [],
                    "prompt_tps": [],
                    "vram": "N/A"
                })
                continue

            load_time = time.time() - config_start
            print(f" [3/4] 서버 준비 완료! (로딩 {load_time:.0f}초)")

            # 4. Run the benchmark (1 warm-up + 3 measured runs)
            print(" [4/4] 벤치마크 실행 중...")

            # Warm-up
            try:
                run_single_benchmark("Say hello.", max_tokens=20)
                print(" 워밍업 완료")
            except Exception as e:
                print(f" 워밍업 실패: {e}")

            # Three measured runs
            for i, prompt in enumerate(prompts):
                try:
                    tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                    approx_tps = tokens / elapsed if elapsed > 0 else 0
                    print(f" Run {i+1}/3: {tokens} tokens, {elapsed:.1f}s, ~{approx_tps:.2f} t/s (approx)")
                except Exception as e:
                    print(f" Run {i+1}/3: ERROR - {e}")

            # Short pause so the log can be flushed before the server is killed
            time.sleep(2)
        finally:
            # Stop the server
            kill_server()
            log_file.close()
        time.sleep(2)

        # Parse the log
        eval_times = parse_eval_times(log_path)
        prompt_times = parse_prompt_eval_times(log_path)
        vram = parse_vram_usage(log_path)

        # Drop the first entry (the warm-up request) when more than one exists
        if len(eval_times) > 1:
            bench_evals = eval_times[1:]
        else:
            bench_evals = eval_times

        if len(prompt_times) > 1:
            bench_prompts = prompt_times[1:]
        else:
            bench_prompts = prompt_times

        eval_speeds = [e["tps"] for e in bench_evals]
        prompt_speeds = [p["tps"] for p in bench_prompts]

        result = {
            "config": config["name"],
            "status": "OK",
            "eval_tps": eval_speeds,
            "prompt_tps": prompt_speeds,
            "vram": vram,
        }
        all_results.append(result)

        config_elapsed = time.time() - config_start
        print(f"\n 완료! 소요: {config_elapsed:.0f}초")

        if eval_speeds:
            avg_eval = sum(eval_speeds) / len(eval_speeds)
            max_eval = max(eval_speeds)
            print(f" 📊 Eval TPS: avg={avg_eval:.2f}, max={max_eval:.2f}")

    # ============================================================
    # Final comparison table
    # ============================================================
    print("\n")
    print("=" * 80)
    print(" 🏆 최종 결과 비교 테이블")
    print("=" * 80)
    print()

    print(f" {'설정':<45} {'Eval t/s':>10} {'최대':>8} {'Prompt t/s':>12} {'VRAM':>12}")
    print(f" {'-'*45} {'-'*10} {'-'*8} {'-'*12} {'-'*12}")

    # Baseline row (hard-coded figures from an earlier run)
    print(f" {'[기준] mmap on, -t 8, --prio 2':<45} {'10.02':>10} {'10.06':>8} {'29.52':>12} {'5392 MiB':>12}")

    best_avg = 0
    best_config = ""

    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f" {r['config']:<45} {'FAILED':>10} {'':>8} {'':>12} {r['vram']:>12}")
            continue

        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0

        if avg_e > best_avg:
            best_avg = avg_e
            best_config = r["config"]

        # Star rows whose average beats the baseline's best run (10.06 t/s).
        marker = " ⭐" if avg_e > 10.06 else ""
        print(f" {r['config']:<45} {avg_e:>10.2f} {max_e:>8.2f} {avg_p:>12.2f} {r['vram']:>12}{marker}")

    print()
    if best_avg > 0:
        # Improvement is measured against the baseline average (10.02 t/s).
        improvement = ((best_avg - 10.02) / 10.02) * 100
        print(f" 🏆 최고 성능: {best_config}")
        print(f" → {best_avg:.2f} t/s (기준 10.02 t/s 대비 {improvement:+.1f}%)")

    print()
    print(f" 완료 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 80)

    # Also save the results to a file
    result_path = os.path.join(os.getcwd(), f"tune_results_{timestamp}.txt")
    with open(result_path, "w", encoding="utf-8") as f:
        f.write("Qwen3.5 122B-A10B Fine Tuning Results\n")
        f.write(f"Date: {timestamp}\n\n")
        for r in all_results:
            f.write(f"{r['config']}: {r['eval_tps']} (VRAM: {r['vram']})\n")
    print(f" 결과 저장: {result_path}")


if __name__ == "__main__":
    main()
|
||||
257
scripts/_archive/tuning/auto_tune_122b_r2.py
Normal file
257
scripts/_archive/tuning/auto_tune_122b_r2.py
Normal file
@@ -0,0 +1,257 @@
|
||||
"""
|
||||
Qwen3.5 122B-A10B 정밀 튜닝 2라운드
|
||||
====================================
|
||||
1라운드 결과: mmap on이 더 빠르고, 스레드 수가 적을수록 빠름
|
||||
→ mmap on 상태에서 스레드 수 4~8 범위를 정밀 탐색
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import datetime
|
||||
|
||||
# Switch stdout to UTF-8 so the Korean/emoji status lines below can be
# printed; stdout objects without reconfigure() are left untouched.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

# Address the benchmark client talks to; the port matches "--port" below.
BASE_URL = "http://127.0.0.1:8000"
# Relative paths; start_server() launches the process with cwd=os.getcwd().
MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
SERVER_EXE = r"llama_bin_run\llama-server.exe"

# Arguments shared by every configuration in this round.
COMMON_ARGS = [
    "--model", MODEL_PATH,
    "-ngl", "999",
    "--cpu-moe",
    "-c", "2048",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "256",
    "-b", "1024",
    "--mlock",
    "--port", "8000",
    # NOTE(review): binds on all interfaces although the client only uses
    # 127.0.0.1 - confirm this exposure is intended.
    "--host", "0.0.0.0",
    "--no-warmup",
]

# Round-2 configurations: none passes --no-mmap; only -t and --prio vary.
# "extra" is appended to COMMON_ARGS, "name"/"desc" are used for output only.
CONFIGS = [
    {
        "name": "F) mmap on, -t 4",
        "desc": "최소 스레드 (4개, 물리코어 절반)",
        "extra": ["-t", "4", "--prio", "2"],
    },
    {
        "name": "G) mmap on, -t 5",
        "desc": "스레드 5개",
        "extra": ["-t", "5", "--prio", "2"],
    },
    {
        "name": "H) mmap on, -t 6",
        "desc": "스레드 6개 (--no-mmap에서 최고였음)",
        "extra": ["-t", "6", "--prio", "2"],
    },
    {
        "name": "I) mmap on, -t 7",
        "desc": "스레드 7개",
        "extra": ["-t", "7", "--prio", "2"],
    },
    {
        "name": "J) mmap on, -t 6, --prio 3",
        "desc": "최적 스레드 + 리얼타임 우선순위",
        "extra": ["-t", "6", "--prio", "3"],
    },
]
|
||||
|
||||
def kill_server():
    """Force-terminate every running llama-server.exe process.

    Runs ``taskkill`` with an argument list (no shell involved) and discards
    its output, so "no such process" is silently ignored. Sleeps 3 seconds
    before returning.
    """
    try:
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except FileNotFoundError:
        # taskkill is unavailable on this host; keep the original
        # best-effort contract of never raising.
        pass
    # Pause before the caller starts the next server instance.
    time.sleep(3)
|
||||
|
||||
def start_server(config, log_path):
    """Start llama-server for one configuration, redirecting output to a file.

    Args:
        config: Entry of CONFIGS; its "extra" list is appended to
            SERVER_EXE + COMMON_ARGS to build the command line.
        log_path: File that receives the server's stdout and stderr
            (truncated on open).

    Returns:
        ``(proc, log_file)``: the ``subprocess.Popen`` handle and the open
        log file object, which the caller is responsible for closing.

    Raises:
        OSError: If the log file cannot be opened or the executable cannot
            be launched.
    """
    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, cwd=os.getcwd())
    except BaseException:
        # Launch failed: do not leak the log handle, then re-raise unchanged.
        log_file.close()
        raise
    return proc, log_file
|
||||
|
||||
def wait_for_server(timeout=600):
    """Poll the server's /health endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as /health answers with JSON whose "status" is "ok";
        False if that has not happened after ``timeout`` seconds.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
                if data.get("status") == "ok":
                    return True
        except Exception:
            # Any request/parse failure just means "not ready yet": retry.
            # Unlike a bare except, Ctrl-C and SystemExit still propagate.
            pass
        time.sleep(5)
    return False
|
||||
|
||||
def run_single_benchmark(prompt, max_tokens=200):
    """Issue a single chat-completion request and measure its duration.

    Args:
        prompt: User message to send.
        max_tokens: Value of the request's "max_tokens" field.

    Returns:
        ``(completion_tokens, elapsed)``: the token count from the
        response's "usage" object (0 when absent) and the request's
        wall-clock duration in seconds.
    """
    request_body = json.dumps(
        {
            "model": "local-model",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
            "temperature": 0.0,
        }
    ).encode("utf-8")
    http_request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=request_body,
        headers={"Content-Type": "application/json"},
    )

    began = time.time()
    with urllib.request.urlopen(http_request, timeout=600) as http_response:
        parsed = json.loads(http_response.read())
    took = time.time() - began

    token_count = parsed.get("usage", {}).get("completion_tokens", 0)
    return token_count, took
|
||||
|
||||
def parse_eval_times(log_path):
    """Parse generation ("eval time") throughput from a server log.

    Only lines where "eval time" is preceded by nothing but whitespace are
    matched, which keeps "prompt eval time" lines out of the result.

    Args:
        log_path: Path of the server log file.

    Returns:
        One ``{"tps": float, "tokens": int}`` dict per matching line, in
        file order. An empty list when the file cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        return []
    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    matches = re.findall(pattern, content, re.MULTILINE)
    return [{"tps": float(m[3]), "tokens": int(m[1])} for m in matches]
|
||||
|
||||
def parse_prompt_eval_times(log_path):
    """Parse "prompt eval time" throughput from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        One ``{"tps": float}`` dict per matching line, in file order. An
        empty list when the file cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        return []
    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    matches = re.findall(pattern, content, re.MULTILINE)
    return [{"tps": float(m[3])} for m in matches]
|
||||
|
||||
def main():
    """Run the round-2 sweep over CONFIGS and print a combined table.

    For each configuration: stop any running server, start a fresh one,
    wait until it is healthy, send one warm-up request plus three benchmark
    prompts, stop the server, then parse throughput figures from its log
    file (``tune_r2_log_<idx>.txt`` in the current directory). The final
    table lists the hard-coded round-1 figures followed by this round's
    measurements.
    """
    print("=" * 70)
    print(" Qwen3.5 122B-A10B 정밀 튜닝 - 2라운드")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f" 테스트: {len(CONFIGS)}개 설정 (mmap on + 스레드 4~7 정밀 탐색)")
    print("=" * 70)
    print()

    # Benchmark prompts; identical for every configuration, so built once.
    prompts = [
        "Write a detailed explanation of how neural networks learn through backpropagation.",
        "Explain the complete process of photosynthesis including light and dark reactions.",
        "Describe the differences between SQL and NoSQL databases with examples.",
    ]

    all_results = []

    for idx, config in enumerate(CONFIGS):
        config_start = time.time()
        log_path = os.path.join(os.getcwd(), f"tune_r2_log_{idx}.txt")

        print(f"\n{'='*70}")
        print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}")
        print(f" {config['desc']}")
        print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
        print(f"{'='*70}")

        kill_server()
        print(f" [1/3] 서버 시작 중...")
        _proc, log_file = start_server(config, log_path)

        # The finally clause guarantees the server is killed and the log
        # handle closed even if a benchmark step raises or is interrupted.
        try:
            if not wait_for_server(timeout=600):
                print(" ❌ 서버 시작 실패!")
                all_results.append({"config": config["name"], "status": "FAILED", "eval_tps": []})
                continue

            load_time = time.time() - config_start
            print(f" [2/3] 서버 준비 완료! ({load_time:.0f}초)")

            # Warm-up + benchmark
            try:
                run_single_benchmark("Say hello.", max_tokens=20)
            except Exception:
                # A failed warm-up is not fatal; the measured runs below
                # report their own errors. Ctrl-C still propagates.
                pass

            print(" [3/3] 벤치마크 3회...")
            for i, prompt in enumerate(prompts):
                try:
                    tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                    print(f" Run {i+1}: {tokens}tok, {elapsed:.1f}s, ~{tokens/elapsed:.2f} t/s")
                except Exception as e:
                    print(f" Run {i+1}: ERROR - {e}")

            # Short pause so the log can be flushed before the server is killed
            time.sleep(2)
        finally:
            kill_server()
            log_file.close()
        time.sleep(2)

        eval_times = parse_eval_times(log_path)
        prompt_times = parse_prompt_eval_times(log_path)
        # Drop the first entry (the warm-up request) when more than one exists
        bench_evals = eval_times[1:] if len(eval_times) > 1 else eval_times
        bench_prompts = prompt_times[1:] if len(prompt_times) > 1 else prompt_times

        eval_speeds = [e["tps"] for e in bench_evals]
        prompt_speeds = [p["tps"] for p in bench_prompts]

        all_results.append({
            "config": config["name"],
            "status": "OK",
            "eval_tps": eval_speeds,
            "prompt_tps": prompt_speeds,
        })

        if eval_speeds:
            print(f" 📊 Eval: avg={sum(eval_speeds)/len(eval_speeds):.2f}, max={max(eval_speeds):.2f}")

    # Final results
    print("\n")
    print("=" * 85)
    print(" 🏆 전체 튜닝 결과 (1라운드 + 2라운드 통합)")
    print("=" * 85)
    print()
    print(f" {'설정':<48} {'Avg t/s':>8} {'Max t/s':>8} {'Prompt':>8}")
    print(f" {'-'*48} {'-'*8} {'-'*8} {'-'*8}")

    # Round-1 results (hard-coded): name, avg t/s, max t/s, prompt t/s
    r1 = [
        ("[기준] mmap on, -t 8, --prio 2", 10.02, 10.06, 29.52),
        ("A) --no-mmap -t 8", 9.66, 9.70, 28.26),
        ("B) --no-mmap -t 6", 10.02, 10.18, 26.73),
        ("C) --no-mmap -t 10", 9.42, 9.46, 27.31),
        ("D) --no-mmap -t 12", 9.04, 9.11, 27.92),
        ("E) --no-mmap -t 10 --prio 3 --poll 100", 9.41, 9.45, 28.37),
    ]
    for name, avg, mx, pp in r1:
        marker = " ⭐" if avg >= 10.0 else ""
        print(f" {name:<48} {avg:>8.2f} {mx:>8.2f} {pp:>8.2f}{marker}")

    print(f" {'--- 2라운드 ---':<48}")

    # Best single-run speed so far; seeded with the baseline's best run and
    # compared against each configuration's maximum (not its average).
    best_speed = 10.06
    best_config = "[기준] mmap on, -t 8"

    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f" {r['config']:<48} {'FAIL':>8}")
            continue
        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
        if max_e > best_speed:
            best_speed = max_e
            best_config = r["config"]
        marker = " ⭐" if avg_e >= 10.0 else ""
        print(f" {r['config']:<48} {avg_e:>8.2f} {max_e:>8.2f} {avg_p:>8.2f}{marker}")

    print()
    print(f" 🏆 최고 성능: {best_config} → {best_speed:.2f} t/s")
    print(f" 완료: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 85)


if __name__ == "__main__":
    main()
|
||||
339
scripts/_archive/tuning/auto_tune_gemma4_256k.py
Normal file
339
scripts/_archive/tuning/auto_tune_gemma4_256k.py
Normal file
@@ -0,0 +1,339 @@
|
||||
"""
|
||||
Gemma4 26B-A4B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
|
||||
Phase 1: -ngl sweep (GPU layers)
|
||||
Phase 2: -t / -tb sweep (CPU threads)
|
||||
Phase 3: -ub / -b sweep (batch sizes)
|
||||
Phase 4: --cache-type-k/v sweep (KV cache precision)
|
||||
Phase 5: --no-mmap, --poll, --prio sweep (misc)
|
||||
Each phase fixes the best from previous phases.
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
import itertools
|
||||
|
||||
# Switch stdout to UTF-8 so non-ASCII status characters (e.g. the star
# printed for phase winners) can be written; stdout objects without
# reconfigure() are left untouched.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

# Address the benchmark client talks to; build_cmd() starts the server on
# the same port.
BASE_URL = "http://127.0.0.1:8000"
# Relative paths; start_server() launches the process with cwd=os.getcwd().
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"
CONTEXT = 262144  # passed as "-c"
BENCHMARK_RUNS = 3  # measured requests per configuration
BENCHMARK_TOKENS = 200  # default "max_tokens" of a measured request

# ─── Baseline (from previous tuning at -c 4096) ───
# Starting configuration; keys map onto the flags assembled in build_cmd().
BEST = {
    "ngl": 22,
    "t": 8,
    "tb": 8,
    "ub": 512,
    "b": 2048,
    "ctk": "q4_0",
    "ctv": "q4_0",
    "fa": "on",
    "mlock": True,
    "mmap": True,
    "prio": 2,
    "poll": 50,
}

# Every successful test_config() result is appended here and dumped to JSON
# at the end of main().
ALL_RESULTS = []
|
||||
|
||||
|
||||
def kill_server():
    """Force-kill all llama-server.exe processes, then pause 4 seconds.

    The output of ``taskkill`` is captured and discarded, and its exit
    status is not checked.
    """
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill_cmd, capture_output=True)
    time.sleep(4)
|
||||
|
||||
|
||||
def build_cmd(cfg):
    """Assemble the llama-server argument list for a configuration dict.

    Args:
        cfg: Mapping with the keys "ngl", "fa", "ctk", "ctv", "ub", "b",
            "t", "tb", "prio", "poll", "mlock" and "mmap".

    Returns:
        The command as a list of arguments, starting with LLAMA_SERVER.
        "--mlock" is appended when cfg["mlock"] is truthy and "--no-mmap"
        when cfg["mmap"] is falsy.
    """
    argv = [LLAMA_SERVER, "--model", MODEL]
    argv += ["-ngl", str(cfg["ngl"]), "-c", str(CONTEXT), "-np", "1"]
    argv += ["-fa", cfg["fa"]]
    argv += ["--cache-type-k", cfg["ctk"], "--cache-type-v", cfg["ctv"]]
    argv += ["-ub", str(cfg["ub"]), "-b", str(cfg["b"])]
    argv += ["-t", str(cfg["t"]), "-tb", str(cfg["tb"])]
    argv += ["--prio", str(cfg["prio"]), "--poll", str(cfg["poll"])]
    argv += ["--port", "8000", "--host", "0.0.0.0"]

    # Optional boolean switches come last.
    if cfg["mlock"]:
        argv.append("--mlock")
    if not cfg["mmap"]:
        argv.append("--no-mmap")
    return argv
|
||||
|
||||
|
||||
def start_server(cfg):
    """Launch llama-server for ``cfg`` and return the process handle.

    The server's stdout/stderr are discarded. They were previously sent to
    a pipe that no caller in this script ever reads; once such a pipe's
    buffer fills, the child blocks on its next write, which would stall the
    server in the middle of a benchmark.

    Args:
        cfg: Configuration dict accepted by build_cmd().

    Returns:
        The ``subprocess.Popen`` handle of the started server.
    """
    return subprocess.Popen(
        build_cmd(cfg),
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
        cwd=os.getcwd(),
    )
|
||||
|
||||
|
||||
def wait_for_server(timeout=180):
    """Poll the server's /health endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as /health answers with JSON whose "status" is "ok";
        False if that has not happened after ``timeout`` seconds.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=3) as resp:
                data = json.loads(resp.read())
                if data.get("status") == "ok":
                    return True
        except Exception:
            # Any request/parse failure just means "not ready yet": retry.
            # Unlike a bare except, Ctrl-C and SystemExit still propagate.
            pass
        time.sleep(2)
    return False
|
||||
|
||||
|
||||
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Run one fixed-prompt completion and return its approximate speed.

    Args:
        max_tokens: Value of the request's "max_tokens" field.

    Returns:
        Completion tokens (from the response's "usage" object, 0 when
        absent) divided by the request's wall-clock seconds; 0 when the
        measured duration is not positive.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    t0 = time.time()
    with urllib.request.urlopen(request, timeout=300) as response:
        reply = json.loads(response.read())
    duration = time.time() - t0

    generated = reply.get("usage", {}).get("completion_tokens", 0)
    if duration > 0:
        return generated / duration
    return 0
|
||||
|
||||
|
||||
def get_vram():
    """Query GPU memory usage through ``nvidia-smi``.

    nvidia-smi prints one CSV row per GPU. Only the first row is used, so
    hosts with several GPUs no longer fall through to the (0, 0) fallback
    because of the embedded newline.

    Returns:
        ``(used, total)`` as integers, in the unit nvidia-smi reports (the
        caller prints them as MiB). ``(0, 0)`` when nvidia-smi is missing,
        times out, or produces output that cannot be parsed.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        first_row = r.stdout.strip().splitlines()[0]
        used, total = first_row.split(",")[:2]
        return int(used.strip()), int(total.strip())
    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
        # Missing binary, timeout, empty output or non-numeric fields.
        return 0, 0
|
||||
|
||||
|
||||
def test_config(cfg, label=""):
    """Benchmark one configuration and record the result.

    Restarts the server with ``cfg``, performs one warm-up request and
    BENCHMARK_RUNS measured requests, then kills the server process.

    Args:
        cfg: Configuration dict accepted by build_cmd().
        label: Short description used in console output and stored in the
            result; defaults to ``str(cfg)`` for display when empty.

    Returns:
        ``cfg`` extended with "avg_tps", "best_tps", "vram_used",
        "vram_total" and "label" (also appended to ALL_RESULTS), or None
        when the server failed to start or every measured run failed.
    """
    kill_server()
    desc = label or str(cfg)
    print(f" [{desc}] Starting server...")
    proc = start_server(cfg)

    speeds = []
    # The finally clause kills the server on every path, including
    # unexpected errors and Ctrl-C.
    try:
        if not wait_for_server():
            print(f" [{desc}] FAILED to start")
            return None

        vram_used, vram_total = get_vram()
        print(f" [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)

        # Warmup (failure is tolerated; the measured runs report errors)
        try:
            run_benchmark(max_tokens=20)
        except Exception:
            pass

        # Benchmark
        for _ in range(BENCHMARK_RUNS):
            try:
                speeds.append(run_benchmark())
            except Exception as e:
                print(f"ERR({e}) ", end="", flush=True)
    finally:
        proc.kill()

    if not speeds:
        print("ALL FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

    result = {**cfg, "avg_tps": avg, "best_tps": best,
              "vram_used": vram_used, "vram_total": vram_total, "label": label}
    ALL_RESULTS.append(result)
    return result
|
||||
|
||||
|
||||
def phase_sweep(phase_name, param_name, values, base_cfg):
    """Benchmark each candidate value of one phase and return the fastest.

    Args:
        phase_name: Title printed in the phase banner.
        param_name: Either a single config key, or a list of keys that are
            swept together.
        values: Candidate values; when ``param_name`` is a list, each
            candidate is a sequence paired positionally with the keys.
        base_cfg: Configuration the candidates are applied on top of; it is
            copied, never modified.

    Returns:
        The test_config() result with the highest "avg_tps", or None when
        no candidate produced a result.
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print(f" PHASE: {phase_name}")
    print(f" Sweeping: {param_name} = {values}")
    print(f"{banner}")

    multi_key = isinstance(param_name, list)
    winner = None
    for candidate in values:
        # Normalise both forms to a list of (key, value) assignments.
        if multi_key:
            assignments = list(zip(param_name, candidate))
        else:
            assignments = [(param_name, candidate)]

        trial_cfg = dict(base_cfg)
        trial_cfg.update(assignments)
        tag = " | ".join(f"{key}={value}" for key, value in assignments)

        outcome = test_config(trial_cfg, tag)
        if not outcome:
            continue
        if winner is None or outcome["avg_tps"] > winner["avg_tps"]:
            winner = outcome

    if winner:
        print(f"\n ★ Phase winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
|
||||
|
||||
|
||||
def main():
    """Run the five tuning phases in order and report the best configuration.

    Each phase sweeps one group of parameters via phase_sweep() on top of
    the winners of the previous phases. Afterwards the final configuration
    is printed, verified with five more benchmark runs, rendered as a
    recommended command line, and all collected results are dumped to
    ``scripts/tune_results_gemma4_256k.json``.
    """
    print("=" * 70)
    print(" Gemma4 26B-A4B COMPREHENSIVE Auto-Tuner")
    print(" 256K Context | RTX 3060 12GB")
    print("=" * 70)
    print()

    cfg = dict(BEST)

    # (banner title, swept key or keys, candidate values) per phase
    phases = [
        # ─── Phase 1: -ngl (already done, quick verify top 3) ───
        ("GPU Layers (-ngl)", "ngl", [22, 21, 20]),
        # ─── Phase 2: CPU threads (-t, -tb) ───
        ("CPU Threads (-t, -tb)", ["t", "tb"], [
            (2, 2), (4, 4), (4, 8), (6, 6), (6, 8),
            (8, 8), (8, 12), (10, 10), (12, 12), (16, 16)
        ]),
        # ─── Phase 3: Batch sizes (-ub, -b) ───
        ("Batch Sizes (-ub, -b)", ["ub", "b"], [
            (128, 512), (256, 1024), (256, 2048),
            (512, 1024), (512, 2048), (512, 4096),
            (1024, 2048), (1024, 4096)
        ]),
        # ─── Phase 4: KV cache precision ───
        ("KV Cache Type (-ctk, -ctv)", ["ctk", "ctv"], [
            ("q4_0", "q4_0"),
            ("q8_0", "q8_0"),
            ("q4_0", "q8_0"),
            ("f16", "f16"),
        ]),
        # ─── Phase 5: Misc (mmap, poll, prio) ───
        ("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], [
            (True, 50, 2),    # baseline
            (False, 50, 2),   # no-mmap
            (True, 0, 2),     # no polling
            (True, 100, 2),   # max polling
            (True, 50, 3),    # realtime priority
            (False, 0, 3),    # no-mmap + no-poll + realtime
        ]),
    ]
    for phase_name, params, values in phases:
        winner = phase_sweep(phase_name, params, values, cfg)
        if winner:
            # Carry the winning values forward; a phase without any
            # successful run leaves cfg unchanged.
            for key in ([params] if isinstance(params, str) else params):
                cfg[key] = winner[key]

    # ─── Final Report ───
    print()
    print("=" * 70)
    print(" FINAL OPTIMAL CONFIGURATION")
    print("=" * 70)
    print(f" ngl: {cfg['ngl']}")
    print(f" threads: -t {cfg['t']} -tb {cfg['tb']}")
    print(f" batch: -ub {cfg['ub']} -b {cfg['b']}")
    print(f" kv cache: -ctk {cfg['ctk']} -ctv {cfg['ctv']}")
    print(f" flash: -fa {cfg['fa']}")
    print(f" mlock: {'yes' if cfg['mlock'] else 'no'}")
    print(f" mmap: {'yes' if cfg['mmap'] else 'no (--no-mmap)'}")
    print(f" prio: {cfg['prio']}")
    print(f" poll: {cfg['poll']}")
    print()

    # Final verification run
    print(" Running final verification (5 runs)...")
    kill_server()
    proc = start_server(cfg)
    final_speeds = []
    try:
        if wait_for_server():
            # Warmup (failure is tolerated)
            try:
                run_benchmark(max_tokens=20)
            except Exception:
                pass
            for i in range(5):
                try:
                    tps = run_benchmark()
                except Exception as e:
                    print(f" Run {i+1}: ERROR - {e}")
                    continue
                final_speeds.append(tps)
                print(f" Run {i+1}: {tps:.2f} t/s")
        else:
            # Previously ignored: every run would then fail silently.
            print(" Final verification server FAILED to start")
    finally:
        proc.kill()

    if final_speeds:
        avg = sum(final_speeds) / len(final_speeds)
        best = max(final_speeds)
        print(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")

    print()
    cmd_parts = [
        f"llama-server --model {MODEL}",
        f"-ngl {cfg['ngl']} -c {CONTEXT}",
        f"-t {cfg['t']} -tb {cfg['tb']}",
        f"-ub {cfg['ub']} -b {cfg['b']}",
        f"-fa {cfg['fa']}",
        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
        f"--prio {cfg['prio']} --poll {cfg['poll']}",
    ]
    if cfg["mlock"]:
        cmd_parts.append("--mlock")
    if not cfg["mmap"]:
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")

    print(" Recommended command:")
    print(f" {' '.join(cmd_parts)}")
    print("=" * 70)

    # Dump all results to JSON (path is relative to the current directory)
    with open("scripts/tune_results_gemma4_256k.json", "w", encoding="utf-8") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)
    print(f"\n Full results saved: scripts/tune_results_gemma4_256k.json")


if __name__ == "__main__":
    main()
|
||||
163
scripts/_archive/tuning/auto_tune_gemma4_ncpumoe.py
Normal file
163
scripts/_archive/tuning/auto_tune_gemma4_ncpumoe.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""
|
||||
Gemma4 26B --n-cpu-moe sweep + secondary param tuning at 256K context.
|
||||
Gemma4 has 30 layers. Sweep n-cpu-moe from 0 to 30.
|
||||
"""
|
||||
import subprocess, time, json, urllib.request, sys, os
|
||||
|
||||
# Switch console output to UTF-8 so the non-ASCII report characters ("★", "→")
# can be printed.  A stdout object without reconfigure() raises AttributeError;
# that is the only failure deliberately ignored (same handling as the sibling
# tuning scripts), so unrelated errors are no longer hidden.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass
|
||||
|
||||
# HTTP endpoint of the llama-server instance under test (start() binds port 8000).
BASE_URL = "http://127.0.0.1:8000"

# Executable and model paths, relative to the directory the script is run from.
SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"

# Context size handed to the server via -c (256K tokens).
CTX = 256 * 1024

# Number of timed benchmark requests per configuration.
RUNS = 3
|
||||
|
||||
|
||||
def kill():
    """Force-kill every ``llama-server.exe`` process, then pause four seconds.

    The ``taskkill`` output is captured and its exit status is not checked.
    """
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill_cmd, capture_output=True)
    # Fixed settle delay before the caller launches the next server.
    time.sleep(4)
|
||||
|
||||
|
||||
def start(ncpumoe, t=4, ub=512, b=2048, ctk="q4_0", ctv="q4_0", prio=3, nommap=False):
    """Launch llama-server for one tuning configuration.

    Args:
        ncpumoe: value for ``--n-cpu-moe``; the flag is omitted when it is 0
            or negative.
        t: thread count, passed to both ``-t`` and ``-tb``.
        ub: micro-batch size (``-ub``).
        b: batch size (``-b``).
        ctk: K-cache type (``--cache-type-k``).
        ctv: V-cache type (``--cache-type-v``).
        prio: value for ``--prio``.
        nommap: when true, ``--no-mmap`` is appended.

    Returns:
        The ``subprocess.Popen`` handle of the started server.
    """
    cmd = [
        SERVER, "--model", MODEL, "-ngl", "999",
        "-c", str(CTX), "-np", "1", "-fa", "on",
        "--cache-type-k", ctk, "--cache-type-v", ctv,
        "-ub", str(ub), "-b", str(b), "-t", str(t), "-tb", str(t),
        "--prio", str(prio), "--poll", "50",
        "--mlock", "--port", "8000", "--host", "0.0.0.0",
    ]
    if ncpumoe > 0:
        cmd.extend(["--n-cpu-moe", str(ncpumoe)])
    if nommap:
        cmd.append("--no-mmap")
    # Nothing in this script reads the server's output.  Leaving it on an
    # undrained PIPE lets the OS pipe buffer fill up, at which point the server
    # blocks on its next log write and the benchmark stalls.  Discard it instead.
    return subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT,
                            cwd=os.getcwd())
|
||||
|
||||
|
||||
def wait_ready(timeout=240):
    """Poll the server's ``/health`` endpoint until it reports ``status == "ok"``.

    Args:
        timeout: maximum number of seconds to keep polling.

    Returns:
        True as soon as a health response with status "ok" is received,
        False if the deadline passes first.
    """
    # Monotonic clock: the deadline is unaffected by wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(f"{BASE_URL}/health", timeout=3) as r:
                if json.loads(r.read()).get("status") == "ok":
                    return True
        except Exception:
            # Server not reachable yet, or an unusable reply: retry.
            # `Exception` (not a bare except) so Ctrl+C still aborts the wait.
            pass
        time.sleep(2)
    return False
|
||||
|
||||
|
||||
def bench(n=200):
    """Send one chat-completion request and return completion tokens per second.

    Args:
        n: ``max_tokens`` for the request.

    Returns:
        ``completion_tokens`` from the response's ``usage`` section divided by
        the wall-clock duration of the whole request; 0 if no time elapsed.
    """
    body = {
        "model": "m",
        "messages": [{"role": "user",
                      "content": "Count from 1 to 50, each number on new line."}],
        "max_tokens": n,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    began = time.time()
    with urllib.request.urlopen(request, timeout=300) as resp:
        reply = json.loads(resp.read())
    elapsed = time.time() - began
    tokens = reply.get("usage", {}).get("completion_tokens", 0)
    if elapsed > 0:
        return tokens / elapsed
    return 0
|
||||
|
||||
|
||||
def vram():
    """Return ``(used, total)`` GPU memory as reported by ``nvidia-smi``.

    nvidia-smi prints one ``used, total`` CSV line per GPU.  The values are
    summed over all lines, so a multi-GPU machine reports its combined memory
    instead of failing to parse (the previous single ``split(",")`` only
    worked when exactly one GPU was present).

    Returns:
        A tuple of two ints.  ``(0, 0)`` when nvidia-smi cannot be run, times
        out, or prints something that is not two integers per line.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5,
        )
        used = total = 0
        for line in r.stdout.strip().splitlines():
            if not line.strip():
                continue
            a, b = line.split(",")
            used += int(a.strip())
            total += int(b.strip())
        return used, total
    except (OSError, subprocess.SubprocessError, ValueError):
        # Missing binary / timeout / unparsable output: VRAM is informational only.
        return 0, 0
|
||||
|
||||
|
||||
def test(label, ncpumoe, **kw):
    """Benchmark one server configuration.

    Kills any running server, starts a new one with ``start(ncpumoe, **kw)``,
    waits for it to become healthy, performs one warm-up request and ``RUNS``
    timed requests, then kills the server again.

    Args:
        label: short name printed in the progress line and stored in the result.
        ncpumoe: ``--n-cpu-moe`` value for this run.
        **kw: extra keyword arguments forwarded to ``start`` and copied into
            the result dict.

    Returns:
        A dict with ``label``, ``ncpumoe``, ``avg``, ``best``, ``vram`` plus the
        forwarded keyword arguments, or None if the server did not come up or
        every timed request failed.
    """
    kill()
    print(f" [{label}] Starting...", end=" ", flush=True)
    p = start(ncpumoe, **kw)
    speeds = []
    try:
        if not wait_ready():
            print("FAILED")
            return None
        vu, vt = vram()
        print(f"VRAM:{vu}/{vt} | ", end="", flush=True)
        try:
            bench(20)  # warm-up; its outcome is intentionally ignored
        except Exception:
            pass
        for _ in range(RUNS):
            try:
                speeds.append(bench())
            except Exception as e:
                # Report the failure instead of dropping it silently.
                print(f"ERR({e}) ", end="", flush=True)
    finally:
        # Always stop the server, including on Ctrl+C or an unexpected error.
        p.kill()
    if not speeds:
        print("BENCH FAILED")
        return None
    avg, best = sum(speeds) / len(speeds), max(speeds)
    print(f"AVG:{avg:.1f} BEST:{best:.1f} t/s")
    return {"label": label, "ncpumoe": ncpumoe, "avg": avg, "best": best,
            "vram": vu, **kw}
|
||||
|
||||
|
||||
def main():
    """Run the staged tuning sweep and write all results to JSON.

    Stages: coarse ``--n-cpu-moe`` sweep, fine sweep around the coarse winner,
    thread sweep, batch-size sweep and KV-cache-type sweep.  Each successful
    run is appended to one result list, which is dumped to
    ``scripts/tune_results_gemma4_ncpumoe.json`` at the end.
    """
    print("=" * 60)
    print(" Gemma4 26B 256K | --n-cpu-moe Sweep + Param Tune")
    print("=" * 60)
    results = []

    def record(r):
        if r:
            results.append(r)

    def fastest(rows):
        return max(rows, key=lambda x: x["avg"])

    # Phase 1: n-cpu-moe sweep (0, 5, 10, 15, 20, 25, 30)
    print("\n--- Phase 1: --n-cpu-moe sweep ---")
    for n in [0, 5, 10, 15, 20, 25, 30]:
        nm = n > 15  # use --no-mmap when heavy CPU offload
        record(test(f"ncpumoe={n}", n, nommap=nm))

    # Every later stage picks a winner with max(); bail out cleanly instead of
    # crashing on an empty list when no configuration could be benchmarked.
    if not results:
        print("\n No configuration produced a benchmark result; nothing to tune.")
        return

    # Find best n-cpu-moe
    best_r = fastest(results)
    best_n = best_r["ncpumoe"]
    print(f"\n ★ Best n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")

    # Fine-tune around best
    if best_n > 0:
        print(f"\n--- Phase 1b: Fine-tune around ncpumoe={best_n} ---")
        tried = {r["ncpumoe"] for r in results}
        for offset in (-3, -1, 1, 3):
            # Clamp to the 0..30 range swept above (previously best_n + 1 could
            # reach 31) and skip values that already have a result.
            n = max(0, min(30, best_n + offset))
            if n in tried:
                continue
            tried.add(n)
            nm = n > 15
            record(test(f"ncpumoe={n}", n, nommap=nm))
        best_r = fastest(results)
        best_n = best_r["ncpumoe"]
        print(f"\n ★ Refined n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")

    # Phase 2: Thread sweep at best n-cpu-moe
    nm = best_n > 15
    print(f"\n--- Phase 2: Thread sweep (ncpumoe={best_n}) ---")
    for t in [2, 4, 6, 8, 10]:
        record(test(f"t={t}", best_n, t=t, nommap=nm))
    best_t = fastest([x for x in results if x["ncpumoe"] == best_n])
    # Phase-1 results carry no "t" key; they ran with start()'s default of 4.
    bt = best_t.get("t", 4)
    print(f"\n ★ Best threads: {bt}")

    # Phase 3: Batch sweep
    print(f"\n--- Phase 3: Batch sweep ---")
    for ub, b in [(256, 1024), (512, 2048), (512, 4096), (1024, 2048)]:
        record(test(f"ub={ub},b={b}", best_n, t=bt, ub=ub, b=b, nommap=nm))

    # Phase 4: KV cache type
    print(f"\n--- Phase 4: KV cache type ---")
    for ctk, ctv in [("q4_0", "q4_0"), ("q8_0", "q8_0")]:
        record(test(f"kv={ctk}", best_n, t=bt, ctk=ctk, ctv=ctv, nommap=nm))

    # Final report
    best_all = fastest(results)
    print(f"\n{'='*60}")
    print(f" FINAL BEST: {best_all['label']} → {best_all['avg']:.1f} t/s (VRAM: {best_all['vram']})")
    print(f"{'='*60}")

    with open("scripts/tune_results_gemma4_ncpumoe.json", "w") as f:
        json.dump(results, f, indent=2, default=str)
    print(" Saved: scripts/tune_results_gemma4_ncpumoe.json")
|
||||
|
||||
|
||||
# Start the sweep only when the file is run as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
335
scripts/_archive/tuning/auto_tune_qwen35b_256k.py
Normal file
335
scripts/_archive/tuning/auto_tune_qwen35b_256k.py
Normal file
@@ -0,0 +1,335 @@
|
||||
"""
|
||||
Qwen3.5 35B-A3B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
|
||||
Based on existing optimized setting: --cpu-moe -ngl 999 -c 4096 -t 6 (35 t/s)
|
||||
Now tuning for -c 262144 (256K context).
|
||||
|
||||
Phase 1: --cpu-moe vs no --cpu-moe baseline
|
||||
Phase 2: -t / -tb sweep
|
||||
Phase 3: -ub / -b sweep
|
||||
Phase 4: --cache-type-k/v sweep
|
||||
Phase 5: Misc (mmap, poll, prio)
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Switch stdout to UTF-8 so the non-ASCII characters printed by the report
# ("★", "→", "─") can be encoded; a stdout object without reconfigure() is
# left as it is.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass
|
||||
|
||||
# --- Server endpoint and file locations (paths relative to the working dir) ---
BASE_URL = "http://127.0.0.1:8000"
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"

# --- Benchmark shape ---
CONTEXT = 256 * 1024      # value passed to -c (256K tokens)
BENCHMARK_RUNS = 3        # timed requests per configuration
BENCHMARK_TOKENS = 200    # default max_tokens of a timed request

# Starting configuration; main() copies it and overwrites the swept keys with
# each phase's winner.
BEST = dict(
    ngl=999,
    cpu_moe=True,
    t=6,
    tb=6,
    ub=512,
    b=2048,
    ctk="q4_0",
    ctv="q4_0",
    fa="on",
    mlock=True,
    mmap=True,
    prio=2,
    poll=50,
)

# Every successful test_config() result is appended here and dumped at the end.
ALL_RESULTS = []
|
||||
|
||||
|
||||
def kill_server():
    """Force-kill all ``llama-server.exe`` processes and wait four seconds.

    ``taskkill`` output is captured and its exit status is not inspected.
    """
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill_cmd, capture_output=True)
    # Fixed pause before the next server is started.
    time.sleep(4)
|
||||
|
||||
|
||||
def build_cmd(cfg):
    """Translate a configuration dict into the llama-server argument list.

    Args:
        cfg: dict with the keys ngl, fa, ctk, ctv, ub, b, t, tb, prio, poll,
            mlock and mmap; ``cpu_moe`` is optional.

    Returns:
        The argv list, starting with the server executable.
    """
    argv = [LLAMA_SERVER, "--model", MODEL]
    argv += ["-ngl", str(cfg["ngl"])]
    argv += ["-c", str(CONTEXT)]
    argv += ["-np", "1"]
    argv += ["-fa", cfg["fa"]]
    argv += ["--cache-type-k", cfg["ctk"]]
    argv += ["--cache-type-v", cfg["ctv"]]
    argv += ["-ub", str(cfg["ub"])]
    argv += ["-b", str(cfg["b"])]
    argv += ["-t", str(cfg["t"])]
    argv += ["-tb", str(cfg["tb"])]
    argv += ["--prio", str(cfg["prio"])]
    argv += ["--poll", str(cfg["poll"])]
    argv += ["--port", "8000"]
    argv += ["--host", "0.0.0.0"]

    # Boolean switches: a flag is appended only when its condition holds.
    switches = (
        (cfg.get("cpu_moe"), "--cpu-moe"),
        (cfg["mlock"], "--mlock"),
        (not cfg["mmap"], "--no-mmap"),
    )
    for enabled, flag in switches:
        if enabled:
            argv.append(flag)
    return argv
|
||||
|
||||
|
||||
def start_server(cfg):
    """Start llama-server with the arguments produced by ``build_cmd(cfg)``.

    Args:
        cfg: configuration dict accepted by ``build_cmd``.

    Returns:
        The ``subprocess.Popen`` handle of the server process.
    """
    cmd = build_cmd(cfg)
    # The server's output is never read by this script.  An undrained PIPE
    # eventually fills the OS pipe buffer and blocks the server on its next
    # log write, which stalls the benchmark; discard the output instead.
    proc = subprocess.Popen(
        cmd, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT,
        cwd=os.getcwd()
    )
    return proc
|
||||
|
||||
|
||||
def wait_for_server(timeout=240):
    """Poll ``/health`` until the server answers with ``status == "ok"``.

    Args:
        timeout: maximum number of seconds to keep polling.

    Returns:
        True once a health response with status "ok" arrives, False when the
        deadline passes first.
    """
    # Monotonic clock so wall-clock adjustments cannot stretch or cut the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=3) as resp:
                data = json.loads(resp.read())
                if data.get("status") == "ok":
                    return True
        except Exception:
            # Not reachable yet or unusable reply: retry.  `Exception` rather
            # than a bare except, so Ctrl+C still interrupts the wait.
            pass
        time.sleep(2)
    return False
|
||||
|
||||
|
||||
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Issue one chat-completion request and return completion tokens per second.

    Args:
        max_tokens: generation limit sent with the request.

    Returns:
        ``usage.completion_tokens`` of the response divided by the wall-clock
        duration of the request; 0 if no time elapsed.
    """
    chat_request = {
        "model": "local-model",
        "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }
    http_req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(chat_request).encode("utf-8"),
        headers={"Content-Type": "application/json"}
    )

    began = time.time()
    with urllib.request.urlopen(http_req, timeout=300) as resp:
        reply = json.loads(resp.read())
    elapsed = time.time() - began

    completion_tokens = reply.get("usage", {}).get("completion_tokens", 0)
    if elapsed > 0:
        return completion_tokens / elapsed
    return 0
|
||||
|
||||
|
||||
def get_vram():
    """Return ``(used, total)`` GPU memory as reported by ``nvidia-smi``.

    nvidia-smi prints one ``used, total`` CSV line per GPU.  Values are summed
    over all lines; the previous single ``split(",")`` mis-parsed multi-line
    output and therefore always reported ``(0, 0)`` on multi-GPU machines.

    Returns:
        A tuple of two ints.  ``(0, 0)`` when nvidia-smi cannot be run, times
        out, or its output is not two integers per line.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        used = total = 0
        for line in r.stdout.strip().splitlines():
            if not line.strip():
                continue
            parts = line.split(",")
            used += int(parts[0].strip())
            total += int(parts[1].strip())
        return used, total
    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
        # Missing binary / timeout / unparsable output: VRAM is informational only.
        return 0, 0
|
||||
|
||||
|
||||
def test_config(cfg, label=""):
    """Benchmark one configuration and record the outcome.

    Restarts the server with ``cfg``, runs one warm-up request followed by
    ``BENCHMARK_RUNS`` timed requests, and stops the server again.

    Args:
        cfg: configuration dict accepted by ``build_cmd``.
        label: short description used in progress output; ``str(cfg)`` is
            printed when it is empty.

    Returns:
        A dict holding every key of ``cfg`` plus ``avg_tps``, ``best_tps``,
        ``vram_used``, ``vram_total`` and ``label`` (also appended to
        ``ALL_RESULTS``), or None if the server did not start or all timed
        requests failed.
    """
    kill_server()
    desc = label or str(cfg)
    print(f" [{desc}] Starting server...", flush=True)
    proc = start_server(cfg)

    speeds = []
    try:
        if not wait_for_server():
            print(f" [{desc}] FAILED to start")
            return None

        vram_used, vram_total = get_vram()
        print(f" [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)

        # Warmup: the result is discarded and a failure here is not fatal.
        try:
            run_benchmark(max_tokens=20)
        except Exception:
            pass

        for _ in range(BENCHMARK_RUNS):
            try:
                speeds.append(run_benchmark())
            except Exception as e:
                print(f"ERR({e}) ", end="", flush=True)
    finally:
        # Stop the server on every exit path, including Ctrl+C.
        proc.kill()

    if not speeds:
        print("ALL FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

    result = {**cfg, "avg_tps": avg, "best_tps": best,
              "vram_used": vram_used, "vram_total": vram_total, "label": label}
    ALL_RESULTS.append(result)
    return result
|
||||
|
||||
|
||||
def phase_sweep(phase_name, param_name, values, base_cfg):
    """Benchmark each candidate value of one parameter (or parameter group).

    Args:
        phase_name: title printed in the phase banner.
        param_name: a single config key, or a list of keys swept together.
        values: candidate values; when ``param_name`` is a list, each candidate
            is a sequence paired position-by-position with the keys.
        base_cfg: configuration every candidate is applied on top of; it is
            copied, not modified.

    Returns:
        The ``test_config`` result with the highest ``avg_tps`` (the earliest
        one on a tie), or None if no candidate produced a result.
    """
    rule = '=' * 70
    print(f"\n{rule}")
    print(f" PHASE: {phase_name}")
    print(f" Sweeping: {param_name} = {values}")
    print(f"{rule}")

    grouped = isinstance(param_name, list)
    winner = None
    for candidate in values:
        # Normalise both call shapes to a list of (key, value) pairs.
        if grouped:
            pairs = list(zip(param_name, candidate))
        else:
            pairs = [(param_name, candidate)]

        trial_cfg = dict(base_cfg)
        trial_cfg.update(pairs)
        tag = " | ".join(f"{key}={value}" for key, value in pairs)

        outcome = test_config(trial_cfg, tag)
        if not outcome:
            continue
        if winner is None or outcome["avg_tps"] > winner["avg_tps"]:
            winner = outcome

    if winner:
        print(f"\n ★ Phase winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
|
||||
|
||||
|
||||
def main():
    """Run all tuning phases, verify the winning configuration and save results.

    Each phase sweeps one parameter group with ``phase_sweep`` and carries the
    winner's values into the configuration used by the next phase.  After the
    last phase the final configuration is benchmarked five more times, a
    recommended command line is printed and ``ALL_RESULTS`` is written to
    ``scripts/tune_results_qwen35b_256k.json``.
    """
    print("=" * 70)
    print(" Qwen3.5 35B-A3B COMPREHENSIVE Auto-Tuner")
    print(" 256K Context | RTX 3060 12GB")
    print(" Baseline: --cpu-moe -ngl 999 -c 4096 -t 6 → 35 t/s")
    print("=" * 70)
    print()

    cfg = dict(BEST)

    def adopt(winner, keys):
        # Keep the previous values when a whole phase produced no result.
        if winner:
            for key in keys:
                cfg[key] = winner[key]

    # ─── Phase 1: --cpu-moe critical test ───
    r = phase_sweep("CPU-MoE Mode", "cpu_moe", [True, False], cfg)
    adopt(r, ["cpu_moe"])

    # ─── Phase 2: CPU threads ───
    thread_combos = [
        (2, 2), (4, 4), (4, 6), (6, 6), (6, 8),
        (8, 8), (8, 12), (10, 10), (12, 12)
    ]
    r = phase_sweep("CPU Threads (-t, -tb)", ["t", "tb"], thread_combos, cfg)
    adopt(r, ["t", "tb"])

    # ─── Phase 3: Batch sizes ───
    batch_combos = [
        (128, 512), (256, 1024), (256, 2048),
        (512, 1024), (512, 2048), (512, 4096),
        (1024, 2048), (1024, 4096)
    ]
    r = phase_sweep("Batch Sizes (-ub, -b)", ["ub", "b"], batch_combos, cfg)
    adopt(r, ["ub", "b"])

    # ─── Phase 4: KV cache ───
    kv_combos = [
        ("q4_0", "q4_0"),
        ("q8_0", "q8_0"),
        ("f16", "f16"),
    ]
    r = phase_sweep("KV Cache Type", ["ctk", "ctv"], kv_combos, cfg)
    adopt(r, ["ctk", "ctv"])

    # ─── Phase 5: Misc ───
    misc_combos = [
        (True, 50, 2),
        (False, 50, 2),
        (True, 0, 2),
        (True, 100, 2),
        (True, 50, 3),
    ]
    r = phase_sweep("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], misc_combos, cfg)
    adopt(r, ["mmap", "poll", "prio"])

    # ─── Final Report ───
    print()
    print("=" * 70)
    print(" FINAL OPTIMAL CONFIGURATION")
    print("=" * 70)
    for k, v in cfg.items():
        print(f" {k:>12}: {v}")
    print()

    # Final verification
    print(" Running final verification (5 runs)...")
    kill_server()
    proc = start_server(cfg)
    final_speeds = []
    try:
        # Previously the readiness result was ignored, so a server that never
        # came up led to five silently swallowed request failures.
        if wait_for_server():
            try:
                run_benchmark(max_tokens=20)  # warm-up, result discarded
            except Exception:
                pass
            for i in range(5):
                try:
                    tps = run_benchmark()
                except Exception as e:
                    print(f" Run {i+1}: ERR({e})")
                    continue
                final_speeds.append(tps)
                print(f" Run {i+1}: {tps:.2f} t/s")
        else:
            print(" Final verification skipped: server FAILED to start")
    finally:
        # Stop the server on every exit path, including Ctrl+C.
        proc.kill()

    if final_speeds:
        avg = sum(final_speeds) / len(final_speeds)
        best = max(final_speeds)
        print(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")

    print()
    cmd_parts = [
        f"llama-server --model {MODEL}",
        f"-ngl {cfg['ngl']} -c {CONTEXT}",
    ]
    if cfg.get("cpu_moe"):
        cmd_parts.append("--cpu-moe")
    cmd_parts.extend([
        f"-t {cfg['t']} -tb {cfg['tb']}",
        f"-ub {cfg['ub']} -b {cfg['b']}",
        f"-fa {cfg['fa']}",
        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
        f"--prio {cfg['prio']} --poll {cfg['poll']}",
    ])
    if cfg["mlock"]:
        cmd_parts.append("--mlock")
    if not cfg["mmap"]:
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")

    print(" Recommended command:")
    print(f" {' '.join(cmd_parts)}")
    print("=" * 70)

    with open("scripts/tune_results_qwen35b_256k.json", "w") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)
    print(f"\n Full results saved: scripts/tune_results_qwen35b_256k.json")
|
||||
|
||||
|
||||
# Run the tuner only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
531
scripts/_archive/tuning/dual_gpu_benchmark.mjs
Normal file
531
scripts/_archive/tuning/dual_gpu_benchmark.mjs
Normal file
@@ -0,0 +1,531 @@
|
||||
/**
|
||||
* Dual-GPU (2x RTX 3060 24GB) Comprehensive Model Benchmark
|
||||
* ===========================================================
|
||||
* Tests 4 models across multiple parameter configurations to find
|
||||
* the absolute best model + settings for 256K context coding agent.
|
||||
*
|
||||
* Models:
|
||||
* 1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB)
|
||||
* 2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB)
|
||||
* 3. Gemma4 26B-A4B Q4_K_M (~15.6 GB)
|
||||
* 4. Gemma4 26B-A4B MXFP4_MOE (~15.5 GB)
|
||||
*
|
||||
* Run: node scripts/dual_gpu_benchmark.mjs
|
||||
*/
|
||||
|
||||
import { spawn, execSync } from "child_process";
|
||||
import { writeFileSync, statSync, existsSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
|
||||
// ─── Configuration ─────────────────────────────────────────────
// Endpoint of the llama-server instance under test (buildCmd binds port 8000).
const BASE_URL = "http://127.0.0.1:8000";
// Server executable, relative to the working directory.
const LLAMA_SERVER = String.raw`llama_bin_run\llama-server.exe`;
// Context size passed via -c (256K tokens).
const CONTEXT = 256 * 1024;
// Timed requests per configuration, and their default max_tokens.
const BENCHMARK_RUNS = 3;
const BENCHMARK_TOKENS = 200;
// Default time waitForServer() polls before giving up, in milliseconds.
const SERVER_TIMEOUT = 300 * 1000;

// Models to benchmark, in run order.
const MODELS = [
  {
    name: "Qwen3.5-35B-A3B Q4_K_M",
    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
    type: "qwen",
    quant: "Q4_K_M",
    totalLayers: 64,
  },
  {
    name: "Qwen3.5-35B-A3B MXFP4_MOE",
    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
    type: "qwen",
    quant: "MXFP4_MOE",
    totalLayers: 64,
  },
  {
    name: "Gemma4 26B-A4B Q4_K_M",
    path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
    type: "gemma4",
    quant: "Q4_K_M",
    totalLayers: 30,
  },
  {
    name: "Gemma4 26B-A4B MXFP4_MOE",
    path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
    type: "gemma4",
    quant: "MXFP4_MOE",
    totalLayers: 30,
  },
];

// Every successful benchmark result is appended here and written to disk.
const ALL_RESULTS = [];
|
||||
|
||||
// ─── Utility ───────────────────────────────────────────────────
|
||||
|
||||
/**
 * Print a message to stdout prefixed with the current time in brackets.
 * @param {string} msg text to print
 */
function log(msg) {
  const stamp = new Date().toLocaleTimeString("ko-KR", { hour12: false });
  console.log("[" + stamp + "] " + msg);
}
|
||||
|
||||
/**
 * Return a promise that resolves (with no value) after `ms` milliseconds.
 * @param {number} ms delay in milliseconds
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||
|
||||
/**
 * Force-kill all llama-server.exe processes, then wait five seconds.
 * A failing taskkill call is ignored.
 * @returns {Promise<void>} resolves when the five-second pause is over
 */
function killServer() {
  const taskkill = "taskkill /F /IM llama-server.exe";
  try {
    execSync(taskkill, { stdio: "ignore" });
  } catch {
    // execSync throws when the command fails; there is nothing to clean up then.
  }
  return sleep(5000);
}
|
||||
|
||||
/**
 * Query nvidia-smi for per-GPU memory usage.
 * @returns {{gpu: number, used: number, total: number}[]} one entry per output
 *   line of nvidia-smi; an empty array if the command fails.
 */
function getVramAll() {
  try {
    const csv = execSync(
      'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits',
      { encoding: "utf-8", timeout: 5000 }
    );
    const rows = [];
    for (const line of csv.trim().split("\n")) {
      // Each line is "index, used, total".
      const numbers = line.split(",").map((field) => parseInt(field.trim()));
      rows.push({ gpu: numbers[0], used: numbers[1], total: numbers[2] });
    }
    return rows;
  } catch {
    return [];
  }
}
|
||||
|
||||
/**
 * Build the llama-server command line for one configuration.
 * @param {string} modelPath path of the GGUF model file
 * @param {object} params tuning parameters: ngl, t, ub, b, ctk, ctv are read
 *   as given; cpuMoe (false), nCpuMoe (0), prio (3) and nommap (false) fall
 *   back to the defaults in parentheses when undefined
 * @returns {string[]} argv array whose first element is the executable
 */
function buildCmd(modelPath, params) {
  const { ngl, t, ub, b, ctk, ctv } = params;
  const { cpuMoe = false, nCpuMoe = 0, prio = 3, nommap = false } = params;

  const argv = [LLAMA_SERVER, "--model", modelPath];
  argv.push("-ngl", String(ngl));
  argv.push("-c", String(CONTEXT));
  argv.push("-np", "1");
  argv.push("-fa", "on");
  argv.push("--cache-type-k", ctk);
  argv.push("--cache-type-v", ctv);
  argv.push("-ub", String(ub));
  argv.push("-b", String(b));
  // The same thread count is used for -t and -tb.
  argv.push("-t", String(t));
  argv.push("-tb", String(t));
  argv.push("--prio", String(prio));
  argv.push("--poll", "50");
  argv.push("--mlock");
  argv.push("--port", "8000");
  argv.push("--host", "0.0.0.0");

  // --cpu-moe takes precedence over --n-cpu-moe.
  if (cpuMoe) {
    argv.push("--cpu-moe");
  } else if (nCpuMoe > 0) {
    argv.push("--n-cpu-moe", String(nCpuMoe));
  }
  if (nommap) {
    argv.push("--no-mmap");
  }

  return argv;
}
|
||||
|
||||
/**
 * Spawn llama-server for the given model and parameters.
 * @param {string} modelPath path of the GGUF model file
 * @param {object} params tuning parameters accepted by buildCmd
 * @returns {import("child_process").ChildProcess} handle of the server process
 */
function startServer(modelPath, params) {
  const [exe, ...args] = buildCmd(modelPath, params);
  // Only the last 12 arguments are shown to keep the log line short.
  log(` CMD: ${exe} ${args.slice(-12).join(" ")} ...`);
  return spawn(exe, args, {
    cwd: process.cwd(),
    // The server's output is never consumed by this script.  With "pipe" the
    // child blocks once the pipe buffer is full, which stalls the benchmark,
    // so all three streams are ignored.
    stdio: "ignore",
  });
}
|
||||
|
||||
/**
 * Poll the /health endpoint until it reports status "ok" or the timeout passes.
 * @param {number} [timeoutMs=SERVER_TIMEOUT] maximum polling time in milliseconds
 * @returns {Promise<{ok: boolean, bootTime: number}>} bootTime is the elapsed
 *   time in seconds on success, or the full timeout in seconds on failure
 */
async function waitForServer(timeoutMs = SERVER_TIMEOUT) {
  const began = Date.now();
  const healthUrl = `${BASE_URL}/health`;
  while (Date.now() - began < timeoutMs) {
    try {
      const reply = await fetch(healthUrl, { signal: AbortSignal.timeout(3000) });
      const body = await reply.json();
      if (body.status === "ok") {
        const bootTime = (Date.now() - began) / 1000;
        return { ok: true, bootTime };
      }
    } catch {
      // Request failed or the reply was not JSON: poll again.
    }
    await sleep(3000);
  }
  return { ok: false, bootTime: timeoutMs / 1000 };
}
|
||||
|
||||
/**
 * Send one chat-completion request and measure generation throughput.
 * @param {number} [maxTokens=BENCHMARK_TOKENS] max_tokens sent with the request
 * @returns {Promise<{tps: number, completionTokens: number, promptTokens: number, elapsed: number}>}
 *   tps is completion tokens divided by the wall-clock duration of the request
 * @throws {Error} when the request fails, times out, or the server answers
 *   with a non-2xx status
 */
async function runBenchmark(maxTokens = BENCHMARK_TOKENS) {
  const payload = JSON.stringify({
    model: "local-model",
    messages: [{ role: "user", content: "Count from 1 to 50, writing each number on a new line." }],
    max_tokens: maxTokens,
    temperature: 0.0,
  });

  const start = Date.now();
  const resp = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
    signal: AbortSignal.timeout(600_000),
  });
  // An error reply carries no usage block; without this check it was scored as
  // a valid run at 0 t/s and dragged the average down.  Callers already catch
  // and log thrown errors.
  if (!resp.ok) {
    const detail = (await resp.text().catch(() => "")).slice(0, 200);
    throw new Error(`HTTP ${resp.status} ${resp.statusText} ${detail}`.trim());
  }
  const result = await resp.json();
  const elapsed = (Date.now() - start) / 1000;

  const usage = result.usage || {};
  const ct = usage.completion_tokens || 0;
  return {
    tps: elapsed > 0 ? ct / elapsed : 0,
    completionTokens: ct,
    promptTokens: usage.prompt_tokens || 0,
    elapsed,
  };
}
|
||||
|
||||
/**
 * Benchmark one model/parameter combination.
 * Restarts the server, runs one warm-up request and BENCHMARK_RUNS timed
 * requests, kills the server and records the outcome in ALL_RESULTS.
 * @param {object} model entry of MODELS
 * @param {string} label short name used in log output and in the result
 * @param {object} params tuning parameters accepted by buildCmd
 * @returns {Promise<object|null>} the recorded result, or null when the server
 *   did not start or every timed request failed
 */
async function testConfig(model, label, params) {
  await killServer();
  log(` [${label}] Starting server...`);

  const server = startServer(model.path, params);
  const boot = await waitForServer();

  if (!boot.ok) {
    log(` [${label}] FAILED to start (timeout ${SERVER_TIMEOUT / 1000}s)`);
    server.kill("SIGKILL");
    return null;
  }
  const bootTime = boot.bootTime;

  const vram = getVramAll();
  const vramStr = vram
    .map((g) => `GPU${g.gpu}:${g.used}/${g.total}MiB`)
    .join(" | ");
  log(` [${label}] Boot: ${bootTime.toFixed(0)}s | VRAM: ${vramStr}`);

  // Warmup: result and errors are both discarded.
  try {
    await runBenchmark(20);
  } catch {}

  // Benchmark
  const speeds = [];
  let run = 0;
  while (run < BENCHMARK_RUNS) {
    run += 1;
    try {
      const sample = await runBenchmark();
      speeds.push(sample.tps);
      log(` Run ${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run ${run}: ERROR (${e.message})`);
    }
  }

  server.kill("SIGKILL");

  if (speeds.length === 0) {
    log(` [${label}] ALL BENCHMARK RUNS FAILED`);
    return null;
  }

  let total = 0;
  for (const s of speeds) total += s;
  const avg = total / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] => AVG: ${avg.toFixed(2)} t/s | BEST: ${best.toFixed(2)} t/s`);

  const result = {
    model: model.name,
    quant: model.quant,
    label,
    avg_tps: +avg.toFixed(2),
    best_tps: +best.toFixed(2),
    boot_time: +bootTime.toFixed(1),
    vram,
    params,
  };
  ALL_RESULTS.push(result);
  return result;
}
|
||||
|
||||
// ─── Phase Runners ─────────────────────────────────────────────
|
||||
|
||||
/**
 * Phase 0: find a configuration that starts and benchmarks at the target context.
 * Tries full GPU offload, then --cpu-moe, then half of the model's layers on
 * the GPU, and stops at the first attempt that yields a result.
 * @param {object} model entry of MODELS
 * @returns {Promise<object|null>} first successful testConfig result, or null
 */
async function phase0_bootTest(model) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 0: Boot Test — ${model.name}`);
  log(`${rule}`);

  const common = { ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };
  const attempts = [
    // Try full GPU first
    { label: "boot-ngl999", params: { ...common } },
    // Try with cpu-moe
    {
      notice: " Full GPU failed, trying with --cpu-moe...",
      label: "boot-cpumoe",
      params: { ...common, cpuMoe: true },
    },
    // Reduced layers
    {
      notice: " --cpu-moe also failed, trying reduced layers...",
      label: "boot-ngl-half",
      params: { ...common, ngl: Math.floor(model.totalLayers / 2) },
    },
  ];

  let outcome = null;
  for (const attempt of attempts) {
    if (attempt.notice) log(attempt.notice);
    outcome = await testConfig(model, attempt.label, attempt.params);
    if (outcome) return outcome;
  }
  return outcome;
}
|
||||
|
||||
/**
 * Phase 1: compare expert-offload strategies (--cpu-moe on/off, --n-cpu-moe N).
 * @param {object} model entry of MODELS
 * @param {object|null} baseline phase-0 result; it competes with the new
 *   results and its configuration is not run a second time
 * @returns {Promise<object|null>} result with the highest avg_tps, or null if
 *   nothing produced a result
 */
async function phase1_gpuOffload(model, baseline) {
  log(`\n${"=".repeat(70)}`);
  log(` PHASE 1: GPU Offload Strategy — ${model.name}`);
  log(`${"=".repeat(70)}`);

  const common = { ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };
  const results = baseline ? [baseline] : [];
  const bp = baseline?.params;

  // Test --cpu-moe on/off
  for (const cpuMoe of [true, false]) {
    // A baseline without a cpuMoe key means "off".  Comparing the raw value
    // (undefined === false) never matched, so the full-GPU baseline was
    // benchmarked twice.
    if (bp && bp.ngl === 999 && (bp.cpuMoe ?? false) === cpuMoe) continue;
    const r = await testConfig(model, `ngl=999 cpuMoe=${cpuMoe}`, { ...common, cpuMoe });
    if (r) results.push(r);
  }

  // n-cpu-moe sweep.  N=0 is left out: buildCmd emits no offload flag for it,
  // which is the same command line as cpuMoe=false above.
  for (const n of [5, 10, 15, 20]) {
    if (n > model.totalLayers) continue;
    const r = await testConfig(model, `n-cpu-moe=${n}`, { ...common, nCpuMoe: n });
    if (r) results.push(r);
  }

  if (results.length === 0) { log(" PHASE 1: No config worked!"); return null; }
  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
  log(`\n ★ Phase 1 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
|
||||
|
||||
/**
 * Phase 2: sweep the CPU thread count on top of the previous winner.
 * @param {object} model entry of MODELS
 * @param {object} prev previous phase's winning result; its params are the base
 * @returns {Promise<object>} result with the highest avg_tps among prev and
 *   the new runs (on a tie the later run wins)
 */
async function phase2_threads(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 2: CPU Thread Sweep — ${model.name}`);
  log(`${rule}`);

  const baseParams = prev.params;
  let champion = prev;

  for (const t of [2, 4, 6, 8, 10, 12]) {
    // The base thread count was already measured by the previous phase.
    if (t === baseParams.t) continue;
    const outcome = await testConfig(model, `t=${t}`, { ...baseParams, t });
    if (outcome) {
      champion = champion.avg_tps > outcome.avg_tps ? champion : outcome;
    }
  }

  log(`\n ★ Phase 2 winner: ${champion.label} → ${champion.avg_tps.toFixed(2)} t/s`);
  return champion;
}
|
||||
|
||||
/**
 * Phase 3: sweep (-ub, -b) batch-size pairs on top of the previous winner.
 * @param {object} model entry of MODELS
 * @param {object} prev previous phase's winning result; its params are the base
 * @returns {Promise<object>} result with the highest avg_tps among prev and
 *   the new runs (on a tie the later run wins)
 */
async function phase3_batch(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 3: Batch Size Sweep — ${model.name}`);
  log(`${rule}`);

  const baseParams = prev.params;
  const candidates = [
    [128, 512], [256, 1024], [256, 2048],
    [512, 1024], [512, 2048], [512, 4096],
    [1024, 2048], [1024, 4096],
  ];
  let champion = prev;

  for (const [ub, b] of candidates) {
    // The base pair was already measured by the previous phase.
    const sameAsBase = ub === baseParams.ub && b === baseParams.b;
    if (sameAsBase) continue;
    const outcome = await testConfig(model, `ub=${ub} b=${b}`, { ...baseParams, ub, b });
    if (outcome) {
      champion = champion.avg_tps > outcome.avg_tps ? champion : outcome;
    }
  }

  log(`\n ★ Phase 3 winner: ${champion.label} → ${champion.avg_tps.toFixed(2)} t/s`);
  return champion;
}
|
||||
|
||||
/**
 * Phase 4: sweep KV-cache type pairs on top of the previous winner.
 * @param {object} model entry of MODELS
 * @param {object} prev previous phase's winning result; its params are the base
 * @returns {Promise<object>} result with the highest avg_tps among prev and
 *   the new runs (on a tie the later run wins)
 */
async function phase4_kvcache(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 4: KV Cache Type Sweep — ${model.name}`);
  log(`${rule}`);

  const baseParams = prev.params;
  const candidates = [
    ["q4_0", "q4_0"], ["q8_0", "q8_0"],
    ["q4_0", "q8_0"], ["f16", "f16"],
  ];
  let champion = prev;

  for (const [ctk, ctv] of candidates) {
    // The base pair was already measured by the previous phase.
    const sameAsBase = ctk === baseParams.ctk && ctv === baseParams.ctv;
    if (sameAsBase) continue;
    const outcome = await testConfig(model, `kv=${ctk}/${ctv}`, { ...baseParams, ctk, ctv });
    if (outcome) {
      champion = champion.avg_tps > outcome.avg_tps ? champion : outcome;
    }
  }

  log(`\n ★ Phase 4 winner: ${champion.label} → ${champion.avg_tps.toFixed(2)} t/s`);
  return champion;
}
|
||||
|
||||
/**
 * Phase 5: re-run the winning configuration five times as a final check.
 * @param {object} model entry of MODELS
 * @param {object} prev winning result of the earlier phases
 * @returns {Promise<object>} a new "FINAL-..." result (also pushed to
 *   ALL_RESULTS) when at least one run succeeded; otherwise `prev` unchanged
 */
async function phase5_final(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 5: Final Verification (5 runs) — ${model.name}`);
  log(`${rule}`);

  await killServer();
  const server = startServer(model.path, prev.params);
  const boot = await waitForServer();
  if (!boot.ok) {
    log(" FAILED to start!");
    server.kill("SIGKILL");
    return prev;
  }
  const bootTime = boot.bootTime;

  const vram = getVramAll();
  // Warm-up: result and errors are both discarded.
  try {
    await runBenchmark(20);
  } catch {}

  const speeds = [];
  for (let run = 1; run <= 5; run++) {
    try {
      const sample = await runBenchmark();
      speeds.push(sample.tps);
      log(` Final Run ${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Final Run ${run}: ERROR (${e.message})`);
    }
  }
  server.kill("SIGKILL");

  // Without a single successful run, keep the earlier winner.
  if (speeds.length === 0) return prev;

  let total = 0;
  for (const s of speeds) total += s;
  const avg = total / speeds.length;
  const best = Math.max(...speeds);
  log(`\n ★ FINAL: AVG ${avg.toFixed(2)} t/s | BEST ${best.toFixed(2)} t/s`);

  const final_ = {
    model: model.name,
    quant: model.quant,
    label: `FINAL-${model.name}`,
    avg_tps: +avg.toFixed(2),
    best_tps: +best.toFixed(2),
    boot_time: +bootTime.toFixed(1),
    vram,
    params: prev.params,
  };
  ALL_RESULTS.push(final_);
  return final_;
}
|
||||
|
||||
// ─── Main ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Run all benchmark phases for one model.
 * @param {object} model entry of MODELS
 * @returns {Promise<object|null>} the winning result; the phase-0 baseline if
 *   phase 1 yields nothing; null if the model file is missing or cannot boot
 */
async function runModelBenchmark(model) {
  const hashes = "#".repeat(70);
  log(`\n${hashes}`);
  log(` MODEL: ${model.name}`);
  log(` File: ${model.path}`);
  let sizeLine = ` Size: unknown`;
  try {
    const gib = statSync(model.path).size / 1024 ** 3;
    sizeLine = ` Size: ${gib.toFixed(2)} GB`;
  } catch {
    // File not readable: the "unknown" line is printed.
  }
  log(sizeLine);
  log(`${hashes}`);

  if (!existsSync(model.path)) {
    log(` SKIP: Model file not found!`);
    return null;
  }

  const baseline = await phase0_bootTest(model);
  if (!baseline) {
    log(` SKIP: Cannot boot at 256K!`);
    return null;
  }

  const offloadWinner = await phase1_gpuOffload(model, baseline);
  if (!offloadWinner) return baseline;

  // Each later phase refines the winner of the one before it.
  let best = offloadWinner;
  for (const phase of [phase2_threads, phase3_batch, phase4_kvcache, phase5_final]) {
    best = await phase(model, best);
  }
  return best;
}
|
||||
|
||||
/**
 * Entry point: benchmark every model in MODELS, rank the per-model winners and
 * write the results.
 * Writes scripts/dual_gpu_results.json after each model and again at the end,
 * plus scripts/dual_gpu_summary.txt with the ranking and a recommended command.
 * @returns {Promise<void>}
 */
async function main() {
  const startTime = Date.now();
  const rule70 = "=".repeat(70);
  const rule60 = "=".repeat(60);

  log(rule70);
  log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK");
  log(" 2x RTX 3060 (24GB Total) | 256K Context");
  log(` Models: ${MODELS.length}`);
  log(` Started: ${new Date().toISOString()}`);
  log(rule70);

  for (const g of getVramAll()) {
    log(` GPU ${g.gpu}: ${g.used}/${g.total} MiB used`);
  }

  const winners = [];

  for (const [idx, model] of MODELS.entries()) {
    log(`\n${rule70}`);
    log(` STARTING MODEL ${idx + 1}/${MODELS.length}: ${model.name}`);
    log(`${rule70}`);

    const winner = await runModelBenchmark(model);
    if (winner) winners.push(winner);

    // Save intermediate
    writeFileSync("scripts/dual_gpu_results.json",
      JSON.stringify(ALL_RESULTS, null, 2));
    log(` Intermediate saved (${ALL_RESULTS.length} configs tested)`);
  }

  // ─── Grand Final ───────────────────────────────────────────
  const elapsed = (Date.now() - startTime) / 60000;

  log(`\n${rule70}`);
  log(` GRAND FINAL COMPARISON`);
  log(` Total time: ${elapsed.toFixed(1)} minutes`);
  log(` Configs tested: ${ALL_RESULTS.length}`);
  log(`${rule70}`);

  if (winners.length === 0) {
    log(" No models ran at 256K!");
    return;
  }

  // Fastest model first.
  winners.sort((a, b) => b.avg_tps - a.avg_tps);
  const medals = ["🥇", "🥈", "🥉", " "];

  const lines = [];
  lines.push(`Dual-GPU Benchmark Results — ${new Date().toISOString()}`);
  lines.push(`Hardware: 2x RTX 3060 12GB | Context: 256K`);
  lines.push(`Configs tested: ${ALL_RESULTS.length} | Time: ${elapsed.toFixed(1)} min`);
  lines.push("", rule60, " RANKING (by AVG t/s)", rule60);

  winners.forEach((w, rank) => {
    const p = w.params;
    lines.push("");
    lines.push(` ${medals[rank] || " "} #${rank + 1}: ${w.model}`);
    lines.push(` AVG: ${w.avg_tps.toFixed(2)} t/s | BEST: ${w.best_tps.toFixed(2)} t/s`);
    lines.push(` Boot: ${w.boot_time.toFixed(0)}s`);
    lines.push(` ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b}`);
    lines.push(` ctk=${p.ctk} ctv=${p.ctv}`);
    if (p.cpuMoe) {
      lines.push(` --cpu-moe`);
    } else if ((p.nCpuMoe || 0) > 0) {
      lines.push(` --n-cpu-moe ${p.nCpuMoe}`);
    }
  });

  const champ = winners[0];
  const cp = champ.params;
  lines.push("", rule60);
  lines.push(` ★ CHAMPION: ${champ.model}`);
  lines.push(` ${champ.avg_tps.toFixed(2)} t/s average`);
  lines.push(rule60);

  // Build recommended command
  const champModel = MODELS.find((m) => m.name === champ.model);
  const cmdParts = [];
  cmdParts.push(`llama-server --model ${champModel.path}`);
  cmdParts.push(`-ngl ${cp.ngl} -c ${CONTEXT}`);
  cmdParts.push(`-t ${cp.t} -tb ${cp.t}`);
  cmdParts.push(`-ub ${cp.ub} -b ${cp.b}`);
  cmdParts.push(`-fa on`);
  cmdParts.push(`--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`);
  cmdParts.push(`--prio ${cp.prio || 3} --poll 50`);
  cmdParts.push(`--mlock`);
  if (cp.cpuMoe) {
    cmdParts.push("--cpu-moe");
  } else if ((cp.nCpuMoe || 0) > 0) {
    cmdParts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
  }
  if (cp.nommap) cmdParts.push("--no-mmap");
  cmdParts.push("--port 8000 --host 0.0.0.0");

  lines.push("", " Recommended command:");
  lines.push(` ${cmdParts.join(" ")}`);

  const summary = lines.join("\n");
  console.log(summary);
  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json",
    JSON.stringify(ALL_RESULTS, null, 2));

  log(`\n Results: scripts/dual_gpu_results.json`);
  log(` Summary: scripts/dual_gpu_summary.txt`);
  log(` DONE!`);

  await killServer();
}
|
||||
|
||||
// Script entry point: run main() and, if it rejects, report the error and
// exit with status 1.
main().catch((e) => {
  console.error("Fatal error:", e);
  process.exit(1);
});
|
||||
644
scripts/_archive/tuning/dual_gpu_benchmark.py
Normal file
644
scripts/_archive/tuning/dual_gpu_benchmark.py
Normal file
@@ -0,0 +1,644 @@
|
||||
"""
|
||||
Dual-GPU (2x RTX 3060 24GB) Comprehensive Model Benchmark
|
||||
==========================================================
|
||||
Tests 4 models across multiple parameter configurations to find
|
||||
the absolute best model + settings for 256K context coding agent.
|
||||
|
||||
Models:
|
||||
1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB)
|
||||
2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB)
|
||||
3. Gemma4 26B-A4B Q4_K_M (~15.6 GB)
|
||||
4. Gemma4 26B-A4B MXFP4_MOE (~15.5 GB)
|
||||
|
||||
Test Phases (per model):
|
||||
Phase 0: Basic dual-GPU startup test (can it even boot at 256K?)
|
||||
Phase 1: GPU layer + MoE offload strategy sweep
|
||||
Phase 2: CPU thread sweep (carry best from P1)
|
||||
Phase 3: Batch size sweep (carry best from P1+P2)
|
||||
Phase 4: KV cache type sweep (carry best from P1+P2+P3)
|
||||
Phase 5: Final verification (5 runs)
|
||||
|
||||
Output: scripts/dual_gpu_results.json (all raw data)
|
||||
scripts/dual_gpu_summary.txt (human-readable winner)
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
import datetime
|
||||
|
||||
# Switch stdout to UTF-8 (the script prints non-ASCII symbols such as "★" and
# "—"); if stdout cannot be reconfigured, carry on with the default encoding.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    pass
|
||||
|
||||
# ─── Configuration ───────────────────────────────────────────────
|
||||
# Base URL used for the /health and /v1/chat/completions requests.
BASE_URL = "http://127.0.0.1:8000"
# llama-server executable; a relative path, resolved against the working directory.
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
CONTEXT = 262144  # 256K
BENCHMARK_RUNS = 3  # timed runs per configuration in test_config
BENCHMARK_TOKENS = 200  # default max_tokens for one benchmark request
SERVER_TIMEOUT = 300  # seconds to wait for server startup

# Models to benchmark, in run order. "name" and "quant" are copied into each
# result record, "path" is passed to the server as --model, and "total_layers"
# is used by the offload phases.
MODELS = [
    {
        "name": "Qwen3.5-35B-A3B Q4_K_M",
        "path": r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf",
        "type": "qwen",
        "quant": "Q4_K_M",
        "is_mxfp4": False,
        "total_layers": 64,  # Qwen3.5 35B has 64 layers
    },
    {
        "name": "Qwen3.5-35B-A3B MXFP4_MOE",
        "path": r"models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf",
        "type": "qwen",
        "quant": "MXFP4_MOE",
        "is_mxfp4": True,
        "total_layers": 64,
    },
    {
        "name": "Gemma4 26B-A4B Q4_K_M",
        "path": r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf",
        "type": "gemma4",
        "quant": "Q4_K_M",
        "is_mxfp4": False,
        "total_layers": 30,  # Gemma4 26B has 30 layers
    },
    {
        "name": "Gemma4 26B-A4B MXFP4_MOE",
        "path": r"models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf",
        "type": "gemma4",
        "quant": "MXFP4_MOE",
        "is_mxfp4": True,
        "total_layers": 30,
    },
]

# Accumulates every result dict appended by test_config and phase5_final;
# main() dumps it to scripts/dual_gpu_results.json.
ALL_RESULTS = []
|
||||
|
||||
|
||||
# ─── Utility Functions ──────────────────────────────────────────
|
||||
def log(msg):
    """Print *msg* prefixed with the current local time as ``[HH:MM:SS]``.

    Output is flushed immediately.
    """
    stamp = datetime.datetime.now().strftime("%H:%M:%S")
    print("[{}] {}".format(stamp, msg), flush=True)
|
||||
|
||||
|
||||
def kill_server():
    """Force-kill every ``llama-server.exe`` process via taskkill, then sleep 5 s.

    taskkill's output and exit status are captured and ignored, so calling
    this when no server is running is harmless.
    """
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill_cmd, capture_output=True)
    # Presumably gives the OS time to release the process before the next launch.
    time.sleep(5)
|
||||
|
||||
|
||||
def get_vram_all():
    """Query ``nvidia-smi`` for per-GPU memory usage.

    Returns:
        A list with one dict per GPU line reported, each holding the integer
        keys ``"gpu"`` (index), ``"used"`` and ``"total"`` (callers print
        these as MiB). Lines with fewer than three comma-separated fields are
        ignored. Returns ``[]`` if the command or the parsing fails.
    """
    query = [
        "nvidia-smi", "--query-gpu=index,memory.used,memory.total",
        "--format=csv,noheader,nounits",
    ]
    try:
        completed = subprocess.run(
            query, capture_output=True, text=True, timeout=5
        )
        rows = (
            [field.strip() for field in line.split(",")]
            for line in completed.stdout.strip().split("\n")
        )
        return [
            {"gpu": int(row[0]), "used": int(row[1]), "total": int(row[2])}
            for row in rows
            if len(row) >= 3
        ]
    except Exception:
        return []
|
||||
|
||||
|
||||
def build_cmd(model_path, ngl, t, ub, b, ctk, ctv,
              cpu_moe=False, n_cpu_moe=0, prio=3, nommap=False):
    """Assemble the llama-server argument list for one configuration.

    Args:
        model_path: Value for ``--model``.
        ngl: Value for ``-ngl``.
        t: Thread count, used for both ``-t`` and ``-tb``.
        ub: Value for ``-ub``.
        b: Value for ``-b``.
        ctk: Value for ``--cache-type-k``.
        ctv: Value for ``--cache-type-v``.
        cpu_moe: If true, append ``--cpu-moe`` (takes precedence over
            ``n_cpu_moe``).
        n_cpu_moe: If greater than 0 and ``cpu_moe`` is false, append
            ``--n-cpu-moe <n>``.
        prio: Value for ``--prio``.
        nommap: If true, append ``--no-mmap``.

    Returns:
        The command as a list, starting with ``LLAMA_SERVER``.
    """
    flag_values = (
        ("--model", model_path),
        ("-ngl", str(ngl)),
        ("-c", str(CONTEXT)),
        ("-np", "1"),
        ("-fa", "on"),
        ("--cache-type-k", ctk),
        ("--cache-type-v", ctv),
        ("-ub", str(ub)),
        ("-b", str(b)),
        ("-t", str(t)),
        ("-tb", str(t)),
        ("--prio", str(prio)),
        ("--poll", "50"),
    )
    argv = [LLAMA_SERVER]
    for flag, value in flag_values:
        argv.extend((flag, value))
    argv.append("--mlock")
    argv.extend(("--port", "8000", "--host", "0.0.0.0"))

    # MoE offloading options
    if cpu_moe:
        argv.append("--cpu-moe")
    elif n_cpu_moe > 0:
        argv.extend(("--n-cpu-moe", str(n_cpu_moe)))
    if nommap:
        argv.append("--no-mmap")
    return argv
|
||||
|
||||
|
||||
def start_server(model_path, **kwargs):
    """Launch llama-server for *model_path* and return its ``Popen`` handle.

    Args:
        model_path: Path of the GGUF model to load.
        **kwargs: Forwarded unchanged to ``build_cmd``.

    Returns:
        The ``subprocess.Popen`` object of the started server.

    The server's stdout and stderr are discarded. Nothing in this script reads
    them, and an unread ``subprocess.PIPE`` fills up and blocks the child once
    the OS pipe buffer is full, which can stall the server under test.
    """
    cmd = build_cmd(model_path, **kwargs)
    log(f" CMD: {' '.join(cmd[-20:])}")  # show last 20 args
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        cwd=os.getcwd(),
    )
    return proc
|
||||
|
||||
|
||||
def wait_for_server(timeout=SERVER_TIMEOUT):
    """Poll ``/health`` every 3 seconds until the server reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        ``(True, seconds_waited)`` once the health endpoint returns JSON with
        ``"status": "ok"``; ``(False, timeout)`` if that never happens in time.
    """
    health_url = f"{BASE_URL}/health"
    began = time.time()
    while time.time() - began < timeout:
        status = None
        try:
            probe = urllib.request.Request(health_url)
            with urllib.request.urlopen(probe, timeout=3) as resp:
                status = json.loads(resp.read()).get("status")
        except Exception:
            # Server not up yet (or returned something unexpected): retry.
            status = None
        if status == "ok":
            return True, time.time() - began
        time.sleep(3)
    return False, timeout
|
||||
|
||||
|
||||
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Send one chat-completion request and measure its throughput.

    Args:
        max_tokens: ``max_tokens`` value sent with the request.

    Returns:
        A dict with ``"tps"`` (completion tokens divided by the wall-clock
        time of the whole request), ``"completion_tokens"``,
        ``"prompt_tokens"`` and ``"elapsed"`` (seconds).

    Exceptions raised by the HTTP request or JSON decoding propagate to the
    caller.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user",
                      "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    began = time.time()
    with urllib.request.urlopen(request, timeout=600) as resp:
        reply = json.loads(resp.read())
    seconds = time.time() - began

    usage = reply.get("usage", {})
    completion = usage.get("completion_tokens", 0)
    prompt = usage.get("prompt_tokens", 0)
    if seconds > 0:
        rate = completion / seconds
    else:
        rate = 0
    return {
        "tps": rate,
        "completion_tokens": completion,
        "prompt_tokens": prompt,
        "elapsed": seconds,
    }
|
||||
|
||||
|
||||
def test_config(model_info, label, **kwargs):
    """Boot the server with one configuration and benchmark it.

    Args:
        model_info: Entry from ``MODELS`` (its "path", "name" and "quant"
            keys are read).
        label: Short name for this configuration, used in logs and results.
        **kwargs: Server parameters forwarded to ``start_server``; stored
            verbatim as the result's ``"params"``.

    Returns:
        The result dict (also appended to ``ALL_RESULTS``), or ``None`` if
        the server did not become ready or every benchmark run failed.
    """
    kill_server()
    log(f" [{label}] Starting server...")

    server = start_server(model_info["path"], **kwargs)
    ready, boot_time = wait_for_server()
    if not ready:
        log(f" [{label}] FAILED to start (timeout {SERVER_TIMEOUT}s)")
        server.kill()
        return None

    vram = get_vram_all()
    vram_str = " | ".join(f"GPU{g['gpu']}:{g['used']}/{g['total']}MiB" for g in vram)
    log(f" [{label}] Boot: {boot_time:.0f}s | VRAM: {vram_str}")

    # Warmup request; its outcome is deliberately ignored.
    try:
        run_benchmark(max_tokens=20)
    except Exception:
        pass

    # Timed runs: a failed run is logged and left out of the statistics.
    speeds = []
    for run_no in range(1, BENCHMARK_RUNS + 1):
        try:
            sample = run_benchmark()
            speeds.append(sample["tps"])
            log(f" Run {run_no}: {sample['tps']:.2f} t/s")
        except Exception as e:
            log(f" Run {run_no}: ERROR ({e})")

    server.kill()

    if not speeds:
        log(f" [{label}] ALL BENCHMARK RUNS FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    log(f" [{label}] => AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

    result = {
        "model": model_info["name"],
        "quant": model_info["quant"],
        "label": label,
        "avg_tps": round(avg, 2),
        "best_tps": round(best, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": kwargs,
    }
    ALL_RESULTS.append(result)
    return result
|
||||
|
||||
|
||||
# ─── Phase Runners ───────────────────────────────────────────────
|
||||
|
||||
def phase0_boot_test(model):
    """Phase 0: find any configuration that boots at the configured context.

    Tries, in order: all layers on GPU (``ngl=999``), the same with
    ``--cpu-moe``, and finally half of the model's layers on GPU. Returns the
    first successful ``test_config`` result, or the last attempt's result
    (``None``) if all three fail.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 0: Boot Test — {model['name']}")
    log(f"{rule}")

    def attempt(label, ngl, **extra):
        # All attempts share the same thread, batch and KV-cache settings.
        return test_config(
            model, label,
            ngl=ngl, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            **extra,
        )

    # Baseline: -ngl 999 (all layers to GPU).
    outcome = attempt("boot-ngl999", 999)

    if not outcome:
        log(" Full GPU failed, trying with --cpu-moe...")
        outcome = attempt("boot-cpumoe", 999, cpu_moe=True)

    if not outcome:
        # Extreme fallback: fewer layers.
        log(" --cpu-moe also failed, trying reduced layers...")
        outcome = attempt("boot-ngl-half", model["total_layers"] // 2)

    return outcome
|
||||
|
||||
|
||||
def phase1_gpu_offload(model, baseline):
    """Phase 1: compare MoE offload strategies with all layers on GPU.

    Args:
        model: Entry from ``MODELS``.
        baseline: Result from phase 0 (may be falsy); it competes with the
            configurations tested here.

    Returns:
        The result with the highest ``"avg_tps"``, or ``None`` if nothing
        worked.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 1: GPU Offload Strategy — {model['name']}")
    log(f"{rule}")

    candidates = [baseline] if baseline else []
    total = model["total_layers"]
    shared = dict(ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0")

    # Strategy A: all layers on GPU, with and without --cpu-moe.
    for use_cpu_moe in (True, False):
        # Skip the variant phase 0 already measured as the baseline.
        if baseline and baseline["label"] in ("boot-ngl999", "boot-cpumoe") \
                and baseline["params"].get("cpu_moe", False) == use_cpu_moe:
            continue
        outcome = test_config(
            model, f"ngl=999 cpu_moe={use_cpu_moe}",
            **shared, cpu_moe=use_cpu_moe,
        )
        if outcome:
            candidates.append(outcome)

    # Strategy B: n-cpu-moe sweep (selective expert offload).
    for count in (0, 5, 10, 15, 20):
        if count > total:
            continue
        outcome = test_config(
            model, f"n-cpu-moe={count}",
            **shared, n_cpu_moe=count,
        )
        if outcome:
            candidates.append(outcome)

    if not candidates:
        log(" PHASE 1: No configuration worked!")
        return None

    best = max(candidates, key=lambda res: res["avg_tps"])
    log(f"\n ★ Phase 1 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
|
||||
|
||||
|
||||
def phase2_threads(model, prev_best):
    """Phase 2: sweep the thread count, keeping the other parameters fixed.

    Args:
        model: Entry from ``MODELS``.
        prev_best: Winning result of the previous phase; it also competes.

    Returns:
        The result with the highest ``"avg_tps"`` among ``prev_best`` and the
        thread counts tested here.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 2: CPU Thread Sweep — {model['name']}")
    log(f"{rule}")

    base = prev_best["params"]
    # Parameter template; only "t" is overridden for each candidate.
    template = {
        "ngl": base["ngl"],
        "t": base.get("t", 6),
        "ub": base["ub"],
        "b": base["b"],
        "ctk": base["ctk"],
        "ctv": base["ctv"],
        "cpu_moe": base.get("cpu_moe", False),
        "n_cpu_moe": base.get("n_cpu_moe", 0),
    }
    contenders = [prev_best]

    for threads in (2, 4, 6, 8, 10, 12):
        if threads == template["t"]:
            continue  # already measured by prev_best
        outcome = test_config(model, f"t={threads}", **{**template, "t": threads})
        if outcome:
            contenders.append(outcome)

    best = max(contenders, key=lambda res: res["avg_tps"])
    log(f"\n ★ Phase 2 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
|
||||
|
||||
|
||||
def phase3_batch(model, prev_best):
    """Phase 3: sweep (ub, b) batch-size pairs, other parameters fixed.

    Args:
        model: Entry from ``MODELS``.
        prev_best: Winning result of the previous phase; it also competes.

    Returns:
        The result with the highest ``"avg_tps"`` among ``prev_best`` and the
        batch pairs tested here.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 3: Batch Size Sweep — {model['name']}")
    log(f"{rule}")

    base = prev_best["params"]
    # Parameter template; only "ub" and "b" are overridden for each candidate.
    template = {
        "ngl": base["ngl"],
        "t": base["t"],
        "ub": base["ub"],
        "b": base["b"],
        "ctk": base["ctk"],
        "ctv": base["ctv"],
        "cpu_moe": base.get("cpu_moe", False),
        "n_cpu_moe": base.get("n_cpu_moe", 0),
    }
    contenders = [prev_best]

    batch_pairs = (
        (128, 512), (256, 1024), (256, 2048),
        (512, 1024), (512, 2048), (512, 4096),
        (1024, 2048), (1024, 4096),
    )
    for ub, b in batch_pairs:
        if (ub, b) == (template["ub"], template["b"]):
            continue  # already measured by prev_best
        outcome = test_config(
            model, f"ub={ub} b={b}", **{**template, "ub": ub, "b": b}
        )
        if outcome:
            contenders.append(outcome)

    best = max(contenders, key=lambda res: res["avg_tps"])
    log(f"\n ★ Phase 3 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
|
||||
|
||||
|
||||
def phase4_kvcache(model, prev_best):
    """Phase 4: sweep KV-cache type pairs, other parameters fixed.

    Args:
        model: Entry from ``MODELS``.
        prev_best: Winning result of the previous phase; it also competes.

    Returns:
        The result with the highest ``"avg_tps"`` among ``prev_best`` and the
        cache-type pairs tested here.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 4: KV Cache Type Sweep — {model['name']}")
    log(f"{rule}")

    base = prev_best["params"]
    # Parameter template; only "ctk" and "ctv" are overridden for each candidate.
    template = {
        "ngl": base["ngl"],
        "t": base["t"],
        "ub": base["ub"],
        "b": base["b"],
        "ctk": base["ctk"],
        "ctv": base["ctv"],
        "cpu_moe": base.get("cpu_moe", False),
        "n_cpu_moe": base.get("n_cpu_moe", 0),
    }
    contenders = [prev_best]

    cache_pairs = (
        ("q4_0", "q4_0"), ("q8_0", "q8_0"),
        ("q4_0", "q8_0"), ("f16", "f16"),
    )
    for ctk, ctv in cache_pairs:
        if (ctk, ctv) == (template["ctk"], template["ctv"]):
            continue  # already measured by prev_best
        outcome = test_config(
            model, f"kv={ctk}/{ctv}", **{**template, "ctk": ctk, "ctv": ctv}
        )
        if outcome:
            contenders.append(outcome)

    best = max(contenders, key=lambda res: res["avg_tps"])
    log(f"\n ★ Phase 4 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
|
||||
|
||||
|
||||
def phase5_final(model, prev_best):
    """Phase 5: re-run the winning configuration with five timed runs.

    Args:
        model: Entry from ``MODELS``.
        prev_best: Winning result of the earlier phases.

    Returns:
        A new result dict labelled ``FINAL-<model name>`` (also appended to
        ``ALL_RESULTS``), or ``prev_best`` unchanged if the server fails to
        start or every run fails.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 5: Final Verification (5 runs) — {model['name']}")
    log(f"{rule}")

    params = prev_best["params"]
    kill_server()
    server = start_server(model["path"], **params)
    ready, boot_time = wait_for_server()
    if not ready:
        log(" FAILED to start for final verification!")
        server.kill()
        return prev_best

    vram = get_vram_all()

    # Warmup request; its outcome is deliberately ignored.
    try:
        run_benchmark(max_tokens=20)
    except Exception:
        pass

    speeds = []
    for run_no in range(1, 6):
        try:
            sample = run_benchmark()
            speeds.append(sample["tps"])
            log(f" Final Run {run_no}: {sample['tps']:.2f} t/s")
        except Exception as e:
            log(f" Final Run {run_no}: ERROR ({e})")

    server.kill()

    if not speeds:
        return prev_best

    avg = sum(speeds) / len(speeds)
    best_tps = max(speeds)
    log(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best_tps:.2f} t/s")

    final = {
        "model": model["name"],
        "quant": model["quant"],
        "label": f"FINAL-{model['name']}",
        "avg_tps": round(avg, 2),
        "best_tps": round(best_tps, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": params,
    }
    ALL_RESULTS.append(final)
    return final
|
||||
|
||||
|
||||
# ─── Main ────────────────────────────────────────────────────────
|
||||
|
||||
def run_full_benchmark_for_model(model):
    """Run all benchmark phases for a single model.

    Args:
        model: Entry from ``MODELS``.

    Returns:
        The final result dict for the model, the phase 0 baseline if phase 1
        found nothing, or ``None`` if the model file is missing or the model
        cannot boot.
    """
    log(f"\n{'#'*70}")
    log(f" MODEL: {model['name']}")
    log(f" File: {model['path']}")

    # Check the file exists BEFORE asking for its size: os.path.getsize raises
    # on a missing file, which would abort the whole run instead of skipping
    # this model.
    if not os.path.exists(model["path"]):
        log(f"{'#'*70}")
        log(f" SKIP: Model file not found!")
        return None

    log(f" Size: {os.path.getsize(model['path'])/1024**3:.2f} GB")
    log(f"{'#'*70}")

    # Phase 0: Can it boot?
    baseline = phase0_boot_test(model)
    if not baseline:
        log(f" SKIP: {model['name']} cannot boot at 256K context!")
        return None

    # Phase 1: GPU offload strategy
    best = phase1_gpu_offload(model, baseline)
    if not best:
        return baseline

    # Phase 2: CPU threads
    best = phase2_threads(model, best)

    # Phase 3: Batch sizes
    best = phase3_batch(model, best)

    # Phase 4: KV cache
    best = phase4_kvcache(model, best)

    # Phase 5: Final verification
    final = phase5_final(model, best)

    return final
|
||||
|
||||
|
||||
def main():
    """Benchmark every model in ``MODELS`` and report the overall ranking.

    Writes all raw results to ``scripts/dual_gpu_results.json`` (after each
    model and again at the end) and a human-readable ranking with a
    recommended command to ``scripts/dual_gpu_summary.txt``.
    """
    start_time = time.time()

    log("=" * 70)
    log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK")
    log(" 2x RTX 3060 (24GB Total) | 256K Context")
    log(f" Models: {len(MODELS)}")
    log(f" Started: {datetime.datetime.now().isoformat()}")
    log("=" * 70)

    # Show GPU info
    for gpu in get_vram_all():
        log(f" GPU {gpu['gpu']}: {gpu['used']}/{gpu['total']} MiB used")

    # Run benchmarks for each model
    model_winners = []
    for position, model in enumerate(MODELS, 1):
        log(f"\n{'='*70}")
        log(f" STARTING MODEL {position}/{len(MODELS)}: {model['name']}")
        log(f"{'='*70}")

        winner = run_full_benchmark_for_model(model)
        if winner:
            model_winners.append(winner)

        # Save intermediate results
        with open("scripts/dual_gpu_results.json", "w") as f:
            json.dump(ALL_RESULTS, f, indent=2, default=str)
        log(f" Intermediate results saved ({len(ALL_RESULTS)} configs tested)")

    # ─── Grand Final Comparison ──────────────────────────────────
    elapsed = (time.time() - start_time) / 60

    log(f"\n{'='*70}")
    log(f" GRAND FINAL COMPARISON")
    log(f" Total time: {elapsed:.1f} minutes")
    log(f" Configs tested: {len(ALL_RESULTS)}")
    log(f"{'='*70}")

    if not model_winners:
        log(" No models were able to run at 256K context!")
        return

    # Sort by avg t/s, fastest first
    model_winners.sort(key=lambda res: res["avg_tps"], reverse=True)

    summary_lines = [
        f"Dual-GPU Benchmark Results — {datetime.datetime.now().isoformat()}",
        f"Hardware: 2x RTX 3060 12GB | Context: 256K",
        f"Total configs tested: {len(ALL_RESULTS)}",
        f"Total time: {elapsed:.1f} minutes",
        "",
        "=" * 60,
        " RANKING (by AVG t/s)",
        "=" * 60,
    ]

    medals = ("🥇", "🥈", "🥉")
    for rank, w in enumerate(model_winners, 1):
        medal = medals[rank - 1] if rank <= len(medals) else " "
        summary_lines.append(f"\n {medal} #{rank}: {w['model']}")
        summary_lines.append(f" AVG: {w['avg_tps']:.2f} t/s | BEST: {w['best_tps']:.2f} t/s")
        summary_lines.append(f" Boot: {w['boot_time']:.0f}s")
        p = w["params"]
        summary_lines.append(f" ngl={p['ngl']} t={p['t']} ub={p['ub']} b={p['b']}")
        summary_lines.append(f" ctk={p['ctk']} ctv={p['ctv']}")
        if p.get("cpu_moe"):
            summary_lines.append(f" --cpu-moe")
        elif p.get("n_cpu_moe", 0) > 0:
            summary_lines.append(f" --n-cpu-moe {p['n_cpu_moe']}")

    champion = model_winners[0]
    summary_lines.extend([
        f"\n{'='*60}",
        f" ★ CHAMPION: {champion['model']}",
        f" {champion['avg_tps']:.2f} t/s average",
        f"{'='*60}",
    ])

    # Build recommended command from the champion's parameters
    p = champion["params"]
    model_names = [m['name'] for m in MODELS]
    champion_path = MODELS[model_names.index(champion['model'])]['path']
    cmd_parts = [
        f"llama-server --model {champion_path}",
        f"-ngl {p['ngl']} -c {CONTEXT}",
        f"-t {p['t']} -tb {p['t']}",
        f"-ub {p['ub']} -b {p['b']}",
        "-fa on",
        f"--cache-type-k {p['ctk']} --cache-type-v {p['ctv']}",
        f"--prio {p.get('prio', 3)} --poll 50",
        "--mlock",
    ]
    if p.get("cpu_moe"):
        cmd_parts.append("--cpu-moe")
    elif p.get("n_cpu_moe", 0) > 0:
        cmd_parts.append(f"--n-cpu-moe {p['n_cpu_moe']}")
    if p.get("nommap"):
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")

    summary_lines.append(f"\n Recommended command:")
    summary_lines.append(f" {' '.join(cmd_parts)}")

    summary = "\n".join(summary_lines)
    print(summary)

    with open("scripts/dual_gpu_summary.txt", "w", encoding="utf-8") as f:
        f.write(summary)

    with open("scripts/dual_gpu_results.json", "w") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)

    log(f"\n Results: scripts/dual_gpu_results.json")
    log(f" Summary: scripts/dual_gpu_summary.txt")
    log(f" DONE!")

    kill_server()
|
||||
|
||||
|
||||
# Run the full benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
330
scripts/_archive/tuning/dual_gpu_benchmark_v2.mjs
Normal file
330
scripts/_archive/tuning/dual_gpu_benchmark_v2.mjs
Normal file
@@ -0,0 +1,330 @@
|
||||
/**
|
||||
* Dual-GPU (2x RTX 3060 24GB) Smart Model Benchmark v2
|
||||
* =====================================================
|
||||
* Informed by VRAM analysis — tests models in optimal order.
|
||||
*
|
||||
* Key insights applied:
|
||||
* - Gemma4 fits entirely in 24GB GPU (KV cache ~0.18 GB with SWA)
|
||||
* - Qwen3.5 is tight (~22.5-22.9 GB needed) — try full GPU first
|
||||
* - Skip configs known to fail, minimize wasted time
|
||||
*
|
||||
* Run: node scripts/dual_gpu_benchmark_v2.mjs
|
||||
* Results: scripts/dual_gpu_results.json + scripts/dual_gpu_summary.txt
|
||||
*/
|
||||
|
||||
import { spawn, execSync } from "child_process";
|
||||
import { writeFileSync, existsSync, statSync } from "fs";
|
||||
|
||||
// Base URL used for the /health and /v1/chat/completions requests.
const BASE_URL = "http://127.0.0.1:8000";
// llama-server executable; a relative path, spawned with cwd = process.cwd().
const LLAMA = String.raw`llama_bin_run\llama-server.exe`;
// Context size passed as -c (262144 = 256K).
const CTX = 262144;
// Timed runs per configuration in testConfig.
const RUNS = 3;
// Default max_tokens for one bench() request.
const TOKENS = 200;
// Default waitReady timeout in milliseconds.
const BOOT_TIMEOUT = 300_000;

// Models ordered: smallest first (most likely to succeed fully on GPU)
// fitsGPU is either true or "maybe"; tuneModel tries a pure-GPU boot for both.
const MODELS = [
  {
    name: "Gemma4-26B MXFP4_MOE",
    path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
    quant: "MXFP4_MOE",
    fitsGPU: true, // 15.5 + 0.18 + 1 = 16.72 GB << 23 GB
  },
  {
    name: "Gemma4-26B Q4_K_M",
    path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
    quant: "Q4_K_M",
    fitsGPU: true, // 15.6 + 0.18 + 1 = 16.82 GB << 23 GB
  },
  {
    name: "Qwen3.5-35B MXFP4_MOE",
    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
    quant: "MXFP4_MOE",
    fitsGPU: "maybe", // 20.1 + 1.41 + 1 = 22.51 GB — tight
  },
  {
    name: "Qwen3.5-35B Q4_K_M",
    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
    quant: "Q4_K_M",
    fitsGPU: "maybe", // 20.5 + 1.41 + 1 = 22.91 GB — very tight
  },
];

// Every result record pushed by testConfig and tuneModel, in run order.
const ALL = [];
// Handle of the most recently spawned server process, or null; cleared by kill().
let currentProc = null;
|
||||
|
||||
// ─── Utilities ─────────────────────────────────────────────────
|
||||
/** Print `m` prefixed with the current local time (ko-KR locale, 24-hour). */
const log = (m) => {
  const stamp = new Date().toLocaleTimeString("ko-KR", { hour12: false });
  console.log(`[${stamp}] ${m}`);
};

/** Return a promise that resolves after `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
|
||||
|
||||
/**
 * Stop the tracked server process (if any), force-kill every remaining
 * llama-server.exe via taskkill, then wait 5 seconds. Errors from either
 * kill attempt are ignored.
 */
async function kill() {
  const tracked = currentProc;
  if (tracked) {
    currentProc = null;
    try {
      tracked.kill("SIGKILL");
    } catch {
      // ignored: best-effort kill
    }
  }

  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // ignored: best-effort kill
  }

  await sleep(5000);
}
|
||||
|
||||
/**
 * Query nvidia-smi for per-GPU memory usage.
 *
 * @returns {Array<{gpu: number, used: number, total: number}>} one entry per
 *   output line (callers print the values as MiB), or [] if the command fails.
 */
function vram() {
  const query =
    'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits';
  try {
    const output = execSync(query, { encoding: "utf-8", timeout: 5000 });
    const rows = [];
    for (const line of output.trim().split("\n")) {
      const fields = line.split(",").map((s) => parseInt(s));
      rows.push({ gpu: fields[0], used: fields[1], total: fields[2] });
    }
    return rows;
  } catch {
    return [];
  }
}
|
||||
|
||||
/**
 * Spawn llama-server for `modelPath` with the parameters in `p` and remember
 * the process in `currentProc`.
 *
 * @param {string} modelPath value for --model
 * @param {object} p parameters: ngl, t, ub, b, ctk, ctv, and optionally
 *   prio (default 3), cpuMoe, nCpuMoe, nommap
 * @returns the spawned ChildProcess
 */
function startServer(modelPath, p) {
  const args = [
    "--model", modelPath, "-ngl", String(p.ngl),
    "-c", String(CTX), "-np", "1", "-fa", "on",
    "--cache-type-k", p.ctk, "--cache-type-v", p.ctv,
    "-ub", String(p.ub), "-b", String(p.b),
    "-t", String(p.t), "-tb", String(p.t),
    "--prio", String(p.prio || 3), "--poll", "50", "--mlock",
    "--port", "8000", "--host", "0.0.0.0",
  ];
  // --cpu-moe takes precedence over --n-cpu-moe.
  if (p.cpuMoe) args.push("--cpu-moe");
  else if ((p.nCpuMoe || 0) > 0) args.push("--n-cpu-moe", String(p.nCpuMoe));
  if (p.nommap) args.push("--no-mmap");

  // Discard the server's output. Nothing here reads it, and unread stdio
  // pipes have limited capacity: once full, the child blocks on write, which
  // can stall the server being benchmarked.
  currentProc = spawn(LLAMA, args, { cwd: process.cwd(), stdio: "ignore" });
  return currentProc;
}
|
||||
|
||||
/**
 * Poll /health every 3 seconds until it returns JSON with status "ok".
 *
 * @param {number} timeout maximum time to keep polling, in milliseconds
 * @returns {Promise<{ok: boolean, boot: number}>} boot is the seconds waited
 *   on success, or timeout / 1000 on failure
 */
async function waitReady(timeout = BOOT_TIMEOUT) {
  const began = Date.now();
  const healthUrl = `${BASE_URL}/health`;

  while (Date.now() - began < timeout) {
    try {
      const res = await fetch(healthUrl, { signal: AbortSignal.timeout(3000) });
      const body = await res.json();
      if (body.status === "ok") {
        return { ok: true, boot: (Date.now() - began) / 1000 };
      }
    } catch {
      // Server not reachable yet: retry after the pause below.
    }
    await sleep(3000);
  }

  return { ok: false, boot: timeout / 1000 };
}
|
||||
|
||||
/**
 * Send one chat-completion request and measure its throughput.
 *
 * @param {number} n max_tokens for the request
 * @returns {Promise<{tps: number, ct: number, dt: number}>} ct is the
 *   completion-token count, dt the wall-clock seconds of the whole request,
 *   tps their ratio
 */
async function bench(n = TOKENS) {
  const began = Date.now();

  const payload = {
    model: "m",
    messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
    max_tokens: n,
    temperature: 0,
  };
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(600_000),
  });
  const reply = await res.json();

  const dt = (Date.now() - began) / 1000;
  const ct = reply.usage?.completion_tokens || 0;
  return { tps: ct / dt, ct, dt };
}
|
||||
|
||||
/**
 * Boot the server with one configuration and benchmark it.
 *
 * @param {object} model entry from MODELS
 * @param {string} label short name used in logs and in the result record
 * @param {object} params server parameters passed to startServer
 * @returns the result record (also pushed to ALL), or null if the server did
 *   not become ready or every timed run failed
 */
async function testConfig(model, label, params) {
  await kill();
  log(` [${label}] Starting...`);
  startServer(model.path, params);

  const ready = await waitReady();
  if (!ready.ok) {
    log(` [${label}] ✗ FAILED (timeout)`);
    await kill();
    return null;
  }
  const boot = ready.boot;

  const v = vram();
  const vs = v.map((g) => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | ");
  log(` [${label}] Boot:${boot.toFixed(0)}s | VRAM: ${vs}`);

  // Warmup request; its outcome is deliberately ignored.
  try {
    await bench(20);
  } catch {
    // ignored
  }

  // Timed runs: a failed run is logged and left out of the statistics.
  const speeds = [];
  for (let run = 1; run <= RUNS; run++) {
    try {
      const sample = await bench();
      speeds.push(sample.tps);
      log(` Run${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run${run}: ERR ${e.message}`);
    }
  }
  await kill();

  if (speeds.length === 0) {
    log(` [${label}] ✗ ALL RUNS FAILED`);
    return null;
  }

  const avg = speeds.reduce((sum, s) => sum + s) / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);

  const res = {
    model: model.name,
    quant: model.quant,
    label,
    avg_tps: +avg.toFixed(2),
    best_tps: +best.toFixed(2),
    boot: +boot.toFixed(1),
    vram: v,
    params,
  };
  ALL.push(res);
  return res;
}
|
||||
|
||||
// Save intermediate results after each test
|
||||
/** Write everything collected in ALL so far to scripts/dual_gpu_results.json. */
function saveIntermediate() {
  const serialized = JSON.stringify(ALL, null, 2);
  writeFileSync("scripts/dual_gpu_results.json", serialized);
}
|
||||
|
||||
// ─── Smart Phase Runner ────────────────────────────────────────
|
||||
|
||||
/**
 * Tune one model: find a configuration that boots, then sweep threads, batch
 * sizes and KV-cache types (keeping the best so far), and finish with a
 * five-run verification of the winner.
 *
 * @param {object} model entry from MODELS
 * @returns the final result record, the best sweep result if verification
 *   could not run, or null if the file is missing or nothing boots
 */
async function tuneModel(model) {
  const hashRule = "#".repeat(65);
  log(`\n${hashRule}`);
  log(` ${model.name} (${model.quant})`);
  if (!existsSync(model.path)) {
    log(" ✗ File not found, SKIP");
    return null;
  }
  const sz = (statSync(model.path).size / 1024 ** 3).toFixed(2);
  log(` Size: ${sz} GB | Fits GPU: ${model.fitsGPU}`);
  log(hashRule);

  // Run one configuration and persist the results collected so far.
  const trial = async (label, params) => {
    const outcome = await testConfig(model, label, params);
    saveIntermediate();
    return outcome;
  };
  const defaults = { ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };

  // ── Step 1: Find working GPU config ──
  log(`\n ── Step 1: Find optimal GPU offload ──`);
  let baseline = null;

  if (model.fitsGPU === true || model.fitsGPU === "maybe") {
    // Try full GPU, no CPU offload
    baseline = await trial("ngl=999 pure-GPU", { ...defaults });
  }

  if (!baseline) {
    // Try n-cpu-moe values (ascending — find minimum needed)
    for (const n of [5, 10, 15, 20]) {
      baseline = await trial(`n-cpu-moe=${n}`, { ...defaults, nCpuMoe: n });
      if (baseline) break; // found minimum working offload
    }
  }

  if (!baseline) {
    // Last resort: full cpu-moe
    baseline = await trial("cpu-moe", { ...defaults, cpuMoe: true });
  }

  if (!baseline) {
    log(` ✗ ${model.name} cannot boot at 256K!`);
    return null;
  }

  const bp = baseline.params; // carry forward best params

  // If pure GPU worked, also test cpu-moe to compare
  if (!bp.cpuMoe && !bp.nCpuMoe) {
    const alt = await trial("compare: cpu-moe", { ...bp, cpuMoe: true });
    if (alt && alt.avg_tps > baseline.avg_tps) {
      baseline = alt;
    }
  }

  let best = baseline;
  // Adopt a candidate only when it ran and beat the current best average.
  const consider = (candidate) => {
    if (candidate && candidate.avg_tps > best.avg_tps) {
      best = candidate;
    }
  };

  // ── Step 2: Thread sweep ──
  log(`\n ── Step 2: Thread sweep ──`);
  for (const t of [2, 4, 8, 10, 12]) {
    if (t === best.params.t) continue;
    consider(await trial(`t=${t}`, { ...best.params, t }));
  }

  // ── Step 3: Batch sweep ──
  log(`\n ── Step 3: Batch sweep ──`);
  const batchPairs = [[256, 1024], [256, 2048], [512, 2048], [512, 4096], [1024, 2048], [1024, 4096]];
  for (const [ub, b] of batchPairs) {
    if (ub === best.params.ub && b === best.params.b) continue;
    consider(await trial(`ub=${ub} b=${b}`, { ...best.params, ub, b }));
  }

  // ── Step 4: KV cache sweep ──
  log(`\n ── Step 4: KV cache type ──`);
  const cachePairs = [["q8_0", "q8_0"], ["q4_0", "q8_0"], ["f16", "f16"]];
  for (const [ctk, ctv] of cachePairs) {
    if (ctk === best.params.ctk && ctv === best.params.ctv) continue;
    consider(await trial(`kv=${ctk}/${ctv}`, { ...best.params, ctk, ctv }));
  }

  // ── Step 5: Final verification (5 runs) ──
  log(`\n ── Step 5: Final verification ──`);
  await kill();
  startServer(model.path, best.params);
  const { ok, boot } = await waitReady();
  if (!ok) {
    await kill();
    return best;
  }
  const v = vram();
  // Warmup request; its outcome is deliberately ignored.
  try {
    await bench(20);
  } catch {
    // ignored
  }

  const finals = [];
  for (let run = 1; run <= 5; run++) {
    try {
      const sample = await bench();
      finals.push(sample.tps);
      log(` Final ${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Final ${run}: ERR`);
    }
  }
  await kill();

  if (finals.length === 0) {
    return best;
  }

  const avg = finals.reduce((sum, s) => sum + s) / finals.length;
  const bst = Math.max(...finals);
  log(` ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${bst.toFixed(2)} t/s`);
  const final = {
    model: model.name,
    quant: model.quant,
    label: `FINAL`,
    avg_tps: +avg.toFixed(2),
    best_tps: +bst.toFixed(2),
    boot: +boot.toFixed(1),
    vram: v,
    params: best.params,
  };
  ALL.push(final);
  saveIntermediate();
  return final;
}
|
||||
|
||||
// ─── Main ──────────────────────────────────────────────────────
|
||||
/**
 * Entry point: tune every entry of MODELS in turn, rank the per-model winners
 * by average throughput, then print the summary and write it (together with
 * the raw results collected in ALL) under scripts/.
 */
async function main() {
  const startedAt = Date.now();
  const wideRule = "=".repeat(65);
  const narrowRule = "=".repeat(55);

  // Banner followed by the current per-GPU memory usage
  log(wideRule);
  log(" DUAL-GPU BENCHMARK v2 — Smart Strategy");
  log(" 2x RTX 3060 (24GB) | 256K Context");
  log(" " + new Date().toISOString());
  log(wideRule);
  for (const g of vram()) {
    log(` GPU${g.gpu}: ${g.used}/${g.total} MiB`);
  }

  // Tune each model; a falsy result from tuneModel() is simply not ranked
  const winners = [];
  for (const [idx, model] of MODELS.entries()) {
    log(`\n${wideRule}`);
    log(` MODEL ${idx + 1}/${MODELS.length}: ${model.name}`);
    log(wideRule);
    const winner = await tuneModel(model);
    if (winner) {
      winners.push(winner);
    }
    saveIntermediate();
  }

  // ─── Summary ──────────────────────────────────────────────
  const elapsed = ((Date.now() - startedAt) / 60000).toFixed(1);
  winners.sort((a, b) => b.avg_tps - a.avg_tps); // fastest first
  const medals = ["🥇", "🥈", "🥉", " "];

  const lines = [];
  lines.push(`Dual-GPU Benchmark v2 — ${new Date().toISOString()}`);
  lines.push(`2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`);
  lines.push("", narrowRule, " RANKING", narrowRule);

  winners.forEach((w, rank) => {
    const p = w.params;
    const medal = medals[rank] || " ";
    lines.push("", ` ${medal} #${rank + 1}: ${w.model}`);
    lines.push(` AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`);
    lines.push(` ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b} ctk=${p.ctk} ctv=${p.ctv}`);
    if (p.cpuMoe) {
      lines.push(` --cpu-moe`);
    } else if (p.nCpuMoe) {
      lines.push(` --n-cpu-moe ${p.nCpuMoe}`);
    }
  });

  if (winners.length > 0) {
    const champion = winners[0];
    const cp = champion.params;
    lines.push("", narrowRule, ` ★ CHAMPION: ${champion.model} — ${champion.avg_tps} t/s`, narrowRule);

    // Rebuild a ready-to-paste command line for the champion configuration
    const championPath = MODELS.find((m) => m.name === champion.model).path;
    let moeFlag = "";
    if (cp.cpuMoe) {
      moeFlag = "--cpu-moe";
    } else if (cp.nCpuMoe) {
      moeFlag = `--n-cpu-moe ${cp.nCpuMoe}`;
    }
    const parts = [
      `llama-server --model ${championPath}`,
      `-ngl ${cp.ngl} -c ${CTX} -t ${cp.t} -tb ${cp.t}`,
      `-ub ${cp.ub} -b ${cp.b} -fa on`,
      `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
      `--prio ${cp.prio || 3} --poll 50 --mlock`,
      moeFlag,
      "--port 8000 --host 0.0.0.0",
    ];
    const cmd = parts.filter(Boolean).join(" "); // drops the empty MoE slot
    lines.push("", " Recommended:", ` ${cmd}`);
  }

  const summary = lines.join("\n");
  console.log("\n" + summary);
  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL, null, 2));
  log(`\n Saved: dual_gpu_results.json + dual_gpu_summary.txt`);
  log(" DONE!");
  await kill();
}

// Any uncaught failure is reported and turned into a non-zero exit code
main().catch((e) => {
  console.error("FATAL:", e);
  process.exit(1);
});
|
||||
101
scripts/_archive/tuning/find_max_dense.mjs
Normal file
101
scripts/_archive/tuning/find_max_dense.mjs
Normal file
@@ -0,0 +1,101 @@
|
||||
import { spawn, exec } from 'child_process';
|
||||
|
||||
// Resolve after `ms` milliseconds.
const delay = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});

/**
 * Force-kill every llama-server.exe via taskkill (its outcome is ignored),
 * then wait two more seconds before resolving.
 */
async function killServer() {
  await new Promise((done) => {
    exec('taskkill /F /IM llama-server.exe', () => done());
  });
  await delay(2000);
}
|
||||
|
||||
/**
 * Boot llama-server with the given context size and, if it becomes healthy,
 * run scripts/quick_pptest.mjs against it and print that script's output.
 *
 * @param {string} modelPath    GGUF file name, relative to the `models` directory.
 * @param {number} contextSize  Value passed to `-c`.
 * @returns {Promise<boolean>}  true when the server booted and the benchmark
 *                              was run, false on OOM / boot failure.
 */
async function testContextSize(modelPath, contextSize) {
  console.log(`\nTesting ${modelPath} with -c ${contextSize}...`);
  await killServer();

  const args = [
    '--model', `models\\${modelPath}`,
    '-ngl', '999',
    '-c', contextSize.toString(),
    '-fa', 'on',
    '--cache-type-k', 'q4_0',
    '--cache-type-v', 'q4_0',
    '-ub', '512',
    '-b', '2048',
    '-t', '6',
    '-tb', '6',
    '--split-mode', 'row',
    '--prio', '3',
    '--fit', 'off',
    '--port', '8000',
    '--host', '0.0.0.0'
  ];

  const server = spawn('llama_bin_run\\llama-server.exe', args, { stdio: 'pipe' });

  let booted = false;
  let oomed = false;
  let exited = false;

  // Without an 'error' listener a failed spawn (e.g. missing binary) would
  // surface as an unhandled event and crash the whole script.
  server.on('error', (err) => {
    exited = true;
    console.log(`Failed to start llama-server: ${err.message}`);
  });
  server.on('exit', () => { exited = true; });

  // stdout is piped but never inspected; drain it so the child cannot block
  // on a full pipe buffer.
  server.stdout?.resume();
  server.stderr?.on('data', (d) => {
    const text = d.toString();
    if (text.toLowerCase().includes('out of memory') || text.includes('failed to allocate')) {
      oomed = true;
    }
  });

  // Poll /health up to 20 times, 2 s apart; stop early once the server has
  // reported OOM or is no longer running.
  for (let i = 0; i < 20; i++) {
    if (oomed || exited) break;
    try {
      // fetch() has no `timeout` option; an AbortSignal is what actually
      // bounds a stalled probe.
      const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
      if (res.status === 200) {
        booted = true;
        break;
      }
    } catch (e) {
      // not reachable yet - retry after the delay
    }
    await delay(2000);
  }

  if (oomed || !booted) {
    // Only claim OOM when the server log actually said so
    const reason = oomed ? 'Out of Memory' : 'server did not become ready';
    console.log(`❌ Failed: ${reason} at -c ${contextSize}`);
    server.kill('SIGKILL');
    await killServer();
    return false;
  }

  console.log(`✅ Booted! Running Benchmark...`);

  // Benchmark: prefer stdout, fall back to stderr when stdout is empty
  const bench = await new Promise(r => exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
    r(stdout || stderr);
  }));

  console.log(bench);
  await killServer();
  return true;
}
|
||||
|
||||
/**
 * Try context sizes from largest to smallest and report the first one for
 * which testContextSize() succeeds; report failure if none does.
 *
 * @param {string} modelName  GGUF file name handed to testContextSize().
 */
async function findMaxContext(modelName) {
  const candidates = [262144, 131072, 65536, 32768, 16384, 8192];

  for (const size of candidates) {
    const booted = await testContextSize(modelName, size);
    if (booted) {
      // Largest-first order means the first success is the maximum
      console.log(`\n🎉 MAX STABLE CONTEXT FOR ${modelName}: ${size}`);
      return;
    }
  }

  console.log(`\n❌ Failed to find any working context size for ${modelName}`);
}
|
||||
|
||||
/**
 * Find the maximum bootable context size for each dense model in turn.
 */
async function main() {
  // `exec('set CUDA_VISIBLE_DEVICES=')` only altered a short-lived shell.
  // Clear the variable in this process instead so that the servers spawned
  // later (which inherit process.env) really start without it.
  delete process.env.CUDA_VISIBLE_DEVICES;

  console.log("============= QWEN 27B Q4_K_M =============");
  await findMaxContext('Qwen3.5-27B-Q4_K_M.gguf');

  console.log("\n============= GEMMA 4 31B Q4_K_M =============");
  await findMaxContext('gemma-4-31B-it-Q4_K_M.gguf');
}

// Report unexpected failures instead of leaving an unhandled rejection
main().catch((err) => {
  console.error("FATAL:", err);
  process.exit(1);
});
|
||||
345
scripts/_archive/tuning/qwen_fullgpu_challenge.mjs
Normal file
345
scripts/_archive/tuning/qwen_fullgpu_challenge.mjs
Normal file
@@ -0,0 +1,345 @@
|
||||
/**
|
||||
* Qwen3.5 Full-GPU Challenge — VRAM 극한 최적화 벤치마크
|
||||
* =====================================================
|
||||
* 목표: Qwen3.5-35B-A3B를 24GB 듀얼 3060에 100% GPU로 올리기
|
||||
*
|
||||
* 테스트 모델:
|
||||
* 1. UD-IQ4_NL (16.6 GB) — 확실히 올라감, 기준선
|
||||
* 2. MXFP4_MOE (20.1 GB) — 도전! VRAM 극한 최적화
|
||||
* 3. Q4_K_M (20.5 GB) — 대조군 (n-cpu-moe=5)
|
||||
*
|
||||
* VRAM 절감 전략:
|
||||
* A. 배치 최소화: -ub 64 -b 256 (computation buffer 축소)
|
||||
* B. split-mode row (GPU간 더 균등한 분배)
|
||||
* C. tensor-split 수동 밸런싱
|
||||
* D. no-mmap (메모리 관리 최적화)
|
||||
* E. defrag-thold (KV 캐시 파편화 방지)
|
||||
*
|
||||
* Run: node scripts/qwen_fullgpu_challenge.mjs
|
||||
*/
|
||||
|
||||
import { spawn, execSync } from "child_process";
|
||||
import { writeFileSync, existsSync, statSync } from "fs";
|
||||
|
||||
// Address of the llama-server instance under test
const BASE_URL = "http://127.0.0.1:8000";
// Server binary, relative to the working directory
const LLAMA = "llama_bin_run\\llama-server.exe";
const CTX = 256 * 1024;              // value passed to -c (262144)
const RUNS = 3;                      // timed bench() runs per configuration
const TOKENS = 200;                  // default max_tokens for bench()
const BOOT_TIMEOUT = 5 * 60 * 1000;  // ms waitReady() polls before giving up

// Models to test, in order; `path` is relative to the working directory
const MODELS = [
  { name: "Qwen3.5 UD-IQ4_NL", path: "models\\Qwen3.5-35B-A3B-UD-IQ4_NL.gguf", sizeGB: 16.6 },
  { name: "Qwen3.5 MXFP4_MOE", path: "models\\Qwen3.5-35B-A3B-MXFP4_MOE.gguf", sizeGB: 20.11 },
  { name: "Qwen3.5 Q4_K_M", path: "models\\Qwen3.5-35B-A3B-Q4_K_M.gguf", sizeGB: 20.5 },
];

const ALL = [];   // every recorded result, in the order it was measured
let proc = null;  // currently running server child process, if any

// Print a message prefixed with the current wall-clock time.
function log(m) {
  const stamp = new Date().toLocaleTimeString("ko-KR",{hour12:false});
  console.log(`[${stamp}] ${m}`);
}

// Resolve after `ms` milliseconds.
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
|
||||
|
||||
/**
 * Stop the tracked server process (if any), force-kill any remaining
 * llama-server.exe via taskkill, then pause for five seconds.
 * Failures of either kill attempt are ignored.
 */
async function kill() {
  if (proc) {
    const tracked = proc;
    try {
      tracked.kill("SIGKILL");
    } catch {
      // ignored on purpose
    }
    proc = null;
  }

  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // ignored on purpose (taskkill exits non-zero when nothing matched)
  }

  await sleep(5000);
}
|
||||
|
||||
/**
 * Query nvidia-smi for per-GPU memory usage.
 *
 * @returns {{gpu: number, used: number, total: number}[]} one entry per
 *          output line (values as reported with `nounits`), or an empty
 *          array when the query fails.
 */
function vram() {
  const query = 'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits';
  try {
    const report = execSync(query, { encoding: "utf-8", timeout: 5000 });
    const rows = report.trim().split("\n");
    return rows.map((row) => {
      // Each row is "index, used, total"
      const cells = row.split(",").map((cell) => parseInt(cell));
      return { gpu: cells[0], used: cells[1], total: cells[2] };
    });
  } catch {
    return [];
  }
}
|
||||
|
||||
/**
 * Spawn llama-server for `modelPath` with the tuning parameters in `p` and
 * remember the child in the module-level `proc`.
 *
 * @param {string} modelPath  Path of the GGUF file passed to --model.
 * @param {object} p          Tuning parameters. Read here: ctk, ctv, ub, b, t,
 *                            cpuMoe, nCpuMoe, splitMode, tensorSplit, noMmap,
 *                            defragThold, noKvOffload. Missing values fall
 *                            back to the defaults visible below.
 * @returns {import('child_process').ChildProcess} the spawned process.
 */
function startServer(modelPath, p) {
  const args = [
    "--model", modelPath, "-ngl", "999",
    "-c", String(CTX), "-np", "1", "-fa", "on",
    "--cache-type-k", p.ctk || "q4_0",
    "--cache-type-v", p.ctv || "q4_0",
    "-ub", String(p.ub || 512), "-b", String(p.b || 2048),
    "-t", String(p.t || 4), "-tb", String(p.t || 4),
    "--prio", "3", "--poll", "50", "--mlock",
    "--port", "8000", "--host", "0.0.0.0",
  ];

  // GPU offload strategy (cpuMoe takes precedence over nCpuMoe)
  if (p.cpuMoe) args.push("--cpu-moe");
  else if (p.nCpuMoe) args.push("--n-cpu-moe", String(p.nCpuMoe));

  // VRAM saving options
  if (p.splitMode) args.push("--split-mode", p.splitMode);
  if (p.tensorSplit) args.push("--tensor-split", p.tensorSplit);
  if (p.noMmap) args.push("--no-mmap");
  if (p.defragThold) args.push("--defrag-thold", String(p.defragThold));
  if (p.noKvOffload) args.push("--no-kv-offload");

  // Only the tail of the command line is logged to keep the line short
  const cmdStr = args.join(" ");
  log(` CMD: ...${cmdStr.slice(-80)}`);

  const child = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });
  proc = child;

  // A failed spawn emits 'error'; without a listener that would crash the
  // whole benchmark run instead of letting waitReady() time out.
  child.on("error", (err) => log(` spawn error: ${err.message}`));

  // Nothing in this script reads the server output. Piped streams that are
  // never consumed eventually fill the pipe buffer and block the child, so
  // drain (and discard) both of them.
  child.stdout?.resume();
  child.stderr?.resume();

  return child;
}
|
||||
|
||||
/**
 * Poll `${BASE_URL}/health` every three seconds until it answers with
 * `{ status: "ok" }` or `timeout` milliseconds have elapsed.
 *
 * @param {number} [timeout=BOOT_TIMEOUT]  Maximum wait in milliseconds.
 * @returns {Promise<{ok: boolean, boot: number}>} `boot` is the elapsed time
 *          in seconds on success, or `timeout / 1000` on failure.
 */
async function waitReady(timeout = BOOT_TIMEOUT) {
  const startedAt = Date.now();
  const elapsedMs = () => Date.now() - startedAt;

  while (elapsedMs() < timeout) {
    try {
      const reply = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const health = await reply.json();
      if (health.status === "ok") {
        return { ok: true, boot: elapsedMs() / 1000 };
      }
    } catch {
      // not reachable / not JSON yet - keep polling
    }
    await sleep(3000);
  }

  return { ok: false, boot: timeout / 1000 };
}
|
||||
|
||||
/**
 * Send one chat-completion request and measure end-to-end throughput.
 *
 * The rate is completion tokens divided by the wall-clock time of the whole
 * request, so it includes prompt processing and HTTP overhead.
 *
 * @param {number} [n=TOKENS]  max_tokens for the request.
 * @returns {Promise<{tps: number, ct: number, dt: number}>} tokens/second,
 *          completion token count and elapsed seconds.
 * @throws {Error} when the server answers with a non-2xx status or the
 *         response carries no completion token count. Every caller in this
 *         file wraps bench() in try/catch, so such runs are reported as
 *         errors instead of being averaged in as 0 t/s samples.
 */
async function bench(n = TOKENS) {
  const t0 = Date.now();
  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "m",
      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
      max_tokens: n, temperature: 0,
    }),
    signal: AbortSignal.timeout(600_000),
  });
  if (!r.ok) {
    throw new Error(`HTTP ${r.status} from /v1/chat/completions`);
  }

  const d = await r.json();
  const dt = (Date.now() - t0) / 1000;
  const ct = d.usage?.completion_tokens || 0;
  if (ct === 0) {
    // A response without generated tokens is not a throughput measurement
    throw new Error("response reported no completion tokens");
  }
  return { tps: ct / dt, ct, dt };
}
|
||||
|
||||
/**
 * Benchmark one server configuration: restart the server with `params`,
 * do one warm-up request, run RUNS timed requests, record the result in ALL
 * and rewrite scripts/qwen_fullgpu_results.json.
 *
 * @param {{name: string, path: string}} model  Entry of MODELS.
 * @param {string} label   Human-readable name of the configuration.
 * @param {object} params  Parameters forwarded to startServer().
 * @returns {Promise<object|null>} the recorded result, or null when the
 *          server did not boot or no timed run succeeded.
 */
async function testConfig(model, label, params) {
  await kill();
  log(` [${label}] Starting...`);
  startServer(model.path, params);

  const readiness = await waitReady();
  if (!readiness.ok) {
    log(` [${label}] ✗ FAILED (timeout ${BOOT_TIMEOUT/1000}s)`);
    await kill();
    return null;
  }
  const boot = readiness.boot;

  // Snapshot VRAM right after boot, before any request is served
  const gpus = vram();
  let totalUsed = 0;
  for (const g of gpus) {
    totalUsed += g.used;
  }
  const vs = gpus.map((g) => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | ");
  log(` [${label}] ✓ Boot:${boot.toFixed(0)}s | VRAM: ${vs} (total: ${totalUsed} MiB)`);

  // Warm-up request; its outcome is deliberately ignored
  try {
    await bench(20);
  } catch {}

  const speeds = [];
  for (let run = 1; run <= RUNS; run++) {
    try {
      const sample = await bench();
      speeds.push(sample.tps);
      log(` Run${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run${run}: ERR ${e.message}`);
    }
  }
  await kill();

  if (speeds.length === 0) {
    return null;
  }

  let sum = 0;
  for (const s of speeds) {
    sum += s;
  }
  const avg = sum / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);

  // Store the effective parameters, i.e. with the defaults filled in
  const effectiveParams = {
    ...params,
    ngl: 999,
    ctk: params.ctk || "q4_0",
    ctv: params.ctv || "q4_0",
  };
  const res = {
    model: model.name,
    label,
    avg_tps: +avg.toFixed(2),
    best_tps: +best.toFixed(2),
    boot: +boot.toFixed(1),
    vram_total: totalUsed,
    vram: gpus,
    params: effectiveParams,
    gpu_only: !params.cpuMoe && !params.nCpuMoe,
  };
  ALL.push(res);
  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
  return res;
}
|
||||
|
||||
// ─── Test Strategies ───────────────────────────────────────────
|
||||
|
||||
/**
 * Run the full test plan for one model: six pure-GPU strategies, an
 * n-cpu-moe fallback when no pure-GPU result is leading, fine-tuning sweeps
 * when a pure-GPU result is leading, and a five-run final verification of
 * the best configuration.
 *
 * @param {{name: string, path: string, sizeGB: number}} model  Entry of MODELS.
 * @returns {Promise<object|null>} the FINAL result when verification produced
 *          samples, otherwise the best sweep result, or null when the model
 *          file is missing or nothing could be measured.
 */
async function testModel(model) {
  const banner = "#".repeat(65);
  log(`\n${banner}`);
  log(` ${model.name} (${model.sizeGB} GB)`);
  if (!existsSync(model.path)) {
    log(" ✗ File not found!");
    return null;
  }
  log(banner);

  // Best result so far, ranked by average tokens/second
  let best = null;
  const update = (candidate) => {
    if (!candidate) return;
    if (!best || candidate.avg_tps > best.avg_tps) {
      best = candidate;
    }
  };

  // Pure-GPU strategies, tried in this order
  const pureGpuStrategies = [
    { title: "Strategy 1: Pure GPU (default)", label: "pure-GPU default",
      params: { t: 4, ub: 512, b: 2048 } },
    { title: "Strategy 2: Pure GPU, minimal batch", label: "pure-GPU minbatch",
      params: { t: 4, ub: 64, b: 256 } },
    { title: "Strategy 3: Pure GPU + no-mmap + small batch", label: "pure-GPU nommap small",
      params: { t: 4, ub: 128, b: 512, noMmap: true } },
    { title: "Strategy 4: Pure GPU + split-mode row", label: "pure-GPU row-split",
      params: { t: 4, ub: 128, b: 512, splitMode: "row" } },
    { title: "Strategy 5: Pure GPU + tensor-split 0.5,0.5", label: "pure-GPU ts=0.5,0.5",
      params: { t: 4, ub: 128, b: 512, tensorSplit: "0.5,0.5" } },
    { title: "Strategy 6: Pure GPU ALL tricks", label: "pure-GPU all-tricks",
      params: { t: 4, ub: 64, b: 256, noMmap: true, defragThold: 0.1 } },
  ];
  for (const strategy of pureGpuStrategies) {
    log(`\n ── ${strategy.title} ──`);
    update(await testConfig(model, strategy.label, strategy.params));
  }

  // Fallback: n-cpu-moe=5 baseline when no pure-GPU result is leading
  if (!best || !best.gpu_only) {
    log(`\n ── Fallback: n-cpu-moe=5 ──`);
    update(await testConfig(model, "n-cpu-moe=5 baseline", {
      t: 4, ub: 256, b: 1024, nCpuMoe: 5
    }));
  }

  // Fine-tune around the leading pure-GPU configuration. Every sweep varies
  // one setting relative to the parameters captured here.
  if (best && best.gpu_only) {
    log(`\n ── Pure GPU succeeded! Fine-tuning... ──`);
    const bp = best.params;

    // Thread sweep
    for (const t of [2, 6, 8]) {
      if (t !== bp.t) {
        update(await testConfig(model, `tune t=${t}`, { ...bp, t }));
      }
    }

    // Batch sweep
    const batchPairs = [[256, 1024], [512, 2048], [256, 2048]];
    for (const [ub, b] of batchPairs) {
      const alreadyTested = ub === bp.ub && b === bp.b;
      if (!alreadyTested) {
        update(await testConfig(model, `tune ub=${ub} b=${b}`, { ...bp, ub, b }));
      }
    }

    // KV cache upgrade (extra VRAM available?)
    const kvPairs = [["q8_0","q8_0"], ["f16","f16"]];
    for (const [ctk, ctv] of kvPairs) {
      update(await testConfig(model, `tune kv=${ctk}/${ctv}`, { ...bp, ctk, ctv }));
    }
  }

  // Nothing measurable at all
  if (!best) {
    return best;
  }

  // Final verification: restart with the best parameters and take 5 samples
  log(`\n ── Final verification (5 runs) ──`);
  await kill();
  startServer(model.path, best.params);
  const { ok, boot } = await waitReady();
  if (!ok) {
    await kill();
    return best;
  }

  const v = vram();
  try {
    await bench(20); // warm-up, outcome ignored
  } catch {}

  const finals = [];
  for (let run = 1; run <= 5; run++) {
    try {
      const sample = await bench();
      finals.push(sample.tps);
      log(` Final ${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Final ${run}: ERR`);
    }
  }
  await kill();

  if (finals.length === 0) {
    // Keep the sweep result when no verification run succeeded
    await kill();
    return best;
  }

  let sum = 0;
  for (const s of finals) {
    sum += s;
  }
  const avg = sum / finals.length;
  const bst = Math.max(...finals);
  log(` ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${bst.toFixed(2)} t/s`);

  let vramTotal = 0;
  for (const g of v) {
    vramTotal += g.used;
  }
  const final = {
    model: model.name,
    label: "FINAL",
    avg_tps: +avg.toFixed(2),
    best_tps: +bst.toFixed(2),
    boot: +boot.toFixed(1),
    vram_total: vramTotal,
    vram: v,
    params: best.params,
    gpu_only: best.gpu_only,
  };
  ALL.push(final);
  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
  return final;
}
|
||||
|
||||
// ─── Main ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Entry point: test every entry of MODELS, rank the winners by average
 * throughput, then print the summary and write it (together with the raw
 * results collected in ALL) under scripts/.
 */
async function main() {
  const startedAt = Date.now();
  const wideRule = "=".repeat(65);
  const narrowRule = "=".repeat(55);

  // Banner followed by the current per-GPU memory usage
  log(wideRule);
  log(" QWEN3.5 FULL-GPU CHALLENGE — 70 t/s TARGET");
  log(" 2x RTX 3060 (24GB) | 256K Context");
  log(" " + new Date().toISOString());
  log(wideRule);
  for (const g of vram()) {
    log(` GPU${g.gpu}: ${g.used}/${g.total} MiB`);
  }

  // Models without a usable result (null) are not ranked
  const winners = [];
  for (let i = 0; i < MODELS.length; i++) {
    const winner = await testModel(MODELS[i]);
    if (winner) {
      winners.push(winner);
    }
  }

  // ─── Summary ──────────────────────────────────────────────
  const elapsed = ((Date.now() - startedAt) / 60000).toFixed(1);
  winners.sort((a, b) => b.avg_tps - a.avg_tps); // fastest first

  const lines = [];
  lines.push(`Qwen3.5 Full-GPU Challenge — ${new Date().toISOString()}`);
  lines.push(`2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`);
  lines.push("", narrowRule, " RANKING", narrowRule);

  winners.forEach((w, rank) => {
    const p = w.params;
    const gpu = w.gpu_only ? "★ FULL GPU" : "⚠ CPU offload";
    lines.push("", ` #${rank + 1}: ${w.model} [${gpu}]`);
    lines.push(` AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`);
    lines.push(` VRAM: ${w.vram_total} MiB total`);

    // Optional flags that were active for this result
    const flags = [];
    if (p.splitMode) flags.push(`split=${p.splitMode}`);
    if (p.tensorSplit) flags.push(`ts=${p.tensorSplit}`);
    if (p.noMmap) flags.push("no-mmap");
    if (p.nCpuMoe) flags.push(`n-cpu-moe=${p.nCpuMoe}`);
    lines.push(` t=${p.t} ub=${p.ub} b=${p.b} kv=${p.ctk}/${p.ctv} ${flags.join(" ")}`);
  });

  if (winners.length > 0) {
    const champion = winners[0];
    const mode = champion.gpu_only ? "FULL GPU" : "CPU offload";
    lines.push("", narrowRule);
    lines.push(` ★ CHAMPION: ${champion.model} — ${champion.avg_tps} t/s [${mode}]`);
    lines.push(narrowRule);
  }

  const summary = lines.join("\n");
  console.log("\n" + summary);
  writeFileSync("scripts/qwen_fullgpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
  log(`\n Saved: qwen_fullgpu_results.json + qwen_fullgpu_summary.txt`);
  log(" DONE!");
  await kill();
}

// Any uncaught failure is reported and turned into a non-zero exit code
main().catch((e) => {
  console.error("FATAL:", e);
  process.exit(1);
});
|
||||
129
scripts/_archive/tuning/tune_122b.py
Normal file
129
scripts/_archive/tuning/tune_122b.py
Normal file
@@ -0,0 +1,129 @@
|
||||
import subprocess, time, urllib.request, json, sys
|
||||
# Switch stdout to UTF-8 when the stream supports it.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError):
    # stdout was replaced by an object without reconfigure(), or the stream
    # refuses to be reconfigured; keep the existing encoding.
    pass

# First shard of the model file, passed to llama-server via --model.
MODEL = "C:/Users/Variet-Worker/Desktop/variet-llm/models/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
# Address of the llama-server instance under test.
BASE = "http://127.0.0.1:8000"

# BEST SO FAR: GPU1 only + Expert CPU + 8t = 8.75 t/s (6.5GB / 12GB used)
# 5.5GB VRAM remaining on GPU 1. Let's use it!
# Strategy: keep some experts on GPU 1 using -ncmoe (n-cpu-moe)
# n-cpu-moe = number of layers whose experts stay on CPU
# Lower = more experts on GPU = more VRAM used = potentially faster

# Command line shared by every configuration; each entry of CONFIGS appends
# its own "extra" arguments to it.
BASE_CMD = [
    r"llama_bin_run\llama-server.exe",
    "--model", MODEL,
    "-ngl", "999",
    "-sm", "none", "--main-gpu", "1",
    "-c", "4096", "-np", "1", "-fa", "on",
    "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
    "-ub", "512", "-b", "2048",
    "-t", "8", "-tb", "8",
    "--prio", "3", "--poll", "50",
    "--no-mmap",
    "--port", "8000", "--host", "0.0.0.0",
]

CONFIGS = [
    # Baseline: all experts CPU (confirmed 8.75 t/s)
    {"name": "Baseline: all expert CPU", "extra": ["-ot", ".*ffn_.*_exps.*=CPU"]},
    # Try n-cpu-moe with GPU1 only: keep some experts on GPU
    {"name": "n-cpu-moe=60 (4 layers expert GPU)", "extra": ["-ncmoe", "60"]},
    {"name": "n-cpu-moe=56 (8 layers expert GPU)", "extra": ["-ncmoe", "56"]},
    {"name": "n-cpu-moe=52 (12 layers expert GPU)", "extra": ["-ncmoe", "52"]},
    {"name": "n-cpu-moe=48 (16 layers expert GPU)", "extra": ["-ncmoe", "48"]},
]
|
||||
|
||||
def kill():
    """Force-kill every llama-server.exe via taskkill, then pause 4 seconds.

    The command's output is discarded and its exit status is not checked.
    """
    quiet = subprocess.DEVNULL
    subprocess.run(
        "taskkill /F /IM llama-server.exe",
        shell=True,
        stdout=quiet,
        stderr=quiet,
    )
    time.sleep(4)
|
||||
|
||||
def check_server(timeout=900):
    """Poll ``{BASE}/health`` until the server reports it is ready.

    The endpoint is queried every 5 seconds (2 s per-request timeout) until
    its JSON body has ``status`` equal to ``"ok"`` or ``"ready"``.

    Args:
        timeout: Maximum time to keep polling, in seconds.

    Returns:
        True once the server reported ready, False if ``timeout`` elapsed.
    """
    # Local import: the file's import line binds `urllib`, not `http`.
    import http.client

    # A monotonic deadline is immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        resp = None
        try:
            req = urllib.request.Request(f"{BASE}/health")
            with urllib.request.urlopen(req, timeout=2) as reply:
                resp = json.loads(reply.read())
        except (OSError, ValueError, http.client.HTTPException):
            # Connection refused / HTTP error status / invalid JSON: the
            # server is not ready yet. Unlike a bare `except:`, this lets
            # Ctrl-C interrupt the wait.
            pass
        if isinstance(resp, dict) and resp.get("status") in ("ok", "ready"):
            return True
        time.sleep(5)
    return False
|
||||
|
||||
def bench(runs=3):
    """Measure generation throughput over ``runs`` chat-completion requests.

    Each run's speed is the reported completion token count divided by the
    wall-clock time of the whole request.

    Args:
        runs: Number of requests to send.

    Returns:
        ``(average, best)`` tokens/second over the runs that completed, or
        ``(0.0, 0.0)`` when no run completed (including ``runs == 0``).
    """
    # Local import: the file's import line binds `urllib`, not `http`.
    import http.client

    # The request body is identical for every run, so build it once.
    payload = json.dumps({
        "model": "m",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Write a Python fibonacci function with memoization."}
        ],
        "max_tokens": 200,
        "temperature": 0.0
    }).encode('utf-8')

    speeds = []
    for i in range(runs):
        req = urllib.request.Request(
            f"{BASE}/v1/chat/completions",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        t0 = time.time()
        try:
            with urllib.request.urlopen(req, timeout=600) as reply:
                resp = json.loads(reply.read())
        except (OSError, ValueError, http.client.HTTPException) as exc:
            # One failed request must not abort the whole tuning session;
            # report it and carry on with the remaining runs.
            print(f" Run {i+1}: ERR {exc}")
            continue
        dt = time.time() - t0
        tokens = resp.get("usage", {}).get("completion_tokens", 0)
        speed = tokens / dt if dt > 0 else 0
        speeds.append(speed)
        print(f" Run {i+1}: {speed:.2f} t/s ({tokens} tok / {dt:.1f}s)")

    if not speeds:
        return 0.0, 0.0
    return sum(speeds) / len(speeds), max(speeds)
|
||||
|
||||
def vram():
    """Return the used memory of each GPU as reported by nvidia-smi.

    Returns:
        One integer per output line of
        ``nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits``,
        or ``[0, 0]`` when the tool cannot be run or its output cannot be
        parsed.
    """
    cmd = ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"]
    try:
        out = subprocess.check_output(cmd).decode().strip()
        # int('') raises ValueError, so empty output also takes the fallback
        return [int(x.strip()) for x in out.split('\n')]
    except (OSError, subprocess.SubprocessError, ValueError):
        # nvidia-smi missing, exited non-zero, or printed something that is
        # not a number
        return [0, 0]
|
||||
|
||||
# Driver: boot the server once per entry of CONFIGS, benchmark it, and
# collect one result dict per configuration.
results = []
for cfg in CONFIGS:
    kill()
    print(f"\n{'='*60}")
    print(f"Testing: {cfg['name']}")
    print(f"{'='*60}")

    cmd = BASE_CMD + cfg["extra"]
    proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    try:
        if not check_server(900):
            print(f" FAILED TO BOOT")
            results.append({"name": cfg["name"], "status": "BOOT_FAIL"})
            kill()
            continue

        print(" Server ready! Warming up...")
        try:
            p = json.dumps({"model":"m","messages":[{"role":"system","content":"Hi"},{"role":"user","content":"Hi"}],"max_tokens":5}).encode()
            warmup = urllib.request.Request(f"{BASE}/v1/chat/completions",data=p,headers={"Content-Type":"application/json"})
            with urllib.request.urlopen(warmup, timeout=120):
                pass
        except Exception:
            # The warm-up is best effort; unlike a bare `except:`, Ctrl-C
            # still stops the script here.
            pass

        v = vram()
        # Tolerate reports with fewer than two GPUs instead of failing on v[1]
        gpu0, gpu1 = (list(v) + [0, 0])[:2]
        print(f" VRAM: GPU0={gpu0}MB, GPU1={gpu1}MB, Total={sum(v)}MB")

        avg, best = bench(runs=3)
        print(f" >>> AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

        results.append({
            "name": cfg["name"], "avg_tps": round(avg,2), "best_tps": round(best,2),
            "vram_gpu0": gpu0, "vram_gpu1": gpu1, "vram_total": sum(v), "status": "OK"
        })
    finally:
        # Stop this configuration's server on every path, including when
        # benchmarking raises, so it is not left running.
        proc.terminate()

kill()
print(f"\n\n{'='*60}")
print("FINAL RESULTS - GPU1 Expert Balance (Target: 10+ t/s)")
print(f"{'='*60}")
print(f"{'Config':<48} {'AVG':>6} {'BEST':>6} {'GPU1':>7}")
print("-" * 72)
for r in results:
    if r["status"] == "OK":
        print(f" {r['name']:<46} {r['avg_tps']:>5} {r['best_tps']:>5} {r['vram_gpu1']:>5}MB")
    else:
        print(f" {r['name']:<46} {'FAIL':>5}")

with open("scripts/122b_final_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print("\nSaved to scripts/122b_final_results.json")
|
||||
64
scripts/_archive/tuning/tune_122b_20ts.mjs
Normal file
64
scripts/_archive/tuning/tune_122b_20ts.mjs
Normal file
@@ -0,0 +1,64 @@
|
||||
import { exec, spawn } from 'child_process';
|
||||
|
||||
// Resolve after `ms` milliseconds.
const delay = ms => new Promise(res => setTimeout(res, ms));

/**
 * Restart llama-server with the given arguments and, once it is healthy, run
 * scripts/quick_pptest.mjs against it and print that script's output.
 *
 * @param {string} modelArgs  Space-separated llama-server arguments.
 * @param {object} envVars    Extra environment variables for the server.
 * @param {string} name       Label used in the log output.
 * @returns {Promise<{success: boolean}>} success is false when the server
 *          did not answer /health with 200 within 40 probes.
 */
async function runTest(modelArgs, envVars, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Args: ${modelArgs}`);

  // Run taskkill and wait for it to finish; its outcome is ignored.
  const killServer = () => new Promise((done) => {
    exec('taskkill /F /IM llama-server.exe', () => done());
  });

  await killServer();
  await delay(2000);

  const env = { ...process.env, ...envVars };
  // filter(Boolean) drops empty tokens produced by doubled spaces
  const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' ').filter(Boolean), {
    detached: true,
    stdio: 'ignore',
    env
  });
  // Without a listener a failed spawn would crash the script instead of
  // being reported as a boot failure below.
  server.on('error', (err) => console.log(`[${name}] spawn error: ${err.message}`));

  let ready = false;
  for (let i = 0; i < 40; i++) {
    try {
      // fetch() has no `timeout` option; an AbortSignal bounds the probe
      const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
      if (res.status === 200) {
        ready = true;
        break;
      }
    } catch (e) {
      // not reachable yet - retry after the delay
    }
    await delay(3000);
  }

  if (!ready) {
    console.log(`[${name}] FAILED TO BOOT`);
    await killServer();
    return { success: false };
  }

  console.log(`[${name}] Server Ready! Running benchmark...`);
  const output = await new Promise((done) => {
    exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => done(stdout || stderr));
  });
  console.log(output);
  await killServer();
  return { success: true };
}
|
||||
|
||||
/**
 * Run the 122B model with three --n-cpu-moe settings, all with
 * --split-mode layer on top of the shared base arguments.
 */
async function main() {
  const baseLineArgs = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 8192 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 256 -b 1024 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // Order matters:
  //   32 - dual GPU, split mode layer, maximize VRAM usage
  //        (estimate 32 layers offloaded out of 48)
  //   28 - even more aggressive GPU usage
  //   36 - fallback if OOM happens on 32/28
  const cpuMoeLayers = [32, 28, 36];
  for (const layers of cpuMoeLayers) {
    const args = `${baseLineArgs} --split-mode layer --n-cpu-moe ${layers}`;
    await runTest(args, {}, `122B: n-cpu-moe ${layers} + sm layer`);
  }

  console.log("\nALL TESTS COMPLETED");
}

main();
|
||||
72
scripts/_archive/tuning/tune_exact.mjs
Normal file
72
scripts/_archive/tuning/tune_exact.mjs
Normal file
@@ -0,0 +1,72 @@
|
||||
import { exec, spawn } from 'child_process';
|
||||
|
||||
// Resolve after `ms` milliseconds.
const delay = ms => new Promise(res => setTimeout(res, ms));

/**
 * Restart llama-server with the given arguments and environment and, once it
 * is healthy, run scripts/quick_pptest.mjs and print that script's output.
 *
 * @param {string} modelArgs  Space-separated llama-server arguments.
 * @param {object} envVars    Extra environment variables for the server.
 * @param {string} name       Label used in the log output.
 * @returns {Promise<{success: boolean}>} success is false when the server
 *          did not answer /health with 200 within 40 probes.
 */
async function runTest(modelArgs, envVars, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Env: ${JSON.stringify(envVars)}`);
  console.log(`Args: ${modelArgs}`);

  // Run taskkill and wait for it to finish; its outcome is ignored.
  const killServer = () => new Promise((done) => {
    exec('taskkill /F /IM llama-server.exe', () => done());
  });

  await killServer();
  await delay(2000);

  const env = { ...process.env, ...envVars };
  // filter(Boolean) drops empty tokens produced by doubled spaces
  const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' ').filter(Boolean), {
    detached: true,
    stdio: 'ignore',
    env
  });
  // Without a listener a failed spawn would crash the script instead of
  // being reported as a boot failure below.
  server.on('error', (err) => console.log(`[${name}] spawn error: ${err.message}`));

  let ready = false;
  for (let i = 0; i < 40; i++) {
    try {
      // fetch() has no `timeout` option; an AbortSignal bounds the probe
      const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
      if (res.status === 200) {
        ready = true;
        break;
      }
    } catch (e) {
      // not reachable yet - retry after the delay
    }
    await delay(3000);
  }

  if (!ready) {
    console.log(`[${name}] FAILED TO BOOT`);
    await killServer();
    return { success: false };
  }

  console.log(`[${name}] Server Ready! Running speed test...`);
  const output = await new Promise((done) => {
    exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => done(stdout || stderr));
  });
  console.log(output);
  await killServer();
  return { success: true };
}
|
||||
|
||||
/**
 * Run three speed tests: the 122B model with full GPU offload, then the 35B
 * model with GGML_CUDA_FORCE_MMQ=1 at 6 and at 12 threads.
 */
async function main() {
  const forceMmq = { GGML_CUDA_FORCE_MMQ: "1" };

  // 1. 122B-A10B: Pure GPU offload (No n-cpu-moe at all)
  // -ngl 999 will offload all 48 layers to the NVIDIA driver (triggering Shared VRAM fallback on Windows)
  const args122B = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // 2. 35B-A3B: Pure GPU tuning to hit 70 t/s
  // Base configuration from previous full-gpu run:
  const args35B = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 128 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
  // Same configuration with the thread counts raised to 12, just in case
  const args35B_t12 = args35B.replace("-t 6 -tb 6", "-t 12 -tb 12");

  // We already got ~64 t/s basically.
  // MMQ (custom matrix multiplication) is often faster on Ampere for low
  // batch generation, so both 35B runs force it.
  const plan = [
    { args: args122B, env: {}, name: "122B-A10B: Pure GPU (NVIDIA Shared Memory Fallback)" },
    { args: args35B, env: forceMmq, name: "35B-A3B: Force MMQ = 1" },
    { args: args35B_t12, env: forceMmq, name: "35B-A3B: Threads 12 + MMQ" },
  ];
  for (const step of plan) {
    await runTest(step.args, step.env, step.name);
  }

  console.log("\nALL TESTS COMPLETED");
}

main();
|
||||
84
scripts/_archive/tuning/tune_models.mjs
Normal file
84
scripts/_archive/tuning/tune_models.mjs
Normal file
@@ -0,0 +1,84 @@
|
||||
import { exec, spawn } from 'child_process';
|
||||
|
||||
// Resolve after `ms` milliseconds.
const delay = ms => new Promise(res => setTimeout(res, ms));

/**
 * Restart llama-server with the given arguments and, once it is healthy, run
 * scripts/quick_pptest.mjs and extract two throughput figures from its
 * output.
 *
 * @param {string} modelArgs  Space-separated llama-server arguments.
 * @param {string} name       Label used in the log output.
 * @returns {Promise<{success: boolean, tg?: number, pp?: number}>}
 *          On success, `tg` is the TG rate parsed from the "TG-500" line and
 *          `pp` the PP rate parsed from the "10K-CODE" line (0 when the line
 *          is not found). success is false when the server did not answer
 *          /health with 200 within 40 probes.
 */
async function runTest(modelArgs, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Args: ${modelArgs}`);

  // Run taskkill and wait for it to finish; its outcome is ignored.
  const killServer = () => new Promise((done) => {
    exec('taskkill /F /IM llama-server.exe', () => done());
  });

  // Kill existing
  await killServer();
  await delay(2000);

  // filter(Boolean) drops empty tokens produced by doubled spaces
  const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' ').filter(Boolean), {
    detached: true,
    stdio: 'ignore'
  });
  // Without a listener a failed spawn would crash the script instead of
  // being reported as a boot failure below.
  server.on('error', (err) => console.log(`[${name}] spawn error: ${err.message}`));

  let ready = false;
  for (let i = 0; i < 40; i++) {
    try {
      // fetch() has no `timeout` option; an AbortSignal bounds the probe
      const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
      if (res.status === 200) {
        ready = true;
        break;
      }
    } catch (e) {
      // not reachable yet - retry after the delay
    }
    await delay(3000);
  }

  if (!ready) {
    console.log(`[${name}] FAILED TO BOOT (Likely OOM)`);
    await killServer();
    return { success: false };
  }

  console.log(`[${name}] Server Ready! Running benchmark...`);
  // Run pptest
  const { stdout, stderr } = await new Promise((done) => {
    exec('node scripts/quick_pptest.mjs', (err, out, errOut) => done({ stdout: out, stderr: errOut }));
  });
  console.log(stdout || stderr);

  // Extract TG and PP from TG-500
  const tgMatch = stdout.match(/TG-500 \| PP:\d+tok \d+\.\dt\/s \| TG:\d+tok (\d+\.\d+)t\/s/);
  const ppMatch = stdout.match(/10K-CODE \| PP:\d+tok (\d+\.\d+)t\/s/);

  const tg = tgMatch ? parseFloat(tgMatch[1]) : 0;
  const pp = ppMatch ? parseFloat(ppMatch[1]) : 0;

  await killServer();
  return { success: true, tg, pp };
}
|
||||
|
||||
/**
 * Sweep --n-cpu-moe for the Qwen 35B and Qwen 122B models. Every case is run
 * through runTest() one after the other, since each boots its own server on
 * port 8000.
 */
async function main() {
  // 1. Qwen 35B: the target is 70 t/s. Keep a few MoE layers on the CPU
  //    (1, 2, then 4) to see whether that unlocks -ub 512.
  const base35B = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
  for (const moe of [1, 2, 4]) {
    await runTest(`${base35B} -ub 512 --n-cpu-moe ${moe}`, `Qwen-35B: moe=${moe}, ub=512`);
  }

  // 2. Qwen 122B: look for the best --n-cpu-moe value.
  //    Original author's estimate: n-cpu-moe=48 leaves 16GB free and each
  //    layer is ~1.5GB in total (~0.75GB per GPU), so lower values are tried.
  const base122B = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
  for (const moe of [38, 30, 22]) {
    await runTest(`${base122B} --n-cpu-moe ${moe}`, `Qwen-122B: moe=${moe}`);
  }

  console.log("Tuning finished.");
}
|
||||
|
||||
// Script entry point: start the n-cpu-moe sweep as soon as the file is run.
main();
|
||||
107
scripts/_archive/tuning/tune_n_cpu_moe.py
Normal file
107
scripts/_archive/tuning/tune_n_cpu_moe.py
Normal file
@@ -0,0 +1,107 @@
|
||||
import subprocess, time, urllib.request, json, sys
|
||||
# Best-effort switch of stdout to UTF-8. Only the failures that mean "this
# stream cannot be reconfigured" are ignored, so Ctrl+C and real bugs are no
# longer swallowed by a bare except.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError):
    # stdout was replaced by an object without reconfigure(), or the stream
    # refused the change; keep whatever encoding it already has.
    pass
|
||||
|
||||
# First shard of the split GGUF model that every configuration loads.
MODEL = "C:/Users/Variet-Worker/Desktop/variet-llm/models/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
# Base URL the spawned llama-server is polled and benchmarked on.
BASE = "http://127.0.0.1:8000"

# Goal: see whether decreasing n-cpu-moe increases VRAM usage on GPU 1 and
# improves speed, now that GPU 1 is known to be isolated with ~3.5GB free at
# 256K context (original author's note).

# Command line shared by every configuration; CONFIGS appends its own flags.
BASE_CMD = [
    r"llama_bin_run\llama-server.exe",
    "--model", MODEL,
    *"-ngl 999".split(),
    *"-sm none --main-gpu 1".split(),
    # A small context is used for fast boot/testing.
    *"-c 4096 -np 1 -fa on".split(),
    *"--cache-type-k q4_0 --cache-type-v q4_0".split(),
    *"-ub 512 -b 2048".split(),
    *"-t 8 -tb 8".split(),
    *"--prio 3 --poll 50".split(),
    "--no-mmap",
    *"--port 8000 --host 0.0.0.0".split(),
]

# One entry per -ncmoe value to test: "name" is the report label, "extra" the
# arguments appended to BASE_CMD.
CONFIGS = [
    {"name": label, "extra": ["-ncmoe", str(count)]}
    for count, label in (
        (48, "n-cpu-moe=48 (baseline)"),
        (40, "n-cpu-moe=40"),
        (32, "n-cpu-moe=32"),
        (24, "n-cpu-moe=24"),
    )
]
|
||||
|
||||
def kill():
    """Force-kill every llama-server.exe process, then pause for 4 seconds.

    taskkill's output is discarded and its exit status is not checked, so
    calling this when no server is running is harmless.
    """
    quiet = subprocess.DEVNULL
    subprocess.run(
        "taskkill /F /IM llama-server.exe",
        shell=True,
        stdout=quiet,
        stderr=quiet,
    )
    # Fixed pause after the kill before the caller continues.
    time.sleep(4)
|
||||
|
||||
def check_server(timeout=900):
    """Poll the server's /health endpoint until it reports ready.

    A probe is sent every 5 seconds (each bounded to 2 seconds) until the
    JSON body carries ``"status": "ok"`` or ``"ready"``.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as a probe reports ready, False once ``timeout`` seconds
        have passed without one (immediately when ``timeout`` is <= 0).
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # The context manager closes the connection after every probe.
            with urllib.request.urlopen(f"{BASE}/health", timeout=2) as resp:
                status = json.loads(resp.read()).get("status")
        except Exception:
            # Refused connection, HTTP error, timeout or a non-JSON body all
            # mean "not ready yet". KeyboardInterrupt and SystemExit are not
            # caught, so Ctrl+C still stops the wait.
            status = None
        if status in ("ok", "ready"):
            return True
        time.sleep(5)
    return False
|
||||
|
||||
def bench(runs=2):
    """Measure generation throughput with a fixed chat-completion request.

    Each run posts the same 100-token request and divides the reported
    ``usage.completion_tokens`` by the wall time of the whole request, so the
    figure includes prompt processing and HTTP overhead.

    Args:
        runs: Number of requests to send.

    Returns:
        ``(average, best)`` tokens per second over all runs, or
        ``(0.0, 0.0)`` when ``runs`` is not positive.

    Raises:
        urllib.error.URLError: If a request fails or times out (600 s limit).
        ValueError: If the server replies with something that is not JSON.
    """
    # Nothing to measure: avoid dividing by zero / max() of an empty list.
    if runs <= 0:
        return 0.0, 0.0

    # The request body is identical for every run, so build it once.
    payload = json.dumps({
        "model": "m",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Write a short Python script."}
        ],
        "max_tokens": 100,
        "temperature": 0.0
    }).encode('utf-8')

    speeds = []
    for _ in range(runs):
        req = urllib.request.Request(
            f"{BASE}/v1/chat/completions",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        t0 = time.perf_counter()
        # The context manager releases the connection even if parsing fails.
        with urllib.request.urlopen(req, timeout=600) as raw:
            resp = json.loads(raw.read())
        dt = time.perf_counter() - t0
        tokens = resp.get("usage", {}).get("completion_tokens", 0)
        speeds.append(tokens / dt if dt > 0 else 0)
    return sum(speeds) / len(speeds), max(speeds)
|
||||
|
||||
def vram():
    """Return the used memory of each GPU as reported by nvidia-smi.

    Returns:
        A list of integers, one per GPU in nvidia-smi's order, taken from
        ``--query-gpu=memory.used`` with units stripped. The list is padded
        with zeros to at least two entries so callers can index GPU 1 safely
        on a single-GPU machine. ``[0, 0]`` is returned when nvidia-smi is
        missing, fails, or prints something that is not a number.
    """
    try:
        # An argument list avoids going through the shell for a fixed command.
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=memory.used",
            "--format=csv,noheader,nounits",
        ]).decode()
        used = [int(line.strip()) for line in out.splitlines() if line.strip()]
    except (OSError, subprocess.SubprocessError, ValueError):
        # Tool not installed, non-zero exit, or unparsable output.
        return [0, 0]
    # Multiplying a list by a negative count yields [], so longer lists are
    # returned unchanged.
    return used + [0] * (2 - len(used))
|
||||
|
||||
# Driver: boot the server once per CONFIGS entry, sample VRAM, benchmark it,
# and print a summary table at the end.
results = []
for cfg in CONFIGS:
    kill()
    print(f"\n{'='*60}\nTesting: {cfg['name']}\n{'='*60}")

    cmd = BASE_CMD + cfg["extra"]
    proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    # try/finally guarantees the spawned server is terminated on every path,
    # including an unexpected exception, so it never stays behind.
    try:
        if not check_server(300):
            print(f" FAILED TO BOOT (OOM?)")
            results.append({"name": cfg["name"], "status": "BOOT_FAIL"})
            continue

        print(" Server ready! Warming up...")
        time.sleep(2)
        v = vram()
        # Guard the index: vram() may report fewer than two GPUs.
        gpu1_mb = v[1] if len(v) > 1 else 0

        try:
            avg, _best = bench(runs=2)
        except Exception as exc:
            # A failed or timed-out request must not abort the whole sweep
            # and discard the results collected so far.
            print(f" BENCHMARK FAILED: {exc!r}")
            results.append({"name": cfg["name"], "status": "BENCH_FAIL"})
            continue

        print(f" >>> AVG: {avg:.2f} t/s | VRAM GPU1: {gpu1_mb}MB")

        results.append({
            "name": cfg["name"], "avg_tps": round(avg,2),
            "vram_gpu1": gpu1_mb, "status": "OK"
        })
    finally:
        proc.terminate()

kill()
print("\nFINAL RESULTS:")
for r in results:
    if r["status"] == "OK":
        print(f" {r['name']:<25} {r['avg_tps']:>5} t/s | {r['vram_gpu1']:>5}MB")
    elif r["status"] == "BOOT_FAIL":
        print(f" {r['name']:<25} FAIL (OOM)")
    else:
        # Any other failure is reported under its own status, not as OOM.
        print(f" {r['name']:<25} FAIL ({r['status']})")
|
||||
Reference in New Issue
Block a user