wip: [01-llm-tuning] paused at task 1/3
This commit is contained in:
4
.gitmodules
vendored
4
.gitmodules
vendored
@@ -30,3 +30,7 @@
|
|||||||
path = .agent/vendor/mini-swe
|
path = .agent/vendor/mini-swe
|
||||||
url = https://github.com/swe-agent/swe-agent.git
|
url = https://github.com/swe-agent/swe-agent.git
|
||||||
branch = main
|
branch = main
|
||||||
|
[submodule "openclaude"]
|
||||||
|
path = openclaude
|
||||||
|
url = https://github.com/Gitlawb/openclaude.git
|
||||||
|
branch = main
|
||||||
|
|||||||
@@ -1,25 +1,33 @@
|
|||||||
{
|
{
|
||||||
"version": "1.0",
|
"version": "1.0",
|
||||||
"timestamp": "2026-04-05T00:51:15+09:00",
|
"timestamp": "2026-04-05T13:54:58.707Z",
|
||||||
"phase": "00-initialization",
|
"phase": "01",
|
||||||
"phase_name": "Project Initialization",
|
"phase_name": "01-llm-tuning",
|
||||||
"phase_dir": ".planning",
|
"phase_dir": ".planning/phases/01-llm-tuning",
|
||||||
"plan": 0,
|
"plan": 1,
|
||||||
"task": 0,
|
"task": 1,
|
||||||
"total_tasks": 0,
|
"total_tasks": 3,
|
||||||
"status": "paused",
|
"status": "paused",
|
||||||
"completed_tasks": [
|
"completed_tasks": [
|
||||||
{"id": 1, "name": "Initialize Project & Repo", "status": "done", "commit": "e37f65a"}
|
{"id": 1, "name": "Gemma4 26B performance tuning at 256K context", "status": "done", "commit": "none"}
|
||||||
],
|
],
|
||||||
"remaining_tasks": [
|
"remaining_tasks": [
|
||||||
{"id": 2, "name": "Run /gsd-plan-phase 1 to start planning Phase 1", "status": "not_started"}
|
{"id": 2, "name": "Proceed with extensions frontend UI integration", "status": "not_started"},
|
||||||
|
{"id": 3, "name": "Add 2nd RTX 3060 to verify 45-60 t/s MoE performance", "status": "not_started"}
|
||||||
],
|
],
|
||||||
"blockers": [],
|
"blockers": [],
|
||||||
"human_actions_pending": [],
|
"human_actions_pending": [
|
||||||
"decisions": [
|
{"action": "Decide next step: integration of Extension frontend streaming or adding second GPU for Qwen/Gemma4 full evaluation", "context": "Server is fully optimized for 1 GPU, further improvements in speed require hardware upgrade", "blocking": false}
|
||||||
{"decision": "2+0 GPU Architecture (Machine A API Server, Machine B tools)", "rationale": "Prioritize coding speed (50-80 t/s) and separate logic cleanly", "phase": "00"}
|
|
||||||
],
|
],
|
||||||
"uncommitted_files": [],
|
"decisions": [
|
||||||
"next_action": "Run /gsd-plan-phase 1 to plan the Machine A server setup and hot-swap script.",
|
{"decision": "Used --n-cpu-moe 10 for Gemma4 26B instead of --cpu-moe", "rationale": "Applying --cpu-moe globally to Gemma4 resulted in severe instability and crashes (graph splits 62) due to SWA+MoE entanglement. Targeted offload (10 layers) prevents VRAM swap and stabilizes split at 2, achieving 30.9 t/s on 1 GPU.", "phase": "01"},
|
||||||
"context_notes": "We just finalized the initial architecture plan for Variet LLM involving Dual GPUs on Machine A for pure API inference, and Machine B as the workstation for VS Code Extension, Discord Bot, and Search/MCP tools."
|
{"decision": "Verified Qwen3.5 35B-A3B speed capabilities", "rationale": "Tested Qwen 35B limits on 12GB. Found it causes heavy WDDM swap without MoE offload. Confirmed its smaller active parameters (3B vs Gemma4's 4B) will likely make it significantly faster than Gemma4 on a dual 3060 24GB setup up to 64K context.", "phase": "01"}
|
||||||
|
],
|
||||||
|
"uncommitted_files": [
|
||||||
|
"start_gemma4_26b_api.bat",
|
||||||
|
"scripts/auto_tune_gemma4_256k.py",
|
||||||
|
"scripts/auto_tune_gemma4_ncpumoe.py"
|
||||||
|
],
|
||||||
|
"next_action": "Resume development on OpenClaude integration (Extension frontend UI) or configure Dual-GPU testing.",
|
||||||
|
"context_notes": "We've successfully proven the 1 GPU tuning threshold for Gemma4 (30.9 t/s). We also understood why OpenClaude needs large contexts (200K default scaling) and mapped out exact expectations for Qwen VS Gemma on 2x GPUs."
|
||||||
}
|
}
|
||||||
|
|||||||
1
openclaude
Submodule
1
openclaude
Submodule
Submodule openclaude added at 5ef79546e9
@@ -1,372 +0,0 @@
|
|||||||
"""
|
|
||||||
Qwen3.5 122B-A10B 자동 정밀 튜닝 스크립트
|
|
||||||
===========================================
|
|
||||||
각 설정 조합으로 서버를 재시작하고 벤치마크를 자동 수행합니다.
|
|
||||||
서버 로그에서 순수 eval time (t/s)를 파싱하여 정확한 비교 테이블을 출력합니다.
|
|
||||||
|
|
||||||
예상 소요 시간: 약 30-40분 (5개 설정 × ~6-7분/설정)
|
|
||||||
"""
|
|
||||||
import subprocess
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import urllib.request
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import datetime
|
|
||||||
|
|
||||||
# Re-encode stdout as UTF-8 so the non-ASCII status lines this script prints
# can be written; stdout objects without ``reconfigure`` are left untouched.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except AttributeError:
    pass

BASE_URL = "http://127.0.0.1:8000"
MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
SERVER_EXE = r"llama_bin_run\llama-server.exe"

# ------------------------------------------------------------
# Settings under test
# ------------------------------------------------------------
# Arguments shared by every run (these are never varied).
COMMON_ARGS = (
    ["--model", MODEL_PATH]
    + ["-ngl", "999"]
    + ["--cpu-moe"]
    + ["-c", "2048"]
    + ["-np", "1"]
    + ["-fa", "on"]
    + ["--cache-type-k", "q4_0"]
    + ["--cache-type-v", "q4_0"]
    + ["-ub", "256"]
    + ["-b", "1024"]
    + ["--mlock"]
    + ["--port", "8000"]
    + ["--host", "0.0.0.0"]
    + ["--no-warmup"]  # the warm-up request is issued by this script itself
)

# Variable parameter combinations; each entry's ``extra`` arguments are added
# to COMMON_ARGS for that run.
CONFIGS = [
    dict(
        name="A) --no-mmap -t 8",
        desc="서버 권장: mmap 비활성화 (baseline 대비)",
        extra=["--no-mmap", "-t", "8", "--prio", "2"],
    ),
    dict(
        name="B) --no-mmap -t 6",
        desc="스레드 감소 (캐시 경합 회피)",
        extra=["--no-mmap", "-t", "6", "--prio", "2"],
    ),
    dict(
        name="C) --no-mmap -t 10",
        desc="스레드 증가 (RAM 대역폭 포화)",
        extra=["--no-mmap", "-t", "10", "--prio", "2"],
    ),
    dict(
        name="D) --no-mmap -t 12",
        desc="더 많은 스레드",
        extra=["--no-mmap", "-t", "12", "--prio", "2"],
    ),
    dict(
        name="E) --no-mmap -t 10 --prio 3 --poll 100",
        desc="최적 스레드 + 리얼타임 우선순위 + 폴링",
        extra=["--no-mmap", "-t", "10", "--prio", "3", "--poll", "100"],
    ),
]
|
|
||||||
|
|
||||||
# ============================================================
|
|
||||||
# 유틸리티 함수
|
|
||||||
# ============================================================
|
|
||||||
|
|
||||||
def kill_server(settle_seconds=3):
    """Force-terminate every running ``llama-server.exe`` process.

    Invokes ``taskkill /F /IM llama-server.exe`` with its output discarded and
    then sleeps before returning.

    Args:
        settle_seconds: Seconds to sleep after issuing the kill. Defaults to 3,
            the delay this script has always used.
    """
    try:
        # Argument list instead of a shell command string: no shell is spawned
        # and no quoting is involved.
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except OSError:
        # taskkill could not be launched (e.g. it does not exist on this
        # host). The kill has always been best-effort, so keep going.
        pass
    time.sleep(settle_seconds)
|
|
||||||
|
|
||||||
def start_server(config, log_path):
    """Start llama-server for ``config`` with its output redirected to a file.

    Args:
        config: Mapping whose ``"extra"`` entry lists the command-line
            arguments appended after ``COMMON_ARGS``.
        log_path: Path of the file that receives the server's stdout and
            stderr. It is opened in write mode, so earlier content is lost.

    Returns:
        ``(proc, log_file)``: the ``subprocess.Popen`` handle and the open log
        file object. The caller is responsible for closing ``log_file``.

    Raises:
        OSError: If the log file cannot be opened or the executable cannot be
            launched. In the latter case the log file is closed first.
    """
    cmd = [SERVER_EXE, *COMMON_ARGS, *config["extra"]]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            cwd=os.getcwd(),
        )
    except BaseException:
        # Do not leak the handle when the process could not be created.
        log_file.close()
        raise
    return proc, log_file
|
|
||||||
|
|
||||||
def wait_for_server(timeout=600, poll_interval=5):
    """Poll ``{BASE_URL}/health`` until the server reports that it is ready.

    Args:
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between attempts. Defaults to 5, the
            interval this script has always used.

    Returns:
        True as soon as the endpoint returns JSON whose ``"status"`` is
        ``"ok"``; False if that has not happened when ``timeout`` elapses.
    """
    # A monotonic clock keeps the deadline immune to system clock changes.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Any failure (connection refused, HTTP error, malformed body)
            # is treated as "not ready yet". Unlike a bare ``except``, this
            # lets KeyboardInterrupt and SystemExit abort the wait.
            pass
        time.sleep(poll_interval)
    return False
|
|
||||||
|
|
||||||
def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and time the round trip.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Value of the request's ``max_tokens`` field.

    Returns:
        ``(completion_tokens, elapsed)``: the ``usage.completion_tokens``
        value from the response (0 when absent) and the wall-clock seconds
        spent sending the request and reading and decoding the reply.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    began = time.time()
    with urllib.request.urlopen(request, timeout=600) as response:
        reply = json.loads(response.read())
    took = time.time() - began

    return reply.get("usage", {}).get("completion_tokens", 0), took
|
|
||||||
|
|
||||||
def parse_eval_times(log_path):
    """Extract the "eval time" statistics lines from a server log.

    Only lines on which ``eval time`` is preceded by nothing but whitespace
    are matched, so ``prompt eval time`` lines are not picked up here.

    Args:
        log_path: Path of the server log file.

    Returns:
        One dict per matching line, in log order, with the keys ``total_ms``,
        ``tokens``, ``ms_per_token`` and ``tps``. An empty list if the log
        cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # A missing or unreadable log means "no samples"; other errors
        # (including Ctrl+C) are no longer swallowed.
        return []

    # "eval time = XXXXX.XX ms / NNN tokens (XXX.XX ms per token, XX.XX tokens per second)"
    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    return [
        {
            "total_ms": float(total_ms),
            "tokens": int(tokens),
            "ms_per_token": float(ms_per_token),
            "tps": float(tps),
        }
        for total_ms, tokens, ms_per_token, tps in re.findall(pattern, content, re.MULTILINE)
    ]
|
|
||||||
|
|
||||||
def parse_prompt_eval_times(log_path):
    """Extract the "prompt eval time" statistics lines from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        One dict per matching line, in log order, with the keys ``total_ms``,
        ``tokens``, ``ms_per_token`` and ``tps``. An empty list if the log
        cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # A missing or unreadable log means "no samples"; other errors
        # (including Ctrl+C) are no longer swallowed.
        return []

    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    return [
        {
            "total_ms": float(total_ms),
            "tokens": int(tokens),
            "ms_per_token": float(ms_per_token),
            "tps": float(tps),
        }
        for total_ms, tokens, ms_per_token, tps in re.findall(pattern, content, re.MULTILINE)
    ]
|
|
||||||
|
|
||||||
def parse_vram_usage(log_path):
    """Read the reported "CUDA0 model buffer size" from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        The first reported size rounded to a whole number and formatted as
        ``"<n> MiB"``, or ``"N/A"`` if the log cannot be read or contains no
        such line.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Unreadable log: report "N/A" without hiding unrelated errors.
        return "N/A"

    match = re.search(r'CUDA0 model buffer size\s*=\s*([\d.]+)\s*MiB', content)
    if match is None:
        return "N/A"
    return f"{float(match.group(1)):.0f} MiB"
|
|
||||||
|
|
||||||
# ============================================================
|
|
||||||
# 메인 튜닝 루프
|
|
||||||
# ============================================================
|
|
||||||
|
|
||||||
def _without_warmup(samples, warmed_up):
    """Return ``samples`` minus the leading warm-up entry, if one was logged."""
    return samples[1:] if warmed_up else samples


def _benchmark_config(idx, config):
    """Restart the server with ``config``, run the prompts, return a result row."""
    config_start = time.time()
    log_path = os.path.join(os.getcwd(), f"tune_log_{idx}.txt")

    print(f"\n{'='*70}")
    print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}")
    print(f" {config['desc']}")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f"{'='*70}")

    # 1. Stop any server that is still running.
    print(" [1/4] 서버 종료 중...")
    kill_server()

    # 2. Start a fresh server for this config.
    print(" [2/4] 서버 시작 중... (모델 로딩 ~3-5분)")
    _proc, log_file = start_server(config, log_path)

    warmed_up = False
    try:
        # 3. Wait until the server reports ready.
        if not wait_for_server(timeout=600):
            print(" ❌ 서버 시작 실패! 다음 설정으로 넘어갑니다.")
            return {
                "config": config["name"],
                "status": "FAILED",
                "eval_tps": [],
                "prompt_tps": [],
                "vram": "N/A",
            }

        load_time = time.time() - config_start
        print(f" [3/4] 서버 준비 완료! (로딩 {load_time:.0f}초)")

        # 4. Benchmark: one warm-up request followed by the measured prompts.
        print(" [4/4] 벤치마크 실행 중...")
        try:
            run_single_benchmark("Say hello.", max_tokens=20)
            warmed_up = True
            print(" 워밍업 완료")
        except Exception as e:
            print(f" 워밍업 실패: {e}")

        prompts = [
            "Write a detailed explanation of how neural networks learn through backpropagation and gradient descent.",
            "Explain the complete process of photosynthesis including light and dark reactions in detail.",
            "Describe the differences between SQL and NoSQL databases with examples and performance characteristics.",
        ]
        for i, prompt in enumerate(prompts):
            try:
                tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                approx_tps = tokens / elapsed if elapsed > 0 else 0
                print(f" Run {i+1}/{len(prompts)}: {tokens} tokens, {elapsed:.1f}s, ~{approx_tps:.2f} t/s (approx)")
            except Exception as e:
                print(f" Run {i+1}/{len(prompts)}: ERROR - {e}")

        # Give the server a moment to flush its log before it is killed.
        time.sleep(2)
    finally:
        # Runs on every exit path (including Ctrl+C) so neither the server
        # process nor the log handle is left behind.
        kill_server()
        log_file.close()
    time.sleep(2)

    # Parse the log. The first sample belongs to the warm-up request only if
    # that request actually succeeded; otherwise it is a real measurement.
    bench_evals = _without_warmup(parse_eval_times(log_path), warmed_up)
    bench_prompts = _without_warmup(parse_prompt_eval_times(log_path), warmed_up)
    vram = parse_vram_usage(log_path)

    eval_speeds = [e["tps"] for e in bench_evals]
    prompt_speeds = [p["tps"] for p in bench_prompts]

    config_elapsed = time.time() - config_start
    print(f"\n 완료! 소요: {config_elapsed:.0f}초")
    if eval_speeds:
        avg_eval = sum(eval_speeds) / len(eval_speeds)
        max_eval = max(eval_speeds)
        print(f" 📊 Eval TPS: avg={avg_eval:.2f}, max={max_eval:.2f}")

    return {
        "config": config["name"],
        "status": "OK",
        "eval_tps": eval_speeds,
        "prompt_tps": prompt_speeds,
        "vram": vram,
    }


def _print_summary(all_results):
    """Print the comparison table and the best configuration."""
    baseline_avg = 10.02  # earlier baseline run: average eval t/s
    baseline_max = 10.06  # earlier baseline run: best eval t/s

    print("\n")
    print("=" * 80)
    print(" 🏆 최종 결과 비교 테이블")
    print("=" * 80)
    print()

    print(f" {'설정':<45} {'Eval t/s':>10} {'최대':>8} {'Prompt t/s':>12} {'VRAM':>12}")
    print(f" {'-'*45} {'-'*10} {'-'*8} {'-'*12} {'-'*12}")

    # Baseline row (result of the earlier run, hard-coded).
    print(f" {'[기준] mmap on, -t 8, --prio 2':<45} {'10.02':>10} {'10.06':>8} {'29.52':>12} {'5392 MiB':>12}")

    best_avg = 0
    best_config = ""
    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f" {r['config']:<45} {'FAILED':>10} {'':>8} {'':>12} {r['vram']:>12}")
            continue

        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0

        if avg_e > best_avg:
            best_avg = avg_e
            best_config = r["config"]

        marker = " ⭐" if avg_e > baseline_max else ""
        print(f" {r['config']:<45} {avg_e:>10.2f} {max_e:>8.2f} {avg_p:>12.2f} {r['vram']:>12}{marker}")

    print()
    if best_avg > 0:
        improvement = ((best_avg - baseline_avg) / baseline_avg) * 100
        print(f" 🏆 최고 성능: {best_config}")
        print(f" → {best_avg:.2f} t/s (기준 10.02 t/s 대비 {improvement:+.1f}%)")

    print()
    print(f" 완료 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 80)


def main():
    """Benchmark every entry in ``CONFIGS`` and report the results.

    For each config the server is restarted, one warm-up request and three
    prompts are sent, and the server log is parsed for eval/prompt speeds and
    the model buffer size. A comparison table is printed and the raw eval
    speeds are written to ``tune_results_<timestamp>.txt`` in the current
    working directory.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    print("=" * 70)
    print(" Qwen3.5 122B-A10B 자동 정밀 튜닝")
    print(f" 시작 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f" 테스트 설정: {len(CONFIGS)}개")
    print(f" 예상 소요: ~{len(CONFIGS) * 7}분")
    print("=" * 70)
    print()
    print(" 기존 Baseline: mmap on, -t 8, --prio 2 → 10.06 t/s (eval)")
    print()

    all_results = []
    for idx, config in enumerate(CONFIGS):
        all_results.append(_benchmark_config(idx, config))

    _print_summary(all_results)

    # Also save the results to a file.
    result_path = os.path.join(os.getcwd(), f"tune_results_{timestamp}.txt")
    with open(result_path, "w", encoding="utf-8") as f:
        f.write("Qwen3.5 122B-A10B Fine Tuning Results\n")
        f.write(f"Date: {timestamp}\n\n")
        for r in all_results:
            f.write(f"{r['config']}: {r['eval_tps']} (VRAM: {r['vram']})\n")
    print(f" 결과 저장: {result_path}")


if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,257 +0,0 @@
|
|||||||
"""
|
|
||||||
Qwen3.5 122B-A10B 정밀 튜닝 2라운드
|
|
||||||
====================================
|
|
||||||
1라운드 결과: mmap on이 더 빠르고, 스레드 수가 적을수록 빠름
|
|
||||||
→ mmap on 상태에서 스레드 수 4~8 범위를 정밀 탐색
|
|
||||||
"""
|
|
||||||
import subprocess
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import urllib.request
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import datetime
|
|
||||||
|
|
||||||
# Re-encode stdout as UTF-8 so the non-ASCII status lines this script prints
# can be written; stdout objects without ``reconfigure`` are left untouched.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except AttributeError:
    pass

BASE_URL = "http://127.0.0.1:8000"
MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
SERVER_EXE = r"llama_bin_run\llama-server.exe"

# Arguments shared by every round-2 run.
COMMON_ARGS = (
    ["--model", MODEL_PATH]
    + ["-ngl", "999"]
    + ["--cpu-moe"]
    + ["-c", "2048"]
    + ["-np", "1"]
    + ["-fa", "on"]
    + ["--cache-type-k", "q4_0"]
    + ["--cache-type-v", "q4_0"]
    + ["-ub", "256"]
    + ["-b", "1024"]
    + ["--mlock"]
    + ["--port", "8000"]
    + ["--host", "0.0.0.0"]
    + ["--no-warmup"]
)

# Round-2 variations; each entry's ``extra`` arguments are added to
# COMMON_ARGS for that run.
CONFIGS = [
    dict(
        name="F) mmap on, -t 4",
        desc="최소 스레드 (4개, 물리코어 절반)",
        extra=["-t", "4", "--prio", "2"],
    ),
    dict(
        name="G) mmap on, -t 5",
        desc="스레드 5개",
        extra=["-t", "5", "--prio", "2"],
    ),
    dict(
        name="H) mmap on, -t 6",
        desc="스레드 6개 (--no-mmap에서 최고였음)",
        extra=["-t", "6", "--prio", "2"],
    ),
    dict(
        name="I) mmap on, -t 7",
        desc="스레드 7개",
        extra=["-t", "7", "--prio", "2"],
    ),
    dict(
        name="J) mmap on, -t 6, --prio 3",
        desc="최적 스레드 + 리얼타임 우선순위",
        extra=["-t", "6", "--prio", "3"],
    ),
]
|
|
||||||
|
|
||||||
def kill_server(settle_seconds=3):
    """Force-terminate every running ``llama-server.exe`` process.

    Invokes ``taskkill /F /IM llama-server.exe`` with its output discarded and
    then sleeps before returning.

    Args:
        settle_seconds: Seconds to sleep after issuing the kill. Defaults to 3,
            the delay this script has always used.
    """
    try:
        # Argument list instead of a shell command string: no shell is spawned
        # and no quoting is involved.
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except OSError:
        # taskkill could not be launched (e.g. it does not exist on this
        # host). The kill has always been best-effort, so keep going.
        pass
    time.sleep(settle_seconds)
|
|
||||||
|
|
||||||
def start_server(config, log_path):
    """Start llama-server for ``config`` with its output redirected to a file.

    Args:
        config: Mapping whose ``"extra"`` entry lists the command-line
            arguments appended after ``COMMON_ARGS``.
        log_path: Path of the file that receives the server's stdout and
            stderr. It is opened in write mode, so earlier content is lost.

    Returns:
        ``(proc, log_file)``: the ``subprocess.Popen`` handle and the open log
        file object. The caller is responsible for closing ``log_file``.

    Raises:
        OSError: If the log file cannot be opened or the executable cannot be
            launched. In the latter case the log file is closed first.
    """
    cmd = [SERVER_EXE, *COMMON_ARGS, *config["extra"]]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, cwd=os.getcwd())
    except BaseException:
        # Do not leak the handle when the process could not be created.
        log_file.close()
        raise
    return proc, log_file
|
|
||||||
|
|
||||||
def wait_for_server(timeout=600, poll_interval=5):
    """Poll ``{BASE_URL}/health`` until the server reports that it is ready.

    Args:
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between attempts. Defaults to 5, the
            interval this script has always used.

    Returns:
        True as soon as the endpoint returns JSON whose ``"status"`` is
        ``"ok"``; False if that has not happened when ``timeout`` elapses.
    """
    # A monotonic clock keeps the deadline immune to system clock changes.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Any failure (connection refused, HTTP error, malformed body)
            # is treated as "not ready yet". Unlike a bare ``except``, this
            # lets KeyboardInterrupt and SystemExit abort the wait.
            pass
        time.sleep(poll_interval)
    return False
|
|
||||||
|
|
||||||
def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and time the round trip.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Value of the request's ``max_tokens`` field.

    Returns:
        ``(completion_tokens, elapsed)``: the ``usage.completion_tokens``
        value from the response (0 when absent) and the wall-clock seconds
        spent sending the request and reading and decoding the reply.
    """
    endpoint = f"{BASE_URL}/v1/chat/completions"
    request_body = json.dumps(
        {
            "model": "local-model",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
            "temperature": 0.0,
        }
    ).encode("utf-8")
    request = urllib.request.Request(
        endpoint,
        data=request_body,
        headers={"Content-Type": "application/json"},
    )

    t0 = time.time()
    with urllib.request.urlopen(request, timeout=600) as response:
        answer = json.loads(response.read())
    duration = time.time() - t0

    usage_info = answer.get("usage", {})
    return usage_info.get("completion_tokens", 0), duration
|
|
||||||
|
|
||||||
def parse_eval_times(log_path):
    """Extract the "eval time" statistics lines from a server log.

    Only lines on which ``eval time`` is preceded by nothing but whitespace
    are matched, so ``prompt eval time`` lines are not picked up here.

    Args:
        log_path: Path of the server log file.

    Returns:
        One ``{"tps": float, "tokens": int}`` dict per matching line, in log
        order. An empty list if the log cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # A missing or unreadable log means "no samples"; other errors
        # (including Ctrl+C) are no longer swallowed.
        return []
    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    matches = re.findall(pattern, content, re.MULTILINE)
    return [{"tps": float(m[3]), "tokens": int(m[1])} for m in matches]
|
|
||||||
|
|
||||||
def parse_prompt_eval_times(log_path):
    """Extract the "prompt eval time" statistics lines from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        One ``{"tps": float}`` dict per matching line, in log order. An empty
        list if the log cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # A missing or unreadable log means "no samples"; other errors
        # (including Ctrl+C) are no longer swallowed.
        return []
    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    matches = re.findall(pattern, content, re.MULTILINE)
    return [{"tps": float(m[3])} for m in matches]
|
|
||||||
|
|
||||||
def _without_warmup(samples, warmed_up):
    """Return ``samples`` minus the leading warm-up entry, if one was logged."""
    return samples[1:] if warmed_up else samples


def _benchmark_config(idx, config):
    """Restart the server with ``config``, run the prompts, return a result row."""
    config_start = time.time()
    log_path = os.path.join(os.getcwd(), f"tune_r2_log_{idx}.txt")

    print(f"\n{'='*70}")
    print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}")
    print(f" {config['desc']}")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f"{'='*70}")

    kill_server()
    print(" [1/3] 서버 시작 중...")
    _proc, log_file = start_server(config, log_path)

    warmed_up = False
    try:
        if not wait_for_server(timeout=600):
            print(" ❌ 서버 시작 실패!")
            return {"config": config["name"], "status": "FAILED", "eval_tps": []}

        load_time = time.time() - config_start
        print(f" [2/3] 서버 준비 완료! ({load_time:.0f}초)")

        # Warm-up request; a failure here is tolerated, but Ctrl+C is no
        # longer swallowed as it was by the former bare ``except``.
        try:
            run_single_benchmark("Say hello.", max_tokens=20)
            warmed_up = True
        except Exception:
            pass

        print(" [3/3] 벤치마크 3회...")
        prompts = [
            "Write a detailed explanation of how neural networks learn through backpropagation.",
            "Explain the complete process of photosynthesis including light and dark reactions.",
            "Describe the differences between SQL and NoSQL databases with examples.",
        ]
        for i, prompt in enumerate(prompts):
            try:
                tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                # Guard the division so a zero elapsed time is not reported
                # as an ERROR run.
                approx_tps = tokens / elapsed if elapsed > 0 else 0
                print(f" Run {i+1}: {tokens}tok, {elapsed:.1f}s, ~{approx_tps:.2f} t/s")
            except Exception as e:
                print(f" Run {i+1}: ERROR - {e}")

        # Give the server a moment to flush its log before it is killed.
        time.sleep(2)
    finally:
        # Runs on every exit path (including Ctrl+C) so neither the server
        # process nor the log handle is left behind.
        kill_server()
        log_file.close()
    time.sleep(2)

    # The first logged sample belongs to the warm-up request only if that
    # request actually succeeded; otherwise it is a real measurement.
    bench_evals = _without_warmup(parse_eval_times(log_path), warmed_up)
    bench_prompts = _without_warmup(parse_prompt_eval_times(log_path), warmed_up)

    eval_speeds = [e["tps"] for e in bench_evals]
    prompt_speeds = [p["tps"] for p in bench_prompts]

    if eval_speeds:
        print(f" 📊 Eval: avg={sum(eval_speeds)/len(eval_speeds):.2f}, max={max(eval_speeds):.2f}")

    return {
        "config": config["name"],
        "status": "OK",
        "eval_tps": eval_speeds,
        "prompt_tps": prompt_speeds,
    }


def _print_summary(all_results):
    """Print the combined round-1 + round-2 table and the best configuration."""
    print("\n")
    print("=" * 85)
    print(" 🏆 전체 튜닝 결과 (1라운드 + 2라운드 통합)")
    print("=" * 85)
    print()
    print(f" {'설정':<48} {'Avg t/s':>8} {'Max t/s':>8} {'Prompt':>8}")
    print(f" {'-'*48} {'-'*8} {'-'*8} {'-'*8}")

    # Round-1 results (hard-coded): name, avg t/s, max t/s, prompt t/s.
    r1 = [
        ("[기준] mmap on, -t 8, --prio 2", 10.02, 10.06, 29.52),
        ("A) --no-mmap -t 8", 9.66, 9.70, 28.26),
        ("B) --no-mmap -t 6", 10.02, 10.18, 26.73),
        ("C) --no-mmap -t 10", 9.42, 9.46, 27.31),
        ("D) --no-mmap -t 12", 9.04, 9.11, 27.92),
        ("E) --no-mmap -t 10 --prio 3 --poll 100", 9.41, 9.45, 28.37),
    ]
    for name, avg, mx, pp in r1:
        marker = " ⭐" if avg >= 10.0 else ""
        print(f" {name:<48} {avg:>8.2f} {mx:>8.2f} {pp:>8.2f}{marker}")

    print(f" {'--- 2라운드 ---':<48}")

    # The "best" figure is tracked on the per-config maximum: it is seeded
    # with the baseline's max (10.06) and compared against each config's max.
    best_tps = 10.06
    best_config = "[기준] mmap on, -t 8"

    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f" {r['config']:<48} {'FAIL':>8}")
            continue
        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
        if max_e > best_tps:
            best_tps = max_e
            best_config = r["config"]
        marker = " ⭐" if avg_e >= 10.0 else ""
        print(f" {r['config']:<48} {avg_e:>8.2f} {max_e:>8.2f} {avg_p:>8.2f}{marker}")

    print()
    print(f" 🏆 최고 성능: {best_config} → {best_tps:.2f} t/s")
    print(f" 완료: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 85)


def main():
    """Run the round-2 sweep over ``CONFIGS`` and print the combined table.

    For each config the server is restarted, one warm-up request and three
    prompts are sent, and the server log is parsed for eval and prompt
    speeds. The final table lists the hard-coded round-1 results followed by
    the round-2 measurements.
    """
    print("=" * 70)
    print(" Qwen3.5 122B-A10B 정밀 튜닝 - 2라운드")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f" 테스트: {len(CONFIGS)}개 설정 (mmap on + 스레드 4~7 정밀 탐색)")
    print("=" * 70)
    print()

    all_results = []
    for idx, config in enumerate(CONFIGS):
        all_results.append(_benchmark_config(idx, config))

    _print_summary(all_results)


if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,38 +0,0 @@
|
|||||||
import urllib.request
|
|
||||||
import json
|
|
||||||
import zipfile
|
|
||||||
import os
|
|
||||||
import ssl
|
|
||||||
|
|
||||||
RELEASE_API_URL = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"


def make_ssl_context():
    """Build the TLS context used for all requests.

    Certificates are verified by default because the script downloads
    executables. Setting the environment variable
    ``LLAMA_DOWNLOAD_INSECURE=1`` restores the former behaviour of skipping
    verification.
    """
    ctx = ssl.create_default_context()
    if os.environ.get("LLAMA_DOWNLOAD_INSECURE") == "1":
        print("WARNING: TLS certificate verification is disabled (LLAMA_DOWNLOAD_INSECURE=1).")
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
    return ctx


def find_download_url(assets):
    """Return the download URL of the first matching release asset, or None.

    An asset matches when its name contains ``bin-win-cuda-cu12.2.0-x64.zip``,
    or contains both ``bin-win-cu`` and ``x64`` and ends with ``.zip``.
    """
    # NOTE(review): the second rule accepts any zip whose name contains
    # "bin-win-cu" and "x64"; confirm that no unrelated asset (for example a
    # runtime-only package) can be picked before the intended one.
    for asset in assets:
        name = asset['name']
        if "bin-win-cuda-cu12.2.0-x64.zip" in name or (
            "bin-win-cu" in name and name.endswith(".zip") and "x64" in name
        ):
            return asset['browser_download_url']
    return None


def main():
    """Download the latest matching llama.cpp release zip into ``llama_bin``."""
    import shutil  # local import: the file's import header does not provide it

    ctx = make_ssl_context()
    req = urllib.request.Request(RELEASE_API_URL, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        with urllib.request.urlopen(req, context=ctx) as response:
            data = json.loads(response.read().decode())

        download_url = find_download_url(data['assets'])
        if not download_url:
            print("Could not find the target zip. Available assets:")
            for asset in data['assets']:
                print(" -", asset['name'])
            return

        print(f"Downloading {download_url}...")
        zip_path = "llama.zip"
        try:
            with urllib.request.urlopen(download_url, context=ctx) as resp, open(zip_path, 'wb') as out_file:
                # Stream to disk instead of holding the whole archive in memory.
                shutil.copyfileobj(resp, out_file)
            print("Extracting to 'llama_bin'...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall("llama_bin")
            print("Done extracting.")
        finally:
            # Remove the archive even when the download or extraction failed.
            if os.path.exists(zip_path):
                os.remove(zip_path)
    except Exception as e:
        # Top-level boundary of the script: report the failure and stop.
        print(f"Error: {e}")


if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
import os

# This must be set BEFORE huggingface_hub is imported: the library reads the
# variable when it is imported, so setting it afterwards does not enable
# hf_transfer.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import hf_hub_download  # noqa: E402  (import after env setup)

# (repo_id, filename) pairs, downloaded in this order.
models = [
    # Start with the smaller Gemma4 26B.
    ("ggml-org/gemma-4-26B-A4B-it-GGUF", "gemma-4-26B-A4B-it-Q4_K_M.gguf"),
    # Then Qwen 35B.
    ("unsloth/Qwen3.5-35B-A3B-GGUF", "Qwen3.5-35B-A3B-Q4_K_M.gguf"),
    # Finally the 122B model, which is split into three parts.
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"),
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00002-of-00003.gguf"),
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00003-of-00003.gguf"),
]

print("=== 고속 다운로더 시작 (huggingface_hub & hf_transfer) ===")
os.makedirs("models", exist_ok=True)

for repo, filename in models:
    print(f"\n>>> 다운로드 중 (백그라운드 진행): [{repo}] 의 [{filename}]...")
    try:
        # NOTE(review): `local_dir_use_symlinks` is believed to be deprecated
        # in recent huggingface_hub releases; confirm against the installed
        # version before removing it.
        path = hf_hub_download(
            repo_id=repo,
            filename=filename,
            local_dir="./models",
            local_dir_use_symlinks=False,
        )
        print(f"완료: {path}")
    except Exception as e:
        # One failed file must not stop the remaining downloads.
        print(f"다운로드 실패: {e}")

print("\n모든 다운로드 프로세스가 종료되었습니다.")
|
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
import urllib.request
|
|
||||||
import json
|
|
||||||
import zipfile
|
|
||||||
import os
|
|
||||||
import ssl
|
|
||||||
import shutil
|
|
||||||
|
|
||||||
RELEASE_API_URL = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"


def make_ssl_context():
    """Build the TLS context used for all requests.

    Certificates are verified by default because the script downloads
    executables. Setting the environment variable
    ``LLAMA_DOWNLOAD_INSECURE=1`` restores the former behaviour of skipping
    verification.
    """
    ctx = ssl.create_default_context()
    if os.environ.get("LLAMA_DOWNLOAD_INSECURE") == "1":
        print("WARNING: TLS certificate verification is disabled (LLAMA_DOWNLOAD_INSECURE=1).")
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
    return ctx


def find_download_url(assets):
    """Return the URL of the first ``bin-win-cuda-12.4-x64.zip`` asset, or None.

    Assets whose name contains ``cudart`` are skipped.
    """
    for asset in assets:
        name = asset['name']
        if "bin-win-cuda-12.4-x64.zip" in name and "cudart" not in name:
            return asset['browser_download_url']
    return None


def _flatten_into(src_root, dst_dir):
    """Move every file found under ``src_root`` directly into ``dst_dir``."""
    for root, _dirs, files in os.walk(src_root):
        for file in files:
            shutil.move(os.path.join(root, file), os.path.join(dst_dir, file))


def _copy_missing(src_dir, dst_dir):
    """Copy entries of ``src_dir`` that do not yet exist in ``dst_dir``."""
    for item in os.listdir(src_dir):
        src = os.path.join(src_dir, item)
        dst = os.path.join(dst_dir, item)
        if os.path.exists(dst):
            continue
        try:
            shutil.copy(src, dst)
        except OSError as e:
            # Still best-effort (e.g. sub-directories cannot be copied this
            # way), but the skipped entry is now reported instead of hidden.
            print(f"Skipped {src}: {e}")


def main():
    """Download the CUDA 12.4 Windows build and assemble ``llama_bin_run``."""
    ctx = make_ssl_context()
    req = urllib.request.Request(RELEASE_API_URL, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        with urllib.request.urlopen(req, context=ctx) as response:
            data = json.loads(response.read().decode())

        download_url = find_download_url(data['assets'])
        if not download_url:
            print("Could not find the target zip.")
            return

        print(f"Downloading true binaries: {download_url}...")
        zip_path = "llama_main.zip"
        try:
            with urllib.request.urlopen(download_url, context=ctx) as resp, open(zip_path, 'wb') as out_file:
                # Stream to disk instead of holding the whole archive in memory.
                shutil.copyfileobj(resp, out_file)

            print("Extracting to temporary folder 'llama_temp'...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall("llama_temp")

            print("Moving exact files to 'llama_bin_run'...")
            os.makedirs("llama_bin_run", exist_ok=True)
            _flatten_into("llama_temp", "llama_bin_run")

            # Top up from an earlier 'llama_bin' download, if present.
            if os.path.exists("llama_bin"):
                _copy_missing("llama_bin", "llama_bin_run")
        finally:
            # Clean up temporary artefacts even when a step above failed.
            if os.path.exists(zip_path):
                os.remove(zip_path)
            shutil.rmtree("llama_temp", ignore_errors=True)
        print("Download and path extraction fully complete.")
    except Exception as e:
        # Top-level boundary of the script: report the failure and stop.
        print(f"Error: {e}")


if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,28 +0,0 @@
|
|||||||
from huggingface_hub import HfApi
|
|
||||||
import sys
|
|
||||||
|
|
||||||
api = HfApi()
|
|
||||||
|
|
||||||
def search_gguf(query):
    """Print up to three Hub repos matching *query* and their GGUF files.

    For each repo the Q4_K_M ``.gguf`` files are listed; when a repo has
    none, the first three ``.gguf`` files are listed instead. Any exception
    raised by the Hub calls is printed rather than propagated.
    """
    print(f"\n--- Searching for: {query} ---")
    try:
        repos = [*api.list_models(search=query, limit=3)]
        if len(repos) == 0:
            print("No models found.")
            return
        for repo in repos:
            print(f"Repo: {repo.id}")
            gguf_names = [
                name
                for name in api.list_repo_files(repo_id=repo.id)
                if name.endswith(".gguf")
            ]
            preferred = [name for name in gguf_names if "q4_k_m" in name.lower()]
            # Fall back to the first three GGUF files of any quantisation.
            chosen = preferred or gguf_names[:3]
            print(f" GGUFs: {chosen}")
    except Exception as err:
        print(f"Error: {err}")
|
|
||||||
|
|
||||||
# Run the search once per query string, in this order.
for _query in (
    "122b-a10b gguf",
    "Qwen3.5 122b gguf",
    "35b-a3b gguf",
    "gemma-4 26b gguf",
    "Qwen 122B",
):
    search_gguf(_query)
|
|
||||||
@@ -1,123 +0,0 @@
|
|||||||
import time
|
|
||||||
import json
|
|
||||||
import urllib.request
|
|
||||||
import sys
|
|
||||||
|
|
||||||
try:
|
|
||||||
sys.stdout.reconfigure(encoding='utf-8')
|
|
||||||
except AttributeError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
BASE_URL = "http://127.0.0.1:8000"
|
|
||||||
|
|
||||||
def check_server():
    """Return True when ``GET {BASE_URL}/health`` answers ``{"status": "ok"}``.

    Any request, HTTP, or JSON-decoding failure is treated as "not ready"
    and yields False, so the function is safe to call in a polling loop.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
            return data.get("status") == "ok"
    except Exception:
        # Deliberately broad, but not bare: KeyboardInterrupt and SystemExit
        # must still propagate so Ctrl+C can stop the caller's wait loop.
        return False
|
|
||||||
|
|
||||||
def run_benchmark(prompt, max_tokens=100, label="Test"):
    """Send one chat-completion request and measure its wall-clock throughput.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Value of the ``max_tokens`` field in the request.
        label: Free-form tag copied into the result.

    Returns:
        dict with ``label``, ``prompt_tokens``, ``completion_tokens``,
        ``elapsed`` (seconds for the whole HTTP round trip),
        ``gen_tps_approx`` (completion tokens divided by ``elapsed``) and
        ``content_preview`` (first 100 characters of the reply).

    Raises:
        Whatever ``urlopen`` / ``json.loads`` raise on transport or decoding
        failure, and ``KeyError`` / ``IndexError`` when the response carries
        no ``choices[0].message``.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted and distort the measurement.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=300) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    # "content" / "usage" may be present but null; normalise both so the
    # slicing and lookups below cannot fail on None.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)

    # Approximate: elapsed also covers prompt evaluation and HTTP overhead.
    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:100]
    }
|
|
||||||
|
|
||||||
def main():
    """Wait for the server, run a warmup, then three timed runs and a summary.

    Returns early (after printing an error) when the server does not report
    healthy within 30 polls spaced two seconds apart.
    """
    banner = "=" * 60
    print(banner)
    print(" LLM Performance Benchmark Tool")
    print(banner)
    print()

    # Step 1: poll the health endpoint until it answers or we give up.
    print("[1/3] Checking server health...")
    attempt = 0
    while not check_server():
        attempt += 1
        print(f" -> Waiting for server... ({attempt}/30)")
        time.sleep(2)
        if attempt == 30:
            print(" -> ERROR: Server not responding after 60s")
            return
    print(" -> Server is ready!")

    # Step 2: one short request so the timed runs do not pay first-call costs.
    print()
    print("[2/3] Warmup run (short)...")
    try:
        warm = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
    except Exception as err:
        print(f" -> Warmup failed: {err}")
    else:
        try:
            print(f" -> Warmup done: {warm['completion_tokens']} tokens in {warm['elapsed']:.2f}s")
        except Exception as err:
            print(f" -> Warmup failed: {err}")

    # Step 3: the timed runs, all with the same prompt.
    print()
    print("[3/3] Running main benchmark...")
    print("-" * 60)

    test_prompt = "Count from 1 to 50, writing each number on a new line."

    results = []
    for run_no in (1, 2, 3):
        print(f" Run {run_no}/3...")
        try:
            outcome = run_benchmark(test_prompt, max_tokens=200, label=f"Run {run_no}")
            results.append(outcome)
            line = (
                f" Tokens: {outcome['completion_tokens']} | "
                f"Time: {outcome['elapsed']:.2f}s | "
                f"Speed: {outcome['gen_tps_approx']:.2f} t/s (approx)"
            )
            print(line)
        except Exception as err:
            print(f" ERROR: {err}")

    if not results:
        return

    rates = [entry["gen_tps_approx"] for entry in results]
    avg_tps = sum(rates) / len(results)
    max_tps = max(rates)
    min_tps = min(rates)

    print()
    print(banner)
    print(" RESULTS SUMMARY")
    print(banner)
    print(f" Runs: {len(results)}")
    print(f" Avg TPS: {avg_tps:.2f} t/s (approx, includes prompt eval)")
    print(f" Min TPS: {min_tps:.2f} t/s")
    print(f" Max TPS: {max_tps:.2f} t/s")
    print()
    for note in (
        " NOTE: Check server console for exact generation t/s",
        " (the 'eval time' line shows pure token generation speed)",
    ):
        print(note)
    print(banner)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -1,169 +0,0 @@
|
|||||||
import time
|
|
||||||
import json
|
|
||||||
import urllib.request
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
|
|
||||||
try:
|
|
||||||
sys.stdout.reconfigure(encoding='utf-8')
|
|
||||||
except AttributeError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
BASE_URL = "http://127.0.0.1:8000"
|
|
||||||
|
|
||||||
def check_server():
    """Return True when ``GET {BASE_URL}/health`` answers ``{"status": "ok"}``.

    Any request, HTTP, or JSON-decoding failure is treated as "not ready"
    and yields False, so the function is safe to call in a polling loop.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
            return data.get("status") == "ok"
    except Exception:
        # Deliberately broad, but not bare: KeyboardInterrupt and SystemExit
        # must still propagate so Ctrl+C can stop the caller's wait loop.
        return False
|
|
||||||
|
|
||||||
def check_slots():
    """Return the decoded JSON body of ``GET {BASE_URL}/slots``, or None.

    None is returned on any request or decoding failure.

    NOTE(review): the original docstring said this exposes VRAM usage
    details; nothing in this function inspects the payload, so confirm what
    the endpoint actually reports before relying on that.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/slots")
        with urllib.request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read())
    except Exception:
        # Broad but not bare, so KeyboardInterrupt / SystemExit still
        # propagate to the caller.
        return None
|
|
||||||
|
|
||||||
def run_benchmark(prompt, max_tokens=300, label="Test"):
    """Send one chat-completion request and measure its wall-clock throughput.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Value of the ``max_tokens`` field in the request.
        label: Free-form tag copied into the result.

    Returns:
        dict with ``label``, ``prompt_tokens``, ``completion_tokens``,
        ``elapsed`` (seconds for the whole HTTP round trip),
        ``gen_tps_approx`` (completion tokens divided by ``elapsed``) and
        ``content_preview`` (first 150 characters of the reply).

    Raises:
        Whatever ``urlopen`` / ``json.loads`` raise on transport or decoding
        failure, and ``KeyError`` / ``IndexError`` when the response carries
        no ``choices[0].message``.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted and distort the measurement.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    # "content" / "usage" may be present but null; normalise both so the
    # slicing and lookups below cannot fail on None.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)

    # Approximate: elapsed also covers prompt evaluation and HTTP overhead.
    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:150]
    }
|
|
||||||
|
|
||||||
def main():
    """Wait for the server, warm it up, run five timed prompts and summarise.

    Returns early (after printing an error) when the health check has not
    succeeded within ``max_wait`` seconds of five-second polls. The summary
    compares the mean approximate tokens/second against a 10 t/s target.
    """
    banner = "=" * 70
    print(banner)
    print(" Qwen3.5 122B-A10B Performance Benchmark")
    print(" Target: 10+ t/s generation speed")
    print(banner)
    print()

    # Step 1: poll the health endpoint every 5 s, reporting every 30 s.
    print("[1/4] Waiting for server (122B model load takes 3-5 min)...")
    max_wait = 600  # 10 minutes max
    waited = 0
    while waited < max_wait:
        if check_server():
            print(f" -> Server is ready! (waited {waited}s)")
            break
        if waited % 30 == 0:
            print(f" -> Loading model... ({waited}s / {max_wait}s)")
        time.sleep(5)
        waited += 5
    else:
        print(f" -> ERROR: Server not responding after {max_wait}s")
        return

    # Step 2: optional slot report.
    print()
    print("[2/4] Checking server status...")
    slot_info = check_slots()
    if slot_info:
        print(f" -> Slots available: {len(slot_info)}")

    # Step 3: one short request before the timed runs.
    print()
    print("[3/4] Warmup run (short, pre-heating GPU caches)...")
    try:
        warm = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f" -> Warmup done: {warm['completion_tokens']} tokens in {warm['elapsed']:.2f}s")
        print(f" -> Warmup speed: {warm['gen_tps_approx']:.2f} t/s (includes prompt eval)")
    except Exception as err:
        print(f" -> Warmup failed: {err}")

    # Step 4: five timed runs, one per prompt.
    print()
    print("[4/4] Running main benchmark (5 runs x 300 tokens)...")
    print("-" * 70)

    test_prompts = [
        "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.",
        "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.",
        "Describe the complete process of photosynthesis in plants. Include both light-dependent and light-independent reactions.",
        "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.",
        "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.",
    ]

    results = []
    for run_no, prompt in enumerate(test_prompts, start=1):
        print(f"\n Run {run_no}/5: {prompt[:50]}...")
        try:
            outcome = run_benchmark(prompt, max_tokens=300, label=f"Run {run_no}")
            results.append(outcome)
            print(f" Completion tokens: {outcome['completion_tokens']}")
            print(f" Total time: {outcome['elapsed']:.2f}s")
            print(f" Approx speed: {outcome['gen_tps_approx']:.2f} t/s (includes prompt eval)")
        except Exception as err:
            print(f" ERROR: {err}")

    if not results:
        return

    rates = [entry["gen_tps_approx"] for entry in results]
    avg_tps = sum(rates) / len(results)
    max_tps = max(rates)
    min_tps = min(rates)
    total_tokens = sum(entry["completion_tokens"] for entry in results)
    total_time = sum(entry["elapsed"] for entry in results)

    print()
    print(banner)
    print(" RESULTS SUMMARY - Qwen3.5 122B-A10B")
    print(banner)
    print(f" Runs completed: {len(results)}/5")
    print(f" Total tokens: {total_tokens}")
    print(f" Total time: {total_time:.1f}s")
    print()
    print(f" Approx TPS (avg): {avg_tps:.2f} t/s")
    print(f" Approx TPS (min): {min_tps:.2f} t/s")
    print(f" Approx TPS (max): {max_tps:.2f} t/s")
    print()

    # Verdict against the 10 t/s target (8 t/s counts as "close").
    if avg_tps < 8:
        verdict = f" ❌ BELOW TARGET: {avg_tps:.1f} t/s < 10 t/s"
    elif avg_tps < 10:
        verdict = " ⚠️ CLOSE TO TARGET: Consider further tuning"
    else:
        verdict = " ✅ TARGET ACHIEVED: 10+ t/s!"
    print(verdict)

    print()
    for note in (
        " ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.",
        " ⚡ Check the server console/log for exact 'eval time' t/s value,",
        " ⚡ which shows pure token generation speed (always higher).",
    ):
        print(note)
    print(banner)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -1,8 +1,30 @@
|
|||||||
@echo off
|
@echo off
|
||||||
|
chcp 65001 >nul
|
||||||
echo =========================================================
|
echo =========================================================
|
||||||
echo Gemma4 26B-A4B API Server (Tuned for Max Speed)
|
echo Gemma4 26B-A4B API Server (256K Context - Final Optimal)
|
||||||
echo [INFO] Tuning VRAM limit correctly to avoid WDDM swap (-ngl 22)
|
echo [CORE] --n-cpu-moe 10: VRAM 12GB 최적화용 Expert 오프로드
|
||||||
|
echo [TUNED] -t 4 -ub 512: CPU 병목 방지 및 SWA 캐시 최적화
|
||||||
|
echo [PERF] Speed: ~30.9 t/s (1x RTX 3060)
|
||||||
echo =========================================================
|
echo =========================================================
|
||||||
echo.
|
echo.
|
||||||
llama_bin_run\llama-server.exe --model models\gemma-4-26B-A4B-it-Q4_K_M.gguf -ngl 22 -c 4096 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 8 --mlock --prio 2 --port 8000 --host 0.0.0.0
|
|
||||||
|
llama_bin_run\llama-server.exe ^
|
||||||
|
--model models\gemma-4-26B-A4B-it-Q4_K_M.gguf ^
|
||||||
|
-ngl 999 ^
|
||||||
|
--n-cpu-moe 10 ^
|
||||||
|
-c 262144 ^
|
||||||
|
-np 1 ^
|
||||||
|
-fa on ^
|
||||||
|
--cache-type-k q4_0 ^
|
||||||
|
--cache-type-v q4_0 ^
|
||||||
|
-ub 512 ^
|
||||||
|
-b 2048 ^
|
||||||
|
-t 4 ^
|
||||||
|
-tb 4 ^
|
||||||
|
--mlock ^
|
||||||
|
--prio 3 ^
|
||||||
|
--poll 50 ^
|
||||||
|
--port 8000 ^
|
||||||
|
--host 0.0.0.0
|
||||||
|
|
||||||
pause
|
pause
|
||||||
|
|||||||
Reference in New Issue
Block a user