chore: initial project setup with agent guide

This commit is contained in:
Variet-Worker
2026-04-05 00:43:39 +09:00
commit 7890ff6644
1368 changed files with 213076 additions and 0 deletions

372
scripts/auto_tune_122b.py Normal file
View File

@@ -0,0 +1,372 @@
"""
Qwen3.5 122B-A10B 자동 정밀 튜닝 스크립트
===========================================
각 설정 조합으로 서버를 재시작하고 벤치마크를 자동 수행합니다.
서버 로그에서 순수 eval time (t/s)를 파싱하여 정확한 비교 테이블을 출력합니다.
예상 소요 시간: 약 30-40분 (5개 설정 × ~6-7분/설정)
"""
import subprocess
import time
import json
import urllib.request
import os
import re
import sys
import datetime
# Switch stdout to UTF-8 so the Korean status text and emoji printed by this
# script are encodable; stdout objects without reconfigure() are left as-is.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

BASE_URL = "http://127.0.0.1:8000"
MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
SERVER_EXE = r"llama_bin_run\llama-server.exe"

# ============================================================
# Configurations under test
# ============================================================
# Arguments shared by every run (these are never varied).
# NOTE(review): the server listens on 0.0.0.0 while BASE_URL only ever talks
# to 127.0.0.1 -- confirm that exposing it on all interfaces is intended.
COMMON_ARGS = [
    "--model", MODEL_PATH,
    "-ngl", "999",
    "--cpu-moe",
    "-c", "2048",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "256",
    "-b", "1024",
    "--mlock",
    "--port", "8000",
    "--host", "0.0.0.0",
    "--no-warmup",  # the warm-up request is issued by this script instead
]

# Variable arguments: one entry per run, appended after COMMON_ARGS.
# Each row is (display name, description, extra command-line arguments).
CONFIGS = [
    {"name": name, "desc": desc, "extra": extra}
    for name, desc, extra in (
        (
            "A) --no-mmap -t 8",
            "서버 권장: mmap 비활성화 (baseline 대비)",
            ["--no-mmap", "-t", "8", "--prio", "2"],
        ),
        (
            "B) --no-mmap -t 6",
            "스레드 감소 (캐시 경합 회피)",
            ["--no-mmap", "-t", "6", "--prio", "2"],
        ),
        (
            "C) --no-mmap -t 10",
            "스레드 증가 (RAM 대역폭 포화)",
            ["--no-mmap", "-t", "10", "--prio", "2"],
        ),
        (
            "D) --no-mmap -t 12",
            "더 많은 스레드",
            ["--no-mmap", "-t", "12", "--prio", "2"],
        ),
        (
            "E) --no-mmap -t 10 --prio 3 --poll 100",
            "최적 스레드 + 리얼타임 우선순위 + 폴링",
            ["--no-mmap", "-t", "10", "--prio", "3", "--poll", "100"],
        ),
    )
]
# ============================================================
# Utility functions
# ============================================================
def kill_server(settle_seconds=3):
    """Force-kill every llama-server.exe process, then wait briefly.

    The processes are matched by image name, so any llama-server.exe on the
    machine is terminated, not only one started by this script.

    Args:
        settle_seconds: How long to sleep after issuing the kill so the
            process can go away before the caller continues.
    """
    try:
        # An argument list avoids going through the shell; the exit status is
        # deliberately ignored because "no such process" is not an error here.
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except OSError:
        # taskkill itself could not be launched (e.g. not a Windows host).
        # Killing is best-effort, exactly as the ignored exit status was.
        pass
    time.sleep(settle_seconds)
def start_server(config, log_path):
    """Start llama-server with COMMON_ARGS plus the configuration's extras.

    Args:
        config: Mapping whose "extra" entry is the list of additional
            command-line arguments for this run.
        log_path: File that receives the server's stdout and stderr; it is
            truncated if it already exists.

    Returns:
        A ``(proc, log_file)`` tuple: the ``subprocess.Popen`` handle and the
        open log file. The caller is responsible for closing ``log_file``.

    Raises:
        OSError: If the log file cannot be opened or the server executable
            cannot be launched.
    """
    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            cwd=os.getcwd(),
        )
    except Exception:
        # Do not leak the log handle when the executable fails to launch.
        log_file.close()
        raise
    return proc, log_file
def wait_for_server(timeout=600, poll_interval=5):
    """Poll ``{BASE_URL}/health`` until the server reports status "ok".

    Args:
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between two polls.

    Returns:
        True as soon as the health endpoint answers ``{"status": "ok"}``,
        False if that did not happen before ``timeout`` elapsed.
    """
    # monotonic() is immune to wall-clock adjustments during a long wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Any failure (connection refused, HTTP error, bad JSON) just
            # means "not ready yet". Unlike a bare except, this lets
            # KeyboardInterrupt/SystemExit abort the wait.
            pass
        time.sleep(poll_interval)
    return False
def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and time the full round trip.

    Args:
        prompt: User message sent as the only chat message.
        max_tokens: Generation limit passed to the server.

    Returns:
        ``(completion_tokens, elapsed_seconds)``; the token count comes from
        the response's "usage" section and is 0 when it is missing.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    started = time.time()
    with urllib.request.urlopen(request, timeout=600) as resp:
        reply = json.loads(resp.read())
    elapsed = time.time() - started

    return reply.get("usage", {}).get("completion_tokens", 0), elapsed
# Common tail of both timing lines:
# "... = XXXXX.XX ms / NNN tokens (XXX.XX ms per token, XX.XX tokens per second)"
_TIMING_TAIL = (
    r'\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*'
    r'\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
)
# The generation line must start with whitespace followed directly by
# "eval time", which keeps it from matching the "prompt eval time" line.
_EVAL_TIME_RE = re.compile(r'^\s+eval time' + _TIMING_TAIL, re.MULTILINE)
_PROMPT_EVAL_TIME_RE = re.compile(r'prompt eval time' + _TIMING_TAIL, re.MULTILINE)


def _parse_timing_lines(log_path, pattern):
    """Return one dict per line of ``log_path`` matched by ``pattern``.

    An unreadable or missing log yields an empty list.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        return []
    return [
        {
            "total_ms": float(total_ms),
            "tokens": int(tokens),
            "ms_per_token": float(ms_per_token),
            "tps": float(tps),
        }
        for total_ms, tokens, ms_per_token, tps in pattern.findall(content)
    ]


def parse_eval_times(log_path):
    """Parse the pure generation ("eval time") lines from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        A list, in log order, of dicts with the keys "total_ms", "tokens",
        "ms_per_token" and "tps"; empty if the log cannot be read.
    """
    return _parse_timing_lines(log_path, _EVAL_TIME_RE)


def parse_prompt_eval_times(log_path):
    """Parse the "prompt eval time" lines from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        A list, in log order, of dicts with the keys "total_ms", "tokens",
        "ms_per_token" and "tps"; empty if the log cannot be read.
    """
    return _parse_timing_lines(log_path, _PROMPT_EVAL_TIME_RE)
def parse_vram_usage(log_path):
    """Extract the CUDA0 model buffer size from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        The size rounded to whole MiB as a string such as "5392 MiB", or
        "N/A" when the log cannot be read or contains no such line.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Only I/O problems mean "no data"; anything else should surface.
        return "N/A"
    match = re.search(r'CUDA0 model buffer size\s*=\s*([\d.]+)\s*MiB', content)
    if match is None:
        return "N/A"
    return f"{float(match.group(1)):.0f} MiB"
# ============================================================
# Main tuning loop
# ============================================================
# Reference figures hard-coded from an earlier run with
# "mmap on, -t 8, --prio 2"; every comparison below uses these.
_BASELINE_AVG_TPS = 10.02
_BASELINE_MAX_TPS = 10.06
_BASELINE_PROMPT_TPS = 29.52
_BASELINE_VRAM = "5392 MiB"

# Prompts for the measured runs (one request each).
_BENCH_PROMPTS = [
    "Write a detailed explanation of how neural networks learn through backpropagation and gradient descent.",
    "Explain the complete process of photosynthesis including light and dark reactions in detail.",
    "Describe the differences between SQL and NoSQL databases with examples and performance characteristics.",
]


def _without_warmup(samples, warmup_ok):
    """Drop the first sample when it belongs to a successful warm-up.

    A lone sample is always kept so that a result is never discarded
    entirely.
    """
    if warmup_ok and len(samples) > 1:
        return samples[1:]
    return samples


def _run_config(idx, config):
    """Restart the server with ``config``, benchmark it, return its record."""
    config_start = time.time()
    log_path = os.path.join(os.getcwd(), f"tune_log_{idx}.txt")
    print(f"\n{'='*70}")
    print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}")
    print(f" {config['desc']}")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f"{'='*70}")

    # 1. Stop any server that is still running.
    print(" [1/4] 서버 종료 중...")
    kill_server()

    # 2. Start a fresh server for this configuration.
    print(" [2/4] 서버 시작 중... (모델 로딩 ~3-5분)")
    _proc, log_file = start_server(config, log_path)

    ready = False
    warmup_ok = False
    try:
        # 3. Wait until the server is ready.
        ready = wait_for_server(timeout=600)
        if not ready:
            print(" ❌ 서버 시작 실패! 다음 설정으로 넘어갑니다.")
        else:
            load_time = time.time() - config_start
            print(f" [3/4] 서버 준비 완료! (로딩 {load_time:.0f}초)")

            # 4. Benchmark: one warm-up request plus the measured runs.
            print(" [4/4] 벤치마크 실행 중...")
            try:
                run_single_benchmark("Say hello.", max_tokens=20)
                warmup_ok = True
                print(" 워밍업 완료")
            except Exception as e:
                print(f" 워밍업 실패: {e}")

            total_runs = len(_BENCH_PROMPTS)
            for run_no, prompt in enumerate(_BENCH_PROMPTS, start=1):
                try:
                    tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                    approx_tps = tokens / elapsed if elapsed > 0 else 0
                    print(f" Run {run_no}/{total_runs}: {tokens} tokens, {elapsed:.1f}s, ~{approx_tps:.2f} t/s (approx)")
                except Exception as e:
                    print(f" Run {run_no}/{total_runs}: ERROR - {e}")

            # Give the server a moment to flush its log before it is killed.
            time.sleep(2)
    finally:
        # Always stop the server and release the log handle, even when the
        # run is interrupted, so nothing is left behind for the next run.
        kill_server()
        log_file.close()

    if not ready:
        return {
            "config": config["name"],
            "status": "FAILED",
            "eval_tps": [],
            "prompt_tps": [],
            "vram": "N/A",
        }

    time.sleep(2)

    # Parse the server log. The first timing entry is the warm-up request,
    # but only if that request actually succeeded; otherwise the first entry
    # is a real measurement and must be kept.
    eval_times = _without_warmup(parse_eval_times(log_path), warmup_ok)
    prompt_times = _without_warmup(parse_prompt_eval_times(log_path), warmup_ok)
    result = {
        "config": config["name"],
        "status": "OK",
        "eval_tps": [e["tps"] for e in eval_times],
        "prompt_tps": [p["tps"] for p in prompt_times],
        "vram": parse_vram_usage(log_path),
    }

    config_elapsed = time.time() - config_start
    print(f"\n 완료! 소요: {config_elapsed:.0f}초")
    eval_speeds = result["eval_tps"]
    if eval_speeds:
        avg_eval = sum(eval_speeds) / len(eval_speeds)
        max_eval = max(eval_speeds)
        print(f" 📊 Eval TPS: avg={avg_eval:.2f}, max={max_eval:.2f}")
    return result


def _print_comparison_table(all_results):
    """Print the final table and the best configuration by average t/s."""
    print("\n")
    print("=" * 80)
    print(" 🏆 최종 결과 비교 테이블")
    print("=" * 80)
    print()
    print(f" {'설정':<45} {'Eval t/s':>10} {'최대':>8} {'Prompt t/s':>12} {'VRAM':>12}")
    print(f" {'-'*45} {'-'*10} {'-'*8} {'-'*12} {'-'*12}")
    # Baseline row (result of the earlier run).
    print(
        f" {'[기준] mmap on, -t 8, --prio 2':<45} {_BASELINE_AVG_TPS:>10.2f}"
        f" {_BASELINE_MAX_TPS:>8.2f} {_BASELINE_PROMPT_TPS:>12.2f} {_BASELINE_VRAM:>12}"
    )

    best_avg = 0
    best_config = ""
    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f" {r['config']:<45} {'FAILED':>10} {'':>8} {'':>12} {r['vram']:>12}")
            continue
        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
        if avg_e > best_avg:
            best_avg = avg_e
            best_config = r["config"]
        print(f" {r['config']:<45} {avg_e:>10.2f} {max_e:>8.2f} {avg_p:>12.2f} {r['vram']:>12}")

    print()
    if best_avg > 0:
        improvement = ((best_avg - _BASELINE_AVG_TPS) / _BASELINE_AVG_TPS) * 100
        print(f" 🏆 최고 성능: {best_config}")
        print(f"{best_avg:.2f} t/s (기준 {_BASELINE_AVG_TPS:.2f} t/s 대비 {improvement:+.1f}%)")
    print()
    print(f" 완료 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 80)


def main():
    """Benchmark every entry of CONFIGS and report the comparison.

    For each configuration the server is restarted, one warm-up and three
    measured requests are sent, and the timings are read back from the
    server log. A comparison table is printed and the per-configuration
    eval speeds are written to ``tune_results_<timestamp>.txt`` in the
    current directory.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    print("=" * 70)
    print(" Qwen3.5 122B-A10B 자동 정밀 튜닝")
    print(f" 시작 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f" 테스트 설정: {len(CONFIGS)}")
    # Roughly 7 minutes per configuration.
    print(f" 예상 소요: ~{len(CONFIGS) * 7}분")
    print("=" * 70)
    print()
    print(f" 기존 Baseline: mmap on, -t 8, --prio 2 → {_BASELINE_MAX_TPS:.2f} t/s (eval)")
    print()

    all_results = [_run_config(idx, config) for idx, config in enumerate(CONFIGS)]

    _print_comparison_table(all_results)

    # Also persist the results to a file.
    result_path = os.path.join(os.getcwd(), f"tune_results_{timestamp}.txt")
    with open(result_path, "w", encoding="utf-8") as f:
        f.write("Qwen3.5 122B-A10B Fine Tuning Results\n")
        f.write(f"Date: {timestamp}\n\n")
        for r in all_results:
            f.write(f"{r['config']}: {r['eval_tps']} (VRAM: {r['vram']})\n")
    print(f" 결과 저장: {result_path}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,257 @@
"""
Qwen3.5 122B-A10B 정밀 튜닝 2라운드
====================================
1라운드 결과: mmap on이 더 빠르고, 스레드 수가 적을수록 빠름
→ mmap on 상태에서 스레드 수 4~8 범위를 정밀 탐색
"""
import subprocess
import time
import json
import urllib.request
import os
import re
import sys
import datetime
# Switch stdout to UTF-8 so the Korean status text and emoji printed by this
# script are encodable; stdout objects without reconfigure() are left as-is.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

BASE_URL = "http://127.0.0.1:8000"
MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
SERVER_EXE = r"llama_bin_run\llama-server.exe"

# Arguments shared by every run; the per-run extras are appended after them.
# NOTE(review): the server listens on 0.0.0.0 while BASE_URL only ever talks
# to 127.0.0.1 -- confirm that exposing it on all interfaces is intended.
COMMON_ARGS = [
    "--model", MODEL_PATH,
    "-ngl", "999",
    "--cpu-moe",
    "-c", "2048",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "256",
    "-b", "1024",
    "--mlock",
    "--port", "8000",
    "--host", "0.0.0.0",
    "--no-warmup",
]

# Round-2 runs: no --no-mmap flag, thread counts 4 to 7.
# Each row is (display name, description, extra command-line arguments).
CONFIGS = [
    {"name": name, "desc": desc, "extra": extra}
    for name, desc, extra in (
        (
            "F) mmap on, -t 4",
            "최소 스레드 (4개, 물리코어 절반)",
            ["-t", "4", "--prio", "2"],
        ),
        (
            "G) mmap on, -t 5",
            "스레드 5개",
            ["-t", "5", "--prio", "2"],
        ),
        (
            "H) mmap on, -t 6",
            "스레드 6개 (--no-mmap에서 최고였음)",
            ["-t", "6", "--prio", "2"],
        ),
        (
            "I) mmap on, -t 7",
            "스레드 7개",
            ["-t", "7", "--prio", "2"],
        ),
        (
            "J) mmap on, -t 6, --prio 3",
            "최적 스레드 + 리얼타임 우선순위",
            ["-t", "6", "--prio", "3"],
        ),
    )
]
def kill_server(settle_seconds=3):
    """Force-kill every llama-server.exe process, then wait briefly.

    The processes are matched by image name, so any llama-server.exe on the
    machine is terminated, not only one started by this script.

    Args:
        settle_seconds: How long to sleep after issuing the kill so the
            process can go away before the caller continues.
    """
    try:
        # An argument list avoids going through the shell; the exit status is
        # deliberately ignored because "no such process" is not an error here.
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except OSError:
        # taskkill itself could not be launched (e.g. not a Windows host).
        # Killing is best-effort, exactly as the ignored exit status was.
        pass
    time.sleep(settle_seconds)
def start_server(config, log_path):
    """Start llama-server with COMMON_ARGS plus the configuration's extras.

    Args:
        config: Mapping whose "extra" entry is the list of additional
            command-line arguments for this run.
        log_path: File that receives the server's stdout and stderr; it is
            truncated if it already exists.

    Returns:
        A ``(proc, log_file)`` tuple: the ``subprocess.Popen`` handle and the
        open log file. The caller is responsible for closing ``log_file``.

    Raises:
        OSError: If the log file cannot be opened or the server executable
            cannot be launched.
    """
    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, cwd=os.getcwd())
    except Exception:
        # Do not leak the log handle when the executable fails to launch.
        log_file.close()
        raise
    return proc, log_file
def wait_for_server(timeout=600, poll_interval=5):
    """Poll ``{BASE_URL}/health`` until the server reports status "ok".

    Args:
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between two polls.

    Returns:
        True as soon as the health endpoint answers ``{"status": "ok"}``,
        False if that did not happen before ``timeout`` elapsed.
    """
    # monotonic() is immune to wall-clock adjustments during a long wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Any failure (connection refused, HTTP error, bad JSON) just
            # means "not ready yet". Unlike a bare except, this lets
            # KeyboardInterrupt/SystemExit abort the wait.
            pass
        time.sleep(poll_interval)
    return False
def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and time the full round trip.

    Args:
        prompt: User message sent as the only chat message.
        max_tokens: Generation limit passed to the server.

    Returns:
        ``(completion_tokens, elapsed_seconds)``; the token count comes from
        the response's "usage" section and is 0 when it is missing.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    started = time.time()
    with urllib.request.urlopen(request, timeout=600) as resp:
        reply = json.loads(resp.read())
    elapsed = time.time() - started

    token_count = reply.get("usage", {}).get("completion_tokens", 0)
    return token_count, elapsed
# Common tail of both timing lines:
# "... = XXXXX.XX ms / NNN tokens (XXX.XX ms per token, XX.XX tokens per second)"
_TIMING_TAIL = (
    r'\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*'
    r'\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
)
# The generation line must start with whitespace followed directly by
# "eval time", which keeps it from matching the "prompt eval time" line.
_EVAL_TIME_RE = re.compile(r'^\s+eval time' + _TIMING_TAIL, re.MULTILINE)
_PROMPT_EVAL_TIME_RE = re.compile(r'prompt eval time' + _TIMING_TAIL, re.MULTILINE)


def _find_timings(log_path, pattern):
    """Return the capture-group tuples of every ``pattern`` match in the log.

    An unreadable or missing log yields an empty list.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        return []
    return pattern.findall(content)


def parse_eval_times(log_path):
    """Parse the pure generation ("eval time") lines from a server log.

    Returns:
        A list, in log order, of ``{"tps": float, "tokens": int}`` dicts;
        empty if the log cannot be read.
    """
    return [
        {"tps": float(m[3]), "tokens": int(m[1])}
        for m in _find_timings(log_path, _EVAL_TIME_RE)
    ]


def parse_prompt_eval_times(log_path):
    """Parse the "prompt eval time" lines from a server log.

    Returns:
        A list, in log order, of ``{"tps": float}`` dicts; empty if the log
        cannot be read.
    """
    return [{"tps": float(m[3])} for m in _find_timings(log_path, _PROMPT_EVAL_TIME_RE)]
# Prompts for the measured runs (one request each).
_BENCH_PROMPTS = [
    "Write a detailed explanation of how neural networks learn through backpropagation.",
    "Explain the complete process of photosynthesis including light and dark reactions.",
    "Describe the differences between SQL and NoSQL databases with examples.",
]

# Round-1 results (hard-coded): (name, average t/s, max t/s, prompt t/s).
_ROUND1_RESULTS = [
    ("[기준] mmap on, -t 8, --prio 2", 10.02, 10.06, 29.52),
    ("A) --no-mmap -t 8", 9.66, 9.70, 28.26),
    ("B) --no-mmap -t 6", 10.02, 10.18, 26.73),
    ("C) --no-mmap -t 10", 9.42, 9.46, 27.31),
    ("D) --no-mmap -t 12", 9.04, 9.11, 27.92),
    ("E) --no-mmap -t 10 --prio 3 --poll 100", 9.41, 9.45, 28.37),
]


def _without_warmup(samples, warmup_ok):
    """Drop the first sample when it belongs to a successful warm-up.

    A lone sample is always kept so that a result is never discarded
    entirely.
    """
    if warmup_ok and len(samples) > 1:
        return samples[1:]
    return samples


def _run_config(idx, config):
    """Restart the server with ``config``, benchmark it, return its record."""
    config_start = time.time()
    log_path = os.path.join(os.getcwd(), f"tune_r2_log_{idx}.txt")
    print(f"\n{'='*70}")
    print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}")
    print(f" {config['desc']}")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f"{'='*70}")

    kill_server()
    print(" [1/3] 서버 시작 중...")
    _proc, log_file = start_server(config, log_path)

    ready = False
    warmup_ok = False
    try:
        ready = wait_for_server(timeout=600)
        if not ready:
            print(" ❌ 서버 시작 실패!")
        else:
            load_time = time.time() - config_start
            print(f" [2/3] 서버 준비 완료! ({load_time:.0f}초)")

            # Warm-up request, then the measured runs.
            try:
                run_single_benchmark("Say hello.", max_tokens=20)
                warmup_ok = True
            except Exception as e:
                # Report instead of hiding it: a failed warm-up changes which
                # log entries count as measurements (see _without_warmup).
                print(f" 워밍업 실패: {e}")

            print(" [3/3] 벤치마크 3회...")
            for run_no, prompt in enumerate(_BENCH_PROMPTS, start=1):
                try:
                    tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                    approx_tps = tokens / elapsed if elapsed > 0 else 0
                    print(f" Run {run_no}: {tokens}tok, {elapsed:.1f}s, ~{approx_tps:.2f} t/s")
                except Exception as e:
                    print(f" Run {run_no}: ERROR - {e}")

            # Give the server a moment to flush its log before it is killed.
            time.sleep(2)
    finally:
        # Always stop the server and release the log handle, even when the
        # run is interrupted, so nothing is left behind for the next run.
        kill_server()
        log_file.close()

    if not ready:
        return {"config": config["name"], "status": "FAILED", "eval_tps": [], "prompt_tps": []}

    time.sleep(2)

    # The first timing entry in the log is the warm-up request, but only if
    # that request actually succeeded.
    eval_times = _without_warmup(parse_eval_times(log_path), warmup_ok)
    prompt_times = _without_warmup(parse_prompt_eval_times(log_path), warmup_ok)
    eval_speeds = [e["tps"] for e in eval_times]
    prompt_speeds = [p["tps"] for p in prompt_times]
    if eval_speeds:
        print(f" 📊 Eval: avg={sum(eval_speeds)/len(eval_speeds):.2f}, max={max(eval_speeds):.2f}")
    return {
        "config": config["name"],
        "status": "OK",
        "eval_tps": eval_speeds,
        "prompt_tps": prompt_speeds,
    }


def _print_combined_table(all_results):
    """Print round-1 and round-2 results and the best config by max t/s."""
    print("\n")
    print("=" * 85)
    print(" 🏆 전체 튜닝 결과 (1라운드 + 2라운드 통합)")
    print("=" * 85)
    print()
    print(f" {'설정':<48} {'Avg t/s':>8} {'Max t/s':>8} {'Prompt':>8}")
    print(f" {'-'*48} {'-'*8} {'-'*8} {'-'*8}")

    for name, avg, mx, pp in _ROUND1_RESULTS:
        print(f" {name:<48} {avg:>8.2f} {mx:>8.2f} {pp:>8.2f}")
    print(f" {'--- 2라운드 ---':<48}")

    # Seed "best so far" from the round-1 table itself so the winner is
    # consistent with the rows printed above; round 2 is compared by max
    # t/s, so the seed uses the max column as well.
    best_config, _, best_max, _ = max(_ROUND1_RESULTS, key=lambda row: row[2])
    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f" {r['config']:<48} {'FAIL':>8}")
            continue
        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
        if max_e > best_max:
            best_max = max_e
            best_config = r["config"]
        print(f" {r['config']:<48} {avg_e:>8.2f} {max_e:>8.2f} {avg_p:>8.2f}")

    print()
    print(f" 🏆 최고 성능: {best_config} → {best_max:.2f} t/s")
    print(f" 완료: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 85)


def main():
    """Run the round-2 sweep over CONFIGS and print the combined table.

    For each configuration the server is restarted, one warm-up and three
    measured requests are sent, and the timings are read back from the
    server log. The final table lists the hard-coded round-1 results
    followed by the round-2 results measured here.
    """
    print("=" * 70)
    print(" Qwen3.5 122B-A10B 정밀 튜닝 - 2라운드")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f" 테스트: {len(CONFIGS)}개 설정 (mmap on + 스레드 4~7 정밀 탐색)")
    print("=" * 70)
    print()

    all_results = [_run_config(idx, config) for idx, config in enumerate(CONFIGS)]
    _print_combined_table(all_results)


if __name__ == "__main__":
    main()

38
scripts/download_llama.py Normal file
View File

@@ -0,0 +1,38 @@
import json
import os
import shutil
import ssl
import urllib.request
import zipfile

# This script downloads executables, so TLS certificates are verified by
# default. Verification can be switched off explicitly (for example behind an
# intercepting proxy) by setting LLAMA_DOWNLOAD_INSECURE=1.
ctx = ssl.create_default_context()
if os.environ.get("LLAMA_DOWNLOAD_INSECURE") == "1":
    print("WARNING: TLS certificate verification is disabled (LLAMA_DOWNLOAD_INSECURE=1).")
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})

try:
    # Fetch the metadata of the latest release.
    with urllib.request.urlopen(req, context=ctx) as response:
        data = json.loads(response.read().decode())

    # Pick the first asset that looks like a Windows x64 CUDA zip.
    download_url = None
    for asset in data['assets']:
        if "bin-win-cuda-cu12.2.0-x64.zip" in asset['name'] or ("bin-win-cu" in asset['name'] and asset['name'].endswith(".zip") and "x64" in asset['name']):
            download_url = asset['browser_download_url']
            break

    if download_url:
        print(f"Downloading {download_url}...")
        zip_path = "llama.zip"
        try:
            # Stream to disk in chunks instead of holding the whole archive
            # in memory.
            with urllib.request.urlopen(download_url, context=ctx) as resp, open(zip_path, 'wb') as out_file:
                shutil.copyfileobj(resp, out_file)
            print("Extracting to 'llama_bin'...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall("llama_bin")
            print("Done extracting.")
        finally:
            # Remove the archive even when the download or extraction failed.
            if os.path.exists(zip_path):
                os.remove(zip_path)
    else:
        print("Could not find the target zip. Available assets:")
        for asset in data['assets']:
            print(" -", asset['name'])
except Exception as e:
    print(f"Error: {e}")

View File

@@ -0,0 +1,33 @@
import importlib.util
import os

# huggingface_hub reads HF_HUB_ENABLE_HF_TRANSFER when it is imported, so the
# variable has to be set BEFORE the import below to have any effect. It is
# only enabled when the optional hf_transfer package is installed, because
# enabling it without the package makes downloads fail.
if importlib.util.find_spec("hf_transfer") is not None:
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import hf_hub_download  # noqa: E402  (must follow the env setup)

# (repo_id, filename) pairs, downloaded in this order.
models = [
    # Start with the smallest download, Gemma4 26B
    ("ggml-org/gemma-4-26B-A4B-it-GGUF", "gemma-4-26B-A4B-it-Q4_K_M.gguf"),
    # Then Qwen 35B
    ("unsloth/Qwen3.5-35B-A3B-GGUF", "Qwen3.5-35B-A3B-Q4_K_M.gguf"),
    # Finally the 122B model (split into three parts)
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"),
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00002-of-00003.gguf"),
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00003-of-00003.gguf")
]

print("=== 고속 다운로더 시작 (huggingface_hub & hf_transfer) ===")
os.makedirs("models", exist_ok=True)

for repo, filename in models:
    print(f"\n>>> 다운로드 중 (백그라운드 진행): [{repo}] 의 [{filename}]...")
    try:
        path = hf_hub_download(
            repo_id=repo,
            filename=filename,
            local_dir="./models",
            local_dir_use_symlinks=False
        )
        print(f"완료: {path}")
    except Exception as e:
        # One failed file must not stop the remaining downloads.
        print(f"다운로드 실패: {e}")

print("\n모든 다운로드 프로세스가 종료되었습니다.")

View File

@@ -0,0 +1,56 @@
import json
import os
import shutil
import ssl
import urllib.request
import zipfile

# This script downloads executables, so TLS certificates are verified by
# default. Verification can be switched off explicitly (for example behind an
# intercepting proxy) by setting LLAMA_DOWNLOAD_INSECURE=1.
ctx = ssl.create_default_context()
if os.environ.get("LLAMA_DOWNLOAD_INSECURE") == "1":
    print("WARNING: TLS certificate verification is disabled (LLAMA_DOWNLOAD_INSECURE=1).")
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})

try:
    # Fetch the metadata of the latest release.
    with urllib.request.urlopen(req, context=ctx) as response:
        data = json.loads(response.read().decode())

    # Pick the CUDA 12.4 Windows x64 zip, skipping any asset whose name
    # contains "cudart".
    download_url = None
    for asset in data['assets']:
        if "bin-win-cuda-12.4-x64.zip" in asset['name'] and "cudart" not in asset['name']:
            download_url = asset['browser_download_url']
            break

    if download_url:
        print(f"Downloading true binaries: {download_url}...")
        zip_path = "llama_main.zip"
        try:
            # Stream to disk in chunks instead of holding the whole archive
            # in memory.
            with urllib.request.urlopen(download_url, context=ctx) as resp, open(zip_path, 'wb') as out_file:
                shutil.copyfileobj(resp, out_file)

            print("Extracting to temporary folder 'llama_temp'...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall("llama_temp")

            print("Moving exact files to 'llama_bin_run'...")
            os.makedirs("llama_bin_run", exist_ok=True)
            # Flatten the archive: every file goes directly into
            # llama_bin_run, whatever sub-folder it was extracted to.
            for root, _dirs, files in os.walk("llama_temp"):
                for file_name in files:
                    shutil.move(os.path.join(root, file_name), os.path.join("llama_bin_run", file_name))

            # Fill in anything from an existing llama_bin folder that the
            # main archive did not provide; existing files are never
            # overwritten.
            if os.path.exists("llama_bin"):
                for item in os.listdir("llama_bin"):
                    src = os.path.join("llama_bin", item)
                    dst = os.path.join("llama_bin_run", item)
                    if not os.path.exists(dst):
                        try:
                            shutil.copy(src, dst)
                        except OSError:
                            # Best-effort: skip entries that cannot be copied
                            # as a file (e.g. sub-directories).
                            pass
        finally:
            # Clean up the temporary artifacts even when a step above failed.
            if os.path.exists(zip_path):
                os.remove(zip_path)
            shutil.rmtree("llama_temp", ignore_errors=True)
        print("Download and path extraction fully complete.")
    else:
        print("Could not find the target zip.")
except Exception as e:
    print(f"Error: {e}")

28
scripts/hf_search.py Normal file
View File

@@ -0,0 +1,28 @@
from huggingface_hub import HfApi
import sys
# Module-level Hugging Face Hub client; search_gguf() uses it for all queries.
api = HfApi()
def search_gguf(query, limit=3):
    """Print the first Hub repos matching ``query`` and their GGUF files.

    For every repo the files whose name contains "q4_k_m" (case-insensitive)
    and ends in ".gguf" are listed; if there are none, the first three
    ".gguf" files are listed instead. Errors are printed, never raised.

    Args:
        query: Search string passed to the Hub model search.
        limit: Maximum number of repos to inspect.
    """
    print(f"\n--- Searching for: {query} ---")
    try:
        found = list(api.list_models(search=query, limit=limit))
    except Exception as e:
        print(f"Error: {e}")
        return
    if not found:
        print("No models found.")
        return

    for m in found:
        print(f"Repo: {m.id}")
        try:
            files = api.list_repo_files(repo_id=m.id)
        except Exception as e:
            # A single repo that cannot be listed must not hide the
            # remaining search hits.
            print(f"Error: {e}")
            continue
        ggufs = [f for f in files if "q4_k_m" in f.lower() and f.endswith(".gguf")]
        if not ggufs:
            ggufs = [f for f in files if f.endswith(".gguf")][:3]
        print(f" GGUFs: {ggufs}")
# Run every search, in this order, as soon as the script is executed.
for query in (
    "122b-a10b gguf",
    "Qwen3.5 122b gguf",
    "35b-a3b gguf",
    "gemma-4 26b gguf",
    "Qwen 122B",
):
    search_gguf(query)

123
scripts/perf_test.py Normal file
View File

@@ -0,0 +1,123 @@
import time
import json
import urllib.request
import sys
# Switch stdout to UTF-8 when the stream supports reconfigure(); other
# stdout objects are left untouched.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

# Address of the server under test; every request below goes here.
BASE_URL = "http://127.0.0.1:8000"
def check_server():
    """Return True if ``{BASE_URL}/health`` answers with status "ok".

    Any failure (connection refused, HTTP error, invalid JSON) counts as
    "not up" and yields False.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        return data.get("status") == "ok"
    except Exception:
        # Unlike a bare except, this does not swallow KeyboardInterrupt, so
        # a polling loop around this function can still be aborted.
        return False
def run_benchmark(prompt, max_tokens=100, label="Test"):
    """Run a single chat-completion request and summarise it.

    Args:
        prompt: User message sent as the only chat message.
        max_tokens: Generation limit passed to the server.
        label: Free-form name stored in the result.

    Returns:
        A dict with "label", "prompt_tokens", "completion_tokens", "elapsed"
        (seconds for the whole request), "gen_tps_approx" (completion tokens
        divided by the whole elapsed time) and "content_preview" (first 100
        characters of the reply).

    Raises:
        urllib.error.URLError: If the request fails.
        KeyError, IndexError: If the response has no first choice/message.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode("utf-8")
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    start = time.time()
    with urllib.request.urlopen(req, timeout=300) as resp:
        result = json.loads(resp.read())
    elapsed = time.time() - start

    # "content" may be present but null in a chat-completion message;
    # normalise it to "" so the preview slice below cannot raise TypeError.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)
    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:100]
    }
def main():
    """Wait for the server, warm it up, then time three identical requests.

    Prints per-run figures and, if at least one run succeeded, a summary
    with the average, minimum and maximum approximate tokens per second.
    """
    banner = "=" * 60
    print(banner)
    print(" LLM Performance Benchmark Tool")
    print(banner)
    print()

    # Step 1: poll the health endpoint, giving up after 30 attempts.
    print("[1/3] Checking server health...")
    attempt = 0
    while not check_server():
        attempt += 1
        print(f" -> Waiting for server... ({attempt}/30)")
        time.sleep(2)
        if attempt == 30:
            print(" -> ERROR: Server not responding after 60s")
            return
    print(" -> Server is ready!")

    # Step 2: one short warm-up request; a failure is only reported.
    print()
    print("[2/3] Warmup run (short)...")
    try:
        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f" -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
    except Exception as e:
        print(f" -> Warmup failed: {e}")

    # Step 3: the measured runs, all with the same prompt.
    print()
    print("[3/3] Running main benchmark...")
    print("-" * 60)
    test_prompt = "Count from 1 to 50, writing each number on a new line."
    results = []
    for run_no in (1, 2, 3):
        print(f" Run {run_no}/3...")
        try:
            outcome = run_benchmark(test_prompt, max_tokens=200, label=f"Run {run_no}")
            results.append(outcome)
            print(
                f" Tokens: {outcome['completion_tokens']} | "
                f"Time: {outcome['elapsed']:.2f}s | "
                f"Speed: {outcome['gen_tps_approx']:.2f} t/s (approx)"
            )
        except Exception as e:
            print(f" ERROR: {e}")

    # Nothing to summarise when every run failed.
    if not results:
        return

    speeds = [r["gen_tps_approx"] for r in results]
    avg_tps = sum(speeds) / len(results)
    max_tps = max(speeds)
    min_tps = min(speeds)

    print()
    print(banner)
    print(" RESULTS SUMMARY")
    print(banner)
    print(f" Runs: {len(results)}")
    print(f" Avg TPS: {avg_tps:.2f} t/s (approx, includes prompt eval)")
    print(f" Min TPS: {min_tps:.2f} t/s")
    print(f" Max TPS: {max_tps:.2f} t/s")
    print()
    print(" NOTE: Check server console for exact generation t/s")
    print(" (the 'eval time' line shows pure token generation speed)")
    print(banner)


if __name__ == "__main__":
    main()

169
scripts/perf_test_122b.py Normal file
View File

@@ -0,0 +1,169 @@
import time
import json
import urllib.request
import sys
import os
import re
# Switch stdout to UTF-8 when the stream supports reconfigure(), so the
# emoji in the verdict lines are encodable; other stdout objects are left
# untouched.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

# Address of the server under test; every request below goes here.
BASE_URL = "http://127.0.0.1:8000"
def check_server():
    """Return True if ``{BASE_URL}/health`` answers with status "ok".

    Any failure (connection refused, HTTP error, invalid JSON) counts as
    "not up" and yields False.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        return data.get("status") == "ok"
    except Exception:
        # Unlike a bare except, this does not swallow KeyboardInterrupt, so
        # the long model-loading wait around this function can be aborted.
        return False
def check_slots():
    """Fetch and return the decoded JSON from ``{BASE_URL}/slots``.

    Returns:
        The parsed response body, or None if the request or the JSON
        decoding fails for any reason.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/slots")
        with urllib.request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read())
    except Exception:
        # The slot info is optional; report "unavailable" but let
        # KeyboardInterrupt/SystemExit through (a bare except would not).
        return None
def run_benchmark(prompt, max_tokens=300, label="Test"):
    """Run a single chat-completion request and summarise it.

    Args:
        prompt: User message sent as the only chat message.
        max_tokens: Generation limit passed to the server.
        label: Free-form name stored in the result.

    Returns:
        A dict with "label", "prompt_tokens", "completion_tokens", "elapsed"
        (seconds for the whole request), "gen_tps_approx" (completion tokens
        divided by the whole elapsed time) and "content_preview" (first 150
        characters of the reply).

    Raises:
        urllib.error.URLError: If the request fails.
        KeyError, IndexError: If the response has no first choice/message.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode("utf-8")
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    start = time.time()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.time() - start

    # "content" may be present but null in a chat-completion message;
    # normalise it to "" so the preview slice below cannot raise TypeError.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)
    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:150]
    }
def main():
    """Benchmark the 122B server with five 300-token requests.

    Waits up to ten minutes for the server, prints the slot count when it
    is available, sends a warm-up request, runs five prompts and finally
    prints a summary with a verdict against the 10 t/s target.
    """
    banner = "=" * 70
    print(banner)
    print(" Qwen3.5 122B-A10B Performance Benchmark")
    print(" Target: 10+ t/s generation speed")
    print(banner)
    print()

    # Step 1: wait for the server (model loading takes 3-5 min for 71 GB).
    print("[1/4] Waiting for server (122B model load takes 3-5 min)...")
    max_wait = 600  # 10 minutes max
    waited = 0
    while not check_server():
        # Progress line every 30 s of nominal waiting time.
        if waited % 30 == 0:
            print(f" -> Loading model... ({waited}s / {max_wait}s)")
        time.sleep(5)
        waited += 5
        if waited >= max_wait:
            print(f" -> ERROR: Server not responding after {max_wait}s")
            return
    print(f" -> Server is ready! (waited {waited}s)")

    # Step 2: optional server info.
    print()
    print("[2/4] Checking server status...")
    slots = check_slots()
    if slots:
        print(f" -> Slots available: {len(slots)}")

    # Step 3: one short warm-up request; a failure is only reported.
    print()
    print("[3/4] Warmup run (short, pre-heating GPU caches)...")
    try:
        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f" -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
        print(f" -> Warmup speed: {warmup['gen_tps_approx']:.2f} t/s (includes prompt eval)")
    except Exception as e:
        print(f" -> Warmup failed: {e}")

    # Step 4: the measured runs, one per prompt.
    print()
    print("[4/4] Running main benchmark (5 runs x 300 tokens)...")
    print("-" * 70)
    test_prompts = [
        "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.",
        "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.",
        "Describe the complete process of photosynthesis in plants. Include both light-dependent and light-independent reactions.",
        "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.",
        "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.",
    ]
    results = []
    for run_no, prompt in enumerate(test_prompts, start=1):
        print(f"\n Run {run_no}/5: {prompt[:50]}...")
        try:
            outcome = run_benchmark(prompt, max_tokens=300, label=f"Run {run_no}")
            results.append(outcome)
            print(f" Completion tokens: {outcome['completion_tokens']}")
            print(f" Total time: {outcome['elapsed']:.2f}s")
            print(f" Approx speed: {outcome['gen_tps_approx']:.2f} t/s (includes prompt eval)")
        except Exception as e:
            print(f" ERROR: {e}")

    # Nothing to summarise when every run failed.
    if not results:
        return

    speeds = [r["gen_tps_approx"] for r in results]
    avg_tps = sum(speeds) / len(results)
    max_tps = max(speeds)
    min_tps = min(speeds)
    total_tokens = sum(r["completion_tokens"] for r in results)
    total_time = sum(r["elapsed"] for r in results)

    print()
    print(banner)
    print(" RESULTS SUMMARY - Qwen3.5 122B-A10B")
    print(banner)
    print(f" Runs completed: {len(results)}/5")
    print(f" Total tokens: {total_tokens}")
    print(f" Total time: {total_time:.1f}s")
    print()
    print(f" Approx TPS (avg): {avg_tps:.2f} t/s")
    print(f" Approx TPS (min): {min_tps:.2f} t/s")
    print(f" Approx TPS (max): {max_tps:.2f} t/s")
    print()

    # Verdict against the 10 t/s target, based on the approximate average.
    if avg_tps < 8:
        print(f" ❌ BELOW TARGET: {avg_tps:.1f} t/s < 10 t/s")
    elif avg_tps < 10:
        print(" ⚠️ CLOSE TO TARGET: Consider further tuning")
    else:
        print(" ✅ TARGET ACHIEVED: 10+ t/s!")

    print()
    print(" ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.")
    print(" ⚡ Check the server console/log for exact 'eval time' t/s value,")
    print(" ⚡ which shows pure token generation speed (always higher).")
    print(banner)


if __name__ == "__main__":
    main()

10
scripts/run_test.bat Normal file
View File

@@ -0,0 +1,10 @@
@echo off
chcp 65001 >nul
echo ---------------------------------------------------------
echo 🚀 Qwen 35B 성능 테스트 도구 시작 (python perf_test.py)
echo ---------------------------------------------------------
echo.
REM Run the benchmark script with the Python interpreter of the designated
REM conda environment. %~dp0 expands to the directory of this .bat file, so
REM perf_test.py is found regardless of the directory the .bat is started from.
"C:\ProgramData\miniforge3\envs\variet-llm\python.exe" "%~dp0perf_test.py"
echo.
pause