""" Qwen3.5 122B-A10B 정밀 튜닝 2라운드 ==================================== 1라운드 결과: mmap on이 더 빠르고, 스레드 수가 적을수록 빠름 → mmap on 상태에서 스레드 수 4~8 범위를 정밀 탐색 """ import subprocess import time import json import urllib.request import os import re import sys import datetime try: sys.stdout.reconfigure(encoding='utf-8') except AttributeError: pass BASE_URL = "http://127.0.0.1:8000" MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf" SERVER_EXE = r"llama_bin_run\llama-server.exe" COMMON_ARGS = [ "--model", MODEL_PATH, "-ngl", "999", "--cpu-moe", "-c", "2048", "-np", "1", "-fa", "on", "--cache-type-k", "q4_0", "--cache-type-v", "q4_0", "-ub", "256", "-b", "1024", "--mlock", "--port", "8000", "--host", "0.0.0.0", "--no-warmup", ] CONFIGS = [ { "name": "F) mmap on, -t 4", "desc": "최소 스레드 (4개, 물리코어 절반)", "extra": ["-t", "4", "--prio", "2"], }, { "name": "G) mmap on, -t 5", "desc": "스레드 5개", "extra": ["-t", "5", "--prio", "2"], }, { "name": "H) mmap on, -t 6", "desc": "스레드 6개 (--no-mmap에서 최고였음)", "extra": ["-t", "6", "--prio", "2"], }, { "name": "I) mmap on, -t 7", "desc": "스레드 7개", "extra": ["-t", "7", "--prio", "2"], }, { "name": "J) mmap on, -t 6, --prio 3", "desc": "최적 스레드 + 리얼타임 우선순위", "extra": ["-t", "6", "--prio", "3"], }, ] def kill_server(): os.system("taskkill /F /IM llama-server.exe >nul 2>&1") time.sleep(3) def start_server(config, log_path): cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"] log_file = open(log_path, "w", encoding="utf-8") proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, cwd=os.getcwd()) return proc, log_file def wait_for_server(timeout=600): start = time.time() while time.time() - start < timeout: try: req = urllib.request.Request(f"{BASE_URL}/health") with urllib.request.urlopen(req, timeout=5) as resp: data = json.loads(resp.read()) if data.get("status") == "ok": return True except: pass time.sleep(5) return False def run_single_benchmark(prompt, max_tokens=200): payload = json.dumps({ "model": "local-model", "messages": [{"role": "user", "content": prompt}], "max_tokens": max_tokens, "temperature": 0.0 }).encode("utf-8") req = urllib.request.Request( f"{BASE_URL}/v1/chat/completions", data=payload, headers={"Content-Type": "application/json"} ) start = time.time() with urllib.request.urlopen(req, timeout=600) as resp: result = json.loads(resp.read()) elapsed = time.time() - start usage = result.get("usage", {}) return usage.get("completion_tokens", 0), elapsed def parse_eval_times(log_path): try: with open(log_path, "r", encoding="utf-8", errors="ignore") as f: content = f.read() except: return [] pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)' matches = re.findall(pattern, content, re.MULTILINE) return [{"tps": float(m[3]), "tokens": int(m[1])} for m in matches] def parse_prompt_eval_times(log_path): try: with open(log_path, "r", encoding="utf-8", errors="ignore") as f: content = f.read() except: return [] pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)' matches = re.findall(pattern, content, re.MULTILINE) return [{"tps": float(m[3])} for m in matches] def main(): timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") print("=" * 70) print(" Qwen3.5 122B-A10B 정밀 튜닝 - 2라운드") print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}") print(f" 테스트: {len(CONFIGS)}개 설정 (mmap on + 스레드 4~7 정밀 탐색)") print("=" * 70) print() all_results = [] for idx, config in enumerate(CONFIGS): config_start = time.time() log_path = os.path.join(os.getcwd(), f"tune_r2_log_{idx}.txt") print(f"\n{'='*70}") print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}") print(f" {config['desc']}") print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}") print(f"{'='*70}") kill_server() print(f" [1/3] 서버 시작 중...") proc, log_file = start_server(config, log_path) if not wait_for_server(timeout=600): print(" ❌ 서버 시작 실패!") kill_server() log_file.close() all_results.append({"config": config["name"], "status": "FAILED", "eval_tps": []}) continue load_time = time.time() - config_start print(f" [2/3] 서버 준비 완료! ({load_time:.0f}초)") # 워밍업 + 벤치마크 try: run_single_benchmark("Say hello.", max_tokens=20) except: pass print(" [3/3] 벤치마크 3회...") prompts = [ "Write a detailed explanation of how neural networks learn through backpropagation.", "Explain the complete process of photosynthesis including light and dark reactions.", "Describe the differences between SQL and NoSQL databases with examples.", ] for i, prompt in enumerate(prompts): try: tokens, elapsed = run_single_benchmark(prompt, max_tokens=200) print(f" Run {i+1}: {tokens}tok, {elapsed:.1f}s, ~{tokens/elapsed:.2f} t/s") except Exception as e: print(f" Run {i+1}: ERROR - {e}") time.sleep(2) kill_server() log_file.close() time.sleep(2) eval_times = parse_eval_times(log_path) prompt_times = parse_prompt_eval_times(log_path) bench_evals = eval_times[1:] if len(eval_times) > 1 else eval_times bench_prompts = prompt_times[1:] if len(prompt_times) > 1 else prompt_times eval_speeds = [e["tps"] for e in bench_evals] prompt_speeds = [p["tps"] for p in bench_prompts] all_results.append({ "config": config["name"], "status": "OK", "eval_tps": eval_speeds, "prompt_tps": prompt_speeds, }) if eval_speeds: print(f" 📊 Eval: avg={sum(eval_speeds)/len(eval_speeds):.2f}, max={max(eval_speeds):.2f}") # 최종 결과 print("\n") print("=" * 85) print(" 🏆 전체 튜닝 결과 (1라운드 + 2라운드 통합)") print("=" * 85) print() print(f" {'설정':<48} {'Avg t/s':>8} {'Max t/s':>8} {'Prompt':>8}") print(f" {'-'*48} {'-'*8} {'-'*8} {'-'*8}") # 1라운드 결과 (하드코딩) r1 = [ ("[기준] mmap on, -t 8, --prio 2", 10.02, 10.06, 29.52), ("A) --no-mmap -t 8", 9.66, 9.70, 28.26), ("B) --no-mmap -t 6", 10.02, 10.18, 26.73), ("C) --no-mmap -t 10", 9.42, 9.46, 27.31), ("D) --no-mmap -t 12", 9.04, 9.11, 27.92), ("E) --no-mmap -t 10 --prio 3 --poll 100", 9.41, 9.45, 28.37), ] for name, avg, mx, pp in r1: marker = " ⭐" if avg >= 10.0 else "" print(f" {name:<48} {avg:>8.2f} {mx:>8.2f} {pp:>8.2f}{marker}") print(f" {'--- 2라운드 ---':<48}") best_avg = 10.06 # 기존 최고 best_config = "[기준] mmap on, -t 8" for r in all_results: if r["status"] != "OK" or not r["eval_tps"]: print(f" {r['config']:<48} {'FAIL':>8}") continue avg_e = sum(r["eval_tps"]) / len(r["eval_tps"]) max_e = max(r["eval_tps"]) avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0 if max_e > best_avg: best_avg = max_e best_config = r["config"] marker = " ⭐" if avg_e >= 10.0 else "" print(f" {r['config']:<48} {avg_e:>8.2f} {max_e:>8.2f} {avg_p:>8.2f}{marker}") print() print(f" 🏆 최고 성능: {best_config} → {best_avg:.2f} t/s") print(f" 완료: {datetime.datetime.now().strftime('%H:%M:%S')}") print("=" * 85) if __name__ == "__main__": main()