wip: [01-llm-tuning] paused at task 1/3

2026-04-05 22:59:01 +09:00
parent 66778b750d
commit a09736e930
12 changed files with 53 additions and 1094 deletions
--- a/scripts/auto_tune_122b_r2.py
+++ b/scripts/auto_tune_122b_r2.py
@@ -1,257 +0,0 @@
-"""
-Qwen3.5 122B-A10B 정밀 튜닝 2라운드
-====================================
-1라운드 결과: mmap on이 더 빠르고, 스레드 수가 적을수록 빠름
-→ mmap on 상태에서 스레드 수 4~8 범위를 정밀 탐색
-"""
-import subprocess
-import time
-import json
-import urllib.request
-import os
-import re
-import sys
-import datetime
-
-try:
-    sys.stdout.reconfigure(encoding='utf-8')
-except AttributeError:
-    pass
-
-BASE_URL = "http://127.0.0.1:8000"
-MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
-SERVER_EXE = r"llama_bin_run\llama-server.exe"
-
-COMMON_ARGS = [
-    "--model", MODEL_PATH,
-    "-ngl", "999",
-    "--cpu-moe",
-    "-c", "2048",
-    "-np", "1",
-    "-fa", "on",
-    "--cache-type-k", "q4_0",
-    "--cache-type-v", "q4_0",
-    "-ub", "256",
-    "-b", "1024",
-    "--mlock",
-    "--port", "8000",
-    "--host", "0.0.0.0",
-    "--no-warmup",
-]
-
-CONFIGS = [
-    {
-        "name": "F) mmap on, -t 4",
-        "desc": "최소 스레드 (4개, 물리코어 절반)",
-        "extra": ["-t", "4", "--prio", "2"],
-    },
-    {
-        "name": "G) mmap on, -t 5",
-        "desc": "스레드 5개",
-        "extra": ["-t", "5", "--prio", "2"],
-    },
-    {
-        "name": "H) mmap on, -t 6",
-        "desc": "스레드 6개 (--no-mmap에서 최고였음)",
-        "extra": ["-t", "6", "--prio", "2"],
-    },
-    {
-        "name": "I) mmap on, -t 7",
-        "desc": "스레드 7개",
-        "extra": ["-t", "7", "--prio", "2"],
-    },
-    {
-        "name": "J) mmap on, -t 6, --prio 3",
-        "desc": "최적 스레드 + 리얼타임 우선순위",
-        "extra": ["-t", "6", "--prio", "3"],
-    },
-]
-
-def kill_server():
-    os.system("taskkill /F /IM llama-server.exe >nul 2>&1")
-    time.sleep(3)
-
-def start_server(config, log_path):
-    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]
-    log_file = open(log_path, "w", encoding="utf-8")
-    proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, cwd=os.getcwd())
-    return proc, log_file
-
-def wait_for_server(timeout=600):
-    start = time.time()
-    while time.time() - start < timeout:
-        try:
-            req = urllib.request.Request(f"{BASE_URL}/health")
-            with urllib.request.urlopen(req, timeout=5) as resp:
-                data = json.loads(resp.read())
-                if data.get("status") == "ok":
-                    return True
-        except:
-            pass
-        time.sleep(5)
-    return False
-
-def run_single_benchmark(prompt, max_tokens=200):
-    payload = json.dumps({
-        "model": "local-model",
-        "messages": [{"role": "user", "content": prompt}],
-        "max_tokens": max_tokens,
-        "temperature": 0.0
-    }).encode("utf-8")
-    req = urllib.request.Request(
-        f"{BASE_URL}/v1/chat/completions",
-        data=payload,
-        headers={"Content-Type": "application/json"}
-    )
-    start = time.time()
-    with urllib.request.urlopen(req, timeout=600) as resp:
-        result = json.loads(resp.read())
-    elapsed = time.time() - start
-    usage = result.get("usage", {})
-    return usage.get("completion_tokens", 0), elapsed
-
-def parse_eval_times(log_path):
-    try:
-        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
-            content = f.read()
-    except:
-        return []
-    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
-    matches = re.findall(pattern, content, re.MULTILINE)
-    return [{"tps": float(m[3]), "tokens": int(m[1])} for m in matches]
-
-def parse_prompt_eval_times(log_path):
-    try:
-        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
-            content = f.read()
-    except:
-        return []
-    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
-    matches = re.findall(pattern, content, re.MULTILINE)
-    return [{"tps": float(m[3])} for m in matches]
-
-def main():
-    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-    
-    print("=" * 70)
-    print("  Qwen3.5 122B-A10B 정밀 튜닝 - 2라운드")
-    print(f"  시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
-    print(f"  테스트: {len(CONFIGS)}개 설정 (mmap on + 스레드 4~7 정밀 탐색)")
-    print("=" * 70)
-    print()
-    
-    all_results = []
-    
-    for idx, config in enumerate(CONFIGS):
-        config_start = time.time()
-        log_path = os.path.join(os.getcwd(), f"tune_r2_log_{idx}.txt")
-        
-        print(f"\n{'='*70}")
-        print(f"  [{idx+1}/{len(CONFIGS)}] {config['name']}")
-        print(f"  {config['desc']}")
-        print(f"  시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
-        print(f"{'='*70}")
-        
-        kill_server()
-        print(f"  [1/3] 서버 시작 중...")
-        proc, log_file = start_server(config, log_path)
-        
-        if not wait_for_server(timeout=600):
-            print("  ❌ 서버 시작 실패!")
-            kill_server()
-            log_file.close()
-            all_results.append({"config": config["name"], "status": "FAILED", "eval_tps": []})
-            continue
-        
-        load_time = time.time() - config_start
-        print(f"  [2/3] 서버 준비 완료! ({load_time:.0f}초)")
-        
-        # 워밍업 + 벤치마크
-        try:
-            run_single_benchmark("Say hello.", max_tokens=20)
-        except:
-            pass
-        
-        print("  [3/3] 벤치마크 3회...")
-        prompts = [
-            "Write a detailed explanation of how neural networks learn through backpropagation.",
-            "Explain the complete process of photosynthesis including light and dark reactions.",
-            "Describe the differences between SQL and NoSQL databases with examples.",
-        ]
-        for i, prompt in enumerate(prompts):
-            try:
-                tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
-                print(f"    Run {i+1}: {tokens}tok, {elapsed:.1f}s, ~{tokens/elapsed:.2f} t/s")
-            except Exception as e:
-                print(f"    Run {i+1}: ERROR - {e}")
-        
-        time.sleep(2)
-        kill_server()
-        log_file.close()
-        time.sleep(2)
-        
-        eval_times = parse_eval_times(log_path)
-        prompt_times = parse_prompt_eval_times(log_path)
-        bench_evals = eval_times[1:] if len(eval_times) > 1 else eval_times
-        bench_prompts = prompt_times[1:] if len(prompt_times) > 1 else prompt_times
-        
-        eval_speeds = [e["tps"] for e in bench_evals]
-        prompt_speeds = [p["tps"] for p in bench_prompts]
-        
-        all_results.append({
-            "config": config["name"],
-            "status": "OK",
-            "eval_tps": eval_speeds,
-            "prompt_tps": prompt_speeds,
-        })
-        
-        if eval_speeds:
-            print(f"  📊 Eval: avg={sum(eval_speeds)/len(eval_speeds):.2f}, max={max(eval_speeds):.2f}")
-    
-    # 최종 결과
-    print("\n")
-    print("=" * 85)
-    print("  🏆 전체 튜닝 결과 (1라운드 + 2라운드 통합)")
-    print("=" * 85)
-    print()
-    print(f"  {'설정':<48} {'Avg t/s':>8} {'Max t/s':>8} {'Prompt':>8}")
-    print(f"  {'-'*48} {'-'*8} {'-'*8} {'-'*8}")
-    
-    # 1라운드 결과 (하드코딩)
-    r1 = [
-        ("[기준] mmap on, -t 8, --prio 2",              10.02, 10.06, 29.52),
-        ("A) --no-mmap -t 8",                           9.66,  9.70,  28.26),
-        ("B) --no-mmap -t 6",                          10.02, 10.18,  26.73),
-        ("C) --no-mmap -t 10",                          9.42,  9.46,  27.31),
-        ("D) --no-mmap -t 12",                          9.04,  9.11,  27.92),
-        ("E) --no-mmap -t 10 --prio 3 --poll 100",     9.41,  9.45,  28.37),
-    ]
-    for name, avg, mx, pp in r1:
-        marker = " ⭐" if avg >= 10.0 else ""
-        print(f"  {name:<48} {avg:>8.2f} {mx:>8.2f} {pp:>8.2f}{marker}")
-    
-    print(f"  {'--- 2라운드 ---':<48}")
-    
-    best_avg = 10.06  # 기존 최고
-    best_config = "[기준] mmap on, -t 8"
-    
-    for r in all_results:
-        if r["status"] != "OK" or not r["eval_tps"]:
-            print(f"  {r['config']:<48} {'FAIL':>8}")
-            continue
-        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
-        max_e = max(r["eval_tps"])
-        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
-        if max_e > best_avg:
-            best_avg = max_e
-            best_config = r["config"]
-        marker = " ⭐" if avg_e >= 10.0 else ""
-        print(f"  {r['config']:<48} {avg_e:>8.2f} {max_e:>8.2f} {avg_p:>8.2f}{marker}")
-    
-    print()
-    print(f"  🏆 최고 성능: {best_config} → {best_avg:.2f} t/s")
-    print(f"  완료: {datetime.datetime.now().strftime('%H:%M:%S')}")
-    print("=" * 85)
-
-if __name__ == "__main__":
-    main()