wip: [01-llm-tuning] paused at task 1/3

2026-04-05 22:59:01 +09:00
parent 66778b750d
commit a09736e930
12 changed files with 53 additions and 1094 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -30,3 +30,7 @@
 	path = .agent/vendor/mini-swe
 	url = https://github.com/swe-agent/swe-agent.git
 	branch = main
 [submodule "openclaude"]
 	path = openclaude
 	url = https://github.com/Gitlawb/openclaude.git
 	branch = main
--- a/.planning/HANDOFF.json
+++ b/.planning/HANDOFF.json
@@ -1,25 +1,33 @@
 {
  "version": "1.0",
-  "timestamp": "2026-04-05T00:51:15+09:00",
+  "timestamp": "2026-04-05T13:54:58.707Z",
-  "phase": "00-initialization",
+  "phase": "01",
-  "phase_name": "Project Initialization",
+  "phase_name": "01-llm-tuning",
-  "phase_dir": ".planning",
+  "phase_dir": ".planning/phases/01-llm-tuning",
-  "plan": 0,
+  "plan": 1,
-  "task": 0,
+  "task": 1,
-  "total_tasks": 0,
+  "total_tasks": 3,
  "status": "paused",
  "completed_tasks": [
-    {"id": 1, "name": "Initialize Project & Repo", "status": "done", "commit": "e37f65a"}
+    {"id": 1, "name": "Gemma4 26B performance tuning at 256K context", "status": "done", "commit": "none"}
  ],
  "remaining_tasks": [
-    {"id": 2, "name": "Run /gsd-plan-phase 1 to start planning Phase 1", "status": "not_started"}
+    {"id": 2, "name": "Proceed with extensions frontend UI integration", "status": "not_started"},
    {"id": 3, "name": "Add 2nd RTX 3060 to verify 45-60 t/s MoE performance", "status": "not_started"}
  ],
  "blockers": [],
-  "human_actions_pending": [],
+  "human_actions_pending": [
-  "decisions": [
+    {"action": "Decide next step: integration of Extension frontend streaming or adding second GPU for Qwen/Gemma4 full evaluation", "context": "Server is fully optimized for 1 GPU, further improvements in speed require hardware upgrade", "blocking": false}
    {"decision": "2+0 GPU Architecture (Machine A API Server, Machine B tools)", "rationale": "Prioritize coding speed (50-80 t/s) and separate logic cleanly", "phase": "00"}
  ],
-  "uncommitted_files": [],
+  "decisions": [
-  "next_action": "Run /gsd-plan-phase 1 to plan the Machine A server setup and hot-swap script.",
+    {"decision": "Used --n-cpu-moe 10 for Gemma4 26B instead of --cpu-moe", "rationale": "Applying --cpu-moe globally to Gemma4 resulted in severe instability and crashes (graph splits 62) due to SWA+MoE entanglement. Targeted offload (10 layers) prevents VRAM swap and stabilizes split at 2, achieving 30.9 t/s on 1 GPU.", "phase": "01"},
-  "context_notes": "We just finalized the initial architecture plan for Variet LLM involving Dual GPUs on Machine A for pure API inference, and Machine B as the workstation for VS Code Extension, Discord Bot, and Search/MCP tools."
+    {"decision": "Verified Qwen3.5 35B-A3B speed capabilities", "rationale": "Tested Qwen 35B limits on 12GB. Found it causes heavy WDDM swap without MoE offload. Confirmed its smaller active parameters (3B vs Gemma4's 4B) will likely make it significantly faster than Gemma4 on a dual 3060 24GB setup up to 64K context.", "phase": "01"}
  ],
  "uncommitted_files": [
    "start_gemma4_26b_api.bat",
    "scripts/auto_tune_gemma4_256k.py",
    "scripts/auto_tune_gemma4_ncpumoe.py"
  ],
  "next_action": "Resume development on OpenClaude integration (Extension frontend UI) or configure Dual-GPU testing.",
  "context_notes": "We've successfully proven the 1 GPU tuning threshold for Gemma4 (30.9 t/s). We also understood why OpenClaude needs large contexts (200K default scaling) and mapped out exact expectations for Qwen VS Gemma on 2x GPUs."
 }
--- a/1
+++ b/1
--- a/scripts/auto_tune_122b.py
+++ b/scripts/auto_tune_122b.py
@@ -1,372 +0,0 @@
 """
 Qwen3.5 122B-A10B 자동 정밀 튜닝 스크립트
 ===========================================
 각 설정 조합으로 서버를 재시작하고 벤치마크를 자동 수행합니다.
 서버 로그에서 순수 eval time (t/s)를 파싱하여 정확한 비교 테이블을 출력합니다.
 예상 소요 시간: 약 30-40분 (5개 설정 × ~6-7분/설정)
 """
 import subprocess
 import time
 import json
 import urllib.request
 import os
 import re
 import sys
 import datetime
 # Switch stdout to UTF-8 because this script prints Korean text and emoji.
 # Streams that have no reconfigure() method raise AttributeError and are left unchanged.
 try:
    sys.stdout.reconfigure(encoding='utf-8')
 except AttributeError:
    pass
 # Base URL used for the health check and the chat-completion requests.
 BASE_URL = "http://127.0.0.1:8000"
 # Model file and server executable, given as relative Windows paths
 # (start_server launches the server with cwd=os.getcwd()).
 MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
 SERVER_EXE = r"llama_bin_run\llama-server.exe"
 # ============================================================
 # List of configurations to test
 # ============================================================
 # Common parameters (the ones that do not change between configurations)
 COMMON_ARGS = [
    "--model", MODEL_PATH,
    "-ngl", "999",
    "--cpu-moe",
    "-c", "2048",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "256",
    "-b", "1024",
    "--mlock",
    "--port", "8000",
    "--host", "0.0.0.0",
    "--no-warmup",  # warm-up is performed directly by the benchmark script
 ]
 # Variable parameter combinations.
 # Each entry has a display "name", a human-readable "desc", and an "extra"
 # argument list that start_server appends to COMMON_ARGS.
 CONFIGS = [
    {
        "name": "A) --no-mmap -t 8",
        "desc": "서버 권장: mmap 비활성화 (baseline 대비)",
        "extra": ["--no-mmap", "-t", "8", "--prio", "2"],
    },
    {
        "name": "B) --no-mmap -t 6",
        "desc": "스레드 감소 (캐시 경합 회피)",
        "extra": ["--no-mmap", "-t", "6", "--prio", "2"],
    },
    {
        "name": "C) --no-mmap -t 10",
        "desc": "스레드 증가 (RAM 대역폭 포화)",
        "extra": ["--no-mmap", "-t", "10", "--prio", "2"],
    },
    {
        "name": "D) --no-mmap -t 12",
        "desc": "더 많은 스레드",
        "extra": ["--no-mmap", "-t", "12", "--prio", "2"],
    },
    {
        "name": "E) --no-mmap -t 10 --prio 3 --poll 100",
        "desc": "최적 스레드 + 리얼타임 우선순위 + 폴링",
        "extra": ["--no-mmap", "-t", "10", "--prio", "3", "--poll", "100"],
    },
 ]
 # ============================================================
 # Utility functions
 # ============================================================
 def kill_server():
    """Force-terminate every llama-server.exe process.

    Runs ``taskkill`` through the shell with its output discarded, then
    pauses for three seconds before returning.
    """
    taskkill_cmd = "taskkill /F /IM llama-server.exe >nul 2>&1"
    os.system(taskkill_cmd)
    settle_seconds = 3
    time.sleep(settle_seconds)
 def start_server(config, log_path):
    """Start the server for one configuration, redirecting its output to a file.

    Args:
        config: Entry from CONFIGS; its "extra" list is appended to COMMON_ARGS.
        log_path: File that receives the server's stdout and stderr.

    Returns:
        A ``(proc, log_file)`` tuple: the Popen handle and the still-open log
        file. The caller is responsible for closing ``log_file``.

    Raises:
        OSError: If the log file cannot be opened or the process cannot be spawned.
    """
    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            cwd=os.getcwd()
        )
    except BaseException:
        # Spawning failed (e.g. executable not found): do not leak the log handle.
        log_file.close()
        raise
    return proc, log_file
 def wait_for_server(timeout=600):
    """Poll the server's /health endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as the health response contains ``"status": "ok"``;
        False if that does not happen within ``timeout`` seconds.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
                if data.get("status") == "ok":
                    return True
        except Exception:
            # Connection errors, HTTP errors and malformed responses all mean
            # "not ready yet". Unlike a bare except, this still lets
            # KeyboardInterrupt / SystemExit abort the wait.
            pass
        time.sleep(5)
    return False
 def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and measure how long it takes.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Value of the request's "max_tokens" field.

    Returns:
        A ``(completion_tokens, elapsed)`` tuple: the "completion_tokens"
        value from the response's "usage" object (0 when absent) and the
        wall-clock seconds spent on the request.
    """
    request_body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }
    encoded_body = json.dumps(request_body).encode("utf-8")
    endpoint = f"{BASE_URL}/v1/chat/completions"
    req = urllib.request.Request(
        endpoint,
        data=encoded_body,
        headers={"Content-Type": "application/json"}
    )
    started_at = time.time()
    with urllib.request.urlopen(req, timeout=600) as resp:
        reply = json.loads(resp.read())
    elapsed = time.time() - started_at
    token_usage = reply.get("usage", {})
    return token_usage.get("completion_tokens", 0), elapsed
 def parse_eval_times(log_path):
    """Extract the "eval time" (generation) statistics from a server log.

    Args:
        log_path: Path of the log file to read.

    Returns:
        One dict per matching line, in file order, with the keys "total_ms",
        "tokens", "ms_per_token" and "tps". Returns an empty list when the
        file cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Missing or unreadable log: report "no measurements" instead of failing.
        return []
    # Expected line shape:
    # "eval time = XXXXX.XX ms / NNN tokens (XXX.XX ms per token, XX.XX tokens per second)"
    # The leading ^\s+ only accepts whitespace before "eval time", so lines that
    # start with "prompt eval time" are not counted here.
    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    return [
        {
            "total_ms": float(total_ms),
            "tokens": int(tokens),
            "ms_per_token": float(ms_per_token),
            "tps": float(tps)
        }
        for total_ms, tokens, ms_per_token, tps in re.findall(pattern, content, re.MULTILINE)
    ]
 def parse_prompt_eval_times(log_path):
    """Extract the "prompt eval time" statistics from a server log.

    Args:
        log_path: Path of the log file to read.

    Returns:
        One dict per matching line, in file order, with the keys "total_ms",
        "tokens", "ms_per_token" and "tps". Returns an empty list when the
        file cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Missing or unreadable log: report "no measurements" instead of failing.
        return []
    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    return [
        {
            "total_ms": float(total_ms),
            "tokens": int(tokens),
            "ms_per_token": float(ms_per_token),
            "tps": float(tps)
        }
        for total_ms, tokens, ms_per_token, tps in re.findall(pattern, content, re.MULTILINE)
    ]
 def parse_vram_usage(log_path):
    """Extract the "CUDA0 model buffer size" value from a server log.

    Args:
        log_path: Path of the log file to read.

    Returns:
        The first reported size rounded to a whole number, formatted as
        "<n> MiB", or "N/A" when the file cannot be read or has no such line.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Missing or unreadable log: the value is simply unknown.
        return "N/A"
    match = re.search(r'CUDA0 model buffer size\s*=\s*([\d.]+)\s*MiB', content)
    if match is None:
        return "N/A"
    return f"{float(match.group(1)):.0f} MiB"
 # ============================================================
 # Main tuning loop
 # ============================================================
 def main():
    """Benchmark every entry of CONFIGS and report the results.

    For each configuration the server is restarted, one warm-up request and
    three measured requests are sent, and the server log is parsed for eval /
    prompt-eval speeds and the CUDA0 model buffer size. Afterwards a
    comparison table against the hard-coded baseline is printed and the raw
    eval speeds are written to ``tune_results_<timestamp>.txt`` in the
    current working directory.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    print("=" * 70)
    print("  Qwen3.5 122B-A10B 자동 정밀 튜닝")
    print(f"  시작 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f"  테스트 설정: {len(CONFIGS)}개")
    print(f"  예상 소요: ~{len(CONFIGS) * 7}분")
    print("=" * 70)
    print()
    print("  기존 Baseline: mmap on, -t 8, --prio 2 → 10.06 t/s (eval)")
    print()
    # Collected results, one dict per configuration
    all_results = []
    for idx, config in enumerate(CONFIGS):
        config_start = time.time()
        log_path = os.path.join(os.getcwd(), f"tune_log_{idx}.txt")
        print(f"\n{'='*70}")
        print(f"  [{idx+1}/{len(CONFIGS)}] {config['name']}")
        print(f"  {config['desc']}")
        print(f"  시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
        print(f"{'='*70}")
        # 1. Stop any existing server
        print("  [1/4] 서버 종료 중...")
        kill_server()
        # 2. Start a new server
        print(f"  [2/4] 서버 시작 중... (모델 로딩 ~3-5분)")
        # proc is not used afterwards; the server is stopped via kill_server().
        proc, log_file = start_server(config, log_path)
        # 3. Wait for the server to become ready
        if not wait_for_server(timeout=600):
            print("  ❌ 서버 시작 실패! 다음 설정으로 넘어갑니다.")
            kill_server()
            log_file.close()
            all_results.append({
                "config": config["name"],
                "status": "FAILED",
                "eval_tps": [],
                "prompt_tps": [],
                "vram": "N/A"
            })
            continue
        # Measured from config_start, so it also includes the kill_server() pause above.
        load_time = time.time() - config_start
        print(f"  [3/4] 서버 준비 완료! (로딩 {load_time:.0f}초)")
        # 4. Run the benchmark (1 warm-up + 3 measured runs)
        print("  [4/4] 벤치마크 실행 중...")
        # Warm-up
        try:
            run_single_benchmark("Say hello.", max_tokens=20)
            print("    워밍업 완료")
        except Exception as e:
            print(f"    워밍업 실패: {e}")
        # Three measured runs
        prompts = [
            "Write a detailed explanation of how neural networks learn through backpropagation and gradient descent.",
            "Explain the complete process of photosynthesis including light and dark reactions in detail.",
            "Describe the differences between SQL and NoSQL databases with examples and performance characteristics.",
        ]
        for i, prompt in enumerate(prompts):
            try:
                tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                # Client-side estimate only; the reported numbers come from the server log.
                approx_tps = tokens / elapsed if elapsed > 0 else 0
                print(f"    Run {i+1}/3: {tokens} tokens, {elapsed:.1f}s, ~{approx_tps:.2f} t/s (approx)")
            except Exception as e:
                print(f"    Run {i+1}/3: ERROR - {e}")
        # Wait briefly before stopping the server so the log can flush
        time.sleep(2)
        # Stop the server
        kill_server()
        log_file.close()
        time.sleep(2)
        # Parse the log
        eval_times = parse_eval_times(log_path)
        prompt_times = parse_prompt_eval_times(log_path)
        vram = parse_vram_usage(log_path)
        # Exclude the warm-up (the first result)
        # NOTE(review): if the warm-up request failed, the first entry is a
        # measured run and is still dropped here.
        if len(eval_times) > 1:
            bench_evals = eval_times[1:]  # exclude warm-up
        else:
            bench_evals = eval_times
        if len(prompt_times) > 1:
            bench_prompts = prompt_times[1:]
        else:
            bench_prompts = prompt_times
        eval_speeds = [e["tps"] for e in bench_evals]
        prompt_speeds = [p["tps"] for p in bench_prompts]
        result = {
            "config": config["name"],
            "status": "OK",
            "eval_tps": eval_speeds,
            "prompt_tps": prompt_speeds,
            "vram": vram,
        }
        all_results.append(result)
        config_elapsed = time.time() - config_start
        print(f"\n  완료! 소요: {config_elapsed:.0f}초")
        if eval_speeds:
            avg_eval = sum(eval_speeds) / len(eval_speeds)
            max_eval = max(eval_speeds)
            print(f"  📊 Eval TPS: avg={avg_eval:.2f}, max={max_eval:.2f}")
    # ============================================================
    # Final comparison table
    # ============================================================
    print("\n")
    print("=" * 80)
    print("  🏆 최종 결과 비교 테이블")
    print("=" * 80)
    print()
    # Add the existing baseline
    print(f"  {'설정':<45} {'Eval t/s':>10} {'최대':>8} {'Prompt t/s':>12} {'VRAM':>12}")
    print(f"  {'-'*45} {'-'*10} {'-'*8} {'-'*12} {'-'*12}")
    # Baseline (previous result), hard-coded
    print(f"  {'[기준] mmap on, -t 8, --prio 2':<45} {'10.02':>10} {'10.06':>8} {'29.52':>12} {'5392 MiB':>12}")
    best_avg = 0
    best_config = ""
    for r in all_results:
        # A run with status OK but no parsed eval lines is also shown as FAILED.
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f"  {r['config']:<45} {'FAILED':>10} {'':>8} {'':>12} {r['vram']:>12}")
            continue
        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
        if avg_e > best_avg:
            best_avg = avg_e
            best_config = r["config"]
        # NOTE(review): the star threshold is the baseline maximum (10.06), while
        # the improvement below is computed against the baseline average (10.02).
        marker = " ⭐" if avg_e > 10.06 else ""
        print(f"  {r['config']:<45} {avg_e:>10.2f} {max_e:>8.2f} {avg_p:>12.2f} {r['vram']:>12}{marker}")
    print()
    if best_avg > 0:
        improvement = ((best_avg - 10.02) / 10.02) * 100
        print(f"  🏆 최고 성능: {best_config}")
        print(f"     → {best_avg:.2f} t/s (기준 10.02 t/s 대비 {improvement:+.1f}%)")
    print()
    print(f"  완료 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 80)
    # Also save the results to a file
    result_path = os.path.join(os.getcwd(), f"tune_results_{timestamp}.txt")
    with open(result_path, "w", encoding="utf-8") as f:
        f.write("Qwen3.5 122B-A10B Fine Tuning Results\n")
        f.write(f"Date: {timestamp}\n\n")
        for r in all_results:
            f.write(f"{r['config']}: {r['eval_tps']} (VRAM: {r['vram']})\n")
    print(f"  결과 저장: {result_path}")
 # Run the tuning loop only when executed as a script.
 if __name__ == "__main__":
    main()
--- a/scripts/auto_tune_122b_r2.py
+++ b/scripts/auto_tune_122b_r2.py
@@ -1,257 +0,0 @@
 """
 Qwen3.5 122B-A10B 정밀 튜닝 2라운드
 ====================================
 1라운드 결과: mmap on이 더 빠르고, 스레드 수가 적을수록 빠름
 → mmap on 상태에서 스레드 수 4~8 범위를 정밀 탐색
 """
 import subprocess
 import time
 import json
 import urllib.request
 import os
 import re
 import sys
 import datetime
 # Switch stdout to UTF-8 because this script prints Korean text and emoji.
 # Streams that have no reconfigure() method raise AttributeError and are left unchanged.
 try:
    sys.stdout.reconfigure(encoding='utf-8')
 except AttributeError:
    pass
 # Base URL used for the health check and the chat-completion requests.
 BASE_URL = "http://127.0.0.1:8000"
 # Model file and server executable, given as relative Windows paths
 # (start_server launches the server with cwd=os.getcwd()).
 MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
 SERVER_EXE = r"llama_bin_run\llama-server.exe"
 # Arguments shared by every configuration; start_server appends each
 # configuration's "extra" list to these.
 COMMON_ARGS = [
    "--model", MODEL_PATH,
    "-ngl", "999",
    "--cpu-moe",
    "-c", "2048",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "256",
    "-b", "1024",
    "--mlock",
    "--port", "8000",
    "--host", "0.0.0.0",
    "--no-warmup",
 ]
 # Round-2 configurations: none passes --no-mmap, and the thread count (-t)
 # ranges from 4 to 7. Each entry has a display "name", a "desc" and the
 # "extra" command-line arguments.
 CONFIGS = [
    {
        "name": "F) mmap on, -t 4",
        "desc": "최소 스레드 (4개, 물리코어 절반)",
        "extra": ["-t", "4", "--prio", "2"],
    },
    {
        "name": "G) mmap on, -t 5",
        "desc": "스레드 5개",
        "extra": ["-t", "5", "--prio", "2"],
    },
    {
        "name": "H) mmap on, -t 6",
        "desc": "스레드 6개 (--no-mmap에서 최고였음)",
        "extra": ["-t", "6", "--prio", "2"],
    },
    {
        "name": "I) mmap on, -t 7",
        "desc": "스레드 7개",
        "extra": ["-t", "7", "--prio", "2"],
    },
    {
        "name": "J) mmap on, -t 6, --prio 3",
        "desc": "최적 스레드 + 리얼타임 우선순위",
        "extra": ["-t", "6", "--prio", "3"],
    },
 ]
 def kill_server():
    """Force-terminate every llama-server.exe process, then pause three seconds.

    ``taskkill`` is run through the shell with its output discarded.
    """
    taskkill_cmd = "taskkill /F /IM llama-server.exe >nul 2>&1"
    os.system(taskkill_cmd)
    settle_seconds = 3
    time.sleep(settle_seconds)
 def start_server(config, log_path):
    """Start the server for one configuration, redirecting its output to a file.

    Args:
        config: Entry from CONFIGS; its "extra" list is appended to COMMON_ARGS.
        log_path: File that receives the server's stdout and stderr.

    Returns:
        A ``(proc, log_file)`` tuple: the Popen handle and the still-open log
        file. The caller is responsible for closing ``log_file``.

    Raises:
        OSError: If the log file cannot be opened or the process cannot be spawned.
    """
    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, cwd=os.getcwd())
    except BaseException:
        # Spawning failed (e.g. executable not found): do not leak the log handle.
        log_file.close()
        raise
    return proc, log_file
 def wait_for_server(timeout=600):
    """Poll the server's /health endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as the health response contains ``"status": "ok"``;
        False if that does not happen within ``timeout`` seconds.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
                if data.get("status") == "ok":
                    return True
        except Exception:
            # Connection errors, HTTP errors and malformed responses all mean
            # "not ready yet". Unlike a bare except, this still lets
            # KeyboardInterrupt / SystemExit abort the wait.
            pass
        time.sleep(5)
    return False
 def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and measure how long it takes.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Value of the request's "max_tokens" field.

    Returns:
        A ``(completion_tokens, elapsed)`` tuple: the "completion_tokens"
        value from the response's "usage" object (0 when absent) and the
        wall-clock seconds spent on the request.
    """
    request_body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }
    encoded_body = json.dumps(request_body).encode("utf-8")
    endpoint = f"{BASE_URL}/v1/chat/completions"
    req = urllib.request.Request(
        endpoint,
        data=encoded_body,
        headers={"Content-Type": "application/json"}
    )
    started_at = time.time()
    with urllib.request.urlopen(req, timeout=600) as resp:
        reply = json.loads(resp.read())
    elapsed = time.time() - started_at
    token_usage = reply.get("usage", {})
    completion_tokens = token_usage.get("completion_tokens", 0)
    return completion_tokens, elapsed
 def parse_eval_times(log_path):
    """Extract the "eval time" (generation) statistics from a server log.

    Args:
        log_path: Path of the log file to read.

    Returns:
        One dict per matching line, in file order, with the keys "tps"
        (tokens per second) and "tokens". Returns an empty list when the
        file cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Missing or unreadable log: report "no measurements" instead of failing.
        return []
    # The leading ^\s+ only accepts whitespace before "eval time", so lines that
    # start with "prompt eval time" are not counted here.
    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    matches = re.findall(pattern, content, re.MULTILINE)
    return [{"tps": float(m[3]), "tokens": int(m[1])} for m in matches]
 def parse_prompt_eval_times(log_path):
    """Extract the "prompt eval time" speeds from a server log.

    Args:
        log_path: Path of the log file to read.

    Returns:
        One ``{"tps": <tokens per second>}`` dict per matching line, in file
        order. Returns an empty list when the file cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Missing or unreadable log: report "no measurements" instead of failing.
        return []
    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    matches = re.findall(pattern, content, re.MULTILINE)
    return [{"tps": float(m[3])} for m in matches]
 def main():
    """Benchmark every round-2 entry of CONFIGS and print a combined table.

    For each configuration the server is restarted, one warm-up request and
    three measured requests are sent, and the server log is parsed for eval
    and prompt-eval speeds. The final table lists the hard-coded round-1
    numbers followed by the freshly measured round-2 numbers.
    """
    # NOTE(review): timestamp is assigned but not used anywhere in this function.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    print("=" * 70)
    print("  Qwen3.5 122B-A10B 정밀 튜닝 - 2라운드")
    print(f"  시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f"  테스트: {len(CONFIGS)}개 설정 (mmap on + 스레드 4~7 정밀 탐색)")
    print("=" * 70)
    print()
    all_results = []
    for idx, config in enumerate(CONFIGS):
        config_start = time.time()
        log_path = os.path.join(os.getcwd(), f"tune_r2_log_{idx}.txt")
        print(f"\n{'='*70}")
        print(f"  [{idx+1}/{len(CONFIGS)}] {config['name']}")
        print(f"  {config['desc']}")
        print(f"  시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
        print(f"{'='*70}")
        # Stop any server that is still running before starting this configuration.
        kill_server()
        print(f"  [1/3] 서버 시작 중...")
        # proc is not used afterwards; the server is stopped via kill_server().
        proc, log_file = start_server(config, log_path)
        if not wait_for_server(timeout=600):
            print("  ❌ 서버 시작 실패!")
            kill_server()
            log_file.close()
            # The failure record carries no "prompt_tps"; the report loop skips
            # failed entries before reading that key.
            all_results.append({"config": config["name"], "status": "FAILED", "eval_tps": []})
            continue
        # Measured from config_start, so it also includes the kill_server() pause above.
        load_time = time.time() - config_start
        print(f"  [2/3] 서버 준비 완료! ({load_time:.0f}초)")
        # Warm-up + benchmark
        # NOTE(review): the bare except hides every warm-up failure, including
        # KeyboardInterrupt.
        try:
            run_single_benchmark("Say hello.", max_tokens=20)
        except:
            pass
        print("  [3/3] 벤치마크 3회...")
        prompts = [
            "Write a detailed explanation of how neural networks learn through backpropagation.",
            "Explain the complete process of photosynthesis including light and dark reactions.",
            "Describe the differences between SQL and NoSQL databases with examples.",
        ]
        for i, prompt in enumerate(prompts):
            try:
                tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                # Client-side estimate only; the reported numbers come from the server log.
                print(f"    Run {i+1}: {tokens}tok, {elapsed:.1f}s, ~{tokens/elapsed:.2f} t/s")
            except Exception as e:
                print(f"    Run {i+1}: ERROR - {e}")
        # Pause before and after stopping the server, then read its log.
        time.sleep(2)
        kill_server()
        log_file.close()
        time.sleep(2)
        eval_times = parse_eval_times(log_path)
        prompt_times = parse_prompt_eval_times(log_path)
        # Drop the first entry (the warm-up request) when more than one was logged.
        bench_evals = eval_times[1:] if len(eval_times) > 1 else eval_times
        bench_prompts = prompt_times[1:] if len(prompt_times) > 1 else prompt_times
        eval_speeds = [e["tps"] for e in bench_evals]
        prompt_speeds = [p["tps"] for p in bench_prompts]
        all_results.append({
            "config": config["name"],
            "status": "OK",
            "eval_tps": eval_speeds,
            "prompt_tps": prompt_speeds,
        })
        if eval_speeds:
            print(f"  📊 Eval: avg={sum(eval_speeds)/len(eval_speeds):.2f}, max={max(eval_speeds):.2f}")
    # Final results
    print("\n")
    print("=" * 85)
    print("  🏆 전체 튜닝 결과 (1라운드 + 2라운드 통합)")
    print("=" * 85)
    print()
    print(f"  {'설정':<48} {'Avg t/s':>8} {'Max t/s':>8} {'Prompt':>8}")
    print(f"  {'-'*48} {'-'*8} {'-'*8} {'-'*8}")
    # Round 1 results (hard-coded): (name, avg eval t/s, max eval t/s, prompt t/s)
    r1 = [
        ("[기준] mmap on, -t 8, --prio 2",              10.02, 10.06, 29.52),
        ("A) --no-mmap -t 8",                           9.66,  9.70,  28.26),
        ("B) --no-mmap -t 6",                          10.02, 10.18,  26.73),
        ("C) --no-mmap -t 10",                          9.42,  9.46,  27.31),
        ("D) --no-mmap -t 12",                          9.04,  9.11,  27.92),
        ("E) --no-mmap -t 10 --prio 3 --poll 100",     9.41,  9.45,  28.37),
    ]
    for name, avg, mx, pp in r1:
        marker = " ⭐" if avg >= 10.0 else ""
        print(f"  {name:<48} {avg:>8.2f} {mx:>8.2f} {pp:>8.2f}{marker}")
    print(f"  {'--- 2라운드 ---':<48}")
    # NOTE(review): despite its name, best_avg starts at the baseline *maximum*
    # and is compared against / updated with each configuration's max_e.
    best_avg = 10.06  # previous best
    best_config = "[기준] mmap on, -t 8"
    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f"  {r['config']:<48} {'FAIL':>8}")
            continue
        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
        if max_e > best_avg:
            best_avg = max_e
            best_config = r["config"]
        marker = " ⭐" if avg_e >= 10.0 else ""
        print(f"  {r['config']:<48} {avg_e:>8.2f} {max_e:>8.2f} {avg_p:>8.2f}{marker}")
    print()
    print(f"  🏆 최고 성능: {best_config} → {best_avg:.2f} t/s")
    print(f"  완료: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 85)
 # Run the tuning loop only when executed as a script.
 if __name__ == "__main__":
    main()
--- a/scripts/download_llama.py
+++ b/scripts/download_llama.py
@@ -1,38 +0,0 @@
 import urllib.request
 import json
 import zipfile
 import os
 import shutil
 import ssl
 # Download the Windows CUDA build from the latest llama.cpp GitHub release
 # and extract it into 'llama_bin'.
 #
 # The archive contains executables, so the TLS certificate is verified with
 # the default context; hostname and certificate checks stay enabled.
 ctx = ssl.create_default_context()
 url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
 req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
 try:
    with urllib.request.urlopen(req, context=ctx) as response:
        data = json.loads(response.read().decode())
    # Pick the first asset that looks like a Windows x64 CUDA zip.
    download_url = None
    for asset in data['assets']:
        if "bin-win-cuda-cu12.2.0-x64.zip" in asset['name'] or ("bin-win-cu" in asset['name'] and asset['name'].endswith(".zip") and "x64" in asset['name']):
            download_url = asset['browser_download_url']
            break
    if download_url:
        print(f"Downloading {download_url}...")
        zip_path = "llama.zip"
        try:
            # Stream to disk in chunks instead of holding the whole archive in memory.
            with urllib.request.urlopen(download_url, context=ctx) as resp, open(zip_path, 'wb') as out_file:
                shutil.copyfileobj(resp, out_file)
            print("Extracting to 'llama_bin'...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall("llama_bin")
            print("Done extracting.")
        finally:
            # Remove the archive even when the download or extraction failed.
            if os.path.exists(zip_path):
                os.remove(zip_path)
    else:
        print("Could not find the target zip. Available assets:")
        for asset in data['assets']:
            print(" -", asset['name'])
 except Exception as e:
    # Top-level boundary of the script: report the problem and exit normally.
    print(f"Error: {e}")
--- a/scripts/download_models.py
+++ b/scripts/download_models.py
@@ -1,33 +0,0 @@
 import os
 from huggingface_hub import hf_hub_download
 # NOTE(review): this is set after huggingface_hub has already been imported;
 # if the library reads the variable at import time the setting has no
 # effect — confirm, and move it above the import if so.
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 # (repo_id, filename) pairs, downloaded in this order.
 models = [
    # Download the smaller Gemma4 26B first
    ("ggml-org/gemma-4-26B-A4B-it-GGUF", "gemma-4-26B-A4B-it-Q4_K_M.gguf"),
    # Next, Qwen 35B
    ("unsloth/Qwen3.5-35B-A3B-GGUF", "Qwen3.5-35B-A3B-Q4_K_M.gguf"),
    # Finally the 122B (split into multiple parts)
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"),
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00002-of-00003.gguf"),
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00003-of-00003.gguf")
 ]
 print("=== 고속 다운로더 시작 (huggingface_hub & hf_transfer) ===")
 os.makedirs("models", exist_ok=True)
 # Download sequentially; a failed file is reported and the loop continues
 # with the next one.
 for repo, filename in models:
    print(f"\n>>> 다운로드 중 (백그라운드 진행): [{repo}] 의 [{filename}]...")
    try:
        path = hf_hub_download(
            repo_id=repo, 
            filename=filename, 
            local_dir="./models", 
            local_dir_use_symlinks=False
        )
        print(f"완료: {path}")
    except Exception as e:
        print(f"다운로드 실패: {e}")
 print("\n모든 다운로드 프로세스가 종료되었습니다.")
--- a/scripts/download_true_llama.py
+++ b/scripts/download_true_llama.py
@@ -1,56 +0,0 @@
 import urllib.request
 import json
 import zipfile
 import os
 import ssl
 import shutil
 # Download the CUDA 12.4 Windows build from the latest llama.cpp GitHub
 # release, flatten its files into 'llama_bin_run', and fill in any files
 # that exist only in a previously extracted 'llama_bin' directory.
 #
 # The archive contains executables, so the TLS certificate is verified with
 # the default context; hostname and certificate checks stay enabled.
 ctx = ssl.create_default_context()
 url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
 req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
 try:
    with urllib.request.urlopen(req, context=ctx) as response:
        data = json.loads(response.read().decode())
    # Pick the main binary archive, skipping the separate "cudart" asset.
    download_url = None
    for asset in data['assets']:
        if "bin-win-cuda-12.4-x64.zip" in asset['name'] and "cudart" not in asset['name']:
            download_url = asset['browser_download_url']
            break
    if download_url:
        print(f"Downloading true binaries: {download_url}...")
        zip_path = "llama_main.zip"
        try:
            # Stream to disk in chunks instead of holding the whole archive in memory.
            with urllib.request.urlopen(download_url, context=ctx) as resp, open(zip_path, 'wb') as out_file:
                shutil.copyfileobj(resp, out_file)
            print("Extracting to temporary folder 'llama_temp'...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall("llama_temp")
            print("Moving exact files to 'llama_bin_run'...")
            os.makedirs("llama_bin_run", exist_ok=True)
            # Flatten: every extracted file lands directly in llama_bin_run.
            for root, dirs, files in os.walk("llama_temp"):
                for file in files:
                    shutil.move(os.path.join(root, file), os.path.join("llama_bin_run", file))
            if os.path.exists("llama_bin"):
                for item in os.listdir("llama_bin"):
                    src = os.path.join("llama_bin", item)
                    dst = os.path.join("llama_bin_run", item)
                    if not os.path.exists(dst):
                        try:
                            shutil.copy(src, dst)
                        except OSError:
                            # Best effort: skip entries that cannot be copied
                            # (e.g. sub-directories or locked files).
                            pass
        finally:
            # Clean up the archive and scratch folder even when a step failed.
            if os.path.exists(zip_path):
                os.remove(zip_path)
            shutil.rmtree("llama_temp", ignore_errors=True)
        print("Download and path extraction fully complete.")
    else:
        print("Could not find the target zip.")
 except Exception as e:
    # Top-level boundary of the script: report the problem and exit normally.
    print(f"Error: {e}")
--- a/scripts/hf_search.py
+++ b/scripts/hf_search.py
@@ -1,28 +0,0 @@
 from huggingface_hub import HfApi
 import sys
 api = HfApi()
 def search_gguf(query):
    """Print up to three repositories matching ``query`` and their GGUF files.

    For each repository the files whose name contains "q4_k_m" (case-insensitive)
    and ends in ".gguf" are listed; when there are none, the first three
    ".gguf" files are listed instead. Any error is printed, not raised.
    """
    print(f"\n--- Searching for: {query} ---")
    try:
        hits = list(api.list_models(search=query, limit=3))
        if len(hits) == 0:
            print("No models found.")
            return
        for model in hits:
            print(f"Repo: {model.id}")
            repo_files = api.list_repo_files(repo_id=model.id)
            shown = [
                name for name in repo_files
                if name.endswith(".gguf") and "q4_k_m" in name.lower()
            ]
            if len(shown) == 0:
                # No Q4_K_M file: fall back to the first three GGUF files.
                any_gguf = [name for name in repo_files if name.endswith(".gguf")]
                shown = any_gguf[:3]
            print(f"  GGUFs: {shown}")
    except Exception as err:
        print(f"Error: {err}")
 # Queries executed when the script runs; each call prints its own results.
 search_gguf("122b-a10b gguf")
 search_gguf("Qwen3.5 122b gguf")
 search_gguf("35b-a3b gguf")
 search_gguf("gemma-4 26b gguf")
 search_gguf("Qwen 122B")
--- a/scripts/perf_test.py
+++ b/scripts/perf_test.py
@@ -1,123 +0,0 @@
 import time
 import json
 import urllib.request
 import sys
 # Switch stdout to UTF-8 when the stream supports reconfigure(); streams
 # without that method raise AttributeError and are left unchanged.
 try:
    sys.stdout.reconfigure(encoding='utf-8')
 except AttributeError:
    pass
 # Base URL used for the health check and the chat-completion requests.
 BASE_URL = "http://127.0.0.1:8000"
 def check_server():
    """Return True when GET /health answers with ``"status": "ok"``.

    Any failure to reach the server or to parse its reply yields False.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
            return data.get("status") == "ok"
    except Exception:
        # Unreachable server or malformed reply counts as "not up"; unlike a
        # bare except this does not swallow KeyboardInterrupt / SystemExit.
        return False
 def run_benchmark(prompt, max_tokens=100, label="Test"):
    """Run a single benchmark request and return its measurements.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Value of the request's "max_tokens" field.
        label: Free-form name copied into the result.

    Returns:
        A dict with "label", "prompt_tokens", "completion_tokens", "elapsed"
        (wall-clock seconds for the whole request), "gen_tps_approx"
        (completion tokens divided by elapsed) and "content_preview"
        (first 100 characters of the reply).
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode("utf-8")
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    start = time.time()
    with urllib.request.urlopen(req, timeout=300) as resp:
        result = json.loads(resp.read())
    elapsed = time.time() - start
    # "content" may be present but null; .get(..., "") would then return None
    # and the slice below would raise TypeError.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage", {})
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)
    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0
    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:100]
    }
 def main():
    """Wait for the server, run one warm-up and three timed requests, print a summary.

    Returns early (after printing an error) when the server does not report
    healthy within 30 attempts spaced two seconds apart.
    """
    print("=" * 60)
    print("  LLM Performance Benchmark Tool")
    print("=" * 60)
    print()
    # Wait for server
    print("[1/3] Checking server health...")
    for i in range(30):
        if check_server():
            print("  -> Server is ready!")
            break
        print(f"  -> Waiting for server... ({i+1}/30)")
        time.sleep(2)
    else:
        # for/else: reached only when the loop finished without a break.
        print("  -> ERROR: Server not responding after 60s")
        return
    # Warmup
    print()
    print("[2/3] Warmup run (short)...")
    try:
        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f"  -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
    except Exception as e:
        # A failed warm-up is reported but does not stop the benchmark.
        print(f"  -> Warmup failed: {e}")
    # Main benchmark
    print()
    print("[3/3] Running main benchmark...")
    print("-" * 60)
    test_prompt = "Count from 1 to 50, writing each number on a new line."
    # Only successful runs are collected; failed runs are printed and skipped.
    results = []
    for i in range(3):
        print(f"  Run {i+1}/3...")
        try:
            r = run_benchmark(test_prompt, max_tokens=200, label=f"Run {i+1}")
            results.append(r)
            print(f"    Tokens: {r['completion_tokens']} | "
                  f"Time: {r['elapsed']:.2f}s | "
                  f"Speed: {r['gen_tps_approx']:.2f} t/s (approx)")
        except Exception as e:
            print(f"    ERROR: {e}")
    if results:
        print()
        print("=" * 60)
        print("  RESULTS SUMMARY")
        print("=" * 60)
        avg_tps = sum(r["gen_tps_approx"] for r in results) / len(results)
        max_tps = max(r["gen_tps_approx"] for r in results)
        min_tps = min(r["gen_tps_approx"] for r in results)
        print(f"  Runs:     {len(results)}")
        print(f"  Avg TPS:  {avg_tps:.2f} t/s (approx, includes prompt eval)")
        print(f"  Min TPS:  {min_tps:.2f} t/s")
        print(f"  Max TPS:  {max_tps:.2f} t/s")
        print()
        print("  NOTE: Check server console for exact generation t/s")
        print("  (the 'eval time' line shows pure token generation speed)")
        print("=" * 60)
 # Run the benchmark only when executed as a script.
 if __name__ == "__main__":
    main()
--- a/scripts/perf_test_122b.py
+++ b/scripts/perf_test_122b.py
@@ -1,169 +0,0 @@
 import time
 import json
 import urllib.request
 import sys
 import os
 import re
 # Switch stdout to UTF-8 so the non-ASCII symbols printed in main()'s summary
 # can be written out. A stdout object without reconfigure() raises
 # AttributeError, which is ignored and leaves the stream as-is.
 try:
    sys.stdout.reconfigure(encoding='utf-8')
 except AttributeError:
    pass
 # Root URL of the server under test; every request in this script is built
 # from it (/health, /slots, /v1/chat/completions).
 BASE_URL = "http://127.0.0.1:8000"
 def check_server():
    """Report whether the server's health endpoint says it is ready.

    Sends a request to ``{BASE_URL}/health`` with a 5 second timeout and
    parses the response body as JSON.

    Returns:
        True only when the parsed JSON has ``"status"`` equal to ``"ok"``.
        False when the request fails, the body cannot be parsed, or the
        status is anything else.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
            return data.get("status") == "ok"
    except Exception:
        # Deliberately best-effort: any ordinary failure means "not ready".
        # Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate, so a caller polling this
        # function in a loop can still be interrupted with Ctrl+C.
        return False
 def check_slots():
    """Fetch the server's ``/slots`` endpoint and return its parsed JSON.

    Sends a request to ``{BASE_URL}/slots`` with a 5 second timeout.

    Returns:
        The decoded JSON payload on success, or None when the request fails
        or the body cannot be parsed as JSON.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/slots")
        with urllib.request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read())
    except Exception:
        # Slot info is optional diagnostics, so ordinary failures map to
        # None. Catching Exception rather than using a bare ``except`` keeps
        # KeyboardInterrupt / SystemExit from being swallowed.
        return None
 def run_benchmark(prompt, max_tokens=300, label="Test"):
    """Send one chat-completion request and measure how long it takes.

    Posts ``prompt`` as a single user message to
    ``{BASE_URL}/v1/chat/completions`` with temperature 0.0 and a 600 second
    timeout, then derives an approximate tokens-per-second figure.

    Args:
        prompt: Text sent as the user message.
        max_tokens: Value passed as the request's ``max_tokens``.
        label: Free-form tag copied into the result for identification.

    Returns:
        A dict with keys ``label``, ``prompt_tokens``, ``completion_tokens``,
        ``elapsed`` (seconds for the whole request), ``gen_tps_approx``
        (completion tokens divided by ``elapsed``, or 0 when ``elapsed`` is
        not positive) and ``content_preview`` (first 150 characters of the
        reply, empty when the reply has no text content).

    Raises:
        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise on
        failure, and KeyError / IndexError when the response lacks
        ``choices[0]["message"]``.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode("utf-8")
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start
    # The reply may carry an explicit null for "content" or "usage";
    # coalesce both so slicing and .get() below cannot fail on None.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)
    # The timer spans the whole round trip, so this rate also includes
    # prompt processing and transfer time, not only token generation.
    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0
    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:150]
    }
 def main():
    """Run the full benchmark session and print a human-readable report.

    Steps, in order:
      1. Poll check_server() every 5 seconds for up to 600 seconds; give up
         and return if the server never reports ready.
      2. Print the number of slots if check_slots() returns any.
      3. Issue one short warmup request (failures are reported, not fatal).
      4. Run five prompts of up to 300 tokens each, then print total tokens,
         total time, avg/min/max approximate t/s and a verdict against the
         10 t/s target. Nothing is summarised if every run failed.
    """
    rule = "=" * 70
    print(rule)
    print("  Qwen3.5 122B-A10B Performance Benchmark")
    print("  Target: 10+ t/s generation speed")
    print(rule)
    print()

    # Step 1: wait for the health endpoint, tracking elapsed seconds directly.
    print("[1/4] Waiting for server (122B model load takes 3-5 min)...")
    poll_every = 5
    max_wait = 600  # 10 minutes max
    for waited in range(0, max_wait, poll_every):
        if check_server():
            print(f"  -> Server is ready! (waited {waited}s)")
            break
        # Progress line once every 30 seconds of waiting.
        if waited % 30 == 0:
            print(f"  -> Loading model... ({waited}s / {max_wait}s)")
        time.sleep(poll_every)
    else:
        print(f"  -> ERROR: Server not responding after {max_wait}s")
        return

    # Step 2: optional slot diagnostics.
    print()
    print("[2/4] Checking server status...")
    slot_info = check_slots()
    if slot_info:
        print(f"  -> Slots available: {len(slot_info)}")

    # Step 3: warmup request; a failure here does not abort the benchmark.
    print()
    print("[3/4] Warmup run (short, pre-heating GPU caches)...")
    try:
        warm = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f"  -> Warmup done: {warm['completion_tokens']} tokens in {warm['elapsed']:.2f}s")
        print(f"  -> Warmup speed: {warm['gen_tps_approx']:.2f} t/s (includes prompt eval)")
    except Exception as err:
        print(f"  -> Warmup failed: {err}")

    # Step 4: five measured runs, one per prompt.
    print()
    print("[4/4] Running main benchmark (5 runs x 300 tokens)...")
    print("-" * 70)
    test_prompts = [
        "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.",
        "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.",
        "Describe the complete process of photosynthesis in plants. Include both light-dependent and light-independent reactions.",
        "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.",
        "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.",
    ]
    results = []
    for run_no, prompt in enumerate(test_prompts, start=1):
        print(f"\n  Run {run_no}/5: {prompt[:50]}...")
        try:
            outcome = run_benchmark(prompt, max_tokens=300, label=f"Run {run_no}")
            results.append(outcome)
            print(f"    Completion tokens: {outcome['completion_tokens']}")
            print(f"    Total time: {outcome['elapsed']:.2f}s")
            print(f"    Approx speed: {outcome['gen_tps_approx']:.2f} t/s (includes prompt eval)")
        except Exception as err:
            print(f"    ERROR: {err}")

    # No successful run means there is nothing to summarise.
    if not results:
        return

    print()
    print(rule)
    print("  RESULTS SUMMARY - Qwen3.5 122B-A10B")
    print(rule)
    rates = [entry["gen_tps_approx"] for entry in results]
    avg_tps = sum(rates) / len(rates)
    max_tps = max(rates)
    min_tps = min(rates)
    total_tokens = sum(entry["completion_tokens"] for entry in results)
    total_time = sum(entry["elapsed"] for entry in results)
    print(f"  Runs completed: {len(results)}/5")
    print(f"  Total tokens:   {total_tokens}")
    print(f"  Total time:     {total_time:.1f}s")
    print()
    print(f"  Approx TPS (avg): {avg_tps:.2f} t/s")
    print(f"  Approx TPS (min): {min_tps:.2f} t/s")
    print(f"  Approx TPS (max): {max_tps:.2f} t/s")
    print()

    # Verdict against the 10 t/s target, judged on the average rate.
    if avg_tps >= 10:
        verdict = "  ✅ TARGET ACHIEVED: 10+ t/s!"
    elif avg_tps >= 8:
        verdict = "  ⚠️  CLOSE TO TARGET: Consider further tuning"
    else:
        verdict = f"  ❌ BELOW TARGET: {avg_tps:.1f} t/s < 10 t/s"
    print(verdict)
    print()
    print("  ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.")
    print("  ⚡ Check the server console/log for exact 'eval time' t/s value,")
    print("  ⚡ which shows pure token generation speed (always higher).")
    print(rule)
 # Script entry point: run the benchmark only when this file is executed
 # directly, not when it is imported as a module.
 if __name__ == "__main__":
    main()
--- a/start_gemma4_26b_api.bat
+++ b/start_gemma4_26b_api.bat
@@ -1,8 +1,30 @@
@echo off
 chcp 65001 >nul
 echo =========================================================
-echo  Gemma4 26B-A4B API Server (Tuned for Max Speed)
+echo  Gemma4 26B-A4B API Server (256K Context - Final Optimal)
-echo  [INFO] Tuning VRAM limit correctly to avoid WDDM swap (-ngl 22)
+echo  [CORE] --n-cpu-moe 10: VRAM 12GB 최적화용 Expert 오프로드
 echo  [TUNED] -t 4 -ub 512: CPU 병목 방지 및 SWA 캐시 최적화
 echo  [PERF] Speed: ~30.9 t/s (1x RTX 3060)
 echo =========================================================
 echo.
-llama_bin_run\llama-server.exe --model models\gemma-4-26B-A4B-it-Q4_K_M.gguf -ngl 22 -c 4096 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 8 --mlock --prio 2 --port 8000 --host 0.0.0.0
+
 llama_bin_run\llama-server.exe ^
  --model models\gemma-4-26B-A4B-it-Q4_K_M.gguf ^
  -ngl 999 ^
  --n-cpu-moe 10 ^
  -c 262144 ^
  -np 1 ^
  -fa on ^
  --cache-type-k q4_0 ^
  --cache-type-v q4_0 ^
  -ub 512 ^
  -b 2048 ^
  -t 4 ^
  -tb 4 ^
  --mlock ^
  --prio 3 ^
  --poll 50 ^
  --port 8000 ^
  --host 0.0.0.0
 pause