feat: Variet Engine v1.0 + 5-model tuning complete

Phase 01 (LLM Tuning): - Gemma4 26B: 74.65 t/s (fast) - Qwen 35B: 61.62 t/s (balanced) - Gemma4 31B: 16.0 t/s (deep-coder) - Qwen 27B: 16.7 t/s (deep-logic) - Qwen 122B: 8.95 t/s (ultra, GPU 1 only) Phase 02 (API Engine): - FastAPI reverse proxy on port 8000 - /engine/switch hot-swap with 503 protection - config/engine_models.json as single source of truth - Replaced 4 individual .bat files with unified engine File cleanup: - scripts/ 85 files -> 9 + _archive/ - Root .bat files -> _archive/
2026-04-07 18:08:58 +09:00
parent 7c7a899fd5
commit c111b3a9b0
414 changed files with 3402 additions and 68598 deletions
--- a/scripts/_archive/tuning/auto_tune_122b.py
+++ b/scripts/_archive/tuning/auto_tune_122b.py
@@ -0,0 +1,372 @@
+"""
+Qwen3.5 122B-A10B 자동 정밀 튜닝 스크립트
+===========================================
+각 설정 조합으로 서버를 재시작하고 벤치마크를 자동 수행합니다.
+서버 로그에서 순수 eval time (t/s)를 파싱하여 정확한 비교 테이블을 출력합니다.
+
+예상 소요 시간: 약 30-40분 (5개 설정 × ~6-7분/설정)
+"""
+import subprocess
+import time
+import json
+import urllib.request
+import os
+import re
+import sys
+import datetime
+
+# Switch stdout to UTF-8 (presumably so the Korean text and emoji printed by
+# this script survive consoles with a narrower default encoding). A stdout
+# object without reconfigure() raises AttributeError, which is ignored.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass
+
+# Address used for the health check and the benchmark requests; the port
+# matches the "--port" value in COMMON_ARGS below.
+BASE_URL = "http://127.0.0.1:8000"
+# Relative paths: start_server launches the process with cwd=os.getcwd().
+MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
+SERVER_EXE = r"llama_bin_run\llama-server.exe"
+
+# ============================================================
+# Configurations to test
+# ============================================================
+# Common parameters (held constant for every configuration)
+COMMON_ARGS = [
+    "--model", MODEL_PATH,
+    "-ngl", "999",
+    "--cpu-moe",
+    "-c", "2048",
+    "-np", "1",
+    "-fa", "on",
+    "--cache-type-k", "q4_0",
+    "--cache-type-v", "q4_0",
+    "-ub", "256",
+    "-b", "1024",
+    "--mlock",
+    "--port", "8000",
+    "--host", "0.0.0.0",
+    "--no-warmup",  # warm-up is performed by this benchmark script itself
+]
+
+# Variable parameter combinations.
+# Each entry has:
+#   "name"  - label printed in progress output and in the result table
+#   "desc"  - one-line description printed when the configuration starts
+#   "extra" - arguments appended after COMMON_ARGS on the server command line
+CONFIGS = [
+    {
+        "name": "A) --no-mmap -t 8",
+        "desc": "서버 권장: mmap 비활성화 (baseline 대비)",
+        "extra": ["--no-mmap", "-t", "8", "--prio", "2"],
+    },
+    {
+        "name": "B) --no-mmap -t 6",
+        "desc": "스레드 감소 (캐시 경합 회피)",
+        "extra": ["--no-mmap", "-t", "6", "--prio", "2"],
+    },
+    {
+        "name": "C) --no-mmap -t 10",
+        "desc": "스레드 증가 (RAM 대역폭 포화)",
+        "extra": ["--no-mmap", "-t", "10", "--prio", "2"],
+    },
+    {
+        "name": "D) --no-mmap -t 12",
+        "desc": "더 많은 스레드",
+        "extra": ["--no-mmap", "-t", "12", "--prio", "2"],
+    },
+    {
+        "name": "E) --no-mmap -t 10 --prio 3 --poll 100",
+        "desc": "최적 스레드 + 리얼타임 우선순위 + 폴링",
+        "extra": ["--no-mmap", "-t", "10", "--prio", "3", "--poll", "100"],
+    },
+]
+
+# ============================================================
+# 유틸리티 함수
+# ============================================================
+
+def kill_server():
+    """Force-terminate every llama-server.exe process, then pause 3 seconds.
+
+    Best-effort: taskkill's output and exit status are discarded, so calling
+    this when no server is running is harmless.
+    """
+    # An argv list avoids spawning a shell just to redirect output, and
+    # matches how the other tuning scripts invoke taskkill.
+    try:
+        subprocess.run(
+            ["taskkill", "/F", "/IM", "llama-server.exe"],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+    except OSError:
+        # taskkill itself could not be launched; nothing more to do here.
+        pass
+    # Pause before the caller starts the next server instance.
+    time.sleep(3)
+
+def start_server(config, log_path):
+    """Start llama-server for one configuration, logging its output to a file.
+
+    Args:
+        config: An entry of CONFIGS; its "extra" list is appended to
+            COMMON_ARGS on the command line.
+        log_path: File that receives the server's combined stdout/stderr.
+            It is opened in "w" mode, so existing content is truncated.
+
+    Returns:
+        (proc, log_file): the Popen handle and the still-open log file. The
+        caller must close log_file after the server has been stopped.
+
+    Raises:
+        OSError: if the log file cannot be opened or the executable cannot
+            be launched.
+    """
+    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]
+    log_file = open(log_path, "w", encoding="utf-8")
+    try:
+        proc = subprocess.Popen(
+            cmd,
+            stdout=log_file,
+            stderr=subprocess.STDOUT,
+            cwd=os.getcwd()
+        )
+    except Exception:
+        # The launch failed, so the caller never receives log_file; close it
+        # here instead of leaking the handle.
+        log_file.close()
+        raise
+    return proc, log_file
+
+def wait_for_server(timeout=600):
+    """Poll the server's /health endpoint until it reports status "ok".
+
+    Args:
+        timeout: Maximum number of seconds to keep polling.
+
+    Returns:
+        True as soon as /health returns JSON whose "status" is "ok";
+        False if that has not happened before the timeout elapses.
+    """
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            req = urllib.request.Request(f"{BASE_URL}/health")
+            with urllib.request.urlopen(req, timeout=5) as resp:
+                data = json.loads(resp.read())
+                if data.get("status") == "ok":
+                    return True
+        except Exception:
+            # Connection refused, HTTP errors and malformed bodies all mean
+            # "not ready yet". Catching Exception rather than everything lets
+            # Ctrl+C / SystemExit abort the wait.
+            pass
+        time.sleep(5)
+    return False
+
+def run_single_benchmark(prompt, max_tokens=200):
+    """Send one chat-completion request and time it.
+
+    Args:
+        prompt: User message to send.
+        max_tokens: Generation limit passed to the server.
+
+    Returns:
+        (completion_tokens, elapsed_seconds). completion_tokens is read from
+        the response's "usage" object and is 0 when that field is absent;
+        elapsed_seconds is the wall-clock time of the whole HTTP round trip.
+    """
+    body = {
+        "model": "local-model",
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }
+    request = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=json.dumps(body).encode("utf-8"),
+        headers={"Content-Type": "application/json"}
+    )
+
+    t0 = time.time()
+    with urllib.request.urlopen(request, timeout=600) as response:
+        reply = json.loads(response.read())
+    duration = time.time() - t0
+
+    return reply.get("usage", {}).get("completion_tokens", 0), duration
+
+def parse_eval_times(log_path):
+    """Extract generation ("eval time") statistics from a server log.
+
+    Only lines where "eval time" is preceded by nothing but leading
+    whitespace are matched, so "prompt eval time" lines are excluded.
+
+    Args:
+        log_path: Path of the server log file.
+
+    Returns:
+        One dict per matching line, in file order, with keys "total_ms",
+        "tokens", "ms_per_token" and "tps". An empty list if the file
+        cannot be read.
+    """
+    try:
+        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
+            content = f.read()
+    except OSError:
+        # Missing/unreadable log: report "no measurements" instead of failing.
+        return []
+
+    # "eval time = XXXXX.XX ms / NNN tokens (XXX.XX ms per token, XX.XX tokens per second)"
+    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
+    return [
+        {
+            "total_ms": float(total_ms),
+            "tokens": int(tokens),
+            "ms_per_token": float(ms_per_token),
+            "tps": float(tps),
+        }
+        for total_ms, tokens, ms_per_token, tps
+        in re.findall(pattern, content, re.MULTILINE)
+    ]
+
+def parse_prompt_eval_times(log_path):
+    """Extract prompt-processing ("prompt eval time") statistics from a log.
+
+    Args:
+        log_path: Path of the server log file.
+
+    Returns:
+        One dict per matching line, in file order, with keys "total_ms",
+        "tokens", "ms_per_token" and "tps". An empty list if the file
+        cannot be read.
+    """
+    try:
+        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
+            content = f.read()
+    except OSError:
+        # Missing/unreadable log: report "no measurements" instead of failing.
+        return []
+
+    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
+    return [
+        {
+            "total_ms": float(total_ms),
+            "tokens": int(tokens),
+            "ms_per_token": float(ms_per_token),
+            "tps": float(tps),
+        }
+        for total_ms, tokens, ms_per_token, tps
+        in re.findall(pattern, content, re.MULTILINE)
+    ]
+
+def parse_vram_usage(log_path):
+    """Read the "CUDA0 model buffer size" figure from a server log.
+
+    Args:
+        log_path: Path of the server log file.
+
+    Returns:
+        The first reported size rounded to a whole number, formatted as
+        "<n> MiB"; "N/A" if the file cannot be read or has no such line.
+    """
+    try:
+        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
+            content = f.read()
+    except OSError:
+        return "N/A"
+
+    match = re.search(r'CUDA0 model buffer size\s*=\s*([\d.]+)\s*MiB', content)
+    if match is None:
+        return "N/A"
+    return f"{float(match.group(1)):.0f} MiB"
+
+# ============================================================
+# 메인 튜닝 루프
+# ============================================================
+
+def main():
+    """Benchmark every entry of CONFIGS and print/save a comparison table.
+
+    For each configuration: stop any running server, start a new one that
+    logs to tune_log_<idx>.txt in the current directory, wait for /health,
+    send one warm-up request and three benchmark prompts, stop the server,
+    then parse the eval / prompt-eval speeds and the CUDA0 buffer size from
+    the log. Afterwards a table is printed against the hard-coded baseline
+    figures and the eval speeds are written to tune_results_<timestamp>.txt.
+
+    The server is stopped and its log handle closed even when the benchmark
+    phase raises or is interrupted.
+    """
+    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+
+    print("=" * 70)
+    print("  Qwen3.5 122B-A10B 자동 정밀 튜닝")
+    print(f"  시작 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
+    print(f"  테스트 설정: {len(CONFIGS)}개")
+    print(f"  예상 소요: ~{len(CONFIGS) * 7}분")
+    print("=" * 70)
+    print()
+    print("  기존 Baseline: mmap on, -t 8, --prio 2 → 10.06 t/s (eval)")
+    print()
+
+    # Collected results, one dict per configuration
+    all_results = []
+
+    for idx, config in enumerate(CONFIGS):
+        config_start = time.time()
+        log_path = os.path.join(os.getcwd(), f"tune_log_{idx}.txt")
+
+        print(f"\n{'='*70}")
+        print(f"  [{idx+1}/{len(CONFIGS)}] {config['name']}")
+        print(f"  {config['desc']}")
+        print(f"  시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
+        print(f"{'='*70}")
+
+        # 1. Stop any server that is still running
+        print("  [1/4] 서버 종료 중...")
+        kill_server()
+
+        # 2. Start a new server for this configuration
+        print(f"  [2/4] 서버 시작 중... (모델 로딩 ~3-5분)")
+        proc, log_file = start_server(config, log_path)
+
+        try:
+            # 3. Wait until the server is ready
+            if not wait_for_server(timeout=600):
+                print("  ❌ 서버 시작 실패! 다음 설정으로 넘어갑니다.")
+                all_results.append({
+                    "config": config["name"],
+                    "status": "FAILED",
+                    "eval_tps": [],
+                    "prompt_tps": [],
+                    "vram": "N/A"
+                })
+                continue  # the finally clause below still stops the server
+
+            load_time = time.time() - config_start
+            print(f"  [3/4] 서버 준비 완료! (로딩 {load_time:.0f}초)")
+
+            # 4. Run the benchmark (1 warm-up + 3 measured runs)
+            print("  [4/4] 벤치마크 실행 중...")
+
+            # Warm-up
+            try:
+                run_single_benchmark("Say hello.", max_tokens=20)
+                print("    워밍업 완료")
+            except Exception as e:
+                print(f"    워밍업 실패: {e}")
+
+            # Three measured runs
+            prompts = [
+                "Write a detailed explanation of how neural networks learn through backpropagation and gradient descent.",
+                "Explain the complete process of photosynthesis including light and dark reactions in detail.",
+                "Describe the differences between SQL and NoSQL databases with examples and performance characteristics.",
+            ]
+
+            for i, prompt in enumerate(prompts):
+                try:
+                    tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
+                    approx_tps = tokens / elapsed if elapsed > 0 else 0
+                    print(f"    Run {i+1}/3: {tokens} tokens, {elapsed:.1f}s, ~{approx_tps:.2f} t/s (approx)")
+                except Exception as e:
+                    print(f"    Run {i+1}/3: ERROR - {e}")
+
+            # Give the server a moment to flush its log before it is killed
+            time.sleep(2)
+        finally:
+            # Always stop the server and release the log handle, including
+            # on Ctrl+C, so no orphaned process keeps port 8000 occupied.
+            kill_server()
+            log_file.close()
+        time.sleep(2)
+
+        # Parse the log
+        eval_times = parse_eval_times(log_path)
+        prompt_times = parse_prompt_eval_times(log_path)
+        vram = parse_vram_usage(log_path)
+
+        # Drop the first entry (the warm-up request) when more than one exists
+        if len(eval_times) > 1:
+            bench_evals = eval_times[1:]
+        else:
+            bench_evals = eval_times
+
+        if len(prompt_times) > 1:
+            bench_prompts = prompt_times[1:]
+        else:
+            bench_prompts = prompt_times
+
+        eval_speeds = [e["tps"] for e in bench_evals]
+        prompt_speeds = [p["tps"] for p in bench_prompts]
+
+        result = {
+            "config": config["name"],
+            "status": "OK",
+            "eval_tps": eval_speeds,
+            "prompt_tps": prompt_speeds,
+            "vram": vram,
+        }
+        all_results.append(result)
+
+        config_elapsed = time.time() - config_start
+        print(f"\n  완료! 소요: {config_elapsed:.0f}초")
+
+        if eval_speeds:
+            avg_eval = sum(eval_speeds) / len(eval_speeds)
+            max_eval = max(eval_speeds)
+            print(f"  📊 Eval TPS: avg={avg_eval:.2f}, max={max_eval:.2f}")
+
+    # ============================================================
+    # Final comparison table
+    # ============================================================
+    print("\n")
+    print("=" * 80)
+    print("  🏆 최종 결과 비교 테이블")
+    print("=" * 80)
+    print()
+
+    # Table header
+    print(f"  {'설정':<45} {'Eval t/s':>10} {'최대':>8} {'Prompt t/s':>12} {'VRAM':>12}")
+    print(f"  {'-'*45} {'-'*10} {'-'*8} {'-'*12} {'-'*12}")
+
+    # Baseline row (hard-coded figures from an earlier run)
+    print(f"  {'[기준] mmap on, -t 8, --prio 2':<45} {'10.02':>10} {'10.06':>8} {'29.52':>12} {'5392 MiB':>12}")
+
+    best_avg = 0
+    best_config = ""
+
+    for r in all_results:
+        # A run that produced no eval measurements is reported as failed too
+        if r["status"] != "OK" or not r["eval_tps"]:
+            print(f"  {r['config']:<45} {'FAILED':>10} {'':>8} {'':>12} {r['vram']:>12}")
+            continue
+
+        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
+        max_e = max(r["eval_tps"])
+        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
+
+        if avg_e > best_avg:
+            best_avg = avg_e
+            best_config = r["config"]
+
+        # Star rows whose average beats the baseline's 10.06 t/s figure
+        marker = " ⭐" if avg_e > 10.06 else ""
+        print(f"  {r['config']:<45} {avg_e:>10.2f} {max_e:>8.2f} {avg_p:>12.2f} {r['vram']:>12}{marker}")
+
+    print()
+    if best_avg > 0:
+        improvement = ((best_avg - 10.02) / 10.02) * 100
+        print(f"  🏆 최고 성능: {best_config}")
+        print(f"     → {best_avg:.2f} t/s (기준 10.02 t/s 대비 {improvement:+.1f}%)")
+
+    print()
+    print(f"  완료 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
+    print("=" * 80)
+
+    # Also save the results to a file
+    result_path = os.path.join(os.getcwd(), f"tune_results_{timestamp}.txt")
+    with open(result_path, "w", encoding="utf-8") as f:
+        f.write("Qwen3.5 122B-A10B Fine Tuning Results\n")
+        f.write(f"Date: {timestamp}\n\n")
+        for r in all_results:
+            f.write(f"{r['config']}: {r['eval_tps']} (VRAM: {r['vram']})\n")
+    print(f"  결과 저장: {result_path}")
+
+if __name__ == "__main__":
+    main()
--- a/scripts/_archive/tuning/auto_tune_122b_r2.py
+++ b/scripts/_archive/tuning/auto_tune_122b_r2.py
@@ -0,0 +1,257 @@
+"""
+Qwen3.5 122B-A10B 정밀 튜닝 2라운드
+====================================
+1라운드 결과: mmap on이 더 빠르고, 스레드 수가 적을수록 빠름
+→ mmap on 상태에서 스레드 수 4~8 범위를 정밀 탐색
+"""
+import subprocess
+import time
+import json
+import urllib.request
+import os
+import re
+import sys
+import datetime
+
+# Switch stdout to UTF-8 (presumably so the Korean text and emoji printed by
+# this script survive consoles with a narrower default encoding). A stdout
+# object without reconfigure() raises AttributeError, which is ignored.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass
+
+# Address used for the health check and the benchmark requests; the port
+# matches the "--port" value in COMMON_ARGS below.
+BASE_URL = "http://127.0.0.1:8000"
+# Relative paths: start_server launches the process with cwd=os.getcwd().
+MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
+SERVER_EXE = r"llama_bin_run\llama-server.exe"
+
+# Server arguments shared by every configuration; each CONFIGS entry appends
+# its own "extra" arguments after these.
+COMMON_ARGS = [
+    "--model", MODEL_PATH,
+    "-ngl", "999",
+    "--cpu-moe",
+    "-c", "2048",
+    "-np", "1",
+    "-fa", "on",
+    "--cache-type-k", "q4_0",
+    "--cache-type-v", "q4_0",
+    "-ub", "256",
+    "-b", "1024",
+    "--mlock",
+    "--port", "8000",
+    "--host", "0.0.0.0",
+    "--no-warmup",
+]
+
+# Round-2 configurations. None of them passes --no-mmap; only the thread
+# count (-t) and priority (--prio) vary.
+# Each entry has:
+#   "name"  - label printed in progress output and in the result table
+#   "desc"  - one-line description printed when the configuration starts
+#   "extra" - arguments appended after COMMON_ARGS on the server command line
+CONFIGS = [
+    {
+        "name": "F) mmap on, -t 4",
+        "desc": "최소 스레드 (4개, 물리코어 절반)",
+        "extra": ["-t", "4", "--prio", "2"],
+    },
+    {
+        "name": "G) mmap on, -t 5",
+        "desc": "스레드 5개",
+        "extra": ["-t", "5", "--prio", "2"],
+    },
+    {
+        "name": "H) mmap on, -t 6",
+        "desc": "스레드 6개 (--no-mmap에서 최고였음)",
+        "extra": ["-t", "6", "--prio", "2"],
+    },
+    {
+        "name": "I) mmap on, -t 7",
+        "desc": "스레드 7개",
+        "extra": ["-t", "7", "--prio", "2"],
+    },
+    {
+        "name": "J) mmap on, -t 6, --prio 3",
+        "desc": "최적 스레드 + 리얼타임 우선순위",
+        "extra": ["-t", "6", "--prio", "3"],
+    },
+]
+
+def kill_server():
+    """Force-terminate every llama-server.exe process, then pause 3 seconds.
+
+    Best-effort: taskkill's output and exit status are discarded, so calling
+    this when no server is running is harmless.
+    """
+    # An argv list avoids spawning a shell just to redirect output.
+    try:
+        subprocess.run(
+            ["taskkill", "/F", "/IM", "llama-server.exe"],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+    except OSError:
+        # taskkill itself could not be launched; nothing more to do here.
+        pass
+    # Pause before the caller starts the next server instance.
+    time.sleep(3)
+
+def start_server(config, log_path):
+    """Start llama-server for one configuration, logging its output to a file.
+
+    Args:
+        config: An entry of CONFIGS; its "extra" list is appended to
+            COMMON_ARGS on the command line.
+        log_path: File that receives the server's combined stdout/stderr
+            (opened in "w" mode, so existing content is truncated).
+
+    Returns:
+        (proc, log_file): the Popen handle and the still-open log file. The
+        caller must close log_file after the server has been stopped.
+
+    Raises:
+        OSError: if the log file cannot be opened or the executable cannot
+            be launched.
+    """
+    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]
+    log_file = open(log_path, "w", encoding="utf-8")
+    try:
+        proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, cwd=os.getcwd())
+    except Exception:
+        # The launch failed, so the caller never receives log_file; close it
+        # here instead of leaking the handle.
+        log_file.close()
+        raise
+    return proc, log_file
+
+def wait_for_server(timeout=600):
+    """Poll the server's /health endpoint until it reports status "ok".
+
+    Args:
+        timeout: Maximum number of seconds to keep polling.
+
+    Returns:
+        True as soon as /health returns JSON whose "status" is "ok";
+        False if that has not happened before the timeout elapses.
+    """
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            req = urllib.request.Request(f"{BASE_URL}/health")
+            with urllib.request.urlopen(req, timeout=5) as resp:
+                data = json.loads(resp.read())
+                if data.get("status") == "ok":
+                    return True
+        except Exception:
+            # Any request/parse failure means "not ready yet". Catching
+            # Exception rather than everything lets Ctrl+C abort the wait.
+            pass
+        time.sleep(5)
+    return False
+
+def run_single_benchmark(prompt, max_tokens=200):
+    """Send one chat-completion request and time it.
+
+    Args:
+        prompt: User message to send.
+        max_tokens: Generation limit passed to the server.
+
+    Returns:
+        (completion_tokens, elapsed_seconds). completion_tokens comes from
+        the response's "usage" object (0 when absent); elapsed_seconds is the
+        wall-clock time of the whole HTTP round trip.
+    """
+    body = {
+        "model": "local-model",
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }
+    request = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=json.dumps(body).encode("utf-8"),
+        headers={"Content-Type": "application/json"}
+    )
+    t0 = time.time()
+    with urllib.request.urlopen(request, timeout=600) as response:
+        reply = json.loads(response.read())
+    duration = time.time() - t0
+    completion_tokens = reply.get("usage", {}).get("completion_tokens", 0)
+    return completion_tokens, duration
+
+def parse_eval_times(log_path):
+    """Extract generation ("eval time") speeds from a server log.
+
+    Only lines where "eval time" is preceded by nothing but leading
+    whitespace are matched, so "prompt eval time" lines are excluded.
+
+    Args:
+        log_path: Path of the server log file.
+
+    Returns:
+        One dict per matching line, in file order, with keys "tps" (tokens
+        per second) and "tokens". An empty list if the file cannot be read.
+    """
+    try:
+        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
+            content = f.read()
+    except OSError:
+        # Missing/unreadable log: report "no measurements" instead of failing.
+        return []
+    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
+    matches = re.findall(pattern, content, re.MULTILINE)
+    # Groups: 0=total ms, 1=token count, 2=ms per token, 3=tokens per second
+    return [{"tps": float(m[3]), "tokens": int(m[1])} for m in matches]
+
+def parse_prompt_eval_times(log_path):
+    """Extract prompt-processing ("prompt eval time") speeds from a log.
+
+    Args:
+        log_path: Path of the server log file.
+
+    Returns:
+        One dict per matching line, in file order, with the single key "tps"
+        (tokens per second). An empty list if the file cannot be read.
+    """
+    try:
+        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
+            content = f.read()
+    except OSError:
+        # Missing/unreadable log: report "no measurements" instead of failing.
+        return []
+    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
+    matches = re.findall(pattern, content, re.MULTILINE)
+    # Group 3 is the "tokens per second" figure
+    return [{"tps": float(m[3])} for m in matches]
+
+def main():
+    """Benchmark the round-2 CONFIGS and print a combined round-1/2 table.
+
+    For each configuration: stop any running server, start a new one that
+    logs to tune_r2_log_<idx>.txt in the current directory, wait for
+    /health, send one warm-up request and three benchmark prompts, stop the
+    server and parse the speeds from its log. The final table lists the
+    hard-coded round-1 figures followed by the measured round-2 results and
+    names the configuration with the highest peak (max) eval speed.
+
+    The server is stopped and its log handle closed even when the benchmark
+    phase raises or is interrupted.
+    """
+    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+
+    print("=" * 70)
+    print("  Qwen3.5 122B-A10B 정밀 튜닝 - 2라운드")
+    print(f"  시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
+    print(f"  테스트: {len(CONFIGS)}개 설정 (mmap on + 스레드 4~7 정밀 탐색)")
+    print("=" * 70)
+    print()
+
+    all_results = []
+
+    for idx, config in enumerate(CONFIGS):
+        config_start = time.time()
+        log_path = os.path.join(os.getcwd(), f"tune_r2_log_{idx}.txt")
+
+        print(f"\n{'='*70}")
+        print(f"  [{idx+1}/{len(CONFIGS)}] {config['name']}")
+        print(f"  {config['desc']}")
+        print(f"  시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
+        print(f"{'='*70}")
+
+        kill_server()
+        print(f"  [1/3] 서버 시작 중...")
+        proc, log_file = start_server(config, log_path)
+
+        try:
+            if not wait_for_server(timeout=600):
+                print("  ❌ 서버 시작 실패!")
+                all_results.append({"config": config["name"], "status": "FAILED", "eval_tps": []})
+                continue  # the finally clause below still stops the server
+
+            load_time = time.time() - config_start
+            print(f"  [2/3] 서버 준비 완료! ({load_time:.0f}초)")
+
+            # Warm-up (best-effort), then the measured runs
+            try:
+                run_single_benchmark("Say hello.", max_tokens=20)
+            except Exception:
+                # A failed warm-up is not fatal; Ctrl+C still propagates.
+                pass
+
+            print("  [3/3] 벤치마크 3회...")
+            prompts = [
+                "Write a detailed explanation of how neural networks learn through backpropagation.",
+                "Explain the complete process of photosynthesis including light and dark reactions.",
+                "Describe the differences between SQL and NoSQL databases with examples.",
+            ]
+            for i, prompt in enumerate(prompts):
+                try:
+                    tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
+                    print(f"    Run {i+1}: {tokens}tok, {elapsed:.1f}s, ~{tokens/elapsed:.2f} t/s")
+                except Exception as e:
+                    print(f"    Run {i+1}: ERROR - {e}")
+
+            # Give the server a moment to flush its log before it is killed
+            time.sleep(2)
+        finally:
+            # Always stop the server and release the log handle so no
+            # orphaned process keeps port 8000 occupied.
+            kill_server()
+            log_file.close()
+        time.sleep(2)
+
+        eval_times = parse_eval_times(log_path)
+        prompt_times = parse_prompt_eval_times(log_path)
+        # Drop the first entry (the warm-up request) when more than one exists
+        bench_evals = eval_times[1:] if len(eval_times) > 1 else eval_times
+        bench_prompts = prompt_times[1:] if len(prompt_times) > 1 else prompt_times
+
+        eval_speeds = [e["tps"] for e in bench_evals]
+        prompt_speeds = [p["tps"] for p in bench_prompts]
+
+        all_results.append({
+            "config": config["name"],
+            "status": "OK",
+            "eval_tps": eval_speeds,
+            "prompt_tps": prompt_speeds,
+        })
+
+        if eval_speeds:
+            print(f"  📊 Eval: avg={sum(eval_speeds)/len(eval_speeds):.2f}, max={max(eval_speeds):.2f}")
+
+    # Final results
+    print("\n")
+    print("=" * 85)
+    print("  🏆 전체 튜닝 결과 (1라운드 + 2라운드 통합)")
+    print("=" * 85)
+    print()
+    print(f"  {'설정':<48} {'Avg t/s':>8} {'Max t/s':>8} {'Prompt':>8}")
+    print(f"  {'-'*48} {'-'*8} {'-'*8} {'-'*8}")
+
+    # Round-1 results (hard-coded): (name, avg t/s, max t/s, prompt t/s)
+    r1 = [
+        ("[기준] mmap on, -t 8, --prio 2",              10.02, 10.06, 29.52),
+        ("A) --no-mmap -t 8",                           9.66,  9.70,  28.26),
+        ("B) --no-mmap -t 6",                          10.02, 10.18,  26.73),
+        ("C) --no-mmap -t 10",                          9.42,  9.46,  27.31),
+        ("D) --no-mmap -t 12",                          9.04,  9.11,  27.92),
+        ("E) --no-mmap -t 10 --prio 3 --poll 100",     9.41,  9.45,  28.37),
+    ]
+    for name, avg, mx, pp in r1:
+        marker = " ⭐" if avg >= 10.0 else ""
+        print(f"  {name:<48} {avg:>8.2f} {mx:>8.2f} {pp:>8.2f}{marker}")
+
+    print(f"  {'--- 2라운드 ---':<48}")
+
+    # Seed the running best from the round-1 table using the same metric the
+    # loop below compares (peak t/s). The earlier hard-coded seed of 10.06
+    # (the baseline's peak) overlooked round-1 config B, whose peak is 10.18,
+    # so the reported "best" could contradict the table printed above.
+    best_config, _, best_avg, _ = max(r1, key=lambda row: row[2])
+
+    for r in all_results:
+        # A run that produced no eval measurements is reported as failed too
+        if r["status"] != "OK" or not r["eval_tps"]:
+            print(f"  {r['config']:<48} {'FAIL':>8}")
+            continue
+        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
+        max_e = max(r["eval_tps"])
+        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
+        # NOTE: despite its name, best_avg tracks the best *peak* speed.
+        if max_e > best_avg:
+            best_avg = max_e
+            best_config = r["config"]
+        marker = " ⭐" if avg_e >= 10.0 else ""
+        print(f"  {r['config']:<48} {avg_e:>8.2f} {max_e:>8.2f} {avg_p:>8.2f}{marker}")
+
+    print()
+    print(f"  🏆 최고 성능: {best_config} → {best_avg:.2f} t/s")
+    print(f"  완료: {datetime.datetime.now().strftime('%H:%M:%S')}")
+    print("=" * 85)
+
+if __name__ == "__main__":
+    main()
--- a/scripts/_archive/tuning/auto_tune_gemma4_256k.py
+++ b/scripts/_archive/tuning/auto_tune_gemma4_256k.py
@@ -0,0 +1,339 @@
+"""
+Gemma4 26B-A4B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
+Phase 1: -ngl sweep (GPU layers)
+Phase 2: -t / -tb sweep (CPU threads)
+Phase 3: -ub / -b sweep (batch sizes)
+Phase 4: --cache-type-k/v sweep (KV cache precision)
+Phase 5: --no-mmap, --poll, --prio sweep (misc)
+Each phase fixes the best from previous phases.
+"""
+import subprocess
+import time
+import json
+import urllib.request
+import sys
+import os
+import itertools
+
+# Switch stdout to UTF-8; a stdout object without reconfigure() raises
+# AttributeError, which is ignored.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass
+
+# Address used for the health check and the benchmark requests; the port
+# matches the "--port" value emitted by build_cmd.
+BASE_URL = "http://127.0.0.1:8000"
+# Relative paths: start_server launches the process with cwd=os.getcwd().
+LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
+MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"
+CONTEXT = 262144          # value passed to -c (256 * 1024)
+BENCHMARK_RUNS = 3        # measured requests per configuration in test_config
+BENCHMARK_TOKENS = 200    # default max_tokens for run_benchmark
+
+# ─── Baseline (from previous tuning at -c 4096) ───
+# Keys are the short names build_cmd reads; the sweeps in main() override
+# them phase by phase.
+BEST = {
+    "ngl": 22,
+    "t": 8,
+    "tb": 8,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": True,
+    "mmap": True,
+    "prio": 2,
+    "poll": 50,
+}
+
+# Every successful test_config result is appended here; main() dumps the
+# list to JSON at the end.
+ALL_RESULTS = []
+
+
+def kill_server():
+    """Force-terminate every llama-server.exe process, then pause 4 seconds.
+
+    taskkill's output is captured and discarded and its exit status is not
+    checked, so this is harmless when no server is running.
+    """
+    taskkill_argv = ["taskkill", "/F", "/IM", "llama-server.exe"]
+    subprocess.run(taskkill_argv, capture_output=True)
+    time.sleep(4)
+
+
+def build_cmd(cfg):
+    """Build the llama-server argument list for one configuration.
+
+    Args:
+        cfg: Dict with the keys used in BEST ("ngl", "t", "tb", "ub", "b",
+            "ctk", "ctv", "fa", "mlock", "mmap", "prio", "poll").
+
+    Returns:
+        A list of strings starting with LLAMA_SERVER. "--mlock" is appended
+        when cfg["mlock"] is truthy and "--no-mmap" when cfg["mmap"] is falsy.
+    """
+    argv = [LLAMA_SERVER, "--model", MODEL]
+    argv += ["-ngl", str(cfg["ngl"])]
+    argv += ["-c", str(CONTEXT)]
+    argv += ["-np", "1"]
+    argv += ["-fa", cfg["fa"]]
+    argv += ["--cache-type-k", cfg["ctk"]]
+    argv += ["--cache-type-v", cfg["ctv"]]
+    # Numeric options are stringified in the same order as the command line.
+    numeric_options = (
+        ("-ub", "ub"),
+        ("-b", "b"),
+        ("-t", "t"),
+        ("-tb", "tb"),
+        ("--prio", "prio"),
+        ("--poll", "poll"),
+    )
+    for flag, key in numeric_options:
+        argv += [flag, str(cfg[key])]
+    argv += ["--port", "8000", "--host", "0.0.0.0"]
+    if cfg["mlock"]:
+        argv.append("--mlock")
+    if not cfg["mmap"]:
+        argv.append("--no-mmap")
+    return argv
+
+
+def start_server(cfg):
+    """Launch llama-server for the given configuration.
+
+    Args:
+        cfg: Configuration dict accepted by build_cmd.
+
+    Returns:
+        The subprocess.Popen handle of the started server.
+    """
+    cmd = build_cmd(cfg)
+    # The server's output is never read by this script. Sending it to a PIPE
+    # would let the pipe buffer fill up and block the server mid-benchmark,
+    # so it is discarded instead.
+    proc = subprocess.Popen(
+        cmd, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT,
+        cwd=os.getcwd()
+    )
+    return proc
+
+
+def wait_for_server(timeout=180):
+    """Poll the server's /health endpoint until it reports status "ok".
+
+    Args:
+        timeout: Maximum number of seconds to keep polling.
+
+    Returns:
+        True as soon as /health returns JSON whose "status" is "ok";
+        False if that has not happened before the timeout elapses.
+    """
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            req = urllib.request.Request(f"{BASE_URL}/health")
+            with urllib.request.urlopen(req, timeout=3) as resp:
+                data = json.loads(resp.read())
+                if data.get("status") == "ok":
+                    return True
+        except Exception:
+            # Any request/parse failure means "not ready yet". Catching
+            # Exception rather than everything lets Ctrl+C abort the wait.
+            pass
+        time.sleep(2)
+    return False
+
+
+def run_benchmark(max_tokens=BENCHMARK_TOKENS):
+    """Send one fixed counting prompt and return the observed tokens/second.
+
+    Args:
+        max_tokens: Generation limit passed to the server.
+
+    Returns:
+        completion_tokens (from the response's "usage" object, 0 if absent)
+        divided by the wall-clock seconds of the whole HTTP round trip, or 0
+        when no time elapsed.
+    """
+    body = {
+        "model": "local-model",
+        "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }
+    request = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=json.dumps(body).encode("utf-8"),
+        headers={"Content-Type": "application/json"}
+    )
+
+    t0 = time.time()
+    with urllib.request.urlopen(request, timeout=300) as response:
+        reply = json.loads(response.read())
+    duration = time.time() - t0
+
+    generated = reply.get("usage", {}).get("completion_tokens", 0)
+    if duration > 0:
+        return generated / duration
+    return 0
+
+
+def get_vram():
+    """Query nvidia-smi for used and total GPU memory.
+
+    Returns:
+        (used_mib, total_mib) as ints for the first GPU that nvidia-smi
+        lists, or (0, 0) if nvidia-smi is unavailable, times out, or prints
+        something unexpected.
+    """
+    try:
+        r = subprocess.run(
+            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
+             "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=5
+        )
+        # nvidia-smi prints one CSV line per GPU. Splitting the whole output
+        # on "," only worked with a single GPU; with several, the line break
+        # ended up inside a field and the result was always (0, 0).
+        # NOTE(review): the first line is taken to be the GPU under test --
+        # confirm on multi-GPU hosts.
+        first_line = r.stdout.strip().splitlines()[0]
+        used, total = first_line.split(",")[:2]
+        return int(used.strip()), int(total.strip())
+    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
+        # OSError: nvidia-smi missing; SubprocessError: timeout;
+        # ValueError / IndexError: empty or malformed output.
+        return 0, 0
+
+
+def test_config(cfg, label=""):
+    """Start a server with cfg, benchmark it, and record the result.
+
+    Args:
+        cfg: Configuration dict accepted by build_cmd.
+        label: Short description used in progress output; defaults to
+            str(cfg) when empty.
+
+    Returns:
+        A dict holding every cfg entry plus "avg_tps", "best_tps",
+        "vram_used", "vram_total" and "label" (also appended to
+        ALL_RESULTS), or None if the server did not become ready or every
+        benchmark run failed.
+    """
+    kill_server()
+    desc = label or str(cfg)
+    print(f"  [{desc}] Starting server...")
+    proc = start_server(cfg)
+
+    speeds = []
+    try:
+        if not wait_for_server():
+            print(f"  [{desc}] FAILED to start")
+            return None
+
+        vram_used, vram_total = get_vram()
+        print(f"  [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)
+
+        # Warmup (best-effort; its result is discarded)
+        try:
+            run_benchmark(max_tokens=20)
+        except Exception:
+            # A failed warm-up is not fatal; Ctrl+C still propagates.
+            pass
+
+        # Benchmark
+        for i in range(BENCHMARK_RUNS):
+            try:
+                tps = run_benchmark()
+                speeds.append(tps)
+            except Exception as e:
+                print(f"ERR({e}) ", end="", flush=True)
+    finally:
+        # Stop the server on every exit path, including an interrupted
+        # benchmark, so the next configuration starts cleanly.
+        proc.kill()
+
+    if not speeds:
+        print("ALL FAILED")
+        return None
+
+    avg = sum(speeds) / len(speeds)
+    best = max(speeds)
+    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
+
+    result = {**cfg, "avg_tps": avg, "best_tps": best,
+              "vram_used": vram_used, "vram_total": vram_total, "label": label}
+    ALL_RESULTS.append(result)
+    return result
+
+
+def phase_sweep(phase_name, param_name, values, base_cfg):
+    """Benchmark each candidate value of one parameter (or parameter group).
+
+    Args:
+        phase_name: Title printed in the phase banner.
+        param_name: A single cfg key, or a list of keys that are varied
+            together.
+        values: Candidate values; when param_name is a list, each candidate
+            is a sequence paired positionally with those keys.
+        base_cfg: Configuration every candidate starts from (not modified).
+
+    Returns:
+        The test_config result with the highest "avg_tps" (the earliest one
+        on ties), or None if every candidate failed.
+    """
+    rule = "=" * 70
+    print(f"\n{rule}")
+    print(f"  PHASE: {phase_name}")
+    print(f"  Sweeping: {param_name} = {values}")
+    print(f"{rule}")
+
+    grouped = isinstance(param_name, list)
+    winner = None
+    for candidate in values:
+        # Normalise both forms to a list of (key, value) overrides.
+        if grouped:
+            overrides = list(zip(param_name, candidate))
+        else:
+            overrides = [(param_name, candidate)]
+
+        trial_cfg = dict(base_cfg)
+        trial_cfg.update(overrides)
+        tag = " | ".join(f"{key}={value}" for key, value in overrides)
+
+        outcome = test_config(trial_cfg, tag)
+        if not outcome:
+            continue
+        if winner is None or outcome["avg_tps"] > winner["avg_tps"]:
+            winner = outcome
+
+    if winner:
+        print(f"\n  ★ Phase winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
+    return winner
+
+
+def main():
+    """Run the five sweep phases, verify the winner, and report the result.
+
+    Starting from BEST, each phase sweeps one parameter group via
+    phase_sweep and carries the winning values into the next phase. The
+    final configuration is printed, benchmarked five more times, rendered as
+    a recommended command line, and ALL_RESULTS is dumped to
+    scripts/tune_results_gemma4_256k.json (relative to the cwd).
+    """
+    print("=" * 70)
+    print("  Gemma4 26B-A4B COMPREHENSIVE Auto-Tuner")
+    print("  256K Context | RTX 3060 12GB")
+    print("=" * 70)
+    print()
+
+    cfg = dict(BEST)
+
+    # ─── Phase 1: -ngl (already done, quick verify top 3) ───
+    r = phase_sweep("GPU Layers (-ngl)", "ngl", [22, 21, 20], cfg)
+    if r:
+        cfg["ngl"] = r["ngl"]
+
+    # ─── Phase 2: CPU threads (-t, -tb) ───
+    thread_combos = [
+        (2, 2), (4, 4), (4, 8), (6, 6), (6, 8),
+        (8, 8), (8, 12), (10, 10), (12, 12), (16, 16)
+    ]
+    r = phase_sweep("CPU Threads (-t, -tb)", ["t", "tb"], thread_combos, cfg)
+    if r:
+        cfg["t"] = r["t"]
+        cfg["tb"] = r["tb"]
+
+    # ─── Phase 3: Batch sizes (-ub, -b) ───
+    batch_combos = [
+        (128, 512), (256, 1024), (256, 2048),
+        (512, 1024), (512, 2048), (512, 4096),
+        (1024, 2048), (1024, 4096)
+    ]
+    r = phase_sweep("Batch Sizes (-ub, -b)", ["ub", "b"], batch_combos, cfg)
+    if r:
+        cfg["ub"] = r["ub"]
+        cfg["b"] = r["b"]
+
+    # ─── Phase 4: KV cache precision ───
+    kv_combos = [
+        ("q4_0", "q4_0"),
+        ("q8_0", "q8_0"),
+        ("q4_0", "q8_0"),
+        ("f16", "f16"),
+    ]
+    r = phase_sweep("KV Cache Type (-ctk, -ctv)", ["ctk", "ctv"], kv_combos, cfg)
+    if r:
+        cfg["ctk"] = r["ctk"]
+        cfg["ctv"] = r["ctv"]
+
+    # ─── Phase 5: Misc (mmap, poll, prio) ───
+    misc_combos = [
+        (True, 50, 2),   # baseline
+        (False, 50, 2),  # no-mmap
+        (True, 0, 2),    # no polling
+        (True, 100, 2),  # max polling
+        (True, 50, 3),   # realtime priority
+        (False, 0, 3),   # no-mmap + no-poll + realtime
+    ]
+    r = phase_sweep("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], misc_combos, cfg)
+    if r:
+        cfg["mmap"] = r["mmap"]
+        cfg["poll"] = r["poll"]
+        cfg["prio"] = r["prio"]
+
+    # ─── Final Report ───
+    print()
+    print("=" * 70)
+    print("  FINAL OPTIMAL CONFIGURATION")
+    print("=" * 70)
+    print(f"  ngl:       {cfg['ngl']}")
+    print(f"  threads:   -t {cfg['t']} -tb {cfg['tb']}")
+    print(f"  batch:     -ub {cfg['ub']} -b {cfg['b']}")
+    print(f"  kv cache:  -ctk {cfg['ctk']} -ctv {cfg['ctv']}")
+    print(f"  flash:     -fa {cfg['fa']}")
+    print(f"  mlock:     {'yes' if cfg['mlock'] else 'no'}")
+    print(f"  mmap:      {'yes' if cfg['mmap'] else 'no (--no-mmap)'}")
+    print(f"  prio:      {cfg['prio']}")
+    print(f"  poll:      {cfg['poll']}")
+    print()
+
+    # Final verification run
+    print("  Running final verification (5 runs)...")
+    kill_server()
+    proc = start_server(cfg)
+    final_speeds = []
+    try:
+        # Only benchmark when the server actually came up; previously the
+        # readiness result was ignored and every failed run was swallowed.
+        if wait_for_server():
+            try:
+                run_benchmark(max_tokens=20)
+            except Exception:
+                pass  # warm-up is best-effort
+            for i in range(5):
+                try:
+                    tps = run_benchmark()
+                except Exception as e:
+                    print(f"    Run {i+1}: ERROR - {e}")
+                    continue
+                final_speeds.append(tps)
+                print(f"    Run {i+1}: {tps:.2f} t/s")
+        else:
+            print("    Server FAILED to start; skipping verification runs")
+    finally:
+        # Stop the server even if verification is interrupted.
+        proc.kill()
+
+    if final_speeds:
+        avg = sum(final_speeds) / len(final_speeds)
+        best = max(final_speeds)
+        print(f"\n  ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")
+
+    print()
+    cmd_parts = [
+        f"llama-server --model {MODEL}",
+        f"-ngl {cfg['ngl']} -c {CONTEXT}",
+        f"-t {cfg['t']} -tb {cfg['tb']}",
+        f"-ub {cfg['ub']} -b {cfg['b']}",
+        f"-fa {cfg['fa']}",
+        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
+        f"--prio {cfg['prio']} --poll {cfg['poll']}",
+    ]
+    if cfg["mlock"]:
+        cmd_parts.append("--mlock")
+    if not cfg["mmap"]:
+        cmd_parts.append("--no-mmap")
+    cmd_parts.append("--port 8000 --host 0.0.0.0")
+
+    print("  Recommended command:")
+    print(f"    {' '.join(cmd_parts)}")
+    print("=" * 70)
+
+    # Dump all results to JSON
+    with open("scripts/tune_results_gemma4_256k.json", "w") as f:
+        json.dump(ALL_RESULTS, f, indent=2, default=str)
+    print(f"\n  Full results saved: scripts/tune_results_gemma4_256k.json")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/_archive/tuning/auto_tune_gemma4_ncpumoe.py
+++ b/scripts/_archive/tuning/auto_tune_gemma4_ncpumoe.py
@@ -0,0 +1,163 @@
+"""
+Gemma4 26B --n-cpu-moe sweep + secondary param tuning at 256K context.
+Gemma4 has 30 layers. Sweep n-cpu-moe from 0 to 30.
+"""
+import subprocess, time, json, urllib.request, sys, os
+
+# Best-effort switch of stdout to UTF-8; any failure is silently ignored.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except:
+    pass
+
+BASE_URL = "http://127.0.0.1:8000"  # base URL for the /health and chat-completion requests
+SERVER = r"llama_bin_run\llama-server.exe"  # server executable, resolved against the current working directory
+MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"  # model file passed to the server via --model
+CTX = 262144  # value passed via -c (the 256K context named in the module docstring)
+RUNS = 3  # number of timed benchmark requests per tested configuration
+
+
+def kill():
+    subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True)
+    time.sleep(4)
+
+
+def start(ncpumoe, t=4, ub=512, b=2048, ctk="q4_0", ctv="q4_0", prio=3, nommap=False):
+    cmd = [SERVER, "--model", MODEL, "-ngl", "999",
+           "-c", str(CTX), "-np", "1", "-fa", "on",
+           "--cache-type-k", ctk, "--cache-type-v", ctv,
+           "-ub", str(ub), "-b", str(b), "-t", str(t), "-tb", str(t),
+           "--prio", str(prio), "--poll", "50",
+           "--mlock", "--port", "8000", "--host", "0.0.0.0"]
+    if ncpumoe > 0:
+        cmd.extend(["--n-cpu-moe", str(ncpumoe)])
+    if nommap:
+        cmd.append("--no-mmap")
+    return subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                            cwd=os.getcwd(), text=True, encoding='utf-8', errors='replace')
+
+
+def wait_ready(timeout=240):
+    t0 = time.time()
+    while time.time() - t0 < timeout:
+        try:
+            with urllib.request.urlopen(urllib.request.Request(f"{BASE_URL}/health"), timeout=3) as r:
+                if json.loads(r.read()).get("status") == "ok":
+                    return True
+        except:
+            pass
+        time.sleep(2)
+    return False
+
+
+def bench(n=200):
+    p = json.dumps({"model": "m", "messages": [{"role": "user",
+         "content": "Count from 1 to 50, each number on new line."}],
+         "max_tokens": n, "temperature": 0.0}).encode()
+    r = urllib.request.Request(f"{BASE_URL}/v1/chat/completions", data=p,
+                               headers={"Content-Type": "application/json"})
+    t0 = time.time()
+    with urllib.request.urlopen(r, timeout=300) as resp:
+        res = json.loads(resp.read())
+    dt = time.time() - t0
+    ct = res.get("usage", {}).get("completion_tokens", 0)
+    return ct / dt if dt > 0 else 0
+
+
+def vram():
+    try:
+        r = subprocess.run(["nvidia-smi", "--query-gpu=memory.used,memory.total",
+                            "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5)
+        a, b = r.stdout.strip().split(",")
+        return int(a.strip()), int(b.strip())
+    except:
+        return 0, 0
+
+
+def test(label, ncpumoe, **kw):
+    kill()
+    print(f"  [{label}] Starting...", end=" ", flush=True)
+    p = start(ncpumoe, **kw)
+    if not wait_ready():
+        print("FAILED"); p.kill(); return None
+    vu, vt = vram()
+    print(f"VRAM:{vu}/{vt} | ", end="", flush=True)
+    try: bench(20)
+    except: pass
+    speeds = []
+    for _ in range(RUNS):
+        try: speeds.append(bench())
+        except: pass
+    p.kill()
+    if not speeds:
+        print("BENCH FAILED"); return None
+    avg, best = sum(speeds)/len(speeds), max(speeds)
+    print(f"AVG:{avg:.1f} BEST:{best:.1f} t/s")
+    return {"label": label, "ncpumoe": ncpumoe, "avg": avg, "best": best,
+            "vram": vu, **kw}
+
+
+def main():
+    print("=" * 60)
+    print("  Gemma4 26B 256K | --n-cpu-moe Sweep + Param Tune")
+    print("=" * 60)
+    results = []
+
+    # Phase 1: n-cpu-moe sweep (0, 5, 10, 15, 20, 25, 30)
+    print("\n--- Phase 1: --n-cpu-moe sweep ---")
+    for n in [0, 5, 10, 15, 20, 25, 30]:
+        nm = n > 15  # use --no-mmap when heavy CPU offload
+        r = test(f"ncpumoe={n}", n, nommap=nm)
+        if r: results.append(r)
+
+    # Find best n-cpu-moe
+    best_r = max(results, key=lambda x: x["avg"])
+    best_n = best_r["ncpumoe"]
+    print(f"\n  ★ Best n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")
+
+    # Fine-tune around best
+    if best_n > 0:
+        print(f"\n--- Phase 1b: Fine-tune around ncpumoe={best_n} ---")
+        for n in [max(0, best_n-3), max(0, best_n-1), best_n+1, min(30, best_n+3)]:
+            if n == best_n: continue
+            nm = n > 15
+            r = test(f"ncpumoe={n}", n, nommap=nm)
+            if r: results.append(r)
+        best_r = max(results, key=lambda x: x["avg"])
+        best_n = best_r["ncpumoe"]
+        print(f"\n  ★ Refined n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")
+
+    # Phase 2: Thread sweep at best n-cpu-moe
+    nm = best_n > 15
+    print(f"\n--- Phase 2: Thread sweep (ncpumoe={best_n}) ---")
+    for t in [2, 4, 6, 8, 10]:
+        r = test(f"t={t}", best_n, t=t, nommap=nm)
+        if r: results.append(r)
+    best_t = max([x for x in results if x["ncpumoe"]==best_n], key=lambda x: x["avg"])
+    bt = best_t.get("t", 4)
+    print(f"\n  ★ Best threads: {bt}")
+
+    # Phase 3: Batch sweep
+    print(f"\n--- Phase 3: Batch sweep ---")
+    for ub, b in [(256, 1024), (512, 2048), (512, 4096), (1024, 2048)]:
+        r = test(f"ub={ub},b={b}", best_n, t=bt, ub=ub, b=b, nommap=nm)
+        if r: results.append(r)
+
+    # Phase 4: KV cache type
+    print(f"\n--- Phase 4: KV cache type ---")
+    for ctk, ctv in [("q4_0","q4_0"), ("q8_0","q8_0")]:
+        r = test(f"kv={ctk}", best_n, t=bt, ctk=ctk, ctv=ctv, nommap=nm)
+        if r: results.append(r)
+
+    # Final report
+    best_all = max(results, key=lambda x: x["avg"])
+    print(f"\n{'='*60}")
+    print(f"  FINAL BEST: {best_all['label']} → {best_all['avg']:.1f} t/s (VRAM: {best_all['vram']})")
+    print(f"{'='*60}")
+
+    with open("scripts/tune_results_gemma4_ncpumoe.json", "w") as f:
+        json.dump(results, f, indent=2, default=str)
+    print("  Saved: scripts/tune_results_gemma4_ncpumoe.json")
+
+
+# Script entry point: run the sweep only when this file is executed directly.
+if __name__ == "__main__":
+    main()
--- a/scripts/_archive/tuning/auto_tune_qwen35b_256k.py
+++ b/scripts/_archive/tuning/auto_tune_qwen35b_256k.py
@@ -0,0 +1,335 @@
+"""
+Qwen3.5 35B-A3B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
+Based on existing optimized setting: --cpu-moe -ngl 999 -c 4096 -t 6 (35 t/s)
+Now tuning for -c 262144 (256K context).
+
+Phase 1: --cpu-moe vs no --cpu-moe baseline
+Phase 2: -t / -tb sweep
+Phase 3: -ub / -b sweep
+Phase 4: --cache-type-k/v sweep
+Phase 5: Misc (mmap, poll, prio)
+"""
+import subprocess
+import time
+import json
+import urllib.request
+import sys
+import os
+
+# Best-effort switch of stdout to UTF-8; skipped when stdout has no
+# reconfigure() method.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass
+
+BASE_URL = "http://127.0.0.1:8000"  # base URL for the /health and chat-completion requests
+LLAMA_SERVER = r"llama_bin_run\llama-server.exe"  # server executable, resolved against the current working directory
+MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"  # model file passed via --model
+CONTEXT = 262144  # value passed via -c (256K)
+BENCHMARK_RUNS = 3  # timed requests per configuration in test_config()
+BENCHMARK_TOKENS = 200  # default max_tokens of a timed request
+
+# Starting configuration; main() copies it and overwrites entries with each
+# phase winner.  Keys are translated to command-line flags by build_cmd().
+BEST = {
+    "ngl": 999,
+    "cpu_moe": True,
+    "t": 6,
+    "tb": 6,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": True,
+    "mmap": True,
+    "prio": 2,
+    "poll": 50,
+}
+
+# Every successful test_config() result is appended here and dumped to JSON
+# at the end of main().
+ALL_RESULTS = []
+
+
+def kill_server():
+    subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True)
+    time.sleep(4)
+
+
+def build_cmd(cfg):
+    cmd = [LLAMA_SERVER, "--model", MODEL,
+           "-ngl", str(cfg["ngl"]),
+           "-c", str(CONTEXT),
+           "-np", "1",
+           "-fa", cfg["fa"],
+           "--cache-type-k", cfg["ctk"],
+           "--cache-type-v", cfg["ctv"],
+           "-ub", str(cfg["ub"]),
+           "-b", str(cfg["b"]),
+           "-t", str(cfg["t"]),
+           "-tb", str(cfg["tb"]),
+           "--prio", str(cfg["prio"]),
+           "--poll", str(cfg["poll"]),
+           "--port", "8000",
+           "--host", "0.0.0.0"]
+    if cfg.get("cpu_moe"):
+        cmd.append("--cpu-moe")
+    if cfg["mlock"]:
+        cmd.append("--mlock")
+    if not cfg["mmap"]:
+        cmd.append("--no-mmap")
+    return cmd
+
+
+def start_server(cfg):
+    cmd = build_cmd(cfg)
+    proc = subprocess.Popen(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+        cwd=os.getcwd(), text=True, encoding='utf-8', errors='replace'
+    )
+    return proc
+
+
+def wait_for_server(timeout=240):
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            req = urllib.request.Request(f"{BASE_URL}/health")
+            with urllib.request.urlopen(req, timeout=3) as resp:
+                data = json.loads(resp.read())
+                if data.get("status") == "ok":
+                    return True
+        except:
+            pass
+        time.sleep(2)
+    return False
+
+
+def run_benchmark(max_tokens=BENCHMARK_TOKENS):
+    payload = json.dumps({
+        "model": "local-model",
+        "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }).encode("utf-8")
+
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+
+    start = time.time()
+    with urllib.request.urlopen(req, timeout=300) as resp:
+        result = json.loads(resp.read())
+    elapsed = time.time() - start
+
+    usage = result.get("usage", {})
+    ct = usage.get("completion_tokens", 0)
+    return ct / elapsed if elapsed > 0 else 0
+
+
+def get_vram():
+    try:
+        r = subprocess.run(
+            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
+             "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=5
+        )
+        parts = r.stdout.strip().split(",")
+        return int(parts[0].strip()), int(parts[1].strip())
+    except:
+        return 0, 0
+
+
+def test_config(cfg, label=""):
+    kill_server()
+    desc = label or str(cfg)
+    print(f"  [{desc}] Starting server...", flush=True)
+    proc = start_server(cfg)
+
+    if not wait_for_server():
+        print(f"  [{desc}] FAILED to start")
+        proc.kill()
+        return None
+
+    vram_used, vram_total = get_vram()
+    print(f"  [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)
+
+    # Warmup
+    try:
+        run_benchmark(max_tokens=20)
+    except:
+        pass
+
+    speeds = []
+    for i in range(BENCHMARK_RUNS):
+        try:
+            tps = run_benchmark()
+            speeds.append(tps)
+        except Exception as e:
+            print(f"ERR({e}) ", end="", flush=True)
+
+    proc.kill()
+
+    if not speeds:
+        print("ALL FAILED")
+        return None
+
+    avg = sum(speeds) / len(speeds)
+    best = max(speeds)
+    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
+
+    result = {**{k: v for k, v in cfg.items()}, "avg_tps": avg, "best_tps": best,
+              "vram_used": vram_used, "vram_total": vram_total, "label": label}
+    ALL_RESULTS.append(result)
+    return result
+
+
+def phase_sweep(phase_name, param_name, values, base_cfg):
+    print(f"\n{'='*70}")
+    print(f"  PHASE: {phase_name}")
+    print(f"  Sweeping: {param_name} = {values}")
+    print(f"{'='*70}")
+
+    best_result = None
+    for val in values:
+        cfg = {**base_cfg}
+        if isinstance(param_name, list):
+            for p, v in zip(param_name, val):
+                cfg[p] = v
+            label = " | ".join(f"{p}={v}" for p, v in zip(param_name, val))
+        else:
+            cfg[param_name] = val
+            label = f"{param_name}={val}"
+
+        r = test_config(cfg, label)
+        if r and (best_result is None or r["avg_tps"] > best_result["avg_tps"]):
+            best_result = r
+
+    if best_result:
+        print(f"\n  ★ Phase winner: {best_result['label']} → {best_result['avg_tps']:.2f} t/s")
+    return best_result
+
+
+def main():
+    print("=" * 70)
+    print("  Qwen3.5 35B-A3B COMPREHENSIVE Auto-Tuner")
+    print("  256K Context | RTX 3060 12GB")
+    print("  Baseline: --cpu-moe -ngl 999 -c 4096 -t 6 → 35 t/s")
+    print("=" * 70)
+    print()
+
+    cfg = dict(BEST)
+
+    # ─── Phase 1: --cpu-moe critical test ───
+    r = phase_sweep("CPU-MoE Mode", "cpu_moe", [True, False], cfg)
+    if r:
+        cfg["cpu_moe"] = r["cpu_moe"]
+
+    # ─── Phase 2: CPU threads ───
+    thread_combos = [
+        (2, 2), (4, 4), (4, 6), (6, 6), (6, 8),
+        (8, 8), (8, 12), (10, 10), (12, 12)
+    ]
+    r = phase_sweep("CPU Threads (-t, -tb)", ["t", "tb"], thread_combos, cfg)
+    if r:
+        cfg["t"] = r["t"]
+        cfg["tb"] = r["tb"]
+
+    # ─── Phase 3: Batch sizes ───
+    batch_combos = [
+        (128, 512), (256, 1024), (256, 2048),
+        (512, 1024), (512, 2048), (512, 4096),
+        (1024, 2048), (1024, 4096)
+    ]
+    r = phase_sweep("Batch Sizes (-ub, -b)", ["ub", "b"], batch_combos, cfg)
+    if r:
+        cfg["ub"] = r["ub"]
+        cfg["b"] = r["b"]
+
+    # ─── Phase 4: KV cache ───
+    kv_combos = [
+        ("q4_0", "q4_0"),
+        ("q8_0", "q8_0"),
+        ("f16", "f16"),
+    ]
+    r = phase_sweep("KV Cache Type", ["ctk", "ctv"], kv_combos, cfg)
+    if r:
+        cfg["ctk"] = r["ctk"]
+        cfg["ctv"] = r["ctv"]
+
+    # ─── Phase 5: Misc ───
+    misc_combos = [
+        (True, 50, 2),
+        (False, 50, 2),
+        (True, 0, 2),
+        (True, 100, 2),
+        (True, 50, 3),
+    ]
+    r = phase_sweep("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], misc_combos, cfg)
+    if r:
+        cfg["mmap"] = r["mmap"]
+        cfg["poll"] = r["poll"]
+        cfg["prio"] = r["prio"]
+
+    # ─── Final Report ───
+    print()
+    print("=" * 70)
+    print("  FINAL OPTIMAL CONFIGURATION")
+    print("=" * 70)
+    for k, v in cfg.items():
+        print(f"  {k:>12}: {v}")
+    print()
+
+    # Final verification
+    print("  Running final verification (5 runs)...")
+    kill_server()
+    proc = start_server(cfg)
+    wait_for_server()
+    try:
+        run_benchmark(max_tokens=20)
+    except:
+        pass
+    final_speeds = []
+    for i in range(5):
+        try:
+            tps = run_benchmark()
+            final_speeds.append(tps)
+            print(f"    Run {i+1}: {tps:.2f} t/s")
+        except:
+            pass
+    proc.kill()
+
+    if final_speeds:
+        avg = sum(final_speeds) / len(final_speeds)
+        best = max(final_speeds)
+        print(f"\n  ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")
+
+    print()
+    cmd_parts = [
+        f"llama-server --model {MODEL}",
+        f"-ngl {cfg['ngl']} -c {CONTEXT}",
+    ]
+    if cfg.get("cpu_moe"):
+        cmd_parts.append("--cpu-moe")
+    cmd_parts.extend([
+        f"-t {cfg['t']} -tb {cfg['tb']}",
+        f"-ub {cfg['ub']} -b {cfg['b']}",
+        f"-fa {cfg['fa']}",
+        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
+        f"--prio {cfg['prio']} --poll {cfg['poll']}",
+    ])
+    if cfg["mlock"]:
+        cmd_parts.append("--mlock")
+    if not cfg["mmap"]:
+        cmd_parts.append("--no-mmap")
+    cmd_parts.append("--port 8000 --host 0.0.0.0")
+
+    print("  Recommended command:")
+    print(f"    {' '.join(cmd_parts)}")
+    print("=" * 70)
+
+    with open("scripts/tune_results_qwen35b_256k.json", "w") as f:
+        json.dump(ALL_RESULTS, f, indent=2, default=str)
+    print(f"\n  Full results saved: scripts/tune_results_qwen35b_256k.json")
+
+
+# Script entry point: run the tuner only when this file is executed directly.
+if __name__ == "__main__":
+    main()
--- a/scripts/_archive/tuning/dual_gpu_benchmark.mjs
+++ b/scripts/_archive/tuning/dual_gpu_benchmark.mjs
@@ -0,0 +1,531 @@
+/**
+ * Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
+ * ===========================================================
+ * Tests 4 models across multiple parameter configurations to find
+ * the absolute best model + settings for 256K context coding agent.
+ *
+ * Models:
+ *   1. Qwen3.5-35B-A3B  Q4_K_M     (~20.5 GB)
+ *   2. Qwen3.5-35B-A3B  MXFP4_MOE  (~20.1 GB)
+ *   3. Gemma4  26B-A4B   Q4_K_M     (~15.6 GB)
+ *   4. Gemma4  26B-A4B   MXFP4_MOE  (~15.5 GB)
+ *
+ * Run: node scripts/dual_gpu_benchmark.mjs
+ */
+
+import { spawn, execSync } from "child_process";
+import { writeFileSync, statSync, existsSync } from "fs";
+import { resolve } from "path";
+
+// ─── Configuration ─────────────────────────────────────────────
+// Base URL for the /health and chat-completion requests.
+const BASE_URL = "http://127.0.0.1:8000";
+// Server executable, resolved against the current working directory.
+const LLAMA_SERVER = String.raw`llama_bin_run\llama-server.exe`;
+const CONTEXT = 262144; // 256K
+// Timed requests per configuration in testConfig().
+const BENCHMARK_RUNS = 3;
+// Default max_tokens of a timed request.
+const BENCHMARK_TOKENS = 200;
+const SERVER_TIMEOUT = 300_000; // ms
+
+// Models to benchmark, in order. `path` is passed to the server via --model;
+// `totalLayers` sizes the reduced-layer boot fallback and bounds the
+// n-cpu-moe sweep.
+const MODELS = [
+  {
+    name: "Qwen3.5-35B-A3B Q4_K_M",
+    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
+    type: "qwen", quant: "Q4_K_M", totalLayers: 64,
+  },
+  {
+    name: "Qwen3.5-35B-A3B MXFP4_MOE",
+    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
+    type: "qwen", quant: "MXFP4_MOE", totalLayers: 64,
+  },
+  {
+    name: "Gemma4 26B-A4B Q4_K_M",
+    path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
+    type: "gemma4", quant: "Q4_K_M", totalLayers: 30,
+  },
+  {
+    name: "Gemma4 26B-A4B MXFP4_MOE",
+    path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
+    type: "gemma4", quant: "MXFP4_MOE", totalLayers: 30,
+  },
+];
+
+// Every successful benchmark result is appended here and written to
+// scripts/dual_gpu_results.json.
+const ALL_RESULTS = [];
+
+// ─── Utility ───────────────────────────────────────────────────
+
+function log(msg) {
+  const ts = new Date().toLocaleTimeString("ko-KR", { hour12: false });
+  console.log(`[${ts}] ${msg}`);
+}
+
+function sleep(ms) {
+  return new Promise((r) => setTimeout(r, ms));
+}
+
+function killServer() {
+  try {
+    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
+  } catch {}
+  return sleep(5000);
+}
+
+function getVramAll() {
+  try {
+    const out = execSync(
+      'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits',
+      { encoding: "utf-8", timeout: 5000 }
+    );
+    return out.trim().split("\n").map((line) => {
+      const [gpu, used, total] = line.split(",").map((s) => parseInt(s.trim()));
+      return { gpu, used, total };
+    });
+  } catch {
+    return [];
+  }
+}
+
+function buildCmd(modelPath, params) {
+  const {
+    ngl, t, ub, b, ctk, ctv,
+    cpuMoe = false, nCpuMoe = 0,
+    prio = 3, nommap = false
+  } = params;
+
+  const cmd = [
+    LLAMA_SERVER,
+    "--model", modelPath,
+    "-ngl", String(ngl),
+    "-c", String(CONTEXT),
+    "-np", "1",
+    "-fa", "on",
+    "--cache-type-k", ctk,
+    "--cache-type-v", ctv,
+    "-ub", String(ub),
+    "-b", String(b),
+    "-t", String(t),
+    "-tb", String(t),
+    "--prio", String(prio),
+    "--poll", "50",
+    "--mlock",
+    "--port", "8000",
+    "--host", "0.0.0.0",
+  ];
+
+  if (cpuMoe) cmd.push("--cpu-moe");
+  else if (nCpuMoe > 0) cmd.push("--n-cpu-moe", String(nCpuMoe));
+  if (nommap) cmd.push("--no-mmap");
+
+  return cmd;
+}
+
+function startServer(modelPath, params) {
+  const args = buildCmd(modelPath, params);
+  const exe = args.shift();
+  log(`  CMD: ${exe} ${args.slice(-12).join(" ")} ...`);
+  return spawn(exe, args, {
+    cwd: process.cwd(),
+    stdio: ["ignore", "pipe", "pipe"],
+  });
+}
+
+async function waitForServer(timeoutMs = SERVER_TIMEOUT) {
+  const start = Date.now();
+  while (Date.now() - start < timeoutMs) {
+    try {
+      const resp = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
+      const data = await resp.json();
+      if (data.status === "ok") return { ok: true, bootTime: (Date.now() - start) / 1000 };
+    } catch {}
+    await sleep(3000);
+  }
+  return { ok: false, bootTime: timeoutMs / 1000 };
+}
+
+async function runBenchmark(maxTokens = BENCHMARK_TOKENS) {
+  const payload = JSON.stringify({
+    model: "local-model",
+    messages: [{ role: "user", content: "Count from 1 to 50, writing each number on a new line." }],
+    max_tokens: maxTokens,
+    temperature: 0.0,
+  });
+
+  const start = Date.now();
+  const resp = await fetch(`${BASE_URL}/v1/chat/completions`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: payload,
+    signal: AbortSignal.timeout(600_000),
+  });
+  const result = await resp.json();
+  const elapsed = (Date.now() - start) / 1000;
+
+  const usage = result.usage || {};
+  const ct = usage.completion_tokens || 0;
+  return {
+    tps: elapsed > 0 ? ct / elapsed : 0,
+    completionTokens: ct,
+    promptTokens: usage.prompt_tokens || 0,
+    elapsed,
+  };
+}
+
+async function testConfig(model, label, params) {
+  await killServer();
+  log(`  [${label}] Starting server...`);
+
+  const proc = startServer(model.path, params);
+  const { ok, bootTime } = await waitForServer();
+
+  if (!ok) {
+    log(`  [${label}] FAILED to start (timeout ${SERVER_TIMEOUT / 1000}s)`);
+    proc.kill("SIGKILL");
+    return null;
+  }
+
+  const vram = getVramAll();
+  const vramStr = vram.map((g) => `GPU${g.gpu}:${g.used}/${g.total}MiB`).join(" | ");
+  log(`  [${label}] Boot: ${bootTime.toFixed(0)}s | VRAM: ${vramStr}`);
+
+  // Warmup
+  try { await runBenchmark(20); } catch {}
+
+  // Benchmark
+  const speeds = [];
+  for (let i = 0; i < BENCHMARK_RUNS; i++) {
+    try {
+      const r = await runBenchmark();
+      speeds.push(r.tps);
+      log(`    Run ${i + 1}: ${r.tps.toFixed(2)} t/s`);
+    } catch (e) {
+      log(`    Run ${i + 1}: ERROR (${e.message})`);
+    }
+  }
+
+  proc.kill("SIGKILL");
+
+  if (speeds.length === 0) {
+    log(`  [${label}] ALL BENCHMARK RUNS FAILED`);
+    return null;
+  }
+
+  const avg = speeds.reduce((a, b) => a + b) / speeds.length;
+  const best = Math.max(...speeds);
+  log(`  [${label}] => AVG: ${avg.toFixed(2)} t/s | BEST: ${best.toFixed(2)} t/s`);
+
+  const result = {
+    model: model.name, quant: model.quant, label,
+    avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2),
+    boot_time: +bootTime.toFixed(1), vram, params,
+  };
+  ALL_RESULTS.push(result);
+  return result;
+}
+
+// ─── Phase Runners ─────────────────────────────────────────────
+
+async function phase0_bootTest(model) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 0: Boot Test — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  // Try full GPU first
+  let r = await testConfig(model, "boot-ngl999", {
+    ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0",
+  });
+  if (r) return r;
+
+  // Try with cpu-moe
+  log("  Full GPU failed, trying with --cpu-moe...");
+  r = await testConfig(model, "boot-cpumoe", {
+    ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe: true,
+  });
+  if (r) return r;
+
+  // Reduced layers
+  log("  --cpu-moe also failed, trying reduced layers...");
+  r = await testConfig(model, "boot-ngl-half", {
+    ngl: Math.floor(model.totalLayers / 2), t: 6, ub: 512, b: 2048,
+    ctk: "q4_0", ctv: "q4_0",
+  });
+  return r;
+}
+
+async function phase1_gpuOffload(model, baseline) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 1: GPU Offload Strategy — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  const results = baseline ? [baseline] : [];
+
+  // Test --cpu-moe on/off
+  for (const cpuMoe of [true, false]) {
+    const lbl = `ngl=999 cpuMoe=${cpuMoe}`;
+    if (baseline?.params?.cpuMoe === cpuMoe && baseline.params.ngl === 999) continue;
+    const r = await testConfig(model, lbl, {
+      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe,
+    });
+    if (r) results.push(r);
+  }
+
+  // n-cpu-moe sweep
+  for (const n of [0, 5, 10, 15, 20]) {
+    if (n > model.totalLayers) continue;
+    const r = await testConfig(model, `n-cpu-moe=${n}`, {
+      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", nCpuMoe: n,
+    });
+    if (r) results.push(r);
+  }
+
+  if (results.length === 0) { log("  PHASE 1: No config worked!"); return null; }
+  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
+  log(`\n  ★ Phase 1 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
+  return best;
+}
+
+async function phase2_threads(model, prev) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 2: CPU Thread Sweep — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  const p = prev.params;
+  const results = [prev];
+
+  for (const t of [2, 4, 6, 8, 10, 12]) {
+    if (t === p.t) continue;
+    const r = await testConfig(model, `t=${t}`, {
+      ...p, t,
+    });
+    if (r) results.push(r);
+  }
+
+  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
+  log(`\n  ★ Phase 2 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
+  return best;
+}
+
+async function phase3_batch(model, prev) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 3: Batch Size Sweep — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  const p = prev.params;
+  const results = [prev];
+
+  for (const [ub, b] of [
+    [128, 512], [256, 1024], [256, 2048],
+    [512, 1024], [512, 2048], [512, 4096],
+    [1024, 2048], [1024, 4096],
+  ]) {
+    if (ub === p.ub && b === p.b) continue;
+    const r = await testConfig(model, `ub=${ub} b=${b}`, { ...p, ub, b });
+    if (r) results.push(r);
+  }
+
+  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
+  log(`\n  ★ Phase 3 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
+  return best;
+}
+
+async function phase4_kvcache(model, prev) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 4: KV Cache Type Sweep — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  const p = prev.params;
+  const results = [prev];
+
+  for (const [ctk, ctv] of [
+    ["q4_0", "q4_0"], ["q8_0", "q8_0"],
+    ["q4_0", "q8_0"], ["f16", "f16"],
+  ]) {
+    if (ctk === p.ctk && ctv === p.ctv) continue;
+    const r = await testConfig(model, `kv=${ctk}/${ctv}`, { ...p, ctk, ctv });
+    if (r) results.push(r);
+  }
+
+  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
+  log(`\n  ★ Phase 4 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
+  return best;
+}
+
+async function phase5_final(model, prev) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 5: Final Verification (5 runs) — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  await killServer();
+  const proc = startServer(model.path, prev.params);
+  const { ok, bootTime } = await waitForServer();
+  if (!ok) { log("  FAILED to start!"); proc.kill("SIGKILL"); return prev; }
+
+  const vram = getVramAll();
+  try { await runBenchmark(20); } catch {}
+
+  const speeds = [];
+  for (let i = 0; i < 5; i++) {
+    try {
+      const r = await runBenchmark();
+      speeds.push(r.tps);
+      log(`    Final Run ${i + 1}: ${r.tps.toFixed(2)} t/s`);
+    } catch (e) {
+      log(`    Final Run ${i + 1}: ERROR (${e.message})`);
+    }
+  }
+  proc.kill("SIGKILL");
+
+  if (speeds.length > 0) {
+    const avg = speeds.reduce((a, b) => a + b) / speeds.length;
+    const best = Math.max(...speeds);
+    log(`\n  ★ FINAL: AVG ${avg.toFixed(2)} t/s | BEST ${best.toFixed(2)} t/s`);
+
+    const final_ = {
+      model: model.name, quant: model.quant,
+      label: `FINAL-${model.name}`,
+      avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2),
+      boot_time: +bootTime.toFixed(1), vram, params: prev.params,
+    };
+    ALL_RESULTS.push(final_);
+    return final_;
+  }
+  return prev;
+}
+
+// ─── Main ──────────────────────────────────────────────────────
+
+async function runModelBenchmark(model) {
+  log(`\n${"#".repeat(70)}`);
+  log(`  MODEL: ${model.name}`);
+  log(`  File:  ${model.path}`);
+  try {
+    const sz = statSync(model.path).size / 1024 ** 3;
+    log(`  Size:  ${sz.toFixed(2)} GB`);
+  } catch { log(`  Size:  unknown`); }
+  log(`${"#".repeat(70)}`);
+
+  if (!existsSync(model.path)) {
+    log(`  SKIP: Model file not found!`);
+    return null;
+  }
+
+  const baseline = await phase0_bootTest(model);
+  if (!baseline) { log(`  SKIP: Cannot boot at 256K!`); return null; }
+
+  let best = await phase1_gpuOffload(model, baseline);
+  if (!best) return baseline;
+
+  best = await phase2_threads(model, best);
+  best = await phase3_batch(model, best);
+  best = await phase4_kvcache(model, best);
+  best = await phase5_final(model, best);
+
+  return best;
+}
+
+/**
+ * Benchmark every entry of MODELS, rank the per-model winners by average
+ * tokens per second, and write the raw results and a text summary to
+ * scripts/dual_gpu_results.json and scripts/dual_gpu_summary.txt.
+ */
+async function main() {
+  const startTime = Date.now();
+
+  log("=".repeat(70));
+  log("  DUAL-GPU COMPREHENSIVE MODEL BENCHMARK");
+  log("  2x RTX 3060 (24GB Total) | 256K Context");
+  log(`  Models: ${MODELS.length}`);
+  log(`  Started: ${new Date().toISOString()}`);
+  log("=".repeat(70));
+
+  // Log the memory usage of each GPU before any server is started.
+  const gpus = getVramAll();
+  gpus.forEach((g) => log(`  GPU ${g.gpu}: ${g.used}/${g.total} MiB used`));
+
+  // Best result per model; models that could not be benchmarked are left out.
+  const winners = [];
+
+  for (let i = 0; i < MODELS.length; i++) {
+    log(`\n${"=".repeat(70)}`);
+    log(`  STARTING MODEL ${i + 1}/${MODELS.length}: ${MODELS[i].name}`);
+    log(`${"=".repeat(70)}`);
+
+    const winner = await runModelBenchmark(MODELS[i]);
+    if (winner) winners.push(winner);
+
+    // Save intermediate
+    // (rewritten after every model, so earlier results are on disk before
+    // the next model starts)
+    writeFileSync("scripts/dual_gpu_results.json",
+      JSON.stringify(ALL_RESULTS, null, 2));
+    log(`  Intermediate saved (${ALL_RESULTS.length} configs tested)`);
+  }
+
+  // ─── Grand Final ───────────────────────────────────────────
+  const elapsed = (Date.now() - startTime) / 60000; // minutes
+
+  log(`\n${"=".repeat(70)}`);
+  log(`  GRAND FINAL COMPARISON`);
+  log(`  Total time: ${elapsed.toFixed(1)} minutes`);
+  log(`  Configs tested: ${ALL_RESULTS.length}`);
+  log(`${"=".repeat(70)}`);
+
+  if (winners.length === 0) {
+    log("  No models ran at 256K!");
+    return;
+  }
+
+  // Highest average speed first.
+  winners.sort((a, b) => b.avg_tps - a.avg_tps);
+  const medals = ["🥇", "🥈", "🥉", "  "];
+
+  // Lines of the human-readable summary, joined and written out below.
+  const lines = [
+    `Dual-GPU Benchmark Results — ${new Date().toISOString()}`,
+    `Hardware: 2x RTX 3060 12GB | Context: 256K`,
+    `Configs tested: ${ALL_RESULTS.length} | Time: ${elapsed.toFixed(1)} min`,
+    "", "=".repeat(60), "  RANKING (by AVG t/s)", "=".repeat(60),
+  ];
+
+  for (let i = 0; i < winners.length; i++) {
+    const w = winners[i];
+    const p = w.params;
+    lines.push("");
+    lines.push(`  ${medals[i] || "  "} #${i + 1}: ${w.model}`);
+    lines.push(`      AVG: ${w.avg_tps.toFixed(2)} t/s | BEST: ${w.best_tps.toFixed(2)} t/s`);
+    lines.push(`      Boot: ${w.boot_time.toFixed(0)}s`);
+    lines.push(`      ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b}`);
+    lines.push(`      ctk=${p.ctk} ctv=${p.ctv}`);
+    if (p.cpuMoe) lines.push(`      --cpu-moe`);
+    else if ((p.nCpuMoe || 0) > 0) lines.push(`      --n-cpu-moe ${p.nCpuMoe}`);
+  }
+
+  // After sorting, the first winner is the overall fastest model.
+  const champ = winners[0];
+  const cp = champ.params;
+  lines.push("", "=".repeat(60));
+  lines.push(`  ★ CHAMPION: ${champ.model}`);
+  lines.push(`    ${champ.avg_tps.toFixed(2)} t/s average`);
+  lines.push("=".repeat(60));
+
+  // Build recommended command
+  // (results store only the model name, so the path is looked up in MODELS;
+  // the fixed flags and the prio default of 3 mirror buildCmd())
+  const cmdParts = [
+    `llama-server --model ${MODELS.find((m) => m.name === champ.model).path}`,
+    `-ngl ${cp.ngl} -c ${CONTEXT}`,
+    `-t ${cp.t} -tb ${cp.t}`,
+    `-ub ${cp.ub} -b ${cp.b}`,
+    `-fa on`,
+    `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
+    `--prio ${cp.prio || 3} --poll 50`,
+    `--mlock`,
+  ];
+  if (cp.cpuMoe) cmdParts.push("--cpu-moe");
+  else if ((cp.nCpuMoe || 0) > 0) cmdParts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
+  if (cp.nommap) cmdParts.push("--no-mmap");
+  cmdParts.push("--port 8000 --host 0.0.0.0");
+
+  lines.push("", "  Recommended command:");
+  lines.push(`    ${cmdParts.join(" ")}`);
+
+  const summary = lines.join("\n");
+  console.log(summary);
+  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
+  writeFileSync("scripts/dual_gpu_results.json",
+    JSON.stringify(ALL_RESULTS, null, 2));
+
+  log(`\n  Results: scripts/dual_gpu_results.json`);
+  log(`  Summary: scripts/dual_gpu_summary.txt`);
+  log(`  DONE!`);
+
+  // Final cleanup call before exiting.
+  await killServer();
+}
+
+// Entry point: run the benchmark; any unhandled error is printed and the
+// process exits with status 1.
+main().catch((e) => {
+  console.error("Fatal error:", e);
+  process.exit(1);
+});
--- a/scripts/_archive/tuning/dual_gpu_benchmark.py
+++ b/scripts/_archive/tuning/dual_gpu_benchmark.py
@@ -0,0 +1,644 @@
+"""
+Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
+=====================================================================
+Tests 4 models across multiple parameter configurations to find
+the absolute best model + settings for 256K context coding agent.
+
+Models:
+  1. Qwen3.5-35B-A3B  Q4_K_M     (~20.5 GB)
+  2. Qwen3.5-35B-A3B  MXFP4_MOE  (~20.1 GB)
+  3. Gemma4  26B-A4B   Q4_K_M     (~15.6 GB)
+  4. Gemma4  26B-A4B   MXFP4_MOE  (~15.5 GB)
+
+Test Phases (per model):
+  Phase 0: Basic dual-GPU startup test (can it even boot at 256K?)
+  Phase 1: GPU layer + MoE offload strategy sweep
+  Phase 2: CPU thread sweep (carry best from P1)
+  Phase 3: Batch size sweep (carry best from P1+P2)
+  Phase 4: KV cache type sweep (carry best from P1+P2+P3)
+  Phase 5: Final verification (5 runs)
+
+Output: scripts/dual_gpu_results.json  (all raw data)
+        scripts/dual_gpu_summary.txt   (human-readable winner)
+"""
+import subprocess
+import time
+import json
+import urllib.request
+import sys
+import os
+import datetime
+
+# Switch stdout to UTF-8 (this script prints non-ASCII characters such as
+# "★" and medal emoji). Best effort: if reconfiguring fails for any reason,
+# the failure is ignored and stdout keeps its current encoding.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except Exception:
+    pass
+
+# ─── Configuration ───────────────────────────────────────────────
+# Base URL for the health check and benchmark requests; the port matches the
+# "--port 8000" argument that build_cmd() passes to llama-server.
+BASE_URL = "http://127.0.0.1:8000"
+# Server executable, given relative to the working directory.
+LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
+CONTEXT = 262144  # 256K; passed to llama-server as "-c"
+BENCHMARK_RUNS = 3  # measured requests per configuration in test_config()
+BENCHMARK_TOKENS = 200  # default max_tokens for each benchmark request
+SERVER_TIMEOUT = 300  # seconds to wait for server startup
+
+# Models to benchmark, in run order. Keys used by this script:
+#   name         - display name; also stored in every result dict
+#   path         - GGUF file, relative to the working directory
+#   quant        - quantisation label, copied into result dicts
+#   total_layers - used for the half-layer boot fallback and to bound the
+#                  n-cpu-moe sweep
+# "type" and "is_mxfp4" are not read anywhere in this script.
+MODELS = [
+    {
+        "name": "Qwen3.5-35B-A3B Q4_K_M",
+        "path": r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf",
+        "type": "qwen",
+        "quant": "Q4_K_M",
+        "is_mxfp4": False,
+        "total_layers": 64,  # Qwen3.5 35B has 64 layers
+    },
+    {
+        "name": "Qwen3.5-35B-A3B MXFP4_MOE",
+        "path": r"models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf",
+        "type": "qwen",
+        "quant": "MXFP4_MOE",
+        "is_mxfp4": True,
+        "total_layers": 64,
+    },
+    {
+        "name": "Gemma4 26B-A4B Q4_K_M",
+        "path": r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf",
+        "type": "gemma4",
+        "quant": "Q4_K_M",
+        "is_mxfp4": False,
+        "total_layers": 30,  # Gemma4 26B has 30 layers
+    },
+    {
+        "name": "Gemma4 26B-A4B MXFP4_MOE",
+        "path": r"models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf",
+        "type": "gemma4",
+        "quant": "MXFP4_MOE",
+        "is_mxfp4": True,
+        "total_layers": 30,
+    },
+]
+
+# Every result dict produced during the run, in execution order; main()
+# dumps this list to scripts/dual_gpu_results.json.
+ALL_RESULTS = []
+
+
+# ─── Utility Functions ──────────────────────────────────────────
+def log(msg):
+    ts = datetime.datetime.now().strftime("%H:%M:%S")
+    print(f"[{ts}] {msg}", flush=True)
+
+
+def kill_server():
+    subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"],
+                   capture_output=True)
+    time.sleep(5)
+
+
+def get_vram_all():
+    """Returns list of (used, total) tuples for each GPU."""
+    try:
+        r = subprocess.run(
+            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total",
+             "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=5
+        )
+        gpus = []
+        for line in r.stdout.strip().split("\n"):
+            parts = [p.strip() for p in line.split(",")]
+            if len(parts) >= 3:
+                gpus.append({
+                    "gpu": int(parts[0]),
+                    "used": int(parts[1]),
+                    "total": int(parts[2]),
+                })
+        return gpus
+    except Exception:
+        return []
+
+
+def build_cmd(model_path, ngl, t, ub, b, ctk, ctv,
+              cpu_moe=False, n_cpu_moe=0, prio=3, nommap=False):
+    """Build llama-server command for dual-GPU."""
+    cmd = [
+        LLAMA_SERVER,
+        "--model", model_path,
+        "-ngl", str(ngl),
+        "-c", str(CONTEXT),
+        "-np", "1",
+        "-fa", "on",
+        "--cache-type-k", ctk,
+        "--cache-type-v", ctv,
+        "-ub", str(ub),
+        "-b", str(b),
+        "-t", str(t),
+        "-tb", str(t),
+        "--prio", str(prio),
+        "--poll", "50",
+        "--mlock",
+        "--port", "8000",
+        "--host", "0.0.0.0",
+    ]
+    # MoE offloading options
+    if cpu_moe:
+        cmd.append("--cpu-moe")
+    elif n_cpu_moe > 0:
+        cmd.extend(["--n-cpu-moe", str(n_cpu_moe)])
+    if nommap:
+        cmd.append("--no-mmap")
+    return cmd
+
+
+def start_server(model_path, **kwargs):
+    cmd = build_cmd(model_path, **kwargs)
+    log(f"  CMD: {' '.join(cmd[-20:])}")  # show last 20 args
+    proc = subprocess.Popen(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+        cwd=os.getcwd(), text=True, encoding='utf-8', errors='replace'
+    )
+    return proc
+
+
+def wait_for_server(timeout=SERVER_TIMEOUT):
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            req = urllib.request.Request(f"{BASE_URL}/health")
+            with urllib.request.urlopen(req, timeout=3) as resp:
+                data = json.loads(resp.read())
+                if data.get("status") == "ok":
+                    boot_time = time.time() - start
+                    return True, boot_time
+        except Exception:
+            pass
+        time.sleep(3)
+    return False, timeout
+
+
+def run_benchmark(max_tokens=BENCHMARK_TOKENS):
+    payload = json.dumps({
+        "model": "local-model",
+        "messages": [{"role": "user",
+                      "content": "Count from 1 to 50, writing each number on a new line."}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0,
+    }).encode("utf-8")
+
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"},
+    )
+
+    start = time.time()
+    with urllib.request.urlopen(req, timeout=600) as resp:
+        result = json.loads(resp.read())
+    elapsed = time.time() - start
+
+    usage = result.get("usage", {})
+    ct = usage.get("completion_tokens", 0)
+    pt = usage.get("prompt_tokens", 0)
+    return {
+        "tps": ct / elapsed if elapsed > 0 else 0,
+        "completion_tokens": ct,
+        "prompt_tokens": pt,
+        "elapsed": elapsed,
+    }
+
+
+def test_config(model_info, label, **kwargs):
+    """Test a single configuration. Returns result dict or None."""
+    kill_server()
+    log(f"  [{label}] Starting server...")
+
+    proc = start_server(model_info["path"], **kwargs)
+    ok, boot_time = wait_for_server()
+
+    if not ok:
+        log(f"  [{label}] FAILED to start (timeout {SERVER_TIMEOUT}s)")
+        proc.kill()
+        return None
+
+    vram = get_vram_all()
+    vram_str = " | ".join(f"GPU{g['gpu']}:{g['used']}/{g['total']}MiB" for g in vram)
+    log(f"  [{label}] Boot: {boot_time:.0f}s | VRAM: {vram_str}")
+
+    # Warmup
+    try:
+        run_benchmark(max_tokens=20)
+    except Exception:
+        pass
+
+    # Benchmark runs
+    speeds = []
+    for i in range(BENCHMARK_RUNS):
+        try:
+            r = run_benchmark()
+            speeds.append(r["tps"])
+            log(f"    Run {i+1}: {r['tps']:.2f} t/s")
+        except Exception as e:
+            log(f"    Run {i+1}: ERROR ({e})")
+
+    proc.kill()
+
+    if not speeds:
+        log(f"  [{label}] ALL BENCHMARK RUNS FAILED")
+        return None
+
+    avg = sum(speeds) / len(speeds)
+    best = max(speeds)
+    log(f"  [{label}] => AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
+
+    result = {
+        "model": model_info["name"],
+        "quant": model_info["quant"],
+        "label": label,
+        "avg_tps": round(avg, 2),
+        "best_tps": round(best, 2),
+        "boot_time": round(boot_time, 1),
+        "vram": vram,
+        "params": kwargs,
+    }
+    ALL_RESULTS.append(result)
+    return result
+
+
+# ─── Phase Runners ───────────────────────────────────────────────
+
+def phase0_boot_test(model):
+    """Quick test: can the model even boot with 256K on dual GPU?"""
+    log(f"\n{'='*70}")
+    log(f"  PHASE 0: Boot Test — {model['name']}")
+    log(f"{'='*70}")
+
+    # Try -ngl 999 (all layers to GPU) as baseline
+    r = test_config(
+        model, f"boot-ngl999",
+        ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
+    )
+    if r:
+        return r
+
+    # If full GPU fails, try with cpu-moe
+    log("  Full GPU failed, trying with --cpu-moe...")
+    r = test_config(
+        model, f"boot-cpumoe",
+        ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
+        cpu_moe=True,
+    )
+    if r:
+        return r
+
+    # Extreme fallback: fewer layers
+    log("  --cpu-moe also failed, trying reduced layers...")
+    r = test_config(
+        model, f"boot-ngl-half",
+        ngl=model["total_layers"] // 2, t=6, ub=512, b=2048,
+        ctk="q4_0", ctv="q4_0",
+    )
+    return r
+
+
+def phase1_gpu_offload(model, baseline):
+    """Find optimal GPU layer count and MoE offload strategy."""
+    log(f"\n{'='*70}")
+    log(f"  PHASE 1: GPU Offload Strategy — {model['name']}")
+    log(f"{'='*70}")
+
+    results = []
+    if baseline:
+        results.append(baseline)
+
+    total = model["total_layers"]
+
+    # Strategy A: All GPU + cpu-moe variations
+    for cpu_moe in [True, False]:
+        label = f"ngl=999 cpu_moe={cpu_moe}"
+        # Skip if already tested in baseline
+        if baseline and baseline["label"] in [f"boot-ngl999", f"boot-cpumoe"] and \
+           baseline["params"].get("cpu_moe", False) == cpu_moe:
+            continue
+        r = test_config(
+            model, label,
+            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
+            cpu_moe=cpu_moe,
+        )
+        if r:
+            results.append(r)
+
+    # Strategy B: n-cpu-moe sweep (selective expert offload)
+    for n in [0, 5, 10, 15, 20]:
+        if n > total:
+            continue
+        r = test_config(
+            model, f"n-cpu-moe={n}",
+            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
+            n_cpu_moe=n,
+        )
+        if r:
+            results.append(r)
+
+    if not results:
+        log("  PHASE 1: No configuration worked!")
+        return None
+
+    best = max(results, key=lambda x: x["avg_tps"])
+    log(f"\n  ★ Phase 1 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
+    return best
+
+
+def phase2_threads(model, prev_best):
+    """Sweep CPU threads with best GPU config locked."""
+    log(f"\n{'='*70}")
+    log(f"  PHASE 2: CPU Thread Sweep — {model['name']}")
+    log(f"{'='*70}")
+
+    p = prev_best["params"]
+    results = [prev_best]
+
+    for t in [2, 4, 6, 8, 10, 12]:
+        if t == p.get("t", 6):
+            continue
+        r = test_config(
+            model, f"t={t}",
+            ngl=p["ngl"], t=t, ub=p["ub"], b=p["b"],
+            ctk=p["ctk"], ctv=p["ctv"],
+            cpu_moe=p.get("cpu_moe", False),
+            n_cpu_moe=p.get("n_cpu_moe", 0),
+        )
+        if r:
+            results.append(r)
+
+    best = max(results, key=lambda x: x["avg_tps"])
+    log(f"\n  ★ Phase 2 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
+    return best
+
+
+def phase3_batch(model, prev_best):
+    """Sweep batch sizes."""
+    log(f"\n{'='*70}")
+    log(f"  PHASE 3: Batch Size Sweep — {model['name']}")
+    log(f"{'='*70}")
+
+    p = prev_best["params"]
+    best_t = p["t"]
+    results = [prev_best]
+
+    for ub, b in [(128, 512), (256, 1024), (256, 2048),
+                  (512, 1024), (512, 2048), (512, 4096),
+                  (1024, 2048), (1024, 4096)]:
+        if ub == p["ub"] and b == p["b"]:
+            continue
+        r = test_config(
+            model, f"ub={ub} b={b}",
+            ngl=p["ngl"], t=best_t, ub=ub, b=b,
+            ctk=p["ctk"], ctv=p["ctv"],
+            cpu_moe=p.get("cpu_moe", False),
+            n_cpu_moe=p.get("n_cpu_moe", 0),
+        )
+        if r:
+            results.append(r)
+
+    best = max(results, key=lambda x: x["avg_tps"])
+    log(f"\n  ★ Phase 3 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
+    return best
+
+
+def phase4_kvcache(model, prev_best):
+    """Sweep KV cache precision."""
+    log(f"\n{'='*70}")
+    log(f"  PHASE 4: KV Cache Type Sweep — {model['name']}")
+    log(f"{'='*70}")
+
+    p = prev_best["params"]
+    results = [prev_best]
+
+    for ctk, ctv in [("q4_0", "q4_0"), ("q8_0", "q8_0"),
+                     ("q4_0", "q8_0"), ("f16", "f16")]:
+        if ctk == p["ctk"] and ctv == p["ctv"]:
+            continue
+        r = test_config(
+            model, f"kv={ctk}/{ctv}",
+            ngl=p["ngl"], t=p["t"], ub=p["ub"], b=p["b"],
+            ctk=ctk, ctv=ctv,
+            cpu_moe=p.get("cpu_moe", False),
+            n_cpu_moe=p.get("n_cpu_moe", 0),
+        )
+        if r:
+            results.append(r)
+
+    best = max(results, key=lambda x: x["avg_tps"])
+    log(f"\n  ★ Phase 4 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
+    return best
+
+
+def phase5_final(model, prev_best):
+    """Final verification with 5 runs."""
+    log(f"\n{'='*70}")
+    log(f"  PHASE 5: Final Verification (5 runs) — {model['name']}")
+    log(f"{'='*70}")
+
+    p = prev_best["params"]
+    kill_server()
+    proc = start_server(model["path"], **p)
+    ok, boot_time = wait_for_server()
+    if not ok:
+        log("  FAILED to start for final verification!")
+        proc.kill()
+        return prev_best
+
+    vram = get_vram_all()
+
+    # Warmup
+    try:
+        run_benchmark(max_tokens=20)
+    except Exception:
+        pass
+
+    speeds = []
+    for i in range(5):
+        try:
+            r = run_benchmark()
+            speeds.append(r["tps"])
+            log(f"    Final Run {i+1}: {r['tps']:.2f} t/s")
+        except Exception as e:
+            log(f"    Final Run {i+1}: ERROR ({e})")
+
+    proc.kill()
+
+    if speeds:
+        avg = sum(speeds) / len(speeds)
+        best_tps = max(speeds)
+        log(f"\n  ★ FINAL: AVG {avg:.2f} t/s | BEST {best_tps:.2f} t/s")
+
+        final = {
+            "model": model["name"],
+            "quant": model["quant"],
+            "label": f"FINAL-{model['name']}",
+            "avg_tps": round(avg, 2),
+            "best_tps": round(best_tps, 2),
+            "boot_time": round(boot_time, 1),
+            "vram": vram,
+            "params": p,
+        }
+        ALL_RESULTS.append(final)
+        return final
+
+    return prev_best
+
+
+# ─── Main ────────────────────────────────────────────────────────
+
+def run_full_benchmark_for_model(model):
+    """Run all phases for a single model."""
+    log(f"\n{'#'*70}")
+    log(f"  MODEL: {model['name']}")
+    log(f"  File:  {model['path']}")
+    log(f"  Size:  {os.path.getsize(model['path'])/1024**3:.2f} GB")
+    log(f"{'#'*70}")
+
+    # Check model exists
+    if not os.path.exists(model["path"]):
+        log(f"  SKIP: Model file not found!")
+        return None
+
+    # Phase 0: Can it boot?
+    baseline = phase0_boot_test(model)
+    if not baseline:
+        log(f"  SKIP: {model['name']} cannot boot at 256K context!")
+        return None
+
+    # Phase 1: GPU offload strategy
+    best = phase1_gpu_offload(model, baseline)
+    if not best:
+        return baseline
+
+    # Phase 2: CPU threads
+    best = phase2_threads(model, best)
+
+    # Phase 3: Batch sizes
+    best = phase3_batch(model, best)
+
+    # Phase 4: KV cache
+    best = phase4_kvcache(model, best)
+
+    # Phase 5: Final verification
+    final = phase5_final(model, best)
+
+    return final
+
+
+def main():
+    start_time = time.time()
+
+    log("=" * 70)
+    log("  DUAL-GPU COMPREHENSIVE MODEL BENCHMARK")
+    log("  2x RTX 3060 (24GB Total) | 256K Context")
+    log(f"  Models: {len(MODELS)}")
+    log(f"  Started: {datetime.datetime.now().isoformat()}")
+    log("=" * 70)
+
+    # Show GPU info
+    gpus = get_vram_all()
+    for g in gpus:
+        log(f"  GPU {g['gpu']}: {g['used']}/{g['total']} MiB used")
+
+    # Run benchmarks for each model
+    model_winners = []
+    for i, model in enumerate(MODELS):
+        log(f"\n{'='*70}")
+        log(f"  STARTING MODEL {i+1}/{len(MODELS)}: {model['name']}")
+        log(f"{'='*70}")
+
+        winner = run_full_benchmark_for_model(model)
+        if winner:
+            model_winners.append(winner)
+
+        # Save intermediate results
+        with open("scripts/dual_gpu_results.json", "w") as f:
+            json.dump(ALL_RESULTS, f, indent=2, default=str)
+        log(f"  Intermediate results saved ({len(ALL_RESULTS)} configs tested)")
+
+    # ─── Grand Final Comparison ──────────────────────────────────
+    elapsed = (time.time() - start_time) / 60
+
+    log(f"\n{'='*70}")
+    log(f"  GRAND FINAL COMPARISON")
+    log(f"  Total time: {elapsed:.1f} minutes")
+    log(f"  Configs tested: {len(ALL_RESULTS)}")
+    log(f"{'='*70}")
+
+    if not model_winners:
+        log("  No models were able to run at 256K context!")
+        return
+
+    # Sort by avg t/s
+    model_winners.sort(key=lambda x: x["avg_tps"], reverse=True)
+
+    summary_lines = []
+    summary_lines.append(f"Dual-GPU Benchmark Results — {datetime.datetime.now().isoformat()}")
+    summary_lines.append(f"Hardware: 2x RTX 3060 12GB | Context: 256K")
+    summary_lines.append(f"Total configs tested: {len(ALL_RESULTS)}")
+    summary_lines.append(f"Total time: {elapsed:.1f} minutes")
+    summary_lines.append("")
+    summary_lines.append("=" * 60)
+    summary_lines.append("  RANKING (by AVG t/s)")
+    summary_lines.append("=" * 60)
+
+    for rank, w in enumerate(model_winners, 1):
+        medal = {1: "🥇", 2: "🥈", 3: "🥉", 4: "  "}.get(rank, "  ")
+        summary_lines.append(f"\n  {medal} #{rank}: {w['model']}")
+        summary_lines.append(f"      AVG: {w['avg_tps']:.2f} t/s | BEST: {w['best_tps']:.2f} t/s")
+        summary_lines.append(f"      Boot: {w['boot_time']:.0f}s")
+        p = w["params"]
+        summary_lines.append(f"      ngl={p['ngl']} t={p['t']} ub={p['ub']} b={p['b']}")
+        summary_lines.append(f"      ctk={p['ctk']} ctv={p['ctv']}")
+        if p.get("cpu_moe"):
+            summary_lines.append(f"      --cpu-moe")
+        elif p.get("n_cpu_moe", 0) > 0:
+            summary_lines.append(f"      --n-cpu-moe {p['n_cpu_moe']}")
+
+    champion = model_winners[0]
+    summary_lines.append(f"\n{'='*60}")
+    summary_lines.append(f"  ★ CHAMPION: {champion['model']}")
+    summary_lines.append(f"    {champion['avg_tps']:.2f} t/s average")
+    summary_lines.append(f"{'='*60}")
+
+    # Build recommended command
+    p = champion["params"]
+    cmd_parts = [
+        f"llama-server --model {MODELS[[m['name'] for m in MODELS].index(champion['model'])]['path']}",
+        f"-ngl {p['ngl']} -c {CONTEXT}",
+        f"-t {p['t']} -tb {p['t']}",
+        f"-ub {p['ub']} -b {p['b']}",
+        "-fa on",
+        f"--cache-type-k {p['ctk']} --cache-type-v {p['ctv']}",
+        f"--prio {p.get('prio', 3)} --poll 50",
+        "--mlock",
+    ]
+    if p.get("cpu_moe"):
+        cmd_parts.append("--cpu-moe")
+    elif p.get("n_cpu_moe", 0) > 0:
+        cmd_parts.append(f"--n-cpu-moe {p['n_cpu_moe']}")
+    if p.get("nommap"):
+        cmd_parts.append("--no-mmap")
+    cmd_parts.append("--port 8000 --host 0.0.0.0")
+
+    summary_lines.append(f"\n  Recommended command:")
+    summary_lines.append(f"    {' '.join(cmd_parts)}")
+
+    summary = "\n".join(summary_lines)
+    print(summary)
+
+    with open("scripts/dual_gpu_summary.txt", "w", encoding="utf-8") as f:
+        f.write(summary)
+
+    with open("scripts/dual_gpu_results.json", "w") as f:
+        json.dump(ALL_RESULTS, f, indent=2, default=str)
+
+    log(f"\n  Results: scripts/dual_gpu_results.json")
+    log(f"  Summary: scripts/dual_gpu_summary.txt")
+    log(f"  DONE!")
+
+    kill_server()
+
+
+# Run the benchmark only when this file is executed as a script, not when
+# it is imported.
+if __name__ == "__main__":
+    main()
--- a/scripts/_archive/tuning/dual_gpu_benchmark_v2.mjs
+++ b/scripts/_archive/tuning/dual_gpu_benchmark_v2.mjs
@@ -0,0 +1,330 @@
+/**
+ * Dual-GPU (2x RTX 3060 12GB, 24GB total) Smart Model Benchmark v2
+ * ================================================================
+ * Informed by VRAM analysis — tests models in optimal order.
+ * 
+ * Key insights applied:
+ *   - Gemma4 fits entirely in 24GB GPU (KV cache ~0.18 GB with SWA)
+ *   - Qwen3.5 is tight (~22.5-22.9 GB needed) — try full GPU first
+ *   - Skip configs known to fail, minimize wasted time
+ *
+ * Run: node scripts/dual_gpu_benchmark_v2.mjs
+ * Results: scripts/dual_gpu_results.json + scripts/dual_gpu_summary.txt
+ */
+
+import { spawn, execSync } from "child_process";
+import { writeFileSync, existsSync, statSync } from "fs";
+
+// Base URL for health checks and benchmark requests; the port matches the
+// "--port 8000" argument passed to the server in startServer().
+const BASE_URL = "http://127.0.0.1:8000";
+// Server executable, relative to the working directory.
+const LLAMA = String.raw`llama_bin_run\llama-server.exe`;
+const CTX = 262144;            // context size passed as "-c" (256K)
+const RUNS = 3;                // measured requests per configuration
+const TOKENS = 200;            // default max_tokens per benchmark request
+const BOOT_TIMEOUT = 300_000;  // ms to wait for the server to become healthy
+
+// Models ordered: smallest first (most likely to succeed fully on GPU)
+// fitsGPU is true or "maybe"; it is logged and checked in tuneModel().
+const MODELS = [
+  {
+    name: "Gemma4-26B MXFP4_MOE",
+    path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
+    quant: "MXFP4_MOE",
+    fitsGPU: true,  // 15.5 + 0.18 + 1 = 16.72 GB << 23 GB
+  },
+  {
+    name: "Gemma4-26B Q4_K_M",
+    path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
+    quant: "Q4_K_M",
+    fitsGPU: true,  // 15.6 + 0.18 + 1 = 16.82 GB << 23 GB
+  },
+  {
+    name: "Qwen3.5-35B MXFP4_MOE",
+    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
+    quant: "MXFP4_MOE",
+    fitsGPU: "maybe",  // 20.1 + 1.41 + 1 = 22.51 GB — tight
+  },
+  {
+    name: "Qwen3.5-35B Q4_K_M",
+    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
+    quant: "Q4_K_M",
+    fitsGPU: "maybe",  // 20.5 + 1.41 + 1 = 22.91 GB — very tight
+  },
+];
+
+// Every result object produced so far, in execution order; written to
+// scripts/dual_gpu_results.json by saveIntermediate() and main().
+const ALL = [];
+// Child process started by startServer(), or null; kill() clears it.
+let currentProc = null;
+
+// ─── Utilities ─────────────────────────────────────────────────
+const log = (m) => console.log(`[${new Date().toLocaleTimeString("ko-KR",{hour12:false})}] ${m}`);
+const sleep = (ms) => new Promise(r => setTimeout(r, ms));
+
+async function kill() {
+  if (currentProc) { try { currentProc.kill("SIGKILL"); } catch {} currentProc = null; }
+  try { execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" }); } catch {}
+  await sleep(5000);
+}
+
+function vram() {
+  try {
+    return execSync('nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits',
+      { encoding: "utf-8", timeout: 5000 }).trim().split("\n").map(l => {
+      const [g, u, t] = l.split(",").map(s => parseInt(s));
+      return { gpu: g, used: u, total: t };
+    });
+  } catch { return []; }
+}
+
+function startServer(modelPath, p) {
+  const args = [
+    "--model", modelPath, "-ngl", String(p.ngl),
+    "-c", String(CTX), "-np", "1", "-fa", "on",
+    "--cache-type-k", p.ctk, "--cache-type-v", p.ctv,
+    "-ub", String(p.ub), "-b", String(p.b),
+    "-t", String(p.t), "-tb", String(p.t),
+    "--prio", String(p.prio || 3), "--poll", "50", "--mlock",
+    "--port", "8000", "--host", "0.0.0.0",
+  ];
+  if (p.cpuMoe) args.push("--cpu-moe");
+  else if ((p.nCpuMoe || 0) > 0) args.push("--n-cpu-moe", String(p.nCpuMoe));
+  if (p.nommap) args.push("--no-mmap");
+
+  currentProc = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });
+  return currentProc;
+}
+
+async function waitReady(timeout = BOOT_TIMEOUT) {
+  const t0 = Date.now();
+  while (Date.now() - t0 < timeout) {
+    try {
+      const r = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
+      const d = await r.json();
+      if (d.status === "ok") return { ok: true, boot: (Date.now() - t0) / 1000 };
+    } catch {}
+    await sleep(3000);
+  }
+  return { ok: false, boot: timeout / 1000 };
+}
+
+async function bench(n = TOKENS) {
+  const t0 = Date.now();
+  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({
+      model: "m",
+      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
+      max_tokens: n, temperature: 0,
+    }),
+    signal: AbortSignal.timeout(600_000),
+  });
+  const d = await r.json();
+  const dt = (Date.now() - t0) / 1000;
+  const ct = d.usage?.completion_tokens || 0;
+  return { tps: ct / dt, ct, dt };
+}
+
+async function testConfig(model, label, params) {
+  await kill();
+  log(`  [${label}] Starting...`);
+  startServer(model.path, params);
+  const { ok, boot } = await waitReady();
+  if (!ok) { log(`  [${label}] ✗ FAILED (timeout)`); await kill(); return null; }
+
+  const v = vram();
+  const vs = v.map(g => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | ");
+  log(`  [${label}] Boot:${boot.toFixed(0)}s | VRAM: ${vs}`);
+
+  try { await bench(20); } catch {} // warmup
+
+  const speeds = [];
+  for (let i = 0; i < RUNS; i++) {
+    try { const r = await bench(); speeds.push(r.tps); log(`    Run${i+1}: ${r.tps.toFixed(2)} t/s`);
+    } catch (e) { log(`    Run${i+1}: ERR ${e.message}`); }
+  }
+  await kill();
+
+  if (!speeds.length) { log(`  [${label}] ✗ ALL RUNS FAILED`); return null; }
+  const avg = speeds.reduce((a,b)=>a+b) / speeds.length;
+  const best = Math.max(...speeds);
+  log(`  [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);
+
+  const res = { model: model.name, quant: model.quant, label,
+    avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2),
+    boot: +boot.toFixed(1), vram: v, params };
+  ALL.push(res);
+  return res;
+}
+
+// Save intermediate results after each test
+function saveIntermediate() {
+  writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL, null, 2));
+}
+
+// ─── Smart Phase Runner ────────────────────────────────────────
+
+/**
+ * Tune one model: find a configuration that boots, then sweep threads,
+ * batch sizes and KV-cache types, and finally re-measure the winner.
+ *
+ * Resolves to:
+ *   - null if the model file is missing or no configuration boots;
+ *   - the "FINAL" result (5 measured runs) when final verification works;
+ *   - otherwise the best result found by the sweeps.
+ *
+ * Results are saved to disk after every tested configuration.
+ */
+async function tuneModel(model) {
+  log(`\n${"#".repeat(65)}`);
+  log(`  ${model.name} (${model.quant})`);
+  if (!existsSync(model.path)) { log("  ✗ File not found, SKIP"); return null; }
+  const sz = (statSync(model.path).size / 1024**3).toFixed(2);
+  log(`  Size: ${sz} GB | Fits GPU: ${model.fitsGPU}`);
+  log(`${"#".repeat(65)}`);
+
+  // ── Step 1: Find working GPU config ──
+  log(`\n  ── Step 1: Find optimal GPU offload ──`);
+  let baseline = null;
+
+  // Every entry currently in MODELS has fitsGPU true or "maybe", so this
+  // branch runs for all of them.
+  if (model.fitsGPU === true || model.fitsGPU === "maybe") {
+    // Try full GPU, no CPU offload
+    baseline = await testConfig(model, "ngl=999 pure-GPU", {
+      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" });
+    saveIntermediate();
+  }
+
+  if (!baseline) {
+    // Try n-cpu-moe values (ascending — find minimum needed)
+    for (const n of [5, 10, 15, 20]) {
+      baseline = await testConfig(model, `n-cpu-moe=${n}`, {
+        ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", nCpuMoe: n });
+      saveIntermediate();
+      if (baseline) break; // found minimum working offload
+    }
+  }
+
+  if (!baseline) {
+    // Last resort: full cpu-moe
+    baseline = await testConfig(model, "cpu-moe", {
+      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe: true });
+    saveIntermediate();
+  }
+
+  if (!baseline) { log(`  ✗ ${model.name} cannot boot at 256K!`); return null; }
+
+  const bp = baseline.params; // carry forward best params
+
+  // If pure GPU worked, also test cpu-moe to compare (it might be faster due to memory)
+  if (!bp.cpuMoe && !bp.nCpuMoe) {
+    const alt = await testConfig(model, "compare: cpu-moe", {
+      ...bp, cpuMoe: true });
+    saveIntermediate();
+    if (alt && alt.avg_tps > baseline.avg_tps) { baseline = alt; }
+  }
+
+  let best = baseline;
+
+  // In each sweep below, a candidate replaces `best` only when its average
+  // is strictly higher. The "skip if equal to best" checks compare against
+  // the current `best`, which can change in the middle of a sweep.
+
+  // ── Step 2: Thread sweep ──
+  // 6 is not in the list: it is the thread count used by every Step 1 config.
+  log(`\n  ── Step 2: Thread sweep ──`);
+  for (const t of [2, 4, 8, 10, 12]) {
+    if (t === best.params.t) continue;
+    const r = await testConfig(model, `t=${t}`, { ...best.params, t });
+    saveIntermediate();
+    if (r && r.avg_tps > best.avg_tps) best = r;
+  }
+
+  // ── Step 3: Batch sweep ──
+  log(`\n  ── Step 3: Batch sweep ──`);
+  for (const [ub, b] of [[256, 1024], [256, 2048], [512, 2048], [512, 4096], [1024, 2048], [1024, 4096]]) {
+    if (ub === best.params.ub && b === best.params.b) continue;
+    const r = await testConfig(model, `ub=${ub} b=${b}`, { ...best.params, ub, b });
+    saveIntermediate();
+    if (r && r.avg_tps > best.avg_tps) best = r;
+  }
+
+  // ── Step 4: KV cache sweep ──
+  // q4_0/q4_0 is not in the list: it is the pair used by every Step 1 config.
+  log(`\n  ── Step 4: KV cache type ──`);
+  for (const [ctk, ctv] of [["q8_0","q8_0"], ["q4_0","q8_0"], ["f16","f16"]]) {
+    if (ctk === best.params.ctk && ctv === best.params.ctv) continue;
+    const r = await testConfig(model, `kv=${ctk}/${ctv}`, { ...best.params, ctk, ctv });
+    saveIntermediate();
+    if (r && r.avg_tps > best.avg_tps) best = r;
+  }
+
+  // ── Step 5: Final verification (5 runs) ──
+  log(`\n  ── Step 5: Final verification ──`);
+  await kill();
+  startServer(model.path, best.params);
+  const { ok, boot } = await waitReady();
+  // If the winner fails to boot again, fall back to its sweep result.
+  if (!ok) { await kill(); return best; }
+  const v = vram();
+  try { await bench(20); } catch {} // warmup; outcome ignored
+
+  const finals = [];
+  for (let i = 0; i < 5; i++) {
+    try { const r = await bench(); finals.push(r.tps); log(`    Final ${i+1}: ${r.tps.toFixed(2)} t/s`);
+    } catch (e) { log(`    Final ${i+1}: ERR`); }
+  }
+  await kill();
+
+  if (finals.length > 0) {
+    const avg = finals.reduce((a,b)=>a+b) / finals.length;
+    const bst = Math.max(...finals);
+    log(`  ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${bst.toFixed(2)} t/s`);
+    const final = { model: model.name, quant: model.quant, label: `FINAL`,
+      avg_tps: +avg.toFixed(2), best_tps: +bst.toFixed(2),
+      boot: +boot.toFixed(1), vram: v, params: best.params };
+    ALL.push(final);
+    saveIntermediate();
+    return final;
+  }
+  // Every final run failed: keep the sweep winner.
+  return best;
+}
+
+// ─── Main ──────────────────────────────────────────────────────
+async function main() {
+  const t0 = Date.now();
+  log("=" .repeat(65));
+  log("  DUAL-GPU BENCHMARK v2 — Smart Strategy");
+  log("  2x RTX 3060 (24GB) | 256K Context");
+  log("  " + new Date().toISOString());
+  log("=".repeat(65));
+  vram().forEach(g => log(`  GPU${g.gpu}: ${g.used}/${g.total} MiB`));
+
+  const winners = [];
+  for (let i = 0; i < MODELS.length; i++) {
+    log(`\n${"=".repeat(65)}`);
+    log(`  MODEL ${i+1}/${MODELS.length}: ${MODELS[i].name}`);
+    log("=".repeat(65));
+    const w = await tuneModel(MODELS[i]);
+    if (w) winners.push(w);
+    saveIntermediate();
+  }
+
+  // ─── Summary ──────────────────────────────────────────────
+  const elapsed = ((Date.now() - t0) / 60000).toFixed(1);
+  winners.sort((a, b) => b.avg_tps - a.avg_tps);
+  const medals = ["🥇", "🥈", "🥉", "  "];
+
+  const lines = [
+    `Dual-GPU Benchmark v2 — ${new Date().toISOString()}`,
+    `2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`,
+    "", "=" .repeat(55), "  RANKING", "=".repeat(55),
+  ];
+  for (let i = 0; i < winners.length; i++) {
+    const w = winners[i], p = w.params;
+    lines.push("", `  ${medals[i]||"  "} #${i+1}: ${w.model}`);
+    lines.push(`      AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`);
+    lines.push(`      ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b} ctk=${p.ctk} ctv=${p.ctv}`);
+    if (p.cpuMoe) lines.push(`      --cpu-moe`);
+    else if (p.nCpuMoe) lines.push(`      --n-cpu-moe ${p.nCpuMoe}`);
+  }
+  if (winners.length > 0) {
+    const c = winners[0], cp = c.params;
+    lines.push("", "=".repeat(55), `  ★ CHAMPION: ${c.model} — ${c.avg_tps} t/s`, "=".repeat(55));
+    const cmd = [`llama-server --model ${MODELS.find(m=>m.name===c.model).path}`,
+      `-ngl ${cp.ngl} -c ${CTX} -t ${cp.t} -tb ${cp.t}`,
+      `-ub ${cp.ub} -b ${cp.b} -fa on`,
+      `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
+      `--prio ${cp.prio||3} --poll 50 --mlock`,
+      cp.cpuMoe ? "--cpu-moe" : cp.nCpuMoe ? `--n-cpu-moe ${cp.nCpuMoe}` : "",
+      "--port 8000 --host 0.0.0.0"].filter(Boolean).join(" ");
+    lines.push("", "  Recommended:", `    ${cmd}`);
+  }
+  const summary = lines.join("\n");
+  console.log("\n" + summary);
+  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
+  writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL, null, 2));
+  log(`\n  Saved: dual_gpu_results.json + dual_gpu_summary.txt`);
+  log("  DONE!");
+  await kill();
+}
+
+main().catch(e => { console.error("FATAL:", e); process.exit(1); });
--- a/scripts/_archive/tuning/find_max_dense.mjs
+++ b/scripts/_archive/tuning/find_max_dense.mjs
@@ -0,0 +1,101 @@
+import { spawn, exec } from 'child_process';
+
+const delay = ms => new Promise(res => setTimeout(res, ms));
+
+async function killServer() {
+    return new Promise(r => exec('taskkill /F /IM llama-server.exe', () => { setTimeout(r, 2000); }));
+}
+
+async function testContextSize(modelPath, contextSize) {
+    console.log(`\nTesting ${modelPath} with -c ${contextSize}...`);
+    await killServer();
+
+    const args = [
+        '--model', `models\\${modelPath}`,
+        '-ngl', '999',
+        '-c', contextSize.toString(),
+        '-fa', 'on',
+        '--cache-type-k', 'q4_0',
+        '--cache-type-v', 'q4_0',
+        '-ub', '512',
+        '-b', '2048',
+        '-t', '6',
+        '-tb', '6',
+        '--split-mode', 'row',
+        '--prio', '3',
+        '--fit', 'off',
+        '--port', '8000',
+        '--host', '0.0.0.0'
+    ];
+
+    const server = spawn('llama_bin_run\\llama-server.exe', args, { stdio: 'pipe' });
+    
+    let booted = false;
+    let oomed = false;
+
+    server.stderr.on('data', (d) => {
+        const text = d.toString();
+        if (text.toLowerCase().includes('out of memory') || text.includes('failed to allocate')) {
+            oomed = true;
+        }
+    });
+
+    for (let i = 0; i < 20; i++) {
+        if (oomed) break;
+        try {
+            const res = await fetch('http://127.0.0.1:8000/health', { timeout: 2000 });
+            if (res.status === 200) {
+                booted = true;
+                break;
+            }
+        } catch(e) {}
+        await delay(2000);
+    }
+
+    if (oomed || !booted) {
+        console.log(`❌ Failed: Out of Memory at -c ${contextSize}`);
+        server.kill('SIGKILL');
+        await killServer();
+        return false;
+    }
+
+    console.log(`✅ Booted! Running Benchmark...`);
+    
+    // Benchmark
+    const bench = await new Promise(r => exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
+        r(stdout || stderr);
+    }));
+    
+    console.log(bench);
+    await killServer();
+    return true;
+}
+
+async function findMaxContext(modelName) {
+    const contexts = [262144, 131072, 65536, 32768, 16384, 8192];
+    
+    let maxFound = false;
+    for (const c of contexts) {
+        const success = await testContextSize(modelName, c);
+        if (success) {
+            maxFound = true;
+            console.log(`\n🎉 MAX STABLE CONTEXT FOR ${modelName}: ${c}`);
+            break;
+        }
+    }
+    
+    if (!maxFound) {
+        console.log(`\n❌ Failed to find any working context size for ${modelName}`);
+    }
+}
+
+async function main() {
+    exec('set CUDA_VISIBLE_DEVICES=');
+    console.log("============= QWEN 27B Q4_K_M =============");
+    await findMaxContext('Qwen3.5-27B-Q4_K_M.gguf');
+    
+    console.log("\n============= GEMMA 4 31B Q4_K_M =============");
+    await findMaxContext('gemma-4-31B-it-Q4_K_M.gguf');
+}
+
+main();
--- a/scripts/_archive/tuning/qwen_fullgpu_challenge.mjs
+++ b/scripts/_archive/tuning/qwen_fullgpu_challenge.mjs
@@ -0,0 +1,345 @@
+/**
+ * Qwen3.5 Full-GPU Challenge — extreme VRAM optimization benchmark
+ * =====================================================
+ * Goal: load Qwen3.5-35B-A3B 100% onto GPU on dual 3060s (24 GB)
+ *
+ * Models under test:
+ *   1. UD-IQ4_NL  (16.6 GB) — fits for sure; the baseline
+ *   2. MXFP4_MOE  (20.1 GB) — the challenge! extreme VRAM optimization
+ *   3. Q4_K_M     (20.5 GB) — control group (n-cpu-moe=5)
+ *
+ * VRAM-saving strategies:
+ *   A. Minimal batch: -ub 64 -b 256 (shrinks the computation buffer)
+ *   B. split-mode row (more even distribution across GPUs)
+ *   C. Manual tensor-split balancing
+ *   D. no-mmap (memory management optimization)
+ *   E. defrag-thold (prevents KV cache fragmentation)
+ *
+ * Run: node scripts/qwen_fullgpu_challenge.mjs
+ */
+
+import { spawn, execSync } from "child_process";
+import { writeFileSync, existsSync, statSync } from "fs";
+
+const BASE_URL = "http://127.0.0.1:8000";
+const LLAMA = String.raw`llama_bin_run\llama-server.exe`;
+const CTX = 262144;
+const RUNS = 3;
+const TOKENS = 200;
+const BOOT_TIMEOUT = 300_000;
+
+const MODELS = [
+  {
+    name: "Qwen3.5 UD-IQ4_NL",
+    path: String.raw`models\Qwen3.5-35B-A3B-UD-IQ4_NL.gguf`,
+    sizeGB: 16.6,
+  },
+  {
+    name: "Qwen3.5 MXFP4_MOE",
+    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
+    sizeGB: 20.11,
+  },
+  {
+    name: "Qwen3.5 Q4_K_M",
+    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
+    sizeGB: 20.5,
+  },
+];
+
+const ALL = [];
+let proc = null;
+const log = (m) => console.log(`[${new Date().toLocaleTimeString("ko-KR",{hour12:false})}] ${m}`);
+const sleep = (ms) => new Promise(r => setTimeout(r, ms));
+
+async function kill() {
+  if (proc) { try { proc.kill("SIGKILL"); } catch {} proc = null; }
+  try { execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" }); } catch {}
+  await sleep(5000);
+}
+
+function vram() {
+  try {
+    return execSync('nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits',
+      { encoding: "utf-8", timeout: 5000 }).trim().split("\n").map(l => {
+      const [g, u, t] = l.split(",").map(s => parseInt(s));
+      return { gpu: g, used: u, total: t };
+    });
+  } catch { return []; }
+}
+
+function startServer(modelPath, p) {
+  const args = [
+    "--model", modelPath, "-ngl", "999",
+    "-c", String(CTX), "-np", "1", "-fa", "on",
+    "--cache-type-k", p.ctk || "q4_0",
+    "--cache-type-v", p.ctv || "q4_0",
+    "-ub", String(p.ub || 512), "-b", String(p.b || 2048),
+    "-t", String(p.t || 4), "-tb", String(p.t || 4),
+    "--prio", "3", "--poll", "50", "--mlock",
+    "--port", "8000", "--host", "0.0.0.0",
+  ];
+
+  // GPU offload strategy
+  if (p.cpuMoe) args.push("--cpu-moe");
+  else if (p.nCpuMoe) args.push("--n-cpu-moe", String(p.nCpuMoe));
+
+  // VRAM saving options
+  if (p.splitMode) args.push("--split-mode", p.splitMode);
+  if (p.tensorSplit) args.push("--tensor-split", p.tensorSplit);
+  if (p.noMmap) args.push("--no-mmap");
+  if (p.defragThold) args.push("--defrag-thold", String(p.defragThold));
+  if (p.noKvOffload) args.push("--no-kv-offload");
+
+  const cmdStr = args.join(" ");
+  log(`  CMD: ...${cmdStr.slice(-80)}`);
+  proc = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });
+  return proc;
+}
+
+async function waitReady(timeout = BOOT_TIMEOUT) {
+  const t0 = Date.now();
+  while (Date.now() - t0 < timeout) {
+    try {
+      const r = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
+      const d = await r.json();
+      if (d.status === "ok") return { ok: true, boot: (Date.now() - t0) / 1000 };
+    } catch {}
+    await sleep(3000);
+  }
+  return { ok: false, boot: timeout / 1000 };
+}
+
+async function bench(n = TOKENS) {
+  const t0 = Date.now();
+  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({
+      model: "m",
+      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
+      max_tokens: n, temperature: 0,
+    }),
+    signal: AbortSignal.timeout(600_000),
+  });
+  const d = await r.json();
+  const dt = (Date.now() - t0) / 1000;
+  const ct = d.usage?.completion_tokens || 0;
+  return { tps: ct / dt, ct, dt };
+}
+
+async function testConfig(model, label, params) {
+  await kill();
+  log(`  [${label}] Starting...`);
+  startServer(model.path, params);
+  const { ok, boot } = await waitReady();
+  if (!ok) {
+    log(`  [${label}] ✗ FAILED (timeout ${BOOT_TIMEOUT/1000}s)`);
+    await kill();
+    return null;
+  }
+
+  const v = vram();
+  const totalUsed = v.reduce((a, g) => a + g.used, 0);
+  const vs = v.map(g => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | ");
+  log(`  [${label}] ✓ Boot:${boot.toFixed(0)}s | VRAM: ${vs} (total: ${totalUsed} MiB)`);
+
+  try { await bench(20); } catch {} // warmup
+
+  const speeds = [];
+  for (let i = 0; i < RUNS; i++) {
+    try {
+      const r = await bench();
+      speeds.push(r.tps);
+      log(`    Run${i+1}: ${r.tps.toFixed(2)} t/s`);
+    } catch (e) {
+      log(`    Run${i+1}: ERR ${e.message}`);
+    }
+  }
+  await kill();
+
+  if (!speeds.length) return null;
+  const avg = speeds.reduce((a,b)=>a+b) / speeds.length;
+  const best = Math.max(...speeds);
+  log(`  [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);
+
+  const res = {
+    model: model.name, label,
+    avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2),
+    boot: +boot.toFixed(1),
+    vram_total: totalUsed, vram: v,
+    params: { ...params, ngl: 999, ctk: params.ctk||"q4_0", ctv: params.ctv||"q4_0" },
+    gpu_only: !params.cpuMoe && !params.nCpuMoe,
+  };
+  ALL.push(res);
+  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
+  return res;
+}
+
+// ─── Test Strategies ───────────────────────────────────────────
+
+/**
+ * Run the full strategy sweep for one model and return its best result.
+ *
+ * Order of work: six pure-GPU strategies, an n-cpu-moe=5 fallback when none of them
+ * produced a result, thread / batch / KV-cache sweeps when a pure-GPU config worked,
+ * and finally a five-run verification of the best parameters.
+ *
+ * @param {{name: string, path: string, sizeGB: number}} model Entry from MODELS.
+ * @returns {Promise<object|null>} The "FINAL" record when verification produced at least
+ *   one measurement, otherwise the best sweep record; null when the model file is
+ *   missing or no configuration produced a result.
+ */
+async function testModel(model) {
+  log(`\n${"#".repeat(65)}`);
+  log(`  ${model.name} (${model.sizeGB} GB)`);
+  if (!existsSync(model.path)) { log("  ✗ File not found!"); return null; }
+  log(`${"#".repeat(65)}`);
+
+  // `best` is the record with the highest avg_tps so far; null results are ignored.
+  let best = null;
+  const update = (r) => { if (r && (!best || r.avg_tps > best.avg_tps)) best = r; };
+
+  // ── Strategy 1: Pure GPU, default settings ──
+  log(`\n  ── Strategy 1: Pure GPU (default) ──`);
+  update(await testConfig(model, "pure-GPU default", {
+    t: 4, ub: 512, b: 2048
+  }));
+
+  // ── Strategy 2: Pure GPU, minimal batch (VRAM saver) ──
+  log(`\n  ── Strategy 2: Pure GPU, minimal batch ──`);
+  update(await testConfig(model, "pure-GPU minbatch", {
+    t: 4, ub: 64, b: 256
+  }));
+
+  // ── Strategy 3: Pure GPU, small batch + no-mmap ──
+  log(`\n  ── Strategy 3: Pure GPU + no-mmap + small batch ──`);
+  update(await testConfig(model, "pure-GPU nommap small", {
+    t: 4, ub: 128, b: 512, noMmap: true
+  }));
+
+  // ── Strategy 4: Pure GPU, split-mode row ──
+  log(`\n  ── Strategy 4: Pure GPU + split-mode row ──`);
+  update(await testConfig(model, "pure-GPU row-split", {
+    t: 4, ub: 128, b: 512, splitMode: "row"
+  }));
+
+  // ── Strategy 5: Pure GPU, tensor-split manual balance ──
+  log(`\n  ── Strategy 5: Pure GPU + tensor-split 0.5,0.5 ──`);
+  update(await testConfig(model, "pure-GPU ts=0.5,0.5", {
+    t: 4, ub: 128, b: 512, tensorSplit: "0.5,0.5"
+  }));
+
+  // ── Strategy 6: Pure GPU, defrag + all tricks ──
+  log(`\n  ── Strategy 6: Pure GPU ALL tricks ──`);
+  update(await testConfig(model, "pure-GPU all-tricks", {
+    t: 4, ub: 64, b: 256, noMmap: true, defragThold: 0.1
+  }));
+
+  // ── Fallback: n-cpu-moe=5 baseline ──
+  // Every strategy above is pure GPU, so at this point the condition is only true
+  // when none of them produced a result.
+  if (!best || !best.gpu_only) {
+    log(`\n  ── Fallback: n-cpu-moe=5 ──`);
+    update(await testConfig(model, "n-cpu-moe=5 baseline", {
+      t: 4, ub: 256, b: 1024, nCpuMoe: 5
+    }));
+  }
+
+  // ── If pure GPU worked, tune batch/thread/kv ──
+  if (best && best.gpu_only) {
+    log(`\n  ── Pure GPU succeeded! Fine-tuning... ──`);
+    // Snapshot of the winning params: every sweep below varies one dimension of this
+    // snapshot, even if `best` is replaced by an earlier sweep result.
+    const bp = best.params;
+
+    // Thread sweep
+    for (const t of [2, 6, 8]) {
+      if (t === bp.t) continue;
+      update(await testConfig(model, `tune t=${t}`, { ...bp, t }));
+    }
+
+    // Batch sweep
+    for (const [ub, b] of [[256, 1024], [512, 2048], [256, 2048]]) {
+      if (ub === bp.ub && b === bp.b) continue;
+      update(await testConfig(model, `tune ub=${ub} b=${b}`, { ...bp, ub, b }));
+    }
+
+    // KV cache upgrade (extra VRAM available?)
+    for (const [ctk, ctv] of [["q8_0","q8_0"], ["f16","f16"]]) {
+      update(await testConfig(model, `tune kv=${ctk}/${ctv}`, { ...bp, ctk, ctv }));
+    }
+  }
+
+  // ── Final verification ──
+  if (best) {
+    log(`\n  ── Final verification (5 runs) ──`);
+    await kill();
+    startServer(model.path, best.params);
+    const { ok, boot } = await waitReady();
+    if (ok) {
+      const v = vram();
+      try { await bench(20); } catch {}
+      const finals = [];
+      for (let i = 0; i < 5; i++) {
+        try { const r = await bench(); finals.push(r.tps); log(`    Final ${i+1}: ${r.tps.toFixed(2)} t/s`);
+        } catch (e) { log(`    Final ${i+1}: ERR`); }
+      }
+      await kill();
+      if (finals.length > 0) {
+        const avg = finals.reduce((a,b)=>a+b) / finals.length;
+        const bst = Math.max(...finals);
+        log(`  ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${bst.toFixed(2)} t/s`);
+        // The verification record replaces the sweep record as this model's result,
+        // whether its average is higher or lower.
+        const final = { model: model.name, label: "FINAL",
+          avg_tps: +avg.toFixed(2), best_tps: +bst.toFixed(2),
+          boot: +boot.toFixed(1), vram_total: v.reduce((a,g)=>a+g.used,0),
+          vram: v, params: best.params, gpu_only: best.gpu_only };
+        ALL.push(final);
+        writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
+        return final;
+      }
+    }
+    await kill();
+  }
+  return best;
+}
+
+// ─── Main ──────────────────────────────────────────────────────
+
+async function main() {
+  const t0 = Date.now();
+  log("=".repeat(65));
+  log("  QWEN3.5 FULL-GPU CHALLENGE — 70 t/s TARGET");
+  log("  2x RTX 3060 (24GB) | 256K Context");
+  log("  " + new Date().toISOString());
+  log("=".repeat(65));
+  vram().forEach(g => log(`  GPU${g.gpu}: ${g.used}/${g.total} MiB`));
+
+  const winners = [];
+  for (const model of MODELS) {
+    const w = await testModel(model);
+    if (w) winners.push(w);
+  }
+
+  // ─── Summary ──────────────────────────────────────────────
+  const elapsed = ((Date.now() - t0) / 60000).toFixed(1);
+  winners.sort((a, b) => b.avg_tps - a.avg_tps);
+
+  const lines = [
+    `Qwen3.5 Full-GPU Challenge — ${new Date().toISOString()}`,
+    `2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`,
+    "", "=".repeat(55), "  RANKING", "=".repeat(55),
+  ];
+  for (let i = 0; i < winners.length; i++) {
+    const w = winners[i], p = w.params;
+    const gpu = w.gpu_only ? "★ FULL GPU" : "⚠ CPU offload";
+    lines.push("", `  #${i+1}: ${w.model} [${gpu}]`);
+    lines.push(`      AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`);
+    lines.push(`      VRAM: ${w.vram_total} MiB total`);
+    const flags = [];
+    if (p.splitMode) flags.push(`split=${p.splitMode}`);
+    if (p.tensorSplit) flags.push(`ts=${p.tensorSplit}`);
+    if (p.noMmap) flags.push("no-mmap");
+    if (p.nCpuMoe) flags.push(`n-cpu-moe=${p.nCpuMoe}`);
+    lines.push(`      t=${p.t} ub=${p.ub} b=${p.b} kv=${p.ctk}/${p.ctv} ${flags.join(" ")}`);
+  }
+
+  if (winners.length > 0) {
+    const c = winners[0];
+    lines.push("", "=".repeat(55));
+    lines.push(`  ★ CHAMPION: ${c.model} — ${c.avg_tps} t/s [${c.gpu_only?"FULL GPU":"CPU offload"}]`);
+    lines.push("=".repeat(55));
+  }
+
+  const summary = lines.join("\n");
+  console.log("\n" + summary);
+  writeFileSync("scripts/qwen_fullgpu_summary.txt", summary, "utf-8");
+  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
+  log(`\n  Saved: qwen_fullgpu_results.json + qwen_fullgpu_summary.txt`);
+  log("  DONE!");
+  await kill();
+}
+
+main().catch(e => { console.error("FATAL:", e); process.exit(1); });
--- a/scripts/_archive/tuning/tune_122b.py
+++ b/scripts/_archive/tuning/tune_122b.py
@@ -0,0 +1,129 @@
+import subprocess, time, urllib.request, json, sys
+try: sys.stdout.reconfigure(encoding='utf-8')
+except: pass
+
+MODEL = "C:/Users/Variet-Worker/Desktop/variet-llm/models/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
+BASE = "http://127.0.0.1:8000"
+
+# BEST SO FAR: GPU1 only + Expert CPU + 8t = 8.75 t/s (6.5GB / 12GB used)
+# 5.5GB VRAM remaining on GPU 1. Let's use it!
+# Strategy: keep some experts on GPU 1 using -ncmoe (n-cpu-moe)
+# n-cpu-moe = number of layers whose experts stay on CPU
+# Lower = more experts on GPU = more VRAM used = potentially faster
+
+BASE_CMD = [
+    r"llama_bin_run\llama-server.exe",
+    "--model", MODEL,
+    "-ngl", "999",
+    "-sm", "none", "--main-gpu", "1",
+    "-c", "4096", "-np", "1", "-fa", "on",
+    "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
+    "-ub", "512", "-b", "2048",
+    "-t", "8", "-tb", "8",
+    "--prio", "3", "--poll", "50",
+    "--no-mmap",
+    "--port", "8000", "--host", "0.0.0.0"
+]
+
+CONFIGS = [
+    # Baseline: all experts CPU (confirmed 8.75 t/s)
+    {"name": "Baseline: all expert CPU", "extra": ["-ot", ".*ffn_.*_exps.*=CPU"]},
+    # Try n-cpu-moe with GPU1 only: keep some experts on GPU
+    {"name": "n-cpu-moe=60 (4 layers expert GPU)", "extra": ["-ncmoe", "60"]},
+    {"name": "n-cpu-moe=56 (8 layers expert GPU)", "extra": ["-ncmoe", "56"]},
+    {"name": "n-cpu-moe=52 (12 layers expert GPU)", "extra": ["-ncmoe", "52"]},
+    {"name": "n-cpu-moe=48 (16 layers expert GPU)", "extra": ["-ncmoe", "48"]},
+]
+
+def kill():
+    subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    time.sleep(4)
+
+def check_server(timeout=900):
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            req = urllib.request.Request(f"{BASE}/health")
+            resp = json.loads(urllib.request.urlopen(req, timeout=2).read())
+            if resp.get("status") in ("ok", "ready"):
+                return True
+        except: pass
+        time.sleep(5)
+    return False
+
+def bench(runs=3):
+    speeds = []
+    for i in range(runs):
+        payload = json.dumps({
+            "model": "m",
+            "messages": [
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "Write a Python fibonacci function with memoization."}
+            ],
+            "max_tokens": 200,
+            "temperature": 0.0
+        }).encode('utf-8')
+        req = urllib.request.Request(f"{BASE}/v1/chat/completions", data=payload, headers={"Content-Type": "application/json"})
+        t0 = time.time()
+        resp = json.loads(urllib.request.urlopen(req, timeout=600).read())
+        dt = time.time() - t0
+        tokens = resp.get("usage", {}).get("completion_tokens", 0)
+        speed = tokens / dt if dt > 0 else 0
+        speeds.append(speed)
+        print(f"    Run {i+1}: {speed:.2f} t/s ({tokens} tok / {dt:.1f}s)")
+    return sum(speeds)/len(speeds), max(speeds)
+
+def vram():
+    try:
+        out = subprocess.check_output("nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits", shell=True).decode().strip()
+        return [int(x.strip()) for x in out.split('\n')]
+    except: return [0, 0]
+
+results = []
+for cfg in CONFIGS:
+    kill()
+    print(f"\n{'='*60}")
+    print(f"Testing: {cfg['name']}")
+    print(f"{'='*60}")
+    
+    cmd = BASE_CMD + cfg["extra"]
+    proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    
+    if not check_server(900):
+        print(f"  FAILED TO BOOT")
+        results.append({"name": cfg["name"], "status": "BOOT_FAIL"})
+        proc.terminate(); kill(); continue
+    
+    print("  Server ready! Warming up...")
+    try:
+        p = json.dumps({"model":"m","messages":[{"role":"system","content":"Hi"},{"role":"user","content":"Hi"}],"max_tokens":5}).encode()
+        urllib.request.urlopen(urllib.request.Request(f"{BASE}/v1/chat/completions",data=p,headers={"Content-Type":"application/json"}), timeout=120)
+    except: pass
+    
+    v = vram()
+    print(f"  VRAM: GPU0={v[0]}MB, GPU1={v[1]}MB, Total={sum(v)}MB")
+    
+    avg, best = bench(runs=3)
+    print(f"  >>> AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
+    
+    results.append({
+        "name": cfg["name"], "avg_tps": round(avg,2), "best_tps": round(best,2),
+        "vram_gpu0": v[0], "vram_gpu1": v[1], "vram_total": sum(v), "status": "OK"
+    })
+    proc.terminate()
+
+kill()
+print(f"\n\n{'='*60}")
+print("FINAL RESULTS - GPU1 Expert Balance (Target: 10+ t/s)")
+print(f"{'='*60}")
+print(f"{'Config':<48} {'AVG':>6} {'BEST':>6} {'GPU1':>7}")
+print("-" * 72)
+for r in results:
+    if r["status"] == "OK":
+        print(f"  {r['name']:<46} {r['avg_tps']:>5} {r['best_tps']:>5} {r['vram_gpu1']:>5}MB")
+    else:
+        print(f"  {r['name']:<46} {'FAIL':>5}")
+
+with open("scripts/122b_final_results.json", "w", encoding="utf-8") as f:
+    json.dump(results, f, ensure_ascii=False, indent=2)
+print("\nSaved to scripts/122b_final_results.json")
--- a/scripts/_archive/tuning/tune_122b_20ts.mjs
+++ b/scripts/_archive/tuning/tune_122b_20ts.mjs
@@ -0,0 +1,64 @@
+import { exec, spawn } from 'child_process';
+
+const delay = ms => new Promise(res => setTimeout(res, ms));
+
+async function runTest(modelArgs, envVars, name) {
+    console.log(`\n===========================================`);
+    console.log(`Testing: ${name}`);
+    console.log(`Args: ${modelArgs}`);
+    
+    return new Promise(async (resolve) => {
+        await new Promise(r => exec('taskkill /F /IM llama-server.exe', r));
+        await delay(2000);
+
+        const env = { ...process.env, ...envVars };
+        const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
+            detached: true,
+            stdio: 'ignore',
+            env
+        });
+
+        let ready = false;
+        for (let i = 0; i < 40; i++) {
+            try {
+                const res = await fetch('http://127.0.0.1:8000/health', { timeout: 2000 });
+                if (res.status === 200) {
+                    ready = true;
+                    break;
+                }
+            } catch (e) {}
+            await delay(3000);
+        }
+
+        if (!ready) {
+            console.log(`[${name}] FAILED TO BOOT`);
+            exec('taskkill /F /IM llama-server.exe');
+            resolve({ success: false });
+            return;
+        }
+
+        console.log(`[${name}] Server Ready! Running benchmark...`);
+        exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
+            console.log(stdout || stderr);
+            exec('taskkill /F /IM llama-server.exe');
+            resolve({ success: true });
+        });
+    });
+}
+
+async function main() {
+    const baseLineArgs = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 8192 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 256 -b 1024 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
+
+    // 1. Dual GPU, Split Mode Layer, Maximize VRAM usage (estimate 32 layers offloaded out of 48)
+    await runTest(`${baseLineArgs} --split-mode layer --n-cpu-moe 32`, {}, "122B: n-cpu-moe 32 + sm layer");
+
+    // 2. Dual GPU, Split Mode Layer, even more aggressive GPU usage
+    await runTest(`${baseLineArgs} --split-mode layer --n-cpu-moe 28`, {}, "122B: n-cpu-moe 28 + sm layer");
+    
+    // 3. Fallback to 36 if OOM happens on 32/28
+    await runTest(`${baseLineArgs} --split-mode layer --n-cpu-moe 36`, {}, "122B: n-cpu-moe 36 + sm layer");
+
+    console.log("\nALL TESTS COMPLETED");
+}
+
+main();
--- a/scripts/_archive/tuning/tune_exact.mjs
+++ b/scripts/_archive/tuning/tune_exact.mjs
@@ -0,0 +1,72 @@
+import { exec, spawn } from 'child_process';
+
+const delay = ms => new Promise(res => setTimeout(res, ms));
+
+async function runTest(modelArgs, envVars, name) {
+    console.log(`\n===========================================`);
+    console.log(`Testing: ${name}`);
+    console.log(`Env: ${JSON.stringify(envVars)}`);
+    console.log(`Args: ${modelArgs}`);
+    
+    return new Promise(async (resolve) => {
+        await new Promise(r => exec('taskkill /F /IM llama-server.exe', r));
+        await delay(2000);
+
+        const env = { ...process.env, ...envVars };
+        const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
+            detached: true,
+            stdio: 'ignore',
+            env
+        });
+
+        let ready = false;
+        
+        for (let i = 0; i < 40; i++) {
+            try {
+                const res = await fetch('http://127.0.0.1:8000/health', { timeout: 2000 });
+                if (res.status === 200) {
+                    ready = true;
+                    break;
+                }
+            } catch (e) {}
+            await delay(3000);
+        }
+
+        if (!ready) {
+            console.log(`[${name}] FAILED TO BOOT`);
+            exec('taskkill /F /IM llama-server.exe');
+            resolve({ success: false });
+            return;
+        }
+
+        console.log(`[${name}] Server Ready! Running speed test...`);
+        exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
+            console.log(stdout || stderr);
+            exec('taskkill /F /IM llama-server.exe');
+            resolve({ success: true });
+        });
+    });
+}
+
+async function main() {
+    // 1. 122B-A10B: Pure GPU offload (No n-cpu-moe at all)
+    // -ngl 999 will offload all 48 layers to the NVIDIA driver (triggering Shared VRAM fallback on Windows)
+    const args122B = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
+    await runTest(args122B, {}, "122B-A10B: Pure GPU (NVIDIA Shared Memory Fallback)");
+
+    // 2. 35B-A3B: Pure GPU tuning to hit 70 t/s
+    // Base configuration from previous full-gpu run:
+    const args35B = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 128 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
+    
+    // We already got ~64 t/s basically. 
+    // Let's try MMQ for custom matrix multiplication which is often faster on Ampere for low batch generation
+    await runTest(args35B, { GGML_CUDA_FORCE_MMQ: "1" }, "35B-A3B: Force MMQ = 1");
+    
+    // Try increasing threads to 12 just in case
+    const args35B_t12 = args35B.replace("-t 6 -tb 6", "-t 12 -tb 12");
+    await runTest(args35B_t12, { GGML_CUDA_FORCE_MMQ: "1" }, "35B-A3B: Threads 12 + MMQ");
+
+    console.log("\nALL TESTS COMPLETED");
+}
+
+main();
--- a/scripts/_archive/tuning/tune_models.mjs
+++ b/scripts/_archive/tuning/tune_models.mjs
@@ -0,0 +1,84 @@
+import { exec, spawn } from 'child_process';
+
+const delay = ms => new Promise(res => setTimeout(res, ms));
+
+async function runTest(modelArgs, name) {
+    console.log(`\n===========================================`);
+    console.log(`Testing: ${name}`);
+    console.log(`Args: ${modelArgs}`);
+    
+    return new Promise(async (resolve) => {
+        // Kill existing
+        await new Promise(r => exec('taskkill /F /IM llama-server.exe', r));
+        await delay(2000);
+
+        const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
+            detached: true,
+            stdio: 'ignore' 
+        });
+
+        let ready = false;
+        let oom = false;
+        
+        for (let i = 0; i < 40; i++) {
+            try {
+                const res = await fetch('http://127.0.0.1:8000/health', { timeout: 2000 });
+                if (res.status === 200) {
+                    ready = true;
+                    break;
+                }
+            } catch (e) {}
+            await delay(3000);
+        }
+
+        if (!ready) {
+            console.log(`[${name}] FAILED TO BOOT (Likely OOM)`);
+            exec('taskkill /F /IM llama-server.exe');
+            resolve({ success: false });
+            return;
+        }
+
+        console.log(`[${name}] Server Ready! Running benchmark...`);
+        // Run pptest
+        exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
+            console.log(stdout || stderr);
+            
+            // Extract TG and PP from TG-500
+            const tgMatch = stdout.match(/TG-500 \| PP:\d+tok \d+\.\dt\/s \| TG:\d+tok (\d+\.\d+)t\/s/);
+            const ppMatch = stdout.match(/10K-CODE \| PP:\d+tok (\d+\.\d+)t\/s/);
+            
+            const tg = tgMatch ? parseFloat(tgMatch[1]) : 0;
+            const pp = ppMatch ? parseFloat(ppMatch[1]) : 0;
+            
+            exec('taskkill /F /IM llama-server.exe');
+            resolve({ success: true, tg, pp });
+        });
+    });
+}
+
+async function main() {
+    // 1. Qwen 35B Tuning: We need 70 t/s. Let's try 1-3 layers of n-cpu-moe to unlock ub=512
+    const args35B_base = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
+    
+    // Test 1: n-cpu-moe 1, ub 512
+    await runTest(`${args35B_base} -ub 512 --n-cpu-moe 1`, "Qwen-35B: moe=1, ub=512");
+    
+    // Test 2: n-cpu-moe 2, ub 512
+    await runTest(`${args35B_base} -ub 512 --n-cpu-moe 2`, "Qwen-35B: moe=2, ub=512");
+
+    // Test 3: n-cpu-moe 4, ub 512
+    await runTest(`${args35B_base} -ub 512 --n-cpu-moe 4`, "Qwen-35B: moe=4, ub=512");
+    
+    // 2. 122B Tuning: Find optimal n-cpu-moe
+    const args122B_base = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
+    
+    // Since 48 leaves 16GB free, each layer is ~1.5GB total, meaning ~0.75GB per GPU.
+    // Let's try 38, 35, 30
+    await runTest(`${args122B_base} --n-cpu-moe 38`, "Qwen-122B: moe=38");
+    await runTest(`${args122B_base} --n-cpu-moe 30`, "Qwen-122B: moe=30");
+    await runTest(`${args122B_base} --n-cpu-moe 22`, "Qwen-122B: moe=22");
+
+    console.log("Tuning finished.");
+}
+
+main();
--- a/scripts/_archive/tuning/tune_n_cpu_moe.py
+++ b/scripts/_archive/tuning/tune_n_cpu_moe.py
@@ -0,0 +1,107 @@
+import subprocess, time, urllib.request, json, sys
+# Switch stdout to UTF-8. This is best-effort: stdout may be missing or replaced
+# by an object that cannot be reconfigured, in which case the default is kept.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except (AttributeError, ValueError, OSError):
+    pass
+
+# Path to the first GGUF shard of the model under test.
+MODEL = "C:/Users/Variet-Worker/Desktop/variet-llm/models/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
+# Base URL used for the health check and the benchmark requests.
+BASE = "http://127.0.0.1:8000"
+
+# Goal: see whether decreasing n-cpu-moe increases VRAM usage on GPU 1 and
+# improves speed, now that GPU 1 is known to be isolated with ~3.5GB free at
+# 256K context.
+
+# Server command line shared by every run; each CONFIGS entry appends its extras.
+BASE_CMD = [
+    r"llama_bin_run\llama-server.exe",
+    "--model", MODEL,
+    "-ngl", "999",
+    "-sm", "none",
+    "--main-gpu", "1",
+    # Use a small context for fast boot/testing.
+    "-c", "4096",
+    "-np", "1",
+    "-fa", "on",
+    "--cache-type-k", "q4_0",
+    "--cache-type-v", "q4_0",
+    "-ub", "512",
+    "-b", "2048",
+    "-t", "8",
+    "-tb", "8",
+    "--prio", "3",
+    "--poll", "50",
+    "--no-mmap",
+    "--port", "8000",
+    "--host", "0.0.0.0",
+]
+
+# One entry per -ncmoe value to test; 48 is the baseline.
+CONFIGS = [
+    {
+        "name": f"n-cpu-moe={layers}" + (" (baseline)" if layers == 48 else ""),
+        "extra": ["-ncmoe", str(layers)],
+    }
+    for layers in (48, 40, 32, 24)
+]
+
+def kill():
+    """Force-kill every llama-server.exe process, then wait four seconds.
+
+    The taskkill output is discarded, so it makes no difference whether a
+    server was actually running. The pause presumably gives the OS time to
+    release the port and GPU memory before the next launch (TODO confirm).
+    """
+    taskkill_cmd = "taskkill /F /IM llama-server.exe"
+    subprocess.run(
+        taskkill_cmd,
+        shell=True,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+    time.sleep(4)
+
+def check_server(timeout=900):
+    """Poll the server's /health endpoint until it reports ready.
+
+    Args:
+        timeout: Maximum number of seconds to keep polling.
+
+    Returns:
+        True as soon as /health returns JSON whose "status" is "ok" or
+        "ready"; False if that has not happened within ``timeout`` seconds.
+    """
+    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        try:
+            req = urllib.request.Request(f"{BASE}/health")
+            with urllib.request.urlopen(req, timeout=2) as http_resp:
+                resp = json.loads(http_resp.read())
+            if resp.get("status") in ("ok", "ready"):
+                return True
+        except Exception:
+            # Any failure (connection refused, HTTP error, bad JSON) just means
+            # "not ready yet". Unlike a bare except, this still lets
+            # KeyboardInterrupt/SystemExit through so Ctrl+C stops the wait.
+            pass
+        time.sleep(5)
+    return False
+
+def bench(runs=2):
+    """Measure generation speed with ``runs`` identical chat-completion requests.
+
+    Each speed is the reported ``usage.completion_tokens`` divided by the
+    wall-clock time of the whole HTTP request, so it includes everything the
+    server does before the response is complete, not only token generation.
+
+    Args:
+        runs: Number of requests to send.
+
+    Returns:
+        A tuple ``(average, best)`` of tokens per second over all runs, or
+        ``(0.0, 0.0)`` when ``runs`` is less than 1.
+
+    Raises:
+        Errors from ``urllib.request.urlopen`` or ``json.loads`` propagate to
+        the caller.
+    """
+    if runs < 1:
+        # Nothing to measure; avoid dividing by zero / max() of an empty list.
+        return 0.0, 0.0
+
+    # The request body is identical for every run, so build it once.
+    payload = json.dumps({
+        "model": "m",
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Write a short Python script."}
+        ],
+        "max_tokens": 100,
+        "temperature": 0.0
+    }).encode('utf-8')
+
+    speeds = []
+    for _ in range(runs):
+        req = urllib.request.Request(
+            f"{BASE}/v1/chat/completions",
+            data=payload,
+            headers={"Content-Type": "application/json"},
+        )
+        t0 = time.perf_counter()
+        with urllib.request.urlopen(req, timeout=600) as http_resp:
+            resp = json.loads(http_resp.read())
+            dt = time.perf_counter() - t0
+        tokens = resp.get("usage", {}).get("completion_tokens", 0)
+        speeds.append(tokens / dt if dt > 0 else 0)
+    return sum(speeds) / len(speeds), max(speeds)
+
+def vram():
+    """Return the used-memory figure nvidia-smi reports for each GPU.
+
+    Returns:
+        A list of ints, one per line of nvidia-smi output, in the order
+        nvidia-smi prints them. The list always has at least two entries
+        (missing GPUs are reported as 0) because the caller reads index 1.
+        ``[0, 0]`` is returned when nvidia-smi fails or its output cannot be
+        parsed.
+    """
+    try:
+        out = subprocess.check_output(
+            "nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits",
+            shell=True,
+        ).decode().strip()
+        used = [int(line.strip()) for line in out.splitlines()]
+    except (subprocess.SubprocessError, OSError, ValueError):
+        # Command missing or failed, or a value was not an integer.
+        return [0, 0]
+    # Pad so that indexing GPU 1 is always safe, matching the fallback shape.
+    if len(used) < 2:
+        used += [0] * (2 - len(used))
+    return used
+
+# Benchmark driver: for every configuration, restart the server with that
+# configuration's extra flags, measure VRAM and speed, and record the outcome.
+results = []
+for cfg in CONFIGS:
+    kill()  # make sure no earlier server instance is still running
+    print(f"\n{'='*60}\nTesting: {cfg['name']}\n{'='*60}")
+
+    cmd = BASE_CMD + cfg["extra"]
+    proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    try:
+        if not check_server(300):
+            print("  FAILED TO BOOT (OOM?)")
+            results.append({"name": cfg["name"], "status": "BOOT_FAIL"})
+            continue
+
+        # No separate warm-up request is sent; the first bench run absorbs any
+        # first-request cost.
+        print("  Server ready! Warming up...")
+        time.sleep(2)
+        v = vram()
+        # Guard the GPU 1 lookup in case fewer than two GPUs were reported.
+        gpu1_mb = v[1] if len(v) > 1 else 0
+
+        try:
+            avg, _best = bench(runs=2)
+        except Exception as exc:
+            # Keep the sweep going and preserve earlier results if one
+            # configuration fails mid-benchmark.
+            print(f"  BENCHMARK FAILED: {exc}")
+            results.append({"name": cfg["name"], "status": "BENCH_FAIL"})
+            continue
+
+        print(f"  >>> AVG: {avg:.2f} t/s | VRAM GPU1: {gpu1_mb}MB")
+
+        results.append({
+            "name": cfg["name"], "avg_tps": round(avg, 2),
+            "vram_gpu1": gpu1_mb, "status": "OK"
+        })
+    finally:
+        # Always stop and reap the server, including on failure paths.
+        proc.terminate()
+        try:
+            proc.wait(timeout=10)
+        except subprocess.TimeoutExpired:
+            proc.kill()
+
+kill()
+print("\nFINAL RESULTS:")
+for r in results:
+    if r["status"] == "OK":
+        print(f"  {r['name']:<25} {r['avg_tps']:>5} t/s | {r['vram_gpu1']:>5}MB")
+    elif r["status"] == "BOOT_FAIL":
+        print(f"  {r['name']:<25} FAIL (OOM)")
+    else:
+        print(f"  {r['name']:<25} FAIL (benchmark error)")