feat: Variet Engine v1.0 + 5-model tuning complete

Phase 01 (LLM Tuning): - Gemma4 26B: 74.65 t/s (fast) - Qwen 35B: 61.62 t/s (balanced) - Gemma4 31B: 16.0 t/s (deep-coder) - Qwen 27B: 16.7 t/s (deep-logic) - Qwen 122B: 8.95 t/s (ultra, GPU 1 only) Phase 02 (API Engine): - FastAPI reverse proxy on port 8000 - /engine/switch hot-swap with 503 protection - config/engine_models.json as single source of truth - Replaced 4 individual .bat files with unified engine File cleanup: - scripts/ 85 files -> 9 + _archive/ - Root .bat files -> _archive/
2026-04-07 18:08:58 +09:00
parent 7c7a899fd5
commit c111b3a9b0
414 changed files with 3402 additions and 68598 deletions
--- a/scripts/_archive/benchmarks/bench_122b.py
+++ b/scripts/_archive/benchmarks/bench_122b.py
@@ -0,0 +1,38 @@
+import urllib.request, json, time, sys
+try: sys.stdout.reconfigure(encoding='utf-8')
+except: pass
+
+BASE = "http://127.0.0.1:8000"
+prompt = "Write a Python function to calculate fibonacci numbers efficiently using memoization. Include type hints and docstring."
+
+payload = json.dumps({
+    "model": "m",
+    "messages": [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt}
+    ],
+    "max_tokens": 500,
+    "temperature": 0.0
+}).encode('utf-8')
+
+req = urllib.request.Request(
+    f"{BASE}/v1/chat/completions",
+    data=payload,
+    headers={"Content-Type": "application/json"}
+)
+
+print("Sending request...")
+t0 = time.time()
+resp = json.loads(urllib.request.urlopen(req, timeout=300).read())
+dt = time.time() - t0
+
+u = resp.get("usage", {})
+tokens = u.get("completion_tokens", 0)
+speed = tokens / dt if dt > 0 else 0
+
+print(f"\n=== 122B Benchmark ===")
+print(f"Time: {dt:.1f}s")
+print(f"Completion Tokens: {tokens}")
+print(f"Speed: {speed:.2f} t/s")
+print(f"\n--- Response Preview ---")
+print(resp["choices"][0]["message"]["content"][:300])
--- a/scripts/_archive/benchmarks/deep_tier_auto_test.py
+++ b/scripts/_archive/benchmarks/deep_tier_auto_test.py
@@ -0,0 +1,177 @@
+import subprocess
+import time
+import urllib.request
+import json
+import sys
+import os
+
+# Best-effort switch of stdout to UTF-8; any failure is ignored.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except:
+    pass
+
+# Root URL for the health check and chat-completion requests; its port matches
+# the "--port" value in every MODELS command line below.
+BASE_URL = "http://127.0.0.1:8000"
+# JSON report written by main(). The path is relative, so it resolves against
+# the current working directory.
+RESULTS_FILE = "scripts/deep_tier_auto_results.json"
+
+# Server configurations benchmarked in list order. "name" is the label used in
+# log lines and in the results; "cmd" is the argv list handed to
+# subprocess.Popen (relative Windows paths, so the working directory matters).
+MODELS = [
+    {
+        "name": "Qwen 27B - 256K (q4_0)",
+        "cmd": [
+            r"llama_bin_run\llama-server.exe",
+            "--model", r"models\Qwen3.5-27B-Q4_K_M.gguf",
+            # 262144 = 256 * 1024, the "256K" in the name.
+            "-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
+            "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
+            "-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
+            "--prio", "3", "--mlock", "--poll", "50", "-ts", "0.5,0.5",
+            "--port", "8000", "--host", "0.0.0.0"
+        ]
+    },
+    {
+        "name": "Gemma 31B - 32K (q4_0)",
+        "cmd": [
+            r"llama_bin_run\llama-server.exe",
+            "--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
+            # 32768 = 32 * 1024, the "32K" in the name.
+            "-ngl", "999", "-c", "32768", "-np", "1", "-fa", "on",
+            "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
+            "-ub", "256", "-b", "1024", "-t", "6", "-tb", "6",
+            "--prio", "3", "--mlock", "--poll", "50",
+            "--port", "8000", "--host", "0.0.0.0"
+        ]
+    },
+    {
+        "name": "Gemma 31B - 64K (q4_0)",
+        "cmd": [
+            r"llama_bin_run\llama-server.exe",
+            "--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
+            # 65536 = 64 * 1024, the "64K" in the name.
+            "-ngl", "999", "-c", "65536", "-np", "1", "-fa", "on",
+            "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
+            "-ub", "256", "-b", "1024", "-t", "6", "-tb", "6",
+            "--prio", "3", "--mlock", "--poll", "50",
+            "--port", "8000", "--host", "0.0.0.0"
+        ]
+    }
+]
+
+# Prompts sent to every model that boots; "id" becomes the key of that
+# prompt's entry in the per-model results.
+TEST_PROMPTS = [
+    {
+        "id": "code",
+        "prompt": "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code."
+    },
+    {
+        "id": "logical",
+        "prompt": "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step."
+    }
+]
+
+def check_server(timeout=300):
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            req = urllib.request.Request(f"{BASE_URL}/health")
+            resp = json.loads(urllib.request.urlopen(req, timeout=2).read())
+            if resp.get("status") == "ok" or resp.get("status") == "ready":
+                return True
+        except:
+            pass
+        time.sleep(5)
+    return False
+
+def get_vram_usage():
+    try:
+        out = subprocess.check_output(
+            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader"],
+            text=True
+        )
+        return out.strip().split("\n")
+    except:
+        return ["Failed to get VRAM info"]
+
+def ask(prompt, max_tokens=300):
+    payload = json.dumps({
+        "model": "m",
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }).encode()
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+    t0 = time.time()
+    resp = json.loads(urllib.request.urlopen(req, timeout=120).read())
+    dt = time.time() - t0
+    usage = resp.get("usage", {})
+    content = resp["choices"][0]["message"]["content"]
+    
+    tokens = usage.get("completion_tokens", 0)
+    tps = round(tokens / dt, 2) if dt > 0 else 0
+    return {"time": round(dt,2), "tokens": tokens, "tps": tps, "res": content[:150] + "..."}
+
+def main():
+    results = []
+    
+    # Kill any existing llama-server
+    subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    time.sleep(3)
+    
+    for cfg in MODELS:
+        print(f"\n[{time.strftime('%H:%M:%S')}] Starting {cfg['name']}...")
+        
+        # Start server
+        proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        
+        # Wait for boot
+        print(f"Waiting for server to boot (up to 5 mins)...")
+        is_ready = check_server(300)
+        
+        if not is_ready:
+            print(f"❌ Failed to boot {cfg['name']}.")
+            results.append({"name": cfg["name"], "status": "Failed to boot (OOM or Crash)"})
+            proc.terminate()
+            subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            time.sleep(5)
+            continue
+            
+        print(f"✅ Server Ready!")
+        vram = get_vram_usage()
+        
+        # Warmup
+        try:
+            ask("Hello", max_tokens=10)
+        except Exception as e:
+            pass
+            
+        test_data = {}
+        for p in TEST_PROMPTS:
+            print(f"  Testing {p['id']}...", end="", flush=True)
+            try:
+                res = ask(p["prompt"])
+                test_data[p["id"]] = res
+                print(f" {res['tps']} t/s")
+            except Exception as e:
+                test_data[p["id"]] = {"error": str(e)}
+                print(f" ERROR: {e}")
+                
+        results.append({
+            "name": cfg["name"],
+            "status": "Success",
+            "vram": vram,
+            "tests": test_data
+        })
+        
+        # Save incremental
+        with open(RESULTS_FILE, "w", encoding="utf-8") as f:
+            json.dump(results, f, ensure_ascii=False, indent=2)
+            
+        # Shutdown
+        print("Shutting down server...")
+        proc.terminate()
+        subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        time.sleep(5)
+
+    print("\n✅ All tests complete!")
+    print(f"Results saved to {RESULTS_FILE}")
+
+if __name__ == "__main__":
+    main()
--- a/scripts/_archive/benchmarks/deep_tier_extreme_test.py
+++ b/scripts/_archive/benchmarks/deep_tier_extreme_test.py
@@ -0,0 +1,171 @@
+import subprocess
+import time
+import urllib.request
+import json
+import sys
+
+# Best-effort switch of stdout to UTF-8; any failure is ignored.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except:
+    pass
+
+# Root URL for the health check and chat-completion requests; its port matches
+# the "--port" value in every MODELS command line below.
+BASE_URL = "http://127.0.0.1:8000"
+# JSON report written by main(). The path is relative, so it resolves against
+# the current working directory.
+RESULTS_FILE = "scripts/deep_tier_extreme_results.json"
+
+# Server configurations benchmarked in list order. "name" is the label used in
+# log lines and in the results; "cmd" is the argv list handed to
+# subprocess.Popen. In the names, "극한" means "extreme" and "확장" means
+# "extended".
+MODELS = [
+    {
+        "name": "Qwen 27B - 256K 극한 (q4_0, ub=512)",
+        "cmd": [
+            r"llama_bin_run\llama-server.exe",
+            "--model", r"models\Qwen3.5-27B-Q4_K_M.gguf",
+            # 262144 = 256 * 1024, the "256K" in the name.
+            "-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
+            "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
+            "-ub", "512", "-b", "1024", "-t", "6", "-tb", "6",
+            "--prio", "3", "--mlock", "--poll", "50", "-ts", "0.5,0.5",
+            "--port", "8000", "--host", "0.0.0.0"
+        ]
+    },
+    {
+        "name": "Gemma 31B - 128K 확장 (q4_0)",
+        "cmd": [
+            r"llama_bin_run\llama-server.exe",
+            "--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
+            # 131072 = 128 * 1024, the "128K" in the name.
+            "-ngl", "999", "-c", "131072", "-np", "1", "-fa", "on",
+            "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
+            "-ub", "256", "-b", "1024", "-t", "6", "-tb", "6",
+            "--prio", "3", "--mlock", "--poll", "50",
+            "--port", "8000", "--host", "0.0.0.0"
+        ]
+    },
+    {
+        "name": "Gemma 31B - 192K 극한 (q4_0)",
+        "cmd": [
+            r"llama_bin_run\llama-server.exe",
+            "--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
+            # 196608 = 192 * 1024, the "192K" in the name.
+            "-ngl", "999", "-c", "196608", "-np", "1", "-fa", "on",
+            "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
+            "-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
+            "--prio", "3", "--mlock", "--poll", "50",
+            "--port", "8000", "--host", "0.0.0.0"
+        ]
+    }
+]
+
+# Prompts sent to every model that boots; "id" becomes the key of that
+# prompt's entry in the per-model results.
+TEST_PROMPTS = [
+    {
+        "id": "code",
+        "prompt": "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code."
+    },
+    {
+        "id": "logical",
+        "prompt": "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step."
+    }
+]
+
+def check_server(timeout=300):
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            req = urllib.request.Request(f"{BASE_URL}/health")
+            resp = json.loads(urllib.request.urlopen(req, timeout=2).read())
+            if resp.get("status") == "ok" or resp.get("status") == "ready":
+                return True
+        except:
+            pass
+        time.sleep(5)
+    return False
+
+def get_vram_usage():
+    try:
+        out = subprocess.check_output(
+            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader"],
+            text=True
+        )
+        return out.strip().split("\n")
+    except:
+        return ["Failed to get VRAM info"]
+
+def ask(prompt, max_tokens=300):
+    payload = json.dumps({
+        "model": "m",
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }).encode()
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+    t0 = time.time()
+    resp = json.loads(urllib.request.urlopen(req, timeout=120).read())
+    dt = time.time() - t0
+    usage = resp.get("usage", {})
+    content = resp["choices"][0]["message"]["content"]
+    
+    tokens = usage.get("completion_tokens", 0)
+    tps = round(tokens / dt, 2) if dt > 0 else 0
+    return {"time": round(dt,2), "tokens": tokens, "tps": tps, "res": content[:150] + "..."}
+
+def main():
+    results = []
+    
+    # Clean init
+    subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    time.sleep(3)
+    
+    for cfg in MODELS:
+        print(f"\n[{time.strftime('%H:%M:%S')}] Starting {cfg['name']}...")
+        
+        proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        print(f"Waiting for server to boot (up to 5 mins)...")
+        is_ready = check_server(300)
+        
+        if not is_ready:
+            print(f"❌ Failed to boot {cfg['name']}.")
+            results.append({"name": cfg["name"], "status": "Failed to boot (OOM or Crash)"})
+            proc.terminate()
+            subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            time.sleep(5)
+            continue
+            
+        print(f"✅ Server Ready!")
+        vram = get_vram_usage()
+        print(f"VRAM: {vram}")
+        
+        # Warmup
+        try:
+            ask("Hello", max_tokens=10)
+        except Exception:
+            pass
+            
+        test_data = {}
+        for p in TEST_PROMPTS:
+            print(f"  Testing {p['id']}...", end="", flush=True)
+            try:
+                res = ask(p["prompt"])
+                test_data[p["id"]] = res
+                print(f" {res['tps']} t/s")
+            except Exception as e:
+                test_data[p["id"]] = {"error": str(e)}
+                print(f" ERROR: {e}")
+                
+        results.append({
+            "name": cfg["name"],
+            "status": "Success",
+            "vram": vram,
+            "tests": test_data
+        })
+        
+        with open(RESULTS_FILE, "w", encoding="utf-8") as f:
+            json.dump(results, f, ensure_ascii=False, indent=2)
+            
+        print("Shutting down server...")
+        proc.terminate()
+        subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        time.sleep(5)
+
+    print("\n✅ Extreme testing complete! Check results in", RESULTS_FILE)
+
+if __name__ == "__main__":
+    main()
--- a/scripts/_archive/benchmarks/gemma4_test.py
+++ b/scripts/_archive/benchmarks/gemma4_test.py
@@ -0,0 +1,88 @@
+"""
+Gemma 4 26B-A4B Q4_K_M - reproduction test for the 76.4 t/s result
+Previous best settings: ngl=999 t=6 ub=512 b=2048 ctk=f16 ctv=f16
+"""
+import subprocess, time, json, urllib.request, sys, os
+
+try: sys.stdout.reconfigure(encoding='utf-8')
+except: pass
+
+LLAMA = os.path.join(os.getcwd(), "llama_bin_run", "llama-server.exe")
+MODEL = os.path.join(os.getcwd(), "models", "gemma-4-26B-A4B-it-Q4_K_M.gguf")
+
+subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+time.sleep(3)
+
+cmd = [
+    LLAMA, "--model", MODEL,
+    "-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
+    "--cache-type-k", "f16", "--cache-type-v", "f16",
+    "-ub", "512", "-b", "2048", "-t", "6", "-tb", "6",
+    "--prio", "3", "--mlock", "--poll", "50",
+    "--port", "8000", "--host", "0.0.0.0",
+]
+
+print("[1/4] Starting Gemma4 26B Q4_K_M (76.4 t/s config)...")
+server = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+print("[2/4] Waiting for boot...")
+healthy = False
+for sec in range(180):
+    time.sleep(1)
+    if server.poll() is not None:
+        print(f"  !! CRASHED (exit code {server.returncode})")
+        sys.exit(1)
+    try:
+        with urllib.request.urlopen("http://127.0.0.1:8000/health", timeout=1) as r:
+            if json.loads(r.read()).get("status") == "ok":
+                healthy = True; break
+    except: pass
+    if sec % 10 == 9: print(f"  ... {sec+1}s")
+
+if not healthy:
+    print("  FAIL: boot timeout"); server.kill(); sys.exit(1)
+
+print(f"  OK!")
+try:
+    v = subprocess.run(["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5)
+    print(f"  VRAM: {v.stdout.strip()}")
+except: pass
+
+def bench(n):
+    payload = json.dumps({"messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}], "max_tokens": n, "temperature": 0}).encode()
+    req = urllib.request.Request("http://127.0.0.1:8000/v1/chat/completions", data=payload, headers={"Content-Type": "application/json"})
+    t0 = time.time()
+    with urllib.request.urlopen(req, timeout=120) as r:
+        res = json.loads(r.read())
+    el = time.time() - t0
+    ct = res["usage"]["completion_tokens"]
+    return ct / el, ct, el
+
+try: bench(10)
+except: pass
+
+print("[3/4] Running 5x benchmark (200 tokens)...")
+results = []
+for i in range(5):
+    tps, tok, el = bench(200)
+    results.append(tps)
+    print(f"  Run {i+1}: {tps:.2f} t/s  ({tok} tok / {el:.2f}s)")
+
+avg = sum(results) / len(results)
+best = max(results)
+worst = min(results)
+summary = f"""
+==================================================
+ Gemma4 26B Q4_K_M  5-Run Results:
+   AVG:  {avg:.2f} t/s
+   BEST: {best:.2f} t/s
+   MIN:  {worst:.2f} t/s
+   Runs: {[f'{r:.2f}' for r in results]}
+==================================================
+"""
+print(summary)
+with open("scripts/gemma4_test_result.txt", "w") as f:
+    f.write(summary)
+
+server.kill()
+subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
--- a/scripts/_archive/benchmarks/llm_judge_test.py
+++ b/scripts/_archive/benchmarks/llm_judge_test.py
@@ -0,0 +1,134 @@
+import subprocess
+import time
+import urllib.request
+import json
+import sys
+import traceback
+
+# Best-effort switch of stdout to UTF-8; any failure is ignored.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except:
+    pass
+
+# Root URL for the health check and chat-completion requests; its port matches
+# the "--port" value in every MODELS command line below.
+BASE_URL = "http://127.0.0.1:8000"
+# JSON file that main() writes the collected answers to (relative path, so it
+# resolves against the current working directory).
+RESULTS_FILE = "scripts/llm_judge_answers.json"
+
+# Models questioned in list order. "name" is the label used in log lines and
+# as the key in the results; "cmd" is the argv list handed to
+# subprocess.Popen.
+MODELS = [
+    {
+        "name": "Qwen 27B",
+        "cmd": [
+            r"llama_bin_run\llama-server.exe",
+            "--model", r"models\Qwen3.5-27B-Q4_K_M.gguf",
+            "-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
+            "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
+            "-ub", "512", "-b", "1024", "-t", "6", "-tb", "6",
+            "--prio", "3", "--mlock", "--poll", "50", "-ts", "0.5,0.5",
+            "--port", "8000", "--host", "0.0.0.0"
+        ]
+    },
+    {
+        "name": "Gemma 31B",
+        "cmd": [
+            r"llama_bin_run\llama-server.exe",
+            "--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
+            "-ngl", "999", "-c", "196608", "-np", "1", "-fa", "on",
+            "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
+            "-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
+            "--prio", "3", "--mlock", "--poll", "50",
+            "--port", "8000", "--host", "0.0.0.0"
+        ]
+    }
+]
+
+# Questions put to every model; "id" becomes the key of the answer in the
+# results. The prompts are written in Korean.
+QUESTIONS = [
+    {
+        # Design a system that ingests real-time stock tick data and
+        # broadcasts it to clients over WebSockets, with bottleneck remedies.
+        "id": "architecture",
+        "prompt": "수백만 건의 실시간 주식 틱 데이터를 수집하고, 이를 가공하여 초당 수천 명의 클라이언트에게 웹소켓으로 지연 없이 브로드캐스팅하는 시스템을 설계해야 합니다. 언어, 메시지 큐, 데이터베이스, 캐싱 전략 등을 포함해 구체적인 아키텍처를 제안하고, 병목 현상에 대비한 해결책을 설명하세요."
+    },
+    {
+        # Handshake logic puzzle about five people, answered step by step.
+        "id": "logic",
+        "prompt": "논리 문제: 방 안에 5명의 사람(A, B, C, D, E)이 있습니다. A는 B를 제외한 모든 사람과 악수했습니다. B는 C와만 악수했습니다. C는 D와 악수하지 않았습니다. 그렇다면 E는 총 몇 명과 악수했을까요? 당신의 논리적 사고 과정을 한 단계씩 명확히 설명해주세요."
+    },
+    {
+        # Write a Python retry decorator: up to 3 retries, exponential
+        # backoff, logging, and a configurable exception type.
+        "id": "coding",
+        "prompt": "파이썬에서 데코레이터를 작성하세요. 이 데코레이터는 함수의 실행을 최대 3번까지 재시도하며, 각 재시도 간에 지수 백오프(Exponential Backoff)를 적용해야 합니다. 로깅 처리가 포함되어야 하며, 어떤 예외 타입(Exception type)이 발생했을 때만 재시도할지 인자로 받을 수 있어야 합니다."
+    }
+]
+
+def check_server(timeout=300):
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            req = urllib.request.Request(f"{BASE_URL}/health")
+            resp = json.loads(urllib.request.urlopen(req, timeout=2).read())
+            if resp.get("status") == "ok" or resp.get("status") == "ready":
+                return True
+        except:
+            pass
+        time.sleep(5)
+    return False
+
+def ask(prompt, max_tokens=4096):
+    payload = json.dumps({
+        "model": "m",
+        "messages": [
+            {"role": "system", "content": "You are a world-class IT system architect and developer. Please output your response in Korean."},
+            {"role": "user", "content": prompt}
+        ],
+        "max_tokens": -1,
+        "temperature": 0.0
+    }).encode('utf-8')
+    
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+    t0 = time.time()
+    resp = json.loads(urllib.request.urlopen(req, timeout=1800).read())
+    dt = time.time() - t0
+    content = resp["choices"][0]["message"]["content"]
+    return content
+
+def main():
+    results = {}
+    
+    subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    time.sleep(3)
+    
+    for cfg in MODELS:
+        print(f"\n[{time.strftime('%H:%M:%S')}] Booting {cfg['name']}...")
+        proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        
+        if not check_server(300):
+            print(f"Failed to boot {cfg['name']}.")
+            proc.terminate()
+            continue
+            
+        print(f"✅ {cfg['name']} is ready! Asking questions...")
+        
+        try: ask("Hi", max_tokens=10) 
+        except: pass
+        
+        results[cfg['name']] = {}
+        for q in QUESTIONS:
+            print(f"  -> Asking: {q['id']}")
+            try:
+                ans = ask(q['prompt'])
+                results[cfg['name']][q['id']] = ans
+                print("     (Done)")
+            except Exception as e:
+                results[cfg['name']][q['id']] = f"ERROR: {e}"
+                print("     (Error)")
+                
+        with open(RESULTS_FILE, "w", encoding="utf-8") as f:
+            json.dump(results, f, ensure_ascii=False, indent=2)
+            
+        proc.terminate()
+        subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        time.sleep(5)
+
+    print("\n✅ All questions answered! Results saved to", RESULTS_FILE)
+
+if __name__ == "__main__":
+    main()
--- a/scripts/_archive/benchmarks/perf_test_122b.py
+++ b/scripts/_archive/benchmarks/perf_test_122b.py
@@ -0,0 +1,169 @@
+import time
+import json
+import urllib.request
+import sys
+import os
+import re
+
+# Switch stdout to UTF-8 when the stream offers reconfigure(); a stream
+# without that method raises AttributeError, which is deliberately ignored.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass
+
+# Root URL that the request helpers in this script build their endpoints from.
+BASE_URL = "http://127.0.0.1:8000"
+
+def check_server():
+    """Check if server is up"""
+    try:
+        req = urllib.request.Request(f"{BASE_URL}/health")
+        with urllib.request.urlopen(req, timeout=5) as resp:
+            data = json.loads(resp.read())
+            return data.get("status") == "ok"
+    except:
+        return False
+
+def check_slots():
+    """Check server slot info for VRAM usage details"""
+    try:
+        req = urllib.request.Request(f"{BASE_URL}/slots")
+        with urllib.request.urlopen(req, timeout=5) as resp:
+            return json.loads(resp.read())
+    except:
+        return None
+
+def run_benchmark(prompt, max_tokens=300, label="Test"):
+    """Run a single benchmark request and return results"""
+    payload = json.dumps({
+        "model": "local-model",
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }).encode("utf-8")
+
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+
+    start = time.time()
+    with urllib.request.urlopen(req, timeout=600) as resp:
+        result = json.loads(resp.read())
+    elapsed = time.time() - start
+
+    content = result["choices"][0]["message"].get("content", "")
+    usage = result.get("usage", {})
+    prompt_tokens = usage.get("prompt_tokens", 0)
+    completion_tokens = usage.get("completion_tokens", 0)
+
+    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0
+
+    return {
+        "label": label,
+        "prompt_tokens": prompt_tokens,
+        "completion_tokens": completion_tokens,
+        "elapsed": elapsed,
+        "gen_tps_approx": gen_tps,
+        "content_preview": content[:150]
+    }
+
+def main():
+    print("=" * 70)
+    print("  Qwen3.5 122B-A10B Performance Benchmark")
+    print("  Target: 10+ t/s generation speed")
+    print("=" * 70)
+    print()
+
+    # Wait for server (model loading takes 3-5 min for 71 GB)
+    print("[1/4] Waiting for server (122B model load takes 3-5 min)...")
+    max_wait = 600  # 10 minutes max
+    for i in range(max_wait // 5):
+        if check_server():
+            print(f"  -> Server is ready! (waited {i*5}s)")
+            break
+        if i % 6 == 0:
+            print(f"  -> Loading model... ({i*5}s / {max_wait}s)")
+        time.sleep(5)
+    else:
+        print(f"  -> ERROR: Server not responding after {max_wait}s")
+        return
+
+    # Check server info
+    print()
+    print("[2/4] Checking server status...")
+    slots = check_slots()
+    if slots:
+        print(f"  -> Slots available: {len(slots)}")
+
+    # Warmup
+    print()
+    print("[3/4] Warmup run (short, pre-heating GPU caches)...")
+    try:
+        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
+        print(f"  -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
+        print(f"  -> Warmup speed: {warmup['gen_tps_approx']:.2f} t/s (includes prompt eval)")
+    except Exception as e:
+        print(f"  -> Warmup failed: {e}")
+
+    # Main benchmark - 5 runs for statistical reliability
+    print()
+    print("[4/4] Running main benchmark (5 runs x 300 tokens)...")
+    print("-" * 70)
+
+    test_prompts = [
+        "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.",
+        "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.",
+        "Describe the complete process of photosynthesis in plants. Include both light-dependent and light-independent reactions.",
+        "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.",
+        "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.",
+    ]
+    
+    results = []
+    for i in range(5):
+        prompt = test_prompts[i % len(test_prompts)]
+        print(f"\n  Run {i+1}/5: {prompt[:50]}...")
+        try:
+            r = run_benchmark(prompt, max_tokens=300, label=f"Run {i+1}")
+            results.append(r)
+            print(f"    Completion tokens: {r['completion_tokens']}")
+            print(f"    Total time: {r['elapsed']:.2f}s")
+            print(f"    Approx speed: {r['gen_tps_approx']:.2f} t/s (includes prompt eval)")
+        except Exception as e:
+            print(f"    ERROR: {e}")
+
+    if results:
+        print()
+        print("=" * 70)
+        print("  RESULTS SUMMARY - Qwen3.5 122B-A10B")
+        print("=" * 70)
+        avg_tps = sum(r["gen_tps_approx"] for r in results) / len(results)
+        max_tps = max(r["gen_tps_approx"] for r in results)
+        min_tps = min(r["gen_tps_approx"] for r in results)
+        total_tokens = sum(r["completion_tokens"] for r in results)
+        total_time = sum(r["elapsed"] for r in results)
+        
+        print(f"  Runs completed: {len(results)}/5")
+        print(f"  Total tokens:   {total_tokens}")
+        print(f"  Total time:     {total_time:.1f}s")
+        print()
+        print(f"  Approx TPS (avg): {avg_tps:.2f} t/s")
+        print(f"  Approx TPS (min): {min_tps:.2f} t/s")
+        print(f"  Approx TPS (max): {max_tps:.2f} t/s")
+        print()
+        
+        # Verdict
+        if avg_tps >= 10:
+            print("  ✅ TARGET ACHIEVED: 10+ t/s!")
+        elif avg_tps >= 8:
+            print("  ⚠️  CLOSE TO TARGET: Consider further tuning")
+        else:
+            print(f"  ❌ BELOW TARGET: {avg_tps:.1f} t/s < 10 t/s")
+        
+        print()
+        print("  ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.")
+        print("  ⚡ Check the server console/log for exact 'eval time' t/s value,")
+        print("  ⚡ which shows pure token generation speed (always higher).")
+        print("=" * 70)
+
+if __name__ == "__main__":
+    main()
--- a/scripts/_archive/benchmarks/quality_ab_test.py
+++ b/scripts/_archive/benchmarks/quality_ab_test.py
@@ -0,0 +1,241 @@
+"""
+Quality A/B Test — Gemma 4 26B vs Qwen 3.5 35B
+실제 서비스 시나리오 기반 품질 비교
+"""
+import urllib.request, json, time, sys, os
+
+try:
+    # Switch stdout to UTF-8: this script prints Korean scenario names and
+    # emoji status markers.
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    # stdout objects without reconfigure() are left unchanged.
+    pass
+
+BASE = "http://127.0.0.1:8000"
+MODEL_NAME = sys.argv[1] if len(sys.argv) > 1 else "unknown"
+OUTPUT_FILE = f"scripts/quality_result_{MODEL_NAME}.json"
+
+SCENARIOS = [
+    # ═══ 1. 코딩 에이전트 (VS Code) ═══
+    {
+        "id": "code_generate",
+        "category": "coding",
+        "name": "Python 함수 생성",
+        "prompt": "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code.",
+        "eval_criteria": ["correctness", "type_hints", "docstring", "edge_cases"]
+    },
+    {
+        "id": "code_debug",
+        "category": "coding",
+        "name": "버그 찾기 & 수정",
+        "prompt": """Find and fix the bug in this code:
+```python
+def find_duplicates(arr):
+    seen = {}
+    duplicates = []
+    for item in arr:
+        if item in seen:
+            duplicates.append(item)
+        seen[item] = True
+    return list(set(duplicates))
+
+# Bug: find_duplicates([1,2,2,3,3,3]) returns [2,3] but
+# find_duplicates([]) crashes with unexpected behavior
+# Also it should return count of each duplicate
+```
+Fix it to return a dict like {2: 2, 3: 3} (value=count of occurrences).""",
+        "eval_criteria": ["bug_identified", "correct_fix", "clean_code"]
+    },
+    {
+        "id": "code_refactor",
+        "category": "coding",
+        "name": "TypeScript 리팩토링",
+        "prompt": """Refactor this messy TypeScript into clean, typed code:
+```typescript
+async function getData(url, retry, timeout) {
+  let result = null
+  for (let i = 0; i < retry; i++) {
+    try {
+      const r = await fetch(url, {signal: AbortSignal.timeout(timeout)})
+      if (r.ok) {
+        result = await r.json()
+        break
+      }
+    } catch(e) {
+      if (i === retry - 1) throw e
+      await new Promise(r => setTimeout(r, 1000 * (i+1)))
+    }
+  }
+  return result
+}
+```
+Add proper types, error handling, configurable backoff, and make it production-ready.""",
+        "eval_criteria": ["types", "error_handling", "backoff", "production_quality"]
+    },
+
+    # ═══ 2. 개인 비서 (Discord Bot) — 한국어 ═══
+    {
+        "id": "korean_schedule",
+        "category": "assistant_kr",
+        "name": "한국어 일정 관리",
+        "prompt": "내일 오후 2시에 팀 미팅이 있고, 3시에 치과 예약이 있어. 그리고 저녁 7시에 친구랑 홍대에서 만나기로 했어. 이 일정들을 정리해주고, 이동 시간을 고려해서 현실적으로 가능한지 알려줘. 서울 기준으로.",
+        "eval_criteria": ["korean_fluency", "schedule_analysis", "practical_advice"]
+    },
+    {
+        "id": "korean_email",
+        "category": "assistant_kr",
+        "name": "한국어 이메일 요약",
+        "prompt": """다음 이메일을 3줄로 요약하고, 필요한 액션을 정리해줘:
+
+안녕하세요 김팀장님,
+
+지난 주 논의했던 Q2 마케팅 예산 관련하여 연락드립니다. 
+본부장님께서 기존 제안 대비 15% 삭감을 요청하셨습니다. 
+이에 따라 디지털 마케팅 채널 중 ROI가 낮은 채널을 우선 정리해야 할 것 같습니다.
+
+리서치팀에서는 네이버 검색광고 대비 인스타그램 광고의 전환율이 
+0.3%로 가장 낮다는 분석 결과를 공유했습니다. 
+수요일까지 수정안을 제출해야 하니, 화요일 오전까지 
+각 채널별 삭감 우선순위를 정리해서 회신 부탁드립니다.
+
+감사합니다.
+마케팅팀 박과장 드림""",
+        "eval_criteria": ["korean_summary", "action_items", "conciseness"]
+    },
+
+    # ═══ 3. MCP 도구 (Function Calling) ═══
+    {
+        "id": "tool_calling",
+        "category": "tool_use",
+        "name": "Function Calling (JSON)",
+        "prompt": """You have access to these tools:
+- search_web(query: string) -> string
+- get_calendar(date: string) -> list[Event]  
+- send_email(to: string, subject: string, body: string) -> bool
+
+User says: "Check my calendar for tomorrow, and if I have a meeting with John, search for the latest quarterly report and email him a summary."
+
+Respond with the exact sequence of tool calls as JSON array. Use this format:
+[{"tool": "name", "args": {...}}, ...]""",
+        "eval_criteria": ["correct_sequence", "valid_json", "complete_args"]
+    },
+    {
+        "id": "structured_output",
+        "category": "tool_use",
+        "name": "구조화 출력 (JSON)",
+        "prompt": """Parse this unstructured text into a JSON object:
+
+"삼성전자가 2026년 1분기 실적을 발표했다. 매출은 79조원으로 전년 동기 대비 12% 증가했고, 영업이익은 15.2조원을 기록했다. 반도체 부문이 전체 이익의 65%를 차지했으며, 특히 HBM4 수요 증가로 인해 메모리 사업부 매출이 전 분기 대비 23% 성장했다."
+
+Output format:
+{
+  "company": "",
+  "period": "",
+  "revenue": {"amount": "", "unit": "", "yoy_change": ""},
+  "operating_profit": {"amount": "", "unit": ""},
+  "segments": [{"name": "", "profit_share": "", "highlights": ""}]
+}""",
+        "eval_criteria": ["correct_parsing", "valid_json", "completeness"]
+    },
+
+    # ═══ 4. 일반 추론 ═══
+    {
+        "id": "reasoning",
+        "category": "reasoning",
+        "name": "논리 추론",
+        "prompt": "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step.",
+        "eval_criteria": ["correct_answer", "clear_steps", "math_accuracy"]
+    },
+]
+
+
+def ask(prompt, max_tokens=800):
+    """Send one user prompt to the chat-completions endpoint and time it.
+
+    Args:
+        prompt: Text sent as the single user message.
+        max_tokens: Value passed as ``max_tokens`` in the request body.
+
+    Returns:
+        A dict with ``content`` (message text of the first choice),
+        ``tokens`` (``usage.completion_tokens``, 0 when absent), ``time``
+        (wall-clock seconds for the whole request, rounded to 2 places) and
+        ``tps`` (tokens divided by the unrounded time, rounded to 2 places).
+
+    Errors raised by urllib, json or a response without the expected keys
+    propagate to the caller.
+    """
+    payload = json.dumps({
+        "model": "m",
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0
+    }).encode()
+    req = urllib.request.Request(
+        f"{BASE}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+    t0 = time.time()
+    # The context manager closes the HTTP response once the body is read.
+    with urllib.request.urlopen(req, timeout=120) as http_resp:
+        resp = json.loads(http_resp.read())
+    dt = time.time() - t0
+    tokens = resp.get("usage", {}).get("completion_tokens", 0)
+    content = resp["choices"][0]["message"]["content"]
+    return {
+        "content": content,
+        "tokens": tokens,
+        "time": round(dt, 2),
+        "tps": round(tokens / dt, 2) if dt > 0 else 0
+    }
+
+
+def main():
+    """Run every entry of SCENARIOS against the server and save the answers.
+
+    Prints a banner, exits with status 1 when ``{BASE}/health`` is
+    unreachable or does not report ``status == "ok"``, sends one warmup
+    request, then calls ask() once per scenario and writes all records to
+    OUTPUT_FILE as JSON. A scenario that raises is recorded with an
+    ``ERROR: ...`` response and zeroed metrics instead of stopping the run.
+    Every record has the same keys, including ``eval_criteria``.
+    """
+    print(f"{'='*60}")
+    print(f"  Quality A/B Test — Model: {MODEL_NAME}")
+    print(f"  {len(SCENARIOS)} scenarios | {time.strftime('%Y-%m-%d %H:%M')}")
+    print(f"{'='*60}\n")
+
+    # Health check
+    try:
+        req = urllib.request.Request(f"{BASE}/health")
+        with urllib.request.urlopen(req, timeout=5) as http_resp:
+            health = json.loads(http_resp.read())
+        if health.get("status") != "ok":
+            print("Server not ready!")
+            sys.exit(1)
+    except Exception as e:
+        # sys.exit() raises SystemExit, which is not an Exception subclass,
+        # so the "not ready" exit above is not caught here.
+        print(f"Server not reachable: {e}")
+        sys.exit(1)
+
+    # Warmup
+    print("Warmup...", flush=True)
+    ask("Hello", max_tokens=10)
+    print("Done\n", flush=True)
+
+    results = []
+    for i, sc in enumerate(SCENARIOS):
+        print(f"[{i+1}/{len(SCENARIOS)}] {sc['category']} — {sc['name']}")
+        print(f"  Prompt: {sc['prompt'][:80]}...", flush=True)
+
+        # Fields shared by success and error records.
+        record = {
+            "id": sc["id"],
+            "category": sc["category"],
+            "name": sc["name"],
+            "model": MODEL_NAME,
+        }
+        try:
+            resp = ask(sc["prompt"])
+            print(f"  ✅ {resp['tokens']} tokens | {resp['tps']:.1f} t/s | {resp['time']}s")
+            print(f"  Response preview: {resp['content'][:120]}...\n")
+            record.update(
+                response=resp["content"],
+                tokens=resp["tokens"],
+                time=resp["time"],
+                tps=resp["tps"],
+            )
+        except Exception as e:
+            print(f"  ❌ Error: {e}\n")
+            record.update(response=f"ERROR: {e}", tokens=0, time=0, tps=0)
+        # Attached on both paths so failed scenarios keep the same schema.
+        record["eval_criteria"] = sc["eval_criteria"]
+        results.append(record)
+
+    # Save. Create the target directory first so a missing folder cannot
+    # discard a finished run.
+    out_dir = os.path.dirname(OUTPUT_FILE)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
+        json.dump(results, f, ensure_ascii=False, indent=2)
+    print(f"\n{'='*60}")
+    print(f"  Results saved: {OUTPUT_FILE}")
+    print(f"  Total scenarios: {len(results)}")
+    print(f"{'='*60}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/_archive/benchmarks/quick_bench.py
+++ b/scripts/_archive/benchmarks/quick_bench.py
@@ -0,0 +1,45 @@
+"""Quick benchmark for running llama-server instance"""
+import urllib.request, json, time, sys
+
+BASE = "http://127.0.0.1:8000"
+RUNS = 5
+TOKENS = 200
+
+def bench(max_tokens=TOKENS):
+    """Time one chat completion against BASE.
+
+    Args:
+        max_tokens: Value passed as ``max_tokens`` in the request body.
+
+    Returns:
+        A tuple ``(tokens_per_second, completion_tokens, seconds)`` where the
+        rate is ``completion_tokens`` divided by the wall-clock time of the
+        whole request (0 when no time elapsed).
+    """
+    payload = json.dumps({
+        "model": "m",
+        "messages": [{"role": "user", "content": "Count from 1 to 100, each number on a new line."}],
+        "max_tokens": max_tokens,
+        "temperature": 0
+    }).encode()
+    req = urllib.request.Request(
+        f"{BASE}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+    t0 = time.time()
+    # The context manager closes the HTTP response once the body is read.
+    with urllib.request.urlopen(req, timeout=300) as http_resp:
+        resp = json.loads(http_resp.read())
+    dt = time.time() - t0
+    ct = resp.get("usage", {}).get("completion_tokens", 0)
+    return ct / dt if dt > 0 else 0, ct, dt
+
+# One short request whose result is discarded; abort if it cannot complete.
+print("Warmup...", flush=True)
+try:
+    bench(20)
+except Exception as warmup_error:
+    print(f"Warmup failed: {warmup_error}")
+    sys.exit(1)
+print("Warmup done\n", flush=True)
+
+# Measured runs: collect the tokens-per-second figure of each one.
+speeds = []
+for run_no in range(1, RUNS + 1):
+    rate, n_tokens, seconds = bench()
+    speeds.append(rate)
+    print(f"  Run {run_no}: {rate:.2f} t/s (tokens={n_tokens}, time={seconds:.2f}s)", flush=True)
+
+banner = "=" * 50
+print(f"\n{banner}")
+print(f"  RESULT: AVG {sum(speeds) / len(speeds):.2f} / BEST {max(speeds):.2f} / MIN {min(speeds):.2f} t/s")
+print(banner)
--- a/scripts/_archive/benchmarks/quick_pptest.mjs
+++ b/scripts/_archive/benchmarks/quick_pptest.mjs
@@ -0,0 +1,31 @@
+// Quick PP+TG speed test
+const BASE = "http://127.0.0.1:8000";
+
+// Send one chat completion and print prompt-processing (PP) and
+// token-generation (TG) token counts and speeds on a single line.
+//   label  - tag printed at the start of the line
+//   prompt - content of the single user message
+//   maxTok - value sent as max_tokens
+async function test(label, prompt, maxTok) {
+  const t0 = Date.now();
+  const r = await fetch(`${BASE}/v1/chat/completions`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({ model: "m", messages: [{ role: "user", content: prompt }], max_tokens: maxTok, temperature: 0 }),
+    signal: AbortSignal.timeout(600000),
+  });
+  const d = await r.json();
+  const dt = (Date.now() - t0) / 1000;
+  const u = d.usage || {};
+  const pp = u.prompt_tokens || 0;
+  const tg = u.completion_tokens || 0;
+  // dt covers prompt processing AND generation, so dividing either token
+  // count by it understates both phases. Prefer the per-phase rates from the
+  // response's `timings` object when it is present, and fall back to the
+  // wall-clock estimate otherwise.
+  // NOTE(review): llama-server is expected to report `timings`; confirm that
+  // whatever listens on BASE passes the field through.
+  const tm = d.timings || {};
+  const ppRate = tm.prompt_per_second > 0 ? tm.prompt_per_second : pp / dt;
+  const tgRate = tm.predicted_per_second > 0 ? tm.predicted_per_second : tg / dt;
+  const ppSpeed = pp > 0 ? ppRate.toFixed(1) : "?";
+  const tgSpeed = tg > 0 ? tgRate.toFixed(1) : "?";
+  console.log(`${label} | PP:${pp}tok ${ppSpeed}t/s | TG:${tg}tok ${tgSpeed}t/s | ${dt.toFixed(1)}s`);
+}
+
+// Prompts used by the runs below.
+// short: a tiny prompt.
+const short = "Count 1 to 20.";
+// long: 3000 filler characters followed by an instruction.
+const long = "x".repeat(3000) + " Summarize above in 3 words.";
+// code: 200 copies of a one-line function, newline-joined, plus an instruction.
+const code = Array(200).fill("function foo(x) { return x * 2 + Math.random(); }").join("\n") + "\n\nRefactor above to arrow functions. Show first 5 lines.";
+
+// Runs execute one after another; each call prints its own result line.
+await test("warmup", short, 20);
+await test("SHORT", short, 200);
+await test("3K-PP", long, 100);
+await test("10K-CODE", code, 100);
+await test("TG-500", short, 500);
+console.log("DONE");
--- a/scripts/_archive/benchmarks/qwen_split_challenge.py
+++ b/scripts/_archive/benchmarks/qwen_split_challenge.py
@@ -0,0 +1,67 @@
+import subprocess
+import time
+import json
+import urllib.request
+import sys
+import os
+
+try: sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError: pass
+
+MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
+LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
+
+# Terminate any llama-server.exe that is already running before starting ours.
+subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True)
+time.sleep(2)
+
+cmd = [
+    LLAMA_SERVER, "--model", MODEL,
+    "-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",
+    "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
+    "-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
+    "--prio", "3", "--port", "8000", "--host", "0.0.0.0",
+    "-ts", "0.44,0.56"
+]
+
+print(f"🚀 Starting Challenge (0.44, 0.56) ...")
+proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+# Everything after the launch runs under try/finally so the server process is
+# killed on every exit path, including sys.exit() and Ctrl+C.
+try:
+    # Poll /health once per iteration, up to 120 attempts.
+    ready = False
+    for i in range(120):
+        # Stop waiting as soon as the server process itself has exited.
+        if proc.poll() is not None:
+            print(f"\n❌ Server exited during boot (exit code {proc.returncode}).")
+            sys.exit(1)
+        try:
+            req = urllib.request.Request("http://127.0.0.1:8000/health")
+            with urllib.request.urlopen(req, timeout=1) as r:
+                if json.loads(r.read()).get("status") == "ok":
+                    ready = True
+                    break
+        except Exception:
+            # Probe failed (connection, HTTP or parse error): retry.
+            # Exception, not a bare except, so Ctrl+C is not swallowed.
+            pass
+        print(f" booting... {i}s", end='\r', flush=True)
+        time.sleep(1)
+
+    if not ready:
+        print("\n❌ FAILED to boot.")
+        sys.exit(1)
+
+    print("\n✅ Booted! Testing 200 tokens...")
+    try:
+        payload = json.dumps({
+            "messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}],
+            "max_tokens": 200, "temperature": 0
+        }).encode()
+        req = urllib.request.Request("http://127.0.0.1:8000/v1/chat/completions", data=payload, headers={"Content-Type": "application/json"})
+        t0 = time.time()
+        with urllib.request.urlopen(req, timeout=300) as r:
+            res = json.loads(r.read())
+            el = time.time() - t0
+        ct = res["usage"]["completion_tokens"]
+        # Rate over the whole request, so it includes prompt processing time.
+        tps = ct / el
+        print("="*50)
+        print(f"★ 0.44 / 0.56 RESULT: {tps:.2f} t/s ★")
+        print(f"   Tokens: {ct} | Time: {el:.2f}s")
+        print("="*50)
+    except Exception as e:
+        print(f"\n❌ Benchmark Error: {e}")
+finally:
+    proc.kill()
--- a/scripts/_archive/benchmarks/qwen_split_test.py
+++ b/scripts/_archive/benchmarks/qwen_split_test.py
@@ -0,0 +1,141 @@
+"""
+Qwen 3.5 35B-A3B Q4_K_M - tensor-split speed test
+Based on the settings that reached 64 t/s; only the split ratio (TS below) is changed
+"""
+import subprocess, time, json, urllib.request, sys, os
+
+# NOTE(review): PYTHON does not appear to be referenced later in this script.
+PYTHON = sys.executable
+# Server binary and model are resolved against the current working directory.
+LLAMA = os.path.join(os.getcwd(), "llama_bin_run", "llama-server.exe")
+MODEL = os.path.join(os.getcwd(), "models", "Qwen3.5-35B-A3B-Q4_K_M.gguf")
+# Value passed to the server's -ts option and echoed in the summary.
+TS = "0.55,0.45"
+
+# 1. Kill any existing server
+print("[1/4] Killing existing llama-server...")
+subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"],
+               stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+time.sleep(3)
+
+# 2. Start server with 64t/s config + custom split
+args = [
+    LLAMA, "--model", MODEL,
+    "-ngl", "999",
+    "-c", "262144",
+    "-np", "1",
+    "-fa", "on",
+    "--cache-type-k", "q4_0",
+    "--cache-type-v", "q4_0",
+    "-ub", "128",
+    "-b", "512",
+    "-t", "6",
+    "-tb", "6",
+    "--prio", "3",
+    "--mlock",
+    "--poll", "50",
+    "--port", "8000",
+    "--host", "0.0.0.0",
+    "-ts", TS,
+]
+print(f"[2/4] Starting server with -ts {TS}")
+# Only the last six arguments (--port, --host, -ts and their values) are echoed.
+print(f"       CMD: {' '.join(args[-6:])}")
+# Server output is discarded; readiness is detected by polling /health.
+server = subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+# 3. Wait for health
+print("[3/4] Waiting for server to become healthy...")
+t_boot = time.time()
+healthy = False
+for sec in range(180):  # 180 attempts, at least 1 s apart
+    time.sleep(1)
+    # Check if process crashed
+    if server.poll() is not None:
+        print(f"  !! Server process CRASHED (exit code {server.returncode})")
+        sys.exit(1)
+    try:
+        # The context manager closes each probe's response.
+        with urllib.request.urlopen("http://127.0.0.1:8000/health", timeout=1) as r:
+            body = json.loads(r.read())
+        if body.get("status") == "ok":
+            healthy = True
+            break
+    except Exception:
+        # Probe failed; try again on the next iteration.
+        pass
+    if sec % 10 == 9:
+        print(f"  ... {sec+1}s elapsed")
+
+if not healthy:
+    print(f"  FAIL: Server not healthy after 180 seconds")
+    server.kill()
+    sys.exit(1)
+
+boot_secs = time.time() - t_boot
+print(f"  OK: Booted in {boot_secs:.1f}s")
+
+# VRAM check (best effort: skipped when nvidia-smi is missing or times out)
+try:
+    v = subprocess.run(
+        ["nvidia-smi", "--query-gpu=index,memory.used,memory.total",
+         "--format=csv,noheader,nounits"],
+        capture_output=True, text=True, timeout=5)
+    print(f"  VRAM: {v.stdout.strip()}")
+except Exception:
+    # Exception, not a bare except, so Ctrl+C still interrupts the script.
+    pass
+
+# 4. Benchmark
+print("[4/4] Running token speed benchmark (200 tokens)...")
+
+def do_bench(max_tok):
+    """Time one chat completion; return (tokens per second, tokens, seconds)."""
+    body = {
+        "messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}],
+        "max_tokens": max_tok,
+        "temperature": 0,
+    }
+    encoded = json.dumps(body).encode("utf-8")
+    request = urllib.request.Request(
+        "http://127.0.0.1:8000/v1/chat/completions",
+        data=encoded,
+        headers={"Content-Type": "application/json"},
+    )
+    started = time.time()
+    with urllib.request.urlopen(request, timeout=120) as reply:
+        parsed = json.loads(reply.read())
+    seconds = time.time() - started
+    n_tokens = parsed["usage"]["completion_tokens"]
+    return n_tokens / seconds, n_tokens, seconds
+
+# The runs execute under try/finally so the server is stopped even when a
+# benchmark request raises.
+try:
+    # warmup (best effort; a failure here does not stop the measured runs)
+    try:
+        do_bench(10)
+    except Exception:
+        pass
+
+    # real runs - 5 times
+    print("[4/4] Running 5x benchmark (200 tokens each)...")
+    results = []
+    for i in range(5):
+        tps, tokens, elapsed = do_bench(200)
+        results.append(tps)
+        # VRAM check per run
+        try:
+            v = subprocess.run(["nvidia-smi", "--query-gpu=index,memory.used,memory.total,memory.free", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5)
+            vram_info = v.stdout.strip()
+        except Exception:
+            vram_info = "?"
+        print(f"  Run {i+1}: {tps:.2f} t/s  ({tokens} tok / {elapsed:.2f}s) | VRAM: {vram_info}")
+
+    avg = sum(results) / len(results)
+    best = max(results)
+    worst = min(results)
+    summary = f"""
+==================================================
+ TS={TS}  5-Run Results (with --mlock --poll 50):
+   AVG:  {avg:.2f} t/s
+   BEST: {best:.2f} t/s
+   MIN:  {worst:.2f} t/s
+   Runs: {[f'{r:.2f}' for r in results]}
+==================================================
+"""
+    print(summary)
+    with open("scripts/split_test_result.txt", "w") as f:
+        f.write(summary)
+finally:
+    # cleanup: kill our child, then any other llama-server.exe still running
+    server.kill()
+    subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"],
+                   stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
--- a/scripts/_archive/benchmarks/test_qwen.py
+++ b/scripts/_archive/benchmarks/test_qwen.py
@@ -0,0 +1,36 @@
+import urllib.request
+import json
+import traceback
+
+BASE_URL = "http://127.0.0.1:8000"
+prompt = "수백만 건의 실시간 주식 틱 데이터를 수집하고, 이를 가공하여 초당 수천 명의 클라이언트에게 웹소켓으로 지연 없이 브로드캐스팅하는 시스템을 설계해야 합니다. 언어, 메시지 큐, 데이터베이스, 캐싱 전략 등을 포함해 구체적인 아키텍처를 제안하고, 병목 현상에 대비한 해결책을 설명하세요."
+
+def test():
+    """Send the module-level prompt once and print the model's answer.
+
+    Exceptions raised while building, sending or reading the request are
+    caught and printed together with a traceback.
+    """
+    try:
+        payload = json.dumps({
+            "model": "m",
+            "messages": [
+                {"role": "system", "content": "You are a world-class IT system architect and developer. Please output your response in Korean."},
+                {"role": "user", "content": prompt}
+            ],
+            "max_tokens": 4096,
+            "temperature": 0.1
+        }).encode('utf-8')
+
+        req = urllib.request.Request(
+            f"{BASE_URL}/v1/chat/completions",
+            data=payload,
+            headers={"Content-Type": "application/json"}
+        )
+        print("전송 중... (타임아웃 300초)")
+        # The context manager closes the HTTP response once the body is read.
+        with urllib.request.urlopen(req, timeout=300) as http_resp:
+            res_json = json.loads(http_resp.read())
+        print("\n=== 결과 ===")
+        print(res_json["choices"][0]["message"]["content"])
+    except Exception as e:
+        print("\n=== 에러 발생 ===")
+        print(e)
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    test()
--- a/scripts/_archive/benchmarks/test_split_03_07.mjs
+++ b/scripts/_archive/benchmarks/test_split_03_07.mjs
@@ -0,0 +1,84 @@
+import { spawn, execSync } from "child_process";
+
+const BASE_URL = "http://127.0.0.1:8000";
+const args = [
+  "llama_bin_run\\llama-server.exe",
+  "--model", "models\\Qwen3.5-35B-A3B-Q4_K_M.gguf",
+  "-ngl", "999",
+  "-c", "262144",
+  "-np", "1",
+  "-fa", "on",
+  "--cache-type-k", "q4_0",
+  "--cache-type-v", "q4_0",
+  "-ub", "128",
+  "-b", "512",
+  "-t", "6",
+  "-tb", "6",
+  "--prio", "3",
+  "--port", "8000",
+  "--host", "0.0.0.0",
+  "-ts", "0.3,0.7"
+];
+
+// Echo the full command line, stop any llama-server.exe that is already
+// running (errors from taskkill are deliberately ignored), wait 2 s, then
+// launch the server with its stdio discarded.
+console.log(`Starting server with args: \n${args.join(" ")}\n`);
+try { execSync('taskkill /F /IM llama-server.exe', { stdio: 'ignore' }); } catch(e) {}
+await new Promise(r => setTimeout(r, 2000));
+
+const server = spawn(args[0], args.slice(1), { stdio: 'ignore' });
+
+// Poll /health every 3 s (60 attempts) until it answers 200, but stop early
+// if the server process has already terminated.
+let ready = false;
+let exitedEarly = false;
+let bootStart = Date.now();
+for (let i = 0; i < 60; i++) {
+  // exitCode and signalCode stay null while the child is still running.
+  if (server.exitCode !== null || server.signalCode !== null) { exitedEarly = true; break; }
+  try {
+    const res = await fetch(`${BASE_URL}/health`);
+    if (res.status === 200) { ready = true; break; }
+  } catch(e) {}
+  await new Promise(r => setTimeout(r, 3000));
+}
+
+if (!ready) {
+  console.log(exitedEarly
+    ? `Server process exited during boot (code ${server.exitCode}).`
+    : "Server failed to boot within 3 mins.");
+  try { execSync('taskkill /F /IM llama-server.exe', { stdio: 'ignore' }); } catch(e) {}
+  process.exit(1);
+}
+
+// Report how long the health wait took.
+const bootElapsed = (Date.now() - bootStart) / 1000;
+console.log(`\n===========================================`);
+console.log(`Server booted in ${bootElapsed.toFixed(1)}s.`);
+
+// Best-effort VRAM report; skipped silently when nvidia-smi fails.
+try {
+  const vram = execSync('nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits', { encoding: 'utf-8' });
+  console.log(`VRAM USAGE:\n${vram.trim()}`);
+} catch(e) {}
+console.log(`===========================================\n`);
+
+// Warmup request; its response and any error are ignored.
+try {
+  await fetch(`${BASE_URL}/v1/chat/completions`, {
+    method: "POST", headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({ messages: [{ role: "user", content: "hi" }], max_tokens: 10, temperature: 0 })
+  });
+} catch(e) {}
+
+// Measured run: tokens per second is completion_tokens divided by the
+// wall-clock time of the whole request.
+console.log("Running speed test (200 tokens)...");
+const t0 = Date.now();
+try {
+    const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
+      method: "POST", headers: { "Content-Type": "application/json" },
+      body: JSON.stringify({ messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }], max_tokens: 200, temperature: 0 })
+    });
+    const result = await res.json();
+    const elapsed = (Date.now() - t0) / 1000;
+    // A response without usage.completion_tokens is reported as 0 tokens.
+    const ct = result?.usage?.completion_tokens || 0;
+    const tps = ct / elapsed;
+
+    console.log(`\n===========================================`);
+    console.log(`★ 0.3/0.7 SPLIT RESULT: ${tps.toFixed(2)} t/s ★`);
+    console.log(`   Tokens: ${ct}`);
+    console.log(`   Time: ${elapsed.toFixed(2)}s\n===========================================\n`);
+
+} catch(e) {
+    console.log("ERROR during benchmark:", e.message);
+}
+
+// Cleanup: stop llama-server.exe via taskkill, then exit explicitly.
+try { execSync('taskkill /F /IM llama-server.exe', { stdio: 'ignore' }); } catch(e) {}
+process.exit(0);
--- a/scripts/_archive/benchmarks/test_split_03_07.py
+++ b/scripts/_archive/benchmarks/test_split_03_07.py
@@ -0,0 +1,108 @@
+import subprocess
+import time
+import json
+import urllib.request
+import sys
+import os
+
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass
+
+BASE_URL = "http://127.0.0.1:8000"
+LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
+MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
+CONTEXT = 262144
+
+def kill_server():
+    """Force-terminate every llama-server.exe via taskkill, then wait 3 s.
+
+    Failure to launch taskkill itself is ignored; its exit status is not
+    checked.
+    """
+    try:
+        subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True)
+    except OSError:
+        # taskkill could not be started. Anything else, including Ctrl+C,
+        # is allowed to propagate.
+        pass
+    time.sleep(3)
+
+def run_benchmark(max_tokens=200):
+    """Time one chat completion; return (tokens per second, tokens, seconds).
+
+    The rate is 0 when no time elapsed, and the token count is 0 when the
+    response carries no ``usage.completion_tokens``.
+    """
+    body = {
+        "model": "local-model",
+        "messages": [{"role": "user", "content": "Count from 1 to 50, each on new line."}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0,
+    }
+    request = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=json.dumps(body).encode("utf-8"),
+        headers={"Content-Type": "application/json"},
+    )
+
+    began = time.time()
+    with urllib.request.urlopen(request, timeout=300) as reply:
+        parsed = json.loads(reply.read())
+    seconds = time.time() - began
+
+    n_tokens = parsed.get("usage", {}).get("completion_tokens", 0)
+    if seconds > 0:
+        rate = n_tokens / seconds
+    else:
+        rate = 0
+    return rate, n_tokens, seconds
+
+def get_vram():
+    """Return nvidia-smi's per-GPU memory report, or "Unknown" on failure.
+
+    The report is the stripped stdout of an nvidia-smi query for
+    index, memory.used and memory.total in headerless CSV form.
+    """
+    try:
+        r = subprocess.run(
+            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=5
+        )
+        return r.stdout.strip()
+    except (OSError, subprocess.SubprocessError):
+        # nvidia-smi is missing or did not finish within the timeout.
+        return "Unknown"
+
+kill_server()
+
+# Single source for the split ratio, so the launch flag and the printed
+# labels cannot drift apart.
+TENSOR_SPLIT = "0.45,0.55"
+
+cmd = [
+    LLAMA_SERVER, "--model", MODEL,
+    "-ngl", "999", "-c", str(CONTEXT), "-np", "1", "-fa", "on",
+    "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
+    "-ub", "128", "-b", "512", "-t", "6", "-tb", "6",
+    "--prio", "3", "--port", "8000", "--host", "0.0.0.0",
+    "-ts", TENSOR_SPLIT
+]
+
+print(f"Starting server with tensorSplit {TENSOR_SPLIT}")
+proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=os.getcwd())
+
+# Poll /health up to 30 times, sleeping 3 s between attempts.
+ready = False
+boot_start = time.time()
+for _ in range(30):
+    try:
+        req = urllib.request.Request(f"{BASE_URL}/health")
+        with urllib.request.urlopen(req, timeout=2) as resp:
+            data = json.loads(resp.read())
+            if data.get("status") == "ok":
+                ready = True
+                break
+    except Exception:
+        # Probe failed; retry. Exception, not a bare except, so Ctrl+C
+        # still interrupts the wait.
+        pass
+    time.sleep(3)
+
+if not ready:
+    print("Server failed to boot.")
+    kill_server()
+    sys.exit(1)
+
+boot_time = time.time() - boot_start
+print(f"Booted in {boot_time:.1f}s")
+print(f"VRAM:\n{get_vram()}")
+
+try:
+    print("Warming up...")
+    run_benchmark(10)
+
+    print("Benchmarking (200 tokens)...")
+    tps, ct, el = run_benchmark(200)
+    print("=" * 50)
+    # The label comes from the split actually passed to the server.
+    print(f"★ {TENSOR_SPLIT.replace(',', '/')} SPLIT RESULT: {tps:.2f} t/s ★")
+    print(f"   Tokens: {ct} / Time: {el:.2f}s")
+    print("=" * 50)
+except Exception as e:
+    print(f"Error benchmark: {e}")
+
+kill_server()