# variet_llm/scripts/_archive/benchmarks/deep_tier_extreme_test.py

import subprocess
import time
import urllib.request
import json
import sys

# Best effort: switch stdout to UTF-8 because this script prints emoji and
# Korean model names (presumably to avoid UnicodeEncodeError on consoles using
# a legacy code page).
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError, OSError):
    # stdout may be None, may have been replaced by an object without
    # reconfigure(), or may not support reconfiguration; keep the default
    # encoding. KeyboardInterrupt/SystemExit are deliberately not swallowed.
    pass

# Base URL used for the health check and the chat-completion requests; the
# port matches the "--port" value passed in every MODELS command line.
BASE_URL = "http://127.0.0.1:8000"
# Path the benchmark results are written to as JSON (relative to the current
# working directory).
RESULTS_FILE = "scripts/deep_tier_extreme_results.json"

def _server_cmd(model_path, ctx, ubatch, batch, extra=()):
    """Return the llama-server argument list for one benchmark configuration.

    All configurations share the same flags; only the model file, the values
    passed to ``-c``, ``-ub`` and ``-b``, and optional extra flags (inserted
    right before ``--port``) differ between them.
    """
    return [
        r"llama_bin_run\llama-server.exe",
        "--model", model_path,
        "-ngl", "999", "-c", ctx, "-np", "1", "-fa", "on",
        "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
        "-ub", ubatch, "-b", batch, "-t", "6", "-tb", "6",
        "--prio", "3", "--mlock", "--poll", "50", *extra,
        "--port", "8000", "--host", "0.0.0.0",
    ]


_QWEN_27B_GGUF = r"models\Qwen3.5-27B-Q4_K_M.gguf"
_GEMMA_31B_GGUF = r"models\gemma-4-31B-it-Q4_K_M.gguf"

# Server configurations to benchmark, in run order. Each entry carries a
# display name and the full command line used to launch the server.
MODELS = [
    {
        "name": "Qwen 27B - 256K 극한 (q4_0, ub=512)",
        "cmd": _server_cmd(
            _QWEN_27B_GGUF, "262144", "512", "1024", extra=("-ts", "0.5,0.5")
        ),
    },
    {
        "name": "Gemma 31B - 128K 확장 (q4_0)",
        "cmd": _server_cmd(_GEMMA_31B_GGUF, "131072", "256", "1024"),
    },
    {
        "name": "Gemma 31B - 192K 극한 (q4_0)",
        "cmd": _server_cmd(_GEMMA_31B_GGUF, "196608", "128", "512"),
    },
]

# (id, prompt) pairs; expanded below into the {"id", "prompt"} dicts that the
# benchmark loop iterates over.
_PROMPT_SPECS = (
    (
        "code",
        "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code.",
    ),
    (
        "logical",
        "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step.",
    ),
)

TEST_PROMPTS = [
    {"id": prompt_id, "prompt": text} for prompt_id, text in _PROMPT_SPECS
]

def check_server(timeout=300):
    """Poll the server's ``/health`` endpoint until it reports readiness.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as the endpoint returns a JSON object whose ``status``
        is ``"ok"`` or ``"ready"``; False if that never happens before the
        timeout elapses (including when ``timeout`` is zero or negative).
    """
    # Local import: only needed to name the exception type below.
    import http.client

    # Monotonic clock so wall-clock adjustments cannot shorten/extend the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        body = None
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            # Context manager closes the connection after every probe.
            with urllib.request.urlopen(req, timeout=2) as resp:
                body = json.loads(resp.read())
        except (OSError, ValueError, http.client.HTTPException):
            # Connection refused / timeout / HTTP error status (OSError
            # family), malformed JSON (ValueError) or a broken HTTP reply:
            # the server is not ready yet. KeyboardInterrupt is not caught,
            # so Ctrl-C still aborts the wait.
            pass
        if isinstance(body, dict) and body.get("status") in ("ok", "ready"):
            return True
        # Wait before the next probe, but never sleep past the deadline.
        time.sleep(max(0.0, min(5.0, deadline - time.monotonic())))
    return False

def get_vram_usage():
    """Return the per-GPU memory usage lines reported by ``nvidia-smi``.

    Returns:
        The stripped ``nvidia-smi`` CSV output (queried fields: index,
        memory.used, memory.total; no header) split on newlines, or
        ``["Failed to get VRAM info"]`` when nvidia-smi cannot be started,
        exits with a non-zero status, or its output cannot be decoded.
    """
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader"],
            text=True,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: binary missing / not executable; SubprocessError: non-zero
        # exit status; ValueError: undecodable output. Anything else (e.g.
        # KeyboardInterrupt) is allowed to propagate.
        return ["Failed to get VRAM info"]
    return out.strip().split("\n")

def ask(prompt, max_tokens=300):
    """Send one chat-completion request and summarise the response.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Value of the ``max_tokens`` field in the request.

    Returns:
        A dict with:
          * ``time``   - seconds for the whole request (send, wait, read and
            parse), rounded to 2 decimals;
          * ``tokens`` - ``usage.completion_tokens`` from the response
            (0 if absent);
          * ``tps``    - ``tokens / time`` rounded to 2 decimals; because the
            duration covers the whole request, this is an end-to-end figure;
          * ``res``    - the first 150 characters of the reply followed by
            ``"..."`` (the ellipsis is always appended).

    Raises:
        Whatever ``urllib.request.urlopen`` / ``json.loads`` raise on
        transport or parse failure, and ``KeyError`` / ``IndexError`` if the
        response lacks ``choices[0].message.content``.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode()
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    # perf_counter is the appropriate clock for measuring short durations.
    t0 = time.perf_counter()
    # Context manager ensures the HTTP connection is closed after reading.
    with urllib.request.urlopen(req, timeout=120) as http_resp:
        resp = json.loads(http_resp.read())
    dt = time.perf_counter() - t0

    # Tolerate explicit JSON nulls for "usage", "completion_tokens" and
    # "content" instead of failing with AttributeError/TypeError.
    usage = resp.get("usage") or {}
    content = resp["choices"][0]["message"]["content"] or ""

    tokens = usage.get("completion_tokens") or 0
    tps = round(tokens / dt, 2) if dt > 0 else 0
    return {"time": round(dt, 2), "tokens": tokens, "tps": tps, "res": content[:150] + "..."}

def _kill_llama_servers():
    """Force-kill every running llama-server.exe via taskkill (best effort)."""
    try:
        # Argument list instead of a shell string: no shell is needed here.
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # taskkill could not be started (e.g. not available on this system);
        # cleanup stays best-effort, as with the original shell invocation.
        pass


def _stop_server(proc):
    """Terminate the launched server, reap it, then sweep leftover instances."""
    proc.terminate()
    try:
        # Reap the child so its handle is released before the next launch.
        proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()
    _kill_llama_servers()
    # Pause before the next configuration is started.
    time.sleep(5)


def _save_results(results):
    """Write all results collected so far to RESULTS_FILE as UTF-8 JSON."""
    import os  # local import: only needed to create the output directory

    out_dir = os.path.dirname(RESULTS_FILE)
    if out_dir:
        # Avoid losing a long benchmark run to a missing output directory.
        os.makedirs(out_dir, exist_ok=True)
    with open(RESULTS_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


def _run_test_prompts():
    """Run every TEST_PROMPTS entry; return {prompt id: result or error}."""
    test_data = {}
    for p in TEST_PROMPTS:
        print(f"  Testing {p['id']}...", end="", flush=True)
        try:
            res = ask(p["prompt"])
            test_data[p["id"]] = res
            print(f" {res['tps']} t/s")
        except Exception as e:
            # Per-prompt boundary: record the failure and keep benchmarking.
            test_data[p["id"]] = {"error": str(e)}
            print(f" ERROR: {e}")
    return test_data


def main():
    """Benchmark every configuration in MODELS and save the results.

    For each configuration the server is launched, polled until healthy (up
    to 5 minutes), warmed up with one short request, and then exercised with
    every TEST_PROMPTS entry. Results are written to RESULTS_FILE after each
    configuration - including ones that failed to boot - so a failure of the
    last configuration is still recorded. The server process is always
    stopped, even if an exception or Ctrl-C interrupts the run.
    """
    results = []

    # Clean init: make sure no previous server instance is still running.
    _kill_llama_servers()
    time.sleep(3)

    for cfg in MODELS:
        print(f"\n[{time.strftime('%H:%M:%S')}] Starting {cfg['name']}...")

        proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        try:
            print("Waiting for server to boot (up to 5 mins)...")
            if not check_server(300):
                print(f"❌ Failed to boot {cfg['name']}.")
                results.append({"name": cfg["name"], "status": "Failed to boot (OOM or Crash)"})
                _save_results(results)
                continue

            print("✅ Server Ready!")
            vram = get_vram_usage()
            print(f"VRAM: {vram}")

            # Warmup: one short request before the measured prompts.
            try:
                ask("Hello", max_tokens=10)
            except Exception as e:
                # A failed warmup is not fatal, but should not be invisible.
                print(f"  Warmup request failed (continuing): {e}")

            results.append({
                "name": cfg["name"],
                "status": "Success",
                "vram": vram,
                "tests": _run_test_prompts()
            })
            _save_results(results)

            print("Shutting down server...")
        finally:
            # Runs on success, on boot failure (`continue`) and on exceptions,
            # so the server never outlives its benchmark iteration.
            _stop_server(proc)

    print("\n✅ Extreme testing complete! Check results in", RESULTS_FILE)

# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()