# variet_llm/scripts/_archive/benchmarks/llm_judge_test.py

import subprocess
import time
import urllib.request
import json
import sys
import traceback

# Switch stdout to UTF-8 so the non-ASCII text this script prints (emoji) is
# presumably safe on consoles that default to another encoding. Best effort:
# skipped when stdout has no usable reconfigure() (e.g. it was replaced by an
# object that is not a TextIOWrapper, or is None).
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError, OSError):
    # Only swallow the "cannot reconfigure" cases; a bare except would also
    # hide KeyboardInterrupt / SystemExit.
    pass

# Base URL the client functions in this script send their HTTP requests to.
BASE_URL = "http://127.0.0.1:8000"
# Path of the JSON file the collected answers are written to (relative to the
# current working directory).
RESULTS_FILE = "scripts/llm_judge_answers.json"

# Launch configurations for the models under test. Each entry carries a display
# "name" and the complete argv ("cmd") used to start llama-server for it.
# Both commands pass "--port 8000", so only one server can be up at a time.
# The meaning of each option is defined by llama-server's own CLI; the two
# entries differ in model file, "-c", "-ub", "-b" and the presence of "-ts".
MODELS = [
    {
        "name": "Qwen 27B",
        "cmd": [
            r"llama_bin_run\llama-server.exe",
            "--model", r"models\Qwen3.5-27B-Q4_K_M.gguf",
            "-ngl", "999",
            "-c", "262144",
            "-np", "1",
            "-fa", "on",
            "--cache-type-k", "q4_0",
            "--cache-type-v", "q4_0",
            "-ub", "512",
            "-b", "1024",
            "-t", "6",
            "-tb", "6",
            "--prio", "3",
            "--mlock",
            "--poll", "50",
            "-ts", "0.5,0.5",
            "--port", "8000",
            "--host", "0.0.0.0",
        ],
    },
    {
        "name": "Gemma 31B",
        "cmd": [
            r"llama_bin_run\llama-server.exe",
            "--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
            "-ngl", "999",
            "-c", "196608",
            "-np", "1",
            "-fa", "on",
            "--cache-type-k", "q4_0",
            "--cache-type-v", "q4_0",
            "-ub", "128",
            "-b", "512",
            "-t", "6",
            "-tb", "6",
            "--prio", "3",
            "--mlock",
            "--poll", "50",
            "--port", "8000",
            "--host", "0.0.0.0",
        ],
    },
]

# Prompts sent to every model. Each entry is {"id": ..., "prompt": ...}; the id
# becomes the key under which the answer is stored. The prompts are in Korean:
#   architecture - design a real-time stock tick ingestion / websocket
#                  broadcast system and explain bottleneck mitigations
#   logic        - a handshake puzzle among five people, step-by-step reasoning
#   coding       - a Python retry decorator with exponential backoff, logging
#                  and configurable exception types
QUESTIONS = [
    {"id": question_id, "prompt": prompt_text}
    for question_id, prompt_text in (
        (
            "architecture",
            "수백만 건의 실시간 주식 틱 데이터를 수집하고, 이를 가공하여 초당 수천 명의 클라이언트에게 웹소켓으로 지연 없이 브로드캐스팅하는 시스템을 설계해야 합니다. 언어, 메시지 큐, 데이터베이스, 캐싱 전략 등을 포함해 구체적인 아키텍처를 제안하고, 병목 현상에 대비한 해결책을 설명하세요.",
        ),
        (
            "logic",
            "논리 문제: 방 안에 5명의 사람(A, B, C, D, E)이 있습니다. A는 B를 제외한 모든 사람과 악수했습니다. B는 C와만 악수했습니다. C는 D와 악수하지 않았습니다. 그렇다면 E는 총 몇 명과 악수했을까요? 당신의 논리적 사고 과정을 한 단계씩 명확히 설명해주세요.",
        ),
        (
            "coding",
            "파이썬에서 데코레이터를 작성하세요. 이 데코레이터는 함수의 실행을 최대 3번까지 재시도하며, 각 재시도 간에 지수 백오프(Exponential Backoff)를 적용해야 합니다. 로깅 처리가 포함되어야 하며, 어떤 예외 타입(Exception type)이 발생했을 때만 재시도할지 인자로 받을 수 있어야 합니다.",
        ),
    )
]

def check_server(timeout=300, poll_interval=5):
    """Poll the server's ``/health`` endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to wait between attempts (never longer than the
            time remaining until the deadline).

    Returns:
        True as soon as the endpoint returns JSON whose ``status`` is ``"ok"``
        or ``"ready"``; False if that does not happen within ``timeout``.
    """
    # monotonic() is immune to wall-clock adjustments during the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            # Context manager closes the connection after each probe.
            with urllib.request.urlopen(req, timeout=2) as resp:
                body = json.loads(resp.read())
            if body.get("status") in ("ok", "ready"):
                return True
        except Exception:
            # Any failure (connection refused, HTTP error status, malformed
            # body) just means "not ready yet". Deliberately not a bare
            # except, so Ctrl-C / SystemExit still abort the wait.
            pass
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(poll_interval, remaining))
    return False

def ask(prompt, max_tokens=4096):
    """Send one chat-completion request to the server and return the reply text.

    Args:
        prompt: Text used as the user message.
        max_tokens: Value sent as the request's ``max_tokens`` field.

    Returns:
        The ``content`` string of the first choice in the response.

    Raises:
        urllib.error.URLError: If the request fails (raised by ``urlopen``).
        KeyError, IndexError: If the response JSON lacks the expected
            ``choices[0].message.content`` structure.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [
            {"role": "system", "content": "You are a world-class IT system architect and developer. Please output your response in Korean."},
            {"role": "user", "content": prompt}
        ],
        # Forward the caller's limit. This was previously hard-coded to -1,
        # which silently ignored the max_tokens argument.
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode('utf-8')

    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    # Long timeout: the call blocks until the whole answer has been generated.
    with urllib.request.urlopen(req, timeout=1800) as resp:
        body = json.loads(resp.read())
    return body["choices"][0]["message"]["content"]

def _kill_stray_servers():
    """Force-kill every running llama-server.exe via ``taskkill`` (best effort)."""
    try:
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # taskkill could not be executed (e.g. not on Windows); nothing to do.
        pass


def _stop_server(proc):
    """Terminate the server process started by this script and clean up strays."""
    proc.terminate()
    try:
        # Reap the process so its handle is released before the next launch.
        proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        proc.kill()
    _kill_stray_servers()


def _collect_answers():
    """Ask every entry of QUESTIONS; return {id: answer text or "ERROR: ..."}."""
    answers = {}
    for q in QUESTIONS:
        print(f"  -> Asking: {q['id']}")
        try:
            answers[q['id']] = ask(q['prompt'])
            print("     (Done)")
        except Exception as e:
            # Record the failure in place of the answer and keep going.
            answers[q['id']] = f"ERROR: {e}"
            print("     (Error)")
    return answers


def main():
    """Run every model in MODELS through QUESTIONS and save the answers.

    For each model: start its server, wait for it to become healthy, ask all
    questions, rewrite RESULTS_FILE with everything collected so far, then
    shut the server down. Models whose server does not become healthy are
    skipped and get no entry in the results.
    """
    results = {}

    # Start from a clean slate in case an earlier server is still running.
    _kill_stray_servers()
    time.sleep(3)

    for cfg in MODELS:
        name = cfg['name']
        print(f"\n[{time.strftime('%H:%M:%S')}] Booting {name}...")
        proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        # try/finally guarantees the server is stopped on every path: failed
        # boot, normal completion, and interruption (e.g. Ctrl-C).
        try:
            if not check_server(300):
                print(f"Failed to boot {name}.")
                continue

            print(f"✅ {name} is ready! Asking questions...")

            # Throwaway request before the real questions (presumably a
            # warm-up); its result and any error are ignored.
            try:
                ask("Hi", max_tokens=10)
            except Exception:
                pass

            results[name] = _collect_answers()

            # Rewrite the file after each model so finished models are kept
            # even if a later one fails.
            with open(RESULTS_FILE, "w", encoding="utf-8") as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
        finally:
            _stop_server(proc)
            # Pause before the next launch (presumably to let the port and
            # memory be released).
            time.sleep(5)

    if results:
        print("\n✅ All questions answered! Results saved to", RESULTS_FILE)
    else:
        # The results file is only written after a successful boot.
        print("\nNo model could be started; no results were saved.")

# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()