"""Benchmark several llama.cpp server configurations one after another.

For every entry in ``MODELS`` the script launches ``llama-server``, waits for
its ``/health`` endpoint to report readiness, records GPU memory usage via
``nvidia-smi``, runs each prompt in ``TEST_PROMPTS`` against the
OpenAI-compatible chat endpoint, and writes the accumulated results to
``RESULTS_FILE`` after every model (including models that failed to boot).
"""

import http.client
import json
import os
import subprocess
import sys
import time
import urllib.error
import urllib.request

try:
    # Best-effort: make the console accept non-ASCII output. stdout may have
    # been replaced by an object without ``reconfigure``.
    sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    pass

BASE_URL = "http://127.0.0.1:8000"
RESULTS_FILE = "scripts/deep_tier_auto_results.json"

# Errors that just mean "the server is not answering properly yet".
_TRANSIENT_HTTP_ERRORS = (
    urllib.error.URLError,
    http.client.HTTPException,
    OSError,
    ValueError,  # includes json.JSONDecodeError
)

MODELS = [
    {
        "name": "Qwen 27B - 256K (q4_0)",
        "cmd": [
            r"llama_bin_run\llama-server.exe",
            "--model", r"models\Qwen3.5-27B-Q4_K_M.gguf",
            "-ngl", "999",
            "-c", "262144",
            "-np", "1",
            "-fa", "on",
            "--cache-type-k", "q4_0",
            "--cache-type-v", "q4_0",
            "-ub", "128",
            "-b", "512",
            "-t", "6",
            "-tb", "6",
            "--prio", "3",
            "--mlock",
            "--poll", "50",
            "-ts", "0.5,0.5",
            "--port", "8000",
            "--host", "0.0.0.0",
        ],
    },
    {
        "name": "Gemma 31B - 32K (q4_0)",
        "cmd": [
            r"llama_bin_run\llama-server.exe",
            "--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
            "-ngl", "999",
            "-c", "32768",
            "-np", "1",
            "-fa", "on",
            "--cache-type-k", "q4_0",
            "--cache-type-v", "q4_0",
            "-ub", "256",
            "-b", "1024",
            "-t", "6",
            "-tb", "6",
            "--prio", "3",
            "--mlock",
            "--poll", "50",
            "--port", "8000",
            "--host", "0.0.0.0",
        ],
    },
    {
        "name": "Gemma 31B - 64K (q4_0)",
        "cmd": [
            r"llama_bin_run\llama-server.exe",
            "--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
            "-ngl", "999",
            "-c", "65536",
            "-np", "1",
            "-fa", "on",
            "--cache-type-k", "q4_0",
            "--cache-type-v", "q4_0",
            "-ub", "256",
            "-b", "1024",
            "-t", "6",
            "-tb", "6",
            "--prio", "3",
            "--mlock",
            "--poll", "50",
            "--port", "8000",
            "--host", "0.0.0.0",
        ],
    },
]

TEST_PROMPTS = [
    {
        "id": "code",
        "prompt": "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code.",
    },
    {
        "id": "logical",
        "prompt": "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step.",
    },
]


def check_server(timeout=300, proc=None, poll_interval=5):
    """Poll ``{BASE_URL}/health`` until the server reports it is usable.

    Args:
        timeout: Maximum number of seconds to keep polling.
        proc: Optional ``subprocess.Popen`` of the server. When given and the
            process has already exited, polling stops immediately instead of
            waiting out the whole timeout.
        poll_interval: Seconds to sleep between attempts.

    Returns:
        True once the health payload has ``status`` equal to ``"ok"`` or
        ``"ready"``; False on timeout or when ``proc`` has exited.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if proc is not None and proc.poll() is not None:
            # The server died (e.g. out of memory); it will never become ready.
            return False
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=2) as http_resp:
                health = json.loads(http_resp.read())
        except _TRANSIENT_HTTP_ERRORS:
            health = None
        if isinstance(health, dict) and health.get("status") in ("ok", "ready"):
            return True
        time.sleep(poll_interval)
    return False


def get_vram_usage():
    """Return per-GPU memory usage lines as reported by ``nvidia-smi``.

    Returns:
        A list of CSV lines (``index, memory.used, memory.total``), or the
        single-element list ``["Failed to get VRAM info"]`` when ``nvidia-smi``
        is missing or exits with an error.
    """
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader"],
            text=True,
        )
    except (OSError, subprocess.SubprocessError):
        return ["Failed to get VRAM info"]
    return out.strip().split("\n")


def _summarize_completion(resp, dt):
    """Reduce a chat-completion response to timing stats and a short preview."""
    usage = resp.get("usage") or {}
    # ``content`` can legitimately be null in the API response; treat it as empty.
    content = resp["choices"][0]["message"].get("content") or ""
    tokens = usage.get("completion_tokens", 0) or 0
    tps = round(tokens / dt, 2) if dt > 0 else 0
    return {"time": round(dt, 2), "tokens": tokens, "tps": tps, "res": content[:150] + "..."}


def ask(prompt, max_tokens=300, timeout=120):
    """Send one user prompt to the chat-completions endpoint and time it.

    Args:
        prompt: Text of the user message.
        max_tokens: Generation limit passed to the server.
        timeout: Socket timeout in seconds for the HTTP request.

    Returns:
        A dict with ``time`` (seconds), ``tokens`` (completion tokens),
        ``tps`` (tokens per second) and ``res`` (first 150 characters of the
        reply followed by ``"..."``).

    Raises:
        urllib.error.URLError: If the request fails.
        KeyError, IndexError: If the response lacks the expected structure.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode()
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as http_resp:
        resp = json.loads(http_resp.read())
    dt = time.perf_counter() - t0
    return _summarize_completion(resp, dt)


def _kill_llama_servers():
    """Force-kill every running ``llama-server.exe`` (best effort)."""
    try:
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except OSError:
        # ``taskkill`` is not available (e.g. not running on Windows).
        pass


def _stop_server(proc):
    """Terminate the launched server, reap it, then sweep any stragglers."""
    if proc.poll() is None:
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()
            try:
                proc.wait(timeout=10)
            except subprocess.TimeoutExpired:
                pass
    _kill_llama_servers()


def _save_results(results):
    """Write all results collected so far to ``RESULTS_FILE``."""
    out_dir = os.path.dirname(RESULTS_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(RESULTS_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


def _benchmark_running_server(name):
    """Run the warm-up and every test prompt against the already-booted server."""
    vram = get_vram_usage()

    # Warm-up request so the first timed prompt does not pay one-off costs.
    try:
        ask("Hello", max_tokens=10)
    except Exception as e:
        print(f"  Warmup failed: {e}")

    test_data = {}
    for p in TEST_PROMPTS:
        print(f"  Testing {p['id']}...", end="", flush=True)
        try:
            res = ask(p["prompt"])
        except Exception as e:
            # Record the failure and keep going with the remaining prompts.
            test_data[p["id"]] = {"error": str(e)}
            print(f" ERROR: {e}")
        else:
            test_data[p["id"]] = res
            print(f" {res['tps']} t/s")

    return {"name": name, "status": "Success", "vram": vram, "tests": test_data}


def main():
    """Benchmark every configuration in ``MODELS`` and save the results.

    Each server is always shut down again, even when a run is interrupted,
    and the results file is rewritten after every model so partial progress
    (including boot failures) is never lost.
    """
    results = []

    # Kill any existing llama-server
    _kill_llama_servers()
    time.sleep(3)

    for cfg in MODELS:
        print(f"\n[{time.strftime('%H:%M:%S')}] Starting {cfg['name']}...")

        # Start server
        try:
            proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except OSError as e:
            print(f"Failed to launch {cfg['name']}: {e}")
            results.append({"name": cfg["name"], "status": f"Failed to launch: {e}"})
            _save_results(results)
            continue

        try:
            # Wait for boot
            print("Waiting for server to boot (up to 5 mins)...")
            # NOTE(review): the status glyphs in the two messages below look
            # mis-encoded; confirm the file's encoding before changing them.
            if check_server(300, proc=proc):
                print("āœ… Server Ready!")
                results.append(_benchmark_running_server(cfg["name"]))
            else:
                print(f"āŒ Failed to boot {cfg['name']}.")
                results.append({"name": cfg["name"], "status": "Failed to boot (OOM or Crash)"})

            # Save incremental
            _save_results(results)
        finally:
            # Shutdown
            print("Shutting down server...")
            _stop_server(proc)
            time.sleep(5)

    print("\nāœ… All tests complete!")
    print(f"Results saved to {RESULTS_FILE}")


if __name__ == "__main__":
    main()