""" Qwen 3.5 35B-A3B Q4_K_M - Tensor Split 0.42/0.58 Speed Test 64 t/s 달성 설정 기반, 스플릿 비율만 변경 """ import subprocess, time, json, urllib.request, sys, os PYTHON = sys.executable LLAMA = os.path.join(os.getcwd(), "llama_bin_run", "llama-server.exe") MODEL = os.path.join(os.getcwd(), "models", "Qwen3.5-35B-A3B-Q4_K_M.gguf") TS = "0.55,0.45" # 1. Kill any existing server print("[1/4] Killing existing llama-server...") subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) time.sleep(3) # 2. Start server with 64t/s config + custom split args = [ LLAMA, "--model", MODEL, "-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on", "--cache-type-k", "q4_0", "--cache-type-v", "q4_0", "-ub", "128", "-b", "512", "-t", "6", "-tb", "6", "--prio", "3", "--mlock", "--poll", "50", "--port", "8000", "--host", "0.0.0.0", "-ts", TS, ] print(f"[2/4] Starting server with -ts {TS}") print(f" CMD: {' '.join(args[-6:])}") server = subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # 3. Wait for health print("[3/4] Waiting for server to become healthy...") t_boot = time.time() healthy = False for sec in range(180): # max 3 min time.sleep(1) # Check if process crashed if server.poll() is not None: print(f" !! Server process CRASHED (exit code {server.returncode})") sys.exit(1) try: r = urllib.request.urlopen("http://127.0.0.1:8000/health", timeout=1) body = json.loads(r.read()) if body.get("status") == "ok": healthy = True break except Exception: pass if sec % 10 == 9: print(f" ... {sec+1}s elapsed") if not healthy: print(f" FAIL: Server not healthy after 180 seconds") server.kill() sys.exit(1) boot_secs = time.time() - t_boot print(f" OK: Booted in {boot_secs:.1f}s") # VRAM check try: v = subprocess.run( ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5) print(f" VRAM: {v.stdout.strip()}") except: pass # 4. Benchmark print("[4/4] Running token speed benchmark (200 tokens)...") def do_bench(max_tok): payload = json.dumps({ "messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}], "max_tokens": max_tok, "temperature": 0 }).encode("utf-8") req = urllib.request.Request( "http://127.0.0.1:8000/v1/chat/completions", data=payload, headers={"Content-Type": "application/json"}) t0 = time.time() with urllib.request.urlopen(req, timeout=120) as resp: result = json.loads(resp.read()) elapsed = time.time() - t0 ct = result["usage"]["completion_tokens"] return ct / elapsed, ct, elapsed # warmup try: do_bench(10) except: pass # real runs - 5회 print("[4/4] Running 5x benchmark (200 tokens each)...") results = [] for i in range(5): tps, tokens, elapsed = do_bench(200) results.append(tps) # VRAM check per run try: v = subprocess.run(["nvidia-smi", "--query-gpu=index,memory.used,memory.total,memory.free", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5) vram_info = v.stdout.strip() except: vram_info = "?" print(f" Run {i+1}: {tps:.2f} t/s ({tokens} tok / {elapsed:.2f}s) | VRAM: {vram_info}") avg = sum(results) / len(results) best = max(results) worst = min(results) summary = f""" ================================================== TS={TS} 5-Run Results (with --mlock --poll 50): AVG: {avg:.2f} t/s BEST: {best:.2f} t/s MIN: {worst:.2f} t/s Runs: {[f'{r:.2f}' for r in results]} ================================================== """ print(summary) with open("scripts/split_test_result.txt", "w") as f: f.write(summary) # cleanup server.kill() subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)