"""Benchmark llama-server generation speed across several ``-ncmoe`` settings.

For each entry in ``CONFIGS`` the script kills any running llama-server,
boots a fresh one with ``BASE_CMD`` plus the entry's extra flags, waits for
the ``/health`` endpoint, samples GPU memory through ``nvidia-smi``, times a
couple of chat completions and finally prints a summary table.
"""

import http.client
import json
import subprocess
import sys
import time
import urllib.request

MODEL = "C:/Users/Variet-Worker/Desktop/variet-llm/models/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
BASE = "http://127.0.0.1:8000"

# Goal: see if decreasing n-cpu-moe increases VRAM usage on GPU 1 and improves speed,
# now that we know GPU 1 is isolated and has ~3.5GB free with 256K context.
BASE_CMD = [
    r"llama_bin_run\llama-server.exe",
    "--model", MODEL,
    "-ngl", "999", "-sm", "none", "--main-gpu", "1",
    "-c", "4096", "-np", "1", "-fa", "on",  # use a small context for fast boot/testing
    "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
    "-ub", "512", "-b", "2048",
    "-t", "8", "-tb", "8",
    "--prio", "3", "--poll", "50", "--no-mmap",
    "--port", "8000", "--host", "0.0.0.0",
]

CONFIGS = [
    {"name": "n-cpu-moe=48 (baseline)", "extra": ["-ncmoe", "48"]},
    {"name": "n-cpu-moe=40", "extra": ["-ncmoe", "40"]},
    {"name": "n-cpu-moe=32", "extra": ["-ncmoe", "32"]},
    {"name": "n-cpu-moe=24", "extra": ["-ncmoe", "24"]},
]

# Errors an HTTP round-trip to the server can raise: socket/URL failures
# (URLError and HTTPError derive from OSError), malformed JSON (a ValueError)
# and low-level protocol errors from http.client.
_REQUEST_ERRORS = (OSError, ValueError, http.client.HTTPException)


def kill():
    """Force-kill every ``llama-server.exe`` process, then pause 4 seconds.

    Best effort: a missing ``taskkill`` binary or a non-zero exit status
    (for example when no such process exists) is ignored.
    """
    try:
        subprocess.run(
            ["taskkill", "/F", "/IM", "llama-server.exe"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except OSError:
        # taskkill could not be launched at all; nothing more we can do here.
        pass
    time.sleep(4)


def check_server(timeout=900, proc=None):
    """Poll ``{BASE}/health`` until the server reports it is up.

    Args:
        timeout: Maximum number of seconds to keep polling.
        proc: Optional ``subprocess.Popen`` handle of the server. When given,
            polling stops as soon as that process has exited instead of
            waiting out the whole timeout.

    Returns:
        True once the health payload's ``status`` is ``"ok"`` or ``"ready"``;
        False on timeout or when ``proc`` has already terminated.
    """
    start = time.monotonic()
    while time.monotonic() - start < timeout:
        if proc is not None and proc.poll() is not None:
            return False
        try:
            req = urllib.request.Request(f"{BASE}/health")
            with urllib.request.urlopen(req, timeout=2) as http_resp:
                resp = json.loads(http_resp.read())
            if isinstance(resp, dict) and resp.get("status") in ("ok", "ready"):
                return True
        except _REQUEST_ERRORS:
            # Server not accepting connections yet or still loading; retry.
            pass
        time.sleep(5)
    return False


def bench(runs=2):
    """Time ``runs`` identical chat completions against the running server.

    Speed is ``completion_tokens`` (from the response's ``usage`` field)
    divided by the wall time of the whole HTTP round-trip.

    Args:
        runs: Number of requests to send.

    Returns:
        ``(average, best)`` tokens-per-second over the runs, or
        ``(0.0, 0.0)`` when ``runs`` is not positive.

    Raises:
        OSError, ValueError, http.client.HTTPException: if a request fails
            or the response body is not valid JSON.
    """
    if runs <= 0:
        return 0.0, 0.0

    # The request body is identical for every run, so encode it once.
    payload = json.dumps({
        "model": "m",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Write a short Python script."},
        ],
        "max_tokens": 100,
        "temperature": 0.0,
    }).encode('utf-8')

    speeds = []
    for _ in range(runs):
        req = urllib.request.Request(
            f"{BASE}/v1/chat/completions",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        t0 = time.perf_counter()
        with urllib.request.urlopen(req, timeout=600) as http_resp:
            resp = json.loads(http_resp.read())
        dt = time.perf_counter() - t0
        # "usage" may be absent or null; both count as zero tokens.
        tokens = (resp.get("usage") or {}).get("completion_tokens", 0)
        speeds.append(tokens / dt if dt > 0 else 0)
    return sum(speeds) / len(speeds), max(speeds)


def vram():
    """Return used memory per GPU as reported by ``nvidia-smi``.

    Returns:
        One integer per output line of
        ``nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits``,
        or ``[0, 0]`` when the command cannot be run, fails, times out, or
        prints nothing parseable.
    """
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used",
             "--format=csv,noheader,nounits"],
            timeout=30,
        ).decode()
        values = [int(line.strip()) for line in out.splitlines() if line.strip()]
    except (OSError, subprocess.SubprocessError, ValueError):
        return [0, 0]
    return values or [0, 0]


def main():
    """Run every configuration in ``CONFIGS`` and print the summary table."""
    try:
        sys.stdout.reconfigure(encoding='utf-8')
    except (AttributeError, ValueError, OSError):
        # stdout was replaced by an object that cannot be reconfigured.
        pass

    results = []
    for cfg in CONFIGS:
        kill()
        print(f"\n{'='*60}\nTesting: {cfg['name']}\n{'='*60}")

        cmd = BASE_CMD + cfg["extra"]
        proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        try:
            if not check_server(300, proc=proc):
                print(" FAILED TO BOOT (OOM?)")
                results.append({"name": cfg["name"], "status": "BOOT_FAIL"})
                continue

            # NOTE: no separate warm-up request is sent; the first timed run
            # therefore includes any cold-start cost.
            print(" Server ready! Warming up...")
            time.sleep(2)

            v = vram()
            # Guard against nvidia-smi reporting fewer than two GPUs.
            gpu1 = v[1] if len(v) > 1 else 0

            try:
                avg, _best = bench(runs=2)
            except _REQUEST_ERRORS as exc:
                print(f" BENCH FAILED: {exc}")
                results.append({"name": cfg["name"], "status": "BENCH_FAIL"})
                continue

            print(f" >>> AVG: {avg:.2f} t/s | VRAM GPU1: {gpu1}MB")
            results.append({
                "name": cfg["name"],
                "avg_tps": round(avg, 2),
                "vram_gpu1": gpu1,
                "status": "OK",
            })
        finally:
            # Always tear the server down, even if the benchmark raised.
            proc.terminate()
            kill()
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                pass

    print("\nFINAL RESULTS:")
    for r in results:
        if r["status"] == "OK":
            print(f" {r['name']:<25} {r['avg_tps']:>5} t/s | {r['vram_gpu1']:>5}MB")
        elif r["status"] == "BENCH_FAIL":
            print(f" {r['name']:<25} FAIL (benchmark error)")
        else:
            print(f" {r['name']:<25} FAIL (OOM)")


if __name__ == "__main__":
    main()