#!/usr/bin/env python
"""Test multiple -ts ratios to find which ones start normally (no OOM, PP enabled).

For every ratio in ``RATIOS`` the script rewrites the ``-ts`` argument of the
``balanced`` role in the engine config, restarts the engine, waits for it to
report a final state over HTTP and prints one summary row built from the
llama-server log.

The process management (PowerShell, ``llama-server.exe``, the interpreter
path) is Windows-specific.
"""
import http.client
import json
import re
import subprocess
import sys
import time
import urllib.error
import urllib.request
from pathlib import Path

ROOT = Path(__file__).parent.parent
CONFIG_FILE = ROOT / "config" / "engine_models.json"
LLAMA_LOG = ROOT / "logs" / "llama-server.log"
ENGINE_LOG = ROOT / "logs" / "engine_test.log"
PYTHON = r"C:\ProgramData\miniforge3\envs\variet-llm\python.exe"
ENGINE_SCRIPT = ROOT / "engine" / "variet_engine.py"
STATUS_URL = "http://localhost:8000/engine/status"

RATIOS = [
    ("0.5", "0.5"),
    ("0.48", "0.52"),
    ("0.47", "0.53"),
    ("0.45", "0.55"),
    ("0.43", "0.57"),
    ("0.40", "0.60"),
]

# Process filter used by kill_servers().  The PowerShell process running the
# filter carries the pattern text in its own command line, so it must be
# excluded via $PID or it would match (and stop) itself.  Both path
# separators are matched because start_engine() launches the script through
# str(ENGINE_SCRIPT), which uses backslashes on Windows.
_KILL_FILTER = (
    "$_.ProcessId -ne $PID -and ("
    "$_.CommandLine -like '*engine/variet_engine.py*' -or "
    "$_.CommandLine -like '*engine\\variet_engine.py*' -or "
    "$_.Name -eq 'llama-server.exe')"
)

# (log label, result-key suffix) for the per-device buffer sizes to extract.
_BUFFER_KINDS = (("model", "model"), ("KV", "kv"), ("compute", "compute"))
_DEVICES = ("CUDA0", "CUDA1")

_ROW_FORMAT = (
    "{:<14} {:<10} {:<9} {:<9} {:<9} {:<9} {:<9} {:<9} {:<9}"
)

try:
    sys.stdout.reconfigure(encoding="utf-8")
except (AttributeError, ValueError, OSError):
    # stdout may be replaced by an object that cannot be reconfigured.
    pass


def kill_servers():
    """Force-stop running engine and llama-server processes, then pause 2 s."""
    command = (
        "Get-WmiObject Win32_Process | Where-Object { " + _KILL_FILTER + " } "
        "| ForEach-Object { Stop-Process -Id $_.ProcessId -Force "
        "-ErrorAction SilentlyContinue }"
    )
    subprocess.run(
        ["powershell", "-NoProfile", "-Command", command],
        capture_output=True,
        check=False,
    )
    # Give the OS a moment to release ports and GPU memory.
    time.sleep(2)


def update_config(ts_a, ts_b, config_file=None):
    """Set the value following ``-ts`` in the ``balanced`` role to ``ts_a,ts_b``.

    Args:
        ts_a: First split value, as a string.
        ts_b: Second split value, as a string.
        config_file: Config path to edit; defaults to ``CONFIG_FILE``.

    Returns:
        True if a ``-ts`` argument with a value was found and rewritten,
        False if the config has no such argument (the file is left untouched).

    Raises:
        KeyError: If the config lacks ``roles -> balanced -> args``.
    """
    path = CONFIG_FILE if config_file is None else Path(config_file)
    with open(path, encoding="utf-8") as f:
        cfg = json.load(f)

    args = cfg["roles"]["balanced"]["args"]
    for i, arg in enumerate(args[:-1]):
        if arg == "-ts":
            args[i + 1] = f"{ts_a},{ts_b}"
            break
    else:
        return False

    with open(path, "w", encoding="utf-8") as f:
        json.dump(cfg, f, indent=2, ensure_ascii=False)
    return True


def start_engine():
    """Truncate both logs and launch the engine script.

    Returns:
        The ``subprocess.Popen`` handle of the engine; its stdout and stderr
        are redirected to ``ENGINE_LOG``.
    """
    LLAMA_LOG.parent.mkdir(parents=True, exist_ok=True)
    ENGINE_LOG.parent.mkdir(parents=True, exist_ok=True)
    LLAMA_LOG.write_text("", encoding="utf-8")
    # Opening with "wb" truncates the engine log.  The child gets its own
    # copy of the handle, so the parent's copy is closed once Popen returns.
    with open(ENGINE_LOG, "wb") as log_file:
        return subprocess.Popen(
            [PYTHON, str(ENGINE_SCRIPT)],
            cwd=str(ROOT),
            stdout=log_file,
            stderr=subprocess.STDOUT,
            # The flag only exists on Windows; 0 is the neutral value elsewhere.
            creationflags=getattr(subprocess, "CREATE_NEW_PROCESS_GROUP", 0),
        )


def _read_log(path):
    """Return the text of *path*, or None if it cannot be read."""
    try:
        return path.read_text(encoding="utf-8", errors="ignore")
    except OSError:
        return None


def _query_engine_state():
    """Return the ``state`` field of the engine status endpoint, or None.

    None means the endpoint was unreachable or returned something that is
    not a JSON object.
    """
    try:
        with urllib.request.urlopen(STATUS_URL, timeout=2) as response:
            data = json.loads(response.read())
    except (urllib.error.URLError, OSError, ValueError,
            http.client.HTTPException):
        return None
    return data.get("state") if isinstance(data, dict) else None


def _failure(status, tail_lines=20):
    """Build the (status, log_tail) pair for a non-ready outcome.

    The status is upgraded to ``'oom'`` when the llama-server log mentions
    an out-of-memory condition and the server never started listening.
    """
    text = _read_log(LLAMA_LOG) or ""
    if "out of memory" in text and "main: server is listening" not in text:
        status = "oom"
    return status, "\n".join(text.splitlines()[-tail_lines:])


def wait_for_result(timeout=180, proc=None, poll_interval=3):
    """Poll the engine until it reaches a final state.

    Args:
        timeout: Maximum number of seconds to wait.
        proc: Optional ``Popen`` handle of the engine.  When given, polling
            stops as soon as the process has exited and the status endpoint
            is unreachable, instead of waiting for the full timeout.
        poll_interval: Seconds to sleep before each status query.

    Returns:
        (status, log_tail) where status is 'ready'|'oom'|'error'|'timeout'.
        ``log_tail`` holds the last lines of the llama-server log for
        non-ready outcomes and is empty for 'ready'.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        time.sleep(poll_interval)
        state = _query_engine_state()
        if state == "ready":
            return "ready", ""
        if state == "error":
            return _failure("error")
        if state is None and proc is not None and proc.poll() is not None:
            # The engine died without ever reporting a state.
            return _failure("error")
    return _failure("timeout")


def analyze_log(log_path=None):
    """Extract start-up facts from the llama-server log.

    Args:
        log_path: Log file to read; defaults to ``LLAMA_LOG``.

    Returns:
        ``{}`` if the log cannot be read.  Otherwise a dict with the boolean
        keys ``pp_enabled``, ``pp_fallback``, ``oom`` and ``listening``, plus
        a float (MiB) under ``cuda{0,1}_{model,kv,compute}`` for each buffer
        size line found in the log.
    """
    text = _read_log(LLAMA_LOG if log_path is None else Path(log_path))
    if text is None:
        return {}

    result = {
        "pp_enabled": "pipeline parallelism enabled" in text,
        "pp_fallback": "retrying without pipeline parallelism" in text,
        "oom": "out of memory" in text,
        "listening": "main: server is listening" in text,
    }
    for label, suffix in _BUFFER_KINDS:
        for device in _DEVICES:
            match = re.search(
                rf"{device} {label} buffer size = +([0-9.]+) MiB", text
            )
            if match:
                result[f"{device.lower()}_{suffix}"] = float(match.group(1))
    return result


def _pp_label(info):
    """Summarise the pipeline-parallelism outcome recorded in *info*."""
    if info.get("pp_fallback"):
        return "FALLBACK"
    return "ON" if info.get("pp_enabled") else "?"


def _format_row(label, status, info):
    """Format one result row; buffer sizes missing from the log print as 0."""
    sizes = [
        f"{info.get(f'{device.lower()}_{suffix}', 0):.0f}"
        for _, suffix in _BUFFER_KINDS
        for device in _DEVICES
    ]
    return _ROW_FORMAT.format(label, status, _pp_label(info), *sizes)


def _stop_process(proc, grace=5):
    """Terminate *proc* and wait up to *grace* seconds before killing it."""
    proc.terminate()
    try:
        proc.wait(timeout=grace)
    except subprocess.TimeoutExpired:
        proc.kill()


def main():
    """Run every ratio in ``RATIOS`` and print one summary row per ratio.

    Returns:
        A list of ``{"ratio", "status", "info"}`` dicts, one per ratio.
    """
    results = []
    print(_ROW_FORMAT.format(
        "ratio", "status", "PP", "cuda0_m", "cuda1_m",
        "cuda0_kv", "cuda1_kv", "c0_comp", "c1_comp",
    ))
    print("-" * 110)

    try:
        for ts_a, ts_b in RATIOS:
            label = f"{ts_a},{ts_b}"
            kill_servers()
            if not update_config(ts_a, ts_b):
                print(
                    f"warning: no -ts argument found in {CONFIG_FILE}; "
                    f"ratio {label} was not applied",
                    file=sys.stderr,
                )
            proc = start_engine()
            try:
                status, _ = wait_for_result(timeout=180, proc=proc)
                # Read the log while the outcome of this run is still in it.
                info = analyze_log()
            finally:
                _stop_process(proc)
            print(_format_row(label, status, info))
            results.append({"ratio": label, "status": status, "info": info})
    finally:
        # Also runs on Ctrl-C so no llama-server is left holding GPU memory.
        kill_servers()

    print("\nDone.")
    return results


if __name__ == "__main__":
    main()