# variet_llm/scripts/test_ts_ratios.py

#!/usr/bin/env python
"""Test multiple -ts ratios to find which ones start normally (no OOM, PP enabled)."""
import subprocess
import time
import json
import urllib.request
import urllib.error
import sys
import re
from pathlib import Path

# Project root: the parent of the directory that contains this script.
ROOT = Path(__file__).parent.parent
# JSON config that update_config() reads and rewrites for every ratio.
CONFIG_FILE = ROOT / "config" / "engine_models.json"
# Log scanned by analyze_log() and truncated by start_engine() before each
# launch. Presumably written by llama-server -- the writer is not in this file.
LLAMA_LOG = ROOT / "logs" / "llama-server.log"
# Receives the engine process's combined stdout/stderr (see start_engine()).
ENGINE_LOG = ROOT / "logs" / "engine_test.log"
# Interpreter used to launch the engine.
# NOTE(review): absolute, machine-specific path -- adjust on other hosts.
PYTHON = r"C:\ProgramData\miniforge3\envs\variet-llm\python.exe"
# Engine entry point started by start_engine().
ENGINE_SCRIPT = ROOT / "engine" / "variet_engine.py"

# Candidate split pairs, tried in this order by main(). Kept as strings because
# they are joined as "a,b" and written verbatim after the "-ts" flag in the
# config. Every pair sums to 1.
RATIOS = [
    ("0.5", "0.5"),
    ("0.48", "0.52"),
    ("0.47", "0.53"),
    ("0.45", "0.55"),
    ("0.43", "0.57"),
    ("0.40", "0.60"),
]

# Best effort: switch stdout to UTF-8 when the stream supports reconfigure().
# Any failure is ignored and the stream keeps its existing encoding.
# Presumably aimed at consoles with a legacy code page -- TODO confirm.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except Exception:
    pass


def kill_servers():
    """Force-stop running engine and llama-server processes, then pause 2 s.

    Runs a PowerShell pipeline that enumerates ``Win32_Process`` and calls
    ``Stop-Process -Force`` on every process that is named
    ``llama-server.exe`` or whose command line contains
    ``engine/variet_engine.py`` (with either path separator).

    The PowerShell exit status and output are captured and discarded, so this
    is best effort: a failure to kill anything is not reported.
    """
    # An engine launched from a pathlib path on Windows has backslashes in its
    # command line, while a manual "python engine/variet_engine.py" has a
    # forward slash, so both spellings are matched.
    # The helper PowerShell process is excluded through $PID: its own command
    # line contains the search patterns and would otherwise match itself and
    # be force-stopped in the middle of the pipeline.
    ps_filter = (
        r"$_.ProcessId -ne $PID -and ("
        r"$_.CommandLine -like '*engine/variet_engine.py*' -or "
        r"$_.CommandLine -like '*engine\variet_engine.py*' -or "
        r"$_.Name -eq 'llama-server.exe')"
    )
    ps_command = (
        "Get-WmiObject Win32_Process | "
        "Where-Object { " + ps_filter + " } | "
        "ForEach-Object { Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue }"
    )
    subprocess.run(["powershell", "-Command", ps_command], capture_output=True)
    # Presumably gives the killed processes time to exit before the caller
    # starts a new engine.
    time.sleep(2)


def update_config(ts_a, ts_b):
    """Set the tensor-split value of the ``balanced`` role in CONFIG_FILE.

    Loads the JSON config, looks in ``cfg["roles"]["balanced"]["args"]`` for
    the first ``-ts`` / ``--tensor-split`` flag that is followed by a value,
    replaces that value with ``"<ts_a>,<ts_b>"`` and writes the file back
    (2-space indent, non-ASCII characters kept as-is).

    Args:
        ts_a: Left-hand part of the split value.
        ts_b: Right-hand part of the split value.

    Returns:
        True when a flag was found and the file was rewritten. False when no
        usable flag exists; in that case a warning is printed to stderr and
        the file is left untouched, so the caller can tell that the requested
        ratio was NOT applied.

    Raises:
        OSError, json.JSONDecodeError, KeyError: if the config cannot be read,
        parsed, or lacks the ``roles.balanced.args`` path.
    """
    with open(CONFIG_FILE, encoding="utf-8") as f:
        cfg = json.load(f)
    args = cfg["roles"]["balanced"]["args"]
    split_value = f"{ts_a},{ts_b}"

    # Scanning args[:-1] means a trailing flag with no value after it can
    # never be selected.
    for idx, flag in enumerate(args[:-1]):
        if flag in ("-ts", "--tensor-split"):
            args[idx + 1] = split_value
            break
    else:
        print(
            f"warning: no -ts/--tensor-split value in {CONFIG_FILE}; "
            f"ratio {split_value} was not applied",
            file=sys.stderr,
        )
        return False

    with open(CONFIG_FILE, "w", encoding="utf-8") as f:
        json.dump(cfg, f, indent=2, ensure_ascii=False)
    return True


def start_engine():
    """Start the engine script as a child process and return its handle.

    Empties LLAMA_LOG, then launches ``PYTHON ENGINE_SCRIPT`` with ROOT as the
    working directory, in a new process group, with stdout and stderr both
    redirected to a freshly truncated ENGINE_LOG.

    Returns:
        subprocess.Popen: handle of the started engine process.

    Raises:
        OSError: if a log file cannot be written or the interpreter cannot be
        started.
    """
    # Make sure the log directories exist so the first run on a clean
    # checkout does not fail before the engine is even started.
    LLAMA_LOG.parent.mkdir(parents=True, exist_ok=True)
    ENGINE_LOG.parent.mkdir(parents=True, exist_ok=True)
    LLAMA_LOG.write_text("")
    # Opening with "wb" creates/truncates ENGINE_LOG. The child receives its
    # own inherited handle, so the parent's copy is closed on leaving the
    # `with` block instead of leaking one open handle per launch.
    with open(ENGINE_LOG, "wb") as engine_log:
        return subprocess.Popen(
            [PYTHON, str(ENGINE_SCRIPT)],
            cwd=str(ROOT),
            stdout=engine_log,
            stderr=subprocess.STDOUT,
            creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
        )


def wait_for_result(timeout=180):
    """Poll the engine status endpoint until it settles or *timeout* elapses.

    Sleeps 3 seconds, then GETs ``http://localhost:8000/engine/status`` (2 s
    request timeout) and reads the ``"state"`` field of the JSON reply,
    repeating until the deadline passes.

    Args:
        timeout: Maximum number of seconds to keep polling. A value of zero
            or less returns ``("timeout", "")`` immediately.

    Returns:
        tuple: ``(status, log_tail)`` where ``status`` is ``'ready'``,
        ``'error'`` or ``'timeout'``. ``log_tail`` is always the empty string;
        it is kept so callers can keep unpacking two values. Out-of-memory
        conditions are not detected here.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments
    # (NTP sync, DST, manual changes) during the multi-minute wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        time.sleep(3)
        try:
            with urllib.request.urlopen("http://localhost:8000/engine/status", timeout=2) as r:
                state = json.loads(r.read()).get("state")
        except Exception:
            # Deliberately broad: connection refused, HTTP errors and
            # malformed replies all mean "not up yet", so keep polling.
            continue
        if state in ("ready", "error"):
            return state, ""
    return "timeout", ""


def analyze_log():
    """Summarise LLAMA_LOG as a dict of boolean markers and buffer sizes.

    Returns:
        dict: empty when the log file does not exist. Otherwise it always
        holds the boolean keys ``pp_enabled``, ``pp_fallback``, ``oom`` and
        ``listening`` (plain substring checks on the log text), plus a float
        MiB value under ``cuda{0,1}_{model,kv,compute}`` for each buffer-size
        line that is present. Only the first occurrence of each buffer-size
        line is used.
    """
    if not LLAMA_LOG.exists():
        return {}
    log_text = LLAMA_LOG.read_text(encoding="utf-8", errors="ignore")

    markers = (
        ("pp_enabled", "pipeline parallelism enabled"),
        ("pp_fallback", "retrying without pipeline parallelism"),
        ("oom", "out of memory"),
        ("listening", "main: server is listening"),
    )
    summary = {key: needle in log_text for key, needle in markers}

    size_patterns = (
        ("cuda0_model", r"CUDA0 model buffer size = +([0-9.]+) MiB"),
        ("cuda1_model", r"CUDA1 model buffer size = +([0-9.]+) MiB"),
        ("cuda0_kv", r"CUDA0 KV buffer size = +([0-9.]+) MiB"),
        ("cuda1_kv", r"CUDA1 KV buffer size = +([0-9.]+) MiB"),
        ("cuda0_compute", r"CUDA0 compute buffer size = +([0-9.]+) MiB"),
        ("cuda1_compute", r"CUDA1 compute buffer size = +([0-9.]+) MiB"),
    )
    for key, pattern in size_patterns:
        found = re.search(pattern, log_text)
        if found is not None:
            summary[key] = float(found.group(1))
    return summary


def main():
    """Try every ratio in RATIOS and print one table row per attempt.

    For each ratio: stop any running servers, write the ratio into the
    config, start the engine, wait up to 180 s for it to settle, parse the
    llama-server log, print a row, then terminate and reap the engine process.

    Servers are killed once more when the loop ends, including when it ends
    through an exception or Ctrl-C. The config file is NOT restored: it keeps
    the last ratio that was written.

    Returns:
        list[dict]: one ``{"ratio", "status", "info"}`` entry per tested
        ratio, in RATIOS order.
    """
    size_keys = (
        "cuda0_model", "cuda1_model",
        "cuda0_kv", "cuda1_kv",
        "cuda0_compute", "cuda1_compute",
    )
    results = []
    # The PP column is 9 wide so the longest label ("FALLBACK") stays aligned.
    print(f"{'ratio':<14} {'status':<10} {'PP':<9} {'cuda0_m':<9} {'cuda1_m':<9} {'cuda0_kv':<9} {'cuda1_kv':<9} {'c0_comp':<9} {'c1_comp':<9}")
    print("-" * 110)

    try:
        for ts_a, ts_b in RATIOS:
            label = f"{ts_a},{ts_b}"
            kill_servers()
            update_config(ts_a, ts_b)
            proc = start_engine()
            try:
                status, _ = wait_for_result(timeout=180)
                info = analyze_log()
            finally:
                # Always stop and reap the engine, even if polling or log
                # parsing failed, so no child process is left behind.
                proc.terminate()
                try:
                    proc.wait(timeout=10)
                except subprocess.TimeoutExpired:
                    proc.kill()
                    proc.wait()

            # Surface an out-of-memory failure explicitly. A run that ended
            # up "ready" keeps its status, and the PP column shows whether
            # it fell back.
            if status != "ready" and info.get("oom"):
                status = "oom"

            if info.get("pp_fallback"):
                pp = "FALLBACK"
            elif info.get("pp_enabled"):
                pp = "ON"
            else:
                pp = "?"

            # Missing sizes are shown as 0.
            cells = " ".join(f"{format(info.get(key, 0), '.0f'):<9}" for key in size_keys)
            print(f"{label:<14} {status:<10} {pp:<9} {cells}")
            results.append({"ratio": label, "status": status, "info": info})
    finally:
        kill_servers()

    print("\nDone.")
    return results


if __name__ == "__main__":
    main()