# variet_llm/scripts/dual_gpu_benchmark.py

"""
Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
=====================================================================
Tests 4 models across multiple parameter configurations to find
the absolute best model + settings for 256K context coding agent.

Models:
  1. Qwen3.5-35B-A3B  Q4_K_M     (~20.5 GB)
  2. Qwen3.5-35B-A3B  MXFP4_MOE  (~20.1 GB)
  3. Gemma4  26B-A4B   Q4_K_M     (~15.6 GB)
  4. Gemma4  26B-A4B   MXFP4_MOE  (~15.5 GB)

Test Phases (per model):
  Phase 0: Basic dual-GPU startup test (can it even boot at 256K?)
  Phase 1: GPU layer + MoE offload strategy sweep
  Phase 2: CPU thread sweep (carry best from P1)
  Phase 3: Batch size sweep (carry best from P1+P2)
  Phase 4: KV cache type sweep (carry best from P1+P2+P3)
  Phase 5: Final verification (5 runs)

Output: scripts/dual_gpu_results.json  (all raw data)
        scripts/dual_gpu_summary.txt   (human-readable winner)
"""
import subprocess
import time
import json
import urllib.request
import sys
import os
import datetime

# Best-effort switch of stdout to UTF-8. The script prints non-ASCII
# characters (e.g. "★", "→", medal emoji); presumably this avoids encoding
# errors on consoles whose default encoding cannot represent them.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    # stdout may not support reconfigure(); keep whatever encoding it has.
    pass

# ─── Configuration ───────────────────────────────────────────────
# Base URL used for the health probe and the completion requests. Its port
# must match the "--port" value hard-coded in build_cmd().
BASE_URL = "http://127.0.0.1:8000"
# Server executable, given as a Windows-style relative path (start_server()
# launches it with the current working directory as cwd).
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
# Context size handed to the server through "-c" for every configuration.
CONTEXT = 262144  # 256K
# Number of timed requests per configuration in test_config().
BENCHMARK_RUNS = 3
# Default "max_tokens" of a timed request in run_benchmark().
BENCHMARK_TOKENS = 200
SERVER_TIMEOUT = 300  # seconds to wait for server startup

# Models to benchmark, processed in list order by main(). Keys:
#   name          display name; copied into every result record
#   path          model file, Windows-style relative path
#   quant         quantisation label; copied into every result record
#   total_layers  used for the half-layer boot fallback and to bound the
#                 --n-cpu-moe sweep
#   type, is_mxfp4  descriptive only; not read anywhere in the visible script
MODELS = [
    {
        "name": "Qwen3.5-35B-A3B Q4_K_M",
        "path": r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf",
        "type": "qwen",
        "quant": "Q4_K_M",
        "is_mxfp4": False,
        "total_layers": 64,  # Qwen3.5 35B has 64 layers
    },
    {
        "name": "Qwen3.5-35B-A3B MXFP4_MOE",
        "path": r"models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf",
        "type": "qwen",
        "quant": "MXFP4_MOE",
        "is_mxfp4": True,
        "total_layers": 64,
    },
    {
        "name": "Gemma4 26B-A4B Q4_K_M",
        "path": r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf",
        "type": "gemma4",
        "quant": "Q4_K_M",
        "is_mxfp4": False,
        "total_layers": 30,  # Gemma4 26B has 30 layers
    },
    {
        "name": "Gemma4 26B-A4B MXFP4_MOE",
        "path": r"models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf",
        "type": "gemma4",
        "quant": "MXFP4_MOE",
        "is_mxfp4": True,
        "total_layers": 30,
    },
]

# Accumulates one record per successfully benchmarked configuration
# (appended by test_config() and phase5_final(), dumped to JSON by main()).
ALL_RESULTS = []


# ─── Utility Functions ──────────────────────────────────────────
def log(msg):
    """Print *msg* prefixed with the local time as ``[HH:MM:SS]``.

    The line is flushed immediately so progress stays visible while the
    script is blocked on a long-running server start or request.
    """
    stamp = f"{datetime.datetime.now():%H:%M:%S}"
    print(f"[{stamp}] {msg}", flush=True)


def kill_server():
    """Force-kill every ``llama-server.exe`` process, then pause five seconds.

    Uses the Windows ``taskkill`` tool. Its exit status is not checked, so the
    call is a no-op when no server is running. The pause presumably gives the
    terminated process time to release its resources before the next start.
    """
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    # Output is captured only to keep taskkill's messages off the console.
    subprocess.run(taskkill_cmd, capture_output=True)
    time.sleep(5)


def get_vram_all():
    """Query ``nvidia-smi`` for the memory usage of every GPU.

    Returns:
        A list with one dict per GPU line that could be parsed, of the form
        ``{"gpu": index, "used": value, "total": value}``, holding the
        integers nvidia-smi printed for ``index``, ``memory.used`` and
        ``memory.total`` (unit suffixes are suppressed via ``nounits``).
        The list is empty when nvidia-smi cannot be started, does not finish
        within five seconds, or prints nothing parseable.
    """
    query = [
        "nvidia-smi",
        "--query-gpu=index,memory.used,memory.total",
        "--format=csv,noheader,nounits",
    ]
    try:
        completed = subprocess.run(
            query, capture_output=True, text=True, timeout=5
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # Missing binary, timeout or undecodable output: VRAM reporting is
        # informational only, so degrade to "no data" instead of failing.
        return []

    gpus = []
    for line in completed.stdout.splitlines():
        fields = [field.strip() for field in line.split(",")]
        if len(fields) < 3:
            continue  # blank or malformed line
        try:
            index, used, total = (int(value) for value in fields[:3])
        except ValueError:
            # A non-numeric field (for example a "[N/A]" placeholder) only
            # drops this GPU; the remaining GPUs are still reported.
            continue
        gpus.append({"gpu": index, "used": used, "total": total})
    return gpus


def build_cmd(model_path, ngl, t, ub, b, ctk, ctv,
              cpu_moe=False, n_cpu_moe=0, prio=3, nommap=False):
    """Assemble the llama-server argument list for one configuration.

    Args:
        model_path: Value for ``--model``.
        ngl: Value for ``-ngl``.
        t: Thread count, used for both ``-t`` and ``-tb``.
        ub: Value for ``-ub``.
        b: Value for ``-b``.
        ctk: Value for ``--cache-type-k``.
        ctv: Value for ``--cache-type-v``.
        cpu_moe: When true, append ``--cpu-moe`` (takes precedence over
            ``n_cpu_moe``).
        n_cpu_moe: When positive and ``cpu_moe`` is false, append
            ``--n-cpu-moe <n>``.
        prio: Value for ``--prio``.
        nommap: When true, append ``--no-mmap``.

    Returns:
        The command as a list of strings, starting with ``LLAMA_SERVER``.
        Context size (``CONTEXT``), ``-np 1``, ``-fa on``, ``--poll 50``,
        ``--mlock``, port 8000 and host 0.0.0.0 are fixed.
    """
    threads = str(t)
    argv = [LLAMA_SERVER, "--model", model_path]
    argv += ["-ngl", str(ngl), "-c", str(CONTEXT), "-np", "1", "-fa", "on"]
    argv += ["--cache-type-k", ctk, "--cache-type-v", ctv]
    argv += ["-ub", str(ub), "-b", str(b)]
    argv += ["-t", threads, "-tb", threads]
    argv += ["--prio", str(prio), "--poll", "50", "--mlock"]
    argv += ["--port", "8000", "--host", "0.0.0.0"]

    # MoE offloading: the two options are mutually exclusive here.
    if cpu_moe:
        moe_args = ["--cpu-moe"]
    elif n_cpu_moe > 0:
        moe_args = ["--n-cpu-moe", str(n_cpu_moe)]
    else:
        moe_args = []
    argv += moe_args

    if nommap:
        argv += ["--no-mmap"]
    return argv


def start_server(model_path, **kwargs):
    """Launch llama-server for *model_path* without waiting for it to be ready.

    Args:
        model_path: Model file passed to ``build_cmd``.
        **kwargs: Remaining ``build_cmd`` arguments (ngl, t, ub, b, ctk, ctv
            and the optional MoE / prio / nommap settings).

    Returns:
        The ``subprocess.Popen`` handle of the started server.
    """
    cmd = build_cmd(model_path, **kwargs)
    log(f"  CMD: {' '.join(cmd[-20:])}")  # show last 20 args
    # The server's output is discarded rather than piped: nothing in this
    # script reads it, and a pipe that is never drained eventually fills up
    # and blocks the child process on its next write.
    return subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
        cwd=os.getcwd(),
    )


def wait_for_server(timeout=SERVER_TIMEOUT):
    """Poll the server's ``/health`` endpoint until it reports ``"ok"``.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        ``(True, seconds_waited)`` once the health response carries
        ``status == "ok"``, otherwise ``(False, timeout)`` after the deadline.
        Each probe uses a 3-second request timeout and failed probes are
        followed by a 3-second sleep; any probe error counts as "not ready".
    """
    began = time.time()
    while True:
        if not (time.time() - began < timeout):
            break
        try:
            probe = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(probe, timeout=3) as reply:
                body = json.loads(reply.read())
                healthy = body.get("status") == "ok"
                if healthy:
                    return True, time.time() - began
        except Exception:
            # Connection refused, HTTP error, bad JSON, ...: try again later.
            pass
        time.sleep(3)
    return False, timeout


def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Send one chat-completion request and measure its throughput.

    Args:
        max_tokens: ``max_tokens`` value placed in the request.

    Returns:
        A dict with ``tps`` (completion tokens divided by the wall-clock
        duration of the whole request, so prompt processing is included),
        ``completion_tokens``, ``prompt_tokens`` and ``elapsed`` seconds.
        Token counts come from the response's ``usage`` object and default
        to 0 when absent.

    Raises:
        Whatever ``urllib`` / ``json`` raise on a failed or malformed
        response; the request itself times out after 600 seconds.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user",
                      "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    t0 = time.time()
    with urllib.request.urlopen(request, timeout=600) as reply:
        answer = json.loads(reply.read())
    elapsed = time.time() - t0

    usage = answer.get("usage", {})
    completion = usage.get("completion_tokens", 0)
    prompt = usage.get("prompt_tokens", 0)
    if elapsed > 0:
        rate = completion / elapsed
    else:
        rate = 0
    return {
        "tps": rate,
        "completion_tokens": completion,
        "prompt_tokens": prompt,
        "elapsed": elapsed,
    }


def test_config(model_info, label, **kwargs):
    """Boot the server with one configuration, benchmark it, record the result.

    Any running server is killed first. After a successful boot one short
    warmup request is sent, followed by ``BENCHMARK_RUNS`` timed requests.

    Args:
        model_info: Entry of ``MODELS``; ``path``, ``name`` and ``quant`` are
            read.
        label: Short tag used in log lines and stored in the result.
        **kwargs: Server settings forwarded to ``start_server`` and stored
            verbatim as the result's ``params``.

    Returns:
        The result dict (also appended to ``ALL_RESULTS``), or ``None`` when
        the server did not become healthy in time or every timed run failed.
    """
    kill_server()
    log(f"  [{label}] Starting server...")

    proc = start_server(model_info["path"], **kwargs)
    # try/finally guarantees the server is killed even when something below
    # raises (including Ctrl-C), so it cannot be left running with the model
    # still loaded.
    try:
        ok, boot_time = wait_for_server()
        if not ok:
            log(f"  [{label}] FAILED to start (timeout {SERVER_TIMEOUT}s)")
            return None

        vram = get_vram_all()
        vram_str = " | ".join(
            f"GPU{g['gpu']}:{g['used']}/{g['total']}MiB" for g in vram
        )
        log(f"  [{label}] Boot: {boot_time:.0f}s | VRAM: {vram_str}")

        # Warmup: its timing is discarded. A failure is not fatal (the timed
        # runs below will report their own errors) but is worth seeing.
        try:
            run_benchmark(max_tokens=20)
        except Exception as e:
            log(f"  [{label}] Warmup failed ({e}); continuing")

        # Benchmark runs
        speeds = []
        for run_no in range(1, BENCHMARK_RUNS + 1):
            try:
                r = run_benchmark()
            except Exception as e:
                log(f"    Run {run_no}: ERROR ({e})")
            else:
                speeds.append(r["tps"])
                log(f"    Run {run_no}: {r['tps']:.2f} t/s")
    finally:
        proc.kill()

    if not speeds:
        log(f"  [{label}] ALL BENCHMARK RUNS FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    log(f"  [{label}] => AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

    result = {
        "model": model_info["name"],
        "quant": model_info["quant"],
        "label": label,
        "avg_tps": round(avg, 2),
        "best_tps": round(best, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": kwargs,
    }
    ALL_RESULTS.append(result)
    return result


# ─── Phase Runners ───────────────────────────────────────────────

def phase0_boot_test(model):
    """Find a first configuration that boots and benchmarks for *model*.

    Tries, in order: all layers on GPU (``ngl=999``), the same with
    ``--cpu-moe``, and finally half of ``model["total_layers"]`` on GPU.
    All attempts use t=6, ub=512, b=2048 and q4_0 KV cache.

    Returns:
        The result of the first attempt that succeeds, or ``None`` when all
        three fail.
    """
    rule = "=" * 70
    log("\n" + rule)
    log(f"  PHASE 0: Boot Test — {model['name']}")
    log(rule)

    def attempt(tag, ngl, **extra):
        # Shared baseline settings; only the GPU/MoE placement varies.
        return test_config(
            model, tag,
            ngl=ngl, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            **extra,
        )

    # Baseline: everything on the GPUs.
    outcome = attempt("boot-ngl999", 999)

    if not outcome:
        # Second try: same placement, but with --cpu-moe.
        log("  Full GPU failed, trying with --cpu-moe...")
        outcome = attempt("boot-cpumoe", 999, cpu_moe=True)

    if not outcome:
        # Last resort: only half of the layers on the GPUs.
        log("  --cpu-moe also failed, trying reduced layers...")
        outcome = attempt("boot-ngl-half", model["total_layers"] // 2)

    return outcome


def phase1_gpu_offload(model, baseline):
    """Compare MoE offload strategies and return the fastest result.

    Args:
        model: Entry of ``MODELS``.
        baseline: Phase 0 result (or a falsy value); when present it takes
            part in the comparison.

    Returns:
        The result with the highest ``avg_tps`` among the baseline and all
        configurations that ran successfully, or ``None`` if there is none.
    """
    rule = "=" * 70
    log("\n" + rule)
    log(f"  PHASE 1: GPU Offload Strategy — {model['name']}")
    log(rule)

    candidates = [baseline] if baseline else []
    layer_count = model["total_layers"]

    # Which cpu_moe setting the boot test already measured, if any.
    boot_measured = bool(baseline) and baseline["label"] in ("boot-ngl999", "boot-cpumoe")
    measured_flag = baseline["params"].get("cpu_moe", False) if boot_measured else None

    # Strategy A: all layers on GPU, with and without --cpu-moe.
    for flag in (True, False):
        if boot_measured and measured_flag == flag:
            continue  # identical to the baseline run
        outcome = test_config(
            model, f"ngl=999 cpu_moe={flag}",
            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            cpu_moe=flag,
        )
        if outcome:
            candidates.append(outcome)

    # Strategy B: --n-cpu-moe sweep, limited to values not above the layer
    # count.
    # NOTE(review): n=0 produces the same server command as cpu_moe=False
    # (build_cmd adds no MoE flag for either) -- confirm the repeat is wanted.
    for offloaded in [n for n in (0, 5, 10, 15, 20) if not n > layer_count]:
        outcome = test_config(
            model, f"n-cpu-moe={offloaded}",
            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            n_cpu_moe=offloaded,
        )
        if outcome:
            candidates.append(outcome)

    if len(candidates) == 0:
        log("  PHASE 1: No configuration worked!")
        return None

    winner = max(candidates, key=lambda entry: entry["avg_tps"])
    log(f"\n  ★ Phase 1 winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner


def phase2_threads(model, prev_best):
    """Sweep the CPU thread count while keeping all other settings fixed.

    Args:
        model: Entry of ``MODELS``.
        prev_best: Best result so far; its ``params`` supply the fixed
            settings and it competes with the new measurements.

    Returns:
        The result with the highest ``avg_tps`` (possibly ``prev_best``).
    """
    rule = "=" * 70
    log("\n" + rule)
    log(f"  PHASE 2: CPU Thread Sweep — {model['name']}")
    log(rule)

    locked = prev_best["params"]
    already_measured = locked.get("t", 6)
    candidates = [prev_best]

    for threads in (2, 4, 6, 8, 10, 12):
        if threads != already_measured:
            settings = {
                "ngl": locked["ngl"],
                "t": threads,
                "ub": locked["ub"],
                "b": locked["b"],
                "ctk": locked["ctk"],
                "ctv": locked["ctv"],
                "cpu_moe": locked.get("cpu_moe", False),
                "n_cpu_moe": locked.get("n_cpu_moe", 0),
            }
            outcome = test_config(model, f"t={threads}", **settings)
            if outcome:
                candidates.append(outcome)

    winner = max(candidates, key=lambda entry: entry["avg_tps"])
    log(f"\n  ★ Phase 2 winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner


def phase3_batch(model, prev_best):
    """Sweep ``-ub`` / ``-b`` combinations with all other settings fixed.

    Args:
        model: Entry of ``MODELS``.
        prev_best: Best result so far; its ``params`` supply the fixed
            settings and it competes with the new measurements.

    Returns:
        The result with the highest ``avg_tps`` (possibly ``prev_best``).
    """
    rule = "=" * 70
    log("\n" + rule)
    log(f"  PHASE 3: Batch Size Sweep — {model['name']}")
    log(rule)

    locked = prev_best["params"]
    threads = locked["t"]
    candidates = [prev_best]

    grid = (
        (128, 512), (256, 1024), (256, 2048),
        (512, 1024), (512, 2048), (512, 4096),
        (1024, 2048), (1024, 4096),
    )
    for micro, batch in grid:
        # The pair carried over from the previous phase is already measured.
        if micro != locked["ub"] or batch != locked["b"]:
            settings = {
                "ngl": locked["ngl"],
                "t": threads,
                "ub": micro,
                "b": batch,
                "ctk": locked["ctk"],
                "ctv": locked["ctv"],
                "cpu_moe": locked.get("cpu_moe", False),
                "n_cpu_moe": locked.get("n_cpu_moe", 0),
            }
            outcome = test_config(model, f"ub={micro} b={batch}", **settings)
            if outcome:
                candidates.append(outcome)

    winner = max(candidates, key=lambda entry: entry["avg_tps"])
    log(f"\n  ★ Phase 3 winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner


def phase4_kvcache(model, prev_best):
    """Sweep KV-cache type pairs with all other settings fixed.

    Args:
        model: Entry of ``MODELS``.
        prev_best: Best result so far; its ``params`` supply the fixed
            settings and it competes with the new measurements.

    Returns:
        The result with the highest ``avg_tps`` (possibly ``prev_best``).
        Only speed is compared.
    """
    rule = "=" * 70
    log("\n" + rule)
    log(f"  PHASE 4: KV Cache Type Sweep — {model['name']}")
    log(rule)

    locked = prev_best["params"]
    candidates = [prev_best]

    pairs = (("q4_0", "q4_0"), ("q8_0", "q8_0"), ("q4_0", "q8_0"), ("f16", "f16"))
    for k_type, v_type in pairs:
        # The pair carried over from the previous phase is already measured.
        if k_type != locked["ctk"] or v_type != locked["ctv"]:
            settings = {
                "ngl": locked["ngl"],
                "t": locked["t"],
                "ub": locked["ub"],
                "b": locked["b"],
                "ctk": k_type,
                "ctv": v_type,
                "cpu_moe": locked.get("cpu_moe", False),
                "n_cpu_moe": locked.get("n_cpu_moe", 0),
            }
            outcome = test_config(model, f"kv={k_type}/{v_type}", **settings)
            if outcome:
                candidates.append(outcome)

    winner = max(candidates, key=lambda entry: entry["avg_tps"])
    log(f"\n  ★ Phase 4 winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner


def phase5_final(model, prev_best):
    """Re-measure the winning configuration with five timed runs.

    Args:
        model: Entry of ``MODELS``; ``path``, ``name`` and ``quant`` are read.
        prev_best: Best result of the earlier phases; its ``params`` are used
            to start the server.

    Returns:
        A new result labelled ``FINAL-<model name>`` (also appended to
        ``ALL_RESULTS``), or ``prev_best`` unchanged when the server does not
        start or every final run fails.
    """
    log(f"\n{'='*70}")
    log(f"  PHASE 5: Final Verification (5 runs) — {model['name']}")
    log(f"{'='*70}")

    p = prev_best["params"]
    kill_server()
    proc = start_server(model["path"], **p)
    # try/finally guarantees the server is killed even when something below
    # raises (including Ctrl-C), so it cannot be left running with the model
    # still loaded.
    try:
        ok, boot_time = wait_for_server()
        if not ok:
            log("  FAILED to start for final verification!")
            return prev_best

        vram = get_vram_all()

        # Warmup: its timing is discarded. A failure is not fatal (the timed
        # runs below will report their own errors) but is worth seeing.
        try:
            run_benchmark(max_tokens=20)
        except Exception as e:
            log(f"    Warmup failed ({e}); continuing")

        speeds = []
        for run_no in range(1, 6):
            try:
                r = run_benchmark()
            except Exception as e:
                log(f"    Final Run {run_no}: ERROR ({e})")
            else:
                speeds.append(r["tps"])
                log(f"    Final Run {run_no}: {r['tps']:.2f} t/s")
    finally:
        proc.kill()

    if not speeds:
        # Nothing could be measured; fall back to the earlier measurement.
        log("  All final runs failed; keeping the pre-verification result")
        return prev_best

    avg = sum(speeds) / len(speeds)
    best_tps = max(speeds)
    log(f"\n  ★ FINAL: AVG {avg:.2f} t/s | BEST {best_tps:.2f} t/s")

    final = {
        "model": model["name"],
        "quant": model["quant"],
        "label": f"FINAL-{model['name']}",
        "avg_tps": round(avg, 2),
        "best_tps": round(best_tps, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": p,
    }
    ALL_RESULTS.append(final)
    return final


# ─── Main ────────────────────────────────────────────────────────

def run_full_benchmark_for_model(model):
    """Run all benchmark phases for a single model.

    Args:
        model: Entry of ``MODELS``.

    Returns:
        The final result for the model, the Phase 0 baseline when Phase 1
        yields nothing, or ``None`` when the model file is missing or the
        model cannot boot at all.
    """
    # Determine the size first and tolerate a missing/unreadable file, so a
    # missing model is skipped instead of raising before the check is reached.
    try:
        size_gb = os.path.getsize(model["path"]) / 1024**3
    except OSError:
        size_gb = None

    log(f"\n{'#'*70}")
    log(f"  MODEL: {model['name']}")
    log(f"  File:  {model['path']}")
    if size_gb is not None:
        log(f"  Size:  {size_gb:.2f} GB")
    log(f"{'#'*70}")

    # Check model exists
    if size_gb is None:
        log("  SKIP: Model file not found!")
        return None

    # Phase 0: Can it boot?
    baseline = phase0_boot_test(model)
    if not baseline:
        log(f"  SKIP: {model['name']} cannot boot at 256K context!")
        return None

    # Phase 1: GPU offload strategy
    best = phase1_gpu_offload(model, baseline)
    if not best:
        return baseline

    # Phases 2-4 each carry the previous winner forward:
    # CPU threads, then batch sizes, then KV cache types.
    best = phase2_threads(model, best)
    best = phase3_batch(model, best)
    best = phase4_kvcache(model, best)

    # Phase 5: Final verification
    return phase5_final(model, best)


_RESULTS_PATH = "scripts/dual_gpu_results.json"
_SUMMARY_PATH = "scripts/dual_gpu_summary.txt"


def _save_results():
    """Write every result collected so far to the JSON results file."""
    with open(_RESULTS_PATH, "w") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)


def _recommended_command(champion):
    """Return the llama-server command line for the winning result."""
    p = champion["params"]
    model_path = next(m["path"] for m in MODELS if m["name"] == champion["model"])
    cmd_parts = [
        f"llama-server --model {model_path}",
        f"-ngl {p['ngl']} -c {CONTEXT}",
        f"-t {p['t']} -tb {p['t']}",
        f"-ub {p['ub']} -b {p['b']}",
        "-fa on",
        f"--cache-type-k {p['ctk']} --cache-type-v {p['ctv']}",
        f"--prio {p.get('prio', 3)} --poll 50",
        "--mlock",
    ]
    if p.get("cpu_moe"):
        cmd_parts.append("--cpu-moe")
    elif p.get("n_cpu_moe", 0) > 0:
        cmd_parts.append(f"--n-cpu-moe {p['n_cpu_moe']}")
    if p.get("nommap"):
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")
    return " ".join(cmd_parts)


def _build_summary(model_winners, elapsed):
    """Return the human-readable ranking text for winners sorted best-first."""
    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    lines = [
        f"Dual-GPU Benchmark Results — {datetime.datetime.now().isoformat()}",
        "Hardware: 2x RTX 3060 12GB | Context: 256K",
        f"Total configs tested: {len(ALL_RESULTS)}",
        f"Total time: {elapsed:.1f} minutes",
        "",
        "=" * 60,
        "  RANKING (by AVG t/s)",
        "=" * 60,
    ]

    for rank, w in enumerate(model_winners, 1):
        p = w["params"]
        lines.append(f"\n  {medals.get(rank, '  ')} #{rank}: {w['model']}")
        lines.append(f"      AVG: {w['avg_tps']:.2f} t/s | BEST: {w['best_tps']:.2f} t/s")
        lines.append(f"      Boot: {w['boot_time']:.0f}s")
        lines.append(f"      ngl={p['ngl']} t={p['t']} ub={p['ub']} b={p['b']}")
        lines.append(f"      ctk={p['ctk']} ctv={p['ctv']}")
        if p.get("cpu_moe"):
            lines.append("      --cpu-moe")
        elif p.get("n_cpu_moe", 0) > 0:
            lines.append(f"      --n-cpu-moe {p['n_cpu_moe']}")

    champion = model_winners[0]
    lines.append(f"\n{'='*60}")
    lines.append(f"  ★ CHAMPION: {champion['model']}")
    lines.append(f"    {champion['avg_tps']:.2f} t/s average")
    lines.append("=" * 60)

    lines.append("\n  Recommended command:")
    lines.append(f"    {_recommended_command(champion)}")
    return "\n".join(lines)


def main():
    """Benchmark every model in ``MODELS`` and write the result files.

    Results are saved to the JSON file after each model, so a later crash
    does not lose earlier measurements. When at least one model produced a
    result, a ranked summary with a recommended command is printed and
    written to the summary file.
    """
    start_time = time.time()

    # Make sure the output directory exists before the long benchmark run
    # starts, rather than failing at the first save.
    os.makedirs(os.path.dirname(_RESULTS_PATH), exist_ok=True)

    log("=" * 70)
    log("  DUAL-GPU COMPREHENSIVE MODEL BENCHMARK")
    log("  2x RTX 3060 (24GB Total) | 256K Context")
    log(f"  Models: {len(MODELS)}")
    log(f"  Started: {datetime.datetime.now().isoformat()}")
    log("=" * 70)

    # Show GPU info
    for g in get_vram_all():
        log(f"  GPU {g['gpu']}: {g['used']}/{g['total']} MiB used")

    # Run benchmarks for each model
    model_winners = []
    for position, model in enumerate(MODELS, 1):
        log(f"\n{'='*70}")
        log(f"  STARTING MODEL {position}/{len(MODELS)}: {model['name']}")
        log(f"{'='*70}")

        winner = run_full_benchmark_for_model(model)
        if winner:
            model_winners.append(winner)

        # Save intermediate results
        _save_results()
        log(f"  Intermediate results saved ({len(ALL_RESULTS)} configs tested)")

    # ─── Grand Final Comparison ──────────────────────────────────
    elapsed = (time.time() - start_time) / 60

    log(f"\n{'='*70}")
    log("  GRAND FINAL COMPARISON")
    log(f"  Total time: {elapsed:.1f} minutes")
    log(f"  Configs tested: {len(ALL_RESULTS)}")
    log(f"{'='*70}")

    if not model_winners:
        log("  No models were able to run at 256K context!")
        return

    # Sort by avg t/s, fastest first
    model_winners.sort(key=lambda x: x["avg_tps"], reverse=True)

    summary = _build_summary(model_winners, elapsed)
    print(summary)

    with open(_SUMMARY_PATH, "w", encoding="utf-8") as f:
        f.write(summary)
    _save_results()

    log(f"\n  Results: {_RESULTS_PATH}")
    log(f"  Summary: {_SUMMARY_PATH}")
    log("  DONE!")

    kill_server()

# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()