"""
Gemma4 26B --n-cpu-moe sweep + secondary param tuning at 256K context.
Gemma4 has 30 layers. Sweep n-cpu-moe from 0 to 30.
"""
import subprocess, time, json, urllib.request, sys, os

# Switch stdout to UTF-8 so the non-ASCII symbols printed by the report
# ("★", "→") can be written even when the console's default encoding cannot
# represent them. This is best effort: stdout may have been replaced by an
# object without reconfigure(), or the stream may refuse the change.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError, OSError):
    # Narrow on purpose: a bare except would also hide KeyboardInterrupt.
    pass

# Address of the llama-server instance this script launches and queries.
BASE_URL = "http://127.0.0.1:8000"

# Executable and model paths, relative to the directory the script is run from.
SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"

# Context size handed to the server via -c (256K = 262144).
CTX = 256 * 1024

# Number of timed benchmark requests per configuration.
RUNS = 3


def kill():
    """Force-kill every process named llama-server.exe, then pause 4 seconds.

    Uses the Windows ``taskkill`` utility. Its exit status and output are
    ignored, so calling this when no server is running is harmless.
    """
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    # Output is captured only to keep taskkill's messages off the console.
    subprocess.run(taskkill_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Presumably gives the process time to exit and release its memory
    # before the next server instance is launched.
    time.sleep(4)


def start(ncpumoe, t=4, ub=512, b=2048, ctk="q4_0", ctv="q4_0", prio=3, nommap=False):
    """Launch llama-server with the given tuning parameters.

    The server is started on port 8000 with the module-level ``SERVER``,
    ``MODEL`` and ``CTX`` settings; this function does not wait for it to
    become ready.

    Args:
        ncpumoe: Value for ``--n-cpu-moe``; the flag is omitted when <= 0.
        t: Thread count, passed to both ``-t`` and ``-tb``.
        ub: Value for ``-ub``.
        b: Value for ``-b``.
        ctk: Value for ``--cache-type-k``.
        ctv: Value for ``--cache-type-v``.
        prio: Value for ``--prio``.
        nommap: When true, append ``--no-mmap``.

    Returns:
        The ``subprocess.Popen`` handle of the server. Its output is
        discarded, so ``stdout``/``stderr`` on the handle are ``None``.
    """
    cmd = [
        SERVER, "--model", MODEL, "-ngl", "999",
        "-c", str(CTX), "-np", "1", "-fa", "on",
        "--cache-type-k", ctk, "--cache-type-v", ctv,
        "-ub", str(ub), "-b", str(b), "-t", str(t), "-tb", str(t),
        "--prio", str(prio), "--poll", "50",
        "--mlock", "--port", "8000", "--host", "0.0.0.0",
    ]
    if ncpumoe > 0:
        cmd.extend(["--n-cpu-moe", str(ncpumoe)])
    if nommap:
        cmd.append("--no-mmap")
    # Nobody reads the server's log output. Sending it to a PIPE that is never
    # drained can block the server once the OS pipe buffer fills up, so the
    # output is discarded instead.
    return subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT,
                            cwd=os.getcwd())


def wait_ready(timeout=240):
    """Poll the server's ``/health`` endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as the endpoint answers with JSON containing
        ``"status": "ok"``; False if that has not happened within ``timeout``
        seconds.
    """
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(urllib.request.Request(f"{BASE_URL}/health"), timeout=3) as r:
                if json.loads(r.read()).get("status") == "ok":
                    return True
        except Exception:
            # Connection refused, HTTP errors while the model loads, malformed
            # bodies, etc. all mean "not ready yet". Unlike a bare except this
            # still lets Ctrl+C (KeyboardInterrupt) abort the wait.
            pass
        time.sleep(2)
    return False


def bench(n=200):
    """Send one chat-completion request and return the measured tokens/second.

    The elapsed time covers the whole request (sending it, generation, and
    reading the response), so the figure is end-to-end throughput rather
    than pure decode speed.

    Args:
        n: ``max_tokens`` for the request.

    Returns:
        ``completion_tokens`` from the response's ``usage`` section divided
        by the elapsed seconds; 0 when the response carries no usage data.

    Raises:
        Whatever ``urllib``/``json`` raise on connection, HTTP or decoding
        failures; callers are expected to handle those.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user",
                      "content": "Count from 1 to 50, each number on new line."}],
        "max_tokens": n,
        "temperature": 0.0,
    }).encode()
    req = urllib.request.Request(f"{BASE_URL}/v1/chat/completions", data=payload,
                                 headers={"Content-Type": "application/json"})
    # perf_counter is the high-resolution monotonic clock intended for
    # measuring intervals; time.time() can jump and may be coarse.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=300) as resp:
        res = json.loads(resp.read())
    dt = time.perf_counter() - t0
    # "usage" may be absent or null; treat both as "no tokens reported".
    ct = (res.get("usage") or {}).get("completion_tokens", 0)
    return ct / dt if dt > 0 else 0


def vram():
    """Return ``(used, total)`` GPU memory as reported by ``nvidia-smi``.

    ``nvidia-smi`` prints one ``used, total`` line per GPU; the values are
    summed over all GPUs so multi-GPU machines report a meaningful figure
    instead of failing to parse. Values are in the unit nvidia-smi uses for
    ``nounits`` output (MiB per its documentation).

    Returns:
        A ``(used, total)`` tuple of ints, or ``(0, 0)`` when ``nvidia-smi``
        is missing, times out, or produces output that cannot be parsed.
    """
    try:
        r = subprocess.run(["nvidia-smi", "--query-gpu=memory.used,memory.total",
                            "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5)
        used = total = 0
        for line in r.stdout.splitlines():
            if not line.strip():
                continue
            a, b = line.split(",")
            used += int(a.strip())
            total += int(b.strip())
        return used, total
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: nvidia-smi not installed; SubprocessError: timeout;
        # ValueError: unexpected output such as "[N/A]".
        return 0, 0


def test(label, ncpumoe, **kw):
    """Benchmark one server configuration and return its result record.

    Kills any running server, starts a fresh one with the given settings,
    waits for it to become ready, records VRAM usage, runs one short
    warm-up request and then ``RUNS`` timed requests. The server is always
    killed again before returning, even if an exception or Ctrl+C occurs.

    Args:
        label: Human-readable name printed in the progress line and stored
            in the result.
        ncpumoe: ``--n-cpu-moe`` value forwarded to ``start``.
        **kw: Extra keyword arguments forwarded to ``start``; they are also
            copied into the result record.

    Returns:
        A dict with ``label``, ``ncpumoe``, ``avg``, ``best``, ``vram`` and
        the forwarded keyword arguments, or None when the server never became
        ready or every timed request failed.
    """
    kill()
    print(f"  [{label}] Starting...", end=" ", flush=True)
    p = start(ncpumoe, **kw)
    speeds = []
    try:
        if not wait_ready():
            print("FAILED")
            return None
        vu, vt = vram()
        print(f"VRAM:{vu}/{vt} | ", end="", flush=True)
        # Warm-up request; its result and any failure are deliberately ignored.
        try:
            bench(20)
        except Exception:
            pass
        for _ in range(RUNS):
            # A failed run is skipped rather than aborting the configuration.
            # `except Exception` (not bare) so Ctrl+C still stops the script.
            try:
                speeds.append(bench())
            except Exception:
                pass
    finally:
        # Never leave the server running, whatever happened above.
        p.kill()
        try:
            p.wait(timeout=10)
        except subprocess.TimeoutExpired:
            pass
    if not speeds:
        print("BENCH FAILED")
        return None
    avg, best = sum(speeds) / len(speeds), max(speeds)
    print(f"AVG:{avg:.1f} BEST:{best:.1f} t/s")
    return {"label": label, "ncpumoe": ncpumoe, "avg": avg, "best": best,
            "vram": vu, **kw}


def main():
    """Run the full tuning sweep and save every result as JSON.

    Phases:
        1.  Coarse ``--n-cpu-moe`` sweep (0..30 in steps of 5).
        1b. Fine sweep around the best coarse value (skipped when it is 0).
        2.  Thread-count sweep at the best ``--n-cpu-moe``.
        3.  Batch-size sweep (``-ub`` / ``-b``) with the best thread count.
        4.  KV-cache type sweep with the best thread count.

    ``--no-mmap`` is enabled for every run whose ``--n-cpu-moe`` exceeds 15.
    The best configuration overall is printed and all results are written to
    ``scripts/tune_results_gemma4_ncpumoe.json``.

    Exits with a non-zero status if the coarse sweep yields no result at all.
    """
    n_layers = 30  # upper bound for --n-cpu-moe (layer count per module docstring)
    out_path = "scripts/tune_results_gemma4_ncpumoe.json"

    print("=" * 60)
    print("  Gemma4 26B 256K | --n-cpu-moe Sweep + Param Tune")
    print("=" * 60)
    results = []

    # Phase 1: n-cpu-moe sweep (0, 5, 10, 15, 20, 25, 30)
    print("\n--- Phase 1: --n-cpu-moe sweep ---")
    for n in [0, 5, 10, 15, 20, 25, 30]:
        nm = n > 15  # use --no-mmap when heavy CPU offload
        r = test(f"ncpumoe={n}", n, nommap=nm)
        if r:
            results.append(r)

    # Without at least one successful run there is nothing to tune around;
    # stop with a clear message instead of crashing in max() on an empty list.
    if not results:
        sys.exit("  No configuration produced a benchmark result; aborting.")

    # Find best n-cpu-moe
    best_r = max(results, key=lambda x: x["avg"])
    best_n = best_r["ncpumoe"]
    print(f"\n  ★ Best n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")

    # Fine-tune around best
    if best_n > 0:
        print(f"\n--- Phase 1b: Fine-tune around ncpumoe={best_n} ---")
        # Clamp every neighbour to [0, n_layers] and drop duplicates, so the
        # sweep never asks for more layers than exist or repeats a value.
        candidates = []
        for delta in (-3, -1, 1, 3):
            n = min(n_layers, max(0, best_n + delta))
            if n != best_n and n not in candidates:
                candidates.append(n)
        for n in candidates:
            nm = n > 15
            r = test(f"ncpumoe={n}", n, nommap=nm)
            if r:
                results.append(r)
        best_r = max(results, key=lambda x: x["avg"])
        best_n = best_r["ncpumoe"]
        print(f"\n  ★ Refined n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")

    # Phase 2: Thread sweep at best n-cpu-moe
    nm = best_n > 15
    print(f"\n--- Phase 2: Thread sweep (ncpumoe={best_n}) ---")
    for t in [2, 4, 6, 8, 10]:
        r = test(f"t={t}", best_n, t=t, nommap=nm)
        if r:
            results.append(r)
    # Earlier results at best_n carry no "t" key; they ran with start()'s
    # default of 4 threads, hence the fallback below.
    best_t = max((x for x in results if x["ncpumoe"] == best_n), key=lambda x: x["avg"])
    bt = best_t.get("t", 4)
    print(f"\n  ★ Best threads: {bt}")

    # Phase 3: Batch sweep
    print("\n--- Phase 3: Batch sweep ---")
    for ub, b in [(256, 1024), (512, 2048), (512, 4096), (1024, 2048)]:
        r = test(f"ub={ub},b={b}", best_n, t=bt, ub=ub, b=b, nommap=nm)
        if r:
            results.append(r)

    # Phase 4: KV cache type
    print("\n--- Phase 4: KV cache type ---")
    for ctk, ctv in [("q4_0", "q4_0"), ("q8_0", "q8_0")]:
        r = test(f"kv={ctk}", best_n, t=bt, ctk=ctk, ctv=ctv, nommap=nm)
        if r:
            results.append(r)

    # Final report
    best_all = max(results, key=lambda x: x["avg"])
    print(f"\n{'='*60}")
    print(f"  FINAL BEST: {best_all['label']} → {best_all['avg']:.1f} t/s (VRAM: {best_all['vram']})")
    print(f"{'='*60}")

    # Create the output directory first so a missing "scripts/" folder does
    # not throw away the results of the whole sweep.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)
    print(f"  Saved: {out_path}")


# Run the sweep only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()