# variet_llm/scripts/bench_long.py

"""Benchmark with long prompts to measure prompt processing (prefill) speed."""
import json
import time
import urllib.request
import sys

def _force_utf8_stdout():
    """Try to switch ``sys.stdout`` to UTF-8, ignoring any failure."""
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        # Deliberate best-effort: if stdout cannot be reconfigured the script
        # simply keeps whatever encoding it already had.
        # NOTE(review): presumably meant for consoles whose default encoding
        # is not UTF-8 -- confirm.
        pass


_force_utf8_stdout()

BASE_SENTENCE = (
    "The history of computing is a vast and multifaceted journey that spans millennia, "
    "from the earliest mechanical calculating aids to the sophisticated digital systems of today. "
    "It begins with simple counting devices like the abacus, which originated in ancient Mesopotamia "
    "around 2300 BCE and was later refined by Chinese and Roman civilizations. "
    "These early tools laid the conceptual groundwork for mechanical computation. "
)


def make_prompt(seed, repeats=40):
    """Build a long, seed-specific prompt for the prefill benchmark.

    Args:
        seed: Number embedded in the text (both as-is and as
            ``seed * 31337 + 17``) so that different seeds give different
            prompts.
        repeats: How many copies of ``BASE_SENTENCE`` to include. Defaults to
            40, the length this script has always used; pass another value to
            benchmark shorter or longer prompts.

    Returns:
        The full prompt string: an instruction line, the long text, and a
        closing question.
    """
    # Each seed produces a slightly different long prompt to defeat caching.
    unique = f"Session {seed}. Random seed value: {seed * 31337 + 17}. "
    long_text = unique + (BASE_SENTENCE * repeats)
    return (
        "Read the following text carefully, then answer in exactly one short sentence:\n\n"
        f"{long_text}\n\n"
        "Question: What is the main subject of the text above? Answer in one short sentence only."
    )


def bench(
    label,
    seed,
    gen_tokens=150,
    *,
    url="http://localhost:8000/v1/chat/completions",
    model="balanced",
    timeout=600,
):
    """Run one long-prompt chat completion and print its timing statistics.

    Args:
        label: Text printed in brackets as the heading for this run.
        seed: Passed to ``make_prompt`` to get a run-specific prompt.
        gen_tokens: Value sent as ``max_tokens`` in the request.
        url: Chat-completions endpoint to POST to.
        model: Value sent as ``model`` in the request.
        timeout: Socket timeout in seconds for the HTTP request.

    Returns:
        The ``timings`` dict taken from the server's JSON response.

    Raises:
        KeyError: If the response has no ``timings`` object or it lacks one of
            the fields this function prints.
        urllib.error.URLError: Propagated from ``urlopen`` on connection or
            HTTP failures.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": make_prompt(seed)}],
        "max_tokens": gen_tokens,
        "stream": False,
        "temperature": 0.3,
    }
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as r:
        d = json.loads(r.read())
    total = time.perf_counter() - t0

    # NOTE(review): "timings" looks like a llama.cpp-server style extension to
    # the chat-completions response; other backends may not send it -- confirm.
    t = d.get("timings") or {}
    required = (
        "prompt_n",
        "prompt_ms",
        "prompt_per_second",
        "predicted_n",
        "predicted_ms",
        "predicted_per_second",
    )
    missing = [key for key in required if key not in t]
    if missing:
        # Fail with a message that names the real problem instead of a bare
        # KeyError on the first missing field.
        raise KeyError(
            f"response from {url} has no usable 'timings' data "
            f"(missing: {', '.join(missing)})"
        )

    print(f"[{label}]")
    print(f"  prompt: {t['prompt_n']:>5} tok  {t['prompt_ms']:>7.0f} ms  {t['prompt_per_second']:>7.2f} t/s")
    print(f"  gen:    {t['predicted_n']:>5} tok  {t['predicted_ms']:>7.0f} ms  {t['predicted_per_second']:>7.2f} t/s")
    print(f"  total:  {total:.2f} s")
    return t


def main():
    """Run three benchmark passes and print the average throughput.

    The run label comes from the first command-line argument and defaults to
    "run". Each pass uses its 1-based pass number as the prompt seed.
    """
    cli_args = sys.argv[1:]
    label = cli_args[0] if cli_args else "run"

    results = []
    for run_no in (1, 2, 3):
        results.append(bench(f"{label} #{run_no}", seed=run_no))
        print()

    if not results:
        return

    count = len(results)
    avg_prompt = sum(r["prompt_per_second"] for r in results) / count
    avg_gen = sum(r["predicted_per_second"] for r in results) / count
    print(f"=== [{label}] AVG === prompt: {avg_prompt:.2f} t/s | gen: {avg_gen:.2f} t/s")


if __name__ == "__main__":
    main()