"""Benchmark with long prompts to measure prompt processing (prefill) speed.""" import json import time import urllib.request import sys try: sys.stdout.reconfigure(encoding="utf-8") except Exception: pass BASE_SENTENCE = ( "The history of computing is a vast and multifaceted journey that spans millennia, " "from the earliest mechanical calculating aids to the sophisticated digital systems of today. " "It begins with simple counting devices like the abacus, which originated in ancient Mesopotamia " "around 2300 BCE and was later refined by Chinese and Roman civilizations. " "These early tools laid the conceptual groundwork for mechanical computation. " ) def make_prompt(seed): # each seed produces a slightly different long prompt to defeat caching unique = f"Session {seed}. Random seed value: {seed * 31337 + 17}. " long_text = unique + (BASE_SENTENCE * 40) return ( "Read the following text carefully, then answer in exactly one short sentence:\n\n" f"{long_text}\n\n" "Question: What is the main subject of the text above? Answer in one short sentence only." ) def bench(label, seed, gen_tokens=150): payload = { "model": "balanced", "messages": [{"role": "user", "content": make_prompt(seed)}], "max_tokens": gen_tokens, "stream": False, "temperature": 0.3, } req = urllib.request.Request( "http://localhost:8000/v1/chat/completions", data=json.dumps(payload).encode(), headers={"Content-Type": "application/json"}, ) t0 = time.time() with urllib.request.urlopen(req, timeout=600) as r: d = json.loads(r.read()) total = time.time() - t0 t = d.get("timings", {}) print(f"[{label}]") print(f" prompt: {t['prompt_n']:>5} tok {t['prompt_ms']:>7.0f} ms {t['prompt_per_second']:>7.2f} t/s") print(f" gen: {t['predicted_n']:>5} tok {t['predicted_ms']:>7.0f} ms {t['predicted_per_second']:>7.2f} t/s") print(f" total: {total:.2f} s") return t if __name__ == "__main__": label = sys.argv[1] if len(sys.argv) > 1 else "run" results = [] for i in range(3): t = bench(f"{label} #{i+1}", seed=i + 1) results.append(t) print() if results: avg_prompt = sum(r["prompt_per_second"] for r in results) / len(results) avg_gen = sum(r["predicted_per_second"] for r in results) / len(results) print(f"=== [{label}] AVG === prompt: {avg_prompt:.2f} t/s | gen: {avg_gen:.2f} t/s")