"""Benchmark client for a local chat-completions HTTP server.

The script waits for ``{BASE_URL}/health`` to report ``{"status": "ok"}``,
sends one short warmup request, then sends three identical requests to
``{BASE_URL}/v1/chat/completions`` and prints an approximate tokens-per-second
summary. The reported speed is wall-clock based, so it includes prompt
evaluation and HTTP overhead in addition to token generation.
"""
import json
import sys
import time
import urllib.request

try:
    # Emit UTF-8 regardless of the console's default encoding.
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    # stdout object has no reconfigure(); keep whatever encoding it uses.
    pass

BASE_URL = "http://127.0.0.1:8000"


def check_server():
    """Return True if the health endpoint answers with ``status == "ok"``.

    Any ordinary failure (connection refused, timeout, HTTP error, invalid
    JSON, unexpected payload shape) is treated as "not ready" and yields
    False. ``KeyboardInterrupt`` and ``SystemExit`` are deliberately NOT
    caught, so the user can still abort while the caller is polling.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        return data.get("status") == "ok"
    except Exception:
        return False


def build_result(result, elapsed, label):
    """Convert a decoded chat-completions response into a benchmark record.

    Args:
        result: Decoded JSON response; must contain
            ``choices[0]["message"]``.
        elapsed: Wall-clock seconds the request took.
        label: Free-form name for this run.

    Returns:
        A dict with keys ``label``, ``prompt_tokens``, ``completion_tokens``,
        ``elapsed``, ``gen_tps_approx`` and ``content_preview`` (first 100
        characters of the reply).

    Raises:
        KeyError, IndexError: If ``choices[0]["message"]`` is missing.
    """
    # A present-but-null "content" / "usage" / token count would otherwise
    # leak None into slicing and arithmetic, so coalesce each of them.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens") or 0
    completion_tokens = usage.get("completion_tokens") or 0

    # Guard against a zero (or nonsensical) duration instead of dividing.
    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:100],
    }


def run_benchmark(prompt, max_tokens=100, label="Test"):
    """Send one chat-completion request and return its timing record.

    Args:
        prompt: User message text to send.
        max_tokens: Value passed as ``max_tokens`` in the request body.
        label: Name stored in the returned record.

    Returns:
        The dict produced by :func:`build_result`.

    Raises:
        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise on
        network/HTTP/decoding failure, plus KeyError/IndexError when the
        response lacks ``choices[0]["message"]``.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )

    # perf_counter is monotonic, so clock adjustments cannot skew the timing.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=300) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    return build_result(result, elapsed, label)


def main():
    """Run the health check, the warmup and three timed runs, printing results."""
    print("=" * 60)
    print(" LLM Performance Benchmark Tool")
    print("=" * 60)
    print()

    # Wait for server: up to 30 probes, 2 seconds apart.
    print("[1/3] Checking server health...")
    for i in range(30):
        if check_server():
            print(" -> Server is ready!")
            break
        print(f" -> Waiting for server... ({i+1}/30)")
        time.sleep(2)
    else:
        # Loop finished without a successful probe.
        print(" -> ERROR: Server not responding after 60s")
        return

    # Warmup: a failure here is reported but does not stop the benchmark.
    print()
    print("[2/3] Warmup run (short)...")
    try:
        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f" -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
    except Exception as e:
        print(f" -> Warmup failed: {e}")

    # Main benchmark
    print()
    print("[3/3] Running main benchmark...")
    print("-" * 60)

    test_prompt = "Count from 1 to 50, writing each number on a new line."

    results = []
    for i in range(3):
        print(f" Run {i+1}/3...")
        try:
            r = run_benchmark(test_prompt, max_tokens=200, label=f"Run {i+1}")
            results.append(r)
            print(f" Tokens: {r['completion_tokens']} | "
                  f"Time: {r['elapsed']:.2f}s | "
                  f"Speed: {r['gen_tps_approx']:.2f} t/s (approx)")
        except Exception as e:
            # Keep going so one failed run does not discard the others.
            print(f" ERROR: {e}")

    if results:
        print()
        print("=" * 60)
        print(" RESULTS SUMMARY")
        print("=" * 60)

        speeds = [r["gen_tps_approx"] for r in results]
        avg_tps = sum(speeds) / len(speeds)
        max_tps = max(speeds)
        min_tps = min(speeds)

        print(f" Runs: {len(results)}")
        print(f" Avg TPS: {avg_tps:.2f} t/s (approx, includes prompt eval)")
        print(f" Min TPS: {min_tps:.2f} t/s")
        print(f" Max TPS: {max_tps:.2f} t/s")
        print()
        print(" NOTE: Check server console for exact generation t/s")
        print(" (the 'eval time' line shows pure token generation speed)")
        print("=" * 60)


if __name__ == "__main__":
    main()