# Python source file (124 lines, 3.7 KiB)
import time
|
|
import json
|
|
import urllib.request
|
|
import sys
|
|
|
|
# Best-effort: switch stdout to UTF-8 encoding.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except AttributeError:
    # The current stdout object has no reconfigure() method; leave its
    # encoding as it is.
    pass


# Root URL of the server that the health check and benchmark requests target.
BASE_URL = "http://127.0.0.1:8000"
|
|
|
|
def check_server(*, base_url=None, timeout=5):
    """Report whether the server's health endpoint says it is up.

    Requests ``{base_url}/health`` and parses the response body as JSON.

    Args:
        base_url: Root URL of the server to probe. ``None`` (the default)
            means the module-level ``BASE_URL``.
        timeout: Seconds to wait for the health request before giving up.

    Returns:
        ``True`` only when the response is a JSON object whose ``"status"``
        value is ``"ok"``; ``False`` for any request, decoding or format
        failure.
    """
    root = BASE_URL if base_url is None else base_url
    try:
        req = urllib.request.Request(f"{root}/health")
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = json.loads(resp.read())
    except Exception:
        # Best-effort probe: any failure simply means "not ready yet".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so Ctrl-C still stops a polling loop.
        return False
    # A valid JSON body that is not an object (list, string, ...) is "not ok".
    return isinstance(data, dict) and data.get("status") == "ok"
|
|
|
|
def run_benchmark(
    prompt,
    max_tokens=100,
    label="Test",
    *,
    base_url=None,
    timeout=300,
):
    """Send one chat-completion request and time the full round trip.

    POSTs a JSON payload to ``{base_url}/v1/chat/completions`` and measures
    the time from sending the request until the response body is parsed.

    Args:
        prompt: Text sent as the single ``user`` message.
        max_tokens: Value of the ``max_tokens`` field in the request.
        label: Free-form name copied into the result for reporting.
        base_url: Root URL of the server. ``None`` (the default) means the
            module-level ``BASE_URL``.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A dict with the keys ``label``, ``prompt_tokens``,
        ``completion_tokens``, ``elapsed`` (seconds), ``gen_tps_approx``
        (completion tokens divided by ``elapsed``) and ``content_preview``
        (first 100 characters of the reply).

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        KeyError, IndexError: If the response has no ``choices[0].message``.
    """
    root = BASE_URL if base_url is None else base_url
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        # Presumably zero so that repeated runs produce comparable output.
        "temperature": 0.0,
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{root}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )

    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    # "or" fallbacks also cover fields that are present but null in the
    # response, which .get(key, default) alone would pass through as None.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens") or 0
    completion_tokens = usage.get("completion_tokens") or 0

    # Approximate only: elapsed covers the whole request, including the
    # server's prompt evaluation, not just token generation.
    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:100],
    }
|
|
|
|
def _wait_for_server(attempts=30, delay=2):
    """Poll check_server() until it succeeds; return whether it ever did."""
    for attempt in range(1, attempts + 1):
        if check_server():
            print(" -> Server is ready!")
            return True
        print(f" -> Waiting for server... ({attempt}/{attempts})")
        time.sleep(delay)
    print(f" -> ERROR: Server not responding after {attempts * delay}s")
    return False


def _print_summary(results):
    """Print average, minimum and maximum tokens/second over *results*."""
    rates = [r["gen_tps_approx"] for r in results]
    avg_tps = sum(rates) / len(rates)
    print()
    print("=" * 60)
    print(" RESULTS SUMMARY")
    print("=" * 60)
    print(f" Runs: {len(results)}")
    print(f" Avg TPS: {avg_tps:.2f} t/s (approx, includes prompt eval)")
    print(f" Min TPS: {min(rates):.2f} t/s")
    print(f" Max TPS: {max(rates):.2f} t/s")
    print()
    print(" NOTE: Check server console for exact generation t/s")
    print(" (the 'eval time' line shows pure token generation speed)")
    print("=" * 60)


def main():
    """Run the benchmark: wait for the server, warm up, then time 3 runs.

    Returns:
        ``0`` when at least one benchmark run succeeded, ``1`` when the
        server never became healthy or every run failed.
    """
    runs = 3

    print("=" * 60)
    print(" LLM Performance Benchmark Tool")
    print("=" * 60)
    print()

    # Wait for server
    print("[1/3] Checking server health...")
    if not _wait_for_server():
        return 1

    # Warmup: a failure here is reported but does not abort the benchmark.
    print()
    print("[2/3] Warmup run (short)...")
    try:
        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f" -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
    except Exception as exc:
        print(f" -> Warmup failed: {exc}")

    # Main benchmark
    print()
    print("[3/3] Running main benchmark...")
    print("-" * 60)

    test_prompt = "Count from 1 to 50, writing each number on a new line."

    results = []
    for run_no in range(1, runs + 1):
        print(f" Run {run_no}/{runs}...")
        try:
            r = run_benchmark(test_prompt, max_tokens=200, label=f"Run {run_no}")
        except Exception as exc:
            # One failed run should not discard the others.
            print(f" ERROR: {exc}")
            continue
        results.append(r)
        print(f" Tokens: {r['completion_tokens']} | "
              f"Time: {r['elapsed']:.2f}s | "
              f"Speed: {r['gen_tps_approx']:.2f} t/s (approx)")

    if not results:
        # Without this the script would end silently with a success status.
        print(" ERROR: All benchmark runs failed")
        return 1

    _print_summary(results)
    return 0


if __name__ == "__main__":
    # Propagate main()'s result as the process exit status.
    sys.exit(main())
|