# Source: variet_llm/scripts/perf_test_122b.py
# (File-viewer header as captured: 170 lines, 5.9 KiB, Python.)

import time
import json
import urllib.request
import sys
import os
import re
# Switch stdout to UTF-8 so the non-ASCII status symbols printed by main()
# can be encoded. Streams that do not offer ``reconfigure`` are left as-is.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# Root URL of the server under test; endpoint paths are appended to it.
BASE_URL = "http://127.0.0.1:8000"
def check_server():
    """Return True when the server's ``/health`` endpoint reports ``"ok"``.

    Sends a GET request to ``{BASE_URL}/health`` with a 5-second timeout,
    parses the response body as JSON and compares its ``"status"`` field
    to ``"ok"``.

    Returns:
        bool: ``True`` if the request succeeded and the status is ``"ok"``;
        ``False`` if the request, the JSON decoding or the payload lookup
        failed, or if the status has any other value.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        return data.get("status") == "ok"
    except Exception:
        # Best-effort probe: any ordinary failure means "not ready yet".
        # ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate, so Ctrl+C still
        # aborts a caller that polls this function in a loop.
        return False
def check_slots():
    """Fetch and return the decoded JSON body of the server's ``/slots`` endpoint.

    Sends a GET request to ``{BASE_URL}/slots`` with a 5-second timeout.

    Returns:
        The parsed JSON payload on success (its exact schema is defined by
        the server and is not inspected here), or ``None`` if the request
        or the JSON decoding failed.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/slots")
        with urllib.request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read())
    except Exception:
        # Slot information is optional diagnostics, so ordinary failures
        # are reported as None. ``Exception`` (rather than a bare
        # ``except``) keeps KeyboardInterrupt and SystemExit from being
        # swallowed.
        return None
def run_benchmark(prompt, max_tokens=300, label="Test"):
    """Send one chat-completion request and measure how long it takes.

    POSTs a single-message conversation to
    ``{BASE_URL}/v1/chat/completions`` with ``temperature`` 0.0 and waits
    up to 600 seconds for the complete (non-streamed) response.

    Args:
        prompt: Text sent as the sole user message.
        max_tokens: Value of the ``max_tokens`` field in the request.
        label: Free-form name copied into the result for reporting.

    Returns:
        dict: ``label``, ``prompt_tokens``, ``completion_tokens`` (both
        taken from the response's ``usage`` object, 0 when absent),
        ``elapsed`` (seconds for the whole request, including reading and
        parsing the response), ``gen_tps_approx`` (completion tokens
        divided by ``elapsed``, so it also counts prompt processing time)
        and ``content_preview`` (first 150 characters of the reply).

    Raises:
        urllib.error.URLError: If the request fails or times out.
        KeyError, IndexError: If the response lacks ``choices[0].message``.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode("utf-8")
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )

    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted, which would corrupt the measurement.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    # ``or`` (instead of a .get() default) also covers fields that are
    # present but JSON null, which would otherwise break the slicing and
    # the division below.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens") or 0
    completion_tokens = usage.get("completion_tokens") or 0
    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:150],
    }
def main():
    """Run the full benchmark and print a human-readable report.

    Steps:
        1. Poll ``check_server()`` every 5 seconds for up to 10 minutes.
        2. Print the number of entries returned by ``check_slots()``, if any.
        3. Issue one short warmup request (a warmup failure is reported but
           does not abort the benchmark).
        4. Issue one 300-token request per test prompt, then print
           average / min / max approximate tokens-per-second and a verdict
           against the 10 t/s target.

    Returns:
        int: 0 if at least one benchmark run completed; 1 if the server
        never became ready or every run failed. The value is used as the
        process exit code when the file is executed as a script.
    """
    max_wait = 600      # seconds to wait for the server, 10 minutes max
    poll_interval = 5   # seconds between health checks
    max_tokens = 300    # generation length requested for each timed run

    print("=" * 70)
    print(" Qwen3.5 122B-A10B Performance Benchmark")
    print(" Target: 10+ t/s generation speed")
    print("=" * 70)
    print()

    # Wait for server (model loading takes 3-5 min for 71 GB)
    print("[1/4] Waiting for server (122B model load takes 3-5 min)...")
    for attempt in range(max_wait // poll_interval):
        waited = attempt * poll_interval
        if check_server():
            print(f" -> Server is ready! (waited {waited}s)")
            break
        # Progress line on every 6th poll (every 30 s at a 5 s interval).
        if attempt % 6 == 0:
            print(f" -> Loading model... ({waited}s / {max_wait}s)")
        time.sleep(poll_interval)
    else:
        # Loop finished without ``break``: the server never reported ready.
        print(f" -> ERROR: Server not responding after {max_wait}s")
        return 1

    # Check server info
    print()
    print("[2/4] Checking server status...")
    slots = check_slots()
    if slots:
        print(f" -> Slots available: {len(slots)}")

    # Warmup
    print()
    print("[3/4] Warmup run (short, pre-heating GPU caches)...")
    try:
        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f" -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
        print(f" -> Warmup speed: {warmup['gen_tps_approx']:.2f} t/s (includes prompt eval)")
    except Exception as e:
        # Deliberately non-fatal: the timed runs below are still attempted.
        print(f" -> Warmup failed: {e}")

    # Main benchmark - one run per prompt for statistical reliability
    test_prompts = [
        "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.",
        "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.",
        "Describe the complete process of photosynthesis in plants. Include both light-dependent and light-independent reactions.",
        "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.",
        "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.",
    ]
    num_runs = len(test_prompts)

    print()
    print(f"[4/4] Running main benchmark ({num_runs} runs x {max_tokens} tokens)...")
    print("-" * 70)

    results = []
    for run_no, prompt in enumerate(test_prompts, start=1):
        print(f"\n Run {run_no}/{num_runs}: {prompt[:50]}...")
        try:
            r = run_benchmark(prompt, max_tokens=max_tokens, label=f"Run {run_no}")
        except Exception as e:
            # One failed request must not abort the remaining runs.
            print(f" ERROR: {e}")
            continue
        results.append(r)
        print(f" Completion tokens: {r['completion_tokens']}")
        print(f" Total time: {r['elapsed']:.2f}s")
        print(f" Approx speed: {r['gen_tps_approx']:.2f} t/s (includes prompt eval)")

    if not results:
        # Every run failed: say so explicitly and signal failure to the shell.
        print()
        print(" -> ERROR: No benchmark runs completed")
        return 1

    print()
    print("=" * 70)
    print(" RESULTS SUMMARY - Qwen3.5 122B-A10B")
    print("=" * 70)

    tps_values = [r["gen_tps_approx"] for r in results]
    # Mean of the per-run rates (not total tokens / total time).
    avg_tps = sum(tps_values) / len(tps_values)
    max_tps = max(tps_values)
    min_tps = min(tps_values)
    total_tokens = sum(r["completion_tokens"] for r in results)
    total_time = sum(r["elapsed"] for r in results)

    print(f" Runs completed: {len(results)}/{num_runs}")
    print(f" Total tokens: {total_tokens}")
    print(f" Total time: {total_time:.1f}s")
    print()
    print(f" Approx TPS (avg): {avg_tps:.2f} t/s")
    print(f" Approx TPS (min): {min_tps:.2f} t/s")
    print(f" Approx TPS (max): {max_tps:.2f} t/s")
    print()

    # Verdict
    if avg_tps >= 10:
        print(" ✅ TARGET ACHIEVED: 10+ t/s!")
    elif avg_tps >= 8:
        print(" ⚠️ CLOSE TO TARGET: Consider further tuning")
    else:
        print(f" ❌ BELOW TARGET: {avg_tps:.1f} t/s < 10 t/s")

    print()
    print(" ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.")
    print(" ⚡ Check the server console/log for exact 'eval time' t/s value,")
    print(" ⚡ which shows pure token generation speed (always higher).")
    print("=" * 70)
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so callers can detect a failed benchmark.
    sys.exit(main())