"""Benchmark token-generation throughput of a local chat-completions server.

The script talks to the HTTP server at ``BASE_URL`` using only the standard
library. It waits for ``/health`` to report ``ok``, queries ``/slots``, does one
warmup request and then times several ``/v1/chat/completions`` requests.

The reported tokens-per-second figure is approximate: it divides completion
tokens by the total request wall time, which also includes prompt evaluation.
"""

import time
import json
import urllib.request
import sys
import os
import re

# Best effort: make stdout UTF-8 so the emoji in the summary can be printed on
# consoles with a legacy default encoding. Streams that lack ``reconfigure``
# or refuse it are left untouched.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError, OSError):
    pass

BASE_URL = "http://127.0.0.1:8000"

# Average speed (tokens/second) that counts as reaching the target, and the
# lower bound that is still reported as "close to target".
TARGET_TPS = 10
CLOSE_TPS = 8


def check_server():
    """Return True if ``GET {BASE_URL}/health`` answers with status "ok".

    Any failure (connection refused, timeout, HTTP error, malformed body)
    yields False so callers can simply poll.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        return data.get("status") == "ok"
    except Exception:
        # Deliberately broad: this is a probe. Unlike a bare ``except`` it
        # does not swallow KeyboardInterrupt/SystemExit, so Ctrl-C still
        # aborts the wait loop.
        return False


def check_slots():
    """Return the decoded JSON body of ``GET {BASE_URL}/slots``.

    Returns None if the request or the JSON decoding fails.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/slots")
        with urllib.request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read())
    except Exception:
        # Same reasoning as in check_server(): best effort, but interruptible.
        return None


def tokens_per_second(completion_tokens, elapsed):
    """Return ``completion_tokens / elapsed``, or 0 when ``elapsed`` is not positive."""
    return completion_tokens / elapsed if elapsed > 0 else 0


def run_benchmark(prompt, max_tokens=300, label="Test"):
    """Send one chat-completion request and return its timing results.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Value of the ``max_tokens`` request field.
        label: Free-form name copied into the result.

    Returns:
        A dict with the keys ``label``, ``prompt_tokens``,
        ``completion_tokens``, ``elapsed`` (seconds for the whole request),
        ``gen_tps_approx`` (completion tokens divided by ``elapsed``) and
        ``content_preview`` (first 150 characters of the reply).

    Raises:
        Whatever ``urllib`` / ``json`` raise on transport or decoding errors,
        and ``KeyError`` / ``IndexError`` if the response has no
        ``choices[0].message``. Callers are expected to handle these.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    # perf_counter() is monotonic, so the measurement cannot be distorted by
    # wall-clock adjustments during a long request.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    # Both fields may be present but null in the JSON; normalise them so the
    # slicing and lookups below cannot fail on None.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": tokens_per_second(completion_tokens, elapsed),
        "content_preview": content[:150]
    }


def summarize_results(results):
    """Aggregate a non-empty list of ``run_benchmark`` results.

    Returns:
        A dict with ``avg_tps``, ``min_tps``, ``max_tps`` (over
        ``gen_tps_approx``), ``total_tokens`` (sum of ``completion_tokens``)
        and ``total_time`` (sum of ``elapsed``).

    Raises:
        ValueError: If ``results`` is empty.
    """
    if not results:
        raise ValueError("results must not be empty")
    speeds = [r["gen_tps_approx"] for r in results]
    return {
        "avg_tps": sum(speeds) / len(speeds),
        "min_tps": min(speeds),
        "max_tps": max(speeds),
        "total_tokens": sum(r["completion_tokens"] for r in results),
        "total_time": sum(r["elapsed"] for r in results),
    }


def format_verdict(avg_tps):
    """Return the one-line verdict for an average speed in tokens/second."""
    if avg_tps >= TARGET_TPS:
        return f" ✅ TARGET ACHIEVED: {TARGET_TPS}+ t/s!"
    if avg_tps >= CLOSE_TPS:
        return " ⚠️ CLOSE TO TARGET: Consider further tuning"
    return f" ❌ BELOW TARGET: {avg_tps:.1f} t/s < {TARGET_TPS} t/s"


def main():
    """Wait for the server, warm it up, run the benchmark and print a summary."""
    banner = "=" * 70
    num_runs = 5
    max_tokens = 300

    print(banner)
    print(" Qwen3.5 122B-A10B Performance Benchmark")
    print(f" Target: {TARGET_TPS}+ t/s generation speed")
    print(banner)
    print()

    # Wait for server (model loading takes 3-5 min for 71 GB)
    print("[1/4] Waiting for server (122B model load takes 3-5 min)...")
    max_wait = 600  # 10 minutes max
    for i in range(max_wait // 5):
        if check_server():
            print(f" -> Server is ready! (waited {i*5}s)")
            break
        if i % 6 == 0:
            # Progress line every 6th poll (~30 s) to keep the log short.
            print(f" -> Loading model... ({i*5}s / {max_wait}s)")
        time.sleep(5)
    else:
        # Loop finished without ``break``: the server never became healthy.
        print(f" -> ERROR: Server not responding after {max_wait}s")
        return

    # Check server info
    print()
    print("[2/4] Checking server status...")
    slots = check_slots()
    if slots:
        print(f" -> Slots available: {len(slots)}")

    # Warmup; a failure here is reported but does not stop the benchmark.
    print()
    print("[3/4] Warmup run (short, pre-heating GPU caches)...")
    try:
        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f" -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
        print(f" -> Warmup speed: {warmup['gen_tps_approx']:.2f} t/s (includes prompt eval)")
    except Exception as e:
        print(f" -> Warmup failed: {e}")

    # Main benchmark - several runs for statistical reliability
    print()
    print(f"[4/4] Running main benchmark ({num_runs} runs x {max_tokens} tokens)...")
    print("-" * 70)

    test_prompts = [
        "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.",
        "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.",
        "Describe the complete process of photosynthesis in plants. Include both light-dependent and light-independent reactions.",
        "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.",
        "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.",
    ]

    results = []
    for run_no in range(1, num_runs + 1):
        # Cycle through the prompts so num_runs may exceed their count.
        prompt = test_prompts[(run_no - 1) % len(test_prompts)]
        print(f"\n Run {run_no}/{num_runs}: {prompt[:50]}...")
        try:
            r = run_benchmark(prompt, max_tokens=max_tokens, label=f"Run {run_no}")
        except Exception as e:
            # One failed run must not abort the remaining ones.
            print(f" ERROR: {e}")
            continue
        results.append(r)
        print(f" Completion tokens: {r['completion_tokens']}")
        print(f" Total time: {r['elapsed']:.2f}s")
        print(f" Approx speed: {r['gen_tps_approx']:.2f} t/s (includes prompt eval)")

    if not results:
        print()
        print(" ERROR: All benchmark runs failed; no results to summarize.")
        return

    summary = summarize_results(results)

    print()
    print(banner)
    print(" RESULTS SUMMARY - Qwen3.5 122B-A10B")
    print(banner)
    print(f" Runs completed: {len(results)}/{num_runs}")
    print(f" Total tokens: {summary['total_tokens']}")
    print(f" Total time: {summary['total_time']:.1f}s")
    print()
    print(f" Approx TPS (avg): {summary['avg_tps']:.2f} t/s")
    print(f" Approx TPS (min): {summary['min_tps']:.2f} t/s")
    print(f" Approx TPS (max): {summary['max_tps']:.2f} t/s")
    print()

    # Verdict
    print(format_verdict(summary["avg_tps"]))

    print()
    print(" ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.")
    print(" ⚡ Check the server console/log for exact 'eval time' t/s value,")
    print(" ⚡ which shows pure token generation speed (always higher).")
    print(banner)


if __name__ == "__main__":
    main()