# variet_llm/scripts/perf_test_122b.py

import time
import json
import urllib.request
import sys
import os
import re

# Switch stdout to UTF-8 so the non-ASCII symbols printed in the summary can
# be encoded regardless of the console's default encoding.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    # The stdout object has no reconfigure() method; leave it unchanged.
    pass

# Base URL of the server that every request in this script is sent to.
BASE_URL = "http://127.0.0.1:8000"

def check_server():
    """Report whether the server's health endpoint says it is ready.

    Sends a GET request to ``{BASE_URL}/health`` with a 5-second timeout and
    parses the response body as JSON.

    Returns:
        True if the body is a JSON object whose ``"status"`` equals ``"ok"``;
        False for any other body, or if the request or the parsing fails.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
    except Exception:
        # Best-effort probe: connection errors, HTTP error statuses, timeouts
        # and malformed bodies all mean "not ready". Catching Exception
        # instead of using a bare ``except:`` lets KeyboardInterrupt and
        # SystemExit propagate, so Ctrl+C still aborts a polling loop.
        return False
    # Guard against a valid JSON payload that is not an object (e.g. a list).
    return isinstance(data, dict) and data.get("status") == "ok"

def check_slots():
    """Fetch the server's slot information.

    Sends a GET request to ``{BASE_URL}/slots`` with a 5-second timeout.

    Returns:
        The decoded JSON body of the response, or None if the request or the
        JSON parsing fails.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/slots")
        with urllib.request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read())
    except Exception:
        # Best-effort query: any request/parsing failure yields None.
        # Catching Exception instead of using a bare ``except:`` lets
        # KeyboardInterrupt and SystemExit propagate to the caller.
        return None

def run_benchmark(prompt, max_tokens=300, label="Test"):
    """Run a single chat-completion request and return timing results.

    POSTs ``prompt`` as a single user message to
    ``{BASE_URL}/v1/chat/completions`` with temperature 0.0 and measures the
    time of the whole round trip (request, prompt evaluation and generation).

    Args:
        prompt: Text sent as the user message.
        max_tokens: Value of the ``max_tokens`` field in the request.
        label: Free-form name copied into the result for identification.

    Returns:
        A dict with the keys ``label``, ``prompt_tokens``,
        ``completion_tokens``, ``elapsed`` (seconds), ``gen_tps_approx``
        (completion tokens divided by the total elapsed time) and
        ``content_preview`` (first 150 characters of the reply).

    Raises:
        Any exception raised by ``urllib.request.urlopen`` or ``json.loads``,
        and ``KeyError``/``IndexError`` if the response has no
        ``choices[0].message``.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    # "content" and "usage" may be present but null in the JSON response;
    # ``or`` normalizes None so the slicing and lookups below cannot fail.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)

    # Approximate: the denominator is the full round trip, not only the
    # generation phase.
    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:150]
    }

def main():
    """Run the full benchmark: wait for the server, warm up, measure, report.

    Steps:
      1. Poll ``check_server()`` every 5 seconds for up to 10 minutes.
      2. Print the number of slots reported by ``check_slots()``, if any.
      3. Send one short warmup request (failures are reported, not fatal).
      4. Send one request per benchmark prompt and print per-run figures,
         then a summary with average/min/max approximate tokens per second
         and a verdict against the target speed.

    Returns None. Returns early if the server never becomes ready or if no
    benchmark run completes.
    """
    # Tunables, named once so that headers, loop bounds and the verdict
    # cannot drift out of sync with each other.
    poll_interval = 5      # seconds between health probes
    max_wait = 600         # 10 minutes max
    progress_every = 6     # print a progress line every 6th probe
    max_tokens = 300       # generation budget per benchmark run
    target_tps = 10        # speed that counts as "target achieved"
    near_target_tps = 8    # speed that counts as "close to target"

    test_prompts = [
        "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.",
        "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.",
        "Describe the complete process of photosynthesis in plants. Include both light-dependent and light-independent reactions.",
        "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.",
        "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.",
    ]
    total_runs = len(test_prompts)

    print("=" * 70)
    print("  Qwen3.5 122B-A10B Performance Benchmark")
    print(f"  Target: {target_tps}+ t/s generation speed")
    print("=" * 70)
    print()

    # Wait for server (model loading takes 3-5 min for 71 GB)
    print("[1/4] Waiting for server (122B model load takes 3-5 min)...")
    for attempt in range(max_wait // poll_interval):
        waited = attempt * poll_interval
        if check_server():
            print(f"  -> Server is ready! (waited {waited}s)")
            break
        if attempt % progress_every == 0:
            print(f"  -> Loading model... ({waited}s / {max_wait}s)")
        time.sleep(poll_interval)
    else:
        # The loop ran out of attempts without a successful probe.
        print(f"  -> ERROR: Server not responding after {max_wait}s")
        return

    # Check server info
    print()
    print("[2/4] Checking server status...")
    slots = check_slots()
    if slots:
        print(f"  -> Slots available: {len(slots)}")

    # Warmup: its result is only displayed and never enters the statistics.
    print()
    print("[3/4] Warmup run (short, pre-heating GPU caches)...")
    try:
        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f"  -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
        print(f"  -> Warmup speed: {warmup['gen_tps_approx']:.2f} t/s (includes prompt eval)")
    except Exception as e:
        print(f"  -> Warmup failed: {e}")

    # Main benchmark - one run per prompt for statistical reliability
    print()
    print(f"[4/4] Running main benchmark ({total_runs} runs x {max_tokens} tokens)...")
    print("-" * 70)

    results = []
    for run_no, prompt in enumerate(test_prompts, start=1):
        print(f"\n  Run {run_no}/{total_runs}: {prompt[:50]}...")
        try:
            r = run_benchmark(prompt, max_tokens=max_tokens, label=f"Run {run_no}")
            results.append(r)
            print(f"    Completion tokens: {r['completion_tokens']}")
            print(f"    Total time: {r['elapsed']:.2f}s")
            print(f"    Approx speed: {r['gen_tps_approx']:.2f} t/s (includes prompt eval)")
        except Exception as e:
            # A failed run is reported and skipped; the others still count.
            print(f"    ERROR: {e}")

    if not results:
        # Make the overall outcome explicit instead of ending silently.
        print()
        print("  -> ERROR: No benchmark run completed; see the errors above.")
        return

    print()
    print("=" * 70)
    print("  RESULTS SUMMARY - Qwen3.5 122B-A10B")
    print("=" * 70)
    speeds = [r["gen_tps_approx"] for r in results]
    avg_tps = sum(speeds) / len(speeds)
    max_tps = max(speeds)
    min_tps = min(speeds)
    total_tokens = sum(r["completion_tokens"] for r in results)
    total_time = sum(r["elapsed"] for r in results)

    print(f"  Runs completed: {len(results)}/{total_runs}")
    print(f"  Total tokens:   {total_tokens}")
    print(f"  Total time:     {total_time:.1f}s")
    print()
    print(f"  Approx TPS (avg): {avg_tps:.2f} t/s")
    print(f"  Approx TPS (min): {min_tps:.2f} t/s")
    print(f"  Approx TPS (max): {max_tps:.2f} t/s")
    print()

    # Verdict
    if avg_tps >= target_tps:
        print(f"  ✅ TARGET ACHIEVED: {target_tps}+ t/s!")
    elif avg_tps >= near_target_tps:
        print("  ⚠️  CLOSE TO TARGET: Consider further tuning")
    else:
        print(f"  ❌ BELOW TARGET: {avg_tps:.1f} t/s < {target_tps} t/s")

    print()
    print("  ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.")
    print("  ⚡ Check the server console/log for exact 'eval time' t/s value,")
    print("  ⚡ which shows pure token generation speed (always higher).")
    print("=" * 70)

# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()