Files
variet_llm/scripts/_archive/benchmarks/perf_test_122b.py
Variet-Worker c111b3a9b0 feat: Variet Engine v1.0 + 5-model tuning complete
Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
2026-04-07 18:08:58 +09:00

170 lines
5.9 KiB
Python

import time
import json
import urllib.request
import sys
import os
import re
# Root address that every request in this script is sent to.
BASE_URL = "http://127.0.0.1:8000"

# Switch stdout to UTF-8 when the stream supports it, so the non-ASCII
# status symbols printed in the summary can be encoded; streams without
# a reconfigure() method are left untouched.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
def check_server():
    """Return True when the server's health endpoint reports status "ok".

    Sends a GET request to ``{BASE_URL}/health`` with a 5-second timeout and
    parses the response body as JSON.

    Returns:
        True only when the body is a JSON object whose ``"status"`` field
        equals ``"ok"``; False when the request fails, times out, returns an
        HTTP error, or the body is not the expected JSON.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
    except Exception:
        # Best-effort probe: any request or parse failure means "not ready".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so Ctrl+C still aborts a polling loop.
        return False
    # A non-object JSON body (list, string, ...) cannot carry a status field.
    return isinstance(data, dict) and data.get("status") == "ok"
def check_slots():
    """Fetch and parse the JSON returned by the server's ``/slots`` endpoint.

    Sends a GET request to ``{BASE_URL}/slots`` with a 5-second timeout.

    Returns:
        The decoded JSON value on success, or None when the request fails,
        times out, returns an HTTP error, or the body is not valid JSON.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/slots")
        with urllib.request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read())
    except Exception:
        # Slot info is optional diagnostics, so failures are reported as None.
        # Catching Exception (not a bare except) keeps KeyboardInterrupt and
        # SystemExit from being swallowed.
        return None
def run_benchmark(prompt, max_tokens=300, label="Test"):
    """Send one chat-completion request and measure its end-to-end throughput.

    Posts ``prompt`` as a single user message to
    ``{BASE_URL}/v1/chat/completions`` with temperature 0.0 and a 600-second
    timeout, then derives an approximate tokens-per-second figure from the
    reported completion-token count and the total request time.

    Args:
        prompt: Text of the user message to send.
        max_tokens: Value passed as ``max_tokens`` in the request payload.
        label: Free-form tag copied into the result for identification.

    Returns:
        A dict with keys ``label``, ``prompt_tokens``, ``completion_tokens``,
        ``elapsed`` (seconds), ``gen_tps_approx`` (completion tokens divided
        by ``elapsed``, so it includes everything that happens during the
        request) and ``content_preview`` (first 150 characters of the reply).

    Raises:
        urllib.error.URLError: If the request fails or the server returns an
            HTTP error status.
        KeyError, IndexError: If the response JSON lacks ``choices[0].message``.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode("utf-8")
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted, which would corrupt the measurement.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    # A key that is present with a JSON null value bypasses .get() defaults,
    # so fall back explicitly to keep slicing and arithmetic below from
    # raising TypeError.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens") or 0
    completion_tokens = usage.get("completion_tokens") or 0
    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:150]
    }
def _wait_for_server(max_wait=600, poll_interval=5):
    """Poll check_server() until it succeeds or max_wait seconds have passed.

    Prints a progress line on every sixth poll. Returns True as soon as the
    server answers; prints an error and returns False when time runs out.
    """
    for i in range(max_wait // poll_interval):
        if check_server():
            print(f" -> Server is ready! (waited {i*poll_interval}s)")
            return True
        if i % 6 == 0:
            print(f" -> Loading model... ({i*poll_interval}s / {max_wait}s)")
        time.sleep(poll_interval)
    print(f" -> ERROR: Server not responding after {max_wait}s")
    return False


def _print_summary(results, total_runs):
    """Print aggregate speed statistics and the 10 t/s verdict for results.

    ``results`` must be a non-empty list of dicts as returned by
    run_benchmark(); ``total_runs`` is the number of runs that were attempted.
    """
    print()
    print("=" * 70)
    print(" RESULTS SUMMARY - Qwen3.5 122B-A10B")
    print("=" * 70)

    speeds = [r["gen_tps_approx"] for r in results]
    # Mean of the per-run rates (not total tokens / total time).
    avg_tps = sum(speeds) / len(speeds)
    max_tps = max(speeds)
    min_tps = min(speeds)
    total_tokens = sum(r["completion_tokens"] for r in results)
    total_time = sum(r["elapsed"] for r in results)

    print(f" Runs completed: {len(results)}/{total_runs}")
    print(f" Total tokens: {total_tokens}")
    print(f" Total time: {total_time:.1f}s")
    print()
    print(f" Approx TPS (avg): {avg_tps:.2f} t/s")
    print(f" Approx TPS (min): {min_tps:.2f} t/s")
    print(f" Approx TPS (max): {max_tps:.2f} t/s")
    print()

    # Verdict
    if avg_tps >= 10:
        print(" ✅ TARGET ACHIEVED: 10+ t/s!")
    elif avg_tps >= 8:
        print(" ⚠️ CLOSE TO TARGET: Consider further tuning")
    else:
        print(f" ❌ BELOW TARGET: {avg_tps:.1f} t/s < 10 t/s")
    print()
    print(" ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.")
    print(" ⚡ Check the server console/log for exact 'eval time' t/s value,")
    print(" ⚡ which shows pure token generation speed (always higher).")
    print("=" * 70)


def main():
    """Run the full benchmark: wait for the server, warm up, measure, report.

    Steps:
        1. Poll the health endpoint for up to 600 seconds; give up if the
           server never answers.
        2. Print the number of entries returned by check_slots(), if any.
        3. Send one short warmup request; a warmup failure is reported but
           does not stop the benchmark.
        4. Send one 300-token request per test prompt, collect the successful
           results, and print a summary with a verdict against 10 t/s.

    When every benchmark run fails, an explicit error is printed instead of
    ending silently without a summary.
    """
    print("=" * 70)
    print(" Qwen3.5 122B-A10B Performance Benchmark")
    print(" Target: 10+ t/s generation speed")
    print("=" * 70)
    print()

    # Wait for server (model loading takes 3-5 min for 71 GB)
    print("[1/4] Waiting for server (122B model load takes 3-5 min)...")
    if not _wait_for_server(max_wait=600):  # 10 minutes max
        return

    # Check server info
    print()
    print("[2/4] Checking server status...")
    slots = check_slots()
    if slots:
        print(f" -> Slots available: {len(slots)}")

    # Warmup
    print()
    print("[3/4] Warmup run (short, pre-heating GPU caches)...")
    try:
        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f" -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
        print(f" -> Warmup speed: {warmup['gen_tps_approx']:.2f} t/s (includes prompt eval)")
    except Exception as e:
        print(f" -> Warmup failed: {e}")

    # Main benchmark - one run per prompt for statistical reliability
    test_prompts = [
        "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.",
        "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.",
        "Describe the complete process of photosynthesis in plants. Include both light-dependent and light-independent reactions.",
        "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.",
        "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.",
    ]
    # Single source for the run count so the headings, the loop and the
    # summary cannot drift apart.
    total_runs = len(test_prompts)

    print()
    print(f"[4/4] Running main benchmark ({total_runs} runs x 300 tokens)...")
    print("-" * 70)

    results = []
    for run_no, prompt in enumerate(test_prompts, start=1):
        print(f"\n Run {run_no}/{total_runs}: {prompt[:50]}...")
        try:
            r = run_benchmark(prompt, max_tokens=300, label=f"Run {run_no}")
            results.append(r)
            print(f" Completion tokens: {r['completion_tokens']}")
            print(f" Total time: {r['elapsed']:.2f}s")
            print(f" Approx speed: {r['gen_tps_approx']:.2f} t/s (includes prompt eval)")
        except Exception as e:
            # One failed run should not abort the remaining runs.
            print(f" ERROR: {e}")

    if not results:
        # Without this the script would end silently, which is easy to
        # mistake for a successful run.
        print()
        print(f" ERROR: All {total_runs} benchmark runs failed; no results to summarize.")
        return

    _print_summary(results, total_runs)


if __name__ == "__main__":
    main()