Update tuning scripts and add task creation to sync_vikunja.js
This commit is contained in:
169
scripts/perf_test_122b.py
Normal file
169
scripts/perf_test_122b.py
Normal file
@@ -0,0 +1,169 @@
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
|
||||
# Switch stdout to UTF-8 so the non-ASCII status symbols printed by the summary
# do not fail on consoles whose default encoding cannot represent them.
# Streams that have no reconfigure() method are left untouched.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

# Root URL of the inference server under test.  Defaults to the local server;
# set PERF_TEST_BASE_URL to benchmark a different host/port.  A trailing slash
# is removed so that f"{BASE_URL}/health" never contains a double slash.
BASE_URL = os.environ.get("PERF_TEST_BASE_URL", "http://127.0.0.1:8000").rstrip("/")
|
||||
|
||||
def check_server():
    """Report whether the server's health endpoint says it is ready.

    Sends a GET request to ``{BASE_URL}/health`` with a 5-second timeout and
    decodes the response body as JSON.

    Returns:
        True only when the body is a JSON object whose ``"status"`` value is
        ``"ok"``; False for any connection, HTTP, or decoding failure.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
    except Exception:
        # Best-effort probe: an unreachable or not-yet-ready server is an
        # expected state, so every ordinary error means "not up".  Catching
        # Exception instead of using a bare ``except`` lets KeyboardInterrupt
        # and SystemExit propagate, so Ctrl+C still works while polling.
        return False
    # A non-object payload (list, string, ...) cannot carry a status field.
    return isinstance(data, dict) and data.get("status") == "ok"
|
||||
|
||||
def check_slots():
    """Fetch the server's slot information.

    Sends a GET request to ``{BASE_URL}/slots`` with a 5-second timeout.

    Returns:
        The decoded JSON body, or None when the request or the JSON decoding
        fails.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/slots")
        with urllib.request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read())
    except Exception:
        # Slot info is optional diagnostics, so failures are reported as None.
        # ``except Exception`` (not a bare ``except``) keeps Ctrl+C working.
        return None
|
||||
|
||||
def run_benchmark(prompt, max_tokens=300, label="Test"):
    """Send one chat-completion request and time the full round trip.

    Posts a single-message chat request to ``{BASE_URL}/v1/chat/completions``
    with temperature 0.0 and a 600-second timeout.

    Args:
        prompt: Text sent as the content of the only (user) message.
        max_tokens: Value passed as the request's ``max_tokens`` field.
        label: Name stored in the result to identify this run.

    Returns:
        A dict with the keys ``"label"``, ``"prompt_tokens"``,
        ``"completion_tokens"``, ``"elapsed"`` (seconds for the whole
        request), ``"gen_tps_approx"`` (completion tokens divided by
        ``elapsed``) and ``"content_preview"`` (first 150 characters of the
        reply).

    Raises:
        urllib.error.URLError: The request could not be completed.
        ValueError: The response body is not valid JSON.
        KeyError, IndexError: The response has no ``choices[0]["message"]``.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )

    # perf_counter() is monotonic, so the measurement cannot be distorted by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=600) as resp:
        raw = resp.read()
    elapsed = time.perf_counter() - start

    result = json.loads(raw)

    # "content" and "usage" may be present but null in the JSON; dict.get()
    # only applies its default when the key is missing, hence the ``or``.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens") or 0
    completion_tokens = usage.get("completion_tokens") or 0

    # Approximate: elapsed covers the whole request, not only generation.
    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:150],
    }
|
||||
|
||||
def main():
    """Run the full benchmark and print a report to stdout.

    Steps: wait for the server to report ready (up to 10 minutes), print the
    slot count if available, perform one short warmup request, then run one
    benchmark request per entry in the prompt list and print a summary with
    a verdict against the 10 t/s target.  Returns None; if the server never
    becomes ready an error line is printed and no benchmark is run.
    """
    print("=" * 70)
    print(" Qwen3.5 122B-A10B Performance Benchmark")
    print(" Target: 10+ t/s generation speed")
    print("=" * 70)
    print()

    # Wait for server (model loading takes 3-5 min for 71 GB)
    print("[1/4] Waiting for server (122B model load takes 3-5 min)...")
    max_wait = 600  # 10 minutes max
    if not _wait_for_server(max_wait):
        print(f" -> ERROR: Server not responding after {max_wait}s")
        return

    # Check server info
    print()
    print("[2/4] Checking server status...")
    slots = check_slots()
    if slots:
        print(f" -> Slots available: {len(slots)}")

    # Warmup: a failure here is reported but does not stop the benchmark.
    print()
    print("[3/4] Warmup run (short, pre-heating GPU caches)...")
    try:
        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f" -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
        print(f" -> Warmup speed: {warmup['gen_tps_approx']:.2f} t/s (includes prompt eval)")
    except Exception as e:
        print(f" -> Warmup failed: {e}")

    test_prompts = [
        "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.",
        "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.",
        "Describe the complete process of photosynthesis in plants. Include both light-dependent and light-independent reactions.",
        "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.",
        "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.",
    ]
    # One run per prompt; the count shown in the output follows the list so
    # the two cannot drift apart.
    num_runs = len(test_prompts)
    max_tokens = 300

    # Main benchmark - several runs for statistical reliability
    print()
    print(f"[4/4] Running main benchmark ({num_runs} runs x {max_tokens} tokens)...")
    print("-" * 70)

    results = []
    for run_no, prompt in enumerate(test_prompts, start=1):
        print(f"\n Run {run_no}/{num_runs}: {prompt[:50]}...")
        try:
            r = run_benchmark(prompt, max_tokens=max_tokens, label=f"Run {run_no}")
        except Exception as e:
            # A failed run is reported and skipped; remaining runs continue.
            print(f" ERROR: {e}")
            continue
        results.append(r)
        print(f" Completion tokens: {r['completion_tokens']}")
        print(f" Total time: {r['elapsed']:.2f}s")
        print(f" Approx speed: {r['gen_tps_approx']:.2f} t/s (includes prompt eval)")

    if results:
        _print_summary(results, num_runs)


def _wait_for_server(max_wait, poll_interval=5):
    """Poll check_server() until it succeeds or max_wait seconds have elapsed.

    Elapsed time is measured with a monotonic clock, so time spent inside
    each probe counts toward both the limit and the reported wait.  Returns
    True when the server became ready, False on timeout.
    """
    start = time.monotonic()
    attempt = 0
    while True:
        if check_server():
            print(f" -> Server is ready! (waited {time.monotonic() - start:.0f}s)")
            return True
        waited = time.monotonic() - start
        if waited >= max_wait:
            return False
        # Progress line on the first attempt and every sixth one after that.
        if attempt % 6 == 0:
            print(f" -> Loading model... ({waited:.0f}s / {max_wait}s)")
        attempt += 1
        # Never sleep past the deadline.
        time.sleep(min(poll_interval, max_wait - waited))


def _print_summary(results, num_runs):
    """Print aggregate speed statistics and the verdict for completed runs.

    ``results`` must be a non-empty list of run_benchmark() result dicts;
    ``num_runs`` is the number of runs that were attempted.
    """
    speeds = [r["gen_tps_approx"] for r in results]
    avg_tps = sum(speeds) / len(speeds)
    max_tps = max(speeds)
    min_tps = min(speeds)
    total_tokens = sum(r["completion_tokens"] for r in results)
    total_time = sum(r["elapsed"] for r in results)

    print()
    print("=" * 70)
    print(" RESULTS SUMMARY - Qwen3.5 122B-A10B")
    print("=" * 70)
    print(f" Runs completed: {len(results)}/{num_runs}")
    print(f" Total tokens: {total_tokens}")
    print(f" Total time: {total_time:.1f}s")
    print()
    print(f" Approx TPS (avg): {avg_tps:.2f} t/s")
    print(f" Approx TPS (min): {min_tps:.2f} t/s")
    print(f" Approx TPS (max): {max_tps:.2f} t/s")
    print()

    # Verdict
    if avg_tps >= 10:
        print(" ✅ TARGET ACHIEVED: 10+ t/s!")
    elif avg_tps >= 8:
        print(" ⚠️ CLOSE TO TARGET: Consider further tuning")
    else:
        print(f" ❌ BELOW TARGET: {avg_tps:.1f} t/s < 10 t/s")

    print()
    print(" ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.")
    print(" ⚡ Check the server console/log for exact 'eval time' t/s value,")
    print(" ⚡ which shows pure token generation speed (always higher).")
    print("=" * 70)
|
||||
|
||||
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user