Update tuning scripts and add task creation to sync_vikunja.js

2026-04-06 21:49:56 +09:00
parent 626a089b6b
commit 7c7a899fd5
61 changed files with 8705 additions and 1566 deletions
--- a/scripts/perf_test_122b.py
+++ b/scripts/perf_test_122b.py
@@ -0,0 +1,169 @@
+import time
+import json
+import urllib.request
+import sys
+import os
+import re
+
+# Switch stdout to UTF-8 — presumably so the non-ASCII symbols printed by
+# main() (check marks, warning signs) survive on consoles whose default
+# encoding cannot represent them.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    # stdout has no reconfigure() method (e.g. it was replaced by a stream
+    # object that lacks it); keep whatever encoding is already in effect.
+    pass
+
+# Base URL of the local server that every request in this script targets.
+BASE_URL = "http://127.0.0.1:8000"
+
+def check_server():
+    """Check if server is up"""
+    try:
+        req = urllib.request.Request(f"{BASE_URL}/health")
+        with urllib.request.urlopen(req, timeout=5) as resp:
+            data = json.loads(resp.read())
+            return data.get("status") == "ok"
+    except:
+        return False
+
+def check_slots():
+    """Check server slot info for VRAM usage details"""
+    try:
+        req = urllib.request.Request(f"{BASE_URL}/slots")
+        with urllib.request.urlopen(req, timeout=5) as resp:
+            return json.loads(resp.read())
+    except:
+        return None
+
+def run_benchmark(prompt, max_tokens=300, label="Test"):
+    """Run a single benchmark request and return results"""
+    payload = json.dumps({
+        "model": "local-model",
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }).encode("utf-8")
+
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+
+    start = time.time()
+    with urllib.request.urlopen(req, timeout=600) as resp:
+        result = json.loads(resp.read())
+    elapsed = time.time() - start
+
+    content = result["choices"][0]["message"].get("content", "")
+    usage = result.get("usage", {})
+    prompt_tokens = usage.get("prompt_tokens", 0)
+    completion_tokens = usage.get("completion_tokens", 0)
+
+    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0
+
+    return {
+        "label": label,
+        "prompt_tokens": prompt_tokens,
+        "completion_tokens": completion_tokens,
+        "elapsed": elapsed,
+        "gen_tps_approx": gen_tps,
+        "content_preview": content[:150]
+    }
+
+def main():
+    print("=" * 70)
+    print("  Qwen3.5 122B-A10B Performance Benchmark")
+    print("  Target: 10+ t/s generation speed")
+    print("=" * 70)
+    print()
+
+    # Wait for server (model loading takes 3-5 min for 71 GB)
+    print("[1/4] Waiting for server (122B model load takes 3-5 min)...")
+    max_wait = 600  # 10 minutes max
+    for i in range(max_wait // 5):
+        if check_server():
+            print(f"  -> Server is ready! (waited {i*5}s)")
+            break
+        if i % 6 == 0:
+            print(f"  -> Loading model... ({i*5}s / {max_wait}s)")
+        time.sleep(5)
+    else:
+        print(f"  -> ERROR: Server not responding after {max_wait}s")
+        return
+
+    # Check server info
+    print()
+    print("[2/4] Checking server status...")
+    slots = check_slots()
+    if slots:
+        print(f"  -> Slots available: {len(slots)}")
+
+    # Warmup
+    print()
+    print("[3/4] Warmup run (short, pre-heating GPU caches)...")
+    try:
+        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
+        print(f"  -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
+        print(f"  -> Warmup speed: {warmup['gen_tps_approx']:.2f} t/s (includes prompt eval)")
+    except Exception as e:
+        print(f"  -> Warmup failed: {e}")
+
+    # Main benchmark - 5 runs for statistical reliability
+    print()
+    print("[4/4] Running main benchmark (5 runs x 300 tokens)...")
+    print("-" * 70)
+
+    test_prompts = [
+        "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.",
+        "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.",
+        "Describe the complete process of photosynthesis in plants. Include both light-dependent and light-independent reactions.",
+        "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.",
+        "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.",
+    ]
+    
+    results = []
+    for i in range(5):
+        prompt = test_prompts[i % len(test_prompts)]
+        print(f"\n  Run {i+1}/5: {prompt[:50]}...")
+        try:
+            r = run_benchmark(prompt, max_tokens=300, label=f"Run {i+1}")
+            results.append(r)
+            print(f"    Completion tokens: {r['completion_tokens']}")
+            print(f"    Total time: {r['elapsed']:.2f}s")
+            print(f"    Approx speed: {r['gen_tps_approx']:.2f} t/s (includes prompt eval)")
+        except Exception as e:
+            print(f"    ERROR: {e}")
+
+    if results:
+        print()
+        print("=" * 70)
+        print("  RESULTS SUMMARY - Qwen3.5 122B-A10B")
+        print("=" * 70)
+        avg_tps = sum(r["gen_tps_approx"] for r in results) / len(results)
+        max_tps = max(r["gen_tps_approx"] for r in results)
+        min_tps = min(r["gen_tps_approx"] for r in results)
+        total_tokens = sum(r["completion_tokens"] for r in results)
+        total_time = sum(r["elapsed"] for r in results)
+        
+        print(f"  Runs completed: {len(results)}/5")
+        print(f"  Total tokens:   {total_tokens}")
+        print(f"  Total time:     {total_time:.1f}s")
+        print()
+        print(f"  Approx TPS (avg): {avg_tps:.2f} t/s")
+        print(f"  Approx TPS (min): {min_tps:.2f} t/s")
+        print(f"  Approx TPS (max): {max_tps:.2f} t/s")
+        print()
+        
+        # Verdict
+        if avg_tps >= 10:
+            print("  ✅ TARGET ACHIEVED: 10+ t/s!")
+        elif avg_tps >= 8:
+            print("  ⚠️  CLOSE TO TARGET: Consider further tuning")
+        else:
+            print(f"  ❌ BELOW TARGET: {avg_tps:.1f} t/s < 10 t/s")
+        
+        print()
+        print("  ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.")
+        print("  ⚡ Check the server console/log for exact 'eval time' t/s value,")
+        print("  ⚡ which shows pure token generation speed (always higher).")
+        print("=" * 70)
+
+# Run the benchmark only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()