Update tuning scripts and add task creation to sync_vikunja.js
This commit is contained in:
644
scripts/dual_gpu_benchmark.py
Normal file
644
scripts/dual_gpu_benchmark.py
Normal file
@@ -0,0 +1,644 @@
|
||||
"""
|
||||
Dual-GPU (2x RTX 3060 12GB) Comprehensive Model Benchmark
|
||||
==========================================================
|
||||
Tests 4 models across multiple parameter configurations to find
|
||||
the absolute best model + settings for 256K context coding agent.
|
||||
|
||||
Models:
|
||||
1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB)
|
||||
2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB)
|
||||
3. Gemma4 26B-A4B Q4_K_M (~15.6 GB)
|
||||
4. Gemma4 26B-A4B MXFP4_MOE (~15.5 GB)
|
||||
|
||||
Test Phases (per model):
|
||||
Phase 0: Basic dual-GPU startup test (can it even boot at 256K?)
|
||||
Phase 1: GPU layer + MoE offload strategy sweep
|
||||
Phase 2: CPU thread sweep (carry best from P1)
|
||||
Phase 3: Batch size sweep (carry best from P1+P2)
|
||||
Phase 4: KV cache type sweep (carry best from P1+P2+P3)
|
||||
Phase 5: Final verification (5 runs)
|
||||
|
||||
Output: scripts/dual_gpu_results.json (all raw data)
|
||||
scripts/dual_gpu_summary.txt (human-readable winner)
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
import datetime
|
||||
|
||||
# Switch stdout to UTF-8 where the stream supports it; presumably this is so
# the non-ASCII characters used in log and summary text can be printed on
# consoles with a legacy code page. Any failure is deliberately ignored.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    pass

# ─── Configuration ───────────────────────────────────────────────
BASE_URL = "http://127.0.0.1:8000"                # where the benchmark client connects
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"  # relative to the working directory
CONTEXT = 256 * 1024                              # 262144 tokens (256K)
BENCHMARK_RUNS = 3                                # timed requests per configuration
BENCHMARK_TOKENS = 200                            # max_tokens for each timed request
SERVER_TIMEOUT = 300                              # seconds to wait for server startup


def _model_entry(name, filename, family, quant, total_layers):
    """Build one MODELS record; ``is_mxfp4`` is derived from the quant tag."""
    return {
        "name": name,
        "path": "models\\" + filename,
        "type": family,
        "quant": quant,
        "is_mxfp4": quant == "MXFP4_MOE",
        "total_layers": total_layers,
    }


# Layer counts (64 for the Qwen files, 30 for the Gemma files) are the values
# recorded by the original author; they cannot be verified from this file.
MODELS = [
    _model_entry("Qwen3.5-35B-A3B Q4_K_M",
                 "Qwen3.5-35B-A3B-Q4_K_M.gguf", "qwen", "Q4_K_M", 64),
    _model_entry("Qwen3.5-35B-A3B MXFP4_MOE",
                 "Qwen3.5-35B-A3B-MXFP4_MOE.gguf", "qwen", "MXFP4_MOE", 64),
    _model_entry("Gemma4 26B-A4B Q4_K_M",
                 "gemma-4-26B-A4B-it-Q4_K_M.gguf", "gemma4", "Q4_K_M", 30),
    _model_entry("Gemma4 26B-A4B MXFP4_MOE",
                 "gemma-4-26B-A4B-it-MXFP4_MOE.gguf", "gemma4", "MXFP4_MOE", 30),
]

# Every result dict produced during the run is appended here and later dumped
# to JSON.
ALL_RESULTS = []
|
||||
|
||||
|
||||
# ─── Utility Functions ──────────────────────────────────────────
|
||||
def log(msg):
    """Print *msg* to stdout as ``[HH:MM:SS] msg`` and flush immediately."""
    stamp = f"{datetime.datetime.now():%H:%M:%S}"
    print(f"[{stamp}] {msg}", flush=True)
|
||||
|
||||
|
||||
def kill_server():
    """Force-kill every ``llama-server.exe`` process, then pause five seconds.

    Runs the Windows ``taskkill`` utility with its output captured; the exit
    status is not inspected, so "no such process" is not treated as an error.
    """
    taskkill = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill, capture_output=True)
    time.sleep(5)
|
||||
|
||||
|
||||
def get_vram_all():
    """Query ``nvidia-smi`` for the memory usage of every GPU.

    Returns:
        A list with one dict per GPU, shaped
        ``{"gpu": <index>, "used": <int>, "total": <int>}``. The memory
        figures are the raw integers printed by nvidia-smi with ``nounits``
        (callers in this file label them MiB). The list is empty when
        nvidia-smi cannot be run, times out, or prints nothing usable.
    """
    query = [
        "nvidia-smi", "--query-gpu=index,memory.used,memory.total",
        "--format=csv,noheader,nounits",
    ]
    try:
        r = subprocess.run(query, capture_output=True, text=True, timeout=5)
    except (OSError, ValueError, subprocess.SubprocessError):
        # Missing binary, undecodable output, or timeout: report "no data".
        return []

    gpus = []
    for line in r.stdout.splitlines():
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 3:
            continue
        try:
            index, used, total = (int(p) for p in parts[:3])
        except ValueError:
            # A non-numeric field (e.g. "[N/A]") invalidates only this row,
            # not the readings already collected for the other GPUs.
            continue
        gpus.append({"gpu": index, "used": used, "total": total})
    return gpus
|
||||
|
||||
|
||||
def build_cmd(model_path, ngl, t, ub, b, ctk, ctv,
              cpu_moe=False, n_cpu_moe=0, prio=3, nommap=False):
    """Assemble the llama-server argument vector for one configuration.

    Args:
        model_path: Value passed after ``--model``.
        ngl: Value for ``-ngl``.
        t: Value used for both ``-t`` and ``-tb``.
        ub: Value for ``-ub``.
        b: Value for ``-b``.
        ctk: Value for ``--cache-type-k``.
        ctv: Value for ``--cache-type-v``.
        cpu_moe: When true, append ``--cpu-moe`` (takes precedence over
            ``n_cpu_moe``).
        n_cpu_moe: When positive and ``cpu_moe`` is false, append
            ``--n-cpu-moe <n>``.
        prio: Value for ``--prio``.
        nommap: When true, append ``--no-mmap``.

    Returns:
        A list of strings suitable for ``subprocess.Popen``.
    """
    argv = [LLAMA_SERVER, "--model", model_path]
    argv += ["-ngl", str(ngl), "-c", str(CONTEXT), "-np", "1", "-fa", "on"]
    argv += ["--cache-type-k", ctk, "--cache-type-v", ctv]
    argv += ["-ub", str(ub), "-b", str(b)]
    argv += ["-t", str(t), "-tb", str(t)]
    argv += ["--prio", str(prio), "--poll", "50", "--mlock"]
    # NOTE(review): the server binds to all interfaces although the benchmark
    # client only talks to 127.0.0.1 — confirm this exposure is intended.
    argv += ["--port", "8000", "--host", "0.0.0.0"]

    # MoE offloading: the two flags are mutually exclusive here.
    if cpu_moe:
        argv.append("--cpu-moe")
    elif n_cpu_moe > 0:
        argv += ["--n-cpu-moe", str(n_cpu_moe)]

    if nommap:
        argv.append("--no-mmap")
    return argv
|
||||
|
||||
|
||||
def start_server(model_path, **kwargs):
    """Launch llama-server in the background for the given configuration.

    Args:
        model_path: Model file handed to ``build_cmd``.
        **kwargs: Remaining ``build_cmd`` parameters (ngl, t, ub, ...).

    Returns:
        The ``subprocess.Popen`` handle of the started server.
    """
    cmd = build_cmd(model_path, **kwargs)
    log(f" CMD: {' '.join(cmd[-20:])}")  # show last 20 args
    # Nothing in this script reads the server's output. Attaching a PIPE that
    # is never drained lets the child block once the OS pipe buffer fills, so
    # the output is discarded instead.
    return subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        cwd=os.getcwd(),
    )
|
||||
|
||||
|
||||
def wait_for_server(timeout=SERVER_TIMEOUT):
    """Poll the server's ``/health`` endpoint until it reports ``status == "ok"``.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        ``(True, seconds_elapsed)`` once the endpoint answers ok, otherwise
        ``(False, timeout)`` after the deadline passes.
    """
    began = time.time()
    health_url = f"{BASE_URL}/health"
    while time.time() - began < timeout:
        try:
            probe = urllib.request.Request(health_url)
            with urllib.request.urlopen(probe, timeout=3) as resp:
                status = json.loads(resp.read()).get("status")
            if status == "ok":
                return True, time.time() - began
        except Exception:
            # Connection refused, HTTP errors and malformed bodies all just
            # mean "not ready yet".
            pass
        time.sleep(3)
    return False, timeout
|
||||
|
||||
|
||||
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Send one fixed chat-completion request and time the whole round trip.

    Args:
        max_tokens: ``max_tokens`` value placed in the request body.

    Returns:
        A dict with ``tps`` (completion tokens divided by wall-clock seconds
        for the full request), ``completion_tokens``, ``prompt_tokens`` and
        ``elapsed``. Token counts default to 0 when the response has no
        ``usage`` data.

    Raises:
        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise on
        transport or decoding failure; nothing is caught here.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user",
                      "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    t0 = time.time()
    with urllib.request.urlopen(request, timeout=600) as resp:
        reply = json.loads(resp.read())
    wall = time.time() - t0

    usage = reply.get("usage", {})
    completion = usage.get("completion_tokens", 0)
    prompt = usage.get("prompt_tokens", 0)
    if wall > 0:
        rate = completion / wall
    else:
        rate = 0
    return {
        "tps": rate,
        "completion_tokens": completion,
        "prompt_tokens": prompt,
        "elapsed": wall,
    }
|
||||
|
||||
|
||||
def test_config(model_info, label, **kwargs):
    """Boot the server with one configuration, benchmark it and record the result.

    Args:
        model_info: Entry from ``MODELS``; ``path``, ``name`` and ``quant``
            are read.
        label: Short tag used in log lines and stored in the result.
        **kwargs: Forwarded unchanged to ``start_server`` and stored in the
            result under ``params``.

    Returns:
        The result dict (also appended to ``ALL_RESULTS``), or ``None`` when
        the server did not become healthy in time or every timed run failed.
    """
    kill_server()
    log(f" [{label}] Starting server...")

    proc = start_server(model_info["path"], **kwargs)
    speeds = []
    try:
        ok, boot_time = wait_for_server()
        if not ok:
            log(f" [{label}] FAILED to start (timeout {SERVER_TIMEOUT}s)")
            return None

        vram = get_vram_all()
        vram_str = " | ".join(f"GPU{g['gpu']}:{g['used']}/{g['total']}MiB" for g in vram)
        log(f" [{label}] Boot: {boot_time:.0f}s | VRAM: {vram_str}")

        # Warmup: best effort, but say so when it fails instead of hiding it.
        try:
            run_benchmark(max_tokens=20)
        except Exception as e:
            log(f" [{label}] Warmup failed ({e}); continuing")

        # Benchmark runs
        for i in range(BENCHMARK_RUNS):
            try:
                r = run_benchmark()
                speeds.append(r["tps"])
                log(f" Run {i+1}: {r['tps']:.2f} t/s")
            except Exception as e:
                log(f" Run {i+1}: ERROR ({e})")
    finally:
        # Always stop AND reap the server, including on unexpected errors or
        # Ctrl-C, so the process handle is not leaked.
        proc.kill()
        try:
            proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            log(f" [{label}] server process did not exit after kill()")

    if not speeds:
        log(f" [{label}] ALL BENCHMARK RUNS FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    log(f" [{label}] => AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

    result = {
        "model": model_info["name"],
        "quant": model_info["quant"],
        "label": label,
        "avg_tps": round(avg, 2),
        "best_tps": round(best, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": kwargs,
    }
    ALL_RESULTS.append(result)
    return result
|
||||
|
||||
|
||||
# ─── Phase Runners ───────────────────────────────────────────────
|
||||
|
||||
def phase0_boot_test(model):
    """Find a first configuration that boots and benchmarks for *model*.

    Tries, in order: all layers on GPU (``ngl=999``), the same with
    ``cpu_moe=True``, then half of ``model["total_layers"]`` on GPU. Stops at
    the first attempt for which ``test_config`` returns a result.

    Returns:
        That result dict, or the falsy value of the last attempt when all
        three fail.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 0: Boot Test — {model['name']}")
    log(f"{rule}")

    def attempt(label, ngl, **extra):
        # All boot attempts share the same thread, batch and KV-cache settings.
        return test_config(
            model, label,
            ngl=ngl, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            **extra,
        )

    # Baseline: every layer on the GPUs.
    outcome = attempt("boot-ngl999", 999)

    if not outcome:
        log(" Full GPU failed, trying with --cpu-moe...")
        outcome = attempt("boot-cpumoe", 999, cpu_moe=True)

    if not outcome:
        # Extreme fallback: fewer layers.
        log(" --cpu-moe also failed, trying reduced layers...")
        outcome = attempt("boot-ngl-half", model["total_layers"] // 2)

    return outcome
|
||||
|
||||
|
||||
def phase1_gpu_offload(model, baseline):
    """Compare MoE offload strategies with all layers requested on GPU.

    Args:
        model: Entry from ``MODELS``.
        baseline: Result from phase 0 (or a falsy value). It competes with the
            configurations tested here and suppresses an identical re-run.

    Returns:
        The result dict with the highest ``avg_tps``, or ``None`` when there
        is no successful result at all.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 1: GPU Offload Strategy — {model['name']}")
    log(f"{'='*70}")

    results = []
    if baseline:
        results.append(baseline)

    total = model["total_layers"]

    # Strategy A: All GPU + cpu-moe variations
    for cpu_moe in [True, False]:
        label = f"ngl=999 cpu_moe={cpu_moe}"
        # Skip if already tested in baseline
        if baseline and baseline["label"] in ["boot-ngl999", "boot-cpumoe"] and \
                baseline["params"].get("cpu_moe", False) == cpu_moe:
            continue
        r = test_config(
            model, label,
            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            cpu_moe=cpu_moe,
        )
        if r:
            results.append(r)

    # Strategy B: n-cpu-moe sweep (selective expert offload).
    # n=0 is intentionally absent: build_cmd emits no flag for it, so it would
    # launch the same command as the cpu_moe=False case above (or the
    # baseline) and spend a full boot + benchmark cycle on a duplicate.
    for n in [5, 10, 15, 20]:
        if n > total:
            continue
        r = test_config(
            model, f"n-cpu-moe={n}",
            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            n_cpu_moe=n,
        )
        if r:
            results.append(r)

    if not results:
        log(" PHASE 1: No configuration worked!")
        return None

    best = max(results, key=lambda x: x["avg_tps"])
    log(f"\n ★ Phase 1 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
|
||||
|
||||
|
||||
def phase2_threads(model, prev_best):
    """Try alternative CPU thread counts on top of the previous winner.

    Args:
        model: Entry from ``MODELS``.
        prev_best: Winning result so far; its ``params`` supply every setting
            except ``t``, and it competes with the new results.

    Returns:
        The result dict with the highest ``avg_tps``.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 2: CPU Thread Sweep — {model['name']}")
    log(f"{rule}")

    base = prev_best["params"]
    candidates = [prev_best]

    for threads in (2, 4, 6, 8, 10, 12):
        # The thread count the previous winner used is already measured.
        if threads == base.get("t", 6):
            continue
        settings = {
            "ngl": base["ngl"],
            "t": threads,
            "ub": base["ub"],
            "b": base["b"],
            "ctk": base["ctk"],
            "ctv": base["ctv"],
            "cpu_moe": base.get("cpu_moe", False),
            "n_cpu_moe": base.get("n_cpu_moe", 0),
        }
        outcome = test_config(model, f"t={threads}", **settings)
        if outcome:
            candidates.append(outcome)

    winner = max(candidates, key=lambda entry: entry["avg_tps"])
    log(f"\n ★ Phase 2 winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
|
||||
|
||||
|
||||
def phase3_batch(model, prev_best):
    """Try alternative ``ub``/``b`` batch-size pairs on top of the previous winner.

    Args:
        model: Entry from ``MODELS``.
        prev_best: Winning result so far; its ``params`` supply every setting
            except ``ub`` and ``b``, and it competes with the new results.

    Returns:
        The result dict with the highest ``avg_tps``.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 3: Batch Size Sweep — {model['name']}")
    log(f"{rule}")

    base = prev_best["params"]
    threads = base["t"]
    candidates = [prev_best]

    batch_pairs = (
        (128, 512), (256, 1024), (256, 2048),
        (512, 1024), (512, 2048), (512, 4096),
        (1024, 2048), (1024, 4096),
    )
    for ub, b in batch_pairs:
        # The pair the previous winner used is already measured.
        if (ub, b) == (base["ub"], base["b"]):
            continue
        settings = {
            "ngl": base["ngl"],
            "t": threads,
            "ub": ub,
            "b": b,
            "ctk": base["ctk"],
            "ctv": base["ctv"],
            "cpu_moe": base.get("cpu_moe", False),
            "n_cpu_moe": base.get("n_cpu_moe", 0),
        }
        outcome = test_config(model, f"ub={ub} b={b}", **settings)
        if outcome:
            candidates.append(outcome)

    winner = max(candidates, key=lambda entry: entry["avg_tps"])
    log(f"\n ★ Phase 3 winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
|
||||
|
||||
|
||||
def phase4_kvcache(model, prev_best):
    """Try alternative KV-cache type pairs on top of the previous winner.

    Args:
        model: Entry from ``MODELS``.
        prev_best: Winning result so far; its ``params`` supply every setting
            except ``ctk`` and ``ctv``, and it competes with the new results.

    Returns:
        The result dict with the highest ``avg_tps``.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 4: KV Cache Type Sweep — {model['name']}")
    log(f"{rule}")

    base = prev_best["params"]
    candidates = [prev_best]

    cache_pairs = (
        ("q4_0", "q4_0"),
        ("q8_0", "q8_0"),
        ("q4_0", "q8_0"),
        ("f16", "f16"),
    )
    for ctk, ctv in cache_pairs:
        # The pair the previous winner used is already measured.
        if (ctk, ctv) == (base["ctk"], base["ctv"]):
            continue
        settings = {
            "ngl": base["ngl"],
            "t": base["t"],
            "ub": base["ub"],
            "b": base["b"],
            "ctk": ctk,
            "ctv": ctv,
            "cpu_moe": base.get("cpu_moe", False),
            "n_cpu_moe": base.get("n_cpu_moe", 0),
        }
        outcome = test_config(model, f"kv={ctk}/{ctv}", **settings)
        if outcome:
            candidates.append(outcome)

    winner = max(candidates, key=lambda entry: entry["avg_tps"])
    log(f"\n ★ Phase 4 winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
|
||||
|
||||
|
||||
def phase5_final(model, prev_best):
    """Re-run the winning configuration five times as a final verification.

    Args:
        model: Entry from ``MODELS``; ``path``, ``name`` and ``quant`` are read.
        prev_best: Winning result so far; its ``params`` are passed to
            ``start_server`` unchanged.

    Returns:
        A new ``FINAL-<model name>`` result dict (also appended to
        ``ALL_RESULTS``), or ``prev_best`` when the server fails to start or
        every timed run fails.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 5: Final Verification (5 runs) — {model['name']}")
    log(f"{'='*70}")

    p = prev_best["params"]
    kill_server()
    proc = start_server(model["path"], **p)
    speeds = []
    try:
        ok, boot_time = wait_for_server()
        if not ok:
            log(" FAILED to start for final verification!")
            return prev_best

        vram = get_vram_all()

        # Warmup: best effort, but say so when it fails instead of hiding it.
        try:
            run_benchmark(max_tokens=20)
        except Exception as e:
            log(f" Final warmup failed ({e}); continuing")

        for i in range(5):
            try:
                r = run_benchmark()
                speeds.append(r["tps"])
                log(f" Final Run {i+1}: {r['tps']:.2f} t/s")
            except Exception as e:
                log(f" Final Run {i+1}: ERROR ({e})")
    finally:
        # Always stop AND reap the server, including on unexpected errors or
        # Ctrl-C, so the process handle is not leaked.
        proc.kill()
        try:
            proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            log(" server process did not exit after kill()")

    if not speeds:
        return prev_best

    avg = sum(speeds) / len(speeds)
    best_tps = max(speeds)
    log(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best_tps:.2f} t/s")

    final = {
        "model": model["name"],
        "quant": model["quant"],
        "label": f"FINAL-{model['name']}",
        "avg_tps": round(avg, 2),
        "best_tps": round(best_tps, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": p,
    }
    ALL_RESULTS.append(final)
    return final
|
||||
|
||||
|
||||
# ─── Main ────────────────────────────────────────────────────────
|
||||
|
||||
def run_full_benchmark_for_model(model):
    """Run phases 0-5 for a single model and return its best result.

    Args:
        model: Entry from ``MODELS``.

    Returns:
        The final result dict; the phase-0 baseline when phase 1 yields
        nothing; or ``None`` when the model file is missing or the model
        cannot boot at all.
    """
    log(f"\n{'#'*70}")
    log(f" MODEL: {model['name']}")
    log(f" File: {model['path']}")

    # Check the file exists BEFORE asking for its size: os.path.getsize raises
    # FileNotFoundError on a missing file, which would abort the whole run
    # instead of skipping this model.
    if not os.path.exists(model["path"]):
        log(f"{'#'*70}")
        log(" SKIP: Model file not found!")
        return None

    log(f" Size: {os.path.getsize(model['path'])/1024**3:.2f} GB")
    log(f"{'#'*70}")

    # Phase 0: Can it boot?
    baseline = phase0_boot_test(model)
    if not baseline:
        log(f" SKIP: {model['name']} cannot boot at 256K context!")
        return None

    # Phase 1: GPU offload strategy
    best = phase1_gpu_offload(model, baseline)
    if not best:
        return baseline

    # Phases 2-4 each refine the previous winner along one dimension.
    best = phase2_threads(model, best)   # CPU threads
    best = phase3_batch(model, best)     # batch sizes
    best = phase4_kvcache(model, best)   # KV cache types

    # Phase 5: Final verification
    return phase5_final(model, best)
|
||||
|
||||
|
||||
def main():
    """Benchmark every model in ``MODELS`` and write the JSON and text reports.

    After each model the accumulated ``ALL_RESULTS`` are dumped to
    ``scripts/dual_gpu_results.json``. At the end the per-model winners are
    ranked by ``avg_tps``, a summary with a recommended command line is
    printed and written to ``scripts/dual_gpu_summary.txt``, the JSON is
    rewritten, and any remaining server is killed. If no model produced a
    winner the function returns right after the final banner.
    """
    started_at = time.time()
    banner = "=" * 70

    log(banner)
    log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK")
    log(" 2x RTX 3060 (24GB Total) | 256K Context")
    log(f" Models: {len(MODELS)}")
    log(f" Started: {datetime.datetime.now().isoformat()}")
    log(banner)

    # Show GPU info
    for gpu in get_vram_all():
        log(f" GPU {gpu['gpu']}: {gpu['used']}/{gpu['total']} MiB used")

    # Run benchmarks for each model
    model_winners = []
    for position, model in enumerate(MODELS, start=1):
        log(f"\n{banner}")
        log(f" STARTING MODEL {position}/{len(MODELS)}: {model['name']}")
        log(f"{banner}")

        winner = run_full_benchmark_for_model(model)
        if winner:
            model_winners.append(winner)

        # Save intermediate results so a crash later does not lose them.
        with open("scripts/dual_gpu_results.json", "w") as f:
            json.dump(ALL_RESULTS, f, indent=2, default=str)
        log(f" Intermediate results saved ({len(ALL_RESULTS)} configs tested)")

    # ─── Grand Final Comparison ──────────────────────────────────
    elapsed = (time.time() - started_at) / 60

    log(f"\n{banner}")
    log(" GRAND FINAL COMPARISON")
    log(f" Total time: {elapsed:.1f} minutes")
    log(f" Configs tested: {len(ALL_RESULTS)}")
    log(f"{banner}")

    if not model_winners:
        log(" No models were able to run at 256K context!")
        return

    # Sort by avg t/s, fastest first.
    model_winners.sort(key=lambda entry: entry["avg_tps"], reverse=True)

    rule = "=" * 60
    summary_lines = [
        f"Dual-GPU Benchmark Results — {datetime.datetime.now().isoformat()}",
        "Hardware: 2x RTX 3060 12GB | Context: 256K",
        f"Total configs tested: {len(ALL_RESULTS)}",
        f"Total time: {elapsed:.1f} minutes",
        "",
        rule,
        " RANKING (by AVG t/s)",
        rule,
    ]

    medals = ("🥇", "🥈", "🥉")
    for rank, w in enumerate(model_winners, 1):
        # Ranks beyond third get a blank placeholder instead of a medal.
        medal = medals[rank - 1] if rank <= len(medals) else " "
        p = w["params"]
        summary_lines.append(f"\n {medal} #{rank}: {w['model']}")
        summary_lines.append(f" AVG: {w['avg_tps']:.2f} t/s | BEST: {w['best_tps']:.2f} t/s")
        summary_lines.append(f" Boot: {w['boot_time']:.0f}s")
        summary_lines.append(f" ngl={p['ngl']} t={p['t']} ub={p['ub']} b={p['b']}")
        summary_lines.append(f" ctk={p['ctk']} ctv={p['ctv']}")
        if p.get("cpu_moe"):
            summary_lines.append(" --cpu-moe")
        elif p.get("n_cpu_moe", 0) > 0:
            summary_lines.append(f" --n-cpu-moe {p['n_cpu_moe']}")

    champion = model_winners[0]
    summary_lines.extend([
        f"\n{rule}",
        f" ★ CHAMPION: {champion['model']}",
        f" {champion['avg_tps']:.2f} t/s average",
        f"{rule}",
    ])

    # Build recommended command: map the champion's name back to its file.
    p = champion["params"]
    model_names = [m["name"] for m in MODELS]
    champion_path = MODELS[model_names.index(champion["model"])]["path"]
    cmd_parts = [
        f"llama-server --model {champion_path}",
        f"-ngl {p['ngl']} -c {CONTEXT}",
        f"-t {p['t']} -tb {p['t']}",
        f"-ub {p['ub']} -b {p['b']}",
        "-fa on",
        f"--cache-type-k {p['ctk']} --cache-type-v {p['ctv']}",
        f"--prio {p.get('prio', 3)} --poll 50",
        "--mlock",
    ]
    if p.get("cpu_moe"):
        cmd_parts.append("--cpu-moe")
    elif p.get("n_cpu_moe", 0) > 0:
        cmd_parts.append(f"--n-cpu-moe {p['n_cpu_moe']}")
    if p.get("nommap"):
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")

    summary_lines.append("\n Recommended command:")
    summary_lines.append(f" {' '.join(cmd_parts)}")

    summary = "\n".join(summary_lines)
    print(summary)

    with open("scripts/dual_gpu_summary.txt", "w", encoding="utf-8") as f:
        f.write(summary)

    with open("scripts/dual_gpu_results.json", "w") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)

    log("\n Results: scripts/dual_gpu_results.json")
    log(" Summary: scripts/dual_gpu_summary.txt")
    log(" DONE!")

    kill_server()
|
||||
|
||||
|
||||
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user