feat: Variet Engine v1.0 + 5-model tuning complete

Phase 01 (LLM Tuning):
- Gemma4 26B: 74.65 t/s (fast)
- Qwen 35B: 61.62 t/s (balanced)
- Gemma4 31B: 16.0 t/s (deep-coder)
- Qwen 27B: 16.7 t/s (deep-logic)
- Qwen 122B: 8.95 t/s (ultra, GPU 1 only)

Phase 02 (API Engine):
- FastAPI reverse proxy on port 8000
- /engine/switch hot-swap with 503 protection
- config/engine_models.json as single source of truth
- Replaced 4 individual .bat files with unified engine

File cleanup:
- scripts/ 85 files -> 9 + _archive/
- Root .bat files -> _archive/
This commit is contained in:
Variet-Worker
2026-04-07 18:08:58 +09:00
parent 7c7a899fd5
commit c111b3a9b0
414 changed files with 3402 additions and 68598 deletions

View File

@@ -0,0 +1,335 @@
"""
Qwen3.5 35B-A3B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
Based on the existing optimized setting: --cpu-moe -ngl 999 -c 4096 -t 6 (35 t/s).
This script re-tunes the same model for -c 262144 (256K context).
Phase 1: --cpu-moe vs no --cpu-moe baseline
Phase 2: -t / -tb sweep
Phase 3: -ub / -b sweep
Phase 4: --cache-type-k/v sweep
Phase 5: Misc (mmap, poll, prio)
"""
import subprocess
import time
import json
import urllib.request
import sys
import os
# Switch console output to UTF-8 so the non-ASCII glyphs in the report print
# cleanly; stdout objects that lack reconfigure() are left untouched.
_reconfigure_stdout = getattr(sys.stdout, "reconfigure", None)
if _reconfigure_stdout is not None:
    _reconfigure_stdout(encoding='utf-8')
# Endpoint of the llama-server instance under test (same port as the
# "--port" argument emitted by build_cmd).
BASE_URL = "http://127.0.0.1:8000"

# Server binary and model file, relative to the working directory.
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"

CONTEXT = 256 * 1024  # 262144; passed to the server as -c
BENCHMARK_RUNS = 3  # timed requests per tested configuration
BENCHMARK_TOKENS = 200  # default max_tokens of a timed request

# Starting configuration. main() works on a copy and overwrites individual
# keys with the winner of each sweep phase.
BEST = {
    "ngl": 999,
    "cpu_moe": True,
    "t": 6,
    "tb": 6,
    "ub": 512,
    "b": 2048,
    "ctk": "q4_0",
    "ctv": "q4_0",
    "fa": "on",
    "mlock": True,
    "mmap": True,
    "prio": 2,
    "poll": 50,
}

# One entry per successfully benchmarked configuration (filled by test_config).
ALL_RESULTS = []
def kill_server():
    """Force-kill llama-server.exe via ``taskkill`` and then pause 4 seconds.

    The taskkill output and exit status are captured and ignored, so calling
    this when no server is running is harmless.
    """
    taskkill = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill, capture_output=True)
    # Fixed settle delay before the caller starts the next server
    # (presumably to let the old process release its resources).
    time.sleep(4)
def build_cmd(cfg):
    """Translate a tuning configuration into a llama-server argument list.

    Args:
        cfg: Mapping with the keys ``ngl``, ``fa``, ``ctk``, ``ctv``, ``ub``,
            ``b``, ``t``, ``tb``, ``prio``, ``poll``, ``mlock`` and ``mmap``;
            ``cpu_moe`` is optional and treated as off when absent.

    Returns:
        The argv list: executable, model, valued options, then the boolean
        switches ``--cpu-moe``, ``--mlock`` and ``--no-mmap`` that apply.

    Raises:
        KeyError: If a required key is missing from ``cfg``.
    """
    # (flag, value) pairs in the order they appear on the command line.
    # ``fa``/``ctk``/``ctv`` are passed through as-is; numeric settings are
    # stringified.
    valued_options = [
        ("-ngl", str(cfg["ngl"])),
        ("-c", str(CONTEXT)),
        ("-np", "1"),
        ("-fa", cfg["fa"]),
        ("--cache-type-k", cfg["ctk"]),
        ("--cache-type-v", cfg["ctv"]),
        ("-ub", str(cfg["ub"])),
        ("-b", str(cfg["b"])),
        ("-t", str(cfg["t"])),
        ("-tb", str(cfg["tb"])),
        ("--prio", str(cfg["prio"])),
        ("--poll", str(cfg["poll"])),
        ("--port", "8000"),
        ("--host", "0.0.0.0"),
    ]
    argv = [LLAMA_SERVER, "--model", MODEL]
    for flag, value in valued_options:
        argv.extend((flag, value))

    # Boolean switches: each flag is appended only when its condition holds.
    switches = [
        ("--cpu-moe", cfg.get("cpu_moe")),
        ("--mlock", cfg["mlock"]),
        ("--no-mmap", not cfg["mmap"]),
    ]
    argv.extend(flag for flag, enabled in switches if enabled)
    return argv
def start_server(cfg):
    """Launch llama-server for ``cfg`` and return its ``subprocess.Popen`` handle.

    The server's stdout/stderr are discarded. They used to be attached to a
    pipe that nothing ever read; once the OS pipe buffer fills, the child
    blocks on its next log write and the benchmark stalls.

    Args:
        cfg: Configuration mapping accepted by ``build_cmd``.

    Returns:
        The running process; callers are responsible for killing it.
    """
    return subprocess.Popen(
        build_cmd(cfg),
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,  # stderr follows stdout into DEVNULL
        cwd=os.getcwd(),
    )
def wait_for_server(timeout=240):
    """Poll ``{BASE_URL}/health`` until it reports ``{"status": "ok"}``.

    Args:
        timeout: Maximum number of seconds to keep polling. A value <= 0
            returns ``False`` without issuing any request.

    Returns:
        ``True`` as soon as the health endpoint answers with status "ok",
        ``False`` if that has not happened when the timeout expires.
    """
    # Monotonic clock: the deadline is unaffected by system clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(f"{BASE_URL}/health", timeout=3) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Not up yet (connection refused, HTTP error while loading,
            # malformed body): keep polling. KeyboardInterrupt/SystemExit are
            # deliberately not caught so the wait can be aborted.
            pass
        time.sleep(2)
    return False
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Send one chat-completion request and return its throughput.

    Args:
        max_tokens: Generation limit sent with the request.

    Returns:
        Completion tokens per second, measured over the whole HTTP round trip
        (request upload and prompt processing included). ``0`` when the
        response carries no usable ``usage.completion_tokens``.

    Raises:
        urllib.error.URLError: If the request fails or times out (300 s).
        ValueError: If the response body is not valid JSON.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    # perf_counter is monotonic and high-resolution, unlike time.time().
    started = time.perf_counter()
    with urllib.request.urlopen(req, timeout=300) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - started

    # Tolerate a missing or null usage block / token count.
    usage = result.get("usage") or {}
    completion_tokens = usage.get("completion_tokens") or 0
    return completion_tokens / elapsed if elapsed > 0 else 0
def _parse_vram(report):
    """Sum ``used, total`` MiB pairs from nvidia-smi CSV output, one GPU per line."""
    used_sum = 0
    total_sum = 0
    for line in report.splitlines():
        if not line.strip():
            continue
        fields = line.split(",")
        used_sum += int(fields[0].strip())
        total_sum += int(fields[1].strip())
    return used_sum, total_sum


def get_vram():
    """Return ``(used_mib, total_mib)`` of GPU memory as reported by nvidia-smi.

    On multi-GPU machines nvidia-smi prints one line per GPU; the values are
    summed across all of them. Parsing the whole output as a single record
    failed on such machines and silently reported ``0, 0``.

    Returns:
        A tuple of ints. ``(0, 0)`` when nvidia-smi is unavailable, times out
        (5 s) or prints something that cannot be parsed.
    """
    query = ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits"]
    try:
        completed = subprocess.run(query, capture_output=True, text=True, timeout=5)
        return _parse_vram(completed.stdout)
    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
        # VRAM is informational only; never let it abort a benchmark run.
        return 0, 0
def test_config(cfg, label=""):
    """Benchmark one server configuration and record the outcome.

    Kills any running server, starts a new one with ``cfg``, waits for it to
    become healthy, performs one short warm-up request and then
    ``BENCHMARK_RUNS`` timed requests.

    Args:
        cfg: Configuration mapping accepted by ``start_server``.
        label: Short description used in progress output; defaults to
            ``str(cfg)`` when empty.

    Returns:
        ``cfg`` extended with ``avg_tps``, ``best_tps``, ``vram_used``,
        ``vram_total`` and ``label`` (also appended to ``ALL_RESULTS``), or
        ``None`` when the server failed to start or every timed run failed.
    """
    kill_server()
    desc = label or str(cfg)
    print(f" [{desc}] Starting server...", flush=True)
    proc = start_server(cfg)
    speeds = []
    # try/finally guarantees the server is stopped on every exit path,
    # including Ctrl+C in the middle of a run.
    try:
        if not wait_for_server():
            print(f" [{desc}] FAILED to start")
            return None
        vram_used, vram_total = get_vram()
        print(f" [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)
        # Warmup: best effort, its result and any failure are ignored.
        try:
            run_benchmark(max_tokens=20)
        except Exception:
            pass
        for _ in range(BENCHMARK_RUNS):
            try:
                speeds.append(run_benchmark())
            except Exception as e:
                print(f"ERR({e}) ", end="", flush=True)
    finally:
        proc.kill()
        proc.wait()  # reap the process so no handle is left behind

    if not speeds:
        print("ALL FAILED")
        return None
    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
    result = {**cfg, "avg_tps": avg, "best_tps": best,
              "vram_used": vram_used, "vram_total": vram_total, "label": label}
    ALL_RESULTS.append(result)
    return result
def phase_sweep(phase_name, param_name, values, base_cfg):
    """Benchmark every candidate value of one tuning phase and pick the fastest.

    Args:
        phase_name: Human-readable phase title for the banner.
        param_name: Either a single config key, or a list/tuple of keys that
            are varied together.
        values: Candidate values. When ``param_name`` is a list/tuple, each
            candidate is a sequence with one value per key.
        base_cfg: Configuration the candidates are applied on top of; it is
            not modified.

    Returns:
        The ``test_config`` result with the highest ``avg_tps``, or ``None``
        when every candidate failed.
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print(f" PHASE: {phase_name}")
    print(f" Sweeping: {param_name} = {values}")
    print(banner)

    multi = isinstance(param_name, (list, tuple))
    best_result = None
    for val in values:
        overrides = list(zip(param_name, val)) if multi else [(param_name, val)]
        cfg = {**base_cfg}
        for key, value in overrides:
            cfg[key] = value
        label = " | ".join(f"{key}={value}" for key, value in overrides)
        r = test_config(cfg, label)
        if r and (best_result is None or r["avg_tps"] > best_result["avg_tps"]):
            best_result = r
    if best_result:
        # Separator between label and speed; they used to be printed fused
        # together (e.g. "tb=612.34 t/s").
        print(f"\n ★ Phase winner: {best_result['label']} → {best_result['avg_tps']:.2f} t/s")
    return best_result
def _adopt_winner(cfg, result, keys):
    """Copy ``keys`` from a phase winner into ``cfg``; no-op when the phase had no winner."""
    if result:
        for key in keys:
            cfg[key] = result[key]


def _recommended_command(cfg):
    """Render ``cfg`` as the single-line llama-server command shown in the report."""
    parts = [
        f"llama-server --model {MODEL}",
        f"-ngl {cfg['ngl']} -c {CONTEXT}",
    ]
    if cfg.get("cpu_moe"):
        parts.append("--cpu-moe")
    parts.extend([
        f"-t {cfg['t']} -tb {cfg['tb']}",
        f"-ub {cfg['ub']} -b {cfg['b']}",
        f"-fa {cfg['fa']}",
        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
        f"--prio {cfg['prio']} --poll {cfg['poll']}",
    ])
    if cfg["mlock"]:
        parts.append("--mlock")
    if not cfg["mmap"]:
        parts.append("--no-mmap")
    parts.append("--port 8000 --host 0.0.0.0")
    return " ".join(parts)


def main():
    """Run the five sweep phases, verify the winning configuration, and save all results.

    Each phase starts from the configuration accumulated so far and, when it
    produces a winner, writes the winning values back before the next phase.
    The full result list is written to
    ``scripts/tune_results_qwen35b_256k.json`` (relative to the working
    directory).
    """
    banner = "=" * 70
    print(banner)
    print(" Qwen3.5 35B-A3B COMPREHENSIVE Auto-Tuner")
    print(" 256K Context | RTX 3060 12GB")
    print(" Baseline: --cpu-moe -ngl 999 -c 4096 -t 6 → 35 t/s")
    print(banner)
    print()
    cfg = dict(BEST)

    # ─── Phase 1: --cpu-moe critical test ───
    r = phase_sweep("CPU-MoE Mode", "cpu_moe", [True, False], cfg)
    _adopt_winner(cfg, r, ["cpu_moe"])

    # ─── Phase 2: CPU threads ───
    thread_combos = [
        (2, 2), (4, 4), (4, 6), (6, 6), (6, 8),
        (8, 8), (8, 12), (10, 10), (12, 12)
    ]
    r = phase_sweep("CPU Threads (-t, -tb)", ["t", "tb"], thread_combos, cfg)
    _adopt_winner(cfg, r, ["t", "tb"])

    # ─── Phase 3: Batch sizes ───
    batch_combos = [
        (128, 512), (256, 1024), (256, 2048),
        (512, 1024), (512, 2048), (512, 4096),
        (1024, 2048), (1024, 4096)
    ]
    r = phase_sweep("Batch Sizes (-ub, -b)", ["ub", "b"], batch_combos, cfg)
    _adopt_winner(cfg, r, ["ub", "b"])

    # ─── Phase 4: KV cache ───
    kv_combos = [
        ("q4_0", "q4_0"),
        ("q8_0", "q8_0"),
        ("f16", "f16"),
    ]
    r = phase_sweep("KV Cache Type", ["ctk", "ctv"], kv_combos, cfg)
    _adopt_winner(cfg, r, ["ctk", "ctv"])

    # ─── Phase 5: Misc ───
    misc_combos = [
        (True, 50, 2),
        (False, 50, 2),
        (True, 0, 2),
        (True, 100, 2),
        (True, 50, 3),
    ]
    r = phase_sweep("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], misc_combos, cfg)
    _adopt_winner(cfg, r, ["mmap", "poll", "prio"])

    # ─── Final Report ───
    print()
    print(banner)
    print(" FINAL OPTIMAL CONFIGURATION")
    print(banner)
    for k, v in cfg.items():
        print(f" {k:>12}: {v}")
    print()

    # Final verification
    print(" Running final verification (5 runs)...")
    kill_server()
    proc = start_server(cfg)
    final_speeds = []
    # try/finally: the server is stopped even if verification is interrupted.
    try:
        if wait_for_server():
            # Warmup: best effort, failures are ignored.
            try:
                run_benchmark(max_tokens=20)
            except Exception:
                pass
            for i in range(5):
                try:
                    tps = run_benchmark()
                except Exception as e:
                    # Report the failed run instead of dropping it silently.
                    print(f" Run {i+1}: ERR({e})")
                    continue
                final_speeds.append(tps)
                print(f" Run {i+1}: {tps:.2f} t/s")
        else:
            # Without this check, five requests would each fail silently
            # against a server that never came up.
            print(" Final verification: server FAILED to start")
    finally:
        proc.kill()
        proc.wait()

    if final_speeds:
        avg = sum(final_speeds) / len(final_speeds)
        best = max(final_speeds)
        print(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")
    print()
    print(" Recommended command:")
    print(f" {_recommended_command(cfg)}")
    print(banner)

    results_path = "scripts/tune_results_qwen35b_256k.json"
    # Create the target directory if needed so a long tuning session is not
    # lost to a FileNotFoundError at the very last step.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)
    print(f"\n Full results saved: {results_path}")


if __name__ == "__main__":
    main()