feat: Variet Engine v1.0 + 5-model tuning complete

Phase 01 (LLM Tuning): - Gemma4 26B: 74.65 t/s (fast) - Qwen 35B: 61.62 t/s (balanced) - Gemma4 31B: 16.0 t/s (deep-coder) - Qwen 27B: 16.7 t/s (deep-logic) - Qwen 122B: 8.95 t/s (ultra, GPU 1 only) Phase 02 (API Engine): - FastAPI reverse proxy on port 8000 - /engine/switch hot-swap with 503 protection - config/engine_models.json as single source of truth - Replaced 4 individual .bat files with unified engine File cleanup: - scripts/ 85 files -> 9 + _archive/ - Root .bat files -> _archive/
2026-04-07 18:08:58 +09:00
parent 7c7a899fd5
commit c111b3a9b0
414 changed files with 3402 additions and 68598 deletions
--- a/scripts/_archive/tuning/auto_tune_qwen35b_256k.py
+++ b/scripts/_archive/tuning/auto_tune_qwen35b_256k.py
@@ -0,0 +1,335 @@
+"""
+Qwen3.5 35B-A3B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
+Based on existing optimized setting: --cpu-moe -ngl 999 -c 4096 -t 6 (35 t/s)
+Now tuning for -c 262144 (256K context).
+
+Phase 1: --cpu-moe vs no --cpu-moe baseline
+Phase 2: -t / -tb sweep
+Phase 3: -ub / -b sweep
+Phase 4: --cache-type-k/v sweep
+Phase 5: Misc (mmap, poll, prio)
+"""
+import subprocess
+import time
+import json
+import urllib.request
+import sys
+import os
+
+# Force UTF-8 console output: the progress messages below contain non-ASCII
+# symbols. Streams that lack reconfigure() are left untouched.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass
+
+# Address the benchmark client talks to; the port must match the "--port"
+# value that build_cmd() passes to the server.
+BASE_URL = "http://127.0.0.1:8000"
+# Relative paths: start_server() launches with the current working directory,
+# so these resolve against wherever the script is invoked from.
+LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
+MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
+# Value passed to "-c" for every tested configuration (262144 = 256 * 1024).
+CONTEXT = 262144
+# Number of timed requests per configuration in test_config().
+BENCHMARK_RUNS = 3
+# Default "max_tokens" for a timed request in run_benchmark().
+BENCHMARK_TOKENS = 200
+
+# Starting configuration. main() copies it and overwrites individual keys with
+# each phase's winner. Keys map to llama-server flags in build_cmd().
+BEST = {
+    "ngl": 999,        # -ngl
+    "cpu_moe": True,   # adds --cpu-moe when truthy
+    "t": 6,            # -t
+    "tb": 6,           # -tb
+    "ub": 512,         # -ub
+    "b": 2048,         # -b
+    "ctk": "q4_0",     # --cache-type-k
+    "ctv": "q4_0",     # --cache-type-v
+    "fa": "on",        # -fa
+    "mlock": True,     # adds --mlock when truthy
+    "mmap": True,      # adds --no-mmap when falsy
+    "prio": 2,         # --prio
+    "poll": 50,        # --poll
+}
+
+# One dict per successfully benchmarked configuration; appended by
+# test_config() and dumped to JSON at the end of main().
+ALL_RESULTS = []
+
+
+def kill_server():
+    """Force-kill every running llama-server.exe, then pause four seconds.
+
+    Relies on the Windows ``taskkill`` utility. Its output is captured and
+    discarded and its exit status is not checked, so a non-zero status (for
+    example when no server is running) does not raise.
+    """
+    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
+    subprocess.run(taskkill_cmd, capture_output=True)
+    # Fixed settle delay before the caller launches the next server instance.
+    time.sleep(4)
+
+
+def build_cmd(cfg):
+    """Translate a tuning config dict into the llama-server argument list.
+
+    Args:
+        cfg: Mapping with the keys ``ngl``, ``fa``, ``ctk``, ``ctv``, ``ub``,
+            ``b``, ``t``, ``tb``, ``prio``, ``poll``, ``mlock`` and ``mmap``;
+            ``cpu_moe`` is optional and treated as off when absent.
+
+    Returns:
+        A list of strings suitable for ``subprocess.Popen``: executable,
+        model path, the valued flags, then any enabled boolean switches.
+    """
+    argv = [LLAMA_SERVER, "--model", MODEL]
+
+    # Flags that always appear, each followed by exactly one value.
+    valued_flags = (
+        ("-ngl", str(cfg["ngl"])),
+        ("-c", str(CONTEXT)),
+        ("-np", "1"),
+        ("-fa", cfg["fa"]),
+        ("--cache-type-k", cfg["ctk"]),
+        ("--cache-type-v", cfg["ctv"]),
+        ("-ub", str(cfg["ub"])),
+        ("-b", str(cfg["b"])),
+        ("-t", str(cfg["t"])),
+        ("-tb", str(cfg["tb"])),
+        ("--prio", str(cfg["prio"])),
+        ("--poll", str(cfg["poll"])),
+        ("--port", "8000"),
+        ("--host", "0.0.0.0"),
+    )
+    for flag, value in valued_flags:
+        argv += [flag, value]
+
+    # Bare switches, appended only when their condition holds.
+    switches = (
+        ("--cpu-moe", cfg.get("cpu_moe")),
+        ("--mlock", cfg["mlock"]),
+        ("--no-mmap", not cfg["mmap"]),
+    )
+    argv.extend(flag for flag, enabled in switches if enabled)
+    return argv
+
+
+def start_server(cfg):
+    """Launch llama-server for ``cfg`` and return its process handle.
+
+    The server's stdout and stderr are discarded. Nothing in this script ever
+    reads them, and an undrained ``subprocess.PIPE`` lets the child block on
+    a log write once the OS pipe buffer fills, which would stall the
+    benchmark.
+
+    Args:
+        cfg: Tuning config dict accepted by ``build_cmd``.
+
+    Returns:
+        The ``subprocess.Popen`` object for the started server. The caller is
+        responsible for terminating it.
+    """
+    return subprocess.Popen(
+        build_cmd(cfg),
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+        cwd=os.getcwd(),
+    )
+
+
+def wait_for_server(timeout=240):
+    """Poll the server's ``/health`` endpoint until it reports ready.
+
+    Args:
+        timeout: Maximum number of seconds to keep polling.
+
+    Returns:
+        True as soon as the endpoint answers with JSON whose ``status`` is
+        ``"ok"``; False if that has not happened once ``timeout`` elapses.
+    """
+    # Monotonic clock: the deadline is unaffected by system clock changes.
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        try:
+            with urllib.request.urlopen(f"{BASE_URL}/health", timeout=3) as resp:
+                data = json.loads(resp.read())
+            if data.get("status") == "ok":
+                return True
+        except Exception:
+            # Connection refused, HTTP error, or a non-JSON body all mean
+            # "not ready yet". Catching Exception rather than using a bare
+            # except lets Ctrl+C and SystemExit propagate.
+            pass
+        time.sleep(2)
+    return False
+
+
+def run_benchmark(max_tokens=BENCHMARK_TOKENS):
+    """Send one chat-completion request and return its throughput.
+
+    Args:
+        max_tokens: Value for the request's ``max_tokens`` field.
+
+    Returns:
+        ``usage.completion_tokens`` from the response divided by the elapsed
+        seconds of the whole HTTP round trip, or 0 when no token count is
+        reported.
+
+    Raises:
+        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise on a
+        failed request or malformed response; callers handle these.
+    """
+    payload = json.dumps({
+        "model": "local-model",
+        "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }).encode("utf-8")
+
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+
+    # perf_counter is the high-resolution monotonic clock intended for
+    # interval timing; time.time() is coarser and can jump when the system
+    # clock is adjusted.
+    start = time.perf_counter()
+    with urllib.request.urlopen(req, timeout=300) as resp:
+        result = json.loads(resp.read())
+    elapsed = time.perf_counter() - start
+
+    usage = result.get("usage", {})
+    ct = usage.get("completion_tokens", 0)
+    return ct / elapsed if elapsed > 0 else 0
+
+
+def get_vram():
+    """Return ``(used_mib, total_mib)`` of GPU memory as seen by nvidia-smi.
+
+    nvidia-smi prints one ``used, total`` line per GPU. The values are summed
+    over all reported GPUs, so a single-GPU machine yields that GPU's numbers
+    and a multi-GPU machine yields the combined figures. Parsing the whole
+    output as a single line fails as soon as a second GPU is present.
+
+    Returns:
+        A tuple of two ints. ``(0, 0)`` when nvidia-smi is unavailable, times
+        out, or prints something that cannot be parsed.
+    """
+    try:
+        r = subprocess.run(
+            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
+             "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=5
+        )
+        used_sum = 0
+        total_sum = 0
+        for line in r.stdout.splitlines():
+            if not line.strip():
+                continue
+            # A malformed line raises ValueError, either from the unpacking
+            # or from int().
+            used, total = line.split(",")
+            used_sum += int(used.strip())
+            total_sum += int(total.strip())
+        return used_sum, total_sum
+    except (OSError, subprocess.SubprocessError, ValueError):
+        # Best effort: VRAM figures are informational only.
+        return 0, 0
+
+
+def test_config(cfg, label=""):
+    """Benchmark one server configuration and record the outcome.
+
+    Kills any running server, starts a new one with ``cfg``, waits for it to
+    become healthy, performs one untimed warmup request and then
+    ``BENCHMARK_RUNS`` timed requests.
+
+    Args:
+        cfg: Tuning config dict accepted by ``build_cmd``.
+        label: Short description used in progress output and stored with the
+            result; falls back to ``str(cfg)`` for display when empty.
+
+    Returns:
+        A dict holding every ``cfg`` entry plus ``avg_tps``, ``best_tps``,
+        ``vram_used``, ``vram_total`` and ``label`` (also appended to
+        ``ALL_RESULTS``), or None if the server never became healthy or every
+        timed run failed.
+    """
+    kill_server()
+    desc = label or str(cfg)
+    print(f"  [{desc}] Starting server...", flush=True)
+    proc = start_server(cfg)
+
+    speeds = []
+    try:
+        if not wait_for_server():
+            print(f"  [{desc}] FAILED to start")
+            return None
+
+        vram_used, vram_total = get_vram()
+        print(f"  [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)
+
+        # Warmup: result and errors are deliberately ignored; a persistent
+        # failure shows up in the timed runs below.
+        try:
+            run_benchmark(max_tokens=20)
+        except Exception:
+            pass
+
+        for _ in range(BENCHMARK_RUNS):
+            try:
+                speeds.append(run_benchmark())
+            except Exception as e:
+                print(f"ERR({e}) ", end="", flush=True)
+    finally:
+        # Always stop and reap the server, including on Ctrl+C or an
+        # unexpected error, so it cannot linger into the next configuration.
+        proc.kill()
+        proc.wait()
+
+    if not speeds:
+        print("ALL FAILED")
+        return None
+
+    avg = sum(speeds) / len(speeds)
+    best = max(speeds)
+    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
+
+    result = {**cfg, "avg_tps": avg, "best_tps": best,
+              "vram_used": vram_used, "vram_total": vram_total, "label": label}
+    ALL_RESULTS.append(result)
+    return result
+
+
+def phase_sweep(phase_name, param_name, values, base_cfg):
+    """Benchmark each candidate value of one parameter group and pick the best.
+
+    Args:
+        phase_name: Heading printed for this phase.
+        param_name: A single config key, or a list of keys swept together.
+        values: Candidates to try; when ``param_name`` is a list, each
+            candidate is a sequence paired positionally with the keys.
+        base_cfg: Config the candidates are applied on top of; not modified.
+
+    Returns:
+        The ``test_config`` result with the highest ``avg_tps`` (the earliest
+        one on ties), or None if no candidate produced a result.
+    """
+    banner = "=" * 70
+    print(f"\n{banner}")
+    print(f"  PHASE: {phase_name}")
+    print(f"  Sweeping: {param_name} = {values}")
+    print(banner)
+
+    grouped = isinstance(param_name, list)
+    winner = None
+    for candidate in values:
+        # Normalise both call shapes to a list of (key, value) pairs.
+        if grouped:
+            pairs = list(zip(param_name, candidate))
+        else:
+            pairs = [(param_name, candidate)]
+
+        trial_cfg = dict(base_cfg)
+        trial_cfg.update(pairs)
+        tag = " | ".join(f"{key}={setting}" for key, setting in pairs)
+
+        outcome = test_config(trial_cfg, tag)
+        if not outcome:
+            continue
+        if winner is None or outcome["avg_tps"] > winner["avg_tps"]:
+            winner = outcome
+
+    if winner:
+        print(f"\n  ★ Phase winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
+    return winner
+
+
+def main():
+    """Run the five tuning phases, verify the winning config, and report it.
+
+    Starting from a copy of ``BEST``, each phase sweeps one parameter group
+    with ``phase_sweep`` and carries the winner's values into later phases.
+    The final config is then re-benchmarked five times, an equivalent
+    ``llama-server`` command line is printed, and every recorded result is
+    written to ``scripts/tune_results_qwen35b_256k.json``.
+    """
+    print("=" * 70)
+    print("  Qwen3.5 35B-A3B COMPREHENSIVE Auto-Tuner")
+    print("  256K Context | RTX 3060 12GB")
+    print("  Baseline: --cpu-moe -ngl 999 -c 4096 -t 6 → 35 t/s")
+    print("=" * 70)
+    print()
+
+    cfg = dict(BEST)
+
+    # (phase title, swept key or keys, candidate values), run in this order.
+    phases = [
+        # ─── Phase 1: --cpu-moe critical test ───
+        ("CPU-MoE Mode", "cpu_moe", [True, False]),
+        # ─── Phase 2: CPU threads ───
+        ("CPU Threads (-t, -tb)", ["t", "tb"], [
+            (2, 2), (4, 4), (4, 6), (6, 6), (6, 8),
+            (8, 8), (8, 12), (10, 10), (12, 12),
+        ]),
+        # ─── Phase 3: Batch sizes ───
+        ("Batch Sizes (-ub, -b)", ["ub", "b"], [
+            (128, 512), (256, 1024), (256, 2048),
+            (512, 1024), (512, 2048), (512, 4096),
+            (1024, 2048), (1024, 4096),
+        ]),
+        # ─── Phase 4: KV cache ───
+        ("KV Cache Type", ["ctk", "ctv"], [
+            ("q4_0", "q4_0"),
+            ("q8_0", "q8_0"),
+            ("f16", "f16"),
+        ]),
+        # ─── Phase 5: Misc ───
+        ("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], [
+            (True, 50, 2),
+            (False, 50, 2),
+            (True, 0, 2),
+            (True, 100, 2),
+            (True, 50, 3),
+        ]),
+    ]
+    for phase_name, params, candidates in phases:
+        winner = phase_sweep(phase_name, params, candidates, cfg)
+        if winner:
+            # Keep the winning values; a phase with no result leaves cfg as is.
+            keys = [params] if isinstance(params, str) else params
+            for key in keys:
+                cfg[key] = winner[key]
+
+    # ─── Final Report ───
+    print()
+    print("=" * 70)
+    print("  FINAL OPTIMAL CONFIGURATION")
+    print("=" * 70)
+    for k, v in cfg.items():
+        print(f"  {k:>12}: {v}")
+    print()
+
+    # Final verification
+    print("  Running final verification (5 runs)...")
+    kill_server()
+    proc = start_server(cfg)
+    final_speeds = []
+    try:
+        if wait_for_server():
+            # Untimed warmup; a failure here is reported but not fatal.
+            try:
+                run_benchmark(max_tokens=20)
+            except Exception as e:
+                print(f"    Warmup failed: {e}")
+            for i in range(5):
+                try:
+                    tps = run_benchmark()
+                except Exception as e:
+                    print(f"    Run {i+1}: ERR({e})")
+                    continue
+                final_speeds.append(tps)
+                print(f"    Run {i+1}: {tps:.2f} t/s")
+        else:
+            print("    Server did not become healthy; verification skipped")
+    finally:
+        # Stop and reap the server even if verification is interrupted.
+        proc.kill()
+        proc.wait()
+
+    if final_speeds:
+        avg = sum(final_speeds) / len(final_speeds)
+        best = max(final_speeds)
+        print(f"\n  ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")
+
+    print()
+    cmd_parts = [
+        f"llama-server --model {MODEL}",
+        f"-ngl {cfg['ngl']} -c {CONTEXT}",
+    ]
+    if cfg.get("cpu_moe"):
+        cmd_parts.append("--cpu-moe")
+    cmd_parts.extend([
+        f"-t {cfg['t']} -tb {cfg['tb']}",
+        f"-ub {cfg['ub']} -b {cfg['b']}",
+        f"-fa {cfg['fa']}",
+        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
+        f"--prio {cfg['prio']} --poll {cfg['poll']}",
+    ])
+    if cfg["mlock"]:
+        cmd_parts.append("--mlock")
+    if not cfg["mmap"]:
+        cmd_parts.append("--no-mmap")
+    cmd_parts.append("--port 8000 --host 0.0.0.0")
+
+    print("  Recommended command:")
+    print(f"    {' '.join(cmd_parts)}")
+    print("=" * 70)
+
+    results_path = "scripts/tune_results_qwen35b_256k.json"
+    # Create the target directory if needed so a long tuning run is not lost
+    # to a missing folder at the very last step.
+    os.makedirs(os.path.dirname(results_path), exist_ok=True)
+    with open(results_path, "w") as f:
+        json.dump(ALL_RESULTS, f, indent=2, default=str)
+    print(f"\n  Full results saved: {results_path}")
+
+
+# Run the tuner only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()