feat: Variet Engine v1.0 + 5-model tuning complete

Phase 01 (LLM Tuning): - Gemma4 26B: 74.65 t/s (fast) - Qwen 35B: 61.62 t/s (balanced) - Gemma4 31B: 16.0 t/s (deep-coder) - Qwen 27B: 16.7 t/s (deep-logic) - Qwen 122B: 8.95 t/s (ultra, GPU 1 only) Phase 02 (API Engine): - FastAPI reverse proxy on port 8000 - /engine/switch hot-swap with 503 protection - config/engine_models.json as single source of truth - Replaced 4 individual .bat files with unified engine File cleanup: - scripts/ 85 files -> 9 + _archive/ - Root .bat files -> _archive/
2026-04-07 18:08:58 +09:00
parent 7c7a899fd5
commit c111b3a9b0
414 changed files with 3402 additions and 68598 deletions
--- a/scripts/_archive/benchmarks/deep_tier_auto_test.py
+++ b/scripts/_archive/benchmarks/deep_tier_auto_test.py
@@ -0,0 +1,177 @@
+import subprocess
+import time
+import urllib.request
+import json
+import sys
+import os
+
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except:
+    pass
+
+BASE_URL = "http://127.0.0.1:8000"  # Root URL for the /health and /v1/chat/completions requests; port matches "--port" in MODELS
+RESULTS_FILE = "scripts/deep_tier_auto_results.json"  # JSON output written by main(); relative path, so it depends on the working directory
+
+MODELS = [  # Server configurations benchmarked in order by main(): a display name plus the full command line handed to subprocess.Popen
+    {
+        "name": "Qwen 27B - 256K (q4_0)",
+        "cmd": [
+            r"llama_bin_run\llama-server.exe",  # relative Windows path; presumably the script must be started from the directory containing llama_bin_run - confirm
+            "--model", r"models\Qwen3.5-27B-Q4_K_M.gguf",
+            "-ngl", "999", "-c", "262144", "-np", "1", "-fa", "on",  # -c 262144 = 256 * 1024, the "256K" in the name
+            "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",  # the "(q4_0)" in the name
+            "-ub", "128", "-b", "512", "-t", "6", "-tb", "6",  # smaller -ub / -b values than the Gemma entries below
+            "--prio", "3", "--mlock", "--poll", "50", "-ts", "0.5,0.5",  # only this entry passes -ts; presumably an even split across two GPUs - confirm against llama-server docs
+            "--port", "8000", "--host", "0.0.0.0"  # port must match BASE_URL; NOTE(review): 0.0.0.0 listens on all interfaces although this script only connects via 127.0.0.1
+        ]
+    },
+    {
+        "name": "Gemma 31B - 32K (q4_0)",
+        "cmd": [
+            r"llama_bin_run\llama-server.exe",
+            "--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
+            "-ngl", "999", "-c", "32768", "-np", "1", "-fa", "on",  # -c 32768 = 32 * 1024, the "32K" in the name
+            "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
+            "-ub", "256", "-b", "1024", "-t", "6", "-tb", "6",
+            "--prio", "3", "--mlock", "--poll", "50",
+            "--port", "8000", "--host", "0.0.0.0"  # same port / host as the first entry
+        ]
+    },
+    {
+        "name": "Gemma 31B - 64K (q4_0)",
+        "cmd": [
+            r"llama_bin_run\llama-server.exe",
+            "--model", r"models\gemma-4-31B-it-Q4_K_M.gguf",
+            "-ngl", "999", "-c", "65536", "-np", "1", "-fa", "on",  # -c 65536 = 64 * 1024; the only flag that differs from the 32K entry
+            "--cache-type-k", "q4_0", "--cache-type-v", "q4_0",
+            "-ub", "256", "-b", "1024", "-t", "6", "-tb", "6",
+            "--prio", "3", "--mlock", "--poll", "50",
+            "--port", "8000", "--host", "0.0.0.0"  # same port / host as the first entry
+        ]
+    }
+]
+
+TEST_PROMPTS = [  # Prompts sent to every model by main(); each "id" becomes a key under "tests" in the results JSON
+    {
+        "id": "code",  # code-generation task
+        "prompt": "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code."
+    },
+    {
+        "id": "logical",  # step-by-step arithmetic word problem
+        "prompt": "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step."
+    }
+]
+
+def check_server(timeout=300):
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            req = urllib.request.Request(f"{BASE_URL}/health")
+            resp = json.loads(urllib.request.urlopen(req, timeout=2).read())
+            if resp.get("status") == "ok" or resp.get("status") == "ready":
+                return True
+        except:
+            pass
+        time.sleep(5)
+    return False
+
+def get_vram_usage():
+    try:
+        out = subprocess.check_output(
+            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total", "--format=csv,noheader"],
+            text=True
+        )
+        return out.strip().split("\n")
+    except:
+        return ["Failed to get VRAM info"]
+
+def ask(prompt, max_tokens=300):
+    payload = json.dumps({
+        "model": "m",
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }).encode()
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+    t0 = time.time()
+    resp = json.loads(urllib.request.urlopen(req, timeout=120).read())
+    dt = time.time() - t0
+    usage = resp.get("usage", {})
+    content = resp["choices"][0]["message"]["content"]
+    
+    tokens = usage.get("completion_tokens", 0)
+    tps = round(tokens / dt, 2) if dt > 0 else 0
+    return {"time": round(dt,2), "tokens": tokens, "tps": tps, "res": content[:150] + "..."}
+
+def main():
+    results = []
+    
+    # Kill any existing llama-server
+    subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    time.sleep(3)
+    
+    for cfg in MODELS:
+        print(f"\n[{time.strftime('%H:%M:%S')}] Starting {cfg['name']}...")
+        
+        # Start server
+        proc = subprocess.Popen(cfg["cmd"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        
+        # Wait for boot
+        print(f"Waiting for server to boot (up to 5 mins)...")
+        is_ready = check_server(300)
+        
+        if not is_ready:
+            print(f"❌ Failed to boot {cfg['name']}.")
+            results.append({"name": cfg["name"], "status": "Failed to boot (OOM or Crash)"})
+            proc.terminate()
+            subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            time.sleep(5)
+            continue
+            
+        print(f"✅ Server Ready!")
+        vram = get_vram_usage()
+        
+        # Warmup
+        try:
+            ask("Hello", max_tokens=10)
+        except Exception as e:
+            pass
+            
+        test_data = {}
+        for p in TEST_PROMPTS:
+            print(f"  Testing {p['id']}...", end="", flush=True)
+            try:
+                res = ask(p["prompt"])
+                test_data[p["id"]] = res
+                print(f" {res['tps']} t/s")
+            except Exception as e:
+                test_data[p["id"]] = {"error": str(e)}
+                print(f" ERROR: {e}")
+                
+        results.append({
+            "name": cfg["name"],
+            "status": "Success",
+            "vram": vram,
+            "tests": test_data
+        })
+        
+        # Save incremental
+        with open(RESULTS_FILE, "w", encoding="utf-8") as f:
+            json.dump(results, f, ensure_ascii=False, indent=2)
+            
+        # Shutdown
+        print("Shutting down server...")
+        proc.terminate()
+        subprocess.run("taskkill /F /IM llama-server.exe", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        time.sleep(5)
+
+    print("\n✅ All tests complete!")
+    print(f"Results saved to {RESULTS_FILE}")
+
+if __name__ == "__main__":
+    main()