refactor(phase-01): v3 retune fast & balanced roles
fast (Gemma 4 26B-A4B):
- Enable mmproj GPU loading (vision ~1s, 12x faster than CPU)
- KV f16 → q8_0 (save ~2.5 GB VRAM for mmproj)
- Tensor split 0.5,0.5 → 0.43,0.57 (13/17 layers)
- Remove --mlock/--poll/--prio/-t/-tb (no measurable impact)
- measured_tps 74.65 → 71.89 (trade 3.7% speed for vision)
balanced (Qwen 3.5 35B-A3B):
- Tensor split 0.5,0.5 → 0.48,0.52 (enables pipeline parallelism)
- Ubatch 128 → 256 (prefill +78%: 649 → 1,157 t/s)
- mmproj + --no-mmproj-offload (CPU vision, VRAM headroom)
- Remove the same no-impact flags as fast (--mlock/--poll/--prio/-t/-tb)
- measured_tps 61.62 → 64.16 (+4.1%)
Other:
- Document full retuning in docs/v3_{fast,balanced}_retuning_log.md
- Session report at .planning/reports/20260411-session-report.md
- Add bench utilities: bench_short/bench_long/test_ts_ratios
- Experimented with speculative decoding (E2B draft) but rejected it
(+14% gen vs. -31% cold start, plus tokenizer mismatch and mmproj conflict)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
145
scripts/test_ts_ratios.py
Normal file
145
scripts/test_ts_ratios.py
Normal file
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python
|
||||
"""Test multiple -ts ratios to find which ones start normally (no OOM, PP enabled)."""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import sys
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
# Two directory levels above this script file.
ROOT = Path(__file__).parent.parent

# JSON config whose "balanced" role arguments are rewritten for each ratio.
CONFIG_FILE = ROOT.joinpath("config", "engine_models.json")

# LLAMA_LOG is scanned for memory / pipeline-parallelism markers after each
# run; ENGINE_LOG receives the launched engine's stdout and stderr.
LLAMA_LOG = ROOT.joinpath("logs", "llama-server.log")
ENGINE_LOG = ROOT.joinpath("logs", "engine_test.log")

# Interpreter and script used to launch the engine under test.
PYTHON = r"C:\ProgramData\miniforge3\envs\variet-llm\python.exe"
ENGINE_SCRIPT = ROOT.joinpath("engine", "variet_engine.py")

# Candidate -ts pairs, tried in this order. They are kept as strings so the
# exact text ends up in the config file.
RATIOS = [
    ("0.5", "0.5"),
    ("0.48", "0.52"),
    ("0.47", "0.53"),
    ("0.45", "0.55"),
    ("0.43", "0.57"),
    ("0.40", "0.60"),
]

# Best effort: switch stdout to UTF-8. Any failure (for example a stdout
# object without reconfigure()) is deliberately ignored.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except Exception:
    pass
|
||||
|
||||
|
||||
def kill_servers():
    """Force-stop running engine and llama-server processes (Windows only).

    Runs a PowerShell pipeline that enumerates Win32_Process entries and
    force-stops every process whose command line references
    engine<sep>variet_engine.py or whose image name is llama-server.exe,
    then sleeps for two seconds.

    PowerShell's output and exit code are captured and ignored, so a failed
    kill is not reported to the caller.
    """
    # The "?" wildcard matches exactly one character, so the filter accepts
    # both "engine/variet_engine.py" and "engine\variet_engine.py". A literal
    # "/" would never match a command line built from a Windows path.
    ps_pipeline = (
        "Get-WmiObject Win32_Process | "
        "Where-Object { $_.CommandLine -like '*engine?variet_engine.py*' "
        "-or $_.Name -eq 'llama-server.exe' } | "
        "ForEach-Object { Stop-Process -Id $_.ProcessId -Force "
        "-ErrorAction SilentlyContinue }"
    )
    subprocess.run(
        ["powershell", "-Command", ps_pipeline],
        capture_output=True,
    )
    # Fixed two-second pause after the kill command, before the caller continues.
    time.sleep(2)
|
||||
|
||||
|
||||
def update_config(ts_a, ts_b):
    """Set the tensor-split value of the "balanced" role in CONFIG_FILE.

    Loads the JSON config, sets the value following the first "-ts" (or
    "--tensor-split") entry in roles.balanced.args to "<ts_a>,<ts_b>", and
    writes the file back with a two-space indent.

    If the flag is missing, or is the last entry with no value after it,
    the flag and/or value is appended. This guarantees the ratio under test
    is really present in the config rather than the file being rewritten
    unchanged.

    Args:
        ts_a: First split value, inserted verbatim.
        ts_b: Second split value, inserted verbatim.

    Raises:
        KeyError: If the config has no roles -> balanced -> args path.
        OSError / json.JSONDecodeError: If the file cannot be read or parsed.
    """
    with open(CONFIG_FILE, encoding="utf-8") as f:
        cfg = json.load(f)

    args = cfg["roles"]["balanced"]["args"]
    split_value = f"{ts_a},{ts_b}"

    flag_index = next(
        (i for i, arg in enumerate(args) if arg in ("-ts", "--tensor-split")),
        None,
    )
    if flag_index is None:
        # No tensor-split flag configured yet: add flag and value.
        args.extend(["-ts", split_value])
    elif flag_index + 1 < len(args):
        args[flag_index + 1] = split_value
    else:
        # Flag is the final entry and has no value after it.
        args.append(split_value)

    with open(CONFIG_FILE, "w", encoding="utf-8") as f:
        json.dump(cfg, f, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
def start_engine():
    """Launch the engine script in a new process group and return the Popen.

    Before launching, LLAMA_LOG is truncated so that a later log scan only
    sees output from this run. ENGINE_LOG is truncated by opening it in
    "wb" mode and receives the child's stdout and stderr.

    Returns:
        subprocess.Popen: Handle of the started engine process.
    """
    # Make sure the log directories exist; otherwise the writes below fail
    # on a fresh checkout.
    LLAMA_LOG.parent.mkdir(parents=True, exist_ok=True)
    ENGINE_LOG.parent.mkdir(parents=True, exist_ok=True)

    LLAMA_LOG.write_text("")

    # The child receives its own copy of the log handle when it is spawned,
    # so the parent's handle can be closed as soon as Popen returns. Leaving
    # it open would leak one file handle per tested ratio.
    with open(ENGINE_LOG, "wb") as engine_out:
        return subprocess.Popen(
            [PYTHON, str(ENGINE_SCRIPT)],
            cwd=str(ROOT),
            stdout=engine_out,
            stderr=subprocess.STDOUT,
            creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
        )
|
||||
|
||||
|
||||
def wait_for_result(timeout=180, url="http://localhost:8000/engine/status", poll_interval=3):
    """Poll the engine status endpoint until it reports a final state.

    Args:
        timeout: Maximum number of seconds to keep polling.
        url: Status endpoint returning a JSON object with a "state" field.
        poll_interval: Seconds to sleep before each poll.

    Returns:
        (status, log_tail): status is 'ready', 'error' or 'timeout'.
        log_tail is always the empty string; it is kept so the return shape
        stays stable for callers that unpack two values. Out-of-memory
        conditions are not detected here.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        # Sleep before polling, so the first request is only made after one
        # full interval.
        time.sleep(poll_interval)
        try:
            with urllib.request.urlopen(url, timeout=2) as response:
                state = json.loads(response.read()).get("state")
        except Exception:
            # Deliberately broad: connection refused, HTTP errors, timeouts
            # and malformed payloads all mean "not ready yet", so keep polling.
            continue
        if state in ("ready", "error"):
            return state, ""
    return "timeout", ""
|
||||
|
||||
|
||||
def analyze_log():
    """Summarise LLAMA_LOG as a dict of flags and buffer sizes.

    Returns an empty dict when the log file does not exist. Otherwise the
    result always holds four booleans ("pp_enabled", "pp_fallback", "oom",
    "listening"), each telling whether a fixed marker string occurs anywhere
    in the log. For each of the six "CUDA<n> <kind> buffer size" lines that
    is found, it also holds a float with the reported MiB value, taken from
    the first match only.
    """
    if not LLAMA_LOG.exists():
        return {}

    log_text = LLAMA_LOG.read_text(encoding="utf-8", errors="ignore")

    # Result key -> substring whose presence sets the flag.
    markers = (
        ("pp_enabled", "pipeline parallelism enabled"),
        ("pp_fallback", "retrying without pipeline parallelism"),
        ("oom", "out of memory"),
        ("listening", "main: server is listening"),
    )
    summary = {key: needle in log_text for key, needle in markers}

    # Result key -> regex capturing the MiB figure of one buffer line.
    buffer_patterns = (
        ("cuda0_model", r"CUDA0 model buffer size = +([0-9.]+) MiB"),
        ("cuda1_model", r"CUDA1 model buffer size = +([0-9.]+) MiB"),
        ("cuda0_kv", r"CUDA0 KV buffer size = +([0-9.]+) MiB"),
        ("cuda1_kv", r"CUDA1 KV buffer size = +([0-9.]+) MiB"),
        ("cuda0_compute", r"CUDA0 compute buffer size = +([0-9.]+) MiB"),
        ("cuda1_compute", r"CUDA1 compute buffer size = +([0-9.]+) MiB"),
    )
    for key, pattern in buffer_patterns:
        hit = re.search(pattern, log_text)
        if hit:
            summary[key] = float(hit.group(1))

    return summary
|
||||
|
||||
|
||||
def main():
    """Try every ratio in RATIOS and print one table row per attempt.

    For each ratio: stop leftover servers, write the ratio into the config,
    start the engine, wait for a final state, then scan the llama-server
    log. The PP column shows "ON" when pipeline parallelism was enabled
    without fallback, "FALLBACK" when the fallback marker was logged, and
    "?" otherwise. A run that did not become ready and whose log mentions
    "out of memory" is reported with status "oom".

    The config file's original bytes are restored when the run ends, even
    on an exception or Ctrl-C, so the last ratio tried is not left behind.

    Returns:
        list[dict]: One {"ratio", "status", "info"} entry per tested ratio.
    """
    results = []
    # The PP column is 9 wide so that "FALLBACK" does not break alignment.
    print(f"{'ratio':<14} {'status':<10} {'PP':<9} {'cuda0_m':<9} {'cuda1_m':<9} {'cuda0_kv':<9} {'cuda1_kv':<9} {'c0_comp':<9} {'c1_comp':<9}")
    print("-" * 110)

    # Byte-exact snapshot, so restoring does not alter newlines or encoding.
    original_config = CONFIG_FILE.read_bytes()
    try:
        for ts_a, ts_b in RATIOS:
            label = f"{ts_a},{ts_b}"
            kill_servers()
            update_config(ts_a, ts_b)
            proc = start_engine()
            try:
                status, _ = wait_for_result(timeout=180)
                info = analyze_log()
            finally:
                # Always stop and reap the engine, even if polling failed.
                proc.terminate()
                try:
                    proc.wait(timeout=10)
                except subprocess.TimeoutExpired:
                    proc.kill()

            # A fallback run also logs "out of memory" before retrying, so
            # only relabel runs that never became ready.
            if status != "ready" and info.get("oom"):
                status = "oom"

            if info.get("pp_fallback"):
                pp = "FALLBACK"
            elif info.get("pp_enabled"):
                pp = "ON"
            else:
                pp = "?"

            c0m = f"{info.get('cuda0_model', 0):.0f}"
            c1m = f"{info.get('cuda1_model', 0):.0f}"
            c0kv = f"{info.get('cuda0_kv', 0):.0f}"
            c1kv = f"{info.get('cuda1_kv', 0):.0f}"
            c0c = f"{info.get('cuda0_compute', 0):.0f}"
            c1c = f"{info.get('cuda1_compute', 0):.0f}"
            print(f"{label:<14} {status:<10} {pp:<9} {c0m:<9} {c1m:<9} {c0kv:<9} {c1kv:<9} {c0c:<9} {c1c:<9}")
            results.append({"ratio": label, "status": status, "info": info})
    finally:
        # Restore the config first, so a failing cleanup command cannot
        # prevent it.
        CONFIG_FILE.write_bytes(original_config)
        kill_servers()

    print("\nDone.")
    return results
|
||||
|
||||
|
||||
# Run the ratio sweep only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user