refactor(phase-01): v3 retune fast & balanced roles

fast (Gemma 4 26B-A4B):
- Enable mmproj GPU loading (vision ~1s, 12x faster than CPU)
- KV f16 → q8_0 (save ~2.5 GB VRAM for mmproj)
- Tensor split 0.5,0.5 → 0.43,0.57 (13/17 layers)
- Remove --mlock/--poll/--prio/-t/-tb (no measurable impact)
- measured_tps 74.65 → 71.89 (trade 3.7% speed for vision)

balanced (Qwen 3.5 35B-A3B):
- Tensor split 0.5,0.5 → 0.48,0.52 (enables pipeline parallelism)
- Ubatch 128 → 256 (prefill +78%: 649 → 1,157 t/s)
- mmproj + --no-mmproj-offload (CPU vision, VRAM headroom)
- Remove the same no-impact flags as for fast (--mlock/--poll/--prio/-t/-tb)
- measured_tps 61.62 → 64.16 (+4.1%)

Other:
- Document full retuning in docs/v3_{fast,balanced}_retuning_log.md
- Session report at .planning/reports/20260411-session-report.md
- Add bench utilities: bench_short/bench_long/test_ts_ratios
- Speculative decoding (E2B draft) was tried but rejected
  (+14% gen vs -31% cold start + tokenizer mismatch + mmproj conflict)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Variet-Worker
2026-04-11 14:55:27 +09:00
parent 219985b9ce
commit 0dee779a73
9 changed files with 1135 additions and 80 deletions

67
scripts/bench_long.py Normal file
View File

@@ -0,0 +1,67 @@
"""Benchmark with long prompts to measure prompt processing (prefill) speed."""
import json
import time
import urllib.request
import sys
# Switch stdout to UTF-8 when the stream supports it. This is best-effort:
# if stdout has no reconfigure() (e.g. it was replaced by another object) or
# the stream refuses the change, the default encoding is kept.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except (AttributeError, ValueError, OSError):
    pass
# Filler paragraph that make_prompt() repeats to build a long prompt.
BASE_SENTENCE = (
    "The history of computing is a vast and multifaceted journey that spans millennia, "
    "from the earliest mechanical calculating aids to the sophisticated digital systems of today. "
    "It begins with simple counting devices like the abacus, which originated in ancient Mesopotamia "
    "around 2300 BCE and was later refined by Chinese and Roman civilizations. "
    "These early tools laid the conceptual groundwork for mechanical computation. "
)


def make_prompt(seed, repeats=40):
    """Build a long user prompt whose text is unique per *seed*.

    Args:
        seed: Integer mixed into the opening sentence of the text. Each seed
            produces a slightly different long prompt to defeat caching.
        repeats: How many times ``BASE_SENTENCE`` is repeated after the unique
            opening sentence. Defaults to 40, the original fixed length.

    Returns:
        The full prompt: an instruction line, the long text, and a one-line
        question about the text.
    """
    # The seed-dependent sentence goes first so that prompts for different
    # seeds already differ at the start of the long text.
    unique = f"Session {seed}. Random seed value: {seed * 31337 + 17}. "
    long_text = unique + BASE_SENTENCE * repeats
    return (
        "Read the following text carefully, then answer in exactly one short sentence:\n\n"
        f"{long_text}\n\n"
        "Question: What is the main subject of the text above? Answer in one short sentence only."
    )
def bench(label, seed, gen_tokens=150, model="balanced",
          url="http://localhost:8000/v1/chat/completions"):
    """Send one long-prompt chat completion and print the server-reported timings.

    Args:
        label: Text printed in brackets as the heading of this run.
        seed: Passed to ``make_prompt`` so each run gets a different prompt.
        gen_tokens: Value sent as ``max_tokens``.
        model: Value sent as ``model``. Defaults to ``"balanced"``.
        url: Chat-completions endpoint to POST to.

    Returns:
        The ``timings`` dict taken from the JSON response.

    Raises:
        KeyError: The response has no ``timings`` object, or it lacks one of
            the fields printed below.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": make_prompt(seed)}],
        "max_tokens": gen_tokens,
        "stream": False,
        "temperature": 0.3,
    }
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # perf_counter() is meant for measuring intervals; unlike time.time() it
    # is not affected by system clock adjustments during the request.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=600) as r:
        d = json.loads(r.read())
    total = time.perf_counter() - t0

    t = d.get("timings") or {}
    required = (
        "prompt_n", "prompt_ms", "prompt_per_second",
        "predicted_n", "predicted_ms", "predicted_per_second",
    )
    missing = [key for key in required if key not in t]
    if missing:
        # Fail before printing anything, with a message that names the cause
        # instead of a bare KeyError on the first missing field.
        raise KeyError(
            f"response has no usable 'timings' object (missing: {', '.join(missing)})"
        )

    print(f"[{label}]")
    print(f" prompt: {t['prompt_n']:>5} tok {t['prompt_ms']:>7.0f} ms {t['prompt_per_second']:>7.2f} t/s")
    print(f" gen: {t['predicted_n']:>5} tok {t['predicted_ms']:>7.0f} ms {t['predicted_per_second']:>7.2f} t/s")
    print(f" total: {total:.2f} s")
    return t
# Script entry point: run three long-prompt benchmarks with different seeds
# and print the average prompt and generation throughput.
if __name__ == "__main__":
    # Optional first command-line argument: label shown in the output.
    label = sys.argv[1] if len(sys.argv) > 1 else "run"
    results = []
    for i in range(3):
        # A different seed per run gives each run a different prompt.
        t = bench(f"{label} #{i+1}", seed=i + 1)
        results.append(t)
        print()
    if results:
        avg_prompt = sum(r["prompt_per_second"] for r in results) / len(results)
        avg_gen = sum(r["predicted_per_second"] for r in results) / len(results)
        print(f"=== [{label}] AVG === prompt: {avg_prompt:.2f} t/s | gen: {avg_gen:.2f} t/s")

87
scripts/bench_short.py Normal file
View File

@@ -0,0 +1,87 @@
"""Phase 01 style short-prompt benchmark using llama.cpp internal timings."""
import json
import urllib.request
import sys
# Switch stdout to UTF-8 when the stream supports it. This is best-effort:
# if stdout has no reconfigure() (e.g. it was replaced by another object) or
# the stream refuses the change, the default encoding is kept.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except (AttributeError, ValueError, OSError):
    pass
def bench_text(model_name, n=200):
    """Run one short text completion and return the server-reported timings.

    Args:
        model_name: Value sent as ``model`` in the request.
        n: Value sent as ``max_tokens``.

    Returns:
        The ``timings`` dict from the JSON response, or an empty dict when the
        response has no ``timings`` key.
    """
    payload = json.dumps({
        "model": model_name,
        "messages": [{"role": "user", "content": "Count from 1 to 50, each number on a new line."}],
        "max_tokens": n,
        "temperature": 0,
    }).encode()
    req = urllib.request.Request(
        "http://127.0.0.1:8000/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=120) as r:
        return json.loads(r.read()).get("timings", {})
def bench_image(model_name, image_path, prompt):
    """Run one image+text completion and return the server-reported timings.

    The image file is read, base64-encoded and embedded in the request as a
    ``data:`` URL next to the text prompt.

    Args:
        model_name: Value sent as ``model`` in the request.
        image_path: Path of the image file to send.
        prompt: Text part of the user message.

    Returns:
        The ``timings`` dict from the JSON response, or an empty dict when the
        response has no ``timings`` key.
    """
    import base64
    import mimetypes

    # Label the data URL with the type implied by the file name instead of
    # always claiming JPEG; fall back to JPEG when the type is unknown or is
    # not an image type, which keeps the original behaviour for .jpg files.
    mime, _ = mimetypes.guess_type(str(image_path))
    if mime is None or not mime.startswith("image/"):
        mime = "image/jpeg"

    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    payload = json.dumps({
        "model": model_name,
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}},
            ],
        }],
        "max_tokens": 100,
        "temperature": 0.3,
    }).encode()
    req = urllib.request.Request(
        "http://127.0.0.1:8000/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=600) as r:
        return json.loads(r.read()).get("timings", {})
def _require_timings(t, what):
    """Exit with a clear message when a benchmark response carries no timings."""
    if "predicted_per_second" not in t:
        raise SystemExit(f"{what}: server response has no 'timings' data; cannot benchmark")


def main():
    """Run the short-prompt benchmark and print per-run and summary figures.

    Command line: ``[label] [model] [--image]``. ``label`` defaults to
    ``"run"`` and ``model`` to ``"fast"``. ``--image`` may appear anywhere and
    additionally runs three vision requests against
    ``logs/vision_test/sample.jpg``.
    """
    # Take --image out before reading positional arguments; otherwise
    # "bench_short.py mylabel --image" would use "--image" as the model name.
    positional = [a for a in sys.argv[1:] if a != "--image"]
    label = positional[0] if positional else "run"
    model = positional[1] if len(positional) > 1 else "fast"
    do_image = "--image" in sys.argv
    print(f"=== [{label}] model={model} do_image={do_image} ===")

    print("warmup...")
    try:
        bench_text(model, 10)
    except Exception as e:
        # A failed warmup is reported but does not stop the measured runs.
        print(f"warmup err: {e}")

    print("text 5-run:")
    runs = []
    for i in range(5):
        t = bench_text(model, 200)
        _require_timings(t, f"text run {i+1}")
        runs.append(t["predicted_per_second"])
        print(f" Run {i+1}: gen {t['predicted_per_second']:.2f} t/s ({t['predicted_n']} tok, {t['predicted_ms']:.0f}ms) | prompt {t['prompt_per_second']:.1f} t/s ({t['prompt_n']} tok)")
    avg = sum(runs) / len(runs)
    print(f" TEXT AVG: {avg:.2f} t/s BEST: {max(runs):.2f} MIN: {min(runs):.2f}")

    if do_image:
        prompts = [
            "What do you see in this image? One sentence.",
            "Describe the subject and background in one sentence.",
            "What is the most prominent feature? One sentence.",
        ]
        print("vision 3-run (640x640 cat):")
        for i, p in enumerate(prompts):
            t = bench_image(model, "logs/vision_test/sample.jpg", p)
            _require_timings(t, f"vision run {i+1}")
            print(f" Run {i+1}: prompt {t['prompt_n']} tok ({t['prompt_ms']:.0f}ms, {t['prompt_per_second']:.1f} t/s) | gen {t['predicted_n']} tok ({t['predicted_per_second']:.1f} t/s)")


if __name__ == "__main__":
    main()

145
scripts/test_ts_ratios.py Normal file
View File

@@ -0,0 +1,145 @@
#!/usr/bin/env python
"""Test multiple -ts ratios to find which ones start normally (no OOM, PP enabled)."""
import subprocess
import time
import json
import urllib.request
import urllib.error
import sys
import re
from pathlib import Path
# Repository root: two directory levels above this script file.
ROOT = Path(__file__).parent.parent

# Files read and written by the test, all located under ROOT.
CONFIG_FILE = ROOT.joinpath("config", "engine_models.json")
LLAMA_LOG = ROOT.joinpath("logs", "llama-server.log")
ENGINE_LOG = ROOT.joinpath("logs", "engine_test.log")

# Interpreter and script used to launch the engine process.
PYTHON = r"C:\ProgramData\miniforge3\envs\variet-llm\python.exe"
ENGINE_SCRIPT = ROOT.joinpath("engine", "variet_engine.py")

# Candidate -ts values to try, each as the two halves of "a,b".
RATIOS = [
    ("0.5", "0.5"),
    ("0.48", "0.52"),
    ("0.47", "0.53"),
    ("0.45", "0.55"),
    ("0.43", "0.57"),
    ("0.40", "0.60"),
]
# Switch stdout to UTF-8 when the stream supports it. This is best-effort:
# if stdout has no reconfigure() (e.g. it was replaced by another object) or
# the stream refuses the change, the default encoding is kept.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except (AttributeError, ValueError, OSError):
    pass
def kill_servers():
    """Force-stop running engine and llama-server processes, then wait 2 s.

    Uses PowerShell to enumerate ``Win32_Process`` and stop every process
    named ``llama-server.exe`` or whose command line contains the engine
    script path. PowerShell's output is captured and discarded, and its exit
    status is not checked.
    """
    # start_engine() launches str(ENGINE_SCRIPT), which uses backslashes on
    # Windows, so the path is matched with either separator. The forward-slash
    # form alone would only catch engines started by hand that way.
    ps_filter = (
        "$_.CommandLine -like '*engine/variet_engine.py*'"
        " -or $_.CommandLine -like '*engine\\variet_engine.py*'"
        " -or $_.Name -eq 'llama-server.exe'"
    )
    ps_command = (
        "Get-WmiObject Win32_Process"
        " | Where-Object { " + ps_filter + " }"
        " | ForEach-Object { Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue }"
    )
    subprocess.run(
        ["powershell", "-Command", ps_command],
        capture_output=True,
        check=False,
    )
    # Short pause before the caller continues after the kill.
    time.sleep(2)
def update_config(ts_a, ts_b, role="balanced", config_file=None):
    """Rewrite the tensor-split value of one role in the engine config file.

    The value following the first ``-ts`` (or ``--tensor-split``) entry in
    ``roles[role]["args"]`` is replaced by ``"{ts_a},{ts_b}"`` and the whole
    config is written back as indented JSON.

    Args:
        ts_a: First half of the split, e.g. ``"0.48"``.
        ts_b: Second half of the split, e.g. ``"0.52"``.
        role: Key under ``roles`` to modify. Defaults to ``"balanced"``.
        config_file: Config path to edit. Defaults to ``CONFIG_FILE``.

    Raises:
        ValueError: The role's argument list has no tensor-split flag followed
            by a value. Nothing is written in that case, so a test run cannot
            silently proceed with an unchanged ratio.
    """
    path = CONFIG_FILE if config_file is None else Path(config_file)
    with open(path, encoding="utf-8") as f:
        cfg = json.load(f)

    args = cfg["roles"][role]["args"]
    # Skip the last element: a flag there has no value slot to overwrite.
    for i, a in enumerate(args[:-1]):
        if a in ("-ts", "--tensor-split"):
            args[i + 1] = f"{ts_a},{ts_b}"
            break
    else:
        raise ValueError(
            f"role {role!r} in {path} has no -ts/--tensor-split argument to update"
        )

    with open(path, "w", encoding="utf-8") as f:
        json.dump(cfg, f, indent=2, ensure_ascii=False)
def start_engine():
    """Empty both log files and launch the engine script as a child process.

    The child runs ``PYTHON ENGINE_SCRIPT`` with ``ROOT`` as its working
    directory, in a new process group, with stdout and stderr both written to
    ``ENGINE_LOG``.

    Returns:
        The ``subprocess.Popen`` object for the started engine.
    """
    # Make sure the log directories exist; writing the logs fails otherwise.
    LLAMA_LOG.parent.mkdir(parents=True, exist_ok=True)
    ENGINE_LOG.parent.mkdir(parents=True, exist_ok=True)
    LLAMA_LOG.write_text("")
    # Opening with "wb" truncates ENGINE_LOG, so no separate clearing write is
    # needed. The child receives its own handle when it is created, so this
    # process's handle is closed on leaving the block instead of leaking one
    # per tested ratio.
    with open(ENGINE_LOG, "wb") as engine_out:
        return subprocess.Popen(
            [PYTHON, str(ENGINE_SCRIPT)],
            cwd=str(ROOT),
            stdout=engine_out,
            stderr=subprocess.STDOUT,
            creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
        )
def wait_for_result(timeout=180, poll_interval=3,
                    url="http://localhost:8000/engine/status"):
    """Poll the engine status endpoint until it reports a final state.

    Args:
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep before each status request.
        url: Status endpoint; its JSON response is expected to carry a
            ``state`` field.

    Returns:
        A ``(status, log_tail)`` tuple. ``status`` is ``'ready'`` or
        ``'error'`` as reported by the endpoint, or ``'timeout'`` when neither
        was seen before the deadline. ``log_tail`` is always an empty string.
        Out-of-memory conditions are not detected here; they are read from the
        llama-server log separately.
    """
    # A monotonic clock keeps the deadline stable if the system clock is
    # adjusted while waiting.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        time.sleep(poll_interval)
        try:
            with urllib.request.urlopen(url, timeout=2) as r:
                state = json.loads(r.read()).get("state")
        except Exception:
            # Any failure here (connection refused, timeout, unparsable body)
            # is treated as "not up yet"; keep polling.
            continue
        if state in ("ready", "error"):
            return state, ""
    return "timeout", ""
def analyze_log(log_path=None):
    """Extract start-up facts from the llama-server log.

    Args:
        log_path: Log file to read. Defaults to ``LLAMA_LOG``.

    Returns:
        An empty dict when the log file does not exist. Otherwise a dict with
        four booleans, each telling whether a fixed marker phrase occurs in
        the log (``pp_enabled``, ``pp_fallback``, ``oom``, ``listening``),
        plus up to six floats in MiB, one per buffer-size line found:
        ``cuda0_model``, ``cuda1_model``, ``cuda0_kv``, ``cuda1_kv``,
        ``cuda0_compute``, ``cuda1_compute``. A size key is absent when its
        line is not in the log. If a line occurs several times, the first
        occurrence is used.
    """
    path = LLAMA_LOG if log_path is None else Path(log_path)
    if not path.exists():
        return {}
    text = path.read_text(encoding="utf-8", errors="ignore")

    result = {
        "pp_enabled": "pipeline parallelism enabled" in text,
        "pp_fallback": "retrying without pipeline parallelism" in text,
        "oom": "out of memory" in text,
        "listening": "main: server is listening" in text,
    }
    # One search per (buffer kind, device). The iteration order keeps the
    # result keys in the order model, kv, compute, with CUDA0 before CUDA1.
    for kind in ("model", "KV", "compute"):
        for device in ("CUDA0", "CUDA1"):
            m = re.search(rf"{device} {kind} buffer size = +([0-9.]+) MiB", text)
            if m:
                result[f"{device.lower()}_{kind.lower()}"] = float(m.group(1))
    return result
def main():
    """Start the engine once per ratio in ``RATIOS`` and print a result table.

    For every ratio: stop running servers, write the ratio into the config,
    start the engine, wait for it to report ready/error (or time out), then
    read the llama-server log for pipeline-parallelism state and buffer sizes.

    The config file is restored to its original bytes when the run ends, even
    on error, so the last tested ratio is not left in place.

    Returns:
        A list with one dict per ratio: ``{"ratio", "status", "info"}``.
    """
    results = []
    # The PP column is 9 wide so that "FALLBACK" does not push later columns
    # out of alignment.
    print(f"{'ratio':<14} {'status':<10} {'PP':<9} {'cuda0_m':<9} {'cuda1_m':<9} {'cuda0_kv':<9} {'cuda1_kv':<9} {'c0_comp':<9} {'c1_comp':<9}")
    print("-" * 110)

    original_config = CONFIG_FILE.read_bytes()
    try:
        for ts_a, ts_b in RATIOS:
            label = f"{ts_a},{ts_b}"
            kill_servers()
            update_config(ts_a, ts_b)
            proc = start_engine()
            try:
                status, _ = wait_for_result(timeout=180)
                info = analyze_log()
            finally:
                # Stop the engine even if waiting or log parsing failed.
                proc.terminate()

            # wait_for_result() never reports OOM itself; surface it here when
            # the engine did not come up and the log shows an allocation
            # failure.
            if status != "ready" and info.get("oom"):
                status = "oom"

            if info.get("pp_fallback"):
                pp = "FALLBACK"
            elif info.get("pp_enabled"):
                pp = "ON"
            else:
                pp = "?"

            # Sizes missing from the log are shown as 0.
            c0m = f"{info.get('cuda0_model', 0):.0f}"
            c1m = f"{info.get('cuda1_model', 0):.0f}"
            c0kv = f"{info.get('cuda0_kv', 0):.0f}"
            c1kv = f"{info.get('cuda1_kv', 0):.0f}"
            c0c = f"{info.get('cuda0_compute', 0):.0f}"
            c1c = f"{info.get('cuda1_compute', 0):.0f}"
            print(f"{label:<14} {status:<10} {pp:<9} {c0m:<9} {c1m:<9} {c0kv:<9} {c1kv:<9} {c0c:<9} {c1c:<9}")
            results.append({"ratio": label, "status": status, "info": info})
            time.sleep(1)
    finally:
        kill_servers()
        CONFIG_FILE.write_bytes(original_config)

    print("\nDone.")
    return results


if __name__ == "__main__":
    main()