# variet_llm/scripts/bench_short.py

"""Phase 01 style short-prompt benchmark using llama.cpp internal timings."""
import json
import urllib.request
import sys

# Best-effort: switch stdout to UTF-8 and carry on unchanged if that fails
# for any reason (the exception is deliberately ignored).
# NOTE(review): presumably guards against consoles whose default encoding
# is not UTF-8 — confirm.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except Exception:
    pass


def bench_text(model_name, n=200):
    """Run one fixed counting prompt and return the server's timing stats.

    Posts a chat completion request (temperature 0) to the local server and
    returns the ``timings`` object from the JSON response.

    Args:
        model_name: Value sent as the request's ``model`` field.
        n: Value sent as ``max_tokens``.

    Returns:
        The response's ``timings`` dict, or ``{}`` when the response has no
        ``timings`` key.
    """
    user_turn = {
        "role": "user",
        "content": "Count from 1 to 50, each number on a new line.",
    }
    request_fields = {
        "model": model_name,
        "messages": [user_turn],
        "max_tokens": n,
        "temperature": 0,
    }
    body = json.dumps(request_fields).encode()
    request = urllib.request.Request(
        "http://127.0.0.1:8000/v1/chat/completions",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=120) as response:
        reply = json.loads(response.read())
    return reply.get("timings", {})


def bench_image(model_name, image_path, prompt):
    """Send one image + text prompt and return the server's timing stats.

    The image file is read in full, base64-encoded, and embedded in the
    request as a ``data:`` URL alongside the text prompt.

    Args:
        model_name: Value sent as the request's ``model`` field.
        image_path: Path of the image file to embed.
        prompt: Text part of the user message.

    Returns:
        The response's ``timings`` dict, or ``{}`` when the response has no
        ``timings`` key.

    Raises:
        OSError: If ``image_path`` cannot be opened or read.
    """
    import base64
    import mimetypes

    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()

    # Label the data URL with the file's actual image type instead of always
    # claiming JPEG. Anything that cannot be identified as an image (unknown
    # extension, or a path type guess_type rejects) keeps the old default.
    try:
        mime = mimetypes.guess_type(image_path)[0]
    except TypeError:
        mime = None
    if not mime or not mime.startswith("image/"):
        mime = "image/jpeg"

    payload = json.dumps({
        "model": model_name,
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}},
            ],
        }],
        "max_tokens": 100,
        "temperature": 0.3,
    }).encode()
    req = urllib.request.Request(
        "http://127.0.0.1:8000/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    # Longer timeout than the text benchmark: 600 s instead of 120 s.
    with urllib.request.urlopen(req, timeout=600) as r:
        return json.loads(r.read()).get("timings", {})


def _require_timings(timings, keys, what):
    """Exit with a readable message when *timings* lacks any of *keys*.

    bench_text/bench_image return ``{}`` when the response has no ``timings``
    object; without this check the report lines below would fail with a bare
    KeyError that does not say what went wrong.
    """
    missing = [k for k in keys if k not in timings]
    if missing:
        sys.exit(f"{what}: server response is missing timings field(s): {', '.join(missing)}")


def main():
    """Run the short benchmark and print per-run and aggregate throughput.

    Usage: ``bench_short.py [label] [model] [--image]``

    ``label`` (default ``"run"``) only tags the output header; ``model``
    (default ``"fast"``) is sent as the request's model name. ``--image`` may
    appear anywhere on the command line and additionally runs three vision
    prompts against ``logs/vision_test/sample.jpg`` (relative to the current
    working directory).
    """
    # "--image" is a flag, not a positional argument: drop it before reading
    # label/model so that e.g. `bench_short.py mylabel --image` does not end
    # up requesting a model literally named "--image".
    args = sys.argv[1:]
    do_image = "--image" in args
    positional = [arg for arg in args if arg != "--image"]
    label = positional[0] if len(positional) > 0 else "run"
    model = positional[1] if len(positional) > 1 else "fast"

    print(f"=== [{label}] model={model} do_image={do_image} ===")
    print("warmup...")
    # Warmup is best-effort: a failure here is reported but does not stop
    # the measured runs.
    try:
        bench_text(model, 10)
    except Exception as e:
        print(f"warmup err: {e}")

    text_keys = (
        "predicted_per_second",
        "predicted_n",
        "predicted_ms",
        "prompt_per_second",
        "prompt_n",
    )
    print("text 5-run:")
    runs = []
    for i in range(5):
        t = bench_text(model, 200)
        _require_timings(t, text_keys, f"text run {i+1}")
        runs.append(t["predicted_per_second"])
        print(f"  Run {i+1}: gen {t['predicted_per_second']:.2f} t/s ({t['predicted_n']} tok, {t['predicted_ms']:.0f}ms) | prompt {t['prompt_per_second']:.1f} t/s ({t['prompt_n']} tok)")
    avg = sum(runs) / len(runs)
    print(f"  TEXT AVG: {avg:.2f} t/s  BEST: {max(runs):.2f}  MIN: {min(runs):.2f}")

    if do_image:
        prompts = [
            "What do you see in this image? One sentence.",
            "Describe the subject and background in one sentence.",
            "What is the most prominent feature? One sentence.",
        ]
        vision_keys = (
            "prompt_n",
            "prompt_ms",
            "prompt_per_second",
            "predicted_n",
            "predicted_per_second",
        )
        print("vision 3-run (640x640 cat):")
        for i, p in enumerate(prompts):
            t = bench_image(model, "logs/vision_test/sample.jpg", p)
            _require_timings(t, vision_keys, f"vision run {i+1}")
            print(f"  Run {i+1}: prompt {t['prompt_n']} tok ({t['prompt_ms']:.0f}ms, {t['prompt_per_second']:.1f} t/s) | gen {t['predicted_n']} tok ({t['predicted_per_second']:.1f} t/s)")


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()