Update tuning scripts and add task creation to sync_vikunja.js
This commit is contained in:
@@ -1,86 +0,0 @@
|
||||
import os
|
||||
import glob
|
||||
import re
|
||||
|
||||
skill_dir = r"C:\Users\Certes\.gemini\antigravity\skills"
|
||||
|
||||
translations = {
|
||||
"Manage parallel workstreams — list, create, switch, status, progress, complete, and resume": "병렬 작업 스트림 관리 — 목록, 생성, 전환, 상태, 진행률, 완료 및 재개",
|
||||
"Validate built features through conversational UAT": "대화형 UAT를 통해 구현된 기능 검증",
|
||||
"Retroactively audit and fill Nyquist validation gaps for a completed phase": "완료된 단계에 대한 검증 누락 사후 감사 및 보완",
|
||||
"Update GSD to latest version with changelog display": "GSD를 최신 버전으로 업데이트하고 변경 사항 표시",
|
||||
"Retroactive 6-pillar visual audit of implemented frontend code": "구현된 프론트엔드 코드에 대한 6개 요소 시각적 사후 감사",
|
||||
"Generate UI design contract (UI-SPEC.md) for frontend phases": "프론트엔드 단계를 위한 UI 디자인 명세서(UI-SPEC.md) 생성",
|
||||
"Manage persistent context threads for cross-session work": "교차 세션 작업을 위한 영구 컨텍스트 스레드 관리",
|
||||
"Display project statistics — phases, plans, requirements, git metrics, and timeline": "프로젝트 통계 표시 — 단계, 계획, 요구사항, Git 지표 및 타임라인",
|
||||
"Create PR, run review, and prepare for merge after verification passes": "검증 통과 후 PR 생성, 리뷰 실행 및 병합 준비",
|
||||
"Configure GSD workflow toggles and model profile": "GSD 워크플로우 옵션 및 모델 프로필 구성",
|
||||
"Switch model profile for GSD agents (quality/balanced/budget/inherit)": "GSD 요원의 모델 프로필 전환 (고품질/균형/예산/상속)",
|
||||
"Generate a session report with token usage estimates, work summary, and outcomes": "토큰 사용량, 작업 요약 및 결과를 포함한 세션 보고서 생성",
|
||||
"Review and promote backlog items to active milestone": "백로그 항목을 검토하고 활성 마일스톤으로 승격",
|
||||
"Request cross-AI peer review of phase plans from external AI CLIs": "외부 AI CLI에 단계 계획에 대한 교차 AI 동료 리뷰 요청",
|
||||
"Resume work from previous session with full context restoration": "전체 컨텍스트 복원과 함께 이전 세션에서 작업 재개",
|
||||
"Research how to implement a phase (standalone - usually use /gsd-plan-phase instead)": "단계를 구현하는 방법 리서치 (단독 실행 - 보통 /gsd-plan-phase 사용)",
|
||||
"Remove a GSD workspace and clean up worktrees": "GSD 워크스페이스 제거 및 워크트리 정리",
|
||||
"Remove a future phase from roadmap and renumber subsequent phases": "로드맵에서 향후 단계를 제거하고 이후 단계 번호 재지정",
|
||||
"Reapply local modifications after a GSD update": "GSD 업데이트 후 로컬 수정 사항 재적용",
|
||||
"Execute a quick task with GSD guarantees (atomic commits, state tracking) but skip optional agents": "GSD 보장(원자적 커밋, 상태 추적)을 사용하여 빠른 작업을 실행하되 선택적 요원 생략",
|
||||
"Check project progress, show context, and route to next action (execute or plan)": "프로젝트 진행 상황 확인, 컨텍스트 표시 및 다음 작업(실행 또는 계획)으로 라우팅",
|
||||
"Generate developer behavioral profile and create Claude-discoverable artifacts": "개발자 행동 프로필을 생성하고 AI가 인지할 수 있는 문서 작성",
|
||||
"Create a clean PR branch by filtering out .planning/ commits — ready for code review": ".planning/ 커밋을 필터링하여 깔끔한 PR 브랜치 생성 — 코드 리뷰 준비",
|
||||
"Capture a forward-looking idea with trigger conditions — surfaces automatically at the right milestone": "향후 아이디어를 트리거 조건과 함께 캡처 — 적절한 마일스톤에서 자동 표시",
|
||||
"Create detailed phase plan (PLAN.md) with verification loop": "검증 루프를 포함한 상세 단계 계획(PLAN.md) 생성",
|
||||
"Create phases to close all gaps identified by milestone audit": "마일스톤 감사에서 식별된 모든 격차를 해소하기 위한 단계 생성",
|
||||
"Create context handoff when pausing work mid-phase": "작업 중단 시 컨텍스트 인수인계 파일 생성",
|
||||
"Zero-friction idea capture. Append, list, or promote notes to todos.": "방해 없는 아이디어 캡처. 메모 추가, 나열 또는 할 일로 승격.",
|
||||
"Automatically advance to the next logical step in the GSD workflow": "GSD 워크플로우의 다음 논리적 단계로 자동 진행",
|
||||
"Create an isolated workspace with repo copies and independent .planning/": "외부 레포 사본 및 독립적인 .planning/을 갖춘 격리된 워크스페이스 생성",
|
||||
"Initialize a new project with deep context gathering and PROJECT.md": "심층 컨텍스트 수집 및 PROJECT.md와 함께 새 프로젝트 초기화",
|
||||
"Start a new milestone cycle — update PROJECT.md and route to requirements": "새로운 마일스톤 주기 시작 — PROJECT.md 업데이트 및 요구사항 재정의",
|
||||
"Generate a comprehensive project summary from milestone artifacts for team onboarding and review": "팀 온보딩 및 리뷰를 위해 마일스톤 산출물에서 종합적인 프로젝트 요약 생성",
|
||||
"Analyze codebase with parallel mapper agents to produce .planning/codebase/ documents": "병렬 매퍼 요원으로 코드베이스를 분석하여 .planning/codebase/ 문서 생성",
|
||||
"Interactive command center for managing multiple phases from one terminal": "하나의 터미널에서 여러 단계를 관리하는 대화형 명령 센터",
|
||||
"List active GSD workspaces and their status": "활성 GSD 워크스페이스 및 상태 나열",
|
||||
"Surface the agent's assumptions about a phase approach before planning": "계획 전 단계적 접근 방식에 대한 요원의 가정을 미리 표시",
|
||||
"Join the GSD Discord community": "GSD 디스코드 커뮤니티 참가",
|
||||
"Insert urgent work as decimal phase (e.g., 72.1) between existing phases": "기존 단계 사이에 소수점 단계(예: 72.1)로 긴급 작업 삽입",
|
||||
"Show available GSD commands and usage guide": "사용 가능한 GSD 명령어 및 사용 가이드 표시",
|
||||
"Diagnose planning directory health and optionally repair issues": "계획 디렉토리 상태 진단 및 선택적으로 문제 복구",
|
||||
"Post-mortem investigation for failed GSD workflows — analyzes git history, artifacts, and state to diagnose what went wrong": "실패한 GSD 워크플로우에 대한 사후 조사 — git 기록, 문서 및 상태 분석",
|
||||
"Execute a trivial task inline — no subagents, no planning overhead": "인라인으로 사소한 작업 실행 — 서브 에이전트 및 계획 오버헤드 없음",
|
||||
"Execute all plans in a phase with wave-based parallelization": "웨이브(Wave) 기반 병렬 처리를 사용하여 단계의 모든 계획 실행",
|
||||
"Route freeform text to the right GSD command automatically": "자유 형식 텍스트를 적절한 GSD 명령으로 자동 라우팅",
|
||||
"Systematic debugging with persistent state across context resets": "컨텍스트가 리셋되어도 상태를 유지하는 체계적인 디버깅",
|
||||
"Gather phase context through adaptive questioning before planning. Use --auto to skip interactive questions (the agent picks recommended defaults).": "계획 전 심층 질문을 통해 단계 컨텍스트 수집. 대화형 건너뛰기(--auto) 가능.",
|
||||
"Archive completed milestone and prepare for next version": "완료된 마일스톤 보관 및 다음 버전 준비",
|
||||
"List pending todos and select one to work on": "보류 중인 할 일 목록 표시 및 작업할 항목 선택",
|
||||
"Cross-phase audit of all outstanding UAT and verification items": "모든 미결 UAT 및 검증 항목에 대한 전체 단계 교차 감사",
|
||||
"Audit milestone completion against original intent before archiving": "보관 전 원래 의도와 비교하여 마일스톤 달성 여부 감사",
|
||||
"Capture idea or task as todo from current conversation context": "현재 대화 컨텍스트에서 아이디어 또는 작업을 할 일로 캡처",
|
||||
"Generate tests for a completed phase based on UAT criteria and implementation": "UAT 기준 및 구현을 기반으로 완료된 단계에 대한 테스트 생성",
|
||||
"Add phase to end of current milestone in roadmap": "로드맵의 현재 마일스톤 끝에 새 단계 추가",
|
||||
"Add an idea to the backlog parking lot (999.x numbering)": "백로그 주차장(999.x 넘버링)에 아이디어 추가",
|
||||
"Run all remaining phases autonomously — discuss→plan→execute per phase": "모든 남은 단계를 완전히 자율적으로 실행 (논의→계획→실행 루프)",
|
||||
"Archive accumulated phase directories from completed milestones": "완료된 마일스톤에서 쌓인 단계 디렉토리 보관 및 정리"
|
||||
}
|
||||
|
||||
# Rewrite the `description:` line of every gsd-* SKILL.md with its translation.
modified_count = 0

for filepath in glob.glob(os.path.join(skill_dir, "gsd-*", "SKILL.md")):
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        new_content = content
        for eng, kor in translations.items():
            # Only spaces/tabs may surround the text: `\s*` would also match
            # newlines, letting a match swallow a following blank line or join
            # `description:` with text on the next line.
            pattern = re.compile(
                r"^description:[ \t]*" + re.escape(eng) + r"[ \t]*$",
                re.MULTILINE,
            )
            # A callable replacement inserts the translation verbatim; a plain
            # string would be parsed as a regex template (backslashes, \1, ...).
            new_content = pattern.sub(
                lambda _match, kor=kor: f"description: {kor}", new_content
            )

        # Touch the file only when at least one description changed.
        if new_content != content:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(new_content)
            modified_count += 1
    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"Error processing {filepath}: {e}")

print(f"Successfully translated {modified_count} SKILL.md files.")
|
||||
@@ -1,253 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
UI/UX Pro Max Core - BM25 search engine for UI/UX style guides
|
||||
"""
|
||||
|
||||
import csv
|
||||
import re
|
||||
from pathlib import Path
|
||||
from math import log
|
||||
from collections import defaultdict
|
||||
|
||||
# ============ CONFIGURATION ============
|
||||
DATA_DIR = Path(__file__).parent.parent / "data"
|
||||
MAX_RESULTS = 3
|
||||
|
||||
CSV_CONFIG = {
|
||||
"style": {
|
||||
"file": "styles.csv",
|
||||
"search_cols": ["Style Category", "Keywords", "Best For", "Type", "AI Prompt Keywords"],
|
||||
"output_cols": ["Style Category", "Type", "Keywords", "Primary Colors", "Effects & Animation", "Best For", "Performance", "Accessibility", "Framework Compatibility", "Complexity", "AI Prompt Keywords", "CSS/Technical Keywords", "Implementation Checklist", "Design System Variables"]
|
||||
},
|
||||
"color": {
|
||||
"file": "colors.csv",
|
||||
"search_cols": ["Product Type", "Notes"],
|
||||
"output_cols": ["Product Type", "Primary (Hex)", "Secondary (Hex)", "CTA (Hex)", "Background (Hex)", "Text (Hex)", "Notes"]
|
||||
},
|
||||
"chart": {
|
||||
"file": "charts.csv",
|
||||
"search_cols": ["Data Type", "Keywords", "Best Chart Type", "Accessibility Notes"],
|
||||
"output_cols": ["Data Type", "Keywords", "Best Chart Type", "Secondary Options", "Color Guidance", "Accessibility Notes", "Library Recommendation", "Interactive Level"]
|
||||
},
|
||||
"landing": {
|
||||
"file": "landing.csv",
|
||||
"search_cols": ["Pattern Name", "Keywords", "Conversion Optimization", "Section Order"],
|
||||
"output_cols": ["Pattern Name", "Keywords", "Section Order", "Primary CTA Placement", "Color Strategy", "Conversion Optimization"]
|
||||
},
|
||||
"product": {
|
||||
"file": "products.csv",
|
||||
"search_cols": ["Product Type", "Keywords", "Primary Style Recommendation", "Key Considerations"],
|
||||
"output_cols": ["Product Type", "Keywords", "Primary Style Recommendation", "Secondary Styles", "Landing Page Pattern", "Dashboard Style (if applicable)", "Color Palette Focus"]
|
||||
},
|
||||
"ux": {
|
||||
"file": "ux-guidelines.csv",
|
||||
"search_cols": ["Category", "Issue", "Description", "Platform"],
|
||||
"output_cols": ["Category", "Issue", "Platform", "Description", "Do", "Don't", "Code Example Good", "Code Example Bad", "Severity"]
|
||||
},
|
||||
"typography": {
|
||||
"file": "typography.csv",
|
||||
"search_cols": ["Font Pairing Name", "Category", "Mood/Style Keywords", "Best For", "Heading Font", "Body Font"],
|
||||
"output_cols": ["Font Pairing Name", "Category", "Heading Font", "Body Font", "Mood/Style Keywords", "Best For", "Google Fonts URL", "CSS Import", "Tailwind Config", "Notes"]
|
||||
},
|
||||
"icons": {
|
||||
"file": "icons.csv",
|
||||
"search_cols": ["Category", "Icon Name", "Keywords", "Best For"],
|
||||
"output_cols": ["Category", "Icon Name", "Keywords", "Library", "Import Code", "Usage", "Best For", "Style"]
|
||||
},
|
||||
"react": {
|
||||
"file": "react-performance.csv",
|
||||
"search_cols": ["Category", "Issue", "Keywords", "Description"],
|
||||
"output_cols": ["Category", "Issue", "Platform", "Description", "Do", "Don't", "Code Example Good", "Code Example Bad", "Severity"]
|
||||
},
|
||||
"web": {
|
||||
"file": "web-interface.csv",
|
||||
"search_cols": ["Category", "Issue", "Keywords", "Description"],
|
||||
"output_cols": ["Category", "Issue", "Platform", "Description", "Do", "Don't", "Code Example Good", "Code Example Bad", "Severity"]
|
||||
}
|
||||
}
|
||||
|
||||
STACK_CONFIG = {
|
||||
"html-tailwind": {"file": "stacks/html-tailwind.csv"},
|
||||
"react": {"file": "stacks/react.csv"},
|
||||
"nextjs": {"file": "stacks/nextjs.csv"},
|
||||
"astro": {"file": "stacks/astro.csv"},
|
||||
"vue": {"file": "stacks/vue.csv"},
|
||||
"nuxtjs": {"file": "stacks/nuxtjs.csv"},
|
||||
"nuxt-ui": {"file": "stacks/nuxt-ui.csv"},
|
||||
"svelte": {"file": "stacks/svelte.csv"},
|
||||
"swiftui": {"file": "stacks/swiftui.csv"},
|
||||
"react-native": {"file": "stacks/react-native.csv"},
|
||||
"flutter": {"file": "stacks/flutter.csv"},
|
||||
"shadcn": {"file": "stacks/shadcn.csv"},
|
||||
"jetpack-compose": {"file": "stacks/jetpack-compose.csv"}
|
||||
}
|
||||
|
||||
# Common columns for all stacks
|
||||
_STACK_COLS = {
|
||||
"search_cols": ["Category", "Guideline", "Description", "Do", "Don't"],
|
||||
"output_cols": ["Category", "Guideline", "Description", "Do", "Don't", "Code Good", "Code Bad", "Severity", "Docs URL"]
|
||||
}
|
||||
|
||||
AVAILABLE_STACKS = list(STACK_CONFIG.keys())
|
||||
|
||||
|
||||
# ============ BM25 IMPLEMENTATION ============
|
||||
class BM25:
    """BM25 ranking algorithm for text search.

    Call ``fit`` with a list of documents, then ``score`` with a query to get
    every document ranked by relevance. ``fit`` may be called repeatedly; each
    call replaces the previous index.
    """

    def __init__(self, k1=1.5, b=0.75):
        """Create an empty index.

        Args:
            k1: Term-frequency saturation parameter.
            b: Document-length normalisation parameter.
        """
        self.k1 = k1
        self.b = b
        self.corpus = []
        self.doc_lengths = []
        self.avgdl = 0
        self.idf = {}
        self.doc_freqs = defaultdict(int)
        self.N = 0

    def tokenize(self, text):
        """Lowercase, replace punctuation with spaces, split, drop words of 2 chars or fewer."""
        text = re.sub(r'[^\w\s]', ' ', str(text).lower())
        return [w for w in text.split() if len(w) > 2]

    def fit(self, documents):
        """Build the BM25 index from ``documents`` (an iterable of strings).

        All statistics are rebuilt from scratch so that a second ``fit`` does
        not accumulate document frequencies left over from an earlier corpus.
        """
        self.corpus = [self.tokenize(doc) for doc in documents]
        self.N = len(self.corpus)

        # Reset derived state before recomputing it.
        self.doc_lengths = [len(doc) for doc in self.corpus]
        self.avgdl = 0
        self.idf = {}
        self.doc_freqs = defaultdict(int)

        if self.N == 0:
            return
        self.avgdl = sum(self.doc_lengths) / self.N

        # Document frequency: count each word once per document.
        for doc in self.corpus:
            for word in dict.fromkeys(doc):
                self.doc_freqs[word] += 1

        for word, freq in self.doc_freqs.items():
            self.idf[word] = log((self.N - freq + 0.5) / (freq + 0.5) + 1)

    def score(self, query):
        """Score all documents against ``query``.

        Returns:
            A list of ``(document_index, score)`` tuples sorted by score,
            highest first. Documents with equal scores keep corpus order.
        """
        query_tokens = self.tokenize(query)
        scores = []

        for idx, doc in enumerate(self.corpus):
            score = 0
            doc_len = self.doc_lengths[idx]
            term_freqs = defaultdict(int)
            for word in doc:
                term_freqs[word] += 1

            for token in query_tokens:
                # Tokens absent from the corpus have no IDF and contribute nothing.
                if token in self.idf:
                    tf = term_freqs[token]
                    idf = self.idf[token]
                    numerator = tf * (self.k1 + 1)
                    denominator = tf + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
                    score += idf * numerator / denominator

            scores.append((idx, score))

        return sorted(scores, key=lambda x: x[1], reverse=True)
|
||||
|
||||
|
||||
# ============ SEARCH FUNCTIONS ============
|
||||
def _load_csv(filepath):
|
||||
"""Load CSV and return list of dicts"""
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
return list(csv.DictReader(f))
|
||||
|
||||
|
||||
def _search_csv(filepath, search_cols, output_cols, query, max_results):
    """Rank the rows of one CSV file against ``query`` with BM25.

    Args:
        filepath: Path object of the CSV file; a missing file yields ``[]``.
        search_cols: Column names whose text is joined to form each document.
        output_cols: Column names to copy into each returned row.
        query: Free-text search query.
        max_results: Number of top-ranked rows to consider.

    Returns:
        A list of dicts (restricted to ``output_cols`` present in the row) for
        the top-ranked rows whose score is greater than zero.
    """
    if not filepath.exists():
        return []

    rows = _load_csv(filepath)

    # One searchable document per row, built from the search columns.
    documents = []
    for row in rows:
        documents.append(" ".join(str(row.get(col, "")) for col in search_cols))

    engine = BM25()
    engine.fit(documents)
    top_ranked = engine.score(query)[:max_results]

    # Keep only rows that actually matched something.
    return [
        {col: rows[row_idx].get(col, "") for col in output_cols if col in rows[row_idx]}
        for row_idx, row_score in top_ranked
        if row_score > 0
    ]
|
||||
|
||||
|
||||
def detect_domain(query):
    """Auto-detect the most relevant domain from a free-text query.

    Each domain scores one point per keyword found in the lowercased query.
    A keyword only counts when it begins at the start of a word, so "ux" no
    longer matches inside "redux" and "bar" no longer matches inside
    "toolbar", while inflected forms such as "fonts" or "scrolling" still
    match. Keywords that start with a non-word character (e.g. "#") are
    matched anywhere.

    Returns:
        The highest-scoring domain name (the first one listed wins a tie), or
        "style" when no keyword matches.
    """
    query_lower = query.lower()

    domain_keywords = {
        "color": ["color", "palette", "hex", "#", "rgb"],
        "chart": ["chart", "graph", "visualization", "trend", "bar", "pie", "scatter", "heatmap", "funnel"],
        "landing": ["landing", "page", "cta", "conversion", "hero", "testimonial", "pricing", "section"],
        "product": ["saas", "ecommerce", "e-commerce", "fintech", "healthcare", "gaming", "portfolio", "crypto", "dashboard"],
        "style": ["style", "design", "ui", "minimalism", "glassmorphism", "neumorphism", "brutalism", "dark mode", "flat", "aurora", "prompt", "css", "implementation", "variable", "checklist", "tailwind"],
        "ux": ["ux", "usability", "accessibility", "wcag", "touch", "scroll", "animation", "keyboard", "navigation", "mobile"],
        "typography": ["font", "typography", "heading", "serif", "sans"],
        "icons": ["icon", "icons", "lucide", "heroicons", "symbol", "glyph", "pictogram", "svg icon"],
        "react": ["react", "next.js", "nextjs", "suspense", "memo", "usecallback", "useeffect", "rerender", "bundle", "waterfall", "barrel", "dynamic import", "rsc", "server component"],
        "web": ["aria", "focus", "outline", "semantic", "virtualize", "autocomplete", "form", "input type", "preconnect"]
    }

    def _mentions(keyword):
        # Require a word start only for keywords that begin with a word character.
        prefix = r"(?<!\w)" if keyword[0].isalnum() else ""
        return re.search(prefix + re.escape(keyword), query_lower) is not None

    scores = {
        domain: sum(1 for kw in keywords if _mentions(kw))
        for domain, keywords in domain_keywords.items()
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "style"
|
||||
|
||||
|
||||
def search(query, domain=None, max_results=MAX_RESULTS):
    """Search one domain's CSV, auto-detecting the domain when none is given.

    Args:
        query: Free-text search query.
        domain: Key of CSV_CONFIG; when None it is chosen by detect_domain.
            An unknown key falls back to the "style" configuration.
        max_results: Number of top-ranked rows to consider.

    Returns:
        A dict with "domain", "query", "file", "count" and "results", or a
        dict with "error" and "domain" when the data file does not exist.
    """
    chosen_domain = detect_domain(query) if domain is None else domain

    config = CSV_CONFIG.get(chosen_domain, CSV_CONFIG["style"])
    csv_name = config["file"]
    filepath = DATA_DIR / csv_name

    if not filepath.exists():
        return {"error": f"File not found: {filepath}", "domain": chosen_domain}

    matches = _search_csv(
        filepath,
        config["search_cols"],
        config["output_cols"],
        query,
        max_results,
    )

    return {
        "domain": chosen_domain,
        "query": query,
        "file": csv_name,
        "count": len(matches),
        "results": matches,
    }
|
||||
|
||||
|
||||
def search_stack(query, stack, max_results=MAX_RESULTS):
    """Search the guideline CSV of a single technology stack.

    Args:
        query: Free-text search query.
        stack: Key of STACK_CONFIG.
        max_results: Number of top-ranked rows to consider.

    Returns:
        A dict with "domain" (always "stack"), "stack", "query", "file",
        "count" and "results"; or a dict with "error" when the stack is
        unknown or its data file is missing.
    """
    stack_entry = STACK_CONFIG.get(stack) if stack in STACK_CONFIG else None
    if stack_entry is None:
        return {"error": f"Unknown stack: {stack}. Available: {', '.join(AVAILABLE_STACKS)}"}

    csv_name = stack_entry["file"]
    filepath = DATA_DIR / csv_name
    if not filepath.exists():
        return {"error": f"Stack file not found: {filepath}", "stack": stack}

    matches = _search_csv(
        filepath,
        _STACK_COLS["search_cols"],
        _STACK_COLS["output_cols"],
        query,
        max_results,
    )

    return {
        "domain": "stack",
        "stack": stack,
        "query": query,
        "file": csv_name,
        "count": len(matches),
        "results": matches,
    }
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,114 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
UI/UX Pro Max Search - BM25 search engine for UI/UX style guides
|
||||
Usage: python search.py "<query>" [--domain <domain>] [--stack <stack>] [--max-results 3]
|
||||
python search.py "<query>" --design-system [-p "Project Name"]
|
||||
python search.py "<query>" --design-system --persist [-p "Project Name"] [--page "dashboard"]
|
||||
|
||||
Domains: style, color, chart, landing, product, ux, typography, icons, react, web
|
||||
Stacks: html-tailwind, react, nextjs, astro, vue, nuxtjs, nuxt-ui, svelte, swiftui, react-native, flutter, shadcn, jetpack-compose
|
||||
|
||||
Persistence (Master + Overrides pattern):
|
||||
--persist Save design system to design-system/MASTER.md
|
||||
--page Also create a page-specific override file in design-system/pages/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import io
|
||||
from core import CSV_CONFIG, AVAILABLE_STACKS, MAX_RESULTS, search, search_stack
|
||||
from design_system import generate_design_system, persist_design_system
|
||||
|
||||
# Force UTF-8 for stdout/stderr to handle emojis on Windows (cp1252 default).
# Prefer reconfigure(), which changes the encoding in place and keeps the
# stream's other settings; rewrap the underlying buffer only as a fallback.
for _stream_name in ("stdout", "stderr"):
    _stream = getattr(sys, _stream_name)
    _encoding = getattr(_stream, "encoding", None)
    # "utf8" is an alias of "utf-8"; neither needs to be changed.
    if not _encoding or _encoding.lower().replace("-", "") == "utf8":
        continue
    if hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding='utf-8')
    elif hasattr(_stream, "buffer"):
        setattr(sys, _stream_name, io.TextIOWrapper(_stream.buffer, encoding='utf-8'))
|
||||
|
||||
|
||||
def format_output(result):
    """Render a search result dict as compact markdown text.

    Args:
        result: Dict produced by a search. If it has an "error" key only the
            error is reported. Otherwise it must provide "query", "file",
            "count", "results", and either a truthy "stack" or a "domain".

    Returns:
        A markdown string: a heading, a summary, then one "### Result N"
        section per row with each value truncated to 300 characters.
    """
    if "error" in result:
        return f"Error: {result['error']}"

    if result.get("stack"):
        lines = [
            "## UI Pro Max Stack Guidelines",
            f"**Stack:** {result['stack']} | **Query:** {result['query']}",
        ]
    else:
        lines = [
            "## UI Pro Max Search Results",
            f"**Domain:** {result['domain']} | **Query:** {result['query']}",
        ]
    lines.append(f"**Source:** {result['file']} | **Found:** {result['count']} results\n")

    for number, row in enumerate(result['results'], 1):
        lines.append(f"### Result {number}")
        for key, value in row.items():
            text = str(value)
            # Long values are cut to keep the output small.
            shown = text if len(text) <= 300 else text[:300] + "..."
            lines.append(f"- **{key}:** {shown}")
        # Blank line separates consecutive results.
        lines.append("")

    return "\n".join(lines)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, then run exactly one of
    # design-system generation, stack search, or domain search.
    parser = argparse.ArgumentParser(description="UI Pro Max Search")
    parser.add_argument("query", help="Search query")
    parser.add_argument("--domain", "-d", choices=list(CSV_CONFIG.keys()), help="Search domain")
    parser.add_argument("--stack", "-s", choices=AVAILABLE_STACKS, help="Stack-specific search (html-tailwind, react, nextjs)")
    parser.add_argument("--max-results", "-n", type=int, default=MAX_RESULTS, help="Max results (default: 3)")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    # Design system generation
    parser.add_argument("--design-system", "-ds", action="store_true", help="Generate complete design system recommendation")
    parser.add_argument("--project-name", "-p", type=str, default=None, help="Project name for design system output")
    parser.add_argument("--format", "-f", choices=["ascii", "markdown"], default="ascii", help="Output format for design system")
    # Persistence (Master + Overrides pattern)
    parser.add_argument("--persist", action="store_true", help="Save design system to design-system/MASTER.md (creates hierarchical structure)")
    parser.add_argument("--page", type=str, default=None, help="Create page-specific override file in design-system/pages/")
    parser.add_argument("--output-dir", "-o", type=str, default=None, help="Output directory for persisted files (default: current directory)")

    args = parser.parse_args()

    def _emit(search_result):
        """Print a search result as JSON when --json was given, else as markdown."""
        if args.json:
            import json
            print(json.dumps(search_result, indent=2, ensure_ascii=False))
        else:
            print(format_output(search_result))

    if args.design_system:
        # Design system takes priority over --stack and --domain.
        print(generate_design_system(
            args.query,
            args.project_name,
            args.format,
            persist=args.persist,
            page=args.page,
            output_dir=args.output_dir
        ))

        if args.persist:
            # Print persistence confirmation.
            if args.project_name:
                project_slug = args.project_name.lower().replace(' ', '-')
            else:
                project_slug = "default"
            divider = "=" * 60
            print("\n" + divider)
            print(f"✅ Design system persisted to design-system/{project_slug}/")
            print(f" 📄 design-system/{project_slug}/MASTER.md (Global Source of Truth)")
            if args.page:
                page_filename = args.page.lower().replace(' ', '-')
                print(f" 📄 design-system/{project_slug}/pages/{page_filename}.md (Page Overrides)")
            print("")
            print(f"📖 Usage: When building a page, check design-system/{project_slug}/pages/[page].md first.")
            print(f" If exists, its rules override MASTER.md. Otherwise, use MASTER.md.")
            print(divider)
    elif args.stack:
        # Stack search
        _emit(search_stack(args.query, args.stack, args.max_results))
    else:
        # Domain search
        _emit(search(args.query, args.domain, args.max_results))
|
||||
@@ -4,21 +4,29 @@ const path = require('path');
|
||||
// 1. Get arguments
|
||||
const args = process.argv.slice(2);
|
||||
if (args.length < 2) {
|
||||
console.error("Usage: node sync_vikunja.js <task_id> <message_or_commit>");
|
||||
console.error("Usage:");
|
||||
console.error(" node sync_vikunja.js <task_id> <message> # Update existing task");
|
||||
console.error(" node sync_vikunja.js create \"<title>\" \"<message>\" # Create new task");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const taskId = args[0];
|
||||
const commandOrId = args[0];
|
||||
const message = args[1];
|
||||
|
||||
// 2. Load configuration from .env.agent
|
||||
const envPath = path.join(__dirname, '../config/.env.agent');
|
||||
if (!fs.existsSync(envPath)) {
|
||||
console.error("Error: .agent/config/.env.agent file not found. Please create it from the template.");
|
||||
const envPath = path.join(__dirname, '../../.env.agent');
|
||||
const fallbackEnvPath = path.join(__dirname, '../config/.env.agent');
|
||||
|
||||
let envContent = '';
|
||||
if (fs.existsSync(envPath)) {
|
||||
envContent = fs.readFileSync(envPath, 'utf8');
|
||||
} else if (fs.existsSync(fallbackEnvPath)) {
|
||||
envContent = fs.readFileSync(fallbackEnvPath, 'utf8');
|
||||
} else {
|
||||
console.error("Error: .env.agent file not found.");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const envContent = fs.readFileSync(envPath, 'utf8');
|
||||
const env = {};
|
||||
envContent.split('\n').forEach(line => {
|
||||
const match = line.match(/^([^#=]+)="?(.*?)"?$/);
|
||||
@@ -29,6 +37,7 @@ envContent.split('\n').forEach(line => {
|
||||
|
||||
const apiUrl = env.VIKUNJA_API_URL;
|
||||
const apiToken = env.VIKUNJA_API_TOKEN;
|
||||
const projectId = env.VIKUNJA_PROJECT_ID || 14;
|
||||
|
||||
if (!apiUrl || !apiToken || apiUrl.includes('[YOUR_')) {
|
||||
console.error("Error: VIKUNJA_API_URL or VIKUNJA_API_TOKEN is not configured correctly in .env.agent.");
|
||||
@@ -40,52 +49,59 @@ if (env.AGENT_OPERATING_MODE === "TEST") {
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
// 3. Helper to make API calls using native fetch (Node 18+)
|
||||
async function markTaskDoneAndComment(taskId, message) {
|
||||
const FETCH_OPTS = {
|
||||
headers: {
|
||||
'Authorization': `Bearer ${apiToken}`,
|
||||
'Content-Type': 'application/json'
|
||||
}
|
||||
};
|
||||
|
||||
async function createTaskAndComment(title, message) {
|
||||
try {
|
||||
console.log(`Connecting to Vikunja API for Task ${taskId}...`);
|
||||
|
||||
// Update task status to done
|
||||
const patchRes = await fetch(`${apiUrl}/tasks/${taskId}`, {
|
||||
method: 'POST', // Vikunja uses POST to task endpoint for updates
|
||||
headers: {
|
||||
'Authorization': `Bearer ${apiToken}`,
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({ done: true })
|
||||
});
|
||||
|
||||
if (!patchRes.ok) {
|
||||
throw new Error(`Failed to mark task as done: ${patchRes.statusText}`);
|
||||
}
|
||||
|
||||
console.log(`✅ Task ${taskId} successfully marked as Done.`);
|
||||
|
||||
// Add comment
|
||||
const commentRes = await fetch(`${apiUrl}/tasks/${taskId}/comments`, {
|
||||
console.log(`Creating new task in Project ${projectId}...`);
|
||||
const createRes = await fetch(`${apiUrl}/projects/${projectId}/tasks`, {
|
||||
method: 'PUT',
|
||||
headers: {
|
||||
'Authorization': `Bearer ${apiToken}`,
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
text: `[Agent Automator] Phase completed.\nReason/Hash: ${message}`
|
||||
...FETCH_OPTS,
|
||||
body: JSON.stringify({
|
||||
title: title,
|
||||
description: message,
|
||||
done: true
|
||||
})
|
||||
});
|
||||
|
||||
if (!commentRes.ok) {
|
||||
console.error(`Warning: Task marked as done, but failed to attach comment: ${commentRes.statusText}`);
|
||||
} else {
|
||||
console.log("✅ Comment attached successfully.");
|
||||
}
|
||||
|
||||
} catch (error) {
|
||||
console.error("❌ Failed to sync with Vikunja:");
|
||||
// Mask the token if it somehow leaks via error message
|
||||
const secureErr = error.message.replace(new RegExp(apiToken, 'g'), "********");
|
||||
console.error(secureErr);
|
||||
if (!createRes.ok) throw new Error(`Create failed: ${createRes.statusText}`);
|
||||
const task = await createRes.json();
|
||||
console.log(`✅ Task created and marked Done! ID: #${task.id}`);
|
||||
} catch (e) {
|
||||
console.error("❌ Failed:", e.message);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
markTaskDoneAndComment(taskId, message);
|
||||
/**
 * Marks an existing Vikunja task as done, then attaches a completion comment.
 *
 * A failed status update aborts the script with exit code 1. A failed comment
 * is reported as a warning only, because the task has already been marked done.
 *
 * @param {string|number} taskId - ID of the task to update.
 * @param {string} message - Reason or commit hash recorded in the comment.
 * @returns {Promise<void>}
 */
async function markTaskDoneAndComment(taskId, message) {
  try {
    console.log(`Updating Task ${taskId}...`);
    const patchRes = await fetch(`${apiUrl}/tasks/${taskId}`, {
      method: 'POST',
      ...FETCH_OPTS,
      body: JSON.stringify({ done: true }),
    });

    if (!patchRes.ok) throw new Error(`Update failed: ${patchRes.statusText}`);
    console.log(`✅ Task ${taskId} marked as Done.`);

    const commentRes = await fetch(`${apiUrl}/tasks/${taskId}/comments`, {
      method: 'PUT',
      ...FETCH_OPTS,
      body: JSON.stringify({ text: `[Agent Automator] Phase completed.\nReason/Hash: ${message}` }),
    });

    // fetch resolves on HTTP error statuses too, so the response must be
    // checked before claiming the comment was attached.
    if (!commentRes.ok) {
      console.error(`Warning: Task marked as done, but failed to attach comment: ${commentRes.statusText}`);
    } else {
      console.log("✅ Comment attached.");
    }
  } catch (e) {
    console.error("❌ Failed:", e.message);
    process.exit(1);
  }
}
|
||||
|
||||
if (commandOrId === "create") {
|
||||
createTaskAndComment(message, args[2] || "Task fully completed.");
|
||||
} else {
|
||||
markTaskDoneAndComment(commandOrId, message);
|
||||
}
|
||||
|
||||
58
scripts/analysis_raw.txt
Normal file
58
scripts/analysis_raw.txt
Normal file
@@ -0,0 +1,58 @@
|
||||
0|Gemma4-26B MXFP4_MOE|ngl=999 pure-GPU|63.21|63.78|G0:11770|G1:10411|t=6|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
1|Gemma4-26B MXFP4_MOE|compare: cpu-moe|12.92|14.21|G0:3096|G1:3497|t=6|ub=512 b=2048|kv=q4_0/q4_0|cpu-moe
|
||||
2|Gemma4-26B MXFP4_MOE|t=2|64.1|64.27|G0:11728|G1:10411|t=2|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
3|Gemma4-26B MXFP4_MOE|t=4|64|64.39|G0:11728|G1:10411|t=4|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
4|Gemma4-26B MXFP4_MOE|t=8|63.75|63.9|G0:11728|G1:10411|t=8|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
5|Gemma4-26B MXFP4_MOE|t=10|64.01|64.14|G0:11728|G1:10411|t=10|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
6|Gemma4-26B MXFP4_MOE|t=12|63.86|63.98|G0:11728|G1:10411|t=12|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
7|Gemma4-26B MXFP4_MOE|ub=256 b=1024|63.8|64.12|G0:10504|G1:9619|t=2|ub=256 b=1024|kv=q4_0/q4_0|pure-GPU
|
||||
8|Gemma4-26B MXFP4_MOE|ub=256 b=2048|63.88|64.04|G0:10504|G1:9619|t=2|ub=256 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
9|Gemma4-26B MXFP4_MOE|ub=512 b=4096|63.91|64.18|G0:11728|G1:10411|t=2|ub=512 b=4096|kv=q4_0/q4_0|pure-GPU
|
||||
10|Gemma4-26B MXFP4_MOE|ub=1024 b=2048|63.86|64.1|G0:10956|G1:9907|t=2|ub=1024 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
11|Gemma4-26B MXFP4_MOE|ub=1024 b=4096|63.85|64.06|G0:10956|G1:9907|t=2|ub=1024 b=4096|kv=q4_0/q4_0|pure-GPU
|
||||
12|Gemma4-26B MXFP4_MOE|kv=q8_0/q8_0|64.14|64.39|G0:10670|G1:10169|t=2|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU
|
||||
13|Gemma4-26B MXFP4_MOE|kv=q4_0/q8_0|37.52|37.86|G0:10394|G1:9753|t=2|ub=512 b=2048|kv=q4_0/q8_0|pure-GPU
|
||||
14|Gemma4-26B MXFP4_MOE|kv=f16/f16|63.48|64.31|G0:11700|G1:11667|t=2|ub=512 b=2048|kv=f16/f16|pure-GPU
|
||||
15|Gemma4-26B MXFP4_MOE|FINAL|64.05|64.29|G0:10667|G1:10169|t=2|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU
|
||||
16|Gemma4-26B Q4_K_M|ngl=999 pure-GPU|76.01|76.31|G0:11784|G1:10454|t=6|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
17|Gemma4-26B Q4_K_M|compare: cpu-moe|10.19|10.49|G0:2652|G1:2982|t=6|ub=512 b=2048|kv=q4_0/q4_0|cpu-moe
|
||||
18|Gemma4-26B Q4_K_M|t=2|75.67|75.87|G0:11783|G1:10454|t=2|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
19|Gemma4-26B Q4_K_M|t=4|75.61|75.87|G0:11783|G1:10454|t=4|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
20|Gemma4-26B Q4_K_M|t=8|75.42|75.59|G0:11783|G1:10454|t=8|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
21|Gemma4-26B Q4_K_M|t=10|75.71|75.82|G0:11783|G1:10454|t=10|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
22|Gemma4-26B Q4_K_M|t=12|75.08|75.7|G0:11783|G1:10454|t=12|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
23|Gemma4-26B Q4_K_M|ub=256 b=1024|75.16|75.64|G0:10559|G1:9662|t=6|ub=256 b=1024|kv=q4_0/q4_0|pure-GPU
|
||||
24|Gemma4-26B Q4_K_M|ub=256 b=2048|75.68|76.05|G0:10559|G1:9662|t=6|ub=256 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
25|Gemma4-26B Q4_K_M|ub=512 b=4096|75.92|76.16|G0:11784|G1:10454|t=6|ub=512 b=4096|kv=q4_0/q4_0|pure-GPU
|
||||
26|Gemma4-26B Q4_K_M|ub=1024 b=2048|75.7|75.9|G0:11012|G1:9950|t=6|ub=1024 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||
27|Gemma4-26B Q4_K_M|ub=1024 b=4096|75.77|75.99|G0:11011|G1:9950|t=6|ub=1024 b=4096|kv=q4_0/q4_0|pure-GPU
|
||||
28|Gemma4-26B Q4_K_M|kv=q8_0/q8_0|76.3|76.69|G0:10725|G1:10212|t=6|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU
|
||||
29|Gemma4-26B Q4_K_M|kv=q4_0/q8_0|42.88|44.58|G0:10439|G1:9796|t=6|ub=512 b=2048|kv=q4_0/q8_0|pure-GPU
|
||||
30|Gemma4-26B Q4_K_M|kv=f16/f16|76.36|76.78|G0:11761|G1:11710|t=6|ub=512 b=2048|kv=f16/f16|pure-GPU
|
||||
31|Gemma4-26B Q4_K_M|FINAL|76.4|76.75|G0:11761|G1:11710|t=6|ub=512 b=2048|kv=f16/f16|pure-GPU
|
||||
32|Qwen3.5-35B MXFP4_MOE|n-cpu-moe=5|51.43|52.07|G0:10365|G1:11152|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
33|Qwen3.5-35B MXFP4_MOE|t=2|43.8|46.4|G0:10365|G1:11152|t=2|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
34|Qwen3.5-35B MXFP4_MOE|t=4|49.21|52.78|G0:10353|G1:11152|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
35|Qwen3.5-35B MXFP4_MOE|t=8|46.43|50.49|G0:10397|G1:11152|t=8|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
36|Qwen3.5-35B MXFP4_MOE|t=10|46.12|50.06|G0:10351|G1:11152|t=10|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
37|Qwen3.5-35B MXFP4_MOE|t=12|45.23|47.1|G0:10337|G1:11152|t=12|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
38|Qwen3.5-35B MXFP4_MOE|ub=256 b=1024|48.9|52.3|G0:9834|G1:10906|t=6|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
39|Qwen3.5-35B MXFP4_MOE|ub=256 b=2048|49.62|52.52|G0:9833|G1:10906|t=6|ub=256 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
40|Qwen3.5-35B MXFP4_MOE|ub=512 b=4096|48.78|52.14|G0:10337|G1:11152|t=6|ub=512 b=4096|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
41|Qwen3.5-35B MXFP4_MOE|ub=1024 b=2048|49.95|52.53|G0:11124|G1:11644|t=6|ub=1024 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
42|Qwen3.5-35B MXFP4_MOE|ub=1024 b=4096|48.75|52.06|G0:11123|G1:11644|t=6|ub=1024 b=4096|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
43|Qwen3.5-35B MXFP4_MOE|kv=q4_0/q8_0|42.81|44.14|G0:10681|G1:11472|t=6|ub=512 b=2048|kv=q4_0/q8_0|n-cpu-moe=5
|
||||
44|Qwen3.5-35B MXFP4_MOE|FINAL|46.66|47.09|G0:10476|G1:11152|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
45|Qwen3.5-35B Q4_K_M|n-cpu-moe=5|49.01|53.09|G0:10606|G1:11338|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
46|Qwen3.5-35B Q4_K_M|t=2|45.73|47.87|G0:10599|G1:11338|t=2|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
47|Qwen3.5-35B Q4_K_M|t=4|50.98|54.33|G0:10601|G1:11338|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
48|Qwen3.5-35B Q4_K_M|t=8|48.45|52.1|G0:10596|G1:11338|t=8|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
49|Qwen3.5-35B Q4_K_M|t=10|47.83|51.45|G0:10595|G1:11338|t=10|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
50|Qwen3.5-35B Q4_K_M|t=12|43.77|46.79|G0:10589|G1:11338|t=12|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
51|Qwen3.5-35B Q4_K_M|ub=256 b=1024|52.14|53.82|G0:10089|G1:11092|t=4|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
52|Qwen3.5-35B Q4_K_M|ub=256 b=2048|50.23|53.66|G0:10091|G1:11092|t=4|ub=256 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
53|Qwen3.5-35B Q4_K_M|ub=512 b=2048|49.89|53.89|G0:10595|G1:11338|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
54|Qwen3.5-35B Q4_K_M|ub=512 b=4096|50.4|54.19|G0:10564|G1:11338|t=4|ub=512 b=4096|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
55|Qwen3.5-35B Q4_K_M|kv=q8_0/q8_0|51.84|53.53|G0:10726|G1:11732|t=4|ub=256 b=1024|kv=q8_0/q8_0|n-cpu-moe=5
|
||||
56|Qwen3.5-35B Q4_K_M|kv=q4_0/q8_0|43.22|45.99|G0:10410|G1:11412|t=4|ub=256 b=1024|kv=q4_0/q8_0|n-cpu-moe=5
|
||||
57|Qwen3.5-35B Q4_K_M|FINAL|52.05|54.48|G0:10062|G1:11092|t=4|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5
|
||||
372
scripts/auto_tune_122b.py
Normal file
372
scripts/auto_tune_122b.py
Normal file
@@ -0,0 +1,372 @@
|
||||
"""
|
||||
Qwen3.5 122B-A10B 자동 정밀 튜닝 스크립트
|
||||
===========================================
|
||||
각 설정 조합으로 서버를 재시작하고 벤치마크를 자동 수행합니다.
|
||||
서버 로그에서 순수 eval time (t/s)를 파싱하여 정확한 비교 테이블을 출력합니다.
|
||||
|
||||
예상 소요 시간: 약 30-40분 (5개 설정 × ~6-7분/설정)
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import datetime
|
||||
|
||||
# Switch stdout to UTF-8 when the stream supports reconfigure(); streams
# without that method raise AttributeError and are left untouched.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

BASE_URL = "http://127.0.0.1:8000"
MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
SERVER_EXE = r"llama_bin_run\llama-server.exe"

# ============================================================
# Configurations to test
# ============================================================
# Common arguments passed on every server launch (never varied between runs).
COMMON_ARGS = [
    "--model", MODEL_PATH,
    "-ngl", "999",
    "--cpu-moe",
    "-c", "2048",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "256",
    "-b", "1024",
    "--mlock",
    "--port", "8000",
    "--host", "0.0.0.0",
    "--no-warmup",  # the warm-up request is issued by this benchmark script itself
]

# Variable parameter combinations; "extra" is appended to COMMON_ARGS per run.
CONFIGS = [
    dict(
        name="A) --no-mmap -t 8",
        desc="서버 권장: mmap 비활성화 (baseline 대비)",
        extra=["--no-mmap", "-t", "8", "--prio", "2"],
    ),
    dict(
        name="B) --no-mmap -t 6",
        desc="스레드 감소 (캐시 경합 회피)",
        extra=["--no-mmap", "-t", "6", "--prio", "2"],
    ),
    dict(
        name="C) --no-mmap -t 10",
        desc="스레드 증가 (RAM 대역폭 포화)",
        extra=["--no-mmap", "-t", "10", "--prio", "2"],
    ),
    dict(
        name="D) --no-mmap -t 12",
        desc="더 많은 스레드",
        extra=["--no-mmap", "-t", "12", "--prio", "2"],
    ),
    dict(
        name="E) --no-mmap -t 10 --prio 3 --poll 100",
        desc="최적 스레드 + 리얼타임 우선순위 + 폴링",
        extra=["--no-mmap", "-t", "10", "--prio", "3", "--poll", "100"],
    ),
]
|
||||
|
||||
# ============================================================
|
||||
# 유틸리티 함수
|
||||
# ============================================================
|
||||
|
||||
def kill_server():
    """Force-terminate llama-server.exe via taskkill, then pause for 3 seconds.

    taskkill output is discarded, so nothing is reported when no server
    process is running.
    """
    taskkill_command = "taskkill /F /IM llama-server.exe >nul 2>&1"
    os.system(taskkill_command)
    time.sleep(3)
|
||||
|
||||
def start_server(config, log_path):
    """Launch the server process with its output redirected to a log file.

    Args:
        config: Dict whose "extra" list is appended to COMMON_ARGS.
        log_path: Path of the log file; it is truncated and receives both
            stdout and stderr of the server.

    Returns:
        Tuple (proc, log_file). The caller is responsible for closing
        log_file once the server has been stopped.

    Raises:
        OSError: If the log file cannot be opened or the executable cannot be
            started. The log file handle is closed before re-raising.
    """
    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            cwd=os.getcwd(),
        )
    except Exception:
        # Do not leak the handle when the process could not be spawned.
        log_file.close()
        raise
    return proc, log_file
|
||||
|
||||
def wait_for_server(timeout=600):
    """Poll the server's /health endpoint until it reports status "ok".

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as a health response with status "ok" is received,
        False if the timeout elapses first.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Connection refused / not-ready responses are expected while the
            # model loads. Only Exception is caught so Ctrl+C still aborts.
            pass
        time.sleep(5)
    return False
|
||||
|
||||
def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and time the round trip.

    Args:
        prompt: User message content to send.
        max_tokens: Value of the request's "max_tokens" field.

    Returns:
        Tuple (completion_tokens, elapsed_seconds). completion_tokens is read
        from the response's "usage" object and is 0 when it is absent.
    """
    request_body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }
    http_request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(request_body).encode("utf-8"),
        headers={"Content-Type": "application/json"}
    )

    began = time.time()
    with urllib.request.urlopen(http_request, timeout=600) as resp:
        reply = json.loads(resp.read())
    elapsed = time.time() - began

    completion_tokens = reply.get("usage", {}).get("completion_tokens", 0)
    return completion_tokens, elapsed
|
||||
|
||||
def parse_eval_times(log_path):
    """Extract the generation ("eval time") statistics from a server log.

    Only lines that begin with whitespace followed by "eval time" match, so
    "prompt eval time" lines are not included.

    Args:
        log_path: Path of the server log file.

    Returns:
        List of dicts with keys "total_ms", "tokens", "ms_per_token" and
        "tps", in log order. Empty if the file cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # A missing or unreadable log simply yields no results.
        return []

    # "eval time = XXXXX.XX ms / NNN tokens (XXX.XX ms per token, XX.XX tokens per second)"
    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    return [
        {
            "total_ms": float(total_ms),
            "tokens": int(tokens),
            "ms_per_token": float(ms_per_token),
            "tps": float(tps),
        }
        for total_ms, tokens, ms_per_token, tps in re.findall(pattern, content, re.MULTILINE)
    ]
|
||||
|
||||
def parse_prompt_eval_times(log_path):
    """Extract the "prompt eval time" statistics from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        List of dicts with keys "total_ms", "tokens", "ms_per_token" and
        "tps", in log order. Empty if the file cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # A missing or unreadable log simply yields no results.
        return []

    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    return [
        {
            "total_ms": float(total_ms),
            "tokens": int(tokens),
            "ms_per_token": float(ms_per_token),
            "tps": float(tps),
        }
        for total_ms, tokens, ms_per_token, tps in re.findall(pattern, content, re.MULTILINE)
    ]
|
||||
|
||||
def parse_vram_usage(log_path):
    """Read the "CUDA0 model buffer size" value from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        The first matching size formatted as "<rounded> MiB", or "N/A" when
        the file cannot be read or contains no such line.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        return "N/A"

    match = re.search(r'CUDA0 model buffer size\s*=\s*([\d.]+)\s*MiB', content)
    if match is None:
        return "N/A"
    return f"{float(match.group(1)):.0f} MiB"
|
||||
|
||||
# ============================================================
|
||||
# 메인 튜닝 루프
|
||||
# ============================================================
|
||||
|
||||
def main():
    """Run every entry of CONFIGS through a restart/benchmark cycle and report.

    For each configuration the server is killed and restarted with its output
    logged to tune_log_<idx>.txt, one warm-up request and three benchmark
    prompts are sent, and the eval / prompt-eval throughput is parsed back out
    of the log. A comparison table against the hard-coded earlier baseline is
    printed and the raw results are written to tune_results_<timestamp>.txt in
    the current directory.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    print("=" * 70)
    print(" Qwen3.5 122B-A10B 자동 정밀 튜닝")
    print(f" 시작 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f" 테스트 설정: {len(CONFIGS)}개")
    print(f" 예상 소요: ~{len(CONFIGS) * 7}분")
    print("=" * 70)
    print()
    print(" 기존 Baseline: mmap on, -t 8, --prio 2 → 10.06 t/s (eval)")
    print()

    # Collected per-configuration results.
    all_results = []

    for idx, config in enumerate(CONFIGS):
        config_start = time.time()
        log_path = os.path.join(os.getcwd(), f"tune_log_{idx}.txt")

        print(f"\n{'='*70}")
        print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}")
        print(f" {config['desc']}")
        print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
        print(f"{'='*70}")

        # 1. Stop any server that is still running.
        print(" [1/4] 서버 종료 중...")
        kill_server()

        # 2. Start a fresh server for this configuration.
        print(f" [2/4] 서버 시작 중... (모델 로딩 ~3-5분)")
        _proc, log_file = start_server(config, log_path)

        # 3. Wait until the server reports ready.
        if not wait_for_server(timeout=600):
            print(" ❌ 서버 시작 실패! 다음 설정으로 넘어갑니다.")
            kill_server()
            log_file.close()
            all_results.append({
                "config": config["name"],
                "status": "FAILED",
                "eval_tps": [],
                "prompt_tps": [],
                "vram": "N/A"
            })
            continue

        load_time = time.time() - config_start
        print(f" [3/4] 서버 준비 완료! (로딩 {load_time:.0f}초)")

        # 4. Benchmark: one warm-up request followed by three measured runs.
        print(" [4/4] 벤치마크 실행 중...")

        # Warm-up. Remember whether it succeeded: only then is the first
        # entry in the log the warm-up rather than a measured run.
        warmup_ok = False
        try:
            run_single_benchmark("Say hello.", max_tokens=20)
            warmup_ok = True
            print(" 워밍업 완료")
        except Exception as e:
            print(f" 워밍업 실패: {e}")

        # Three measured runs.
        prompts = [
            "Write a detailed explanation of how neural networks learn through backpropagation and gradient descent.",
            "Explain the complete process of photosynthesis including light and dark reactions in detail.",
            "Describe the differences between SQL and NoSQL databases with examples and performance characteristics.",
        ]

        for i, prompt in enumerate(prompts):
            try:
                tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                approx_tps = tokens / elapsed if elapsed > 0 else 0
                print(f" Run {i+1}/3: {tokens} tokens, {elapsed:.1f}s, ~{approx_tps:.2f} t/s (approx)")
            except Exception as e:
                print(f" Run {i+1}/3: ERROR - {e}")

        # Give the server a moment to flush its log before it is killed.
        time.sleep(2)

        # Stop the server.
        kill_server()
        log_file.close()
        time.sleep(2)

        # Parse the log.
        eval_times = parse_eval_times(log_path)
        prompt_times = parse_prompt_eval_times(log_path)
        vram = parse_vram_usage(log_path)

        # Exclude the warm-up (first entry), but only when the warm-up
        # actually ran; otherwise the first entry is a real measurement.
        if warmup_ok and len(eval_times) > 1:
            bench_evals = eval_times[1:]
        else:
            bench_evals = eval_times

        if warmup_ok and len(prompt_times) > 1:
            bench_prompts = prompt_times[1:]
        else:
            bench_prompts = prompt_times

        eval_speeds = [e["tps"] for e in bench_evals]
        prompt_speeds = [p["tps"] for p in bench_prompts]

        result = {
            "config": config["name"],
            "status": "OK",
            "eval_tps": eval_speeds,
            "prompt_tps": prompt_speeds,
            "vram": vram,
        }
        all_results.append(result)

        config_elapsed = time.time() - config_start
        print(f"\n 완료! 소요: {config_elapsed:.0f}초")

        if eval_speeds:
            avg_eval = sum(eval_speeds) / len(eval_speeds)
            max_eval = max(eval_speeds)
            print(f" 📊 Eval TPS: avg={avg_eval:.2f}, max={max_eval:.2f}")

    # ============================================================
    # Final comparison table
    # ============================================================
    print("\n")
    print("=" * 80)
    print(" 🏆 최종 결과 비교 테이블")
    print("=" * 80)
    print()

    print(f" {'설정':<45} {'Eval t/s':>10} {'최대':>8} {'Prompt t/s':>12} {'VRAM':>12}")
    print(f" {'-'*45} {'-'*10} {'-'*8} {'-'*12} {'-'*12}")

    # Baseline row (hard-coded result of an earlier run).
    print(f" {'[기준] mmap on, -t 8, --prio 2':<45} {'10.02':>10} {'10.06':>8} {'29.52':>12} {'5392 MiB':>12}")

    best_avg = 0
    best_config = ""

    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f" {r['config']:<45} {'FAILED':>10} {'':>8} {'':>12} {r['vram']:>12}")
            continue

        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0

        if avg_e > best_avg:
            best_avg = avg_e
            best_config = r["config"]

        # Star configurations whose average beats the baseline's best run.
        marker = " ⭐" if avg_e > 10.06 else ""
        print(f" {r['config']:<45} {avg_e:>10.2f} {max_e:>8.2f} {avg_p:>12.2f} {r['vram']:>12}{marker}")

    print()
    if best_avg > 0:
        improvement = ((best_avg - 10.02) / 10.02) * 100
        print(f" 🏆 최고 성능: {best_config}")
        print(f" → {best_avg:.2f} t/s (기준 10.02 t/s 대비 {improvement:+.1f}%)")

    print()
    print(f" 완료 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 80)

    # Also persist the results to a file.
    result_path = os.path.join(os.getcwd(), f"tune_results_{timestamp}.txt")
    with open(result_path, "w", encoding="utf-8") as f:
        f.write("Qwen3.5 122B-A10B Fine Tuning Results\n")
        f.write(f"Date: {timestamp}\n\n")
        for r in all_results:
            f.write(f"{r['config']}: {r['eval_tps']} (VRAM: {r['vram']})\n")
    print(f" 결과 저장: {result_path}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
257
scripts/auto_tune_122b_r2.py
Normal file
257
scripts/auto_tune_122b_r2.py
Normal file
@@ -0,0 +1,257 @@
|
||||
"""
|
||||
Qwen3.5 122B-A10B 정밀 튜닝 2라운드
|
||||
====================================
|
||||
1라운드 결과: mmap on이 더 빠르고, 스레드 수가 적을수록 빠름
|
||||
→ mmap on 상태에서 스레드 수 4~8 범위를 정밀 탐색
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import datetime
|
||||
|
||||
# Switch stdout to UTF-8 when the stream supports reconfigure(); streams
# without that method raise AttributeError and are left untouched.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

BASE_URL = "http://127.0.0.1:8000"
MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
SERVER_EXE = r"llama_bin_run\llama-server.exe"

# Arguments shared by every server launch in this round.
COMMON_ARGS = [
    "--model", MODEL_PATH,
    "-ngl", "999",
    "--cpu-moe",
    "-c", "2048",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "256",
    "-b", "1024",
    "--mlock",
    "--port", "8000",
    "--host", "0.0.0.0",
    "--no-warmup",
]

# Per-run variations; "extra" is appended to COMMON_ARGS.
CONFIGS = [
    dict(
        name="F) mmap on, -t 4",
        desc="최소 스레드 (4개, 물리코어 절반)",
        extra=["-t", "4", "--prio", "2"],
    ),
    dict(
        name="G) mmap on, -t 5",
        desc="스레드 5개",
        extra=["-t", "5", "--prio", "2"],
    ),
    dict(
        name="H) mmap on, -t 6",
        desc="스레드 6개 (--no-mmap에서 최고였음)",
        extra=["-t", "6", "--prio", "2"],
    ),
    dict(
        name="I) mmap on, -t 7",
        desc="스레드 7개",
        extra=["-t", "7", "--prio", "2"],
    ),
    dict(
        name="J) mmap on, -t 6, --prio 3",
        desc="최적 스레드 + 리얼타임 우선순위",
        extra=["-t", "6", "--prio", "3"],
    ),
]
|
||||
|
||||
def kill_server():
    """Force-terminate llama-server.exe via taskkill, then pause for 3 seconds.

    taskkill output is discarded, so nothing is reported when no server
    process is running.
    """
    taskkill_command = "taskkill /F /IM llama-server.exe >nul 2>&1"
    os.system(taskkill_command)
    time.sleep(3)
|
||||
|
||||
def start_server(config, log_path):
    """Launch the server process with its output redirected to a log file.

    Args:
        config: Dict whose "extra" list is appended to COMMON_ARGS.
        log_path: Path of the log file; it is truncated and receives both
            stdout and stderr of the server.

    Returns:
        Tuple (proc, log_file). The caller is responsible for closing
        log_file once the server has been stopped.

    Raises:
        OSError: If the log file cannot be opened or the executable cannot be
            started. The log file handle is closed before re-raising.
    """
    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, cwd=os.getcwd())
    except Exception:
        # Do not leak the handle when the process could not be spawned.
        log_file.close()
        raise
    return proc, log_file
|
||||
|
||||
def wait_for_server(timeout=600):
    """Poll the server's /health endpoint until it reports status "ok".

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as a health response with status "ok" is received,
        False if the timeout elapses first.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Connection refused / not-ready responses are expected while the
            # model loads. Only Exception is caught so Ctrl+C still aborts.
            pass
        time.sleep(5)
    return False
|
||||
|
||||
def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and time the round trip.

    Args:
        prompt: User message content to send.
        max_tokens: Value of the request's "max_tokens" field.

    Returns:
        Tuple (completion_tokens, elapsed_seconds). completion_tokens is read
        from the response's "usage" object and is 0 when it is absent.
    """
    request_body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }
    http_request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(request_body).encode("utf-8"),
        headers={"Content-Type": "application/json"}
    )
    began = time.time()
    with urllib.request.urlopen(http_request, timeout=600) as resp:
        reply = json.loads(resp.read())
    elapsed = time.time() - began
    return reply.get("usage", {}).get("completion_tokens", 0), elapsed
|
||||
|
||||
def parse_eval_times(log_path):
    """Extract the generation ("eval time") statistics from a server log.

    Only lines that begin with whitespace followed by "eval time" match, so
    "prompt eval time" lines are not included.

    Args:
        log_path: Path of the server log file.

    Returns:
        List of dicts with keys "tps" (tokens per second) and "tokens", in
        log order. Empty if the file cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # A missing or unreadable log simply yields no results.
        return []
    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    matches = re.findall(pattern, content, re.MULTILINE)
    return [{"tps": float(m[3]), "tokens": int(m[1])} for m in matches]
|
||||
|
||||
def parse_prompt_eval_times(log_path):
    """Extract the "prompt eval time" throughput figures from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        List of dicts with the single key "tps" (tokens per second), in log
        order. Empty if the file cannot be read.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # A missing or unreadable log simply yields no results.
        return []
    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    matches = re.findall(pattern, content, re.MULTILINE)
    return [{"tps": float(m[3])} for m in matches]
|
||||
|
||||
def main():
    """Run the round-2 configurations and print a combined results table.

    For each entry of CONFIGS the server is restarted with its output logged
    to tune_r2_log_<idx>.txt, one warm-up request and three benchmark prompts
    are sent, and throughput is parsed back out of the log. The final table
    lists the hard-coded round-1 results followed by this round's results.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    print("=" * 70)
    print(" Qwen3.5 122B-A10B 정밀 튜닝 - 2라운드")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f" 테스트: {len(CONFIGS)}개 설정 (mmap on + 스레드 4~7 정밀 탐색)")
    print("=" * 70)
    print()

    all_results = []

    for idx, config in enumerate(CONFIGS):
        config_start = time.time()
        log_path = os.path.join(os.getcwd(), f"tune_r2_log_{idx}.txt")

        print(f"\n{'='*70}")
        print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}")
        print(f" {config['desc']}")
        print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
        print(f"{'='*70}")

        kill_server()
        print(f" [1/3] 서버 시작 중...")
        _proc, log_file = start_server(config, log_path)

        if not wait_for_server(timeout=600):
            print(" ❌ 서버 시작 실패!")
            kill_server()
            log_file.close()
            all_results.append({"config": config["name"], "status": "FAILED", "eval_tps": []})
            continue

        load_time = time.time() - config_start
        print(f" [2/3] 서버 준비 완료! ({load_time:.0f}초)")

        # Warm-up, then benchmark. Remember whether the warm-up succeeded:
        # only then is the first log entry the warm-up rather than a real run.
        # Only Exception is caught so Ctrl+C still aborts the script.
        warmup_ok = False
        try:
            run_single_benchmark("Say hello.", max_tokens=20)
            warmup_ok = True
        except Exception:
            pass

        print(" [3/3] 벤치마크 3회...")
        prompts = [
            "Write a detailed explanation of how neural networks learn through backpropagation.",
            "Explain the complete process of photosynthesis including light and dark reactions.",
            "Describe the differences between SQL and NoSQL databases with examples.",
        ]
        for i, prompt in enumerate(prompts):
            try:
                tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                print(f" Run {i+1}: {tokens}tok, {elapsed:.1f}s, ~{tokens/elapsed:.2f} t/s")
            except Exception as e:
                print(f" Run {i+1}: ERROR - {e}")

        # Let the server flush its log before it is killed.
        time.sleep(2)
        kill_server()
        log_file.close()
        time.sleep(2)

        eval_times = parse_eval_times(log_path)
        prompt_times = parse_prompt_eval_times(log_path)
        # Drop the warm-up entry only when the warm-up actually ran.
        bench_evals = eval_times[1:] if warmup_ok and len(eval_times) > 1 else eval_times
        bench_prompts = prompt_times[1:] if warmup_ok and len(prompt_times) > 1 else prompt_times

        eval_speeds = [e["tps"] for e in bench_evals]
        prompt_speeds = [p["tps"] for p in bench_prompts]

        all_results.append({
            "config": config["name"],
            "status": "OK",
            "eval_tps": eval_speeds,
            "prompt_tps": prompt_speeds,
        })

        if eval_speeds:
            print(f" 📊 Eval: avg={sum(eval_speeds)/len(eval_speeds):.2f}, max={max(eval_speeds):.2f}")

    # Final results
    print("\n")
    print("=" * 85)
    print(" 🏆 전체 튜닝 결과 (1라운드 + 2라운드 통합)")
    print("=" * 85)
    print()
    print(f" {'설정':<48} {'Avg t/s':>8} {'Max t/s':>8} {'Prompt':>8}")
    print(f" {'-'*48} {'-'*8} {'-'*8} {'-'*8}")

    # Round-1 results (hard-coded): (name, avg t/s, max t/s, prompt t/s)
    r1 = [
        ("[기준] mmap on, -t 8, --prio 2", 10.02, 10.06, 29.52),
        ("A) --no-mmap -t 8", 9.66, 9.70, 28.26),
        ("B) --no-mmap -t 6", 10.02, 10.18, 26.73),
        ("C) --no-mmap -t 10", 9.42, 9.46, 27.31),
        ("D) --no-mmap -t 12", 9.04, 9.11, 27.92),
        ("E) --no-mmap -t 10 --prio 3 --poll 100", 9.41, 9.45, 28.37),
    ]
    for name, avg, mx, pp in r1:
        marker = " ⭐" if avg >= 10.0 else ""
        print(f" {name:<48} {avg:>8.2f} {mx:>8.2f} {pp:>8.2f}{marker}")

    print(f" {'--- 2라운드 ---':<48}")

    # Previous best. Despite the name, this tracks the best single-run (max)
    # value: it starts at the baseline max and is compared against max_e below.
    best_avg = 10.06
    best_config = "[기준] mmap on, -t 8"

    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f" {r['config']:<48} {'FAIL':>8}")
            continue
        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
        if max_e > best_avg:
            best_avg = max_e
            best_config = r["config"]
        marker = " ⭐" if avg_e >= 10.0 else ""
        print(f" {r['config']:<48} {avg_e:>8.2f} {max_e:>8.2f} {avg_p:>8.2f}{marker}")

    print()
    print(f" 🏆 최고 성능: {best_config} → {best_avg:.2f} t/s")
    print(f" 완료: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 85)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
339
scripts/auto_tune_gemma4_256k.py
Normal file
339
scripts/auto_tune_gemma4_256k.py
Normal file
@@ -0,0 +1,339 @@
|
||||
"""
|
||||
Gemma4 26B-A4B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
|
||||
Phase 1: -ngl sweep (GPU layers)
|
||||
Phase 2: -t / -tb sweep (CPU threads)
|
||||
Phase 3: -ub / -b sweep (batch sizes)
|
||||
Phase 4: --cache-type-k/v sweep (KV cache precision)
|
||||
Phase 5: --no-mmap, --poll, --prio sweep (misc)
|
||||
Each phase fixes the best from previous phases.
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
import itertools
|
||||
|
||||
# Switch stdout to UTF-8 when the stream supports reconfigure(); streams
# without that method raise AttributeError and are left untouched.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

BASE_URL = "http://127.0.0.1:8000"
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"
CONTEXT = 262144
BENCHMARK_RUNS = 3
BENCHMARK_TOKENS = 200

# ─── Baseline (from previous tuning at -c 4096) ───
# Starting configuration; each sweep phase overrides individual keys.
BEST = dict(
    ngl=22,
    t=8,
    tb=8,
    ub=512,
    b=2048,
    ctk="q4_0",
    ctv="q4_0",
    fa="on",
    mlock=True,
    mmap=True,
    prio=2,
    poll=50,
)

# Every successful test_config() result is appended here.
ALL_RESULTS = []
|
||||
|
||||
|
||||
def kill_server():
    """Force-terminate llama-server.exe via taskkill, then pause for 4 seconds.

    taskkill's output is captured and discarded, and its exit status is
    ignored, so this is a no-op when no server is running.
    """
    taskkill_argv = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill_argv, capture_output=True)
    time.sleep(4)
|
||||
|
||||
|
||||
def build_cmd(cfg):
    """Build the llama-server argument list for one tuning configuration.

    Args:
        cfg: Dict with the keys ngl, fa, ctk, ctv, ub, b, t, tb, prio, poll,
            mlock and mmap (the same shape as BEST).

    Returns:
        List of command-line arguments, starting with the server executable.
    """
    flag_values = (
        ("-ngl", str(cfg["ngl"])),
        ("-c", str(CONTEXT)),
        ("-np", "1"),
        ("-fa", cfg["fa"]),
        ("--cache-type-k", cfg["ctk"]),
        ("--cache-type-v", cfg["ctv"]),
        ("-ub", str(cfg["ub"])),
        ("-b", str(cfg["b"])),
        ("-t", str(cfg["t"])),
        ("-tb", str(cfg["tb"])),
        ("--prio", str(cfg["prio"])),
        ("--poll", str(cfg["poll"])),
        ("--port", "8000"),
        ("--host", "0.0.0.0"),
    )
    argv = [LLAMA_SERVER, "--model", MODEL]
    for flag, value in flag_values:
        argv.extend((flag, value))

    # Boolean switches are only emitted when they deviate from "off".
    if cfg["mlock"]:
        argv.append("--mlock")
    if not cfg["mmap"]:
        argv.append("--no-mmap")
    return argv
|
||||
|
||||
|
||||
def start_server(cfg):
    """Launch llama-server for the given configuration.

    The server's stdout and stderr are discarded. They were previously routed
    to a pipe that nothing reads, which lets the child block on write once the
    pipe buffer fills with log output.

    Args:
        cfg: Configuration dict accepted by build_cmd().

    Returns:
        The subprocess.Popen handle of the started server.
    """
    cmd = build_cmd(cfg)
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        cwd=os.getcwd(),
    )
    return proc
|
||||
|
||||
|
||||
def wait_for_server(timeout=180):
    """Poll the server's /health endpoint until it reports status "ok".

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as a health response with status "ok" is received,
        False if the timeout elapses first.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=3) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Connection refused / not-ready responses are expected while the
            # model loads. Only Exception is caught so Ctrl+C still aborts.
            pass
        time.sleep(2)
    return False
|
||||
|
||||
|
||||
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Send one fixed chat-completion request and return its throughput.

    Args:
        max_tokens: Value of the request's "max_tokens" field.

    Returns:
        Completion tokens divided by wall-clock seconds for the whole HTTP
        round trip, or 0 if no time elapsed.
    """
    request_body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }
    http_request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(request_body).encode("utf-8"),
        headers={"Content-Type": "application/json"}
    )

    began = time.time()
    with urllib.request.urlopen(http_request, timeout=300) as resp:
        reply = json.loads(resp.read())
    elapsed = time.time() - began

    generated = reply.get("usage", {}).get("completion_tokens", 0)
    if elapsed > 0:
        return generated / elapsed
    return 0
|
||||
|
||||
|
||||
def get_vram():
    """Query nvidia-smi for used and total GPU memory.

    nvidia-smi prints one "used, total" line per GPU. The values are summed
    over all GPUs, so on a single-GPU machine the result is that GPU's
    figures. Parsing the whole output as one record, as before, failed on
    multi-GPU machines and always yielded (0, 0).

    Returns:
        Tuple (used, total) of ints in the units nvidia-smi reports, or
        (0, 0) if nvidia-smi cannot be run or its output cannot be parsed.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        rows = [line for line in r.stdout.splitlines() if line.strip()]
        if not rows:
            return 0, 0
        used_sum = 0
        total_sum = 0
        for row in rows:
            used, total = row.split(",")[:2]
            used_sum += int(used.strip())
            total_sum += int(total.strip())
        return used_sum, total_sum
    except (OSError, subprocess.SubprocessError, ValueError):
        # nvidia-smi missing, timed out, or printed something unexpected.
        return 0, 0
|
||||
|
||||
|
||||
def test_config(cfg, label=""):
    """Start the server with one configuration and measure its throughput.

    Any running server is killed first. After the server reports healthy, one
    warm-up request and BENCHMARK_RUNS measured requests are sent. The server
    process is always killed before returning, including when an unexpected
    exception (such as Ctrl+C) propagates.

    Args:
        cfg: Configuration dict accepted by start_server().
        label: Short description used in progress output and stored with the
            result; falls back to str(cfg) for display when empty.

    Returns:
        A dict containing cfg's keys plus "avg_tps", "best_tps", "vram_used",
        "vram_total" and "label" (also appended to ALL_RESULTS), or None if
        the server failed to start or every benchmark run failed.
    """
    kill_server()
    desc = label or str(cfg)
    print(f" [{desc}] Starting server...")
    proc = start_server(cfg)

    try:
        if not wait_for_server():
            print(f" [{desc}] FAILED to start")
            return None

        vram_used, vram_total = get_vram()
        print(f" [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)

        # Warm-up: failures are tolerated, but only ordinary exceptions are
        # swallowed so that Ctrl+C still stops the sweep.
        try:
            run_benchmark(max_tokens=20)
        except Exception:
            pass

        # Benchmark
        speeds = []
        for _ in range(BENCHMARK_RUNS):
            try:
                speeds.append(run_benchmark())
            except Exception as e:
                print(f"ERR({e}) ", end="", flush=True)
    finally:
        proc.kill()

    if not speeds:
        print("ALL FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

    result = {**cfg, "avg_tps": avg, "best_tps": best,
              "vram_used": vram_used, "vram_total": vram_total, "label": label}
    ALL_RESULTS.append(result)
    return result
|
||||
|
||||
|
||||
def phase_sweep(phase_name, param_name, values, base_cfg):
    """Try every candidate value for one parameter group and return the fastest.

    Args:
        phase_name: Title printed in the phase banner.
        param_name: A single config key, or a list of keys swept together.
        values: Candidate values; when ``param_name`` is a list each candidate
            is a sequence paired positionally with the keys.
        base_cfg: Configuration the candidates are layered on; it is copied,
            never modified.

    Returns:
        The ``test_config`` result with the highest ``avg_tps``, or ``None``
        when no candidate produced a result.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(f" PHASE: {phase_name}")
    print(f" Sweeping: {param_name} = {values}")
    print(rule)

    grouped = isinstance(param_name, list)
    winner = None
    for candidate in values:
        # Normalise both forms to (key, value) pairs so one code path applies them.
        pairs = list(zip(param_name, candidate)) if grouped else [(param_name, candidate)]
        trial_cfg = dict(base_cfg)
        trial_cfg.update(pairs)
        tag = " | ".join(f"{key}={value}" for key, value in pairs)

        outcome = test_config(trial_cfg, tag)
        if not outcome:
            continue
        if winner is None or outcome["avg_tps"] > winner["avg_tps"]:
            winner = outcome

    if winner:
        print(f"\n ★ Phase winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
|
||||
|
||||
|
||||
def main():
    """Run the five tuning phases for Gemma4 26B-A4B and report the best setup.

    Starting from ``BEST``, each phase sweeps one group of parameters with
    ``phase_sweep`` and carries the winner into the next phase. The final
    configuration is then printed, re-measured with five benchmark runs and
    rendered as a ``llama-server`` command line. Every result collected in
    ``ALL_RESULTS`` is written to ``scripts/tune_results_gemma4_256k.json``.
    """
    print("=" * 70)
    print(" Gemma4 26B-A4B COMPREHENSIVE Auto-Tuner")
    print(" 256K Context | RTX 3060 12GB")
    print("=" * 70)
    print()

    cfg = dict(BEST)

    def adopt(winner, *keys):
        # Carry a phase winner's values forward; keep the old ones if the phase failed.
        if winner:
            for key in keys:
                cfg[key] = winner[key]

    # ─── Phase 1: -ngl (already done, quick verify top 3) ───
    adopt(phase_sweep("GPU Layers (-ngl)", "ngl", [22, 21, 20], cfg), "ngl")

    # ─── Phase 2: CPU threads (-t, -tb) ───
    thread_combos = [
        (2, 2), (4, 4), (4, 8), (6, 6), (6, 8),
        (8, 8), (8, 12), (10, 10), (12, 12), (16, 16)
    ]
    adopt(phase_sweep("CPU Threads (-t, -tb)", ["t", "tb"], thread_combos, cfg), "t", "tb")

    # ─── Phase 3: Batch sizes (-ub, -b) ───
    batch_combos = [
        (128, 512), (256, 1024), (256, 2048),
        (512, 1024), (512, 2048), (512, 4096),
        (1024, 2048), (1024, 4096)
    ]
    adopt(phase_sweep("Batch Sizes (-ub, -b)", ["ub", "b"], batch_combos, cfg), "ub", "b")

    # ─── Phase 4: KV cache precision ───
    kv_combos = [
        ("q4_0", "q4_0"),
        ("q8_0", "q8_0"),
        ("q4_0", "q8_0"),
        ("f16", "f16"),
    ]
    adopt(phase_sweep("KV Cache Type (-ctk, -ctv)", ["ctk", "ctv"], kv_combos, cfg), "ctk", "ctv")

    # ─── Phase 5: Misc (mmap, poll, prio) ───
    misc_combos = [
        (True, 50, 2),    # baseline
        (False, 50, 2),   # no-mmap
        (True, 0, 2),     # no polling
        (True, 100, 2),   # max polling
        (True, 50, 3),    # realtime priority
        (False, 0, 3),    # no-mmap + no-poll + realtime
    ]
    adopt(phase_sweep("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], misc_combos, cfg),
          "mmap", "poll", "prio")

    # ─── Final Report ───
    print()
    print("=" * 70)
    print(" FINAL OPTIMAL CONFIGURATION")
    print("=" * 70)
    print(f" ngl: {cfg['ngl']}")
    print(f" threads: -t {cfg['t']} -tb {cfg['tb']}")
    print(f" batch: -ub {cfg['ub']} -b {cfg['b']}")
    print(f" kv cache: -ctk {cfg['ctk']} -ctv {cfg['ctv']}")
    print(f" flash: -fa {cfg['fa']}")
    print(f" mlock: {'yes' if cfg['mlock'] else 'no'}")
    print(f" mmap: {'yes' if cfg['mmap'] else 'no (--no-mmap)'}")
    print(f" prio: {cfg['prio']}")
    print(f" poll: {cfg['poll']}")
    print()

    # Final verification run
    print(" Running final verification (5 runs)...")
    kill_server()
    proc = start_server(cfg)
    final_speeds = []
    try:
        if wait_for_server():
            try:
                run_benchmark(max_tokens=20)  # warm-up, result discarded
            except Exception:
                pass
            for i in range(5):
                try:
                    tps = run_benchmark()
                except Exception as e:
                    print(f" Run {i+1}: ERR({e})")
                    continue
                final_speeds.append(tps)
                print(f" Run {i+1}: {tps:.2f} t/s")
        else:
            # Without this check every run would fail silently against a dead server.
            print(" Final verification skipped: server FAILED to start")
    finally:
        proc.kill()

    if final_speeds:
        avg = sum(final_speeds) / len(final_speeds)
        best = max(final_speeds)
        print(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")

    print()
    cmd_parts = [
        f"llama-server --model {MODEL}",
        f"-ngl {cfg['ngl']} -c {CONTEXT}",
        f"-t {cfg['t']} -tb {cfg['tb']}",
        f"-ub {cfg['ub']} -b {cfg['b']}",
        f"-fa {cfg['fa']}",
        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
        f"--prio {cfg['prio']} --poll {cfg['poll']}",
    ]
    if cfg["mlock"]:
        cmd_parts.append("--mlock")
    if not cfg["mmap"]:
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")

    print(" Recommended command:")
    print(f" {' '.join(cmd_parts)}")
    print("=" * 70)

    # Dump all results to JSON
    with open("scripts/tune_results_gemma4_256k.json", "w") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)
    print(f"\n Full results saved: scripts/tune_results_gemma4_256k.json")
|
||||
|
||||
|
||||
# Run the tuner only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
163
scripts/auto_tune_gemma4_ncpumoe.py
Normal file
163
scripts/auto_tune_gemma4_ncpumoe.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""
|
||||
Gemma4 26B --n-cpu-moe sweep + secondary param tuning at 256K context.
|
||||
Gemma4 has 30 layers. Sweep n-cpu-moe from 0 to 30.
|
||||
"""
|
||||
import subprocess, time, json, urllib.request, sys, os
|
||||
|
||||
# Switch stdout to UTF-8 so the "★" and "→" characters printed by the report
# do not fail on consoles that use a legacy encoding.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except (AttributeError, ValueError):
    # stdout was replaced by an object that cannot be reconfigured; keep its encoding.
    pass

BASE_URL = "http://127.0.0.1:8000"                 # where the benchmarked server listens
SERVER = r"llama_bin_run\llama-server.exe"         # executable, relative to the working directory
MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"   # model file, relative to the working directory
CTX = 262144                                       # context size passed with -c
RUNS = 3                                           # timed benchmark requests per configuration
|
||||
|
||||
|
||||
def kill():
    """Force-terminate every ``llama-server.exe`` process, then pause 4 seconds."""
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    # Output is captured only to keep taskkill's messages off the console;
    # the exit status is deliberately not checked.
    subprocess.run(taskkill_cmd, capture_output=True)
    time.sleep(4)
|
||||
|
||||
|
||||
def start(ncpumoe, t=4, ub=512, b=2048, ctk="q4_0", ctv="q4_0", prio=3, nommap=False):
    """Launch ``llama-server`` with the given tuning parameters.

    Args:
        ncpumoe: Value for ``--n-cpu-moe``; the flag is omitted when it is 0.
        t: Thread count, used for both ``-t`` and ``-tb``.
        ub: Value for ``-ub``.
        b: Value for ``-b``.
        ctk: Value for ``--cache-type-k``.
        ctv: Value for ``--cache-type-v``.
        prio: Value for ``--prio``.
        nommap: Append ``--no-mmap`` when true.

    Returns:
        The ``subprocess.Popen`` handle of the started server. Its output is
        discarded, so ``stdout`` on the handle is ``None``.
    """
    cmd = [SERVER, "--model", MODEL, "-ngl", "999",
           "-c", str(CTX), "-np", "1", "-fa", "on",
           "--cache-type-k", ctk, "--cache-type-v", ctv,
           "-ub", str(ub), "-b", str(b), "-t", str(t), "-tb", str(t),
           "--prio", str(prio), "--poll", "50",
           "--mlock", "--port", "8000", "--host", "0.0.0.0"]
    if ncpumoe > 0:
        cmd.extend(["--n-cpu-moe", str(ncpumoe)])
    if nommap:
        cmd.append("--no-mmap")
    # Nothing in this script reads the server's output. A pipe that is never
    # drained can fill up and stall the child, so discard the output instead.
    return subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT,
                            cwd=os.getcwd())
|
||||
|
||||
|
||||
def wait_ready(timeout=240):
    """Poll ``{BASE_URL}/health`` until the server reports ``status == "ok"``.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        ``True`` as soon as a health response with status ``"ok"`` is seen,
        ``False`` when ``timeout`` elapses first.
    """
    # monotonic() is unaffected by wall-clock adjustments during a long wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(urllib.request.Request(f"{BASE_URL}/health"), timeout=3) as r:
                if json.loads(r.read()).get("status") == "ok":
                    return True
        except Exception:
            # Connection refused, HTTP errors and malformed replies all mean
            # "not ready yet". KeyboardInterrupt is no longer swallowed.
            pass
        time.sleep(2)
    return False
|
||||
|
||||
|
||||
def bench(n=200):
    """Send one chat completion request and return completion tokens per second.

    Args:
        n: Value sent as ``max_tokens``.

    Returns:
        ``completion_tokens`` from the response's ``usage`` object (0 when it
        is missing) divided by the wall-clock duration of the request, or 0
        when no time elapsed.
    """
    body = {
        "model": "m",
        "messages": [{"role": "user",
                      "content": "Count from 1 to 50, each number on new line."}],
        "max_tokens": n,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    # The timed span covers the whole HTTP round trip, not just token generation.
    started = time.time()
    with urllib.request.urlopen(request, timeout=300) as resp:
        reply = json.loads(resp.read())
    elapsed = time.time() - started
    tokens = reply.get("usage", {}).get("completion_tokens", 0)
    if elapsed > 0:
        return tokens / elapsed
    return 0
|
||||
|
||||
|
||||
def vram():
    """Return ``(used_mib, total_mib)`` of GPU memory reported by ``nvidia-smi``.

    One CSV row is printed per GPU; the rows are summed so that a machine
    with several GPUs yields a total instead of a parse failure.

    Returns:
        A tuple of two ints, or ``(0, 0)`` when ``nvidia-smi`` cannot be run,
        times out after 5 seconds, or prints something unparsable.
    """
    try:
        r = subprocess.run(["nvidia-smi", "--query-gpu=memory.used,memory.total",
                            "--format=csv,noheader,nounits"],
                           capture_output=True, text=True, timeout=5)
        rows = [line for line in r.stdout.splitlines() if line.strip()]
        if not rows:
            return 0, 0
        used = total = 0
        for line in rows:
            a, b = line.split(",")
            used += int(a.strip())
            total += int(b.strip())
        return used, total
    except (OSError, subprocess.SubprocessError, ValueError):
        # Best effort only: the VRAM figure is informational.
        return 0, 0
|
||||
|
||||
|
||||
def test(label, ncpumoe, **kw):
|
||||
kill()
|
||||
print(f" [{label}] Starting...", end=" ", flush=True)
|
||||
p = start(ncpumoe, **kw)
|
||||
if not wait_ready():
|
||||
print("FAILED"); p.kill(); return None
|
||||
vu, vt = vram()
|
||||
print(f"VRAM:{vu}/{vt} | ", end="", flush=True)
|
||||
try: bench(20)
|
||||
except: pass
|
||||
speeds = []
|
||||
for _ in range(RUNS):
|
||||
try: speeds.append(bench())
|
||||
except: pass
|
||||
p.kill()
|
||||
if not speeds:
|
||||
print("BENCH FAILED"); return None
|
||||
avg, best = sum(speeds)/len(speeds), max(speeds)
|
||||
print(f"AVG:{avg:.1f} BEST:{best:.1f} t/s")
|
||||
return {"label": label, "ncpumoe": ncpumoe, "avg": avg, "best": best,
|
||||
"vram": vu, **kw}
|
||||
|
||||
|
||||
def main():
    """Sweep ``--n-cpu-moe`` and then threads, batch sizes and KV cache types.

    Phase 1 tries a coarse set of ``--n-cpu-moe`` values, phase 1b refines
    around the best one, and phases 2-4 vary threads, batch sizes and KV cache
    type at that value. All successful measurements are written to
    ``scripts/tune_results_gemma4_ncpumoe.json``.
    """
    print("=" * 60)
    print(" Gemma4 26B 256K | --n-cpu-moe Sweep + Param Tune")
    print("=" * 60)
    results = []

    def record(outcome):
        # test() returns None for configurations that could not be measured.
        if outcome:
            results.append(outcome)

    # Phase 1: n-cpu-moe sweep (0, 5, 10, 15, 20, 25, 30)
    print("\n--- Phase 1: --n-cpu-moe sweep ---")
    coarse = [0, 5, 10, 15, 20, 25, 30]
    for n in coarse:
        nm = n > 15  # use --no-mmap when heavy CPU offload
        record(test(f"ncpumoe={n}", n, nommap=nm))

    if not results:
        # max() on an empty list would raise; nothing can be tuned further.
        print("\n No configuration could be benchmarked; aborting.")
        return

    # Find best n-cpu-moe
    best_r = max(results, key=lambda x: x["avg"])
    best_n = best_r["ncpumoe"]
    print(f"\n ★ Best n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")

    # Fine-tune around best
    if best_n > 0:
        print(f"\n--- Phase 1b: Fine-tune around ncpumoe={best_n} ---")
        tried = set(coarse)
        for offset in (-3, -1, 1, 3):
            # Stay inside the 0..30 range of the sweep and never repeat a value.
            n = min(30, max(0, best_n + offset))
            if n in tried:
                continue
            tried.add(n)
            nm = n > 15
            record(test(f"ncpumoe={n}", n, nommap=nm))
        best_r = max(results, key=lambda x: x["avg"])
        best_n = best_r["ncpumoe"]
        print(f"\n ★ Refined n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")

    # Phase 2: Thread sweep at best n-cpu-moe
    nm = best_n > 15
    print(f"\n--- Phase 2: Thread sweep (ncpumoe={best_n}) ---")
    for t in [2, 4, 6, 8, 10]:
        record(test(f"t={t}", best_n, t=t, nommap=nm))
    best_t = max([x for x in results if x["ncpumoe"] == best_n], key=lambda x: x["avg"])
    # Phase 1 results carry no "t" entry; they ran with start()'s default of 4.
    bt = best_t.get("t", 4)
    print(f"\n ★ Best threads: {bt}")

    # Phase 3: Batch sweep
    print(f"\n--- Phase 3: Batch sweep ---")
    for ub, b in [(256, 1024), (512, 2048), (512, 4096), (1024, 2048)]:
        record(test(f"ub={ub},b={b}", best_n, t=bt, ub=ub, b=b, nommap=nm))

    # Phase 4: KV cache type
    print(f"\n--- Phase 4: KV cache type ---")
    for ctk, ctv in [("q4_0", "q4_0"), ("q8_0", "q8_0")]:
        record(test(f"kv={ctk}", best_n, t=bt, ctk=ctk, ctv=ctv, nommap=nm))

    # Final report
    best_all = max(results, key=lambda x: x["avg"])
    print(f"\n{'='*60}")
    print(f" FINAL BEST: {best_all['label']} → {best_all['avg']:.1f} t/s (VRAM: {best_all['vram']})")
    print(f"{'='*60}")

    with open("scripts/tune_results_gemma4_ncpumoe.json", "w") as f:
        json.dump(results, f, indent=2, default=str)
    print(" Saved: scripts/tune_results_gemma4_ncpumoe.json")
|
||||
|
||||
|
||||
# Run the sweep only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
335
scripts/auto_tune_qwen35b_256k.py
Normal file
335
scripts/auto_tune_qwen35b_256k.py
Normal file
@@ -0,0 +1,335 @@
|
||||
"""
|
||||
Qwen3.5 35B-A3B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
|
||||
Based on existing optimized setting: --cpu-moe -ngl 999 -c 4096 -t 6 (35 t/s)
|
||||
Now tuning for -c 262144 (256K context).
|
||||
|
||||
Phase 1: --cpu-moe vs no --cpu-moe baseline
|
||||
Phase 2: -t / -tb sweep
|
||||
Phase 3: -ub / -b sweep
|
||||
Phase 4: --cache-type-k/v sweep
|
||||
Phase 5: Misc (mmap, poll, prio)
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Best effort: emit UTF-8 so the "★" and "→" characters in the report print
# correctly. A stdout object without reconfigure() is left as it is.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

BASE_URL = "http://127.0.0.1:8000"               # where the benchmarked server listens
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"  # executable, relative to the working directory
MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"     # model file, relative to the working directory
CONTEXT = 262144                                  # context size passed with -c
BENCHMARK_RUNS = 3                                # timed requests per configuration
BENCHMARK_TOKENS = 200                            # default max_tokens of a timed request

# Starting configuration; each tuning phase overrides a subset of these keys.
BEST = dict(
    ngl=999,
    cpu_moe=True,
    t=6,
    tb=6,
    ub=512,
    b=2048,
    ctk="q4_0",
    ctv="q4_0",
    fa="on",
    mlock=True,
    mmap=True,
    prio=2,
    poll=50,
)

# Every successful measurement is appended here and dumped to JSON at the end.
ALL_RESULTS = []
|
||||
|
||||
|
||||
def kill_server():
    """Force-terminate every ``llama-server.exe`` process, then pause 4 seconds."""
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    # Output is captured only to keep taskkill's messages off the console;
    # the exit status is deliberately not checked.
    subprocess.run(taskkill_cmd, capture_output=True)
    time.sleep(4)
|
||||
|
||||
|
||||
def build_cmd(cfg):
    """Translate a configuration mapping into the ``llama-server`` argument list.

    Args:
        cfg: Mapping with the keys ``ngl``, ``fa``, ``ctk``, ``ctv``, ``ub``,
            ``b``, ``t``, ``tb``, ``prio``, ``poll``, ``mlock`` and ``mmap``;
            ``cpu_moe`` is optional.

    Returns:
        The argument list, starting with ``LLAMA_SERVER``.
    """
    argv = [LLAMA_SERVER, "--model", MODEL]
    argv += ["-ngl", str(cfg["ngl"])]
    argv += ["-c", str(CONTEXT)]
    argv += ["-np", "1"]
    # fa / ctk / ctv are passed through as given, without str() conversion.
    argv += ["-fa", cfg["fa"]]
    argv += ["--cache-type-k", cfg["ctk"]]
    argv += ["--cache-type-v", cfg["ctv"]]
    for flag, key in (("-ub", "ub"), ("-b", "b"), ("-t", "t"), ("-tb", "tb"),
                      ("--prio", "prio"), ("--poll", "poll")):
        argv += [flag, str(cfg[key])]
    argv += ["--port", "8000", "--host", "0.0.0.0"]

    # Boolean switches go last: present only when enabled (mmap is inverted).
    if cfg.get("cpu_moe"):
        argv.append("--cpu-moe")
    if cfg["mlock"]:
        argv.append("--mlock")
    if not cfg["mmap"]:
        argv.append("--no-mmap")
    return argv
|
||||
|
||||
|
||||
def start_server(cfg):
    """Launch ``llama-server`` for ``cfg`` in the current working directory.

    Args:
        cfg: Configuration mapping accepted by ``build_cmd``.

    Returns:
        The ``subprocess.Popen`` handle of the started server. Its output is
        discarded, so ``stdout`` on the handle is ``None``.
    """
    cmd = build_cmd(cfg)
    # Nothing in this script reads the server's output. A pipe that is never
    # drained can fill up and stall the child, so discard the output instead.
    proc = subprocess.Popen(
        cmd, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT,
        cwd=os.getcwd()
    )
    return proc
|
||||
|
||||
|
||||
def wait_for_server(timeout=240):
    """Poll ``{BASE_URL}/health`` until the server reports ``status == "ok"``.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        ``True`` as soon as a health response with status ``"ok"`` is seen,
        ``False`` when ``timeout`` elapses first.
    """
    # monotonic() is unaffected by wall-clock adjustments during a long wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=3) as resp:
                data = json.loads(resp.read())
                if data.get("status") == "ok":
                    return True
        except Exception:
            # Connection refused, HTTP errors and malformed replies all mean
            # "not ready yet". KeyboardInterrupt is no longer swallowed.
            pass
        time.sleep(2)
    return False
|
||||
|
||||
|
||||
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Send one chat completion request and return completion tokens per second.

    Args:
        max_tokens: Value sent as ``max_tokens``.

    Returns:
        ``completion_tokens`` from the response's ``usage`` object (0 when it
        is missing) divided by the wall-clock duration of the request, or 0
        when no time elapsed.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    # The timed span covers the whole HTTP round trip, not just token generation.
    began = time.time()
    with urllib.request.urlopen(request, timeout=300) as resp:
        reply = json.loads(resp.read())
    elapsed = time.time() - began

    tokens = reply.get("usage", {}).get("completion_tokens", 0)
    if elapsed > 0:
        return tokens / elapsed
    return 0
|
||||
|
||||
|
||||
def get_vram():
    """Return ``(used_mib, total_mib)`` of GPU memory reported by ``nvidia-smi``.

    ``nvidia-smi`` prints one ``used, total`` CSV row per GPU. The rows are
    summed so that a multi-GPU machine yields a total instead of failing to
    parse the second row.

    Returns:
        A tuple of two ints. ``(0, 0)`` is returned when ``nvidia-smi`` cannot
        be run, times out after 5 seconds, or prints something unparsable.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        rows = [line for line in r.stdout.splitlines() if line.strip()]
        if not rows:
            return 0, 0
        used = total = 0
        for line in rows:
            fields = line.split(",")
            used += int(fields[0].strip())
            total += int(fields[1].strip())
        return used, total
    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
        # Best effort only: the VRAM figure is informational.
        return 0, 0
|
||||
|
||||
|
||||
def test_config(cfg, label=""):
|
||||
kill_server()
|
||||
desc = label or str(cfg)
|
||||
print(f" [{desc}] Starting server...", flush=True)
|
||||
proc = start_server(cfg)
|
||||
|
||||
if not wait_for_server():
|
||||
print(f" [{desc}] FAILED to start")
|
||||
proc.kill()
|
||||
return None
|
||||
|
||||
vram_used, vram_total = get_vram()
|
||||
print(f" [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)
|
||||
|
||||
# Warmup
|
||||
try:
|
||||
run_benchmark(max_tokens=20)
|
||||
except:
|
||||
pass
|
||||
|
||||
speeds = []
|
||||
for i in range(BENCHMARK_RUNS):
|
||||
try:
|
||||
tps = run_benchmark()
|
||||
speeds.append(tps)
|
||||
except Exception as e:
|
||||
print(f"ERR({e}) ", end="", flush=True)
|
||||
|
||||
proc.kill()
|
||||
|
||||
if not speeds:
|
||||
print("ALL FAILED")
|
||||
return None
|
||||
|
||||
avg = sum(speeds) / len(speeds)
|
||||
best = max(speeds)
|
||||
print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
|
||||
|
||||
result = {**{k: v for k, v in cfg.items()}, "avg_tps": avg, "best_tps": best,
|
||||
"vram_used": vram_used, "vram_total": vram_total, "label": label}
|
||||
ALL_RESULTS.append(result)
|
||||
return result
|
||||
|
||||
|
||||
def phase_sweep(phase_name, param_name, values, base_cfg):
    """Try every candidate value for one parameter group and return the fastest.

    Args:
        phase_name: Title printed in the phase banner.
        param_name: A single config key, or a list of keys swept together.
        values: Candidate values; when ``param_name`` is a list each candidate
            is a sequence paired positionally with the keys.
        base_cfg: Configuration the candidates are layered on; it is copied,
            never modified.

    Returns:
        The ``test_config`` result with the highest ``avg_tps``, or ``None``
        when no candidate produced a result.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(f" PHASE: {phase_name}")
    print(f" Sweeping: {param_name} = {values}")
    print(rule)

    grouped = isinstance(param_name, list)
    winner = None
    for candidate in values:
        # Normalise both forms to (key, value) pairs so one code path applies them.
        pairs = list(zip(param_name, candidate)) if grouped else [(param_name, candidate)]
        trial_cfg = dict(base_cfg)
        trial_cfg.update(pairs)
        tag = " | ".join(f"{key}={value}" for key, value in pairs)

        outcome = test_config(trial_cfg, tag)
        if not outcome:
            continue
        if winner is None or outcome["avg_tps"] > winner["avg_tps"]:
            winner = outcome

    if winner:
        print(f"\n ★ Phase winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
|
||||
|
||||
|
||||
def main():
    """Run the five tuning phases for Qwen3.5 35B-A3B and report the best setup.

    Starting from ``BEST``, each phase sweeps one group of parameters with
    ``phase_sweep`` and carries the winner into the next phase. The final
    configuration is then printed, re-measured with five benchmark runs and
    rendered as a ``llama-server`` command line. Every result collected in
    ``ALL_RESULTS`` is written to ``scripts/tune_results_qwen35b_256k.json``.
    """
    print("=" * 70)
    print(" Qwen3.5 35B-A3B COMPREHENSIVE Auto-Tuner")
    print(" 256K Context | RTX 3060 12GB")
    print(" Baseline: --cpu-moe -ngl 999 -c 4096 -t 6 → 35 t/s")
    print("=" * 70)
    print()

    cfg = dict(BEST)

    def adopt(winner, *keys):
        # Carry a phase winner's values forward; keep the old ones if the phase failed.
        if winner:
            for key in keys:
                cfg[key] = winner[key]

    # ─── Phase 1: --cpu-moe critical test ───
    adopt(phase_sweep("CPU-MoE Mode", "cpu_moe", [True, False], cfg), "cpu_moe")

    # ─── Phase 2: CPU threads ───
    thread_combos = [
        (2, 2), (4, 4), (4, 6), (6, 6), (6, 8),
        (8, 8), (8, 12), (10, 10), (12, 12)
    ]
    adopt(phase_sweep("CPU Threads (-t, -tb)", ["t", "tb"], thread_combos, cfg), "t", "tb")

    # ─── Phase 3: Batch sizes ───
    batch_combos = [
        (128, 512), (256, 1024), (256, 2048),
        (512, 1024), (512, 2048), (512, 4096),
        (1024, 2048), (1024, 4096)
    ]
    adopt(phase_sweep("Batch Sizes (-ub, -b)", ["ub", "b"], batch_combos, cfg), "ub", "b")

    # ─── Phase 4: KV cache ───
    kv_combos = [
        ("q4_0", "q4_0"),
        ("q8_0", "q8_0"),
        ("f16", "f16"),
    ]
    adopt(phase_sweep("KV Cache Type", ["ctk", "ctv"], kv_combos, cfg), "ctk", "ctv")

    # ─── Phase 5: Misc ───
    misc_combos = [
        (True, 50, 2),
        (False, 50, 2),
        (True, 0, 2),
        (True, 100, 2),
        (True, 50, 3),
    ]
    adopt(phase_sweep("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], misc_combos, cfg),
          "mmap", "poll", "prio")

    # ─── Final Report ───
    print()
    print("=" * 70)
    print(" FINAL OPTIMAL CONFIGURATION")
    print("=" * 70)
    for k, v in cfg.items():
        print(f" {k:>12}: {v}")
    print()

    # Final verification
    print(" Running final verification (5 runs)...")
    kill_server()
    proc = start_server(cfg)
    final_speeds = []
    try:
        if wait_for_server():
            try:
                run_benchmark(max_tokens=20)  # warm-up, result discarded
            except Exception:
                pass
            for i in range(5):
                try:
                    tps = run_benchmark()
                except Exception as e:
                    print(f" Run {i+1}: ERR({e})")
                    continue
                final_speeds.append(tps)
                print(f" Run {i+1}: {tps:.2f} t/s")
        else:
            # Without this check every run would fail silently against a dead server.
            print(" Final verification skipped: server FAILED to start")
    finally:
        proc.kill()

    if final_speeds:
        avg = sum(final_speeds) / len(final_speeds)
        best = max(final_speeds)
        print(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")

    print()
    cmd_parts = [
        f"llama-server --model {MODEL}",
        f"-ngl {cfg['ngl']} -c {CONTEXT}",
    ]
    if cfg.get("cpu_moe"):
        cmd_parts.append("--cpu-moe")
    cmd_parts.extend([
        f"-t {cfg['t']} -tb {cfg['tb']}",
        f"-ub {cfg['ub']} -b {cfg['b']}",
        f"-fa {cfg['fa']}",
        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
        f"--prio {cfg['prio']} --poll {cfg['poll']}",
    ])
    if cfg["mlock"]:
        cmd_parts.append("--mlock")
    if not cfg["mmap"]:
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")

    print(" Recommended command:")
    print(f" {' '.join(cmd_parts)}")
    print("=" * 70)

    with open("scripts/tune_results_qwen35b_256k.json", "w") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)
    print(f"\n Full results saved: scripts/tune_results_qwen35b_256k.json")
|
||||
|
||||
|
||||
# Run the tuner only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
BIN
scripts/boot_122b.txt
Normal file
BIN
scripts/boot_122b.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_38.txt
Normal file
BIN
scripts/boot_122b_38.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_42.txt
Normal file
BIN
scripts/boot_122b_42.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_44.txt
Normal file
BIN
scripts/boot_122b_44.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_auto.txt
Normal file
BIN
scripts/boot_122b_auto.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_maxmem.txt
Normal file
BIN
scripts/boot_122b_maxmem.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_row.txt
Normal file
BIN
scripts/boot_122b_row.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_row_dual.txt
Normal file
BIN
scripts/boot_122b_row_dual.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_single.txt
Normal file
BIN
scripts/boot_122b_single.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_single2.txt
Normal file
BIN
scripts/boot_122b_single2.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_ts85.txt
Normal file
BIN
scripts/boot_122b_ts85.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_tune.txt
Normal file
BIN
scripts/boot_122b_tune.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_tuned.txt
Normal file
BIN
scripts/boot_122b_tuned.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_v2.txt
Normal file
BIN
scripts/boot_122b_v2.txt
Normal file
Binary file not shown.
BIN
scripts/boot_log.txt
Normal file
BIN
scripts/boot_log.txt
Normal file
Binary file not shown.
BIN
scripts/boot_log2.txt
Normal file
BIN
scripts/boot_log2.txt
Normal file
Binary file not shown.
BIN
scripts/boot_log3.txt
Normal file
BIN
scripts/boot_log3.txt
Normal file
Binary file not shown.
BIN
scripts/boot_log4.txt
Normal file
BIN
scripts/boot_log4.txt
Normal file
Binary file not shown.
BIN
scripts/boot_log5.txt
Normal file
BIN
scripts/boot_log5.txt
Normal file
Binary file not shown.
BIN
scripts/boot_qwen_iq4.txt
Normal file
BIN
scripts/boot_qwen_iq4.txt
Normal file
Binary file not shown.
3
scripts/check_help.bat
Normal file
3
scripts/check_help.bat
Normal file
@@ -0,0 +1,3 @@
|
||||
@echo off
rem Save the llama-server --help lines that mention any of the keywords below
rem to scripts\help_gpu_flags.txt.
setlocal
rem The paths below are relative to the repository root (the parent of the
rem directory holding this script), so switch there regardless of the caller's cwd.
pushd "%~dp0.." || exit /b 1
.\llama_bin_run\llama-server.exe --help 2>&1 | findstr /i "split tensor device main-gpu cpu-moe n-cpu-moe" > scripts\help_gpu_flags.txt
popd
echo Done.
|
||||
38
scripts/download_llama.py
Normal file
38
scripts/download_llama.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import urllib.request
|
||||
import json
|
||||
import zipfile
|
||||
import os
|
||||
import ssl
|
||||
|
||||
import shutil  # local to this script section: used to stream the download to disk

# The downloaded archive contains executables, so TLS certificate verification
# stays enabled by default. Set LLAMA_DL_INSECURE=1 only when a TLS-intercepting
# proxy makes verification impossible.
ctx = ssl.create_default_context()
if os.environ.get("LLAMA_DL_INSECURE") == "1":
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
try:
    with urllib.request.urlopen(req, context=ctx) as response:
        data = json.loads(response.read().decode())

    # Pick the first asset that looks like a Windows x64 CUDA zip.
    download_url = None
    for asset in data['assets']:
        if "bin-win-cuda-cu12.2.0-x64.zip" in asset['name'] or ("bin-win-cu" in asset['name'] and asset['name'].endswith(".zip") and "x64" in asset['name']):
            download_url = asset['browser_download_url']
            break

    if download_url:
        print(f"Downloading {download_url}...")
        zip_path = "llama.zip"
        try:
            # Stream to disk instead of holding the whole archive in memory.
            with urllib.request.urlopen(download_url, context=ctx) as resp, open(zip_path, 'wb') as out_file:
                shutil.copyfileobj(resp, out_file)
            print("Extracting to 'llama_bin'...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall("llama_bin")
            print("Done extracting.")
        finally:
            # Remove the archive even when the download or extraction failed.
            if os.path.exists(zip_path):
                os.remove(zip_path)
    else:
        print("Could not find the target zip. Available assets:")
        for asset in data['assets']:
            print(" -", asset['name'])
except Exception as e:
    print(f"Error: {e}")
|
||||
33
scripts/download_models.py
Normal file
33
scripts/download_models.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import importlib.util
import os

# huggingface_hub reads HF_HUB_ENABLE_HF_TRANSFER while it is being imported,
# so the variable has to be set before the import below. It is only enabled
# when the hf_transfer package is installed; otherwise downloads use the
# regular code path.
if importlib.util.find_spec("hf_transfer") is not None:
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import hf_hub_download

models = [
    # Download the smaller Gemma4 26B first
    ("ggml-org/gemma-4-26B-A4B-it-GGUF", "gemma-4-26B-A4B-it-Q4_K_M.gguf"),
    # Then Qwen 35B
    ("unsloth/Qwen3.5-35B-A3B-GGUF", "Qwen3.5-35B-A3B-Q4_K_M.gguf"),
    # Finally the 122B model (split into three parts)
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"),
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00002-of-00003.gguf"),
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00003-of-00003.gguf")
]

print("=== 고속 다운로더 시작 (huggingface_hub & hf_transfer) ===")
os.makedirs("models", exist_ok=True)

for repo, filename in models:
    print(f"\n>>> 다운로드 중 (백그라운드 진행): [{repo}] 의 [{filename}]...")
    try:
        path = hf_hub_download(
            repo_id=repo,
            filename=filename,
            local_dir="./models",
            local_dir_use_symlinks=False
        )
        print(f"완료: {path}")
    except Exception as e:
        # One failed file must not stop the remaining downloads.
        print(f"다운로드 실패: {e}")

print("\n모든 다운로드 프로세스가 종료되었습니다.")
|
||||
56
scripts/download_true_llama.py
Normal file
56
scripts/download_true_llama.py
Normal file
@@ -0,0 +1,56 @@
|
||||
import urllib.request
|
||||
import json
|
||||
import zipfile
|
||||
import os
|
||||
import ssl
|
||||
import shutil
|
||||
|
||||
# The downloaded archive contains executables, so TLS certificate verification
# stays enabled by default. Set LLAMA_DL_INSECURE=1 only when a TLS-intercepting
# proxy makes verification impossible.
ctx = ssl.create_default_context()
if os.environ.get("LLAMA_DL_INSECURE") == "1":
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
try:
    with urllib.request.urlopen(req, context=ctx) as response:
        data = json.loads(response.read().decode())

    # Select the CUDA 12.4 binary package, skipping the separate "cudart" asset.
    download_url = None
    for asset in data['assets']:
        if "bin-win-cuda-12.4-x64.zip" in asset['name'] and "cudart" not in asset['name']:
            download_url = asset['browser_download_url']
            break

    if download_url:
        print(f"Downloading true binaries: {download_url}...")
        zip_path = "llama_main.zip"
        try:
            # Stream to disk instead of holding the whole archive in memory.
            with urllib.request.urlopen(download_url, context=ctx) as resp, open(zip_path, 'wb') as out_file:
                shutil.copyfileobj(resp, out_file)

            print("Extracting to temporary folder 'llama_temp'...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall("llama_temp")

            print("Moving exact files to 'llama_bin_run'...")
            os.makedirs("llama_bin_run", exist_ok=True)
            # Flatten the archive: every file lands directly in llama_bin_run.
            for root, dirs, files in os.walk("llama_temp"):
                for file in files:
                    shutil.move(os.path.join(root, file), os.path.join("llama_bin_run", file))

            # Fill in anything llama_bin has that the new package did not provide.
            if os.path.exists("llama_bin"):
                for item in os.listdir("llama_bin"):
                    src = os.path.join("llama_bin", item)
                    dst = os.path.join("llama_bin_run", item)
                    if not os.path.exists(dst):
                        try:
                            shutil.copy(src, dst)
                        except OSError as e:
                            # Sub-directories and unreadable files are skipped.
                            print(f"Skipped {src}: {e}")
        finally:
            # Remove temporary artefacts even when a step above failed.
            if os.path.exists(zip_path):
                os.remove(zip_path)
            shutil.rmtree("llama_temp", ignore_errors=True)
        print("Download and path extraction fully complete.")
    else:
        print("Could not find the target zip.")
except Exception as e:
    print(f"Error: {e}")
|
||||
531
scripts/dual_gpu_benchmark.mjs
Normal file
531
scripts/dual_gpu_benchmark.mjs
Normal file
@@ -0,0 +1,531 @@
|
||||
/**
|
||||
* Dual-GPU (2x RTX 3060 24GB) Comprehensive Model Benchmark
|
||||
* ===========================================================
|
||||
* Tests 4 models across multiple parameter configurations to find
|
||||
* the absolute best model + settings for 256K context coding agent.
|
||||
*
|
||||
* Models:
|
||||
* 1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB)
|
||||
* 2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB)
|
||||
* 3. Gemma4 26B-A4B Q4_K_M (~15.6 GB)
|
||||
* 4. Gemma4 26B-A4B MXFP4_MOE (~15.5 GB)
|
||||
*
|
||||
* Run: node scripts/dual_gpu_benchmark.mjs
|
||||
*/
|
||||
|
||||
import { spawn, execSync } from "child_process";
|
||||
import { writeFileSync, statSync, existsSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
|
||||
// ─── Configuration ─────────────────────────────────────────────
|
||||
/** Address of the llama-server instance under test. */
const BASE_URL = "http://127.0.0.1:8000";
/** Server executable, relative to the working directory. */
const LLAMA_SERVER = "llama_bin_run\\llama-server.exe";
/** Context size passed with -c (256K). */
const CONTEXT = 256 * 1024;
/** Timed requests per configuration. */
const BENCHMARK_RUNS = 3;
/** Default max_tokens of a timed request. */
const BENCHMARK_TOKENS = 200;
/** How long to wait for the server to become healthy, in milliseconds. */
const SERVER_TIMEOUT = 5 * 60 * 1000;

/** Models to benchmark; `path` is relative to the working directory. */
const MODELS = [
  {
    name: "Qwen3.5-35B-A3B Q4_K_M",
    path: "models\\Qwen3.5-35B-A3B-Q4_K_M.gguf",
    type: "qwen",
    quant: "Q4_K_M",
    totalLayers: 64,
  },
  {
    name: "Qwen3.5-35B-A3B MXFP4_MOE",
    path: "models\\Qwen3.5-35B-A3B-MXFP4_MOE.gguf",
    type: "qwen",
    quant: "MXFP4_MOE",
    totalLayers: 64,
  },
  {
    name: "Gemma4 26B-A4B Q4_K_M",
    path: "models\\gemma-4-26B-A4B-it-Q4_K_M.gguf",
    type: "gemma4",
    quant: "Q4_K_M",
    totalLayers: 30,
  },
  {
    name: "Gemma4 26B-A4B MXFP4_MOE",
    path: "models\\gemma-4-26B-A4B-it-MXFP4_MOE.gguf",
    type: "gemma4",
    quant: "MXFP4_MOE",
    totalLayers: 30,
  },
];

/** Accumulates every benchmark result produced during the run. */
const ALL_RESULTS = [];
|
||||
|
||||
// ─── Utility ───────────────────────────────────────────────────
|
||||
|
||||
/**
 * Print a message prefixed with the current local time in 24-hour format.
 * @param {string} msg - Text to print.
 */
function log(msg) {
  const clockOptions = { hour12: false };
  const stamp = new Date().toLocaleTimeString("ko-KR", clockOptions);
  console.log(`[${stamp}] ${msg}`);
}
|
||||
|
||||
/**
 * Resolve after the given delay.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that settles once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||
|
||||
/**
 * Force-terminate every llama-server.exe process and wait five seconds.
 * @returns {Promise<void>} Promise that resolves after the pause.
 */
function killServer() {
  const command = "taskkill /F /IM llama-server.exe";
  try {
    execSync(command, { stdio: "ignore" });
  } catch {
    // A failing taskkill is deliberately ignored.
  }
  return sleep(5000);
}
|
||||
|
||||
/**
 * Parse `index, used, total` CSV rows as printed by nvidia-smi.
 * Blank rows and rows with non-numeric fields are dropped.
 * @param {string} csv - Raw nvidia-smi output.
 * @returns {{gpu: number, used: number, total: number}[]} One entry per valid row.
 */
function parseVramCsv(csv) {
  return csv
    .split(/\r?\n/)
    .map((line) => line.split(",").map((field) => Number.parseInt(field.trim(), 10)))
    .filter((fields) => fields.length >= 3 && fields.slice(0, 3).every(Number.isFinite))
    .map(([gpu, used, total]) => ({ gpu, used, total }));
}

/**
 * Query per-GPU memory usage through nvidia-smi.
 * @returns {{gpu: number, used: number, total: number}[]} One entry per GPU,
 *   or an empty array when nvidia-smi cannot be run or times out after 5 s.
 */
function getVramAll() {
  try {
    const out = execSync(
      "nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits",
      { encoding: "utf-8", timeout: 5000 }
    );
    return parseVramCsv(out);
  } catch {
    // Best effort only: the VRAM figures are informational.
    return [];
  }
}
|
||||
|
||||
/**
 * Build the llama-server command line for one model and parameter set.
 * @param {string} modelPath - Path passed to --model.
 * @param {object} params - Tuning parameters: ngl, t, ub, b, ctk, ctv, plus the
 *   optional cpuMoe (false), nCpuMoe (0), prio (3) and nommap (false).
 * @returns {string[]} Executable followed by its arguments.
 */
function buildCmd(modelPath, params) {
  const { ngl, t, ub, b, ctk, ctv } = params;
  const { cpuMoe = false, nCpuMoe = 0, prio = 3, nommap = false } = params;

  // --cpu-moe takes precedence; --n-cpu-moe is only used when it is off.
  let moeFlags = [];
  if (cpuMoe) {
    moeFlags = ["--cpu-moe"];
  } else if (nCpuMoe > 0) {
    moeFlags = ["--n-cpu-moe", String(nCpuMoe)];
  }

  // The same thread count is used for -t and -tb.
  const threads = String(t);

  return [
    LLAMA_SERVER,
    "--model", modelPath,
    "-ngl", String(ngl),
    "-c", String(CONTEXT),
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", ctk,
    "--cache-type-v", ctv,
    "-ub", String(ub),
    "-b", String(b),
    "-t", threads,
    "-tb", threads,
    "--prio", String(prio),
    "--poll", "50",
    "--mlock",
    "--port", "8000",
    "--host", "0.0.0.0",
    ...moeFlags,
    ...(nommap ? ["--no-mmap"] : []),
  ];
}
|
||||
|
||||
/**
 * Launch llama-server with the command produced by `buildCmd` and log the
 * tail of that command line.
 *
 * The child's output is discarded rather than piped: nothing in this script
 * reads it, and an unread pipe can fill up and stall the child process.
 *
 * @param {string} modelPath - Model file passed to `buildCmd`.
 * @param {object} params - Tuning parameters passed to `buildCmd`.
 * @returns {import('node:child_process').ChildProcess} The spawned process
 *   (its `stdout`/`stderr` are null because output is ignored).
 */
function startServer(modelPath, params) {
  const [exe, ...args] = buildCmd(modelPath, params);
  // Only the last 12 arguments are shown to keep the log line short.
  log(` CMD: ${exe} ${args.slice(-12).join(" ")} ...`);
  return spawn(exe, args, {
    cwd: process.cwd(),
    stdio: "ignore",
  });
}
|
||||
|
||||
/**
 * Poll `${BASE_URL}/health` until it reports `status: "ok"` or the timeout
 * elapses.
 *
 * @param {number} [timeoutMs=SERVER_TIMEOUT] - Maximum time to keep polling.
 * @returns {Promise<{ok: boolean, bootTime: number}>} `bootTime` is in
 *   seconds: the time until the first healthy reply, or `timeoutMs / 1000`
 *   when the server never became healthy.
 */
async function waitForServer(timeoutMs = SERVER_TIMEOUT) {
  const startedAt = Date.now();

  while (Date.now() - startedAt < timeoutMs) {
    try {
      const resp = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const body = await resp.json();
      if (body.status === "ok") {
        return { ok: true, bootTime: (Date.now() - startedAt) / 1000 };
      }
    } catch {
      // Connection refused, timeout, or non-JSON reply: not ready yet, retry.
    }
    await sleep(3000);
  }

  return { ok: false, bootTime: timeoutMs / 1000 };
}
|
||||
|
||||
/**
 * Send one chat-completion request and measure generation throughput.
 *
 * @param {number} [maxTokens=BENCHMARK_TOKENS] - `max_tokens` for the request.
 * @returns {Promise<{tps: number, completionTokens: number, promptTokens: number, elapsed: number}>}
 *   `elapsed` is the wall-clock time of the whole request in seconds and
 *   `tps` is `completionTokens / elapsed`.
 * @throws {Error} On network failure, timeout (600 s), or a non-2xx HTTP
 *   status, so a failed request is never recorded as a 0 t/s sample.
 */
async function runBenchmark(maxTokens = BENCHMARK_TOKENS) {
  const body = JSON.stringify({
    model: "local-model",
    messages: [{ role: "user", content: "Count from 1 to 50, writing each number on a new line." }],
    max_tokens: maxTokens,
    temperature: 0.0,
  });

  const startedAt = Date.now();
  const resp = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body,
    signal: AbortSignal.timeout(600_000),
  });
  if (!resp.ok) {
    throw new Error(`HTTP ${resp.status} from /v1/chat/completions`);
  }
  const result = await resp.json();
  const elapsed = (Date.now() - startedAt) / 1000;

  const usage = result.usage || {};
  const completionTokens = usage.completion_tokens || 0;
  return {
    tps: elapsed > 0 ? completionTokens / elapsed : 0,
    completionTokens,
    promptTokens: usage.prompt_tokens || 0,
    elapsed,
  };
}
|
||||
|
||||
/**
 * Benchmark one server configuration end to end: kill any running server,
 * start a new one, wait for it to become healthy, run a warm-up request and
 * `BENCHMARK_RUNS` measured requests, then kill the server.
 *
 * @param {{name: string, quant: string, path: string}} model - Model entry.
 * @param {string} label - Human-readable name of this configuration.
 * @param {object} params - Parameters forwarded to `startServer`.
 * @returns {Promise<object|null>} The result record (also pushed to
 *   `ALL_RESULTS`), or null when the server did not start or every measured
 *   run failed.
 */
async function testConfig(model, label, params) {
  await killServer();
  log(` [${label}] Starting server...`);

  const proc = startServer(model.path, params);
  const speeds = [];
  let bootTime;
  let vram;

  // try/finally guarantees the server is killed even if something unexpected throws.
  try {
    const boot = await waitForServer();
    if (!boot.ok) {
      log(` [${label}] FAILED to start (timeout ${SERVER_TIMEOUT / 1000}s)`);
      return null;
    }
    bootTime = boot.bootTime;

    vram = getVramAll();
    const vramStr = vram.map((g) => `GPU${g.gpu}:${g.used}/${g.total}MiB`).join(" | ");
    log(` [${label}] Boot: ${bootTime.toFixed(0)}s | VRAM: ${vramStr}`);

    // Warm-up request; its result and any failure are deliberately ignored.
    try {
      await runBenchmark(20);
    } catch {
      // A failed warm-up does not invalidate the measured runs below.
    }

    for (let run = 1; run <= BENCHMARK_RUNS; run++) {
      try {
        const { tps } = await runBenchmark();
        speeds.push(tps);
        log(` Run ${run}: ${tps.toFixed(2)} t/s`);
      } catch (e) {
        log(` Run ${run}: ERROR (${e.message})`);
      }
    }
  } finally {
    proc.kill("SIGKILL");
  }

  if (speeds.length === 0) {
    log(` [${label}] ALL BENCHMARK RUNS FAILED`);
    return null;
  }

  const avg = speeds.reduce((sum, s) => sum + s, 0) / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] => AVG: ${avg.toFixed(2)} t/s | BEST: ${best.toFixed(2)} t/s`);

  const result = {
    model: model.name,
    quant: model.quant,
    label,
    avg_tps: +avg.toFixed(2),
    best_tps: +best.toFixed(2),
    boot_time: +bootTime.toFixed(1),
    vram,
    params,
  };
  ALL_RESULTS.push(result);
  return result;
}
|
||||
|
||||
// ─── Phase Runners ─────────────────────────────────────────────
|
||||
|
||||
/**
 * Phase 0: find any configuration with which the model boots and benchmarks.
 *
 * Tries, in order: all layers on GPU (`ngl: 999`), the same with `cpuMoe`,
 * and finally `ngl` set to half of `model.totalLayers`.
 *
 * @param {{name: string, totalLayers: number}} model - Model entry.
 * @returns {Promise<object|null>} The first successful `testConfig` result,
 *   or null when all three attempts fail.
 */
async function phase0_bootTest(model) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 0: Boot Test — ${model.name}`);
  log(rule);

  const common = { t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };

  // Attempt 1: everything on the GPUs.
  const fullGpu = await testConfig(model, "boot-ngl999", { ngl: 999, ...common });
  if (fullGpu) return fullGpu;

  // Attempt 2: same, with --cpu-moe.
  log(" Full GPU failed, trying with --cpu-moe...");
  const withCpuMoe = await testConfig(model, "boot-cpumoe", {
    ngl: 999, ...common, cpuMoe: true,
  });
  if (withCpuMoe) return withCpuMoe;

  // Attempt 3: only half of the layers offloaded.
  log(" --cpu-moe also failed, trying reduced layers...");
  return testConfig(model, "boot-ngl-half", {
    ngl: Math.floor(model.totalLayers / 2), ...common,
  });
}
|
||||
|
||||
/**
 * Phase 1: compare MoE offload strategies with all layers on GPU.
 *
 * Tests `cpuMoe` on and off, then sweeps `nCpuMoe` over 0/5/10/15/20
 * (values above `model.totalLayers` are skipped), and picks the
 * configuration with the highest `avg_tps`.
 *
 * @param {{name: string, totalLayers: number}} model - Model entry.
 * @param {object|null} baseline - Phase 0 result; included as a candidate and
 *   used to avoid re-running the configuration it already measured.
 * @returns {Promise<object|null>} Best result, or null if nothing worked.
 */
async function phase1_gpuOffload(model, baseline) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 1: GPU Offload Strategy — ${model.name}`);
  log(rule);

  const results = baseline ? [baseline] : [];
  const common = { ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };

  // A baseline without an explicit cpuMoe key ran with cpuMoe off, so a
  // missing value is treated as false when checking for a duplicate.
  const baselineIsFullGpu = baseline?.params?.ngl === 999;
  const baselineCpuMoe = baseline?.params?.cpuMoe ?? false;

  for (const cpuMoe of [true, false]) {
    if (baselineIsFullGpu && baselineCpuMoe === cpuMoe) continue;
    const r = await testConfig(model, `ngl=999 cpuMoe=${cpuMoe}`, { ...common, cpuMoe });
    if (r) results.push(r);
  }

  // nCpuMoe sweep. Note that n = 0 adds no extra flag to the command line.
  for (const n of [0, 5, 10, 15, 20]) {
    if (n > model.totalLayers) continue;
    const r = await testConfig(model, `n-cpu-moe=${n}`, { ...common, nCpuMoe: n });
    if (r) results.push(r);
  }

  if (results.length === 0) {
    log(" PHASE 1: No config worked!");
    return null;
  }

  const best = results.reduce((a, b) => (a.avg_tps > b.avg_tps ? a : b));
  log(`\n ★ Phase 1 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
|
||||
|
||||
/**
 * Phase 2: sweep the thread count (2..12 in steps of 2) while keeping every
 * other parameter of the previous winner.
 *
 * @param {{name: string}} model - Model entry.
 * @param {object} prev - Previous phase's winning result; its own thread
 *   count is not re-tested.
 * @returns {Promise<object>} Result with the highest `avg_tps` among `prev`
 *   and the successful sweep runs.
 */
async function phase2_threads(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 2: CPU Thread Sweep — ${model.name}`);
  log(rule);

  const baseParams = prev.params;
  const candidates = [prev];

  for (const t of [2, 4, 6, 8, 10, 12]) {
    if (t === baseParams.t) continue;
    const r = await testConfig(model, `t=${t}`, { ...baseParams, t });
    if (r) candidates.push(r);
  }

  const best = candidates.reduce((a, b) => (a.avg_tps > b.avg_tps ? a : b));
  log(`\n ★ Phase 2 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
|
||||
|
||||
/**
 * Phase 3: sweep (ub, b) batch-size pairs while keeping every other
 * parameter of the previous winner.
 *
 * @param {{name: string}} model - Model entry.
 * @param {object} prev - Previous phase's winning result; its own (ub, b)
 *   pair is not re-tested.
 * @returns {Promise<object>} Result with the highest `avg_tps` among `prev`
 *   and the successful sweep runs.
 */
async function phase3_batch(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 3: Batch Size Sweep — ${model.name}`);
  log(rule);

  const baseParams = prev.params;
  const candidates = [prev];
  const batchPairs = [
    [128, 512], [256, 1024], [256, 2048],
    [512, 1024], [512, 2048], [512, 4096],
    [1024, 2048], [1024, 4096],
  ];

  for (const [ub, b] of batchPairs) {
    if (ub === baseParams.ub && b === baseParams.b) continue;
    const r = await testConfig(model, `ub=${ub} b=${b}`, { ...baseParams, ub, b });
    if (r) candidates.push(r);
  }

  const best = candidates.reduce((x, y) => (x.avg_tps > y.avg_tps ? x : y));
  log(`\n ★ Phase 3 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
|
||||
|
||||
/**
 * Phase 4: sweep KV-cache type pairs (ctk, ctv) while keeping every other
 * parameter of the previous winner.
 *
 * @param {{name: string}} model - Model entry.
 * @param {object} prev - Previous phase's winning result; its own (ctk, ctv)
 *   pair is not re-tested.
 * @returns {Promise<object>} Result with the highest `avg_tps` among `prev`
 *   and the successful sweep runs.
 */
async function phase4_kvcache(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 4: KV Cache Type Sweep — ${model.name}`);
  log(rule);

  const baseParams = prev.params;
  const candidates = [prev];
  const cachePairs = [
    ["q4_0", "q4_0"], ["q8_0", "q8_0"],
    ["q4_0", "q8_0"], ["f16", "f16"],
  ];

  for (const [ctk, ctv] of cachePairs) {
    if (ctk === baseParams.ctk && ctv === baseParams.ctv) continue;
    const r = await testConfig(model, `kv=${ctk}/${ctv}`, { ...baseParams, ctk, ctv });
    if (r) candidates.push(r);
  }

  const best = candidates.reduce((x, y) => (x.avg_tps > y.avg_tps ? x : y));
  log(`\n ★ Phase 4 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
|
||||
|
||||
/**
 * Phase 5: re-run the winning configuration with a warm-up and five measured
 * requests to confirm its throughput.
 *
 * @param {{name: string, quant: string, path: string}} model - Model entry.
 * @param {object} prev - Winning result from the earlier phases.
 * @returns {Promise<object>} A new `FINAL-<model name>` record (also pushed
 *   to `ALL_RESULTS`), or `prev` unchanged when the server fails to start or
 *   every measured run fails.
 */
async function phase5_final(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 5: Final Verification (5 runs) — ${model.name}`);
  log(rule);

  await killServer();
  const proc = startServer(model.path, prev.params);
  const speeds = [];
  let bootTime;
  let vram;

  // try/finally guarantees the server is killed even if something unexpected throws.
  try {
    const boot = await waitForServer();
    if (!boot.ok) {
      log(" FAILED to start!");
      return prev;
    }
    bootTime = boot.bootTime;
    vram = getVramAll();

    // Warm-up request; its result and any failure are deliberately ignored.
    try {
      await runBenchmark(20);
    } catch {
      // A failed warm-up does not invalidate the measured runs below.
    }

    for (let run = 1; run <= 5; run++) {
      try {
        const { tps } = await runBenchmark();
        speeds.push(tps);
        log(` Final Run ${run}: ${tps.toFixed(2)} t/s`);
      } catch (e) {
        log(` Final Run ${run}: ERROR (${e.message})`);
      }
    }
  } finally {
    proc.kill("SIGKILL");
  }

  if (speeds.length === 0) {
    return prev;
  }

  const avg = speeds.reduce((sum, s) => sum + s, 0) / speeds.length;
  const best = Math.max(...speeds);
  log(`\n ★ FINAL: AVG ${avg.toFixed(2)} t/s | BEST ${best.toFixed(2)} t/s`);

  const finalResult = {
    model: model.name,
    quant: model.quant,
    label: `FINAL-${model.name}`,
    avg_tps: +avg.toFixed(2),
    best_tps: +best.toFixed(2),
    boot_time: +bootTime.toFixed(1),
    vram,
    params: prev.params,
  };
  ALL_RESULTS.push(finalResult);
  return finalResult;
}
|
||||
|
||||
// ─── Main ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Run phases 0–5 for a single model.
 *
 * @param {{name: string, path: string, quant: string, totalLayers: number}} model
 * @returns {Promise<object|null>} The final winning result; the phase 0
 *   baseline when phase 1 yields nothing; or null when the model file is
 *   missing or cannot boot.
 */
async function runModelBenchmark(model) {
  const banner = "#".repeat(70);
  log(`\n${banner}`);
  log(` MODEL: ${model.name}`);
  log(` File: ${model.path}`);
  try {
    const sizeGb = statSync(model.path).size / 1024 ** 3;
    log(` Size: ${sizeGb.toFixed(2)} GB`);
  } catch {
    // Size is informational only; a missing file is reported just below.
    log(` Size: unknown`);
  }
  log(banner);

  if (!existsSync(model.path)) {
    log(` SKIP: Model file not found!`);
    return null;
  }

  const baseline = await phase0_bootTest(model);
  if (!baseline) {
    log(` SKIP: Cannot boot at 256K!`);
    return null;
  }

  const offloadBest = await phase1_gpuOffload(model, baseline);
  if (!offloadBest) return baseline;

  // Each later phase starts from the previous phase's winner.
  const threadBest = await phase2_threads(model, offloadBest);
  const batchBest = await phase3_batch(model, threadBest);
  const cacheBest = await phase4_kvcache(model, batchBest);
  return phase5_final(model, cacheBest);
}
|
||||
|
||||
/**
 * Benchmark every entry of `MODELS`, rank the per-model winners by average
 * tokens/second, and write the results.
 *
 * Writes `scripts/dual_gpu_results.json` after each model and again at the
 * end, and `scripts/dual_gpu_summary.txt` (ranking plus a recommended
 * llama-server command for the champion) when at least one model produced a
 * result.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const startTime = Date.now();
  const wideRule = "=".repeat(70);
  const rule = "=".repeat(60);

  log(wideRule);
  log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK");
  log(" 2x RTX 3060 (24GB Total) | 256K Context");
  log(` Models: ${MODELS.length}`);
  log(` Started: ${new Date().toISOString()}`);
  log(wideRule);

  for (const g of getVramAll()) {
    log(` GPU ${g.gpu}: ${g.used}/${g.total} MiB used`);
  }

  const winners = [];

  for (let i = 0; i < MODELS.length; i++) {
    log(`\n${wideRule}`);
    log(` STARTING MODEL ${i + 1}/${MODELS.length}: ${MODELS[i].name}`);
    log(wideRule);

    const winner = await runModelBenchmark(MODELS[i]);
    if (winner) winners.push(winner);

    // Save after every model so a later crash does not lose earlier results.
    writeFileSync("scripts/dual_gpu_results.json",
      JSON.stringify(ALL_RESULTS, null, 2));
    log(` Intermediate saved (${ALL_RESULTS.length} configs tested)`);
  }

  // ─── Grand Final ───────────────────────────────────────────
  const elapsed = (Date.now() - startTime) / 60000;

  log(`\n${wideRule}`);
  log(` GRAND FINAL COMPARISON`);
  log(` Total time: ${elapsed.toFixed(1)} minutes`);
  log(` Configs tested: ${ALL_RESULTS.length}`);
  log(wideRule);

  if (winners.length === 0) {
    log(" No models ran at 256K!");
    return;
  }

  winners.sort((a, b) => b.avg_tps - a.avg_tps);
  const medals = ["🥇", "🥈", "🥉", " "];

  const lines = [
    `Dual-GPU Benchmark Results — ${new Date().toISOString()}`,
    `Hardware: 2x RTX 3060 12GB | Context: 256K`,
    `Configs tested: ${ALL_RESULTS.length} | Time: ${elapsed.toFixed(1)} min`,
    "", rule, " RANKING (by AVG t/s)", rule,
  ];

  winners.forEach((w, i) => {
    const p = w.params;
    lines.push("");
    lines.push(` ${medals[i] || " "} #${i + 1}: ${w.model}`);
    lines.push(` AVG: ${w.avg_tps.toFixed(2)} t/s | BEST: ${w.best_tps.toFixed(2)} t/s`);
    lines.push(` Boot: ${w.boot_time.toFixed(0)}s`);
    lines.push(` ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b}`);
    lines.push(` ctk=${p.ctk} ctv=${p.ctv}`);
    if (p.cpuMoe) lines.push(` --cpu-moe`);
    else if ((p.nCpuMoe || 0) > 0) lines.push(` --n-cpu-moe ${p.nCpuMoe}`);
  });

  const champ = winners[0];
  const cp = champ.params;
  lines.push("", rule);
  lines.push(` ★ CHAMPION: ${champ.model}`);
  lines.push(` ${champ.avg_tps.toFixed(2)} t/s average`);
  lines.push(rule);

  // Build the recommended command from the champion's parameters.
  const cmdParts = [
    `llama-server --model ${MODELS.find((m) => m.name === champ.model).path}`,
    `-ngl ${cp.ngl} -c ${CONTEXT}`,
    `-t ${cp.t} -tb ${cp.t}`,
    `-ub ${cp.ub} -b ${cp.b}`,
    `-fa on`,
    `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
    // `??` rather than `||`: only a missing prio falls back to 3, matching the
    // default used when the server was actually launched.
    `--prio ${cp.prio ?? 3} --poll 50`,
    `--mlock`,
  ];
  if (cp.cpuMoe) cmdParts.push("--cpu-moe");
  else if ((cp.nCpuMoe || 0) > 0) cmdParts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
  if (cp.nommap) cmdParts.push("--no-mmap");
  cmdParts.push("--port 8000 --host 0.0.0.0");

  lines.push("", " Recommended command:");
  lines.push(` ${cmdParts.join(" ")}`);

  const summary = lines.join("\n");
  console.log(summary);
  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json",
    JSON.stringify(ALL_RESULTS, null, 2));

  log(`\n Results: scripts/dual_gpu_results.json`);
  log(` Summary: scripts/dual_gpu_summary.txt`);
  log(` DONE!`);

  await killServer();
}
|
||||
|
||||
// Script entry point: run the benchmark and exit with status 1 on any unhandled failure.
main().catch((err) => {
  console.error("Fatal error:", err);
  process.exit(1);
});
|
||||
644
scripts/dual_gpu_benchmark.py
Normal file
644
scripts/dual_gpu_benchmark.py
Normal file
@@ -0,0 +1,644 @@
|
||||
"""
|
||||
Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
|
||||
==========================================================
|
||||
Tests 4 models across multiple parameter configurations to find
|
||||
the absolute best model + settings for 256K context coding agent.
|
||||
|
||||
Models:
|
||||
1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB)
|
||||
2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB)
|
||||
3. Gemma4 26B-A4B Q4_K_M (~15.6 GB)
|
||||
4. Gemma4 26B-A4B MXFP4_MOE (~15.5 GB)
|
||||
|
||||
Test Phases (per model):
|
||||
Phase 0: Basic dual-GPU startup test (can it even boot at 256K?)
|
||||
Phase 1: GPU layer + MoE offload strategy sweep
|
||||
Phase 2: CPU thread sweep (carry best from P1)
|
||||
Phase 3: Batch size sweep (carry best from P1+P2)
|
||||
Phase 4: KV cache type sweep (carry best from P1+P2+P3)
|
||||
Phase 5: Final verification (5 runs)
|
||||
|
||||
Output: scripts/dual_gpu_results.json (all raw data)
|
||||
scripts/dual_gpu_summary.txt (human-readable winner)
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
import datetime
|
||||
|
||||
# Switch stdout to UTF-8 where the stream supports it; log lines in this
# script contain non-ASCII characters such as "★" and "—". Failure is ignored
# so the script still runs on streams without reconfigure().
try:
    sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    pass
|
||||
|
||||
# ─── Configuration ───────────────────────────────────────────────
|
||||
BASE_URL = "http://127.0.0.1:8000"
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
CONTEXT = 262144  # 256K
BENCHMARK_RUNS = 3
BENCHMARK_TOKENS = 200
SERVER_TIMEOUT = 300  # seconds to wait for server startup

# Models to benchmark, in the order they are run.
MODELS = [
    {
        "name": "Qwen3.5-35B-A3B Q4_K_M",
        "path": r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf",
        "type": "qwen",
        "quant": "Q4_K_M",
        "is_mxfp4": False,
        "total_layers": 64,  # Qwen3.5 35B has 64 layers
    },
    {
        "name": "Qwen3.5-35B-A3B MXFP4_MOE",
        "path": r"models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf",
        "type": "qwen",
        "quant": "MXFP4_MOE",
        "is_mxfp4": True,
        "total_layers": 64,
    },
    {
        "name": "Gemma4 26B-A4B Q4_K_M",
        "path": r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf",
        "type": "gemma4",
        "quant": "Q4_K_M",
        "is_mxfp4": False,
        "total_layers": 30,  # Gemma4 26B has 30 layers
    },
    {
        "name": "Gemma4 26B-A4B MXFP4_MOE",
        "path": r"models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf",
        "type": "gemma4",
        "quant": "MXFP4_MOE",
        "is_mxfp4": True,
        "total_layers": 30,
    },
]

# Every result record produced by test_config and phase5_final is appended here.
ALL_RESULTS = []
|
||||
|
||||
|
||||
# ─── Utility Functions ──────────────────────────────────────────
|
||||
def log(msg):
    """Print ``msg`` prefixed with the current local time as ``[HH:MM:SS]``.

    Output is flushed immediately so progress is visible during long runs.
    """
    stamp = datetime.datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}", flush=True)
|
||||
|
||||
|
||||
def kill_server():
    """Force-kill every ``llama-server.exe`` process, then pause five seconds.

    The taskkill exit status is not checked, so a non-zero exit (for example
    when no server is running) is not an error.
    """
    subprocess.run(
        ["taskkill", "/F", "/IM", "llama-server.exe"],
        capture_output=True,
    )
    time.sleep(5)
|
||||
|
||||
|
||||
def get_vram_all():
    """Query ``nvidia-smi`` for per-GPU memory usage.

    Returns:
        A list with one dict per GPU, ``{"gpu": index, "used": int,
        "total": int}``, with values exactly as nvidia-smi reports them under
        ``nounits``. Returns an empty list when nvidia-smi cannot be run,
        times out, or prints a value that is not an integer.
    """
    try:
        proc = subprocess.run(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5,
        )
        gpus = []
        for line in proc.stdout.strip().split("\n"):
            fields = [field.strip() for field in line.split(",")]
            # Skip blank or truncated lines.
            if len(fields) < 3:
                continue
            gpus.append({
                "gpu": int(fields[0]),
                "used": int(fields[1]),
                "total": int(fields[2]),
            })
        return gpus
    except (OSError, subprocess.SubprocessError, ValueError):
        return []
|
||||
|
||||
|
||||
def build_cmd(model_path, ngl, t, ub, b, ctk, ctv,
              cpu_moe=False, n_cpu_moe=0, prio=3, nommap=False):
    """Build the llama-server command line as an argv list.

    Args:
        model_path: Value passed to ``--model``.
        ngl: Value for ``-ngl``.
        t: Value used for both ``-t`` and ``-tb``.
        ub: Value for ``-ub``.
        b: Value for ``-b``.
        ctk: Value for ``--cache-type-k``.
        ctv: Value for ``--cache-type-v``.
        cpu_moe: Append ``--cpu-moe`` when true.
        n_cpu_moe: Append ``--n-cpu-moe N`` when positive and ``cpu_moe`` is
            false (``cpu_moe`` takes precedence).
        prio: Value for ``--prio``.
        nommap: Append ``--no-mmap`` when true.

    Returns:
        List of strings: the executable path followed by its arguments.
    """
    cmd = [
        LLAMA_SERVER,
        "--model", model_path,
        "-ngl", str(ngl),
        "-c", str(CONTEXT),
        "-np", "1",
        "-fa", "on",
        "--cache-type-k", ctk,
        "--cache-type-v", ctv,
        "-ub", str(ub),
        "-b", str(b),
        "-t", str(t),
        "-tb", str(t),
        "--prio", str(prio),
        "--poll", "50",
        "--mlock",
        "--port", "8000",
        "--host", "0.0.0.0",
    ]
    # MoE offloading options
    if cpu_moe:
        cmd.append("--cpu-moe")
    elif n_cpu_moe > 0:
        cmd += ["--n-cpu-moe", str(n_cpu_moe)]
    if nommap:
        cmd.append("--no-mmap")
    return cmd
|
||||
|
||||
|
||||
def start_server(model_path, **kwargs):
    """Launch llama-server with the command from ``build_cmd`` and log it.

    The server's stdout/stderr are discarded: nothing in this script reads
    them, and an unread pipe can fill up and block the child process.

    Args:
        model_path: Model file passed to ``build_cmd``.
        **kwargs: Tuning parameters forwarded to ``build_cmd``.

    Returns:
        The ``subprocess.Popen`` handle of the started server.
    """
    cmd = build_cmd(model_path, **kwargs)
    log(f" CMD: {' '.join(cmd[-20:])}")  # show last 20 args
    return subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        cwd=os.getcwd(),
    )
|
||||
|
||||
|
||||
def wait_for_server(timeout=SERVER_TIMEOUT):
    """Poll ``{BASE_URL}/health`` until it reports ``status == "ok"``.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        ``(True, boot_seconds)`` once the server is healthy, otherwise
        ``(False, timeout)`` after the timeout has elapsed.
    """
    started = time.time()
    while time.time() - started < timeout:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=3) as resp:
                payload = json.loads(resp.read())
            if payload.get("status") == "ok":
                return True, time.time() - started
        except Exception:
            # Connection refused, HTTP error or bad JSON: not ready yet, retry.
            pass
        time.sleep(3)
    return False, timeout
|
||||
|
||||
|
||||
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Send one chat-completion request and measure generation throughput.

    Args:
        max_tokens: ``max_tokens`` value for the request.

    Returns:
        Dict with ``tps`` (completion tokens divided by the wall-clock time
        of the whole request), ``completion_tokens``, ``prompt_tokens`` and
        ``elapsed`` (seconds).

    Raises:
        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise on
        failure; callers in this file catch ``Exception``.
    """
    body = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user",
                      "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode("utf-8")

    # Supplying ``data`` makes urllib issue a POST.
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=body,
        headers={"Content-Type": "application/json"},
    )

    started = time.time()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.time() - started

    usage = result.get("usage", {})
    completion_tokens = usage.get("completion_tokens", 0)
    prompt_tokens = usage.get("prompt_tokens", 0)
    return {
        "tps": completion_tokens / elapsed if elapsed > 0 else 0,
        "completion_tokens": completion_tokens,
        "prompt_tokens": prompt_tokens,
        "elapsed": elapsed,
    }
|
||||
|
||||
|
||||
def test_config(model_info, label, **kwargs):
    """Benchmark a single server configuration.

    Kills any running server, starts a new one with ``kwargs``, waits for it
    to become healthy, runs one warm-up request plus ``BENCHMARK_RUNS``
    measured requests, and kills the server again.

    Args:
        model_info: Model entry providing ``path``, ``name`` and ``quant``.
        label: Human-readable name of this configuration.
        **kwargs: Parameters forwarded to ``start_server``.

    Returns:
        The result dict (also appended to ``ALL_RESULTS``), or ``None`` when
        the server did not start or every measured run failed.
    """
    kill_server()
    log(f" [{label}] Starting server...")

    proc = start_server(model_info["path"], **kwargs)
    speeds = []
    # try/finally guarantees the server is killed even on Ctrl-C or an
    # unexpected error.
    try:
        ok, boot_time = wait_for_server()
        if not ok:
            log(f" [{label}] FAILED to start (timeout {SERVER_TIMEOUT}s)")
            return None

        vram = get_vram_all()
        vram_str = " | ".join(
            f"GPU{g['gpu']}:{g['used']}/{g['total']}MiB" for g in vram
        )
        log(f" [{label}] Boot: {boot_time:.0f}s | VRAM: {vram_str}")

        # Warm-up request; its result and any failure are deliberately ignored.
        try:
            run_benchmark(max_tokens=20)
        except Exception:
            pass

        # Measured runs
        for run in range(1, BENCHMARK_RUNS + 1):
            try:
                tps = run_benchmark()["tps"]
                speeds.append(tps)
                log(f" Run {run}: {tps:.2f} t/s")
            except Exception as e:
                log(f" Run {run}: ERROR ({e})")
    finally:
        proc.kill()

    if not speeds:
        log(f" [{label}] ALL BENCHMARK RUNS FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    log(f" [{label}] => AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

    result = {
        "model": model_info["name"],
        "quant": model_info["quant"],
        "label": label,
        "avg_tps": round(avg, 2),
        "best_tps": round(best, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": kwargs,
    }
    ALL_RESULTS.append(result)
    return result
|
||||
|
||||
|
||||
# ─── Phase Runners ───────────────────────────────────────────────
|
||||
|
||||
def phase0_boot_test(model):
    """Phase 0: find any configuration with which the model boots.

    Tries, in order: all layers on GPU (``ngl=999``), the same with
    ``cpu_moe=True``, and finally ``ngl`` set to half of
    ``model["total_layers"]``.

    Returns:
        The first successful ``test_config`` result, or ``None`` when all
        three attempts fail.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 0: Boot Test — {model['name']}")
    log(rule)

    common = dict(t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0")

    # Try -ngl 999 (all layers to GPU) as baseline
    result = test_config(model, "boot-ngl999", ngl=999, **common)
    if result:
        return result

    # If full GPU fails, try with cpu-moe
    log(" Full GPU failed, trying with --cpu-moe...")
    result = test_config(model, "boot-cpumoe", ngl=999, **common, cpu_moe=True)
    if result:
        return result

    # Extreme fallback: fewer layers
    log(" --cpu-moe also failed, trying reduced layers...")
    return test_config(
        model, "boot-ngl-half", ngl=model["total_layers"] // 2, **common
    )
|
||||
|
||||
|
||||
def phase1_gpu_offload(model, baseline):
    """Phase 1: compare MoE offload strategies with all layers on GPU.

    Tests ``cpu_moe`` on and off, then sweeps ``n_cpu_moe`` over
    0/5/10/15/20 (values above ``model["total_layers"]`` are skipped).

    Args:
        model: Model entry providing ``name`` and ``total_layers``.
        baseline: Phase 0 result; included as a candidate and used to avoid
            re-running the configuration it already measured.

    Returns:
        The result with the highest ``avg_tps``, or ``None`` if nothing
        worked.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 1: GPU Offload Strategy — {model['name']}")
    log(rule)

    results = [baseline] if baseline else []
    total = model["total_layers"]
    common = dict(ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0")

    # Strategy A: All GPU + cpu-moe variations
    for cpu_moe in (True, False):
        # Skip if already tested in baseline
        already_tested = (
            baseline
            and baseline["label"] in ("boot-ngl999", "boot-cpumoe")
            and baseline["params"].get("cpu_moe", False) == cpu_moe
        )
        if already_tested:
            continue
        r = test_config(
            model, f"ngl=999 cpu_moe={cpu_moe}", **common, cpu_moe=cpu_moe
        )
        if r:
            results.append(r)

    # Strategy B: n-cpu-moe sweep (selective expert offload)
    for n in (0, 5, 10, 15, 20):
        if n > total:
            continue
        r = test_config(model, f"n-cpu-moe={n}", **common, n_cpu_moe=n)
        if r:
            results.append(r)

    if not results:
        log(" PHASE 1: No configuration worked!")
        return None

    best = max(results, key=lambda res: res["avg_tps"])
    log(f"\n ★ Phase 1 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
|
||||
|
||||
|
||||
def phase2_threads(model, prev_best):
    """Phase 2: sweep the thread count with the best offload config locked.

    Thread counts 2..12 (step 2) are tested, except the one ``prev_best``
    already used.

    Returns:
        The result with the highest ``avg_tps`` among ``prev_best`` and the
        successful sweep runs.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 2: CPU Thread Sweep — {model['name']}")
    log(rule)

    p = prev_best["params"]
    candidates = [prev_best]

    for t in (2, 4, 6, 8, 10, 12):
        if t == p.get("t", 6):
            continue
        r = test_config(
            model, f"t={t}",
            ngl=p["ngl"], t=t, ub=p["ub"], b=p["b"],
            ctk=p["ctk"], ctv=p["ctv"],
            cpu_moe=p.get("cpu_moe", False),
            n_cpu_moe=p.get("n_cpu_moe", 0),
        )
        if r:
            candidates.append(r)

    best = max(candidates, key=lambda res: res["avg_tps"])
    log(f"\n ★ Phase 2 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
|
||||
|
||||
|
||||
def phase3_batch(model, prev_best):
    """Phase 3: sweep (ub, b) batch-size pairs with everything else locked.

    The pair already used by ``prev_best`` is not re-tested.

    Returns:
        The result with the highest ``avg_tps`` among ``prev_best`` and the
        successful sweep runs.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 3: Batch Size Sweep — {model['name']}")
    log(rule)

    p = prev_best["params"]
    best_t = p["t"]
    candidates = [prev_best]
    batch_pairs = [
        (128, 512), (256, 1024), (256, 2048),
        (512, 1024), (512, 2048), (512, 4096),
        (1024, 2048), (1024, 4096),
    ]

    for ub, b in batch_pairs:
        if ub == p["ub"] and b == p["b"]:
            continue
        r = test_config(
            model, f"ub={ub} b={b}",
            ngl=p["ngl"], t=best_t, ub=ub, b=b,
            ctk=p["ctk"], ctv=p["ctv"],
            cpu_moe=p.get("cpu_moe", False),
            n_cpu_moe=p.get("n_cpu_moe", 0),
        )
        if r:
            candidates.append(r)

    best = max(candidates, key=lambda res: res["avg_tps"])
    log(f"\n ★ Phase 3 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
|
||||
|
||||
|
||||
def phase4_kvcache(model, prev_best):
    """Phase 4: sweep KV-cache type pairs with everything else locked.

    The (ctk, ctv) pair already used by ``prev_best`` is not re-tested.

    Returns:
        The result with the highest ``avg_tps`` among ``prev_best`` and the
        successful sweep runs.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 4: KV Cache Type Sweep — {model['name']}")
    log(rule)

    p = prev_best["params"]
    candidates = [prev_best]
    cache_pairs = [
        ("q4_0", "q4_0"), ("q8_0", "q8_0"),
        ("q4_0", "q8_0"), ("f16", "f16"),
    ]

    for ctk, ctv in cache_pairs:
        if ctk == p["ctk"] and ctv == p["ctv"]:
            continue
        r = test_config(
            model, f"kv={ctk}/{ctv}",
            ngl=p["ngl"], t=p["t"], ub=p["ub"], b=p["b"],
            ctk=ctk, ctv=ctv,
            cpu_moe=p.get("cpu_moe", False),
            n_cpu_moe=p.get("n_cpu_moe", 0),
        )
        if r:
            candidates.append(r)

    best = max(candidates, key=lambda res: res["avg_tps"])
    log(f"\n ★ Phase 4 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
|
||||
|
||||
|
||||
def phase5_final(model, prev_best):
    """Phase 5: re-run the winning configuration with five measured runs.

    Args:
        model: Model entry providing ``name``, ``quant`` and ``path``.
        prev_best: Winning result from the earlier phases.

    Returns:
        A new ``FINAL-<model name>`` result dict (also appended to
        ``ALL_RESULTS``), or ``prev_best`` unchanged when the server fails to
        start or every measured run fails.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 5: Final Verification (5 runs) — {model['name']}")
    log(rule)

    p = prev_best["params"]
    kill_server()
    proc = start_server(model["path"], **p)
    speeds = []
    # try/finally guarantees the server is killed even on Ctrl-C or an
    # unexpected error.
    try:
        ok, boot_time = wait_for_server()
        if not ok:
            log(" FAILED to start for final verification!")
            return prev_best

        vram = get_vram_all()

        # Warm-up request; its result and any failure are deliberately ignored.
        try:
            run_benchmark(max_tokens=20)
        except Exception:
            pass

        for run in range(1, 6):
            try:
                tps = run_benchmark()["tps"]
                speeds.append(tps)
                log(f" Final Run {run}: {tps:.2f} t/s")
            except Exception as e:
                log(f" Final Run {run}: ERROR ({e})")
    finally:
        proc.kill()

    if not speeds:
        return prev_best

    avg = sum(speeds) / len(speeds)
    best_tps = max(speeds)
    log(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best_tps:.2f} t/s")

    final = {
        "model": model["name"],
        "quant": model["quant"],
        "label": f"FINAL-{model['name']}",
        "avg_tps": round(avg, 2),
        "best_tps": round(best_tps, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": p,
    }
    ALL_RESULTS.append(final)
    return final
|
||||
|
||||
|
||||
# ─── Main ────────────────────────────────────────────────────────
|
||||
|
||||
def run_full_benchmark_for_model(model):
    """Run phases 0-5 for a single model.

    Returns:
        The final winning result; the phase 0 baseline when phase 1 yields
        nothing; or ``None`` when the model file is missing or cannot boot.
    """
    banner = "#" * 70
    log(f"\n{banner}")
    log(f" MODEL: {model['name']}")
    log(f" File: {model['path']}")
    # The size is informational only; a missing file must reach the explicit
    # "not found" check below instead of raising here.
    try:
        size_gb = os.path.getsize(model["path"]) / 1024**3
        log(f" Size: {size_gb:.2f} GB")
    except OSError:
        log(" Size: unknown")
    log(banner)

    # Check model exists
    if not os.path.exists(model["path"]):
        log(" SKIP: Model file not found!")
        return None

    # Phase 0: Can it boot?
    baseline = phase0_boot_test(model)
    if not baseline:
        log(f" SKIP: {model['name']} cannot boot at 256K context!")
        return None

    # Phase 1: GPU offload strategy
    best = phase1_gpu_offload(model, baseline)
    if not best:
        return baseline

    # Phases 2-4 each start from the previous phase's winner.
    best = phase2_threads(model, best)
    best = phase3_batch(model, best)
    best = phase4_kvcache(model, best)

    # Phase 5: Final verification
    return phase5_final(model, best)
|
||||
|
||||
|
||||
def main():
    """Benchmark every entry in MODELS, rank the winners and write the reports.

    Writes ``scripts/dual_gpu_results.json`` (every tested config, refreshed
    after each model) and ``scripts/dual_gpu_summary.txt`` (the ranking plus
    a recommended ``llama-server`` command line), then stops the server.
    Returns early, without writing the summary, when no model produced a
    result.
    """
    began = time.time()
    rule70 = "=" * 70

    log(rule70)
    log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK")
    log(" 2x RTX 3060 (24GB Total) | 256K Context")
    log(f" Models: {len(MODELS)}")
    log(f" Started: {datetime.datetime.now().isoformat()}")
    log(rule70)

    # Report the VRAM state of each GPU before anything is loaded.
    for gpu in get_vram_all():
        log(f" GPU {gpu['gpu']}: {gpu['used']}/{gpu['total']} MiB used")

    # Tune each model in turn, keeping the winning config of each.
    model_winners = []
    for position, model in enumerate(MODELS, start=1):
        log(f"\n{rule70}")
        log(f" STARTING MODEL {position}/{len(MODELS)}: {model['name']}")
        log(rule70)

        outcome = run_full_benchmark_for_model(model)
        if outcome:
            model_winners.append(outcome)

        # Persist everything measured so far, so an aborted run keeps its data.
        with open("scripts/dual_gpu_results.json", "w") as out:
            json.dump(ALL_RESULTS, out, indent=2, default=str)
        log(f" Intermediate results saved ({len(ALL_RESULTS)} configs tested)")

    # ─── Grand Final Comparison ──────────────────────────────────
    elapsed = (time.time() - began) / 60

    log(f"\n{rule70}")
    log(f" GRAND FINAL COMPARISON")
    log(f" Total time: {elapsed:.1f} minutes")
    log(f" Configs tested: {len(ALL_RESULTS)}")
    log(rule70)

    if not model_winners:
        log(" No models were able to run at 256K context!")
        return

    # Fastest average throughput first.
    model_winners.sort(key=lambda entry: entry["avg_tps"], reverse=True)

    rule60 = "=" * 60
    summary_lines = [
        f"Dual-GPU Benchmark Results — {datetime.datetime.now().isoformat()}",
        f"Hardware: 2x RTX 3060 12GB | Context: 256K",
        f"Total configs tested: {len(ALL_RESULTS)}",
        f"Total time: {elapsed:.1f} minutes",
        "",
        rule60,
        " RANKING (by AVG t/s)",
        rule60,
    ]

    podium = ("🥇", "🥈", "🥉")
    for rank, w in enumerate(model_winners, 1):
        medal = podium[rank - 1] if rank <= len(podium) else " "
        summary_lines.append(f"\n {medal} #{rank}: {w['model']}")
        summary_lines.append(f" AVG: {w['avg_tps']:.2f} t/s | BEST: {w['best_tps']:.2f} t/s")
        summary_lines.append(f" Boot: {w['boot_time']:.0f}s")
        cfg = w["params"]
        summary_lines.append(f" ngl={cfg['ngl']} t={cfg['t']} ub={cfg['ub']} b={cfg['b']}")
        summary_lines.append(f" ctk={cfg['ctk']} ctv={cfg['ctv']}")
        if cfg.get("cpu_moe"):
            summary_lines.append(f" --cpu-moe")
        elif cfg.get("n_cpu_moe", 0) > 0:
            summary_lines.append(f" --n-cpu-moe {cfg['n_cpu_moe']}")

    champion = model_winners[0]
    summary_lines.append(f"\n{rule60}")
    summary_lines.append(f" ★ CHAMPION: {champion['model']}")
    summary_lines.append(f" {champion['avg_tps']:.2f} t/s average")
    summary_lines.append(rule60)

    # Build the recommended command from the champion's parameters.
    p = champion["params"]
    known_names = [m["name"] for m in MODELS]
    champion_path = MODELS[known_names.index(champion["model"])]["path"]
    cmd_parts = [
        f"llama-server --model {champion_path}",
        f"-ngl {p['ngl']} -c {CONTEXT}",
        f"-t {p['t']} -tb {p['t']}",
        f"-ub {p['ub']} -b {p['b']}",
        "-fa on",
        f"--cache-type-k {p['ctk']} --cache-type-v {p['ctv']}",
        f"--prio {p.get('prio', 3)} --poll 50",
        "--mlock",
    ]
    if p.get("cpu_moe"):
        cmd_parts.append("--cpu-moe")
    elif p.get("n_cpu_moe", 0) > 0:
        cmd_parts.append(f"--n-cpu-moe {p['n_cpu_moe']}")
    if p.get("nommap"):
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")

    summary_lines.append(f"\n Recommended command:")
    summary_lines.append(f" {' '.join(cmd_parts)}")

    summary = "\n".join(summary_lines)
    print(summary)

    with open("scripts/dual_gpu_summary.txt", "w", encoding="utf-8") as out:
        out.write(summary)

    with open("scripts/dual_gpu_results.json", "w") as out:
        json.dump(ALL_RESULTS, out, indent=2, default=str)

    log(f"\n Results: scripts/dual_gpu_results.json")
    log(f" Summary: scripts/dual_gpu_summary.txt")
    log(f" DONE!")

    kill_server()


if __name__ == "__main__":
    main()
|
||||
330
scripts/dual_gpu_benchmark_v2.mjs
Normal file
330
scripts/dual_gpu_benchmark_v2.mjs
Normal file
@@ -0,0 +1,330 @@
|
||||
/**
|
||||
* Dual-GPU (2x RTX 3060 12GB, 24GB total) Smart Model Benchmark v2
|
||||
* =====================================================
|
||||
* Informed by VRAM analysis — tests models in optimal order.
|
||||
*
|
||||
* Key insights applied:
|
||||
* - Gemma4 fits entirely in 24GB GPU (KV cache ~0.18 GB with SWA)
|
||||
* - Qwen3.5 is tight (~22.5-22.9 GB needed) — try full GPU first
|
||||
* - Skip configs known to fail, minimize wasted time
|
||||
*
|
||||
* Run: node scripts/dual_gpu_benchmark_v2.mjs
|
||||
* Results: scripts/dual_gpu_results.json + scripts/dual_gpu_summary.txt
|
||||
*/
|
||||
|
||||
import { spawn, execSync } from "child_process";
|
||||
import { writeFileSync, existsSync, statSync } from "fs";
|
||||
|
||||
// Where the benchmark server listens and which binary to launch.
const BASE_URL = "http://127.0.0.1:8000";
const LLAMA = "llama_bin_run\\llama-server.exe";

// Benchmark shape: context size, measured runs per config, tokens per run,
// and how long (ms) a server may take to report healthy.
const CTX = 256 * 1024;
const RUNS = 3;
const TOKENS = 200;
const BOOT_TIMEOUT = 5 * 60 * 1000;

// Models ordered: smallest first (most likely to succeed fully on GPU)
const MODELS = [
  {
    name: "Gemma4-26B MXFP4_MOE",
    path: "models\\gemma-4-26B-A4B-it-MXFP4_MOE.gguf",
    quant: "MXFP4_MOE",
    fitsGPU: true, // 15.5 + 0.18 + 1 = 16.72 GB << 23 GB
  },
  {
    name: "Gemma4-26B Q4_K_M",
    path: "models\\gemma-4-26B-A4B-it-Q4_K_M.gguf",
    quant: "Q4_K_M",
    fitsGPU: true, // 15.6 + 0.18 + 1 = 16.82 GB << 23 GB
  },
  {
    name: "Qwen3.5-35B MXFP4_MOE",
    path: "models\\Qwen3.5-35B-A3B-MXFP4_MOE.gguf",
    quant: "MXFP4_MOE",
    fitsGPU: "maybe", // 20.1 + 1.41 + 1 = 22.51 GB — tight
  },
  {
    name: "Qwen3.5-35B Q4_K_M",
    path: "models\\Qwen3.5-35B-A3B-Q4_K_M.gguf",
    quant: "Q4_K_M",
    fitsGPU: "maybe", // 20.5 + 1.41 + 1 = 22.91 GB — very tight
  },
];

// Every recorded result, in the order it was measured.
const ALL = [];
// Handle of the server process this script last started (null when none).
let currentProc = null;
|
||||
|
||||
// ─── Utilities ─────────────────────────────────────────────────

/** Print a message prefixed with the current 24-hour wall-clock time. */
const log = (m) => {
  const stamp = new Date().toLocaleTimeString("ko-KR", { hour12: false });
  console.log(`[${stamp}] ${m}`);
};

/** Resolve after `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});
|
||||
|
||||
/**
 * Stop the benchmark server: kill the tracked child process (if any), then
 * force-kill every `llama-server.exe` via `taskkill`, and finally pause
 * five seconds before returning. Failures of either kill are ignored.
 */
async function kill() {
  const tracked = currentProc;
  if (tracked) {
    try {
      tracked.kill("SIGKILL");
    } catch {
      // Best effort: the process may already be gone.
    }
    currentProc = null;
  }

  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // taskkill fails when nothing matched; that is fine.
  }

  await sleep(5000);
}
|
||||
|
||||
/**
 * Query per-GPU memory usage through `nvidia-smi`.
 *
 * @returns {{gpu: number, used: number, total: number}[]} One entry per
 *   reported GPU, with values parsed from nvidia-smi's unit-less CSV output.
 *   Returns an empty array when the command fails or times out (5 s).
 */
function vram() {
  let csv;
  try {
    csv = execSync(
      "nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits",
      { encoding: "utf-8", timeout: 5000 },
    );
  } catch {
    return [];
  }

  return csv
    .split("\n")
    .map((line) => line.trim())
    // Blank output would otherwise turn into an all-NaN pseudo GPU row.
    .filter((line) => line !== "")
    .map((line) => {
      const [gpu, used, total] = line.split(",").map((field) => Number.parseInt(field, 10));
      return { gpu, used, total };
    })
    // Drop lines that do not start with a GPU index (e.g. stray messages).
    .filter((row) => Number.isInteger(row.gpu));
}
|
||||
|
||||
/**
 * Launch llama-server for one benchmark configuration and remember the
 * child process in `currentProc`.
 *
 * @param {string} modelPath Path to the GGUF model file.
 * @param {object} p Tuning parameters: `ngl`, `ctk`, `ctv`, `ub`, `b`, `t`,
 *   and optionally `prio`, `cpuMoe`, `nCpuMoe`, `nommap`.
 * @returns {import("child_process").ChildProcess} The spawned server process.
 */
function startServer(modelPath, p) {
  const args = [
    "--model", modelPath, "-ngl", String(p.ngl),
    "-c", String(CTX), "-np", "1", "-fa", "on",
    "--cache-type-k", p.ctk, "--cache-type-v", p.ctv,
    "-ub", String(p.ub), "-b", String(p.b),
    // The same thread count is used for generation and batch processing.
    "-t", String(p.t), "-tb", String(p.t),
    "--prio", String(p.prio || 3), "--poll", "50", "--mlock",
    "--port", "8000", "--host", "0.0.0.0",
  ];
  // Full CPU MoE offload takes precedence over a partial layer count.
  if (p.cpuMoe) args.push("--cpu-moe");
  else if ((p.nCpuMoe || 0) > 0) args.push("--n-cpu-moe", String(p.nCpuMoe));
  if (p.nommap) args.push("--no-mmap");

  const proc = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });

  // Nothing reads the server's output. Pipes have limited capacity, and a
  // child that fills one blocks on write, so discard the data as it arrives.
  proc.stdout.resume();
  proc.stderr.resume();

  currentProc = proc;
  return proc;
}
|
||||
|
||||
/**
 * Poll the server's `/health` endpoint until it reports `status: "ok"`.
 *
 * Polling stops early when the tracked server process has already exited
 * (for example because the model did not fit in memory); otherwise a
 * config that fails at load time would cost the full timeout.
 *
 * @param {number} [timeout=BOOT_TIMEOUT] Maximum time to wait, in ms.
 * @returns {Promise<{ok: boolean, boot: number}>} `ok` tells whether the
 *   server became healthy; `boot` is the elapsed time in seconds (or the
 *   full timeout, in seconds, when the wait timed out).
 */
async function waitReady(timeout = BOOT_TIMEOUT) {
  const t0 = Date.now();
  while (Date.now() - t0 < timeout) {
    // exitCode/signalCode stay null while the child is still running.
    const proc = currentProc;
    if (proc && (proc.exitCode !== null || proc.signalCode !== null)) {
      return { ok: false, boot: (Date.now() - t0) / 1000 };
    }

    try {
      const r = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const d = await r.json();
      if (d.status === "ok") return { ok: true, boot: (Date.now() - t0) / 1000 };
    } catch {
      // Not reachable yet (connection refused, timeout, non-JSON body): retry.
    }
    await sleep(3000);
  }
  return { ok: false, boot: timeout / 1000 };
}
|
||||
|
||||
/**
 * Run one chat completion against the server and measure throughput.
 *
 * The rate is completion tokens divided by the wall-clock time of the whole
 * request, so it includes prompt processing and HTTP overhead.
 *
 * @param {number} [n=TOKENS] Maximum number of tokens to generate.
 * @returns {Promise<{tps: number, ct: number, dt: number}>} Tokens per
 *   second, completion token count, and elapsed seconds.
 * @throws {Error} When the request fails, the server answers with a non-2xx
 *   status, or the response reports no completion tokens. Such a run is not
 *   a 0 t/s measurement and must not be averaged in as one.
 */
async function bench(n = TOKENS) {
  const t0 = Date.now();
  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "m",
      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
      max_tokens: n, temperature: 0,
    }),
    signal: AbortSignal.timeout(600_000),
  });
  if (!r.ok) {
    throw new Error(`HTTP ${r.status} from /v1/chat/completions`);
  }
  const d = await r.json();
  const dt = (Date.now() - t0) / 1000;
  const ct = d.usage?.completion_tokens ?? 0;
  if (!(ct > 0)) {
    throw new Error("response reported no completion tokens");
  }
  return { tps: ct / dt, ct, dt };
}
|
||||
|
||||
/**
 * Benchmark one server configuration for one model.
 *
 * Restarts the server with `params`, waits for it to become healthy, does a
 * short warm-up request, then measures `RUNS` completions. The outcome is
 * appended to `ALL` and returned.
 *
 * @param {{name: string, quant: string, path: string}} model Model descriptor.
 * @param {string} label Short tag used in log lines and stored in the result.
 * @param {object} params Server parameters, passed to `startServer`.
 * @returns {Promise<object|null>} The result record, or null when the server
 *   never became ready or every measured run failed.
 */
async function testConfig(model, label, params) {
  await kill();
  log(` [${label}] Starting...`);
  startServer(model.path, params);

  const ready = await waitReady();
  if (!ready.ok) {
    log(` [${label}] ✗ FAILED (timeout)`);
    await kill();
    return null;
  }
  const bootSeconds = ready.boot;

  // Snapshot VRAM right after load, before any generation.
  const gpus = vram();
  const gpuText = gpus.map((g) => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | ");
  log(` [${label}] Boot:${bootSeconds.toFixed(0)}s | VRAM: ${gpuText}`);

  // Warm-up request; its result and any error are deliberately ignored.
  try {
    await bench(20);
  } catch {}

  const samples = [];
  for (let run = 1; run <= RUNS; run += 1) {
    try {
      const measured = await bench();
      samples.push(measured.tps);
      log(` Run${run}: ${measured.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run${run}: ERR ${e.message}`);
    }
  }
  await kill();

  if (samples.length === 0) {
    log(` [${label}] ✗ ALL RUNS FAILED`);
    return null;
  }

  let total = 0;
  for (const tps of samples) total += tps;
  const mean = total / samples.length;
  const peak = Math.max(...samples);
  log(` [${label}] ⇒ AVG:${mean.toFixed(2)} BEST:${peak.toFixed(2)} t/s`);

  const record = {
    model: model.name,
    quant: model.quant,
    label,
    avg_tps: Number(mean.toFixed(2)),
    best_tps: Number(peak.toFixed(2)),
    boot: Number(bootSeconds.toFixed(1)),
    vram: gpus,
    params,
  };
  ALL.push(record);
  return record;
}
|
||||
|
||||
/**
 * Write every result collected so far to scripts/dual_gpu_results.json,
 * so a run that is interrupted still leaves its measurements on disk.
 */
function saveIntermediate() {
  const snapshot = JSON.stringify(ALL, null, 2);
  writeFileSync("scripts/dual_gpu_results.json", snapshot);
}
|
||||
|
||||
// ─── Smart Phase Runner ────────────────────────────────────────
|
||||
|
||||
/**
 * Tune one model: find a GPU/CPU offload setting that boots, then sweep
 * thread count, batch sizes and KV cache types one dimension at a time,
 * always carrying the fastest config forward, and finally re-measure the
 * winner with five runs.
 *
 * @param {{name: string, quant: string, path: string, fitsGPU: (boolean|string)}} model
 * @returns {Promise<object|null>} The FINAL record when verification yields
 *   at least one sample, otherwise the best sweep record; null when the file
 *   is missing or no offload setting boots.
 */
async function tuneModel(model) {
  const rule = "#".repeat(65);
  log(`\n${rule}`);
  log(` ${model.name} (${model.quant})`);
  if (!existsSync(model.path)) {
    log(" ✗ File not found, SKIP");
    return null;
  }
  const sizeGb = (statSync(model.path).size / 1024**3).toFixed(2);
  log(` Size: ${sizeGb} GB | Fits GPU: ${model.fitsGPU}`);
  log(rule);

  // Every trial is persisted as soon as it finishes.
  const trial = async (label, params) => {
    const outcome = await testConfig(model, label, params);
    saveIntermediate();
    return outcome;
  };

  // ── Step 1: Find working GPU config ──
  log(`\n ── Step 1: Find optimal GPU offload ──`);
  const startParams = { ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };
  let baseline = null;

  if (model.fitsGPU === true || model.fitsGPU === "maybe") {
    // Try full GPU, no CPU offload
    baseline = await trial("ngl=999 pure-GPU", { ...startParams });
  }

  if (!baseline) {
    // Ascending n-cpu-moe: stop at the smallest offload that boots.
    for (const n of [5, 10, 15, 20]) {
      baseline = await trial(`n-cpu-moe=${n}`, { ...startParams, nCpuMoe: n });
      if (baseline) break;
    }
  }

  if (!baseline) {
    // Last resort: full cpu-moe
    baseline = await trial("cpu-moe", { ...startParams, cpuMoe: true });
  }

  if (!baseline) {
    log(` ✗ ${model.name} cannot boot at 256K!`);
    return null;
  }

  // If pure GPU worked, also measure cpu-moe and keep whichever is faster.
  const bootParams = baseline.params;
  if (!bootParams.cpuMoe && !bootParams.nCpuMoe) {
    const alt = await trial("compare: cpu-moe", { ...bootParams, cpuMoe: true });
    if (alt && alt.avg_tps > baseline.avg_tps) {
      baseline = alt;
    }
  }

  let best = baseline;
  const keepIfFaster = (candidate) => {
    if (candidate && candidate.avg_tps > best.avg_tps) best = candidate;
  };

  // ── Step 2: Thread sweep ──
  log(`\n ── Step 2: Thread sweep ──`);
  for (const t of [2, 4, 8, 10, 12]) {
    if (t === best.params.t) continue; // already measured
    keepIfFaster(await trial(`t=${t}`, { ...best.params, t }));
  }

  // ── Step 3: Batch sweep ──
  log(`\n ── Step 3: Batch sweep ──`);
  const batchPairs = [[256, 1024], [256, 2048], [512, 2048], [512, 4096], [1024, 2048], [1024, 4096]];
  for (const [ub, b] of batchPairs) {
    if (ub === best.params.ub && b === best.params.b) continue;
    keepIfFaster(await trial(`ub=${ub} b=${b}`, { ...best.params, ub, b }));
  }

  // ── Step 4: KV cache sweep ──
  log(`\n ── Step 4: KV cache type ──`);
  const cachePairs = [["q8_0", "q8_0"], ["q4_0", "q8_0"], ["f16", "f16"]];
  for (const [ctk, ctv] of cachePairs) {
    if (ctk === best.params.ctk && ctv === best.params.ctv) continue;
    keepIfFaster(await trial(`kv=${ctk}/${ctv}`, { ...best.params, ctk, ctv }));
  }

  // ── Step 5: Final verification (5 runs) ──
  log(`\n ── Step 5: Final verification ──`);
  await kill();
  startServer(model.path, best.params);
  const ready = await waitReady();
  if (!ready.ok) {
    await kill();
    return best;
  }
  const gpus = vram();
  // Warm-up request; failures are ignored.
  try {
    await bench(20);
  } catch {}

  const finals = [];
  for (let run = 1; run <= 5; run += 1) {
    try {
      const measured = await bench();
      finals.push(measured.tps);
      log(` Final ${run}: ${measured.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Final ${run}: ERR`);
    }
  }
  await kill();

  if (finals.length === 0) {
    return best;
  }

  let total = 0;
  for (const tps of finals) total += tps;
  const mean = total / finals.length;
  const peak = Math.max(...finals);
  log(` ★ FINAL: AVG ${mean.toFixed(2)} | BEST ${peak.toFixed(2)} t/s`);

  const final = {
    model: model.name,
    quant: model.quant,
    label: `FINAL`,
    avg_tps: Number(mean.toFixed(2)),
    best_tps: Number(peak.toFixed(2)),
    boot: Number(ready.boot.toFixed(1)),
    vram: gpus,
    params: best.params,
  };
  ALL.push(final);
  saveIntermediate();
  return final;
}
|
||||
|
||||
// ─── Main ──────────────────────────────────────────────────────
|
||||
/**
 * Entry point: tune every model in MODELS, rank the per-model winners by
 * average tokens/second, and write the ranking plus a recommended
 * llama-server command to scripts/dual_gpu_summary.txt alongside the full
 * result list in scripts/dual_gpu_results.json.
 */
async function main() {
  const startedAt = Date.now();
  const rule65 = "=".repeat(65);

  log(rule65);
  log(" DUAL-GPU BENCHMARK v2 — Smart Strategy");
  log(" 2x RTX 3060 (24GB) | 256K Context");
  log(" " + new Date().toISOString());
  log(rule65);
  for (const g of vram()) {
    log(` GPU${g.gpu}: ${g.used}/${g.total} MiB`);
  }

  const winners = [];
  for (const [index, model] of MODELS.entries()) {
    log(`\n${rule65}`);
    log(` MODEL ${index + 1}/${MODELS.length}: ${model.name}`);
    log(rule65);
    const winner = await tuneModel(model);
    if (winner) winners.push(winner);
    saveIntermediate();
  }

  // ─── Summary ──────────────────────────────────────────────
  const elapsed = ((Date.now() - startedAt) / 60000).toFixed(1);
  winners.sort((a, b) => b.avg_tps - a.avg_tps); // fastest first
  const medals = ["🥇", "🥈", "🥉", " "];
  const rule55 = "=".repeat(55);

  const lines = [
    `Dual-GPU Benchmark v2 — ${new Date().toISOString()}`,
    `2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`,
    "",
    rule55,
    " RANKING",
    rule55,
  ];

  winners.forEach((w, i) => {
    const p = w.params;
    const medal = i < medals.length ? medals[i] : " ";
    lines.push("", ` ${medal} #${i + 1}: ${w.model}`);
    lines.push(` AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`);
    lines.push(` ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b} ctk=${p.ctk} ctv=${p.ctv}`);
    if (p.cpuMoe) {
      lines.push(` --cpu-moe`);
    } else if (p.nCpuMoe) {
      lines.push(` --n-cpu-moe ${p.nCpuMoe}`);
    }
  });

  if (winners.length > 0) {
    const champion = winners[0];
    const cp = champion.params;
    lines.push("", rule55, ` ★ CHAMPION: ${champion.model} — ${champion.avg_tps} t/s`, rule55);

    // Rebuild the command line that reproduces the champion's config.
    const championPath = MODELS.find((m) => m.name === champion.model).path;
    let moeFlag = "";
    if (cp.cpuMoe) {
      moeFlag = "--cpu-moe";
    } else if (cp.nCpuMoe) {
      moeFlag = `--n-cpu-moe ${cp.nCpuMoe}`;
    }
    const cmd = [
      `llama-server --model ${championPath}`,
      `-ngl ${cp.ngl} -c ${CTX} -t ${cp.t} -tb ${cp.t}`,
      `-ub ${cp.ub} -b ${cp.b} -fa on`,
      `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
      `--prio ${cp.prio || 3} --poll 50 --mlock`,
      moeFlag,
      "--port 8000 --host 0.0.0.0",
    ].filter(Boolean).join(" ");
    lines.push("", " Recommended:", ` ${cmd}`);
  }

  const summary = lines.join("\n");
  console.log("\n" + summary);
  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL, null, 2));
  log(`\n Saved: dual_gpu_results.json + dual_gpu_summary.txt`);
  log(" DONE!");
  await kill();
}

main().catch((e) => {
  console.error("FATAL:", e);
  process.exit(1);
});
|
||||
1654
scripts/dual_gpu_results.json
Normal file
1654
scripts/dual_gpu_results.json
Normal file
File diff suppressed because it is too large
Load Diff
31
scripts/dual_gpu_summary.txt
Normal file
31
scripts/dual_gpu_summary.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
Dual-GPU Benchmark v2 — 2026-04-06T06:52:08.868Z
|
||||
2x RTX 3060 12GB | 256K Context | 58 configs | 69.4 min
|
||||
|
||||
=======================================================
|
||||
RANKING
|
||||
=======================================================
|
||||
|
||||
🥇 #1: Gemma4-26B Q4_K_M
|
||||
AVG: 76.4 t/s | BEST: 76.75 t/s | Boot: 9s
|
||||
ngl=999 t=6 ub=512 b=2048 ctk=f16 ctv=f16
|
||||
|
||||
🥈 #2: Gemma4-26B MXFP4_MOE
|
||||
AVG: 64.05 t/s | BEST: 64.29 t/s | Boot: 9s
|
||||
ngl=999 t=2 ub=512 b=2048 ctk=q8_0 ctv=q8_0
|
||||
|
||||
🥉 #3: Qwen3.5-35B Q4_K_M
|
||||
AVG: 52.05 t/s | BEST: 54.48 t/s | Boot: 12.1s
|
||||
ngl=999 t=4 ub=256 b=1024 ctk=q4_0 ctv=q4_0
|
||||
--n-cpu-moe 5
|
||||
|
||||
#4: Qwen3.5-35B MXFP4_MOE
|
||||
AVG: 46.66 t/s | BEST: 47.09 t/s | Boot: 15s
|
||||
ngl=999 t=6 ub=512 b=2048 ctk=q4_0 ctv=q4_0
|
||||
--n-cpu-moe 5
|
||||
|
||||
=======================================================
|
||||
★ CHAMPION: Gemma4-26B Q4_K_M — 76.4 t/s
|
||||
=======================================================
|
||||
|
||||
Recommended:
|
||||
llama-server --model models\gemma-4-26B-A4B-it-Q4_K_M.gguf -ngl 999 -c 262144 -t 6 -tb 6 -ub 512 -b 2048 -fa on --cache-type-k f16 --cache-type-v f16 --prio 3 --poll 50 --mlock --port 8000 --host 0.0.0.0
|
||||
BIN
scripts/final_tune_122b.txt
Normal file
BIN
scripts/final_tune_122b.txt
Normal file
Binary file not shown.
BIN
scripts/final_tune_122b_dual.txt
Normal file
BIN
scripts/final_tune_122b_dual.txt
Normal file
Binary file not shown.
101
scripts/find_max_dense.mjs
Normal file
101
scripts/find_max_dense.mjs
Normal file
@@ -0,0 +1,101 @@
|
||||
import { spawn, exec } from 'child_process';
|
||||
|
||||
/** Resolve after `ms` milliseconds. */
const delay = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});

/**
 * Force-kill every llama-server.exe via `taskkill`, then wait two seconds.
 * The outcome of taskkill is ignored.
 */
async function killServer() {
  await new Promise((done) => {
    exec('taskkill /F /IM llama-server.exe', () => done());
  });
  await delay(2000);
}
|
||||
|
||||
/**
 * Start llama-server with the given context size, wait for it to become
 * healthy, and run the quick benchmark script against it.
 *
 * @param {string} modelPath Model file name, resolved under `models\`.
 * @param {number} contextSize Value passed to `-c`.
 * @returns {Promise<boolean>} true when the server booted and the benchmark
 *   ran; false when it ran out of memory, exited, or never became ready.
 */
async function testContextSize(modelPath, contextSize) {
  console.log(`\nTesting ${modelPath} with -c ${contextSize}...`);
  await killServer();

  const args = [
    '--model', `models\\${modelPath}`,
    '-ngl', '999',
    '-c', contextSize.toString(),
    '-fa', 'on',
    '--cache-type-k', 'q4_0',
    '--cache-type-v', 'q4_0',
    '-ub', '512',
    '-b', '2048',
    '-t', '6',
    '-tb', '6',
    '--split-mode', 'row',
    '--prio', '3',
    '--fit', 'off',
    '--port', '8000',
    '--host', '0.0.0.0'
  ];

  const server = spawn('llama_bin_run\\llama-server.exe', args, { stdio: 'pipe' });

  let booted = false;
  let oomed = false;

  // stdout is piped but unused; drain it so the child never blocks on a
  // full pipe buffer.
  server.stdout.resume();
  server.stderr.on('data', (d) => {
    const text = d.toString();
    if (text.toLowerCase().includes('out of memory') || text.includes('failed to allocate')) {
      oomed = true;
    }
  });

  for (let i = 0; i < 20; i++) {
    // Stop polling once the server reported OOM or has already exited.
    if (oomed || server.exitCode !== null || server.signalCode !== null) break;
    try {
      // fetch() has no `timeout` option; an AbortSignal enforces the limit.
      const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
      if (res.status === 200) {
        booted = true;
        break;
      }
    } catch(e) {
      // Not reachable yet; keep polling.
    }
    await delay(2000);
  }

  if (oomed || !booted) {
    // Report the actual cause instead of labelling every failure as OOM.
    if (oomed) {
      console.log(`❌ Failed: Out of Memory at -c ${contextSize}`);
    } else {
      console.log(`❌ Failed: server did not become ready at -c ${contextSize}`);
    }
    server.kill('SIGKILL');
    await killServer();
    return false;
  }

  console.log(`✅ Booted! Running Benchmark...`);

  // Benchmark: print whatever the helper script wrote (stdout, else stderr).
  const bench = await new Promise(r => exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
    r(stdout || stderr);
  }));

  console.log(bench);
  await killServer();
  return true;
}
|
||||
|
||||
/**
 * Try context sizes from largest to smallest and report the first one the
 * model boots and benchmarks with.
 *
 * @param {string} modelName Model file name, passed to `testContextSize`.
 * @returns {Promise<void>}
 */
async function findMaxContext(modelName) {
  const candidates = [262144, 131072, 65536, 32768, 16384, 8192];

  for (const size of candidates) {
    const worked = await testContextSize(modelName, size);
    if (worked) {
      console.log(`\n🎉 MAX STABLE CONTEXT FOR ${modelName}: ${size}`);
      return;
    }
  }

  console.log(`\n❌ Failed to find any working context size for ${modelName}`);
}
|
||||
|
||||
/**
 * Entry point: find the largest working context size for each dense model.
 */
async function main() {
  // Clear CUDA_VISIBLE_DEVICES in THIS process so the spawned server
  // inherits an environment without it. Running `set ...` through exec()
  // only changed a short-lived shell and never reached the server.
  delete process.env.CUDA_VISIBLE_DEVICES;

  console.log("============= QWEN 27B Q4_K_M =============");
  await findMaxContext('Qwen3.5-27B-Q4_K_M.gguf');

  console.log("\n============= GEMMA 4 31B Q4_K_M =============");
  await findMaxContext('gemma-4-31B-it-Q4_K_M.gguf');
}

// Report failures explicitly rather than leaving the promise floating.
main().catch((err) => {
  console.error("FATAL:", err);
  process.exit(1);
});
|
||||
562
scripts/help_full.txt
Normal file
562
scripts/help_full.txt
Normal file
@@ -0,0 +1,562 @@
|
||||
----- common params -----
|
||||
|
||||
-h, --help, --usage print usage and exit
|
||||
--version show version and build info
|
||||
--license show source code license and dependencies
|
||||
-cl, --cache-list show list of models in cache
|
||||
--completion-bash print source-able bash completion script for llama.cpp
|
||||
-t, --threads N number of CPU threads to use during generation (default: -1)
|
||||
(env: LLAMA_ARG_THREADS)
|
||||
-tb, --threads-batch N number of threads to use during batch and prompt processing (default:
|
||||
same as --threads)
|
||||
-C, --cpu-mask M CPU affinity mask: arbitrarily long hex. Complements cpu-range
|
||||
(default: "")
|
||||
-Cr, --cpu-range lo-hi range of CPUs for affinity. Complements --cpu-mask
|
||||
--cpu-strict <0|1> use strict CPU placement (default: 0)
|
||||
--prio N set process/thread priority : low(-1), normal(0), medium(1), high(2),
|
||||
realtime(3) (default: 0)
|
||||
--poll <0...100> use polling level to wait for work (0 - no polling, default: 50)
|
||||
-Cb, --cpu-mask-batch M CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch
|
||||
(default: same as --cpu-mask)
|
||||
-Crb, --cpu-range-batch lo-hi ranges of CPUs for affinity. Complements --cpu-mask-batch
|
||||
--cpu-strict-batch <0|1> use strict CPU placement (default: same as --cpu-strict)
|
||||
--prio-batch N set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime
|
||||
(default: 0)
|
||||
--poll-batch <0|1> use polling to wait for work (default: same as --poll)
|
||||
-c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model)
|
||||
(env: LLAMA_ARG_CTX_SIZE)
|
||||
-n, --predict, --n-predict N number of tokens to predict (default: -1, -1 = infinity)
|
||||
(env: LLAMA_ARG_N_PREDICT)
|
||||
-b, --batch-size N logical maximum batch size (default: 2048)
|
||||
(env: LLAMA_ARG_BATCH)
|
||||
-ub, --ubatch-size N physical maximum batch size (default: 512)
|
||||
(env: LLAMA_ARG_UBATCH)
|
||||
--keep N number of tokens to keep from the initial prompt (default: 0, -1 =
|
||||
all)
|
||||
--swa-full use full-size SWA cache (default: false)
|
||||
[(more
|
||||
info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
||||
(env: LLAMA_ARG_SWA_FULL)
|
||||
-fa, --flash-attn [on|off|auto] set Flash Attention use ('on', 'off', or 'auto', default: 'auto')
|
||||
(env: LLAMA_ARG_FLASH_ATTN)
|
||||
--perf, --no-perf whether to enable internal libllama performance timings (default:
|
||||
false)
|
||||
(env: LLAMA_ARG_PERF)
|
||||
-e, --escape, --no-escape whether to process escapes sequences (\n, \r, \t, \', \", \\)
|
||||
(default: true)
|
||||
--rope-scaling {none,linear,yarn} RoPE frequency scaling method, defaults to linear unless specified by
|
||||
the model
|
||||
(env: LLAMA_ARG_ROPE_SCALING_TYPE)
|
||||
--rope-scale N RoPE context scaling factor, expands context by a factor of N
|
||||
(env: LLAMA_ARG_ROPE_SCALE)
|
||||
--rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from
|
||||
model)
|
||||
(env: LLAMA_ARG_ROPE_FREQ_BASE)
|
||||
--rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N
|
||||
(env: LLAMA_ARG_ROPE_FREQ_SCALE)
|
||||
--yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training
|
||||
context size)
|
||||
(env: LLAMA_ARG_YARN_ORIG_CTX)
|
||||
--yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.00, 0.0 = full
|
||||
interpolation)
|
||||
(env: LLAMA_ARG_YARN_EXT_FACTOR)
|
||||
--yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: -1.00)
|
||||
(env: LLAMA_ARG_YARN_ATTN_FACTOR)
|
||||
--yarn-beta-slow N YaRN: high correction dim or alpha (default: -1.00)
|
||||
(env: LLAMA_ARG_YARN_BETA_SLOW)
|
||||
--yarn-beta-fast N YaRN: low correction dim or beta (default: -1.00)
|
||||
(env: LLAMA_ARG_YARN_BETA_FAST)
|
||||
-kvo, --kv-offload, -nkvo, --no-kv-offload
|
||||
whether to enable KV cache offloading (default: enabled)
|
||||
(env: LLAMA_ARG_KV_OFFLOAD)
|
||||
--repack, -nr, --no-repack whether to enable weight repacking (default: enabled)
|
||||
(env: LLAMA_ARG_REPACK)
|
||||
--no-host bypass host buffer allowing extra buffers to be used
|
||||
(env: LLAMA_ARG_NO_HOST)
|
||||
-ctk, --cache-type-k TYPE KV cache data type for K
|
||||
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||
(default: f16)
|
||||
(env: LLAMA_ARG_CACHE_TYPE_K)
|
||||
-ctv, --cache-type-v TYPE KV cache data type for V
|
||||
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||
(default: f16)
|
||||
(env: LLAMA_ARG_CACHE_TYPE_V)
|
||||
-dt, --defrag-thold N KV cache defragmentation threshold (DEPRECATED)
|
||||
(env: LLAMA_ARG_DEFRAG_THOLD)
|
||||
--rpc SERVERS comma separated list of RPC servers (host:port)
|
||||
(env: LLAMA_ARG_RPC)
|
||||
--mlock force system to keep model in RAM rather than swapping or compressing
|
||||
(env: LLAMA_ARG_MLOCK)
|
||||
--mmap, --no-mmap whether to memory-map model. (if mmap disabled, slower load but may
|
||||
reduce pageouts if not using mlock) (default: enabled)
|
||||
(env: LLAMA_ARG_MMAP)
|
||||
-dio, --direct-io, -ndio, --no-direct-io
|
||||
use DirectIO if available. (default: disabled)
|
||||
(env: LLAMA_ARG_DIO)
|
||||
--numa TYPE attempt optimizations that help on some NUMA systems
|
||||
- distribute: spread execution evenly over all nodes
|
||||
- isolate: only spawn threads on CPUs on the node that execution
|
||||
started on
|
||||
- numactl: use the CPU map provided by numactl
|
||||
if run without this previously, it is recommended to drop the system
|
||||
page cache before using this
|
||||
see https://github.com/ggml-org/llama.cpp/issues/1437
|
||||
(env: LLAMA_ARG_NUMA)
|
||||
-dev, --device <dev1,dev2,..> comma-separated list of devices to use for offloading (none = don't
|
||||
offload)
|
||||
use --list-devices to see a list of available devices
|
||||
(env: LLAMA_ARG_DEVICE)
|
||||
--list-devices print list of available devices and exit
|
||||
-ot, --override-tensor <tensor name pattern>=<buffer type>,...
|
||||
override tensor buffer type
|
||||
(env: LLAMA_ARG_OVERRIDE_TENSOR)
|
||||
-cmoe, --cpu-moe keep all Mixture of Experts (MoE) weights in the CPU
|
||||
(env: LLAMA_ARG_CPU_MOE)
|
||||
-ncmoe, --n-cpu-moe N keep the Mixture of Experts (MoE) weights of the first N layers in the
|
||||
CPU
|
||||
(env: LLAMA_ARG_N_CPU_MOE)
|
||||
-ngl, --gpu-layers, --n-gpu-layers N max. number of layers to store in VRAM, either an exact number,
|
||||
'auto', or 'all' (default: auto)
|
||||
(env: LLAMA_ARG_N_GPU_LAYERS)
|
||||
-sm, --split-mode {none,layer,row} how to split the model across multiple GPUs, one of:
|
||||
- none: use one GPU only
|
||||
- layer (default): split layers and KV across GPUs
|
||||
- row: split rows across GPUs
|
||||
(env: LLAMA_ARG_SPLIT_MODE)
|
||||
-ts, --tensor-split N0,N1,N2,... fraction of the model to offload to each GPU, comma-separated list of
|
||||
proportions, e.g. 3,1
|
||||
(env: LLAMA_ARG_TENSOR_SPLIT)
|
||||
-mg, --main-gpu INDEX the GPU to use for the model (with split-mode = none), or for
|
||||
intermediate results and KV (with split-mode = row) (default: 0)
|
||||
(env: LLAMA_ARG_MAIN_GPU)
|
||||
-fit, --fit [on|off] whether to adjust unset arguments to fit in device memory ('on' or
|
||||
'off', default: 'on')
|
||||
(env: LLAMA_ARG_FIT)
|
||||
-fitt, --fit-target MiB0,MiB1,MiB2,...
|
||||
target margin per device for --fit, comma-separated list of values,
|
||||
single value is broadcast across all devices, default: 1024
|
||||
(env: LLAMA_ARG_FIT_TARGET)
|
||||
-fitc, --fit-ctx N minimum ctx size that can be set by --fit option, default: 4096
|
||||
(env: LLAMA_ARG_FIT_CTX)
|
||||
--check-tensors check model tensor data for invalid values (default: false)
|
||||
--override-kv KEY=TYPE:VALUE,... advanced option to override model metadata by key. to specify multiple
|
||||
overrides, either use comma-separated values.
|
||||
types: int, float, bool, str. example: --override-kv
|
||||
tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false
|
||||
--op-offload, --no-op-offload whether to offload host tensor operations to device (default: true)
|
||||
--lora FNAME path to LoRA adapter (use comma-separated values to load multiple
|
||||
adapters)
|
||||
--lora-scaled FNAME:SCALE,... path to LoRA adapter with user defined scaling (format:
|
||||
FNAME:SCALE,...)
|
||||
note: use comma-separated values
|
||||
--control-vector FNAME add a control vector
|
||||
note: use comma-separated values to add multiple control vectors
|
||||
--control-vector-scaled FNAME:SCALE,...
|
||||
add a control vector with user defined scaling SCALE
|
||||
note: use comma-separated values (format: FNAME:SCALE,...)
|
||||
--control-vector-layer-range START END
|
||||
layer range to apply the control vector(s) to, start and end inclusive
|
||||
-m, --model FNAME model path to load
|
||||
(env: LLAMA_ARG_MODEL)
|
||||
-mu, --model-url MODEL_URL model download url (default: unused)
|
||||
(env: LLAMA_ARG_MODEL_URL)
|
||||
-dr, --docker-repo [<repo>/]<model>[:quant]
|
||||
Docker Hub model repository. repo is optional, default to ai/. quant
|
||||
is optional, default to :latest.
|
||||
example: gemma3
|
||||
(default: unused)
|
||||
(env: LLAMA_ARG_DOCKER_REPO)
|
||||
-hf, -hfr, --hf-repo <user>/<model>[:quant]
|
||||
Hugging Face model repository; quant is optional, case-insensitive,
|
||||
default to Q4_K_M, or falls back to the first file in the repo if
|
||||
Q4_K_M doesn't exist.
|
||||
mmproj is also downloaded automatically if available. to disable, add
|
||||
--no-mmproj
|
||||
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
|
||||
(default: unused)
|
||||
(env: LLAMA_ARG_HF_REPO)
|
||||
-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]
|
||||
Same as --hf-repo, but for the draft model (default: unused)
|
||||
(env: LLAMA_ARG_HFD_REPO)
|
||||
-hff, --hf-file FILE Hugging Face model file. If specified, it will override the quant in
|
||||
--hf-repo (default: unused)
|
||||
(env: LLAMA_ARG_HF_FILE)
|
||||
-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]
|
||||
Hugging Face model repository for the vocoder model (default: unused)
|
||||
(env: LLAMA_ARG_HF_REPO_V)
|
||||
-hffv, --hf-file-v FILE Hugging Face model file for the vocoder model (default: unused)
|
||||
(env: LLAMA_ARG_HF_FILE_V)
|
||||
-hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment
|
||||
variable)
|
||||
(env: HF_TOKEN)
|
||||
--log-disable Log disable
|
||||
--log-file FNAME Log to file
|
||||
(env: LLAMA_LOG_FILE)
|
||||
--log-colors [on|off|auto] Set colored logging ('on', 'off', or 'auto', default: 'auto')
|
||||
'auto' enables colors when output is to a terminal
|
||||
(env: LLAMA_LOG_COLORS)
|
||||
-v, --verbose, --log-verbose Set verbosity level to infinity (i.e. log all messages, useful for
|
||||
debugging)
|
||||
--offline Offline mode: forces use of cache, prevents network access
|
||||
(env: LLAMA_OFFLINE)
|
||||
-lv, --verbosity, --log-verbosity N Set the verbosity threshold. Messages with a higher verbosity will be
|
||||
ignored. Values:
|
||||
- 0: generic output
|
||||
- 1: error
|
||||
- 2: warning
|
||||
- 3: info
|
||||
- 4: debug
|
||||
(default: 3)
|
||||
|
||||
(env: LLAMA_LOG_VERBOSITY)
|
||||
--log-prefix Enable prefix in log messages
|
||||
(env: LLAMA_LOG_PREFIX)
|
||||
--log-timestamps Enable timestamps in log messages
|
||||
(env: LLAMA_LOG_TIMESTAMPS)
|
||||
-ctkd, --cache-type-k-draft TYPE KV cache data type for K for the draft model
|
||||
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||
(default: f16)
|
||||
(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT)
|
||||
-ctvd, --cache-type-v-draft TYPE KV cache data type for V for the draft model
|
||||
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||
(default: f16)
|
||||
(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT)
|
||||
|
||||
|
||||
----- sampling params -----
|
||||
|
||||
--samplers SAMPLERS samplers that will be used for generation in the order, separated by
|
||||
';'
|
||||
(default:
|
||||
penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature)
|
||||
-s, --seed SEED RNG seed (default: -1, use random seed for -1)
|
||||
--sampler-seq, --sampling-seq SEQUENCE
|
||||
simplified sequence for samplers that will be used (default:
|
||||
edskypmxt)
|
||||
--ignore-eos ignore end of stream token and continue generating (implies
|
||||
--logit-bias EOS-inf)
|
||||
--temp, --temperature N temperature (default: 0.80)
|
||||
--top-k N top-k sampling (default: 40, 0 = disabled)
|
||||
(env: LLAMA_ARG_TOP_K)
|
||||
--top-p N top-p sampling (default: 0.95, 1.0 = disabled)
|
||||
--min-p N min-p sampling (default: 0.05, 0.0 = disabled)
|
||||
--top-nsigma, --top-n-sigma N top-n-sigma sampling (default: -1.00, -1.0 = disabled)
|
||||
--xtc-probability N xtc probability (default: 0.00, 0.0 = disabled)
|
||||
--xtc-threshold N xtc threshold (default: 0.10, 1.0 = disabled)
|
||||
--typical, --typical-p N locally typical sampling, parameter p (default: 1.00, 1.0 = disabled)
|
||||
--repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1
|
||||
= ctx_size)
|
||||
--repeat-penalty N penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled)
|
||||
--presence-penalty N repeat alpha presence penalty (default: 0.00, 0.0 = disabled)
|
||||
--frequency-penalty N repeat alpha frequency penalty (default: 0.00, 0.0 = disabled)
|
||||
--dry-multiplier N set DRY sampling multiplier (default: 0.00, 0.0 = disabled)
|
||||
--dry-base N set DRY sampling base value (default: 1.75)
|
||||
--dry-allowed-length N set allowed length for DRY sampling (default: 2)
|
||||
--dry-penalty-last-n N set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 =
|
||||
context size)
|
||||
--dry-sequence-breaker STRING add sequence breaker for DRY sampling, clearing out default breakers
|
||||
('\n', ':', '"', '*') in the process; use "none" to not use any
|
||||
sequence breakers
|
||||
--adaptive-target N adaptive-p: select tokens near this probability (valid range 0.0 to
|
||||
1.0; negative = disabled) (default: -1.00)
|
||||
[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)
|
||||
--adaptive-decay N adaptive-p: decay rate for target adaptation over time. lower values
|
||||
are more reactive, higher values are more stable.
|
||||
(valid range 0.0 to 0.99) (default: 0.90)
|
||||
--dynatemp-range N dynamic temperature range (default: 0.00, 0.0 = disabled)
|
||||
--dynatemp-exp N dynamic temperature exponent (default: 1.00)
|
||||
--mirostat N use Mirostat sampling.
|
||||
Top K, Nucleus and Locally Typical samplers are ignored if used.
|
||||
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
|
||||
--mirostat-lr N Mirostat learning rate, parameter eta (default: 0.10)
|
||||
--mirostat-ent N Mirostat target entropy, parameter tau (default: 5.00)
|
||||
-l, --logit-bias TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion,
|
||||
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
|
||||
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
|
||||
--grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/
|
||||
dir)
|
||||
--grammar-file FNAME file to read grammar from
|
||||
-j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g.
|
||||
`{}` for any JSON object
|
||||
For schemas w/ external $refs, use --grammar +
|
||||
example/json_schema_to_grammar.py instead
|
||||
-jf, --json-schema-file FILE File containing a JSON schema to constrain generations
|
||||
(https://json-schema.org/), e.g. `{}` for any JSON object
|
||||
For schemas w/ external $refs, use --grammar +
|
||||
example/json_schema_to_grammar.py instead
|
||||
-bs, --backend-sampling enable backend sampling (experimental) (default: disabled)
|
||||
(env: LLAMA_ARG_BACKEND_SAMPLING)
|
||||
|
||||
|
||||
----- example-specific params -----
|
||||
|
||||
-lcs, --lookup-cache-static FNAME path to static lookup cache to use for lookup decoding (not updated by
|
||||
generation)
|
||||
-lcd, --lookup-cache-dynamic FNAME path to dynamic lookup cache to use for lookup decoding (updated by
|
||||
generation)
|
||||
-ctxcp, --ctx-checkpoints, --swa-checkpoints N
|
||||
max number of context checkpoints to create per slot (default:
|
||||
32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
|
||||
(env: LLAMA_ARG_CTX_CHECKPOINTS)
|
||||
-cpent, --checkpoint-every-n-tokens N create a checkpoint every n tokens during prefill (processing), -1 to
|
||||
disable (default: 8192)
|
||||
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT)
|
||||
-cram, --cache-ram N set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 -
|
||||
disable)[(more
|
||||
info)](https://github.com/ggml-org/llama.cpp/pull/16391)
|
||||
(env: LLAMA_ARG_CACHE_RAM)
|
||||
-kvu, --kv-unified, -no-kvu, --no-kv-unified
|
||||
use single unified KV buffer shared across all sequences (default:
|
||||
enabled if number of slots is auto)
|
||||
(env: LLAMA_ARG_KV_UNIFIED)
|
||||
--clear-idle, --no-clear-idle save and clear idle slots on new task (default: enabled, requires
|
||||
unified KV and cache-ram)
|
||||
(env: LLAMA_ARG_CLEAR_IDLE)
|
||||
--context-shift, --no-context-shift whether to use context shift on infinite text generation (default:
|
||||
disabled)
|
||||
(env: LLAMA_ARG_CONTEXT_SHIFT)
|
||||
-r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode
|
||||
-sp, --special special tokens output enabled (default: false)
|
||||
--warmup, --no-warmup whether to perform warmup with an empty run (default: enabled)
|
||||
--spm-infill use Suffix/Prefix/Middle pattern for infill (instead of
|
||||
Prefix/Suffix/Middle) as some models prefer this. (default: disabled)
|
||||
--pooling {none,mean,cls,last,rank} pooling type for embeddings, use model default if unspecified
|
||||
(env: LLAMA_ARG_POOLING)
|
||||
-np, --parallel N number of server slots (default: -1, -1 = auto)
|
||||
(env: LLAMA_ARG_N_PARALLEL)
|
||||
-cb, --cont-batching, -nocb, --no-cont-batching
|
||||
whether to enable continuous batching (a.k.a dynamic batching)
|
||||
(default: enabled)
|
||||
(env: LLAMA_ARG_CONT_BATCHING)
|
||||
-mm, --mmproj FILE path to a multimodal projector file. see tools/mtmd/README.md
|
||||
note: if -hf is used, this argument can be omitted
|
||||
(env: LLAMA_ARG_MMPROJ)
|
||||
-mmu, --mmproj-url URL URL to a multimodal projector file. see tools/mtmd/README.md
|
||||
(env: LLAMA_ARG_MMPROJ_URL)
|
||||
--mmproj-auto, --no-mmproj, --no-mmproj-auto
|
||||
whether to use multimodal projector file (if available), useful when
|
||||
using -hf (default: enabled)
|
||||
(env: LLAMA_ARG_MMPROJ_AUTO)
|
||||
--mmproj-offload, --no-mmproj-offload whether to enable GPU offloading for multimodal projector (default:
|
||||
enabled)
|
||||
(env: LLAMA_ARG_MMPROJ_OFFLOAD)
|
||||
--image-min-tokens N minimum number of tokens each image can take, only used by vision
|
||||
models with dynamic resolution (default: read from model)
|
||||
(env: LLAMA_ARG_IMAGE_MIN_TOKENS)
|
||||
--image-max-tokens N maximum number of tokens each image can take, only used by vision
|
||||
models with dynamic resolution (default: read from model)
|
||||
(env: LLAMA_ARG_IMAGE_MAX_TOKENS)
|
||||
-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...
|
||||
override tensor buffer type for draft model
|
||||
-cmoed, --cpu-moe-draft keep all Mixture of Experts (MoE) weights in the CPU for the draft
|
||||
model
|
||||
(env: LLAMA_ARG_CPU_MOE_DRAFT)
|
||||
-ncmoed, --n-cpu-moe-draft N keep the Mixture of Experts (MoE) weights of the first N layers in the
|
||||
CPU for the draft model
|
||||
(env: LLAMA_ARG_N_CPU_MOE_DRAFT)
|
||||
-a, --alias STRING set model name aliases, comma-separated (to be used by API)
|
||||
(env: LLAMA_ARG_ALIAS)
|
||||
--tags STRING set model tags, comma-separated (informational, not used for routing)
|
||||
(env: LLAMA_ARG_TAGS)
|
||||
--host HOST ip address to listen, or bind to an UNIX socket if the address ends
|
||||
with .sock (default: 127.0.0.1)
|
||||
(env: LLAMA_ARG_HOST)
|
||||
--port PORT port to listen (default: 8080)
|
||||
(env: LLAMA_ARG_PORT)
|
||||
--reuse-port allow multiple sockets to bind to the same port (default: disabled)
|
||||
(env: LLAMA_ARG_REUSE_PORT)
|
||||
--path PATH path to serve static files from (default: )
|
||||
(env: LLAMA_ARG_STATIC_PATH)
|
||||
--api-prefix PREFIX prefix path the server serves from, without the trailing slash
|
||||
(default: )
|
||||
(env: LLAMA_ARG_API_PREFIX)
|
||||
--webui-config JSON JSON that provides default WebUI settings (overrides WebUI defaults)
|
||||
(env: LLAMA_ARG_WEBUI_CONFIG)
|
||||
--webui-config-file PATH JSON file that provides default WebUI settings (overrides WebUI
|
||||
defaults)
|
||||
(env: LLAMA_ARG_WEBUI_CONFIG_FILE)
|
||||
--webui-mcp-proxy, --no-webui-mcp-proxy
|
||||
experimental: whether to enable MCP CORS proxy - do not enable in
|
||||
untrusted environments (default: disabled)
|
||||
(env: LLAMA_ARG_WEBUI_MCP_PROXY)
|
||||
--tools TOOL1,TOOL2,... experimental: whether to enable built-in tools for AI agents - do not
|
||||
enable in untrusted environments (default: no tools)
|
||||
specify "all" to enable all tools
|
||||
available tools: read_file, file_glob_search, grep_search,
|
||||
exec_shell_command, write_file, edit_file, apply_diff
|
||||
(env: LLAMA_ARG_TOOLS)
|
||||
--webui, --no-webui whether to enable the Web UI (default: enabled)
|
||||
(env: LLAMA_ARG_WEBUI)
|
||||
--embedding, --embeddings restrict to only support embedding use case; use only with dedicated
|
||||
embedding models (default: disabled)
|
||||
(env: LLAMA_ARG_EMBEDDINGS)
|
||||
--rerank, --reranking enable reranking endpoint on server (default: disabled)
|
||||
(env: LLAMA_ARG_RERANKING)
|
||||
--api-key KEY API key to use for authentication, multiple keys can be provided as a
|
||||
comma-separated list (default: none)
|
||||
(env: LLAMA_API_KEY)
|
||||
--api-key-file FNAME path to file containing API keys (default: none)
|
||||
--ssl-key-file FNAME path to file a PEM-encoded SSL private key
|
||||
(env: LLAMA_ARG_SSL_KEY_FILE)
|
||||
--ssl-cert-file FNAME path to file a PEM-encoded SSL certificate
|
||||
(env: LLAMA_ARG_SSL_CERT_FILE)
|
||||
--chat-template-kwargs STRING sets additional params for the json template parser, must be a valid
|
||||
json object string, e.g. '{"key1":"value1","key2":"value2"}'
|
||||
(env: LLAMA_CHAT_TEMPLATE_KWARGS)
|
||||
-to, --timeout N server read/write timeout in seconds (default: 600)
|
||||
(env: LLAMA_ARG_TIMEOUT)
|
||||
--threads-http N number of threads used to process HTTP requests (default: -1)
|
||||
(env: LLAMA_ARG_THREADS_HTTP)
|
||||
--cache-prompt, --no-cache-prompt whether to enable prompt caching (default: enabled)
|
||||
(env: LLAMA_ARG_CACHE_PROMPT)
|
||||
--cache-reuse N min chunk size to attempt reusing from the cache via KV shifting,
|
||||
requires prompt caching to be enabled (default: 0)
|
||||
[(card)](https://ggml.ai/f0.png)
|
||||
(env: LLAMA_ARG_CACHE_REUSE)
|
||||
--metrics enable prometheus compatible metrics endpoint (default: disabled)
|
||||
(env: LLAMA_ARG_ENDPOINT_METRICS)
|
||||
--props enable changing global properties via POST /props (default: disabled)
|
||||
(env: LLAMA_ARG_ENDPOINT_PROPS)
|
||||
--slots, --no-slots expose slots monitoring endpoint (default: enabled)
|
||||
(env: LLAMA_ARG_ENDPOINT_SLOTS)
|
||||
--slot-save-path PATH path to save slot kv cache (default: disabled)
|
||||
--media-path PATH directory for loading local media files; files can be accessed via
|
||||
file:// URLs using relative paths (default: disabled)
|
||||
--models-dir PATH directory containing models for the router server (default: disabled)
|
||||
(env: LLAMA_ARG_MODELS_DIR)
|
||||
--models-preset PATH path to INI file containing model presets for the router server
|
||||
(default: disabled)
|
||||
(env: LLAMA_ARG_MODELS_PRESET)
|
||||
--models-max N for router server, maximum number of models to load simultaneously
|
||||
(default: 4, 0 = unlimited)
|
||||
(env: LLAMA_ARG_MODELS_MAX)
|
||||
--models-autoload, --no-models-autoload
|
||||
for router server, whether to automatically load models (default:
|
||||
enabled)
|
||||
(env: LLAMA_ARG_MODELS_AUTOLOAD)
|
||||
--jinja, --no-jinja whether to use jinja template engine for chat (default: enabled)
|
||||
(env: LLAMA_ARG_JINJA)
|
||||
--reasoning-format FORMAT controls whether thought tags are allowed and/or extracted from the
|
||||
response, and in which format they're returned; one of:
|
||||
- none: leaves thoughts unparsed in `message.content`
|
||||
- deepseek: puts thoughts in `message.reasoning_content`
|
||||
- deepseek-legacy: keeps `<think>` tags in `message.content` while
|
||||
also populating `message.reasoning_content`
|
||||
(default: auto)
|
||||
(env: LLAMA_ARG_THINK)
|
||||
-rea, --reasoning [on|off|auto] Use reasoning/thinking in the chat ('on', 'off', or 'auto', default:
|
||||
'auto' (detect from template))
|
||||
(env: LLAMA_ARG_REASONING)
|
||||
--reasoning-budget N token budget for thinking: -1 for unrestricted, 0 for immediate end,
|
||||
N>0 for token budget (default: -1)
|
||||
(env: LLAMA_ARG_THINK_BUDGET)
|
||||
--reasoning-budget-message MESSAGE message injected before the end-of-thinking tag when reasoning budget
|
||||
is exhausted (default: none)
|
||||
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE)
|
||||
--chat-template JINJA_TEMPLATE set custom jinja chat template (default: template taken from model's
|
||||
metadata)
|
||||
if suffix/prefix are specified, template will be disabled
|
||||
only commonly used templates are accepted (unless --jinja is set
|
||||
before this flag):
|
||||
list of built-in templates:
|
||||
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml,
|
||||
command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe,
|
||||
exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite,
|
||||
granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2,
|
||||
llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez,
|
||||
minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7,
|
||||
mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3,
|
||||
phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca,
|
||||
yandex, zephyr
|
||||
(env: LLAMA_ARG_CHAT_TEMPLATE)
|
||||
--chat-template-file JINJA_TEMPLATE_FILE
|
||||
set custom jinja chat template file (default: template taken from
|
||||
model's metadata)
|
||||
if suffix/prefix are specified, template will be disabled
|
||||
only commonly used templates are accepted (unless --jinja is set
|
||||
before this flag):
|
||||
list of built-in templates:
|
||||
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml,
|
||||
command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe,
|
||||
exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite,
|
||||
granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2,
|
||||
llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez,
|
||||
minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7,
|
||||
mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3,
|
||||
phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca,
|
||||
yandex, zephyr
|
||||
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE)
|
||||
--skip-chat-parsing, --no-skip-chat-parsing
|
||||
force a pure content parser, even if a Jinja template is specified;
|
||||
model will output everything in the content section, including any
|
||||
reasoning and/or tool calls (default: disabled)
|
||||
(env: LLAMA_ARG_SKIP_CHAT_PARSING)
|
||||
--prefill-assistant, --no-prefill-assistant
|
||||
whether to prefill the assistant's response if the last message is an
|
||||
assistant message (default: prefill enabled)
|
||||
when this flag is set, if the last message is an assistant message
|
||||
then it will be treated as a full message and not prefilled
|
||||
|
||||
(env: LLAMA_ARG_PREFILL_ASSISTANT)
|
||||
-sps, --slot-prompt-similarity SIMILARITY
|
||||
how much the prompt of a request must match the prompt of a slot in
|
||||
order to use that slot (default: 0.10, 0.0 = disabled)
|
||||
--lora-init-without-apply load LoRA adapters without applying them (apply later via POST
|
||||
/lora-adapters) (default: disabled)
|
||||
--sleep-idle-seconds SECONDS number of seconds of idleness after which the server will sleep
|
||||
(default: -1; -1 = disabled)
|
||||
-td, --threads-draft N number of threads to use during generation (default: same as
|
||||
--threads)
|
||||
-tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default:
|
||||
same as --threads-draft)
|
||||
--draft, --draft-n, --draft-max N number of tokens to draft for speculative decoding (default: 16)
|
||||
(env: LLAMA_ARG_DRAFT_MAX)
|
||||
--draft-min, --draft-n-min N minimum number of draft tokens to use for speculative decoding
|
||||
(default: 0)
|
||||
(env: LLAMA_ARG_DRAFT_MIN)
|
||||
--draft-p-min P minimum speculative decoding probability (greedy) (default: 0.75)
|
||||
(env: LLAMA_ARG_DRAFT_P_MIN)
|
||||
-cd, --ctx-size-draft N size of the prompt context for the draft model (default: 0, 0 = loaded
|
||||
from model)
|
||||
(env: LLAMA_ARG_CTX_SIZE_DRAFT)
|
||||
-devd, --device-draft <dev1,dev2,..> comma-separated list of devices to use for offloading the draft model
|
||||
(none = don't offload)
|
||||
use --list-devices to see a list of available devices
|
||||
-ngld, --gpu-layers-draft, --n-gpu-layers-draft N
|
||||
max. number of draft model layers to store in VRAM, either an exact
|
||||
number, 'auto', or 'all' (default: auto)
|
||||
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT)
|
||||
-md, --model-draft FNAME draft model for speculative decoding (default: unused)
|
||||
(env: LLAMA_ARG_MODEL_DRAFT)
|
||||
--spec-replace TARGET DRAFT translate the string in TARGET into DRAFT if the draft model and main
|
||||
model are not compatible
|
||||
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
|
||||
type of speculative decoding to use when no draft model is provided
|
||||
(default: none)
|
||||
|
||||
(env: LLAMA_ARG_SPEC_TYPE)
|
||||
--spec-ngram-size-n N ngram size N for ngram-simple/ngram-map speculative decoding, length
|
||||
of lookup n-gram (default: 12)
|
||||
--spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length
|
||||
of draft m-gram (default: 48)
|
||||
--spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1)
|
||||
-mv, --model-vocoder FNAME vocoder model for audio generation (default: unused)
|
||||
--tts-use-guide-tokens Use guide tokens to improve TTS word recall
|
||||
--embd-gemma-default use default EmbeddingGemma model (note: can download weights from the
|
||||
internet)
|
||||
--fim-qwen-1.5b-default use default Qwen 2.5 Coder 1.5B (note: can download weights from the
|
||||
internet)
|
||||
--fim-qwen-3b-default use default Qwen 2.5 Coder 3B (note: can download weights from the
|
||||
internet)
|
||||
--fim-qwen-7b-default use default Qwen 2.5 Coder 7B (note: can download weights from the
|
||||
internet)
|
||||
--fim-qwen-7b-spec use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can
|
||||
download weights from the internet)
|
||||
--fim-qwen-14b-spec use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note:
|
||||
can download weights from the internet)
|
||||
--fim-qwen-30b-default use default Qwen 3 Coder 30B A3B Instruct (note: can download weights
|
||||
from the internet)
|
||||
--gpt-oss-20b-default use gpt-oss-20b (note: can download weights from the internet)
|
||||
--gpt-oss-120b-default use gpt-oss-120b (note: can download weights from the internet)
|
||||
--vision-gemma-4b-default use Gemma 3 4B QAT (note: can download weights from the internet)
|
||||
--vision-gemma-12b-default use Gemma 3 12B QAT (note: can download weights from the internet)
|
||||
31
scripts/help_gpu_flags.txt
Normal file
31
scripts/help_gpu_flags.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
ggml_cuda_init: found 2 CUDA devices (Total VRAM: 24575 MiB):
|
||||
Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes, VRAM: 12287 MiB
|
||||
Device 1: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes, VRAM: 12287 MiB
|
||||
-dev, --device <dev1,dev2,..> comma-separated list of devices to use for offloading (none = don't
|
||||
use --list-devices to see a list of available devices
|
||||
(env: LLAMA_ARG_DEVICE)
|
||||
--list-devices print list of available devices and exit
|
||||
-ot, --override-tensor <tensor name pattern>=<buffer type>,...
|
||||
override tensor buffer type
|
||||
(env: LLAMA_ARG_OVERRIDE_TENSOR)
|
||||
-cmoe, --cpu-moe keep all Mixture of Experts (MoE) weights in the CPU
|
||||
-ncmoe, --n-cpu-moe N keep the Mixture of Experts (MoE) weights of the first N layers in the
|
||||
-sm, --split-mode {none,layer,row} how to split the model across multiple GPUs, one of:
|
||||
- layer (default): split layers and KV across GPUs
|
||||
- row: split rows across GPUs
|
||||
(env: LLAMA_ARG_SPLIT_MODE)
|
||||
-ts, --tensor-split N0,N1,N2,... fraction of the model to offload to each GPU, comma-separated list of
|
||||
(env: LLAMA_ARG_TENSOR_SPLIT)
|
||||
-mg, --main-gpu INDEX the GPU to use for the model (with split-mode = none), or for
|
||||
intermediate results and KV (with split-mode = row) (default: 0)
|
||||
-fit, --fit [on|off] whether to adjust unset arguments to fit in device memory ('on' or
|
||||
target margin per device for --fit, comma-separated list of values,
|
||||
single value is broadcast across all devices, default: 1024
|
||||
--check-tensors check model tensor data for invalid values (default: false)
|
||||
--op-offload, --no-op-offload whether to offload host tensor operations to device (default: true)
|
||||
-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...
|
||||
override tensor buffer type for draft model
|
||||
-cmoed, --cpu-moe-draft keep all Mixture of Experts (MoE) weights in the CPU for the draft
|
||||
-ncmoed, --n-cpu-moe-draft N keep the Mixture of Experts (MoE) weights of the first N layers in the
|
||||
-devd, --device-draft <dev1,dev2,..> comma-separated list of devices to use for offloading the draft model
|
||||
use --list-devices to see a list of available devices
|
||||
28
scripts/hf_search.py
Normal file
28
scripts/hf_search.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from huggingface_hub import HfApi
|
||||
import sys
|
||||
|
||||
api = HfApi()
|
||||
|
||||
def search_gguf(query, client=None):
    """Search the Hugging Face Hub for `query` and print GGUF files per repo.

    For each of the first three matching repositories the function prints the
    repo id followed by its Q4_K_M `.gguf` files; when a repo has no Q4_K_M
    file, up to three of its other `.gguf` files are printed instead.

    Args:
        query: Free-text search string passed to `list_models(search=...)`.
        client: Optional object exposing `list_models(search=, limit=)` and
            `list_repo_files(repo_id=)`. Defaults to the module-level `api`.

    Returns:
        None. All results and errors are reported on stdout.
    """
    hub = api if client is None else client
    print(f"\n--- Searching for: {query} ---")

    try:
        found = list(hub.list_models(search=query, limit=3))
    except Exception as e:
        print(f"Error: {e}")
        return

    if not found:
        print("No models found.")
        return

    for m in found:
        print(f"Repo: {m.id}")
        try:
            files = hub.list_repo_files(repo_id=m.id)
        except Exception as e:
            # A single inaccessible repo must not hide the remaining results.
            print(f"Error: {e}")
            continue

        # Compare on the lower-cased name so ".GGUF" / "Q4_K_M" both match.
        all_ggufs = [f for f in files if f.lower().endswith(".gguf")]
        ggufs = [f for f in all_ggufs if "q4_k_m" in f.lower()]
        if not ggufs:
            ggufs = all_ggufs[:3]
        print(f" GGUFs: {ggufs}")
|
||||
|
||||
# Run the fixed set of lookups in order, one search per query string.
for _query in (
    "122b-a10b gguf",
    "Qwen3.5 122b gguf",
    "35b-a3b gguf",
    "gemma-4 26b gguf",
    "Qwen 122B",
):
    search_gguf(_query)
|
||||
123
scripts/perf_test.py
Normal file
123
scripts/perf_test.py
Normal file
@@ -0,0 +1,123 @@
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
|
||||
try:
|
||||
sys.stdout.reconfigure(encoding='utf-8')
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
BASE_URL = "http://127.0.0.1:8000"
|
||||
|
||||
def check_server(base_url=None):
    """Report whether the server's `/health` endpoint answers with status "ok".

    Args:
        base_url: Optional server root (no trailing slash). Defaults to the
            module-level `BASE_URL`.

    Returns:
        True when the response body is JSON whose "status" field equals "ok";
        False on any request, decoding, or shape error.
    """
    root = BASE_URL if base_url is None else base_url
    try:
        req = urllib.request.Request(f"{root}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        return data.get("status") == "ok"
    except Exception:
        # Deliberately best-effort, but do not catch BaseException: Ctrl+C and
        # SystemExit must still be able to stop a polling caller.
        return False
|
||||
|
||||
def run_benchmark(prompt, max_tokens=100, label="Test", base_url=None):
    """Send one chat-completion request and return timing/token statistics.

    Args:
        prompt: User message text sent as the only chat message.
        max_tokens: Value sent as the request's "max_tokens" field.
        label: Free-form tag copied into the result.
        base_url: Optional server root (no trailing slash). Defaults to the
            module-level `BASE_URL`.

    Returns:
        dict with keys "label", "prompt_tokens", "completion_tokens",
        "elapsed" (seconds for the whole HTTP round trip), "gen_tps_approx"
        (completion tokens divided by that elapsed time, so prompt processing
        is included) and "content_preview" (first 100 characters of the reply).

    Raises:
        Whatever `urllib.request.urlopen` / `json.loads` raise, plus
        KeyError/IndexError when the response has no "choices"[0]["message"].
    """
    root = BASE_URL if base_url is None else base_url
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{root}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    # perf_counter is monotonic, so the interval cannot be skewed by wall-clock
    # adjustments while the request is in flight.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=300) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    # "content" and "usage" may be present but null; normalise both so the
    # slicing and lookups below cannot fail on None.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)

    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:100]
    }
|
||||
|
||||
def main():
    """Run the quick benchmark: wait for the server, warm up, time three runs.

    Progress and a summary of the approximate tokens/second figures are
    printed to stdout. Returns early, after printing an error, when the
    server never reports healthy.
    """
    rule = "=" * 60
    print(rule)
    print(" LLM Performance Benchmark Tool")
    print(rule)
    print()

    # Step 1: poll the health endpoint, up to 30 attempts spaced 2 s apart.
    print("[1/3] Checking server health...")
    attempt = 0
    ready = False
    while attempt < 30:
        if check_server():
            print(" -> Server is ready!")
            ready = True
            break
        attempt += 1
        print(f" -> Waiting for server... ({attempt}/30)")
        time.sleep(2)
    if not ready:
        print(" -> ERROR: Server not responding after 60s")
        return

    # Step 2: one short request whose result is only reported, not averaged.
    print()
    print("[2/3] Warmup run (short)...")
    try:
        warm = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f" -> Warmup done: {warm['completion_tokens']} tokens in {warm['elapsed']:.2f}s")
    except Exception as exc:
        print(f" -> Warmup failed: {exc}")

    # Step 3: three timed runs with the same prompt; failed runs are skipped.
    print()
    print("[3/3] Running main benchmark...")
    print("-" * 60)

    test_prompt = "Count from 1 to 50, writing each number on a new line."

    results = []
    for run_no in range(1, 4):
        print(f" Run {run_no}/3...")
        try:
            outcome = run_benchmark(test_prompt, max_tokens=200, label=f"Run {run_no}")
            results.append(outcome)
            print(f" Tokens: {outcome['completion_tokens']} | "
                  f"Time: {outcome['elapsed']:.2f}s | "
                  f"Speed: {outcome['gen_tps_approx']:.2f} t/s (approx)")
        except Exception as exc:
            print(f" ERROR: {exc}")

    if not results:
        return

    print()
    print(rule)
    print(" RESULTS SUMMARY")
    print(rule)
    speeds = [entry["gen_tps_approx"] for entry in results]
    avg_tps = sum(speeds) / len(results)
    max_tps = max(speeds)
    min_tps = min(speeds)
    print(f" Runs: {len(results)}")
    print(f" Avg TPS: {avg_tps:.2f} t/s (approx, includes prompt eval)")
    print(f" Min TPS: {min_tps:.2f} t/s")
    print(f" Max TPS: {max_tps:.2f} t/s")
    print()
    print(" NOTE: Check server console for exact generation t/s")
    print(" (the 'eval time' line shows pure token generation speed)")
    print(rule)


if __name__ == "__main__":
    main()
|
||||
169
scripts/perf_test_122b.py
Normal file
169
scripts/perf_test_122b.py
Normal file
@@ -0,0 +1,169 @@
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
|
||||
try:
|
||||
sys.stdout.reconfigure(encoding='utf-8')
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
BASE_URL = "http://127.0.0.1:8000"
|
||||
|
||||
def check_server():
    """Return True when GET {BASE_URL}/health answers with JSON status "ok".

    Any request, decoding, or payload-shape failure is treated as
    "server not ready" and yields False.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        return data.get("status") == "ok"
    except Exception:
        # Deliberately broad, but unlike a bare `except:` it lets
        # KeyboardInterrupt and SystemExit propagate, so Ctrl+C can abort the
        # (up to 10 minute) wait loop in main().
        return False
|
||||
|
||||
def check_slots():
    """Fetch and return the decoded JSON body of GET {BASE_URL}/slots.

    Returns None when the request or the JSON decoding fails for any reason.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/slots")
        with urllib.request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read())
    except Exception:
        # Best-effort probe: report "no info" on any failure, but do not
        # swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
        return None
|
||||
|
||||
def run_benchmark(prompt, max_tokens=300, label="Test"):
    """Send one chat-completion request and return timing/usage figures.

    Args:
        prompt: User message text sent as the only chat message.
        max_tokens: Value passed as the request's "max_tokens".
        label: Free-form tag copied into the result.

    Returns:
        dict with keys "label", "prompt_tokens", "completion_tokens",
        "elapsed" (seconds for the whole HTTP round trip), "gen_tps_approx"
        (completion tokens / elapsed, so it includes prompt processing time)
        and "content_preview" (first 150 characters of the reply).

    Raises:
        Whatever urllib / json raise on a failed request or malformed body;
        KeyError/IndexError if the response has no "choices[0].message".
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    # perf_counter is monotonic, so the interval cannot be skewed by a
    # wall-clock adjustment during a long generation.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    # "content" may be present but null; treat that like an empty reply so the
    # preview slice below cannot fail on None.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)

    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:150]
    }
|
||||
|
||||
def main():
    """Benchmark the 122B model: wait for load, probe slots, warm up, run 5x.

    Progress, per-run figures, a summary and a verdict against the 10 t/s
    target are printed to stdout. Returns early, after printing an error,
    when the server does not report healthy within the wait budget.
    """
    rule = "=" * 70
    print(rule)
    print(" Qwen3.5 122B-A10B Performance Benchmark")
    print(" Target: 10+ t/s generation speed")
    print(rule)
    print()

    # Step 1: poll health every 5 s for up to max_wait seconds; a progress
    # line is printed every 30 s of waiting.
    print("[1/4] Waiting for server (122B model load takes 3-5 min)...")
    max_wait = 600  # 10 minutes max
    waited = 0
    ready = False
    while waited < max_wait:
        if check_server():
            print(f" -> Server is ready! (waited {waited}s)")
            ready = True
            break
        if waited % 30 == 0:
            print(f" -> Loading model... ({waited}s / {max_wait}s)")
        time.sleep(5)
        waited += 5
    if not ready:
        print(f" -> ERROR: Server not responding after {max_wait}s")
        return

    # Step 2: optional slot information.
    print()
    print("[2/4] Checking server status...")
    slots = check_slots()
    if slots:
        print(f" -> Slots available: {len(slots)}")

    # Step 3: one short request whose result is only reported.
    print()
    print("[3/4] Warmup run (short, pre-heating GPU caches)...")
    try:
        warm = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f" -> Warmup done: {warm['completion_tokens']} tokens in {warm['elapsed']:.2f}s")
        print(f" -> Warmup speed: {warm['gen_tps_approx']:.2f} t/s (includes prompt eval)")
    except Exception as exc:
        print(f" -> Warmup failed: {exc}")

    # Step 4: five timed runs, one per prompt below; failed runs are skipped.
    print()
    print("[4/4] Running main benchmark (5 runs x 300 tokens)...")
    print("-" * 70)

    test_prompts = [
        "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.",
        "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.",
        "Describe the complete process of photosynthesis in plants. Include both light-dependent and light-independent reactions.",
        "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.",
        "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.",
    ]

    results = []
    # The list holds exactly five prompts, so enumerating it yields runs 1..5.
    for run_no, prompt in enumerate(test_prompts, start=1):
        print(f"\n Run {run_no}/5: {prompt[:50]}...")
        try:
            outcome = run_benchmark(prompt, max_tokens=300, label=f"Run {run_no}")
            results.append(outcome)
            print(f" Completion tokens: {outcome['completion_tokens']}")
            print(f" Total time: {outcome['elapsed']:.2f}s")
            print(f" Approx speed: {outcome['gen_tps_approx']:.2f} t/s (includes prompt eval)")
        except Exception as exc:
            print(f" ERROR: {exc}")

    if not results:
        return

    print()
    print(rule)
    print(" RESULTS SUMMARY - Qwen3.5 122B-A10B")
    print(rule)
    speeds = [entry["gen_tps_approx"] for entry in results]
    avg_tps = sum(speeds) / len(results)
    max_tps = max(speeds)
    min_tps = min(speeds)
    total_tokens = sum(entry["completion_tokens"] for entry in results)
    total_time = sum(entry["elapsed"] for entry in results)

    print(f" Runs completed: {len(results)}/5")
    print(f" Total tokens: {total_tokens}")
    print(f" Total time: {total_time:.1f}s")
    print()
    print(f" Approx TPS (avg): {avg_tps:.2f} t/s")
    print(f" Approx TPS (min): {min_tps:.2f} t/s")
    print(f" Approx TPS (max): {max_tps:.2f} t/s")
    print()

    # Verdict against the 10 t/s target (8 t/s counts as "close").
    if avg_tps < 8:
        verdict = f" ❌ BELOW TARGET: {avg_tps:.1f} t/s < 10 t/s"
    elif avg_tps < 10:
        verdict = " ⚠️ CLOSE TO TARGET: Consider further tuning"
    else:
        verdict = " ✅ TARGET ACHIEVED: 10+ t/s!"
    print(verdict)

    print()
    print(" ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.")
    print(" ⚡ Check the server console/log for exact 'eval time' t/s value,")
    print(" ⚡ which shows pure token generation speed (always higher).")
    print(rule)


if __name__ == "__main__":
    main()
|
||||
5
scripts/q4km_latest.txt
Normal file
5
scripts/q4km_latest.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
pure-GPU nommap small | 62.29 | GPU | VRAM:22975 | ub=128 b=512 t=4
|
||||
pure-GPU ts=0.5,0.5 | 63.89 | GPU | VRAM:23002 | ub=128 b=512 t=4
|
||||
tune t=2 | 64.1 | GPU | VRAM:22980 | ub=128 b=512 t=2
|
||||
tune t=6 | 64.18 | GPU | VRAM:22982 | ub=128 b=512 t=6
|
||||
tune t=8 | 63.11 | GPU | VRAM:22980 | ub=128 b=512 t=8
|
||||
31
scripts/quick_pptest.mjs
Normal file
31
scripts/quick_pptest.mjs
Normal file
@@ -0,0 +1,31 @@
|
||||
// Quick PP+TG speed test
|
||||
const BASE = "http://127.0.0.1:8000";
|
||||
|
||||
/**
 * Send one chat-completion request and print a one-line speed report.
 *
 * Speeds come from the response's `timings.prompt_per_second` /
 * `timings.predicted_per_second` when the server supplies them. Otherwise
 * they fall back to tokens divided by the whole request's wall-clock time,
 * which understates both figures because prompt processing and generation
 * share that interval.
 *
 * @param {string} label  Tag printed at the start of the report line.
 * @param {string} prompt User message content.
 * @param {number} maxTok Value sent as `max_tokens`.
 * @returns {Promise<void>}
 */
async function test(label, prompt, maxTok) {
  const t0 = Date.now();
  const r = await fetch(`${BASE}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ model: "m", messages: [{ role: "user", content: prompt }], max_tokens: maxTok, temperature: 0 }),
    signal: AbortSignal.timeout(600000),
  });
  if (!r.ok) {
    // Report the failure instead of printing a misleading all-zero row, and
    // let the remaining test cases run.
    console.log(`${label} | HTTP ${r.status} | request failed`);
    return;
  }
  const d = await r.json();
  const dt = (Date.now() - t0) / 1000;
  const u = d.usage || {};
  const pp = u.prompt_tokens || 0;
  const tg = u.completion_tokens || 0;
  const timings = d.timings || {};

  // Prefer a server-reported rate; otherwise approximate from wall-clock time.
  const speed = (reported, tokens) => {
    if (Number.isFinite(reported) && reported > 0) return reported.toFixed(1);
    return tokens > 0 && dt > 0 ? (tokens / dt).toFixed(1) : "?";
  };
  const ppSpeed = speed(timings.prompt_per_second, pp);
  const tgSpeed = speed(timings.predicted_per_second, tg);
  console.log(`${label} | PP:${pp}tok ${ppSpeed}t/s | TG:${tg}tok ${tgSpeed}t/s | ${dt.toFixed(1)}s`);
}
|
||||
|
||||
// Prompt fixtures: a tiny prompt, a 3000-character filler prompt, and 200
// repeated lines of code followed by an instruction.
const short = "Count 1 to 20.";
const long = `${"x".repeat(3000)} Summarize above in 3 words.`;
const code = `${Array.from({ length: 200 }, () => "function foo(x) { return x * 2 + Math.random(); }").join("\n")}\n\nRefactor above to arrow functions. Show first 5 lines.`;

// Cases run strictly one after another, in this order: [label, prompt, max_tokens].
const cases = [
  ["warmup", short, 20],
  ["SHORT", short, 200],
  ["3K-PP", long, 100],
  ["10K-CODE", code, 100],
  ["TG-500", short, 500],
];
for (const [label, prompt, maxTok] of cases) {
  await test(label, prompt, maxTok);
}
console.log("DONE");
|
||||
345
scripts/qwen_fullgpu_challenge.mjs
Normal file
345
scripts/qwen_fullgpu_challenge.mjs
Normal file
@@ -0,0 +1,345 @@
|
||||
/**
 * Qwen3.5 Full-GPU Challenge — extreme VRAM optimization benchmark
 * =====================================================
 * Goal: run Qwen3.5-35B-A3B 100% on GPU on a dual RTX 3060 setup (24 GB total)
 *
 * Models under test:
 * 1. UD-IQ4_NL (16.6 GB) — known to fit; the baseline
 * 2. MXFP4_MOE (20.1 GB) — the challenge: extreme VRAM optimization
 * 3. Q4_K_M (20.5 GB) — control group (n-cpu-moe=5)
 *
 * VRAM-saving strategies:
 * A. Minimal batch: -ub 64 -b 256 (shrinks the computation buffer)
 * B. split-mode row (more even distribution across GPUs)
 * C. tensor-split manual balancing
 * D. no-mmap (memory management optimization)
 * E. defrag-thold (prevents KV cache fragmentation)
 *
 * Run: node scripts/qwen_fullgpu_challenge.mjs
 */
|
||||
|
||||
import { spawn, execSync } from "child_process";
|
||||
import { writeFileSync, existsSync, statSync } from "fs";
|
||||
|
||||
const BASE_URL = "http://127.0.0.1:8000";
|
||||
const LLAMA = String.raw`llama_bin_run\llama-server.exe`;
|
||||
const CTX = 262144;
|
||||
const RUNS = 3;
|
||||
const TOKENS = 200;
|
||||
const BOOT_TIMEOUT = 300_000;
|
||||
|
||||
const MODELS = [
|
||||
{
|
||||
name: "Qwen3.5 UD-IQ4_NL",
|
||||
path: String.raw`models\Qwen3.5-35B-A3B-UD-IQ4_NL.gguf`,
|
||||
sizeGB: 16.6,
|
||||
},
|
||||
{
|
||||
name: "Qwen3.5 MXFP4_MOE",
|
||||
path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
|
||||
sizeGB: 20.11,
|
||||
},
|
||||
{
|
||||
name: "Qwen3.5 Q4_K_M",
|
||||
path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
|
||||
sizeGB: 20.5,
|
||||
},
|
||||
];
|
||||
|
||||
const ALL = [];
|
||||
let proc = null;
|
||||
/** Print `m` prefixed with the current local time (ko-KR locale, 24-hour clock). */
const log = (m) => {
  const stamp = new Date().toLocaleTimeString("ko-KR", { hour12: false });
  console.log(`[${stamp}] ${m}`);
};

/** Resolve (with no value) after `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});
|
||||
|
||||
/**
 * Stop the tracked server process and any other llama-server.exe, then wait.
 *
 * Both kill attempts are best-effort: failures are ignored. Always sleeps
 * 5 seconds afterwards before resolving.
 */
async function kill() {
  const child = proc;
  proc = null;
  try {
    child?.kill("SIGKILL");
  } catch {
    // Ignored: best-effort kill of the tracked child.
  }
  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // Ignored: taskkill fails when nothing matches or is unavailable.
  }
  await sleep(5000);
}
|
||||
|
||||
/**
 * Query per-GPU memory usage through `nvidia-smi`.
 *
 * @returns {{gpu: number, used: number, total: number}[]} One entry per
 *   parsable CSV row (index, memory.used, memory.total as printed by
 *   nvidia-smi with `nounits`); an empty array when the command fails or
 *   times out.
 */
function vram() {
  let csv;
  try {
    csv = execSync(
      "nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits",
      { encoding: "utf-8", timeout: 5000 },
    );
  } catch {
    // Deliberate best-effort: without a working nvidia-smi report no GPUs.
    return [];
  }
  return csv
    .split(/\r?\n/)
    .map((line) => line.split(",").map((field) => Number.parseInt(field, 10)))
    // Skip blank or non-numeric rows (e.g. "[N/A]") so NaN never reaches the
    // callers' VRAM sums.
    .filter((fields) => fields.length >= 3 && fields.slice(0, 3).every(Number.isFinite))
    .map(([gpu, used, total]) => ({ gpu, used, total }));
}
|
||||
|
||||
/**
 * Spawn the llama-server binary for one benchmark configuration.
 *
 * The child is stored in the module-level `proc` and also returned.
 *
 * @param {string} modelPath Path passed to `--model`.
 * @param {object} p Configuration. Read fields: `ctk`, `ctv` (KV cache types,
 *   default "q4_0"), `ub` (default 512), `b` (default 2048), `t` (used for
 *   both `-t` and `-tb`, default 4), `cpuMoe`, `nCpuMoe`, `splitMode`,
 *   `tensorSplit`, `noMmap`, `defragThold`, `noKvOffload`.
 * @returns {import("child_process").ChildProcess} The spawned process.
 */
function startServer(modelPath, p) {
  const args = [
    "--model", modelPath, "-ngl", "999",
    "-c", String(CTX), "-np", "1", "-fa", "on",
    "--cache-type-k", p.ctk || "q4_0",
    "--cache-type-v", p.ctv || "q4_0",
    "-ub", String(p.ub || 512), "-b", String(p.b || 2048),
    "-t", String(p.t || 4), "-tb", String(p.t || 4),
    "--prio", "3", "--poll", "50", "--mlock",
    "--port", "8000", "--host", "0.0.0.0",
  ];

  // GPU offload strategy: --cpu-moe takes precedence over --n-cpu-moe.
  if (p.cpuMoe) args.push("--cpu-moe");
  else if (p.nCpuMoe) args.push("--n-cpu-moe", String(p.nCpuMoe));

  // VRAM saving options
  if (p.splitMode) args.push("--split-mode", p.splitMode);
  if (p.tensorSplit) args.push("--tensor-split", p.tensorSplit);
  if (p.noMmap) args.push("--no-mmap");
  if (p.defragThold) args.push("--defrag-thold", String(p.defragThold));
  if (p.noKvOffload) args.push("--no-kv-offload");

  const cmdStr = args.join(" ");
  log(` CMD: ...${cmdStr.slice(-80)}`);
  proc = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });

  // Nothing consumes the server's output. Put both pipes into flowing mode so
  // the data is discarded; otherwise the child blocks on write once the
  // pipe buffer fills up.
  proc.stdout?.resume();
  proc.stderr?.resume();
  return proc;
}
|
||||
|
||||
/**
 * Poll `${BASE_URL}/health` until it reports `status: "ok"`.
 *
 * Polls every 3 seconds (each request capped at 3 seconds). Gives up when
 * `timeout` elapses, or as soon as the tracked server process (`proc`) is
 * seen to have exited, since a dead server can never become healthy.
 *
 * @param {number} [timeout=BOOT_TIMEOUT] Maximum wait in milliseconds.
 * @returns {Promise<{ok: boolean, boot: number}>} `boot` is the seconds
 *   waited (or `timeout / 1000` when the wait timed out).
 */
async function waitReady(timeout = BOOT_TIMEOUT) {
  const t0 = Date.now();
  const waitedSec = () => (Date.now() - t0) / 1000;
  while (Date.now() - t0 < timeout) {
    try {
      const r = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const d = await r.json();
      if (d.status === "ok") return { ok: true, boot: waitedSec() };
    } catch {
      // Not reachable / not JSON yet: keep polling.
    }
    // Fail fast when the child already terminated (e.g. crashed while loading
    // the model) instead of sleeping through the whole timeout.
    if (proc && (proc.exitCode != null || proc.signalCode != null)) {
      return { ok: false, boot: waitedSec() };
    }
    await sleep(3000);
  }
  return { ok: false, boot: timeout / 1000 };
}
|
||||
|
||||
/**
 * Time one fixed-prompt chat completion against the running server.
 *
 * @param {number} [n=TOKENS] Value sent as `max_tokens`.
 * @returns {Promise<{tps: number, ct: number, dt: number}>} `ct` is the
 *   reported completion token count, `dt` the wall-clock seconds for the
 *   whole request, `tps` their ratio (0 when no time elapsed).
 * @throws {Error} When the request fails, times out (600 s), or the server
 *   answers with a non-2xx status.
 */
async function bench(n = TOKENS) {
  const t0 = Date.now();
  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "m",
      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
      max_tokens: n, temperature: 0,
    }),
    signal: AbortSignal.timeout(600_000),
  });
  if (!r.ok) {
    // Callers wrap bench() in try/catch and log failures. Throwing here keeps
    // an error response from being averaged in as a genuine 0 t/s sample.
    throw new Error(`HTTP ${r.status} from /v1/chat/completions`);
  }
  const d = await r.json();
  const dt = (Date.now() - t0) / 1000;
  const ct = d.usage?.completion_tokens || 0;
  return { tps: dt > 0 ? ct / dt : 0, ct, dt };
}
|
||||
|
||||
/**
 * Benchmark one server configuration for one model.
 *
 * Restarts the server with `params`, waits for it to become healthy, records
 * VRAM usage, runs a warmup plus `RUNS` timed requests, then stops the
 * server. A successful result is appended to `ALL` and the results JSON file
 * is rewritten.
 *
 * @param {{name: string, path: string}} model Model descriptor.
 * @param {string} label Human-readable configuration name.
 * @param {object} params Options forwarded to startServer().
 * @returns {Promise<object|null>} The recorded result, or null when the
 *   server failed to boot or every timed run failed.
 */
async function testConfig(model, label, params) {
  await kill();
  log(` [${label}] Starting...`);
  startServer(model.path, params);

  const ready = await waitReady();
  if (!ready.ok) {
    log(` [${label}] ✗ FAILED (timeout ${BOOT_TIMEOUT/1000}s)`);
    await kill();
    return null;
  }
  const bootSec = ready.boot;

  // Snapshot VRAM right after boot, before any request is served.
  const gpus = vram();
  let totalUsed = 0;
  for (const g of gpus) totalUsed += g.used;
  const perGpu = gpus.map((g) => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | ");
  log(` [${label}] ✓ Boot:${bootSec.toFixed(0)}s | VRAM: ${perGpu} (total: ${totalUsed} MiB)`);

  // Warmup: result and errors are intentionally discarded.
  try {
    await bench(20);
  } catch {}

  const speeds = [];
  for (let run = 1; run <= RUNS; run += 1) {
    try {
      const sample = await bench();
      speeds.push(sample.tps);
      log(` Run${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run${run}: ERR ${e.message}`);
    }
  }
  await kill();

  if (speeds.length === 0) return null;

  let sum = 0;
  for (const s of speeds) sum += s;
  const avg = sum / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);

  const res = {
    model: model.name,
    label,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(best.toFixed(2)),
    boot: Number(bootSec.toFixed(1)),
    vram_total: totalUsed,
    vram: gpus,
    // Record the effective values for options startServer() defaults.
    params: { ...params, ngl: 999, ctk: params.ctk || "q4_0", ctv: params.ctv || "q4_0" },
    gpu_only: !params.cpuMoe && !params.nCpuMoe,
  };
  ALL.push(res);
  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
  return res;
}
|
||||
|
||||
// ─── Test Strategies ───────────────────────────────────────────
|
||||
|
||||
/**
 * Run the full strategy sweep for one model and return its best result.
 *
 * Order: six pure-GPU strategies; an n-cpu-moe=5 fallback when no pure-GPU
 * configuration produced a result; thread / batch / KV-cache tuning around
 * the best pure-GPU configuration; finally a 5-run verification of the best
 * configuration, recorded with label "FINAL".
 *
 * @param {{name: string, path: string, sizeGB: number}} model
 * @returns {Promise<object|null>} The "FINAL" record when verification
 *   produced at least one sample, otherwise the best sweep result; null when
 *   the model file is missing or nothing succeeded.
 */
async function testModel(model) {
  const banner = "#".repeat(65);
  log(`\n${banner}`);
  log(` ${model.name} (${model.sizeGB} GB)`);
  if (!existsSync(model.path)) {
    log(" ✗ File not found!");
    return null;
  }
  log(`${banner}`);

  let best = null;
  // Keep whichever successful result has the highest average speed.
  const consider = (candidate) => {
    if (!candidate) return;
    if (!best || candidate.avg_tps > best.avg_tps) best = candidate;
  };

  // [log heading, config label, server params], executed in this order.
  const strategies = [
    [`\n ── Strategy 1: Pure GPU (default) ──`, "pure-GPU default",
      { t: 4, ub: 512, b: 2048 }],
    [`\n ── Strategy 2: Pure GPU, minimal batch ──`, "pure-GPU minbatch",
      { t: 4, ub: 64, b: 256 }],
    [`\n ── Strategy 3: Pure GPU + no-mmap + small batch ──`, "pure-GPU nommap small",
      { t: 4, ub: 128, b: 512, noMmap: true }],
    [`\n ── Strategy 4: Pure GPU + split-mode row ──`, "pure-GPU row-split",
      { t: 4, ub: 128, b: 512, splitMode: "row" }],
    [`\n ── Strategy 5: Pure GPU + tensor-split 0.5,0.5 ──`, "pure-GPU ts=0.5,0.5",
      { t: 4, ub: 128, b: 512, tensorSplit: "0.5,0.5" }],
    [`\n ── Strategy 6: Pure GPU ALL tricks ──`, "pure-GPU all-tricks",
      { t: 4, ub: 64, b: 256, noMmap: true, defragThold: 0.1 }],
  ];
  for (const [heading, label, params] of strategies) {
    log(heading);
    consider(await testConfig(model, label, params));
  }

  // Fallback: offload some MoE layers to the CPU when pure GPU never worked.
  if (!best || !best.gpu_only) {
    log(`\n ── Fallback: n-cpu-moe=5 ──`);
    consider(await testConfig(model, "n-cpu-moe=5 baseline", {
      t: 4, ub: 256, b: 1024, nCpuMoe: 5
    }));
  }

  // Fine-tuning: each sweep varies one aspect of the params captured here,
  // even if an earlier sweep already replaced `best`.
  if (best && best.gpu_only) {
    log(`\n ── Pure GPU succeeded! Fine-tuning... ──`);
    const bp = best.params;

    // Thread sweep
    for (const t of [2, 6, 8]) {
      if (t !== bp.t) {
        consider(await testConfig(model, `tune t=${t}`, { ...bp, t }));
      }
    }

    // Batch sweep
    for (const [ub, b] of [[256, 1024], [512, 2048], [256, 2048]]) {
      if (ub !== bp.ub || b !== bp.b) {
        consider(await testConfig(model, `tune ub=${ub} b=${b}`, { ...bp, ub, b }));
      }
    }

    // KV cache upgrade (extra VRAM available?)
    for (const [ctk, ctv] of [["q8_0", "q8_0"], ["f16", "f16"]]) {
      consider(await testConfig(model, `tune kv=${ctk}/${ctv}`, { ...bp, ctk, ctv }));
    }
  }

  if (!best) return best;

  // Final verification: re-run the winning configuration five times.
  log(`\n ── Final verification (5 runs) ──`);
  await kill();
  startServer(model.path, best.params);
  const { ok, boot } = await waitReady();
  if (ok) {
    const gpus = vram();
    try {
      await bench(20);
    } catch {}
    const finals = [];
    for (let run = 1; run <= 5; run += 1) {
      try {
        const sample = await bench();
        finals.push(sample.tps);
        log(` Final ${run}: ${sample.tps.toFixed(2)} t/s`);
      } catch (e) {
        log(` Final ${run}: ERR`);
      }
    }
    await kill();
    if (finals.length > 0) {
      let sum = 0;
      for (const s of finals) sum += s;
      const avg = sum / finals.length;
      const top = Math.max(...finals);
      log(` ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${top.toFixed(2)} t/s`);
      let vramTotal = 0;
      for (const g of gpus) vramTotal += g.used;
      const final = {
        model: model.name,
        label: "FINAL",
        avg_tps: Number(avg.toFixed(2)),
        best_tps: Number(top.toFixed(2)),
        boot: Number(boot.toFixed(1)),
        vram_total: vramTotal,
        vram: gpus,
        params: best.params,
        gpu_only: best.gpu_only,
      };
      ALL.push(final);
      writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
      return final;
    }
  }
  await kill();
  return best;
}
|
||||
|
||||
// ─── Main ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Entry point: benchmark every model in MODELS, rank the per-model winners,
 * print the summary and write the summary/results files.
 *
 * The server is stopped in a `finally` block, so a failure part-way through
 * does not leave llama-server running.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const t0 = Date.now();
  try {
    log("=".repeat(65));
    log(" QWEN3.5 FULL-GPU CHALLENGE — 70 t/s TARGET");
    log(" 2x RTX 3060 (24GB) | 256K Context");
    log(" " + new Date().toISOString());
    log("=".repeat(65));
    vram().forEach(g => log(` GPU${g.gpu}: ${g.used}/${g.total} MiB`));

    const winners = [];
    for (const model of MODELS) {
      const w = await testModel(model);
      if (w) winners.push(w);
    }

    // ─── Summary ──────────────────────────────────────────────
    const elapsed = ((Date.now() - t0) / 60000).toFixed(1);
    winners.sort((a, b) => b.avg_tps - a.avg_tps);

    const lines = [
      `Qwen3.5 Full-GPU Challenge — ${new Date().toISOString()}`,
      `2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`,
      "", "=".repeat(55), " RANKING", "=".repeat(55),
    ];
    for (let i = 0; i < winners.length; i++) {
      const w = winners[i], p = w.params;
      const gpu = w.gpu_only ? "★ FULL GPU" : "⚠ CPU offload";
      lines.push("", ` #${i+1}: ${w.model} [${gpu}]`);
      lines.push(` AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`);
      lines.push(` VRAM: ${w.vram_total} MiB total`);
      // List every optional flag the server can be started with, so two
      // winners that differ only in such a flag are distinguishable here.
      const flags = [];
      if (p.splitMode) flags.push(`split=${p.splitMode}`);
      if (p.tensorSplit) flags.push(`ts=${p.tensorSplit}`);
      if (p.noMmap) flags.push("no-mmap");
      if (p.defragThold) flags.push(`defrag-thold=${p.defragThold}`);
      if (p.noKvOffload) flags.push("no-kv-offload");
      if (p.cpuMoe) flags.push("cpu-moe");
      else if (p.nCpuMoe) flags.push(`n-cpu-moe=${p.nCpuMoe}`);
      lines.push(` t=${p.t} ub=${p.ub} b=${p.b} kv=${p.ctk}/${p.ctv} ${flags.join(" ")}`);
    }

    if (winners.length > 0) {
      const c = winners[0];
      lines.push("", "=".repeat(55));
      lines.push(` ★ CHAMPION: ${c.model} — ${c.avg_tps} t/s [${c.gpu_only?"FULL GPU":"CPU offload"}]`);
      lines.push("=".repeat(55));
    }

    const summary = lines.join("\n");
    console.log("\n" + summary);
    writeFileSync("scripts/qwen_fullgpu_summary.txt", summary, "utf-8");
    writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
    log(`\n Saved: qwen_fullgpu_results.json + qwen_fullgpu_summary.txt`);
    log(" DONE!");
  } finally {
    // Runs on success and on failure; kill() itself ignores kill errors.
    await kill();
  }
}

main().catch(e => { console.error("FATAL:", e); process.exit(1); });
|
||||
834
scripts/qwen_fullgpu_results.json
Normal file
834
scripts/qwen_fullgpu_results.json
Normal file
@@ -0,0 +1,834 @@
|
||||
[
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "pure-GPU minbatch",
|
||||
"avg_tps": 65.11,
|
||||
"best_tps": 65.49,
|
||||
"boot": 9,
|
||||
"vram_total": 19177,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10039,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9138,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "pure-GPU nommap small",
|
||||
"avg_tps": 65.01,
|
||||
"best_tps": 65.36,
|
||||
"boot": 6,
|
||||
"vram_total": 19672,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10342,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9330,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "pure-GPU row-split",
|
||||
"avg_tps": 13.65,
|
||||
"best_tps": 14.82,
|
||||
"boot": 9,
|
||||
"vram_total": 19427,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10311,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9116,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"splitMode": "row",
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "pure-GPU ts=0.5,0.5",
|
||||
"avg_tps": 64.92,
|
||||
"best_tps": 65.23,
|
||||
"boot": 9,
|
||||
"vram_total": 19664,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10334,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9330,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"tensorSplit": "0.5,0.5",
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "pure-GPU all-tricks",
|
||||
"avg_tps": 64.72,
|
||||
"best_tps": 64.89,
|
||||
"boot": 6,
|
||||
"vram_total": 19171,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10033,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9138,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"noMmap": true,
|
||||
"defragThold": 0.1,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "tune t=2",
|
||||
"avg_tps": 64.87,
|
||||
"best_tps": 65.13,
|
||||
"boot": 9,
|
||||
"vram_total": 19170,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10032,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9138,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 2,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "tune t=6",
|
||||
"avg_tps": 64.88,
|
||||
"best_tps": 65.17,
|
||||
"boot": 9,
|
||||
"vram_total": 19168,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10030,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9138,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 6,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "tune t=8",
|
||||
"avg_tps": 64.5,
|
||||
"best_tps": 64.77,
|
||||
"boot": 9,
|
||||
"vram_total": 19168,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10030,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9138,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 8,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "tune ub=256 b=1024",
|
||||
"avg_tps": 64.73,
|
||||
"best_tps": 64.98,
|
||||
"boot": 9,
|
||||
"vram_total": 20640,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10928,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9712,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 256,
|
||||
"b": 1024,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "tune ub=256 b=2048",
|
||||
"avg_tps": 63.69,
|
||||
"best_tps": 64.94,
|
||||
"boot": 12,
|
||||
"vram_total": 20614,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10902,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9712,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 256,
|
||||
"b": 2048,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "tune kv=q8_0/q8_0",
|
||||
"avg_tps": 64.78,
|
||||
"best_tps": 65.08,
|
||||
"boot": 9,
|
||||
"vram_total": 20422,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 10644,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 9778,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "tune kv=f16/f16",
|
||||
"avg_tps": 65.53,
|
||||
"best_tps": 65.81,
|
||||
"boot": 9,
|
||||
"vram_total": 22812,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11846,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10966,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "f16",
|
||||
"ctv": "f16"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 UD-IQ4_NL",
|
||||
"label": "FINAL",
|
||||
"avg_tps": 66.31,
|
||||
"best_tps": 66.53,
|
||||
"boot": 9,
|
||||
"vram_total": 22811,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11845,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10966,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "f16",
|
||||
"ctv": "f16"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "pure-GPU minbatch",
|
||||
"avg_tps": 63.06,
|
||||
"best_tps": 64.16,
|
||||
"boot": 12,
|
||||
"vram_total": 22747,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11895,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10852,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "pure-GPU nommap small",
|
||||
"avg_tps": 63.75,
|
||||
"best_tps": 63.98,
|
||||
"boot": 9,
|
||||
"vram_total": 22579,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11797,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10782,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "pure-GPU ts=0.5,0.5",
|
||||
"avg_tps": 62.88,
|
||||
"best_tps": 63.9,
|
||||
"boot": 12,
|
||||
"vram_total": 22578,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11796,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10782,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"tensorSplit": "0.5,0.5",
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "pure-GPU all-tricks",
|
||||
"avg_tps": 62.55,
|
||||
"best_tps": 63.71,
|
||||
"boot": 9,
|
||||
"vram_total": 22743,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11891,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10852,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 64,
|
||||
"b": 256,
|
||||
"noMmap": true,
|
||||
"defragThold": 0.1,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "tune t=2",
|
||||
"avg_tps": 63.07,
|
||||
"best_tps": 64.08,
|
||||
"boot": 9,
|
||||
"vram_total": 22601,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11819,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10782,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 2,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "tune t=6",
|
||||
"avg_tps": 63.58,
|
||||
"best_tps": 64.04,
|
||||
"boot": 9,
|
||||
"vram_total": 22583,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11801,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10782,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 6,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "tune t=8",
|
||||
"avg_tps": 62.92,
|
||||
"best_tps": 63.73,
|
||||
"boot": 9,
|
||||
"vram_total": 22536,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11754,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10782,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 8,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "tune ub=256 b=1024",
|
||||
"avg_tps": 62.76,
|
||||
"best_tps": 63.86,
|
||||
"boot": 9,
|
||||
"vram_total": 22874,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11968,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10906,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 256,
|
||||
"b": 1024,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "tune ub=256 b=2048",
|
||||
"avg_tps": 62.74,
|
||||
"best_tps": 63.9,
|
||||
"boot": 9,
|
||||
"vram_total": 22912,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 12006,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10906,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 256,
|
||||
"b": 2048,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 MXFP4_MOE",
|
||||
"label": "FINAL",
|
||||
"avg_tps": 63.71,
|
||||
"best_tps": 64.39,
|
||||
"boot": 9,
|
||||
"vram_total": 22566,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 11784,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10782,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 Q4_K_M",
|
||||
"label": "pure-GPU nommap small",
|
||||
"avg_tps": 62.29,
|
||||
"best_tps": 63.03,
|
||||
"boot": 9,
|
||||
"vram_total": 22975,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 12007,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10968,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"noMmap": true,
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 Q4_K_M",
|
||||
"label": "pure-GPU ts=0.5,0.5",
|
||||
"avg_tps": 63.89,
|
||||
"best_tps": 64.91,
|
||||
"boot": 12,
|
||||
"vram_total": 23002,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 12034,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10968,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"tensorSplit": "0.5,0.5",
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 Q4_K_M",
|
||||
"label": "tune t=2",
|
||||
"avg_tps": 64.1,
|
||||
"best_tps": 64.54,
|
||||
"boot": 12,
|
||||
"vram_total": 22980,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 12012,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10968,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 2,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"tensorSplit": "0.5,0.5",
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 Q4_K_M",
|
||||
"label": "tune t=6",
|
||||
"avg_tps": 64.18,
|
||||
"best_tps": 64.72,
|
||||
"boot": 12,
|
||||
"vram_total": 22982,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 12014,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10968,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 6,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"tensorSplit": "0.5,0.5",
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
},
|
||||
{
|
||||
"model": "Qwen3.5 Q4_K_M",
|
||||
"label": "tune t=8",
|
||||
"avg_tps": 63.11,
|
||||
"best_tps": 64.02,
|
||||
"boot": 12,
|
||||
"vram_total": 22980,
|
||||
"vram": [
|
||||
{
|
||||
"gpu": 0,
|
||||
"used": 12012,
|
||||
"total": 12288
|
||||
},
|
||||
{
|
||||
"gpu": 1,
|
||||
"used": 10968,
|
||||
"total": 12288
|
||||
}
|
||||
],
|
||||
"params": {
|
||||
"t": 8,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"tensorSplit": "0.5,0.5",
|
||||
"ngl": 999,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0"
|
||||
},
|
||||
"gpu_only": true
|
||||
}
|
||||
]
|
||||
12
scripts/qwen_intermediate.csv
Normal file
12
scripts/qwen_intermediate.csv
Normal file
@@ -0,0 +1,12 @@
|
||||
model,label,avg,best,mode,vram,t,ub,b,kv,split,mmap
|
||||
UD-IQ4_NL,pure-GPU minbatch,65.11,65.49,GPU,19177,t4,ub64,b256,q4_0/q4_0,,
|
||||
UD-IQ4_NL,pure-GPU nommap small,65.01,65.36,GPU,19672,t4,ub128,b512,q4_0/q4_0,,nommap
|
||||
UD-IQ4_NL,pure-GPU row-split,13.65,14.82,GPU,19427,t4,ub128,b512,q4_0/q4_0,row,
|
||||
UD-IQ4_NL,"pure-GPU ts=0.5,0.5",64.92,65.23,GPU,19664,t4,ub128,b512,q4_0/q4_0,,
|
||||
UD-IQ4_NL,pure-GPU all-tricks,64.72,64.89,GPU,19171,t4,ub64,b256,q4_0/q4_0,,nommap
|
||||
UD-IQ4_NL,tune t=2,64.87,65.13,GPU,19170,t2,ub64,b256,q4_0/q4_0,,
|
||||
UD-IQ4_NL,tune t=6,64.88,65.17,GPU,19168,t6,ub64,b256,q4_0/q4_0,,
|
||||
UD-IQ4_NL,tune t=8,64.5,64.77,GPU,19168,t8,ub64,b256,q4_0/q4_0,,
|
||||
UD-IQ4_NL,tune ub=256 b=1024,64.73,64.98,GPU,20640,t4,ub256,b1024,q4_0/q4_0,,
|
||||
UD-IQ4_NL,tune ub=256 b=2048,63.69,64.94,GPU,20614,t4,ub256,b2048,q4_0/q4_0,,
|
||||
UD-IQ4_NL,tune kv=q8_0/q8_0,64.78,65.08,GPU,20422,t4,ub64,b256,q8_0/q8_0,,
|
||||
|
24
scripts/qwen_latest.txt
Normal file
24
scripts/qwen_latest.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
UD-IQ4_NL | pure-GPU minbatch | 65.11 | GPU | 19177
|
||||
UD-IQ4_NL | pure-GPU nommap small | 65.01 | GPU | 19672
|
||||
UD-IQ4_NL | pure-GPU row-split | 13.65 | GPU | 19427
|
||||
UD-IQ4_NL | pure-GPU ts=0.5,0.5 | 64.92 | GPU | 19664
|
||||
UD-IQ4_NL | pure-GPU all-tricks | 64.72 | GPU | 19171
|
||||
UD-IQ4_NL | tune t=2 | 64.87 | GPU | 19170
|
||||
UD-IQ4_NL | tune t=6 | 64.88 | GPU | 19168
|
||||
UD-IQ4_NL | tune t=8 | 64.5 | GPU | 19168
|
||||
UD-IQ4_NL | tune ub=256 b=1024 | 64.73 | GPU | 20640
|
||||
UD-IQ4_NL | tune ub=256 b=2048 | 63.69 | GPU | 20614
|
||||
UD-IQ4_NL | tune kv=q8_0/q8_0 | 64.78 | GPU | 20422
|
||||
UD-IQ4_NL | tune kv=f16/f16 | 65.53 | GPU | 22812
|
||||
UD-IQ4_NL | FINAL | 66.31 | GPU | 22811
|
||||
MXFP4_MOE | pure-GPU minbatch | 63.06 | GPU | 22747
|
||||
MXFP4_MOE | pure-GPU nommap small | 63.75 | GPU | 22579
|
||||
MXFP4_MOE | pure-GPU ts=0.5,0.5 | 62.88 | GPU | 22578
|
||||
MXFP4_MOE | pure-GPU all-tricks | 62.55 | GPU | 22743
|
||||
MXFP4_MOE | tune t=2 | 63.07 | GPU | 22601
|
||||
MXFP4_MOE | tune t=6 | 63.58 | GPU | 22583
|
||||
MXFP4_MOE | tune t=8 | 62.92 | GPU | 22536
|
||||
MXFP4_MOE | tune ub=256 b=1024 | 62.76 | GPU | 22874
|
||||
MXFP4_MOE | tune ub=256 b=2048 | 62.74 | GPU | 22912
|
||||
MXFP4_MOE | FINAL | 63.71 | GPU | 22566
|
||||
Q4_K_M | pure-GPU nommap small | 62.29 | GPU | 22975
|
||||
BIN
scripts/test_20ts.txt
Normal file
BIN
scripts/test_20ts.txt
Normal file
Binary file not shown.
64
scripts/tune_122b_20ts.mjs
Normal file
64
scripts/tune_122b_20ts.mjs
Normal file
@@ -0,0 +1,64 @@
|
||||
import { exec, spawn } from 'child_process';
|
||||
|
||||
// Resolves after `ms` milliseconds; used to pace the kill/boot/poll cycle.
const delay = ms => new Promise(res => setTimeout(res, ms));

/**
 * Runs one benchmark pass against a freshly started llama-server.
 *
 * Sequence: kill any running llama-server.exe, spawn a new one with
 * `modelArgs`, probe http://127.0.0.1:8000/health until it answers 200
 * (at most 40 probes, 3 s apart), run `node scripts/quick_pptest.mjs`,
 * print its output, then kill the server again.
 *
 * @param {string} modelArgs - llama-server command line. It is split on single
 *   spaces, so individual arguments must not contain spaces.
 * @param {Object} envVars - Extra environment variables layered over
 *   process.env for the server process.
 * @param {string} name - Label used in the log output.
 * @returns {Promise<{success: boolean}>} `success` is false when the server
 *   could not be spawned or never reported healthy.
 */
async function runTest(modelArgs, envVars, name) {
    console.log(`\n===========================================`);
    console.log(`Testing: ${name}`);
    console.log(`Args: ${modelArgs}`);

    // exec() wrapper that always resolves: a failing command (for example
    // taskkill when no server is running) is expected and must not abort the run.
    const run = (cmd) => new Promise((done) => {
        exec(cmd, (err, stdout, stderr) => done({ err, stdout, stderr }));
    });
    const killServer = () => run('taskkill /F /IM llama-server.exe');

    await killServer();
    await delay(2000);

    const env = { ...process.env, ...envVars };
    const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
        detached: true,
        stdio: 'ignore',
        env
    });

    // Without a listener, a spawn failure (e.g. missing binary) is thrown as an
    // uncaught 'error' event and would take down the whole sweep.
    let spawnFailed = false;
    server.on('error', (err) => {
        spawnFailed = true;
        console.log(`[${name}] Could not start llama-server: ${err.message}`);
    });

    let ready = false;
    for (let attempt = 0; attempt < 40 && !spawnFailed; attempt++) {
        try {
            // fetch() has no `timeout` option; an AbortSignal bounds each probe at 2 s.
            const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
            if (res.status === 200) {
                ready = true;
                break;
            }
        } catch (e) {
            // Connection refused or probe timed out: the server is not up yet.
        }
        await delay(3000);
    }

    if (!ready) {
        console.log(`[${name}] FAILED TO BOOT`);
        await killServer();
        return { success: false };
    }

    console.log(`[${name}] Server Ready! Running benchmark...`);
    const { stdout, stderr } = await run('node scripts/quick_pptest.mjs');
    console.log(stdout || stderr);
    await killServer();
    return { success: true };
}
|
||||
|
||||
/**
 * Sweeps --n-cpu-moe values for the 122B model with --split-mode layer.
 * The three configurations run sequentially because every test binds port 8000.
 */
async function main() {
    const baseLineArgs = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 8192 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 256 -b 1024 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

    // 1. Dual GPU, Split Mode Layer, Maximize VRAM usage (estimate 32 layers offloaded out of 48)
    await runTest(`${baseLineArgs} --split-mode layer --n-cpu-moe 32`, {}, "122B: n-cpu-moe 32 + sm layer");

    // 2. Dual GPU, Split Mode Layer, even more aggressive GPU usage
    await runTest(`${baseLineArgs} --split-mode layer --n-cpu-moe 28`, {}, "122B: n-cpu-moe 28 + sm layer");

    // 3. More conservative setting (36). Note: this always runs, not only when
    //    32/28 fail to boot; it is the value to fall back to if those OOM.
    await runTest(`${baseLineArgs} --split-mode layer --n-cpu-moe 36`, {}, "122B: n-cpu-moe 36 + sm layer");

    console.log("\nALL TESTS COMPLETED");
}

// Surface unexpected failures instead of leaving the promise floating.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
|
||||
72
scripts/tune_exact.mjs
Normal file
72
scripts/tune_exact.mjs
Normal file
@@ -0,0 +1,72 @@
|
||||
import { exec, spawn } from 'child_process';
|
||||
|
||||
// Resolves after `ms` milliseconds; used to pace the kill/boot/poll cycle.
const delay = ms => new Promise(res => setTimeout(res, ms));

/**
 * Runs one speed test against a freshly started llama-server.
 *
 * Sequence: kill any running llama-server.exe, spawn a new one with
 * `modelArgs` and `envVars`, probe http://127.0.0.1:8000/health until it
 * answers 200 (at most 40 probes, 3 s apart), run
 * `node scripts/quick_pptest.mjs`, print its output, then kill the server.
 *
 * @param {string} modelArgs - llama-server command line. It is split on single
 *   spaces, so individual arguments must not contain spaces.
 * @param {Object} envVars - Extra environment variables layered over
 *   process.env for the server process (logged as JSON).
 * @param {string} name - Label used in the log output.
 * @returns {Promise<{success: boolean}>} `success` is false when the server
 *   could not be spawned or never reported healthy.
 */
async function runTest(modelArgs, envVars, name) {
    console.log(`\n===========================================`);
    console.log(`Testing: ${name}`);
    console.log(`Env: ${JSON.stringify(envVars)}`);
    console.log(`Args: ${modelArgs}`);

    // exec() wrapper that always resolves: a failing command (for example
    // taskkill when no server is running) is expected and must not abort the run.
    const run = (cmd) => new Promise((done) => {
        exec(cmd, (err, stdout, stderr) => done({ err, stdout, stderr }));
    });
    const killServer = () => run('taskkill /F /IM llama-server.exe');

    await killServer();
    await delay(2000);

    const env = { ...process.env, ...envVars };
    const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
        detached: true,
        stdio: 'ignore',
        env
    });

    // Without a listener, a spawn failure (e.g. missing binary) is thrown as an
    // uncaught 'error' event and would take down the whole sweep.
    let spawnFailed = false;
    server.on('error', (err) => {
        spawnFailed = true;
        console.log(`[${name}] Could not start llama-server: ${err.message}`);
    });

    let ready = false;
    for (let attempt = 0; attempt < 40 && !spawnFailed; attempt++) {
        try {
            // fetch() has no `timeout` option; an AbortSignal bounds each probe at 2 s.
            const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
            if (res.status === 200) {
                ready = true;
                break;
            }
        } catch (e) {
            // Connection refused or probe timed out: the server is not up yet.
        }
        await delay(3000);
    }

    if (!ready) {
        console.log(`[${name}] FAILED TO BOOT`);
        await killServer();
        return { success: false };
    }

    console.log(`[${name}] Server Ready! Running speed test...`);
    const { stdout, stderr } = await run('node scripts/quick_pptest.mjs');
    console.log(stdout || stderr);
    await killServer();
    return { success: true };
}
|
||||
|
||||
/**
 * Runs the pure-GPU experiments: one 122B-A10B configuration, then two
 * 35B-A3B configurations with GGML_CUDA_FORCE_MMQ=1 (6 and 12 threads).
 * Tests run sequentially because every test binds port 8000.
 */
async function main() {
    // 1. 122B-A10B: Pure GPU offload (No n-cpu-moe at all)
    // -ngl 999 will offload all 48 layers to the NVIDIA driver (triggering Shared VRAM fallback on Windows)
    const args122B = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
    await runTest(args122B, {}, "122B-A10B: Pure GPU (NVIDIA Shared Memory Fallback)");

    // 2. 35B-A3B: Pure GPU tuning to hit 70 t/s
    // Base configuration from previous full-gpu run:
    const args35B = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 128 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

    // We already got ~64 t/s basically.
    // Let's try MMQ for custom matrix multiplication which is often faster on Ampere for low batch generation
    await runTest(args35B, { GGML_CUDA_FORCE_MMQ: "1" }, "35B-A3B: Force MMQ = 1");

    // Try increasing threads to 12 just in case
    const args35B_t12 = args35B.replace("-t 6 -tb 6", "-t 12 -tb 12");
    await runTest(args35B_t12, { GGML_CUDA_FORCE_MMQ: "1" }, "35B-A3B: Threads 12 + MMQ");

    console.log("\nALL TESTS COMPLETED");
}

// Surface unexpected failures instead of leaving the promise floating.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
|
||||
84
scripts/tune_models.mjs
Normal file
84
scripts/tune_models.mjs
Normal file
@@ -0,0 +1,84 @@
|
||||
import { exec, spawn } from 'child_process';
|
||||
|
||||
// Resolves after `ms` milliseconds; used to pace the kill/boot/poll cycle.
const delay = ms => new Promise(res => setTimeout(res, ms));

/**
 * Runs one benchmark pass against a freshly started llama-server and extracts
 * the headline throughput numbers from the benchmark output.
 *
 * Sequence: kill any running llama-server.exe, spawn a new one with
 * `modelArgs`, probe http://127.0.0.1:8000/health until it answers 200
 * (at most 40 probes, 3 s apart), run `node scripts/quick_pptest.mjs`,
 * print and parse its output, then kill the server.
 *
 * @param {string} modelArgs - llama-server command line. It is split on single
 *   spaces, so individual arguments must not contain spaces.
 * @param {string} name - Label used in the log output.
 * @returns {Promise<{success: boolean, tg?: number, pp?: number}>}
 *   `{ success: false }` when the server could not be spawned or never reported
 *   healthy; otherwise `tg` is the TG rate captured from the "TG-500" line and
 *   `pp` the PP rate captured from the "10K-CODE" line (0 when the line is
 *   absent from the output).
 */
async function runTest(modelArgs, name) {
    console.log(`\n===========================================`);
    console.log(`Testing: ${name}`);
    console.log(`Args: ${modelArgs}`);

    // exec() wrapper that always resolves: a failing command (for example
    // taskkill when no server is running) is expected and must not abort the run.
    const run = (cmd) => new Promise((done) => {
        exec(cmd, (err, stdout, stderr) => done({ err, stdout, stderr }));
    });
    const killServer = () => run('taskkill /F /IM llama-server.exe');

    // Kill existing
    await killServer();
    await delay(2000);

    const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
        detached: true,
        stdio: 'ignore'
    });

    // Without a listener, a spawn failure (e.g. missing binary) is thrown as an
    // uncaught 'error' event and would take down the whole sweep.
    let spawnFailed = false;
    server.on('error', (err) => {
        spawnFailed = true;
        console.log(`[${name}] Could not start llama-server: ${err.message}`);
    });

    let ready = false;
    for (let attempt = 0; attempt < 40 && !spawnFailed; attempt++) {
        try {
            // fetch() has no `timeout` option; an AbortSignal bounds each probe at 2 s.
            const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
            if (res.status === 200) {
                ready = true;
                break;
            }
        } catch (e) {
            // Connection refused or probe timed out: the server is not up yet.
        }
        await delay(3000);
    }

    if (!ready) {
        console.log(`[${name}] FAILED TO BOOT (Likely OOM)`);
        await killServer();
        return { success: false };
    }

    console.log(`[${name}] Server Ready! Running benchmark...`);
    // Run pptest
    const { stdout, stderr } = await run('node scripts/quick_pptest.mjs');
    console.log(stdout || stderr);

    // Extract TG from the TG-500 line and PP from the 10K-CODE line
    const tgMatch = stdout.match(/TG-500 \| PP:\d+tok \d+\.\dt\/s \| TG:\d+tok (\d+\.\d+)t\/s/);
    const ppMatch = stdout.match(/10K-CODE \| PP:\d+tok (\d+\.\d+)t\/s/);

    const tg = tgMatch ? parseFloat(tgMatch[1]) : 0;
    const pp = ppMatch ? parseFloat(ppMatch[1]) : 0;

    await killServer();
    return { success: true, tg, pp };
}
|
||||
|
||||
/**
 * Sweeps --n-cpu-moe for the 35B model (1, 2, 4 with ub=512) and for the 122B
 * model (38, 30, 22), then prints a one-line summary per configuration.
 * Tests run sequentially because every test binds port 8000.
 */
async function main() {
    // Collected so the throughput parsed by runTest is actually reported.
    const results = [];
    const record = async (args, label) => {
        const outcome = await runTest(args, label);
        results.push({ label, ...outcome });
    };

    // 1. Qwen 35B Tuning: We need 70 t/s. Let's try a few layers of n-cpu-moe (1, 2, 4) to unlock ub=512
    const args35B_base = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

    // Test 1: n-cpu-moe 1, ub 512
    await record(`${args35B_base} -ub 512 --n-cpu-moe 1`, "Qwen-35B: moe=1, ub=512");

    // Test 2: n-cpu-moe 2, ub 512
    await record(`${args35B_base} -ub 512 --n-cpu-moe 2`, "Qwen-35B: moe=2, ub=512");

    // Test 3: n-cpu-moe 4, ub 512
    await record(`${args35B_base} -ub 512 --n-cpu-moe 4`, "Qwen-35B: moe=4, ub=512");

    // 2. 122B Tuning: Find optimal n-cpu-moe
    const args122B_base = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

    // Since 48 leaves 16GB free, each layer is ~1.5GB total, meaning ~0.75GB per GPU.
    // Let's try 38, 30, 22
    await record(`${args122B_base} --n-cpu-moe 38`, "Qwen-122B: moe=38");
    await record(`${args122B_base} --n-cpu-moe 30`, "Qwen-122B: moe=30");
    await record(`${args122B_base} --n-cpu-moe 22`, "Qwen-122B: moe=22");

    // A value of 0 means the corresponding line was not found in the benchmark output.
    console.log("\nSummary:");
    for (const { label, success, tg, pp } of results) {
        console.log(success ? `${label} | TG ${tg} t/s | PP ${pp} t/s` : `${label} | FAILED TO BOOT`);
    }

    console.log("Tuning finished.");
}

// Surface unexpected failures instead of leaving the promise floating.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
|
||||
591
scripts/tune_results_gemma4_256k.json
Normal file
591
scripts/tune_results_gemma4_256k.json
Normal file
@@ -0,0 +1,591 @@
|
||||
[
|
||||
{
|
||||
"ngl": 22,
|
||||
"t": 8,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.22049935826915,
|
||||
"best_tps": 25.971732307567606,
|
||||
"vram_used": 11953,
|
||||
"vram_total": 12288,
|
||||
"label": "ngl=22"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 8,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.805518952775174,
|
||||
"best_tps": 25.953896683689454,
|
||||
"vram_used": 11942,
|
||||
"vram_total": 12288,
|
||||
"label": "ngl=21"
|
||||
},
|
||||
{
|
||||
"ngl": 20,
|
||||
"t": 8,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 23.537353232262834,
|
||||
"best_tps": 24.32109262330477,
|
||||
"vram_used": 11972,
|
||||
"vram_total": 12288,
|
||||
"label": "ngl=20"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 2,
|
||||
"tb": 2,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 20.167581352340264,
|
||||
"best_tps": 20.701192443418005,
|
||||
"vram_used": 11969,
|
||||
"vram_total": 12288,
|
||||
"label": "t=2 | tb=2"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.689104997668554,
|
||||
"best_tps": 26.328541632880874,
|
||||
"vram_used": 11975,
|
||||
"vram_total": 12288,
|
||||
"label": "t=4 | tb=4"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.294470150452725,
|
||||
"best_tps": 26.541251363470614,
|
||||
"vram_used": 11984,
|
||||
"vram_total": 12288,
|
||||
"label": "t=4 | tb=8"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 6,
|
||||
"tb": 6,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.307859289404675,
|
||||
"best_tps": 26.292208504543133,
|
||||
"vram_used": 11984,
|
||||
"vram_total": 12288,
|
||||
"label": "t=6 | tb=6"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 6,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.230599923243314,
|
||||
"best_tps": 26.366065850165732,
|
||||
"vram_used": 11983,
|
||||
"vram_total": 12288,
|
||||
"label": "t=6 | tb=8"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 8,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.113108026759278,
|
||||
"best_tps": 26.123872617669583,
|
||||
"vram_used": 11984,
|
||||
"vram_total": 12288,
|
||||
"label": "t=8 | tb=8"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 8,
|
||||
"tb": 12,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.05545428888364,
|
||||
"best_tps": 26.06377500079152,
|
||||
"vram_used": 11983,
|
||||
"vram_total": 12288,
|
||||
"label": "t=8 | tb=12"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 10,
|
||||
"tb": 10,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 24.706926870374986,
|
||||
"best_tps": 25.03033604251865,
|
||||
"vram_used": 11984,
|
||||
"vram_total": 12288,
|
||||
"label": "t=10 | tb=10"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 12,
|
||||
"tb": 12,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 22.468055564001904,
|
||||
"best_tps": 23.425983251691825,
|
||||
"vram_used": 11989,
|
||||
"vram_total": 12288,
|
||||
"label": "t=12 | tb=12"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 16,
|
||||
"tb": 16,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 21.176973905195442,
|
||||
"best_tps": 21.482429642395456,
|
||||
"vram_used": 12021,
|
||||
"vram_total": 12288,
|
||||
"label": "t=16 | tb=16"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.545748810106186,
|
||||
"best_tps": 26.344547829145817,
|
||||
"vram_used": 11986,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=128 | b=512"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 256,
|
||||
"b": 1024,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.503875205368377,
|
||||
"best_tps": 26.393548686102108,
|
||||
"vram_used": 11981,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=256 | b=1024"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 256,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.46500292415627,
|
||||
"best_tps": 26.2726382287537,
|
||||
"vram_used": 11981,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=256 | b=2048"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 1024,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.50982209452459,
|
||||
"best_tps": 26.292282671074723,
|
||||
"vram_used": 12020,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=512 | b=1024"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.39646674356899,
|
||||
"best_tps": 26.28106356028714,
|
||||
"vram_used": 12020,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=512 | b=2048"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.471945933724726,
|
||||
"best_tps": 26.268422652962233,
|
||||
"vram_used": 12021,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=512 | b=4096"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.722119623856702,
|
||||
"best_tps": 26.497264927416403,
|
||||
"vram_used": 12019,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=1024 | b=2048"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.665819493145943,
|
||||
"best_tps": 26.301163428594148,
|
||||
"vram_used": 12019,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=1024 | b=4096"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.464915272955533,
|
||||
"best_tps": 26.40667691713752,
|
||||
"vram_used": 12019,
|
||||
"vram_total": 12288,
|
||||
"label": "ctk=q4_0 | ctv=q4_0"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.489715990281564,
|
||||
"best_tps": 25.884133821146627,
|
||||
"vram_used": 12011,
|
||||
"vram_total": 12288,
|
||||
"label": "ctk=q8_0 | ctv=q8_0"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 22.751034104721082,
|
||||
"best_tps": 22.91250972782414,
|
||||
"vram_used": 12017,
|
||||
"vram_total": 12288,
|
||||
"label": "ctk=q4_0 | ctv=q8_0"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "f16",
|
||||
"ctv": "f16",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 24.745831571513975,
|
||||
"best_tps": 25.53926086004382,
|
||||
"vram_used": 11985,
|
||||
"vram_total": 12288,
|
||||
"label": "ctk=f16 | ctv=f16"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.21575943186602,
|
||||
"best_tps": 25.796865637378264,
|
||||
"vram_used": 12013,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=50 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": false,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 23.88172807693179,
|
||||
"best_tps": 24.803356430302312,
|
||||
"vram_used": 12016,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=False | poll=50 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 0,
|
||||
"avg_tps": 25.041321207287698,
|
||||
"best_tps": 25.88479834694897,
|
||||
"vram_used": 12017,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=0 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 100,
|
||||
"avg_tps": 25.27990666474703,
|
||||
"best_tps": 26.034861156695197,
|
||||
"vram_used": 12017,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=100 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 3,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.360977804679788,
|
||||
"best_tps": 26.0705565191107,
|
||||
"vram_used": 12022,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=50 | prio=3"
|
||||
},
|
||||
{
|
||||
"ngl": 21,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": false,
|
||||
"prio": 3,
|
||||
"poll": 0,
|
||||
"avg_tps": 24.156893523381967,
|
||||
"best_tps": 24.840307911026144,
|
||||
"vram_used": 12021,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=False | poll=0 | prio=3"
|
||||
}
|
||||
]
|
||||
201
scripts/tune_results_gemma4_ncpumoe.json
Normal file
201
scripts/tune_results_gemma4_ncpumoe.json
Normal file
@@ -0,0 +1,201 @@
|
||||
[
|
||||
{
|
||||
"label": "ncpumoe=0",
|
||||
"ncpumoe": 0,
|
||||
"avg": 15.396949591766335,
|
||||
"best": 20.220093309883133,
|
||||
"vram": 12011,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=5",
|
||||
"ncpumoe": 5,
|
||||
"avg": 4.853957926040404,
|
||||
"best": 4.9029479257524216,
|
||||
"vram": 11945,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=10",
|
||||
"ncpumoe": 10,
|
||||
"avg": 20.64137159193706,
|
||||
"best": 26.474940718957154,
|
||||
"vram": 12020,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=15",
|
||||
"ncpumoe": 15,
|
||||
"avg": 13.424368433101165,
|
||||
"best": 13.698684361880598,
|
||||
"vram": 12018,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=20",
|
||||
"ncpumoe": 20,
|
||||
"avg": 10.338449574838693,
|
||||
"best": 13.495275411319872,
|
||||
"vram": 11530,
|
||||
"nommap": true
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=25",
|
||||
"ncpumoe": 25,
|
||||
"avg": 12.920348175328435,
|
||||
"best": 12.99923042323437,
|
||||
"vram": 11625,
|
||||
"nommap": true
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=30",
|
||||
"ncpumoe": 30,
|
||||
"avg": 13.251690836275145,
|
||||
"best": 13.253697466971921,
|
||||
"vram": 9064,
|
||||
"nommap": true
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=7",
|
||||
"ncpumoe": 7,
|
||||
"avg": 16.31796299658782,
|
||||
"best": 23.160760806218782,
|
||||
"vram": 11994,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=9",
|
||||
"ncpumoe": 9,
|
||||
"avg": 7.469651892205037,
|
||||
"best": 10.875064047449284,
|
||||
"vram": 11941,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=11",
|
||||
"ncpumoe": 11,
|
||||
"avg": 14.814740144776437,
|
||||
"best": 15.199641279675724,
|
||||
"vram": 11984,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ncpumoe=13",
|
||||
"ncpumoe": 13,
|
||||
"avg": 14.183175252947136,
|
||||
"best": 14.427257794639086,
|
||||
"vram": 12003,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "t=2",
|
||||
"ncpumoe": 10,
|
||||
"avg": 28.551811207068425,
|
||||
"best": 28.688565545389164,
|
||||
"vram": 11968,
|
||||
"t": 2,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "t=4",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.8619310622166,
|
||||
"best": 31.17677746690393,
|
||||
"vram": 11972,
|
||||
"t": 4,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "t=6",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.578454576249854,
|
||||
"best": 30.971792125516313,
|
||||
"vram": 11983,
|
||||
"t": 6,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "t=8",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.529393512116172,
|
||||
"best": 30.954830478128166,
|
||||
"vram": 11982,
|
||||
"t": 8,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "t=10",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.773041112229503,
|
||||
"best": 31.00899077264753,
|
||||
"vram": 11972,
|
||||
"t": 10,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ub=256,b=1024",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.49319055490045,
|
||||
"best": 30.691055921541377,
|
||||
"vram": 11993,
|
||||
"t": 4,
|
||||
"ub": 256,
|
||||
"b": 1024,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ub=512,b=2048",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.923573731331718,
|
||||
"best": 31.902272031660825,
|
||||
"vram": 11995,
|
||||
"t": 4,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ub=512,b=4096",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.723820162954862,
|
||||
"best": 31.065476003548053,
|
||||
"vram": 11966,
|
||||
"t": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "ub=1024,b=2048",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.489888387093156,
|
||||
"best": 30.982074615885946,
|
||||
"vram": 11964,
|
||||
"t": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "kv=q4_0",
|
||||
"ncpumoe": 10,
|
||||
"avg": 30.63156129571348,
|
||||
"best": 31.088674795634944,
|
||||
"vram": 11988,
|
||||
"t": 4,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"nommap": false
|
||||
},
|
||||
{
|
||||
"label": "kv=q8_0",
|
||||
"ncpumoe": 10,
|
||||
"avg": 29.6114222576863,
|
||||
"best": 30.580427895917573,
|
||||
"vram": 11980,
|
||||
"t": 4,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"nommap": false
|
||||
}
|
||||
]
|
||||
522
scripts/tune_results_qwen35b_256k.json
Normal file
522
scripts/tune_results_qwen35b_256k.json
Normal file
@@ -0,0 +1,522 @@
|
||||
[
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 6,
|
||||
"tb": 6,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.169961832638464,
|
||||
"best_tps": 26.533887071573073,
|
||||
"vram_used": 4994,
|
||||
"vram_total": 12288,
|
||||
"label": "cpu_moe=True"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": false,
|
||||
"t": 6,
|
||||
"tb": 6,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 11.065030380022206,
|
||||
"best_tps": 11.083028272674314,
|
||||
"vram_used": 11949,
|
||||
"vram_total": 12288,
|
||||
"label": "cpu_moe=False"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 2,
|
||||
"tb": 2,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 21.473286428302767,
|
||||
"best_tps": 21.746637577851104,
|
||||
"vram_used": 4994,
|
||||
"vram_total": 12288,
|
||||
"label": "t=2 | tb=2"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.552358479030676,
|
||||
"best_tps": 27.314237654089343,
|
||||
"vram_used": 4991,
|
||||
"vram_total": 12288,
|
||||
"label": "t=4 | tb=4"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 6,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.347068485327956,
|
||||
"best_tps": 26.87924726131441,
|
||||
"vram_used": 4993,
|
||||
"vram_total": 12288,
|
||||
"label": "t=4 | tb=6"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 6,
|
||||
"tb": 6,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.331286039513458,
|
||||
"best_tps": 26.81427299445741,
|
||||
"vram_used": 5001,
|
||||
"vram_total": 12288,
|
||||
"label": "t=6 | tb=6"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 6,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.391160513711274,
|
||||
"best_tps": 26.735573238878736,
|
||||
"vram_used": 5001,
|
||||
"vram_total": 12288,
|
||||
"label": "t=6 | tb=8"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 8,
|
||||
"tb": 8,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 25.32340666199144,
|
||||
"best_tps": 25.87949347494079,
|
||||
"vram_used": 4995,
|
||||
"vram_total": 12288,
|
||||
"label": "t=8 | tb=8"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 10,
|
||||
"tb": 10,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 23.752277317850815,
|
||||
"best_tps": 24.98242898809555,
|
||||
"vram_used": 5011,
|
||||
"vram_total": 12288,
|
||||
"label": "t=10 | tb=10"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 12,
|
||||
"tb": 12,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 21.75032196383532,
|
||||
"best_tps": 23.18963400077116,
|
||||
"vram_used": 5104,
|
||||
"vram_total": 12288,
|
||||
"label": "t=12 | tb=12"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 128,
|
||||
"b": 512,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 13.27593572827031,
|
||||
"best_tps": 13.337407402920235,
|
||||
"vram_used": 4391,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=128 | b=512"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 256,
|
||||
"b": 1024,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.638687188233188,
|
||||
"best_tps": 27.361082444434413,
|
||||
"vram_used": 4495,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=256 | b=1024"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 256,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.29069503392877,
|
||||
"best_tps": 26.63368832924803,
|
||||
"vram_used": 4490,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=256 | b=2048"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 1024,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.518331831441134,
|
||||
"best_tps": 26.972021321271527,
|
||||
"vram_used": 4984,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=512 | b=1024"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.401541912276873,
|
||||
"best_tps": 26.46530849236633,
|
||||
"vram_used": 4990,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=512 | b=2048"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 26.892711500590455,
|
||||
"best_tps": 26.892711500590455,
|
||||
"vram_used": 5006,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=512 | b=4096"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 2048,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 12.600209659679201,
|
||||
"best_tps": 12.759356030807627,
|
||||
"vram_used": 12020,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=1024 | b=2048"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 1024,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 6.023959262370547,
|
||||
"best_tps": 8.284882268188156,
|
||||
"vram_used": 11931,
|
||||
"vram_total": 12288,
|
||||
"label": "ub=1024 | b=4096"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 12.96992950856374,
|
||||
"best_tps": 12.96992950856374,
|
||||
"vram_used": 12022,
|
||||
"vram_total": 12288,
|
||||
"label": "ctk=q4_0 | ctv=q4_0"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q8_0",
|
||||
"ctv": "q8_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 11.420078920350697,
|
||||
"best_tps": 13.524778595767653,
|
||||
"vram_used": 12030,
|
||||
"vram_total": 12288,
|
||||
"label": "ctk=q8_0 | ctv=q8_0"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "f16",
|
||||
"ctv": "f16",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 11.978106511464183,
|
||||
"best_tps": 13.729190013094977,
|
||||
"vram_used": 11518,
|
||||
"vram_total": 12288,
|
||||
"label": "ctk=f16 | ctv=f16"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 16.164278220452957,
|
||||
"best_tps": 22.645890325274323,
|
||||
"vram_used": 11623,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=50 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": false,
|
||||
"prio": 2,
|
||||
"poll": 50,
|
||||
"avg_tps": 16.555542780023114,
|
||||
"best_tps": 23.333815015033892,
|
||||
"vram_used": 9062,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=False | poll=50 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 0,
|
||||
"avg_tps": 13.003619379106329,
|
||||
"best_tps": 13.031594557134142,
|
||||
"vram_used": 11994,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=0 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 2,
|
||||
"poll": 100,
|
||||
"avg_tps": 5.7762452690702935,
|
||||
"best_tps": 5.795560155803046,
|
||||
"vram_used": 11953,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=100 | prio=2"
|
||||
},
|
||||
{
|
||||
"ngl": 999,
|
||||
"cpu_moe": true,
|
||||
"t": 4,
|
||||
"tb": 4,
|
||||
"ub": 512,
|
||||
"b": 4096,
|
||||
"ctk": "q4_0",
|
||||
"ctv": "q4_0",
|
||||
"fa": "on",
|
||||
"mlock": true,
|
||||
"mmap": true,
|
||||
"prio": 3,
|
||||
"poll": 50,
|
||||
"avg_tps": 12.59406799687573,
|
||||
"best_tps": 14.966737641114795,
|
||||
"vram_used": 11996,
|
||||
"vram_total": 12288,
|
||||
"label": "mmap=True | poll=50 | prio=3"
|
||||
}
|
||||
]
|
||||
Reference in New Issue
Block a user