Compare commits
6 Commits
7890ff6644
...
7c7a899fd5
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7c7a899fd5 | ||
|
|
626a089b6b | ||
|
|
a09736e930 | ||
|
|
66778b750d | ||
|
|
e37f65af80 | ||
|
|
93f4182384 |
@@ -1,86 +0,0 @@
|
|||||||
import os
|
|
||||||
import glob
|
|
||||||
import re
|
|
||||||
|
|
||||||
skill_dir = r"C:\Users\Certes\.gemini\antigravity\skills"
|
|
||||||
|
|
||||||
translations = {
|
|
||||||
"Manage parallel workstreams — list, create, switch, status, progress, complete, and resume": "병렬 작업 스트림 관리 — 목록, 생성, 전환, 상태, 진행률, 완료 및 재개",
|
|
||||||
"Validate built features through conversational UAT": "대화형 UAT를 통해 구현된 기능 검증",
|
|
||||||
"Retroactively audit and fill Nyquist validation gaps for a completed phase": "완료된 단계에 대한 검증 누락 사후 감사 및 보완",
|
|
||||||
"Update GSD to latest version with changelog display": "GSD를 최신 버전으로 업데이트하고 변경 사항 표시",
|
|
||||||
"Retroactive 6-pillar visual audit of implemented frontend code": "구현된 프론트엔드 코드에 대한 6개 요소 시각적 사후 감사",
|
|
||||||
"Generate UI design contract (UI-SPEC.md) for frontend phases": "프론트엔드 단계를 위한 UI 디자인 명세서(UI-SPEC.md) 생성",
|
|
||||||
"Manage persistent context threads for cross-session work": "교차 세션 작업을 위한 영구 컨텍스트 스레드 관리",
|
|
||||||
"Display project statistics — phases, plans, requirements, git metrics, and timeline": "프로젝트 통계 표시 — 단계, 계획, 요구사항, Git 지표 및 타임라인",
|
|
||||||
"Create PR, run review, and prepare for merge after verification passes": "검증 통과 후 PR 생성, 리뷰 실행 및 병합 준비",
|
|
||||||
"Configure GSD workflow toggles and model profile": "GSD 워크플로우 옵션 및 모델 프로필 구성",
|
|
||||||
"Switch model profile for GSD agents (quality/balanced/budget/inherit)": "GSD 요원의 모델 프로필 전환 (고품질/균형/예산/상속)",
|
|
||||||
"Generate a session report with token usage estimates, work summary, and outcomes": "토큰 사용량, 작업 요약 및 결과를 포함한 세션 보고서 생성",
|
|
||||||
"Review and promote backlog items to active milestone": "백로그 항목을 검토하고 활성 마일스톤으로 승격",
|
|
||||||
"Request cross-AI peer review of phase plans from external AI CLIs": "외부 AI CLI에 단계 계획에 대한 교차 AI 동료 리뷰 요청",
|
|
||||||
"Resume work from previous session with full context restoration": "전체 컨텍스트 복원과 함께 이전 세션에서 작업 재개",
|
|
||||||
"Research how to implement a phase (standalone - usually use /gsd-plan-phase instead)": "단계를 구현하는 방법 리서치 (단독 실행 - 보통 /gsd-plan-phase 사용)",
|
|
||||||
"Remove a GSD workspace and clean up worktrees": "GSD 워크스페이스 제거 및 워크트리 정리",
|
|
||||||
"Remove a future phase from roadmap and renumber subsequent phases": "로드맵에서 향후 단계를 제거하고 이후 단계 번호 재지정",
|
|
||||||
"Reapply local modifications after a GSD update": "GSD 업데이트 후 로컬 수정 사항 재적용",
|
|
||||||
"Execute a quick task with GSD guarantees (atomic commits, state tracking) but skip optional agents": "GSD 보장(원자적 커밋, 상태 추적)을 사용하여 빠른 작업을 실행하되 선택적 요원 생략",
|
|
||||||
"Check project progress, show context, and route to next action (execute or plan)": "프로젝트 진행 상황 확인, 컨텍스트 표시 및 다음 작업(실행 또는 계획)으로 라우팅",
|
|
||||||
"Generate developer behavioral profile and create Claude-discoverable artifacts": "개발자 행동 프로필을 생성하고 AI가 인지할 수 있는 문서 작성",
|
|
||||||
"Create a clean PR branch by filtering out .planning/ commits — ready for code review": ".planning/ 커밋을 필터링하여 깔끔한 PR 브랜치 생성 — 코드 리뷰 준비",
|
|
||||||
"Capture a forward-looking idea with trigger conditions — surfaces automatically at the right milestone": "향후 아이디어를 트리거 조건과 함께 캡처 — 적절한 마일스톤에서 자동 표시",
|
|
||||||
"Create detailed phase plan (PLAN.md) with verification loop": "검증 루프를 포함한 상세 단계 계획(PLAN.md) 생성",
|
|
||||||
"Create phases to close all gaps identified by milestone audit": "마일스톤 감사에서 식별된 모든 격차를 해소하기 위한 단계 생성",
|
|
||||||
"Create context handoff when pausing work mid-phase": "작업 중단 시 컨텍스트 인수인계 파일 생성",
|
|
||||||
"Zero-friction idea capture. Append, list, or promote notes to todos.": "방해 없는 아이디어 캡처. 메모 추가, 나열 또는 할 일로 승격.",
|
|
||||||
"Automatically advance to the next logical step in the GSD workflow": "GSD 워크플로우의 다음 논리적 단계로 자동 진행",
|
|
||||||
"Create an isolated workspace with repo copies and independent .planning/": "외부 레포 사본 및 독립적인 .planning/을 갖춘 격리된 워크스페이스 생성",
|
|
||||||
"Initialize a new project with deep context gathering and PROJECT.md": "심층 컨텍스트 수집 및 PROJECT.md와 함께 새 프로젝트 초기화",
|
|
||||||
"Start a new milestone cycle — update PROJECT.md and route to requirements": "새로운 마일스톤 주기 시작 — PROJECT.md 업데이트 및 요구사항 재정의",
|
|
||||||
"Generate a comprehensive project summary from milestone artifacts for team onboarding and review": "팀 온보딩 및 리뷰를 위해 마일스톤 산출물에서 종합적인 프로젝트 요약 생성",
|
|
||||||
"Analyze codebase with parallel mapper agents to produce .planning/codebase/ documents": "병렬 매퍼 요원으로 코드베이스를 분석하여 .planning/codebase/ 문서 생성",
|
|
||||||
"Interactive command center for managing multiple phases from one terminal": "하나의 터미널에서 여러 단계를 관리하는 대화형 명령 센터",
|
|
||||||
"List active GSD workspaces and their status": "활성 GSD 워크스페이스 및 상태 나열",
|
|
||||||
"Surface the agent's assumptions about a phase approach before planning": "계획 전 단계적 접근 방식에 대한 요원의 가정을 미리 표시",
|
|
||||||
"Join the GSD Discord community": "GSD 디스코드 커뮤니티 참가",
|
|
||||||
"Insert urgent work as decimal phase (e.g., 72.1) between existing phases": "기존 단계 사이에 소수점 단계(예: 72.1)로 긴급 작업 삽입",
|
|
||||||
"Show available GSD commands and usage guide": "사용 가능한 GSD 명령어 및 사용 가이드 표시",
|
|
||||||
"Diagnose planning directory health and optionally repair issues": "계획 디렉토리 상태 진단 및 선택적으로 문제 복구",
|
|
||||||
"Post-mortem investigation for failed GSD workflows — analyzes git history, artifacts, and state to diagnose what went wrong": "실패한 GSD 워크플로우에 대한 사후 조사 — git 기록, 문서 및 상태 분석",
|
|
||||||
"Execute a trivial task inline — no subagents, no planning overhead": "인라인으로 사소한 작업 실행 — 서브 에이전트 및 계획 오버헤드 없음",
|
|
||||||
"Execute all plans in a phase with wave-based parallelization": "웨이브(Wave) 기반 병렬 처리를 사용하여 단계의 모든 계획 실행",
|
|
||||||
"Route freeform text to the right GSD command automatically": "자유 형식 텍스트를 적절한 GSD 명령으로 자동 라우팅",
|
|
||||||
"Systematic debugging with persistent state across context resets": "컨텍스트가 리셋되어도 상태를 유지하는 체계적인 디버깅",
|
|
||||||
"Gather phase context through adaptive questioning before planning. Use --auto to skip interactive questions (the agent picks recommended defaults).": "계획 전 심층 질문을 통해 단계 컨텍스트 수집. 대화형 건너뛰기(--auto) 가능.",
|
|
||||||
"Archive completed milestone and prepare for next version": "완료된 마일스톤 보관 및 다음 버전 준비",
|
|
||||||
"List pending todos and select one to work on": "보류 중인 할 일 목록 표시 및 작업할 항목 선택",
|
|
||||||
"Cross-phase audit of all outstanding UAT and verification items": "모든 미결 UAT 및 검증 항목에 대한 전체 단계 교차 감사",
|
|
||||||
"Audit milestone completion against original intent before archiving": "보관 전 원래 의도와 비교하여 마일스톤 달성 여부 감사",
|
|
||||||
"Capture idea or task as todo from current conversation context": "현재 대화 컨텍스트에서 아이디어 또는 작업을 할 일로 캡처",
|
|
||||||
"Generate tests for a completed phase based on UAT criteria and implementation": "UAT 기준 및 구현을 기반으로 완료된 단계에 대한 테스트 생성",
|
|
||||||
"Add phase to end of current milestone in roadmap": "로드맵의 현재 마일스톤 끝에 새 단계 추가",
|
|
||||||
"Add an idea to the backlog parking lot (999.x numbering)": "백로그 주차장(999.x 넘버링)에 아이디어 추가",
|
|
||||||
"Run all remaining phases autonomously — discuss→plan→execute per phase": "모든 남은 단계를 완전히 자율적으로 실행 (논의→계획→실행 루프)",
|
|
||||||
"Archive accumulated phase directories from completed milestones": "완료된 마일스톤에서 쌓인 단계 디렉토리 보관 및 정리"
|
|
||||||
}
|
|
||||||
|
|
||||||
modified_count = 0
|
|
||||||
|
|
||||||
for filepath in glob.glob(os.path.join(skill_dir, "gsd-*", "SKILL.md")):
|
|
||||||
try:
|
|
||||||
with open(filepath, 'r', encoding='utf-8') as f:
|
|
||||||
content = f.read()
|
|
||||||
|
|
||||||
new_content = content
|
|
||||||
for eng, kor in translations.items():
|
|
||||||
pattern = re.compile(r"^description:\s*" + re.escape(eng) + r"\s*$", re.MULTILINE)
|
|
||||||
new_content = pattern.sub(f"description: {kor}", new_content)
|
|
||||||
|
|
||||||
if new_content != content:
|
|
||||||
with open(filepath, 'w', encoding='utf-8') as f:
|
|
||||||
f.write(new_content)
|
|
||||||
modified_count += 1
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error processing {filepath}: {e}")
|
|
||||||
|
|
||||||
print(f"Successfully translated {modified_count} SKILL.md files.")
|
|
||||||
@@ -1,253 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""
|
|
||||||
UI/UX Pro Max Core - BM25 search engine for UI/UX style guides
|
|
||||||
"""
|
|
||||||
|
|
||||||
import csv
|
|
||||||
import re
|
|
||||||
from pathlib import Path
|
|
||||||
from math import log
|
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
# ============ CONFIGURATION ============
|
|
||||||
DATA_DIR = Path(__file__).parent.parent / "data"
|
|
||||||
MAX_RESULTS = 3
|
|
||||||
|
|
||||||
CSV_CONFIG = {
|
|
||||||
"style": {
|
|
||||||
"file": "styles.csv",
|
|
||||||
"search_cols": ["Style Category", "Keywords", "Best For", "Type", "AI Prompt Keywords"],
|
|
||||||
"output_cols": ["Style Category", "Type", "Keywords", "Primary Colors", "Effects & Animation", "Best For", "Performance", "Accessibility", "Framework Compatibility", "Complexity", "AI Prompt Keywords", "CSS/Technical Keywords", "Implementation Checklist", "Design System Variables"]
|
|
||||||
},
|
|
||||||
"color": {
|
|
||||||
"file": "colors.csv",
|
|
||||||
"search_cols": ["Product Type", "Notes"],
|
|
||||||
"output_cols": ["Product Type", "Primary (Hex)", "Secondary (Hex)", "CTA (Hex)", "Background (Hex)", "Text (Hex)", "Notes"]
|
|
||||||
},
|
|
||||||
"chart": {
|
|
||||||
"file": "charts.csv",
|
|
||||||
"search_cols": ["Data Type", "Keywords", "Best Chart Type", "Accessibility Notes"],
|
|
||||||
"output_cols": ["Data Type", "Keywords", "Best Chart Type", "Secondary Options", "Color Guidance", "Accessibility Notes", "Library Recommendation", "Interactive Level"]
|
|
||||||
},
|
|
||||||
"landing": {
|
|
||||||
"file": "landing.csv",
|
|
||||||
"search_cols": ["Pattern Name", "Keywords", "Conversion Optimization", "Section Order"],
|
|
||||||
"output_cols": ["Pattern Name", "Keywords", "Section Order", "Primary CTA Placement", "Color Strategy", "Conversion Optimization"]
|
|
||||||
},
|
|
||||||
"product": {
|
|
||||||
"file": "products.csv",
|
|
||||||
"search_cols": ["Product Type", "Keywords", "Primary Style Recommendation", "Key Considerations"],
|
|
||||||
"output_cols": ["Product Type", "Keywords", "Primary Style Recommendation", "Secondary Styles", "Landing Page Pattern", "Dashboard Style (if applicable)", "Color Palette Focus"]
|
|
||||||
},
|
|
||||||
"ux": {
|
|
||||||
"file": "ux-guidelines.csv",
|
|
||||||
"search_cols": ["Category", "Issue", "Description", "Platform"],
|
|
||||||
"output_cols": ["Category", "Issue", "Platform", "Description", "Do", "Don't", "Code Example Good", "Code Example Bad", "Severity"]
|
|
||||||
},
|
|
||||||
"typography": {
|
|
||||||
"file": "typography.csv",
|
|
||||||
"search_cols": ["Font Pairing Name", "Category", "Mood/Style Keywords", "Best For", "Heading Font", "Body Font"],
|
|
||||||
"output_cols": ["Font Pairing Name", "Category", "Heading Font", "Body Font", "Mood/Style Keywords", "Best For", "Google Fonts URL", "CSS Import", "Tailwind Config", "Notes"]
|
|
||||||
},
|
|
||||||
"icons": {
|
|
||||||
"file": "icons.csv",
|
|
||||||
"search_cols": ["Category", "Icon Name", "Keywords", "Best For"],
|
|
||||||
"output_cols": ["Category", "Icon Name", "Keywords", "Library", "Import Code", "Usage", "Best For", "Style"]
|
|
||||||
},
|
|
||||||
"react": {
|
|
||||||
"file": "react-performance.csv",
|
|
||||||
"search_cols": ["Category", "Issue", "Keywords", "Description"],
|
|
||||||
"output_cols": ["Category", "Issue", "Platform", "Description", "Do", "Don't", "Code Example Good", "Code Example Bad", "Severity"]
|
|
||||||
},
|
|
||||||
"web": {
|
|
||||||
"file": "web-interface.csv",
|
|
||||||
"search_cols": ["Category", "Issue", "Keywords", "Description"],
|
|
||||||
"output_cols": ["Category", "Issue", "Platform", "Description", "Do", "Don't", "Code Example Good", "Code Example Bad", "Severity"]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
STACK_CONFIG = {
|
|
||||||
"html-tailwind": {"file": "stacks/html-tailwind.csv"},
|
|
||||||
"react": {"file": "stacks/react.csv"},
|
|
||||||
"nextjs": {"file": "stacks/nextjs.csv"},
|
|
||||||
"astro": {"file": "stacks/astro.csv"},
|
|
||||||
"vue": {"file": "stacks/vue.csv"},
|
|
||||||
"nuxtjs": {"file": "stacks/nuxtjs.csv"},
|
|
||||||
"nuxt-ui": {"file": "stacks/nuxt-ui.csv"},
|
|
||||||
"svelte": {"file": "stacks/svelte.csv"},
|
|
||||||
"swiftui": {"file": "stacks/swiftui.csv"},
|
|
||||||
"react-native": {"file": "stacks/react-native.csv"},
|
|
||||||
"flutter": {"file": "stacks/flutter.csv"},
|
|
||||||
"shadcn": {"file": "stacks/shadcn.csv"},
|
|
||||||
"jetpack-compose": {"file": "stacks/jetpack-compose.csv"}
|
|
||||||
}
|
|
||||||
|
|
||||||
# Common columns for all stacks
|
|
||||||
_STACK_COLS = {
|
|
||||||
"search_cols": ["Category", "Guideline", "Description", "Do", "Don't"],
|
|
||||||
"output_cols": ["Category", "Guideline", "Description", "Do", "Don't", "Code Good", "Code Bad", "Severity", "Docs URL"]
|
|
||||||
}
|
|
||||||
|
|
||||||
AVAILABLE_STACKS = list(STACK_CONFIG.keys())
|
|
||||||
|
|
||||||
|
|
||||||
# ============ BM25 IMPLEMENTATION ============
|
|
||||||
class BM25:
|
|
||||||
"""BM25 ranking algorithm for text search"""
|
|
||||||
|
|
||||||
def __init__(self, k1=1.5, b=0.75):
|
|
||||||
self.k1 = k1
|
|
||||||
self.b = b
|
|
||||||
self.corpus = []
|
|
||||||
self.doc_lengths = []
|
|
||||||
self.avgdl = 0
|
|
||||||
self.idf = {}
|
|
||||||
self.doc_freqs = defaultdict(int)
|
|
||||||
self.N = 0
|
|
||||||
|
|
||||||
def tokenize(self, text):
|
|
||||||
"""Lowercase, split, remove punctuation, filter short words"""
|
|
||||||
text = re.sub(r'[^\w\s]', ' ', str(text).lower())
|
|
||||||
return [w for w in text.split() if len(w) > 2]
|
|
||||||
|
|
||||||
def fit(self, documents):
|
|
||||||
"""Build BM25 index from documents"""
|
|
||||||
self.corpus = [self.tokenize(doc) for doc in documents]
|
|
||||||
self.N = len(self.corpus)
|
|
||||||
if self.N == 0:
|
|
||||||
return
|
|
||||||
self.doc_lengths = [len(doc) for doc in self.corpus]
|
|
||||||
self.avgdl = sum(self.doc_lengths) / self.N
|
|
||||||
|
|
||||||
for doc in self.corpus:
|
|
||||||
seen = set()
|
|
||||||
for word in doc:
|
|
||||||
if word not in seen:
|
|
||||||
self.doc_freqs[word] += 1
|
|
||||||
seen.add(word)
|
|
||||||
|
|
||||||
for word, freq in self.doc_freqs.items():
|
|
||||||
self.idf[word] = log((self.N - freq + 0.5) / (freq + 0.5) + 1)
|
|
||||||
|
|
||||||
def score(self, query):
|
|
||||||
"""Score all documents against query"""
|
|
||||||
query_tokens = self.tokenize(query)
|
|
||||||
scores = []
|
|
||||||
|
|
||||||
for idx, doc in enumerate(self.corpus):
|
|
||||||
score = 0
|
|
||||||
doc_len = self.doc_lengths[idx]
|
|
||||||
term_freqs = defaultdict(int)
|
|
||||||
for word in doc:
|
|
||||||
term_freqs[word] += 1
|
|
||||||
|
|
||||||
for token in query_tokens:
|
|
||||||
if token in self.idf:
|
|
||||||
tf = term_freqs[token]
|
|
||||||
idf = self.idf[token]
|
|
||||||
numerator = tf * (self.k1 + 1)
|
|
||||||
denominator = tf + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
|
|
||||||
score += idf * numerator / denominator
|
|
||||||
|
|
||||||
scores.append((idx, score))
|
|
||||||
|
|
||||||
return sorted(scores, key=lambda x: x[1], reverse=True)
|
|
||||||
|
|
||||||
|
|
||||||
# ============ SEARCH FUNCTIONS ============
|
|
||||||
def _load_csv(filepath):
|
|
||||||
"""Load CSV and return list of dicts"""
|
|
||||||
with open(filepath, 'r', encoding='utf-8') as f:
|
|
||||||
return list(csv.DictReader(f))
|
|
||||||
|
|
||||||
|
|
||||||
def _search_csv(filepath, search_cols, output_cols, query, max_results):
|
|
||||||
"""Core search function using BM25"""
|
|
||||||
if not filepath.exists():
|
|
||||||
return []
|
|
||||||
|
|
||||||
data = _load_csv(filepath)
|
|
||||||
|
|
||||||
# Build documents from search columns
|
|
||||||
documents = [" ".join(str(row.get(col, "")) for col in search_cols) for row in data]
|
|
||||||
|
|
||||||
# BM25 search
|
|
||||||
bm25 = BM25()
|
|
||||||
bm25.fit(documents)
|
|
||||||
ranked = bm25.score(query)
|
|
||||||
|
|
||||||
# Get top results with score > 0
|
|
||||||
results = []
|
|
||||||
for idx, score in ranked[:max_results]:
|
|
||||||
if score > 0:
|
|
||||||
row = data[idx]
|
|
||||||
results.append({col: row.get(col, "") for col in output_cols if col in row})
|
|
||||||
|
|
||||||
return results
|
|
||||||
|
|
||||||
|
|
||||||
def detect_domain(query):
|
|
||||||
"""Auto-detect the most relevant domain from query"""
|
|
||||||
query_lower = query.lower()
|
|
||||||
|
|
||||||
domain_keywords = {
|
|
||||||
"color": ["color", "palette", "hex", "#", "rgb"],
|
|
||||||
"chart": ["chart", "graph", "visualization", "trend", "bar", "pie", "scatter", "heatmap", "funnel"],
|
|
||||||
"landing": ["landing", "page", "cta", "conversion", "hero", "testimonial", "pricing", "section"],
|
|
||||||
"product": ["saas", "ecommerce", "e-commerce", "fintech", "healthcare", "gaming", "portfolio", "crypto", "dashboard"],
|
|
||||||
"style": ["style", "design", "ui", "minimalism", "glassmorphism", "neumorphism", "brutalism", "dark mode", "flat", "aurora", "prompt", "css", "implementation", "variable", "checklist", "tailwind"],
|
|
||||||
"ux": ["ux", "usability", "accessibility", "wcag", "touch", "scroll", "animation", "keyboard", "navigation", "mobile"],
|
|
||||||
"typography": ["font", "typography", "heading", "serif", "sans"],
|
|
||||||
"icons": ["icon", "icons", "lucide", "heroicons", "symbol", "glyph", "pictogram", "svg icon"],
|
|
||||||
"react": ["react", "next.js", "nextjs", "suspense", "memo", "usecallback", "useeffect", "rerender", "bundle", "waterfall", "barrel", "dynamic import", "rsc", "server component"],
|
|
||||||
"web": ["aria", "focus", "outline", "semantic", "virtualize", "autocomplete", "form", "input type", "preconnect"]
|
|
||||||
}
|
|
||||||
|
|
||||||
scores = {domain: sum(1 for kw in keywords if kw in query_lower) for domain, keywords in domain_keywords.items()}
|
|
||||||
best = max(scores, key=scores.get)
|
|
||||||
return best if scores[best] > 0 else "style"
|
|
||||||
|
|
||||||
|
|
||||||
def search(query, domain=None, max_results=MAX_RESULTS):
|
|
||||||
"""Main search function with auto-domain detection"""
|
|
||||||
if domain is None:
|
|
||||||
domain = detect_domain(query)
|
|
||||||
|
|
||||||
config = CSV_CONFIG.get(domain, CSV_CONFIG["style"])
|
|
||||||
filepath = DATA_DIR / config["file"]
|
|
||||||
|
|
||||||
if not filepath.exists():
|
|
||||||
return {"error": f"File not found: {filepath}", "domain": domain}
|
|
||||||
|
|
||||||
results = _search_csv(filepath, config["search_cols"], config["output_cols"], query, max_results)
|
|
||||||
|
|
||||||
return {
|
|
||||||
"domain": domain,
|
|
||||||
"query": query,
|
|
||||||
"file": config["file"],
|
|
||||||
"count": len(results),
|
|
||||||
"results": results
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def search_stack(query, stack, max_results=MAX_RESULTS):
|
|
||||||
"""Search stack-specific guidelines"""
|
|
||||||
if stack not in STACK_CONFIG:
|
|
||||||
return {"error": f"Unknown stack: {stack}. Available: {', '.join(AVAILABLE_STACKS)}"}
|
|
||||||
|
|
||||||
filepath = DATA_DIR / STACK_CONFIG[stack]["file"]
|
|
||||||
|
|
||||||
if not filepath.exists():
|
|
||||||
return {"error": f"Stack file not found: {filepath}", "stack": stack}
|
|
||||||
|
|
||||||
results = _search_csv(filepath, _STACK_COLS["search_cols"], _STACK_COLS["output_cols"], query, max_results)
|
|
||||||
|
|
||||||
return {
|
|
||||||
"domain": "stack",
|
|
||||||
"stack": stack,
|
|
||||||
"query": query,
|
|
||||||
"file": STACK_CONFIG[stack]["file"],
|
|
||||||
"count": len(results),
|
|
||||||
"results": results
|
|
||||||
}
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,114 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""
|
|
||||||
UI/UX Pro Max Search - BM25 search engine for UI/UX style guides
|
|
||||||
Usage: python search.py "<query>" [--domain <domain>] [--stack <stack>] [--max-results 3]
|
|
||||||
python search.py "<query>" --design-system [-p "Project Name"]
|
|
||||||
python search.py "<query>" --design-system --persist [-p "Project Name"] [--page "dashboard"]
|
|
||||||
|
|
||||||
Domains: style, prompt, color, chart, landing, product, ux, typography
|
|
||||||
Stacks: html-tailwind, react, nextjs
|
|
||||||
|
|
||||||
Persistence (Master + Overrides pattern):
|
|
||||||
--persist Save design system to design-system/MASTER.md
|
|
||||||
--page Also create a page-specific override file in design-system/pages/
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import sys
|
|
||||||
import io
|
|
||||||
from core import CSV_CONFIG, AVAILABLE_STACKS, MAX_RESULTS, search, search_stack
|
|
||||||
from design_system import generate_design_system, persist_design_system
|
|
||||||
|
|
||||||
# Force UTF-8 for stdout/stderr to handle emojis on Windows (cp1252 default)
|
|
||||||
if sys.stdout.encoding and sys.stdout.encoding.lower() != 'utf-8':
|
|
||||||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
|
||||||
if sys.stderr.encoding and sys.stderr.encoding.lower() != 'utf-8':
|
|
||||||
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
|
|
||||||
|
|
||||||
|
|
||||||
def format_output(result):
|
|
||||||
"""Format results for Claude consumption (token-optimized)"""
|
|
||||||
if "error" in result:
|
|
||||||
return f"Error: {result['error']}"
|
|
||||||
|
|
||||||
output = []
|
|
||||||
if result.get("stack"):
|
|
||||||
output.append(f"## UI Pro Max Stack Guidelines")
|
|
||||||
output.append(f"**Stack:** {result['stack']} | **Query:** {result['query']}")
|
|
||||||
else:
|
|
||||||
output.append(f"## UI Pro Max Search Results")
|
|
||||||
output.append(f"**Domain:** {result['domain']} | **Query:** {result['query']}")
|
|
||||||
output.append(f"**Source:** {result['file']} | **Found:** {result['count']} results\n")
|
|
||||||
|
|
||||||
for i, row in enumerate(result['results'], 1):
|
|
||||||
output.append(f"### Result {i}")
|
|
||||||
for key, value in row.items():
|
|
||||||
value_str = str(value)
|
|
||||||
if len(value_str) > 300:
|
|
||||||
value_str = value_str[:300] + "..."
|
|
||||||
output.append(f"- **{key}:** {value_str}")
|
|
||||||
output.append("")
|
|
||||||
|
|
||||||
return "\n".join(output)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser(description="UI Pro Max Search")
|
|
||||||
parser.add_argument("query", help="Search query")
|
|
||||||
parser.add_argument("--domain", "-d", choices=list(CSV_CONFIG.keys()), help="Search domain")
|
|
||||||
parser.add_argument("--stack", "-s", choices=AVAILABLE_STACKS, help="Stack-specific search (html-tailwind, react, nextjs)")
|
|
||||||
parser.add_argument("--max-results", "-n", type=int, default=MAX_RESULTS, help="Max results (default: 3)")
|
|
||||||
parser.add_argument("--json", action="store_true", help="Output as JSON")
|
|
||||||
# Design system generation
|
|
||||||
parser.add_argument("--design-system", "-ds", action="store_true", help="Generate complete design system recommendation")
|
|
||||||
parser.add_argument("--project-name", "-p", type=str, default=None, help="Project name for design system output")
|
|
||||||
parser.add_argument("--format", "-f", choices=["ascii", "markdown"], default="ascii", help="Output format for design system")
|
|
||||||
# Persistence (Master + Overrides pattern)
|
|
||||||
parser.add_argument("--persist", action="store_true", help="Save design system to design-system/MASTER.md (creates hierarchical structure)")
|
|
||||||
parser.add_argument("--page", type=str, default=None, help="Create page-specific override file in design-system/pages/")
|
|
||||||
parser.add_argument("--output-dir", "-o", type=str, default=None, help="Output directory for persisted files (default: current directory)")
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
# Design system takes priority
|
|
||||||
if args.design_system:
|
|
||||||
result = generate_design_system(
|
|
||||||
args.query,
|
|
||||||
args.project_name,
|
|
||||||
args.format,
|
|
||||||
persist=args.persist,
|
|
||||||
page=args.page,
|
|
||||||
output_dir=args.output_dir
|
|
||||||
)
|
|
||||||
print(result)
|
|
||||||
|
|
||||||
# Print persistence confirmation
|
|
||||||
if args.persist:
|
|
||||||
project_slug = args.project_name.lower().replace(' ', '-') if args.project_name else "default"
|
|
||||||
print("\n" + "=" * 60)
|
|
||||||
print(f"✅ Design system persisted to design-system/{project_slug}/")
|
|
||||||
print(f" 📄 design-system/{project_slug}/MASTER.md (Global Source of Truth)")
|
|
||||||
if args.page:
|
|
||||||
page_filename = args.page.lower().replace(' ', '-')
|
|
||||||
print(f" 📄 design-system/{project_slug}/pages/{page_filename}.md (Page Overrides)")
|
|
||||||
print("")
|
|
||||||
print(f"📖 Usage: When building a page, check design-system/{project_slug}/pages/[page].md first.")
|
|
||||||
print(f" If exists, its rules override MASTER.md. Otherwise, use MASTER.md.")
|
|
||||||
print("=" * 60)
|
|
||||||
# Stack search
|
|
||||||
elif args.stack:
|
|
||||||
result = search_stack(args.query, args.stack, args.max_results)
|
|
||||||
if args.json:
|
|
||||||
import json
|
|
||||||
print(json.dumps(result, indent=2, ensure_ascii=False))
|
|
||||||
else:
|
|
||||||
print(format_output(result))
|
|
||||||
# Domain search
|
|
||||||
else:
|
|
||||||
result = search(args.query, args.domain, args.max_results)
|
|
||||||
if args.json:
|
|
||||||
import json
|
|
||||||
print(json.dumps(result, indent=2, ensure_ascii=False))
|
|
||||||
else:
|
|
||||||
print(format_output(result))
|
|
||||||
@@ -4,21 +4,29 @@ const path = require('path');
|
|||||||
// 1. Get arguments
|
// 1. Get arguments
|
||||||
const args = process.argv.slice(2);
|
const args = process.argv.slice(2);
|
||||||
if (args.length < 2) {
|
if (args.length < 2) {
|
||||||
console.error("Usage: node sync_vikunja.js <task_id> <message_or_commit>");
|
console.error("Usage:");
|
||||||
|
console.error(" node sync_vikunja.js <task_id> <message> # Update existing task");
|
||||||
|
console.error(" node sync_vikunja.js create \"<title>\" \"<message>\" # Create new task");
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
const taskId = args[0];
|
const commandOrId = args[0];
|
||||||
const message = args[1];
|
const message = args[1];
|
||||||
|
|
||||||
// 2. Load configuration from .env.agent
|
// 2. Load configuration from .env.agent
|
||||||
const envPath = path.join(__dirname, '../config/.env.agent');
|
const envPath = path.join(__dirname, '../../.env.agent');
|
||||||
if (!fs.existsSync(envPath)) {
|
const fallbackEnvPath = path.join(__dirname, '../config/.env.agent');
|
||||||
console.error("Error: .agent/config/.env.agent file not found. Please create it from the template.");
|
|
||||||
|
let envContent = '';
|
||||||
|
if (fs.existsSync(envPath)) {
|
||||||
|
envContent = fs.readFileSync(envPath, 'utf8');
|
||||||
|
} else if (fs.existsSync(fallbackEnvPath)) {
|
||||||
|
envContent = fs.readFileSync(fallbackEnvPath, 'utf8');
|
||||||
|
} else {
|
||||||
|
console.error("Error: .env.agent file not found.");
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
const envContent = fs.readFileSync(envPath, 'utf8');
|
|
||||||
const env = {};
|
const env = {};
|
||||||
envContent.split('\n').forEach(line => {
|
envContent.split('\n').forEach(line => {
|
||||||
const match = line.match(/^([^#=]+)="?(.*?)"?$/);
|
const match = line.match(/^([^#=]+)="?(.*?)"?$/);
|
||||||
@@ -29,6 +37,7 @@ envContent.split('\n').forEach(line => {
|
|||||||
|
|
||||||
const apiUrl = env.VIKUNJA_API_URL;
|
const apiUrl = env.VIKUNJA_API_URL;
|
||||||
const apiToken = env.VIKUNJA_API_TOKEN;
|
const apiToken = env.VIKUNJA_API_TOKEN;
|
||||||
|
const projectId = env.VIKUNJA_PROJECT_ID || 14;
|
||||||
|
|
||||||
if (!apiUrl || !apiToken || apiUrl.includes('[YOUR_')) {
|
if (!apiUrl || !apiToken || apiUrl.includes('[YOUR_')) {
|
||||||
console.error("Error: VIKUNJA_API_URL or VIKUNJA_API_TOKEN is not configured correctly in .env.agent.");
|
console.error("Error: VIKUNJA_API_URL or VIKUNJA_API_TOKEN is not configured correctly in .env.agent.");
|
||||||
@@ -40,52 +49,59 @@ if (env.AGENT_OPERATING_MODE === "TEST") {
|
|||||||
process.exit(0);
|
process.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3. Helper to make API calls using native fetch (Node 18+)
|
const FETCH_OPTS = {
|
||||||
async function markTaskDoneAndComment(taskId, message) {
|
headers: {
|
||||||
|
'Authorization': `Bearer ${apiToken}`,
|
||||||
|
'Content-Type': 'application/json'
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
async function createTaskAndComment(title, message) {
|
||||||
try {
|
try {
|
||||||
console.log(`Connecting to Vikunja API for Task ${taskId}...`);
|
console.log(`Creating new task in Project ${projectId}...`);
|
||||||
|
const createRes = await fetch(`${apiUrl}/projects/${projectId}/tasks`, {
|
||||||
// Update task status to done
|
|
||||||
const patchRes = await fetch(`${apiUrl}/tasks/${taskId}`, {
|
|
||||||
method: 'POST', // Vikunja uses POST to task endpoint for updates
|
|
||||||
headers: {
|
|
||||||
'Authorization': `Bearer ${apiToken}`,
|
|
||||||
'Content-Type': 'application/json'
|
|
||||||
},
|
|
||||||
body: JSON.stringify({ done: true })
|
|
||||||
});
|
|
||||||
|
|
||||||
if (!patchRes.ok) {
|
|
||||||
throw new Error(`Failed to mark task as done: ${patchRes.statusText}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log(`✅ Task ${taskId} successfully marked as Done.`);
|
|
||||||
|
|
||||||
// Add comment
|
|
||||||
const commentRes = await fetch(`${apiUrl}/tasks/${taskId}/comments`, {
|
|
||||||
method: 'PUT',
|
method: 'PUT',
|
||||||
headers: {
|
...FETCH_OPTS,
|
||||||
'Authorization': `Bearer ${apiToken}`,
|
|
||||||
'Content-Type': 'application/json'
|
|
||||||
},
|
|
||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
text: `[Agent Automator] Phase completed.\nReason/Hash: ${message}`
|
title: title,
|
||||||
|
description: message,
|
||||||
|
done: true
|
||||||
})
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!commentRes.ok) {
|
if (!createRes.ok) throw new Error(`Create failed: ${createRes.statusText}`);
|
||||||
console.error(`Warning: Task marked as done, but failed to attach comment: ${commentRes.statusText}`);
|
const task = await createRes.json();
|
||||||
} else {
|
console.log(`✅ Task created and marked Done! ID: #${task.id}`);
|
||||||
console.log("✅ Comment attached successfully.");
|
} catch (e) {
|
||||||
}
|
console.error("❌ Failed:", e.message);
|
||||||
|
|
||||||
} catch (error) {
|
|
||||||
console.error("❌ Failed to sync with Vikunja:");
|
|
||||||
// Mask the token if it somehow leaks via error message
|
|
||||||
const secureErr = error.message.replace(new RegExp(apiToken, 'g'), "********");
|
|
||||||
console.error(secureErr);
|
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
markTaskDoneAndComment(taskId, message);
|
async function markTaskDoneAndComment(taskId, message) {
|
||||||
|
try {
|
||||||
|
console.log(`Updating Task ${taskId}...`);
|
||||||
|
const patchRes = await fetch(`${apiUrl}/tasks/${taskId}`, {
|
||||||
|
method: 'POST',
|
||||||
|
...FETCH_OPTS,
|
||||||
|
body: JSON.stringify({ done: true })
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!patchRes.ok) throw new Error(`Update failed: ${patchRes.statusText}`);
|
||||||
|
console.log(`✅ Task ${taskId} marked as Done.`);
|
||||||
|
|
||||||
|
await fetch(`${apiUrl}/tasks/${taskId}/comments`, {
|
||||||
|
method: 'PUT', ...FETCH_OPTS, body: JSON.stringify({ text: `[Agent Automator] Phase completed.\nReason/Hash: ${message}` })
|
||||||
|
});
|
||||||
|
console.log("✅ Comment attached.");
|
||||||
|
} catch (e) {
|
||||||
|
console.error("❌ Failed:", e.message);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (commandOrId === "create") {
|
||||||
|
createTaskAndComment(message, args[2] || "Task fully completed.");
|
||||||
|
} else {
|
||||||
|
markTaskDoneAndComment(commandOrId, message);
|
||||||
|
}
|
||||||
|
|||||||
4
.gitmodules
vendored
4
.gitmodules
vendored
@@ -30,3 +30,7 @@
|
|||||||
path = .agent/vendor/mini-swe
|
path = .agent/vendor/mini-swe
|
||||||
url = https://github.com/swe-agent/swe-agent.git
|
url = https://github.com/swe-agent/swe-agent.git
|
||||||
branch = main
|
branch = main
|
||||||
|
[submodule "openclaude"]
|
||||||
|
path = openclaude
|
||||||
|
url = https://github.com/Gitlawb/openclaude.git
|
||||||
|
branch = main
|
||||||
|
|||||||
@@ -1,40 +1,45 @@
|
|||||||
---
|
---
|
||||||
phase: 00-System Setup
|
phase: 00-initialization
|
||||||
task: 0
|
task: 0
|
||||||
total_tasks: 0
|
total_tasks: 0
|
||||||
status: paused
|
status: paused
|
||||||
last_updated: 2026-03-29T19:40:00Z
|
last_updated: 2026-04-05T00:51:15+09:00
|
||||||
---
|
---
|
||||||
|
|
||||||
<current_state>
|
<current_state>
|
||||||
우리는 `/gsd-new-project` 에 준하는 마스터 부트스트랩 프로젝트 초기화 작업을 완벽하게 끝냈습니다.
|
Completed project initialization and architecture planning.
|
||||||
모든 SSOT 파일(PROJECT.md, ARCHITECTURE.md, config.json)이 수립되었고, 메인 위협이었던 서브모듈 동기화 문제를 해결할 마스터 허브 통제 스위치(`sync_vendors`)까지 삽입되어 Git에 영구 커밋된 상태입니다. 정확히 1단계 로드맵 구축을 시작하기 직전의 출발선에 서 있습니다.
|
GSD project state (.planning/PROJECT.md and config.json) corresponds to the 'Dual-Orchestration AI Assistant' structure using a 2+0 GPU division.
|
||||||
|
Right before starting Phase 1 planning.
|
||||||
</current_state>
|
</current_state>
|
||||||
|
|
||||||
<completed_work>
|
<completed_work>
|
||||||
- `.planning/PROJECT.md` 제정 (Zero-Pollution 및 Git 원칙 헌법)
|
- Configured git repository, remote (`Variet/variet_llm`), and Vikunja
|
||||||
- `.planning/codebase/` 내부 아키텍처 및 스택 매핑
|
- Cleaned up previous `agent_guide` config
|
||||||
- `.agent/scripts/sync_vendors.sh/.bat` (마스터 배포자 유틸) 생성
|
- Wrote `.planning/PROJECT.md` outlining the 3-Tier model strategy and the requirements
|
||||||
- 19개의 추출된 최고급 스킬셋을 Git 트래킹에 편입시켜 100% 클론 복제성 확보
|
- Wrote `.planning/config.json`
|
||||||
|
- Committed everything to git
|
||||||
</completed_work>
|
</completed_work>
|
||||||
|
|
||||||
<remaining_work>
|
<remaining_work>
|
||||||
- `/gsd-plan-phase 1` 을 실행하여 실제 개발 로드맵 설계 시작
|
- Plan Phase 1: Machine A LLM inference server setup and Hot-swap scripts (Fast/Balanced/Deep)
|
||||||
|
- Plan Phase 2: Machine B VS Code Extension
|
||||||
|
- Plan Phase 3: Machine B Discord Bot
|
||||||
|
- Plan Phase 4: MCP Tool integration
|
||||||
</remaining_work>
|
</remaining_work>
|
||||||
|
|
||||||
<decisions_made>
|
<decisions_made>
|
||||||
- **Master-Satellite 배포 위상:** 이 `new_gene` 저장소에서만 오픈소스를 동기화하고, 다른 파생 레포지토리들은 어떤 스크립트도 없이 `git pull` 만으로 무기를 상속받는 완벽한 Zero-Pollution 구조 채택.
|
- Decided to use 2+0 GPU architecture because it gives single-user coding requests maximum throughput (50-80 t/s) while keeping orchestration neatly on Machine B.
|
||||||
- 어떠한 글로벌 패키지도 허용하지 않고 오직 로컬 `.agent/env` 에 모든 기능을 캡슐화.
|
- Picked a 3-tier model strategy: Gemma4 26B (Fast), Qwen 35B (Balanced), Qwen 122B (Deep).
|
||||||
</decisions_made>
|
</decisions_made>
|
||||||
|
|
||||||
<blockers>
|
<blockers>
|
||||||
None.
|
- None.
|
||||||
</blockers>
|
</blockers>
|
||||||
|
|
||||||
<context>
|
<context>
|
||||||
프로젝트 토대는 더할 나위 없이 단단해졌습니다. 다음 에이전트는 이 파일을 읽자마자 혼란 없이 완벽한 컨텍스트를 흡수한 채, 사용자에게 첫 번째 기능을 묻는 영광스러운 역할을 수행하게 될 것입니다.
|
We transitioned from pure Llama.cpp tuning to architectural layout. The logic for how tools are routed has been clarified (LLM thinks on Machine A, tools are executed locally on Machine B). Next logical step is to execute Phase 1 (infrastructure and hot swap on Machine A).
|
||||||
</context>
|
</context>
|
||||||
|
|
||||||
<next_action>
|
<next_action>
|
||||||
Start with: Ask the user to run `/gsd-plan-phase 1` to define the roadmap.
|
Start with: `/gsd-plan-phase 1` to design the Machine A startup and hot swap mechanism.
|
||||||
</next_action>
|
</next_action>
|
||||||
|
|||||||
@@ -1,40 +0,0 @@
|
|||||||
---
|
|
||||||
phase: 00-System Setup
|
|
||||||
task: 0
|
|
||||||
total_tasks: 0
|
|
||||||
status: paused
|
|
||||||
last_updated: 2026-03-29T19:40:00Z
|
|
||||||
---
|
|
||||||
|
|
||||||
<current_state>
|
|
||||||
우리는 `/gsd-new-project` 에 준하는 마스터 부트스트랩 프로젝트 초기화 작업을 완벽하게 끝냈습니다.
|
|
||||||
모든 SSOT 파일(PROJECT.md, ARCHITECTURE.md, config.json)이 수립되었고, 메인 위협이었던 서브모듈 동기화 문제를 해결할 마스터 허브 통제 스위치(`sync_vendors`)까지 삽입되어 Git에 영구 커밋된 상태입니다. 정확히 1단계 로드맵 구축을 시작하기 직전의 출발선에 서 있습니다.
|
|
||||||
</current_state>
|
|
||||||
|
|
||||||
<completed_work>
|
|
||||||
- `.planning/PROJECT.md` 제정 (Zero-Pollution 및 Git 원칙 헌법)
|
|
||||||
- `.planning/codebase/` 내부 아키텍처 및 스택 매핑
|
|
||||||
- `.agent/scripts/sync_vendors.sh/.bat` (마스터 배포자 유틸) 생성
|
|
||||||
- 19개의 추출된 최고급 스킬셋을 Git 트래킹에 편입시켜 100% 클론 복제성 확보
|
|
||||||
</completed_work>
|
|
||||||
|
|
||||||
<remaining_work>
|
|
||||||
- `/gsd-plan-phase 1` 을 실행하여 실제 개발 로드맵 설계 시작
|
|
||||||
</remaining_work>
|
|
||||||
|
|
||||||
<decisions_made>
|
|
||||||
- **Master-Satellite 배포 위상:** 이 `new_gene` 저장소에서만 오픈소스를 동기화하고, 다른 파생 레포지토리들은 어떤 스크립트도 없이 `git pull` 만으로 무기를 상속받는 완벽한 Zero-Pollution 구조 채택.
|
|
||||||
- 어떠한 글로벌 패키지도 허용하지 않고 오직 로컬 `.agent/env` 에 모든 기능을 캡슐화.
|
|
||||||
</decisions_made>
|
|
||||||
|
|
||||||
<blockers>
|
|
||||||
None.
|
|
||||||
</blockers>
|
|
||||||
|
|
||||||
<context>
|
|
||||||
프로젝트 토대는 더할 나위 없이 단단해졌습니다. 다음 에이전트는 이 파일을 읽자마자 혼란 없이 완벽한 컨텍스트를 흡수한 채, 사용자에게 첫 번째 기능을 묻는 영광스러운 역할을 수행하게 될 것입니다.
|
|
||||||
</context>
|
|
||||||
|
|
||||||
<next_action>
|
|
||||||
Start with: Ask the user to run `/gsd-plan-phase 1` to define the roadmap.
|
|
||||||
</next_action>
|
|
||||||
@@ -1,25 +0,0 @@
|
|||||||
{
|
|
||||||
"version": "1.0",
|
|
||||||
"timestamp": "2026-03-29T19:40:00Z",
|
|
||||||
"phase": "00-setup",
|
|
||||||
"phase_name": "Project Initialization",
|
|
||||||
"phase_dir": ".planning",
|
|
||||||
"plan": 0,
|
|
||||||
"task": 0,
|
|
||||||
"total_tasks": 0,
|
|
||||||
"status": "paused",
|
|
||||||
"completed_tasks": [],
|
|
||||||
"remaining_tasks": [],
|
|
||||||
"blockers": [],
|
|
||||||
"human_actions_pending": [],
|
|
||||||
"decisions": [
|
|
||||||
{
|
|
||||||
"decision": "Master-Satellite Deployment",
|
|
||||||
"rationale": "Ensures downstream repos remain 100% Zero-Pollution by distributing flattened skills natively via Git Push",
|
|
||||||
"phase": "0"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"uncommitted_files": [],
|
|
||||||
"next_action": "Run /gsd-plan-phase 1 to begin roadmap construction",
|
|
||||||
"context_notes": "Zero-pollution foundation is perfectly solid. Awaiting Phase 1 initialization."
|
|
||||||
}
|
|
||||||
@@ -1,49 +0,0 @@
|
|||||||
# Project Context: Zero-Pollution Agent Bootstrap Kit
|
|
||||||
|
|
||||||
## What This Is
|
|
||||||
`new_gene`은 향후 모든 AI 에이전트 개발 및 자동화 프로젝트에 포크(Fork)되거나 복제되어 즉각적으로 활용되는 **'완전 자율형·무오염(Zero-Pollution) 부트스트랩 마스터 템플릿'** 입니다.
|
|
||||||
|
|
||||||
## Core Value
|
|
||||||
코딩을 시작할 때 발생하는 세팅 오류, 프롬프트 해킹, 메인 브랜치 훼손, API 키 유출을 원천적으로 막습니다. 누구나 이 저장소만 클론(`git clone`) 받으면 환경 설정이나 별도의 복잡한 스크립트 실행 없이 19개의 최고급 개발 스킬(TDD, Git Worktree)과 GSD 오케스트레이터를 즉시 꺼내 쓸 수 있는 완벽한 턴키(Turn-key) 생태계를 구축합니다.
|
|
||||||
|
|
||||||
## Success Criteria
|
|
||||||
1. 사용자가 단 1개의 글로벌 패키지를 설치하지 않아도 로컬(`.agent/env/node_modules`)만을 통해 100% 작동해야 함.
|
|
||||||
2. 터미널 명령어나 히스토리에 API 토큰이 노출되지 않고 자동 동기화(Vikunja/Gitea)가 이루어져야 함.
|
|
||||||
3. 에이전트는 코딩 전 반드시 TDD 가설을 세우고, 서브 브랜치(Worktree)로 격리되어 안전하게 작업해야 함.
|
|
||||||
4. **[Key Constraint]** 템플릿의 모든 아키텍처는 `git clone`만으로 완전한 작업 흐름이 이어져야 하며, 각 파생 프로젝트들은 `git pull` 명령어 단 한 번만으로 최신 무기와 스킬(Agent 엔진)이 완벽히 업데이트되어야 함.
|
|
||||||
|
|
||||||
## Requirements
|
|
||||||
|
|
||||||
### Validated
|
|
||||||
- ✓ [Zero-Pollution] 모든 스킬(`superpowers`, `obsidian-skills`)이 글로벌 영역이 아닌 내부(`.agent/skills/`)로 캡슐화되어 직접 Git 트래킹을 받음.
|
|
||||||
- ✓ [API Bridge] `sync_wiki.js`와 `sync_vikunja.js`를 통해 토큰 노출 없이 서버와 통신 구조 확보.
|
|
||||||
- ✓ [Persistence] `claude-mem` (SQLite MCP) 기반의 오답노트 기억 능력 탑재.
|
|
||||||
|
|
||||||
### Active
|
|
||||||
- [x] `git clone`/`git pull`만으로 19개 스킬 즉각 로드 (Master Sync Hub 토폴로지 확립)
|
|
||||||
- [ ] GSD Phase 계획과 실제 코드 실행 간 오차를 자동으로 검증할 브릿지 시스템 보완
|
|
||||||
- [ ] `.env.agent`만 셋업하면 모든 것이 알아서 연결되는 완전 자율화 체계 구축 지속
|
|
||||||
|
|
||||||
### Out of Scope (배포 제약)
|
|
||||||
- **하위 프로젝트(Satellite Repo)에서의 오픈소스 서브모듈(Vendor) 직접 업데이트 행위 원천 금지.** (오직 이 Master 템플릿의 `sync_vendors` 스크립트만이 서브모듈의 버전을 평탄화 추출/번역하여 100% 검증된 정적 파일(`.agent/skills/`)로 Git 트래킹합니다.)
|
|
||||||
- 하위 패키지에 글로벌 NPM 패키지 설치 요구 및 부트스트랩 스크립트 실행 강요 금지 (오직 `git pull` 하나만으로 마스터의 스킬셋 변동분을 수동적으로 상속받을 것).
|
|
||||||
- `/start` 및 `/end` 같은 과거 유산(Legacy) 도입 금지
|
|
||||||
|
|
||||||
---
|
|
||||||
*Last updated: 2026-03-29 after Phase 1 initialization*
|
|
||||||
|
|
||||||
## Evolution
|
|
||||||
This document evolves at phase transitions and milestone boundaries.
|
|
||||||
|
|
||||||
**After each phase transition** (via `/gsd-transition`):
|
|
||||||
1. Requirements invalidated? → Move to Out of Scope with reason
|
|
||||||
2. Requirements validated? → Move to Validated with phase reference
|
|
||||||
3. New requirements emerged? → Add to Active
|
|
||||||
4. Decisions to log? → Add to Key Decisions
|
|
||||||
5. "What This Is" still accurate? → Update if drifted
|
|
||||||
|
|
||||||
**After each milestone** (via `/gsd-complete-milestone`):
|
|
||||||
1. Full review of all sections
|
|
||||||
2. Core Value check — still the right priority?
|
|
||||||
3. Audit Out of Scope — reasons still valid?
|
|
||||||
4. Update Context with current state
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
# Roadmap
|
|
||||||
|
|
||||||
## Phase 1: Zero-Pollution Pipeline Stabilization
|
|
||||||
**Status:** Complete (2026-03-30)
|
|
||||||
**Goal:** Fix the Master-Satellite synchronization pipeline to natively extract GSD skills, UI data, and Python MCP dependencies so that satellite repositories bootstrap via zero-click.
|
|
||||||
|
|
||||||
### Requirements
|
|
||||||
- REQ-01: Bootstrap script must auto-install Python dependencies for MCP tools across satellites.
|
|
||||||
- REQ-02: Master sync script must invoke `get-shit-done-cc` local installation to native `.agent/skills` and ensure Git tracking.
|
|
||||||
- REQ-03: Master sync script must invoke `uipro update` and commit changes.
|
|
||||||
- REQ-04: The `.gitignore` generated by `extract_skills.js` must safely whitelist the `gsd-*/` skill directories.
|
|
||||||
@@ -1,16 +0,0 @@
|
|||||||
# Codebase Architecture (`ARCHITECTURE.md`)
|
|
||||||
|
|
||||||
## Conceptual Core
|
|
||||||
The project employs a **Master-Satellite Zero-Pollution Orchestration Model**.
|
|
||||||
The architecture is designed to host local environments mapped centrally for LLM / Developer agents so that their capabilities and tracking modules do not leak into the developer's global OS environment.
|
|
||||||
|
|
||||||
## Design Patterns & Layers
|
|
||||||
1. **Master Repository Role (`new_gene`)**: Synchronizes third-party Submodules, flattens them via `.agent/scripts/extract_skills.js` and `.agent/scripts/sync_vendors.bat`, and seeds localized GSD skills via local bin instantiation.
|
|
||||||
2. **Satellite Project Consumption**: Satellite repositories pull the tracked objects (like `.agent/get-shit-done/` binaries and `.agent/skills/gsd-*/` files), run `bootstrap.bat`, and safely isolate all code environments.
|
|
||||||
|
|
||||||
## Execution Entry Points
|
|
||||||
- Master Sync: `.agent/scripts/sync_vendors.bat/.sh` initializes NPM bounds, installs `uipro`/`get-shit-done-cc` strictly inside `.agent/env/node_modules/`, and runs them from the root via `.agent/env/node_modules/.bin/...` to safely isolate state.
|
|
||||||
- Satellite Install: `bootstrap.bat/.sh` executes the runtime scripts sequentially (Git modules -> node env -> Python `.requirements` instantiation using a developer-provided `$AGENT_PYTHON_PATH`).
|
|
||||||
|
|
||||||
## Data Flow
|
|
||||||
The data flow travels exclusively via file IO (File reading -> Markdown modification) controlled by the Antigravity Agent framework executing `Task()` components or user IDE plugins. Git actions (add/commit/push) ensure cross-device immutability.
|
|
||||||
@@ -1,15 +0,0 @@
|
|||||||
# Codebase Concerns (`CONCERNS.md`)
|
|
||||||
|
|
||||||
## Known Technical Debt & Fragile Areas
|
|
||||||
|
|
||||||
1. **UTF-8 Support on PowerShell**:
|
|
||||||
The `sync_vendors.bat` script uses `chcp 65001 >nul` to support symbols and emojis (`🔄`, `⛏️`, `🌐`). However, invoking this `.bat` script directly from the newer `powershell.exe` execution layer occasionally mangles the output characters or breaks command parsing. Developers manually validating `sync_vendors.bat` in PowerShell may observe syntax errors that do not appear when the script is run from a standard CMD session.
|
|
||||||
|
|
||||||
2. **Path Encoding Depth Limitations**:
|
|
||||||
The GSD implementation maps deeply nested `.agent/vendor/**` repositories and heavily duplicates their structure into `.agent/skills/`. On Windows systems, where paths can exceed the 260-character limit, this can theoretically introduce silent file truncation if the local `.agent` environment scales beyond expectations.
|
|
||||||
|
|
||||||
3. **Submodule Divergences**:
|
|
||||||
The primary master hook uses `git submodule update --remote --merge`. Tracking upstream mains from multiple open-source repositories means breaking changes authored dynamically by upstream maintainers (e.g., `browser_use` rewriting its setup commands or `obsidian-skills` evolving) could cascade into local logic breaks, requiring a robust `translate_skills.js` sync layer to patch integration shifts.
|
|
||||||
|
|
||||||
4. **Environment Abstraction Leaks**:
|
|
||||||
While Zero-Pollution aims to block system pollution, `bootstrap.bat` uses the globally configured `AGENT_PYTHON_PATH` to install each component's `requirements.txt`. There is an implicit assumption that this Python executable belongs to an isolated virtual environment (`venv`). The architecture relies heavily on developers not supplying a system-wide (root) Python executable.
|
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
# Codebase Conventions (`CONVENTIONS.md`)
|
|
||||||
|
|
||||||
## Code Style & Scripts
|
|
||||||
- **Shell / Bash**: Ensure standard `$?` exit evaluation handling with `if [ $? -ne 0 ]; then exit 1; fi`.
|
|
||||||
- **Batch Scripting**: Ensure `%errorlevel%` mapping with `if %errorlevel% neq 0 ( exit /b %errorlevel% )`. Uses `chcp 65001 >nul` for cross-platform UTF-8 emoji support.
|
|
||||||
- **Node.js Scripts**: Typically wrapped recursively checking properties (`fs.existsSync`, `fs.readdirSync`), enforcing zero-pollution (ignoring flattened dependencies via explicit `fs.writeFileSync`).
|
|
||||||
|
|
||||||
## Architectural Patterns
|
|
||||||
- **Zero-Pollution Rule**: Never install global node_modules (`-g`) or system modifications. Path routing is managed securely using execution strings formatted like `path/to/local/bin` instead of relying on `$PATH`.
|
|
||||||
- **Idempotency Rule**: The bootstrap processes (`bootstrap.sh`, `bootstrap.bat`) are designed to be run hundreds of times safely, using `if not exist` checks before taking any structural action.
|
|
||||||
|
|
||||||
## Error Handling
|
|
||||||
- Terminal outputs are robust, mapping explicit phases like `[1/5]`, `[2/5]` to terminal stdout.
|
|
||||||
- Execution halts strictly on critical dependency update failures.
|
|
||||||
@@ -1,17 +0,0 @@
|
|||||||
# Codebase Integrations (`INTEGRATIONS.md`)
|
|
||||||
|
|
||||||
## Webhooks & APIs
|
|
||||||
- Currently, this project relies purely on Git synchronisation and CLI execution. Network activity is limited to downloading external packages (`npm install`, `pip install`, `git clone/pull`).
|
|
||||||
- Once MCP tools are invoked, `claude-mem` likely interacts with SQLite logic or LLM memory endpoints depending on its runtime schema. `browser-use` interfaces with web targets directly.
|
|
||||||
|
|
||||||
## Databases & Persistence
|
|
||||||
- There are no central RDBMS or NoSQL layers native to this orchestration layer.
|
|
||||||
- **Knowledge/Memory**: The project serves as an SSOT using markdown (`.planning/*`, `.agent/knowledge/*`).
|
|
||||||
- **MCP state persistence**: Relies on specific plugin's local DBs (e.g., SQLite for mem logs).
|
|
||||||
|
|
||||||
## Third-Party Authentication
|
|
||||||
- Configured by `.agent/config/.env.agent` which includes API keys (Vikunja Task ID, Gitea instances, etc.), though the current codebase only seeds and checks for its existence without directly invoking the APIs inside the shell scripts.
|
|
||||||
|
|
||||||
## Core External Tools
|
|
||||||
- **Git**: Primary persistence and vendor orchestration transport mechanism.
|
|
||||||
- **NPM & PIP**: Invoked asynchronously by bootstrap scripts to satisfy AI tool dependencies.
|
|
||||||
@@ -1,29 +0,0 @@
|
|||||||
# Codebase Stack (`STACK.md`)
|
|
||||||
|
|
||||||
## Languages
|
|
||||||
- **Shell / Batch Scripting**: Used heavily for the zero-pollution bootstrap scripts (`bootstrap.bat`, `bootstrap.sh`, `sync_vendors.bat`, `sync_vendors.sh`).
|
|
||||||
- **JavaScript (Node.js)**: Used for vendor extraction and translation scripts (`extract_skills.js`, `translate_skills.js`).
|
|
||||||
- **Python**: Required indirectly for running installed Python MCP servers (`browser_use`, `mini-swe`, `claude-mem`) during satellite bootstrapping.
|
|
||||||
|
|
||||||
## Runtimes & Frameworks
|
|
||||||
- **Node.js**: The `.agent/env` isolates local dependencies (e.g. `npm install`, `npx get-shit-done-cc`, `uipro-cli`).
|
|
||||||
- **Python**: Expected to be provided externally via user's `AGENT_PYTHON_PATH` to isolate plugin Python processes.
|
|
||||||
- **Agent Engines**: Target runtimes interacting with this environment include Google Gemini CLI / Antigravity, Claude Code, and Copilot.
|
|
||||||
|
|
||||||
## Core Dependencies
|
|
||||||
- **UI & Workflow Generative Frameworks**:
|
|
||||||
- `get-shit-done-cc`: CLI orchestration framework for generating GSD workflows.
|
|
||||||
- `uipro-cli`: Generates UI/UX specification data.
|
|
||||||
- **Git Submodules** (defined in `.gitmodules`):
|
|
||||||
- `.agent/vendor/superpowers`
|
|
||||||
- `.agent/knowledge/everything_claude`
|
|
||||||
- `.agent/knowledge/awesome_claude`
|
|
||||||
- `.agent/vendor/obsidian-skills`
|
|
||||||
- `.agent/services/claude-mem`
|
|
||||||
- `.agent/services/mcp-core`
|
|
||||||
- `.agent/vendor/browser_use`
|
|
||||||
- `.agent/vendor/mini-swe`
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
- `.agent/config/.env.agent`: Developer-specific agent configuration. Uses `.template` fallback.
|
|
||||||
- `PROJECT.md` & `ROADMAP.md`: Project-level planning and tracker configurations.
|
|
||||||
@@ -1,19 +0,0 @@
|
|||||||
# Codebase Structure (`STRUCTURE.md`)
|
|
||||||
|
|
||||||
## Directory Map
|
|
||||||
|
|
||||||
- **Root Files**
|
|
||||||
- `bootstrap.bat` / `bootstrap.sh`: Primary runtime orchestrators for setting up the node/python environment.
|
|
||||||
- `.gitignore` & `.gitmodules`: Repository definitions protecting the ecosystem from pollution.
|
|
||||||
|
|
||||||
- **`.agent/` (The Enclave)**
|
|
||||||
- `.agent/env/`: The isolated local node environment container. Has its own `package.json` avoiding merging issues with user code.
|
|
||||||
- `.agent/scripts/`: Home of the Master Sync orchestration rules (`sync_vendors`, `extract_skills.js`, `translate_skills.js`).
|
|
||||||
- `.agent/skills/`: Holds the flattened markdown files (`SKILL.md`) that agents interpret on system boot. Note `gsd-*/` is whitelisted here.
|
|
||||||
- `.agent/vendor/`: The raw checked-out git repository targets before any translations/flattening is parsed out of them.
|
|
||||||
- `.agent/config/`: Runtime configurations specific to the current physical machine (`.env.agent`).
|
|
||||||
- `.agent/knowledge/`: Stores raw unstructured domain context for explicit queries.
|
|
||||||
|
|
||||||
- **`.planning/` (GSD Execution)**
|
|
||||||
- Manages the Project State (`PROJECT.md`, `ROADMAP.md`, `STATE.md`).
|
|
||||||
- `phases/`: Artifact history holding individual plans (`01-PLAN.md`) and the verification/summary reports mapping their outcomes.
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# Codebase Testing (`TESTING.md`)
|
|
||||||
|
|
||||||
## Structural Verification
|
|
||||||
Given that this is an AI Framework Infrastructure project (Bootstrap Kit), formal unit testing via Jest/Vitest does not primarily apply to the root structure.
|
|
||||||
|
|
||||||
## GSD Audit Testing
|
|
||||||
Testing is defined strictly through the **Plan-Execute-Verify** cycle of the Get-Shit-Done (GSD) framework:
|
|
||||||
- **`[Acceptance Criteria]`**: Each phase and subtask has shell commands evaluated by `grep` or file existence checks ensuring conditions are logically provable (e.g. `grep "npx get-shit-done-cc" .agent/scripts/sync_vendors.bat`).
|
|
||||||
- **Verifiers**: The `.planning/phases/*-VERIFICATION.md` pattern uses dedicated checker agents to automatically validate system goals against the physical outcome on disk.
|
|
||||||
|
|
||||||
## Continuous Integration
|
|
||||||
At the moment, no GitHub Actions or standard CI test runners execute automated validation, as validation strictly hinges on local agent validation (`/gsd-verify-work` or `gsd-plan-checker` routines).
|
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
{
|
|
||||||
"mode": "yolo",
|
|
||||||
"granularity": "Standard",
|
|
||||||
"parallelization": true,
|
|
||||||
"commit_docs": true,
|
|
||||||
"model_profile": "balanced",
|
|
||||||
"workflow": {
|
|
||||||
"research": true,
|
|
||||||
"plan_check": true,
|
|
||||||
"verifier": true,
|
|
||||||
"nyquist_validation": true,
|
|
||||||
"auto_advance": true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,57 +0,0 @@
|
|||||||
# Phase 1: Zero-Pollution Pipeline Stabilization - Context
|
|
||||||
|
|
||||||
**Gathered:** 2026-03-30
|
|
||||||
**Status:** Ready for planning
|
|
||||||
**Source:** PRD Express Path (implementation_plan.md)
|
|
||||||
|
|
||||||
<domain>
|
|
||||||
## Phase Boundary
|
|
||||||
Fix the Master-Satellite synchronization pipeline to natively extract GSD skills, UI updates, and Python dependencies. This ensures that a zero-click `bootstrap.bat/.sh` works flawlessly across new workspaces without violating the Zero-Pollution architecture.
|
|
||||||
</domain>
|
|
||||||
|
|
||||||
<decisions>
|
|
||||||
## Implementation Decisions
|
|
||||||
|
|
||||||
### Zero-Click Python Dependencies
|
|
||||||
- Modify `bootstrap.bat` and `bootstrap.sh` to auto-detect and pip-install Python dependencies for MCP tools (`browser_use`, `claude-mem`, `mini-swe`).
|
|
||||||
|
|
||||||
### Master Sync Pipeline GSD Extraction
|
|
||||||
- Modify `sync_vendors.bat` and `sync_vendors.sh` to execute `npm update get-shit-done-cc uipro-cli`, `npx uipro update`, and `npx get-shit-done-cc --antigravity --local` inside `.agent/env`.
|
|
||||||
- Ensure output folders `.agent/get-shit-done/` and `.agent/skills/gsd-*/` are `git add`ed.
|
|
||||||
|
|
||||||
### Gitignore Rules for extracted skills
|
|
||||||
- Modify `.agent/scripts/extract_skills.js` to whitelist `!gsd-*/` in the generated `.gitignore` so Phase planning and GSD execution tools successfully persist to git.
|
|
||||||
|
|
||||||
### Agent's Discretion
|
|
||||||
- The method of Python virtual environment detection vs global python is left to the agent, though leveraging `AGENT_PYTHON_PATH` if specified is preferred.
|
|
||||||
</decisions>
|
|
||||||
|
|
||||||
<canonical_refs>
|
|
||||||
## Canonical References
|
|
||||||
|
|
||||||
**Downstream agents MUST read these before planning or implementing.**
|
|
||||||
|
|
||||||
### Zero Pollution Scripts
|
|
||||||
- `bootstrap.bat`
|
|
||||||
- `bootstrap.sh`
|
|
||||||
- `.agent/scripts/sync_vendors.bat`
|
|
||||||
- `.agent/scripts/sync_vendors.sh`
|
|
||||||
- `.agent/scripts/extract_skills.js`
|
|
||||||
|
|
||||||
</canonical_refs>
|
|
||||||
|
|
||||||
<specifics>
|
|
||||||
## Specific Ideas
|
|
||||||
- The `npx get-shit-done-cc --antigravity --local` command must be run relative to the project root or precisely routed, otherwise GSD commands won't appear sequentially.
|
|
||||||
|
|
||||||
</specifics>
|
|
||||||
|
|
||||||
<deferred>
|
|
||||||
## Deferred Ideas
|
|
||||||
None — PRD covers phase scope.
|
|
||||||
</deferred>
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
*Phase: 01-zero-pollution-pipeline-stabilization*
|
|
||||||
*Context gathered: 2026-03-30 via PRD Express Path*
|
|
||||||
@@ -1,143 +0,0 @@
|
|||||||
---
|
|
||||||
wave: 1
|
|
||||||
depends_on: []
|
|
||||||
files_modified:
|
|
||||||
- ".agent/scripts/sync_vendors.bat"
|
|
||||||
- ".agent/scripts/sync_vendors.sh"
|
|
||||||
- ".agent/scripts/extract_skills.js"
|
|
||||||
- "bootstrap.bat"
|
|
||||||
- "bootstrap.sh"
|
|
||||||
autonomous: true
|
|
||||||
---
|
|
||||||
|
|
||||||
# Phase 1: Zero-Pollution Pipeline Stabilization
|
|
||||||
|
|
||||||
## Objective
|
|
||||||
Fix the Master-Satellite synchronization pipeline to natively extract GSD skills, UI data, and Python MCP dependencies so that satellite repositories bootstrap via zero-click.
|
|
||||||
|
|
||||||
## Verification Criteria
|
|
||||||
- [ ] Running `bootstrap.bat` attempts to install Python `requirements.txt` targets (`browser_use`, `claude-mem`).
|
|
||||||
- [ ] Running `sync_vendors.bat` successfully runs `get-shit-done-cc` generator and Git tracks `.agent/skills/gsd-*/` and `.agent/get-shit-done/`.
|
|
||||||
- [ ] `extract_skills.js` generated `.gitignore` whitelists `!gsd-*/`.
|
|
||||||
|
|
||||||
## must_haves
|
|
||||||
- [ ] `sync_vendors.bat` and `sync_vendors.sh` must execute `npm install`, `npx uipro update` and `npx get-shit-done-cc --antigravity --local`.
|
|
||||||
- [ ] `bootstrap.bat` and `bootstrap.sh` must execute `pip install -r requirements.txt` for Python components if Python env exists.
|
|
||||||
- [ ] GSD execution binaries and folders must be tracked via `git add .agent/get-shit-done/`.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
<task>
|
|
||||||
<read_first>
|
|
||||||
- .agent/scripts/sync_vendors.bat
|
|
||||||
- .agent/scripts/sync_vendors.sh
|
|
||||||
</read_first>
|
|
||||||
<action>
|
|
||||||
Add the local GSD extraction and uipro updating mechanism directly inside `sync_vendors.bat` and `sync_vendors.sh`.
|
|
||||||
For the bash script (`sync_vendors.sh`), under step `[2/5] 패키지 업데이트 및 GSD, UI-UX-PRO-MAX 동기화...`:
|
|
||||||
Add:
|
|
||||||
```bash
|
|
||||||
cd .agent/env
|
|
||||||
npm install
|
|
||||||
npm update get-shit-done-cc uipro-cli
|
|
||||||
npx uipro update
|
|
||||||
npx get-shit-done-cc --antigravity --local
|
|
||||||
cd ../..
|
|
||||||
```
|
|
||||||
For the batch script (`sync_vendors.bat`), under a new step for package updates:
|
|
||||||
Add:
|
|
||||||
```bat
|
|
||||||
cd .agent\env
|
|
||||||
call npm install
|
|
||||||
call npm update get-shit-done-cc uipro-cli
|
|
||||||
call npx uipro update
|
|
||||||
call npx get-shit-done-cc --antigravity --local
|
|
||||||
cd ..\..
|
|
||||||
```
|
|
||||||
Also, change the git commit logic to track GSD:
|
|
||||||
Change `git add .agent/vendor/ .agent/skills/ .gitmodules` to `git add .agent/vendor/ .agent/skills/ .agent/get-shit-done/ .gitmodules`
|
|
||||||
</action>
|
|
||||||
<acceptance_criteria>
|
|
||||||
`grep "npx get-shit-done-cc --antigravity --local" .agent/scripts/sync_vendors.bat` exits 0.
|
|
||||||
`grep "git add" .agent/scripts/sync_vendors.bat | grep ".agent/get-shit-done/"` exits 0.
|
|
||||||
`grep "npx get-shit-done-cc --antigravity --local" .agent/scripts/sync_vendors.sh` exits 0.
|
|
||||||
`grep "git add" .agent/scripts/sync_vendors.sh | grep ".agent/get-shit-done/"` exits 0.
|
|
||||||
</acceptance_criteria>
|
|
||||||
</task>
|
|
||||||
|
|
||||||
<task>
|
|
||||||
<read_first>
|
|
||||||
- .agent/scripts/extract_skills.js
|
|
||||||
</read_first>
|
|
||||||
<action>
|
|
||||||
Modify `extract_skills.js` to whitelist `gsd-*/` skill folders in the generated `.gitignore`.
|
|
||||||
Find `const gitignoreContent = [` and add `'!gsd-*/',` below `'!mini-swe/',`.
|
|
||||||
</action>
|
|
||||||
<acceptance_criteria>
|
|
||||||
`grep "!gsd-\*/" .agent/scripts/extract_skills.js` exits 0.
|
|
||||||
</acceptance_criteria>
|
|
||||||
</task>
|
|
||||||
|
|
||||||
<task>
|
|
||||||
<read_first>
|
|
||||||
- bootstrap.bat
|
|
||||||
</read_first>
|
|
||||||
<action>
|
|
||||||
Add Python MCP auto-setup logic to `bootstrap.bat` right after the node dependencies step.
|
|
||||||
Add:
|
|
||||||
```bat
|
|
||||||
echo [4/5] Checking and Installing Python MCP Dependencies...
|
|
||||||
if defined AGENT_PYTHON_PATH (
|
|
||||||
echo -^> Using AGENT_PYTHON_PATH: %AGENT_PYTHON_PATH%
|
|
||||||
if exist ".agent\vendor\browser_use\requirements.txt" (
|
|
||||||
"%AGENT_PYTHON_PATH%" -m pip install -r ".agent\vendor\browser_use\requirements.txt"
|
|
||||||
)
|
|
||||||
if exist ".agent\services\claude-mem\requirements.txt" (
|
|
||||||
"%AGENT_PYTHON_PATH%" -m pip install -r ".agent\services\claude-mem\requirements.txt"
|
|
||||||
)
|
|
||||||
if exist ".agent\vendor\mini-swe\requirements.txt" (
|
|
||||||
"%AGENT_PYTHON_PATH%" -m pip install -r ".agent\vendor\mini-swe\requirements.txt"
|
|
||||||
)
|
|
||||||
) else (
|
|
||||||
echo -^> Warning: AGENT_PYTHON_PATH is not defined. Skipping Python dependencies installation.
|
|
||||||
)
|
|
||||||
```
|
|
||||||
Update the final step text `[4/4]` to `[5/5]` appropriately.
|
|
||||||
</action>
|
|
||||||
<acceptance_criteria>
|
|
||||||
`grep "AGENT_PYTHON_PATH" bootstrap.bat` exits 0.
|
|
||||||
`grep "pip install -r" bootstrap.bat` exits 0.
|
|
||||||
</acceptance_criteria>
|
|
||||||
</task>
|
|
||||||
|
|
||||||
<task>
|
|
||||||
<read_first>
|
|
||||||
- bootstrap.sh
|
|
||||||
</read_first>
|
|
||||||
<action>
|
|
||||||
Add Python MCP auto-setup logic to `bootstrap.sh` right after the node dependencies step.
|
|
||||||
Add:
|
|
||||||
```bash
|
|
||||||
echo "[4/5] Checking and Installing Python MCP Dependencies..."
|
|
||||||
if [ -n "$AGENT_PYTHON_PATH" ]; then
|
|
||||||
echo " -> Using AGENT_PYTHON_PATH: $AGENT_PYTHON_PATH"
|
|
||||||
if [ -f ".agent/vendor/browser_use/requirements.txt" ]; then
|
|
||||||
"$AGENT_PYTHON_PATH" -m pip install -r ".agent/vendor/browser_use/requirements.txt"
|
|
||||||
fi
|
|
||||||
if [ -f ".agent/services/claude-mem/requirements.txt" ]; then
|
|
||||||
"$AGENT_PYTHON_PATH" -m pip install -r ".agent/services/claude-mem/requirements.txt"
|
|
||||||
fi
|
|
||||||
if [ -f ".agent/vendor/mini-swe/requirements.txt" ]; then
|
|
||||||
"$AGENT_PYTHON_PATH" -m pip install -r ".agent/vendor/mini-swe/requirements.txt"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
echo " -> Warning: AGENT_PYTHON_PATH is not defined. Skipping Python dependencies installation."
|
|
||||||
fi
|
|
||||||
```
|
|
||||||
Update the final step text `[4/4]` to `[5/5]` appropriately.
|
|
||||||
</action>
|
|
||||||
<acceptance_criteria>
|
|
||||||
`grep "AGENT_PYTHON_PATH" bootstrap.sh` exits 0.
|
|
||||||
`grep "pip install -r" bootstrap.sh` exits 0.
|
|
||||||
</acceptance_criteria>
|
|
||||||
</task>
|
|
||||||
@@ -1,25 +1,34 @@
|
|||||||
{
|
{
|
||||||
"version": "1.0",
|
"version": "1.0",
|
||||||
"timestamp": "2026-03-29T19:40:00Z",
|
"timestamp": "2026-04-06T21:18:00+09:00",
|
||||||
"phase": "00-setup",
|
"phase": "01",
|
||||||
"phase_name": "Project Initialization",
|
"phase_name": "llm-tuning",
|
||||||
"phase_dir": ".planning",
|
"phase_dir": ".planning/phases/01-llm-tuning",
|
||||||
"plan": 0,
|
"plan": 1,
|
||||||
"task": 0,
|
"task": 3,
|
||||||
"total_tasks": 0,
|
"total_tasks": 5,
|
||||||
"status": "paused",
|
"status": "paused",
|
||||||
"completed_tasks": [],
|
"completed_tasks": [
|
||||||
"remaining_tasks": [],
|
{"id": 1, "name": "Evaluate 122B Single GPU", "status": "done", "commit": ""},
|
||||||
"blockers": [],
|
{"id": 2, "name": "Evaluate 122B Dual GPU memory geometric splitting", "status": "done", "commit": ""},
|
||||||
|
{"id": 3, "name": "Calculate theoretical limits of DDR4 MoE fetching", "status": "done", "commit": ""},
|
||||||
|
{"id": 4, "name": "Test Qwen 27B Dense context bounds limits", "status": "in_progress", "progress": "Confirmed -c 262144 boots successfully"}
|
||||||
|
],
|
||||||
|
"remaining_tasks": [
|
||||||
|
{"id": 5, "name": "Evaluate Gemma-4 31B max context and speed", "status": "not_started"}
|
||||||
|
],
|
||||||
|
"blockers": [
|
||||||
|
{"description": "122B Q4_K_M 20t/s Generation Speed Limit", "type": "technical", "workaround": "Physical limitation of DDR4 RAM bandwidth (50GB/s) against 4+ GB of active weights. Cannot be bypassed. Shifted focus to smaller Dense models that fit completely into VRAM."}
|
||||||
|
],
|
||||||
"human_actions_pending": [],
|
"human_actions_pending": [],
|
||||||
"decisions": [
|
"decisions": [
|
||||||
{
|
{"decision": "Stop forcing Dual GPU symmetric utilization on MoE with n-cpu-moe", "rationale": "Model asymmetry forces OOM on one GPU and underutilization on the other.", "phase": "01"},
|
||||||
"decision": "Master-Satellite Deployment",
|
{"decision": "Shift focus to Qwen 27B / Gemma 4 31B dense models", "rationale": "They fit 100% into VRAM, bypassing WDDM/PCIe/DDR4 bottlenecks, guaranteeing ~20+ t/s generation speeds.", "phase": "01"}
|
||||||
"rationale": "Ensures downstream repos remain 100% Zero-Pollution by distributing flattened skills natively via Git Push",
|
|
||||||
"phase": "0"
|
|
||||||
}
|
|
||||||
],
|
],
|
||||||
"uncommitted_files": [],
|
"uncommitted_files": [
|
||||||
"next_action": "Run /gsd-plan-phase 1 to begin roadmap construction",
|
"scripts/find_max_dense.mjs",
|
||||||
"context_notes": "Zero-pollution foundation is perfectly solid. Awaiting Phase 1 initialization."
|
"scripts/tune_122b_20ts.mjs"
|
||||||
|
],
|
||||||
|
"next_action": "Complete speed benchmark for Qwen 27B and find max context for Gemma 4 31B",
|
||||||
|
"context_notes": "We successfully shifted the user's focus away from physically impossible 122B Q4_K_M constraints by laying down concrete mathematical logic about VRAM/RAM bandwidth. We are now pivoting to dense models (27B/31B) to guarantee speed and context size."
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,38 +1,46 @@
|
|||||||
# Project Context: Zero-Pollution Agent Bootstrap Kit
|
# Variet LLM: Dual-Orchestration AI Assistant
|
||||||
|
|
||||||
## What This Is
|
## What This Is
|
||||||
`new_gene`은 향후 모든 AI 에이전트 개발 및 자동화 프로젝트에 포크(Fork)되거나 복제되어 즉각적으로 활용되는 **'완전 자율형·무오염(Zero-Pollution) 부트스트랩 마스터 템플릿'** 입니다.
|
A high-performance, locally-hosted AI assistant system built on two RTX 3060 12GB GPUs. It uses a "2+0" architecture where Machine A acts as a dedicated inference server running large language models, while Machine B handles the user interface (VS Code, Discord) and tool execution.
|
||||||
|
|
||||||
## Core Value
|
## Problem / Core Value
|
||||||
코딩을 시작할 때 발생하는 세팅 오류, 프롬프트 해킹, 메인 브랜치 훼손, API 키 유출을 원천적으로 막습니다. 누구나 이 저장소만 클론(`git clone`) 받으면 환경 설정이나 별도의 복잡한 스크립트 실행 없이 19개의 최고급 개발 스킬(TDD, Git Worktree)과 GSD 오케스트레이터를 즉시 꺼내 쓸 수 있는 완벽한 턴키(Turn-key) 생태계를 구축합니다.
|
Standard LLM set-ups on a single GPU often struggle with context switching and running multi-tools asynchronously. By dedicating an API server to raw inference (50-80 t/s with Qwen 35B), the system achieves extreme responsiveness for coding while preserving resources for tool execution (Calendar, Mail, Search) on the workstation.
|
||||||
|
|
||||||
## Success Criteria
|
## Target Audience
|
||||||
1. 사용자가 단 1개의 글로벌 패키지를 설치하지 않아도 로컬(`.agent/env/node_modules`)만을 통해 100% 작동해야 함.
|
Single developer working on complex coding tasks alongside daily administrative tasks.
|
||||||
2. 터미널 명령어나 히스토리에 API 토큰이 노출되지 않고 자동 동기화(Vikunja/Gitea)가 이루어져야 함.
|
|
||||||
3. 에이전트는 코딩 전 반드시 TDD 가설을 세우고, 서브 브랜치(Worktree)로 격리되어 안전하게 작업해야 함.
|
## Key Decisions
|
||||||
4. **[Key Constraint]** 템플릿의 모든 아키텍처는 `git clone`만으로 완전한 작업 흐름이 이어져야 하며, 각 파생 프로젝트들은 `git pull` 명령어 단 한 번만으로 최신 무기와 스킬(Agent 엔진)이 완벽히 업데이트되어야 함.
|
|
||||||
|
| Decision | Rationale | Outcome |
|
||||||
|
|----------|-----------|---------|
|
||||||
|
| 2+0 GPU Architecture | Placing both GPUs in Machine A allows Qwen 35B to fully load into VRAM, increasing speed from 30t/s to 50-80t/s. | Machine A: API Server only.<br/>Machine B: All orchestrations & tools. |
|
||||||
|
| Separation of Agent Logic | Machine A is a pure "brain" (llama-server). Machine B has the "hands and eyes" (VS Code extension and Discord Bot). | Simplified infrastructure; tools execute directly on the workstation. |
|
||||||
|
| 3-Tier Model Strategy | Need balanced speeds depending on the complexity of the task requested. | Fast: Gemma4 26B (~70t/s)<br/>Balanced: Qwen 35B (~50t/s)<br/>Deep: Qwen 122B (~11t/s) |
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
### Validated
|
### Validated
|
||||||
- ✓ [Zero-Pollution] 모든 스킬(`superpowers`, `obsidian-skills`)이 글로벌 영역이 아닌 내부(`.agent/skills/`)로 캡슐화되어 직접 Git 트래킹을 받음.
|
|
||||||
- ✓ [API Bridge] `sync_wiki.js`와 `sync_vikunja.js`를 통해 토큰 노출 없이 서버와 통신 구조 확보.
|
(None yet — ship to validate)
|
||||||
- ✓ [Persistence] `claude-mem` (SQLite MCP) 기반의 오답노트 기억 능력 탑재.
|
|
||||||
|
|
||||||
### Active
|
### Active
|
||||||
- [x] `git clone`/`git pull`만으로 19개 스킬 즉각 로드 (Master Sync Hub 토폴로지 확립)
|
|
||||||
- [ ] GSD Phase 계획과 실제 코드 실행 간 오차를 자동으로 검증할 브릿지 시스템 보완
|
|
||||||
- [ ] `.env.agent`만 셋업하면 모든 것이 알아서 연결되는 완전 자율화 체계 구축 지속
|
|
||||||
|
|
||||||
### Out of Scope (배포 제약)
|
- [ ] Deploy headless llama-server setup on Machine A.
|
||||||
- **하위 프로젝트(Satellite Repo)에서의 오픈소스 서브모듈(Vendor) 직접 업데이트 행위 원천 금지.** (오직 이 Master 템플릿의 `sync_vendors` 스크립트만이 서브모듈의 버전을 평탄화 추출/번역하여 100% 검증된 정적 파일(`.agent/skills/`)로 Git 트래킹합니다.)
|
- [ ] Build a model hot-swap utility (Fast/Balanced/Deep) for Machine A.
|
||||||
- 하위 패키지에 글로벌 NPM 패키지 설치 요구 및 부트스트랩 스크립트 실행 강요 금지 (오직 `git pull` 하나만으로 마스터의 스킬셋 변동분을 수동적으로 상속받을 것).
|
- [ ] Develop a VS Code Extension (TypeScript) on Machine B for coding agent loop.
|
||||||
- `/start` 및 `/end` 같은 과거 유산(Legacy) 도입 금지
|
- [ ] Develop a Discord Bot (discord.py) on Machine B for personal assistant tools.
|
||||||
|
- [ ] Implement MCP tools (SearXNG, Google Calendar, Gmail) securely on Machine B.
|
||||||
|
|
||||||
|
### Out of Scope
|
||||||
|
|
||||||
|
- [ ] Running inference directly on Machine B (It lacks VRAM/GPU resources in this architecture).
|
||||||
|
- [ ] Exposing Machine A to the public internet (LAN traffic only).
|
||||||
|
|
||||||
---
|
---
|
||||||
*Last updated: 2026-03-29 after Phase 1 initialization*
|
*Last updated: 2026-04-05 after initialization*
|
||||||
|
|
||||||
## Evolution
|
## Evolution
|
||||||
|
|
||||||
This document evolves at phase transitions and milestone boundaries.
|
This document evolves at phase transitions and milestone boundaries.
|
||||||
|
|
||||||
**After each phase transition** (via `/gsd-transition`):
|
**After each phase transition** (via `/gsd-transition`):
|
||||||
|
|||||||
@@ -1,11 +0,0 @@
|
|||||||
# Roadmap
|
|
||||||
|
|
||||||
## Phase 1: Zero-Pollution Pipeline Stabilization
|
|
||||||
**Status:** Complete (2026-03-30)
|
|
||||||
**Goal:** Fix the Master-Satellite synchronization pipeline to natively extract GSD skills, UI data, and Python MCP dependencies so that satellite repositories bootstrap via zero-click.
|
|
||||||
|
|
||||||
### Requirements
|
|
||||||
- REQ-01: Bootstrap script must auto-install Python dependencies for MCP tools across satellites.
|
|
||||||
- REQ-02: Master sync script must invoke `get-shit-done-cc` local installation to native `.agent/skills` and ensure Git tracking.
|
|
||||||
- REQ-03: Master sync script must invoke `uipro update` and commit changes.
|
|
||||||
- REQ-04: The `.gitignore` generated by `extract_skills.js` must safely whitelist `/gsd` skill directories.
|
|
||||||
@@ -1,16 +0,0 @@
|
|||||||
# Codebase Architecture (`ARCHITECTURE.md`)
|
|
||||||
|
|
||||||
## Conceptual Core
|
|
||||||
The project employs a **Master-Satellite Zero-Pollution Orchestration Model**.
|
|
||||||
The architecture is designed to host local environments mapped centrally for LLM / Developer agents so that their capabilities and tracking modules do not leak into the developer's global OS environment.
|
|
||||||
|
|
||||||
## Design Patterns & Layers
|
|
||||||
1. **Master Repository Role (`new_gene`)**: Synchronizes third-party Submodules, flattens them via `.agent/scripts/extract_skills.js` and `.agent/scripts/sync_vendors.bat`, and seeds localized GSD skills via local bin instantiation.
|
|
||||||
2. **Satellite Project Consumption**: Satellite repositories pull the tracked objects (like `.agent/get-shit-done/` binaries and `.agent/skills/gsd-*/` files), run `bootstrap.bat`, and safely isolate all code environments.
|
|
||||||
|
|
||||||
## Execution Entry Points
|
|
||||||
- Master Sync: `.agent/scripts/sync_vendors.bat/.sh` initializes NPM bounds, installs `uipro`/`get-shit-done-cc` strictly inside `.agent/env/node_modules/`, and runs them from the root via `.agent/env/node_modules/.bin/...` to safely isolate state.
|
|
||||||
- Satellite Install: `bootstrap.bat/.sh` executes the runtime scripts sequentially (Git modules -> node env -> Python `requirements.txt` installation using a developer-provided `$AGENT_PYTHON_PATH`).
|
|
||||||
|
|
||||||
## Data Flow
|
|
||||||
The data flow travels exclusively via file IO (File reading -> Markdown modification) controlled by the Antigravity Agent framework executing `Task()` components or user IDE plugins. Git actions (add/commit/push) ensure cross-device immutability.
|
|
||||||
@@ -1,15 +0,0 @@
|
|||||||
# Codebase Concerns (`CONCERNS.md`)
|
|
||||||
|
|
||||||
## Known Technical Debt & Fragile Areas
|
|
||||||
|
|
||||||
1. **UTF-8 Support on PowerShell**:
|
|
||||||
The `sync_vendors.bat` script utilizes `chcp 65001 >nul` to support symbols and emojis (`🔄`, `⛏️`, `🌐`). However, invoking this `.bat` script directly from the newer `powershell.exe` execution layer occasionally mangles the output characters or breaks command parsing. Developers manually validating `sync_vendors.bat` in PowerShell may observe syntax errors that don't manifest inside standard CMD executions.
|
|
||||||
|
|
||||||
2. **Path Encoding Depth Limitations**:
|
|
||||||
The GSD implementation maps deeply nested `.agent/vendor/**` repositories and heavily duplicates their structure into `.agent/skills/`. On Windows systems, exceeding the 260-character path restriction can theoretically introduce silent file truncation if the local `.agent` environment scales beyond expectations.
|
|
||||||
|
|
||||||
3. **Submodule Divergences**:
|
|
||||||
The primary master hook uses `git submodule update --remote --merge`. Tracking upstream mains from multiple open-source repositories means breaking changes authored dynamically by upstream maintainers (e.g., `browser_use` rewriting its setup commands or `obsidian-skills` evolving) could cascade into local logic breaks, requiring a robust `translate_skills.js` sync layer to patch integration shifts.
|
|
||||||
|
|
||||||
4. **Environment Abstraction Leaks**:
|
|
||||||
While Zero-Pollution aims to block system pollution, `bootstrap.bat` uses a globally configured `AGENT_PYTHON_PATH` to install each `requirements.txt`. There is an implicit assumption that this Python executable is correctly mapped to an isolated virtual environment (`venv`). The architecture relies heavily on developer compliance not to provide a root Python executable.
|
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
# Codebase Conventions (`CONVENTIONS.md`)
|
|
||||||
|
|
||||||
## Code Style & Scripts
|
|
||||||
- **Shell / Bash**: Ensure standard `$?` exit evaluation handling with `if [ $? -ne 0 ]; then exit 1; fi`.
|
|
||||||
- **Batch Scripting**: Ensure `%errorlevel%` mapping with `if %errorlevel% neq 0 ( exit /b %errorlevel% )`. Uses `chcp 65001 >nul` for cross-platform UTF-8 emoji support.
|
|
||||||
- **Node.js Scripts**: Typically wrapped recursively checking properties (`fs.existsSync`, `fs.readdirSync`), enforcing zero-pollution (ignoring flattened dependencies via explicit `fs.writeFileSync`).
|
|
||||||
|
|
||||||
## Architectural Patterns
|
|
||||||
- **Zero-Pollution Rule**: Never install global node_modules (`-g`) or system modifications. Path routing is managed securely using execution strings formatted like `path/to/local/bin` instead of relying on `$PATH`.
|
|
||||||
- **Idempotency Rule**: The bootstrap processes (`bootstrap.sh`, `bootstrap.bat`) are designed to be run hundreds of times safely, using `if not exist` checks before taking any structural action.
|
|
||||||
|
|
||||||
## Error Handling
|
|
||||||
- Terminal outputs are robust, mapping explicit phases like `[1/5]`, `[2/5]` to terminal stdout.
|
|
||||||
- Execution halts strictly on critical dependency update failures.
|
|
||||||
@@ -1,17 +0,0 @@
|
|||||||
# Codebase Integrations (`INTEGRATIONS.md`)
|
|
||||||
|
|
||||||
## Webhooks & APIs
|
|
||||||
- Currently, this project relies purely on Git synchronisation and CLI execution. Network activity is limited to downloading external packages (`npm install`, `pip install`, `git clone/pull`).
|
|
||||||
- Once MCP tools are invoked, `claude-mem` likely interacts with SQLite logic or LLM memory endpoints depending on its runtime schema. `browser-use` interfaces with web targets directly.
|
|
||||||
|
|
||||||
## Databases & Persistence
|
|
||||||
- There are no central RDBMS or NoSQL layers native to this orchestration layer.
|
|
||||||
- **Knowledge/Memory**: The project serves as an SSOT using markdown (`.planning/*`, `.agent/knowledge/*`).
|
|
||||||
- **MCP state persistence**: Relies on specific plugin's local DBs (e.g., SQLite for mem logs).
|
|
||||||
|
|
||||||
## Third-Party Authentication
|
|
||||||
- Configured by `.agent/config/.env.agent` which includes API keys (Vikunja Task ID, Gitea instances, etc.), though the current codebase only seeds and checks for its existence without directly invoking the APIs inside the shell scripts.
|
|
||||||
|
|
||||||
## Core External Tools
|
|
||||||
- **Git**: Primary persistence and vendor orchestration transport mechanism.
|
|
||||||
- **NPM & PIP**: Invoked sequentially by bootstrap scripts to satisfy AI tool dependencies.
|
|
||||||
@@ -1,29 +0,0 @@
|
|||||||
# Codebase Stack (`STACK.md`)
|
|
||||||
|
|
||||||
## Languages
|
|
||||||
- **Shell / Batch Scripting**: Used heavily for the zero-pollution bootstrap scripts (`bootstrap.bat`, `bootstrap.sh`, `sync_vendors.bat`, `sync_vendors.sh`).
|
|
||||||
- **JavaScript (Node.js)**: Used for vendor extraction and translation scripts (`extract_skills.js`, `translate_skills.js`).
|
|
||||||
- **Python**: Required indirectly for running installed Python MCP servers (`browser_use`, `mini-swe`, `claude-mem`) during satellite bootstrapping.
|
|
||||||
|
|
||||||
## Runtimes & Frameworks
|
|
||||||
- **Node.js**: The `.agent/env` isolates local dependencies (e.g. `npm install`, `npx get-shit-done-cc`, `uipro-cli`).
|
|
||||||
- **Python**: Expected to be provided externally via user's `AGENT_PYTHON_PATH` to isolate plugin Python processes.
|
|
||||||
- **Agent Engines**: Target runtimes interacting with this environment include Google Gemini CLI / Antigravity, Claude Code, and Copilot.
|
|
||||||
|
|
||||||
## Core Dependencies
|
|
||||||
- **UI & Workflow Generative Frameworks**:
|
|
||||||
- `get-shit-done-cc`: CLI orchestration framework for generating GSD workflows.
|
|
||||||
- `uipro-cli`: Generates UI/UX specification data.
|
|
||||||
- **Git Submodules** (defined in `.gitmodules`):
|
|
||||||
- `.agent/vendor/superpowers`
|
|
||||||
- `.agent/knowledge/everything_claude`
|
|
||||||
- `.agent/knowledge/awesome_claude`
|
|
||||||
- `.agent/vendor/obsidian-skills`
|
|
||||||
- `.agent/services/claude-mem`
|
|
||||||
- `.agent/services/mcp-core`
|
|
||||||
- `.agent/vendor/browser_use`
|
|
||||||
- `.agent/vendor/mini-swe`
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
- `.agent/config/.env.agent`: Developer-specific agent configuration. Uses `.template` fallback.
|
|
||||||
- `PROJECT.md` & `ROADMAP.md`: Project-level planning and tracker configurations.
|
|
||||||
@@ -1,19 +0,0 @@
|
|||||||
# Codebase Structure (`STRUCTURE.md`)
|
|
||||||
|
|
||||||
## Directory Map
|
|
||||||
|
|
||||||
- **Root Files**
|
|
||||||
- `bootstrap.bat` / `bootstrap.sh`: Primary runtime orchestrators for setting up the node/python environment.
|
|
||||||
- `.gitignore` & `.gitmodules`: Repository definitions protecting the ecosystem from pollution.
|
|
||||||
|
|
||||||
- **`.agent/` (The Enclave)**
|
|
||||||
- `.agent/env/`: The isolated local node environment container. Has its own `package.json` avoiding merging issues with user code.
|
|
||||||
- `.agent/scripts/`: Home of the Master Sync orchestration rules (`sync_vendors`, `extract_skills.js`, `translate_skills.js`).
|
|
||||||
- `.agent/skills/`: Holds the flattened markdown files (`SKILL.md`) that agents interpret on system boot. Note `gsd-*/` is whitelisted here.
|
|
||||||
- `.agent/vendor/`: The raw checked-out git repository targets before any translations/flattening is parsed out of them.
|
|
||||||
- `.agent/config/`: Runtime configurations specific to the current physical machine (`.env.agent`).
|
|
||||||
- `.agent/knowledge/`: Stores raw unstructured domain context for explicit queries.
|
|
||||||
|
|
||||||
- **`.planning/` (GSD Execution)**
|
|
||||||
- Manages the Project State (`PROJECT.md`, `ROADMAP.md`, `STATE.md`).
|
|
||||||
- `phases/`: Artifact history holding individual plans (`01-PLAN.md`) and the verification/summary reports mapping their outcomes.
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# Codebase Testing (`TESTING.md`)
|
|
||||||
|
|
||||||
## Structural Verification
|
|
||||||
Given that this is an AI Framework Infrastructure project (Bootstrap Kit), formal unit testing via Jest/Vitest does not primarily apply to the root structure.
|
|
||||||
|
|
||||||
## GSD Audit Testing
|
|
||||||
Testing is defined strictly through the **Plan-Execute-Verify** cycle of the Get-Shit-Done (GSD) framework:
|
|
||||||
- **`[Acceptance Criteria]`**: Each phase and subtask has shell commands evaluated by `grep` or file existence checks ensuring conditions are logically provable (e.g. `grep "npx get-shit-done-cc" .agent/scripts/sync_vendors.bat`).
|
|
||||||
- **Verifiers**: The `.planning/phases/*-VERIFICATION.md` pattern uses dedicated checker agents to automatically validate system goals against the physical outcome on disk.
|
|
||||||
|
|
||||||
## Continuous Integration
|
|
||||||
At the moment, no GitHub Actions or standard CI test runners execute automated validation, as validation strictly hinges on local agent validation (`/gsd-verify-work` or `gsd-plan-checker` routines).
|
|
||||||
@@ -1,14 +1,16 @@
|
|||||||
{
|
{
|
||||||
|
"project_name": "Variet LLM",
|
||||||
|
"version": 1,
|
||||||
"mode": "yolo",
|
"mode": "yolo",
|
||||||
"granularity": "Standard",
|
"granularity": "standard",
|
||||||
"parallelization": true,
|
"parallelization": false,
|
||||||
"commit_docs": true,
|
"commit_docs": true,
|
||||||
"model_profile": "balanced",
|
"model_profile": "quality",
|
||||||
"workflow": {
|
"workflow": {
|
||||||
"research": true,
|
"research": true,
|
||||||
"plan_check": true,
|
"plan_check": true,
|
||||||
"verifier": true,
|
"verifier": true,
|
||||||
"nyquist_validation": true,
|
"nyquist_validation": true,
|
||||||
"auto_advance": true
|
"_auto_chain_active": true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,57 +0,0 @@
|
|||||||
# Phase 1: Zero-Pollution Pipeline Stabilization - Context
|
|
||||||
|
|
||||||
**Gathered:** 2026-03-30
|
|
||||||
**Status:** Ready for planning
|
|
||||||
**Source:** PRD Express Path (implementation_plan.md)
|
|
||||||
|
|
||||||
<domain>
|
|
||||||
## Phase Boundary
|
|
||||||
Fix the Master-Satellite synchronization pipeline to natively extract GSD skills, UI updates, and Python dependencies. This ensures that a zero-click `bootstrap.bat/.sh` works flawlessly across new workspaces without violating the Zero-Pollution architecture.
|
|
||||||
</domain>
|
|
||||||
|
|
||||||
<decisions>
|
|
||||||
## Implementation Decisions
|
|
||||||
|
|
||||||
### Zero-Click Python Dependencies
|
|
||||||
- Modify `bootstrap.bat` and `bootstrap.sh` to auto-detect and pip-install Python dependencies for MCP tools (`browser_use`, `claude-mem`, `mini-swe`).
|
|
||||||
|
|
||||||
### Master Sync Pipeline GSD Extraction
|
|
||||||
- Modify `sync_vendors.bat` and `sync_vendors.sh` to execute `npm update get-shit-done-cc uipro-cli`, `npx uipro update`, and `npx get-shit-done-cc --antigravity --local` inside `.agent/env`.
|
|
||||||
- Ensure output folders `.agent/get-shit-done/` and `.agent/skills/gsd-*/` are `git add`ed.
|
|
||||||
|
|
||||||
### Gitignore Rules for extracted skills
|
|
||||||
- Modify `.agent/scripts/extract_skills.js` to whitelist `!gsd-*/` in the generated `.gitignore` so Phase planning and GSD execution tools successfully persist to git.
|
|
||||||
|
|
||||||
### Agent's Discretion
|
|
||||||
- The method of Python virtual environment detection vs global python is left to the agent, though leveraging `AGENT_PYTHON_PATH` if specified is preferred.
|
|
||||||
</decisions>
|
|
||||||
|
|
||||||
<canonical_refs>
|
|
||||||
## Canonical References
|
|
||||||
|
|
||||||
**Downstream agents MUST read these before planning or implementing.**
|
|
||||||
|
|
||||||
### Zero Pollution Scripts
|
|
||||||
- `bootstrap.bat`
|
|
||||||
- `bootstrap.sh`
|
|
||||||
- `.agent/scripts/sync_vendors.bat`
|
|
||||||
- `.agent/scripts/sync_vendors.sh`
|
|
||||||
- `.agent/scripts/extract_skills.js`
|
|
||||||
|
|
||||||
</canonical_refs>
|
|
||||||
|
|
||||||
<specifics>
|
|
||||||
## Specific Ideas
|
|
||||||
- The `npx get-shit-done-cc --antigravity --local` command must be run relative to the project root or precisely routed, otherwise GSD commands won't appear sequentially.
|
|
||||||
|
|
||||||
</specifics>
|
|
||||||
|
|
||||||
<deferred>
|
|
||||||
## Deferred Ideas
|
|
||||||
None — PRD covers phase scope.
|
|
||||||
</deferred>
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
*Phase: 01-zero-pollution-pipeline-stabilization*
|
|
||||||
*Context gathered: 2026-03-30 via PRD Express Path*
|
|
||||||
@@ -1,143 +0,0 @@
|
|||||||
---
|
|
||||||
wave: 1
|
|
||||||
depends_on: []
|
|
||||||
files_modified:
|
|
||||||
- ".agent/scripts/sync_vendors.bat"
|
|
||||||
- ".agent/scripts/sync_vendors.sh"
|
|
||||||
- ".agent/scripts/extract_skills.js"
|
|
||||||
- "bootstrap.bat"
|
|
||||||
- "bootstrap.sh"
|
|
||||||
autonomous: true
|
|
||||||
---
|
|
||||||
|
|
||||||
# Phase 1: Zero-Pollution Pipeline Stabilization
|
|
||||||
|
|
||||||
## Objective
|
|
||||||
Fix the Master-Satellite synchronization pipeline to natively extract GSD skills, UI data, and Python MCP dependencies so that satellite repositories bootstrap via zero-click.
|
|
||||||
|
|
||||||
## Verification Criteria
|
|
||||||
- [ ] Running `bootstrap.bat` attempts to install Python `requirements.txt` targets (`browser_use`, `claude-mem`, `mini-swe`).
|
|
||||||
- [ ] Running `sync_vendors.bat` successfully runs `get-shit-done-cc` generator and Git tracks `.agent/skills/gsd-*/` and `.agent/get-shit-done/`.
|
|
||||||
- [ ] `extract_skills.js` generated `.gitignore` whitelists `!gsd-*/`.
|
|
||||||
|
|
||||||
## must_haves
|
|
||||||
- [ ] `sync_vendors.bat` and `sync_vendors.sh` must execute `npm install`, `npx uipro update` and `npx get-shit-done-cc --antigravity --local`.
|
|
||||||
- [ ] `bootstrap.bat` and `bootstrap.sh` must execute `pip install -r requirements.txt` for Python components if Python env exists.
|
|
||||||
- [ ] GSD execution binaries and folders must be tracked via `git add .agent/get-shit-done/`.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
<task>
|
|
||||||
<read_first>
|
|
||||||
- .agent/scripts/sync_vendors.bat
|
|
||||||
- .agent/scripts/sync_vendors.sh
|
|
||||||
</read_first>
|
|
||||||
<action>
|
|
||||||
Add the local GSD extraction and uipro updating mechanism directly inside `sync_vendors.bat` and `sync_vendors.sh`.
|
|
||||||
For the bash script (`sync_vendors.sh`), under step `[2/5] 패키지 업데이트 및 GSD, UI-UX-PRO-MAX 동기화...`:
|
|
||||||
Add:
|
|
||||||
```bash
|
|
||||||
cd .agent/env
|
|
||||||
npm install
|
|
||||||
npm update get-shit-done-cc uipro-cli
|
|
||||||
npx uipro update
|
|
||||||
npx get-shit-done-cc --antigravity --local
|
|
||||||
cd ../..
|
|
||||||
```
|
|
||||||
For the batch script (`sync_vendors.bat`), under a new step for package updates:
|
|
||||||
Add:
|
|
||||||
```bat
|
|
||||||
cd .agent\env
|
|
||||||
call npm install
|
|
||||||
call npm update get-shit-done-cc uipro-cli
|
|
||||||
call npx uipro update
|
|
||||||
call npx get-shit-done-cc --antigravity --local
|
|
||||||
cd ..\..
|
|
||||||
```
|
|
||||||
Also, change the git commit logic to track GSD:
|
|
||||||
Change `git add .agent/vendor/ .agent/skills/ .gitmodules` to `git add .agent/vendor/ .agent/skills/ .agent/get-shit-done/ .gitmodules`
|
|
||||||
</action>
|
|
||||||
<acceptance_criteria>
|
|
||||||
`grep "npx get-shit-done-cc --antigravity --local" .agent/scripts/sync_vendors.bat` exits 0.
|
|
||||||
`grep "git add" .agent/scripts/sync_vendors.bat | grep ".agent/get-shit-done/"` exits 0.
|
|
||||||
`grep "npx get-shit-done-cc --antigravity --local" .agent/scripts/sync_vendors.sh` exits 0.
|
|
||||||
`grep "git add" .agent/scripts/sync_vendors.sh | grep ".agent/get-shit-done/"` exits 0.
|
|
||||||
</acceptance_criteria>
|
|
||||||
</task>
|
|
||||||
|
|
||||||
<task>
|
|
||||||
<read_first>
|
|
||||||
- .agent/scripts/extract_skills.js
|
|
||||||
</read_first>
|
|
||||||
<action>
|
|
||||||
Modify `extract_skills.js` to whitelist `gsd-*/` skill folders in the generated `.gitignore`.
|
|
||||||
Find `const gitignoreContent = [` and add `'!gsd-*/',` below `'!mini-swe/',`.
|
|
||||||
</action>
|
|
||||||
<acceptance_criteria>
|
|
||||||
`grep "!gsd-\*/" .agent/scripts/extract_skills.js` exits 0.
|
|
||||||
</acceptance_criteria>
|
|
||||||
</task>
|
|
||||||
|
|
||||||
<task>
|
|
||||||
<read_first>
|
|
||||||
- bootstrap.bat
|
|
||||||
</read_first>
|
|
||||||
<action>
|
|
||||||
Add Python MCP auto-setup logic to `bootstrap.bat` right after the node dependencies step.
|
|
||||||
Add:
|
|
||||||
```bat
|
|
||||||
echo [4/5] Checking and Installing Python MCP Dependencies...
|
|
||||||
if defined AGENT_PYTHON_PATH (
|
|
||||||
echo -^> Using AGENT_PYTHON_PATH: %AGENT_PYTHON_PATH%
|
|
||||||
if exist ".agent\vendor\browser_use\requirements.txt" (
|
|
||||||
"%AGENT_PYTHON_PATH%" -m pip install -r ".agent\vendor\browser_use\requirements.txt"
|
|
||||||
)
|
|
||||||
if exist ".agent\services\claude-mem\requirements.txt" (
|
|
||||||
"%AGENT_PYTHON_PATH%" -m pip install -r ".agent\services\claude-mem\requirements.txt"
|
|
||||||
)
|
|
||||||
if exist ".agent\services\mini-swe\requirements.txt" (
|
|
||||||
"%AGENT_PYTHON_PATH%" -m pip install -r ".agent\services\mini-swe\requirements.txt"
|
|
||||||
)
|
|
||||||
) else (
|
|
||||||
echo -^> Warning: AGENT_PYTHON_PATH is not defined. Skipping Python dependencies installation.
|
|
||||||
)
|
|
||||||
```
|
|
||||||
Update the final step text `[4/4]` to `[5/5]` appropriately.
|
|
||||||
</action>
|
|
||||||
<acceptance_criteria>
|
|
||||||
`grep "AGENT_PYTHON_PATH" bootstrap.bat` exits 0.
|
|
||||||
`grep "pip install -r" bootstrap.bat` exits 0.
|
|
||||||
</acceptance_criteria>
|
|
||||||
</task>
|
|
||||||
|
|
||||||
<task>
|
|
||||||
<read_first>
|
|
||||||
- bootstrap.sh
|
|
||||||
</read_first>
|
|
||||||
<action>
|
|
||||||
Add Python MCP auto-setup logic to `bootstrap.sh` right after the node dependencies step.
|
|
||||||
Add:
|
|
||||||
```bash
|
|
||||||
echo "[4/5] Checking and Installing Python MCP Dependencies..."
|
|
||||||
if [ -n "$AGENT_PYTHON_PATH" ]; then
|
|
||||||
echo " -> Using AGENT_PYTHON_PATH: $AGENT_PYTHON_PATH"
|
|
||||||
if [ -f ".agent/vendor/browser_use/requirements.txt" ]; then
|
|
||||||
"$AGENT_PYTHON_PATH" -m pip install -r ".agent/vendor/browser_use/requirements.txt"
|
|
||||||
fi
|
|
||||||
if [ -f ".agent/services/claude-mem/requirements.txt" ]; then
|
|
||||||
"$AGENT_PYTHON_PATH" -m pip install -r ".agent/services/claude-mem/requirements.txt"
|
|
||||||
fi
|
|
||||||
if [ -f ".agent/services/mini-swe/requirements.txt" ]; then
|
|
||||||
"$AGENT_PYTHON_PATH" -m pip install -r ".agent/services/mini-swe/requirements.txt"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
echo " -> Warning: AGENT_PYTHON_PATH is not defined. Skipping Python dependencies installation."
|
|
||||||
fi
|
|
||||||
```
|
|
||||||
Update the final step text `[4/4]` to `[5/5]` appropriately.
|
|
||||||
</action>
|
|
||||||
<acceptance_criteria>
|
|
||||||
`grep "AGENT_PYTHON_PATH" bootstrap.sh` exits 0.
|
|
||||||
`grep "pip install -r" bootstrap.sh` exits 0.
|
|
||||||
</acceptance_criteria>
|
|
||||||
</task>
|
|
||||||
1
openclaude
Submodule
1
openclaude
Submodule
Submodule openclaude added at 5ef79546e9
58
scripts/analysis_raw.txt
Normal file
58
scripts/analysis_raw.txt
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
0|Gemma4-26B MXFP4_MOE|ngl=999 pure-GPU|63.21|63.78|G0:11770|G1:10411|t=6|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
1|Gemma4-26B MXFP4_MOE|compare: cpu-moe|12.92|14.21|G0:3096|G1:3497|t=6|ub=512 b=2048|kv=q4_0/q4_0|cpu-moe
|
||||||
|
2|Gemma4-26B MXFP4_MOE|t=2|64.1|64.27|G0:11728|G1:10411|t=2|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
3|Gemma4-26B MXFP4_MOE|t=4|64|64.39|G0:11728|G1:10411|t=4|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
4|Gemma4-26B MXFP4_MOE|t=8|63.75|63.9|G0:11728|G1:10411|t=8|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
5|Gemma4-26B MXFP4_MOE|t=10|64.01|64.14|G0:11728|G1:10411|t=10|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
6|Gemma4-26B MXFP4_MOE|t=12|63.86|63.98|G0:11728|G1:10411|t=12|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
7|Gemma4-26B MXFP4_MOE|ub=256 b=1024|63.8|64.12|G0:10504|G1:9619|t=2|ub=256 b=1024|kv=q4_0/q4_0|pure-GPU
|
||||||
|
8|Gemma4-26B MXFP4_MOE|ub=256 b=2048|63.88|64.04|G0:10504|G1:9619|t=2|ub=256 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
9|Gemma4-26B MXFP4_MOE|ub=512 b=4096|63.91|64.18|G0:11728|G1:10411|t=2|ub=512 b=4096|kv=q4_0/q4_0|pure-GPU
|
||||||
|
10|Gemma4-26B MXFP4_MOE|ub=1024 b=2048|63.86|64.1|G0:10956|G1:9907|t=2|ub=1024 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
11|Gemma4-26B MXFP4_MOE|ub=1024 b=4096|63.85|64.06|G0:10956|G1:9907|t=2|ub=1024 b=4096|kv=q4_0/q4_0|pure-GPU
|
||||||
|
12|Gemma4-26B MXFP4_MOE|kv=q8_0/q8_0|64.14|64.39|G0:10670|G1:10169|t=2|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU
|
||||||
|
13|Gemma4-26B MXFP4_MOE|kv=q4_0/q8_0|37.52|37.86|G0:10394|G1:9753|t=2|ub=512 b=2048|kv=q4_0/q8_0|pure-GPU
|
||||||
|
14|Gemma4-26B MXFP4_MOE|kv=f16/f16|63.48|64.31|G0:11700|G1:11667|t=2|ub=512 b=2048|kv=f16/f16|pure-GPU
|
||||||
|
15|Gemma4-26B MXFP4_MOE|FINAL|64.05|64.29|G0:10667|G1:10169|t=2|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU
|
||||||
|
16|Gemma4-26B Q4_K_M|ngl=999 pure-GPU|76.01|76.31|G0:11784|G1:10454|t=6|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
17|Gemma4-26B Q4_K_M|compare: cpu-moe|10.19|10.49|G0:2652|G1:2982|t=6|ub=512 b=2048|kv=q4_0/q4_0|cpu-moe
|
||||||
|
18|Gemma4-26B Q4_K_M|t=2|75.67|75.87|G0:11783|G1:10454|t=2|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
19|Gemma4-26B Q4_K_M|t=4|75.61|75.87|G0:11783|G1:10454|t=4|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
20|Gemma4-26B Q4_K_M|t=8|75.42|75.59|G0:11783|G1:10454|t=8|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
21|Gemma4-26B Q4_K_M|t=10|75.71|75.82|G0:11783|G1:10454|t=10|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
22|Gemma4-26B Q4_K_M|t=12|75.08|75.7|G0:11783|G1:10454|t=12|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
23|Gemma4-26B Q4_K_M|ub=256 b=1024|75.16|75.64|G0:10559|G1:9662|t=6|ub=256 b=1024|kv=q4_0/q4_0|pure-GPU
|
||||||
|
24|Gemma4-26B Q4_K_M|ub=256 b=2048|75.68|76.05|G0:10559|G1:9662|t=6|ub=256 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
25|Gemma4-26B Q4_K_M|ub=512 b=4096|75.92|76.16|G0:11784|G1:10454|t=6|ub=512 b=4096|kv=q4_0/q4_0|pure-GPU
|
||||||
|
26|Gemma4-26B Q4_K_M|ub=1024 b=2048|75.7|75.9|G0:11012|G1:9950|t=6|ub=1024 b=2048|kv=q4_0/q4_0|pure-GPU
|
||||||
|
27|Gemma4-26B Q4_K_M|ub=1024 b=4096|75.77|75.99|G0:11011|G1:9950|t=6|ub=1024 b=4096|kv=q4_0/q4_0|pure-GPU
|
||||||
|
28|Gemma4-26B Q4_K_M|kv=q8_0/q8_0|76.3|76.69|G0:10725|G1:10212|t=6|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU
|
||||||
|
29|Gemma4-26B Q4_K_M|kv=q4_0/q8_0|42.88|44.58|G0:10439|G1:9796|t=6|ub=512 b=2048|kv=q4_0/q8_0|pure-GPU
|
||||||
|
30|Gemma4-26B Q4_K_M|kv=f16/f16|76.36|76.78|G0:11761|G1:11710|t=6|ub=512 b=2048|kv=f16/f16|pure-GPU
|
||||||
|
31|Gemma4-26B Q4_K_M|FINAL|76.4|76.75|G0:11761|G1:11710|t=6|ub=512 b=2048|kv=f16/f16|pure-GPU
|
||||||
|
32|Qwen3.5-35B MXFP4_MOE|n-cpu-moe=5|51.43|52.07|G0:10365|G1:11152|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
33|Qwen3.5-35B MXFP4_MOE|t=2|43.8|46.4|G0:10365|G1:11152|t=2|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
34|Qwen3.5-35B MXFP4_MOE|t=4|49.21|52.78|G0:10353|G1:11152|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
35|Qwen3.5-35B MXFP4_MOE|t=8|46.43|50.49|G0:10397|G1:11152|t=8|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
36|Qwen3.5-35B MXFP4_MOE|t=10|46.12|50.06|G0:10351|G1:11152|t=10|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
37|Qwen3.5-35B MXFP4_MOE|t=12|45.23|47.1|G0:10337|G1:11152|t=12|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
38|Qwen3.5-35B MXFP4_MOE|ub=256 b=1024|48.9|52.3|G0:9834|G1:10906|t=6|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
39|Qwen3.5-35B MXFP4_MOE|ub=256 b=2048|49.62|52.52|G0:9833|G1:10906|t=6|ub=256 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
40|Qwen3.5-35B MXFP4_MOE|ub=512 b=4096|48.78|52.14|G0:10337|G1:11152|t=6|ub=512 b=4096|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
41|Qwen3.5-35B MXFP4_MOE|ub=1024 b=2048|49.95|52.53|G0:11124|G1:11644|t=6|ub=1024 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
42|Qwen3.5-35B MXFP4_MOE|ub=1024 b=4096|48.75|52.06|G0:11123|G1:11644|t=6|ub=1024 b=4096|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
43|Qwen3.5-35B MXFP4_MOE|kv=q4_0/q8_0|42.81|44.14|G0:10681|G1:11472|t=6|ub=512 b=2048|kv=q4_0/q8_0|n-cpu-moe=5
|
||||||
|
44|Qwen3.5-35B MXFP4_MOE|FINAL|46.66|47.09|G0:10476|G1:11152|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
45|Qwen3.5-35B Q4_K_M|n-cpu-moe=5|49.01|53.09|G0:10606|G1:11338|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
46|Qwen3.5-35B Q4_K_M|t=2|45.73|47.87|G0:10599|G1:11338|t=2|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
47|Qwen3.5-35B Q4_K_M|t=4|50.98|54.33|G0:10601|G1:11338|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
48|Qwen3.5-35B Q4_K_M|t=8|48.45|52.1|G0:10596|G1:11338|t=8|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
49|Qwen3.5-35B Q4_K_M|t=10|47.83|51.45|G0:10595|G1:11338|t=10|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
50|Qwen3.5-35B Q4_K_M|t=12|43.77|46.79|G0:10589|G1:11338|t=12|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
51|Qwen3.5-35B Q4_K_M|ub=256 b=1024|52.14|53.82|G0:10089|G1:11092|t=4|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
52|Qwen3.5-35B Q4_K_M|ub=256 b=2048|50.23|53.66|G0:10091|G1:11092|t=4|ub=256 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
53|Qwen3.5-35B Q4_K_M|ub=512 b=2048|49.89|53.89|G0:10595|G1:11338|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
54|Qwen3.5-35B Q4_K_M|ub=512 b=4096|50.4|54.19|G0:10564|G1:11338|t=4|ub=512 b=4096|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
|
55|Qwen3.5-35B Q4_K_M|kv=q8_0/q8_0|51.84|53.53|G0:10726|G1:11732|t=4|ub=256 b=1024|kv=q8_0/q8_0|n-cpu-moe=5
|
||||||
|
56|Qwen3.5-35B Q4_K_M|kv=q4_0/q8_0|43.22|45.99|G0:10410|G1:11412|t=4|ub=256 b=1024|kv=q4_0/q8_0|n-cpu-moe=5
|
||||||
|
57|Qwen3.5-35B Q4_K_M|FINAL|52.05|54.48|G0:10062|G1:11092|t=4|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5
|
||||||
339
scripts/auto_tune_gemma4_256k.py
Normal file
339
scripts/auto_tune_gemma4_256k.py
Normal file
@@ -0,0 +1,339 @@
|
|||||||
|
"""
|
||||||
|
Gemma4 26B-A4B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
|
||||||
|
Phase 1: -ngl sweep (GPU layers)
|
||||||
|
Phase 2: -t / -tb sweep (CPU threads)
|
||||||
|
Phase 3: -ub / -b sweep (batch sizes)
|
||||||
|
Phase 4: --cache-type-k/v sweep (KV cache precision)
|
||||||
|
Phase 5: --no-mmap, --poll, --prio sweep (misc)
|
||||||
|
Each phase fixes the best from previous phases.
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
import urllib.request
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
try:
|
||||||
|
sys.stdout.reconfigure(encoding='utf-8')
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
BASE_URL = "http://127.0.0.1:8000"
|
||||||
|
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
|
||||||
|
MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"
|
||||||
|
CONTEXT = 262144
|
||||||
|
BENCHMARK_RUNS = 3
|
||||||
|
BENCHMARK_TOKENS = 200
|
||||||
|
|
||||||
|
# ─── Baseline (from previous tuning at -c 4096) ───
|
||||||
|
BEST = {
|
||||||
|
"ngl": 22,
|
||||||
|
"t": 8,
|
||||||
|
"tb": 8,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": True,
|
||||||
|
"mmap": True,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
}
|
||||||
|
|
||||||
|
ALL_RESULTS = []
|
||||||
|
|
||||||
|
|
||||||
|
def kill_server():
    """Force-kill every ``llama-server.exe`` process, then pause.

    Runs the Windows ``taskkill`` utility with its output captured and
    discarded, so a "process not found" message never reaches the console,
    and then sleeps for four seconds before returning.
    """
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill_cmd, capture_output=True)
    # NOTE(review): the pause presumably lets the OS release the port and
    # GPU memory before the next launch -- confirm.
    time.sleep(4)
|
||||||
|
|
||||||
|
|
||||||
|
def build_cmd(cfg):
    """Assemble the llama-server argument vector for one configuration.

    Args:
        cfg: Mapping that provides the keys "ngl", "fa", "ctk", "ctv", "ub",
            "b", "t", "tb", "prio", "poll", "mlock" and "mmap".

    Returns:
        list: The executable path (``LLAMA_SERVER``) followed by its
        arguments. The model path, context size, slot count, port and host
        are fixed; everything else is taken from ``cfg``.
    """
    argv = [LLAMA_SERVER, "--model", MODEL]
    argv += ["-ngl", str(cfg["ngl"])]
    argv += ["-c", str(CONTEXT)]
    argv += ["-np", "1"]
    argv += ["-fa", cfg["fa"]]
    argv += ["--cache-type-k", cfg["ctk"]]
    argv += ["--cache-type-v", cfg["ctv"]]
    argv += ["-ub", str(cfg["ub"])]
    argv += ["-b", str(cfg["b"])]
    argv += ["-t", str(cfg["t"])]
    argv += ["-tb", str(cfg["tb"])]
    argv += ["--prio", str(cfg["prio"])]
    argv += ["--poll", str(cfg["poll"])]
    argv += ["--port", "8000"]
    argv += ["--host", "0.0.0.0"]

    # Boolean switches are appended only when enabled; note that "mmap" is
    # inverted because the server flag disables memory mapping.
    switches = (
        ("--mlock", cfg["mlock"]),
        ("--no-mmap", not cfg["mmap"]),
    )
    argv.extend(flag for flag, enabled in switches if enabled)
    return argv
|
||||||
|
|
||||||
|
|
||||||
|
def start_server(cfg):
    """Launch llama-server for ``cfg`` and return the process handle.

    The child's stdout and stderr are discarded. They used to be attached to
    a pipe that nothing in this script ever read; once the server had logged
    enough to fill the pipe buffer it would block on its next write and the
    benchmark would hang.

    Args:
        cfg: Configuration mapping accepted by ``build_cmd``.

    Returns:
        subprocess.Popen: Handle of the started server, launched from the
        current working directory. Its ``stdout`` attribute is ``None``.
    """
    return subprocess.Popen(
        build_cmd(cfg),
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,  # follows stdout, i.e. also discarded
        cwd=os.getcwd(),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_server(timeout=180):
    """Poll the server's ``/health`` endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling. A value of zero
            or less returns ``False`` without issuing any request.

    Returns:
        bool: ``True`` as soon as ``/health`` answers with JSON whose
        "status" field equals "ok"; ``False`` once the timeout has elapsed.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=3) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Connection refused, HTTP errors, or a malformed body all mean
            # "not ready yet". Catching Exception rather than using a bare
            # except keeps Ctrl-C and SystemExit able to abort the wait.
            pass
        time.sleep(2)
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Time one chat-completion request and return its throughput.

    Posts a fixed counting prompt to ``{BASE_URL}/v1/chat/completions`` with
    temperature 0.0, then divides the ``usage.completion_tokens`` value from
    the reply by the wall-clock duration of the whole request.

    Args:
        max_tokens: Value sent as the request's "max_tokens" field.

    Returns:
        Completion tokens per second, or 0 when the measured duration is not
        positive. A reply without usage data counts as 0 completion tokens.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"}
    )

    began = time.time()
    with urllib.request.urlopen(request, timeout=300) as response:
        reply = json.loads(response.read())
    duration = time.time() - began

    completion_tokens = reply.get("usage", {}).get("completion_tokens", 0)
    if duration > 0:
        return completion_tokens / duration
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
def get_vram():
    """Report GPU memory usage as ``(used, total)`` by querying nvidia-smi.

    nvidia-smi prints one "used, total" line per GPU. The values of all
    lines are summed, so hosts with several GPUs report their combined
    usage. The earlier single ``split(",")`` could not parse multi-line
    output and silently fell back to ``(0, 0)`` on such hosts.

    Returns:
        tuple: ``(used, total)`` as integers in the unit nvidia-smi emits
        with ``nounits`` (callers print it as MiB). ``(0, 0)`` when
        nvidia-smi cannot be run, times out, or prints something that does
        not parse.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        used_sum = 0
        total_sum = 0
        for line in r.stdout.strip().splitlines():
            if not line.strip():
                continue
            fields = line.split(",")
            used_sum += int(fields[0].strip())
            total_sum += int(fields[1].strip())
        return used_sum, total_sum
    except Exception:
        # Best effort: VRAM figures are informational only, so any failure
        # degrades to zeros instead of aborting the tuning run.
        return 0, 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_config(cfg, label=""):
    """Benchmark one server configuration and record the outcome.

    Kills any running server, starts a fresh one for ``cfg``, waits until it
    is healthy, issues one short warmup request and then ``BENCHMARK_RUNS``
    timed requests.

    Args:
        cfg: Configuration mapping accepted by ``start_server``.
        label: Short description used in console output and stored in the
            result; defaults to ``str(cfg)`` for console output only.

    Returns:
        dict | None: ``cfg`` extended with "avg_tps", "best_tps",
        "vram_used", "vram_total" and "label" (also appended to
        ``ALL_RESULTS``), or ``None`` when the server did not start or every
        timed run failed.
    """
    kill_server()
    desc = label or str(cfg)
    print(f" [{desc}] Starting server...")
    proc = start_server(cfg)

    speeds = []
    try:
        if not wait_for_server():
            print(f" [{desc}] FAILED to start")
            return None

        vram_used, vram_total = get_vram()
        print(f" [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)

        # Warmup: the result is discarded and a failure here is not fatal.
        try:
            run_benchmark(max_tokens=20)
        except Exception:
            pass

        # Benchmark
        for _ in range(BENCHMARK_RUNS):
            try:
                speeds.append(run_benchmark())
            except Exception as e:
                print(f"ERR({e}) ", end="", flush=True)
    finally:
        # Always stop and reap the server, including on Ctrl-C, so a stray
        # process never keeps the port or GPU memory occupied.
        proc.kill()
        proc.wait()

    if not speeds:
        print("ALL FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

    result = {**cfg, "avg_tps": avg, "best_tps": best,
              "vram_used": vram_used, "vram_total": vram_total, "label": label}
    ALL_RESULTS.append(result)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def phase_sweep(phase_name, param_name, values, base_cfg):
    """Benchmark every candidate value of one tuning phase.

    Args:
        phase_name: Heading printed for this phase.
        param_name: Either a single config key, or a list of keys that are
            varied together.
        values: Candidates to try. When ``param_name`` is a list, each
            candidate is a sequence paired positionally with the keys.
        base_cfg: Configuration copied for every trial; it is not modified.

    Returns:
        dict | None: The ``test_config`` result with the highest "avg_tps",
        or ``None`` when no candidate produced a result.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(f" PHASE: {phase_name}")
    print(f" Sweeping: {param_name} = {values}")
    print(rule)

    varies_several = isinstance(param_name, list)
    winner = None
    for candidate in values:
        trial = dict(base_cfg)
        if varies_several:
            assignments = list(zip(param_name, candidate))
            trial.update(assignments)
            label = " | ".join(f"{key}={value}" for key, value in assignments)
        else:
            trial[param_name] = candidate
            label = f"{param_name}={candidate}"

        outcome = test_config(trial, label)
        if not outcome:
            continue
        # Strict comparison: on a tie the earlier candidate stays the winner.
        if winner is None or outcome["avg_tps"] > winner["avg_tps"]:
            winner = outcome

    if winner:
        print(f"\n ★ Phase winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run all tuning phases, verify the winning configuration, and report.

    Starting from ``BEST``, each phase sweeps one group of parameters via
    ``phase_sweep`` and adopts the winner's values before the next phase
    runs. The final configuration is then benchmarked five more times, a
    recommended command line is printed, and every recorded result is
    written to ``scripts/tune_results_gemma4_256k.json``.
    """
    print("=" * 70)
    print(" Gemma4 26B-A4B COMPREHENSIVE Auto-Tuner")
    print(" 256K Context | RTX 3060 12GB")
    print("=" * 70)
    print()

    cfg = dict(BEST)

    # (heading, swept key or keys, candidate values), in execution order.
    phases = [
        # Phase 1: -ngl (already done, quick verify top 3)
        ("GPU Layers (-ngl)", "ngl", [22, 21, 20]),
        # Phase 2: CPU threads (-t, -tb)
        ("CPU Threads (-t, -tb)", ["t", "tb"], [
            (2, 2), (4, 4), (4, 8), (6, 6), (6, 8),
            (8, 8), (8, 12), (10, 10), (12, 12), (16, 16)
        ]),
        # Phase 3: Batch sizes (-ub, -b)
        ("Batch Sizes (-ub, -b)", ["ub", "b"], [
            (128, 512), (256, 1024), (256, 2048),
            (512, 1024), (512, 2048), (512, 4096),
            (1024, 2048), (1024, 4096)
        ]),
        # Phase 4: KV cache precision
        ("KV Cache Type (-ctk, -ctv)", ["ctk", "ctv"], [
            ("q4_0", "q4_0"),
            ("q8_0", "q8_0"),
            ("q4_0", "q8_0"),
            ("f16", "f16"),
        ]),
        # Phase 5: Misc (mmap, poll, prio)
        ("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], [
            (True, 50, 2),    # baseline
            (False, 50, 2),   # no-mmap
            (True, 0, 2),     # no polling
            (True, 100, 2),   # max polling
            (True, 50, 3),    # realtime priority
            (False, 0, 3),    # no-mmap + no-poll + realtime
        ]),
    ]
    for heading, keys, candidates in phases:
        r = phase_sweep(heading, keys, candidates, cfg)
        if r:
            # Carry the winner's values forward; a phase with no successful
            # run leaves the previous values in place.
            for key in (keys if isinstance(keys, list) else [keys]):
                cfg[key] = r[key]

    # ─── Final Report ───
    print()
    print("=" * 70)
    print(" FINAL OPTIMAL CONFIGURATION")
    print("=" * 70)
    print(f" ngl: {cfg['ngl']}")
    print(f" threads: -t {cfg['t']} -tb {cfg['tb']}")
    print(f" batch: -ub {cfg['ub']} -b {cfg['b']}")
    print(f" kv cache: -ctk {cfg['ctk']} -ctv {cfg['ctv']}")
    print(f" flash: -fa {cfg['fa']}")
    print(f" mlock: {'yes' if cfg['mlock'] else 'no'}")
    print(f" mmap: {'yes' if cfg['mmap'] else 'no (--no-mmap)'}")
    print(f" prio: {cfg['prio']}")
    print(f" poll: {cfg['poll']}")
    print()

    # Final verification run
    print(" Running final verification (5 runs)...")
    kill_server()
    proc = start_server(cfg)
    final_speeds = []
    try:
        if wait_for_server():
            # Warmup; its result is discarded and a failure is not fatal.
            try:
                run_benchmark(max_tokens=20)
            except Exception:
                pass
            for i in range(5):
                try:
                    tps = run_benchmark()
                except Exception as e:
                    # Report the failed run instead of hiding it silently.
                    print(f" Run {i+1}: ERR({e})")
                    continue
                final_speeds.append(tps)
                print(f" Run {i+1}: {tps:.2f} t/s")
        else:
            # Without this check the runs below would all fail against a
            # server that never came up, with no explanation printed.
            print(" Final verification skipped: server FAILED to start")
    finally:
        # Stop and reap the server even if the verification is interrupted.
        proc.kill()
        proc.wait()

    if final_speeds:
        avg = sum(final_speeds) / len(final_speeds)
        best = max(final_speeds)
        print(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")

    print()
    cmd_parts = [
        f"llama-server --model {MODEL}",
        f"-ngl {cfg['ngl']} -c {CONTEXT}",
        f"-t {cfg['t']} -tb {cfg['tb']}",
        f"-ub {cfg['ub']} -b {cfg['b']}",
        f"-fa {cfg['fa']}",
        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
        f"--prio {cfg['prio']} --poll {cfg['poll']}",
    ]
    if cfg["mlock"]:
        cmd_parts.append("--mlock")
    if not cfg["mmap"]:
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")

    print(" Recommended command:")
    print(f" {' '.join(cmd_parts)}")
    print("=" * 70)

    # Dump all results to JSON
    with open("scripts/tune_results_gemma4_256k.json", "w") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)
    print(f"\n Full results saved: scripts/tune_results_gemma4_256k.json")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
163
scripts/auto_tune_gemma4_ncpumoe.py
Normal file
163
scripts/auto_tune_gemma4_ncpumoe.py
Normal file
@@ -0,0 +1,163 @@
|
|||||||
|
"""
|
||||||
|
Gemma4 26B --n-cpu-moe sweep + secondary param tuning at 256K context.
|
||||||
|
Gemma4 has 30 layers. Sweep n-cpu-moe from 0 to 30.
|
||||||
|
"""
|
||||||
|
import subprocess, time, json, urllib.request, sys, os
|
||||||
|
|
||||||
|
try:
|
||||||
|
sys.stdout.reconfigure(encoding='utf-8')
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
BASE_URL = "http://127.0.0.1:8000"
|
||||||
|
SERVER = r"llama_bin_run\llama-server.exe"
|
||||||
|
MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"
|
||||||
|
CTX = 262144
|
||||||
|
RUNS = 3
|
||||||
|
|
||||||
|
|
||||||
|
def kill():
    """Force-kill all ``llama-server.exe`` processes and wait four seconds.

    ``taskkill`` output is captured and discarded so nothing is printed when
    no server is running.
    """
    subprocess.run(
        ["taskkill", "/F", "/IM", "llama-server.exe"],
        capture_output=True,
    )
    # NOTE(review): the delay presumably gives the OS time to free the port
    # and GPU memory -- confirm.
    time.sleep(4)
|
||||||
|
|
||||||
|
|
||||||
|
def start(ncpumoe, t=4, ub=512, b=2048, ctk="q4_0", ctv="q4_0", prio=3, nommap=False):
    """Launch llama-server with the given tuning parameters.

    The child's stdout and stderr are discarded. They used to be attached to
    a pipe that nothing in this script ever read, so a server that logged
    enough output could block on a full pipe and hang the benchmark.

    Args:
        ncpumoe: Value for ``--n-cpu-moe``; the flag is omitted when it is
            not greater than zero.
        t: Thread count, passed as both ``-t`` and ``-tb``.
        ub: Value for ``-ub``.
        b: Value for ``-b``.
        ctk: Value for ``--cache-type-k``.
        ctv: Value for ``--cache-type-v``.
        prio: Value for ``--prio``.
        nommap: When true, ``--no-mmap`` is appended.

    Returns:
        subprocess.Popen: Handle of the started server, launched from the
        current working directory. Its ``stdout`` attribute is ``None``.
    """
    cmd = [SERVER, "--model", MODEL, "-ngl", "999",
           "-c", str(CTX), "-np", "1", "-fa", "on",
           "--cache-type-k", ctk, "--cache-type-v", ctv,
           "-ub", str(ub), "-b", str(b), "-t", str(t), "-tb", str(t),
           "--prio", str(prio), "--poll", "50",
           "--mlock", "--port", "8000", "--host", "0.0.0.0"]
    if ncpumoe > 0:
        cmd.extend(["--n-cpu-moe", str(ncpumoe)])
    if nommap:
        cmd.append("--no-mmap")
    return subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,  # follows stdout, i.e. also discarded
        cwd=os.getcwd(),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def wait_ready(timeout=240):
    """Poll the server's ``/health`` endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling. A value of zero
            or less returns ``False`` without issuing any request.

    Returns:
        bool: ``True`` as soon as ``/health`` answers with JSON whose
        "status" field equals "ok"; ``False`` once the timeout has elapsed.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(urllib.request.Request(f"{BASE_URL}/health"), timeout=3) as r:
                if json.loads(r.read()).get("status") == "ok":
                    return True
        except Exception:
            # Not reachable or not ready yet. Catching Exception rather than
            # using a bare except keeps Ctrl-C able to abort the wait.
            pass
        time.sleep(2)
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def bench(n=200):
    """Time one chat-completion request and return tokens per second.

    Sends a fixed counting prompt to ``{BASE_URL}/v1/chat/completions`` at
    temperature 0.0 and divides the reply's ``usage.completion_tokens`` by
    the wall-clock duration of the whole request.

    Args:
        n: Value sent as the request's "max_tokens" field.

    Returns:
        Completion tokens per second, or 0 when the measured duration is not
        positive. A reply without usage data counts as 0 completion tokens.
    """
    body = {
        "model": "m",
        "messages": [{"role": "user",
                      "content": "Count from 1 to 50, each number on new line."}],
        "max_tokens": n,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    began = time.time()
    with urllib.request.urlopen(request, timeout=300) as response:
        reply = json.loads(response.read())
    duration = time.time() - began
    tokens = reply.get("usage", {}).get("completion_tokens", 0)
    if duration > 0:
        return tokens / duration
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
def vram():
    """Report GPU memory usage as ``(used, total)`` by querying nvidia-smi.

    nvidia-smi prints one "used, total" line per GPU. The values of all
    lines are summed, so hosts with several GPUs report their combined
    usage. The earlier two-value unpack of a single ``split(",")`` failed on
    multi-line output and silently fell back to ``(0, 0)``.

    Returns:
        tuple: ``(used, total)`` as integers in the unit nvidia-smi emits
        with ``nounits``. ``(0, 0)`` when nvidia-smi cannot be run, times
        out, or prints something that does not parse.
    """
    try:
        r = subprocess.run(["nvidia-smi", "--query-gpu=memory.used,memory.total",
                            "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5)
        used_sum = 0
        total_sum = 0
        for line in r.stdout.strip().splitlines():
            if not line.strip():
                continue
            a, b = line.split(",")
            used_sum += int(a.strip())
            total_sum += int(b.strip())
        return used_sum, total_sum
    except Exception:
        # Best effort: the figure is informational only, so any failure
        # degrades to zeros instead of aborting the tuning run.
        return 0, 0
|
||||||
|
|
||||||
|
|
||||||
|
def test(label, ncpumoe, **kw):
    """Benchmark one ``--n-cpu-moe`` setting and return its measurements.

    Kills any running server, starts a fresh one, waits until it is healthy,
    issues one short warmup request and then ``RUNS`` timed requests.

    Args:
        label: Short description used in console output and in the result.
        ncpumoe: Value forwarded to ``start`` as its first argument.
        **kw: Extra keyword arguments forwarded to ``start``; they are also
            copied into the returned dict.

    Returns:
        dict | None: "label", "ncpumoe", "avg", "best", "vram" plus the
        entries of ``kw``; ``None`` when the server did not start or every
        timed run failed.
    """
    kill()
    print(f" [{label}] Starting...", end=" ", flush=True)
    p = start(ncpumoe, **kw)
    speeds = []
    try:
        if not wait_ready():
            print("FAILED")
            return None
        vu, vt = vram()
        print(f"VRAM:{vu}/{vt} | ", end="", flush=True)
        # Warmup: the result is discarded and a failure here is not fatal.
        try:
            bench(20)
        except Exception:
            pass
        for _ in range(RUNS):
            try:
                speeds.append(bench())
            except Exception:
                # A failed run is simply left out of the average.
                pass
    finally:
        # Always stop and reap the server, including on Ctrl-C, so a stray
        # process never keeps the port or GPU memory occupied.
        p.kill()
        p.wait()
    if not speeds:
        print("BENCH FAILED")
        return None
    avg, best = sum(speeds) / len(speeds), max(speeds)
    print(f"AVG:{avg:.1f} BEST:{best:.1f} t/s")
    return {"label": label, "ncpumoe": ncpumoe, "avg": avg, "best": best,
            "vram": vu, **kw}
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Sweep ``--n-cpu-moe``, then threads, batch sizes and KV cache types.

    Phase 1 tries a coarse grid of n-cpu-moe values and Phase 1b probes the
    neighbours of the best one. Phases 2-4 vary threads, batch sizes and KV
    cache type at that n-cpu-moe value. The best result overall is printed
    and all results are written to
    ``scripts/tune_results_gemma4_ncpumoe.json``.

    If no Phase 1 configuration yields a result, the function reports that
    and returns without writing the file.
    """
    max_layers = 30  # upper bound of the n-cpu-moe sweep (model layer count)

    def by_avg(entry):
        return entry["avg"]

    print("=" * 60)
    print(" Gemma4 26B 256K | --n-cpu-moe Sweep + Param Tune")
    print("=" * 60)
    results = []

    # Phase 1: n-cpu-moe sweep (0, 5, 10, 15, 20, 25, 30)
    print("\n--- Phase 1: --n-cpu-moe sweep ---")
    coarse = [0, 5, 10, 15, 20, 25, 30]
    for n in coarse:
        nm = n > 15  # use --no-mmap when heavy CPU offload
        r = test(f"ncpumoe={n}", n, nommap=nm)
        if r:
            results.append(r)

    if not results:
        # max() on an empty list would raise ValueError; stop cleanly.
        print("\n No configuration produced a benchmark result; nothing to tune.")
        return

    # Find best n-cpu-moe
    best_r = max(results, key=by_avg)
    best_n = best_r["ncpumoe"]
    print(f"\n ★ Best n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")

    # Fine-tune around best
    if best_n > 0:
        print(f"\n--- Phase 1b: Fine-tune around ncpumoe={best_n} ---")
        tried = set(coarse)
        for n in (best_n - 3, best_n - 1, best_n + 1, best_n + 3):
            # Stay inside 0..max_layers and never repeat a value that was
            # already benchmarked (previously best_n + 1 could reach 31).
            if n < 0 or n > max_layers or n in tried:
                continue
            tried.add(n)
            nm = n > 15
            r = test(f"ncpumoe={n}", n, nommap=nm)
            if r:
                results.append(r)
        best_r = max(results, key=by_avg)
        best_n = best_r["ncpumoe"]
        print(f"\n ★ Refined n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")

    # Phase 2: Thread sweep at best n-cpu-moe
    nm = best_n > 15
    print(f"\n--- Phase 2: Thread sweep (ncpumoe={best_n}) ---")
    for t in [2, 4, 6, 8, 10]:
        r = test(f"t={t}", best_n, t=t, nommap=nm)
        if r:
            results.append(r)
    # Non-empty: best_r itself was measured at best_n. Results from Phase 1
    # carry no "t" entry, hence the default of 4 (the default of start()).
    best_t = max([x for x in results if x["ncpumoe"] == best_n], key=by_avg)
    bt = best_t.get("t", 4)
    print(f"\n ★ Best threads: {bt}")

    # Phase 3: Batch sweep
    print(f"\n--- Phase 3: Batch sweep ---")
    for ub, b in [(256, 1024), (512, 2048), (512, 4096), (1024, 2048)]:
        r = test(f"ub={ub},b={b}", best_n, t=bt, ub=ub, b=b, nommap=nm)
        if r:
            results.append(r)

    # Phase 4: KV cache type
    # NOTE(review): this phase runs with the default batch sizes rather than
    # the Phase 3 winner -- confirm whether that is intended.
    print(f"\n--- Phase 4: KV cache type ---")
    for ctk, ctv in [("q4_0", "q4_0"), ("q8_0", "q8_0")]:
        r = test(f"kv={ctk}", best_n, t=bt, ctk=ctk, ctv=ctv, nommap=nm)
        if r:
            results.append(r)

    # Final report
    best_all = max(results, key=by_avg)
    print(f"\n{'='*60}")
    print(f" FINAL BEST: {best_all['label']} → {best_all['avg']:.1f} t/s (VRAM: {best_all['vram']})")
    print(f"{'='*60}")

    with open("scripts/tune_results_gemma4_ncpumoe.json", "w") as f:
        json.dump(results, f, indent=2, default=str)
    print(" Saved: scripts/tune_results_gemma4_ncpumoe.json")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
335
scripts/auto_tune_qwen35b_256k.py
Normal file
335
scripts/auto_tune_qwen35b_256k.py
Normal file
@@ -0,0 +1,335 @@
|
|||||||
|
"""
|
||||||
|
Qwen3.5 35B-A3B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
|
||||||
|
Based on existing optimized setting: --cpu-moe -ngl 999 -c 4096 -t 6 (35 t/s)
|
||||||
|
Now tuning for -c 262144 (256K context).
|
||||||
|
|
||||||
|
Phase 1: --cpu-moe vs no --cpu-moe baseline
|
||||||
|
Phase 2: -t / -tb sweep
|
||||||
|
Phase 3: -ub / -b sweep
|
||||||
|
Phase 4: --cache-type-k/v sweep
|
||||||
|
Phase 5: Misc (mmap, poll, prio)
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
import urllib.request
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
try:
|
||||||
|
sys.stdout.reconfigure(encoding='utf-8')
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
BASE_URL = "http://127.0.0.1:8000"
|
||||||
|
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
|
||||||
|
MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
|
||||||
|
CONTEXT = 262144
|
||||||
|
BENCHMARK_RUNS = 3
|
||||||
|
BENCHMARK_TOKENS = 200
|
||||||
|
|
||||||
|
BEST = {
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": True,
|
||||||
|
"t": 6,
|
||||||
|
"tb": 6,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": True,
|
||||||
|
"mmap": True,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
}
|
||||||
|
|
||||||
|
ALL_RESULTS = []
|
||||||
|
|
||||||
|
|
||||||
|
def kill_server():
    """Force-kill every ``llama-server.exe`` process, then pause.

    Runs the Windows ``taskkill`` utility with its output captured and
    discarded, then sleeps for four seconds before returning.
    """
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill_cmd, capture_output=True)
    # NOTE(review): the pause presumably lets the OS release the port and
    # GPU memory before the next launch -- confirm.
    time.sleep(4)
|
||||||
|
|
||||||
|
|
||||||
|
def build_cmd(cfg):
    """Assemble the llama-server argument vector for one configuration.

    Args:
        cfg: Mapping that provides the keys "ngl", "fa", "ctk", "ctv", "ub",
            "b", "t", "tb", "prio", "poll", "mlock" and "mmap"; the key
            "cpu_moe" is optional.

    Returns:
        list: The executable path (``LLAMA_SERVER``) followed by its
        arguments. The model path, context size, slot count, port and host
        are fixed; everything else is taken from ``cfg``.
    """
    argv = [LLAMA_SERVER, "--model", MODEL]
    argv += ["-ngl", str(cfg["ngl"])]
    argv += ["-c", str(CONTEXT)]
    argv += ["-np", "1"]
    argv += ["-fa", cfg["fa"]]
    argv += ["--cache-type-k", cfg["ctk"]]
    argv += ["--cache-type-v", cfg["ctv"]]
    argv += ["-ub", str(cfg["ub"])]
    argv += ["-b", str(cfg["b"])]
    argv += ["-t", str(cfg["t"])]
    argv += ["-tb", str(cfg["tb"])]
    argv += ["--prio", str(cfg["prio"])]
    argv += ["--poll", str(cfg["poll"])]
    argv += ["--port", "8000"]
    argv += ["--host", "0.0.0.0"]

    # Boolean switches are appended only when enabled. "cpu_moe" may be
    # absent (treated as off); "mmap" is inverted because the server flag
    # disables memory mapping.
    switches = (
        ("--cpu-moe", cfg.get("cpu_moe")),
        ("--mlock", cfg["mlock"]),
        ("--no-mmap", not cfg["mmap"]),
    )
    argv.extend(flag for flag, enabled in switches if enabled)
    return argv
|
||||||
|
|
||||||
|
|
||||||
|
def start_server(cfg):
    """Launch llama-server for ``cfg`` and return the process handle.

    The child's stdout and stderr are discarded. They used to be attached to
    a pipe; an unread pipe fills up once the server has logged enough, after
    which the server blocks on its next write and the benchmark hangs.

    Args:
        cfg: Configuration mapping accepted by ``build_cmd``.

    Returns:
        subprocess.Popen: Handle of the started server, launched from the
        current working directory. Its ``stdout`` attribute is ``None``.
    """
    return subprocess.Popen(
        build_cmd(cfg),
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,  # follows stdout, i.e. also discarded
        cwd=os.getcwd(),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_server(timeout=240):
    """Poll the server's ``/health`` endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling. A value of zero
            or less returns ``False`` without issuing any request.

    Returns:
        bool: ``True`` as soon as ``/health`` answers with JSON whose
        "status" field equals "ok"; ``False`` once the timeout has elapsed.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=3) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Connection refused, HTTP errors, or a malformed body all mean
            # "not ready yet". Catching Exception rather than using a bare
            # except keeps Ctrl-C and SystemExit able to abort the wait.
            pass
        time.sleep(2)
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Time one chat-completion request and return its throughput.

    Posts a fixed counting prompt to ``{BASE_URL}/v1/chat/completions`` with
    temperature 0.0, then divides the ``usage.completion_tokens`` value from
    the reply by the wall-clock duration of the whole request.

    Args:
        max_tokens: Value sent as the request's "max_tokens" field.

    Returns:
        Completion tokens per second, or 0 when the measured duration is not
        positive. A reply without usage data counts as 0 completion tokens.
    """
    body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}],
        "max_tokens": max_tokens,
        "temperature": 0.0
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"}
    )

    began = time.time()
    with urllib.request.urlopen(request, timeout=300) as response:
        reply = json.loads(response.read())
    duration = time.time() - began

    completion_tokens = reply.get("usage", {}).get("completion_tokens", 0)
    if duration > 0:
        return completion_tokens / duration
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
def get_vram():
    """Report GPU memory usage as ``(used, total)`` by querying nvidia-smi.

    nvidia-smi prints one "used, total" line per GPU. The values of all
    lines are summed, so hosts with several GPUs report their combined
    usage. The earlier single ``split(",")`` could not parse multi-line
    output and silently fell back to ``(0, 0)`` on such hosts.

    Returns:
        tuple: ``(used, total)`` as integers in the unit nvidia-smi emits
        with ``nounits`` (callers print it as MiB). ``(0, 0)`` when
        nvidia-smi cannot be run, times out, or prints something that does
        not parse.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        used_sum = 0
        total_sum = 0
        for line in r.stdout.strip().splitlines():
            if not line.strip():
                continue
            fields = line.split(",")
            used_sum += int(fields[0].strip())
            total_sum += int(fields[1].strip())
        return used_sum, total_sum
    except Exception:
        # Best effort: VRAM figures are informational only, so any failure
        # degrades to zeros instead of aborting the tuning run.
        return 0, 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_config(cfg, label=""):
    """Benchmark one server configuration and record the outcome.

    Kills any running server, starts a fresh one for ``cfg``, waits until it
    is healthy, issues one short warmup request and then ``BENCHMARK_RUNS``
    timed requests.

    Args:
        cfg: Configuration mapping accepted by ``start_server``.
        label: Short description used in console output and stored in the
            result; defaults to ``str(cfg)`` for console output only.

    Returns:
        dict | None: A copy of ``cfg`` extended with "avg_tps", "best_tps",
        "vram_used", "vram_total" and "label" (also appended to
        ``ALL_RESULTS``), or ``None`` when the server did not start or every
        timed run failed.
    """
    kill_server()
    desc = label or str(cfg)
    print(f" [{desc}] Starting server...", flush=True)
    proc = start_server(cfg)

    speeds = []
    try:
        if not wait_for_server():
            print(f" [{desc}] FAILED to start")
            return None

        vram_used, vram_total = get_vram()
        print(f" [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)

        # Warmup: the result is discarded and a failure here is not fatal.
        try:
            run_benchmark(max_tokens=20)
        except Exception:
            pass

        for _ in range(BENCHMARK_RUNS):
            try:
                speeds.append(run_benchmark())
            except Exception as e:
                print(f"ERR({e}) ", end="", flush=True)
    finally:
        # Always stop and reap the server, including on Ctrl-C, so a stray
        # process never keeps the port or GPU memory occupied.
        proc.kill()
        proc.wait()

    if not speeds:
        print("ALL FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

    result = {**cfg, "avg_tps": avg, "best_tps": best,
              "vram_used": vram_used, "vram_total": vram_total, "label": label}
    ALL_RESULTS.append(result)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def phase_sweep(phase_name, param_name, values, base_cfg):
    """Benchmark every candidate value of one tuning phase.

    Args:
        phase_name: Heading printed for this phase.
        param_name: Either a single config key, or a list of keys that are
            varied together.
        values: Candidates to try. When ``param_name`` is a list, each
            candidate is a sequence paired positionally with the keys.
        base_cfg: Configuration copied for every trial; it is not modified.

    Returns:
        dict | None: The ``test_config`` result with the highest "avg_tps",
        or ``None`` when no candidate produced a result.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(f" PHASE: {phase_name}")
    print(f" Sweeping: {param_name} = {values}")
    print(rule)

    varies_several = isinstance(param_name, list)
    winner = None
    for candidate in values:
        trial = dict(base_cfg)
        if varies_several:
            assignments = list(zip(param_name, candidate))
            trial.update(assignments)
            label = " | ".join(f"{key}={value}" for key, value in assignments)
        else:
            trial[param_name] = candidate
            label = f"{param_name}={candidate}"

        outcome = test_config(trial, label)
        if not outcome:
            continue
        # Strict comparison: on a tie the earlier candidate stays the winner.
        if winner is None or outcome["avg_tps"] > winner["avg_tps"]:
            winner = outcome

    if winner:
        print(f"\n ★ Phase winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run every tuning phase, verify the winning config and report it.

    Each phase sweeps one group of parameters with ``phase_sweep`` and, when
    a winner exists, carries the winning values forward into ``cfg`` for the
    following phases. Afterwards the final configuration is benchmarked five
    more times, a ready-to-paste ``llama-server`` command is printed, and all
    collected results are written to
    ``scripts/tune_results_qwen35b_256k.json``.
    """
    print("=" * 70)
    print(" Qwen3.5 35B-A3B COMPREHENSIVE Auto-Tuner")
    print(" 256K Context | RTX 3060 12GB")
    print(" Baseline: --cpu-moe -ngl 999 -c 4096 -t 6 → 35 t/s")
    print("=" * 70)
    print()

    # Work on a copy so the module-level defaults stay untouched.
    cfg = dict(BEST)

    # ─── Phase 1: --cpu-moe critical test ───
    r = phase_sweep("CPU-MoE Mode", "cpu_moe", [True, False], cfg)
    if r:
        cfg["cpu_moe"] = r["cpu_moe"]

    # ─── Phase 2: CPU threads ───
    thread_combos = [
        (2, 2), (4, 4), (4, 6), (6, 6), (6, 8),
        (8, 8), (8, 12), (10, 10), (12, 12)
    ]
    r = phase_sweep("CPU Threads (-t, -tb)", ["t", "tb"], thread_combos, cfg)
    if r:
        cfg["t"] = r["t"]
        cfg["tb"] = r["tb"]

    # ─── Phase 3: Batch sizes ───
    batch_combos = [
        (128, 512), (256, 1024), (256, 2048),
        (512, 1024), (512, 2048), (512, 4096),
        (1024, 2048), (1024, 4096)
    ]
    r = phase_sweep("Batch Sizes (-ub, -b)", ["ub", "b"], batch_combos, cfg)
    if r:
        cfg["ub"] = r["ub"]
        cfg["b"] = r["b"]

    # ─── Phase 4: KV cache ───
    kv_combos = [
        ("q4_0", "q4_0"),
        ("q8_0", "q8_0"),
        ("f16", "f16"),
    ]
    r = phase_sweep("KV Cache Type", ["ctk", "ctv"], kv_combos, cfg)
    if r:
        cfg["ctk"] = r["ctk"]
        cfg["ctv"] = r["ctv"]

    # ─── Phase 5: Misc ───
    misc_combos = [
        (True, 50, 2),
        (False, 50, 2),
        (True, 0, 2),
        (True, 100, 2),
        (True, 50, 3),
    ]
    r = phase_sweep("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], misc_combos, cfg)
    if r:
        cfg["mmap"] = r["mmap"]
        cfg["poll"] = r["poll"]
        cfg["prio"] = r["prio"]

    # ─── Final Report ───
    print()
    print("=" * 70)
    print(" FINAL OPTIMAL CONFIGURATION")
    print("=" * 70)
    for k, v in cfg.items():
        print(f" {k:>12}: {v}")
    print()

    # Final verification
    print(" Running final verification (5 runs)...")
    kill_server()
    proc = start_server(cfg)
    final_speeds = []
    try:
        # NOTE(review): the return value is ignored here, as in the original;
        # confirm whether wait_for_server reports a startup failure.
        wait_for_server()
        try:
            run_benchmark(max_tokens=20)
        except Exception:
            # Warmup is best effort. `Exception` (not a bare except) keeps
            # Ctrl-C working during the run.
            pass
        for i in range(5):
            try:
                tps = run_benchmark()
                final_speeds.append(tps)
                print(f" Run {i+1}: {tps:.2f} t/s")
            except Exception as e:
                # Report the failure instead of dropping the run silently.
                print(f" Run {i+1}: ERR({e})")
    finally:
        # Never leave the server running, even if the verification is
        # interrupted.
        proc.kill()

    if final_speeds:
        avg = sum(final_speeds) / len(final_speeds)
        best = max(final_speeds)
        print(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")

    print()
    cmd_parts = [
        f"llama-server --model {MODEL}",
        f"-ngl {cfg['ngl']} -c {CONTEXT}",
    ]
    if cfg.get("cpu_moe"):
        cmd_parts.append("--cpu-moe")
    cmd_parts.extend([
        f"-t {cfg['t']} -tb {cfg['tb']}",
        f"-ub {cfg['ub']} -b {cfg['b']}",
        f"-fa {cfg['fa']}",
        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
        f"--prio {cfg['prio']} --poll {cfg['poll']}",
    ])
    if cfg["mlock"]:
        cmd_parts.append("--mlock")
    if not cfg["mmap"]:
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")

    print(" Recommended command:")
    print(f" {' '.join(cmd_parts)}")
    print("=" * 70)

    # default=str lets json.dump serialise any non-JSON value via str().
    with open("scripts/tune_results_qwen35b_256k.json", "w") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)
    print(f"\n Full results saved: scripts/tune_results_qwen35b_256k.json")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the tuner only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
BIN
scripts/boot_122b.txt
Normal file
BIN
scripts/boot_122b.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_38.txt
Normal file
BIN
scripts/boot_122b_38.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_42.txt
Normal file
BIN
scripts/boot_122b_42.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_44.txt
Normal file
BIN
scripts/boot_122b_44.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_auto.txt
Normal file
BIN
scripts/boot_122b_auto.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_maxmem.txt
Normal file
BIN
scripts/boot_122b_maxmem.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_row.txt
Normal file
BIN
scripts/boot_122b_row.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_row_dual.txt
Normal file
BIN
scripts/boot_122b_row_dual.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_single.txt
Normal file
BIN
scripts/boot_122b_single.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_single2.txt
Normal file
BIN
scripts/boot_122b_single2.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_ts85.txt
Normal file
BIN
scripts/boot_122b_ts85.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_tune.txt
Normal file
BIN
scripts/boot_122b_tune.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_tuned.txt
Normal file
BIN
scripts/boot_122b_tuned.txt
Normal file
Binary file not shown.
BIN
scripts/boot_122b_v2.txt
Normal file
BIN
scripts/boot_122b_v2.txt
Normal file
Binary file not shown.
BIN
scripts/boot_log.txt
Normal file
BIN
scripts/boot_log.txt
Normal file
Binary file not shown.
BIN
scripts/boot_log2.txt
Normal file
BIN
scripts/boot_log2.txt
Normal file
Binary file not shown.
BIN
scripts/boot_log3.txt
Normal file
BIN
scripts/boot_log3.txt
Normal file
Binary file not shown.
BIN
scripts/boot_log4.txt
Normal file
BIN
scripts/boot_log4.txt
Normal file
Binary file not shown.
BIN
scripts/boot_log5.txt
Normal file
BIN
scripts/boot_log5.txt
Normal file
Binary file not shown.
BIN
scripts/boot_qwen_iq4.txt
Normal file
BIN
scripts/boot_qwen_iq4.txt
Normal file
Binary file not shown.
3
scripts/check_help.bat
Normal file
3
scripts/check_help.bat
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
@echo off
REM Capture llama-server's --help output (stderr merged into stdout) and keep
REM only the lines that contain any of the listed keywords, ignoring case.
REM The matching lines are written to scripts\help_gpu_flags.txt.
.\llama_bin_run\llama-server.exe --help 2>&1 | findstr /i "split tensor device main-gpu cpu-moe n-cpu-moe" > scripts\help_gpu_flags.txt
echo Done.
|
||||||
531
scripts/dual_gpu_benchmark.mjs
Normal file
531
scripts/dual_gpu_benchmark.mjs
Normal file
@@ -0,0 +1,531 @@
|
|||||||
|
/**
|
||||||
|
* Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
|
||||||
|
* ===========================================================
|
||||||
|
* Tests 4 models across multiple parameter configurations to find
|
||||||
|
* the absolute best model + settings for 256K context coding agent.
|
||||||
|
*
|
||||||
|
* Models:
|
||||||
|
* 1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB)
|
||||||
|
* 2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB)
|
||||||
|
* 3. Gemma4 26B-A4B Q4_K_M (~15.6 GB)
|
||||||
|
* 4. Gemma4 26B-A4B MXFP4_MOE (~15.5 GB)
|
||||||
|
*
|
||||||
|
* Run: node scripts/dual_gpu_benchmark.mjs
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { spawn, execSync } from "child_process";
|
||||||
|
import { writeFileSync, statSync, existsSync } from "fs";
|
||||||
|
import { resolve } from "path";
|
||||||
|
|
||||||
|
// ─── Configuration ─────────────────────────────────────────────
|
||||||
|
// Base URL used for the health probe and the chat-completion requests.
const BASE_URL = "http://127.0.0.1:8000";
// Server executable, given relative to the working directory.
const LLAMA_SERVER = String.raw`llama_bin_run\llama-server.exe`;
const CONTEXT = 262144; // 256K
// Number of timed requests per tested configuration.
const BENCHMARK_RUNS = 3;
// Default max_tokens for a timed request.
const BENCHMARK_TOKENS = 200;
const SERVER_TIMEOUT = 300_000; // ms

// Models to benchmark. `totalLayers` bounds the n-cpu-moe sweep and is halved
// for the reduced-layer boot fallback.
const MODELS = [
  {
    name: "Qwen3.5-35B-A3B Q4_K_M",
    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
    type: "qwen", quant: "Q4_K_M", totalLayers: 64,
  },
  {
    name: "Qwen3.5-35B-A3B MXFP4_MOE",
    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
    type: "qwen", quant: "MXFP4_MOE", totalLayers: 64,
  },
  {
    name: "Gemma4 26B-A4B Q4_K_M",
    path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
    type: "gemma4", quant: "Q4_K_M", totalLayers: 30,
  },
  {
    name: "Gemma4 26B-A4B MXFP4_MOE",
    path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
    type: "gemma4", quant: "MXFP4_MOE", totalLayers: 30,
  },
];

// Every successful result object is appended here and later written to
// scripts/dual_gpu_results.json.
const ALL_RESULTS = [];
|
||||||
|
|
||||||
|
// ─── Utility ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Print a message to the console prefixed with the current wall-clock time.
 * @param {*} msg - Value to print; it is interpolated into the output line.
 */
function log(msg) {
  const now = new Date();
  const stamp = now.toLocaleTimeString("ko-KR", { hour12: false });
  const line = `[${stamp}] ${msg}`;
  console.log(line);
}
|
||||||
|
|
||||||
|
/**
 * Return a promise that resolves (with no value) after the given delay.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||||
|
|
||||||
|
/**
 * Force-terminate every llama-server.exe process, then wait five seconds.
 * @returns {Promise<void>} The promise returned by `sleep(5000)`.
 */
function killServer() {
  const settleMs = 5000;
  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // Best effort: a failing taskkill (presumably when nothing is running)
    // is deliberately ignored.
  }
  return sleep(settleMs);
}
|
||||||
|
|
||||||
|
/**
 * Query nvidia-smi for the memory usage of every GPU.
 *
 * Lines that do not contain three integer fields (blank output, "[N/A]"
 * values, ...) are skipped instead of producing entries full of NaN.
 *
 * @returns {{gpu: number, used: number, total: number}[]} One entry per GPU,
 *   with the numbers exactly as nvidia-smi prints them under `nounits`; an
 *   empty array if nvidia-smi cannot be run or nothing can be parsed.
 */
function getVramAll() {
  try {
    const csv = execSync(
      'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits',
      { encoding: "utf-8", timeout: 5000 }
    );
    const gpus = [];
    for (const line of csv.split("\n")) {
      // trim() also removes the "\r" left over from Windows line endings.
      const [gpu, used, total] = line
        .split(",")
        .map((field) => Number.parseInt(field.trim(), 10));
      if (![gpu, used, total].every((value) => Number.isInteger(value))) continue;
      gpus.push({ gpu, used, total });
    }
    return gpus;
  } catch {
    // nvidia-smi missing, timed out or failed: report "no GPU data".
    return [];
  }
}
|
||||||
|
|
||||||
|
/**
 * Assemble the llama-server command line for one configuration.
 *
 * @param {string} modelPath - Path passed to `--model`.
 * @param {object} params - Tunables: `ngl`, `t`, `ub`, `b`, `ctk`, `ctv`, plus
 *   the optional `cpuMoe` (default false), `nCpuMoe` (default 0), `prio`
 *   (default 3) and `nommap` (default false).
 * @returns {string[]} The executable followed by its arguments.
 */
function buildCmd(modelPath, params) {
  const { cpuMoe = false, nCpuMoe = 0, prio = 3, nommap = false } = params;
  const { ngl, t, ub, b, ctk, ctv } = params;
  // The same thread count is used for both -t and -tb.
  const threads = String(t);

  const argv = [LLAMA_SERVER];
  argv.push("--model", modelPath);
  argv.push("-ngl", String(ngl), "-c", String(CONTEXT), "-np", "1", "-fa", "on");
  argv.push("--cache-type-k", ctk, "--cache-type-v", ctv);
  argv.push("-ub", String(ub), "-b", String(b));
  argv.push("-t", threads, "-tb", threads);
  argv.push("--prio", String(prio), "--poll", "50", "--mlock");
  argv.push("--port", "8000", "--host", "0.0.0.0");

  // --cpu-moe takes precedence; --n-cpu-moe is only added without it.
  if (cpuMoe) {
    argv.push("--cpu-moe");
  } else if (nCpuMoe > 0) {
    argv.push("--n-cpu-moe", String(nCpuMoe));
  }
  if (nommap) {
    argv.push("--no-mmap");
  }
  return argv;
}
|
||||||
|
|
||||||
|
/**
 * Spawn llama-server for the given model and parameters.
 *
 * The child's stdout/stderr stay piped (as before) but are switched to
 * flowing mode so their data is discarded as it arrives; nothing here reads
 * the output, and an unread pipe would eventually block a chatty server. An
 * 'error' listener is attached so a spawn failure (e.g. missing executable)
 * is logged instead of crashing the benchmark with an unhandled 'error'
 * event; the caller then sees the failure as a startup timeout.
 *
 * @param {string} modelPath - Path of the model file to load.
 * @param {object} params - Tunables forwarded to `buildCmd`.
 * @returns {import("child_process").ChildProcess} The spawned process.
 */
function startServer(modelPath, params) {
  const [exe, ...args] = buildCmd(modelPath, params);
  log(` CMD: ${exe} ${args.slice(-12).join(" ")} ...`);
  const proc = spawn(exe, args, {
    cwd: process.cwd(),
    stdio: ["ignore", "pipe", "pipe"],
  });
  proc.on("error", (err) => {
    log(` Server process error: ${err.message}`);
  });
  // Drain both pipes so the child can never stall on a full pipe buffer.
  proc.stdout?.resume();
  proc.stderr?.resume();
  return proc;
}
|
||||||
|
|
||||||
|
/**
 * Poll the server's /health endpoint until it answers with status "ok".
 *
 * @param {number} [timeoutMs=SERVER_TIMEOUT] - Give up once this many
 *   milliseconds have elapsed at the start of a polling round.
 * @returns {Promise<{ok: boolean, bootTime: number}>} `bootTime` is in
 *   seconds: the elapsed time on success, or the timeout itself on failure.
 */
async function waitForServer(timeoutMs = SERVER_TIMEOUT) {
  const startedAt = Date.now();
  const elapsedMs = () => Date.now() - startedAt;

  // One probe; any network, timeout or parse error counts as "not ready".
  const isHealthy = async () => {
    try {
      const resp = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const body = await resp.json();
      return body.status === "ok";
    } catch {
      return false;
    }
  };

  for (;;) {
    if (!(elapsedMs() < timeoutMs)) {
      return { ok: false, bootTime: timeoutMs / 1000 };
    }
    if (await isHealthy()) {
      return { ok: true, bootTime: elapsedMs() / 1000 };
    }
    await sleep(3000);
  }
}
|
||||||
|
|
||||||
|
/**
 * Send one chat-completion request and measure generation throughput.
 *
 * The elapsed time covers the whole request (send, generation and reading
 * the response body).
 *
 * @param {number} [maxTokens=BENCHMARK_TOKENS] - `max_tokens` for the request.
 * @returns {Promise<{tps: number, completionTokens: number,
 *   promptTokens: number, elapsed: number}>} Completion tokens per second of
 *   wall-clock time, the token counts from the response `usage`, and the
 *   elapsed seconds.
 * @throws {Error} If the request fails, times out, or the server answers with
 *   a non-2xx status. An error response used to be counted as a successful
 *   0 t/s sample, which skewed the averages.
 */
async function runBenchmark(maxTokens = BENCHMARK_TOKENS) {
  const payload = JSON.stringify({
    model: "local-model",
    messages: [{ role: "user", content: "Count from 1 to 50, writing each number on a new line." }],
    max_tokens: maxTokens,
    temperature: 0.0,
  });

  const start = Date.now();
  const resp = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
    signal: AbortSignal.timeout(600_000),
  });
  if (!resp.ok) {
    throw new Error(`benchmark request failed: HTTP ${resp.status}`);
  }
  const result = await resp.json();
  const elapsed = (Date.now() - start) / 1000;

  const usage = result?.usage ?? {};
  const ct = usage.completion_tokens ?? 0;
  return {
    tps: elapsed > 0 ? ct / elapsed : 0,
    completionTokens: ct,
    promptTokens: usage.prompt_tokens ?? 0,
    elapsed,
  };
}
|
||||||
|
|
||||||
|
/**
 * Boot the server with one configuration and measure its throughput.
 *
 * Kills any running server, starts a fresh one, waits for it to become
 * healthy, performs a short warmup request and then BENCHMARK_RUNS timed
 * requests. The server is killed again before returning.
 *
 * @param {object} model - Entry with `name`, `quant` and `path`.
 * @param {string} label - Short name for this configuration (used in logs).
 * @param {object} params - Tunables forwarded to `startServer`.
 * @returns {Promise<object|null>} The result record (also appended to
 *   ALL_RESULTS), or null if the server did not start or every run failed.
 */
async function testConfig(model, label, params) {
  await killServer();
  log(` [${label}] Starting server...`);

  const server = startServer(model.path, params);
  const boot = await waitForServer();
  if (!boot.ok) {
    log(` [${label}] FAILED to start (timeout ${SERVER_TIMEOUT / 1000}s)`);
    server.kill("SIGKILL");
    return null;
  }
  const bootTime = boot.bootTime;

  const vram = getVramAll();
  const vramStr = vram.map((g) => `GPU${g.gpu}:${g.used}/${g.total}MiB`).join(" | ");
  log(` [${label}] Boot: ${bootTime.toFixed(0)}s | VRAM: ${vramStr}`);

  // Warmup: a failure here is ignored on purpose.
  try {
    await runBenchmark(20);
  } catch {
    /* ignored */
  }

  // Benchmark
  const samples = [];
  let run = 0;
  while (run < BENCHMARK_RUNS) {
    run += 1;
    try {
      const measurement = await runBenchmark();
      samples.push(measurement.tps);
      log(` Run ${run}: ${measurement.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run ${run}: ERROR (${e.message})`);
    }
  }

  server.kill("SIGKILL");

  if (samples.length === 0) {
    log(` [${label}] ALL BENCHMARK RUNS FAILED`);
    return null;
  }

  let total = 0;
  for (const tps of samples) total += tps;
  const avg = total / samples.length;
  const best = Math.max(...samples);
  log(` [${label}] => AVG: ${avg.toFixed(2)} t/s | BEST: ${best.toFixed(2)} t/s`);

  const record = {
    model: model.name,
    quant: model.quant,
    label,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(best.toFixed(2)),
    boot_time: Number(bootTime.toFixed(1)),
    vram,
    params,
  };
  ALL_RESULTS.push(record);
  return record;
}
|
||||||
|
|
||||||
|
// ─── Phase Runners ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Phase 0: find any configuration that boots the model at full context.
 *
 * Tries, in order: all layers on GPU, all layers with --cpu-moe, and finally
 * half of the model's layers on GPU. Stops at the first success.
 *
 * @param {object} model - Model entry; `totalLayers` sizes the last attempt.
 * @returns {Promise<object|null>} The first successful `testConfig` result,
 *   or the (falsy) outcome of the last attempt.
 */
async function phase0_bootTest(model) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 0: Boot Test — ${model.name}`);
  log(`${rule}`);

  const shared = { t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };
  const attempts = [
    {
      // Try full GPU first
      label: "boot-ngl999",
      notice: null,
      makeParams: () => ({ ngl: 999, ...shared }),
    },
    {
      // Try with cpu-moe
      label: "boot-cpumoe",
      notice: " Full GPU failed, trying with --cpu-moe...",
      makeParams: () => ({ ngl: 999, ...shared, cpuMoe: true }),
    },
    {
      // Reduced layers
      label: "boot-ngl-half",
      notice: " --cpu-moe also failed, trying reduced layers...",
      makeParams: () => ({ ngl: Math.floor(model.totalLayers / 2), ...shared }),
    },
  ];

  let outcome = null;
  for (const attempt of attempts) {
    if (attempt.notice !== null) log(attempt.notice);
    outcome = await testConfig(model, attempt.label, attempt.makeParams());
    if (outcome) return outcome;
  }
  return outcome;
}
|
||||||
|
|
||||||
|
/**
 * Phase 1: compare MoE offload strategies with all layers on the GPU.
 *
 * Tests --cpu-moe on and off, then a sweep of --n-cpu-moe values, and picks
 * the result with the highest average throughput (the baseline competes
 * too). Configurations that would repeat an already measured command line
 * are skipped:
 *   - the baseline's own cpuMoe setting (a missing `cpuMoe` means false, the
 *     same default `buildCmd` applies);
 *   - `n-cpu-moe=0`, which produces exactly the `cpuMoe=false` command.
 *
 * @param {object} model - Model entry; `totalLayers` bounds the sweep.
 * @param {object|null} baseline - Phase 0 result, if any.
 * @returns {Promise<object|null>} Best result, or null if nothing worked.
 */
async function phase1_gpuOffload(model, baseline) {
  log(`\n${"=".repeat(70)}`);
  log(` PHASE 1: GPU Offload Strategy — ${model.name}`);
  log(`${"=".repeat(70)}`);

  const results = baseline ? [baseline] : [];
  const base = baseline?.params;

  // True when the baseline already measured ngl=999 with this cpuMoe value.
  const baselineCovers = (cpuMoe) => {
    if (base == null || base.ngl !== 999) return false;
    if ((base.cpuMoe ?? false) !== cpuMoe) return false;
    // With cpuMoe on, nCpuMoe is ignored; without it, a partial offload in
    // the baseline would be a different configuration.
    return cpuMoe || !((base.nCpuMoe ?? 0) > 0);
  };

  // Test --cpu-moe on/off
  for (const cpuMoe of [true, false]) {
    if (baselineCovers(cpuMoe)) continue;
    const lbl = `ngl=999 cpuMoe=${cpuMoe}`;
    const r = await testConfig(model, lbl, {
      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe,
    });
    if (r) results.push(r);
  }

  // n-cpu-moe sweep (0 is omitted: it is identical to cpuMoe=false above)
  for (const n of [5, 10, 15, 20]) {
    if (n > model.totalLayers) continue;
    const r = await testConfig(model, `n-cpu-moe=${n}`, {
      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", nCpuMoe: n,
    });
    if (r) results.push(r);
  }

  if (results.length === 0) {
    log(" PHASE 1: No config worked!");
    return null;
  }
  // On a tie the later result wins.
  const best = results.reduce((a, b) => (a.avg_tps > b.avg_tps ? a : b));
  log(`\n ★ Phase 1 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
  return best;
}
|
||||||
|
|
||||||
|
async function phase2_threads(model, prev) {
|
||||||
|
log(`\n${"=".repeat(70)}`);
|
||||||
|
log(` PHASE 2: CPU Thread Sweep — ${model.name}`);
|
||||||
|
log(`${"=".repeat(70)}`);
|
||||||
|
|
||||||
|
const p = prev.params;
|
||||||
|
const results = [prev];
|
||||||
|
|
||||||
|
for (const t of [2, 4, 6, 8, 10, 12]) {
|
||||||
|
if (t === p.t) continue;
|
||||||
|
const r = await testConfig(model, `t=${t}`, {
|
||||||
|
...p, t,
|
||||||
|
});
|
||||||
|
if (r) results.push(r);
|
||||||
|
}
|
||||||
|
|
||||||
|
const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
|
||||||
|
log(`\n ★ Phase 2 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
|
||||||
|
return best;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Phase 3: sweep (-ub, -b) batch-size pairs on top of the previous winner.
 *
 * @param {object} model - Model entry being tuned.
 * @param {object} prev - Previous phase winner; its `params` are the base and
 *   its own (ub, b) pair is not re-tested.
 * @returns {Promise<object>} Result with the highest `avg_tps` among `prev`
 *   and the new runs (a later result wins a tie).
 */
async function phase3_batch(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 3: Batch Size Sweep — ${model.name}`);
  log(`${rule}`);

  const baseParams = prev.params;
  const pairs = [
    [128, 512], [256, 1024], [256, 2048],
    [512, 1024], [512, 2048], [512, 4096],
    [1024, 2048], [1024, 4096],
  ];

  // Running champion; it is replaced whenever it is not strictly faster.
  let champion = prev;
  for (const [ub, b] of pairs) {
    const alreadyMeasured = ub === baseParams.ub && b === baseParams.b;
    if (alreadyMeasured) continue;
    const outcome = await testConfig(model, `ub=${ub} b=${b}`, { ...baseParams, ub, b });
    if (!outcome) continue;
    if (!(champion.avg_tps > outcome.avg_tps)) champion = outcome;
  }

  log(`\n ★ Phase 3 winner: ${champion.label} → ${champion.avg_tps.toFixed(2)} t/s`);
  return champion;
}
|
||||||
|
|
||||||
|
/**
 * Phase 4: sweep KV cache type pairs on top of the previous winner.
 *
 * @param {object} model - Model entry being tuned.
 * @param {object} prev - Previous phase winner; its `params` are the base and
 *   its own (ctk, ctv) pair is not re-tested.
 * @returns {Promise<object>} Result with the highest `avg_tps` among `prev`
 *   and the new runs (a later result wins a tie).
 */
async function phase4_kvcache(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 4: KV Cache Type Sweep — ${model.name}`);
  log(`${rule}`);

  const baseParams = prev.params;
  const cachePairs = [
    ["q4_0", "q4_0"], ["q8_0", "q8_0"],
    ["q4_0", "q8_0"], ["f16", "f16"],
  ];

  // Running champion; it is replaced whenever it is not strictly faster.
  let champion = prev;
  for (const [ctk, ctv] of cachePairs) {
    const alreadyMeasured = ctk === baseParams.ctk && ctv === baseParams.ctv;
    if (alreadyMeasured) continue;
    const outcome = await testConfig(model, `kv=${ctk}/${ctv}`, { ...baseParams, ctk, ctv });
    if (!outcome) continue;
    if (!(champion.avg_tps > outcome.avg_tps)) champion = outcome;
  }

  log(`\n ★ Phase 4 winner: ${champion.label} → ${champion.avg_tps.toFixed(2)} t/s`);
  return champion;
}
|
||||||
|
|
||||||
|
/**
 * Phase 5: re-run the winning configuration five times as a final check.
 *
 * @param {object} model - Model entry being verified.
 * @param {object} prev - Winner of the earlier phases; its `params` are used.
 * @returns {Promise<object>} A new "FINAL-<model>" record (also appended to
 *   ALL_RESULTS) when at least one run succeeds; otherwise `prev` unchanged
 *   (server failed to start, or every run failed).
 */
async function phase5_final(model, prev) {
  const rule = "=".repeat(70);
  log(`\n${rule}`);
  log(` PHASE 5: Final Verification (5 runs) — ${model.name}`);
  log(`${rule}`);

  await killServer();
  const server = startServer(model.path, prev.params);
  const boot = await waitForServer();
  if (!boot.ok) {
    log(" FAILED to start!");
    server.kill("SIGKILL");
    return prev;
  }

  const vram = getVramAll();
  // Warmup: a failure here is ignored on purpose.
  try {
    await runBenchmark(20);
  } catch {
    /* ignored */
  }

  const samples = [];
  let run = 0;
  while (run < 5) {
    run += 1;
    try {
      const measurement = await runBenchmark();
      samples.push(measurement.tps);
      log(` Final Run ${run}: ${measurement.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Final Run ${run}: ERROR (${e.message})`);
    }
  }
  server.kill("SIGKILL");

  if (samples.length === 0) {
    return prev;
  }

  let total = 0;
  for (const tps of samples) total += tps;
  const avg = total / samples.length;
  const best = Math.max(...samples);
  log(`\n ★ FINAL: AVG ${avg.toFixed(2)} t/s | BEST ${best.toFixed(2)} t/s`);

  const finalRecord = {
    model: model.name,
    quant: model.quant,
    label: `FINAL-${model.name}`,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(best.toFixed(2)),
    boot_time: Number(boot.bootTime.toFixed(1)),
    vram,
    params: prev.params,
  };
  ALL_RESULTS.push(finalRecord);
  return finalRecord;
}
|
||||||
|
|
||||||
|
// ─── Main ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Run all phases for one model and return its best result.
 *
 * @param {object} model - Model entry (`name`, `path`, ...).
 * @returns {Promise<object|null>} null when the model file is missing or no
 *   boot configuration works; the Phase 0 baseline when Phase 1 yields
 *   nothing; otherwise the result handed back by the final phase.
 */
async function runModelBenchmark(model) {
  const rule = "#".repeat(70);
  log(`\n${rule}`);
  log(` MODEL: ${model.name}`);
  log(` File: ${model.path}`);

  // File size in GiB-based "GB"; "unknown" if the file cannot be stat'ed.
  let sizeLine;
  try {
    const gigabytes = statSync(model.path).size / 1024 ** 3;
    sizeLine = ` Size: ${gigabytes.toFixed(2)} GB`;
  } catch {
    sizeLine = ` Size: unknown`;
  }
  log(sizeLine);
  log(`${rule}`);

  if (!existsSync(model.path)) {
    log(` SKIP: Model file not found!`);
    return null;
  }

  const baseline = await phase0_bootTest(model);
  if (!baseline) {
    log(` SKIP: Cannot boot at 256K!`);
    return null;
  }

  const offloadWinner = await phase1_gpuOffload(model, baseline);
  if (!offloadWinner) return baseline;

  // Each later phase starts from the winner of the one before it.
  let champion = offloadWinner;
  for (const phase of [phase2_threads, phase3_batch, phase4_kvcache, phase5_final]) {
    champion = await phase(model, champion);
  }
  return champion;
}
|
||||||
|
|
||||||
|
/**
 * Benchmark every model in MODELS, rank the winners and write the reports.
 *
 * After each model the raw results are saved to
 * scripts/dual_gpu_results.json so partial data survives an aborted run. At
 * the end a ranking plus a recommended llama-server command for the fastest
 * model is printed and written to scripts/dual_gpu_summary.txt.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const startTime = Date.now();

  log("=".repeat(70));
  log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK");
  log(" 2x RTX 3060 (24GB Total) | 256K Context");
  log(` Models: ${MODELS.length}`);
  log(` Started: ${new Date().toISOString()}`);
  log("=".repeat(70));

  const gpus = getVramAll();
  gpus.forEach((g) => log(` GPU ${g.gpu}: ${g.used}/${g.total} MiB used`));

  const winners = [];

  for (let i = 0; i < MODELS.length; i++) {
    log(`\n${"=".repeat(70)}`);
    log(` STARTING MODEL ${i + 1}/${MODELS.length}: ${MODELS[i].name}`);
    log(`${"=".repeat(70)}`);

    const winner = await runModelBenchmark(MODELS[i]);
    if (winner) winners.push(winner);

    // Save intermediate
    writeFileSync("scripts/dual_gpu_results.json",
      JSON.stringify(ALL_RESULTS, null, 2));
    log(` Intermediate saved (${ALL_RESULTS.length} configs tested)`);
  }

  // ─── Grand Final ───────────────────────────────────────────
  const elapsed = (Date.now() - startTime) / 60000; // minutes

  log(`\n${"=".repeat(70)}`);
  log(` GRAND FINAL COMPARISON`);
  log(` Total time: ${elapsed.toFixed(1)} minutes`);
  log(` Configs tested: ${ALL_RESULTS.length}`);
  log(`${"=".repeat(70)}`);

  if (winners.length === 0) {
    log(" No models ran at 256K!");
    return;
  }

  // Fastest average first.
  winners.sort((a, b) => b.avg_tps - a.avg_tps);
  const medals = ["🥇", "🥈", "🥉", " "];

  const lines = [
    `Dual-GPU Benchmark Results — ${new Date().toISOString()}`,
    `Hardware: 2x RTX 3060 12GB | Context: 256K`,
    `Configs tested: ${ALL_RESULTS.length} | Time: ${elapsed.toFixed(1)} min`,
    "", "=".repeat(60), " RANKING (by AVG t/s)", "=".repeat(60),
  ];

  for (let i = 0; i < winners.length; i++) {
    const w = winners[i];
    const p = w.params;
    lines.push("");
    lines.push(` ${medals[i] || " "} #${i + 1}: ${w.model}`);
    lines.push(` AVG: ${w.avg_tps.toFixed(2)} t/s | BEST: ${w.best_tps.toFixed(2)} t/s`);
    lines.push(` Boot: ${w.boot_time.toFixed(0)}s`);
    lines.push(` ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b}`);
    lines.push(` ctk=${p.ctk} ctv=${p.ctv}`);
    if (p.cpuMoe) lines.push(` --cpu-moe`);
    else if ((p.nCpuMoe || 0) > 0) lines.push(` --n-cpu-moe ${p.nCpuMoe}`);
  }

  const champ = winners[0];
  const cp = champ.params;
  lines.push("", "=".repeat(60));
  lines.push(` ★ CHAMPION: ${champ.model}`);
  lines.push(` ${champ.avg_tps.toFixed(2)} t/s average`);
  lines.push("=".repeat(60));

  // Build recommended command
  const cmdParts = [
    `llama-server --model ${MODELS.find((m) => m.name === champ.model).path}`,
    `-ngl ${cp.ngl} -c ${CONTEXT}`,
    `-t ${cp.t} -tb ${cp.t}`,
    `-ub ${cp.ub} -b ${cp.b}`,
    `-fa on`,
    `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
    // `??` rather than `||`: a tuned prio of 0 must be printed as 0, matching
    // the default in buildCmd, which only applies when prio is undefined.
    `--prio ${cp.prio ?? 3} --poll 50`,
    `--mlock`,
  ];
  if (cp.cpuMoe) cmdParts.push("--cpu-moe");
  else if ((cp.nCpuMoe || 0) > 0) cmdParts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
  if (cp.nommap) cmdParts.push("--no-mmap");
  cmdParts.push("--port 8000 --host 0.0.0.0");

  lines.push("", " Recommended command:");
  lines.push(` ${cmdParts.join(" ")}`);

  const summary = lines.join("\n");
  console.log(summary);
  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json",
    JSON.stringify(ALL_RESULTS, null, 2));

  log(`\n Results: scripts/dual_gpu_results.json`);
  log(` Summary: scripts/dual_gpu_summary.txt`);
  log(` DONE!`);

  await killServer();
}
|
||||||
|
|
||||||
|
// Entry point: run the benchmark; if main() rejects, print the error and
// exit with status 1.
main().catch((e) => {
  console.error("Fatal error:", e);
  process.exit(1);
});
|
||||||
644
scripts/dual_gpu_benchmark.py
Normal file
644
scripts/dual_gpu_benchmark.py
Normal file
@@ -0,0 +1,644 @@
|
|||||||
|
"""
|
||||||
|
Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
|
||||||
|
==========================================================
|
||||||
|
Tests 4 models across multiple parameter configurations to find
|
||||||
|
the absolute best model + settings for 256K context coding agent.
|
||||||
|
|
||||||
|
Models:
|
||||||
|
1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB)
|
||||||
|
2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB)
|
||||||
|
3. Gemma4 26B-A4B Q4_K_M (~15.6 GB)
|
||||||
|
4. Gemma4 26B-A4B MXFP4_MOE (~15.5 GB)
|
||||||
|
|
||||||
|
Test Phases (per model):
|
||||||
|
Phase 0: Basic dual-GPU startup test (can it even boot at 256K?)
|
||||||
|
Phase 1: GPU layer + MoE offload strategy sweep
|
||||||
|
Phase 2: CPU thread sweep (carry best from P1)
|
||||||
|
Phase 3: Batch size sweep (carry best from P1+P2)
|
||||||
|
Phase 4: KV cache type sweep (carry best from P1+P2+P3)
|
||||||
|
Phase 5: Final verification (5 runs)
|
||||||
|
|
||||||
|
Output: scripts/dual_gpu_results.json (all raw data)
|
||||||
|
scripts/dual_gpu_summary.txt (human-readable winner)
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
import urllib.request
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
# Best effort: switch stdout to UTF-8; any failure to do so is ignored.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    pass

# ─── Configuration ───────────────────────────────────────────────
# Base URL used for the health probe and the chat-completion requests.
BASE_URL = "http://127.0.0.1:8000"
# Server executable, given relative to the working directory.
LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
CONTEXT = 262144  # 256K
BENCHMARK_RUNS = 3
# Default max_tokens for a timed request.
BENCHMARK_TOKENS = 200
SERVER_TIMEOUT = 300  # seconds to wait for server startup

# Models to benchmark, one dict per GGUF file.
MODELS = [
    {
        "name": "Qwen3.5-35B-A3B Q4_K_M",
        "path": r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf",
        "type": "qwen",
        "quant": "Q4_K_M",
        "is_mxfp4": False,
        "total_layers": 64,  # Qwen3.5 35B has 64 layers
    },
    {
        "name": "Qwen3.5-35B-A3B MXFP4_MOE",
        "path": r"models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf",
        "type": "qwen",
        "quant": "MXFP4_MOE",
        "is_mxfp4": True,
        "total_layers": 64,
    },
    {
        "name": "Gemma4 26B-A4B Q4_K_M",
        "path": r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf",
        "type": "gemma4",
        "quant": "Q4_K_M",
        "is_mxfp4": False,
        "total_layers": 30,  # Gemma4 26B has 30 layers
    },
    {
        "name": "Gemma4 26B-A4B MXFP4_MOE",
        "path": r"models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf",
        "type": "gemma4",
        "quant": "MXFP4_MOE",
        "is_mxfp4": True,
        "total_layers": 30,
    },
]

# Accumulates result records; per the module docstring the raw data is
# written to scripts/dual_gpu_results.json.
ALL_RESULTS = []
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Utility Functions ──────────────────────────────────────────
|
||||||
|
def log(msg):
    """Print ``msg`` prefixed with the local time as ``[HH:MM:SS]``.

    The output is flushed immediately.
    """
    now = datetime.datetime.now()
    stamp = f"{now:%H:%M:%S}"
    print("[{}] {}".format(stamp, msg), flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def kill_server():
    """Force-kill every llama-server.exe process, then pause for five seconds.

    The exit status and output of ``taskkill`` are captured and ignored.
    """
    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
    subprocess.run(taskkill_cmd, capture_output=True)
    settle_seconds = 5
    time.sleep(settle_seconds)
|
||||||
|
|
||||||
|
|
||||||
|
def get_vram_all():
    """Query nvidia-smi for the memory usage of every GPU.

    Returns:
        A list with one dict per GPU, ``{"gpu": index, "used": ..., "total":
        ...}``, holding the integers exactly as nvidia-smi prints them under
        ``nounits``. Lines that have fewer than three fields or contain a
        non-integer value (e.g. ``[N/A]``) are skipped. An empty list is
        returned if nvidia-smi cannot be run at all.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        gpus = []
        for line in r.stdout.strip().split("\n"):
            parts = [p.strip() for p in line.split(",")]
            if len(parts) < 3:
                continue
            try:
                index, used, total = (int(p) for p in parts[:3])
            except ValueError:
                # One unreadable GPU line must not discard the others.
                continue
            gpus.append({"gpu": index, "used": used, "total": total})
        return gpus
    except Exception:
        # nvidia-smi missing, timed out, etc.: report "no GPU data".
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def build_cmd(model_path, ngl, t, ub, b, ctk, ctv,
              cpu_moe=False, n_cpu_moe=0, prio=3, nommap=False):
    """Assemble the llama-server argument list for one configuration.

    Args:
        model_path: Path passed to ``--model``.
        ngl: Value for ``-ngl``.
        t: Thread count, used for both ``-t`` and ``-tb``.
        ub, b: Values for ``-ub`` and ``-b``.
        ctk, ctv: Values for ``--cache-type-k`` / ``--cache-type-v``.
        cpu_moe: Add ``--cpu-moe`` (takes precedence over ``n_cpu_moe``).
        n_cpu_moe: If positive and ``cpu_moe`` is false, add ``--n-cpu-moe N``.
        prio: Value for ``--prio``.
        nommap: Add ``--no-mmap``.

    Returns:
        The command as a list of strings, executable first.
    """
    threads = str(t)
    argv = [LLAMA_SERVER, "--model", model_path]
    argv += ["-ngl", str(ngl), "-c", str(CONTEXT), "-np", "1", "-fa", "on"]
    argv += ["--cache-type-k", ctk, "--cache-type-v", ctv]
    argv += ["-ub", str(ub), "-b", str(b)]
    argv += ["-t", threads, "-tb", threads]
    argv += ["--prio", str(prio), "--poll", "50", "--mlock"]
    argv += ["--port", "8000", "--host", "0.0.0.0"]

    # MoE offloading options
    if cpu_moe:
        moe_flags = ["--cpu-moe"]
    elif n_cpu_moe > 0:
        moe_flags = ["--n-cpu-moe", str(n_cpu_moe)]
    else:
        moe_flags = []
    argv += moe_flags

    if nommap:
        argv += ["--no-mmap"]
    return argv
|
||||||
|
|
||||||
|
|
||||||
|
def start_server(model_path, **kwargs):
    """Launch llama-server for ``model_path`` and return the process handle.

    Args:
        model_path: Model file passed through to ``build_cmd``.
        **kwargs: Tuning parameters forwarded unchanged to ``build_cmd``.

    Returns:
        The ``subprocess.Popen`` object of the started server.

    The server's stdout/stderr are discarded. Nothing in this script reads
    them, and leaving them attached to an unread pipe lets the child block
    once the OS pipe buffer fills up.
    """
    cmd = build_cmd(model_path, **kwargs)
    log(f" CMD: {' '.join(cmd[-20:])}")  # show last 20 args
    return subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        cwd=os.getcwd(),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_server(timeout=SERVER_TIMEOUT):
    """Poll the server's ``/health`` endpoint until it reports ready.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        ``(True, seconds_until_ready)`` once the endpoint answers with a
        JSON body whose ``status`` is ``"ok"``; ``(False, timeout)`` if that
        never happens within ``timeout`` seconds.
    """
    began = time.time()
    health_url = f"{BASE_URL}/health"
    while True:
        if not (time.time() - began < timeout):
            return False, timeout
        try:
            probe = urllib.request.Request(health_url)
            with urllib.request.urlopen(probe, timeout=3) as resp:
                status_doc = json.loads(resp.read())
                if status_doc.get("status") == "ok":
                    return True, time.time() - began
        except Exception:
            # Connection refused / not ready yet: just retry.
            pass
        time.sleep(3)
|
||||||
|
|
||||||
|
|
||||||
|
def run_benchmark(max_tokens=BENCHMARK_TOKENS):
    """Send one chat-completion request and measure generation speed.

    Args:
        max_tokens: ``max_tokens`` value placed in the request body.

    Returns:
        A dict with ``tps`` (completion tokens divided by the wall-clock
        time of the whole request, or 0 when no time elapsed),
        ``completion_tokens``, ``prompt_tokens`` and ``elapsed`` (seconds).
        Token counts come from the response's ``usage`` object and default
        to 0 when absent.
    """
    prompt = "Count from 1 to 50, writing each number on a new line."
    body = {
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    t0 = time.time()
    with urllib.request.urlopen(request, timeout=600) as resp:
        reply = json.loads(resp.read())
    seconds = time.time() - t0

    usage = reply.get("usage", {})
    generated = usage.get("completion_tokens", 0)
    prompted = usage.get("prompt_tokens", 0)
    if seconds > 0:
        rate = generated / seconds
    else:
        rate = 0
    return {
        "tps": rate,
        "completion_tokens": generated,
        "prompt_tokens": prompted,
        "elapsed": seconds,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def test_config(model_info, label, **kwargs):
|
||||||
|
"""Test a single configuration. Returns result dict or None."""
|
||||||
|
kill_server()
|
||||||
|
log(f" [{label}] Starting server...")
|
||||||
|
|
||||||
|
proc = start_server(model_info["path"], **kwargs)
|
||||||
|
ok, boot_time = wait_for_server()
|
||||||
|
|
||||||
|
if not ok:
|
||||||
|
log(f" [{label}] FAILED to start (timeout {SERVER_TIMEOUT}s)")
|
||||||
|
proc.kill()
|
||||||
|
return None
|
||||||
|
|
||||||
|
vram = get_vram_all()
|
||||||
|
vram_str = " | ".join(f"GPU{g['gpu']}:{g['used']}/{g['total']}MiB" for g in vram)
|
||||||
|
log(f" [{label}] Boot: {boot_time:.0f}s | VRAM: {vram_str}")
|
||||||
|
|
||||||
|
# Warmup
|
||||||
|
try:
|
||||||
|
run_benchmark(max_tokens=20)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Benchmark runs
|
||||||
|
speeds = []
|
||||||
|
for i in range(BENCHMARK_RUNS):
|
||||||
|
try:
|
||||||
|
r = run_benchmark()
|
||||||
|
speeds.append(r["tps"])
|
||||||
|
log(f" Run {i+1}: {r['tps']:.2f} t/s")
|
||||||
|
except Exception as e:
|
||||||
|
log(f" Run {i+1}: ERROR ({e})")
|
||||||
|
|
||||||
|
proc.kill()
|
||||||
|
|
||||||
|
if not speeds:
|
||||||
|
log(f" [{label}] ALL BENCHMARK RUNS FAILED")
|
||||||
|
return None
|
||||||
|
|
||||||
|
avg = sum(speeds) / len(speeds)
|
||||||
|
best = max(speeds)
|
||||||
|
log(f" [{label}] => AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"model": model_info["name"],
|
||||||
|
"quant": model_info["quant"],
|
||||||
|
"label": label,
|
||||||
|
"avg_tps": round(avg, 2),
|
||||||
|
"best_tps": round(best, 2),
|
||||||
|
"boot_time": round(boot_time, 1),
|
||||||
|
"vram": vram,
|
||||||
|
"params": kwargs,
|
||||||
|
}
|
||||||
|
ALL_RESULTS.append(result)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Phase Runners ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
def phase0_boot_test(model):
    """Phase 0: find any configuration with which the model boots.

    Tries, in order: all layers on GPU (``ngl=999``), the same with
    ``cpu_moe=True``, and finally half of ``model["total_layers"]`` on GPU.

    Returns:
        The first successful ``test_config`` result, or the (possibly
        ``None``) result of the last attempt.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 0: Boot Test — {model['name']}")
    log(rule)

    shared = dict(t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0")

    # Baseline: everything on the GPUs.
    outcome = test_config(model, "boot-ngl999", ngl=999, **shared)
    if outcome:
        return outcome

    # Fallback 1: keep the layers on GPU but offload MoE experts to CPU.
    log(" Full GPU failed, trying with --cpu-moe...")
    outcome = test_config(model, "boot-cpumoe", ngl=999, **shared, cpu_moe=True)
    if outcome:
        return outcome

    # Fallback 2: only half of the layers on GPU.
    log(" --cpu-moe also failed, trying reduced layers...")
    return test_config(
        model, "boot-ngl-half", ngl=model["total_layers"] // 2, **shared
    )
|
||||||
|
|
||||||
|
|
||||||
|
def phase1_gpu_offload(model, baseline):
    """Phase 1: compare MoE offload strategies with all layers on GPU.

    Args:
        model: Model dict; ``name`` and ``total_layers`` are read here.
        baseline: Result of phase 0 (may be falsy). It takes part in the
            comparison, and its configuration is not re-run.

    Returns:
        The result with the highest ``avg_tps``, or ``None`` if nothing
        worked.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 1: GPU Offload Strategy — {model['name']}")
    log(f"{'='*70}")

    results = []
    if baseline:
        results.append(baseline)

    total = model["total_layers"]

    # Strategy A: All GPU + cpu-moe variations
    for cpu_moe in [True, False]:
        label = f"ngl=999 cpu_moe={cpu_moe}"
        # Skip if already tested in baseline
        if baseline and baseline["label"] in ["boot-ngl999", "boot-cpumoe"] and \
                baseline["params"].get("cpu_moe", False) == cpu_moe:
            continue
        r = test_config(
            model, label,
            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            cpu_moe=cpu_moe,
        )
        if r:
            results.append(r)

    # Strategy B: n-cpu-moe sweep (selective expert offload).
    # n=0 is deliberately not part of the sweep: with n_cpu_moe=0 and
    # cpu_moe=False the server command carries no MoE flag at all, i.e. it
    # is the very same pure-GPU configuration already covered by the
    # baseline / Strategy A, so running it again only costs time.
    for n in [5, 10, 15, 20]:
        if n > total:
            continue
        r = test_config(
            model, f"n-cpu-moe={n}",
            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            n_cpu_moe=n,
        )
        if r:
            results.append(r)

    if not results:
        log(" PHASE 1: No configuration worked!")
        return None

    best = max(results, key=lambda x: x["avg_tps"])
    log(f"\n ★ Phase 1 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
|
||||||
|
|
||||||
|
|
||||||
|
def phase2_threads(model, prev_best):
    """Phase 2: vary the thread count while keeping every other parameter.

    Args:
        model: Model dict (``name`` is used for logging).
        prev_best: Winning result of the previous phase; its ``params``
            supply the fixed settings.

    Returns:
        The result with the highest ``avg_tps`` among ``prev_best`` and all
        successful thread variants.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 2: CPU Thread Sweep — {model['name']}")
    log(rule)

    locked = prev_best["params"]
    candidates = [prev_best]

    for threads in (2, 4, 6, 8, 10, 12):
        # The thread count of the incoming winner has been measured already.
        if threads != locked.get("t", 6):
            outcome = test_config(
                model, f"t={threads}",
                ngl=locked["ngl"], t=threads, ub=locked["ub"], b=locked["b"],
                ctk=locked["ctk"], ctv=locked["ctv"],
                cpu_moe=locked.get("cpu_moe", False),
                n_cpu_moe=locked.get("n_cpu_moe", 0),
            )
            if outcome:
                candidates.append(outcome)

    winner = max(candidates, key=lambda c: c["avg_tps"])
    log(f"\n ★ Phase 2 winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
|
||||||
|
|
||||||
|
|
||||||
|
def phase3_batch(model, prev_best):
    """Phase 3: vary the (``ub``, ``b``) batch-size pair.

    Args:
        model: Model dict (``name`` is used for logging).
        prev_best: Winning result of the previous phase; its ``params``
            supply the fixed settings, including the thread count.

    Returns:
        The result with the highest ``avg_tps`` among ``prev_best`` and all
        successful batch variants.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 3: Batch Size Sweep — {model['name']}")
    log(rule)

    locked = prev_best["params"]
    best_t = locked["t"]
    candidates = [prev_best]

    batch_grid = [
        (128, 512), (256, 1024), (256, 2048),
        (512, 1024), (512, 2048), (512, 4096),
        (1024, 2048), (1024, 4096),
    ]
    for micro, logical in batch_grid:
        # The incoming winner's own pair does not need a second run.
        if (micro, logical) == (locked["ub"], locked["b"]):
            continue
        outcome = test_config(
            model, f"ub={micro} b={logical}",
            ngl=locked["ngl"], t=best_t, ub=micro, b=logical,
            ctk=locked["ctk"], ctv=locked["ctv"],
            cpu_moe=locked.get("cpu_moe", False),
            n_cpu_moe=locked.get("n_cpu_moe", 0),
        )
        if outcome:
            candidates.append(outcome)

    winner = max(candidates, key=lambda c: c["avg_tps"])
    log(f"\n ★ Phase 3 winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
|
||||||
|
|
||||||
|
|
||||||
|
def phase4_kvcache(model, prev_best):
    """Phase 4: vary the KV-cache types (``ctk`` / ``ctv``).

    Args:
        model: Model dict (``name`` is used for logging).
        prev_best: Winning result of the previous phase; its ``params``
            supply the fixed settings.

    Returns:
        The result with the highest ``avg_tps`` among ``prev_best`` and all
        successful cache-type variants.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 4: KV Cache Type Sweep — {model['name']}")
    log(rule)

    locked = prev_best["params"]
    candidates = [prev_best]

    cache_grid = [("q4_0", "q4_0"), ("q8_0", "q8_0"),
                  ("q4_0", "q8_0"), ("f16", "f16")]
    for key_type, value_type in cache_grid:
        # The incoming winner's own combination was measured already.
        if (key_type, value_type) == (locked["ctk"], locked["ctv"]):
            continue
        outcome = test_config(
            model, f"kv={key_type}/{value_type}",
            ngl=locked["ngl"], t=locked["t"], ub=locked["ub"], b=locked["b"],
            ctk=key_type, ctv=value_type,
            cpu_moe=locked.get("cpu_moe", False),
            n_cpu_moe=locked.get("n_cpu_moe", 0),
        )
        if outcome:
            candidates.append(outcome)

    winner = max(candidates, key=lambda c: c["avg_tps"])
    log(f"\n ★ Phase 4 winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
|
||||||
|
|
||||||
|
|
||||||
|
def phase5_final(model, prev_best):
    """Phase 5: re-measure the winning configuration with five runs.

    Args:
        model: Model dict; ``name``, ``path`` and ``quant`` are read.
        prev_best: Winning result of the previous phases.

    Returns:
        A new result dict labelled ``FINAL-<model name>`` (also appended to
        ``ALL_RESULTS``), or ``prev_best`` unchanged when the server does
        not start or every run fails.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 5: Final Verification (5 runs) — {model['name']}")
    log(rule)

    params = prev_best["params"]
    kill_server()
    server = start_server(model["path"], **params)
    ready, boot_time = wait_for_server()
    if not ready:
        log(" FAILED to start for final verification!")
        server.kill()
        return prev_best

    vram = get_vram_all()

    # Warm-up request; its outcome is irrelevant.
    try:
        run_benchmark(max_tokens=20)
    except Exception:
        pass

    speeds = []
    for attempt in range(1, 6):
        try:
            sample = run_benchmark()
            speeds.append(sample["tps"])
            log(f" Final Run {attempt}: {sample['tps']:.2f} t/s")
        except Exception as e:
            log(f" Final Run {attempt}: ERROR ({e})")

    server.kill()

    if not speeds:
        return prev_best

    avg = sum(speeds) / len(speeds)
    best_tps = max(speeds)
    log(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best_tps:.2f} t/s")

    final = {
        "model": model["name"],
        "quant": model["quant"],
        "label": f"FINAL-{model['name']}",
        "avg_tps": round(avg, 2),
        "best_tps": round(best_tps, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": params,
    }
    ALL_RESULTS.append(final)
    return final
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Main ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def run_full_benchmark_for_model(model):
    """Run all tuning phases for a single model.

    Args:
        model: Model dict; ``name`` and ``path`` are read here, the rest is
            consumed by the individual phases.

    Returns:
        The final result dict, the phase-0 baseline if phase 1 found
        nothing, or ``None`` when the model file is missing or the model
        cannot boot.
    """
    # Check the file first: asking for the size of a missing file would
    # raise and abort the whole benchmark instead of skipping this model.
    model_exists = os.path.exists(model["path"])

    log(f"\n{'#'*70}")
    log(f" MODEL: {model['name']}")
    log(f" File: {model['path']}")
    if model_exists:
        log(f" Size: {os.path.getsize(model['path'])/1024**3:.2f} GB")
    log(f"{'#'*70}")

    if not model_exists:
        log(f" SKIP: Model file not found!")
        return None

    # Phase 0: Can it boot?
    baseline = phase0_boot_test(model)
    if not baseline:
        log(f" SKIP: {model['name']} cannot boot at 256K context!")
        return None

    # Phase 1: GPU offload strategy
    best = phase1_gpu_offload(model, baseline)
    if not best:
        return baseline

    # Phase 2: CPU threads
    best = phase2_threads(model, best)

    # Phase 3: Batch sizes
    best = phase3_batch(model, best)

    # Phase 4: KV cache
    best = phase4_kvcache(model, best)

    # Phase 5: Final verification
    final = phase5_final(model, best)

    return final
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Benchmark every entry of ``MODELS`` and write the comparison report.

    After each model the accumulated ``ALL_RESULTS`` are written to
    ``scripts/dual_gpu_results.json``. At the end the per-model winners are
    ranked by ``avg_tps``, a text summary with a recommended command line
    is printed and written to ``scripts/dual_gpu_summary.txt``, and the
    server is killed.
    """
    start_time = time.time()
    rule = "=" * 70

    log(rule)
    log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK")
    log(" 2x RTX 3060 (24GB Total) | 256K Context")
    log(f" Models: {len(MODELS)}")
    log(f" Started: {datetime.datetime.now().isoformat()}")
    log(rule)

    # Show GPU info
    for g in get_vram_all():
        log(f" GPU {g['gpu']}: {g['used']}/{g['total']} MiB used")

    # Run benchmarks for each model
    model_winners = []
    for position, model in enumerate(MODELS, start=1):
        log(f"\n{rule}")
        log(f" STARTING MODEL {position}/{len(MODELS)}: {model['name']}")
        log(rule)

        winner = run_full_benchmark_for_model(model)
        if winner:
            model_winners.append(winner)

        # Save intermediate results so a later crash loses nothing measured.
        with open("scripts/dual_gpu_results.json", "w") as f:
            json.dump(ALL_RESULTS, f, indent=2, default=str)
        log(f" Intermediate results saved ({len(ALL_RESULTS)} configs tested)")

    # ─── Grand Final Comparison ──────────────────────────────────
    elapsed = (time.time() - start_time) / 60

    log(f"\n{rule}")
    log(f" GRAND FINAL COMPARISON")
    log(f" Total time: {elapsed:.1f} minutes")
    log(f" Configs tested: {len(ALL_RESULTS)}")
    log(rule)

    if not model_winners:
        log(" No models were able to run at 256K context!")
        return

    # Fastest average first.
    model_winners.sort(key=lambda x: x["avg_tps"], reverse=True)

    narrow_rule = "=" * 60
    summary_lines = [
        f"Dual-GPU Benchmark Results — {datetime.datetime.now().isoformat()}",
        f"Hardware: 2x RTX 3060 12GB | Context: 256K",
        f"Total configs tested: {len(ALL_RESULTS)}",
        f"Total time: {elapsed:.1f} minutes",
        "",
        narrow_rule,
        " RANKING (by AVG t/s)",
        narrow_rule,
    ]

    medals = {1: "🥇", 2: "🥈", 3: "🥉", 4: " "}
    for rank, w in enumerate(model_winners, 1):
        medal = medals.get(rank, " ")
        p = w["params"]
        summary_lines.extend([
            f"\n {medal} #{rank}: {w['model']}",
            f" AVG: {w['avg_tps']:.2f} t/s | BEST: {w['best_tps']:.2f} t/s",
            f" Boot: {w['boot_time']:.0f}s",
            f" ngl={p['ngl']} t={p['t']} ub={p['ub']} b={p['b']}",
            f" ctk={p['ctk']} ctv={p['ctv']}",
        ])
        if p.get("cpu_moe"):
            summary_lines.append(f" --cpu-moe")
        elif p.get("n_cpu_moe", 0) > 0:
            summary_lines.append(f" --n-cpu-moe {p['n_cpu_moe']}")

    champion = model_winners[0]
    summary_lines.extend([
        f"\n{narrow_rule}",
        f" ★ CHAMPION: {champion['model']}",
        f" {champion['avg_tps']:.2f} t/s average",
        narrow_rule,
    ])

    # Build recommended command
    p = champion["params"]
    model_names = [m['name'] for m in MODELS]
    champion_path = MODELS[model_names.index(champion['model'])]['path']
    cmd_parts = [
        f"llama-server --model {champion_path}",
        f"-ngl {p['ngl']} -c {CONTEXT}",
        f"-t {p['t']} -tb {p['t']}",
        f"-ub {p['ub']} -b {p['b']}",
        "-fa on",
        f"--cache-type-k {p['ctk']} --cache-type-v {p['ctv']}",
        f"--prio {p.get('prio', 3)} --poll 50",
        "--mlock",
    ]
    if p.get("cpu_moe"):
        cmd_parts.append("--cpu-moe")
    elif p.get("n_cpu_moe", 0) > 0:
        cmd_parts.append(f"--n-cpu-moe {p['n_cpu_moe']}")
    if p.get("nommap"):
        cmd_parts.append("--no-mmap")
    cmd_parts.append("--port 8000 --host 0.0.0.0")

    summary_lines.append(f"\n Recommended command:")
    summary_lines.append(f" {' '.join(cmd_parts)}")

    summary = "\n".join(summary_lines)
    print(summary)

    with open("scripts/dual_gpu_summary.txt", "w", encoding="utf-8") as f:
        f.write(summary)

    with open("scripts/dual_gpu_results.json", "w") as f:
        json.dump(ALL_RESULTS, f, indent=2, default=str)

    log(f"\n Results: scripts/dual_gpu_results.json")
    log(f" Summary: scripts/dual_gpu_summary.txt")
    log(f" DONE!")

    kill_server()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
330
scripts/dual_gpu_benchmark_v2.mjs
Normal file
330
scripts/dual_gpu_benchmark_v2.mjs
Normal file
@@ -0,0 +1,330 @@
|
|||||||
|
/**
|
||||||
|
 * Dual-GPU (2x RTX 3060 12GB, 24GB total VRAM) Smart Model Benchmark v2
|
||||||
|
* =====================================================
|
||||||
|
* Informed by VRAM analysis — tests models in optimal order.
|
||||||
|
*
|
||||||
|
* Key insights applied:
|
||||||
|
* - Gemma4 fits entirely in 24GB GPU (KV cache ~0.18 GB with SWA)
|
||||||
|
* - Qwen3.5 is tight (~22.5-22.9 GB needed) — try full GPU first
|
||||||
|
* - Skip configs known to fail, minimize wasted time
|
||||||
|
*
|
||||||
|
* Run: node scripts/dual_gpu_benchmark_v2.mjs
|
||||||
|
* Results: scripts/dual_gpu_results.json + scripts/dual_gpu_summary.txt
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { spawn, execSync } from "child_process";
|
||||||
|
import { writeFileSync, existsSync, statSync } from "fs";
|
||||||
|
|
||||||
|
// Base URL used for the /health and /v1/chat/completions requests.
const BASE_URL = "http://127.0.0.1:8000";
// Server executable, relative to the working directory (Windows path).
const LLAMA = "llama_bin_run\\llama-server.exe";
// Context size passed to the server via -c (256K tokens).
const CTX = 256 * 1024;
// Measured requests per configuration.
const RUNS = 3;
// Default max_tokens for a measured request.
const TOKENS = 200;
// Maximum time to wait for the server to report healthy, in milliseconds.
const BOOT_TIMEOUT = 5 * 60 * 1000;
|
||||||
|
|
||||||
|
// Models ordered: smallest first (most likely to succeed fully on GPU)
|
||||||
|
// Each entry: display name, GGUF path (relative, Windows separators),
// quantization label and the author's VRAM-fit estimate
// (weights + KV cache + overhead, in GB) that decides the first attempt.
const MODELS = [
  {
    name: "Gemma4-26B MXFP4_MOE",
    path: "models\\gemma-4-26B-A4B-it-MXFP4_MOE.gguf",
    quant: "MXFP4_MOE",
    // 15.5 + 0.18 + 1 = 16.72 GB << 23 GB
    fitsGPU: true,
  },
  {
    name: "Gemma4-26B Q4_K_M",
    path: "models\\gemma-4-26B-A4B-it-Q4_K_M.gguf",
    quant: "Q4_K_M",
    // 15.6 + 0.18 + 1 = 16.82 GB << 23 GB
    fitsGPU: true,
  },
  {
    name: "Qwen3.5-35B MXFP4_MOE",
    path: "models\\Qwen3.5-35B-A3B-MXFP4_MOE.gguf",
    quant: "MXFP4_MOE",
    // 20.1 + 1.41 + 1 = 22.51 GB — tight
    fitsGPU: "maybe",
  },
  {
    name: "Qwen3.5-35B Q4_K_M",
    path: "models\\Qwen3.5-35B-A3B-Q4_K_M.gguf",
    quant: "Q4_K_M",
    // 20.5 + 1.41 + 1 = 22.91 GB — very tight
    fitsGPU: "maybe",
  },
];
|
||||||
|
|
||||||
|
// Every result produced so far, in measurement order.
const ALL = [];
// Handle of the server process started by this script, or null.
let currentProc = null;

// ─── Utilities ─────────────────────────────────────────────────
/** Print `m` to the console, prefixed with the current 24-hour wall-clock time. */
const log = (m) => {
  const stamp = new Date().toLocaleTimeString("ko-KR", { hour12: false });
  console.log(`[${stamp}] ${m}`);
};

/** Resolve after `ms` milliseconds. */
const sleep = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
|
||||||
|
|
||||||
|
/**
 * Stop the tracked server process and any other llama-server.exe, then
 * wait five seconds.
 *
 * Both kill attempts are best-effort: failures (for example, nothing to
 * kill) are ignored on purpose.
 */
async function kill() {
  const tracked = currentProc;
  if (tracked) {
    currentProc = null;
    try {
      tracked.kill("SIGKILL");
    } catch {
      // already gone
    }
  }

  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // no matching process (or taskkill unavailable)
  }

  await sleep(5000);
}
|
||||||
|
|
||||||
|
/**
 * Query per-GPU memory usage through `nvidia-smi`.
 *
 * @returns {{gpu: number, used: number, total: number}[]} One entry per
 *   output row that contains three integers (values as printed with
 *   `nounits`; callers in this file label them MiB). Empty when
 *   `nvidia-smi` cannot be run, times out, or prints nothing parsable.
 */
function vram() {
  let csv;
  try {
    csv = execSync(
      "nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits",
      { encoding: "utf-8", timeout: 5000 },
    );
  } catch {
    // Best effort: VRAM figures are informational only.
    return [];
  }

  const rows = [];
  for (const line of csv.split(/\r?\n/)) {
    const [gpu, used, total] = line.split(",").map((s) => Number.parseInt(s, 10));
    // Blank lines and non-numeric fields (e.g. "[N/A]") would otherwise
    // show up as entries full of NaN.
    if ([gpu, used, total].every(Number.isInteger)) {
      rows.push({ gpu, used, total });
    }
  }
  return rows;
}
|
||||||
|
|
||||||
|
/**
 * Spawn llama-server for one configuration and remember it in `currentProc`.
 *
 * @param {string} modelPath Value passed to `--model`.
 * @param {object} p Tuning parameters: `ngl`, `ctk`, `ctv`, `ub`, `b`, `t`
 *   (used for both `-t` and `-tb`), optional `prio` (default 3), `cpuMoe`,
 *   `nCpuMoe` and `nommap`.
 * @returns {import("child_process").ChildProcess} The spawned process.
 */
function startServer(modelPath, p) {
  const args = [
    "--model", modelPath, "-ngl", String(p.ngl),
    "-c", String(CTX), "-np", "1", "-fa", "on",
    "--cache-type-k", p.ctk, "--cache-type-v", p.ctv,
    "-ub", String(p.ub), "-b", String(p.b),
    "-t", String(p.t), "-tb", String(p.t),
    // `??` rather than `||` so an explicit priority of 0 is respected.
    "--prio", String(p.prio ?? 3), "--poll", "50", "--mlock",
    "--port", "8000", "--host", "0.0.0.0",
  ];
  if (p.cpuMoe) args.push("--cpu-moe");
  else if ((p.nCpuMoe || 0) > 0) args.push("--n-cpu-moe", String(p.nCpuMoe));
  if (p.nommap) args.push("--no-mmap");

  const child = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });

  // Without a listener a failed spawn (e.g. missing executable) is an
  // unhandled 'error' event and would take the whole benchmark down.
  child.on("error", (err) => log(` llama-server spawn error: ${err.message}`));

  // Nobody reads the server output; keep the pipes flowing so the child
  // can never block on a full pipe buffer.
  child.stdout?.resume();
  child.stderr?.resume();

  currentProc = child;
  return child;
}
|
||||||
|
|
||||||
|
/**
 * Poll the server's /health endpoint until it reports ready.
 *
 * @param {number} [timeout=BOOT_TIMEOUT] Maximum wait in milliseconds.
 * @returns {Promise<{ok: boolean, boot: number}>} `ok` tells whether the
 *   endpoint answered with `status: "ok"`; `boot` is the elapsed time in
 *   seconds (the full timeout, in seconds, when it expired).
 */
async function waitReady(timeout = BOOT_TIMEOUT) {
  const t0 = Date.now();
  while (Date.now() - t0 < timeout) {
    // A server that has already exited (for example, it ran out of memory
    // while loading) will never become healthy; stop polling right away
    // instead of burning the whole timeout.
    if (currentProc && (currentProc.exitCode != null || currentProc.signalCode != null)) {
      return { ok: false, boot: (Date.now() - t0) / 1000 };
    }
    try {
      const r = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const d = await r.json();
      if (d.status === "ok") return { ok: true, boot: (Date.now() - t0) / 1000 };
    } catch {
      // not listening yet / still loading: retry
    }
    await sleep(3000);
  }
  return { ok: false, boot: timeout / 1000 };
}
|
||||||
|
|
||||||
|
/**
 * Send one chat-completion request and measure generation speed.
 *
 * @param {number} [n=TOKENS] `max_tokens` for the request.
 * @returns {Promise<{tps: number, ct: number, dt: number}>} `ct` is the
 *   reported completion-token count (0 when absent), `dt` the wall-clock
 *   duration of the whole request in seconds, `tps` their ratio (0 when no
 *   time elapsed).
 * @throws {Error} When the server answers with a non-2xx status, so the
 *   caller records a failed run instead of a bogus 0 t/s sample.
 */
async function bench(n = TOKENS) {
  const t0 = Date.now();
  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "m",
      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
      max_tokens: n, temperature: 0,
    }),
    signal: AbortSignal.timeout(600_000),
  });
  if (!r.ok) {
    throw new Error(`benchmark request failed: HTTP ${r.status}`);
  }
  const d = await r.json();
  const dt = (Date.now() - t0) / 1000;
  const ct = d.usage?.completion_tokens ?? 0;
  return { tps: dt > 0 ? ct / dt : 0, ct, dt };
}
|
||||||
|
|
||||||
|
/**
 * Benchmark one server configuration.
 *
 * Restarts the server with `params`, waits until it is healthy, sends one
 * warm-up request and then `RUNS` measured requests.
 *
 * @param {{name: string, quant: string, path: string}} model Model entry.
 * @param {string} label Short name used in log lines and in the result.
 * @param {object} params Parameters handed to `startServer`.
 * @returns {Promise<object|null>} The result record (also pushed to `ALL`),
 *   or null when the server did not come up or every run failed.
 */
async function testConfig(model, label, params) {
  await kill();
  log(` [${label}] Starting...`);
  startServer(model.path, params);

  const ready = await waitReady();
  if (!ready.ok) {
    log(` [${label}] ✗ FAILED (timeout)`);
    await kill();
    return null;
  }
  const bootSeconds = ready.boot;

  const gpuMemory = vram();
  const memoryText = gpuMemory.map((g) => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | ");
  log(` [${label}] Boot:${bootSeconds.toFixed(0)}s | VRAM: ${memoryText}`);

  // Warm-up request; neither its result nor a failure matters.
  try {
    await bench(20);
  } catch {
    // ignored on purpose
  }

  const speeds = [];
  let run = 0;
  while (run < RUNS) {
    run += 1;
    try {
      const sample = await bench();
      speeds.push(sample.tps);
      log(` Run${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run${run}: ERR ${e.message}`);
    }
  }
  await kill();

  if (speeds.length === 0) {
    log(` [${label}] ✗ ALL RUNS FAILED`);
    return null;
  }

  let total = 0;
  for (const s of speeds) total += s;
  const avg = total / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);

  const record = {
    model: model.name,
    quant: model.quant,
    label,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(best.toFixed(2)),
    boot: Number(bootSeconds.toFixed(1)),
    vram: gpuMemory,
    params,
  };
  ALL.push(record);
  return record;
}
|
||||||
|
|
||||||
|
// Save intermediate results after each test
|
||||||
|
/** Overwrite scripts/dual_gpu_results.json with everything in `ALL` so far. */
function saveIntermediate() {
  const snapshot = JSON.stringify(ALL, null, 2);
  writeFileSync("scripts/dual_gpu_results.json", snapshot);
}
|
||||||
|
|
||||||
|
// ─── Smart Phase Runner ────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Tune one model: find a bootable offload setting, sweep threads, batch
 * sizes and KV-cache types (always continuing from the best result so far),
 * then re-measure the winner with five runs.
 *
 * Results are written to disk after every measured configuration.
 *
 * @param {{name: string, quant: string, path: string, fitsGPU: (boolean|string)}} model
 * @returns {Promise<object|null>} The FINAL record, the best sweep record
 *   when the final verification could not run, or null when the file is
 *   missing or no configuration boots.
 */
async function tuneModel(model) {
  const hashes = "#".repeat(65);
  log(`\n${hashes}`);
  log(` ${model.name} (${model.quant})`);
  if (!existsSync(model.path)) {
    log(" ✗ File not found, SKIP");
    return null;
  }
  const sizeGb = (statSync(model.path).size / 1024 ** 3).toFixed(2);
  log(` Size: ${sizeGb} GB | Fits GPU: ${model.fitsGPU}`);
  log(hashes);

  // Measure one configuration and persist the accumulated results.
  const measure = async (label, params) => {
    const outcome = await testConfig(model, label, params);
    saveIntermediate();
    return outcome;
  };

  const seed = { ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" };

  // ── Step 1: Find working GPU config ──
  log(`\n ── Step 1: Find optimal GPU offload ──`);
  let baseline = null;

  // First choice: everything on the GPUs, no CPU offload.
  if (model.fitsGPU === true || model.fitsGPU === "maybe") {
    baseline = await measure("ngl=999 pure-GPU", { ...seed });
  }

  // Next: ascending n-cpu-moe, stopping at the smallest value that boots.
  if (!baseline) {
    for (const n of [5, 10, 15, 20]) {
      baseline = await measure(`n-cpu-moe=${n}`, { ...seed, nCpuMoe: n });
      if (baseline) break;
    }
  }

  // Last resort: full cpu-moe.
  if (!baseline) {
    baseline = await measure("cpu-moe", { ...seed, cpuMoe: true });
  }

  if (!baseline) {
    log(` ✗ ${model.name} cannot boot at 256K!`);
    return null;
  }

  // If pure GPU worked, also measure cpu-moe and keep whichever is faster.
  const bootParams = baseline.params;
  if (!bootParams.cpuMoe && !bootParams.nCpuMoe) {
    const alt = await measure("compare: cpu-moe", { ...bootParams, cpuMoe: true });
    if (alt && alt.avg_tps > baseline.avg_tps) baseline = alt;
  }

  let best = baseline;

  // Try the current best with some parameters overridden; adopt it when
  // it is strictly faster.
  const challenge = async (label, overrides) => {
    const r = await measure(label, { ...best.params, ...overrides });
    if (r && r.avg_tps > best.avg_tps) best = r;
  };

  // ── Step 2: Thread sweep ──
  log(`\n ── Step 2: Thread sweep ──`);
  for (const t of [2, 4, 8, 10, 12]) {
    if (t !== best.params.t) await challenge(`t=${t}`, { t });
  }

  // ── Step 3: Batch sweep ──
  log(`\n ── Step 3: Batch sweep ──`);
  const batchGrid = [[256, 1024], [256, 2048], [512, 2048], [512, 4096], [1024, 2048], [1024, 4096]];
  for (const [ub, b] of batchGrid) {
    const alreadyBest = ub === best.params.ub && b === best.params.b;
    if (!alreadyBest) await challenge(`ub=${ub} b=${b}`, { ub, b });
  }

  // ── Step 4: KV cache sweep ──
  log(`\n ── Step 4: KV cache type ──`);
  const cacheGrid = [["q8_0", "q8_0"], ["q4_0", "q8_0"], ["f16", "f16"]];
  for (const [ctk, ctv] of cacheGrid) {
    const alreadyBest = ctk === best.params.ctk && ctv === best.params.ctv;
    if (!alreadyBest) await challenge(`kv=${ctk}/${ctv}`, { ctk, ctv });
  }

  // ── Step 5: Final verification (5 runs) ──
  log(`\n ── Step 5: Final verification ──`);
  await kill();
  startServer(model.path, best.params);
  const ready = await waitReady();
  if (!ready.ok) {
    await kill();
    return best;
  }
  const gpuMemory = vram();
  try {
    await bench(20);
  } catch {
    // warm-up failures are irrelevant
  }

  const finals = [];
  for (let run = 1; run <= 5; run += 1) {
    try {
      const sample = await bench();
      finals.push(sample.tps);
      log(` Final ${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch {
      log(` Final ${run}: ERR`);
    }
  }
  await kill();

  if (finals.length === 0) return best;

  let total = 0;
  for (const s of finals) total += s;
  const avg = total / finals.length;
  const peak = Math.max(...finals);
  log(` ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${peak.toFixed(2)} t/s`);

  const final = {
    model: model.name,
    quant: model.quant,
    label: `FINAL`,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(peak.toFixed(2)),
    boot: Number(ready.boot.toFixed(1)),
    vram: gpuMemory,
    params: best.params,
  };
  ALL.push(final);
  saveIntermediate();
  return final;
}
|
||||||
|
|
||||||
|
// ─── Main ──────────────────────────────────────────────────────
|
||||||
|
/**
 * Tune every entry of `MODELS`, rank the per-model winners by average
 * tokens/second, print the summary with a recommended command line, and
 * write scripts/dual_gpu_summary.txt and scripts/dual_gpu_results.json.
 */
async function main() {
  const t0 = Date.now();
  const wideRule = "=".repeat(65);

  log(wideRule);
  log(" DUAL-GPU BENCHMARK v2 — Smart Strategy");
  log(" 2x RTX 3060 (24GB) | 256K Context");
  log(" " + new Date().toISOString());
  log(wideRule);
  for (const g of vram()) {
    log(` GPU${g.gpu}: ${g.used}/${g.total} MiB`);
  }

  const winners = [];
  for (const [index, model] of MODELS.entries()) {
    log(`\n${wideRule}`);
    log(` MODEL ${index + 1}/${MODELS.length}: ${model.name}`);
    log(wideRule);
    const w = await tuneModel(model);
    if (w) winners.push(w);
    saveIntermediate();
  }

  // ─── Summary ──────────────────────────────────────────────
  const elapsed = ((Date.now() - t0) / 60000).toFixed(1);
  winners.sort((a, b) => b.avg_tps - a.avg_tps); // fastest first
  const medals = ["🥇", "🥈", "🥉", " "];
  const rule = "=".repeat(55);

  const lines = [];
  lines.push(`Dual-GPU Benchmark v2 — ${new Date().toISOString()}`);
  lines.push(`2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`);
  lines.push("", rule, " RANKING", rule);

  winners.forEach((w, i) => {
    const p = w.params;
    lines.push("", ` ${medals[i] || " "} #${i + 1}: ${w.model}`);
    lines.push(` AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`);
    lines.push(` ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b} ctk=${p.ctk} ctv=${p.ctv}`);
    if (p.cpuMoe) {
      lines.push(` --cpu-moe`);
    } else if (p.nCpuMoe) {
      lines.push(` --n-cpu-moe ${p.nCpuMoe}`);
    }
  });

  if (winners.length > 0) {
    const champion = winners[0];
    const cp = champion.params;
    lines.push("", rule, ` ★ CHAMPION: ${champion.model} — ${champion.avg_tps} t/s`, rule);

    let moeFlag = "";
    if (cp.cpuMoe) {
      moeFlag = "--cpu-moe";
    } else if (cp.nCpuMoe) {
      moeFlag = `--n-cpu-moe ${cp.nCpuMoe}`;
    }
    const championPath = MODELS.find((m) => m.name === champion.model).path;
    const cmd = [
      `llama-server --model ${championPath}`,
      `-ngl ${cp.ngl} -c ${CTX} -t ${cp.t} -tb ${cp.t}`,
      `-ub ${cp.ub} -b ${cp.b} -fa on`,
      `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
      `--prio ${cp.prio || 3} --poll 50 --mlock`,
      moeFlag,
      "--port 8000 --host 0.0.0.0",
    ]
      .filter(Boolean)
      .join(" ");
    lines.push("", " Recommended:", ` ${cmd}`);
  }

  const summary = lines.join("\n");
  console.log("\n" + summary);
  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL, null, 2));
  log(`\n Saved: dual_gpu_results.json + dual_gpu_summary.txt`);
  log(" DONE!");
  await kill();
}
|
||||||
|
|
||||||
|
// Script entry point: run main() and turn any rejection into a logged
// "FATAL:" message followed by a non-zero exit status.
main().catch((err) => {
  console.error("FATAL:", err);
  process.exit(1);
});
|
||||||
1654
scripts/dual_gpu_results.json
Normal file
1654
scripts/dual_gpu_results.json
Normal file
File diff suppressed because it is too large
Load Diff
31
scripts/dual_gpu_summary.txt
Normal file
31
scripts/dual_gpu_summary.txt
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
Dual-GPU Benchmark v2 — 2026-04-06T06:52:08.868Z
|
||||||
|
2x RTX 3060 12GB | 256K Context | 58 configs | 69.4 min
|
||||||
|
|
||||||
|
=======================================================
|
||||||
|
RANKING
|
||||||
|
=======================================================
|
||||||
|
|
||||||
|
🥇 #1: Gemma4-26B Q4_K_M
|
||||||
|
AVG: 76.4 t/s | BEST: 76.75 t/s | Boot: 9s
|
||||||
|
ngl=999 t=6 ub=512 b=2048 ctk=f16 ctv=f16
|
||||||
|
|
||||||
|
🥈 #2: Gemma4-26B MXFP4_MOE
|
||||||
|
AVG: 64.05 t/s | BEST: 64.29 t/s | Boot: 9s
|
||||||
|
ngl=999 t=2 ub=512 b=2048 ctk=q8_0 ctv=q8_0
|
||||||
|
|
||||||
|
🥉 #3: Qwen3.5-35B Q4_K_M
|
||||||
|
AVG: 52.05 t/s | BEST: 54.48 t/s | Boot: 12.1s
|
||||||
|
ngl=999 t=4 ub=256 b=1024 ctk=q4_0 ctv=q4_0
|
||||||
|
--n-cpu-moe 5
|
||||||
|
|
||||||
|
#4: Qwen3.5-35B MXFP4_MOE
|
||||||
|
AVG: 46.66 t/s | BEST: 47.09 t/s | Boot: 15s
|
||||||
|
ngl=999 t=6 ub=512 b=2048 ctk=q4_0 ctv=q4_0
|
||||||
|
--n-cpu-moe 5
|
||||||
|
|
||||||
|
=======================================================
|
||||||
|
★ CHAMPION: Gemma4-26B Q4_K_M — 76.4 t/s
|
||||||
|
=======================================================
|
||||||
|
|
||||||
|
Recommended:
|
||||||
|
llama-server --model models\gemma-4-26B-A4B-it-Q4_K_M.gguf -ngl 999 -c 262144 -t 6 -tb 6 -ub 512 -b 2048 -fa on --cache-type-k f16 --cache-type-v f16 --prio 3 --poll 50 --mlock --port 8000 --host 0.0.0.0
|
||||||
BIN
scripts/final_tune_122b.txt
Normal file
BIN
scripts/final_tune_122b.txt
Normal file
Binary file not shown.
BIN
scripts/final_tune_122b_dual.txt
Normal file
BIN
scripts/final_tune_122b_dual.txt
Normal file
Binary file not shown.
101
scripts/find_max_dense.mjs
Normal file
101
scripts/find_max_dense.mjs
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
import { spawn, exec } from 'child_process';
|
||||||
|
|
||||||
|
/**
 * Return a promise that resolves (with no value) once `ms` milliseconds
 * have elapsed on a timer.
 *
 * @param {number} ms - Time to wait, in milliseconds.
 * @returns {Promise<void>}
 */
const delay = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
|
||||||
|
|
||||||
|
/**
 * Run `taskkill /F /IM llama-server.exe` and then wait two more seconds
 * before resolving.
 *
 * The outcome of the taskkill command is not inspected: the callback ignores
 * its error/stdout/stderr arguments, so this resolves the same way whether or
 * not the command succeeded.
 *
 * @returns {Promise<void>}
 */
async function killServer() {
  await new Promise((done) => {
    exec('taskkill /F /IM llama-server.exe', () => {
      setTimeout(done, 2000);
    });
  });
}
|
||||||
|
|
||||||
|
/**
 * Start llama-server for one model at one context size, wait for its
 * `/health` endpoint to answer 200, then run the quick benchmark script.
 *
 * @param {string} modelPath - Model file name; passed to the server as
 *   `models\<modelPath>`.
 * @param {number} contextSize - Value passed to the server's `-c` flag.
 * @returns {Promise<boolean>} `true` when the server became healthy and the
 *   benchmark command was run (its result is printed, not checked); `false`
 *   when the server could not be started, reported an allocation failure on
 *   stderr, exited early, or never became healthy within the polling window.
 */
async function testContextSize(modelPath, contextSize) {
  console.log(`\nTesting ${modelPath} with -c ${contextSize}...`);
  await killServer();

  const args = [
    '--model', `models\\${modelPath}`,
    '-ngl', '999',
    '-c', String(contextSize),
    '-fa', 'on',
    '--cache-type-k', 'q4_0',
    '--cache-type-v', 'q4_0',
    '-ub', '512',
    '-b', '2048',
    '-t', '6',
    '-tb', '6',
    '--split-mode', 'row',
    '--prio', '3',
    '--fit', 'off',
    '--port', '8000',
    '--host', '0.0.0.0',
  ];

  const server = spawn('llama_bin_run\\llama-server.exe', args, { stdio: 'pipe' });

  let booted = false;
  let oomed = false;
  let exited = false;
  let spawnError = null;

  // Without an 'error' listener a failed spawn (e.g. missing executable)
  // is thrown as an uncaught exception and aborts the whole run.
  server.on('error', (err) => {
    spawnError = err;
  });
  server.on('exit', () => {
    exited = true;
  });

  // stdout is piped but not needed; keep it flowing so the child can never
  // block on a full pipe. The streams may be absent if the spawn failed.
  server.stdout?.resume();

  // Keep a short tail of the previous chunk so a marker split across two
  // 'data' events is still detected.
  let stderrTail = '';
  server.stderr?.on('data', (chunk) => {
    const text = (stderrTail + chunk.toString()).toLowerCase();
    if (text.includes('out of memory') || text.includes('failed to allocate')) {
      oomed = true;
    }
    stderrTail = text.slice(-32);
  });

  for (let attempt = 0; attempt < 20; attempt++) {
    if (oomed || exited || spawnError) break;
    try {
      // fetch() has no `timeout` option; an abort signal is what actually
      // bounds a hung request.
      const res = await fetch('http://127.0.0.1:8000/health', {
        signal: AbortSignal.timeout(2000),
      });
      if (res.status === 200) {
        booted = true;
        break;
      }
    } catch {
      // Refused connections and timeouts are expected while the server is
      // still loading; just poll again.
    }
    await delay(2000);
  }

  if (oomed || !booted) {
    let reason = 'server did not become healthy';
    if (oomed) {
      reason = 'Out of Memory';
    } else if (spawnError) {
      reason = `could not start llama-server (${spawnError.message})`;
    } else if (exited) {
      reason = 'llama-server exited before becoming healthy';
    }
    console.log(`❌ Failed: ${reason} at -c ${contextSize}`);
    server.kill('SIGKILL');
    await killServer();
    return false;
  }

  console.log(`✅ Booted! Running Benchmark...`);

  // Benchmark: print whatever the helper script produced; fall back to the
  // exec error text when it produced no output at all.
  const bench = await new Promise((resolve) => {
    exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
      resolve(stdout || stderr || (err ? String(err) : ''));
    });
  });

  console.log(bench);
  await killServer();
  return true;
}
|
||||||
|
|
||||||
|
/**
 * Try candidate context sizes for a model in the given order and stop at the
 * first one for which `testContextSize` succeeds.
 *
 * @param {string} modelName - Model file name forwarded to `testContextSize`.
 * @param {number[]} [contexts] - Candidate context sizes, tried in order.
 *   Defaults to a descending ladder from 262144 down to 8192, so the first
 *   success is the largest working size on that ladder.
 * @returns {Promise<number|null>} The first context size that succeeded, or
 *   `null` when none did.
 */
async function findMaxContext(
  modelName,
  contexts = [262144, 131072, 65536, 32768, 16384, 8192],
) {
  for (const c of contexts) {
    if (await testContextSize(modelName, c)) {
      console.log(`\n🎉 MAX STABLE CONTEXT FOR ${modelName}: ${c}`);
      return c;
    }
  }

  console.log(`\n❌ Failed to find any working context size for ${modelName}`);
  return null;
}
|
||||||
|
|
||||||
|
/**
 * Run the max-context search for each of the two hard-coded model files,
 * one after the other.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Running `set CUDA_VISIBLE_DEVICES=` through exec() only changes the
  // environment of a throwaway shell. Removing the variable from this
  // process's environment is what child processes spawned later (without an
  // explicit `env`) actually inherit.
  delete process.env.CUDA_VISIBLE_DEVICES;

  console.log("============= QWEN 27B Q4_K_M =============");
  await findMaxContext('Qwen3.5-27B-Q4_K_M.gguf');

  console.log("\n============= GEMMA 4 31B Q4_K_M =============");
  await findMaxContext('gemma-4-31B-it-Q4_K_M.gguf');
}
|
||||||
|
|
||||||
|
// Script entry point. Attach a rejection handler so a failure is reported
// explicitly and the process exits with a non-zero status instead of leaving
// a floating promise.
main().catch((err) => {
  console.error("FATAL:", err);
  process.exit(1);
});
|
||||||
562
scripts/help_full.txt
Normal file
562
scripts/help_full.txt
Normal file
@@ -0,0 +1,562 @@
|
|||||||
|
----- common params -----
|
||||||
|
|
||||||
|
-h, --help, --usage print usage and exit
|
||||||
|
--version show version and build info
|
||||||
|
--license show source code license and dependencies
|
||||||
|
-cl, --cache-list show list of models in cache
|
||||||
|
--completion-bash print source-able bash completion script for llama.cpp
|
||||||
|
-t, --threads N number of CPU threads to use during generation (default: -1)
|
||||||
|
(env: LLAMA_ARG_THREADS)
|
||||||
|
-tb, --threads-batch N number of threads to use during batch and prompt processing (default:
|
||||||
|
same as --threads)
|
||||||
|
-C, --cpu-mask M CPU affinity mask: arbitrarily long hex. Complements cpu-range
|
||||||
|
(default: "")
|
||||||
|
-Cr, --cpu-range lo-hi range of CPUs for affinity. Complements --cpu-mask
|
||||||
|
--cpu-strict <0|1> use strict CPU placement (default: 0)
|
||||||
|
--prio N set process/thread priority : low(-1), normal(0), medium(1), high(2),
|
||||||
|
realtime(3) (default: 0)
|
||||||
|
--poll <0...100> use polling level to wait for work (0 - no polling, default: 50)
|
||||||
|
-Cb, --cpu-mask-batch M CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch
|
||||||
|
(default: same as --cpu-mask)
|
||||||
|
-Crb, --cpu-range-batch lo-hi ranges of CPUs for affinity. Complements --cpu-mask-batch
|
||||||
|
--cpu-strict-batch <0|1> use strict CPU placement (default: same as --cpu-strict)
|
||||||
|
--prio-batch N set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime
|
||||||
|
(default: 0)
|
||||||
|
--poll-batch <0|1> use polling to wait for work (default: same as --poll)
|
||||||
|
-c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model)
|
||||||
|
(env: LLAMA_ARG_CTX_SIZE)
|
||||||
|
-n, --predict, --n-predict N number of tokens to predict (default: -1, -1 = infinity)
|
||||||
|
(env: LLAMA_ARG_N_PREDICT)
|
||||||
|
-b, --batch-size N logical maximum batch size (default: 2048)
|
||||||
|
(env: LLAMA_ARG_BATCH)
|
||||||
|
-ub, --ubatch-size N physical maximum batch size (default: 512)
|
||||||
|
(env: LLAMA_ARG_UBATCH)
|
||||||
|
--keep N number of tokens to keep from the initial prompt (default: 0, -1 =
|
||||||
|
all)
|
||||||
|
--swa-full use full-size SWA cache (default: false)
|
||||||
|
[(more
|
||||||
|
info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
||||||
|
(env: LLAMA_ARG_SWA_FULL)
|
||||||
|
-fa, --flash-attn [on|off|auto] set Flash Attention use ('on', 'off', or 'auto', default: 'auto')
|
||||||
|
(env: LLAMA_ARG_FLASH_ATTN)
|
||||||
|
--perf, --no-perf whether to enable internal libllama performance timings (default:
|
||||||
|
false)
|
||||||
|
(env: LLAMA_ARG_PERF)
|
||||||
|
-e, --escape, --no-escape whether to process escapes sequences (\n, \r, \t, \', \", \\)
|
||||||
|
(default: true)
|
||||||
|
--rope-scaling {none,linear,yarn} RoPE frequency scaling method, defaults to linear unless specified by
|
||||||
|
the model
|
||||||
|
(env: LLAMA_ARG_ROPE_SCALING_TYPE)
|
||||||
|
--rope-scale N RoPE context scaling factor, expands context by a factor of N
|
||||||
|
(env: LLAMA_ARG_ROPE_SCALE)
|
||||||
|
--rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from
|
||||||
|
model)
|
||||||
|
(env: LLAMA_ARG_ROPE_FREQ_BASE)
|
||||||
|
--rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N
|
||||||
|
(env: LLAMA_ARG_ROPE_FREQ_SCALE)
|
||||||
|
--yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training
|
||||||
|
context size)
|
||||||
|
(env: LLAMA_ARG_YARN_ORIG_CTX)
|
||||||
|
--yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.00, 0.0 = full
|
||||||
|
interpolation)
|
||||||
|
(env: LLAMA_ARG_YARN_EXT_FACTOR)
|
||||||
|
--yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: -1.00)
|
||||||
|
(env: LLAMA_ARG_YARN_ATTN_FACTOR)
|
||||||
|
--yarn-beta-slow N YaRN: high correction dim or alpha (default: -1.00)
|
||||||
|
(env: LLAMA_ARG_YARN_BETA_SLOW)
|
||||||
|
--yarn-beta-fast N YaRN: low correction dim or beta (default: -1.00)
|
||||||
|
(env: LLAMA_ARG_YARN_BETA_FAST)
|
||||||
|
-kvo, --kv-offload, -nkvo, --no-kv-offload
|
||||||
|
whether to enable KV cache offloading (default: enabled)
|
||||||
|
(env: LLAMA_ARG_KV_OFFLOAD)
|
||||||
|
--repack, -nr, --no-repack whether to enable weight repacking (default: enabled)
|
||||||
|
(env: LLAMA_ARG_REPACK)
|
||||||
|
--no-host bypass host buffer allowing extra buffers to be used
|
||||||
|
(env: LLAMA_ARG_NO_HOST)
|
||||||
|
-ctk, --cache-type-k TYPE KV cache data type for K
|
||||||
|
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||||
|
(default: f16)
|
||||||
|
(env: LLAMA_ARG_CACHE_TYPE_K)
|
||||||
|
-ctv, --cache-type-v TYPE KV cache data type for V
|
||||||
|
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||||
|
(default: f16)
|
||||||
|
(env: LLAMA_ARG_CACHE_TYPE_V)
|
||||||
|
-dt, --defrag-thold N KV cache defragmentation threshold (DEPRECATED)
|
||||||
|
(env: LLAMA_ARG_DEFRAG_THOLD)
|
||||||
|
--rpc SERVERS comma separated list of RPC servers (host:port)
|
||||||
|
(env: LLAMA_ARG_RPC)
|
||||||
|
--mlock force system to keep model in RAM rather than swapping or compressing
|
||||||
|
(env: LLAMA_ARG_MLOCK)
|
||||||
|
--mmap, --no-mmap whether to memory-map model. (if mmap disabled, slower load but may
|
||||||
|
reduce pageouts if not using mlock) (default: enabled)
|
||||||
|
(env: LLAMA_ARG_MMAP)
|
||||||
|
-dio, --direct-io, -ndio, --no-direct-io
|
||||||
|
use DirectIO if available. (default: disabled)
|
||||||
|
(env: LLAMA_ARG_DIO)
|
||||||
|
--numa TYPE attempt optimizations that help on some NUMA systems
|
||||||
|
- distribute: spread execution evenly over all nodes
|
||||||
|
- isolate: only spawn threads on CPUs on the node that execution
|
||||||
|
started on
|
||||||
|
- numactl: use the CPU map provided by numactl
|
||||||
|
if run without this previously, it is recommended to drop the system
|
||||||
|
page cache before using this
|
||||||
|
see https://github.com/ggml-org/llama.cpp/issues/1437
|
||||||
|
(env: LLAMA_ARG_NUMA)
|
||||||
|
-dev, --device <dev1,dev2,..> comma-separated list of devices to use for offloading (none = don't
|
||||||
|
offload)
|
||||||
|
use --list-devices to see a list of available devices
|
||||||
|
(env: LLAMA_ARG_DEVICE)
|
||||||
|
--list-devices print list of available devices and exit
|
||||||
|
-ot, --override-tensor <tensor name pattern>=<buffer type>,...
|
||||||
|
override tensor buffer type
|
||||||
|
(env: LLAMA_ARG_OVERRIDE_TENSOR)
|
||||||
|
-cmoe, --cpu-moe keep all Mixture of Experts (MoE) weights in the CPU
|
||||||
|
(env: LLAMA_ARG_CPU_MOE)
|
||||||
|
-ncmoe, --n-cpu-moe N keep the Mixture of Experts (MoE) weights of the first N layers in the
|
||||||
|
CPU
|
||||||
|
(env: LLAMA_ARG_N_CPU_MOE)
|
||||||
|
-ngl, --gpu-layers, --n-gpu-layers N max. number of layers to store in VRAM, either an exact number,
|
||||||
|
'auto', or 'all' (default: auto)
|
||||||
|
(env: LLAMA_ARG_N_GPU_LAYERS)
|
||||||
|
-sm, --split-mode {none,layer,row} how to split the model across multiple GPUs, one of:
|
||||||
|
- none: use one GPU only
|
||||||
|
- layer (default): split layers and KV across GPUs
|
||||||
|
- row: split rows across GPUs
|
||||||
|
(env: LLAMA_ARG_SPLIT_MODE)
|
||||||
|
-ts, --tensor-split N0,N1,N2,... fraction of the model to offload to each GPU, comma-separated list of
|
||||||
|
proportions, e.g. 3,1
|
||||||
|
(env: LLAMA_ARG_TENSOR_SPLIT)
|
||||||
|
-mg, --main-gpu INDEX the GPU to use for the model (with split-mode = none), or for
|
||||||
|
intermediate results and KV (with split-mode = row) (default: 0)
|
||||||
|
(env: LLAMA_ARG_MAIN_GPU)
|
||||||
|
-fit, --fit [on|off] whether to adjust unset arguments to fit in device memory ('on' or
|
||||||
|
'off', default: 'on')
|
||||||
|
(env: LLAMA_ARG_FIT)
|
||||||
|
-fitt, --fit-target MiB0,MiB1,MiB2,...
|
||||||
|
target margin per device for --fit, comma-separated list of values,
|
||||||
|
single value is broadcast across all devices, default: 1024
|
||||||
|
(env: LLAMA_ARG_FIT_TARGET)
|
||||||
|
-fitc, --fit-ctx N minimum ctx size that can be set by --fit option, default: 4096
|
||||||
|
(env: LLAMA_ARG_FIT_CTX)
|
||||||
|
--check-tensors check model tensor data for invalid values (default: false)
|
||||||
|
--override-kv KEY=TYPE:VALUE,... advanced option to override model metadata by key. to specify multiple
|
||||||
|
overrides, either use comma-separated values.
|
||||||
|
types: int, float, bool, str. example: --override-kv
|
||||||
|
tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false
|
||||||
|
--op-offload, --no-op-offload whether to offload host tensor operations to device (default: true)
|
||||||
|
--lora FNAME path to LoRA adapter (use comma-separated values to load multiple
|
||||||
|
adapters)
|
||||||
|
--lora-scaled FNAME:SCALE,... path to LoRA adapter with user defined scaling (format:
|
||||||
|
FNAME:SCALE,...)
|
||||||
|
note: use comma-separated values
|
||||||
|
--control-vector FNAME add a control vector
|
||||||
|
note: use comma-separated values to add multiple control vectors
|
||||||
|
--control-vector-scaled FNAME:SCALE,...
|
||||||
|
add a control vector with user defined scaling SCALE
|
||||||
|
note: use comma-separated values (format: FNAME:SCALE,...)
|
||||||
|
--control-vector-layer-range START END
|
||||||
|
layer range to apply the control vector(s) to, start and end inclusive
|
||||||
|
-m, --model FNAME model path to load
|
||||||
|
(env: LLAMA_ARG_MODEL)
|
||||||
|
-mu, --model-url MODEL_URL model download url (default: unused)
|
||||||
|
(env: LLAMA_ARG_MODEL_URL)
|
||||||
|
-dr, --docker-repo [<repo>/]<model>[:quant]
|
||||||
|
Docker Hub model repository. repo is optional, default to ai/. quant
|
||||||
|
is optional, default to :latest.
|
||||||
|
example: gemma3
|
||||||
|
(default: unused)
|
||||||
|
(env: LLAMA_ARG_DOCKER_REPO)
|
||||||
|
-hf, -hfr, --hf-repo <user>/<model>[:quant]
|
||||||
|
Hugging Face model repository; quant is optional, case-insensitive,
|
||||||
|
default to Q4_K_M, or falls back to the first file in the repo if
|
||||||
|
Q4_K_M doesn't exist.
|
||||||
|
mmproj is also downloaded automatically if available. to disable, add
|
||||||
|
--no-mmproj
|
||||||
|
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
|
||||||
|
(default: unused)
|
||||||
|
(env: LLAMA_ARG_HF_REPO)
|
||||||
|
-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]
|
||||||
|
Same as --hf-repo, but for the draft model (default: unused)
|
||||||
|
(env: LLAMA_ARG_HFD_REPO)
|
||||||
|
-hff, --hf-file FILE Hugging Face model file. If specified, it will override the quant in
|
||||||
|
--hf-repo (default: unused)
|
||||||
|
(env: LLAMA_ARG_HF_FILE)
|
||||||
|
-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]
|
||||||
|
Hugging Face model repository for the vocoder model (default: unused)
|
||||||
|
(env: LLAMA_ARG_HF_REPO_V)
|
||||||
|
-hffv, --hf-file-v FILE Hugging Face model file for the vocoder model (default: unused)
|
||||||
|
(env: LLAMA_ARG_HF_FILE_V)
|
||||||
|
-hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment
|
||||||
|
variable)
|
||||||
|
(env: HF_TOKEN)
|
||||||
|
--log-disable Log disable
|
||||||
|
--log-file FNAME Log to file
|
||||||
|
(env: LLAMA_LOG_FILE)
|
||||||
|
--log-colors [on|off|auto] Set colored logging ('on', 'off', or 'auto', default: 'auto')
|
||||||
|
'auto' enables colors when output is to a terminal
|
||||||
|
(env: LLAMA_LOG_COLORS)
|
||||||
|
-v, --verbose, --log-verbose Set verbosity level to infinity (i.e. log all messages, useful for
|
||||||
|
debugging)
|
||||||
|
--offline Offline mode: forces use of cache, prevents network access
|
||||||
|
(env: LLAMA_OFFLINE)
|
||||||
|
-lv, --verbosity, --log-verbosity N Set the verbosity threshold. Messages with a higher verbosity will be
|
||||||
|
ignored. Values:
|
||||||
|
- 0: generic output
|
||||||
|
- 1: error
|
||||||
|
- 2: warning
|
||||||
|
- 3: info
|
||||||
|
- 4: debug
|
||||||
|
(default: 3)
|
||||||
|
|
||||||
|
(env: LLAMA_LOG_VERBOSITY)
|
||||||
|
--log-prefix Enable prefix in log messages
|
||||||
|
(env: LLAMA_LOG_PREFIX)
|
||||||
|
--log-timestamps Enable timestamps in log messages
|
||||||
|
(env: LLAMA_LOG_TIMESTAMPS)
|
||||||
|
-ctkd, --cache-type-k-draft TYPE KV cache data type for K for the draft model
|
||||||
|
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||||
|
(default: f16)
|
||||||
|
(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT)
|
||||||
|
-ctvd, --cache-type-v-draft TYPE KV cache data type for V for the draft model
|
||||||
|
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||||
|
(default: f16)
|
||||||
|
(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT)
|
||||||
|
|
||||||
|
|
||||||
|
----- sampling params -----
|
||||||
|
|
||||||
|
--samplers SAMPLERS samplers that will be used for generation in the order, separated by
|
||||||
|
';'
|
||||||
|
(default:
|
||||||
|
penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature)
|
||||||
|
-s, --seed SEED RNG seed (default: -1, use random seed for -1)
|
||||||
|
--sampler-seq, --sampling-seq SEQUENCE
|
||||||
|
simplified sequence for samplers that will be used (default:
|
||||||
|
edskypmxt)
|
||||||
|
--ignore-eos ignore end of stream token and continue generating (implies
|
||||||
|
--logit-bias EOS-inf)
|
||||||
|
--temp, --temperature N temperature (default: 0.80)
|
||||||
|
--top-k N top-k sampling (default: 40, 0 = disabled)
|
||||||
|
(env: LLAMA_ARG_TOP_K)
|
||||||
|
--top-p N top-p sampling (default: 0.95, 1.0 = disabled)
|
||||||
|
--min-p N min-p sampling (default: 0.05, 0.0 = disabled)
|
||||||
|
--top-nsigma, --top-n-sigma N top-n-sigma sampling (default: -1.00, -1.0 = disabled)
|
||||||
|
--xtc-probability N xtc probability (default: 0.00, 0.0 = disabled)
|
||||||
|
--xtc-threshold N xtc threshold (default: 0.10, 1.0 = disabled)
|
||||||
|
--typical, --typical-p N locally typical sampling, parameter p (default: 1.00, 1.0 = disabled)
|
||||||
|
--repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1
|
||||||
|
= ctx_size)
|
||||||
|
--repeat-penalty N penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled)
|
||||||
|
--presence-penalty N repeat alpha presence penalty (default: 0.00, 0.0 = disabled)
|
||||||
|
--frequency-penalty N repeat alpha frequency penalty (default: 0.00, 0.0 = disabled)
|
||||||
|
--dry-multiplier N set DRY sampling multiplier (default: 0.00, 0.0 = disabled)
|
||||||
|
--dry-base N set DRY sampling base value (default: 1.75)
|
||||||
|
--dry-allowed-length N set allowed length for DRY sampling (default: 2)
|
||||||
|
--dry-penalty-last-n N set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 =
|
||||||
|
context size)
|
||||||
|
--dry-sequence-breaker STRING add sequence breaker for DRY sampling, clearing out default breakers
|
||||||
|
('\n', ':', '"', '*') in the process; use "none" to not use any
|
||||||
|
sequence breakers
|
||||||
|
--adaptive-target N adaptive-p: select tokens near this probability (valid range 0.0 to
|
||||||
|
1.0; negative = disabled) (default: -1.00)
|
||||||
|
[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)
|
||||||
|
--adaptive-decay N adaptive-p: decay rate for target adaptation over time. lower values
|
||||||
|
are more reactive, higher values are more stable.
|
||||||
|
(valid range 0.0 to 0.99) (default: 0.90)
|
||||||
|
--dynatemp-range N dynamic temperature range (default: 0.00, 0.0 = disabled)
|
||||||
|
--dynatemp-exp N dynamic temperature exponent (default: 1.00)
|
||||||
|
--mirostat N use Mirostat sampling.
|
||||||
|
Top K, Nucleus and Locally Typical samplers are ignored if used.
|
||||||
|
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
|
||||||
|
--mirostat-lr N Mirostat learning rate, parameter eta (default: 0.10)
|
||||||
|
--mirostat-ent N Mirostat target entropy, parameter tau (default: 5.00)
|
||||||
|
-l, --logit-bias TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion,
|
||||||
|
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
|
||||||
|
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
|
||||||
|
--grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/
|
||||||
|
dir)
|
||||||
|
--grammar-file FNAME file to read grammar from
|
||||||
|
-j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g.
|
||||||
|
`{}` for any JSON object
|
||||||
|
For schemas w/ external $refs, use --grammar +
|
||||||
|
example/json_schema_to_grammar.py instead
|
||||||
|
-jf, --json-schema-file FILE File containing a JSON schema to constrain generations
|
||||||
|
(https://json-schema.org/), e.g. `{}` for any JSON object
|
||||||
|
For schemas w/ external $refs, use --grammar +
|
||||||
|
example/json_schema_to_grammar.py instead
|
||||||
|
-bs, --backend-sampling enable backend sampling (experimental) (default: disabled)
|
||||||
|
(env: LLAMA_ARG_BACKEND_SAMPLING)
|
||||||
|
|
||||||
|
|
||||||
|
----- example-specific params -----
|
||||||
|
|
||||||
|
-lcs, --lookup-cache-static FNAME path to static lookup cache to use for lookup decoding (not updated by
|
||||||
|
generation)
|
||||||
|
-lcd, --lookup-cache-dynamic FNAME path to dynamic lookup cache to use for lookup decoding (updated by
|
||||||
|
generation)
|
||||||
|
-ctxcp, --ctx-checkpoints, --swa-checkpoints N
|
||||||
|
max number of context checkpoints to create per slot (default:
|
||||||
|
32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
|
||||||
|
(env: LLAMA_ARG_CTX_CHECKPOINTS)
|
||||||
|
-cpent, --checkpoint-every-n-tokens N create a checkpoint every n tokens during prefill (processing), -1 to
|
||||||
|
disable (default: 8192)
|
||||||
|
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT)
|
||||||
|
-cram, --cache-ram N set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 -
|
||||||
|
disable)[(more
|
||||||
|
info)](https://github.com/ggml-org/llama.cpp/pull/16391)
|
||||||
|
(env: LLAMA_ARG_CACHE_RAM)
|
||||||
|
-kvu, --kv-unified, -no-kvu, --no-kv-unified
|
||||||
|
use single unified KV buffer shared across all sequences (default:
|
||||||
|
enabled if number of slots is auto)
|
||||||
|
(env: LLAMA_ARG_KV_UNIFIED)
|
||||||
|
--clear-idle, --no-clear-idle save and clear idle slots on new task (default: enabled, requires
|
||||||
|
unified KV and cache-ram)
|
||||||
|
(env: LLAMA_ARG_CLEAR_IDLE)
|
||||||
|
--context-shift, --no-context-shift whether to use context shift on infinite text generation (default:
|
||||||
|
disabled)
|
||||||
|
(env: LLAMA_ARG_CONTEXT_SHIFT)
|
||||||
|
-r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode
|
||||||
|
-sp, --special special tokens output enabled (default: false)
|
||||||
|
--warmup, --no-warmup whether to perform warmup with an empty run (default: enabled)
|
||||||
|
--spm-infill use Suffix/Prefix/Middle pattern for infill (instead of
|
||||||
|
Prefix/Suffix/Middle) as some models prefer this. (default: disabled)
|
||||||
|
--pooling {none,mean,cls,last,rank} pooling type for embeddings, use model default if unspecified
|
||||||
|
(env: LLAMA_ARG_POOLING)
|
||||||
|
-np, --parallel N number of server slots (default: -1, -1 = auto)
|
||||||
|
(env: LLAMA_ARG_N_PARALLEL)
|
||||||
|
-cb, --cont-batching, -nocb, --no-cont-batching
|
||||||
|
whether to enable continuous batching (a.k.a dynamic batching)
|
||||||
|
(default: enabled)
|
||||||
|
(env: LLAMA_ARG_CONT_BATCHING)
|
||||||
|
-mm, --mmproj FILE path to a multimodal projector file. see tools/mtmd/README.md
|
||||||
|
note: if -hf is used, this argument can be omitted
|
||||||
|
(env: LLAMA_ARG_MMPROJ)
|
||||||
|
-mmu, --mmproj-url URL URL to a multimodal projector file. see tools/mtmd/README.md
|
||||||
|
(env: LLAMA_ARG_MMPROJ_URL)
|
||||||
|
--mmproj-auto, --no-mmproj, --no-mmproj-auto
|
||||||
|
whether to use multimodal projector file (if available), useful when
|
||||||
|
using -hf (default: enabled)
|
||||||
|
(env: LLAMA_ARG_MMPROJ_AUTO)
|
||||||
|
--mmproj-offload, --no-mmproj-offload whether to enable GPU offloading for multimodal projector (default:
|
||||||
|
enabled)
|
||||||
|
(env: LLAMA_ARG_MMPROJ_OFFLOAD)
|
||||||
|
--image-min-tokens N minimum number of tokens each image can take, only used by vision
|
||||||
|
models with dynamic resolution (default: read from model)
|
||||||
|
(env: LLAMA_ARG_IMAGE_MIN_TOKENS)
|
||||||
|
--image-max-tokens N maximum number of tokens each image can take, only used by vision
|
||||||
|
models with dynamic resolution (default: read from model)
|
||||||
|
(env: LLAMA_ARG_IMAGE_MAX_TOKENS)
|
||||||
|
-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...
|
||||||
|
override tensor buffer type for draft model
|
||||||
|
-cmoed, --cpu-moe-draft keep all Mixture of Experts (MoE) weights in the CPU for the draft
|
||||||
|
model
|
||||||
|
(env: LLAMA_ARG_CPU_MOE_DRAFT)
|
||||||
|
-ncmoed, --n-cpu-moe-draft N keep the Mixture of Experts (MoE) weights of the first N layers in the
|
||||||
|
CPU for the draft model
|
||||||
|
(env: LLAMA_ARG_N_CPU_MOE_DRAFT)
|
||||||
|
-a, --alias STRING set model name aliases, comma-separated (to be used by API)
|
||||||
|
(env: LLAMA_ARG_ALIAS)
|
||||||
|
--tags STRING set model tags, comma-separated (informational, not used for routing)
|
||||||
|
(env: LLAMA_ARG_TAGS)
|
||||||
|
--host HOST ip address to listen, or bind to an UNIX socket if the address ends
|
||||||
|
with .sock (default: 127.0.0.1)
|
||||||
|
(env: LLAMA_ARG_HOST)
|
||||||
|
--port PORT port to listen (default: 8080)
|
||||||
|
(env: LLAMA_ARG_PORT)
|
||||||
|
--reuse-port allow multiple sockets to bind to the same port (default: disabled)
|
||||||
|
(env: LLAMA_ARG_REUSE_PORT)
|
||||||
|
--path PATH path to serve static files from (default: )
|
||||||
|
(env: LLAMA_ARG_STATIC_PATH)
|
||||||
|
--api-prefix PREFIX prefix path the server serves from, without the trailing slash
|
||||||
|
(default: )
|
||||||
|
(env: LLAMA_ARG_API_PREFIX)
|
||||||
|
--webui-config JSON JSON that provides default WebUI settings (overrides WebUI defaults)
|
||||||
|
(env: LLAMA_ARG_WEBUI_CONFIG)
|
||||||
|
--webui-config-file PATH JSON file that provides default WebUI settings (overrides WebUI
|
||||||
|
defaults)
|
||||||
|
(env: LLAMA_ARG_WEBUI_CONFIG_FILE)
|
||||||
|
--webui-mcp-proxy, --no-webui-mcp-proxy
|
||||||
|
experimental: whether to enable MCP CORS proxy - do not enable in
|
||||||
|
untrusted environments (default: disabled)
|
||||||
|
(env: LLAMA_ARG_WEBUI_MCP_PROXY)
|
||||||
|
--tools TOOL1,TOOL2,... experimental: whether to enable built-in tools for AI agents - do not
|
||||||
|
enable in untrusted environments (default: no tools)
|
||||||
|
specify "all" to enable all tools
|
||||||
|
available tools: read_file, file_glob_search, grep_search,
|
||||||
|
exec_shell_command, write_file, edit_file, apply_diff
|
||||||
|
(env: LLAMA_ARG_TOOLS)
|
||||||
|
--webui, --no-webui whether to enable the Web UI (default: enabled)
|
||||||
|
(env: LLAMA_ARG_WEBUI)
|
||||||
|
--embedding, --embeddings restrict to only support embedding use case; use only with dedicated
|
||||||
|
embedding models (default: disabled)
|
||||||
|
(env: LLAMA_ARG_EMBEDDINGS)
|
||||||
|
--rerank, --reranking enable reranking endpoint on server (default: disabled)
|
||||||
|
(env: LLAMA_ARG_RERANKING)
|
||||||
|
--api-key KEY API key to use for authentication, multiple keys can be provided as a
|
||||||
|
comma-separated list (default: none)
|
||||||
|
(env: LLAMA_API_KEY)
|
||||||
|
--api-key-file FNAME path to file containing API keys (default: none)
|
||||||
|
--ssl-key-file FNAME path to file a PEM-encoded SSL private key
|
||||||
|
(env: LLAMA_ARG_SSL_KEY_FILE)
|
||||||
|
--ssl-cert-file FNAME path to file a PEM-encoded SSL certificate
|
||||||
|
(env: LLAMA_ARG_SSL_CERT_FILE)
|
||||||
|
--chat-template-kwargs STRING sets additional params for the json template parser, must be a valid
|
||||||
|
json object string, e.g. '{"key1":"value1","key2":"value2"}'
|
||||||
|
(env: LLAMA_CHAT_TEMPLATE_KWARGS)
|
||||||
|
-to, --timeout N server read/write timeout in seconds (default: 600)
|
||||||
|
(env: LLAMA_ARG_TIMEOUT)
|
||||||
|
--threads-http N number of threads used to process HTTP requests (default: -1)
|
||||||
|
(env: LLAMA_ARG_THREADS_HTTP)
|
||||||
|
--cache-prompt, --no-cache-prompt whether to enable prompt caching (default: enabled)
|
||||||
|
(env: LLAMA_ARG_CACHE_PROMPT)
|
||||||
|
--cache-reuse N min chunk size to attempt reusing from the cache via KV shifting,
|
||||||
|
requires prompt caching to be enabled (default: 0)
|
||||||
|
[(card)](https://ggml.ai/f0.png)
|
||||||
|
(env: LLAMA_ARG_CACHE_REUSE)
|
||||||
|
--metrics enable prometheus compatible metrics endpoint (default: disabled)
|
||||||
|
(env: LLAMA_ARG_ENDPOINT_METRICS)
|
||||||
|
--props enable changing global properties via POST /props (default: disabled)
|
||||||
|
(env: LLAMA_ARG_ENDPOINT_PROPS)
|
||||||
|
--slots, --no-slots expose slots monitoring endpoint (default: enabled)
|
||||||
|
(env: LLAMA_ARG_ENDPOINT_SLOTS)
|
||||||
|
--slot-save-path PATH path to save slot kv cache (default: disabled)
|
||||||
|
--media-path PATH directory for loading local media files; files can be accessed via
|
||||||
|
file:// URLs using relative paths (default: disabled)
|
||||||
|
--models-dir PATH directory containing models for the router server (default: disabled)
|
||||||
|
(env: LLAMA_ARG_MODELS_DIR)
|
||||||
|
--models-preset PATH path to INI file containing model presets for the router server
|
||||||
|
(default: disabled)
|
||||||
|
(env: LLAMA_ARG_MODELS_PRESET)
|
||||||
|
--models-max N for router server, maximum number of models to load simultaneously
|
||||||
|
(default: 4, 0 = unlimited)
|
||||||
|
(env: LLAMA_ARG_MODELS_MAX)
|
||||||
|
--models-autoload, --no-models-autoload
|
||||||
|
for router server, whether to automatically load models (default:
|
||||||
|
enabled)
|
||||||
|
(env: LLAMA_ARG_MODELS_AUTOLOAD)
|
||||||
|
--jinja, --no-jinja whether to use jinja template engine for chat (default: enabled)
|
||||||
|
(env: LLAMA_ARG_JINJA)
|
||||||
|
--reasoning-format FORMAT controls whether thought tags are allowed and/or extracted from the
|
||||||
|
response, and in which format they're returned; one of:
|
||||||
|
- none: leaves thoughts unparsed in `message.content`
|
||||||
|
- deepseek: puts thoughts in `message.reasoning_content`
|
||||||
|
- deepseek-legacy: keeps `<think>` tags in `message.content` while
|
||||||
|
also populating `message.reasoning_content`
|
||||||
|
(default: auto)
|
||||||
|
(env: LLAMA_ARG_THINK)
|
||||||
|
-rea, --reasoning [on|off|auto] Use reasoning/thinking in the chat ('on', 'off', or 'auto', default:
|
||||||
|
'auto' (detect from template))
|
||||||
|
(env: LLAMA_ARG_REASONING)
|
||||||
|
--reasoning-budget N token budget for thinking: -1 for unrestricted, 0 for immediate end,
|
||||||
|
N>0 for token budget (default: -1)
|
||||||
|
(env: LLAMA_ARG_THINK_BUDGET)
|
||||||
|
--reasoning-budget-message MESSAGE message injected before the end-of-thinking tag when reasoning budget
|
||||||
|
is exhausted (default: none)
|
||||||
|
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE)
|
||||||
|
--chat-template JINJA_TEMPLATE set custom jinja chat template (default: template taken from model's
|
||||||
|
metadata)
|
||||||
|
if suffix/prefix are specified, template will be disabled
|
||||||
|
only commonly used templates are accepted (unless --jinja is set
|
||||||
|
before this flag):
|
||||||
|
list of built-in templates:
|
||||||
|
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml,
|
||||||
|
command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe,
|
||||||
|
exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite,
|
||||||
|
granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2,
|
||||||
|
llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez,
|
||||||
|
minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7,
|
||||||
|
mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3,
|
||||||
|
phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca,
|
||||||
|
yandex, zephyr
|
||||||
|
(env: LLAMA_ARG_CHAT_TEMPLATE)
|
||||||
|
--chat-template-file JINJA_TEMPLATE_FILE
|
||||||
|
set custom jinja chat template file (default: template taken from
|
||||||
|
model's metadata)
|
||||||
|
if suffix/prefix are specified, template will be disabled
|
||||||
|
only commonly used templates are accepted (unless --jinja is set
|
||||||
|
before this flag):
|
||||||
|
list of built-in templates:
|
||||||
|
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml,
|
||||||
|
command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe,
|
||||||
|
exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite,
|
||||||
|
granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2,
|
||||||
|
llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez,
|
||||||
|
minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7,
|
||||||
|
mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3,
|
||||||
|
phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca,
|
||||||
|
yandex, zephyr
|
||||||
|
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE)
|
||||||
|
--skip-chat-parsing, --no-skip-chat-parsing
|
||||||
|
force a pure content parser, even if a Jinja template is specified;
|
||||||
|
model will output everything in the content section, including any
|
||||||
|
reasoning and/or tool calls (default: disabled)
|
||||||
|
(env: LLAMA_ARG_SKIP_CHAT_PARSING)
|
||||||
|
--prefill-assistant, --no-prefill-assistant
|
||||||
|
whether to prefill the assistant's response if the last message is an
|
||||||
|
assistant message (default: prefill enabled)
|
||||||
|
when this flag is set, if the last message is an assistant message
|
||||||
|
then it will be treated as a full message and not prefilled
|
||||||
|
|
||||||
|
(env: LLAMA_ARG_PREFILL_ASSISTANT)
|
||||||
|
-sps, --slot-prompt-similarity SIMILARITY
|
||||||
|
how much the prompt of a request must match the prompt of a slot in
|
||||||
|
order to use that slot (default: 0.10, 0.0 = disabled)
|
||||||
|
--lora-init-without-apply load LoRA adapters without applying them (apply later via POST
|
||||||
|
/lora-adapters) (default: disabled)
|
||||||
|
--sleep-idle-seconds SECONDS number of seconds of idleness after which the server will sleep
|
||||||
|
(default: -1; -1 = disabled)
|
||||||
|
-td, --threads-draft N number of threads to use during generation (default: same as
|
||||||
|
--threads)
|
||||||
|
-tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default:
|
||||||
|
same as --threads-draft)
|
||||||
|
--draft, --draft-n, --draft-max N number of tokens to draft for speculative decoding (default: 16)
|
||||||
|
(env: LLAMA_ARG_DRAFT_MAX)
|
||||||
|
--draft-min, --draft-n-min N minimum number of draft tokens to use for speculative decoding
|
||||||
|
(default: 0)
|
||||||
|
(env: LLAMA_ARG_DRAFT_MIN)
|
||||||
|
--draft-p-min P minimum speculative decoding probability (greedy) (default: 0.75)
|
||||||
|
(env: LLAMA_ARG_DRAFT_P_MIN)
|
||||||
|
-cd, --ctx-size-draft N size of the prompt context for the draft model (default: 0, 0 = loaded
|
||||||
|
from model)
|
||||||
|
(env: LLAMA_ARG_CTX_SIZE_DRAFT)
|
||||||
|
-devd, --device-draft <dev1,dev2,..> comma-separated list of devices to use for offloading the draft model
|
||||||
|
(none = don't offload)
|
||||||
|
use --list-devices to see a list of available devices
|
||||||
|
-ngld, --gpu-layers-draft, --n-gpu-layers-draft N
|
||||||
|
max. number of draft model layers to store in VRAM, either an exact
|
||||||
|
number, 'auto', or 'all' (default: auto)
|
||||||
|
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT)
|
||||||
|
-md, --model-draft FNAME draft model for speculative decoding (default: unused)
|
||||||
|
(env: LLAMA_ARG_MODEL_DRAFT)
|
||||||
|
--spec-replace TARGET DRAFT translate the string in TARGET into DRAFT if the draft model and main
|
||||||
|
model are not compatible
|
||||||
|
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
|
||||||
|
type of speculative decoding to use when no draft model is provided
|
||||||
|
(default: none)
|
||||||
|
|
||||||
|
(env: LLAMA_ARG_SPEC_TYPE)
|
||||||
|
--spec-ngram-size-n N ngram size N for ngram-simple/ngram-map speculative decoding, length
|
||||||
|
of lookup n-gram (default: 12)
|
||||||
|
--spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length
|
||||||
|
of draft m-gram (default: 48)
|
||||||
|
--spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1)
|
||||||
|
-mv, --model-vocoder FNAME vocoder model for audio generation (default: unused)
|
||||||
|
--tts-use-guide-tokens Use guide tokens to improve TTS word recall
|
||||||
|
--embd-gemma-default use default EmbeddingGemma model (note: can download weights from the
|
||||||
|
internet)
|
||||||
|
--fim-qwen-1.5b-default use default Qwen 2.5 Coder 1.5B (note: can download weights from the
|
||||||
|
internet)
|
||||||
|
--fim-qwen-3b-default use default Qwen 2.5 Coder 3B (note: can download weights from the
|
||||||
|
internet)
|
||||||
|
--fim-qwen-7b-default use default Qwen 2.5 Coder 7B (note: can download weights from the
|
||||||
|
internet)
|
||||||
|
--fim-qwen-7b-spec use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can
|
||||||
|
download weights from the internet)
|
||||||
|
--fim-qwen-14b-spec use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note:
|
||||||
|
can download weights from the internet)
|
||||||
|
--fim-qwen-30b-default use default Qwen 3 Coder 30B A3B Instruct (note: can download weights
|
||||||
|
from the internet)
|
||||||
|
--gpt-oss-20b-default use gpt-oss-20b (note: can download weights from the internet)
|
||||||
|
--gpt-oss-120b-default use gpt-oss-120b (note: can download weights from the internet)
|
||||||
|
--vision-gemma-4b-default use Gemma 3 4B QAT (note: can download weights from the internet)
|
||||||
|
--vision-gemma-12b-default use Gemma 3 12B QAT (note: can download weights from the internet)
|
||||||
31
scripts/help_gpu_flags.txt
Normal file
31
scripts/help_gpu_flags.txt
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
ggml_cuda_init: found 2 CUDA devices (Total VRAM: 24575 MiB):
|
||||||
|
Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes, VRAM: 12287 MiB
|
||||||
|
Device 1: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes, VRAM: 12287 MiB
|
||||||
|
-dev, --device <dev1,dev2,..> comma-separated list of devices to use for offloading (none = don't
|
||||||
|
use --list-devices to see a list of available devices
|
||||||
|
(env: LLAMA_ARG_DEVICE)
|
||||||
|
--list-devices print list of available devices and exit
|
||||||
|
-ot, --override-tensor <tensor name pattern>=<buffer type>,...
|
||||||
|
override tensor buffer type
|
||||||
|
(env: LLAMA_ARG_OVERRIDE_TENSOR)
|
||||||
|
-cmoe, --cpu-moe keep all Mixture of Experts (MoE) weights in the CPU
|
||||||
|
-ncmoe, --n-cpu-moe N keep the Mixture of Experts (MoE) weights of the first N layers in the
|
||||||
|
-sm, --split-mode {none,layer,row} how to split the model across multiple GPUs, one of:
|
||||||
|
- layer (default): split layers and KV across GPUs
|
||||||
|
- row: split rows across GPUs
|
||||||
|
(env: LLAMA_ARG_SPLIT_MODE)
|
||||||
|
-ts, --tensor-split N0,N1,N2,... fraction of the model to offload to each GPU, comma-separated list of
|
||||||
|
(env: LLAMA_ARG_TENSOR_SPLIT)
|
||||||
|
-mg, --main-gpu INDEX the GPU to use for the model (with split-mode = none), or for
|
||||||
|
intermediate results and KV (with split-mode = row) (default: 0)
|
||||||
|
-fit, --fit [on|off] whether to adjust unset arguments to fit in device memory ('on' or
|
||||||
|
target margin per device for --fit, comma-separated list of values,
|
||||||
|
single value is broadcast across all devices, default: 1024
|
||||||
|
--check-tensors check model tensor data for invalid values (default: false)
|
||||||
|
--op-offload, --no-op-offload whether to offload host tensor operations to device (default: true)
|
||||||
|
-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...
|
||||||
|
override tensor buffer type for draft model
|
||||||
|
-cmoed, --cpu-moe-draft keep all Mixture of Experts (MoE) weights in the CPU for the draft
|
||||||
|
-ncmoed, --n-cpu-moe-draft N keep the Mixture of Experts (MoE) weights of the first N layers in the
|
||||||
|
-devd, --device-draft <dev1,dev2,..> comma-separated list of devices to use for offloading the draft model
|
||||||
|
use --list-devices to see a list of available devices
|
||||||
5
scripts/q4km_latest.txt
Normal file
5
scripts/q4km_latest.txt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
pure-GPU nommap small | 62.29 | GPU | VRAM:22975 | ub=128 b=512 t=4
|
||||||
|
pure-GPU ts=0.5,0.5 | 63.89 | GPU | VRAM:23002 | ub=128 b=512 t=4
|
||||||
|
tune t=2 | 64.1 | GPU | VRAM:22980 | ub=128 b=512 t=2
|
||||||
|
tune t=6 | 64.18 | GPU | VRAM:22982 | ub=128 b=512 t=6
|
||||||
|
tune t=8 | 63.11 | GPU | VRAM:22980 | ub=128 b=512 t=8
|
||||||
31
scripts/quick_pptest.mjs
Normal file
31
scripts/quick_pptest.mjs
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
// Quick PP+TG speed test
|
||||||
|
const BASE = "http://127.0.0.1:8000";
|
||||||
|
|
||||||
|
/**
 * Sends a single chat-completion request and prints one result line with the
 * prompt-processing (PP) and token-generation (TG) token counts and rates.
 *
 * Rates are taken from the response's `timings.prompt_per_second` and
 * `timings.predicted_per_second` when the server supplies them. Otherwise each
 * rate falls back to tokens divided by the wall time of the whole request,
 * which under-reports both phases because they share that single duration.
 * NOTE(review): `timings` is a llama.cpp server extension to the OpenAI
 * response shape — confirm it is present on the build being measured.
 *
 * A non-2xx response prints an error line instead of a misleading
 * zero-token result; the function still resolves so later tests can run.
 *
 * @param {string} label  Tag printed at the start of the result line.
 * @param {string} prompt User message sent to the model.
 * @param {number} maxTok Value sent as `max_tokens`.
 * @returns {Promise<void>}
 */
async function test(label, prompt, maxTok) {
  const startedAt = Date.now();
  const res = await fetch(`${BASE}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ model: "m", messages: [{ role: "user", content: prompt }], max_tokens: maxTok, temperature: 0 }),
    signal: AbortSignal.timeout(600000),
  });
  if (!res.ok) {
    console.log(`${label} | ERROR: HTTP ${res.status}`);
    return;
  }

  const data = await res.json();
  const dt = (Date.now() - startedAt) / 1000;
  const usage = data.usage ?? {};
  const timings = data.timings ?? {};
  const pp = usage.prompt_tokens || 0;
  const tg = usage.completion_tokens || 0;

  // Prefer the per-phase rate measured by the server; fall back to wall time.
  const formatRate = (serverRate, tokens) => {
    if (Number.isFinite(serverRate) && serverRate > 0) return serverRate.toFixed(1);
    return tokens > 0 && dt > 0 ? (tokens / dt).toFixed(1) : "?";
  };
  const ppSpeed = formatRate(timings.prompt_per_second, pp);
  const tgSpeed = formatRate(timings.predicted_per_second, tg);

  console.log(`${label} | PP:${pp}tok ${ppSpeed}t/s | TG:${tg}tok ${tgSpeed}t/s | ${dt.toFixed(1)}s`);
}
|
||||||
|
|
||||||
|
const short = "Count 1 to 20.";
|
||||||
|
const long = "x".repeat(3000) + " Summarize above in 3 words.";
|
||||||
|
const code = Array(200).fill("function foo(x) { return x * 2 + Math.random(); }").join("\n") + "\n\nRefactor above to arrow functions. Show first 5 lines.";
|
||||||
|
|
||||||
|
await test("warmup", short, 20);
|
||||||
|
await test("SHORT", short, 200);
|
||||||
|
await test("3K-PP", long, 100);
|
||||||
|
await test("10K-CODE", code, 100);
|
||||||
|
await test("TG-500", short, 500);
|
||||||
|
console.log("DONE");
|
||||||
345
scripts/qwen_fullgpu_challenge.mjs
Normal file
345
scripts/qwen_fullgpu_challenge.mjs
Normal file
@@ -0,0 +1,345 @@
|
|||||||
|
/**
|
||||||
|
* Qwen3.5 Full-GPU Challenge — VRAM 극한 최적화 벤치마크
|
||||||
|
* =====================================================
|
||||||
|
* 목표: Qwen3.5-35B-A3B를 24GB 듀얼 3060에 100% GPU로 올리기
|
||||||
|
*
|
||||||
|
* 테스트 모델:
|
||||||
|
* 1. UD-IQ4_NL (16.6 GB) — 확실히 올라감, 기준선
|
||||||
|
* 2. MXFP4_MOE (20.1 GB) — 도전! VRAM 극한 최적화
|
||||||
|
* 3. Q4_K_M (20.5 GB) — 대조군 (n-cpu-moe=5)
|
||||||
|
*
|
||||||
|
* VRAM 절감 전략:
|
||||||
|
* A. 배치 최소화: -ub 64 -b 256 (computation buffer 축소)
|
||||||
|
* B. split-mode row (GPU간 더 균등한 분배)
|
||||||
|
* C. tensor-split 수동 밸런싱
|
||||||
|
* D. no-mmap (메모리 관리 최적화)
|
||||||
|
* E. defrag-thold (KV 캐시 파편화 방지)
|
||||||
|
*
|
||||||
|
* Run: node scripts/qwen_fullgpu_challenge.mjs
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { spawn, execSync } from "child_process";
|
||||||
|
import { writeFileSync, existsSync, statSync } from "fs";
|
||||||
|
|
||||||
|
const BASE_URL = "http://127.0.0.1:8000";
|
||||||
|
const LLAMA = String.raw`llama_bin_run\llama-server.exe`;
|
||||||
|
const CTX = 262144;
|
||||||
|
const RUNS = 3;
|
||||||
|
const TOKENS = 200;
|
||||||
|
const BOOT_TIMEOUT = 300_000;
|
||||||
|
|
||||||
|
const MODELS = [
|
||||||
|
{
|
||||||
|
name: "Qwen3.5 UD-IQ4_NL",
|
||||||
|
path: String.raw`models\Qwen3.5-35B-A3B-UD-IQ4_NL.gguf`,
|
||||||
|
sizeGB: 16.6,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Qwen3.5 MXFP4_MOE",
|
||||||
|
path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
|
||||||
|
sizeGB: 20.11,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Qwen3.5 Q4_K_M",
|
||||||
|
path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
|
||||||
|
sizeGB: 20.5,
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const ALL = [];
|
||||||
|
let proc = null;
|
||||||
|
const log = (m) => console.log(`[${new Date().toLocaleTimeString("ko-KR",{hour12:false})}] ${m}`);
|
||||||
|
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
|
||||||
|
|
||||||
|
/**
 * Best-effort teardown of any running llama-server.
 *
 * Sends SIGKILL to the tracked child (if one is recorded in `proc`) and clears
 * the reference, then runs `taskkill` against every `llama-server.exe`, and
 * finally sleeps 5 seconds before resolving. Failures of either kill step are
 * intentionally ignored: the process may already be gone.
 *
 * @returns {Promise<void>}
 */
async function kill() {
  const tracked = proc;
  if (tracked) {
    try {
      tracked.kill("SIGKILL");
    } catch {
      // Ignored on purpose: the child may have exited already.
    }
    proc = null;
  }

  try {
    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
  } catch {
    // Ignored on purpose: taskkill fails when no matching process exists.
  }

  await sleep(5000);
}
|
||||||
|
|
||||||
|
/**
 * Parses `index, memory.used, memory.total` CSV rows (no header, no units).
 *
 * Rows whose first three fields are not all integers — blank lines, or values
 * that do not parse as numbers — are dropped so they cannot poison sums
 * computed from the result with NaN.
 *
 * @param {string} csv Raw CSV text, `\n` or `\r\n` separated.
 * @returns {{gpu: number, used: number, total: number}[]}
 */
function parseVramCsv(csv) {
  return csv
    .split(/\r?\n/)
    .map((line) => line.split(",").map((field) => Number.parseInt(field, 10)))
    .filter((fields) => fields.length >= 3 && fields.slice(0, 3).every((n) => Number.isFinite(n)))
    .map(([gpu, used, total]) => ({ gpu, used, total }));
}

/**
 * Queries `nvidia-smi` for per-GPU memory usage.
 *
 * @returns {{gpu: number, used: number, total: number}[]} One record per GPU
 *   row that parsed cleanly; an empty array when the command fails or exceeds
 *   the 5 second timeout.
 */
function vram() {
  try {
    const csv = execSync(
      "nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits",
      { encoding: "utf-8", timeout: 5000 },
    );
    return parseVramCsv(csv);
  } catch {
    // Deliberate best effort: callers treat "no data" as an empty list.
    return [];
  }
}
|
||||||
|
|
||||||
|
/**
 * Spawns llama-server for one benchmark configuration and records the child
 * in the module-level `proc`.
 *
 * The child's stdout/stderr are piped but nothing in this script reads them,
 * so both streams are put into flowing mode here; an unread pipe eventually
 * fills and blocks the child on its next write. An `'error'` listener is also
 * attached so that a failed spawn (for example a missing executable) is
 * logged instead of crashing the process as an unhandled event.
 *
 * @param {string} modelPath Path passed to `--model`.
 * @param {object} p Tunables. Read fields: `ctk`, `ctv`, `ub`, `b`, `t`,
 *   `cpuMoe`, `nCpuMoe`, `splitMode`, `tensorSplit`, `noMmap`, `defragThold`,
 *   `noKvOffload`. Falsy values fall back to the defaults shown below.
 * @returns {import('node:child_process').ChildProcess} The spawned child.
 */
function startServer(modelPath, p) {
  const threads = String(p.t || 4);
  const args = [
    "--model", modelPath, "-ngl", "999",
    "-c", String(CTX), "-np", "1", "-fa", "on",
    "--cache-type-k", p.ctk || "q4_0",
    "--cache-type-v", p.ctv || "q4_0",
    "-ub", String(p.ub || 512), "-b", String(p.b || 2048),
    "-t", threads, "-tb", threads,
    "--prio", "3", "--poll", "50", "--mlock",
    "--port", "8000", "--host", "0.0.0.0",
  ];

  // GPU offload strategy: --cpu-moe takes precedence over --n-cpu-moe.
  if (p.cpuMoe) args.push("--cpu-moe");
  else if (p.nCpuMoe) args.push("--n-cpu-moe", String(p.nCpuMoe));

  // VRAM saving options
  if (p.splitMode) args.push("--split-mode", p.splitMode);
  if (p.tensorSplit) args.push("--tensor-split", p.tensorSplit);
  if (p.noMmap) args.push("--no-mmap");
  if (p.defragThold) args.push("--defrag-thold", String(p.defragThold));
  if (p.noKvOffload) args.push("--no-kv-offload");

  const cmdStr = args.join(" ");
  log(` CMD: ...${cmdStr.slice(-80)}`);

  const child = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });
  child.on("error", (err) => log(` ✗ llama-server spawn error: ${err.message}`));
  // Streams can be missing when the spawn itself failed, hence the `?.`.
  child.stdout?.resume();
  child.stderr?.resume();

  proc = child;
  return child;
}
|
||||||
|
|
||||||
|
/**
 * Polls `GET /health` every 3 seconds until the server reports
 * `{ status: "ok" }`, the tracked child process is found dead, or `timeout`
 * elapses.
 *
 * Checking the child first avoids waiting out the whole timeout for a server
 * that has already exited or never spawned.
 *
 * @param {number} [timeout=BOOT_TIMEOUT] Maximum wait in milliseconds.
 * @returns {Promise<{ok: boolean, boot: number}>} `boot` is the elapsed time
 *   in seconds (or `timeout / 1000` when the deadline was reached).
 */
async function waitReady(timeout = BOOT_TIMEOUT) {
  const t0 = Date.now();
  const elapsedSec = () => (Date.now() - t0) / 1000;

  while (Date.now() - t0 < timeout) {
    // A child with an exit/signal code has terminated; an undefined pid means
    // it never spawned. Either way no health check can succeed.
    const child = proc;
    if (child && (child.exitCode != null || child.signalCode != null || child.pid === undefined)) {
      return { ok: false, boot: elapsedSec() };
    }

    try {
      const r = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
      const d = await r.json();
      if (d.status === "ok") return { ok: true, boot: elapsedSec() };
    } catch {
      // Expected while the server is still starting: refused, timed out, or
      // a non-JSON body. Keep polling.
    }
    await sleep(3000);
  }
  return { ok: false, boot: timeout / 1000 };
}
|
||||||
|
|
||||||
|
/**
 * Runs one fixed-prompt chat completion and measures generation throughput.
 *
 * Throughput is completion tokens divided by the wall time of the whole
 * request, so it includes prompt processing and HTTP overhead.
 *
 * @param {number} [n=TOKENS] Value sent as `max_tokens`.
 * @returns {Promise<{tps: number, ct: number, dt: number}>} Tokens per second,
 *   completion token count, and elapsed seconds.
 * @throws {Error} When the server answers with a non-2xx status or the
 *   response reports no completion tokens. Previously such a response was
 *   returned as 0 t/s and silently pulled the averages down.
 */
async function bench(n = TOKENS) {
  const t0 = Date.now();
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "m",
      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
      max_tokens: n, temperature: 0,
    }),
    signal: AbortSignal.timeout(600_000),
  });
  if (!res.ok) {
    throw new Error(`bench request failed: HTTP ${res.status}`);
  }

  const d = await res.json();
  const dt = (Date.now() - t0) / 1000;
  const ct = d.usage?.completion_tokens || 0;
  if (ct === 0) {
    throw new Error("bench response reported no completion tokens");
  }
  return { tps: ct / dt, ct, dt };
}
|
||||||
|
|
||||||
|
/**
 * Benchmarks one server configuration for one model.
 *
 * Sequence: stop any running server, start a new one with `params`, wait for
 * it to become healthy, snapshot VRAM usage, run one warm-up request, run
 * `RUNS` timed requests, stop the server, then append the result to `ALL` and
 * rewrite `scripts/qwen_fullgpu_results.json`.
 *
 * @param {{name: string, path: string}} model Model descriptor.
 * @param {string} label Human-readable name of this configuration.
 * @param {object} params Tunables forwarded to `startServer`.
 * @returns {Promise<object|null>} The recorded result, or `null` when the
 *   server did not become ready or every timed run failed.
 */
async function testConfig(model, label, params) {
  await kill();
  log(` [${label}] Starting...`);
  startServer(model.path, params);

  const readiness = await waitReady();
  if (!readiness.ok) {
    log(` [${label}] ✗ FAILED (timeout ${BOOT_TIMEOUT/1000}s)`);
    await kill();
    return null;
  }
  const boot = readiness.boot;

  // VRAM snapshot taken right after boot, before any request is served.
  const gpus = vram();
  let totalUsed = 0;
  const perGpu = [];
  for (const g of gpus) {
    totalUsed += g.used;
    perGpu.push(`GPU${g.gpu}:${g.used}/${g.total}`);
  }
  log(` [${label}] ✓ Boot:${boot.toFixed(0)}s | VRAM: ${perGpu.join(" | ")} (total: ${totalUsed} MiB)`);

  // Warm-up request; its outcome is deliberately ignored.
  try {
    await bench(20);
  } catch {}

  const speeds = [];
  let run = 0;
  while (run < RUNS) {
    run += 1;
    try {
      const sample = await bench();
      speeds.push(sample.tps);
      log(` Run${run}: ${sample.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Run${run}: ERR ${e.message}`);
    }
  }
  await kill();

  if (speeds.length === 0) return null;

  let sum = 0;
  for (const s of speeds) sum += s;
  const avg = sum / speeds.length;
  const best = Math.max(...speeds);
  log(` [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);

  const res = {
    model: model.name,
    label,
    avg_tps: Number(avg.toFixed(2)),
    best_tps: Number(best.toFixed(2)),
    boot: Number(boot.toFixed(1)),
    vram_total: totalUsed,
    vram: gpus,
    params: { ...params, ngl: 999, ctk: params.ctk || "q4_0", ctv: params.ctv || "q4_0" },
    gpu_only: !(params.cpuMoe || params.nCpuMoe),
  };
  ALL.push(res);
  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
  return res;
}
|
||||||
|
|
||||||
|
// ─── Test Strategies ───────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Runs the full strategy sweep for one model and returns its best result.
 *
 * Phases:
 *  1. Six pure-GPU strategies with different batch / mmap / split settings.
 *  2. An `n-cpu-moe=5` fallback when no pure-GPU strategy produced a result.
 *  3. When a pure-GPU result exists: thread, batch and KV-cache-type sweeps
 *     around the best parameters found in phase 1.
 *  4. A 5-run re-measurement of the overall best configuration, recorded in
 *     `ALL` and `scripts/qwen_fullgpu_results.json` under the label "FINAL".
 *
 * @param {{name: string, path: string, sizeGB: number}} model
 * @returns {Promise<object|null>} The "FINAL" record when the re-measurement
 *   produced at least one sample, otherwise the best sweep result, or `null`
 *   when the model file is missing or every configuration failed.
 */
async function testModel(model) {
  const FINAL_RUNS = 5;

  log(`\n${"#".repeat(65)}`);
  log(` ${model.name} (${model.sizeGB} GB)`);
  if (!existsSync(model.path)) { log(" ✗ File not found!"); return null; }
  log(`${"#".repeat(65)}`);

  let best = null;
  const update = (r) => { if (r && (!best || r.avg_tps > best.avg_tps)) best = r; };

  // ── Phase 1: pure-GPU strategies ──
  const strategies = [
    { banner: "Strategy 1: Pure GPU (default)", label: "pure-GPU default", params: { t: 4, ub: 512, b: 2048 } },
    { banner: "Strategy 2: Pure GPU, minimal batch", label: "pure-GPU minbatch", params: { t: 4, ub: 64, b: 256 } },
    { banner: "Strategy 3: Pure GPU + no-mmap + small batch", label: "pure-GPU nommap small", params: { t: 4, ub: 128, b: 512, noMmap: true } },
    { banner: "Strategy 4: Pure GPU + split-mode row", label: "pure-GPU row-split", params: { t: 4, ub: 128, b: 512, splitMode: "row" } },
    { banner: "Strategy 5: Pure GPU + tensor-split 0.5,0.5", label: "pure-GPU ts=0.5,0.5", params: { t: 4, ub: 128, b: 512, tensorSplit: "0.5,0.5" } },
    { banner: "Strategy 6: Pure GPU ALL tricks", label: "pure-GPU all-tricks", params: { t: 4, ub: 64, b: 256, noMmap: true, defragThold: 0.1 } },
  ];
  for (const { banner, label, params } of strategies) {
    log(`\n ── ${banner} ──`);
    update(await testConfig(model, label, params));
  }

  // ── Phase 2: fallback with the first 5 layers' MoE weights on the CPU ──
  if (!best || !best.gpu_only) {
    log(`\n ── Fallback: n-cpu-moe=5 ──`);
    update(await testConfig(model, "n-cpu-moe=5 baseline", {
      t: 4, ub: 256, b: 1024, nCpuMoe: 5
    }));
  }

  // ── Phase 3: fine-tune around the best pure-GPU parameters ──
  if (best && best.gpu_only) {
    log(`\n ── Pure GPU succeeded! Fine-tuning... ──`);
    // Captured once: every sweep varies one knob relative to this baseline,
    // even if an earlier sweep step has already replaced `best`.
    const bp = best.params;

    // Thread sweep
    for (const t of [2, 6, 8]) {
      if (t === bp.t) continue;
      update(await testConfig(model, `tune t=${t}`, { ...bp, t }));
    }

    // Batch sweep
    for (const [ub, b] of [[256, 1024], [512, 2048], [256, 2048]]) {
      if (ub === bp.ub && b === bp.b) continue;
      update(await testConfig(model, `tune ub=${ub} b=${b}`, { ...bp, ub, b }));
    }

    // KV cache upgrade (extra VRAM available?)
    for (const [ctk, ctv] of [["q8_0", "q8_0"], ["f16", "f16"]]) {
      update(await testConfig(model, `tune kv=${ctk}/${ctv}`, { ...bp, ctk, ctv }));
    }
  }

  if (!best) return null;

  // ── Phase 4: final verification ──
  log(`\n ── Final verification (${FINAL_RUNS} runs) ──`);
  await kill();
  startServer(model.path, best.params);
  const { ok, boot } = await waitReady();
  if (!ok) {
    await kill();
    return best;
  }

  const v = vram();
  try { await bench(20); } catch {} // warmup, outcome ignored

  const finals = [];
  for (let i = 0; i < FINAL_RUNS; i++) {
    try {
      const r = await bench();
      finals.push(r.tps);
      log(` Final ${i+1}: ${r.tps.toFixed(2)} t/s`);
    } catch (e) {
      log(` Final ${i+1}: ERR ${e.message}`);
    }
  }
  // Single teardown for every outcome of the final runs.
  await kill();

  if (finals.length === 0) return best;

  const avg = finals.reduce((a, b) => a + b, 0) / finals.length;
  const bst = Math.max(...finals);
  log(` ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${bst.toFixed(2)} t/s`);
  const final = {
    model: model.name, label: "FINAL",
    avg_tps: +avg.toFixed(2), best_tps: +bst.toFixed(2),
    boot: +boot.toFixed(1), vram_total: v.reduce((a, g) => a + g.used, 0),
    vram: v, params: best.params, gpu_only: best.gpu_only,
  };
  ALL.push(final);
  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
  return final;
}
|
||||||
|
|
||||||
|
// ─── Main ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Lists the optional server flags that a parameter set enables.
 *
 * Covers every optional switch `startServer` can pass, so the summary fully
 * describes a configuration. The first four keep their original order; the
 * previously unreported ones are appended after them.
 *
 * @param {object} p Parameter set of a result record.
 * @returns {string} Space-separated flag descriptions (may be empty).
 */
function describeFlags(p) {
  const flags = [];
  if (p.splitMode) flags.push(`split=${p.splitMode}`);
  if (p.tensorSplit) flags.push(`ts=${p.tensorSplit}`);
  if (p.noMmap) flags.push("no-mmap");
  if (p.nCpuMoe) flags.push(`n-cpu-moe=${p.nCpuMoe}`);
  if (p.cpuMoe) flags.push("cpu-moe");
  if (p.defragThold) flags.push(`defrag-thold=${p.defragThold}`);
  if (p.noKvOffload) flags.push("no-kv-offload");
  return flags.join(" ");
}

/**
 * Builds the plain-text ranking report.
 *
 * @param {object[]} winners Per-model best results, already sorted best-first.
 * @param {number} configCount Number of recorded configurations.
 * @param {string} elapsedMin Total run time in minutes, preformatted.
 * @param {string} stamp Timestamp shown in the title line.
 * @returns {string} Report text with `\n` line separators.
 */
function formatSummary(winners, configCount, elapsedMin, stamp) {
  const lines = [
    `Qwen3.5 Full-GPU Challenge — ${stamp}`,
    `2x RTX 3060 12GB | 256K Context | ${configCount} configs | ${elapsedMin} min`,
    "", "=".repeat(55), " RANKING", "=".repeat(55),
  ];

  winners.forEach((w, i) => {
    const p = w.params;
    const gpu = w.gpu_only ? "★ FULL GPU" : "⚠ CPU offload";
    lines.push("", ` #${i+1}: ${w.model} [${gpu}]`);
    lines.push(` AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`);
    lines.push(` VRAM: ${w.vram_total} MiB total`);
    lines.push(` t=${p.t} ub=${p.ub} b=${p.b} kv=${p.ctk}/${p.ctv} ${describeFlags(p)}`);
  });

  if (winners.length > 0) {
    const c = winners[0];
    lines.push("", "=".repeat(55));
    lines.push(` ★ CHAMPION: ${c.model} — ${c.avg_tps} t/s [${c.gpu_only?"FULL GPU":"CPU offload"}]`);
    lines.push("=".repeat(55));
  }
  return lines.join("\n");
}

/**
 * Entry point: benchmarks every model in `MODELS`, prints a ranking, and
 * writes `scripts/qwen_fullgpu_summary.txt` and
 * `scripts/qwen_fullgpu_results.json`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const t0 = Date.now();
  log("=".repeat(65));
  log(" QWEN3.5 FULL-GPU CHALLENGE — 70 t/s TARGET");
  log(" 2x RTX 3060 (24GB) | 256K Context");
  log(" " + new Date().toISOString());
  log("=".repeat(65));
  for (const g of vram()) {
    log(` GPU${g.gpu}: ${g.used}/${g.total} MiB`);
  }

  // Models run one after another: they all share the same server port.
  const winners = [];
  for (const model of MODELS) {
    const w = await testModel(model);
    if (w) winners.push(w);
  }

  // ─── Summary ──────────────────────────────────────────────
  const elapsed = ((Date.now() - t0) / 60000).toFixed(1);
  winners.sort((a, b) => b.avg_tps - a.avg_tps);

  const summary = formatSummary(winners, ALL.length, elapsed, new Date().toISOString());
  console.log("\n" + summary);
  writeFileSync("scripts/qwen_fullgpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
  log(`\n Saved: qwen_fullgpu_results.json + qwen_fullgpu_summary.txt`);
  log(" DONE!");
  await kill();
}
|
||||||
|
|
||||||
|
main().catch(e => { console.error("FATAL:", e); process.exit(1); });
|
||||||
834
scripts/qwen_fullgpu_results.json
Normal file
834
scripts/qwen_fullgpu_results.json
Normal file
@@ -0,0 +1,834 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 UD-IQ4_NL",
|
||||||
|
"label": "pure-GPU minbatch",
|
||||||
|
"avg_tps": 65.11,
|
||||||
|
"best_tps": 65.49,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 19177,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 10039,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 9138,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 64,
|
||||||
|
"b": 256,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 UD-IQ4_NL",
|
||||||
|
"label": "pure-GPU nommap small",
|
||||||
|
"avg_tps": 65.01,
|
||||||
|
"best_tps": 65.36,
|
||||||
|
"boot": 6,
|
||||||
|
"vram_total": 19672,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 10342,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 9330,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"noMmap": true,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 UD-IQ4_NL",
|
||||||
|
"label": "pure-GPU row-split",
|
||||||
|
"avg_tps": 13.65,
|
||||||
|
"best_tps": 14.82,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 19427,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 10311,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 9116,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"splitMode": "row",
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 UD-IQ4_NL",
|
||||||
|
"label": "pure-GPU ts=0.5,0.5",
|
||||||
|
"avg_tps": 64.92,
|
||||||
|
"best_tps": 65.23,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 19664,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 10334,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 9330,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"tensorSplit": "0.5,0.5",
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 UD-IQ4_NL",
|
||||||
|
"label": "pure-GPU all-tricks",
|
||||||
|
"avg_tps": 64.72,
|
||||||
|
"best_tps": 64.89,
|
||||||
|
"boot": 6,
|
||||||
|
"vram_total": 19171,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 10033,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 9138,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 64,
|
||||||
|
"b": 256,
|
||||||
|
"noMmap": true,
|
||||||
|
"defragThold": 0.1,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 UD-IQ4_NL",
|
||||||
|
"label": "tune t=2",
|
||||||
|
"avg_tps": 64.87,
|
||||||
|
"best_tps": 65.13,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 19170,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 10032,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 9138,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 2,
|
||||||
|
"ub": 64,
|
||||||
|
"b": 256,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 UD-IQ4_NL",
|
||||||
|
"label": "tune t=6",
|
||||||
|
"avg_tps": 64.88,
|
||||||
|
"best_tps": 65.17,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 19168,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 10030,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 9138,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 6,
|
||||||
|
"ub": 64,
|
||||||
|
"b": 256,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 UD-IQ4_NL",
|
||||||
|
"label": "tune t=8",
|
||||||
|
"avg_tps": 64.5,
|
||||||
|
"best_tps": 64.77,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 19168,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 10030,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 9138,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 8,
|
||||||
|
"ub": 64,
|
||||||
|
"b": 256,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 UD-IQ4_NL",
|
||||||
|
"label": "tune ub=256 b=1024",
|
||||||
|
"avg_tps": 64.73,
|
||||||
|
"best_tps": 64.98,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 20640,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 10928,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 9712,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 256,
|
||||||
|
"b": 1024,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 UD-IQ4_NL",
|
||||||
|
"label": "tune ub=256 b=2048",
|
||||||
|
"avg_tps": 63.69,
|
||||||
|
"best_tps": 64.94,
|
||||||
|
"boot": 12,
|
||||||
|
"vram_total": 20614,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 10902,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 9712,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 256,
|
||||||
|
"b": 2048,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 UD-IQ4_NL",
|
||||||
|
"label": "tune kv=q8_0/q8_0",
|
||||||
|
"avg_tps": 64.78,
|
||||||
|
"best_tps": 65.08,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 20422,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 10644,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 9778,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 64,
|
||||||
|
"b": 256,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q8_0",
|
||||||
|
"ctv": "q8_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 UD-IQ4_NL",
|
||||||
|
"label": "tune kv=f16/f16",
|
||||||
|
"avg_tps": 65.53,
|
||||||
|
"best_tps": 65.81,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 22812,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 11846,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10966,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 64,
|
||||||
|
"b": 256,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "f16",
|
||||||
|
"ctv": "f16"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 UD-IQ4_NL",
|
||||||
|
"label": "FINAL",
|
||||||
|
"avg_tps": 66.31,
|
||||||
|
"best_tps": 66.53,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 22811,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 11845,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10966,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 64,
|
||||||
|
"b": 256,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "f16",
|
||||||
|
"ctv": "f16"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 MXFP4_MOE",
|
||||||
|
"label": "pure-GPU minbatch",
|
||||||
|
"avg_tps": 63.06,
|
||||||
|
"best_tps": 64.16,
|
||||||
|
"boot": 12,
|
||||||
|
"vram_total": 22747,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 11895,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10852,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 64,
|
||||||
|
"b": 256,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 MXFP4_MOE",
|
||||||
|
"label": "pure-GPU nommap small",
|
||||||
|
"avg_tps": 63.75,
|
||||||
|
"best_tps": 63.98,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 22579,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 11797,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10782,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"noMmap": true,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 MXFP4_MOE",
|
||||||
|
"label": "pure-GPU ts=0.5,0.5",
|
||||||
|
"avg_tps": 62.88,
|
||||||
|
"best_tps": 63.9,
|
||||||
|
"boot": 12,
|
||||||
|
"vram_total": 22578,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 11796,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10782,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"tensorSplit": "0.5,0.5",
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 MXFP4_MOE",
|
||||||
|
"label": "pure-GPU all-tricks",
|
||||||
|
"avg_tps": 62.55,
|
||||||
|
"best_tps": 63.71,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 22743,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 11891,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10852,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 64,
|
||||||
|
"b": 256,
|
||||||
|
"noMmap": true,
|
||||||
|
"defragThold": 0.1,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 MXFP4_MOE",
|
||||||
|
"label": "tune t=2",
|
||||||
|
"avg_tps": 63.07,
|
||||||
|
"best_tps": 64.08,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 22601,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 11819,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10782,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 2,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"noMmap": true,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 MXFP4_MOE",
|
||||||
|
"label": "tune t=6",
|
||||||
|
"avg_tps": 63.58,
|
||||||
|
"best_tps": 64.04,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 22583,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 11801,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10782,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 6,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"noMmap": true,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 MXFP4_MOE",
|
||||||
|
"label": "tune t=8",
|
||||||
|
"avg_tps": 62.92,
|
||||||
|
"best_tps": 63.73,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 22536,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 11754,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10782,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 8,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"noMmap": true,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 MXFP4_MOE",
|
||||||
|
"label": "tune ub=256 b=1024",
|
||||||
|
"avg_tps": 62.76,
|
||||||
|
"best_tps": 63.86,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 22874,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 11968,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10906,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 256,
|
||||||
|
"b": 1024,
|
||||||
|
"noMmap": true,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 MXFP4_MOE",
|
||||||
|
"label": "tune ub=256 b=2048",
|
||||||
|
"avg_tps": 62.74,
|
||||||
|
"best_tps": 63.9,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 22912,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 12006,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10906,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 256,
|
||||||
|
"b": 2048,
|
||||||
|
"noMmap": true,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 MXFP4_MOE",
|
||||||
|
"label": "FINAL",
|
||||||
|
"avg_tps": 63.71,
|
||||||
|
"best_tps": 64.39,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 22566,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 11784,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10782,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"noMmap": true,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 Q4_K_M",
|
||||||
|
"label": "pure-GPU nommap small",
|
||||||
|
"avg_tps": 62.29,
|
||||||
|
"best_tps": 63.03,
|
||||||
|
"boot": 9,
|
||||||
|
"vram_total": 22975,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 12007,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10968,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"noMmap": true,
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 Q4_K_M",
|
||||||
|
"label": "pure-GPU ts=0.5,0.5",
|
||||||
|
"avg_tps": 63.89,
|
||||||
|
"best_tps": 64.91,
|
||||||
|
"boot": 12,
|
||||||
|
"vram_total": 23002,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 12034,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10968,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 4,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"tensorSplit": "0.5,0.5",
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 Q4_K_M",
|
||||||
|
"label": "tune t=2",
|
||||||
|
"avg_tps": 64.1,
|
||||||
|
"best_tps": 64.54,
|
||||||
|
"boot": 12,
|
||||||
|
"vram_total": 22980,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 12012,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10968,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 2,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"tensorSplit": "0.5,0.5",
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 Q4_K_M",
|
||||||
|
"label": "tune t=6",
|
||||||
|
"avg_tps": 64.18,
|
||||||
|
"best_tps": 64.72,
|
||||||
|
"boot": 12,
|
||||||
|
"vram_total": 22982,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 12014,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10968,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 6,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"tensorSplit": "0.5,0.5",
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "Qwen3.5 Q4_K_M",
|
||||||
|
"label": "tune t=8",
|
||||||
|
"avg_tps": 63.11,
|
||||||
|
"best_tps": 64.02,
|
||||||
|
"boot": 12,
|
||||||
|
"vram_total": 22980,
|
||||||
|
"vram": [
|
||||||
|
{
|
||||||
|
"gpu": 0,
|
||||||
|
"used": 12012,
|
||||||
|
"total": 12288
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"gpu": 1,
|
||||||
|
"used": 10968,
|
||||||
|
"total": 12288
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"t": 8,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"tensorSplit": "0.5,0.5",
|
||||||
|
"ngl": 999,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0"
|
||||||
|
},
|
||||||
|
"gpu_only": true
|
||||||
|
}
|
||||||
|
]
|
||||||
12
scripts/qwen_intermediate.csv
Normal file
12
scripts/qwen_intermediate.csv
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
model,label,avg,best,mode,vram,t,ub,b,kv,split,mmap
|
||||||
|
UD-IQ4_NL,pure-GPU minbatch,65.11,65.49,GPU,19177,t4,ub64,b256,q4_0/q4_0,,
|
||||||
|
UD-IQ4_NL,pure-GPU nommap small,65.01,65.36,GPU,19672,t4,ub128,b512,q4_0/q4_0,,nommap
|
||||||
|
UD-IQ4_NL,pure-GPU row-split,13.65,14.82,GPU,19427,t4,ub128,b512,q4_0/q4_0,row,
|
||||||
|
UD-IQ4_NL,pure-GPU ts=0.5,0.5,64.92,65.23,GPU,19664,t4,ub128,b512,q4_0/q4_0,,
|
||||||
|
UD-IQ4_NL,pure-GPU all-tricks,64.72,64.89,GPU,19171,t4,ub64,b256,q4_0/q4_0,,nommap
|
||||||
|
UD-IQ4_NL,tune t=2,64.87,65.13,GPU,19170,t2,ub64,b256,q4_0/q4_0,,
|
||||||
|
UD-IQ4_NL,tune t=6,64.88,65.17,GPU,19168,t6,ub64,b256,q4_0/q4_0,,
|
||||||
|
UD-IQ4_NL,tune t=8,64.5,64.77,GPU,19168,t8,ub64,b256,q4_0/q4_0,,
|
||||||
|
UD-IQ4_NL,tune ub=256 b=1024,64.73,64.98,GPU,20640,t4,ub256,b1024,q4_0/q4_0,,
|
||||||
|
UD-IQ4_NL,tune ub=256 b=2048,63.69,64.94,GPU,20614,t4,ub256,b2048,q4_0/q4_0,,
|
||||||
|
UD-IQ4_NL,tune kv=q8_0/q8_0,64.78,65.08,GPU,20422,t4,ub64,b256,q8_0/q8_0,,
|
||||||
|
24
scripts/qwen_latest.txt
Normal file
24
scripts/qwen_latest.txt
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
UD-IQ4_NL | pure-GPU minbatch | 65.11 | GPU | 19177
|
||||||
|
UD-IQ4_NL | pure-GPU nommap small | 65.01 | GPU | 19672
|
||||||
|
UD-IQ4_NL | pure-GPU row-split | 13.65 | GPU | 19427
|
||||||
|
UD-IQ4_NL | pure-GPU ts=0.5,0.5 | 64.92 | GPU | 19664
|
||||||
|
UD-IQ4_NL | pure-GPU all-tricks | 64.72 | GPU | 19171
|
||||||
|
UD-IQ4_NL | tune t=2 | 64.87 | GPU | 19170
|
||||||
|
UD-IQ4_NL | tune t=6 | 64.88 | GPU | 19168
|
||||||
|
UD-IQ4_NL | tune t=8 | 64.5 | GPU | 19168
|
||||||
|
UD-IQ4_NL | tune ub=256 b=1024 | 64.73 | GPU | 20640
|
||||||
|
UD-IQ4_NL | tune ub=256 b=2048 | 63.69 | GPU | 20614
|
||||||
|
UD-IQ4_NL | tune kv=q8_0/q8_0 | 64.78 | GPU | 20422
|
||||||
|
UD-IQ4_NL | tune kv=f16/f16 | 65.53 | GPU | 22812
|
||||||
|
UD-IQ4_NL | FINAL | 66.31 | GPU | 22811
|
||||||
|
MXFP4_MOE | pure-GPU minbatch | 63.06 | GPU | 22747
|
||||||
|
MXFP4_MOE | pure-GPU nommap small | 63.75 | GPU | 22579
|
||||||
|
MXFP4_MOE | pure-GPU ts=0.5,0.5 | 62.88 | GPU | 22578
|
||||||
|
MXFP4_MOE | pure-GPU all-tricks | 62.55 | GPU | 22743
|
||||||
|
MXFP4_MOE | tune t=2 | 63.07 | GPU | 22601
|
||||||
|
MXFP4_MOE | tune t=6 | 63.58 | GPU | 22583
|
||||||
|
MXFP4_MOE | tune t=8 | 62.92 | GPU | 22536
|
||||||
|
MXFP4_MOE | tune ub=256 b=1024 | 62.76 | GPU | 22874
|
||||||
|
MXFP4_MOE | tune ub=256 b=2048 | 62.74 | GPU | 22912
|
||||||
|
MXFP4_MOE | FINAL | 63.71 | GPU | 22566
|
||||||
|
Q4_K_M | pure-GPU nommap small | 62.29 | GPU | 22975
|
||||||
BIN
scripts/test_20ts.txt
Normal file
BIN
scripts/test_20ts.txt
Normal file
Binary file not shown.
64
scripts/tune_122b_20ts.mjs
Normal file
64
scripts/tune_122b_20ts.mjs
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
import { exec, spawn } from 'child_process';
|
||||||
|
|
||||||
|
/**
 * Returns a promise that settles after roughly `ms` milliseconds.
 *
 * @param {number} ms - Time to wait, in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function delay(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||||
|
|
||||||
|
/**
 * Boots llama-server with the given arguments, waits for its health endpoint,
 * runs the quick prompt-processing benchmark script, then tears the server down.
 *
 * @param {string} modelArgs - Whitespace-separated llama-server CLI arguments.
 * @param {Object<string, string>} envVars - Extra environment variables layered over process.env.
 * @param {string} name - Label used in log output for this run.
 * @returns {Promise<{success: boolean}>} `success` is false when the server never
 *   answered the health check with HTTP 200.
 */
async function runTest(modelArgs, envVars, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Args: ${modelArgs}`);

  // Best-effort kill: the callback fires whether or not a server was running,
  // so this never rejects.
  const killServer = () =>
    new Promise((done) => exec('taskkill /F /IM llama-server.exe', () => done()));

  await killServer();
  await delay(2000);

  const env = { ...process.env, ...envVars };
  // Split on any whitespace and drop empties so a doubled space does not
  // become an empty argv entry.
  const argv = modelArgs.split(/\s+/).filter(Boolean);
  const server = spawn('llama_bin_run\\llama-server.exe', argv, {
    detached: true,
    stdio: 'ignore',
    env,
  });

  // Without a listener, a failed spawn (e.g. missing executable) raises an
  // unhandled 'error' event and takes the whole script down.
  let spawnFailed = false;
  server.on('error', (err) => {
    spawnFailed = true;
    console.log(`[${name}] Could not start llama-server: ${err.message}`);
  });

  let ready = false;
  for (let attempt = 0; attempt < 40 && !spawnFailed; attempt++) {
    try {
      // fetch() has no `timeout` option; an abort signal is what actually
      // bounds each probe to 2 seconds.
      const res = await fetch('http://127.0.0.1:8000/health', {
        signal: AbortSignal.timeout(2000),
      });
      if (res.status === 200) {
        ready = true;
        break;
      }
    } catch (e) {
      // Expected while the server is still starting: refused or timed out.
    }
    await delay(3000);
  }

  if (!ready) {
    console.log(`[${name}] FAILED TO BOOT`);
    await killServer();
    return { success: false };
  }

  console.log(`[${name}] Server Ready! Running benchmark...`);
  const { err, stdout, stderr } = await new Promise((done) => {
    exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => done({ err, stdout, stderr }));
  });
  console.log(stdout || stderr);
  if (err) {
    // Surface the failure instead of silently ignoring it; the return shape
    // is kept as-is for existing callers.
    console.log(`[${name}] Benchmark process failed (exit code ${err.code})`);
  }

  await killServer();
  return { success: true };
}
|
||||||
|
|
||||||
|
/**
 * Runs the 122B benchmark sweep: the same base arguments with layer split
 * mode, varying only --n-cpu-moe, one server run at a time.
 */
async function main() {
  const baseLineArgs = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 8192 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 256 -b 1024 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // Run order is significant: 32 first, then the more aggressive 28, then 36
  // as the conservative fallback. All three always run, regardless of
  // whether the earlier ones booted.
  const cpuMoeSweep = [32, 28, 36];
  for (const cpuMoe of cpuMoeSweep) {
    await runTest(
      `${baseLineArgs} --split-mode layer --n-cpu-moe ${cpuMoe}`,
      {},
      `122B: n-cpu-moe ${cpuMoe} + sm layer`,
    );
  }

  console.log("\nALL TESTS COMPLETED");
}
|
||||||
|
|
||||||
|
main();
|
||||||
72
scripts/tune_exact.mjs
Normal file
72
scripts/tune_exact.mjs
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
import { exec, spawn } from 'child_process';
|
||||||
|
|
||||||
|
/**
 * Returns a promise that settles after roughly `ms` milliseconds.
 *
 * @param {number} ms - Time to wait, in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function delay(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||||
|
|
||||||
|
/**
 * Boots llama-server with the given arguments and environment, waits for its
 * health endpoint, runs the quick speed-test script, then tears the server down.
 *
 * @param {string} modelArgs - Whitespace-separated llama-server CLI arguments.
 * @param {Object<string, string>} envVars - Extra environment variables layered over process.env.
 * @param {string} name - Label used in log output for this run.
 * @returns {Promise<{success: boolean}>} `success` is false when the server never
 *   answered the health check with HTTP 200.
 */
async function runTest(modelArgs, envVars, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Env: ${JSON.stringify(envVars)}`);
  console.log(`Args: ${modelArgs}`);

  // Best-effort kill: the callback fires whether or not a server was running,
  // so this never rejects.
  const killServer = () =>
    new Promise((done) => exec('taskkill /F /IM llama-server.exe', () => done()));

  await killServer();
  await delay(2000);

  const env = { ...process.env, ...envVars };
  // Split on any whitespace and drop empties so a doubled space does not
  // become an empty argv entry.
  const argv = modelArgs.split(/\s+/).filter(Boolean);
  const server = spawn('llama_bin_run\\llama-server.exe', argv, {
    detached: true,
    stdio: 'ignore',
    env,
  });

  // Without a listener, a failed spawn (e.g. missing executable) raises an
  // unhandled 'error' event and takes the whole script down.
  let spawnFailed = false;
  server.on('error', (err) => {
    spawnFailed = true;
    console.log(`[${name}] Could not start llama-server: ${err.message}`);
  });

  let ready = false;
  for (let attempt = 0; attempt < 40 && !spawnFailed; attempt++) {
    try {
      // fetch() has no `timeout` option; an abort signal is what actually
      // bounds each probe to 2 seconds.
      const res = await fetch('http://127.0.0.1:8000/health', {
        signal: AbortSignal.timeout(2000),
      });
      if (res.status === 200) {
        ready = true;
        break;
      }
    } catch (e) {
      // Expected while the server is still starting: refused or timed out.
    }
    await delay(3000);
  }

  if (!ready) {
    console.log(`[${name}] FAILED TO BOOT`);
    await killServer();
    return { success: false };
  }

  console.log(`[${name}] Server Ready! Running speed test...`);
  const { err, stdout, stderr } = await new Promise((done) => {
    exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => done({ err, stdout, stderr }));
  });
  console.log(stdout || stderr);
  if (err) {
    // Surface the failure instead of silently ignoring it; the return shape
    // is kept as-is for existing callers.
    console.log(`[${name}] Benchmark process failed (exit code ${err.code})`);
  }

  await killServer();
  return { success: true };
}
|
||||||
|
|
||||||
|
/**
 * Runs three benchmark configurations back to back: one 122B-A10B run and
 * two 35B-A3B runs with GGML_CUDA_FORCE_MMQ set.
 */
async function main() {
  // 122B-A10B: -ngl 999 and no --n-cpu-moe flag at all.
  const args122B = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // 35B-A3B base configuration; the author's stated goal for these runs is 70 t/s.
  const args35B = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 128 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;

  // Same 35B configuration with -t/-tb raised from 6 to 12.
  const args35B_t12 = args35B.replace("-t 6 -tb 6", "-t 12 -tb 12");

  const runPlan = [
    { args: args122B, env: {}, label: "122B-A10B: Pure GPU (NVIDIA Shared Memory Fallback)" },
    { args: args35B, env: { GGML_CUDA_FORCE_MMQ: "1" }, label: "35B-A3B: Force MMQ = 1" },
    { args: args35B_t12, env: { GGML_CUDA_FORCE_MMQ: "1" }, label: "35B-A3B: Threads 12 + MMQ" },
  ];

  // Strictly sequential: every run shares port 8000 and kills any prior server.
  for (const { args, env, label } of runPlan) {
    await runTest(args, env, label);
  }

  console.log("\nALL TESTS COMPLETED");
}
|
||||||
|
|
||||||
|
main();
|
||||||
84
scripts/tune_models.mjs
Normal file
84
scripts/tune_models.mjs
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
import { exec, spawn } from 'child_process';
|
||||||
|
|
||||||
|
/**
 * Returns a promise that settles after roughly `ms` milliseconds.
 *
 * @param {number} ms - Time to wait, in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function delay(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||||
|
|
||||||
|
/**
 * Boots llama-server with the given arguments, waits for its health endpoint,
 * runs the quick benchmark script, parses two throughput figures from its
 * output, then tears the server down.
 *
 * @param {string} modelArgs - Whitespace-separated llama-server CLI arguments.
 * @param {string} name - Label used in log output for this run.
 * @returns {Promise<{success: boolean, tg?: number, pp?: number}>} `success` is
 *   false when the server never answered the health check with HTTP 200.
 *   Otherwise `tg` is the TG rate parsed from the "TG-500" line and `pp` the
 *   PP rate parsed from the "10K-CODE" line; each is 0 when its line is not
 *   found in the benchmark output.
 */
async function runTest(modelArgs, name) {
  console.log(`\n===========================================`);
  console.log(`Testing: ${name}`);
  console.log(`Args: ${modelArgs}`);

  // Best-effort kill: the callback fires whether or not a server was running,
  // so this never rejects.
  const killServer = () =>
    new Promise((done) => exec('taskkill /F /IM llama-server.exe', () => done()));

  await killServer();
  await delay(2000);

  // Split on any whitespace and drop empties so a doubled space does not
  // become an empty argv entry.
  const argv = modelArgs.split(/\s+/).filter(Boolean);
  const server = spawn('llama_bin_run\\llama-server.exe', argv, {
    detached: true,
    stdio: 'ignore',
  });

  // Without a listener, a failed spawn (e.g. missing executable) raises an
  // unhandled 'error' event and takes the whole script down.
  let spawnFailed = false;
  server.on('error', (err) => {
    spawnFailed = true;
    console.log(`[${name}] Could not start llama-server: ${err.message}`);
  });

  let ready = false;
  for (let attempt = 0; attempt < 40 && !spawnFailed; attempt++) {
    try {
      // fetch() has no `timeout` option; an abort signal is what actually
      // bounds each probe to 2 seconds.
      const res = await fetch('http://127.0.0.1:8000/health', {
        signal: AbortSignal.timeout(2000),
      });
      if (res.status === 200) {
        ready = true;
        break;
      }
    } catch (e) {
      // Expected while the server is still starting: refused or timed out.
    }
    await delay(3000);
  }

  if (!ready) {
    console.log(`[${name}] FAILED TO BOOT (Likely OOM)`);
    await killServer();
    return { success: false };
  }

  console.log(`[${name}] Server Ready! Running benchmark...`);
  const { err, stdout, stderr } = await new Promise((done) => {
    exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => done({ err, stdout, stderr }));
  });
  console.log(stdout || stderr);
  if (err) {
    // Surface the failure instead of silently ignoring it; the return shape
    // is kept as-is for existing callers.
    console.log(`[${name}] Benchmark process failed (exit code ${err.code})`);
  }

  // Pull the TG rate from the TG-500 line and the PP rate from the 10K-CODE line.
  const tgMatch = stdout.match(/TG-500 \| PP:\d+tok \d+\.\dt\/s \| TG:\d+tok (\d+\.\d+)t\/s/);
  const ppMatch = stdout.match(/10K-CODE \| PP:\d+tok (\d+\.\d+)t\/s/);
  const tg = tgMatch ? parseFloat(tgMatch[1]) : 0;
  const pp = ppMatch ? parseFloat(ppMatch[1]) : 0;

  await killServer();
  return { success: true, tg, pp };
}
|
||||||
|
|
||||||
|
/**
 * Runs two --n-cpu-moe sweeps in sequence: the 35B model at ub=512 with
 * values 1, 2 and 4, then the 122B model with values 38, 30 and 22.
 */
async function main() {
  // 35B sweep: the base arguments omit -ub so each run can append "-ub 512"
  // together with its --n-cpu-moe value. The author's stated target is 70 t/s.
  const args35B_base = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
  for (const moe of [1, 2, 4]) {
    await runTest(`${args35B_base} -ub 512 --n-cpu-moe ${moe}`, `Qwen-35B: moe=${moe}, ub=512`);
  }

  // 122B sweep: look for a workable --n-cpu-moe, stepping down 38 -> 30 -> 22.
  const args122B_base = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
  for (const moe of [38, 30, 22]) {
    await runTest(`${args122B_base} --n-cpu-moe ${moe}`, `Qwen-122B: moe=${moe}`);
  }

  console.log("Tuning finished.");
}
|
||||||
|
|
||||||
|
main();
|
||||||
591
scripts/tune_results_gemma4_256k.json
Normal file
591
scripts/tune_results_gemma4_256k.json
Normal file
@@ -0,0 +1,591 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"ngl": 22,
|
||||||
|
"t": 8,
|
||||||
|
"tb": 8,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.22049935826915,
|
||||||
|
"best_tps": 25.971732307567606,
|
||||||
|
"vram_used": 11953,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ngl=22"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 8,
|
||||||
|
"tb": 8,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.805518952775174,
|
||||||
|
"best_tps": 25.953896683689454,
|
||||||
|
"vram_used": 11942,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ngl=21"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 20,
|
||||||
|
"t": 8,
|
||||||
|
"tb": 8,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 23.537353232262834,
|
||||||
|
"best_tps": 24.32109262330477,
|
||||||
|
"vram_used": 11972,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ngl=20"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 2,
|
||||||
|
"tb": 2,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 20.167581352340264,
|
||||||
|
"best_tps": 20.701192443418005,
|
||||||
|
"vram_used": 11969,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=2 | tb=2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.689104997668554,
|
||||||
|
"best_tps": 26.328541632880874,
|
||||||
|
"vram_used": 11975,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=4 | tb=4"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 8,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.294470150452725,
|
||||||
|
"best_tps": 26.541251363470614,
|
||||||
|
"vram_used": 11984,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=4 | tb=8"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 6,
|
||||||
|
"tb": 6,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.307859289404675,
|
||||||
|
"best_tps": 26.292208504543133,
|
||||||
|
"vram_used": 11984,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=6 | tb=6"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 6,
|
||||||
|
"tb": 8,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.230599923243314,
|
||||||
|
"best_tps": 26.366065850165732,
|
||||||
|
"vram_used": 11983,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=6 | tb=8"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 8,
|
||||||
|
"tb": 8,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.113108026759278,
|
||||||
|
"best_tps": 26.123872617669583,
|
||||||
|
"vram_used": 11984,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=8 | tb=8"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 8,
|
||||||
|
"tb": 12,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.05545428888364,
|
||||||
|
"best_tps": 26.06377500079152,
|
||||||
|
"vram_used": 11983,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=8 | tb=12"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 10,
|
||||||
|
"tb": 10,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 24.706926870374986,
|
||||||
|
"best_tps": 25.03033604251865,
|
||||||
|
"vram_used": 11984,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=10 | tb=10"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 12,
|
||||||
|
"tb": 12,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 22.468055564001904,
|
||||||
|
"best_tps": 23.425983251691825,
|
||||||
|
"vram_used": 11989,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=12 | tb=12"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 16,
|
||||||
|
"tb": 16,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 21.176973905195442,
|
||||||
|
"best_tps": 21.482429642395456,
|
||||||
|
"vram_used": 12021,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=16 | tb=16"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.545748810106186,
|
||||||
|
"best_tps": 26.344547829145817,
|
||||||
|
"vram_used": 11986,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=128 | b=512"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 256,
|
||||||
|
"b": 1024,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.503875205368377,
|
||||||
|
"best_tps": 26.393548686102108,
|
||||||
|
"vram_used": 11981,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=256 | b=1024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 256,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.46500292415627,
|
||||||
|
"best_tps": 26.2726382287537,
|
||||||
|
"vram_used": 11981,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=256 | b=2048"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 1024,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.50982209452459,
|
||||||
|
"best_tps": 26.292282671074723,
|
||||||
|
"vram_used": 12020,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=512 | b=1024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.39646674356899,
|
||||||
|
"best_tps": 26.28106356028714,
|
||||||
|
"vram_used": 12020,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=512 | b=2048"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 4096,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.471945933724726,
|
||||||
|
"best_tps": 26.268422652962233,
|
||||||
|
"vram_used": 12021,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=512 | b=4096"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.722119623856702,
|
||||||
|
"best_tps": 26.497264927416403,
|
||||||
|
"vram_used": 12019,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=1024 | b=2048"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 4096,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.665819493145943,
|
||||||
|
"best_tps": 26.301163428594148,
|
||||||
|
"vram_used": 12019,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=1024 | b=4096"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.464915272955533,
|
||||||
|
"best_tps": 26.40667691713752,
|
||||||
|
"vram_used": 12019,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ctk=q4_0 | ctv=q4_0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q8_0",
|
||||||
|
"ctv": "q8_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.489715990281564,
|
||||||
|
"best_tps": 25.884133821146627,
|
||||||
|
"vram_used": 12011,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ctk=q8_0 | ctv=q8_0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q8_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 22.751034104721082,
|
||||||
|
"best_tps": 22.91250972782414,
|
||||||
|
"vram_used": 12017,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ctk=q4_0 | ctv=q8_0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "f16",
|
||||||
|
"ctv": "f16",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 24.745831571513975,
|
||||||
|
"best_tps": 25.53926086004382,
|
||||||
|
"vram_used": 11985,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ctk=f16 | ctv=f16"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q8_0",
|
||||||
|
"ctv": "q8_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.21575943186602,
|
||||||
|
"best_tps": 25.796865637378264,
|
||||||
|
"vram_used": 12013,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "mmap=True | poll=50 | prio=2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q8_0",
|
||||||
|
"ctv": "q8_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": false,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 23.88172807693179,
|
||||||
|
"best_tps": 24.803356430302312,
|
||||||
|
"vram_used": 12016,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "mmap=False | poll=50 | prio=2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q8_0",
|
||||||
|
"ctv": "q8_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 0,
|
||||||
|
"avg_tps": 25.041321207287698,
|
||||||
|
"best_tps": 25.88479834694897,
|
||||||
|
"vram_used": 12017,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "mmap=True | poll=0 | prio=2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q8_0",
|
||||||
|
"ctv": "q8_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 100,
|
||||||
|
"avg_tps": 25.27990666474703,
|
||||||
|
"best_tps": 26.034861156695197,
|
||||||
|
"vram_used": 12017,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "mmap=True | poll=100 | prio=2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q8_0",
|
||||||
|
"ctv": "q8_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 3,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.360977804679788,
|
||||||
|
"best_tps": 26.0705565191107,
|
||||||
|
"vram_used": 12022,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "mmap=True | poll=50 | prio=3"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 21,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q8_0",
|
||||||
|
"ctv": "q8_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": false,
|
||||||
|
"prio": 3,
|
||||||
|
"poll": 0,
|
||||||
|
"avg_tps": 24.156893523381967,
|
||||||
|
"best_tps": 24.840307911026144,
|
||||||
|
"vram_used": 12021,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "mmap=False | poll=0 | prio=3"
|
||||||
|
}
|
||||||
|
]
|
||||||
201
scripts/tune_results_gemma4_ncpumoe.json
Normal file
201
scripts/tune_results_gemma4_ncpumoe.json
Normal file
@@ -0,0 +1,201 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"label": "ncpumoe=0",
|
||||||
|
"ncpumoe": 0,
|
||||||
|
"avg": 15.396949591766335,
|
||||||
|
"best": 20.220093309883133,
|
||||||
|
"vram": 12011,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "ncpumoe=5",
|
||||||
|
"ncpumoe": 5,
|
||||||
|
"avg": 4.853957926040404,
|
||||||
|
"best": 4.9029479257524216,
|
||||||
|
"vram": 11945,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "ncpumoe=10",
|
||||||
|
"ncpumoe": 10,
|
||||||
|
"avg": 20.64137159193706,
|
||||||
|
"best": 26.474940718957154,
|
||||||
|
"vram": 12020,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "ncpumoe=15",
|
||||||
|
"ncpumoe": 15,
|
||||||
|
"avg": 13.424368433101165,
|
||||||
|
"best": 13.698684361880598,
|
||||||
|
"vram": 12018,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "ncpumoe=20",
|
||||||
|
"ncpumoe": 20,
|
||||||
|
"avg": 10.338449574838693,
|
||||||
|
"best": 13.495275411319872,
|
||||||
|
"vram": 11530,
|
||||||
|
"nommap": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "ncpumoe=25",
|
||||||
|
"ncpumoe": 25,
|
||||||
|
"avg": 12.920348175328435,
|
||||||
|
"best": 12.99923042323437,
|
||||||
|
"vram": 11625,
|
||||||
|
"nommap": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "ncpumoe=30",
|
||||||
|
"ncpumoe": 30,
|
||||||
|
"avg": 13.251690836275145,
|
||||||
|
"best": 13.253697466971921,
|
||||||
|
"vram": 9064,
|
||||||
|
"nommap": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "ncpumoe=7",
|
||||||
|
"ncpumoe": 7,
|
||||||
|
"avg": 16.31796299658782,
|
||||||
|
"best": 23.160760806218782,
|
||||||
|
"vram": 11994,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "ncpumoe=9",
|
||||||
|
"ncpumoe": 9,
|
||||||
|
"avg": 7.469651892205037,
|
||||||
|
"best": 10.875064047449284,
|
||||||
|
"vram": 11941,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "ncpumoe=11",
|
||||||
|
"ncpumoe": 11,
|
||||||
|
"avg": 14.814740144776437,
|
||||||
|
"best": 15.199641279675724,
|
||||||
|
"vram": 11984,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "ncpumoe=13",
|
||||||
|
"ncpumoe": 13,
|
||||||
|
"avg": 14.183175252947136,
|
||||||
|
"best": 14.427257794639086,
|
||||||
|
"vram": 12003,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "t=2",
|
||||||
|
"ncpumoe": 10,
|
||||||
|
"avg": 28.551811207068425,
|
||||||
|
"best": 28.688565545389164,
|
||||||
|
"vram": 11968,
|
||||||
|
"t": 2,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "t=4",
|
||||||
|
"ncpumoe": 10,
|
||||||
|
"avg": 30.8619310622166,
|
||||||
|
"best": 31.17677746690393,
|
||||||
|
"vram": 11972,
|
||||||
|
"t": 4,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "t=6",
|
||||||
|
"ncpumoe": 10,
|
||||||
|
"avg": 30.578454576249854,
|
||||||
|
"best": 30.971792125516313,
|
||||||
|
"vram": 11983,
|
||||||
|
"t": 6,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "t=8",
|
||||||
|
"ncpumoe": 10,
|
||||||
|
"avg": 30.529393512116172,
|
||||||
|
"best": 30.954830478128166,
|
||||||
|
"vram": 11982,
|
||||||
|
"t": 8,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "t=10",
|
||||||
|
"ncpumoe": 10,
|
||||||
|
"avg": 30.773041112229503,
|
||||||
|
"best": 31.00899077264753,
|
||||||
|
"vram": 11972,
|
||||||
|
"t": 10,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "ub=256,b=1024",
|
||||||
|
"ncpumoe": 10,
|
||||||
|
"avg": 30.49319055490045,
|
||||||
|
"best": 30.691055921541377,
|
||||||
|
"vram": 11993,
|
||||||
|
"t": 4,
|
||||||
|
"ub": 256,
|
||||||
|
"b": 1024,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "ub=512,b=2048",
|
||||||
|
"ncpumoe": 10,
|
||||||
|
"avg": 30.923573731331718,
|
||||||
|
"best": 31.902272031660825,
|
||||||
|
"vram": 11995,
|
||||||
|
"t": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "ub=512,b=4096",
|
||||||
|
"ncpumoe": 10,
|
||||||
|
"avg": 30.723820162954862,
|
||||||
|
"best": 31.065476003548053,
|
||||||
|
"vram": 11966,
|
||||||
|
"t": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 4096,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "ub=1024,b=2048",
|
||||||
|
"ncpumoe": 10,
|
||||||
|
"avg": 30.489888387093156,
|
||||||
|
"best": 30.982074615885946,
|
||||||
|
"vram": 11964,
|
||||||
|
"t": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 2048,
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "kv=q4_0",
|
||||||
|
"ncpumoe": 10,
|
||||||
|
"avg": 30.63156129571348,
|
||||||
|
"best": 31.088674795634944,
|
||||||
|
"vram": 11988,
|
||||||
|
"t": 4,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"nommap": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "kv=q8_0",
|
||||||
|
"ncpumoe": 10,
|
||||||
|
"avg": 29.6114222576863,
|
||||||
|
"best": 30.580427895917573,
|
||||||
|
"vram": 11980,
|
||||||
|
"t": 4,
|
||||||
|
"ctk": "q8_0",
|
||||||
|
"ctv": "q8_0",
|
||||||
|
"nommap": false
|
||||||
|
}
|
||||||
|
]
|
||||||
522
scripts/tune_results_qwen35b_256k.json
Normal file
522
scripts/tune_results_qwen35b_256k.json
Normal file
@@ -0,0 +1,522 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 6,
|
||||||
|
"tb": 6,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 26.169961832638464,
|
||||||
|
"best_tps": 26.533887071573073,
|
||||||
|
"vram_used": 4994,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "cpu_moe=True"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": false,
|
||||||
|
"t": 6,
|
||||||
|
"tb": 6,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 11.065030380022206,
|
||||||
|
"best_tps": 11.083028272674314,
|
||||||
|
"vram_used": 11949,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "cpu_moe=False"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 2,
|
||||||
|
"tb": 2,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 21.473286428302767,
|
||||||
|
"best_tps": 21.746637577851104,
|
||||||
|
"vram_used": 4994,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=2 | tb=2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 26.552358479030676,
|
||||||
|
"best_tps": 27.314237654089343,
|
||||||
|
"vram_used": 4991,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=4 | tb=4"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 6,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 26.347068485327956,
|
||||||
|
"best_tps": 26.87924726131441,
|
||||||
|
"vram_used": 4993,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=4 | tb=6"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 6,
|
||||||
|
"tb": 6,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 26.331286039513458,
|
||||||
|
"best_tps": 26.81427299445741,
|
||||||
|
"vram_used": 5001,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=6 | tb=6"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 6,
|
||||||
|
"tb": 8,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 26.391160513711274,
|
||||||
|
"best_tps": 26.735573238878736,
|
||||||
|
"vram_used": 5001,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=6 | tb=8"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 8,
|
||||||
|
"tb": 8,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 25.32340666199144,
|
||||||
|
"best_tps": 25.87949347494079,
|
||||||
|
"vram_used": 4995,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=8 | tb=8"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 10,
|
||||||
|
"tb": 10,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 23.752277317850815,
|
||||||
|
"best_tps": 24.98242898809555,
|
||||||
|
"vram_used": 5011,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=10 | tb=10"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 12,
|
||||||
|
"tb": 12,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 21.75032196383532,
|
||||||
|
"best_tps": 23.18963400077116,
|
||||||
|
"vram_used": 5104,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "t=12 | tb=12"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 128,
|
||||||
|
"b": 512,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 13.27593572827031,
|
||||||
|
"best_tps": 13.337407402920235,
|
||||||
|
"vram_used": 4391,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=128 | b=512"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 256,
|
||||||
|
"b": 1024,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 26.638687188233188,
|
||||||
|
"best_tps": 27.361082444434413,
|
||||||
|
"vram_used": 4495,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=256 | b=1024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 256,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 26.29069503392877,
|
||||||
|
"best_tps": 26.63368832924803,
|
||||||
|
"vram_used": 4490,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=256 | b=2048"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 1024,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 26.518331831441134,
|
||||||
|
"best_tps": 26.972021321271527,
|
||||||
|
"vram_used": 4984,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=512 | b=1024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 26.401541912276873,
|
||||||
|
"best_tps": 26.46530849236633,
|
||||||
|
"vram_used": 4990,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=512 | b=2048"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 4096,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 26.892711500590455,
|
||||||
|
"best_tps": 26.892711500590455,
|
||||||
|
"vram_used": 5006,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=512 | b=4096"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 2048,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 12.600209659679201,
|
||||||
|
"best_tps": 12.759356030807627,
|
||||||
|
"vram_used": 12020,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=1024 | b=2048"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 1024,
|
||||||
|
"b": 4096,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 6.023959262370547,
|
||||||
|
"best_tps": 8.284882268188156,
|
||||||
|
"vram_used": 11931,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ub=1024 | b=4096"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 4096,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 12.96992950856374,
|
||||||
|
"best_tps": 12.96992950856374,
|
||||||
|
"vram_used": 12022,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ctk=q4_0 | ctv=q4_0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 4096,
|
||||||
|
"ctk": "q8_0",
|
||||||
|
"ctv": "q8_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 11.420078920350697,
|
||||||
|
"best_tps": 13.524778595767653,
|
||||||
|
"vram_used": 12030,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ctk=q8_0 | ctv=q8_0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 4096,
|
||||||
|
"ctk": "f16",
|
||||||
|
"ctv": "f16",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 11.978106511464183,
|
||||||
|
"best_tps": 13.729190013094977,
|
||||||
|
"vram_used": 11518,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "ctk=f16 | ctv=f16"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 4096,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 16.164278220452957,
|
||||||
|
"best_tps": 22.645890325274323,
|
||||||
|
"vram_used": 11623,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "mmap=True | poll=50 | prio=2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 4096,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": false,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 16.555542780023114,
|
||||||
|
"best_tps": 23.333815015033892,
|
||||||
|
"vram_used": 9062,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "mmap=False | poll=50 | prio=2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 4096,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 0,
|
||||||
|
"avg_tps": 13.003619379106329,
|
||||||
|
"best_tps": 13.031594557134142,
|
||||||
|
"vram_used": 11994,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "mmap=True | poll=0 | prio=2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 4096,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 2,
|
||||||
|
"poll": 100,
|
||||||
|
"avg_tps": 5.7762452690702935,
|
||||||
|
"best_tps": 5.795560155803046,
|
||||||
|
"vram_used": 11953,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "mmap=True | poll=100 | prio=2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ngl": 999,
|
||||||
|
"cpu_moe": true,
|
||||||
|
"t": 4,
|
||||||
|
"tb": 4,
|
||||||
|
"ub": 512,
|
||||||
|
"b": 4096,
|
||||||
|
"ctk": "q4_0",
|
||||||
|
"ctv": "q4_0",
|
||||||
|
"fa": "on",
|
||||||
|
"mlock": true,
|
||||||
|
"mmap": true,
|
||||||
|
"prio": 3,
|
||||||
|
"poll": 50,
|
||||||
|
"avg_tps": 12.59406799687573,
|
||||||
|
"best_tps": 14.966737641114795,
|
||||||
|
"vram_used": 11996,
|
||||||
|
"vram_total": 12288,
|
||||||
|
"label": "mmap=True | poll=50 | prio=3"
|
||||||
|
}
|
||||||
|
]
|
||||||
@@ -1,8 +1,30 @@
|
|||||||
@echo off
|
@echo off
|
||||||
|
chcp 65001 >nul
|
||||||
echo =========================================================
|
echo =========================================================
|
||||||
echo Gemma4 26B-A4B API Server (Tuned for Max Speed)
|
echo Gemma4 26B-A4B API Server (256K Context - Final Optimal)
|
||||||
echo [INFO] Tuning VRAM limit correctly to avoid WDDM swap (-ngl 22)
|
echo [CORE] --n-cpu-moe 10: VRAM 12GB 최적화용 Expert 오프로드
|
||||||
|
echo [TUNED] -t 4 -ub 512: CPU 병목 방지 및 SWA 캐시 최적화
|
||||||
|
echo [PERF] Speed: ~30.9 t/s (1x RTX 3060)
|
||||||
echo =========================================================
|
echo =========================================================
|
||||||
echo.
|
echo.
|
||||||
llama_bin_run\llama-server.exe --model models\gemma-4-26B-A4B-it-Q4_K_M.gguf -ngl 22 -c 4096 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 8 --mlock --prio 2 --port 8000 --host 0.0.0.0
|
|
||||||
|
llama_bin_run\llama-server.exe ^
|
||||||
|
--model models\gemma-4-26B-A4B-it-Q4_K_M.gguf ^
|
||||||
|
-ngl 999 ^
|
||||||
|
--n-cpu-moe 10 ^
|
||||||
|
-c 262144 ^
|
||||||
|
-np 1 ^
|
||||||
|
-fa on ^
|
||||||
|
--cache-type-k q4_0 ^
|
||||||
|
--cache-type-v q4_0 ^
|
||||||
|
-ub 512 ^
|
||||||
|
-b 2048 ^
|
||||||
|
-t 4 ^
|
||||||
|
-tb 4 ^
|
||||||
|
--mlock ^
|
||||||
|
--prio 3 ^
|
||||||
|
--poll 50 ^
|
||||||
|
--port 8000 ^
|
||||||
|
--host 0.0.0.0
|
||||||
|
|
||||||
pause
|
pause
|
||||||
|
|||||||
Reference in New Issue
Block a user