"""Translate the ``description:`` line of GSD skill manifests into Korean.

Every ``gsd-*/SKILL.md`` file below ``skill_dir`` is read, each known English
description is replaced by its Korean counterpart, and the file is rewritten
only when its content actually changed.
"""

import os
import glob
import re

# Root folder holding one sub-directory per skill (machine-specific path).
skill_dir = r"C:\Users\Certes\.gemini\antigravity\skills"

# Exact English description -> Korean replacement.
translations = {
    "Manage parallel workstreams — list, create, switch, status, progress, complete, and resume": "병렬 작업 스트림 관리 — 목록, 생성, 전환, 상태, 진행률, 완료 및 재개",
    "Validate built features through conversational UAT": "대화형 UAT를 통해 구현된 기능 검증",
    "Retroactively audit and fill Nyquist validation gaps for a completed phase": "완료된 단계에 대한 검증 누락 사후 감사 및 보완",
    "Update GSD to latest version with changelog display": "GSD를 최신 버전으로 업데이트하고 변경 사항 표시",
    "Retroactive 6-pillar visual audit of implemented frontend code": "구현된 프론트엔드 코드에 대한 6개 요소 시각적 사후 감사",
    "Generate UI design contract (UI-SPEC.md) for frontend phases": "프론트엔드 단계를 위한 UI 디자인 명세서(UI-SPEC.md) 생성",
    "Manage persistent context threads for cross-session work": "교차 세션 작업을 위한 영구 컨텍스트 스레드 관리",
    "Display project statistics — phases, plans, requirements, git metrics, and timeline": "프로젝트 통계 표시 — 단계, 계획, 요구사항, Git 지표 및 타임라인",
    "Create PR, run review, and prepare for merge after verification passes": "검증 통과 후 PR 생성, 리뷰 실행 및 병합 준비",
    "Configure GSD workflow toggles and model profile": "GSD 워크플로우 옵션 및 모델 프로필 구성",
    "Switch model profile for GSD agents (quality/balanced/budget/inherit)": "GSD 요원의 모델 프로필 전환 (고품질/균형/예산/상속)",
    "Generate a session report with token usage estimates, work summary, and outcomes": "토큰 사용량, 작업 요약 및 결과를 포함한 세션 보고서 생성",
    "Review and promote backlog items to active milestone": "백로그 항목을 검토하고 활성 마일스톤으로 승격",
    "Request cross-AI peer review of phase plans from external AI CLIs": "외부 AI CLI에 단계 계획에 대한 교차 AI 동료 리뷰 요청",
    "Resume work from previous session with full context restoration": "전체 컨텍스트 복원과 함께 이전 세션에서 작업 재개",
    "Research how to implement a phase (standalone - usually use /gsd-plan-phase instead)": "단계를 구현하는 방법 리서치 (단독 실행 - 보통 /gsd-plan-phase 사용)",
    "Remove a GSD workspace and clean up worktrees": "GSD 워크스페이스 제거 및 워크트리 정리",
    "Remove a future phase from roadmap and renumber subsequent phases": "로드맵에서 향후 단계를 제거하고 이후 단계 번호 재지정",
    "Reapply local modifications after a GSD update": "GSD 업데이트 후 로컬 수정 사항 재적용",
    "Execute a quick task with GSD guarantees (atomic commits, state tracking) but skip optional agents": "GSD 보장(원자적 커밋, 상태 추적)을 사용하여 빠른 작업을 실행하되 선택적 요원 생략",
    "Check project progress, show context, and route to next action (execute or plan)": "프로젝트 진행 상황 확인, 컨텍스트 표시 및 다음 작업(실행 또는 계획)으로 라우팅",
    "Generate developer behavioral profile and create Claude-discoverable artifacts": "개발자 행동 프로필을 생성하고 AI가 인지할 수 있는 문서 작성",
    "Create a clean PR branch by filtering out .planning/ commits — ready for code review": ".planning/ 커밋을 필터링하여 깔끔한 PR 브랜치 생성 — 코드 리뷰 준비",
    "Capture a forward-looking idea with trigger conditions — surfaces automatically at the right milestone": "향후 아이디어를 트리거 조건과 함께 캡처 — 적절한 마일스톤에서 자동 표시",
    "Create detailed phase plan (PLAN.md) with verification loop": "검증 루프를 포함한 상세 단계 계획(PLAN.md) 생성",
    "Create phases to close all gaps identified by milestone audit": "마일스톤 감사에서 식별된 모든 격차를 해소하기 위한 단계 생성",
    "Create context handoff when pausing work mid-phase": "작업 중단 시 컨텍스트 인수인계 파일 생성",
    "Zero-friction idea capture. Append, list, or promote notes to todos.": "방해 없는 아이디어 캡처. 메모 추가, 나열 또는 할 일로 승격.",
    "Automatically advance to the next logical step in the GSD workflow": "GSD 워크플로우의 다음 논리적 단계로 자동 진행",
    "Create an isolated workspace with repo copies and independent .planning/": "외부 레포 사본 및 독립적인 .planning/을 갖춘 격리된 워크스페이스 생성",
    "Initialize a new project with deep context gathering and PROJECT.md": "심층 컨텍스트 수집 및 PROJECT.md와 함께 새 프로젝트 초기화",
    "Start a new milestone cycle — update PROJECT.md and route to requirements": "새로운 마일스톤 주기 시작 — PROJECT.md 업데이트 및 요구사항 재정의",
    "Generate a comprehensive project summary from milestone artifacts for team onboarding and review": "팀 온보딩 및 리뷰를 위해 마일스톤 산출물에서 종합적인 프로젝트 요약 생성",
    "Analyze codebase with parallel mapper agents to produce .planning/codebase/ documents": "병렬 매퍼 요원으로 코드베이스를 분석하여 .planning/codebase/ 문서 생성",
    "Interactive command center for managing multiple phases from one terminal": "하나의 터미널에서 여러 단계를 관리하는 대화형 명령 센터",
    "List active GSD workspaces and their status": "활성 GSD 워크스페이스 및 상태 나열",
    "Surface the agent's assumptions about a phase approach before planning": "계획 전 단계적 접근 방식에 대한 요원의 가정을 미리 표시",
    "Join the GSD Discord community": "GSD 디스코드 커뮤니티 참가",
    "Insert urgent work as decimal phase (e.g., 72.1) between existing phases": "기존 단계 사이에 소수점 단계(예: 72.1)로 긴급 작업 삽입",
    "Show available GSD commands and usage guide": "사용 가능한 GSD 명령어 및 사용 가이드 표시",
    "Diagnose planning directory health and optionally repair issues": "계획 디렉토리 상태 진단 및 선택적으로 문제 복구",
    "Post-mortem investigation for failed GSD workflows — analyzes git history, artifacts, and state to diagnose what went wrong": "실패한 GSD 워크플로우에 대한 사후 조사 — git 기록, 문서 및 상태 분석",
    "Execute a trivial task inline — no subagents, no planning overhead": "인라인으로 사소한 작업 실행 — 서브 에이전트 및 계획 오버헤드 없음",
    "Execute all plans in a phase with wave-based parallelization": "웨이브(Wave) 기반 병렬 처리를 사용하여 단계의 모든 계획 실행",
    "Route freeform text to the right GSD command automatically": "자유 형식 텍스트를 적절한 GSD 명령으로 자동 라우팅",
    "Systematic debugging with persistent state across context resets": "컨텍스트가 리셋되어도 상태를 유지하는 체계적인 디버깅",
    "Gather phase context through adaptive questioning before planning. Use --auto to skip interactive questions (the agent picks recommended defaults).": "계획 전 심층 질문을 통해 단계 컨텍스트 수집. 대화형 건너뛰기(--auto) 가능.",
    "Archive completed milestone and prepare for next version": "완료된 마일스톤 보관 및 다음 버전 준비",
    "List pending todos and select one to work on": "보류 중인 할 일 목록 표시 및 작업할 항목 선택",
    "Cross-phase audit of all outstanding UAT and verification items": "모든 미결 UAT 및 검증 항목에 대한 전체 단계 교차 감사",
    "Audit milestone completion against original intent before archiving": "보관 전 원래 의도와 비교하여 마일스톤 달성 여부 감사",
    "Capture idea or task as todo from current conversation context": "현재 대화 컨텍스트에서 아이디어 또는 작업을 할 일로 캡처",
    "Generate tests for a completed phase based on UAT criteria and implementation": "UAT 기준 및 구현을 기반으로 완료된 단계에 대한 테스트 생성",
    "Add phase to end of current milestone in roadmap": "로드맵의 현재 마일스톤 끝에 새 단계 추가",
    "Add an idea to the backlog parking lot (999.x numbering)": "백로그 주차장(999.x 넘버링)에 아이디어 추가",
    "Run all remaining phases autonomously — discuss→plan→execute per phase": "모든 남은 단계를 완전히 자율적으로 실행 (논의→계획→실행 루프)",
    "Archive accumulated phase directories from completed milestones": "완료된 마일스톤에서 쌓인 단계 디렉토리 보관 및 정리",
}

# Compiled once instead of once per file. The trailing part is ``[ \t]*$``
# rather than ``\s*$``: with re.MULTILINE, ``\s*`` may consume the newline that
# ends the description line, which silently deleted a following blank line.
_DESCRIPTION_PATTERNS = [
    (
        re.compile(r"^description:\s*" + re.escape(eng) + r"[ \t]*$", re.MULTILINE),
        f"description: {kor}",
    )
    for eng, kor in translations.items()
]


def translate_descriptions(content):
    """Return *content* with every known English description line translated.

    Args:
        content: Full text of a SKILL.md file.

    Returns:
        The text with each ``description: <English>`` line that exactly matches
        a key of ``translations`` replaced by ``description: <Korean>``. Text
        that matches nothing is returned unchanged.
    """
    for pattern, replacement in _DESCRIPTION_PATTERNS:
        # A callable replacement is inserted literally, so a translation that
        # ever contains a backslash or "\g<...>" is not treated as a template.
        content = pattern.sub(lambda _match, text=replacement: text, content)
    return content


def main(root=skill_dir):
    """Translate every ``gsd-*/SKILL.md`` under *root* in place.

    Args:
        root: Directory that contains the ``gsd-*`` skill folders.

    Returns:
        The number of files whose content changed and was rewritten.
    """
    modified_count = 0

    for filepath in glob.glob(os.path.join(root, "gsd-*", "SKILL.md")):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()

            new_content = translate_descriptions(content)

            # Leave untouched files alone so their timestamps do not change.
            if new_content != content:
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(new_content)
                modified_count += 1
        except (OSError, UnicodeError) as e:
            # Best effort: report the unreadable/unwritable file and continue.
            print(f"Error processing {filepath}: {e}")

    print(f"Successfully translated {modified_count} SKILL.md files.")
    return modified_count


if __name__ == "__main__":
    main()
b/.agent/.agent/skills/ui-ux-pro-max/scripts/core.py deleted file mode 100644 index b7ba227..0000000 --- a/.agent/.agent/skills/ui-ux-pro-max/scripts/core.py +++ /dev/null @@ -1,253 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -UI/UX Pro Max Core - BM25 search engine for UI/UX style guides -""" - -import csv -import re -from pathlib import Path -from math import log -from collections import defaultdict - -# ============ CONFIGURATION ============ -DATA_DIR = Path(__file__).parent.parent / "data" -MAX_RESULTS = 3 - -CSV_CONFIG = { - "style": { - "file": "styles.csv", - "search_cols": ["Style Category", "Keywords", "Best For", "Type", "AI Prompt Keywords"], - "output_cols": ["Style Category", "Type", "Keywords", "Primary Colors", "Effects & Animation", "Best For", "Performance", "Accessibility", "Framework Compatibility", "Complexity", "AI Prompt Keywords", "CSS/Technical Keywords", "Implementation Checklist", "Design System Variables"] - }, - "color": { - "file": "colors.csv", - "search_cols": ["Product Type", "Notes"], - "output_cols": ["Product Type", "Primary (Hex)", "Secondary (Hex)", "CTA (Hex)", "Background (Hex)", "Text (Hex)", "Notes"] - }, - "chart": { - "file": "charts.csv", - "search_cols": ["Data Type", "Keywords", "Best Chart Type", "Accessibility Notes"], - "output_cols": ["Data Type", "Keywords", "Best Chart Type", "Secondary Options", "Color Guidance", "Accessibility Notes", "Library Recommendation", "Interactive Level"] - }, - "landing": { - "file": "landing.csv", - "search_cols": ["Pattern Name", "Keywords", "Conversion Optimization", "Section Order"], - "output_cols": ["Pattern Name", "Keywords", "Section Order", "Primary CTA Placement", "Color Strategy", "Conversion Optimization"] - }, - "product": { - "file": "products.csv", - "search_cols": ["Product Type", "Keywords", "Primary Style Recommendation", "Key Considerations"], - "output_cols": ["Product Type", "Keywords", "Primary Style Recommendation", "Secondary Styles", "Landing 
Page Pattern", "Dashboard Style (if applicable)", "Color Palette Focus"] - }, - "ux": { - "file": "ux-guidelines.csv", - "search_cols": ["Category", "Issue", "Description", "Platform"], - "output_cols": ["Category", "Issue", "Platform", "Description", "Do", "Don't", "Code Example Good", "Code Example Bad", "Severity"] - }, - "typography": { - "file": "typography.csv", - "search_cols": ["Font Pairing Name", "Category", "Mood/Style Keywords", "Best For", "Heading Font", "Body Font"], - "output_cols": ["Font Pairing Name", "Category", "Heading Font", "Body Font", "Mood/Style Keywords", "Best For", "Google Fonts URL", "CSS Import", "Tailwind Config", "Notes"] - }, - "icons": { - "file": "icons.csv", - "search_cols": ["Category", "Icon Name", "Keywords", "Best For"], - "output_cols": ["Category", "Icon Name", "Keywords", "Library", "Import Code", "Usage", "Best For", "Style"] - }, - "react": { - "file": "react-performance.csv", - "search_cols": ["Category", "Issue", "Keywords", "Description"], - "output_cols": ["Category", "Issue", "Platform", "Description", "Do", "Don't", "Code Example Good", "Code Example Bad", "Severity"] - }, - "web": { - "file": "web-interface.csv", - "search_cols": ["Category", "Issue", "Keywords", "Description"], - "output_cols": ["Category", "Issue", "Platform", "Description", "Do", "Don't", "Code Example Good", "Code Example Bad", "Severity"] - } -} - -STACK_CONFIG = { - "html-tailwind": {"file": "stacks/html-tailwind.csv"}, - "react": {"file": "stacks/react.csv"}, - "nextjs": {"file": "stacks/nextjs.csv"}, - "astro": {"file": "stacks/astro.csv"}, - "vue": {"file": "stacks/vue.csv"}, - "nuxtjs": {"file": "stacks/nuxtjs.csv"}, - "nuxt-ui": {"file": "stacks/nuxt-ui.csv"}, - "svelte": {"file": "stacks/svelte.csv"}, - "swiftui": {"file": "stacks/swiftui.csv"}, - "react-native": {"file": "stacks/react-native.csv"}, - "flutter": {"file": "stacks/flutter.csv"}, - "shadcn": {"file": "stacks/shadcn.csv"}, - "jetpack-compose": {"file": 
class BM25:
    """BM25 ranking algorithm for text search.

    Usage: call :meth:`fit` with an iterable of documents, then :meth:`score`
    with a query. ``fit`` may be called repeatedly; each call replaces the
    previous index completely.

    Attributes are plain data: ``corpus`` (tokenized documents),
    ``doc_lengths`` (token count per document), ``avgdl`` (mean token count),
    ``doc_freqs`` (number of documents containing each word), ``idf`` (per-word
    inverse document frequency) and ``N`` (number of documents).
    """

    def __init__(self, k1=1.5, b=0.75):
        """Create an empty index.

        Args:
            k1: Term-frequency saturation parameter used in :meth:`score`.
            b: Document-length normalisation parameter used in :meth:`score`.
        """
        self.k1 = k1
        self.b = b
        self.corpus = []
        self.doc_lengths = []
        self.avgdl = 0
        self.idf = {}
        self.doc_freqs = defaultdict(int)
        self.N = 0

    def tokenize(self, text):
        """Lowercase, split, remove punctuation, filter short words.

        Args:
            text: Any object; it is converted with ``str()`` first.

        Returns:
            List of lowercase tokens longer than two characters.
        """
        text = re.sub(r'[^\w\s]', ' ', str(text).lower())
        return [w for w in text.split() if len(w) > 2]

    def fit(self, documents):
        """Build the BM25 index from *documents*, discarding any earlier index.

        Args:
            documents: Iterable of documents (each is passed to ``tokenize``).
        """
        self.corpus = [self.tokenize(doc) for doc in documents]
        self.N = len(self.corpus)
        self.doc_lengths = [len(doc) for doc in self.corpus]

        # Reset the statistics so that a second fit() does not add its counts
        # on top of the first one (which produced wrong IDF values), and so an
        # empty corpus does not keep stale numbers around.
        self.doc_freqs = defaultdict(int)
        self.idf = {}
        self.avgdl = 0
        if self.N == 0:
            return

        self.avgdl = sum(self.doc_lengths) / self.N

        for doc in self.corpus:
            # dict.fromkeys de-duplicates while keeping first-seen order, so a
            # word counts once per document.
            for word in dict.fromkeys(doc):
                self.doc_freqs[word] += 1

        for word, freq in self.doc_freqs.items():
            self.idf[word] = log((self.N - freq + 0.5) / (freq + 0.5) + 1)

    def score(self, query):
        """Score all documents against *query*.

        Args:
            query: Query text; tokenized with :meth:`tokenize`.

        Returns:
            List of ``(document_index, score)`` tuples for every document,
            sorted by score in descending order. Documents with equal scores
            keep their corpus order. Query tokens that never occurred in the
            fitted corpus contribute nothing.
        """
        # Only tokens present in the index can contribute to a score.
        known_tokens = [t for t in self.tokenize(query) if t in self.idf]
        scores = []

        for idx, doc in enumerate(self.corpus):
            score = 0
            doc_len = self.doc_lengths[idx]

            for token in known_tokens:
                tf = doc.count(token)
                idf = self.idf[token]
                numerator = tf * (self.k1 + 1)
                denominator = tf + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
                score += idf * numerator / denominator

            scores.append((idx, score))

        return sorted(scores, key=lambda x: x[1], reverse=True)
def detect_domain(query):
    """Pick the search domain whose keywords occur most often in *query*.

    Each keyword counts once when it appears anywhere in the lowercased query
    (plain substring test). The domain with the most hits wins; on a tie the
    domain listed first wins, and with no hits at all the result is "style".
    """
    haystack = query.lower()

    domain_keywords = {
        "color": ["color", "palette", "hex", "#", "rgb"],
        "chart": ["chart", "graph", "visualization", "trend", "bar", "pie", "scatter", "heatmap", "funnel"],
        "landing": ["landing", "page", "cta", "conversion", "hero", "testimonial", "pricing", "section"],
        "product": ["saas", "ecommerce", "e-commerce", "fintech", "healthcare", "gaming", "portfolio", "crypto", "dashboard"],
        "style": ["style", "design", "ui", "minimalism", "glassmorphism", "neumorphism", "brutalism", "dark mode", "flat", "aurora", "prompt", "css", "implementation", "variable", "checklist", "tailwind"],
        "ux": ["ux", "usability", "accessibility", "wcag", "touch", "scroll", "animation", "keyboard", "navigation", "mobile"],
        "typography": ["font", "typography", "heading", "serif", "sans"],
        "icons": ["icon", "icons", "lucide", "heroicons", "symbol", "glyph", "pictogram", "svg icon"],
        "react": ["react", "next.js", "nextjs", "suspense", "memo", "usecallback", "useeffect", "rerender", "bundle", "waterfall", "barrel", "dynamic import", "rsc", "server component"],
        "web": ["aria", "focus", "outline", "semantic", "virtualize", "autocomplete", "form", "input type", "preconnect"]
    }

    # Start from the fallback; only a strictly larger hit count replaces the
    # current winner, so the earliest domain keeps a tie.
    winner = "style"
    most_hits = 0
    for name, vocabulary in domain_keywords.items():
        hits = 0
        for term in vocabulary:
            if term in haystack:
                hits += 1
        if hits > most_hits:
            winner = name
            most_hits = hits
    return winner
class DesignSystemGenerator:
    """Generates design system recommendations from aggregated searches.

    The generator loads reasoning rules from ``DATA_DIR / REASONING_FILE`` once
    at construction time, then combines them with per-domain ``search`` results
    in :meth:`generate`.
    """

    def __init__(self):
        self.reasoning_data = self._load_reasoning()

    @staticmethod
    def _cell(row: dict, key: str, default: str = "") -> str:
        """Return ``row[key]``, or *default* when the key is missing or None.

        ``csv.DictReader`` fills the columns of a short row with ``None``, so a
        plain ``row.get(key, default)`` is not enough to guarantee a string.
        """
        value = row.get(key)
        return default if value is None else value

    def _load_reasoning(self) -> list:
        """Load reasoning rules from CSV.

        Returns:
            One dict per CSV row, or an empty list when the file is missing.
        """
        filepath = DATA_DIR / REASONING_FILE
        if not filepath.exists():
            return []
        # newline='' is what the csv module asks for so that newlines embedded
        # in quoted fields are parsed correctly.
        with open(filepath, 'r', encoding='utf-8', newline='') as f:
            return list(csv.DictReader(f))

    def _multi_domain_search(self, query: str, style_priority: list = None,
                             skip_domains: tuple = ()) -> dict:
        """Execute searches across multiple domains.

        Args:
            query: User query passed to every domain search.
            style_priority: Optional style names; the first two non-empty ones
                are appended to the query for the "style" domain only.
            skip_domains: Domains of ``SEARCH_CONFIG`` to leave out, for
                callers that already hold those results.

        Returns:
            Mapping of domain name to the dict returned by ``search``.
        """
        priority_terms = [term for term in (style_priority or []) if term]
        results = {}
        for domain, config in SEARCH_CONFIG.items():
            if domain in skip_domains:
                continue
            domain_query = query
            if domain == "style" and priority_terms:
                # For style, also search with priority keywords
                domain_query = f"{query} {' '.join(priority_terms[:2])}"
            results[domain] = search(domain_query, domain, config["max_results"])
        return results

    def _find_reasoning_rule(self, category: str) -> dict:
        """Find matching reasoning rule for a category.

        Matching is case-insensitive and tried in three passes over all rules:
        exact ``UI_Category`` match, substring match in either direction, then
        any word of ``UI_Category`` (split on spaces, "/" and "-") occurring in
        the category.

        Returns:
            The first matching rule, or ``{}`` when nothing matches or the
            category is empty.
        """
        category_lower = (category or "").lower()
        if not category_lower.strip():
            return {}

        # Rules with a blank UI_Category are ignored: the empty string is a
        # substring of everything, so such a rule used to match any category.
        named_rules = []
        for rule in self.reasoning_data:
            ui_cat = self._cell(rule, "UI_Category").lower()
            if ui_cat.strip():
                named_rules.append((ui_cat, rule))

        # Try exact match first
        for ui_cat, rule in named_rules:
            if ui_cat == category_lower:
                return rule

        # Try partial match
        for ui_cat, rule in named_rules:
            if ui_cat in category_lower or category_lower in ui_cat:
                return rule

        # Try keyword match
        for ui_cat, rule in named_rules:
            keywords = ui_cat.replace("/", " ").replace("-", " ").split()
            if any(kw in category_lower for kw in keywords):
                return rule

        return {}

    def _apply_reasoning(self, category: str, search_results: dict) -> dict:
        """Apply reasoning rules to search results.

        Args:
            category: Product category used to look up a reasoning rule.
            search_results: Accepted for interface compatibility; not read.

        Returns:
            Dict with the keys ``pattern``, ``style_priority`` (list of
            non-empty names), ``color_mood``, ``typography_mood``,
            ``key_effects``, ``anti_patterns``, ``decision_rules`` (dict) and
            ``severity``. Built-in defaults are returned when no rule matches.
        """
        rule = self._find_reasoning_rule(category)

        if not rule:
            return {
                "pattern": "Hero + Features + CTA",
                "style_priority": ["Minimalism", "Flat Design"],
                "color_mood": "Professional",
                "typography_mood": "Clean",
                "key_effects": "Subtle hover transitions",
                "anti_patterns": "",
                "decision_rules": {},
                "severity": "MEDIUM"
            }

        # Parse decision rules JSON; a blank, missing, malformed or non-object
        # value all fall back to an empty dict.
        try:
            decision_rules = json.loads(self._cell(rule, "Decision_Rules") or "{}")
        except (json.JSONDecodeError, TypeError):
            decision_rules = {}
        if not isinstance(decision_rules, dict):
            decision_rules = {}

        # "A + B" -> ["A", "B"]; an empty cell gives [] rather than [""].
        style_priority = [
            name.strip()
            for name in self._cell(rule, "Style_Priority").split("+")
            if name.strip()
        ]

        return {
            "pattern": self._cell(rule, "Recommended_Pattern"),
            "style_priority": style_priority,
            "color_mood": self._cell(rule, "Color_Mood"),
            "typography_mood": self._cell(rule, "Typography_Mood"),
            "key_effects": self._cell(rule, "Key_Effects"),
            "anti_patterns": self._cell(rule, "Anti_Patterns"),
            "decision_rules": decision_rules,
            "severity": self._cell(rule, "Severity", "MEDIUM")
        }

    def _select_best_match(self, results: list, priority_keywords: list) -> dict:
        """Select best matching result based on priority keywords.

        Returns:
            ``{}`` for no results; the first result when there are no usable
            keywords; otherwise the first result whose "Style Category"
            contains (or is contained in) a keyword, falling back to the
            highest keyword score and finally to the first result.
        """
        if not results:
            return {}

        keywords = [kw.lower().strip() for kw in (priority_keywords or []) if kw and kw.strip()]
        if not keywords:
            return results[0]

        # First: try style name match. A result without a style name is
        # skipped, because "" is a substring of every keyword.
        for kw in keywords:
            for result in results:
                style_name = self._cell(result, "Style Category").lower()
                if style_name and (kw in style_name or style_name in kw):
                    return result

        # Second: score by keyword match in the other fields. No style name
        # contains a keyword at this point, so only the "Keywords" column (3)
        # and the remaining fields (1) can score.
        best_result = results[0]
        best_score = 0
        for result in results:
            result_str = str(result).lower()
            keywords_field = self._cell(result, "Keywords").lower()
            score = 0
            for kw in keywords:
                if kw in keywords_field:
                    score += 3
                elif kw in result_str:
                    score += 1
            # Strictly greater keeps the earliest result on ties.
            if score > best_score:
                best_result, best_score = result, score

        return best_result

    def _extract_results(self, search_result: dict) -> list:
        """Extract results list from search result dict."""
        return search_result.get("results", [])

    def generate(self, query: str, project_name: str = None) -> dict:
        """Generate complete design system recommendation.

        Args:
            query: Free-text description of the product.
            project_name: Optional display name; defaults to ``query.upper()``.

        Returns:
            Dict with ``project_name``, ``category``, the nested sections
            ``pattern``, ``style``, ``colors`` and ``typography``, plus
            ``key_effects``, ``anti_patterns``, ``decision_rules`` and
            ``severity``.
        """
        # Step 1: First search product to get category
        product_result = search(query, "product", 1)
        product_results = self._extract_results(product_result)
        category = "General"
        if product_results:
            category = self._cell(product_results[0], "Product Type", "General") or "General"

        # Step 2: Get reasoning rules for this category
        reasoning = self._apply_reasoning(category, {})
        style_priority = reasoning.get("style_priority", [])

        # Step 3: Multi-domain search with style priority hints. The product
        # search from step 1 is reused instead of being executed a second time.
        search_results = self._multi_domain_search(query, style_priority, skip_domains=("product",))
        search_results["product"] = product_result

        # Step 4: Select best matches from each domain using priority
        style_results = self._extract_results(search_results.get("style", {}))
        color_results = self._extract_results(search_results.get("color", {}))
        typography_results = self._extract_results(search_results.get("typography", {}))
        landing_results = self._extract_results(search_results.get("landing", {}))

        best_style = self._select_best_match(style_results, style_priority)
        best_color = color_results[0] if color_results else {}
        best_typography = typography_results[0] if typography_results else {}
        best_landing = landing_results[0] if landing_results else {}

        # Step 5: Build final recommendation
        # Effects from the style search win; the reasoning rule is the fallback
        style_effects = best_style.get("Effects & Animation", "")
        combined_effects = style_effects or reasoning.get("key_effects", "")

        return {
            "project_name": project_name or query.upper(),
            "category": category,
            "pattern": {
                "name": best_landing.get("Pattern Name", reasoning.get("pattern", "Hero + Features + CTA")),
                "sections": best_landing.get("Section Order", "Hero > Features > CTA"),
                "cta_placement": best_landing.get("Primary CTA Placement", "Above fold"),
                "color_strategy": best_landing.get("Color Strategy", ""),
                "conversion": best_landing.get("Conversion Optimization", "")
            },
            "style": {
                "name": best_style.get("Style Category", "Minimalism"),
                "type": best_style.get("Type", "General"),
                "effects": style_effects,
                "keywords": best_style.get("Keywords", ""),
                "best_for": best_style.get("Best For", ""),
                "performance": best_style.get("Performance", ""),
                "accessibility": best_style.get("Accessibility", "")
            },
            "colors": {
                "primary": best_color.get("Primary (Hex)", "#2563EB"),
                "secondary": best_color.get("Secondary (Hex)", "#3B82F6"),
                "cta": best_color.get("CTA (Hex)", "#F97316"),
                "background": best_color.get("Background (Hex)", "#F8FAFC"),
                "text": best_color.get("Text (Hex)", "#1E293B"),
                "notes": best_color.get("Notes", "")
            },
            "typography": {
                "heading": best_typography.get("Heading Font", "Inter"),
                "body": best_typography.get("Body Font", "Inter"),
                "mood": best_typography.get("Mood/Style Keywords", reasoning.get("typography_mood", "")),
                "best_for": best_typography.get("Best For", ""),
                "google_fonts_url": best_typography.get("Google Fonts URL", ""),
                "css_import": best_typography.get("CSS Import", "")
            },
            "key_effects": combined_effects,
            "anti_patterns": reasoning.get("anti_patterns", ""),
            "decision_rules": reasoning.get("decision_rules", {}),
            "severity": reasoning.get("severity", "MEDIUM")
        }
text.split() - lines = [] - current_line = prefix - for word in words: - if len(current_line) + len(word) + 1 <= width - 2: - current_line += (" " if current_line != prefix else "") + word - else: - if current_line != prefix: - lines.append(current_line) - current_line = prefix + word - if current_line != prefix: - lines.append(current_line) - return lines - - # Build sections from pattern - sections = pattern.get("sections", "").split(">") - sections = [s.strip() for s in sections if s.strip()] - - # Build output lines - lines = [] - w = BOX_WIDTH - 1 - - lines.append("+" + "-" * w + "+") - lines.append(f"| TARGET: {project} - RECOMMENDED DESIGN SYSTEM".ljust(BOX_WIDTH) + "|") - lines.append("+" + "-" * w + "+") - lines.append("|" + " " * BOX_WIDTH + "|") - - # Pattern section - lines.append(f"| PATTERN: {pattern.get('name', '')}".ljust(BOX_WIDTH) + "|") - if pattern.get('conversion'): - lines.append(f"| Conversion: {pattern.get('conversion', '')}".ljust(BOX_WIDTH) + "|") - if pattern.get('cta_placement'): - lines.append(f"| CTA: {pattern.get('cta_placement', '')}".ljust(BOX_WIDTH) + "|") - lines.append("| Sections:".ljust(BOX_WIDTH) + "|") - for i, section in enumerate(sections, 1): - lines.append(f"| {i}. 
{section}".ljust(BOX_WIDTH) + "|") - lines.append("|" + " " * BOX_WIDTH + "|") - - # Style section - lines.append(f"| STYLE: {style.get('name', '')}".ljust(BOX_WIDTH) + "|") - if style.get("keywords"): - for line in wrap_text(f"Keywords: {style.get('keywords', '')}", "| ", BOX_WIDTH): - lines.append(line.ljust(BOX_WIDTH) + "|") - if style.get("best_for"): - for line in wrap_text(f"Best For: {style.get('best_for', '')}", "| ", BOX_WIDTH): - lines.append(line.ljust(BOX_WIDTH) + "|") - if style.get("performance") or style.get("accessibility"): - perf_a11y = f"Performance: {style.get('performance', '')} | Accessibility: {style.get('accessibility', '')}" - lines.append(f"| {perf_a11y}".ljust(BOX_WIDTH) + "|") - lines.append("|" + " " * BOX_WIDTH + "|") - - # Colors section - lines.append("| COLORS:".ljust(BOX_WIDTH) + "|") - lines.append(f"| Primary: {colors.get('primary', '')}".ljust(BOX_WIDTH) + "|") - lines.append(f"| Secondary: {colors.get('secondary', '')}".ljust(BOX_WIDTH) + "|") - lines.append(f"| CTA: {colors.get('cta', '')}".ljust(BOX_WIDTH) + "|") - lines.append(f"| Background: {colors.get('background', '')}".ljust(BOX_WIDTH) + "|") - lines.append(f"| Text: {colors.get('text', '')}".ljust(BOX_WIDTH) + "|") - if colors.get("notes"): - for line in wrap_text(f"Notes: {colors.get('notes', '')}", "| ", BOX_WIDTH): - lines.append(line.ljust(BOX_WIDTH) + "|") - lines.append("|" + " " * BOX_WIDTH + "|") - - # Typography section - lines.append(f"| TYPOGRAPHY: {typography.get('heading', '')} / {typography.get('body', '')}".ljust(BOX_WIDTH) + "|") - if typography.get("mood"): - for line in wrap_text(f"Mood: {typography.get('mood', '')}", "| ", BOX_WIDTH): - lines.append(line.ljust(BOX_WIDTH) + "|") - if typography.get("best_for"): - for line in wrap_text(f"Best For: {typography.get('best_for', '')}", "| ", BOX_WIDTH): - lines.append(line.ljust(BOX_WIDTH) + "|") - if typography.get("google_fonts_url"): - lines.append(f"| Google Fonts: {typography.get('google_fonts_url', 
def format_markdown(design_system: dict) -> str:
    """Render a design-system dict as a Markdown report.

    The report has the sections Pattern, Style, Colors, Typography, optional
    Key Effects and Avoid (Anti-patterns), and a fixed Pre-Delivery Checklist.
    Optional bullets are emitted only when their value is truthy. Lines are
    joined with newlines and the text ends with one trailing newline.
    """
    pattern = design_system.get("pattern", {})
    style = design_system.get("style", {})
    colors = design_system.get("colors", {})
    typography = design_system.get("typography", {})
    effects = design_system.get("key_effects", "")
    anti_patterns = design_system.get("anti_patterns", "")

    def optional_bullet(section: dict, key: str, label: str) -> list:
        """Return a one-item bullet list for a truthy value, else nothing."""
        value = section.get(key)
        return [f"- **{label}:** {value}"] if value else []

    doc = [f"## Design System: {design_system.get('project_name', 'PROJECT')}", ""]

    # Pattern
    doc += ["### Pattern", f"- **Name:** {pattern.get('name', '')}"]
    doc += optional_bullet(pattern, "conversion", "Conversion Focus")
    doc += optional_bullet(pattern, "cta_placement", "CTA Placement")
    doc += optional_bullet(pattern, "color_strategy", "Color Strategy")
    doc += [f"- **Sections:** {pattern.get('sections', '')}", ""]

    # Style
    doc += ["### Style", f"- **Name:** {style.get('name', '')}"]
    doc += optional_bullet(style, "keywords", "Keywords")
    doc += optional_bullet(style, "best_for", "Best For")
    if style.get("performance") or style.get("accessibility"):
        # Both ratings share one bullet, even when only one of them is set.
        doc.append(f"- **Performance:** {style.get('performance', '')} | **Accessibility:** {style.get('accessibility', '')}")
    doc.append("")

    # Colors
    doc += ["### Colors", "| Role | Hex |", "|------|-----|"]
    color_rows = (
        ("Primary", "primary"),
        ("Secondary", "secondary"),
        ("CTA", "cta"),
        ("Background", "background"),
        ("Text", "text"),
    )
    for role, key in color_rows:
        doc.append(f"| {role} | {colors.get(key, '')} |")
    if colors.get("notes"):
        # The leading newline leaves a blank line between table and note.
        doc.append(f"\n*Notes: {colors.get('notes', '')}*")
    doc.append("")

    # Typography
    doc += [
        "### Typography",
        f"- **Heading:** {typography.get('heading', '')}",
        f"- **Body:** {typography.get('body', '')}",
    ]
    doc += optional_bullet(typography, "mood", "Mood")
    doc += optional_bullet(typography, "best_for", "Best For")
    doc += optional_bullet(typography, "google_fonts_url", "Google Fonts")
    if typography.get("css_import"):
        doc += [
            "- **CSS Import:**",
            "```css",
            f"{typography.get('css_import', '')}",
            "```",
        ]
    doc.append("")

    # Key effects
    if effects:
        doc += ["### Key Effects", f"{effects}", ""]

    # Anti-patterns: "A + B" becomes one bullet per item
    if anti_patterns:
        bullets = anti_patterns.replace(" + ", "\n- ")
        doc += ["### Avoid (Anti-patterns)", f"- {bullets}", ""]

    # Pre-delivery checklist
    doc += [
        "### Pre-Delivery Checklist",
        "- [ ] No emojis as icons (use SVG: Heroicons/Lucide)",
        "- [ ] cursor-pointer on all clickable elements",
        "- [ ] Hover states with smooth transitions (150-300ms)",
        "- [ ] Light mode: text contrast 4.5:1 minimum",
        "- [ ] Focus states visible for keyboard nav",
        "- [ ] prefers-reduced-motion respected",
        "- [ ] Responsive: 375px, 768px, 1024px, 1440px",
        "",
    ]

    return "\n".join(doc)
- - Args: - query: Search query (e.g., "SaaS dashboard", "e-commerce luxury") - project_name: Optional project name for output header - output_format: "ascii" (default) or "markdown" - persist: If True, save design system to design-system/ folder - page: Optional page name for page-specific override file - output_dir: Optional output directory (defaults to current working directory) - - Returns: - Formatted design system string - """ - generator = DesignSystemGenerator() - design_system = generator.generate(query, project_name) - - # Persist to files if requested - if persist: - persist_design_system(design_system, page, output_dir, query) - - if output_format == "markdown": - return format_markdown(design_system) - return format_ascii_box(design_system) - - -# ============ PERSISTENCE FUNCTIONS ============ -def persist_design_system(design_system: dict, page: str = None, output_dir: str = None, page_query: str = None) -> dict: - """ - Persist design system to design-system// folder using Master + Overrides pattern. 
- - Args: - design_system: The generated design system dictionary - page: Optional page name for page-specific override file - output_dir: Optional output directory (defaults to current working directory) - page_query: Optional query string for intelligent page override generation - - Returns: - dict with created file paths and status - """ - base_dir = Path(output_dir) if output_dir else Path.cwd() - - # Use project name for project-specific folder - project_name = design_system.get("project_name", "default") - project_slug = project_name.lower().replace(' ', '-') - - design_system_dir = base_dir / "design-system" / project_slug - pages_dir = design_system_dir / "pages" - - created_files = [] - - # Create directories - design_system_dir.mkdir(parents=True, exist_ok=True) - pages_dir.mkdir(parents=True, exist_ok=True) - - master_file = design_system_dir / "MASTER.md" - - # Generate and write MASTER.md - master_content = format_master_md(design_system) - with open(master_file, 'w', encoding='utf-8') as f: - f.write(master_content) - created_files.append(str(master_file)) - - # If page is specified, create page override file with intelligent content - if page: - page_file = pages_dir / f"{page.lower().replace(' ', '-')}.md" - page_content = format_page_override_md(design_system, page, page_query) - with open(page_file, 'w', encoding='utf-8') as f: - f.write(page_content) - created_files.append(str(page_file)) - - return { - "status": "success", - "design_system_dir": str(design_system_dir), - "created_files": created_files - } - - -def format_master_md(design_system: dict) -> str: - """Format design system as MASTER.md with hierarchical override logic.""" - project = design_system.get("project_name", "PROJECT") - pattern = design_system.get("pattern", {}) - style = design_system.get("style", {}) - colors = design_system.get("colors", {}) - typography = design_system.get("typography", {}) - effects = design_system.get("key_effects", "") - anti_patterns = 
design_system.get("anti_patterns", "") - - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - lines = [] - - # Logic header - lines.append("# Design System Master File") - lines.append("") - lines.append("> **LOGIC:** When building a specific page, first check `design-system/pages/[page-name].md`.") - lines.append("> If that file exists, its rules **override** this Master file.") - lines.append("> If not, strictly follow the rules below.") - lines.append("") - lines.append("---") - lines.append("") - lines.append(f"**Project:** {project}") - lines.append(f"**Generated:** {timestamp}") - lines.append(f"**Category:** {design_system.get('category', 'General')}") - lines.append("") - lines.append("---") - lines.append("") - - # Global Rules section - lines.append("## Global Rules") - lines.append("") - - # Color Palette - lines.append("### Color Palette") - lines.append("") - lines.append("| Role | Hex | CSS Variable |") - lines.append("|------|-----|--------------|") - lines.append(f"| Primary | `{colors.get('primary', '#2563EB')}` | `--color-primary` |") - lines.append(f"| Secondary | `{colors.get('secondary', '#3B82F6')}` | `--color-secondary` |") - lines.append(f"| CTA/Accent | `{colors.get('cta', '#F97316')}` | `--color-cta` |") - lines.append(f"| Background | `{colors.get('background', '#F8FAFC')}` | `--color-background` |") - lines.append(f"| Text | `{colors.get('text', '#1E293B')}` | `--color-text` |") - lines.append("") - if colors.get("notes"): - lines.append(f"**Color Notes:** {colors.get('notes', '')}") - lines.append("") - - # Typography - lines.append("### Typography") - lines.append("") - lines.append(f"- **Heading Font:** {typography.get('heading', 'Inter')}") - lines.append(f"- **Body Font:** {typography.get('body', 'Inter')}") - if typography.get("mood"): - lines.append(f"- **Mood:** {typography.get('mood', '')}") - if typography.get("google_fonts_url"): - lines.append(f"- **Google Fonts:** [{typography.get('heading', '')} + 
{typography.get('body', '')}]({typography.get('google_fonts_url', '')})") - lines.append("") - if typography.get("css_import"): - lines.append("**CSS Import:**") - lines.append("```css") - lines.append(typography.get("css_import", "")) - lines.append("```") - lines.append("") - - # Spacing Variables - lines.append("### Spacing Variables") - lines.append("") - lines.append("| Token | Value | Usage |") - lines.append("|-------|-------|-------|") - lines.append("| `--space-xs` | `4px` / `0.25rem` | Tight gaps |") - lines.append("| `--space-sm` | `8px` / `0.5rem` | Icon gaps, inline spacing |") - lines.append("| `--space-md` | `16px` / `1rem` | Standard padding |") - lines.append("| `--space-lg` | `24px` / `1.5rem` | Section padding |") - lines.append("| `--space-xl` | `32px` / `2rem` | Large gaps |") - lines.append("| `--space-2xl` | `48px` / `3rem` | Section margins |") - lines.append("| `--space-3xl` | `64px` / `4rem` | Hero padding |") - lines.append("") - - # Shadow Depths - lines.append("### Shadow Depths") - lines.append("") - lines.append("| Level | Value | Usage |") - lines.append("|-------|-------|-------|") - lines.append("| `--shadow-sm` | `0 1px 2px rgba(0,0,0,0.05)` | Subtle lift |") - lines.append("| `--shadow-md` | `0 4px 6px rgba(0,0,0,0.1)` | Cards, buttons |") - lines.append("| `--shadow-lg` | `0 10px 15px rgba(0,0,0,0.1)` | Modals, dropdowns |") - lines.append("| `--shadow-xl` | `0 20px 25px rgba(0,0,0,0.15)` | Hero images, featured cards |") - lines.append("") - - # Component Specs section - lines.append("---") - lines.append("") - lines.append("## Component Specs") - lines.append("") - - # Buttons - lines.append("### Buttons") - lines.append("") - lines.append("```css") - lines.append("/* Primary Button */") - lines.append(".btn-primary {") - lines.append(f" background: {colors.get('cta', '#F97316')};") - lines.append(" color: white;") - lines.append(" padding: 12px 24px;") - lines.append(" border-radius: 8px;") - lines.append(" font-weight: 
600;") - lines.append(" transition: all 200ms ease;") - lines.append(" cursor: pointer;") - lines.append("}") - lines.append("") - lines.append(".btn-primary:hover {") - lines.append(" opacity: 0.9;") - lines.append(" transform: translateY(-1px);") - lines.append("}") - lines.append("") - lines.append("/* Secondary Button */") - lines.append(".btn-secondary {") - lines.append(f" background: transparent;") - lines.append(f" color: {colors.get('primary', '#2563EB')};") - lines.append(f" border: 2px solid {colors.get('primary', '#2563EB')};") - lines.append(" padding: 12px 24px;") - lines.append(" border-radius: 8px;") - lines.append(" font-weight: 600;") - lines.append(" transition: all 200ms ease;") - lines.append(" cursor: pointer;") - lines.append("}") - lines.append("```") - lines.append("") - - # Cards - lines.append("### Cards") - lines.append("") - lines.append("```css") - lines.append(".card {") - lines.append(f" background: {colors.get('background', '#FFFFFF')};") - lines.append(" border-radius: 12px;") - lines.append(" padding: 24px;") - lines.append(" box-shadow: var(--shadow-md);") - lines.append(" transition: all 200ms ease;") - lines.append(" cursor: pointer;") - lines.append("}") - lines.append("") - lines.append(".card:hover {") - lines.append(" box-shadow: var(--shadow-lg);") - lines.append(" transform: translateY(-2px);") - lines.append("}") - lines.append("```") - lines.append("") - - # Inputs - lines.append("### Inputs") - lines.append("") - lines.append("```css") - lines.append(".input {") - lines.append(" padding: 12px 16px;") - lines.append(" border: 1px solid #E2E8F0;") - lines.append(" border-radius: 8px;") - lines.append(" font-size: 16px;") - lines.append(" transition: border-color 200ms ease;") - lines.append("}") - lines.append("") - lines.append(".input:focus {") - lines.append(f" border-color: {colors.get('primary', '#2563EB')};") - lines.append(" outline: none;") - lines.append(f" box-shadow: 0 0 0 3px {colors.get('primary', 
'#2563EB')}20;") - lines.append("}") - lines.append("```") - lines.append("") - - # Modals - lines.append("### Modals") - lines.append("") - lines.append("```css") - lines.append(".modal-overlay {") - lines.append(" background: rgba(0, 0, 0, 0.5);") - lines.append(" backdrop-filter: blur(4px);") - lines.append("}") - lines.append("") - lines.append(".modal {") - lines.append(" background: white;") - lines.append(" border-radius: 16px;") - lines.append(" padding: 32px;") - lines.append(" box-shadow: var(--shadow-xl);") - lines.append(" max-width: 500px;") - lines.append(" width: 90%;") - lines.append("}") - lines.append("```") - lines.append("") - - # Style section - lines.append("---") - lines.append("") - lines.append("## Style Guidelines") - lines.append("") - lines.append(f"**Style:** {style.get('name', 'Minimalism')}") - lines.append("") - if style.get("keywords"): - lines.append(f"**Keywords:** {style.get('keywords', '')}") - lines.append("") - if style.get("best_for"): - lines.append(f"**Best For:** {style.get('best_for', '')}") - lines.append("") - if effects: - lines.append(f"**Key Effects:** {effects}") - lines.append("") - - # Layout Pattern - lines.append("### Page Pattern") - lines.append("") - lines.append(f"**Pattern Name:** {pattern.get('name', '')}") - lines.append("") - if pattern.get('conversion'): - lines.append(f"- **Conversion Strategy:** {pattern.get('conversion', '')}") - if pattern.get('cta_placement'): - lines.append(f"- **CTA Placement:** {pattern.get('cta_placement', '')}") - lines.append(f"- **Section Order:** {pattern.get('sections', '')}") - lines.append("") - - # Anti-Patterns section - lines.append("---") - lines.append("") - lines.append("## Anti-Patterns (Do NOT Use)") - lines.append("") - if anti_patterns: - anti_list = [a.strip() for a in anti_patterns.split("+")] - for anti in anti_list: - if anti: - lines.append(f"- ❌ {anti}") - lines.append("") - lines.append("### Additional Forbidden Patterns") - lines.append("") - 
lines.append("- ❌ **Emojis as icons** — Use SVG icons (Heroicons, Lucide, Simple Icons)") - lines.append("- ❌ **Missing cursor:pointer** — All clickable elements must have cursor:pointer") - lines.append("- ❌ **Layout-shifting hovers** — Avoid scale transforms that shift layout") - lines.append("- ❌ **Low contrast text** — Maintain 4.5:1 minimum contrast ratio") - lines.append("- ❌ **Instant state changes** — Always use transitions (150-300ms)") - lines.append("- ❌ **Invisible focus states** — Focus states must be visible for a11y") - lines.append("") - - # Pre-Delivery Checklist - lines.append("---") - lines.append("") - lines.append("## Pre-Delivery Checklist") - lines.append("") - lines.append("Before delivering any UI code, verify:") - lines.append("") - lines.append("- [ ] No emojis used as icons (use SVG instead)") - lines.append("- [ ] All icons from consistent icon set (Heroicons/Lucide)") - lines.append("- [ ] `cursor-pointer` on all clickable elements") - lines.append("- [ ] Hover states with smooth transitions (150-300ms)") - lines.append("- [ ] Light mode: text contrast 4.5:1 minimum") - lines.append("- [ ] Focus states visible for keyboard navigation") - lines.append("- [ ] `prefers-reduced-motion` respected") - lines.append("- [ ] Responsive: 375px, 768px, 1024px, 1440px") - lines.append("- [ ] No content hidden behind fixed navbars") - lines.append("- [ ] No horizontal scroll on mobile") - lines.append("") - - return "\n".join(lines) - - -def format_page_override_md(design_system: dict, page_name: str, page_query: str = None) -> str: - """Format a page-specific override file with intelligent AI-generated content.""" - project = design_system.get("project_name", "PROJECT") - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - page_title = page_name.replace("-", " ").replace("_", " ").title() - - # Detect page type and generate intelligent overrides - page_overrides = _generate_intelligent_overrides(page_name, page_query, design_system) - - 
lines = [] - - lines.append(f"# {page_title} Page Overrides") - lines.append("") - lines.append(f"> **PROJECT:** {project}") - lines.append(f"> **Generated:** {timestamp}") - lines.append(f"> **Page Type:** {page_overrides.get('page_type', 'General')}") - lines.append("") - lines.append("> ⚠️ **IMPORTANT:** Rules in this file **override** the Master file (`design-system/MASTER.md`).") - lines.append("> Only deviations from the Master are documented here. For all other rules, refer to the Master.") - lines.append("") - lines.append("---") - lines.append("") - - # Page-specific rules with actual content - lines.append("## Page-Specific Rules") - lines.append("") - - # Layout Overrides - lines.append("### Layout Overrides") - lines.append("") - layout = page_overrides.get("layout", {}) - if layout: - for key, value in layout.items(): - lines.append(f"- **{key}:** {value}") - else: - lines.append("- No overrides — use Master layout") - lines.append("") - - # Spacing Overrides - lines.append("### Spacing Overrides") - lines.append("") - spacing = page_overrides.get("spacing", {}) - if spacing: - for key, value in spacing.items(): - lines.append(f"- **{key}:** {value}") - else: - lines.append("- No overrides — use Master spacing") - lines.append("") - - # Typography Overrides - lines.append("### Typography Overrides") - lines.append("") - typography = page_overrides.get("typography", {}) - if typography: - for key, value in typography.items(): - lines.append(f"- **{key}:** {value}") - else: - lines.append("- No overrides — use Master typography") - lines.append("") - - # Color Overrides - lines.append("### Color Overrides") - lines.append("") - colors = page_overrides.get("colors", {}) - if colors: - for key, value in colors.items(): - lines.append(f"- **{key}:** {value}") - else: - lines.append("- No overrides — use Master colors") - lines.append("") - - # Component Overrides - lines.append("### Component Overrides") - lines.append("") - components = 
page_overrides.get("components", []) - if components: - for comp in components: - lines.append(f"- {comp}") - else: - lines.append("- No overrides — use Master component specs") - lines.append("") - - # Page-Specific Components - lines.append("---") - lines.append("") - lines.append("## Page-Specific Components") - lines.append("") - unique_components = page_overrides.get("unique_components", []) - if unique_components: - for comp in unique_components: - lines.append(f"- {comp}") - else: - lines.append("- No unique components for this page") - lines.append("") - - # Recommendations - lines.append("---") - lines.append("") - lines.append("## Recommendations") - lines.append("") - recommendations = page_overrides.get("recommendations", []) - if recommendations: - for rec in recommendations: - lines.append(f"- {rec}") - lines.append("") - - return "\n".join(lines) - - -def _generate_intelligent_overrides(page_name: str, page_query: str, design_system: dict) -> dict: - """ - Generate intelligent overrides based on page type using layered search. - - Uses the existing search infrastructure to find relevant style, UX, and layout - data instead of hardcoded page types. 
- """ - from core import search - - page_lower = page_name.lower() - query_lower = (page_query or "").lower() - combined_context = f"{page_lower} {query_lower}" - - # Search across multiple domains for page-specific guidance - style_search = search(combined_context, "style", max_results=1) - ux_search = search(combined_context, "ux", max_results=3) - landing_search = search(combined_context, "landing", max_results=1) - - # Extract results from search response - style_results = style_search.get("results", []) - ux_results = ux_search.get("results", []) - landing_results = landing_search.get("results", []) - - # Detect page type from search results or context - page_type = _detect_page_type(combined_context, style_results) - - # Build overrides from search results - layout = {} - spacing = {} - typography = {} - colors = {} - components = [] - unique_components = [] - recommendations = [] - - # Extract style-based overrides - if style_results: - style = style_results[0] - style_name = style.get("Style Category", "") - keywords = style.get("Keywords", "") - best_for = style.get("Best For", "") - effects = style.get("Effects & Animation", "") - - # Infer layout from style keywords - if any(kw in keywords.lower() for kw in ["data", "dense", "dashboard", "grid"]): - layout["Max Width"] = "1400px or full-width" - layout["Grid"] = "12-column grid for data flexibility" - spacing["Content Density"] = "High — optimize for information display" - elif any(kw in keywords.lower() for kw in ["minimal", "simple", "clean", "single"]): - layout["Max Width"] = "800px (narrow, focused)" - layout["Layout"] = "Single column, centered" - spacing["Content Density"] = "Low — focus on clarity" - else: - layout["Max Width"] = "1200px (standard)" - layout["Layout"] = "Full-width sections, centered content" - - if effects: - recommendations.append(f"Effects: {effects}") - - # Extract UX guidelines as recommendations - for ux in ux_results: - category = ux.get("Category", "") - do_text = 
ux.get("Do", "") - dont_text = ux.get("Don't", "") - if do_text: - recommendations.append(f"{category}: {do_text}") - if dont_text: - components.append(f"Avoid: {dont_text}") - - # Extract landing pattern info for section structure - if landing_results: - landing = landing_results[0] - sections = landing.get("Section Order", "") - cta_placement = landing.get("Primary CTA Placement", "") - color_strategy = landing.get("Color Strategy", "") - - if sections: - layout["Sections"] = sections - if cta_placement: - recommendations.append(f"CTA Placement: {cta_placement}") - if color_strategy: - colors["Strategy"] = color_strategy - - # Add page-type specific defaults if no search results - if not layout: - layout["Max Width"] = "1200px" - layout["Layout"] = "Responsive grid" - - if not recommendations: - recommendations = [ - "Refer to MASTER.md for all design rules", - "Add specific overrides as needed for this page" - ] - - return { - "page_type": page_type, - "layout": layout, - "spacing": spacing, - "typography": typography, - "colors": colors, - "components": components, - "unique_components": unique_components, - "recommendations": recommendations - } - - -def _detect_page_type(context: str, style_results: list) -> str: - """Detect page type from context and search results.""" - context_lower = context.lower() - - # Check for common page type patterns - page_patterns = [ - (["dashboard", "admin", "analytics", "data", "metrics", "stats", "monitor", "overview"], "Dashboard / Data View"), - (["checkout", "payment", "cart", "purchase", "order", "billing"], "Checkout / Payment"), - (["settings", "profile", "account", "preferences", "config"], "Settings / Profile"), - (["landing", "marketing", "homepage", "hero", "home", "promo"], "Landing / Marketing"), - (["login", "signin", "signup", "register", "auth", "password"], "Authentication"), - (["pricing", "plans", "subscription", "tiers", "packages"], "Pricing / Plans"), - (["blog", "article", "post", "news", "content", 
"story"], "Blog / Article"), - (["product", "item", "detail", "pdp", "shop", "store"], "Product Detail"), - (["search", "results", "browse", "filter", "catalog", "list"], "Search Results"), - (["empty", "404", "error", "not found", "zero"], "Empty State"), - ] - - for keywords, page_type in page_patterns: - if any(kw in context_lower for kw in keywords): - return page_type - - # Fallback: try to infer from style results - if style_results: - style_name = style_results[0].get("Style Category", "").lower() - best_for = style_results[0].get("Best For", "").lower() - - if "dashboard" in best_for or "data" in best_for: - return "Dashboard / Data View" - elif "landing" in best_for or "marketing" in best_for: - return "Landing / Marketing" - - return "General" - - -# ============ CLI SUPPORT ============ -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="Generate Design System") - parser.add_argument("query", help="Search query (e.g., 'SaaS dashboard')") - parser.add_argument("--project-name", "-p", type=str, default=None, help="Project name") - parser.add_argument("--format", "-f", choices=["ascii", "markdown"], default="ascii", help="Output format") - - args = parser.parse_args() - - result = generate_design_system(args.query, args.project_name, args.format) - print(result) diff --git a/.agent/.agent/skills/ui-ux-pro-max/scripts/search.py b/.agent/.agent/skills/ui-ux-pro-max/scripts/search.py deleted file mode 100644 index 575ea78..0000000 --- a/.agent/.agent/skills/ui-ux-pro-max/scripts/search.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -UI/UX Pro Max Search - BM25 search engine for UI/UX style guides -Usage: python search.py "" [--domain ] [--stack ] [--max-results 3] - python search.py "" --design-system [-p "Project Name"] - python search.py "" --design-system --persist [-p "Project Name"] [--page "dashboard"] - -Domains: style, prompt, color, chart, landing, product, ux, 
typography -Stacks: html-tailwind, react, nextjs - -Persistence (Master + Overrides pattern): - --persist Save design system to design-system/MASTER.md - --page Also create a page-specific override file in design-system/pages/ -""" - -import argparse -import sys -import io -from core import CSV_CONFIG, AVAILABLE_STACKS, MAX_RESULTS, search, search_stack -from design_system import generate_design_system, persist_design_system - -# Force UTF-8 for stdout/stderr to handle emojis on Windows (cp1252 default) -if sys.stdout.encoding and sys.stdout.encoding.lower() != 'utf-8': - sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') -if sys.stderr.encoding and sys.stderr.encoding.lower() != 'utf-8': - sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') - - -def format_output(result): - """Format results for Claude consumption (token-optimized)""" - if "error" in result: - return f"Error: {result['error']}" - - output = [] - if result.get("stack"): - output.append(f"## UI Pro Max Stack Guidelines") - output.append(f"**Stack:** {result['stack']} | **Query:** {result['query']}") - else: - output.append(f"## UI Pro Max Search Results") - output.append(f"**Domain:** {result['domain']} | **Query:** {result['query']}") - output.append(f"**Source:** {result['file']} | **Found:** {result['count']} results\n") - - for i, row in enumerate(result['results'], 1): - output.append(f"### Result {i}") - for key, value in row.items(): - value_str = str(value) - if len(value_str) > 300: - value_str = value_str[:300] + "..." 
- output.append(f"- **{key}:** {value_str}") - output.append("") - - return "\n".join(output) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="UI Pro Max Search") - parser.add_argument("query", help="Search query") - parser.add_argument("--domain", "-d", choices=list(CSV_CONFIG.keys()), help="Search domain") - parser.add_argument("--stack", "-s", choices=AVAILABLE_STACKS, help="Stack-specific search (html-tailwind, react, nextjs)") - parser.add_argument("--max-results", "-n", type=int, default=MAX_RESULTS, help="Max results (default: 3)") - parser.add_argument("--json", action="store_true", help="Output as JSON") - # Design system generation - parser.add_argument("--design-system", "-ds", action="store_true", help="Generate complete design system recommendation") - parser.add_argument("--project-name", "-p", type=str, default=None, help="Project name for design system output") - parser.add_argument("--format", "-f", choices=["ascii", "markdown"], default="ascii", help="Output format for design system") - # Persistence (Master + Overrides pattern) - parser.add_argument("--persist", action="store_true", help="Save design system to design-system/MASTER.md (creates hierarchical structure)") - parser.add_argument("--page", type=str, default=None, help="Create page-specific override file in design-system/pages/") - parser.add_argument("--output-dir", "-o", type=str, default=None, help="Output directory for persisted files (default: current directory)") - - args = parser.parse_args() - - # Design system takes priority - if args.design_system: - result = generate_design_system( - args.query, - args.project_name, - args.format, - persist=args.persist, - page=args.page, - output_dir=args.output_dir - ) - print(result) - - # Print persistence confirmation - if args.persist: - project_slug = args.project_name.lower().replace(' ', '-') if args.project_name else "default" - print("\n" + "=" * 60) - print(f"✅ Design system persisted to 
design-system/{project_slug}/") - print(f" 📄 design-system/{project_slug}/MASTER.md (Global Source of Truth)") - if args.page: - page_filename = args.page.lower().replace(' ', '-') - print(f" 📄 design-system/{project_slug}/pages/{page_filename}.md (Page Overrides)") - print("") - print(f"📖 Usage: When building a page, check design-system/{project_slug}/pages/[page].md first.") - print(f" If exists, its rules override MASTER.md. Otherwise, use MASTER.md.") - print("=" * 60) - # Stack search - elif args.stack: - result = search_stack(args.query, args.stack, args.max_results) - if args.json: - import json - print(json.dumps(result, indent=2, ensure_ascii=False)) - else: - print(format_output(result)) - # Domain search - else: - result = search(args.query, args.domain, args.max_results) - if args.json: - import json - print(json.dumps(result, indent=2, ensure_ascii=False)) - else: - print(format_output(result)) diff --git a/.agent/scripts/sync_vikunja.js b/.agent/scripts/sync_vikunja.js index 77cf7bf..0cf668d 100644 --- a/.agent/scripts/sync_vikunja.js +++ b/.agent/scripts/sync_vikunja.js @@ -4,21 +4,29 @@ const path = require('path'); // 1. Get arguments const args = process.argv.slice(2); if (args.length < 2) { - console.error("Usage: node sync_vikunja.js "); + console.error("Usage:"); + console.error(" node sync_vikunja.js # Update existing task"); + console.error(" node sync_vikunja.js create \"\" \"<message>\" # Create new task"); process.exit(1); } -const taskId = args[0]; +const commandOrId = args[0]; const message = args[1]; // 2. Load configuration from .env.agent -const envPath = path.join(__dirname, '../config/.env.agent'); -if (!fs.existsSync(envPath)) { - console.error("Error: .agent/config/.env.agent file not found. 
Please create it from the template."); +const envPath = path.join(__dirname, '../../.env.agent'); +const fallbackEnvPath = path.join(__dirname, '../config/.env.agent'); + +let envContent = ''; +if (fs.existsSync(envPath)) { + envContent = fs.readFileSync(envPath, 'utf8'); +} else if (fs.existsSync(fallbackEnvPath)) { + envContent = fs.readFileSync(fallbackEnvPath, 'utf8'); +} else { + console.error("Error: .env.agent file not found."); process.exit(1); } -const envContent = fs.readFileSync(envPath, 'utf8'); const env = {}; envContent.split('\n').forEach(line => { const match = line.match(/^([^#=]+)="?(.*?)"?$/); @@ -29,6 +37,7 @@ envContent.split('\n').forEach(line => { const apiUrl = env.VIKUNJA_API_URL; const apiToken = env.VIKUNJA_API_TOKEN; +const projectId = env.VIKUNJA_PROJECT_ID || 14; if (!apiUrl || !apiToken || apiUrl.includes('[YOUR_')) { console.error("Error: VIKUNJA_API_URL or VIKUNJA_API_TOKEN is not configured correctly in .env.agent."); @@ -40,52 +49,59 @@ if (env.AGENT_OPERATING_MODE === "TEST") { process.exit(0); } -// 3. 
Helper to make API calls using native fetch (Node 18+) -async function markTaskDoneAndComment(taskId, message) { +const FETCH_OPTS = { + headers: { + 'Authorization': `Bearer ${apiToken}`, + 'Content-Type': 'application/json' + } +}; + +async function createTaskAndComment(title, message) { try { - console.log(`Connecting to Vikunja API for Task ${taskId}...`); - - // Update task status to done - const patchRes = await fetch(`${apiUrl}/tasks/${taskId}`, { - method: 'POST', // Vikunja uses POST to task endpoint for updates - headers: { - 'Authorization': `Bearer ${apiToken}`, - 'Content-Type': 'application/json' - }, - body: JSON.stringify({ done: true }) - }); - - if (!patchRes.ok) { - throw new Error(`Failed to mark task as done: ${patchRes.statusText}`); - } - - console.log(`✅ Task ${taskId} successfully marked as Done.`); - - // Add comment - const commentRes = await fetch(`${apiUrl}/tasks/${taskId}/comments`, { + console.log(`Creating new task in Project ${projectId}...`); + const createRes = await fetch(`${apiUrl}/projects/${projectId}/tasks`, { method: 'PUT', - headers: { - 'Authorization': `Bearer ${apiToken}`, - 'Content-Type': 'application/json' - }, - body: JSON.stringify({ - text: `[Agent Automator] Phase completed.\nReason/Hash: ${message}` + ...FETCH_OPTS, + body: JSON.stringify({ + title: title, + description: message, + done: true }) }); - if (!commentRes.ok) { - console.error(`Warning: Task marked as done, but failed to attach comment: ${commentRes.statusText}`); - } else { - console.log("✅ Comment attached successfully."); - } - - } catch (error) { - console.error("❌ Failed to sync with Vikunja:"); - // Mask the token if it somehow leaks via error message - const secureErr = error.message.replace(new RegExp(apiToken, 'g'), "********"); - console.error(secureErr); + if (!createRes.ok) throw new Error(`Create failed: ${createRes.statusText}`); + const task = await createRes.json(); + console.log(`✅ Task created and marked Done! 
ID: #${task.id}`); + } catch (e) { + console.error("❌ Failed:", e.message); process.exit(1); } } -markTaskDoneAndComment(taskId, message); +async function markTaskDoneAndComment(taskId, message) { + try { + console.log(`Updating Task ${taskId}...`); + const patchRes = await fetch(`${apiUrl}/tasks/${taskId}`, { + method: 'POST', + ...FETCH_OPTS, + body: JSON.stringify({ done: true }) + }); + + if (!patchRes.ok) throw new Error(`Update failed: ${patchRes.statusText}`); + console.log(`✅ Task ${taskId} marked as Done.`); + + await fetch(`${apiUrl}/tasks/${taskId}/comments`, { + method: 'PUT', ...FETCH_OPTS, body: JSON.stringify({ text: `[Agent Automator] Phase completed.\nReason/Hash: ${message}` }) + }); + console.log("✅ Comment attached."); + } catch (e) { + console.error("❌ Failed:", e.message); + process.exit(1); + } +} + +if (commandOrId === "create") { + createTaskAndComment(message, args[2] || "Task fully completed."); +} else { + markTaskDoneAndComment(commandOrId, message); +} diff --git a/scripts/analysis_raw.txt b/scripts/analysis_raw.txt new file mode 100644 index 0000000..c9bc9f6 --- /dev/null +++ b/scripts/analysis_raw.txt @@ -0,0 +1,58 @@ +0|Gemma4-26B MXFP4_MOE|ngl=999 pure-GPU|63.21|63.78|G0:11770|G1:10411|t=6|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU +1|Gemma4-26B MXFP4_MOE|compare: cpu-moe|12.92|14.21|G0:3096|G1:3497|t=6|ub=512 b=2048|kv=q4_0/q4_0|cpu-moe +2|Gemma4-26B MXFP4_MOE|t=2|64.1|64.27|G0:11728|G1:10411|t=2|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU +3|Gemma4-26B MXFP4_MOE|t=4|64|64.39|G0:11728|G1:10411|t=4|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU +4|Gemma4-26B MXFP4_MOE|t=8|63.75|63.9|G0:11728|G1:10411|t=8|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU +5|Gemma4-26B MXFP4_MOE|t=10|64.01|64.14|G0:11728|G1:10411|t=10|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU +6|Gemma4-26B MXFP4_MOE|t=12|63.86|63.98|G0:11728|G1:10411|t=12|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU +7|Gemma4-26B MXFP4_MOE|ub=256 b=1024|63.8|64.12|G0:10504|G1:9619|t=2|ub=256 b=1024|kv=q4_0/q4_0|pure-GPU +8|Gemma4-26B 
MXFP4_MOE|ub=256 b=2048|63.88|64.04|G0:10504|G1:9619|t=2|ub=256 b=2048|kv=q4_0/q4_0|pure-GPU +9|Gemma4-26B MXFP4_MOE|ub=512 b=4096|63.91|64.18|G0:11728|G1:10411|t=2|ub=512 b=4096|kv=q4_0/q4_0|pure-GPU +10|Gemma4-26B MXFP4_MOE|ub=1024 b=2048|63.86|64.1|G0:10956|G1:9907|t=2|ub=1024 b=2048|kv=q4_0/q4_0|pure-GPU +11|Gemma4-26B MXFP4_MOE|ub=1024 b=4096|63.85|64.06|G0:10956|G1:9907|t=2|ub=1024 b=4096|kv=q4_0/q4_0|pure-GPU +12|Gemma4-26B MXFP4_MOE|kv=q8_0/q8_0|64.14|64.39|G0:10670|G1:10169|t=2|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU +13|Gemma4-26B MXFP4_MOE|kv=q4_0/q8_0|37.52|37.86|G0:10394|G1:9753|t=2|ub=512 b=2048|kv=q4_0/q8_0|pure-GPU +14|Gemma4-26B MXFP4_MOE|kv=f16/f16|63.48|64.31|G0:11700|G1:11667|t=2|ub=512 b=2048|kv=f16/f16|pure-GPU +15|Gemma4-26B MXFP4_MOE|FINAL|64.05|64.29|G0:10667|G1:10169|t=2|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU +16|Gemma4-26B Q4_K_M|ngl=999 pure-GPU|76.01|76.31|G0:11784|G1:10454|t=6|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU +17|Gemma4-26B Q4_K_M|compare: cpu-moe|10.19|10.49|G0:2652|G1:2982|t=6|ub=512 b=2048|kv=q4_0/q4_0|cpu-moe +18|Gemma4-26B Q4_K_M|t=2|75.67|75.87|G0:11783|G1:10454|t=2|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU +19|Gemma4-26B Q4_K_M|t=4|75.61|75.87|G0:11783|G1:10454|t=4|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU +20|Gemma4-26B Q4_K_M|t=8|75.42|75.59|G0:11783|G1:10454|t=8|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU +21|Gemma4-26B Q4_K_M|t=10|75.71|75.82|G0:11783|G1:10454|t=10|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU +22|Gemma4-26B Q4_K_M|t=12|75.08|75.7|G0:11783|G1:10454|t=12|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU +23|Gemma4-26B Q4_K_M|ub=256 b=1024|75.16|75.64|G0:10559|G1:9662|t=6|ub=256 b=1024|kv=q4_0/q4_0|pure-GPU +24|Gemma4-26B Q4_K_M|ub=256 b=2048|75.68|76.05|G0:10559|G1:9662|t=6|ub=256 b=2048|kv=q4_0/q4_0|pure-GPU +25|Gemma4-26B Q4_K_M|ub=512 b=4096|75.92|76.16|G0:11784|G1:10454|t=6|ub=512 b=4096|kv=q4_0/q4_0|pure-GPU +26|Gemma4-26B Q4_K_M|ub=1024 b=2048|75.7|75.9|G0:11012|G1:9950|t=6|ub=1024 b=2048|kv=q4_0/q4_0|pure-GPU +27|Gemma4-26B Q4_K_M|ub=1024 
b=4096|75.77|75.99|G0:11011|G1:9950|t=6|ub=1024 b=4096|kv=q4_0/q4_0|pure-GPU +28|Gemma4-26B Q4_K_M|kv=q8_0/q8_0|76.3|76.69|G0:10725|G1:10212|t=6|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU +29|Gemma4-26B Q4_K_M|kv=q4_0/q8_0|42.88|44.58|G0:10439|G1:9796|t=6|ub=512 b=2048|kv=q4_0/q8_0|pure-GPU +30|Gemma4-26B Q4_K_M|kv=f16/f16|76.36|76.78|G0:11761|G1:11710|t=6|ub=512 b=2048|kv=f16/f16|pure-GPU +31|Gemma4-26B Q4_K_M|FINAL|76.4|76.75|G0:11761|G1:11710|t=6|ub=512 b=2048|kv=f16/f16|pure-GPU +32|Qwen3.5-35B MXFP4_MOE|n-cpu-moe=5|51.43|52.07|G0:10365|G1:11152|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +33|Qwen3.5-35B MXFP4_MOE|t=2|43.8|46.4|G0:10365|G1:11152|t=2|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +34|Qwen3.5-35B MXFP4_MOE|t=4|49.21|52.78|G0:10353|G1:11152|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +35|Qwen3.5-35B MXFP4_MOE|t=8|46.43|50.49|G0:10397|G1:11152|t=8|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +36|Qwen3.5-35B MXFP4_MOE|t=10|46.12|50.06|G0:10351|G1:11152|t=10|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +37|Qwen3.5-35B MXFP4_MOE|t=12|45.23|47.1|G0:10337|G1:11152|t=12|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +38|Qwen3.5-35B MXFP4_MOE|ub=256 b=1024|48.9|52.3|G0:9834|G1:10906|t=6|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5 +39|Qwen3.5-35B MXFP4_MOE|ub=256 b=2048|49.62|52.52|G0:9833|G1:10906|t=6|ub=256 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +40|Qwen3.5-35B MXFP4_MOE|ub=512 b=4096|48.78|52.14|G0:10337|G1:11152|t=6|ub=512 b=4096|kv=q4_0/q4_0|n-cpu-moe=5 +41|Qwen3.5-35B MXFP4_MOE|ub=1024 b=2048|49.95|52.53|G0:11124|G1:11644|t=6|ub=1024 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +42|Qwen3.5-35B MXFP4_MOE|ub=1024 b=4096|48.75|52.06|G0:11123|G1:11644|t=6|ub=1024 b=4096|kv=q4_0/q4_0|n-cpu-moe=5 +43|Qwen3.5-35B MXFP4_MOE|kv=q4_0/q8_0|42.81|44.14|G0:10681|G1:11472|t=6|ub=512 b=2048|kv=q4_0/q8_0|n-cpu-moe=5 +44|Qwen3.5-35B MXFP4_MOE|FINAL|46.66|47.09|G0:10476|G1:11152|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +45|Qwen3.5-35B Q4_K_M|n-cpu-moe=5|49.01|53.09|G0:10606|G1:11338|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 
+46|Qwen3.5-35B Q4_K_M|t=2|45.73|47.87|G0:10599|G1:11338|t=2|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +47|Qwen3.5-35B Q4_K_M|t=4|50.98|54.33|G0:10601|G1:11338|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +48|Qwen3.5-35B Q4_K_M|t=8|48.45|52.1|G0:10596|G1:11338|t=8|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +49|Qwen3.5-35B Q4_K_M|t=10|47.83|51.45|G0:10595|G1:11338|t=10|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +50|Qwen3.5-35B Q4_K_M|t=12|43.77|46.79|G0:10589|G1:11338|t=12|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +51|Qwen3.5-35B Q4_K_M|ub=256 b=1024|52.14|53.82|G0:10089|G1:11092|t=4|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5 +52|Qwen3.5-35B Q4_K_M|ub=256 b=2048|50.23|53.66|G0:10091|G1:11092|t=4|ub=256 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +53|Qwen3.5-35B Q4_K_M|ub=512 b=2048|49.89|53.89|G0:10595|G1:11338|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5 +54|Qwen3.5-35B Q4_K_M|ub=512 b=4096|50.4|54.19|G0:10564|G1:11338|t=4|ub=512 b=4096|kv=q4_0/q4_0|n-cpu-moe=5 +55|Qwen3.5-35B Q4_K_M|kv=q8_0/q8_0|51.84|53.53|G0:10726|G1:11732|t=4|ub=256 b=1024|kv=q8_0/q8_0|n-cpu-moe=5 +56|Qwen3.5-35B Q4_K_M|kv=q4_0/q8_0|43.22|45.99|G0:10410|G1:11412|t=4|ub=256 b=1024|kv=q4_0/q8_0|n-cpu-moe=5 +57|Qwen3.5-35B Q4_K_M|FINAL|52.05|54.48|G0:10062|G1:11092|t=4|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5 \ No newline at end of file diff --git a/scripts/auto_tune_122b.py b/scripts/auto_tune_122b.py new file mode 100644 index 0000000..2652d53 --- /dev/null +++ b/scripts/auto_tune_122b.py @@ -0,0 +1,372 @@ +""" +Qwen3.5 122B-A10B 자동 정밀 튜닝 스크립트 +=========================================== +각 설정 조합으로 서버를 재시작하고 벤치마크를 자동 수행합니다. +서버 로그에서 순수 eval time (t/s)를 파싱하여 정확한 비교 테이블을 출력합니다. 
+ +예상 소요 시간: 약 30-40분 (5개 설정 × ~6-7분/설정) +""" +import subprocess +import time +import json +import urllib.request +import os +import re +import sys +import datetime + +try: + sys.stdout.reconfigure(encoding='utf-8') +except AttributeError: + pass + +BASE_URL = "http://127.0.0.1:8000" +MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf" +SERVER_EXE = r"llama_bin_run\llama-server.exe" + +# ============================================================ +# 테스트할 설정 목록 +# ============================================================ +# 공통 파라미터 (변경하지 않는 것들) +COMMON_ARGS = [ + "--model", MODEL_PATH, + "-ngl", "999", + "--cpu-moe", + "-c", "2048", + "-np", "1", + "-fa", "on", + "--cache-type-k", "q4_0", + "--cache-type-v", "q4_0", + "-ub", "256", + "-b", "1024", + "--mlock", + "--port", "8000", + "--host", "0.0.0.0", + "--no-warmup", # 워밍업은 벤치마크 스크립트에서 직접 수행 +] + +# 변수 파라미터 조합 +CONFIGS = [ + { + "name": "A) --no-mmap -t 8", + "desc": "서버 권장: mmap 비활성화 (baseline 대비)", + "extra": ["--no-mmap", "-t", "8", "--prio", "2"], + }, + { + "name": "B) --no-mmap -t 6", + "desc": "스레드 감소 (캐시 경합 회피)", + "extra": ["--no-mmap", "-t", "6", "--prio", "2"], + }, + { + "name": "C) --no-mmap -t 10", + "desc": "스레드 증가 (RAM 대역폭 포화)", + "extra": ["--no-mmap", "-t", "10", "--prio", "2"], + }, + { + "name": "D) --no-mmap -t 12", + "desc": "더 많은 스레드", + "extra": ["--no-mmap", "-t", "12", "--prio", "2"], + }, + { + "name": "E) --no-mmap -t 10 --prio 3 --poll 100", + "desc": "최적 스레드 + 리얼타임 우선순위 + 폴링", + "extra": ["--no-mmap", "-t", "10", "--prio", "3", "--poll", "100"], + }, +] + +# ============================================================ +# 유틸리티 함수 +# ============================================================ + +def kill_server(): + """llama-server 프로세스 강제 종료""" + os.system("taskkill /F /IM llama-server.exe >nul 2>&1") + time.sleep(3) + +def start_server(config, log_path): + """서버 시작, 로그를 파일로 리다이렉트""" + cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"] + log_file = 
open(log_path, "w", encoding="utf-8") + proc = subprocess.Popen( + cmd, + stdout=log_file, + stderr=subprocess.STDOUT, + cwd=os.getcwd() + ) + return proc, log_file + +def wait_for_server(timeout=600): + """서버가 준비될 때까지 대기""" + start = time.time() + while time.time() - start < timeout: + try: + req = urllib.request.Request(f"{BASE_URL}/health") + with urllib.request.urlopen(req, timeout=5) as resp: + data = json.loads(resp.read()) + if data.get("status") == "ok": + return True + except: + pass + time.sleep(5) + return False + +def run_single_benchmark(prompt, max_tokens=200): + """단일 벤치마크 실행""" + payload = json.dumps({ + "model": "local-model", + "messages": [{"role": "user", "content": prompt}], + "max_tokens": max_tokens, + "temperature": 0.0 + }).encode("utf-8") + + req = urllib.request.Request( + f"{BASE_URL}/v1/chat/completions", + data=payload, + headers={"Content-Type": "application/json"} + ) + + start = time.time() + with urllib.request.urlopen(req, timeout=600) as resp: + result = json.loads(resp.read()) + elapsed = time.time() - start + + usage = result.get("usage", {}) + completion_tokens = usage.get("completion_tokens", 0) + return completion_tokens, elapsed + +def parse_eval_times(log_path): + """서버 로그에서 순수 eval time 파싱""" + try: + with open(log_path, "r", encoding="utf-8", errors="ignore") as f: + content = f.read() + except: + return [] + + # "eval time = XXXXX.XX ms / NNN tokens (XXX.XX ms per token, XX.XX tokens per second)" + pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)' + matches = re.findall(pattern, content, re.MULTILINE) + + results = [] + for m in matches: + results.append({ + "total_ms": float(m[0]), + "tokens": int(m[1]), + "ms_per_token": float(m[2]), + "tps": float(m[3]) + }) + return results + +def parse_prompt_eval_times(log_path): + """서버 로그에서 prompt eval time 파싱""" + try: + with open(log_path, "r", encoding="utf-8", errors="ignore") as f: + content 
= f.read() + except: + return [] + + pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)' + matches = re.findall(pattern, content, re.MULTILINE) + + results = [] + for m in matches: + results.append({ + "total_ms": float(m[0]), + "tokens": int(m[1]), + "ms_per_token": float(m[2]), + "tps": float(m[3]) + }) + return results + +def parse_vram_usage(log_path): + """서버 로그에서 CUDA0 모델 버퍼 크기 파싱""" + try: + with open(log_path, "r", encoding="utf-8", errors="ignore") as f: + content = f.read() + except: + return "N/A" + + match = re.search(r'CUDA0 model buffer size\s*=\s*([\d.]+)\s*MiB', content) + if match: + return f"{float(match.group(1)):.0f} MiB" + return "N/A" + +# ============================================================ +# 메인 튜닝 루프 +# ============================================================ + +def main(): + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + + print("=" * 70) + print(" Qwen3.5 122B-A10B 자동 정밀 튜닝") + print(f" 시작 시간: {datetime.datetime.now().strftime('%H:%M:%S')}") + print(f" 테스트 설정: {len(CONFIGS)}개") + print(f" 예상 소요: ~{len(CONFIGS) * 7}분") + print("=" * 70) + print() + print(" 기존 Baseline: mmap on, -t 8, --prio 2 → 10.06 t/s (eval)") + print() + + # 결과 저장 + all_results = [] + + for idx, config in enumerate(CONFIGS): + config_start = time.time() + log_path = os.path.join(os.getcwd(), f"tune_log_{idx}.txt") + + print(f"\n{'='*70}") + print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}") + print(f" {config['desc']}") + print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}") + print(f"{'='*70}") + + # 1. 기존 서버 종료 + print(" [1/4] 서버 종료 중...") + kill_server() + + # 2. 새 서버 시작 + print(f" [2/4] 서버 시작 중... (모델 로딩 ~3-5분)") + proc, log_file = start_server(config, log_path) + + # 3. 서버 준비 대기 + if not wait_for_server(timeout=600): + print(" ❌ 서버 시작 실패! 
다음 설정으로 넘어갑니다.") + kill_server() + log_file.close() + all_results.append({ + "config": config["name"], + "status": "FAILED", + "eval_tps": [], + "prompt_tps": [], + "vram": "N/A" + }) + continue + + load_time = time.time() - config_start + print(f" [3/4] 서버 준비 완료! (로딩 {load_time:.0f}초)") + + # 4. 벤치마크 실행 (워밍업 1회 + 본 테스트 3회) + print(" [4/4] 벤치마크 실행 중...") + + # 워밍업 + try: + run_single_benchmark("Say hello.", max_tokens=20) + print(" 워밍업 완료") + except Exception as e: + print(f" 워밍업 실패: {e}") + + # 본 테스트 3회 + prompts = [ + "Write a detailed explanation of how neural networks learn through backpropagation and gradient descent.", + "Explain the complete process of photosynthesis including light and dark reactions in detail.", + "Describe the differences between SQL and NoSQL databases with examples and performance characteristics.", + ] + + for i, prompt in enumerate(prompts): + try: + tokens, elapsed = run_single_benchmark(prompt, max_tokens=200) + approx_tps = tokens / elapsed if elapsed > 0 else 0 + print(f" Run {i+1}/3: {tokens} tokens, {elapsed:.1f}s, ~{approx_tps:.2f} t/s (approx)") + except Exception as e: + print(f" Run {i+1}/3: ERROR - {e}") + + # 서버 종료 전에 로그 플러시를 위해 잠시 대기 + time.sleep(2) + + # 서버 종료 + kill_server() + log_file.close() + time.sleep(2) + + # 로그 파싱 + eval_times = parse_eval_times(log_path) + prompt_times = parse_prompt_eval_times(log_path) + vram = parse_vram_usage(log_path) + + # 워밍업 제외 (첫 번째 결과) + if len(eval_times) > 1: + bench_evals = eval_times[1:] # 워밍업 제외 + else: + bench_evals = eval_times + + if len(prompt_times) > 1: + bench_prompts = prompt_times[1:] + else: + bench_prompts = prompt_times + + eval_speeds = [e["tps"] for e in bench_evals] + prompt_speeds = [p["tps"] for p in bench_prompts] + + result = { + "config": config["name"], + "status": "OK", + "eval_tps": eval_speeds, + "prompt_tps": prompt_speeds, + "vram": vram, + } + all_results.append(result) + + config_elapsed = time.time() - config_start + print(f"\n 완료! 
소요: {config_elapsed:.0f}초") + + if eval_speeds: + avg_eval = sum(eval_speeds) / len(eval_speeds) + max_eval = max(eval_speeds) + print(f" 📊 Eval TPS: avg={avg_eval:.2f}, max={max_eval:.2f}") + + # ============================================================ + # 최종 결과 비교 테이블 + # ============================================================ + print("\n") + print("=" * 80) + print(" 🏆 최종 결과 비교 테이블") + print("=" * 80) + print() + + # 기존 baseline 추가 + print(f" {'설정':<45} {'Eval t/s':>10} {'최대':>8} {'Prompt t/s':>12} {'VRAM':>12}") + print(f" {'-'*45} {'-'*10} {'-'*8} {'-'*12} {'-'*12}") + + # Baseline (이전 결과) + print(f" {'[기준] mmap on, -t 8, --prio 2':<45} {'10.02':>10} {'10.06':>8} {'29.52':>12} {'5392 MiB':>12}") + + best_avg = 0 + best_config = "" + + for r in all_results: + if r["status"] != "OK" or not r["eval_tps"]: + print(f" {r['config']:<45} {'FAILED':>10} {'':>8} {'':>12} {r['vram']:>12}") + continue + + avg_e = sum(r["eval_tps"]) / len(r["eval_tps"]) + max_e = max(r["eval_tps"]) + avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0 + + if avg_e > best_avg: + best_avg = avg_e + best_config = r["config"] + + marker = " ⭐" if avg_e > 10.06 else "" + print(f" {r['config']:<45} {avg_e:>10.2f} {max_e:>8.2f} {avg_p:>12.2f} {r['vram']:>12}{marker}") + + print() + if best_avg > 0: + improvement = ((best_avg - 10.02) / 10.02) * 100 + print(f" 🏆 최고 성능: {best_config}") + print(f" → {best_avg:.2f} t/s (기준 10.02 t/s 대비 {improvement:+.1f}%)") + + print() + print(f" 완료 시간: {datetime.datetime.now().strftime('%H:%M:%S')}") + print("=" * 80) + + # 결과를 파일로도 저장 + result_path = os.path.join(os.getcwd(), f"tune_results_{timestamp}.txt") + with open(result_path, "w", encoding="utf-8") as f: + f.write("Qwen3.5 122B-A10B Fine Tuning Results\n") + f.write(f"Date: {timestamp}\n\n") + for r in all_results: + f.write(f"{r['config']}: {r['eval_tps']} (VRAM: {r['vram']})\n") + print(f" 결과 저장: {result_path}") + +if __name__ == "__main__": + main() diff --git 
a/scripts/auto_tune_122b_r2.py b/scripts/auto_tune_122b_r2.py new file mode 100644 index 0000000..f603ed7 --- /dev/null +++ b/scripts/auto_tune_122b_r2.py @@ -0,0 +1,257 @@ +""" +Qwen3.5 122B-A10B 정밀 튜닝 2라운드 +==================================== +1라운드 결과: mmap on이 더 빠르고, 스레드 수가 적을수록 빠름 +→ mmap on 상태에서 스레드 수 4~8 범위를 정밀 탐색 +""" +import subprocess +import time +import json +import urllib.request +import os +import re +import sys +import datetime + +try: + sys.stdout.reconfigure(encoding='utf-8') +except AttributeError: + pass + +BASE_URL = "http://127.0.0.1:8000" +MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf" +SERVER_EXE = r"llama_bin_run\llama-server.exe" + +COMMON_ARGS = [ + "--model", MODEL_PATH, + "-ngl", "999", + "--cpu-moe", + "-c", "2048", + "-np", "1", + "-fa", "on", + "--cache-type-k", "q4_0", + "--cache-type-v", "q4_0", + "-ub", "256", + "-b", "1024", + "--mlock", + "--port", "8000", + "--host", "0.0.0.0", + "--no-warmup", +] + +CONFIGS = [ + { + "name": "F) mmap on, -t 4", + "desc": "최소 스레드 (4개, 물리코어 절반)", + "extra": ["-t", "4", "--prio", "2"], + }, + { + "name": "G) mmap on, -t 5", + "desc": "스레드 5개", + "extra": ["-t", "5", "--prio", "2"], + }, + { + "name": "H) mmap on, -t 6", + "desc": "스레드 6개 (--no-mmap에서 최고였음)", + "extra": ["-t", "6", "--prio", "2"], + }, + { + "name": "I) mmap on, -t 7", + "desc": "스레드 7개", + "extra": ["-t", "7", "--prio", "2"], + }, + { + "name": "J) mmap on, -t 6, --prio 3", + "desc": "최적 스레드 + 리얼타임 우선순위", + "extra": ["-t", "6", "--prio", "3"], + }, +] + +def kill_server(): + os.system("taskkill /F /IM llama-server.exe >nul 2>&1") + time.sleep(3) + +def start_server(config, log_path): + cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"] + log_file = open(log_path, "w", encoding="utf-8") + proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, cwd=os.getcwd()) + return proc, log_file + +def wait_for_server(timeout=600): + start = time.time() + while time.time() - start < timeout: + try: + 
req = urllib.request.Request(f"{BASE_URL}/health") + with urllib.request.urlopen(req, timeout=5) as resp: + data = json.loads(resp.read()) + if data.get("status") == "ok": + return True + except: + pass + time.sleep(5) + return False + +def run_single_benchmark(prompt, max_tokens=200): + payload = json.dumps({ + "model": "local-model", + "messages": [{"role": "user", "content": prompt}], + "max_tokens": max_tokens, + "temperature": 0.0 + }).encode("utf-8") + req = urllib.request.Request( + f"{BASE_URL}/v1/chat/completions", + data=payload, + headers={"Content-Type": "application/json"} + ) + start = time.time() + with urllib.request.urlopen(req, timeout=600) as resp: + result = json.loads(resp.read()) + elapsed = time.time() - start + usage = result.get("usage", {}) + return usage.get("completion_tokens", 0), elapsed + +def parse_eval_times(log_path): + try: + with open(log_path, "r", encoding="utf-8", errors="ignore") as f: + content = f.read() + except: + return [] + pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)' + matches = re.findall(pattern, content, re.MULTILINE) + return [{"tps": float(m[3]), "tokens": int(m[1])} for m in matches] + +def parse_prompt_eval_times(log_path): + try: + with open(log_path, "r", encoding="utf-8", errors="ignore") as f: + content = f.read() + except: + return [] + pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)' + matches = re.findall(pattern, content, re.MULTILINE) + return [{"tps": float(m[3])} for m in matches] + +def main(): + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + + print("=" * 70) + print(" Qwen3.5 122B-A10B 정밀 튜닝 - 2라운드") + print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}") + print(f" 테스트: {len(CONFIGS)}개 설정 (mmap on + 스레드 4~7 정밀 탐색)") + print("=" * 70) + print() + + all_results = [] + + for idx, config in 
enumerate(CONFIGS): + config_start = time.time() + log_path = os.path.join(os.getcwd(), f"tune_r2_log_{idx}.txt") + + print(f"\n{'='*70}") + print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}") + print(f" {config['desc']}") + print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}") + print(f"{'='*70}") + + kill_server() + print(f" [1/3] 서버 시작 중...") + proc, log_file = start_server(config, log_path) + + if not wait_for_server(timeout=600): + print(" ❌ 서버 시작 실패!") + kill_server() + log_file.close() + all_results.append({"config": config["name"], "status": "FAILED", "eval_tps": []}) + continue + + load_time = time.time() - config_start + print(f" [2/3] 서버 준비 완료! ({load_time:.0f}초)") + + # 워밍업 + 벤치마크 + try: + run_single_benchmark("Say hello.", max_tokens=20) + except: + pass + + print(" [3/3] 벤치마크 3회...") + prompts = [ + "Write a detailed explanation of how neural networks learn through backpropagation.", + "Explain the complete process of photosynthesis including light and dark reactions.", + "Describe the differences between SQL and NoSQL databases with examples.", + ] + for i, prompt in enumerate(prompts): + try: + tokens, elapsed = run_single_benchmark(prompt, max_tokens=200) + print(f" Run {i+1}: {tokens}tok, {elapsed:.1f}s, ~{tokens/elapsed:.2f} t/s") + except Exception as e: + print(f" Run {i+1}: ERROR - {e}") + + time.sleep(2) + kill_server() + log_file.close() + time.sleep(2) + + eval_times = parse_eval_times(log_path) + prompt_times = parse_prompt_eval_times(log_path) + bench_evals = eval_times[1:] if len(eval_times) > 1 else eval_times + bench_prompts = prompt_times[1:] if len(prompt_times) > 1 else prompt_times + + eval_speeds = [e["tps"] for e in bench_evals] + prompt_speeds = [p["tps"] for p in bench_prompts] + + all_results.append({ + "config": config["name"], + "status": "OK", + "eval_tps": eval_speeds, + "prompt_tps": prompt_speeds, + }) + + if eval_speeds: + print(f" 📊 Eval: avg={sum(eval_speeds)/len(eval_speeds):.2f}, 
max={max(eval_speeds):.2f}") + + # 최종 결과 + print("\n") + print("=" * 85) + print(" 🏆 전체 튜닝 결과 (1라운드 + 2라운드 통합)") + print("=" * 85) + print() + print(f" {'설정':<48} {'Avg t/s':>8} {'Max t/s':>8} {'Prompt':>8}") + print(f" {'-'*48} {'-'*8} {'-'*8} {'-'*8}") + + # 1라운드 결과 (하드코딩) + r1 = [ + ("[기준] mmap on, -t 8, --prio 2", 10.02, 10.06, 29.52), + ("A) --no-mmap -t 8", 9.66, 9.70, 28.26), + ("B) --no-mmap -t 6", 10.02, 10.18, 26.73), + ("C) --no-mmap -t 10", 9.42, 9.46, 27.31), + ("D) --no-mmap -t 12", 9.04, 9.11, 27.92), + ("E) --no-mmap -t 10 --prio 3 --poll 100", 9.41, 9.45, 28.37), + ] + for name, avg, mx, pp in r1: + marker = " ⭐" if avg >= 10.0 else "" + print(f" {name:<48} {avg:>8.2f} {mx:>8.2f} {pp:>8.2f}{marker}") + + print(f" {'--- 2라운드 ---':<48}") + + best_avg = 10.06 # 기존 최고 + best_config = "[기준] mmap on, -t 8" + + for r in all_results: + if r["status"] != "OK" or not r["eval_tps"]: + print(f" {r['config']:<48} {'FAIL':>8}") + continue + avg_e = sum(r["eval_tps"]) / len(r["eval_tps"]) + max_e = max(r["eval_tps"]) + avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0 + if max_e > best_avg: + best_avg = max_e + best_config = r["config"] + marker = " ⭐" if avg_e >= 10.0 else "" + print(f" {r['config']:<48} {avg_e:>8.2f} {max_e:>8.2f} {avg_p:>8.2f}{marker}") + + print() + print(f" 🏆 최고 성능: {best_config} → {best_avg:.2f} t/s") + print(f" 완료: {datetime.datetime.now().strftime('%H:%M:%S')}") + print("=" * 85) + +if __name__ == "__main__": + main() diff --git a/scripts/auto_tune_gemma4_256k.py b/scripts/auto_tune_gemma4_256k.py new file mode 100644 index 0000000..80d20fa --- /dev/null +++ b/scripts/auto_tune_gemma4_256k.py @@ -0,0 +1,339 @@ +""" +Gemma4 26B-A4B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB +Phase 1: -ngl sweep (GPU layers) +Phase 2: -t / -tb sweep (CPU threads) +Phase 3: -ub / -b sweep (batch sizes) +Phase 4: --cache-type-k/v sweep (KV cache precision) +Phase 5: --no-mmap, --poll, --prio sweep (misc) +Each phase 
fixes the best from previous phases. +""" +import subprocess +import time +import json +import urllib.request +import sys +import os +import itertools + +try: + sys.stdout.reconfigure(encoding='utf-8') +except AttributeError: + pass + +BASE_URL = "http://127.0.0.1:8000" +LLAMA_SERVER = r"llama_bin_run\llama-server.exe" +MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf" +CONTEXT = 262144 +BENCHMARK_RUNS = 3 +BENCHMARK_TOKENS = 200 + +# ─── Baseline (from previous tuning at -c 4096) ─── +BEST = { + "ngl": 22, + "t": 8, + "tb": 8, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": True, + "mmap": True, + "prio": 2, + "poll": 50, +} + +ALL_RESULTS = [] + + +def kill_server(): + subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], + capture_output=True) + time.sleep(4) + + +def build_cmd(cfg): + cmd = [LLAMA_SERVER, "--model", MODEL, + "-ngl", str(cfg["ngl"]), + "-c", str(CONTEXT), + "-np", "1", + "-fa", cfg["fa"], + "--cache-type-k", cfg["ctk"], + "--cache-type-v", cfg["ctv"], + "-ub", str(cfg["ub"]), + "-b", str(cfg["b"]), + "-t", str(cfg["t"]), + "-tb", str(cfg["tb"]), + "--prio", str(cfg["prio"]), + "--poll", str(cfg["poll"]), + "--port", "8000", + "--host", "0.0.0.0"] + if cfg["mlock"]: + cmd.append("--mlock") + if not cfg["mmap"]: + cmd.append("--no-mmap") + return cmd + + +def start_server(cfg): + cmd = build_cmd(cfg) + proc = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + cwd=os.getcwd(), text=True, encoding='utf-8', errors='replace' + ) + return proc + + +def wait_for_server(timeout=180): + start = time.time() + while time.time() - start < timeout: + try: + req = urllib.request.Request(f"{BASE_URL}/health") + with urllib.request.urlopen(req, timeout=3) as resp: + data = json.loads(resp.read()) + if data.get("status") == "ok": + return True + except: + pass + time.sleep(2) + return False + + +def run_benchmark(max_tokens=BENCHMARK_TOKENS): + payload = json.dumps({ + "model": "local-model", + 
"messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}], + "max_tokens": max_tokens, + "temperature": 0.0 + }).encode("utf-8") + + req = urllib.request.Request( + f"{BASE_URL}/v1/chat/completions", + data=payload, + headers={"Content-Type": "application/json"} + ) + + start = time.time() + with urllib.request.urlopen(req, timeout=300) as resp: + result = json.loads(resp.read()) + elapsed = time.time() - start + + usage = result.get("usage", {}) + ct = usage.get("completion_tokens", 0) + return ct / elapsed if elapsed > 0 else 0 + + +def get_vram(): + try: + r = subprocess.run( + ["nvidia-smi", "--query-gpu=memory.used,memory.total", + "--format=csv,noheader,nounits"], + capture_output=True, text=True, timeout=5 + ) + parts = r.stdout.strip().split(",") + return int(parts[0].strip()), int(parts[1].strip()) + except: + return 0, 0 + + +def test_config(cfg, label=""): + kill_server() + desc = label or str(cfg) + print(f" [{desc}] Starting server...") + proc = start_server(cfg) + + if not wait_for_server(): + print(f" [{desc}] FAILED to start") + proc.kill() + return None + + vram_used, vram_total = get_vram() + print(f" [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True) + + # Warmup + try: + run_benchmark(max_tokens=20) + except: + pass + + # Benchmark + speeds = [] + for i in range(BENCHMARK_RUNS): + try: + tps = run_benchmark() + speeds.append(tps) + except Exception as e: + print(f"ERR({e}) ", end="", flush=True) + + proc.kill() + + if not speeds: + print("ALL FAILED") + return None + + avg = sum(speeds) / len(speeds) + best = max(speeds) + print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s") + + result = {**cfg, "avg_tps": avg, "best_tps": best, + "vram_used": vram_used, "vram_total": vram_total, "label": label} + ALL_RESULTS.append(result) + return result + + +def phase_sweep(phase_name, param_name, values, base_cfg): + print(f"\n{'='*70}") + print(f" PHASE: {phase_name}") + print(f" Sweeping: 
{param_name} = {values}") + print(f"{'='*70}") + + best_result = None + for val in values: + cfg = {**base_cfg} + if isinstance(param_name, list): + for p, v in zip(param_name, val): + cfg[p] = v + label = " | ".join(f"{p}={v}" for p, v in zip(param_name, val)) + else: + cfg[param_name] = val + label = f"{param_name}={val}" + + r = test_config(cfg, label) + if r and (best_result is None or r["avg_tps"] > best_result["avg_tps"]): + best_result = r + + if best_result: + print(f"\n ★ Phase winner: {best_result['label']} → {best_result['avg_tps']:.2f} t/s") + return best_result + + +def main(): + print("=" * 70) + print(" Gemma4 26B-A4B COMPREHENSIVE Auto-Tuner") + print(" 256K Context | RTX 3060 12GB") + print("=" * 70) + print() + + cfg = dict(BEST) + + # ─── Phase 1: -ngl (already done, quick verify top 3) ─── + r = phase_sweep("GPU Layers (-ngl)", "ngl", [22, 21, 20], cfg) + if r: + cfg["ngl"] = r["ngl"] + + # ─── Phase 2: CPU threads (-t, -tb) ─── + thread_combos = [ + (2, 2), (4, 4), (4, 8), (6, 6), (6, 8), + (8, 8), (8, 12), (10, 10), (12, 12), (16, 16) + ] + r = phase_sweep("CPU Threads (-t, -tb)", ["t", "tb"], thread_combos, cfg) + if r: + cfg["t"] = r["t"] + cfg["tb"] = r["tb"] + + # ─── Phase 3: Batch sizes (-ub, -b) ─── + batch_combos = [ + (128, 512), (256, 1024), (256, 2048), + (512, 1024), (512, 2048), (512, 4096), + (1024, 2048), (1024, 4096) + ] + r = phase_sweep("Batch Sizes (-ub, -b)", ["ub", "b"], batch_combos, cfg) + if r: + cfg["ub"] = r["ub"] + cfg["b"] = r["b"] + + # ─── Phase 4: KV cache precision ─── + kv_combos = [ + ("q4_0", "q4_0"), + ("q8_0", "q8_0"), + ("q4_0", "q8_0"), + ("f16", "f16"), + ] + r = phase_sweep("KV Cache Type (-ctk, -ctv)", ["ctk", "ctv"], kv_combos, cfg) + if r: + cfg["ctk"] = r["ctk"] + cfg["ctv"] = r["ctv"] + + # ─── Phase 5: Misc (mmap, poll, prio) ─── + misc_combos = [ + (True, 50, 2), # baseline + (False, 50, 2), # no-mmap + (True, 0, 2), # no polling + (True, 100, 2), # max polling + (True, 50, 3), # realtime 
priority + (False, 0, 3), # no-mmap + no-poll + realtime + ] + r = phase_sweep("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], misc_combos, cfg) + if r: + cfg["mmap"] = r["mmap"] + cfg["poll"] = r["poll"] + cfg["prio"] = r["prio"] + + # ─── Final Report ─── + print() + print("=" * 70) + print(" FINAL OPTIMAL CONFIGURATION") + print("=" * 70) + print(f" ngl: {cfg['ngl']}") + print(f" threads: -t {cfg['t']} -tb {cfg['tb']}") + print(f" batch: -ub {cfg['ub']} -b {cfg['b']}") + print(f" kv cache: -ctk {cfg['ctk']} -ctv {cfg['ctv']}") + print(f" flash: -fa {cfg['fa']}") + print(f" mlock: {'yes' if cfg['mlock'] else 'no'}") + print(f" mmap: {'yes' if cfg['mmap'] else 'no (--no-mmap)'}") + print(f" prio: {cfg['prio']}") + print(f" poll: {cfg['poll']}") + print() + + # Final verification run + print(" Running final verification (5 runs)...") + kill_server() + proc = start_server(cfg) + wait_for_server() + try: + run_benchmark(max_tokens=20) + except: + pass + final_speeds = [] + for i in range(5): + try: + tps = run_benchmark() + final_speeds.append(tps) + print(f" Run {i+1}: {tps:.2f} t/s") + except: + pass + proc.kill() + + if final_speeds: + avg = sum(final_speeds) / len(final_speeds) + best = max(final_speeds) + print(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s") + + print() + cmd_parts = [ + f"llama-server --model {MODEL}", + f"-ngl {cfg['ngl']} -c {CONTEXT}", + f"-t {cfg['t']} -tb {cfg['tb']}", + f"-ub {cfg['ub']} -b {cfg['b']}", + f"-fa {cfg['fa']}", + f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}", + f"--prio {cfg['prio']} --poll {cfg['poll']}", + ] + if cfg["mlock"]: + cmd_parts.append("--mlock") + if not cfg["mmap"]: + cmd_parts.append("--no-mmap") + cmd_parts.append("--port 8000 --host 0.0.0.0") + + print(" Recommended command:") + print(f" {' '.join(cmd_parts)}") + print("=" * 70) + + # Dump all results to JSON + with open("scripts/tune_results_gemma4_256k.json", "w") as f: + json.dump(ALL_RESULTS, f, indent=2, default=str) + 
print(f"\n Full results saved: scripts/tune_results_gemma4_256k.json") + + +if __name__ == "__main__": + main() diff --git a/scripts/auto_tune_gemma4_ncpumoe.py b/scripts/auto_tune_gemma4_ncpumoe.py new file mode 100644 index 0000000..a38b027 --- /dev/null +++ b/scripts/auto_tune_gemma4_ncpumoe.py @@ -0,0 +1,163 @@ +""" +Gemma4 26B --n-cpu-moe sweep + secondary param tuning at 256K context. +Gemma4 has 30 layers. Sweep n-cpu-moe from 0 to 30. +""" +import subprocess, time, json, urllib.request, sys, os + +try: + sys.stdout.reconfigure(encoding='utf-8') +except: + pass + +BASE_URL = "http://127.0.0.1:8000" +SERVER = r"llama_bin_run\llama-server.exe" +MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf" +CTX = 262144 +RUNS = 3 + + +def kill(): + subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True) + time.sleep(4) + + +def start(ncpumoe, t=4, ub=512, b=2048, ctk="q4_0", ctv="q4_0", prio=3, nommap=False): + cmd = [SERVER, "--model", MODEL, "-ngl", "999", + "-c", str(CTX), "-np", "1", "-fa", "on", + "--cache-type-k", ctk, "--cache-type-v", ctv, + "-ub", str(ub), "-b", str(b), "-t", str(t), "-tb", str(t), + "--prio", str(prio), "--poll", "50", + "--mlock", "--port", "8000", "--host", "0.0.0.0"] + if ncpumoe > 0: + cmd.extend(["--n-cpu-moe", str(ncpumoe)]) + if nommap: + cmd.append("--no-mmap") + return subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + cwd=os.getcwd(), text=True, encoding='utf-8', errors='replace') + + +def wait_ready(timeout=240): + t0 = time.time() + while time.time() - t0 < timeout: + try: + with urllib.request.urlopen(urllib.request.Request(f"{BASE_URL}/health"), timeout=3) as r: + if json.loads(r.read()).get("status") == "ok": + return True + except: + pass + time.sleep(2) + return False + + +def bench(n=200): + p = json.dumps({"model": "m", "messages": [{"role": "user", + "content": "Count from 1 to 50, each number on new line."}], + "max_tokens": n, "temperature": 0.0}).encode() + r = 
urllib.request.Request(f"{BASE_URL}/v1/chat/completions", data=p, + headers={"Content-Type": "application/json"}) + t0 = time.time() + with urllib.request.urlopen(r, timeout=300) as resp: + res = json.loads(resp.read()) + dt = time.time() - t0 + ct = res.get("usage", {}).get("completion_tokens", 0) + return ct / dt if dt > 0 else 0 + + +def vram(): + try: + r = subprocess.run(["nvidia-smi", "--query-gpu=memory.used,memory.total", + "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5) + a, b = r.stdout.strip().split(",") + return int(a.strip()), int(b.strip()) + except: + return 0, 0 + + +def test(label, ncpumoe, **kw): + kill() + print(f" [{label}] Starting...", end=" ", flush=True) + p = start(ncpumoe, **kw) + if not wait_ready(): + print("FAILED"); p.kill(); return None + vu, vt = vram() + print(f"VRAM:{vu}/{vt} | ", end="", flush=True) + try: bench(20) + except: pass + speeds = [] + for _ in range(RUNS): + try: speeds.append(bench()) + except: pass + p.kill() + if not speeds: + print("BENCH FAILED"); return None + avg, best = sum(speeds)/len(speeds), max(speeds) + print(f"AVG:{avg:.1f} BEST:{best:.1f} t/s") + return {"label": label, "ncpumoe": ncpumoe, "avg": avg, "best": best, + "vram": vu, **kw} + + +def main(): + print("=" * 60) + print(" Gemma4 26B 256K | --n-cpu-moe Sweep + Param Tune") + print("=" * 60) + results = [] + + # Phase 1: n-cpu-moe sweep (0, 5, 10, 15, 20, 25, 30) + print("\n--- Phase 1: --n-cpu-moe sweep ---") + for n in [0, 5, 10, 15, 20, 25, 30]: + nm = n > 15 # use --no-mmap when heavy CPU offload + r = test(f"ncpumoe={n}", n, nommap=nm) + if r: results.append(r) + + # Find best n-cpu-moe + best_r = max(results, key=lambda x: x["avg"]) + best_n = best_r["ncpumoe"] + print(f"\n ★ Best n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s") + + # Fine-tune around best + if best_n > 0: + print(f"\n--- Phase 1b: Fine-tune around ncpumoe={best_n} ---") + for n in [max(0, best_n-3), max(0, best_n-1), best_n+1, min(30, best_n+3)]: + 
if n == best_n: continue + nm = n > 15 + r = test(f"ncpumoe={n}", n, nommap=nm) + if r: results.append(r) + best_r = max(results, key=lambda x: x["avg"]) + best_n = best_r["ncpumoe"] + print(f"\n ★ Refined n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s") + + # Phase 2: Thread sweep at best n-cpu-moe + nm = best_n > 15 + print(f"\n--- Phase 2: Thread sweep (ncpumoe={best_n}) ---") + for t in [2, 4, 6, 8, 10]: + r = test(f"t={t}", best_n, t=t, nommap=nm) + if r: results.append(r) + best_t = max([x for x in results if x["ncpumoe"]==best_n], key=lambda x: x["avg"]) + bt = best_t.get("t", 4) + print(f"\n ★ Best threads: {bt}") + + # Phase 3: Batch sweep + print(f"\n--- Phase 3: Batch sweep ---") + for ub, b in [(256, 1024), (512, 2048), (512, 4096), (1024, 2048)]: + r = test(f"ub={ub},b={b}", best_n, t=bt, ub=ub, b=b, nommap=nm) + if r: results.append(r) + + # Phase 4: KV cache type + print(f"\n--- Phase 4: KV cache type ---") + for ctk, ctv in [("q4_0","q4_0"), ("q8_0","q8_0")]: + r = test(f"kv={ctk}", best_n, t=bt, ctk=ctk, ctv=ctv, nommap=nm) + if r: results.append(r) + + # Final report + best_all = max(results, key=lambda x: x["avg"]) + print(f"\n{'='*60}") + print(f" FINAL BEST: {best_all['label']} → {best_all['avg']:.1f} t/s (VRAM: {best_all['vram']})") + print(f"{'='*60}") + + with open("scripts/tune_results_gemma4_ncpumoe.json", "w") as f: + json.dump(results, f, indent=2, default=str) + print(" Saved: scripts/tune_results_gemma4_ncpumoe.json") + + +if __name__ == "__main__": + main() diff --git a/scripts/auto_tune_qwen35b_256k.py b/scripts/auto_tune_qwen35b_256k.py new file mode 100644 index 0000000..2a58847 --- /dev/null +++ b/scripts/auto_tune_qwen35b_256k.py @@ -0,0 +1,335 @@ +""" +Qwen3.5 35B-A3B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB +Based on existing optimized setting: --cpu-moe -ngl 999 -c 4096 -t 6 (35 t/s) +Now tuning for -c 262144 (256K context). 
+ +Phase 1: --cpu-moe vs no --cpu-moe baseline +Phase 2: -t / -tb sweep +Phase 3: -ub / -b sweep +Phase 4: --cache-type-k/v sweep +Phase 5: Misc (mmap, poll, prio) +""" +import subprocess +import time +import json +import urllib.request +import sys +import os + +try: + sys.stdout.reconfigure(encoding='utf-8') +except AttributeError: + pass + +BASE_URL = "http://127.0.0.1:8000" +LLAMA_SERVER = r"llama_bin_run\llama-server.exe" +MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf" +CONTEXT = 262144 +BENCHMARK_RUNS = 3 +BENCHMARK_TOKENS = 200 + +BEST = { + "ngl": 999, + "cpu_moe": True, + "t": 6, + "tb": 6, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": True, + "mmap": True, + "prio": 2, + "poll": 50, +} + +ALL_RESULTS = [] + + +def kill_server(): + subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True) + time.sleep(4) + + +def build_cmd(cfg): + cmd = [LLAMA_SERVER, "--model", MODEL, + "-ngl", str(cfg["ngl"]), + "-c", str(CONTEXT), + "-np", "1", + "-fa", cfg["fa"], + "--cache-type-k", cfg["ctk"], + "--cache-type-v", cfg["ctv"], + "-ub", str(cfg["ub"]), + "-b", str(cfg["b"]), + "-t", str(cfg["t"]), + "-tb", str(cfg["tb"]), + "--prio", str(cfg["prio"]), + "--poll", str(cfg["poll"]), + "--port", "8000", + "--host", "0.0.0.0"] + if cfg.get("cpu_moe"): + cmd.append("--cpu-moe") + if cfg["mlock"]: + cmd.append("--mlock") + if not cfg["mmap"]: + cmd.append("--no-mmap") + return cmd + + +def start_server(cfg): + cmd = build_cmd(cfg) + proc = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + cwd=os.getcwd(), text=True, encoding='utf-8', errors='replace' + ) + return proc + + +def wait_for_server(timeout=240): + start = time.time() + while time.time() - start < timeout: + try: + req = urllib.request.Request(f"{BASE_URL}/health") + with urllib.request.urlopen(req, timeout=3) as resp: + data = json.loads(resp.read()) + if data.get("status") == "ok": + return True + except: + pass + time.sleep(2) 
+ return False + + +def run_benchmark(max_tokens=BENCHMARK_TOKENS): + payload = json.dumps({ + "model": "local-model", + "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}], + "max_tokens": max_tokens, + "temperature": 0.0 + }).encode("utf-8") + + req = urllib.request.Request( + f"{BASE_URL}/v1/chat/completions", + data=payload, + headers={"Content-Type": "application/json"} + ) + + start = time.time() + with urllib.request.urlopen(req, timeout=300) as resp: + result = json.loads(resp.read()) + elapsed = time.time() - start + + usage = result.get("usage", {}) + ct = usage.get("completion_tokens", 0) + return ct / elapsed if elapsed > 0 else 0 + + +def get_vram(): + try: + r = subprocess.run( + ["nvidia-smi", "--query-gpu=memory.used,memory.total", + "--format=csv,noheader,nounits"], + capture_output=True, text=True, timeout=5 + ) + parts = r.stdout.strip().split(",") + return int(parts[0].strip()), int(parts[1].strip()) + except: + return 0, 0 + + +def test_config(cfg, label=""): + kill_server() + desc = label or str(cfg) + print(f" [{desc}] Starting server...", flush=True) + proc = start_server(cfg) + + if not wait_for_server(): + print(f" [{desc}] FAILED to start") + proc.kill() + return None + + vram_used, vram_total = get_vram() + print(f" [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True) + + # Warmup + try: + run_benchmark(max_tokens=20) + except: + pass + + speeds = [] + for i in range(BENCHMARK_RUNS): + try: + tps = run_benchmark() + speeds.append(tps) + except Exception as e: + print(f"ERR({e}) ", end="", flush=True) + + proc.kill() + + if not speeds: + print("ALL FAILED") + return None + + avg = sum(speeds) / len(speeds) + best = max(speeds) + print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s") + + result = {**{k: v for k, v in cfg.items()}, "avg_tps": avg, "best_tps": best, + "vram_used": vram_used, "vram_total": vram_total, "label": label} + ALL_RESULTS.append(result) + return result + + 
+def phase_sweep(phase_name, param_name, values, base_cfg): + print(f"\n{'='*70}") + print(f" PHASE: {phase_name}") + print(f" Sweeping: {param_name} = {values}") + print(f"{'='*70}") + + best_result = None + for val in values: + cfg = {**base_cfg} + if isinstance(param_name, list): + for p, v in zip(param_name, val): + cfg[p] = v + label = " | ".join(f"{p}={v}" for p, v in zip(param_name, val)) + else: + cfg[param_name] = val + label = f"{param_name}={val}" + + r = test_config(cfg, label) + if r and (best_result is None or r["avg_tps"] > best_result["avg_tps"]): + best_result = r + + if best_result: + print(f"\n ★ Phase winner: {best_result['label']} → {best_result['avg_tps']:.2f} t/s") + return best_result + + +def main(): + print("=" * 70) + print(" Qwen3.5 35B-A3B COMPREHENSIVE Auto-Tuner") + print(" 256K Context | RTX 3060 12GB") + print(" Baseline: --cpu-moe -ngl 999 -c 4096 -t 6 → 35 t/s") + print("=" * 70) + print() + + cfg = dict(BEST) + + # ─── Phase 1: --cpu-moe critical test ─── + r = phase_sweep("CPU-MoE Mode", "cpu_moe", [True, False], cfg) + if r: + cfg["cpu_moe"] = r["cpu_moe"] + + # ─── Phase 2: CPU threads ─── + thread_combos = [ + (2, 2), (4, 4), (4, 6), (6, 6), (6, 8), + (8, 8), (8, 12), (10, 10), (12, 12) + ] + r = phase_sweep("CPU Threads (-t, -tb)", ["t", "tb"], thread_combos, cfg) + if r: + cfg["t"] = r["t"] + cfg["tb"] = r["tb"] + + # ─── Phase 3: Batch sizes ─── + batch_combos = [ + (128, 512), (256, 1024), (256, 2048), + (512, 1024), (512, 2048), (512, 4096), + (1024, 2048), (1024, 4096) + ] + r = phase_sweep("Batch Sizes (-ub, -b)", ["ub", "b"], batch_combos, cfg) + if r: + cfg["ub"] = r["ub"] + cfg["b"] = r["b"] + + # ─── Phase 4: KV cache ─── + kv_combos = [ + ("q4_0", "q4_0"), + ("q8_0", "q8_0"), + ("f16", "f16"), + ] + r = phase_sweep("KV Cache Type", ["ctk", "ctv"], kv_combos, cfg) + if r: + cfg["ctk"] = r["ctk"] + cfg["ctv"] = r["ctv"] + + # ─── Phase 5: Misc ─── + misc_combos = [ + (True, 50, 2), + (False, 50, 2), + (True, 0, 2), 
+ (True, 100, 2), + (True, 50, 3), + ] + r = phase_sweep("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], misc_combos, cfg) + if r: + cfg["mmap"] = r["mmap"] + cfg["poll"] = r["poll"] + cfg["prio"] = r["prio"] + + # ─── Final Report ─── + print() + print("=" * 70) + print(" FINAL OPTIMAL CONFIGURATION") + print("=" * 70) + for k, v in cfg.items(): + print(f" {k:>12}: {v}") + print() + + # Final verification + print(" Running final verification (5 runs)...") + kill_server() + proc = start_server(cfg) + wait_for_server() + try: + run_benchmark(max_tokens=20) + except: + pass + final_speeds = [] + for i in range(5): + try: + tps = run_benchmark() + final_speeds.append(tps) + print(f" Run {i+1}: {tps:.2f} t/s") + except: + pass + proc.kill() + + if final_speeds: + avg = sum(final_speeds) / len(final_speeds) + best = max(final_speeds) + print(f"\n ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s") + + print() + cmd_parts = [ + f"llama-server --model {MODEL}", + f"-ngl {cfg['ngl']} -c {CONTEXT}", + ] + if cfg.get("cpu_moe"): + cmd_parts.append("--cpu-moe") + cmd_parts.extend([ + f"-t {cfg['t']} -tb {cfg['tb']}", + f"-ub {cfg['ub']} -b {cfg['b']}", + f"-fa {cfg['fa']}", + f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}", + f"--prio {cfg['prio']} --poll {cfg['poll']}", + ]) + if cfg["mlock"]: + cmd_parts.append("--mlock") + if not cfg["mmap"]: + cmd_parts.append("--no-mmap") + cmd_parts.append("--port 8000 --host 0.0.0.0") + + print(" Recommended command:") + print(f" {' '.join(cmd_parts)}") + print("=" * 70) + + with open("scripts/tune_results_qwen35b_256k.json", "w") as f: + json.dump(ALL_RESULTS, f, indent=2, default=str) + print(f"\n Full results saved: scripts/tune_results_qwen35b_256k.json") + + +if __name__ == "__main__": + main() diff --git a/scripts/boot_122b.txt b/scripts/boot_122b.txt new file mode 100644 index 0000000..1fb6f84 Binary files /dev/null and b/scripts/boot_122b.txt differ diff --git a/scripts/boot_122b_38.txt b/scripts/boot_122b_38.txt 
new file mode 100644 index 0000000..5aa3ef2 Binary files /dev/null and b/scripts/boot_122b_38.txt differ diff --git a/scripts/boot_122b_42.txt b/scripts/boot_122b_42.txt new file mode 100644 index 0000000..606690c Binary files /dev/null and b/scripts/boot_122b_42.txt differ diff --git a/scripts/boot_122b_44.txt b/scripts/boot_122b_44.txt new file mode 100644 index 0000000..a2c4e56 Binary files /dev/null and b/scripts/boot_122b_44.txt differ diff --git a/scripts/boot_122b_auto.txt b/scripts/boot_122b_auto.txt new file mode 100644 index 0000000..6c03f60 Binary files /dev/null and b/scripts/boot_122b_auto.txt differ diff --git a/scripts/boot_122b_maxmem.txt b/scripts/boot_122b_maxmem.txt new file mode 100644 index 0000000..4349116 Binary files /dev/null and b/scripts/boot_122b_maxmem.txt differ diff --git a/scripts/boot_122b_row.txt b/scripts/boot_122b_row.txt new file mode 100644 index 0000000..b321c04 Binary files /dev/null and b/scripts/boot_122b_row.txt differ diff --git a/scripts/boot_122b_row_dual.txt b/scripts/boot_122b_row_dual.txt new file mode 100644 index 0000000..989349a Binary files /dev/null and b/scripts/boot_122b_row_dual.txt differ diff --git a/scripts/boot_122b_single.txt b/scripts/boot_122b_single.txt new file mode 100644 index 0000000..5f72101 Binary files /dev/null and b/scripts/boot_122b_single.txt differ diff --git a/scripts/boot_122b_single2.txt b/scripts/boot_122b_single2.txt new file mode 100644 index 0000000..6a72269 Binary files /dev/null and b/scripts/boot_122b_single2.txt differ diff --git a/scripts/boot_122b_ts85.txt b/scripts/boot_122b_ts85.txt new file mode 100644 index 0000000..f1eed64 Binary files /dev/null and b/scripts/boot_122b_ts85.txt differ diff --git a/scripts/boot_122b_tune.txt b/scripts/boot_122b_tune.txt new file mode 100644 index 0000000..4c862e4 Binary files /dev/null and b/scripts/boot_122b_tune.txt differ diff --git a/scripts/boot_122b_tuned.txt b/scripts/boot_122b_tuned.txt new file mode 100644 index 0000000..68229fe 
Binary files /dev/null and b/scripts/boot_122b_tuned.txt differ diff --git a/scripts/boot_122b_v2.txt b/scripts/boot_122b_v2.txt new file mode 100644 index 0000000..f379fd5 Binary files /dev/null and b/scripts/boot_122b_v2.txt differ diff --git a/scripts/boot_log.txt b/scripts/boot_log.txt new file mode 100644 index 0000000..ad47a3e Binary files /dev/null and b/scripts/boot_log.txt differ diff --git a/scripts/boot_log2.txt b/scripts/boot_log2.txt new file mode 100644 index 0000000..77536bd Binary files /dev/null and b/scripts/boot_log2.txt differ diff --git a/scripts/boot_log3.txt b/scripts/boot_log3.txt new file mode 100644 index 0000000..cfa94c7 Binary files /dev/null and b/scripts/boot_log3.txt differ diff --git a/scripts/boot_log4.txt b/scripts/boot_log4.txt new file mode 100644 index 0000000..7c327ac Binary files /dev/null and b/scripts/boot_log4.txt differ diff --git a/scripts/boot_log5.txt b/scripts/boot_log5.txt new file mode 100644 index 0000000..e9f3aca Binary files /dev/null and b/scripts/boot_log5.txt differ diff --git a/scripts/boot_qwen_iq4.txt b/scripts/boot_qwen_iq4.txt new file mode 100644 index 0000000..db5b6f4 Binary files /dev/null and b/scripts/boot_qwen_iq4.txt differ diff --git a/scripts/check_help.bat b/scripts/check_help.bat new file mode 100644 index 0000000..87648a4 --- /dev/null +++ b/scripts/check_help.bat @@ -0,0 +1,3 @@ +@echo off +.\llama_bin_run\llama-server.exe --help 2>&1 | findstr /i "split tensor device main-gpu cpu-moe n-cpu-moe" > scripts\help_gpu_flags.txt +echo Done. 
diff --git a/scripts/download_llama.py b/scripts/download_llama.py new file mode 100644 index 0000000..7322872 --- /dev/null +++ b/scripts/download_llama.py @@ -0,0 +1,38 @@ +import urllib.request +import json +import zipfile +import os +import ssl + +ctx = ssl.create_default_context() +ctx.check_hostname = False +ctx.verify_mode = ssl.CERT_NONE + +url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest" +req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) +try: + with urllib.request.urlopen(req, context=ctx) as response: + data = json.loads(response.read().decode()) + + download_url = None + for asset in data['assets']: + if "bin-win-cuda-cu12.2.0-x64.zip" in asset['name'] or ("bin-win-cu" in asset['name'] and asset['name'].endswith(".zip") and "x64" in asset['name']): + download_url = asset['browser_download_url'] + break + + if download_url: + print(f"Downloading {download_url}...") + zip_path = "llama.zip" + with urllib.request.urlopen(download_url, context=ctx) as resp, open(zip_path, 'wb') as out_file: + out_file.write(resp.read()) + print("Extracting to 'llama_bin'...") + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall("llama_bin") + print("Done extracting.") + os.remove(zip_path) + else: + print("Could not find the target zip. 
Available assets:") + for asset in data['assets']: + print(" -", asset['name']) +except Exception as e: + print(f"Error: {e}") diff --git a/scripts/download_models.py b/scripts/download_models.py new file mode 100644 index 0000000..aaa5c1b --- /dev/null +++ b/scripts/download_models.py @@ -0,0 +1,33 @@ +import os +from huggingface_hub import hf_hub_download + +os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" + +models = [ + # 먼저 용량이 작은 Gemma4 26B 부터 다운로드 + ("ggml-org/gemma-4-26B-A4B-it-GGUF", "gemma-4-26B-A4B-it-Q4_K_M.gguf"), + # 다음 Qwen 35B + ("unsloth/Qwen3.5-35B-A3B-GGUF", "Qwen3.5-35B-A3B-Q4_K_M.gguf"), + # 마지막으로 122B (분할 압축되어 있음) + ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"), + ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00002-of-00003.gguf"), + ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00003-of-00003.gguf") +] + +print("=== 고속 다운로더 시작 (huggingface_hub & hf_transfer) ===") +os.makedirs("models", exist_ok=True) + +for repo, filename in models: + print(f"\n>>> 다운로드 중 (백그라운드 진행): [{repo}] 의 [{filename}]...") + try: + path = hf_hub_download( + repo_id=repo, + filename=filename, + local_dir="./models", + local_dir_use_symlinks=False + ) + print(f"완료: {path}") + except Exception as e: + print(f"다운로드 실패: {e}") + +print("\n모든 다운로드 프로세스가 종료되었습니다.") diff --git a/scripts/download_true_llama.py b/scripts/download_true_llama.py new file mode 100644 index 0000000..37862a7 --- /dev/null +++ b/scripts/download_true_llama.py @@ -0,0 +1,56 @@ +import urllib.request +import json +import zipfile +import os +import ssl +import shutil + +ctx = ssl.create_default_context() +ctx.check_hostname = False +ctx.verify_mode = ssl.CERT_NONE + +url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest" +req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) +try: + with urllib.request.urlopen(req, context=ctx) as response: + data = json.loads(response.read().decode()) 
+ + download_url = None + for asset in data['assets']: + if "bin-win-cuda-12.4-x64.zip" in asset['name'] and "cudart" not in asset['name']: + download_url = asset['browser_download_url'] + break + + if download_url: + print(f"Downloading true binaries: {download_url}...") + zip_path = "llama_main.zip" + with urllib.request.urlopen(download_url, context=ctx) as resp, open(zip_path, 'wb') as out_file: + out_file.write(resp.read()) + + print("Extracting to temporary folder 'llama_temp'...") + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall("llama_temp") + + print("Moving exact files to 'llama_bin_run'...") + os.makedirs("llama_bin_run", exist_ok=True) + for root, dirs, files in os.walk("llama_temp"): + for file in files: + shutil.move(os.path.join(root, file), os.path.join("llama_bin_run", file)) + + if os.path.exists("llama_bin"): + for item in os.listdir("llama_bin"): + src = os.path.join("llama_bin", item) + dst = os.path.join("llama_bin_run", item) + if not os.path.exists(dst): + try: + shutil.copy(src, dst) + except: + pass + + os.remove(zip_path) + shutil.rmtree("llama_temp", ignore_errors=True) + print("Download and path extraction fully complete.") + else: + print("Could not find the target zip.") +except Exception as e: + print(f"Error: {e}") diff --git a/scripts/dual_gpu_benchmark.mjs b/scripts/dual_gpu_benchmark.mjs new file mode 100644 index 0000000..aa60a56 --- /dev/null +++ b/scripts/dual_gpu_benchmark.mjs @@ -0,0 +1,531 @@ +/** + * Dual-GPU (2x RTX 3060 24GB) Comprehensive Model Benchmark + * =========================================================== + * Tests 4 models across multiple parameter configurations to find + * the absolute best model + settings for 256K context coding agent. + * + * Models: + * 1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB) + * 2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB) + * 3. Gemma4 26B-A4B Q4_K_M (~15.6 GB) + * 4. 
Gemma4 26B-A4B MXFP4_MOE (~15.5 GB) + * + * Run: node scripts/dual_gpu_benchmark.mjs + */ + +import { spawn, execSync } from "child_process"; +import { writeFileSync, statSync, existsSync } from "fs"; +import { resolve } from "path"; + +// ─── Configuration ───────────────────────────────────────────── +const BASE_URL = "http://127.0.0.1:8000"; +const LLAMA_SERVER = String.raw`llama_bin_run\llama-server.exe`; +const CONTEXT = 262144; // 256K +const BENCHMARK_RUNS = 3; +const BENCHMARK_TOKENS = 200; +const SERVER_TIMEOUT = 300_000; // ms + +const MODELS = [ + { + name: "Qwen3.5-35B-A3B Q4_K_M", + path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`, + type: "qwen", quant: "Q4_K_M", totalLayers: 64, + }, + { + name: "Qwen3.5-35B-A3B MXFP4_MOE", + path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`, + type: "qwen", quant: "MXFP4_MOE", totalLayers: 64, + }, + { + name: "Gemma4 26B-A4B Q4_K_M", + path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`, + type: "gemma4", quant: "Q4_K_M", totalLayers: 30, + }, + { + name: "Gemma4 26B-A4B MXFP4_MOE", + path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`, + type: "gemma4", quant: "MXFP4_MOE", totalLayers: 30, + }, +]; + +const ALL_RESULTS = []; + +// ─── Utility ─────────────────────────────────────────────────── + +function log(msg) { + const ts = new Date().toLocaleTimeString("ko-KR", { hour12: false }); + console.log(`[${ts}] ${msg}`); +} + +function sleep(ms) { + return new Promise((r) => setTimeout(r, ms)); +} + +function killServer() { + try { + execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" }); + } catch {} + return sleep(5000); +} + +function getVramAll() { + try { + const out = execSync( + 'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits', + { encoding: "utf-8", timeout: 5000 } + ); + return out.trim().split("\n").map((line) => { + const [gpu, used, total] = line.split(",").map((s) => parseInt(s.trim())); + return { gpu, used, total }; + }); + } catch { 
+ return []; + } +} + +function buildCmd(modelPath, params) { + const { + ngl, t, ub, b, ctk, ctv, + cpuMoe = false, nCpuMoe = 0, + prio = 3, nommap = false + } = params; + + const cmd = [ + LLAMA_SERVER, + "--model", modelPath, + "-ngl", String(ngl), + "-c", String(CONTEXT), + "-np", "1", + "-fa", "on", + "--cache-type-k", ctk, + "--cache-type-v", ctv, + "-ub", String(ub), + "-b", String(b), + "-t", String(t), + "-tb", String(t), + "--prio", String(prio), + "--poll", "50", + "--mlock", + "--port", "8000", + "--host", "0.0.0.0", + ]; + + if (cpuMoe) cmd.push("--cpu-moe"); + else if (nCpuMoe > 0) cmd.push("--n-cpu-moe", String(nCpuMoe)); + if (nommap) cmd.push("--no-mmap"); + + return cmd; +} + +function startServer(modelPath, params) { + const args = buildCmd(modelPath, params); + const exe = args.shift(); + log(` CMD: ${exe} ${args.slice(-12).join(" ")} ...`); + return spawn(exe, args, { + cwd: process.cwd(), + stdio: ["ignore", "pipe", "pipe"], + }); +} + +async function waitForServer(timeoutMs = SERVER_TIMEOUT) { + const start = Date.now(); + while (Date.now() - start < timeoutMs) { + try { + const resp = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) }); + const data = await resp.json(); + if (data.status === "ok") return { ok: true, bootTime: (Date.now() - start) / 1000 }; + } catch {} + await sleep(3000); + } + return { ok: false, bootTime: timeoutMs / 1000 }; +} + +async function runBenchmark(maxTokens = BENCHMARK_TOKENS) { + const payload = JSON.stringify({ + model: "local-model", + messages: [{ role: "user", content: "Count from 1 to 50, writing each number on a new line." 
}], + max_tokens: maxTokens, + temperature: 0.0, + }); + + const start = Date.now(); + const resp = await fetch(`${BASE_URL}/v1/chat/completions`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: payload, + signal: AbortSignal.timeout(600_000), + }); + const result = await resp.json(); + const elapsed = (Date.now() - start) / 1000; + + const usage = result.usage || {}; + const ct = usage.completion_tokens || 0; + return { + tps: elapsed > 0 ? ct / elapsed : 0, + completionTokens: ct, + promptTokens: usage.prompt_tokens || 0, + elapsed, + }; +} + +async function testConfig(model, label, params) { + await killServer(); + log(` [${label}] Starting server...`); + + const proc = startServer(model.path, params); + const { ok, bootTime } = await waitForServer(); + + if (!ok) { + log(` [${label}] FAILED to start (timeout ${SERVER_TIMEOUT / 1000}s)`); + proc.kill("SIGKILL"); + return null; + } + + const vram = getVramAll(); + const vramStr = vram.map((g) => `GPU${g.gpu}:${g.used}/${g.total}MiB`).join(" | "); + log(` [${label}] Boot: ${bootTime.toFixed(0)}s | VRAM: ${vramStr}`); + + // Warmup + try { await runBenchmark(20); } catch {} + + // Benchmark + const speeds = []; + for (let i = 0; i < BENCHMARK_RUNS; i++) { + try { + const r = await runBenchmark(); + speeds.push(r.tps); + log(` Run ${i + 1}: ${r.tps.toFixed(2)} t/s`); + } catch (e) { + log(` Run ${i + 1}: ERROR (${e.message})`); + } + } + + proc.kill("SIGKILL"); + + if (speeds.length === 0) { + log(` [${label}] ALL BENCHMARK RUNS FAILED`); + return null; + } + + const avg = speeds.reduce((a, b) => a + b) / speeds.length; + const best = Math.max(...speeds); + log(` [${label}] => AVG: ${avg.toFixed(2)} t/s | BEST: ${best.toFixed(2)} t/s`); + + const result = { + model: model.name, quant: model.quant, label, + avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2), + boot_time: +bootTime.toFixed(1), vram, params, + }; + ALL_RESULTS.push(result); + return result; +} + +// ─── Phase Runners 
───────────────────────────────────────────── + +async function phase0_bootTest(model) { + log(`\n${"=".repeat(70)}`); + log(` PHASE 0: Boot Test — ${model.name}`); + log(`${"=".repeat(70)}`); + + // Try full GPU first + let r = await testConfig(model, "boot-ngl999", { + ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", + }); + if (r) return r; + + // Try with cpu-moe + log(" Full GPU failed, trying with --cpu-moe..."); + r = await testConfig(model, "boot-cpumoe", { + ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe: true, + }); + if (r) return r; + + // Reduced layers + log(" --cpu-moe also failed, trying reduced layers..."); + r = await testConfig(model, "boot-ngl-half", { + ngl: Math.floor(model.totalLayers / 2), t: 6, ub: 512, b: 2048, + ctk: "q4_0", ctv: "q4_0", + }); + return r; +} + +async function phase1_gpuOffload(model, baseline) { + log(`\n${"=".repeat(70)}`); + log(` PHASE 1: GPU Offload Strategy — ${model.name}`); + log(`${"=".repeat(70)}`); + + const results = baseline ? [baseline] : []; + + // Test --cpu-moe on/off + for (const cpuMoe of [true, false]) { + const lbl = `ngl=999 cpuMoe=${cpuMoe}`; + if (baseline?.params?.cpuMoe === cpuMoe && baseline.params.ngl === 999) continue; + const r = await testConfig(model, lbl, { + ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe, + }); + if (r) results.push(r); + } + + // n-cpu-moe sweep + for (const n of [0, 5, 10, 15, 20]) { + if (n > model.totalLayers) continue; + const r = await testConfig(model, `n-cpu-moe=${n}`, { + ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", nCpuMoe: n, + }); + if (r) results.push(r); + } + + if (results.length === 0) { log(" PHASE 1: No config worked!"); return null; } + const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? 
a : b); + log(`\n ★ Phase 1 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`); + return best; +} + +async function phase2_threads(model, prev) { + log(`\n${"=".repeat(70)}`); + log(` PHASE 2: CPU Thread Sweep — ${model.name}`); + log(`${"=".repeat(70)}`); + + const p = prev.params; + const results = [prev]; + + for (const t of [2, 4, 6, 8, 10, 12]) { + if (t === p.t) continue; + const r = await testConfig(model, `t=${t}`, { + ...p, t, + }); + if (r) results.push(r); + } + + const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b); + log(`\n ★ Phase 2 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`); + return best; +} + +async function phase3_batch(model, prev) { + log(`\n${"=".repeat(70)}`); + log(` PHASE 3: Batch Size Sweep — ${model.name}`); + log(`${"=".repeat(70)}`); + + const p = prev.params; + const results = [prev]; + + for (const [ub, b] of [ + [128, 512], [256, 1024], [256, 2048], + [512, 1024], [512, 2048], [512, 4096], + [1024, 2048], [1024, 4096], + ]) { + if (ub === p.ub && b === p.b) continue; + const r = await testConfig(model, `ub=${ub} b=${b}`, { ...p, ub, b }); + if (r) results.push(r); + } + + const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b); + log(`\n ★ Phase 3 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`); + return best; +} + +async function phase4_kvcache(model, prev) { + log(`\n${"=".repeat(70)}`); + log(` PHASE 4: KV Cache Type Sweep — ${model.name}`); + log(`${"=".repeat(70)}`); + + const p = prev.params; + const results = [prev]; + + for (const [ctk, ctv] of [ + ["q4_0", "q4_0"], ["q8_0", "q8_0"], + ["q4_0", "q8_0"], ["f16", "f16"], + ]) { + if (ctk === p.ctk && ctv === p.ctv) continue; + const r = await testConfig(model, `kv=${ctk}/${ctv}`, { ...p, ctk, ctv }); + if (r) results.push(r); + } + + const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? 
a : b); + log(`\n ★ Phase 4 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`); + return best; +} + +async function phase5_final(model, prev) { + log(`\n${"=".repeat(70)}`); + log(` PHASE 5: Final Verification (5 runs) — ${model.name}`); + log(`${"=".repeat(70)}`); + + await killServer(); + const proc = startServer(model.path, prev.params); + const { ok, bootTime } = await waitForServer(); + if (!ok) { log(" FAILED to start!"); proc.kill("SIGKILL"); return prev; } + + const vram = getVramAll(); + try { await runBenchmark(20); } catch {} + + const speeds = []; + for (let i = 0; i < 5; i++) { + try { + const r = await runBenchmark(); + speeds.push(r.tps); + log(` Final Run ${i + 1}: ${r.tps.toFixed(2)} t/s`); + } catch (e) { + log(` Final Run ${i + 1}: ERROR (${e.message})`); + } + } + proc.kill("SIGKILL"); + + if (speeds.length > 0) { + const avg = speeds.reduce((a, b) => a + b) / speeds.length; + const best = Math.max(...speeds); + log(`\n ★ FINAL: AVG ${avg.toFixed(2)} t/s | BEST ${best.toFixed(2)} t/s`); + + const final_ = { + model: model.name, quant: model.quant, + label: `FINAL-${model.name}`, + avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2), + boot_time: +bootTime.toFixed(1), vram, params: prev.params, + }; + ALL_RESULTS.push(final_); + return final_; + } + return prev; +} + +// ─── Main ────────────────────────────────────────────────────── + +async function runModelBenchmark(model) { + log(`\n${"#".repeat(70)}`); + log(` MODEL: ${model.name}`); + log(` File: ${model.path}`); + try { + const sz = statSync(model.path).size / 1024 ** 3; + log(` Size: ${sz.toFixed(2)} GB`); + } catch { log(` Size: unknown`); } + log(`${"#".repeat(70)}`); + + if (!existsSync(model.path)) { + log(` SKIP: Model file not found!`); + return null; + } + + const baseline = await phase0_bootTest(model); + if (!baseline) { log(` SKIP: Cannot boot at 256K!`); return null; } + + let best = await phase1_gpuOffload(model, baseline); + if (!best) return baseline; + + best = await 
/**
 * Script entry point.
 *
 * Benchmarks every entry of MODELS in order, saving the raw results after
 * each model, then ranks the per-model winners by average tokens/second and
 * writes the human-readable summary plus the final raw results to scripts/.
 */
async function main() {
  const startedAt = Date.now();
  const wideRule = "=".repeat(70);
  const narrowRule = "=".repeat(60);

  log(wideRule);
  log(" DUAL-GPU COMPREHENSIVE MODEL BENCHMARK");
  log(" 2x RTX 3060 (24GB Total) | 256K Context");
  log(` Models: ${MODELS.length}`);
  log(` Started: ${new Date().toISOString()}`);
  log(wideRule);

  for (const g of getVramAll()) {
    log(` GPU ${g.gpu}: ${g.used}/${g.total} MiB used`);
  }

  const winners = [];
  for (const [idx, model] of MODELS.entries()) {
    log(`\n${wideRule}`);
    log(` STARTING MODEL ${idx + 1}/${MODELS.length}: ${model.name}`);
    log(wideRule);

    const winner = await runModelBenchmark(model);
    if (winner) {
      winners.push(winner);
    }

    // Persist everything measured so far after each model.
    const snapshot = JSON.stringify(ALL_RESULTS, null, 2);
    writeFileSync("scripts/dual_gpu_results.json", snapshot);
    log(` Intermediate saved (${ALL_RESULTS.length} configs tested)`);
  }

  // ─── Grand Final ───────────────────────────────────────────
  const elapsed = (Date.now() - startedAt) / 60000;

  log(`\n${wideRule}`);
  log(` GRAND FINAL COMPARISON`);
  log(` Total time: ${elapsed.toFixed(1)} minutes`);
  log(` Configs tested: ${ALL_RESULTS.length}`);
  log(wideRule);

  if (winners.length === 0) {
    log(" No models ran at 256K!");
    return;
  }

  // Highest average throughput first.
  winners.sort((a, b) => b.avg_tps - a.avg_tps);
  const medals = ["🥇", "🥈", "🥉", " "];

  const lines = [
    `Dual-GPU Benchmark Results — ${new Date().toISOString()}`,
    `Hardware: 2x RTX 3060 12GB | Context: 256K`,
    `Configs tested: ${ALL_RESULTS.length} | Time: ${elapsed.toFixed(1)} min`,
    "",
    narrowRule,
    " RANKING (by AVG t/s)",
    narrowRule,
  ];

  winners.forEach((entry, rank) => {
    const params = entry.params;
    lines.push(
      "",
      ` ${medals[rank] || " "} #${rank + 1}: ${entry.model}`,
      ` AVG: ${entry.avg_tps.toFixed(2)} t/s | BEST: ${entry.best_tps.toFixed(2)} t/s`,
      ` Boot: ${entry.boot_time.toFixed(0)}s`,
      ` ngl=${params.ngl} t=${params.t} ub=${params.ub} b=${params.b}`,
      ` ctk=${params.ctk} ctv=${params.ctv}`,
    );
    if (params.cpuMoe) {
      lines.push(` --cpu-moe`);
    } else if ((params.nCpuMoe || 0) > 0) {
      lines.push(` --n-cpu-moe ${params.nCpuMoe}`);
    }
  });

  const champ = winners[0];
  const cp = champ.params;
  lines.push("", narrowRule);
  lines.push(` ★ CHAMPION: ${champ.model}`);
  lines.push(` ${champ.avg_tps.toFixed(2)} t/s average`);
  lines.push(narrowRule);

  // Build recommended command from the champion's parameters.
  const champModel = MODELS.find((m) => m.name === champ.model);
  const cmdParts = [
    `llama-server --model ${champModel.path}`,
    `-ngl ${cp.ngl} -c ${CONTEXT}`,
    `-t ${cp.t} -tb ${cp.t}`,
    `-ub ${cp.ub} -b ${cp.b}`,
    `-fa on`,
    `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
    `--prio ${cp.prio || 3} --poll 50`,
    `--mlock`,
  ];
  if (cp.cpuMoe) {
    cmdParts.push("--cpu-moe");
  } else if ((cp.nCpuMoe || 0) > 0) {
    cmdParts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
  }
  if (cp.nommap) {
    cmdParts.push("--no-mmap");
  }
  cmdParts.push("--port 8000 --host 0.0.0.0");

  lines.push("", " Recommended command:");
  lines.push(` ${cmdParts.join(" ")}`);

  const summary = lines.join("\n");
  console.log(summary);
  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL_RESULTS, null, 2));

  log(`\n Results: scripts/dual_gpu_results.json`);
  log(` Summary: scripts/dual_gpu_summary.txt`);
  log(` DONE!`);

  await killServer();
}
across multiple parameter configurations to find +the absolute best model + settings for 256K context coding agent. + +Models: + 1. Qwen3.5-35B-A3B Q4_K_M (~20.5 GB) + 2. Qwen3.5-35B-A3B MXFP4_MOE (~20.1 GB) + 3. Gemma4 26B-A4B Q4_K_M (~15.6 GB) + 4. Gemma4 26B-A4B MXFP4_MOE (~15.5 GB) + +Test Phases (per model): + Phase 0: Basic dual-GPU startup test (can it even boot at 256K?) + Phase 1: GPU layer + MoE offload strategy sweep + Phase 2: CPU thread sweep (carry best from P1) + Phase 3: Batch size sweep (carry best from P1+P2) + Phase 4: KV cache type sweep (carry best from P1+P2+P3) + Phase 5: Final verification (5 runs) + +Output: scripts/dual_gpu_results.json (all raw data) + scripts/dual_gpu_summary.txt (human-readable winner) +""" +import subprocess +import time +import json +import urllib.request +import sys +import os +import datetime + +try: + sys.stdout.reconfigure(encoding='utf-8') +except Exception: + pass + +# ─── Configuration ─────────────────────────────────────────────── +BASE_URL = "http://127.0.0.1:8000" +LLAMA_SERVER = r"llama_bin_run\llama-server.exe" +CONTEXT = 262144 # 256K +BENCHMARK_RUNS = 3 +BENCHMARK_TOKENS = 200 +SERVER_TIMEOUT = 300 # seconds to wait for server startup + +MODELS = [ + { + "name": "Qwen3.5-35B-A3B Q4_K_M", + "path": r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf", + "type": "qwen", + "quant": "Q4_K_M", + "is_mxfp4": False, + "total_layers": 64, # Qwen3.5 35B has 64 layers + }, + { + "name": "Qwen3.5-35B-A3B MXFP4_MOE", + "path": r"models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf", + "type": "qwen", + "quant": "MXFP4_MOE", + "is_mxfp4": True, + "total_layers": 64, + }, + { + "name": "Gemma4 26B-A4B Q4_K_M", + "path": r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf", + "type": "gemma4", + "quant": "Q4_K_M", + "is_mxfp4": False, + "total_layers": 30, # Gemma4 26B has 30 layers + }, + { + "name": "Gemma4 26B-A4B MXFP4_MOE", + "path": r"models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf", + "type": "gemma4", + "quant": "MXFP4_MOE", + "is_mxfp4": True, + 
def get_vram_all():
    """Return per-GPU memory usage as reported by ``nvidia-smi``.

    Returns:
        A list with one dict per GPU row, each with integer values under
        the keys ``"gpu"`` (device index), ``"used"`` and ``"total"``
        (the ``memory.used`` / ``memory.total`` columns, queried with
        ``nounits``).  The list is empty when ``nvidia-smi`` cannot be
        started, times out, or its output cannot be decoded.
    """
    try:
        r = subprocess.run(
            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # Best-effort probe: a missing binary or a timeout must not abort
        # the benchmark, so report "no GPUs" instead.
        return []

    gpus = []
    for line in r.stdout.strip().split("\n"):
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 3:
            continue
        try:
            index, used, total = (int(value) for value in parts[:3])
        except ValueError:
            # A non-numeric field invalidates only this row; keep the
            # readings of the other GPUs.
            continue
        gpus.append({"gpu": index, "used": used, "total": total})
    return gpus
def test_config(model_info, label, **kwargs):
    """Benchmark one server configuration and record the outcome.

    Kills any running llama-server, starts a new one for
    ``model_info["path"]`` with ``kwargs`` forwarded to ``start_server``,
    waits until it reports healthy, sends one warmup request and then
    ``BENCHMARK_RUNS`` timed requests.

    Args:
        model_info: Model description; the keys ``"path"``, ``"name"`` and
            ``"quant"`` are read.
        label: Short tag used in log lines and stored in the result.
        **kwargs: Server parameters, stored verbatim under ``"params"``.

    Returns:
        The result dict (also appended to ``ALL_RESULTS``), or ``None`` when
        the server did not come up or every timed run failed.
    """
    kill_server()
    log(f" [{label}] Starting server...")

    proc = start_server(model_info["path"], **kwargs)
    try:
        ok, boot_time = wait_for_server()

        if not ok:
            log(f" [{label}] FAILED to start (timeout {SERVER_TIMEOUT}s)")
            return None

        vram = get_vram_all()
        vram_str = " | ".join(f"GPU{g['gpu']}:{g['used']}/{g['total']}MiB" for g in vram)
        log(f" [{label}] Boot: {boot_time:.0f}s | VRAM: {vram_str}")

        # Warmup: a failure here is not fatal, but make it visible.
        try:
            run_benchmark(max_tokens=20)
        except Exception as e:
            log(f" [{label}] Warmup failed ({e})")

        # Benchmark runs
        speeds = []
        for i in range(BENCHMARK_RUNS):
            try:
                r = run_benchmark()
                speeds.append(r["tps"])
                log(f" Run {i+1}: {r['tps']:.2f} t/s")
            except Exception as e:
                log(f" Run {i+1}: ERROR ({e})")
    finally:
        # Always stop the server, including on Ctrl-C or an unexpected
        # error, then reap it and release the pipe handle.
        proc.kill()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            pass
        if proc.stdout is not None:
            proc.stdout.close()

    if not speeds:
        log(f" [{label}] ALL BENCHMARK RUNS FAILED")
        return None

    avg = sum(speeds) / len(speeds)
    best = max(speeds)
    log(f" [{label}] => AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")

    result = {
        "model": model_info["name"],
        "quant": model_info["quant"],
        "label": label,
        "avg_tps": round(avg, 2),
        "best_tps": round(best, 2),
        "boot_time": round(boot_time, 1),
        "vram": vram,
        "params": kwargs,
    }
    ALL_RESULTS.append(result)
    return result
def phase1_gpu_offload(model, baseline):
    """Compare MoE offload strategies with all layers requested on GPU.

    Tries ``--cpu-moe`` on and off (skipping whichever the phase-0 baseline
    already measured), then a sweep of ``--n-cpu-moe`` values.

    Args:
        model: Model description; ``"name"`` and ``"total_layers"`` are read.
        baseline: Result dict from the boot test, or a falsy value.

    Returns:
        The result dict with the highest ``"avg_tps"`` among the baseline
        and every configuration that ran, or ``None`` if nothing ran.
    """
    log(f"\n{'='*70}")
    log(f" PHASE 1: GPU Offload Strategy — {model['name']}")
    log(f"{'='*70}")

    results = []
    if baseline:
        results.append(baseline)

    total = model["total_layers"]

    # Strategy A: All GPU + cpu-moe variations
    for cpu_moe in [True, False]:
        label = f"ngl=999 cpu_moe={cpu_moe}"
        # Skip if already tested in baseline
        already_measured = (
            baseline
            and baseline["label"] in ("boot-ngl999", "boot-cpumoe")
            and baseline["params"].get("cpu_moe", False) == cpu_moe
        )
        if already_measured:
            continue
        r = test_config(
            model, label,
            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            cpu_moe=cpu_moe,
        )
        if r:
            results.append(r)

    # Strategy B: n-cpu-moe sweep (selective expert offload).
    # n=0 is deliberately absent: build_cmd emits no flag for it, so it is
    # the same command line as the cpu_moe=False case of strategy A and
    # would only repeat a full boot + benchmark cycle.
    for n in [5, 10, 15, 20]:
        if n > total:
            continue
        r = test_config(
            model, f"n-cpu-moe={n}",
            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
            n_cpu_moe=n,
        )
        if r:
            results.append(r)

    if not results:
        log(" PHASE 1: No configuration worked!")
        return None

    best = max(results, key=lambda x: x["avg_tps"])
    log(f"\n ★ Phase 1 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
    return best
def phase3_batch(model, prev_best):
    """Try alternative (ub, b) batch-size pairs around the current best.

    Every other parameter is taken from ``prev_best["params"]``; the pair
    that ``prev_best`` already uses is not re-run.

    Returns:
        The result dict with the highest ``"avg_tps"`` among ``prev_best``
        and the pairs that ran successfully.
    """
    rule = "=" * 70
    log(f"\n{rule}")
    log(f" PHASE 3: Batch Size Sweep — {model['name']}")
    log(rule)

    params = prev_best["params"]
    threads = params["t"]
    candidates = (
        (128, 512), (256, 1024), (256, 2048),
        (512, 1024), (512, 2048), (512, 4096),
        (1024, 2048), (1024, 4096),
    )

    outcomes = [prev_best]
    for micro_batch, batch in candidates:
        # The incoming best already covers its own pair.
        if micro_batch == params["ub"] and batch == params["b"]:
            continue
        outcome = test_config(
            model,
            f"ub={micro_batch} b={batch}",
            ngl=params["ngl"],
            t=threads,
            ub=micro_batch,
            b=batch,
            ctk=params["ctk"],
            ctv=params["ctv"],
            cpu_moe=params.get("cpu_moe", False),
            n_cpu_moe=params.get("n_cpu_moe", 0),
        )
        if outcome:
            outcomes.append(outcome)

    winner = max(outcomes, key=lambda res: res["avg_tps"])
    log(f"\n ★ Phase 3 winner: {winner['label']} → {winner['avg_tps']:.2f} t/s")
    return winner
def run_full_benchmark_for_model(model):
    """Run all benchmark phases for a single model.

    Args:
        model: Model description; ``"name"`` and ``"path"`` are read here
            and the dict is passed on to every phase.

    Returns:
        The final-verification result, the phase-0 baseline when phase 1
        finds nothing, or ``None`` when the model file is missing or the
        model cannot boot.
    """
    model_found = os.path.exists(model["path"])

    log(f"\n{'#'*70}")
    log(f" MODEL: {model['name']}")
    log(f" File: {model['path']}")
    # os.path.getsize raises on a missing path, so only query the size once
    # the file is known to exist; otherwise fall through to the SKIP below.
    if model_found:
        log(f" Size: {os.path.getsize(model['path'])/1024**3:.2f} GB")
    log(f"{'#'*70}")

    # Check model exists
    if not model_found:
        log(f" SKIP: Model file not found!")
        return None

    # Phase 0: Can it boot?
    baseline = phase0_boot_test(model)
    if not baseline:
        log(f" SKIP: {model['name']} cannot boot at 256K context!")
        return None

    # Phase 1: GPU offload strategy
    best = phase1_gpu_offload(model, baseline)
    if not best:
        return baseline

    # Phase 2: CPU threads
    best = phase2_threads(model, best)

    # Phase 3: Batch sizes
    best = phase3_batch(model, best)

    # Phase 4: KV cache
    best = phase4_kvcache(model, best)

    # Phase 5: Final verification
    final = phase5_final(model, best)

    return final
summary_lines.append(f"Hardware: 2x RTX 3060 12GB | Context: 256K") + summary_lines.append(f"Total configs tested: {len(ALL_RESULTS)}") + summary_lines.append(f"Total time: {elapsed:.1f} minutes") + summary_lines.append("") + summary_lines.append("=" * 60) + summary_lines.append(" RANKING (by AVG t/s)") + summary_lines.append("=" * 60) + + for rank, w in enumerate(model_winners, 1): + medal = {1: "🥇", 2: "🥈", 3: "🥉", 4: " "}.get(rank, " ") + summary_lines.append(f"\n {medal} #{rank}: {w['model']}") + summary_lines.append(f" AVG: {w['avg_tps']:.2f} t/s | BEST: {w['best_tps']:.2f} t/s") + summary_lines.append(f" Boot: {w['boot_time']:.0f}s") + p = w["params"] + summary_lines.append(f" ngl={p['ngl']} t={p['t']} ub={p['ub']} b={p['b']}") + summary_lines.append(f" ctk={p['ctk']} ctv={p['ctv']}") + if p.get("cpu_moe"): + summary_lines.append(f" --cpu-moe") + elif p.get("n_cpu_moe", 0) > 0: + summary_lines.append(f" --n-cpu-moe {p['n_cpu_moe']}") + + champion = model_winners[0] + summary_lines.append(f"\n{'='*60}") + summary_lines.append(f" ★ CHAMPION: {champion['model']}") + summary_lines.append(f" {champion['avg_tps']:.2f} t/s average") + summary_lines.append(f"{'='*60}") + + # Build recommended command + p = champion["params"] + cmd_parts = [ + f"llama-server --model {MODELS[[m['name'] for m in MODELS].index(champion['model'])]['path']}", + f"-ngl {p['ngl']} -c {CONTEXT}", + f"-t {p['t']} -tb {p['t']}", + f"-ub {p['ub']} -b {p['b']}", + "-fa on", + f"--cache-type-k {p['ctk']} --cache-type-v {p['ctv']}", + f"--prio {p.get('prio', 3)} --poll 50", + "--mlock", + ] + if p.get("cpu_moe"): + cmd_parts.append("--cpu-moe") + elif p.get("n_cpu_moe", 0) > 0: + cmd_parts.append(f"--n-cpu-moe {p['n_cpu_moe']}") + if p.get("nommap"): + cmd_parts.append("--no-mmap") + cmd_parts.append("--port 8000 --host 0.0.0.0") + + summary_lines.append(f"\n Recommended command:") + summary_lines.append(f" {' '.join(cmd_parts)}") + + summary = "\n".join(summary_lines) + print(summary) + + with 
open("scripts/dual_gpu_summary.txt", "w", encoding="utf-8") as f: + f.write(summary) + + with open("scripts/dual_gpu_results.json", "w") as f: + json.dump(ALL_RESULTS, f, indent=2, default=str) + + log(f"\n Results: scripts/dual_gpu_results.json") + log(f" Summary: scripts/dual_gpu_summary.txt") + log(f" DONE!") + + kill_server() + + +if __name__ == "__main__": + main() diff --git a/scripts/dual_gpu_benchmark_v2.mjs b/scripts/dual_gpu_benchmark_v2.mjs new file mode 100644 index 0000000..a19499d --- /dev/null +++ b/scripts/dual_gpu_benchmark_v2.mjs @@ -0,0 +1,330 @@ +/** + * Dual-GPU (2x RTX 3060 24GB) Smart Model Benchmark v2 + * ===================================================== + * Informed by VRAM analysis — tests models in optimal order. + * + * Key insights applied: + * - Gemma4 fits entirely in 24GB GPU (KV cache ~0.18 GB with SWA) + * - Qwen3.5 is tight (~22.5-22.9 GB needed) — try full GPU first + * - Skip configs known to fail, minimize wasted time + * + * Run: node scripts/dual_gpu_benchmark_v2.mjs + * Results: scripts/dual_gpu_results.json + scripts/dual_gpu_summary.txt + */ + +import { spawn, execSync } from "child_process"; +import { writeFileSync, existsSync, statSync } from "fs"; + +const BASE_URL = "http://127.0.0.1:8000"; +const LLAMA = String.raw`llama_bin_run\llama-server.exe`; +const CTX = 262144; +const RUNS = 3; +const TOKENS = 200; +const BOOT_TIMEOUT = 300_000; + +// Models ordered: smallest first (most likely to succeed fully on GPU) +const MODELS = [ + { + name: "Gemma4-26B MXFP4_MOE", + path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`, + quant: "MXFP4_MOE", + fitsGPU: true, // 15.5 + 0.18 + 1 = 16.72 GB << 23 GB + }, + { + name: "Gemma4-26B Q4_K_M", + path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`, + quant: "Q4_K_M", + fitsGPU: true, // 15.6 + 0.18 + 1 = 16.82 GB << 23 GB + }, + { + name: "Qwen3.5-35B MXFP4_MOE", + path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`, + quant: "MXFP4_MOE", + fitsGPU: "maybe", // 
/**
 * Spawn llama-server for one configuration and remember it in `currentProc`.
 *
 * @param {string} modelPath  Path passed to `--model`.
 * @param {object} p          Parameters: ngl, t, ub, b, ctk, ctv and the
 *                            optional prio (default 3), cpuMoe, nCpuMoe, nommap.
 * @returns {import("child_process").ChildProcess} The spawned process.
 */
function startServer(modelPath, p) {
  const args = [
    "--model", modelPath, "-ngl", String(p.ngl),
    "-c", String(CTX), "-np", "1", "-fa", "on",
    "--cache-type-k", p.ctk, "--cache-type-v", p.ctv,
    "-ub", String(p.ub), "-b", String(p.b),
    "-t", String(p.t), "-tb", String(p.t),
    "--prio", String(p.prio || 3), "--poll", "50", "--mlock",
    "--port", "8000", "--host", "0.0.0.0",
  ];
  // --cpu-moe takes precedence over a partial --n-cpu-moe offload.
  if (p.cpuMoe) args.push("--cpu-moe");
  else if ((p.nCpuMoe || 0) > 0) args.push("--n-cpu-moe", String(p.nCpuMoe));
  if (p.nommap) args.push("--no-mmap");

  // Nothing in this script reads the server's stdout/stderr. With "pipe"
  // the child blocks as soon as the OS pipe buffer fills, which would stall
  // the server mid-benchmark, so discard the output instead.
  currentProc = spawn(LLAMA, args, { cwd: process.cwd(), stdio: "ignore" });
  return currentProc;
}
/**
 * Send one chat-completion request and measure generation throughput.
 *
 * @param {number} [n=TOKENS]  Value sent as `max_tokens`.
 * @returns {Promise<{tps: number, ct: number, dt: number}>} Completion
 *   tokens per wall-clock second, completion token count, elapsed seconds.
 * @throws {Error} When the request fails, times out, or the server answers
 *   with a non-2xx status.
 */
async function bench(n = TOKENS) {
  const t0 = Date.now();
  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "m",
      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
      max_tokens: n, temperature: 0,
    }),
    signal: AbortSignal.timeout(600_000),
  });
  // fetch() resolves on HTTP errors too. Without this check an error body
  // has no `usage`, so the run would be recorded as a valid 0 t/s sample
  // and drag the average down instead of being reported as a failed run.
  if (!r.ok) {
    throw new Error(`HTTP ${r.status} ${r.statusText}`);
  }
  const d = await r.json();
  const dt = (Date.now() - t0) / 1000;
  const ct = d.usage?.completion_tokens || 0;
  return { tps: ct / dt, ct, dt };
}
Phase Runner ──────────────────────────────────────── + +async function tuneModel(model) { + log(`\n${"#".repeat(65)}`); + log(` ${model.name} (${model.quant})`); + if (!existsSync(model.path)) { log(" ✗ File not found, SKIP"); return null; } + const sz = (statSync(model.path).size / 1024**3).toFixed(2); + log(` Size: ${sz} GB | Fits GPU: ${model.fitsGPU}`); + log(`${"#".repeat(65)}`); + + // ── Step 1: Find working GPU config ── + log(`\n ── Step 1: Find optimal GPU offload ──`); + let baseline = null; + + if (model.fitsGPU === true || model.fitsGPU === "maybe") { + // Try full GPU, no CPU offload + baseline = await testConfig(model, "ngl=999 pure-GPU", { + ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" }); + saveIntermediate(); + } + + if (!baseline) { + // Try n-cpu-moe values (ascending — find minimum needed) + for (const n of [5, 10, 15, 20]) { + baseline = await testConfig(model, `n-cpu-moe=${n}`, { + ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", nCpuMoe: n }); + saveIntermediate(); + if (baseline) break; // found minimum working offload + } + } + + if (!baseline) { + // Last resort: full cpu-moe + baseline = await testConfig(model, "cpu-moe", { + ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe: true }); + saveIntermediate(); + } + + if (!baseline) { log(` ✗ ${model.name} cannot boot at 256K!`); return null; } + + const bp = baseline.params; // carry forward best params + + // If pure GPU worked, also test cpu-moe to compare (it might be faster due to memory) + if (!bp.cpuMoe && !bp.nCpuMoe) { + const alt = await testConfig(model, "compare: cpu-moe", { + ...bp, cpuMoe: true }); + saveIntermediate(); + if (alt && alt.avg_tps > baseline.avg_tps) { baseline = alt; } + } + + let best = baseline; + + // ── Step 2: Thread sweep ── + log(`\n ── Step 2: Thread sweep ──`); + for (const t of [2, 4, 8, 10, 12]) { + if (t === best.params.t) continue; + const r = await testConfig(model, `t=${t}`, { ...best.params, t }); + 
/**
 * Script entry point.
 *
 * Tunes every entry of MODELS in order, saving the raw results after each
 * one, then ranks the winners by average tokens/second and writes the
 * summary and the final raw results to scripts/.
 */
async function main() {
  const startedAt = Date.now();
  const wideRule = "=".repeat(65);
  const narrowRule = "=".repeat(55);

  log(wideRule);
  log(" DUAL-GPU BENCHMARK v2 — Smart Strategy");
  log(" 2x RTX 3060 (24GB) | 256K Context");
  log(" " + new Date().toISOString());
  log(wideRule);
  for (const g of vram()) {
    log(` GPU${g.gpu}: ${g.used}/${g.total} MiB`);
  }

  const winners = [];
  for (const [idx, model] of MODELS.entries()) {
    log(`\n${wideRule}`);
    log(` MODEL ${idx + 1}/${MODELS.length}: ${model.name}`);
    log(wideRule);
    const winner = await tuneModel(model);
    if (winner) {
      winners.push(winner);
    }
    saveIntermediate();
  }

  // ─── Summary ──────────────────────────────────────────────
  const elapsed = ((Date.now() - startedAt) / 60000).toFixed(1);
  // Highest average throughput first.
  winners.sort((a, b) => b.avg_tps - a.avg_tps);
  const medals = ["🥇", "🥈", "🥉", " "];

  const lines = [
    `Dual-GPU Benchmark v2 — ${new Date().toISOString()}`,
    `2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`,
    "",
    narrowRule,
    " RANKING",
    narrowRule,
  ];

  winners.forEach((entry, rank) => {
    const params = entry.params;
    lines.push("", ` ${medals[rank] || " "} #${rank + 1}: ${entry.model}`);
    lines.push(` AVG: ${entry.avg_tps} t/s | BEST: ${entry.best_tps} t/s | Boot: ${entry.boot}s`);
    lines.push(` ngl=${params.ngl} t=${params.t} ub=${params.ub} b=${params.b} ctk=${params.ctk} ctv=${params.ctv}`);
    if (params.cpuMoe) {
      lines.push(` --cpu-moe`);
    } else if (params.nCpuMoe) {
      lines.push(` --n-cpu-moe ${params.nCpuMoe}`);
    }
  });

  if (winners.length > 0) {
    const champ = winners[0];
    const cp = champ.params;
    lines.push("", narrowRule, ` ★ CHAMPION: ${champ.model} — ${champ.avg_tps} t/s`, narrowRule);

    // Assemble the recommended command line piece by piece.
    const champModel = MODELS.find((m) => m.name === champ.model);
    const parts = [
      `llama-server --model ${champModel.path}`,
      `-ngl ${cp.ngl} -c ${CTX} -t ${cp.t} -tb ${cp.t}`,
      `-ub ${cp.ub} -b ${cp.b} -fa on`,
      `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
      `--prio ${cp.prio || 3} --poll 50 --mlock`,
    ];
    if (cp.cpuMoe) {
      parts.push("--cpu-moe");
    } else if (cp.nCpuMoe) {
      parts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
    }
    parts.push("--port 8000 --host 0.0.0.0");
    const cmd = parts.join(" ");

    lines.push("", " Recommended:", ` ${cmd}`);
  }

  const summary = lines.join("\n");
  console.log("\n" + summary);
  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
  writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL, null, 2));
  log(`\n Saved: dual_gpu_results.json + dual_gpu_summary.txt`);
  log(" DONE!");
  await kill();
}
"vram": [ + { + "gpu": 0, + "used": 11728, + "total": 12288 + }, + { + "gpu": 1, + "used": 10411, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 4, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "t=8", + "avg_tps": 63.75, + "best_tps": 63.9, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 11728, + "total": 12288 + }, + { + "gpu": 1, + "used": 10411, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 8, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "t=10", + "avg_tps": 64.01, + "best_tps": 64.14, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 11728, + "total": 12288 + }, + { + "gpu": 1, + "used": 10411, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 10, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "t=12", + "avg_tps": 63.86, + "best_tps": 63.98, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 11728, + "total": 12288 + }, + { + "gpu": 1, + "used": 10411, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 12, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "ub=256 b=1024", + "avg_tps": 63.8, + "best_tps": 64.12, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 10504, + "total": 12288 + }, + { + "gpu": 1, + "used": 9619, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 2, + "ub": 256, + "b": 1024, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "ub=256 b=2048", + "avg_tps": 63.88, + "best_tps": 64.04, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 10504, + "total": 12288 + }, + { + "gpu": 1, + "used": 9619, + "total": 12288 + } + ], + "params": { + 
"ngl": 999, + "t": 2, + "ub": 256, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "ub=512 b=4096", + "avg_tps": 63.91, + "best_tps": 64.18, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 11728, + "total": 12288 + }, + { + "gpu": 1, + "used": 10411, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 2, + "ub": 512, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "ub=1024 b=2048", + "avg_tps": 63.86, + "best_tps": 64.1, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 10956, + "total": 12288 + }, + { + "gpu": 1, + "used": 9907, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 2, + "ub": 1024, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "ub=1024 b=4096", + "avg_tps": 63.85, + "best_tps": 64.06, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 10956, + "total": 12288 + }, + { + "gpu": 1, + "used": 9907, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 2, + "ub": 1024, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "kv=q8_0/q8_0", + "avg_tps": 64.14, + "best_tps": 64.39, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 10670, + "total": 12288 + }, + { + "gpu": 1, + "used": 10169, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 2, + "ub": 512, + "b": 2048, + "ctk": "q8_0", + "ctv": "q8_0" + } + }, + { + "model": "Gemma4-26B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "kv=q4_0/q8_0", + "avg_tps": 37.52, + "best_tps": 37.86, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 10394, + "total": 12288 + }, + { + "gpu": 1, + "used": 9753, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 2, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q8_0" + } + }, + { + "model": "Gemma4-26B 
MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "kv=f16/f16", + "avg_tps": 63.48, + "best_tps": 64.31, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 11700, + "total": 12288 + }, + { + "gpu": 1, + "used": 11667, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 2, + "ub": 512, + "b": 2048, + "ctk": "f16", + "ctv": "f16" + } + }, + { + "model": "Gemma4-26B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "FINAL", + "avg_tps": 64.05, + "best_tps": 64.29, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 10667, + "total": 12288 + }, + { + "gpu": 1, + "used": 10169, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 2, + "ub": 512, + "b": 2048, + "ctk": "q8_0", + "ctv": "q8_0" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "ngl=999 pure-GPU", + "avg_tps": 76.01, + "best_tps": 76.31, + "boot": 12.1, + "vram": [ + { + "gpu": 0, + "used": 11784, + "total": 12288 + }, + { + "gpu": 1, + "used": 10454, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "compare: cpu-moe", + "avg_tps": 10.19, + "best_tps": 10.49, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 2652, + "total": 12288 + }, + { + "gpu": 1, + "used": 2982, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "cpuMoe": true + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "t=2", + "avg_tps": 75.67, + "best_tps": 75.87, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 11783, + "total": 12288 + }, + { + "gpu": 1, + "used": 10454, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 2, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "t=4", + "avg_tps": 75.61, + "best_tps": 75.87, + "boot": 9, + "vram": [ + { + "gpu": 0, 
+ "used": 11783, + "total": 12288 + }, + { + "gpu": 1, + "used": 10454, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 4, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "t=8", + "avg_tps": 75.42, + "best_tps": 75.59, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 11783, + "total": 12288 + }, + { + "gpu": 1, + "used": 10454, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 8, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "t=10", + "avg_tps": 75.71, + "best_tps": 75.82, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 11783, + "total": 12288 + }, + { + "gpu": 1, + "used": 10454, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 10, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "t=12", + "avg_tps": 75.08, + "best_tps": 75.7, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 11783, + "total": 12288 + }, + { + "gpu": 1, + "used": 10454, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 12, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "ub=256 b=1024", + "avg_tps": 75.16, + "best_tps": 75.64, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 10559, + "total": 12288 + }, + { + "gpu": 1, + "used": 9662, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 256, + "b": 1024, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "ub=256 b=2048", + "avg_tps": 75.68, + "best_tps": 76.05, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 10559, + "total": 12288 + }, + { + "gpu": 1, + "used": 9662, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 256, + "b": 2048, + "ctk": 
"q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "ub=512 b=4096", + "avg_tps": 75.92, + "best_tps": 76.16, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 11784, + "total": 12288 + }, + { + "gpu": 1, + "used": 10454, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 512, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "ub=1024 b=2048", + "avg_tps": 75.7, + "best_tps": 75.9, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 11012, + "total": 12288 + }, + { + "gpu": 1, + "used": 9950, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 1024, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "ub=1024 b=4096", + "avg_tps": 75.77, + "best_tps": 75.99, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 11011, + "total": 12288 + }, + { + "gpu": 1, + "used": 9950, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 1024, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "kv=q8_0/q8_0", + "avg_tps": 76.3, + "best_tps": 76.69, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 10725, + "total": 12288 + }, + { + "gpu": 1, + "used": 10212, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 512, + "b": 2048, + "ctk": "q8_0", + "ctv": "q8_0" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "kv=q4_0/q8_0", + "avg_tps": 42.88, + "best_tps": 44.58, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 10439, + "total": 12288 + }, + { + "gpu": 1, + "used": 9796, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q8_0" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "kv=f16/f16", + "avg_tps": 76.36, + "best_tps": 
76.78, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 11761, + "total": 12288 + }, + { + "gpu": 1, + "used": 11710, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 512, + "b": 2048, + "ctk": "f16", + "ctv": "f16" + } + }, + { + "model": "Gemma4-26B Q4_K_M", + "quant": "Q4_K_M", + "label": "FINAL", + "avg_tps": 76.4, + "best_tps": 76.75, + "boot": 9, + "vram": [ + { + "gpu": 0, + "used": 11761, + "total": 12288 + }, + { + "gpu": 1, + "used": 11710, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 512, + "b": 2048, + "ctk": "f16", + "ctv": "f16" + } + }, + { + "model": "Qwen3.5-35B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "n-cpu-moe=5", + "avg_tps": 51.43, + "best_tps": 52.07, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10365, + "total": 12288 + }, + { + "gpu": 1, + "used": 11152, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "t=2", + "avg_tps": 43.8, + "best_tps": 46.4, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10365, + "total": 12288 + }, + { + "gpu": 1, + "used": 11152, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 2, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "t=4", + "avg_tps": 49.21, + "best_tps": 52.78, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10353, + "total": 12288 + }, + { + "gpu": 1, + "used": 11152, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 4, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "t=8", + "avg_tps": 46.43, + "best_tps": 50.49, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10397, + "total": 12288 + }, + { + "gpu": 1, + 
"used": 11152, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 8, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "t=10", + "avg_tps": 46.12, + "best_tps": 50.06, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10351, + "total": 12288 + }, + { + "gpu": 1, + "used": 11152, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 10, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "t=12", + "avg_tps": 45.23, + "best_tps": 47.1, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10337, + "total": 12288 + }, + { + "gpu": 1, + "used": 11152, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 12, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "ub=256 b=1024", + "avg_tps": 48.9, + "best_tps": 52.3, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 9834, + "total": 12288 + }, + { + "gpu": 1, + "used": 10906, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 256, + "b": 1024, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "ub=256 b=2048", + "avg_tps": 49.62, + "best_tps": 52.52, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 9833, + "total": 12288 + }, + { + "gpu": 1, + "used": 10906, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 256, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "ub=512 b=4096", + "avg_tps": 48.78, + "best_tps": 52.14, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10337, + "total": 12288 + }, + { + "gpu": 1, + "used": 11152, + "total": 12288 + } + ], + 
"params": { + "ngl": 999, + "t": 6, + "ub": 512, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "ub=1024 b=2048", + "avg_tps": 49.95, + "best_tps": 52.53, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 11124, + "total": 12288 + }, + { + "gpu": 1, + "used": 11644, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 1024, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "ub=1024 b=4096", + "avg_tps": 48.75, + "best_tps": 52.06, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 11123, + "total": 12288 + }, + { + "gpu": 1, + "used": 11644, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 1024, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "kv=q4_0/q8_0", + "avg_tps": 42.81, + "best_tps": 44.14, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10681, + "total": 12288 + }, + { + "gpu": 1, + "used": 11472, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q8_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B MXFP4_MOE", + "quant": "MXFP4_MOE", + "label": "FINAL", + "avg_tps": 46.66, + "best_tps": 47.09, + "boot": 15, + "vram": [ + { + "gpu": 0, + "used": 10476, + "total": 12288 + }, + { + "gpu": 1, + "used": 11152, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B Q4_K_M", + "quant": "Q4_K_M", + "label": "n-cpu-moe=5", + "avg_tps": 49.01, + "best_tps": 53.09, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10606, + "total": 12288 + }, + { + "gpu": 1, + "used": 11338, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 6, + 
"ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B Q4_K_M", + "quant": "Q4_K_M", + "label": "t=2", + "avg_tps": 45.73, + "best_tps": 47.87, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10599, + "total": 12288 + }, + { + "gpu": 1, + "used": 11338, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 2, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B Q4_K_M", + "quant": "Q4_K_M", + "label": "t=4", + "avg_tps": 50.98, + "best_tps": 54.33, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10601, + "total": 12288 + }, + { + "gpu": 1, + "used": 11338, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 4, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B Q4_K_M", + "quant": "Q4_K_M", + "label": "t=8", + "avg_tps": 48.45, + "best_tps": 52.1, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10596, + "total": 12288 + }, + { + "gpu": 1, + "used": 11338, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 8, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B Q4_K_M", + "quant": "Q4_K_M", + "label": "t=10", + "avg_tps": 47.83, + "best_tps": 51.45, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10595, + "total": 12288 + }, + { + "gpu": 1, + "used": 11338, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 10, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B Q4_K_M", + "quant": "Q4_K_M", + "label": "t=12", + "avg_tps": 43.77, + "best_tps": 46.79, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10589, + "total": 12288 + }, + { + "gpu": 1, + "used": 11338, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 12, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": 
"Qwen3.5-35B Q4_K_M", + "quant": "Q4_K_M", + "label": "ub=256 b=1024", + "avg_tps": 52.14, + "best_tps": 53.82, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10089, + "total": 12288 + }, + { + "gpu": 1, + "used": 11092, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 4, + "ub": 256, + "b": 1024, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B Q4_K_M", + "quant": "Q4_K_M", + "label": "ub=256 b=2048", + "avg_tps": 50.23, + "best_tps": 53.66, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10091, + "total": 12288 + }, + { + "gpu": 1, + "used": 11092, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 4, + "ub": 256, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B Q4_K_M", + "quant": "Q4_K_M", + "label": "ub=512 b=2048", + "avg_tps": 49.89, + "best_tps": 53.89, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10595, + "total": 12288 + }, + { + "gpu": 1, + "used": 11338, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 4, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B Q4_K_M", + "quant": "Q4_K_M", + "label": "ub=512 b=4096", + "avg_tps": 50.4, + "best_tps": 54.19, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10564, + "total": 12288 + }, + { + "gpu": 1, + "used": 11338, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 4, + "ub": 512, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B Q4_K_M", + "quant": "Q4_K_M", + "label": "kv=q8_0/q8_0", + "avg_tps": 51.84, + "best_tps": 53.53, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10726, + "total": 12288 + }, + { + "gpu": 1, + "used": 11732, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 4, + "ub": 256, + "b": 1024, + "ctk": "q8_0", + "ctv": "q8_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B Q4_K_M", + "quant": "Q4_K_M", + "label": 
"kv=q4_0/q8_0", + "avg_tps": 43.22, + "best_tps": 45.99, + "boot": 12, + "vram": [ + { + "gpu": 0, + "used": 10410, + "total": 12288 + }, + { + "gpu": 1, + "used": 11412, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 4, + "ub": 256, + "b": 1024, + "ctk": "q4_0", + "ctv": "q8_0", + "nCpuMoe": 5 + } + }, + { + "model": "Qwen3.5-35B Q4_K_M", + "quant": "Q4_K_M", + "label": "FINAL", + "avg_tps": 52.05, + "best_tps": 54.48, + "boot": 12.1, + "vram": [ + { + "gpu": 0, + "used": 10062, + "total": 12288 + }, + { + "gpu": 1, + "used": 11092, + "total": 12288 + } + ], + "params": { + "ngl": 999, + "t": 4, + "ub": 256, + "b": 1024, + "ctk": "q4_0", + "ctv": "q4_0", + "nCpuMoe": 5 + } + } +] \ No newline at end of file diff --git a/scripts/dual_gpu_summary.txt b/scripts/dual_gpu_summary.txt new file mode 100644 index 0000000..29e3cbb --- /dev/null +++ b/scripts/dual_gpu_summary.txt @@ -0,0 +1,31 @@ +Dual-GPU Benchmark v2 — 2026-04-06T06:52:08.868Z +2x RTX 3060 12GB | 256K Context | 58 configs | 69.4 min + +======================================================= + RANKING +======================================================= + + 🥇 #1: Gemma4-26B Q4_K_M + AVG: 76.4 t/s | BEST: 76.75 t/s | Boot: 9s + ngl=999 t=6 ub=512 b=2048 ctk=f16 ctv=f16 + + 🥈 #2: Gemma4-26B MXFP4_MOE + AVG: 64.05 t/s | BEST: 64.29 t/s | Boot: 9s + ngl=999 t=2 ub=512 b=2048 ctk=q8_0 ctv=q8_0 + + 🥉 #3: Qwen3.5-35B Q4_K_M + AVG: 52.05 t/s | BEST: 54.48 t/s | Boot: 12.1s + ngl=999 t=4 ub=256 b=1024 ctk=q4_0 ctv=q4_0 + --n-cpu-moe 5 + + #4: Qwen3.5-35B MXFP4_MOE + AVG: 46.66 t/s | BEST: 47.09 t/s | Boot: 15s + ngl=999 t=6 ub=512 b=2048 ctk=q4_0 ctv=q4_0 + --n-cpu-moe 5 + +======================================================= + ★ CHAMPION: Gemma4-26B Q4_K_M — 76.4 t/s +======================================================= + + Recommended: + llama-server --model models\gemma-4-26B-A4B-it-Q4_K_M.gguf -ngl 999 -c 262144 -t 6 -tb 6 -ub 512 -b 2048 -fa on --cache-type-k f16 --cache-type-v f16 
--prio 3 --poll 50 --mlock --port 8000 --host 0.0.0.0 \ No newline at end of file diff --git a/scripts/final_tune_122b.txt b/scripts/final_tune_122b.txt new file mode 100644 index 0000000..6eb0816 Binary files /dev/null and b/scripts/final_tune_122b.txt differ diff --git a/scripts/final_tune_122b_dual.txt b/scripts/final_tune_122b_dual.txt new file mode 100644 index 0000000..cdaacb8 Binary files /dev/null and b/scripts/final_tune_122b_dual.txt differ diff --git a/scripts/find_max_dense.mjs b/scripts/find_max_dense.mjs new file mode 100644 index 0000000..e3a8531 --- /dev/null +++ b/scripts/find_max_dense.mjs @@ -0,0 +1,101 @@ +import { spawn, exec } from 'child_process'; + +const delay = ms => new Promise(res => setTimeout(res, ms)); + +async function killServer() { + return new Promise(r => exec('taskkill /F /IM llama-server.exe', () => { setTimeout(r, 2000); })); +} + +async function testContextSize(modelPath, contextSize) { + console.log(`\nTesting ${modelPath} with -c ${contextSize}...`); + await killServer(); + + const args = [ + '--model', `models\\${modelPath}`, + '-ngl', '999', + '-c', contextSize.toString(), + '-fa', 'on', + '--cache-type-k', 'q4_0', + '--cache-type-v', 'q4_0', + '-ub', '512', + '-b', '2048', + '-t', '6', + '-tb', '6', + '--split-mode', 'row', + '--prio', '3', + '--fit', 'off', + '--port', '8000', + '--host', '0.0.0.0' + ]; + + const server = spawn('llama_bin_run\\llama-server.exe', args, { stdio: 'pipe' }); + + let booted = false; + let oomed = false; + + server.stderr.on('data', (d) => { + const text = d.toString(); + if (text.toLowerCase().includes('out of memory') || text.includes('failed to allocate')) { + oomed = true; + } + }); + + for (let i = 0; i < 20; i++) { + if (oomed) break; + try { + const res = await fetch('http://127.0.0.1:8000/health', { timeout: 2000 }); + if (res.status === 200) { + booted = true; + break; + } + } catch(e) {} + await delay(2000); + } + + if (oomed || !booted) { + console.log(`❌ Failed: Out of Memory at 
-c ${contextSize}`); + server.kill('SIGKILL'); + await killServer(); + return false; + } + + console.log(`✅ Booted! Running Benchmark...`); + + // Benchmark + const bench = await new Promise(r => exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => { + r(stdout || stderr); + })); + + console.log(bench); + await killServer(); + return true; +} + +async function findMaxContext(modelName) { + const contexts = [262144, 131072, 65536, 32768, 16384, 8192]; + + let maxFound = false; + for (const c of contexts) { + const success = await testContextSize(modelName, c); + if (success) { + maxFound = true; + console.log(`\n🎉 MAX STABLE CONTEXT FOR ${modelName}: ${c}`); + break; + } + } + + if (!maxFound) { + console.log(`\n❌ Failed to find any working context size for ${modelName}`); + } +} + +async function main() { + exec('set CUDA_VISIBLE_DEVICES='); + console.log("============= QWEN 27B Q4_K_M ============="); + await findMaxContext('Qwen3.5-27B-Q4_K_M.gguf'); + + console.log("\n============= GEMMA 4 31B Q4_K_M ============="); + await findMaxContext('gemma-4-31B-it-Q4_K_M.gguf'); +} + +main(); diff --git a/scripts/help_full.txt b/scripts/help_full.txt new file mode 100644 index 0000000..a5bc29f --- /dev/null +++ b/scripts/help_full.txt @@ -0,0 +1,562 @@ +----- common params ----- + +-h, --help, --usage print usage and exit +--version show version and build info +--license show source code license and dependencies +-cl, --cache-list show list of models in cache +--completion-bash print source-able bash completion script for llama.cpp +-t, --threads N number of CPU threads to use during generation (default: -1) + (env: LLAMA_ARG_THREADS) +-tb, --threads-batch N number of threads to use during batch and prompt processing (default: + same as --threads) +-C, --cpu-mask M CPU affinity mask: arbitrarily long hex. Complements cpu-range + (default: "") +-Cr, --cpu-range lo-hi range of CPUs for affinity. 
Complements --cpu-mask +--cpu-strict <0|1> use strict CPU placement (default: 0) +--prio N set process/thread priority : low(-1), normal(0), medium(1), high(2), + realtime(3) (default: 0) +--poll <0...100> use polling level to wait for work (0 - no polling, default: 50) +-Cb, --cpu-mask-batch M CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch + (default: same as --cpu-mask) +-Crb, --cpu-range-batch lo-hi ranges of CPUs for affinity. Complements --cpu-mask-batch +--cpu-strict-batch <0|1> use strict CPU placement (default: same as --cpu-strict) +--prio-batch N set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime + (default: 0) +--poll-batch <0|1> use polling to wait for work (default: same as --poll) +-c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model) + (env: LLAMA_ARG_CTX_SIZE) +-n, --predict, --n-predict N number of tokens to predict (default: -1, -1 = infinity) + (env: LLAMA_ARG_N_PREDICT) +-b, --batch-size N logical maximum batch size (default: 2048) + (env: LLAMA_ARG_BATCH) +-ub, --ubatch-size N physical maximum batch size (default: 512) + (env: LLAMA_ARG_UBATCH) +--keep N number of tokens to keep from the initial prompt (default: 0, -1 = + all) +--swa-full use full-size SWA cache (default: false) + [(more + info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) + (env: LLAMA_ARG_SWA_FULL) +-fa, --flash-attn [on|off|auto] set Flash Attention use ('on', 'off', or 'auto', default: 'auto') + (env: LLAMA_ARG_FLASH_ATTN) +--perf, --no-perf whether to enable internal libllama performance timings (default: + false) + (env: LLAMA_ARG_PERF) +-e, --escape, --no-escape whether to process escapes sequences (\n, \r, \t, \', \", \\) + (default: true) +--rope-scaling {none,linear,yarn} RoPE frequency scaling method, defaults to linear unless specified by + the model + (env: LLAMA_ARG_ROPE_SCALING_TYPE) +--rope-scale N RoPE context scaling factor, expands context by a factor of N + (env: 
LLAMA_ARG_ROPE_SCALE) +--rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from + model) + (env: LLAMA_ARG_ROPE_FREQ_BASE) +--rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N + (env: LLAMA_ARG_ROPE_FREQ_SCALE) +--yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training + context size) + (env: LLAMA_ARG_YARN_ORIG_CTX) +--yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.00, 0.0 = full + interpolation) + (env: LLAMA_ARG_YARN_EXT_FACTOR) +--yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: -1.00) + (env: LLAMA_ARG_YARN_ATTN_FACTOR) +--yarn-beta-slow N YaRN: high correction dim or alpha (default: -1.00) + (env: LLAMA_ARG_YARN_BETA_SLOW) +--yarn-beta-fast N YaRN: low correction dim or beta (default: -1.00) + (env: LLAMA_ARG_YARN_BETA_FAST) +-kvo, --kv-offload, -nkvo, --no-kv-offload + whether to enable KV cache offloading (default: enabled) + (env: LLAMA_ARG_KV_OFFLOAD) +--repack, -nr, --no-repack whether to enable weight repacking (default: enabled) + (env: LLAMA_ARG_REPACK) +--no-host bypass host buffer allowing extra buffers to be used + (env: LLAMA_ARG_NO_HOST) +-ctk, --cache-type-k TYPE KV cache data type for K + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (default: f16) + (env: LLAMA_ARG_CACHE_TYPE_K) +-ctv, --cache-type-v TYPE KV cache data type for V + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (default: f16) + (env: LLAMA_ARG_CACHE_TYPE_V) +-dt, --defrag-thold N KV cache defragmentation threshold (DEPRECATED) + (env: LLAMA_ARG_DEFRAG_THOLD) +--rpc SERVERS comma separated list of RPC servers (host:port) + (env: LLAMA_ARG_RPC) +--mlock force system to keep model in RAM rather than swapping or compressing + (env: LLAMA_ARG_MLOCK) +--mmap, --no-mmap whether to memory-map model. 
(if mmap disabled, slower load but may + reduce pageouts if not using mlock) (default: enabled) + (env: LLAMA_ARG_MMAP) +-dio, --direct-io, -ndio, --no-direct-io + use DirectIO if available. (default: disabled) + (env: LLAMA_ARG_DIO) +--numa TYPE attempt optimizations that help on some NUMA systems + - distribute: spread execution evenly over all nodes + - isolate: only spawn threads on CPUs on the node that execution + started on + - numactl: use the CPU map provided by numactl + if run without this previously, it is recommended to drop the system + page cache before using this + see https://github.com/ggml-org/llama.cpp/issues/1437 + (env: LLAMA_ARG_NUMA) +-dev, --device <dev1,dev2,..> comma-separated list of devices to use for offloading (none = don't + offload) + use --list-devices to see a list of available devices + (env: LLAMA_ARG_DEVICE) +--list-devices print list of available devices and exit +-ot, --override-tensor <tensor name pattern>=<buffer type>,... + override tensor buffer type + (env: LLAMA_ARG_OVERRIDE_TENSOR) +-cmoe, --cpu-moe keep all Mixture of Experts (MoE) weights in the CPU + (env: LLAMA_ARG_CPU_MOE) +-ncmoe, --n-cpu-moe N keep the Mixture of Experts (MoE) weights of the first N layers in the + CPU + (env: LLAMA_ARG_N_CPU_MOE) +-ngl, --gpu-layers, --n-gpu-layers N max. number of layers to store in VRAM, either an exact number, + 'auto', or 'all' (default: auto) + (env: LLAMA_ARG_N_GPU_LAYERS) +-sm, --split-mode {none,layer,row} how to split the model across multiple GPUs, one of: + - none: use one GPU only + - layer (default): split layers and KV across GPUs + - row: split rows across GPUs + (env: LLAMA_ARG_SPLIT_MODE) +-ts, --tensor-split N0,N1,N2,... fraction of the model to offload to each GPU, comma-separated list of + proportions, e.g. 
3,1 + (env: LLAMA_ARG_TENSOR_SPLIT) +-mg, --main-gpu INDEX the GPU to use for the model (with split-mode = none), or for + intermediate results and KV (with split-mode = row) (default: 0) + (env: LLAMA_ARG_MAIN_GPU) +-fit, --fit [on|off] whether to adjust unset arguments to fit in device memory ('on' or + 'off', default: 'on') + (env: LLAMA_ARG_FIT) +-fitt, --fit-target MiB0,MiB1,MiB2,... + target margin per device for --fit, comma-separated list of values, + single value is broadcast across all devices, default: 1024 + (env: LLAMA_ARG_FIT_TARGET) +-fitc, --fit-ctx N minimum ctx size that can be set by --fit option, default: 4096 + (env: LLAMA_ARG_FIT_CTX) +--check-tensors check model tensor data for invalid values (default: false) +--override-kv KEY=TYPE:VALUE,... advanced option to override model metadata by key. to specify multiple + overrides, either use comma-separated values. + types: int, float, bool, str. example: --override-kv + tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false +--op-offload, --no-op-offload whether to offload host tensor operations to device (default: true) +--lora FNAME path to LoRA adapter (use comma-separated values to load multiple + adapters) +--lora-scaled FNAME:SCALE,... path to LoRA adapter with user defined scaling (format: + FNAME:SCALE,...) + note: use comma-separated values +--control-vector FNAME add a control vector + note: use comma-separated values to add multiple control vectors +--control-vector-scaled FNAME:SCALE,... + add a control vector with user defined scaling SCALE + note: use comma-separated values (format: FNAME:SCALE,...) +--control-vector-layer-range START END + layer range to apply the control vector(s) to, start and end inclusive +-m, --model FNAME model path to load + (env: LLAMA_ARG_MODEL) +-mu, --model-url MODEL_URL model download url (default: unused) + (env: LLAMA_ARG_MODEL_URL) +-dr, --docker-repo [<repo>/]<model>[:quant] + Docker Hub model repository. 
repo is optional, default to ai/. quant + is optional, default to :latest. + example: gemma3 + (default: unused) + (env: LLAMA_ARG_DOCKER_REPO) +-hf, -hfr, --hf-repo <user>/<model>[:quant] + Hugging Face model repository; quant is optional, case-insensitive, + default to Q4_K_M, or falls back to the first file in the repo if + Q4_K_M doesn't exist. + mmproj is also downloaded automatically if available. to disable, add + --no-mmproj + example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M + (default: unused) + (env: LLAMA_ARG_HF_REPO) +-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant] + Same as --hf-repo, but for the draft model (default: unused) + (env: LLAMA_ARG_HFD_REPO) +-hff, --hf-file FILE Hugging Face model file. If specified, it will override the quant in + --hf-repo (default: unused) + (env: LLAMA_ARG_HF_FILE) +-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant] + Hugging Face model repository for the vocoder model (default: unused) + (env: LLAMA_ARG_HF_REPO_V) +-hffv, --hf-file-v FILE Hugging Face model file for the vocoder model (default: unused) + (env: LLAMA_ARG_HF_FILE_V) +-hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment + variable) + (env: HF_TOKEN) +--log-disable Log disable +--log-file FNAME Log to file + (env: LLAMA_LOG_FILE) +--log-colors [on|off|auto] Set colored logging ('on', 'off', or 'auto', default: 'auto') + 'auto' enables colors when output is to a terminal + (env: LLAMA_LOG_COLORS) +-v, --verbose, --log-verbose Set verbosity level to infinity (i.e. log all messages, useful for + debugging) +--offline Offline mode: forces use of cache, prevents network access + (env: LLAMA_OFFLINE) +-lv, --verbosity, --log-verbosity N Set the verbosity threshold. Messages with a higher verbosity will be + ignored. 
Values: + - 0: generic output + - 1: error + - 2: warning + - 3: info + - 4: debug + (default: 3) + + (env: LLAMA_LOG_VERBOSITY) +--log-prefix Enable prefix in log messages + (env: LLAMA_LOG_PREFIX) +--log-timestamps Enable timestamps in log messages + (env: LLAMA_LOG_TIMESTAMPS) +-ctkd, --cache-type-k-draft TYPE KV cache data type for K for the draft model + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (default: f16) + (env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) +-ctvd, --cache-type-v-draft TYPE KV cache data type for V for the draft model + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (default: f16) + (env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) + + +----- sampling params ----- + +--samplers SAMPLERS samplers that will be used for generation in the order, separated by + ';' + (default: + penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) +-s, --seed SEED RNG seed (default: -1, use random seed for -1) +--sampler-seq, --sampling-seq SEQUENCE + simplified sequence for samplers that will be used (default: + edskypmxt) +--ignore-eos ignore end of stream token and continue generating (implies + --logit-bias EOS-inf) +--temp, --temperature N temperature (default: 0.80) +--top-k N top-k sampling (default: 40, 0 = disabled) + (env: LLAMA_ARG_TOP_K) +--top-p N top-p sampling (default: 0.95, 1.0 = disabled) +--min-p N min-p sampling (default: 0.05, 0.0 = disabled) +--top-nsigma, --top-n-sigma N top-n-sigma sampling (default: -1.00, -1.0 = disabled) +--xtc-probability N xtc probability (default: 0.00, 0.0 = disabled) +--xtc-threshold N xtc threshold (default: 0.10, 1.0 = disabled) +--typical, --typical-p N locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) +--repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 + = ctx_size) +--repeat-penalty N penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) +--presence-penalty N repeat alpha presence penalty (default: 
0.00, 0.0 = disabled) +--frequency-penalty N repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) +--dry-multiplier N set DRY sampling multiplier (default: 0.00, 0.0 = disabled) +--dry-base N set DRY sampling base value (default: 1.75) +--dry-allowed-length N set allowed length for DRY sampling (default: 2) +--dry-penalty-last-n N set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = + context size) +--dry-sequence-breaker STRING add sequence breaker for DRY sampling, clearing out default breakers + ('\n', ':', '"', '*') in the process; use "none" to not use any + sequence breakers +--adaptive-target N adaptive-p: select tokens near this probability (valid range 0.0 to + 1.0; negative = disabled) (default: -1.00) + [(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) +--adaptive-decay N adaptive-p: decay rate for target adaptation over time. lower values + are more reactive, higher values are more stable. + (valid range 0.0 to 0.99) (default: 0.90) +--dynatemp-range N dynamic temperature range (default: 0.00, 0.0 = disabled) +--dynatemp-exp N dynamic temperature exponent (default: 1.00) +--mirostat N use Mirostat sampling. + Top K, Nucleus and Locally Typical samplers are ignored if used. + (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) +--mirostat-lr N Mirostat learning rate, parameter eta (default: 0.10) +--mirostat-ent N Mirostat target entropy, parameter tau (default: 5.00) +-l, --logit-bias TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion, + i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello', + or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' +--grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ + dir) +--grammar-file FNAME file to read grammar from +-j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. 
+ `{}` for any JSON object + For schemas w/ external $refs, use --grammar + + example/json_schema_to_grammar.py instead +-jf, --json-schema-file FILE File containing a JSON schema to constrain generations + (https://json-schema.org/), e.g. `{}` for any JSON object + For schemas w/ external $refs, use --grammar + + example/json_schema_to_grammar.py instead +-bs, --backend-sampling enable backend sampling (experimental) (default: disabled) + (env: LLAMA_ARG_BACKEND_SAMPLING) + + +----- example-specific params ----- + +-lcs, --lookup-cache-static FNAME path to static lookup cache to use for lookup decoding (not updated by + generation) +-lcd, --lookup-cache-dynamic FNAME path to dynamic lookup cache to use for lookup decoding (updated by + generation) +-ctxcp, --ctx-checkpoints, --swa-checkpoints N + max number of context checkpoints to create per slot (default: + 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293) + (env: LLAMA_ARG_CTX_CHECKPOINTS) +-cpent, --checkpoint-every-n-tokens N create a checkpoint every n tokens during prefill (processing), -1 to + disable (default: 8192) + (env: LLAMA_ARG_CHECKPOINT_EVERY_NT) +-cram, --cache-ram N set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - + disable)[(more + info)](https://github.com/ggml-org/llama.cpp/pull/16391) + (env: LLAMA_ARG_CACHE_RAM) +-kvu, --kv-unified, -no-kvu, --no-kv-unified + use single unified KV buffer shared across all sequences (default: + enabled if number of slots is auto) + (env: LLAMA_ARG_KV_UNIFIED) +--clear-idle, --no-clear-idle save and clear idle slots on new task (default: enabled, requires + unified KV and cache-ram) + (env: LLAMA_ARG_CLEAR_IDLE) +--context-shift, --no-context-shift whether to use context shift on infinite text generation (default: + disabled) + (env: LLAMA_ARG_CONTEXT_SHIFT) +-r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode +-sp, --special special tokens output enabled (default: false) +--warmup, 
--no-warmup whether to perform warmup with an empty run (default: enabled) +--spm-infill use Suffix/Prefix/Middle pattern for infill (instead of + Prefix/Suffix/Middle) as some models prefer this. (default: disabled) +--pooling {none,mean,cls,last,rank} pooling type for embeddings, use model default if unspecified + (env: LLAMA_ARG_POOLING) +-np, --parallel N number of server slots (default: -1, -1 = auto) + (env: LLAMA_ARG_N_PARALLEL) +-cb, --cont-batching, -nocb, --no-cont-batching + whether to enable continuous batching (a.k.a dynamic batching) + (default: enabled) + (env: LLAMA_ARG_CONT_BATCHING) +-mm, --mmproj FILE path to a multimodal projector file. see tools/mtmd/README.md + note: if -hf is used, this argument can be omitted + (env: LLAMA_ARG_MMPROJ) +-mmu, --mmproj-url URL URL to a multimodal projector file. see tools/mtmd/README.md + (env: LLAMA_ARG_MMPROJ_URL) +--mmproj-auto, --no-mmproj, --no-mmproj-auto + whether to use multimodal projector file (if available), useful when + using -hf (default: enabled) + (env: LLAMA_ARG_MMPROJ_AUTO) +--mmproj-offload, --no-mmproj-offload whether to enable GPU offloading for multimodal projector (default: + enabled) + (env: LLAMA_ARG_MMPROJ_OFFLOAD) +--image-min-tokens N minimum number of tokens each image can take, only used by vision + models with dynamic resolution (default: read from model) + (env: LLAMA_ARG_IMAGE_MIN_TOKENS) +--image-max-tokens N maximum number of tokens each image can take, only used by vision + models with dynamic resolution (default: read from model) + (env: LLAMA_ARG_IMAGE_MAX_TOKENS) +-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,... 
+ override tensor buffer type for draft model +-cmoed, --cpu-moe-draft keep all Mixture of Experts (MoE) weights in the CPU for the draft + model + (env: LLAMA_ARG_CPU_MOE_DRAFT) +-ncmoed, --n-cpu-moe-draft N keep the Mixture of Experts (MoE) weights of the first N layers in the + CPU for the draft model + (env: LLAMA_ARG_N_CPU_MOE_DRAFT) +-a, --alias STRING set model name aliases, comma-separated (to be used by API) + (env: LLAMA_ARG_ALIAS) +--tags STRING set model tags, comma-separated (informational, not used for routing) + (env: LLAMA_ARG_TAGS) +--host HOST ip address to listen, or bind to an UNIX socket if the address ends + with .sock (default: 127.0.0.1) + (env: LLAMA_ARG_HOST) +--port PORT port to listen (default: 8080) + (env: LLAMA_ARG_PORT) +--reuse-port allow multiple sockets to bind to the same port (default: disabled) + (env: LLAMA_ARG_REUSE_PORT) +--path PATH path to serve static files from (default: ) + (env: LLAMA_ARG_STATIC_PATH) +--api-prefix PREFIX prefix path the server serves from, without the trailing slash + (default: ) + (env: LLAMA_ARG_API_PREFIX) +--webui-config JSON JSON that provides default WebUI settings (overrides WebUI defaults) + (env: LLAMA_ARG_WEBUI_CONFIG) +--webui-config-file PATH JSON file that provides default WebUI settings (overrides WebUI + defaults) + (env: LLAMA_ARG_WEBUI_CONFIG_FILE) +--webui-mcp-proxy, --no-webui-mcp-proxy + experimental: whether to enable MCP CORS proxy - do not enable in + untrusted environments (default: disabled) + (env: LLAMA_ARG_WEBUI_MCP_PROXY) +--tools TOOL1,TOOL2,... 
experimental: whether to enable built-in tools for AI agents - do not + enable in untrusted environments (default: no tools) + specify "all" to enable all tools + available tools: read_file, file_glob_search, grep_search, + exec_shell_command, write_file, edit_file, apply_diff + (env: LLAMA_ARG_TOOLS) +--webui, --no-webui whether to enable the Web UI (default: enabled) + (env: LLAMA_ARG_WEBUI) +--embedding, --embeddings restrict to only support embedding use case; use only with dedicated + embedding models (default: disabled) + (env: LLAMA_ARG_EMBEDDINGS) +--rerank, --reranking enable reranking endpoint on server (default: disabled) + (env: LLAMA_ARG_RERANKING) +--api-key KEY API key to use for authentication, multiple keys can be provided as a + comma-separated list (default: none) + (env: LLAMA_API_KEY) +--api-key-file FNAME path to file containing API keys (default: none) +--ssl-key-file FNAME path to file a PEM-encoded SSL private key + (env: LLAMA_ARG_SSL_KEY_FILE) +--ssl-cert-file FNAME path to file a PEM-encoded SSL certificate + (env: LLAMA_ARG_SSL_CERT_FILE) +--chat-template-kwargs STRING sets additional params for the json template parser, must be a valid + json object string, e.g. 
'{"key1":"value1","key2":"value2"}' + (env: LLAMA_CHAT_TEMPLATE_KWARGS) +-to, --timeout N server read/write timeout in seconds (default: 600) + (env: LLAMA_ARG_TIMEOUT) +--threads-http N number of threads used to process HTTP requests (default: -1) + (env: LLAMA_ARG_THREADS_HTTP) +--cache-prompt, --no-cache-prompt whether to enable prompt caching (default: enabled) + (env: LLAMA_ARG_CACHE_PROMPT) +--cache-reuse N min chunk size to attempt reusing from the cache via KV shifting, + requires prompt caching to be enabled (default: 0) + [(card)](https://ggml.ai/f0.png) + (env: LLAMA_ARG_CACHE_REUSE) +--metrics enable prometheus compatible metrics endpoint (default: disabled) + (env: LLAMA_ARG_ENDPOINT_METRICS) +--props enable changing global properties via POST /props (default: disabled) + (env: LLAMA_ARG_ENDPOINT_PROPS) +--slots, --no-slots expose slots monitoring endpoint (default: enabled) + (env: LLAMA_ARG_ENDPOINT_SLOTS) +--slot-save-path PATH path to save slot kv cache (default: disabled) +--media-path PATH directory for loading local media files; files can be accessed via + file:// URLs using relative paths (default: disabled) +--models-dir PATH directory containing models for the router server (default: disabled) + (env: LLAMA_ARG_MODELS_DIR) +--models-preset PATH path to INI file containing model presets for the router server + (default: disabled) + (env: LLAMA_ARG_MODELS_PRESET) +--models-max N for router server, maximum number of models to load simultaneously + (default: 4, 0 = unlimited) + (env: LLAMA_ARG_MODELS_MAX) +--models-autoload, --no-models-autoload + for router server, whether to automatically load models (default: + enabled) + (env: LLAMA_ARG_MODELS_AUTOLOAD) +--jinja, --no-jinja whether to use jinja template engine for chat (default: enabled) + (env: LLAMA_ARG_JINJA) +--reasoning-format FORMAT controls whether thought tags are allowed and/or extracted from the + response, and in which format they're returned; one of: + - none: leaves thoughts 
unparsed in `message.content` + - deepseek: puts thoughts in `message.reasoning_content` + - deepseek-legacy: keeps `<think>` tags in `message.content` while + also populating `message.reasoning_content` + (default: auto) + (env: LLAMA_ARG_THINK) +-rea, --reasoning [on|off|auto] Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: + 'auto' (detect from template)) + (env: LLAMA_ARG_REASONING) +--reasoning-budget N token budget for thinking: -1 for unrestricted, 0 for immediate end, + N>0 for token budget (default: -1) + (env: LLAMA_ARG_THINK_BUDGET) +--reasoning-budget-message MESSAGE message injected before the end-of-thinking tag when reasoning budget + is exhausted (default: none) + (env: LLAMA_ARG_THINK_BUDGET_MESSAGE) +--chat-template JINJA_TEMPLATE set custom jinja chat template (default: template taken from model's + metadata) + if suffix/prefix are specified, template will be disabled + only commonly used templates are accepted (unless --jinja is set + before this flag): + list of built-in templates: + bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, + command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, + exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, + granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, + llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, + minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, + mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, + phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, + yandex, zephyr + (env: LLAMA_ARG_CHAT_TEMPLATE) +--chat-template-file JINJA_TEMPLATE_FILE + set custom jinja chat template file (default: template taken from + model's metadata) + if suffix/prefix are specified, template will be disabled + only commonly used templates are accepted (unless --jinja is set + before this flag): + list of built-in templates: + bailing, bailing-think, bailing2, chatglm3, 
chatglm4, chatml, + command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, + exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, + granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, + llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, + minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, + mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, + phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, + yandex, zephyr + (env: LLAMA_ARG_CHAT_TEMPLATE_FILE) +--skip-chat-parsing, --no-skip-chat-parsing + force a pure content parser, even if a Jinja template is specified; + model will output everything in the content section, including any + reasoning and/or tool calls (default: disabled) + (env: LLAMA_ARG_SKIP_CHAT_PARSING) +--prefill-assistant, --no-prefill-assistant + whether to prefill the assistant's response if the last message is an + assistant message (default: prefill enabled) + when this flag is set, if the last message is an assistant message + then it will be treated as a full message and not prefilled + + (env: LLAMA_ARG_PREFILL_ASSISTANT) +-sps, --slot-prompt-similarity SIMILARITY + how much the prompt of a request must match the prompt of a slot in + order to use that slot (default: 0.10, 0.0 = disabled) +--lora-init-without-apply load LoRA adapters without applying them (apply later via POST + /lora-adapters) (default: disabled) +--sleep-idle-seconds SECONDS number of seconds of idleness after which the server will sleep + (default: -1; -1 = disabled) +-td, --threads-draft N number of threads to use during generation (default: same as + --threads) +-tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: + same as --threads-draft) +--draft, --draft-n, --draft-max N number of tokens to draft for speculative decoding (default: 16) + (env: LLAMA_ARG_DRAFT_MAX) +--draft-min, --draft-n-min N minimum number of 
draft tokens to use for speculative decoding + (default: 0) + (env: LLAMA_ARG_DRAFT_MIN) +--draft-p-min P minimum speculative decoding probability (greedy) (default: 0.75) + (env: LLAMA_ARG_DRAFT_P_MIN) +-cd, --ctx-size-draft N size of the prompt context for the draft model (default: 0, 0 = loaded + from model) + (env: LLAMA_ARG_CTX_SIZE_DRAFT) +-devd, --device-draft <dev1,dev2,..> comma-separated list of devices to use for offloading the draft model + (none = don't offload) + use --list-devices to see a list of available devices +-ngld, --gpu-layers-draft, --n-gpu-layers-draft N + max. number of draft model layers to store in VRAM, either an exact + number, 'auto', or 'all' (default: auto) + (env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) +-md, --model-draft FNAME draft model for speculative decoding (default: unused) + (env: LLAMA_ARG_MODEL_DRAFT) +--spec-replace TARGET DRAFT translate the string in TARGET into DRAFT if the draft model and main + model are not compatible +--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] + type of speculative decoding to use when no draft model is provided + (default: none) + + (env: LLAMA_ARG_SPEC_TYPE) +--spec-ngram-size-n N ngram size N for ngram-simple/ngram-map speculative decoding, length + of lookup n-gram (default: 12) +--spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length + of draft m-gram (default: 48) +--spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1) +-mv, --model-vocoder FNAME vocoder model for audio generation (default: unused) +--tts-use-guide-tokens Use guide tokens to improve TTS word recall +--embd-gemma-default use default EmbeddingGemma model (note: can download weights from the + internet) +--fim-qwen-1.5b-default use default Qwen 2.5 Coder 1.5B (note: can download weights from the + internet) +--fim-qwen-3b-default use default Qwen 2.5 Coder 3B (note: can download weights from the + internet) +--fim-qwen-7b-default use 
default Qwen 2.5 Coder 7B (note: can download weights from the + internet) +--fim-qwen-7b-spec use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can + download weights from the internet) +--fim-qwen-14b-spec use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: + can download weights from the internet) +--fim-qwen-30b-default use default Qwen 3 Coder 30B A3B Instruct (note: can download weights + from the internet) +--gpt-oss-20b-default use gpt-oss-20b (note: can download weights from the internet) +--gpt-oss-120b-default use gpt-oss-120b (note: can download weights from the internet) +--vision-gemma-4b-default use Gemma 3 4B QAT (note: can download weights from the internet) +--vision-gemma-12b-default use Gemma 3 12B QAT (note: can download weights from the internet) diff --git a/scripts/help_gpu_flags.txt b/scripts/help_gpu_flags.txt new file mode 100644 index 0000000..68565c7 --- /dev/null +++ b/scripts/help_gpu_flags.txt @@ -0,0 +1,31 @@ +ggml_cuda_init: found 2 CUDA devices (Total VRAM: 24575 MiB): + Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes, VRAM: 12287 MiB + Device 1: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes, VRAM: 12287 MiB +-dev, --device <dev1,dev2,..> comma-separated list of devices to use for offloading (none = don't + use --list-devices to see a list of available devices + (env: LLAMA_ARG_DEVICE) +--list-devices print list of available devices and exit +-ot, --override-tensor <tensor name pattern>=<buffer type>,... + override tensor buffer type + (env: LLAMA_ARG_OVERRIDE_TENSOR) +-cmoe, --cpu-moe keep all Mixture of Experts (MoE) weights in the CPU +-ncmoe, --n-cpu-moe N keep the Mixture of Experts (MoE) weights of the first N layers in the +-sm, --split-mode {none,layer,row} how to split the model across multiple GPUs, one of: + - layer (default): split layers and KV across GPUs + - row: split rows across GPUs + (env: LLAMA_ARG_SPLIT_MODE) +-ts, --tensor-split N0,N1,N2,... 
fraction of the model to offload to each GPU, comma-separated list of + (env: LLAMA_ARG_TENSOR_SPLIT) +-mg, --main-gpu INDEX the GPU to use for the model (with split-mode = none), or for + intermediate results and KV (with split-mode = row) (default: 0) +-fit, --fit [on|off] whether to adjust unset arguments to fit in device memory ('on' or + target margin per device for --fit, comma-separated list of values, + single value is broadcast across all devices, default: 1024 +--check-tensors check model tensor data for invalid values (default: false) +--op-offload, --no-op-offload whether to offload host tensor operations to device (default: true) +-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,... + override tensor buffer type for draft model +-cmoed, --cpu-moe-draft keep all Mixture of Experts (MoE) weights in the CPU for the draft +-ncmoed, --n-cpu-moe-draft N keep the Mixture of Experts (MoE) weights of the first N layers in the +-devd, --device-draft <dev1,dev2,..> comma-separated list of devices to use for offloading the draft model + use --list-devices to see a list of available devices diff --git a/scripts/hf_search.py b/scripts/hf_search.py new file mode 100644 index 0000000..afecd79 --- /dev/null +++ b/scripts/hf_search.py @@ -0,0 +1,28 @@ +from huggingface_hub import HfApi +import sys + +api = HfApi() + +def search_gguf(query): + print(f"\n--- Searching for: {query} ---") + try: + models = api.list_models(search=query, limit=3) + found = list(models) + if not found: + print("No models found.") + return + for m in found: + print(f"Repo: {m.id}") + files = api.list_repo_files(repo_id=m.id) + ggufs = [f for f in files if "q4_k_m" in f.lower() and f.endswith(".gguf")] + if not ggufs: + ggufs = [f for f in files if f.endswith(".gguf")][:3] + print(f" GGUFs: {ggufs}") + except Exception as e: + print(f"Error: {e}") + +search_gguf("122b-a10b gguf") +search_gguf("Qwen3.5 122b gguf") +search_gguf("35b-a3b gguf") +search_gguf("gemma-4 26b gguf") 
+search_gguf("Qwen 122B") diff --git a/scripts/perf_test.py b/scripts/perf_test.py new file mode 100644 index 0000000..6aaaebd --- /dev/null +++ b/scripts/perf_test.py @@ -0,0 +1,123 @@ +import time +import json +import urllib.request +import sys + +try: + sys.stdout.reconfigure(encoding='utf-8') +except AttributeError: + pass + +BASE_URL = "http://127.0.0.1:8000" + +def check_server(): + """Check if server is up""" + try: + req = urllib.request.Request(f"{BASE_URL}/health") + with urllib.request.urlopen(req, timeout=5) as resp: + data = json.loads(resp.read()) + return data.get("status") == "ok" + except: + return False + +def run_benchmark(prompt, max_tokens=100, label="Test"): + """Run a single benchmark request and return results""" + payload = json.dumps({ + "model": "local-model", + "messages": [{"role": "user", "content": prompt}], + "max_tokens": max_tokens, + "temperature": 0.0 + }).encode("utf-8") + + req = urllib.request.Request( + f"{BASE_URL}/v1/chat/completions", + data=payload, + headers={"Content-Type": "application/json"} + ) + + start = time.time() + with urllib.request.urlopen(req, timeout=300) as resp: + result = json.loads(resp.read()) + elapsed = time.time() - start + + content = result["choices"][0]["message"].get("content", "") + usage = result.get("usage", {}) + prompt_tokens = usage.get("prompt_tokens", 0) + completion_tokens = usage.get("completion_tokens", 0) + + gen_tps = completion_tokens / elapsed if elapsed > 0 else 0 + + return { + "label": label, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "elapsed": elapsed, + "gen_tps_approx": gen_tps, + "content_preview": content[:100] + } + +def main(): + print("=" * 60) + print(" LLM Performance Benchmark Tool") + print("=" * 60) + print() + + # Wait for server + print("[1/3] Checking server health...") + for i in range(30): + if check_server(): + print(" -> Server is ready!") + break + print(f" -> Waiting for server... 
({i+1}/30)") + time.sleep(2) + else: + print(" -> ERROR: Server not responding after 60s") + return + + # Warmup + print() + print("[2/3] Warmup run (short)...") + try: + warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup") + print(f" -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s") + except Exception as e: + print(f" -> Warmup failed: {e}") + + # Main benchmark + print() + print("[3/3] Running main benchmark...") + print("-" * 60) + + test_prompt = "Count from 1 to 50, writing each number on a new line." + + results = [] + for i in range(3): + print(f" Run {i+1}/3...") + try: + r = run_benchmark(test_prompt, max_tokens=200, label=f"Run {i+1}") + results.append(r) + print(f" Tokens: {r['completion_tokens']} | " + f"Time: {r['elapsed']:.2f}s | " + f"Speed: {r['gen_tps_approx']:.2f} t/s (approx)") + except Exception as e: + print(f" ERROR: {e}") + + if results: + print() + print("=" * 60) + print(" RESULTS SUMMARY") + print("=" * 60) + avg_tps = sum(r["gen_tps_approx"] for r in results) / len(results) + max_tps = max(r["gen_tps_approx"] for r in results) + min_tps = min(r["gen_tps_approx"] for r in results) + print(f" Runs: {len(results)}") + print(f" Avg TPS: {avg_tps:.2f} t/s (approx, includes prompt eval)") + print(f" Min TPS: {min_tps:.2f} t/s") + print(f" Max TPS: {max_tps:.2f} t/s") + print() + print(" NOTE: Check server console for exact generation t/s") + print(" (the 'eval time' line shows pure token generation speed)") + print("=" * 60) + +if __name__ == "__main__": + main() diff --git a/scripts/perf_test_122b.py b/scripts/perf_test_122b.py new file mode 100644 index 0000000..0981587 --- /dev/null +++ b/scripts/perf_test_122b.py @@ -0,0 +1,169 @@ +import time +import json +import urllib.request +import sys +import os +import re + +try: + sys.stdout.reconfigure(encoding='utf-8') +except AttributeError: + pass + +BASE_URL = "http://127.0.0.1:8000" + +def check_server(): + """Check if server is up""" 
+ try: + req = urllib.request.Request(f"{BASE_URL}/health") + with urllib.request.urlopen(req, timeout=5) as resp: + data = json.loads(resp.read()) + return data.get("status") == "ok" + except: + return False + +def check_slots(): + """Check server slot info for VRAM usage details""" + try: + req = urllib.request.Request(f"{BASE_URL}/slots") + with urllib.request.urlopen(req, timeout=5) as resp: + return json.loads(resp.read()) + except: + return None + +def run_benchmark(prompt, max_tokens=300, label="Test"): + """Run a single benchmark request and return results""" + payload = json.dumps({ + "model": "local-model", + "messages": [{"role": "user", "content": prompt}], + "max_tokens": max_tokens, + "temperature": 0.0 + }).encode("utf-8") + + req = urllib.request.Request( + f"{BASE_URL}/v1/chat/completions", + data=payload, + headers={"Content-Type": "application/json"} + ) + + start = time.time() + with urllib.request.urlopen(req, timeout=600) as resp: + result = json.loads(resp.read()) + elapsed = time.time() - start + + content = result["choices"][0]["message"].get("content", "") + usage = result.get("usage", {}) + prompt_tokens = usage.get("prompt_tokens", 0) + completion_tokens = usage.get("completion_tokens", 0) + + gen_tps = completion_tokens / elapsed if elapsed > 0 else 0 + + return { + "label": label, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "elapsed": elapsed, + "gen_tps_approx": gen_tps, + "content_preview": content[:150] + } + +def main(): + print("=" * 70) + print(" Qwen3.5 122B-A10B Performance Benchmark") + print(" Target: 10+ t/s generation speed") + print("=" * 70) + print() + + # Wait for server (model loading takes 3-5 min for 71 GB) + print("[1/4] Waiting for server (122B model load takes 3-5 min)...") + max_wait = 600 # 10 minutes max + for i in range(max_wait // 5): + if check_server(): + print(f" -> Server is ready! (waited {i*5}s)") + break + if i % 6 == 0: + print(f" -> Loading model... 
({i*5}s / {max_wait}s)") + time.sleep(5) + else: + print(f" -> ERROR: Server not responding after {max_wait}s") + return + + # Check server info + print() + print("[2/4] Checking server status...") + slots = check_slots() + if slots: + print(f" -> Slots available: {len(slots)}") + + # Warmup + print() + print("[3/4] Warmup run (short, pre-heating GPU caches)...") + try: + warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup") + print(f" -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s") + print(f" -> Warmup speed: {warmup['gen_tps_approx']:.2f} t/s (includes prompt eval)") + except Exception as e: + print(f" -> Warmup failed: {e}") + + # Main benchmark - 5 runs for statistical reliability + print() + print("[4/4] Running main benchmark (5 runs x 300 tokens)...") + print("-" * 70) + + test_prompts = [ + "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.", + "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.", + "Describe the complete process of photosynthesis in plants. 
Include both light-dependent and light-independent reactions.", + "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.", + "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.", + ] + + results = [] + for i in range(5): + prompt = test_prompts[i % len(test_prompts)] + print(f"\n Run {i+1}/5: {prompt[:50]}...") + try: + r = run_benchmark(prompt, max_tokens=300, label=f"Run {i+1}") + results.append(r) + print(f" Completion tokens: {r['completion_tokens']}") + print(f" Total time: {r['elapsed']:.2f}s") + print(f" Approx speed: {r['gen_tps_approx']:.2f} t/s (includes prompt eval)") + except Exception as e: + print(f" ERROR: {e}") + + if results: + print() + print("=" * 70) + print(" RESULTS SUMMARY - Qwen3.5 122B-A10B") + print("=" * 70) + avg_tps = sum(r["gen_tps_approx"] for r in results) / len(results) + max_tps = max(r["gen_tps_approx"] for r in results) + min_tps = min(r["gen_tps_approx"] for r in results) + total_tokens = sum(r["completion_tokens"] for r in results) + total_time = sum(r["elapsed"] for r in results) + + print(f" Runs completed: {len(results)}/5") + print(f" Total tokens: {total_tokens}") + print(f" Total time: {total_time:.1f}s") + print() + print(f" Approx TPS (avg): {avg_tps:.2f} t/s") + print(f" Approx TPS (min): {min_tps:.2f} t/s") + print(f" Approx TPS (max): {max_tps:.2f} t/s") + print() + + # Verdict + if avg_tps >= 10: + print(" ✅ TARGET ACHIEVED: 10+ t/s!") + elif avg_tps >= 8: + print(" ⚠️ CLOSE TO TARGET: Consider further tuning") + else: + print(f" ❌ BELOW TARGET: {avg_tps:.1f} t/s < 10 t/s") + + print() + print(" ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.") + print(" ⚡ Check the server console/log for exact 'eval time' t/s value,") + print(" ⚡ which shows pure token generation speed (always higher).") + print("=" * 70) + +if __name__ == "__main__": + main() diff --git a/scripts/q4km_latest.txt 
b/scripts/q4km_latest.txt new file mode 100644 index 0000000..c4687b1 --- /dev/null +++ b/scripts/q4km_latest.txt @@ -0,0 +1,5 @@ +pure-GPU nommap small | 62.29 | GPU | VRAM:22975 | ub=128 b=512 t=4 +pure-GPU ts=0.5,0.5 | 63.89 | GPU | VRAM:23002 | ub=128 b=512 t=4 +tune t=2 | 64.1 | GPU | VRAM:22980 | ub=128 b=512 t=2 +tune t=6 | 64.18 | GPU | VRAM:22982 | ub=128 b=512 t=6 +tune t=8 | 63.11 | GPU | VRAM:22980 | ub=128 b=512 t=8 \ No newline at end of file diff --git a/scripts/quick_pptest.mjs b/scripts/quick_pptest.mjs new file mode 100644 index 0000000..69d923f --- /dev/null +++ b/scripts/quick_pptest.mjs @@ -0,0 +1,31 @@ +// Quick PP+TG speed test +const BASE = "http://127.0.0.1:8000"; + +async function test(label, prompt, maxTok) { + const t0 = Date.now(); + const r = await fetch(`${BASE}/v1/chat/completions`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ model: "m", messages: [{ role: "user", content: prompt }], max_tokens: maxTok, temperature: 0 }), + signal: AbortSignal.timeout(600000), + }); + const d = await r.json(); + const dt = (Date.now() - t0) / 1000; + const u = d.usage || {}; + const pp = u.prompt_tokens || 0; + const tg = u.completion_tokens || 0; + const ppSpeed = pp > 0 ? (pp / dt).toFixed(1) : "?"; + const tgSpeed = tg > 0 ? (tg / dt).toFixed(1) : "?"; + console.log(`${label} | PP:${pp}tok ${ppSpeed}t/s | TG:${tg}tok ${tgSpeed}t/s | ${dt.toFixed(1)}s`); +} + +const short = "Count 1 to 20."; +const long = "x".repeat(3000) + " Summarize above in 3 words."; +const code = Array(200).fill("function foo(x) { return x * 2 + Math.random(); }").join("\n") + "\n\nRefactor above to arrow functions. 
Show first 5 lines."; + +await test("warmup", short, 20); +await test("SHORT", short, 200); +await test("3K-PP", long, 100); +await test("10K-CODE", code, 100); +await test("TG-500", short, 500); +console.log("DONE"); diff --git a/scripts/qwen_fullgpu_challenge.mjs b/scripts/qwen_fullgpu_challenge.mjs new file mode 100644 index 0000000..209488d --- /dev/null +++ b/scripts/qwen_fullgpu_challenge.mjs @@ -0,0 +1,345 @@ +/** + * Qwen3.5 Full-GPU Challenge — VRAM 극한 최적화 벤치마크 + * ===================================================== + * 목표: Qwen3.5-35B-A3B를 24GB 듀얼 3060에 100% GPU로 올리기 + * + * 테스트 모델: + * 1. UD-IQ4_NL (16.6 GB) — 확실히 올라감, 기준선 + * 2. MXFP4_MOE (20.1 GB) — 도전! VRAM 극한 최적화 + * 3. Q4_K_M (20.5 GB) — 대조군 (n-cpu-moe=5) + * + * VRAM 절감 전략: + * A. 배치 최소화: -ub 64 -b 256 (computation buffer 축소) + * B. split-mode row (GPU간 더 균등한 분배) + * C. tensor-split 수동 밸런싱 + * D. no-mmap (메모리 관리 최적화) + * E. defrag-thold (KV 캐시 파편화 방지) + * + * Run: node scripts/qwen_fullgpu_challenge.mjs + */ + +import { spawn, execSync } from "child_process"; +import { writeFileSync, existsSync, statSync } from "fs"; + +const BASE_URL = "http://127.0.0.1:8000"; +const LLAMA = String.raw`llama_bin_run\llama-server.exe`; +const CTX = 262144; +const RUNS = 3; +const TOKENS = 200; +const BOOT_TIMEOUT = 300_000; + +const MODELS = [ + { + name: "Qwen3.5 UD-IQ4_NL", + path: String.raw`models\Qwen3.5-35B-A3B-UD-IQ4_NL.gguf`, + sizeGB: 16.6, + }, + { + name: "Qwen3.5 MXFP4_MOE", + path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`, + sizeGB: 20.11, + }, + { + name: "Qwen3.5 Q4_K_M", + path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`, + sizeGB: 20.5, + }, +]; + +const ALL = []; +let proc = null; +const log = (m) => console.log(`[${new Date().toLocaleTimeString("ko-KR",{hour12:false})}] ${m}`); +const sleep = (ms) => new Promise(r => setTimeout(r, ms)); + +async function kill() { + if (proc) { try { proc.kill("SIGKILL"); } catch {} proc = null; } + try { execSync("taskkill /F /IM llama-server.exe", 
{ stdio: "ignore" }); } catch {} + await sleep(5000); +} + +function vram() { + try { + return execSync('nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits', + { encoding: "utf-8", timeout: 5000 }).trim().split("\n").map(l => { + const [g, u, t] = l.split(",").map(s => parseInt(s)); + return { gpu: g, used: u, total: t }; + }); + } catch { return []; } +} + +function startServer(modelPath, p) { + const args = [ + "--model", modelPath, "-ngl", "999", + "-c", String(CTX), "-np", "1", "-fa", "on", + "--cache-type-k", p.ctk || "q4_0", + "--cache-type-v", p.ctv || "q4_0", + "-ub", String(p.ub || 512), "-b", String(p.b || 2048), + "-t", String(p.t || 4), "-tb", String(p.t || 4), + "--prio", "3", "--poll", "50", "--mlock", + "--port", "8000", "--host", "0.0.0.0", + ]; + + // GPU offload strategy + if (p.cpuMoe) args.push("--cpu-moe"); + else if (p.nCpuMoe) args.push("--n-cpu-moe", String(p.nCpuMoe)); + + // VRAM saving options + if (p.splitMode) args.push("--split-mode", p.splitMode); + if (p.tensorSplit) args.push("--tensor-split", p.tensorSplit); + if (p.noMmap) args.push("--no-mmap"); + if (p.defragThold) args.push("--defrag-thold", String(p.defragThold)); + if (p.noKvOffload) args.push("--no-kv-offload"); + + const cmdStr = args.join(" "); + log(` CMD: ...${cmdStr.slice(-80)}`); + proc = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] }); + return proc; +} + +async function waitReady(timeout = BOOT_TIMEOUT) { + const t0 = Date.now(); + while (Date.now() - t0 < timeout) { + try { + const r = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) }); + const d = await r.json(); + if (d.status === "ok") return { ok: true, boot: (Date.now() - t0) / 1000 }; + } catch {} + await sleep(3000); + } + return { ok: false, boot: timeout / 1000 }; +} + +async function bench(n = TOKENS) { + const t0 = Date.now(); + const r = await fetch(`${BASE_URL}/v1/chat/completions`, { + method: "POST", + headers: { 
"Content-Type": "application/json" }, + body: JSON.stringify({ + model: "m", + messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }], + max_tokens: n, temperature: 0, + }), + signal: AbortSignal.timeout(600_000), + }); + const d = await r.json(); + const dt = (Date.now() - t0) / 1000; + const ct = d.usage?.completion_tokens || 0; + return { tps: ct / dt, ct, dt }; +} + +async function testConfig(model, label, params) { + await kill(); + log(` [${label}] Starting...`); + startServer(model.path, params); + const { ok, boot } = await waitReady(); + if (!ok) { + log(` [${label}] ✗ FAILED (timeout ${BOOT_TIMEOUT/1000}s)`); + await kill(); + return null; + } + + const v = vram(); + const totalUsed = v.reduce((a, g) => a + g.used, 0); + const vs = v.map(g => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | "); + log(` [${label}] ✓ Boot:${boot.toFixed(0)}s | VRAM: ${vs} (total: ${totalUsed} MiB)`); + + try { await bench(20); } catch {} // warmup + + const speeds = []; + for (let i = 0; i < RUNS; i++) { + try { + const r = await bench(); + speeds.push(r.tps); + log(` Run${i+1}: ${r.tps.toFixed(2)} t/s`); + } catch (e) { + log(` Run${i+1}: ERR ${e.message}`); + } + } + await kill(); + + if (!speeds.length) return null; + const avg = speeds.reduce((a,b)=>a+b) / speeds.length; + const best = Math.max(...speeds); + log(` [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`); + + const res = { + model: model.name, label, + avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2), + boot: +boot.toFixed(1), + vram_total: totalUsed, vram: v, + params: { ...params, ngl: 999, ctk: params.ctk||"q4_0", ctv: params.ctv||"q4_0" }, + gpu_only: !params.cpuMoe && !params.nCpuMoe, + }; + ALL.push(res); + writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2)); + return res; +} + +// ─── Test Strategies ─────────────────────────────────────────── + +async function testModel(model) { + log(`\n${"#".repeat(65)}`); + log(` ${model.name} 
(${model.sizeGB} GB)`); + if (!existsSync(model.path)) { log(" ✗ File not found!"); return null; } + log(`${"#".repeat(65)}`); + + let best = null; + const update = (r) => { if (r && (!best || r.avg_tps > best.avg_tps)) best = r; }; + + // ── Strategy 1: Pure GPU, default settings ── + log(`\n ── Strategy 1: Pure GPU (default) ──`); + update(await testConfig(model, "pure-GPU default", { + t: 4, ub: 512, b: 2048 + })); + + // ── Strategy 2: Pure GPU, minimal batch (VRAM saver) ── + log(`\n ── Strategy 2: Pure GPU, minimal batch ──`); + update(await testConfig(model, "pure-GPU minbatch", { + t: 4, ub: 64, b: 256 + })); + + // ── Strategy 3: Pure GPU, small batch + no-mmap ── + log(`\n ── Strategy 3: Pure GPU + no-mmap + small batch ──`); + update(await testConfig(model, "pure-GPU nommap small", { + t: 4, ub: 128, b: 512, noMmap: true + })); + + // ── Strategy 4: Pure GPU, split-mode row ── + log(`\n ── Strategy 4: Pure GPU + split-mode row ──`); + update(await testConfig(model, "pure-GPU row-split", { + t: 4, ub: 128, b: 512, splitMode: "row" + })); + + // ── Strategy 5: Pure GPU, tensor-split manual balance ── + log(`\n ── Strategy 5: Pure GPU + tensor-split 0.5,0.5 ──`); + update(await testConfig(model, "pure-GPU ts=0.5,0.5", { + t: 4, ub: 128, b: 512, tensorSplit: "0.5,0.5" + })); + + // ── Strategy 6: Pure GPU, defrag + all tricks ── + log(`\n ── Strategy 6: Pure GPU ALL tricks ──`); + update(await testConfig(model, "pure-GPU all-tricks", { + t: 4, ub: 64, b: 256, noMmap: true, defragThold: 0.1 + })); + + // ── Fallback: n-cpu-moe=5 baseline ── + if (!best || !best.gpu_only) { + log(`\n ── Fallback: n-cpu-moe=5 ──`); + update(await testConfig(model, "n-cpu-moe=5 baseline", { + t: 4, ub: 256, b: 1024, nCpuMoe: 5 + })); + } + + // ── If pure GPU worked, tune batch/thread/kv ── + if (best && best.gpu_only) { + log(`\n ── Pure GPU succeeded! Fine-tuning... 
──`); + const bp = best.params; + + // Thread sweep + for (const t of [2, 6, 8]) { + if (t === bp.t) continue; + update(await testConfig(model, `tune t=${t}`, { ...bp, t })); + } + + // Batch sweep + for (const [ub, b] of [[256, 1024], [512, 2048], [256, 2048]]) { + if (ub === bp.ub && b === bp.b) continue; + update(await testConfig(model, `tune ub=${ub} b=${b}`, { ...bp, ub, b })); + } + + // KV cache upgrade (extra VRAM available?) + for (const [ctk, ctv] of [["q8_0","q8_0"], ["f16","f16"]]) { + update(await testConfig(model, `tune kv=${ctk}/${ctv}`, { ...bp, ctk, ctv })); + } + } + + // ── Final verification ── + if (best) { + log(`\n ── Final verification (5 runs) ──`); + await kill(); + startServer(model.path, best.params); + const { ok, boot } = await waitReady(); + if (ok) { + const v = vram(); + try { await bench(20); } catch {} + const finals = []; + for (let i = 0; i < 5; i++) { + try { const r = await bench(); finals.push(r.tps); log(` Final ${i+1}: ${r.tps.toFixed(2)} t/s`); + } catch (e) { log(` Final ${i+1}: ERR`); } + } + await kill(); + if (finals.length > 0) { + const avg = finals.reduce((a,b)=>a+b) / finals.length; + const bst = Math.max(...finals); + log(` ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${bst.toFixed(2)} t/s`); + const final = { model: model.name, label: "FINAL", + avg_tps: +avg.toFixed(2), best_tps: +bst.toFixed(2), + boot: +boot.toFixed(1), vram_total: v.reduce((a,g)=>a+g.used,0), + vram: v, params: best.params, gpu_only: best.gpu_only }; + ALL.push(final); + writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2)); + return final; + } + } + await kill(); + } + return best; +} + +// ─── Main ────────────────────────────────────────────────────── + +async function main() { + const t0 = Date.now(); + log("=".repeat(65)); + log(" QWEN3.5 FULL-GPU CHALLENGE — 70 t/s TARGET"); + log(" 2x RTX 3060 (24GB) | 256K Context"); + log(" " + new Date().toISOString()); + log("=".repeat(65)); + vram().forEach(g => log(` 
GPU${g.gpu}: ${g.used}/${g.total} MiB`)); + + const winners = []; + for (const model of MODELS) { + const w = await testModel(model); + if (w) winners.push(w); + } + + // ─── Summary ────────────────────────────────────────────── + const elapsed = ((Date.now() - t0) / 60000).toFixed(1); + winners.sort((a, b) => b.avg_tps - a.avg_tps); + + const lines = [ + `Qwen3.5 Full-GPU Challenge — ${new Date().toISOString()}`, + `2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`, + "", "=".repeat(55), " RANKING", "=".repeat(55), + ]; + for (let i = 0; i < winners.length; i++) { + const w = winners[i], p = w.params; + const gpu = w.gpu_only ? "★ FULL GPU" : "⚠ CPU offload"; + lines.push("", ` #${i+1}: ${w.model} [${gpu}]`); + lines.push(` AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`); + lines.push(` VRAM: ${w.vram_total} MiB total`); + const flags = []; + if (p.splitMode) flags.push(`split=${p.splitMode}`); + if (p.tensorSplit) flags.push(`ts=${p.tensorSplit}`); + if (p.noMmap) flags.push("no-mmap"); + if (p.nCpuMoe) flags.push(`n-cpu-moe=${p.nCpuMoe}`); + lines.push(` t=${p.t} ub=${p.ub} b=${p.b} kv=${p.ctk}/${p.ctv} ${flags.join(" ")}`); + } + + if (winners.length > 0) { + const c = winners[0]; + lines.push("", "=".repeat(55)); + lines.push(` ★ CHAMPION: ${c.model} — ${c.avg_tps} t/s [${c.gpu_only?"FULL GPU":"CPU offload"}]`); + lines.push("=".repeat(55)); + } + + const summary = lines.join("\n"); + console.log("\n" + summary); + writeFileSync("scripts/qwen_fullgpu_summary.txt", summary, "utf-8"); + writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2)); + log(`\n Saved: qwen_fullgpu_results.json + qwen_fullgpu_summary.txt`); + log(" DONE!"); + await kill(); +} + +main().catch(e => { console.error("FATAL:", e); process.exit(1); }); diff --git a/scripts/qwen_fullgpu_results.json b/scripts/qwen_fullgpu_results.json new file mode 100644 index 0000000..3515bda --- /dev/null +++ 
b/scripts/qwen_fullgpu_results.json @@ -0,0 +1,834 @@ +[ + { + "model": "Qwen3.5 UD-IQ4_NL", + "label": "pure-GPU minbatch", + "avg_tps": 65.11, + "best_tps": 65.49, + "boot": 9, + "vram_total": 19177, + "vram": [ + { + "gpu": 0, + "used": 10039, + "total": 12288 + }, + { + "gpu": 1, + "used": 9138, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 64, + "b": 256, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 UD-IQ4_NL", + "label": "pure-GPU nommap small", + "avg_tps": 65.01, + "best_tps": 65.36, + "boot": 6, + "vram_total": 19672, + "vram": [ + { + "gpu": 0, + "used": 10342, + "total": 12288 + }, + { + "gpu": 1, + "used": 9330, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 128, + "b": 512, + "noMmap": true, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 UD-IQ4_NL", + "label": "pure-GPU row-split", + "avg_tps": 13.65, + "best_tps": 14.82, + "boot": 9, + "vram_total": 19427, + "vram": [ + { + "gpu": 0, + "used": 10311, + "total": 12288 + }, + { + "gpu": 1, + "used": 9116, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 128, + "b": 512, + "splitMode": "row", + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 UD-IQ4_NL", + "label": "pure-GPU ts=0.5,0.5", + "avg_tps": 64.92, + "best_tps": 65.23, + "boot": 9, + "vram_total": 19664, + "vram": [ + { + "gpu": 0, + "used": 10334, + "total": 12288 + }, + { + "gpu": 1, + "used": 9330, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 128, + "b": 512, + "tensorSplit": "0.5,0.5", + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 UD-IQ4_NL", + "label": "pure-GPU all-tricks", + "avg_tps": 64.72, + "best_tps": 64.89, + "boot": 6, + "vram_total": 19171, + "vram": [ + { + "gpu": 0, + "used": 10033, + "total": 12288 + }, + { + "gpu": 1, + "used": 9138, + "total": 12288 + } + ], + "params": { + 
"t": 4, + "ub": 64, + "b": 256, + "noMmap": true, + "defragThold": 0.1, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 UD-IQ4_NL", + "label": "tune t=2", + "avg_tps": 64.87, + "best_tps": 65.13, + "boot": 9, + "vram_total": 19170, + "vram": [ + { + "gpu": 0, + "used": 10032, + "total": 12288 + }, + { + "gpu": 1, + "used": 9138, + "total": 12288 + } + ], + "params": { + "t": 2, + "ub": 64, + "b": 256, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 UD-IQ4_NL", + "label": "tune t=6", + "avg_tps": 64.88, + "best_tps": 65.17, + "boot": 9, + "vram_total": 19168, + "vram": [ + { + "gpu": 0, + "used": 10030, + "total": 12288 + }, + { + "gpu": 1, + "used": 9138, + "total": 12288 + } + ], + "params": { + "t": 6, + "ub": 64, + "b": 256, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 UD-IQ4_NL", + "label": "tune t=8", + "avg_tps": 64.5, + "best_tps": 64.77, + "boot": 9, + "vram_total": 19168, + "vram": [ + { + "gpu": 0, + "used": 10030, + "total": 12288 + }, + { + "gpu": 1, + "used": 9138, + "total": 12288 + } + ], + "params": { + "t": 8, + "ub": 64, + "b": 256, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 UD-IQ4_NL", + "label": "tune ub=256 b=1024", + "avg_tps": 64.73, + "best_tps": 64.98, + "boot": 9, + "vram_total": 20640, + "vram": [ + { + "gpu": 0, + "used": 10928, + "total": 12288 + }, + { + "gpu": 1, + "used": 9712, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 256, + "b": 1024, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 UD-IQ4_NL", + "label": "tune ub=256 b=2048", + "avg_tps": 63.69, + "best_tps": 64.94, + "boot": 12, + "vram_total": 20614, + "vram": [ + { + "gpu": 0, + "used": 10902, + "total": 12288 + }, + { + "gpu": 1, + "used": 9712, + "total": 12288 + } + ], + "params": { + "t": 4, + 
"ub": 256, + "b": 2048, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 UD-IQ4_NL", + "label": "tune kv=q8_0/q8_0", + "avg_tps": 64.78, + "best_tps": 65.08, + "boot": 9, + "vram_total": 20422, + "vram": [ + { + "gpu": 0, + "used": 10644, + "total": 12288 + }, + { + "gpu": 1, + "used": 9778, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 64, + "b": 256, + "ngl": 999, + "ctk": "q8_0", + "ctv": "q8_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 UD-IQ4_NL", + "label": "tune kv=f16/f16", + "avg_tps": 65.53, + "best_tps": 65.81, + "boot": 9, + "vram_total": 22812, + "vram": [ + { + "gpu": 0, + "used": 11846, + "total": 12288 + }, + { + "gpu": 1, + "used": 10966, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 64, + "b": 256, + "ngl": 999, + "ctk": "f16", + "ctv": "f16" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 UD-IQ4_NL", + "label": "FINAL", + "avg_tps": 66.31, + "best_tps": 66.53, + "boot": 9, + "vram_total": 22811, + "vram": [ + { + "gpu": 0, + "used": 11845, + "total": 12288 + }, + { + "gpu": 1, + "used": 10966, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 64, + "b": 256, + "ngl": 999, + "ctk": "f16", + "ctv": "f16" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 MXFP4_MOE", + "label": "pure-GPU minbatch", + "avg_tps": 63.06, + "best_tps": 64.16, + "boot": 12, + "vram_total": 22747, + "vram": [ + { + "gpu": 0, + "used": 11895, + "total": 12288 + }, + { + "gpu": 1, + "used": 10852, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 64, + "b": 256, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 MXFP4_MOE", + "label": "pure-GPU nommap small", + "avg_tps": 63.75, + "best_tps": 63.98, + "boot": 9, + "vram_total": 22579, + "vram": [ + { + "gpu": 0, + "used": 11797, + "total": 12288 + }, + { + "gpu": 1, + "used": 10782, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 128, + "b": 512, + "noMmap": 
true, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 MXFP4_MOE", + "label": "pure-GPU ts=0.5,0.5", + "avg_tps": 62.88, + "best_tps": 63.9, + "boot": 12, + "vram_total": 22578, + "vram": [ + { + "gpu": 0, + "used": 11796, + "total": 12288 + }, + { + "gpu": 1, + "used": 10782, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 128, + "b": 512, + "tensorSplit": "0.5,0.5", + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 MXFP4_MOE", + "label": "pure-GPU all-tricks", + "avg_tps": 62.55, + "best_tps": 63.71, + "boot": 9, + "vram_total": 22743, + "vram": [ + { + "gpu": 0, + "used": 11891, + "total": 12288 + }, + { + "gpu": 1, + "used": 10852, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 64, + "b": 256, + "noMmap": true, + "defragThold": 0.1, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 MXFP4_MOE", + "label": "tune t=2", + "avg_tps": 63.07, + "best_tps": 64.08, + "boot": 9, + "vram_total": 22601, + "vram": [ + { + "gpu": 0, + "used": 11819, + "total": 12288 + }, + { + "gpu": 1, + "used": 10782, + "total": 12288 + } + ], + "params": { + "t": 2, + "ub": 128, + "b": 512, + "noMmap": true, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 MXFP4_MOE", + "label": "tune t=6", + "avg_tps": 63.58, + "best_tps": 64.04, + "boot": 9, + "vram_total": 22583, + "vram": [ + { + "gpu": 0, + "used": 11801, + "total": 12288 + }, + { + "gpu": 1, + "used": 10782, + "total": 12288 + } + ], + "params": { + "t": 6, + "ub": 128, + "b": 512, + "noMmap": true, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 MXFP4_MOE", + "label": "tune t=8", + "avg_tps": 62.92, + "best_tps": 63.73, + "boot": 9, + "vram_total": 22536, + "vram": [ + { + "gpu": 0, + "used": 11754, + "total": 12288 + }, + { + "gpu": 1, + "used": 10782, + 
"total": 12288 + } + ], + "params": { + "t": 8, + "ub": 128, + "b": 512, + "noMmap": true, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 MXFP4_MOE", + "label": "tune ub=256 b=1024", + "avg_tps": 62.76, + "best_tps": 63.86, + "boot": 9, + "vram_total": 22874, + "vram": [ + { + "gpu": 0, + "used": 11968, + "total": 12288 + }, + { + "gpu": 1, + "used": 10906, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 256, + "b": 1024, + "noMmap": true, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 MXFP4_MOE", + "label": "tune ub=256 b=2048", + "avg_tps": 62.74, + "best_tps": 63.9, + "boot": 9, + "vram_total": 22912, + "vram": [ + { + "gpu": 0, + "used": 12006, + "total": 12288 + }, + { + "gpu": 1, + "used": 10906, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 256, + "b": 2048, + "noMmap": true, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 MXFP4_MOE", + "label": "FINAL", + "avg_tps": 63.71, + "best_tps": 64.39, + "boot": 9, + "vram_total": 22566, + "vram": [ + { + "gpu": 0, + "used": 11784, + "total": 12288 + }, + { + "gpu": 1, + "used": 10782, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 128, + "b": 512, + "noMmap": true, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 Q4_K_M", + "label": "pure-GPU nommap small", + "avg_tps": 62.29, + "best_tps": 63.03, + "boot": 9, + "vram_total": 22975, + "vram": [ + { + "gpu": 0, + "used": 12007, + "total": 12288 + }, + { + "gpu": 1, + "used": 10968, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 128, + "b": 512, + "noMmap": true, + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 Q4_K_M", + "label": "pure-GPU ts=0.5,0.5", + "avg_tps": 63.89, + "best_tps": 64.91, + "boot": 12, + "vram_total": 23002, + "vram": [ + { + "gpu": 0, + "used": 
12034, + "total": 12288 + }, + { + "gpu": 1, + "used": 10968, + "total": 12288 + } + ], + "params": { + "t": 4, + "ub": 128, + "b": 512, + "tensorSplit": "0.5,0.5", + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 Q4_K_M", + "label": "tune t=2", + "avg_tps": 64.1, + "best_tps": 64.54, + "boot": 12, + "vram_total": 22980, + "vram": [ + { + "gpu": 0, + "used": 12012, + "total": 12288 + }, + { + "gpu": 1, + "used": 10968, + "total": 12288 + } + ], + "params": { + "t": 2, + "ub": 128, + "b": 512, + "tensorSplit": "0.5,0.5", + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 Q4_K_M", + "label": "tune t=6", + "avg_tps": 64.18, + "best_tps": 64.72, + "boot": 12, + "vram_total": 22982, + "vram": [ + { + "gpu": 0, + "used": 12014, + "total": 12288 + }, + { + "gpu": 1, + "used": 10968, + "total": 12288 + } + ], + "params": { + "t": 6, + "ub": 128, + "b": 512, + "tensorSplit": "0.5,0.5", + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + }, + { + "model": "Qwen3.5 Q4_K_M", + "label": "tune t=8", + "avg_tps": 63.11, + "best_tps": 64.02, + "boot": 12, + "vram_total": 22980, + "vram": [ + { + "gpu": 0, + "used": 12012, + "total": 12288 + }, + { + "gpu": 1, + "used": 10968, + "total": 12288 + } + ], + "params": { + "t": 8, + "ub": 128, + "b": 512, + "tensorSplit": "0.5,0.5", + "ngl": 999, + "ctk": "q4_0", + "ctv": "q4_0" + }, + "gpu_only": true + } +] \ No newline at end of file diff --git a/scripts/qwen_intermediate.csv b/scripts/qwen_intermediate.csv new file mode 100644 index 0000000..e94b673 --- /dev/null +++ b/scripts/qwen_intermediate.csv @@ -0,0 +1,12 @@ +model,label,avg,best,mode,vram,t,ub,b,kv,split,mmap +UD-IQ4_NL,pure-GPU minbatch,65.11,65.49,GPU,19177,t4,ub64,b256,q4_0/q4_0,, +UD-IQ4_NL,pure-GPU nommap small,65.01,65.36,GPU,19672,t4,ub128,b512,q4_0/q4_0,,nommap +UD-IQ4_NL,pure-GPU row-split,13.65,14.82,GPU,19427,t4,ub128,b512,q4_0/q4_0,row, 
+UD-IQ4_NL,pure-GPU ts=0.5,0.5,64.92,65.23,GPU,19664,t4,ub128,b512,q4_0/q4_0,, +UD-IQ4_NL,pure-GPU all-tricks,64.72,64.89,GPU,19171,t4,ub64,b256,q4_0/q4_0,,nommap +UD-IQ4_NL,tune t=2,64.87,65.13,GPU,19170,t2,ub64,b256,q4_0/q4_0,, +UD-IQ4_NL,tune t=6,64.88,65.17,GPU,19168,t6,ub64,b256,q4_0/q4_0,, +UD-IQ4_NL,tune t=8,64.5,64.77,GPU,19168,t8,ub64,b256,q4_0/q4_0,, +UD-IQ4_NL,tune ub=256 b=1024,64.73,64.98,GPU,20640,t4,ub256,b1024,q4_0/q4_0,, +UD-IQ4_NL,tune ub=256 b=2048,63.69,64.94,GPU,20614,t4,ub256,b2048,q4_0/q4_0,, +UD-IQ4_NL,tune kv=q8_0/q8_0,64.78,65.08,GPU,20422,t4,ub64,b256,q8_0/q8_0,, \ No newline at end of file diff --git a/scripts/qwen_latest.txt b/scripts/qwen_latest.txt new file mode 100644 index 0000000..5c3446e --- /dev/null +++ b/scripts/qwen_latest.txt @@ -0,0 +1,24 @@ +UD-IQ4_NL | pure-GPU minbatch | 65.11 | GPU | 19177 +UD-IQ4_NL | pure-GPU nommap small | 65.01 | GPU | 19672 +UD-IQ4_NL | pure-GPU row-split | 13.65 | GPU | 19427 +UD-IQ4_NL | pure-GPU ts=0.5,0.5 | 64.92 | GPU | 19664 +UD-IQ4_NL | pure-GPU all-tricks | 64.72 | GPU | 19171 +UD-IQ4_NL | tune t=2 | 64.87 | GPU | 19170 +UD-IQ4_NL | tune t=6 | 64.88 | GPU | 19168 +UD-IQ4_NL | tune t=8 | 64.5 | GPU | 19168 +UD-IQ4_NL | tune ub=256 b=1024 | 64.73 | GPU | 20640 +UD-IQ4_NL | tune ub=256 b=2048 | 63.69 | GPU | 20614 +UD-IQ4_NL | tune kv=q8_0/q8_0 | 64.78 | GPU | 20422 +UD-IQ4_NL | tune kv=f16/f16 | 65.53 | GPU | 22812 +UD-IQ4_NL | FINAL | 66.31 | GPU | 22811 +MXFP4_MOE | pure-GPU minbatch | 63.06 | GPU | 22747 +MXFP4_MOE | pure-GPU nommap small | 63.75 | GPU | 22579 +MXFP4_MOE | pure-GPU ts=0.5,0.5 | 62.88 | GPU | 22578 +MXFP4_MOE | pure-GPU all-tricks | 62.55 | GPU | 22743 +MXFP4_MOE | tune t=2 | 63.07 | GPU | 22601 +MXFP4_MOE | tune t=6 | 63.58 | GPU | 22583 +MXFP4_MOE | tune t=8 | 62.92 | GPU | 22536 +MXFP4_MOE | tune ub=256 b=1024 | 62.76 | GPU | 22874 +MXFP4_MOE | tune ub=256 b=2048 | 62.74 | GPU | 22912 +MXFP4_MOE | FINAL | 63.71 | GPU | 22566 +Q4_K_M | pure-GPU nommap small | 62.29 | GPU | 
22975 \ No newline at end of file diff --git a/scripts/test_20ts.txt b/scripts/test_20ts.txt new file mode 100644 index 0000000..90f3e9e Binary files /dev/null and b/scripts/test_20ts.txt differ diff --git a/scripts/tune_122b_20ts.mjs b/scripts/tune_122b_20ts.mjs new file mode 100644 index 0000000..f700592 --- /dev/null +++ b/scripts/tune_122b_20ts.mjs @@ -0,0 +1,64 @@ +import { exec, spawn } from 'child_process'; + +const delay = ms => new Promise(res => setTimeout(res, ms)); + +async function runTest(modelArgs, envVars, name) { + console.log(`\n===========================================`); + console.log(`Testing: ${name}`); + console.log(`Args: ${modelArgs}`); + + return new Promise(async (resolve) => { + await new Promise(r => exec('taskkill /F /IM llama-server.exe', r)); + await delay(2000); + + const env = { ...process.env, ...envVars }; + const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), { + detached: true, + stdio: 'ignore', + env + }); + + let ready = false; + for (let i = 0; i < 40; i++) { + try { + const res = await fetch('http://127.0.0.1:8000/health', { timeout: 2000 }); + if (res.status === 200) { + ready = true; + break; + } + } catch (e) {} + await delay(3000); + } + + if (!ready) { + console.log(`[${name}] FAILED TO BOOT`); + exec('taskkill /F /IM llama-server.exe'); + resolve({ success: false }); + return; + } + + console.log(`[${name}] Server Ready! Running benchmark...`); + exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => { + console.log(stdout || stderr); + exec('taskkill /F /IM llama-server.exe'); + resolve({ success: true }); + }); + }); +} + +async function main() { + const baseLineArgs = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 8192 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 256 -b 1024 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`; + + // 1. 
Dual GPU, Split Mode Layer, Maximize VRAM usage (estimate 32 layers offloaded out of 48) + await runTest(`${baseLineArgs} --split-mode layer --n-cpu-moe 32`, {}, "122B: n-cpu-moe 32 + sm layer"); + + // 2. Dual GPU, Split Mode Layer, even more aggressive GPU usage + await runTest(`${baseLineArgs} --split-mode layer --n-cpu-moe 28`, {}, "122B: n-cpu-moe 28 + sm layer"); + + // 3. Fallback to 36 if OOM happens on 32/28 + await runTest(`${baseLineArgs} --split-mode layer --n-cpu-moe 36`, {}, "122B: n-cpu-moe 36 + sm layer"); + + console.log("\nALL TESTS COMPLETED"); +} + +main(); diff --git a/scripts/tune_exact.mjs b/scripts/tune_exact.mjs new file mode 100644 index 0000000..52e4367 --- /dev/null +++ b/scripts/tune_exact.mjs @@ -0,0 +1,72 @@ +import { exec, spawn } from 'child_process'; + +const delay = ms => new Promise(res => setTimeout(res, ms)); + +async function runTest(modelArgs, envVars, name) { + console.log(`\n===========================================`); + console.log(`Testing: ${name}`); + console.log(`Env: ${JSON.stringify(envVars)}`); + console.log(`Args: ${modelArgs}`); + + return new Promise(async (resolve) => { + await new Promise(r => exec('taskkill /F /IM llama-server.exe', r)); + await delay(2000); + + const env = { ...process.env, ...envVars }; + const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), { + detached: true, + stdio: 'ignore', + env + }); + + let ready = false; + + for (let i = 0; i < 40; i++) { + try { + const res = await fetch('http://127.0.0.1:8000/health', { timeout: 2000 }); + if (res.status === 200) { + ready = true; + break; + } + } catch (e) {} + await delay(3000); + } + + if (!ready) { + console.log(`[${name}] FAILED TO BOOT`); + exec('taskkill /F /IM llama-server.exe'); + resolve({ success: false }); + return; + } + + console.log(`[${name}] Server Ready! 
Running speed test...`); + exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => { + console.log(stdout || stderr); + exec('taskkill /F /IM llama-server.exe'); + resolve({ success: true }); + }); + }); +} + +async function main() { + // 1. 122B-A10B: Pure GPU offload (No n-cpu-moe at all) + // -ngl 999 will offload all 48 layers to the NVIDIA driver (triggering Shared VRAM fallback on Windows) + const args122B = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`; + await runTest(args122B, {}, "122B-A10B: Pure GPU (NVIDIA Shared Memory Fallback)"); + + // 2. 35B-A3B: Pure GPU tuning to hit 70 t/s + // Base configuration from previous full-gpu run: + const args35B = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 128 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`; + + // We already got ~64 t/s basically. 
+ // Let's try MMQ for custom matrix multiplication which is often faster on Ampere for low batch generation + await runTest(args35B, { GGML_CUDA_FORCE_MMQ: "1" }, "35B-A3B: Force MMQ = 1"); + + // Try increasing threads to 12 just in case + const args35B_t12 = args35B.replace("-t 6 -tb 6", "-t 12 -tb 12"); + await runTest(args35B_t12, { GGML_CUDA_FORCE_MMQ: "1" }, "35B-A3B: Threads 12 + MMQ"); + + console.log("\nALL TESTS COMPLETED"); +} + +main(); diff --git a/scripts/tune_models.mjs b/scripts/tune_models.mjs new file mode 100644 index 0000000..1726f1f --- /dev/null +++ b/scripts/tune_models.mjs @@ -0,0 +1,84 @@ +import { exec, spawn } from 'child_process'; + +const delay = ms => new Promise(res => setTimeout(res, ms)); + +async function runTest(modelArgs, name) { + console.log(`\n===========================================`); + console.log(`Testing: ${name}`); + console.log(`Args: ${modelArgs}`); + + return new Promise(async (resolve) => { + // Kill existing + await new Promise(r => exec('taskkill /F /IM llama-server.exe', r)); + await delay(2000); + + const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), { + detached: true, + stdio: 'ignore' + }); + + let ready = false; + let oom = false; + + for (let i = 0; i < 40; i++) { + try { + const res = await fetch('http://127.0.0.1:8000/health', { timeout: 2000 }); + if (res.status === 200) { + ready = true; + break; + } + } catch (e) {} + await delay(3000); + } + + if (!ready) { + console.log(`[${name}] FAILED TO BOOT (Likely OOM)`); + exec('taskkill /F /IM llama-server.exe'); + resolve({ success: false }); + return; + } + + console.log(`[${name}] Server Ready! 
Running benchmark...`); + // Run pptest + exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => { + console.log(stdout || stderr); + + // Extract TG and PP from TG-500 + const tgMatch = stdout.match(/TG-500 \| PP:\d+tok \d+\.\dt\/s \| TG:\d+tok (\d+\.\d+)t\/s/); + const ppMatch = stdout.match(/10K-CODE \| PP:\d+tok (\d+\.\d+)t\/s/); + + const tg = tgMatch ? parseFloat(tgMatch[1]) : 0; + const pp = ppMatch ? parseFloat(ppMatch[1]) : 0; + + exec('taskkill /F /IM llama-server.exe'); + resolve({ success: true, tg, pp }); + }); + }); +} + +async function main() { + // 1. Qwen 35B Tuning: We need 70 t/s. Let's try 1-3 layers of n-cpu-moe to unlock ub=512 + const args35B_base = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`; + + // Test 1: n-cpu-moe 1, ub 512 + await runTest(`${args35B_base} -ub 512 --n-cpu-moe 1`, "Qwen-35B: moe=1, ub=512"); + + // Test 2: n-cpu-moe 2, ub 512 + await runTest(`${args35B_base} -ub 512 --n-cpu-moe 2`, "Qwen-35B: moe=2, ub=512"); + + // Test 3: n-cpu-moe 4, ub 512 + await runTest(`${args35B_base} -ub 512 --n-cpu-moe 4`, "Qwen-35B: moe=4, ub=512"); + + // 2. 122B Tuning: Find optimal n-cpu-moe + const args122B_base = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`; + + // Since 48 leaves 16GB free, each layer is ~1.5GB total, meaning ~0.75GB per GPU. 
+ // Let's try 38, 35, 30 + await runTest(`${args122B_base} --n-cpu-moe 38`, "Qwen-122B: moe=38"); + await runTest(`${args122B_base} --n-cpu-moe 30`, "Qwen-122B: moe=30"); + await runTest(`${args122B_base} --n-cpu-moe 22`, "Qwen-122B: moe=22"); + + console.log("Tuning finished."); +} + +main(); diff --git a/scripts/tune_results_gemma4_256k.json b/scripts/tune_results_gemma4_256k.json new file mode 100644 index 0000000..d6933c0 --- /dev/null +++ b/scripts/tune_results_gemma4_256k.json @@ -0,0 +1,591 @@ +[ + { + "ngl": 22, + "t": 8, + "tb": 8, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.22049935826915, + "best_tps": 25.971732307567606, + "vram_used": 11953, + "vram_total": 12288, + "label": "ngl=22" + }, + { + "ngl": 21, + "t": 8, + "tb": 8, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.805518952775174, + "best_tps": 25.953896683689454, + "vram_used": 11942, + "vram_total": 12288, + "label": "ngl=21" + }, + { + "ngl": 20, + "t": 8, + "tb": 8, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 23.537353232262834, + "best_tps": 24.32109262330477, + "vram_used": 11972, + "vram_total": 12288, + "label": "ngl=20" + }, + { + "ngl": 21, + "t": 2, + "tb": 2, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 20.167581352340264, + "best_tps": 20.701192443418005, + "vram_used": 11969, + "vram_total": 12288, + "label": "t=2 | tb=2" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.689104997668554, + "best_tps": 26.328541632880874, + 
"vram_used": 11975, + "vram_total": 12288, + "label": "t=4 | tb=4" + }, + { + "ngl": 21, + "t": 4, + "tb": 8, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.294470150452725, + "best_tps": 26.541251363470614, + "vram_used": 11984, + "vram_total": 12288, + "label": "t=4 | tb=8" + }, + { + "ngl": 21, + "t": 6, + "tb": 6, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.307859289404675, + "best_tps": 26.292208504543133, + "vram_used": 11984, + "vram_total": 12288, + "label": "t=6 | tb=6" + }, + { + "ngl": 21, + "t": 6, + "tb": 8, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.230599923243314, + "best_tps": 26.366065850165732, + "vram_used": 11983, + "vram_total": 12288, + "label": "t=6 | tb=8" + }, + { + "ngl": 21, + "t": 8, + "tb": 8, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.113108026759278, + "best_tps": 26.123872617669583, + "vram_used": 11984, + "vram_total": 12288, + "label": "t=8 | tb=8" + }, + { + "ngl": 21, + "t": 8, + "tb": 12, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.05545428888364, + "best_tps": 26.06377500079152, + "vram_used": 11983, + "vram_total": 12288, + "label": "t=8 | tb=12" + }, + { + "ngl": 21, + "t": 10, + "tb": 10, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 24.706926870374986, + "best_tps": 25.03033604251865, + "vram_used": 11984, + "vram_total": 12288, + "label": "t=10 | tb=10" + }, + { + "ngl": 21, + "t": 12, + "tb": 12, + 
"ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 22.468055564001904, + "best_tps": 23.425983251691825, + "vram_used": 11989, + "vram_total": 12288, + "label": "t=12 | tb=12" + }, + { + "ngl": 21, + "t": 16, + "tb": 16, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 21.176973905195442, + "best_tps": 21.482429642395456, + "vram_used": 12021, + "vram_total": 12288, + "label": "t=16 | tb=16" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 128, + "b": 512, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.545748810106186, + "best_tps": 26.344547829145817, + "vram_used": 11986, + "vram_total": 12288, + "label": "ub=128 | b=512" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 256, + "b": 1024, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.503875205368377, + "best_tps": 26.393548686102108, + "vram_used": 11981, + "vram_total": 12288, + "label": "ub=256 | b=1024" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 256, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.46500292415627, + "best_tps": 26.2726382287537, + "vram_used": 11981, + "vram_total": 12288, + "label": "ub=256 | b=2048" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 512, + "b": 1024, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.50982209452459, + "best_tps": 26.292282671074723, + "vram_used": 12020, + "vram_total": 12288, + "label": "ub=512 | b=1024" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": 
true, + "prio": 2, + "poll": 50, + "avg_tps": 25.39646674356899, + "best_tps": 26.28106356028714, + "vram_used": 12020, + "vram_total": 12288, + "label": "ub=512 | b=2048" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 512, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.471945933724726, + "best_tps": 26.268422652962233, + "vram_used": 12021, + "vram_total": 12288, + "label": "ub=512 | b=4096" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 1024, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.722119623856702, + "best_tps": 26.497264927416403, + "vram_used": 12019, + "vram_total": 12288, + "label": "ub=1024 | b=2048" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 1024, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.665819493145943, + "best_tps": 26.301163428594148, + "vram_used": 12019, + "vram_total": 12288, + "label": "ub=1024 | b=4096" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 1024, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.464915272955533, + "best_tps": 26.40667691713752, + "vram_used": 12019, + "vram_total": 12288, + "label": "ctk=q4_0 | ctv=q4_0" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 1024, + "b": 2048, + "ctk": "q8_0", + "ctv": "q8_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.489715990281564, + "best_tps": 25.884133821146627, + "vram_used": 12011, + "vram_total": 12288, + "label": "ctk=q8_0 | ctv=q8_0" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 1024, + "b": 2048, + "ctk": "q4_0", + "ctv": "q8_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 22.751034104721082, + "best_tps": 
22.91250972782414, + "vram_used": 12017, + "vram_total": 12288, + "label": "ctk=q4_0 | ctv=q8_0" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 1024, + "b": 2048, + "ctk": "f16", + "ctv": "f16", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 24.745831571513975, + "best_tps": 25.53926086004382, + "vram_used": 11985, + "vram_total": 12288, + "label": "ctk=f16 | ctv=f16" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 1024, + "b": 2048, + "ctk": "q8_0", + "ctv": "q8_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.21575943186602, + "best_tps": 25.796865637378264, + "vram_used": 12013, + "vram_total": 12288, + "label": "mmap=True | poll=50 | prio=2" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 1024, + "b": 2048, + "ctk": "q8_0", + "ctv": "q8_0", + "fa": "on", + "mlock": true, + "mmap": false, + "prio": 2, + "poll": 50, + "avg_tps": 23.88172807693179, + "best_tps": 24.803356430302312, + "vram_used": 12016, + "vram_total": 12288, + "label": "mmap=False | poll=50 | prio=2" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 1024, + "b": 2048, + "ctk": "q8_0", + "ctv": "q8_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 0, + "avg_tps": 25.041321207287698, + "best_tps": 25.88479834694897, + "vram_used": 12017, + "vram_total": 12288, + "label": "mmap=True | poll=0 | prio=2" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 1024, + "b": 2048, + "ctk": "q8_0", + "ctv": "q8_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 100, + "avg_tps": 25.27990666474703, + "best_tps": 26.034861156695197, + "vram_used": 12017, + "vram_total": 12288, + "label": "mmap=True | poll=100 | prio=2" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 1024, + "b": 2048, + "ctk": "q8_0", + "ctv": "q8_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 3, + "poll": 50, + "avg_tps": 25.360977804679788, + "best_tps": 26.0705565191107, + "vram_used": 
12022, + "vram_total": 12288, + "label": "mmap=True | poll=50 | prio=3" + }, + { + "ngl": 21, + "t": 4, + "tb": 4, + "ub": 1024, + "b": 2048, + "ctk": "q8_0", + "ctv": "q8_0", + "fa": "on", + "mlock": true, + "mmap": false, + "prio": 3, + "poll": 0, + "avg_tps": 24.156893523381967, + "best_tps": 24.840307911026144, + "vram_used": 12021, + "vram_total": 12288, + "label": "mmap=False | poll=0 | prio=3" + } +] \ No newline at end of file diff --git a/scripts/tune_results_gemma4_ncpumoe.json b/scripts/tune_results_gemma4_ncpumoe.json new file mode 100644 index 0000000..e6bf6fb --- /dev/null +++ b/scripts/tune_results_gemma4_ncpumoe.json @@ -0,0 +1,201 @@ +[ + { + "label": "ncpumoe=0", + "ncpumoe": 0, + "avg": 15.396949591766335, + "best": 20.220093309883133, + "vram": 12011, + "nommap": false + }, + { + "label": "ncpumoe=5", + "ncpumoe": 5, + "avg": 4.853957926040404, + "best": 4.9029479257524216, + "vram": 11945, + "nommap": false + }, + { + "label": "ncpumoe=10", + "ncpumoe": 10, + "avg": 20.64137159193706, + "best": 26.474940718957154, + "vram": 12020, + "nommap": false + }, + { + "label": "ncpumoe=15", + "ncpumoe": 15, + "avg": 13.424368433101165, + "best": 13.698684361880598, + "vram": 12018, + "nommap": false + }, + { + "label": "ncpumoe=20", + "ncpumoe": 20, + "avg": 10.338449574838693, + "best": 13.495275411319872, + "vram": 11530, + "nommap": true + }, + { + "label": "ncpumoe=25", + "ncpumoe": 25, + "avg": 12.920348175328435, + "best": 12.99923042323437, + "vram": 11625, + "nommap": true + }, + { + "label": "ncpumoe=30", + "ncpumoe": 30, + "avg": 13.251690836275145, + "best": 13.253697466971921, + "vram": 9064, + "nommap": true + }, + { + "label": "ncpumoe=7", + "ncpumoe": 7, + "avg": 16.31796299658782, + "best": 23.160760806218782, + "vram": 11994, + "nommap": false + }, + { + "label": "ncpumoe=9", + "ncpumoe": 9, + "avg": 7.469651892205037, + "best": 10.875064047449284, + "vram": 11941, + "nommap": false + }, + { + "label": "ncpumoe=11", + "ncpumoe": 11, + 
"avg": 14.814740144776437, + "best": 15.199641279675724, + "vram": 11984, + "nommap": false + }, + { + "label": "ncpumoe=13", + "ncpumoe": 13, + "avg": 14.183175252947136, + "best": 14.427257794639086, + "vram": 12003, + "nommap": false + }, + { + "label": "t=2", + "ncpumoe": 10, + "avg": 28.551811207068425, + "best": 28.688565545389164, + "vram": 11968, + "t": 2, + "nommap": false + }, + { + "label": "t=4", + "ncpumoe": 10, + "avg": 30.8619310622166, + "best": 31.17677746690393, + "vram": 11972, + "t": 4, + "nommap": false + }, + { + "label": "t=6", + "ncpumoe": 10, + "avg": 30.578454576249854, + "best": 30.971792125516313, + "vram": 11983, + "t": 6, + "nommap": false + }, + { + "label": "t=8", + "ncpumoe": 10, + "avg": 30.529393512116172, + "best": 30.954830478128166, + "vram": 11982, + "t": 8, + "nommap": false + }, + { + "label": "t=10", + "ncpumoe": 10, + "avg": 30.773041112229503, + "best": 31.00899077264753, + "vram": 11972, + "t": 10, + "nommap": false + }, + { + "label": "ub=256,b=1024", + "ncpumoe": 10, + "avg": 30.49319055490045, + "best": 30.691055921541377, + "vram": 11993, + "t": 4, + "ub": 256, + "b": 1024, + "nommap": false + }, + { + "label": "ub=512,b=2048", + "ncpumoe": 10, + "avg": 30.923573731331718, + "best": 31.902272031660825, + "vram": 11995, + "t": 4, + "ub": 512, + "b": 2048, + "nommap": false + }, + { + "label": "ub=512,b=4096", + "ncpumoe": 10, + "avg": 30.723820162954862, + "best": 31.065476003548053, + "vram": 11966, + "t": 4, + "ub": 512, + "b": 4096, + "nommap": false + }, + { + "label": "ub=1024,b=2048", + "ncpumoe": 10, + "avg": 30.489888387093156, + "best": 30.982074615885946, + "vram": 11964, + "t": 4, + "ub": 1024, + "b": 2048, + "nommap": false + }, + { + "label": "kv=q4_0", + "ncpumoe": 10, + "avg": 30.63156129571348, + "best": 31.088674795634944, + "vram": 11988, + "t": 4, + "ctk": "q4_0", + "ctv": "q4_0", + "nommap": false + }, + { + "label": "kv=q8_0", + "ncpumoe": 10, + "avg": 29.6114222576863, + "best": 
30.580427895917573, + "vram": 11980, + "t": 4, + "ctk": "q8_0", + "ctv": "q8_0", + "nommap": false + } +] \ No newline at end of file diff --git a/scripts/tune_results_qwen35b_256k.json b/scripts/tune_results_qwen35b_256k.json new file mode 100644 index 0000000..e82d0be --- /dev/null +++ b/scripts/tune_results_qwen35b_256k.json @@ -0,0 +1,522 @@ +[ + { + "ngl": 999, + "cpu_moe": true, + "t": 6, + "tb": 6, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 26.169961832638464, + "best_tps": 26.533887071573073, + "vram_used": 4994, + "vram_total": 12288, + "label": "cpu_moe=True" + }, + { + "ngl": 999, + "cpu_moe": false, + "t": 6, + "tb": 6, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 11.065030380022206, + "best_tps": 11.083028272674314, + "vram_used": 11949, + "vram_total": 12288, + "label": "cpu_moe=False" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 2, + "tb": 2, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 21.473286428302767, + "best_tps": 21.746637577851104, + "vram_used": 4994, + "vram_total": 12288, + "label": "t=2 | tb=2" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 26.552358479030676, + "best_tps": 27.314237654089343, + "vram_used": 4991, + "vram_total": 12288, + "label": "t=4 | tb=4" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 6, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 26.347068485327956, + "best_tps": 26.87924726131441, + "vram_used": 4993, + "vram_total": 12288, + "label": 
"t=4 | tb=6" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 6, + "tb": 6, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 26.331286039513458, + "best_tps": 26.81427299445741, + "vram_used": 5001, + "vram_total": 12288, + "label": "t=6 | tb=6" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 6, + "tb": 8, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 26.391160513711274, + "best_tps": 26.735573238878736, + "vram_used": 5001, + "vram_total": 12288, + "label": "t=6 | tb=8" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 8, + "tb": 8, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 25.32340666199144, + "best_tps": 25.87949347494079, + "vram_used": 4995, + "vram_total": 12288, + "label": "t=8 | tb=8" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 10, + "tb": 10, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 23.752277317850815, + "best_tps": 24.98242898809555, + "vram_used": 5011, + "vram_total": 12288, + "label": "t=10 | tb=10" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 12, + "tb": 12, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 21.75032196383532, + "best_tps": 23.18963400077116, + "vram_used": 5104, + "vram_total": 12288, + "label": "t=12 | tb=12" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 128, + "b": 512, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 13.27593572827031, + "best_tps": 13.337407402920235, + "vram_used": 4391, + "vram_total": 12288, + "label": "ub=128 
| b=512" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 256, + "b": 1024, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 26.638687188233188, + "best_tps": 27.361082444434413, + "vram_used": 4495, + "vram_total": 12288, + "label": "ub=256 | b=1024" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 256, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 26.29069503392877, + "best_tps": 26.63368832924803, + "vram_used": 4490, + "vram_total": 12288, + "label": "ub=256 | b=2048" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 512, + "b": 1024, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 26.518331831441134, + "best_tps": 26.972021321271527, + "vram_used": 4984, + "vram_total": 12288, + "label": "ub=512 | b=1024" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 512, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 26.401541912276873, + "best_tps": 26.46530849236633, + "vram_used": 4990, + "vram_total": 12288, + "label": "ub=512 | b=2048" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 512, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 26.892711500590455, + "best_tps": 26.892711500590455, + "vram_used": 5006, + "vram_total": 12288, + "label": "ub=512 | b=4096" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 1024, + "b": 2048, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 12.600209659679201, + "best_tps": 12.759356030807627, + "vram_used": 12020, + "vram_total": 
12288, + "label": "ub=1024 | b=2048" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 1024, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 6.023959262370547, + "best_tps": 8.284882268188156, + "vram_used": 11931, + "vram_total": 12288, + "label": "ub=1024 | b=4096" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 512, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 12.96992950856374, + "best_tps": 12.96992950856374, + "vram_used": 12022, + "vram_total": 12288, + "label": "ctk=q4_0 | ctv=q4_0" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 512, + "b": 4096, + "ctk": "q8_0", + "ctv": "q8_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 11.420078920350697, + "best_tps": 13.524778595767653, + "vram_used": 12030, + "vram_total": 12288, + "label": "ctk=q8_0 | ctv=q8_0" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 512, + "b": 4096, + "ctk": "f16", + "ctv": "f16", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 11.978106511464183, + "best_tps": 13.729190013094977, + "vram_used": 11518, + "vram_total": 12288, + "label": "ctk=f16 | ctv=f16" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 512, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 50, + "avg_tps": 16.164278220452957, + "best_tps": 22.645890325274323, + "vram_used": 11623, + "vram_total": 12288, + "label": "mmap=True | poll=50 | prio=2" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 512, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": false, + "prio": 2, + "poll": 50, + "avg_tps": 16.555542780023114, + "best_tps": 
23.333815015033892, + "vram_used": 9062, + "vram_total": 12288, + "label": "mmap=False | poll=50 | prio=2" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 512, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 0, + "avg_tps": 13.003619379106329, + "best_tps": 13.031594557134142, + "vram_used": 11994, + "vram_total": 12288, + "label": "mmap=True | poll=0 | prio=2" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 512, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 2, + "poll": 100, + "avg_tps": 5.7762452690702935, + "best_tps": 5.795560155803046, + "vram_used": 11953, + "vram_total": 12288, + "label": "mmap=True | poll=100 | prio=2" + }, + { + "ngl": 999, + "cpu_moe": true, + "t": 4, + "tb": 4, + "ub": 512, + "b": 4096, + "ctk": "q4_0", + "ctv": "q4_0", + "fa": "on", + "mlock": true, + "mmap": true, + "prio": 3, + "poll": 50, + "avg_tps": 12.59406799687573, + "best_tps": 14.966737641114795, + "vram_used": 11996, + "vram_total": 12288, + "label": "mmap=True | poll=50 | prio=3" + } +] \ No newline at end of file