Update tuning scripts and add task creation to sync_vikunja.js

2026-04-06 21:49:56 +09:00
parent 626a089b6b
commit 7c7a899fd5
61 changed files with 8705 additions and 1566 deletions
--- a/.agent/.agent/scripts/translate_gsd.py
+++ b/.agent/.agent/scripts/translate_gsd.py
@@ -1,86 +0,0 @@
-import os
-import glob
-import re
-
-skill_dir = r"C:\Users\Certes\.gemini\antigravity\skills"
-
-translations = {
-    "Manage parallel workstreams — list, create, switch, status, progress, complete, and resume": "병렬 작업 스트림 관리 — 목록, 생성, 전환, 상태, 진행률, 완료 및 재개",
-    "Validate built features through conversational UAT": "대화형 UAT를 통해 구현된 기능 검증",
-    "Retroactively audit and fill Nyquist validation gaps for a completed phase": "완료된 단계에 대한 검증 누락 사후 감사 및 보완",
-    "Update GSD to latest version with changelog display": "GSD를 최신 버전으로 업데이트하고 변경 사항 표시",
-    "Retroactive 6-pillar visual audit of implemented frontend code": "구현된 프론트엔드 코드에 대한 6개 요소 시각적 사후 감사",
-    "Generate UI design contract (UI-SPEC.md) for frontend phases": "프론트엔드 단계를 위한 UI 디자인 명세서(UI-SPEC.md) 생성",
-    "Manage persistent context threads for cross-session work": "교차 세션 작업을 위한 영구 컨텍스트 스레드 관리",
-    "Display project statistics — phases, plans, requirements, git metrics, and timeline": "프로젝트 통계 표시 — 단계, 계획, 요구사항, Git 지표 및 타임라인",
-    "Create PR, run review, and prepare for merge after verification passes": "검증 통과 후 PR 생성, 리뷰 실행 및 병합 준비",
-    "Configure GSD workflow toggles and model profile": "GSD 워크플로우 옵션 및 모델 프로필 구성",
-    "Switch model profile for GSD agents (quality/balanced/budget/inherit)": "GSD 요원의 모델 프로필 전환 (고품질/균형/예산/상속)",
-    "Generate a session report with token usage estimates, work summary, and outcomes": "토큰 사용량, 작업 요약 및 결과를 포함한 세션 보고서 생성",
-    "Review and promote backlog items to active milestone": "백로그 항목을 검토하고 활성 마일스톤으로 승격",
-    "Request cross-AI peer review of phase plans from external AI CLIs": "외부 AI CLI에 단계 계획에 대한 교차 AI 동료 리뷰 요청",
-    "Resume work from previous session with full context restoration": "전체 컨텍스트 복원과 함께 이전 세션에서 작업 재개",
-    "Research how to implement a phase (standalone - usually use /gsd-plan-phase instead)": "단계를 구현하는 방법 리서치 (단독 실행 - 보통 /gsd-plan-phase 사용)",
-    "Remove a GSD workspace and clean up worktrees": "GSD 워크스페이스 제거 및 워크트리 정리",
-    "Remove a future phase from roadmap and renumber subsequent phases": "로드맵에서 향후 단계를 제거하고 이후 단계 번호 재지정",
-    "Reapply local modifications after a GSD update": "GSD 업데이트 후 로컬 수정 사항 재적용",
-    "Execute a quick task with GSD guarantees (atomic commits, state tracking) but skip optional agents": "GSD 보장(원자적 커밋, 상태 추적)을 사용하여 빠른 작업을 실행하되 선택적 요원 생략",
-    "Check project progress, show context, and route to next action (execute or plan)": "프로젝트 진행 상황 확인, 컨텍스트 표시 및 다음 작업(실행 또는 계획)으로 라우팅",
-    "Generate developer behavioral profile and create Claude-discoverable artifacts": "개발자 행동 프로필을 생성하고 AI가 인지할 수 있는 문서 작성",
-    "Create a clean PR branch by filtering out .planning/ commits — ready for code review": ".planning/ 커밋을 필터링하여 깔끔한 PR 브랜치 생성 — 코드 리뷰 준비",
-    "Capture a forward-looking idea with trigger conditions — surfaces automatically at the right milestone": "향후 아이디어를 트리거 조건과 함께 캡처 — 적절한 마일스톤에서 자동 표시",
-    "Create detailed phase plan (PLAN.md) with verification loop": "검증 루프를 포함한 상세 단계 계획(PLAN.md) 생성",
-    "Create phases to close all gaps identified by milestone audit": "마일스톤 감사에서 식별된 모든 격차를 해소하기 위한 단계 생성",
-    "Create context handoff when pausing work mid-phase": "작업 중단 시 컨텍스트 인수인계 파일 생성",
-    "Zero-friction idea capture. Append, list, or promote notes to todos.": "방해 없는 아이디어 캡처. 메모 추가, 나열 또는 할 일로 승격.",
-    "Automatically advance to the next logical step in the GSD workflow": "GSD 워크플로우의 다음 논리적 단계로 자동 진행",
-    "Create an isolated workspace with repo copies and independent .planning/": "외부 레포 사본 및 독립적인 .planning/을 갖춘 격리된 워크스페이스 생성",
-    "Initialize a new project with deep context gathering and PROJECT.md": "심층 컨텍스트 수집 및 PROJECT.md와 함께 새 프로젝트 초기화",
-    "Start a new milestone cycle — update PROJECT.md and route to requirements": "새로운 마일스톤 주기 시작 — PROJECT.md 업데이트 및 요구사항 재정의",
-    "Generate a comprehensive project summary from milestone artifacts for team onboarding and review": "팀 온보딩 및 리뷰를 위해 마일스톤 산출물에서 종합적인 프로젝트 요약 생성",
-    "Analyze codebase with parallel mapper agents to produce .planning/codebase/ documents": "병렬 매퍼 요원으로 코드베이스를 분석하여 .planning/codebase/ 문서 생성",
-    "Interactive command center for managing multiple phases from one terminal": "하나의 터미널에서 여러 단계를 관리하는 대화형 명령 센터",
-    "List active GSD workspaces and their status": "활성 GSD 워크스페이스 및 상태 나열",
-    "Surface the agent's assumptions about a phase approach before planning": "계획 전 단계적 접근 방식에 대한 요원의 가정을 미리 표시",
-    "Join the GSD Discord community": "GSD 디스코드 커뮤니티 참가",
-    "Insert urgent work as decimal phase (e.g., 72.1) between existing phases": "기존 단계 사이에 소수점 단계(예: 72.1)로 긴급 작업 삽입",
-    "Show available GSD commands and usage guide": "사용 가능한 GSD 명령어 및 사용 가이드 표시",
-    "Diagnose planning directory health and optionally repair issues": "계획 디렉토리 상태 진단 및 선택적으로 문제 복구",
-    "Post-mortem investigation for failed GSD workflows — analyzes git history, artifacts, and state to diagnose what went wrong": "실패한 GSD 워크플로우에 대한 사후 조사 — git 기록, 문서 및 상태 분석",
-    "Execute a trivial task inline — no subagents, no planning overhead": "인라인으로 사소한 작업 실행 — 서브 에이전트 및 계획 오버헤드 없음",
-    "Execute all plans in a phase with wave-based parallelization": "웨이브(Wave) 기반 병렬 처리를 사용하여 단계의 모든 계획 실행",
-    "Route freeform text to the right GSD command automatically": "자유 형식 텍스트를 적절한 GSD 명령으로 자동 라우팅",
-    "Systematic debugging with persistent state across context resets": "컨텍스트가 리셋되어도 상태를 유지하는 체계적인 디버깅",
-    "Gather phase context through adaptive questioning before planning. Use --auto to skip interactive questions (the agent picks recommended defaults).": "계획 전 심층 질문을 통해 단계 컨텍스트 수집. 대화형 건너뛰기(--auto) 가능.",
-    "Archive completed milestone and prepare for next version": "완료된 마일스톤 보관 및 다음 버전 준비",
-    "List pending todos and select one to work on": "보류 중인 할 일 목록 표시 및 작업할 항목 선택",
-    "Cross-phase audit of all outstanding UAT and verification items": "모든 미결 UAT 및 검증 항목에 대한 전체 단계 교차 감사",
-    "Audit milestone completion against original intent before archiving": "보관 전 원래 의도와 비교하여 마일스톤 달성 여부 감사",
-    "Capture idea or task as todo from current conversation context": "현재 대화 컨텍스트에서 아이디어 또는 작업을 할 일로 캡처",
-    "Generate tests for a completed phase based on UAT criteria and implementation": "UAT 기준 및 구현을 기반으로 완료된 단계에 대한 테스트 생성",
-    "Add phase to end of current milestone in roadmap": "로드맵의 현재 마일스톤 끝에 새 단계 추가",
-    "Add an idea to the backlog parking lot (999.x numbering)": "백로그 주차장(999.x 넘버링)에 아이디어 추가",
-    "Run all remaining phases autonomously — discuss→plan→execute per phase": "모든 남은 단계를 완전히 자율적으로 실행 (논의→계획→실행 루프)",
-    "Archive accumulated phase directories from completed milestones": "완료된 마일스톤에서 쌓인 단계 디렉토리 보관 및 정리"
-}
-
-modified_count = 0
-
-for filepath in glob.glob(os.path.join(skill_dir, "gsd-*", "SKILL.md")):
-    try:
-        with open(filepath, 'r', encoding='utf-8') as f:
-            content = f.read()
-
-        new_content = content
-        for eng, kor in translations.items():
-            pattern = re.compile(r"^description:\s*" + re.escape(eng) + r"\s*$", re.MULTILINE)
-            new_content = pattern.sub(f"description: {kor}", new_content)
-
-        if new_content != content:
-            with open(filepath, 'w', encoding='utf-8') as f:
-                f.write(new_content)
-            modified_count += 1
-    except Exception as e:
-        print(f"Error processing {filepath}: {e}")
-
-print(f"Successfully translated {modified_count} SKILL.md files.")
--- a/.agent/.agent/skills/ui-ux-pro-max/scripts/core.py
+++ b/.agent/.agent/skills/ui-ux-pro-max/scripts/core.py
@@ -1,253 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-UI/UX Pro Max Core - BM25 search engine for UI/UX style guides
-"""
-
-import csv
-import re
-from pathlib import Path
-from math import log
-from collections import defaultdict
-
-# ============ CONFIGURATION ============
-DATA_DIR = Path(__file__).parent.parent / "data"
-MAX_RESULTS = 3
-
-CSV_CONFIG = {
-    "style": {
-        "file": "styles.csv",
-        "search_cols": ["Style Category", "Keywords", "Best For", "Type", "AI Prompt Keywords"],
-        "output_cols": ["Style Category", "Type", "Keywords", "Primary Colors", "Effects & Animation", "Best For", "Performance", "Accessibility", "Framework Compatibility", "Complexity", "AI Prompt Keywords", "CSS/Technical Keywords", "Implementation Checklist", "Design System Variables"]
-    },
-    "color": {
-        "file": "colors.csv",
-        "search_cols": ["Product Type", "Notes"],
-        "output_cols": ["Product Type", "Primary (Hex)", "Secondary (Hex)", "CTA (Hex)", "Background (Hex)", "Text (Hex)", "Notes"]
-    },
-    "chart": {
-        "file": "charts.csv",
-        "search_cols": ["Data Type", "Keywords", "Best Chart Type", "Accessibility Notes"],
-        "output_cols": ["Data Type", "Keywords", "Best Chart Type", "Secondary Options", "Color Guidance", "Accessibility Notes", "Library Recommendation", "Interactive Level"]
-    },
-    "landing": {
-        "file": "landing.csv",
-        "search_cols": ["Pattern Name", "Keywords", "Conversion Optimization", "Section Order"],
-        "output_cols": ["Pattern Name", "Keywords", "Section Order", "Primary CTA Placement", "Color Strategy", "Conversion Optimization"]
-    },
-    "product": {
-        "file": "products.csv",
-        "search_cols": ["Product Type", "Keywords", "Primary Style Recommendation", "Key Considerations"],
-        "output_cols": ["Product Type", "Keywords", "Primary Style Recommendation", "Secondary Styles", "Landing Page Pattern", "Dashboard Style (if applicable)", "Color Palette Focus"]
-    },
-    "ux": {
-        "file": "ux-guidelines.csv",
-        "search_cols": ["Category", "Issue", "Description", "Platform"],
-        "output_cols": ["Category", "Issue", "Platform", "Description", "Do", "Don't", "Code Example Good", "Code Example Bad", "Severity"]
-    },
-    "typography": {
-        "file": "typography.csv",
-        "search_cols": ["Font Pairing Name", "Category", "Mood/Style Keywords", "Best For", "Heading Font", "Body Font"],
-        "output_cols": ["Font Pairing Name", "Category", "Heading Font", "Body Font", "Mood/Style Keywords", "Best For", "Google Fonts URL", "CSS Import", "Tailwind Config", "Notes"]
-    },
-    "icons": {
-        "file": "icons.csv",
-        "search_cols": ["Category", "Icon Name", "Keywords", "Best For"],
-        "output_cols": ["Category", "Icon Name", "Keywords", "Library", "Import Code", "Usage", "Best For", "Style"]
-    },
-    "react": {
-        "file": "react-performance.csv",
-        "search_cols": ["Category", "Issue", "Keywords", "Description"],
-        "output_cols": ["Category", "Issue", "Platform", "Description", "Do", "Don't", "Code Example Good", "Code Example Bad", "Severity"]
-    },
-    "web": {
-        "file": "web-interface.csv",
-        "search_cols": ["Category", "Issue", "Keywords", "Description"],
-        "output_cols": ["Category", "Issue", "Platform", "Description", "Do", "Don't", "Code Example Good", "Code Example Bad", "Severity"]
-    }
-}
-
-STACK_CONFIG = {
-    "html-tailwind": {"file": "stacks/html-tailwind.csv"},
-    "react": {"file": "stacks/react.csv"},
-    "nextjs": {"file": "stacks/nextjs.csv"},
-    "astro": {"file": "stacks/astro.csv"},
-    "vue": {"file": "stacks/vue.csv"},
-    "nuxtjs": {"file": "stacks/nuxtjs.csv"},
-    "nuxt-ui": {"file": "stacks/nuxt-ui.csv"},
-    "svelte": {"file": "stacks/svelte.csv"},
-    "swiftui": {"file": "stacks/swiftui.csv"},
-    "react-native": {"file": "stacks/react-native.csv"},
-    "flutter": {"file": "stacks/flutter.csv"},
-    "shadcn": {"file": "stacks/shadcn.csv"},
-    "jetpack-compose": {"file": "stacks/jetpack-compose.csv"}
-}
-
-# Common columns for all stacks
-_STACK_COLS = {
-    "search_cols": ["Category", "Guideline", "Description", "Do", "Don't"],
-    "output_cols": ["Category", "Guideline", "Description", "Do", "Don't", "Code Good", "Code Bad", "Severity", "Docs URL"]
-}
-
-AVAILABLE_STACKS = list(STACK_CONFIG.keys())
-
-
-# ============ BM25 IMPLEMENTATION ============
-class BM25:
-    """BM25 ranking algorithm for text search"""
-
-    def __init__(self, k1=1.5, b=0.75):
-        self.k1 = k1
-        self.b = b
-        self.corpus = []
-        self.doc_lengths = []
-        self.avgdl = 0
-        self.idf = {}
-        self.doc_freqs = defaultdict(int)
-        self.N = 0
-
-    def tokenize(self, text):
-        """Lowercase, split, remove punctuation, filter short words"""
-        text = re.sub(r'[^\w\s]', ' ', str(text).lower())
-        return [w for w in text.split() if len(w) > 2]
-
-    def fit(self, documents):
-        """Build BM25 index from documents"""
-        self.corpus = [self.tokenize(doc) for doc in documents]
-        self.N = len(self.corpus)
-        if self.N == 0:
-            return
-        self.doc_lengths = [len(doc) for doc in self.corpus]
-        self.avgdl = sum(self.doc_lengths) / self.N
-
-        for doc in self.corpus:
-            seen = set()
-            for word in doc:
-                if word not in seen:
-                    self.doc_freqs[word] += 1
-                    seen.add(word)
-
-        for word, freq in self.doc_freqs.items():
-            self.idf[word] = log((self.N - freq + 0.5) / (freq + 0.5) + 1)
-
-    def score(self, query):
-        """Score all documents against query"""
-        query_tokens = self.tokenize(query)
-        scores = []
-
-        for idx, doc in enumerate(self.corpus):
-            score = 0
-            doc_len = self.doc_lengths[idx]
-            term_freqs = defaultdict(int)
-            for word in doc:
-                term_freqs[word] += 1
-
-            for token in query_tokens:
-                if token in self.idf:
-                    tf = term_freqs[token]
-                    idf = self.idf[token]
-                    numerator = tf * (self.k1 + 1)
-                    denominator = tf + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
-                    score += idf * numerator / denominator
-
-            scores.append((idx, score))
-
-        return sorted(scores, key=lambda x: x[1], reverse=True)
-
-
-# ============ SEARCH FUNCTIONS ============
-def _load_csv(filepath):
-    """Load CSV and return list of dicts"""
-    with open(filepath, 'r', encoding='utf-8') as f:
-        return list(csv.DictReader(f))
-
-
-def _search_csv(filepath, search_cols, output_cols, query, max_results):
-    """Core search function using BM25"""
-    if not filepath.exists():
-        return []
-
-    data = _load_csv(filepath)
-
-    # Build documents from search columns
-    documents = [" ".join(str(row.get(col, "")) for col in search_cols) for row in data]
-
-    # BM25 search
-    bm25 = BM25()
-    bm25.fit(documents)
-    ranked = bm25.score(query)
-
-    # Get top results with score > 0
-    results = []
-    for idx, score in ranked[:max_results]:
-        if score > 0:
-            row = data[idx]
-            results.append({col: row.get(col, "") for col in output_cols if col in row})
-
-    return results
-
-
-def detect_domain(query):
-    """Auto-detect the most relevant domain from query"""
-    query_lower = query.lower()
-
-    domain_keywords = {
-        "color": ["color", "palette", "hex", "#", "rgb"],
-        "chart": ["chart", "graph", "visualization", "trend", "bar", "pie", "scatter", "heatmap", "funnel"],
-        "landing": ["landing", "page", "cta", "conversion", "hero", "testimonial", "pricing", "section"],
-        "product": ["saas", "ecommerce", "e-commerce", "fintech", "healthcare", "gaming", "portfolio", "crypto", "dashboard"],
-        "style": ["style", "design", "ui", "minimalism", "glassmorphism", "neumorphism", "brutalism", "dark mode", "flat", "aurora", "prompt", "css", "implementation", "variable", "checklist", "tailwind"],
-        "ux": ["ux", "usability", "accessibility", "wcag", "touch", "scroll", "animation", "keyboard", "navigation", "mobile"],
-        "typography": ["font", "typography", "heading", "serif", "sans"],
-        "icons": ["icon", "icons", "lucide", "heroicons", "symbol", "glyph", "pictogram", "svg icon"],
-        "react": ["react", "next.js", "nextjs", "suspense", "memo", "usecallback", "useeffect", "rerender", "bundle", "waterfall", "barrel", "dynamic import", "rsc", "server component"],
-        "web": ["aria", "focus", "outline", "semantic", "virtualize", "autocomplete", "form", "input type", "preconnect"]
-    }
-
-    scores = {domain: sum(1 for kw in keywords if kw in query_lower) for domain, keywords in domain_keywords.items()}
-    best = max(scores, key=scores.get)
-    return best if scores[best] > 0 else "style"
-
-
-def search(query, domain=None, max_results=MAX_RESULTS):
-    """Main search function with auto-domain detection"""
-    if domain is None:
-        domain = detect_domain(query)
-
-    config = CSV_CONFIG.get(domain, CSV_CONFIG["style"])
-    filepath = DATA_DIR / config["file"]
-
-    if not filepath.exists():
-        return {"error": f"File not found: {filepath}", "domain": domain}
-
-    results = _search_csv(filepath, config["search_cols"], config["output_cols"], query, max_results)
-
-    return {
-        "domain": domain,
-        "query": query,
-        "file": config["file"],
-        "count": len(results),
-        "results": results
-    }
-
-
-def search_stack(query, stack, max_results=MAX_RESULTS):
-    """Search stack-specific guidelines"""
-    if stack not in STACK_CONFIG:
-        return {"error": f"Unknown stack: {stack}. Available: {', '.join(AVAILABLE_STACKS)}"}
-
-    filepath = DATA_DIR / STACK_CONFIG[stack]["file"]
-
-    if not filepath.exists():
-        return {"error": f"Stack file not found: {filepath}", "stack": stack}
-
-    results = _search_csv(filepath, _STACK_COLS["search_cols"], _STACK_COLS["output_cols"], query, max_results)
-
-    return {
-        "domain": "stack",
-        "stack": stack,
-        "query": query,
-        "file": STACK_CONFIG[stack]["file"],
-        "count": len(results),
-        "results": results
-    }
--- a/.agent/.agent/skills/ui-ux-pro-max/scripts/design_system.py
+++ b/.agent/.agent/skills/ui-ux-pro-max/scripts/design_system.py
--- a/.agent/.agent/skills/ui-ux-pro-max/scripts/search.py
+++ b/.agent/.agent/skills/ui-ux-pro-max/scripts/search.py
@@ -1,114 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-UI/UX Pro Max Search - BM25 search engine for UI/UX style guides
-Usage: python search.py "<query>" [--domain <domain>] [--stack <stack>] [--max-results 3]
-       python search.py "<query>" --design-system [-p "Project Name"]
-       python search.py "<query>" --design-system --persist [-p "Project Name"] [--page "dashboard"]
-
-Domains: style, prompt, color, chart, landing, product, ux, typography
-Stacks: html-tailwind, react, nextjs
-
-Persistence (Master + Overrides pattern):
-  --persist    Save design system to design-system/MASTER.md
-  --page       Also create a page-specific override file in design-system/pages/
-"""
-
-import argparse
-import sys
-import io
-from core import CSV_CONFIG, AVAILABLE_STACKS, MAX_RESULTS, search, search_stack
-from design_system import generate_design_system, persist_design_system
-
-# Force UTF-8 for stdout/stderr to handle emojis on Windows (cp1252 default)
-if sys.stdout.encoding and sys.stdout.encoding.lower() != 'utf-8':
-    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
-if sys.stderr.encoding and sys.stderr.encoding.lower() != 'utf-8':
-    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
-
-
-def format_output(result):
-    """Format results for Claude consumption (token-optimized)"""
-    if "error" in result:
-        return f"Error: {result['error']}"
-
-    output = []
-    if result.get("stack"):
-        output.append(f"## UI Pro Max Stack Guidelines")
-        output.append(f"**Stack:** {result['stack']} | **Query:** {result['query']}")
-    else:
-        output.append(f"## UI Pro Max Search Results")
-        output.append(f"**Domain:** {result['domain']} | **Query:** {result['query']}")
-    output.append(f"**Source:** {result['file']} | **Found:** {result['count']} results\n")
-
-    for i, row in enumerate(result['results'], 1):
-        output.append(f"### Result {i}")
-        for key, value in row.items():
-            value_str = str(value)
-            if len(value_str) > 300:
-                value_str = value_str[:300] + "..."
-            output.append(f"- **{key}:** {value_str}")
-        output.append("")
-
-    return "\n".join(output)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="UI Pro Max Search")
-    parser.add_argument("query", help="Search query")
-    parser.add_argument("--domain", "-d", choices=list(CSV_CONFIG.keys()), help="Search domain")
-    parser.add_argument("--stack", "-s", choices=AVAILABLE_STACKS, help="Stack-specific search (html-tailwind, react, nextjs)")
-    parser.add_argument("--max-results", "-n", type=int, default=MAX_RESULTS, help="Max results (default: 3)")
-    parser.add_argument("--json", action="store_true", help="Output as JSON")
-    # Design system generation
-    parser.add_argument("--design-system", "-ds", action="store_true", help="Generate complete design system recommendation")
-    parser.add_argument("--project-name", "-p", type=str, default=None, help="Project name for design system output")
-    parser.add_argument("--format", "-f", choices=["ascii", "markdown"], default="ascii", help="Output format for design system")
-    # Persistence (Master + Overrides pattern)
-    parser.add_argument("--persist", action="store_true", help="Save design system to design-system/MASTER.md (creates hierarchical structure)")
-    parser.add_argument("--page", type=str, default=None, help="Create page-specific override file in design-system/pages/")
-    parser.add_argument("--output-dir", "-o", type=str, default=None, help="Output directory for persisted files (default: current directory)")
-
-    args = parser.parse_args()
-
-    # Design system takes priority
-    if args.design_system:
-        result = generate_design_system(
-            args.query, 
-            args.project_name, 
-            args.format,
-            persist=args.persist,
-            page=args.page,
-            output_dir=args.output_dir
-        )
-        print(result)
-        
-        # Print persistence confirmation
-        if args.persist:
-            project_slug = args.project_name.lower().replace(' ', '-') if args.project_name else "default"
-            print("\n" + "=" * 60)
-            print(f"✅ Design system persisted to design-system/{project_slug}/")
-            print(f"   📄 design-system/{project_slug}/MASTER.md (Global Source of Truth)")
-            if args.page:
-                page_filename = args.page.lower().replace(' ', '-')
-                print(f"   📄 design-system/{project_slug}/pages/{page_filename}.md (Page Overrides)")
-            print("")
-            print(f"📖 Usage: When building a page, check design-system/{project_slug}/pages/[page].md first.")
-            print(f"   If exists, its rules override MASTER.md. Otherwise, use MASTER.md.")
-            print("=" * 60)
-    # Stack search
-    elif args.stack:
-        result = search_stack(args.query, args.stack, args.max_results)
-        if args.json:
-            import json
-            print(json.dumps(result, indent=2, ensure_ascii=False))
-        else:
-            print(format_output(result))
-    # Domain search
-    else:
-        result = search(args.query, args.domain, args.max_results)
-        if args.json:
-            import json
-            print(json.dumps(result, indent=2, ensure_ascii=False))
-        else:
-            print(format_output(result))
--- a/.agent/scripts/sync_vikunja.js
+++ b/.agent/scripts/sync_vikunja.js
@@ -4,21 +4,29 @@ const path = require('path');
 // 1. Get arguments
 const args = process.argv.slice(2);
 if (args.length < 2) {
-    console.error("Usage: node sync_vikunja.js <task_id> <message_or_commit>");
+    console.error("Usage:");
+    console.error("  node sync_vikunja.js <task_id> <message>          # Update existing task");
+    console.error("  node sync_vikunja.js create \"<title>\" \"<message>\" # Create new task");
    process.exit(1);
 }

-const taskId = args[0];
+const commandOrId = args[0];
 const message = args[1];

 // 2. Load configuration from .env.agent
-const envPath = path.join(__dirname, '../config/.env.agent');
-if (!fs.existsSync(envPath)) {
-    console.error("Error: .agent/config/.env.agent file not found. Please create it from the template.");
+const envPath = path.join(__dirname, '../../.env.agent');
+const fallbackEnvPath = path.join(__dirname, '../config/.env.agent');
+
+let envContent = '';
+if (fs.existsSync(envPath)) {
+    envContent = fs.readFileSync(envPath, 'utf8');
+} else if (fs.existsSync(fallbackEnvPath)) {
+    envContent = fs.readFileSync(fallbackEnvPath, 'utf8');
+} else {
+    console.error("Error: .env.agent file not found.");
    process.exit(1);
 }

-const envContent = fs.readFileSync(envPath, 'utf8');
 const env = {};
 envContent.split('\n').forEach(line => {
    const match = line.match(/^([^#=]+)="?(.*?)"?$/);
@@ -29,6 +37,7 @@ envContent.split('\n').forEach(line => {

 const apiUrl = env.VIKUNJA_API_URL;
 const apiToken = env.VIKUNJA_API_TOKEN;
+const projectId = env.VIKUNJA_PROJECT_ID || 14;

 if (!apiUrl || !apiToken || apiUrl.includes('[YOUR_')) {
    console.error("Error: VIKUNJA_API_URL or VIKUNJA_API_TOKEN is not configured correctly in .env.agent.");
@@ -40,52 +49,59 @@ if (env.AGENT_OPERATING_MODE === "TEST") {
    process.exit(0);
 }

-// 3. Helper to make API calls using native fetch (Node 18+)
-async function markTaskDoneAndComment(taskId, message) {
+// Request options shared by every Vikunja call: bearer-token authorization
+// plus a JSON content type. Each call spreads this object next to its own
+// `method` and `body`.
+const FETCH_OPTS = {
+    headers: { 'Authorization': `Bearer ${apiToken}`, 'Content-Type': 'application/json' }
+};
+
+/**
+ * Create a new task in the configured Vikunja project, already marked as done.
+ *
+ * Despite the name, no separate comment is posted: `message` is stored as the
+ * task description. Logs the new task id on success; on any failure logs the
+ * error message and exits with code 1.
+ *
+ * @param {string} title - Title of the task to create.
+ * @param {string} message - Text stored in the task's description field.
+ * @returns {Promise<void>}
+ */
+async function createTaskAndComment(title, message) {
     try {
-        console.log(`Connecting to Vikunja API for Task ${taskId}...`);
-
-        // Update task status to done
-        const patchRes = await fetch(`${apiUrl}/tasks/${taskId}`, {
-            method: 'POST', // Vikunja uses POST to task endpoint for updates
-            headers: {
-                'Authorization': `Bearer ${apiToken}`,
-                'Content-Type': 'application/json'
-            },
-            body: JSON.stringify({ done: true })
-        });
-
-        if (!patchRes.ok) {
-            throw new Error(`Failed to mark task as done: ${patchRes.statusText}`);
-        }
-
-        console.log(`✅ Task ${taskId} successfully marked as Done.`);
-
-        // Add comment
-        const commentRes = await fetch(`${apiUrl}/tasks/${taskId}/comments`, {
+        console.log(`Creating new task in Project ${projectId}...`);
+        const createRes = await fetch(`${apiUrl}/projects/${projectId}/tasks`, {
             method: 'PUT',
-            headers: {
-                'Authorization': `Bearer ${apiToken}`,
-                'Content-Type': 'application/json'
-            },
-            body: JSON.stringify({
-                text: `[Agent Automator] Phase completed.\nReason/Hash: ${message}`
+            ...FETCH_OPTS,
+            body: JSON.stringify({ 
+                title: title, 
+                description: message,
+                done: true
             })
         });

-        if (!commentRes.ok) {
-            console.error(`Warning: Task marked as done, but failed to attach comment: ${commentRes.statusText}`);
-        } else {
-            console.log("✅ Comment attached successfully.");
-        }
-
-    } catch (error) {
-        console.error("❌ Failed to sync with Vikunja:");
-        // Mask the token if it somehow leaks via error message
-        const secureErr = error.message.replace(new RegExp(apiToken, 'g'), "********");
-        console.error(secureErr);
+        if (!createRes.ok) throw new Error(`Create failed: ${createRes.statusText}`);
+        const task = await createRes.json();
+        console.log(`✅ Task created and marked Done! ID: #${task.id}`);
+    } catch (e) {
+        console.error("❌ Failed:", e.message);
         process.exit(1);
     }
 }

-markTaskDoneAndComment(taskId, message);
+/**
+ * Mark an existing Vikunja task as done, then attach a completion comment.
+ *
+ * A failed task update is fatal (exit code 1). An HTTP error response from
+ * the comment endpoint only produces a warning, because the task has already
+ * been closed; a thrown error (e.g. a network failure) still exits with 1.
+ *
+ * @param {string} taskId - Id of the task to close, as given on the command line.
+ * @param {string} message - Text placed after "Reason/Hash:" in the comment.
+ * @returns {Promise<void>}
+ */
+async function markTaskDoneAndComment(taskId, message) {
+    try {
+        console.log(`Updating Task ${taskId}...`);
+        const patchRes = await fetch(`${apiUrl}/tasks/${taskId}`, {
+            method: 'POST',
+            ...FETCH_OPTS,
+            body: JSON.stringify({ done: true })
+        });
+
+        if (!patchRes.ok) throw new Error(`Update failed: ${patchRes.statusText}`);
+        console.log(`✅ Task ${taskId} marked as Done.`);
+
+        const commentRes = await fetch(`${apiUrl}/tasks/${taskId}/comments`, {
+            method: 'PUT',
+            ...FETCH_OPTS,
+            body: JSON.stringify({ text: `[Agent Automator] Phase completed.\nReason/Hash: ${message}` })
+        });
+
+        // fetch() resolves even on 4xx/5xx responses, so success has to be
+        // checked explicitly before reporting that the comment was attached.
+        if (!commentRes.ok) {
+            console.error(`Warning: Task marked as done, but failed to attach comment: ${commentRes.statusText}`);
+        } else {
+            console.log("✅ Comment attached.");
+        }
+    } catch (e) {
+        console.error("❌ Failed:", e.message);
+        process.exit(1);
+    }
+}
+
+// Entry point: a literal first argument of "create" opens a new task, using the
+// second argument as its title and the optional third as its description;
+// any other first argument is treated as the id of an existing task to close.
+if (commandOrId !== "create") {
+    markTaskDoneAndComment(commandOrId, message);
+} else {
+    const description = args[2] || "Task fully completed.";
+    createTaskAndComment(message, description);
+}
--- a/scripts/analysis_raw.txt
+++ b/scripts/analysis_raw.txt
@@ -0,0 +1,58 @@
+0|Gemma4-26B MXFP4_MOE|ngl=999 pure-GPU|63.21|63.78|G0:11770|G1:10411|t=6|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
+1|Gemma4-26B MXFP4_MOE|compare: cpu-moe|12.92|14.21|G0:3096|G1:3497|t=6|ub=512 b=2048|kv=q4_0/q4_0|cpu-moe
+2|Gemma4-26B MXFP4_MOE|t=2|64.1|64.27|G0:11728|G1:10411|t=2|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
+3|Gemma4-26B MXFP4_MOE|t=4|64|64.39|G0:11728|G1:10411|t=4|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
+4|Gemma4-26B MXFP4_MOE|t=8|63.75|63.9|G0:11728|G1:10411|t=8|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
+5|Gemma4-26B MXFP4_MOE|t=10|64.01|64.14|G0:11728|G1:10411|t=10|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
+6|Gemma4-26B MXFP4_MOE|t=12|63.86|63.98|G0:11728|G1:10411|t=12|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
+7|Gemma4-26B MXFP4_MOE|ub=256 b=1024|63.8|64.12|G0:10504|G1:9619|t=2|ub=256 b=1024|kv=q4_0/q4_0|pure-GPU
+8|Gemma4-26B MXFP4_MOE|ub=256 b=2048|63.88|64.04|G0:10504|G1:9619|t=2|ub=256 b=2048|kv=q4_0/q4_0|pure-GPU
+9|Gemma4-26B MXFP4_MOE|ub=512 b=4096|63.91|64.18|G0:11728|G1:10411|t=2|ub=512 b=4096|kv=q4_0/q4_0|pure-GPU
+10|Gemma4-26B MXFP4_MOE|ub=1024 b=2048|63.86|64.1|G0:10956|G1:9907|t=2|ub=1024 b=2048|kv=q4_0/q4_0|pure-GPU
+11|Gemma4-26B MXFP4_MOE|ub=1024 b=4096|63.85|64.06|G0:10956|G1:9907|t=2|ub=1024 b=4096|kv=q4_0/q4_0|pure-GPU
+12|Gemma4-26B MXFP4_MOE|kv=q8_0/q8_0|64.14|64.39|G0:10670|G1:10169|t=2|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU
+13|Gemma4-26B MXFP4_MOE|kv=q4_0/q8_0|37.52|37.86|G0:10394|G1:9753|t=2|ub=512 b=2048|kv=q4_0/q8_0|pure-GPU
+14|Gemma4-26B MXFP4_MOE|kv=f16/f16|63.48|64.31|G0:11700|G1:11667|t=2|ub=512 b=2048|kv=f16/f16|pure-GPU
+15|Gemma4-26B MXFP4_MOE|FINAL|64.05|64.29|G0:10667|G1:10169|t=2|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU
+16|Gemma4-26B Q4_K_M|ngl=999 pure-GPU|76.01|76.31|G0:11784|G1:10454|t=6|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
+17|Gemma4-26B Q4_K_M|compare: cpu-moe|10.19|10.49|G0:2652|G1:2982|t=6|ub=512 b=2048|kv=q4_0/q4_0|cpu-moe
+18|Gemma4-26B Q4_K_M|t=2|75.67|75.87|G0:11783|G1:10454|t=2|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
+19|Gemma4-26B Q4_K_M|t=4|75.61|75.87|G0:11783|G1:10454|t=4|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
+20|Gemma4-26B Q4_K_M|t=8|75.42|75.59|G0:11783|G1:10454|t=8|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
+21|Gemma4-26B Q4_K_M|t=10|75.71|75.82|G0:11783|G1:10454|t=10|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
+22|Gemma4-26B Q4_K_M|t=12|75.08|75.7|G0:11783|G1:10454|t=12|ub=512 b=2048|kv=q4_0/q4_0|pure-GPU
+23|Gemma4-26B Q4_K_M|ub=256 b=1024|75.16|75.64|G0:10559|G1:9662|t=6|ub=256 b=1024|kv=q4_0/q4_0|pure-GPU
+24|Gemma4-26B Q4_K_M|ub=256 b=2048|75.68|76.05|G0:10559|G1:9662|t=6|ub=256 b=2048|kv=q4_0/q4_0|pure-GPU
+25|Gemma4-26B Q4_K_M|ub=512 b=4096|75.92|76.16|G0:11784|G1:10454|t=6|ub=512 b=4096|kv=q4_0/q4_0|pure-GPU
+26|Gemma4-26B Q4_K_M|ub=1024 b=2048|75.7|75.9|G0:11012|G1:9950|t=6|ub=1024 b=2048|kv=q4_0/q4_0|pure-GPU
+27|Gemma4-26B Q4_K_M|ub=1024 b=4096|75.77|75.99|G0:11011|G1:9950|t=6|ub=1024 b=4096|kv=q4_0/q4_0|pure-GPU
+28|Gemma4-26B Q4_K_M|kv=q8_0/q8_0|76.3|76.69|G0:10725|G1:10212|t=6|ub=512 b=2048|kv=q8_0/q8_0|pure-GPU
+29|Gemma4-26B Q4_K_M|kv=q4_0/q8_0|42.88|44.58|G0:10439|G1:9796|t=6|ub=512 b=2048|kv=q4_0/q8_0|pure-GPU
+30|Gemma4-26B Q4_K_M|kv=f16/f16|76.36|76.78|G0:11761|G1:11710|t=6|ub=512 b=2048|kv=f16/f16|pure-GPU
+31|Gemma4-26B Q4_K_M|FINAL|76.4|76.75|G0:11761|G1:11710|t=6|ub=512 b=2048|kv=f16/f16|pure-GPU
+32|Qwen3.5-35B MXFP4_MOE|n-cpu-moe=5|51.43|52.07|G0:10365|G1:11152|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+33|Qwen3.5-35B MXFP4_MOE|t=2|43.8|46.4|G0:10365|G1:11152|t=2|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+34|Qwen3.5-35B MXFP4_MOE|t=4|49.21|52.78|G0:10353|G1:11152|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+35|Qwen3.5-35B MXFP4_MOE|t=8|46.43|50.49|G0:10397|G1:11152|t=8|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+36|Qwen3.5-35B MXFP4_MOE|t=10|46.12|50.06|G0:10351|G1:11152|t=10|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+37|Qwen3.5-35B MXFP4_MOE|t=12|45.23|47.1|G0:10337|G1:11152|t=12|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+38|Qwen3.5-35B MXFP4_MOE|ub=256 b=1024|48.9|52.3|G0:9834|G1:10906|t=6|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5
+39|Qwen3.5-35B MXFP4_MOE|ub=256 b=2048|49.62|52.52|G0:9833|G1:10906|t=6|ub=256 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+40|Qwen3.5-35B MXFP4_MOE|ub=512 b=4096|48.78|52.14|G0:10337|G1:11152|t=6|ub=512 b=4096|kv=q4_0/q4_0|n-cpu-moe=5
+41|Qwen3.5-35B MXFP4_MOE|ub=1024 b=2048|49.95|52.53|G0:11124|G1:11644|t=6|ub=1024 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+42|Qwen3.5-35B MXFP4_MOE|ub=1024 b=4096|48.75|52.06|G0:11123|G1:11644|t=6|ub=1024 b=4096|kv=q4_0/q4_0|n-cpu-moe=5
+43|Qwen3.5-35B MXFP4_MOE|kv=q4_0/q8_0|42.81|44.14|G0:10681|G1:11472|t=6|ub=512 b=2048|kv=q4_0/q8_0|n-cpu-moe=5
+44|Qwen3.5-35B MXFP4_MOE|FINAL|46.66|47.09|G0:10476|G1:11152|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+45|Qwen3.5-35B Q4_K_M|n-cpu-moe=5|49.01|53.09|G0:10606|G1:11338|t=6|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+46|Qwen3.5-35B Q4_K_M|t=2|45.73|47.87|G0:10599|G1:11338|t=2|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+47|Qwen3.5-35B Q4_K_M|t=4|50.98|54.33|G0:10601|G1:11338|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+48|Qwen3.5-35B Q4_K_M|t=8|48.45|52.1|G0:10596|G1:11338|t=8|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+49|Qwen3.5-35B Q4_K_M|t=10|47.83|51.45|G0:10595|G1:11338|t=10|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+50|Qwen3.5-35B Q4_K_M|t=12|43.77|46.79|G0:10589|G1:11338|t=12|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+51|Qwen3.5-35B Q4_K_M|ub=256 b=1024|52.14|53.82|G0:10089|G1:11092|t=4|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5
+52|Qwen3.5-35B Q4_K_M|ub=256 b=2048|50.23|53.66|G0:10091|G1:11092|t=4|ub=256 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+53|Qwen3.5-35B Q4_K_M|ub=512 b=2048|49.89|53.89|G0:10595|G1:11338|t=4|ub=512 b=2048|kv=q4_0/q4_0|n-cpu-moe=5
+54|Qwen3.5-35B Q4_K_M|ub=512 b=4096|50.4|54.19|G0:10564|G1:11338|t=4|ub=512 b=4096|kv=q4_0/q4_0|n-cpu-moe=5
+55|Qwen3.5-35B Q4_K_M|kv=q8_0/q8_0|51.84|53.53|G0:10726|G1:11732|t=4|ub=256 b=1024|kv=q8_0/q8_0|n-cpu-moe=5
+56|Qwen3.5-35B Q4_K_M|kv=q4_0/q8_0|43.22|45.99|G0:10410|G1:11412|t=4|ub=256 b=1024|kv=q4_0/q8_0|n-cpu-moe=5
+57|Qwen3.5-35B Q4_K_M|FINAL|52.05|54.48|G0:10062|G1:11092|t=4|ub=256 b=1024|kv=q4_0/q4_0|n-cpu-moe=5
--- a/scripts/auto_tune_122b.py
+++ b/scripts/auto_tune_122b.py
@@ -0,0 +1,372 @@
+"""
+Qwen3.5 122B-A10B 자동 정밀 튜닝 스크립트
+===========================================
+각 설정 조합으로 서버를 재시작하고 벤치마크를 자동 수행합니다.
+서버 로그에서 순수 eval time (t/s)를 파싱하여 정확한 비교 테이블을 출력합니다.
+
+예상 소요 시간: 약 30-40분 (5개 설정 × ~6-7분/설정)
+"""
+import subprocess
+import time
+import json
+import urllib.request
+import os
+import re
+import sys
+import datetime
+
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass
+
+BASE_URL = "http://127.0.0.1:8000"  # same port as the "--port" value in COMMON_ARGS
+MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
+SERVER_EXE = r"llama_bin_run\llama-server.exe"
+
+# ============================================================
+# List of configurations to test
+# ============================================================
+# Common parameters (kept the same for every configuration)
+COMMON_ARGS = [
+    "--model", MODEL_PATH,
+    "-ngl", "999",
+    "--cpu-moe",
+    "-c", "2048",
+    "-np", "1",
+    "-fa", "on",
+    "--cache-type-k", "q4_0",
+    "--cache-type-v", "q4_0",
+    "-ub", "256",
+    "-b", "1024",
+    "--mlock",
+    "--port", "8000",
+    "--host", "0.0.0.0",
+    "--no-warmup",  # warm-up is performed directly by this benchmark script
+]
+
+# Variable parameter combinations ("extra" is appended to COMMON_ARGS by start_server)
+CONFIGS = [
+    {
+        "name": "A) --no-mmap -t 8",
+        "desc": "서버 권장: mmap 비활성화 (baseline 대비)",
+        "extra": ["--no-mmap", "-t", "8", "--prio", "2"],
+    },
+    {
+        "name": "B) --no-mmap -t 6",
+        "desc": "스레드 감소 (캐시 경합 회피)",
+        "extra": ["--no-mmap", "-t", "6", "--prio", "2"],
+    },
+    {
+        "name": "C) --no-mmap -t 10",
+        "desc": "스레드 증가 (RAM 대역폭 포화)",
+        "extra": ["--no-mmap", "-t", "10", "--prio", "2"],
+    },
+    {
+        "name": "D) --no-mmap -t 12",
+        "desc": "더 많은 스레드",
+        "extra": ["--no-mmap", "-t", "12", "--prio", "2"],
+    },
+    {
+        "name": "E) --no-mmap -t 10 --prio 3 --poll 100",
+        "desc": "최적 스레드 + 리얼타임 우선순위 + 폴링",
+        "extra": ["--no-mmap", "-t", "10", "--prio", "3", "--poll", "100"],
+    },
+]
+
+# ============================================================
+# 유틸리티 함수
+# ============================================================
+
+def kill_server():
+    """Force-terminate every llama-server.exe process, then pause for 3 seconds."""
+    os.system("taskkill /F /IM llama-server.exe >nul 2>&1")
+    time.sleep(3)
+
+def start_server(config, log_path):
+    """Start llama-server with COMMON_ARGS + config["extra"], redirecting its output to log_path; returns (proc, log_file)."""
+    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]
+    log_file = open(log_path, "w", encoding="utf-8")  # returned open; the caller closes it
+    proc = subprocess.Popen(
+        cmd,
+        stdout=log_file,
+        stderr=subprocess.STDOUT,  # merge stderr into the same log file
+        cwd=os.getcwd()
+    )
+    return proc, log_file
+
+def wait_for_server(timeout=600):
+    """Poll GET /health every 5 s until it reports status "ok"; returns False after `timeout` seconds."""
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            req = urllib.request.Request(f"{BASE_URL}/health")
+            with urllib.request.urlopen(req, timeout=5) as resp:
+                data = json.loads(resp.read())
+                if data.get("status") == "ok":
+                    return True
+        except Exception:  # server not ready / bad payload; KeyboardInterrupt still propagates
+            pass
+        time.sleep(5)
+    return False
+
+def run_single_benchmark(prompt, max_tokens=200):
+    """Run a single benchmark request; returns (completion_tokens, elapsed_seconds)."""
+    payload = json.dumps({
+        "model": "local-model",
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }).encode("utf-8")
+
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+
+    start = time.time()
+    with urllib.request.urlopen(req, timeout=600) as resp:
+        result = json.loads(resp.read())
+    elapsed = time.time() - start  # wall-clock time for the whole HTTP round trip
+
+    usage = result.get("usage", {})
+    completion_tokens = usage.get("completion_tokens", 0)  # 0 when the response carries no usage data
+    return completion_tokens, elapsed
+
+def parse_eval_times(log_path):
+    """Parse generation ("eval time") statistics from the server log.
+
+    Returns one dict per matching line with total_ms, tokens, ms_per_token and
+    tps; a missing or unreadable log yields an empty list.
+    """
+    try:
+        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
+            content = f.read()
+    except OSError:  # only I/O failures mean "no data"; KeyboardInterrupt and bugs still surface
+        return []
+    # Expected line: "eval time = X ms / N tokens (X ms per token, X tokens per second)".
+    # The leading ^\s+ keeps "prompt eval time" lines from matching.
+    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
+    return [
+        {
+            "total_ms": float(ms), "tokens": int(n),
+            "ms_per_token": float(per_tok), "tps": float(tps),
+        }
+        for ms, n, per_tok, tps in re.findall(pattern, content, re.MULTILINE)
+    ]
+
+def parse_prompt_eval_times(log_path):
+    """Parse prompt-processing ("prompt eval time") statistics from the server log.
+
+    Returns one dict per matching line with total_ms, tokens, ms_per_token and
+    tps; a missing or unreadable log yields an empty list.
+    """
+    try:
+        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
+            content = f.read()
+    except OSError:  # only I/O failures mean "no data"; KeyboardInterrupt and bugs still surface
+        return []
+    # Expected line: "prompt eval time = X ms / N tokens (X ms per token, X tokens per second)".
+    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
+    return [
+        {
+            "total_ms": float(ms), "tokens": int(n),
+            "ms_per_token": float(per_tok), "tps": float(tps),
+        }
+        for ms, n, per_tok, tps in re.findall(pattern, content, re.MULTILINE)
+    ]
+
+def parse_vram_usage(log_path):
+    """Return the "CUDA0 model buffer size" found in the server log as "<n> MiB", else "N/A"."""
+    try:
+        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
+            content = f.read()
+    except OSError:  # only I/O failures mean "no data"; KeyboardInterrupt and bugs still surface
+        return "N/A"
+    
+    match = re.search(r'CUDA0 model buffer size\s*=\s*([\d.]+)\s*MiB', content)
+    if match:
+        return f"{float(match.group(1)):.0f} MiB"  # rounded to a whole number of MiB
+    return "N/A"
+
+# ============================================================
+# 메인 튜닝 루프
+# ============================================================
+
+def main():
+    """Benchmark every entry in CONFIGS, print a comparison table against the hard-coded baseline, and save the results to a file."""
+    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+    print("=" * 70)
+    print("  Qwen3.5 122B-A10B 자동 정밀 튜닝")
+    print(f"  시작 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
+    print(f"  테스트 설정: {len(CONFIGS)}개")
+    print(f"  예상 소요: ~{len(CONFIGS) * 7}분")
+    print("=" * 70)
+    print()
+    print("  기존 Baseline: mmap on, -t 8, --prio 2 → 10.06 t/s (eval)")
+    print()
+    
+    # Collected results (one dict per configuration)
+    all_results = []
+    
+    for idx, config in enumerate(CONFIGS):
+        config_start = time.time()
+        log_path = os.path.join(os.getcwd(), f"tune_log_{idx}.txt")
+        
+        print(f"\n{'='*70}")
+        print(f"  [{idx+1}/{len(CONFIGS)}] {config['name']}")
+        print(f"  {config['desc']}")
+        print(f"  시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
+        print(f"{'='*70}")
+        
+        # 1. Stop any existing server
+        print("  [1/4] 서버 종료 중...")
+        kill_server()
+        
+        # 2. Start a new server
+        print(f"  [2/4] 서버 시작 중... (모델 로딩 ~3-5분)")
+        proc, log_file = start_server(config, log_path)
+        
+        # 3. Wait for the server to become ready
+        if not wait_for_server(timeout=600):
+            print("  ❌ 서버 시작 실패! 다음 설정으로 넘어갑니다.")
+            kill_server()
+            log_file.close()
+            all_results.append({
+                "config": config["name"],
+                "status": "FAILED",
+                "eval_tps": [],
+                "prompt_tps": [],
+                "vram": "N/A"
+            })
+            continue
+        
+        load_time = time.time() - config_start  # also includes the kill_server() pause above
+        print(f"  [3/4] 서버 준비 완료! (로딩 {load_time:.0f}초)")
+        
+        # 4. Run the benchmark (1 warm-up + 3 measured runs)
+        print("  [4/4] 벤치마크 실행 중...")
+        
+        # Warm-up
+        try:
+            run_single_benchmark("Say hello.", max_tokens=20)
+            print("    워밍업 완료")
+        except Exception as e:
+            print(f"    워밍업 실패: {e}")
+        
+        # Three measured runs
+        prompts = [
+            "Write a detailed explanation of how neural networks learn through backpropagation and gradient descent.",
+            "Explain the complete process of photosynthesis including light and dark reactions in detail.",
+            "Describe the differences between SQL and NoSQL databases with examples and performance characteristics.",
+        ]
+        
+        for i, prompt in enumerate(prompts):
+            try:
+                tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
+                approx_tps = tokens / elapsed if elapsed > 0 else 0  # client-side estimate; the table uses log values
+                print(f"    Run {i+1}/3: {tokens} tokens, {elapsed:.1f}s, ~{approx_tps:.2f} t/s (approx)")
+            except Exception as e:
+                print(f"    Run {i+1}/3: ERROR - {e}")
+        
+        # Wait briefly before shutting down so the log can be flushed
+        time.sleep(2)
+        
+        # Stop the server
+        kill_server()
+        log_file.close()
+        time.sleep(2)
+        
+        # Parse the log
+        eval_times = parse_eval_times(log_path)
+        prompt_times = parse_prompt_eval_times(log_path)
+        vram = parse_vram_usage(log_path)
+        
+        # Exclude the warm-up (the first result)
+        if len(eval_times) > 1:  # NOTE(review): assumes the warm-up request produced the first log entry
+            bench_evals = eval_times[1:]  # exclude the warm-up
+        else:
+            bench_evals = eval_times
+        
+        if len(prompt_times) > 1:
+            bench_prompts = prompt_times[1:]
+        else:
+            bench_prompts = prompt_times
+        
+        eval_speeds = [e["tps"] for e in bench_evals]
+        prompt_speeds = [p["tps"] for p in bench_prompts]
+        
+        result = {
+            "config": config["name"],
+            "status": "OK",
+            "eval_tps": eval_speeds,
+            "prompt_tps": prompt_speeds,
+            "vram": vram,
+        }
+        all_results.append(result)
+        
+        config_elapsed = time.time() - config_start
+        print(f"\n  완료! 소요: {config_elapsed:.0f}초")
+        
+        if eval_speeds:
+            avg_eval = sum(eval_speeds) / len(eval_speeds)
+            max_eval = max(eval_speeds)
+            print(f"  📊 Eval TPS: avg={avg_eval:.2f}, max={max_eval:.2f}")
+    
+    # ============================================================
+    # Final comparison table
+    # ============================================================
+    print("\n")
+    print("=" * 80)
+    print("  🏆 최종 결과 비교 테이블")
+    print("=" * 80)
+    print()
+    
+    # Add the existing baseline
+    print(f"  {'설정':<45} {'Eval t/s':>10} {'최대':>8} {'Prompt t/s':>12} {'VRAM':>12}")
+    print(f"  {'-'*45} {'-'*10} {'-'*8} {'-'*12} {'-'*12}")
+    
+    # Baseline (previous result, hard-coded)
+    print(f"  {'[기준] mmap on, -t 8, --prio 2':<45} {'10.02':>10} {'10.06':>8} {'29.52':>12} {'5392 MiB':>12}")
+    
+    best_avg = 0
+    best_config = ""
+    
+    for r in all_results:
+        if r["status"] != "OK" or not r["eval_tps"]:  # also covers runs whose log had no eval lines
+            print(f"  {r['config']:<45} {'FAILED':>10} {'':>8} {'':>12} {r['vram']:>12}")
+            continue
+        
+        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
+        max_e = max(r["eval_tps"])
+        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
+        
+        if avg_e > best_avg:
+            best_avg = avg_e
+            best_config = r["config"]
+        
+        marker = " ⭐" if avg_e > 10.06 else ""  # star threshold is the baseline's max column (10.06)
+        print(f"  {r['config']:<45} {avg_e:>10.2f} {max_e:>8.2f} {avg_p:>12.2f} {r['vram']:>12}{marker}")
+    
+    print()
+    if best_avg > 0:
+        improvement = ((best_avg - 10.02) / 10.02) * 100  # percent change vs. the baseline average (10.02)
+        print(f"  🏆 최고 성능: {best_config}")
+        print(f"     → {best_avg:.2f} t/s (기준 10.02 t/s 대비 {improvement:+.1f}%)")
+    
+    print()
+    print(f"  완료 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
+    print("=" * 80)
+    
+    # Also save the results to a file
+    result_path = os.path.join(os.getcwd(), f"tune_results_{timestamp}.txt")
+    with open(result_path, "w", encoding="utf-8") as f:
+        f.write("Qwen3.5 122B-A10B Fine Tuning Results\n")
+        f.write(f"Date: {timestamp}\n\n")
+        for r in all_results:
+            f.write(f"{r['config']}: {r['eval_tps']} (VRAM: {r['vram']})\n")
+    print(f"  결과 저장: {result_path}")
+
+if __name__ == "__main__":
+    main()  # run the tuning sweep only when executed as a script
--- a/scripts/auto_tune_122b_r2.py
+++ b/scripts/auto_tune_122b_r2.py
@@ -0,0 +1,257 @@
+"""
+Qwen3.5 122B-A10B 정밀 튜닝 2라운드
+====================================
+1라운드 결과: mmap on이 더 빠르고, 스레드 수가 적을수록 빠름
+→ mmap on 상태에서 스레드 수 4~8 범위를 정밀 탐색
+"""
+import subprocess
+import time
+import json
+import urllib.request
+import os
+import re
+import sys
+import datetime
+
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass
+
+BASE_URL = "http://127.0.0.1:8000"  # same port as the "--port" value in COMMON_ARGS
+MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
+SERVER_EXE = r"llama_bin_run\llama-server.exe"
+
+COMMON_ARGS = [  # arguments shared by every configuration in this round
+    "--model", MODEL_PATH,
+    "-ngl", "999",
+    "--cpu-moe",
+    "-c", "2048",
+    "-np", "1",
+    "-fa", "on",
+    "--cache-type-k", "q4_0",
+    "--cache-type-v", "q4_0",
+    "-ub", "256",
+    "-b", "1024",
+    "--mlock",
+    "--port", "8000",
+    "--host", "0.0.0.0",
+    "--no-warmup",  # main() sends its own warm-up request before measuring
+]
+
+CONFIGS = [  # per-run variations; "extra" is appended to COMMON_ARGS by start_server
+    {
+        "name": "F) mmap on, -t 4",
+        "desc": "최소 스레드 (4개, 물리코어 절반)",
+        "extra": ["-t", "4", "--prio", "2"],
+    },
+    {
+        "name": "G) mmap on, -t 5",
+        "desc": "스레드 5개",
+        "extra": ["-t", "5", "--prio", "2"],
+    },
+    {
+        "name": "H) mmap on, -t 6",
+        "desc": "스레드 6개 (--no-mmap에서 최고였음)",
+        "extra": ["-t", "6", "--prio", "2"],
+    },
+    {
+        "name": "I) mmap on, -t 7",
+        "desc": "스레드 7개",
+        "extra": ["-t", "7", "--prio", "2"],
+    },
+    {
+        "name": "J) mmap on, -t 6, --prio 3",
+        "desc": "최적 스레드 + 리얼타임 우선순위",
+        "extra": ["-t", "6", "--prio", "3"],
+    },
+]
+
+def kill_server():  # force-terminate every llama-server.exe process, then pause
+    os.system("taskkill /F /IM llama-server.exe >nul 2>&1")  # taskkill output is discarded
+    time.sleep(3)  # fixed 3-second pause before the caller continues
+
+def start_server(config, log_path):  # launch llama-server; returns (proc, log_file)
+    cmd = [SERVER_EXE] + COMMON_ARGS + config["extra"]  # shared args first, then per-config extras
+    log_file = open(log_path, "w", encoding="utf-8")  # returned open; the caller closes it
+    proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, cwd=os.getcwd())  # stdout and stderr both go to the log
+    return proc, log_file
+
+def wait_for_server(timeout=600):
+    """Poll GET /health every 5 s until it reports status "ok"; returns False after `timeout` seconds."""
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            with urllib.request.urlopen(f"{BASE_URL}/health", timeout=5) as resp:
+                data = json.loads(resp.read())
+                if data.get("status") == "ok":
+                    return True
+        except Exception:  # server not ready / bad payload; KeyboardInterrupt still propagates
+            pass
+        time.sleep(5)
+    return False
+
+def run_single_benchmark(prompt, max_tokens=200):  # returns (completion_tokens, elapsed_seconds)
+    payload = json.dumps({
+        "model": "local-model",
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }).encode("utf-8")
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+    start = time.time()
+    with urllib.request.urlopen(req, timeout=600) as resp:
+        result = json.loads(resp.read())
+    elapsed = time.time() - start  # wall-clock time for the whole HTTP round trip
+    usage = result.get("usage", {})
+    return usage.get("completion_tokens", 0), elapsed  # token count is 0 when usage data is absent
+
+def parse_eval_times(log_path):
+    """Return [{"tps": ..., "tokens": ...}] for each "eval time" line in the log; [] if it cannot be read."""
+    try:
+        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
+            content = f.read()
+    except OSError:  # only I/O failures mean "no data"; KeyboardInterrupt and bugs still surface
+        return []
+    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
+    return [{"tps": float(m[3]), "tokens": int(m[1])} for m in re.findall(pattern, content, re.MULTILINE)]
+
+def parse_prompt_eval_times(log_path):
+    """Return [{"tps": ...}] for each "prompt eval time" line in the log; [] if it cannot be read."""
+    try:
+        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
+            content = f.read()
+    except OSError:  # only I/O failures mean "no data"; KeyboardInterrupt and bugs still surface
+        return []
+    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
+    return [{"tps": float(m[3])} for m in re.findall(pattern, content, re.MULTILINE)]
+
+def main():
+    """Benchmark every entry in CONFIGS and print them below the hard-coded round-1 results."""
+    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")  # not used again in this function
+    print("=" * 70)
+    print("  Qwen3.5 122B-A10B 정밀 튜닝 - 2라운드")
+    print(f"  시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
+    print(f"  테스트: {len(CONFIGS)}개 설정 (mmap on + 스레드 4~7 정밀 탐색)")
+    print("=" * 70)
+    print()
+    
+    all_results = []
+    
+    for idx, config in enumerate(CONFIGS):
+        config_start = time.time()
+        log_path = os.path.join(os.getcwd(), f"tune_r2_log_{idx}.txt")
+        
+        print(f"\n{'='*70}")
+        print(f"  [{idx+1}/{len(CONFIGS)}] {config['name']}")
+        print(f"  {config['desc']}")
+        print(f"  시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
+        print(f"{'='*70}")
+        
+        kill_server()
+        print(f"  [1/3] 서버 시작 중...")
+        proc, log_file = start_server(config, log_path)
+        
+        if not wait_for_server(timeout=600):
+            print("  ❌ 서버 시작 실패!")
+            kill_server()
+            log_file.close()
+            all_results.append({"config": config["name"], "status": "FAILED", "eval_tps": []})
+            continue
+        
+        load_time = time.time() - config_start  # also includes the kill_server() pause above
+        print(f"  [2/3] 서버 준비 완료! ({load_time:.0f}초)")
+        
+        # Warm-up + benchmark
+        try:
+            run_single_benchmark("Say hello.", max_tokens=20)
+        except:
+            pass
+        
+        print("  [3/3] 벤치마크 3회...")
+        prompts = [
+            "Write a detailed explanation of how neural networks learn through backpropagation.",
+            "Explain the complete process of photosynthesis including light and dark reactions.",
+            "Describe the differences between SQL and NoSQL databases with examples.",
+        ]
+        for i, prompt in enumerate(prompts):
+            try:
+                tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
+                print(f"    Run {i+1}: {tokens}tok, {elapsed:.1f}s, ~{tokens/elapsed:.2f} t/s")
+            except Exception as e:
+                print(f"    Run {i+1}: ERROR - {e}")
+        
+        time.sleep(2)
+        kill_server()
+        log_file.close()
+        time.sleep(2)
+        
+        eval_times = parse_eval_times(log_path)
+        prompt_times = parse_prompt_eval_times(log_path)
+        bench_evals = eval_times[1:] if len(eval_times) > 1 else eval_times  # drop the first entry (presumably the warm-up)
+        bench_prompts = prompt_times[1:] if len(prompt_times) > 1 else prompt_times
+        
+        eval_speeds = [e["tps"] for e in bench_evals]
+        prompt_speeds = [p["tps"] for p in bench_prompts]
+        
+        all_results.append({
+            "config": config["name"],
+            "status": "OK",
+            "eval_tps": eval_speeds,
+            "prompt_tps": prompt_speeds,
+        })
+        
+        if eval_speeds:
+            print(f"  📊 Eval: avg={sum(eval_speeds)/len(eval_speeds):.2f}, max={max(eval_speeds):.2f}")
+    
+    # Final results
+    print("\n")
+    print("=" * 85)
+    print("  🏆 전체 튜닝 결과 (1라운드 + 2라운드 통합)")
+    print("=" * 85)
+    print()
+    print(f"  {'설정':<48} {'Avg t/s':>8} {'Max t/s':>8} {'Prompt':>8}")
+    print(f"  {'-'*48} {'-'*8} {'-'*8} {'-'*8}")
+    
+    # Round-1 results (hard-coded): (name, avg t/s, max t/s, prompt t/s)
+    r1 = [
+        ("[기준] mmap on, -t 8, --prio 2",              10.02, 10.06, 29.52),
+        ("A) --no-mmap -t 8",                           9.66,  9.70,  28.26),
+        ("B) --no-mmap -t 6",                          10.02, 10.18,  26.73),
+        ("C) --no-mmap -t 10",                          9.42,  9.46,  27.31),
+        ("D) --no-mmap -t 12",                          9.04,  9.11,  27.92),
+        ("E) --no-mmap -t 10 --prio 3 --poll 100",     9.41,  9.45,  28.37),
+    ]
+    for name, avg, mx, pp in r1:
+        marker = " ⭐" if avg >= 10.0 else ""
+        print(f"  {name:<48} {avg:>8.2f} {mx:>8.2f} {pp:>8.2f}{marker}")
+    
+    print(f"  {'--- 2라운드 ---':<48}")
+    
+    best_avg = 10.06  # previous best; NOTE(review): round-1 row B has max 10.18 — confirm this seed
+    best_config = "[기준] mmap on, -t 8"
+    
+    for r in all_results:
+        if r["status"] != "OK" or not r["eval_tps"]:
+            print(f"  {r['config']:<48} {'FAIL':>8}")
+            continue
+        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
+        max_e = max(r["eval_tps"])
+        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
+        if max_e > best_avg:  # despite its name, best_avg tracks the best *max* t/s
+            best_avg = max_e
+            best_config = r["config"]
+        marker = " ⭐" if avg_e >= 10.0 else ""
+        print(f"  {r['config']:<48} {avg_e:>8.2f} {max_e:>8.2f} {avg_p:>8.2f}{marker}")
+    
+    print()
+    print(f"  🏆 최고 성능: {best_config} → {best_avg:.2f} t/s")
+    print(f"  완료: {datetime.datetime.now().strftime('%H:%M:%S')}")
+    print("=" * 85)
+
+if __name__ == "__main__":
+    main()  # run the round-2 sweep only when executed as a script
--- a/scripts/auto_tune_gemma4_256k.py
+++ b/scripts/auto_tune_gemma4_256k.py
@@ -0,0 +1,339 @@
+"""
+Gemma4 26B-A4B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
+Phase 1: -ngl sweep (GPU layers)
+Phase 2: -t / -tb sweep (CPU threads)
+Phase 3: -ub / -b sweep (batch sizes)
+Phase 4: --cache-type-k/v sweep (KV cache precision)
+Phase 5: --no-mmap, --poll, --prio sweep (misc)
+Each phase fixes the best from previous phases.
+"""
+import subprocess
+import time
+import json
+import urllib.request
+import sys
+import os
+import itertools
+
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass
+
+BASE_URL = "http://127.0.0.1:8000"  # same port as the "--port" value used in build_cmd
+LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
+MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"
+CONTEXT = 262144  # passed as -c (256 * 1024 tokens)
+BENCHMARK_RUNS = 3  # measured requests per configuration, after one warm-up
+BENCHMARK_TOKENS = 200  # default max_tokens for each measured request
+
+# ─── Baseline (from previous tuning at -c 4096) ───
+BEST = {
+    "ngl": 22,  # -ngl
+    "t": 8,  # -t
+    "tb": 8,  # -tb
+    "ub": 512,  # -ub
+    "b": 2048,  # -b
+    "ctk": "q4_0",  # --cache-type-k
+    "ctv": "q4_0",  # --cache-type-v
+    "fa": "on",  # -fa
+    "mlock": True,  # True adds --mlock
+    "mmap": True,  # False adds --no-mmap
+    "prio": 2,  # --prio
+    "poll": 50,  # --poll
+}
+
+ALL_RESULTS = []  # every successful test_config() result, in run order
+
+
+def kill_server():
+    """Force-kill all llama-server.exe processes (taskkill output is captured and ignored), then wait 4 s."""
+    subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"], capture_output=True)
+    time.sleep(4)
+
+
+def build_cmd(cfg):
+    """Assemble the llama-server argument list for one tuning configuration.
+
+    The executable, model, context size, -np, port and host are fixed; every
+    other value is read from ``cfg``. ``--mlock`` is appended when cfg["mlock"]
+    is truthy and ``--no-mmap`` when cfg["mmap"] is falsy.
+    """
+    # Fixed executable/model first, then the tunable flags in a stable order.
+    argv = [LLAMA_SERVER, "--model", MODEL, "-ngl", str(cfg["ngl"])]
+    argv += ["-c", str(CONTEXT), "-np", "1", "-fa", cfg["fa"]]
+    argv += ["--cache-type-k", cfg["ctk"], "--cache-type-v", cfg["ctv"]]
+    argv += ["-ub", str(cfg["ub"]), "-b", str(cfg["b"])]
+    argv += ["-t", str(cfg["t"]), "-tb", str(cfg["tb"])]
+    argv += ["--prio", str(cfg["prio"]), "--poll", str(cfg["poll"])]
+    argv += ["--port", "8000", "--host", "0.0.0.0"]
+    # Boolean switches are present only when enabled.
+    switches = (("--mlock", cfg["mlock"]), ("--no-mmap", not cfg["mmap"]))
+    for flag, enabled in switches:
+        if enabled:
+            argv.append(flag)
+    return argv
+
+
+def start_server(cfg):
+    """Launch llama-server for cfg and return the Popen handle; server output is discarded (an unread PIPE could fill and stall the server)."""
+    proc = subprocess.Popen(
+        build_cmd(cfg), stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT,
+        cwd=os.getcwd(), text=True, encoding='utf-8', errors='replace'
+    )
+    return proc
+
+
+def wait_for_server(timeout=180):
+    """Poll GET /health every 2 s until it reports status "ok"; returns False after `timeout` seconds."""
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            with urllib.request.urlopen(f"{BASE_URL}/health", timeout=3) as resp:
+                data = json.loads(resp.read())
+                if data.get("status") == "ok":
+                    return True
+        except Exception:  # server not ready / bad payload; KeyboardInterrupt still propagates
+            pass
+        time.sleep(2)
+    return False
+
+
+def run_benchmark(max_tokens=BENCHMARK_TOKENS):
+    """Send one fixed chat-completion request and return completion tokens per wall-clock second."""
+    payload = json.dumps({
+        "model": "local-model",
+        "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }).encode("utf-8")
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+
+    start = time.time()
+    with urllib.request.urlopen(req, timeout=300) as resp:
+        result = json.loads(resp.read())
+    elapsed = time.time() - start  # covers the whole HTTP round trip, not only token generation
+
+    usage = result.get("usage", {})
+    ct = usage.get("completion_tokens", 0)  # 0 when the response carries no usage data
+    return ct / elapsed if elapsed > 0 else 0
+
+
+def get_vram():
+    """Return (used, total) memory in MiB for the first GPU reported by nvidia-smi, or (0, 0) on any failure."""
+    try:
+        r = subprocess.run(
+            ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=5
+        )
+        used, total = r.stdout.strip().splitlines()[0].split(",")  # one line per GPU; read only the first
+        return int(used), int(total)
+    except Exception:  # nvidia-smi missing, timed out, or unparsable; KeyboardInterrupt still propagates
+        return 0, 0
+
+
+def test_config(cfg, label=""):
+    """Restart the server with cfg, run one warm-up plus BENCHMARK_RUNS requests, and return the result dict (None on failure)."""
+    kill_server()
+    desc = label or str(cfg)
+    print(f"  [{desc}] Starting server...")
+    proc = start_server(cfg)
+    if not wait_for_server():
+        print(f"  [{desc}] FAILED to start")
+        proc.kill()
+        return None
+
+    vram_used, vram_total = get_vram()  # sampled once, after the server is ready and before any request
+    print(f"  [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)
+
+    # Warmup
+    try:
+        run_benchmark(max_tokens=20)
+    except:
+        pass
+
+    # Benchmark
+    speeds = []
+    for i in range(BENCHMARK_RUNS):
+        try:
+            tps = run_benchmark()
+            speeds.append(tps)
+        except Exception as e:
+            print(f"ERR({e}) ", end="", flush=True)  # a failed run is reported and left out of the average
+
+    proc.kill()
+
+    if not speeds:
+        print("ALL FAILED")
+        return None
+
+    avg = sum(speeds) / len(speeds)
+    best = max(speeds)
+    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
+
+    # The result carries every cfg key plus the measurements, and is also recorded in ALL_RESULTS.
+    result = {**cfg, "avg_tps": avg, "best_tps": best,
+              "vram_used": vram_used, "vram_total": vram_total, "label": label}
+    ALL_RESULTS.append(result)
+    return result
+
+
+def phase_sweep(phase_name, param_name, values, base_cfg):
+    """Benchmark each value of param_name on top of base_cfg and return the result with the highest avg_tps (None if all fail)."""
+    print(f"\n{'='*70}")
+    print(f"  PHASE: {phase_name}")
+    print(f"  Sweeping: {param_name} = {values}")
+    print(f"{'='*70}")
+    best_result = None
+    for val in values:
+        cfg = {**base_cfg}  # shallow copy so base_cfg itself is not modified
+        if isinstance(param_name, list):
+            # Several parameters swept together: each val is a tuple aligned with param_name.
+            for p, v in zip(param_name, val):
+                cfg[p] = v
+            label = " | ".join(f"{p}={v}" for p, v in zip(param_name, val))
+        else:
+            cfg[param_name] = val
+            label = f"{param_name}={val}"
+
+        r = test_config(cfg, label)
+        if r and (best_result is None or r["avg_tps"] > best_result["avg_tps"]):  # ties keep the earlier value
+            best_result = r
+
+    if best_result:
+        print(f"\n  ★ Phase winner: {best_result['label']} → {best_result['avg_tps']:.2f} t/s")
+    return best_result
+
+
+def main():
+    """Run the staged parameter sweep, verify the winner, and save all results.
+
+    Each phase sweeps one group of parameters through ``phase_sweep`` while the
+    winners of earlier phases stay fixed (a greedy phase-by-phase search, not
+    an exhaustive grid). The final configuration is then re-benchmarked with
+    5 runs, a recommended command line is printed, and ``ALL_RESULTS`` is
+    written to ``scripts/tune_results_gemma4_256k.json``.
+    """
+    print("=" * 70)
+    print("  Gemma4 26B-A4B COMPREHENSIVE Auto-Tuner")
+    print("  256K Context | RTX 3060 12GB")
+    print("=" * 70)
+    print()
+
+    cfg = dict(BEST)
+
+    # ─── Phase 1: -ngl (already done, quick verify top 3) ───
+    r = phase_sweep("GPU Layers (-ngl)", "ngl", [22, 21, 20], cfg)
+    if r:
+        cfg["ngl"] = r["ngl"]
+
+    # ─── Phase 2: CPU threads (-t, -tb) ───
+    thread_combos = [
+        (2, 2), (4, 4), (4, 8), (6, 6), (6, 8),
+        (8, 8), (8, 12), (10, 10), (12, 12), (16, 16)
+    ]
+    r = phase_sweep("CPU Threads (-t, -tb)", ["t", "tb"], thread_combos, cfg)
+    if r:
+        cfg["t"] = r["t"]
+        cfg["tb"] = r["tb"]
+
+    # ─── Phase 3: Batch sizes (-ub, -b) ───
+    batch_combos = [
+        (128, 512), (256, 1024), (256, 2048),
+        (512, 1024), (512, 2048), (512, 4096),
+        (1024, 2048), (1024, 4096)
+    ]
+    r = phase_sweep("Batch Sizes (-ub, -b)", ["ub", "b"], batch_combos, cfg)
+    if r:
+        cfg["ub"] = r["ub"]
+        cfg["b"] = r["b"]
+
+    # ─── Phase 4: KV cache precision ───
+    kv_combos = [
+        ("q4_0", "q4_0"),
+        ("q8_0", "q8_0"),
+        ("q4_0", "q8_0"),
+        ("f16", "f16"),
+    ]
+    r = phase_sweep("KV Cache Type (-ctk, -ctv)", ["ctk", "ctv"], kv_combos, cfg)
+    if r:
+        cfg["ctk"] = r["ctk"]
+        cfg["ctv"] = r["ctv"]
+
+    # ─── Phase 5: Misc (mmap, poll, prio) ───
+    misc_combos = [
+        (True, 50, 2),   # baseline
+        (False, 50, 2),  # no-mmap
+        (True, 0, 2),    # no polling
+        (True, 100, 2),  # max polling
+        (True, 50, 3),   # realtime priority
+        (False, 0, 3),   # no-mmap + no-poll + realtime
+    ]
+    r = phase_sweep("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], misc_combos, cfg)
+    if r:
+        cfg["mmap"] = r["mmap"]
+        cfg["poll"] = r["poll"]
+        cfg["prio"] = r["prio"]
+
+    # ─── Final Report ───
+    print()
+    print("=" * 70)
+    print("  FINAL OPTIMAL CONFIGURATION")
+    print("=" * 70)
+    print(f"  ngl:       {cfg['ngl']}")
+    print(f"  threads:   -t {cfg['t']} -tb {cfg['tb']}")
+    print(f"  batch:     -ub {cfg['ub']} -b {cfg['b']}")
+    print(f"  kv cache:  -ctk {cfg['ctk']} -ctv {cfg['ctv']}")
+    print(f"  flash:     -fa {cfg['fa']}")
+    print(f"  mlock:     {'yes' if cfg['mlock'] else 'no'}")
+    print(f"  mmap:      {'yes' if cfg['mmap'] else 'no (--no-mmap)'}")
+    print(f"  prio:      {cfg['prio']}")
+    print(f"  poll:      {cfg['poll']}")
+    print()
+
+    # Final verification run
+    print("  Running final verification (5 runs)...")
+    kill_server()
+    proc = start_server(cfg)
+    final_speeds = []
+    # try/finally: always stop the server, even on Ctrl+C.
+    try:
+        # Only benchmark when the server actually came up; otherwise say so
+        # instead of silently producing no verification numbers.
+        if wait_for_server():
+            try:
+                run_benchmark(max_tokens=20)  # warmup, result discarded
+            except Exception:
+                pass
+            for i in range(5):
+                try:
+                    tps = run_benchmark()
+                    final_speeds.append(tps)
+                    print(f"    Run {i+1}: {tps:.2f} t/s")
+                except Exception as e:
+                    print(f"    Run {i+1}: ERR({e})")
+        else:
+            print("  Final verification: server FAILED to start")
+    finally:
+        proc.kill()
+
+    if final_speeds:
+        avg = sum(final_speeds) / len(final_speeds)
+        best = max(final_speeds)
+        print(f"\n  ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")
+
+    print()
+    cmd_parts = [
+        f"llama-server --model {MODEL}",
+        f"-ngl {cfg['ngl']} -c {CONTEXT}",
+        f"-t {cfg['t']} -tb {cfg['tb']}",
+        f"-ub {cfg['ub']} -b {cfg['b']}",
+        f"-fa {cfg['fa']}",
+        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
+        f"--prio {cfg['prio']} --poll {cfg['poll']}",
+    ]
+    if cfg["mlock"]:
+        cmd_parts.append("--mlock")
+    if not cfg["mmap"]:
+        cmd_parts.append("--no-mmap")
+    cmd_parts.append("--port 8000 --host 0.0.0.0")
+
+    print("  Recommended command:")
+    print(f"    {' '.join(cmd_parts)}")
+    print("=" * 70)
+
+    # Dump all results to JSON
+    with open("scripts/tune_results_gemma4_256k.json", "w") as f:
+        json.dump(ALL_RESULTS, f, indent=2, default=str)
+    print(f"\n  Full results saved: scripts/tune_results_gemma4_256k.json")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/auto_tune_gemma4_ncpumoe.py
+++ b/scripts/auto_tune_gemma4_ncpumoe.py
@@ -0,0 +1,163 @@
+"""
+Gemma4 26B --n-cpu-moe sweep + secondary param tuning at 256K context.
+Gemma4 has 30 layers. Sweep n-cpu-moe from 0 to 30.
+"""
+import subprocess, time, json, urllib.request, sys, os
+
+# Switch stdout to UTF-8 so the "★" / "→" characters in the progress output can
+# be printed. Best-effort: any failure leaves stdout unchanged, but
+# KeyboardInterrupt/SystemExit are no longer swallowed.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except Exception:
+    pass
+
+# Base URL of the llama-server instance launched by start() (port 8000).
+BASE_URL = "http://127.0.0.1:8000"
+# Executable and model paths; start() runs with cwd=os.getcwd(), so these are
+# resolved relative to the directory the script is launched from.
+SERVER = r"llama_bin_run\llama-server.exe"
+MODEL = r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf"
+# Context size passed to the server via -c (262144 = 256K tokens).
+CTX = 262144
+# Number of timed benchmark requests per configuration.
+RUNS = 3
+
+
+def kill():
+    """Force-terminate all ``llama-server.exe`` processes, then sleep 4 seconds."""
+    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
+    # Output is captured only to keep it off the console; the result is unused.
+    subprocess.run(taskkill_cmd, capture_output=True)
+    # Presumably gives the OS time to release GPU memory and the port.
+    time.sleep(4)
+
+
+def start(ncpumoe, t=4, ub=512, b=2048, ctk="q4_0", ctv="q4_0", prio=3, nommap=False):
+    """Launch llama-server with the given tuning parameters.
+
+    Args:
+        ncpumoe: Value for ``--n-cpu-moe``; the flag is omitted when ``<= 0``.
+        t: Thread count, used for both ``-t`` and ``-tb``.
+        ub: Value for ``-ub``.
+        b: Value for ``-b``.
+        ctk: Value for ``--cache-type-k``.
+        ctv: Value for ``--cache-type-v``.
+        prio: Value for ``--prio``.
+        nommap: When true, append ``--no-mmap``.
+
+    Returns:
+        The ``subprocess.Popen`` handle of the started server.
+    """
+    cmd = [SERVER, "--model", MODEL, "-ngl", "999",
+           "-c", str(CTX), "-np", "1", "-fa", "on",
+           "--cache-type-k", ctk, "--cache-type-v", ctv,
+           "-ub", str(ub), "-b", str(b), "-t", str(t), "-tb", str(t),
+           "--prio", str(prio), "--poll", "50",
+           "--mlock", "--port", "8000", "--host", "0.0.0.0"]
+    if ncpumoe > 0:
+        cmd.extend(["--n-cpu-moe", str(ncpumoe)])
+    if nommap:
+        cmd.append("--no-mmap")
+    # Nothing in this script reads the server's output. An unread PIPE can fill
+    # the OS pipe buffer and stall the child on its own logging, so the output
+    # is discarded instead.
+    return subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT,
+                            cwd=os.getcwd())
+
+
+def wait_ready(timeout=240):
+    """Poll ``GET {BASE_URL}/health`` until it reports ``{"status": "ok"}``.
+
+    Args:
+        timeout: Maximum number of seconds to keep polling.
+
+    Returns:
+        ``True`` as soon as the health endpoint answers with status ``"ok"``,
+        ``False`` if that did not happen within ``timeout`` seconds.
+    """
+    t0 = time.time()
+    while time.time() - t0 < timeout:
+        try:
+            with urllib.request.urlopen(urllib.request.Request(f"{BASE_URL}/health"), timeout=3) as r:
+                if json.loads(r.read()).get("status") == "ok":
+                    return True
+        except Exception:
+            # Server not ready yet (connection refused, HTTP error, bad JSON):
+            # retry. KeyboardInterrupt is deliberately not caught, so Ctrl+C
+            # aborts the wait.
+            pass
+        time.sleep(2)
+    return False
+
+
+def bench(n=200):
+    """Send one chat-completion request and return completion tokens per second.
+
+    The elapsed time covers the whole HTTP round trip (request sent until the
+    response body is fully read).
+
+    Args:
+        n: ``max_tokens`` for the request.
+
+    Returns:
+        ``usage.completion_tokens`` divided by the elapsed seconds, or ``0``
+        when no time elapsed. A response without usage data counts as 0 tokens.
+    """
+    body = {
+        "model": "m",
+        "messages": [{"role": "user",
+                      "content": "Count from 1 to 50, each number on new line."}],
+        "max_tokens": n,
+        "temperature": 0.0,
+    }
+    request = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=json.dumps(body).encode(),
+        headers={"Content-Type": "application/json"},
+    )
+    started = time.time()
+    with urllib.request.urlopen(request, timeout=300) as resp:
+        reply = json.loads(resp.read())
+    elapsed = time.time() - started
+    tokens = reply.get("usage", {}).get("completion_tokens", 0)
+    if elapsed > 0:
+        return tokens / elapsed
+    return 0
+
+
+def vram():
+    """Return ``(used_mib, total_mib)`` of GPU memory as reported by nvidia-smi.
+
+    nvidia-smi prints one ``used, total`` CSV row per GPU. The rows are summed,
+    so a multi-GPU host reports a combined figure instead of failing to parse.
+    Returns ``(0, 0)`` when nvidia-smi cannot be run, times out, or prints
+    nothing parseable.
+    """
+    try:
+        r = subprocess.run(["nvidia-smi", "--query-gpu=memory.used,memory.total",
+                            "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5)
+        used = total = 0
+        for row in r.stdout.strip().splitlines():
+            a, b = row.split(",")
+            used += int(a.strip())
+            total += int(b.strip())
+        return used, total
+    except (OSError, subprocess.SubprocessError, ValueError):
+        # Missing binary, timeout, or unexpected output such as "[N/A]".
+        return 0, 0
+
+
+def test(label, ncpumoe, **kw):
+    """Benchmark one server configuration.
+
+    Calls ``kill()``, launches ``start(ncpumoe, **kw)``, waits for the server to
+    become healthy, sends one warmup request, then ``RUNS`` timed requests.
+
+    Args:
+        label: Text shown in the progress output and stored in the result.
+        ncpumoe: ``--n-cpu-moe`` value forwarded to ``start``.
+        **kw: Extra keyword arguments forwarded to ``start``; they are also
+            copied into the result dict.
+
+    Returns:
+        A dict with ``label``, ``ncpumoe``, ``avg``, ``best``, ``vram`` plus
+        ``kw``, or ``None`` when the server did not start or every timed run
+        failed.
+    """
+    kill()
+    print(f"  [{label}] Starting...", end=" ", flush=True)
+    p = start(ncpumoe, **kw)
+    # try/finally guarantees the server is killed even on Ctrl+C.
+    try:
+        if not wait_ready():
+            print("FAILED")
+            return None
+        vu, vt = vram()
+        print(f"VRAM:{vu}/{vt} | ", end="", flush=True)
+        # Warmup: result discarded, request failures tolerated.
+        try:
+            bench(20)
+        except Exception:
+            pass
+        speeds = []
+        for _ in range(RUNS):
+            try:
+                speeds.append(bench())
+            except Exception as e:
+                # Surface the failure instead of silently dropping the run.
+                print(f"ERR({e}) ", end="", flush=True)
+    finally:
+        p.kill()
+    if not speeds:
+        print("BENCH FAILED")
+        return None
+    avg, best = sum(speeds)/len(speeds), max(speeds)
+    print(f"AVG:{avg:.1f} BEST:{best:.1f} t/s")
+    return {"label": label, "ncpumoe": ncpumoe, "avg": avg, "best": best,
+            "vram": vu, **kw}
+
+
+def main():
+    """Sweep ``--n-cpu-moe``, then threads, batch sizes and KV cache type.
+
+    Phase 1 tests a coarse grid of ``--n-cpu-moe`` values and Phase 1b refines
+    around the best one. Phases 2-4 vary threads, batch sizes and KV cache
+    type at that ``--n-cpu-moe`` value. All successful results are written to
+    ``scripts/tune_results_gemma4_ncpumoe.json``. Returns early, without
+    writing a file, if no Phase 1 configuration produced a result.
+    """
+    print("=" * 60)
+    print("  Gemma4 26B 256K | --n-cpu-moe Sweep + Param Tune")
+    print("=" * 60)
+    results = []
+
+    # Phase 1: n-cpu-moe sweep (0, 5, 10, 15, 20, 25, 30)
+    print("\n--- Phase 1: --n-cpu-moe sweep ---")
+    for n in [0, 5, 10, 15, 20, 25, 30]:
+        nm = n > 15  # use --no-mmap when heavy CPU offload
+        r = test(f"ncpumoe={n}", n, nommap=nm)
+        if r: results.append(r)
+
+    # max() below raises ValueError on an empty list, so stop cleanly when no
+    # configuration could be started or benchmarked.
+    if not results:
+        print("\n  No configuration produced a result; nothing to tune.")
+        return
+
+    # Find best n-cpu-moe
+    best_r = max(results, key=lambda x: x["avg"])
+    best_n = best_r["ncpumoe"]
+    print(f"\n  ★ Best n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")
+
+    # Fine-tune around best
+    if best_n > 0:
+        print(f"\n--- Phase 1b: Fine-tune around ncpumoe={best_n} ---")
+        # Neighbours at -3/-1/+1/+3, clamped to the valid 0..30 layer range,
+        # de-duplicated, and excluding the value that was already measured.
+        neighbours = sorted({min(30, max(0, best_n + d)) for d in (-3, -1, 1, 3)} - {best_n})
+        for n in neighbours:
+            nm = n > 15
+            r = test(f"ncpumoe={n}", n, nommap=nm)
+            if r: results.append(r)
+        best_r = max(results, key=lambda x: x["avg"])
+        best_n = best_r["ncpumoe"]
+        print(f"\n  ★ Refined n-cpu-moe: {best_n} → {best_r['avg']:.1f} t/s")
+
+    # Phase 2: Thread sweep at best n-cpu-moe
+    nm = best_n > 15
+    print(f"\n--- Phase 2: Thread sweep (ncpumoe={best_n}) ---")
+    for t in [2, 4, 6, 8, 10]:
+        r = test(f"t={t}", best_n, t=t, nommap=nm)
+        if r: results.append(r)
+    # Results from Phase 1 carry no "t" key; they ran with start()'s default of 4.
+    best_t = max([x for x in results if x["ncpumoe"]==best_n], key=lambda x: x["avg"])
+    bt = best_t.get("t", 4)
+    print(f"\n  ★ Best threads: {bt}")
+
+    # Phase 3: Batch sweep
+    print(f"\n--- Phase 3: Batch sweep ---")
+    for ub, b in [(256, 1024), (512, 2048), (512, 4096), (1024, 2048)]:
+        r = test(f"ub={ub},b={b}", best_n, t=bt, ub=ub, b=b, nommap=nm)
+        if r: results.append(r)
+
+    # Phase 4: KV cache type
+    print(f"\n--- Phase 4: KV cache type ---")
+    for ctk, ctv in [("q4_0","q4_0"), ("q8_0","q8_0")]:
+        r = test(f"kv={ctk}", best_n, t=bt, ctk=ctk, ctv=ctv, nommap=nm)
+        if r: results.append(r)
+
+    # Final report
+    best_all = max(results, key=lambda x: x["avg"])
+    print(f"\n{'='*60}")
+    print(f"  FINAL BEST: {best_all['label']} → {best_all['avg']:.1f} t/s (VRAM: {best_all['vram']})")
+    print(f"{'='*60}")
+
+    with open("scripts/tune_results_gemma4_ncpumoe.json", "w") as f:
+        json.dump(results, f, indent=2, default=str)
+    print("  Saved: scripts/tune_results_gemma4_ncpumoe.json")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/auto_tune_qwen35b_256k.py
+++ b/scripts/auto_tune_qwen35b_256k.py
@@ -0,0 +1,335 @@
+"""
+Qwen3.5 35B-A3B Comprehensive Auto-Tuner | 256K Context | RTX 3060 12GB
+Based on existing optimized setting: --cpu-moe -ngl 999 -c 4096 -t 6 (35 t/s)
+Now tuning for -c 262144 (256K context).
+
+Phase 1: --cpu-moe vs no --cpu-moe baseline
+Phase 2: -t / -tb sweep
+Phase 3: -ub / -b sweep
+Phase 4: --cache-type-k/v sweep
+Phase 5: Misc (mmap, poll, prio)
+"""
+import subprocess
+import time
+import json
+import urllib.request
+import sys
+import os
+
+# Switch stdout to UTF-8 when the stream supports reconfigure(); streams
+# without that method are left unchanged.
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass
+
+# Base URL of the llama-server instance launched by start_server() (port 8000).
+BASE_URL = "http://127.0.0.1:8000"
+# Executable and model paths; start_server() runs with cwd=os.getcwd(), so
+# these are resolved relative to the directory the script is launched from.
+LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
+MODEL = r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf"
+# Context size passed to the server via -c (262144 = 256K tokens).
+CONTEXT = 262144
+# Timed requests per configuration, and max_tokens of each timed request.
+BENCHMARK_RUNS = 3
+BENCHMARK_TOKENS = 200
+
+# Starting configuration for the sweep. Keys are consumed by build_cmd():
+# most map to a flag with a value, while cpu_moe / mlock / mmap toggle
+# --cpu-moe / --mlock / --no-mmap.
+BEST = {
+    "ngl": 999,
+    "cpu_moe": True,
+    "t": 6,
+    "tb": 6,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": True,
+    "mmap": True,
+    "prio": 2,
+    "poll": 50,
+}
+
+# Every successful test_config() result, in execution order; dumped to JSON by main().
+ALL_RESULTS = []
+
+
+def kill_server():
+    """Force-terminate all ``llama-server.exe`` processes, then sleep 4 seconds."""
+    taskkill_cmd = ["taskkill", "/F", "/IM", "llama-server.exe"]
+    # Output is captured only to keep it off the console; the result is unused.
+    subprocess.run(taskkill_cmd, capture_output=True)
+    # Presumably gives the OS time to release GPU memory and the port.
+    time.sleep(4)
+
+
+def build_cmd(cfg):
+    """Translate a tuning configuration into the llama-server argument list.
+
+    Args:
+        cfg: Mapping with the keys ``ngl``, ``fa``, ``ctk``, ``ctv``, ``ub``,
+            ``b``, ``t``, ``tb``, ``prio``, ``poll``, ``mlock`` and ``mmap``;
+            ``cpu_moe`` is optional.
+
+    Returns:
+        The argv list, starting with ``LLAMA_SERVER``.
+    """
+    argv = [LLAMA_SERVER, "--model", MODEL]
+    argv += ["-ngl", str(cfg["ngl"]), "-c", str(CONTEXT), "-np", "1"]
+    argv += ["-fa", cfg["fa"]]
+    argv += ["--cache-type-k", cfg["ctk"], "--cache-type-v", cfg["ctv"]]
+    # Flags whose value is stringified, in command-line order.
+    for flag, key in (("-ub", "ub"), ("-b", "b"), ("-t", "t"), ("-tb", "tb"),
+                      ("--prio", "prio"), ("--poll", "poll")):
+        argv += [flag, str(cfg[key])]
+    argv += ["--port", "8000", "--host", "0.0.0.0"]
+
+    # Boolean switches: each flag is appended only when its condition holds.
+    switches = (
+        ("--cpu-moe", cfg.get("cpu_moe")),
+        ("--mlock", cfg["mlock"]),
+        ("--no-mmap", not cfg["mmap"]),
+    )
+    argv.extend(flag for flag, enabled in switches if enabled)
+    return argv
+
+
+def start_server(cfg):
+    """Launch llama-server for ``cfg`` and return its ``subprocess.Popen`` handle.
+
+    The command line comes from ``build_cmd(cfg)``; the process runs in the
+    current working directory.
+    """
+    cmd = build_cmd(cfg)
+    # Nothing in this script reads the server's output. An unread PIPE can fill
+    # the OS pipe buffer and stall the child on its own logging, so the output
+    # is discarded instead.
+    proc = subprocess.Popen(
+        cmd, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT,
+        cwd=os.getcwd()
+    )
+    return proc
+
+
+def wait_for_server(timeout=240):
+    """Poll ``GET {BASE_URL}/health`` until it reports ``{"status": "ok"}``.
+
+    Args:
+        timeout: Maximum number of seconds to keep polling.
+
+    Returns:
+        ``True`` as soon as the health endpoint answers with status ``"ok"``,
+        ``False`` if that did not happen within ``timeout`` seconds.
+    """
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            req = urllib.request.Request(f"{BASE_URL}/health")
+            with urllib.request.urlopen(req, timeout=3) as resp:
+                data = json.loads(resp.read())
+                if data.get("status") == "ok":
+                    return True
+        except Exception:
+            # Server not ready yet (connection refused, HTTP error, bad JSON):
+            # retry. KeyboardInterrupt is deliberately not caught, so Ctrl+C
+            # aborts the wait.
+            pass
+        time.sleep(2)
+    return False
+
+
+def run_benchmark(max_tokens=BENCHMARK_TOKENS):
+    """Send one chat-completion request and return completion tokens per second.
+
+    The elapsed time covers the whole HTTP round trip (request sent until the
+    response body is fully read).
+
+    Args:
+        max_tokens: ``max_tokens`` for the request.
+
+    Returns:
+        ``usage.completion_tokens`` divided by the elapsed seconds, or ``0``
+        when no time elapsed. A response without usage data counts as 0 tokens.
+    """
+    body = {
+        "model": "local-model",
+        "messages": [{"role": "user", "content": "Count from 1 to 50, writing each number on a new line."}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }
+    request = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=json.dumps(body).encode("utf-8"),
+        headers={"Content-Type": "application/json"},
+    )
+
+    started = time.time()
+    with urllib.request.urlopen(request, timeout=300) as resp:
+        reply = json.loads(resp.read())
+    elapsed = time.time() - started
+
+    tokens = reply.get("usage", {}).get("completion_tokens", 0)
+    if elapsed > 0:
+        return tokens / elapsed
+    return 0
+
+
+def get_vram():
+    """Return ``(used_mib, total_mib)`` of GPU memory as reported by nvidia-smi.
+
+    nvidia-smi prints one ``used, total`` CSV row per GPU. The rows are summed,
+    so a multi-GPU host reports a combined figure instead of failing to parse.
+
+    Returns:
+        ``(0, 0)`` when nvidia-smi cannot be run, times out, or prints nothing
+        parseable. The value is informational only, so this is best-effort.
+    """
+    try:
+        r = subprocess.run(
+            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
+             "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=5
+        )
+        used = total = 0
+        for row in r.stdout.strip().splitlines():
+            parts = row.split(",")
+            used += int(parts[0].strip())
+            total += int(parts[1].strip())
+        return used, total
+    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
+        # Missing binary, timeout, or unexpected output such as "[N/A]".
+        return 0, 0
+
+
+def test_config(cfg, label=""):
+    """Benchmark one server configuration and record the outcome.
+
+    Calls ``kill_server()``, launches ``start_server(cfg)``, waits for the
+    server to report healthy, sends one short warmup request, then runs
+    ``BENCHMARK_RUNS`` timed requests.
+
+    Args:
+        cfg: Mapping of tuning parameters (see ``build_cmd``).
+        label: Short description used in progress output; falls back to
+            ``str(cfg)`` when empty.
+
+    Returns:
+        A copy of ``cfg`` extended with ``avg_tps``, ``best_tps``,
+        ``vram_used``, ``vram_total`` and ``label`` (also appended to
+        ``ALL_RESULTS``), or ``None`` when the server never became healthy or
+        every timed run failed.
+    """
+    kill_server()
+    desc = label or str(cfg)
+    print(f"  [{desc}] Starting server...", flush=True)
+    proc = start_server(cfg)
+
+    # try/finally guarantees the server is killed even on Ctrl+C or an
+    # unexpected error, so it cannot linger and interfere with the next run.
+    try:
+        if not wait_for_server():
+            print(f"  [{desc}] FAILED to start")
+            return None
+
+        vram_used, vram_total = get_vram()
+        print(f"  [{desc}] VRAM: {vram_used}/{vram_total} MiB | ", end="", flush=True)
+
+        # Warmup: the result is discarded and request failures are tolerated,
+        # but KeyboardInterrupt/SystemExit must still propagate.
+        try:
+            run_benchmark(max_tokens=20)
+        except Exception:
+            pass
+
+        speeds = []
+        for i in range(BENCHMARK_RUNS):
+            try:
+                tps = run_benchmark()
+                speeds.append(tps)
+            except Exception as e:
+                print(f"ERR({e}) ", end="", flush=True)
+    finally:
+        proc.kill()
+
+    if not speeds:
+        print("ALL FAILED")
+        return None
+
+    avg = sum(speeds) / len(speeds)
+    best = max(speeds)
+    print(f"AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
+
+    result = {**cfg, "avg_tps": avg, "best_tps": best,
+              "vram_used": vram_used, "vram_total": vram_total, "label": label}
+    ALL_RESULTS.append(result)
+    return result
+
+
+def phase_sweep(phase_name, param_name, values, base_cfg):
+    """Try each candidate in ``values`` and return the best-performing result.
+
+    Args:
+        phase_name: Title shown in the banner printed before the sweep.
+        param_name: One config key, or a list of keys that are varied together.
+        values: Candidates to test; for a list of keys, each candidate is a
+            sequence matched positionally against those keys.
+        base_cfg: Configuration that each candidate is layered on top of; a
+            fresh copy is made per candidate.
+
+    Returns:
+        The ``test_config`` result with the largest ``avg_tps``, or ``None`` if
+        every candidate failed.
+    """
+    banner = "=" * 70
+    print(f"\n{banner}")
+    print(f"  PHASE: {phase_name}")
+    print(f"  Sweeping: {param_name} = {values}")
+    print(f"{banner}")
+
+    def overrides_for(candidate):
+        # Pair keys with values for both the single-key and multi-key forms.
+        if isinstance(param_name, list):
+            return list(zip(param_name, candidate))
+        return [(param_name, candidate)]
+
+    champion = None
+    for candidate in values:
+        pairs = overrides_for(candidate)
+        trial = dict(base_cfg)
+        for key, value in pairs:
+            trial[key] = value
+        tag = " | ".join(f"{key}={value}" for key, value in pairs)
+
+        outcome = test_config(trial, tag)
+        if outcome and (champion is None or outcome["avg_tps"] > champion["avg_tps"]):
+            champion = outcome
+
+    if champion:
+        print(f"\n  ★ Phase winner: {champion['label']} → {champion['avg_tps']:.2f} t/s")
+    return champion
+
+
+def main():
+    """Run the staged parameter sweep, verify the winner, and save all results.
+
+    Each phase sweeps one group of parameters through ``phase_sweep`` while the
+    winners of earlier phases stay fixed (a greedy phase-by-phase search, not
+    an exhaustive grid). The final configuration is then re-benchmarked with
+    5 runs, a recommended command line is printed, and ``ALL_RESULTS`` is
+    written to ``scripts/tune_results_qwen35b_256k.json``.
+    """
+    print("=" * 70)
+    print("  Qwen3.5 35B-A3B COMPREHENSIVE Auto-Tuner")
+    print("  256K Context | RTX 3060 12GB")
+    print("  Baseline: --cpu-moe -ngl 999 -c 4096 -t 6 → 35 t/s")
+    print("=" * 70)
+    print()
+
+    cfg = dict(BEST)
+
+    # ─── Phase 1: --cpu-moe critical test ───
+    r = phase_sweep("CPU-MoE Mode", "cpu_moe", [True, False], cfg)
+    if r:
+        cfg["cpu_moe"] = r["cpu_moe"]
+
+    # ─── Phase 2: CPU threads ───
+    thread_combos = [
+        (2, 2), (4, 4), (4, 6), (6, 6), (6, 8),
+        (8, 8), (8, 12), (10, 10), (12, 12)
+    ]
+    r = phase_sweep("CPU Threads (-t, -tb)", ["t", "tb"], thread_combos, cfg)
+    if r:
+        cfg["t"] = r["t"]
+        cfg["tb"] = r["tb"]
+
+    # ─── Phase 3: Batch sizes ───
+    batch_combos = [
+        (128, 512), (256, 1024), (256, 2048),
+        (512, 1024), (512, 2048), (512, 4096),
+        (1024, 2048), (1024, 4096)
+    ]
+    r = phase_sweep("Batch Sizes (-ub, -b)", ["ub", "b"], batch_combos, cfg)
+    if r:
+        cfg["ub"] = r["ub"]
+        cfg["b"] = r["b"]
+
+    # ─── Phase 4: KV cache ───
+    kv_combos = [
+        ("q4_0", "q4_0"),
+        ("q8_0", "q8_0"),
+        ("f16", "f16"),
+    ]
+    r = phase_sweep("KV Cache Type", ["ctk", "ctv"], kv_combos, cfg)
+    if r:
+        cfg["ctk"] = r["ctk"]
+        cfg["ctv"] = r["ctv"]
+
+    # ─── Phase 5: Misc ───
+    misc_combos = [
+        (True, 50, 2),
+        (False, 50, 2),
+        (True, 0, 2),
+        (True, 100, 2),
+        (True, 50, 3),
+    ]
+    r = phase_sweep("Misc (mmap, poll, prio)", ["mmap", "poll", "prio"], misc_combos, cfg)
+    if r:
+        cfg["mmap"] = r["mmap"]
+        cfg["poll"] = r["poll"]
+        cfg["prio"] = r["prio"]
+
+    # ─── Final Report ───
+    print()
+    print("=" * 70)
+    print("  FINAL OPTIMAL CONFIGURATION")
+    print("=" * 70)
+    for k, v in cfg.items():
+        print(f"  {k:>12}: {v}")
+    print()
+
+    # Final verification
+    print("  Running final verification (5 runs)...")
+    kill_server()
+    proc = start_server(cfg)
+    final_speeds = []
+    # try/finally: always stop the server, even on Ctrl+C.
+    try:
+        # Only benchmark when the server actually came up; otherwise say so
+        # instead of silently producing no verification numbers.
+        if wait_for_server():
+            try:
+                run_benchmark(max_tokens=20)  # warmup, result discarded
+            except Exception:
+                pass
+            for i in range(5):
+                try:
+                    tps = run_benchmark()
+                    final_speeds.append(tps)
+                    print(f"    Run {i+1}: {tps:.2f} t/s")
+                except Exception as e:
+                    print(f"    Run {i+1}: ERR({e})")
+        else:
+            print("  Final verification: server FAILED to start")
+    finally:
+        proc.kill()
+
+    if final_speeds:
+        avg = sum(final_speeds) / len(final_speeds)
+        best = max(final_speeds)
+        print(f"\n  ★ FINAL: AVG {avg:.2f} t/s | BEST {best:.2f} t/s")
+
+    print()
+    cmd_parts = [
+        f"llama-server --model {MODEL}",
+        f"-ngl {cfg['ngl']} -c {CONTEXT}",
+    ]
+    if cfg.get("cpu_moe"):
+        cmd_parts.append("--cpu-moe")
+    cmd_parts.extend([
+        f"-t {cfg['t']} -tb {cfg['tb']}",
+        f"-ub {cfg['ub']} -b {cfg['b']}",
+        f"-fa {cfg['fa']}",
+        f"--cache-type-k {cfg['ctk']} --cache-type-v {cfg['ctv']}",
+        f"--prio {cfg['prio']} --poll {cfg['poll']}",
+    ])
+    if cfg["mlock"]:
+        cmd_parts.append("--mlock")
+    if not cfg["mmap"]:
+        cmd_parts.append("--no-mmap")
+    cmd_parts.append("--port 8000 --host 0.0.0.0")
+
+    print("  Recommended command:")
+    print(f"    {' '.join(cmd_parts)}")
+    print("=" * 70)
+
+    with open("scripts/tune_results_qwen35b_256k.json", "w") as f:
+        json.dump(ALL_RESULTS, f, indent=2, default=str)
+    print(f"\n  Full results saved: scripts/tune_results_qwen35b_256k.json")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/boot_122b.txt
+++ b/scripts/boot_122b.txt
--- a/scripts/boot_122b_38.txt
+++ b/scripts/boot_122b_38.txt
--- a/scripts/boot_122b_42.txt
+++ b/scripts/boot_122b_42.txt
--- a/scripts/boot_122b_44.txt
+++ b/scripts/boot_122b_44.txt
--- a/scripts/boot_122b_auto.txt
+++ b/scripts/boot_122b_auto.txt
--- a/scripts/boot_122b_maxmem.txt
+++ b/scripts/boot_122b_maxmem.txt
--- a/scripts/boot_122b_row.txt
+++ b/scripts/boot_122b_row.txt
--- a/scripts/boot_122b_row_dual.txt
+++ b/scripts/boot_122b_row_dual.txt
--- a/scripts/boot_122b_single.txt
+++ b/scripts/boot_122b_single.txt
--- a/scripts/boot_122b_single2.txt
+++ b/scripts/boot_122b_single2.txt
--- a/scripts/boot_122b_ts85.txt
+++ b/scripts/boot_122b_ts85.txt
--- a/scripts/boot_122b_tune.txt
+++ b/scripts/boot_122b_tune.txt
--- a/scripts/boot_122b_tuned.txt
+++ b/scripts/boot_122b_tuned.txt
--- a/scripts/boot_122b_v2.txt
+++ b/scripts/boot_122b_v2.txt
--- a/scripts/boot_log.txt
+++ b/scripts/boot_log.txt
--- a/scripts/boot_log2.txt
+++ b/scripts/boot_log2.txt
--- a/scripts/boot_log3.txt
+++ b/scripts/boot_log3.txt
--- a/scripts/boot_log4.txt
+++ b/scripts/boot_log4.txt
--- a/scripts/boot_log5.txt
+++ b/scripts/boot_log5.txt
--- a/scripts/boot_qwen_iq4.txt
+++ b/scripts/boot_qwen_iq4.txt
--- a/scripts/check_help.bat
+++ b/scripts/check_help.bat
@@ -0,0 +1,3 @@
+@echo off
+rem Save the llama-server --help lines that mention any of the listed keywords
+rem (case-insensitive) to scripts\help_gpu_flags.txt. "2>&1" merges stderr
+rem into the pipe so help text printed there is filtered as well.
+.\llama_bin_run\llama-server.exe --help 2>&1 | findstr /i "split tensor device main-gpu cpu-moe n-cpu-moe" > scripts\help_gpu_flags.txt
+echo Done.
--- a/scripts/download_llama.py
+++ b/scripts/download_llama.py
@@ -0,0 +1,38 @@
+# Download a Windows CUDA x64 asset of the latest llama.cpp release and extract
+# it into ./llama_bin.
+import urllib.request
+import json
+import zipfile
+import os
+import shutil
+import ssl
+
+# TLS certificate verification stays ON by default: this script downloads
+# executables, so an unverified connection would let a man-in-the-middle swap
+# the binaries. Set LLAMA_DL_INSECURE=1 only when a TLS-intercepting proxy
+# makes verification impossible.
+ctx = ssl.create_default_context()
+if os.environ.get("LLAMA_DL_INSECURE") == "1":
+    print("WARNING: TLS certificate verification is disabled (LLAMA_DL_INSECURE=1)")
+    ctx.check_hostname = False
+    ctx.verify_mode = ssl.CERT_NONE
+
+url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
+req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
+try:
+    with urllib.request.urlopen(req, context=ctx) as response:
+        data = json.loads(response.read().decode())
+
+    download_url = None
+    for asset in data['assets']:
+        # The first matching asset wins.
+        # NOTE(review): the broad "bin-win-cu" fallback may also match
+        # CUDA-runtime-only archives; left unchanged because
+        # download_true_llama.py copies files from llama_bin - confirm intent.
+        if "bin-win-cuda-cu12.2.0-x64.zip" in asset['name'] or ("bin-win-cu" in asset['name'] and asset['name'].endswith(".zip") and "x64" in asset['name']):
+            download_url = asset['browser_download_url']
+            break
+
+    if download_url:
+        print(f"Downloading {download_url}...")
+        zip_path = "llama.zip"
+        # Stream to disk instead of buffering the whole archive in memory.
+        with urllib.request.urlopen(download_url, context=ctx) as resp, open(zip_path, 'wb') as out_file:
+            shutil.copyfileobj(resp, out_file)
+        print("Extracting to 'llama_bin'...")
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall("llama_bin")
+        print("Done extracting.")
+        os.remove(zip_path)
+    else:
+        print("Could not find the target zip. Available assets:")
+        for asset in data['assets']:
+            print(" -", asset['name'])
+except Exception as e:
+    print(f"Error: {e}")
--- a/scripts/download_models.py
+++ b/scripts/download_models.py
@@ -0,0 +1,33 @@
+# Download the GGUF model files listed below into ./models via huggingface_hub.
+import importlib.util
+import os
+
+# huggingface_hub reads HF_HUB_ENABLE_HF_TRANSFER when it is imported, so the
+# variable must be set BEFORE the import below or it has no effect. It is only
+# enabled when the optional hf_transfer package is installed, because enabling
+# it without the package makes downloads fail.
+if importlib.util.find_spec("hf_transfer") is not None:
+    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
+from huggingface_hub import hf_hub_download
+
+models = [
+    # Start with the smallest download, Gemma4 26B
+    ("ggml-org/gemma-4-26B-A4B-it-GGUF", "gemma-4-26B-A4B-it-Q4_K_M.gguf"),
+    # Then Qwen 35B
+    ("unsloth/Qwen3.5-35B-A3B-GGUF", "Qwen3.5-35B-A3B-Q4_K_M.gguf"),
+    # Finally the 122B model (split into three parts)
+    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"),
+    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00002-of-00003.gguf"),
+    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00003-of-00003.gguf")
+]
+
+print("=== 고속 다운로더 시작 (huggingface_hub & hf_transfer) ===")
+os.makedirs("models", exist_ok=True)
+
+# Files are fetched one at a time; a failure is reported and the loop moves on
+# to the next file.
+for repo, filename in models:
+    print(f"\n>>> 다운로드 중 (백그라운드 진행): [{repo}] 의 [{filename}]...")
+    try:
+        path = hf_hub_download(
+            repo_id=repo, 
+            filename=filename, 
+            local_dir="./models", 
+            local_dir_use_symlinks=False
+        )
+        print(f"완료: {path}")
+    except Exception as e:
+        print(f"다운로드 실패: {e}")
+
+print("\n모든 다운로드 프로세스가 종료되었습니다.")
--- a/scripts/download_true_llama.py
+++ b/scripts/download_true_llama.py
@@ -0,0 +1,56 @@
+# Download the llama.cpp Windows CUDA 12.4 x64 binaries (not the cudart
+# archive), flatten them into ./llama_bin_run, and copy over any files from
+# ./llama_bin that are not already present there.
+import urllib.request
+import json
+import zipfile
+import os
+import ssl
+import shutil
+
+# TLS certificate verification stays ON by default: this script downloads
+# executables, so an unverified connection would let a man-in-the-middle swap
+# the binaries. Set LLAMA_DL_INSECURE=1 only when a TLS-intercepting proxy
+# makes verification impossible.
+ctx = ssl.create_default_context()
+if os.environ.get("LLAMA_DL_INSECURE") == "1":
+    print("WARNING: TLS certificate verification is disabled (LLAMA_DL_INSECURE=1)")
+    ctx.check_hostname = False
+    ctx.verify_mode = ssl.CERT_NONE
+
+url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
+req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
+try:
+    with urllib.request.urlopen(req, context=ctx) as response:
+        data = json.loads(response.read().decode())
+
+    download_url = None
+    for asset in data['assets']:
+        if "bin-win-cuda-12.4-x64.zip" in asset['name'] and "cudart" not in asset['name']:
+            download_url = asset['browser_download_url']
+            break
+
+    if download_url:
+        print(f"Downloading true binaries: {download_url}...")
+        zip_path = "llama_main.zip"
+        # Stream to disk instead of buffering the whole archive in memory.
+        with urllib.request.urlopen(download_url, context=ctx) as resp, open(zip_path, 'wb') as out_file:
+            shutil.copyfileobj(resp, out_file)
+
+        print("Extracting to temporary folder 'llama_temp'...")
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall("llama_temp")
+
+        print("Moving exact files to 'llama_bin_run'...")
+        os.makedirs("llama_bin_run", exist_ok=True)
+        # Flatten: every file, whatever its sub-directory, lands directly in
+        # llama_bin_run.
+        for root, dirs, files in os.walk("llama_temp"):
+            for file in files:
+                shutil.move(os.path.join(root, file), os.path.join("llama_bin_run", file))
+
+        # Fill in files that only exist in the earlier llama_bin download.
+        if os.path.exists("llama_bin"):
+            for item in os.listdir("llama_bin"):
+                src = os.path.join("llama_bin", item)
+                dst = os.path.join("llama_bin_run", item)
+                if not os.path.exists(dst):
+                    try:
+                        shutil.copy(src, dst)
+                    except OSError as e:
+                        # Best-effort: e.g. sub-directories, which shutil.copy
+                        # cannot copy. Report and continue.
+                        print(f"Skipped {src}: {e}")
+
+        os.remove(zip_path)
+        shutil.rmtree("llama_temp", ignore_errors=True)
+        print("Download and path extraction fully complete.")
+    else:
+        print("Could not find the target zip.")
+except Exception as e:
+    print(f"Error: {e}")
--- a/scripts/dual_gpu_benchmark.mjs
+++ b/scripts/dual_gpu_benchmark.mjs
@@ -0,0 +1,531 @@
+/**
+ * Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
+ * =====================================================================
+ * Tests 4 models across multiple parameter configurations to find
+ * the absolute best model + settings for 256K context coding agent.
+ *
+ * Models:
+ *   1. Qwen3.5-35B-A3B  Q4_K_M     (~20.5 GB)
+ *   2. Qwen3.5-35B-A3B  MXFP4_MOE  (~20.1 GB)
+ *   3. Gemma4  26B-A4B   Q4_K_M     (~15.6 GB)
+ *   4. Gemma4  26B-A4B   MXFP4_MOE  (~15.5 GB)
+ *
+ * Run: node scripts/dual_gpu_benchmark.mjs
+ */
+
+import { spawn, execSync } from "child_process";
+import { writeFileSync, statSync, existsSync } from "fs";
+import { resolve } from "path";
+
+// ─── Configuration ─────────────────────────────────────────────
+const BASE_URL = "http://127.0.0.1:8000";
+const LLAMA_SERVER = String.raw`llama_bin_run\llama-server.exe`;
+const CONTEXT = 262144; // 256K
+const BENCHMARK_RUNS = 3;
+const BENCHMARK_TOKENS = 200;
+const SERVER_TIMEOUT = 300_000; // ms
+
+const MODELS = [
+  {
+    name: "Qwen3.5-35B-A3B Q4_K_M",
+    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
+    type: "qwen", quant: "Q4_K_M", totalLayers: 64,
+  },
+  {
+    name: "Qwen3.5-35B-A3B MXFP4_MOE",
+    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
+    type: "qwen", quant: "MXFP4_MOE", totalLayers: 64,
+  },
+  {
+    name: "Gemma4 26B-A4B Q4_K_M",
+    path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
+    type: "gemma4", quant: "Q4_K_M", totalLayers: 30,
+  },
+  {
+    name: "Gemma4 26B-A4B MXFP4_MOE",
+    path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
+    type: "gemma4", quant: "MXFP4_MOE", totalLayers: 30,
+  },
+];
+
+const ALL_RESULTS = [];
+
+// ─── Utility ───────────────────────────────────────────────────
+
+function log(msg) {
+  const ts = new Date().toLocaleTimeString("ko-KR", { hour12: false });
+  console.log(`[${ts}] ${msg}`);
+}
+
+function sleep(ms) {
+  return new Promise((r) => setTimeout(r, ms));
+}
+
+function killServer() {
+  try {
+    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
+  } catch {}
+  return sleep(5000);
+}
+
+function getVramAll() {
+  try {
+    const out = execSync(
+      'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits',
+      { encoding: "utf-8", timeout: 5000 }
+    );
+    return out.trim().split("\n").map((line) => {
+      const [gpu, used, total] = line.split(",").map((s) => parseInt(s.trim()));
+      return { gpu, used, total };
+    });
+  } catch {
+    return [];
+  }
+}
+
+function buildCmd(modelPath, params) {
+  const {
+    ngl, t, ub, b, ctk, ctv,
+    cpuMoe = false, nCpuMoe = 0,
+    prio = 3, nommap = false
+  } = params;
+
+  const cmd = [
+    LLAMA_SERVER,
+    "--model", modelPath,
+    "-ngl", String(ngl),
+    "-c", String(CONTEXT),
+    "-np", "1",
+    "-fa", "on",
+    "--cache-type-k", ctk,
+    "--cache-type-v", ctv,
+    "-ub", String(ub),
+    "-b", String(b),
+    "-t", String(t),
+    "-tb", String(t),
+    "--prio", String(prio),
+    "--poll", "50",
+    "--mlock",
+    "--port", "8000",
+    "--host", "0.0.0.0",
+  ];
+
+  if (cpuMoe) cmd.push("--cpu-moe");
+  else if (nCpuMoe > 0) cmd.push("--n-cpu-moe", String(nCpuMoe));
+  if (nommap) cmd.push("--no-mmap");
+
+  return cmd;
+}
+
+function startServer(modelPath, params) {
+  const args = buildCmd(modelPath, params);
+  const exe = args.shift();
+  log(`  CMD: ${exe} ${args.slice(-12).join(" ")} ...`);
+  return spawn(exe, args, {
+    cwd: process.cwd(),
+    stdio: ["ignore", "pipe", "pipe"],
+  });
+}
+
+async function waitForServer(timeoutMs = SERVER_TIMEOUT) {
+  const start = Date.now();
+  while (Date.now() - start < timeoutMs) {
+    try {
+      const resp = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
+      const data = await resp.json();
+      if (data.status === "ok") return { ok: true, bootTime: (Date.now() - start) / 1000 };
+    } catch {}
+    await sleep(3000);
+  }
+  return { ok: false, bootTime: timeoutMs / 1000 };
+}
+
+async function runBenchmark(maxTokens = BENCHMARK_TOKENS) {
+  const payload = JSON.stringify({
+    model: "local-model",
+    messages: [{ role: "user", content: "Count from 1 to 50, writing each number on a new line." }],
+    max_tokens: maxTokens,
+    temperature: 0.0,
+  });
+
+  const start = Date.now();
+  const resp = await fetch(`${BASE_URL}/v1/chat/completions`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: payload,
+    signal: AbortSignal.timeout(600_000),
+  });
+  const result = await resp.json();
+  const elapsed = (Date.now() - start) / 1000;
+
+  const usage = result.usage || {};
+  const ct = usage.completion_tokens || 0;
+  return {
+    tps: elapsed > 0 ? ct / elapsed : 0,
+    completionTokens: ct,
+    promptTokens: usage.prompt_tokens || 0,
+    elapsed,
+  };
+}
+
+async function testConfig(model, label, params) {
+  await killServer();
+  log(`  [${label}] Starting server...`);
+
+  const proc = startServer(model.path, params);
+  const { ok, bootTime } = await waitForServer();
+
+  if (!ok) {
+    log(`  [${label}] FAILED to start (timeout ${SERVER_TIMEOUT / 1000}s)`);
+    proc.kill("SIGKILL");
+    return null;
+  }
+
+  const vram = getVramAll();
+  const vramStr = vram.map((g) => `GPU${g.gpu}:${g.used}/${g.total}MiB`).join(" | ");
+  log(`  [${label}] Boot: ${bootTime.toFixed(0)}s | VRAM: ${vramStr}`);
+
+  // Warmup
+  try { await runBenchmark(20); } catch {}
+
+  // Benchmark
+  const speeds = [];
+  for (let i = 0; i < BENCHMARK_RUNS; i++) {
+    try {
+      const r = await runBenchmark();
+      speeds.push(r.tps);
+      log(`    Run ${i + 1}: ${r.tps.toFixed(2)} t/s`);
+    } catch (e) {
+      log(`    Run ${i + 1}: ERROR (${e.message})`);
+    }
+  }
+
+  proc.kill("SIGKILL");
+
+  if (speeds.length === 0) {
+    log(`  [${label}] ALL BENCHMARK RUNS FAILED`);
+    return null;
+  }
+
+  const avg = speeds.reduce((a, b) => a + b) / speeds.length;
+  const best = Math.max(...speeds);
+  log(`  [${label}] => AVG: ${avg.toFixed(2)} t/s | BEST: ${best.toFixed(2)} t/s`);
+
+  const result = {
+    model: model.name, quant: model.quant, label,
+    avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2),
+    boot_time: +bootTime.toFixed(1), vram, params,
+  };
+  ALL_RESULTS.push(result);
+  return result;
+}
+
+// ─── Phase Runners ─────────────────────────────────────────────
+
+async function phase0_bootTest(model) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 0: Boot Test — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  // Try full GPU first
+  let r = await testConfig(model, "boot-ngl999", {
+    ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0",
+  });
+  if (r) return r;
+
+  // Try with cpu-moe
+  log("  Full GPU failed, trying with --cpu-moe...");
+  r = await testConfig(model, "boot-cpumoe", {
+    ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe: true,
+  });
+  if (r) return r;
+
+  // Reduced layers
+  log("  --cpu-moe also failed, trying reduced layers...");
+  r = await testConfig(model, "boot-ngl-half", {
+    ngl: Math.floor(model.totalLayers / 2), t: 6, ub: 512, b: 2048,
+    ctk: "q4_0", ctv: "q4_0",
+  });
+  return r;
+}
+
+async function phase1_gpuOffload(model, baseline) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 1: GPU Offload Strategy — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  const results = baseline ? [baseline] : [];
+
+  // Test --cpu-moe on/off
+  for (const cpuMoe of [true, false]) {
+    const lbl = `ngl=999 cpuMoe=${cpuMoe}`;
+    if (baseline?.params?.cpuMoe === cpuMoe && baseline.params.ngl === 999) continue;
+    const r = await testConfig(model, lbl, {
+      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe,
+    });
+    if (r) results.push(r);
+  }
+
+  // n-cpu-moe sweep
+  for (const n of [0, 5, 10, 15, 20]) {
+    if (n > model.totalLayers) continue;
+    const r = await testConfig(model, `n-cpu-moe=${n}`, {
+      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", nCpuMoe: n,
+    });
+    if (r) results.push(r);
+  }
+
+  if (results.length === 0) { log("  PHASE 1: No config worked!"); return null; }
+  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
+  log(`\n  ★ Phase 1 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
+  return best;
+}
+
+async function phase2_threads(model, prev) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 2: CPU Thread Sweep — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  const p = prev.params;
+  const results = [prev];
+
+  for (const t of [2, 4, 6, 8, 10, 12]) {
+    if (t === p.t) continue;
+    const r = await testConfig(model, `t=${t}`, {
+      ...p, t,
+    });
+    if (r) results.push(r);
+  }
+
+  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
+  log(`\n  ★ Phase 2 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
+  return best;
+}
+
+async function phase3_batch(model, prev) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 3: Batch Size Sweep — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  const p = prev.params;
+  const results = [prev];
+
+  for (const [ub, b] of [
+    [128, 512], [256, 1024], [256, 2048],
+    [512, 1024], [512, 2048], [512, 4096],
+    [1024, 2048], [1024, 4096],
+  ]) {
+    if (ub === p.ub && b === p.b) continue;
+    const r = await testConfig(model, `ub=${ub} b=${b}`, { ...p, ub, b });
+    if (r) results.push(r);
+  }
+
+  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
+  log(`\n  ★ Phase 3 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
+  return best;
+}
+
+async function phase4_kvcache(model, prev) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 4: KV Cache Type Sweep — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  const p = prev.params;
+  const results = [prev];
+
+  for (const [ctk, ctv] of [
+    ["q4_0", "q4_0"], ["q8_0", "q8_0"],
+    ["q4_0", "q8_0"], ["f16", "f16"],
+  ]) {
+    if (ctk === p.ctk && ctv === p.ctv) continue;
+    const r = await testConfig(model, `kv=${ctk}/${ctv}`, { ...p, ctk, ctv });
+    if (r) results.push(r);
+  }
+
+  const best = results.reduce((a, b) => a.avg_tps > b.avg_tps ? a : b);
+  log(`\n  ★ Phase 4 winner: ${best.label} → ${best.avg_tps.toFixed(2)} t/s`);
+  return best;
+}
+
+async function phase5_final(model, prev) {
+  log(`\n${"=".repeat(70)}`);
+  log(`  PHASE 5: Final Verification (5 runs) — ${model.name}`);
+  log(`${"=".repeat(70)}`);
+
+  await killServer();
+  const proc = startServer(model.path, prev.params);
+  const { ok, bootTime } = await waitForServer();
+  if (!ok) { log("  FAILED to start!"); proc.kill("SIGKILL"); return prev; }
+
+  const vram = getVramAll();
+  try { await runBenchmark(20); } catch {}
+
+  const speeds = [];
+  for (let i = 0; i < 5; i++) {
+    try {
+      const r = await runBenchmark();
+      speeds.push(r.tps);
+      log(`    Final Run ${i + 1}: ${r.tps.toFixed(2)} t/s`);
+    } catch (e) {
+      log(`    Final Run ${i + 1}: ERROR (${e.message})`);
+    }
+  }
+  proc.kill("SIGKILL");
+
+  if (speeds.length > 0) {
+    const avg = speeds.reduce((a, b) => a + b) / speeds.length;
+    const best = Math.max(...speeds);
+    log(`\n  ★ FINAL: AVG ${avg.toFixed(2)} t/s | BEST ${best.toFixed(2)} t/s`);
+
+    const final_ = {
+      model: model.name, quant: model.quant,
+      label: `FINAL-${model.name}`,
+      avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2),
+      boot_time: +bootTime.toFixed(1), vram, params: prev.params,
+    };
+    ALL_RESULTS.push(final_);
+    return final_;
+  }
+  return prev;
+}
+
+// ─── Main ──────────────────────────────────────────────────────
+
+async function runModelBenchmark(model) {
+  log(`\n${"#".repeat(70)}`);
+  log(`  MODEL: ${model.name}`);
+  log(`  File:  ${model.path}`);
+  try {
+    const sz = statSync(model.path).size / 1024 ** 3;
+    log(`  Size:  ${sz.toFixed(2)} GB`);
+  } catch { log(`  Size:  unknown`); }
+  log(`${"#".repeat(70)}`);
+
+  if (!existsSync(model.path)) {
+    log(`  SKIP: Model file not found!`);
+    return null;
+  }
+
+  const baseline = await phase0_bootTest(model);
+  if (!baseline) { log(`  SKIP: Cannot boot at 256K!`); return null; }
+
+  let best = await phase1_gpuOffload(model, baseline);
+  if (!best) return baseline;
+
+  best = await phase2_threads(model, best);
+  best = await phase3_batch(model, best);
+  best = await phase4_kvcache(model, best);
+  best = await phase5_final(model, best);
+
+  return best;
+}
+
+async function main() {
+  const startTime = Date.now();
+
+  log("=".repeat(70));
+  log("  DUAL-GPU COMPREHENSIVE MODEL BENCHMARK");
+  log("  2x RTX 3060 (24GB Total) | 256K Context");
+  log(`  Models: ${MODELS.length}`);
+  log(`  Started: ${new Date().toISOString()}`);
+  log("=".repeat(70));
+
+  const gpus = getVramAll();
+  gpus.forEach((g) => log(`  GPU ${g.gpu}: ${g.used}/${g.total} MiB used`));
+
+  const winners = [];
+
+  for (let i = 0; i < MODELS.length; i++) {
+    log(`\n${"=".repeat(70)}`);
+    log(`  STARTING MODEL ${i + 1}/${MODELS.length}: ${MODELS[i].name}`);
+    log(`${"=".repeat(70)}`);
+
+    const winner = await runModelBenchmark(MODELS[i]);
+    if (winner) winners.push(winner);
+
+    // Save intermediate
+    writeFileSync("scripts/dual_gpu_results.json",
+      JSON.stringify(ALL_RESULTS, null, 2));
+    log(`  Intermediate saved (${ALL_RESULTS.length} configs tested)`);
+  }
+
+  // ─── Grand Final ───────────────────────────────────────────
+  const elapsed = (Date.now() - startTime) / 60000;
+
+  log(`\n${"=".repeat(70)}`);
+  log(`  GRAND FINAL COMPARISON`);
+  log(`  Total time: ${elapsed.toFixed(1)} minutes`);
+  log(`  Configs tested: ${ALL_RESULTS.length}`);
+  log(`${"=".repeat(70)}`);
+
+  if (winners.length === 0) {
+    log("  No models ran at 256K!");
+    return;
+  }
+
+  winners.sort((a, b) => b.avg_tps - a.avg_tps);
+  const medals = ["🥇", "🥈", "🥉", "  "];
+
+  const lines = [
+    `Dual-GPU Benchmark Results — ${new Date().toISOString()}`,
+    `Hardware: 2x RTX 3060 12GB | Context: 256K`,
+    `Configs tested: ${ALL_RESULTS.length} | Time: ${elapsed.toFixed(1)} min`,
+    "", "=".repeat(60), "  RANKING (by AVG t/s)", "=".repeat(60),
+  ];
+
+  for (let i = 0; i < winners.length; i++) {
+    const w = winners[i];
+    const p = w.params;
+    lines.push("");
+    lines.push(`  ${medals[i] || "  "} #${i + 1}: ${w.model}`);
+    lines.push(`      AVG: ${w.avg_tps.toFixed(2)} t/s | BEST: ${w.best_tps.toFixed(2)} t/s`);
+    lines.push(`      Boot: ${w.boot_time.toFixed(0)}s`);
+    lines.push(`      ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b}`);
+    lines.push(`      ctk=${p.ctk} ctv=${p.ctv}`);
+    if (p.cpuMoe) lines.push(`      --cpu-moe`);
+    else if ((p.nCpuMoe || 0) > 0) lines.push(`      --n-cpu-moe ${p.nCpuMoe}`);
+  }
+
+  const champ = winners[0];
+  const cp = champ.params;
+  lines.push("", "=".repeat(60));
+  lines.push(`  ★ CHAMPION: ${champ.model}`);
+  lines.push(`    ${champ.avg_tps.toFixed(2)} t/s average`);
+  lines.push("=".repeat(60));
+
+  // Build recommended command
+  const cmdParts = [
+    `llama-server --model ${MODELS.find((m) => m.name === champ.model).path}`,
+    `-ngl ${cp.ngl} -c ${CONTEXT}`,
+    `-t ${cp.t} -tb ${cp.t}`,
+    `-ub ${cp.ub} -b ${cp.b}`,
+    `-fa on`,
+    `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
+    `--prio ${cp.prio || 3} --poll 50`,
+    `--mlock`,
+  ];
+  if (cp.cpuMoe) cmdParts.push("--cpu-moe");
+  else if ((cp.nCpuMoe || 0) > 0) cmdParts.push(`--n-cpu-moe ${cp.nCpuMoe}`);
+  if (cp.nommap) cmdParts.push("--no-mmap");
+  cmdParts.push("--port 8000 --host 0.0.0.0");
+
+  lines.push("", "  Recommended command:");
+  lines.push(`    ${cmdParts.join(" ")}`);
+
+  const summary = lines.join("\n");
+  console.log(summary);
+  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
+  writeFileSync("scripts/dual_gpu_results.json",
+    JSON.stringify(ALL_RESULTS, null, 2));
+
+  log(`\n  Results: scripts/dual_gpu_results.json`);
+  log(`  Summary: scripts/dual_gpu_summary.txt`);
+  log(`  DONE!`);
+
+  await killServer();
+}
+
+main().catch((e) => {
+  console.error("Fatal error:", e);
+  process.exit(1);
+});
--- a/scripts/dual_gpu_benchmark.py
+++ b/scripts/dual_gpu_benchmark.py
@@ -0,0 +1,644 @@
+"""
+Dual-GPU (2x RTX 3060 12GB, 24GB total) Comprehensive Model Benchmark
+=====================================================================
+Tests 4 models across multiple parameter configurations to find
+the absolute best model + settings for 256K context coding agent.
+
+Models:
+  1. Qwen3.5-35B-A3B  Q4_K_M     (~20.5 GB)
+  2. Qwen3.5-35B-A3B  MXFP4_MOE  (~20.1 GB)
+  3. Gemma4  26B-A4B   Q4_K_M     (~15.6 GB)
+  4. Gemma4  26B-A4B   MXFP4_MOE  (~15.5 GB)
+
+Test Phases (per model):
+  Phase 0: Basic dual-GPU startup test (can it even boot at 256K?)
+  Phase 1: GPU layer + MoE offload strategy sweep
+  Phase 2: CPU thread sweep (carry best from P1)
+  Phase 3: Batch size sweep (carry best from P1+P2)
+  Phase 4: KV cache type sweep (carry best from P1+P2+P3)
+  Phase 5: Final verification (5 runs)
+
+Output: scripts/dual_gpu_results.json  (all raw data)
+        scripts/dual_gpu_summary.txt   (human-readable winner)
+"""
+import subprocess
+import time
+import json
+import urllib.request
+import sys
+import os
+import datetime
+
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except Exception:
+    pass
+
+# ─── Configuration ───────────────────────────────────────────────
+BASE_URL = "http://127.0.0.1:8000"
+LLAMA_SERVER = r"llama_bin_run\llama-server.exe"
+CONTEXT = 262144  # 256K
+BENCHMARK_RUNS = 3
+BENCHMARK_TOKENS = 200
+SERVER_TIMEOUT = 300  # seconds to wait for server startup
+
+MODELS = [
+    {
+        "name": "Qwen3.5-35B-A3B Q4_K_M",
+        "path": r"models\Qwen3.5-35B-A3B-Q4_K_M.gguf",
+        "type": "qwen",
+        "quant": "Q4_K_M",
+        "is_mxfp4": False,
+        "total_layers": 64,  # Qwen3.5 35B has 64 layers
+    },
+    {
+        "name": "Qwen3.5-35B-A3B MXFP4_MOE",
+        "path": r"models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf",
+        "type": "qwen",
+        "quant": "MXFP4_MOE",
+        "is_mxfp4": True,
+        "total_layers": 64,
+    },
+    {
+        "name": "Gemma4 26B-A4B Q4_K_M",
+        "path": r"models\gemma-4-26B-A4B-it-Q4_K_M.gguf",
+        "type": "gemma4",
+        "quant": "Q4_K_M",
+        "is_mxfp4": False,
+        "total_layers": 30,  # Gemma4 26B has 30 layers
+    },
+    {
+        "name": "Gemma4 26B-A4B MXFP4_MOE",
+        "path": r"models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf",
+        "type": "gemma4",
+        "quant": "MXFP4_MOE",
+        "is_mxfp4": True,
+        "total_layers": 30,
+    },
+]
+
+ALL_RESULTS = []
+
+
+# ─── Utility Functions ──────────────────────────────────────────
+def log(msg):
+    ts = datetime.datetime.now().strftime("%H:%M:%S")
+    print(f"[{ts}] {msg}", flush=True)
+
+
+def kill_server():
+    subprocess.run(["taskkill", "/F", "/IM", "llama-server.exe"],
+                   capture_output=True)
+    time.sleep(5)
+
+
+def get_vram_all():
+    """Returns list of (used, total) tuples for each GPU."""
+    try:
+        r = subprocess.run(
+            ["nvidia-smi", "--query-gpu=index,memory.used,memory.total",
+             "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=5
+        )
+        gpus = []
+        for line in r.stdout.strip().split("\n"):
+            parts = [p.strip() for p in line.split(",")]
+            if len(parts) >= 3:
+                gpus.append({
+                    "gpu": int(parts[0]),
+                    "used": int(parts[1]),
+                    "total": int(parts[2]),
+                })
+        return gpus
+    except Exception:
+        return []
+
+
+def build_cmd(model_path, ngl, t, ub, b, ctk, ctv,
+              cpu_moe=False, n_cpu_moe=0, prio=3, nommap=False):
+    """Build llama-server command for dual-GPU."""
+    cmd = [
+        LLAMA_SERVER,
+        "--model", model_path,
+        "-ngl", str(ngl),
+        "-c", str(CONTEXT),
+        "-np", "1",
+        "-fa", "on",
+        "--cache-type-k", ctk,
+        "--cache-type-v", ctv,
+        "-ub", str(ub),
+        "-b", str(b),
+        "-t", str(t),
+        "-tb", str(t),
+        "--prio", str(prio),
+        "--poll", "50",
+        "--mlock",
+        "--port", "8000",
+        "--host", "0.0.0.0",
+    ]
+    # MoE offloading options
+    if cpu_moe:
+        cmd.append("--cpu-moe")
+    elif n_cpu_moe > 0:
+        cmd.extend(["--n-cpu-moe", str(n_cpu_moe)])
+    if nommap:
+        cmd.append("--no-mmap")
+    return cmd
+
+
+def start_server(model_path, **kwargs):
+    cmd = build_cmd(model_path, **kwargs)
+    log(f"  CMD: {' '.join(cmd[-20:])}")  # show last 20 args
+    proc = subprocess.Popen(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+        cwd=os.getcwd(), text=True, encoding='utf-8', errors='replace'
+    )
+    return proc
+
+
+def wait_for_server(timeout=SERVER_TIMEOUT):
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            req = urllib.request.Request(f"{BASE_URL}/health")
+            with urllib.request.urlopen(req, timeout=3) as resp:
+                data = json.loads(resp.read())
+                if data.get("status") == "ok":
+                    boot_time = time.time() - start
+                    return True, boot_time
+        except Exception:
+            pass
+        time.sleep(3)
+    return False, timeout
+
+
+def run_benchmark(max_tokens=BENCHMARK_TOKENS):
+    payload = json.dumps({
+        "model": "local-model",
+        "messages": [{"role": "user",
+                      "content": "Count from 1 to 50, writing each number on a new line."}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0,
+    }).encode("utf-8")
+
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"},
+    )
+
+    start = time.time()
+    with urllib.request.urlopen(req, timeout=600) as resp:
+        result = json.loads(resp.read())
+    elapsed = time.time() - start
+
+    usage = result.get("usage", {})
+    ct = usage.get("completion_tokens", 0)
+    pt = usage.get("prompt_tokens", 0)
+    return {
+        "tps": ct / elapsed if elapsed > 0 else 0,
+        "completion_tokens": ct,
+        "prompt_tokens": pt,
+        "elapsed": elapsed,
+    }
+
+
+def test_config(model_info, label, **kwargs):
+    """Test a single configuration. Returns result dict or None."""
+    kill_server()
+    log(f"  [{label}] Starting server...")
+
+    proc = start_server(model_info["path"], **kwargs)
+    ok, boot_time = wait_for_server()
+
+    if not ok:
+        log(f"  [{label}] FAILED to start (timeout {SERVER_TIMEOUT}s)")
+        proc.kill()
+        return None
+
+    vram = get_vram_all()
+    vram_str = " | ".join(f"GPU{g['gpu']}:{g['used']}/{g['total']}MiB" for g in vram)
+    log(f"  [{label}] Boot: {boot_time:.0f}s | VRAM: {vram_str}")
+
+    # Warmup
+    try:
+        run_benchmark(max_tokens=20)
+    except Exception:
+        pass
+
+    # Benchmark runs
+    speeds = []
+    for i in range(BENCHMARK_RUNS):
+        try:
+            r = run_benchmark()
+            speeds.append(r["tps"])
+            log(f"    Run {i+1}: {r['tps']:.2f} t/s")
+        except Exception as e:
+            log(f"    Run {i+1}: ERROR ({e})")
+
+    proc.kill()
+
+    if not speeds:
+        log(f"  [{label}] ALL BENCHMARK RUNS FAILED")
+        return None
+
+    avg = sum(speeds) / len(speeds)
+    best = max(speeds)
+    log(f"  [{label}] => AVG: {avg:.2f} t/s | BEST: {best:.2f} t/s")
+
+    result = {
+        "model": model_info["name"],
+        "quant": model_info["quant"],
+        "label": label,
+        "avg_tps": round(avg, 2),
+        "best_tps": round(best, 2),
+        "boot_time": round(boot_time, 1),
+        "vram": vram,
+        "params": kwargs,
+    }
+    ALL_RESULTS.append(result)
+    return result
+
+
+# ─── Phase Runners ───────────────────────────────────────────────
+
+def phase0_boot_test(model):
+    """Quick test: can the model even boot with 256K on dual GPU?"""
+    log(f"\n{'='*70}")
+    log(f"  PHASE 0: Boot Test — {model['name']}")
+    log(f"{'='*70}")
+
+    # Try -ngl 999 (all layers to GPU) as baseline
+    r = test_config(
+        model, f"boot-ngl999",
+        ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
+    )
+    if r:
+        return r
+
+    # If full GPU fails, try with cpu-moe
+    log("  Full GPU failed, trying with --cpu-moe...")
+    r = test_config(
+        model, f"boot-cpumoe",
+        ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
+        cpu_moe=True,
+    )
+    if r:
+        return r
+
+    # Extreme fallback: fewer layers
+    log("  --cpu-moe also failed, trying reduced layers...")
+    r = test_config(
+        model, f"boot-ngl-half",
+        ngl=model["total_layers"] // 2, t=6, ub=512, b=2048,
+        ctk="q4_0", ctv="q4_0",
+    )
+    return r
+
+
+def phase1_gpu_offload(model, baseline):
+    """Find optimal GPU layer count and MoE offload strategy."""
+    log(f"\n{'='*70}")
+    log(f"  PHASE 1: GPU Offload Strategy — {model['name']}")
+    log(f"{'='*70}")
+
+    results = []
+    if baseline:
+        results.append(baseline)
+
+    total = model["total_layers"]
+
+    # Strategy A: All GPU + cpu-moe variations
+    for cpu_moe in [True, False]:
+        label = f"ngl=999 cpu_moe={cpu_moe}"
+        # Skip if already tested in baseline
+        if baseline and baseline["label"] in [f"boot-ngl999", f"boot-cpumoe"] and \
+           baseline["params"].get("cpu_moe", False) == cpu_moe:
+            continue
+        r = test_config(
+            model, label,
+            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
+            cpu_moe=cpu_moe,
+        )
+        if r:
+            results.append(r)
+
+    # Strategy B: n-cpu-moe sweep (selective expert offload)
+    for n in [0, 5, 10, 15, 20]:
+        if n > total:
+            continue
+        r = test_config(
+            model, f"n-cpu-moe={n}",
+            ngl=999, t=6, ub=512, b=2048, ctk="q4_0", ctv="q4_0",
+            n_cpu_moe=n,
+        )
+        if r:
+            results.append(r)
+
+    if not results:
+        log("  PHASE 1: No configuration worked!")
+        return None
+
+    best = max(results, key=lambda x: x["avg_tps"])
+    log(f"\n  ★ Phase 1 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
+    return best
+
+
+def phase2_threads(model, prev_best):
+    """Sweep CPU threads with best GPU config locked."""
+    log(f"\n{'='*70}")
+    log(f"  PHASE 2: CPU Thread Sweep — {model['name']}")
+    log(f"{'='*70}")
+
+    p = prev_best["params"]
+    results = [prev_best]
+
+    for t in [2, 4, 6, 8, 10, 12]:
+        if t == p.get("t", 6):
+            continue
+        r = test_config(
+            model, f"t={t}",
+            ngl=p["ngl"], t=t, ub=p["ub"], b=p["b"],
+            ctk=p["ctk"], ctv=p["ctv"],
+            cpu_moe=p.get("cpu_moe", False),
+            n_cpu_moe=p.get("n_cpu_moe", 0),
+        )
+        if r:
+            results.append(r)
+
+    best = max(results, key=lambda x: x["avg_tps"])
+    log(f"\n  ★ Phase 2 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
+    return best
+
+
+def phase3_batch(model, prev_best):
+    """Sweep batch sizes."""
+    log(f"\n{'='*70}")
+    log(f"  PHASE 3: Batch Size Sweep — {model['name']}")
+    log(f"{'='*70}")
+
+    p = prev_best["params"]
+    best_t = p["t"]
+    results = [prev_best]
+
+    for ub, b in [(128, 512), (256, 1024), (256, 2048),
+                  (512, 1024), (512, 2048), (512, 4096),
+                  (1024, 2048), (1024, 4096)]:
+        if ub == p["ub"] and b == p["b"]:
+            continue
+        r = test_config(
+            model, f"ub={ub} b={b}",
+            ngl=p["ngl"], t=best_t, ub=ub, b=b,
+            ctk=p["ctk"], ctv=p["ctv"],
+            cpu_moe=p.get("cpu_moe", False),
+            n_cpu_moe=p.get("n_cpu_moe", 0),
+        )
+        if r:
+            results.append(r)
+
+    best = max(results, key=lambda x: x["avg_tps"])
+    log(f"\n  ★ Phase 3 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
+    return best
+
+
+def phase4_kvcache(model, prev_best):
+    """Sweep KV cache precision."""
+    log(f"\n{'='*70}")
+    log(f"  PHASE 4: KV Cache Type Sweep — {model['name']}")
+    log(f"{'='*70}")
+
+    p = prev_best["params"]
+    results = [prev_best]
+
+    for ctk, ctv in [("q4_0", "q4_0"), ("q8_0", "q8_0"),
+                     ("q4_0", "q8_0"), ("f16", "f16")]:
+        if ctk == p["ctk"] and ctv == p["ctv"]:
+            continue
+        r = test_config(
+            model, f"kv={ctk}/{ctv}",
+            ngl=p["ngl"], t=p["t"], ub=p["ub"], b=p["b"],
+            ctk=ctk, ctv=ctv,
+            cpu_moe=p.get("cpu_moe", False),
+            n_cpu_moe=p.get("n_cpu_moe", 0),
+        )
+        if r:
+            results.append(r)
+
+    best = max(results, key=lambda x: x["avg_tps"])
+    log(f"\n  ★ Phase 4 winner: {best['label']} → {best['avg_tps']:.2f} t/s")
+    return best
+
+
+def phase5_final(model, prev_best):
+    """Final verification with 5 runs."""
+    log(f"\n{'='*70}")
+    log(f"  PHASE 5: Final Verification (5 runs) — {model['name']}")
+    log(f"{'='*70}")
+
+    p = prev_best["params"]
+    kill_server()
+    proc = start_server(model["path"], **p)
+    ok, boot_time = wait_for_server()
+    if not ok:
+        log("  FAILED to start for final verification!")
+        proc.kill()
+        return prev_best
+
+    vram = get_vram_all()
+
+    # Warmup
+    try:
+        run_benchmark(max_tokens=20)
+    except Exception:
+        pass
+
+    speeds = []
+    for i in range(5):
+        try:
+            r = run_benchmark()
+            speeds.append(r["tps"])
+            log(f"    Final Run {i+1}: {r['tps']:.2f} t/s")
+        except Exception as e:
+            log(f"    Final Run {i+1}: ERROR ({e})")
+
+    proc.kill()
+
+    if speeds:
+        avg = sum(speeds) / len(speeds)
+        best_tps = max(speeds)
+        log(f"\n  ★ FINAL: AVG {avg:.2f} t/s | BEST {best_tps:.2f} t/s")
+
+        final = {
+            "model": model["name"],
+            "quant": model["quant"],
+            "label": f"FINAL-{model['name']}",
+            "avg_tps": round(avg, 2),
+            "best_tps": round(best_tps, 2),
+            "boot_time": round(boot_time, 1),
+            "vram": vram,
+            "params": p,
+        }
+        ALL_RESULTS.append(final)
+        return final
+
+    return prev_best
+
+
+# ─── Main ────────────────────────────────────────────────────────
+
+def run_full_benchmark_for_model(model):
+    """Run all phases for a single model."""
+    log(f"\n{'#'*70}")
+    log(f"  MODEL: {model['name']}")
+    log(f"  File:  {model['path']}")
+    log(f"  Size:  {os.path.getsize(model['path'])/1024**3:.2f} GB")
+    log(f"{'#'*70}")
+
+    # Check model exists
+    if not os.path.exists(model["path"]):
+        log(f"  SKIP: Model file not found!")
+        return None
+
+    # Phase 0: Can it boot?
+    baseline = phase0_boot_test(model)
+    if not baseline:
+        log(f"  SKIP: {model['name']} cannot boot at 256K context!")
+        return None
+
+    # Phase 1: GPU offload strategy
+    best = phase1_gpu_offload(model, baseline)
+    if not best:
+        return baseline
+
+    # Phase 2: CPU threads
+    best = phase2_threads(model, best)
+
+    # Phase 3: Batch sizes
+    best = phase3_batch(model, best)
+
+    # Phase 4: KV cache
+    best = phase4_kvcache(model, best)
+
+    # Phase 5: Final verification
+    final = phase5_final(model, best)
+
+    return final
+
+
+def main():
+    """Benchmark every model in MODELS and write a ranked summary.
+
+    Everything accumulated in ALL_RESULTS is written to
+    scripts/dual_gpu_results.json after each model and once more at the end.
+    The ranking plus a recommended llama-server command line is printed and
+    written to scripts/dual_gpu_summary.txt.
+
+    Returns early (no summary file, no final kill_server() call) when no
+    model produced a result.
+    """
+    start_time = time.time()
+
+    log("=" * 70)
+    log("  DUAL-GPU COMPREHENSIVE MODEL BENCHMARK")
+    log("  2x RTX 3060 (24GB Total) | 256K Context")
+    log(f"  Models: {len(MODELS)}")
+    log(f"  Started: {datetime.datetime.now().isoformat()}")
+    log("=" * 70)
+
+    # Show GPU info
+    gpus = get_vram_all()
+    for g in gpus:
+        log(f"  GPU {g['gpu']}: {g['used']}/{g['total']} MiB used")
+
+    # Run benchmarks for each model
+    model_winners = []
+    for i, model in enumerate(MODELS):
+        log(f"\n{'='*70}")
+        log(f"  STARTING MODEL {i+1}/{len(MODELS)}: {model['name']}")
+        log(f"{'='*70}")
+
+        # A falsy result means the model was skipped; it is left out of the ranking.
+        winner = run_full_benchmark_for_model(model)
+        if winner:
+            model_winners.append(winner)
+
+        # Save intermediate results
+        # (rewritten after every model so a crash later on keeps what was measured)
+        with open("scripts/dual_gpu_results.json", "w") as f:
+            json.dump(ALL_RESULTS, f, indent=2, default=str)
+        log(f"  Intermediate results saved ({len(ALL_RESULTS)} configs tested)")
+
+    # ─── Grand Final Comparison ──────────────────────────────────
+    elapsed = (time.time() - start_time) / 60  # minutes
+
+    log(f"\n{'='*70}")
+    log(f"  GRAND FINAL COMPARISON")
+    log(f"  Total time: {elapsed:.1f} minutes")
+    log(f"  Configs tested: {len(ALL_RESULTS)}")
+    log(f"{'='*70}")
+
+    if not model_winners:
+        log("  No models were able to run at 256K context!")
+        return
+
+    # Sort by avg t/s
+    model_winners.sort(key=lambda x: x["avg_tps"], reverse=True)
+
+    summary_lines = []
+    summary_lines.append(f"Dual-GPU Benchmark Results — {datetime.datetime.now().isoformat()}")
+    summary_lines.append(f"Hardware: 2x RTX 3060 12GB | Context: 256K")
+    summary_lines.append(f"Total configs tested: {len(ALL_RESULTS)}")
+    summary_lines.append(f"Total time: {elapsed:.1f} minutes")
+    summary_lines.append("")
+    summary_lines.append("=" * 60)
+    summary_lines.append("  RANKING (by AVG t/s)")
+    summary_lines.append("=" * 60)
+
+    for rank, w in enumerate(model_winners, 1):
+        # Ranks beyond third get two spaces so the columns stay aligned.
+        medal = {1: "🥇", 2: "🥈", 3: "🥉", 4: "  "}.get(rank, "  ")
+        summary_lines.append(f"\n  {medal} #{rank}: {w['model']}")
+        summary_lines.append(f"      AVG: {w['avg_tps']:.2f} t/s | BEST: {w['best_tps']:.2f} t/s")
+        summary_lines.append(f"      Boot: {w['boot_time']:.0f}s")
+        p = w["params"]
+        summary_lines.append(f"      ngl={p['ngl']} t={p['t']} ub={p['ub']} b={p['b']}")
+        summary_lines.append(f"      ctk={p['ctk']} ctv={p['ctv']}")
+        # cpu_moe takes precedence; n_cpu_moe is only shown when it is positive.
+        if p.get("cpu_moe"):
+            summary_lines.append(f"      --cpu-moe")
+        elif p.get("n_cpu_moe", 0) > 0:
+            summary_lines.append(f"      --n-cpu-moe {p['n_cpu_moe']}")
+
+    # The list is sorted descending, so the first entry is the fastest model.
+    champion = model_winners[0]
+    summary_lines.append(f"\n{'='*60}")
+    summary_lines.append(f"  ★ CHAMPION: {champion['model']}")
+    summary_lines.append(f"    {champion['avg_tps']:.2f} t/s average")
+    summary_lines.append(f"{'='*60}")
+
+    # Build recommended command
+    # (the model path is looked up in MODELS by the champion's name)
+    p = champion["params"]
+    cmd_parts = [
+        f"llama-server --model {MODELS[[m['name'] for m in MODELS].index(champion['model'])]['path']}",
+        f"-ngl {p['ngl']} -c {CONTEXT}",
+        f"-t {p['t']} -tb {p['t']}",
+        f"-ub {p['ub']} -b {p['b']}",
+        "-fa on",
+        f"--cache-type-k {p['ctk']} --cache-type-v {p['ctv']}",
+        f"--prio {p.get('prio', 3)} --poll 50",
+        "--mlock",
+    ]
+    if p.get("cpu_moe"):
+        cmd_parts.append("--cpu-moe")
+    elif p.get("n_cpu_moe", 0) > 0:
+        cmd_parts.append(f"--n-cpu-moe {p['n_cpu_moe']}")
+    if p.get("nommap"):
+        cmd_parts.append("--no-mmap")
+    cmd_parts.append("--port 8000 --host 0.0.0.0")
+
+    summary_lines.append(f"\n  Recommended command:")
+    summary_lines.append(f"    {' '.join(cmd_parts)}")
+
+    summary = "\n".join(summary_lines)
+    print(summary)
+
+    # UTF-8 is set explicitly because the summary contains non-ASCII characters.
+    with open("scripts/dual_gpu_summary.txt", "w", encoding="utf-8") as f:
+        f.write(summary)
+
+    with open("scripts/dual_gpu_results.json", "w") as f:
+        json.dump(ALL_RESULTS, f, indent=2, default=str)
+
+    log(f"\n  Results: scripts/dual_gpu_results.json")
+    log(f"  Summary: scripts/dual_gpu_summary.txt")
+    log(f"  DONE!")
+
+    kill_server()
+
+
+# Run the benchmark only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/scripts/dual_gpu_benchmark_v2.mjs
+++ b/scripts/dual_gpu_benchmark_v2.mjs
@@ -0,0 +1,330 @@
+/**
+ * Dual-GPU (2x RTX 3060 12GB, 24GB total) Smart Model Benchmark v2
+ * ================================================================
+ * Informed by VRAM analysis — tests models in optimal order.
+ * 
+ * Key insights applied:
+ *   - Gemma4 fits entirely in 24GB GPU (KV cache ~0.18 GB with SWA)
+ *   - Qwen3.5 is tight (~22.5-22.9 GB needed) — try full GPU first
+ *   - Skip configs known to fail, minimize wasted time
+ *
+ * Run: node scripts/dual_gpu_benchmark_v2.mjs
+ * Results: scripts/dual_gpu_results.json + scripts/dual_gpu_summary.txt
+ */
+
+import { spawn, execSync } from "child_process";
+import { writeFileSync, existsSync, statSync } from "fs";
+
+// Address the benchmark talks to; the port matches the --port passed in startServer().
+const BASE_URL = "http://127.0.0.1:8000";
+// Server binary, relative to the working directory the script is run from.
+const LLAMA = String.raw`llama_bin_run\llama-server.exe`;
+const CTX = 262144; // context size passed to -c (256 * 1024)
+const RUNS = 3; // timed runs per configuration in testConfig()
+const TOKENS = 200; // default max_tokens for one bench() request
+const BOOT_TIMEOUT = 300_000; // default waitReady() timeout, in milliseconds
+
+// Models ordered: smallest first (most likely to succeed fully on GPU)
+// `fitsGPU` is true or "maybe"; tuneModel() tries a pure-GPU config first for both.
+const MODELS = [
+  {
+    name: "Gemma4-26B MXFP4_MOE",
+    path: String.raw`models\gemma-4-26B-A4B-it-MXFP4_MOE.gguf`,
+    quant: "MXFP4_MOE",
+    fitsGPU: true,  // 15.5 + 0.18 + 1 = 16.72 GB << 23 GB
+  },
+  {
+    name: "Gemma4-26B Q4_K_M",
+    path: String.raw`models\gemma-4-26B-A4B-it-Q4_K_M.gguf`,
+    quant: "Q4_K_M",
+    fitsGPU: true,  // 15.6 + 0.18 + 1 = 16.82 GB << 23 GB
+  },
+  {
+    name: "Qwen3.5-35B MXFP4_MOE",
+    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
+    quant: "MXFP4_MOE",
+    fitsGPU: "maybe",  // 20.1 + 1.41 + 1 = 22.51 GB — tight
+  },
+  {
+    name: "Qwen3.5-35B Q4_K_M",
+    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
+    quant: "Q4_K_M",
+    fitsGPU: "maybe",  // 20.5 + 1.41 + 1 = 22.91 GB — very tight
+  },
+];
+
+// Every measured configuration, in test order; this is what gets written to the results JSON.
+const ALL = [];
+// Child process of the most recently started server; set by startServer(), cleared by kill().
+let currentProc = null;
+
+// ─── Utilities ─────────────────────────────────────────────────
+const log = (m) => console.log(`[${new Date().toLocaleTimeString("ko-KR",{hour12:false})}] ${m}`);
+const sleep = (ms) => new Promise(r => setTimeout(r, ms));
+
+async function kill() {
+  if (currentProc) { try { currentProc.kill("SIGKILL"); } catch {} currentProc = null; }
+  try { execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" }); } catch {}
+  await sleep(5000);
+}
+
+function vram() {
+  try {
+    return execSync('nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits',
+      { encoding: "utf-8", timeout: 5000 }).trim().split("\n").map(l => {
+      const [g, u, t] = l.split(",").map(s => parseInt(s));
+      return { gpu: g, used: u, total: t };
+    });
+  } catch { return []; }
+}
+
+function startServer(modelPath, p) {
+  const args = [
+    "--model", modelPath, "-ngl", String(p.ngl),
+    "-c", String(CTX), "-np", "1", "-fa", "on",
+    "--cache-type-k", p.ctk, "--cache-type-v", p.ctv,
+    "-ub", String(p.ub), "-b", String(p.b),
+    "-t", String(p.t), "-tb", String(p.t),
+    "--prio", String(p.prio || 3), "--poll", "50", "--mlock",
+    "--port", "8000", "--host", "0.0.0.0",
+  ];
+  if (p.cpuMoe) args.push("--cpu-moe");
+  else if ((p.nCpuMoe || 0) > 0) args.push("--n-cpu-moe", String(p.nCpuMoe));
+  if (p.nommap) args.push("--no-mmap");
+
+  currentProc = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });
+  return currentProc;
+}
+
+async function waitReady(timeout = BOOT_TIMEOUT) {
+  const t0 = Date.now();
+  while (Date.now() - t0 < timeout) {
+    try {
+      const r = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
+      const d = await r.json();
+      if (d.status === "ok") return { ok: true, boot: (Date.now() - t0) / 1000 };
+    } catch {}
+    await sleep(3000);
+  }
+  return { ok: false, boot: timeout / 1000 };
+}
+
+async function bench(n = TOKENS) {
+  const t0 = Date.now();
+  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({
+      model: "m",
+      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
+      max_tokens: n, temperature: 0,
+    }),
+    signal: AbortSignal.timeout(600_000),
+  });
+  const d = await r.json();
+  const dt = (Date.now() - t0) / 1000;
+  const ct = d.usage?.completion_tokens || 0;
+  return { tps: ct / dt, ct, dt };
+}
+
+async function testConfig(model, label, params) {
+  await kill();
+  log(`  [${label}] Starting...`);
+  startServer(model.path, params);
+  const { ok, boot } = await waitReady();
+  if (!ok) { log(`  [${label}] ✗ FAILED (timeout)`); await kill(); return null; }
+
+  const v = vram();
+  const vs = v.map(g => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | ");
+  log(`  [${label}] Boot:${boot.toFixed(0)}s | VRAM: ${vs}`);
+
+  try { await bench(20); } catch {} // warmup
+
+  const speeds = [];
+  for (let i = 0; i < RUNS; i++) {
+    try { const r = await bench(); speeds.push(r.tps); log(`    Run${i+1}: ${r.tps.toFixed(2)} t/s`);
+    } catch (e) { log(`    Run${i+1}: ERR ${e.message}`); }
+  }
+  await kill();
+
+  if (!speeds.length) { log(`  [${label}] ✗ ALL RUNS FAILED`); return null; }
+  const avg = speeds.reduce((a,b)=>a+b) / speeds.length;
+  const best = Math.max(...speeds);
+  log(`  [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);
+
+  const res = { model: model.name, quant: model.quant, label,
+    avg_tps: +avg.toFixed(2), best_tps: +best.toFixed(2),
+    boot: +boot.toFixed(1), vram: v, params };
+  ALL.push(res);
+  return res;
+}
+
+// Save intermediate results after each test
+function saveIntermediate() {
+  writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL, null, 2));
+}
+
+// ─── Smart Phase Runner ────────────────────────────────────────
+
+/**
+ * Tunes one model: finds a configuration that boots, then greedily sweeps
+ * threads, batch sizes and KV-cache types, and finally re-measures the winner.
+ *
+ * Each sweep changes one knob on top of the best parameters found so far and
+ * keeps a candidate only when its average t/s is strictly higher.
+ *
+ * @param {object} model - Entry from MODELS.
+ * @returns {Promise<object|null>} The "FINAL" result when at least one
+ *   verification run succeeds; otherwise the best sweep result; null when the
+ *   model file is missing or no configuration boots.
+ */
+async function tuneModel(model) {
+  log(`\n${"#".repeat(65)}`);
+  log(`  ${model.name} (${model.quant})`);
+  if (!existsSync(model.path)) { log("  ✗ File not found, SKIP"); return null; }
+  const sz = (statSync(model.path).size / 1024**3).toFixed(2);
+  log(`  Size: ${sz} GB | Fits GPU: ${model.fitsGPU}`);
+  log(`${"#".repeat(65)}`);
+
+  // ── Step 1: Find working GPU config ──
+  log(`\n  ── Step 1: Find optimal GPU offload ──`);
+  let baseline = null;
+
+  if (model.fitsGPU === true || model.fitsGPU === "maybe") {
+    // Try full GPU, no CPU offload
+    baseline = await testConfig(model, "ngl=999 pure-GPU", {
+      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0" });
+    saveIntermediate();
+  }
+
+  if (!baseline) {
+    // Try n-cpu-moe values (ascending — find minimum needed)
+    for (const n of [5, 10, 15, 20]) {
+      baseline = await testConfig(model, `n-cpu-moe=${n}`, {
+        ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", nCpuMoe: n });
+      saveIntermediate();
+      if (baseline) break; // found minimum working offload
+    }
+  }
+
+  if (!baseline) {
+    // Last resort: full cpu-moe
+    baseline = await testConfig(model, "cpu-moe", {
+      ngl: 999, t: 6, ub: 512, b: 2048, ctk: "q4_0", ctv: "q4_0", cpuMoe: true });
+    saveIntermediate();
+  }
+
+  if (!baseline) { log(`  ✗ ${model.name} cannot boot at 256K!`); return null; }
+
+  const bp = baseline.params; // carry forward best params
+
+  // If pure GPU worked, also test cpu-moe to compare (it might be faster due to memory)
+  if (!bp.cpuMoe && !bp.nCpuMoe) {
+    const alt = await testConfig(model, "compare: cpu-moe", {
+      ...bp, cpuMoe: true });
+    saveIntermediate();
+    if (alt && alt.avg_tps > baseline.avg_tps) { baseline = alt; }
+  }
+
+  let best = baseline;
+
+  // ── Step 2: Thread sweep ──
+  log(`\n  ── Step 2: Thread sweep ──`);
+  for (const t of [2, 4, 8, 10, 12]) {
+    // Skip the value the current best already uses; it has been measured.
+    if (t === best.params.t) continue;
+    const r = await testConfig(model, `t=${t}`, { ...best.params, t });
+    saveIntermediate();
+    if (r && r.avg_tps > best.avg_tps) best = r;
+  }
+
+  // ── Step 3: Batch sweep ──
+  log(`\n  ── Step 3: Batch sweep ──`);
+  for (const [ub, b] of [[256, 1024], [256, 2048], [512, 2048], [512, 4096], [1024, 2048], [1024, 4096]]) {
+    if (ub === best.params.ub && b === best.params.b) continue;
+    const r = await testConfig(model, `ub=${ub} b=${b}`, { ...best.params, ub, b });
+    saveIntermediate();
+    if (r && r.avg_tps > best.avg_tps) best = r;
+  }
+
+  // ── Step 4: KV cache sweep ──
+  log(`\n  ── Step 4: KV cache type ──`);
+  for (const [ctk, ctv] of [["q8_0","q8_0"], ["q4_0","q8_0"], ["f16","f16"]]) {
+    if (ctk === best.params.ctk && ctv === best.params.ctv) continue;
+    const r = await testConfig(model, `kv=${ctk}/${ctv}`, { ...best.params, ctk, ctv });
+    saveIntermediate();
+    if (r && r.avg_tps > best.avg_tps) best = r;
+  }
+
+  // ── Step 5: Final verification (5 runs) ──
+  // Same sequence as testConfig() (boot, warmup, timed runs) but with five
+  // timed runs, and falling back to `best` instead of null on failure.
+  log(`\n  ── Step 5: Final verification ──`);
+  await kill();
+  startServer(model.path, best.params);
+  const { ok, boot } = await waitReady();
+  if (!ok) { await kill(); return best; }
+  const v = vram();
+  try { await bench(20); } catch {}
+
+  const finals = [];
+  for (let i = 0; i < 5; i++) {
+    try { const r = await bench(); finals.push(r.tps); log(`    Final ${i+1}: ${r.tps.toFixed(2)} t/s`);
+    } catch (e) { log(`    Final ${i+1}: ERR`); }
+  }
+  await kill();
+
+  if (finals.length > 0) {
+    const avg = finals.reduce((a,b)=>a+b) / finals.length;
+    const bst = Math.max(...finals);
+    log(`  ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${bst.toFixed(2)} t/s`);
+    const final = { model: model.name, quant: model.quant, label: `FINAL`,
+      avg_tps: +avg.toFixed(2), best_tps: +bst.toFixed(2),
+      boot: +boot.toFixed(1), vram: v, params: best.params };
+    ALL.push(final);
+    saveIntermediate();
+    return final;
+  }
+  // Every verification run failed: report the best sweep measurement instead.
+  return best;
+}
+
+// ─── Main ──────────────────────────────────────────────────────
+async function main() {
+  const t0 = Date.now();
+  log("=" .repeat(65));
+  log("  DUAL-GPU BENCHMARK v2 — Smart Strategy");
+  log("  2x RTX 3060 (24GB) | 256K Context");
+  log("  " + new Date().toISOString());
+  log("=".repeat(65));
+  vram().forEach(g => log(`  GPU${g.gpu}: ${g.used}/${g.total} MiB`));
+
+  const winners = [];
+  for (let i = 0; i < MODELS.length; i++) {
+    log(`\n${"=".repeat(65)}`);
+    log(`  MODEL ${i+1}/${MODELS.length}: ${MODELS[i].name}`);
+    log("=".repeat(65));
+    const w = await tuneModel(MODELS[i]);
+    if (w) winners.push(w);
+    saveIntermediate();
+  }
+
+  // ─── Summary ──────────────────────────────────────────────
+  const elapsed = ((Date.now() - t0) / 60000).toFixed(1);
+  winners.sort((a, b) => b.avg_tps - a.avg_tps);
+  const medals = ["🥇", "🥈", "🥉", "  "];
+
+  const lines = [
+    `Dual-GPU Benchmark v2 — ${new Date().toISOString()}`,
+    `2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`,
+    "", "=" .repeat(55), "  RANKING", "=".repeat(55),
+  ];
+  for (let i = 0; i < winners.length; i++) {
+    const w = winners[i], p = w.params;
+    lines.push("", `  ${medals[i]||"  "} #${i+1}: ${w.model}`);
+    lines.push(`      AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`);
+    lines.push(`      ngl=${p.ngl} t=${p.t} ub=${p.ub} b=${p.b} ctk=${p.ctk} ctv=${p.ctv}`);
+    if (p.cpuMoe) lines.push(`      --cpu-moe`);
+    else if (p.nCpuMoe) lines.push(`      --n-cpu-moe ${p.nCpuMoe}`);
+  }
+  if (winners.length > 0) {
+    const c = winners[0], cp = c.params;
+    lines.push("", "=".repeat(55), `  ★ CHAMPION: ${c.model} — ${c.avg_tps} t/s`, "=".repeat(55));
+    const cmd = [`llama-server --model ${MODELS.find(m=>m.name===c.model).path}`,
+      `-ngl ${cp.ngl} -c ${CTX} -t ${cp.t} -tb ${cp.t}`,
+      `-ub ${cp.ub} -b ${cp.b} -fa on`,
+      `--cache-type-k ${cp.ctk} --cache-type-v ${cp.ctv}`,
+      `--prio ${cp.prio||3} --poll 50 --mlock`,
+      cp.cpuMoe ? "--cpu-moe" : cp.nCpuMoe ? `--n-cpu-moe ${cp.nCpuMoe}` : "",
+      "--port 8000 --host 0.0.0.0"].filter(Boolean).join(" ");
+    lines.push("", "  Recommended:", `    ${cmd}`);
+  }
+  const summary = lines.join("\n");
+  console.log("\n" + summary);
+  writeFileSync("scripts/dual_gpu_summary.txt", summary, "utf-8");
+  writeFileSync("scripts/dual_gpu_results.json", JSON.stringify(ALL, null, 2));
+  log(`\n  Saved: dual_gpu_results.json + dual_gpu_summary.txt`);
+  log("  DONE!");
+  await kill();
+}
+
+main().catch(e => { console.error("FATAL:", e); process.exit(1); });
--- a/scripts/dual_gpu_results.json
+++ b/scripts/dual_gpu_results.json
--- a/scripts/dual_gpu_summary.txt
+++ b/scripts/dual_gpu_summary.txt
@@ -0,0 +1,31 @@
+Dual-GPU Benchmark v2 — 2026-04-06T06:52:08.868Z
+2x RTX 3060 12GB | 256K Context | 58 configs | 69.4 min
+
+=======================================================
+  RANKING
+=======================================================
+
+  🥇 #1: Gemma4-26B Q4_K_M
+      AVG: 76.4 t/s | BEST: 76.75 t/s | Boot: 9s
+      ngl=999 t=6 ub=512 b=2048 ctk=f16 ctv=f16
+
+  🥈 #2: Gemma4-26B MXFP4_MOE
+      AVG: 64.05 t/s | BEST: 64.29 t/s | Boot: 9s
+      ngl=999 t=2 ub=512 b=2048 ctk=q8_0 ctv=q8_0
+
+  🥉 #3: Qwen3.5-35B Q4_K_M
+      AVG: 52.05 t/s | BEST: 54.48 t/s | Boot: 12.1s
+      ngl=999 t=4 ub=256 b=1024 ctk=q4_0 ctv=q4_0
+      --n-cpu-moe 5
+
+     #4: Qwen3.5-35B MXFP4_MOE
+      AVG: 46.66 t/s | BEST: 47.09 t/s | Boot: 15s
+      ngl=999 t=6 ub=512 b=2048 ctk=q4_0 ctv=q4_0
+      --n-cpu-moe 5
+
+=======================================================
+  ★ CHAMPION: Gemma4-26B Q4_K_M — 76.4 t/s
+=======================================================
+
+  Recommended:
+    llama-server --model models\gemma-4-26B-A4B-it-Q4_K_M.gguf -ngl 999 -c 262144 -t 6 -tb 6 -ub 512 -b 2048 -fa on --cache-type-k f16 --cache-type-v f16 --prio 3 --poll 50 --mlock --port 8000 --host 0.0.0.0
--- a/scripts/final_tune_122b.txt
+++ b/scripts/final_tune_122b.txt
--- a/scripts/final_tune_122b_dual.txt
+++ b/scripts/final_tune_122b_dual.txt
--- a/scripts/find_max_dense.mjs
+++ b/scripts/find_max_dense.mjs
@@ -0,0 +1,101 @@
+import { spawn, exec } from 'child_process';
+
+const delay = ms => new Promise(res => setTimeout(res, ms));
+
+async function killServer() {
+    return new Promise(r => exec('taskkill /F /IM llama-server.exe', () => { setTimeout(r, 2000); }));
+}
+
+async function testContextSize(modelPath, contextSize) {
+    console.log(`\nTesting ${modelPath} with -c ${contextSize}...`);
+    await killServer();
+
+    const args = [
+        '--model', `models\\${modelPath}`,
+        '-ngl', '999',
+        '-c', contextSize.toString(),
+        '-fa', 'on',
+        '--cache-type-k', 'q4_0',
+        '--cache-type-v', 'q4_0',
+        '-ub', '512',
+        '-b', '2048',
+        '-t', '6',
+        '-tb', '6',
+        '--split-mode', 'row',
+        '--prio', '3',
+        '--fit', 'off',
+        '--port', '8000',
+        '--host', '0.0.0.0'
+    ];
+
+    const server = spawn('llama_bin_run\\llama-server.exe', args, { stdio: 'pipe' });
+    
+    let booted = false;
+    let oomed = false;
+
+    server.stderr.on('data', (d) => {
+        const text = d.toString();
+        if (text.toLowerCase().includes('out of memory') || text.includes('failed to allocate')) {
+            oomed = true;
+        }
+    });
+
+    for (let i = 0; i < 20; i++) {
+        if (oomed) break;
+        try {
+            const res = await fetch('http://127.0.0.1:8000/health', { timeout: 2000 });
+            if (res.status === 200) {
+                booted = true;
+                break;
+            }
+        } catch(e) {}
+        await delay(2000);
+    }
+
+    if (oomed || !booted) {
+        console.log(`❌ Failed: Out of Memory at -c ${contextSize}`);
+        server.kill('SIGKILL');
+        await killServer();
+        return false;
+    }
+
+    console.log(`✅ Booted! Running Benchmark...`);
+    
+    // Benchmark
+    const bench = await new Promise(r => exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
+        r(stdout || stderr);
+    }));
+    
+    console.log(bench);
+    await killServer();
+    return true;
+}
+
+async function findMaxContext(modelName) {
+    const contexts = [262144, 131072, 65536, 32768, 16384, 8192];
+    
+    let maxFound = false;
+    for (const c of contexts) {
+        const success = await testContextSize(modelName, c);
+        if (success) {
+            maxFound = true;
+            console.log(`\n🎉 MAX STABLE CONTEXT FOR ${modelName}: ${c}`);
+            break;
+        }
+    }
+    
+    if (!maxFound) {
+        console.log(`\n❌ Failed to find any working context size for ${modelName}`);
+    }
+}
+
+async function main() {
+    exec('set CUDA_VISIBLE_DEVICES=');
+    console.log("============= QWEN 27B Q4_K_M =============");
+    await findMaxContext('Qwen3.5-27B-Q4_K_M.gguf');
+    
+    console.log("\n============= GEMMA 4 31B Q4_K_M =============");
+    await findMaxContext('gemma-4-31B-it-Q4_K_M.gguf');
+}
+
+// Entry point: report any error escaping main() and exit with a failure status
+// rather than leaving the promise unhandled.
+main().catch((e) => { console.error("FATAL:", e); process.exit(1); });
--- a/scripts/help_full.txt
+++ b/scripts/help_full.txt
@@ -0,0 +1,562 @@
+----- common params -----
+
+-h,    --help, --usage                  print usage and exit
+--version                               show version and build info
+--license                               show source code license and dependencies
+-cl,   --cache-list                     show list of models in cache
+--completion-bash                       print source-able bash completion script for llama.cpp
+-t,    --threads N                      number of CPU threads to use during generation (default: -1)
+                                        (env: LLAMA_ARG_THREADS)
+-tb,   --threads-batch N                number of threads to use during batch and prompt processing (default:
+                                        same as --threads)
+-C,    --cpu-mask M                     CPU affinity mask: arbitrarily long hex. Complements cpu-range
+                                        (default: "")
+-Cr,   --cpu-range lo-hi                range of CPUs for affinity. Complements --cpu-mask
+--cpu-strict <0|1>                      use strict CPU placement (default: 0)
+--prio N                                set process/thread priority : low(-1), normal(0), medium(1), high(2),
+                                        realtime(3) (default: 0)
+--poll <0...100>                        use polling level to wait for work (0 - no polling, default: 50)
+-Cb,   --cpu-mask-batch M               CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch
+                                        (default: same as --cpu-mask)
+-Crb,  --cpu-range-batch lo-hi          ranges of CPUs for affinity. Complements --cpu-mask-batch
+--cpu-strict-batch <0|1>                use strict CPU placement (default: same as --cpu-strict)
+--prio-batch N                          set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime
+                                        (default: 0)
+--poll-batch <0|1>                      use polling to wait for work (default: same as --poll)
+-c,    --ctx-size N                     size of the prompt context (default: 0, 0 = loaded from model)
+                                        (env: LLAMA_ARG_CTX_SIZE)
+-n,    --predict, --n-predict N         number of tokens to predict (default: -1, -1 = infinity)
+                                        (env: LLAMA_ARG_N_PREDICT)
+-b,    --batch-size N                   logical maximum batch size (default: 2048)
+                                        (env: LLAMA_ARG_BATCH)
+-ub,   --ubatch-size N                  physical maximum batch size (default: 512)
+                                        (env: LLAMA_ARG_UBATCH)
+--keep N                                number of tokens to keep from the initial prompt (default: 0, -1 =
+                                        all)
+--swa-full                              use full-size SWA cache (default: false)
+                                        [(more
+                                        info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+                                        (env: LLAMA_ARG_SWA_FULL)
+-fa,   --flash-attn [on|off|auto]       set Flash Attention use ('on', 'off', or 'auto', default: 'auto')
+                                        (env: LLAMA_ARG_FLASH_ATTN)
+--perf, --no-perf                       whether to enable internal libllama performance timings (default:
+                                        false)
+                                        (env: LLAMA_ARG_PERF)
+-e,    --escape, --no-escape            whether to process escape sequences (\n, \r, \t, \', \", \\)
+                                        (default: true)
+--rope-scaling {none,linear,yarn}       RoPE frequency scaling method, defaults to linear unless specified by
+                                        the model
+                                        (env: LLAMA_ARG_ROPE_SCALING_TYPE)
+--rope-scale N                          RoPE context scaling factor, expands context by a factor of N
+                                        (env: LLAMA_ARG_ROPE_SCALE)
+--rope-freq-base N                      RoPE base frequency, used by NTK-aware scaling (default: loaded from
+                                        model)
+                                        (env: LLAMA_ARG_ROPE_FREQ_BASE)
+--rope-freq-scale N                     RoPE frequency scaling factor, expands context by a factor of 1/N
+                                        (env: LLAMA_ARG_ROPE_FREQ_SCALE)
+--yarn-orig-ctx N                       YaRN: original context size of model (default: 0 = model training
+                                        context size)
+                                        (env: LLAMA_ARG_YARN_ORIG_CTX)
+--yarn-ext-factor N                     YaRN: extrapolation mix factor (default: -1.00, 0.0 = full
+                                        interpolation)
+                                        (env: LLAMA_ARG_YARN_EXT_FACTOR)
+--yarn-attn-factor N                    YaRN: scale sqrt(t) or attention magnitude (default: -1.00)
+                                        (env: LLAMA_ARG_YARN_ATTN_FACTOR)
+--yarn-beta-slow N                      YaRN: high correction dim or alpha (default: -1.00)
+                                        (env: LLAMA_ARG_YARN_BETA_SLOW)
+--yarn-beta-fast N                      YaRN: low correction dim or beta (default: -1.00)
+                                        (env: LLAMA_ARG_YARN_BETA_FAST)
+-kvo,  --kv-offload, -nkvo, --no-kv-offload
+                                        whether to enable KV cache offloading (default: enabled)
+                                        (env: LLAMA_ARG_KV_OFFLOAD)
+--repack, -nr, --no-repack              whether to enable weight repacking (default: enabled)
+                                        (env: LLAMA_ARG_REPACK)
+--no-host                               bypass host buffer allowing extra buffers to be used
+                                        (env: LLAMA_ARG_NO_HOST)
+-ctk,  --cache-type-k TYPE              KV cache data type for K
+                                        allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
+                                        (default: f16)
+                                        (env: LLAMA_ARG_CACHE_TYPE_K)
+-ctv,  --cache-type-v TYPE              KV cache data type for V
+                                        allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
+                                        (default: f16)
+                                        (env: LLAMA_ARG_CACHE_TYPE_V)
+-dt,   --defrag-thold N                 KV cache defragmentation threshold (DEPRECATED)
+                                        (env: LLAMA_ARG_DEFRAG_THOLD)
+--rpc SERVERS                           comma separated list of RPC servers (host:port)
+                                        (env: LLAMA_ARG_RPC)
+--mlock                                 force system to keep model in RAM rather than swapping or compressing
+                                        (env: LLAMA_ARG_MLOCK)
+--mmap, --no-mmap                       whether to memory-map model. (if mmap disabled, slower load but may
+                                        reduce pageouts if not using mlock) (default: enabled)
+                                        (env: LLAMA_ARG_MMAP)
+-dio,  --direct-io, -ndio, --no-direct-io
+                                        use DirectIO if available. (default: disabled)
+                                        (env: LLAMA_ARG_DIO)
+--numa TYPE                             attempt optimizations that help on some NUMA systems
+                                        - distribute: spread execution evenly over all nodes
+                                        - isolate: only spawn threads on CPUs on the node that execution
+                                        started on
+                                        - numactl: use the CPU map provided by numactl
+                                        if run without this previously, it is recommended to drop the system
+                                        page cache before using this
+                                        see https://github.com/ggml-org/llama.cpp/issues/1437
+                                        (env: LLAMA_ARG_NUMA)
+-dev,  --device <dev1,dev2,..>          comma-separated list of devices to use for offloading (none = don't
+                                        offload)
+                                        use --list-devices to see a list of available devices
+                                        (env: LLAMA_ARG_DEVICE)
+--list-devices                          print list of available devices and exit
+-ot,   --override-tensor <tensor name pattern>=<buffer type>,...
+                                        override tensor buffer type
+                                        (env: LLAMA_ARG_OVERRIDE_TENSOR)
+-cmoe, --cpu-moe                        keep all Mixture of Experts (MoE) weights in the CPU
+                                        (env: LLAMA_ARG_CPU_MOE)
+-ncmoe, --n-cpu-moe N                   keep the Mixture of Experts (MoE) weights of the first N layers in the
+                                        CPU
+                                        (env: LLAMA_ARG_N_CPU_MOE)
+-ngl,  --gpu-layers, --n-gpu-layers N   max. number of layers to store in VRAM, either an exact number,
+                                        'auto', or 'all' (default: auto)
+                                        (env: LLAMA_ARG_N_GPU_LAYERS)
+-sm,   --split-mode {none,layer,row}    how to split the model across multiple GPUs, one of:
+                                        - none: use one GPU only
+                                        - layer (default): split layers and KV across GPUs
+                                        - row: split rows across GPUs
+                                        (env: LLAMA_ARG_SPLIT_MODE)
+-ts,   --tensor-split N0,N1,N2,...      fraction of the model to offload to each GPU, comma-separated list of
+                                        proportions, e.g. 3,1
+                                        (env: LLAMA_ARG_TENSOR_SPLIT)
+-mg,   --main-gpu INDEX                 the GPU to use for the model (with split-mode = none), or for
+                                        intermediate results and KV (with split-mode = row) (default: 0)
+                                        (env: LLAMA_ARG_MAIN_GPU)
+-fit,  --fit [on|off]                   whether to adjust unset arguments to fit in device memory ('on' or
+                                        'off', default: 'on')
+                                        (env: LLAMA_ARG_FIT)
+-fitt, --fit-target MiB0,MiB1,MiB2,...
+                                        target margin per device for --fit, comma-separated list of values,
+                                        single value is broadcast across all devices, default: 1024
+                                        (env: LLAMA_ARG_FIT_TARGET)
+-fitc, --fit-ctx N                      minimum ctx size that can be set by --fit option, default: 4096
+                                        (env: LLAMA_ARG_FIT_CTX)
+--check-tensors                         check model tensor data for invalid values (default: false)
+--override-kv KEY=TYPE:VALUE,...        advanced option to override model metadata by key. to specify multiple
+                                        overrides, use comma-separated values.
+                                        types: int, float, bool, str. example: --override-kv
+                                        tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false
+--op-offload, --no-op-offload           whether to offload host tensor operations to device (default: true)
+--lora FNAME                            path to LoRA adapter (use comma-separated values to load multiple
+                                        adapters)
+--lora-scaled FNAME:SCALE,...           path to LoRA adapter with user defined scaling (format:
+                                        FNAME:SCALE,...)
+                                        note: use comma-separated values
+--control-vector FNAME                  add a control vector
+                                        note: use comma-separated values to add multiple control vectors
+--control-vector-scaled FNAME:SCALE,...
+                                        add a control vector with user defined scaling SCALE
+                                        note: use comma-separated values (format: FNAME:SCALE,...)
+--control-vector-layer-range START END
+                                        layer range to apply the control vector(s) to, start and end inclusive
+-m,    --model FNAME                    model path to load
+                                        (env: LLAMA_ARG_MODEL)
+-mu,   --model-url MODEL_URL            model download url (default: unused)
+                                        (env: LLAMA_ARG_MODEL_URL)
+-dr,   --docker-repo [<repo>/]<model>[:quant]
+                                        Docker Hub model repository. repo is optional, default to ai/. quant
+                                        is optional, default to :latest.
+                                        example: gemma3
+                                        (default: unused)
+                                        (env: LLAMA_ARG_DOCKER_REPO)
+-hf,   -hfr, --hf-repo <user>/<model>[:quant]
+                                        Hugging Face model repository; quant is optional, case-insensitive,
+                                        default to Q4_K_M, or falls back to the first file in the repo if
+                                        Q4_K_M doesn't exist.
+                                        mmproj is also downloaded automatically if available. to disable, add
+                                        --no-mmproj
+                                        example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
+                                        (default: unused)
+                                        (env: LLAMA_ARG_HF_REPO)
+-hfd,  -hfrd, --hf-repo-draft <user>/<model>[:quant]
+                                        Same as --hf-repo, but for the draft model (default: unused)
+                                        (env: LLAMA_ARG_HFD_REPO)
+-hff,  --hf-file FILE                   Hugging Face model file. If specified, it will override the quant in
+                                        --hf-repo (default: unused)
+                                        (env: LLAMA_ARG_HF_FILE)
+-hfv,  -hfrv, --hf-repo-v <user>/<model>[:quant]
+                                        Hugging Face model repository for the vocoder model (default: unused)
+                                        (env: LLAMA_ARG_HF_REPO_V)
+-hffv, --hf-file-v FILE                 Hugging Face model file for the vocoder model (default: unused)
+                                        (env: LLAMA_ARG_HF_FILE_V)
+-hft,  --hf-token TOKEN                 Hugging Face access token (default: value from HF_TOKEN environment
+                                        variable)
+                                        (env: HF_TOKEN)
+--log-disable                           Log disable
+--log-file FNAME                        Log to file
+                                        (env: LLAMA_LOG_FILE)
+--log-colors [on|off|auto]              Set colored logging ('on', 'off', or 'auto', default: 'auto')
+                                        'auto' enables colors when output is to a terminal
+                                        (env: LLAMA_LOG_COLORS)
+-v,    --verbose, --log-verbose         Set verbosity level to infinity (i.e. log all messages, useful for
+                                        debugging)
+--offline                               Offline mode: forces use of cache, prevents network access
+                                        (env: LLAMA_OFFLINE)
+-lv,   --verbosity, --log-verbosity N   Set the verbosity threshold. Messages with a higher verbosity will be
+                                        ignored. Values:
+                                         - 0: generic output
+                                         - 1: error
+                                         - 2: warning
+                                         - 3: info
+                                         - 4: debug
+                                        (default: 3)
+                                        
+                                        (env: LLAMA_LOG_VERBOSITY)
+--log-prefix                            Enable prefix in log messages
+                                        (env: LLAMA_LOG_PREFIX)
+--log-timestamps                        Enable timestamps in log messages
+                                        (env: LLAMA_LOG_TIMESTAMPS)
+-ctkd, --cache-type-k-draft TYPE        KV cache data type for K for the draft model
+                                        allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
+                                        (default: f16)
+                                        (env: LLAMA_ARG_CACHE_TYPE_K_DRAFT)
+-ctvd, --cache-type-v-draft TYPE        KV cache data type for V for the draft model
+                                        allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
+                                        (default: f16)
+                                        (env: LLAMA_ARG_CACHE_TYPE_V_DRAFT)
+
+
+----- sampling params -----
+
+--samplers SAMPLERS                     samplers that will be used for generation in the order, separated by
+                                        ';'
+                                        (default:
+                                        penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature)
+-s,    --seed SEED                      RNG seed (default: -1, use random seed for -1)
+--sampler-seq, --sampling-seq SEQUENCE
+                                        simplified sequence for samplers that will be used (default:
+                                        edskypmxt)
+--ignore-eos                            ignore end of stream token and continue generating (implies
+                                        --logit-bias EOS-inf)
+--temp, --temperature N                 temperature (default: 0.80)
+--top-k N                               top-k sampling (default: 40, 0 = disabled)
+                                        (env: LLAMA_ARG_TOP_K)
+--top-p N                               top-p sampling (default: 0.95, 1.0 = disabled)
+--min-p N                               min-p sampling (default: 0.05, 0.0 = disabled)
+--top-nsigma, --top-n-sigma N           top-n-sigma sampling (default: -1.00, -1.0 = disabled)
+--xtc-probability N                     xtc probability (default: 0.00, 0.0 = disabled)
+--xtc-threshold N                       xtc threshold (default: 0.10, 1.0 = disabled)
+--typical, --typical-p N                locally typical sampling, parameter p (default: 1.00, 1.0 = disabled)
+--repeat-last-n N                       last n tokens to consider for penalization (default: 64, 0 = disabled, -1
+                                        = ctx_size)
+--repeat-penalty N                      penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled)
+--presence-penalty N                    repeat alpha presence penalty (default: 0.00, 0.0 = disabled)
+--frequency-penalty N                   repeat alpha frequency penalty (default: 0.00, 0.0 = disabled)
+--dry-multiplier N                      set DRY sampling multiplier (default: 0.00, 0.0 = disabled)
+--dry-base N                            set DRY sampling base value (default: 1.75)
+--dry-allowed-length N                  set allowed length for DRY sampling (default: 2)
+--dry-penalty-last-n N                  set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 =
+                                        context size)
+--dry-sequence-breaker STRING           add sequence breaker for DRY sampling, clearing out default breakers
+                                        ('\n', ':', '"', '*') in the process; use "none" to not use any
+                                        sequence breakers
+--adaptive-target N                     adaptive-p: select tokens near this probability (valid range 0.0 to
+                                        1.0; negative = disabled) (default: -1.00)
+                                        [(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)
+--adaptive-decay N                      adaptive-p: decay rate for target adaptation over time. lower values
+                                        are more reactive, higher values are more stable.
+                                        (valid range 0.0 to 0.99) (default: 0.90)
+--dynatemp-range N                      dynamic temperature range (default: 0.00, 0.0 = disabled)
+--dynatemp-exp N                        dynamic temperature exponent (default: 1.00)
+--mirostat N                            use Mirostat sampling.
+                                        Top K, Nucleus and Locally Typical samplers are ignored if used.
+                                        (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
+--mirostat-lr N                         Mirostat learning rate, parameter eta (default: 0.10)
+--mirostat-ent N                        Mirostat target entropy, parameter tau (default: 5.00)
+-l,    --logit-bias TOKEN_ID(+/-)BIAS   modifies the likelihood of token appearing in the completion,
+                                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
+                                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
+--grammar GRAMMAR                       BNF-like grammar to constrain generations (see samples in grammars/
+                                        dir)
+--grammar-file FNAME                    file to read grammar from
+-j,    --json-schema SCHEMA             JSON schema to constrain generations (https://json-schema.org/), e.g.
+                                        `{}` for any JSON object
+                                        For schemas w/ external $refs, use --grammar +
+                                        example/json_schema_to_grammar.py instead
+-jf,   --json-schema-file FILE          File containing a JSON schema to constrain generations
+                                        (https://json-schema.org/), e.g. `{}` for any JSON object
+                                        For schemas w/ external $refs, use --grammar +
+                                        example/json_schema_to_grammar.py instead
+-bs,   --backend-sampling               enable backend sampling (experimental) (default: disabled)
+                                        (env: LLAMA_ARG_BACKEND_SAMPLING)
+
+
+----- example-specific params -----
+
+-lcs,  --lookup-cache-static FNAME      path to static lookup cache to use for lookup decoding (not updated by
+                                        generation)
+-lcd,  --lookup-cache-dynamic FNAME     path to dynamic lookup cache to use for lookup decoding (updated by
+                                        generation)
+-ctxcp, --ctx-checkpoints, --swa-checkpoints N
+                                        max number of context checkpoints to create per slot (default:
+                                        32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
+                                        (env: LLAMA_ARG_CTX_CHECKPOINTS)
+-cpent, --checkpoint-every-n-tokens N   create a checkpoint every n tokens during prefill (processing), -1 to
+                                        disable (default: 8192)
+                                        (env: LLAMA_ARG_CHECKPOINT_EVERY_NT)
+-cram, --cache-ram N                    set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 -
+                                        disable)[(more
+                                        info)](https://github.com/ggml-org/llama.cpp/pull/16391)
+                                        (env: LLAMA_ARG_CACHE_RAM)
+-kvu,  --kv-unified, -no-kvu, --no-kv-unified
+                                        use single unified KV buffer shared across all sequences (default:
+                                        enabled if number of slots is auto)
+                                        (env: LLAMA_ARG_KV_UNIFIED)
+--clear-idle, --no-clear-idle           save and clear idle slots on new task (default: enabled, requires
+                                        unified KV and cache-ram)
+                                        (env: LLAMA_ARG_CLEAR_IDLE)
+--context-shift, --no-context-shift     whether to use context shift on infinite text generation (default:
+                                        disabled)
+                                        (env: LLAMA_ARG_CONTEXT_SHIFT)
+-r,    --reverse-prompt PROMPT          halt generation at PROMPT, return control in interactive mode
+-sp,   --special                        special tokens output enabled (default: false)
+--warmup, --no-warmup                   whether to perform warmup with an empty run (default: enabled)
+--spm-infill                            use Suffix/Prefix/Middle pattern for infill (instead of
+                                        Prefix/Suffix/Middle) as some models prefer this. (default: disabled)
+--pooling {none,mean,cls,last,rank}     pooling type for embeddings, use model default if unspecified
+                                        (env: LLAMA_ARG_POOLING)
+-np,   --parallel N                     number of server slots (default: -1, -1 = auto)
+                                        (env: LLAMA_ARG_N_PARALLEL)
+-cb,   --cont-batching, -nocb, --no-cont-batching
+                                        whether to enable continuous batching (a.k.a dynamic batching)
+                                        (default: enabled)
+                                        (env: LLAMA_ARG_CONT_BATCHING)
+-mm,   --mmproj FILE                    path to a multimodal projector file. see tools/mtmd/README.md
+                                        note: if -hf is used, this argument can be omitted
+                                        (env: LLAMA_ARG_MMPROJ)
+-mmu,  --mmproj-url URL                 URL to a multimodal projector file. see tools/mtmd/README.md
+                                        (env: LLAMA_ARG_MMPROJ_URL)
+--mmproj-auto, --no-mmproj, --no-mmproj-auto
+                                        whether to use multimodal projector file (if available), useful when
+                                        using -hf (default: enabled)
+                                        (env: LLAMA_ARG_MMPROJ_AUTO)
+--mmproj-offload, --no-mmproj-offload   whether to enable GPU offloading for multimodal projector (default:
+                                        enabled)
+                                        (env: LLAMA_ARG_MMPROJ_OFFLOAD)
+--image-min-tokens N                    minimum number of tokens each image can take, only used by vision
+                                        models with dynamic resolution (default: read from model)
+                                        (env: LLAMA_ARG_IMAGE_MIN_TOKENS)
+--image-max-tokens N                    maximum number of tokens each image can take, only used by vision
+                                        models with dynamic resolution (default: read from model)
+                                        (env: LLAMA_ARG_IMAGE_MAX_TOKENS)
+-otd,  --override-tensor-draft <tensor name pattern>=<buffer type>,...
+                                        override tensor buffer type for draft model
+-cmoed, --cpu-moe-draft                 keep all Mixture of Experts (MoE) weights in the CPU for the draft
+                                        model
+                                        (env: LLAMA_ARG_CPU_MOE_DRAFT)
+-ncmoed, --n-cpu-moe-draft N            keep the Mixture of Experts (MoE) weights of the first N layers in the
+                                        CPU for the draft model
+                                        (env: LLAMA_ARG_N_CPU_MOE_DRAFT)
+-a,    --alias STRING                   set model name aliases, comma-separated (to be used by API)
+                                        (env: LLAMA_ARG_ALIAS)
+--tags STRING                           set model tags, comma-separated (informational, not used for routing)
+                                        (env: LLAMA_ARG_TAGS)
+--host HOST                             ip address to listen, or bind to an UNIX socket if the address ends
+                                        with .sock (default: 127.0.0.1)
+                                        (env: LLAMA_ARG_HOST)
+--port PORT                             port to listen (default: 8080)
+                                        (env: LLAMA_ARG_PORT)
+--reuse-port                            allow multiple sockets to bind to the same port (default: disabled)
+                                        (env: LLAMA_ARG_REUSE_PORT)
+--path PATH                             path to serve static files from (default: )
+                                        (env: LLAMA_ARG_STATIC_PATH)
+--api-prefix PREFIX                     prefix path the server serves from, without the trailing slash
+                                        (default: )
+                                        (env: LLAMA_ARG_API_PREFIX)
+--webui-config JSON                     JSON that provides default WebUI settings (overrides WebUI defaults)
+                                        (env: LLAMA_ARG_WEBUI_CONFIG)
+--webui-config-file PATH                JSON file that provides default WebUI settings (overrides WebUI
+                                        defaults)
+                                        (env: LLAMA_ARG_WEBUI_CONFIG_FILE)
+--webui-mcp-proxy, --no-webui-mcp-proxy
+                                        experimental: whether to enable MCP CORS proxy - do not enable in
+                                        untrusted environments (default: disabled)
+                                        (env: LLAMA_ARG_WEBUI_MCP_PROXY)
+--tools TOOL1,TOOL2,...                 experimental: whether to enable built-in tools for AI agents - do not
+                                        enable in untrusted environments (default: no tools)
+                                        specify "all" to enable all tools
+                                        available tools: read_file, file_glob_search, grep_search,
+                                        exec_shell_command, write_file, edit_file, apply_diff
+                                        (env: LLAMA_ARG_TOOLS)
+--webui, --no-webui                     whether to enable the Web UI (default: enabled)
+                                        (env: LLAMA_ARG_WEBUI)
+--embedding, --embeddings               restrict to only support embedding use case; use only with dedicated
+                                        embedding models (default: disabled)
+                                        (env: LLAMA_ARG_EMBEDDINGS)
+--rerank, --reranking                   enable reranking endpoint on server (default: disabled)
+                                        (env: LLAMA_ARG_RERANKING)
+--api-key KEY                           API key to use for authentication, multiple keys can be provided as a
+                                        comma-separated list (default: none)
+                                        (env: LLAMA_API_KEY)
+--api-key-file FNAME                    path to file containing API keys (default: none)
+--ssl-key-file FNAME                    path to a file containing a PEM-encoded SSL private key
+                                        (env: LLAMA_ARG_SSL_KEY_FILE)
+--ssl-cert-file FNAME                   path to a file containing a PEM-encoded SSL certificate
+                                        (env: LLAMA_ARG_SSL_CERT_FILE)
+--chat-template-kwargs STRING           sets additional params for the json template parser, must be a valid
+                                        json object string, e.g. '{"key1":"value1","key2":"value2"}'
+                                        (env: LLAMA_CHAT_TEMPLATE_KWARGS)
+-to,   --timeout N                      server read/write timeout in seconds (default: 600)
+                                        (env: LLAMA_ARG_TIMEOUT)
+--threads-http N                        number of threads used to process HTTP requests (default: -1)
+                                        (env: LLAMA_ARG_THREADS_HTTP)
+--cache-prompt, --no-cache-prompt       whether to enable prompt caching (default: enabled)
+                                        (env: LLAMA_ARG_CACHE_PROMPT)
+--cache-reuse N                         min chunk size to attempt reusing from the cache via KV shifting,
+                                        requires prompt caching to be enabled (default: 0)
+                                        [(card)](https://ggml.ai/f0.png)
+                                        (env: LLAMA_ARG_CACHE_REUSE)
+--metrics                               enable prometheus compatible metrics endpoint (default: disabled)
+                                        (env: LLAMA_ARG_ENDPOINT_METRICS)
+--props                                 enable changing global properties via POST /props (default: disabled)
+                                        (env: LLAMA_ARG_ENDPOINT_PROPS)
+--slots, --no-slots                     expose slots monitoring endpoint (default: enabled)
+                                        (env: LLAMA_ARG_ENDPOINT_SLOTS)
+--slot-save-path PATH                   path to save slot kv cache (default: disabled)
+--media-path PATH                       directory for loading local media files; files can be accessed via
+                                        file:// URLs using relative paths (default: disabled)
+--models-dir PATH                       directory containing models for the router server (default: disabled)
+                                        (env: LLAMA_ARG_MODELS_DIR)
+--models-preset PATH                    path to INI file containing model presets for the router server
+                                        (default: disabled)
+                                        (env: LLAMA_ARG_MODELS_PRESET)
+--models-max N                          for router server, maximum number of models to load simultaneously
+                                        (default: 4, 0 = unlimited)
+                                        (env: LLAMA_ARG_MODELS_MAX)
+--models-autoload, --no-models-autoload
+                                        for router server, whether to automatically load models (default:
+                                        enabled)
+                                        (env: LLAMA_ARG_MODELS_AUTOLOAD)
+--jinja, --no-jinja                     whether to use jinja template engine for chat (default: enabled)
+                                        (env: LLAMA_ARG_JINJA)
+--reasoning-format FORMAT               controls whether thought tags are allowed and/or extracted from the
+                                        response, and in which format they're returned; one of:
+                                        - none: leaves thoughts unparsed in `message.content`
+                                        - deepseek: puts thoughts in `message.reasoning_content`
+                                        - deepseek-legacy: keeps `<think>` tags in `message.content` while
+                                        also populating `message.reasoning_content`
+                                        (default: auto)
+                                        (env: LLAMA_ARG_THINK)
+-rea,  --reasoning [on|off|auto]        Use reasoning/thinking in the chat ('on', 'off', or 'auto', default:
+                                        'auto' (detect from template))
+                                        (env: LLAMA_ARG_REASONING)
+--reasoning-budget N                    token budget for thinking: -1 for unrestricted, 0 for immediate end,
+                                        N>0 for token budget (default: -1)
+                                        (env: LLAMA_ARG_THINK_BUDGET)
+--reasoning-budget-message MESSAGE      message injected before the end-of-thinking tag when reasoning budget
+                                        is exhausted (default: none)
+                                        (env: LLAMA_ARG_THINK_BUDGET_MESSAGE)
+--chat-template JINJA_TEMPLATE          set custom jinja chat template (default: template taken from model's
+                                        metadata)
+                                        if suffix/prefix are specified, template will be disabled
+                                        only commonly used templates are accepted (unless --jinja is set
+                                        before this flag):
+                                        list of built-in templates:
+                                        bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml,
+                                        command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe,
+                                        exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite,
+                                        granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2,
+                                        llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez,
+                                        minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7,
+                                        mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3,
+                                        phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca,
+                                        yandex, zephyr
+                                        (env: LLAMA_ARG_CHAT_TEMPLATE)
+--chat-template-file JINJA_TEMPLATE_FILE
+                                        set custom jinja chat template file (default: template taken from
+                                        model's metadata)
+                                        if suffix/prefix are specified, template will be disabled
+                                        only commonly used templates are accepted (unless --jinja is set
+                                        before this flag):
+                                        list of built-in templates:
+                                        bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml,
+                                        command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe,
+                                        exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite,
+                                        granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2,
+                                        llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez,
+                                        minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7,
+                                        mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3,
+                                        phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca,
+                                        yandex, zephyr
+                                        (env: LLAMA_ARG_CHAT_TEMPLATE_FILE)
+--skip-chat-parsing, --no-skip-chat-parsing
+                                        force a pure content parser, even if a Jinja template is specified;
+                                        model will output everything in the content section, including any
+                                        reasoning and/or tool calls (default: disabled)
+                                        (env: LLAMA_ARG_SKIP_CHAT_PARSING)
+--prefill-assistant, --no-prefill-assistant
+                                        whether to prefill the assistant's response if the last message is an
+                                        assistant message (default: prefill enabled)
+                                        when this flag is set, if the last message is an assistant message
+                                        then it will be treated as a full message and not prefilled
+                                        
+                                        (env: LLAMA_ARG_PREFILL_ASSISTANT)
+-sps,  --slot-prompt-similarity SIMILARITY
+                                        how much the prompt of a request must match the prompt of a slot in
+                                        order to use that slot (default: 0.10, 0.0 = disabled)
+--lora-init-without-apply               load LoRA adapters without applying them (apply later via POST
+                                        /lora-adapters) (default: disabled)
+--sleep-idle-seconds SECONDS            number of seconds of idleness after which the server will sleep
+                                        (default: -1; -1 = disabled)
+-td,   --threads-draft N                number of threads to use during generation (default: same as
+                                        --threads)
+-tbd,  --threads-batch-draft N          number of threads to use during batch and prompt processing (default:
+                                        same as --threads-draft)
+--draft, --draft-n, --draft-max N       number of tokens to draft for speculative decoding (default: 16)
+                                        (env: LLAMA_ARG_DRAFT_MAX)
+--draft-min, --draft-n-min N            minimum number of draft tokens to use for speculative decoding
+                                        (default: 0)
+                                        (env: LLAMA_ARG_DRAFT_MIN)
+--draft-p-min P                         minimum speculative decoding probability (greedy) (default: 0.75)
+                                        (env: LLAMA_ARG_DRAFT_P_MIN)
+-cd,   --ctx-size-draft N               size of the prompt context for the draft model (default: 0, 0 = loaded
+                                        from model)
+                                        (env: LLAMA_ARG_CTX_SIZE_DRAFT)
+-devd, --device-draft <dev1,dev2,..>    comma-separated list of devices to use for offloading the draft model
+                                        (none = don't offload)
+                                        use --list-devices to see a list of available devices
+-ngld, --gpu-layers-draft, --n-gpu-layers-draft N
+                                        max. number of draft model layers to store in VRAM, either an exact
+                                        number, 'auto', or 'all' (default: auto)
+                                        (env: LLAMA_ARG_N_GPU_LAYERS_DRAFT)
+-md,   --model-draft FNAME              draft model for speculative decoding (default: unused)
+                                        (env: LLAMA_ARG_MODEL_DRAFT)
+--spec-replace TARGET DRAFT             translate the string in TARGET into DRAFT if the draft model and main
+                                        model are not compatible
+--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
+                                        type of speculative decoding to use when no draft model is provided
+                                        (default: none)
+                                        
+                                        (env: LLAMA_ARG_SPEC_TYPE)
+--spec-ngram-size-n N                   ngram size N for ngram-simple/ngram-map speculative decoding, length
+                                        of lookup n-gram (default: 12)
+--spec-ngram-size-m N                   ngram size M for ngram-simple/ngram-map speculative decoding, length
+                                        of draft m-gram (default: 48)
+--spec-ngram-min-hits N                 minimum hits for ngram-map speculative decoding (default: 1)
+-mv,   --model-vocoder FNAME            vocoder model for audio generation (default: unused)
+--tts-use-guide-tokens                  Use guide tokens to improve TTS word recall
+--embd-gemma-default                    use default EmbeddingGemma model (note: can download weights from the
+                                        internet)
+--fim-qwen-1.5b-default                 use default Qwen 2.5 Coder 1.5B (note: can download weights from the
+                                        internet)
+--fim-qwen-3b-default                   use default Qwen 2.5 Coder 3B (note: can download weights from the
+                                        internet)
+--fim-qwen-7b-default                   use default Qwen 2.5 Coder 7B (note: can download weights from the
+                                        internet)
+--fim-qwen-7b-spec                      use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can
+                                        download weights from the internet)
+--fim-qwen-14b-spec                     use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note:
+                                        can download weights from the internet)
+--fim-qwen-30b-default                  use default Qwen 3 Coder 30B A3B Instruct (note: can download weights
+                                        from the internet)
+--gpt-oss-20b-default                   use gpt-oss-20b (note: can download weights from the internet)
+--gpt-oss-120b-default                  use gpt-oss-120b (note: can download weights from the internet)
+--vision-gemma-4b-default               use Gemma 3 4B QAT (note: can download weights from the internet)
+--vision-gemma-12b-default              use Gemma 3 12B QAT (note: can download weights from the internet)
--- a/scripts/help_gpu_flags.txt
+++ b/scripts/help_gpu_flags.txt
@@ -0,0 +1,31 @@
+ggml_cuda_init: found 2 CUDA devices (Total VRAM: 24575 MiB):
+  Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes, VRAM: 12287 MiB
+  Device 1: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes, VRAM: 12287 MiB
+-dev,  --device <dev1,dev2,..>          comma-separated list of devices to use for offloading (none = don't
+                                        use --list-devices to see a list of available devices
+                                        (env: LLAMA_ARG_DEVICE)
+--list-devices                          print list of available devices and exit
+-ot,   --override-tensor <tensor name pattern>=<buffer type>,...
+                                        override tensor buffer type
+                                        (env: LLAMA_ARG_OVERRIDE_TENSOR)
+-cmoe, --cpu-moe                        keep all Mixture of Experts (MoE) weights in the CPU
+-ncmoe, --n-cpu-moe N                   keep the Mixture of Experts (MoE) weights of the first N layers in the
+-sm,   --split-mode {none,layer,row}    how to split the model across multiple GPUs, one of:
+                                        - layer (default): split layers and KV across GPUs
+                                        - row: split rows across GPUs
+                                        (env: LLAMA_ARG_SPLIT_MODE)
+-ts,   --tensor-split N0,N1,N2,...      fraction of the model to offload to each GPU, comma-separated list of
+                                        (env: LLAMA_ARG_TENSOR_SPLIT)
+-mg,   --main-gpu INDEX                 the GPU to use for the model (with split-mode = none), or for
+                                        intermediate results and KV (with split-mode = row) (default: 0)
+-fit,  --fit [on|off]                   whether to adjust unset arguments to fit in device memory ('on' or
+                                        target margin per device for --fit, comma-separated list of values,
+                                        single value is broadcast across all devices, default: 1024
+--check-tensors                         check model tensor data for invalid values (default: false)
+--op-offload, --no-op-offload           whether to offload host tensor operations to device (default: true)
+-otd,  --override-tensor-draft <tensor name pattern>=<buffer type>,...
+                                        override tensor buffer type for draft model
+-cmoed, --cpu-moe-draft                 keep all Mixture of Experts (MoE) weights in the CPU for the draft
+-ncmoed, --n-cpu-moe-draft N            keep the Mixture of Experts (MoE) weights of the first N layers in the
+-devd, --device-draft <dev1,dev2,..>    comma-separated list of devices to use for offloading the draft model
+                                        use --list-devices to see a list of available devices
--- a/scripts/hf_search.py
+++ b/scripts/hf_search.py
@@ -0,0 +1,28 @@
+from huggingface_hub import HfApi
+import sys
+
+api = HfApi()
+
+def search_gguf(query):
+    print(f"\n--- Searching for: {query} ---")
+    try:
+        models = api.list_models(search=query, limit=3)
+        found = list(models)
+        if not found:
+            print("No models found.")
+            return
+        for m in found:
+            print(f"Repo: {m.id}")
+            files = api.list_repo_files(repo_id=m.id)
+            ggufs = [f for f in files if "q4_k_m" in f.lower() and f.endswith(".gguf")]
+            if not ggufs:
+                ggufs = [f for f in files if f.endswith(".gguf")][:3]
+            print(f"  GGUFs: {ggufs}")
+    except Exception as e:
+        print(f"Error: {e}")
+
+search_gguf("122b-a10b gguf")
+search_gguf("Qwen3.5 122b gguf")
+search_gguf("35b-a3b gguf")
+search_gguf("gemma-4 26b gguf")
+search_gguf("Qwen 122B")
--- a/scripts/perf_test.py
+++ b/scripts/perf_test.py
@@ -0,0 +1,123 @@
+import time
+import json
+import urllib.request
+import sys
+
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass
+
+BASE_URL = "http://127.0.0.1:8000"
+
+def check_server():
+    """Check if server is up"""
+    try:
+        req = urllib.request.Request(f"{BASE_URL}/health")
+        with urllib.request.urlopen(req, timeout=5) as resp:
+            data = json.loads(resp.read())
+            return data.get("status") == "ok"
+    except:
+        return False
+
+def run_benchmark(prompt, max_tokens=100, label="Test"):
+    """Run a single benchmark request and return results"""
+    payload = json.dumps({
+        "model": "local-model",
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }).encode("utf-8")
+
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+
+    start = time.time()
+    with urllib.request.urlopen(req, timeout=300) as resp:
+        result = json.loads(resp.read())
+    elapsed = time.time() - start
+
+    content = result["choices"][0]["message"].get("content", "")
+    usage = result.get("usage", {})
+    prompt_tokens = usage.get("prompt_tokens", 0)
+    completion_tokens = usage.get("completion_tokens", 0)
+
+    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0
+
+    return {
+        "label": label,
+        "prompt_tokens": prompt_tokens,
+        "completion_tokens": completion_tokens,
+        "elapsed": elapsed,
+        "gen_tps_approx": gen_tps,
+        "content_preview": content[:100]
+    }
+
+def main():  # Entry point: wait for the server, do one warmup request, time 3 runs, print a summary
+    print("=" * 60)
+    print("  LLM Performance Benchmark Tool")
+    print("=" * 60)
+    print()
+
+    # Poll check_server() up to 30 times, sleeping 2 s between attempts, before benchmarking
+    print("[1/3] Checking server health...")
+    for i in range(30):
+        if check_server():
+            print("  -> Server is ready!")
+            break
+        print(f"  -> Waiting for server... ({i+1}/30)")
+        time.sleep(2)
+    else:  # for/else: reached only when the loop finished without hitting break
+        print("  -> ERROR: Server not responding after 60s")
+        return
+
+    # Short warmup request; a failure here is reported but does not stop the benchmark
+    print()
+    print("[2/3] Warmup run (short)...")
+    try:
+        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
+        print(f"  -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
+    except Exception as e:
+        print(f"  -> Warmup failed: {e}")
+
+    # Timed runs: the same prompt 3 times, max 200 tokens each; failed runs are skipped
+    print()
+    print("[3/3] Running main benchmark...")
+    print("-" * 60)
+
+    test_prompt = "Count from 1 to 50, writing each number on a new line."
+    
+    results = []
+    for i in range(3):
+        print(f"  Run {i+1}/3...")
+        try:
+            r = run_benchmark(test_prompt, max_tokens=200, label=f"Run {i+1}")
+            results.append(r)
+            print(f"    Tokens: {r['completion_tokens']} | "
+                  f"Time: {r['elapsed']:.2f}s | "
+                  f"Speed: {r['gen_tps_approx']:.2f} t/s (approx)")
+        except Exception as e:
+            print(f"    ERROR: {e}")
+
+    if results:  # summary is printed only when at least one run succeeded
+        print()
+        print("=" * 60)
+        print("  RESULTS SUMMARY")
+        print("=" * 60)
+        avg_tps = sum(r["gen_tps_approx"] for r in results) / len(results)  # mean of per-run rates
+        max_tps = max(r["gen_tps_approx"] for r in results)
+        min_tps = min(r["gen_tps_approx"] for r in results)
+        print(f"  Runs:     {len(results)}")
+        print(f"  Avg TPS:  {avg_tps:.2f} t/s (approx, includes prompt eval)")
+        print(f"  Min TPS:  {min_tps:.2f} t/s")
+        print(f"  Max TPS:  {max_tps:.2f} t/s")
+        print()
+        print("  NOTE: Check server console for exact generation t/s")
+        print("  (the 'eval time' line shows pure token generation speed)")
+        print("=" * 60)
+
+if __name__ == "__main__":
+    main()
--- a/scripts/perf_test_122b.py
+++ b/scripts/perf_test_122b.py
@@ -0,0 +1,169 @@
+import time
+import json
+import urllib.request
+import sys
+import os
+import re
+
+try:
+    sys.stdout.reconfigure(encoding='utf-8')
+except AttributeError:
+    pass
+
+BASE_URL = "http://127.0.0.1:8000"
+
+def check_server():
+    """Check if server is up"""
+    try:
+        req = urllib.request.Request(f"{BASE_URL}/health")
+        with urllib.request.urlopen(req, timeout=5) as resp:
+            data = json.loads(resp.read())
+            return data.get("status") == "ok"
+    except:
+        return False
+
+def check_slots():
+    """Check server slot info for VRAM usage details"""
+    try:
+        req = urllib.request.Request(f"{BASE_URL}/slots")
+        with urllib.request.urlopen(req, timeout=5) as resp:
+            return json.loads(resp.read())
+    except:
+        return None
+
+def run_benchmark(prompt, max_tokens=300, label="Test"):
+    """Run a single benchmark request and return results"""
+    payload = json.dumps({
+        "model": "local-model",
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0
+    }).encode("utf-8")
+
+    req = urllib.request.Request(
+        f"{BASE_URL}/v1/chat/completions",
+        data=payload,
+        headers={"Content-Type": "application/json"}
+    )
+
+    start = time.time()
+    with urllib.request.urlopen(req, timeout=600) as resp:
+        result = json.loads(resp.read())
+    elapsed = time.time() - start
+
+    content = result["choices"][0]["message"].get("content", "")
+    usage = result.get("usage", {})
+    prompt_tokens = usage.get("prompt_tokens", 0)
+    completion_tokens = usage.get("completion_tokens", 0)
+
+    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0
+
+    return {
+        "label": label,
+        "prompt_tokens": prompt_tokens,
+        "completion_tokens": completion_tokens,
+        "elapsed": elapsed,
+        "gen_tps_approx": gen_tps,
+        "content_preview": content[:150]
+    }
+
+def main():  # Entry point: wait for the server, probe /slots, warm up, time 5 runs, print a verdict
+    print("=" * 70)
+    print("  Qwen3.5 122B-A10B Performance Benchmark")
+    print("  Target: 10+ t/s generation speed")
+    print("=" * 70)
+    print()
+
+    # Poll check_server() every 5 s; the author notes a 3-5 min load time for the 71 GB model
+    print("[1/4] Waiting for server (122B model load takes 3-5 min)...")
+    max_wait = 600  # seconds of sleep budget (120 polls x 5 s); each poll's own timeout adds to it
+    for i in range(max_wait // 5):
+        if check_server():
+            print(f"  -> Server is ready! (waited {i*5}s)")
+            break
+        if i % 6 == 0:  # progress line on every 6th poll (i.e. per 30 s of sleep)
+            print(f"  -> Loading model... ({i*5}s / {max_wait}s)")
+        time.sleep(5)
+    else:  # for/else: reached only when the loop finished without hitting break
+        print(f"  -> ERROR: Server not responding after {max_wait}s")
+        return
+
+    # Optional probe: check_slots() returns None on failure, in which case nothing is printed
+    print()
+    print("[2/4] Checking server status...")
+    slots = check_slots()
+    if slots:
+        print(f"  -> Slots available: {len(slots)}")
+
+    # Short warmup request; a failure here is reported but does not stop the benchmark
+    print()
+    print("[3/4] Warmup run (short, pre-heating GPU caches)...")
+    try:
+        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
+        print(f"  -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
+        print(f"  -> Warmup speed: {warmup['gen_tps_approx']:.2f} t/s (includes prompt eval)")
+    except Exception as e:
+        print(f"  -> Warmup failed: {e}")
+
+    # Timed runs: 5 requests, one per prompt below, max 300 tokens each; failed runs are skipped
+    print()
+    print("[4/4] Running main benchmark (5 runs x 300 tokens)...")
+    print("-" * 70)
+
+    test_prompts = [
+        "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.",
+        "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.",
+        "Describe the complete process of photosynthesis in plants. Include both light-dependent and light-independent reactions.",
+        "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.",
+        "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.",
+    ]
+    
+    results = []
+    for i in range(5):
+        prompt = test_prompts[i % len(test_prompts)]  # modulo keeps the index valid if the list is shortened
+        print(f"\n  Run {i+1}/5: {prompt[:50]}...")
+        try:
+            r = run_benchmark(prompt, max_tokens=300, label=f"Run {i+1}")
+            results.append(r)
+            print(f"    Completion tokens: {r['completion_tokens']}")
+            print(f"    Total time: {r['elapsed']:.2f}s")
+            print(f"    Approx speed: {r['gen_tps_approx']:.2f} t/s (includes prompt eval)")
+        except Exception as e:
+            print(f"    ERROR: {e}")
+
+    if results:  # summary is printed only when at least one run succeeded
+        print()
+        print("=" * 70)
+        print("  RESULTS SUMMARY - Qwen3.5 122B-A10B")
+        print("=" * 70)
+        avg_tps = sum(r["gen_tps_approx"] for r in results) / len(results)  # mean of per-run rates
+        max_tps = max(r["gen_tps_approx"] for r in results)
+        min_tps = min(r["gen_tps_approx"] for r in results)
+        total_tokens = sum(r["completion_tokens"] for r in results)
+        total_time = sum(r["elapsed"] for r in results)
+        
+        print(f"  Runs completed: {len(results)}/5")
+        print(f"  Total tokens:   {total_tokens}")
+        print(f"  Total time:     {total_time:.1f}s")
+        print()
+        print(f"  Approx TPS (avg): {avg_tps:.2f} t/s")
+        print(f"  Approx TPS (min): {min_tps:.2f} t/s")
+        print(f"  Approx TPS (max): {max_tps:.2f} t/s")
+        print()
+        
+        # Verdict bands on the averaged approximate rate: >= 10 pass, >= 8 close, otherwise below target
+        if avg_tps >= 10:
+            print("  ✅ TARGET ACHIEVED: 10+ t/s!")
+        elif avg_tps >= 8:
+            print("  ⚠️  CLOSE TO TARGET: Consider further tuning")
+        else:
+            print(f"  ❌ BELOW TARGET: {avg_tps:.1f} t/s < 10 t/s")
+        
+        print()
+        print("  ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.")
+        print("  ⚡ Check the server console/log for exact 'eval time' t/s value,")
+        print("  ⚡ which shows pure token generation speed (always higher).")
+        print("=" * 70)
+
+if __name__ == "__main__":
+    main()
--- a/scripts/q4km_latest.txt
+++ b/scripts/q4km_latest.txt
@@ -0,0 +1,5 @@
+pure-GPU nommap small     |  62.29 | GPU | VRAM:22975 | ub=128 b=512 t=4
+pure-GPU ts=0.5,0.5       |  63.89 | GPU | VRAM:23002 | ub=128 b=512 t=4
+tune t=2                  |   64.1 | GPU | VRAM:22980 | ub=128 b=512 t=2
+tune t=6                  |  64.18 | GPU | VRAM:22982 | ub=128 b=512 t=6
+tune t=8                  |  63.11 | GPU | VRAM:22980 | ub=128 b=512 t=8
--- a/scripts/quick_pptest.mjs
+++ b/scripts/quick_pptest.mjs
@@ -0,0 +1,31 @@
+// Quick PP+TG speed test
+const BASE = "http://127.0.0.1:8000";
+
+async function test(label, prompt, maxTok) {
+  const t0 = Date.now();
+  const r = await fetch(`${BASE}/v1/chat/completions`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({ model: "m", messages: [{ role: "user", content: prompt }], max_tokens: maxTok, temperature: 0 }),
+    signal: AbortSignal.timeout(600000),
+  });
+  const d = await r.json();
+  const dt = (Date.now() - t0) / 1000;
+  const u = d.usage || {};
+  const pp = u.prompt_tokens || 0;
+  const tg = u.completion_tokens || 0;
+  const ppSpeed = pp > 0 ? (pp / dt).toFixed(1) : "?";
+  const tgSpeed = tg > 0 ? (tg / dt).toFixed(1) : "?";
+  console.log(`${label} | PP:${pp}tok ${ppSpeed}t/s | TG:${tg}tok ${tgSpeed}t/s | ${dt.toFixed(1)}s`);
+}
+
+const short = "Count 1 to 20.";
+const long = "x".repeat(3000) + " Summarize above in 3 words.";
+const code = Array(200).fill("function foo(x) { return x * 2 + Math.random(); }").join("\n") + "\n\nRefactor above to arrow functions. Show first 5 lines.";
+
+await test("warmup", short, 20);
+await test("SHORT", short, 200);
+await test("3K-PP", long, 100);
+await test("10K-CODE", code, 100);
+await test("TG-500", short, 500);
+console.log("DONE");
--- a/scripts/qwen_fullgpu_challenge.mjs
+++ b/scripts/qwen_fullgpu_challenge.mjs
@@ -0,0 +1,345 @@
+/**
+ * Qwen3.5 Full-GPU Challenge — VRAM 극한 최적화 벤치마크
+ * =====================================================
+ * 목표: Qwen3.5-35B-A3B를 24GB 듀얼 3060에 100% GPU로 올리기
+ *
+ * 테스트 모델:
+ *   1. UD-IQ4_NL  (16.6 GB) — 확실히 올라감, 기준선
+ *   2. MXFP4_MOE  (20.1 GB) — 도전! VRAM 극한 최적화
+ *   3. Q4_K_M     (20.5 GB) — 대조군 (n-cpu-moe=5)
+ *
+ * VRAM 절감 전략:
+ *   A. 배치 최소화: -ub 64 -b 256 (computation buffer 축소)
+ *   B. split-mode row (GPU간 더 균등한 분배)
+ *   C. tensor-split 수동 밸런싱
+ *   D. no-mmap (메모리 관리 최적화)
+ *   E. defrag-thold (KV 캐시 파편화 방지)
+ *
+ * Run: node scripts/qwen_fullgpu_challenge.mjs
+ */
+
+import { spawn, execSync } from "child_process";
+import { writeFileSync, existsSync, statSync } from "fs";
+
+// Base URL for the /health and /v1/chat/completions requests; the port matches
+// the "--port 8000" argument passed to the server in startServer().
+const BASE_URL = "http://127.0.0.1:8000";
+// Server executable, given relative to the directory the script is started from
+// (startServer() spawns it with cwd = process.cwd()).
+const LLAMA = String.raw`llama_bin_run\llama-server.exe`;
+// Value passed to the server as "-c".
+const CTX = 262144;
+// Number of timed bench() calls per configuration in testConfig().
+const RUNS = 3;
+// Default max_tokens for bench().
+const TOKENS = 200;
+// Default time, in milliseconds, that waitReady() polls /health before giving up.
+const BOOT_TIMEOUT = 300_000;
+
+// Models to test, in order. `path` is relative to the working directory;
+// `sizeGB` is only printed in the per-model log header.
+const MODELS = [
+  {
+    name: "Qwen3.5 UD-IQ4_NL",
+    path: String.raw`models\Qwen3.5-35B-A3B-UD-IQ4_NL.gguf`,
+    sizeGB: 16.6,
+  },
+  {
+    name: "Qwen3.5 MXFP4_MOE",
+    path: String.raw`models\Qwen3.5-35B-A3B-MXFP4_MOE.gguf`,
+    sizeGB: 20.11,
+  },
+  {
+    name: "Qwen3.5 Q4_K_M",
+    path: String.raw`models\Qwen3.5-35B-A3B-Q4_K_M.gguf`,
+    sizeGB: 20.5,
+  },
+];
+
+// Every result record produced so far; rewritten to
+// scripts/qwen_fullgpu_results.json after each successful configuration.
+const ALL = [];
+// Handle of the server child process started by startServer(), or null.
+let proc = null;
+// console.log with a 24-hour ko-KR wall-clock time prefix.
+const log = (m) => console.log(`[${new Date().toLocaleTimeString("ko-KR",{hour12:false})}] ${m}`);
+// Promise that resolves after `ms` milliseconds.
+const sleep = (ms) => new Promise(r => setTimeout(r, ms));
+
+/**
+ * Stop the server: SIGKILL the tracked child process (if any), force-kill every
+ * llama-server.exe through taskkill, then wait 5 seconds.
+ * Failures of either kill attempt are ignored on purpose (best effort).
+ *
+ * @returns {Promise<void>}
+ */
+async function kill() {
+  const child = proc;
+  proc = null;
+  if (child) {
+    try {
+      child.kill("SIGKILL");
+    } catch {}
+  }
+  try {
+    execSync("taskkill /F /IM llama-server.exe", { stdio: "ignore" });
+  } catch {}
+  await sleep(5000);
+}
+
+/**
+ * Query nvidia-smi for per-GPU memory usage.
+ *
+ * @returns {{gpu: number, used: number, total: number}[]} One entry per output
+ *   line of nvidia-smi, parsed as integers; an empty array when the command
+ *   fails or exceeds its 5-second timeout.
+ */
+function vram() {
+  const query = 'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv,noheader,nounits';
+  let output;
+  try {
+    output = execSync(query, { encoding: "utf-8", timeout: 5000 });
+  } catch {
+    return [];
+  }
+  const rows = [];
+  for (const line of output.trim().split("\n")) {
+    const [gpu, used, total] = line.split(",").map(s => parseInt(s));
+    rows.push({ gpu, used, total });
+  }
+  return rows;
+}
+
+/**
+ * Spawn llama-server for one configuration and remember it in `proc`.
+ *
+ * @param {string} modelPath - Value passed as "--model".
+ * @param {Object} p - Configuration. Falsy fields fall back to the defaults
+ *   below or leave the corresponding flag out.
+ * @param {string} [p.ctk="q4_0"] - "--cache-type-k".
+ * @param {string} [p.ctv="q4_0"] - "--cache-type-v".
+ * @param {number} [p.ub=512] - "-ub".
+ * @param {number} [p.b=2048] - "-b".
+ * @param {number} [p.t=4] - Used for both "-t" and "-tb".
+ * @param {boolean} [p.cpuMoe] - Adds "--cpu-moe"; takes precedence over nCpuMoe.
+ * @param {number} [p.nCpuMoe] - Adds "--n-cpu-moe <n>".
+ * @param {string} [p.splitMode] - Adds "--split-mode <mode>".
+ * @param {string} [p.tensorSplit] - Adds "--tensor-split <spec>".
+ * @param {boolean} [p.noMmap] - Adds "--no-mmap".
+ * @param {number} [p.defragThold] - Adds "--defrag-thold <value>".
+ * @param {boolean} [p.noKvOffload] - Adds "--no-kv-offload".
+ * @returns {import("child_process").ChildProcess} The spawned server process.
+ */
+function startServer(modelPath, p) {
+  const args = [
+    "--model", modelPath, "-ngl", "999",
+    "-c", String(CTX), "-np", "1", "-fa", "on",
+    "--cache-type-k", p.ctk || "q4_0",
+    "--cache-type-v", p.ctv || "q4_0",
+    "-ub", String(p.ub || 512), "-b", String(p.b || 2048),
+    "-t", String(p.t || 4), "-tb", String(p.t || 4),
+    "--prio", "3", "--poll", "50", "--mlock",
+    "--port", "8000", "--host", "0.0.0.0",
+  ];
+
+  // GPU offload strategy
+  if (p.cpuMoe) args.push("--cpu-moe");
+  else if (p.nCpuMoe) args.push("--n-cpu-moe", String(p.nCpuMoe));
+
+  // VRAM saving options
+  if (p.splitMode) args.push("--split-mode", p.splitMode);
+  if (p.tensorSplit) args.push("--tensor-split", p.tensorSplit);
+  if (p.noMmap) args.push("--no-mmap");
+  if (p.defragThold) args.push("--defrag-thold", String(p.defragThold));
+  if (p.noKvOffload) args.push("--no-kv-offload");
+
+  // Only the tail of the command line is logged to keep the line short.
+  const cmdStr = args.join(" ");
+  log(`  CMD: ...${cmdStr.slice(-80)}`);
+  proc = spawn(LLAMA, args, { cwd: process.cwd(), stdio: ["ignore", "pipe", "pipe"] });
+  // Nothing reads the server output. Put both pipes into flowing mode so the
+  // data is discarded; otherwise a full pipe buffer would block the server.
+  proc.stdout?.resume();
+  proc.stderr?.resume();
+  return proc;
+}
+
+/**
+ * Poll the server's /health endpoint every 3 seconds until it reports
+ * status "ok", the tracked server process exits, or `timeout` elapses.
+ *
+ * @param {number} [timeout=BOOT_TIMEOUT] - Maximum time to wait, in milliseconds.
+ * @returns {Promise<{ok: boolean, boot: number}>} `ok` tells whether the server
+ *   became healthy; `boot` is the elapsed time in seconds (the full timeout
+ *   when the wait timed out).
+ */
+async function waitReady(timeout = BOOT_TIMEOUT) {
+  const t0 = Date.now();
+  while (Date.now() - t0 < timeout) {
+    // A server that has already exited (for example, because the model failed
+    // to load) can never become healthy; stop instead of waiting out the timeout.
+    if (proc && (proc.exitCode !== null || proc.signalCode !== null)) {
+      log(`  server process exited before becoming ready (code ${proc.exitCode}, signal ${proc.signalCode})`);
+      return { ok: false, boot: (Date.now() - t0) / 1000 };
+    }
+    try {
+      const r = await fetch(`${BASE_URL}/health`, { signal: AbortSignal.timeout(3000) });
+      const d = await r.json();
+      if (d.status === "ok") return { ok: true, boot: (Date.now() - t0) / 1000 };
+    } catch {
+      // Connection refused, probe timeout, or a non-JSON body: not ready yet.
+    }
+    await sleep(3000);
+  }
+  return { ok: false, boot: timeout / 1000 };
+}
+
+/**
+ * Run one fixed-prompt chat completion and measure end-to-end throughput.
+ *
+ * The rate is completion tokens divided by the wall-clock time of the whole
+ * request, so it includes prompt processing and HTTP overhead.
+ *
+ * @param {number} [n=TOKENS] - Value sent as `max_tokens`.
+ * @returns {Promise<{tps: number, ct: number, dt: number}>} Tokens per second,
+ *   completion token count, and elapsed seconds.
+ * @throws {Error} When the request fails, the server answers with a non-2xx
+ *   status, or the response reports no completion tokens. Without this, such
+ *   a response would be counted as a 0 t/s sample by the callers.
+ */
+async function bench(n = TOKENS) {
+  const t0 = Date.now();
+  const r = await fetch(`${BASE_URL}/v1/chat/completions`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({
+      model: "m",
+      messages: [{ role: "user", content: "Count from 1 to 50, each on new line." }],
+      max_tokens: n, temperature: 0,
+    }),
+    signal: AbortSignal.timeout(600_000),
+  });
+  if (!r.ok) throw new Error(`HTTP ${r.status} ${r.statusText}`);
+  const d = await r.json();
+  const dt = (Date.now() - t0) / 1000;
+  const ct = d.usage?.completion_tokens || 0;
+  if (ct <= 0) throw new Error("response reported no completion tokens");
+  return { tps: ct / dt, ct, dt };
+}
+
+/**
+ * Benchmark one server configuration of one model.
+ *
+ * Restarts the server with `params`, waits for it to become healthy, records
+ * VRAM usage, performs one warmup request and RUNS timed requests, stops the
+ * server, then appends the result to ALL and rewrites the results JSON file.
+ *
+ * @param {{name: string, path: string}} model - Entry from MODELS.
+ * @param {string} label - Configuration label used in logs and in the record.
+ * @param {Object} params - Configuration passed to startServer().
+ * @returns {Promise<Object|null>} The result record, or null when the server
+ *   did not become ready or every timed run failed.
+ */
+async function testConfig(model, label, params) {
+  await kill();
+  log(`  [${label}] Starting...`);
+  startServer(model.path, params);
+  const ready = await waitReady();
+  if (!ready.ok) {
+    log(`  [${label}] ✗ FAILED (timeout ${BOOT_TIMEOUT/1000}s)`);
+    await kill();
+    return null;
+  }
+  const boot = ready.boot;
+
+  // Snapshot VRAM right after boot, before any request is sent.
+  const gpus = vram();
+  let totalUsed = 0;
+  for (const g of gpus) totalUsed += g.used;
+  const perGpu = gpus.map(g => `GPU${g.gpu}:${g.used}/${g.total}`).join(" | ");
+  log(`  [${label}] ✓ Boot:${boot.toFixed(0)}s | VRAM: ${perGpu} (total: ${totalUsed} MiB)`);
+
+  // Warmup request; its outcome is ignored.
+  try {
+    await bench(20);
+  } catch {}
+
+  const speeds = [];
+  for (let run = 1; run <= RUNS; run++) {
+    try {
+      const { tps } = await bench();
+      speeds.push(tps);
+      log(`    Run${run}: ${tps.toFixed(2)} t/s`);
+    } catch (e) {
+      log(`    Run${run}: ERR ${e.message}`);
+    }
+  }
+  await kill();
+
+  if (speeds.length === 0) return null;
+  let sum = 0;
+  for (const s of speeds) sum += s;
+  const avg = sum / speeds.length;
+  const best = speeds.reduce((hi, s) => Math.max(hi, s));
+  log(`  [${label}] ⇒ AVG:${avg.toFixed(2)} BEST:${best.toFixed(2)} t/s`);
+
+  const usesCpuOffload = Boolean(params.cpuMoe || params.nCpuMoe);
+  const res = {
+    model: model.name,
+    label,
+    avg_tps: Number(avg.toFixed(2)),
+    best_tps: Number(best.toFixed(2)),
+    boot: Number(boot.toFixed(1)),
+    vram_total: totalUsed,
+    vram: gpus,
+    params: { ...params, ngl: 999, ctk: params.ctk||"q4_0", ctv: params.ctv||"q4_0" },
+    gpu_only: !usesCpuOffload,
+  };
+  ALL.push(res);
+  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
+  return res;
+}
+
+// ─── Test Strategies ───────────────────────────────────────────
+
+/**
+ * Run the full benchmark sequence for one model.
+ *
+ * Order: six pure-GPU strategies; an n-cpu-moe=5 fallback when none of them
+ * produced a result; one-factor-at-a-time sweeps (threads, batch sizes, KV
+ * cache types) around the best pure-GPU configuration; and a final 5-run
+ * verification of the overall best configuration.
+ *
+ * @param {{name: string, path: string, sizeGB: number}} model - Entry from MODELS.
+ * @returns {Promise<Object|null>} The "FINAL" record when verification
+ *   produced at least one sample, otherwise the best record found, or null
+ *   when the model file is missing or no configuration worked.
+ */
+async function testModel(model) {
+  const rule = "#".repeat(65);
+  log(`\n${rule}`);
+  log(`  ${model.name} (${model.sizeGB} GB)`);
+  if (!existsSync(model.path)) {
+    log("  ✗ File not found!");
+    return null;
+  }
+  log(rule);
+
+  let best = null;
+  // Benchmark one configuration and keep it when it beats the current best average.
+  const run = async (label, params) => {
+    const r = await testConfig(model, label, params);
+    if (r && (!best || r.avg_tps > best.avg_tps)) best = r;
+  };
+
+  // [log banner, label, server params] — executed in this order.
+  const strategies = [
+    ["\n  ── Strategy 1: Pure GPU (default) ──", "pure-GPU default", { t: 4, ub: 512, b: 2048 }],
+    ["\n  ── Strategy 2: Pure GPU, minimal batch ──", "pure-GPU minbatch", { t: 4, ub: 64, b: 256 }],
+    ["\n  ── Strategy 3: Pure GPU + no-mmap + small batch ──", "pure-GPU nommap small", { t: 4, ub: 128, b: 512, noMmap: true }],
+    ["\n  ── Strategy 4: Pure GPU + split-mode row ──", "pure-GPU row-split", { t: 4, ub: 128, b: 512, splitMode: "row" }],
+    ["\n  ── Strategy 5: Pure GPU + tensor-split 0.5,0.5 ──", "pure-GPU ts=0.5,0.5", { t: 4, ub: 128, b: 512, tensorSplit: "0.5,0.5" }],
+    ["\n  ── Strategy 6: Pure GPU ALL tricks ──", "pure-GPU all-tricks", { t: 4, ub: 64, b: 256, noMmap: true, defragThold: 0.1 }],
+  ];
+  for (const [banner, label, params] of strategies) {
+    log(banner);
+    await run(label, params);
+  }
+
+  // Fallback with partial CPU offload when no pure-GPU result exists.
+  if (!(best && best.gpu_only)) {
+    log(`\n  ── Fallback: n-cpu-moe=5 ──`);
+    await run("n-cpu-moe=5 baseline", { t: 4, ub: 256, b: 1024, nCpuMoe: 5 });
+  }
+
+  // Fine-tuning sweeps. Each sweep varies one factor around the parameters
+  // captured here, even if `best` changes while the sweeps are running.
+  if (best?.gpu_only) {
+    log(`\n  ── Pure GPU succeeded! Fine-tuning... ──`);
+    const bp = best.params;
+
+    for (const t of [2, 6, 8]) {
+      if (t !== bp.t) await run(`tune t=${t}`, { ...bp, t });
+    }
+
+    for (const [ub, b] of [[256, 1024], [512, 2048], [256, 2048]]) {
+      if (!(ub === bp.ub && b === bp.b)) await run(`tune ub=${ub} b=${b}`, { ...bp, ub, b });
+    }
+
+    for (const [ctk, ctv] of [["q8_0","q8_0"], ["f16","f16"]]) {
+      await run(`tune kv=${ctk}/${ctv}`, { ...bp, ctk, ctv });
+    }
+  }
+
+  if (!best) return null;
+
+  // Final verification of the winning configuration.
+  log(`\n  ── Final verification (5 runs) ──`);
+  await kill();
+  startServer(model.path, best.params);
+  const { ok, boot } = await waitReady();
+  if (!ok) {
+    await kill();
+    return best;
+  }
+
+  const v = vram();
+  try {
+    await bench(20);
+  } catch {}
+  const finals = [];
+  for (let n = 1; n <= 5; n++) {
+    try {
+      const r = await bench();
+      finals.push(r.tps);
+      log(`    Final ${n}: ${r.tps.toFixed(2)} t/s`);
+    } catch {
+      log(`    Final ${n}: ERR`);
+    }
+  }
+  await kill();
+
+  if (finals.length === 0) {
+    await kill();
+    return best;
+  }
+
+  let sum = 0;
+  for (const s of finals) sum += s;
+  const avg = sum / finals.length;
+  const bst = finals.reduce((hi, s) => Math.max(hi, s));
+  log(`  ★ FINAL: AVG ${avg.toFixed(2)} | BEST ${bst.toFixed(2)} t/s`);
+  const final = {
+    model: model.name,
+    label: "FINAL",
+    avg_tps: Number(avg.toFixed(2)),
+    best_tps: Number(bst.toFixed(2)),
+    boot: Number(boot.toFixed(1)),
+    vram_total: v.reduce((a, g) => a + g.used, 0),
+    vram: v,
+    params: best.params,
+    gpu_only: best.gpu_only,
+  };
+  ALL.push(final);
+  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
+  return final;
+}
+
+// ─── Main ──────────────────────────────────────────────────────
+
+/**
+ * Entry point: benchmark every model in MODELS, rank the per-model winners by
+ * average tokens per second, print the ranking, and write the summary text
+ * file and the results JSON file under scripts/.
+ *
+ * @returns {Promise<void>}
+ */
+async function main() {
+  const startedAt = Date.now();
+  const bar65 = "=".repeat(65);
+  log(bar65);
+  log("  QWEN3.5 FULL-GPU CHALLENGE — 70 t/s TARGET");
+  log("  2x RTX 3060 (24GB) | 256K Context");
+  log("  " + new Date().toISOString());
+  log(bar65);
+  for (const g of vram()) {
+    log(`  GPU${g.gpu}: ${g.used}/${g.total} MiB`);
+  }
+
+  // One winner per model; models without any working configuration are left out.
+  const winners = [];
+  for (const model of MODELS) {
+    const winner = await testModel(model);
+    if (winner) winners.push(winner);
+  }
+
+  // ─── Summary ──────────────────────────────────────────────
+  const elapsed = ((Date.now() - startedAt) / 60000).toFixed(1);
+  winners.sort((a, b) => b.avg_tps - a.avg_tps);
+
+  const bar55 = "=".repeat(55);
+  const lines = [
+    `Qwen3.5 Full-GPU Challenge — ${new Date().toISOString()}`,
+    `2x RTX 3060 12GB | 256K Context | ${ALL.length} configs | ${elapsed} min`,
+    "",
+    bar55,
+    "  RANKING",
+    bar55,
+  ];
+  winners.forEach((w, i) => {
+    const p = w.params;
+    const gpu = w.gpu_only ? "★ FULL GPU" : "⚠ CPU offload";
+    // Only the optional flags that were actually set are listed.
+    const flags = [
+      p.splitMode && `split=${p.splitMode}`,
+      p.tensorSplit && `ts=${p.tensorSplit}`,
+      p.noMmap && "no-mmap",
+      p.nCpuMoe && `n-cpu-moe=${p.nCpuMoe}`,
+    ].filter(Boolean);
+    lines.push(
+      "",
+      `  #${i+1}: ${w.model} [${gpu}]`,
+      `      AVG: ${w.avg_tps} t/s | BEST: ${w.best_tps} t/s | Boot: ${w.boot}s`,
+      `      VRAM: ${w.vram_total} MiB total`,
+      `      t=${p.t} ub=${p.ub} b=${p.b} kv=${p.ctk}/${p.ctv} ${flags.join(" ")}`,
+    );
+  });
+
+  const [champion] = winners;
+  if (champion) {
+    lines.push(
+      "",
+      bar55,
+      `  ★ CHAMPION: ${champion.model} — ${champion.avg_tps} t/s [${champion.gpu_only?"FULL GPU":"CPU offload"}]`,
+      bar55,
+    );
+  }
+
+  const summary = lines.join("\n");
+  console.log("\n" + summary);
+  writeFileSync("scripts/qwen_fullgpu_summary.txt", summary, "utf-8");
+  writeFileSync("scripts/qwen_fullgpu_results.json", JSON.stringify(ALL, null, 2));
+  log(`\n  Saved: qwen_fullgpu_results.json + qwen_fullgpu_summary.txt`);
+  log("  DONE!");
+  await kill();
+}
+
+// On a fatal error, stop the server before exiting so that no llama-server
+// process is left running in the background.
+main().catch(async (e) => {
+  console.error("FATAL:", e);
+  await kill();
+  process.exit(1);
+});
--- a/scripts/qwen_fullgpu_results.json
+++ b/scripts/qwen_fullgpu_results.json
@@ -0,0 +1,834 @@
+[
+  {
+    "model": "Qwen3.5 UD-IQ4_NL",
+    "label": "pure-GPU minbatch",
+    "avg_tps": 65.11,
+    "best_tps": 65.49,
+    "boot": 9,
+    "vram_total": 19177,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 10039,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 9138,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 64,
+      "b": 256,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 UD-IQ4_NL",
+    "label": "pure-GPU nommap small",
+    "avg_tps": 65.01,
+    "best_tps": 65.36,
+    "boot": 6,
+    "vram_total": 19672,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 10342,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 9330,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 128,
+      "b": 512,
+      "noMmap": true,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 UD-IQ4_NL",
+    "label": "pure-GPU row-split",
+    "avg_tps": 13.65,
+    "best_tps": 14.82,
+    "boot": 9,
+    "vram_total": 19427,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 10311,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 9116,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 128,
+      "b": 512,
+      "splitMode": "row",
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 UD-IQ4_NL",
+    "label": "pure-GPU ts=0.5,0.5",
+    "avg_tps": 64.92,
+    "best_tps": 65.23,
+    "boot": 9,
+    "vram_total": 19664,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 10334,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 9330,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 128,
+      "b": 512,
+      "tensorSplit": "0.5,0.5",
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 UD-IQ4_NL",
+    "label": "pure-GPU all-tricks",
+    "avg_tps": 64.72,
+    "best_tps": 64.89,
+    "boot": 6,
+    "vram_total": 19171,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 10033,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 9138,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 64,
+      "b": 256,
+      "noMmap": true,
+      "defragThold": 0.1,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 UD-IQ4_NL",
+    "label": "tune t=2",
+    "avg_tps": 64.87,
+    "best_tps": 65.13,
+    "boot": 9,
+    "vram_total": 19170,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 10032,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 9138,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 2,
+      "ub": 64,
+      "b": 256,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 UD-IQ4_NL",
+    "label": "tune t=6",
+    "avg_tps": 64.88,
+    "best_tps": 65.17,
+    "boot": 9,
+    "vram_total": 19168,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 10030,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 9138,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 6,
+      "ub": 64,
+      "b": 256,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 UD-IQ4_NL",
+    "label": "tune t=8",
+    "avg_tps": 64.5,
+    "best_tps": 64.77,
+    "boot": 9,
+    "vram_total": 19168,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 10030,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 9138,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 8,
+      "ub": 64,
+      "b": 256,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 UD-IQ4_NL",
+    "label": "tune ub=256 b=1024",
+    "avg_tps": 64.73,
+    "best_tps": 64.98,
+    "boot": 9,
+    "vram_total": 20640,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 10928,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 9712,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 256,
+      "b": 1024,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 UD-IQ4_NL",
+    "label": "tune ub=256 b=2048",
+    "avg_tps": 63.69,
+    "best_tps": 64.94,
+    "boot": 12,
+    "vram_total": 20614,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 10902,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 9712,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 256,
+      "b": 2048,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 UD-IQ4_NL",
+    "label": "tune kv=q8_0/q8_0",
+    "avg_tps": 64.78,
+    "best_tps": 65.08,
+    "boot": 9,
+    "vram_total": 20422,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 10644,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 9778,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 64,
+      "b": 256,
+      "ngl": 999,
+      "ctk": "q8_0",
+      "ctv": "q8_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 UD-IQ4_NL",
+    "label": "tune kv=f16/f16",
+    "avg_tps": 65.53,
+    "best_tps": 65.81,
+    "boot": 9,
+    "vram_total": 22812,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 11846,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10966,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 64,
+      "b": 256,
+      "ngl": 999,
+      "ctk": "f16",
+      "ctv": "f16"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 UD-IQ4_NL",
+    "label": "FINAL",
+    "avg_tps": 66.31,
+    "best_tps": 66.53,
+    "boot": 9,
+    "vram_total": 22811,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 11845,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10966,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 64,
+      "b": 256,
+      "ngl": 999,
+      "ctk": "f16",
+      "ctv": "f16"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 MXFP4_MOE",
+    "label": "pure-GPU minbatch",
+    "avg_tps": 63.06,
+    "best_tps": 64.16,
+    "boot": 12,
+    "vram_total": 22747,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 11895,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10852,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 64,
+      "b": 256,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 MXFP4_MOE",
+    "label": "pure-GPU nommap small",
+    "avg_tps": 63.75,
+    "best_tps": 63.98,
+    "boot": 9,
+    "vram_total": 22579,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 11797,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10782,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 128,
+      "b": 512,
+      "noMmap": true,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 MXFP4_MOE",
+    "label": "pure-GPU ts=0.5,0.5",
+    "avg_tps": 62.88,
+    "best_tps": 63.9,
+    "boot": 12,
+    "vram_total": 22578,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 11796,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10782,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 128,
+      "b": 512,
+      "tensorSplit": "0.5,0.5",
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 MXFP4_MOE",
+    "label": "pure-GPU all-tricks",
+    "avg_tps": 62.55,
+    "best_tps": 63.71,
+    "boot": 9,
+    "vram_total": 22743,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 11891,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10852,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 64,
+      "b": 256,
+      "noMmap": true,
+      "defragThold": 0.1,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 MXFP4_MOE",
+    "label": "tune t=2",
+    "avg_tps": 63.07,
+    "best_tps": 64.08,
+    "boot": 9,
+    "vram_total": 22601,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 11819,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10782,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 2,
+      "ub": 128,
+      "b": 512,
+      "noMmap": true,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 MXFP4_MOE",
+    "label": "tune t=6",
+    "avg_tps": 63.58,
+    "best_tps": 64.04,
+    "boot": 9,
+    "vram_total": 22583,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 11801,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10782,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 6,
+      "ub": 128,
+      "b": 512,
+      "noMmap": true,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 MXFP4_MOE",
+    "label": "tune t=8",
+    "avg_tps": 62.92,
+    "best_tps": 63.73,
+    "boot": 9,
+    "vram_total": 22536,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 11754,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10782,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 8,
+      "ub": 128,
+      "b": 512,
+      "noMmap": true,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 MXFP4_MOE",
+    "label": "tune ub=256 b=1024",
+    "avg_tps": 62.76,
+    "best_tps": 63.86,
+    "boot": 9,
+    "vram_total": 22874,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 11968,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10906,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 256,
+      "b": 1024,
+      "noMmap": true,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 MXFP4_MOE",
+    "label": "tune ub=256 b=2048",
+    "avg_tps": 62.74,
+    "best_tps": 63.9,
+    "boot": 9,
+    "vram_total": 22912,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 12006,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10906,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 256,
+      "b": 2048,
+      "noMmap": true,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 MXFP4_MOE",
+    "label": "FINAL",
+    "avg_tps": 63.71,
+    "best_tps": 64.39,
+    "boot": 9,
+    "vram_total": 22566,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 11784,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10782,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 128,
+      "b": 512,
+      "noMmap": true,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 Q4_K_M",
+    "label": "pure-GPU nommap small",
+    "avg_tps": 62.29,
+    "best_tps": 63.03,
+    "boot": 9,
+    "vram_total": 22975,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 12007,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10968,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 128,
+      "b": 512,
+      "noMmap": true,
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 Q4_K_M",
+    "label": "pure-GPU ts=0.5,0.5",
+    "avg_tps": 63.89,
+    "best_tps": 64.91,
+    "boot": 12,
+    "vram_total": 23002,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 12034,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10968,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 4,
+      "ub": 128,
+      "b": 512,
+      "tensorSplit": "0.5,0.5",
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 Q4_K_M",
+    "label": "tune t=2",
+    "avg_tps": 64.1,
+    "best_tps": 64.54,
+    "boot": 12,
+    "vram_total": 22980,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 12012,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10968,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 2,
+      "ub": 128,
+      "b": 512,
+      "tensorSplit": "0.5,0.5",
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 Q4_K_M",
+    "label": "tune t=6",
+    "avg_tps": 64.18,
+    "best_tps": 64.72,
+    "boot": 12,
+    "vram_total": 22982,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 12014,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10968,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 6,
+      "ub": 128,
+      "b": 512,
+      "tensorSplit": "0.5,0.5",
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  },
+  {
+    "model": "Qwen3.5 Q4_K_M",
+    "label": "tune t=8",
+    "avg_tps": 63.11,
+    "best_tps": 64.02,
+    "boot": 12,
+    "vram_total": 22980,
+    "vram": [
+      {
+        "gpu": 0,
+        "used": 12012,
+        "total": 12288
+      },
+      {
+        "gpu": 1,
+        "used": 10968,
+        "total": 12288
+      }
+    ],
+    "params": {
+      "t": 8,
+      "ub": 128,
+      "b": 512,
+      "tensorSplit": "0.5,0.5",
+      "ngl": 999,
+      "ctk": "q4_0",
+      "ctv": "q4_0"
+    },
+    "gpu_only": true
+  }
+]
--- a/scripts/qwen_intermediate.csv
+++ b/scripts/qwen_intermediate.csv
@@ -0,0 +1,12 @@
+model,label,avg,best,mode,vram,t,ub,b,kv,split,mmap
+UD-IQ4_NL,pure-GPU minbatch,65.11,65.49,GPU,19177,t4,ub64,b256,q4_0/q4_0,,
+UD-IQ4_NL,pure-GPU nommap small,65.01,65.36,GPU,19672,t4,ub128,b512,q4_0/q4_0,,nommap
+UD-IQ4_NL,pure-GPU row-split,13.65,14.82,GPU,19427,t4,ub128,b512,q4_0/q4_0,row,
+UD-IQ4_NL,"pure-GPU ts=0.5,0.5",64.92,65.23,GPU,19664,t4,ub128,b512,q4_0/q4_0,,
+UD-IQ4_NL,pure-GPU all-tricks,64.72,64.89,GPU,19171,t4,ub64,b256,q4_0/q4_0,,nommap
+UD-IQ4_NL,tune t=2,64.87,65.13,GPU,19170,t2,ub64,b256,q4_0/q4_0,,
+UD-IQ4_NL,tune t=6,64.88,65.17,GPU,19168,t6,ub64,b256,q4_0/q4_0,,
+UD-IQ4_NL,tune t=8,64.5,64.77,GPU,19168,t8,ub64,b256,q4_0/q4_0,,
+UD-IQ4_NL,tune ub=256 b=1024,64.73,64.98,GPU,20640,t4,ub256,b1024,q4_0/q4_0,,
+UD-IQ4_NL,tune ub=256 b=2048,63.69,64.94,GPU,20614,t4,ub256,b2048,q4_0/q4_0,,
+UD-IQ4_NL,tune kv=q8_0/q8_0,64.78,65.08,GPU,20422,t4,ub64,b256,q8_0/q8_0,,
--- a/scripts/qwen_latest.txt
+++ b/scripts/qwen_latest.txt
@@ -0,0 +1,24 @@
+UD-IQ4_NL    | pure-GPU minbatch         |  65.11 | GPU | 19177
+UD-IQ4_NL    | pure-GPU nommap small     |  65.01 | GPU | 19672
+UD-IQ4_NL    | pure-GPU row-split        |  13.65 | GPU | 19427
+UD-IQ4_NL    | pure-GPU ts=0.5,0.5       |  64.92 | GPU | 19664
+UD-IQ4_NL    | pure-GPU all-tricks       |  64.72 | GPU | 19171
+UD-IQ4_NL    | tune t=2                  |  64.87 | GPU | 19170
+UD-IQ4_NL    | tune t=6                  |  64.88 | GPU | 19168
+UD-IQ4_NL    | tune t=8                  |   64.5 | GPU | 19168
+UD-IQ4_NL    | tune ub=256 b=1024        |  64.73 | GPU | 20640
+UD-IQ4_NL    | tune ub=256 b=2048        |  63.69 | GPU | 20614
+UD-IQ4_NL    | tune kv=q8_0/q8_0         |  64.78 | GPU | 20422
+UD-IQ4_NL    | tune kv=f16/f16           |  65.53 | GPU | 22812
+UD-IQ4_NL    | FINAL                     |  66.31 | GPU | 22811
+MXFP4_MOE    | pure-GPU minbatch         |  63.06 | GPU | 22747
+MXFP4_MOE    | pure-GPU nommap small     |  63.75 | GPU | 22579
+MXFP4_MOE    | pure-GPU ts=0.5,0.5       |  62.88 | GPU | 22578
+MXFP4_MOE    | pure-GPU all-tricks       |  62.55 | GPU | 22743
+MXFP4_MOE    | tune t=2                  |  63.07 | GPU | 22601
+MXFP4_MOE    | tune t=6                  |  63.58 | GPU | 22583
+MXFP4_MOE    | tune t=8                  |  62.92 | GPU | 22536
+MXFP4_MOE    | tune ub=256 b=1024        |  62.76 | GPU | 22874
+MXFP4_MOE    | tune ub=256 b=2048        |  62.74 | GPU | 22912
+MXFP4_MOE    | FINAL                     |  63.71 | GPU | 22566
+Q4_K_M       | pure-GPU nommap small     |  62.29 | GPU | 22975
--- a/scripts/test_20ts.txt
+++ b/scripts/test_20ts.txt
--- a/scripts/tune_122b_20ts.mjs
+++ b/scripts/tune_122b_20ts.mjs
@@ -0,0 +1,64 @@
+import { exec, spawn } from 'child_process';
+
+const delay = ms => new Promise(res => setTimeout(res, ms));
+
+/**
+ * Restart llama-server with the given arguments, wait for /health to answer
+ * with HTTP 200, then run scripts/quick_pptest.mjs and print its output.
+ *
+ * @param {string} modelArgs - Server arguments as one string; it is split on
+ *   single spaces, so individual values must not contain spaces.
+ * @param {Object} envVars - Extra environment variables layered over process.env.
+ * @param {string} name - Label used in the log lines.
+ * @returns {Promise<{success: boolean}>} `success` is false when the server
+ *   did not become ready within 40 probes.
+ */
+async function runTest(modelArgs, envVars, name) {
+    console.log(`\n===========================================`);
+    console.log(`Testing: ${name}`);
+    console.log(`Args: ${modelArgs}`);
+
+    // Best-effort kill of any running server; taskkill errors (for example,
+    // no matching process) are deliberately ignored.
+    const killServer = () => new Promise(done => exec('taskkill /F /IM llama-server.exe', () => done()));
+
+    await killServer();
+    await delay(2000);
+
+    const env = { ...process.env, ...envVars };
+    const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
+        detached: true,
+        stdio: 'ignore',
+        env
+    });
+    // Without a listener, a failed spawn (for example, a missing executable)
+    // would surface as an uncaught 'error' event and abort the whole run.
+    let spawnError = null;
+    server.once('error', (err) => { spawnError = err; });
+
+    let ready = false;
+    for (let i = 0; i < 40 && !spawnError; i++) {
+        try {
+            // fetch() has no `timeout` option; an abort signal bounds each probe.
+            const res = await fetch('http://127.0.0.1:8000/health', { signal: AbortSignal.timeout(2000) });
+            if (res.status === 200) {
+                ready = true;
+                break;
+            }
+        } catch (e) {
+            // Connection refused or probe timeout: the server is not up yet.
+        }
+        await delay(3000);
+    }
+
+    if (!ready) {
+        if (spawnError) console.log(`[${name}] spawn error: ${spawnError.message}`);
+        console.log(`[${name}] FAILED TO BOOT`);
+        await killServer();
+        return { success: false };
+    }
+
+    console.log(`[${name}] Server Ready! Running benchmark...`);
+    // Adapt the callback API; a failing benchmark still reports its output.
+    const output = await new Promise(done => {
+        exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => done(stdout || stderr));
+    });
+    console.log(output);
+    await killServer();
+    return { success: true };
+}
+
+/**
+ * Run the 122B benchmark once per --n-cpu-moe value, one after another.
+ * All three values are always tested in the listed order, whatever the
+ * outcome of the earlier runs.
+ */
+async function main() {
+    const baseLineArgs = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 8192 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 256 -b 1024 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
+
+    // Dual GPU with layer split mode:
+    //   32 - first estimate for maximizing VRAM usage
+    //   28 - more aggressive GPU usage
+    //   36 - more conservative value in case 32/28 run out of memory
+    const nCpuMoeValues = [32, 28, 36];
+    for (const n of nCpuMoeValues) {
+        await runTest(`${baseLineArgs} --split-mode layer --n-cpu-moe ${n}`, {}, `122B: n-cpu-moe ${n} + sm layer`);
+    }
+
+    console.log("\nALL TESTS COMPLETED");
+}
+
+main();
--- a/scripts/tune_exact.mjs
+++ b/scripts/tune_exact.mjs
@@ -0,0 +1,72 @@
+import { exec, spawn } from 'child_process';
+
+const delay = ms => new Promise(res => setTimeout(res, ms));
+
+async function runTest(modelArgs, envVars, name) {
+    console.log(`\n===========================================`);
+    console.log(`Testing: ${name}`);
+    console.log(`Env: ${JSON.stringify(envVars)}`);
+    console.log(`Args: ${modelArgs}`);
+    
+    return new Promise(async (resolve) => {
+        await new Promise(r => exec('taskkill /F /IM llama-server.exe', r));
+        await delay(2000);
+
+        const env = { ...process.env, ...envVars };
+        const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
+            detached: true,
+            stdio: 'ignore',
+            env
+        });
+
+        let ready = false;
+        
+        for (let i = 0; i < 40; i++) {
+            try {
+                const res = await fetch('http://127.0.0.1:8000/health', { timeout: 2000 });
+                if (res.status === 200) {
+                    ready = true;
+                    break;
+                }
+            } catch (e) {}
+            await delay(3000);
+        }
+
+        if (!ready) {
+            console.log(`[${name}] FAILED TO BOOT`);
+            exec('taskkill /F /IM llama-server.exe');
+            resolve({ success: false });
+            return;
+        }
+
+        console.log(`[${name}] Server Ready! Running speed test...`);
+        exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
+            console.log(stdout || stderr);
+            exec('taskkill /F /IM llama-server.exe');
+            resolve({ success: true });
+        });
+    });
+}
+
+async function main() {
+    // 1. 122B-A10B: Pure GPU offload (No n-cpu-moe at all)
+    // -ngl 999 will offload all 48 layers to the NVIDIA driver (triggering Shared VRAM fallback on Windows)
+    const args122B = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
+    await runTest(args122B, {}, "122B-A10B: Pure GPU (NVIDIA Shared Memory Fallback)");
+
+    // 2. 35B-A3B: Pure GPU tuning to hit 70 t/s
+    // Base configuration from previous full-gpu run:
+    const args35B = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 128 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
+    
+    // We already got ~64 t/s basically. 
+    // Let's try MMQ for custom matrix multiplication which is often faster on Ampere for low batch generation
+    await runTest(args35B, { GGML_CUDA_FORCE_MMQ: "1" }, "35B-A3B: Force MMQ = 1");
+    
+    // Try increasing threads to 12 just in case
+    const args35B_t12 = args35B.replace("-t 6 -tb 6", "-t 12 -tb 12");
+    await runTest(args35B_t12, { GGML_CUDA_FORCE_MMQ: "1" }, "35B-A3B: Threads 12 + MMQ");
+
+    console.log("\nALL TESTS COMPLETED");
+}
+
+main();
--- a/scripts/tune_models.mjs
+++ b/scripts/tune_models.mjs
@@ -0,0 +1,84 @@
+import { exec, spawn } from 'child_process';
+
+const delay = ms => new Promise(res => setTimeout(res, ms));
+
+async function runTest(modelArgs, name) {
+    console.log(`\n===========================================`);
+    console.log(`Testing: ${name}`);
+    console.log(`Args: ${modelArgs}`);
+    
+    return new Promise(async (resolve) => {
+        // Kill existing
+        await new Promise(r => exec('taskkill /F /IM llama-server.exe', r));
+        await delay(2000);
+
+        const server = spawn('llama_bin_run\\llama-server.exe', modelArgs.split(' '), {
+            detached: true,
+            stdio: 'ignore' 
+        });
+
+        let ready = false;
+        let oom = false;
+        
+        for (let i = 0; i < 40; i++) {
+            try {
+                const res = await fetch('http://127.0.0.1:8000/health', { timeout: 2000 });
+                if (res.status === 200) {
+                    ready = true;
+                    break;
+                }
+            } catch (e) {}
+            await delay(3000);
+        }
+
+        if (!ready) {
+            console.log(`[${name}] FAILED TO BOOT (Likely OOM)`);
+            exec('taskkill /F /IM llama-server.exe');
+            resolve({ success: false });
+            return;
+        }
+
+        console.log(`[${name}] Server Ready! Running benchmark...`);
+        // Run pptest
+        exec('node scripts/quick_pptest.mjs', (err, stdout, stderr) => {
+            console.log(stdout || stderr);
+            
+            // Extract TG and PP from TG-500
+            const tgMatch = stdout.match(/TG-500 \| PP:\d+tok \d+\.\dt\/s \| TG:\d+tok (\d+\.\d+)t\/s/);
+            const ppMatch = stdout.match(/10K-CODE \| PP:\d+tok (\d+\.\d+)t\/s/);
+            
+            const tg = tgMatch ? parseFloat(tgMatch[1]) : 0;
+            const pp = ppMatch ? parseFloat(ppMatch[1]) : 0;
+            
+            exec('taskkill /F /IM llama-server.exe');
+            resolve({ success: true, tg, pp });
+        });
+    });
+}
+
+async function main() {
+    // 1. Qwen 35B Tuning: We need 70 t/s. Let's try 1-3 layers of n-cpu-moe to unlock ub=512
+    const args35B_base = `--model models\\Qwen3.5-35B-A3B-Q4_K_M.gguf -ngl 999 -c 262144 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -b 512 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
+    
+    // Test 1: n-cpu-moe 1, ub 512
+    await runTest(`${args35B_base} -ub 512 --n-cpu-moe 1`, "Qwen-35B: moe=1, ub=512");
+    
+    // Test 2: n-cpu-moe 2, ub 512
+    await runTest(`${args35B_base} -ub 512 --n-cpu-moe 2`, "Qwen-35B: moe=2, ub=512");
+
+    // Test 3: n-cpu-moe 4, ub 512
+    await runTest(`${args35B_base} -ub 512 --n-cpu-moe 4`, "Qwen-35B: moe=4, ub=512");
+    
+    // 2. 122B Tuning: Find optimal n-cpu-moe
+    const args122B_base = `--model models\\Q4_K_M\\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf -ngl 999 -c 32768 -np 1 -fa on --cache-type-k q4_0 --cache-type-v q4_0 -ub 512 -b 2048 -t 6 -tb 6 --prio 3 --fit off --port 8000 --host 0.0.0.0`;
+    
+    // Since 48 leaves 16GB free, each layer is ~1.5GB total, meaning ~0.75GB per GPU.
+    // Let's try 38, 35, 30
+    await runTest(`${args122B_base} --n-cpu-moe 38`, "Qwen-122B: moe=38");
+    await runTest(`${args122B_base} --n-cpu-moe 30`, "Qwen-122B: moe=30");
+    await runTest(`${args122B_base} --n-cpu-moe 22`, "Qwen-122B: moe=22");
+
+    console.log("Tuning finished.");
+}
+
+main();
--- a/scripts/tune_results_gemma4_256k.json
+++ b/scripts/tune_results_gemma4_256k.json
@@ -0,0 +1,591 @@
+[
+  {
+    "ngl": 22,
+    "t": 8,
+    "tb": 8,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.22049935826915,
+    "best_tps": 25.971732307567606,
+    "vram_used": 11953,
+    "vram_total": 12288,
+    "label": "ngl=22"
+  },
+  {
+    "ngl": 21,
+    "t": 8,
+    "tb": 8,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.805518952775174,
+    "best_tps": 25.953896683689454,
+    "vram_used": 11942,
+    "vram_total": 12288,
+    "label": "ngl=21"
+  },
+  {
+    "ngl": 20,
+    "t": 8,
+    "tb": 8,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 23.537353232262834,
+    "best_tps": 24.32109262330477,
+    "vram_used": 11972,
+    "vram_total": 12288,
+    "label": "ngl=20"
+  },
+  {
+    "ngl": 21,
+    "t": 2,
+    "tb": 2,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 20.167581352340264,
+    "best_tps": 20.701192443418005,
+    "vram_used": 11969,
+    "vram_total": 12288,
+    "label": "t=2 | tb=2"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.689104997668554,
+    "best_tps": 26.328541632880874,
+    "vram_used": 11975,
+    "vram_total": 12288,
+    "label": "t=4 | tb=4"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 8,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.294470150452725,
+    "best_tps": 26.541251363470614,
+    "vram_used": 11984,
+    "vram_total": 12288,
+    "label": "t=4 | tb=8"
+  },
+  {
+    "ngl": 21,
+    "t": 6,
+    "tb": 6,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.307859289404675,
+    "best_tps": 26.292208504543133,
+    "vram_used": 11984,
+    "vram_total": 12288,
+    "label": "t=6 | tb=6"
+  },
+  {
+    "ngl": 21,
+    "t": 6,
+    "tb": 8,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.230599923243314,
+    "best_tps": 26.366065850165732,
+    "vram_used": 11983,
+    "vram_total": 12288,
+    "label": "t=6 | tb=8"
+  },
+  {
+    "ngl": 21,
+    "t": 8,
+    "tb": 8,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.113108026759278,
+    "best_tps": 26.123872617669583,
+    "vram_used": 11984,
+    "vram_total": 12288,
+    "label": "t=8 | tb=8"
+  },
+  {
+    "ngl": 21,
+    "t": 8,
+    "tb": 12,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.05545428888364,
+    "best_tps": 26.06377500079152,
+    "vram_used": 11983,
+    "vram_total": 12288,
+    "label": "t=8 | tb=12"
+  },
+  {
+    "ngl": 21,
+    "t": 10,
+    "tb": 10,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 24.706926870374986,
+    "best_tps": 25.03033604251865,
+    "vram_used": 11984,
+    "vram_total": 12288,
+    "label": "t=10 | tb=10"
+  },
+  {
+    "ngl": 21,
+    "t": 12,
+    "tb": 12,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 22.468055564001904,
+    "best_tps": 23.425983251691825,
+    "vram_used": 11989,
+    "vram_total": 12288,
+    "label": "t=12 | tb=12"
+  },
+  {
+    "ngl": 21,
+    "t": 16,
+    "tb": 16,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 21.176973905195442,
+    "best_tps": 21.482429642395456,
+    "vram_used": 12021,
+    "vram_total": 12288,
+    "label": "t=16 | tb=16"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 128,
+    "b": 512,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.545748810106186,
+    "best_tps": 26.344547829145817,
+    "vram_used": 11986,
+    "vram_total": 12288,
+    "label": "ub=128 | b=512"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 256,
+    "b": 1024,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.503875205368377,
+    "best_tps": 26.393548686102108,
+    "vram_used": 11981,
+    "vram_total": 12288,
+    "label": "ub=256 | b=1024"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 256,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.46500292415627,
+    "best_tps": 26.2726382287537,
+    "vram_used": 11981,
+    "vram_total": 12288,
+    "label": "ub=256 | b=2048"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 1024,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.50982209452459,
+    "best_tps": 26.292282671074723,
+    "vram_used": 12020,
+    "vram_total": 12288,
+    "label": "ub=512 | b=1024"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.39646674356899,
+    "best_tps": 26.28106356028714,
+    "vram_used": 12020,
+    "vram_total": 12288,
+    "label": "ub=512 | b=2048"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 4096,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.471945933724726,
+    "best_tps": 26.268422652962233,
+    "vram_used": 12021,
+    "vram_total": 12288,
+    "label": "ub=512 | b=4096"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 1024,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.722119623856702,
+    "best_tps": 26.497264927416403,
+    "vram_used": 12019,
+    "vram_total": 12288,
+    "label": "ub=1024 | b=2048"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 1024,
+    "b": 4096,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.665819493145943,
+    "best_tps": 26.301163428594148,
+    "vram_used": 12019,
+    "vram_total": 12288,
+    "label": "ub=1024 | b=4096"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 1024,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.464915272955533,
+    "best_tps": 26.40667691713752,
+    "vram_used": 12019,
+    "vram_total": 12288,
+    "label": "ctk=q4_0 | ctv=q4_0"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 1024,
+    "b": 2048,
+    "ctk": "q8_0",
+    "ctv": "q8_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.489715990281564,
+    "best_tps": 25.884133821146627,
+    "vram_used": 12011,
+    "vram_total": 12288,
+    "label": "ctk=q8_0 | ctv=q8_0"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 1024,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q8_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 22.751034104721082,
+    "best_tps": 22.91250972782414,
+    "vram_used": 12017,
+    "vram_total": 12288,
+    "label": "ctk=q4_0 | ctv=q8_0"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 1024,
+    "b": 2048,
+    "ctk": "f16",
+    "ctv": "f16",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 24.745831571513975,
+    "best_tps": 25.53926086004382,
+    "vram_used": 11985,
+    "vram_total": 12288,
+    "label": "ctk=f16 | ctv=f16"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 1024,
+    "b": 2048,
+    "ctk": "q8_0",
+    "ctv": "q8_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.21575943186602,
+    "best_tps": 25.796865637378264,
+    "vram_used": 12013,
+    "vram_total": 12288,
+    "label": "mmap=True | poll=50 | prio=2"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 1024,
+    "b": 2048,
+    "ctk": "q8_0",
+    "ctv": "q8_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": false,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 23.88172807693179,
+    "best_tps": 24.803356430302312,
+    "vram_used": 12016,
+    "vram_total": 12288,
+    "label": "mmap=False | poll=50 | prio=2"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 1024,
+    "b": 2048,
+    "ctk": "q8_0",
+    "ctv": "q8_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 0,
+    "avg_tps": 25.041321207287698,
+    "best_tps": 25.88479834694897,
+    "vram_used": 12017,
+    "vram_total": 12288,
+    "label": "mmap=True | poll=0 | prio=2"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 1024,
+    "b": 2048,
+    "ctk": "q8_0",
+    "ctv": "q8_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 100,
+    "avg_tps": 25.27990666474703,
+    "best_tps": 26.034861156695197,
+    "vram_used": 12017,
+    "vram_total": 12288,
+    "label": "mmap=True | poll=100 | prio=2"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 1024,
+    "b": 2048,
+    "ctk": "q8_0",
+    "ctv": "q8_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 3,
+    "poll": 50,
+    "avg_tps": 25.360977804679788,
+    "best_tps": 26.0705565191107,
+    "vram_used": 12022,
+    "vram_total": 12288,
+    "label": "mmap=True | poll=50 | prio=3"
+  },
+  {
+    "ngl": 21,
+    "t": 4,
+    "tb": 4,
+    "ub": 1024,
+    "b": 2048,
+    "ctk": "q8_0",
+    "ctv": "q8_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": false,
+    "prio": 3,
+    "poll": 0,
+    "avg_tps": 24.156893523381967,
+    "best_tps": 24.840307911026144,
+    "vram_used": 12021,
+    "vram_total": 12288,
+    "label": "mmap=False | poll=0 | prio=3"
+  }
+]
--- a/scripts/tune_results_gemma4_ncpumoe.json
+++ b/scripts/tune_results_gemma4_ncpumoe.json
@@ -0,0 +1,201 @@
+[
+  {
+    "label": "ncpumoe=0",
+    "ncpumoe": 0,
+    "avg": 15.396949591766335,
+    "best": 20.220093309883133,
+    "vram": 12011,
+    "nommap": false
+  },
+  {
+    "label": "ncpumoe=5",
+    "ncpumoe": 5,
+    "avg": 4.853957926040404,
+    "best": 4.9029479257524216,
+    "vram": 11945,
+    "nommap": false
+  },
+  {
+    "label": "ncpumoe=10",
+    "ncpumoe": 10,
+    "avg": 20.64137159193706,
+    "best": 26.474940718957154,
+    "vram": 12020,
+    "nommap": false
+  },
+  {
+    "label": "ncpumoe=15",
+    "ncpumoe": 15,
+    "avg": 13.424368433101165,
+    "best": 13.698684361880598,
+    "vram": 12018,
+    "nommap": false
+  },
+  {
+    "label": "ncpumoe=20",
+    "ncpumoe": 20,
+    "avg": 10.338449574838693,
+    "best": 13.495275411319872,
+    "vram": 11530,
+    "nommap": true
+  },
+  {
+    "label": "ncpumoe=25",
+    "ncpumoe": 25,
+    "avg": 12.920348175328435,
+    "best": 12.99923042323437,
+    "vram": 11625,
+    "nommap": true
+  },
+  {
+    "label": "ncpumoe=30",
+    "ncpumoe": 30,
+    "avg": 13.251690836275145,
+    "best": 13.253697466971921,
+    "vram": 9064,
+    "nommap": true
+  },
+  {
+    "label": "ncpumoe=7",
+    "ncpumoe": 7,
+    "avg": 16.31796299658782,
+    "best": 23.160760806218782,
+    "vram": 11994,
+    "nommap": false
+  },
+  {
+    "label": "ncpumoe=9",
+    "ncpumoe": 9,
+    "avg": 7.469651892205037,
+    "best": 10.875064047449284,
+    "vram": 11941,
+    "nommap": false
+  },
+  {
+    "label": "ncpumoe=11",
+    "ncpumoe": 11,
+    "avg": 14.814740144776437,
+    "best": 15.199641279675724,
+    "vram": 11984,
+    "nommap": false
+  },
+  {
+    "label": "ncpumoe=13",
+    "ncpumoe": 13,
+    "avg": 14.183175252947136,
+    "best": 14.427257794639086,
+    "vram": 12003,
+    "nommap": false
+  },
+  {
+    "label": "t=2",
+    "ncpumoe": 10,
+    "avg": 28.551811207068425,
+    "best": 28.688565545389164,
+    "vram": 11968,
+    "t": 2,
+    "nommap": false
+  },
+  {
+    "label": "t=4",
+    "ncpumoe": 10,
+    "avg": 30.8619310622166,
+    "best": 31.17677746690393,
+    "vram": 11972,
+    "t": 4,
+    "nommap": false
+  },
+  {
+    "label": "t=6",
+    "ncpumoe": 10,
+    "avg": 30.578454576249854,
+    "best": 30.971792125516313,
+    "vram": 11983,
+    "t": 6,
+    "nommap": false
+  },
+  {
+    "label": "t=8",
+    "ncpumoe": 10,
+    "avg": 30.529393512116172,
+    "best": 30.954830478128166,
+    "vram": 11982,
+    "t": 8,
+    "nommap": false
+  },
+  {
+    "label": "t=10",
+    "ncpumoe": 10,
+    "avg": 30.773041112229503,
+    "best": 31.00899077264753,
+    "vram": 11972,
+    "t": 10,
+    "nommap": false
+  },
+  {
+    "label": "ub=256,b=1024",
+    "ncpumoe": 10,
+    "avg": 30.49319055490045,
+    "best": 30.691055921541377,
+    "vram": 11993,
+    "t": 4,
+    "ub": 256,
+    "b": 1024,
+    "nommap": false
+  },
+  {
+    "label": "ub=512,b=2048",
+    "ncpumoe": 10,
+    "avg": 30.923573731331718,
+    "best": 31.902272031660825,
+    "vram": 11995,
+    "t": 4,
+    "ub": 512,
+    "b": 2048,
+    "nommap": false
+  },
+  {
+    "label": "ub=512,b=4096",
+    "ncpumoe": 10,
+    "avg": 30.723820162954862,
+    "best": 31.065476003548053,
+    "vram": 11966,
+    "t": 4,
+    "ub": 512,
+    "b": 4096,
+    "nommap": false
+  },
+  {
+    "label": "ub=1024,b=2048",
+    "ncpumoe": 10,
+    "avg": 30.489888387093156,
+    "best": 30.982074615885946,
+    "vram": 11964,
+    "t": 4,
+    "ub": 1024,
+    "b": 2048,
+    "nommap": false
+  },
+  {
+    "label": "kv=q4_0",
+    "ncpumoe": 10,
+    "avg": 30.63156129571348,
+    "best": 31.088674795634944,
+    "vram": 11988,
+    "t": 4,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "nommap": false
+  },
+  {
+    "label": "kv=q8_0",
+    "ncpumoe": 10,
+    "avg": 29.6114222576863,
+    "best": 30.580427895917573,
+    "vram": 11980,
+    "t": 4,
+    "ctk": "q8_0",
+    "ctv": "q8_0",
+    "nommap": false
+  }
+]
--- a/scripts/tune_results_qwen35b_256k.json
+++ b/scripts/tune_results_qwen35b_256k.json
@@ -0,0 +1,522 @@
+[
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 6,
+    "tb": 6,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 26.169961832638464,
+    "best_tps": 26.533887071573073,
+    "vram_used": 4994,
+    "vram_total": 12288,
+    "label": "cpu_moe=True"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": false,
+    "t": 6,
+    "tb": 6,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 11.065030380022206,
+    "best_tps": 11.083028272674314,
+    "vram_used": 11949,
+    "vram_total": 12288,
+    "label": "cpu_moe=False"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 2,
+    "tb": 2,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 21.473286428302767,
+    "best_tps": 21.746637577851104,
+    "vram_used": 4994,
+    "vram_total": 12288,
+    "label": "t=2 | tb=2"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 26.552358479030676,
+    "best_tps": 27.314237654089343,
+    "vram_used": 4991,
+    "vram_total": 12288,
+    "label": "t=4 | tb=4"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 6,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 26.347068485327956,
+    "best_tps": 26.87924726131441,
+    "vram_used": 4993,
+    "vram_total": 12288,
+    "label": "t=4 | tb=6"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 6,
+    "tb": 6,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 26.331286039513458,
+    "best_tps": 26.81427299445741,
+    "vram_used": 5001,
+    "vram_total": 12288,
+    "label": "t=6 | tb=6"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 6,
+    "tb": 8,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 26.391160513711274,
+    "best_tps": 26.735573238878736,
+    "vram_used": 5001,
+    "vram_total": 12288,
+    "label": "t=6 | tb=8"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 8,
+    "tb": 8,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 25.32340666199144,
+    "best_tps": 25.87949347494079,
+    "vram_used": 4995,
+    "vram_total": 12288,
+    "label": "t=8 | tb=8"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 10,
+    "tb": 10,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 23.752277317850815,
+    "best_tps": 24.98242898809555,
+    "vram_used": 5011,
+    "vram_total": 12288,
+    "label": "t=10 | tb=10"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 12,
+    "tb": 12,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 21.75032196383532,
+    "best_tps": 23.18963400077116,
+    "vram_used": 5104,
+    "vram_total": 12288,
+    "label": "t=12 | tb=12"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 128,
+    "b": 512,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 13.27593572827031,
+    "best_tps": 13.337407402920235,
+    "vram_used": 4391,
+    "vram_total": 12288,
+    "label": "ub=128 | b=512"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 256,
+    "b": 1024,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 26.638687188233188,
+    "best_tps": 27.361082444434413,
+    "vram_used": 4495,
+    "vram_total": 12288,
+    "label": "ub=256 | b=1024"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 256,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 26.29069503392877,
+    "best_tps": 26.63368832924803,
+    "vram_used": 4490,
+    "vram_total": 12288,
+    "label": "ub=256 | b=2048"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 1024,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 26.518331831441134,
+    "best_tps": 26.972021321271527,
+    "vram_used": 4984,
+    "vram_total": 12288,
+    "label": "ub=512 | b=1024"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 26.401541912276873,
+    "best_tps": 26.46530849236633,
+    "vram_used": 4990,
+    "vram_total": 12288,
+    "label": "ub=512 | b=2048"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 4096,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 26.892711500590455,
+    "best_tps": 26.892711500590455,
+    "vram_used": 5006,
+    "vram_total": 12288,
+    "label": "ub=512 | b=4096"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 1024,
+    "b": 2048,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 12.600209659679201,
+    "best_tps": 12.759356030807627,
+    "vram_used": 12020,
+    "vram_total": 12288,
+    "label": "ub=1024 | b=2048"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 1024,
+    "b": 4096,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 6.023959262370547,
+    "best_tps": 8.284882268188156,
+    "vram_used": 11931,
+    "vram_total": 12288,
+    "label": "ub=1024 | b=4096"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 4096,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 12.96992950856374,
+    "best_tps": 12.96992950856374,
+    "vram_used": 12022,
+    "vram_total": 12288,
+    "label": "ctk=q4_0 | ctv=q4_0"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 4096,
+    "ctk": "q8_0",
+    "ctv": "q8_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 11.420078920350697,
+    "best_tps": 13.524778595767653,
+    "vram_used": 12030,
+    "vram_total": 12288,
+    "label": "ctk=q8_0 | ctv=q8_0"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 4096,
+    "ctk": "f16",
+    "ctv": "f16",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 11.978106511464183,
+    "best_tps": 13.729190013094977,
+    "vram_used": 11518,
+    "vram_total": 12288,
+    "label": "ctk=f16 | ctv=f16"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 4096,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 16.164278220452957,
+    "best_tps": 22.645890325274323,
+    "vram_used": 11623,
+    "vram_total": 12288,
+    "label": "mmap=True | poll=50 | prio=2"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 4096,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": false,
+    "prio": 2,
+    "poll": 50,
+    "avg_tps": 16.555542780023114,
+    "best_tps": 23.333815015033892,
+    "vram_used": 9062,
+    "vram_total": 12288,
+    "label": "mmap=False | poll=50 | prio=2"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 4096,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 0,
+    "avg_tps": 13.003619379106329,
+    "best_tps": 13.031594557134142,
+    "vram_used": 11994,
+    "vram_total": 12288,
+    "label": "mmap=True | poll=0 | prio=2"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 4096,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 2,
+    "poll": 100,
+    "avg_tps": 5.7762452690702935,
+    "best_tps": 5.795560155803046,
+    "vram_used": 11953,
+    "vram_total": 12288,
+    "label": "mmap=True | poll=100 | prio=2"
+  },
+  {
+    "ngl": 999,
+    "cpu_moe": true,
+    "t": 4,
+    "tb": 4,
+    "ub": 512,
+    "b": 4096,
+    "ctk": "q4_0",
+    "ctv": "q4_0",
+    "fa": "on",
+    "mlock": true,
+    "mmap": true,
+    "prio": 3,
+    "poll": 50,
+    "avg_tps": 12.59406799687573,
+    "best_tps": 14.966737641114795,
+    "vram_used": 11996,
+    "vram_total": 12288,
+    "label": "mmap=True | poll=50 | prio=3"
+  }
+]