wip: [01-llm-tuning] paused at task 1/3
This commit is contained in:
@@ -1,372 +0,0 @@
|
||||
"""
|
||||
Qwen3.5 122B-A10B 자동 정밀 튜닝 스크립트
|
||||
===========================================
|
||||
각 설정 조합으로 서버를 재시작하고 벤치마크를 자동 수행합니다.
|
||||
서버 로그에서 순수 eval time (t/s)를 파싱하여 정확한 비교 테이블을 출력합니다.
|
||||
|
||||
예상 소요 시간: 약 30-40분 (5개 설정 × ~6-7분/설정)
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import datetime
|
||||
|
||||
# Switch stdout to UTF-8 (presumably so the Korean/emoji status lines print on
# a Windows console). Streams lacking reconfigure() are left unchanged.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

# Address the benchmark client uses for /health and /v1/chat/completions.
BASE_URL = "http://127.0.0.1:8000"
# Relative Windows paths; start_server() launches with cwd=os.getcwd().
MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
SERVER_EXE = r"llama_bin_run\llama-server.exe"

# ============================================================
# Configurations to test
# ============================================================
# Common parameters (the ones that are not varied between runs)
COMMON_ARGS = [
    "--model", MODEL_PATH,
    "-ngl", "999",
    "--cpu-moe",
    "-c", "2048",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "256",
    "-b", "1024",
    "--mlock",
    "--port", "8000",
    # NOTE(review): the benchmark itself only talks to 127.0.0.1 (BASE_URL);
    # confirm that listening on all interfaces is intended.
    "--host", "0.0.0.0",
    "--no-warmup",  # warm-up is performed directly by the benchmark script
]

# Variable parameter combinations. Each entry has a display "name", a "desc"
# printed in the per-config header, and "extra" args appended to COMMON_ARGS.
CONFIGS = [
    {
        "name": "A) --no-mmap -t 8",
        "desc": "서버 권장: mmap 비활성화 (baseline 대비)",
        "extra": ["--no-mmap", "-t", "8", "--prio", "2"],
    },
    {
        "name": "B) --no-mmap -t 6",
        "desc": "스레드 감소 (캐시 경합 회피)",
        "extra": ["--no-mmap", "-t", "6", "--prio", "2"],
    },
    {
        "name": "C) --no-mmap -t 10",
        "desc": "스레드 증가 (RAM 대역폭 포화)",
        "extra": ["--no-mmap", "-t", "10", "--prio", "2"],
    },
    {
        "name": "D) --no-mmap -t 12",
        "desc": "더 많은 스레드",
        "extra": ["--no-mmap", "-t", "12", "--prio", "2"],
    },
    {
        "name": "E) --no-mmap -t 10 --prio 3 --poll 100",
        "desc": "최적 스레드 + 리얼타임 우선순위 + 폴링",
        "extra": ["--no-mmap", "-t", "10", "--prio", "3", "--poll", "100"],
    },
]
|
||||
|
||||
# ============================================================
|
||||
# 유틸리티 함수
|
||||
# ============================================================
|
||||
|
||||
def kill_server():
    """Force-kill every llama-server.exe process, then pause for 3 seconds.

    Runs the Windows ``taskkill`` command through the shell with its output
    discarded; the command's exit status is ignored.
    """
    taskkill_cmd = "taskkill /F /IM llama-server.exe >nul 2>&1"
    os.system(taskkill_cmd)
    # Fixed pause after the kill command before the caller continues.
    time.sleep(3)
|
||||
|
||||
def start_server(config, log_path):
    """Start llama-server for one configuration, redirecting output to a log.

    Args:
        config: Mapping whose ``"extra"`` entry holds the command-line
            arguments appended after ``COMMON_ARGS``.
        log_path: File that receives the server's stdout and stderr; it is
            truncated on open.

    Returns:
        ``(proc, log_file)``: the ``subprocess.Popen`` handle and the open log
        file object. The caller is responsible for closing ``log_file``.

    Raises:
        OSError: If the log file cannot be opened or the server executable
            cannot be launched.
    """
    cmd = [SERVER_EXE, *COMMON_ARGS, *config["extra"]]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            cwd=os.getcwd(),
        )
    except BaseException:
        # Ownership of the handle never reached the caller; do not leak it.
        log_file.close()
        raise
    return proc, log_file
|
||||
|
||||
def wait_for_server(timeout=600, poll_interval=5):
    """Poll ``{BASE_URL}/health`` until the server reports ``status == "ok"``.

    Args:
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between unsuccessful probes.

    Returns:
        True as soon as a probe returns JSON with ``"status": "ok"``;
        False if the timeout elapses first.
    """
    # Monotonic clock: a wall-clock adjustment must not stretch or cut the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Best-effort probe: refused connections, HTTP errors and malformed
            # bodies all mean "not ready yet". Unlike a bare ``except``, this
            # lets KeyboardInterrupt/SystemExit abort the wait.
            pass
        time.sleep(poll_interval)
    return False
|
||||
|
||||
def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and time it.

    Args:
        prompt: User message content to send.
        max_tokens: Value of the request's ``max_tokens`` field.

    Returns:
        ``(completion_tokens, elapsed)``: the ``usage.completion_tokens``
        value from the response (0 when absent) and the seconds taken by the
        whole HTTP round trip.

    Raises:
        urllib.error.URLError: On connection/HTTP failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )

    # perf_counter is monotonic, so a system clock change cannot skew timing.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    # ``or {}`` also covers a response carrying ``"usage": null``.
    usage = result.get("usage") or {}
    return usage.get("completion_tokens", 0), elapsed
|
||||
|
||||
def parse_eval_times(log_path):
    """Parse the pure generation ("eval time") lines from a server log.

    Only lines where ``eval time`` is preceded by leading whitespace at the
    start of the line are matched, which excludes ``prompt eval time`` lines.

    Args:
        log_path: Path of the server log file.

    Returns:
        A list, in log order, of dicts with keys ``total_ms``, ``tokens``,
        ``ms_per_token`` and ``tps``. Empty if the file cannot be read or
        contains no matching line.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Missing/unreadable log means "no data"; other errors are real bugs.
        return []

    # "eval time = XXXXX.XX ms / NNN tokens (XXX.XX ms per token, XX.XX tokens per second)"
    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    return [
        {
            "total_ms": float(total_ms),
            "tokens": int(tokens),
            "ms_per_token": float(ms_per_token),
            "tps": float(tps),
        }
        for total_ms, tokens, ms_per_token, tps in re.findall(pattern, content, re.MULTILINE)
    ]
|
||||
|
||||
def parse_prompt_eval_times(log_path):
    """Parse the "prompt eval time" lines from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        A list, in log order, of dicts with keys ``total_ms``, ``tokens``,
        ``ms_per_token`` and ``tps``. Empty if the file cannot be read or
        contains no matching line.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Missing/unreadable log means "no data"; other errors are real bugs.
        return []

    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    return [
        {
            "total_ms": float(total_ms),
            "tokens": int(tokens),
            "ms_per_token": float(ms_per_token),
            "tps": float(tps),
        }
        for total_ms, tokens, ms_per_token, tps in re.findall(pattern, content, re.MULTILINE)
    ]
|
||||
|
||||
def parse_vram_usage(log_path):
    """Extract the "CUDA0 model buffer size" value from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        The first reported size rounded to a whole number and formatted as
        ``"<n> MiB"``, or ``"N/A"`` if the file cannot be read or has no such
        line.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Missing/unreadable log means "no data"; other errors are real bugs.
        return "N/A"

    match = re.search(r'CUDA0 model buffer size\s*=\s*([\d.]+)\s*MiB', content)
    if match is None:
        return "N/A"
    return f"{float(match.group(1)):.0f} MiB"
|
||||
|
||||
# ============================================================
|
||||
# 메인 튜닝 루프
|
||||
# ============================================================
|
||||
|
||||
def _benchmark_config(idx, config):
    """Restart the server with one configuration, benchmark it, parse its log.

    Returns a result dict with keys ``config``, ``status`` ("OK"/"FAILED"),
    ``eval_tps``, ``prompt_tps`` and ``vram``.
    """
    config_start = time.time()
    log_path = os.path.join(os.getcwd(), f"tune_log_{idx}.txt")

    print(f"\n{'='*70}")
    print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}")
    print(f" {config['desc']}")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f"{'='*70}")

    # 1. Stop any server that is still running.
    print(" [1/4] 서버 종료 중...")
    kill_server()

    # 2. Start a fresh server with this configuration.
    print(" [2/4] 서버 시작 중... (모델 로딩 ~3-5분)")
    _proc, log_file = start_server(config, log_path)
    warmup_ok = False
    try:
        # 3. Wait until the server reports healthy.
        if not wait_for_server(timeout=600):
            print(" ❌ 서버 시작 실패! 다음 설정으로 넘어갑니다.")
            return {
                "config": config["name"],
                "status": "FAILED",
                "eval_tps": [],
                "prompt_tps": [],
                "vram": "N/A",
            }

        load_time = time.time() - config_start
        print(f" [3/4] 서버 준비 완료! (로딩 {load_time:.0f}초)")

        # 4. Benchmark: one warm-up request followed by three measured runs.
        print(" [4/4] 벤치마크 실행 중...")
        try:
            run_single_benchmark("Say hello.", max_tokens=20)
            warmup_ok = True
            print(" 워밍업 완료")
        except Exception as e:
            print(f" 워밍업 실패: {e}")

        prompts = [
            "Write a detailed explanation of how neural networks learn through backpropagation and gradient descent.",
            "Explain the complete process of photosynthesis including light and dark reactions in detail.",
            "Describe the differences between SQL and NoSQL databases with examples and performance characteristics.",
        ]
        for i, prompt in enumerate(prompts):
            try:
                tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                approx_tps = tokens / elapsed if elapsed > 0 else 0
                print(f" Run {i+1}/3: {tokens} tokens, {elapsed:.1f}s, ~{approx_tps:.2f} t/s (approx)")
            except Exception as e:
                print(f" Run {i+1}/3: ERROR - {e}")

        # Give the server a moment to flush its log before it is killed.
        time.sleep(2)
    finally:
        # Runs on failure, success and Ctrl+C alike, so neither the server
        # process nor the log handle is left behind.
        kill_server()
        log_file.close()
    time.sleep(2)

    # Parse the server log for exact timings.
    eval_times = parse_eval_times(log_path)
    prompt_times = parse_prompt_eval_times(log_path)
    vram = parse_vram_usage(log_path)

    # Drop the warm-up entry only when the warm-up request really completed;
    # otherwise the first entry is a measured run and must be kept. A lone
    # warm-up entry is never reported as a benchmark result.
    skip = 1 if warmup_ok else 0
    eval_speeds = [e["tps"] for e in eval_times[skip:]]
    prompt_speeds = [p["tps"] for p in prompt_times[skip:]]

    config_elapsed = time.time() - config_start
    print(f"\n 완료! 소요: {config_elapsed:.0f}초")
    if eval_speeds:
        avg_eval = sum(eval_speeds) / len(eval_speeds)
        max_eval = max(eval_speeds)
        print(f" 📊 Eval TPS: avg={avg_eval:.2f}, max={max_eval:.2f}")

    return {
        "config": config["name"],
        "status": "OK",
        "eval_tps": eval_speeds,
        "prompt_tps": prompt_speeds,
        "vram": vram,
    }


def _print_summary(all_results):
    """Print the comparison table and the best configuration."""
    print("\n")
    print("=" * 80)
    print(" 🏆 최종 결과 비교 테이블")
    print("=" * 80)
    print()

    print(f" {'설정':<45} {'Eval t/s':>10} {'최대':>8} {'Prompt t/s':>12} {'VRAM':>12}")
    print(f" {'-'*45} {'-'*10} {'-'*8} {'-'*12} {'-'*12}")

    # Baseline row: hard-coded numbers from an earlier run.
    print(f" {'[기준] mmap on, -t 8, --prio 2':<45} {'10.02':>10} {'10.06':>8} {'29.52':>12} {'5392 MiB':>12}")

    best_avg = 0
    best_config = ""

    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f" {r['config']:<45} {'FAILED':>10} {'':>8} {'':>12} {r['vram']:>12}")
            continue

        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0

        if avg_e > best_avg:
            best_avg = avg_e
            best_config = r["config"]

        # NOTE(review): the star threshold is the baseline's max (10.06) while
        # the improvement below is relative to its average (10.02) — confirm.
        marker = " ⭐" if avg_e > 10.06 else ""
        print(f" {r['config']:<45} {avg_e:>10.2f} {max_e:>8.2f} {avg_p:>12.2f} {r['vram']:>12}{marker}")

    print()
    if best_avg > 0:
        improvement = ((best_avg - 10.02) / 10.02) * 100
        print(f" 🏆 최고 성능: {best_config}")
        print(f" → {best_avg:.2f} t/s (기준 10.02 t/s 대비 {improvement:+.1f}%)")

    print()
    print(f" 완료 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 80)


def _save_results(all_results, timestamp):
    """Write one line per configuration to tune_results_<timestamp>.txt."""
    result_path = os.path.join(os.getcwd(), f"tune_results_{timestamp}.txt")
    with open(result_path, "w", encoding="utf-8") as f:
        f.write("Qwen3.5 122B-A10B Fine Tuning Results\n")
        f.write(f"Date: {timestamp}\n\n")
        for r in all_results:
            f.write(f"{r['config']}: {r['eval_tps']} (VRAM: {r['vram']})\n")
    print(f" 결과 저장: {result_path}")


def main():
    """Run every entry of CONFIGS, print a comparison table, save results.

    For each configuration the server is restarted, warmed up once and
    benchmarked three times; throughput is taken from the server log rather
    than from client-side timing.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    print("=" * 70)
    print(" Qwen3.5 122B-A10B 자동 정밀 튜닝")
    print(f" 시작 시간: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f" 테스트 설정: {len(CONFIGS)}개")
    print(f" 예상 소요: ~{len(CONFIGS) * 7}분")
    print("=" * 70)
    print()
    print(" 기존 Baseline: mmap on, -t 8, --prio 2 → 10.06 t/s (eval)")
    print()

    all_results = [_benchmark_config(idx, config) for idx, config in enumerate(CONFIGS)]

    _print_summary(all_results)
    _save_results(all_results, timestamp)


if __name__ == "__main__":
    main()
|
||||
@@ -1,257 +0,0 @@
|
||||
"""
|
||||
Qwen3.5 122B-A10B 정밀 튜닝 2라운드
|
||||
====================================
|
||||
1라운드 결과: mmap on이 더 빠르고, 스레드 수가 적을수록 빠름
|
||||
→ mmap on 상태에서 스레드 수 4~8 범위를 정밀 탐색
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import datetime
|
||||
|
||||
# Switch stdout to UTF-8 (presumably so the Korean/emoji status lines print on
# a Windows console). Streams lacking reconfigure() are left unchanged.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

# Address the benchmark client uses for /health and /v1/chat/completions.
BASE_URL = "http://127.0.0.1:8000"
# Relative Windows paths; start_server() launches with cwd=os.getcwd().
MODEL_PATH = r"models\Q4_K_M\Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"
SERVER_EXE = r"llama_bin_run\llama-server.exe"

# Server arguments shared by every configuration in this round.
COMMON_ARGS = [
    "--model", MODEL_PATH,
    "-ngl", "999",
    "--cpu-moe",
    "-c", "2048",
    "-np", "1",
    "-fa", "on",
    "--cache-type-k", "q4_0",
    "--cache-type-v", "q4_0",
    "-ub", "256",
    "-b", "1024",
    "--mlock",
    "--port", "8000",
    # NOTE(review): the benchmark itself only talks to 127.0.0.1 (BASE_URL);
    # confirm that listening on all interfaces is intended.
    "--host", "0.0.0.0",
    "--no-warmup",
]

# Round-2 variations: none passes --no-mmap; only -t and --prio differ.
# "extra" is appended to COMMON_ARGS by start_server().
CONFIGS = [
    {
        "name": "F) mmap on, -t 4",
        "desc": "최소 스레드 (4개, 물리코어 절반)",
        "extra": ["-t", "4", "--prio", "2"],
    },
    {
        "name": "G) mmap on, -t 5",
        "desc": "스레드 5개",
        "extra": ["-t", "5", "--prio", "2"],
    },
    {
        "name": "H) mmap on, -t 6",
        "desc": "스레드 6개 (--no-mmap에서 최고였음)",
        "extra": ["-t", "6", "--prio", "2"],
    },
    {
        "name": "I) mmap on, -t 7",
        "desc": "스레드 7개",
        "extra": ["-t", "7", "--prio", "2"],
    },
    {
        "name": "J) mmap on, -t 6, --prio 3",
        "desc": "최적 스레드 + 리얼타임 우선순위",
        "extra": ["-t", "6", "--prio", "3"],
    },
]
|
||||
|
||||
def kill_server():
    """Force-kill every llama-server.exe process, then pause for 3 seconds.

    Runs the Windows ``taskkill`` command through the shell with its output
    discarded; the command's exit status is ignored.
    """
    taskkill_cmd = "taskkill /F /IM llama-server.exe >nul 2>&1"
    os.system(taskkill_cmd)
    # Fixed pause after the kill command before the caller continues.
    time.sleep(3)
|
||||
|
||||
def start_server(config, log_path):
    """Start llama-server for one configuration, redirecting output to a log.

    Args:
        config: Mapping whose ``"extra"`` entry holds the command-line
            arguments appended after ``COMMON_ARGS``.
        log_path: File that receives the server's stdout and stderr; it is
            truncated on open.

    Returns:
        ``(proc, log_file)``: the ``subprocess.Popen`` handle and the open log
        file object. The caller is responsible for closing ``log_file``.

    Raises:
        OSError: If the log file cannot be opened or the server executable
            cannot be launched.
    """
    cmd = [SERVER_EXE, *COMMON_ARGS, *config["extra"]]
    log_file = open(log_path, "w", encoding="utf-8")
    try:
        proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, cwd=os.getcwd())
    except BaseException:
        # Ownership of the handle never reached the caller; do not leak it.
        log_file.close()
        raise
    return proc, log_file
|
||||
|
||||
def wait_for_server(timeout=600, poll_interval=5):
    """Poll ``{BASE_URL}/health`` until the server reports ``status == "ok"``.

    Args:
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between unsuccessful probes.

    Returns:
        True as soon as a probe returns JSON with ``"status": "ok"``;
        False if the timeout elapses first.
    """
    # Monotonic clock: a wall-clock adjustment must not stretch or cut the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(f"{BASE_URL}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
            if data.get("status") == "ok":
                return True
        except Exception:
            # Best-effort probe: refused connections, HTTP errors and malformed
            # bodies all mean "not ready yet". Unlike a bare ``except``, this
            # lets KeyboardInterrupt/SystemExit abort the wait.
            pass
        time.sleep(poll_interval)
    return False
|
||||
|
||||
def run_single_benchmark(prompt, max_tokens=200):
    """Send one chat-completion request and time it.

    Args:
        prompt: User message content to send.
        max_tokens: Value of the request's ``max_tokens`` field.

    Returns:
        ``(completion_tokens, elapsed)``: the ``usage.completion_tokens``
        value from the response (0 when absent) and the seconds taken by the
        whole HTTP round trip.

    Raises:
        urllib.error.URLError: On connection/HTTP failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode("utf-8")
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    # perf_counter is monotonic, so a system clock change cannot skew timing.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start
    # ``or {}`` also covers a response carrying ``"usage": null``.
    usage = result.get("usage") or {}
    return usage.get("completion_tokens", 0), elapsed
|
||||
|
||||
def parse_eval_times(log_path):
    """Parse the pure generation ("eval time") lines from a server log.

    Only lines where ``eval time`` is preceded by leading whitespace at the
    start of the line are matched, which excludes ``prompt eval time`` lines.

    Args:
        log_path: Path of the server log file.

    Returns:
        A list, in log order, of ``{"tps": float, "tokens": int}`` dicts.
        Empty if the file cannot be read or contains no matching line.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Missing/unreadable log means "no data"; other errors are real bugs.
        return []
    pattern = r'^\s+eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    matches = re.findall(pattern, content, re.MULTILINE)
    return [{"tps": float(m[3]), "tokens": int(m[1])} for m in matches]
|
||||
|
||||
def parse_prompt_eval_times(log_path):
    """Parse the "prompt eval time" lines from a server log.

    Args:
        log_path: Path of the server log file.

    Returns:
        A list, in log order, of ``{"tps": float}`` dicts. Empty if the file
        cannot be read or contains no matching line.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError:
        # Missing/unreadable log means "no data"; other errors are real bugs.
        return []
    pattern = r'prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens\s*\(\s*([\d.]+)\s*ms per token,\s*([\d.]+)\s*tokens per second\)'
    matches = re.findall(pattern, content, re.MULTILINE)
    return [{"tps": float(m[3])} for m in matches]
|
||||
|
||||
def _benchmark_config(idx, config):
    """Restart the server with one configuration, benchmark it, parse its log.

    Returns a result dict with ``config``, ``status`` and ``eval_tps``; OK
    results additionally carry ``prompt_tps``.
    """
    config_start = time.time()
    log_path = os.path.join(os.getcwd(), f"tune_r2_log_{idx}.txt")

    print(f"\n{'='*70}")
    print(f" [{idx+1}/{len(CONFIGS)}] {config['name']}")
    print(f" {config['desc']}")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f"{'='*70}")

    kill_server()
    print(" [1/3] 서버 시작 중...")
    _proc, log_file = start_server(config, log_path)
    warmup_ok = False
    try:
        if not wait_for_server(timeout=600):
            print(" ❌ 서버 시작 실패!")
            return {"config": config["name"], "status": "FAILED", "eval_tps": []}

        load_time = time.time() - config_start
        print(f" [2/3] 서버 준비 완료! ({load_time:.0f}초)")

        # Warm-up request; a failure is tolerated but remembered so the log
        # parsing below knows whether a warm-up entry exists.
        try:
            run_single_benchmark("Say hello.", max_tokens=20)
            warmup_ok = True
        except Exception:
            pass

        print(" [3/3] 벤치마크 3회...")
        prompts = [
            "Write a detailed explanation of how neural networks learn through backpropagation.",
            "Explain the complete process of photosynthesis including light and dark reactions.",
            "Describe the differences between SQL and NoSQL databases with examples.",
        ]
        for i, prompt in enumerate(prompts):
            try:
                tokens, elapsed = run_single_benchmark(prompt, max_tokens=200)
                approx_tps = tokens / elapsed if elapsed > 0 else 0
                print(f" Run {i+1}: {tokens}tok, {elapsed:.1f}s, ~{approx_tps:.2f} t/s")
            except Exception as e:
                print(f" Run {i+1}: ERROR - {e}")

        # Give the server a moment to flush its log before it is killed.
        time.sleep(2)
    finally:
        # Runs on failure, success and Ctrl+C alike, so neither the server
        # process nor the log handle is left behind.
        kill_server()
        log_file.close()
    time.sleep(2)

    eval_times = parse_eval_times(log_path)
    prompt_times = parse_prompt_eval_times(log_path)

    # Drop the warm-up entry only when the warm-up request really completed;
    # otherwise the first entry is a measured run and must be kept.
    skip = 1 if warmup_ok else 0
    eval_speeds = [e["tps"] for e in eval_times[skip:]]
    prompt_speeds = [p["tps"] for p in prompt_times[skip:]]

    if eval_speeds:
        print(f" 📊 Eval: avg={sum(eval_speeds)/len(eval_speeds):.2f}, max={max(eval_speeds):.2f}")

    return {
        "config": config["name"],
        "status": "OK",
        "eval_tps": eval_speeds,
        "prompt_tps": prompt_speeds,
    }


def _print_summary(all_results):
    """Print the combined round-1 (hard-coded) and round-2 results table."""
    print("\n")
    print("=" * 85)
    print(" 🏆 전체 튜닝 결과 (1라운드 + 2라운드 통합)")
    print("=" * 85)
    print()
    print(f" {'설정':<48} {'Avg t/s':>8} {'Max t/s':>8} {'Prompt':>8}")
    print(f" {'-'*48} {'-'*8} {'-'*8} {'-'*8}")

    # Round-1 results (hard-coded): name, avg t/s, max t/s, prompt t/s.
    r1 = [
        ("[기준] mmap on, -t 8, --prio 2", 10.02, 10.06, 29.52),
        ("A) --no-mmap -t 8", 9.66, 9.70, 28.26),
        ("B) --no-mmap -t 6", 10.02, 10.18, 26.73),
        ("C) --no-mmap -t 10", 9.42, 9.46, 27.31),
        ("D) --no-mmap -t 12", 9.04, 9.11, 27.92),
        ("E) --no-mmap -t 10 --prio 3 --poll 100", 9.41, 9.45, 28.37),
    ]
    for name, avg, mx, pp in r1:
        marker = " ⭐" if avg >= 10.0 else ""
        print(f" {name:<48} {avg:>8.2f} {mx:>8.2f} {pp:>8.2f}{marker}")

    print(f" {'--- 2라운드 ---':<48}")

    # Best is tracked by *max* t/s, seeded with the baseline's max (10.06).
    best_tps = 10.06
    best_config = "[기준] mmap on, -t 8"

    for r in all_results:
        if r["status"] != "OK" or not r["eval_tps"]:
            print(f" {r['config']:<48} {'FAIL':>8}")
            continue
        avg_e = sum(r["eval_tps"]) / len(r["eval_tps"])
        max_e = max(r["eval_tps"])
        avg_p = sum(r["prompt_tps"]) / len(r["prompt_tps"]) if r["prompt_tps"] else 0
        if max_e > best_tps:
            best_tps = max_e
            best_config = r["config"]
        marker = " ⭐" if avg_e >= 10.0 else ""
        print(f" {r['config']:<48} {avg_e:>8.2f} {max_e:>8.2f} {avg_p:>8.2f}{marker}")

    print()
    print(f" 🏆 최고 성능: {best_config} → {best_tps:.2f} t/s")
    print(f" 완료: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print("=" * 85)


def main():
    """Run every round-2 entry of CONFIGS and print the combined table.

    For each configuration the server is restarted, warmed up once and
    benchmarked three times; throughput is taken from the server log.
    """
    print("=" * 70)
    print(" Qwen3.5 122B-A10B 정밀 튜닝 - 2라운드")
    print(f" 시작: {datetime.datetime.now().strftime('%H:%M:%S')}")
    print(f" 테스트: {len(CONFIGS)}개 설정 (mmap on + 스레드 4~7 정밀 탐색)")
    print("=" * 70)
    print()

    all_results = [_benchmark_config(idx, config) for idx, config in enumerate(CONFIGS)]
    _print_summary(all_results)


if __name__ == "__main__":
    main()
|
||||
@@ -1,38 +0,0 @@
|
||||
import urllib.request
import json
import zipfile
import os
import shutil
import ssl

# Downloads the Windows CUDA build from the latest llama.cpp GitHub release
# and extracts it into ./llama_bin.
#
# Certificate verification stays enabled: the archive contains executables
# that will be run locally, so the download must not be open to tampering.
ctx = ssl.create_default_context()

url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
try:
    with urllib.request.urlopen(req, context=ctx) as response:
        data = json.loads(response.read().decode())

    # Take the first asset that is either the exact cu12.2.0 build or any
    # x64 "bin-win-cu*" zip.
    download_url = None
    for asset in data['assets']:
        if "bin-win-cuda-cu12.2.0-x64.zip" in asset['name'] or ("bin-win-cu" in asset['name'] and asset['name'].endswith(".zip") and "x64" in asset['name']):
            download_url = asset['browser_download_url']
            break

    if download_url:
        print(f"Downloading {download_url}...")
        zip_path = "llama.zip"
        try:
            # Stream to disk in chunks instead of buffering the whole archive.
            with urllib.request.urlopen(download_url, context=ctx) as resp, open(zip_path, 'wb') as out_file:
                shutil.copyfileobj(resp, out_file)
            print("Extracting to 'llama_bin'...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall("llama_bin")
            print("Done extracting.")
        finally:
            # Remove the archive even when download or extraction failed.
            if os.path.exists(zip_path):
                os.remove(zip_path)
    else:
        print("Could not find the target zip. Available assets:")
        for asset in data['assets']:
            print(" -", asset['name'])
except Exception as e:
    # Top-level boundary of a one-shot script: report and exit normally.
    print(f"Error: {e}")
|
||||
@@ -1,33 +0,0 @@
|
||||
import os

# The flag must be in the environment BEFORE huggingface_hub is imported: the
# library reads HF_HUB_ENABLE_HF_TRANSFER into a module constant at import
# time, so setting it afterwards has no effect.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import hf_hub_download  # noqa: E402

# (repo_id, filename) pairs, downloaded sequentially in this order.
models = [
    # Start with the smaller Gemma4 26B
    ("ggml-org/gemma-4-26B-A4B-it-GGUF", "gemma-4-26B-A4B-it-Q4_K_M.gguf"),
    # Then Qwen 35B
    ("unsloth/Qwen3.5-35B-A3B-GGUF", "Qwen3.5-35B-A3B-Q4_K_M.gguf"),
    # Finally the 122B model (split into three parts)
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf"),
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00002-of-00003.gguf"),
    ("unsloth/Qwen3.5-122B-A10B-GGUF", "Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00003-of-00003.gguf"),
]

print("=== 고속 다운로더 시작 (huggingface_hub & hf_transfer) ===")
os.makedirs("models", exist_ok=True)

for repo, filename in models:
    print(f"\n>>> 다운로드 중 (백그라운드 진행): [{repo}] 의 [{filename}]...")
    try:
        path = hf_hub_download(
            repo_id=repo,
            filename=filename,
            local_dir="./models",
            local_dir_use_symlinks=False,
        )
        print(f"완료: {path}")
    except Exception as e:
        # One failed file must not stop the remaining downloads.
        print(f"다운로드 실패: {e}")

print("\n모든 다운로드 프로세스가 종료되었습니다.")
|
||||
@@ -1,56 +0,0 @@
|
||||
import urllib.request
import json
import zipfile
import os
import ssl
import shutil

# Downloads the CUDA 12.4 Windows build from the latest llama.cpp release,
# flattens its files into ./llama_bin_run, then copies over any files from
# ./llama_bin that llama_bin_run does not have yet.
#
# Certificate verification stays enabled: the archive contains executables
# that will be run locally, so the download must not be open to tampering.
ctx = ssl.create_default_context()

url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
try:
    with urllib.request.urlopen(req, context=ctx) as response:
        data = json.loads(response.read().decode())

    # The main binary archive, not the separate "cudart" runtime archive.
    download_url = None
    for asset in data['assets']:
        if "bin-win-cuda-12.4-x64.zip" in asset['name'] and "cudart" not in asset['name']:
            download_url = asset['browser_download_url']
            break

    if download_url:
        print(f"Downloading true binaries: {download_url}...")
        zip_path = "llama_main.zip"
        try:
            # Stream to disk in chunks instead of buffering the whole archive.
            with urllib.request.urlopen(download_url, context=ctx) as resp, open(zip_path, 'wb') as out_file:
                shutil.copyfileobj(resp, out_file)

            print("Extracting to temporary folder 'llama_temp'...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall("llama_temp")

            print("Moving exact files to 'llama_bin_run'...")
            os.makedirs("llama_bin_run", exist_ok=True)
            # Flatten: every file lands directly in llama_bin_run, whatever
            # sub-folder it had inside the archive.
            for root, dirs, files in os.walk("llama_temp"):
                for file in files:
                    shutil.move(os.path.join(root, file), os.path.join("llama_bin_run", file))

            if os.path.exists("llama_bin"):
                for item in os.listdir("llama_bin"):
                    src = os.path.join("llama_bin", item)
                    dst = os.path.join("llama_bin_run", item)
                    if not os.path.exists(dst):
                        try:
                            shutil.copy(src, dst)
                        except OSError:
                            # Best effort: entries that cannot be copied as a
                            # file (e.g. sub-directories) are skipped.
                            pass
        finally:
            # Clean up temporary artefacts on success and on failure alike.
            if os.path.exists(zip_path):
                os.remove(zip_path)
            shutil.rmtree("llama_temp", ignore_errors=True)
        print("Download and path extraction fully complete.")
    else:
        print("Could not find the target zip.")
except Exception as e:
    # Top-level boundary of a one-shot script: report and exit normally.
    print(f"Error: {e}")
|
||||
@@ -1,28 +0,0 @@
|
||||
from huggingface_hub import HfApi
|
||||
import sys
|
||||
|
||||
# Shared Hugging Face Hub client used by search_gguf() below.
api = HfApi()
|
||||
|
||||
def search_gguf(query):
    """Print up to three Hub repos matching *query* and their GGUF files.

    For each repo, files whose name contains "q4_k_m" (case-insensitive) and
    ends with ".gguf" are listed; if there are none, the first three ".gguf"
    files are listed instead. Errors are printed, never raised.

    Args:
        query: Search string passed to ``api.list_models``.
    """
    print(f"\n--- Searching for: {query} ---")
    try:
        found = list(api.list_models(search=query, limit=3))
    except Exception as e:
        print(f"Error: {e}")
        return
    if not found:
        print("No models found.")
        return
    for m in found:
        print(f"Repo: {m.id}")
        try:
            files = api.list_repo_files(repo_id=m.id)
        except Exception as e:
            # A single inaccessible repo must not hide the remaining ones.
            print(f"Error: {e}")
            continue
        ggufs = [f for f in files if "q4_k_m" in f.lower() and f.endswith(".gguf")]
        if not ggufs:
            ggufs = [f for f in files if f.endswith(".gguf")][:3]
        print(f" GGUFs: {ggufs}")
|
||||
|
||||
# Run the fixed set of searches when the script is executed.
search_gguf("122b-a10b gguf")
search_gguf("Qwen3.5 122b gguf")
search_gguf("35b-a3b gguf")
search_gguf("gemma-4 26b gguf")
search_gguf("Qwen 122B")
|
||||
@@ -1,123 +0,0 @@
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
|
||||
# Switch stdout to UTF-8 when the stream supports reconfigure(); streams
# without it raise AttributeError and are left unchanged.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

# Address of the server that check_server() and run_benchmark() talk to.
BASE_URL = "http://127.0.0.1:8000"
|
||||
|
||||
def check_server():
    """Return True if ``{BASE_URL}/health`` answers with ``status == "ok"``.

    Any failure while probing (connection refused, timeout, HTTP error,
    malformed body) yields False instead of raising.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        return data.get("status") == "ok"
    except Exception:
        # Best-effort probe. Unlike a bare ``except``, this lets
        # KeyboardInterrupt/SystemExit stop a caller's polling loop.
        return False
|
||||
|
||||
def run_benchmark(prompt, max_tokens=100, label="Test"):
    """Run a single chat-completion request and return timing results.

    Args:
        prompt: User message content to send.
        max_tokens: Value of the request's ``max_tokens`` field.
        label: Free-form tag copied into the result.

    Returns:
        Dict with ``label``, ``prompt_tokens``, ``completion_tokens``,
        ``elapsed`` (seconds for the whole HTTP round trip),
        ``gen_tps_approx`` (completion tokens / elapsed, so it includes
        prompt processing time) and ``content_preview`` (first 100 chars).

    Raises:
        urllib.error.URLError: On connection/HTTP failure or timeout.
        KeyError, IndexError: If the response has no ``choices[0].message``.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )

    # perf_counter is monotonic, so a system clock change cannot skew timing.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=300) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    # ``or`` guards against an explicit JSON null, which ``.get(key, default)``
    # would pass through and which would then break the slice below.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)

    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:100],
    }
|
||||
|
||||
def main():
    """Wait for the server, run a warm-up, then three timed runs and a summary.

    Returns early (after printing an error) if the server never reports
    healthy within 30 probes.
    """
    rule = "=" * 60
    print(rule)
    print(" LLM Performance Benchmark Tool")
    print(rule)
    print()

    # Step 1: probe the health endpoint up to 30 times, 2 s apart.
    print("[1/3] Checking server health...")
    attempt = 1
    while not check_server():
        print(f" -> Waiting for server... ({attempt}/30)")
        time.sleep(2)
        attempt += 1
        if attempt > 30:
            print(" -> ERROR: Server not responding after 60s")
            return
    print(" -> Server is ready!")

    # Step 2: short warm-up request; a failure here is reported, not fatal.
    print()
    print("[2/3] Warmup run (short)...")
    try:
        warm = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f" -> Warmup done: {warm['completion_tokens']} tokens in {warm['elapsed']:.2f}s")
    except Exception as err:
        print(f" -> Warmup failed: {err}")

    # Step 3: three measured runs with the same prompt.
    print()
    print("[3/3] Running main benchmark...")
    print("-" * 60)

    test_prompt = "Count from 1 to 50, writing each number on a new line."

    results = []
    for run_no in (1, 2, 3):
        print(f" Run {run_no}/3...")
        try:
            outcome = run_benchmark(test_prompt, max_tokens=200, label=f"Run {run_no}")
            results.append(outcome)
            print(f" Tokens: {outcome['completion_tokens']} | Time: {outcome['elapsed']:.2f}s | Speed: {outcome['gen_tps_approx']:.2f} t/s (approx)")
        except Exception as err:
            print(f" ERROR: {err}")

    if not results:
        return

    speeds = [item["gen_tps_approx"] for item in results]
    avg_tps = sum(speeds) / len(results)
    print()
    print(rule)
    print(" RESULTS SUMMARY")
    print(rule)
    print(f" Runs: {len(results)}")
    print(f" Avg TPS: {avg_tps:.2f} t/s (approx, includes prompt eval)")
    print(f" Min TPS: {min(speeds):.2f} t/s")
    print(f" Max TPS: {max(speeds):.2f} t/s")
    print()
    print(" NOTE: Check server console for exact generation t/s")
    print(" (the 'eval time' line shows pure token generation speed)")
    print(rule)


if __name__ == "__main__":
    main()
|
||||
@@ -1,169 +0,0 @@
|
||||
import time
|
||||
import json
|
||||
import urllib.request
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
|
||||
# Switch stdout to UTF-8 when the stream supports reconfigure(); streams
# without it raise AttributeError and are left unchanged.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except AttributeError:
    pass

# Address of the server that the helper functions below talk to.
BASE_URL = "http://127.0.0.1:8000"
|
||||
|
||||
def check_server():
    """Return True if ``{BASE_URL}/health`` answers with ``status == "ok"``.

    Any failure while probing (connection refused, timeout, HTTP error,
    malformed body) yields False instead of raising.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/health")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        return data.get("status") == "ok"
    except Exception:
        # Best-effort probe. Unlike a bare ``except``, this lets
        # KeyboardInterrupt/SystemExit stop a caller's polling loop.
        return False
|
||||
|
||||
def check_slots():
    """Fetch ``{BASE_URL}/slots`` and return the decoded JSON body.

    Returns:
        The parsed JSON value, or None if the request or decoding fails.
    """
    try:
        req = urllib.request.Request(f"{BASE_URL}/slots")
        with urllib.request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read())
    except Exception:
        # Informational call only: any failure simply means "no slot info".
        # Unlike a bare ``except``, KeyboardInterrupt/SystemExit pass through.
        return None
|
||||
|
||||
def run_benchmark(prompt, max_tokens=300, label="Test"):
    """Run a single chat-completion request and return timing results.

    Args:
        prompt: User message content to send.
        max_tokens: Value of the request's ``max_tokens`` field.
        label: Free-form tag copied into the result.

    Returns:
        Dict with ``label``, ``prompt_tokens``, ``completion_tokens``,
        ``elapsed`` (seconds for the whole HTTP round trip),
        ``gen_tps_approx`` (completion tokens / elapsed, so it includes
        prompt processing time) and ``content_preview`` (first 150 chars).

    Raises:
        urllib.error.URLError: On connection/HTTP failure or timeout.
        KeyError, IndexError: If the response has no ``choices[0].message``.
    """
    payload = json.dumps({
        "model": "local-model",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )

    # perf_counter is monotonic, so a system clock change cannot skew timing.
    start = time.perf_counter()
    with urllib.request.urlopen(req, timeout=600) as resp:
        result = json.loads(resp.read())
    elapsed = time.perf_counter() - start

    # ``or`` guards against an explicit JSON null, which ``.get(key, default)``
    # would pass through and which would then break the slice below.
    content = result["choices"][0]["message"].get("content") or ""
    usage = result.get("usage") or {}
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)

    gen_tps = completion_tokens / elapsed if elapsed > 0 else 0

    return {
        "label": label,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "elapsed": elapsed,
        "gen_tps_approx": gen_tps,
        "content_preview": content[:150],
    }
|
||||
|
||||
def _wait_for_server(max_wait=600, poll_interval=5):
    """Poll check_server() until it succeeds; return False after max_wait seconds."""
    for i in range(max_wait // poll_interval):
        waited = i * poll_interval
        if check_server():
            print(f" -> Server is ready! (waited {waited}s)")
            return True
        if i % 6 == 0:
            # Progress line on every sixth poll only, to keep the log short.
            print(f" -> Loading model... ({waited}s / {max_wait}s)")
        time.sleep(poll_interval)
    print(f" -> ERROR: Server not responding after {max_wait}s")
    return False


def _print_summary(results, total_runs):
    """Print aggregate throughput statistics and the verdict for completed runs."""
    print()
    print("=" * 70)
    print(" RESULTS SUMMARY - Qwen3.5 122B-A10B")
    print("=" * 70)
    speeds = [r["gen_tps_approx"] for r in results]
    avg_tps = sum(speeds) / len(speeds)
    max_tps = max(speeds)
    min_tps = min(speeds)
    total_tokens = sum(r["completion_tokens"] for r in results)
    total_time = sum(r["elapsed"] for r in results)

    print(f" Runs completed: {len(results)}/{total_runs}")
    print(f" Total tokens: {total_tokens}")
    print(f" Total time: {total_time:.1f}s")
    print()
    print(f" Approx TPS (avg): {avg_tps:.2f} t/s")
    print(f" Approx TPS (min): {min_tps:.2f} t/s")
    print(f" Approx TPS (max): {max_tps:.2f} t/s")
    print()

    # Verdict against the 10 t/s target announced in the banner.
    if avg_tps >= 10:
        print(" ✅ TARGET ACHIEVED: 10+ t/s!")
    elif avg_tps >= 8:
        print(" ⚠️ CLOSE TO TARGET: Consider further tuning")
    else:
        print(f" ❌ BELOW TARGET: {avg_tps:.1f} t/s < 10 t/s")

    print()
    print(" ⚡ IMPORTANT: The 'approx' speed includes prompt eval overhead.")
    print(" ⚡ Check the server console/log for exact 'eval time' t/s value,")
    print(" ⚡ which shows pure token generation speed (always higher).")
    print("=" * 70)


def main():
    """Run the benchmark end to end against the server at BASE_URL.

    Steps: wait for the server to report healthy (up to 600 s), print the
    slot count if available, perform one short warmup request, then run the
    timed requests and print a summary. Returns None in all cases; returns
    early if the server never becomes ready.
    """
    total_runs = 5    # number of timed requests
    max_tokens = 300  # generation budget per timed request
    max_wait = 600    # 10 minutes max

    print("=" * 70)
    print(" Qwen3.5 122B-A10B Performance Benchmark")
    print(" Target: 10+ t/s generation speed")
    print("=" * 70)
    print()

    # Wait for server (model loading takes 3-5 min for 71 GB)
    print("[1/4] Waiting for server (122B model load takes 3-5 min)...")
    if not _wait_for_server(max_wait):
        return

    # Check server info
    print()
    print("[2/4] Checking server status...")
    slots = check_slots()
    if slots:
        print(f" -> Slots available: {len(slots)}")

    # Warmup: a failure here is reported but does not abort the benchmark.
    print()
    print("[3/4] Warmup run (short, pre-heating GPU caches)...")
    try:
        warmup = run_benchmark("Say hello in 5 words.", max_tokens=20, label="Warmup")
        print(f" -> Warmup done: {warmup['completion_tokens']} tokens in {warmup['elapsed']:.2f}s")
        print(f" -> Warmup speed: {warmup['gen_tps_approx']:.2f} t/s (includes prompt eval)")
    except Exception as e:
        print(f" -> Warmup failed: {e}")

    # Main benchmark - several runs for statistical reliability
    print()
    print(f"[4/4] Running main benchmark ({total_runs} runs x {max_tokens} tokens)...")
    print("-" * 70)

    test_prompts = [
        "Write a detailed explanation of how neural networks learn. Cover backpropagation, gradient descent, and loss functions.",
        "Explain the history of the internet from ARPANET to modern day. Include key milestones and technological breakthroughs.",
        "Describe the complete process of photosynthesis in plants. Include both light-dependent and light-independent reactions.",
        "Write about the major differences between SQL and NoSQL databases, including use cases and performance characteristics.",
        "Explain quantum computing concepts including qubits, superposition, and entanglement in simple terms.",
    ]

    results = []
    for i in range(total_runs):
        # Modulo keeps this valid if total_runs ever exceeds the prompt count.
        prompt = test_prompts[i % len(test_prompts)]
        print(f"\n Run {i+1}/{total_runs}: {prompt[:50]}...")
        try:
            r = run_benchmark(prompt, max_tokens=max_tokens, label=f"Run {i+1}")
            results.append(r)
            print(f" Completion tokens: {r['completion_tokens']}")
            print(f" Total time: {r['elapsed']:.2f}s")
            print(f" Approx speed: {r['gen_tps_approx']:.2f} t/s (includes prompt eval)")
        except Exception as e:
            # One failed run must not discard the others.
            print(f" ERROR: {e}")

    if not results:
        # Previously the script ended silently when every run failed.
        print()
        print(f" ERROR: All {total_runs} benchmark runs failed; no results to summarize.")
        return

    _print_summary(results, total_runs)
|
||||
|
||||
# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user