"""
Quality A/B Test — Gemma 4 26B vs Qwen 3.5 35B
Quality comparison based on real-world service scenarios
"""
import urllib.request, json, time, sys, os

# Best effort: switch stdout to UTF-8 so the non-ASCII text this script prints
# (Korean scenario names, emoji status marks) does not fail on consoles whose
# default encoding cannot represent it.
try:
    sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    # stdout may be an object without reconfigure() or one that rejects the
    # call; continue with whatever encoding it already has. Unlike a bare
    # `except:`, this does not swallow KeyboardInterrupt / SystemExit.
    pass

# Root URL of the inference server under test.
BASE = "http://127.0.0.1:8000"
# Label for the model being tested: first command-line argument, or "unknown".
MODEL_NAME = sys.argv[1] if len(sys.argv) > 1 else "unknown"
# Path of the JSON file the collected responses are written to.
OUTPUT_FILE = f"scripts/quality_result_{MODEL_NAME}.json"

# Test scenarios, run in list order.
# Each entry is a dict with:
#   id            - short machine-readable identifier, stored in the results
#   category      - grouping label printed in the progress output
#   name          - human-readable title printed in the progress output
#   prompt        - text sent verbatim as the single user message
#   eval_criteria - labels stored alongside the response in the results file;
#                   nothing in this file scores them automatically
SCENARIOS = [
    # ═══ 1. Coding agent (VS Code) ═══
    {
        "id": "code_generate",
        "category": "coding",
        "name": "Python 함수 생성",
        "prompt": "Write a Python function `merge_sorted_lists(list1, list2)` that merges two sorted lists into one sorted list without using built-in sort. Include type hints, docstring, and handle edge cases. Show only the code.",
        "eval_criteria": ["correctness", "type_hints", "docstring", "edge_cases"]
    },
    {
        "id": "code_debug",
        "category": "coding",
        "name": "버그 찾기 & 수정",
        "prompt": """Find and fix the bug in this code:
```python
def find_duplicates(arr):
    seen = {}
    duplicates = []
    for item in arr:
        if item in seen:
            duplicates.append(item)
        seen[item] = True
    return list(set(duplicates))

# Bug: find_duplicates([1,2,2,3,3,3]) returns [2,3] but
# find_duplicates([]) crashes with unexpected behavior
# Also it should return count of each duplicate
```
Fix it to return a dict like {2: 2, 3: 3} (value=count of occurrences).""",
        "eval_criteria": ["bug_identified", "correct_fix", "clean_code"]
    },
    {
        "id": "code_refactor",
        "category": "coding",
        "name": "TypeScript 리팩토링",
        "prompt": """Refactor this messy TypeScript into clean, typed code:
```typescript
async function getData(url, retry, timeout) {
  let result = null
  for (let i = 0; i < retry; i++) {
    try {
      const r = await fetch(url, {signal: AbortSignal.timeout(timeout)})
      if (r.ok) {
        result = await r.json()
        break
      }
    } catch(e) {
      if (i === retry - 1) throw e
      await new Promise(r => setTimeout(r, 1000 * (i+1)))
    }
  }
  return result
}
```
Add proper types, error handling, configurable backoff, and make it production-ready.""",
        "eval_criteria": ["types", "error_handling", "backoff", "production_quality"]
    },

    # ═══ 2. Personal assistant (Discord Bot) — Korean ═══
    {
        "id": "korean_schedule",
        "category": "assistant_kr",
        "name": "한국어 일정 관리",
        "prompt": "내일 오후 2시에 팀 미팅이 있고, 3시에 치과 예약이 있어. 그리고 저녁 7시에 친구랑 홍대에서 만나기로 했어. 이 일정들을 정리해주고, 이동 시간을 고려해서 현실적으로 가능한지 알려줘. 서울 기준으로.",
        "eval_criteria": ["korean_fluency", "schedule_analysis", "practical_advice"]
    },
    {
        "id": "korean_email",
        "category": "assistant_kr",
        "name": "한국어 이메일 요약",
        "prompt": """다음 이메일을 3줄로 요약하고, 필요한 액션을 정리해줘:

안녕하세요 김팀장님,

지난 주 논의했던 Q2 마케팅 예산 관련하여 연락드립니다. 
본부장님께서 기존 제안 대비 15% 삭감을 요청하셨습니다. 
이에 따라 디지털 마케팅 채널 중 ROI가 낮은 채널을 우선 정리해야 할 것 같습니다.

리서치팀에서는 네이버 검색광고 대비 인스타그램 광고의 전환율이 
0.3%로 가장 낮다는 분석 결과를 공유했습니다. 
수요일까지 수정안을 제출해야 하니, 화요일 오전까지 
각 채널별 삭감 우선순위를 정리해서 회신 부탁드립니다.

감사합니다.
마케팅팀 박과장 드림""",
        "eval_criteria": ["korean_summary", "action_items", "conciseness"]
    },

    # ═══ 3. MCP tools (Function Calling) ═══
    {
        "id": "tool_calling",
        "category": "tool_use",
        "name": "Function Calling (JSON)",
        "prompt": """You have access to these tools:
- search_web(query: string) -> string
- get_calendar(date: string) -> list[Event]  
- send_email(to: string, subject: string, body: string) -> bool

User says: "Check my calendar for tomorrow, and if I have a meeting with John, search for the latest quarterly report and email him a summary."

Respond with the exact sequence of tool calls as JSON array. Use this format:
[{"tool": "name", "args": {...}}, ...]""",
        "eval_criteria": ["correct_sequence", "valid_json", "complete_args"]
    },
    {
        "id": "structured_output",
        "category": "tool_use",
        "name": "구조화 출력 (JSON)",
        "prompt": """Parse this unstructured text into a JSON object:

"삼성전자가 2026년 1분기 실적을 발표했다. 매출은 79조원으로 전년 동기 대비 12% 증가했고, 영업이익은 15.2조원을 기록했다. 반도체 부문이 전체 이익의 65%를 차지했으며, 특히 HBM4 수요 증가로 인해 메모리 사업부 매출이 전 분기 대비 23% 성장했다."

Output format:
{
  "company": "",
  "period": "",
  "revenue": {"amount": "", "unit": "", "yoy_change": ""},
  "operating_profit": {"amount": "", "unit": ""},
  "segments": [{"name": "", "profit_share": "", "highlights": ""}]
}""",
        "eval_criteria": ["correct_parsing", "valid_json", "completeness"]
    },

    # ═══ 4. General reasoning ═══
    {
        "id": "reasoning",
        "category": "reasoning",
        "name": "논리 추론",
        "prompt": "A farmer has 3 fields. Field A produces 20% more wheat than Field B. Field C produces 15% less than Field A. Together, all three fields produced 1,000 kg of wheat. How much did each field produce? Show your work step by step.",
        "eval_criteria": ["correct_answer", "clear_steps", "math_accuracy"]
    },
]


def ask(prompt: str, max_tokens: int = 800) -> dict:
    """Send one user prompt to the server's chat-completions endpoint.

    The request is a single-message conversation with temperature 0, a
    placeholder model name ("m") and a 120-second timeout.

    Args:
        prompt: Text sent as the sole user message.
        max_tokens: Upper bound on generated tokens passed to the server.

    Returns:
        A dict with:
          content - the first choice's message text ("" if the server
                    returned null content)
          tokens  - completion token count reported by the server (0 if
                    the server reported no usage)
          time    - request round-trip time in seconds, rounded to 2 places
          tps     - tokens / time, rounded to 2 places (0 if time is 0)

    Raises:
        urllib.error.URLError: the request failed or returned an HTTP error.
        ValueError: the response body is not valid JSON.
        KeyError, IndexError: the response lacks choices[0].message.content.
    """
    payload = json.dumps({
        "model": "m",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0
    }).encode("utf-8")
    req = urllib.request.Request(
        f"{BASE}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )

    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    # The context manager closes the connection even if reading/parsing fails.
    with urllib.request.urlopen(req, timeout=120) as http_resp:
        resp = json.loads(http_resp.read())
    dt = time.perf_counter() - t0

    # Some servers send `"usage": null` or omit the token count entirely.
    usage = resp.get("usage") or {}
    completion_tokens = usage.get("completion_tokens") or 0

    # A missing key still raises (malformed response); an explicit null is
    # normalised to "" so callers can slice/print the text safely.
    content = resp["choices"][0]["message"]["content"]
    if content is None:
        content = ""

    return {
        "content": content,
        "tokens": completion_tokens,
        "time": round(dt, 2),
        "tps": round(completion_tokens / dt, 2) if dt > 0 else 0
    }


def main():
    """Run every scenario against the server and save the responses as JSON.

    Steps:
      1. Print a banner with the model label and scenario count.
      2. Query ``{BASE}/health``; exit with status 1 unless it answers with
         ``{"status": "ok"}``.
      3. Send one short warmup request; exit with status 1 if it fails.
      4. Send each scenario prompt via ``ask`` and collect one record per
         scenario. A failed request is recorded with an ``ERROR: ...``
         response and zeroed metrics instead of aborting the run.
      5. Write all records to ``OUTPUT_FILE`` (UTF-8, non-ASCII preserved).

    Every record has the same keys: id, category, name, model, response,
    tokens, time, tps, eval_criteria.
    """
    banner = "=" * 60
    print(banner)
    print(f"  Quality A/B Test — Model: {MODEL_NAME}")
    print(f"  {len(SCENARIOS)} scenarios | {time.strftime('%Y-%m-%d %H:%M')}")
    print(f"{banner}\n")

    # Health check
    try:
        req = urllib.request.Request(f"{BASE}/health")
        with urllib.request.urlopen(req, timeout=5) as health_resp:
            health = json.loads(health_resp.read())
        # Kept inside the try so an unexpected (non-dict) payload is reported
        # as "not reachable" rather than crashing with a traceback.
        status = health.get("status")
    except Exception as e:
        print(f"Server not reachable: {e}")
        sys.exit(1)
    if status != "ok":
        print("Server not ready!")
        sys.exit(1)

    # Warmup: one throwaway request before the recorded runs.
    print("Warmup...", flush=True)
    try:
        ask("Hello", max_tokens=10)
    except Exception as e:
        print(f"Warmup failed: {e}")
        sys.exit(1)
    print("Done\n", flush=True)

    results = []
    total = len(SCENARIOS)
    for idx, sc in enumerate(SCENARIOS, start=1):
        print(f"[{idx}/{total}] {sc['category']} — {sc['name']}")
        print(f"  Prompt: {sc['prompt'][:80]}...", flush=True)

        # Fields shared by success and error records.
        record = {
            "id": sc["id"],
            "category": sc["category"],
            "name": sc["name"],
            "model": MODEL_NAME,
        }

        try:
            resp = ask(sc["prompt"])
        except Exception as e:
            # One failing scenario must not abort the remaining ones.
            print(f"  ❌ Error: {e}\n")
            record["response"] = f"ERROR: {e}"
            record["tokens"] = 0
            record["time"] = 0
            record["tps"] = 0
        else:
            # Guard against a null content so the preview slice cannot fail.
            preview = (resp["content"] or "")[:120]
            print(f"  ✅ {resp['tokens']} tokens | {resp['tps']:.1f} t/s | {resp['time']}s")
            print(f"  Response preview: {preview}...\n")
            record["response"] = resp["content"]
            record["tokens"] = resp["tokens"]
            record["time"] = resp["time"]
            record["tps"] = resp["tps"]

        # Present on error records too, so every record has the same schema.
        record["eval_criteria"] = sc["eval_criteria"]
        results.append(record)

    # Save. Create the target directory first so a missing folder does not
    # discard a completed run at the very last step.
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\n{banner}")
    print(f"  Results saved: {OUTPUT_FILE}")
    print(f"  Total scenarios: {len(results)}")
    print(banner)


# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()