443 lines
15 KiB
Python
443 lines
15 KiB
Python
"""
|
|
데이터 재검증 + Merton/Shadow Rating 재산출
|
|
|
|
1. 비정상 종목 필터링 (SPAC, 리츠, 펀드, ETF 등)
|
|
2. 금융업 필터 보강
|
|
3. DD 이상치 캡핑
|
|
4. EDF 단조성 보정 (isotonic regression)
|
|
5. Shadow Rating 재산출
|
|
6. 등급별 부도율 재산출
|
|
|
|
Usage:
|
|
python -m src.models.revalidate # 전체 재검증
|
|
python -m src.models.revalidate --dry # 필터링 결과만 확인 (DB 미갱신)
|
|
"""
|
|
import sys
|
|
import argparse
|
|
import yaml
|
|
import numpy as np
|
|
import pandas as pd
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from scipy.stats import norm
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
|
|
|
from src.data.database import get_connection, init_db
|
|
from src.models.merton import (
|
|
solve_merton, calculate_dd, calculate_edf, naive_dd,
|
|
dd_to_rating, DD_RATING_MAP, GLOBAL_DEFAULT_RATES
|
|
)
|
|
|
|
# ============================================================
# 1. Abnormal ticker filtering
# ============================================================
|
|
|
|
# Name fragments that mark non-operating vehicles (SPACs, REITs, funds, ETFs, ...).
EXCLUDE_KEYWORDS = [
    "스팩",
    "SPAC",
    "리츠",
    "REIT",
    "인프라",
    "호스팩",
    "호스펀드",
    "호펀드",
    "선박",
    "ETF",
    "ETN",
]

# "N호" / "제N호" series-number pattern used for SPAC detection.
# The regex is unanchored, so it matches anywhere in the name.
import re

SPAC_PATTERN = re.compile(r"(\d+호|제\d+호)")

# Financial-sector name fragments (extended list), matched as substrings.
FINANCIAL_KEYWORDS = [
    "은행",
    "금융",
    "보험",
    "증권",
    "캐피탈",
    "저축",
    "생명",
    "화재",
    "손해",
    "카드",
    "리스",
    "자산운용",
    "파이낸셜",
    "파이낸스",
    "벤처캐피탈",
    "투자증권",
    "종합금융",
    "상호저축",
    "새마을금고",
]

# Financial holding companies; compared against the full name (exact match only).
FINANCIAL_HOLDING_NAMES = [
    "KB금융",
    "신한지주",
    "하나금융지주",
    "우리금융지주",
    "BNK금융지주",
    "DGB금융지주",
    "JB금융지주",
    "한국금융지주",
    "메리츠금융지주",
]
|
|
|
|
|
|
def classify_ticker(name: str, leverage: float) -> str:
    """Classify a ticker from its company name and leverage ratio.

    Checks run in priority order; the first match wins:
    SPAC -> REIT/infrastructure -> ETF/ETN -> financial (by name) ->
    financial (by leverage) -> normal.

    Args:
        name: Company name. A missing, empty or non-string value (None, NaN)
            is classified as "normal" without looking at ``leverage``.
        leverage: Leverage ratio. NaN/None skips the leverage rule.

    Returns:
        One of "spac", "reit", "etf_fund", "financial" or "normal".
    """
    # pandas may hand us None or NaN for a missing name; substring tests on a
    # float would raise TypeError, so treat anything that is not a non-empty
    # string as unclassifiable.
    if not isinstance(name, str) or not name:
        return "normal"

    # SPAC: explicit marker in the name ...
    if "스팩" in name or "SPAC" in name:
        return "spac"
    # ... or a series number ("N호") combined with acquisition/merger wording.
    if SPAC_PATTERN.search(name) and any(kw in name for kw in ("기업인수", "합병")):
        return "spac"

    # REITs and infrastructure funds.
    if any(kw in name for kw in ("리츠", "REIT", "인프라")):
        return "reit"

    # Exchange-traded funds / notes.
    if "ETF" in name or "ETN" in name:
        return "etf_fund"

    # Financial sector by name: keyword substring or exact holding-company name.
    if any(kw in name for kw in FINANCIAL_KEYWORDS) or name in FINANCIAL_HOLDING_NAMES:
        return "financial"

    # Leverage above 0.90 is treated as a likely financial company.
    if pd.notna(leverage) and leverage > 0.90:
        return "financial"

    return "normal"
|
|
|
|
|
|
def filter_and_classify(conn) -> pd.DataFrame:
    """Load all Merton results with their financials and tag each row with a category.

    Args:
        conn: Open database connection accepted by ``pandas.read_sql_query``.

    Returns:
        One row per joined (merton_results, financial_data, companies) record
        with an extra ``category`` column produced by ``classify_ticker``
        from the company name and ``leverage_ratio``. No rows are dropped here.
    """
    # NOTE(review): financial_data is joined on ticker only. If that table can
    # hold several fiscal years per ticker, rows will be duplicated — confirm
    # against the schema.
    query = """
        SELECT
            mr.ticker, c.name, mr.DD, mr.EDF, mr.E, mr.D,
            mr.sigma_E, mr.sigma_V, mr.leverage, mr.method,
            f.leverage_ratio, f.total_assets, f.total_equity,
            f.roa, f.interest_coverage, f.log_assets,
            f.current_liabilities, f.non_current_liabilities,
            f.total_liabilities, f.default_point,
            f.operating_income, f.net_income,
            mr.base_date, mr.fin_year
        FROM merton_results mr
        JOIN financial_data f ON mr.ticker = f.ticker
        JOIN companies c ON mr.ticker = c.ticker
    """
    df = pd.read_sql_query(query, conn)

    # Build the column from a plain list instead of DataFrame.apply(axis=1):
    # on an empty frame apply() can return a DataFrame rather than a Series,
    # which makes the single-column assignment fail.
    df["category"] = [
        classify_ticker(name, leverage_ratio)
        for name, leverage_ratio in zip(df["name"], df["leverage_ratio"])
    ]

    return df
|
|
|
|
|
|
# ============================================================
# 2. DD outlier capping + EDF floor
# ============================================================
|
|
|
|
def apply_dd_caps(df: pd.DataFrame, lower: float = -5, upper: float = 15) -> pd.DataFrame:
    """Clip distance-to-default (DD) to ``[lower, upper]`` and recompute EDF.

    With the defaults, DD is limited to [-5, 15]. The original author's
    rationale: DD above 15 means a default probability that is effectively
    zero (differences are numerically meaningless), and DD below -5 means
    the firm is already in default.

    Args:
        df: Frame with a numeric ``DD`` column. It is not modified.
        lower: Lower DD bound.
        upper: Upper DD bound.

    Returns:
        A copy of ``df`` whose ``DD`` is clipped and whose ``EDF`` column is
        overwritten with ``norm.cdf(-DD)`` computed from the clipped DD.

    Side effects:
        Prints how many rows hit each bound.
    """
    df = df.copy()

    raw_dd = df["DD"]
    # Count the outliers before clipping so the report reflects the raw data.
    capped_high = (raw_dd > upper).sum()
    capped_low = (raw_dd < lower).sum()

    df["DD"] = raw_dd.clip(lower, upper)

    # Vectorised EDF = Phi(-DD) on the clipped values (NaN DD stays NaN).
    df["EDF"] = norm.cdf(-df["DD"].to_numpy(dtype=float))

    print(f" DD 캡핑: 상한(>{upper:g})={capped_high}건, 하한(<{lower:g})={capped_low}건")

    return df
|
|
|
|
|
|
# ============================================================
# 3. Composite Score + Shadow Rating (improved)
# ============================================================
|
|
|
|
def compute_improved_shadow(df: pd.DataFrame) -> pd.DataFrame:
|
|
"""개선된 Shadow Rating: DD 가중치 높이고 monotonicity 보장"""
|
|
df = df.copy()
|
|
|
|
def zscore(s):
|
|
mean, std = s.mean(), s.std()
|
|
if std == 0 or pd.isna(std):
|
|
return pd.Series(0, index=s.index)
|
|
return (s - mean) / std
|
|
|
|
z_dd = zscore(df["DD"])
|
|
z_lev = -zscore(df["leverage_ratio"].fillna(0.5))
|
|
|
|
roa = df["roa"].fillna(0).clip(-1, 1)
|
|
z_roa = zscore(roa)
|
|
|
|
icr = df["interest_coverage"].fillna(0).clip(-10, 100)
|
|
z_icr = zscore(icr)
|
|
|
|
z_size = zscore(df["log_assets"].fillna(df["log_assets"].median()))
|
|
|
|
# DD에 70% 가중치 (EDF 역전 최소화)
|
|
df["composite_score"] = (
|
|
0.70 * z_dd + # DD 핵심
|
|
0.10 * z_lev + # 레버리지
|
|
0.10 * z_roa + # 수익성
|
|
0.05 * z_icr + # 이자보상
|
|
0.05 * z_size # 규모
|
|
)
|
|
|
|
# Score 내림차순 정렬 → 등급 할당
|
|
df = df.sort_values("composite_score", ascending=False).reset_index(drop=True)
|
|
n = len(df)
|
|
|
|
# 등급 분포 (현실적 S&P 분포 기반 — 한국 시장 조정)
|
|
rating_dist = {
|
|
"AAA": 0.005, "AA+": 0.01, "AA": 0.02, "AA-": 0.03,
|
|
"A+": 0.05, "A": 0.07, "A-": 0.08,
|
|
"BBB+": 0.08, "BBB": 0.10, "BBB-": 0.09,
|
|
"BB+": 0.08, "BB": 0.09, "BB-": 0.07,
|
|
"B+": 0.06, "B": 0.05, "B-": 0.04,
|
|
"CCC+": 0.02, "CCC": 0.02, "CCC-": 0.03,
|
|
}
|
|
|
|
grades = list(rating_dist.keys())
|
|
idx = 0
|
|
df["shadow_rating"] = ""
|
|
for i, grade in enumerate(grades):
|
|
if i == len(grades) - 1:
|
|
count = n - idx
|
|
else:
|
|
count = max(1, round(n * rating_dist[grade]))
|
|
df.loc[idx:idx+count-1, "shadow_rating"] = grade
|
|
idx += count
|
|
df.loc[df["shadow_rating"] == "", "shadow_rating"] = grades[-1]
|
|
|
|
return df
|
|
|
|
|
|
# ============================================================
# 4. EDF monotonicity correction
# ============================================================
|
|
|
|
def _pool_adjacent_violators(values):
    """Return the non-decreasing least-squares fit of *values* (unweighted PAVA)."""
    # Each block is [sum, count]; adjacent blocks are merged while the earlier
    # block's mean exceeds the later one's.
    blocks = []
    for value in values:
        blocks.append([float(value), 1])
        while (
            len(blocks) > 1
            and blocks[-2][0] / blocks[-2][1] > blocks[-1][0] / blocks[-1][1]
        ):
            total, count = blocks.pop()
            blocks[-1][0] += total
            blocks[-1][1] += count

    fitted = []
    for total, count in blocks:
        fitted.extend([total / count] * count)
    return fitted


def enforce_monotonicity(dr_df: pd.DataFrame) -> pd.DataFrame:
    """Make per-grade default rates non-decreasing from best to worst grade.

    Rows are ordered by the position of ``rating_grade`` in
    ``GLOBAL_DEFAULT_RATES`` (best grade first) and ``korean_dr`` is replaced
    by its isotonic (pool-adjacent-violators) fit. The previous pairwise
    averaging could leave inversions behind: 3, 1, 0 became 1.5, 1.5, 1.

    Args:
        dr_df: Frame with ``rating_grade`` and ``korean_dr`` columns. It is
            not modified.

    Returns:
        A copy sorted by grade order with an added ``korean_dr_monotone``
        column. Every grade is weighted equally (``n_firms`` is not used).
    """
    dr_df = dr_df.copy()

    if dr_df.empty:
        # Nothing to correct; still provide the output column for callers.
        dr_df["korean_dr_monotone"] = pd.Series(dtype="float64")
        return dr_df

    # Grade order, best -> worst. Grades missing from the table get -1 and
    # therefore sort ahead of every known grade.
    grade_rank = {grade: pos for pos, grade in enumerate(GLOBAL_DEFAULT_RATES)}
    dr_df["rating_idx"] = dr_df["rating_grade"].map(
        lambda grade: grade_rank.get(grade, -1)
    )
    dr_df = dr_df.sort_values("rating_idx", kind="stable")

    dr_df["korean_dr_monotone"] = _pool_adjacent_violators(dr_df["korean_dr"].tolist())
    dr_df = dr_df.drop(columns=["rating_idx"])

    return dr_df
|
|
|
|
|
|
# ============================================================
# 5. Default rates per rating grade
# ============================================================
|
|
|
|
def compute_default_rates(df: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Compute per-grade default rates: raw, blended, Bayesian and monotone-corrected.

    For every grade in ``GLOBAL_DEFAULT_RATES`` that has at least one firm:

    * ``korean_dr``   - mean EDF of the firms in the grade.
    * ``weight_kr``   - ``min(n_firms / threshold, 1)``, the credibility of the
      local estimate.
    * ``blended_dr``  - ``weight_kr``-weighted mix of ``korean_dr`` and the
      global rate.
    * ``bayesian_dr`` - posterior mean of a Beta prior centred on the global
      rate (total pseudo-count ``prior_strength``) updated with ``n_firms``
      pseudo-observations at rate ``korean_dr``.
    * ``korean_dr_monotone`` - ``korean_dr`` after ``enforce_monotonicity``.
    * ``final_dr``    - the same blend using the monotone rate, floored at 1e-5.

    Args:
        df: Frame with ``shadow_rating`` and ``EDF`` columns.
        config: Settings mapping; ``blending.threshold`` and
            ``blending.bayesian_prior_strength`` are read (both default to 50).
            ``None`` is treated as an empty mapping.

    Returns:
        One row per populated grade, or an empty frame carrying the same
        columns when no grade has any firms.
    """
    blending = (config or {}).get("blending", {}) or {}
    threshold = blending.get("threshold", 50)
    prior_strength = blending.get("bayesian_prior_strength", 50)

    results = []
    for rating, global_dr in GLOBAL_DEFAULT_RATES.items():
        subset = df[df["shadow_rating"] == rating]
        n_firms = len(subset)
        if n_firms == 0:
            continue

        korean_dr = subset["EDF"].mean()

        # A non-positive threshold means "no minimum sample size": trust the
        # local estimate fully instead of dividing by zero.
        weight_kr = min(n_firms / threshold, 1.0) if threshold > 0 else 1.0
        blended_dr = weight_kr * korean_dr + (1 - weight_kr) * global_dr

        # Beta-Binomial style update with fractional "defaults".
        alpha_post = global_dr * prior_strength + n_firms * korean_dr
        beta_post = (1 - global_dr) * prior_strength + n_firms * (1 - korean_dr)
        bayesian_dr = alpha_post / (alpha_post + beta_post)

        results.append({
            "rating_grade": rating,
            "n_firms": n_firms,
            "korean_dr": korean_dr,
            "global_dr": global_dr,
            "weight_kr": weight_kr,
            "blended_dr": blended_dr,
            "bayesian_dr": bayesian_dr,
        })

    columns = [
        "rating_grade", "n_firms", "korean_dr", "global_dr",
        "weight_kr", "blended_dr", "bayesian_dr",
    ]
    dr_df = pd.DataFrame(results, columns=columns)

    if dr_df.empty:
        # No populated grade: return the expected schema rather than failing
        # on missing columns further down.
        dr_df["korean_dr_monotone"] = pd.Series(dtype="float64")
        dr_df["final_dr"] = pd.Series(dtype="float64")
        return dr_df

    # Monotonicity correction of the raw Korean rates.
    dr_df = enforce_monotonicity(dr_df)

    # Final rate = blend of the monotone Korean rate and the global rate.
    # weight_kr is already capped at 1.0 when it is computed above.
    weight = dr_df["weight_kr"]
    dr_df["final_dr"] = (
        weight * dr_df["korean_dr_monotone"] + (1 - weight) * dr_df["global_dr"]
    )

    # Floor: even the best grade keeps at least 1e-5 (0.001%, i.e. 0.1 bp).
    dr_df["final_dr"] = dr_df["final_dr"].clip(lower=0.00001)

    return dr_df
|
|
|
|
|
|
# ============================================================
|
|
# Main
|
|
# ============================================================
|
|
|
|
def load_config() -> dict:
    """Load ``config/settings.yaml`` from three directories above this file.

    Returns:
        The parsed YAML document. An empty file (which ``yaml.safe_load``
        parses to ``None``) yields ``{}`` so callers can always use ``.get``.

    Raises:
        OSError: If the settings file cannot be opened.
    """
    config_path = Path(__file__).parent.parent.parent / "config" / "settings.yaml"
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f) or {}
|
|
|
|
|
|
def _print_banner(title: str, leading_blank: bool = True) -> None:
    """Print *title* between two 60-character '=' rules."""
    rule = "=" * 60
    print("\n" + rule if leading_blank else rule)
    print(title)
    print(rule)


def _report_rating_distribution(df_clean: pd.DataFrame) -> None:
    """Print count, mean DD and mean EDF per grade and flag EDF inversions."""
    dist = df_clean["shadow_rating"].value_counts().sort_index()
    prev_edf = -1
    for rating, count in dist.items():
        if count <= 0:
            continue
        members = df_clean[df_clean["shadow_rating"] == rating]
        avg_dd = members["DD"].mean()
        avg_edf = members["EDF"].mean()
        # A worse grade with a lower mean EDF than the previous grade is an inversion.
        inv = " <<<INVERSION" if avg_edf < prev_edf and prev_edf >= 0 else ""
        print(f" {rating:5s}: {count:4d} DD={avg_dd:6.2f} EDF={avg_edf:.6f}{inv}")
        prev_edf = avg_edf


def _save_results(conn, df: pd.DataFrame, df_clean: pd.DataFrame, dr_df: pd.DataFrame) -> None:
    """Replace merton_results / default_rates and refresh companies.is_financial.

    All statements are committed together; on any failure the pending changes
    (including the initial DELETEs) are rolled back and the error re-raised.
    """
    try:
        # Wipe and re-insert both result tables.
        conn.execute("DELETE FROM merton_results")
        conn.execute("DELETE FROM default_rates")

        base_date_str = (
            df_clean["base_date"].iloc[0]
            if "base_date" in df_clean.columns
            else datetime.now().strftime("%Y-%m-%d")
        )

        for _, row in df_clean.iterrows():
            conn.execute("""
                INSERT OR REPLACE INTO merton_results
                (ticker, base_date, fin_year, E, sigma_E, D, V, sigma_V, DD, EDF, leverage, method, dd_rating)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                row["ticker"], row.get("base_date", base_date_str), int(row.get("fin_year", 2024)),
                row["E"], row["sigma_E"], row["D"],
                row.get("E", 0) + row.get("D", 0),  # V approximated as E + D
                row["sigma_V"], row["DD"], row["EDF"],
                row["leverage"], row["method"], row["shadow_rating"],
            ))

        # Column mapping: korean_dr <- monotone-corrected rate,
        # blended_dr <- final_dr; n_defaults is always stored as 0.
        today = datetime.now().strftime("%Y-%m-%d")
        for _, row in dr_df.iterrows():
            conn.execute("""
                INSERT OR REPLACE INTO default_rates
                (base_date, rating_grade, n_firms, n_defaults, korean_dr, global_dr, weight_kr, blended_dr, bayesian_dr)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                today,
                row["rating_grade"], int(row["n_firms"]), 0,
                row["korean_dr_monotone"], row["global_dr"], row["weight_kr"],
                row["final_dr"], row["bayesian_dr"],
            ))

        # Reset the flag, then set it for every ticker whose category is not
        # "normal" (this includes spac / reit / etf_fund, not only financials).
        conn.execute("UPDATE companies SET is_financial = 0")
        for _, row in df[df["category"] != "normal"].iterrows():
            conn.execute(
                "UPDATE companies SET is_financial = 1 WHERE ticker = ?",
                (row["ticker"],),
            )

        conn.commit()
    except Exception:
        conn.rollback()
        raise


def main():
    """Run the full re-validation pipeline from the command line.

    Steps: classify tickers and keep the "normal" ones, cap DD and recompute
    EDF, recompute shadow ratings, compute per-grade default rates, and -
    unless ``--dry`` is given - write the results back to the database.

    Returns:
        ``(df_clean, dr_df)``: the rated firms and the per-grade default rates.
        If no ticker survives filtering, the run stops before any DB write and
        ``dr_df`` is an empty frame.
    """
    parser = argparse.ArgumentParser(description="데이터 재검증 + 재산출")
    parser.add_argument("--dry", action="store_true", help="DB 미갱신 (확인만)")
    args = parser.parse_args()

    config = load_config()
    conn = init_db()

    # Close the connection on every exit path, including failures.
    try:
        # 1) Classification + filtering
        _print_banner("[1/5] 종목 분류 + 필터링", leading_blank=False)

        df = filter_and_classify(conn)
        print(f" 전체: {len(df)}개")

        for cat, cnt in df["category"].value_counts().items():
            example = df[df["category"] == cat]["name"].iloc[0] if cnt > 0 else ""
            print(f" {cat:12s}: {cnt:5d}개 (예: {example})")

        # Drop everything that is not a normal operating company.
        df_clean = df[df["category"] == "normal"].copy()
        removed = len(df) - len(df_clean)
        print(f"\n -> 정상 종목: {len(df_clean)}개 (제거: {removed}개)")

        if df_clean.empty:
            # Nothing to rate; stop before the tables would be wiped.
            print("\n 정상 종목이 없어 재산출을 중단합니다.")
            return df_clean, pd.DataFrame()

        # 2) DD capping
        _print_banner("[2/5] DD 캡핑 + EDF 재계산")

        df_clean = apply_dd_caps(df_clean)

        print(f" DD 통계: 평균={df_clean['DD'].mean():.2f}, 중앙={df_clean['DD'].median():.2f}")
        print(f" EDF 통계: 평균={df_clean['EDF'].mean():.6f}, 중앙={df_clean['EDF'].median():.6f}")

        # 3) Shadow rating recomputation
        _print_banner("[3/5] Shadow Rating 재산출 (DD 70%)")

        df_clean = compute_improved_shadow(df_clean)

        # Ordered categorical so grades sort best -> worst in the report.
        rating_order = list(GLOBAL_DEFAULT_RATES.keys())
        df_clean["shadow_rating"] = pd.Categorical(
            df_clean["shadow_rating"], categories=rating_order, ordered=True
        )
        _report_rating_distribution(df_clean)

        # 4) Default rates per grade
        _print_banner("[4/5] 등급별 부도율 (단조보정 + 블렌딩)")

        dr_df = compute_default_rates(df_clean, config)

        print(f"\n{'grade':>5} | {'N':>4} | {'EDF_KR':>10} | {'monotone':>10} | {'global':>10} | {'final':>10}")
        print("-" * 65)
        for _, row in dr_df.iterrows():
            print(f" {row['rating_grade']:5s} | {row['n_firms']:4d} | "
                  f"{row['korean_dr']:10.6f} | {row['korean_dr_monotone']:10.6f} | "
                  f"{row['global_dr']:10.6f} | {row['final_dr']:10.6f}")

        # 5) Persist
        if args.dry:
            print("\n [DRY RUN] DB 미갱신")
        else:
            _print_banner("[5/5] DB 저장")
            _save_results(conn, df, df_clean, dr_df)
            print(f" merton_results: {len(df_clean)}건 저장")
            print(f" default_rates: {len(dr_df)}건 저장")
    finally:
        conn.close()

    return df_clean, dr_df


if __name__ == "__main__":
    main()
|