Files
LifetimePD/validation/statistical_tests.py
Variet Agent 3a9374c61a feat: Lifetime PD (50yr) - Belkin & Suchower + Vasicek model
- Belkin & Suchower (1998) credit cycle index (Zt) estimation via WLS
- Vasicek single-factor conditional PD/TM model
- Macro-Zt OLS regression with stepwise variable selection
- 3-scenario (boom/neutral/recession) 50yr PD projection
- Statistical validation suite (ADF, Ljung-Box, R2, ARCH)
- BOK ECOS API integration with fallback data
- Visualization module (7 chart types)
- Detailed theoretical methodology docs/methodology.md
2026-03-10 21:57:34 +09:00

335 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
통계적 유의성 검증 모듈
Zt 시계열 및 거시연계 회귀모형의 통계적 타당성을 엄밀하게 검증합니다.
검증 항목:
1. Zt 시계열: ADF 단위근 검정, Shapiro-Wilk 정규성 검정
2. 회귀 모형: R², F-test, AIC/BIC, 잔차 진단
3. 잔차: Durbin-Watson, Ljung-Box, ARCH-LM, Breusch-Pagan
4. 구조적 안정성: CUSUM(추정 가능시)
5. 다중공선성: VIF
참고:
- Greene, W.H. (2018). "Econometric Analysis" 8th ed.
- Hamilton, J.D. (1994). "Time Series Analysis"
"""
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import (
het_breuschpagan, acorr_ljungbox, het_arch
)
from statsmodels.stats.stattools import durbin_watson
from typing import Dict, Optional, List
import logging
logger = logging.getLogger(__name__)
def test_stationarity(
    series: np.ndarray,
    name: str = "Zt",
    significance: float = 0.05
) -> Dict:
    """
    Run the Augmented Dickey-Fuller (ADF) unit-root test on a series.

    H0: the series has a unit root (non-stationary).
    H1: the series is stationary.

    Parameters
    ----------
    series : np.ndarray
        Observations to test; handed to ``adfuller`` unchanged.
    name : str
        Label stored under ``"variable"`` and used in the log line.
    significance : float
        Level the p-value is compared against (strictly below => stationary).

    Returns
    -------
    dict
        Keys: ``test_name``, ``variable``, ``test_statistic``, ``p_value``,
        ``lags_used``, ``n_obs``, ``critical_values``, ``is_stationary``,
        ``conclusion``.
    """
    # With autolag set, adfuller returns a 6-tuple; the trailing information
    # criterion value is not reported here.
    adf_stat, p_value, lags_used, n_obs, critical_values = adfuller(
        series, autolag="AIC"
    )[:5]

    # Plain Python scalars keep the result dict easy to serialise and compare.
    adf_stat = float(adf_stat)
    p_value = float(p_value)
    is_stationary = bool(p_value < significance)

    output = {
        "test_name": "ADF (Augmented Dickey-Fuller)",
        "variable": name,
        "test_statistic": adf_stat,
        "p_value": p_value,
        "lags_used": int(lags_used),
        "n_obs": int(n_obs),
        "critical_values": critical_values,
        "is_stationary": is_stationary,
        "conclusion": f"{'정상' if is_stationary else '비정상'} 시계열 "
                      f"(p={p_value:.4f}, α={significance})"
    }
    # A separator is required between the p-value and the verdict; without it
    # the log reads e.g. "p-value=0.0123Pass".
    logger.info(f"ADF 검정 [{name}]: statistic={adf_stat:.4f}, "
                f"p-value={p_value:.4f} {'Pass' if is_stationary else 'FAIL'}")
    return output


# The ``test_`` prefix is part of the public name; this flag stops pytest from
# collecting the function as a test case when it is imported into a test module.
test_stationarity.__test__ = False
def test_normality(
    series: np.ndarray,
    name: str = "Zt",
    significance: float = 0.05
) -> Dict:
    """
    Run the Shapiro-Wilk normality test and collect descriptive moments.

    H0: the sample is drawn from a normal distribution.
    H1: the sample is not drawn from a normal distribution.

    Parameters
    ----------
    series : np.ndarray
        Sample to test; handed to ``scipy.stats.shapiro`` unchanged.
    name : str
        Label stored under ``"variable"`` and used in the log line.
    significance : float
        Level the p-value is compared against (strictly above => normal).

    Returns
    -------
    dict
        Keys: ``test_name``, ``variable``, ``test_statistic``, ``p_value``,
        ``is_normal``, ``mean``, ``std``, ``skewness``, ``kurtosis``,
        ``conclusion``. ``std`` is ``np.std`` with its default arguments;
        ``skewness`` / ``kurtosis`` are ``scipy.stats.skew`` /
        ``scipy.stats.kurtosis`` with their default arguments.
    """
    w_stat, p_value = stats.shapiro(series)

    # Converted like the moment fields below so every numeric entry of the
    # result is a plain Python scalar.
    w_stat = float(w_stat)
    p_value = float(p_value)
    is_normal = bool(p_value > significance)

    output = {
        "test_name": "Shapiro-Wilk Normality Test",
        "variable": name,
        "test_statistic": w_stat,
        "p_value": p_value,
        "is_normal": is_normal,
        "mean": float(np.mean(series)),
        "std": float(np.std(series)),
        "skewness": float(stats.skew(series)),
        "kurtosis": float(stats.kurtosis(series)),
        "conclusion": f"{'정규분포' if is_normal else '비정규분포'} "
                      f"(p={p_value:.4f}, α={significance})"
    }
    # A separator is required between the p-value and the verdict; without it
    # the log reads e.g. "p-value=0.0123Pass".
    logger.info(f"정규성 검정 [{name}]: W={w_stat:.4f}, "
                f"p-value={p_value:.4f} {'Pass' if is_normal else 'FAIL'}")
    return output


# The ``test_`` prefix is part of the public name; this flag stops pytest from
# collecting the function as a test case when it is imported into a test module.
test_normality.__test__ = False
def test_serial_correlation(
    residuals: np.ndarray,
    lags: int = 5,
    significance: float = 0.05
) -> Dict:
    """
    Test regression residuals for serial correlation.

    1) Durbin-Watson: d close to 2 indicates no first-order autocorrelation.
    2) Ljung-Box Q-test: H0 = no autocorrelation up to the tested lag.

    Parameters
    ----------
    residuals : np.ndarray
        Residual series.
    lags : int
        Requested Ljung-Box lag. It is capped at ``len(residuals) - 1``
        because the statistic is undefined for lags at or beyond the sample
        size.
    significance : float
        Level the Ljung-Box p-value is compared against.

    Returns
    -------
    dict
        Keys: ``test_name``, ``durbin_watson``, ``dw_interpretation``,
        ``ljung_box_statistic``, ``ljung_box_pvalue``, ``ljung_box_lags``
        (the lag actually used), ``no_autocorrelation``, ``conclusion``.
        The pass/fail flag is driven by the Ljung-Box p-value only; the
        Durbin-Watson value is reported with a rule-of-thumb reading.
    """
    # Durbin-Watson
    dw = float(durbin_watson(residuals))

    # Ljung-Box: keep the lag inside the range the sample can support.
    n_obs = len(residuals)
    effective_lags = max(1, min(lags, n_obs - 1))
    if effective_lags != lags:
        logger.warning(f"Ljung-Box 시차 조정: {lags} -> {effective_lags} "
                       f"(n={n_obs})")

    lb_table = acorr_ljungbox(residuals, lags=[effective_lags], return_df=True)
    lb_stat = float(lb_table["lb_stat"].values[0])
    lb_pvalue = float(lb_table["lb_pvalue"].values[0])
    no_autocorr = bool(lb_pvalue > significance)

    # Rule-of-thumb bands around the ideal value of 2.
    if dw < 1.5:
        dw_reading = "양의 자기상관"
    elif dw > 2.5:
        dw_reading = "음의 자기상관"
    else:
        dw_reading = "자기상관 없음"

    output = {
        "test_name": "Serial Correlation Tests",
        "durbin_watson": dw,
        "dw_interpretation": dw_reading,
        "ljung_box_statistic": lb_stat,
        "ljung_box_pvalue": lb_pvalue,
        "ljung_box_lags": effective_lags,
        "no_autocorrelation": no_autocorr,
        "conclusion": f"{'자기상관 없음' if no_autocorr else '자기상관 존재'} "
                      f"(DW={dw:.3f}, LB p={lb_pvalue:.4f})"
    }
    logger.info(f"자기상관 검정: DW={dw:.3f}, LB p-value={lb_pvalue:.4f} "
                f"{'Pass' if no_autocorr else 'FAIL'}")
    return output


# The ``test_`` prefix is part of the public name; this flag stops pytest from
# collecting the function as a test case when it is imported into a test module.
test_serial_correlation.__test__ = False
def test_heteroscedasticity(
    residuals: np.ndarray,
    exog: np.ndarray,
    significance: float = 0.05
) -> Dict:
    """
    Test regression residuals for heteroscedasticity.

    1) Breusch-Pagan: H0 = homoscedastic residuals.
    2) ARCH-LM (3 lags): H0 = no ARCH effect.

    Each test is best-effort: if it raises, the failure is logged and its
    statistic and p-value are treated as NaN. A test that could not be
    computed does not count against homoscedasticity, so the verdict is
    "homoscedastic" unless at least one computed p-value is at or below
    ``significance``.

    Parameters
    ----------
    residuals : np.ndarray
        Residual series.
    exog : np.ndarray
        Explanatory-variable matrix passed to ``het_breuschpagan``.
    significance : float
        Level both p-values are compared against.

    Returns
    -------
    dict
        Keys: ``test_name``, ``breusch_pagan_stat``, ``breusch_pagan_pvalue``,
        ``arch_lm_stat``, ``arch_lm_pvalue`` (each ``None`` when the test
        could not be computed), ``is_homoscedastic``, ``conclusion``.
    """
    def _or_none(value):
        """Map NaN to None, anything else to a plain float."""
        return None if np.isnan(value) else float(value)

    # Breusch-Pagan
    try:
        bp_stat, bp_pvalue, _, _ = het_breuschpagan(residuals, exog)
    except Exception:
        # Deliberately broad (the library can fail in several ways), but the
        # failure is recorded instead of being dropped silently.
        logger.warning("Breusch-Pagan 검정 실패", exc_info=True)
        bp_stat, bp_pvalue = np.nan, np.nan

    # ARCH-LM
    try:
        arch_result = het_arch(residuals, nlags=3)
        arch_stat, arch_pvalue = arch_result[0], arch_result[1]
    except Exception:
        logger.warning("ARCH-LM 검정 실패", exc_info=True)
        arch_stat, arch_pvalue = np.nan, np.nan

    bp_ok = bool(np.isnan(bp_pvalue) or bp_pvalue > significance)
    arch_ok = bool(np.isnan(arch_pvalue) or arch_pvalue > significance)
    homoscedastic = bp_ok and arch_ok

    output = {
        "test_name": "Heteroscedasticity Tests",
        "breusch_pagan_stat": _or_none(bp_stat),
        "breusch_pagan_pvalue": _or_none(bp_pvalue),
        "arch_lm_stat": _or_none(arch_stat),
        "arch_lm_pvalue": _or_none(arch_pvalue),
        "is_homoscedastic": homoscedastic,
        "conclusion": f"{'등분산' if homoscedastic else '이분산'} "
                      f"(BP p={bp_pvalue:.4f}, ARCH p={arch_pvalue:.4f})"
    }
    logger.info(f"이분산 검정: BP p={bp_pvalue:.4f}, ARCH p={arch_pvalue:.4f} "
                f"{'Pass' if homoscedastic else 'FAIL'}")
    return output


# The ``test_`` prefix is part of the public name; this flag stops pytest from
# collecting the function as a test case when it is imported into a test module.
test_heteroscedasticity.__test__ = False
def validate_pd_properties(
    cumulative_pds: np.ndarray,
    grade_names: Optional[List[str]] = None
) -> Dict:
    """
    Check the mathematical properties of a cumulative PD matrix.

    The matrix is read as rows = successive horizons, columns = grades.

    1) Range: every value lies in [0, 1] (upper bound checked with a 1e-4
       tolerance). NaN values count as out of range.
    2) Monotonicity: within each column the values never decrease from one
       row to the next (tolerance 1e-10).
    3) Grade ordering: in the last row, each column's PD does not exceed the
       PD of the column to its right (tolerance 1e-6).

    Parameters
    ----------
    cumulative_pds : np.ndarray
        2-D array of cumulative PDs.
    grade_names : list of str, optional
        Column labels used in issue messages. Columns without a label fall
        back to ``"Grade<index>"``.

    Returns
    -------
    dict
        Keys: ``test_name``, ``range_valid``, ``monotone_valid``,
        ``order_valid``, ``all_valid``, ``issues`` (list of messages),
        ``conclusion``.
    """
    pds = np.asarray(cumulative_pds, dtype=float)
    n_grades = pds.shape[1]

    def grade_label(col: int) -> str:
        """Return the caller-supplied name for a column, or a generic one."""
        if grade_names and col < len(grade_names):
            return grade_names[col]
        return f"Grade{col}"

    issues = []

    # 1) Range. Written as a positive "inside the interval" test so that NaN,
    #    which fails every comparison, is reported rather than slipping through.
    range_valid = bool(np.all((pds >= 0) & (pds <= 1.0001)))
    if not range_valid:
        issues.append("PD 값이 [0,1] 범위를 벗어남")

    # 2) Monotonicity, one message per offending column in column order.
    step_changes = np.diff(pds, axis=0)
    decreasing_cols = np.flatnonzero(np.any(step_changes < -1e-10, axis=0))
    for col in decreasing_cols:
        issues.append(f"누적 PD 단조증가 위반: {grade_label(int(col))}")
    monotone_valid = decreasing_cols.size == 0

    # 3) Grade ordering on the final row (the terminal cumulative PD).
    final_pds = pds[-1]
    inverted = np.flatnonzero(final_pds[:-1] > final_pds[1:] + 1e-6)
    for col in inverted:
        left, right = grade_label(int(col)), grade_label(int(col) + 1)
        issues.append(f"등급 순서 위반: PD({left}) > PD({right})")
    order_valid = inverted.size == 0

    # The flags come from the checks themselves, not from searching the
    # message text, because messages embed arbitrary grade names.
    all_valid = not issues
    return {
        "test_name": "PD Properties Validation",
        "range_valid": range_valid,
        "monotone_valid": bool(monotone_valid),
        "order_valid": bool(order_valid),
        "all_valid": all_valid,
        "issues": issues,
        "conclusion": "모든 검증 통과" if all_valid else f"이슈 {len(issues)}"
    }
def _summary_row(test, target, statistic, p_value, passed, note) -> Dict:
    """Build one row of the validation summary table."""
    return {
        "검정": test,
        "대상": target,
        "통계량": statistic,
        "p-value": p_value,
        "결과": "Pass O" if passed else "Fail X",
        "해석": note,
    }


def run_full_validation(
    zt_series: np.ndarray,
    regression_result,
    pd_results: Dict,
    grade_names: Optional[List[str]] = None,
    significance: float = 0.05
) -> pd.DataFrame:
    """
    Run every validation and return the results as a summary table.

    Parameters
    ----------
    zt_series : np.ndarray
        Estimated Zt time series.
    regression_result : statsmodels regression results object or None
        Fitted regression. The attributes read are ``resid``,
        ``model.exog``, ``rsquared``, ``rsquared_adj`` and ``f_pvalue``.
        When ``None`` the residual and fit rows are skipped.
    pd_results : Dict
        Scenario results. Read as
        ``pd_results["by_scenario"][<scenario>]["cumulative_pd"]``; a missing
        ``"by_scenario"`` key (or ``None``) yields no PD rows.
    grade_names : list of str, optional
        Grade labels forwarded to ``validate_pd_properties``.
    significance : float
        Level used by every test, including the regression F-test.

    Returns
    -------
    pd.DataFrame
        One row per check with columns 검정, 대상, 통계량, p-value, 결과, 해석.
    """
    rows = []

    # 1. Zt time series checks
    adf = test_stationarity(zt_series, "Zt", significance)
    rows.append(_summary_row(
        adf["test_name"],
        "Zt 시계열",
        f"{adf['test_statistic']:.4f}",
        f"{adf['p_value']:.4f}",
        adf["is_stationary"],
        adf["conclusion"],
    ))

    normality = test_normality(zt_series, "Zt", significance)
    rows.append(_summary_row(
        normality["test_name"],
        "Zt 시계열",
        f"{normality['test_statistic']:.4f}",
        f"{normality['p_value']:.4f}",
        normality["is_normal"],
        normality["conclusion"],
    ))

    # 2. Regression residual checks
    if regression_result is not None:
        residuals = regression_result.resid
        exog = regression_result.model.exog

        serial = test_serial_correlation(residuals, significance=significance)
        rows.append(_summary_row(
            "Ljung-Box Q-test",
            "잔차 자기상관",
            f"{serial['ljung_box_statistic']:.4f}",
            f"{serial['ljung_box_pvalue']:.4f}",
            serial["no_autocorrelation"],
            serial["conclusion"],
        ))

        het = test_heteroscedasticity(residuals, exog, significance)
        bp_stat = het["breusch_pagan_stat"]
        bp_pvalue = het["breusch_pagan_pvalue"]
        # Compare against None explicitly: a statistic or p-value of exactly
        # 0.0 is a real result and must not be shown as "N/A".
        rows.append(_summary_row(
            "Breusch-Pagan / ARCH-LM",
            "잔차 이분산",
            f"BP={bp_stat:.4f}" if bp_stat is not None else "N/A",
            f"{bp_pvalue:.4f}" if bp_pvalue is not None else "N/A",
            het["is_homoscedastic"],
            het["conclusion"],
        ))

        # R², F-test
        rows.append(_summary_row(
            "R² / F-test",
            "모형 설명력",
            f"R²={regression_result.rsquared:.4f}",
            f"{regression_result.f_pvalue:.4f}",
            regression_result.f_pvalue < significance,
            f"R²={regression_result.rsquared:.3f}, "
            f"Adj.R²={regression_result.rsquared_adj:.3f}",
        ))

    # 3. PD property checks, one row per scenario
    scenarios = (pd_results or {}).get("by_scenario", {})
    for scenario_name, scenario in scenarios.items():
        pd_check = validate_pd_properties(scenario["cumulative_pd"], grade_names)
        rows.append(_summary_row(
            "PD Properties",
            f"Cumulative PD ({scenario_name})",
            "-",
            "-",
            pd_check["all_valid"],
            pd_check["conclusion"],
        ))

    return pd.DataFrame(rows)