- Belkin & Suchower (1998) credit cycle index (Zt) estimation via WLS - Vasicek single-factor conditional PD/TM model - Macro-Zt OLS regression with stepwise variable selection - 3-scenario (boom/neutral/recession) 50yr PD projection - Statistical validation suite (ADF, Ljung-Box, R2, ARCH) - BOK ECOS API integration with fallback data - Visualization module (7 chart types) - Detailed theoretical methodology docs/methodology.md
335 lines
11 KiB
Python
335 lines
11 KiB
Python
"""
Statistical significance validation module.

Rigorously checks the statistical validity of the Zt time series and of the
macro-linked regression model.

Checks:
1. Zt time series: ADF unit-root test, Shapiro-Wilk normality test
2. Regression model: R², F-test, AIC/BIC, residual diagnostics
3. Residuals: Durbin-Watson, Ljung-Box, ARCH-LM, Breusch-Pagan
4. Structural stability: CUSUM (when it can be estimated)
5. Multicollinearity: VIF

References:
- Greene, W.H. (2018). "Econometric Analysis" 8th ed.
- Hamilton, J.D. (1994). "Time Series Analysis"
"""
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
from scipy import stats
|
||
from statsmodels.tsa.stattools import adfuller
|
||
from statsmodels.stats.diagnostic import (
|
||
het_breuschpagan, acorr_ljungbox, het_arch
|
||
)
|
||
from statsmodels.stats.stattools import durbin_watson
|
||
from typing import Dict, Optional, List
|
||
import logging
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def test_stationarity(
    series: np.ndarray,
    name: str = "Zt",
    significance: float = 0.05
) -> Dict:
    """
    Run the ADF (Augmented Dickey-Fuller) unit-root test on a series.

    H0: a unit root is present (non-stationary series)
    H1: the series is stationary

    Parameters
    ----------
    series : np.ndarray
        Observations handed unchanged to ``adfuller``.
    name : str
        Label reported under ``"variable"`` and in the log line.
    significance : float
        The series is called stationary when the p-value is strictly
        below this level.

    Returns
    -------
    dict
        Keys: test_name, variable, test_statistic, p_value, lags_used,
        n_obs, critical_values, is_stationary, conclusion.
    """
    # The lag order is selected by AIC; only the first five entries of the
    # returned tuple are reported.
    adf_stat, p_value, used_lags, n_obs, crit_values = adfuller(
        series, autolag="AIC"
    )[:5]

    rejects_unit_root = p_value < significance
    if rejects_unit_root:
        label, verdict = "정상", "Pass"
    else:
        label, verdict = "비정상", "FAIL"

    logger.info(
        f"ADF 검정 [{name}]: statistic={adf_stat:.4f}, "
        f"p-value={p_value:.4f} → {verdict}"
    )

    return {
        "test_name": "ADF (Augmented Dickey-Fuller)",
        "variable": name,
        "test_statistic": adf_stat,
        "p_value": p_value,
        "lags_used": used_lags,
        "n_obs": n_obs,
        "critical_values": crit_values,
        "is_stationary": rejects_unit_root,
        "conclusion": f"{label} 시계열 (p={p_value:.4f}, α={significance})",
    }
|
||
|
||
|
||
def test_normality(
    series: np.ndarray,
    name: str = "Zt",
    significance: float = 0.05
) -> Dict:
    """
    Run the Shapiro-Wilk normality test and collect basic moments.

    H0: the sample follows a normal distribution
    H1: the sample does not follow a normal distribution

    Parameters
    ----------
    series : np.ndarray
        Sample to test.
    name : str
        Label reported under ``"variable"`` and in the log line.
    significance : float
        The sample is called normal when the p-value is strictly above
        this level.

    Returns
    -------
    dict
        Keys: test_name, variable, test_statistic, p_value, is_normal,
        mean, std, skewness, kurtosis, conclusion. ``std`` comes from
        ``np.std`` and ``kurtosis`` from ``scipy.stats.kurtosis``, both
        called with their default arguments.
    """
    shapiro_outcome = stats.shapiro(series)
    w_stat = shapiro_outcome[0]
    p_value = shapiro_outcome[1]

    looks_normal = p_value > significance
    if looks_normal:
        label, verdict = "정규분포", "Pass"
    else:
        label, verdict = "비정규분포", "FAIL"

    report = {
        "test_name": "Shapiro-Wilk Normality Test",
        "variable": name,
        "test_statistic": w_stat,
        "p_value": p_value,
        "is_normal": looks_normal,
    }
    # Descriptive moments are cast to plain floats.
    report["mean"] = float(np.mean(series))
    report["std"] = float(np.std(series))
    report["skewness"] = float(stats.skew(series))
    report["kurtosis"] = float(stats.kurtosis(series))
    report["conclusion"] = f"{label} (p={p_value:.4f}, α={significance})"

    logger.info(
        f"정규성 검정 [{name}]: W={w_stat:.4f}, "
        f"p-value={p_value:.4f} → {verdict}"
    )

    return report
|
||
|
||
|
||
def test_serial_correlation(
    residuals: np.ndarray,
    lags: int = 5,
    significance: float = 0.05
) -> Dict:
    """
    Test regression residuals for serial correlation.

    1) Durbin-Watson: d ≈ 2 indicates no autocorrelation
    2) Ljung-Box Q-test: H0 = no autocorrelation

    The pass/fail flag is driven by the Ljung-Box p-value only; the
    Durbin-Watson statistic is reported with a rule-of-thumb reading
    (< 1.5 positive, > 2.5 negative, otherwise none).

    Parameters
    ----------
    residuals : np.ndarray
        Residual series to test.
    lags : int
        The single lag order at which the Ljung-Box statistic is evaluated.
    significance : float
        H0 is kept when the Ljung-Box p-value is strictly above this level.

    Returns
    -------
    dict
        Keys: test_name, durbin_watson, dw_interpretation,
        ljung_box_statistic, ljung_box_pvalue, ljung_box_lags,
        no_autocorrelation, conclusion.
    """
    # Durbin-Watson
    dw_stat = durbin_watson(residuals)
    if dw_stat < 1.5:
        dw_reading = "양의 자기상관"
    elif dw_stat > 2.5:
        dw_reading = "음의 자기상관"
    else:
        dw_reading = "자기상관 없음"

    # Ljung-Box: one requested lag, so the result frame has a single row.
    lb_table = acorr_ljungbox(residuals, lags=[lags], return_df=True)
    q_stat = lb_table["lb_stat"].values[0]
    q_pvalue = lb_table["lb_pvalue"].values[0]

    uncorrelated = q_pvalue > significance
    if uncorrelated:
        label, verdict = "자기상관 없음", "Pass"
    else:
        label, verdict = "자기상관 존재", "FAIL"

    logger.info(
        f"자기상관 검정: DW={dw_stat:.3f}, LB p-value={q_pvalue:.4f} "
        f"→ {verdict}"
    )

    return {
        "test_name": "Serial Correlation Tests",
        "durbin_watson": float(dw_stat),
        "dw_interpretation": dw_reading,
        "ljung_box_statistic": float(q_stat),
        "ljung_box_pvalue": float(q_pvalue),
        "ljung_box_lags": lags,
        "no_autocorrelation": uncorrelated,
        "conclusion": f"{label} (DW={dw_stat:.3f}, LB p={q_pvalue:.4f})",
    }
|
||
|
||
|
||
def _nan_to_none(value):
    """Return ``value`` as a plain float, or ``None`` when it is NaN."""
    return None if np.isnan(value) else float(value)


def test_heteroscedasticity(
    residuals: np.ndarray,
    exog: np.ndarray,
    significance: float = 0.05,
    arch_lags: int = 3
) -> Dict:
    """
    Test regression residuals for heteroscedasticity.

    1) Breusch-Pagan: H0 = homoscedastic residuals
    2) ARCH-LM: H0 = no ARCH effect

    Both tests are best-effort: if one raises, its statistic and p-value
    become NaN (reported as ``None``), the failure is logged, and that test
    is treated as not rejecting H0. A warning is logged when neither test
    produced a p-value, because the verdict then rests on no evidence.

    Parameters
    ----------
    residuals : np.ndarray
        Residual series from the fitted regression.
    exog : np.ndarray
        Regressor matrix passed to ``het_breuschpagan``.
    significance : float
        A test rejects H0 when its p-value is not strictly above this level.
    arch_lags : int
        Lag order passed to ``het_arch`` as ``nlags`` (default 3).

    Returns
    -------
    dict
        Keys: test_name, breusch_pagan_stat, breusch_pagan_pvalue,
        arch_lm_stat, arch_lm_pvalue, is_homoscedastic, conclusion.
    """
    # Breusch-Pagan
    try:
        bp_stat, bp_pvalue, _, _ = het_breuschpagan(residuals, exog)
    except Exception as exc:
        # Deliberately broad: one failing diagnostic must not abort the
        # whole validation run, but the failure must be visible.
        logger.warning("Breusch-Pagan test could not be computed: %s", exc)
        bp_stat, bp_pvalue = np.nan, np.nan

    # ARCH-LM
    try:
        arch_stat, arch_pvalue = het_arch(residuals, nlags=arch_lags)[:2]
    except Exception as exc:
        logger.warning("ARCH-LM test could not be computed: %s", exc)
        arch_stat, arch_pvalue = np.nan, np.nan

    bp_missing = bool(np.isnan(bp_pvalue))
    arch_missing = bool(np.isnan(arch_pvalue))
    if bp_missing and arch_missing:
        logger.warning(
            "Neither Breusch-Pagan nor ARCH-LM produced a p-value; "
            "the homoscedasticity verdict is not backed by any test"
        )

    # A missing p-value counts as "H0 not rejected".
    homoscedastic = bool(
        (bp_missing or bp_pvalue > significance)
        and (arch_missing or arch_pvalue > significance)
    )

    output = {
        "test_name": "Heteroscedasticity Tests",
        "breusch_pagan_stat": _nan_to_none(bp_stat),
        "breusch_pagan_pvalue": _nan_to_none(bp_pvalue),
        "arch_lm_stat": _nan_to_none(arch_stat),
        "arch_lm_pvalue": _nan_to_none(arch_pvalue),
        "is_homoscedastic": homoscedastic,
        "conclusion": f"{'등분산' if homoscedastic else '이분산'} "
                      f"(BP p={bp_pvalue:.4f}, ARCH p={arch_pvalue:.4f})"
    }

    logger.info(f"이분산 검정: BP p={bp_pvalue:.4f}, ARCH p={arch_pvalue:.4f} "
                f"→ {'Pass' if homoscedastic else 'FAIL'}")

    return output
|
||
|
||
|
||
def validate_pd_properties(
    cumulative_pds: np.ndarray,
    grade_names: Optional[List[str]] = None
) -> Dict:
    """
    Check the mathematical properties of cumulative PD results.

    1) 0 ≤ PD ≤ 1 (upper bound checked with a 1e-4 tolerance)
    2) Cumulative PD is non-decreasing down each column
    3) Grade ordering holds in the last row: each column's final PD must
       not exceed the next column's final PD

    Parameters
    ----------
    cumulative_pds : np.ndarray
        2-D array-like; rows are differenced for the monotonicity check and
        columns are labelled with ``grade_names``.
    grade_names : list of str, optional
        Column labels used in issue messages. Columns without a label fall
        back to ``"Grade{j}"``.

    Returns
    -------
    dict
        Keys: test_name, range_valid, monotone_valid, order_valid,
        all_valid, issues, conclusion.
    """
    pds = np.asarray(cumulative_pds, dtype=float)
    n_grades = pds.shape[1]

    def grade_label(j: int) -> str:
        if grade_names and j < len(grade_names):
            return grade_names[j]
        return f"Grade{j}"

    issues = []

    # 1) Range check, written as "not everything is inside the band" so
    # that NaN, which fails every comparison, counts as a violation.
    range_valid = bool(np.all((pds >= 0) & (pds <= 1.0001)))
    if not range_valid:
        issues.append("PD 값이 [0,1] 범위를 벗어남")

    # 2) Monotonicity: any step below -1e-10 flags the column.
    decreasing = np.any(np.diff(pds, axis=0) < -1e-10, axis=0)
    bad_columns = [j for j in range(n_grades) if decreasing[j]]
    issues.extend(
        f"누적 PD 단조증가 위판: {grade_label(j)}".replace("위판", "위반")
        for j in bad_columns
    )

    # 3) Grade ordering on the last row, i.e. the final cumulative PD.
    final_pds = pds[-1]
    out_of_order = final_pds[:-1] > final_pds[1:] + 1e-6
    bad_pairs = [j for j in range(n_grades - 1) if out_of_order[j]]
    issues.extend(
        f"등급 순서 위반: PD({grade_label(j)}) > PD({grade_label(j + 1)})"
        for j in bad_pairs
    )

    # Flags come from the checks themselves, not from searching the
    # message text, so grade names cannot influence them.
    return {
        "test_name": "PD Properties Validation",
        "range_valid": range_valid,
        "monotone_valid": not bad_columns,
        "order_valid": not bad_pairs,
        "all_valid": len(issues) == 0,
        "issues": issues,
        "conclusion": "모든 검증 통과" if len(issues) == 0 else f"이슈 {len(issues)}건"
    }
|
||
|
||
|
||
def _validation_row(test, target, statistic, p_value, passed, note) -> Dict:
    """Build one row of the validation summary table."""
    return {
        "검정": test,
        "대상": target,
        "통계량": statistic,
        "p-value": p_value,
        "결과": "Pass O" if passed else "Fail X",
        "해석": note,
    }


def run_full_validation(
    zt_series: np.ndarray,
    regression_result,
    pd_results: Dict,
    grade_names: Optional[List[str]] = None,
    significance: float = 0.05
) -> pd.DataFrame:
    """
    Run every validation step and return the results as a summary table.

    Parameters
    ----------
    zt_series : np.ndarray
        Estimated Zt time series.
    regression_result : statsmodels.RegressionResults
        Fitted regression result, or None to skip the residual and
        model-fit rows.
    pd_results : Dict
        Result of compute_all_scenarios(). Only
        ``pd_results["by_scenario"][name]["cumulative_pd"]`` is read here.
    grade_names : list of str, optional
        Grade labels forwarded to ``validate_pd_properties``.
    significance : float
        Level forwarded to each test and used for the F-test row
        (default 0.05).

    Returns
    -------
    pd.DataFrame
        One row per check with columns 검정, 대상, 통계량, p-value, 결과, 해석.
    """
    rows = []

    # 1. Zt time series checks
    adf_result = test_stationarity(zt_series, "Zt", significance)
    rows.append(_validation_row(
        adf_result["test_name"],
        "Zt 시계열",
        f"{adf_result['test_statistic']:.4f}",
        f"{adf_result['p_value']:.4f}",
        adf_result["is_stationary"],
        adf_result["conclusion"],
    ))

    norm_result = test_normality(zt_series, "Zt", significance)
    rows.append(_validation_row(
        norm_result["test_name"],
        "Zt 시계열",
        f"{norm_result['test_statistic']:.4f}",
        f"{norm_result['p_value']:.4f}",
        norm_result["is_normal"],
        norm_result["conclusion"],
    ))

    # 2. Regression residual checks
    if regression_result is not None:
        residuals = regression_result.resid
        exog = regression_result.model.exog

        serial_result = test_serial_correlation(
            residuals, significance=significance
        )
        rows.append(_validation_row(
            "Ljung-Box Q-test",
            "잔차 자기상관",
            f"{serial_result['ljung_box_statistic']:.4f}",
            f"{serial_result['ljung_box_pvalue']:.4f}",
            serial_result["no_autocorrelation"],
            serial_result["conclusion"],
        ))

        het_result = test_heteroscedasticity(
            residuals, exog, significance=significance
        )
        bp_stat = het_result["breusch_pagan_stat"]
        bp_pvalue = het_result["breusch_pagan_pvalue"]
        # Compare against None explicitly: a legitimate 0.0 statistic or
        # p-value is falsy and must not be shown as "N/A".
        rows.append(_validation_row(
            "Breusch-Pagan / ARCH-LM",
            "잔차 이분산",
            f"BP={bp_stat:.4f}" if bp_stat is not None else "N/A",
            f"{bp_pvalue:.4f}" if bp_pvalue is not None else "N/A",
            het_result["is_homoscedastic"],
            het_result["conclusion"],
        ))

        # R², F-test
        rows.append(_validation_row(
            "R² / F-test",
            "모형 설명력",
            f"R²={regression_result.rsquared:.4f}",
            f"{regression_result.f_pvalue:.4f}",
            regression_result.f_pvalue < significance,
            f"R²={regression_result.rsquared:.3f}, "
            f"Adj.R²={regression_result.rsquared_adj:.3f}",
        ))

    # 3. PD property checks, one row per scenario
    scenarios = (pd_results or {}).get("by_scenario", {})
    for scenario_name, scenario in scenarios.items():
        pd_valid = validate_pd_properties(scenario["cumulative_pd"], grade_names)
        rows.append(_validation_row(
            "PD Properties",
            f"Cumulative PD ({scenario_name})",
            "-",
            "-",
            pd_valid["all_valid"],
            pd_valid["conclusion"],
        ))

    return pd.DataFrame(rows)
|