Files
LifetimePD/validation/statistical_tests.py
Variet Agent d1ddf06e5d feat(model): KAP YTM PD floor integration, expanded 226-var search, ADF fix (AIC->BIC), Model#2 with 6-test diagnostics
- Replace hardcoded DEFAULT_PD_FLOORS with build_complete_pd_floor_table() (KAP bond YTM)
- Fix ADF test: autolag='AIC' -> 'BIC' for small sample (N=26) robustness
- Expand variable search: 40 -> 226 vars (log/diff/return/lag2), 1.9M combos
- Select Model #2: HOUSING_PRICE + CREDIT_SPREAD_LAG1 + CURRENT_ACCOUNT_R
- Add 6-test diagnostics table to AR1 sheet (ADF/LB/DW/BP/ARCH/Shapiro)
- Add Korean variable names for transformed variables
- Generate report v7 with full diagnostics
2026-03-12 00:06:23 +09:00

339 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
통계적 유의성 검증 모듈
Zt 시계열 및 거시연계 회귀모형의 통계적 타당성을 엄밀하게 검증합니다.
검증 항목:
1. Zt 시계열: ADF 단위근 검정, Shapiro-Wilk 정규성 검정
2. 회귀 모형: R², F-test, AIC/BIC, 잔차 진단
3. 잔차: Durbin-Watson, Ljung-Box, ARCH-LM, Breusch-Pagan
4. 구조적 안정성: CUSUM(추정 가능시)
5. 다중공선성: VIF
참고:
- Greene, W.H. (2018). "Econometric Analysis" 8th ed.
- Hamilton, J.D. (1994). "Time Series Analysis"
"""
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import (
het_breuschpagan, acorr_ljungbox, het_arch
)
from statsmodels.stats.stattools import durbin_watson
from typing import Dict, Optional, List
import logging
logger = logging.getLogger(__name__)
def test_stationarity(
    series: np.ndarray,
    name: str = "Zt",
    significance: float = 0.05
) -> Dict:
    """
    Run the Augmented Dickey-Fuller (ADF) unit-root test on a series.

    H0: a unit root is present (non-stationary series).
    H1: the series is stationary.

    Parameters
    ----------
    series : np.ndarray
        Observations to test; passed to ``adfuller`` unchanged.
    name : str
        Label stored in the result and used in the log line.
    significance : float
        The series is reported as stationary when p-value < significance.

    Returns
    -------
    dict
        Keys: test_name, variable, test_statistic, p_value, lags_used,
        n_obs, critical_values, is_stationary, conclusion.
    """
    # Why BIC rather than AIC (rationale recorded by the original author):
    # - AIC tends to select too many lags in small samples (N<50)
    #   (Hamilton 1994, Ch.17)
    # - with N=26, AIC -> lag=8 -> 17 effective observations -> loss of power
    # - BIC picks lags conservatively, which suits small samples (Schwarz 1978)
    adf_stat, p_value, used_lag, n_obs, critical_values = adfuller(
        series, autolag="BIC"
    )[:5]
    is_stationary = bool(p_value < significance)

    output = {
        "test_name": "ADF (Augmented Dickey-Fuller)",
        "variable": name,
        "test_statistic": adf_stat,
        "p_value": p_value,
        "lags_used": used_lag,
        "n_obs": n_obs,
        "critical_values": critical_values,
        "is_stationary": is_stationary,
        "conclusion": f"{'정상' if is_stationary else '비정상'} 시계열 "
                      f"(p={p_value:.4f}, α={significance})"
    }

    # A space separates the p-value from the verdict; without it the message
    # reads e.g. "p-value=0.0123Pass".
    logger.info(
        "ADF 검정 [%s]: statistic=%.4f, p-value=%.4f %s",
        name, adf_stat, p_value, "Pass" if is_stationary else "FAIL",
    )
    return output
def test_normality(
    series: np.ndarray,
    name: str = "Zt",
    significance: float = 0.05
) -> Dict:
    """
    Run the Shapiro-Wilk normality test and collect descriptive moments.

    H0: the sample follows a normal distribution.
    H1: the sample does not follow a normal distribution.

    Parameters
    ----------
    series : np.ndarray
        Sample to test; passed to ``scipy.stats.shapiro`` unchanged.
    name : str
        Label stored in the result and used in the log line.
    significance : float
        Normality is accepted when p-value > significance.

    Returns
    -------
    dict
        Keys: test_name, variable, test_statistic, p_value, is_normal,
        mean, std, skewness, kurtosis, conclusion.
    """
    stat, p_value = stats.shapiro(series)
    # Coerce numpy scalars to builtins so the result holds plain Python types.
    stat = float(stat)
    p_value = float(p_value)
    is_normal = bool(p_value > significance)

    output = {
        "test_name": "Shapiro-Wilk Normality Test",
        "variable": name,
        "test_statistic": stat,
        "p_value": p_value,
        "is_normal": is_normal,
        "mean": float(np.mean(series)),
        # np.std is called with its defaults (no ddof argument given).
        "std": float(np.std(series)),
        "skewness": float(stats.skew(series)),
        "kurtosis": float(stats.kurtosis(series)),
        "conclusion": f"{'정규분포' if is_normal else '비정규분포'} "
                      f"(p={p_value:.4f}, α={significance})"
    }

    # A space separates the p-value from the verdict; without it the message
    # reads e.g. "p-value=0.4321Pass".
    logger.info(
        "정규성 검정 [%s]: W=%.4f, p-value=%.4f %s",
        name, stat, p_value, "Pass" if is_normal else "FAIL",
    )
    return output
def test_serial_correlation(
    residuals: np.ndarray,
    lags: int = 5,
    significance: float = 0.05
) -> Dict:
    """
    Check regression residuals for serial correlation.

    Two diagnostics are computed:
    1) Durbin-Watson statistic: d close to 2 indicates no autocorrelation.
    2) Ljung-Box Q-test at a single lag order (H0: no autocorrelation).

    The pass/fail flag is driven by the Ljung-Box p-value only; the
    Durbin-Watson value is reported with a textual interpretation.

    Parameters
    ----------
    residuals : np.ndarray
        Model residuals.
    lags : int
        Lag order evaluated by the Ljung-Box test.
    significance : float
        H0 is kept when the Ljung-Box p-value > significance.

    Returns
    -------
    dict
        Keys: test_name, durbin_watson, dw_interpretation,
        ljung_box_statistic, ljung_box_pvalue, ljung_box_lags,
        no_autocorrelation, conclusion.
    """
    dw_stat = durbin_watson(residuals)

    # Request only the one lag order of interest; the frame has a single row.
    ljung_box = acorr_ljungbox(residuals, lags=[lags], return_df=True)
    q_stat = ljung_box["lb_stat"].values[0]
    q_pvalue = ljung_box["lb_pvalue"].values[0]
    independent = q_pvalue > significance

    # Rule-of-thumb bands around 2 for the Durbin-Watson statistic.
    if dw_stat < 1.5:
        dw_reading = "양의 자기상관"
    elif dw_stat > 2.5:
        dw_reading = "음의 자기상관"
    else:
        dw_reading = "자기상관 없음"

    verdict = "자기상관 없음" if independent else "자기상관 존재"

    report = {
        "test_name": "Serial Correlation Tests",
        "durbin_watson": float(dw_stat),
        "dw_interpretation": dw_reading,
        "ljung_box_statistic": float(q_stat),
        "ljung_box_pvalue": float(q_pvalue),
        "ljung_box_lags": lags,
        "no_autocorrelation": independent,
        "conclusion": f"{verdict} (DW={dw_stat:.3f}, LB p={q_pvalue:.4f})"
    }

    status = "Pass" if independent else "FAIL"
    logger.info(f"자기상관 검정: DW={dw_stat:.3f}, LB p-value={q_pvalue:.4f} {status}")
    return report
def test_heteroscedasticity(
    residuals: np.ndarray,
    exog: np.ndarray,
    significance: float = 0.05
) -> Dict:
    """
    Check regression residuals for heteroscedasticity.

    Two diagnostics are computed:
    1) Breusch-Pagan (H0: homoscedastic residuals)
    2) ARCH-LM with 3 lags (H0: no ARCH effect)

    Each test is best-effort: if it raises, the failure is logged as a
    warning and its statistic/p-value are reported as ``None``.

    Parameters
    ----------
    residuals : np.ndarray
        Model residuals.
    exog : np.ndarray
        Regressor matrix handed to ``het_breuschpagan``.
        NOTE(review): statsmodels expects this to include a constant
        column -- confirm against the caller.
    significance : float
        A test passes when its p-value > significance.

    Returns
    -------
    dict
        Keys: test_name, breusch_pagan_stat, breusch_pagan_pvalue,
        arch_lm_stat, arch_lm_pvalue, is_homoscedastic, conclusion.
    """
    # Breusch-Pagan
    try:
        bp_stat, bp_pvalue, _, _ = het_breuschpagan(residuals, exog)
    except Exception as exc:
        # Keep the best-effort behaviour, but leave a trace of the failure.
        logger.warning("Breusch-Pagan 검정 실패 (NaN 처리): %s", exc)
        bp_stat, bp_pvalue = np.nan, np.nan

    # ARCH-LM
    try:
        arch_result = het_arch(residuals, nlags=3)
        arch_stat = arch_result[0]
        arch_pvalue = arch_result[1]
    except Exception as exc:
        logger.warning("ARCH-LM 검정 실패 (NaN 처리): %s", exc)
        arch_stat, arch_pvalue = np.nan, np.nan

    # A test that could not be computed (NaN) does not count against
    # homoscedasticity; only a computed p-value <= significance fails.
    # NOTE(review): consequently, if BOTH tests fail to run the result is
    # still reported as homoscedastic -- check the warnings above.
    homoscedastic = bool(
        (np.isnan(bp_pvalue) or bp_pvalue > significance) and
        (np.isnan(arch_pvalue) or arch_pvalue > significance)
    )

    def _none_if_nan(value):
        """Return ``value`` as a float, or None when it is NaN."""
        return None if np.isnan(value) else float(value)

    output = {
        "test_name": "Heteroscedasticity Tests",
        "breusch_pagan_stat": _none_if_nan(bp_stat),
        "breusch_pagan_pvalue": _none_if_nan(bp_pvalue),
        "arch_lm_stat": _none_if_nan(arch_stat),
        "arch_lm_pvalue": _none_if_nan(arch_pvalue),
        "is_homoscedastic": homoscedastic,
        "conclusion": f"{'등분산' if homoscedastic else '이분산'} "
                      f"(BP p={bp_pvalue:.4f}, ARCH p={arch_pvalue:.4f})"
    }

    logger.info(f"이분산 검정: BP p={bp_pvalue:.4f}, ARCH p={arch_pvalue:.4f} "
                f"{'Pass' if homoscedastic else 'FAIL'}")
    return output
def validate_pd_properties(
    cumulative_pds: np.ndarray,
    grade_names: Optional[List[str]] = None
) -> Dict:
    """
    Validate the mathematical properties of a cumulative PD table.

    Checks performed:
    1) Range: every value lies in [0, 1] (upper bound tolerance 1.0001)
       and none is NaN.
    2) Monotonicity: down each column, values never decrease
       (tolerance 1e-10).
    3) Grade ordering: in the last row, each column's value does not
       exceed the next column's value (tolerance 1e-6).

    Parameters
    ----------
    cumulative_pds : array-like
        2-D table indexed as ``[row, column]``; columns are labelled by
        ``grade_names``. Lists are accepted and converted to a float array.
        NOTE(review): rows are presumably time horizons and columns rating
        grades ordered from best to worst -- confirm against the caller.
    grade_names : list of str, optional
        Column labels used in issue messages; defaults to ``Grade<j>``.

    Returns
    -------
    dict
        Keys: test_name, range_valid, monotone_valid, order_valid,
        all_valid, issues, conclusion.
    """
    pds = np.asarray(cumulative_pds, dtype=float)

    def label(j: int) -> str:
        return grade_names[j] if grade_names else f"Grade{j}"

    issues = []

    # 1) Range check. NaN compares False against both bounds, so it has to
    #    be detected explicitly or it would slip through as "valid".
    out_of_range = bool(np.any(pds < 0) or np.any(pds > 1.0001))
    has_nan = bool(np.isnan(pds).any())
    if out_of_range:
        issues.append("PD 값이 [0,1] 범위를 벗어남")
    if has_nan:
        issues.append("PD 값에 NaN 포함")

    # 2) Monotonicity check: any negative step down a column is a violation.
    step_changes = np.diff(pds, axis=0)
    decreasing_cols = np.flatnonzero((step_changes < -1e-10).any(axis=0))
    for j in decreasing_cols:
        issues.append(f"누적 PD 단조증가 위반: {label(int(j))}")

    # 3) Grade-ordering check on the last row (the final cumulative PD).
    final_pds = pds[-1]
    misordered = np.flatnonzero(final_pds[:-1] > final_pds[1:] + 1e-6)
    for j in misordered:
        j = int(j)
        issues.append(f"등급 순서 위반: PD({label(j)}) > PD({label(j + 1)})")

    # Flags come from the checks themselves, not from searching the message
    # text: messages embed grade names, which could contain the keywords.
    output = {
        "test_name": "PD Properties Validation",
        "range_valid": not (out_of_range or has_nan),
        "monotone_valid": decreasing_cols.size == 0,
        "order_valid": misordered.size == 0,
        "all_valid": not issues,
        "issues": issues,
        "conclusion": "모든 검증 통과" if not issues else f"이슈 {len(issues)}"
    }
    return output
def run_full_validation(
    zt_series: np.ndarray,
    regression_result,
    pd_results: Dict,
    grade_names: Optional[List[str]] = None,
    significance: float = 0.05
) -> pd.DataFrame:
    """
    Run every validation step and build a summary table of the outcomes.

    Parameters
    ----------
    zt_series : np.ndarray
        Estimated Zt time series.
    regression_result : statsmodels.RegressionResults
        Fitted regression result, or None to skip the residual diagnostics.
        The attributes read are ``resid``, ``model.exog``, ``rsquared``,
        ``rsquared_adj`` and ``f_pvalue``.
    pd_results : Dict
        Output of compute_all_scenarios(); each entry under
        ``"by_scenario"`` must provide a ``"cumulative_pd"`` table.
    grade_names : list of str, optional
        Grade labels forwarded to the PD property validation.
    significance : float
        Significance level forwarded to every statistical test and used
        for the F-test verdict (default 0.05).

    Returns
    -------
    pd.DataFrame
        One row per check with columns 검정, 대상, 통계량, p-value,
        결과, 해석.
    """
    def verdict(passed) -> str:
        return "Pass O" if passed else "Fail X"

    def fmt(value, prefix: str = "") -> str:
        # Test against None explicitly: a legitimate 0.0 statistic or
        # p-value is falsy and must not be rendered as "N/A".
        return f"{prefix}{value:.4f}" if value is not None else "N/A"

    all_tests = []

    # 1. Zt time-series checks
    adf_result = test_stationarity(zt_series, "Zt", significance)
    all_tests.append({
        "검정": adf_result["test_name"],
        "대상": "Zt 시계열",
        "통계량": f"{adf_result['test_statistic']:.4f}",
        "p-value": f"{adf_result['p_value']:.4f}",
        "결과": verdict(adf_result["is_stationary"]),
        "해석": adf_result["conclusion"]
    })

    norm_result = test_normality(zt_series, "Zt", significance)
    all_tests.append({
        "검정": norm_result["test_name"],
        "대상": "Zt 시계열",
        "통계량": f"{norm_result['test_statistic']:.4f}",
        "p-value": f"{norm_result['p_value']:.4f}",
        "결과": verdict(norm_result["is_normal"]),
        "해석": norm_result["conclusion"]
    })

    # 2. Regression residual diagnostics
    if regression_result is not None:
        residuals = regression_result.resid
        exog = regression_result.model.exog

        serial_result = test_serial_correlation(
            residuals, significance=significance
        )
        all_tests.append({
            "검정": "Ljung-Box Q-test",
            "대상": "잔차 자기상관",
            "통계량": f"{serial_result['ljung_box_statistic']:.4f}",
            "p-value": f"{serial_result['ljung_box_pvalue']:.4f}",
            "결과": verdict(serial_result["no_autocorrelation"]),
            "해석": serial_result["conclusion"]
        })

        het_result = test_heteroscedasticity(residuals, exog, significance)
        all_tests.append({
            "검정": "Breusch-Pagan / ARCH-LM",
            "대상": "잔차 이분산",
            "통계량": fmt(het_result["breusch_pagan_stat"], "BP="),
            "p-value": fmt(het_result["breusch_pagan_pvalue"]),
            "결과": verdict(het_result["is_homoscedastic"]),
            "해석": het_result["conclusion"]
        })

        # R-squared and overall F-test
        all_tests.append({
            "검정": "R² / F-test",
            "대상": "모형 설명력",
            "통계량": f"R²={regression_result.rsquared:.4f}",
            "p-value": f"{regression_result.f_pvalue:.4f}",
            "결과": verdict(regression_result.f_pvalue < significance),
            "해석": f"R²={regression_result.rsquared:.3f}, "
                    f"Adj.R²={regression_result.rsquared_adj:.3f}"
        })

    # 3. PD property checks, one row per scenario
    for scenario_name, scenario in pd_results.get("by_scenario", {}).items():
        pd_valid = validate_pd_properties(scenario["cumulative_pd"], grade_names)
        all_tests.append({
            "검정": "PD Properties",
            "대상": f"Cumulative PD ({scenario_name})",
            "통계량": "-",
            "p-value": "-",
            "결과": verdict(pd_valid["all_valid"]),
            "해석": pd_valid["conclusion"]
        })

    return pd.DataFrame(all_tests)