diff --git a/data/macro_analysis.py b/data/macro_analysis.py index b407e12..7163651 100644 --- a/data/macro_analysis.py +++ b/data/macro_analysis.py @@ -1,148 +1,82 @@ """ -거시경제변수 포괄 탐색 및 Zt 회귀 최적화 +거시변수 재분석 v2 — 금리 DIFF 전용 + 계수 부호 검증 -ECOS API에서 30+ 후보변수 수집 → 6종 변환 → Zt 상관분석 → 최적 3변수 선택 +규칙: +1. 금리 변수 (BASE_RATE, CD_RATE, GOVT_3Y, CORP_AA, CORP_BBB): DIFF만 허용 +2. 가격/지수 변수: DIFF, PCT, LOG, LOGR 허용 +3. 이미 변화율인 변수 (GDP_GROWTH, CPI): 원본(LEVEL), LAG1만 허용 +4. 계수 부호 경제적 일관성 체크 +5. Zt: 2000~2025 (26obs) -사용법: - python data/macro_analysis.py # fallback 데이터로 빠른 분석 - python data/macro_analysis.py --fetch-ecos # ECOS API 실시간 수집 +Zt 부호 규칙: 양수 = 부도율 높음 = 경기 나쁨 + +경제적 부호 기대: + GDP_GROWTH: 음(-) — 성장 ↑ → 부도 ↓ → Zt ↓ + UNEMPLOYMENT: 양(+) — 실업 ↑ → 부도 ↑ → Zt ↑ + BASE_RATE_DIFF: 양(+) — 금리인상 → 부도 ↑ → Zt ↑ (또는 래그 효과) + CD_RATE_DIFF: 양(+) + CPI: 양(+) — 물가 급등 → 구매력 ↓ → 부도 ↑ → Zt ↑ + LEADING_IDX: 음(-) — 선행지수 ↑ → 경기 호전 → Zt ↓ + CORP_AA_DIFF: 양(+) — 회사채 금리 상승 → 자금조달 비용 ↑ → 부도 ↑ + CORP_BBB_DIFF: 양(+) + CREDIT_SPREAD: 양(+) — 스프레드 확대 → 신용위험 ↑ → Zt ↑ + TERM_SPREAD: 어느쪽이든 가능 (역전시 침체 신호 = 음수) + EXCHANGE_RATE: 양(+) — 원화약세 → 외화부채 ↑ → 부도 ↑ + EXPORT: 음(-) — 수출 ↑ → 경기 좋음 → Zt ↓ """ -import sys -import io -import re -import argparse -import itertools -import numpy as np -import pandas as pd -import warnings +import sys, io, itertools +import numpy as np, pandas as pd +import statsmodels.api as sm +from scipy import stats from pathlib import Path -from typing import Dict, List, Tuple, Optional -# Windows CP949 if sys.stdout.encoding != 'utf-8': sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') +import warnings warnings.filterwarnings("ignore") -import statsmodels.api as sm -from scipy import stats - BASE_DIR = Path(__file__).parent.parent -# ============================================================ -# 1. 
ECOS API 변수 탐색 및 수집 -# ============================================================ +# 경제적 부호 기대 (양수 = 이 변수 증가 시 Zt 증가) +EXPECTED_SIGNS = { + # 원본 + "GDP_GROWTH": -1, "UNEMPLOYMENT": +1, + "CPI": +1, "CPI_GROWTH": +1, + "LEADING_IDX": -1, "LEADING_INDEX": -1, + # 금리 차분 + "BASE_RATE_DIFF": +1, "CD_RATE_DIFF": +1, + "GOVT_3Y_DIFF": +1, + "CORP_AA_DIFF": +1, "CORP_BBB_DIFF": +1, + # 파생 + "CREDIT_SPREAD": +1, "CREDIT_SPREAD_DIFF": +1, + "CREDIT_SPREAD_LAG1": +1, + "TERM_SPREAD": 0, # 0 = 부호 제약 없음 + "TERM_SPREAD_DIFF": 0, + "TERM_SPREAD_LAG1": 0, + # 기타 + "EXCHANGE_RATE": +1, "EXCHANGE_RATE_DIFF": +1, "EXCHANGE_RATE_PCT": +1, + "EXCHANGE_RATE_LAG1": +1, + "EXPORT_PCT": -1, "IMPORT_PCT": +1, + "TRADE_BALANCE": -1, "TRADE_BALANCE_DIFF": -1, + "TRADE_BAL_PCT": -1, + "IPI": -1, "IPI_DIFF": -1, "IPI_PCT": -1, + "CSI": -1, "CSI_DIFF": -1, "CSI_LAG1": -1, + "M2_PCT": 0, "M2_LOGR": 0, + "REAL_RATE": 0, "REAL_RATE_DIFF": 0, + # 래그 + "GDP_GROWTH_LAG1": -1, "UNEMPLOYMENT_LAG1": +1, + "CPI_LAG1": +1, "CPI_GROWTH_LAG1": +1, +} -# 후보 변수 정의: (name, stat_code, period, item_code1, transform_type) -# transform_type: 'level' (그대로), 'monthly_avg' (월→연평균), 'level_to_pct' (수준→전년변화율) -ECOS_CANDIDATES = [ - # 기존 6개 - ("GDP_GROWTH", "902Y015", "A", "KOR", "level"), - ("UNEMPLOYMENT", "901Y027", "A", "I61BC", "level"), - ("BASE_RATE", "722Y001", "A", "0101000", "level"), - ("CD_RATE", "721Y001", "A", "2010000", "level"), - ("CPI", "901Y009", "A", "0", "level_to_pct"), - ("LEADING_IDX", "901Y067", "M", "I16A", "monthly_avg"), - - # 금리/스프레드 - ("GOVT_3Y", "721Y001", "A", "5020000", "level"), # 국고채 3년 - ("GOVT_5Y", "721Y001", "A", "5030000", "level"), # 국고채 5년 - ("CORP_AA", "721Y001", "A", "7010000", "level"), # 회사채 AA- - ("CORP_BBB", "721Y001", "A", "7030000", "level"), # 회사채 BBB- - - # 수출입 - ("EXPORT", "403Y001", "A", "1", "level"), # 수출 (백만달러) - ("IMPORT", "403Y001", "A", "2", "level"), # 수입 - - # 금융 - ("EXCHANGE_RATE", "731Y003", "A", "0000001", "level"), # 원/달러 환율 - ("M2", "101Y003", "A", 
"BBIA00", "level"), # M2 통화량 - - # 산업생산 - ("IPI", "901Y033", "M", "I11A", "monthly_avg"), # 광공업생산지수 - - # 소비자심리 - ("CSI", "511Y002", "M", "FME", "monthly_avg"), # 소비자심리지수 -] +# 금리 변수 목록 (DIFF만 허용) +RATE_VARS = {"BASE_RATE", "CD_RATE", "GOVT_3Y", "CORP_AA", "CORP_BBB"} -def fetch_all_ecos(api_key: str, start: int = 1997, end: int = 2025) -> pd.DataFrame: - """ECOS API에서 모든 후보변수 수집""" - import requests - import time - - base_url = "https://ecos.bok.or.kr/api" - results = {} - - for name, stat_code, period, item_code, ttype in ECOS_CANDIDATES: - print(f" Fetching {name} ({stat_code}/{item_code})...", end=' ') - - if period == "M": - s_date = f"{start}01" - e_date = f"{end}12" - else: - s_date = str(start) - e_date = str(end) - - url = (f"{base_url}/StatisticSearch/" - f"{api_key}/json/kr/1/500/" - f"{stat_code}/{period}/{s_date}/{e_date}/" - f"{item_code}/?/?") - - try: - resp = requests.get(url, timeout=30) - data = resp.json() - - if "StatisticSearch" not in data: - msg = data.get("RESULT", {}).get("MESSAGE", "no data") - print(f"SKIP ({msg[:30]})") - time.sleep(0.3) - continue - - rows = data["StatisticSearch"]["row"] - df = pd.DataFrame(rows) - df["DATA_VALUE"] = pd.to_numeric(df["DATA_VALUE"], errors="coerce") - - if ttype == "monthly_avg": - df["YEAR"] = df["TIME"].str[:4].astype(int) - series = df.groupby("YEAR")["DATA_VALUE"].mean() - elif ttype == "level_to_pct": - series = df.set_index("TIME")["DATA_VALUE"] - series.index = series.index.astype(int) - series = series.sort_index() - series = series.pct_change() * 100 - series = series.dropna() - else: # level - series = df.set_index("TIME")["DATA_VALUE"] - series.index = series.index.astype(int) - - series = series[~series.index.duplicated(keep='first')] - series = series.dropna() - series = series.loc[(series.index >= start) & (series.index <= end)] - - if len(series) >= 15: - results[name] = series - print(f"OK ({len(series)} obs)") - else: - print(f"SKIP ({len(series)} obs)") - - except Exception as e: - 
print(f"ERROR ({str(e)[:30]})") - - time.sleep(0.3) - - if results: - df = pd.DataFrame(results) - df.index.name = "YEAR" - df = df.sort_index() - return df - return pd.DataFrame() - - -def load_fallback_extended() -> pd.DataFrame: - """확장 fallback 데이터 (API 없이 빠른 분석)""" +def load_fallback(): + """확장 fallback""" data = { 2000: {"GDP_GROWTH": 8.9, "UNEMPLOYMENT": 4.4, "BASE_RATE": 5.25, "CD_RATE": 7.09, "CPI": 2.3, "LEADING_IDX": 101.2, "GOVT_3Y": 8.35, "CORP_AA": 9.35, "CORP_BBB": 11.90, "EXCHANGE_RATE": 1131, "EXPORT": 172268, "IMPORT": 160481, "M2": 651.8, "IPI": 102.5, "CSI": 101.0}, @@ -197,306 +131,239 @@ def load_fallback_extended() -> pd.DataFrame: 2025: {"GDP_GROWTH": 1.8, "UNEMPLOYMENT": 3.0, "BASE_RATE": 2.75, "CD_RATE": 3.00, "CPI": 1.8, "LEADING_IDX": 99.8, "GOVT_3Y": 2.80, "CORP_AA": 3.50, "CORP_BBB": 6.80, "EXCHANGE_RATE": 1380, "EXPORT": 650000, "IMPORT": 640000, "M2": 3900.0, "IPI": 184.0, "CSI": 99.0}, } - df = pd.DataFrame(data).T - df.index.name = "YEAR" - return df + return pd.DataFrame(data).T.rename_axis("YEAR") -# ============================================================ -# 2. 
변수 변환 -# ============================================================ -def apply_transforms(df: pd.DataFrame) -> pd.DataFrame: - """각 변수에 6가지 변환 적용""" - transformed = {} +def build_features(raw: pd.DataFrame) -> pd.DataFrame: + """변수 변환 — 금리는 DIFF만, 나머지는 허용된 변환만""" + feat = {} + for col in raw.columns: + s = raw[col].sort_index() - for col in df.columns: - series = df[col].sort_index() + if col in RATE_VARS: + # 금리: DIFF만 + feat[f"{col}_DIFF"] = s.diff() + # 래그도 금리 레벨의 LAG은 써도 됨 (차분 아님) + feat[f"{col}_LAG1"] = s.shift(1) + elif col in ("GDP_GROWTH", "CPI"): + # 이미 변화율: 원본 + LAG + feat[col] = s + feat[f"{col}_LAG1"] = s.shift(1) + elif col in ("UNEMPLOYMENT",): + feat[col] = s + feat[f"{col}_LAG1"] = s.shift(1) + feat[f"{col}_DIFF"] = s.diff() + elif col in ("LEADING_IDX", "LEADING_INDEX"): + feat[col] = s + feat[f"{col}_LAG1"] = s.shift(1) + feat[f"{col}_DIFF"] = s.diff() + elif col in ("EXCHANGE_RATE",): + feat[col] = s + feat[f"{col}_DIFF"] = s.diff() + feat[f"{col}_PCT"] = s.pct_change() * 100 + feat[f"{col}_LAG1"] = s.shift(1) + elif col in ("EXPORT", "IMPORT"): + feat[f"{col}_PCT"] = s.pct_change() * 100 + feat[f"{col}_DIFF"] = s.diff() + elif col in ("M2",): + feat[f"{col}_PCT"] = s.pct_change() * 100 + elif col in ("IPI", "CSI"): + feat[col] = s + feat[f"{col}_DIFF"] = s.diff() + feat[f"{col}_LAG1"] = s.shift(1) - # 원래 수준 - transformed[f"{col}"] = series + # 파생 변수 + if "CORP_BBB" in raw.columns and "CORP_AA" in raw.columns: + cs = raw["CORP_BBB"] - raw["CORP_AA"] + feat["CREDIT_SPREAD"] = cs + feat["CREDIT_SPREAD_DIFF"] = cs.diff() + feat["CREDIT_SPREAD_LAG1"] = cs.shift(1) + if "GOVT_3Y" in raw.columns and "BASE_RATE" in raw.columns: + ts = raw["GOVT_3Y"] - raw["BASE_RATE"] + feat["TERM_SPREAD"] = ts + feat["TERM_SPREAD_DIFF"] = ts.diff() + feat["TERM_SPREAD_LAG1"] = ts.shift(1) + if "BASE_RATE" in raw.columns and "CPI" in raw.columns: + feat["REAL_RATE"] = raw["BASE_RATE"] - raw["CPI"] + feat["REAL_RATE_DIFF"] = feat["REAL_RATE"].diff() + if "EXPORT" in 
raw.columns and "IMPORT" in raw.columns: + tb = raw["EXPORT"] - raw["IMPORT"] + feat["TRADE_BALANCE"] = tb + feat["TRADE_BALANCE_DIFF"] = tb.diff() - # 전년 변화량 - transformed[f"{col}_DIFF"] = series.diff() - - # 전년대비 변화율 (%) - pct = series.pct_change() * 100 - transformed[f"{col}_PCT"] = pct - - # 로그 (양수만) - if (series > 0).all(): - transformed[f"{col}_LOG"] = np.log(series) - # 로그 수익률 - transformed[f"{col}_LOGR"] = np.log(series).diff() - - # 1기 래그 - transformed[f"{col}_LAG1"] = series.shift(1) - - result = pd.DataFrame(transformed) - return result + return pd.DataFrame(feat).dropna(axis=1, thresh=15) -def add_derived_variables(df: pd.DataFrame) -> pd.DataFrame: - """파생 변수 추가 (스프레드, 비율 등)""" - derived = df.copy() - - # 신용 스프레드 (BBB - AA) - if "CORP_BBB" in df.columns and "CORP_AA" in df.columns: - derived["CREDIT_SPREAD"] = df["CORP_BBB"] - df["CORP_AA"] - derived["CREDIT_SPREAD_DIFF"] = derived["CREDIT_SPREAD"].diff() - - # 기간 스프레드 (국고 3Y vs 기준금리) - if "GOVT_3Y" in df.columns and "BASE_RATE" in df.columns: - derived["TERM_SPREAD"] = df["GOVT_3Y"] - df["BASE_RATE"] - derived["TERM_SPREAD_DIFF"] = derived["TERM_SPREAD"].diff() - - # 무역수지 - if "EXPORT" in df.columns and "IMPORT" in df.columns: - derived["TRADE_BALANCE"] = df["EXPORT"] - df["IMPORT"] - derived["TRADE_BAL_PCT"] = derived["TRADE_BALANCE"].pct_change() * 100 - - # 실질금리 = 기준금리 - CPI - if "BASE_RATE" in df.columns and "CPI" in df.columns: - derived["REAL_RATE"] = df["BASE_RATE"] - df["CPI"] - - return derived +def check_sign_consistency(combo_vars, coefficients): + """계수 부호 경제적 일관성 검사""" + issues = [] + all_ok = True + for var, coef in zip(combo_vars, coefficients): + expected = EXPECTED_SIGNS.get(var, 0) + if expected == 0: + continue # 부호 제약 없음 + actual_sign = +1 if coef > 0 else -1 + if actual_sign != expected: + all_ok = False + direction = "양(+)" if expected > 0 else "음(-)" + issues.append(f"{var}: expected {direction}, got {coef:+.3f}") + return all_ok, issues -# 
============================================================ -# 3. 상관분석 + 모형 선택 -# ============================================================ -def correlate_with_zt(zt_series: pd.Series, macro_expanded: pd.DataFrame) -> pd.DataFrame: - """모든 변수 vs Zt 상관계수 매트릭스""" - results = [] - common = sorted(set(zt_series.index) & set(macro_expanded.index)) - - zt = zt_series.loc[common].values - - for col in macro_expanded.columns: - series = macro_expanded.loc[common, col] - valid = ~(np.isnan(series) | np.isinf(series)) - - if valid.sum() < 10: - continue - - r, p = stats.pearsonr(zt[valid], series[valid]) - rho, rho_p = stats.spearmanr(zt[valid], series[valid]) - - results.append({ - "variable": col, - "pearson_r": r, - "pearson_p": p, - "spearman_rho": rho, - "spearman_p": rho_p, - "abs_r": abs(r), - "n_obs": int(valid.sum()), - }) - - df = pd.DataFrame(results).sort_values("abs_r", ascending=False) - return df - - -def best_3var_search( - zt_series: pd.Series, - macro_expanded: pd.DataFrame, - top_n_candidates: int = 20, - corr_df: pd.DataFrame = None -) -> Tuple[List[str], dict]: - """ - Top N 후보에서 최적 3변수 조합 탐색 - - 모든 C(N,3) 조합에 대해 OLS 회귀: - Zt = b0 + b1*X1 + b2*X2 + b3*X3 - - R² 최대 + adj R² 최대 + 모든 개별 p < 0.1 인 조합 선택 - """ - common = sorted(set(zt_series.index) & set(macro_expanded.index)) - zt = zt_series.loc[common] - - # 상위 N개 후보 변수 선택 - if corr_df is not None: - candidates = corr_df.head(top_n_candidates)["variable"].tolist() - else: - candidates = list(macro_expanded.columns)[:top_n_candidates] - - # 유효한 변수만 필터 - valid_vars = [] - for v in candidates: - s = macro_expanded.loc[common, v] - if s.notna().sum() >= 15 and s.std() > 1e-10: - valid_vars.append(v) - - print(f"\n Searching best 3-variable combination from {len(valid_vars)} candidates...") - - best_r2 = -1 - best_combo = None - best_result = None - all_results = [] - - n_combos = len(list(itertools.combinations(range(len(valid_vars)), 3))) - print(f" Total combinations: {n_combos}") - - for combo in 
itertools.combinations(valid_vars, 3): - combo_list = list(combo) - - # 다중공선성 체크 (변수간 |r| > 0.85 제외) - skip = False - for i, j in itertools.combinations(range(3), 2): - s1 = macro_expanded.loc[common, combo_list[i]].dropna() - s2 = macro_expanded.loc[common, combo_list[j]].dropna() - ci = s1.index.intersection(s2.index) - if len(ci) > 5: - corr_ij = abs(s1.loc[ci].corr(s2.loc[ci])) - if corr_ij > 0.85: - skip = True - break - if skip: - continue - - X_df = macro_expanded.loc[common, combo_list].dropna() - valid_idx = X_df.index - if len(valid_idx) < 15: - continue - - y = zt.loc[valid_idx].values - X = X_df.values - - # 표준화 - X_mean = X.mean(axis=0) - X_std = X.std(axis=0) - X_std[X_std < 1e-10] = 1 - X_norm = (X - X_mean) / X_std - - X_const = sm.add_constant(X_norm) - try: - model = sm.OLS(y, X_const).fit() - except Exception: - continue - - r2 = model.rsquared - adj_r2 = model.rsquared_adj - - all_results.append({ - "vars": combo_list, - "r2": r2, - "adj_r2": adj_r2, - "aic": model.aic, - "pvalues": model.pvalues[1:].tolist(), - }) - - if adj_r2 > best_r2: - best_r2 = adj_r2 - best_combo = combo_list - best_result = model - - # 정렬 - all_results.sort(key=lambda x: x["adj_r2"], reverse=True) - - return best_combo, { - "best_model": best_result, - "top_10": all_results[:10], - "total_tested": len(all_results), - } - - -# ============================================================ -# 메인 -# ============================================================ def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--fetch-ecos", action="store_true", help="ECOS API 실시간 수집") - args = parser.parse_args() - print("=" * 70) - print(" 거시경제변수 포괄 탐색 — Zt 회귀 최적화") - print(" 목표: R² ≥ 0.7, 최대 3변수") + print(" 거시변수 재분석 v2 — 금리 DIFF 전용 + 계수 부호 검증") + print(" Zt: 2000~2025 (양수=경기악화)") print("=" * 70) - # 1. 
Zt 시계열 로딩 - print("\n[1] Zt 시계열 로딩...") + # Zt sys.path.insert(0, str(BASE_DIR)) from data.transition_matrices import load_transition_matrices, compute_ttc_matrix from models.credit_cycle import estimate_zt_series tm = load_transition_matrices("real") ttc = compute_ttc_matrix(tm) - zt_dict = estimate_zt_series(tm, ttc, rho=0.20) - zt_series = pd.Series(zt_dict, name="Zt") - zt_series.index.name = "YEAR" - print(f" Zt: {len(zt_series)} obs ({zt_series.index.min()}~{zt_series.index.max()})") - print(f" Mean={zt_series.mean():.4f}, Std={zt_series.std():.4f}") + zt_full = estimate_zt_series(tm, ttc, rho=0.20) + zt_series = pd.Series(zt_full, name="Zt") + # 2000~2025만 사용 + zt_series = zt_series[(zt_series.index >= 2000) & (zt_series.index <= 2025)] + print(f"\n Zt: {len(zt_series)}obs ({zt_series.index.min()}~{zt_series.index.max()})") - # 2. 거시변수 수집 - print("\n[2] 거시변수 수집...") - if args.fetch_ecos: - import yaml - with open(BASE_DIR / "config.yaml") as f: - config = yaml.safe_load(f) - api_key = config["ecos"]["api_key"] - raw_df = fetch_all_ecos(api_key) - # fallback 보완 - fb = load_fallback_extended() - for col in fb.columns: - if col not in raw_df.columns: - raw_df[col] = fb[col] - else: - raw_df = load_fallback_extended() + # 변수 + raw = load_fallback() + features = build_features(raw) + features = features[(features.index >= 2000) & (features.index <= 2025)] + print(f" Features: {len(features.columns)}개") + print(f" 변수 목록: {', '.join(features.columns)}") - print(f" 원본 변수: {len(raw_df.columns)}개") - print(f" 기간: {raw_df.index.min()}~{raw_df.index.max()}") + # 상관분석 + common = sorted(set(zt_series.index) & set(features.index)) + zt = zt_series.loc[common] - # 3. 
파생변수 추가 - print("\n[3] 파생변수 생성...") - derived = add_derived_variables(raw_df) - expanded = apply_transforms(derived) + print(f"\n === Top 20 상관 (|r|) ===") + corrs = [] + for col in features.columns: + s = features.loc[common, col].dropna() + valid = s.index.intersection(zt.index) + if len(valid) < 12: + continue + r, p = stats.pearsonr(zt.loc[valid], s.loc[valid]) + exp = EXPECTED_SIGNS.get(col, 0) + sign_ok = "OK" if (exp == 0 or (r > 0 and exp > 0) or (r < 0 and exp < 0)) else "WRONG" + corrs.append({"var": col, "r": r, "p": p, "abs_r": abs(r), "sign": sign_ok, "n": len(valid)}) - # NaN 많은 열 제거 - expanded = expanded.dropna(axis=1, thresh=15) - print(f" 확장 변수: {len(expanded.columns)}개") + corrs = sorted(corrs, key=lambda x: x["abs_r"], reverse=True) + print(f" {'Variable':30s} {'r':>8} {'p':>8} {'Sign':>6} {'n':>4}") + for c in corrs[:20]: + sig = "***" if c["p"] < 0.01 else ("**" if c["p"] < 0.05 else ("*" if c["p"] < 0.1 else "")) + print(f" {c['var']:30s} {c['r']:>7.4f}{sig:<1} {c['p']:>7.4f} {c['sign']:>6} {c['n']:>4}") - # 4. 상관분석 - print("\n[4] Zt 상관분석...") - corr_df = correlate_with_zt(zt_series, expanded) + # 부호 OK인 변수만 후보 + sign_ok_vars = [c["var"] for c in corrs if c["sign"] == "OK" and c["abs_r"] > 0.15] + print(f"\n 부호 일관 + |r|>0.15 후보: {len(sign_ok_vars)}개") - print(f"\n === Top 30 변수 (|Pearson r| 기준) ===") - print(f" {'Variable':<30} {'r':>8} {'p':>8} {'rho':>8} {'n':>4}") - print(f" {'-'*30} {'-'*8} {'-'*8} {'-'*8} {'-'*4}") - for _, row in corr_df.head(30).iterrows(): - sig = "***" if row["pearson_p"] < 0.01 else ("**" if row["pearson_p"] < 0.05 else ("*" if row["pearson_p"] < 0.1 else "")) - print(f" {row['variable']:<30} {row['pearson_r']:>7.4f}{sig:<1} {row['pearson_p']:>7.4f} {row['spearman_rho']:>7.4f} {row['n_obs']:>4}") + # 3변수 탐색 + print(f"\n === 3변수 Exhaustive Search (부호 검증 포함) ===") + top_n = min(25, len(sign_ok_vars)) + candidates = sign_ok_vars[:top_n] - # 5. 
최적 3변수 탐색 - print("\n[5] 최적 3변수 조합 탐색...") - best_vars, search_results = best_3var_search( - zt_series, expanded, top_n_candidates=25, corr_df=corr_df - ) + results = [] + for combo in itertools.combinations(candidates, 3): + combo_list = list(combo) + # 다중공선성 체크 + skip = False + for i, j in itertools.combinations(range(3), 2): + s1 = features.loc[common, combo_list[i]].dropna() + s2 = features.loc[common, combo_list[j]].dropna() + ci = s1.index.intersection(s2.index) + if len(ci) > 5 and abs(s1.loc[ci].corr(s2.loc[ci])) > 0.80: + skip = True + break + if skip: + continue - print(f"\n === Top 10 3변수 조합 (adj R² 기준) ===") - for i, res in enumerate(search_results["top_10"]): - vars_str = " + ".join([v[:20] for v in res["vars"]]) - print(f" {i+1:2d}. R²={res['r2']:.4f} adj.R²={res['adj_r2']:.4f} AIC={res['aic']:.1f} | {vars_str}") + X_df = features.loc[common, combo_list].dropna() + valid_idx = X_df.index.intersection(zt.index) + if len(valid_idx) < 15: + continue + y = zt.loc[valid_idx].values + X = X_df.loc[valid_idx].values + Xm, Xs = X.mean(0), X.std(0) + Xs[Xs < 1e-10] = 1 + Xn = (X - Xm) / Xs + try: + model = sm.OLS(y, sm.add_constant(Xn)).fit() + except: + continue - # 6. 최적 모형 상세 - if best_vars and search_results["best_model"]: - model = search_results["best_model"] - print(f"\n === 최적 모형 ===") - print(f" 변수: {best_vars}") - print(f" R² = {model.rsquared:.4f}") - print(f" Adj. 
R² = {model.rsquared_adj:.4f}") - print(f" AIC = {model.aic:.2f}") - print(f" F-stat = {model.fvalue:.4f} (p={model.f_pvalue:.4f})") + # 계수 부호 검증 (표준화된 계수) + sign_ok, sign_issues = check_sign_consistency(combo_list, model.params[1:]) + + results.append({ + "vars": combo_list, + "r2": model.rsquared, + "adj_r2": model.rsquared_adj, + "aic": model.aic, + "f_p": model.f_pvalue, + "sign_ok": sign_ok, + "sign_issues": sign_issues, + "pvalues": model.pvalues[1:].tolist(), + "coeffs": model.params[1:].tolist(), + }) + + # adj R² 기준 정렬 (부호 일관 우선) + results.sort(key=lambda x: (-x["sign_ok"], -x["adj_r2"])) + print(f"\n 검색: {len(results)} 조합") + + print(f"\n === Top 10 (부호 일관 + adj.R² 기준) ===") + print(f" {'#':>3} {'R2':>7} {'adjR2':>7} {'AIC':>7} {'Sign':>5} | {'Variables (coefficient)'}") + for i, res in enumerate(results[:10]): + vars_info = " + ".join([ + f"{v}({c:+.3f})" for v, c in zip(res["vars"], res["coeffs"]) + ]) + sign_mark = "OK" if res["sign_ok"] else "FAIL" + print(f" {i+1:>3} {res['r2']:>6.4f} {res['adj_r2']:>6.4f} {res['aic']:>6.1f} {sign_mark:>5} | {vars_info}") + if res["sign_issues"]: + for issue in res["sign_issues"]: + print(f" SIGN: {issue}") + + # 최적 모형 상세 (부호 OK 중 1위) + best_sign_ok = [r for r in results if r["sign_ok"]] + if best_sign_ok: + best = best_sign_ok[0] + print(f"\n === 최적 모형 (부호 일관) ===") + print(f" Variables: {best['vars']}") + print(f" R² = {best['r2']:.4f}, Adj.R² = {best['adj_r2']:.4f}") + print(f" AIC = {best['aic']:.2f}, F p-value = {best['f_p']:.6f}") + + # 상세 OLS + X_df = features.loc[common, best["vars"]].dropna() + valid_idx = X_df.index.intersection(zt.index) + y = zt.loc[valid_idx].values + X = X_df.loc[valid_idx].values + Xm, Xs = X.mean(0), X.std(0) + Xs[Xs < 1e-10] = 1 + Xn = (X - Xm) / Xs + model = sm.OLS(y, sm.add_constant(Xn)).fit() print(f"\n{model.summary()}") - target_met = "YES" if model.rsquared >= 0.7 else "NO" - print(f"\n R² ≥ 0.7 달성: {target_met} (R²={model.rsquared:.4f})") + # 전체 상위 목록에서 부호 FAIL도 보여주기 + all_top 
= results[:10] + best_any = all_top[0] if all_top else None + if best_any and not best_any["sign_ok"]: + print(f"\n [참고] 부호 무시 시 최고 R²={best_any['r2']:.4f}: {best_any['vars']}") - # 결과 저장 - output_dir = BASE_DIR / "results" - output_dir.mkdir(exist_ok=True) - corr_df.to_csv(output_dir / "macro_correlation.csv", index=False) - print(f"\n 상관분석 결과 저장: {output_dir / 'macro_correlation.csv'}") - - print(f"\n 총 탐색: {search_results['total_tested']} 조합") - print(f" 완료!") + # CSV 저장 + out = BASE_DIR / "results" + out.mkdir(exist_ok=True) + pd.DataFrame([{ + "rank": i+1, "vars": " + ".join(r["vars"]), + "r2": r["r2"], "adj_r2": r["adj_r2"], "aic": r["aic"], + "sign_ok": r["sign_ok"], + } for i, r in enumerate(results[:30])]).to_csv( + out / "macro_top30_combos.csv", index=False + ) + print(f"\n Top 30 저장: {out / 'macro_top30_combos.csv'}") if __name__ == "__main__": diff --git a/data/macro_data.py b/data/macro_data.py index 862657d..c1aa84d 100644 --- a/data/macro_data.py +++ b/data/macro_data.py @@ -251,6 +251,38 @@ def collect_macro_data( annual_avg = monthly.groupby("YEAR")["DATA_VALUE"].mean() annual_avg = annual_avg.loc[start_year:end_year] macro_vars["LEADING_INDEX"] = annual_avg + time.sleep(0.5) + + # ------------------------------------------------------- + # 7) 광공업생산지수 (IPI) + # 통계표: 901Y033 / 항목: I11A (광공업생산지수) + # 월별 → 연평균 + # ------------------------------------------------------- + logger.info("광공업생산지수 조회 중...") + df_ipi = api.fetch_stat( + "901Y033", "M", + f"{start_year}01", f"{end_year}12", + "I11A" + ) + if not df_ipi.empty: + monthly = df_ipi[["TIME", "DATA_VALUE"]].copy() + monthly["DATA_VALUE"] = monthly["DATA_VALUE"].astype(float) + monthly["YEAR"] = monthly["TIME"].str[:4].astype(int) + ipi_annual = monthly.groupby("YEAR")["DATA_VALUE"].mean() + ipi_annual = ipi_annual.loc[start_year:end_year] + macro_vars["IPI"] = ipi_annual + time.sleep(0.5) + + # ------------------------------------------------------- + # 8) 수출 (백만 달러) + # 통계표: 403Y001 / 항목: 1 (수출) 
+ # ------------------------------------------------------- + logger.info("수출 조회 중...") + df_export = api.fetch_stat("403Y001", "A", str(start_year - 1), end, "1") + if not df_export.empty: + export_series = df_export.set_index("TIME")["DATA_VALUE"].astype(float) + export_series.index = export_series.index.astype(int) + macro_vars["EXPORT"] = export_series # DataFrame 결합 (각 Series의 인덱스를 정리하여 결합) if macro_vars: @@ -280,32 +312,32 @@ def _fallback_macro_data(start_year: int = 2000, end_year: int = 2025) -> pd.Dat 출처: 한국은행 경제통계시스템 (실제 공표 수치 기반) """ data = { - 2000: {"GDP_GROWTH": 8.9, "UNEMPLOYMENT": 4.4, "BASE_RATE": 5.25, "CD_RATE": 7.09, "CPI_GROWTH": 2.3, "LEADING_INDEX": 101.2, "GOVT_3Y": 8.35, "CORP_AA": 9.35, "CORP_BBB": 11.90}, - 2001: {"GDP_GROWTH": 4.5, "UNEMPLOYMENT": 4.0, "BASE_RATE": 4.00, "CD_RATE": 5.34, "CPI_GROWTH": 4.1, "LEADING_INDEX": 99.5, "GOVT_3Y": 6.70, "CORP_AA": 8.12, "CORP_BBB": 11.27}, - 2002: {"GDP_GROWTH": 7.4, "UNEMPLOYMENT": 3.3, "BASE_RATE": 4.25, "CD_RATE": 4.99, "CPI_GROWTH": 2.8, "LEADING_INDEX": 102.3, "GOVT_3Y": 6.06, "CORP_AA": 7.02, "CORP_BBB": 9.75}, - 2003: {"GDP_GROWTH": 2.9, "UNEMPLOYMENT": 3.6, "BASE_RATE": 3.75, "CD_RATE": 4.24, "CPI_GROWTH": 3.5, "LEADING_INDEX": 98.8, "GOVT_3Y": 4.93, "CORP_AA": 5.70, "CORP_BBB": 8.97}, - 2004: {"GDP_GROWTH": 4.9, "UNEMPLOYMENT": 3.7, "BASE_RATE": 3.25, "CD_RATE": 3.77, "CPI_GROWTH": 3.6, "LEADING_INDEX": 100.5, "GOVT_3Y": 4.11, "CORP_AA": 4.72, "CORP_BBB": 7.53}, - 2005: {"GDP_GROWTH": 3.9, "UNEMPLOYMENT": 3.7, "BASE_RATE": 3.75, "CD_RATE": 3.81, "CPI_GROWTH": 2.8, "LEADING_INDEX": 101.8, "GOVT_3Y": 4.27, "CORP_AA": 4.68, "CORP_BBB": 6.51}, - 2006: {"GDP_GROWTH": 5.2, "UNEMPLOYMENT": 3.5, "BASE_RATE": 4.50, "CD_RATE": 4.72, "CPI_GROWTH": 2.2, "LEADING_INDEX": 102.5, "GOVT_3Y": 4.83, "CORP_AA": 5.25, "CORP_BBB": 7.08}, - 2007: {"GDP_GROWTH": 5.5, "UNEMPLOYMENT": 3.2, "BASE_RATE": 5.00, "CD_RATE": 5.36, "CPI_GROWTH": 2.5, "LEADING_INDEX": 103.1, "GOVT_3Y": 5.23, "CORP_AA": 5.70, 
"CORP_BBB": 7.44}, - 2008: {"GDP_GROWTH": 2.8, "UNEMPLOYMENT": 3.2, "BASE_RATE": 3.00, "CD_RATE": 5.70, "CPI_GROWTH": 4.7, "LEADING_INDEX": 96.5, "GOVT_3Y": 5.27, "CORP_AA": 7.02, "CORP_BBB": 10.73}, - 2009: {"GDP_GROWTH": 0.8, "UNEMPLOYMENT": 3.6, "BASE_RATE": 2.00, "CD_RATE": 2.63, "CPI_GROWTH": 2.8, "LEADING_INDEX": 98.2, "GOVT_3Y": 4.04, "CORP_AA": 5.80, "CORP_BBB": 9.24}, - 2010: {"GDP_GROWTH": 6.8, "UNEMPLOYMENT": 3.7, "BASE_RATE": 2.50, "CD_RATE": 2.80, "CPI_GROWTH": 2.9, "LEADING_INDEX": 103.0, "GOVT_3Y": 3.72, "CORP_AA": 4.66, "CORP_BBB": 7.98}, - 2011: {"GDP_GROWTH": 3.7, "UNEMPLOYMENT": 3.4, "BASE_RATE": 3.25, "CD_RATE": 3.55, "CPI_GROWTH": 4.0, "LEADING_INDEX": 101.2, "GOVT_3Y": 3.62, "CORP_AA": 4.41, "CORP_BBB": 7.75}, - 2012: {"GDP_GROWTH": 2.4, "UNEMPLOYMENT": 3.2, "BASE_RATE": 2.75, "CD_RATE": 3.13, "CPI_GROWTH": 2.2, "LEADING_INDEX": 100.3, "GOVT_3Y": 3.13, "CORP_AA": 3.76, "CORP_BBB": 6.56}, - 2013: {"GDP_GROWTH": 3.2, "UNEMPLOYMENT": 3.1, "BASE_RATE": 2.50, "CD_RATE": 2.72, "CPI_GROWTH": 1.3, "LEADING_INDEX": 100.8, "GOVT_3Y": 2.79, "CORP_AA": 3.19, "CORP_BBB": 5.87}, - 2014: {"GDP_GROWTH": 3.2, "UNEMPLOYMENT": 3.5, "BASE_RATE": 2.00, "CD_RATE": 2.36, "CPI_GROWTH": 1.3, "LEADING_INDEX": 101.0, "GOVT_3Y": 2.56, "CORP_AA": 2.99, "CORP_BBB": 5.22}, - 2015: {"GDP_GROWTH": 2.8, "UNEMPLOYMENT": 3.6, "BASE_RATE": 1.50, "CD_RATE": 1.72, "CPI_GROWTH": 0.7, "LEADING_INDEX": 100.5, "GOVT_3Y": 1.80, "CORP_AA": 2.18, "CORP_BBB": 4.61}, - 2016: {"GDP_GROWTH": 2.9, "UNEMPLOYMENT": 3.7, "BASE_RATE": 1.25, "CD_RATE": 1.48, "CPI_GROWTH": 1.0, "LEADING_INDEX": 99.8, "GOVT_3Y": 1.44, "CORP_AA": 1.88, "CORP_BBB": 4.60}, - 2017: {"GDP_GROWTH": 3.2, "UNEMPLOYMENT": 3.7, "BASE_RATE": 1.50, "CD_RATE": 1.52, "CPI_GROWTH": 1.9, "LEADING_INDEX": 101.5, "GOVT_3Y": 1.80, "CORP_AA": 2.28, "CORP_BBB": 4.83}, - 2018: {"GDP_GROWTH": 2.9, "UNEMPLOYMENT": 3.8, "BASE_RATE": 1.75, "CD_RATE": 1.85, "CPI_GROWTH": 1.5, "LEADING_INDEX": 100.8, "GOVT_3Y": 2.10, "CORP_AA": 2.67, 
"CORP_BBB": 5.41}, - 2019: {"GDP_GROWTH": 2.2, "UNEMPLOYMENT": 3.8, "BASE_RATE": 1.25, "CD_RATE": 1.63, "CPI_GROWTH": 0.4, "LEADING_INDEX": 99.3, "GOVT_3Y": 1.50, "CORP_AA": 1.93, "CORP_BBB": 4.52}, - 2020: {"GDP_GROWTH": -0.7, "UNEMPLOYMENT": 4.0, "BASE_RATE": 0.50, "CD_RATE": 0.76, "CPI_GROWTH": 0.5, "LEADING_INDEX": 97.0, "GOVT_3Y": 0.98, "CORP_AA": 2.03, "CORP_BBB": 5.25}, - 2021: {"GDP_GROWTH": 4.3, "UNEMPLOYMENT": 3.7, "BASE_RATE": 1.00, "CD_RATE": 1.09, "CPI_GROWTH": 2.5, "LEADING_INDEX": 102.8, "GOVT_3Y": 1.43, "CORP_AA": 2.26, "CORP_BBB": 5.64}, - 2022: {"GDP_GROWTH": 2.6, "UNEMPLOYMENT": 2.9, "BASE_RATE": 3.25, "CD_RATE": 3.77, "CPI_GROWTH": 5.1, "LEADING_INDEX": 99.2, "GOVT_3Y": 3.14, "CORP_AA": 4.25, "CORP_BBB": 8.18}, - 2023: {"GDP_GROWTH": 1.4, "UNEMPLOYMENT": 2.7, "BASE_RATE": 3.50, "CD_RATE": 3.75, "CPI_GROWTH": 3.6, "LEADING_INDEX": 98.8, "GOVT_3Y": 3.55, "CORP_AA": 4.40, "CORP_BBB": 8.40}, - 2024: {"GDP_GROWTH": 2.2, "UNEMPLOYMENT": 2.8, "BASE_RATE": 3.00, "CD_RATE": 3.30, "CPI_GROWTH": 2.3, "LEADING_INDEX": 99.5, "GOVT_3Y": 3.20, "CORP_AA": 3.90, "CORP_BBB": 7.50}, - 2025: {"GDP_GROWTH": 1.8, "UNEMPLOYMENT": 3.0, "BASE_RATE": 2.75, "CD_RATE": 3.00, "CPI_GROWTH": 1.8, "LEADING_INDEX": 99.8, "GOVT_3Y": 2.80, "CORP_AA": 3.50, "CORP_BBB": 6.80}, + 2000: {"GDP_GROWTH": 8.9, "UNEMPLOYMENT": 4.4, "BASE_RATE": 5.25, "CD_RATE": 7.09, "CPI_GROWTH": 2.3, "LEADING_INDEX": 101.2, "GOVT_3Y": 8.35, "CORP_AA": 9.35, "CORP_BBB": 11.90, "IPI": 102.5, "EXPORT": 172268}, + 2001: {"GDP_GROWTH": 4.5, "UNEMPLOYMENT": 4.0, "BASE_RATE": 4.00, "CD_RATE": 5.34, "CPI_GROWTH": 4.1, "LEADING_INDEX": 99.5, "GOVT_3Y": 6.70, "CORP_AA": 8.12, "CORP_BBB": 11.27, "IPI": 99.5, "EXPORT": 150439}, + 2002: {"GDP_GROWTH": 7.4, "UNEMPLOYMENT": 3.3, "BASE_RATE": 4.25, "CD_RATE": 4.99, "CPI_GROWTH": 2.8, "LEADING_INDEX": 102.3, "GOVT_3Y": 6.06, "CORP_AA": 7.02, "CORP_BBB": 9.75, "IPI": 108.5, "EXPORT": 162471}, + 2003: {"GDP_GROWTH": 2.9, "UNEMPLOYMENT": 3.6, "BASE_RATE": 3.75, "CD_RATE": 
4.24, "CPI_GROWTH": 3.5, "LEADING_INDEX": 98.8, "GOVT_3Y": 4.93, "CORP_AA": 5.70, "CORP_BBB": 8.97, "IPI": 109.8, "EXPORT": 193817}, + 2004: {"GDP_GROWTH": 4.9, "UNEMPLOYMENT": 3.7, "BASE_RATE": 3.25, "CD_RATE": 3.77, "CPI_GROWTH": 3.6, "LEADING_INDEX": 100.5, "GOVT_3Y": 4.11, "CORP_AA": 4.72, "CORP_BBB": 7.53, "IPI": 119.2, "EXPORT": 253845}, + 2005: {"GDP_GROWTH": 3.9, "UNEMPLOYMENT": 3.7, "BASE_RATE": 3.75, "CD_RATE": 3.81, "CPI_GROWTH": 2.8, "LEADING_INDEX": 101.8, "GOVT_3Y": 4.27, "CORP_AA": 4.68, "CORP_BBB": 6.51, "IPI": 126.0, "EXPORT": 284419}, + 2006: {"GDP_GROWTH": 5.2, "UNEMPLOYMENT": 3.5, "BASE_RATE": 4.50, "CD_RATE": 4.72, "CPI_GROWTH": 2.2, "LEADING_INDEX": 102.5, "GOVT_3Y": 4.83, "CORP_AA": 5.25, "CORP_BBB": 7.08, "IPI": 136.0, "EXPORT": 325465}, + 2007: {"GDP_GROWTH": 5.5, "UNEMPLOYMENT": 3.2, "BASE_RATE": 5.00, "CD_RATE": 5.36, "CPI_GROWTH": 2.5, "LEADING_INDEX": 103.1, "GOVT_3Y": 5.23, "CORP_AA": 5.70, "CORP_BBB": 7.44, "IPI": 144.5, "EXPORT": 371489}, + 2008: {"GDP_GROWTH": 2.8, "UNEMPLOYMENT": 3.2, "BASE_RATE": 3.00, "CD_RATE": 5.70, "CPI_GROWTH": 4.7, "LEADING_INDEX": 96.5, "GOVT_3Y": 5.27, "CORP_AA": 7.02, "CORP_BBB": 10.73, "IPI": 148.2, "EXPORT": 422007}, + 2009: {"GDP_GROWTH": 0.8, "UNEMPLOYMENT": 3.6, "BASE_RATE": 2.00, "CD_RATE": 2.63, "CPI_GROWTH": 2.8, "LEADING_INDEX": 98.2, "GOVT_3Y": 4.04, "CORP_AA": 5.80, "CORP_BBB": 9.24, "IPI": 140.0, "EXPORT": 363534}, + 2010: {"GDP_GROWTH": 6.8, "UNEMPLOYMENT": 3.7, "BASE_RATE": 2.50, "CD_RATE": 2.80, "CPI_GROWTH": 2.9, "LEADING_INDEX": 103.0, "GOVT_3Y": 3.72, "CORP_AA": 4.66, "CORP_BBB": 7.98, "IPI": 161.5, "EXPORT": 466384}, + 2011: {"GDP_GROWTH": 3.7, "UNEMPLOYMENT": 3.4, "BASE_RATE": 3.25, "CD_RATE": 3.55, "CPI_GROWTH": 4.0, "LEADING_INDEX": 101.2, "GOVT_3Y": 3.62, "CORP_AA": 4.41, "CORP_BBB": 7.75, "IPI": 168.0, "EXPORT": 555214}, + 2012: {"GDP_GROWTH": 2.4, "UNEMPLOYMENT": 3.2, "BASE_RATE": 2.75, "CD_RATE": 3.13, "CPI_GROWTH": 2.2, "LEADING_INDEX": 100.3, "GOVT_3Y": 3.13, "CORP_AA": 3.76, 
"CORP_BBB": 6.56, "IPI": 168.2, "EXPORT": 547870}, + 2013: {"GDP_GROWTH": 3.2, "UNEMPLOYMENT": 3.1, "BASE_RATE": 2.50, "CD_RATE": 2.72, "CPI_GROWTH": 1.3, "LEADING_INDEX": 100.8, "GOVT_3Y": 2.79, "CORP_AA": 3.19, "CORP_BBB": 5.87, "IPI": 168.8, "EXPORT": 559632}, + 2014: {"GDP_GROWTH": 3.2, "UNEMPLOYMENT": 3.5, "BASE_RATE": 2.00, "CD_RATE": 2.36, "CPI_GROWTH": 1.3, "LEADING_INDEX": 101.0, "GOVT_3Y": 2.56, "CORP_AA": 2.99, "CORP_BBB": 5.22, "IPI": 168.5, "EXPORT": 572665}, + 2015: {"GDP_GROWTH": 2.8, "UNEMPLOYMENT": 3.6, "BASE_RATE": 1.50, "CD_RATE": 1.72, "CPI_GROWTH": 0.7, "LEADING_INDEX": 100.5, "GOVT_3Y": 1.80, "CORP_AA": 2.18, "CORP_BBB": 4.61, "IPI": 168.0, "EXPORT": 526757}, + 2016: {"GDP_GROWTH": 2.9, "UNEMPLOYMENT": 3.7, "BASE_RATE": 1.25, "CD_RATE": 1.48, "CPI_GROWTH": 1.0, "LEADING_INDEX": 99.8, "GOVT_3Y": 1.44, "CORP_AA": 1.88, "CORP_BBB": 4.60, "IPI": 168.5, "EXPORT": 495426}, + 2017: {"GDP_GROWTH": 3.2, "UNEMPLOYMENT": 3.7, "BASE_RATE": 1.50, "CD_RATE": 1.52, "CPI_GROWTH": 1.9, "LEADING_INDEX": 101.5, "GOVT_3Y": 1.80, "CORP_AA": 2.28, "CORP_BBB": 4.83, "IPI": 174.2, "EXPORT": 573694}, + 2018: {"GDP_GROWTH": 2.9, "UNEMPLOYMENT": 3.8, "BASE_RATE": 1.75, "CD_RATE": 1.85, "CPI_GROWTH": 1.5, "LEADING_INDEX": 100.8, "GOVT_3Y": 2.10, "CORP_AA": 2.67, "CORP_BBB": 5.41, "IPI": 178.0, "EXPORT": 604860}, + 2019: {"GDP_GROWTH": 2.2, "UNEMPLOYMENT": 3.8, "BASE_RATE": 1.25, "CD_RATE": 1.63, "CPI_GROWTH": 0.4, "LEADING_INDEX": 99.3, "GOVT_3Y": 1.50, "CORP_AA": 1.93, "CORP_BBB": 4.52, "IPI": 175.5, "EXPORT": 542233}, + 2020: {"GDP_GROWTH": -0.7, "UNEMPLOYMENT": 4.0, "BASE_RATE": 0.50, "CD_RATE": 0.76, "CPI_GROWTH": 0.5, "LEADING_INDEX": 97.0, "GOVT_3Y": 0.98, "CORP_AA": 2.03, "CORP_BBB": 5.25, "IPI": 170.0, "EXPORT": 512498}, + 2021: {"GDP_GROWTH": 4.3, "UNEMPLOYMENT": 3.7, "BASE_RATE": 1.00, "CD_RATE": 1.09, "CPI_GROWTH": 2.5, "LEADING_INDEX": 102.8, "GOVT_3Y": 1.43, "CORP_AA": 2.26, "CORP_BBB": 5.64, "IPI": 183.0, "EXPORT": 644400}, + 2022: {"GDP_GROWTH": 2.6, 
def compute_derived_features(macro_df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the derived macro features used as regressors for Zt.

    Up to three features are produced, each only when its input columns
    are present in ``macro_df``:

    1. CREDIT_SPREAD_LAG1: ``(CORP_BBB - CORP_AA).shift(1)``. When either
       corporate-yield column is missing, a precomputed ``CREDIT_SPREAD``
       column is lagged instead.
    2. IPI_LAG1: ``IPI.shift(1)``.
    3. EXPORT_DIFF: ``EXPORT.diff()`` (row-over-row difference).

    Author's notes carried over from the original docstring: this
    3-variable set was reported with R^2 = 0.586, with expected coefficient
    signs of + for the credit spread and - for IPI and exports (Zt positive
    means a higher default rate). These figures are not verified here.

    Parameters
    ----------
    macro_df : pd.DataFrame
        Macro series with columns CORP_AA and CORP_BBB (or CREDIT_SPREAD),
        IPI and EXPORT. Rows are sorted by index before lagging; the
        lag/diff is positional, so "t-1" semantics assume one row per
        consecutive period (NOTE(review): confirm against callers).

    Returns
    -------
    pd.DataFrame
        Indexed like the sorted input, restricted to rows where every
        computed feature is non-NaN. A feature whose inputs are missing is
        logged as a warning and left out, so the result may hold fewer than
        three columns.
    """
    df = macro_df.sort_index()
    features = pd.DataFrame(index=df.index)
    columns = df.columns

    # 1. Credit spread, lagged one row. Prefer deriving it from the two
    #    corporate yields; fall back to a ready-made spread column.
    if "CORP_BBB" in columns and "CORP_AA" in columns:
        credit_spread = df["CORP_BBB"] - df["CORP_AA"]
        features["CREDIT_SPREAD_LAG1"] = credit_spread.shift(1)
    elif "CREDIT_SPREAD" in columns:
        features["CREDIT_SPREAD_LAG1"] = df["CREDIT_SPREAD"].shift(1)
    else:
        logger.warning("CREDIT_SPREAD 계산 불가: CORP_BBB/CORP_AA 없음")

    # 2. Industrial production index, lagged one row.
    if "IPI" in columns:
        features["IPI_LAG1"] = df["IPI"].shift(1)
    else:
        logger.warning("IPI_LAG1 계산 불가: IPI 없음")

    # 3. Change in exports versus the previous row.
    if "EXPORT" in columns:
        features["EXPORT_DIFF"] = df["EXPORT"].diff()
    else:
        logger.warning("EXPORT_DIFF 계산 불가: EXPORT 없음")

    # shift/diff leave NaN in the first row (and wherever inputs are NaN);
    # drop those rows so the regression sees complete observations only.
    return features.dropna()