feat(data): implement market-implied PD floor and 7x7 transition matrix parsing #task-290

2026-03-11 15:53:38 +09:00
parent 0762fcc5d8
commit b8514c1251
235 changed files with 1729 additions and 1102 deletions
--- a/data/parse_pdf_matrices.py
+++ b/data/parse_pdf_matrices.py
@@ -29,7 +29,8 @@ if sys.stdout.encoding != 'utf-8':
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')

-MODEL_GRADES = ["AAA", "AA", "A", "BBB", "BB", "B", "CCC", "D"]
+MODEL_GRADES_7 = ["AAA", "AA", "A", "BBB", "BB", "B", "D"]
+MODEL_GRADES_8 = ["AAA", "AA", "A", "BBB", "BB", "B", "CCC", "D"]
 GRADE_LABELS = ["AAA", "AA", "A", "BBB", "BB"]  # B이하 is separate

 BASE_DIR = Path(__file__).parent.parent
@@ -460,87 +461,72 @@ def _parse_kr_numbers(s: str) -> Optional[List[float]]:


 # ============================================================
-# 후처리: 6×8 → 8×8
+# 후처리: 6x8 -> 7x7 (WR->D 보정 + CCC 제거)
 # ============================================================
-def postprocess_matrix(raw_6x8: np.ndarray) -> np.ndarray:
-    """6×8 (AAA~B이하 × AAA~WR) → 8×8 (AAA~D × AAA~D)"""
+
+_BROAD_GRADE_MAP_6 = {0: "AAA", 1: "AA", 2: "A", 3: "BBB", 4: "BB", 5: "B"}
+
+
+def postprocess_matrix(raw_6x8, pd_floors=None):
+    """6x8 (rows AAA~B이하 x cols AAA~B이하, D, WR) -> 7x7 (AAA~B + D)
+
+    Steps:
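+    1. PD floor (pd_floors: grade -> decimal PD, optional): if D < floor, move min(deficit, WR) from WR to D
+    2. Remaining WR -> redistributed proportionally over the 7 non-WR columns (incl. D)
+    3. B이하 -> B mapping (the 6x6 grade block and the D column are copied as-is)
+    4. Add D row (absorbing state: D -> D = 100%)
+    5. Convert % to probability, then normalize each non-zero row to sum=1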
+    """
    assert raw_6x8.shape == (6, 8), f"Expected (6,8), got {raw_6x8.shape}"

-    # WR 열(7) 제거 → 비례 재배분
-    mat_6x7 = raw_6x8[:, :7].copy()
-    for i in range(6):
-        row_sum = mat_6x7[i].sum()
-        if row_sum > 0:
-            mat_6x7[i] = mat_6x7[i] / row_sum * 100.0
+    mat = raw_6x8.copy()
+    COL_D = 6
+    COL_WR = 7

-    # 8×8 구성: B이하(5) → B(5), D:col6→col7
-    mat = np.zeros((8, 8))
+    # Step 1: PD floor correction (WR -> D transfer)
+    if pd_floors is not None:
+        for i in range(6):
+            broad = _BROAD_GRADE_MAP_6[i]
+            if broad not in pd_floors:
+                continue
+            floor_pct = pd_floors[broad] * 100  # decimal -> %
+            observed_pd = mat[i, COL_D]
+            wr_available = mat[i, COL_WR]
+            if observed_pd < floor_pct and wr_available > 0:
+                deficit = floor_pct - observed_pd
+                transfer = min(deficit, wr_available)
+                mat[i, COL_D] += transfer
+                mat[i, COL_WR] -= transfer
+
+    # Step 2: Remaining WR -> proportional redistribution
+    for i in range(6):
+        wr_remaining = mat[i, COL_WR]
+        if wr_remaining > 0:
+            non_wr_cols = mat[i, :7]
+            non_wr_sum = non_wr_cols.sum()
+            if non_wr_sum > 0:
+                mat[i, :7] = non_wr_cols * (non_wr_sum + wr_remaining) / non_wr_sum
+            mat[i, COL_WR] = 0.0
+
+    # Step 3: B이하 -> B mapping + build 7x7
+    mat_7x7 = np.zeros((7, 7))
    for i in range(6):
        for j in range(6):
-            mat[i, j] = mat_6x7[i, j]
-        mat[i, 7] = mat_6x7[i, 6]  # D
+            mat_7x7[i, j] = mat[i, j]
+        mat_7x7[i, 6] = mat[i, COL_D]

-    # CCC 행/열 extrapolation
-    mat = _extrapolate_ccc(mat)
+    # Step 4: D row (absorbing state)
+    mat_7x7[6, :] = 0.0
+    mat_7x7[6, 6] = 100.0

-    # D 행
-    mat[7, :] = 0.0
-    mat[7, 7] = 100.0
-
-    # → 확률, 행합 정규화
-    mat /= 100.0
-    for i in range(8):
-        s = mat[i].sum()
+    # Step 5: Convert to probability and normalize
+    mat_7x7 /= 100.0
+    for i in range(7):
+        s = mat_7x7[i].sum()
        if s > 0:
-            mat[i] /= s
+            mat_7x7[i] /= s

-    return mat
-
-
-def _extrapolate_ccc(mat: np.ndarray) -> np.ndarray:
-    """CCC 행/열 extrapolation from B이하 PD 패턴"""
-    pd_bb = mat[4, 7]
-    pd_b = mat[5, 7]
-
-    # CCC PD
-    if pd_bb > 0 and pd_b > pd_bb:
-        ratio = pd_b / pd_bb
-    else:
-        ratio = 2.5
-    pd_ccc = min(pd_b * ratio, 60.0)
-    pd_ccc = max(pd_ccc, pd_b * 1.5)
-
-    # Stay rates
-    stay_bb = mat[4, 4]
-    stay_b = mat[5, 5]
-    stay_ratio = (stay_b / stay_bb) if (stay_bb > 0 and stay_b < stay_bb) else 0.7
-    stay_ccc = max(stay_b * stay_ratio, 5.0)
-
-    upgrade_to_b = mat[5, 4] * 0.8 if mat[5, 4] > 0 else 2.0
-
-    # CCC 행
-    mat[6, :] = [0, 0, 0.1, 0.2, 0.3, upgrade_to_b, stay_ccc, pd_ccc]
-
-    ccc_sum = mat[6].sum()
-    if ccc_sum > 100:
-        mat[6, 6] = max(mat[6, 6] - (ccc_sum - 100), 1.0)
-    elif ccc_sum < 100:
-        mat[6, 6] += (100 - ccc_sum)
-
-    # CCC 열: B→CCC, BB→CCC, BBB→CCC 전이 분리
-    b_to_ccc = mat[5, 5] * 0.15
-    mat[5, 6] = b_to_ccc
-    mat[5, 5] -= b_to_ccc
-
-    bb_to_ccc = mat[4, 5] * 0.1 if mat[4, 5] > 0 else 0.5
-    mat[4, 6] = bb_to_ccc
-    mat[4, 5] = max(mat[4, 5] - bb_to_ccc, 0)
-
-    mat[3, 6] = 0.3
-    mat[3, 5] = max(mat[3, 5] - 0.15, 0)
-    mat[3, 3] = max(mat[3, 3] - 0.15, 0)
-
-    return mat
+    return mat_7x7


 # ============================================================
@@ -549,6 +535,17 @@ def _extrapolate_ccc(mat: np.ndarray) -> np.ndarray:
 def main():
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

+    # Load PD floors
+    broad_floors = None
+    try:
+        import sys as _sys
+        _sys.path.insert(0, str(BASE_DIR))
+        from data.pd_floor import build_complete_pd_floor_table
+        broad_floors, _, _ = build_complete_pd_floor_table()
+        print(f"  PD floor loaded: {', '.join(f'{g}={v*10000:.1f}bp' for g, v in broad_floors.items())}")
+    except Exception as e:
+        print(f"  PD floor load failed ({e}), proceeding without floor")
+
    all_matrices = {}

    for agency, pdf_path in PDF_FILES.items():
@@ -582,7 +579,7 @@ def main():
        processed = {}
        for year, raw_mat in sorted(raw.items()):
            try:
-                processed[year] = postprocess_matrix(raw_mat)
+                processed[year] = postprocess_matrix(raw_mat, pd_floors=broad_floors)
            except Exception as e:
                print(f"  ERROR {year}: {e}")

@@ -590,7 +587,7 @@ def main():
        print(f"  Processed {len(processed)} matrices")

        for year, mat in processed.items():
-            df = pd.DataFrame(mat, index=MODEL_GRADES, columns=MODEL_GRADES)
+            df = pd.DataFrame(mat, index=MODEL_GRADES_7, columns=MODEL_GRADES_7)
            df.to_csv(OUTPUT_DIR / f"{agency}_{year}.csv", float_format="%.6f")

    # 3사 평균
@@ -609,11 +606,11 @@ def main():

    for year in common_years:
        avg = np.mean([all_matrices[a][year] for a in agency_names], axis=0)
-        for i in range(8):
+        for i in range(7):
            s = avg[i].sum()
            if s > 0:
                avg[i] /= s
-        df = pd.DataFrame(avg, index=MODEL_GRADES, columns=MODEL_GRADES)
+        df = pd.DataFrame(avg, index=MODEL_GRADES_7, columns=MODEL_GRADES_7)
        df.to_csv(OUTPUT_DIR / f"AVG_{year}.csv", float_format="%.6f")

    # PD 요약
@@ -632,7 +629,7 @@ def main():
        if sample_year not in common_years and not any(sample_year in all_matrices[a] for a in agency_names):
            continue
        print(f"\n  Year {sample_year}:")
-        for gi, grade in enumerate(MODEL_GRADES[:-1]):
+        for gi, grade in enumerate(MODEL_GRADES_7[:-1]):
            print(f"  {grade:>5}:", end='')
            for a in agency_names:
                if sample_year in all_matrices[a]: