feat(data): parse 3-agency PDF transition matrices to CSV #task-290

- New: data/parse_pdf_matrices.py (KR/NICE/SCI PDF parser)
  - KR: text-based parser (space-separated numbers + dashes)
  - NICE: text-based parser (clean numeric format)
  - SCI: pdfplumber table extraction (column-position-aware)
  - WR redistribution, B이하→B mapping, CCC extrapolation from PD patterns
- Modified: data/transition_matrices.py (added source='real' loader)
- Modified: config.yaml (data.transition_source: 'real')
- Modified: main.py (reads transition source from config)
- Output: 112 CSV files (KR/NICE/SCI/AVG × 28 years)
This commit is contained in:
Variet Agent
2026-03-11 01:07:27 +09:00
parent ebdc6b805b
commit 8af743e6f3
116 changed files with 1714 additions and 3 deletions

View File

@@ -181,6 +181,9 @@ def load_transition_matrices(
if source == "builtin":
return _build_sample_matrices()
elif source == "real":
return _load_real_matrices(data_dir)
elif source == "csv":
if data_dir is None:
raise ValueError("CSV 로딩시 data_dir를 지정해야 합니다.")
@@ -195,6 +198,43 @@ def load_transition_matrices(
raise ValueError(f"지원하지 않는 소스: {source}")
def _load_real_matrices(data_dir: Optional[str] = None) -> Dict[int, np.ndarray]:
    """
    Load the real three-agency transition matrices (data/real/AVG_YYYY.csv).

    Uses the three-agency average CSV files generated by
    parse_pdf_matrices.py.

    Args:
        data_dir: Directory containing the AVG_*.csv files. When None, the
            "real" directory located next to this module is used.

    Returns:
        Dictionary keyed by the year that _extract_year_from_filename
        returns for each file name, holding that file's values as a float
        array. Every row whose sum is positive is scaled so it sums to 1;
        rows whose sum is not positive are left as read.

    Raises:
        FileNotFoundError: If the directory does not exist, or if no
            AVG_*.csv file in it produced a matrix.
    """
    source_dir = Path(__file__).parent / "real" if data_dir is None else Path(data_dir)

    if not source_dir.exists():
        raise FileNotFoundError(
            f"실제 전이행렬 디렉토리가 없습니다: {source_dir}\n"
            "먼저 python data/parse_pdf_matrices.py 를 실행하세요."
        )

    loaded: Dict[int, np.ndarray] = {}
    for path in sorted(source_dir.glob("AVG_*.csv")):
        year = _extract_year_from_filename(path.name)
        if year is None:
            # File name carries no recognizable year; ignore the file.
            continue

        # The first CSV column is the row label, not data.
        matrix = pd.read_csv(path, index_col=0).values.astype(float)

        # Each `row` is a view into `matrix`, so the in-place divide
        # normalizes the matrix itself.
        for row in matrix:
            total = row.sum()
            if total > 0:
                row /= total

        loaded[year] = matrix

    if not loaded:
        raise FileNotFoundError(
            f"AVG_*.csv 파일이 없습니다: {source_dir}\n"
            "먼저 python data/parse_pdf_matrices.py 를 실행하세요."
        )

    return loaded
def _load_from_csv(data_dir: Path, pattern: str) -> Dict[int, np.ndarray]:
"""CSV 파일에서 전이행렬 로딩 (파일명에 연도 포함 예상)"""
matrices = {}