"""PDF chunking: extract page text and tables with pdfplumber.

Falls back to page-level chunks when heading-based splitting fails. The code
in this module emits page-level chunks only; no heading-based splitting is
performed here.
"""
from __future__ import annotations

import logging

logger = logging.getLogger(__name__)


def _clean_cell(cell: object) -> str:
    """Return *cell* as single-line text; ``None`` becomes an empty string."""
    if cell is None:
        return ""
    # Collapse every whitespace run (including line breaks inside a cell) to
    # one space so that each table row stays on exactly one output line.
    return " ".join(str(cell).split())


def _table_to_text(table: list | None, max_rows: int) -> str:
    """Render *table* as ``" | "``-separated rows, one row per line.

    Empty rows are dropped and at most *max_rows* rows are kept. Returns an
    empty string when none of the kept cells has any content.
    """
    rows = [[_clean_cell(cell) for cell in row] for row in (table or []) if row]
    kept = rows[:max_rows]
    if not any(cell for row in kept for cell in row):
        return ""
    return "\n".join(" | ".join(row) for row in kept)


def parse(
    path: str,
    *,
    max_page_chars: int = 5000,
    max_table_rows: int = 200,
) -> list[dict]:
    """Split the PDF at *path* into page-text chunks and table chunks.

    For every page, in order, the result holds:

    * one ``"page"`` chunk with the stripped page text, truncated to
      *max_page_chars* characters (omitted when the page has no text);
    * one ``"table"`` chunk per extracted table that has any cell content,
      rendered as ``" | "``-separated rows limited to *max_table_rows* rows.

    Args:
        path: Location of the PDF file, passed to ``pdfplumber.open``.
        max_page_chars: Maximum number of characters kept per page chunk.
        max_table_rows: Maximum number of rows kept per table chunk.

    Returns:
        A list of dicts with the keys ``"text"``, ``"chunk_kind"``
        (``"page"`` or ``"table"``) and ``"locator"`` (``"page=N"`` or
        ``"page=N; table=M"``, both 1-based).

    Raises:
        Whatever ``pdfplumber.open`` or ``page.extract_text`` raise. Failures
        of ``page.extract_tables`` are logged and treated as "no tables".
    """
    # Imported here, as in the original, so importing this module does not
    # itself require pdfplumber.
    import pdfplumber

    chunks: list[dict] = []
    with pdfplumber.open(path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            text = (page.extract_text() or "").strip()
            if text:
                chunks.append({
                    "text": text[:max_page_chars],
                    "chunk_kind": "page",
                    "locator": f"page={page_no}",
                })

            try:
                tables = page.extract_tables() or []
            except Exception:
                # Table extraction is best-effort: keep the page text and move
                # on, but leave a trace instead of failing silently.
                logger.warning(
                    "table extraction failed on page %d of %s",
                    page_no,
                    path,
                    exc_info=True,
                )
                tables = []

            # The table index counts every extracted table, including ones
            # that are skipped, so locators stay stable.
            for table_no, table in enumerate(tables, start=1):
                table_text = _table_to_text(table, max_table_rows)
                if not table_text:
                    continue
                chunks.append({
                    "text": table_text,
                    "chunk_kind": "table",
                    "locator": f"page={page_no}; table={table_no}",
                })
    return chunks