# HC900-Crawler/mcp-server/parsers/pdf_parser.py

"""pdf 청킹 — pdfplumber로 페이지/표 추출, 헤딩 분리 실패 시 페이지 단위 fallback."""
from __future__ import annotations


def parse(
    path: str,
    *,
    max_page_chars: int = 5000,
    max_table_rows: int = 200,
) -> list[dict]:
    """Split a PDF into one text chunk per page plus one chunk per table.

    Args:
        path: Location of the PDF, passed straight to ``pdfplumber.open``.
        max_page_chars: Maximum number of characters kept from a page's
            extracted text; anything beyond it is dropped.
        max_table_rows: Maximum number of non-blank rows rendered per table.

    Returns:
        Chunks in document order. Each chunk is a dict with the keys
        ``"text"``, ``"chunk_kind"`` (``"page"`` or ``"table"``) and
        ``"locator"`` (``"page=N"`` or ``"page=N; table=M"``, both 1-based).
        Pages without text and tables without any cell content yield no
        chunk.

    Raises:
        Whatever ``pdfplumber.open`` or ``page.extract_text`` raise; only
        table extraction is treated as best-effort.
    """
    import logging

    import pdfplumber

    log = logging.getLogger(__name__)
    chunks: list[dict] = []

    with pdfplumber.open(path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            page_text = (page.extract_text() or "").strip()
            if page_text:
                chunks.append({
                    "text": page_text[:max_page_chars],
                    "chunk_kind": "page",
                    "locator": f"page={page_no}",
                })

            try:
                tables = page.extract_tables() or []
            except Exception:
                # Table detection is best-effort: a page whose tables cannot
                # be extracted still contributes its text chunk, but the
                # failure is recorded instead of vanishing silently.
                log.warning(
                    "table extraction failed for %s page %d",
                    path,
                    page_no,
                    exc_info=True,
                )
                tables = []

            # table_no counts every detected table, including skipped ones,
            # so locators stay aligned with pdfplumber's table order.
            for table_no, table in enumerate(tables, start=1):
                rows = []
                for row in table:
                    # Collapse whitespace (including embedded newlines) so a
                    # multi-line cell cannot break the one-row-per-line layout.
                    cells = [" ".join((cell or "").split()) for cell in row or ()]
                    if any(cells):
                        rows.append(cells)
                if not rows:
                    # Nothing but empty cells: a chunk of bare separators
                    # would only add noise.
                    continue
                chunks.append({
                    "text": "\n".join(
                        " | ".join(cells) for cells in rows[:max_table_rows]
                    ),
                    "chunk_kind": "table",
                    "locator": f"page={page_no}; table={table_no}",
                })

    return chunks