# ExperionCrawler/mcp-server/parsers/docx_parser.py

"""docx chunking — one chunk per heading path."""
from __future__ import annotations


def parse(path: str) -> list[dict]:
    """Split a .docx file into one text chunk per heading section.

    Paragraphs are read in order from ``Document(path).paragraphs``. A
    paragraph whose style name starts with "heading" (case-insensitive)
    closes the section collected so far and updates the heading trail;
    every other non-blank paragraph is added to the current section body.

    Args:
        path: Location of the .docx file, passed straight to
            ``docx.Document``.

    Returns:
        A list of dicts in document order, each with the keys ``text``
        (the section's body paragraphs joined by newlines), ``chunk_kind``
        (always ``"heading"``) and ``locator`` (``"heading=<trail>"``,
        where the trail is the enclosing heading texts joined by
        ``" / "``, or ``"preface"`` for text found before any heading).
        A heading with no body text of its own yields no chunk.
    """
    # Function-scope import: python-docx is only required when parse() runs.
    from docx import Document

    sections: list[dict] = []
    trail: list[str] = []    # heading texts, outermost level first
    pending: list[str] = []  # body paragraphs of the section being collected

    def emit() -> None:
        """Append the collected body as a chunk, if there is any."""
        if not pending:
            return
        sections.append(
            {
                "text": "\n".join(pending).strip(),
                "chunk_kind": "heading",
                # Heading texts are never empty, so an empty join means
                # no heading has been seen yet.
                "locator": "heading=" + (" / ".join(trail) or "preface"),
            }
        )

    for para in Document(path).paragraphs:
        body = (para.text or "").strip()
        if not body:
            continue  # blank paragraphs carry no content

        style = para.style
        style_name = (style.name or "").lower() if style else ""
        if not style_name.startswith("heading"):
            pending.append(body)
            continue

        # A heading ends the section collected so far.
        emit()
        pending.clear()

        # "heading 2" -> depth 1; a style name without a trailing integer
        # is treated as a level-1 heading.
        try:
            depth = int(style_name.split()[-1]) - 1
        except (ValueError, IndexError):
            depth = 0

        # Drop headings at this level or deeper, then record the new one.
        del trail[max(depth, 0):]
        trail.append(body)

    emit()
    return sections