"""docx chunking — one chunk per heading path."""
from __future__ import annotations


def _heading_level(style_name: str) -> int:
    """Return the heading depth encoded in a lower-cased style name.

    The depth is taken from the last whitespace-separated token
    (``"heading 2"`` -> ``2``). Names without a trailing integer fall back
    to level 1, and values below 1 are clamped to 1.
    """
    try:
        level = int(style_name.split()[-1])
    except (ValueError, IndexError):
        return 1
    return max(1, level)


def parse(path: str) -> list[dict]:
    """Split a .docx file into text chunks grouped by heading path.

    Paragraphs are read from ``doc.paragraphs`` in order. A paragraph whose
    style name starts with "heading" (case-insensitive) opens a new section;
    every other non-empty paragraph is appended to the current section.
    Heading titles are used only for the locator and are not part of the
    chunk text. A heading with no body text before the next heading
    produces no chunk.

    Args:
        path: Location of the document, passed straight to ``docx.Document``.

    Returns:
        A list of dicts, one per non-empty section, with the keys:
        ``text`` (body paragraphs joined by newlines), ``chunk_kind``
        (always ``"heading"``) and ``locator`` (``"heading=<path>"``, where
        the path is the enclosing heading titles joined by ``" / "``, or
        ``"preface"`` for text that precedes the first heading).
    """
    # Function-scope import: importing this module does not need `docx`.
    from docx import Document

    doc = Document(path)
    chunks: list[dict] = []
    # Open headings as (level, title), outermost first. Storing the level
    # next to the title keeps the path correct when a document skips levels
    # (e.g. Heading 1 followed directly by Heading 3): a later sibling
    # replaces the earlier one instead of being nested beneath it.
    open_headings: list[tuple[int, str]] = []
    buf: list[str] = []

    def flush() -> None:
        """Emit the buffered body text as one chunk and reset the buffer."""
        if not buf:
            return
        if open_headings:
            heading = " / ".join(title for _, title in open_headings)
        else:
            heading = "preface"
        chunks.append({
            "text": "\n".join(buf).strip(),
            "chunk_kind": "heading",
            "locator": f"heading={heading}",
        })
        buf.clear()

    for p in doc.paragraphs:
        text = (p.text or "").strip()
        if not text:
            continue
        style_name = (p.style.name or "").lower() if p.style else ""
        if not style_name.startswith("heading"):
            buf.append(text)
            continue
        # A heading closes the section collected so far.
        flush()
        level = _heading_level(style_name)
        # Drop every open heading at the same or a deeper level.
        while open_headings and open_headings[-1][0] >= level:
            open_headings.pop()
        open_headings.append((level, text))

    flush()
    return chunks