"""md / txt 청킹 — md는 # 헤딩 단위, txt는 빈 줄 두 개 단위.""" from __future__ import annotations import os def parse(path: str) -> list[dict]: ext = os.path.splitext(path)[1].lower() with open(path, "r", encoding="utf-8", errors="ignore") as f: content = f.read() if ext == ".md": return _parse_md(content) return _parse_txt(content) def _parse_md(text: str) -> list[dict]: chunks: list[dict] = [] lines = text.split("\n") cur_heading = "preface" buf: list[str] = [] section_idx = 0 def flush(): nonlocal section_idx body = "\n".join(buf).strip() if body: section_idx += 1 chunks.append({ "text": body, "chunk_kind": "heading", "locator": f"heading={cur_heading}", }) for ln in lines: s = ln.lstrip() if s.startswith("#"): flush() buf = [] cur_heading = s.lstrip("#").strip() or "section" else: buf.append(ln) flush() return chunks def _parse_txt(text: str) -> list[dict]: chunks: list[dict] = [] parts = [p.strip() for p in text.split("\n\n") if p.strip()] for i, p in enumerate(parts, start=1): chunks.append({ "text": p, "chunk_kind": "paragraph", "locator": f"paragraph={i}", }) return chunks