# HC900-Crawler/mcp-server/parsers/text_parser.py

"""Chunk md / txt files — md is split per ``#`` heading, txt per blank-line-separated paragraph."""
from __future__ import annotations
import os
import re


def parse(path: str) -> list[dict]:
    """Read the text file at *path* and split it into chunks.

    Files whose extension is ``.md`` (compared case-insensitively) are
    chunked by heading via ``_parse_md``; every other file is chunked by
    paragraph via ``_parse_txt``.

    Args:
        path: Path of the file to read.

    Returns:
        The list of chunk dicts produced by the selected parser.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    ext = os.path.splitext(path)[1].lower()
    # "utf-8-sig" decodes plain UTF-8 as well, but also drops a leading BOM.
    # Without that, a first line such as "\ufeff# Title" is not recognised as
    # a heading (str.lstrip() does not treat U+FEFF as whitespace) and the
    # BOM character leaks into the first chunk's text.
    # errors="ignore" is kept: undecodable bytes are dropped on purpose so a
    # partially broken file still yields chunks.
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        content = f.read()

    if ext == ".md":
        return _parse_md(content)
    return _parse_txt(content)


def _parse_md(text: str) -> list[dict]:
    chunks: list[dict] = []
    lines = text.split("\n")

    cur_heading = "preface"
    buf: list[str] = []
    section_idx = 0

    def flush():
        nonlocal section_idx
        body = "\n".join(buf).strip()
        if body:
            section_idx += 1
            chunks.append({
                "text": body,
                "chunk_kind": "heading",
                "locator": f"heading={cur_heading}",
            })

    for ln in lines:
        s = ln.lstrip()
        if s.startswith("#"):
            flush()
            buf = []
            cur_heading = s.lstrip("#").strip() or "section"
        else:
            buf.append(ln)
    flush()
    return chunks


def _parse_txt(text: str) -> list[dict]:
    chunks: list[dict] = []
    parts = [p.strip() for p in text.split("\n\n") if p.strip()]
    for i, p in enumerate(parts, start=1):
        chunks.append({
            "text": p,
            "chunk_kind": "paragraph",
            "locator": f"paragraph={i}",
        })
    return chunks