HC900-Crawler/mcp-server/parsers/xlsx_parser.py

"""xlsx 청킹 — 시트 단위(markdown) + 행 단위 둘 다 생성."""
from __future__ import annotations


def _md_cell(value: object) -> str:
    """Render one cell value as text that is safe inside a markdown table row.

    ``None`` becomes an empty string. Pipes are backslash-escaped and line
    breaks are flattened to a single space, because either one would
    otherwise split the cell (or the whole row) and corrupt the table.
    """
    if value is None:
        return ""
    text = str(value).replace("|", "\\|")
    return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")


def _md_row(cells: list[str]) -> str:
    """Join already-rendered cells into one markdown table line."""
    return "| " + " | ".join(cells) + " |"


def _sheet_chunks(sheet, max_sheet_rows: int) -> list[dict]:
    """Build the sheet chunk plus all row chunks for a single worksheet.

    The sheet is consumed in one pass. The first row is treated as the
    header; a sheet that yields no rows at all produces no chunks.
    """
    row_iter = iter(sheet.iter_rows(values_only=True))
    first = next(row_iter, None)
    if first is None:
        return []

    # Raw header text is used for the 'col=val' row chunks; the markdown
    # table gets its own escaped rendering of the same cells.
    header = [str(c) if c is not None else "" for c in first]
    width = len(header)
    sheet_name = sheet.title

    md_lines = [
        _md_row([_md_cell(c) for c in first]),
        _md_row(["---"] * width),
    ]
    row_chunks: list[dict] = []

    # Row numbers are 1-based with the header at row 1, so data starts at 2.
    for row_no, row in enumerate(row_iter, start=2):
        # 1) Sheet chunk: only the leading `max_sheet_rows` data rows,
        #    padded / truncated to the header width.
        if row_no - 2 < max_sheet_rows:
            cells = [_md_cell(c) for c in row[:width]]
            cells += [""] * (width - len(cells))
            md_lines.append(_md_row(cells))

        # 2) Row chunk: every non-empty cell as 'col=val' on a single line.
        parts = []
        for j, val in enumerate(row):
            if val is None or val == "":
                continue
            # Columns beyond the header, or with a blank header cell, get a
            # positional fallback name.
            col = header[j] if j < width and header[j] else f"col{j + 1}"
            parts.append(f"{col}={val}")
        if not parts:
            continue
        row_chunks.append({
            "text": f"{sheet_name}: " + ", ".join(parts),
            "chunk_kind": "row",
            "locator": f"sheet={sheet_name}; row={row_no}",
        })

    sheet_chunk = {
        "text": "\n".join(md_lines),
        "chunk_kind": "sheet",
        "locator": f"sheet={sheet_name}",
    }
    # The sheet chunk always precedes that sheet's row chunks.
    return [sheet_chunk, *row_chunks]


def parse(path: str, *, max_sheet_rows: int = 1000) -> list[dict]:
    """Split an .xlsx workbook into sheet-level and row-level text chunks.

    For every worksheet that has at least one row, the first row is taken as
    the header and the following chunks are produced, in this order:

    * one ``"sheet"`` chunk: a markdown table of the header plus the leading
      ``max_sheet_rows`` data rows (cells are escaped so that pipes and line
      breaks cannot break the table);
    * one ``"row"`` chunk per data row that has at least one non-empty cell,
      formatted as ``"<sheet>: col=val, col=val, ..."``.

    Args:
        path: Path of the workbook, passed to ``openpyxl.load_workbook``.
        max_sheet_rows: Maximum number of data rows included in each sheet
            chunk. Row chunks are not affected by this limit.

    Returns:
        A list of dicts with the keys ``"text"``, ``"chunk_kind"``
        (``"sheet"`` or ``"row"``) and ``"locator"``.

    Raises:
        ValueError: If ``max_sheet_rows`` is negative.
    """
    from openpyxl import load_workbook

    if max_sheet_rows < 0:
        raise ValueError("max_sheet_rows must be >= 0")

    wb = load_workbook(path, read_only=True, data_only=True)
    try:
        chunks: list[dict] = []
        for sheet in wb.worksheets:
            chunks.extend(_sheet_chunks(sheet, max_sheet_rows))
        return chunks
    finally:
        # A workbook opened in read-only mode must be closed explicitly;
        # otherwise the underlying file stays open.
        wb.close()