Files
HC900-Crawler/mcp-server/parsers/text_parser.py
windpacer 16fc7a2598 Initial commit: HC900 Crawler
Honeywell HC900을 Modbus TCP로 직접 폴링 → gRPC → C# 크롤러 → PostgreSQL.
기존 Experion OPC UA 데이터 경로를 HC900 직접 통신으로 대체.

- industrial-comm/cpp: C++ Modbus 게이트웨이 (gRPC 서버)
- src: C# .NET 8 ASP.NET Core 크롤러 + 웹 UI (3-Layer)
- mcp-server: Python FastMCP (RAG/NL2SQL/P&ID)
- 다중 컨트롤러(N-Controller) 지원

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-03 20:28:14 +09:00

57 lines
1.4 KiB
Python

"""md / txt 청킹 — md는 # 헤딩 단위, txt는 빈 줄 두 개 단위."""
from __future__ import annotations
import os
def parse(path: str) -> list[dict]:
    """Read a text file and split it into chunks based on its extension.

    Files ending in ``.md`` (case-insensitive) are chunked by heading via
    ``_parse_md``; every other file is chunked by paragraph via
    ``_parse_txt``.

    Args:
        path: Path of the file to read.

    Returns:
        A list of chunk dicts as produced by the selected parser.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    ext = os.path.splitext(path)[1].lower()
    # "utf-8-sig" drops a leading byte-order mark if present and otherwise
    # behaves like plain UTF-8. Without it the BOM stays glued to the first
    # line, so a heading on line 1 of a markdown file would not start with
    # "#" and would be missed. Undecodable bytes are still skipped
    # (errors="ignore"), as before.
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        content = f.read()
    if ext == ".md":
        return _parse_md(content)
    return _parse_txt(content)
def _parse_md(text: str) -> list[dict]:
chunks: list[dict] = []
lines = text.split("\n")
cur_heading = "preface"
buf: list[str] = []
section_idx = 0
def flush():
nonlocal section_idx
body = "\n".join(buf).strip()
if body:
section_idx += 1
chunks.append({
"text": body,
"chunk_kind": "heading",
"locator": f"heading={cur_heading}",
})
for ln in lines:
s = ln.lstrip()
if s.startswith("#"):
flush()
buf = []
cur_heading = s.lstrip("#").strip() or "section"
else:
buf.append(ln)
flush()
return chunks
def _parse_txt(text: str) -> list[dict]:
chunks: list[dict] = []
parts = [p.strip() for p in text.split("\n\n") if p.strip()]
for i, p in enumerate(parts, start=1):
chunks.append({
"text": p,
"chunk_kind": "paragraph",
"locator": f"paragraph={i}",
})
return chunks