Initial commit: HC900 Crawler

Honeywell HC900을 Modbus TCP로 직접 폴링 → gRPC → C# 크롤러 → PostgreSQL. 기존 Experion OPC UA 데이터 경로를 HC900 직접 통신으로 대체. - industrial-comm/cpp: C++ Modbus 게이트웨이 (gRPC 서버) - src: C# .NET 8 ASP.NET Core 크롤러 + 웹 UI (3-Layer) - mcp-server: Python FastMCP (RAG/NL2SQL/P&ID) - 다중 컨트롤러(N-Controller) 지원 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-03 20:28:14 +09:00
commit 16fc7a2598
325 changed files with 126583 additions and 0 deletions
--- a/mcp-server/index_opc_docs.py
+++ b/mcp-server/index_opc_docs.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+"""
+Experion OPC UA 문서 인덱싱 스크립트
+- HTM 파일 → 텍스트 추출 → 청킹 → Ollama 임베딩 → Qdrant 업서트
+- 사용 모델: nomic-embed-text (768-dim, MCP 서버와 동일)
+- 컬렉션: experion-opc-docs
+"""
+
+import os
+import sys
+import uuid
+import time
+import textwrap
+from html.parser import HTMLParser
+from pathlib import Path
+import httpx
+
+# ── Configuration ─────────────────────────────────────────────────────────────
+# Host-specific locations may be overridden through environment variables so
+# the script is usable outside the original developer machine. When a variable
+# is unset, the value is exactly the previous hard-coded default.
+DOCS_DIR      = os.environ.get("OPC_DOCS_DIR", "/home/windpacer/projects/Experion_opcua_documents")
+# Trailing slashes are removed because request paths are appended as "/...".
+QDRANT_URL    = os.environ.get("QDRANT_URL", "http://localhost:6333").rstrip("/")
+OLLAMA_URL    = os.environ.get("OLLAMA_URL", "http://localhost:11434").rstrip("/")
+EMBED_MODEL   = "nomic-embed-text"
+COLLECTION    = "experion-opc-docs"
+CHUNK_SIZE    = 600   # characters per chunk
+CHUNK_OVERLAP = 100   # characters shared by two consecutive chunks
+VECTOR_DIM    = 768   # vector size declared when the collection is created
+
+# ── HTML → 텍스트 추출 ────────────────────────────────────────────────────────
+
+class _TextExtractor(HTMLParser):
+    """Collect the visible text of an HTML document.
+
+    Text inside any ``SKIP_TAGS`` element is discarded. The closing tag of a
+    ``BLOCK_TAGS`` element and every ``<br>`` end the current output line.
+    Feed the markup with ``feed()`` and read the result with ``get_text()``.
+    """
+
+    # Elements whose whole content is ignored.
+    SKIP_TAGS = {"script", "style", "head", "nav", "footer"}
+    # Elements whose closing tag terminates a line of output.
+    BLOCK_TAGS = ("p", "h1", "h2", "h3", "h4", "li", "td", "tr", "div")
+
+    def __init__(self):
+        super().__init__()
+        self._skip  = 0    # nesting depth of currently open SKIP_TAGS elements
+        self._parts = []   # text fragments and "\n" separators in document order
+
+    def handle_starttag(self, tag, attrs):
+        """Track skipped regions and explicit line breaks."""
+        if tag == "body":
+            # </head> is optional in HTML; without this reset a document that
+            # omits it would stay in "skip" mode and yield no text at all.
+            self._skip = 0
+        elif tag == "br":
+            # <br> has no end tag, so the line break must be emitted here.
+            self._parts.append("\n")
+        elif tag in self.SKIP_TAGS:
+            self._skip += 1
+
+    def handle_endtag(self, tag):
+        """Leave a skipped region and/or terminate the current line."""
+        # The depth guard keeps a stray closing tag from driving it negative.
+        if tag in self.SKIP_TAGS and self._skip:
+            self._skip -= 1
+        if tag in self.BLOCK_TAGS:
+            self._parts.append("\n")
+
+    def handle_data(self, data):
+        """Record a text node unless it lies inside a skipped element."""
+        if self._skip:
+            return
+        stripped = data.strip()
+        if stripped:
+            # The trailing space keeps adjacent text nodes from fusing.
+            self._parts.append(stripped + " ")
+
+    def get_text(self) -> str:
+        """Return the collected text, one non-empty stripped line per row."""
+        raw = "".join(self._parts)
+        lines = (line.strip() for line in raw.splitlines())
+        return "\n".join(line for line in lines if line)
+
+
+def extract_text(htm_path: str) -> str:
+    """Read an HTML file and return its visible text.
+
+    The file is decoded as UTF-8; undecodable bytes are replaced rather than
+    raising. Extraction is delegated to ``_TextExtractor``.
+    """
+    with open(htm_path, encoding="utf-8", errors="replace") as f:
+        html = f.read()
+    p = _TextExtractor()
+    p.feed(html)
+    # close() flushes text the parser is still buffering (for example trailing
+    # text with an unterminated "&"), which feed() alone would drop.
+    p.close()
+    return p.get_text()
+
+
+# ── 청킹 ─────────────────────────────────────────────────────────────────────
+
+def chunk_text(text: str, size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[str]:
+    """Split ``text`` into overlapping, whitespace-stripped windows.
+
+    Args:
+        text: The text to split.
+        size: Maximum number of characters per window.
+        overlap: Number of characters shared by consecutive windows.
+
+    Returns:
+        The non-blank chunks in order. Text no longer than ``size`` is
+        returned as a single, unmodified chunk (or ``[]`` if it is blank).
+
+    Raises:
+        ValueError: If ``size`` is not positive or ``overlap >= size``;
+            the window could then never advance.
+    """
+    step = size - overlap
+    if size <= 0 or step <= 0:
+        raise ValueError(
+            f"size must be positive and greater than overlap (size={size}, overlap={overlap})"
+        )
+    if len(text) <= size:
+        return [text] if text.strip() else []
+    chunks = []
+    for start in range(0, len(text), step):
+        end = start + size
+        piece = text[start:end].strip()
+        if piece:
+            chunks.append(piece)
+        if end >= len(text):
+            # This window already reached the end of the text; a further one
+            # would only repeat the tail of the chunk just emitted.
+            break
+    return chunks
+
+
+# ── Ollama 임베딩 ─────────────────────────────────────────────────────────────
+
+def embed(text: str) -> list[float]:
+    """Return the embedding of ``text`` produced by the Ollama server.
+
+    Posts the text as ``prompt`` together with ``EMBED_MODEL`` to
+    ``/api/embeddings`` and returns the ``embedding`` field of the JSON reply.
+    An HTTP error status raises via ``raise_for_status()``.
+    """
+    endpoint = f"{OLLAMA_URL}/api/embeddings"
+    request_body = {"model": EMBED_MODEL, "prompt": text}
+    with httpx.Client(timeout=30) as session:
+        reply = session.post(endpoint, json=request_body)
+        reply.raise_for_status()
+        decoded = reply.json()
+    return decoded["embedding"]
+
+
+# ── Qdrant 컬렉션 생성 ────────────────────────────────────────────────────────
+
+def ensure_collection():
+    """Create the Qdrant collection, replacing an existing one on request.
+
+    If the collection already exists, its point count is printed and the user
+    is asked for confirmation: any answer other than ``y`` exits the process
+    with status 0, while ``y`` deletes the collection. The collection is then
+    created with ``VECTOR_DIM``-sized vectors and cosine distance.
+
+    Raises:
+        httpx.HTTPStatusError: If deleting or creating the collection fails.
+    """
+    with httpx.Client(timeout=15) as client:
+        resp = client.get(f"{QDRANT_URL}/collections/{COLLECTION}")
+        if resp.status_code == 200:
+            info = resp.json()["result"]
+            count = info.get("points_count", 0)
+            print(f"컬렉션 '{COLLECTION}' 이미 존재 (points: {count})")
+            answer = input("기존 컬렉션을 삭제하고 재인덱싱? [y/N]: ").strip().lower()
+            if answer != "y":
+                print("취소")
+                sys.exit(0)
+            delete_resp = client.delete(f"{QDRANT_URL}/collections/{COLLECTION}")
+            # Fail here rather than report a deletion that did not happen and
+            # then hit a confusing error when re-creating the collection.
+            delete_resp.raise_for_status()
+            print("기존 컬렉션 삭제 완료")
+
+        create_resp = client.put(
+            f"{QDRANT_URL}/collections/{COLLECTION}",
+            json={"vectors": {"size": VECTOR_DIM, "distance": "Cosine"}},
+        )
+        create_resp.raise_for_status()
+        print(f"컬렉션 '{COLLECTION}' 생성 완료")
+
+
+# ── Qdrant 업서트 ─────────────────────────────────────────────────────────────
+
+def upsert_batch(points: list[dict]):
+    """Send ``points`` to the Qdrant collection in a single PUT request.
+
+    An HTTP error status raises via ``raise_for_status()``.
+    """
+    endpoint = f"{QDRANT_URL}/collections/{COLLECTION}/points"
+    request_body = {"points": points}
+    with httpx.Client(timeout=30) as session:
+        reply = session.put(endpoint, json=request_body)
+        reply.raise_for_status()
+
+
+# ── 메인 ─────────────────────────────────────────────────────────────────────
+
+def main():
+    """Index every ``*.htm`` file under ``DOCS_DIR`` into the collection.
+
+    Exits with status 1 when no file is found. Otherwise the collection is
+    (re)created, each file is converted to text, chunked and embedded, and the
+    resulting points are upserted in batches of 20. Point ids are UUIDv5
+    values derived from the file path and chunk index.
+    """
+    docs_root = Path(DOCS_DIR)
+    sources = sorted(docs_root.rglob("*.htm"))
+    if not sources:
+        print(f"HTM 파일 없음: {DOCS_DIR}")
+        sys.exit(1)
+
+    file_count = len(sources)
+    print(f"HTM 파일 수: {file_count}")
+    ensure_collection()
+
+    flush_threshold = 20
+    # Stored file paths are relative to the parent of the docs directory.
+    base_dir = docs_root.parent
+    pending: list[dict] = []
+    indexed = 0
+
+    for ordinal, htm in enumerate(sources, start=1):
+        rel_path = str(htm.relative_to(base_dir))
+        pieces = chunk_text(extract_text(str(htm)))
+
+        for idx, piece in enumerate(pieces):
+            vector = embed(piece)
+            point_id = uuid.uuid5(uuid.NAMESPACE_URL, f"{htm}#{idx}")
+            payload = {
+                "filePath":   rel_path,
+                "content":    piece,
+                "chunkIndex": idx,
+            }
+            pending.append({"id": str(point_id), "vector": vector, "payload": payload})
+
+            if len(pending) < flush_threshold:
+                continue
+            # Batch is full: send it and start a new one.
+            upsert_batch(pending)
+            indexed += len(pending)
+            pending = []
+
+        print(f"[{ordinal:2d}/{file_count}] {htm.name}  ({len(pieces)} chunks)", flush=True)
+
+    # Send whatever is left over from the last, partially filled batch.
+    if pending:
+        upsert_batch(pending)
+        indexed += len(pending)
+
+    print(f"\n완료: {indexed}개 청크 → 컬렉션 '{COLLECTION}'")
+
+
+if __name__ == "__main__":
+    main()