#!/usr/bin/env python3 """P&ID 파싱 전용 워커 프로세스 Usage: python pid_worker.py 담당 도구: extract_pid_tags, match_pid_tags, parse_pid_dxf, parse_pid_pdf, parse_pid_drawing, build_pid_graph_parallel, analyze_pid_impact """ from __future__ import annotations import sys import os # mcp-server 디렉토리를 Python 경로에 추가 (pipeline 패키지 접근) sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import io import json import asyncio import signal import logging import re from functools import lru_cache from fastapi import FastAPI, Request import uvicorn # ── 설정 ───────────────────────────────────────────────────────────────────── VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8000/v1") VLLM_MODEL = os.environ.get("VLLM_MODEL", "Qwen/Qwen3-Coder-Next-FP8") DB_CONNECTION_STRING = os.environ.get("DB_CONNECTION_STRING", "postgresql://postgres:postgres@localhost:5432/iiot_platform") DB_TIMEOUT = int(os.environ.get("DB_TIMEOUT", "10")) _SERVER_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) STORAGE_DIR = os.path.join(_SERVER_DIR, "storage") logging.basicConfig( level=logging.INFO, stream=sys.stderr, format="%(asctime)s [pid_worker] %(levelname)s %(message)s", ) app = FastAPI() # ── 싱글톤 ─────────────────────────────────────────────────────────────────── @lru_cache(maxsize=1) def _llm(): from openai import OpenAI return OpenAI(base_url=VLLM_BASE_URL, api_key="dummy") @lru_cache(maxsize=1) def _ocr(): from paddleocr import PaddleOCR use_gpu = os.environ.get("PADDLE_USE_GPU", "true").lower() == "true" try: return PaddleOCR(use_angle_cls=True, lang="korean", use_gpu=use_gpu, show_log=False) except Exception: if use_gpu: os.environ["PADDLE_USE_GPU"] = "false" return _ocr() raise # ── DB ─────────────────────────────────────────────────────────────────────── def _get_db_connection(): import psycopg return psycopg.connect(DB_CONNECTION_STRING, connect_timeout=DB_TIMEOUT) # ── 텍스트 추출 
# ── Text extraction ──────────────────────────────────────────────────────────
def _extract_text_from_dxf(filepath: str) -> str:
    """Return the text of all TEXT and MTEXT entities in a DXF modelspace.

    TEXT content is taken as-is; MTEXT content is passed through ezdxf's
    ``plain_mtext`` and kept only when non-blank. Entries are joined with
    newlines. MTEXT entities that cannot be converted are skipped, and the
    number skipped is logged once at the end.
    """
    import ezdxf
    from ezdxf.tools.text import plain_mtext

    doc = ezdxf.readfile(filepath)
    texts = []
    skipped = 0
    for entity in doc.modelspace():
        kind = entity.dxftype()
        if kind == "TEXT":
            texts.append(entity.dxf.text)
        elif kind == "MTEXT":
            # NOTE(review): confirm that `entity.dxf.text` is the right
            # accessor for MTEXT content in the installed ezdxf version.
            try:
                plain = plain_mtext(entity.dxf.text)
            except Exception:
                # Best-effort: one bad entity must not abort the drawing, but
                # the loss is reported below instead of being silent.
                skipped += 1
                continue
            if plain.strip():
                texts.append(plain)
    if skipped:
        logging.warning(
            "DXF text extraction skipped %d unreadable MTEXT entities: %s",
            skipped, filepath,
        )
    return "\n".join(texts)


def _extract_text_from_pdf(filepath: str) -> str:
    """Return the embedded text of every PDF page, joined with newlines."""
    import fitz

    # The document is a context manager; close it instead of leaking the
    # underlying file handle.
    with fitz.open(filepath) as doc:
        return "\n".join(page.get_text() for page in doc)


def _extract_text_from_pdf_ocr(filepath: str) -> str:
    """Render every PDF page to an image and return the OCR'd text lines.

    Pages are rasterised at 300 DPI (PDF user space is 72 units per inch),
    converted PNG -> PIL image -> numpy array, and passed to the shared
    PaddleOCR engine. Recognised strings are joined with newlines.
    """
    import fitz
    from PIL import Image
    import numpy as np

    # A single numeric argument to fitz.Matrix builds a ROTATION matrix, so
    # the former Matrix(300 / 72) rotated pages by ~4 degrees at 72 DPI.
    # Two arguments give the intended x/y zoom.
    scale = 300 / 72
    zoom = fitz.Matrix(scale, scale)

    all_texts = []
    with fitz.open(filepath) as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=zoom)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            result = _ocr().ocr(np.array(img), cls=True)
            # result[0] holds the detections of this image; each entry's
            # [1][0] element is the recognised string.
            if result and result[0]:
                all_texts.extend(line[1][0] for line in result[0])
    return "\n".join(all_texts)


# ── JSON array parsing utility ───────────────────────────────────────────────
def _parse_json_array(raw: str, finish_reason: str = "") -> list:
    """Extract a JSON array from LLM output.

    Steps:
      1. Strip a surrounding Markdown code fence when ``raw`` starts with one.
      2. If ``finish_reason`` is "length" (output was cut off), truncate after
         the last ``}`` and append ``]`` to close the array.
      3. Take the longest bracket-balanced ``[...]`` span.
      4. Parse it; if that fails, fall back to parsing each flat ``{...}``
         object individually and return those that decode.

    Brackets inside string values are not treated specially by the scan;
    this is a best-effort recovery.

    Returns:
        The parsed list, or an empty list when nothing could be recovered.
    """
    if raw.startswith("```"):
        lines = raw.splitlines()
        fenced_end = bool(lines) and lines[-1].strip() == "```"
        raw = "\n".join(lines[1:-1] if fenced_end else lines[1:]).strip()

    if finish_reason == "length":
        last_close = raw.rfind("}")
        if last_close != -1:
            raw = raw[:last_close + 1] + "]"

    # Find the longest balanced [...] span.
    depth = 0
    start = -1
    best = ""
    for i, ch in enumerate(raw):
        if ch == "[":
            if depth == 0:
                start = i
            depth += 1
        elif ch == "]":
            if depth == 0:
                # Stray closer before any opener: ignore it. Letting depth go
                # negative would stop every later array from being detected.
                continue
            depth -= 1
            if depth == 0:
                candidate = raw[start:i + 1]
                if len(candidate) > len(best):
                    best = candidate
    raw = best or "[]"

    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        recovered = []
        for obj in re.findall(r"\{[^{}]*\}", raw, re.DOTALL):
            try:
                recovered.append(json.loads(obj))
            except json.JSONDecodeError:
                pass  # skip fragments that are not valid JSON on their own
        return recovered


# ── Tag extraction / matching tools ──────────────────────────────────────────
def _extract_pid_tags(text: str, source_type: str) -> str:
    """Ask the LLM to extract P&ID tags from ``text``.

    Only the first 100000 characters of ``text`` are sent.

    Returns:
        JSON string ``{"success": true, "count": N, "tags": [...]}``.
    """
    system = (
        "You are a P&ID (Piping and Instrumentation Diagram) expert.\n"
        "Extract all instrument and equipment tags from the provided text.\n"
        "Return ONLY a valid JSON array. Each element must have exactly these fields:\n"
        '{"tagNo":"FCV-101","equipmentName":null,"instrumentType":"FCV",'
        '"lineNumber":null,"pidDrawingNo":null,"confidence":0.95}\n'
        "Rules:\n"
        "- tagNo: any token matching [LETTERS]-[DIGITS] or [LETTERS]-[DIGITS]-[SUFFIX]\n"
        " Examples: FCV-101, P-10101, T-10100, VG-6203-15A-F1A-n, BT-6200, DP-10101\n"
        "- instrumentType: leading letters of tagNo\n"
        "- equipmentName: descriptive name if present near tag, else null\n"
        "- lineNumber/pidDrawingNo: null unless explicitly associated\n"
        "- confidence: 0.95 for clear tags, lower for ambiguous\n"
        "- Output ONLY the JSON array, no markdown, no explanation.\n"
        "- If no tags found, return: []\n"
    )
    truncated = text[:100000]
    resp = _llm().chat.completions.create(
        model=VLLM_MODEL,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": f"Source: {source_type}\n\nText:\n{truncated}"},
        ],
        max_tokens=32768,
        temperature=0.1,
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    choice = resp.choices[0]
    raw = (choice.message.content or "").strip()
    data = _parse_json_array(raw, choice.finish_reason)
    logging.info("extract_pid_tags source=%s count=%d", source_type, len(data))
    return json.dumps({"success": True, "count": len(data), "tags": data},
                      ensure_ascii=False, indent=2)


def _match_pid_tags(pid_tags: list, experion_tags: list) -> str:
    """Ask the LLM to match P&ID tags to Experion tags.

    Both lists are sent one tag per line. Elements are converted with
    ``str`` so non-string items no longer break the join.

    Returns:
        JSON string ``{"success": true, "count": N, "mappings": [...]}``.
    """
    system = (
        "You are a P&ID to Experion tag matching expert.\n"
        "Match P&ID tags to Experion tags based on similarity.\n"
        "Return ONLY a JSON array:\n"
        '[{"pidTag":"FT-101","experionTag":"ft-101.pv","confidence":0.92},...]\n'
        "- If no good match: confidence < 0.5, experionTag null\n"
        "- Output ONLY the JSON array.\n"
    )
    pid_block = "\n".join(map(str, pid_tags))
    experion_block = "\n".join(map(str, experion_tags))
    resp = _llm().chat.completions.create(
        model=VLLM_MODEL,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": (
                f"P&ID Tags:\n{pid_block}\n\n"
                f"Experion Tags:\n{experion_block}"
            )},
        ],
        max_tokens=16384,
        temperature=0.1,
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    choice = resp.choices[0]
    raw = (choice.message.content or "").strip()
    data = _parse_json_array(raw, choice.finish_reason)
    return json.dumps({"success": True, "count": len(data), "mappings": data},
                      ensure_ascii=False, indent=2)


# ── Drawing parsing tools ────────────────────────────────────────────────────
_TAG_EXTRACT_SYSTEM = (
    "You are a P&ID (Piping and Instrumentation Diagram) expert.\n"
    "Extract instrument and equipment tags from the provided text.\n"
    "Return ONLY a JSON array:\n"
    '[{"tagNo":"FIT-10115","equipmentName":"Flow Transmitter","instrumentType":"FIT",'
    '"lineNumber":"L-101","pidDrawingNo":"P&ID-001","confidence":0.95},...]\n'
    "Rules:\n"
    "- tagNo: Instrument [Function]-[Number], Equipment [Type]-[Number]\n"
    "- instrumentType: first 2-4 letters of tagNo\n"
    "- equipmentName/lineNumber/pidDrawingNo: null if not present\n"
    "- confidence: 0.0 to 1.0\n"
    "- Output ONLY the JSON array, no markdown.\n"
    "- If no tags found, return: []\n"
)


def _tags_from_drawing_text(text: str, source: str, prompt_chars: int, max_tokens: int) -> str:
    """Shared body of the DXF/PDF parsers: extract tags from drawing text.

    Sends the first ``prompt_chars`` characters of ``text`` to the LLM with
    the _TAG_EXTRACT_SYSTEM prompt and returns a JSON string
    ``{"success": true, "text": <first 10000 chars>, "count": N, "tags": [...]}``.
    Blank text short-circuits to an empty result without calling the LLM.
    """
    if not text.strip():
        return json.dumps({"success": True, "text": "", "count": 0, "tags": []},
                          ensure_ascii=False, indent=2)
    resp = _llm().chat.completions.create(
        model=VLLM_MODEL,
        messages=[
            {"role": "system", "content": _TAG_EXTRACT_SYSTEM},
            {"role": "user", "content": f"Source: {source}\n\nText:\n{text[:prompt_chars]}"},
        ],
        max_tokens=max_tokens,
        temperature=0.1,
        # Same setting the other tag tools in this file send, so that a
        # reasoning preamble cannot eat the token budget or pollute the JSON.
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    choice = resp.choices[0]
    raw = (choice.message.content or "").strip()
    data = _parse_json_array(raw, choice.finish_reason)
    if not isinstance(data, list):
        data = []
    return json.dumps({"success": True, "text": text[:10000], "count": len(data), "tags": data},
                      ensure_ascii=False, indent=2)


def _parse_pid_dxf(filepath: str) -> str:
    """Extract text from a DXF file and return the LLM-extracted tags as JSON."""
    text = _extract_text_from_dxf(filepath)
    return _tags_from_drawing_text(text, "dxf", prompt_chars=8000, max_tokens=8192)


def _parse_pid_pdf(filepath: str, use_ocr: bool = True) -> str:
    """Extract text from a PDF and return the LLM-extracted tags as JSON.

    Args:
        filepath: Path of the PDF file.
        use_ocr: When true (default) pages are rendered and OCR'd; otherwise
            the embedded text layer is read.
    """
    text = _extract_text_from_pdf_ocr(filepath) if use_ocr else _extract_text_from_pdf(filepath)
    return _tags_from_drawing_text(text, "pdf", prompt_chars=12000, max_tokens=4096)


def _parse_pid_drawing(filepath: str) -> str:
    """Dispatch to the DXF or PDF parser based on the file extension.

    The extension is compared case-insensitively. ``.dwg`` and any other
    extension yield a ``{"success": false, "error": ...}`` JSON string.
    """
    ext = os.path.splitext(filepath)[1].lower()
    if ext == ".dxf":
        return _parse_pid_dxf(filepath)
    if ext == ".pdf":
        return _parse_pid_pdf(filepath)
    if ext == ".dwg":
        return json.dumps({
            "success": False,
            "error": "DWG 파일은 직접 파싱할 수 없습니다. DXF로 변환 후 사용하세요.",
        }, ensure_ascii=False)
    return json.dumps({
        "success": False,
        "error": f"지원하지 않는 형식: {ext}. 지원: .dxf, .pdf",
    }, ensure_ascii=False)


# ── Graph tools ──────────────────────────────────────────────────────────────
# Node "value" strings (compared upper-cased) that select each mapper input.
_TRANSMITTER_VALUES = frozenset({"FIT", "FT", "LT", "PT", "TE"})
_VALVE_VALUES = frozenset({"FCV", "LCV", "TCV", "PCV", "XV"})
# Node "type" values excluded from the equipment mapper input.
_NON_EQUIPMENT_TYPES = frozenset({"TEXT", "LINE", "LWPOLYLINE"})


def _fetch_system_tags() -> list[str]:
    """Return every ``tagname`` from ``realtime_table``; [] if the query fails.

    Failure is logged as a warning and is not fatal. The connection is used
    as a context manager so it is closed when the query finishes (it used to
    be left open).
    """
    try:
        with _get_db_connection() as conn:
            with conn.cursor() as cur:
                cur.execute("SELECT tagname FROM realtime_table")
                return [row[0] for row in cur.fetchall()]
    except Exception as e:
        logging.warning("시스템 태그 조회 실패: %s", e)
        return []


async def _build_pid_graph_parallel(filepath: str) -> str:
    """Build and persist the topology graph for a drawing.

    Phases:
      1. Geometric extraction to ``<basename>_geo.json`` in STORAGE_DIR.
      2. First topology build (the graph handed to the mapper).
      3. Three mapper extractions (transmitters, valves, equipment) awaited
         together with asyncio.gather.
      4. Second topology build with the resolved tags, saved to
         ``<basename without extension>_graph.json``.

    NOTE(review): phases 1, 2, 4 and the DB lookup are synchronous and run on
    the event loop thread.

    Returns:
        JSON string with success, graph_id, graph_path, nodes and edges.
    """
    from pipeline.extractor import PidGeometricExtractor
    from pipeline.topology import PidTopologyBuilder
    from pipeline.mapper import IntelligentMapper
    from openai import AsyncOpenAI

    os.makedirs(STORAGE_DIR, exist_ok=True)
    source_name = os.path.basename(filepath)

    # Phase 1: geometric extraction
    geo_data_path = os.path.join(STORAGE_DIR, source_name + "_geo.json")
    PidGeometricExtractor(filepath).extract_and_save(geo_data_path)
    with open(geo_data_path, "r", encoding="utf-8") as f:
        geo_data = json.load(f)

    # System tag lookup (best-effort)
    system_tags = _fetch_system_tags()

    # Phase 2: first topology build (graph for the mapper)
    builder = PidTopologyBuilder(geo_data)
    builder.build_graph()
    graph = builder.G

    # Phase 3: concurrent LLM mapping
    api_client = AsyncOpenAI(base_url=VLLM_BASE_URL, api_key="dummy")
    mapper = IntelligentMapper(graph, system_tags, api_client=api_client)

    transmitter_nodes = [
        n for n, d in graph.nodes(data=True)
        if (d.get("value") or "").upper() in _TRANSMITTER_VALUES
    ]
    valve_nodes = [
        n for n, d in graph.nodes(data=True)
        if (d.get("value") or "").upper() in _VALVE_VALUES
    ]
    equipment_nodes = [
        n for n, d in graph.nodes(data=True)
        if d.get("type") not in _NON_EQUIPMENT_TYPES
    ]

    extracted_results = await asyncio.gather(
        mapper.extract_transmitters(transmitter_nodes),
        mapper.extract_valves(valve_nodes),
        mapper.extract_equipment(equipment_nodes),
    )

    # Merge mapping results, dropping nodes the mapper could not resolve.
    all_mapped_tags = []
    for res_dict in extracted_results:
        for node_id, mapping in res_dict.items():
            if mapping.resolved_tag == "UNKNOWN":
                continue
            bbox = graph.nodes[node_id]["bbox"]
            all_mapped_tags.append({
                "entity_id": node_id,
                "tagName": mapping.resolved_tag,
                # Use .bounds when the bbox object exposes it, else the raw value.
                "bbox": bbox.bounds if hasattr(bbox, "bounds") else bbox,
                "clean_value": mapping.resolved_tag,
            })

    # Phase 4: final topology modelling + save
    final_builder = PidTopologyBuilder(geo_data, all_extracted_tags=all_mapped_tags)
    final_builder.build_graph()

    # Derive the id from the stem. The former .replace(".dxf", ...) left the
    # name untouched for ".DXF"/other extensions (so the graph was written
    # under the drawing's own file name) and rewrote inner ".dxf" occurrences.
    graph_id = os.path.splitext(source_name)[0] + "_graph.json"
    graph_path = os.path.join(STORAGE_DIR, graph_id)
    final_builder.save_graph(graph_path)

    node_count = final_builder.G.number_of_nodes()
    edge_count = final_builder.G.number_of_edges()
    logging.info("build_pid_graph_parallel graph_id=%s nodes=%s edges=%s",
                 graph_id, node_count, edge_count)
    return json.dumps({
        "success": True,
        "graph_id": graph_id,
        "graph_path": graph_path,
        "nodes": node_count,
        "edges": edge_count,
    }, ensure_ascii=False)


def _analyze_pid_impact(graph_id: str, start_node_id: str) -> str:
    """Run impact analysis from ``start_node_id`` on a stored graph.

    ``graph_id`` must be a plain file name inside STORAGE_DIR (the form
    returned by build_pid_graph_parallel). The mapping file name is the graph
    file name with ``_graph.json`` replaced by ``_mapping.json``.

    Raises:
        ValueError: if ``graph_id`` is empty or contains a directory part.
            It arrives from the HTTP request body, so it must not be able to
            address files outside STORAGE_DIR.
    """
    from pipeline.analyzer import PidAnalysisEngine

    if (
        not isinstance(graph_id, str)
        or not graph_id
        or graph_id in {".", ".."}
        or os.path.basename(graph_id) != graph_id
    ):
        raise ValueError(f"invalid graph_id: {graph_id!r}")

    graph_path = os.path.join(STORAGE_DIR, graph_id)
    # Substitute within the file name only, never within the directory part.
    mapping_path = os.path.join(STORAGE_DIR, graph_id.replace("_graph.json", "_mapping.json"))
    analyzer = PidAnalysisEngine(graph_path, mapping_path)
    result = analyzer.analyze_impact(start_node_id)
    return json.dumps(result, ensure_ascii=False, indent=2)


# ── Request dispatcher ───────────────────────────────────────────────────────
# Blocking tools; each is run in a worker thread via asyncio.to_thread.
_BLOCKING_TOOLS = {
    "extract_pid_tags": _extract_pid_tags,
    "match_pid_tags": _match_pid_tags,
    "parse_pid_dxf": _parse_pid_dxf,
    "parse_pid_pdf": _parse_pid_pdf,
    "parse_pid_drawing": _parse_pid_drawing,
    "analyze_pid_impact": _analyze_pid_impact,
}


async def _dispatch(tool: str, params: dict) -> str:
    """Run the named tool with ``params`` as keyword arguments.

    Never raises: an unknown tool or any exception from the tool is returned
    as a ``{"success": false, "error": ...}`` JSON string (exceptions are
    also logged with their traceback).
    """
    try:
        if tool == "build_pid_graph_parallel":
            # Already a coroutine function: await it directly.
            return await _build_pid_graph_parallel(**params)
        handler = _BLOCKING_TOOLS.get(tool)
        if handler is None:
            return json.dumps({"success": False, "error": f"알 수 없는 도구: {tool}"},
                              ensure_ascii=False)
        return await asyncio.to_thread(handler, **params)
    except Exception as e:
        logging.error("dispatch error tool=%s: %s", tool, e, exc_info=True)
        return json.dumps({"success": False, "error": str(e)}, ensure_ascii=False)


# ── Scheduled shutdown ───────────────────────────────────────────────────────
# Strong references to fire-and-forget tasks. The event loop only keeps weak
# references, so an unreferenced task may be garbage-collected before it runs.
_background_tasks: set = set()


def _schedule_shutdown():
    """Schedule SIGTERM to this process 0.5 s from now.

    The delay gives the in-flight HTTP response time to be sent. Must be
    called while an event loop is running.
    """
    async def _terminate_after_delay():
        await asyncio.sleep(0.5)
        os.kill(os.getpid(), signal.SIGTERM)

    task = asyncio.create_task(_terminate_after_delay())
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)


# ── HTTP endpoints ───────────────────────────────────────────────────────────
async def _run_request(request: Request) -> str:
    """Parse the JSON body and dispatch it; malformed bodies yield error JSON.

    Expected body: ``{"tool": <name>, "params": {...}}``. A missing or null
    ``params`` is treated as no arguments.
    """
    body = await request.json()
    if not isinstance(body, dict) or "tool" not in body:
        return json.dumps(
            {"success": False, "error": "request body must be a JSON object with a 'tool' field"},
            ensure_ascii=False,
        )
    return await _dispatch(body["tool"], body.get("params") or {})


@app.get("/health")
async def health():
    """Liveness probe."""
    return {"status": "ok"}


@app.post("/execute")
async def execute(request: Request):
    """Run one tool and return its JSON-string result."""
    return await _run_request(request)


@app.post("/execute/one_shot")
async def execute_one_shot(request: Request):
    """Run one tool, then schedule process termination (P&ID worker only)."""
    result = await _run_request(request)
    _schedule_shutdown()
    return result


# ── Entry point ──────────────────────────────────────────────────────────────
if __name__ == "__main__":
    port = int(sys.argv[1]) if len(sys.argv) > 1 else 5004
    os.makedirs(STORAGE_DIR, exist_ok=True)
    # NOTE(review): binds to all interfaces and the endpoints take file paths
    # without authentication; confirm the port is not reachable from outside.
    uvicorn.run(app, host="0.0.0.0", port=port, log_level="warning")