#!/usr/bin/env python3 """P&ID 파싱 전용 워커 프로세스 Usage: python pid_worker.py 담당 도구: extract_pid_tags, match_pid_tags, parse_pid_dxf, parse_pid_pdf, parse_pid_drawing, build_pid_graph_parallel, analyze_pid_impact """ from __future__ import annotations import sys import os # mcp-server 디렉토리를 Python 경로에 추가 (pipeline 패키지 접근) sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import io import json import asyncio import signal import logging import re from functools import lru_cache from fastapi import FastAPI, Request import uvicorn # ── 설정 ───────────────────────────────────────────────────────────────────── VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8000/v1") VLLM_MODEL = os.environ.get("VLLM_MODEL", "Qwen3.6-27B-FP8") DB_CONNECTION_STRING = os.environ.get("DB_CONNECTION_STRING", "postgresql://postgres:postgres@localhost:5432/iiot_platform") DB_TIMEOUT = int(os.environ.get("DB_TIMEOUT", "10")) _SERVER_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) STORAGE_DIR = os.path.join(_SERVER_DIR, "storage") logging.basicConfig( level=logging.INFO, stream=sys.stderr, format="%(asctime)s [pid_worker] %(levelname)s %(message)s", ) app = FastAPI() # ── 싱글톤 ─────────────────────────────────────────────────────────────────── @lru_cache(maxsize=1) def _llm(): from openai import OpenAI return OpenAI(base_url=VLLM_BASE_URL, api_key="dummy") @lru_cache(maxsize=1) def _ocr(): from paddleocr import PaddleOCR use_gpu = os.environ.get("PADDLE_USE_GPU", "true").lower() == "true" try: return PaddleOCR(use_angle_cls=True, lang="korean", use_gpu=use_gpu, show_log=False) except Exception: if use_gpu: os.environ["PADDLE_USE_GPU"] = "false" return _ocr() raise # ── DB ─────────────────────────────────────────────────────────────────────── def _get_db_connection(): import psycopg return psycopg.connect(DB_CONNECTION_STRING, connect_timeout=DB_TIMEOUT) # ── 텍스트 추출 ────────────────────────────────────────────────────────────── def _extract_text_from_dxf(filepath: str) -> str: import ezdxf from ezdxf.tools.text import plain_mtext doc = ezdxf.readfile(filepath) msp = doc.modelspace() texts = [] for entity in msp: if entity.dxftype() == "TEXT": texts.append(entity.dxf.text) elif entity.dxftype() == "MTEXT": try: plain = plain_mtext(entity.dxf.text) if plain.strip(): texts.append(plain) except Exception: pass return "\n".join(texts) def _extract_text_from_pdf(filepath: str) -> str: import fitz doc = fitz.open(filepath) return "\n".join(page.get_text() for page in doc) def _extract_text_from_pdf_ocr(filepath: str) -> str: import fitz from PIL import Image import numpy as np doc = fitz.open(filepath) all_texts = [] for page in doc: mat = fitz.Matrix(300 / 72) pix = page.get_pixmap(matrix=mat) img = Image.open(io.BytesIO(pix.tobytes("png"))) result = _ocr().ocr(np.array(img), cls=True) if result and result[0]: all_texts.extend(line[1][0] for line in result[0]) return "\n".join(all_texts) # ── JSON 배열 파싱 유틸 ─────────────────────────────────────────────────────── def _parse_json_array(raw: str, finish_reason: str = "") -> list: """LLM 출력에서 JSON 배열 추출. finish_reason=length 잘림 복구 포함.""" if raw.startswith("```"): lines = raw.splitlines() raw = "\n".join(lines[1:-1] if lines and lines[-1].strip() == "```" else lines[1:]).strip() if finish_reason == "length": last_close = raw.rfind("}") if last_close != -1: raw = raw[:last_close + 1] + "]" # 가장 긴 균형 잡힌 [...] 추출 depth = 0; start = -1; best = "" for i, c in enumerate(raw): if c == "[": if depth == 0: start = i depth += 1 elif c == "]": depth -= 1 if depth == 0 and start >= 0: cand = raw[start:i + 1] if len(cand) > len(best): best = cand raw = best if best else "[]" try: return json.loads(raw) except json.JSONDecodeError: data = [] for obj in re.findall(r"\{[^{}]*\}", raw, re.DOTALL): try: data.append(json.loads(obj)) except json.JSONDecodeError: pass return data # ── 태그 추출/매핑 도구 ─────────────────────────────────────────────────────── def _extract_pid_tags(text: str, source_type: str) -> str: system = ( "You are a P&ID (Piping and Instrumentation Diagram) expert.\n" "Extract all instrument and equipment tags from the provided text.\n" "Return ONLY a valid JSON array. Each element must have exactly these fields:\n" '{"tagNo":"FCV-101","equipmentName":null,"instrumentType":"FCV",' '"lineNumber":null,"pidDrawingNo":null,"confidence":0.95}\n' "Rules:\n" "- tagNo: any token matching [LETTERS]-[DIGITS] or [LETTERS]-[DIGITS]-[SUFFIX]\n" " Examples: FCV-101, P-10101, T-10100, VG-6203-15A-F1A-n, BT-6200, DP-10101\n" "- instrumentType: leading letters of tagNo\n" "- equipmentName: descriptive name if present near tag, else null\n" "- lineNumber/pidDrawingNo: null unless explicitly associated\n" "- confidence: 0.95 for clear tags, lower for ambiguous\n" "- Output ONLY the JSON array, no markdown, no explanation.\n" "- If no tags found, return: []\n" ) truncated = text[:100000] resp = _llm().chat.completions.create( model=VLLM_MODEL, messages=[ {"role": "system", "content": system}, {"role": "user", "content": f"Source: {source_type}\n\nText:\n{truncated}"}, ], max_tokens=32768, temperature=0.1, extra_body={"chat_template_kwargs": {"enable_thinking": False}}, ) raw = (resp.choices[0].message.content or "").strip() data = _parse_json_array(raw, resp.choices[0].finish_reason) logging.info(f"extract_pid_tags source={source_type} count={len(data)}") return json.dumps({ "success": True, "data": {"count": len(data), "tags": data}, "message": "태그 추출 완료" }, ensure_ascii=False, indent=2) def _match_pid_tags(pid_tags: list, experion_tags: list) -> str: system = ( "You are a P&ID to Experion tag matching expert.\n" "Match P&ID tags to Experion tags based on similarity.\n" "Return ONLY a JSON array:\n" '[{"pidTag":"FT-101","experionTag":"ft-101.pv","confidence":0.92},...]\n' "- If no good match: confidence < 0.5, experionTag null\n" "- Output ONLY the JSON array.\n" ) resp = _llm().chat.completions.create( model=VLLM_MODEL, messages=[ {"role": "system", "content": system}, {"role": "user", "content": ( f"P&ID Tags:\n{chr(10).join(pid_tags)}\n\n" f"Experion Tags:\n{chr(10).join(experion_tags)}" )}, ], max_tokens=16384, temperature=0.1, extra_body={"chat_template_kwargs": {"enable_thinking": False}}, ) raw = (resp.choices[0].message.content or "").strip() data = _parse_json_array(raw, resp.choices[0].finish_reason) return json.dumps({ "success": True, "data": {"count": len(data), "mappings": data}, "message": "태그 매핑 완료" }, ensure_ascii=False, indent=2) # ── 도면 파싱 도구 ──────────────────────────────────────────────────────────── _TAG_EXTRACT_SYSTEM = ( "You are a P&ID (Piping and Instrumentation Diagram) expert.\n" "Extract instrument and equipment tags from the provided text.\n" "Return ONLY a JSON array:\n" '[{"tagNo":"FIT-10115","equipmentName":"Flow Transmitter","instrumentType":"FIT",' '"lineNumber":"L-101","pidDrawingNo":"P&ID-001","confidence":0.95},...]\n' "Rules:\n" "- tagNo: Instrument [Function]-[Number], Equipment [Type]-[Number]\n" "- instrumentType: first 2-4 letters of tagNo\n" "- equipmentName/lineNumber/pidDrawingNo: null if not present\n" "- confidence: 0.0 to 1.0\n" "- Output ONLY the JSON array, no markdown.\n" "- If no tags found, return: []\n" ) def _parse_pid_dxf(filepath: str) -> str: text = _extract_text_from_dxf(filepath) if not text.strip(): return json.dumps({"success": True, "text": "", "count": 0, "tags": []}, ensure_ascii=False, indent=2) resp = _llm().chat.completions.create( model=VLLM_MODEL, messages=[ {"role": "system", "content": _TAG_EXTRACT_SYSTEM}, {"role": "user", "content": f"Source: dxf\n\nText:\n{text[:8000]}"}, ], max_tokens=8192, temperature=0.1, ) raw = (resp.choices[0].message.content or "").strip() data = _parse_json_array(raw, resp.choices[0].finish_reason) if not isinstance(data, list): data = [] return json.dumps({ "success": True, "data": {"text": text[:10000], "count": len(data), "tags": data}, "message": "DXF 파싱 완료" }, ensure_ascii=False, indent=2) def _parse_pid_pdf(filepath: str, use_ocr: bool = True) -> str: text = _extract_text_from_pdf_ocr(filepath) if use_ocr else _extract_text_from_pdf(filepath) if not text.strip(): return json.dumps({"success": True, "text": "", "count": 0, "tags": []}, ensure_ascii=False, indent=2) resp = _llm().chat.completions.create( model=VLLM_MODEL, messages=[ {"role": "system", "content": _TAG_EXTRACT_SYSTEM}, {"role": "user", "content": f"Source: pdf\n\nText:\n{text[:12000]}"}, ], max_tokens=4096, temperature=0.1, ) raw = (resp.choices[0].message.content or "").strip() data = _parse_json_array(raw, resp.choices[0].finish_reason) if not isinstance(data, list): data = [] return json.dumps({ "success": True, "data": {"text": text[:10000], "count": len(data), "tags": data}, "message": "PDF 파싱 완료" }, ensure_ascii=False, indent=2) def _parse_pid_drawing(filepath: str) -> str: ext = os.path.splitext(filepath)[1].lower() if ext == ".dxf": return _parse_pid_dxf(filepath) elif ext == ".pdf": return _parse_pid_pdf(filepath) elif ext == ".dwg": return json.dumps({ "success": False, "data": None, "error": "DWG 파일은 직접 파싱할 수 없습니다. DXF로 변환 후 사용하세요.", "message": "지원하지 않는 파일 형식" }, ensure_ascii=False) else: return json.dumps({ "success": False, "error": f"지원하지 않는 형식: {ext}. 지원: .dxf, .pdf", }, ensure_ascii=False) # ── 그래프 도구 ─────────────────────────────────────────────────────────────── import time async def _build_pid_graph_parallel(filepath: str) -> str: """ P&ID 그래프 빌드 — 독립 프로세스 병렬 아키텍처. Phase 1: 도면 분할 + 기하 추출 Phase 2: 전체 텍스트 1회 추출 Phase 3: 5개 독립 추출 프로세스 병렬 실행 (subprocess.Popen) Phase 4: 결과 통합 + tagNo 기준 중복 제거 Phase 5: 위상 그래프 빌드 + 저장 """ from pipeline.extractor import PidGeometricExtractor from pipeline.topology import PidTopologyBuilder import subprocess import tempfile import shutil os.makedirs(STORAGE_DIR, exist_ok=True) t0 = time.time() basename = os.path.basename(filepath) worker_dir = os.path.dirname(os.path.abspath(__file__)) logging.info(f"[{basename}] === 독립 프로세스 병렬 아키텍처 시작 ===") # ── Phase 1: 도면 분할 + 기하 추출 ────────────────────────────── logging.info(f"[{basename}] Phase 1: 도면 분할 + 기하 추출 시작") extractor = PidGeometricExtractor(filepath) regions = extractor.split_drawings() logging.info(f"[{basename}] 도면 분할: {len(regions)}개 영역") geo_data_path = os.path.join(STORAGE_DIR, basename + "_geo.json") extractor.extract_and_save(geo_data_path) with open(geo_data_path, "r", encoding="utf-8") as f: geo_data = json.load(f) logging.info(f"[{basename}] Phase 1 완료 ({time.time()-t0:.1f}s) - {len(geo_data)}개 엔티티") # ── Phase 2: 전체 텍스트 1회 추출 ─────────────────────────────── t2 = time.time() logging.info(f"[{basename}] Phase 2: 전체 텍스트 1회 추출") full_text = _extract_text_from_dxf(filepath) # 임시 디렉토리 생성 (프로세스 간 통신용) temp_dir = tempfile.mkdtemp(prefix=f"pid_{basename.replace('.dxf', '')}_") text_path = os.path.join(temp_dir, "full_text.txt") with open(text_path, "w", encoding="utf-8") as f: f.write(full_text) logging.info(f"[{basename}] Phase 2 완료 ({time.time()-t2:.1f}s) - {len(full_text)}자") # ── Phase 3: 5개 독립 추출 프로세스 병렬 실행 ─────────────────── t3 = time.time() logging.info(f"[{basename}] Phase 3: 5개 독립 추출 프로세스 병렬 실행") extractors = [ ("sensor", "pid_extract_sensor.py"), ("valve", "pid_extract_valve.py"), ("system", "pid_extract_system.py"), ("gauge", "pid_extract_gauge.py"), ("pump", "pid_extract_pump.py"), ] results_dir = os.path.join(temp_dir, "results") os.makedirs(results_dir, exist_ok=True) processes = [] for name, script in extractors: output_path = os.path.join(results_dir, f"{name}.json") script_path = os.path.join(worker_dir, script) cmd = [ sys.executable, script_path, "--input", text_path, "--output", output_path, ] try: proc = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, ) processes.append((name, proc, output_path)) logging.info(f"[{basename}] 시작: {name} (pid={proc.pid})") except Exception as e: logging.error(f"[{basename}] 실행 실패 {name}: {e}") # 모든 프로세스 대기 (timeout=300초) for name, proc, output_path in processes: try: stdout, stderr = proc.communicate(timeout=300) if proc.returncode != 0: logging.error(f"[{basename}] {name} 실패 (rc={proc.returncode}): {stderr[:200]}") else: logging.info(f"[{basename}] {name} 완료: {stdout.strip()[:100]}") except subprocess.TimeoutExpired: proc.kill() logging.error(f"[{basename}] {name} 타임아웃 (300초)") logging.info(f"[{basename}] Phase 3 완료 ({time.time()-t3:.1f}s)") # ── Phase 4: 결과 JSON 통합 + tagNo 기준 중복 제거 ────────────── t4 = time.time() logging.info(f"[{basename}] Phase 4: 결과 통합 + 중복 제거") all_tags = [] seen_tagnos = set() for name, _, output_path in processes: if not os.path.exists(output_path): logging.warning(f"[{basename}] 결과 없음: {name}") continue try: with open(output_path, "r", encoding="utf-8") as f: result = json.load(f) tags = result.get("tags", []) count_new = 0 for tag in tags: tag_no = tag.get("tagNo", "") if tag_no and tag_no not in seen_tagnos: seen_tagnos.add(tag_no) all_tags.append(tag) count_new += 1 logging.info(f"[{basename}] {name}: {len(tags)}개 중 {count_new}개 신규") except Exception as e: logging.error(f"[{basename}] 결과 로드 실패 {name}: {e}") logging.info(f"[{basename}] Phase 4 완료 ({time.time()-t4:.1f}s) - 총 {len(all_tags)}개 태그") # ── Phase 5: 위상 그래프 빌드 + 저장 ──────────────────────────── t5 = time.time() logging.info(f"[{basename}] Phase 5: 위상 그래프 빌드") builder = PidTopologyBuilder(geo_data) builder.build_graph() # 추출된 태그를 그래프에 추가 from shapely.geometry import box as shapely_box for tag in all_tags: tag_no = tag.get("tagNo", "UNKNOWN") eq_name = tag.get("equipmentName") inst_type = tag.get("instrumentType") confidence = tag.get("confidence", 0.5) # 해당 태그의 bbox 찾기 (geo_data에서 clean_value 매칭) matched_bbox = None for entity in geo_data: if entity.get("clean_value", "").upper() == tag_no.upper(): bbox = entity.get("bbox", {}) matched_bbox = ( bbox.get("min_x"), bbox.get("min_y"), bbox.get("max_x"), bbox.get("max_y") ) break node_id = f"tag_{tag_no}" if matched_bbox: bbox_geom = shapely_box(matched_bbox[0], matched_bbox[1], matched_bbox[2], matched_bbox[3]) builder.G.add_node(node_id, type="TEXT", bbox=bbox_geom, value=tag_no, equipment_name=eq_name, instrument_type=inst_type, confidence=confidence) else: builder.G.add_node(node_id, type="TEXT", bbox=shapely_box(0, 0, 1, 1), value=tag_no, equipment_name=eq_name, instrument_type=inst_type, confidence=confidence) # 태그-설비 연결 equipments = [n for n, d in builder.G.nodes(data=True) if d.get("type") not in ("TEXT", "LINE", "LWPOLYLINE")] if equipments: eq_grid = builder._build_spatial_grid(equipments) tag_ids = [f"tag_{t.get('tagNo', '')}" for t in all_tags] for tag_id in tag_ids: if tag_id in builder.G: best_match = builder._find_nearest_equipment(tag_id, eq_grid) if best_match: builder.G.add_edge(tag_id, best_match, relation="associated_with") graph_id = basename.replace(".dxf", "_graph.json") graph_path = os.path.join(STORAGE_DIR, graph_id) builder.save_graph(graph_path) # 임시 디렉토리 정리 shutil.rmtree(temp_dir, ignore_errors=True) total_time = time.time() - t0 logging.info(f"[{basename}] 전체 완료 ({total_time:.1f}s) - graph_id={graph_id} " f"nodes={builder.G.number_of_nodes()} " f"edges={builder.G.number_of_edges()} " f"tags={len(all_tags)}") return json.dumps({ "success": True, "graph_id": graph_id, "graph_path": graph_path, "nodes": builder.G.number_of_nodes(), "edges": builder.G.number_of_edges(), "tags_extracted": len(all_tags), "processing_time_sec": round(total_time, 1) }, ensure_ascii=False) def _analyze_pid_impact(graph_id: str, start_node_id: str) -> str: from pipeline.analyzer import PidAnalysisEngine graph_path = os.path.join(STORAGE_DIR, graph_id) mapping_path = graph_path.replace("_graph.json", "_mapping.json") analyzer = PidAnalysisEngine(graph_path, mapping_path) result = analyzer.analyze_impact(start_node_id) return json.dumps(result, ensure_ascii=False, indent=2) # ── 요청 디스패처 ───────────────────────────────────────────────────────────── async def _dispatch(tool: str, params: dict) -> str: try: match tool: # blocking 함수는 asyncio.to_thread로 스레드풀 오프로드 case "extract_pid_tags": return await asyncio.to_thread(_extract_pid_tags, **params) case "match_pid_tags": return await asyncio.to_thread(_match_pid_tags, **params) case "parse_pid_dxf": return await asyncio.to_thread(_parse_pid_dxf, **params) case "parse_pid_pdf": return await asyncio.to_thread(_parse_pid_pdf, **params) case "parse_pid_drawing": return await asyncio.to_thread(_parse_pid_drawing, **params) case "analyze_pid_impact": return await asyncio.to_thread(_analyze_pid_impact, **params) # 이미 async — 직접 await case "build_pid_graph_parallel": return await _build_pid_graph_parallel(**params) case _: return json.dumps({"success": False, "error": f"알 수 없는 도구: {tool}"}, ensure_ascii=False) except Exception as e: logging.error(f"dispatch error tool={tool}: {e}", exc_info=True) return json.dumps({"success": False, "error": str(e)}, ensure_ascii=False) # ── 종료 예약 ───────────────────────────────────────────────────────────────── def _schedule_shutdown(): """ 응답 전송 완료 후 프로세스 종료 예약. FastAPI의 BackgroundTasks를 사용하여 응답이 완전히 전송된 후 종료되도록 유도함. """ async def _do(): # 네트워크 전송 및 커넥션 정리를 위한 최소한의 대기 시간 await asyncio.sleep(1.0) logging.info("One-shot worker shutting down...") os.kill(os.getpid(), signal.SIGTERM) asyncio.create_task(_do()) # ── HTTP 엔드포인트 ─────────────────────────────────────────────────────────── @app.get("/health") async def health(): return {"status": "ok"} @app.post("/execute") async def execute(request: Request): body = await request.json() return await _dispatch(body["tool"], body["params"]) from fastapi import BackgroundTasks @app.post("/execute/one_shot") async def execute_one_shot(request: Request, background_tasks: BackgroundTasks): """요청 처리 후 프로세스 자동 종료 (P&ID 워커 전용).""" body = await request.json() result = await _dispatch(body["tool"], body["params"]) # BackgroundTasks에 등록하여 응답 전송이 완료된 후 _schedule_shutdown이 실행되도록 함 background_tasks.add_function(_schedule_shutdown) return result # ── 진입점 ─────────────────────────────────────────────────────────────────── if __name__ == "__main__": port = int(sys.argv[1]) if len(sys.argv) > 1 else 5004 os.makedirs(STORAGE_DIR, exist_ok=True) uvicorn.run(app, host="0.0.0.0", port=port, log_level="warning")