Files
ExperionCrawler/.rooBackup/2026-05-05-114600/mcp-server/worker/pid_worker.py
2026-05-08 17:22:10 +09:00

529 lines
21 KiB
Python

#!/usr/bin/env python3
"""P&ID 파싱 전용 워커 프로세스
Usage: python pid_worker.py <port>
담당 도구:
extract_pid_tags, match_pid_tags,
parse_pid_dxf, parse_pid_pdf, parse_pid_drawing,
build_pid_graph_parallel, analyze_pid_impact
"""
from __future__ import annotations
import sys
import os
# mcp-server 디렉토리를 Python 경로에 추가 (pipeline 패키지 접근)
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import io
import json
import asyncio
import signal
import logging
import re
from functools import lru_cache
from fastapi import FastAPI, Request
import uvicorn
# ── Configuration ────────────────────────────────────────────────────────────
# Every value below is resolved once at import time; environment variables
# take precedence over the built-in fallbacks.
# Base URL handed to the OpenAI-compatible client created in _llm().
VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8000/v1")
# Model name sent with every chat-completion request.
VLLM_MODEL = os.environ.get("VLLM_MODEL", "Qwen3.6-27B-FP8")
# Connection string passed to psycopg.connect() in _get_db_connection().
# NOTE(review): the fallback embeds default credentials — confirm it is only
# ever relied on in local development.
DB_CONNECTION_STRING = os.environ.get("DB_CONNECTION_STRING", "postgresql://postgres:postgres@localhost:5432/iiot_platform")
# Passed as connect_timeout to psycopg.connect() (presumably seconds — confirm).
DB_TIMEOUT = int(os.environ.get("DB_TIMEOUT", "10"))
# Directory one level above the directory that contains this file.
_SERVER_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Directory where geometry and graph JSON artefacts are written and read.
STORAGE_DIR = os.path.join(_SERVER_DIR, "storage")
# Log to stderr at INFO level with a "[pid_worker]" marker in every record.
logging.basicConfig(
level=logging.INFO,
stream=sys.stderr,
format="%(asctime)s [pid_worker] %(levelname)s %(message)s",
)
# FastAPI application the HTTP endpoints further down are registered on.
app = FastAPI()
# ── 싱글톤 ───────────────────────────────────────────────────────────────────
@lru_cache(maxsize=1)
def _llm():
    """Return the process-wide OpenAI-compatible client.

    The client is built on first use and memoised by ``lru_cache``, so every
    later call returns the same object. ``openai`` is imported inside the
    function, so the import only happens when a client is first requested.
    """
    import openai

    client = openai.OpenAI(api_key="dummy", base_url=VLLM_BASE_URL)
    return client
@lru_cache(maxsize=1)
def _ocr():
    """Return the shared PaddleOCR engine, preferring GPU with a CPU fallback.

    GPU use is requested unless the ``PADDLE_USE_GPU`` environment variable is
    set to something other than ``"true"`` (case-insensitive). When GPU
    initialisation fails, the failure is logged and the engine is built on
    CPU instead; a failure of the CPU build propagates to the caller.

    Returns:
        The PaddleOCR instance, memoised by ``lru_cache`` after the first
        successful construction.

    Raises:
        Exception: whatever PaddleOCR raises when the CPU build fails.
    """
    from paddleocr import PaddleOCR

    def _build(use_gpu: bool):
        return PaddleOCR(use_angle_cls=True, lang="korean", use_gpu=use_gpu, show_log=False)

    gpu_requested = os.environ.get("PADDLE_USE_GPU", "true").lower() == "true"
    if not gpu_requested:
        return _build(False)

    try:
        return _build(True)
    except Exception as exc:
        # Previously this failure was swallowed silently; surface it so an
        # unexpected CPU-only run can be diagnosed from the logs.
        logging.warning("PaddleOCR GPU initialisation failed (%s); falling back to CPU", exc)
        # Kept for backward compatibility: the original implementation also
        # flipped the variable so later readers see the CPU decision.
        os.environ["PADDLE_USE_GPU"] = "false"
        return _build(False)
# ── DB ───────────────────────────────────────────────────────────────────────
def _get_db_connection():
    """Open and return a new psycopg connection.

    Uses ``DB_CONNECTION_STRING`` and passes ``DB_TIMEOUT`` as
    ``connect_timeout``. The caller owns the returned connection.
    """
    import psycopg

    connection = psycopg.connect(DB_CONNECTION_STRING, connect_timeout=DB_TIMEOUT)
    return connection
# ── 텍스트 추출 ──────────────────────────────────────────────────────────────
def _extract_text_from_dxf(filepath: str) -> str:
    """Collect the text of every TEXT and MTEXT entity in a DXF modelspace.

    Args:
        filepath: Path of the DXF file to read with ezdxf.

    Returns:
        The collected strings joined by newlines, in modelspace order. MTEXT
        content is converted to plain text and skipped when it is blank.

    Raises:
        Exception: whatever ``ezdxf.readfile`` raises for an unreadable file.
    """
    import ezdxf
    from ezdxf.tools.text import plain_mtext

    doc = ezdxf.readfile(filepath)
    texts = []
    skipped = 0
    for entity in doc.modelspace():
        kind = entity.dxftype()
        if kind == "TEXT":
            texts.append(entity.dxf.text)
        elif kind == "MTEXT":
            try:
                # ezdxf exposes the MTEXT content as ``entity.text``; reading
                # ``entity.dxf.text`` alone made every MTEXT fail and get
                # dropped. The dxf attribute is kept as a fallback.
                content = getattr(entity, "text", None)
                if content is None:
                    content = entity.dxf.text
                plain = plain_mtext(content)
            except Exception as exc:
                # Best effort: one bad entity must not abort the whole file,
                # but it should not vanish without a trace either.
                skipped += 1
                logging.debug("MTEXT entity skipped: %s", exc)
                continue
            if plain.strip():
                texts.append(plain)
    if skipped:
        logging.warning("%d MTEXT entities could not be read from %s", skipped, filepath)
    return "\n".join(texts)
def _extract_text_from_pdf(filepath: str) -> str:
    """Return the embedded text layer of a PDF, one page after another.

    Args:
        filepath: Path of the PDF opened with PyMuPDF.

    Returns:
        The ``get_text()`` output of every page joined by newlines.
    """
    import fitz

    # The context manager closes the document (and its file handle) even when
    # text extraction raises; the original left it open.
    with fitz.open(filepath) as doc:
        return "\n".join(page.get_text() for page in doc)
def _extract_text_from_pdf_ocr(filepath: str) -> str:
    """Rasterise each PDF page and return the text recognised by PaddleOCR.

    Args:
        filepath: Path of the PDF opened with PyMuPDF.

    Returns:
        The recognised text lines of all pages joined by newlines. Pages for
        which the OCR engine reports nothing contribute no lines.
    """
    import fitz
    from PIL import Image
    import numpy as np

    # PDF user space is 72 units per inch, so 300/72 scales to 300 DPI.
    # Both factors must be given: a single-number Matrix is interpreted by
    # PyMuPDF as a rotation in degrees, which rendered pages at 72 DPI and
    # slightly rotated instead of enlarged.
    zoom = 300 / 72
    render_matrix = fitz.Matrix(zoom, zoom)

    all_texts = []
    # The context manager closes the document even when rendering or OCR fails.
    with fitz.open(filepath) as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=render_matrix)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            result = _ocr().ocr(np.array(img), cls=True)
            # The first element is empty/None when nothing was recognised.
            if result and result[0]:
                all_texts.extend(line[1][0] for line in result[0])
    return "\n".join(all_texts)
# ── JSON 배열 파싱 유틸 ───────────────────────────────────────────────────────
def _scan_json_arrays(text: str) -> tuple[str, str]:
    """Find bracketed array candidates in *text*.

    Returns ``(best, open_tail)`` where ``best`` is the longest balanced
    top-level ``[...]`` span (``""`` when there is none) and ``open_tail`` is
    the text from the outermost ``[`` that was never closed (``""`` when every
    opened bracket was closed).
    """
    best = ""
    depth = 0
    start = -1
    in_string = False
    escaped = False
    for i, ch in enumerate(text):
        if in_string:
            # Brackets inside a JSON string must not affect nesting depth.
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            # Quotes are only meaningful inside an array; an unbalanced quote
            # in surrounding prose must not hide the array that follows.
            in_string = depth > 0
        elif ch == "[":
            if depth == 0:
                start = i
            depth += 1
        elif ch == "]" and depth > 0:
            # A stray "]" at depth 0 is ignored instead of driving the depth
            # negative, which used to make every later array unbalanced.
            depth -= 1
            if depth == 0 and i + 1 - start > len(best):
                best = text[start:i + 1]
    open_tail = text[start:] if depth > 0 else ""
    return best, open_tail


def _parse_json_array(raw: str, finish_reason: str = "") -> list:
    """Extract a JSON array from LLM output.

    Handles a surrounding Markdown code fence, prose around the array, and
    output truncated by the token limit.

    Args:
        raw: Text returned by the model.
        finish_reason: Completion finish reason; ``"length"`` triggers
            truncation repair (cut after the last ``}`` and close the array).

    Returns:
        The parsed list. When the array itself is not valid JSON, or was
        opened but never closed, the flat ``{...}`` objects that parse on
        their own are returned instead. ``[]`` when nothing is recoverable.
    """
    text = (raw or "").strip()
    if text.startswith("```"):
        lines = text.splitlines()
        inner = lines[1:-1] if lines[-1].strip() == "```" else lines[1:]
        text = "\n".join(inner).strip()

    if finish_reason == "length":
        last_close = text.rfind("}")
        if last_close != -1:
            text = text[:last_close + 1] + "]"

    candidate, open_tail = _scan_json_arrays(text)
    if candidate:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            pass  # fall through to object-by-object salvage

    # Salvage from the malformed candidate, or from an array that was opened
    # but never closed (truncation not reported as finish_reason="length").
    recovered = []
    for fragment in re.findall(r"\{[^{}]*\}", candidate or open_tail):
        try:
            recovered.append(json.loads(fragment))
        except json.JSONDecodeError:
            continue
    return recovered
# ── 태그 추출/매핑 도구 ───────────────────────────────────────────────────────
def _extract_pid_tags(text: str, source_type: str) -> str:
    """Ask the LLM to extract P&ID tags from free text.

    Args:
        text: Source text; only the first 100000 characters are sent.
        source_type: Label placed on the ``Source:`` line of the user message.

    Returns:
        Indented JSON string with ``success``, ``data`` (``count`` and
        ``tags``) and ``message``.
    """
    system_prompt = (
        "You are a P&ID (Piping and Instrumentation Diagram) expert.\n"
        "Extract all instrument and equipment tags from the provided text.\n"
        "Return ONLY a valid JSON array. Each element must have exactly these fields:\n"
        '{"tagNo":"FCV-101","equipmentName":null,"instrumentType":"FCV",'
        '"lineNumber":null,"pidDrawingNo":null,"confidence":0.95}\n'
        "Rules:\n"
        "- tagNo: any token matching [LETTERS]-[DIGITS] or [LETTERS]-[DIGITS]-[SUFFIX]\n"
        " Examples: FCV-101, P-10101, T-10100, VG-6203-15A-F1A-n, BT-6200, DP-10101\n"
        "- instrumentType: leading letters of tagNo\n"
        "- equipmentName: descriptive name if present near tag, else null\n"
        "- lineNumber/pidDrawingNo: null unless explicitly associated\n"
        "- confidence: 0.95 for clear tags, lower for ambiguous\n"
        "- Output ONLY the JSON array, no markdown, no explanation.\n"
        "- If no tags found, return: []\n"
    )
    clipped = text[:100000]
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Source: {source_type}\n\nText:\n{clipped}"},
    ]
    completion = _llm().chat.completions.create(
        model=VLLM_MODEL,
        messages=conversation,
        max_tokens=32768,
        temperature=0.1,
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    choice = completion.choices[0]
    answer = (choice.message.content or "").strip()
    tags = _parse_json_array(answer, choice.finish_reason)
    logging.info(f"extract_pid_tags source={source_type} count={len(tags)}")

    payload = {
        "success": True,
        "data": {"count": len(tags), "tags": tags},
        "message": "태그 추출 완료"
    }
    return json.dumps(payload, ensure_ascii=False, indent=2)
def _match_pid_tags(pid_tags: list, experion_tags: list) -> str:
    """Ask the LLM to pair P&ID tags with Experion tags.

    Args:
        pid_tags: P&ID tag strings, sent one per line.
        experion_tags: Experion tag strings, sent one per line.

    Returns:
        Indented JSON string with ``success``, ``data`` (``count`` and
        ``mappings``) and ``message``.
    """
    system_prompt = (
        "You are a P&ID to Experion tag matching expert.\n"
        "Match P&ID tags to Experion tags based on similarity.\n"
        "Return ONLY a JSON array:\n"
        '[{"pidTag":"FT-101","experionTag":"ft-101.pv","confidence":0.92},...]\n'
        "- If no good match: confidence < 0.5, experionTag null\n"
        "- Output ONLY the JSON array.\n"
    )
    pid_listing = "\n".join(pid_tags)
    experion_listing = "\n".join(experion_tags)
    user_prompt = (
        f"P&ID Tags:\n{pid_listing}\n\n"
        f"Experion Tags:\n{experion_listing}"
    )
    completion = _llm().chat.completions.create(
        model=VLLM_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=16384,
        temperature=0.1,
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    choice = completion.choices[0]
    answer = (choice.message.content or "").strip()
    mappings = _parse_json_array(answer, choice.finish_reason)

    payload = {
        "success": True,
        "data": {"count": len(mappings), "mappings": mappings},
        "message": "태그 매핑 완료"
    }
    return json.dumps(payload, ensure_ascii=False, indent=2)
# ── 도면 파싱 도구 ────────────────────────────────────────────────────────────
# System prompt used by both _parse_pid_dxf and _parse_pid_pdf: it tells the
# model to answer with a bare JSON array of tag objects (tagNo,
# equipmentName, instrumentType, lineNumber, pidDrawingNo, confidence).
_TAG_EXTRACT_SYSTEM = (
"You are a P&ID (Piping and Instrumentation Diagram) expert.\n"
"Extract instrument and equipment tags from the provided text.\n"
"Return ONLY a JSON array:\n"
'[{"tagNo":"FIT-10115","equipmentName":"Flow Transmitter","instrumentType":"FIT",'
'"lineNumber":"L-101","pidDrawingNo":"P&ID-001","confidence":0.95},...]\n'
"Rules:\n"
"- tagNo: Instrument [Function]-[Number], Equipment [Type]-[Number]\n"
"- instrumentType: first 2-4 letters of tagNo\n"
"- equipmentName/lineNumber/pidDrawingNo: null if not present\n"
"- confidence: 0.0 to 1.0\n"
"- Output ONLY the JSON array, no markdown.\n"
"- If no tags found, return: []\n"
)
def _parse_pid_dxf(filepath: str) -> str:
    """Extract text from a DXF drawing and have the LLM list its tags.

    Args:
        filepath: Path of the DXF file.

    Returns:
        Indented JSON string with ``success``, ``data`` (``text`` limited to
        10000 characters, ``count``, ``tags``) and ``message``. A drawing with
        no text yields the same envelope with empty values and no LLM call.
    """
    text = _extract_text_from_dxf(filepath)
    if not text.strip():
        # Same envelope as the normal path so clients can always read
        # data.tags. The flat text/count/tags keys are what this branch
        # returned before and are kept for existing consumers.
        return json.dumps({
            "success": True,
            "data": {"text": "", "count": 0, "tags": []},
            "message": "DXF 파싱 완료",
            "text": "",
            "count": 0,
            "tags": [],
        }, ensure_ascii=False, indent=2)

    # Only the first 8000 characters are sent to the model.
    # NOTE(review): unlike _extract_pid_tags this call does not pass
    # enable_thinking=False — confirm whether that difference is intended.
    resp = _llm().chat.completions.create(
        model=VLLM_MODEL,
        messages=[
            {"role": "system", "content": _TAG_EXTRACT_SYSTEM},
            {"role": "user", "content": f"Source: dxf\n\nText:\n{text[:8000]}"},
        ],
        max_tokens=8192,
        temperature=0.1,
    )
    raw = (resp.choices[0].message.content or "").strip()
    data = _parse_json_array(raw, resp.choices[0].finish_reason)
    if not isinstance(data, list):
        data = []
    return json.dumps({
        "success": True,
        "data": {"text": text[:10000], "count": len(data), "tags": data},
        "message": "DXF 파싱 완료"
    }, ensure_ascii=False, indent=2)
def _parse_pid_pdf(filepath: str, use_ocr: bool = True) -> str:
    """Extract text from a PDF drawing and have the LLM list its tags.

    Args:
        filepath: Path of the PDF file.
        use_ocr: When true (default) pages are rasterised and read with OCR;
            otherwise the embedded text layer is used.

    Returns:
        Indented JSON string with ``success``, ``data`` (``text`` limited to
        10000 characters, ``count``, ``tags``) and ``message``. A PDF with no
        text yields the same envelope with empty values and no LLM call.
    """
    if use_ocr:
        text = _extract_text_from_pdf_ocr(filepath)
    else:
        text = _extract_text_from_pdf(filepath)

    if not text.strip():
        # Same envelope as the normal path so clients can always read
        # data.tags. The flat text/count/tags keys are what this branch
        # returned before and are kept for existing consumers.
        return json.dumps({
            "success": True,
            "data": {"text": "", "count": 0, "tags": []},
            "message": "PDF 파싱 완료",
            "text": "",
            "count": 0,
            "tags": [],
        }, ensure_ascii=False, indent=2)

    # Only the first 12000 characters are sent to the model.
    # NOTE(review): unlike _extract_pid_tags this call does not pass
    # enable_thinking=False — confirm whether that difference is intended.
    resp = _llm().chat.completions.create(
        model=VLLM_MODEL,
        messages=[
            {"role": "system", "content": _TAG_EXTRACT_SYSTEM},
            {"role": "user", "content": f"Source: pdf\n\nText:\n{text[:12000]}"},
        ],
        max_tokens=4096,
        temperature=0.1,
    )
    raw = (resp.choices[0].message.content or "").strip()
    data = _parse_json_array(raw, resp.choices[0].finish_reason)
    if not isinstance(data, list):
        data = []
    return json.dumps({
        "success": True,
        "data": {"text": text[:10000], "count": len(data), "tags": data},
        "message": "PDF 파싱 완료"
    }, ensure_ascii=False, indent=2)
def _parse_pid_drawing(filepath: str) -> str:
    """Route a drawing to the parser that matches its file extension.

    ``.dxf`` and ``.pdf`` (case-insensitive) are forwarded to their parsers;
    ``.dwg`` and every other extension produce a JSON error string.
    """
    _, suffix = os.path.splitext(filepath)
    ext = suffix.lower()

    if ext == ".dxf":
        return _parse_pid_dxf(filepath)
    if ext == ".pdf":
        return _parse_pid_pdf(filepath)

    if ext == ".dwg":
        # DWG is recognised but cannot be read directly.
        failure = {
            "success": False,
            "data": None,
            "error": "DWG 파일은 직접 파싱할 수 없습니다. DXF로 변환 후 사용하세요.",
            "message": "지원하지 않는 파일 형식"
        }
    else:
        failure = {
            "success": False,
            "error": f"지원하지 않는 형식: {ext}. 지원: .dxf, .pdf",
        }
    return json.dumps(failure, ensure_ascii=False)
# ── 그래프 도구 ───────────────────────────────────────────────────────────────
import time
async def _build_pid_graph_parallel(filepath: str) -> str:
    """Build, enrich and persist the topology graph for one drawing.

    Phases:
      1. Geometry extraction into ``<basename>_geo.json`` under ``STORAGE_DIR``.
      2. Topology graph construction from the extracted geometry.
      3. Concurrent LLM mapping of transmitter, valve and equipment nodes.
      4. Resolved tags are merged into the graph, linked to the nearest
         equipment, and the graph is saved under ``STORAGE_DIR``.

    Args:
        filepath: Path of the drawing handed to ``PidGeometricExtractor``.

    Returns:
        JSON string with ``success``, ``graph_id``, ``graph_path``, ``nodes``,
        ``edges`` and ``processing_time_sec``.

    NOTE(review): phases 1, 2 and the DB query are synchronous calls inside
    this coroutine, so the event loop is blocked while they run — confirm
    that is acceptable for this worker.
    """
    from pipeline.extractor import PidGeometricExtractor
    from pipeline.topology import PidTopologyBuilder
    from pipeline.mapper import IntelligentMapper
    from openai import AsyncOpenAI

    os.makedirs(STORAGE_DIR, exist_ok=True)
    t0 = time.time()
    basename = os.path.basename(filepath)
    logging.info(f"[{basename}] Phase 1: 기하 추출 시작")

    # Phase 1: geometry extraction
    extractor = PidGeometricExtractor(filepath)
    geo_data_path = os.path.join(STORAGE_DIR, basename + "_geo.json")
    extractor.extract_and_save(geo_data_path)
    with open(geo_data_path, "r", encoding="utf-8") as f:
        geo_data = json.load(f)
    logging.info(f"[{basename}] Phase 1 완료 ({time.time()-t0:.1f}s) - {len(geo_data)}개 엔티티 추출")

    # System tag lookup (best effort: a DB failure leaves the list empty)
    system_tags: list[str] = []
    try:
        conn = _get_db_connection()
        try:
            with conn.cursor() as cur:
                cur.execute("SELECT tagname FROM realtime_table")
                system_tags = [r[0] for r in cur.fetchall()]
        finally:
            # The connection used to be left open after every request.
            conn.close()
    except Exception as e:
        logging.warning(f"시스템 태그 조회 실패: {e}")
    logging.info(f"[{basename}] 시스템 태그 {len(system_tags)}개 로드")

    # Phase 2: first-pass topology (the graph the mapper works on)
    t1 = time.time()
    builder = PidTopologyBuilder(geo_data)
    builder.build_graph()
    logging.info(f"[{basename}] Phase 2 완료 ({time.time()-t1:.1f}s) - 그래프: {builder.G.number_of_nodes()}노드, {builder.G.number_of_edges()}엣지")

    # Phase 3: concurrent LLM mapping
    t2 = time.time()
    api_client = AsyncOpenAI(base_url=VLLM_BASE_URL, api_key="dummy")
    mapper = IntelligentMapper(builder.G, system_tags, api_client=api_client)
    transmitter_nodes = [
        n for n, d in builder.G.nodes(data=True)
        if (d.get("value") or "").upper() in {"FIT", "FT", "LT", "PT", "TE"}
    ]
    valve_nodes = [
        n for n, d in builder.G.nodes(data=True)
        if (d.get("value") or "").upper() in {"FCV", "LCV", "TCV", "PCV", "XV"}
    ]
    equipment_nodes = [
        n for n, d in builder.G.nodes(data=True)
        if d.get("type") not in {"TEXT", "LINE", "LWPOLYLINE"}
    ]
    logging.info(f"[{basename}] Phase 3 시작 - transmitter:{len(transmitter_nodes)}, valve:{len(valve_nodes)}, equipment:{len(equipment_nodes)}")
    extracted_results = await asyncio.gather(
        mapper.extract_transmitters(transmitter_nodes),
        mapper.extract_valves(valve_nodes),
        mapper.extract_equipment(equipment_nodes),
    )
    logging.info(f"[{basename}] Phase 3 완료 ({time.time()-t2:.1f}s)")

    # Merge the mapping results, dropping nodes the mapper left as "UNKNOWN"
    all_mapped_tags = []
    for res_dict in extracted_results:
        for node_id, mapping in res_dict.items():
            if mapping.resolved_tag != "UNKNOWN":
                node_data = builder.G.nodes[node_id]
                all_mapped_tags.append({
                    "entity_id": node_id,
                    "tagName": mapping.resolved_tag,
                    "bbox": (
                        node_data["bbox"].bounds
                        if hasattr(node_data["bbox"], "bounds")
                        else node_data["bbox"]
                    ),
                    "clean_value": mapping.resolved_tag,
                })
    logging.info(f"[{basename}] 매핑 완료: {len(all_mapped_tags)}개 태그 해결")

    # Phase 4: add the mapped tags to the existing graph and save it
    # (no second build_graph() call)
    from shapely.geometry import box as shapely_box

    for tag in all_mapped_tags:
        bbox_vals = tag['bbox']
        # bbox arrives either as a 4-sequence or as a min/max dict
        if isinstance(bbox_vals, (list, tuple)):
            bbox_geom = shapely_box(bbox_vals[0], bbox_vals[1], bbox_vals[2], bbox_vals[3])
        else:
            bbox_geom = shapely_box(bbox_vals['min_x'], bbox_vals['min_y'],
                                    bbox_vals['max_x'], bbox_vals['max_y'])
        # NOTE(review): entity_id is an existing node id, so this updates that
        # node's attributes (including type='TEXT') — confirm this is intended.
        builder.G.add_node(tag['entity_id'],
                           type='TEXT',
                           bbox=bbox_geom,
                           value=tag.get('clean_value') or tag.get('tagName'))

    # Link each mapped tag to its nearest equipment through the spatial grid.
    # .get() matches the equipment filter in phase 3 and avoids a KeyError on
    # nodes that carry no "type" attribute.
    equipments = [n for n, d in builder.G.nodes(data=True)
                  if d.get('type') not in ['TEXT', 'LINE', 'LWPOLYLINE']]
    eq_grid = builder._build_spatial_grid(equipments)
    new_tags = [tag['entity_id'] for tag in all_mapped_tags]
    for tag_id in new_tags:
        best_match = builder._find_nearest_equipment(tag_id, eq_grid)
        if best_match:
            builder.G.add_edge(tag_id, best_match, relation='associated_with')

    # Derive the graph file name from the stem. str.replace(".dxf", ...) left
    # the name unchanged for ".DXF" or any other extension, so the graph was
    # written under the drawing's own file name.
    graph_id = os.path.splitext(basename)[0] + "_graph.json"
    graph_path = os.path.join(STORAGE_DIR, graph_id)
    builder.save_graph(graph_path)

    total_time = time.time() - t0
    logging.info(f"[{basename}] 전체 완료 ({total_time:.1f}s) - graph_id={graph_id} "
                 f"nodes={builder.G.number_of_nodes()} "
                 f"edges={builder.G.number_of_edges()}")
    return json.dumps({
        "success": True,
        "graph_id": graph_id,
        "graph_path": graph_path,
        "nodes": builder.G.number_of_nodes(),
        "edges": builder.G.number_of_edges(),
        "processing_time_sec": round(total_time, 1)
    }, ensure_ascii=False)
def _analyze_pid_impact(graph_id: str, start_node_id: str) -> str:
    """Run the impact analysis for one node of a stored graph.

    Args:
        graph_id: File name of the graph, joined onto ``STORAGE_DIR``.
        start_node_id: Node passed to ``PidAnalysisEngine.analyze_impact``.

    Returns:
        The analysis result serialised as indented JSON.
    """
    from pipeline.analyzer import PidAnalysisEngine

    graph_file = os.path.join(STORAGE_DIR, graph_id)
    # The mapping file name is derived from the graph path by swapping suffixes.
    mapping_file = graph_file.replace("_graph.json", "_mapping.json")
    impact = PidAnalysisEngine(graph_file, mapping_file).analyze_impact(start_node_id)
    return json.dumps(impact, ensure_ascii=False, indent=2)
# ── 요청 디스패처 ─────────────────────────────────────────────────────────────
async def _dispatch(tool: str, params: dict) -> str:
    """Run the tool called *tool* with *params* and return its JSON string.

    Blocking tools are executed through ``asyncio.to_thread``; the coroutine
    tool is awaited directly. An unknown tool name, or any exception raised
    while running a tool, is reported as a ``success: false`` JSON string
    instead of being raised.
    """
    try:
        # Already a coroutine: await it directly.
        if tool == "build_pid_graph_parallel":
            return await _build_pid_graph_parallel(**params)

        # Blocking functions are off-loaded to the thread pool.
        if tool == "extract_pid_tags":
            worker = _extract_pid_tags
        elif tool == "match_pid_tags":
            worker = _match_pid_tags
        elif tool == "parse_pid_dxf":
            worker = _parse_pid_dxf
        elif tool == "parse_pid_pdf":
            worker = _parse_pid_pdf
        elif tool == "parse_pid_drawing":
            worker = _parse_pid_drawing
        elif tool == "analyze_pid_impact":
            worker = _analyze_pid_impact
        else:
            unknown = {"success": False, "error": f"알 수 없는 도구: {tool}"}
            return json.dumps(unknown, ensure_ascii=False)

        return await asyncio.to_thread(worker, **params)
    except Exception as e:
        logging.error(f"dispatch error tool={tool}: {e}", exc_info=True)
        failure = {"success": False, "error": str(e)}
        return json.dumps(failure, ensure_ascii=False)
# ── 종료 예약 ─────────────────────────────────────────────────────────────────
def _schedule_shutdown():
    """Schedule SIGTERM for this process about one second from now.

    The delay leaves time for the HTTP response to be flushed and the
    connection to be cleaned up before the process terminates.

    Safe to call from the event-loop thread and from a worker thread: a sync
    callable registered as a background task is run in a thread pool, where
    ``asyncio.create_task`` fails with "no running event loop".
    """
    def _terminate():
        logging.info("One-shot worker shutting down...")
        os.kill(os.getpid(), signal.SIGTERM)

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop in this thread: fall back to a plain timer thread.
        import threading

        timer = threading.Timer(1.0, _terminate)
        timer.daemon = True
        timer.start()
    else:
        # The loop keeps the timer handle alive, so nothing can be
        # garbage-collected before it fires (unlike an unreferenced task).
        loop.call_later(1.0, _terminate)
# ── HTTP 엔드포인트 ───────────────────────────────────────────────────────────
@app.get("/health")
async def health():
    """Liveness probe: always answers with a fixed ``status: ok`` payload."""
    status_payload = {"status": "ok"}
    return status_payload
@app.post("/execute")
async def execute(request: Request):
    """Run the tool named in the JSON request body and return its result.

    The body must contain ``tool`` and ``params``; both are forwarded to
    ``_dispatch`` unchanged.
    """
    payload = await request.json()
    tool_name = payload["tool"]
    tool_params = payload["params"]
    return await _dispatch(tool_name, tool_params)
from fastapi import BackgroundTasks
@app.post("/execute/one_shot")
async def execute_one_shot(request: Request, background_tasks: BackgroundTasks):
    """Run one tool, then terminate the process once the response is sent.

    The body must contain ``tool`` and ``params``; both are forwarded to
    ``_dispatch``. Shutdown is registered as a background task so that it
    only starts after the response has been handed to the client.
    """
    body = await request.json()
    result = await _dispatch(body["tool"], body["params"])

    async def _shutdown_after_response():
        # An async background task is awaited on the event loop, which is
        # where _schedule_shutdown needs to run to schedule its delayed kill.
        _schedule_shutdown()

    # BackgroundTasks exposes add_task(); the previous add_function() call
    # raised AttributeError on every request, after the tool had already run.
    background_tasks.add_task(_shutdown_after_response)
    return result
# ── 진입점 ───────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Port is the first CLI argument; 5004 when none is given.
    port = int(sys.argv[1]) if sys.argv[1:] else 5004
    # Make sure the artefact directory exists before serving requests.
    os.makedirs(STORAGE_DIR, exist_ok=True)
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=port,
        log_level="warning",
    )