ExperionCrawler/.rooBackup/2026-05-03_013114/mcp-server/worker/pid_worker.py

#!/usr/bin/env python3
"""P&ID 파싱 전용 워커 프로세스

Usage: python pid_worker.py <port>

담당 도구:
  extract_pid_tags, match_pid_tags,
  parse_pid_dxf, parse_pid_pdf, parse_pid_drawing,
  build_pid_graph_parallel, analyze_pid_impact
"""

from __future__ import annotations
import sys
import os

# mcp-server 디렉토리를 Python 경로에 추가 (pipeline 패키지 접근)
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import io
import json
import asyncio
import signal
import logging
import re
from functools import lru_cache

from fastapi import FastAPI, Request
import uvicorn

# ── Configuration ────────────────────────────────────────────────────────────

# Base URL handed to the OpenAI-compatible clients created in this module.
VLLM_BASE_URL        = "http://localhost:8000/v1"
# Model name sent with every chat-completion request.
VLLM_MODEL           = "Qwen/Qwen3-Coder-Next-FP8"
# Connection string passed to psycopg.connect().
# NOTE(review): credentials are hard-coded here — confirm this is intended
# outside local development.
DB_CONNECTION_STRING = "postgresql://postgres:postgres@localhost:5432/iiot_platform"
# Passed as connect_timeout to psycopg.connect().
DB_TIMEOUT           = 10

# Parent of this file's directory (the same directory added to sys.path above).
_SERVER_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Directory where the geometry and graph JSON files are written.
STORAGE_DIR = os.path.join(_SERVER_DIR, "storage")

# Log to stderr with a fixed "[pid_worker]" marker in every record.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stderr,
    format="%(asctime)s [pid_worker] %(levelname)s %(message)s",
)

# ASGI application served by uvicorn in the __main__ block.
app = FastAPI()

# ── 싱글톤 ───────────────────────────────────────────────────────────────────

@lru_cache(maxsize=1)
def _llm():
    """Return the shared synchronous OpenAI client bound to ``VLLM_BASE_URL``.

    The client is built on first use and then served from the one-slot cache.
    ``openai`` is imported lazily so importing this module does not require it.
    """
    from openai import OpenAI

    client = OpenAI(api_key="dummy", base_url=VLLM_BASE_URL)
    return client


@lru_cache(maxsize=1)
def _ocr():
    """Return the shared PaddleOCR engine, preferring GPU and falling back to CPU.

    GPU use is requested unless the ``PADDLE_USE_GPU`` environment variable is
    set to something other than ``"true"`` (case-insensitive). If GPU
    initialisation fails, the failure is logged, ``PADDLE_USE_GPU`` is set to
    ``"false"`` and a CPU engine is built instead.

    Raises:
        Exception: whatever PaddleOCR raises when the CPU engine cannot be
            created either.
    """
    from paddleocr import PaddleOCR

    def _build(use_gpu: bool):
        return PaddleOCR(use_angle_cls=True, lang="korean", use_gpu=use_gpu, show_log=False)

    want_gpu = os.environ.get("PADDLE_USE_GPU", "true").lower() == "true"
    try:
        return _build(want_gpu)
    except Exception as exc:
        if not want_gpu:
            raise
        # Record why the GPU path was abandoned instead of hiding it, then
        # build the CPU engine directly rather than recursing through the cache.
        logging.warning("PaddleOCR GPU initialisation failed (%s); falling back to CPU", exc)
        os.environ["PADDLE_USE_GPU"] = "false"
        return _build(False)

# ── DB ───────────────────────────────────────────────────────────────────────

def _get_db_connection():
    """Open and return a new psycopg connection.

    Uses ``DB_CONNECTION_STRING`` and passes ``DB_TIMEOUT`` as
    ``connect_timeout``. The caller owns the returned connection.
    """
    from psycopg import connect

    return connect(DB_CONNECTION_STRING, connect_timeout=DB_TIMEOUT)

# ── 텍스트 추출 ──────────────────────────────────────────────────────────────

def _extract_text_from_dxf(filepath: str) -> str:
    """Collect the text of every TEXT and MTEXT entity in a DXF model space.

    Args:
        filepath: path of the DXF file to read.

    Returns:
        The entity texts joined with newlines. MTEXT content is converted to
        plain text first, and MTEXT that is blank after conversion is skipped.
    """
    import ezdxf
    from ezdxf.tools.text import plain_mtext

    doc = ezdxf.readfile(filepath)
    texts = []
    for entity in doc.modelspace():
        kind = entity.dxftype()
        if kind == "TEXT":
            texts.append(entity.dxf.text)
        elif kind == "MTEXT":
            try:
                # MTEXT exposes its content as ``entity.text``; it is not a
                # DXF-namespace attribute, so ``entity.dxf.text`` fails here.
                plain = plain_mtext(entity.text)
            except Exception as exc:
                # Best effort: one bad entity must not abort the whole file,
                # but leave a trace instead of dropping it silently.
                logging.warning("skipping unreadable MTEXT entity in %s: %s", filepath, exc)
                continue
            if plain.strip():
                texts.append(plain)
    return "\n".join(texts)


def _extract_text_from_pdf(filepath: str) -> str:
    """Return the embedded text of every page of a PDF, joined with newlines.

    Args:
        filepath: path of the PDF file to read.

    The document is opened as a context manager so it is closed even if text
    extraction raises.
    """
    import fitz

    with fitz.open(filepath) as doc:
        return "\n".join(page.get_text() for page in doc)


def _extract_text_from_pdf_ocr(filepath: str) -> str:
    """Render every PDF page to an image and return the OCR'd text lines.

    Args:
        filepath: path of the PDF file to read.

    Returns:
        The recognised text lines of all pages, joined with newlines.
    """
    import fitz
    from PIL import Image
    import numpy as np

    # PDF user space is 72 units per inch, so this factor renders at 300 DPI.
    # Both scale factors are required: Matrix() with a single number builds a
    # rotation matrix (in degrees), not a zoom.
    zoom = 300 / 72
    render_matrix = fitz.Matrix(zoom, zoom)

    all_texts = []
    with fitz.open(filepath) as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=render_matrix)
            with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                pixels = np.array(img)
            result = _ocr().ocr(pixels, cls=True)
            # result[0] holds the detections for this image; it is falsy when
            # nothing was recognised.
            if result and result[0]:
                all_texts.extend(line[1][0] for line in result[0])
    return "\n".join(all_texts)

# ── JSON 배열 파싱 유틸 ───────────────────────────────────────────────────────

def _longest_balanced_array(text: str) -> str:
    """Return the longest top-level balanced ``[...]`` span in *text*, or ``""``.

    Brackets inside double-quoted strings are ignored while inside an array,
    and a ``]`` seen outside any array is skipped instead of unbalancing the
    scan.
    """
    depth = 0
    start = -1
    best = ""
    in_string = False
    escaped = False
    for i, ch in enumerate(text):
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            # Quotes in surrounding prose are irrelevant; only track strings
            # once we are inside a candidate array.
            if depth > 0:
                in_string = True
        elif ch == "[":
            if depth == 0:
                start = i
            depth += 1
        elif ch == "]" and depth > 0:
            depth -= 1
            if depth == 0:
                cand = text[start:i + 1]
                if len(cand) > len(best):
                    best = cand
    return best


def _parse_json_array(raw: str, finish_reason: str = "") -> list:
    """Extract a JSON array from LLM output, tolerating common damage.

    Args:
        raw: the model output, optionally wrapped in a Markdown code fence.
        finish_reason: the completion's finish reason. When it is ``"length"``
            the text is cut after the last ``}`` and closed with ``]`` so the
            complete leading objects survive truncation.

    Returns:
        The decoded list. If no balanced array can be decoded, every flat
        ``{...}`` object that parses on its own is salvaged instead; an empty
        list is returned when nothing can be recovered.
    """
    if raw.startswith("```"):
        lines = raw.splitlines()
        raw = "\n".join(lines[1:-1] if lines and lines[-1].strip() == "```" else lines[1:]).strip()

    if finish_reason == "length":
        last_close = raw.rfind("}")
        if last_close != -1:
            raw = raw[:last_close + 1] + "]"

    candidate = _longest_balanced_array(raw)
    if candidate:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            pass  # fall through to per-object salvage

    # Salvage from the candidate when there is one, otherwise from the whole
    # text (e.g. output truncated before the closing bracket).
    salvaged = []
    for obj in re.findall(r"\{[^{}]*\}", candidate or raw, re.DOTALL):
        try:
            salvaged.append(json.loads(obj))
        except json.JSONDecodeError:
            continue
    return salvaged

# ── 태그 추출/매핑 도구 ───────────────────────────────────────────────────────

def _extract_pid_tags(text: str, source_type: str) -> str:
    """Ask the LLM to extract P&ID tags from *text* and return a JSON report.

    Args:
        text: raw text to scan; only the first 100000 characters are sent.
        source_type: label placed in the user prompt as ``Source: ...``.

    Returns:
        A JSON string with ``success``, ``count`` and ``tags`` keys.
    """
    prompt_parts = [
        "You are a P&ID (Piping and Instrumentation Diagram) expert.\n",
        "Extract all instrument and equipment tags from the provided text.\n",
        "Return ONLY a valid JSON array. Each element must have exactly these fields:\n",
        '{"tagNo":"FCV-101","equipmentName":null,"instrumentType":"FCV",',
        '"lineNumber":null,"pidDrawingNo":null,"confidence":0.95}\n',
        "Rules:\n",
        "- tagNo: any token matching [LETTERS]-[DIGITS] or [LETTERS]-[DIGITS]-[SUFFIX]\n",
        "  Examples: FCV-101, P-10101, T-10100, VG-6203-15A-F1A-n, BT-6200, DP-10101\n",
        "- instrumentType: leading letters of tagNo\n",
        "- equipmentName: descriptive name if present near tag, else null\n",
        "- lineNumber/pidDrawingNo: null unless explicitly associated\n",
        "- confidence: 0.95 for clear tags, lower for ambiguous\n",
        "- Output ONLY the JSON array, no markdown, no explanation.\n",
        "- If no tags found, return: []\n",
    ]
    system_prompt = "".join(prompt_parts)
    user_prompt = f"Source: {source_type}\n\nText:\n{text[:100000]}"

    completion = _llm().chat.completions.create(
        model=VLLM_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=32768,
        temperature=0.1,
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )

    choice = completion.choices[0]
    answer = (choice.message.content or "").strip()
    tags = _parse_json_array(answer, choice.finish_reason)
    logging.info(f"extract_pid_tags source={source_type} count={len(tags)}")

    report = {"success": True, "count": len(tags), "tags": tags}
    return json.dumps(report, ensure_ascii=False, indent=2)


def _match_pid_tags(pid_tags: list, experion_tags: list) -> str:
    """Ask the LLM to pair P&ID tags with Experion tags.

    Args:
        pid_tags: P&ID tag strings, sent one per line.
        experion_tags: Experion tag strings, sent one per line.

    Returns:
        A JSON string with ``success``, ``count`` and ``mappings`` keys.
    """
    system_prompt = (
        "You are a P&ID to Experion tag matching expert.\n"
        "Match P&ID tags to Experion tags based on similarity.\n"
        "Return ONLY a JSON array:\n"
        '[{"pidTag":"FT-101","experionTag":"ft-101.pv","confidence":0.92},...]\n'
        "- If no good match: confidence < 0.5, experionTag null\n"
        "- Output ONLY the JSON array.\n"
    )
    client = _llm()
    pid_block = "\n".join(pid_tags)
    experion_block = "\n".join(experion_tags)
    user_prompt = f"P&ID Tags:\n{pid_block}\n\nExperion Tags:\n{experion_block}"

    completion = client.chat.completions.create(
        model=VLLM_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=16384,
        temperature=0.1,
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )

    choice = completion.choices[0]
    answer = (choice.message.content or "").strip()
    mappings = _parse_json_array(answer, choice.finish_reason)

    report = {"success": True, "count": len(mappings), "mappings": mappings}
    return json.dumps(report, ensure_ascii=False, indent=2)

# ── 도면 파싱 도구 ────────────────────────────────────────────────────────────

# System prompt shared by _parse_pid_dxf and _parse_pid_pdf. It tells the model
# to answer with a bare JSON array of tag objects (tagNo, equipmentName,
# instrumentType, lineNumber, pidDrawingNo, confidence) and with [] when no
# tags are found.
_TAG_EXTRACT_SYSTEM = (
    "You are a P&ID (Piping and Instrumentation Diagram) expert.\n"
    "Extract instrument and equipment tags from the provided text.\n"
    "Return ONLY a JSON array:\n"
    '[{"tagNo":"FIT-10115","equipmentName":"Flow Transmitter","instrumentType":"FIT",'
    '"lineNumber":"L-101","pidDrawingNo":"P&ID-001","confidence":0.95},...]\n'
    "Rules:\n"
    "- tagNo: Instrument [Function]-[Number], Equipment [Type]-[Number]\n"
    "- instrumentType: first 2-4 letters of tagNo\n"
    "- equipmentName/lineNumber/pidDrawingNo: null if not present\n"
    "- confidence: 0.0 to 1.0\n"
    "- Output ONLY the JSON array, no markdown.\n"
    "- If no tags found, return: []\n"
)


def _parse_pid_dxf(filepath: str) -> str:
    """Extract text from a DXF drawing and have the LLM list its P&ID tags.

    Args:
        filepath: path of the DXF file.

    Returns:
        A JSON string with ``success``, ``text`` (first 10000 characters of
        the extracted text), ``count`` and ``tags``. When the drawing holds no
        text the LLM is not called and an empty result is returned.
    """
    def _report(text_out, tags):
        body = {"success": True, "text": text_out, "count": len(tags), "tags": tags}
        return json.dumps(body, ensure_ascii=False, indent=2)

    drawing_text = _extract_text_from_dxf(filepath)
    if not drawing_text.strip():
        return _report("", [])

    # Only the first 12000 characters are sent to the model.
    user_prompt = f"Source: dxf\n\nText:\n{drawing_text[:12000]}"
    completion = _llm().chat.completions.create(
        model=VLLM_MODEL,
        messages=[
            {"role": "system", "content": _TAG_EXTRACT_SYSTEM},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=4096,
        temperature=0.1,
    )

    choice = completion.choices[0]
    parsed = _parse_json_array((choice.message.content or "").strip(), choice.finish_reason)
    tags = parsed if isinstance(parsed, list) else []
    return _report(drawing_text[:10000], tags)


def _parse_pid_pdf(filepath: str, use_ocr: bool = True) -> str:
    """Extract text from a PDF drawing and have the LLM list its P&ID tags.

    Args:
        filepath: path of the PDF file.
        use_ocr: when true (the default) pages are rendered and OCR'd;
            otherwise the PDF's embedded text is read directly.

    Returns:
        A JSON string with ``success``, ``text`` (first 10000 characters of
        the extracted text), ``count`` and ``tags``. When no text is found the
        LLM is not called and an empty result is returned.
    """
    def _report(text_out, tags):
        body = {"success": True, "text": text_out, "count": len(tags), "tags": tags}
        return json.dumps(body, ensure_ascii=False, indent=2)

    if use_ocr:
        page_text = _extract_text_from_pdf_ocr(filepath)
    else:
        page_text = _extract_text_from_pdf(filepath)
    if not page_text.strip():
        return _report("", [])

    # Only the first 12000 characters are sent to the model.
    user_prompt = f"Source: pdf\n\nText:\n{page_text[:12000]}"
    completion = _llm().chat.completions.create(
        model=VLLM_MODEL,
        messages=[
            {"role": "system", "content": _TAG_EXTRACT_SYSTEM},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=4096,
        temperature=0.1,
    )

    choice = completion.choices[0]
    parsed = _parse_json_array((choice.message.content or "").strip(), choice.finish_reason)
    tags = parsed if isinstance(parsed, list) else []
    return _report(page_text[:10000], tags)


def _parse_pid_drawing(filepath: str) -> str:
    """Route a drawing to the DXF or PDF parser based on its file extension.

    Args:
        filepath: path of the drawing; the extension is compared
            case-insensitively.

    Returns:
        The chosen parser's JSON string for ``.dxf`` / ``.pdf``; otherwise a
        JSON string with ``success`` false and an ``error`` message (a
        dedicated message for ``.dwg``, a generic one for anything else).
    """
    _, suffix = os.path.splitext(filepath)
    suffix = suffix.lower()

    if suffix == ".dxf":
        return _parse_pid_dxf(filepath)
    if suffix == ".pdf":
        return _parse_pid_pdf(filepath)

    if suffix == ".dwg":
        message = "DWG 파일은 직접 파싱할 수 없습니다. DXF로 변환 후 사용하세요."
    else:
        message = f"지원하지 않는 형식: {suffix}. 지원: .dxf, .pdf"
    return json.dumps({"success": False, "error": message}, ensure_ascii=False)

# ── 그래프 도구 ───────────────────────────────────────────────────────────────

def _load_system_tags() -> list:
    """Read every ``tagname`` from ``realtime_table``; return ``[]`` on failure.

    The lookup is best effort: any error is logged as a warning and an empty
    list is returned. The connection is always closed.
    """
    try:
        conn = _get_db_connection()
        try:
            with conn.cursor() as cur:
                cur.execute("SELECT tagname FROM realtime_table")
                return [row[0] for row in cur.fetchall()]
        finally:
            conn.close()
    except Exception as e:
        logging.warning(f"시스템 태그 조회 실패: {e}")
        return []


async def _build_pid_graph_parallel(filepath: str) -> str:
    """Build and store the topology graph of a drawing, mapping tags via the LLM.

    Phases:
      1. Extract geometry to ``<basename>_geo.json`` in ``STORAGE_DIR`` and
         load it back.
      2. Build a first graph from the geometry for the mapper to work on.
      3. Run the transmitter, valve and equipment mappings concurrently.
      4. Rebuild the graph with the resolved tags and save it as
         ``<stem>_graph.json`` in ``STORAGE_DIR``.

    Blocking steps (extraction, DB lookup, graph builds, saving) run via
    ``asyncio.to_thread`` so the event loop stays responsive meanwhile.

    Args:
        filepath: path of the drawing to process.

    Returns:
        A JSON string with ``success``, ``graph_id``, ``graph_path``,
        ``nodes`` and ``edges``.
    """
    from pipeline.extractor import PidGeometricExtractor
    from pipeline.topology import PidTopologyBuilder
    from pipeline.mapper import IntelligentMapper
    from openai import AsyncOpenAI

    os.makedirs(STORAGE_DIR, exist_ok=True)
    source_name = os.path.basename(filepath)

    # Phase 1: geometry extraction
    extractor = PidGeometricExtractor(filepath)
    geo_data_path = os.path.join(STORAGE_DIR, source_name + "_geo.json")
    await asyncio.to_thread(extractor.extract_and_save, geo_data_path)
    with open(geo_data_path, "r", encoding="utf-8") as f:
        geo_data = json.load(f)

    # System tag lookup (best effort; empty list when the DB is unavailable)
    system_tags: list[str] = await asyncio.to_thread(_load_system_tags)

    # Phase 2: first topology pass (graph consumed by the mapper)
    builder = PidTopologyBuilder(geo_data)
    await asyncio.to_thread(builder.build_graph)

    # Phase 3: concurrent LLM mapping
    api_client = AsyncOpenAI(base_url=VLLM_BASE_URL, api_key="dummy")
    mapper = IntelligentMapper(builder.G, system_tags, api_client=api_client)

    def _value_of(attrs) -> str:
        # Tolerate a missing or None "value" attribute instead of crashing.
        return str(attrs.get("value") or "").upper()

    transmitter_nodes = [
        n for n, d in builder.G.nodes(data=True)
        if _value_of(d) in {"FIT", "FT", "LT", "PT", "TE"}
    ]
    valve_nodes = [
        n for n, d in builder.G.nodes(data=True)
        if _value_of(d) in {"FCV", "LCV", "TCV", "PCV", "XV"}
    ]
    equipment_nodes = [
        n for n, d in builder.G.nodes(data=True)
        if d.get("type") not in {"TEXT", "LINE", "LWPOLYLINE"}
    ]

    extracted_results = await asyncio.gather(
        mapper.extract_transmitters(transmitter_nodes),
        mapper.extract_valves(valve_nodes),
        mapper.extract_equipment(equipment_nodes),
    )

    # Merge the mapping results, dropping nodes the mapper could not resolve
    all_mapped_tags = []
    for res_dict in extracted_results:
        for node_id, mapping in res_dict.items():
            if mapping.resolved_tag == "UNKNOWN":
                continue
            bbox = builder.G.nodes[node_id]["bbox"]
            all_mapped_tags.append({
                "entity_id": node_id,
                "tagName": mapping.resolved_tag,
                # Use .bounds when the bbox object offers it, else the raw value.
                "bbox": bbox.bounds if hasattr(bbox, "bounds") else bbox,
                "clean_value": mapping.resolved_tag,
            })

    # Phase 4: final topology model + persistence
    final_builder = PidTopologyBuilder(geo_data, all_extracted_tags=all_mapped_tags)
    await asyncio.to_thread(final_builder.build_graph)

    # Derive the id from the file stem so any extension (".DXF", ".pdf", ...)
    # yields "<stem>_graph.json" rather than the unchanged source file name.
    graph_id = os.path.splitext(source_name)[0] + "_graph.json"
    graph_path = os.path.join(STORAGE_DIR, graph_id)
    await asyncio.to_thread(final_builder.save_graph, graph_path)

    node_count = final_builder.G.number_of_nodes()
    edge_count = final_builder.G.number_of_edges()
    logging.info(f"build_pid_graph_parallel graph_id={graph_id} "
                 f"nodes={node_count} "
                 f"edges={edge_count}")
    return json.dumps({
        "success": True,
        "graph_id": graph_id,
        "graph_path": graph_path,
        "nodes": node_count,
        "edges": edge_count,
    }, ensure_ascii=False)


def _analyze_pid_impact(graph_id: str, start_node_id: str) -> str:
    """Run an impact analysis on a stored graph, starting from one node.

    Args:
        graph_id: file name of a graph stored in ``STORAGE_DIR``. Any
            directory part is discarded, so the lookup cannot leave
            ``STORAGE_DIR``; a full path to a file in that directory still
            resolves to the same file.
        start_node_id: node the analysis starts from.

    Returns:
        The analysis result as a JSON string.

    Raises:
        ValueError: if *graph_id* does not name a file.
    """
    from pipeline.analyzer import PidAnalysisEngine

    # graph_id arrives from the HTTP request body; keep only the file name so
    # values such as "../../x" cannot escape the storage directory.
    graph_name = os.path.basename(graph_id)
    if graph_name in ("", ".", ".."):
        raise ValueError(f"invalid graph_id: {graph_id!r}")

    graph_path = os.path.join(STORAGE_DIR, graph_name)
    # Substitute in the file name only, never in the directory part.
    mapping_name = graph_name.replace("_graph.json", "_mapping.json")
    mapping_path = os.path.join(STORAGE_DIR, mapping_name)

    analyzer = PidAnalysisEngine(graph_path, mapping_path)
    result = analyzer.analyze_impact(start_node_id)
    return json.dumps(result, ensure_ascii=False, indent=2)

# ── 요청 디스패처 ─────────────────────────────────────────────────────────────

async def _dispatch(tool: str, params: dict) -> str:
    """Run the tool named *tool* with keyword arguments *params*.

    Synchronous tools are run through ``asyncio.to_thread``; the one coroutine
    tool is awaited directly. Any exception is logged and turned into a JSON
    error string, so this function always returns a JSON string.

    Args:
        tool: name of the tool to run.
        params: keyword arguments forwarded to the tool.
    """
    try:
        # Already a coroutine function: await it directly.
        if tool == "build_pid_graph_parallel":
            return await _build_pid_graph_parallel(**params)

        # Everything else blocks, so it is handed to a worker thread.
        if tool == "extract_pid_tags":
            blocking_fn = _extract_pid_tags
        elif tool == "match_pid_tags":
            blocking_fn = _match_pid_tags
        elif tool == "parse_pid_dxf":
            blocking_fn = _parse_pid_dxf
        elif tool == "parse_pid_pdf":
            blocking_fn = _parse_pid_pdf
        elif tool == "parse_pid_drawing":
            blocking_fn = _parse_pid_drawing
        elif tool == "analyze_pid_impact":
            blocking_fn = _analyze_pid_impact
        else:
            unknown = {"success": False, "error": f"알 수 없는 도구: {tool}"}
            return json.dumps(unknown, ensure_ascii=False)

        return await asyncio.to_thread(blocking_fn, **params)
    except Exception as exc:
        logging.error(f"dispatch error tool={tool}: {exc}", exc_info=True)
        failure = {"success": False, "error": str(exc)}
        return json.dumps(failure, ensure_ascii=False)

# ── 종료 예약 ─────────────────────────────────────────────────────────────────

# Strong references to pending shutdown tasks. The event loop keeps only weak
# references to tasks, so an otherwise unreferenced task can be
# garbage-collected before it runs.
_SHUTDOWN_TASKS: set = set()


def _schedule_shutdown():
    """Schedule this process to receive SIGTERM about 0.5 seconds from now.

    Must be called while an event loop is running. The delay is meant to give
    the in-flight HTTP response time to be sent before the process exits.
    """
    async def _terminate_later():
        await asyncio.sleep(0.5)
        os.kill(os.getpid(), signal.SIGTERM)

    task = asyncio.create_task(_terminate_later())
    _SHUTDOWN_TASKS.add(task)
    task.add_done_callback(_SHUTDOWN_TASKS.discard)

# ── HTTP 엔드포인트 ───────────────────────────────────────────────────────────

@app.get("/health")
async def health():
    """Liveness probe: always answers with ``{"status": "ok"}``."""
    payload = {"status": "ok"}
    return payload


@app.post("/execute")
async def execute(request: Request):
    """Run one tool call described by the JSON request body.

    The body must be a JSON object with a ``tool`` name and, optionally, a
    ``params`` object of keyword arguments (treated as empty when missing).

    Returns:
        The JSON string produced by ``_dispatch``, or a JSON error string in
        the same shape when the body is malformed.
    """
    try:
        body = await request.json()
    except ValueError:
        # Invalid JSON: answer in the same error shape tool failures use
        # instead of letting the request end in an unhandled exception.
        body = None
    if not isinstance(body, dict) or "tool" not in body:
        return json.dumps(
            {"success": False, "error": "요청 본문은 'tool' 필드를 포함하는 JSON 객체여야 합니다."},
            ensure_ascii=False,
        )
    return await _dispatch(body["tool"], body.get("params") or {})


@app.post("/execute/one_shot")
async def execute_one_shot(request: Request):
    """Run one tool call, then schedule this worker process to terminate.

    The body must be a JSON object with a ``tool`` name and, optionally, a
    ``params`` object of keyword arguments (treated as empty when missing).
    Shutdown is scheduled only after the tool has been dispatched; a malformed
    body gets a JSON error string and leaves the worker running.

    Returns:
        The JSON string produced by ``_dispatch``, or a JSON error string in
        the same shape when the body is malformed.
    """
    try:
        body = await request.json()
    except ValueError:
        # Invalid JSON: answer in the same error shape tool failures use
        # instead of letting the request end in an unhandled exception.
        body = None
    if not isinstance(body, dict) or "tool" not in body:
        return json.dumps(
            {"success": False, "error": "요청 본문은 'tool' 필드를 포함하는 JSON 객체여야 합니다."},
            ensure_ascii=False,
        )
    result = await _dispatch(body["tool"], body.get("params") or {})
    _schedule_shutdown()
    return result

# ── 진입점 ───────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # The optional first CLI argument is the listen port; 5004 when omitted.
    cli_args = sys.argv[1:]
    port = int(cli_args[0]) if cli_args else 5004
    # Make sure the output directory exists before serving requests.
    os.makedirs(STORAGE_DIR, exist_ok=True)
    # NOTE(review): binds to all interfaces — confirm this worker is only
    # reachable from trusted hosts.
    uvicorn.run(app, host="0.0.0.0", port=port, log_level="warning")


❯ 이거 패러렐 프로세싱으로 만든거 맞지?

● 솔직히 말하면 절반만 맞습니다.

  현재 상태

  병렬인 것

  build_pid_graph_parallel 내부 — asyncio.gather로 LLM 3개 동시 호출:
  extracted_results = await asyncio.gather(
      mapper.extract_transmitters(transmitter_nodes),   # LLM 호출 1
      mapper.extract_valves(valve_nodes),               # LLM 호출 2
      mapper.extract_equipment(equipment_nodes),        # LLM 호출 3
  )
  이건 server.py에서 그대로 가져온 것이고, 진짜 병렬입니다.

  병렬이 아닌 것

  나머지 도구들 — _parse_pid_dxf, _parse_pid_pdf, _extract_pid_tags 등:
  # 전부 동기 함수 → uvicorn 이벤트루프를 블로킹함
  def _parse_pid_dxf(filepath: str) -> str:
      text = _extract_text_from_dxf(filepath)  # ezdxf: blocking I/O
      resp = _llm().chat.completions.create(...) # vLLM HTTP: blocking
      ...
  async def execute(request)에서 이 함수를 직접 호출하면 이벤트루프 전체가 멈춥니다.

  수정 방향

  asyncio.to_thread()로 동기 함수를 스레드풀에 오프로드:

  async def _dispatch(tool: str, params: dict) -> str:
      match tool:
          case "parse_pid_dxf":
              # blocking → 스레드풀로 분리
              return await asyncio.to_thread(_parse_pid_dxf, **params)
          case "parse_pid_pdf":
              return await asyncio.to_thread(_parse_pid_pdf, **params)
          case "extract_pid_tags":
              return await asyncio.to_thread(_extract_pid_tags, **params)
          case "match_pid_tags":
              return await asyncio.to_thread(_match_pid_tags, **params)
          case "analyze_pid_impact":
              return await asyncio.to_thread(_analyze_pid_impact, **params)
          case "build_pid_graph_parallel":
              return await _build_pid_graph_parallel(**params)  # 이미 async
          ...

  단, 이 워커는 계획서에서 세마포어로 동시 요청 1개만 허용하기로 했으므로 실질적 영향은 작습니다. 그래도 이벤트루프
  블로킹은 uvicorn의 헬스체크·종료 신호 처리까지 막기 때문에 수정하는 게 맞습니다.

  수정해드릴까요?

✻ Cooked for 31s

❯ 그래

  Read 1 file (ctrl+o to expand)

● _dispatch 함수만 수정하면 됩니다.

● Update(worker/pid_worker.py)

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 Edit file
 worker/pid_worker.py
╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌
 402  async def _dispatch(tool: str, params: dict) -> str:
 403      try:
 404          match tool:
 405 +            # blocking 함수는 asyncio.to_thread로 스레드풀 오프로드
 406              case "extract_pid_tags":
 406 -                return _extract_pid_tags(**params)
 407 +                return await asyncio.to_thread(_extract_pid_tags, **params)
              return await asyncio.to_thread(_match_pid_tags, **params)
          case "analyze_pid_impact":
              return await asyncio.to_thread(_analyze_pid_impact, **params)
          case "build_pid_graph_parallel":
              return await _build_pid_graph_parallel(**params)  # 이미 async
          ...

  단, 이 워커는 계획서에서 세마포어로 동시 요청 1개만 허용하기로 했으므로 실질적 영향은 작습니다. 그래도 이벤트루프
  블로킹은 uvicorn의 헬스체크·종료 신호 처리까지 막기 때문에 수정하는 게 맞습니다.

  수정해드릴까요?

✻ Cooked for 31s

❯ 그래

  Read 1 file (ctrl+o to expand)

● _dispatch 함수만 수정하면 됩니다.

● Update(worker/pid_worker.py)
  ⎿  Added 9 lines, removed 7 lines
      402  async def _dispatch(tool: str, params: dict) -> str:
      403      try:
      404          match tool:
      405 +            # blocking 함수는 asyncio.to_thread로 스레드풀 오프로드
      406              case "extract_pid_tags":
      406 -                return _extract_pid_tags(**params)
      407 +                return await asyncio.to_thread(_extract_pid_tags, **params)
      408              case "match_pid_tags":
      408 -                return _match_pid_tags(**params)
      409 +                return await asyncio.to_thread(_match_pid_tags, **params)
      410              case "parse_pid_dxf":
      410 -                return _parse_pid_dxf(**params)
      411 +                return await asyncio.to_thread(_parse_pid_dxf, **params)
      412              case "parse_pid_pdf":
      412 -                return _parse_pid_pdf(**params)
      413 +                return await asyncio.to_thread(_parse_pid_pdf, **params)
      414              case "parse_pid_drawing":
      414 -                return _parse_pid_drawing(**params)
      415 +                return await asyncio.to_thread(_parse_pid_drawing, **params)
      416 +            case "analyze_pid_impact":
      417 +                return await asyncio.to_thread(_analyze_pid_impact, **params)
      418 +            # 이미 async — 직접 await
      419              case "build_pid_graph_parallel":
      420                  return await _build_pid_graph_parallel(**params)
      417 -            case "analyze_pid_impact":
      418 -                return _analyze_pid_impact(**params)
      421              case _:
      422                  return json.dumps({"success": False, "error": f"알 수 없는 도구: {tool}"},
      423                                    ensure_ascii=False)

● Bash(.venv/bin/python -m py_compile worker/pid_worker.py && echo "OK")
  ⎿  OK

● 완료. 변경된 _dispatch:

  # blocking 함수는 asyncio.to_thread로 스레드풀 오프로드
  case "extract_pid_tags":
      return await asyncio.to_thread(_extract_pid_tags, **params)
  case "parse_pid_dxf":
      return await asyncio.to_thread(_parse_pid_dxf, **params)
  ...

  # 이미 async — 직접 await
  case "build_pid_graph_parallel":
      return await _build_pid_graph_parallel(**params)

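  이제 ezdxf 파싱, PaddleOCR, vLLM HTTP 호출 같은 blocking 작업이 스레드풀에서 실행되어 uvicorn 이벤트루프를 막지
  않습니다. 헬스체크(/health)와 종료 신호(SIGTERM) 처리가 요청 진행 중에도 정상 작동합니다.
  단, 이는 _dispatch가 asyncio.to_thread로 넘기는 동기 도구에 한정됩니다. build_pid_graph_parallel은 코루틴 안에서
  직접 실행되므로, 그 내부에 동기 작업(기하 추출, DB 조회, 그래프 빌드 등)이 남아 있으면 해당 구간에서는 이벤트루프가
  여전히 블로킹될 수 있습니다.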