opencode로 바꾸고 작업 전 커밋

2026-05-08 17:22:10 +09:00
parent 15c17522c8
commit e923aab43b
202 changed files with 1336027 additions and 115 deletions
--- a/mcp-server/worker/pid_worker.py
+++ b/mcp-server/worker/pid_worker.py
@@ -173,7 +173,7 @@ def _extract_pid_tags(text: str, source_type: str) -> str:
    )
    truncated = text[:100000]
    resp = _llm().chat.completions.create(
-        model=VLLM_MODEL,
+        model="Qwen3.6-27B-FP8",
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": f"Source: {source_type}\n\nText:\n{truncated}"},
@@ -202,7 +202,7 @@ def _match_pid_tags(pid_tags: list, experion_tags: list) -> str:
        "- Output ONLY the JSON array.\n"
    )
    resp = _llm().chat.completions.create(
-        model=VLLM_MODEL,
+        model="Qwen3.6-27B-FP8",
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": (
@@ -247,7 +247,7 @@ def _parse_pid_dxf(filepath: str) -> str:
                          ensure_ascii=False, indent=2)

    resp = _llm().chat.completions.create(
-        model=VLLM_MODEL,
+        model="Qwen3.6-27B-FP8",
        messages=[
            {"role": "system", "content": _TAG_EXTRACT_SYSTEM},
            {"role": "user", "content": f"Source: dxf\n\nText:\n{text[:8000]}"},
@@ -273,7 +273,7 @@ def _parse_pid_pdf(filepath: str, use_ocr: bool = True) -> str:
                          ensure_ascii=False, indent=2)

    resp = _llm().chat.completions.create(
-        model=VLLM_MODEL,
+        model="Qwen3.6-27B-FP8",
        messages=[
            {"role": "system", "content": _TAG_EXTRACT_SYSTEM},
            {"role": "user", "content": f"Source: pdf\n\nText:\n{text[:12000]}"},
@@ -313,92 +313,211 @@ def _parse_pid_drawing(filepath: str) -> str:

 # ── 그래프 도구 ───────────────────────────────────────────────────────────────

+import time
+
 async def _build_pid_graph_parallel(filepath: str) -> str:
+    """
+    Build a P&ID graph using independent extractor processes run in parallel.
+
+    Phase 1: split the drawing + extract geometry
+    Phase 2: extract the full text once
+    Phase 3: run 5 independent extractor processes in parallel (subprocess.Popen)
+    Phase 4: merge results + de-duplicate by tagNo
+    Phase 5: build the topology graph + save; returns a JSON summary string
+    """
    from pipeline.extractor import PidGeometricExtractor
    from pipeline.topology import PidTopologyBuilder
-    from pipeline.mapper import IntelligentMapper
-    from openai import AsyncOpenAI
+
+    import subprocess
+    import tempfile
+    import shutil

    os.makedirs(STORAGE_DIR, exist_ok=True)
+    t0 = time.time()
+    basename = os.path.basename(filepath)
+    worker_dir = os.path.dirname(os.path.abspath(__file__))
+    logging.info(f"[{basename}] === 독립 프로세스 병렬 아키텍처 시작 ===")

-    # Phase 1: 기하 추출
+    # ── Phase 1: split drawing + extract geometry ───────────────────
+    logging.info(f"[{basename}] Phase 1: 도면 분할 + 기하 추출 시작")
    extractor = PidGeometricExtractor(filepath)
-    geo_data_path = os.path.join(STORAGE_DIR, os.path.basename(filepath) + "_geo.json")
+    regions = extractor.split_drawings()  # NOTE(review): only used for the region-count log below
+    logging.info(f"[{basename}] 도면 분할: {len(regions)}개 영역")
+
+    geo_data_path = os.path.join(STORAGE_DIR, basename + "_geo.json")
    extractor.extract_and_save(geo_data_path)
    with open(geo_data_path, "r", encoding="utf-8") as f:
        geo_data = json.load(f)
+    logging.info(f"[{basename}] Phase 1 완료 ({time.time()-t0:.1f}s) - {len(geo_data)}개 엔티티")

-    # 시스템 태그 조회
-    system_tags: list[str] = []
-    try:
-        conn = _get_db_connection()
-        with conn.cursor() as cur:
-            cur.execute("SELECT tagname FROM realtime_table")
-            system_tags = [r[0] for r in cur.fetchall()]
-    except Exception as e:
-        logging.warning(f"시스템 태그 조회 실패: {e}")
+    # ── Phase 2: extract the full text once ─────────────────────────
+    t2 = time.time()
+    logging.info(f"[{basename}] Phase 2: 전체 텍스트 1회 추출")
+    full_text = _extract_text_from_dxf(filepath)
+
+    # Create a temp directory (input text + per-extractor result files are exchanged through it)
+    temp_dir = tempfile.mkdtemp(prefix=f"pid_{basename.replace('.dxf', '')}_")
+    text_path = os.path.join(temp_dir, "full_text.txt")
+    with open(text_path, "w", encoding="utf-8") as f:
+        f.write(full_text)
+    logging.info(f"[{basename}] Phase 2 완료 ({time.time()-t2:.1f}s) - {len(full_text)}자")
+
+    # ── Phase 3: run 5 independent extractor processes in parallel ──
+    t3 = time.time()
+    logging.info(f"[{basename}] Phase 3: 5개 독립 추출 프로세스 병렬 실행")
+
+    extractors = [
+        ("sensor", "pid_extract_sensor.py"),
+        ("valve",  "pid_extract_valve.py"),
+        ("system", "pid_extract_system.py"),
+        ("gauge",  "pid_extract_gauge.py"),
+        ("pump",   "pid_extract_pump.py"),
+    ]
+
+    results_dir = os.path.join(temp_dir, "results")
+    os.makedirs(results_dir, exist_ok=True)
+
+    processes = []
+    for name, script in extractors:
+        output_path = os.path.join(results_dir, f"{name}.json")
+        script_path = os.path.join(worker_dir, script)
+        cmd = [
+            sys.executable, script_path,
+            "--input", text_path,
+            "--output", output_path,
+        ]
+        try:
+            proc = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+            )
+            processes.append((name, proc, output_path))
+            logging.info(f"[{basename}]   시작: {name} (pid={proc.pid})")
+        except Exception as e:
+            logging.error(f"[{basename}]   실행 실패 {name}: {e}")
+
+    # Wait for each process in launch order (timeout=300 s per process). NOTE(review): communicate() is a blocking call inside an async def
+    for name, proc, output_path in processes:
+        try:
+            stdout, stderr = proc.communicate(timeout=300)
+            if proc.returncode != 0:
+                logging.error(f"[{basename}]   {name} 실패 (rc={proc.returncode}): {stderr[:200]}")
+            else:
+                logging.info(f"[{basename}]   {name} 완료: {stdout.strip()[:100]}")
+        except subprocess.TimeoutExpired:
+            proc.kill()  # NOTE(review): no wait()/communicate() follows the kill, so the child is not reaped here
+            logging.error(f"[{basename}]   {name} 타임아웃 (300초)")
+
+    logging.info(f"[{basename}] Phase 3 완료 ({time.time()-t3:.1f}s)")
+
+    # ── Phase 4: merge result JSON files + de-duplicate by tagNo ────
+    t4 = time.time()
+    logging.info(f"[{basename}] Phase 4: 결과 통합 + 중복 제거")
+
+    all_tags = []
+    seen_tagnos = set()
+
+    for name, _, output_path in processes:
+        if not os.path.exists(output_path):
+            logging.warning(f"[{basename}]   결과 없음: {name}")
+            continue
+        try:
+            with open(output_path, "r", encoding="utf-8") as f:
+                result = json.load(f)
+            tags = result.get("tags", [])
+            count_new = 0
+            for tag in tags:
+                tag_no = tag.get("tagNo", "")
+                if tag_no and tag_no not in seen_tagnos:
+                    seen_tagnos.add(tag_no)
+                    all_tags.append(tag)
+                    count_new += 1
+            logging.info(f"[{basename}]   {name}: {len(tags)}개 중 {count_new}개 신규")
+        except Exception as e:
+            logging.error(f"[{basename}]   결과 로드 실패 {name}: {e}")
+
+    logging.info(f"[{basename}] Phase 4 완료 ({time.time()-t4:.1f}s) - 총 {len(all_tags)}개 태그")
+
+    # ── Phase 5: build the topology graph + save ────────────────────
+    t5 = time.time()
+    logging.info(f"[{basename}] Phase 5: 위상 그래프 빌드")

-    # Phase 2: 1차 위상 빌더 (Mapper용 그래프)
    builder = PidTopologyBuilder(geo_data)
    builder.build_graph()

-    # Phase 3: 병렬 LLM 매핑
-    api_client = AsyncOpenAI(base_url=VLLM_BASE_URL, api_key="dummy")
-    mapper = IntelligentMapper(builder.G, system_tags, api_client=api_client)
+    # Add the extracted tags to the graph as "tag_<tagNo>" nodes
+    from shapely.geometry import box as shapely_box
+    for tag in all_tags:
+        tag_no = tag.get("tagNo", "UNKNOWN")
+        eq_name = tag.get("equipmentName")
+        inst_type = tag.get("instrumentType")
+        confidence = tag.get("confidence", 0.5)

-    transmitter_nodes = [
-        n for n, d in builder.G.nodes(data=True)
-        if (d.get("value") or "").upper() in {"FIT", "FT", "LT", "PT", "TE"}
-    ]
-    valve_nodes = [
-        n for n, d in builder.G.nodes(data=True)
-        if (d.get("value") or "").upper() in {"FCV", "LCV", "TCV", "PCV", "XV"}
-    ]
-    equipment_nodes = [
-        n for n, d in builder.G.nodes(data=True)
-        if d.get("type") not in {"TEXT", "LINE", "LWPOLYLINE"}
-    ]
+        # Look up this tag's bbox: case-insensitive clean_value match in geo_data, first hit wins
+        matched_bbox = None
+        for entity in geo_data:
+            if entity.get("clean_value", "").upper() == tag_no.upper():
+                bbox = entity.get("bbox", {})
+                matched_bbox = (
+                    bbox.get("min_x"), bbox.get("min_y"),
+                    bbox.get("max_x"), bbox.get("max_y")
+                )
+                break

-    extracted_results = await asyncio.gather(
-        mapper.extract_transmitters(transmitter_nodes),
-        mapper.extract_valves(valve_nodes),
-        mapper.extract_equipment(equipment_nodes),
-    )
+        node_id = f"tag_{tag_no}"
+        if matched_bbox:
+            bbox_geom = shapely_box(matched_bbox[0], matched_bbox[1],
+                                     matched_bbox[2], matched_bbox[3])
+            builder.G.add_node(node_id,
+                               type="TEXT",
+                               bbox=bbox_geom,
+                               value=tag_no,
+                               equipment_name=eq_name,
+                               instrument_type=inst_type,
+                               confidence=confidence)
+        else:
+            builder.G.add_node(node_id,
+                               type="TEXT",
+                               bbox=shapely_box(0, 0, 1, 1),  # placeholder bbox: no geo_data entity matched this tag
+                               value=tag_no,
+                               equipment_name=eq_name,
+                               instrument_type=inst_type,
+                               confidence=confidence)

-    # 매핑 결과 통합
-    all_mapped_tags = []
-    for res_dict in extracted_results:
-        for node_id, mapping in res_dict.items():
-            if mapping.resolved_tag != "UNKNOWN":
-                node_data = builder.G.nodes[node_id]
-                all_mapped_tags.append({
-                    "entity_id": node_id,
-                    "tagName": mapping.resolved_tag,
-                    "bbox": (
-                        node_data["bbox"].bounds
-                        if hasattr(node_data["bbox"], "bounds")
-                        else node_data["bbox"]
-                    ),
-                    "clean_value": mapping.resolved_tag,
-                })
+    # Link tags to equipment (equipment = nodes whose type is not TEXT/LINE/LWPOLYLINE)
+    equipments = [n for n, d in builder.G.nodes(data=True)
+                  if d.get("type") not in ("TEXT", "LINE", "LWPOLYLINE")]
+    if equipments:
+        eq_grid = builder._build_spatial_grid(equipments)
+        tag_ids = [f"tag_{t.get('tagNo', '')}" for t in all_tags]
+        for tag_id in tag_ids:
+            if tag_id in builder.G:
+                best_match = builder._find_nearest_equipment(tag_id, eq_grid)
+                if best_match:
+                    builder.G.add_edge(tag_id, best_match, relation="associated_with")

-    # Phase 4: 최종 위상 모델링 + 저장
-    final_builder = PidTopologyBuilder(geo_data, all_extracted_tags=all_mapped_tags)
-    final_builder.build_graph()
-
-    graph_id = os.path.basename(filepath).replace(".dxf", "_graph.json")
+    graph_id = basename.replace(".dxf", "_graph.json")
    graph_path = os.path.join(STORAGE_DIR, graph_id)
-    final_builder.save_graph(graph_path)
+    builder.save_graph(graph_path)

-    logging.info(f"build_pid_graph_parallel graph_id={graph_id} "
-                 f"nodes={final_builder.G.number_of_nodes()} "
-                 f"edges={final_builder.G.number_of_edges()}")
+    # Clean up the temp directory. NOTE(review): not reached if any step above raises — consider try/finally
+    shutil.rmtree(temp_dir, ignore_errors=True)
+
+    total_time = time.time() - t0
+    logging.info(f"[{basename}] 전체 완료 ({total_time:.1f}s) - graph_id={graph_id} "
+                 f"nodes={builder.G.number_of_nodes()} "
+                 f"edges={builder.G.number_of_edges()} "
+                 f"tags={len(all_tags)}")
    return json.dumps({
        "success": True,
        "graph_id": graph_id,
        "graph_path": graph_path,
-        "nodes": final_builder.G.number_of_nodes(),
-        "edges": final_builder.G.number_of_edges(),
+        "nodes": builder.G.number_of_nodes(),
+        "edges": builder.G.number_of_edges(),
+        "tags_extracted": len(all_tags),
+        "processing_time_sec": round(total_time, 1)
    }, ensure_ascii=False)