opencode 로 바꾸고 작업전 커밋

This commit is contained in:
windpacer
2026-05-08 17:22:10 +09:00
parent 15c17522c8
commit e923aab43b
202 changed files with 1336027 additions and 115 deletions

View File

@@ -173,7 +173,7 @@ def _extract_pid_tags(text: str, source_type: str) -> str:
)
truncated = text[:100000]
resp = _llm().chat.completions.create(
model=VLLM_MODEL,
model="Qwen3.6-27B-FP8",
messages=[
{"role": "system", "content": system},
{"role": "user", "content": f"Source: {source_type}\n\nText:\n{truncated}"},
@@ -202,7 +202,7 @@ def _match_pid_tags(pid_tags: list, experion_tags: list) -> str:
"- Output ONLY the JSON array.\n"
)
resp = _llm().chat.completions.create(
model=VLLM_MODEL,
model="Qwen3.6-27B-FP8",
messages=[
{"role": "system", "content": system},
{"role": "user", "content": (
@@ -247,7 +247,7 @@ def _parse_pid_dxf(filepath: str) -> str:
ensure_ascii=False, indent=2)
resp = _llm().chat.completions.create(
model=VLLM_MODEL,
model="Qwen3.6-27B-FP8",
messages=[
{"role": "system", "content": _TAG_EXTRACT_SYSTEM},
{"role": "user", "content": f"Source: dxf\n\nText:\n{text[:8000]}"},
@@ -273,7 +273,7 @@ def _parse_pid_pdf(filepath: str, use_ocr: bool = True) -> str:
ensure_ascii=False, indent=2)
resp = _llm().chat.completions.create(
model=VLLM_MODEL,
model="Qwen3.6-27B-FP8",
messages=[
{"role": "system", "content": _TAG_EXTRACT_SYSTEM},
{"role": "user", "content": f"Source: pdf\n\nText:\n{text[:12000]}"},
@@ -313,92 +313,211 @@ def _parse_pid_drawing(filepath: str) -> str:
# ── 그래프 도구 ───────────────────────────────────────────────────────────────
import time
async def _build_pid_graph_parallel(filepath: str) -> str:
    """Build and save a P&ID graph using independent extractor subprocesses.

    Phases:
        1. Split the drawing and extract geometry to ``<basename>_geo.json``.
        2. Extract the full drawing text once and write it to a temp file.
        3. Run the five tag-extractor scripts as parallel subprocesses.
        4. Merge their JSON results, de-duplicating on ``tagNo``.
        5. Build the topology graph, add the extracted tags as nodes, link
           each tag to its nearest equipment node, and save the graph.

    Args:
        filepath: Path of the drawing file to process.

    Returns:
        A JSON string with the keys ``success``, ``graph_id``, ``graph_path``,
        ``nodes``, ``edges``, ``tags_extracted`` and ``processing_time_sec``.
    """
    from pipeline.extractor import PidGeometricExtractor
    from pipeline.topology import PidTopologyBuilder
    import asyncio
    import shutil
    import tempfile

    os.makedirs(STORAGE_DIR, exist_ok=True)
    t0 = time.time()
    basename = os.path.basename(filepath)
    worker_dir = os.path.dirname(os.path.abspath(__file__))
    logging.info(f"[{basename}] === 독립 프로세스 병렬 아키텍처 시작 ===")

    # ── Phase 1: split the drawing + extract geometry ───────────────
    logging.info(f"[{basename}] Phase 1: 도면 분할 + 기하 추출 시작")
    extractor = PidGeometricExtractor(filepath)
    regions = extractor.split_drawings()
    logging.info(f"[{basename}] 도면 분할: {len(regions)}개 영역")
    geo_data_path = os.path.join(STORAGE_DIR, basename + "_geo.json")
    extractor.extract_and_save(geo_data_path)
    with open(geo_data_path, "r", encoding="utf-8") as f:
        geo_data = json.load(f)
    logging.info(f"[{basename}] Phase 1 완료 ({time.time()-t0:.1f}s) - {len(geo_data)}개 엔티티")

    # ── Phase 2: extract the full text once ─────────────────────────
    t2 = time.time()
    logging.info(f"[{basename}] Phase 2: 전체 텍스트 1회 추출")
    full_text = _extract_text_from_dxf(filepath)

    # Scratch directory used to exchange files with the subprocesses.
    temp_dir = tempfile.mkdtemp(prefix=f"pid_{basename.replace('.dxf', '')}_")
    try:
        text_path = os.path.join(temp_dir, "full_text.txt")
        with open(text_path, "w", encoding="utf-8") as f:
            f.write(full_text)
        logging.info(f"[{basename}] Phase 2 완료 ({time.time()-t2:.1f}s) - {len(full_text)}")

        # ── Phase 3: run the extractor subprocesses in parallel ─────
        t3 = time.time()
        logging.info(f"[{basename}] Phase 3: 5개 독립 추출 프로세스 병렬 실행")
        results_dir = os.path.join(temp_dir, "results")
        os.makedirs(results_dir, exist_ok=True)
        processes = _launch_pid_extractors(basename, worker_dir, text_path, results_dir)
        # Waiting on the children blocks for up to 300 s each, so do it in a
        # worker thread instead of stalling the event loop.
        await asyncio.get_running_loop().run_in_executor(
            None, _wait_for_pid_extractors, basename, processes
        )
        logging.info(f"[{basename}] Phase 3 완료 ({time.time()-t3:.1f}s)")

        # ── Phase 4: merge result JSON + de-duplicate on tagNo ──────
        t4 = time.time()
        logging.info(f"[{basename}] Phase 4: 결과 통합 + 중복 제거")
        all_tags = _merge_pid_extractor_results(basename, processes)
        logging.info(f"[{basename}] Phase 4 완료 ({time.time()-t4:.1f}s) - 총 {len(all_tags)}개 태그")
    finally:
        # Results are already in memory; always drop the scratch directory,
        # including when a phase above raised.
        shutil.rmtree(temp_dir, ignore_errors=True)

    # ── Phase 5: build the topology graph + save ────────────────────
    logging.info(f"[{basename}] Phase 5: 위상 그래프 빌드")
    builder = PidTopologyBuilder(geo_data)
    builder.build_graph()
    _attach_pid_tags_to_graph(builder, geo_data, all_tags)

    graph_id = basename.replace(".dxf", "_graph.json")
    graph_path = os.path.join(STORAGE_DIR, graph_id)
    builder.save_graph(graph_path)

    total_time = time.time() - t0
    logging.info(f"[{basename}] 전체 완료 ({total_time:.1f}s) - graph_id={graph_id} "
                 f"nodes={builder.G.number_of_nodes()} "
                 f"edges={builder.G.number_of_edges()} "
                 f"tags={len(all_tags)}")
    return json.dumps({
        "success": True,
        "graph_id": graph_id,
        "graph_path": graph_path,
        "nodes": builder.G.number_of_nodes(),
        "edges": builder.G.number_of_edges(),
        "tags_extracted": len(all_tags),
        "processing_time_sec": round(total_time, 1)
    }, ensure_ascii=False)


def _launch_pid_extractors(basename, worker_dir, text_path, results_dir):
    """Start one extractor subprocess per tag category.

    Returns a list of ``(name, process, output_path)`` tuples for the
    subprocesses that were started; launch failures are logged and skipped.
    """
    import subprocess
    import sys

    extractors = [
        ("sensor", "pid_extract_sensor.py"),
        ("valve", "pid_extract_valve.py"),
        ("system", "pid_extract_system.py"),
        ("gauge", "pid_extract_gauge.py"),
        ("pump", "pid_extract_pump.py"),
    ]
    processes = []
    for name, script in extractors:
        output_path = os.path.join(results_dir, f"{name}.json")
        cmd = [
            sys.executable, os.path.join(worker_dir, script),
            "--input", text_path,
            "--output", output_path,
        ]
        try:
            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
        except Exception as e:
            # Best effort: one extractor failing to start must not abort the rest.
            logging.error(f"[{basename}] 실행 실패 {name}: {e}")
            continue
        processes.append((name, proc, output_path))
        logging.info(f"[{basename}] 시작: {name} (pid={proc.pid})")
    return processes


def _wait_for_pid_extractors(basename, processes):
    """Block until every extractor subprocess exits (300 s timeout each)."""
    import subprocess

    for name, proc, _ in processes:
        try:
            stdout, stderr = proc.communicate(timeout=300)
        except subprocess.TimeoutExpired:
            proc.kill()
            # Reap the killed child and drain its pipes so it is not left
            # behind as a zombie with open file descriptors.
            proc.communicate()
            logging.error(f"[{basename}] {name} 타임아웃 (300초)")
            continue
        if proc.returncode != 0:
            logging.error(f"[{basename}] {name} 실패 (rc={proc.returncode}): {stderr[:200]}")
        else:
            logging.info(f"[{basename}] {name} 완료: {stdout.strip()[:100]}")


def _merge_pid_extractor_results(basename, processes):
    """Load each extractor's JSON output and merge tags, keeping the first
    occurrence of every non-empty ``tagNo``."""
    all_tags = []
    seen_tagnos = set()
    for name, _, output_path in processes:
        if not os.path.exists(output_path):
            logging.warning(f"[{basename}] 결과 없음: {name}")
            continue
        try:
            with open(output_path, "r", encoding="utf-8") as f:
                result = json.load(f)
            tags = result.get("tags", [])
            count_new = 0
            for tag in tags:
                tag_no = tag.get("tagNo", "")
                if tag_no and tag_no not in seen_tagnos:
                    seen_tagnos.add(tag_no)
                    all_tags.append(tag)
                    count_new += 1
            logging.info(f"[{basename}] {name}: {len(tags)}개 중 {count_new}개 신규")
        except Exception as e:
            # Best effort: a malformed result file only loses that extractor.
            logging.error(f"[{basename}] 결과 로드 실패 {name}: {e}")
    return all_tags


def _attach_pid_tags_to_graph(builder, geo_data, all_tags):
    """Add each extracted tag to ``builder.G`` as a TEXT node and connect it
    to its nearest equipment node with an ``associated_with`` edge."""
    from shapely.geometry import box as shapely_box

    # Index the geometry once by upper-cased clean_value (first match wins)
    # instead of rescanning geo_data for every tag.
    bbox_by_value = {}
    for entity in geo_data:
        key = str(entity.get("clean_value") or "").upper()
        if key:
            bbox_by_value.setdefault(key, entity.get("bbox") or {})

    tag_node_ids = []
    for tag in all_tags:
        tag_no = tag.get("tagNo", "UNKNOWN")
        bbox = bbox_by_value.get(str(tag_no).upper()) or {}
        coords = [bbox.get(k) for k in ("min_x", "min_y", "max_x", "max_y")]
        if any(c is None for c in coords):
            # No (complete) geometry for this tag: use the placeholder box.
            coords = [0, 0, 1, 1]
        node_id = f"tag_{tag_no}"
        builder.G.add_node(node_id,
                           type="TEXT",
                           bbox=shapely_box(*coords),
                           value=tag_no,
                           equipment_name=tag.get("equipmentName"),
                           instrument_type=tag.get("instrumentType"),
                           confidence=tag.get("confidence", 0.5))
        tag_node_ids.append(node_id)

    # Equipment = every node that is neither text nor line geometry.
    equipments = [n for n, d in builder.G.nodes(data=True)
                  if d.get("type") not in ("TEXT", "LINE", "LWPOLYLINE")]
    if not equipments:
        return
    eq_grid = builder._build_spatial_grid(equipments)
    for tag_id in tag_node_ids:
        best_match = builder._find_nearest_equipment(tag_id, eq_grid)
        if best_match:
            builder.G.add_edge(tag_id, best_match, relation="associated_with")