# ExperionCrawler/test_dxf_extract_pid3.py

#!/usr/bin/env python3
"""
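Extract P&ID instrument tags from a DXF file.
Uses ezdxf to read the drawing text and an OpenAI-compatible chat API,
called once per prompt chunk (this script defines a single chunk).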
"""

import ezdxf
import json
import re
from ezdxf.tools.text import plain_mtext
from openai import OpenAI

# Path of the DXF drawing to scan
filepath = '/home/windpacer/projects/ExperionCrawler/src/Web/uploads/pid/p-9100.dxf'

# Load the drawing and take its modelspace layout
doc = ezdxf.readfile(filepath)
msp = doc.modelspace()

# Collect the text content of every TEXT and MTEXT entity in modelspace
texts = []
for entity in msp:
    entity_type = entity.dxftype()
    if entity_type == 'TEXT':
        texts.append(entity.dxf.text)
    elif entity_type == 'MTEXT':
        # MTEXT keeps its content on the entity itself (`entity.text`), not in
        # the `dxf` namespace. Reading `entity.dxf.text` raises, which the old
        # blanket `except ...: pass` hid, so MTEXT content never reached the
        # prompt.
        try:
            plain = plain_mtext(entity.text)
        except Exception as exc:
            # Best effort: one undecodable MTEXT must not abort the whole
            # extraction, but the skip is reported instead of being silent.
            print(f'MTEXT 변환 실패 (건너뜀): {exc}')
            continue
        # Drop MTEXT entities that contain only whitespace/formatting
        if plain.strip():
            texts.append(plain)

# One newline-separated blob that is later embedded in the LLM prompt
text = '\n'.join(texts)
print(f'총 텍스트: {len(text)}자')

# Chat client for the OpenAI-compatible endpoint served on localhost:8000.
# The API key is a placeholder string (presumably the local server does not
# validate it - confirm); the request timeout is 30 minutes (1800 seconds).
llm = OpenAI(base_url='http://localhost:8000/v1', api_key='dummy', timeout=30 * 60)

# Prompt definitions, one entry per LLM call.
# Each entry carries a display name, the system prompt, and a user-prompt
# template whose `{text}` placeholder is filled with the drawing text.
chunks = [
    dict(
        name='System Tags',
        system='\n'.join([
            'You are a P&ID expert. Extract system tags only.',
            'Return ONLY a JSON array.',
            '',
            'Instrument types to extract: LI, PI, TI, FIQ, FICQ, TICA, PICA, LICA, FIC, TIC, PIC, LIC',
            'Format: [{"tagNo":"FICQ-10101","confidence":0.95},...]',
            '',  # keeps the trailing newline of the system prompt
        ]),
        user=(
            'Extract ALL tags of LI, PI, TI, FIQ, FICQ, TICA, PICA, LICA, FIC, TIC, PIC, LIC '
            'from the text below:\n\n{text}'
        ),
    ),
]

def _largest_json_array(raw):
    """Parse and return the longest JSON array embedded in *raw*.

    Every '[' in *raw* is tried as the start of a JSON value, so text around
    the array (prose, code fences) is ignored and brackets inside JSON strings
    cannot confuse the scan. Arrays nested inside an already parsed array are
    not re-examined.

    Returns an empty list when *raw* contains no '['.
    Raises json.JSONDecodeError (the last one seen) when at least one '[' was
    found but nothing starting at a '[' could be parsed.
    """
    decoder = json.JSONDecoder()
    best = None
    best_len = 0
    last_error = None
    pos = raw.find('[')
    while pos != -1:
        try:
            value, end = decoder.raw_decode(raw, pos)
        except json.JSONDecodeError as exc:
            last_error = exc
            pos = raw.find('[', pos + 1)
            continue
        if end - pos > best_len:
            best, best_len = value, end - pos
        # Continue after the parsed value so nested arrays are skipped
        pos = raw.find('[', end)
    if best is None:
        if last_error is not None:
            raise last_error
        return []
    return best


def _merge_tags(items, seen, merged):
    """Append the not-yet-seen tags from *items* to *merged*.

    Elements that are not JSON objects, or whose 'tagNo' is missing, empty or
    not a string, are skipped instead of raising. Tag numbers are stripped of
    surrounding whitespace before de-duplication. *seen* and *merged* are
    updated in place; 'confidence' defaults to 0.95 when absent.
    """
    for item in items:
        if not isinstance(item, dict):
            continue
        tag_no = item.get('tagNo')
        if not isinstance(tag_no, str):
            continue
        tag_no = tag_no.strip()
        if not tag_no or tag_no in seen:
            continue
        seen.add(tag_no)
        merged.append({
            'tagNo': tag_no,
            'confidence': item.get('confidence', 0.95),
        })


# Tags accumulated over all chunks, plus the set used for de-duplication
all_tags = []
seen_tags = set()

for chunk in chunks:
    print(f'\n=== {chunk["name"]} ===')

    # Build the prompts; only the first 100000 characters of the drawing text
    # are embedded in the user prompt.
    system = chunk['system']
    user = chunk['user'].format(text=text[:100000])

    # Call the LLM
    resp = llm.chat.completions.create(
        model='Qwen3.6-27B-FP8',
        messages=[
            {'role': 'system', 'content': system},
            {'role': 'user', 'content': user},
        ],
        max_tokens=65536,
        temperature=0.1,
        # Passed through to the server's chat template; presumably turns off
        # the model's "thinking" output - confirm against the serving setup.
        extra_body={'chat_template_kwargs': {'enable_thinking': False}},
    )

    raw = (resp.choices[0].message.content or '').strip()
    finish = resp.choices[0].finish_reason
    print(f'응답 길이: {len(raw)}자, finish_reason: {finish}')

    # The reply was cut off by the token limit: keep everything up to the
    # last closed object and close the array by hand.
    if finish == 'length' and not raw.endswith(']'):
        last = raw.rfind('}')
        if last != -1:
            raw = raw[:last + 1] + ']'
            print('잘린 JSON 복구')

    # Locate and parse the JSON array in the reply
    try:
        data = _largest_json_array(raw)
    except json.JSONDecodeError as e:
        print(f'파싱 실패: {e}')
        print(repr(raw[-300:]))
        continue

    print(f'추출된 태그 수: {len(data)}개')
    _merge_tags(data, seen_tags, all_tags)
    print(f'중복 제거 후 총 태그 수: {len(all_tags)}개')

# Final summary: total count followed by a preview of at most 30 tag numbers
print('\n=== 최종 결과 ===')
print(f'총 추출 태그 수: {len(all_tags)}개')
print()
preview = all_tags[:30]
for entry in preview:
    print(f'  {entry["tagNo"]}')
if len(all_tags) > len(preview):
    # More tags exist than were previewed; show the overall total again
    print(f'  ... (총 {len(all_tags)}개)')

# Write the complete tag list as indented UTF-8 JSON (non-ASCII kept as-is)
output_path = '/home/windpacer/projects/ExperionCrawler/src/Web/uploads/pid/p-9100_extracted3.json'
serialized = json.dumps(all_tags, indent=2, ensure_ascii=False)
with open(output_path, 'w', encoding='utf-8') as out_file:
    out_file.write(serialized)
print(f'\n결과 저장: {output_path}')