ExperionCrawler/.rooBackup/2026-05-05-222700/mcp-server/worker/pid_extract_template.py

#!/usr/bin/env python3
"""P&ID 태그 추출기 공통 템플릿

독립 프로세스로서 CLI에서 실행되며,
입력 텍스트 파일에서 P&ID 태그를 추출하여 JSON 파일로 출력합니다.

사용법:
    python pid_extract_template.py --input full_text.txt --output result.json --prompt "system prompt text"
    python pid_extract_template.py --input full_text.txt --output result.json --prompt-file prompt.txt

환경 변수:
    VLLM_BASE_URL: vLLM 엔드포인트 (기본: http://localhost:8000/v1)
    VLLM_MODEL: 모델명 (기본: glm-4.7-flash)
"""

import argparse
import json
import logging
import os
import re
import sys
import time
from typing import List

# Configure the root logger at import time: INFO level, with timestamp,
# logger name and level prefixed to every message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s %(message)s",
)
# Module-level logger named "pid_extractor".
logger = logging.getLogger("pid_extractor")


def _longest_balanced_array(text: str, skip_strings: bool) -> str:
    """Return the longest top-level balanced ``[...]`` span found in *text*.

    Args:
        text: Text to scan.
        skip_strings: When true, double-quoted strings that start inside an
            array are skipped, so brackets in string values are not counted.
            Quotes outside any array are ignored on purpose: surrounding
            prose may contain unbalanced quote characters.

    Returns:
        The longest balanced span, or an empty string when there is none.
    """
    depth = 0
    start = -1
    best = ""
    in_string = False
    escaped = False
    for i, c in enumerate(text):
        if in_string:
            # Inside a JSON string: only look for its (unescaped) end quote.
            if escaped:
                escaped = False
            elif c == "\\":
                escaped = True
            elif c == '"':
                in_string = False
            continue
        if c == '"' and skip_strings and depth > 0:
            in_string = True
        elif c == "[":
            if depth == 0:
                start = i
            depth += 1
        elif c == "]" and depth > 0:
            # Unmatched closers (depth == 0) are ignored so that a stray "]"
            # in leading prose cannot push depth negative and hide the array.
            depth -= 1
            if depth == 0:
                cand = text[start:i + 1]
                if len(cand) > len(best):
                    best = cand
    return best


def parse_json_array(raw: str, finish_reason: str = "") -> list:
    """Extract a JSON array from LLM output, tolerating common damage.

    Handles Markdown code fences, text around the array, and responses cut
    off by the token limit (``finish_reason == "length"``).

    Args:
        raw: Raw text returned by the model. ``None`` is treated as empty.
        finish_reason: Completion finish reason; ``"length"`` triggers
            truncation recovery.

    Returns:
        The decoded array. If the array is not valid JSON as a whole, the
        flat ``{...}`` objects inside it that decode individually are
        returned instead. An empty list is returned when nothing is found.
    """
    raw = (raw or "").strip()

    # Drop a surrounding ```/```json fence (closing fence is optional).
    if raw.startswith("```"):
        lines = raw.splitlines()
        raw = "\n".join(lines[1:-1] if lines and lines[-1].strip() == "```" else lines[1:]).strip()

    # Truncated output: cut after the last closed object and close the array.
    if finish_reason == "length":
        last_close = raw.rfind("}")
        if last_close != -1:
            raw = raw[:last_close + 1] + "]"

    # Prefer the string-aware scan; if malformed quoting leaves it without a
    # candidate, fall back to a plain bracket scan so objects can still be
    # salvaged below.
    best = _longest_balanced_array(raw, True) or _longest_balanced_array(raw, False)
    raw = best if best else "[]"

    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # The array as a whole is invalid: keep every flat object that parses.
        data = []
        for obj in re.findall(r"\{[^{}]*\}", raw, re.DOTALL):
            try:
                data.append(json.loads(obj))
            except json.JSONDecodeError:
                pass
        return data


def call_llm(system_prompt: str, user_text: str, max_tokens: int = 65536) -> List[dict]:
    """Send one chat completion request to the vLLM endpoint and parse the tags.

    The endpoint and model come from the ``VLLM_BASE_URL`` and ``VLLM_MODEL``
    environment variables (defaults: ``http://localhost:8000/v1`` and
    ``glm-4.7-flash``). The request uses temperature 0.1 and passes
    ``enable_thinking=False`` through ``chat_template_kwargs``.

    Args:
        system_prompt: Text sent as the system message.
        user_text: Text sent as the user message.
        max_tokens: Completion token limit forwarded to the API.

    Returns:
        Whatever ``parse_json_array`` recovers from the model's reply.
    """
    # Imported lazily so the module can be loaded without the openai package.
    from openai import OpenAI

    endpoint = os.environ.get("VLLM_BASE_URL", "http://localhost:8000/v1")
    model_name = os.environ.get("VLLM_MODEL", "glm-4.7-flash")

    # The client requires an api_key argument; a placeholder value is passed.
    llm = OpenAI(base_url=endpoint, api_key="dummy")

    logger.info(f"vLLM 호출: {endpoint}, 모델: {model_name}, max_tokens: {max_tokens}")
    logger.info(f"입력 텍스트 길이: {len(user_text)}자")

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_text},
    ]
    completion = llm.chat.completions.create(
        model=model_name,
        messages=conversation,
        max_tokens=max_tokens,
        temperature=0.1,
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )

    first_choice = completion.choices[0]
    reply = (first_choice.message.content or "").strip()
    stop_reason = first_choice.finish_reason

    logger.info(f"LLM 응답: finish_reason={stop_reason}, 응답 길이={len(reply)}자")

    tags = parse_json_array(reply, stop_reason)

    # A "length" stop means the reply hit the token limit and was cut off.
    if stop_reason == "length":
        logger.warning(f"finish_reason=length: 응답이 잘렸습니다. 복구 시도됨. 추출된 태그 수: {len(tags)}")

    return tags


def main():
    """Run the extractor CLI: read inputs, call the LLM, write the JSON result.

    Reads the input text file, resolves the system prompt (an inline
    ``--prompt`` wins over ``--prompt-file``), calls ``call_llm``, writes a
    result object to ``--output`` and prints a one-line JSON summary to
    stdout.

    Exits with status 1 when the input file or the prompt file does not
    exist, or when neither ``--prompt`` nor ``--prompt-file`` is given.
    """
    cli = argparse.ArgumentParser(description="P&ID 태그 추출기")
    cli.add_argument("--input", required=True, help="입력 텍스트 파일 경로")
    cli.add_argument("--output", required=True, help="출력 JSON 파일 경로")
    cli.add_argument("--prompt", type=str, default=None, help="시스템 프롬프트 (인라인)")
    cli.add_argument("--prompt-file", type=str, default=None, help="시스템 프롬프트 파일 경로")
    cli.add_argument("--max-tokens", type=int, default=65536, help="최대 토큰 수 (기본: 65536)")
    opts = cli.parse_args()

    # Step 1: load the text to analyse.
    source_path = opts.input
    if not os.path.exists(source_path):
        logger.error(f"입력 파일을 찾을 수 없습니다: {source_path}")
        sys.exit(1)

    with open(source_path, "r", encoding="utf-8") as src:
        input_text = src.read()

    logger.info(f"입력 파일 읽기 완료: {len(input_text)}자")

    # Step 2: resolve the system prompt, inline text first, then file.
    if opts.prompt:
        system_prompt = opts.prompt
    else:
        prompt_path = opts.prompt_file
        if not prompt_path:
            logger.error("--prompt 또는 --prompt-file 중 하나를 지정해야 합니다.")
            sys.exit(1)
        if not os.path.exists(prompt_path):
            logger.error(f"프롬프트 파일을 찾을 수 없습니다: {prompt_path}")
            sys.exit(1)
        with open(prompt_path, "r", encoding="utf-8") as src:
            system_prompt = src.read()

    logger.info(f"시스템 프롬프트: {len(system_prompt)}자")

    # Step 3: run the extraction and time it.
    started = time.time()
    tags = call_llm(system_prompt, input_text, max_tokens=opts.max_tokens)
    elapsed = time.time() - started

    tag_count = len(tags)
    logger.info(f"추출 완료: {tag_count}개 태그, 소요 시간: {elapsed:.1f}초")

    # Step 4: write the full result, creating the target directory if needed.
    target_dir = os.path.dirname(opts.output)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    seconds = round(elapsed, 1)
    report = {
        "success": True,
        "count": tag_count,
        "tags": tags,
        "processing_time_sec": seconds,
    }

    with open(opts.output, "w", encoding="utf-8") as dst:
        json.dump(report, dst, ensure_ascii=False, indent=2)

    logger.info(f"결과 저장 완료: {opts.output}")

    # Step 5: emit a compact summary on stdout.
    summary = {
        "success": True,
        "count": tag_count,
        "time": seconds,
    }
    print(json.dumps(summary, ensure_ascii=False))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()