# ExperionCrawler/dxf-graph/pid_extractor.py

"""
P&ID Extractor - DXF/PDF → Claude Vision API → CSV → PostgreSQL
Extracts: Equipment Name, Tag No., Instrument Type, Line Number, P&ID Drawing No.
"""

import os
import json
import csv
import base64
import re
import logging
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass, field, asdict
from typing import Optional
import anthropic

# logging.FileHandler opens its target file as soon as it is constructed, i.e.
# at import time of this module. Create the directory first so that importing
# the module from a fresh checkout does not fail with FileNotFoundError.
Path("logs").mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        # Explicit UTF-8 so non-ASCII file names and messages are written
        # regardless of the platform's default encoding.
        logging.FileHandler("logs/extractor.log", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)

# Module-level logger shared by every function in this file.
log = logging.getLogger(__name__)


# ─────────────────────────────────────────────
# Data Model
# ─────────────────────────────────────────────

def _current_iso_timestamp() -> str:
    """Return the current local time formatted as an ISO-8601 string."""
    return datetime.now().isoformat()


@dataclass
class PIDItem:
    """One record extracted from a P&ID drawing.

    Every field has a default, so an item can be built from whatever subset of
    values is available. ``extracted_at`` is stamped when the instance is
    created.
    """

    # Drawing / tag identification
    pid_drawing_no: str = ""
    tag_no: str = ""

    # What the tag refers to
    equipment_name: str = ""
    instrument_type: str = ""
    line_number: str = ""
    service_description: str = ""

    # Extraction metadata
    confidence: float = 0.0
    source_file: str = ""
    extracted_at: str = field(default_factory=_current_iso_timestamp)

    def to_dict(self) -> dict:
        """Return the item as a plain ``dict`` keyed by field name."""
        record = asdict(self)
        return record


# ─────────────────────────────────────────────
# File Converters
# ─────────────────────────────────────────────

def dxf_to_image(dxf_path: str, output_dir: str = "output") -> list[str]:
    """
    Convert DXF file to PNG image(s) using ezdxf + matplotlib.

    The modelspace is rendered onto a single 24x18 inch figure at 150 dpi and
    saved as ``<output_dir>/<dxf stem>.png``.

    Args:
        dxf_path: Path of the DXF file to render.
        output_dir: Directory that receives the PNG; created if missing.

    Returns:
        A one-element list with the PNG path, or an empty list when the
        optional dependencies are missing or the conversion fails (the
        failure is logged, never raised).
    """
    try:
        import ezdxf
        from ezdxf.addons.drawing import RenderContext, Frontend
        from ezdxf.addons.drawing.matplotlib import MatplotlibBackend
        import matplotlib.pyplot as plt
    except ImportError:
        log.warning("ezdxf/matplotlib not installed. Run: pip install ezdxf matplotlib")
        return []

    try:
        doc = ezdxf.readfile(dxf_path)
        msp = doc.modelspace()

        # savefig() does not create missing directories, and callers such as
        # the text-mode fallback rely on the default output_dir.
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        output_path = target_dir / (Path(dxf_path).stem + ".png")

        fig = plt.figure(figsize=(24, 18), dpi=150)
        try:
            ax = fig.add_axes([0, 0, 1, 1])
            ctx = RenderContext(doc)
            out = MatplotlibBackend(ax)
            Frontend(ctx, out).draw_layout(msp, finalize=True)
            fig.savefig(output_path, dpi=150, bbox_inches="tight",
                        facecolor="white", edgecolor="none")
        finally:
            # pyplot keeps figures alive until closed; release it even when
            # rendering or saving fails so batch runs do not accumulate them.
            plt.close(fig)
    except Exception as e:
        log.error(f"DXF conversion failed: {e}")
        return []

    log.info(f"DXF converted: {output_path}")
    return [str(output_path)]


def pdf_to_images(pdf_path: str, output_dir: str = "output",
                  dpi: int = 200) -> list[str]:
    """
    Convert PDF pages to PNG images using pdf2image.

    Each page is written as ``<output_dir>/<pdf stem>_pageNNN.png`` with a
    1-based, zero-padded page number.

    Args:
        pdf_path: Path of the PDF file to rasterize.
        output_dir: Directory that receives the PNGs; created if missing.
        dpi: Rendering resolution passed to pdf2image.

    Returns:
        The image paths in page order, or an empty list when pdf2image is
        missing or the conversion fails (the failure is logged, never raised).
    """
    try:
        from pdf2image import convert_from_path
    except ImportError:
        log.warning("pdf2image not installed. Run: pip install pdf2image")
        return []

    try:
        # Without this, a missing directory makes every page.save() fail and
        # the function silently returns no pages.
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        pages = convert_from_path(pdf_path, dpi=dpi)
        stem = Path(pdf_path).stem
        paths = []

        for page_no, page in enumerate(pages, start=1):
            out_path = target_dir / f"{stem}_page{page_no:03d}.png"
            page.save(str(out_path), "PNG")
            paths.append(str(out_path))
            log.info(f"PDF page {page_no} saved: {out_path}")

        return paths

    except Exception as e:
        log.error(f"PDF conversion failed: {e}")
        return []


def _dxf_entity_text(entity) -> str:
    """Return the raw text content of one TEXT/MTEXT/ATTRIB/ATTDEF entity."""
    if entity.dxftype() == "MTEXT":
        # MTEXT keeps its content on the entity itself rather than in the
        # ``dxf`` namespace; plain_text() additionally strips inline
        # formatting codes so only the readable text is returned.
        plain_text = getattr(entity, "plain_text", None)
        if callable(plain_text):
            return plain_text() or ""
        return getattr(entity, "text", "") or ""
    return entity.dxf.get("text", "") or ""


def dxf_text_extract(dxf_path: str) -> str:
    """
    Directly extract all text entities from DXF file (faster, no image needed).

    Collects TEXT, MTEXT, ATTRIB and ATTDEF content from the modelspace,
    including the ATTRIB entities attached to INSERT (block reference)
    entities, which do not appear as direct modelspace members.

    Args:
        dxf_path: Path of the DXF file to read.

    Returns:
        The non-empty, stripped text values joined by newlines, or an empty
        string when the file cannot be read (the failure is logged).
    """
    text_types = ("TEXT", "MTEXT", "ATTRIB", "ATTDEF")
    try:
        import ezdxf

        doc = ezdxf.readfile(dxf_path)
        texts = []
        for entity in doc.modelspace():
            kind = entity.dxftype()
            if kind == "INSERT":
                # Block attributes hang off the block reference.
                candidates = list(getattr(entity, "attribs", ()))
            elif kind in text_types:
                candidates = [entity]
            else:
                continue

            for candidate in candidates:
                try:
                    txt = _dxf_entity_text(candidate).strip()
                except Exception as e:
                    # Best effort: one unreadable entity must not discard
                    # the rest of the drawing, but leave a trace of it.
                    log.debug(f"Skipping unreadable {candidate.dxftype()} entity: {e}")
                    continue
                if txt:
                    texts.append(txt)
        return "\n".join(texts)
    except Exception as e:
        log.error(f"DXF text extraction failed: {e}")
        return ""


# ─────────────────────────────────────────────
# Claude Vision Analyzer
# ─────────────────────────────────────────────

# Prompt sent verbatim as the text part of the image request in
# PIDAnalyzer.analyze_image. It is NOT passed through str.format, so the JSON
# braces are written singly. The JSON keys it asks for are the ones
# PIDAnalyzer._parse_response reads back into PIDItem fields.
EXTRACTION_PROMPT = """You are an expert P&ID (Piping and Instrumentation Diagram) engineer.
Analyze this P&ID drawing image and extract ALL of the following items:

1. **P&ID Drawing Number** (도면번호) - usually found in title block
2. **Tag Numbers** (태그번호) - e.g. FT-1001, PT-2003, LT-1005, E-101, V-201
3. **Equipment Names** (장비명) - e.g. Heat Exchanger, Pump, Vessel, Compressor
4. **Instrument Types** (계기타입) - e.g. Flow Transmitter, Pressure Indicator, Level Controller
5. **Line Numbers** (라인번호) - e.g. 6\"-P-1001-A1A, 3\"-IA-2001

For each item found, return a JSON array with this exact structure:
[
  {
    "pid_drawing_no": "P&ID drawing number or sheet number",
    "tag_no": "instrument or equipment tag (e.g. FT-1001)",
    "equipment_name": "descriptive name in English (e.g. Flow Transmitter)",
    "instrument_type": "ISA instrument type abbreviation (e.g. FT, PT, LT, E, V, P)",
    "line_number": "pipe line number if associated",
    "service_description": "brief service description if visible",
    "confidence": 0.0 to 1.0 confidence score
  }
]

Rules:
- Extract EVERY tag and instrument visible, do not skip any
- If a field is not visible/applicable, use empty string ""
- Return ONLY valid JSON array, no markdown, no explanation
- confidence: 1.0 = clearly readable, 0.5 = partially legible, 0.2 = guessed
"""

# Prompt template for text mode. PIDAnalyzer.analyze_dxf_text fills the
# {text_content} placeholder via str.format, which is why the literal JSON
# braces in the example structure are doubled ({{ and }}).
TEXT_EXTRACTION_PROMPT = """You are an expert P&ID engineer.
Below is raw text extracted from a DXF P&ID file.
Parse and extract ALL instrument tags, equipment tags, line numbers, and drawing info.

Text content:
{text_content}

Return a JSON array with this exact structure:
[
  {{
    "pid_drawing_no": "drawing number if found",
    "tag_no": "tag number (e.g. FT-1001, E-101)",
    "equipment_name": "equipment or instrument name",
    "instrument_type": "ISA type abbreviation",
    "line_number": "line number if found",
    "service_description": "service description if found",
    "confidence": 0.8
  }}
]

Return ONLY valid JSON, no markdown.
"""


class PIDAnalyzer:
    """Claude-powered P&ID analyzer supporting both image and text modes.

    ``__init__`` sets two attributes: ``client`` (the Anthropic SDK client)
    and ``model`` (the model identifier sent with every request).
    """

    # File suffix -> media type declared to the API for base64 images.
    _MEDIA_TYPES = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }

    def __init__(self, api_key: Optional[str] = None):
        """Create the API client.

        Args:
            api_key: Anthropic API key. When omitted or empty, the
                ``ANTHROPIC_API_KEY`` environment variable is used instead.
        """
        self.client = anthropic.Anthropic(
            api_key=api_key or os.environ.get("ANTHROPIC_API_KEY")
        )
        self.model = "claude-opus-4-20250514"

    def analyze_image(self, image_path: str) -> list[PIDItem]:
        """Analyze a P&ID image using Claude Vision.

        Args:
            image_path: Path of a PNG/JPEG/GIF/WebP image. Unknown suffixes
                are declared to the API as ``image/png``.

        Returns:
            The items parsed from the model's reply (possibly empty).

        Raises:
            OSError: If the image cannot be read.
            Exception: Any error raised by the API client is propagated.
        """
        log.info(f"Analyzing image: {image_path}")

        with open(image_path, "rb") as f:
            image_data = base64.standard_b64encode(f.read()).decode("utf-8")

        # Detect media type
        suffix = Path(image_path).suffix.lower()
        media_type = self._MEDIA_TYPES.get(suffix, "image/png")

        response = self.client.messages.create(
            model=self.model,
            max_tokens=4096,
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_data
                        }
                    },
                    {"type": "text", "text": EXTRACTION_PROMPT}
                ]
            }]
        )

        raw = self._response_text(response)
        return self._parse_response(raw, source_file=image_path)

    def analyze_dxf_text(self, dxf_path: str) -> list[PIDItem]:
        """Analyze DXF by extracting text entities and sending to Claude.

        When the DXF yields no text, the drawing is rendered to an image
        (default output directory) and analyzed in image mode instead.

        Args:
            dxf_path: Path of the DXF file.

        Returns:
            The items parsed from the model's reply (possibly empty).
        """
        log.info(f"Analyzing DXF text: {dxf_path}")
        text_content = dxf_text_extract(dxf_path)

        if not text_content.strip():
            log.warning("No text found in DXF, falling back to image mode")
            images = dxf_to_image(dxf_path)
            results = []
            for img in images:
                results.extend(self.analyze_image(img))
            return results

        max_chars = 8000  # token limit guard
        if len(text_content) > max_chars:
            # Make the data loss visible instead of silently dropping tags.
            log.warning(
                f"DXF text truncated from {len(text_content)} to {max_chars} "
                f"characters; consider image mode for {Path(dxf_path).name}"
            )

        prompt = TEXT_EXTRACTION_PROMPT.format(
            text_content=text_content[:max_chars]
        )

        response = self.client.messages.create(
            model=self.model,
            max_tokens=4096,
            messages=[{"role": "user", "content": prompt}]
        )

        raw = self._response_text(response)
        return self._parse_response(raw, source_file=dxf_path)

    @staticmethod
    def _response_text(response) -> str:
        """Concatenate the text blocks of an API response.

        Tolerates an empty ``content`` list (returns ``""``) and warns when
        the reply was cut off by the ``max_tokens`` limit, since a truncated
        JSON array will usually fail to parse.
        """
        if getattr(response, "stop_reason", None) == "max_tokens":
            log.warning("Response hit max_tokens; the JSON array may be truncated")

        blocks = getattr(response, "content", None) or []
        return "".join(
            getattr(block, "text", "") or ""
            for block in blocks
            if getattr(block, "type", "text") == "text"
        )

    @staticmethod
    def _as_text(value) -> str:
        """Coerce a JSON value to ``str``; JSON ``null`` becomes ``""``."""
        return "" if value is None else str(value)

    @staticmethod
    def _as_confidence(value, default: float = 0.5) -> float:
        """Coerce a JSON value to a confidence in ``[0.0, 1.0]``.

        Values that cannot be converted (``null``, non-numeric strings, ...)
        fall back to ``default`` rather than aborting the whole response.
        """
        try:
            score = float(value)
        except (TypeError, ValueError):
            return default
        return min(1.0, max(0.0, score))

    def _parse_response(self, raw: str, source_file: str) -> list[PIDItem]:
        """Parse Claude's JSON response into PIDItem list.

        Args:
            raw: Reply text; may be wrapped in markdown fences or prose.
            source_file: Path of the analyzed file; only its base name is
                stored on the items.

        Returns:
            One ``PIDItem`` per JSON object in the array. Non-object array
            entries are skipped. Returns ``[]`` when no array is found or it
            is not valid JSON (both cases are logged).
        """
        # Strip markdown fences if present
        clean = re.sub(r"```(?:json)?|```", "", raw).strip()

        # Find JSON array (greedy: first "[" to last "]")
        match = re.search(r"\[.*\]", clean, re.DOTALL)
        if not match:
            log.warning("No JSON array found in response")
            return []

        try:
            data = json.loads(match.group())
        except json.JSONDecodeError as e:
            log.error(f"JSON parse error: {e}\nRaw: {raw[:500]}")
            return []

        source_name = Path(source_file).name
        items = []
        for d in data:
            if not isinstance(d, dict):
                continue
            items.append(PIDItem(
                pid_drawing_no=self._as_text(d.get("pid_drawing_no")),
                tag_no=self._as_text(d.get("tag_no")),
                equipment_name=self._as_text(d.get("equipment_name")),
                instrument_type=self._as_text(d.get("instrument_type")),
                line_number=self._as_text(d.get("line_number")),
                service_description=self._as_text(d.get("service_description")),
                confidence=self._as_confidence(d.get("confidence", 0.5)),
                source_file=source_name,
            ))

        log.info(f"Extracted {len(items)} items from {source_name}")
        return items

# ─────────────────────────────────────────────
# CSV Exporter
# ─────────────────────────────────────────────

# Header and column order of the exported CSV. export_csv passes this list as
# the DictWriter fieldnames and writes PIDItem.to_dict() rows, so the names
# here mirror the PIDItem field names.
CSV_COLUMNS = [
    "pid_drawing_no", "tag_no", "equipment_name", "instrument_type",
    "line_number", "service_description", "confidence",
    "source_file", "extracted_at"
]


def export_csv(items: list[PIDItem], output_path: str) -> str:
    """Write the extracted items to a CSV file and return its path.

    The parent directory is created when missing. The file is written as
    UTF-8 with a BOM and starts with a header row built from ``CSV_COLUMNS``.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open("w", newline="", encoding="utf-8-sig") as handle:
        sheet = csv.DictWriter(handle, fieldnames=CSV_COLUMNS)
        sheet.writeheader()
        sheet.writerows(entry.to_dict() for entry in items)

    row_count = len(items)
    log.info(f"CSV saved: {output_path} ({row_count} rows)")
    return output_path


# ─────────────────────────────────────────────
# PostgreSQL Loader
# ─────────────────────────────────────────────

# DDL executed by load_to_postgres before inserting. It is sent as a single
# multi-statement string: the table plus three lookup indexes, all guarded by
# IF NOT EXISTS so repeated runs are harmless. The only key on the table is
# the serial `id`.
CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS pid_equipment (
    id              SERIAL PRIMARY KEY,
    pid_drawing_no  VARCHAR(100),
    tag_no          VARCHAR(100),
    equipment_name  VARCHAR(255),
    instrument_type VARCHAR(50),
    line_number     VARCHAR(100),
    service_description TEXT,
    confidence      FLOAT,
    source_file     VARCHAR(255),
    extracted_at    TIMESTAMPTZ DEFAULT NOW(),
    created_at      TIMESTAMPTZ DEFAULT NOW()
);

-- Useful indexes
CREATE INDEX IF NOT EXISTS idx_pid_tag_no ON pid_equipment(tag_no);
CREATE INDEX IF NOT EXISTS idx_pid_drawing_no ON pid_equipment(pid_drawing_no);
CREATE INDEX IF NOT EXISTS idx_pid_instrument_type ON pid_equipment(instrument_type);
"""

# Parameterized insert used with cursor.executemany in load_to_postgres. The
# named placeholders are filled from PIDItem.to_dict(), so they must match
# the PIDItem field names.
# NOTE(review): pid_equipment as created by CREATE_TABLE_SQL has no unique
# constraint other than the auto-generated `id`, so ON CONFLICT DO NOTHING
# has nothing to conflict with and re-loading the same items would add
# duplicate rows - confirm whether a unique key is defined elsewhere.
INSERT_SQL = """
INSERT INTO pid_equipment
    (pid_drawing_no, tag_no, equipment_name, instrument_type,
     line_number, service_description, confidence, source_file, extracted_at)
VALUES
    (%(pid_drawing_no)s, %(tag_no)s, %(equipment_name)s, %(instrument_type)s,
     %(line_number)s, %(service_description)s, %(confidence)s,
     %(source_file)s, %(extracted_at)s)
ON CONFLICT DO NOTHING;
"""


def load_to_postgres(items: list[PIDItem], dsn: str) -> int:
    """
    Load extracted items into PostgreSQL.

    Creates the target table and indexes if needed, then inserts every item
    in a single transaction. On any failure the transaction is rolled back,
    the connection is closed, and the error is logged rather than raised.

    Args:
        items: Items to insert.
        dsn: Connection string, e.g. postgresql://user:password@host:5432/dbname

    Returns:
        The cursor's row count after the insert, or 0 when psycopg2 is
        missing or the load fails.
    """
    try:
        import psycopg2
    except ImportError:
        log.error("psycopg2 not installed. Run: pip install psycopg2-binary")
        return 0

    try:
        rows = [item.to_dict() for item in items]

        conn = psycopg2.connect(dsn)
        try:
            # `with conn` commits on success and rolls back on an exception;
            # `with conn.cursor()` closes the cursor. Neither closes the
            # connection itself, hence the surrounding try/finally.
            with conn, conn.cursor() as cur:
                # Create table if needed
                cur.execute(CREATE_TABLE_SQL)

                # Insert rows
                cur.executemany(INSERT_SQL, rows)
                count = cur.rowcount
        finally:
            conn.close()

    except Exception as e:
        log.error(f"PostgreSQL error: {e}")
        return 0

    log.info(f"Inserted {count} rows into PostgreSQL")
    return count


# ─────────────────────────────────────────────
# AX (Asset Excellence) Formatter
# ─────────────────────────────────────────────

# PIDItem field name -> column header used in the AX import workbook.
# export_ax_excel takes its header labels from the values of this mapping, in
# insertion order, and appends a trailing "Confidence" column. The row values
# it writes must follow the same order as the entries here.
AX_COLUMN_MAP = {
    "tag_no":           "Tag Number",
    "equipment_name":   "Asset Description",
    "instrument_type":  "Equipment Class",
    "pid_drawing_no":   "P&ID Reference",
    "line_number":      "Line Reference",
    "service_description": "Service",
}


def export_ax_excel(items: list[PIDItem], output_path: str) -> str:
    """
    Export data in AX (Asset Excellence / Hexagon) compatible Excel format.

    Writes one sheet named ``AX_Import`` with a styled header row, one row
    per item, and a trailing ``Confidence`` column colored green (>= 0.8),
    yellow (>= 0.5) or red (below 0.5). Columns and their order come from
    ``AX_COLUMN_MAP``.

    Args:
        items: Items to export.
        output_path: Target ``.xlsx`` path; its parent directory is created
            if missing.

    Returns:
        ``output_path`` on success, or ``""`` when openpyxl is missing or
        the export fails (the failure is logged, never raised).
    """
    try:
        import openpyxl
        from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
        from openpyxl.utils import get_column_letter
    except ImportError:
        log.error("openpyxl not installed. Run: pip install openpyxl")
        return ""

    try:
        wb = openpyxl.Workbook()
        ws = wb.active
        ws.title = "AX_Import"

        # Header style
        header_fill = PatternFill("solid", fgColor="1F4E79")
        header_font = Font(color="FFFFFF", bold=True, size=11)
        header_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
        thin = Side(style="thin", color="CCCCCC")
        border = Border(left=thin, right=thin, top=thin, bottom=thin)

        # Data-cell styles are built once and shared by every cell instead of
        # being re-created per cell.
        cell_align = Alignment(vertical="center")
        fill_high = PatternFill("solid", fgColor="C6EFCE")  # green
        fill_mid = PatternFill("solid", fgColor="FFEB9C")   # yellow
        fill_low = PatternFill("solid", fgColor="FFC7CE")   # red

        # Derive both the headers and the row values from AX_COLUMN_MAP so
        # the two can never drift out of order.
        field_names = list(AX_COLUMN_MAP)
        ax_columns = [AX_COLUMN_MAP[name] for name in field_names]
        ax_columns.append("Confidence")
        confidence_col = len(ax_columns)

        # Write header
        for col_idx, col_name in enumerate(ax_columns, start=1):
            cell = ws.cell(row=1, column=col_idx, value=col_name)
            cell.font = header_font
            cell.fill = header_fill
            cell.alignment = header_align
            cell.border = border

        # Write data rows
        for row_idx, item in enumerate(items, start=2):
            row_data = [getattr(item, name) for name in field_names]
            row_data.append(item.confidence)

            for col_idx, value in enumerate(row_data, start=1):
                cell = ws.cell(row=row_idx, column=col_idx, value=value)
                cell.border = border
                cell.alignment = cell_align

                # Confidence color coding (numeric values only)
                is_number = (isinstance(value, (int, float))
                             and not isinstance(value, bool))
                if col_idx == confidence_col and is_number:
                    if value >= 0.8:
                        cell.fill = fill_high
                    elif value >= 0.5:
                        cell.fill = fill_mid
                    else:
                        cell.fill = fill_low

        # Column widths
        col_widths = [20, 35, 25, 20, 20, 30, 12]
        for i, w in enumerate(col_widths, start=1):
            ws.column_dimensions[get_column_letter(i)].width = w

        ws.row_dimensions[1].height = 35
        ws.freeze_panes = "A2"

        # Same guarantee as export_csv: the target directory exists.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        wb.save(output_path)
        log.info(f"AX Excel saved: {output_path} ({len(items)} rows)")
        return output_path

    except Exception as e:
        log.error(f"Excel export error: {e}")
        return ""

# ─────────────────────────────────────────────
# Main Pipeline
# ─────────────────────────────────────────────

def _extract_items_from_file(analyzer, file_path: str, output_dir: str,
                             use_image_mode: bool) -> list:
    """Run the extraction matching one input file's suffix; [] if unsupported."""
    suffix = Path(file_path).suffix.lower()
    extracted = []

    if suffix == ".pdf":
        for img in pdf_to_images(file_path, output_dir):
            extracted.extend(analyzer.analyze_image(img))

    elif suffix == ".dxf":
        if use_image_mode:
            for img in dxf_to_image(file_path, output_dir):
                extracted.extend(analyzer.analyze_image(img))
        else:
            extracted.extend(analyzer.analyze_dxf_text(file_path))

    # Same image formats PIDAnalyzer.analyze_image knows a media type for.
    elif suffix in (".png", ".jpg", ".jpeg", ".gif", ".webp"):
        extracted.extend(analyzer.analyze_image(file_path))

    else:
        log.warning(f"Unsupported file type: {suffix}")

    return extracted


def run_pipeline(
    input_files: list[str],
    output_dir: str = "output",
    db_dsn: Optional[str] = None,
    use_image_mode: bool = False,
) -> dict:
    """
    Full pipeline: files → AI extraction → CSV + Excel + optional DB load.

    A failure while processing one file is logged and does not stop the
    remaining files.

    Args:
        input_files: List of DXF, PDF or image file paths
        output_dir: Directory for output files
        db_dsn: PostgreSQL DSN (optional)
        use_image_mode: Force image conversion even for DXF

    Returns:
        Summary dict with the keys ``total``, ``csv``, ``excel``,
        ``db_rows`` and ``files_processed``. ``csv``/``excel`` are ``None``
        when nothing was extracted; ``excel`` is also ``None`` when the
        Excel export failed.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    Path("logs").mkdir(exist_ok=True)

    analyzer = PIDAnalyzer()
    all_items: list[PIDItem] = []

    for file_path in input_files:
        log.info(f"Processing: {file_path}")
        try:
            all_items.extend(
                _extract_items_from_file(analyzer, file_path, output_dir,
                                         use_image_mode)
            )
        except Exception as e:
            log.error(f"Failed processing {file_path}: {e}")

    if not all_items:
        log.warning("No items extracted from any file")
        return {
            "total": 0,
            "csv": None,
            "excel": None,
            "db_rows": 0,
            "files_processed": len(input_files),
        }

    # Sort by drawing + tag. Coerce to str so a missing (None) value from
    # the model cannot make the tuple comparison raise TypeError.
    all_items.sort(
        key=lambda x: (str(x.pid_drawing_no or ""), str(x.tag_no or ""))
    )

    # Export CSV
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = str(Path(output_dir) / f"pid_extracted_{ts}.csv")
    export_csv(all_items, csv_path)

    # Export AX Excel. export_ax_excel returns "" on failure; report None
    # instead of a path to a file that was never written.
    excel_target = str(Path(output_dir) / f"pid_AX_import_{ts}.xlsx")
    excel_path = export_ax_excel(all_items, excel_target) or None

    # Load to DB
    db_rows = 0
    if db_dsn:
        db_rows = load_to_postgres(all_items, db_dsn)

    summary = {
        "total": len(all_items),
        "csv": csv_path,
        "excel": excel_path,
        "db_rows": db_rows,
        "files_processed": len(input_files),
    }

    log.info(f"Pipeline complete: {summary}")
    return summary


# ─────────────────────────────────────────────
# CLI Entry Point
# ─────────────────────────────────────────────

if __name__ == "__main__":
    import argparse

    # (flags, keyword options) for each command-line argument, in help order.
    argument_specs = [
        (("files",), {"nargs": "+", "help": "DXF or PDF file paths"}),
        (("--output-dir",), {"default": "output", "help": "Output directory"}),
        (("--db-dsn",), {"help": "PostgreSQL DSN (optional)"}),
        (("--image-mode",), {
            "action": "store_true",
            "help": "Force DXF → image conversion (slower but more accurate)",
        }),
    ]

    cli = argparse.ArgumentParser(
        description="P&ID Extractor: DXF/PDF → CSV/Excel/PostgreSQL"
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)
    opts = cli.parse_args()

    # Run the whole pipeline with the parsed options.
    summary = run_pipeline(
        opts.files,
        output_dir=opts.output_dir,
        db_dsn=opts.db_dsn,
        use_image_mode=opts.image_mode,
    )

    # Print one "key: value" line per summary entry.
    print("\n===== Extraction Summary =====")
    for key, value in summary.items():
        print(f"  {key}: {value}")