ExperionCrawler/actual_parser_coding_byGemma426b.md

# Actual Implementation: P&ID Parser (Distributed Processing)

This document contains the actual implementation of the P&ID Parser based on the design plan.

## 1. Python Implementation

### 1.1 `dxf_preprocessor.py`
```python
import ezdxf
import json
from datetime import datetime
import os

class DXFPreprocessor:
    """
    DXF 파일을 로드하여 핵심 엔티티를 추출하고 중간 JSON 포맷으로 저장합니다.
    """
    def __init__(self):
        self.entities = []

    def load_and_parse(self, file_path):
        try:
            if not os.path.exists(file_path):
                print(f"Error: File not found {file_path}")
                return False

            doc = ezdxf.readfile(file_path)
            msp = doc.modelspace()

            for entity in msp:
                # 추출 대상 엔티티 타입 정의
                if entity.dxftype() in ['TEXT', 'MTEXT', 'LINE', 'CIRCLE', 'LWPOLYLINE']:
                    data = {
                        "type": entity.dxftype(),
                        "layer": entity.dxf.layer,
                        "content": "",
                        "coordinates": {"x": 0.0, "y": 0.0, "z": 0.0},
                        "attributes": {"color": entity.dxf.color, "lineweight": entity.dxf.lineweight}
                    }

                    # 텍스트 내용 추출
                    if entity.dxftype() in ['TEXT', 'MTEXT']:
                        data["content"] = entity.dxf.text if entity.dxftype() == 'TEXT' else entity.text

                    # 좌표 정보 추출 (단순화)
                    try:
                        if entity.dxftype() == 'LINE':
                            data["coordinates"] = {"x": entity.dxf.start.x, "y": entity.dxf.start.y, "z": entity.dxf.start.z}
                        elif entity.dxftype() == 'CIRCLE':
                            data["coordinates"] = {"x": entity.dxf.center.x, "y": entity.dxf.center.y, "z": entity.dxf.center.z}
                        elif entity.dxftype() == 'LWPOLYLINE':
                            data["coordinates"] = {"x": entity.dxf.vertices[0].x, "y": entity.dxf.vertices[0].y, "z": 0.0}
                    except Exception:
                        pass # 좌표 추출 실패 시 기본값 유지

                    self.entities.append(data)
            return True
        except Exception as e:
            print(f"Error parsing DXF: {e}")
            return False

    def generate_intermediate_json(self, output_path, filename):
        data = {
            "metadata": {
                "filename": filename,
                "timestamp": datetime.now().isoformat()
            },
            "entities": self.entities
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"Intermediate JSON saved to: {output_path}")

if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        print("Usage: python dxf_preprocessor.py <input_dxf_path>")
    else:
        input_path = sys.argv[1]
        output_path = input_path.replace(".dxf", "_intermediate.json")
        preprocessor = DXFPreprocessor()
        if preprocessor.load_and_parse(input_path):
            preprocessor.generate_intermediate_json(output_path, os.path.basename(input_path))
```

### 1.2 `extractors/base_extractor.py`
```python
import json
import re
import sys
import os

class BaseExtractor:
    """
    모든 특화된 추출기(Specialized Extractors)의 기본 클래스입니다.
    """
    def __init__(self, input_json_path):
        self.input_json_path = input_json_path
        self.data = None
        self.results = []

    def load_input_json(self):
        try:
            with open(self.input_json_path, 'r', encoding='utf-8') as f:
                self.data = json.load(f)
            return True
        except Exception as e:
            print(f"Error loading JSON: {e}")
            return False

    def apply_regex_pattern(self, pattern):
        if not self.data:
            return

        regex = re.compile(pattern)
        for entity in self.data.get("entities", []):
            content = entity.get("content", "")
            if content:
                match = regex.search(content)
                if match:
                    # 매칭된 정보를 결과 리스트에 추가
                    self.results.append({
                        "tag": match.group(0),
                        "type": entity["type"],
                        "layer": entity["layer"],
                        "content": content,
                        "coordinates": entity["coordinates"]
                    })

    def save_output_json(self, output_path):
        output_data = {
            "source_file": self.data["metadata"]["filename"],
            "extracted_count": len(self.results),
            "results": self.results
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)
        print(f"Extraction results saved to: {output_path}")

if __name__ == "__main__":
    # This block is replaced by specific extractor scripts
    pass
```

### 1.3 `extractors/transmitter_extractor.py` (Example of Specialized Extractor)
```python
import sys
from base_extractor import BaseExtractor

class TransmitterExtractor(BaseExtractor):
    def run(self):
        # Pattern: (FIT|FT|LT|PT|TE) - 123
        pattern = r"(FIT|FT|LT|PT|TE)\s?-\s?\d+"
        self.apply_regex_pattern(pattern)

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python transmitter_extractor.py <input_json_path>")
    else:
        input_path = sys.argv[1]
        output_path = input_path.replace(".json", "_transmitter.json")
        extractor = TransmitterExtractor(input_path)
        if extractor.load_input_json():
            extractor.run()
            extractor.save_output_json(output_path)
```

### 1.4 `extraction_orchestrator.py`
```python
import subprocess
import json
import os
import glob

class ExtractionOrchestrator:
    """
    서브 프로세스들을 병렬로 실행하고 결과를 통합합니다.
    """
    def __init__(self, extractor_scripts):
        self.extractor_scripts = extractor_scripts
        self.processes = []

    def run_parallel_extractors(self, input_json_path):
        print(f"Starting parallel extraction for: {input_json_path}")
        for script in self.extractor_scripts:
            # 각 스크립트를 별도 프로세스로 실행
            p = subprocess.Popen(['python', script, input_json_path])
            self.processes.append(p)

        for p in self.processes:
            p.wait()
        print("All extraction processes completed.")

    def aggregate_results(self, input_json_path, output_master_path):
        master_results = []
        # 모든 _*.json 파일들을 찾아 병합
        pattern = os.path.dirname(input_json_path) + "/*_*.json"
        result_files = glob.glob(pattern)

        for file_path in result_files:
            if os.path.basename(file_path).startswith("intermediate"):
                continue # 원본 중간 파일은 제외

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    res_data = json.load(f)
                    master_results.extend(res_data["results"])
            except Exception as e:
                print(
                    f"Error aggregating {file_path}: {e}"
                )

        final_output = {
            "source_file": os.path.basename(input_json_path),
            "total_extracted": len(master_results),
            "results": master_results
        }

        with open(output_master_path, 'w', encoding='utf-8') as f:
            json.dump(final_output, f, indent=2, ensure_ascii=False)
        print(f"Master extraction result saved to: {output_master_path}")

if __name__ == "__main__":
    # Example usage
    scripts = [
        'extractors/transmitter_extractor.py',
        # 'extractors/valve_extractor.py', ...
    ]
    orchestrator = ExtractionOrchestrator(scripts)
    input_json = "path/to/intermediate.json"
    orchestrator.run_parallel_extractors(input_json)
    orchestrator.aggregate_results(input_json, "master_result.json")
```

### 1.5 `database_integrator.py`
```python
import json
import requests

class DatabaseIntegrator:
    """
    병합된 결과를 .NET Backend API로 전송합니다.
    """
    def __init__(self, api_url):
        self.api_url = api_url

    def send_to_backend(self, master_json_path):
        try:
            with open(master_json_append_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            response = requests.post(self.api_url, json=data)
            if response.status_code == 200:
                print("Successfully sent data to backend.")
            else:
                print(f"Failed to send data. Status: {response.status_code}, Error: {response.text}")
        except Exception as e:
            print(f"Error during integration: {e}")

if __name__ == "__main__":
    API_ENDPOINT = "http://localhost:5000/api/pid/extraction"
    integrator = DatabaseIntegrator(API_ENDPOINT)
    integrator.send_to_backend("master_result.json")
```

## 2. C# Backend Implementation

### 2.1 `PidExtractionController.cs`
```csharp
using Microsoft.AspNetCore.Mvc;
using ExperionCrawler.Core.Application.DTOs;
using ExperionCrawler.Core.Application.Services;

namespace ExperionCrawler.Web.Controllers
{
    [ApiController]
    [Route("api/[controller]")]
    public class PidExtractionController : ControllerBase
    {
        private readonly IPidProcessingService _pidService;

        public PidExtractionController(IPidProcessingService pidService)
        {
            _pidService = pidService;
        }

        [HttpPost("extraction")]
        public async Task<IActionResult> PostExtractionResult([FromBody] ExtractionDto dto)
        {
            if (dto == null) return BadRequest("Invalid data.");

            try
            {
                await _pidService.ProcessAndSave(dto);
                return Ok(new { message = "Extraction data processed successfully." });
            }
            catch (Exception ex)
            {
                return StatusCode(500, $"Internal server error: {ex.Message}");
            }
        }
    }
}
```

### 2.2 `PidProcessingService.cs`
```csharp
using ExperionCrawler.Core.Application.DTOs;
using ExperionCrawler.Core.Application.Interfaces;
using ExperionCrawler.Core.Domain.Entities;

namespace ExperionCrawler.Core.Application.Services
{
    public class PidProcessingService : IPidProcessingService
    {
        private readonly IPidRepository _repository;

        public PidProcessing
        {
            _repository = repository;
        }

        public async Task ProcessAndSave(ExtractionDto dto)
        {
            // 1. Validate DTO
            if (string.IsNullOrEmpty(dto.SourceFile)) throw new ArgumentException("Source file name is required.");

            // 2. Map DTO to Domain Entity
            foreach (var item in dto.Results)
            {
                var equipment = new PidEquipment
                {
                    TagName = item.Tag,
                    Layer = item.Layer,
                    Description = item.Content,
                    SourceFile = dto.SourceFile,
                    CreatedAt = DateTime.UtcNow
                };

                // 3. Save to Database
                await _repository.SaveAsync(equipment);
            }
        }
    }
}
```

### 2.3 `PidRepository.cs`
```csharp
using ExperionCrawler.Core.Application.Interfaces;
using ExperionCrawler.Core.Domain.Entities;
using ExperionCrawler.Infrastructure.Database;

namespace ExperionCrawler.Infrastructure.Repositories
{
    public class PidRepository : IPidRepository
    {
        private readonly ExperionDbContext _context;

        public PidRepository(ExperionDbContext context)
        {
            _context = context;
        }

        public async Task SaveAsync(PidEquipment entity)
        {
            await _context.PidEquipments.AddAsync(entity);
            await _context.SaveChangesAsync();
        }
    }
}
```