Files
ExperionCrawler/src/Core/Application/Services/PidExtractorService.cs
2026-05-09 04:28:10 +09:00

352 lines
14 KiB
C#

using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using ExperionCrawler.Core.Application.DTOs;
using ExperionCrawler.Core.Application.Interfaces;
using ExperionCrawler.Core.Domain.Entities;
using ExperionCrawler.Infrastructure.Database;
using ExperionCrawler.Infrastructure.Mcp;
using Microsoft.EntityFrameworkCore;
using netDxf;
using UglyToad.PdfPig;
namespace ExperionCrawler.Core.Application.Services;
/// <summary>
/// Extracts P&amp;ID tag information from DXF/PDF drawings via the MCP client,
/// links the extracted tags to Experion realtime points and stores the result as
/// <see cref="PidEquipment"/> rows. Also offers paging, statistics and CSV/Excel
/// export over the stored rows.
/// </summary>
public class PidExtractorService : IPidExtractorService
{
    /// <summary>Confidence at or above this value counts as "high" (also the mapping acceptance bar).</summary>
    private const double HighConfidenceThreshold = 0.7;

    /// <summary>Confidence below this value counts as "low".</summary>
    private const double LowConfidenceThreshold = 0.5;

    /// <summary>Number format applied to the extraction timestamp column of the Excel export.</summary>
    private const string ExcelDateFormat = "yyyy-mm-dd hh:mm:ss";

    // Shared serializer options: building a new instance per call discards the serializer's metadata cache.
    private static readonly JsonSerializerOptions JsonOptions = new() { PropertyNameCaseInsensitive = true };

    // P&ID tag pattern (unanchored, so it matches anywhere in a line):
    //  - single-letter equipment tags: P-10101, T-10100, E-10119, C-10111
    //  - multi-letter instrument tags: FCV-101, FICQ-6113, PSV-6203
    //  - compound tags:                VG-6203-15A-F1A-n, CD-10513-40A
    private static readonly Regex PidTagPattern =
        new(@"[A-Z]{1,6}-\d{2,6}(-[A-Z0-9]+)*", RegexOptions.Compiled);

    private static readonly string[] ExcelHeaders =
    {
        "태그번호", "장비명", "계기유형", "라인번호", "도면번호", "신뢰도", "상태", "추출일시", "Experion 태그"
    };

    private readonly McpClient _mcp;
    private readonly ExperionDbContext _dbContext;
    private readonly ILogger<PidExtractorService> _logger;

    public PidExtractorService(McpClient mcp, ExperionDbContext dbContext, ILogger<PidExtractorService> logger)
    {
        _mcp = mcp;
        _dbContext = dbContext;
        _logger = logger;
    }

    /// <summary>
    /// Opens <paramref name="filePath"/> and runs the extraction on its content.
    /// </summary>
    /// <param name="filePath">Path of a .dxf or .pdf drawing.</param>
    /// <param name="useImageMode">Forwarded to <see cref="ExtractFromStreamAsync"/>.</param>
    /// <returns>Counts of stored items, see <see cref="ExtractFromStreamAsync"/>.</returns>
    public async Task<PidExtractionResult> ExtractFromFileAsync(string filePath, bool useImageMode = false)
    {
        await using var stream = File.OpenRead(filePath);
        return await ExtractFromStreamAsync(stream, Path.GetFileName(filePath), useImageMode);
    }

    /// <summary>
    /// Extracts text from the drawing, asks the MCP client for P&amp;ID tags and for
    /// tag-to-Experion mappings, and saves one <see cref="PidEquipment"/> row per extracted tag.
    /// </summary>
    /// <param name="stream">Drawing content.</param>
    /// <param name="fileName">Original file name; only its extension (.dxf / .pdf) selects the text extractor.</param>
    /// <param name="useImageMode">Accepted for interface compatibility; this implementation does not read it.</param>
    /// <returns>
    /// Total stored rows plus how many have confidence &gt;= 0.7 and &lt; 0.5.
    /// All zeros when no text or no tags were found.
    /// </returns>
    /// <exception cref="NotSupportedException">The extension is neither .dxf nor .pdf.</exception>
    public async Task<PidExtractionResult> ExtractFromStreamAsync(Stream stream, string fileName, bool useImageMode = false)
    {
        var ext = Path.GetExtension(fileName).ToLowerInvariant();
        string text = ext switch
        {
            ".dxf" => ExtractDxfText(stream),
            ".pdf" => ExtractPdfText(stream),
            _ => throw new NotSupportedException("지원 형식: .dxf .pdf (스캔본 이미지는 Vision 모드 필요)")
        };

        if (string.IsNullOrWhiteSpace(text))
        {
            return new PidExtractionResult(0, 0, 0);
        }

        // MCP -> vLLM: tag extraction
        var sourceType = ext.TrimStart('.');
        var json = await _mcp.ExtractPidTagsAsync(text, sourceType);
        var extractedItems = ParseJson(json);
        if (extractedItems.Count == 0)
        {
            _logger.LogWarning("P&ID 추출 결과 0건 — 파일: {FileName}", fileName);
            return new PidExtractionResult(0, 0, 0);
        }

        // MCP -> vLLM: mapping suggestions between P&ID tags and Experion tag names
        var pidTagNos = extractedItems.Select(i => i.TagNo).Distinct().ToList();
        var experionTagNames = await _dbContext.RealtimePoints.Select(r => r.TagName).ToListAsync();
        var mappingJson = await _mcp.MatchPidTagsAsync(pidTagNos, experionTagNames);
        var mappings = ParseMappingJson(mappingJson);

        // Build the rows; one timestamp for the whole batch so all rows of a run agree.
        var now = DateTime.UtcNow;
        var dbItems = new List<PidEquipment>(extractedItems.Count);
        foreach (var item in extractedItems)
        {
            // A suggested mapping wins; otherwise fall back to a name-based lookup.
            var experionTag = mappings.TryGetValue(item.TagNo, out var matched)
                ? await _dbContext.RealtimePoints.FirstOrDefaultAsync(r => r.TagName == matched)
                : await FindFallbackTagAsync(item.TagNo);

            dbItems.Add(new PidEquipment
            {
                TagNo = item.TagNo,
                EquipmentName = item.EquipmentName,
                InstrumentType = item.InstrumentType,
                LineNumber = item.LineNumber,
                PidDrawingNo = item.PidDrawingNo,
                Confidence = item.Confidence,
                ExperionTagId = experionTag?.Id,
                ExtractedAt = now,
                UpdatedAt = now
            });
        }

        await _dbContext.PidEquipment.AddRangeAsync(dbItems);
        await _dbContext.SaveChangesAsync();
        _logger.LogInformation("P&ID 추출 완료: {Total}건 저장 (파일: {FileName})", dbItems.Count, fileName);

        return new PidExtractionResult(
            TotalCount: dbItems.Count,
            ConfidenceItems: dbItems.Count(i => i.Confidence >= HighConfidenceThreshold),
            LowConfidenceItems: dbItems.Count(i => i.Confidence < LowConfidenceThreshold));
    }

    /// <summary>
    /// Copies the stream to a temporary .dxf file, loads it with netDxf and returns the
    /// tag-like lines found in TEXT/MTEXT entities and block attribute definitions.
    /// </summary>
    private string ExtractDxfText(Stream stream)
    {
        // Path.GetTempFileName() creates a zero-byte file on disk; appending ".dxf" to its
        // name used to leave that file behind. Build a unique path without creating anything.
        var tmp = Path.Combine(Path.GetTempPath(), $"{Guid.NewGuid():N}.dxf");
        try
        {
            using (var fs = File.Create(tmp))
            {
                stream.CopyTo(fs);
            }

            var doc = DxfDocument.Load(tmp);
            var sb = new StringBuilder();
            foreach (var txt in doc.Entities.Texts)
            {
                sb.AppendLine(txt.Value);
            }

            foreach (var mtxt in doc.Entities.MTexts)
            {
                sb.AppendLine(mtxt.PlainText());
            }

            foreach (var blk in doc.Blocks)
            {
                foreach (var attr in blk.AttributeDefinitions.Values)
                {
                    sb.AppendLine(attr.Value);
                }
            }

            // Only tag-related lines are forwarded to the MCP server.
            return FilterDxfText(sb.ToString());
        }
        finally
        {
            if (File.Exists(tmp))
            {
                File.Delete(tmp);
            }
        }
    }

    /// <summary>
    /// Keeps only the lines of <paramref name="text"/> that contain a P&amp;ID tag pattern.
    /// Dropping unrelated text reduces MCP server load and helps avoid JSON parsing errors.
    /// </summary>
    /// <returns>The matching lines, trimmed and joined with '\n'.</returns>
    private static string FilterDxfText(string text)
    {
        var tagLines = text
            .Split('\n')
            .Select(line => line.Trim())
            .Where(line => PidTagPattern.IsMatch(line));
        return string.Join("\n", tagLines);
    }

    /// <summary>Concatenates the text of every page of the PDF, one page per line block.</summary>
    private string ExtractPdfText(Stream stream)
    {
        using var pdf = PdfDocument.Open(stream);
        var sb = new StringBuilder();
        foreach (var page in pdf.GetPages())
        {
            sb.AppendLine(page.Text);
        }

        return sb.ToString();
    }

    /// <summary>
    /// Parses the tag-extraction response. Accepts either
    /// <c>{"success": ..., "count": ..., "tags": [...]}</c> or a bare array <c>[...]</c>.
    /// </summary>
    /// <returns>
    /// The parsed items with a non-blank <c>TagNo</c>; an empty list when the payload is
    /// missing, malformed or has neither shape (a warning is logged in those cases).
    /// </returns>
    private List<ExtractedItem> ParseJson(string json)
    {
        // Guard against a null response so the preview in the catch block cannot throw.
        var payload = json ?? string.Empty;
        try
        {
            using var doc = JsonDocument.Parse(payload);
            var root = doc.RootElement;

            // TryGetProperty throws InvalidOperationException on non-object elements, so the
            // value kind has to be checked first; otherwise the bare-array shape never parses.
            JsonElement tagsElement = root.ValueKind switch
            {
                JsonValueKind.Array => root,
                JsonValueKind.Object when root.TryGetProperty("tags", out var nested) => nested,
                _ => default
            };

            if (tagsElement.ValueKind != JsonValueKind.Array)
            {
                _logger.LogWarning("P&ID JSON 파싱 실패: 'tags' 필드 또는 배열 형식 없음");
                return new List<ExtractedItem>();
            }

            var items = JsonSerializer.Deserialize<List<ExtractedItem>>(tagsElement.GetRawText(), JsonOptions)
                        ?? new List<ExtractedItem>();

            // JSON nulls can produce null entries or a null TagNo; both would break the
            // dictionary lookup and DB insert further down the pipeline.
            return items
                .Where(i => i is not null && !string.IsNullOrWhiteSpace(i.TagNo))
                .ToList();
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "P&ID JSON 파싱 실패: {Msg} / raw: {Raw}",
                ex.Message, payload[..Math.Min(200, payload.Length)]);
            return new List<ExtractedItem>();
        }
    }

    /// <summary>
    /// Parses the mapping response, either a bare array or
    /// <c>{"success": ..., "count": ..., "mappings": [...]}</c>.
    /// </summary>
    /// <returns>
    /// P&amp;ID tag -> Experion tag name, restricted to entries with confidence &gt;= 0.7 and
    /// non-empty names. When a P&amp;ID tag appears more than once the highest-confidence
    /// entry is kept. Empty on malformed input (a warning is logged).
    /// </returns>
    private Dictionary<string, string> ParseMappingJson(string json)
    {
        var result = new Dictionary<string, string>();
        try
        {
            using var doc = JsonDocument.Parse(json ?? string.Empty);
            var root = doc.RootElement;

            JsonElement arrayEl = root.ValueKind switch
            {
                JsonValueKind.Array => root,
                JsonValueKind.Object when root.TryGetProperty("mappings", out var nested) => nested,
                _ => default
            };

            if (arrayEl.ValueKind != JsonValueKind.Array)
            {
                return result;
            }

            var list = JsonSerializer.Deserialize<List<MappingItem>>(arrayEl.GetRawText(), JsonOptions)
                       ?? new List<MappingItem>();

            var accepted = list
                .Where(m => m is not null
                            && m.Confidence >= HighConfidenceThreshold
                            && !string.IsNullOrEmpty(m.PidTag)
                            && !string.IsNullOrEmpty(m.ExperionTag))
                .OrderByDescending(m => m.Confidence);

            // TryAdd instead of ToDictionary: a duplicate PidTag used to throw, and the
            // catch-all then discarded every mapping, including the valid ones.
            foreach (var mapping in accepted)
            {
                result.TryAdd(mapping.PidTag, mapping.ExperionTag!);
            }

            return result;
        }
        catch (Exception ex)
        {
            // Best effort: extraction continues with name-based fallback matching only.
            _logger.LogWarning(ex, "P&ID 매핑 JSON 파싱 실패: {Msg}", ex.Message);
            return new Dictionary<string, string>();
        }
    }

    /// <summary>
    /// Name-based fallback: looks for a realtime point whose name equals the part of
    /// <paramref name="tagNo"/> before the first '.', or starts with that part followed by '.'.
    /// </summary>
    /// <returns>The first matching point, or null when none matches or the tag has no usable prefix.</returns>
    private async Task<RealtimePoint?> FindFallbackTagAsync(string tagNo)
    {
        var normalized = tagNo.Split('.')[0];
        if (string.IsNullOrEmpty(normalized))
        {
            // An empty prefix would match any point whose name starts with '.'.
            return null;
        }

        return await _dbContext.RealtimePoints
            .FirstOrDefaultAsync(t => t.TagName == normalized
                                      || t.TagName.StartsWith(normalized + "."));
    }

    /// <summary>
    /// Returns one page of stored equipment, newest extraction first.
    /// </summary>
    /// <param name="tagNo">Optional substring filter on the tag number; null/empty disables it.</param>
    /// <param name="page">1-based page number; values below 1 are treated as 1.</param>
    /// <param name="pageSize">Rows per page; negative values are treated as 0.</param>
    /// <returns>Total number of rows matching the filter, and the rows of the requested page.</returns>
    public async Task<(int Total, IEnumerable<PidEquipment> Items)> GetEquipmentAsync(
        string? tagNo, int page, int pageSize)
    {
        // Negative Skip/Take values are not valid paging arguments; clamp instead of failing.
        var safePage = Math.Max(page, 1);
        var safePageSize = Math.Max(pageSize, 0);

        var query = _dbContext.PidEquipment.AsQueryable();
        if (!string.IsNullOrEmpty(tagNo))
        {
            query = query.Where(e => e.TagNo.Contains(tagNo));
        }

        var total = await query.CountAsync();

        // Id is a tiebreaker: rows of one extraction run share a timestamp, and paging
        // over a non-unique sort key can repeat or skip rows between pages.
        var items = await query
            .OrderByDescending(e => e.ExtractedAt)
            .ThenByDescending(e => e.Id)
            .Skip((safePage - 1) * safePageSize)
            .Take(safePageSize)
            .ToListAsync();

        return (total, items);
    }

    /// <summary>Loads one equipment row together with its linked Experion tag, or null if absent.</summary>
    public async Task<PidEquipment?> GetByIdAsync(long id)
        => await _dbContext.PidEquipment.Include(e => e.ExperionTag).FirstOrDefaultAsync(e => e.Id == id);

    /// <summary>Overwrites the confidence of the row with the given id; does nothing when it does not exist.</summary>
    public async Task UpdateConfidenceAsync(long id, double confidence)
    {
        var entity = await _dbContext.PidEquipment.FindAsync(id);
        if (entity is null)
        {
            return;
        }

        entity.Confidence = confidence;
        entity.UpdatedAt = DateTime.UtcNow;
        await _dbContext.SaveChangesAsync();
    }

    /// <summary>Marks the row as active; does nothing when it does not exist.</summary>
    public Task ActivateAsync(long id) => SetActiveAsync(id, true);

    /// <summary>Marks the row as inactive; does nothing when it does not exist.</summary>
    public Task DeactivateAsync(long id) => SetActiveAsync(id, false);

    /// <summary>Shared implementation of <see cref="ActivateAsync"/> and <see cref="DeactivateAsync"/>.</summary>
    private async Task SetActiveAsync(long id, bool isActive)
    {
        var entity = await _dbContext.PidEquipment.FindAsync(id);
        if (entity is null)
        {
            return;
        }

        entity.IsActive = isActive;
        entity.UpdatedAt = DateTime.UtcNow;
        await _dbContext.SaveChangesAsync();
    }

    /// <summary>Number of stored equipment rows.</summary>
    public Task<int> GetTotalCountAsync() => _dbContext.PidEquipment.CountAsync();

    /// <summary>Number of rows with confidence &gt;= 0.7.</summary>
    public Task<int> GetConfidenceItemsCountAsync()
        => _dbContext.PidEquipment.CountAsync(e => e.Confidence >= HighConfidenceThreshold);

    /// <summary>Number of rows with confidence &lt; 0.5.</summary>
    public Task<int> GetLowConfidenceItemsCountAsync()
        => _dbContext.PidEquipment.CountAsync(e => e.Confidence < LowConfidenceThreshold);

    /// <summary>Number of distinct drawing numbers among the stored rows.</summary>
    public Task<int> GetDrawingCountAsync()
        => _dbContext.PidEquipment.Select(e => e.PidDrawingNo).Distinct().CountAsync();

    /// <summary>
    /// Counts rows per confidence band. The counts are computed by the database instead of
    /// loading every row into memory.
    /// </summary>
    /// <returns>A dictionary with the keys "High (&gt;=0.7)", "Medium (0.5-0.7)" and "Low (&lt;0.5)".</returns>
    public async Task<IDictionary<string, int>> GetConfidenceDistributionAsync()
    {
        // Awaited one after another: a DbContext does not support concurrent operations.
        var high = await _dbContext.PidEquipment
            .CountAsync(e => e.Confidence >= HighConfidenceThreshold);
        var medium = await _dbContext.PidEquipment
            .CountAsync(e => e.Confidence >= LowConfidenceThreshold && e.Confidence < HighConfidenceThreshold);
        var low = await _dbContext.PidEquipment
            .CountAsync(e => e.Confidence < LowConfidenceThreshold);

        return new Dictionary<string, int>
        {
            ["High (>=0.7)"] = high,
            ["Medium (0.5-0.7)"] = medium,
            ["Low (<0.5)"] = low
        };
    }

    /// <summary>
    /// Renders the items as CSV text with a header row.
    /// </summary>
    /// <param name="items">Rows to export.</param>
    /// <returns>The CSV document; numbers are written culture-invariantly.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="items"/> is null.</exception>
    public Task<string> ExportToCsvAsync(IEnumerable<PidEquipment> items)
    {
        ArgumentNullException.ThrowIfNull(items);

        var sb = new StringBuilder();
        sb.AppendLine("TagNo,EquipmentName,InstrumentType,LineNumber,PidDrawingNo,Confidence,IsActive,ExtractedAt,ExperionTagId");
        foreach (var i in items)
        {
            // Invariant formatting: under a decimal-comma culture "0,85" would split into two columns.
            sb.AppendLine(FormattableString.Invariant(
                $"{Csv(i.TagNo)},{Csv(i.EquipmentName)},{Csv(i.InstrumentType)},{Csv(i.LineNumber)},{Csv(i.PidDrawingNo)},{i.Confidence},{i.IsActive},{i.ExtractedAt:O},{i.ExperionTagId}"));
        }

        return Task.FromResult(sb.ToString());
    }

    /// <summary>
    /// Escapes one CSV field: values containing a comma, quote or line break are wrapped in
    /// double quotes with embedded quotes doubled; null/empty becomes an empty field.
    /// </summary>
    private static string Csv(string? v)
    {
        if (string.IsNullOrEmpty(v))
        {
            return "";
        }

        // '\r' is checked as well: a bare carriage return also breaks the row for CSV readers.
        var needsQuoting = v.Contains(',') || v.Contains('"') || v.Contains('\n') || v.Contains('\r');
        return needsQuoting ? $"\"{v.Replace("\"", "\"\"")}\"" : v;
    }

    /// <summary>
    /// Renders the items as an .xlsx workbook with a single "P&amp;ID Equipment" sheet.
    /// </summary>
    /// <param name="items">Rows to export.</param>
    /// <returns>The workbook bytes.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="items"/> is null.</exception>
    public Task<byte[]> ExportToExcelAsync(IEnumerable<PidEquipment> items)
    {
        ArgumentNullException.ThrowIfNull(items);

        using var package = new OfficeOpenXml.ExcelPackage();
        var worksheet = package.Workbook.Worksheets.Add("P&ID Equipment");

        // Header row
        for (var col = 0; col < ExcelHeaders.Length; col++)
        {
            worksheet.Cells[1, col + 1].Value = ExcelHeaders[col];
        }

        var row = 2;
        foreach (var item in items)
        {
            worksheet.Cells[row, 1].Value = item.TagNo;
            worksheet.Cells[row, 2].Value = item.EquipmentName ?? "";
            worksheet.Cells[row, 3].Value = item.InstrumentType ?? "";
            worksheet.Cells[row, 4].Value = item.LineNumber ?? "";
            worksheet.Cells[row, 5].Value = item.PidDrawingNo ?? "";
            worksheet.Cells[row, 6].Value = item.Confidence;
            worksheet.Cells[row, 7].Value = item.IsActive ? "활성" : "비활성";
            worksheet.Cells[row, 8].Value = item.ExtractedAt;
            // Give the date cell an explicit date format so it is displayed as a date
            // rather than as a raw serial number.
            worksheet.Cells[row, 8].Style.Numberformat.Format = ExcelDateFormat;
            worksheet.Cells[row, 9].Value = item.ExperionTag?.TagName ?? "";
            row++;
        }

        // The bytes are materialized before the package is disposed at the end of this scope.
        return Task.FromResult(package.GetAsByteArray());
    }
}
// ── Internal parsing models ──────────────────────────────────────────────────
/// <summary>
/// One P&amp;ID tag entry as deserialized from the tag-extraction JSON response.
/// </summary>
public class ExtractedItem
{
    /// <summary>Tag number read from the drawing; empty string until set.</summary>
    public string TagNo { get; set; } = string.Empty;

    /// <summary>Equipment name, if the response provided one.</summary>
    public string? EquipmentName { get; set; }

    /// <summary>Instrument type, if the response provided one.</summary>
    public string? InstrumentType { get; set; }

    /// <summary>Line number, if the response provided one.</summary>
    public string? LineNumber { get; set; }

    /// <summary>Drawing number, if the response provided one.</summary>
    public string? PidDrawingNo { get; set; }

    /// <summary>Extraction confidence; starts at 0.5 unless a value is assigned.</summary>
    public double Confidence { get; set; } = 0.5;
}
/// <summary>
/// One suggested link between a P&amp;ID tag and an Experion tag name, as deserialized
/// from the tag-matching JSON response.
/// </summary>
public class MappingItem
{
    /// <summary>P&amp;ID tag the suggestion applies to; empty string until set.</summary>
    public string PidTag { get; set; } = string.Empty;

    /// <summary>Suggested Experion tag name, or null when none was suggested.</summary>
    public string? ExperionTag { get; set; }

    /// <summary>Confidence of the suggestion; 0 unless a value is assigned.</summary>
    public double Confidence { get; set; }
}