2026년 4월 30일 Stable State

2026-04-30 08:16:21 +09:00
parent c0f32177bf
commit fb11359b4c
41 changed files with 7977 additions and 88 deletions
--- a/bench_qwen3_rag.py
+++ b/bench_qwen3_rag.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""
+Qwen3-Coder-Next-FP8 RAG 연동 벤치마크
+- Qdrant 코드베이스 + OPC UA 문서에서 컨텍스트 수집
+- 수집된 실제 코드/문서 기반으로 복잡한 신규 기능 구현 요청
+- 스트리밍으로 토큰/초 측정
+"""
+
+import time
+import sys
+import httpx
+from openai import OpenAI
+
+VLLM_BASE_URL = "http://localhost:8000/v1"
+VLLM_MODEL    = "Qwen/Qwen3-Coder-Next-FP8"
+OLLAMA_URL    = "http://localhost:11434"
+EMBED_MODEL   = "nomic-embed-text"
+QDRANT_URL    = "http://localhost:6333"
+COL_CODEBASE  = "ws-65f457145aee80b2"
+COL_OPC_DOCS  = "experion-opc-docs"
+
+
+def embed(text: str) -> list[float]:
+    with httpx.Client(timeout=30) as c:
+        r = c.post(f"{OLLAMA_URL}/api/embeddings", json={"model": EMBED_MODEL, "prompt": text})
+        r.raise_for_status()
+        return r.json()["embedding"]
+
+
+def search(collection: str, query: str, top_k: int = 5) -> list[dict]:
+    vec = embed(query)
+    with httpx.Client(timeout=20) as c:
+        r = c.post(
+            f"{QDRANT_URL}/collections/{collection}/points/search",
+            json={"vector": vec, "limit": top_k, "with_payload": True},
+        )
+        r.raise_for_status()
+        return r.json()["result"]
+
+
+def fmt_hits(hits: list[dict], label: str) -> str:
+    chunks = []
+    for i, h in enumerate(hits, 1):
+        p = h["payload"]
+        src = p.get("file_path") or p.get("source") or p.get("filename") or "unknown"
+        text = p.get("text") or p.get("content") or p.get("chunk") or str(p)
+        score = h.get("score", 0)
+        chunks.append(f"[{label} #{i} | {src} | score={score:.3f}]\n{text}")
+    return "\n\n".join(chunks)
+
+
+def run_benchmark():
+    client = OpenAI(base_url=VLLM_BASE_URL, api_key="dummy")
+
+    # ── RAG 컨텍스트 수집 ──────────────────────────────────────────────────────
+    print("RAG 검색 중...")
+    t0 = time.perf_counter()
+
+    # 코드베이스: 실시간 서비스 구조 + DB 저장 패턴
+    hits_realtime = search(COL_CODEBASE, "ExperionRealtimeService FlushLoop subscription MonitoredItem", top_k=4)
+    hits_db       = search(COL_CODEBASE, "ExperionDbContext history snapshot PostgreSQL EF Core", top_k=3)
+
+    # OPC UA 문서: 알람/이벤트 관련
+    hits_alarm    = search(COL_OPC_DOCS, "alarm event notification EventNotifier condition OPC UA", top_k=4)
+
+    rag_time = time.perf_counter() - t0
+    total_hits = len(hits_realtime) + len(hits_db) + len(hits_alarm)
+    print(f"검색 완료: {total_hits}개 청크 ({rag_time:.2f}s)")
+    print()
+
+    ctx_realtime = fmt_hits(hits_realtime, "코드베이스/Realtime")
+    ctx_db       = fmt_hits(hits_db,       "코드베이스/DB")
+    ctx_alarm    = fmt_hits(hits_alarm,    "OPC UA 문서/Alarm")
+
+    # ── 프롬프트 구성 ──────────────────────────────────────────────────────────
+    prompt = f"""\
+아래는 ExperionCrawler 프로젝트의 실제 코드와 OPC UA 공식 문서 발췌입니다.
+이 컨텍스트를 기반으로 새로운 기능을 구현해줘.
+
+━━━ 코드베이스 컨텍스트 ━━━
+
+{ctx_realtime}
+
+{ctx_db}
+
+━━━ OPC UA 문서 컨텍스트 ━━━
+
+{ctx_alarm}
+
+━━━ 구현 요청 ━━━
+
+위 컨텍스트를 바탕으로 ExperionAlarmService를 C#으로 구현해줘.
+
+요구사항:
+1. `IHostedService` + `IExperionAlarmService` 패턴 (기존 ExperionRealtimeService와 동일한 구조).
+2. OPC UA `EventNotifier` 방식으로 알람/이벤트를 구독한다.
+   구독 대상 EventType: ConditionType, AlarmConditionType (OPC UA 표준).
+3. 이벤트 수신 시 다음 정보를 `alarm_history` PostgreSQL 테이블에 저장한다:
+   - `id` (bigserial), `tagname`, `event_type`, `severity` (int), `message`, `active` (bool), `occurred_at` (timestamptz)
+4. 기존 `ExperionDbContext` / EF Core 패턴을 따른다 (새 DbSet 추가).
+5. 컨트롤러 `ExperionAlarmController` — start/stop/status + 최근 알람 조회 (GET /api/alarm/recent?limit=50).
+6. `appsettings.json`에 `AlarmServer` 섹션 추가 (NodeId 목록, MaxSeverityFilter).
+7. 각 클래스/메서드에 한 줄 XML 문서 주석 포함.
+
+코드는 완성된 형태로 작성하고, 파일별로 명확히 구분해줘.
+"""
+
+    prompt_chars = len(prompt)
+    print(f"프롬프트 길이: {prompt_chars:,} chars (RAG 컨텍스트 포함)")
+    print(f"모델: {VLLM_MODEL}")
+    print("=" * 60)
+    print()
+
+    # ── 스트리밍 LLM 요청 ──────────────────────────────────────────────────────
+    stream = client.chat.completions.create(
+        model=VLLM_MODEL,
+        messages=[
+            {
+                "role": "system",
+                "content": (
+                    "당신은 C#/.NET 백엔드와 OPC UA 프로토콜 전문가입니다. "
+                    "ExperionCrawler 프로젝트의 기존 코드 스타일과 패턴을 그대로 따르며 "
+                    "완성도 높은 코드를 작성합니다."
+                ),
+            },
+            {"role": "user", "content": prompt},
+        ],
+        max_tokens=4096,
+        temperature=0.1,
+        stream=True,
+        stream_options={"include_usage": True},
+    )
+
+    # ── 스트리밍 수신 + 측정 ────────────────────────────────────────────────────
+    first_token_time  = None
+    start_time        = time.perf_counter()
+    completion_tokens = 0
+
+    for chunk in stream:
+        if chunk.usage:
+            completion_tokens = chunk.usage.completion_tokens
+        if not chunk.choices:
+            continue
+        delta = chunk.choices[0].delta
+        if delta.content:
+            if first_token_time is None:
+                first_token_time = time.perf_counter()
+                ttft = first_token_time - start_time
+                print(f"[TTFT: {ttft:.3f}s] ", end="", flush=True)
+            sys.stdout.write(delta.content)
+            sys.stdout.flush()
+
+    end_time = time.perf_counter()
+
+    # ── 결과 출력 ──────────────────────────────────────────────────────────────
+    total_time = end_time - start_time
+    gen_time   = end_time - (first_token_time or start_time)
+    tps_gen    = completion_tokens / gen_time   if gen_time   > 0 else 0
+    tps_wall   = completion_tokens / total_time if total_time > 0 else 0
+
+    print()
+    print()
+    print("=" * 60)
+    print(f"RAG 검색 시간 : {rag_time:.2f}s  ({total_hits}개 청크)")
+    print(f"총 출력 토큰  : {completion_tokens:,}")
+    print(f"총 소요 시간  : {total_time:.2f}s")
+    print(f"생성 시간     : {gen_time:.2f}s  (첫 토큰 이후)")
+    print(f"TTFT          : {(first_token_time or start_time) - start_time:.3f}s")
+    print(f"토큰 속도     : {tps_gen:.1f} tok/s  (생성 구간)")
+    print(f"토큰 속도     : {tps_wall:.1f} tok/s  (전체 구간)")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    run_benchmark()