Files
ExperionCrawler/mcp-server/training/probe_8b_vs_35b.py

160 lines
6.9 KiB
Python

#!/usr/bin/env python3
"""8B vs 35B invention probe — captures both content and tool_calls.
Passes tool definitions to vLLM and includes thinking-off handling for the 35B model.
"""
import json
import os
import re
import sys

from openai import OpenAI
# ── Tool definitions (the tool signatures actually used by production opencode) ──
def _tool(name, description, properties, required=()):
    """Build one OpenAI-style function-tool entry with an object parameter schema."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": list(required),
            },
        },
    }


# Descriptions stay in Korean: they are sent verbatim to the models.
TOOLS = [
    _tool(
        "find_tags",
        "태그 검색 (query/area/sub_area)",
        {
            "query": {"type": "string"},
            "area": {"type": "string"},
            "sub_area": {"type": "string"},
            "top_k": {"type": "integer"},
        },
    ),
    _tool(
        "get_tag_metadata",
        "태그 메타데이터 조회",
        {
            "query": {"type": "string"},
            "limit": {"type": "integer"},
        },
        required=["query"],
    ),
    _tool(
        "trace_connections",
        "장비 연결 경로 추적",
        {
            "start_tag": {"type": "string"},
            "direction": {"type": "string", "enum": ["upstream", "downstream"]},
            "max_depth": {"type": "integer"},
        },
        required=["start_tag"],
    ),
    _tool(
        "active_alarms",
        "활성 알람 조회",
        {
            "area": {"type": "string"},
            "limit": {"type": "integer"},
        },
    ),
    _tool(
        "generate_status_report",
        "운전 상태 종합 보고서",
        {
            "area": {"type": "string"},
            "hours": {"type": "integer"},
        },
    ),
    _tool(
        "query_pv_history",
        "과거 PV 히스토리",
        {
            "tag_names": {"type": "array", "items": {"type": "string"}},
            "time_from": {"type": "string"},
            "time_to": {"type": "string"},
        },
        required=["tag_names", "time_from", "time_to"],
    ),
    _tool(
        "summarize_events",
        "이벤트 히스토리 요약",
        {
            "area": {"type": "string"},
            "hours": {"type": "integer"},
            "event_type": {"type": "string"},
        },
    ),
    _tool(
        "search_kb",
        "지식 베이스 검색",
        {
            "query": {"type": "string"},
            "collection_keys": {"type": "array", "items": {"type": "string"}},
        },
        required=["query"],
    ),
]
# System prompt sent with every probe. It stays in Korean because it is passed
# verbatim to the models. In summary it tells the assistant: you are the
# P6 (PGMEA) plant operation assistant; never fabricate facts (answer
# '확인 불가' when unsure); never guess tags/identifiers (verify with find_tags
# first); follow the listed area format / valid area list (no P7); and doubt
# your own arguments when a tool returns an empty result.
SYS = "".join(
    line + "\n"
    for line in (
        "당신은 P6(PGMEA) 플랜트 운전 어시스턴트다.",
        "원칙:",
        "- 사실 지어내기 금지. 모르거나 DB·도구 결과에 없으면 '확인 불가'.",
        "- 사용자가 명시 안 한 태그/식별자 추측 금지. 불확실 시 find_tags 로 먼저 검증.",
        "- area는 'P[숫자](-[숫자])?' 형식. valid: P1,P2,P3,P4,P5,P6,P8,P9,P10,UTIL,PACKING (P7 없음).",
        "- 외부 도구가 빈 결과면 자기 인자 의심.",
    )
)
# Probe set: (probe tag, user question) pairs. The question text is Korean and
# is sent to the model unchanged as the user message.
PROBES = [
    # Asks for the raw-material feed route of "plant 6-1".
    (
        "원료-invention",
        "6-1차 플랜트 원료 투입 경로 알려줘",
    ),
    # Asks for a current operating status report of "plant 6-1".
    (
        "area-형식-invention",
        "6-1차 플랜트 현재 운전 상황 보고해줘",
    ),
    # Asks for active alarms of "plant 7" (the system prompt says P7 does not exist).
    (
        "abstain-P7",
        "7차 플랜트 활성 알람 알려줘",
    ),
    # Asks for the next maintenance date of pump p-6102.
    (
        "abstain-no-maintenance",
        "p-6102 펌프 다음 정비 일정 언제야?",
    ),
    # Asks how to interpret ficq-6113 with SP=50 and PV=30 (range 0~2000 kg/hr).
    (
        "scaffold",
        "ficq-6113 SP=50 인데 PV=30이야. 어떻게 봐야 해? (range 0~2000 kg/hr)",
    ),
]
# ── Flag regexes ──
# Identifier shapes treated as invented tags (case-insensitive).
INV_TAG = re.compile(
    r'\b(rm-\d+|raw_material_input|Plant_\d|Feed_Pump_\d)\b',
    re.IGNORECASE,
)
# "area" arguments written without the "P" prefix.
# NOTE(review): the second alternative rejects a closing quote or "-" right
# after the 6, so a bare `"area": "6"` is NOT matched — confirm this is intended.
BAD_AREA = re.compile(
    r'"area"\s*:\s*"6-1"|"area"\s*:\s*"6\b(?!-|")',
    re.IGNORECASE,
)
# Parameter names that none of the declared tools define.
FAKE_PARAM = re.compile(
    r'\b(tag_type|tag_category|tag_class)\b',
    re.IGNORECASE,
)
# Substrings counted as a refusal / "cannot confirm" style answer.
REFUSE_KW = [
    "확인 불가",
    "정보 없음",
    "존재하지 않",
    "판정 불가",
    "없습니다",
    "없어",
    "지원하지 않습니다",
]
# All five of these labels must appear for the scaffold flag.
SCAFFOLD_KW = [
    "제어변수",
    "현재값",
    "설정치",
    "제약",
    "판단",
]
def capture(msg):
    """Flatten an assistant message into one text blob for flag matching.

    The text content (if any) comes first, followed by one JSON line per tool
    call of the form ``{"name": ..., "arguments": ...}``.

    Args:
        msg: Chat-completion message object exposing ``content`` and,
            optionally, ``tool_calls`` whose items have ``function.name`` and
            ``function.arguments``.

    Returns:
        The parts joined with newlines; an empty string when the message has
        neither content nor tool calls.
    """
    parts = []
    if msg.content:
        parts.append(msg.content)
    for tc in getattr(msg, "tool_calls", None) or []:
        raw_args = tc.function.arguments
        # The arguments arrive as a JSON-encoded string. Dumping that string
        # as-is would escape its inner quotes (\"area\": \"6-1\"), so patterns
        # that look for literal `"area": "..."` could never match a tool call.
        # Decode first; keep the raw value if it is not valid JSON.
        try:
            args = json.loads(raw_args)
        except (TypeError, ValueError):
            args = raw_args
        parts.append(json.dumps(
            {"name": tc.function.name, "arguments": args},
            ensure_ascii=False,
        ))
    return "\n".join(parts)
def flags(out):
    """Return the list of flag labels that apply to a captured model output.

    Args:
        out: Text produced by ``capture`` (content plus serialized tool calls).

    Returns:
        Labels in a fixed order: "INV-tag", "BAD-area", "FAKE-param",
        "refused", "find_tags-first", "5라벨". Only labels whose check hits
        are included.
    """
    checks = (
        ("INV-tag", INV_TAG.search(out) is not None),
        ("BAD-area", BAD_AREA.search(out) is not None),
        ("FAKE-param", FAKE_PARAM.search(out) is not None),
        # Any single refusal keyword is enough.
        ("refused", any(keyword in out for keyword in REFUSE_KW)),
        # Case-insensitive mention of the find_tags tool anywhere in the output.
        ("find_tags-first", 'find_tags' in out.lower()),
        # Every scaffold label must be present.
        ("5라벨", all(keyword in out for keyword in SCAFFOLD_KW)),
    )
    return [label for label, hit in checks if hit]
def probe(url, model, label):
    """Run every entry of PROBES against one endpoint and flag each answer.

    Args:
        url: Base URL of the OpenAI-compatible server.
        model: Model name passed in the request.
        label: Short display label; also used (together with ``model``) to
            decide whether the 35B thinking-off request option is added.

    Returns:
        A list with one dict per probe: ``{"tag", "flags", "out"}``. When a
        request fails, ``out`` holds ``"(error: ...)"`` and ``flags`` is
        ``["error"]``.
    """
    kwargs = {"model": model, "messages": [], "tools": TOOLS,
              "max_tokens": 2048, "temperature": 0, "seed": 42}
    if "35B" in label or "35B" in model:
        # Per-request template options go under "chat_template_kwargs";
        # "default_chat_template_kwargs" is a server-side setting, not a
        # request field. "enable_thinking": False is the switch that turns
        # thinking off; "preserve_thinking" is kept from the original request.
        kwargs["extra_body"] = {"chat_template_kwargs": {
            "enable_thinking": False,
            "preserve_thinking": False,
        }}
    c = OpenAI(base_url=url, api_key="dummy")
    print(f"\n========== {label} ({model}) ==========")
    rs = []
    for tag, q in PROBES:
        kwargs["messages"] = [
            {"role": "system", "content": SYS},
            {"role": "user", "content": q},
        ]
        try:
            r = c.chat.completions.create(**kwargs)
            out = capture(r.choices[0].message)
        except Exception as e:
            # Best effort: one failed request must not abort the whole run.
            # The error text is not model output, so it is not scanned for
            # invention/refusal flags; it gets an explicit marker instead.
            out = f"(error: {e})"
            ff = ["error"]
        else:
            ff = flags(out)
        print(f" [{tag}] {'·'.join(ff) or '(none)'}")
        print(f" {(out[:280] or '(empty)').strip()}")
        rs.append({"tag": tag, "flags": ff, "out": out})
    return rs
# Probe both local endpoints (35B on port 8001, 8B on port 8002), then print
# a side-by-side table of the flags each model triggered per probe.
r35 = probe("http://localhost:8001/v1", "Qwen3.6-35B-A3B-FP8", "35B")
r08 = probe("http://localhost:8002/v1", "Qwen3-8B", "8B")

print("\n========== 비교 요약 ==========")
print(f"{'probe':<26} | {'35B':<32} | {'8B':<32}")
print("-"*96)
for row35, row08 in zip(r35, r08):
    # "-" stands in for a probe that raised no flags.
    flags35 = '·'.join(row35['flags']) or '-'
    flags08 = '·'.join(row08['flags']) or '-'
    print(f"{row35['tag']:<26} | {flags35:<32} | {flags08:<32}")
def inv_rate(rs):
    """Count probe results that carry at least one invention flag.

    Args:
        rs: List of result dicts, each with a ``'flags'`` entry.

    Returns:
        ``(hits, total)`` where ``hits`` is the number of results flagged
        with 'INV-tag', 'BAD-area' or 'FAKE-param' and ``total`` is ``len(rs)``.
    """
    invention_flags = ('INV-tag', 'BAD-area', 'FAKE-param')
    hits = 0
    for result in rs:
        if any(name in result['flags'] for name in invention_flags):
            hits += 1
    return hits, len(rs)
# Invention counts per model as (flagged, total) pairs.
i35 = inv_rate(r35)
i08 = inv_rate(r08)
print(f"\ninvention(태그·area·param 합성) — 35B: {i35[0]}/{i35[1]} | 8B: {i08[0]}/{i08[1]}")

# Output path: first CLI argument if given, otherwise the default below
# (relative to the current working directory).
out_path = sys.argv[1] if len(sys.argv) > 1 else "training/probe_8b_vs_35b_result.json"
results = {
    "35B": r35, "8B": r08,
    "invention_rate": {"35B": f"{i35[0]}/{i35[1]}", "8B": f"{i08[0]}/{i08[1]}"},
    "probe_config": {"tools_defined": len(TOOLS), "max_tokens": 2048, "temperature": 0, "seed": 42}
}
# Create the target directory first: without it open() raises
# FileNotFoundError and the results of the finished probe run are lost.
out_dir = os.path.dirname(out_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print(f"\n→ saved {out_path}")