2026년 4월 30일 Stable State

2026-04-30 08:16:21 +09:00
parent c0f32177bf
commit fb11359b4c
41 changed files with 7977 additions and 88 deletions
--- a/bench_qwen3.py
+++ b/bench_qwen3.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""
+Output-token throughput benchmark for Qwen3-Coder-Next-FP8.
+- Receives the response in streaming mode and times it as it arrives
+- Final speed is computed from usage.completion_tokens
+"""
+
+import time
+import sys
+from openai import OpenAI
+
+VLLM_BASE_URL = "http://localhost:8000/v1"  # base_url handed to the OpenAI client
+VLLM_MODEL    = "Qwen/Qwen3-Coder-Next-FP8"  # model name sent with the request
+
+# ── Benchmark prompt: a code-writing task, sent as the user message ───────────
+PROMPT = """\
+Python으로 다음 조건을 만족하는 TTL-LRU 캐시 클래스를 작성해줘.
+
+요구사항:
+1. `capacity` (최대 항목 수)와 `ttl_seconds` (항목 유효 시간)를 생성자에서 받는다.
+2. `get(key)` — 없거나 만료된 항목은 None 반환.
+3. `set(key, value)` — 캐시가 가득 차면 가장 오래된 항목을 제거한다.
+4. `delete(key)` — 명시적 삭제.
+5. `size()` — 현재 유효한 항목 수 반환 (만료된 항목 제외).
+6. 스레드 안전해야 한다 (threading.Lock 사용).
+7. 클래스 하단에 동작을 검증하는 `if __name__ == '__main__':` 테스트 코드를 포함한다.
+
+추가 조건:
+- 외부 라이브러리 사용 금지 (표준 라이브러리만).
+- 타입 힌트를 모든 메서드에 명시한다.
+- 각 메서드에 한 줄 docstring을 작성한다.
+"""
+
+def run_benchmark():
+    client = OpenAI(base_url=VLLM_BASE_URL, api_key="dummy")
+
+    print(f"모델  : {VLLM_MODEL}")
+    print(f"프롬프트 길이: {len(PROMPT)} chars")
+    print("=" * 60)
+    print()
+
+    # ── 스트리밍 요청 ──────────────────────────────────────────────
+    stream = client.chat.completions.create(
+        model=VLLM_MODEL,
+        messages=[
+            {
+                "role": "system",
+                "content": "당신은 숙련된 Python 개발자입니다. 명확하고 실용적인 코드를 작성합니다.",
+            },
+            {"role": "user", "content": PROMPT},
+        ],
+        max_tokens=2048,
+        temperature=0.1,
+        stream=True,
+        stream_options={"include_usage": True},  # 마지막 청크에 usage 포함
+    )
+
+    # ── 스트리밍 수신 + 측정 ────────────────────────────────────────
+    first_token_time = None
+    start_time       = time.perf_counter()
+    char_count       = 0
+    completion_tokens = 0
+    full_text        = []
+
+    for chunk in stream:
+        # usage 청크 (마지막)
+        if chunk.usage:
+            completion_tokens = chunk.usage.completion_tokens
+
+        if not chunk.choices:
+            continue
+
+        delta = chunk.choices[0].delta
+        if delta.content:
+            if first_token_time is None:
+                first_token_time = time.perf_counter()
+                ttft = first_token_time - start_time
+                print(f"[TTFT: {ttft:.3f}s] ", end="", flush=True)
+
+            sys.stdout.write(delta.content)
+            sys.stdout.flush()
+            full_text.append(delta.content)
+            char_count += len(delta.content)
+
+    end_time = time.perf_counter()
+
+    # ── 결과 출력 ──────────────────────────────────────────────────
+    total_time    = end_time - start_time
+    gen_time      = end_time - (first_token_time or start_time)
+    tps_wall      = completion_tokens / total_time if total_time > 0 else 0
+    tps_gen       = completion_tokens / gen_time   if gen_time   > 0 else 0
+
+    print()
+    print()
+    print("=" * 60)
+    print(f"총 출력 토큰 : {completion_tokens:,}")
+    print(f"총 소요 시간 : {total_time:.2f}s")
+    print(f"생성 시간    : {gen_time:.2f}s  (첫 토큰 이후)")
+    print(f"TTFT         : {(first_token_time or start_time) - start_time:.3f}s")
+    print(f"토큰 속도    : {tps_gen:.1f} tok/s  (생성 구간)")
+    print(f"토큰 속도    : {tps_wall:.1f} tok/s  (전체 구간, TTFT 포함)")
+    print("=" * 60)
+
+
+if __name__ == "__main__":  # run the benchmark only when executed as a script
+    run_benchmark()