#!/usr/bin/env python3
"""
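Output-token throughput benchmark for the vLLM-served model named in VLLM_MODEL.
- Receives the response in streaming mode and times it as it arrives (TTFT, total duration)
- Computes the final tokens/sec figures from usage.completion_tokens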
"""

import time
import sys
from openai import OpenAI

# Base URL passed to the OpenAI client as `base_url`.
VLLM_BASE_URL = "http://localhost:8000/v1"
# Model identifier sent in the `model` field of the chat-completion request.
VLLM_MODEL    = "Qwen3.6-27B-FP8"

# ── Example code-writing prompt ────────────────────────────────────────────────
# Sent verbatim as the user message. It is a runtime string, so its text is
# kept exactly as-is (Korean): it asks for a thread-safe TTL-LRU cache class
# with type hints, one-line docstrings and a `__main__` self-test.
PROMPT = """\
Python으로 다음 조건을 만족하는 TTL-LRU 캐시 클래스를 작성해줘.

요구사항:
1. `capacity` (최대 항목 수)와 `ttl_seconds` (항목 유효 시간)를 생성자에서 받는다.
2. `get(key)` — 없거나 만료된 항목은 None 반환.
3. `set(key, value)` — 캐시가 가득 차면 가장 오래된 항목을 제거한다.
4. `delete(key)` — 명시적 삭제.
5. `size()` — 현재 유효한 항목 수 반환 (만료된 항목 제외).
6. 스레드 안전해야 한다 (threading.Lock 사용).
7. 클래스 하단에 동작을 검증하는 `if __name__ == '__main__':` 테스트 코드를 포함한다.

추가 조건:
- 외부 라이브러리 사용 금지 (표준 라이브러리만).
- 타입 힌트를 모든 메서드에 명시한다.
- 각 메서드에 한 줄 docstring을 작성한다.
"""

def run_benchmark():
    """Stream one chat completion and print token-throughput statistics.

    Sends PROMPT to the model VLLM_MODEL at VLLM_BASE_URL with streaming
    enabled, echoes the generated text to stdout as it arrives, and then
    prints a summary: completion token count, total wall time, generation
    time (after the first content chunk), time-to-first-token (TTFT), and
    tokens/sec over both the generation window and the whole request.

    The token count is taken from the ``usage`` object of the stream (requested
    via ``stream_options={"include_usage": True}``). If no usage is received,
    a warning is written to stderr and the reported speeds are 0.

    Returns:
        None. All results are printed.

    Raises:
        Any exception raised by the OpenAI client (e.g. connection or API
        errors) propagates to the caller unchanged.
    """
    client = OpenAI(base_url=VLLM_BASE_URL, api_key="dummy")

    print(f"모델  : {VLLM_MODEL}")
    print(f"프롬프트 길이: {len(PROMPT)} chars")
    print("=" * 60)
    print()

    # ── Streaming request ──────────────────────────────────────────
    # Start the clock BEFORE issuing the request: the time spent inside
    # create() (sending the request and waiting for the response to begin)
    # is part of the latency being benchmarked, so TTFT and total time must
    # include it.
    start_time = time.perf_counter()
    stream = client.chat.completions.create(
        model=VLLM_MODEL,
        messages=[
            {
                "role": "system",
                "content": "당신은 숙련된 Python 개발자입니다. 명확하고 실용적인 코드를 작성합니다.",
            },
            {"role": "user", "content": PROMPT},
        ],
        max_tokens=2048,
        temperature=0.1,
        stream=True,
        stream_options={"include_usage": True},  # ask for usage in the stream
    )

    # ── Receive the stream and measure ─────────────────────────────
    first_token_time = None
    completion_tokens = 0

    for chunk in stream:
        # The usage-bearing chunk may carry no choices, so read usage before
        # the empty-choices check below.
        if chunk.usage:
            completion_tokens = chunk.usage.completion_tokens

        if not chunk.choices:
            continue

        delta = chunk.choices[0].delta
        if delta.content:
            if first_token_time is None:
                # First chunk that carries visible content marks TTFT.
                first_token_time = time.perf_counter()
                print(
                    f"[TTFT: {first_token_time - start_time:.3f}s] ",
                    end="",
                    flush=True,
                )

            # Echo generated text immediately so progress is visible.
            sys.stdout.write(delta.content)
            sys.stdout.flush()

    end_time = time.perf_counter()

    # ── Report ─────────────────────────────────────────────────────
    # If no content ever arrived, fall back to start_time so TTFT is 0 and
    # the generation window equals the total window.
    first_or_start = first_token_time or start_time
    total_time = end_time - start_time
    gen_time = end_time - first_or_start
    tps_wall = completion_tokens / total_time if total_time > 0 else 0
    tps_gen = completion_tokens / gen_time if gen_time > 0 else 0

    print()
    print()
    if completion_tokens == 0:
        # Without usage data the speeds below are meaningless zeros; say so
        # instead of silently reporting 0 tok/s.
        print(
            "경고: 서버가 usage 정보를 반환하지 않아 토큰 속도를 계산할 수 없습니다.",
            file=sys.stderr,
        )
    print("=" * 60)
    print(f"총 출력 토큰 : {completion_tokens:,}")
    print(f"총 소요 시간 : {total_time:.2f}s")
    print(f"생성 시간    : {gen_time:.2f}s  (첫 토큰 이후)")
    print(f"TTFT         : {first_or_start - start_time:.3f}s")
    print(f"토큰 속도    : {tps_gen:.1f} tok/s  (생성 구간)")
    print(f"토큰 속도    : {tps_wall:.1f} tok/s  (전체 구간, TTFT 포함)")
    print("=" * 60)


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    run_benchmark()