#!/usr/bin/env bash
# Serve a Phase 0 eval target model (small dense models, etc.) with vLLM
# inside a Docker container, then wait until its HTTP API answers.
# None of the 35B-specific flags (instanttensor/MTP) are used here.
#
# Usage:
#   bash scripts/run-vllm-eval-model.sh <model> <served-model-name> \
#       [gpu_util] [max_len] [port]
#
# Arguments:
#   $1  model              HF id or path of the model (required)
#   $2  served-model-name  name passed to --served-model-name (required)
#   $3  gpu_util           --gpu-memory-utilization value (default: 0.30)
#   $4  max_len            --max-model-len value          (default: 32768)
#   $5  port               port the server listens on     (default: 8001)
#
# Exit status:
#   0  GET /v1/models succeeded within the polling window
#   1  the container stopped early or the polling window elapsed
set -euo pipefail

MODEL="${1:?model HF id or path}"
NAME="${2:?served-model-name}"
UTIL="${3:-0.30}"
MAXLEN="${4:-32768}"
PORT="${5:-8001}"
CNAME="vllm_eval"

# Readiness polling: 48 attempts x 5 s = 240 s total.
readonly POLL_ATTEMPTS=48
readonly POLL_INTERVAL_SEC=5
readonly MODELS_URL="http://localhost:${PORT}/v1/models"

# Remove any previous container with the same name. Failure is deliberately
# ignored: it simply means no such container exists yet.
docker rm -f "$CNAME" 2>/dev/null || true

# Arguments for `vllm serve`, kept as an array so each value reaches the
# server as exactly one word, whatever characters it contains.
vllm_args=(
  "$MODEL"
  --served-model-name "$NAME"
  --max-model-len "$MAXLEN"
  --max-num-seqs 8
  --gpu-memory-utilization "$UTIL"
  --port "$PORT" --host 0.0.0.0
  --enable-chunked-prefill
  --enable-auto-tool-choice --tool-call-parser hermes
  --trust-remote-code
  --kv-cache-dtype fp8
  -tp 1
)

# The image entrypoint is cleared so the explicit bash command is what runs.
# The inner script is single-quoted and receives the vLLM arguments as
# positional parameters ("$@"), so user-supplied values are never re-parsed
# as shell code inside the container. The word after the script ("vllm-serve")
# only fills $0.
docker run -d --name "$CNAME" \
  --gpus all --network host --ipc host \
  --ulimit memlock=-1 --ulimit stack=67108864 \
  -v /home/windpacer/.cache/huggingface:/root/.cache/huggingface \
  -v /home/windpacer/.cache/vllm:/root/.cache/vllm \
  -v /home/windpacer/ai-models:/root/ai-models \
  --entrypoint "" \
  vllm-node-tf5 \
  bash -c 'exec vllm serve "$@"' vllm-serve "${vllm_args[@]}"

echo "Waiting for ${NAME} on :${PORT} ..."
for ((attempt = 1; attempt <= POLL_ATTEMPTS; attempt++)); do
  if curl -sf "$MODELS_URL" >/dev/null 2>&1; then
    echo "✓ Ready: ${NAME}"
    # Best-effort listing of the served model ids; a failure here (e.g. no
    # python3 on the host) must not turn a successful start into an error.
    curl -s "$MODELS_URL" | python3 -m json.tool 2>/dev/null | grep '"id"' || true
    exit 0
  fi

  # Stop waiting as soon as the container is no longer running instead of
  # polling a dead server until the timeout.
  running="$(docker inspect -f '{{.State.Running}}' "$CNAME" 2>/dev/null || true)"
  if [[ "$running" != "true" ]]; then
    echo "✗ Container stopped before becoming ready — docker logs ${CNAME}" >&2
    exit 1
  fi

  sleep "$POLL_INTERVAL_SEC"
done

echo "✗ Timed out — docker logs ${CNAME}" >&2
exit 1