#!/usr/bin/env bash
# Serve a Phase 0 eval target model (small dense models, etc.) with vLLM in
# Docker. None of the 35B-only flags (instanttensor/MTP) are used here.
#
# Usage:
#   bash scripts/run-vllm-eval-model.sh <hf_id_or_path> <served-name> [gpu_util] [max_len] [port]
#
# Arguments:
#   $1  model HF id or path           (required)
#   $2  served model name             (required)
#   $3  value for --gpu-memory-utilization (default: 0.30)
#   $4  value for --max-model-len          (default: 32768)
#   $5  port the server listens on         (default: 8001)
set -euo pipefail

# Positional arguments; ${N:?msg} aborts with msg when a required one is
# missing or empty, ${N:-default} falls back to the default.
MODEL="${1:?model HF id or path}"
NAME="${2:?served-model-name}"
UTIL="${3:-0.30}"
MAXLEN="${4:-32768}"
PORT="${5:-8001}"

# Fixed container name, so only one eval container exists at a time.
CNAME="vllm_eval"

# Best-effort removal of a container with the same name from a previous run.
# Errors are intentionally silenced and ignored so the script continues
# whether or not such a container exists.
docker rm -f "$CNAME" 2>/dev/null || true

# Arguments for `vllm serve`. They are kept in an array and handed to the
# container as separate argv entries, so values containing spaces or shell
# metacharacters are never re-parsed by a shell (the previous version spliced
# them into a `bash -c "..."` string, which allowed word splitting and command
# injection via the model/name arguments).
vllm_args=(
  "${MODEL}"
  --served-model-name "${NAME}"
  --max-model-len "${MAXLEN}"
  --max-num-seqs 8
  --gpu-memory-utilization "${UTIL}"
  --port "${PORT}" --host 0.0.0.0
  --enable-chunked-prefill
  --enable-auto-tool-choice --tool-call-parser hermes
  --trust-remote-code
  --kv-cache-dtype fp8
  -tp 1
)

# Start the server detached under the fixed container name.
#   --network host : the server's port is reachable on the host directly.
#   --entrypoint "": bypass the image's entrypoint and run the command below.
# NOTE(review): the bind-mounted host paths are hard-coded to
# /home/windpacer — confirm this is intended for every machine that runs this.
# The bash wrapper is kept; "$@" expands to vllm_args exactly as given and
# `exec` replaces bash with the vllm process.
docker run -d --name "$CNAME" \
  --gpus all --network host --ipc host \
  --ulimit memlock=-1 --ulimit stack=67108864 \
  -v /home/windpacer/.cache/huggingface:/root/.cache/huggingface \
  -v /home/windpacer/.cache/vllm:/root/.cache/vllm \
  -v /home/windpacer/ai-models:/root/ai-models \
  --entrypoint "" \
  vllm-node-tf5 \
  bash -c 'exec vllm serve "$@"' bash "${vllm_args[@]}"

# Poll the server's /v1/models endpoint until it answers successfully.
# Up to 48 attempts, 5 s apart (about 4 minutes). Exits 0 once the server
# responds, 1 on timeout or if the container stops before becoming ready.
echo "Waiting for ${NAME} on :${PORT} ..."
for ((attempt = 1; attempt <= 48; attempt++)); do
  # -f makes curl fail on HTTP errors, so success means a usable response.
  if models_json="$(curl -sf "http://localhost:${PORT}/v1/models" 2>/dev/null)"; then
    echo "✓ Ready: ${NAME}"
    # Best-effort pretty-print of the served model ids; never fails the script.
    printf '%s\n' "${models_json}" | python3 -m json.tool 2>/dev/null | grep '"id"' || true
    exit 0
  fi

  # Fail fast: if the container is no longer running there is no point in
  # waiting out the remaining attempts.
  container_running="$(docker inspect -f '{{.State.Running}}' "${CNAME}" 2>/dev/null || true)"
  if [[ "${container_running}" != "true" ]]; then
    echo "✗ Container ${CNAME} is not running — docker logs ${CNAME}" >&2
    exit 1
  fi

  sleep 5
done

echo "✗ Timed out — docker logs ${CNAME}" >&2
exit 1