Files
ExperionCrawler/scripts/run-vllm-eval-model.sh

47 lines
1.4 KiB
Bash

#!/usr/bin/env bash
# Serve a Phase 0 eval target model (small dense models and the like).
# The 35B-only flags (instanttensor/MTP) are not passed here.
#
# Usage:
#   bash scripts/run-vllm-eval-model.sh <hf_id_or_path> <served-name> [gpu_util] [max_len] [port]
#
# Positional arguments:
#   $1  model HF id or path          (required)
#   $2  served model name            (required)
#   $3  GPU memory utilization       (default: 0.30)
#   $4  maximum model length         (default: 32768)
#   $5  port the server listens on   (default: 8001)
set -o errexit -o nounset -o pipefail

# Required arguments: abort with the given message when missing or empty.
: "${1:?model HF id or path}"
: "${2:?served-model-name}"
MODEL=$1
NAME=$2

# Optional arguments fall back to their defaults when missing or empty.
UTIL=${3:-0.30}
MAXLEN=${4:-32768}
PORT=${5:-8001}

# Name of the Docker container used for the eval server.
CNAME=vllm_eval
# Remove any previous container with this name so the name can be reused.
# Failure (typically "no such container") is deliberately ignored.
docker rm -f "$CNAME" 2>/dev/null || true

# Arguments for `vllm serve`. They are kept in an array and handed to the
# in-container shell as positional parameters, so values containing spaces or
# shell metacharacters are passed through verbatim instead of being re-parsed
# (and possibly executed) by the inner `bash -c`.
serve_args=(
  "$MODEL"
  --served-model-name "$NAME"
  --max-model-len "$MAXLEN"
  --max-num-seqs 8
  --gpu-memory-utilization "$UTIL"
  --port "$PORT" --host 0.0.0.0
  --enable-chunked-prefill
  --enable-auto-tool-choice --tool-call-parser hermes
  --trust-remote-code
  --kv-cache-dtype fp8
  -tp 1
)

# Start the server detached. The image entrypoint is cleared and bash is run
# instead; `exec` replaces that bash with the vllm process. The word after
# the -c script ("vllm-serve") only fills $0 of the inner shell.
docker run -d --name "$CNAME" \
  --gpus all --network host --ipc host \
  --ulimit memlock=-1 --ulimit stack=67108864 \
  -v /home/windpacer/.cache/huggingface:/root/.cache/huggingface \
  -v /home/windpacer/.cache/vllm:/root/.cache/vllm \
  -v /home/windpacer/ai-models:/root/ai-models \
  --entrypoint "" \
  vllm-node-tf5 \
  bash -c 'exec vllm serve "$@"' vllm-serve "${serve_args[@]}"
# Poll the server's /v1/models endpoint until it answers.
# Exits 0 once the endpoint responds, 1 if the container stops or the
# polling budget (48 attempts, 5 s apart = 4 minutes) is exhausted.
echo "Waiting for ${NAME} on :${PORT} ..."
max_attempts=48
poll_interval=5
models_url="http://localhost:${PORT}/v1/models"
for ((attempt = 1; attempt <= max_attempts; attempt++)); do
  # Fetch once and reuse the body for the id listing below.
  if models_json=$(curl -sf "$models_url" 2>/dev/null); then
    echo "✓ Ready: ${NAME}"
    # Best-effort listing of the served model ids; never fails the script
    # (python3 may be missing or the body may not be valid JSON).
    printf '%s' "$models_json" | python3 -m json.tool 2>/dev/null | grep '"id"' || true
    exit 0
  fi
  # Fail fast when the container has already exited (e.g. bad model path)
  # rather than sleeping through the remaining attempts.
  running=$(docker inspect -f '{{.State.Running}}' "$CNAME" 2>/dev/null || true)
  if [[ "$running" != "true" ]]; then
    echo "✗ Container ${CNAME} is not running — docker logs ${CNAME}" >&2
    exit 1
  fi
  sleep "$poll_interval"
done
echo "✗ Timed out — docker logs ${CNAME}" >&2
exit 1