#!/usr/bin/env bash
#
# Start a detached Docker container named "vllm_qwen8b" that runs
# `vllm serve Qwen/Qwen3-8B-FP8`, then poll the server's /v1/models endpoint
# until it answers or the wait budget is exhausted.
#
# Usage: <script> [PORT]
#   PORT  TCP port the server listens on (default: 8001).
#
# Exit status:
#   0  /v1/models answered successfully
#   1  the server did not answer within MAX_ATTEMPTS * POLL_INTERVAL seconds
#   2  PORT is not a valid port number
#   Any failing docker command aborts the script through `set -e`.

set -euo pipefail

readonly NAME="vllm_qwen8b"
readonly PORT="${1:-8001}"
readonly MAX_ATTEMPTS=60   # number of readiness probes before giving up
readonly POLL_INTERVAL=5   # seconds to sleep between probes

# PORT is interpolated into the `bash -c` command string below, so anything
# other than a plain port number would be run as shell code in the container.
if [[ ! "$PORT" =~ ^[1-9][0-9]{0,4}$ ]] || (( PORT > 65535 )); then
  printf 'error: invalid port: %s\n' "$PORT" >&2
  printf 'usage: %s [PORT]\n' "${0##*/}" >&2
  exit 2
fi

readonly MODELS_URL="http://localhost:${PORT}/v1/models"

echo "Starting Qwen3-8B-FP8 container on port ${PORT}..."

# --entrypoint "" clears the image's entrypoint so the `bash -c` command below
# is what actually runs; `exec` replaces that bash with the vllm process.
# The backslash-newlines inside the double-quoted string are removed by this
# shell, so the container receives the vllm invocation as a single line.
docker run -d --name "$NAME" \
  --restart unless-stopped \
  --gpus all --network host --ipc host \
  --ulimit memlock=-1 --ulimit stack=67108864 \
  -v /home/windpacer/.cache/huggingface:/root/.cache/huggingface \
  -v /home/windpacer/.cache/vllm:/root/.cache/vllm \
  --entrypoint "" \
  vllm-node-tf5 \
  bash -c "
    exec vllm serve Qwen/Qwen3-8B-FP8 \
      --served-model-name Qwen3-8B-FP8 \
      --max-model-len 32768 \
      --max-num-seqs 8 \
      --gpu-memory-utilization 0.25 \
      --port ${PORT} --host 0.0.0.0 \
      --enable-chunked-prefill \
      --enable-auto-tool-choice \
      --tool-call-parser hermes \
      --trust-remote-code \
      --kv-cache-dtype fp8 \
      -tp 1
  "

echo "Waiting for model to load..."
for (( attempt = 1; attempt <= MAX_ATTEMPTS; attempt++ )); do
  # Keep the body of the successful probe so it can be displayed without a
  # second request. A failing curl is expected while the server is starting;
  # being the condition of `if`, it does not trip `set -e`.
  if response="$(curl -sf "$MODELS_URL" 2>/dev/null)"; then
    echo "✓ Ready on port ${PORT}"
    # Pretty-printing is best effort: fall back to the raw body when python3
    # is unavailable or the body is not valid JSON.
    python3 -m json.tool <<<"$response" 2>/dev/null \
      || printf '%s\n' "$response"
    exit 0
  fi
  echo " Waiting... (${attempt}/${MAX_ATTEMPTS})"
  sleep "$POLL_INTERVAL"
done

echo "✗ Timed out. Check: docker logs $NAME" >&2
exit 1