#!/usr/bin/env bash
set -euo pipefail

# Name given to the Docker container; also used in the troubleshooting hint
# printed when startup times out.
NAME="vllm_qwen8b"
# TCP port the server listens on: first CLI argument, defaulting to 8001.
PORT="${1:-8001}"

#######################################
# Check that a string is a usable TCP port number.
# Arguments: $1 - candidate port
# Returns:   0 if $1 is a decimal integer in 1..65535 with no leading
#            zeros, 1 otherwise
#######################################
is_valid_port() {
  local candidate=${1-}
  # At most five digits, so the arithmetic comparison below cannot overflow.
  [[ "$candidate" =~ ^[1-9][0-9]{0,4}$ ]] || return 1
  (( candidate <= 65535 ))
}

# PORT is handed to the server command and used to build the health-check
# URL, so reject anything that is not a plain port number up front.
if ! is_valid_port "$PORT"; then
  printf 'Invalid port: %s (expected an integer in 1-65535)\n' "$PORT" >&2
  exit 2
fi

echo "Starting Qwen3-8B-FP8 container on port ${PORT}..."

# Options for `docker run` itself. The container is detached, shares the
# host's network and IPC namespaces, and mounts the host-side Hugging Face
# and vLLM cache directories. The empty --entrypoint clears the image's own
# entrypoint so that the command given after the image name is what runs.
docker_opts=(
  -d --name "$NAME"
  --restart unless-stopped
  --gpus all --network host --ipc host
  --ulimit memlock=-1 --ulimit stack=67108864
  -v /home/windpacer/.cache/huggingface:/root/.cache/huggingface
  -v /home/windpacer/.cache/vllm:/root/.cache/vllm
  --entrypoint ""
)

# Arguments for `vllm serve` inside the container, one array element per
# word so that nothing is re-parsed by a shell.
serve_args=(
  Qwen/Qwen3-8B-FP8
  --served-model-name Qwen3-8B-FP8
  --max-model-len 32768
  --max-num-seqs 8
  --gpu-memory-utilization 0.25
  --port "$PORT" --host 0.0.0.0
  --enable-chunked-prefill
  --enable-auto-tool-choice
  --tool-call-parser hermes
  --trust-remote-code
  --kv-cache-dtype fp8
  -tp 1
)

# The in-container script is a fixed, single-quoted literal: "$@" is expanded
# by the container's bash, not by this script. The word after the script
# ("vllm-serve") becomes $0 there; the serve arguments follow as $1, $2, ...
# Passing them as argv instead of splicing them into the script text means a
# malformed PORT can never be interpreted as shell code.
docker run "${docker_opts[@]}" vllm-node-tf5 \
  bash -c 'exec vllm serve "$@"' vllm-serve "${serve_args[@]}"

# Poll the server's /v1/models endpoint until it answers successfully, then
# print the response and exit 0. Exit 1 if it never answers within the
# allotted number of attempts.
echo "Waiting for model to load..."

max_attempts=60   # polls before giving up
poll_interval=5   # seconds slept between polls; also the per-request limit
models_url="http://localhost:${PORT}/v1/models"

for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
  # -f makes HTTP errors count as failure; --max-time stops a connection
  # that is accepted but never answered from stalling the loop indefinitely.
  if models_json=$(curl -sf --max-time "$poll_interval" "$models_url" 2>/dev/null); then
    echo "✓ Ready on port ${PORT}"
    # Pretty-printing is best effort: if python3 is unavailable or the body
    # is not valid JSON, stay silent rather than fail a successful start.
    printf '%s\n' "$models_json" | python3 -m json.tool 2>/dev/null || true
    exit 0
  fi
  echo "  Waiting... (${attempt}/${max_attempts})"
  sleep "$poll_interval"
done

# Diagnostics go to stderr so callers capturing stdout are not polluted.
echo "✗ Timed out. Check: docker logs $NAME" >&2
exit 1