#!/usr/bin/env bash
#
# Starts the Qwen3.6-35B-A3B-FP8 vLLM server in a Docker container and waits
# until its /v1/models endpoint responds.
#
# Usage: <this-script> [PORT]
#   PORT  TCP port the server listens on (default: 8001).
set -euo pipefail

#######################################
# Checks that a string is a usable TCP port number.
# Arguments: $1 - candidate port
# Returns:   0 if $1 is a decimal integer in 1-65535 without leading zeros,
#            non-zero otherwise
#######################################
validate_port() {
  local candidate=$1
  # Digits only, at most five of them: keeps shell metacharacters out of the
  # value and keeps the arithmetic comparison below from overflowing.
  [[ "$candidate" =~ ^[1-9][0-9]{0,4}$ ]] || return 1
  (( candidate <= 65535 ))
}

NAME="vllm_qwen35b"
PORT="${1:-8001}"

# PORT is later embedded in the container command line and in URLs, so reject
# anything that is not a plain port number before touching Docker.
if ! validate_port "$PORT"; then
  printf 'error: invalid port %q (expected an integer in 1-65535)\n' "$PORT" >&2
  exit 2
fi

echo "Starting Qwen3.6-35B-A3B-FP8 on port ${PORT} (LoRA enabled)..."

# Remove any previous container with the same name. Failure (typically because
# no such container exists) is deliberately ignored.
docker rm -f "$NAME" 2>/dev/null || true

# Shell code executed inside the container. The heredoc delimiter is quoted,
# so nothing is expanded on the host: the JSON arguments need no backslash
# escaping, and the port arrives as positional parameter $1 instead of being
# spliced into the command text.
launch_script=$(cat <<'EOF'
exec vllm serve /root/ai-models/Qwen3.6-35B-A3B-FP8 \
  --served-model-name Qwen3.6-35B-A3B-FP8 \
  --max-model-len 65536 \
  --max-num-seqs 4 \
  --gpu-memory-utilization 0.55 \
  --port "$1" --host 0.0.0.0 \
  --enable-chunked-prefill \
  --enable-auto-tool-choice \
  --tool-call-parser qwen3_coder \
  --reasoning-parser qwen3 \
  --trust-remote-code \
  --kv-cache-dtype fp8 \
  --default-chat-template-kwargs '{"preserve_thinking": true}' \
  --speculative-config '{"method": "qwen3_next_mtp", "num_speculative_tokens": 2}' \
  --override-generation-config '{"temperature": 0.6, "top_p": 0.95}' \
  --load-format instanttensor \
  --enable-lora \
  --max-lora-rank 64 \
  --max-loras 4 \
  --lora-dtype auto \
  -tp 1
EOF
)

# Start the server detached, with all GPUs, host networking and host IPC.
# Three host directories (Hugging Face cache, vLLM cache, model store) are
# bind-mounted under /root. The image entrypoint is cleared so that the
# `bash -c` command below is what runs; "vllm-launch" becomes $0 of that inner
# shell and ${PORT} becomes its $1.
docker run -d --name "$NAME" \
  --restart unless-stopped \
  --gpus all --network host --ipc host \
  --ulimit memlock=-1 --ulimit stack=67108864 \
  -v /home/windpacer/.cache/huggingface:/root/.cache/huggingface \
  -v /home/windpacer/.cache/vllm:/root/.cache/vllm \
  -v /home/windpacer/ai-models:/root/ai-models \
  --entrypoint "" \
  vllm-node-tf5 \
  bash -c "$launch_script" vllm-launch "$PORT"

# Poll the server's /v1/models endpoint until it answers successfully.
# 120 attempts with a 5 second pause between them gives the roughly
# 10 minute budget named in the failure message.
max_attempts=120
poll_interval=5
models_url="http://localhost:${PORT}/v1/models"

echo "Waiting for model to load..."
for ((i = 1; i <= max_attempts; i++)); do
  # --max-time bounds each probe so a connection that is accepted but never
  # answered cannot stall the loop beyond the overall budget.
  if models_json=$(curl -sf --max-time 5 "$models_url" 2>/dev/null); then
    echo "✓ Ready on port ${PORT}"
    # Pretty-printing is best-effort: a missing python3 or a non-JSON body
    # must not turn a successful start into a failure.
    printf '%s\n' "$models_json" | python3 -m json.tool 2>/dev/null || true
    exit 0
  fi
  echo "  Waiting... (${i}/${max_attempts})"
  sleep "$poll_interval"
done

# Timed out: report on stderr and show the tail of the container log so the
# cause is visible without a separate `docker logs` call.
echo "❌ Failed to start within 10 minutes" >&2
echo "Last log lines from container ${NAME}:" >&2
docker logs --tail 50 "$NAME" >&2 || true
exit 1