#!/usr/bin/env bash
#
# (Re)start the "vllm_qwen35b" Docker container, which runs
# `vllm serve` for the Qwen3.6-35B-A3B-FP8 model with LoRA enabled, then
# poll the server's /v1/models endpoint until it answers.
#
# Usage:  <this-script> [PORT]
#   PORT  TCP port vLLM listens on (default: 8001). The container uses host
#         networking, so this is also the port probed on localhost.
#
# Exit status:
#   0  server answered on /v1/models
#   1  server did not become ready in time, or the container stopped
#   2  invalid PORT argument
set -euo pipefail

readonly NAME="vllm_qwen35b"
readonly PORT="${1:-8001}"
readonly MAX_ATTEMPTS=120   # number of readiness probes
readonly POLL_INTERVAL=5    # seconds slept between probes (120 * 5s = 10 min)
readonly CURL_TIMEOUT=5     # per-request cap so a hung probe cannot stall the loop
readonly LOG_TAIL=50        # container log lines shown on failure

#######################################
# Reject anything that is not a TCP port number before it reaches
# docker/vllm command lines.
# Globals:   PORT (read)
# Outputs:   usage error on stderr
# Returns:   exits 2 on an invalid port
#######################################
validate_port() {
  if [[ ! "$PORT" =~ ^[0-9]{1,5}$ ]] || (( 10#$PORT < 1 || 10#$PORT > 65535 )); then
    printf 'Invalid port: %s (expected an integer in 1-65535)\n' "$PORT" >&2
    exit 2
  fi
}

#######################################
# Start the detached vLLM container.
# Globals:   NAME, PORT (read)
# Outputs:   whatever `docker run -d` prints (the container id)
# Returns:   non-zero if `docker run` fails
#######################################
start_container() {
  # Arguments are kept in an array and handed to the in-container shell as
  # positional parameters, so nothing is re-parsed as shell source and the
  # JSON values need no nested escaping.
  local -a serve_args=(
    /root/ai-models/Qwen3.6-35B-A3B-FP8
    --served-model-name Qwen3.6-35B-A3B-FP8
    --max-model-len 65536
    --max-num-seqs 4
    --gpu-memory-utilization 0.55
    --port "$PORT" --host 0.0.0.0
    --enable-chunked-prefill
    --enable-auto-tool-choice
    --tool-call-parser qwen3_coder
    --reasoning-parser qwen3
    --trust-remote-code
    --kv-cache-dtype fp8
    --default-chat-template-kwargs '{"preserve_thinking": true}'
    --speculative-config '{"method": "qwen3_next_mtp", "num_speculative_tokens": 2}'
    --override-generation-config '{"temperature": 0.6, "top_p": 0.95}'
    --load-format instanttensor
    --enable-lora
    --max-lora-rank 64
    --max-loras 4
    --lora-dtype auto
    -tp 1
  )

  # The image entrypoint is cleared and vllm is exec'd from bash, so the
  # bash process is replaced by vllm inside the container.
  # shellcheck disable=SC2016 # "$@" must be expanded by the container's bash
  docker run -d --name "$NAME" \
    --restart unless-stopped \
    --gpus all --network host --ipc host \
    --ulimit memlock=-1 --ulimit stack=67108864 \
    -v /home/windpacer/.cache/huggingface:/root/.cache/huggingface \
    -v /home/windpacer/.cache/vllm:/root/.cache/vllm \
    -v /home/windpacer/ai-models:/root/ai-models \
    --entrypoint "" \
    vllm-node-tf5 \
    bash -c 'exec vllm serve "$@"' vllm-serve "${serve_args[@]}"
}

#######################################
# Poll /v1/models until the server answers.
# Globals:   NAME, PORT, MAX_ATTEMPTS, POLL_INTERVAL, CURL_TIMEOUT (read)
# Outputs:   progress lines and, on success, the model listing on stdout
# Returns:   0 when ready, 1 on timeout, 2 if the container is gone/stopped
#######################################
wait_for_ready() {
  local url="http://localhost:${PORT}/v1/models"
  local attempt state

  for (( attempt = 1; attempt <= MAX_ATTEMPTS; attempt++ )); do
    if curl -sf --max-time "$CURL_TIMEOUT" "$url" > /dev/null 2>&1; then
      echo "✓ Ready on port ${PORT}"
      # Pretty-printing is best-effort: python3 may be absent on the host.
      curl -s --max-time "$CURL_TIMEOUT" "$url" \
        | python3 -m json.tool 2>/dev/null || true
      return 0
    fi

    # No point waiting the full window if the container no longer exists
    # or has stopped for good.
    state=$(docker inspect -f '{{.State.Status}}' "$NAME" 2>/dev/null) \
      || state="missing"
    case "$state" in
      exited | dead | missing)
        printf '❌ Container %s is not running (state: %s)\n' "$NAME" "$state" >&2
        return 2
        ;;
    esac

    echo " Waiting... (${attempt}/${MAX_ATTEMPTS})"
    sleep "$POLL_INTERVAL"
  done
  return 1
}

main() {
  local rc=0

  validate_port

  echo "Starting Qwen3.6-35B-A3B-FP8 on port ${PORT} (LoRA enabled)..."

  # Remove any previous container with the same name; failure here simply
  # means there was nothing to remove.
  docker rm -f "$NAME" 2>/dev/null || true

  start_container

  echo "Waiting for model to load..."
  wait_for_ready || rc=$?
  if (( rc == 0 )); then
    exit 0
  fi

  if (( rc == 1 )); then
    echo "❌ Failed to start within $(( MAX_ATTEMPTS * POLL_INTERVAL / 60 )) minutes" >&2
  fi
  # Show recent container output to make the failure diagnosable.
  docker logs --tail "$LOG_TAIL" "$NAME" >&2 2>&1 || true
  exit 1
}

main "$@"