# Start the gemma4-dflash vLLM server in a detached Docker container.
#
# Requires HF_TOKEN (a Hugging Face access token) in the calling shell. It is
# forwarded to the container by name (`-e HF_TOKEN`), so the secret is neither
# stored in this file nor visible on the docker command line.
# NOTE(review): a token was previously hard-coded here; treat it as leaked and
# revoke it.
: "${HF_TOKEN:?set HF_TOKEN to a Hugging Face access token before running}"
export HF_TOKEN

# Script executed inside the container by `bash -c`.
# The heredoc delimiter is quoted, so the text is taken literally: the JSON
# arguments keep their double quotes and each backslash-newline is a line
# continuation for the inner shell. The runtime patch script runs first, then
# `exec` replaces that shell with the vllm process.
# NOTE(review): a failing patch step does not stop the server from starting;
# confirm whether that is intended.
container_script=$(cat <<'EOF'
python3 /opt/gemma4-dflash-spark-vllm/scripts/patch_vllm_gb10_gemma4_dflash_runtime.py
exec vllm serve google/gemma-4-31B-it \
  --host 0.0.0.0 \
  --port 8000 \
  --served-model-name google/gemma-4-31B-it-vllm-fp8-dflash-16k \
  --trust-remote-code \
  --max-model-len 16384 \
  --gpu-memory-utilization 0.80 \
  --quantization fp8 \
  --tensor-parallel-size 1 \
  --max-num-batched-tokens 16384 \
  --enforce-eager \
  --speculative-config '{"model":"RedHatAI/gemma-4-31B-it-speculator.dflash","num_speculative_tokens":8,"method":"dflash"}' \
  --limit-mm-per-prompt '{"image":0,"video":0}' \
  --enable-auto-tool-choice \
  --tool-call-parser hermes
EOF
)

# --entrypoint "" clears the image's entrypoint so the bash command below is
# what runs. The host Hugging Face cache is bind-mounted into the container.
docker run -d --name gemma4-dflash \
  --gpus all --network host --ipc host \
  --ulimit memlock=-1 --ulimit stack=67108864 \
  -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
  -e PORT=8000 \
  -e SERVED_MODEL_NAME=google/gemma-4-31B-it-vllm-fp8-dflash-16k \
  -e HF_TOKEN \
  -e VLLM_DISABLE_COMPILE_CACHE=1 \
  --entrypoint "" \
  gemma4-dflash:arm64 \
  bash -c "$container_script"