Agreed, the out-of-the-box (OOB) experience kind of sucks.
Here is the magic (assuming a 4-GPU node)...
# Pull the latest ROCm vLLM nightly image and drop into an interactive shell.
# NOTE(review): --privileged already grants all capabilities and exposes all
# host devices, so the --cap-add, --device, and --security-opt flags below are
# redundant with it; they are kept so the command keeps working if
# --privileged is ever removed.
# Host paths are parameterized; the defaults match the original command.
docker run -it --rm \
  --pull=always \
  --ipc=host \
  --network=host \
  --privileged \
  --cap-add=CAP_SYS_ADMIN \
  --device=/dev/kfd \
  --device=/dev/dri \
  --device=/dev/mem \
  --group-add render \
  --cap-add=SYS_PTRACE \
  --security-opt seccomp=unconfined \
  -v "${HOST_DATA_DIR:-/home/hotaisle}:/mnt/data" \
  -v "${HOST_MODEL_CACHE:-/root/.cache}:/mnt/model" \
  rocm/vllm-dev:nightly
# Inside the container: redirect /root/.cache (where Hugging Face / vLLM
# store downloaded weights) to the host-mounted /mnt/model so models persist
# across container restarts.
# Idempotent: if /root/.cache is already a symlink from a previous run, skip —
# otherwise the mv would relocate the symlink itself and ln would create a
# nested /mnt/model/.cache link.
if [ ! -L /root/.cache ]; then
  mv /root/.cache /root/.cache.foo   # keep the original cache as a backup
  ln -s /mnt/model /root/.cache
fi
# Serve GLM-4.7-FP8 across 4 GPUs (tensor parallel) with ROCm AITER kernels
# enabled. FP8 weights + FP8 KV cache; MTP speculative decoding with one
# speculative token; tool-calling and reasoning parsers for the GLM family.
serve_args=(
  serve zai-org/GLM-4.7-FP8
  --tensor-parallel-size 4
  --kv-cache-dtype fp8
  --quantization fp8
  --enable-auto-tool-choice
  --tool-call-parser glm47
  --reasoning-parser glm45
  --load-format fastsafetensors
  --enable-expert-parallel
  --allowed-local-media-path /
  --speculative-config.method mtp
  --speculative-config.num_speculative_tokens 1
  --mm-encoder-tp-mode data
)
VLLM_ROCM_USE_AITER=1 vllm "${serve_args[@]}"