# Qwen3.5-9B-NVFP4 / docker-compose.yaml
# Last commit by hekmon: "Update docker-compose.yaml" (3b9e07b, verified)
# Two-container stack: a vLLM OpenAI-compatible server and a reverse proxy
# that forwards to it. Only the proxy publishes a port on the host.
services:
  # Reverse proxy in front of the vLLM service defined further down.
  qwen35-rp:
    image: 'ghcr.io/iguanesolutions/qwen35-rp:v0.2.0'
    container_name: 'Qwen3.5-ReverseProxy'
    restart: unless-stopped
    ports:
      # Host loopback port 8000 -> container port 9000.
      - '127.0.0.1:8000:9000'
    environment:
      QWEN35RP_LOGLEVEL: 'INFO'
      QWEN35RP_ENFORCE_SAMPLING_PARAMS: 'true'
      # Upstream URL: the vLLM service name below, on its port 8000.
      QWEN35RP_TARGET: 'http://vllm-qwen3.5-9b-nvfp4:8000'
      # Same value as vLLM's --served-model-name below.
      QWEN35RP_SERVED_MODEL_NAME: 'Qwen3.5-9B'
      # Presumably the model names the proxy exposes for each mode;
      # confirm against the qwen35-rp documentation.
      QWEN35RP_THINKING_GENERAL_MODEL: 'Qwen3.5-9B Thinking General'
      QWEN35RP_THINKING_CODING_MODEL: 'Qwen3.5-9B Thinking Coding'
      QWEN35RP_INSTRUCT_GENERAL_MODEL: 'Qwen3.5-9B Instruct General'
      # NOTE(review): the variable says REASONING but the name says
      # "Creative" - confirm this is intentional.
      QWEN35RP_INSTRUCT_REASONING_MODEL: 'Qwen3.5-9B Instruct Creative'
    healthcheck:
      # Probes the proxy's own /health endpoint from inside the container.
      test:
        - 'CMD'
        - 'curl'
        - '-f'
        - 'http://localhost:9000/health'
      interval: 30s
      timeout: 10s
      retries: 3

  # vLLM server; reached by the proxy through the service name on port 8000.
  vllm-qwen3.5-9b-nvfp4:
    image: 'vllm/vllm-openai:v0.18.0-cu130'
    container_name: 'Qwen3.5-9B-NVFP4-vLLM'
    restart: unless-stopped
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this container.
            - driver: nvidia
              count: 1
              capabilities:
                - gpu
    environment:
      # Uses the HF_TOKEN env var if set, otherwise empty.
      HF_TOKEN: '${HF_TOKEN:-}'
      VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: '1'
    # Arguments handed to the image; each flag is followed by its value.
    command:
      # Model reference (presumably a Hugging Face repo id - confirm).
      - 'ig1/Qwen3.5-9B-NVFP4'
      - '--served-model-name'
      - 'Qwen3.5-9B'
      - '--reasoning-parser'
      - 'qwen3'
      - '--enable-auto-tool-choice'
      - '--tool-call-parser'
      - 'qwen3_coder'
      - '--max-model-len'
      - 'auto'
      - '--limit-mm-per-prompt.video'
      - '0'
      - '--max-cudagraph-capture-size'
      - '64'
      - '--max-num-seqs'
      - '64'
      - '--gpu-memory-utilization'
      - '0.8'
    # Left disabled: the proxy above already binds host port 8000, so
    # enabling this mapping as-is would collide with it.
    # ports:
    #   - "127.0.0.1:8000:8000"
    volumes:
      # Host cache directory mounted at /root/.cache - adapt to your host.
      - 'E:\cache:/root/.cache'
    healthcheck:
      # NOTE(review): no start_period is set; if model loading outlasts
      # the retry window the container may report unhealthy during
      # startup - confirm whether that matters for your setup.
      test:
        - 'CMD'
        - 'curl'
        - '-f'
        - 'http://localhost:8000/health'
      interval: 30s
      timeout: 10s
      retries: 3