---
# Docker Compose stack: a vLLM server (vllm/vllm-openai image) running
# ig1/Qwen3.5-9B-NVFP4, fronted by the qwen35-rp reverse proxy.
# Only the proxy is published on the host (loopback, port 8000).
services:
  # Reverse proxy that forwards to the vLLM service declared below.
  qwen35-rp:
    image: ghcr.io/iguanesolutions/qwen35-rp:v0.2.0
    container_name: Qwen3.5-ReverseProxy
    # Do not start the proxy until the backend reports healthy, so it never
    # forwards to a server that is still loading the model.
    depends_on:
      vllm-qwen3.5-9b-nvfp4:
        condition: service_healthy
    environment:
      QWEN35RP_LOGLEVEL: INFO
      QWEN35RP_ENFORCE_SAMPLING_PARAMS: "true"
      # Backend URL: the vLLM service name on the compose network, port 8000.
      QWEN35RP_TARGET: "http://vllm-qwen3.5-9b-nvfp4:8000"
      # Must match the --served-model-name passed to vLLM below.
      QWEN35RP_SERVED_MODEL_NAME: "Qwen3.5-9B"
      # Presumably the model names the proxy advertises to clients, one per
      # sampling profile — confirm against the qwen35-rp documentation.
      QWEN35RP_THINKING_GENERAL_MODEL: "Qwen3.5-9B Thinking General"
      QWEN35RP_THINKING_CODING_MODEL: "Qwen3.5-9B Thinking Coding"
      QWEN35RP_INSTRUCT_GENERAL_MODEL: "Qwen3.5-9B Instruct General"
      # NOTE(review): the variable is named *_INSTRUCT_REASONING_* but the
      # value says "Creative" — confirm this naming is intentional.
      QWEN35RP_INSTRUCT_REASONING_MODEL: "Qwen3.5-9B Instruct Creative"
    ports:
      # Host loopback port 8000 -> container port 9000.
      - "127.0.0.1:8000:9000"
    healthcheck:
      # Assumes curl is available inside the proxy image — TODO confirm.
      test: ["CMD", "curl", "-f", "http://localhost:9000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped

  # vLLM inference server for the NVFP4-quantized Qwen3.5-9B checkpoint.
  vllm-qwen3.5-9b-nvfp4:
    image: vllm/vllm-openai:v0.18.0-cu130
    container_name: Qwen3.5-9B-NVFP4-vLLM
    command:
      # Model to load (first positional argument).
      - "ig1/Qwen3.5-9B-NVFP4"
      # Keep in sync with QWEN35RP_SERVED_MODEL_NAME on the proxy.
      - --served-model-name
      - "Qwen3.5-9B"
      - --reasoning-parser
      - "qwen3"
      - --enable-auto-tool-choice
      - --tool-call-parser
      - "qwen3_coder"
      - --max-model-len
      - "auto"
      - --limit-mm-per-prompt.video
      - "0"
      - --max-cudagraph-capture-size
      - "64"
      - --max-num-seqs
      - "64"
      - --gpu-memory-utilization
      - "0.8"
    environment:
      # Uses env var if set, otherwise empty
      HF_TOKEN: "${HF_TOKEN:-}"
      VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: "1"
    # GPU access is requested both through the nvidia runtime and through a
    # Compose device reservation for one GPU.
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    # Uncomment to reach vLLM directly, bypassing the proxy. Host port 8000 is
    # already bound by qwen35-rp above, so choose a different host port.
    # ports:
    #   - "127.0.0.1:8000:8000"
    volumes:
      # Adapt to your host
      - 'E:\cache:/root/.cache'
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Model download and load can take minutes; failed probes inside this
      # window do not count against `retries`, and the first successful probe
      # marks the container healthy immediately. Adapt to your host.
      start_period: 30m
    restart: unless-stopped