services:
  # Reverse proxy in front of the vLLM backend service. It is published on
  # the host loopback interface only: host port 8000 -> container port 9000.
  qwen35-rp:
    image: ghcr.io/iguanesolutions/qwen35-rp:v0.2.0
    container_name: Qwen3.5-ReverseProxy
    # Start the backend container before the proxy, and bring it up when only
    # this service is requested. This orders container start-up only; it does
    # not wait for the model to finish loading.
    depends_on:
      vllm-qwen3.5-9b-nvfp4:
        condition: service_started
    environment:
      QWEN35RP_LOGLEVEL: INFO
      # Quoted so the variable is passed as the string "true", not a boolean.
      QWEN35RP_ENFORCE_SAMPLING_PARAMS: "true"
      # Host name is the Compose service name of the vLLM backend; the port
      # is the one the backend's own healthcheck probes.
      QWEN35RP_TARGET: "http://vllm-qwen3.5-9b-nvfp4:8000"
      # Kept identical to the --served-model-name given to the vLLM service.
      QWEN35RP_SERVED_MODEL_NAME: "Qwen3.5-9B"
      # Presumably the model names the proxy exposes for each sampling
      # profile - confirm against the qwen35-rp documentation.
      QWEN35RP_THINKING_GENERAL_MODEL: "Qwen3.5-9B Thinking General"
      QWEN35RP_THINKING_CODING_MODEL: "Qwen3.5-9B Thinking Coding"
      QWEN35RP_INSTRUCT_GENERAL_MODEL: "Qwen3.5-9B Instruct General"
      # NOTE(review): the key says REASONING but the value says "Creative".
      # Confirm which is intended before renaming; clients may already
      # request this exact model name.
      QWEN35RP_INSTRUCT_REASONING_MODEL: "Qwen3.5-9B Instruct Creative"
    ports:
      - "127.0.0.1:8000:9000"
    healthcheck:
      # Requires curl to be available inside the image.
      test: ["CMD", "curl", "-f", "http://localhost:9000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped
  # vLLM OpenAI-compatible server for the model. It is not published on the
  # host by default; other services reach it by service name on port 8000.
  vllm-qwen3.5-9b-nvfp4:
    image: vllm/vllm-openai:v0.18.0-cu130
    container_name: Qwen3.5-9B-NVFP4-vLLM
    # Arguments appended to the image's entrypoint. Each flag and its value
    # are separate list items; values are quoted so they stay strings.
    command:
      # Model reference, passed as the first (positional) argument.
      - "ig1/Qwen3.5-9B-NVFP4"
      # Kept identical to QWEN35RP_SERVED_MODEL_NAME in the proxy service.
      - --served-model-name
      - "Qwen3.5-9B"
      - --reasoning-parser
      - "qwen3"
      - --enable-auto-tool-choice
      - --tool-call-parser
      - "qwen3_coder"
      - --max-model-len
      - "auto"
      - --limit-mm-per-prompt.video
      - "0"
      - --max-cudagraph-capture-size
      - "64"
      - --max-num-seqs
      - "64"
      - --gpu-memory-utilization
      - "0.8"
    environment:
      # Uses the HF_TOKEN variable from the host environment if set,
      # otherwise an empty string.
      HF_TOKEN: "${HF_TOKEN:-}"
      VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: "1"
    # GPU access is requested both through the nvidia runtime and through a
    # device reservation for one GPU.
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    # Uncomment to reach vLLM directly from the host, bypassing the proxy.
    # Host port 8000 is already bound by the qwen35-rp service, so pick a
    # different host port if both are published.
    # ports:
    #   - "127.0.0.1:8000:8000"
    volumes:
      # Persists /root/.cache on the host. Adapt the host path to your
      # machine.
      - 'E:\cache:/root/.cache'
    healthcheck:
      # Requires curl to be available inside the image.
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Grace period for start-up: failed probes in this window do not count
      # toward `retries`, so the container is not marked unhealthy while the
      # server is still starting. Tune to your hardware and download speed.
      start_period: 10m
    restart: unless-stopped