ig1sa committed on
Commit
e55d04c
·
verified ·
1 Parent(s): ca4a26c

Add files using upload-large-folder tool

Browse files
Files changed (1) hide show
  1. docker-compose.yaml +65 -0
docker-compose.yaml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ qwen35-rp:  # reverse proxy in front of the vLLM service named in QWEN35RP_TARGET
3
+ image: ghcr.io/iguanesolutions/qwen35-rp:v0.2.0
4
+ container_name: Qwen3.5-ReverseProxy
5
+ environment:
6
+ QWEN35RP_LOGLEVEL: INFO
7
+ QWEN35RP_ENFORCE_SAMPLING_PARAMS: "true"
8
+ QWEN35RP_TARGET: "http://vllm-qwen3.5-9b-nvfp4:8000"  # host part is the vLLM service name in this file
9
+ QWEN35RP_SERVED_MODEL_NAME: "Qwen3.5-9B"  # same value as vLLM's --served-model-name
10
+ QWEN35RP_THINKING_GENERAL_MODEL: "Qwen3.5-9B Thinking General"
11
+ QWEN35RP_THINKING_CODING_MODEL: "Qwen3.5-9B Thinking Coding"
12
+ QWEN35RP_INSTRUCT_GENERAL_MODEL: "Qwen3.5-9B Instruct General"
13
+ QWEN35RP_INSTRUCT_REASONING_MODEL: "Qwen3.5-9B Instruct Creative"  # NOTE(review): variable says REASONING, name says Creative - confirm this is intended
14
+ ports:
15
+ - "127.0.0.1:8000:9000"  # host loopback port 8000 -> container port 9000
16
+ healthcheck:
17
+ test: ["CMD", "curl", "-f", "http://localhost:9000/health"]  # runs inside the container, so it targets container port 9000; NOTE(review): confirm the image ships curl and serves /health
18
+ interval: 30s
19
+ timeout: 10s
20
+ retries: 3
21
+ restart: unless-stopped
22
+ vllm-qwen3.5-9b-nvfp4:  # vLLM OpenAI-compatible server; this service name is the host in qwen35-rp's QWEN35RP_TARGET
23
+ image: vllm/vllm-openai:v0.18.0-cu130
24
+ container_name: Qwen3.5-9B-NVFP4-vLLM
25
+ command:  # arguments handed to the image in place of its default command
26
+ - "ig1/Qwen3.5-9B-NVFP4"  # model to serve; presumably a Hugging Face repo id - confirm
27
+ - --served-model-name
28
+ - "Qwen3.5-9B"  # same value as QWEN35RP_SERVED_MODEL_NAME on the qwen35-rp service
29
+ - --reasoning-parser
30
+ - "qwen3"
31
+ - --enable-auto-tool-choice
32
+ - --tool-call-parser
33
+ - "qwen3_coder"
34
+ - --max-model-len
35
+ - "auto"
36
+ - --limit-mm-per-prompt.video
37
+ - "0"  # presumably disables video inputs - confirm against the vLLM CLI docs
38
+ - --max-cudagraph-capture-size
39
+ - "64"
40
+ - --max-num-seqs
41
+ - "64"
42
+ - --gpu-memory-utilization
43
+ - "0.8"
44
+ environment:
45
+ HF_TOKEN: ${HF_TOKEN:-} # Uses env var if set, otherwise empty
46
+ VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: "1"
47
+ runtime: nvidia  # NOTE(review): a GPU is also requested via deploy.resources below - confirm both are needed
48
+ deploy:
49
+ resources:
50
+ reservations:
51
+ devices:
52
+ - driver: nvidia
53
+ count: 1  # reserve a single GPU device
54
+ capabilities: [gpu]
55
+ # ports:  (left disabled: host 127.0.0.1:8000 is already published by the qwen35-rp service)
56
+ # - "127.0.0.1:8000:8000"
57
+ volumes:
58
+ - E:\cache:/root/.cache # Adapt to your host
59
+ healthcheck:  # NOTE(review): no start_period is set - confirm the model finishes loading within 3 x 30s
60
+ test: ["CMD", "curl", "-f", "http://localhost:8000/health"]  # runs inside the container against port 8000
61
+ interval: 30s
62
+ timeout: 10s
63
+ retries: 3
64
+ restart: unless-stopped
65
+