---
# RayCluster "raycluster-historyserver" (namespace: default).
#
# Layout, identical for the head pod and the "cpu" worker group:
#   * a Ray container (ray-head / ray-worker) that
#       - exports events to http://localhost:8084/v1/events,
#       - runs a shell prefix that moves the logs of the previous Ray session
#         to /tmp/ray/prev-logs/<session>/<node_id>/logs,
#       - has a postStart hook that polls `ps` until a raylet process with a
#         --node_id flag appears and stores that id in /tmp/ray/raylet_node_id;
#   * a "collector" sidecar started with --events-port=8084 and S3 settings
#     pointing at minio-service.minio-dev:9000;
#   * an emptyDir volume "historyserver" mounted at /tmp/ray in both
#     containers, so the sidecar sees the Ray container's files.
#
# NOTE(review): `image: xxx` is a placeholder in all four containers and must
# be replaced before this manifest can be applied.
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: raycluster-historyserver
  namespace: default
  labels:
    ray.io/cluster: raycluster-historyserver
spec:
  headGroupSpec:
    serviceType: ClusterIP
    rayStartParams:
      # Quoted so the values stay strings regardless of the YAML loader.
      dashboard-host: "0.0.0.0"
      num-cpus: "0"
    template:
      metadata:
        labels:
          test: raycluster-historyserver
      spec:
        # Explicit empty values instead of bare keys (which parse as null).
        imagePullSecrets: []
        affinity: {}
        containers:
          - name: ray-head
            image: xxx
            imagePullPolicy: IfNotPresent
            # Single shell snippet, folded back into one line by `>-`.
            # If /tmp/ray/session_latest exists, its logs directory is moved to
            # /tmp/ray/prev-logs/<session dir name>/<saved node id>/logs.
            # NOTE(review): presumably the KubeRay operator prepends this
            # string to the generated `ray start` command rather than exec'ing
            # it directly - confirm against the operator version in use.
            command:
              - >-
                echo "==========================================";
                [ -d "/tmp/ray/session_latest" ]
                && dest="/tmp/ray/prev-logs/$(basename $(readlink /tmp/ray/session_latest))/$(cat /tmp/ray/raylet_node_id)"
                && echo "dst is $dest"
                && mkdir -p "$dest"
                && mv /tmp/ray/session_latest/logs "$dest/logs";
                echo "========================================="
            env:
              - name: RAY_enable_core_worker_ray_event_to_aggregator
                value: "1"
              # Same port as the collector's --events-port below.
              - name: RAY_DASHBOARD_AGGREGATOR_AGENT_EVENTS_EXPORT_ADDR
                value: "http://localhost:8084/v1/events"
            securityContext:
              # Allow privilege escalation.
              # NOTE(review): nothing visible in this manifest needs a
              # privileged container - confirm whether this can be dropped.
              allowPrivilegeEscalation: true
              privileged: true
            lifecycle:
              postStart:
                exec:
                  # Polls once per second until a raylet process exposing
                  # --node_id=<id> is found, then writes <id> to
                  # /tmp/ray/raylet_node_id. Progress is appended to
                  # /tmp/ray/init.log (the "not start" message previously had
                  # the redirect inside the quotes and never reached the log).
                  command:
                    - /bin/sh
                    - -lc
                    - --
                    - |
                      GetNodeId(){
                        while true;
                        do
                          nodeid=$(ps -ef | grep raylet | grep node_id | grep -v grep | grep -oP '(?<=--node_id=)[^ ]*')
                          if [ -n "$nodeid" ]; then
                            echo "$(date) raylet started: \"$(ps -ef | grep raylet | grep node_id | grep -v grep | grep -oP '(?<=--node_id=)[^ ]*')\" => ${nodeid}" >> /tmp/ray/init.log
                            echo $nodeid > /tmp/ray/raylet_node_id
                            break
                          else
                            echo "$(date) raylet not start" >> /tmp/ray/init.log
                            sleep 1
                          fi
                        done
                      }
                      GetNodeId
            resources:
              limits:
                cpu: "5"
                memory: 10G
              requests:
                cpu: "50m"
                memory: 1G
            volumeMounts:
              - name: historyserver
                mountPath: /tmp/ray
          - name: collector
            image: xxx
            imagePullPolicy: Always
            command:
              - collector
              - --role=Head
              - --runtime-class-name=s3
              - --ray-cluster-name=raycluster-historyserver
              - --ray-root-dir=log
              - --events-port=8084
            env:
              - name: S3DISABLE_SSL
                value: "true"
              # NOTE(review): static credentials committed inline. Acceptable
              # only for a throwaway MinIO test setup; otherwise source them
              # from a Secret via valueFrom.secretKeyRef.
              - name: AWS_S3ID
                value: minioadmin
              - name: AWS_S3SECRET
                value: minioadmin
              - name: AWS_S3TOKEN
                value: ""
              - name: S3_BUCKET
                value: "ray-historyserver-log"
              - name: S3_ENDPOINT
                value: "minio-service.minio-dev:9000"
              - name: S3_REGION
                value: "test"
              # NOTE(review): the name ends in STYPE, not STYLE. Kept verbatim
              # because the name the collector reads is not visible here -
              # confirm against the collector source.
              - name: S3FORCE_PATH_STYPE
                value: "true"
            volumeMounts:
              - name: historyserver
                mountPath: /tmp/ray
        tolerations:
          - key: ray
            operator: Equal
            value: cpu
        volumes:
          - name: historyserver
            emptyDir: {}
  workerGroupSpecs:
    - groupName: cpu
      replicas: 0
      minReplicas: 0
      maxReplicas: 1000
      numOfHosts: 1
      rayStartParams: {}
      template:
        metadata:
          labels:
            test: raycluster-historyserver
        spec:
          # Explicit empty value instead of a bare key (which parses as null).
          imagePullSecrets: []
          containers:
            - name: ray-worker
              image: xxx
              imagePullPolicy: IfNotPresent
              # Same previous-session log rotation snippet as on the head pod.
              # NOTE(review): presumably prepended by the KubeRay operator to
              # the generated `ray start` command - confirm.
              command:
                - >-
                  echo "==========================================";
                  [ -d "/tmp/ray/session_latest" ]
                  && dest="/tmp/ray/prev-logs/$(basename $(readlink /tmp/ray/session_latest))/$(cat /tmp/ray/raylet_node_id)"
                  && echo "dst is $dest"
                  && mkdir -p "$dest"
                  && mv /tmp/ray/session_latest/logs "$dest/logs";
                  echo "========================================="
              env:
                - name: RAY_enable_core_worker_ray_event_to_aggregator
                  value: "1"
                # Same port as the collector's --events-port below.
                - name: RAY_DASHBOARD_AGGREGATOR_AGENT_EVENTS_EXPORT_ADDR
                  value: "http://localhost:8084/v1/events"
              securityContext:
                # Allow privilege escalation.
                # NOTE(review): nothing visible in this manifest needs a
                # privileged container - confirm whether this can be dropped.
                allowPrivilegeEscalation: true
                privileged: true
              lifecycle:
                postStart:
                  exec:
                    # Polls once per second until a raylet process exposing
                    # --node_id=<id> is found, then writes <id> to
                    # /tmp/ray/raylet_node_id. Progress is appended to
                    # /tmp/ray/init.log (the "not start" message previously
                    # had the redirect inside the quotes and never reached
                    # the log).
                    command:
                      - /bin/sh
                      - -lc
                      - --
                      - |
                        GetNodeId(){
                          while true;
                          do
                            nodeid=$(ps -ef | grep raylet | grep node_id | grep -v grep | grep -oP '(?<=--node_id=)[^ ]*')
                            if [ -n "$nodeid" ]; then
                              echo "$(date) raylet started: \"$(ps -ef | grep raylet | grep node_id | grep -v grep | grep -oP '(?<=--node_id=)[^ ]*')\" => ${nodeid}" >> /tmp/ray/init.log
                              echo $nodeid > /tmp/ray/raylet_node_id
                              break
                            else
                              echo "$(date) raylet not start" >> /tmp/ray/init.log
                              sleep 1
                            fi
                          done
                        }
                        GetNodeId
              resources:
                limits:
                  cpu: "30"
                  memory: 30G
                requests:
                  cpu: "50m"
                  memory: 1G
              volumeMounts:
                - name: historyserver
                  mountPath: /tmp/ray
            - name: collector
              image: xxx
              imagePullPolicy: Always
              command:
                - collector
                - --role=Worker
                - --runtime-class-name=s3
                - --ray-cluster-name=raycluster-historyserver
                - --ray-root-dir=log
                - --events-port=8084
              env:
                - name: S3DISABLE_SSL
                  value: "true"
                # NOTE(review): static credentials committed inline.
                # Acceptable only for a throwaway MinIO test setup; otherwise
                # source them from a Secret via valueFrom.secretKeyRef.
                - name: AWS_S3ID
                  value: minioadmin
                - name: AWS_S3SECRET
                  value: minioadmin
                - name: AWS_S3TOKEN
                  value: ""
                - name: S3_BUCKET
                  value: "ray-historyserver-log"
                - name: S3_ENDPOINT
                  value: "minio-service.minio-dev:9000"
                - name: S3_REGION
                  value: "test"
                # NOTE(review): the name ends in STYPE, not STYLE. Kept
                # verbatim because the name the collector reads is not
                # visible here - confirm against the collector source.
                - name: S3FORCE_PATH_STYPE
                  value: "true"
              volumeMounts:
                - name: historyserver
                  mountPath: /tmp/ray
          tolerations:
            - key: ray
              operator: Equal
              value: cpu
          volumes:
            - name: historyserver
              emptyDir: {}