17 | 17 | (in the cluster sub-module) for AppWrapper generation.
18 | 18 | """
19 | 19 |
| 20 | +import json
20 | 21 | from typing import Optional
21 | 22 | import typing
22 | 23 | import yaml

31 | 32 | from base64 import b64encode
32 | 33 | from urllib3.util import parse_url
33 | 34 | from kubernetes.client.exceptions import ApiException
| 35 | +import codeflare_sdk
34 | 36 |
35 | 37 |
36 | 38 | def read_template(template):
@@ -78,10 +80,13 @@ def is_kind_cluster():
78 | 80 |     return False
79 | 81 |
80 | 82 |
81 | | -def update_names(cluster_yaml, cluster_name, namespace):
82 | | -    meta = cluster_yaml.get("metadata")
83 | | -    meta["name"] = cluster_name
84 | | -    meta["namespace"] = namespace
| 83 | +def update_names(
| 84 | +    cluster_yaml: dict,
| 85 | +    cluster: "codeflare_sdk.cluster.Cluster",
| 86 | +):
| 87 | +    metadata = cluster_yaml.get("metadata")
| 88 | +    metadata["name"] = cluster.config.name
| 89 | +    metadata["namespace"] = cluster.config.namespace
85 | 90 |
86 | 91 |
87 | 92 | def update_image(spec, image):
@@ -114,67 +119,111 @@ def update_resources(
114 | 119 |     worker_cpu_limits,
115 | 120 |     worker_memory_requests,
116 | 121 |     worker_memory_limits,
117 | | -    num_worker_gpus,
| 122 | +    custom_resources,
118 | 123 | ):
119 | 124 |     container = spec.get("containers")
120 | 125 |     for resource in container:
121 | 126 |         requests = resource.get("resources").get("requests")
122 | 127 |         if requests is not None:
123 | 128 |             requests["cpu"] = worker_cpu_requests
124 | 129 |             requests["memory"] = worker_memory_requests
125 | | -            requests["nvidia.com/gpu"] = num_worker_gpus
126 | 130 |         limits = resource.get("resources").get("limits")
127 | 131 |         if limits is not None:
128 | 132 |             limits["cpu"] = worker_cpu_limits
129 | 133 |             limits["memory"] = worker_memory_limits
130 | | -            limits["nvidia.com/gpu"] = num_worker_gpus
| 134 | +            for k in custom_resources.keys():
| 135 | +                limits[k] = custom_resources[k]
| 136 | +                requests[k] = custom_resources[k]
| 137 | +
| 138 | +
| 139 | +def head_worker_gpu_count_from_cluster(
| 140 | +    cluster: "codeflare_sdk.cluster.Cluster",
| 141 | +) -> typing.Tuple[int, int]:
| 142 | +    head_gpus = 0
| 143 | +    worker_gpus = 0
| 144 | +    for k in cluster.config.head_extended_resource_requests.keys():
| 145 | +        resource_type = cluster.config.extended_resource_mapping[k]
| 146 | +        if resource_type == "GPU":
| 147 | +            head_gpus += int(cluster.config.head_extended_resource_requests[k])
| 148 | +    for k in cluster.config.worker_extended_resource_requests.keys():
| 149 | +        resource_type = cluster.config.extended_resource_mapping[k]
| 150 | +        if resource_type == "GPU":
| 151 | +            worker_gpus += int(cluster.config.worker_extended_resource_requests[k])
| 152 | +
| 153 | +    return head_gpus, worker_gpus
| 154 | +
| 155 | +
| 156 | +FORBIDDEN_CUSTOM_RESOURCE_TYPES = ["GPU", "CPU", "memory"]
| 157 | +
| 158 | +
| 159 | +def head_worker_resources_from_cluster(
| 160 | +    cluster: "codeflare_sdk.cluster.Cluster",
| 161 | +) -> typing.Tuple[dict, dict]:
| 162 | +    to_return = {}, {}
| 163 | +    for k in cluster.config.head_extended_resource_requests.keys():
| 164 | +        resource_type = cluster.config.extended_resource_mapping[k]
| 165 | +        if resource_type in FORBIDDEN_CUSTOM_RESOURCE_TYPES:
| 166 | +            continue
| 167 | +        to_return[0][resource_type] = cluster.config.head_extended_resource_requests[
| 168 | +            k
| 169 | +        ] + to_return[0].get(resource_type, 0)
| 170 | +
| 171 | +    for k in cluster.config.worker_extended_resource_requests.keys():
| 172 | +        resource_type = cluster.config.extended_resource_mapping[k]
| 173 | +        if resource_type in FORBIDDEN_CUSTOM_RESOURCE_TYPES:
| 174 | +            continue
| 175 | +        to_return[1][resource_type] = cluster.config.worker_extended_resource_requests[
| 176 | +            k
| 177 | +        ] + to_return[1].get(resource_type, 0)
| 178 | +    return to_return
131 | 179 |
132 | 180 |
133 | 181 | def update_nodes(
134 | | -    cluster_yaml,
135 | | -    appwrapper_name,
136 | | -    worker_cpu_requests,
137 | | -    worker_cpu_limits,
138 | | -    worker_memory_requests,
139 | | -    worker_memory_limits,
140 | | -    num_worker_gpus,
141 | | -    workers,
142 | | -    image,
143 | | -    env,
144 | | -    image_pull_secrets,
145 | | -    head_cpus,
146 | | -    head_memory,
147 | | -    num_head_gpus,
| 182 | +    ray_cluster_dict: dict,
| 183 | +    cluster: "codeflare_sdk.cluster.Cluster",
148 | 184 | ):
149 | | -    head = cluster_yaml.get("spec").get("headGroupSpec")
150 | | -    head["rayStartParams"]["num-gpus"] = str(int(num_head_gpus))
| 185 | +    head = ray_cluster_dict.get("spec").get("headGroupSpec")
| 186 | +    worker = ray_cluster_dict.get("spec").get("workerGroupSpecs")[0]
| 187 | +    head_gpus, worker_gpus = head_worker_gpu_count_from_cluster(cluster)
| 188 | +    head_resources, worker_resources = head_worker_resources_from_cluster(cluster)
| 189 | +    head_resources = json.dumps(head_resources).replace('"', '\\"')
| 190 | +    head_resources = f'"{head_resources}"'
| 191 | +    worker_resources = json.dumps(worker_resources).replace('"', '\\"')
| 192 | +    worker_resources = f'"{worker_resources}"'
| 193 | +    head["rayStartParams"]["num-gpus"] = str(head_gpus)
| 194 | +    head["rayStartParams"]["resources"] = head_resources
151 | 195 |
152 | | -    worker = cluster_yaml.get("spec").get("workerGroupSpecs")[0]
153 | 196 |     # Head counts as first worker
154 | | -    worker["replicas"] = workers
155 | | -    worker["minReplicas"] = workers
156 | | -    worker["maxReplicas"] = workers
157 | | -    worker["groupName"] = "small-group-" + appwrapper_name
158 | | -    worker["rayStartParams"]["num-gpus"] = str(int(num_worker_gpus))
| 197 | +    worker["replicas"] = cluster.config.num_workers
| 198 | +    worker["minReplicas"] = cluster.config.num_workers
| 199 | +    worker["maxReplicas"] = cluster.config.num_workers
| 200 | +    worker["groupName"] = "small-group-" + cluster.config.name
| 201 | +    worker["rayStartParams"]["num-gpus"] = str(worker_gpus)
| 202 | +    worker["rayStartParams"]["resources"] = worker_resources
159 | 203 |
160 | 204 |     for comp in [head, worker]:
161 | 205 |         spec = comp.get("template").get("spec")
162 | | -        update_image_pull_secrets(spec, image_pull_secrets)
163 | | -        update_image(spec, image)
164 | | -        update_env(spec, env)
| 206 | +        update_image_pull_secrets(spec, cluster.config.image_pull_secrets)
| 207 | +        update_image(spec, cluster.config.image)
| 208 | +        update_env(spec, cluster.config.envs)
165 | 209 |         if comp == head:
166 | 210 |             # TODO: Eventually add head node configuration outside of template
167 | 211 |             update_resources(
168 | | -                spec, head_cpus, head_cpus, head_memory, head_memory, num_head_gpus
| 212 | +                spec,
| 213 | +                cluster.config.head_cpus,
| 214 | +                cluster.config.head_cpus,
| 215 | +                cluster.config.head_memory,
| 216 | +                cluster.config.head_memory,
| 217 | +                cluster.config.head_extended_resource_requests,
169 | 218 |             )
170 | 219 |         else:
171 | 220 |             update_resources(
172 | 221 |                 spec,
173 | | -                worker_cpu_requests,
174 | | -                worker_cpu_limits,
175 | | -                worker_memory_requests,
176 | | -                worker_memory_limits,
177 | | -                num_worker_gpus,
| 222 | +                cluster.config.worker_cpu_requests,
| 223 | +                cluster.config.worker_cpu_limits,
| 224 | +                cluster.config.worker_memory_requests,
| 225 | +                cluster.config.worker_memory_limits,
| 226 | +                cluster.config.worker_extended_resource_requests,
178 | 227 |             )
179 | 228 |
180 | 229 |
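
Note (not part of the commit): the two helpers added above derive the num-gpus count and the Ray custom-resource totals purely from cluster.config. A minimal sketch of how they behave, assuming the helpers and FORBIDDEN_CUSTOM_RESOURCE_TYPES above are in scope; types.SimpleNamespace stands in for the SDK's config object, and the "example.com/fpga"/"FPGA" mapping and all values are made up for illustration.

from types import SimpleNamespace

# Hypothetical stand-in for cluster.config; only the attributes the helpers read are set.
config = SimpleNamespace(
    head_extended_resource_requests={"nvidia.com/gpu": 1, "example.com/fpga": 2},
    worker_extended_resource_requests={"nvidia.com/gpu": 3},
    extended_resource_mapping={"nvidia.com/gpu": "GPU", "example.com/fpga": "FPGA"},
)
cluster = SimpleNamespace(config=config)

# Only keys mapped to "GPU" count toward the num-gpus start parameter.
print(head_worker_gpu_count_from_cluster(cluster))    # (1, 3)
# GPU/CPU/memory types are filtered out; the rest are summed per head/worker group.
print(head_worker_resources_from_cluster(cluster))    # ({'FPGA': 2}, {})
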
@@ -278,63 +327,30 @@ def write_user_yaml(user_yaml, output_file_name):
278 | 327 |     print(f"Written to: {output_file_name}")
279 | 328 |
280 | 329 |
281 | | -def generate_appwrapper(
282 | | -    name: str,
283 | | -    namespace: str,
284 | | -    head_cpus: int,
285 | | -    head_memory: int,
286 | | -    num_head_gpus: int,
287 | | -    worker_cpu_requests: int,
288 | | -    worker_cpu_limits: int,
289 | | -    worker_memory_requests: int,
290 | | -    worker_memory_limits: int,
291 | | -    num_worker_gpus: int,
292 | | -    workers: int,
293 | | -    template: str,
294 | | -    image: str,
295 | | -    appwrapper: bool,
296 | | -    env,
297 | | -    image_pull_secrets: list,
298 | | -    write_to_file: bool,
299 | | -    local_queue: Optional[str],
300 | | -    labels,
301 | | -):
302 | | -    cluster_yaml = read_template(template)
303 | | -    appwrapper_name, cluster_name = gen_names(name)
304 | | -    update_names(cluster_yaml, cluster_name, namespace)
305 | | -    update_nodes(
| 330 | +def generate_appwrapper(cluster: "codeflare_sdk.cluster.Cluster"):
| 331 | +    cluster_yaml = read_template(cluster.config.template)
| 332 | +    appwrapper_name, _ = gen_names(cluster.config.name)
| 333 | +    update_names(
306 | 334 |         cluster_yaml,
307 | | -        appwrapper_name,
308 | | -        worker_cpu_requests,
309 | | -        worker_cpu_limits,
310 | | -        worker_memory_requests,
311 | | -        worker_memory_limits,
312 | | -        num_worker_gpus,
313 | | -        workers,
314 | | -        image,
315 | | -        env,
316 | | -        image_pull_secrets,
317 | | -        head_cpus,
318 | | -        head_memory,
319 | | -        num_head_gpus,
| 335 | +        cluster,
320 | 336 |     )
321 | | -    augment_labels(cluster_yaml, labels)
| 337 | +    update_nodes(cluster_yaml, cluster)
| 338 | +    augment_labels(cluster_yaml, cluster.config.labels)
322 | 339 |     notebook_annotations(cluster_yaml)
323 | | -
324 | 340 |     user_yaml = (
325 | | -        wrap_cluster(cluster_yaml, appwrapper_name, namespace)
326 | | -        if appwrapper
| 341 | +        wrap_cluster(cluster_yaml, appwrapper_name, cluster.config.namespace)
| 342 | +        if cluster.config.appwrapper
327 | 343 |         else cluster_yaml
328 | 344 |     )
329 | 345 |
330 | | -    add_queue_label(user_yaml, namespace, local_queue)
| 346 | +    add_queue_label(user_yaml, cluster.config.namespace, cluster.config.local_queue)
331 | 347 |
332 | | -    if write_to_file:
| 348 | +    if cluster.config.write_to_file:
333 | 349 |         directory_path = os.path.expanduser("~/.codeflare/resources/")
334 | 350 |         outfile = os.path.join(directory_path, appwrapper_name + ".yaml")
335 | 351 |         write_user_yaml(user_yaml, outfile)
336 | 352 |         return outfile
337 | 353 |     else:
338 | 354 |         user_yaml = yaml.dump(user_yaml)
339 | | -        print(f"Yaml resources loaded for {name}")
| 355 | +        print(f"Yaml resources loaded for {cluster.config.name}")
340 | 356 |         return user_yaml
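
Note (not part of the commit): update_nodes above embeds the aggregated custom resources in rayStartParams["resources"] as a quoted JSON string with escaped inner quotes, presumably so the value survives quoting when rendered into the generated RayCluster YAML. A minimal sketch of that encoding step, using an illustrative resources dict:

import json

worker_resources = {"FPGA": 2}  # illustrative output of head_worker_resources_from_cluster
encoded = json.dumps(worker_resources).replace('"', '\\"')
encoded = f'"{encoded}"'
print(encoded)  # "{\"FPGA\": 2}"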