
Commit afbb6f1

d4l3k authored and facebook-github-bot committed
schedulers/kubernetes_scheduler: add support for resource instance-type node selectors (#433)
Summary:
This allows specifying instance types when scheduling Kubernetes jobs. It uses `node_selectors` and the `node.kubernetes.io/instance-type` label on nodes to limit pods to specific instance types.

To keep full-instance CPU and memory requests from colliding with the node reserved CPU/memory, this subtracts a small amount of CPU and memory from the requested resources; limits remain the same. Also adds the g4dn.xlarge resource type.

* https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes/
* https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/

Pull Request resolved: #433

Test Plan: Unit tests; updated the kube dist integration test to specify an instance type:

```yaml
spec:
  containers:
  - command:
    - python
    - -m
    - torch.distributed.run
    - --rdzv_backend
    - etcd
    - --rdzv_endpoint
    - etcd-server:2379
    - --rdzv_id
    - cv-trainer-smvh1095z11h5
    - --nnodes
    - "2"
    - --nproc_per_node
    - "1"
    - -m
    - torchx.examples.apps.lightning_classy_vision.train
    - --load_path
    - ""
    - --log_path
    - /tmp/logs
    - --epochs
    - "1"
    - --output_path
    - s3://torchx-test/integration-tests/runner_wh5wn4cz4b7fcd/output
    env:
    - name: TORCHX_RANK0_HOST
      value: localhost
    - name: VC_WORKER_0_HOSTS
      valueFrom:
        configMapKeyRef:
          key: VC_WORKER_0_HOSTS
          name: cv-trainer-smvh1095z11h5-svc
    - name: VC_WORKER_0_NUM
      valueFrom:
        configMapKeyRef:
          key: VC_WORKER_0_NUM
          name: cv-trainer-smvh1095z11h5-svc
    - name: VC_WORKER_1_HOSTS
      valueFrom:
        configMapKeyRef:
          key: VC_WORKER_1_HOSTS
          name: cv-trainer-smvh1095z11h5-svc
    - name: VC_WORKER_1_NUM
      valueFrom:
        configMapKeyRef:
          key: VC_WORKER_1_NUM
          name: cv-trainer-smvh1095z11h5-svc
    - name: VK_TASK_INDEX
      value: "0"
    - name: VC_TASK_INDEX
      value: "0"
    image: 495572122715.dkr.ecr.us-west-2.amazonaws.com/torchx/integration-tests:canary_runner_wh5wn4cz4b7fcd_torchx
    imagePullPolicy: IfNotPresent
    name: worker-0
    resources:
      limits:
        cpu: "2"
        memory: 8192M
        nvidia.com/gpu: "1"
      requests:
        cpu: 1900m
        memory: 7168M
        nvidia.com/gpu: "1"
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /dev/shm
      name: dshm
    - mountPath: /etc/volcano
      name: cv-trainer-smvh1095z11h5-svc
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: default-token-z8vb9
      readOnly: true
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  hostname: cv-trainer-smvh1095z11h5-worker-0-0
  nodeName: ip-192-168-16-165.us-west-2.compute.internal
  nodeSelector:
    node.kubernetes.io/instance-type: p3.2xlarge
```

https://github.com/pytorch/torchx/runs/5698855673?check_suite_focus=true

Reviewed By: kiukchung

Differential Revision: D35158355

Pulled By: d4l3k

fbshipit-source-id: 1e96ab80059e0c809bfba64aa004a888dac17f15
1 parent abb8ed6 commit afbb6f1

8 files changed (+139, -23 lines)
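
The request adjustment described in the summary is plain subtraction of the node reserved amounts. As a minimal sketch (standalone and illustrative; `adjusted_request` is not part of the TorchX API, though the two constants match the ones this commit adds to `kubernetes_scheduler.py`):

```python
# Illustrative only: reproduces the arithmetic this commit applies to pod
# resource *requests* while leaving *limits* untouched.
RESERVED_MILLICPU = 100  # millicores held back for the node
RESERVED_MEMMB = 1024  # MB held back for the node


def adjusted_request(cpu: float, memMB: int) -> dict:
    mcpu = int(cpu * 1000)
    return {
        "cpu": f"{max(mcpu - RESERVED_MILLICPU, 0)}m",
        "memory": f"{max(memMB - RESERVED_MEMMB, 0)}M",
    }


# Matches the integration-test pod above: a 2 CPU / 8192M resource becomes
# a 1900m / 7168M request.
print(adjusted_request(cpu=2, memMB=8192))
```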

docs/requirements.txt

Lines changed: 2 additions & 0 deletions
```diff
@@ -8,3 +8,5 @@ ipykernel
 nbsphinx
 jupytext
 ipython_genutils
+# https://github.com/jupyter/nbconvert/issues/1736
+jinja2<=3.0.3
```

scripts/kube_dist_trainer.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -32,6 +32,9 @@ def register_gpu_resource() -> None:
         cpu=2,
         gpu=1,
         memMB=8 * GiB,
+        capabilities={
+            "node.kubernetes.io/instance-type": "p3.2xlarge",
+        },
     )
     print(f"Registering resource: {res}")
     named_resources["GPU_X1"] = res
```
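
Once `register_gpu_resource()` has run, the capability travels with the named resource. A hedged lookup sketch (assuming the script's `named_resources` mapping is the name-to-`Resource` dict exported by `torchx.specs`):

```python
# Sketch under the assumption that named_resources is the torchx.specs
# name -> Resource mapping that the script above mutates.
from torchx.specs import named_resources

res = named_resources["GPU_X1"]
# The Kubernetes scheduler turns this capability into a nodeSelector entry.
print(res.capabilities["node.kubernetes.io/instance-type"])  # p3.2xlarge
```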

torchx/schedulers/kubernetes_scheduler.py

Lines changed: 55 additions & 7 deletions
```diff
@@ -85,6 +85,14 @@
 
 logger: logging.Logger = logging.getLogger(__name__)
 
+# Kubernetes reserves a small amount of resources per host for the system. For
+# TorchX we always assume the entire host is being requested so we adjust the
+# requested numbers to account for the node reserved resources.
+#
+# https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/
+RESERVED_MILLICPU = 100
+RESERVED_MEMMB = 1024
+
 RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
     RetryPolicy.REPLICA: [],
     RetryPolicy.APPLICATION: [
@@ -152,6 +160,8 @@
 
 ANNOTATION_ISTIO_SIDECAR = "sidecar.istio.io/inject"
 
+LABEL_INSTANCE_TYPE = "node.kubernetes.io/instance-type"
+
 
 def sanitize_for_serialization(obj: object) -> object:
     from kubernetes import client
@@ -176,21 +186,35 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod":
         V1EmptyDirVolumeSource,
     )
 
+    # limits puts an upper cap on the resources a pod may consume.
+    # requests is how much the scheduler allocates. We assume that the jobs will
+    # be allocating the whole machine so requests is slightly lower than the
+    # requested resources to account for the Kubernetes node reserved resources.
+    limits = {}
     requests = {}
 
     resource = role.resource
-    if resource.cpu >= 0:
-        requests["cpu"] = f"{int(resource.cpu * 1000)}m"
-    if resource.memMB >= 0:
-        requests["memory"] = f"{int(resource.memMB)}M"
-    if resource.gpu >= 0:
-        requests["nvidia.com/gpu"] = str(resource.gpu)
+    if resource.cpu > 0:
+        mcpu = int(resource.cpu * 1000)
+        limits["cpu"] = f"{mcpu}m"
+        request_mcpu = max(mcpu - RESERVED_MILLICPU, 0)
+        requests["cpu"] = f"{request_mcpu}m"
+    if resource.memMB > 0:
+        limits["memory"] = f"{int(resource.memMB)}M"
+        request_memMB = max(int(resource.memMB) - RESERVED_MEMMB, 0)
+        requests["memory"] = f"{request_memMB}M"
+    if resource.gpu > 0:
+        requests["nvidia.com/gpu"] = limits["nvidia.com/gpu"] = str(resource.gpu)
 
     resources = V1ResourceRequirements(
-        limits=requests,
+        limits=limits,
         requests=requests,
     )
 
+    node_selector: Dict[str, str] = {}
+    if LABEL_INSTANCE_TYPE in resource.capabilities:
+        node_selector[LABEL_INSTANCE_TYPE] = resource.capabilities[LABEL_INSTANCE_TYPE]
+
     # To support PyTorch dataloaders we need to set /dev/shm to larger than the
     # 64M default so we mount an unlimited sized tmpfs directory on it.
     SHM_VOL = "dshm"
@@ -264,6 +288,7 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod":
             restart_policy="Never",
             service_account_name=service_account,
             volumes=volumes,
+            node_selector=node_selector,
         ),
         metadata=V1ObjectMeta(
             annotations={
@@ -416,6 +441,29 @@ class KubernetesScheduler(Scheduler, DockerWorkspace):
 
     External docs: https://kubernetes.io/docs/concepts/storage/persistent-volumes/
 
+    **Resources / Allocation**
+
+    To select a specific machine type you can add a capability to your resources
+    with ``node.kubernetes.io/instance-type`` which will constrain the launched
+    jobs to nodes of that instance type.
+
+    >>> from torchx import specs
+    >>> specs.Resource(
+    ...     cpu=4,
+    ...     memMB=16000,
+    ...     gpu=2,
+    ...     capabilities={
+    ...         "node.kubernetes.io/instance-type": "<cloud instance type>",
+    ...     },
+    ... )
+    Resource(...)
+
+    Kubernetes may reserve some memory for the host. TorchX assumes you're
+    scheduling on whole hosts and thus will automatically reduce the resource
+    request by a small amount to account for the node reserved CPU and memory.
+    If you run into scheduling issues you may need to reduce the requested CPU
+    and memory from the host values.
+
     **Compatibility**
 
     .. compatibility::
```
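
End to end, a resource capability becomes a pod `nodeSelector` and a trimmed request. A small sketch mirroring the new `test_instance_type` below (the image name is a placeholder, and the `kubernetes` client package must be installed for `role_to_pod` to build the pod objects):

```python
from torchx import specs
from torchx.schedulers.kubernetes_scheduler import role_to_pod

role = specs.Role(
    name="trainer",
    image="example/image:latest",  # placeholder image
    mounts=[],
    resource=specs.Resource(
        cpu=8,
        gpu=1,
        memMB=61 * 1024,
        capabilities={"node.kubernetes.io/instance-type": "p3.2xlarge"},
    ),
)
pod = role_to_pod("trainer", role, service_account=None)
print(pod.spec.node_selector)
# {'node.kubernetes.io/instance-type': 'p3.2xlarge'}
print(pod.spec.containers[0].resources.requests)
# Per the new request logic: {'cpu': '7900m', 'memory': '61440M', 'nvidia.com/gpu': '1'}
```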

torchx/schedulers/test/.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -0,0 +1 @@
+actors.json
```

torchx/schedulers/test/kubernetes_scheduler_test.py

Lines changed: 35 additions & 4 deletions
```diff
@@ -24,6 +24,7 @@
     cleanup_str,
     create_scheduler,
     role_to_pod,
+    LABEL_INSTANCE_TYPE,
 )
 
 SKIP_DOCKER: bool = not has_docker()
@@ -124,13 +125,18 @@ def test_role_to_pod(self) -> None:
         app = _test_app()
         pod = role_to_pod("name", app.roles[0], service_account="srvacc")
 
-        requests = {
+        limits = {
             "cpu": "2000m",
             "memory": "3000M",
             "nvidia.com/gpu": "4",
         }
+        requests = {
+            "cpu": "1900m",
+            "memory": "1976M",
+            "nvidia.com/gpu": "4",
+        }
         resources = V1ResourceRequirements(
-            limits=requests,
+            limits=limits,
             requests=requests,
         )
         container = V1Container(
@@ -179,6 +185,7 @@ def test_role_to_pod(self) -> None:
                     ),
                 ),
                 ],
+                node_selector={},
             ),
             metadata=V1ObjectMeta(
                 annotations={
@@ -279,15 +286,16 @@ def test_submit_dryrun(self) -> None:
         memory: 3000M
         nvidia.com/gpu: '4'
       requests:
-        cpu: 2000m
-        memory: 3000M
+        cpu: 1900m
+        memory: 1976M
         nvidia.com/gpu: '4'
     volumeMounts:
     - mountPath: /dev/shm
       name: dshm
     - mountPath: /dst
       name: mount-0
       readOnly: true
+  nodeSelector: {{}}
   restartPolicy: Never
   volumes:
   - emptyDir:
@@ -348,6 +356,29 @@ def test_volume_mounts(self) -> None:
             ],
         )
 
+    def test_instance_type(self) -> None:
+        scheduler = create_scheduler("test")
+        role = specs.Role(
+            name="foo",
+            image="",
+            mounts=[],
+            resource=specs.Resource(
+                cpu=4,
+                memMB=4000,
+                gpu=8,
+                capabilities={
+                    LABEL_INSTANCE_TYPE: "some_instance",
+                },
+            ),
+        )
+        pod = role_to_pod("foo", role, service_account="")
+        self.assertEqual(
+            pod.spec.node_selector,
+            {
+                "node.kubernetes.io/instance-type": "some_instance",
+            },
+        )
+
     def test_rank0_env(self) -> None:
         from kubernetes.client.models import (
             V1EnvVar,
```

torchx/specs/__init__.py

Lines changed: 3 additions & 8 deletions
```diff
@@ -13,7 +13,7 @@
 
 from typing import Dict, Optional
 
-import torchx.specs.named_resources_aws as aws_resources
+from torchx.specs.named_resources_aws import NAMED_RESOURCES as AWS_NAMED_RESOURCES
 from torchx.util.entrypoints import load_group
 
 from .api import (  # noqa: F401 F403
@@ -58,14 +58,9 @@
 def _load_named_resources() -> Dict[str, Resource]:
     resource_methods = load_group("torchx.named_resources", default={})
     materialized_resources = {}
-    default = {
-        "aws_t3.medium": aws_resources.aws_t3_medium(),
-        "aws_m5.2xlarge": aws_resources.aws_m5_2xlarge(),
-        "aws_p3.2xlarge": aws_resources.aws_p3_2xlarge(),
-        "aws_p3.8xlarge": aws_resources.aws_p3_8xlarge(),
-    }
+    default = AWS_NAMED_RESOURCES
     for name, resource in default.items():
-        materialized_resources[name] = resource
+        materialized_resources[name] = resource()
     for resource_name, resource_method in resource_methods.items():
        materialized_resources[resource_name] = resource_method()
     materialized_resources["NULL"] = NULL_RESOURCE
```
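
The refactor turns `default` into a mapping of zero-argument factories rather than pre-built `Resource` objects, which is why the loop now calls `resource()`. A minimal sketch of the materialization step:

```python
# Materialize every AWS named resource by calling its factory, exactly as
# the updated _load_named_resources loop does.
from torchx.specs.named_resources_aws import NAMED_RESOURCES

materialized = {name: factory() for name, factory in NAMED_RESOURCES.items()}
print(materialized["aws_g4dn.xlarge"].gpu)  # 1
```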

torchx/specs/named_resources_aws.py

Lines changed: 34 additions & 4 deletions
```diff
@@ -29,6 +29,8 @@
 
 """
 
+from typing import Mapping, Callable
+
 from torchx.specs.api import Resource
 
 GiB: int = 1024
@@ -39,7 +41,9 @@ def aws_p3_2xlarge() -> Resource:
         cpu=8,
         gpu=1,
         memMB=61 * GiB,
-        capabilities={},
+        capabilities={
+            "node.kubernetes.io/instance-type": "p3.2xlarge",
+        },
     )
 
 
@@ -48,7 +52,9 @@ def aws_p3_8xlarge() -> Resource:
         cpu=32,
         gpu=4,
         memMB=244 * GiB,
-        capabilities={},
+        capabilities={
+            "node.kubernetes.io/instance-type": "p3.8xlarge",
+        },
     )
 
 
@@ -57,7 +63,9 @@ def aws_t3_medium() -> Resource:
         cpu=2,
         gpu=0,
         memMB=4 * GiB,
-        capabilities={},
+        capabilities={
+            "node.kubernetes.io/instance-type": "t3.medium",
+        },
     )
 
 
@@ -66,5 +74,27 @@ def aws_m5_2xlarge() -> Resource:
         cpu=8,
         gpu=0,
         memMB=32 * GiB,
-        capabilities={},
+        capabilities={
+            "node.kubernetes.io/instance-type": "m5.2xlarge",
+        },
+    )
+
+
+def aws_g4dn_xlarge() -> Resource:
+    return Resource(
+        cpu=4,
+        gpu=1,
+        memMB=16 * GiB,
+        capabilities={
+            "node.kubernetes.io/instance-type": "g4dn.xlarge",
+        },
     )
+
+
+NAMED_RESOURCES: Mapping[str, Callable[[], Resource]] = {
+    "aws_t3.medium": aws_t3_medium,
+    "aws_m5.2xlarge": aws_m5_2xlarge,
+    "aws_p3.2xlarge": aws_p3_2xlarge,
+    "aws_p3.8xlarge": aws_p3_8xlarge,
+    "aws_g4dn.xlarge": aws_g4dn_xlarge,
+}
```
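
As a quick sanity check on the new g4dn.xlarge entry (a sketch using only names defined in this file):

```python
from torchx.specs.named_resources_aws import aws_g4dn_xlarge, GiB

res = aws_g4dn_xlarge()
assert res.cpu == 4 and res.gpu == 1 and res.memMB == 16 * GiB
# The capability is what the Kubernetes scheduler maps to a nodeSelector.
assert res.capabilities["node.kubernetes.io/instance-type"] == "g4dn.xlarge"
```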

torchx/specs/test/named_resources_test_aws.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -13,6 +13,7 @@
     aws_m5_2xlarge,
     aws_t3_medium,
     GiB,
+    NAMED_RESOURCES,
 )
 
 
@@ -40,3 +41,8 @@ def test_aws_t3_medium(self) -> None:
         self.assertEqual(2, resource.cpu)
         self.assertEqual(0, resource.gpu)
         self.assertEqual(4 * GiB, resource.memMB)
+
+    def test_capabilities(self) -> None:
+        for name, func in NAMED_RESOURCES.items():
+            resource = func()
+            self.assertIn("node.kubernetes.io/instance-type", resource.capabilities)
```
