 TAG_TORCHX_USER = "torchx.pytorch.org/user"


+def parse_ulimits(ulimits_list: list[str]) -> List[Dict[str, Any]]:
+    """
+    Parse ulimit strings in the format: name:softLimit:hardLimit
+    Multiple ulimits are separated by commas.
+    """
+    if not ulimits_list:
+        return []
+
+    ulimits = []
+    for ulimit_str in ulimits_list:
+        if not ulimit_str.strip():
+            continue
+
+        parts = ulimit_str.strip().split(":")
+        if len(parts) != 3:
+            raise ValueError(
+                f"ulimit must be in format name:softLimit:hardLimit, got: {ulimit_str}"
+            )
+
+        name, soft_limit, hard_limit = parts
+        ulimits.append(
+            {
+                "name": name,
+                "softLimit": int(soft_limit) if soft_limit != "-1" else -1,
+                "hardLimit": int(hard_limit) if hard_limit != "-1" else -1,
+            }
+        )
+
+    return ulimits
+
+
 if TYPE_CHECKING:
     from docker import DockerClient

@@ -177,7 +208,8 @@ def _role_to_node_properties(
     privileged: bool = False,
     job_role_arn: Optional[str] = None,
     execution_role_arn: Optional[str] = None,
-) -> Dict[str, object]:
+    ulimits: Optional[List[Dict[str, Any]]] = None,
+) -> Dict[str, Any]:
     role.mounts += get_device_mounts(role.resource.devices)

     mount_points = []
@@ -239,6 +271,7 @@ def _role_to_node_properties(
         "environment": [{"name": k, "value": v} for k, v in role.env.items()],
         "privileged": privileged,
         "resourceRequirements": resource_requirements_from_resource(role.resource),
+        **({"ulimits": ulimits} if ulimits else {}),
         "linuxParameters": {
             # To support PyTorch dataloaders we need to set /dev/shm to larger
             # than the 64M default.
@@ -361,6 +394,7 @@ class AWSBatchOpts(TypedDict, total=False):
     priority: int
     job_role_arn: Optional[str]
     execution_role_arn: Optional[str]
+    ulimits: Optional[list[str]]


 class AWSBatchScheduler(
@@ -514,6 +548,7 @@ def _submit_dryrun(self, app: AppDef, cfg: AWSBatchOpts) -> AppDryRunInfo[BatchJ
                     privileged=cfg["privileged"],
                     job_role_arn=cfg.get("job_role_arn"),
                     execution_role_arn=cfg.get("execution_role_arn"),
+                    ulimits=parse_ulimits(cfg.get("ulimits") or []),
                 )
             )
             node_idx += role.num_replicas
@@ -599,6 +634,11 @@ def _run_opts(self) -> runopts:
             type_=str,
             help="The Amazon Resource Name (ARN) of the IAM role that the ECS agent can assume for AWS permissions.",
         )
+        opts.add(
+            "ulimits",
+            type_=List[str],
+            help="Ulimit settings in format: name:softLimit:hardLimit (multiple separated by commas)",
+        )
         return opts

     def _get_job_id(self, app_id: str) -> Optional[str]:
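As a quick illustration (not part of the diff above), here is a minimal sketch of what the new parse_ulimits helper returns for typical inputs; the import path is an assumption based on the usual TorchX module layout, and the sample values are hypothetical.

# Hypothetical usage sketch; assumes parse_ulimits is defined at module level in
# torchx/schedulers/aws_batch_scheduler.py, as the diff above adds it.
from torchx.schedulers.aws_batch_scheduler import parse_ulimits

# A runopt value like "nofile:65536:65536,memlock:-1:-1" arrives as a list of strings.
print(parse_ulimits(["nofile:65536:65536", "memlock:-1:-1"]))
# [{'name': 'nofile', 'softLimit': 65536, 'hardLimit': 65536},
#  {'name': 'memlock', 'softLimit': -1, 'hardLimit': -1}]

The resulting dicts match the name/softLimit/hardLimit shape that AWS Batch expects for ulimits in a job definition's container properties, which is why they can be spliced directly into the container spec via **({"ulimits": ulimits} if ulimits else {}).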