From 3726fd68eaf75db81c3aa963777f49f426de2a78 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 18 Nov 2025 16:32:51 -0800 Subject: [PATCH 01/10] pin awscli and botocore versions --- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 2 ++ pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index f3c32204a86b..98d594ac7a2c 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -281,6 +281,8 @@ WORKDIR / # Install SM packages RUN pip install --no-cache-dir -U \ + "awscli<=1.42.61" \ + "boto3<=1.40.61" \ smclarify \ "sagemaker>=2" \ sagemaker-experiments \ diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 52997f799aca..494da7667df6 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -247,8 +247,8 @@ WORKDIR / # Install SM packages RUN pip install --no-cache-dir -U \ # address package regression caused by smclarify depedency s3fs" - "awscli<1.42.50" \ - "boto3<1.40.50" \ + "awscli<=1.42.61" \ + "boto3<=1.40.61" \ smclarify \ "sagemaker>=2" \ sagemaker-experiments \ From 00374802ae7ac92164b0c8b08f890eb5d98ef34b Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 18 Nov 2025 16:37:54 -0800 Subject: [PATCH 02/10] build and test ec2 image --- dlc_developer_config.toml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 2ddfe8ccb932..fc79a3bd26a6 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -63,19 +63,19 @@ use_new_test_structure = false ### On by default sanity_tests = true security_tests = true - safety_check_test = false - ecr_scan_allowlist_feature = false + safety_check_test = true + ecr_scan_allowlist_feature = true ecs_tests = true eks_tests = true ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = false +ec2_benchmark_tests = true ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. ### Off by default (set to false) -ec2_tests_on_heavy_instances = false +ec2_tests_on_heavy_instances = true ### SM specific tests ### On by default sagemaker_local_tests = true @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" From 0e7642afb92ddbb7ab6b992a92d77a545d43d06a Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 18 Nov 2025 16:40:00 -0800 Subject: [PATCH 03/10] build and test sm image --- dlc_developer_config.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index fc79a3bd26a6..516a367c14c8 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -98,11 +98,11 @@ ipv6_vpc_name = "" # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = false +sagemaker_efa_tests = true # run release_candidate_integration tests -sagemaker_rc_tests = false +sagemaker_rc_tests = true # run sagemaker benchmark tests -sagemaker_benchmark_tests = false +sagemaker_benchmark_tests = true # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" From af5bcc873b40ab060efd6225f75c09273d9ce4a0 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 18 Nov 2025 18:56:48 -0800 Subject: [PATCH 04/10] pin spacy version to 3.8.7 --- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 2 +- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 98d594ac7a2c..583726cacffb 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -206,7 +206,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis - spacy \ + spacy==3.8.7 \ # pin thinc due to incompatibility with numpy 1.26.4 (sagemaker doesn't support latest numpy) thinc==8.3.4 \ blis \ diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 494da7667df6..15b66ef34562 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -126,7 +126,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis - spacy \ + spacy=3.8.7 \ # pin thinc due to incompatibility with numpy 1.26.4 (sagemaker doesn't support latest numpy) thinc==8.3.4 \ blis \ From 3fa19ff7d1257a3bdb52835e23f91b1757a2cb64 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 18 Nov 2025 18:57:13 -0800 Subject: [PATCH 05/10] rebuild sm image with pinned spacy --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 516a367c14c8..84d3f539c744 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" From 8e22ff98945603954a8f07d40898a73a3ba7d38e Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 18 Nov 2025 18:58:50 -0800 Subject: [PATCH 06/10] rebuild ec2 image with spinned spacy --- dlc_developer_config.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 84d3f539c744..fc79a3bd26a6 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -98,11 +98,11 @@ ipv6_vpc_name = "" # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = true +sagemaker_efa_tests = false # run release_candidate_integration tests -sagemaker_rc_tests = true +sagemaker_rc_tests = false # run sagemaker benchmark tests -sagemaker_benchmark_tests = true +sagemaker_benchmark_tests = false # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" From 0adc56ac2cc36c7fb0f975d1b7390008015a59db Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 18 Nov 2025 19:34:57 -0800 Subject: [PATCH 07/10] fix type and rebuild ec2 image --- dlc_developer_config.toml | 2 +- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index fc79a3bd26a6..af1315d38f51 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 15b66ef34562..68d3e7363a53 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -126,7 +126,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis - spacy=3.8.7 \ + spacy==3.8.7 \ # pin thinc due to incompatibility with numpy 1.26.4 (sagemaker doesn't support latest numpy) thinc==8.3.4 \ blis \ From eae3546e02a322301b0d95fcd1a24160f1727668 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 18 Nov 2025 20:49:19 -0800 Subject: [PATCH 08/10] remove pin of awscli and boto3 for cpu, revert changes on pin for gpu, rebuild ec2 --- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 2 -- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 583726cacffb..9e305de1b54a 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -281,8 +281,6 @@ WORKDIR / # Install SM packages RUN pip install --no-cache-dir -U \ - "awscli<=1.42.61" \ - "boto3<=1.40.61" \ smclarify \ "sagemaker>=2" \ sagemaker-experiments \ diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 68d3e7363a53..0b90890db2fb 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -247,8 +247,8 @@ WORKDIR / # Install SM packages RUN pip install --no-cache-dir -U \ # address package regression caused by smclarify depedency s3fs" - "awscli<=1.42.61" \ - "boto3<=1.40.61" \ + "awscli<1.42.50" \ + "boto3<1.40.50" \ smclarify \ "sagemaker>=2" \ sagemaker-experiments \ From e5e149e4751d3cca91d19ee0424f115b84f2547e Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 18 Nov 2025 20:57:57 -0800 Subject: [PATCH 09/10] rebuild sm image --- dlc_developer_config.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index af1315d38f51..516a367c14c8 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -98,11 +98,11 @@ ipv6_vpc_name = "" # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = false +sagemaker_efa_tests = true # run release_candidate_integration tests -sagemaker_rc_tests = false +sagemaker_rc_tests = true # run sagemaker benchmark tests -sagemaker_benchmark_tests = false +sagemaker_benchmark_tests = true # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" From 8bd5519a386561f2726ddd54fd482967f196296e Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 18 Nov 2025 22:24:18 -0800 Subject: [PATCH 10/10] revert changes to toml --- dlc_developer_config.toml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 516a367c14c8..2ddfe8ccb932 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = ["pytorch"] +build_frameworks = [] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = false +build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -63,19 +63,19 @@ use_new_test_structure = false ### On by default sanity_tests = true security_tests = true - safety_check_test = true - ecr_scan_allowlist_feature = true + safety_check_test = false + ecr_scan_allowlist_feature = false ecs_tests = true eks_tests = true ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = true +ec2_benchmark_tests = false ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. ### Off by default (set to false) -ec2_tests_on_heavy_instances = true +ec2_tests_on_heavy_instances = false ### SM specific tests ### On by default sagemaker_local_tests = true @@ -98,11 +98,11 @@ ipv6_vpc_name = "" # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = true +sagemaker_efa_tests = false # run release_candidate_integration tests -sagemaker_rc_tests = true +sagemaker_rc_tests = false # run sagemaker benchmark tests -sagemaker_benchmark_tests = true +sagemaker_benchmark_tests = false # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml" +dlc-pr-pytorch-training = "" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = ""