File tree Expand file tree Collapse file tree 3 files changed +29
-2
lines changed
cookbooks/aws-parallelcluster-install/recipes Expand file tree Collapse file tree 3 files changed +29
-2
lines changed Original file line number Diff line number Diff line change @@ -23,6 +23,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
2323**CHANGES**
2424- Upgrade NVIDIA driver to version 470.141.03.
2525- Upgrade NVIDIA Fabric Manager to version 470.141.03.
26+ - Upgrade NVIDIA CUDA Toolkit to version 11.7.1.
2627- Disable cron job tasks man-db and mlocate, which may have a negative impact on node performance.
2728- Add support for generating Slurm Configuration files for Compute Resources with Multiple Instance Types.
2829- Reduce timeout from 50 to a maximum of 5min in case of DynamoDB connection issues at compute node bootstrap.
Original file line number Diff line number Diff line change 189189# NVIDIA
# Default attributes for the NVIDIA driver / CUDA installation.
# - 'cuda_version' is the value the install recipe interpolates into the
#   "/usr/local/cuda-<cuda_version>" path.
# - 'cuda_samples_version' is a separate attribute and is NOT tied to
#   'cuda_version' (11.6 vs 11.7 here); it only selects the tag used in
#   'cuda_samples_url'.
# - arm_instance? is a helper defined outside this hunk; it chooses the
#   architecture id substituted into the download URLs.
190190default [ 'cluster' ] [ 'nvidia' ] [ 'enabled' ] = 'no'
191191default [ 'cluster' ] [ 'nvidia' ] [ 'driver_version' ] = '470.141.03'
192- default [ 'cluster' ] [ 'nvidia' ] [ 'cuda_version' ] = '11.4'
192+ default [ 'cluster' ] [ 'nvidia' ] [ 'cuda_version' ] = '11.7'
193+ default [ 'cluster' ] [ 'nvidia' ] [ 'cuda_samples_version' ] = '11.6'
193194default [ 'cluster' ] [ 'nvidia' ] [ 'driver_url_architecture_id' ] = arm_instance? ? 'aarch64' : 'x86_64'
194195default [ 'cluster' ] [ 'nvidia' ] [ 'cuda_url_architecture_id' ] = arm_instance? ? 'linux_sbsa' : 'linux'
195196default [ 'cluster' ] [ 'nvidia' ] [ 'driver_url' ] = "https://us.download.nvidia.com/tesla/#{ node [ 'cluster' ] [ 'nvidia' ] [ 'driver_version' ] } /NVIDIA-Linux-#{ node [ 'cluster' ] [ 'nvidia' ] [ 'driver_url_architecture_id' ] } -#{ node [ 'cluster' ] [ 'nvidia' ] [ 'driver_version' ] } .run"
# NOTE(review): the CUDA installer URL spells out the full release (11.7.1) and
# a second version string (515.65.01, presumably the driver build bundled with
# the installer - confirm) as literals; neither is derived from 'cuda_version',
# so they must be edited by hand whenever 'cuda_version' changes.
196- default [ 'cluster' ] [ 'nvidia' ] [ 'cuda_url' ] = "https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda_11.4.4_470.82.01_#{ node [ 'cluster' ] [ 'nvidia' ] [ 'cuda_url_architecture_id' ] } .run"
197+ default [ 'cluster' ] [ 'nvidia' ] [ 'cuda_url' ] = "https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_#{ node [ 'cluster' ] [ 'nvidia' ] [ 'cuda_url_architecture_id' ] } .run"
# Source tarball of the NVIDIA/cuda-samples GitHub repository at tag
# v<cuda_samples_version>.
198+ default [ 'cluster' ] [ 'nvidia' ] [ 'cuda_samples_url' ] = "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{ node [ 'cluster' ] [ 'nvidia' ] [ 'cuda_samples_version' ] } .tar.gz"
197199
198200# NVIDIA fabric-manager
199201# The package name of Fabric Manager for alinux2 and centos7 is nvidia-fabric-manager-version
Original file line number Diff line number Diff line change 8383 creates "/usr/local/cuda-#{ node [ 'cluster' ] [ 'nvidia' ] [ 'cuda_version' ] } "
8484 end
8585
# Get CUDA Sample Files
#
# The samples are downloaded as a GitHub tag archive and unpacked directly
# under /usr/local/. A GitHub archive for tag "v<X>" of the cuda-samples
# repository unpacks into a single top-level directory named
# "cuda-samples-<X>", so the directory that appears on disk is
# /usr/local/cuda-samples-<cuda_samples_version>.
#
# Both idempotence guards below must test that directory. Guarding on
# "/usr/local/cuda-<cuda_version>/samples" (a path nothing in this recipe
# creates) would never be satisfied, so the archive would be downloaded and
# extracted again on every Chef run.
#
# NOTE(review): if 'cuda_samples_url' is overridden with an archive whose
# top-level directory has a different name, this guard path must be adjusted
# accordingly.
cuda_samples_directory = "/usr/local/cuda-samples-#{node['cluster']['nvidia']['cuda_samples_version']}"
cuda_tmp_sample_file = "/tmp/cuda-sample.tar.gz"
remote_file cuda_tmp_sample_file do
  source node['cluster']['nvidia']['cuda_samples_url']
  mode '0644'
  retries 3
  retry_delay 5
  # Skip the download once the samples have already been unpacked.
  not_if { ::File.exist?(cuda_samples_directory) }
end

# Unpack CUDA Samples
bash 'cuda.sample install' do
  user 'root'
  group 'root'
  cwd '/tmp'
  # `set -e` aborts before the `rm` if extraction fails, leaving the archive
  # in place for inspection.
  code <<-CUDA
    set -e
    tar xf "#{cuda_tmp_sample_file}" --directory "/usr/local/"
    rm -f "#{cuda_tmp_sample_file}"
  CUDA
  # Skip extraction when the unpacked samples directory already exists.
  creates cuda_samples_directory
end
109+
86110 cookbook_file 'blacklist-nouveau.conf' do
87111 source 'nvidia/blacklist-nouveau.conf'
88112 path '/etc/modprobe.d/blacklist-nouveau.conf'
You can’t perform that action at this time.
0 commit comments