diff --git a/.gitlab/tests/shared_flux_clusters.yml b/.gitlab/tests/shared_flux_clusters.yml index 041610b30..4c2d128b9 100644 --- a/.gitlab/tests/shared_flux_clusters.yml +++ b/.gitlab/tests/shared_flux_clusters.yml @@ -126,6 +126,11 @@ run_tests_flux_tuolumne: BENCHMARK: [kripke] VARIANT: ['+rocm caliper=mpi,time,rocm'] GPUMODE: SPX + # spack-pip + - HOST: tuolumne + ARCHCONFIG: llnl-elcapitan + BENCHMARK: [py-scaffold] + VARIANT: ['+rocm package_manager=spack-pip caliper=mpi,time,rocm'] # rocm7 - HOST: tuolumne ARCHCONFIG: llnl-elcapitan @@ -143,3 +148,8 @@ run_tests_flux_tioga: ARCHCONFIG: llnl-elcapitan BENCHMARK: [amg2023, kripke, laghos, raja-perf] VARIANT: [+rocm] + # spack-pip + - HOST: tioga + ARCHCONFIG: llnl-elcapitan + BENCHMARK: [py-scaffold] + VARIANT: ['+rocm package_manager=spack-pip caliper=mpi,time,rocm'] diff --git a/.gitlab/tests/shared_slurm_clusters.yml b/.gitlab/tests/shared_slurm_clusters.yml index d31e48fae..61118b1e1 100644 --- a/.gitlab/tests/shared_slurm_clusters.yml +++ b/.gitlab/tests/shared_slurm_clusters.yml @@ -142,3 +142,8 @@ run_tests_slurm_matrix: ARCHCONFIG: llnl-matrix BENCHMARK: [sparta-snl] VARIANT: [+cuda~gpu-aware-mpi] + # spack-pip + - HOST: matrix + ARCHCONFIG: llnl-matrix + BENCHMARK: [py-scaffold] + VARIANT: ['+cuda package_manager=spack-pip caliper=mpi,time,cuda'] diff --git a/experiments/py-scaffold/experiment.py b/experiments/py-scaffold/experiment.py new file mode 100644 index 000000000..d23b0e140 --- /dev/null +++ b/experiments/py-scaffold/experiment.py @@ -0,0 +1,93 @@ +# Copyright 2024 Lawrence Livermore National Security, LLC and other +# Benchpark Project Developers. See the top-level COPYRIGHT file for details. 
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from benchpark.caliper import Caliper
+from benchpark.cuda import CudaExperiment
+from benchpark.directives import maintainers, variant
+from benchpark.experiment import Experiment
+from benchpark.rocm import ROCmExperiment
+from benchpark.scaling import Scaling, ScalingMode
+
+
+class PyScaffold(
+    Experiment,
+    CudaExperiment,
+    ROCmExperiment,
+    Scaling(ScalingMode.Strong, ScalingMode.Weak),
+    Caliper,
+):
+    """Benchpark experiment for ScaFFold, registering both a spack and a pip package spec."""
+
+    maintainers("michaelmckinsey1")
+
+    variant("workload", default="sweep", values=("sweep",))
+
+    variant("version", default="main", values=("main",), description="app version")
+
+    def compute_applications_section(self):
+        """Set the n_gpus/problem_scale defaults and register the strong/weak scaling rules."""
+        if self.spec.satisfies("+strong"):
+            n_gpus = 4
+            if self.spec.satisfies("exec_mode=test"):
+                problem_scale = 5
+            else:
+                problem_scale = 6
+        elif self.spec.satisfies("+weak"):
+            n_gpus = 1
+            problem_scale = 5
+        else:
+            n_gpus = 1
+            problem_scale = 6
+
+        self.add_experiment_variable("n_gpus", n_gpus, True)
+        self.add_experiment_variable("problem_scale", problem_scale, True)
+
+        self.register_scaling_config(
+            {
+                # Strong: multiply n_gpus by the scaling factor, keep problem_scale fixed
+                ScalingMode.Strong: {
+                    "n_gpus": lambda var, itr, dim, scaling_factor: var.val(dim)
+                    * scaling_factor,
+                    "problem_scale": lambda var, itr, dim, scaling_factor: var.val(dim),
+                },
+                # Weak: each step uses 8x (2**3) the GPUs and problem_scale + 1; scaling_factor is ignored
+                ScalingMode.Weak: {
+                    "n_gpus": lambda var, itr, dim, scaling_factor: var.val(dim) * 2**3,
+                    "problem_scale": lambda var, itr, dim, scaling_factor: var.val(dim)
+                    + 1,
+                },
+            }
+        )
+
+        self.set_required_variables(
+            n_resources="{n_gpus}",
+            process_problem_size="({problem_scale}-4)/({n_gpus}/({problem_scale}-4)**3)",
+            total_problem_size="{problem_scale}",
+        )
+
+    def compute_package_section(self):
+        """Register the spack spec and the pip requirement (written to requirements.txt) for ScaFFold."""
+        if self.spec.satisfies("+rocm"):
+            # systems whose name contains "llnl" use a site-specific wheel for rocm
+            model = "rocmwci" if "llnl" in self.system_spec._name else "rocm"
+        elif self.spec.satisfies("+cuda"):
+            model = "cuda"
+        else:
+            # The pip requirement below needs a GPU extra; fail early with a clear message
+            raise NotImplementedError("py-scaffold requires +rocm or +cuda")
+
+        self.add_package_spec(
+            self.name,
+            [f"py-scaffold@{self.spec.variants['version'][0]}"],
+            package_manager="spack",
+        )
+        self.add_package_spec(
+            self.name,
+            [
+                # extra index for torch wheel and pypi index for packages that won't be found on WCI
+                f"--extra-index-url https://download.pytorch.org/whl/\n--extra-index-url https://pypi.org/simple\nScaFFold[{model}] @ git+https://github.com/LBANN/ScaFFold.git",
+            ],
+            package_manager="pip",
+        )
diff --git a/lib/benchpark/caliper.py b/lib/benchpark/caliper.py
index 403a3f70d..2be65cd94 100644
--- a/lib/benchpark/caliper.py
+++ b/lib/benchpark/caliper.py
@@ -28,6 +28,17 @@ class Caliper:
         description="caliper mode",
     )
 
+    variant(
+        "cali_version",
+        default="master",
+        values=(
+            "master",
+            "2.14.0",
+            "2.13.1",
+        ),
+        description="version",
+    )
+
     class Helper(ExperimentHelper):
         def compute_modifiers_section(self):
             modifier_list = []
@@ -44,7 +55,7 @@ def compute_modifiers_section(self):
 
         def compute_package_section(self):
             # set package versions
-            caliper_version = "master"
+            caliper_version = self.spec.variants["cali_version"][0]
 
             # get system config options
             # TODO: Get compiler/mpi/package handles directly from system.py
@@ -60,12 +71,12 @@ def compute_package_section(self):
 
             if not self.spec.satisfies("caliper=none"):
                 package_specs["caliper"] = {
-                    "pkg_spec": f"caliper@{caliper_version}+adiak+mpi~libunwind~libdw",
+                    "spack_pkg_spec": f"caliper@{caliper_version}+adiak+mpi~libunwind~libdw",
                 }
                 if any("topdown" in var for var in self.spec.variants["caliper"]):
                     papi_support = True  # check if target system supports papi
                     if papi_support:
-                        package_specs["caliper"]["pkg_spec"] += "+papi"
+                        package_specs["caliper"]["spack_pkg_spec"] += "+papi"
                     else:
                         raise NotImplementedError(
                             "Target system does not support the papi interface"
@@ -76,7 +87,7 @@ def compute_package_section(self):
                     )  # check if target system supports cuda
                     if cuda_support:
                         package_specs["caliper"][
-                            "pkg_spec"
+                            "spack_pkg_spec"
                         ] += "~papi+cuda 
cuda_arch={}".format(system_specs["cuda_arch"]) else: raise NotImplementedError( @@ -88,7 +99,7 @@ def compute_package_section(self): ) # check if target system supports rocm if rocm_support: package_specs["caliper"][ - "pkg_spec" + "spack_pkg_spec" ] += "~papi+rocm amdgpu_target={}".format( system_specs["rocm_arch"] ) @@ -99,7 +110,7 @@ def compute_package_section(self): elif self.spec.satisfies("caliper=time") or self.spec.satisfies( "caliper=mpi" ): - package_specs["caliper"]["pkg_spec"] += "~papi" + package_specs["caliper"]["spack_pkg_spec"] += "~papi" return { "packages": {k: v for k, v in package_specs.items() if v}, diff --git a/lib/benchpark/cmd/setup.py b/lib/benchpark/cmd/setup.py index aaa9ba024..ff14cfb20 100644 --- a/lib/benchpark/cmd/setup.py +++ b/lib/benchpark/cmd/setup.py @@ -204,7 +204,7 @@ def include_fn(fname): ) pkg_str = "" - if pkg_manager == "spack": + if "spack" in pkg_manager: spack_build_stage = experiments_root / "builds" spack_user_cache_path = experiments_root / "spack-cache" spack, first_time_spack = per_workspace_setup.spack_first_time_setup() diff --git a/lib/benchpark/experiment.py b/lib/benchpark/experiment.py index ab2d299a7..5f23a48f2 100644 --- a/lib/benchpark/experiment.py +++ b/lib/benchpark/experiment.py @@ -129,14 +129,14 @@ def compute_package_section(self): if not self.spec.satisfies("affinity=none"): package_specs["affinity"] = { - "pkg_spec": f"affinity@{affinity_version}+mpi", + "spack_pkg_spec": f"affinity@{affinity_version}+mpi", "compiler": system_specs["compiler"], } if self.spec.satisfies("+cuda"): - package_specs["affinity"]["pkg_spec"] += "+cuda" + package_specs["affinity"]["spack_pkg_spec"] += "+cuda" elif self.spec.satisfies("+rocm"): package_specs["affinity"][ - "pkg_spec" + "spack_pkg_spec" ] += "+rocm amdgpu_target={rocm_arch}" return { @@ -201,7 +201,7 @@ class Experiment(ExperimentSystemBase, ExecMode, Affinity, Hwloc): variant( "package_manager", default="spack", - values=("spack", "environment-modules", 
"user-managed"), + values=("spack", "environment-modules", "user-managed", "pip", "spack-pip"), description="package manager to use", ) @@ -360,7 +360,7 @@ def compute_config_section(self): "system": system_dict, "spec": str(self.spec), } - if self.spec.variants["package_manager"][0] == "spack": + if "spack" in self.spec.variants["package_manager"][0]: default_config["spack_flags"] = { "install": "--add --keep-stage", "concretize": "-U -f", @@ -485,11 +485,16 @@ def compute_applications_section_wrapper(self): } } - def add_package_spec(self, package_name, spec=None): + def add_package_spec(self, package_name, spec=None, package_manager="spack"): if spec: - self.package_specs[package_name] = { - "pkg_spec": spec[0], - } + if package_name not in self.package_specs: + self.package_specs[package_name] = { + f"{package_manager}_pkg_spec": spec[0], + } + else: + self.package_specs[package_name][f"{package_manager}_pkg_spec"] = spec[ + 0 + ] else: self.package_specs[package_name] = {} @@ -517,14 +522,14 @@ def compute_package_section_wrapper(self): f"Package section must be defined for application package {self.name}" ) - if pkg_manager == "spack": + if "spack" in pkg_manager: spack_variants = list( filter( lambda v: v is not None, (cls.get_spack_variants() for cls in self.helpers), ) ) - self.package_specs[self.name]["pkg_spec"] += " ".join( + self.package_specs[self.name]["spack_pkg_spec"] += " ".join( spack_variants ).strip() diff --git a/lib/benchpark/test/caliper.py b/lib/benchpark/test/caliper.py index 7aca10d05..0636c73f3 100644 --- a/lib/benchpark/test/caliper.py +++ b/lib/benchpark/test/caliper.py @@ -39,6 +39,7 @@ def test_experiment_compute_variables_section_caliper(monkeypatch): "n_threads_per_proc": "{n_threads_per_proc}", "benchpark_spec": ["~cuda+mpi~openmp~rocm"], "append_path": "'", + "cali_version": "master", "caliper": "time", "exec_mode": "test", "package_manager": "spack", @@ -119,6 +120,7 @@ def test_caliper_modifier(monkeypatch): "benchpark_spec": 
"['~cuda+mpi~openmp~rocm']", "affinity": "none", "append_path": "'", + "cali_version": "master", "caliper": "time", "exec_mode": "test", "hwloc": "none", diff --git a/modifiers/caliper/modifier.py b/modifiers/caliper/modifier.py index 5f7a47d27..b98f0fa6f 100644 --- a/modifiers/caliper/modifier.py +++ b/modifiers/caliper/modifier.py @@ -89,7 +89,7 @@ def modify_experiment(self, app): add_mode( mode_name="rocm", - mode_option="profile.hip,rocm.gputime", + mode_option="rocm.gputime", description="Profile HIP API functions, time spent on GPU", ) @@ -156,6 +156,6 @@ def _build_metadata(self, workspace, app_inst): with open(cali_metadata_file, "w") as f: f.write(json.dumps(cali_metadata)) - software_spec("caliper", pkg_spec="caliper") - - required_package("caliper") + with when("package_manager_family=spack"): + software_spec("caliper", pkg_spec="caliper", package_manager="spack") + required_package("caliper") diff --git a/repo/py-scaffold/application.py b/repo/py-scaffold/application.py new file mode 100644 index 000000000..9635edb7a --- /dev/null +++ b/repo/py-scaffold/application.py @@ -0,0 +1,55 @@ +# Copyright 2023 Lawrence Livermore National Security, LLC and other +# Benchpark Project Developers. See the top-level COPYRIGHT file for details. 
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import yaml
+from ramble.appkit import *
+
+
+class PyScaffold(ExecutableApplication):
+    """Scale-Free Fractal benchmark - A scalable deep learning benchmark: UNet trained on procedurally-generated, 3D fractal data"""
+
+    name = "scaffold"
+
+    tags = ["python"]
+
+    register_phase("prepend_library_path", pipeline="setup", run_before=["make_experiments"])
+
+    def _prepend_library_path(self, workspace, app_inst=None):
+        """Set the `rocm_mods` and `ld_paths` variables used by the "modules" executable; can't do in spack because python_platlib points to wrong site-packages dir"""
+        paths = []
+        # only when cuda_arch is defined
+        if "cuda_arch" in app_inst.variables:
+            # Avoid libcudnn_graph.so error (unnecessary if cuX_full, necessary if cuX wheel)
+            paths.append("{pip_site_packages_path}/nvidia/cudnn/lib")
+        # NOTE(review): module names, versions and paths below are hard-coded site-specific values
+        app_inst.variables["rocm_mods"] = ""
+        if "rocm_arch" in app_inst.variables:
+            app_inst.variables["rocm_mods"] = "module load rocm/6.4.2 rccl/fast-env-slows-mpi libfabric\nexport SPINDLE_FLUXOPT=off\nexport LD_PRELOAD=/opt/rocm-6.4.2/llvm/lib/libomp.so\nexport MPICH_GPU_SUPPORT_ENABLED=0\nexport LD_LIBRARY_PATH=/collab/usr/global/tools/rccl/toss_4_x86_64_ib_cray/rocm-6.4.1/install/lib/:$LD_LIBRARY_PATH\nexport LD_LIBRARY_PATH=/opt/cray/pe/cce/20.0.0/cce/x86_64/lib:$LD_LIBRARY_PATH\n"
+
+        # Needed for caliper (avoids libcaffe2_nvrtc.so) but appended unconditionally
+        paths.append("{pip_site_packages_path}/torch/lib")
+
+        app_inst.variables["ld_paths"] = ":".join(paths)
+
+    with when("package_manager_family=pip"):
+        software_spec("scaffold", pkg_spec="py-scaffold")
+
+    # TODO: Figure out MPICH_GPU_SUPPORT_ENABLED=0, disabling GTL otherwise linker error.
+    executable(
+        "modules",
+        "{rocm_mods}export LD_LIBRARY_PATH={ld_paths}:$LD_LIBRARY_PATH",
+    )
+    executable(
+        "generate",
+        "scaffold generate_fractals -c {package_path}ScaFFold/configs/benchmark_default.yml --problem-scale {problem_scale}",
+        use_mpi=True,
+    )
+    executable(
+        "run",
+        "scaffold benchmark -c {package_path}ScaFFold/configs/benchmark_default.yml --problem-scale {problem_scale}",
+        use_mpi=True,
+    )
+
+    workload("sweep", executables=["modules", "generate", "run"])
diff --git a/repo/py-scaffold/package.py b/repo/py-scaffold/package.py
new file mode 100644
index 000000000..058c0c649
--- /dev/null
+++ b/repo/py-scaffold/package.py
@@ -0,0 +1,91 @@
+# Copyright 2023 Lawrence Livermore National Security, LLC and other
+# Benchpark Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+from spack.package import *
+from spack_repo.builtin.build_systems.python import PythonPackage
+
+
+class PyScaffold(PythonPackage, CudaPackage, ROCmPackage):
+    """Scale-Free Fractal benchmark"""
+
+    git = "https://github.com/LBANN/ScaFFold.git"
+
+    version("main", branch="main")
+
+    maintainers("michaelmckinsey1")
+    license("Apache-2.0")
+
+    variant("mpi", default=True, description="MPI support")
+    variant("caliper", default=False, description="Build with Caliper support enabled.")
+
+    depends_on("python@3.11:", type=("build", "run"))
+    # TODO: Get pip._vendor.pyproject_hooks._impl.BackendUnavailable: Cannot import 'setuptools.build_meta' from pip otherwise
+    depends_on("py-setuptools", type="build")
+
+    # MPI is only a dependency when the mpi variant is enabled
+    depends_on("mpi", when="+mpi")
+
+    depends_on("c", type="build")
+    depends_on("cxx", type="build")
+
+    depends_on("caliper+python", when="+caliper", type=("build", "run"))
+    depends_on("adiak+python", when="+caliper", type=("build", "run"))
+
+    def cmake_args(self):
+        """Return CMake arguments carrying the MPI link flags.
+
+        NOTE(review): this class derives from PythonPackage, so it is unclear
+        whether anything calls this hook; confirm it is still needed.
+        """
+        # super() is already bound, so self must not be passed again
+        args = super().cmake_args()
+
+        if "+mpi" in self.spec:
+            mpi_ld_flags = self.spec["mpi"].libs.ld_flags
+            args.append(self.define("CMAKE_EXE_LINKER_FLAGS", mpi_ld_flags))
+            args.append(self.define("MPI_CXX_LINK_FLAGS", mpi_ld_flags))
+
+        return args
+
+    def _mpi_extra_attributes(self):
+        """Return the MPI spec's extra_attributes, or {} when ~mpi or none are set."""
+        if "+mpi" not in self.spec:
+            return {}
+        return self.spec["mpi"].extra_attributes or {}
+
+    def _prepend_compiler_rpaths(self, env):
+        """Prepend each of the compiler's extra rpaths to LD_LIBRARY_PATH."""
+        for rpath in self.compiler.extra_rpaths or []:
+            env.prepend_path("LD_LIBRARY_PATH", rpath)
+
+    def setup_build_environment(self, env):
+        """Add compiler rpaths, MPI ldflags and the GTL library path for the build."""
+        super().setup_build_environment(env)
+
+        self._prepend_compiler_rpaths(env)
+
+        mpi_attrs = self._mpi_extra_attributes()
+        if "ldflags" in mpi_attrs:
+            env.append_flags("LDFLAGS", mpi_attrs["ldflags"])
+        if "gtl_lib_path" in mpi_attrs:
+            env.prepend_path("LD_LIBRARY_PATH", mpi_attrs["gtl_lib_path"])
+
+    def setup_run_environment(self, env):
+        """Add the GTL library path and compiler rpaths to the run environment."""
+        super().setup_run_environment(env)
+
+        mpi_attrs = self._mpi_extra_attributes()
+        if "gtl_lib_path" in mpi_attrs:
+            # Avoid gtl error
+            env.prepend_path("LD_LIBRARY_PATH", mpi_attrs["gtl_lib_path"])
+
+        # if self.spec.satisfies("+caliper"):
+        #     if self.spec.satisfies("+rocm"):
+        #         # Need to set this to libcaliper.so to avoid rocprofiler context error
+        #         env.set("ROCP_TOOL_LIBRARIES", os.path.join(self.spec["caliper"].prefix, "lib64", "libcaliper.so"))
+
+        self._prepend_compiler_rpaths(env)
diff --git a/systems/llnl-elcapitan/system.py b/systems/llnl-elcapitan/system.py
index eda9cb10a..0aef5bc30 100644
--- a/systems/llnl-elcapitan/system.py
+++ b/systems/llnl-elcapitan/system.py
@@ -223,17 +223,23 @@ def __init__(self, spec):
         if self.rocm_version >= Version("6.4.0"):
             self.cce_version = Version("20.0.0")
             self.mpi_version = Version("9.0.1")
+            self.rccl_version = Version("6.4.1")
         elif self.rocm_version >= Version("6.0.0"):
             self.cce_version = Version("18.0.1")
             self.mpi_version = Version("8.1.31")
+            self.rccl_version = Version("6.3.1")
         else:
             self.cce_version = Version("16.0.0")
             self.mpi_version = Version("8.1.26")
-        if self.rocm_version >= 
Version("6.0.0"): + self.rccl_version = Version("5.4.3") + if self.rocm_version >= Version("6.4.0"): + self.pmi_version = Version("6.1.15.6") + self.pals_version = Version("1.2.12") + self.llvm_version = Version("19.0.0") + elif self.rocm_version >= Version("6.0.0"): self.pmi_version = Version("6.1.15.6") self.pals_version = Version("1.2.12") self.llvm_version = Version("18.0.1") - else: self.pmi_version = Version("6.1.12") self.pals_version = Version("1.2.9") @@ -338,6 +344,7 @@ def compute_packages_section(self): "cvs": {"externals": [{"spec": "cvs@1.11.23", "prefix": "/usr"}]}, "git": { "externals": [ + {"spec": "git@2.43.7", "prefix": "/usr"}, {"spec": "git@2.31.1+tcltk", "prefix": "/usr"}, {"spec": "git@2.29.1+tcltk", "prefix": "/usr/tce"}, ] @@ -354,7 +361,15 @@ def compute_packages_section(self): { "spec": "python@3.9.12", "prefix": "/usr/tce/packages/python/python-3.9.12", - } + }, + { + "spec": "python@3.11.5", + "prefix": "/usr/tce/packages/python/python-3.11.5", + }, + { + "spec": "python@3.12.2", + "prefix": "/usr/tce/packages/python/python-3.12.2", + }, ], }, "unzip": { @@ -387,6 +402,18 @@ def compute_packages_section(self): ], "buildable": False, }, + "ncurses": { + "externals": [{"spec": "ncurses@6.1.20180224", "prefix": "/usr"}], + "buildable": False, + }, + "libxcrypt": { + "externals": [{"spec": "libxcrypt@4.1.1", "prefix": "/usr"}], + "buildable": False, + }, + "opengl": { + "externals": [{"spec": "opengl@4.5", "prefix": "/usr"}], + "buildable": False, + }, } } @@ -720,7 +747,7 @@ def rocm_config(self): { "spec": f"llvm@{self.llvm_version}", "prefix": f"/opt/rocm-{self.rocm_version}/llvm", - } + }, ], "buildable": False, }, @@ -759,6 +786,7 @@ def rocm_cce_compiler_cfg(self): f"/opt/rocm-{self.rocm_version}/lib", "/opt/cray/pe/gcc-libs", f"/opt/cray/pe/cce/{self.cce_version}/cce/x86_64/lib", + f"/collab/usr/global/tools/rccl/toss_4_x86_64_ib_cray/rocm-{self.rccl_version}/install/lib", ] # Avoid libunwind.so.1 error on tioga if 
self.spec.variants["cluster"][0] in ["tioga", "tuolumne"]: diff --git a/systems/llnl-matrix/system.py b/systems/llnl-matrix/system.py index c6204f393..65746a121 100644 --- a/systems/llnl-matrix/system.py +++ b/systems/llnl-matrix/system.py @@ -64,6 +64,14 @@ class LlnlMatrix(System): description="Submit a job to a specific named bank", ) + variant( + "queue", + default="none", + values=("none", "pbatch", "pdebug"), + multi=False, + description="Submit to named queue", + ) + def __init__(self, spec): super().__init__(spec) self.programming_models = [CudaSystem(), OpenMPCPUOnlySystem()] @@ -151,13 +159,21 @@ def compute_packages_section(self): "buildable": False, }, "python": { + "buildable": False, "externals": [ { - "spec": "python@3.9.12+bz2+crypt+ctypes+dbm+lzma+pyexpat~pythoncmd+readline+sqlite3+ssl+tix+tkinter+uuid+zlib", - "prefix": "/usr/tce", + "spec": "python@3.9.12", + "prefix": "/usr/tce/packages/python/python-3.9.12", + }, + { + "spec": "python@3.11.9", + "prefix": "/usr/tce/packages/python/python-3.11.9", + }, + { + "spec": "python@3.12.4", + "prefix": "/usr/tce/packages/python/python-3.12.4", }, ], - "buildable": False, }, "hwloc": { "externals": [{"spec": "hwloc@2.11.2", "prefix": "/usr"}], @@ -171,6 +187,10 @@ def compute_packages_section(self): "externals": [{"spec": "curl@7.61.1", "prefix": "/usr"}], "buildable": False, }, + "git": { + "externals": [{"spec": "git@2.43.7", "prefix": "/usr"}], + "buildable": False, + }, "mpi": {"buildable": False}, } }