From 49d47ecacf203a029bbdd2924bfd6df6b9dcad63 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 2 Apr 2026 17:32:42 +0000 Subject: [PATCH 01/23] Fix NUMA GPU pinning bug --- presto/slurm/presto-nvl72/functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 3b3ef1ba..34a0ada8 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -179,7 +179,7 @@ function run_worker { local gpu_id=$1 image=$2 node=$3 worker_id=$4 # Assign NUMA node based on GPU ID: GPUs 0-3 → node 0, GPUs 4-7 → node 1, etc. - local numa_node=$((gpu_id / 4)) + local numa_node=$((gpu_id / 2)) echo "running worker ${worker_id} with image ${image} on node ${node} with gpu_id ${gpu_id} numa_node ${numa_node}" local worker_image="${IMAGE_DIR}/${image}.sqsh" From 86eeb14b0caa2d95d425a0d2e4cce37743b73e95 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 2 Apr 2026 17:33:30 +0000 Subject: [PATCH 02/23] Update comment --- presto/slurm/presto-nvl72/functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 34a0ada8..ba9e3ca7 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -178,7 +178,7 @@ function run_worker { validate_environment_preconditions LOGS CONFIGS VT_ROOT COORD CUDF_LIB DATA local gpu_id=$1 image=$2 node=$3 worker_id=$4 - # Assign NUMA node based on GPU ID: GPUs 0-3 → node 0, GPUs 4-7 → node 1, etc. + # Assign NUMA node based on GPU ID: GPUs 0-1 → node 0, GPUs 2-3 → node 1, etc. 
local numa_node=$((gpu_id / 2)) echo "running worker ${worker_id} with image ${image} on node ${node} with gpu_id ${gpu_id} numa_node ${numa_node}" From 912a80aa35bc7def9387baf51fb4cefd2972d2ba Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 2 Apr 2026 21:36:05 +0000 Subject: [PATCH 03/23] Enable gds --- presto/slurm/presto-nvl72/defaults.env | 4 +++ presto/slurm/presto-nvl72/functions.sh | 49 +++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/presto/slurm/presto-nvl72/defaults.env b/presto/slurm/presto-nvl72/defaults.env index 9a8e9c1b..b3973a77 100644 --- a/presto/slurm/presto-nvl72/defaults.env +++ b/presto/slurm/presto-nvl72/defaults.env @@ -30,3 +30,7 @@ unset _vt_path # --- SLURM node defaults (cluster-specific) --- : "${DEFAULT_NODELIST:=presto-gb200-gcn-[01-13,15-16]}" : "${DEFAULT_SINGLE_NODE:=presto-gb200-gcn-01}" + +# --- I/O settings --- +: "${ENABLE_GDS:=1}" +export ENABLE_GDS diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index ba9e3ca7..c61cc71b 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -199,6 +199,51 @@ function run_worker { mkdir -p ${worker_data}/hive/data/user_data mkdir -p ${VT_ROOT}/.hive_metastore + local vt_cufile_log_dir="/var/log/cufile" + local vt_cufile_log="${vt_cufile_log_dir}/cufile_worker_${worker_id}.log" + + local gds_mounts="" + local gds_env_args="" + + function add_gds_sys_path { + local path="${1:?Path argument missing}" + local read_only="${2:-0}" + + # System file path must exist + if [[ ! 
-e ${path} ]]; then + echo "${path} required by GDS does not exist" + exit 1 + fi + + # If gds_mounts is not empty, append a comma + [[ -n "${gds_mounts}" ]] && gds_mounts+="," + + # Append path + if [[ "${read_only}" == "1" ]]; then + gds_mounts+="${path}:${path}:ro" + else + gds_mounts+="${path}" + fi + } + + if [[ "${ENABLE_GDS}" == "1" ]]; then + # Add GDS-required system paths + add_gds_sys_path "/run/udev" 1 + add_gds_sys_path "/dev/infiniband" + add_gds_sys_path "/etc/cufile.json" 1 + for dev in /dev/nvidia-fs*; do + # If file exists, append the path, otherwise, exit the loop + [[ -e "${dev}" ]] || continue + add_gds_sys_path "${dev}" + done + + # Add the log directory + gds_mounts+=",${LOGS}:${vt_cufile_log_dir}" + + # Add GDS-related env vars + gds_env_args="--container-env=KVIKIO_COMPAT_MODE=OFF --container-env=CUFILE_LOGFILE_PATH=${vt_cufile_log}" + fi + # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel # capabilities are already set up for the job cgroup. 
Do NOT use --gres=gpu:1 # on the step: it restricts the step's cgroup to one GPU and then nvidia-container-cli @@ -225,7 +270,9 @@ ${worker_data}:/var/lib/presto/data,\ ${DATA}:/var/lib/presto/data/hive/data/user_data,\ ${VT_ROOT}/.hive_metastore:/var/lib/presto/data/hive/metastore,\ /usr/lib/aarch64-linux-gnu/libcuda.so.580.105.08:/usr/local/cuda-13.0/compat/libcuda.so.1,\ -/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.580.105.08:/usr/local/lib/libnvidia-ml.so.1 \ +/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.580.105.08:/usr/local/lib/libnvidia-ml.so.1\ +${gds_mounts:+,${gds_mounts}} \ +${gds_env_args} \ --container-env=LD_LIBRARY_PATH="$CUDF_LIB:$LD_LIBRARY_PATH" \ --container-env=GLOG_vmodule=IntraNodeTransferRegistry=3,ExchangeOperator=3 \ --container-env=GLOG_logtostderr=1 \ From c36a52a0f6fd0f5d86889fe07a43d62b39cc5662 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 3 Apr 2026 04:01:07 +0000 Subject: [PATCH 04/23] Fix bugs --- presto/slurm/presto-nvl72/functions.sh | 32 +++++++++++--------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index c61cc71b..7d2d6b0b 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -94,9 +94,7 @@ function run_coord_image { srun -w $COORD --ntasks=1 --overlap \ --container-image=${coord_image} \ --container-remap-root \ ---export=ALL,JAVA_HOME=/usr/lib/jvm/jre-17-openjdk \ ---container-env=JAVA_HOME=/usr/lib/jvm/jre-17-openjdk \ ---container-env=PATH=/usr/lib/jvm/jre-17-openjdk/bin:$PATH \ +--export=ALL \ --container-mounts=${VT_ROOT}:/workspace,\ ${coord_data}:/var/lib/presto/data,\ ${CONFIGS}/etc_common:/opt/presto-server/etc,\ @@ -110,9 +108,7 @@ ${VT_ROOT}/.hive_metastore:/var/lib/presto/data/hive/metastore${extra_mounts} \ srun -w $COORD --ntasks=1 --overlap \ --container-remap-root \ --container-image=${coord_image} \ ---export=ALL,JAVA_HOME=/usr/lib/jvm/jre-17-openjdk \ 
---container-env=JAVA_HOME=/usr/lib/jvm/jre-17-openjdk \ ---container-env=PATH=/usr/lib/jvm/jre-17-openjdk/bin:$PATH \ +--export=ALL \ --container-mounts=${VT_ROOT}:/workspace,\ ${coord_data}:/var/lib/presto/data,\ ${CONFIGS}/etc_common:/opt/presto-server/etc,\ @@ -203,8 +199,6 @@ function run_worker { local vt_cufile_log="${vt_cufile_log_dir}/cufile_worker_${worker_id}.log" local gds_mounts="" - local gds_env_args="" - function add_gds_sys_path { local path="${1:?Path argument missing}" local read_only="${2:-0}" @@ -219,10 +213,9 @@ function run_worker { [[ -n "${gds_mounts}" ]] && gds_mounts+="," # Append path + gds_mounts+="${path}:${path}" if [[ "${read_only}" == "1" ]]; then - gds_mounts+="${path}:${path}:ro" - else - gds_mounts+="${path}" + gds_mounts+=":ro" fi } @@ -239,9 +232,6 @@ function run_worker { # Add the log directory gds_mounts+=",${LOGS}:${vt_cufile_log_dir}" - - # Add GDS-related env vars - gds_env_args="--container-env=KVIKIO_COMPAT_MODE=OFF --container-env=CUFILE_LOGFILE_PATH=${vt_cufile_log}" fi # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel @@ -272,13 +262,19 @@ ${VT_ROOT}/.hive_metastore:/var/lib/presto/data/hive/metastore,\ /usr/lib/aarch64-linux-gnu/libcuda.so.580.105.08:/usr/local/cuda-13.0/compat/libcuda.so.1,\ /usr/lib/aarch64-linux-gnu/libnvidia-ml.so.580.105.08:/usr/local/lib/libnvidia-ml.so.1\ ${gds_mounts:+,${gds_mounts}} \ -${gds_env_args} \ ---container-env=LD_LIBRARY_PATH="$CUDF_LIB:$LD_LIBRARY_PATH" \ ---container-env=GLOG_vmodule=IntraNodeTransferRegistry=3,ExchangeOperator=3 \ ---container-env=GLOG_logtostderr=1 \ -- /bin/bash -c " +export LD_LIBRARY_PATH=\"${CUDF_LIB}:${LD_LIBRARY_PATH}\" +export GLOG_vmodule=IntraNodeTransferRegistry=3,ExchangeOperator=3 +export GLOG_logtostderr=1 +if [[ '${ENABLE_GDS}' == '1' ]]; then + export KVIKIO_COMPAT_MODE=OFF + export CUFILE_LOGFILE_PATH=${vt_cufile_log} +fi if [[ '${VARIANT_TYPE}' == 'gpu' ]]; then export CUDA_VISIBLE_DEVICES=${gpu_id}; fi echo 
\"Worker ${worker_id}: CUDA_VISIBLE_DEVICES=\${CUDA_VISIBLE_DEVICES:-none}, NUMA_NODE=${numa_node}\" +echo \"Worker ${worker_id}: ENABLE_GDS=\${ENABLE_GDS:-unset}\" +echo \"Worker ${worker_id}: KVIKIO_COMPAT_MODE=\${KVIKIO_COMPAT_MODE:-unset}\" +echo \"Worker ${worker_id}: CUFILE_LOGFILE_PATH=\${CUFILE_LOGFILE_PATH:-unset}\" if [[ '${USE_NUMA}' == '1' ]]; then numactl --cpubind=${numa_node} --membind=${numa_node} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc else From a1389266da5574ab41f0b04dec8f8ba07df2c675 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 3 Apr 2026 04:22:37 +0000 Subject: [PATCH 05/23] Fix huge dump size --- presto/slurm/presto-nvl72/functions.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 7d2d6b0b..7dbee15b 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -247,6 +247,8 @@ function run_worker { # compat library with the host driver so cudaMallocAsync works. # CUDA_VISIBLE_DEVICES=${gpu_id} inside the container restricts each worker to # its assigned GPU while still allowing the CUDA driver to enumerate all devices. 
+ # export GLOG_vmodule=IntraNodeTransferRegistry=3,ExchangeOperator=3 + # export GLOG_logtostderr=1 srun -N1 -w $node --ntasks=1 --overlap \ --container-image=${worker_image} \ --container-remap-root \ @@ -264,11 +266,10 @@ ${VT_ROOT}/.hive_metastore:/var/lib/presto/data/hive/metastore,\ ${gds_mounts:+,${gds_mounts}} \ -- /bin/bash -c " export LD_LIBRARY_PATH=\"${CUDF_LIB}:${LD_LIBRARY_PATH}\" -export GLOG_vmodule=IntraNodeTransferRegistry=3,ExchangeOperator=3 -export GLOG_logtostderr=1 if [[ '${ENABLE_GDS}' == '1' ]]; then export KVIKIO_COMPAT_MODE=OFF export CUFILE_LOGFILE_PATH=${vt_cufile_log} + export CUFILE_LOGGING_LEVEL=INFO fi if [[ '${VARIANT_TYPE}' == 'gpu' ]]; then export CUDA_VISIBLE_DEVICES=${gpu_id}; fi echo \"Worker ${worker_id}: CUDA_VISIBLE_DEVICES=\${CUDA_VISIBLE_DEVICES:-none}, NUMA_NODE=${numa_node}\" From 9be3af624d708ca7b14ad46c6f14b3760401b802 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 8 Apr 2026 21:29:34 +0000 Subject: [PATCH 06/23] Update --- presto/slurm/presto-nvl72/defaults.env | 9 ++++- presto/slurm/presto-nvl72/functions.sh | 34 ++++++++++++++++--- presto/slurm/presto-nvl72/launch-run.sh | 4 +-- .../presto-nvl72/run-presto-benchmarks.sh | 16 +++++++++ 4 files changed, 56 insertions(+), 7 deletions(-) diff --git a/presto/slurm/presto-nvl72/defaults.env b/presto/slurm/presto-nvl72/defaults.env index b3973a77..2c7782f0 100644 --- a/presto/slurm/presto-nvl72/defaults.env +++ b/presto/slurm/presto-nvl72/defaults.env @@ -28,9 +28,16 @@ unset _vt_path : "${HIVE_METASTORE_SOURCE:=/mnt/data/tpch-rs/HIVE-METASTORE-MG-260313}" # --- SLURM node defaults (cluster-specific) --- -: "${DEFAULT_NODELIST:=presto-gb200-gcn-[01-13,15-16]}" +: "${DEFAULT_NODELIST:=presto-gb200-gcn-[01-16]}" : "${DEFAULT_SINGLE_NODE:=presto-gb200-gcn-01}" # --- I/O settings --- : "${ENABLE_GDS:=1}" export ENABLE_GDS + +# --- Profiling --- +: "${ENABLE_NSYS:=0}" +export ENABLE_NSYS + +: "${NSYS_WORKER_ID:=0}" +export NSYS_WORKER_ID diff --git 
a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 7dbee15b..cdeb740b 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -229,9 +229,27 @@ function run_worker { [[ -e "${dev}" ]] || continue add_gds_sys_path "${dev}" done + fi - # Add the log directory - gds_mounts+=",${LOGS}:${vt_cufile_log_dir}" + local nsys_bin="" + local nsys_opts="" + local vt_nsys_report_dir="/var/log/nsys" + if [[ "${ENABLE_NSYS}" == "1" && "${NSYS_WORKER_ID}" == "${worker_id}" ]]; then + nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" + nsys_opts="profile \ + -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ + -t cuda,nvtx \ + -f true \ + --sample=none \ + --cpuctxsw=none \ + --cuda-memory-usage=true \ + --nvtx-domain-exclude=CCCL" + # nsys_opts="profile \ + # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ + # -t cuda,ucx,nvtx,osrt \ + # -f true \ + # --cuda-memory-usage=true \ + # --nvtx-domain-exclude=CCCL" fi # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel @@ -261,6 +279,8 @@ ${worker_hive}:/opt/presto-server/etc/catalog/hive.properties,\ ${worker_data}:/var/lib/presto/data,\ ${DATA}:/var/lib/presto/data/hive/data/user_data,\ ${VT_ROOT}/.hive_metastore:/var/lib/presto/data/hive/metastore,\ +${LOGS}:${vt_cufile_log_dir},\ +${LOGS}:${vt_nsys_report_dir},\ /usr/lib/aarch64-linux-gnu/libcuda.so.580.105.08:/usr/local/cuda-13.0/compat/libcuda.so.1,\ /usr/lib/aarch64-linux-gnu/libnvidia-ml.so.580.105.08:/usr/local/lib/libnvidia-ml.so.1\ ${gds_mounts:+,${gds_mounts}} \ @@ -274,12 +294,18 @@ fi if [[ '${VARIANT_TYPE}' == 'gpu' ]]; then export CUDA_VISIBLE_DEVICES=${gpu_id}; fi echo \"Worker ${worker_id}: CUDA_VISIBLE_DEVICES=\${CUDA_VISIBLE_DEVICES:-none}, NUMA_NODE=${numa_node}\" echo \"Worker ${worker_id}: ENABLE_GDS=\${ENABLE_GDS:-unset}\" +echo \"Worker ${worker_id}: ENABLE_NSYS=\${ENABLE_NSYS:-unset}\" echo \"Worker ${worker_id}: 
KVIKIO_COMPAT_MODE=\${KVIKIO_COMPAT_MODE:-unset}\" echo \"Worker ${worker_id}: CUFILE_LOGFILE_PATH=\${CUFILE_LOGFILE_PATH:-unset}\" + +if [[ -n '${nsys_bin}' ]]; then + echo \"Worker ${worker_id}: Nsight System program at ${nsys_bin}\" +fi + if [[ '${USE_NUMA}' == '1' ]]; then - numactl --cpubind=${numa_node} --membind=${numa_node} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc + numactl --cpubind=${numa_node} --membind=${numa_node} ${nsys_bin} ${nsys_opts} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc else - /usr/bin/presto_server --etc-dir=/opt/presto-server/etc + ${nsys_bin} ${nsys_opts} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc fi" > ${LOGS}/worker_${worker_id}.log 2>&1 & } diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 1a0a02d7..594974ad 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -39,9 +39,9 @@ EXTRA_ARGS=() NUM_GPUS_PER_NODE="4" USE_NUMA="1" VARIANT_TYPE="gpu" -#WORKER_IMAGE="presto-native-worker-gpu" +# WORKER_IMAGE="presto-native-worker-gpu" +WORKER_IMAGE="presto-native-worker-gpu-karth-Mar11-with-nsys" COORD_IMAGE="presto-coordinator-karth-Mar11" -WORKER_IMAGE="presto-native-worker-gpu-karth-Mar11" #COORD_IMAGE="presto-coordinator-ibm-03-11" #WORKER_IMAGE="presto-native-worker-gpu-ibm-03-11" #WORKER_IMAGE="velox-testing-images-presto-471cf1a-velox-1a2f63f-gpu-cuda13.1-20260312-arm64" diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index 298276fc..f5b953d1 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -38,10 +38,16 @@ wait_until_coordinator_is_running echo "Starting ${NUM_WORKERS} Presto workers across ${NUM_NODES} nodes..." 
worker_id=0 +nsys_worker_pid="" for node in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do for gpu_id in $(seq 0 $((NUM_GPUS_PER_NODE - 1))); do echo " Starting worker ${worker_id} on node ${node} GPU ${gpu_id}" run_worker "${gpu_id}" "$WORKER_IMAGE" "${node}" "$worker_id" + + if [[ "${ENABLE_NSYS}" == "1" && "${NSYS_WORKER_ID}" == "${worker_id}" ]]; then + nsys_worker_pid=$! + fi + worker_id=$((worker_id + 1)) done done @@ -72,6 +78,16 @@ cp -r ${LOGS}/cli.log ${SCRIPT_DIR}/result_dir/summary.txt echo "Collecting configs and logs into result directory..." collect_results +if [[ -n "${nsys_worker_pid}" ]]; then + echo "Sending SIGINT to nsys (worker srun PID ${nsys_worker_pid})..." + # Send the interrupt signal to the nsys process + # If the process has already terminated, `kill` will have an error, hence `|| true` + kill -INT "${nsys_worker_pid}" 2>/dev/null || true + echo "Waiting for nsys to finalize report..." + # Wait for the nsys process to finalize the report and store to disk + wait "${nsys_worker_pid}" 2>/dev/null || true +fi + echo "========================================" echo "Benchmark complete!" 
echo "Results saved to: ${SCRIPT_DIR}/results_dir" From e46a5b0cb6fb86170ec19ad6f3a4904cf6991aad Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 9 Apr 2026 03:46:21 +0000 Subject: [PATCH 07/23] Add metrics support --- presto/slurm/presto-nvl72/defaults.env | 4 ++++ presto/slurm/presto-nvl72/functions.sh | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/presto/slurm/presto-nvl72/defaults.env b/presto/slurm/presto-nvl72/defaults.env index 2c7782f0..970612c5 100644 --- a/presto/slurm/presto-nvl72/defaults.env +++ b/presto/slurm/presto-nvl72/defaults.env @@ -35,6 +35,10 @@ unset _vt_path : "${ENABLE_GDS:=1}" export ENABLE_GDS +# --- Query metrics --- +: "${ENABLE_METRICS:=0}" +export ENABLE_METRICS + # --- Profiling --- : "${ENABLE_NSYS:=0}" export ENABLE_NSYS diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index cdeb740b..ec405800 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -344,6 +344,9 @@ function run_queries { [ $# -ne 2 ] && echo_error "$0 expected two arguments for '' and ''" local num_iterations=$1 local scale_factor=$2 + local metrics_flag="" + [[ "${ENABLE_METRICS}" == "1" ]] && metrics_flag="-m" + source "${SCRIPT_DIR}/defaults.env" # We currently skip dropping cache because it requires docker (not available on the cluster). 
run_coord_image "export PORT=$PORT; \ @@ -352,7 +355,7 @@ function run_queries { export MINIFORGE_HOME=/workspace/miniforge3; \ export HOME=/workspace; \ cd /workspace/presto/scripts; \ - ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} \ + ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} ${metrics_flag} \ --hostname ${COORD} --port $PORT -o /workspace/presto/slurm/presto-nvl72/result_dir --skip-drop-cache; \ echo 'Validating query results...'; \ MINIFORGE_HOME=/workspace/miniforge3 /workspace/scripts/run_py_script.sh \ From 8091544e2941c30ca5da77f6ccb36f2b0c3c5153 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 9 Apr 2026 16:48:50 +0000 Subject: [PATCH 08/23] Allow post results to handle failed queries --- benchmark_reporting_tools/post_results.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmark_reporting_tools/post_results.py b/benchmark_reporting_tools/post_results.py index bf3da37d..0c52fc30 100644 --- a/benchmark_reporting_tools/post_results.py +++ b/benchmark_reporting_tools/post_results.py @@ -424,6 +424,8 @@ def build_submission_payload( for query_name in query_names: times = raw_times[query_name] + if times is None: + times = [] is_failed = query_name in failed_queries # Look up validation result for this query (keys are lowercase e.g. 
"q1") From f3012589b042ea9facf36f6319b9d5ec5ac8ccfa Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 9 Apr 2026 20:03:47 +0000 Subject: [PATCH 09/23] Update --- benchmark_reporting_tools/post_results.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/benchmark_reporting_tools/post_results.py b/benchmark_reporting_tools/post_results.py index 0c52fc30..1e2489dc 100644 --- a/benchmark_reporting_tools/post_results.py +++ b/benchmark_reporting_tools/post_results.py @@ -548,6 +548,9 @@ async def upload_log_files( List of asset IDs from the uploaded files """ log_files = sorted(benchmark_dir.glob("*.log")) + metrics_dir = benchmark_dir / "metrics" + if metrics_dir.is_dir(): + log_files.extend(sorted(metrics_dir.glob("*.json"))) if not log_files: return [] @@ -560,10 +563,11 @@ async def _upload_one(log_file: Path) -> int: async with semaphore: print(f" Uploading {log_file.name}...", file=sys.stderr) content = log_file.read_bytes() + media_type = "application/json" if log_file.suffix == ".json" else "text/plain" response = await client.post( "/api/assets/upload/", - files={"file": (log_file.name, content, "text/plain")}, - data={"title": log_file.name, "media_type": "text/plain"}, + files={"file": (log_file.name, content, media_type)}, + data={"title": log_file.name, "media_type": media_type}, ) if response.status_code >= 400: raise RuntimeError(f"Failed to upload {log_file.name}: {response.status_code} {response.text}") @@ -738,6 +742,9 @@ async def process_benchmark_dir( if upload_logs: if dry_run: log_files = sorted(benchmark_dir.glob("*.log")) + metrics_dir = benchmark_dir / "metrics" + if metrics_dir.is_dir(): + log_files.extend(sorted(metrics_dir.glob("*.json"))) print( f" [DRY RUN] Would upload {len(log_files)} log file(s): {[f.name for f in log_files]}", file=sys.stderr ) From 8c8c9204fb4b0b946186a1d99202b8d676619045 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 10 Apr 2026 15:46:45 +0000 Subject: [PATCH 10/23] Refactor --- 
presto/scripts/run_benchmark.sh | 15 +++++++- presto/slurm/presto-nvl72/defaults.env | 15 -------- presto/slurm/presto-nvl72/functions.sh | 23 +++++------ presto/slurm/presto-nvl72/launch-run.sh | 38 +++++++++++++++++-- .../slurm/presto-nvl72/profiler_functions.sh | 13 +++++++ .../presto-nvl72/run-presto-benchmarks.sh | 16 -------- .../performance_benchmarks/common_fixtures.py | 1 + 7 files changed, 74 insertions(+), 47 deletions(-) create mode 100755 presto/slurm/presto-nvl72/profiler_functions.sh diff --git a/presto/scripts/run_benchmark.sh b/presto/scripts/run_benchmark.sh index c0242f5d..d73bb51f 100755 --- a/presto/scripts/run_benchmark.sh +++ b/presto/scripts/run_benchmark.sh @@ -35,6 +35,7 @@ OPTIONS: stored inside a directory under the --output-dir path with a name matching the tag name. Tags must contain only alphanumeric and underscore characters. -p, --profile Enable profiling of benchmark queries. + --profile-script-path Path to a custom profiler functions script. Defaults to ./profiler_functions.sh. --skip-drop-cache Skip dropping system caches before each benchmark query (dropped by default). -m, --metrics Collect detailed metrics from Presto REST API after each query. Metrics are stored in query-specific directories. 
@@ -154,6 +155,15 @@ parse_args() { PROFILE=true shift ;; + --profile-script-path) + if [[ -n $2 ]]; then + PROFILE_SCRIPT_PATH=$2 + shift 2 + else + echo "Error: --profile-script-path requires a value" + exit 1 + fi + ;; --skip-drop-cache) SKIP_DROP_CACHE=true shift @@ -236,7 +246,10 @@ if [[ -n ${TAG} ]]; then fi if [[ "${PROFILE}" == "true" ]]; then - PYTEST_ARGS+=("--profile --profile-script-path $(readlink -f ./profiler_functions.sh)") + if [[ -z "${PROFILE_SCRIPT_PATH}" ]]; then + PROFILE_SCRIPT_PATH="$(readlink -f ./profiler_functions.sh)" + fi + PYTEST_ARGS+=("--profile --profile-script-path ${PROFILE_SCRIPT_PATH}") fi if [[ "${METRICS}" == "true" ]]; then diff --git a/presto/slurm/presto-nvl72/defaults.env b/presto/slurm/presto-nvl72/defaults.env index 970612c5..ff45ba20 100644 --- a/presto/slurm/presto-nvl72/defaults.env +++ b/presto/slurm/presto-nvl72/defaults.env @@ -30,18 +30,3 @@ unset _vt_path # --- SLURM node defaults (cluster-specific) --- : "${DEFAULT_NODELIST:=presto-gb200-gcn-[01-16]}" : "${DEFAULT_SINGLE_NODE:=presto-gb200-gcn-01}" - -# --- I/O settings --- -: "${ENABLE_GDS:=1}" -export ENABLE_GDS - -# --- Query metrics --- -: "${ENABLE_METRICS:=0}" -export ENABLE_METRICS - -# --- Profiling --- -: "${ENABLE_NSYS:=0}" -export ENABLE_NSYS - -: "${NSYS_WORKER_ID:=0}" -export NSYS_WORKER_ID diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index ec405800..474c742b 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -231,19 +231,19 @@ function run_worker { done fi + # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} + # --cpuctxsw=none + # --nvtx-domain-exclude=CCCL local nsys_bin="" local nsys_opts="" - local vt_nsys_report_dir="/var/log/nsys" - if [[ "${ENABLE_NSYS}" == "1" && "${NSYS_WORKER_ID}" == "${worker_id}" ]]; then + if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" - 
nsys_opts="profile \ - -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ - -t cuda,nvtx \ - -f true \ - --sample=none \ - --cpuctxsw=none \ + nsys_opts="launch \ + -t nvtx,cuda,osrt,ucx \ --cuda-memory-usage=true \ - --nvtx-domain-exclude=CCCL" + --cuda-um-cpu-page-faults=true \ + --cuda-um-gpu-page-faults=true \ + --cudabacktrace=true" # nsys_opts="profile \ # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ # -t cuda,ucx,nvtx,osrt \ @@ -280,7 +280,6 @@ ${worker_data}:/var/lib/presto/data,\ ${DATA}:/var/lib/presto/data/hive/data/user_data,\ ${VT_ROOT}/.hive_metastore:/var/lib/presto/data/hive/metastore,\ ${LOGS}:${vt_cufile_log_dir},\ -${LOGS}:${vt_nsys_report_dir},\ /usr/lib/aarch64-linux-gnu/libcuda.so.580.105.08:/usr/local/cuda-13.0/compat/libcuda.so.1,\ /usr/lib/aarch64-linux-gnu/libnvidia-ml.so.580.105.08:/usr/local/lib/libnvidia-ml.so.1\ ${gds_mounts:+,${gds_mounts}} \ @@ -346,6 +345,8 @@ function run_queries { local scale_factor=$2 local metrics_flag="" [[ "${ENABLE_METRICS}" == "1" ]] && metrics_flag="-m" + local profile_flag="" + [[ "${ENABLE_NSYS}" == "1" ]] && profile_flag="-p --profile-script-path $(readlink -f ./profiler_functions.sh)" source "${SCRIPT_DIR}/defaults.env" # We currently skip dropping cache because it requires docker (not available on the cluster). 
@@ -355,7 +356,7 @@ function run_queries { export MINIFORGE_HOME=/workspace/miniforge3; \ export HOME=/workspace; \ cd /workspace/presto/scripts; \ - ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} ${metrics_flag} \ + ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} ${metrics_flag} ${profile_flag} -q 1 \ --hostname ${COORD} --port $PORT -o /workspace/presto/slurm/presto-nvl72/result_dir --skip-drop-cache; \ echo 'Validating query results...'; \ MINIFORGE_HOME=/workspace/miniforge3 /workspace/scripts/run_py_script.sh \ diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 594974ad..80f07bef 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -47,6 +47,10 @@ COORD_IMAGE="presto-coordinator-karth-Mar11" #WORKER_IMAGE="velox-testing-images-presto-471cf1a-velox-1a2f63f-gpu-cuda13.1-20260312-arm64" #COORD_IMAGE="presto-coordinator" OUTPUT_PATH="" +ENABLE_GDS=1 +ENABLE_METRICS=0 +ENABLE_NSYS=0 + while [[ $# -gt 0 ]]; do case "$1" in -n|--nodes) @@ -79,7 +83,7 @@ while [[ $# -gt 0 ]]; do exit 1 fi ;; - -g|--num-gpus-per-node) + -g|--num-gpus-per-node) if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then NUM_GPUS_PER_NODE="$2" shift 2 @@ -89,7 +93,7 @@ while [[ $# -gt 0 ]]; do exit 1 fi ;; - -w|--worker-image) + -w|--worker-image) if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then WORKER_IMAGE="$2" shift 2 @@ -99,7 +103,7 @@ while [[ $# -gt 0 ]]; do exit 1 fi ;; - -c|--coord-image) + -c|--coord-image) if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then COORD_IMAGE="$2" shift 2 @@ -128,6 +132,18 @@ while [[ $# -gt 0 ]]; do exit 1 fi ;; + --disable-gds) + ENABLE_GDS=0 + shift + ;; + -m|--metrics) + ENABLE_METRICS=1 + shift + ;; + -p|--profile) + ENABLE_NSYS=1 + shift + ;; --) shift break @@ -158,8 +174,22 @@ JOB_NAME="presto-tpch-run_n${NODES_COUNT}_sf${SCALE_FACTOR}" # Node 5 has known issues; nodes above 10 are not yet functional. 
NODELIST="${NODELIST:-${DEFAULT_NODELIST}}" GRES_OPT=$([[ "$VARIANT_TYPE" == "gpu" ]] && echo "--gres=gpu:${NUM_GPUS_PER_NODE}" || echo "") + +EXPORT_VARS="ALL" +EXPORT_VARS+=",SCALE_FACTOR=${SCALE_FACTOR}" +EXPORT_VARS+=",NUM_ITERATIONS=${NUM_ITERATIONS}" +EXPORT_VARS+=",SCRIPT_DIR=${SCRIPT_DIR}" +EXPORT_VARS+=",NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE}" +EXPORT_VARS+=",WORKER_IMAGE=${WORKER_IMAGE}" +EXPORT_VARS+=",COORD_IMAGE=${COORD_IMAGE}" +EXPORT_VARS+=",USE_NUMA=${USE_NUMA}" +EXPORT_VARS+=",VARIANT_TYPE=${VARIANT_TYPE}" +EXPORT_VARS+=",ENABLE_GDS=${ENABLE_GDS}" +EXPORT_VARS+=",ENABLE_METRICS=${ENABLE_METRICS}" +EXPORT_VARS+=",ENABLE_NSYS=${ENABLE_NSYS}" + JOB_ID=$(sbatch --job-name="${JOB_NAME}" --nodes="${NODES_COUNT}" --nodelist="${NODELIST}" \ ---export="ALL,SCALE_FACTOR=${SCALE_FACTOR},NUM_ITERATIONS=${NUM_ITERATIONS},SCRIPT_DIR=${SCRIPT_DIR},NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE},WORKER_IMAGE=${WORKER_IMAGE},COORD_IMAGE=${COORD_IMAGE},USE_NUMA=${USE_NUMA},VARIANT_TYPE=${VARIANT_TYPE}" \ +--export="${EXPORT_VARS}" \ --output="${OUT_FMT}" --error="${ERR_FMT}" "${EXTRA_ARGS[@]}" ${GRES_OPT} \ run-presto-benchmarks.slurm | awk '{print $NF}') OUT_FILE="${OUT_FMT//%j/${JOB_ID}}" diff --git a/presto/slurm/presto-nvl72/profiler_functions.sh b/presto/slurm/presto-nvl72/profiler_functions.sh new file mode 100755 index 00000000..65805a82 --- /dev/null +++ b/presto/slurm/presto-nvl72/profiler_functions.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +set -e + +function start_profiler() { + echo "start profiler" +} + +function stop_profiler() { + echo "stop profiler" +} diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index f5b953d1..298276fc 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -38,16 +38,10 @@ wait_until_coordinator_is_running echo "Starting ${NUM_WORKERS} Presto workers across ${NUM_NODES} nodes..." worker_id=0 -nsys_worker_pid="" for node in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do for gpu_id in $(seq 0 $((NUM_GPUS_PER_NODE - 1))); do echo " Starting worker ${worker_id} on node ${node} GPU ${gpu_id}" run_worker "${gpu_id}" "$WORKER_IMAGE" "${node}" "$worker_id" - - if [[ "${ENABLE_NSYS}" == "1" && "${NSYS_WORKER_ID}" == "${worker_id}" ]]; then - nsys_worker_pid=$! - fi - worker_id=$((worker_id + 1)) done done @@ -78,16 +72,6 @@ cp -r ${LOGS}/cli.log ${SCRIPT_DIR}/result_dir/summary.txt echo "Collecting configs and logs into result directory..." collect_results -if [[ -n "${nsys_worker_pid}" ]]; then - echo "Sending SIGINT to nsys (worker srun PID ${nsys_worker_pid})..." - # Send the interrupt signal to the nsys process - # If the process has already terminated, `kill` will have an error, hence `|| true` - kill -INT "${nsys_worker_pid}" 2>/dev/null || true - echo "Waiting for nsys to finalize report..." - # Wait for the nsys process to finalize the report and store to disk - wait "${nsys_worker_pid}" 2>/dev/null || true -fi - echo "========================================" echo "Benchmark complete!" 
echo "Results saved to: ${SCRIPT_DIR}/results_dir" diff --git a/presto/testing/performance_benchmarks/common_fixtures.py b/presto/testing/performance_benchmarks/common_fixtures.py index 8ea3b6db..27c79f36 100644 --- a/presto/testing/performance_benchmarks/common_fixtures.py +++ b/presto/testing/performance_benchmarks/common_fixtures.py @@ -74,6 +74,7 @@ def benchmark_query_function(query_id): if profile: # Base path without .nsys-rep extension: {dir}/{query_id} profile_output_file_path = f"{profile_output_dir_path.absolute()}/{query_id}" + print(f">>> profile_script_path: {profile_script_path}, profile_output_file_path: {profile_output_file_path}") start_profiler(profile_script_path, profile_output_file_path) result = [] for iteration_num in range(iterations): From 04e76900bb0f1c3837bb86ca58dfd6fbf6ee7703 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 10 Apr 2026 19:13:50 +0000 Subject: [PATCH 11/23] Update --- presto/slurm/presto-nvl72/functions.sh | 45 ++++++++++--------- presto/slurm/presto-nvl72/launch-run.sh | 8 ++++ .../slurm/presto-nvl72/profiler_functions.sh | 13 +++++- presto/slurm/presto-nvl72/run_interactive.sh | 4 +- .../performance_benchmarks/common_fixtures.py | 1 - 5 files changed, 44 insertions(+), 27 deletions(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 474c742b..9a62b54e 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -231,26 +231,26 @@ function run_worker { done fi + local nsys_args="" + [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]] && nsys_args="${NSYS_BIN} ${NSYS_OPTS}" # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} # --cpuctxsw=none # --nvtx-domain-exclude=CCCL - local nsys_bin="" - local nsys_opts="" - if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then - nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" - nsys_opts="launch \ - -t nvtx,cuda,osrt,ucx \ - --cuda-memory-usage=true \ - 
--cuda-um-cpu-page-faults=true \ - --cuda-um-gpu-page-faults=true \ - --cudabacktrace=true" - # nsys_opts="profile \ - # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ - # -t cuda,ucx,nvtx,osrt \ - # -f true \ - # --cuda-memory-usage=true \ - # --nvtx-domain-exclude=CCCL" - fi + # if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then + # nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" + # nsys_opts="launch \ + # -t nvtx,cuda,osrt,ucx \ + # --cuda-memory-usage=true \ + # --cuda-um-cpu-page-faults=true \ + # --cuda-um-gpu-page-faults=true \ + # --cudabacktrace=true" + # nsys_opts="profile \ + # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ + # -t cuda,ucx,nvtx,osrt \ + # -f true \ + # --cuda-memory-usage=true \ + # --nvtx-domain-exclude=CCCL" + # fi # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel # capabilities are already set up for the job cgroup. Do NOT use --gres=gpu:1 @@ -297,14 +297,15 @@ echo \"Worker ${worker_id}: ENABLE_NSYS=\${ENABLE_NSYS:-unset}\" echo \"Worker ${worker_id}: KVIKIO_COMPAT_MODE=\${KVIKIO_COMPAT_MODE:-unset}\" echo \"Worker ${worker_id}: CUFILE_LOGFILE_PATH=\${CUFILE_LOGFILE_PATH:-unset}\" -if [[ -n '${nsys_bin}' ]]; then - echo \"Worker ${worker_id}: Nsight System program at ${nsys_bin}\" +if [[ -n '${nsys_args}' ]]; then + echo \"Worker ${worker_id}: Nsight System program at ${NSYS_BIN}\" + ls ${NSYS_BIN} fi if [[ '${USE_NUMA}' == '1' ]]; then - numactl --cpubind=${numa_node} --membind=${numa_node} ${nsys_bin} ${nsys_opts} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc + numactl --cpubind=${numa_node} --membind=${numa_node} ${nsys_args} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc else - ${nsys_bin} ${nsys_opts} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc + ${nsys_args} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc fi" > ${LOGS}/worker_${worker_id}.log 2>&1 & } @@ -346,7 +347,7 @@ function run_queries { local metrics_flag="" [[ 
"${ENABLE_METRICS}" == "1" ]] && metrics_flag="-m" local profile_flag="" - [[ "${ENABLE_NSYS}" == "1" ]] && profile_flag="-p --profile-script-path $(readlink -f ./profiler_functions.sh)" + [[ "${ENABLE_NSYS}" == "1" ]] && profile_flag="-p --profile-script-path /workspace/presto/slurm/presto-nvl72/profiler_functions.sh" source "${SCRIPT_DIR}/defaults.env" # We currently skip dropping cache because it requires docker (not available on the cluster). diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 80f07bef..54f5c158 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -175,6 +175,14 @@ JOB_NAME="presto-tpch-run_n${NODES_COUNT}_sf${SCALE_FACTOR}" NODELIST="${NODELIST:-${DEFAULT_NODELIST}}" GRES_OPT=$([[ "$VARIANT_TYPE" == "gpu" ]] && echo "--gres=gpu:${NUM_GPUS_PER_NODE}" || echo "") +export NSYS_BIN="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" +export NSYS_OPTS="launch \ +-t nvtx,cuda,osrt,ucx \ +--cuda-memory-usage=true \ +--cuda-um-cpu-page-faults=true \ +--cuda-um-gpu-page-faults=true \ +--cudabacktrace=true" + EXPORT_VARS="ALL" EXPORT_VARS+=",SCALE_FACTOR=${SCALE_FACTOR}" EXPORT_VARS+=",NUM_ITERATIONS=${NUM_ITERATIONS}" diff --git a/presto/slurm/presto-nvl72/profiler_functions.sh b/presto/slurm/presto-nvl72/profiler_functions.sh index 65805a82..83491e9a 100755 --- a/presto/slurm/presto-nvl72/profiler_functions.sh +++ b/presto/slurm/presto-nvl72/profiler_functions.sh @@ -5,9 +5,18 @@ set -e function start_profiler() { - echo "start profiler" + local -r profile_output_file_path=$1 + ${NSYS_BIN} start --gpu-metrics-devices=all -o ${profile_output_file_path}.nsys-rep } function stop_profiler() { - echo "stop profiler" + local -r profile_output_file_path=$1.nsys-rep +# local -r container_file_path="/presto_profiles/$(basename $profile_output_file_path)" + ${NSYS_BIN} stop +# chown -R $(id -u):$(id -g) /presto_profiles + +# local container_id +# 
container_id=$(get_worker_container_id) +# docker cp ${container_id}:${container_file_path} $profile_output_file_path +# $docker_exec_command rm ${container_file_path} } diff --git a/presto/slurm/presto-nvl72/run_interactive.sh b/presto/slurm/presto-nvl72/run_interactive.sh index 70301ed0..fdf9445c 100755 --- a/presto/slurm/presto-nvl72/run_interactive.sh +++ b/presto/slurm/presto-nvl72/run_interactive.sh @@ -5,7 +5,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/defaults.env" -: "${IMAGE:=${IMAGE_DIR}/presto-native-worker-gpu.sqsh}" +: "${IMAGE:=${IMAGE_DIR}/presto-native-worker-gpu-karth-Mar11-with-nsys.sqsh}" : "${NODELIST:=${DEFAULT_SINGLE_NODE}}" : "${GRES:=gpu:4}" : "${TIME_LIMIT:=01:00:00}" @@ -17,7 +17,7 @@ srun --nodes=1 \ --exclusive \ --time="${TIME_LIMIT}" \ --container-image="${IMAGE}" \ - --container-mounts="${HOME}:${HOME},/scratch:/scratch" \ + --container-mounts="/scratch:/scratch" \ --container-remap-root \ --container-writable \ --pty bash diff --git a/presto/testing/performance_benchmarks/common_fixtures.py b/presto/testing/performance_benchmarks/common_fixtures.py index 27c79f36..8ea3b6db 100644 --- a/presto/testing/performance_benchmarks/common_fixtures.py +++ b/presto/testing/performance_benchmarks/common_fixtures.py @@ -74,7 +74,6 @@ def benchmark_query_function(query_id): if profile: # Base path without .nsys-rep extension: {dir}/{query_id} profile_output_file_path = f"{profile_output_dir_path.absolute()}/{query_id}" - print(f">>> profile_script_path: {profile_script_path}, profile_output_file_path: {profile_output_file_path}") start_profiler(profile_script_path, profile_output_file_path) result = [] for iteration_num in range(iterations): From 450c6e609c0540197d4eda77cf1c04ed0f0e7713 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 10 Apr 2026 21:36:10 +0000 Subject: [PATCH 12/23] Revert changes to nsys --- presto/scripts/run_benchmark.sh | 15 +------ presto/slurm/presto-nvl72/functions.sh | 44 
+++++++++---------- presto/slurm/presto-nvl72/launch-run.sh | 11 +---- .../slurm/presto-nvl72/profiler_functions.sh | 22 ---------- .../presto-nvl72/run-presto-benchmarks.sh | 17 +++++++ 5 files changed, 40 insertions(+), 69 deletions(-) delete mode 100755 presto/slurm/presto-nvl72/profiler_functions.sh diff --git a/presto/scripts/run_benchmark.sh b/presto/scripts/run_benchmark.sh index d73bb51f..c0242f5d 100755 --- a/presto/scripts/run_benchmark.sh +++ b/presto/scripts/run_benchmark.sh @@ -35,7 +35,6 @@ OPTIONS: stored inside a directory under the --output-dir path with a name matching the tag name. Tags must contain only alphanumeric and underscore characters. -p, --profile Enable profiling of benchmark queries. - --profile-script-path Path to a custom profiler functions script. Defaults to ./profiler_functions.sh. --skip-drop-cache Skip dropping system caches before each benchmark query (dropped by default). -m, --metrics Collect detailed metrics from Presto REST API after each query. Metrics are stored in query-specific directories. 
@@ -155,15 +154,6 @@ parse_args() { PROFILE=true shift ;; - --profile-script-path) - if [[ -n $2 ]]; then - PROFILE_SCRIPT_PATH=$2 - shift 2 - else - echo "Error: --profile-script-path requires a value" - exit 1 - fi - ;; --skip-drop-cache) SKIP_DROP_CACHE=true shift @@ -246,10 +236,7 @@ if [[ -n ${TAG} ]]; then fi if [[ "${PROFILE}" == "true" ]]; then - if [[ -z "${PROFILE_SCRIPT_PATH}" ]]; then - PROFILE_SCRIPT_PATH="$(readlink -f ./profiler_functions.sh)" - fi - PYTEST_ARGS+=("--profile --profile-script-path ${PROFILE_SCRIPT_PATH}") + PYTEST_ARGS+=("--profile --profile-script-path $(readlink -f ./profiler_functions.sh)") fi if [[ "${METRICS}" == "true" ]]; then diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 9a62b54e..6488e0f4 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -231,26 +231,24 @@ function run_worker { done fi + local nsys_bin="" + local nsys_opts="" local nsys_args="" - [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]] && nsys_args="${NSYS_BIN} ${NSYS_OPTS}" - # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} - # --cpuctxsw=none - # --nvtx-domain-exclude=CCCL - # if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then - # nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" - # nsys_opts="launch \ - # -t nvtx,cuda,osrt,ucx \ - # --cuda-memory-usage=true \ - # --cuda-um-cpu-page-faults=true \ - # --cuda-um-gpu-page-faults=true \ - # --cudabacktrace=true" - # nsys_opts="profile \ - # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ - # -t cuda,ucx,nvtx,osrt \ - # -f true \ - # --cuda-memory-usage=true \ - # --nvtx-domain-exclude=CCCL" - # fi + local vt_nsys_report_dir="/var/log/nsys" + if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then + nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" + nsys_opts="profile \ + -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ + -t nvtx,cuda,osrt,ucx \ + -f true \ + 
--sample=none \ + --cpuctxsw=none \ + --cuda-memory-usage=true \ + --cuda-um-cpu-page-faults=true \ + --cuda-um-gpu-page-faults=true \ + --nvtx-domain-exclude=CCCL" + nsys_args="${nsys_bin} ${nsys_opts}" + fi # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel # capabilities are already set up for the job cgroup. Do NOT use --gres=gpu:1 @@ -280,6 +278,7 @@ ${worker_data}:/var/lib/presto/data,\ ${DATA}:/var/lib/presto/data/hive/data/user_data,\ ${VT_ROOT}/.hive_metastore:/var/lib/presto/data/hive/metastore,\ ${LOGS}:${vt_cufile_log_dir},\ +${LOGS}:${vt_nsys_report_dir},\ /usr/lib/aarch64-linux-gnu/libcuda.so.580.105.08:/usr/local/cuda-13.0/compat/libcuda.so.1,\ /usr/lib/aarch64-linux-gnu/libnvidia-ml.so.580.105.08:/usr/local/lib/libnvidia-ml.so.1\ ${gds_mounts:+,${gds_mounts}} \ @@ -298,8 +297,7 @@ echo \"Worker ${worker_id}: KVIKIO_COMPAT_MODE=\${KVIKIO_COMPAT_MODE:-unset}\" echo \"Worker ${worker_id}: CUFILE_LOGFILE_PATH=\${CUFILE_LOGFILE_PATH:-unset}\" if [[ -n '${nsys_args}' ]]; then - echo \"Worker ${worker_id}: Nsight System program at ${NSYS_BIN}\" - ls ${NSYS_BIN} + echo \"Worker ${worker_id}: Nsight System program at ${nsys_bin}\" fi if [[ '${USE_NUMA}' == '1' ]]; then @@ -346,8 +344,6 @@ function run_queries { local scale_factor=$2 local metrics_flag="" [[ "${ENABLE_METRICS}" == "1" ]] && metrics_flag="-m" - local profile_flag="" - [[ "${ENABLE_NSYS}" == "1" ]] && profile_flag="-p --profile-script-path /workspace/presto/slurm/presto-nvl72/profiler_functions.sh" source "${SCRIPT_DIR}/defaults.env" # We currently skip dropping cache because it requires docker (not available on the cluster). 
@@ -357,7 +353,7 @@ function run_queries { export MINIFORGE_HOME=/workspace/miniforge3; \ export HOME=/workspace; \ cd /workspace/presto/scripts; \ - ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} ${metrics_flag} ${profile_flag} -q 1 \ + ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} ${metrics_flag} -q 1 \ --hostname ${COORD} --port $PORT -o /workspace/presto/slurm/presto-nvl72/result_dir --skip-drop-cache; \ echo 'Validating query results...'; \ MINIFORGE_HOME=/workspace/miniforge3 /workspace/scripts/run_py_script.sh \ diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 54f5c158..1e74fbd6 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -41,7 +41,8 @@ USE_NUMA="1" VARIANT_TYPE="gpu" # WORKER_IMAGE="presto-native-worker-gpu" WORKER_IMAGE="presto-native-worker-gpu-karth-Mar11-with-nsys" -COORD_IMAGE="presto-coordinator-karth-Mar11" +# COORD_IMAGE="presto-coordinator-karth-Mar11" +COORD_IMAGE="presto-coordinator-karth-Mar11-with-nsys" #COORD_IMAGE="presto-coordinator-ibm-03-11" #WORKER_IMAGE="presto-native-worker-gpu-ibm-03-11" #WORKER_IMAGE="velox-testing-images-presto-471cf1a-velox-1a2f63f-gpu-cuda13.1-20260312-arm64" @@ -175,14 +176,6 @@ JOB_NAME="presto-tpch-run_n${NODES_COUNT}_sf${SCALE_FACTOR}" NODELIST="${NODELIST:-${DEFAULT_NODELIST}}" GRES_OPT=$([[ "$VARIANT_TYPE" == "gpu" ]] && echo "--gres=gpu:${NUM_GPUS_PER_NODE}" || echo "") -export NSYS_BIN="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" -export NSYS_OPTS="launch \ --t nvtx,cuda,osrt,ucx \ ---cuda-memory-usage=true \ ---cuda-um-cpu-page-faults=true \ ---cuda-um-gpu-page-faults=true \ ---cudabacktrace=true" - EXPORT_VARS="ALL" EXPORT_VARS+=",SCALE_FACTOR=${SCALE_FACTOR}" EXPORT_VARS+=",NUM_ITERATIONS=${NUM_ITERATIONS}" diff --git a/presto/slurm/presto-nvl72/profiler_functions.sh b/presto/slurm/presto-nvl72/profiler_functions.sh deleted file mode 
100755 index 83491e9a..00000000 --- a/presto/slurm/presto-nvl72/profiler_functions.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -set -e - -function start_profiler() { - local -r profile_output_file_path=$1 - ${NSYS_BIN} start --gpu-metrics-devices=all -o ${profile_output_file_path}.nsys-rep -} - -function stop_profiler() { - local -r profile_output_file_path=$1.nsys-rep -# local -r container_file_path="/presto_profiles/$(basename $profile_output_file_path)" - ${NSYS_BIN} stop -# chown -R $(id -u):$(id -g) /presto_profiles - -# local container_id -# container_id=$(get_worker_container_id) -# docker cp ${container_id}:${container_file_path} $profile_output_file_path -# $docker_exec_command rm ${container_file_path} -} diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index 298276fc..6ec84848 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -38,10 +38,17 @@ wait_until_coordinator_is_running echo "Starting ${NUM_WORKERS} Presto workers across ${NUM_NODES} nodes..." worker_id=0 +nsys_worker_pid="" for node in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do for gpu_id in $(seq 0 $((NUM_GPUS_PER_NODE - 1))); do echo " Starting worker ${worker_id} on node ${node} GPU ${gpu_id}" run_worker "${gpu_id}" "$WORKER_IMAGE" "${node}" "$worker_id" + + if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then + nsys_worker_pid=$! + echo "profiled worker PID ${nsys_worker_pid}" + fi + worker_id=$((worker_id + 1)) done done @@ -72,6 +79,16 @@ cp -r ${LOGS}/cli.log ${SCRIPT_DIR}/result_dir/summary.txt echo "Collecting configs and logs into result directory..." collect_results +if [[ -n "${nsys_worker_pid}" ]]; then + echo "Sending SIGINT to profiled worker PID ${nsys_worker_pid}..." 
+ # Send the interrupt signal to the nsys process
+ # If the process has already terminated, `kill` will have an error, hence `|| true`
+ kill -TERM "${nsys_worker_pid}" 2>/dev/null || true
+ echo "Waiting for nsys to finalize report..."
+ # Wait for the nsys process to finalize the report and store to disk
+ wait "${nsys_worker_pid}" 2>/dev/null || true
+fi
+
 echo "========================================"
 echo "Benchmark complete!"
 echo "Results saved to: ${SCRIPT_DIR}/results_dir"

From b9201135a4ebb1b9292488ea947cea000b49b5dd Mon Sep 17 00:00:00 2001
From: Tianyu Liu
Date: Fri, 10 Apr 2026 21:37:51 +0000
Subject: [PATCH 13/23] Revert changes to run interactive

---
 presto/slurm/presto-nvl72/run_interactive.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/presto/slurm/presto-nvl72/run_interactive.sh b/presto/slurm/presto-nvl72/run_interactive.sh
index fdf9445c..70301ed0 100755
--- a/presto/slurm/presto-nvl72/run_interactive.sh
+++ b/presto/slurm/presto-nvl72/run_interactive.sh
@@ -5,7 +5,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 source "${SCRIPT_DIR}/defaults.env"

-: "${IMAGE:=${IMAGE_DIR}/presto-native-worker-gpu-karth-Mar11-with-nsys.sqsh}"
+: "${IMAGE:=${IMAGE_DIR}/presto-native-worker-gpu.sqsh}"
 : "${NODELIST:=${DEFAULT_SINGLE_NODE}}"
 : "${GRES:=gpu:4}"
 : "${TIME_LIMIT:=01:00:00}"
@@ -17,7 +17,7 @@ srun --nodes=1 \
     --exclusive \
     --time="${TIME_LIMIT}" \
     --container-image="${IMAGE}" \
-    --container-mounts="/scratch:/scratch" \
+    --container-mounts="${HOME}:${HOME},/scratch:/scratch" \
     --container-remap-root \
     --container-writable \
     --pty bash

From a04a8670e6e3218b6035a3c0c291fcf2fcae04d7 Mon Sep 17 00:00:00 2001
From: Tianyu Liu
Date: Sat, 11 Apr 2026 04:08:37 +0000
Subject: [PATCH 14/23] Update

---
 presto/slurm/presto-nvl72/functions.sh        | 53 +++++++++++++------
 .../presto-nvl72/run-presto-benchmarks.sh     | 37 +++++++------
 2 files changed, 57 insertions(+), 33 deletions(-)

diff --git 
a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 6488e0f4..542ea034 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -231,23 +231,21 @@ function run_worker { done fi + # --nvtx-domain-exclude=CCCL + # --cpuctxsw=none + # --sample=none local nsys_bin="" - local nsys_opts="" - local nsys_args="" + local nsys_launch_opts="" + local nsys_start_opts="" local vt_nsys_report_dir="/var/log/nsys" if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" - nsys_opts="profile \ - -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ - -t nvtx,cuda,osrt,ucx \ - -f true \ - --sample=none \ - --cpuctxsw=none \ + nsys_launch_opts="-t nvtx,cuda,osrt,ucx \ --cuda-memory-usage=true \ --cuda-um-cpu-page-faults=true \ - --cuda-um-gpu-page-faults=true \ - --nvtx-domain-exclude=CCCL" - nsys_args="${nsys_bin} ${nsys_opts}" + --cuda-um-gpu-page-faults=true" + nsys_start_opts="-o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ + -f true" fi # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel @@ -296,15 +294,36 @@ echo \"Worker ${worker_id}: ENABLE_NSYS=\${ENABLE_NSYS:-unset}\" echo \"Worker ${worker_id}: KVIKIO_COMPAT_MODE=\${KVIKIO_COMPAT_MODE:-unset}\" echo \"Worker ${worker_id}: CUFILE_LOGFILE_PATH=\${CUFILE_LOGFILE_PATH:-unset}\" -if [[ -n '${nsys_args}' ]]; then +if [[ -n '${nsys_bin}' ]]; then + ( + echo \"Worker ${worker_id}: nsys subshell started, waiting for start token\" + while [[ ! -f ${vt_nsys_report_dir}/.nsys_start_token ]]; do + read -t 2 -r _ <<< '' || true + done + echo \"Worker ${worker_id}: start token found, running nsys start\" + ${nsys_bin} start ${nsys_start_opts} + echo \"Worker ${worker_id}: nsys start exit code: \$?\" + while [[ ! 
-f ${vt_nsys_report_dir}/.nsys_stop_token ]]; do + read -t 2 -r _ <<< '' || true + done + echo \"Worker ${worker_id}: stop token found, running nsys stop\" + ${nsys_bin} stop + echo \"Worker ${worker_id}: nsys stop exit code: \$?\" + ) & + echo \"Worker ${worker_id}: Nsight System program at ${nsys_bin}\" + echo \"Worker ${worker_id}: running nsys launch\" + ${nsys_bin} launch ${nsys_launch_opts} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc + echo \"Worker ${worker_id}: nsys launch exited with code: \$?\" +else + if [[ '${USE_NUMA}' == '1' ]]; then + numactl --cpubind=${numa_node} --membind=${numa_node} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc + else + /usr/bin/presto_server --etc-dir=/opt/presto-server/etc + fi fi -if [[ '${USE_NUMA}' == '1' ]]; then - numactl --cpubind=${numa_node} --membind=${numa_node} ${nsys_args} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc -else - ${nsys_args} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc -fi" > ${LOGS}/worker_${worker_id}.log 2>&1 & +" > ${LOGS}/worker_${worker_id}.log 2>&1 & } function copy_hive_metastore { diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index 6ec84848..5253ffdc 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -38,17 +38,10 @@ wait_until_coordinator_is_running echo "Starting ${NUM_WORKERS} Presto workers across ${NUM_NODES} nodes..." worker_id=0 -nsys_worker_pid="" for node in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do for gpu_id in $(seq 0 $((NUM_GPUS_PER_NODE - 1))); do echo " Starting worker ${worker_id} on node ${node} GPU ${gpu_id}" run_worker "${gpu_id}" "$WORKER_IMAGE" "${node}" "$worker_id" - - if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then - nsys_worker_pid=$! 
- echo "profiled worker PID ${nsys_worker_pid}" - fi - worker_id=$((worker_id + 1)) done done @@ -67,7 +60,10 @@ wait_for_workers_to_register $NUM_WORKERS # Run Queries # ============================================================================== echo "Running TPC-H queries (${NUM_ITERATIONS} iterations, scale factor ${SCALE_FACTOR})..." + +touch "${LOGS}/.nsys_start_token" run_queries ${NUM_ITERATIONS} ${SCALE_FACTOR} +touch "${LOGS}/.nsys_stop_token" # ============================================================================== # Process Results @@ -79,15 +75,24 @@ cp -r ${LOGS}/cli.log ${SCRIPT_DIR}/result_dir/summary.txt echo "Collecting configs and logs into result directory..." collect_results -if [[ -n "${nsys_worker_pid}" ]]; then - echo "Sending SIGINT to profiled worker PID ${nsys_worker_pid}..." - # Send the interrupt signal to the nsys process - # If the process has already terminated, `kill` will have an error, hence `|| true` - kill -TERM "${nsys_worker_pid}" 2>/dev/null || true - echo "Waiting for nsys to finalize report..." - # Wait for the nsys process to finalize the report and store to disk - wait "${nsys_worker_pid}" 2>/dev/null || true -fi +# rm "${LOGS}/.nsys_start_token" "${LOGS}/.nsys_stop_token" +echo "Waiting for nsys report generation..." +prev_size=0 +stable_count=0 +for i in {1..120}; do + cur_size=$(stat -c%s "${LOGS}/nsys_worker_0.nsys-rep" 2>/dev/null || echo 0) + if (( cur_size > 0 && cur_size == prev_size )); then + stable_count=$((stable_count + 1)) + if (( stable_count >= 3 )); then + echo "nsys report complete: ${cur_size} bytes" + break + fi + else + stable_count=0 + fi + prev_size=$cur_size + sleep 5 +done echo "========================================" echo "Benchmark complete!" From 2b87c9465a96e00eb5f01b67b95288f40a07e756 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Sun, 12 Apr 2026 19:18:56 +0000 Subject: [PATCH 15/23] Add initial profile. 
Add query selection --- presto/scripts/run_benchmark.sh | 16 +++++- presto/slurm/presto-nvl72/functions.sh | 46 ++++++++++------- presto/slurm/presto-nvl72/launch-run.sh | 15 +++++- .../slurm/presto-nvl72/profiler_functions.sh | 17 +++++++ .../presto-nvl72/run-presto-benchmarks.sh | 50 +++++++++++-------- 5 files changed, 104 insertions(+), 40 deletions(-) create mode 100755 presto/slurm/presto-nvl72/profiler_functions.sh diff --git a/presto/scripts/run_benchmark.sh b/presto/scripts/run_benchmark.sh index c0242f5d..bf54979a 100755 --- a/presto/scripts/run_benchmark.sh +++ b/presto/scripts/run_benchmark.sh @@ -35,6 +35,7 @@ OPTIONS: stored inside a directory under the --output-dir path with a name matching the tag name. Tags must contain only alphanumeric and underscore characters. -p, --profile Enable profiling of benchmark queries. + --profile-script-path Path to a custom profiler functions script. Defaults to ./profiler_functions.sh. --skip-drop-cache Skip dropping system caches before each benchmark query (dropped by default). -m, --metrics Collect detailed metrics from Presto REST API after each query. Metrics are stored in query-specific directories. 
@@ -154,6 +155,15 @@ parse_args() { PROFILE=true shift ;; + --profile-script-path) + if [[ -n $2 ]]; then + PROFILE_SCRIPT_PATH=$2 + shift 2 + else + echo "Error: --profile-script-path requires a value" + exit 1 + fi + ;; --skip-drop-cache) SKIP_DROP_CACHE=true shift @@ -236,7 +246,11 @@ if [[ -n ${TAG} ]]; then fi if [[ "${PROFILE}" == "true" ]]; then - PYTEST_ARGS+=("--profile --profile-script-path $(readlink -f ./profiler_functions.sh)") + if [[ -z "${PROFILE_SCRIPT_PATH}" ]]; then + PROFILE_SCRIPT_PATH="$(readlink -f ./profiler_functions.sh)" + fi + PYTEST_ARGS+=("--profile --profile-script-path ${PROFILE_SCRIPT_PATH}") + fi if [[ "${METRICS}" == "true" ]]; then diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 542ea034..ffcbc40d 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -236,7 +236,6 @@ function run_worker { # --sample=none local nsys_bin="" local nsys_launch_opts="" - local nsys_start_opts="" local vt_nsys_report_dir="/var/log/nsys" if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" @@ -244,8 +243,6 @@ function run_worker { --cuda-memory-usage=true \ --cuda-um-cpu-page-faults=true \ --cuda-um-gpu-page-faults=true" - nsys_start_opts="-o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ - -f true" fi # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel @@ -296,19 +293,30 @@ echo \"Worker ${worker_id}: CUFILE_LOGFILE_PATH=\${CUFILE_LOGFILE_PATH:-unset}\" if [[ -n '${nsys_bin}' ]]; then ( - echo \"Worker ${worker_id}: nsys subshell started, waiting for start token\" - while [[ ! 
-f ${vt_nsys_report_dir}/.nsys_start_token ]]; do - read -t 2 -r _ <<< '' || true + echo \"Worker ${worker_id}: nsys subshell started\" + while true; do + # Wait for any start token + start_token='' + while [[ -z \"\${start_token}\" ]]; do + for f in ${vt_nsys_report_dir}/.nsys_start_token_Q*; do + [[ -f \"\$f\" ]] && start_token=\"\$f\" && break + done + [[ -z \"\${start_token}\" ]] && { read -t 2 -r _ <<< '' || true; } + done + query_id=\${start_token##*_token_} + echo \"Worker ${worker_id}: start token found for \${query_id}, running nsys start\" + rm \"\${start_token}\" + ${nsys_bin} start -o ${vt_nsys_report_dir}/nsys_worker_${worker_id}_\${query_id} -f true + echo \"Worker ${worker_id}: nsys start exit code: \$?\" + + # Wait for corresponding stop token + while [[ ! -f ${vt_nsys_report_dir}/.nsys_stop_token_\${query_id} ]]; do + read -t 2 -r _ <<< '' || true + done + echo \"Worker ${worker_id}: stop token found for \${query_id}, running nsys stop\" + rm ${vt_nsys_report_dir}/.nsys_stop_token_\${query_id} + ${nsys_bin} stop; echo \"Worker ${worker_id}: nsys stop exit code: \$?\" done - echo \"Worker ${worker_id}: start token found, running nsys start\" - ${nsys_bin} start ${nsys_start_opts} - echo \"Worker ${worker_id}: nsys start exit code: \$?\" - while [[ ! 
-f ${vt_nsys_report_dir}/.nsys_stop_token ]]; do - read -t 2 -r _ <<< '' || true - done - echo \"Worker ${worker_id}: stop token found, running nsys stop\" - ${nsys_bin} stop - echo \"Worker ${worker_id}: nsys stop exit code: \$?\" ) & echo \"Worker ${worker_id}: Nsight System program at ${nsys_bin}\" @@ -361,8 +369,10 @@ function run_queries { [ $# -ne 2 ] && echo_error "$0 expected two arguments for '' and ''" local num_iterations=$1 local scale_factor=$2 - local metrics_flag="" - [[ "${ENABLE_METRICS}" == "1" ]] && metrics_flag="-m" + local extra_args=() + [[ "${ENABLE_METRICS}" == "1" ]] && extra_args+=("-m") + [[ "${ENABLE_NSYS}" == "1" ]] && extra_args+=("-p" "--profile-script-path" "/workspace/presto/slurm/presto-nvl72/profiler_functions.sh") + [[ -n "${QUERIES:-}" ]] && extra_args+=("-q" "${QUERIES}") source "${SCRIPT_DIR}/defaults.env" # We currently skip dropping cache because it requires docker (not available on the cluster). @@ -372,7 +382,7 @@ function run_queries { export MINIFORGE_HOME=/workspace/miniforge3; \ export HOME=/workspace; \ cd /workspace/presto/scripts; \ - ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} ${metrics_flag} -q 1 \ + ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} ${extra_args[*]} \ --hostname ${COORD} --port $PORT -o /workspace/presto/slurm/presto-nvl72/result_dir --skip-drop-cache; \ echo 'Validating query results...'; \ MINIFORGE_HOME=/workspace/miniforge3 /workspace/scripts/run_py_script.sh \ diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 1e74fbd6..49fffc63 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -51,6 +51,7 @@ OUTPUT_PATH="" ENABLE_GDS=1 ENABLE_METRICS=0 ENABLE_NSYS=0 +QUERIES="" while [[ $# -gt 0 ]]; do case "$1" in @@ -145,7 +146,16 @@ while [[ $# -gt 0 ]]; do ENABLE_NSYS=1 shift ;; - --) + -q|--queries) + if [[ -n $2 ]]; then + QUERIES=$2 + shift 2 + else 
+ echo "Error: --queries requires a value" + exit 1 + fi + ;; + --) shift break ;; @@ -188,6 +198,9 @@ EXPORT_VARS+=",VARIANT_TYPE=${VARIANT_TYPE}" EXPORT_VARS+=",ENABLE_GDS=${ENABLE_GDS}" EXPORT_VARS+=",ENABLE_METRICS=${ENABLE_METRICS}" EXPORT_VARS+=",ENABLE_NSYS=${ENABLE_NSYS}" +if [[ -n "${QUERIES}" ]]; then + EXPORT_VARS+=",QUERIES='${QUERIES}'" +fi JOB_ID=$(sbatch --job-name="${JOB_NAME}" --nodes="${NODES_COUNT}" --nodelist="${NODELIST}" \ --export="${EXPORT_VARS}" \ diff --git a/presto/slurm/presto-nvl72/profiler_functions.sh b/presto/slurm/presto-nvl72/profiler_functions.sh new file mode 100755 index 00000000..0f3eeffd --- /dev/null +++ b/presto/slurm/presto-nvl72/profiler_functions.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -e + +function start_profiler() { + local -r profile_output_file_path=$1 + local -r query_id=$(basename ${profile_output_file_path}) + touch "/workspace/presto/slurm/presto-nvl72/logs/.nsys_start_token_${query_id}" +} + +function stop_profiler() { + local -r profile_output_file_path=$1 + local -r query_id=$(basename ${profile_output_file_path}) + touch "/workspace/presto/slurm/presto-nvl72/logs/.nsys_stop_token_${query_id}" +} diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index 5253ffdc..a254922d 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -60,10 +60,7 @@ wait_for_workers_to_register $NUM_WORKERS # Run Queries # ============================================================================== echo "Running TPC-H queries (${NUM_ITERATIONS} iterations, scale factor ${SCALE_FACTOR})..." 
- -touch "${LOGS}/.nsys_start_token" run_queries ${NUM_ITERATIONS} ${SCALE_FACTOR} -touch "${LOGS}/.nsys_stop_token" # ============================================================================== # Process Results @@ -75,24 +72,37 @@ cp -r ${LOGS}/cli.log ${SCRIPT_DIR}/result_dir/summary.txt echo "Collecting configs and logs into result directory..." collect_results -# rm "${LOGS}/.nsys_start_token" "${LOGS}/.nsys_stop_token" -echo "Waiting for nsys report generation..." -prev_size=0 -stable_count=0 -for i in {1..120}; do - cur_size=$(stat -c%s "${LOGS}/nsys_worker_0.nsys-rep" 2>/dev/null || echo 0) - if (( cur_size > 0 && cur_size == prev_size )); then - stable_count=$((stable_count + 1)) - if (( stable_count >= 3 )); then - echo "nsys report complete: ${cur_size} bytes" - break +echo "--> QUERIES: ${QUERIES:-UNDEFINED}" + +if [[ "${ENABLE_NSYS}" == "1" ]]; then + echo "Waiting for nsys report generation..." + stable_count=0 + declare -A prev_sizes + for i in {1..120}; do + all_stable=true + found_any=false + for f in "${LOGS}"/nsys_worker_*.nsys-rep; do + [[ -f "$f" ]] || continue + found_any=true + cur_size=$(stat -c%s "$f" 2>/dev/null || echo 0) + prev=${prev_sizes["$f"]:-0} + if (( cur_size == 0 || cur_size != prev )); then + all_stable=false + fi + prev_sizes["$f"]=$cur_size + done + if $all_stable && $found_any; then + stable_count=$((stable_count + 1)) + if (( stable_count >= 3 )); then + echo "All ${#prev_sizes[@]} nsys reports stable." + break + fi + else + stable_count=0 fi - else - stable_count=0 - fi - prev_size=$cur_size - sleep 5 -done + sleep 5 + done +fi echo "========================================" echo "Benchmark complete!" 
From 49ee7674f184d3616153a497ddbf82b284115c58 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 13 Apr 2026 14:03:22 +0000 Subject: [PATCH 16/23] Finally make nsys work with file-based sync hack --- presto/slurm/presto-nvl72/functions.sh | 37 ++++++++++--------- presto/slurm/presto-nvl72/launch-run.sh | 5 ++- .../slurm/presto-nvl72/profiler_functions.sh | 10 ++++- .../presto-nvl72/run-presto-benchmarks.sh | 36 +++++++++++++----- 4 files changed, 57 insertions(+), 31 deletions(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index ffcbc40d..ec4373b4 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -294,29 +294,30 @@ echo \"Worker ${worker_id}: CUFILE_LOGFILE_PATH=\${CUFILE_LOGFILE_PATH:-unset}\" if [[ -n '${nsys_bin}' ]]; then ( echo \"Worker ${worker_id}: nsys subshell started\" - while true; do - # Wait for any start token - start_token='' - while [[ -z \"\${start_token}\" ]]; do - for f in ${vt_nsys_report_dir}/.nsys_start_token_Q*; do - [[ -f \"\$f\" ]] && start_token=\"\$f\" && break - done - [[ -z \"\${start_token}\" ]] && { read -t 2 -r _ <<< '' || true; } + if [[ -n '${QUERIES:-}' ]]; then + IFS=',' read -ra qlist <<< '${QUERIES}' + else + qlist=({1..22}) + fi + for qnum in \"\${qlist[@]}\"; do + qid=\"Q\${qnum}\" + while [[ ! -f ${vt_nsys_report_dir}/.nsys_start_token_\${qid} ]]; do + read -t 2 -r _ <<< '' || true done - query_id=\${start_token##*_token_} - echo \"Worker ${worker_id}: start token found for \${query_id}, running nsys start\" - rm \"\${start_token}\" - ${nsys_bin} start -o ${vt_nsys_report_dir}/nsys_worker_${worker_id}_\${query_id} -f true - echo \"Worker ${worker_id}: nsys start exit code: \$?\" - - # Wait for corresponding stop token - while [[ ! 
-f ${vt_nsys_report_dir}/.nsys_stop_token_\${query_id} ]]; do + echo \"Worker ${worker_id}: start token found for \${qid}\" + rm ${vt_nsys_report_dir}/.nsys_start_token_\${qid} + ${nsys_bin} start -o ${vt_nsys_report_dir}/nsys_worker_${worker_id}_\${qid} -f true; echo \"Worker ${worker_id}: nsys start exit code: \$?\" + echo \"Worker ${worker_id}: post-start token created for \${qid}\" + touch ${vt_nsys_report_dir}/.nsys_started_token_\${qid} + + while [[ ! -f ${vt_nsys_report_dir}/.nsys_stop_token_\${qid} ]]; do read -t 2 -r _ <<< '' || true done - echo \"Worker ${worker_id}: stop token found for \${query_id}, running nsys stop\" - rm ${vt_nsys_report_dir}/.nsys_stop_token_\${query_id} + echo \"Worker ${worker_id}: stop token found for \${qid}\" + rm ${vt_nsys_report_dir}/.nsys_stop_token_\${qid} ${nsys_bin} stop; echo \"Worker ${worker_id}: nsys stop exit code: \$?\" done + echo \"Worker ${worker_id}: nsys subshell done, all queries profiled\" ) & echo \"Worker ${worker_id}: Nsight System program at ${nsys_bin}\" diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 49fffc63..de7789fa 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -199,7 +199,10 @@ EXPORT_VARS+=",ENABLE_GDS=${ENABLE_GDS}" EXPORT_VARS+=",ENABLE_METRICS=${ENABLE_METRICS}" EXPORT_VARS+=",ENABLE_NSYS=${ENABLE_NSYS}" if [[ -n "${QUERIES}" ]]; then - EXPORT_VARS+=",QUERIES='${QUERIES}'" + # Do not append to EXPORT_VARS since comma seprator is ambiguous. + # Single quote causes further issue down the line. + # So using env var directly is the simplest correct approach. 
+ export QUERIES fi JOB_ID=$(sbatch --job-name="${JOB_NAME}" --nodes="${NODES_COUNT}" --nodelist="${NODELIST}" \ diff --git a/presto/slurm/presto-nvl72/profiler_functions.sh b/presto/slurm/presto-nvl72/profiler_functions.sh index 0f3eeffd..c49a1ad6 100755 --- a/presto/slurm/presto-nvl72/profiler_functions.sh +++ b/presto/slurm/presto-nvl72/profiler_functions.sh @@ -7,11 +7,17 @@ set -e function start_profiler() { local -r profile_output_file_path=$1 local -r query_id=$(basename ${profile_output_file_path}) - touch "/workspace/presto/slurm/presto-nvl72/logs/.nsys_start_token_${query_id}" + local -r logs_dir="/workspace/presto/slurm/presto-nvl72/logs" + touch "${logs_dir}/.nsys_start_token_${query_id}" + while [[ ! -f "${logs_dir}/.nsys_started_token_${query_id}" ]]; do + read -t 2 -r _ <<< '' || true + done + rm "${logs_dir}/.nsys_started_token_${query_id}" } function stop_profiler() { local -r profile_output_file_path=$1 local -r query_id=$(basename ${profile_output_file_path}) - touch "/workspace/presto/slurm/presto-nvl72/logs/.nsys_stop_token_${query_id}" + local -r logs_dir="/workspace/presto/slurm/presto-nvl72/logs" + touch "${logs_dir}/.nsys_stop_token_${query_id}" } diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index a254922d..834ecce9 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -76,25 +76,41 @@ echo "--> QUERIES: ${QUERIES:-UNDEFINED}" if [[ "${ENABLE_NSYS}" == "1" ]]; then echo "Waiting for nsys report generation..." 
- stable_count=0 + if [[ -n "${QUERIES:-}" ]]; then + IFS=',' read -ra qlist <<< "${QUERIES}" + else + qlist=({1..22}) + fi + declare -A prev_sizes + stable_count=0 for i in {1..120}; do all_stable=true - found_any=false - for f in "${LOGS}"/nsys_worker_*.nsys-rep; do - [[ -f "$f" ]] || continue - found_any=true - cur_size=$(stat -c%s "$f" 2>/dev/null || echo 0) - prev=${prev_sizes["$f"]:-0} + for qnum in "${qlist[@]}"; do + report="${LOGS}/nsys_worker_0_Q${qnum}.nsys-rep" + fallback="${LOGS}/nsys_worker_0_Q${qnum}.qdstrm" + if [[ -f "$report" ]]; then + target="$report" + elif [[ -f "$fallback" ]]; then + target="$fallback" + else + echo " Q${qnum}: no file yet" + all_stable=false + continue + fi + cur_size=$(stat -c%s "$target" 2>/dev/null || echo 0) + prev=${prev_sizes["$target"]:-0} + echo " Q${qnum}: cur=${cur_size} prev=${prev}" if (( cur_size == 0 || cur_size != prev )); then all_stable=false fi - prev_sizes["$f"]=$cur_size + prev_sizes["$target"]=$cur_size done - if $all_stable && $found_any; then + echo " all_stable=${all_stable} stable_count=${stable_count}" + if $all_stable; then stable_count=$((stable_count + 1)) if (( stable_count >= 3 )); then - echo "All ${#prev_sizes[@]} nsys reports stable." + echo "All ${#qlist[@]} nsys reports stable." break fi else From 1b356c4f2ac876a73a5b24470e95ca060ca910af Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 13 Apr 2026 14:57:55 +0000 Subject: [PATCH 17/23] Clean up. 
Allow posting of nsys-rep files --- benchmark_reporting_tools/post_results.py | 9 ++++++++- presto/slurm/presto-nvl72/functions.sh | 8 +++++--- presto/slurm/presto-nvl72/launch-run.sh | 1 - presto/slurm/presto-nvl72/run-presto-benchmarks.sh | 2 -- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/benchmark_reporting_tools/post_results.py b/benchmark_reporting_tools/post_results.py index 1e2489dc..fa9c5d6f 100644 --- a/benchmark_reporting_tools/post_results.py +++ b/benchmark_reporting_tools/post_results.py @@ -548,6 +548,7 @@ async def upload_log_files( List of asset IDs from the uploaded files """ log_files = sorted(benchmark_dir.glob("*.log")) + log_files.extend(sorted(benchmark_dir.glob("*.nsys-rep"))) metrics_dir = benchmark_dir / "metrics" if metrics_dir.is_dir(): log_files.extend(sorted(metrics_dir.glob("*.json"))) @@ -563,7 +564,12 @@ async def _upload_one(log_file: Path) -> int: async with semaphore: print(f" Uploading {log_file.name}...", file=sys.stderr) content = log_file.read_bytes() - media_type = "application/json" if log_file.suffix == ".json" else "text/plain" + if log_file.suffix == ".json": + media_type = "application/json" + elif log_file.suffix == ".nsys-rep": + media_type = "application/octet-stream" + else: + media_type = "text/plain" response = await client.post( "/api/assets/upload/", files={"file": (log_file.name, content, media_type)}, @@ -742,6 +748,7 @@ async def process_benchmark_dir( if upload_logs: if dry_run: log_files = sorted(benchmark_dir.glob("*.log")) + log_files.extend(sorted(benchmark_dir.glob("*.nsys-rep"))) metrics_dir = benchmark_dir / "metrics" if metrics_dir.is_dir(): log_files.extend(sorted(metrics_dir.glob("*.json"))) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index ec4373b4..de2dd007 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -231,9 +231,6 @@ function run_worker { done fi - # 
--nvtx-domain-exclude=CCCL - # --cpuctxsw=none - # --sample=none local nsys_bin="" local nsys_launch_opts="" local vt_nsys_report_dir="/var/log/nsys" @@ -498,6 +495,11 @@ function collect_results { echo "Copying logs to ${result_dir}/..." cp "${LOGS}"/*.log "${result_dir}/" + + if [[ "${ENABLE_NSYS}" == "1" ]]; then + echo "Copying nsys reports to ${result_dir}/..." + cp "${LOGS}"/*.nsys-rep "${result_dir}/" + fi } function inject_benchmark_metadata { diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index de7789fa..67775e0e 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -182,7 +182,6 @@ OUT_FMT="presto-tpch-run_n${NODES_COUNT}_sf${SCALE_FACTOR}_i${NUM_ITERATIONS}_%j ERR_FMT="presto-tpch-run_n${NODES_COUNT}_sf${SCALE_FACTOR}_i${NUM_ITERATIONS}_%j.err" SCRIPT_DIR="$PWD" JOB_NAME="presto-tpch-run_n${NODES_COUNT}_sf${SCALE_FACTOR}" -# Node 5 has known issues; nodes above 10 are not yet functional. NODELIST="${NODELIST:-${DEFAULT_NODELIST}}" GRES_OPT=$([[ "$VARIANT_TYPE" == "gpu" ]] && echo "--gres=gpu:${NUM_GPUS_PER_NODE}" || echo "") diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index 834ecce9..5baf549a 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -72,8 +72,6 @@ cp -r ${LOGS}/cli.log ${SCRIPT_DIR}/result_dir/summary.txt echo "Collecting configs and logs into result directory..." collect_results -echo "--> QUERIES: ${QUERIES:-UNDEFINED}" - if [[ "${ENABLE_NSYS}" == "1" ]]; then echo "Waiting for nsys report generation..." if [[ -n "${QUERIES:-}" ]]; then From 22053ee57ad1acaed774664eda19e1a9a3c70372 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 13 Apr 2026 15:20:32 +0000 Subject: [PATCH 18/23] Post large files via S3 presigned URL. 
Fix file copy bugs --- benchmark_reporting_tools/post_results.py | 70 ++++++++++++++++--- presto/slurm/presto-nvl72/functions.sh | 5 -- .../presto-nvl72/run-presto-benchmarks.sh | 3 + 3 files changed, 63 insertions(+), 15 deletions(-) diff --git a/benchmark_reporting_tools/post_results.py b/benchmark_reporting_tools/post_results.py index fa9c5d6f..2e89e906 100644 --- a/benchmark_reporting_tools/post_results.py +++ b/benchmark_reporting_tools/post_results.py @@ -53,9 +53,10 @@ from datetime import datetime from pathlib import Path from urllib.parse import urlparse, urlunparse - +from typing import Any import httpx +LARGE_ASSET_DIRECT_UPLOAD_THRESHOLD_BYTES = 10 * 1024 * 1024 @dataclasses.dataclass(kw_only=True) class BenchmarkMetadata: @@ -527,6 +528,50 @@ def build_http_client(api_url: str, api_key: str, timeout: float) -> httpx.Async timeout=timeout, ) +async def _s3_presigned_put( + upload_url: str, + required_headers: dict[str, Any], + content: bytes, + timeout: float, +) -> tuple[int, str]: + headers = {str(k): str(v) for k, v in required_headers.items()} + async with httpx.AsyncClient(timeout=timeout) as s3_client: + response = await s3_client.put(upload_url, headers=headers, content=content) + return response.status_code, response.text + + +async def _upload_asset_presigned( + client: httpx.AsyncClient, + content: bytes, + filename: str, + title: str, + media_type: str, + timeout: float, +) -> int: + url_resp = await client.post( + "/api/assets/upload-url/", + json={"original_filename": filename, "media_type": media_type}, + ) + if url_resp.status_code not in (200, 201): + raise RuntimeError(f"Failed to get upload URL: {url_resp.status_code} {url_resp.text}") + + presign = url_resp.json() + upload_url = presign["upload_url"] + s3_key = presign["s3_key"] + required_headers = presign.get("required_headers") or {} + + put_status, put_body = await _s3_presigned_put(upload_url, required_headers, content, timeout) + if put_status not in (200, 204): + raise 
RuntimeError(f"S3 PUT failed: {put_status} {put_body}") + + complete = await client.post( + "/api/assets/complete-upload/", + json={"s3_key": s3_key, "title": title, "media_type": media_type}, + ) + if complete.status_code != 201: + raise RuntimeError(f"Complete upload failed: {complete.status_code} {complete.text}") + return complete.json()["asset_id"] + async def upload_log_files( benchmark_dir: Path, @@ -570,15 +615,20 @@ async def _upload_one(log_file: Path) -> int: media_type = "application/octet-stream" else: media_type = "text/plain" - response = await client.post( - "/api/assets/upload/", - files={"file": (log_file.name, content, media_type)}, - data={"title": log_file.name, "media_type": media_type}, - ) - if response.status_code >= 400: - raise RuntimeError(f"Failed to upload {log_file.name}: {response.status_code} {response.text}") - result = response.json() - asset_id = result["asset_id"] + + if len(content) > LARGE_ASSET_DIRECT_UPLOAD_THRESHOLD_BYTES: + print(f" Using presigned upload for {log_file.name} ({len(content) // (1024 * 1024)} MiB)...", file=sys.stderr) + asset_id = await _upload_asset_presigned(client, content, log_file.name, log_file.name, media_type, timeout) + else: + response = await client.post( + "/api/assets/upload/", + files={"file": (log_file.name, content, media_type)}, + data={"title": log_file.name, "media_type": media_type}, + ) + if response.status_code >= 400: + raise RuntimeError(f"Failed to upload {log_file.name}: {response.status_code} {response.text}") + asset_id = response.json()["asset_id"] + print(f" Uploaded {log_file.name} (asset_id={asset_id})", file=sys.stderr) return asset_id diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index de2dd007..9b67182e 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -495,11 +495,6 @@ function collect_results { echo "Copying logs to ${result_dir}/..." 
cp "${LOGS}"/*.log "${result_dir}/" - - if [[ "${ENABLE_NSYS}" == "1" ]]; then - echo "Copying nsys reports to ${result_dir}/..." - cp "${LOGS}"/*.nsys-rep "${result_dir}/" - fi } function inject_benchmark_metadata { diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index 5baf549a..a8e5a0f3 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -116,6 +116,9 @@ if [[ "${ENABLE_NSYS}" == "1" ]]; then fi sleep 5 done + + echo "Copying nsys reports to ${result_dir}/..." + cp "${LOGS}"/*.nsys-rep "${result_dir}/" fi echo "========================================" From 09b13bff0c73a618d5bad4b4b3885392a67d8e15 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 13 Apr 2026 16:20:12 +0000 Subject: [PATCH 19/23] Fix bugs --- presto/slurm/presto-nvl72/run-presto-benchmarks.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index a8e5a0f3..719c1751 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -117,8 +117,8 @@ if [[ "${ENABLE_NSYS}" == "1" ]]; then sleep 5 done - echo "Copying nsys reports to ${result_dir}/..." - cp "${LOGS}"/*.nsys-rep "${result_dir}/" + echo "Copying nsys reports to ${SCRIPT_DIR}/result_dir/..." 
+ cp "${LOGS}"/*.nsys-rep "${SCRIPT_DIR}/result_dir/" fi echo "========================================" From 418d651e5bed1f20be0cc896997256ed87f2ec88 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 13 Apr 2026 17:59:11 +0000 Subject: [PATCH 20/23] Use an older nsys version to avoid a new bug --- presto/slurm/presto-nvl72/functions.sh | 2 +- presto/slurm/presto-nvl72/launch-run.sh | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 9b67182e..171655c6 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -235,7 +235,7 @@ function run_worker { local nsys_launch_opts="" local vt_nsys_report_dir="/var/log/nsys" if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then - nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" + nsys_bin="/opt/nvidia/nsight-systems-cli/2025.5.1/bin/nsys" nsys_launch_opts="-t nvtx,cuda,osrt,ucx \ --cuda-memory-usage=true \ --cuda-um-cpu-page-faults=true \ diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 67775e0e..56abf0e6 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -40,9 +40,8 @@ NUM_GPUS_PER_NODE="4" USE_NUMA="1" VARIANT_TYPE="gpu" # WORKER_IMAGE="presto-native-worker-gpu" -WORKER_IMAGE="presto-native-worker-gpu-karth-Mar11-with-nsys" -# COORD_IMAGE="presto-coordinator-karth-Mar11" -COORD_IMAGE="presto-coordinator-karth-Mar11-with-nsys" +WORKER_IMAGE="presto-native-worker-gpu-karth-Mar11-with-nsys-2025.5.1" +COORD_IMAGE="presto-coordinator-karth-Mar11" #COORD_IMAGE="presto-coordinator-ibm-03-11" #WORKER_IMAGE="presto-native-worker-gpu-ibm-03-11" #WORKER_IMAGE="velox-testing-images-presto-471cf1a-velox-1a2f63f-gpu-cuda13.1-20260312-arm64" From 82557ad606ecca17517bc5b1b15811043482aff1 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 15 Apr 2026 17:17:23 +0000 
Subject: [PATCH 21/23] Update --- presto/slurm/presto-nvl72/functions.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 171655c6..fb9b685c 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -236,10 +236,11 @@ function run_worker { local vt_nsys_report_dir="/var/log/nsys" if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then nsys_bin="/opt/nvidia/nsight-systems-cli/2025.5.1/bin/nsys" - nsys_launch_opts="-t nvtx,cuda,osrt,ucx \ - --cuda-memory-usage=true \ - --cuda-um-cpu-page-faults=true \ - --cuda-um-gpu-page-faults=true" + nsys_launch_opts="-t nvtx,cuda" + # nsys_launch_opts="-t nvtx,cuda,osrt,ucx \ + # --cuda-memory-usage=true \ + # --cuda-um-cpu-page-faults=true \ + # --cuda-um-gpu-page-faults=true" fi # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel From 63201b5ae8108b83b601491bbc6be35c1ffa5c70 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 15 Apr 2026 18:42:55 +0000 Subject: [PATCH 22/23] Critical bug fixes --- presto/slurm/presto-nvl72/functions.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index fb9b685c..e2fbe25a 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -276,7 +276,7 @@ ${LOGS}:${vt_nsys_report_dir},\ /usr/lib/aarch64-linux-gnu/libnvidia-ml.so.580.105.08:/usr/local/lib/libnvidia-ml.so.1\ ${gds_mounts:+,${gds_mounts}} \ -- /bin/bash -c " -export LD_LIBRARY_PATH=\"${CUDF_LIB}:${LD_LIBRARY_PATH}\" +export LD_LIBRARY_PATH=\"${CUDF_LIB}\${LD_LIBRARY_PATH:+:\${LD_LIBRARY_PATH}}\" if [[ '${ENABLE_GDS}' == '1' ]]; then export KVIKIO_COMPAT_MODE=OFF export CUFILE_LOGFILE_PATH=${vt_cufile_log} @@ -293,7 +293,7 @@ if [[ -n '${nsys_bin}' ]]; then ( echo \"Worker ${worker_id}: nsys subshell started\" 
if [[ -n '${QUERIES:-}' ]]; then - IFS=',' read -ra qlist <<< '${QUERIES}' + IFS=',' read -ra qlist <<< '${QUERIES:-}' else qlist=({1..22}) fi From 109c4168312340cc33842cbc63cfec592b6eb733 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 16 Apr 2026 14:00:39 +0000 Subject: [PATCH 23/23] Add more images --- presto/slurm/presto-nvl72/launch-run.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 56abf0e6..f1af1949 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -41,7 +41,10 @@ USE_NUMA="1" VARIANT_TYPE="gpu" # WORKER_IMAGE="presto-native-worker-gpu" WORKER_IMAGE="presto-native-worker-gpu-karth-Mar11-with-nsys-2025.5.1" +# WORKER_IMAGE="velox-testing-images-presto-766546f-velox-1ca955b-gpu-cuda12.9-20260415-arm64-with-nsys" COORD_IMAGE="presto-coordinator-karth-Mar11" +# COORD_IMAGE="velox-testing-images-presto-coordinator-766546f-20260415-arm64-with-jq" +# COORD_IMAGE="presto-coordinator-karth-Mar11" #COORD_IMAGE="presto-coordinator-ibm-03-11" #WORKER_IMAGE="presto-native-worker-gpu-ibm-03-11" #WORKER_IMAGE="velox-testing-images-presto-471cf1a-velox-1a2f63f-gpu-cuda13.1-20260312-arm64"