From 49d47ecacf203a029bbdd2924bfd6df6b9dcad63 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 2 Apr 2026 17:32:42 +0000 Subject: [PATCH 01/23] Fix NUMA GPU pinning bug --- presto/slurm/presto-nvl72/functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 3b3ef1ba..34a0ada8 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -179,7 +179,7 @@ function run_worker { local gpu_id=$1 image=$2 node=$3 worker_id=$4 # Assign NUMA node based on GPU ID: GPUs 0-3 → node 0, GPUs 4-7 → node 1, etc. - local numa_node=$((gpu_id / 4)) + local numa_node=$((gpu_id / 2)) echo "running worker ${worker_id} with image ${image} on node ${node} with gpu_id ${gpu_id} numa_node ${numa_node}" local worker_image="${IMAGE_DIR}/${image}.sqsh" From 86eeb14b0caa2d95d425a0d2e4cce37743b73e95 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 2 Apr 2026 17:33:30 +0000 Subject: [PATCH 02/23] Update comment --- presto/slurm/presto-nvl72/functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 34a0ada8..ba9e3ca7 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -178,7 +178,7 @@ function run_worker { validate_environment_preconditions LOGS CONFIGS VT_ROOT COORD CUDF_LIB DATA local gpu_id=$1 image=$2 node=$3 worker_id=$4 - # Assign NUMA node based on GPU ID: GPUs 0-3 → node 0, GPUs 4-7 → node 1, etc. + # Assign NUMA node based on GPU ID: GPUs 0-1 → node 0, GPUs 2-3 → node 1, etc. 
local numa_node=$((gpu_id / 2)) echo "running worker ${worker_id} with image ${image} on node ${node} with gpu_id ${gpu_id} numa_node ${numa_node}" From 912a80aa35bc7def9387baf51fb4cefd2972d2ba Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 2 Apr 2026 21:36:05 +0000 Subject: [PATCH 03/23] Enable gds --- presto/slurm/presto-nvl72/defaults.env | 4 +++ presto/slurm/presto-nvl72/functions.sh | 49 +++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/presto/slurm/presto-nvl72/defaults.env b/presto/slurm/presto-nvl72/defaults.env index 9a8e9c1b..b3973a77 100644 --- a/presto/slurm/presto-nvl72/defaults.env +++ b/presto/slurm/presto-nvl72/defaults.env @@ -30,3 +30,7 @@ unset _vt_path # --- SLURM node defaults (cluster-specific) --- : "${DEFAULT_NODELIST:=presto-gb200-gcn-[01-13,15-16]}" : "${DEFAULT_SINGLE_NODE:=presto-gb200-gcn-01}" + +# --- I/O settings --- +: "${ENABLE_GDS:=1}" +export ENABLE_GDS diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index ba9e3ca7..c61cc71b 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -199,6 +199,51 @@ function run_worker { mkdir -p ${worker_data}/hive/data/user_data mkdir -p ${VT_ROOT}/.hive_metastore + local vt_cufile_log_dir="/var/log/cufile" + local vt_cufile_log="${vt_cufile_log_dir}/cufile_worker_${worker_id}.log" + + local gds_mounts="" + local gds_env_args="" + + function add_gds_sys_path { + local path="${1:?Path argument missing}" + local read_only="${2:-0}" + + # System file path must exist + if [[ ! 
-e ${path} ]]; then + echo "${path} required by GDS does not exist" + exit 1 + fi + + # If gds_mounts is not empty, append a comma + [[ -n "${gds_mounts}" ]] && gds_mounts+="," + + # Append path + if [[ "${read_only}" == "1" ]]; then + gds_mounts+="${path}:${path}:ro" + else + gds_mounts+="${path}" + fi + } + + if [[ "${ENABLE_GDS}" == "1" ]]; then + # Add GDS-required system paths + add_gds_sys_path "/run/udev" 1 + add_gds_sys_path "/dev/infiniband" + add_gds_sys_path "/etc/cufile.json" 1 + for dev in /dev/nvidia-fs*; do + # If file exists, append the path, otherwise, exit the loop + [[ -e "${dev}" ]] || continue + add_gds_sys_path "${dev}" + done + + # Add the log directory + gds_mounts+=",${LOGS}:${vt_cufile_log_dir}" + + # Add GDS-related env vars + gds_env_args="--container-env=KVIKIO_COMPAT_MODE=OFF --container-env=CUFILE_LOGFILE_PATH=${vt_cufile_log}" + fi + # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel # capabilities are already set up for the job cgroup. 
Do NOT use --gres=gpu:1 # on the step: it restricts the step's cgroup to one GPU and then nvidia-container-cli @@ -225,7 +270,9 @@ ${worker_data}:/var/lib/presto/data,\ ${DATA}:/var/lib/presto/data/hive/data/user_data,\ ${VT_ROOT}/.hive_metastore:/var/lib/presto/data/hive/metastore,\ /usr/lib/aarch64-linux-gnu/libcuda.so.580.105.08:/usr/local/cuda-13.0/compat/libcuda.so.1,\ -/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.580.105.08:/usr/local/lib/libnvidia-ml.so.1 \ +/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.580.105.08:/usr/local/lib/libnvidia-ml.so.1\ +${gds_mounts:+,${gds_mounts}} \ +${gds_env_args} \ --container-env=LD_LIBRARY_PATH="$CUDF_LIB:$LD_LIBRARY_PATH" \ --container-env=GLOG_vmodule=IntraNodeTransferRegistry=3,ExchangeOperator=3 \ --container-env=GLOG_logtostderr=1 \ From c36a52a0f6fd0f5d86889fe07a43d62b39cc5662 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 3 Apr 2026 04:01:07 +0000 Subject: [PATCH 04/23] Fix bugs --- presto/slurm/presto-nvl72/functions.sh | 32 +++++++++++--------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index c61cc71b..7d2d6b0b 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -94,9 +94,7 @@ function run_coord_image { srun -w $COORD --ntasks=1 --overlap \ --container-image=${coord_image} \ --container-remap-root \ ---export=ALL,JAVA_HOME=/usr/lib/jvm/jre-17-openjdk \ ---container-env=JAVA_HOME=/usr/lib/jvm/jre-17-openjdk \ ---container-env=PATH=/usr/lib/jvm/jre-17-openjdk/bin:$PATH \ +--export=ALL \ --container-mounts=${VT_ROOT}:/workspace,\ ${coord_data}:/var/lib/presto/data,\ ${CONFIGS}/etc_common:/opt/presto-server/etc,\ @@ -110,9 +108,7 @@ ${VT_ROOT}/.hive_metastore:/var/lib/presto/data/hive/metastore${extra_mounts} \ srun -w $COORD --ntasks=1 --overlap \ --container-remap-root \ --container-image=${coord_image} \ ---export=ALL,JAVA_HOME=/usr/lib/jvm/jre-17-openjdk \ 
---container-env=JAVA_HOME=/usr/lib/jvm/jre-17-openjdk \ ---container-env=PATH=/usr/lib/jvm/jre-17-openjdk/bin:$PATH \ +--export=ALL \ --container-mounts=${VT_ROOT}:/workspace,\ ${coord_data}:/var/lib/presto/data,\ ${CONFIGS}/etc_common:/opt/presto-server/etc,\ @@ -203,8 +199,6 @@ function run_worker { local vt_cufile_log="${vt_cufile_log_dir}/cufile_worker_${worker_id}.log" local gds_mounts="" - local gds_env_args="" - function add_gds_sys_path { local path="${1:?Path argument missing}" local read_only="${2:-0}" @@ -219,10 +213,9 @@ function run_worker { [[ -n "${gds_mounts}" ]] && gds_mounts+="," # Append path + gds_mounts+="${path}:${path}" if [[ "${read_only}" == "1" ]]; then - gds_mounts+="${path}:${path}:ro" - else - gds_mounts+="${path}" + gds_mounts+=":ro" fi } @@ -239,9 +232,6 @@ function run_worker { # Add the log directory gds_mounts+=",${LOGS}:${vt_cufile_log_dir}" - - # Add GDS-related env vars - gds_env_args="--container-env=KVIKIO_COMPAT_MODE=OFF --container-env=CUFILE_LOGFILE_PATH=${vt_cufile_log}" fi # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel @@ -272,13 +262,19 @@ ${VT_ROOT}/.hive_metastore:/var/lib/presto/data/hive/metastore,\ /usr/lib/aarch64-linux-gnu/libcuda.so.580.105.08:/usr/local/cuda-13.0/compat/libcuda.so.1,\ /usr/lib/aarch64-linux-gnu/libnvidia-ml.so.580.105.08:/usr/local/lib/libnvidia-ml.so.1\ ${gds_mounts:+,${gds_mounts}} \ -${gds_env_args} \ ---container-env=LD_LIBRARY_PATH="$CUDF_LIB:$LD_LIBRARY_PATH" \ ---container-env=GLOG_vmodule=IntraNodeTransferRegistry=3,ExchangeOperator=3 \ ---container-env=GLOG_logtostderr=1 \ -- /bin/bash -c " +export LD_LIBRARY_PATH=\"${CUDF_LIB}:${LD_LIBRARY_PATH}\" +export GLOG_vmodule=IntraNodeTransferRegistry=3,ExchangeOperator=3 +export GLOG_logtostderr=1 +if [[ '${ENABLE_GDS}' == '1' ]]; then + export KVIKIO_COMPAT_MODE=OFF + export CUFILE_LOGFILE_PATH=${vt_cufile_log} +fi if [[ '${VARIANT_TYPE}' == 'gpu' ]]; then export CUDA_VISIBLE_DEVICES=${gpu_id}; fi echo 
\"Worker ${worker_id}: CUDA_VISIBLE_DEVICES=\${CUDA_VISIBLE_DEVICES:-none}, NUMA_NODE=${numa_node}\" +echo \"Worker ${worker_id}: ENABLE_GDS=\${ENABLE_GDS:-unset}\" +echo \"Worker ${worker_id}: KVIKIO_COMPAT_MODE=\${KVIKIO_COMPAT_MODE:-unset}\" +echo \"Worker ${worker_id}: CUFILE_LOGFILE_PATH=\${CUFILE_LOGFILE_PATH:-unset}\" if [[ '${USE_NUMA}' == '1' ]]; then numactl --cpubind=${numa_node} --membind=${numa_node} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc else From a1389266da5574ab41f0b04dec8f8ba07df2c675 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 3 Apr 2026 04:22:37 +0000 Subject: [PATCH 05/23] Fix huge dump size --- presto/slurm/presto-nvl72/functions.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 7d2d6b0b..7dbee15b 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -247,6 +247,8 @@ function run_worker { # compat library with the host driver so cudaMallocAsync works. # CUDA_VISIBLE_DEVICES=${gpu_id} inside the container restricts each worker to # its assigned GPU while still allowing the CUDA driver to enumerate all devices. 
+ # export GLOG_vmodule=IntraNodeTransferRegistry=3,ExchangeOperator=3 + # export GLOG_logtostderr=1 srun -N1 -w $node --ntasks=1 --overlap \ --container-image=${worker_image} \ --container-remap-root \ @@ -264,11 +266,10 @@ ${VT_ROOT}/.hive_metastore:/var/lib/presto/data/hive/metastore,\ ${gds_mounts:+,${gds_mounts}} \ -- /bin/bash -c " export LD_LIBRARY_PATH=\"${CUDF_LIB}:${LD_LIBRARY_PATH}\" -export GLOG_vmodule=IntraNodeTransferRegistry=3,ExchangeOperator=3 -export GLOG_logtostderr=1 if [[ '${ENABLE_GDS}' == '1' ]]; then export KVIKIO_COMPAT_MODE=OFF export CUFILE_LOGFILE_PATH=${vt_cufile_log} + export CUFILE_LOGGING_LEVEL=INFO fi if [[ '${VARIANT_TYPE}' == 'gpu' ]]; then export CUDA_VISIBLE_DEVICES=${gpu_id}; fi echo \"Worker ${worker_id}: CUDA_VISIBLE_DEVICES=\${CUDA_VISIBLE_DEVICES:-none}, NUMA_NODE=${numa_node}\" From 9be3af624d708ca7b14ad46c6f14b3760401b802 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 8 Apr 2026 21:29:34 +0000 Subject: [PATCH 06/23] Update --- presto/slurm/presto-nvl72/defaults.env | 9 ++++- presto/slurm/presto-nvl72/functions.sh | 34 ++++++++++++++++--- presto/slurm/presto-nvl72/launch-run.sh | 4 +-- .../presto-nvl72/run-presto-benchmarks.sh | 16 +++++++++ 4 files changed, 56 insertions(+), 7 deletions(-) diff --git a/presto/slurm/presto-nvl72/defaults.env b/presto/slurm/presto-nvl72/defaults.env index b3973a77..2c7782f0 100644 --- a/presto/slurm/presto-nvl72/defaults.env +++ b/presto/slurm/presto-nvl72/defaults.env @@ -28,9 +28,16 @@ unset _vt_path : "${HIVE_METASTORE_SOURCE:=/mnt/data/tpch-rs/HIVE-METASTORE-MG-260313}" # --- SLURM node defaults (cluster-specific) --- -: "${DEFAULT_NODELIST:=presto-gb200-gcn-[01-13,15-16]}" +: "${DEFAULT_NODELIST:=presto-gb200-gcn-[01-16]}" : "${DEFAULT_SINGLE_NODE:=presto-gb200-gcn-01}" # --- I/O settings --- : "${ENABLE_GDS:=1}" export ENABLE_GDS + +# --- Profiling --- +: "${ENABLE_NSYS:=0}" +export ENABLE_NSYS + +: "${NSYS_WORKER_ID:=0}" +export NSYS_WORKER_ID diff --git 
a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 7dbee15b..cdeb740b 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -229,9 +229,27 @@ function run_worker { [[ -e "${dev}" ]] || continue add_gds_sys_path "${dev}" done + fi - # Add the log directory - gds_mounts+=",${LOGS}:${vt_cufile_log_dir}" + local nsys_bin="" + local nsys_opts="" + local vt_nsys_report_dir="/var/log/nsys" + if [[ "${ENABLE_NSYS}" == "1" && "${NSYS_WORKER_ID}" == "${worker_id}" ]]; then + nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" + nsys_opts="profile \ + -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ + -t cuda,nvtx \ + -f true \ + --sample=none \ + --cpuctxsw=none \ + --cuda-memory-usage=true \ + --nvtx-domain-exclude=CCCL" + # nsys_opts="profile \ + # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ + # -t cuda,ucx,nvtx,osrt \ + # -f true \ + # --cuda-memory-usage=true \ + # --nvtx-domain-exclude=CCCL" fi # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel @@ -261,6 +279,8 @@ ${worker_hive}:/opt/presto-server/etc/catalog/hive.properties,\ ${worker_data}:/var/lib/presto/data,\ ${DATA}:/var/lib/presto/data/hive/data/user_data,\ ${VT_ROOT}/.hive_metastore:/var/lib/presto/data/hive/metastore,\ +${LOGS}:${vt_cufile_log_dir},\ +${LOGS}:${vt_nsys_report_dir},\ /usr/lib/aarch64-linux-gnu/libcuda.so.580.105.08:/usr/local/cuda-13.0/compat/libcuda.so.1,\ /usr/lib/aarch64-linux-gnu/libnvidia-ml.so.580.105.08:/usr/local/lib/libnvidia-ml.so.1\ ${gds_mounts:+,${gds_mounts}} \ @@ -274,12 +294,18 @@ fi if [[ '${VARIANT_TYPE}' == 'gpu' ]]; then export CUDA_VISIBLE_DEVICES=${gpu_id}; fi echo \"Worker ${worker_id}: CUDA_VISIBLE_DEVICES=\${CUDA_VISIBLE_DEVICES:-none}, NUMA_NODE=${numa_node}\" echo \"Worker ${worker_id}: ENABLE_GDS=\${ENABLE_GDS:-unset}\" +echo \"Worker ${worker_id}: ENABLE_NSYS=\${ENABLE_NSYS:-unset}\" echo \"Worker ${worker_id}: 
KVIKIO_COMPAT_MODE=\${KVIKIO_COMPAT_MODE:-unset}\" echo \"Worker ${worker_id}: CUFILE_LOGFILE_PATH=\${CUFILE_LOGFILE_PATH:-unset}\" + +if [[ -n '${nsys_bin}' ]]; then + echo \"Worker ${worker_id}: Nsight System program at ${nsys_bin}\" +fi + if [[ '${USE_NUMA}' == '1' ]]; then - numactl --cpubind=${numa_node} --membind=${numa_node} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc + numactl --cpubind=${numa_node} --membind=${numa_node} ${nsys_bin} ${nsys_opts} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc else - /usr/bin/presto_server --etc-dir=/opt/presto-server/etc + ${nsys_bin} ${nsys_opts} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc fi" > ${LOGS}/worker_${worker_id}.log 2>&1 & } diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 1a0a02d7..594974ad 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -39,9 +39,9 @@ EXTRA_ARGS=() NUM_GPUS_PER_NODE="4" USE_NUMA="1" VARIANT_TYPE="gpu" -#WORKER_IMAGE="presto-native-worker-gpu" +# WORKER_IMAGE="presto-native-worker-gpu" +WORKER_IMAGE="presto-native-worker-gpu-karth-Mar11-with-nsys" COORD_IMAGE="presto-coordinator-karth-Mar11" -WORKER_IMAGE="presto-native-worker-gpu-karth-Mar11" #COORD_IMAGE="presto-coordinator-ibm-03-11" #WORKER_IMAGE="presto-native-worker-gpu-ibm-03-11" #WORKER_IMAGE="velox-testing-images-presto-471cf1a-velox-1a2f63f-gpu-cuda13.1-20260312-arm64" diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index 298276fc..f5b953d1 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -38,10 +38,16 @@ wait_until_coordinator_is_running echo "Starting ${NUM_WORKERS} Presto workers across ${NUM_NODES} nodes..." 
worker_id=0 +nsys_worker_pid="" for node in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do for gpu_id in $(seq 0 $((NUM_GPUS_PER_NODE - 1))); do echo " Starting worker ${worker_id} on node ${node} GPU ${gpu_id}" run_worker "${gpu_id}" "$WORKER_IMAGE" "${node}" "$worker_id" + + if [[ "${ENABLE_NSYS}" == "1" && "${NSYS_WORKER_ID}" == "${worker_id}" ]]; then + nsys_worker_pid=$! + fi + worker_id=$((worker_id + 1)) done done @@ -72,6 +78,16 @@ cp -r ${LOGS}/cli.log ${SCRIPT_DIR}/result_dir/summary.txt echo "Collecting configs and logs into result directory..." collect_results +if [[ -n "${nsys_worker_pid}" ]]; then + echo "Sending SIGINT to nsys (worker srun PID ${nsys_worker_pid})..." + # Send the interrupt signal to the nsys process + # If the process has already terminated, `kill` will have an error, hence `|| true` + kill -INT "${nsys_worker_pid}" 2>/dev/null || true + echo "Waiting for nsys to finalize report..." + # Wait for the nsys process to finalize the report and store to disk + wait "${nsys_worker_pid}" 2>/dev/null || true +fi + echo "========================================" echo "Benchmark complete!" 
echo "Results saved to: ${SCRIPT_DIR}/results_dir" From e46a5b0cb6fb86170ec19ad6f3a4904cf6991aad Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 9 Apr 2026 03:46:21 +0000 Subject: [PATCH 07/23] Add metrics support --- presto/slurm/presto-nvl72/defaults.env | 4 ++++ presto/slurm/presto-nvl72/functions.sh | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/presto/slurm/presto-nvl72/defaults.env b/presto/slurm/presto-nvl72/defaults.env index 2c7782f0..970612c5 100644 --- a/presto/slurm/presto-nvl72/defaults.env +++ b/presto/slurm/presto-nvl72/defaults.env @@ -35,6 +35,10 @@ unset _vt_path : "${ENABLE_GDS:=1}" export ENABLE_GDS +# --- Query metrics --- +: "${ENABLE_METRICS:=0}" +export ENABLE_METRICS + # --- Profiling --- : "${ENABLE_NSYS:=0}" export ENABLE_NSYS diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index cdeb740b..ec405800 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -344,6 +344,9 @@ function run_queries { [ $# -ne 2 ] && echo_error "$0 expected two arguments for '' and ''" local num_iterations=$1 local scale_factor=$2 + local metrics_flag="" + [[ "${ENABLE_METRICS}" == "1" ]] && metrics_flag="-m" + source "${SCRIPT_DIR}/defaults.env" # We currently skip dropping cache because it requires docker (not available on the cluster). 
run_coord_image "export PORT=$PORT; \ @@ -352,7 +355,7 @@ function run_queries { export MINIFORGE_HOME=/workspace/miniforge3; \ export HOME=/workspace; \ cd /workspace/presto/scripts; \ - ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} \ + ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} ${metrics_flag} \ --hostname ${COORD} --port $PORT -o /workspace/presto/slurm/presto-nvl72/result_dir --skip-drop-cache; \ echo 'Validating query results...'; \ MINIFORGE_HOME=/workspace/miniforge3 /workspace/scripts/run_py_script.sh \ From 8091544e2941c30ca5da77f6ccb36f2b0c3c5153 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 9 Apr 2026 16:48:50 +0000 Subject: [PATCH 08/23] Allow post results to handle failed queries --- benchmark_reporting_tools/post_results.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmark_reporting_tools/post_results.py b/benchmark_reporting_tools/post_results.py index bf3da37d..0c52fc30 100644 --- a/benchmark_reporting_tools/post_results.py +++ b/benchmark_reporting_tools/post_results.py @@ -424,6 +424,8 @@ def build_submission_payload( for query_name in query_names: times = raw_times[query_name] + if times is None: + times = [] is_failed = query_name in failed_queries # Look up validation result for this query (keys are lowercase e.g. 
"q1") From f3012589b042ea9facf36f6319b9d5ec5ac8ccfa Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 9 Apr 2026 20:03:47 +0000 Subject: [PATCH 09/23] Update --- benchmark_reporting_tools/post_results.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/benchmark_reporting_tools/post_results.py b/benchmark_reporting_tools/post_results.py index 0c52fc30..1e2489dc 100644 --- a/benchmark_reporting_tools/post_results.py +++ b/benchmark_reporting_tools/post_results.py @@ -548,6 +548,9 @@ async def upload_log_files( List of asset IDs from the uploaded files """ log_files = sorted(benchmark_dir.glob("*.log")) + metrics_dir = benchmark_dir / "metrics" + if metrics_dir.is_dir(): + log_files.extend(sorted(metrics_dir.glob("*.json"))) if not log_files: return [] @@ -560,10 +563,11 @@ async def _upload_one(log_file: Path) -> int: async with semaphore: print(f" Uploading {log_file.name}...", file=sys.stderr) content = log_file.read_bytes() + media_type = "application/json" if log_file.suffix == ".json" else "text/plain" response = await client.post( "/api/assets/upload/", - files={"file": (log_file.name, content, "text/plain")}, - data={"title": log_file.name, "media_type": "text/plain"}, + files={"file": (log_file.name, content, media_type)}, + data={"title": log_file.name, "media_type": media_type}, ) if response.status_code >= 400: raise RuntimeError(f"Failed to upload {log_file.name}: {response.status_code} {response.text}") @@ -738,6 +742,9 @@ async def process_benchmark_dir( if upload_logs: if dry_run: log_files = sorted(benchmark_dir.glob("*.log")) + metrics_dir = benchmark_dir / "metrics" + if metrics_dir.is_dir(): + log_files.extend(sorted(metrics_dir.glob("*.json"))) print( f" [DRY RUN] Would upload {len(log_files)} log file(s): {[f.name for f in log_files]}", file=sys.stderr ) From 8c8c9204fb4b0b946186a1d99202b8d676619045 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 10 Apr 2026 15:46:45 +0000 Subject: [PATCH 10/23] Refactor --- 
presto/scripts/run_benchmark.sh | 15 +++++++- presto/slurm/presto-nvl72/defaults.env | 15 -------- presto/slurm/presto-nvl72/functions.sh | 23 +++++------ presto/slurm/presto-nvl72/launch-run.sh | 38 +++++++++++++++++-- .../slurm/presto-nvl72/profiler_functions.sh | 13 +++++++ .../presto-nvl72/run-presto-benchmarks.sh | 16 -------- .../performance_benchmarks/common_fixtures.py | 1 + 7 files changed, 74 insertions(+), 47 deletions(-) create mode 100755 presto/slurm/presto-nvl72/profiler_functions.sh diff --git a/presto/scripts/run_benchmark.sh b/presto/scripts/run_benchmark.sh index c0242f5d..d73bb51f 100755 --- a/presto/scripts/run_benchmark.sh +++ b/presto/scripts/run_benchmark.sh @@ -35,6 +35,7 @@ OPTIONS: stored inside a directory under the --output-dir path with a name matching the tag name. Tags must contain only alphanumeric and underscore characters. -p, --profile Enable profiling of benchmark queries. + --profile-script-path Path to a custom profiler functions script. Defaults to ./profiler_functions.sh. --skip-drop-cache Skip dropping system caches before each benchmark query (dropped by default). -m, --metrics Collect detailed metrics from Presto REST API after each query. Metrics are stored in query-specific directories. 
@@ -154,6 +155,15 @@ parse_args() { PROFILE=true shift ;; + --profile-script-path) + if [[ -n $2 ]]; then + PROFILE_SCRIPT_PATH=$2 + shift 2 + else + echo "Error: --profile-script-path requires a value" + exit 1 + fi + ;; --skip-drop-cache) SKIP_DROP_CACHE=true shift @@ -236,7 +246,10 @@ if [[ -n ${TAG} ]]; then fi if [[ "${PROFILE}" == "true" ]]; then - PYTEST_ARGS+=("--profile --profile-script-path $(readlink -f ./profiler_functions.sh)") + if [[ -z "${PROFILE_SCRIPT_PATH}" ]]; then + PROFILE_SCRIPT_PATH="$(readlink -f ./profiler_functions.sh)" + fi + PYTEST_ARGS+=("--profile --profile-script-path ${PROFILE_SCRIPT_PATH}") fi if [[ "${METRICS}" == "true" ]]; then diff --git a/presto/slurm/presto-nvl72/defaults.env b/presto/slurm/presto-nvl72/defaults.env index 970612c5..ff45ba20 100644 --- a/presto/slurm/presto-nvl72/defaults.env +++ b/presto/slurm/presto-nvl72/defaults.env @@ -30,18 +30,3 @@ unset _vt_path # --- SLURM node defaults (cluster-specific) --- : "${DEFAULT_NODELIST:=presto-gb200-gcn-[01-16]}" : "${DEFAULT_SINGLE_NODE:=presto-gb200-gcn-01}" - -# --- I/O settings --- -: "${ENABLE_GDS:=1}" -export ENABLE_GDS - -# --- Query metrics --- -: "${ENABLE_METRICS:=0}" -export ENABLE_METRICS - -# --- Profiling --- -: "${ENABLE_NSYS:=0}" -export ENABLE_NSYS - -: "${NSYS_WORKER_ID:=0}" -export NSYS_WORKER_ID diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index ec405800..474c742b 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -231,19 +231,19 @@ function run_worker { done fi + # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} + # --cpuctxsw=none + # --nvtx-domain-exclude=CCCL local nsys_bin="" local nsys_opts="" - local vt_nsys_report_dir="/var/log/nsys" - if [[ "${ENABLE_NSYS}" == "1" && "${NSYS_WORKER_ID}" == "${worker_id}" ]]; then + if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" - 
nsys_opts="profile \ - -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ - -t cuda,nvtx \ - -f true \ - --sample=none \ - --cpuctxsw=none \ + nsys_opts="launch \ + -t nvtx,cuda,osrt,ucx \ --cuda-memory-usage=true \ - --nvtx-domain-exclude=CCCL" + --cuda-um-cpu-page-faults=true \ + --cuda-um-gpu-page-faults=true \ + --cudabacktrace=true" # nsys_opts="profile \ # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ # -t cuda,ucx,nvtx,osrt \ @@ -280,7 +280,6 @@ ${worker_data}:/var/lib/presto/data,\ ${DATA}:/var/lib/presto/data/hive/data/user_data,\ ${VT_ROOT}/.hive_metastore:/var/lib/presto/data/hive/metastore,\ ${LOGS}:${vt_cufile_log_dir},\ -${LOGS}:${vt_nsys_report_dir},\ /usr/lib/aarch64-linux-gnu/libcuda.so.580.105.08:/usr/local/cuda-13.0/compat/libcuda.so.1,\ /usr/lib/aarch64-linux-gnu/libnvidia-ml.so.580.105.08:/usr/local/lib/libnvidia-ml.so.1\ ${gds_mounts:+,${gds_mounts}} \ @@ -346,6 +345,8 @@ function run_queries { local scale_factor=$2 local metrics_flag="" [[ "${ENABLE_METRICS}" == "1" ]] && metrics_flag="-m" + local profile_flag="" + [[ "${ENABLE_NSYS}" == "1" ]] && profile_flag="-p --profile-script-path $(readlink -f ./profiler_functions.sh)" source "${SCRIPT_DIR}/defaults.env" # We currently skip dropping cache because it requires docker (not available on the cluster). 
@@ -355,7 +356,7 @@ function run_queries { export MINIFORGE_HOME=/workspace/miniforge3; \ export HOME=/workspace; \ cd /workspace/presto/scripts; \ - ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} ${metrics_flag} \ + ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} ${metrics_flag} ${profile_flag} -q 1 \ --hostname ${COORD} --port $PORT -o /workspace/presto/slurm/presto-nvl72/result_dir --skip-drop-cache; \ echo 'Validating query results...'; \ MINIFORGE_HOME=/workspace/miniforge3 /workspace/scripts/run_py_script.sh \ diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 594974ad..80f07bef 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -47,6 +47,10 @@ COORD_IMAGE="presto-coordinator-karth-Mar11" #WORKER_IMAGE="velox-testing-images-presto-471cf1a-velox-1a2f63f-gpu-cuda13.1-20260312-arm64" #COORD_IMAGE="presto-coordinator" OUTPUT_PATH="" +ENABLE_GDS=1 +ENABLE_METRICS=0 +ENABLE_NSYS=0 + while [[ $# -gt 0 ]]; do case "$1" in -n|--nodes) @@ -79,7 +83,7 @@ while [[ $# -gt 0 ]]; do exit 1 fi ;; - -g|--num-gpus-per-node) + -g|--num-gpus-per-node) if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then NUM_GPUS_PER_NODE="$2" shift 2 @@ -89,7 +93,7 @@ while [[ $# -gt 0 ]]; do exit 1 fi ;; - -w|--worker-image) + -w|--worker-image) if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then WORKER_IMAGE="$2" shift 2 @@ -99,7 +103,7 @@ while [[ $# -gt 0 ]]; do exit 1 fi ;; - -c|--coord-image) + -c|--coord-image) if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then COORD_IMAGE="$2" shift 2 @@ -128,6 +132,18 @@ while [[ $# -gt 0 ]]; do exit 1 fi ;; + --disable-gds) + ENABLE_GDS=0 + shift + ;; + -m|--metrics) + ENABLE_METRICS=1 + shift + ;; + -p|--profile) + ENABLE_NSYS=1 + shift + ;; --) shift break @@ -158,8 +174,22 @@ JOB_NAME="presto-tpch-run_n${NODES_COUNT}_sf${SCALE_FACTOR}" # Node 5 has known issues; nodes above 10 are not yet functional. 
NODELIST="${NODELIST:-${DEFAULT_NODELIST}}" GRES_OPT=$([[ "$VARIANT_TYPE" == "gpu" ]] && echo "--gres=gpu:${NUM_GPUS_PER_NODE}" || echo "") + +EXPORT_VARS="ALL" +EXPORT_VARS+=",SCALE_FACTOR=${SCALE_FACTOR}" +EXPORT_VARS+=",NUM_ITERATIONS=${NUM_ITERATIONS}" +EXPORT_VARS+=",SCRIPT_DIR=${SCRIPT_DIR}" +EXPORT_VARS+=",NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE}" +EXPORT_VARS+=",WORKER_IMAGE=${WORKER_IMAGE}" +EXPORT_VARS+=",COORD_IMAGE=${COORD_IMAGE}" +EXPORT_VARS+=",USE_NUMA=${USE_NUMA}" +EXPORT_VARS+=",VARIANT_TYPE=${VARIANT_TYPE}" +EXPORT_VARS+=",ENABLE_GDS=${ENABLE_GDS}" +EXPORT_VARS+=",ENABLE_METRICS=${ENABLE_METRICS}" +EXPORT_VARS+=",ENABLE_NSYS=${ENABLE_NSYS}" + JOB_ID=$(sbatch --job-name="${JOB_NAME}" --nodes="${NODES_COUNT}" --nodelist="${NODELIST}" \ ---export="ALL,SCALE_FACTOR=${SCALE_FACTOR},NUM_ITERATIONS=${NUM_ITERATIONS},SCRIPT_DIR=${SCRIPT_DIR},NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE},WORKER_IMAGE=${WORKER_IMAGE},COORD_IMAGE=${COORD_IMAGE},USE_NUMA=${USE_NUMA},VARIANT_TYPE=${VARIANT_TYPE}" \ +--export="${EXPORT_VARS}" \ --output="${OUT_FMT}" --error="${ERR_FMT}" "${EXTRA_ARGS[@]}" ${GRES_OPT} \ run-presto-benchmarks.slurm | awk '{print $NF}') OUT_FILE="${OUT_FMT//%j/${JOB_ID}}" diff --git a/presto/slurm/presto-nvl72/profiler_functions.sh b/presto/slurm/presto-nvl72/profiler_functions.sh new file mode 100755 index 00000000..65805a82 --- /dev/null +++ b/presto/slurm/presto-nvl72/profiler_functions.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +set -e + +function start_profiler() { + echo "start profiler" +} + +function stop_profiler() { + echo "stop profiler" +} diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index f5b953d1..298276fc 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -38,16 +38,10 @@ wait_until_coordinator_is_running echo "Starting ${NUM_WORKERS} Presto workers across ${NUM_NODES} nodes..." worker_id=0 -nsys_worker_pid="" for node in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do for gpu_id in $(seq 0 $((NUM_GPUS_PER_NODE - 1))); do echo " Starting worker ${worker_id} on node ${node} GPU ${gpu_id}" run_worker "${gpu_id}" "$WORKER_IMAGE" "${node}" "$worker_id" - - if [[ "${ENABLE_NSYS}" == "1" && "${NSYS_WORKER_ID}" == "${worker_id}" ]]; then - nsys_worker_pid=$! - fi - worker_id=$((worker_id + 1)) done done @@ -78,16 +72,6 @@ cp -r ${LOGS}/cli.log ${SCRIPT_DIR}/result_dir/summary.txt echo "Collecting configs and logs into result directory..." collect_results -if [[ -n "${nsys_worker_pid}" ]]; then - echo "Sending SIGINT to nsys (worker srun PID ${nsys_worker_pid})..." - # Send the interrupt signal to the nsys process - # If the process has already terminated, `kill` will have an error, hence `|| true` - kill -INT "${nsys_worker_pid}" 2>/dev/null || true - echo "Waiting for nsys to finalize report..." - # Wait for the nsys process to finalize the report and store to disk - wait "${nsys_worker_pid}" 2>/dev/null || true -fi - echo "========================================" echo "Benchmark complete!" 
echo "Results saved to: ${SCRIPT_DIR}/results_dir" diff --git a/presto/testing/performance_benchmarks/common_fixtures.py b/presto/testing/performance_benchmarks/common_fixtures.py index 8ea3b6db..27c79f36 100644 --- a/presto/testing/performance_benchmarks/common_fixtures.py +++ b/presto/testing/performance_benchmarks/common_fixtures.py @@ -74,6 +74,7 @@ def benchmark_query_function(query_id): if profile: # Base path without .nsys-rep extension: {dir}/{query_id} profile_output_file_path = f"{profile_output_dir_path.absolute()}/{query_id}" + print(f">>> profile_script_path: {profile_script_path}, profile_output_file_path: {profile_output_file_path}") start_profiler(profile_script_path, profile_output_file_path) result = [] for iteration_num in range(iterations): From 04e76900bb0f1c3837bb86ca58dfd6fbf6ee7703 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 10 Apr 2026 19:13:50 +0000 Subject: [PATCH 11/23] Update --- presto/slurm/presto-nvl72/functions.sh | 45 ++++++++++--------- presto/slurm/presto-nvl72/launch-run.sh | 8 ++++ .../slurm/presto-nvl72/profiler_functions.sh | 13 +++++- presto/slurm/presto-nvl72/run_interactive.sh | 4 +- .../performance_benchmarks/common_fixtures.py | 1 - 5 files changed, 44 insertions(+), 27 deletions(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 474c742b..9a62b54e 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -231,26 +231,26 @@ function run_worker { done fi + local nsys_args="" + [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]] && nsys_args="${NSYS_BIN} ${NSYS_OPTS}" # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} # --cpuctxsw=none # --nvtx-domain-exclude=CCCL - local nsys_bin="" - local nsys_opts="" - if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then - nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" - nsys_opts="launch \ - -t nvtx,cuda,osrt,ucx \ - --cuda-memory-usage=true \ - 
--cuda-um-cpu-page-faults=true \ - --cuda-um-gpu-page-faults=true \ - --cudabacktrace=true" - # nsys_opts="profile \ - # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ - # -t cuda,ucx,nvtx,osrt \ - # -f true \ - # --cuda-memory-usage=true \ - # --nvtx-domain-exclude=CCCL" - fi + # if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then + # nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" + # nsys_opts="launch \ + # -t nvtx,cuda,osrt,ucx \ + # --cuda-memory-usage=true \ + # --cuda-um-cpu-page-faults=true \ + # --cuda-um-gpu-page-faults=true \ + # --cudabacktrace=true" + # nsys_opts="profile \ + # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ + # -t cuda,ucx,nvtx,osrt \ + # -f true \ + # --cuda-memory-usage=true \ + # --nvtx-domain-exclude=CCCL" + # fi # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel # capabilities are already set up for the job cgroup. Do NOT use --gres=gpu:1 @@ -297,14 +297,15 @@ echo \"Worker ${worker_id}: ENABLE_NSYS=\${ENABLE_NSYS:-unset}\" echo \"Worker ${worker_id}: KVIKIO_COMPAT_MODE=\${KVIKIO_COMPAT_MODE:-unset}\" echo \"Worker ${worker_id}: CUFILE_LOGFILE_PATH=\${CUFILE_LOGFILE_PATH:-unset}\" -if [[ -n '${nsys_bin}' ]]; then - echo \"Worker ${worker_id}: Nsight System program at ${nsys_bin}\" +if [[ -n '${nsys_args}' ]]; then + echo \"Worker ${worker_id}: Nsight System program at ${NSYS_BIN}\" + ls ${NSYS_BIN} fi if [[ '${USE_NUMA}' == '1' ]]; then - numactl --cpubind=${numa_node} --membind=${numa_node} ${nsys_bin} ${nsys_opts} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc + numactl --cpubind=${numa_node} --membind=${numa_node} ${nsys_args} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc else - ${nsys_bin} ${nsys_opts} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc + ${nsys_args} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc fi" > ${LOGS}/worker_${worker_id}.log 2>&1 & } @@ -346,7 +347,7 @@ function run_queries { local metrics_flag="" [[ 
"${ENABLE_METRICS}" == "1" ]] && metrics_flag="-m" local profile_flag="" - [[ "${ENABLE_NSYS}" == "1" ]] && profile_flag="-p --profile-script-path $(readlink -f ./profiler_functions.sh)" + [[ "${ENABLE_NSYS}" == "1" ]] && profile_flag="-p --profile-script-path /workspace/presto/slurm/presto-nvl72/profiler_functions.sh" source "${SCRIPT_DIR}/defaults.env" # We currently skip dropping cache because it requires docker (not available on the cluster). diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 80f07bef..54f5c158 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -175,6 +175,14 @@ JOB_NAME="presto-tpch-run_n${NODES_COUNT}_sf${SCALE_FACTOR}" NODELIST="${NODELIST:-${DEFAULT_NODELIST}}" GRES_OPT=$([[ "$VARIANT_TYPE" == "gpu" ]] && echo "--gres=gpu:${NUM_GPUS_PER_NODE}" || echo "") +export NSYS_BIN="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" +export NSYS_OPTS="launch \ +-t nvtx,cuda,osrt,ucx \ +--cuda-memory-usage=true \ +--cuda-um-cpu-page-faults=true \ +--cuda-um-gpu-page-faults=true \ +--cudabacktrace=true" + EXPORT_VARS="ALL" EXPORT_VARS+=",SCALE_FACTOR=${SCALE_FACTOR}" EXPORT_VARS+=",NUM_ITERATIONS=${NUM_ITERATIONS}" diff --git a/presto/slurm/presto-nvl72/profiler_functions.sh b/presto/slurm/presto-nvl72/profiler_functions.sh index 65805a82..83491e9a 100755 --- a/presto/slurm/presto-nvl72/profiler_functions.sh +++ b/presto/slurm/presto-nvl72/profiler_functions.sh @@ -5,9 +5,18 @@ set -e function start_profiler() { - echo "start profiler" + local -r profile_output_file_path=$1 + ${NSYS_BIN} start --gpu-metrics-devices=all -o ${profile_output_file_path}.nsys-rep } function stop_profiler() { - echo "stop profiler" + local -r profile_output_file_path=$1.nsys-rep +# local -r container_file_path="/presto_profiles/$(basename $profile_output_file_path)" + ${NSYS_BIN} stop +# chown -R $(id -u):$(id -g) /presto_profiles + +# local container_id +# 
container_id=$(get_worker_container_id) +# docker cp ${container_id}:${container_file_path} $profile_output_file_path +# $docker_exec_command rm ${container_file_path} } diff --git a/presto/slurm/presto-nvl72/run_interactive.sh b/presto/slurm/presto-nvl72/run_interactive.sh index 70301ed0..fdf9445c 100755 --- a/presto/slurm/presto-nvl72/run_interactive.sh +++ b/presto/slurm/presto-nvl72/run_interactive.sh @@ -5,7 +5,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/defaults.env" -: "${IMAGE:=${IMAGE_DIR}/presto-native-worker-gpu.sqsh}" +: "${IMAGE:=${IMAGE_DIR}/presto-native-worker-gpu-karth-Mar11-with-nsys.sqsh}" : "${NODELIST:=${DEFAULT_SINGLE_NODE}}" : "${GRES:=gpu:4}" : "${TIME_LIMIT:=01:00:00}" @@ -17,7 +17,7 @@ srun --nodes=1 \ --exclusive \ --time="${TIME_LIMIT}" \ --container-image="${IMAGE}" \ - --container-mounts="${HOME}:${HOME},/scratch:/scratch" \ + --container-mounts="/scratch:/scratch" \ --container-remap-root \ --container-writable \ --pty bash diff --git a/presto/testing/performance_benchmarks/common_fixtures.py b/presto/testing/performance_benchmarks/common_fixtures.py index 27c79f36..8ea3b6db 100644 --- a/presto/testing/performance_benchmarks/common_fixtures.py +++ b/presto/testing/performance_benchmarks/common_fixtures.py @@ -74,7 +74,6 @@ def benchmark_query_function(query_id): if profile: # Base path without .nsys-rep extension: {dir}/{query_id} profile_output_file_path = f"{profile_output_dir_path.absolute()}/{query_id}" - print(f">>> profile_script_path: {profile_script_path}, profile_output_file_path: {profile_output_file_path}") start_profiler(profile_script_path, profile_output_file_path) result = [] for iteration_num in range(iterations): From 450c6e609c0540197d4eda77cf1c04ed0f0e7713 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 10 Apr 2026 21:36:10 +0000 Subject: [PATCH 12/23] Revert changes to nsys --- presto/scripts/run_benchmark.sh | 15 +------ presto/slurm/presto-nvl72/functions.sh | 44 
+++++++++---------- presto/slurm/presto-nvl72/launch-run.sh | 11 +---- .../slurm/presto-nvl72/profiler_functions.sh | 22 ---------- .../presto-nvl72/run-presto-benchmarks.sh | 17 +++++++ 5 files changed, 40 insertions(+), 69 deletions(-) delete mode 100755 presto/slurm/presto-nvl72/profiler_functions.sh diff --git a/presto/scripts/run_benchmark.sh b/presto/scripts/run_benchmark.sh index d73bb51f..c0242f5d 100755 --- a/presto/scripts/run_benchmark.sh +++ b/presto/scripts/run_benchmark.sh @@ -35,7 +35,6 @@ OPTIONS: stored inside a directory under the --output-dir path with a name matching the tag name. Tags must contain only alphanumeric and underscore characters. -p, --profile Enable profiling of benchmark queries. - --profile-script-path Path to a custom profiler functions script. Defaults to ./profiler_functions.sh. --skip-drop-cache Skip dropping system caches before each benchmark query (dropped by default). -m, --metrics Collect detailed metrics from Presto REST API after each query. Metrics are stored in query-specific directories. 
@@ -155,15 +154,6 @@ parse_args() { PROFILE=true shift ;; - --profile-script-path) - if [[ -n $2 ]]; then - PROFILE_SCRIPT_PATH=$2 - shift 2 - else - echo "Error: --profile-script-path requires a value" - exit 1 - fi - ;; --skip-drop-cache) SKIP_DROP_CACHE=true shift @@ -246,10 +236,7 @@ if [[ -n ${TAG} ]]; then fi if [[ "${PROFILE}" == "true" ]]; then - if [[ -z "${PROFILE_SCRIPT_PATH}" ]]; then - PROFILE_SCRIPT_PATH="$(readlink -f ./profiler_functions.sh)" - fi - PYTEST_ARGS+=("--profile --profile-script-path ${PROFILE_SCRIPT_PATH}") + PYTEST_ARGS+=("--profile --profile-script-path $(readlink -f ./profiler_functions.sh)") fi if [[ "${METRICS}" == "true" ]]; then diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 9a62b54e..6488e0f4 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -231,26 +231,24 @@ function run_worker { done fi + local nsys_bin="" + local nsys_opts="" local nsys_args="" - [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]] && nsys_args="${NSYS_BIN} ${NSYS_OPTS}" - # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} - # --cpuctxsw=none - # --nvtx-domain-exclude=CCCL - # if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then - # nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" - # nsys_opts="launch \ - # -t nvtx,cuda,osrt,ucx \ - # --cuda-memory-usage=true \ - # --cuda-um-cpu-page-faults=true \ - # --cuda-um-gpu-page-faults=true \ - # --cudabacktrace=true" - # nsys_opts="profile \ - # -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ - # -t cuda,ucx,nvtx,osrt \ - # -f true \ - # --cuda-memory-usage=true \ - # --nvtx-domain-exclude=CCCL" - # fi + local vt_nsys_report_dir="/var/log/nsys" + if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then + nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" + nsys_opts="profile \ + -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ + -t nvtx,cuda,osrt,ucx \ + -f true \ + 
--sample=none \ + --cpuctxsw=none \ + --cuda-memory-usage=true \ + --cuda-um-cpu-page-faults=true \ + --cuda-um-gpu-page-faults=true \ + --nvtx-domain-exclude=CCCL" + nsys_args="${nsys_bin} ${nsys_opts}" + fi # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel # capabilities are already set up for the job cgroup. Do NOT use --gres=gpu:1 @@ -280,6 +278,7 @@ ${worker_data}:/var/lib/presto/data,\ ${DATA}:/var/lib/presto/data/hive/data/user_data,\ ${VT_ROOT}/.hive_metastore:/var/lib/presto/data/hive/metastore,\ ${LOGS}:${vt_cufile_log_dir},\ +${LOGS}:${vt_nsys_report_dir},\ /usr/lib/aarch64-linux-gnu/libcuda.so.580.105.08:/usr/local/cuda-13.0/compat/libcuda.so.1,\ /usr/lib/aarch64-linux-gnu/libnvidia-ml.so.580.105.08:/usr/local/lib/libnvidia-ml.so.1\ ${gds_mounts:+,${gds_mounts}} \ @@ -298,8 +297,7 @@ echo \"Worker ${worker_id}: KVIKIO_COMPAT_MODE=\${KVIKIO_COMPAT_MODE:-unset}\" echo \"Worker ${worker_id}: CUFILE_LOGFILE_PATH=\${CUFILE_LOGFILE_PATH:-unset}\" if [[ -n '${nsys_args}' ]]; then - echo \"Worker ${worker_id}: Nsight System program at ${NSYS_BIN}\" - ls ${NSYS_BIN} + echo \"Worker ${worker_id}: Nsight System program at ${nsys_bin}\" fi if [[ '${USE_NUMA}' == '1' ]]; then @@ -346,8 +344,6 @@ function run_queries { local scale_factor=$2 local metrics_flag="" [[ "${ENABLE_METRICS}" == "1" ]] && metrics_flag="-m" - local profile_flag="" - [[ "${ENABLE_NSYS}" == "1" ]] && profile_flag="-p --profile-script-path /workspace/presto/slurm/presto-nvl72/profiler_functions.sh" source "${SCRIPT_DIR}/defaults.env" # We currently skip dropping cache because it requires docker (not available on the cluster). 
@@ -357,7 +353,7 @@ function run_queries { export MINIFORGE_HOME=/workspace/miniforge3; \ export HOME=/workspace; \ cd /workspace/presto/scripts; \ - ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} ${metrics_flag} ${profile_flag} -q 1 \ + ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} ${metrics_flag} -q 1 \ --hostname ${COORD} --port $PORT -o /workspace/presto/slurm/presto-nvl72/result_dir --skip-drop-cache; \ echo 'Validating query results...'; \ MINIFORGE_HOME=/workspace/miniforge3 /workspace/scripts/run_py_script.sh \ diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 54f5c158..1e74fbd6 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -41,7 +41,8 @@ USE_NUMA="1" VARIANT_TYPE="gpu" # WORKER_IMAGE="presto-native-worker-gpu" WORKER_IMAGE="presto-native-worker-gpu-karth-Mar11-with-nsys" -COORD_IMAGE="presto-coordinator-karth-Mar11" +# COORD_IMAGE="presto-coordinator-karth-Mar11" +COORD_IMAGE="presto-coordinator-karth-Mar11-with-nsys" #COORD_IMAGE="presto-coordinator-ibm-03-11" #WORKER_IMAGE="presto-native-worker-gpu-ibm-03-11" #WORKER_IMAGE="velox-testing-images-presto-471cf1a-velox-1a2f63f-gpu-cuda13.1-20260312-arm64" @@ -175,14 +176,6 @@ JOB_NAME="presto-tpch-run_n${NODES_COUNT}_sf${SCALE_FACTOR}" NODELIST="${NODELIST:-${DEFAULT_NODELIST}}" GRES_OPT=$([[ "$VARIANT_TYPE" == "gpu" ]] && echo "--gres=gpu:${NUM_GPUS_PER_NODE}" || echo "") -export NSYS_BIN="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" -export NSYS_OPTS="launch \ --t nvtx,cuda,osrt,ucx \ ---cuda-memory-usage=true \ ---cuda-um-cpu-page-faults=true \ ---cuda-um-gpu-page-faults=true \ ---cudabacktrace=true" - EXPORT_VARS="ALL" EXPORT_VARS+=",SCALE_FACTOR=${SCALE_FACTOR}" EXPORT_VARS+=",NUM_ITERATIONS=${NUM_ITERATIONS}" diff --git a/presto/slurm/presto-nvl72/profiler_functions.sh b/presto/slurm/presto-nvl72/profiler_functions.sh deleted file mode 
100755 index 83491e9a..00000000 --- a/presto/slurm/presto-nvl72/profiler_functions.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -set -e - -function start_profiler() { - local -r profile_output_file_path=$1 - ${NSYS_BIN} start --gpu-metrics-devices=all -o ${profile_output_file_path}.nsys-rep -} - -function stop_profiler() { - local -r profile_output_file_path=$1.nsys-rep -# local -r container_file_path="/presto_profiles/$(basename $profile_output_file_path)" - ${NSYS_BIN} stop -# chown -R $(id -u):$(id -g) /presto_profiles - -# local container_id -# container_id=$(get_worker_container_id) -# docker cp ${container_id}:${container_file_path} $profile_output_file_path -# $docker_exec_command rm ${container_file_path} -} diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index 298276fc..6ec84848 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -38,10 +38,17 @@ wait_until_coordinator_is_running echo "Starting ${NUM_WORKERS} Presto workers across ${NUM_NODES} nodes..." worker_id=0 +nsys_worker_pid="" for node in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do for gpu_id in $(seq 0 $((NUM_GPUS_PER_NODE - 1))); do echo " Starting worker ${worker_id} on node ${node} GPU ${gpu_id}" run_worker "${gpu_id}" "$WORKER_IMAGE" "${node}" "$worker_id" + + if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then + nsys_worker_pid=$! + echo "profiled worker PID ${nsys_worker_pid}" + fi + worker_id=$((worker_id + 1)) done done @@ -72,6 +79,16 @@ cp -r ${LOGS}/cli.log ${SCRIPT_DIR}/result_dir/summary.txt echo "Collecting configs and logs into result directory..." collect_results +if [[ -n "${nsys_worker_pid}" ]]; then + echo "Sending SIGINT to profiled worker PID ${nsys_worker_pid}..." 
+ # Send the interrupt signal to the nsys process
+ # If the process has already terminated, `kill` will have an error, hence `|| true`
+ kill -TERM "${nsys_worker_pid}" 2>/dev/null || true
+ echo "Waiting for nsys to finalize report..."
+ # Wait for the nsys process to finalize the report and store to disk
+ wait "${nsys_worker_pid}" 2>/dev/null || true
+fi
+
 echo "========================================"
 echo "Benchmark complete!"
 echo "Results saved to: ${SCRIPT_DIR}/results_dir"

From b9201135a4ebb1b9292488ea947cea000b49b5dd Mon Sep 17 00:00:00 2001
From: Tianyu Liu
Date: Fri, 10 Apr 2026 21:37:51 +0000
Subject: [PATCH 13/23] Revert changes to run interactive

---
 presto/slurm/presto-nvl72/run_interactive.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/presto/slurm/presto-nvl72/run_interactive.sh b/presto/slurm/presto-nvl72/run_interactive.sh
index fdf9445c..70301ed0 100755
--- a/presto/slurm/presto-nvl72/run_interactive.sh
+++ b/presto/slurm/presto-nvl72/run_interactive.sh
@@ -5,7 +5,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 source "${SCRIPT_DIR}/defaults.env"

-: "${IMAGE:=${IMAGE_DIR}/presto-native-worker-gpu-karth-Mar11-with-nsys.sqsh}"
+: "${IMAGE:=${IMAGE_DIR}/presto-native-worker-gpu.sqsh}"
 : "${NODELIST:=${DEFAULT_SINGLE_NODE}}"
 : "${GRES:=gpu:4}"
 : "${TIME_LIMIT:=01:00:00}"
@@ -17,7 +17,7 @@ srun --nodes=1 \
     --exclusive \
     --time="${TIME_LIMIT}" \
     --container-image="${IMAGE}" \
-    --container-mounts="/scratch:/scratch" \
+    --container-mounts="${HOME}:${HOME},/scratch:/scratch" \
     --container-remap-root \
     --container-writable \
     --pty bash

From a04a8670e6e3218b6035a3c0c291fcf2fcae04d7 Mon Sep 17 00:00:00 2001
From: Tianyu Liu
Date: Sat, 11 Apr 2026 04:08:37 +0000
Subject: [PATCH 14/23] Update

---
 presto/slurm/presto-nvl72/functions.sh        | 53 +++++++++++++------
 .../presto-nvl72/run-presto-benchmarks.sh     | 37 +++++++------
 2 files changed, 57 insertions(+), 33 deletions(-)

diff --git 
a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 6488e0f4..542ea034 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -231,23 +231,21 @@ function run_worker { done fi + # --nvtx-domain-exclude=CCCL + # --cpuctxsw=none + # --sample=none local nsys_bin="" - local nsys_opts="" - local nsys_args="" + local nsys_launch_opts="" + local nsys_start_opts="" local vt_nsys_report_dir="/var/log/nsys" if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" - nsys_opts="profile \ - -o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ - -t nvtx,cuda,osrt,ucx \ - -f true \ - --sample=none \ - --cpuctxsw=none \ + nsys_launch_opts="-t nvtx,cuda,osrt,ucx \ --cuda-memory-usage=true \ --cuda-um-cpu-page-faults=true \ - --cuda-um-gpu-page-faults=true \ - --nvtx-domain-exclude=CCCL" - nsys_args="${nsys_bin} ${nsys_opts}" + --cuda-um-gpu-page-faults=true" + nsys_start_opts="-o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ + -f true" fi # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel @@ -296,15 +294,36 @@ echo \"Worker ${worker_id}: ENABLE_NSYS=\${ENABLE_NSYS:-unset}\" echo \"Worker ${worker_id}: KVIKIO_COMPAT_MODE=\${KVIKIO_COMPAT_MODE:-unset}\" echo \"Worker ${worker_id}: CUFILE_LOGFILE_PATH=\${CUFILE_LOGFILE_PATH:-unset}\" -if [[ -n '${nsys_args}' ]]; then +if [[ -n '${nsys_bin}' ]]; then + ( + echo \"Worker ${worker_id}: nsys subshell started, waiting for start token\" + while [[ ! -f ${vt_nsys_report_dir}/.nsys_start_token ]]; do + read -t 2 -r _ <<< '' || true + done + echo \"Worker ${worker_id}: start token found, running nsys start\" + ${nsys_bin} start ${nsys_start_opts} + echo \"Worker ${worker_id}: nsys start exit code: \$?\" + while [[ ! 
-f ${vt_nsys_report_dir}/.nsys_stop_token ]]; do + read -t 2 -r _ <<< '' || true + done + echo \"Worker ${worker_id}: stop token found, running nsys stop\" + ${nsys_bin} stop + echo \"Worker ${worker_id}: nsys stop exit code: \$?\" + ) & + echo \"Worker ${worker_id}: Nsight System program at ${nsys_bin}\" + echo \"Worker ${worker_id}: running nsys launch\" + ${nsys_bin} launch ${nsys_launch_opts} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc + echo \"Worker ${worker_id}: nsys launch exited with code: \$?\" +else + if [[ '${USE_NUMA}' == '1' ]]; then + numactl --cpubind=${numa_node} --membind=${numa_node} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc + else + /usr/bin/presto_server --etc-dir=/opt/presto-server/etc + fi fi -if [[ '${USE_NUMA}' == '1' ]]; then - numactl --cpubind=${numa_node} --membind=${numa_node} ${nsys_args} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc -else - ${nsys_args} /usr/bin/presto_server --etc-dir=/opt/presto-server/etc -fi" > ${LOGS}/worker_${worker_id}.log 2>&1 & +" > ${LOGS}/worker_${worker_id}.log 2>&1 & } function copy_hive_metastore { diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index 6ec84848..5253ffdc 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -38,17 +38,10 @@ wait_until_coordinator_is_running echo "Starting ${NUM_WORKERS} Presto workers across ${NUM_NODES} nodes..." worker_id=0 -nsys_worker_pid="" for node in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do for gpu_id in $(seq 0 $((NUM_GPUS_PER_NODE - 1))); do echo " Starting worker ${worker_id} on node ${node} GPU ${gpu_id}" run_worker "${gpu_id}" "$WORKER_IMAGE" "${node}" "$worker_id" - - if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then - nsys_worker_pid=$! 
- echo "profiled worker PID ${nsys_worker_pid}" - fi - worker_id=$((worker_id + 1)) done done @@ -67,7 +60,10 @@ wait_for_workers_to_register $NUM_WORKERS # Run Queries # ============================================================================== echo "Running TPC-H queries (${NUM_ITERATIONS} iterations, scale factor ${SCALE_FACTOR})..." + +touch "${LOGS}/.nsys_start_token" run_queries ${NUM_ITERATIONS} ${SCALE_FACTOR} +touch "${LOGS}/.nsys_stop_token" # ============================================================================== # Process Results @@ -79,15 +75,24 @@ cp -r ${LOGS}/cli.log ${SCRIPT_DIR}/result_dir/summary.txt echo "Collecting configs and logs into result directory..." collect_results -if [[ -n "${nsys_worker_pid}" ]]; then - echo "Sending SIGINT to profiled worker PID ${nsys_worker_pid}..." - # Send the interrupt signal to the nsys process - # If the process has already terminated, `kill` will have an error, hence `|| true` - kill -TERM "${nsys_worker_pid}" 2>/dev/null || true - echo "Waiting for nsys to finalize report..." - # Wait for the nsys process to finalize the report and store to disk - wait "${nsys_worker_pid}" 2>/dev/null || true -fi +# rm "${LOGS}/.nsys_start_token" "${LOGS}/.nsys_stop_token" +echo "Waiting for nsys report generation..." +prev_size=0 +stable_count=0 +for i in {1..120}; do + cur_size=$(stat -c%s "${LOGS}/nsys_worker_0.nsys-rep" 2>/dev/null || echo 0) + if (( cur_size > 0 && cur_size == prev_size )); then + stable_count=$((stable_count + 1)) + if (( stable_count >= 3 )); then + echo "nsys report complete: ${cur_size} bytes" + break + fi + else + stable_count=0 + fi + prev_size=$cur_size + sleep 5 +done echo "========================================" echo "Benchmark complete!" From 2b87c9465a96e00eb5f01b67b95288f40a07e756 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Sun, 12 Apr 2026 19:18:56 +0000 Subject: [PATCH 15/23] Add initial profile. 
Add query selection --- presto/scripts/run_benchmark.sh | 16 +++++- presto/slurm/presto-nvl72/functions.sh | 46 ++++++++++------- presto/slurm/presto-nvl72/launch-run.sh | 15 +++++- .../slurm/presto-nvl72/profiler_functions.sh | 17 +++++++ .../presto-nvl72/run-presto-benchmarks.sh | 50 +++++++++++-------- 5 files changed, 104 insertions(+), 40 deletions(-) create mode 100755 presto/slurm/presto-nvl72/profiler_functions.sh diff --git a/presto/scripts/run_benchmark.sh b/presto/scripts/run_benchmark.sh index c0242f5d..bf54979a 100755 --- a/presto/scripts/run_benchmark.sh +++ b/presto/scripts/run_benchmark.sh @@ -35,6 +35,7 @@ OPTIONS: stored inside a directory under the --output-dir path with a name matching the tag name. Tags must contain only alphanumeric and underscore characters. -p, --profile Enable profiling of benchmark queries. + --profile-script-path Path to a custom profiler functions script. Defaults to ./profiler_functions.sh. --skip-drop-cache Skip dropping system caches before each benchmark query (dropped by default). -m, --metrics Collect detailed metrics from Presto REST API after each query. Metrics are stored in query-specific directories. 
@@ -154,6 +155,15 @@ parse_args() { PROFILE=true shift ;; + --profile-script-path) + if [[ -n $2 ]]; then + PROFILE_SCRIPT_PATH=$2 + shift 2 + else + echo "Error: --profile-script-path requires a value" + exit 1 + fi + ;; --skip-drop-cache) SKIP_DROP_CACHE=true shift @@ -236,7 +246,11 @@ if [[ -n ${TAG} ]]; then fi if [[ "${PROFILE}" == "true" ]]; then - PYTEST_ARGS+=("--profile --profile-script-path $(readlink -f ./profiler_functions.sh)") + if [[ -z "${PROFILE_SCRIPT_PATH}" ]]; then + PROFILE_SCRIPT_PATH="$(readlink -f ./profiler_functions.sh)" + fi + PYTEST_ARGS+=("--profile --profile-script-path ${PROFILE_SCRIPT_PATH}") + fi if [[ "${METRICS}" == "true" ]]; then diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 542ea034..ffcbc40d 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -236,7 +236,6 @@ function run_worker { # --sample=none local nsys_bin="" local nsys_launch_opts="" - local nsys_start_opts="" local vt_nsys_report_dir="/var/log/nsys" if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" @@ -244,8 +243,6 @@ function run_worker { --cuda-memory-usage=true \ --cuda-um-cpu-page-faults=true \ --cuda-um-gpu-page-faults=true" - nsys_start_opts="-o ${vt_nsys_report_dir}/nsys_worker_${worker_id} \ - -f true" fi # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel @@ -296,19 +293,30 @@ echo \"Worker ${worker_id}: CUFILE_LOGFILE_PATH=\${CUFILE_LOGFILE_PATH:-unset}\" if [[ -n '${nsys_bin}' ]]; then ( - echo \"Worker ${worker_id}: nsys subshell started, waiting for start token\" - while [[ ! 
-f ${vt_nsys_report_dir}/.nsys_start_token ]]; do - read -t 2 -r _ <<< '' || true + echo \"Worker ${worker_id}: nsys subshell started\" + while true; do + # Wait for any start token + start_token='' + while [[ -z \"\${start_token}\" ]]; do + for f in ${vt_nsys_report_dir}/.nsys_start_token_Q*; do + [[ -f \"\$f\" ]] && start_token=\"\$f\" && break + done + [[ -z \"\${start_token}\" ]] && { read -t 2 -r _ <<< '' || true; } + done + query_id=\${start_token##*_token_} + echo \"Worker ${worker_id}: start token found for \${query_id}, running nsys start\" + rm \"\${start_token}\" + ${nsys_bin} start -o ${vt_nsys_report_dir}/nsys_worker_${worker_id}_\${query_id} -f true + echo \"Worker ${worker_id}: nsys start exit code: \$?\" + + # Wait for corresponding stop token + while [[ ! -f ${vt_nsys_report_dir}/.nsys_stop_token_\${query_id} ]]; do + read -t 2 -r _ <<< '' || true + done + echo \"Worker ${worker_id}: stop token found for \${query_id}, running nsys stop\" + rm ${vt_nsys_report_dir}/.nsys_stop_token_\${query_id} + ${nsys_bin} stop; echo \"Worker ${worker_id}: nsys stop exit code: \$?\" done - echo \"Worker ${worker_id}: start token found, running nsys start\" - ${nsys_bin} start ${nsys_start_opts} - echo \"Worker ${worker_id}: nsys start exit code: \$?\" - while [[ ! 
-f ${vt_nsys_report_dir}/.nsys_stop_token ]]; do - read -t 2 -r _ <<< '' || true - done - echo \"Worker ${worker_id}: stop token found, running nsys stop\" - ${nsys_bin} stop - echo \"Worker ${worker_id}: nsys stop exit code: \$?\" ) & echo \"Worker ${worker_id}: Nsight System program at ${nsys_bin}\" @@ -361,8 +369,10 @@ function run_queries { [ $# -ne 2 ] && echo_error "$0 expected two arguments for '' and ''" local num_iterations=$1 local scale_factor=$2 - local metrics_flag="" - [[ "${ENABLE_METRICS}" == "1" ]] && metrics_flag="-m" + local extra_args=() + [[ "${ENABLE_METRICS}" == "1" ]] && extra_args+=("-m") + [[ "${ENABLE_NSYS}" == "1" ]] && extra_args+=("-p" "--profile-script-path" "/workspace/presto/slurm/presto-nvl72/profiler_functions.sh") + [[ -n "${QUERIES:-}" ]] && extra_args+=("-q" "${QUERIES}") source "${SCRIPT_DIR}/defaults.env" # We currently skip dropping cache because it requires docker (not available on the cluster). @@ -372,7 +382,7 @@ function run_queries { export MINIFORGE_HOME=/workspace/miniforge3; \ export HOME=/workspace; \ cd /workspace/presto/scripts; \ - ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} ${metrics_flag} -q 1 \ + ./run_benchmark.sh -b tpch -s tpchsf${scale_factor} -i ${num_iterations} ${extra_args[*]} \ --hostname ${COORD} --port $PORT -o /workspace/presto/slurm/presto-nvl72/result_dir --skip-drop-cache; \ echo 'Validating query results...'; \ MINIFORGE_HOME=/workspace/miniforge3 /workspace/scripts/run_py_script.sh \ diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 1e74fbd6..49fffc63 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -51,6 +51,7 @@ OUTPUT_PATH="" ENABLE_GDS=1 ENABLE_METRICS=0 ENABLE_NSYS=0 +QUERIES="" while [[ $# -gt 0 ]]; do case "$1" in @@ -145,7 +146,16 @@ while [[ $# -gt 0 ]]; do ENABLE_NSYS=1 shift ;; - --) + -q|--queries) + if [[ -n $2 ]]; then + QUERIES=$2 + shift 2 + else 
+ echo "Error: --queries requires a value" + exit 1 + fi + ;; + --) shift break ;; @@ -188,6 +198,9 @@ EXPORT_VARS+=",VARIANT_TYPE=${VARIANT_TYPE}" EXPORT_VARS+=",ENABLE_GDS=${ENABLE_GDS}" EXPORT_VARS+=",ENABLE_METRICS=${ENABLE_METRICS}" EXPORT_VARS+=",ENABLE_NSYS=${ENABLE_NSYS}" +if [[ -n "${QUERIES}" ]]; then + EXPORT_VARS+=",QUERIES='${QUERIES}'" +fi JOB_ID=$(sbatch --job-name="${JOB_NAME}" --nodes="${NODES_COUNT}" --nodelist="${NODELIST}" \ --export="${EXPORT_VARS}" \ diff --git a/presto/slurm/presto-nvl72/profiler_functions.sh b/presto/slurm/presto-nvl72/profiler_functions.sh new file mode 100755 index 00000000..0f3eeffd --- /dev/null +++ b/presto/slurm/presto-nvl72/profiler_functions.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -e + +function start_profiler() { + local -r profile_output_file_path=$1 + local -r query_id=$(basename ${profile_output_file_path}) + touch "/workspace/presto/slurm/presto-nvl72/logs/.nsys_start_token_${query_id}" +} + +function stop_profiler() { + local -r profile_output_file_path=$1 + local -r query_id=$(basename ${profile_output_file_path}) + touch "/workspace/presto/slurm/presto-nvl72/logs/.nsys_stop_token_${query_id}" +} diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index 5253ffdc..a254922d 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -60,10 +60,7 @@ wait_for_workers_to_register $NUM_WORKERS # Run Queries # ============================================================================== echo "Running TPC-H queries (${NUM_ITERATIONS} iterations, scale factor ${SCALE_FACTOR})..." 
- -touch "${LOGS}/.nsys_start_token" run_queries ${NUM_ITERATIONS} ${SCALE_FACTOR} -touch "${LOGS}/.nsys_stop_token" # ============================================================================== # Process Results @@ -75,24 +72,37 @@ cp -r ${LOGS}/cli.log ${SCRIPT_DIR}/result_dir/summary.txt echo "Collecting configs and logs into result directory..." collect_results -# rm "${LOGS}/.nsys_start_token" "${LOGS}/.nsys_stop_token" -echo "Waiting for nsys report generation..." -prev_size=0 -stable_count=0 -for i in {1..120}; do - cur_size=$(stat -c%s "${LOGS}/nsys_worker_0.nsys-rep" 2>/dev/null || echo 0) - if (( cur_size > 0 && cur_size == prev_size )); then - stable_count=$((stable_count + 1)) - if (( stable_count >= 3 )); then - echo "nsys report complete: ${cur_size} bytes" - break +echo "--> QUERIES: ${QUERIES:-UNDEFINED}" + +if [[ "${ENABLE_NSYS}" == "1" ]]; then + echo "Waiting for nsys report generation..." + stable_count=0 + declare -A prev_sizes + for i in {1..120}; do + all_stable=true + found_any=false + for f in "${LOGS}"/nsys_worker_*.nsys-rep; do + [[ -f "$f" ]] || continue + found_any=true + cur_size=$(stat -c%s "$f" 2>/dev/null || echo 0) + prev=${prev_sizes["$f"]:-0} + if (( cur_size == 0 || cur_size != prev )); then + all_stable=false + fi + prev_sizes["$f"]=$cur_size + done + if $all_stable && $found_any; then + stable_count=$((stable_count + 1)) + if (( stable_count >= 3 )); then + echo "All ${#prev_sizes[@]} nsys reports stable." + break + fi + else + stable_count=0 fi - else - stable_count=0 - fi - prev_size=$cur_size - sleep 5 -done + sleep 5 + done +fi echo "========================================" echo "Benchmark complete!" 
From 49ee7674f184d3616153a497ddbf82b284115c58 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 13 Apr 2026 14:03:22 +0000 Subject: [PATCH 16/23] Finally make nsys work with file-based sync hack --- presto/slurm/presto-nvl72/functions.sh | 37 ++++++++++--------- presto/slurm/presto-nvl72/launch-run.sh | 5 ++- .../slurm/presto-nvl72/profiler_functions.sh | 10 ++++- .../presto-nvl72/run-presto-benchmarks.sh | 36 +++++++++++++----- 4 files changed, 57 insertions(+), 31 deletions(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index ffcbc40d..ec4373b4 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -294,29 +294,30 @@ echo \"Worker ${worker_id}: CUFILE_LOGFILE_PATH=\${CUFILE_LOGFILE_PATH:-unset}\" if [[ -n '${nsys_bin}' ]]; then ( echo \"Worker ${worker_id}: nsys subshell started\" - while true; do - # Wait for any start token - start_token='' - while [[ -z \"\${start_token}\" ]]; do - for f in ${vt_nsys_report_dir}/.nsys_start_token_Q*; do - [[ -f \"\$f\" ]] && start_token=\"\$f\" && break - done - [[ -z \"\${start_token}\" ]] && { read -t 2 -r _ <<< '' || true; } + if [[ -n '${QUERIES:-}' ]]; then + IFS=',' read -ra qlist <<< '${QUERIES}' + else + qlist=({1..22}) + fi + for qnum in \"\${qlist[@]}\"; do + qid=\"Q\${qnum}\" + while [[ ! -f ${vt_nsys_report_dir}/.nsys_start_token_\${qid} ]]; do + read -t 2 -r _ <<< '' || true done - query_id=\${start_token##*_token_} - echo \"Worker ${worker_id}: start token found for \${query_id}, running nsys start\" - rm \"\${start_token}\" - ${nsys_bin} start -o ${vt_nsys_report_dir}/nsys_worker_${worker_id}_\${query_id} -f true - echo \"Worker ${worker_id}: nsys start exit code: \$?\" - - # Wait for corresponding stop token - while [[ ! 
-f ${vt_nsys_report_dir}/.nsys_stop_token_\${query_id} ]]; do + echo \"Worker ${worker_id}: start token found for \${qid}\" + rm ${vt_nsys_report_dir}/.nsys_start_token_\${qid} + ${nsys_bin} start -o ${vt_nsys_report_dir}/nsys_worker_${worker_id}_\${qid} -f true; echo \"Worker ${worker_id}: nsys start exit code: \$?\" + echo \"Worker ${worker_id}: post-start token created for \${qid}\" + touch ${vt_nsys_report_dir}/.nsys_started_token_\${qid} + + while [[ ! -f ${vt_nsys_report_dir}/.nsys_stop_token_\${qid} ]]; do read -t 2 -r _ <<< '' || true done - echo \"Worker ${worker_id}: stop token found for \${query_id}, running nsys stop\" - rm ${vt_nsys_report_dir}/.nsys_stop_token_\${query_id} + echo \"Worker ${worker_id}: stop token found for \${qid}\" + rm ${vt_nsys_report_dir}/.nsys_stop_token_\${qid} ${nsys_bin} stop; echo \"Worker ${worker_id}: nsys stop exit code: \$?\" done + echo \"Worker ${worker_id}: nsys subshell done, all queries profiled\" ) & echo \"Worker ${worker_id}: Nsight System program at ${nsys_bin}\" diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 49fffc63..de7789fa 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -199,7 +199,10 @@ EXPORT_VARS+=",ENABLE_GDS=${ENABLE_GDS}" EXPORT_VARS+=",ENABLE_METRICS=${ENABLE_METRICS}" EXPORT_VARS+=",ENABLE_NSYS=${ENABLE_NSYS}" if [[ -n "${QUERIES}" ]]; then - EXPORT_VARS+=",QUERIES='${QUERIES}'" + # Do not append to EXPORT_VARS since comma seprator is ambiguous. + # Single quote causes further issue down the line. + # So using env var directly is the simplest correct approach. 
+ export QUERIES fi JOB_ID=$(sbatch --job-name="${JOB_NAME}" --nodes="${NODES_COUNT}" --nodelist="${NODELIST}" \ diff --git a/presto/slurm/presto-nvl72/profiler_functions.sh b/presto/slurm/presto-nvl72/profiler_functions.sh index 0f3eeffd..c49a1ad6 100755 --- a/presto/slurm/presto-nvl72/profiler_functions.sh +++ b/presto/slurm/presto-nvl72/profiler_functions.sh @@ -7,11 +7,17 @@ set -e function start_profiler() { local -r profile_output_file_path=$1 local -r query_id=$(basename ${profile_output_file_path}) - touch "/workspace/presto/slurm/presto-nvl72/logs/.nsys_start_token_${query_id}" + local -r logs_dir="/workspace/presto/slurm/presto-nvl72/logs" + touch "${logs_dir}/.nsys_start_token_${query_id}" + while [[ ! -f "${logs_dir}/.nsys_started_token_${query_id}" ]]; do + read -t 2 -r _ <<< '' || true + done + rm "${logs_dir}/.nsys_started_token_${query_id}" } function stop_profiler() { local -r profile_output_file_path=$1 local -r query_id=$(basename ${profile_output_file_path}) - touch "/workspace/presto/slurm/presto-nvl72/logs/.nsys_stop_token_${query_id}" + local -r logs_dir="/workspace/presto/slurm/presto-nvl72/logs" + touch "${logs_dir}/.nsys_stop_token_${query_id}" } diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index a254922d..834ecce9 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -76,25 +76,41 @@ echo "--> QUERIES: ${QUERIES:-UNDEFINED}" if [[ "${ENABLE_NSYS}" == "1" ]]; then echo "Waiting for nsys report generation..." 
- stable_count=0 + if [[ -n "${QUERIES:-}" ]]; then + IFS=',' read -ra qlist <<< "${QUERIES}" + else + qlist=({1..22}) + fi + declare -A prev_sizes + stable_count=0 for i in {1..120}; do all_stable=true - found_any=false - for f in "${LOGS}"/nsys_worker_*.nsys-rep; do - [[ -f "$f" ]] || continue - found_any=true - cur_size=$(stat -c%s "$f" 2>/dev/null || echo 0) - prev=${prev_sizes["$f"]:-0} + for qnum in "${qlist[@]}"; do + report="${LOGS}/nsys_worker_0_Q${qnum}.nsys-rep" + fallback="${LOGS}/nsys_worker_0_Q${qnum}.qdstrm" + if [[ -f "$report" ]]; then + target="$report" + elif [[ -f "$fallback" ]]; then + target="$fallback" + else + echo " Q${qnum}: no file yet" + all_stable=false + continue + fi + cur_size=$(stat -c%s "$target" 2>/dev/null || echo 0) + prev=${prev_sizes["$target"]:-0} + echo " Q${qnum}: cur=${cur_size} prev=${prev}" if (( cur_size == 0 || cur_size != prev )); then all_stable=false fi - prev_sizes["$f"]=$cur_size + prev_sizes["$target"]=$cur_size done - if $all_stable && $found_any; then + echo " all_stable=${all_stable} stable_count=${stable_count}" + if $all_stable; then stable_count=$((stable_count + 1)) if (( stable_count >= 3 )); then - echo "All ${#prev_sizes[@]} nsys reports stable." + echo "All ${#qlist[@]} nsys reports stable." break fi else From 1b356c4f2ac876a73a5b24470e95ca060ca910af Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 13 Apr 2026 14:57:55 +0000 Subject: [PATCH 17/23] Clean up. 
Allow posting of nsys-rep files --- benchmark_reporting_tools/post_results.py | 9 ++++++++- presto/slurm/presto-nvl72/functions.sh | 8 +++++--- presto/slurm/presto-nvl72/launch-run.sh | 1 - presto/slurm/presto-nvl72/run-presto-benchmarks.sh | 2 -- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/benchmark_reporting_tools/post_results.py b/benchmark_reporting_tools/post_results.py index 1e2489dc..fa9c5d6f 100644 --- a/benchmark_reporting_tools/post_results.py +++ b/benchmark_reporting_tools/post_results.py @@ -548,6 +548,7 @@ async def upload_log_files( List of asset IDs from the uploaded files """ log_files = sorted(benchmark_dir.glob("*.log")) + log_files.extend(sorted(benchmark_dir.glob("*.nsys-rep"))) metrics_dir = benchmark_dir / "metrics" if metrics_dir.is_dir(): log_files.extend(sorted(metrics_dir.glob("*.json"))) @@ -563,7 +564,12 @@ async def _upload_one(log_file: Path) -> int: async with semaphore: print(f" Uploading {log_file.name}...", file=sys.stderr) content = log_file.read_bytes() - media_type = "application/json" if log_file.suffix == ".json" else "text/plain" + if log_file.suffix == ".json": + media_type = "application/json" + elif log_file.suffix == ".nsys-rep": + media_type = "application/octet-stream" + else: + media_type = "text/plain" response = await client.post( "/api/assets/upload/", files={"file": (log_file.name, content, media_type)}, @@ -742,6 +748,7 @@ async def process_benchmark_dir( if upload_logs: if dry_run: log_files = sorted(benchmark_dir.glob("*.log")) + log_files.extend(sorted(benchmark_dir.glob("*.nsys-rep"))) metrics_dir = benchmark_dir / "metrics" if metrics_dir.is_dir(): log_files.extend(sorted(metrics_dir.glob("*.json"))) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index ec4373b4..de2dd007 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -231,9 +231,6 @@ function run_worker { done fi - # 
--nvtx-domain-exclude=CCCL - # --cpuctxsw=none - # --sample=none local nsys_bin="" local nsys_launch_opts="" local vt_nsys_report_dir="/var/log/nsys" @@ -498,6 +495,11 @@ function collect_results { echo "Copying logs to ${result_dir}/..." cp "${LOGS}"/*.log "${result_dir}/" + + if [[ "${ENABLE_NSYS}" == "1" ]]; then + echo "Copying nsys reports to ${result_dir}/..." + cp "${LOGS}"/*.nsys-rep "${result_dir}/" + fi } function inject_benchmark_metadata { diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index de7789fa..67775e0e 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -182,7 +182,6 @@ OUT_FMT="presto-tpch-run_n${NODES_COUNT}_sf${SCALE_FACTOR}_i${NUM_ITERATIONS}_%j ERR_FMT="presto-tpch-run_n${NODES_COUNT}_sf${SCALE_FACTOR}_i${NUM_ITERATIONS}_%j.err" SCRIPT_DIR="$PWD" JOB_NAME="presto-tpch-run_n${NODES_COUNT}_sf${SCALE_FACTOR}" -# Node 5 has known issues; nodes above 10 are not yet functional. NODELIST="${NODELIST:-${DEFAULT_NODELIST}}" GRES_OPT=$([[ "$VARIANT_TYPE" == "gpu" ]] && echo "--gres=gpu:${NUM_GPUS_PER_NODE}" || echo "") diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index 834ecce9..5baf549a 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -72,8 +72,6 @@ cp -r ${LOGS}/cli.log ${SCRIPT_DIR}/result_dir/summary.txt echo "Collecting configs and logs into result directory..." collect_results -echo "--> QUERIES: ${QUERIES:-UNDEFINED}" - if [[ "${ENABLE_NSYS}" == "1" ]]; then echo "Waiting for nsys report generation..." if [[ -n "${QUERIES:-}" ]]; then From 22053ee57ad1acaed774664eda19e1a9a3c70372 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 13 Apr 2026 15:20:32 +0000 Subject: [PATCH 18/23] Post large files via S3 presigned URL. 
Fix file copy bugs --- benchmark_reporting_tools/post_results.py | 70 ++++++++++++++++--- presto/slurm/presto-nvl72/functions.sh | 5 -- .../presto-nvl72/run-presto-benchmarks.sh | 3 + 3 files changed, 63 insertions(+), 15 deletions(-) diff --git a/benchmark_reporting_tools/post_results.py b/benchmark_reporting_tools/post_results.py index fa9c5d6f..2e89e906 100644 --- a/benchmark_reporting_tools/post_results.py +++ b/benchmark_reporting_tools/post_results.py @@ -53,9 +53,10 @@ from datetime import datetime from pathlib import Path from urllib.parse import urlparse, urlunparse - +from typing import Any import httpx +LARGE_ASSET_DIRECT_UPLOAD_THRESHOLD_BYTES = 10 * 1024 * 1024 @dataclasses.dataclass(kw_only=True) class BenchmarkMetadata: @@ -527,6 +528,50 @@ def build_http_client(api_url: str, api_key: str, timeout: float) -> httpx.Async timeout=timeout, ) +async def _s3_presigned_put( + upload_url: str, + required_headers: dict[str, Any], + content: bytes, + timeout: float, +) -> tuple[int, str]: + headers = {str(k): str(v) for k, v in required_headers.items()} + async with httpx.AsyncClient(timeout=timeout) as s3_client: + response = await s3_client.put(upload_url, headers=headers, content=content) + return response.status_code, response.text + + +async def _upload_asset_presigned( + client: httpx.AsyncClient, + content: bytes, + filename: str, + title: str, + media_type: str, + timeout: float, +) -> int: + url_resp = await client.post( + "/api/assets/upload-url/", + json={"original_filename": filename, "media_type": media_type}, + ) + if url_resp.status_code not in (200, 201): + raise RuntimeError(f"Failed to get upload URL: {url_resp.status_code} {url_resp.text}") + + presign = url_resp.json() + upload_url = presign["upload_url"] + s3_key = presign["s3_key"] + required_headers = presign.get("required_headers") or {} + + put_status, put_body = await _s3_presigned_put(upload_url, required_headers, content, timeout) + if put_status not in (200, 204): + raise 
RuntimeError(f"S3 PUT failed: {put_status} {put_body}") + + complete = await client.post( + "/api/assets/complete-upload/", + json={"s3_key": s3_key, "title": title, "media_type": media_type}, + ) + if complete.status_code != 201: + raise RuntimeError(f"Complete upload failed: {complete.status_code} {complete.text}") + return complete.json()["asset_id"] + async def upload_log_files( benchmark_dir: Path, @@ -570,15 +615,20 @@ async def _upload_one(log_file: Path) -> int: media_type = "application/octet-stream" else: media_type = "text/plain" - response = await client.post( - "/api/assets/upload/", - files={"file": (log_file.name, content, media_type)}, - data={"title": log_file.name, "media_type": media_type}, - ) - if response.status_code >= 400: - raise RuntimeError(f"Failed to upload {log_file.name}: {response.status_code} {response.text}") - result = response.json() - asset_id = result["asset_id"] + + if len(content) > LARGE_ASSET_DIRECT_UPLOAD_THRESHOLD_BYTES: + print(f" Using presigned upload for {log_file.name} ({len(content) // (1024 * 1024)} MiB)...", file=sys.stderr) + asset_id = await _upload_asset_presigned(client, content, log_file.name, log_file.name, media_type, timeout) + else: + response = await client.post( + "/api/assets/upload/", + files={"file": (log_file.name, content, media_type)}, + data={"title": log_file.name, "media_type": media_type}, + ) + if response.status_code >= 400: + raise RuntimeError(f"Failed to upload {log_file.name}: {response.status_code} {response.text}") + asset_id = response.json()["asset_id"] + print(f" Uploaded {log_file.name} (asset_id={asset_id})", file=sys.stderr) return asset_id diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index de2dd007..9b67182e 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -495,11 +495,6 @@ function collect_results { echo "Copying logs to ${result_dir}/..." 
cp "${LOGS}"/*.log "${result_dir}/" - - if [[ "${ENABLE_NSYS}" == "1" ]]; then - echo "Copying nsys reports to ${result_dir}/..." - cp "${LOGS}"/*.nsys-rep "${result_dir}/" - fi } function inject_benchmark_metadata { diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index 5baf549a..a8e5a0f3 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -116,6 +116,9 @@ if [[ "${ENABLE_NSYS}" == "1" ]]; then fi sleep 5 done + + echo "Copying nsys reports to ${result_dir}/..." + cp "${LOGS}"/*.nsys-rep "${result_dir}/" fi echo "========================================" From 09b13bff0c73a618d5bad4b4b3885392a67d8e15 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 13 Apr 2026 16:20:12 +0000 Subject: [PATCH 19/23] Fix bugs --- presto/slurm/presto-nvl72/run-presto-benchmarks.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index a8e5a0f3..719c1751 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -117,8 +117,8 @@ if [[ "${ENABLE_NSYS}" == "1" ]]; then sleep 5 done - echo "Copying nsys reports to ${result_dir}/..." - cp "${LOGS}"/*.nsys-rep "${result_dir}/" + echo "Copying nsys reports to ${SCRIPT_DIR}/result_dir/..." 
+ cp "${LOGS}"/*.nsys-rep "${SCRIPT_DIR}/result_dir/" fi echo "========================================" From 418d651e5bed1f20be0cc896997256ed87f2ec88 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 13 Apr 2026 17:59:11 +0000 Subject: [PATCH 20/23] Use an older nsys version to avoid a new bug --- presto/slurm/presto-nvl72/functions.sh | 2 +- presto/slurm/presto-nvl72/launch-run.sh | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 9b67182e..171655c6 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -235,7 +235,7 @@ function run_worker { local nsys_launch_opts="" local vt_nsys_report_dir="/var/log/nsys" if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then - nsys_bin="/opt/nvidia/nsight-systems-cli/2026.2.1/bin/nsys" + nsys_bin="/opt/nvidia/nsight-systems-cli/2025.5.1/bin/nsys" nsys_launch_opts="-t nvtx,cuda,osrt,ucx \ --cuda-memory-usage=true \ --cuda-um-cpu-page-faults=true \ diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 67775e0e..56abf0e6 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -40,9 +40,8 @@ NUM_GPUS_PER_NODE="4" USE_NUMA="1" VARIANT_TYPE="gpu" # WORKER_IMAGE="presto-native-worker-gpu" -WORKER_IMAGE="presto-native-worker-gpu-karth-Mar11-with-nsys" -# COORD_IMAGE="presto-coordinator-karth-Mar11" -COORD_IMAGE="presto-coordinator-karth-Mar11-with-nsys" +WORKER_IMAGE="presto-native-worker-gpu-karth-Mar11-with-nsys-2025.5.1" +COORD_IMAGE="presto-coordinator-karth-Mar11" #COORD_IMAGE="presto-coordinator-ibm-03-11" #WORKER_IMAGE="presto-native-worker-gpu-ibm-03-11" #WORKER_IMAGE="velox-testing-images-presto-471cf1a-velox-1a2f63f-gpu-cuda13.1-20260312-arm64" From 82557ad606ecca17517bc5b1b15811043482aff1 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 15 Apr 2026 17:17:23 +0000 
Subject: [PATCH 21/23] Update --- presto/slurm/presto-nvl72/functions.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 171655c6..fb9b685c 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -236,10 +236,11 @@ function run_worker { local vt_nsys_report_dir="/var/log/nsys" if [[ "${ENABLE_NSYS}" == "1" && "${worker_id}" == "0" ]]; then nsys_bin="/opt/nvidia/nsight-systems-cli/2025.5.1/bin/nsys" - nsys_launch_opts="-t nvtx,cuda,osrt,ucx \ - --cuda-memory-usage=true \ - --cuda-um-cpu-page-faults=true \ - --cuda-um-gpu-page-faults=true" + nsys_launch_opts="-t nvtx,cuda" + # nsys_launch_opts="-t nvtx,cuda,osrt,ucx \ + # --cuda-memory-usage=true \ + # --cuda-um-cpu-page-faults=true \ + # --cuda-um-gpu-page-faults=true" fi # The parent SLURM job allocates --gres=gpu:NUM_GPUS_PER_NODE so all GPU kernel From 63201b5ae8108b83b601491bbc6be35c1ffa5c70 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 15 Apr 2026 18:42:55 +0000 Subject: [PATCH 22/23] Critical bug fixes --- presto/slurm/presto-nvl72/functions.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index fb9b685c..e2fbe25a 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -276,7 +276,7 @@ ${LOGS}:${vt_nsys_report_dir},\ /usr/lib/aarch64-linux-gnu/libnvidia-ml.so.580.105.08:/usr/local/lib/libnvidia-ml.so.1\ ${gds_mounts:+,${gds_mounts}} \ -- /bin/bash -c " -export LD_LIBRARY_PATH=\"${CUDF_LIB}:${LD_LIBRARY_PATH}\" +export LD_LIBRARY_PATH=\"${CUDF_LIB}\${LD_LIBRARY_PATH:+:\${LD_LIBRARY_PATH}}\" if [[ '${ENABLE_GDS}' == '1' ]]; then export KVIKIO_COMPAT_MODE=OFF export CUFILE_LOGFILE_PATH=${vt_cufile_log} @@ -293,7 +293,7 @@ if [[ -n '${nsys_bin}' ]]; then ( echo \"Worker ${worker_id}: nsys subshell started\" 
if [[ -n '${QUERIES:-}' ]]; then - IFS=',' read -ra qlist <<< '${QUERIES}' + IFS=',' read -ra qlist <<< '${QUERIES:-}' else qlist=({1..22}) fi From 109c4168312340cc33842cbc63cfec592b6eb733 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 16 Apr 2026 14:00:39 +0000 Subject: [PATCH 23/23] Add more images --- presto/slurm/presto-nvl72/launch-run.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 56abf0e6..f1af1949 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -41,7 +41,10 @@ USE_NUMA="1" VARIANT_TYPE="gpu" # WORKER_IMAGE="presto-native-worker-gpu" WORKER_IMAGE="presto-native-worker-gpu-karth-Mar11-with-nsys-2025.5.1" +# WORKER_IMAGE="velox-testing-images-presto-766546f-velox-1ca955b-gpu-cuda12.9-20260415-arm64-with-nsys" COORD_IMAGE="presto-coordinator-karth-Mar11" +# COORD_IMAGE="velox-testing-images-presto-coordinator-766546f-20260415-arm64-with-jq" +# COORD_IMAGE="presto-coordinator-karth-Mar11" #COORD_IMAGE="presto-coordinator-ibm-03-11" #WORKER_IMAGE="presto-native-worker-gpu-ibm-03-11" #WORKER_IMAGE="velox-testing-images-presto-471cf1a-velox-1a2f63f-gpu-cuda13.1-20260312-arm64"