diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 73fcefb538..cd8e5364d2 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -65,6 +65,8 @@ env: VERIFY: ${{ (github.event_name == 'pull_request' || github.event_name == 'schedule' || inputs.verify) && '1' || '0' }} TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }} N_RUNS: ${{ inputs.n_runs || '1' }} + PTI_FORCE_BRIDGE_COMMANDS: "1" + UR_LOADER_USE_LEVEL_ZERO_V2: "0" jobs: build: @@ -117,17 +119,9 @@ jobs: cd benchmarks pip install . - - name: Build PTI - run: | - ./scripts/install-pti.sh --build-level-zero - PTI_LIBS_DIR=$(python ./scripts/pti_lib.py) - ls $PTI_LIBS_DIR - echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV - - name: Run Triton Softmax kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -138,7 +132,6 @@ jobs: - name: Run Triton Softmax kernel benchmark with Proton if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark BENCHMARKING_METHOD=PROTON_PROFILER python fused_softmax.py source ../../scripts/capture-hw-details.sh @@ -146,7 +139,6 @@ jobs: - name: Run Triton GEMM kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv @@ -160,7 +152,6 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor of pointer if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -173,7 +164,6 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor descriptor if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -186,7 +176,6 @@ jobs: - name: Run Triton GEMM (A@B^t) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv @@ -198,7 +187,6 @@ jobs: - name: Run Triton GEMM (A^t@B) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv @@ -210,7 +198,6 @@ jobs: - name: Run Triton GEMM (stream-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -220,7 +207,6 @@ jobs: - name: Run Triton GEMM (split-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -230,7 +216,6 @@ jobs: - name: Run Triton GEMM + PreOp (exp) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -239,7 +224,6 @@ jobs: - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -248,7 +232,6 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -258,7 +241,6 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -268,7 +250,6 @@ jobs: - name: Run Triton FA fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -279,7 +260,6 @@ jobs: - name: Run Triton FA bwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark FA_KERNEL_MODE="bwd" \ python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -292,7 +272,6 @@ jobs: - name: Run Triton FA fwd kernel benchmark - with tensor descriptors if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv @@ -304,7 +283,6 @@ jobs: - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -315,7 +293,6 @@ jobs: - name: Run Triton FlexAttention (batch_size=4) Causal Mask fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch4-causal_mask.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark BATCH_SIZE=4 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -326,7 +303,6 @@ jobs: - name: Run Triton FlexAttention (batch_size=16) Causal Mask fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_batch16-causal_mask.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark BATCH_SIZE=16 python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -337,7 +313,6 @@ jobs: - name: Run Triton FlexAttention Causal Mask bwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_bwd_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_bwd_benchmark_causal_mask.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark FA_KERNEL_MODE='bwd' \ python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -349,7 +324,6 @@ jobs: - name: Run Triton FlexAttention Custom Masks fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS @@ -363,7 +337,6 @@ jobs: - name: Run Prefix Sums kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -372,7 +345,6 @@ jobs: - name: Run micro benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS