ROCm · ggottipa-amd · Jun 30, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
@@ -14,6 +14,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
 * Improved GPU Benchmarking and Roofline profiling/analysis support for gfx1150/gfx1151/gfx1152 architectures.
   * gfx11 supports Wave Matrix Multiply Accumulate (WMMA), replacing MFMA operations.
 
+* Added experimental Triton support to ML API tracing. Profile with `--experimental --triton-trace` to emit a ROCTX marker per Triton/Inductor kernel launch attributed to the user call site, and analyze with `--experimental --list-triton-operators` or `--experimental --triton-operator <pattern>` to list or filter Triton operators independently of Torch.
+
 ### Changed
 
 * `--pc-sampling-sorting-type` now defaults to `count` (was `offset`), so the PC sampling table shows the most-sampled instructions first.

@@ -441,6 +441,14 @@ add_test(
         tests/test_profile_general.py ${WORKING_DIR_OPTION}
 )
 
+add_test(
+    NAME test_profile_triton_trace # CTest name; also listed in the "profile" label group and the coverage list
+    COMMAND
+        ${PYTHON_TEST_COMMAND} -m pytest ${STANDALONEBINARY_TEST_OPTION} -m triton_trace # trailing -m is pytest's marker filter: run only tests marked triton_trace
+        --junitxml=tests/test_profile_triton_trace.xml ${COV_OPTION} # per-test JUnit XML report; COV_OPTION presumably carries coverage flags (confirm)
+        tests/test_profile_general.py ${WORKING_DIR_OPTION} # marked tests are collected from the general profile test module
+)
+
 set_tests_properties(
     test_profile_kernel_execution
     test_profile_dispatch
@@ -456,6 +464,7 @@ set_tests_properties(
     test_profile_iteration_multiplexing_2
     test_profile_iteration_multiplexing_stochastic
     test_profile_torch_trace
+    test_profile_triton_trace
     PROPERTIES LABELS "profile" RESOURCE_GROUPS gpus:1 TIMEOUT 1800
 )
 
@@ -842,6 +851,7 @@ if(${ENABLE_COVERAGE})
         test_torch_trace_analysis
         test_torch_trace_coverage
         test_profile_torch_trace
+        test_profile_triton_trace
         test_torch_cpp_loader
         test_inject_roctx_package
     )

@@ -681,7 +681,7 @@ Analysis database example
    WARNING Created file: test.db
 
 
-PyTorch Operator Analysis
+PyTorch operator analysis
 =========================
 
 .. warning::
@@ -691,18 +691,18 @@ PyTorch Operator Analysis
 
    These options require ``--experimental``. After profiling with
    ``--experimental --torch-trace`` (see :ref:`torch-operator-profiling`),
-   use ``rocprof-compute --experimental analyze ...`` with
+   use ``rocprof-compute analyze ... --experimental`` with
    ``--list-torch-operators`` or ``--torch-operator`` as needed.
 
 
-Listing All Operators
+List all operators
 ---------------------
 
 Display all PyTorch operators captured during profiling:
 
 .. code-block:: shell-session
 
-   $ rocprof-compute --experimental analyze --path ./workload --list-torch-operators
+   $ rocprof-compute analyze --experimental --list-torch-operators --path ./workload
 
    ================================================================================
    PyTorch Operator Call Tree: ./workload
@@ -759,6 +759,8 @@ milliseconds and microseconds per cell; missing values render as ``N/A``.
 When no operator has any recorded dispatches, the table is replaced by the
 line ``Operator summary: (no operators with recorded dispatches)``.
 
+.. _operator-filtering:
+
 Filtering by Operator
 ---------------------
 
@@ -775,17 +777,84 @@ operators. Operator hierarchies are ``/``-separated (e.g.
 .. code-block:: shell-session
 
    # Wildcard match
-   $ rocprof-compute --experimental analyze --path ./workload --torch-operator "*relu"
+   $ rocprof-compute analyze --experimental --torch-operator "*relu" --path ./workload
 
    # Exact match
-   $ rocprof-compute --experimental analyze --path ./workload --torch-operator torch.nn.functional.relu
+   $ rocprof-compute analyze --experimental --torch-operator torch.nn.functional.relu --path ./workload
 
    # Match all operators (no arguments)
-   $ rocprof-compute --experimental analyze --path ./workload --torch-operator
+   $ rocprof-compute analyze --experimental --torch-operator --path ./workload
 
 **Filter multiple operators** (space or comma separated):
 
 .. code-block:: shell-session
 
-   $ rocprof-compute --experimental analyze --path ./workload \
-       --torch-operator "*relu,*conv*,*linear"
+   $ rocprof-compute analyze --experimental \
+       --torch-operator "*relu,*conv*,*linear" --path ./workload
+
+
+Triton operator analysis
+========================
+
+.. warning::
+
+   Triton operator analysis is currently available only in CLI mode and
+   requires ``--experimental``. After profiling with
+   ``--experimental --triton-trace`` (see :ref:`triton-trace`), use
+   ``rocprof-compute analyze ... --experimental`` with
+   ``--list-triton-operators`` or ``--triton-operator`` as needed.
+
+Triton kernels can be analyzed similarly to PyTorch operators, using the
+``--list-triton-operators`` and ``--triton-operator`` options. Both options read the
+same ``ml_api_trace/consolidated.csv`` and select rows where the ``Backend`` column is
+``triton``. As a result, Triton kernels are reported independently even if PyTorch
+operators appear in the same run.
+
+List all captured Triton kernels
+---------------------------------
+
+Display all Triton kernels captured during profiling:
+
+.. code-block:: shell-session
+
+   $ rocprof-compute analyze --experimental --list-triton-operators --path ./workload
+
+   ================================================================================
+   Triton Operator Call Tree: ./workload
+   Grouped by source location, sorted by total GPU kernel duration.
+   ================================================================================
+
+   torch_compile_triton.py:26 (dispatches: 39, total: 4.22 ms, dispatch_mean: 0.11 ms, dispatch_min: 0.05 ms, dispatch_max: 0.81 ms)
+   └─ torch.compile.fused (calls: 1)
+      └─ triton.CompiledKernel.triton_poi_fused_add_mul_relu_0 (calls: 3)
+         └─ triton_poi_fused_add_mul_relu_0 (id 0) (dispatches: 39, total: 4.22 ms)
+
+   Operator summary (Min/Max/Mean are per-dispatch over the subtree; sorted by Total):
+   ╒══════════════════════════════════════════════════════════════════════════╤═════════╤══════════════╤═════════╤═══════════╤═════════════╤═════════╤═════════╤═════════╕
+   │ Operator                                                                 │   Calls │   Dispatches │   Total │   % Total │   Mean/Call │    Mean │     Min │     Max │
+   ╞══════════════════════════════════════════════════════════════════════════╪═════════╪══════════════╪═════════╪═══════════╪═════════════╪═════════╪═════════╪═════════╡
+   │ torch.compile.fused                                                      │       1 │           39 │ 4.22 ms │    100.00 │     4.22 ms │ 0.11 ms │ 0.05 ms │ 0.81 ms │
+   ├──────────────────────────────────────────────────────────────────────────┼─────────┼──────────────┼─────────┼───────────┼─────────────┼─────────┼─────────┼─────────┤
+   │ torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_ │       3 │           39 │ 4.22 ms │    100.00 │     1.41 ms │ 0.11 ms │ 0.05 ms │ 0.81 ms │
+   │ 0                                                                        │         │              │         │           │             │         │         │         │
+   ╘══════════════════════════════════════════════════════════════════════════╧═════════╧══════════════╧═════════╧═══════════╧═════════════╧═════════╧═════════╧═════════╛
+
+Filter the Triton kernels
+-------------------------
+
+``--triton-operator`` uses the same shell-style glob matching as
+``--torch-operator``; see :ref:`operator-filtering` for the full pattern syntax.
+
+.. code-block:: shell-session
+
+   # Wildcard match
+   $ rocprof-compute analyze --experimental --triton-operator "*matmul*" --path ./workload
+
+   # Filter multiple kernels (space or comma separated)
+   $ rocprof-compute analyze --experimental \
+       --triton-operator "*matmul*,*softmax*" --path ./workload
+
+.. note::
+
+   ``--torch-operator`` and ``--triton-operator`` are mutually exclusive; use
+   one operator filter per analysis run.
@@ -878,8 +878,8 @@ workload (counter data, traces) untouched:
 
 .. _torch-operator-mapping:
 
-Torch operator mapping
-========================
+Torch trace
+===========
 
 ROCm Compute Profiler offers Torch operator mapping functionality to analyze the performance metrics at the PyTorch operator level. This feature maps the performance counters to specific PyTorch operators, enabling detailed performance analysis of
 the PyTorch workloads at the operator granularity.
@@ -920,7 +920,7 @@ option when profiling a PyTorch workload:
 
 .. code-block:: shell-session
 
-   $ rocprof-compute --experimental profile --name mnist_torch --torch-trace -- python train.py
+   $ rocprof-compute profile --experimental --torch-trace --name mnist_torch -- python train.py
 
                                     __                                       _
     _ __ ___   ___ _ __  _ __ ___  / _|       ___ ___  _ __ ___  _ __  _   _| |_ ___
@@ -1042,12 +1042,14 @@ The ``pmc_perf.csv`` file contains the standard performance counter data (same a
 * Correlating operator-level timing with kernel-level hardware metrics
 * Tracing the execution flow from high-level PyTorch API to low-level GPU kernels
 
+.. _torch-trace-limitations:
+
 Limitations
 -----------
 
 The Torch trace feature currently has the following limitations:
 
-* Torch trace is experimental. Use ``rocprof-compute --experimental profile ... --torch-trace`` and ``rocprof-compute --experimental analyze ...`` with ``--list-torch-operators`` or ``--torch-operator`` as needed.
+* Torch trace is experimental. Use ``rocprof-compute profile ... --experimental --torch-trace`` and ``rocprof-compute analyze ... --experimental`` with ``--list-torch-operators`` or ``--torch-operator`` as needed.
 
 * The ``--torch-trace`` option requires the application to be a Python command or Python script.
 
@@ -1112,13 +1114,84 @@ Torch operator mapping can be combined with other profiling options. Use
 .. code-block:: shell-session
 
    # Combine with block filtering for targeted counter collection
-   $ rocprof-compute --experimental profile --name mnist --torch-trace -b 11 12 -- python train.py
+   $ rocprof-compute profile -b 11 12 --experimental --torch-trace --name mnist -- python train.py
 
    # Combine with iteration multiplexing
-   $ rocprof-compute --experimental profile --name mnist --torch-trace --iteration-multiplexing kernel -- python train.py
+   $ rocprof-compute profile --experimental --torch-trace --name mnist --iteration-multiplexing kernel -- python train.py
 
    # Combine with kernel filtering (filters by GPU kernel name)
-   $ rocprof-compute --experimental profile --name mnist --torch-trace -k elementwise -- python train.py
+   $ rocprof-compute profile --experimental --torch-trace --name mnist -k elementwise -- python train.py
+
+.. _triton-trace:
+
+Triton trace
+============
+
+In addition to PyTorch, ROCm Compute Profiler can map performance counters to
+Triton kernels (including Triton kernels launched by ``torch.compile`` /
+Inductor). This is enabled with the ``--triton-trace`` option and shares the
+same ``ml_api_trace`` output, ``Backend`` attribution, and analysis flow as Torch
+trace.
+
+.. warning::
+
+   Triton trace is currently an experimental feature. You must pass
+   ``--experimental`` to both **profile** and **analyze** commands when using the
+   Triton trace related options (``--triton-trace`` for profile;
+   ``--list-triton-operators`` and ``--triton-operator`` for analyze).
+
+Requirements
+------------
+
+Triton trace has the same requirements and limitations as Torch trace (see
+:ref:`torch-trace-limitations`), with a valid Triton installation required in
+place of PyTorch.
+
+Usage
+-----
+
+To enable Triton kernel mapping, use ``--experimental`` with the
+``--triton-trace`` option:
+
+.. code-block:: shell-session
+
+   $ rocprof-compute profile --experimental --triton-trace --name triton_gemm -- python gemm.py
+
+``--triton-trace`` can be combined with ``--torch-trace`` to instrument both
+frameworks in a single run:
+
+.. code-block:: shell-session
+
+   $ rocprof-compute profile --experimental --torch-trace --triton-trace --name compiled_model -- python train.py
+
+Each captured marker records its originating framework in the ``Backend`` column
+of ``ml_api_trace/consolidated.csv``, so each framework can be analyzed
+independently. To enable all supported backends at once, use
+:ref:`--ml-api-trace <ml-api-trace>`.
+
+To analyze the captured Triton kernels, use the ``--list-triton-operators`` and
+``--triton-operator`` options in analyze mode (see :doc:`../analyze/cli`).
+
+.. _ml-api-trace:
+
+ML API trace
+============
+
+``--ml-api-trace`` enables marker tracing for all supported ML framework backends in a
+single option.
+
+.. warning::
+
+   ML API trace is currently an experimental feature. You must pass
+   ``--experimental`` when using it.
+
+.. code-block:: shell-session
+
+   $ rocprof-compute profile --experimental --ml-api-trace --name model -- python train.py
+
+The output is identical to that of enabling each framework's trace flag individually.
+Captured kernels are attributed in the ``Backend`` column and analyzed with the
+corresponding per-framework operator options (see :doc:`../analyze/cli`).
 
 .. _iteration-multiplexing:
 

@@ -110,6 +110,7 @@ markers = [
     "noise_clamp",
     "torch_ops",
     "torch_trace",
+    "triton_trace",
     "division_by_zero",
     "multi_rank",
     "experimental_feature",

@@ -0,0 +1,33 @@
+# Copyright (c) Advanced Micro Devices, Inc.
+# SPDX-License-Identifier:  MIT
+
+"""Minimal torch.compile workload that generates Triton kernels."""
+
+import sys
+
+import torch
+
+
+@torch.compile  # compiled on first call; per the module docstring this is what generates the Triton kernels
+def fused(x, y):
+    return torch.relu(x) * y + x  # relu, multiply, add applied elementwise to x and y
+
+
+def main():  # Entry point: run the compiled function three times on GPU tensors, then synchronize.
+    if not torch.cuda.is_available():
+        print("GPU is required for this sample. Exiting.")
+        sys.exit(1)  # non-zero exit status when no GPU is available
+
+    x = torch.randn(4096, 4096, device="cuda")  # two 4096x4096 random inputs created on the GPU device
+    y = torch.randn(4096, 4096, device="cuda")
+
+    # First call compiles; later calls reuse the generated Triton kernels.
+    for _ in range(3):
+        fused(x, y)  # NOTE(review): keep this line number stable - profiler output presumably attributes kernels to this call site; confirm against docs/tests
+
+    torch.cuda.synchronize()  # wait for queued GPU work to finish before reporting completion
+    print("Compiled workload completed")
+
+
+if __name__ == "__main__":
+    main()