diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index 13f926c5eed..fc6fca923f3 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -14,6 +14,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Improved GPU Benchmarking and Roofline profiling/analysis support for gfx1150/gfx1151/gfx1152 architectures. * gfx11 supports Wave Matrix Multiply Accumulate (WMMA), replacing MFMA operations. +* Added experimental Triton support to ML API tracing. Profile with `--experimental --triton-trace` to emit a ROCTX marker per Triton/Inductor kernel launch attributed to the user call site, and analyze with `--experimental --list-triton-operators` or `--experimental --triton-operator <pattern>` to list or filter Triton operators independently of Torch. + ### Changed * `--pc-sampling-sorting-type` now defaults to `count` (was `offset`), so the PC sampling table shows the most-sampled instructions first.
diff --git a/projects/rocprofiler-compute/CMakeLists.txt b/projects/rocprofiler-compute/CMakeLists.txt index 4dd9f45ab9c..f33d4766864 100644 --- a/projects/rocprofiler-compute/CMakeLists.txt +++ b/projects/rocprofiler-compute/CMakeLists.txt @@ -441,6 +441,14 @@ add_test( tests/test_profile_general.py ${WORKING_DIR_OPTION} ) +add_test( + NAME test_profile_triton_trace + COMMAND + ${PYTHON_TEST_COMMAND} -m pytest ${STANDALONEBINARY_TEST_OPTION} -m triton_trace + --junitxml=tests/test_profile_triton_trace.xml ${COV_OPTION} + tests/test_profile_general.py ${WORKING_DIR_OPTION} +) + set_tests_properties( test_profile_kernel_execution test_profile_dispatch @@ -456,6 +464,7 @@ set_tests_properties( test_profile_iteration_multiplexing_2 test_profile_iteration_multiplexing_stochastic test_profile_torch_trace + test_profile_triton_trace PROPERTIES LABELS "profile" RESOURCE_GROUPS gpus:1 TIMEOUT 1800 ) @@ -842,6 +851,7 @@ if(${ENABLE_COVERAGE}) test_torch_trace_analysis test_torch_trace_coverage test_profile_torch_trace + test_profile_triton_trace test_torch_cpp_loader test_inject_roctx_package ) diff --git a/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst b/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst index 6bb9a634d9e..387df802722 100644 --- a/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst +++ b/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst @@ -681,7 +681,7 @@ Analysis database example WARNING Created file: test.db -PyTorch Operator Analysis +PyTorch operator analysis ========================= .. warning:: @@ -691,18 +691,18 @@ PyTorch Operator Analysis These options require ``--experimental``. After profiling with ``--experimental --torch-trace`` (see :ref:`torch-operator-profiling`), - use ``rocprof-compute --experimental analyze ...`` with + use ``rocprof-compute analyze ... --experimental`` with ``--list-torch-operators`` or ``--torch-operator`` as needed. 
-Listing All Operators +List all operators --------------------- Display all PyTorch operators captured during profiling: .. code-block:: shell-session - $ rocprof-compute --experimental analyze --path ./workload --list-torch-operators + $ rocprof-compute analyze --experimental --list-torch-operators --path ./workload ================================================================================ PyTorch Operator Call Tree: ./workload @@ -759,6 +759,8 @@ milliseconds and microseconds per cell; missing values render as ``N/A``. When no operator has any recorded dispatches, the table is replaced by the line ``Operator summary: (no operators with recorded dispatches)``. +.. _operator-filtering: + Filtering by Operator --------------------- @@ -775,17 +777,84 @@ operators. Operator hierarchies are ``/``-separated (e.g. .. code-block:: shell-session # Wildcard match - $ rocprof-compute --experimental analyze --path ./workload --torch-operator "*relu" + $ rocprof-compute analyze --experimental --torch-operator "*relu" --path ./workload # Exact match - $ rocprof-compute --experimental analyze --path ./workload --torch-operator torch.nn.functional.relu + $ rocprof-compute analyze --experimental --torch-operator torch.nn.functional.relu --path ./workload # Match all operators (no arguments) - $ rocprof-compute --experimental analyze --path ./workload --torch-operator + $ rocprof-compute analyze --experimental --torch-operator --path ./workload **Filter multiple operators** (space or comma separated): .. code-block:: shell-session - $ rocprof-compute --experimental analyze --path ./workload \ - --torch-operator "*relu,*conv*,*linear" + $ rocprof-compute analyze --experimental \ + --torch-operator "*relu,*conv*,*linear" --path ./workload + + +Triton operator analysis +======================== + +.. warning:: + + Triton operator analysis is currently available only in CLI mode and + requires ``--experimental``. 
After profiling with + ``--experimental --triton-trace`` (see :ref:`triton-trace`), use + ``rocprof-compute analyze ... --experimental`` with + ``--list-triton-operators`` or ``--triton-operator`` as needed. + +Triton kernels can be analyzed similarly to PyTorch operators. You can use the +``--list-triton-operators`` and ``--triton-operator`` options. Both options read the +same ``ml_api_trace/consolidated.csv`` and select rows where the ``Backend`` column is +``triton``. As a result, Triton kernels are reported independently even if PyTorch +operators appear in the same run. + +List all captured Triton kernels +--------------------------------- + +Display all Triton kernels captured during profiling: + +.. code-block:: shell-session + + $ rocprof-compute analyze --experimental --list-triton-operators --path ./workload + + ================================================================================ + Triton Operator Call Tree: ./workload + Grouped by source location, sorted by total GPU kernel duration.
+ ================================================================================ + + torch_compile_triton.py:26 (dispatches: 39, total: 4.22 ms, dispatch_mean: 0.11 ms, dispatch_min: 0.05 ms, dispatch_max: 0.81 ms) + └─ torch.compile.fused (calls: 1) + └─ triton.CompiledKernel.triton_poi_fused_add_mul_relu_0 (calls: 3) + └─ triton_poi_fused_add_mul_relu_0 (id 0) (dispatches: 39, total: 4.22 ms) + + Operator summary (Min/Max/Mean are per-dispatch over the subtree; sorted by Total): + ╒══════════════════════════════════════════════════════════════════════════╤═════════╤══════════════╤═════════╤═══════════╤═════════════╤═════════╤═════════╤═════════╕ + │ Operator │ Calls │ Dispatches │ Total │ % Total │ Mean/Call │ Mean │ Min │ Max │ + ╞══════════════════════════════════════════════════════════════════════════╪═════════╪══════════════╪═════════╪═══════════╪═════════════╪═════════╪═════════╪═════════╡ + │ torch.compile.fused │ 1 │ 39 │ 4.22 ms │ 100.00 │ 4.22 ms │ 0.11 ms │ 0.05 ms │ 0.81 ms │ + ├──────────────────────────────────────────────────────────────────────────┼─────────┼──────────────┼─────────┼───────────┼─────────────┼─────────┼─────────┼─────────┤ + │ torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_ │ 3 │ 39 │ 4.22 ms │ 100.00 │ 1.41 ms │ 0.11 ms │ 0.05 ms │ 0.81 ms │ + │ 0 │ │ │ │ │ │ │ │ │ + ╘══════════════════════════════════════════════════════════════════════════╧═════════╧══════════════╧═════════╧═══════════╧═════════════╧═════════╧═════════╧═════════╛ + +Filter the Triton kernels +------------------------- + +``--triton-operator`` uses the same shell-style glob matching as +``--torch-operator``; see :ref:`operator-filtering` for the full pattern syntax. + +.. 
code-block:: shell-session + + # Wildcard match + $ rocprof-compute analyze --experimental --triton-operator "*matmul*" --path ./workload + + # Filter multiple kernels (space or comma separated) + $ rocprof-compute analyze --experimental \ + --triton-operator "*matmul*,*softmax*" --path ./workload + +.. note:: + + ``--torch-operator`` and ``--triton-operator`` are mutually exclusive; use + one operator filter per analysis run. diff --git a/projects/rocprofiler-compute/docs/how-to/profile/mode.rst b/projects/rocprofiler-compute/docs/how-to/profile/mode.rst index a1ca04d5f51..c1bea129fce 100644 --- a/projects/rocprofiler-compute/docs/how-to/profile/mode.rst +++ b/projects/rocprofiler-compute/docs/how-to/profile/mode.rst @@ -878,8 +878,8 @@ workload (counter data, traces) untouched: .. _torch-operator-mapping: -Torch operator mapping -======================== +Torch trace +=========== ROCm Compute Profiler offers Torch operator mapping functionality to analyze the performance metrics at the PyTorch operator level. This feature maps the performance counters to specific PyTorch operators, enabling detailed performance analysis of the PyTorch workloads at the operator granularity. @@ -920,7 +920,7 @@ option when profiling a PyTorch workload: .. code-block:: shell-session - $ rocprof-compute --experimental profile --name mnist_torch --torch-trace -- python train.py + $ rocprof-compute profile --experimental --torch-trace --name mnist_torch -- python train.py __ _ _ __ ___ ___ _ __ _ __ ___ / _| ___ ___ _ __ ___ _ __ _ _| |_ ___ @@ -1042,12 +1042,14 @@ The ``pmc_perf.csv`` file contains the standard performance counter data (same a * Correlating operator-level timing with kernel-level hardware metrics * Tracing the execution flow from high-level PyTorch API to low-level GPU kernels +.. _torch-trace-limitations: + Limitations ----------- The Torch trace feature currently has the following limitations: -* Torch trace is experimental. 
Use ``rocprof-compute --experimental profile ... --torch-trace`` and ``rocprof-compute --experimental analyze ...`` with ``--list-torch-operators`` or ``--torch-operator`` as needed. +* Torch trace is experimental. Use ``rocprof-compute profile ... --experimental --torch-trace`` and ``rocprof-compute analyze ... --experimental`` with ``--list-torch-operators`` or ``--torch-operator`` as needed. * The ``--torch-trace`` option requires the application to be a Python command or Python script. @@ -1112,13 +1114,84 @@ Torch operator mapping can be combined with other profiling options. Use .. code-block:: shell-session # Combine with block filtering for targeted counter collection - $ rocprof-compute --experimental profile --name mnist --torch-trace -b 11 12 -- python train.py + $ rocprof-compute profile -b 11 12 --experimental --torch-trace --name mnist -- python train.py # Combine with iteration multiplexing - $ rocprof-compute --experimental profile --name mnist --torch-trace --iteration-multiplexing kernel -- python train.py + $ rocprof-compute profile --experimental --torch-trace --name mnist --iteration-multiplexing kernel -- python train.py # Combine with kernel filtering (filters by GPU kernel name) - $ rocprof-compute --experimental profile --name mnist --torch-trace -k elementwise -- python train.py + $ rocprof-compute profile --experimental --torch-trace --name mnist -k elementwise -- python train.py + +.. _triton-trace: + +Triton trace +============ + +In addition to PyTorch, ROCm Compute Profiler can map performance counters to +Triton kernels (including Triton kernels launched by ``torch.compile`` / +Inductor). This is enabled with the ``--triton-trace`` option and shares the +same ``ml_api_trace`` output, ``Backend`` attribution, and analysis flow as Torch +trace. + +.. warning:: + + Triton trace is currently an experimental feature. 
You must pass + ``--experimental`` to both **profile** and **analyze** commands when using the + Triton trace related options (``--triton-trace`` for profile; + ``--list-triton-operators`` and ``--triton-operator`` for analyze). + +Requirements +------------ + +Triton trace has the same requirements and limitations as Torch trace (see +:ref:`torch-trace-limitations`), with a valid Triton installation required in +place of PyTorch. + +Usage +----- + +To enable Triton kernel mapping, use ``--experimental`` with the +``--triton-trace`` option: + +.. code-block:: shell-session + + $ rocprof-compute profile --experimental --triton-trace --name triton_gemm -- python gemm.py + +``--triton-trace`` can be combined with ``--torch-trace`` to instrument both +frameworks in a single run: + +.. code-block:: shell-session + + $ rocprof-compute profile --experimental --torch-trace --triton-trace --name compiled_model -- python train.py + +Each captured marker records its originating framework in the ``Backend`` column +of ``ml_api_trace/consolidated.csv``, so each framework can be analyzed +independently. To enable all supported backends at once, use +:ref:`--ml-api-trace <ml-api-trace>`. + +To analyze the captured Triton kernels, use the ``--list-triton-operators`` and +``--triton-operator`` options in analyze mode (see :doc:`../analyze/cli`). + +.. _ml-api-trace: + +ML API trace +============ + +``--ml-api-trace`` enables marker tracing for all supported ML framework backends in a +single option. + +.. warning:: + + ML API trace is currently an experimental feature. You must pass + ``--experimental`` when using it. + +.. code-block:: shell-session + + $ rocprof-compute profile --experimental --ml-api-trace --name model -- python train.py + +The output is identical to enabling each framework's trace flag individually. +Captured kernels are attributed in the ``Backend`` column and analyzed with the +corresponding per-framework operator options (see :doc:`../analyze/cli`). ..
_iteration-multiplexing: diff --git a/projects/rocprofiler-compute/pyproject.toml b/projects/rocprofiler-compute/pyproject.toml index 27180f7e390..cc1a2f9dd7b 100644 --- a/projects/rocprofiler-compute/pyproject.toml +++ b/projects/rocprofiler-compute/pyproject.toml @@ -110,6 +110,7 @@ markers = [ "noise_clamp", "torch_ops", "torch_trace", + "triton_trace", "division_by_zero", "multi_rank", "experimental_feature", diff --git a/projects/rocprofiler-compute/sample/torch_compile_triton.py b/projects/rocprofiler-compute/sample/torch_compile_triton.py new file mode 100644 index 00000000000..154e51fbfe5 --- /dev/null +++ b/projects/rocprofiler-compute/sample/torch_compile_triton.py @@ -0,0 +1,33 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +"""Minimal torch.compile workload that generates Triton kernels.""" + +import sys + +import torch + + +@torch.compile +def fused(x, y): + return torch.relu(x) * y + x + + +def main(): + if not torch.cuda.is_available(): + print("GPU is required for this sample. Exiting.") + sys.exit(1) + + x = torch.randn(4096, 4096, device="cuda") + y = torch.randn(4096, 4096, device="cuda") + + # First call compiles; later calls reuse the generated Triton kernels. + for _ in range(3): + fused(x, y) + + torch.cuda.synchronize() + print("Compiled workload completed") + + +if __name__ == "__main__": + main() diff --git a/projects/rocprofiler-compute/sample/triton_ffn.py b/projects/rocprofiler-compute/sample/triton_ffn.py new file mode 100644 index 00000000000..63172a29a15 --- /dev/null +++ b/projects/rocprofiler-compute/sample/triton_ffn.py @@ -0,0 +1,179 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +""" +Pure-Triton LLaMA-style feed-forward block (RMSNorm + gated MLP). 
@triton.jit
def matmul_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    M,
    N,
    K,
    stride_am,
    stride_ak,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
):
    """Compute one (BLOCK_M, BLOCK_N) tile of C = A @ B.

    Program id 0 selects the tile row and program id 1 the tile column. The
    K dimension is walked in BLOCK_K steps, accumulating in float32.

    Every load is masked on both of its dimensions so that edge tiles (when
    M, N or K is not a multiple of its block size) read zeros instead of
    touching memory outside A or B. Zero-filled rows/columns only contribute
    to output elements that the masked store below discards.
    """
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    a_ptrs = a_ptr + (offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn)

    # Row/column validity does not change across K steps, so compute it once.
    row_mask = offs_m[:, None] < M
    col_mask = offs_n[None, :] < N

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        # Number of K elements still to be consumed; masks the ragged last step.
        k_left = K - k
        a = tl.load(a_ptrs, mask=row_mask & (offs_k[None, :] < k_left), other=0.0)
        b = tl.load(b_ptrs, mask=(offs_k[:, None] < k_left) & col_mask, other=0.0)
        acc += tl.dot(a, b)
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk

    c_ptrs = c_ptr + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn
    tl.store(c_ptrs, acc, mask=row_mask & col_mask)
n, BLOCK_SIZE: tl.constexpr): + offs = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offs < n + x = tl.load(x_ptr + offs, mask=mask) + tl.store(out_ptr + offs, x * tl.sigmoid(x), mask=mask) + + +@triton.jit +def mul_kernel(x_ptr, y_ptr, out_ptr, n, BLOCK_SIZE: tl.constexpr): + offs = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offs < n + x = tl.load(x_ptr + offs, mask=mask) + y = tl.load(y_ptr + offs, mask=mask) + tl.store(out_ptr + offs, x * y, mask=mask) + + +@triton.jit +def add_kernel(x_ptr, y_ptr, out_ptr, n, BLOCK_SIZE: tl.constexpr): + offs = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offs < n + x = tl.load(x_ptr + offs, mask=mask) + y = tl.load(y_ptr + offs, mask=mask) + tl.store(out_ptr + offs, x + y, mask=mask) + + +def rmsnorm(x, weight, eps=1e-6): + rows, cols = x.shape + out = torch.empty_like(x) + rmsnorm_kernel[(rows,)]( + x, weight, out, cols, eps, BLOCK=triton.next_power_of_2(cols) + ) + return out + + +def matmul(a, b): + m, k = a.shape + _, n = b.shape + c = torch.empty((m, n), device=a.device, dtype=a.dtype) + grid = (triton.cdiv(m, BLOCK_M), triton.cdiv(n, BLOCK_N)) + matmul_kernel[grid]( + a, + b, + c, + m, + n, + k, + a.stride(0), + a.stride(1), + b.stride(0), + b.stride(1), + c.stride(0), + c.stride(1), + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + BLOCK_K=BLOCK_K, + ) + return c + + +def elementwise(kernel, *tensors): + out = torch.empty_like(tensors[0]) + n = out.numel() + grid = (triton.cdiv(n, ELT_BLOCK),) + kernel[grid](*tensors, out, n, BLOCK_SIZE=ELT_BLOCK) + return out + + +def ffn(x, w_norm, w_gate, w_up, w_down): + h = rmsnorm(x, w_norm) + gate = matmul(h, w_gate) + up = matmul(h, w_up) + act = elementwise(silu_kernel, gate) + fused = elementwise(mul_kernel, act, up) + down = matmul(fused, w_down) + return elementwise(add_kernel, x, down) + + +def main(): + if not torch.cuda.is_available(): + print("GPU is required for this sample. 
Exiting.") + sys.exit(1) + + tokens, hidden, inter = 512, 512, 2048 + dev = "cuda" + x = torch.randn(tokens, hidden, device=dev) + w_norm = torch.randn(hidden, device=dev) + w_gate = torch.randn(hidden, inter, device=dev) + w_up = torch.randn(hidden, inter, device=dev) + w_down = torch.randn(inter, hidden, device=dev) + + for _ in range(3): + out = ffn(x, w_norm, w_gate, w_up, w_down) + + torch.cuda.synchronize() + print(f"FFN completed, output sum: {out.sum().item():.3f}") + + +if __name__ == "__main__": + main() diff --git a/projects/rocprofiler-compute/src/argparser.py b/projects/rocprofiler-compute/src/argparser.py index 9af3a50f061..e4e8036a742 100644 --- a/projects/rocprofiler-compute/src/argparser.py +++ b/projects/rocprofiler-compute/src/argparser.py @@ -183,6 +183,9 @@ def add_general_group( " GUI (--gui)\n" " TUI (--tui)\n" " Torch trace (--torch-trace, --list-torch-operators, --torch-operator)\n" + " Triton trace (--triton-trace, --list-triton-operators, " + "--triton-operator)\n" + " ML API trace (--ml-api-trace)\n" " PC Sampling (--pc-sampling, --pc-sampling-method, " "--pc-sampling-interval)\n" ), @@ -552,6 +555,40 @@ def omniarg_parser( # Experimental Features ## ---------------------------- + profile_group.add_argument( + "--triton-trace", + dest="triton_trace", + required=False, + default=False, + const=True, + nargs=0, + base_action="store_true", + action=ExperimentalAction, + experimental_enabled=experimental_enabled, + feature_label="Triton trace", + help=( + "\t\t\tTriton Trace, maps Triton kernels to performance counters.\n" + "\t\t\tUse when profiling Triton kernels, including those generated\n" + "\t\t\tby torch.compile / Inductor.\n" + "\t\t\tCan be combined with --torch-trace." 
+ ), + ) + profile_group.add_argument( + "--ml-api-trace", + dest="ml_api_trace", + required=False, + default=False, + const=True, + nargs=0, + base_action="store_true", + action=ExperimentalAction, + experimental_enabled=experimental_enabled, + feature_label="ML API trace", + help=( + "\t\t\tML API Trace, enables tracing for all supported machine\n" + "\t\t\tlearning framework backends (e.g. PyTorch, Triton)." + ), + ) profile_group.add_argument( "--membw-analysis", dest="membw_analysis", @@ -710,6 +747,44 @@ def omniarg_parser( "\t\t\tCombine with -k to intersect with kernel IDs." ), ) + analyze_group.add_argument( + "--list-triton-operators", + dest="list_triton_operators", + default=False, + const=True, + nargs=0, + base_action="store_true", + action=ExperimentalAction, + experimental_enabled=experimental_enabled, + feature_label="List triton operators", + help=( + "\t\tList Triton kernels as a unified call tree grouped by " + "source location with kernel launch stats. " + "Recreates ml_api_trace output directory." + ), + ) + analyze_group.add_argument( + "--triton-operator", + metavar="", + type=str, + dest="triton_operator", + nargs="*", + base_action="store", + action=ExperimentalAction, + experimental_enabled=experimental_enabled, + feature_label="Triton operator filter", + help=( + "\t\tFilter Triton kernels using shell-style glob patterns\n" + "\t\t\t(fnmatch), select their GPU kernels, and display metrics.\n" + "\t\t\tWith no arguments, matches all kernels (default: **).\n" + "\t\t\tExamples:\n" + "\t\t\t *matmul* contains matmul\n" + "\t\t\t all or '*' match every kernel\n" + "\t\t\tMultiple patterns (space or comma-separated):\n" + "\t\t\t --triton-operator *matmul*,*softmax*\n" + "\t\t\tCombine with -k to intersect with kernel IDs." 
// Escape sequences for the two characters that collide with the marker-path
// grammar. The matching decode is done by the Python readers
// (utils/inject_roctx/core.py decode_marker_name, utils/utils_analysis.py); the
// C++ round-trip test reuses these constants so the escape table is defined
// exactly once.
constexpr const char* kEncodedPercent = "%25";
constexpr const char* kEncodedSlash   = "%2F";

// Appends `name` to `out`, replacing every '%' and '/' with its percent-encoded
// form so that a '/' inside a name cannot be mistaken for the frame separator
// emitted by build_marker_string. All other characters are copied unchanged.
// Existing contents of `out` are preserved.
void encode_marker_segment(const std::string& name, std::string& out)
{
    for (const char ch : name)
    {
        switch (ch)
        {
            case '%':
                out += kEncodedPercent;
                break;
            case '/':
                out += kEncodedSlash;
                break;
            default:
                out += ch;
                break;
        }
    }
}
+ for (char c : e.marker) + if (c == '%' || c == '/') + marker_len += 2; ctx_len += e.context.size() + 1; } std::string out; @@ -144,7 +172,7 @@ std::string build_marker_string(const std::vector& stack) { if (!first) out += '/'; - out += e.marker; + encode_marker_segment(e.marker, out); first = false; } out += ':'; diff --git a/projects/rocprofiler-compute/src/lib/roctx_recordfn/tests/test_roctx_recordfn.cpp b/projects/rocprofiler-compute/src/lib/roctx_recordfn/tests/test_roctx_recordfn.cpp index c7d47197694..98ce8f2b271 100644 --- a/projects/rocprofiler-compute/src/lib/roctx_recordfn/tests/test_roctx_recordfn.cpp +++ b/projects/rocprofiler-compute/src/lib/roctx_recordfn/tests/test_roctx_recordfn.cpp @@ -120,6 +120,89 @@ TEST(LeafContext, BackwardWithoutSeqLeafIsAutogradEngine) roctx_recordfn::kAutogradEngineLeaf); } +namespace +{ + +// Reverses build_marker_string: split the operator path on the '/' separator, +// then decode each segment ('%2F' -> '/', then '%25' -> '%'), matching +// utils_analysis.build_call_trees. +std::vector decode_marker_path(const std::string& wire) +{ + const auto colon = wire.find(':'); + const std::string path = (colon == std::string::npos) ? wire : wire.substr(0, colon); + + std::vector segments; + std::size_t start = 0; + while (true) + { + const auto sep = path.find('/', start); + const std::string raw = path.substr(start, + sep == std::string::npos ? 
std::string::npos : sep - start); + + std::string decoded; + for (std::size_t i = 0; i < raw.size();) + { + if (raw.compare(i, 3, kEncodedSlash) == 0) + { + decoded += '/'; + i += 3; + } + else if (raw.compare(i, 3, kEncodedPercent) == 0) + { + decoded += '%'; + i += 3; + } + else + { + decoded += raw[i]; + ++i; + } + } + segments.push_back(decoded); + + if (sep == std::string::npos) + return segments; + start = sep + 1; + } +} + +} // namespace + +TEST(MarkerEncoding, EscapesSlashAndPercentWithinNames) +{ + // '/' encodes to %2F and '%' to %25 within a name; the '/' between frames + // remains the separator. + const std::vector stack = { + {"Torch-Compiled Region: 0/0", "#1@a:1"}, + {"k%2F%name", "#2@b:2"}, + }; + + const std::string wire = build_marker_string(stack); + + EXPECT_EQ(wire, "Torch-Compiled Region: 0%2F0/k%252F%25name:#1@a:1/#2@b:2"); +} + +TEST(MarkerEncoding, RoundTripsThroughBuildCallTreesDecode) +{ + // Encoding then decoding returns the original names. + const std::vector names = { + "Torch-Compiled Region 0/0", + "k%2F%name", + "plain_kernel", + "literal%2Fnot_a_slash", + "100%/sec", + }; + + std::vector stack; + stack.reserve(names.size()); + for (const auto& name : names) + stack.push_back(StackEntry{name, "ctx"}); + + const std::vector decoded = decode_marker_path(build_marker_string(stack)); + + EXPECT_EQ(decoded, names); +} + TEST_F(RoctxRecordFnTest, SaveThenConsumeReturnsSavedStack) { const std::vector stack = {{"A", "a"}, {"B", "b"}}; diff --git a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py index 17ee046b480..ed7c389a6a7 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py @@ -16,6 +16,7 @@ import config from rocprof_compute_soc.soc_base import OmniSoC_Base from utils import file_io, parser, schema +from 
utils.inject_roctx.constants import KNOWN_ML_API_BACKENDS from utils.logger import ( console_debug, console_error, @@ -321,15 +322,22 @@ def sanitize(self) -> None: ) profiling_config = self.get_profiling_config() - needs_ml_api_trace = getattr( - args, "torch_operator", None - ) is not None or getattr(args, "list_torch_operators", False) - if needs_ml_api_trace and not profiling_config.get("torch_trace", False): - console_error( - "ml api trace", - 'Workload was not profiled with "--torch-trace". ' - "Cannot use --torch-operator or --list-torch-operators.", - ) + # --ml-api-trace enables every backend. + ml_api_trace = profiling_config.get("ml_api_trace", False) + for backend in KNOWN_ML_API_BACKENDS: + needs_trace = getattr( + args, f"{backend}_operator", None + ) is not None or getattr(args, f"list_{backend}_operators", False) + if needs_trace and not ( + profiling_config.get(f"{backend}_trace", False) or ml_api_trace + ): + console_error( + "ml api trace", + f'Workload was not profiled with "--{backend}-trace" or ' + '"--ml-api-trace". ' + f"Cannot use --{backend}-operator or " + f"--list-{backend}-operators.", + ) for dir_info in args.path: if not any([ diff --git a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_cli.py b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_cli.py index 5ff38db4e75..37f7751d962 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_cli.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_cli.py @@ -16,20 +16,35 @@ build_call_trees, build_call_trees_with_kernel_ids, build_operator_summary, + decode_marker_name, get_matrix_ops_type, process_ml_api_trace_output, write_ml_api_trace_consolidated_csv, ) from utils.utils_common import validate_roofline_csv - -def parse_torch_operator_patterns(args: argparse.Namespace) -> list[str]: - """Extract and flatten --torch-operator patterns from args. 
- - Returns ``["**"]`` when ``--torch-operator`` is given with no arguments, - which matches all operators. Returns ``[]`` when the flag is absent. +# Maps each ML API trace backend to its analyze CLI attributes and display label. +_ML_API_ANALYSIS_CLI_OPTIONS = { + "torch": { + "filter_attr": "torch_operator", + "list_attr": "list_torch_operators", + "label": "PyTorch", + }, + "triton": { + "filter_attr": "triton_operator", + "list_attr": "list_triton_operators", + "label": "Triton", + }, +} + + +def parse_operator_patterns(args: argparse.Namespace, attr: str) -> list[str]: + """Extract and flatten operator glob patterns from ``args.``. + + Returns ``["**"]`` when the flag is given with no arguments (match all), + and ``[]`` when the flag is absent. """ - raw = getattr(args, "torch_operator", None) + raw = getattr(args, attr, None) if raw is None: return [] pattern_list: list[str] = [] @@ -58,6 +73,30 @@ def pre_processing(self) -> None: if args.random_port: console_error("--gui flag is required to enable --random-port") + active_operator_filters = [ + cli["filter_attr"] + for cli in _ML_API_ANALYSIS_CLI_OPTIONS.values() + if getattr(args, cli["filter_attr"], None) is not None + ] + if len(active_operator_filters) > 1: + console_error( + "analysis", + "Only one operator filter may be used per analysis run. " + "Run the analysis separately for each framework.", + ) + + active_operator_lists = [ + cli["list_attr"] + for cli in _ML_API_ANALYSIS_CLI_OPTIONS.values() + if getattr(args, cli["list_attr"], False) + ] + if len(active_operator_lists) > 1: + console_error( + "analysis", + "Only one operator listing may be used per analysis run. 
" + "Run the analysis separately for each framework.", + ) + for path_info in args.path: workload = self._runs[path_info[0]] @@ -103,24 +142,14 @@ def pre_processing(self) -> None: workload.dfs[parser.PMC_KERNEL_TOP_TABLE_ID] = kernel_top_df workload.dfs[parser.PMC_DISPATCH_INFO_TABLE_ID] = dispatch_info_df - if getattr(args, "list_torch_operators", False): - consolidated_df, ml_api_trace_path = process_ml_api_trace_output( - path_info[0] - ) - if consolidated_df.empty: - tty.list_torch_operators(path_info[0], {}) + for backend, cli in _ML_API_ANALYSIS_CLI_OPTIONS.items(): + if getattr(args, cli["list_attr"], False): + self.list_operators(path_info[0], kernel_top_df, backend) sys.exit(0) - write_ml_api_trace_consolidated_csv(consolidated_df, ml_api_trace_path) - call_trees = build_call_trees_with_kernel_ids( - consolidated_df=consolidated_df, - kernel_top_df=kernel_top_df, - ) - tty.list_torch_operators(path_info[0], call_trees) - sys.exit(0) - - if getattr(args, "torch_operator", None) is not None: - self.apply_torch_operator_filter(args, workload, path_info[0]) + for backend, cli in _ML_API_ANALYSIS_CLI_OPTIONS.items(): + if getattr(args, cli["filter_attr"], None) is not None: + self.apply_operator_filter(args, workload, path_info[0], backend) # create the loaded table gpu_arch = workload.sys_info.iloc[0]["gpu_arch"] @@ -145,8 +174,9 @@ def run_analysis(self) -> None: gpu_arch = workload.sys_info.iloc[0]["gpu_arch"] arch_config = self._arch_configs[gpu_arch] - if getattr(args, "torch_operator", None) is not None: - self.handle_torch_operator(args, workload) + for backend, cli in _ML_API_ANALYSIS_CLI_OPTIONS.items(): + if getattr(args, cli["filter_attr"], None) is not None: + self.handle_operator(args, workload, backend) if args.list_stats: tty.show_kernel_stats( @@ -233,15 +263,59 @@ def run_analysis(self) -> None: roof_plot=roof_plot, ) - def apply_torch_operator_filter( - self, args: argparse.Namespace, workload: schema.Workload, workload_path: str + 
@staticmethod + def _filter_by_backend(consolidated_df: pd.DataFrame, backend: str) -> pd.DataFrame: + """Return the rows attributed to ``backend``. + + When the Backend column is absent, rows are treated as the torch + backend. + """ + if "Backend" in consolidated_df.columns: + return consolidated_df[consolidated_df["Backend"] == backend].copy() + if backend == "torch": + return consolidated_df.copy() + return consolidated_df.iloc[0:0].copy() + + def list_operators( + self, + workload_path: str, + kernel_top_df: pd.DataFrame, + backend: str, + ) -> None: + """Render the operator call tree for a single backend.""" + label = _ML_API_ANALYSIS_CLI_OPTIONS[backend]["label"] + consolidated_df, ml_api_trace_path = process_ml_api_trace_output(workload_path) + if consolidated_df.empty: + tty.list_ml_operators(workload_path, {}, framework_label=label) + return + + # Write the full consolidated trace before narrowing to the backend. + write_ml_api_trace_consolidated_csv(consolidated_df, ml_api_trace_path) + backend_df = self._filter_by_backend(consolidated_df, backend) + if backend_df.empty: + tty.list_ml_operators(workload_path, {}, framework_label=label) + return + + call_trees = build_call_trees_with_kernel_ids( + consolidated_df=backend_df, + kernel_top_df=kernel_top_df, + ) + tty.list_ml_operators(workload_path, call_trees, framework_label=label) + + def apply_operator_filter( + self, + args: argparse.Namespace, + workload: schema.Workload, + workload_path: str, + backend: str, ) -> None: - """Set workload.filter_kernel_ids based on --torch-operator patterns. + """Set workload.filter_kernel_ids from the backend's operator filter. - Called in pre_processing *before* load_table_data so that metric - evaluation runs once with the correct kernel filter — the same - approach used by -k/--kernel. + Operator matches are intersected with the -k/--kernel filter when set; + matched rows are stored in workload.matched_ml_api_trace_dfs[backend]. 
""" + cli = _ML_API_ANALYSIS_CLI_OPTIONS[backend] + label = cli["label"] ml_api_trace_dir = Path(workload_path) / "ml_api_trace" consolidated_path = ml_api_trace_dir / "consolidated.csv" @@ -259,19 +333,33 @@ def apply_torch_operator_filter( if consolidated_df.empty: console_warning( "ml api trace", - "No torch operator data found in this workload. " - "Proceeding without torch operator filter.", + f"No {label} operator data found in this workload. " + f"Proceeding without {label} operator filter.", ) return write_ml_api_trace_consolidated_csv(consolidated_df, ml_api_trace_path) - pattern_list = parse_torch_operator_patterns(args) + consolidated_df = self._filter_by_backend(consolidated_df, backend) + if consolidated_df.empty: + console_warning( + "ml api trace", + f"No {label} operator data found in this workload. " + f"Proceeding without {label} operator filter.", + ) + return + + pattern_list = parse_operator_patterns(args, cli["filter_attr"]) all_operators = consolidated_df["Operator_Name"].dropna().unique() + # Match each name in both its encoded and decoded forms. 
matched_names = [ str(op).strip() for op in all_operators if any( - parser.torch_operator_pattern_matches(p.strip(), str(op).strip()) + parser.torch_operator_pattern_matches(p.strip(), candidate) + for candidate in { + str(op).strip(), + decode_marker_name(str(op).strip()), + } for p in pattern_list ) ] @@ -279,7 +367,7 @@ def apply_torch_operator_filter( if not matched_names: console_warning( "ml api trace", - f"No operators matched the pattern(s): {pattern_list}", + f"No {label} operators matched the pattern(s): {pattern_list}", ) sys.exit(0) @@ -294,7 +382,7 @@ def apply_torch_operator_filter( } matched_df["Kernel_ID"] = matched_df["Kernel_Name"].str.strip().map(name_to_id) - workload.matched_ml_api_trace_df = matched_df + workload.matched_ml_api_trace_dfs[backend] = matched_df kernel_names = set(matched_df["Kernel_Name"].dropna().str.strip().unique()) kernel_ids = sorted( @@ -313,36 +401,37 @@ def apply_torch_operator_filter( workload.filter_kernel_ids = kernel_ids console_log( "ml api trace", - f"Torch operator filter selected {len(kernel_ids)} kernel(s) " + f"{label} operator filter selected {len(kernel_ids)} kernel(s) " "for metric analysis.", ) + elif workload.filter_kernel_ids: + console_error( + "ml api trace", + f"No {label}-operator kernels overlap with the -k filter " + f"{workload.filter_kernel_ids}. No kernels to analyze.", + ) else: - if workload.filter_kernel_ids: - console_error( - "ml api trace", - "No torch-operator kernels overlap with the -k filter " - f"{workload.filter_kernel_ids}. No kernels to analyze.", - ) - else: - console_error( - "ml api trace", - "No kernels found for matched operators. No kernels to analyze.", - ) + console_error( + "ml api trace", + "No kernels found for matched operators. 
No kernels to analyze.", + ) - def handle_torch_operator( - self, args: argparse.Namespace, workload: schema.Workload + def handle_operator( + self, args: argparse.Namespace, workload: schema.Workload, backend: str ) -> None: - """Display matched torch operator call tree.""" - matched_df = workload.matched_ml_api_trace_df - if matched_df.empty: + """Display the matched operator call tree for a single backend.""" + cli = _ML_API_ANALYSIS_CLI_OPTIONS[backend] + label = cli["label"] + matched_df = workload.matched_ml_api_trace_dfs.get(backend) + if matched_df is None or matched_df.empty: return call_trees = build_call_trees(matched_df) - pattern_list = parse_torch_operator_patterns(args) + pattern_list = parse_operator_patterns(args, cli["filter_attr"]) matched_operators = matched_df["Operator_Name"].dropna().unique() print(f"\n{'=' * 80}") - print(f"Matched PyTorch Operators: {', '.join(pattern_list)}") + print(f"Matched {label} Operators: {', '.join(pattern_list)}") print("Grouped by source location, sorted by total GPU kernel duration.") print(f"{'=' * 80}") tty.show_call_tree(call_trees) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_base.py b/projects/rocprofiler-compute/src/rocprof_compute_base.py index e03b38e2959..df8db5dd596 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_base.py @@ -346,51 +346,54 @@ def handle_profile_args(self) -> None: def handle_analyze_args(self) -> None: """Handle analyze-specific argument processing""" args = self.__args - torch_operator = args.torch_operator - list_torch_operators = args.list_torch_operators + operator_filter = ( + args.torch_operator is not None or args.triton_operator is not None + ) + operator_listing = args.list_torch_operators or args.list_triton_operators - if torch_operator is not None or list_torch_operators: + if operator_filter or operator_listing: if args.gui is not None: console_error( "ml api trace", - 
"--torch-operator and --list-torch-operators are not " + "Operator flags (--torch-operator, --triton-operator, " + "--list-torch-operators, --list-triton-operators) are not " "supported in --gui mode. Please remove --gui or run " - "without the torch-operator flags.", + "without the operator flags.", ) if args.tui: console_error( "ml api trace", - "--torch-operator and --list-torch-operators are not " + "Operator flags (--torch-operator, --triton-operator, " + "--list-torch-operators, --list-triton-operators) are not " "supported in --tui mode. Please remove --tui or run " - "without the torch-operator flags.", + "without the operator flags.", ) if args.output_format != "stdout": console_error( "ml api trace", - "--torch-operator and --list-torch-operators are only " + "Operator flags (--torch-operator, --triton-operator, " + "--list-torch-operators, --list-triton-operators) are only " "supported with --output-format stdout (the default). " "The matched operator call tree is printed directly to " "stdout and is not captured in txt, csv, or db output. " - "Remove the --output-format option or drop the " - "torch-operator flags.", + "Remove the --output-format option or drop the operator flags.", ) - if torch_operator is not None: + if operator_filter: if args.list_stats: console_warning( "ml api trace", - "--torch-operator is ignored by --list-stats; the " + "Operator filters are ignored by --list-stats; the " "full kernel stats table will be shown regardless " "of the operator filter.", ) - if list_torch_operators: + if operator_listing: console_warning( "ml api trace", - "--torch-operator is ignored when " - "--list-torch-operators is used; the full operator " - "tree will be shown. Drop --list-torch-operators to " - "apply the operator filter to the analysis, or drop " - "--torch-operator to list all operators.", + "Operator filters are ignored when a --list-*-operators " + "flag is used; the full operator tree will be shown. 
" + "Drop the listing flag to apply the operator filter, or " + "drop the filter to list all operators.", ) @demarcate diff --git a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py index 0120e87b2e8..4541f8c8057 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py @@ -13,6 +13,7 @@ from pc_sampling.pc_sampling_profile import PCSamplingProfile from rocprof_compute_soc.soc_base import OmniSoC_Base +from utils.inject_roctx.constants import KNOWN_ML_API_BACKENDS from utils.logger import ( console_debug, console_error, @@ -38,6 +39,8 @@ # Maps each CLI flag to the backends it enables. _FLAG_TO_FRAMEWORKS: dict[str, tuple[str, ...]] = { "torch_trace": ("torch",), + "triton_trace": ("triton",), + "ml_api_trace": KNOWN_ML_API_BACKENDS, } diff --git a/projects/rocprofiler-compute/src/utils/inject_roctx/_backends/torch.py b/projects/rocprofiler-compute/src/utils/inject_roctx/_backends/torch.py index bc6724bd254..62edb35e1e1 100644 --- a/projects/rocprofiler-compute/src/utils/inject_roctx/_backends/torch.py +++ b/projects/rocprofiler-compute/src/utils/inject_roctx/_backends/torch.py @@ -654,16 +654,10 @@ def start_disp(op_name: str) -> None: location = core.resolve_user_caller_location() marker_stack = core.get_marker_stack() context_stack = core.get_context_stack() - # Mirror the _push_scope wire format, including the backend suffix. 
- full_marker = ( - "/".join([*marker_stack, op_name]) - + ":" - + "/".join([*context_stack, f"#{idx}@{location}"]) - + f"|{_BACKEND_NAME}" - ) - rangePush(full_marker) + context = f"#{idx}@{location}" + rangePush(core.compose_marker(op_name, context, _BACKEND_NAME)) marker_stack.append(op_name) - context_stack.append(f"#{idx}@{location}") + context_stack.append(context) def end_disp() -> None: marker_stack = core.get_marker_stack() diff --git a/projects/rocprofiler-compute/src/utils/inject_roctx/_backends/triton.py b/projects/rocprofiler-compute/src/utils/inject_roctx/_backends/triton.py index 2a23b4c47a5..63827581c56 100644 --- a/projects/rocprofiler-compute/src/utils/inject_roctx/_backends/triton.py +++ b/projects/rocprofiler-compute/src/utils/inject_roctx/_backends/triton.py @@ -3,18 +3,22 @@ """ROCTX instrumentation backend for Triton. -Wraps triton.compiler.CompiledKernel.__call__ so Triton and Inductor kernel -launches appear in ROCTX markers. +Wraps the Triton kernel-launch entry points (``JITFunction.run`` and +``CompiledKernel.run`` / ``CompiledKernel.__call__``) so that Triton and +Inductor kernel launches appear in ROCTX markers. """ import importlib.util -from functools import wraps -from typing import Any +import inspect +import threading +from functools import partial, partialmethod +from pathlib import Path +from typing import Any, Callable +from utils.inject_roctx import core from utils.inject_roctx.core import ( _pop_scope, _push_scope, - ensure_python_tier, resolve_user_caller_location, ) from utils.inject_roctx.registry import register @@ -23,86 +27,262 @@ _BACKEND_NAME = "triton" -class TritonBackend: - name = "triton" +class _TritonState: + """Resolved Triton launch entry-point handles, populated by _resolve_triton().""" def __init__(self) -> None: - self._compiled_kernel: Any = None - - def _resolve(self) -> bool: - """Bind the CompiledKernel handle. 
Returns True if triton is importable.""" - if importlib.util.find_spec("triton") is None: - return False - try: - from triton.compiler import CompiledKernel - except Exception: - return False - self._compiled_kernel = CompiledKernel + self.compiled_kernel: Any = None + self.jit_function: Any = None + + +_STATE = _TritonState() + +# Per-thread guard so nested launches emit a single marker. +_thread_local = threading.local() + + +def _in_launch() -> bool: + return getattr(_thread_local, "in_launch", False) + + +def _next_launch_index(marker: str) -> int: + """Per-thread occurrence count for marker.""" + counters = getattr(_thread_local, "launch_counters", None) + if counters is None: + counters = {} + _thread_local.launch_counters = counters + counters[marker] = counters.get(marker, 0) + 1 + return counters[marker] + + +def _resolve_triton() -> bool: + """Bind the triton handles on _STATE. Returns True if triton is importable.""" + if importlib.util.find_spec("triton") is None: + return False + try: + from triton.compiler import CompiledKernel as _CK + + _STATE.compiled_kernel = _CK + except Exception: + _STATE.compiled_kernel = None + try: + from triton.runtime.jit import JITFunction as _JIT + + _STATE.jit_function = _JIT + except Exception: + _STATE.jit_function = None + return _STATE.compiled_kernel is not None or _STATE.jit_function is not None + + +def _register_framework_root() -> None: + """Register triton's package directory as a framework root for + caller-location resolution.""" + try: + import triton + + console_log( + "ml api trace", + f"Triton version: {getattr(triton, '__version__', '')}", + ) + triton_file = getattr(triton, "__file__", None) + if triton_file: + core.add_framework_root(str(Path(triton_file).parent)) + except Exception as exc: + console_warning( + "ml api trace", + f"Could not register triton framework root: {exc}", + ) + + +def _extract_kernel_name(obj: object, default: str = "") -> str: + """Resolve the kernel name from ``name``, 
``metadata``, or ``fn``, + returning ``default`` when none is available.""" + name = getattr(obj, "name", None) + if isinstance(name, str) and name: + return name + + metadata = getattr(obj, "metadata", None) + if isinstance(metadata, dict): + meta_name = metadata.get("name") + if isinstance(meta_name, str) and meta_name: + return meta_name + else: + meta_name = getattr(metadata, "name", None) + if isinstance(meta_name, str) and meta_name: + return meta_name + + fn = getattr(obj, "fn", None) + fn_name = getattr(fn, "__name__", None) + if isinstance(fn_name, str) and fn_name: + return fn_name + + return default + + +def _run_with_marker( + self_obj: object, + marker_prefix: str, + thunk: Callable[[], Any], +) -> object: + """Run ``thunk`` inside a ROCTX range; nested launches reuse the outer range.""" + if _in_launch(): + return thunk() + kernel_name = _extract_kernel_name(self_obj) + marker = f"{marker_prefix}.{kernel_name}" + location = resolve_user_caller_location() + index = _next_launch_index(marker) + _thread_local.in_launch = True + pushed = False + try: + _push_scope(marker, f"#{index}@{location}", backend=_BACKEND_NAME) + pushed = True + return thunk() + finally: + if pushed: + _pop_scope() + _thread_local.in_launch = False + + +def _roctx_method_call( + instance: object, + marker_prefix: str, + original: Callable[..., Any], + *args: Any, + **kwargs: Any, +) -> object: + """Run a wrapped method ``original`` inside a ROCTX range.""" + return _run_with_marker( + instance, marker_prefix, partial(original, instance, *args, **kwargs) + ) + + +def _wrap_method( + owner: type, method_name: str, marker_prefix: str, original: Callable[..., Any] +) -> bool: + wrapper = partialmethod(_roctx_method_call, marker_prefix, original) + wrapper._roctx_wrapped = True + setattr(owner, method_name, wrapper) + return True + + +def _roctx_launch( + instance: object, + marker_prefix: str, + launcher: Callable[..., Any], + *args: Any, + **kwargs: Any, +) -> object: + """Run a 
property-returned ``launcher`` inside a ROCTX range.""" + return _run_with_marker(instance, marker_prefix, partial(launcher, *args, **kwargs)) + + +def _roctx_property_get( + marker_prefix: str, original_getter: Callable[..., Any], instance: object +) -> object: + """Property getter that wraps the launcher it returns with a ROCTX range.""" + launcher = original_getter(instance) + if launcher is None or getattr(launcher, "_roctx_launcher", False): + return launcher + wrapped = partial(_roctx_launch, instance, marker_prefix, launcher) + wrapped._roctx_launcher = True + return wrapped + + +def _wrap_property( + owner: type, method_name: str, marker_prefix: str, wrapped_property: property +) -> bool: + original_getter = wrapped_property.fget + if original_getter is None: + return False + getter = partial(_roctx_property_get, marker_prefix, original_getter) + getter._roctx_wrapped = True + setattr( + owner, + method_name, + property(getter, wrapped_property.fset, wrapped_property.fdel), + ) + return True + + +def _wrap_launch( + owner: type, + method_name: str, + marker_prefix: str, +) -> bool: + """Wrap ``owner.method_name`` (a method or property) with a ROCTX range. + + Idempotent. Returns True when the wrapper is installed or already present. 
+ """ + attr = inspect.getattr_static(owner, method_name, None) + if attr is None: + return False + if isinstance(attr, property): + if attr.fget is not None and getattr(attr.fget, "_roctx_wrapped", False): + return True + elif getattr(attr, "_roctx_wrapped", False): return True - def patch_launcher(self) -> None: - """Wrap CompiledKernel.__call__ so Triton/Inductor launches show in markers.""" - compiled_kernel = self._compiled_kernel - if compiled_kernel is None: - return + try: + if isinstance(attr, property): + installed = _wrap_property(owner, method_name, marker_prefix, attr) + else: + installed = _wrap_method(owner, method_name, marker_prefix, attr) + except Exception as exc: + console_warning( + "ml api trace", + f"Could not patch {owner.__name__}.{method_name}: {exc}", + ) + return False - original_call = getattr(compiled_kernel, "__call__", None) - if original_call is None: - return - if getattr(original_call, "_roctx_wrapped", False): - return + if installed: + console_log( + "ml api trace", + f"Wrapped {owner.__name__}.{method_name} with ROCTX markers", + ) + return installed - call_counts: dict[str, int] = {} - @wraps(original_call) - def call_with_roctx(kernel: object, *args: Any, **kwargs: Any) -> object: - kernel_name = ( - getattr(kernel, "name", None) - or getattr(kernel, "metadata", None) - or "" - ) - if isinstance(kernel_name, dict): - kernel_name = kernel_name.get("name", "") - marker = f"triton.CompiledKernel.{kernel_name}" - call_counts[marker] = call_counts.get(marker, 0) + 1 - location = resolve_user_caller_location() - _push_scope( - marker, - f"#{call_counts[marker]}@{location}", - backend=_BACKEND_NAME, - ) - try: - return original_call(kernel, *args, **kwargs) - finally: - _pop_scope() - - call_with_roctx._roctx_wrapped = True - try: - compiled_kernel.__call__ = call_with_roctx - console_log( - "ml api trace", - "Wrapped triton.CompiledKernel.__call__ with ROCTX markers", - ) - except Exception as exc: - console_warning( - "ml api 
trace", - f"Could not patch triton.CompiledKernel.__call__: {exc}", +def patch_triton_launcher() -> None: + """Wrap every available Triton launch entry point.""" + wrapped_any = False + jit_function = _STATE.jit_function + compiled_kernel = _STATE.compiled_kernel + if jit_function is not None: + wrapped_any |= _wrap_launch(jit_function, "run", "triton.JITFunction") + if compiled_kernel is not None: + # Prefer run(); fall back to __call__. + if hasattr(compiled_kernel, "run"): + wrapped_any |= _wrap_launch(compiled_kernel, "run", "triton.CompiledKernel") + else: + wrapped_any |= _wrap_launch( + compiled_kernel, "__call__", "triton.CompiledKernel" ) + if not wrapped_any: + console_warning( + "ml api trace", + "No Triton launch entry points found to instrument; " + "Triton API tracing may have no effect.", + ) + + +class TritonBackend: + name = "triton" def install(self) -> None: - if not self._resolve(): + if not _resolve_triton(): console_warning( "ml api trace", "Triton is not installed; skipping triton instrumentation.", ) return - if not ensure_python_tier(): + if not core.ensure_python_tier(): console_warning( "ml api trace", "ROCTX bindings not found; skipping triton instrumentation.", ) return - self.patch_launcher() + _register_framework_root() + patch_triton_launcher() register(TritonBackend()) diff --git a/projects/rocprofiler-compute/src/utils/inject_roctx/constants.py b/projects/rocprofiler-compute/src/utils/inject_roctx/constants.py index 445220249df..b1ebfb1d344 100644 --- a/projects/rocprofiler-compute/src/utils/inject_roctx/constants.py +++ b/projects/rocprofiler-compute/src/utils/inject_roctx/constants.py @@ -3,8 +3,5 @@ """Backend-selection constants for inject_roctx.""" -# Backends recognized by install_global_wraps and the "api" alias. -KNOWN_BACKENDS: tuple[str, ...] = ("torch", "triton") - -# The "api" alias selects every backend in KNOWN_BACKENDS. -API_ALIAS = "api" +# Backends recognized by install_global_wraps. 
+KNOWN_ML_API_BACKENDS: tuple[str, ...] = ("torch", "triton") diff --git a/projects/rocprofiler-compute/src/utils/inject_roctx/core.py b/projects/rocprofiler-compute/src/utils/inject_roctx/core.py index 41d7e6c4082..0862936afde 100644 --- a/projects/rocprofiler-compute/src/utils/inject_roctx/core.py +++ b/projects/rocprofiler-compute/src/utils/inject_roctx/core.py @@ -17,8 +17,6 @@ from pathlib import Path from typing import Callable, Union -from .constants import API_ALIAS, KNOWN_BACKENDS - def _missing_range_push(_label: str) -> None: raise RuntimeError( @@ -144,13 +142,21 @@ def resolve_user_caller_location() -> str: # "|" suffix attributes the scope to its backend. +def encode_marker_name(name: str) -> str: + """Percent-encode a marker segment so an embedded '/' is not read as the + frame separator. + """ + return name.replace("%", "%25").replace("/", "%2F") + + def compose_marker(marker: str, context: str, backend: str = "") -> str: """Return the wire-format string for a scope nested under the current - marker and context stacks. + marker and context stacks. Marker segments are percent-encoded. """ marker_stack = get_marker_stack() context_stack = get_context_stack() - full = "/".join([*marker_stack, marker]) + ":" + "/".join([*context_stack, context]) + op_path = "/".join(encode_marker_name(name) for name in [*marker_stack, marker]) + full = op_path + ":" + "/".join([*context_stack, context]) if backend: full = f"{full}|{backend}" return full @@ -186,7 +192,7 @@ def _pop_scope() -> None: def install_global_wraps(backends: Union[str, Iterable[str]] = "") -> None: """Install ROCTX instrumentation for each backend in backends. - "api" expands to every known backend. Empty input is a no-op. + Empty input is a no-op. 
""" from .registry import install_many @@ -195,13 +201,6 @@ def install_global_wraps(backends: Union[str, Iterable[str]] = "") -> None: else: names = [str(n).strip() for n in backends if str(n).strip()] - expanded: list[str] = [] - for n in names: - if n == API_ALIAS: - expanded.extend(KNOWN_BACKENDS) - else: - expanded.append(n) - - if not expanded: + if not names: return - install_many(expanded) + install_many(names) diff --git a/projects/rocprofiler-compute/src/utils/schema.py b/projects/rocprofiler-compute/src/utils/schema.py index fe4d77f90fa..ffc5c54cb78 100644 --- a/projects/rocprofiler-compute/src/utils/schema.py +++ b/projects/rocprofiler-compute/src/utils/schema.py @@ -48,7 +48,8 @@ class Workload: roofline_metrics: dict[int, dict[str, Any]] = field(default_factory=dict) path: str = field(default_factory=str) filter_top_n: str = field(default_factory=str) - matched_ml_api_trace_df: pd.DataFrame = field(default_factory=pd.DataFrame) + # Matched ML API trace rows keyed by backend, populated by operator filters. + matched_ml_api_trace_dfs: dict[str, pd.DataFrame] = field(default_factory=dict) # The prefix of raw pmc_perf.csv diff --git a/projects/rocprofiler-compute/src/utils/tty.py b/projects/rocprofiler-compute/src/utils/tty.py index 44f46a731fc..8c92dd594db 100644 --- a/projects/rocprofiler-compute/src/utils/tty.py +++ b/projects/rocprofiler-compute/src/utils/tty.py @@ -299,18 +299,23 @@ def is_roofline_shown( return True -def list_torch_operators( +def list_ml_operators( workload_path: str, call_trees: dict[str, CallTreeNode], + framework_label: str = "PyTorch", ) -> None: - """Display PyTorch operators as a unified call tree grouped by source location.""" + """Display operators as a unified call tree grouped by source location. + + ``framework_label`` sets the heading text (for example "PyTorch" or + "Triton"). 
+ """ if not call_trees: - print(f"\nPyTorch Operators in: {workload_path}") + print(f"\n{framework_label} Operators in: {workload_path}") print("Total: 0 operators") return print(f"\n{'=' * 80}") - print(f"PyTorch Operator Call Tree: {workload_path}") + print(f"{framework_label} Operator Call Tree: {workload_path}") print("Grouped by source location, sorted by total GPU kernel duration.") print(f"{'=' * 80}") show_call_tree(call_trees) diff --git a/projects/rocprofiler-compute/src/utils/utils_analysis.py b/projects/rocprofiler-compute/src/utils/utils_analysis.py index b4489b68019..96d866c2ec2 100644 --- a/projects/rocprofiler-compute/src/utils/utils_analysis.py +++ b/projects/rocprofiler-compute/src/utils/utils_analysis.py @@ -202,6 +202,11 @@ def rollup_node_stats(node: CallTreeNode) -> NodeRollup: ) +def decode_marker_name(name: str) -> str: + """Decode a percent-encoded marker segment ('%2F' -> '/', '%25' -> '%').""" + return name.replace("%2F", "/").replace("%25", "%") + + def build_call_trees( df: pd.DataFrame, ) -> dict[str, CallTreeNode]: @@ -254,7 +259,6 @@ def build_call_trees( call_trees[location] = CallTreeNode(name=location) location_root = call_trees[location] - op_segments = op_path.split("/") ctx_segments = ( str(context_id).split("/") if has_context_id and context_id is not None and pd.notna(context_id) @@ -262,7 +266,8 @@ def build_call_trees( ) current_node = location_root - for i, path_segment in enumerate(op_segments): + for i, encoded_segment in enumerate(op_path.split("/")): + path_segment = decode_marker_name(encoded_segment) if path_segment not in current_node.children: current_node.children[path_segment] = CallTreeNode(name=path_segment) current_node = current_node.children[path_segment] diff --git a/projects/rocprofiler-compute/src/utils/utils_profile.py b/projects/rocprofiler-compute/src/utils/utils_profile.py index 89065510538..59d369cb53d 100644 --- a/projects/rocprofiler-compute/src/utils/utils_profile.py +++ 
b/projects/rocprofiler-compute/src/utils/utils_profile.py @@ -15,7 +15,7 @@ import config import utils.utils_profile_csv as csv_ops from utils import rocpd_data -from utils.inject_roctx.constants import KNOWN_BACKENDS +from utils.inject_roctx.constants import KNOWN_ML_API_BACKENDS from utils.logger import ( console_debug, console_error, @@ -42,7 +42,7 @@ # inject_roctx appends a trailing "|" suffix to marker names. _UNKNOWN_BACKEND = "unknown" _BACKEND_SUFFIX_RE = re.compile( - r"\|(" + "|".join(re.escape(b) for b in KNOWN_BACKENDS) + r")$" + r"\|(" + "|".join(re.escape(b) for b in KNOWN_ML_API_BACKENDS) + r")$" ) diff --git a/projects/rocprofiler-compute/tests/CMakeLists.txt b/projects/rocprofiler-compute/tests/CMakeLists.txt index 7370fb18e3b..507f2d7132f 100644 --- a/projects/rocprofiler-compute/tests/CMakeLists.txt +++ b/projects/rocprofiler-compute/tests/CMakeLists.txt @@ -109,3 +109,21 @@ add_custom_target( ${CMAKE_SOURCE_DIR}/tests/simple_net.py COMMENT "Copying simple_net.py to tests directory" ) + +add_custom_target( + triton_ffn + ALL + COMMAND + ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/sample/triton_ffn.py + ${CMAKE_SOURCE_DIR}/tests/triton_ffn.py + COMMENT "Copying triton_ffn.py to tests directory" +) + +add_custom_target( + torch_compile_triton + ALL + COMMAND + ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/sample/torch_compile_triton.py + ${CMAKE_SOURCE_DIR}/tests/torch_compile_triton.py + COMMENT "Copying torch_compile_triton.py to tests directory" +) diff --git a/projects/rocprofiler-compute/tests/conftest.py b/projects/rocprofiler-compute/tests/conftest.py index 88812204e71..92d7cbae8bb 100644 --- a/projects/rocprofiler-compute/tests/conftest.py +++ b/projects/rocprofiler-compute/tests/conftest.py @@ -203,6 +203,17 @@ def require_torch(*, gpu: bool = False) -> None: pytest.skip("torch.cuda.is_available() is False") +def require_triton(*, gpu: bool = False) -> None: + """Skip when Triton, or the PyTorch/GPU stack it requires, is unavailable.""" + 
require_torch(gpu=gpu) + if importlib.util.find_spec("triton") is None: + pytest.skip("Triton is not installed") + try: + import triton # noqa: F401 + except Exception as e: + pytest.skip(f"Triton import failed: {type(e).__name__}: {e}") + + @pytest.fixture(autouse=True) def skip_monkeypatch_with_binary(request): """Skip monkeypatch tests under --call-binary (patches don't cross processes).""" diff --git a/projects/rocprofiler-compute/tests/test_analyze_workloads.py b/projects/rocprofiler-compute/tests/test_analyze_workloads.py index 1769127b09c..84f071bd6c3 100644 --- a/projects/rocprofiler-compute/tests/test_analyze_workloads.py +++ b/projects/rocprofiler-compute/tests/test_analyze_workloads.py @@ -16,6 +16,13 @@ # Cached PyTorch trace workload shared by the torch-operator analyze tests. TORCH_TRACE_WORKLOAD = "tests/workloads/torch_trace/MI300X_A1" +# Cached Triton trace workload (triton_ffn.py): Triton kernels only. +TRITON_TRACE_WORKLOAD = "tests/workloads/triton_trace/MI300A" + +# Cached ML API trace workload (torch_compile_triton.py): both PyTorch and +# Triton operators in a single run. +ML_API_TRACE_WORKLOAD = "tests/workloads/ml_api_trace/MI300A" + # 30 workloads common to MI100, MI200, MI300A_A1, MI300X_A1. 
CDNA_WORKLOADS = [ "device_filter", @@ -211,7 +218,7 @@ def test_analyze_torch_trace_invalid_operator_MI300X_A1( assert code == 0 output = capsys.readouterr().out - assert "No operators matched" in output + assert "No PyTorch operators matched" in output common.clean_output_dir(config["cleanup"], workload_dir) @@ -263,3 +270,200 @@ def test_analyze_torch_trace_torch_prefix_MI300X_A1( assert "dispatches:" in output common.clean_output_dir(config["cleanup"], workload_dir) + + +################################################## +## Triton trace analysis tests ## +################################################## + + +def test_analyze_triton_trace_list_operators_MI300A( + binary_handler_analyze_rocprof_compute, capsys +): + workload_dir = common.setup_workload_dir(TRITON_TRACE_WORKLOAD) + + code = binary_handler_analyze_rocprof_compute([ + "--experimental", + "analyze", + "--path", + workload_dir, + "--list-triton-operators", + ]) + assert code == 0 + + output = capsys.readouterr().out + + assert "Triton Operator Call Tree:" in output + assert "Grouped by source location" in output + assert "triton.JITFunction.matmul_kernel" in output + assert "rmsnorm_kernel" in output + assert "dispatches:" in output + assert "total:" in output + + common.clean_output_dir(config["cleanup"], workload_dir) + + +def test_analyze_triton_trace_filter_operator_MI300A( + binary_handler_analyze_rocprof_compute, capsys +): + workload_dir = common.setup_workload_dir(TRITON_TRACE_WORKLOAD) + + code = binary_handler_analyze_rocprof_compute([ + "--experimental", + "analyze", + "--path", + workload_dir, + "--triton-operator", + "*matmul*", + ]) + assert code == 0 + + output = capsys.readouterr().out + + assert "Matched Triton Operators:" in output + assert "matmul_kernel" in output + assert "dispatches:" in output + + common.clean_output_dir(config["cleanup"], workload_dir) + + +def test_analyze_triton_trace_multi_operator_MI300A( + binary_handler_analyze_rocprof_compute, capsys +): + 
workload_dir = common.setup_workload_dir(TRITON_TRACE_WORKLOAD) + + code = binary_handler_analyze_rocprof_compute([ + "--experimental", + "analyze", + "--path", + workload_dir, + "--triton-operator", + "*matmul*", + "*silu*", + ]) + assert code == 0 + + output = capsys.readouterr().out + + assert "Matched Triton Operators:" in output + assert "matmul_kernel" in output + assert "silu_kernel" in output + + common.clean_output_dir(config["cleanup"], workload_dir) + + +def test_analyze_triton_trace_invalid_operator_MI300A( + binary_handler_analyze_rocprof_compute, capsys +): + workload_dir = common.setup_workload_dir(TRITON_TRACE_WORKLOAD) + + code = binary_handler_analyze_rocprof_compute([ + "--experimental", + "analyze", + "--path", + workload_dir, + "--triton-operator", + "nonexistent_op", + ]) + assert code == 0 + + output = capsys.readouterr().out + assert "No Triton operators matched" in output + + common.clean_output_dir(config["cleanup"], workload_dir) + + +################################################## +## ML API trace analysis tests ## +################################################## + + +def test_analyze_ml_api_trace_list_triton_operators_MI300A( + binary_handler_analyze_rocprof_compute, capsys +): + workload_dir = common.setup_workload_dir(ML_API_TRACE_WORKLOAD) + + code = binary_handler_analyze_rocprof_compute([ + "--experimental", + "analyze", + "--path", + workload_dir, + "--list-triton-operators", + ]) + assert code == 0 + + output = capsys.readouterr().out + + assert "Triton Operator Call Tree:" in output + assert "torch.compile.fused" in output + assert "triton_poi_fused_add_mul_relu_0" in output + + common.clean_output_dir(config["cleanup"], workload_dir) + + +def test_analyze_ml_api_trace_list_torch_operators_MI300A( + binary_handler_analyze_rocprof_compute, capsys +): + workload_dir = common.setup_workload_dir(ML_API_TRACE_WORKLOAD) + + code = binary_handler_analyze_rocprof_compute([ + "--experimental", + "analyze", + "--path", + 
workload_dir, + "--list-torch-operators", + ]) + assert code == 0 + + output = capsys.readouterr().out + + assert "PyTorch Operator Call Tree:" in output + assert "aten::randn" in output + + common.clean_output_dir(config["cleanup"], workload_dir) + + +def test_analyze_ml_api_trace_filter_triton_operator_MI300A( + binary_handler_analyze_rocprof_compute, capsys +): + workload_dir = common.setup_workload_dir(ML_API_TRACE_WORKLOAD) + + code = binary_handler_analyze_rocprof_compute([ + "--experimental", + "analyze", + "--path", + workload_dir, + "--triton-operator", + "all", + ]) + assert code == 0 + + output = capsys.readouterr().out + + assert "Matched Triton Operators:" in output + assert "triton_poi_fused_add_mul_relu_0" in output + + common.clean_output_dir(config["cleanup"], workload_dir) + + +def test_analyze_ml_api_trace_filter_torch_operator_MI300A( + binary_handler_analyze_rocprof_compute, capsys +): + workload_dir = common.setup_workload_dir(ML_API_TRACE_WORKLOAD) + + code = binary_handler_analyze_rocprof_compute([ + "--experimental", + "analyze", + "--path", + workload_dir, + "--torch-operator", + "*randn*", + ]) + assert code == 0 + + output = capsys.readouterr().out + + assert "Matched PyTorch Operators:" in output + assert "randn" in output + + common.clean_output_dir(config["cleanup"], workload_dir) diff --git a/projects/rocprofiler-compute/tests/test_categories.yaml b/projects/rocprofiler-compute/tests/test_categories.yaml index 556eed5e2df..7aea2d75ec2 100644 --- a/projects/rocprofiler-compute/tests/test_categories.yaml +++ b/projects/rocprofiler-compute/tests/test_categories.yaml @@ -105,6 +105,7 @@ test_categories: - test_soc_base - test-pc-sampling-collector - test_profile_torch_trace + - test_profile_triton_trace - test_profile_live_attach_detach - test_metric_validation # Remove once ROCm/TheRock#6087 lands and the rocm-systems hash is bumped in TheRock; therock_ci_exclude replaces this. 
@@ -172,6 +173,7 @@ test_categories: - test_soc_base - test-pc-sampling-collector - test_profile_torch_trace + - test_profile_triton_trace - test_profile_live_attach_detach - test_metric_validation # Remove once ROCm/TheRock#6087 lands and the rocm-systems hash is bumped in TheRock; therock_ci_exclude replaces this. @@ -239,6 +241,7 @@ test_categories: - test_soc_base - test-pc-sampling-collector - test_profile_torch_trace + - test_profile_triton_trace - test_profile_live_attach_detach - test_metric_validation # Remove once ROCm/TheRock#6087 lands and the rocm-systems hash is bumped in TheRock; therock_ci_exclude replaces this. diff --git a/projects/rocprofiler-compute/tests/test_inject_roctx_package.py b/projects/rocprofiler-compute/tests/test_inject_roctx_package.py index 46e7e458530..0750f347d79 100644 --- a/projects/rocprofiler-compute/tests/test_inject_roctx_package.py +++ b/projects/rocprofiler-compute/tests/test_inject_roctx_package.py @@ -5,6 +5,7 @@ ``core.install_global_wraps``, ``registry.install_many``, ``TritonBackend``, and ``core._push_scope`` / ``_pop_scope``.""" +import functools import importlib import sys import types @@ -12,6 +13,35 @@ import common # noqa: F401 import pytest +# --------------------------------------------------------------------------- +# Test helpers +# --------------------------------------------------------------------------- + + +def record_call(calls, names): + """Append a snapshot of ``names`` to ``calls`` (an ``install_many`` spy).""" + calls.append(list(names)) + + +def find_spec_without_name(absent_name, real_find_spec, name, *args, **kwargs): + """Behave like ``find_spec`` but report ``absent_name`` as missing.""" + if name == absent_name: + return None + return real_find_spec(name, *args, **kwargs) + + +def raise_import_skipped(*args, **kwargs): + """Raise to assert that an import path is never reached.""" + raise AssertionError("roctx import should be skipped") + + +def make_backend(name, install_fn=None): + backend 
= types.SimpleNamespace() + backend.name = name + backend.install = install_fn or (lambda: None) + return backend + + # --------------------------------------------------------------------------- # install_global_wraps # --------------------------------------------------------------------------- @@ -23,11 +53,9 @@ def captured_install(monkeypatch): from utils.inject_roctx import registry as registry_pkg calls: list[list[str]] = [] - - def _record(names): - calls.append(list(names)) - - monkeypatch.setattr(registry_pkg, "install_many", _record) + monkeypatch.setattr( + registry_pkg, "install_many", functools.partial(record_call, calls) + ) return calls @@ -60,20 +88,6 @@ def test_install_global_wraps_iterable_input(captured_install): assert captured_install == [["torch", "triton"]] -def test_install_global_wraps_api_alias_expands(captured_install): - from utils.inject_roctx.core import install_global_wraps - - install_global_wraps("api") - assert captured_install == [["torch", "triton"]] - - -def test_install_global_wraps_api_alongside_explicit_name(captured_install): - from utils.inject_roctx.core import install_global_wraps - - install_global_wraps("api,torch") - assert captured_install == [["torch", "triton", "torch"]] - - # --------------------------------------------------------------------------- # registry.install_many # --------------------------------------------------------------------------- @@ -88,17 +102,10 @@ def fresh_registry(monkeypatch): return registry_pkg -def _make_backend(name, install_fn=None): - backend = types.SimpleNamespace() - backend.name = name - backend.install = install_fn or (lambda: None) - return backend - - def test_install_many_invokes_registered_backends(fresh_registry): calls: list[str] = [] - fresh_registry.register(_make_backend("alpha", lambda: calls.append("alpha"))) - fresh_registry.register(_make_backend("beta", lambda: calls.append("beta"))) + fresh_registry.register(make_backend("alpha", lambda: calls.append("alpha"))) 
+ fresh_registry.register(make_backend("beta", lambda: calls.append("beta"))) fresh_registry.install_many(["alpha", "beta"]) assert calls == ["alpha", "beta"] @@ -106,7 +113,7 @@ def test_install_many_invokes_registered_backends(fresh_registry): def test_install_many_dedupes_duplicate_names(fresh_registry): calls: list[str] = [] - fresh_registry.register(_make_backend("alpha", lambda: calls.append("alpha"))) + fresh_registry.register(make_backend("alpha", lambda: calls.append("alpha"))) fresh_registry.install_many(["alpha", "alpha", "alpha"]) assert calls == ["alpha"] @@ -118,9 +125,9 @@ def test_install_many_continues_after_backend_failure(fresh_registry, monkeypatc other_calls: list[str] = [] fresh_registry.register( - _make_backend("bad", lambda: (_ for _ in ()).throw(RuntimeError("boom"))) + make_backend("bad", lambda: (_ for _ in ()).throw(RuntimeError("boom"))) ) - fresh_registry.register(_make_backend("good", lambda: other_calls.append("good"))) + fresh_registry.register(make_backend("good", lambda: other_calls.append("good"))) fresh_registry.install_many(["bad", "good"]) assert other_calls == ["good"] @@ -160,13 +167,11 @@ def test_triton_backend_skips_when_triton_missing(monkeypatch): from utils.inject_roctx._backends import triton as triton_backend real_find_spec = importlib.util.find_spec - - def _no_triton(name, *args, **kwargs): - if name == "triton": - return None - return real_find_spec(name, *args, **kwargs) - - monkeypatch.setattr(importlib.util, "find_spec", _no_triton) + monkeypatch.setattr( + importlib.util, + "find_spec", + functools.partial(find_spec_without_name, "triton", real_find_spec), + ) warnings: list[tuple] = [] monkeypatch.setattr( @@ -179,39 +184,34 @@ def _no_triton(name, *args, **kwargs): ) -def test_triton_backend_wraps_compiled_kernel_call(monkeypatch): +def test_triton_backend_wraps_compiled_kernel_run(monkeypatch): + """CompiledKernel.run() is wrapped in preference to __call__.""" from utils.inject_roctx._backends import 
triton as triton_backend pushes: list[tuple] = [] - pops: list[None] = [] monkeypatch.setattr( triton_backend, "_push_scope", - lambda marker, ctx, backend="": pushes.append((marker, ctx, backend)), + lambda marker, ctx, backend="": pushes.append((marker, backend)), ) - monkeypatch.setattr(triton_backend, "_pop_scope", lambda: pops.append(None)) + monkeypatch.setattr(triton_backend, "_pop_scope", lambda: None) + monkeypatch.setattr(triton_backend._STATE, "jit_function", None) - class FakeKernel: - name = "my_kernel" + class FakeCompiledKernel: + name = "rk" - def __call__(self, *a, **kw): - return ("ran", a, kw) + def run(self, *a, **kw): + return "ran" - backend = triton_backend.TritonBackend() - backend._compiled_kernel = FakeKernel - backend.patch_launcher() + monkeypatch.setattr(triton_backend._STATE, "compiled_kernel", FakeCompiledKernel) + triton_backend.patch_triton_launcher() - out = FakeKernel()(1, x=2) - assert out == ("ran", (1,), {"x": 2}) - assert len(pushes) == 1 - marker, ctx, backend = pushes[0] - assert marker == "triton.CompiledKernel.my_kernel" - assert ctx.startswith("#1@") - assert backend == "triton" - assert pops == [None] + assert FakeCompiledKernel().run() == "ran" + assert pushes == [("triton.CompiledKernel.rk", "triton")] -def test_triton_backend_kernel_name_fallbacks(monkeypatch): +def test_triton_backend_wraps_jitfunction_run(monkeypatch): + """JITFunction.run is wrapped for eager launches.""" from utils.inject_roctx._backends import triton as triton_backend pushes: list[str] = [] @@ -221,46 +221,167 @@ def test_triton_backend_kernel_name_fallbacks(monkeypatch): lambda marker, ctx, backend="": pushes.append(marker), ) monkeypatch.setattr(triton_backend, "_pop_scope", lambda: None) + monkeypatch.setattr(triton_backend._STATE, "compiled_kernel", None) - class KernelWithDictMeta: - metadata = {"name": "from_meta"} + class FakeJIT: + def __init__(self): + self.fn = types.SimpleNamespace(__name__="add_kernel") - def __call__(self): - pass + 
def run(self, *a, **kw): + return "launched" - class KernelNoName: - def __call__(self): - pass + monkeypatch.setattr(triton_backend._STATE, "jit_function", FakeJIT) + triton_backend.patch_triton_launcher() + + assert FakeJIT().run() == "launched" + assert pushes == ["triton.JITFunction.add_kernel"] + + +def test_triton_backend_reentrancy_dedups_nested_launch(monkeypatch): + """Nested JITFunction.run and CompiledKernel.run emit one marker.""" + from utils.inject_roctx._backends import triton as triton_backend + + pushes: list[str] = [] + monkeypatch.setattr( + triton_backend, + "_push_scope", + lambda marker, ctx, backend="": pushes.append(marker), + ) + monkeypatch.setattr(triton_backend, "_pop_scope", lambda: None) + # Reset the per-thread guard. + if hasattr(triton_backend._thread_local, "in_launch"): + del triton_backend._thread_local.in_launch + + class FakeCompiledKernel: + name = "inner" - for cls, expected in ( - (KernelWithDictMeta, "triton.CompiledKernel.from_meta"), - (KernelNoName, "triton.CompiledKernel."), - ): - backend = triton_backend.TritonBackend() - backend._compiled_kernel = cls - backend.patch_launcher() - cls()() - assert pushes[-1] == expected + def run(self, *a, **kw): + return "inner_ran" + + class FakeJIT: + name = "outer" + + def __init__(self, compiled): + self._compiled = compiled + + def run(self, *a, **kw): + return self._compiled.run() + + monkeypatch.setattr(triton_backend._STATE, "compiled_kernel", FakeCompiledKernel) + monkeypatch.setattr(triton_backend._STATE, "jit_function", FakeJIT) + triton_backend.patch_triton_launcher() + + compiled = FakeCompiledKernel() + out = FakeJIT(compiled).run() + + assert out == "inner_ran" + assert pushes == ["triton.JITFunction.outer"] def test_triton_backend_patch_is_idempotent(monkeypatch): + """Patching twice does not re-wrap the launch entry point.""" from utils.inject_roctx._backends import triton as triton_backend - monkeypatch.setattr(triton_backend, "_push_scope", lambda *a, **k: None) + 
pushes: list[str] = [] + monkeypatch.setattr( + triton_backend, + "_push_scope", + lambda marker, ctx, backend="": pushes.append(marker), + ) monkeypatch.setattr(triton_backend, "_pop_scope", lambda: None) + # Reset the per-thread guard. + if hasattr(triton_backend._thread_local, "in_launch"): + del triton_backend._thread_local.in_launch - class FakeKernel: + class FakeJIT: name = "k" - def __call__(self): - pass + def run(self, *a, **kw): + return "ran" + + monkeypatch.setattr(triton_backend._STATE, "compiled_kernel", None) + monkeypatch.setattr(triton_backend._STATE, "jit_function", FakeJIT) + + triton_backend.patch_triton_launcher() + wrapped_once = FakeJIT.__dict__["run"] + triton_backend.patch_triton_launcher() + + assert FakeJIT.__dict__["run"] is wrapped_once, ( + "second patch re-wrapped the launcher" + ) + assert FakeJIT().run() == "ran" + assert pushes == ["triton.JITFunction.k"], "exactly one marker per launch" + + +def test_triton_backend_registers_framework_root(monkeypatch): + """install() registers triton's package directory as a framework root.""" + from utils.inject_roctx._backends import triton as triton_backend + + monkeypatch.setattr(triton_backend, "_resolve_triton", lambda: True) + monkeypatch.setattr(triton_backend, "patch_triton_launcher", lambda: None) + + fake_triton = types.ModuleType("triton") + fake_triton.__file__ = "/opt/fake/triton/__init__.py" + monkeypatch.setitem(sys.modules, "triton", fake_triton) + + roots: list[str] = [] + monkeypatch.setattr( + triton_backend.core, "add_framework_root", lambda p: roots.append(p) + ) + + triton_backend.TritonBackend().install() + assert roots == ["/opt/fake/triton"] - backend = triton_backend.TritonBackend() - backend._compiled_kernel = FakeKernel - backend.patch_launcher() - first = FakeKernel.__call__ - backend.patch_launcher() - assert FakeKernel.__call__ is first + +def test_triton_backend_skips_when_python_tier_unavailable(monkeypatch): + from utils.inject_roctx._backends import triton as 
triton_backend + + monkeypatch.setattr(triton_backend, "_resolve_triton", lambda: True) + monkeypatch.setattr(triton_backend.core, "ensure_python_tier", lambda: False) + + patched: list[bool] = [] + monkeypatch.setattr( + triton_backend, "patch_triton_launcher", lambda: patched.append(True) + ) + warnings: list[tuple] = [] + monkeypatch.setattr( + triton_backend, "console_warning", lambda *a: warnings.append(a) + ) + + triton_backend.TritonBackend().install() + assert patched == [] + assert any("ROCTX bindings not found" in str(a[1]) for a in warnings if len(a) > 1) + + +def test_ensure_python_tier_short_circuits_when_already_configured(monkeypatch): + from utils.inject_roctx import core + + saved_push, saved_pop = core._STATE.range_push, core._STATE.range_pop + try: + core.set_python_tier_io(lambda _s: None, lambda: None) + + monkeypatch.setattr(core.importlib, "import_module", raise_import_skipped) + assert core.ensure_python_tier() is True + finally: + core._STATE.range_push, core._STATE.range_pop = saved_push, saved_pop + + +def test_extract_kernel_name_prefers_attr_then_meta_then_fn(): + from utils.inject_roctx._backends import triton as triton_backend + + named = types.SimpleNamespace(name="direct") + assert triton_backend._extract_kernel_name(named) == "direct" + + meta = types.SimpleNamespace(metadata={"name": "meta_name"}) + assert triton_backend._extract_kernel_name(meta) == "meta_name" + + via_fn = types.SimpleNamespace(fn=types.SimpleNamespace(__name__="fn_name")) + assert triton_backend._extract_kernel_name(via_fn) == "fn_name" + + assert ( + triton_backend._extract_kernel_name(types.SimpleNamespace()) + == "" + ) # --------------------------------------------------------------------------- diff --git a/projects/rocprofiler-compute/tests/test_profile_general.py b/projects/rocprofiler-compute/tests/test_profile_general.py index 23ecda88584..74220f1a420 100644 --- a/projects/rocprofiler-compute/tests/test_profile_general.py +++ 
b/projects/rocprofiler-compute/tests/test_profile_general.py @@ -16,7 +16,7 @@ import pandas as pd import pytest import yaml -from conftest import require_torch +from conftest import require_torch, require_triton from utils.utils_common import canonical_config_arch @@ -43,6 +43,8 @@ config["app_mpi_aware_laplace_eqn"] = ["./tests/mpi_aware_laplace_eqn", "-i", "5"] config["rocflop"] = ["./tests/rocflop", "--device", "0", "--fp16"] config["torch_test_app"] = ["python3", "./tests/simple_net.py"] +config["triton_test_app"] = ["python3", "./tests/triton_ffn.py"] +config["torch_compile_test_app"] = ["python3", "./tests/torch_compile_triton.py"] config["cleanup"] = True config["METRIC_COMPARE"] = False config["METRIC_LOGGING"] = False @@ -3003,13 +3005,252 @@ def test_torch_trace_profile( "Analyze with non-matching --torch-operator should not crash" ) out_nomatch = capsys.readouterr().out - assert "No operators matched" in out_nomatch, ( + assert "No PyTorch operators matched" in out_nomatch, ( "Expected warning about no operators matched" ) common.clean_output_dir(config["cleanup"], workload_dir) +@pytest.mark.triton_trace +def test_triton_trace_profile( + binary_handler_profile_rocprof_compute, + binary_handler_analyze_rocprof_compute, + capsys, +): + """ + Profile and analyze flow for the Triton backend. + + Profiles a Triton FFN workload with --triton-trace, verifies the marker and + counter CSV outputs contain Triton markers, then runs analyze with + --list-triton-operators and --triton-operator and checks the call-tree + banner, the consolidated ml_api_trace CSV, and the matched and no-match output. + Requires PyTorch, Triton, and a GPU. 
+ """ + require_triton(gpu=True) + workload_dir = common.get_output_dir(param_id="triton_trace") + + options = [ + "--experimental", + "--triton-trace", + "--iteration-multiplexing", + ] + + returncode = binary_handler_profile_rocprof_compute( + config, + workload_dir, + options, + check_success=True, + app_name="triton_test_app", + ) + + # ---- Profiling output ---- + + assert returncode == 0, "Profiling the Triton application failed" + + marker_api_trace_files = list(Path(workload_dir).glob("**/*marker_api_trace.csv")) + counter_collection_files = list( + Path(workload_dir).glob("**/*counter_collection.csv") + ) + assert marker_api_trace_files, "No marker_api_trace.csv produced" + assert len(marker_api_trace_files) == len(counter_collection_files), ( + "marker_api_trace.csv and counter_collection.csv counts differ" + ) + + found_triton_marker = False + for marker_file in marker_api_trace_files: + with open(marker_file, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + assert reader.fieldnames is not None, f"No columns in {marker_file}" + assert "Function" in reader.fieldnames, ( + f"'Function' column missing in {marker_file}" + ) + for row in reader: + if "triton" in str(row["Function"]).lower(): + found_triton_marker = True + break + if found_triton_marker: + break + assert found_triton_marker, "No Triton markers in marker_api_trace output" + + # Flush profiling output so capsys captures only the analyze output. + capsys.readouterr() + + # ---- analyze --list-triton-operators ---- + + returncode_list = binary_handler_analyze_rocprof_compute([ + "--experimental", + "analyze", + "--path", + workload_dir, + "--list-triton-operators", + ]) + assert returncode_list == 0, "Analyze with --list-triton-operators failed" + + list_output = capsys.readouterr().out + assert "Triton Operator Call Tree:" in list_output, "Missing call-tree banner" + # The workload launches a Triton matmul kernel. 
+ assert "matmul" in list_output, "matmul kernel missing from operator list" + + consolidated_csv = Path(workload_dir) / "ml_api_trace" / "consolidated.csv" + assert consolidated_csv.exists(), "consolidated.csv not found in ml_api_trace" + df = pd.read_csv(consolidated_csv) + assert not df.empty, "consolidated.csv is empty" + assert "Operator_Name" in df.columns, "Operator_Name column missing" + assert df["Operator_Name"].astype(str).str.contains("triton").any(), ( + "No Triton operators in consolidated.csv" + ) + + # ---- analyze --triton-operator ---- + + capsys.readouterr() + returncode_match = binary_handler_analyze_rocprof_compute([ + "--experimental", + "analyze", + "--path", + workload_dir, + "--triton-operator", + "*matmul*", + ]) + assert returncode_match == 0, "Analyze with --triton-operator *matmul* failed" + out_match = capsys.readouterr().out + assert "Matched Triton Operators" in out_match, "Missing matched-operators header" + assert "matmul" in out_match, "matmul kernel missing from matched output" + + capsys.readouterr() + returncode_nomatch = binary_handler_analyze_rocprof_compute([ + "--experimental", + "analyze", + "--path", + workload_dir, + "--triton-operator", + "nonexistent_kernel_xyz", + ]) + assert returncode_nomatch == 0, ( + "Analyze with a non-matching --triton-operator failed" + ) + out_nomatch = capsys.readouterr().out + assert "No Triton operators matched" in out_nomatch, "Missing no-match warning" + + common.clean_output_dir(config["cleanup"], workload_dir) + + +@pytest.mark.triton_trace +def test_ml_api_trace_torch_compile_triton( + binary_handler_profile_rocprof_compute, + binary_handler_analyze_rocprof_compute, + capsys, +): + """ + ML API trace flow for a torch.compile workload that generates Triton kernels. + + Profiles sample/torch_compile_triton.py with --ml-api-trace, verifies Triton + markers reach the marker and counter CSVs, then analyzes the consolidated + ml_api_trace with --triton-operator and --torch-operator. 
+ Requires PyTorch, Triton, and a GPU. + """ + require_triton(gpu=True) + workload_dir = common.get_output_dir(param_id="ml_api_trace") + + options = [ + "--experimental", + "--ml-api-trace", + "--iteration-multiplexing", + ] + + returncode = binary_handler_profile_rocprof_compute( + config, + workload_dir, + options, + check_success=True, + app_name="torch_compile_test_app", + ) + + # ---- Profiling output ---- + + assert returncode == 0, "Profiling the torch.compile/Triton workload failed" + + marker_api_trace_files = list(Path(workload_dir).glob("**/*marker_api_trace.csv")) + counter_collection_files = list( + Path(workload_dir).glob("**/*counter_collection.csv") + ) + assert marker_api_trace_files, "No marker_api_trace.csv produced" + assert len(marker_api_trace_files) == len(counter_collection_files), ( + "marker_api_trace.csv and counter_collection.csv counts differ" + ) + + found_triton_marker = False + for marker_file in marker_api_trace_files: + with open(marker_file, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + assert reader.fieldnames is not None, f"No columns in {marker_file}" + assert "Function" in reader.fieldnames, ( + f"'Function' column missing in {marker_file}" + ) + for row in reader: + if "triton" in str(row["Function"]).lower(): + found_triton_marker = True + break + if found_triton_marker: + break + assert found_triton_marker, "No Triton markers in marker_api_trace output" + + # Flush profiling output so capsys captures only the analyze output. 
+ capsys.readouterr() + + # ---- Consolidated ml_api_trace ---- + + returncode_list = binary_handler_analyze_rocprof_compute([ + "--experimental", + "analyze", + "--path", + workload_dir, + "--list-triton-operators", + ]) + assert returncode_list == 0, "Analyze with --list-triton-operators failed" + list_output = capsys.readouterr().out + assert "Triton Operator Call Tree:" in list_output, "Missing call-tree banner" + + consolidated_csv = Path(workload_dir) / "ml_api_trace" / "consolidated.csv" + assert consolidated_csv.exists(), "consolidated.csv not found in ml_api_trace" + df = pd.read_csv(consolidated_csv) + assert not df.empty, "consolidated.csv is empty" + assert "Operator_Name" in df.columns, "Operator_Name column missing" + assert df["Operator_Name"].astype(str).str.contains("triton").any(), ( + "No Triton operators in consolidated.csv" + ) + + # ---- analyze --triton-operator ---- + + capsys.readouterr() + returncode_triton = binary_handler_analyze_rocprof_compute([ + "--experimental", + "analyze", + "--path", + workload_dir, + "--triton-operator", + "all", + ]) + assert returncode_triton == 0, "Analyze with --triton-operator all failed" + out_triton = capsys.readouterr().out + assert "Matched Triton Operators" in out_triton, "Missing matched-operators header" + + # Torch operators may be absent under torch.compile; the analyze run only + # needs to complete successfully. 
+ capsys.readouterr() + returncode_torch = binary_handler_analyze_rocprof_compute([ + "--experimental", + "analyze", + "--path", + workload_dir, + "--torch-operator", + "all", + ]) + assert returncode_torch == 0, "Analyze with --torch-operator all failed" + + common.clean_output_dir(config["cleanup"], workload_dir) + + @pytest.mark.torch_trace def test_torch_trace_overhead(binary_handler_profile_rocprof_compute): """ diff --git a/projects/rocprofiler-compute/tests/test_utils.py b/projects/rocprofiler-compute/tests/test_utils.py index 1885e518eb6..5c1662d2c46 100644 --- a/projects/rocprofiler-compute/tests/test_utils.py +++ b/projects/rocprofiler-compute/tests/test_utils.py @@ -6029,7 +6029,7 @@ def test_display_empty_inputs(): assert get_matched_torch_operators_for_display({"x": pd.DataFrame()}, []) == [] -# -- parse_torch_operator_patterns ------------------------------------------ +# -- parse_operator_patterns (torch_operator) ------------------------------- @pytest.mark.torch_ops @@ -6037,13 +6037,13 @@ def test_parse_patterns_basic(): """Single and multiple patterns are parsed correctly.""" from argparse import Namespace - from rocprof_compute_analyze.analysis_cli import parse_torch_operator_patterns + from rocprof_compute_analyze.analysis_cli import parse_operator_patterns args = Namespace(torch_operator=["relu"]) - assert parse_torch_operator_patterns(args) == ["relu"] + assert parse_operator_patterns(args, "torch_operator") == ["relu"] args = Namespace(torch_operator=["relu", "conv2d"]) - assert parse_torch_operator_patterns(args) == ["relu", "conv2d"] + assert parse_operator_patterns(args, "torch_operator") == ["relu", "conv2d"] @pytest.mark.torch_ops @@ -6051,10 +6051,10 @@ def test_parse_patterns_comma_split(): """Comma-separated patterns in a single arg are split.""" from argparse import Namespace - from rocprof_compute_analyze.analysis_cli import parse_torch_operator_patterns + from rocprof_compute_analyze.analysis_cli import parse_operator_patterns 
args = Namespace(torch_operator=["relu,conv2d"]) - assert parse_torch_operator_patterns(args) == ["relu", "conv2d"] + assert parse_operator_patterns(args, "torch_operator") == ["relu", "conv2d"] @pytest.mark.torch_ops @@ -6062,10 +6062,11 @@ def test_parse_patterns_whitespace(): """Leading/trailing whitespace is stripped.""" from argparse import Namespace - from rocprof_compute_analyze.analysis_cli import parse_torch_operator_patterns + from rocprof_compute_analyze.analysis_cli import parse_operator_patterns args = Namespace(torch_operator=[" relu ", " conv2d , linear "]) - assert parse_torch_operator_patterns(args) == ["relu", "conv2d", "linear"] + result = parse_operator_patterns(args, "torch_operator") + assert result == ["relu", "conv2d", "linear"] @pytest.mark.torch_ops @@ -6073,11 +6074,62 @@ def test_parse_patterns_empty(): """Flag given with no args defaults to '**'; absent flag returns empty.""" from argparse import Namespace - from rocprof_compute_analyze.analysis_cli import parse_torch_operator_patterns + from rocprof_compute_analyze.analysis_cli import parse_operator_patterns - assert parse_torch_operator_patterns(Namespace(torch_operator=[])) == ["**"] - assert parse_torch_operator_patterns(Namespace(torch_operator=None)) == [] - assert parse_torch_operator_patterns(Namespace()) == [] + parse = parse_operator_patterns + assert parse(Namespace(torch_operator=[]), "torch_operator") == ["**"] + assert parse(Namespace(torch_operator=None), "torch_operator") == [] + assert parse(Namespace(), "torch_operator") == [] + + +# -- parse_operator_patterns / triton backend selection --------------------- + + +@pytest.mark.torch_ops +def test_parse_operator_patterns_generic_attr(): + """parse_operator_patterns reads the given dest attribute.""" + from argparse import Namespace + + from rocprof_compute_analyze.analysis_cli import parse_operator_patterns + + args = Namespace(triton_operator=["*matmul*,*softmax*"], torch_operator=None) + assert 
parse_operator_patterns(args, "triton_operator") == [ + "*matmul*", + "*softmax*", + ] + assert parse_operator_patterns(args, "triton_operator") != parse_operator_patterns( + args, "torch_operator" + ) + assert parse_operator_patterns( + Namespace(triton_operator=[]), "triton_operator" + ) == ["**"] + + +@pytest.mark.torch_ops +def test_filter_by_backend_selects_only_requested_backend(): + from rocprof_compute_analyze.analysis_cli import cli_analysis + + df = pd.DataFrame({ + "Operator_Name": ["aten::mm", "triton_matmul", "aten::relu"], + "Backend": ["torch", "triton", "torch"], + }) + + triton_df = cli_analysis._filter_by_backend(df, "triton") + assert triton_df["Operator_Name"].tolist() == ["triton_matmul"] + + torch_df = cli_analysis._filter_by_backend(df, "torch") + assert torch_df["Operator_Name"].tolist() == ["aten::mm", "aten::relu"] + + +@pytest.mark.torch_ops +def test_filter_by_backend_without_column_defaults_to_torch(): + from rocprof_compute_analyze.analysis_cli import cli_analysis + + df = pd.DataFrame({"Operator_Name": ["aten::mm", "aten::relu"]}) + + # Without a Backend column, rows are treated as torch. 
+ assert len(cli_analysis._filter_by_backend(df, "torch")) == 2 + assert cli_analysis._filter_by_backend(df, "triton").empty # -- fnmatch_glob_matches --------------------------------------------------- @@ -6306,13 +6358,13 @@ def test_parse_patterns_star(): """'*' is passed through as-is by the pattern parser.""" from argparse import Namespace - from rocprof_compute_analyze.analysis_cli import parse_torch_operator_patterns + from rocprof_compute_analyze.analysis_cli import parse_operator_patterns args = Namespace(torch_operator=["*"]) - assert parse_torch_operator_patterns(args) == ["*"] + assert parse_operator_patterns(args, "torch_operator") == ["*"] args = Namespace(torch_operator=["*,torch.relu"]) - assert parse_torch_operator_patterns(args) == ["*", "torch.relu"] + assert parse_operator_patterns(args, "torch_operator") == ["*", "torch.relu"] # ============================================================================= diff --git a/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/log.txt b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/log.txt new file mode 100644 index 00000000000..1b1bbf7cbf1 --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/log.txt @@ -0,0 +1,69 @@ +Rocprofiler-Compute version: 3.7.0 +Profiler choice: rocprofiler-sdk +Output directory: /app/projects/rocprofiler-compute/workloads/ml_api_trace/MI300A +Target: MI300A_A1 +Command: python3 /app/projects/rocprofiler-compute/src/utils/inject_roctx/launch.py --frameworks torch,triton -- ./tests/torch_compile_triton.py +Kernel Selection: None +Dispatch Selection: None +Filtered sections: ['11.2.2', '11.2.3', '11.2.4', '11.2.5'] + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Collecting Performance Counters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Generating native tool project using command: cmake -S /app/projects/rocprofiler-compute/src/lib -B /app/projects/rocprofiler-compute/src/lib/_build +-- HIP_PLATFORM using HIPCONFIG_EXEC: amd +-- {fmt} 
version: 12.1.0 +-- Build type: +-- Using the multi-header code from /app/projects/rocprofiler-compute/src/lib/external/json/include/ +-- Configuring done +-- Generating done +-- Build files have been written to: /app/projects/rocprofiler-compute/src/lib/_build +Building native tool using command: cmake --build /app/projects/rocprofiler-compute/src/lib/_build --parallel +Consolidate compiler generated dependencies of target fmt +[ 0%] Built target gsl_assert +[ 20%] Built target fmt +[ 20%] Built target synchronized +Consolidate compiler generated dependencies of target pc-sampling-collector +[ 46%] Built target pc-sampling-collector +Consolidate compiler generated dependencies of target rocprofiler-compute-tool +[100%] Built target rocprofiler-compute-tool +Searching /app/projects/rocprofiler-compute/src by lib/_build/lib/librocprofiler-compute-tool.so for native collector +Using native collector: /app/projects/rocprofiler-compute/src/lib/_build/lib/librocprofiler-compute-tool.so +Using native counter collection tool: /app/projects/rocprofiler-compute/src/lib/_build/lib/librocprofiler-compute-tool.so +[profiling] Iteration multiplexing: Disabled +[Run 1/1][Approximate profiling time left: pending first measurement...] 
+[profiling] Current input file: /app/projects/rocprofiler-compute/workloads/ml_api_trace/MI300A/perfmon/pmc_perf_0.yaml + |-> [rocprofiler-sdk] [rocprofiler-compute] [rocprofiler_configure] (priority=1) is using rocprofiler-sdk v1.2.0 (1.2.0) + |-> [rocprofiler-sdk] W20260626 11:07:10.887885 139900068108672 simple_timer.cpp:55] [rocprofv3] tool initialization :: 0.155092 sec + |-> [rocprofiler-sdk] [rocprofiler-compute] In tool init + |-> [rocprofiler-sdk] W20260626 11:07:10.892836 139900068108672 simple_timer.cpp:55] [rocprofv3] 'python3 /app/projects/rocprofiler-compute/src/utils/inject_roctx/launch.py --frameworks torch,triton -- ./tests/torch_compile_triton.py' :: 0.000000 sec + |-> [rocprofiler-sdk] W20260626 11:07:13.061005 139900068108672 tool.cpp:2424] MARKER (ROCTx) version 1.2.0 initialized (instance=0) + |-> [rocprofiler-sdk] W20260626 11:07:13.992071 139900068108672 tool.cpp:2424] HSA version 1.18.0 initialized (instance=0) + |-> [rocprofiler-sdk] Compiled workload completed + |-> [rocprofiler-sdk] W20260626 11:07:21.749269 139900068108672 simple_timer.cpp:55] [rocprofv3] 'python3 /app/projects/rocprofiler-compute/src/utils/inject_roctx/launch.py --frameworks torch,triton -- ./tests/torch_compile_triton.py' :: 10.856434 sec + |-> [rocprofiler-sdk] W20260626 11:07:21.774417 139900068108672 generateRocpd.cpp:583] writing SQL database for process 28000 on node 2537838091 + |-> [rocprofiler-sdk] E20260626 11:07:21.775514 139900068108672 generateRocpd.cpp:606] Opened result file: /app/projects/rocprofiler-compute/workloads/ml_api_trace/MI300A/out/pmc_1/0fdd2f08d7ba/28000_results.db (UUID=00020c1d-65ad-75ad-904c-858f5b4c09dc) + |-> [rocprofiler-sdk] W20260626 11:07:23.074845 139900068108672 simple_timer.cpp:55] SQLite3 generation :: rocpd_string :: 0.046613 sec + |-> [rocprofiler-sdk] W20260626 11:07:23.100243 139900068108672 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_node :: 0.025373 sec + |-> [rocprofiler-sdk] W20260626 11:07:23.120161 
139900068108672 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_process :: 0.019900 sec + |-> [rocprofiler-sdk] W20260626 11:07:23.195849 139900068108672 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_agent :: 0.031669 sec + |-> [rocprofiler-sdk] W20260626 11:07:23.223799 139900068108672 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_pmc :: 0.027936 sec + |-> [rocprofiler-sdk] W20260626 11:07:23.251953 139900068108672 simple_timer.cpp:55] SQLite3 generation :: rocpd kernel info :: 0.028142 sec + |-> [rocprofiler-sdk] W20260626 11:07:23.276159 139900068108672 simple_timer.cpp:55] SQLite3 generation :: rocpd_region :: 0.024193 sec + |-> [rocprofiler-sdk] W20260626 11:07:23.310108 139900068108672 simple_timer.cpp:55] SQLite3 generation :: rocpd_kernel_dispatch :: 0.033936 sec + |-> [rocprofiler-sdk] W20260626 11:07:23.310121 139900068108672 simple_timer.cpp:55] SQLite3 generation :: rocpd_pmc_event :: 0.000000 sec + |-> [rocprofiler-sdk] W20260626 11:07:23.310124 139900068108672 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_copy :: 0.000001 sec + |-> [rocprofiler-sdk] W20260626 11:07:23.310128 139900068108672 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_allocate :: 0.000001 sec + |-> [rocprofiler-sdk] W20260626 11:07:23.310278 139900068108672 simple_timer.cpp:55] SQLite3 generation :: SQL indexing :: 0.000148 sec + |-> [rocprofiler-sdk] W20260626 11:07:23.311649 139900068108672 simple_timer.cpp:55] SQLite3 generation :: total :: 1.537234 sec + |-> [rocprofiler-sdk] W20260626 11:07:23.314770 139900068108672 simple_timer.cpp:55] [rocprofv3] output generation :: 1.562857 sec + |-> [rocprofiler-sdk] W20260626 11:07:23.314995 139900068108672 simple_timer.cpp:55] [rocprofv3] tool finalization :: 1.565602 sec + |-> [rocprofiler-sdk] [rocprofiler-compute] In tool fini + |-> [rocprofiler-sdk] [rocprofiler-compute] [write_counters] Counter collection data has been written to: 
/app/projects/rocprofiler-compute/workloads/ml_api_trace/MI300A/out/pmc_1/28000_native_counter_collection.csv +Intermediate results_*.csv generation from rocpd databases is deprecated and will be replaced with automatic .db file retention in a future release. +[ml api trace] Moved counter collection and marker trace files to workload dir for ML API trace creation. +[Counter Collection: ] /app/projects/rocprofiler-compute/workloads/ml_api_trace/MI300A/ml_api_trace_pmc_perf_0_counter_collection.csv +[Marker API Trace: ] /app/projects/rocprofiler-compute/workloads/ml_api_trace/MI300A/ml_api_trace_pmc_perf_0_marker_api_trace.csv +PC sampling data collection skipped as --pc-sampling is not specified. +[roofline] Skipping roofline diff --git a/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/ml_api_trace/consolidated.csv b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/ml_api_trace/consolidated.csv new file mode 100644 index 00000000000..bd8a6b33230 --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/ml_api_trace/consolidated.csv @@ -0,0 +1,43 @@ +Operator_Name,Context_Id,Backend,Kernel_Name,Counter_Name,Counter_Value,Start_Timestamp_function,End_Timestamp_function,Start_Timestamp_kernel,End_Timestamp_kernel +aten::randn/aten::normal_,1@aten:0/#1@aten.nested:0,torch,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous 
namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",GRBM_GUI_ACTIVE,596902.0,8793192073958240,8793192239891637,8793192239890296,8793192239957786 +aten::randn/aten::normal_,1@aten:0/#1@aten.nested:0,torch,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",SQ_WAVES,7296.0,8793192244701346,8793192244955747,8793192244978576,8793192245805101 +aten::randn/aten::normal_,1@aten:0/#1@aten.nested:0,torch,"void at::native::(anonymous 
namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",SQ_WAVES,7296.0,8793192073958240,8793192239891637,8793192239890296,8793192239957786 +aten::randn/aten::normal_,1@aten:0/#1@aten.nested:0,torch,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",SQ_ACTIVE_INST_VMEM,0.0,8793192244701346,8793192244955747,8793192244978576,8793192245805101 +aten::randn/aten::normal_,1@aten:0/#1@aten.nested:0,torch,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",SQ_ACTIVE_INST_VMEM,0.0,8793192073958240,8793192239891637,8793192239890296,8793192239957786 +aten::randn/aten::normal_,1@aten:0/#1@aten.nested:0,torch,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",SQ_ACTIVE_INST_VALU,14856832.0,8793192244701346,8793192244955747,8793192244978576,8793192245805101 +aten::randn/aten::normal_,1@aten:0/#1@aten.nested:0,torch,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",SQ_ACTIVE_INST_SCA,1736448.0,8793192244701346,8793192244955747,8793192244978576,8793192245805101 +aten::randn/aten::normal_,1@aten:0/#1@aten.nested:0,torch,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",SQ_ACTIVE_INST_VALU,14856832.0,8793192073958240,8793192239891637,8793192239890296,8793192239957786 +aten::randn/aten::normal_,1@aten:0/#1@aten.nested:0,torch,"void at::native::(anonymous 
namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",SQ_ACTIVE_INST_MISC,634240.0,8793192244701346,8793192244955747,8793192244978576,8793192245805101 +aten::randn/aten::normal_,1@aten:0/#1@aten.nested:0,torch,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",SQ_ACTIVE_INST_MISC,634240.0,8793192073958240,8793192239891637,8793192239890296,8793192239957786 +aten::randn/aten::normal_,1@aten:0/#1@aten.nested:0,torch,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",SQ_ACTIVE_INST_FLAT,262144.0,8793192244701346,8793192244955747,8793192244978576,8793192245805101 +aten::randn/aten::normal_,1@aten:0/#1@aten.nested:0,torch,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",SQ_ACTIVE_INST_FLAT,262144.0,8793192073958240,8793192239891637,8793192239890296,8793192239957786 +aten::randn/aten::normal_,1@aten:0/#1@aten.nested:0,torch,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",GRBM_GUI_ACTIVE,10435240.0,8793192244701346,8793192244955747,8793192244978576,8793192245805101 +aten::randn/aten::normal_,1@aten:0/#1@aten.nested:0,torch,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",SQ_ACTIVE_INST_SCA,1736448.0,8793192073958240,8793192239891637,8793192239890296,8793192239957786 
+torch.compile.fused/aten::zeros/aten::zero_/aten::fill_,1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,torch,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",SQ_ACTIVE_INST_VALU,69.0,8793192261668078,8793192279230654,8793192279239641,8793192279241721 +torch.compile.fused/aten::zeros/aten::zero_/aten::fill_,1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,torch,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",SQ_ACTIVE_INST_VMEM,0.0,8793192261668078,8793192279230654,8793192279239641,8793192279241721 +torch.compile.fused/aten::zeros/aten::zero_/aten::fill_,1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,torch,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",SQ_ACTIVE_INST_SCA,148.0,8793192261668078,8793192279230654,8793192279239641,8793192279241721 +torch.compile.fused/aten::zeros/aten::zero_/aten::fill_,1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,torch,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",SQ_WAVES,4.0,8793192261668078,8793192279230654,8793192279239641,8793192279241721 +torch.compile.fused/aten::zeros/aten::zero_/aten::fill_,1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,torch,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",SQ_ACTIVE_INST_FLAT,1.0,8793192261668078,8793192279230654,8793192279239641,8793192279241721 +torch.compile.fused/aten::zeros/aten::zero_/aten::fill_,1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,torch,"void 
at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",GRBM_GUI_ACTIVE,81562.0,8793192261668078,8793192279230654,8793192279239641,8793192279241721 +torch.compile.fused/aten::zeros/aten::zero_/aten::fill_,1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,torch,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",SQ_ACTIVE_INST_MISC,76.0,8793192261668078,8793192279230654,8793192279239641,8793192279241721 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#3@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_VALU,1638400.0,8793193421101321,8793193421131141,8793193421157790,8793193421206870 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#3@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_SCA,65536.0,8793193421101321,8793193421131141,8793193421157790,8793193421206870 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#3@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_MISC,65536.0,8793193421101321,8793193421131141,8793193421157790,8793193421206870 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#3@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_FLAT,196608.0,8793193421101321,8793193421131141,8793193421157790,8793193421206870 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#3@:5,triton,triton_poi_fused_add_mul_relu_0,GRBM_GUI_ACTIVE,484270.0,8793193421101321,8793193421131141,8793193421157790,8793193421206870 
+torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#2@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_WAVES,65536.0,8793193420563948,8793193420942430,8793193420965749,8793193421014989 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#2@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_VMEM,0.0,8793193420563948,8793193420942430,8793193420965749,8793193421014989 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#2@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_VALU,1638400.0,8793193420563948,8793193420942430,8793193420965749,8793193421014989 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#2@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_SCA,65536.0,8793193420563948,8793193420942430,8793193420965749,8793193421014989 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#2@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_FLAT,196608.0,8793193420563948,8793193420942430,8793193420965749,8793193421014989 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#2@:5,triton,triton_poi_fused_add_mul_relu_0,GRBM_GUI_ACTIVE,489996.0,8793193420563948,8793193420942430,8793193420965749,8793193421014989 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#1@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_WAVES,65536.0,8793193419854863,8793193420112635,8793193420132825,8793193420819789 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#1@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_VMEM,0.0,8793193419854863,8793193420112635,8793193420132825,8793193420819789 
+torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#1@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_VALU,1638400.0,8793193419854863,8793193420112635,8793193420132825,8793193420819789 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#1@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_SCA,65536.0,8793193419854863,8793193420112635,8793193420132825,8793193420819789 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#1@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_MISC,65536.0,8793193419854863,8793193420112635,8793193420132825,8793193420819789 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#1@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_FLAT,196608.0,8793193419854863,8793193420112635,8793193420132825,8793193420819789 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#1@:5,triton,triton_poi_fused_add_mul_relu_0,GRBM_GUI_ACTIVE,6256580.0,8793193419854863,8793193420112635,8793193420132825,8793193420819789 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#3@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_VMEM,0.0,8793193421101321,8793193421131141,8793193421157790,8793193421206870 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#2@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_ACTIVE_INST_MISC,65536.0,8793193420563948,8793193420942430,8793193420965749,8793193421014989 +torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0,1@torch_compile_triton.py:26/#3@:5,triton,triton_poi_fused_add_mul_relu_0,SQ_WAVES,65536.0,8793193421101321,8793193421131141,8793193421157790,8793193421206870 diff --git 
a/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/ml_api_trace_pmc_perf_0_counter_collection.csv b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/ml_api_trace_pmc_perf_0_counter_collection.csv new file mode 100644 index 00000000000..ba9d68b035c --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/ml_api_trace_pmc_perf_0_counter_collection.csv @@ -0,0 +1,43 @@ +GPU_ID,GUID,Correlation_Id,Dispatch_ID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Kernel_Name,Start_Timestamp,End_Timestamp,Kernel_ID,Counter_Name,Counter_Value +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,GRBM_GUI_ACTIVE,596902.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_FLAT,262144.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_MISC,634240.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_SCA,1736448.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_VALU,14856832.0 
+4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_WAVES,7296.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, 
double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,GRBM_GUI_ACTIVE,10435240.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_FLAT,262144.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_MISC,634240.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, 
at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous 
namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_SCA,1736448.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, 
double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_VALU,14856832.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} 
const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_WAVES,7296.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,GRBM_GUI_ACTIVE,81562.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_FLAT,1.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_MISC,76.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_SCA,148.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_VALU,69.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void 
at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_WAVES,4.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,GRBM_GUI_ACTIVE,6256580.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_FLAT,196608.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_MISC,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_SCA,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_VALU,1638400.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_WAVES,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,GRBM_GUI_ACTIVE,489996.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_FLAT,196608.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_MISC,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_SCA,65536.0 
+4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_VALU,1638400.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_WAVES,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,GRBM_GUI_ACTIVE,484270.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_FLAT,196608.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_MISC,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_SCA,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_VALU,1638400.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_WAVES,65536.0 diff --git a/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/ml_api_trace_pmc_perf_0_marker_api_trace.csv b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/ml_api_trace_pmc_perf_0_marker_api_trace.csv new file mode 100644 index 00000000000..f68a5aa4569 --- /dev/null +++ 
b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/ml_api_trace_pmc_perf_0_marker_api_trace.csv @@ -0,0 +1,98 @@ +Domain,Function,Process_Id,Thread_Id,Correlation_Id,GUID,Start_Timestamp,End_Timestamp,Backend +MARKER_CORE_RANGE_API,torch.compile:#1@torch_compile_triton.py:11,28000,28000,1,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793190095142850,8793192066018442,torch +MARKER_CORE_RANGE_API,aten::randn:#1@aten:0,28000,28000,2,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192068303966,8793192239911787,torch +MARKER_CORE_RANGE_API,aten::randn/aten::empty:#1@aten:0/#1@aten.nested:0,28000,28000,3,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192068425856,8793192073907309,torch +MARKER_CORE_RANGE_API,aten::randn/aten::normal_:#1@aten:0/#1@aten.nested:0,28000,28000,4,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192073958240,8793192239891637,torch +MARKER_CORE_RANGE_API,aten::randn:#1@aten:0,28000,28000,5,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192240048968,8793192244960367,torch +MARKER_CORE_RANGE_API,aten::randn/aten::empty:#1@aten:0/#1@aten.nested:0,28000,28000,6,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192240066428,8793192244663676,torch +MARKER_CORE_RANGE_API,aten::randn/aten::normal_:#1@aten:0/#1@aten.nested:0,28000,28000,7,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192244701346,8793192244955747,torch +MARKER_CORE_RANGE_API,torch.compile.fused:#1@torch_compile_triton.py:26,28000,28000,8,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192245209959,8793193420259016,torch +MARKER_CORE_RANGE_API,torch.compile.fused/TorchDynamo Cache Lookup:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,9,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192245476181,8793192245477661,torch +MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.device_count:#1@torch_compile_triton.py:26/#1@torch_compile_triton.py:26,28000,28000,10,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192253277148,8793192253299918,torch 
+MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.current_device:#1@torch_compile_triton.py:26/#1@torch_compile_triton.py:26,28000,28000,11,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192253362758,8793192253367528,torch +MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.device_count:#1@torch_compile_triton.py:26/#2@torch_compile_triton.py:26,28000,28000,12,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192253393418,8793192253395898,torch +MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.device_count:#1@torch_compile_triton.py:26/#3@torch_compile_triton.py:26,28000,28000,13,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192253422898,8793192253424678,torch +MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.device_count:#1@torch_compile_triton.py:26/#4@torch_compile_triton.py:26,28000,28000,14,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192253457089,8793192253460199,torch +MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.device_count:#1@torch_compile_triton.py:26/#5@torch_compile_triton.py:26,28000,28000,15,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192253491629,8793192253493019,torch +MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.device_count:#1@torch_compile_triton.py:26/#6@torch_compile_triton.py:26,28000,28000,16,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192253517889,8793192253519389,torch +MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.current_device:#1@torch_compile_triton.py:26/#2@torch_compile_triton.py:26,28000,28000,17,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192253604519,8793192253606920,torch +MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.device_count:#1@torch_compile_triton.py:26/#7@torch_compile_triton.py:26,28000,28000,18,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192253629810,8793192253631400,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::empty_strided:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,19,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192260905303,8793192261069384,torch 
+MARKER_CORE_RANGE_API,torch.compile.fused/aten::detach:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,20,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192261130265,8793192261177635,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::detach/detach:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,21,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192261135045,8793192261177035,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::zeros:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,22,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192261237825,8793192279235624,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::zeros/aten::empty:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,23,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192261285826,8793192261562457,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::zeros/aten::zero_:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,24,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192261593028,8793192279233374,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::zeros/aten::zero_/aten::fill_:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,25,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192261668078,8793192279230654,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,26,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192280532542,8793192282325192,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu/aten::relu:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,27,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192281080575,8793192282071021,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu/aten::relu/aten::le:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,28,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192281248186,8793192281790499,torch 
+MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu/aten::relu/aten::le/prims::le:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,29,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192281413977,8793192281778969,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu/aten::relu/aten::le/prims::le/aten::empty_permuted:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,30,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192281609628,8793192281771099,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu/aten::relu/aten::le/prims::le/aten::empty_permuted/aten::empty:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,31,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192281675088,8793192281684299,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu/aten::relu/aten::le/prims::le/aten::empty_permuted/aten::as_strided:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,32,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192281703269,8793192281769589,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu/aten::relu/aten::where:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,33,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192281828899,8793192282067021,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu/aten::relu/aten::where/prims::where:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,34,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192281947030,8793192282061831,torch 
+MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu/aten::relu/aten::where/prims::where/aten::empty_permuted:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,35,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192282035501,8793192282056611,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu/aten::relu/aten::where/prims::where/aten::empty_permuted/aten::empty:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,36,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192282042521,8793192282047491,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu/aten::relu/aten::where/prims::where/aten::empty_permuted/aten::as_strided:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,37,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192282049441,8793192282055391,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu/aten::detach:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,38,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192282122561,8793192282123751,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu/aten::empty_strided:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,39,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192282268882,8793192282280632,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::relu/aten::detach:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,40,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192282292762,8793192282293472,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::empty_strided:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,41,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192282799745,8793192282812075,torch 
+MARKER_CORE_RANGE_API,torch.compile.fused/aten::detach:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,42,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192282818505,8793192282823465,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::detach/detach:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,43,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192282820505,8793192282823065,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::mul:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,44,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192283220218,8793192284027263,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::mul/aten::mul:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,45,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192283639820,8793192283858282,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::mul/aten::mul/prims::mul:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,46,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192283734891,8793192283849132,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::mul/aten::mul/prims::mul/aten::empty_permuted:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,47,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192283819401,8793192283843802,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::mul/aten::mul/prims::mul/aten::empty_permuted/aten::empty:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,48,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192283828541,8793192283834591,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::mul/aten::mul/prims::mul/aten::empty_permuted/aten::as_strided:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,49,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192283837121,8793192283842242,torch 
+MARKER_CORE_RANGE_API,torch.compile.fused/aten::mul/aten::detach:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,50,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192283885272,8793192283886372,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::mul/aten::empty_strided:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,51,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192283984382,8793192283993162,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::mul/aten::detach:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,52,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192284003342,8793192284004032,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::add:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,53,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192284277824,8793192284936298,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::add/aten::add:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,54,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192284565756,8793192284782377,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::add/aten::add/prims::add:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,55,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192284660626,8793192284773707,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::add/aten::add/prims::add/aten::empty_permuted:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,56,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192284742557,8793192284768787,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::add/aten::add/prims::add/aten::empty_permuted/aten::empty:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,57,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192284751457,8793192284759107,torch 
+MARKER_CORE_RANGE_API,torch.compile.fused/aten::add/aten::add/prims::add/aten::empty_permuted/aten::as_strided:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0/#1@aten.nested:0,28000,28000,58,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192284761507,8793192284767347,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::add/aten::detach:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,59,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192284807387,8793192284808467,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::add/aten::empty_strided:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,60,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192284895978,8793192284904968,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::add/aten::detach:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,61,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192284913938,8793192284914658,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::empty_strided:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,62,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192394817810,8793192394840490,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::detach:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,63,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192394854090,8793192394863900,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::detach/detach:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,64,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192394857460,8793192394863330,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::empty_strided:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,65,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192395064401,8793192395073091,torch 
+MARKER_CORE_RANGE_API,torch.compile.fused/aten::detach:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,66,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192395078521,8793192395089191,torch +MARKER_CORE_RANGE_API,torch.compile.fused/aten::detach/detach:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,67,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192395079971,8793192395081891,torch +MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.current_device:#1@torch_compile_triton.py:26/#3@torch_compile_triton.py:26,28000,28000,68,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192985330741,8793192985450112,torch +MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.device_count:#1@torch_compile_triton.py:26/#8@torch_compile_triton.py:26,28000,28000,69,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793192985510992,8793192985533082,torch +MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.current_device:#1@torch_compile_triton.py:26/#4@cycpf7fn66lwfhowefwrp3nthmu3qnub5vcnyt6g5bk3qirl7uzu.py:50,28000,28000,70,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193376139070,8793193376167010,torch +MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.synchronize:#1@torch_compile_triton.py:26/#1@cycpf7fn66lwfhowefwrp3nthmu3qnub5vcnyt6g5bk3qirl7uzu.py:50,28000,28000,71,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193376189480,8793193376299341,torch +MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.current_device:#1@torch_compile_triton.py:26/#5@cycpf7fn66lwfhowefwrp3nthmu3qnub5vcnyt6g5bk3qirl7uzu.py:50,28000,28000,72,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193376418482,8793193376421892,torch +MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.current_device:#1@torch_compile_triton.py:26/#6@cycpf7fn66lwfhowefwrp3nthmu3qnub5vcnyt6g5bk3qirl7uzu.py:50,28000,28000,73,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193378034282,8793193378040762,torch 
+MARKER_CORE_RANGE_API,torch.compile.fused/torch.cuda.current_device:#1@torch_compile_triton.py:26/#7@torch_compile_triton.py:26,28000,28000,74,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193378765256,8793193378771066,torch +MARKER_CORE_RANGE_API,torch.compile.fused/Torch-Compiled Region: 0%2F0:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,75,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193413061892,8793193420221036,torch +MARKER_CORE_RANGE_API,torch.compile.fused/Torch-Compiled Region: 0%2F0/TorchDynamo Cache Lookup:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,76,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193413085643,8793193413090593,torch +MARKER_CORE_RANGE_API,torch.compile.fused/Torch-Compiled Region: 0%2F0/TorchDynamo Cache Lookup:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,77,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193413379354,8793193413380194,torch +MARKER_CORE_RANGE_API,torch.compile.fused/Torch-Compiled Region: 0%2F0/TorchDynamo Cache Lookup:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,78,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193413485945,8793193413486495,torch +MARKER_CORE_RANGE_API,torch.compile.fused/Torch-Compiled Region: 0%2F0/TorchDynamo Cache Lookup:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,79,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193413565275,8793193413565745,torch +MARKER_CORE_RANGE_API,torch.compile.fused/Torch-Compiled Region: 0%2F0/TorchDynamo Cache Lookup:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,80,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193413627976,8793193413628536,torch +MARKER_CORE_RANGE_API,torch.compile.fused/Torch-Compiled Region: 0%2F0/TorchDynamo Cache Lookup:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,81,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193413755817,8793193413756287,torch 
+MARKER_CORE_RANGE_API,torch.compile.fused/Torch-Compiled Region: 0%2F0/TorchDynamo Cache Lookup:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,82,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193413803557,8793193413804177,torch +MARKER_CORE_RANGE_API,torch.compile.fused/Torch-Compiled Region: 0%2F0/TorchDynamo Cache Lookup:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@aten.nested:0,28000,28000,83,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193413926628,8793193413927028,torch +MARKER_CORE_RANGE_API,torch.compile.fused/Torch-Compiled Region: 0%2F0/torch.cuda.set_device:#1@torch_compile_triton.py:26/#1@aten.nested:0/#1@cycpf7fn66lwfhowefwrp3nthmu3qnub5vcnyt6g5bk3qirl7uzu.py:102,28000,28000,84,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193414198679,8793193414217889,torch +MARKER_CORE_RANGE_API,torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0:#1@torch_compile_triton.py:26/#1@:5,28000,28000,85,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193419854863,8793193420112635,triton +MARKER_CORE_RANGE_API,torch.compile.fused:#1@torch_compile_triton.py:26,28000,28000,86,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193420309166,8793193420981820,torch +MARKER_CORE_RANGE_API,torch.compile.fused/TorchDynamo Cache Lookup:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,87,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193420357736,8793193420399167,torch +MARKER_CORE_RANGE_API,torch.compile.fused/Torch-Compiled Region: 0%2F0:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,88,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193420402607,8793193420975680,torch +MARKER_CORE_RANGE_API,torch.compile.fused/Torch-Compiled Region: 0%2F0/torch.cuda.set_device:#1@torch_compile_triton.py:26/#1@aten.nested:0/#2@cycpf7fn66lwfhowefwrp3nthmu3qnub5vcnyt6g5bk3qirl7uzu.py:102,28000,28000,89,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193420509757,8793193420518557,torch 
+MARKER_CORE_RANGE_API,torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0:#1@torch_compile_triton.py:26/#2@:5,28000,28000,90,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193420563948,8793193420942430,triton +MARKER_CORE_RANGE_API,torch.compile.fused:#1@torch_compile_triton.py:26,28000,28000,91,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193421004880,8793193421157821,torch +MARKER_CORE_RANGE_API,torch.compile.fused/TorchDynamo Cache Lookup:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,92,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193421016390,8793193421025260,torch +MARKER_CORE_RANGE_API,torch.compile.fused/Torch-Compiled Region: 0%2F0:#1@torch_compile_triton.py:26/#1@aten.nested:0,28000,28000,93,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193421026580,8793193421153691,torch +MARKER_CORE_RANGE_API,torch.compile.fused/Torch-Compiled Region: 0%2F0/torch.cuda.set_device:#1@torch_compile_triton.py:26/#1@aten.nested:0/#3@cycpf7fn66lwfhowefwrp3nthmu3qnub5vcnyt6g5bk3qirl7uzu.py:102,28000,28000,94,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193421069451,8793193421072761,torch +MARKER_CORE_RANGE_API,torch.compile.fused/triton.CompiledKernel.triton_poi_fused_add_mul_relu_0:#1@torch_compile_triton.py:26/#3@:5,28000,28000,95,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193421101321,8793193421131141,triton +MARKER_CORE_RANGE_API,torch.cuda.synchronize:#2@torch_compile_triton.py:28,28000,28000,96,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193421178481,8793193421303512,torch +MARKER_CORE_RANGE_API,torch.cuda.synchronize/torch.cuda.current_device:#2@torch_compile_triton.py:28/#8@torch_compile_triton.py:28,28000,28000,97,00020c1d-65ad-75ad-904c-858f5b4c09dc,8793193421250842,8793193421255162,torch diff --git a/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/perfmon/pmc_perf_0.yaml b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/perfmon/pmc_perf_0.yaml new file mode 100644 index 00000000000..ec8ba4fb295 --- /dev/null +++ 
b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/perfmon/pmc_perf_0.yaml @@ -0,0 +1,9 @@ +jobs: +- pmc: + - SQ_ACTIVE_INST_FLAT + - SQ_ACTIVE_INST_MISC + - SQ_ACTIVE_INST_SCA + - SQ_ACTIVE_INST_VALU + - SQ_ACTIVE_INST_VMEM + - SQ_WAVES + - GRBM_GUI_ACTIVE diff --git a/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/pmc_dispatch_info.csv b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/pmc_dispatch_info.csv new file mode 100644 index 00000000000..b647c556912 --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/pmc_dispatch_info.csv @@ -0,0 +1,7 @@ +Dispatch_ID,Kernel_Name,GPU_ID +0,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, 
at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",0 +1,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",0 +2,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",0 +3,triton_poi_fused_add_mul_relu_0,0 +4,triton_poi_fused_add_mul_relu_0,0 +5,triton_poi_fused_add_mul_relu_0,0 diff --git a/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/pmc_perf.csv b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/pmc_perf.csv new file mode 100644 index 00000000000..ba9d68b035c --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/pmc_perf.csv @@ -0,0 +1,43 @@ +GPU_ID,GUID,Correlation_Id,Dispatch_ID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Kernel_Name,Start_Timestamp,End_Timestamp,Kernel_ID,Counter_Name,Counter_Value +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,GRBM_GUI_ACTIVE,596902.0 
+4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_FLAT,262144.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_MISC,634240.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_SCA,1736448.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_VALU,14856832.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous 
namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_WAVES,7296.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,GRBM_GUI_ACTIVE,10435240.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_FLAT,262144.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_MISC,634240.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_SCA,1736448.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_VALU,14856832.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_WAVES,7296.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,GRBM_GUI_ACTIVE,81562.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_FLAT,1.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void 
at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_MISC,76.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_SCA,148.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_VALU,69.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_WAVES,4.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,GRBM_GUI_ACTIVE,6256580.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_FLAT,196608.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_MISC,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_SCA,65536.0 
+4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_VALU,1638400.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_WAVES,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,GRBM_GUI_ACTIVE,489996.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_FLAT,196608.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_MISC,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_SCA,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_VALU,1638400.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_WAVES,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,GRBM_GUI_ACTIVE,484270.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_FLAT,196608.0 
+4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_MISC,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_SCA,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_VALU,1638400.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_WAVES,65536.0 diff --git a/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/profiling_config.yaml b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/profiling_config.yaml new file mode 100644 index 00000000000..9f1eeeabe34 --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/profiling_config.yaml @@ -0,0 +1,45 @@ +attach_duration_msec: null +attach_pid: null +bench_only: false +config_dir: /app/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs +device: 0 +dispatch: null +experimental: true +filter_blocks: +- 11.2.2 +- 11.2.3 +- 11.2.4 +- 11.2.5 +format_rocprof_output: rocpd +iteration_multiplexing: null +join_type: grid +kernel: null +kokkos_trace: false +list_available_metrics: false +list_blocks: null +list_metrics: null +list_sets: false +loglevel: 20 +membw_analysis: false +ml_api_trace: true +mode: profile +name: ml_api_trace +no_native_tool: false +no_roof: true +output_directory: /app/projects/rocprofiler-compute/workloads/ml_api_trace/MI300A +overwrite: false +pc_sampling: false +pc_sampling_interval: null +pc_sampling_method: stochastic +quiet: false +remaining: python3 
/app/projects/rocprofiler-compute/src/utils/inject_roctx/launch.py + --frameworks torch,triton -- ./tests/torch_compile_triton.py +retain_rocpd_output: false +rocprofiler_sdk_tool_path: /rocm-venv/lib/python3.12/site-packages/_rocm_sdk_core/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so +roof_only: false +set_selected: compute_thruput_util +specs: false +target: null +torch_trace: false +triton_trace: false +verbose: 0 diff --git a/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/results_pmc_perf_0.csv b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/results_pmc_perf_0.csv new file mode 100644 index 00000000000..ba9d68b035c --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/results_pmc_perf_0.csv @@ -0,0 +1,43 @@ +GPU_ID,GUID,Correlation_Id,Dispatch_ID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Kernel_Name,Start_Timestamp,End_Timestamp,Kernel_ID,Counter_Name,Counter_Value +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, 
double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,GRBM_GUI_ACTIVE,596902.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_FLAT,262144.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_MISC,634240.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_SCA,1736448.0 
+4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_VALU,14856832.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,4,0,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192239890296,8793192239957786,0,SQ_WAVES,7296.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, 
double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,GRBM_GUI_ACTIVE,10435240.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_FLAT,262144.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, 
at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous 
namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_MISC,634240.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, 
double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_SCA,1736448.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_VALU,14856832.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,7,1,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793192244978576,8793192245805101,0,SQ_WAVES,7296.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,GRBM_GUI_ACTIVE,81562.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_FLAT,1.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_MISC,76.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_SCA,148.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_VALU,69.0 
+4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,25,2,256,256,0,0,12,4,32,"void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)",8793192279239641,8793192279241721,1,SQ_WAVES,4.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,GRBM_GUI_ACTIVE,6256580.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_FLAT,196608.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_MISC,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_SCA,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_VALU,1638400.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,85,3,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420132825,8793193420819789,2,SQ_WAVES,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,GRBM_GUI_ACTIVE,489996.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_FLAT,196608.0 
+4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_MISC,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_SCA,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_VALU,1638400.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,90,4,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193420965749,8793193421014989,2,SQ_WAVES,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,GRBM_GUI_ACTIVE,484270.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_FLAT,196608.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_MISC,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_SCA,65536.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_VALU,1638400.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1d-65ad-75ad-904c-858f5b4c09dc,95,5,4194304,256,0,0,16,0,32,triton_poi_fused_add_mul_relu_0,8793193421157790,8793193421206870,2,SQ_WAVES,65536.0 diff --git 
a/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/sysinfo.csv b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/sysinfo.csv new file mode 100644 index 00000000000..a8c25fcdda9 --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/ml_api_trace/MI300A/sysinfo.csv @@ -0,0 +1,2 @@ +workload_path,command,ip_blocks,timestamp,version,hostname,cpu_model,sbios,linux_distro,linux_kernel_version,amd_gpu_kernel_version,cpu_memory,gpu_memory,rocm_version,vbios,compute_partition,memory_partition,gpu_series,gpu_model,gpu_arch,gpu_chip_id,gpu_l1,gpu_l2,cu_per_gpu,simd_per_cu,se_per_gpu,sa_per_se,wave_size,workgroup_max_size,max_waves_per_cu,max_sclk,max_mclk,cur_sclk,cur_mclk,l2_banks,total_l2_chan,lds_banks_per_cu,sqc_per_gpu,pipes_per_gpu,num_xcd,num_hbm_channels +/app/projects/rocprofiler-compute/workloads/ml_api_trace/MI300A,"python3 /app/projects/rocprofiler-compute/src/utils/inject_roctx/launch.py --frameworks torch,triton -- ./tests/torch_compile_triton.py",SQ|LDS|SQC|TA|TD|TCP|TCC|SPI|CPC|CPF,Fri Jun 26 11:07:07 2026 (UTC),3,0fdd2f08d7ba,AMD Instinct MI300A Accelerator,"American Megatrends International, LLC.RMP1007AS",Ubuntu 22.04.5 LTS,5.18.2-mi300-build-140423-ubuntu-22.04+,6.18.4,526647832,98746368,7.12.0,,SPX,NPS1,MI300,MI300A_A1,gfx942,29856,32,4096,228,4,24,1,64,1024,32,2100,1300,2100,1300,16,96,32,120,4,6,128 diff --git a/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/log.txt b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/log.txt new file mode 100644 index 00000000000..f57c75c50f6 --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/log.txt @@ -0,0 +1,69 @@ +Rocprofiler-Compute version: 3.7.0 +Profiler choice: rocprofiler-sdk +Output directory: /app/projects/rocprofiler-compute/workloads/triton_trace/MI300A +Target: MI300A_A1 +Command: python3 /app/projects/rocprofiler-compute/src/utils/inject_roctx/launch.py --frameworks triton -- 
./tests/triton_ffn.py +Kernel Selection: None +Dispatch Selection: None +Filtered sections: ['11.2.2', '11.2.3', '11.2.4', '11.2.5'] + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Collecting Performance Counters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Generating native tool project using command: cmake -S /app/projects/rocprofiler-compute/src/lib -B /app/projects/rocprofiler-compute/src/lib/_build +-- HIP_PLATFORM using HIPCONFIG_EXEC: amd +-- {fmt} version: 12.1.0 +-- Build type: +-- Using the multi-header code from /app/projects/rocprofiler-compute/src/lib/external/json/include/ +-- Configuring done +-- Generating done +-- Build files have been written to: /app/projects/rocprofiler-compute/src/lib/_build +Building native tool using command: cmake --build /app/projects/rocprofiler-compute/src/lib/_build --parallel +Consolidate compiler generated dependencies of target fmt +[ 0%] Built target gsl_assert +[ 0%] Built target synchronized +[ 20%] Built target fmt +Consolidate compiler generated dependencies of target pc-sampling-collector +[ 46%] Built target pc-sampling-collector +Consolidate compiler generated dependencies of target rocprofiler-compute-tool +[100%] Built target rocprofiler-compute-tool +Searching /app/projects/rocprofiler-compute/src by lib/_build/lib/librocprofiler-compute-tool.so for native collector +Using native collector: /app/projects/rocprofiler-compute/src/lib/_build/lib/librocprofiler-compute-tool.so +Using native counter collection tool: /app/projects/rocprofiler-compute/src/lib/_build/lib/librocprofiler-compute-tool.so +[profiling] Iteration multiplexing: Disabled +[Run 1/1][Approximate profiling time left: pending first measurement...] 
+[profiling] Current input file: /app/projects/rocprofiler-compute/workloads/triton_trace/MI300A/perfmon/pmc_perf_0.yaml + |-> [rocprofiler-sdk] [rocprofiler-compute] [rocprofiler_configure] (priority=1) is using rocprofiler-sdk v1.2.0 (1.2.0) + |-> [rocprofiler-sdk] W20260626 11:06:13.388257 139842801629568 simple_timer.cpp:55] [rocprofv3] tool initialization :: 0.164065 sec + |-> [rocprofiler-sdk] [rocprofiler-compute] In tool init + |-> [rocprofiler-sdk] W20260626 11:06:13.394477 139842801629568 simple_timer.cpp:55] [rocprofv3] 'python3 /app/projects/rocprofiler-compute/src/utils/inject_roctx/launch.py --frameworks triton -- ./tests/triton_ffn.py' :: 0.000000 sec + |-> [rocprofiler-sdk] W20260626 11:06:15.886201 139842801629568 tool.cpp:2424] HSA version 1.18.0 initialized (instance=0) + |-> [rocprofiler-sdk] W20260626 11:06:16.132781 139842801629568 tool.cpp:2424] MARKER (ROCTx) version 1.2.0 initialized (instance=0) + |-> [rocprofiler-sdk] FFN completed, output sum: 12040514.000 + |-> [rocprofiler-sdk] W20260626 11:06:17.468690 139842801629568 simple_timer.cpp:55] [rocprofv3] 'python3 /app/projects/rocprofiler-compute/src/utils/inject_roctx/launch.py --frameworks triton -- ./tests/triton_ffn.py' :: 4.074212 sec + |-> [rocprofiler-sdk] W20260626 11:06:17.499064 139842801629568 generateRocpd.cpp:583] writing SQL database for process 26931 on node 2537838091 + |-> [rocprofiler-sdk] E20260626 11:06:17.500268 139842801629568 generateRocpd.cpp:606] Opened result file: /app/projects/rocprofiler-compute/workloads/triton_trace/MI300A/out/pmc_1/0fdd2f08d7ba/26931_results.db (UUID=00020c1c-8508-7508-8d07-3a98d0a74f41) + |-> [rocprofiler-sdk] W20260626 11:06:18.743573 139842801629568 simple_timer.cpp:55] SQLite3 generation :: rocpd_string :: 0.044603 sec + |-> [rocprofiler-sdk] W20260626 11:06:18.768990 139842801629568 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_node :: 0.025396 sec + |-> [rocprofiler-sdk] W20260626 11:06:18.794986 139842801629568 
simple_timer.cpp:55] SQLite3 generation :: rocpd_info_process :: 0.025982 sec + |-> [rocprofiler-sdk] W20260626 11:06:18.858373 139842801629568 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_agent :: 0.025064 sec + |-> [rocprofiler-sdk] W20260626 11:06:18.897235 139842801629568 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_pmc :: 0.038836 sec + |-> [rocprofiler-sdk] W20260626 11:06:18.925308 139842801629568 simple_timer.cpp:55] SQLite3 generation :: rocpd kernel info :: 0.028059 sec + |-> [rocprofiler-sdk] W20260626 11:06:18.951315 139842801629568 simple_timer.cpp:55] SQLite3 generation :: rocpd_region :: 0.025995 sec + |-> [rocprofiler-sdk] W20260626 11:06:18.983280 139842801629568 simple_timer.cpp:55] SQLite3 generation :: rocpd_kernel_dispatch :: 0.031951 sec + |-> [rocprofiler-sdk] W20260626 11:06:18.983304 139842801629568 simple_timer.cpp:55] SQLite3 generation :: rocpd_pmc_event :: 0.000001 sec + |-> [rocprofiler-sdk] W20260626 11:06:18.983311 139842801629568 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_copy :: 0.000001 sec + |-> [rocprofiler-sdk] W20260626 11:06:18.983318 139842801629568 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_allocate :: 0.000002 sec + |-> [rocprofiler-sdk] W20260626 11:06:18.983593 139842801629568 simple_timer.cpp:55] SQLite3 generation :: SQL indexing :: 0.000269 sec + |-> [rocprofiler-sdk] W20260626 11:06:18.985629 139842801629568 simple_timer.cpp:55] SQLite3 generation :: total :: 1.486564 sec + |-> [rocprofiler-sdk] W20260626 11:06:18.990996 139842801629568 simple_timer.cpp:55] [rocprofv3] output generation :: 1.519819 sec + |-> [rocprofiler-sdk] W20260626 11:06:18.991380 139842801629568 simple_timer.cpp:55] [rocprofv3] tool finalization :: 1.522577 sec + |-> [rocprofiler-sdk] [rocprofiler-compute] In tool fini + |-> [rocprofiler-sdk] [rocprofiler-compute] [write_counters] Counter collection data has been written to: 
/app/projects/rocprofiler-compute/workloads/triton_trace/MI300A/out/pmc_1/26931_native_counter_collection.csv +Intermediate results_*.csv generation from rocpd databases is deprecated and will be replaced with automatic .db file retention in a future release. +[ml api trace] Moved counter collection and marker trace files to workload dir for ML API trace creation. +[Counter Collection: ] /app/projects/rocprofiler-compute/workloads/triton_trace/MI300A/ml_api_trace_pmc_perf_0_counter_collection.csv +[Marker API Trace: ] /app/projects/rocprofiler-compute/workloads/triton_trace/MI300A/ml_api_trace_pmc_perf_0_marker_api_trace.csv +PC sampling data collection skipped as --pc-sampling is not specified. +[roofline] Skipping roofline diff --git a/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/ml_api_trace/consolidated.csv b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/ml_api_trace/consolidated.csv new file mode 100644 index 00000000000..5da0934f1fa --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/ml_api_trace/consolidated.csv @@ -0,0 +1,148 @@ +Operator_Name,Context_Id,Backend,Kernel_Name,Counter_Name,Counter_Value,Start_Timestamp_function,End_Timestamp_function,Start_Timestamp_kernel,End_Timestamp_kernel +triton.JITFunction.add_kernel,1@triton_ffn.py:144,triton,add_kernel,GRBM_GUI_ACTIVE,454691.0,8793134124335413,8793134126014973,8793134126033548,8793134126065428 +triton.JITFunction.add_kernel,3@triton_ffn.py:144,triton,add_kernel,SQ_WAVES,1024.0,8793134131048353,8793134131088063,8793134131156857,8793134131159137 +triton.JITFunction.add_kernel,3@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134131048353,8793134131088063,8793134131156857,8793134131159137 +triton.JITFunction.add_kernel,3@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_VALU,9216.0,8793134131048353,8793134131088063,8793134131156857,8793134131159137 
+triton.JITFunction.add_kernel,3@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_MISC,2048.0,8793134131048353,8793134131088063,8793134131156857,8793134131159137 +triton.JITFunction.add_kernel,3@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_FLAT,3072.0,8793134131048353,8793134131088063,8793134131156857,8793134131159137 +triton.JITFunction.add_kernel,3@triton_ffn.py:144,triton,add_kernel,GRBM_GUI_ACTIVE,78305.0,8793134131048353,8793134131088063,8793134131156857,8793134131159137 +triton.JITFunction.add_kernel,2@triton_ffn.py:144,triton,add_kernel,SQ_WAVES,1024.0,8793134130453920,8793134130507710,8793134130529174,8793134130531454 +triton.JITFunction.add_kernel,2@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134130453920,8793134130507710,8793134130529174,8793134130531454 +triton.JITFunction.add_kernel,2@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_VALU,9216.0,8793134130453920,8793134130507710,8793134130529174,8793134130531454 +triton.JITFunction.add_kernel,3@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_SCA,2048.0,8793134131048353,8793134131088063,8793134131156857,8793134131159137 +triton.JITFunction.add_kernel,2@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_MISC,2048.0,8793134130453920,8793134130507710,8793134130529174,8793134130531454 +triton.JITFunction.add_kernel,2@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_FLAT,3072.0,8793134130453920,8793134130507710,8793134130529174,8793134130531454 +triton.JITFunction.add_kernel,2@triton_ffn.py:144,triton,add_kernel,GRBM_GUI_ACTIVE,78725.0,8793134130453920,8793134130507710,8793134130529174,8793134130531454 +triton.JITFunction.add_kernel,1@triton_ffn.py:144,triton,add_kernel,SQ_WAVES,1024.0,8793134124335413,8793134126014973,8793134126033548,8793134126065428 +triton.JITFunction.add_kernel,1@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134124335413,8793134126014973,8793134126033548,8793134126065428 
+triton.JITFunction.add_kernel,1@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_VALU,9216.0,8793134124335413,8793134126014973,8793134126033548,8793134126065428 +triton.JITFunction.add_kernel,1@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_SCA,2048.0,8793134124335413,8793134126014973,8793134126033548,8793134126065428 +triton.JITFunction.add_kernel,1@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_MISC,2048.0,8793134124335413,8793134126014973,8793134126033548,8793134126065428 +triton.JITFunction.add_kernel,1@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_FLAT,3072.0,8793134124335413,8793134126014973,8793134126033548,8793134126065428 +triton.JITFunction.add_kernel,2@triton_ffn.py:144,triton,add_kernel,SQ_ACTIVE_INST_SCA,2048.0,8793134130453920,8793134130507710,8793134130529174,8793134130531454 +triton.JITFunction.matmul_kernel,7@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_MISC,102400.0,8793134130674971,8793134130727171,8793134130749375,8793134130772375 +triton.JITFunction.matmul_kernel,7@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_FLAT,69632.0,8793134130674971,8793134130727171,8793134130749375,8793134130772375 +triton.JITFunction.matmul_kernel,7@triton_ffn.py:120,triton,matmul_kernel,GRBM_GUI_ACTIVE,335923.0,8793134130674971,8793134130727171,8793134130749375,8793134130772375 +triton.JITFunction.matmul_kernel,6@triton_ffn.py:120,triton,matmul_kernel,SQ_WAVES,256.0,8793134130297659,8793134130408939,8793134130417053,8793134130487893 +triton.JITFunction.matmul_kernel,6@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134130297659,8793134130408939,8793134130417053,8793134130487893 +triton.JITFunction.matmul_kernel,6@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VALU,736000.0,8793134130297659,8793134130408939,8793134130417053,8793134130487893 +triton.JITFunction.matmul_kernel,5@triton_ffn.py:120,triton,matmul_kernel,SQ_WAVES,1024.0,8793134126332865,8793134126387255,8793134126408950,8793134126431671 
+triton.JITFunction.matmul_kernel,6@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_MISC,99328.0,8793134130297659,8793134130408939,8793134130417053,8793134130487893 +triton.JITFunction.matmul_kernel,6@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_FLAT,66560.0,8793134130297659,8793134130408939,8793134130417053,8793134130487893 +triton.JITFunction.matmul_kernel,6@triton_ffn.py:120,triton,matmul_kernel,GRBM_GUI_ACTIVE,939269.0,8793134130297659,8793134130408939,8793134130417053,8793134130487893 +triton.JITFunction.matmul_kernel,5@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134126332865,8793134126387255,8793134126408950,8793134126431671 +triton.JITFunction.matmul_kernel,7@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_SCA,198656.0,8793134130674971,8793134130727171,8793134130749375,8793134130772375 +triton.JITFunction.matmul_kernel,6@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_SCA,178688.0,8793134130297659,8793134130408939,8793134130417053,8793134130487893 +triton.JITFunction.matmul_kernel,7@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VALU,904192.0,8793134130674971,8793134130727171,8793134130749375,8793134130772375 +triton.JITFunction.matmul_kernel,9@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_MISC,99328.0,8793134130978963,8793134131025843,8793134131047657,8793134131117937 +triton.JITFunction.matmul_kernel,7@triton_ffn.py:120,triton,matmul_kernel,SQ_WAVES,1024.0,8793134130674971,8793134130727171,8793134130749375,8793134130772375 +triton.JITFunction.matmul_kernel,8@triton_ffn.py:120,triton,matmul_kernel,GRBM_GUI_ACTIVE,336891.0,8793134130756921,8793134130818982,8793134130841455,8793134130863935 +triton.JITFunction.matmul_kernel,8@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_FLAT,69632.0,8793134130756921,8793134130818982,8793134130841455,8793134130863935 
+triton.JITFunction.matmul_kernel,8@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_MISC,102400.0,8793134130756921,8793134130818982,8793134130841455,8793134130863935 +triton.JITFunction.matmul_kernel,8@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VALU,904192.0,8793134130756921,8793134130818982,8793134130841455,8793134130863935 +triton.JITFunction.matmul_kernel,8@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134130756921,8793134130818982,8793134130841455,8793134130863935 +triton.JITFunction.matmul_kernel,8@triton_ffn.py:120,triton,matmul_kernel,SQ_WAVES,1024.0,8793134130756921,8793134130818982,8793134130841455,8793134130863935 +triton.JITFunction.matmul_kernel,9@triton_ffn.py:120,triton,matmul_kernel,GRBM_GUI_ACTIVE,935064.0,8793134130978963,8793134131025843,8793134131047657,8793134131117937 +triton.JITFunction.matmul_kernel,9@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_FLAT,66560.0,8793134130978963,8793134131025843,8793134131047657,8793134131117937 +triton.JITFunction.matmul_kernel,5@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VALU,904192.0,8793134126332865,8793134126387255,8793134126408950,8793134126431671 +triton.JITFunction.matmul_kernel,9@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_SCA,178688.0,8793134130978963,8793134131025843,8793134131047657,8793134131117937 +triton.JITFunction.matmul_kernel,9@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VALU,736000.0,8793134130978963,8793134131025843,8793134131047657,8793134131117937 +triton.JITFunction.matmul_kernel,9@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134130978963,8793134131025843,8793134131047657,8793134131117937 +triton.JITFunction.matmul_kernel,9@triton_ffn.py:120,triton,matmul_kernel,SQ_WAVES,256.0,8793134130978963,8793134131025843,8793134131047657,8793134131117937 
+triton.JITFunction.matmul_kernel,7@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134130674971,8793134130727171,8793134130749375,8793134130772375 +triton.JITFunction.matmul_kernel,5@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_SCA,198656.0,8793134126332865,8793134126387255,8793134126408950,8793134126431671 +triton.JITFunction.matmul_kernel,8@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_SCA,198656.0,8793134130756921,8793134130818982,8793134130841455,8793134130863935 +triton.JITFunction.matmul_kernel,5@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_FLAT,69632.0,8793134126332865,8793134126387255,8793134126408950,8793134126431671 +triton.JITFunction.matmul_kernel,1@triton_ffn.py:120,triton,matmul_kernel,GRBM_GUI_ACTIVE,347342.0,8793134115438319,8793134118569718,8793134118588026,8793134118611746 +triton.JITFunction.matmul_kernel,1@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_FLAT,69632.0,8793134115438319,8793134118569718,8793134118588026,8793134118611746 +triton.JITFunction.matmul_kernel,1@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_MISC,102400.0,8793134115438319,8793134118569718,8793134118588026,8793134118611746 +triton.JITFunction.matmul_kernel,1@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_SCA,198656.0,8793134115438319,8793134118569718,8793134118588026,8793134118611746 +triton.JITFunction.matmul_kernel,1@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VALU,904192.0,8793134115438319,8793134118569718,8793134118588026,8793134118611746 +triton.JITFunction.matmul_kernel,1@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134115438319,8793134118569718,8793134118588026,8793134118611746 +triton.JITFunction.matmul_kernel,1@triton_ffn.py:120,triton,matmul_kernel,SQ_WAVES,1024.0,8793134115438319,8793134118569718,8793134118588026,8793134118611746 
+triton.JITFunction.matmul_kernel,2@triton_ffn.py:120,triton,matmul_kernel,GRBM_GUI_ACTIVE,333035.0,8793134118637958,8793134118707599,8793134118727907,8793134118751267 +triton.JITFunction.matmul_kernel,2@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_FLAT,69632.0,8793134118637958,8793134118707599,8793134118727907,8793134118751267 +triton.JITFunction.matmul_kernel,2@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_MISC,102400.0,8793134118637958,8793134118707599,8793134118727907,8793134118751267 +triton.JITFunction.matmul_kernel,2@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_SCA,198656.0,8793134118637958,8793134118707599,8793134118727907,8793134118751267 +triton.JITFunction.matmul_kernel,2@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VALU,904192.0,8793134118637958,8793134118707599,8793134118727907,8793134118751267 +triton.JITFunction.matmul_kernel,2@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134118637958,8793134118707599,8793134118727907,8793134118751267 +triton.JITFunction.matmul_kernel,3@triton_ffn.py:120,triton,matmul_kernel,GRBM_GUI_ACTIVE,945476.0,8793134124032561,8793134124097991,8793134124116617,8793134124187938 +triton.JITFunction.matmul_kernel,2@triton_ffn.py:120,triton,matmul_kernel,SQ_WAVES,1024.0,8793134118637958,8793134118707599,8793134118727907,8793134118751267 +triton.JITFunction.matmul_kernel,4@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_FLAT,69632.0,8793134126227324,8793134126296225,8793134126316470,8793134126339150 +triton.JITFunction.matmul_kernel,5@triton_ffn.py:120,triton,matmul_kernel,GRBM_GUI_ACTIVE,328060.0,8793134126332865,8793134126387255,8793134126408950,8793134126431671 +triton.JITFunction.matmul_kernel,4@triton_ffn.py:120,triton,matmul_kernel,SQ_WAVES,1024.0,8793134126227324,8793134126296225,8793134126316470,8793134126339150 
+triton.JITFunction.matmul_kernel,4@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134126227324,8793134126296225,8793134126316470,8793134126339150 +triton.JITFunction.matmul_kernel,4@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VALU,904192.0,8793134126227324,8793134126296225,8793134126316470,8793134126339150 +triton.JITFunction.matmul_kernel,4@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_SCA,198656.0,8793134126227324,8793134126296225,8793134126316470,8793134126339150 +triton.JITFunction.matmul_kernel,4@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_MISC,102400.0,8793134126227324,8793134126296225,8793134126316470,8793134126339150 +triton.JITFunction.matmul_kernel,3@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_FLAT,66560.0,8793134124032561,8793134124097991,8793134124116617,8793134124187938 +triton.JITFunction.matmul_kernel,5@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_MISC,102400.0,8793134126332865,8793134126387255,8793134126408950,8793134126431671 +triton.JITFunction.matmul_kernel,3@triton_ffn.py:120,triton,matmul_kernel,SQ_WAVES,256.0,8793134124032561,8793134124097991,8793134124116617,8793134124187938 +triton.JITFunction.matmul_kernel,3@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134124032561,8793134124097991,8793134124116617,8793134124187938 +triton.JITFunction.matmul_kernel,3@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_VALU,736000.0,8793134124032561,8793134124097991,8793134124116617,8793134124187938 +triton.JITFunction.matmul_kernel,3@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_SCA,178688.0,8793134124032561,8793134124097991,8793134124116617,8793134124187938 +triton.JITFunction.matmul_kernel,3@triton_ffn.py:120,triton,matmul_kernel,SQ_ACTIVE_INST_MISC,99328.0,8793134124032561,8793134124097991,8793134124116617,8793134124187938 
+triton.JITFunction.matmul_kernel,4@triton_ffn.py:120,triton,matmul_kernel,GRBM_GUI_ACTIVE,330896.0,8793134126227324,8793134126296225,8793134126316470,8793134126339150 +triton.JITFunction.mul_kernel,2@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134130152598,8793134130207458,8793134130229572,8793134130234172 +triton.JITFunction.mul_kernel,2@triton_ffn.py:144,triton,mul_kernel,SQ_WAVES,4096.0,8793134130152598,8793134130207458,8793134130229572,8793134130234172 +triton.JITFunction.mul_kernel,3@triton_ffn.py:144,triton,mul_kernel,GRBM_GUI_ACTIVE,106463.0,8793134130914742,8793134130949583,8793134130970536,8793134130975176 +triton.JITFunction.mul_kernel,3@triton_ffn.py:144,triton,mul_kernel,SQ_WAVES,4096.0,8793134130914742,8793134130949583,8793134130970536,8793134130975176 +triton.JITFunction.mul_kernel,3@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_SCA,8192.0,8793134130914742,8793134130949583,8793134130970536,8793134130975176 +triton.JITFunction.mul_kernel,3@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_VALU,36864.0,8793134130914742,8793134130949583,8793134130970536,8793134130975176 +triton.JITFunction.mul_kernel,3@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134130914742,8793134130949583,8793134130970536,8793134130975176 +triton.JITFunction.mul_kernel,2@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_VALU,36864.0,8793134130152598,8793134130207458,8793134130229572,8793134130234172 +triton.JITFunction.mul_kernel,3@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_MISC,8192.0,8793134130914742,8793134130949583,8793134130970536,8793134130975176 +triton.JITFunction.mul_kernel,2@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_SCA,8192.0,8793134130152598,8793134130207458,8793134130229572,8793134130234172 +triton.JITFunction.mul_kernel,3@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_FLAT,12288.0,8793134130914742,8793134130949583,8793134130970536,8793134130975176 
+triton.JITFunction.mul_kernel,2@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_FLAT,12288.0,8793134130152598,8793134130207458,8793134130229572,8793134130234172 +triton.JITFunction.mul_kernel,2@triton_ffn.py:144,triton,mul_kernel,GRBM_GUI_ACTIVE,103382.0,8793134130152598,8793134130207458,8793134130229572,8793134130234172 +triton.JITFunction.mul_kernel,1@triton_ffn.py:144,triton,mul_kernel,SQ_WAVES,4096.0,8793134122122169,8793134123933670,8793134123952617,8793134124065537 +triton.JITFunction.mul_kernel,1@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134122122169,8793134123933670,8793134123952617,8793134124065537 +triton.JITFunction.mul_kernel,1@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_VALU,36864.0,8793134122122169,8793134123933670,8793134123952617,8793134124065537 +triton.JITFunction.mul_kernel,1@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_SCA,8192.0,8793134122122169,8793134123933670,8793134123952617,8793134124065537 +triton.JITFunction.mul_kernel,1@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_MISC,8192.0,8793134122122169,8793134123933670,8793134123952617,8793134124065537 +triton.JITFunction.mul_kernel,1@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_FLAT,12288.0,8793134122122169,8793134123933670,8793134123952617,8793134124065537 +triton.JITFunction.mul_kernel,1@triton_ffn.py:144,triton,mul_kernel,GRBM_GUI_ACTIVE,1478575.0,8793134122122169,8793134123933670,8793134123952617,8793134124065537 +triton.JITFunction.mul_kernel,2@triton_ffn.py:144,triton,mul_kernel,SQ_ACTIVE_INST_MISC,8192.0,8793134130152598,8793134130207458,8793134130229572,8793134130234172 +triton.JITFunction.rmsnorm_kernel,2@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134126121034,8793134126182924,8793134126204629,8793134126235949 +triton.JITFunction.rmsnorm_kernel,2@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_WAVES,2048.0,8793134126121034,8793134126182924,8793134126204629,8793134126235949 
+triton.JITFunction.rmsnorm_kernel,3@triton_ffn.py:109,triton,rmsnorm_kernel,GRBM_GUI_ACTIVE,87899.0,8793134130576570,8793134130640051,8793134130660454,8793134130663134 +triton.JITFunction.rmsnorm_kernel,3@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_WAVES,2048.0,8793134130576570,8793134130640051,8793134130660454,8793134130663134 +triton.JITFunction.rmsnorm_kernel,3@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_SCA,36864.0,8793134130576570,8793134130640051,8793134130660454,8793134130663134 +triton.JITFunction.rmsnorm_kernel,3@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_VALU,141312.0,8793134130576570,8793134130640051,8793134130660454,8793134130663134 +triton.JITFunction.rmsnorm_kernel,3@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134130576570,8793134130640051,8793134130660454,8793134130663134 +triton.JITFunction.rmsnorm_kernel,2@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_VALU,141312.0,8793134126121034,8793134126182924,8793134126204629,8793134126235949 +triton.JITFunction.rmsnorm_kernel,3@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_MISC,10240.0,8793134130576570,8793134130640051,8793134130660454,8793134130663134 +triton.JITFunction.rmsnorm_kernel,2@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_SCA,36864.0,8793134126121034,8793134126182924,8793134126204629,8793134126235949 +triton.JITFunction.rmsnorm_kernel,3@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_FLAT,6144.0,8793134130576570,8793134130640051,8793134130660454,8793134130663134 +triton.JITFunction.rmsnorm_kernel,2@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_FLAT,6144.0,8793134126121034,8793134126182924,8793134126204629,8793134126235949 +triton.JITFunction.rmsnorm_kernel,2@triton_ffn.py:109,triton,rmsnorm_kernel,GRBM_GUI_ACTIVE,443730.0,8793134126121034,8793134126182924,8793134126204629,8793134126235949 
+triton.JITFunction.rmsnorm_kernel,1@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_WAVES,2048.0,8793133166884863,8793134114343043,8793134114329043,8793134114333123 +triton.JITFunction.rmsnorm_kernel,1@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793133166884863,8793134114343043,8793134114329043,8793134114333123 +triton.JITFunction.rmsnorm_kernel,1@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_VALU,141312.0,8793133166884863,8793134114343043,8793134114329043,8793134114333123 +triton.JITFunction.rmsnorm_kernel,1@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_SCA,36864.0,8793133166884863,8793134114343043,8793134114329043,8793134114333123 +triton.JITFunction.rmsnorm_kernel,1@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_MISC,10240.0,8793133166884863,8793134114343043,8793134114329043,8793134114333123 +triton.JITFunction.rmsnorm_kernel,1@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_FLAT,6144.0,8793133166884863,8793134114343043,8793134114329043,8793134114333123 +triton.JITFunction.rmsnorm_kernel,1@triton_ffn.py:109,triton,rmsnorm_kernel,GRBM_GUI_ACTIVE,86452.0,8793133166884863,8793134114343043,8793134114329043,8793134114333123 +triton.JITFunction.rmsnorm_kernel,2@triton_ffn.py:109,triton,rmsnorm_kernel,SQ_ACTIVE_INST_MISC,10240.0,8793134126121034,8793134126182924,8793134126204629,8793134126235949 +triton.JITFunction.silu_kernel,2@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134129672135,8793134130104747,8793134130114291,8793134130118291 +triton.JITFunction.silu_kernel,2@triton_ffn.py:144,triton,silu_kernel,SQ_WAVES,4096.0,8793134129672135,8793134130104747,8793134130114291,8793134130118291 +triton.JITFunction.silu_kernel,3@triton_ffn.py:144,triton,silu_kernel,GRBM_GUI_ACTIVE,98415.0,8793134130848972,8793134130890192,8793134130911016,8793134130914976 
+triton.JITFunction.silu_kernel,3@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_VALU,458752.0,8793134130848972,8793134130890192,8793134130911016,8793134130914976 +triton.JITFunction.silu_kernel,3@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_MISC,8192.0,8793134130848972,8793134130890192,8793134130911016,8793134130914976 +triton.JITFunction.silu_kernel,3@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_SCA,28672.0,8793134130848972,8793134130890192,8793134130911016,8793134130914976 +triton.JITFunction.silu_kernel,2@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_VALU,458752.0,8793134129672135,8793134130104747,8793134130114291,8793134130118291 +triton.JITFunction.silu_kernel,3@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_FLAT,8192.0,8793134130848972,8793134130890192,8793134130911016,8793134130914976 +triton.JITFunction.silu_kernel,2@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_SCA,28672.0,8793134129672135,8793134130104747,8793134130114291,8793134130118291 +triton.JITFunction.silu_kernel,1@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134120188608,8793134121994489,8793134122012486,8793134122105766 +triton.JITFunction.silu_kernel,2@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_FLAT,8192.0,8793134129672135,8793134130104747,8793134130114291,8793134130118291 +triton.JITFunction.silu_kernel,2@triton_ffn.py:144,triton,silu_kernel,GRBM_GUI_ACTIVE,98373.0,8793134129672135,8793134130104747,8793134130114291,8793134130118291 +triton.JITFunction.silu_kernel,1@triton_ffn.py:144,triton,silu_kernel,SQ_WAVES,4096.0,8793134120188608,8793134121994489,8793134122012486,8793134122105766 +triton.JITFunction.silu_kernel,1@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_VALU,458752.0,8793134120188608,8793134121994489,8793134122012486,8793134122105766 +triton.JITFunction.silu_kernel,1@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_SCA,28672.0,8793134120188608,8793134121994489,8793134122012486,8793134122105766 
+triton.JITFunction.silu_kernel,1@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_MISC,8192.0,8793134120188608,8793134121994489,8793134122012486,8793134122105766 +triton.JITFunction.silu_kernel,1@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_FLAT,8192.0,8793134120188608,8793134121994489,8793134122012486,8793134122105766 +triton.JITFunction.silu_kernel,1@triton_ffn.py:144,triton,silu_kernel,GRBM_GUI_ACTIVE,1231301.0,8793134120188608,8793134121994489,8793134122012486,8793134122105766 +triton.JITFunction.silu_kernel,3@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_VMEM,0.0,8793134130848972,8793134130890192,8793134130911016,8793134130914976 +triton.JITFunction.silu_kernel,2@triton_ffn.py:144,triton,silu_kernel,SQ_ACTIVE_INST_MISC,8192.0,8793134129672135,8793134130104747,8793134130114291,8793134130118291 +triton.JITFunction.silu_kernel,3@triton_ffn.py:144,triton,silu_kernel,SQ_WAVES,4096.0,8793134130848972,8793134130890192,8793134130911016,8793134130914976 diff --git a/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/ml_api_trace_pmc_perf_0_counter_collection.csv b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/ml_api_trace_pmc_perf_0_counter_collection.csv new file mode 100644 index 00000000000..92f069a7cf0 --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/ml_api_trace_pmc_perf_0_counter_collection.csv @@ -0,0 +1,204 @@ +GPU_ID,GUID,Correlation_Id,Dispatch_ID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Kernel_Name,Start_Timestamp,End_Timestamp,Kernel_ID,Counter_Name,Counter_Value +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,GRBM_GUI_ACTIVE,129243.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous 
namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_FLAT,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_MISC,57344.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_SCA,417792.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_VALU,1208320.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, 
at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,GRBM_GUI_ACTIVE,82877.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, 
at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_FLAT,8.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous 
namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_MISC,112.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_SCA,816.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_VALU,2360.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_WAVES,8.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,GRBM_GUI_ACTIVE,359514.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, 
at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_FLAT,16384.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, 
at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_MISC,102144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous 
namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_SCA,744192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_VALU,2243200.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_WAVES,7296.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,GRBM_GUI_ACTIVE,572373.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_FLAT,16384.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_MISC,102144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_SCA,744192.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_VALU,2243200.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_WAVES,7296.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, 
double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,GRBM_GUI_ACTIVE,711868.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_FLAT,16384.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_MISC,102144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_SCA,744192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_VALU,2243200.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_WAVES,7296.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,GRBM_GUI_ACTIVE,86452.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_FLAT,6144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_MISC,10240.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_SCA,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_VALU,141312.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_WAVES,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,GRBM_GUI_ACTIVE,347342.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_MISC,102400.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,GRBM_GUI_ACTIVE,333035.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_MISC,102400.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,GRBM_GUI_ACTIVE,1231301.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_FLAT,8192.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_SCA,28672.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_VALU,458752.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,GRBM_GUI_ACTIVE,1478575.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_FLAT,12288.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_SCA,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_VALU,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,GRBM_GUI_ACTIVE,945476.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_FLAT,66560.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_MISC,99328.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_SCA,178688.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_VALU,736000.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_WAVES,256.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,GRBM_GUI_ACTIVE,454691.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_FLAT,3072.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_MISC,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_SCA,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_VALU,9216.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,GRBM_GUI_ACTIVE,443730.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_FLAT,6144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_MISC,10240.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_SCA,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_VALU,141312.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_WAVES,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,GRBM_GUI_ACTIVE,330896.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_MISC,102400.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_WAVES,1024.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,GRBM_GUI_ACTIVE,328060.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_MISC,102400.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,GRBM_GUI_ACTIVE,98373.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_FLAT,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_SCA,28672.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_VALU,458752.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_VMEM,0.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,GRBM_GUI_ACTIVE,103382.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_FLAT,12288.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_SCA,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_VALU,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,GRBM_GUI_ACTIVE,939269.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_FLAT,66560.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_MISC,99328.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_SCA,178688.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_VALU,736000.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_WAVES,256.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,GRBM_GUI_ACTIVE,78725.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_FLAT,3072.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_MISC,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_SCA,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_VALU,9216.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,GRBM_GUI_ACTIVE,87899.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_FLAT,6144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_MISC,10240.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_SCA,36864.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_VALU,141312.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_WAVES,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,GRBM_GUI_ACTIVE,335923.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_MISC,102400.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,GRBM_GUI_ACTIVE,336891.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_MISC,102400.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,GRBM_GUI_ACTIVE,98415.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_FLAT,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_SCA,28672.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_VALU,458752.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,GRBM_GUI_ACTIVE,106463.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_FLAT,12288.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_SCA,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_VALU,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,GRBM_GUI_ACTIVE,935064.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_FLAT,66560.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_MISC,99328.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_SCA,178688.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_VALU,736000.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_WAVES,256.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,GRBM_GUI_ACTIVE,78305.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_FLAT,3072.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_MISC,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_SCA,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_VALU,9216.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,GRBM_GUI_ACTIVE,81370.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_FLAT,2.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_MISC,41.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_SCA,97.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_VALU,22.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_WAVES,4.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,GRBM_GUI_ACTIVE,175824.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_FLAT,1170.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_MISC,27439.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_SCA,65253.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_VALU,56466.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_WAVES,512.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,GRBM_GUI_ACTIVE,126114.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_FLAT,8.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_MISC,44.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_SCA,118.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_VALU,57.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_WAVES,8.0 diff --git a/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/ml_api_trace_pmc_perf_0_marker_api_trace.csv 
b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/ml_api_trace_pmc_perf_0_marker_api_trace.csv new file mode 100644 index 00000000000..618cea267d1 --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/ml_api_trace_pmc_perf_0_marker_api_trace.csv @@ -0,0 +1,22 @@ +Domain,Function,Process_Id,Thread_Id,Correlation_Id,GUID,Start_Timestamp,End_Timestamp,Backend +MARKER_CORE_RANGE_API,triton.JITFunction.rmsnorm_kernel:#1@triton_ffn.py:109,26931,26931,6,00020c1c-8508-7508-8d07-3a98d0a74f41,8793133166884863,8793134114343043,triton +MARKER_CORE_RANGE_API,triton.JITFunction.matmul_kernel:#1@triton_ffn.py:120,26931,26931,7,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134115438319,8793134118569718,triton +MARKER_CORE_RANGE_API,triton.JITFunction.matmul_kernel:#2@triton_ffn.py:120,26931,26931,8,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134118637958,8793134118707599,triton +MARKER_CORE_RANGE_API,triton.JITFunction.silu_kernel:#1@triton_ffn.py:144,26931,26931,9,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134120188608,8793134121994489,triton +MARKER_CORE_RANGE_API,triton.JITFunction.mul_kernel:#1@triton_ffn.py:144,26931,26931,10,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134122122169,8793134123933670,triton +MARKER_CORE_RANGE_API,triton.JITFunction.matmul_kernel:#3@triton_ffn.py:120,26931,26931,11,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134124032561,8793134124097991,triton +MARKER_CORE_RANGE_API,triton.JITFunction.add_kernel:#1@triton_ffn.py:144,26931,26931,12,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134124335413,8793134126014973,triton +MARKER_CORE_RANGE_API,triton.JITFunction.rmsnorm_kernel:#2@triton_ffn.py:109,26931,26931,13,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134126121034,8793134126182924,triton +MARKER_CORE_RANGE_API,triton.JITFunction.matmul_kernel:#4@triton_ffn.py:120,26931,26931,14,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134126227324,8793134126296225,triton 
+MARKER_CORE_RANGE_API,triton.JITFunction.matmul_kernel:#5@triton_ffn.py:120,26931,26931,15,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134126332865,8793134126387255,triton +MARKER_CORE_RANGE_API,triton.JITFunction.silu_kernel:#2@triton_ffn.py:144,26931,26931,16,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134129672135,8793134130104747,triton +MARKER_CORE_RANGE_API,triton.JITFunction.mul_kernel:#2@triton_ffn.py:144,26931,26931,17,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134130152598,8793134130207458,triton +MARKER_CORE_RANGE_API,triton.JITFunction.matmul_kernel:#6@triton_ffn.py:120,26931,26931,18,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134130297659,8793134130408939,triton +MARKER_CORE_RANGE_API,triton.JITFunction.add_kernel:#2@triton_ffn.py:144,26931,26931,19,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134130453920,8793134130507710,triton +MARKER_CORE_RANGE_API,triton.JITFunction.rmsnorm_kernel:#3@triton_ffn.py:109,26931,26931,20,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134130576570,8793134130640051,triton +MARKER_CORE_RANGE_API,triton.JITFunction.matmul_kernel:#7@triton_ffn.py:120,26931,26931,21,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134130674971,8793134130727171,triton +MARKER_CORE_RANGE_API,triton.JITFunction.matmul_kernel:#8@triton_ffn.py:120,26931,26931,22,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134130756921,8793134130818982,triton +MARKER_CORE_RANGE_API,triton.JITFunction.silu_kernel:#3@triton_ffn.py:144,26931,26931,23,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134130848972,8793134130890192,triton +MARKER_CORE_RANGE_API,triton.JITFunction.mul_kernel:#3@triton_ffn.py:144,26931,26931,24,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134130914742,8793134130949583,triton +MARKER_CORE_RANGE_API,triton.JITFunction.matmul_kernel:#9@triton_ffn.py:120,26931,26931,25,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134130978963,8793134131025843,triton 
+MARKER_CORE_RANGE_API,triton.JITFunction.add_kernel:#3@triton_ffn.py:144,26931,26931,26,00020c1c-8508-7508-8d07-3a98d0a74f41,8793134131048353,8793134131088063,triton diff --git a/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/perfmon/pmc_perf_0.yaml b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/perfmon/pmc_perf_0.yaml new file mode 100644 index 00000000000..ec8ba4fb295 --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/perfmon/pmc_perf_0.yaml @@ -0,0 +1,9 @@ +jobs: +- pmc: + - SQ_ACTIVE_INST_FLAT + - SQ_ACTIVE_INST_MISC + - SQ_ACTIVE_INST_SCA + - SQ_ACTIVE_INST_VALU + - SQ_ACTIVE_INST_VMEM + - SQ_WAVES + - GRBM_GUI_ACTIVE diff --git a/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/pmc_dispatch_info.csv b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/pmc_dispatch_info.csv new file mode 100644 index 00000000000..75e2a6b374b --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/pmc_dispatch_info.csv @@ -0,0 +1,30 @@ +Dispatch_ID,Kernel_Name,GPU_ID +0,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",0 +1,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",0 +2,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous 
namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",0 +3,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",0 +4,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",0 +5,rmsnorm_kernel,0 +6,matmul_kernel,0 +7,matmul_kernel,0 
+8,silu_kernel,0 +9,mul_kernel,0 +10,matmul_kernel,0 +11,add_kernel,0 +12,rmsnorm_kernel,0 +13,matmul_kernel,0 +14,matmul_kernel,0 +15,silu_kernel,0 +16,mul_kernel,0 +17,matmul_kernel,0 +18,add_kernel,0 +19,rmsnorm_kernel,0 +20,matmul_kernel,0 +21,matmul_kernel,0 +22,silu_kernel,0 +23,mul_kernel,0 +24,matmul_kernel,0 +25,add_kernel,0 +26,__amd_rocclr_fillBufferAligned,0 +27,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",0 +28,__amd_rocclr_copyBuffer,0 diff --git a/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/pmc_perf.csv b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/pmc_perf.csv new file mode 100644 index 00000000000..92f069a7cf0 --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/pmc_perf.csv @@ -0,0 +1,204 @@ +GPU_ID,GUID,Correlation_Id,Dispatch_ID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Kernel_Name,Start_Timestamp,End_Timestamp,Kernel_ID,Counter_Name,Counter_Value +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,GRBM_GUI_ACTIVE,129243.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_FLAT,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_MISC,57344.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_SCA,417792.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_VALU,1208320.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,GRBM_GUI_ACTIVE,82877.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_FLAT,8.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_MISC,112.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_SCA,816.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_VALU,2360.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_WAVES,8.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, 
double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,GRBM_GUI_ACTIVE,359514.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_FLAT,16384.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_MISC,102144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_SCA,744192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_VALU,2243200.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_WAVES,7296.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, 
double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,GRBM_GUI_ACTIVE,572373.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_FLAT,16384.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_MISC,102144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, 
at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous 
namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_SCA,744192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, 
double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_VALU,2243200.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} 
const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_WAVES,7296.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,GRBM_GUI_ACTIVE,711868.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, 
at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_FLAT,16384.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_MISC,102144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_SCA,744192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_VALU,2243200.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_VMEM,0.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_WAVES,7296.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,GRBM_GUI_ACTIVE,86452.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_FLAT,6144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_MISC,10240.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_SCA,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_VALU,141312.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_WAVES,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,GRBM_GUI_ACTIVE,347342.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_MISC,102400.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_SCA,198656.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,GRBM_GUI_ACTIVE,333035.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_MISC,102400.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,GRBM_GUI_ACTIVE,1231301.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_FLAT,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_MISC,8192.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_SCA,28672.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_VALU,458752.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,GRBM_GUI_ACTIVE,1478575.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_FLAT,12288.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_SCA,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_VALU,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,GRBM_GUI_ACTIVE,945476.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_FLAT,66560.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_MISC,99328.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_SCA,178688.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_VALU,736000.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_WAVES,256.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,GRBM_GUI_ACTIVE,454691.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_FLAT,3072.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_MISC,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_SCA,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_VALU,9216.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,GRBM_GUI_ACTIVE,443730.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_FLAT,6144.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_MISC,10240.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_SCA,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_VALU,141312.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_WAVES,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,GRBM_GUI_ACTIVE,330896.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_MISC,102400.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,GRBM_GUI_ACTIVE,328060.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_MISC,102400.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,GRBM_GUI_ACTIVE,98373.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_FLAT,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_SCA,28672.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_VALU,458752.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_WAVES,4096.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,GRBM_GUI_ACTIVE,103382.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_FLAT,12288.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_SCA,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_VALU,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,GRBM_GUI_ACTIVE,939269.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_FLAT,66560.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_MISC,99328.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_SCA,178688.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_VALU,736000.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_VMEM,0.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_WAVES,256.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,GRBM_GUI_ACTIVE,78725.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_FLAT,3072.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_MISC,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_SCA,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_VALU,9216.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,GRBM_GUI_ACTIVE,87899.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_FLAT,6144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_MISC,10240.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_SCA,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_VALU,141312.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_VMEM,0.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_WAVES,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,GRBM_GUI_ACTIVE,335923.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_MISC,102400.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,GRBM_GUI_ACTIVE,336891.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_MISC,102400.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_VALU,904192.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,GRBM_GUI_ACTIVE,98415.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_FLAT,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_SCA,28672.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_VALU,458752.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,GRBM_GUI_ACTIVE,106463.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_FLAT,12288.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_SCA,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_VALU,36864.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,GRBM_GUI_ACTIVE,935064.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_FLAT,66560.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_MISC,99328.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_SCA,178688.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_VALU,736000.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_WAVES,256.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,GRBM_GUI_ACTIVE,78305.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_FLAT,3072.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_MISC,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_SCA,2048.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_VALU,9216.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,GRBM_GUI_ACTIVE,81370.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_FLAT,2.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_MISC,41.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_SCA,97.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_VALU,22.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_WAVES,4.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,GRBM_GUI_ACTIVE,175824.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void 
at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_FLAT,1170.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_MISC,27439.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_SCA,65253.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_VALU,56466.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void 
at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_WAVES,512.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,GRBM_GUI_ACTIVE,126114.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_FLAT,8.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_MISC,44.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_SCA,118.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_VALU,57.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_WAVES,8.0 diff --git a/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/profiling_config.yaml b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/profiling_config.yaml new file mode 100644 index 00000000000..06c19f304b7 --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/profiling_config.yaml @@ -0,0 +1,45 @@ +attach_duration_msec: null +attach_pid: null +bench_only: false +config_dir: /app/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs +device: 0 +dispatch: null +experimental: true +filter_blocks: +- 11.2.2 +- 11.2.3 +- 11.2.4 +- 11.2.5 
+format_rocprof_output: rocpd +iteration_multiplexing: null +join_type: grid +kernel: null +kokkos_trace: false +list_available_metrics: false +list_blocks: null +list_metrics: null +list_sets: false +loglevel: 20 +membw_analysis: false +ml_api_trace: false +mode: profile +name: triton_trace +no_native_tool: false +no_roof: true +output_directory: /app/projects/rocprofiler-compute/workloads/triton_trace/MI300A +overwrite: false +pc_sampling: false +pc_sampling_interval: null +pc_sampling_method: stochastic +quiet: false +remaining: python3 /app/projects/rocprofiler-compute/src/utils/inject_roctx/launch.py + --frameworks triton -- ./tests/triton_ffn.py +retain_rocpd_output: false +rocprofiler_sdk_tool_path: /rocm-venv/lib/python3.12/site-packages/_rocm_sdk_core/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so +roof_only: false +set_selected: compute_thruput_util +specs: false +target: null +torch_trace: false +triton_trace: true +verbose: 0 diff --git a/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/results_pmc_perf_0.csv b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/results_pmc_perf_0.csv new file mode 100644 index 00000000000..92f069a7cf0 --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/results_pmc_perf_0.csv @@ -0,0 +1,204 @@ +GPU_ID,GUID,Correlation_Id,Dispatch_ID,Grid_Size,Workgroup_Size,LDS_Per_Workgroup,Scratch_Per_Workitem,Arch_VGPR,Accum_VGPR,SGPR,Kernel_Name,Start_Timestamp,End_Timestamp,Kernel_ID,Counter_Name,Counter_Value +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,GRBM_GUI_ACTIVE,129243.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_FLAT,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_MISC,57344.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_SCA,417792.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_VALU,1208320.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,1,0,262144,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133164597482,8793133164603682,0,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, 
at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,GRBM_GUI_ACTIVE,82877.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, 
at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_FLAT,8.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous 
namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_MISC,112.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_SCA,816.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_VALU,2360.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,2,1,512,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165004804,8793133165008004,1,SQ_WAVES,8.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,GRBM_GUI_ACTIVE,359514.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, 
at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_FLAT,16384.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, 
at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_MISC,102144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous 
namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_SCA,744192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_VALU,2243200.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,3,2,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165928568,8793133165953648,2,SQ_WAVES,7296.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,GRBM_GUI_ACTIVE,572373.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_FLAT,16384.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() 
const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_MISC,102144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_SCA,744192.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_VALU,2243200.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,4,3,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133165995328,8793133166037248,2,SQ_WAVES,7296.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, 
double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,GRBM_GUI_ACTIVE,711868.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_FLAT,16384.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase 
const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_MISC,102144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_SCA,744192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() 
const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_VALU,2243200.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, 
at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,5,4,466944,256,0,0,60,4,48,"void at::native::(anonymous namespace)::distribution_elementwise_grid_stride_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, 
double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1}>(long, at::PhiloxCudaState, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::(anonymous namespace)::distribution_nullary_kernel, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2}, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, at::native::templates::cuda::normal_and_transform(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}>(at::TensorIteratorBase&, at::CUDAGeneratorImpl*, 
at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(hiprandStatePhilox4_32_10*)#2} const&, at::native::templates::cuda::normal_kernel(at::TensorBase const&, double, double, at::CUDAGeneratorImpl*)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1})::{lambda(int, float)#1})",8793133166097889,8793133166151169,2,SQ_WAVES,7296.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,GRBM_GUI_ACTIVE,86452.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_FLAT,6144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_MISC,10240.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_SCA,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_VALU,141312.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,6,5,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134114329043,8793134114333123,3,SQ_WAVES,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,GRBM_GUI_ACTIVE,347342.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_MISC,102400.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,7,6,65536,256,32768,0,28,148,32,matmul_kernel,8793134118588026,8793134118611746,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,GRBM_GUI_ACTIVE,333035.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_MISC,102400.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,8,7,65536,256,32768,0,28,148,32,matmul_kernel,8793134118727907,8793134118751267,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,GRBM_GUI_ACTIVE,1231301.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_FLAT,8192.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_SCA,28672.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_VALU,458752.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,9,8,262144,256,0,0,32,0,32,silu_kernel,8793134122012486,8793134122105766,5,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,GRBM_GUI_ACTIVE,1478575.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_FLAT,12288.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_SCA,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_VALU,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,10,9,262144,256,0,0,12,4,32,mul_kernel,8793134123952617,8793134124065537,6,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,GRBM_GUI_ACTIVE,945476.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_FLAT,66560.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_MISC,99328.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_SCA,178688.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_VALU,736000.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,11,10,16384,256,32768,0,28,148,32,matmul_kernel,8793134124116617,8793134124187938,7,SQ_WAVES,256.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,GRBM_GUI_ACTIVE,454691.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_FLAT,3072.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_MISC,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_SCA,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_VALU,9216.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,12,11,65536,256,0,0,12,4,32,add_kernel,8793134126033548,8793134126065428,8,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,GRBM_GUI_ACTIVE,443730.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_FLAT,6144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_MISC,10240.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_SCA,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_VALU,141312.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,13,12,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134126204629,8793134126235949,3,SQ_WAVES,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,GRBM_GUI_ACTIVE,330896.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_MISC,102400.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,14,13,65536,256,32768,0,28,148,32,matmul_kernel,8793134126316470,8793134126339150,4,SQ_WAVES,1024.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,GRBM_GUI_ACTIVE,328060.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_MISC,102400.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,15,14,65536,256,32768,0,28,148,32,matmul_kernel,8793134126408950,8793134126431671,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,GRBM_GUI_ACTIVE,98373.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_FLAT,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_SCA,28672.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_VALU,458752.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_ACTIVE_INST_VMEM,0.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,16,15,262144,256,0,0,32,0,32,silu_kernel,8793134130114291,8793134130118291,5,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,GRBM_GUI_ACTIVE,103382.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_FLAT,12288.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_SCA,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_VALU,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,17,16,262144,256,0,0,12,4,32,mul_kernel,8793134130229572,8793134130234172,6,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,GRBM_GUI_ACTIVE,939269.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_FLAT,66560.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_MISC,99328.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_SCA,178688.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_VALU,736000.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,18,17,16384,256,32768,0,28,148,32,matmul_kernel,8793134130417053,8793134130487893,7,SQ_WAVES,256.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,GRBM_GUI_ACTIVE,78725.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_FLAT,3072.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_MISC,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_SCA,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_VALU,9216.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,19,18,65536,256,0,0,12,4,32,add_kernel,8793134130529174,8793134130531454,8,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,GRBM_GUI_ACTIVE,87899.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_FLAT,6144.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_MISC,10240.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_SCA,36864.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_VALU,141312.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,20,19,131072,256,16,0,12,4,32,rmsnorm_kernel,8793134130660454,8793134130663134,3,SQ_WAVES,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,GRBM_GUI_ACTIVE,335923.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_MISC,102400.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,21,20,65536,256,32768,0,28,148,32,matmul_kernel,8793134130749375,8793134130772375,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,GRBM_GUI_ACTIVE,336891.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_FLAT,69632.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_MISC,102400.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_SCA,198656.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_VALU,904192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,22,21,65536,256,32768,0,28,148,32,matmul_kernel,8793134130841455,8793134130863935,4,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,GRBM_GUI_ACTIVE,98415.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_FLAT,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_SCA,28672.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_VALU,458752.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,23,22,262144,256,0,0,32,0,32,silu_kernel,8793134130911016,8793134130914976,5,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,GRBM_GUI_ACTIVE,106463.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_FLAT,12288.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_MISC,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_SCA,8192.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_VALU,36864.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,24,23,262144,256,0,0,12,4,32,mul_kernel,8793134130970536,8793134130975176,6,SQ_WAVES,4096.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,GRBM_GUI_ACTIVE,935064.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_FLAT,66560.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_MISC,99328.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_SCA,178688.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_VALU,736000.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,25,24,16384,256,32768,0,28,148,32,matmul_kernel,8793134131047657,8793134131117937,7,SQ_WAVES,256.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,GRBM_GUI_ACTIVE,78305.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_FLAT,3072.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_MISC,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_SCA,2048.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_VALU,9216.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,26,25,65536,256,0,0,12,4,32,add_kernel,8793134131156857,8793134131159137,8,SQ_WAVES,1024.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,GRBM_GUI_ACTIVE,81370.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_FLAT,2.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_MISC,41.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_SCA,97.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_VALU,22.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,27,26,256,256,0,0,12,4,48,__amd_rocclr_fillBufferAligned,8793134131938982,8793134131941182,9,SQ_WAVES,4.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,GRBM_GUI_ACTIVE,175824.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_FLAT,1170.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_MISC,27439.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_SCA,65253.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_VALU,56466.0 
+4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,28,27,32768,512,2049,0,32,0,80,"void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#2}>, unsigned int, float, 4, 4>)",8793134168726439,8793134168736639,10,SQ_WAVES,512.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,GRBM_GUI_ACTIVE,126114.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_FLAT,8.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_MISC,44.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_SCA,118.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_VALU,57.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_ACTIVE_INST_VMEM,0.0 +4,00020c1c-8508-7508-8d07-3a98d0a74f41,29,28,512,512,0,0,16,0,32,__amd_rocclr_copyBuffer,8793134169497003,8793134169499843,11,SQ_WAVES,8.0 diff --git a/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/sysinfo.csv b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/sysinfo.csv 
new file mode 100644 index 00000000000..ad5d8cf486f --- /dev/null +++ b/projects/rocprofiler-compute/tests/workloads/triton_trace/MI300A/sysinfo.csv @@ -0,0 +1,2 @@ +workload_path,command,ip_blocks,timestamp,version,hostname,cpu_model,sbios,linux_distro,linux_kernel_version,amd_gpu_kernel_version,cpu_memory,gpu_memory,rocm_version,vbios,compute_partition,memory_partition,gpu_series,gpu_model,gpu_arch,gpu_chip_id,gpu_l1,gpu_l2,cu_per_gpu,simd_per_cu,se_per_gpu,sa_per_se,wave_size,workgroup_max_size,max_waves_per_cu,max_sclk,max_mclk,cur_sclk,cur_mclk,l2_banks,total_l2_chan,lds_banks_per_cu,sqc_per_gpu,pipes_per_gpu,num_xcd,num_hbm_channels +/app/projects/rocprofiler-compute/workloads/triton_trace/MI300A,python3 /app/projects/rocprofiler-compute/src/utils/inject_roctx/launch.py --frameworks triton -- ./tests/triton_ffn.py,SQ|LDS|SQC|TA|TD|TCP|TCC|SPI|CPC|CPF,Fri Jun 26 11:06:10 2026 (UTC),3,0fdd2f08d7ba,AMD Instinct MI300A Accelerator,"American Megatrends International, LLC.RMP1007AS",Ubuntu 22.04.5 LTS,5.18.2-mi300-build-140423-ubuntu-22.04+,6.18.4,526647832,98746368,7.12.0,,SPX,NPS1,MI300,MI300A_A1,gfx942,29856,32,4096,228,4,24,1,64,1024,32,2100,1300,2100,1300,16,96,32,120,4,6,128