Skip to content

[SYCL][Performance] host_task's performance isn't good on a small workload #11593

Open
@wenju-he

Description

@wenju-he

Describe the performance issue
For small workloads, its performance isn't as good as single_task on native-cpu target due to overhead of sycl runtime api calls.
Since SYCL host_task is managed and scheduled by sycl runtime, the performance issue impacts all devices.

To Reproduce

  1. test.cpp
#include <sycl/sycl.hpp>
#include <chrono>
#include <iostream>
#include <vector>

sycl::event foo_single_task(sycl::queue &q, sycl::buffer<float> &r, size_t n) {
  return q.submit([&](sycl::handler &cgh) {
    sycl::accessor acc{r, cgh, sycl::write_only, sycl::no_init};
    cgh.single_task([=]() {
      for (size_t i = 0; i < n; i++) {
        acc[i] = i;
      }
    });
  });
}

sycl::event foo_host_task(sycl::queue &q, sycl::buffer<float> &r, size_t n) {
  return q.submit([&](sycl::handler &cgh) {
    sycl::accessor acc{r, cgh, sycl::write_only_host_task, sycl::no_init};
    cgh.host_task([=]() {
      for (size_t i = 0; i < n; i++) {
        acc[i] = i;
      }
    });
  });
}

int main() {
  sycl::queue q;
  std::cout << "Running on " << q.get_device().get_info<sycl::info::device::name>() << std::endl;

  int n_run = 10000;
  size_t n = 1000000;
  sycl::event event;

  std::cout << "n = " << n << std::endl;
  {
    auto start = std::chrono::steady_clock::now();
    sycl::buffer<float> r(n);
    for (int i = 0; i < n_run; i++) {
      event = foo_single_task(q, r, n);
      event.wait();
    }
    auto end = std::chrono::steady_clock::now();
    std::cout << "single_task " << (std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() * 1e-06) << "ms" << std::endl;
  }
  {
    auto start = std::chrono::steady_clock::now();
    sycl::buffer<float> r(n);
    for (int i = 0; i < n_run; i++) {
      event = foo_host_task(q, r, n);
      event.wait();
    }
    auto end = std::chrono::steady_clock::now();
    std::cout << "host_task " << (std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() * 1e-06) << "ms" << std::endl;
  }

  return 0;
}
  1. Compile command
clang++ -fsycl -fsycl-targets=native_cpu -O2 test.cpp
  1. Launch the program
ONEAPI_DEVICE_SELECTOR=native_cpu:cpu ./a.out
  1. Output shows host_task is ~7 times slower than single_task
Running on SYCL Native CPU
n = 1000000
single_task 6148.84ms
host_task 42818.1ms

Environment (please complete the following information):

Additional context
Assembly of host_task function shows there are calls to sycl runtime api inside the loop:

0000000000405f70 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data>:
  405f70:       41 57                   push   %r15
  405f72:       41 56                   push   %r14
  405f74:       41 54                   push   %r12
  405f76:       53                      push   %rbx
  405f77:       50                      push   %rax
  405f78:       4c 8b 37                mov    (%rdi),%r14
  405f7b:       49 83 3e 00             cmpq   $0x0,(%r14)
  405f7f:       74 72                   je     405ff3 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x83>
  405f81:       49 8d 5e 08             lea    0x8(%r14),%rbx
  405f85:       45 31 ff                xor    %r15d,%r15d
  405f88:       eb 47                   jmp    405fd1 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x61>
  405f8a:       66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
  405f90:       0f 57 c0                xorps  %xmm0,%xmm0
  405f93:       f3 49 0f 2a c7          cvtsi2ss %r15,%xmm0
  405f98:       f3 0f 11 44 24 04       movss  %xmm0,0x4(%rsp)
  405f9e:       48 89 df                mov    %rbx,%rdi
  405fa1:       e8 0a d4 ff ff          callq  4033b0 <_ZNK4sycl3_V16detail16AccessorBaseHost14getMemoryRangeEv@plt>
  405fa6:       48 89 df                mov    %rbx,%rdi
  405fa9:       e8 82 d3 ff ff          callq  403330 <_ZNK4sycl3_V16detail16AccessorBaseHost9getOffsetEv@plt>
  405fae:       4c 8b 20                mov    (%rax),%r12
  405fb1:       48 89 df                mov    %rbx,%rdi
  405fb4:       e8 37 d1 ff ff          callq  4030f0 <_ZNK4sycl3_V16detail16AccessorBaseHost6getPtrEv@plt>
  405fb9:       4a 8d 04 a0             lea    (%rax,%r12,4),%rax
  405fbd:       f3 0f 10 44 24 04       movss  0x4(%rsp),%xmm0
  405fc3:       f3 42 0f 11 04 b8       movss  %xmm0,(%rax,%r15,4)
  405fc9:       49 ff c7                inc    %r15
  405fcc:       4d 3b 3e                cmp    (%r14),%r15
  405fcf:       73 22                   jae    405ff3 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x83>
  405fd1:       4d 85 ff                test   %r15,%r15
  405fd4:       79 ba                   jns    405f90 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x20>
  405fd6:       4c 89 f8                mov    %r15,%rax
  405fd9:       48 d1 e8                shr    %rax
  405fdc:       44 89 f9                mov    %r15d,%ecx
  405fdf:       83 e1 01                and    $0x1,%ecx
  405fe2:       48 09 c1                or     %rax,%rcx
  405fe5:       0f 57 c0                xorps  %xmm0,%xmm0
  405fe8:       f3 48 0f 2a c1          cvtsi2ss %rcx,%xmm0
  405fed:       f3 0f 58 c0             addss  %xmm0,%xmm0
  405ff1:       eb a5                   jmp    405f98 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x28>
  405ff3:       48 83 c4 08             add    $0x8,%rsp
  405ff7:       5b                      pop    %rbx
  405ff8:       41 5c                   pop    %r12
  405ffa:       41 5e                   pop    %r14
  405ffc:       41 5f                   pop    %r15
  405ffe:       c3                      retq
  405fff:       90                      nop

On the other hand, single_task function on native-cpu device only contains a simple loop:

0000000000408850 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU>:
  408850:       48 8b 02                mov    (%rdx),%rax
  408853:       48 8d 04 86             lea    (%rsi,%rax,4),%rax
  408857:       31 c9                   xor    %ecx,%ecx
  408859:       eb 18                   jmp    408873 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU+0x23>
  40885b:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  408860:       0f 57 c0                xorps  %xmm0,%xmm0
  408863:       f3 48 0f 2a c1          cvtsi2ss %rcx,%xmm0
  408868:       f3 0f 11 00             movss  %xmm0,(%rax)
  40886c:       48 ff c1                inc    %rcx
  40886f:       48 83 c0 04             add    $0x4,%rax
  408873:       48 39 f9                cmp    %rdi,%rcx
  408876:       73 21                   jae    408899 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU+0x49>
  408878:       48 85 c9                test   %rcx,%rcx
  40887b:       79 e3                   jns    408860 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU+0x10>
  40887d:       48 89 ca                mov    %rcx,%rdx
  408880:       48 d1 ea                shr    %rdx
  408883:       89 ce                   mov    %ecx,%esi
  408885:       83 e6 01                and    $0x1,%esi
  408888:       48 09 d6                or     %rdx,%rsi
  40888b:       0f 57 c0                xorps  %xmm0,%xmm0
  40888e:       f3 48 0f 2a c6          cvtsi2ss %rsi,%xmm0
  408893:       f3 0f 58 c0             addss  %xmm0,%xmm0
  408897:       eb cf                   jmp    408868 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU+0x18>
  408899:       c3                      retq
  40889a:       66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)

vtune result:
Screenshot 2023-10-19 092853
Screenshot 2023-10-19 092742

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions