Open
Description
Describe the performance issue
For small workloads, its performance isn't as good as single_task on native-cpu target due to overhead of sycl runtime api calls.
Since SYCL host_task is managed and scheduled by sycl runtime, the performance issue impacts all devices.
To Reproduce
- test.cpp
#include <sycl/sycl.hpp>
#include <chrono>
#include <iostream>
#include <vector>
sycl::event foo_single_task(sycl::queue &q, sycl::buffer<float> &r, size_t n) {
return q.submit([&](sycl::handler &cgh) {
sycl::accessor acc{r, cgh, sycl::write_only, sycl::no_init};
cgh.single_task([=]() {
for (size_t i = 0; i < n; i++) {
acc[i] = i;
}
});
});
}
sycl::event foo_host_task(sycl::queue &q, sycl::buffer<float> &r, size_t n) {
return q.submit([&](sycl::handler &cgh) {
sycl::accessor acc{r, cgh, sycl::write_only_host_task, sycl::no_init};
cgh.host_task([=]() {
for (size_t i = 0; i < n; i++) {
acc[i] = i;
}
});
});
}
int main() {
sycl::queue q;
std::cout << "Running on " << q.get_device().get_info<sycl::info::device::name>() << std::endl;
int n_run = 10000;
size_t n = 1000000;
sycl::event event;
std::cout << "n = " << n << std::endl;
{
auto start = std::chrono::steady_clock::now();
sycl::buffer<float> r(n);
for (int i = 0; i < n_run; i++) {
event = foo_single_task(q, r, n);
event.wait();
}
auto end = std::chrono::steady_clock::now();
std::cout << "single_task " << (std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() * 1e-06) << "ms" << std::endl;
}
{
auto start = std::chrono::steady_clock::now();
sycl::buffer<float> r(n);
for (int i = 0; i < n_run; i++) {
event = foo_host_task(q, r, n);
event.wait();
}
auto end = std::chrono::steady_clock::now();
std::cout << "host_task " << (std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() * 1e-06) << "ms" << std::endl;
}
return 0;
}
- Compile command
clang++ -fsycl -fsycl-targets=native_cpu -O2 test.cpp
- Launch the program
ONEAPI_DEVICE_SELECTOR=native_cpu:cpu ./a.out
- Output shows host_task is ~7 times slower than single_task
Running on SYCL Native CPU
n = 1000000
single_task 6148.84ms
host_task 42818.1ms
Environment (please complete the following information):
- OS: RHEL9.0
- Target device and vendor: native_cpu
- DPC++ version: clang version 18.0.0 (https://github.com/intel/llvm.git d17a3f1)
Additional context
Assembly of host_task function shows there are calls to sycl runtime api inside the loop:
0000000000405f70 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data>:
405f70: 41 57 push %r15
405f72: 41 56 push %r14
405f74: 41 54 push %r12
405f76: 53 push %rbx
405f77: 50 push %rax
405f78: 4c 8b 37 mov (%rdi),%r14
405f7b: 49 83 3e 00 cmpq $0x0,(%r14)
405f7f: 74 72 je 405ff3 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x83>
405f81: 49 8d 5e 08 lea 0x8(%r14),%rbx
405f85: 45 31 ff xor %r15d,%r15d
405f88: eb 47 jmp 405fd1 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x61>
405f8a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
405f90: 0f 57 c0 xorps %xmm0,%xmm0
405f93: f3 49 0f 2a c7 cvtsi2ss %r15,%xmm0
405f98: f3 0f 11 44 24 04 movss %xmm0,0x4(%rsp)
405f9e: 48 89 df mov %rbx,%rdi
405fa1: e8 0a d4 ff ff callq 4033b0 <_ZNK4sycl3_V16detail16AccessorBaseHost14getMemoryRangeEv@plt>
405fa6: 48 89 df mov %rbx,%rdi
405fa9: e8 82 d3 ff ff callq 403330 <_ZNK4sycl3_V16detail16AccessorBaseHost9getOffsetEv@plt>
405fae: 4c 8b 20 mov (%rax),%r12
405fb1: 48 89 df mov %rbx,%rdi
405fb4: e8 37 d1 ff ff callq 4030f0 <_ZNK4sycl3_V16detail16AccessorBaseHost6getPtrEv@plt>
405fb9: 4a 8d 04 a0 lea (%rax,%r12,4),%rax
405fbd: f3 0f 10 44 24 04 movss 0x4(%rsp),%xmm0
405fc3: f3 42 0f 11 04 b8 movss %xmm0,(%rax,%r15,4)
405fc9: 49 ff c7 inc %r15
405fcc: 4d 3b 3e cmp (%r14),%r15
405fcf: 73 22 jae 405ff3 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x83>
405fd1: 4d 85 ff test %r15,%r15
405fd4: 79 ba jns 405f90 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x20>
405fd6: 4c 89 f8 mov %r15,%rax
405fd9: 48 d1 e8 shr %rax
405fdc: 44 89 f9 mov %r15d,%ecx
405fdf: 83 e1 01 and $0x1,%ecx
405fe2: 48 09 c1 or %rax,%rcx
405fe5: 0f 57 c0 xorps %xmm0,%xmm0
405fe8: f3 48 0f 2a c1 cvtsi2ss %rcx,%xmm0
405fed: f3 0f 58 c0 addss %xmm0,%xmm0
405ff1: eb a5 jmp 405f98 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x28>
405ff3: 48 83 c4 08 add $0x8,%rsp
405ff7: 5b pop %rbx
405ff8: 41 5c pop %r12
405ffa: 41 5e pop %r14
405ffc: 41 5f pop %r15
405ffe: c3 retq
405fff: 90 nop
On the other hand, single_task function on native-cpu device only contains a simple loop:
0000000000408850 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU>:
408850: 48 8b 02 mov (%rdx),%rax
408853: 48 8d 04 86 lea (%rsi,%rax,4),%rax
408857: 31 c9 xor %ecx,%ecx
408859: eb 18 jmp 408873 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU+0x23>
40885b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
408860: 0f 57 c0 xorps %xmm0,%xmm0
408863: f3 48 0f 2a c1 cvtsi2ss %rcx,%xmm0
408868: f3 0f 11 00 movss %xmm0,(%rax)
40886c: 48 ff c1 inc %rcx
40886f: 48 83 c0 04 add $0x4,%rax
408873: 48 39 f9 cmp %rdi,%rcx
408876: 73 21 jae 408899 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU+0x49>
408878: 48 85 c9 test %rcx,%rcx
40887b: 79 e3 jns 408860 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU+0x10>
40887d: 48 89 ca mov %rcx,%rdx
408880: 48 d1 ea shr %rdx
408883: 89 ce mov %ecx,%esi
408885: 83 e6 01 and $0x1,%esi
408888: 48 09 d6 or %rdx,%rsi
40888b: 0f 57 c0 xorps %xmm0,%xmm0
40888e: f3 48 0f 2a c6 cvtsi2ss %rsi,%xmm0
408893: f3 0f 58 c0 addss %xmm0,%xmm0
408897: eb cf jmp 408868 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU+0x18>
408899: c3 retq
40889a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)