timeit_debug is not zero-cost #120

maleadt · 2021-05-18T09:22:50Z

There is about a 1 µs difference, as noticed in CUDA.jl.

Without @timeit_debug applied to function cufunction(...):

julia> code_llvm(cufunction, Tuple{typeof(identity), Type{Tuple{Nothing}}})
;  @ /home/tim/Julia/pkg/CUDA/src/compiler/execution.jl:281 within `cufunction'
define void @julia_cufunction_5927({ {}*, {}*, { i64, {}* } }* noalias nocapture sret %0, [3 x {}*]* noalias nocapture %1) {
top:
  %gcframe2 = alloca [5 x {}*], align 16
  %gcframe2.sub = getelementptr inbounds [5 x {}*], [5 x {}*]* %gcframe2, i64 0, i64 0
  %2 = bitcast [5 x {}*]* %gcframe2 to i8*
  call void @llvm.memset.p0i8.i32(i8* nonnull align 16 dereferenceable(40) %2, i8 0, i32 40, i1 false)
  %3 = getelementptr inbounds [5 x {}*], [5 x {}*]* %gcframe2, i64 0, i64 2
  %4 = bitcast {}** %3 to [3 x {}*]*
  %5 = alloca { {}*, {}*, { i64, {}* } }, align 16
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #5
  %ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -32768
;  @ /home/tim/Julia/pkg/CUDA/src/compiler/execution.jl:282 within `cufunction'
  %6 = bitcast [5 x {}*]* %gcframe2 to i64*
  store i64 12, i64* %6, align 16
  %7 = getelementptr inbounds [5 x {}*], [5 x {}*]* %gcframe2, i64 0, i64 1
  %8 = bitcast i8* %ptls_i8 to i64*
  %9 = load i64, i64* %8, align 8
  %10 = bitcast {}** %7 to i64*
  store i64 %9, i64* %10, align 8
  %11 = bitcast i8* %ptls_i8 to {}***
  store {}** %gcframe2.sub, {}*** %11, align 8
  call void @"j_#cufunction#401_5929"({ {}*, {}*, { i64, {}* } }* noalias nocapture nonnull sret %5, [3 x {}*]* noalias nocapture nonnull %4)
  %12 = bitcast { {}*, {}*, { i64, {}* } }* %5 to <2 x {}*>*
  %13 = load <2 x {}*>, <2 x {}*>* %12, align 16
  %14 = getelementptr inbounds { {}*, {}*, { i64, {}* } }, { {}*, {}*, { i64, {}* } }* %5, i64 0, i32 2, i32 1
  %15 = bitcast {}** %14 to i64*
  %16 = load i64, i64* %15, align 8
  %17 = bitcast [3 x {}*]* %1 to <2 x {}*>*
  store <2 x {}*> %13, <2 x {}*>* %17, align 8
  %18 = getelementptr inbounds [3 x {}*], [3 x {}*]* %1, i64 0, i64 2
  %19 = bitcast {}** %18 to i64*
  store i64 %16, i64* %19, align 8
  %20 = bitcast { {}*, {}*, { i64, {}* } }* %0 to i8*
  %21 = bitcast { {}*, {}*, { i64, {}* } }* %5 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(32) %20, i8* nonnull align 16 dereferenceable(32) %21, i64 32, i1 false)
  %22 = load i64, i64* %10, align 8
  store i64 %22, i64* %8, align 8
  ret void
}

Adding the timing macro:

julia> code_llvm(cufunction, Tuple{typeof(identity), Type{Tuple{Nothing}}})
;  @ /home/tim/Julia/pkg/TimerOutputs/src/TimerOutput.jl:246 within `cufunction'
define void @julia_cufunction_5924({ {}*, {}*, { i64, {}* } }* noalias nocapture sret %0, [3 x {}*]* noalias nocapture %1) {
top:
  %gcframe2 = alloca [8 x {}*], align 16
  %gcframe2.sub = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 0
  %2 = bitcast [8 x {}*]* %gcframe2 to i8*
  call void @llvm.memset.p0i8.i32(i8* nonnull align 16 dereferenceable(64) %2, i8 0, i32 64, i1 false)
  %3 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 2
  %4 = bitcast {}** %3 to { { {}* }, {}*, {}*, {}*, {}*, {}* }*
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #5
  %ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -32768
; ┌ @ /home/tim/Julia/pkg/TimerOutputs/src/TimerOutput.jl:246 within `#cufunction#398'
; │┌ @ /home/tim/Julia/depot/packages/FastClosures/3LNoZ/src/FastClosures.jl:52 within `macro expansion'
    %5 = bitcast [8 x {}*]* %gcframe2 to i64*
    store i64 24, i64* %5, align 16
    %6 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 1
    %7 = bitcast i8* %ptls_i8 to i64*
    %8 = load i64, i64* %7, align 8
    %9 = bitcast {}** %6 to i64*
    store i64 %8, i64* %9, align 8
    %10 = bitcast i8* %ptls_i8 to {}***
    store {}** %gcframe2.sub, {}*** %10, align 8
    %.unpack3 = load i64, i64* inttoptr (i64 139664990837568 to i64*), align 64
; └└
; ┌ @ /home/tim/Julia/pkg/TimerOutputs/src/TimerOutput.jl:226 within `#cufunction#398'
   %11 = bitcast {}** %3 to i64*
   store i64 %.unpack3, i64* %11, align 16
   %12 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 3
   store {}* inttoptr (i64 139665057363584 to {}*), {}** %12, align 8
   %13 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 4
   store {}* inttoptr (i64 139664987851056 to {}*), {}** %13, align 16
   %14 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 5
   store {}* inttoptr (i64 139665054897152 to {}*), {}** %14, align 8
   %15 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 6
   store {}* inttoptr (i64 139665057363584 to {}*), {}** %15, align 16
   %16 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 7
   store {}* inttoptr (i64 139660335218704 to {}*), {}** %16, align 8
   %17 = call nonnull {}* @"j_#399_5926"({ { {}* }, {}*, {}*, {}*, {}*, {}* }* nocapture readonly %4)
; └
  %18 = bitcast {}* %17 to { {}*, {}*, { i64, {}* } }*
  %19 = bitcast {}* %17 to i64*
  %20 = load i64, i64* %19, align 8
  %21 = getelementptr inbounds { {}*, {}*, { i64, {}* } }, { {}*, {}*, { i64, {}* } }* %18, i64 0, i32 1
  %22 = bitcast {}** %21 to i64*
  %23 = load i64, i64* %22, align 8
  %24 = getelementptr inbounds { {}*, {}*, { i64, {}* } }, { {}*, {}*, { i64, {}* } }* %18, i64 0, i32 2, i32 1
  %25 = bitcast {}** %24 to i64*
  %26 = load i64, i64* %25, align 8
  %27 = bitcast [3 x {}*]* %1 to i64*
  store i64 %20, i64* %27, align 8
  %28 = getelementptr inbounds [3 x {}*], [3 x {}*]* %1, i64 0, i64 1
  %29 = bitcast {}** %28 to i64*
  store i64 %23, i64* %29, align 8
  %30 = getelementptr inbounds [3 x {}*], [3 x {}*]* %1, i64 0, i64 2
  %31 = bitcast {}** %30 to i64*
  store i64 %26, i64* %31, align 8
  %32 = bitcast { {}*, {}*, { i64, {}* } }* %0 to i8*
  %33 = bitcast {}* %17 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(32) %32, i8* nonnull align 1 dereferenceable(32) %33, i64 32, i1 false)
  %34 = load i64, i64* %9, align 8
  store i64 %34, i64* %7, align 8
  ret void
}

So there's a clear difference in IR...

The text was updated successfully, but these errors were encountered:

maleadt mentioned this issue May 18, 2021

Use FastClosures to ensure no boxes are created. #121

Draft

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

timeit_debug is not zero-cost #120

timeit_debug is not zero-cost #120

maleadt commented May 18, 2021

timeit_debug is not zero-cost #120

timeit_debug is not zero-cost #120

Comments

maleadt commented May 18, 2021