Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

timeit_debug is not zero-cost #120

Open
maleadt opened this issue May 18, 2021 · 0 comments
Open

timeit_debug is not zero-cost #120

maleadt opened this issue May 18, 2021 · 0 comments

Comments

@maleadt
Copy link
Contributor

maleadt commented May 18, 2021

There is about a 1 µs difference per call, as noticed in CUDA.jl.

Without @timeit_debug applied to function cufunction(...):

julia> code_llvm(cufunction, Tuple{typeof(identity), Type{Tuple{Nothing}}})
;  @ /home/tim/Julia/pkg/CUDA/src/compiler/execution.jl:281 within `cufunction'
define void @julia_cufunction_5927({ {}*, {}*, { i64, {}* } }* noalias nocapture sret %0, [3 x {}*]* noalias nocapture %1) {
top:
  %gcframe2 = alloca [5 x {}*], align 16
  %gcframe2.sub = getelementptr inbounds [5 x {}*], [5 x {}*]* %gcframe2, i64 0, i64 0
  %2 = bitcast [5 x {}*]* %gcframe2 to i8*
  call void @llvm.memset.p0i8.i32(i8* nonnull align 16 dereferenceable(40) %2, i8 0, i32 40, i1 false)
  %3 = getelementptr inbounds [5 x {}*], [5 x {}*]* %gcframe2, i64 0, i64 2
  %4 = bitcast {}** %3 to [3 x {}*]*
  %5 = alloca { {}*, {}*, { i64, {}* } }, align 16
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #5
  %ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -32768
;  @ /home/tim/Julia/pkg/CUDA/src/compiler/execution.jl:282 within `cufunction'
  %6 = bitcast [5 x {}*]* %gcframe2 to i64*
  store i64 12, i64* %6, align 16
  %7 = getelementptr inbounds [5 x {}*], [5 x {}*]* %gcframe2, i64 0, i64 1
  %8 = bitcast i8* %ptls_i8 to i64*
  %9 = load i64, i64* %8, align 8
  %10 = bitcast {}** %7 to i64*
  store i64 %9, i64* %10, align 8
  %11 = bitcast i8* %ptls_i8 to {}***
  store {}** %gcframe2.sub, {}*** %11, align 8
  call void @"j_#cufunction#401_5929"({ {}*, {}*, { i64, {}* } }* noalias nocapture nonnull sret %5, [3 x {}*]* noalias nocapture nonnull %4)
  %12 = bitcast { {}*, {}*, { i64, {}* } }* %5 to <2 x {}*>*
  %13 = load <2 x {}*>, <2 x {}*>* %12, align 16
  %14 = getelementptr inbounds { {}*, {}*, { i64, {}* } }, { {}*, {}*, { i64, {}* } }* %5, i64 0, i32 2, i32 1
  %15 = bitcast {}** %14 to i64*
  %16 = load i64, i64* %15, align 8
  %17 = bitcast [3 x {}*]* %1 to <2 x {}*>*
  store <2 x {}*> %13, <2 x {}*>* %17, align 8
  %18 = getelementptr inbounds [3 x {}*], [3 x {}*]* %1, i64 0, i64 2
  %19 = bitcast {}** %18 to i64*
  store i64 %16, i64* %19, align 8
  %20 = bitcast { {}*, {}*, { i64, {}* } }* %0 to i8*
  %21 = bitcast { {}*, {}*, { i64, {}* } }* %5 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(32) %20, i8* nonnull align 16 dereferenceable(32) %21, i64 32, i1 false)
  %22 = load i64, i64* %10, align 8
  store i64 %22, i64* %8, align 8
  ret void
}

With the @timeit_debug macro added:

julia> code_llvm(cufunction, Tuple{typeof(identity), Type{Tuple{Nothing}}})
;  @ /home/tim/Julia/pkg/TimerOutputs/src/TimerOutput.jl:246 within `cufunction'
define void @julia_cufunction_5924({ {}*, {}*, { i64, {}* } }* noalias nocapture sret %0, [3 x {}*]* noalias nocapture %1) {
top:
  %gcframe2 = alloca [8 x {}*], align 16
  %gcframe2.sub = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 0
  %2 = bitcast [8 x {}*]* %gcframe2 to i8*
  call void @llvm.memset.p0i8.i32(i8* nonnull align 16 dereferenceable(64) %2, i8 0, i32 64, i1 false)
  %3 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 2
  %4 = bitcast {}** %3 to { { {}* }, {}*, {}*, {}*, {}*, {}* }*
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #5
  %ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -32768
; ┌ @ /home/tim/Julia/pkg/TimerOutputs/src/TimerOutput.jl:246 within `#cufunction#398'
; │┌ @ /home/tim/Julia/depot/packages/FastClosures/3LNoZ/src/FastClosures.jl:52 within `macro expansion'
    %5 = bitcast [8 x {}*]* %gcframe2 to i64*
    store i64 24, i64* %5, align 16
    %6 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 1
    %7 = bitcast i8* %ptls_i8 to i64*
    %8 = load i64, i64* %7, align 8
    %9 = bitcast {}** %6 to i64*
    store i64 %8, i64* %9, align 8
    %10 = bitcast i8* %ptls_i8 to {}***
    store {}** %gcframe2.sub, {}*** %10, align 8
    %.unpack3 = load i64, i64* inttoptr (i64 139664990837568 to i64*), align 64
; └└
; ┌ @ /home/tim/Julia/pkg/TimerOutputs/src/TimerOutput.jl:226 within `#cufunction#398'
   %11 = bitcast {}** %3 to i64*
   store i64 %.unpack3, i64* %11, align 16
   %12 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 3
   store {}* inttoptr (i64 139665057363584 to {}*), {}** %12, align 8
   %13 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 4
   store {}* inttoptr (i64 139664987851056 to {}*), {}** %13, align 16
   %14 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 5
   store {}* inttoptr (i64 139665054897152 to {}*), {}** %14, align 8
   %15 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 6
   store {}* inttoptr (i64 139665057363584 to {}*), {}** %15, align 16
   %16 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 7
   store {}* inttoptr (i64 139660335218704 to {}*), {}** %16, align 8
   %17 = call nonnull {}* @"j_#399_5926"({ { {}* }, {}*, {}*, {}*, {}*, {}* }* nocapture readonly %4)
; └
  %18 = bitcast {}* %17 to { {}*, {}*, { i64, {}* } }*
  %19 = bitcast {}* %17 to i64*
  %20 = load i64, i64* %19, align 8
  %21 = getelementptr inbounds { {}*, {}*, { i64, {}* } }, { {}*, {}*, { i64, {}* } }* %18, i64 0, i32 1
  %22 = bitcast {}** %21 to i64*
  %23 = load i64, i64* %22, align 8
  %24 = getelementptr inbounds { {}*, {}*, { i64, {}* } }, { {}*, {}*, { i64, {}* } }* %18, i64 0, i32 2, i32 1
  %25 = bitcast {}** %24 to i64*
  %26 = load i64, i64* %25, align 8
  %27 = bitcast [3 x {}*]* %1 to i64*
  store i64 %20, i64* %27, align 8
  %28 = getelementptr inbounds [3 x {}*], [3 x {}*]* %1, i64 0, i64 1
  %29 = bitcast {}** %28 to i64*
  store i64 %23, i64* %29, align 8
  %30 = getelementptr inbounds [3 x {}*], [3 x {}*]* %1, i64 0, i64 2
  %31 = bitcast {}** %30 to i64*
  store i64 %26, i64* %31, align 8
  %32 = bitcast { {}*, {}*, { i64, {}* } }* %0 to i8*
  %33 = bitcast {}* %17 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(32) %32, i8* nonnull align 1 dereferenceable(32) %33, i64 32, i1 false)
  %34 = load i64, i64* %9, align 8
  store i64 %34, i64* %7, align 8
  ret void
}

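So there's a clear difference in the IR: with the macro, the GC frame allocation grows from 5 to 8 pointer-sized slots, and the body is invoked through a closure (j_#399) that takes a 6-field capture struct and returns its result by pointer, instead of the direct call to #cufunction#401 that writes into an sret slot.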

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant