We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
There is about a 1 µs difference, as noticed in CUDA.jl.
Without `@timeit_debug to function cufunction(...)`:
julia> code_llvm(cufunction, Tuple{typeof(identity), Type{Tuple{Nothing}}}) ; @ /home/tim/Julia/pkg/CUDA/src/compiler/execution.jl:281 within `cufunction' define void @julia_cufunction_5927({ {}*, {}*, { i64, {}* } }* noalias nocapture sret %0, [3 x {}*]* noalias nocapture %1) { top: %gcframe2 = alloca [5 x {}*], align 16 %gcframe2.sub = getelementptr inbounds [5 x {}*], [5 x {}*]* %gcframe2, i64 0, i64 0 %2 = bitcast [5 x {}*]* %gcframe2 to i8* call void @llvm.memset.p0i8.i32(i8* nonnull align 16 dereferenceable(40) %2, i8 0, i32 40, i1 false) %3 = getelementptr inbounds [5 x {}*], [5 x {}*]* %gcframe2, i64 0, i64 2 %4 = bitcast {}** %3 to [3 x {}*]* %5 = alloca { {}*, {}*, { i64, {}* } }, align 16 %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #5 %ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -32768 ; @ /home/tim/Julia/pkg/CUDA/src/compiler/execution.jl:282 within `cufunction' %6 = bitcast [5 x {}*]* %gcframe2 to i64* store i64 12, i64* %6, align 16 %7 = getelementptr inbounds [5 x {}*], [5 x {}*]* %gcframe2, i64 0, i64 1 %8 = bitcast i8* %ptls_i8 to i64* %9 = load i64, i64* %8, align 8 %10 = bitcast {}** %7 to i64* store i64 %9, i64* %10, align 8 %11 = bitcast i8* %ptls_i8 to {}*** store {}** %gcframe2.sub, {}*** %11, align 8 call void @"j_#cufunction#401_5929"({ {}*, {}*, { i64, {}* } }* noalias nocapture nonnull sret %5, [3 x {}*]* noalias nocapture nonnull %4) %12 = bitcast { {}*, {}*, { i64, {}* } }* %5 to <2 x {}*>* %13 = load <2 x {}*>, <2 x {}*>* %12, align 16 %14 = getelementptr inbounds { {}*, {}*, { i64, {}* } }, { {}*, {}*, { i64, {}* } }* %5, i64 0, i32 2, i32 1 %15 = bitcast {}** %14 to i64* %16 = load i64, i64* %15, align 8 %17 = bitcast [3 x {}*]* %1 to <2 x {}*>* store <2 x {}*> %13, <2 x {}*>* %17, align 8 %18 = getelementptr inbounds [3 x {}*], [3 x {}*]* %1, i64 0, i64 2 %19 = bitcast {}** %18 to i64* store i64 %16, i64* %19, align 8 %20 = bitcast { {}*, {}*, { i64, {}* } }* %0 to i8* %21 = bitcast { {}*, {}*, { i64, {}* } }* %5 to i8* 
call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(32) %20, i8* nonnull align 16 dereferenceable(32) %21, i64 32, i1 false) %22 = load i64, i64* %10, align 8 store i64 %22, i64* %8, align 8 ret void }
Adding the timing macro:
julia> code_llvm(cufunction, Tuple{typeof(identity), Type{Tuple{Nothing}}}) ; @ /home/tim/Julia/pkg/TimerOutputs/src/TimerOutput.jl:246 within `cufunction' define void @julia_cufunction_5924({ {}*, {}*, { i64, {}* } }* noalias nocapture sret %0, [3 x {}*]* noalias nocapture %1) { top: %gcframe2 = alloca [8 x {}*], align 16 %gcframe2.sub = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 0 %2 = bitcast [8 x {}*]* %gcframe2 to i8* call void @llvm.memset.p0i8.i32(i8* nonnull align 16 dereferenceable(64) %2, i8 0, i32 64, i1 false) %3 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 2 %4 = bitcast {}** %3 to { { {}* }, {}*, {}*, {}*, {}*, {}* }* %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #5 %ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -32768 ; ┌ @ /home/tim/Julia/pkg/TimerOutputs/src/TimerOutput.jl:246 within `#cufunction#398' ; │┌ @ /home/tim/Julia/depot/packages/FastClosures/3LNoZ/src/FastClosures.jl:52 within `macro expansion' %5 = bitcast [8 x {}*]* %gcframe2 to i64* store i64 24, i64* %5, align 16 %6 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 1 %7 = bitcast i8* %ptls_i8 to i64* %8 = load i64, i64* %7, align 8 %9 = bitcast {}** %6 to i64* store i64 %8, i64* %9, align 8 %10 = bitcast i8* %ptls_i8 to {}*** store {}** %gcframe2.sub, {}*** %10, align 8 %.unpack3 = load i64, i64* inttoptr (i64 139664990837568 to i64*), align 64 ; └└ ; ┌ @ /home/tim/Julia/pkg/TimerOutputs/src/TimerOutput.jl:226 within `#cufunction#398' %11 = bitcast {}** %3 to i64* store i64 %.unpack3, i64* %11, align 16 %12 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 3 store {}* inttoptr (i64 139665057363584 to {}*), {}** %12, align 8 %13 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 4 store {}* inttoptr (i64 139664987851056 to {}*), {}** %13, align 16 %14 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 5 store {}* inttoptr (i64 139665054897152 to {}*), {}** 
%14, align 8 %15 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 6 store {}* inttoptr (i64 139665057363584 to {}*), {}** %15, align 16 %16 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2, i64 0, i64 7 store {}* inttoptr (i64 139660335218704 to {}*), {}** %16, align 8 %17 = call nonnull {}* @"j_#399_5926"({ { {}* }, {}*, {}*, {}*, {}*, {}* }* nocapture readonly %4) ; └ %18 = bitcast {}* %17 to { {}*, {}*, { i64, {}* } }* %19 = bitcast {}* %17 to i64* %20 = load i64, i64* %19, align 8 %21 = getelementptr inbounds { {}*, {}*, { i64, {}* } }, { {}*, {}*, { i64, {}* } }* %18, i64 0, i32 1 %22 = bitcast {}** %21 to i64* %23 = load i64, i64* %22, align 8 %24 = getelementptr inbounds { {}*, {}*, { i64, {}* } }, { {}*, {}*, { i64, {}* } }* %18, i64 0, i32 2, i32 1 %25 = bitcast {}** %24 to i64* %26 = load i64, i64* %25, align 8 %27 = bitcast [3 x {}*]* %1 to i64* store i64 %20, i64* %27, align 8 %28 = getelementptr inbounds [3 x {}*], [3 x {}*]* %1, i64 0, i64 1 %29 = bitcast {}** %28 to i64* store i64 %23, i64* %29, align 8 %30 = getelementptr inbounds [3 x {}*], [3 x {}*]* %1, i64 0, i64 2 %31 = bitcast {}** %30 to i64* store i64 %26, i64* %31, align 8 %32 = bitcast { {}*, {}*, { i64, {}* } }* %0 to i8* %33 = bitcast {}* %17 to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(32) %32, i8* nonnull align 1 dereferenceable(32) %33, i64 32, i1 false) %34 = load i64, i64* %9, align 8 store i64 %34, i64* %7, align 8 ret void }
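So there's a clear difference in IR: with the macro, the GC frame grows from 5 to 8 slots (`[5 x {}*]` vs. `[8 x {}*]`), and instead of calling `#cufunction#401` directly with a stack-allocated `sret` slot, the function calls a closure (`j_#399`) that returns its result as a `{}*` pointer, which is then loaded from and copied out.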
The text was updated successfully, but these errors were encountered:
No branches or pull requests
About 1us difference, as noticed in CUDA.jl
Without `@timeit_debug to function cufunction(...)`:
Adding the timing macro:
So there's a clear difference in IR...
The text was updated successfully, but these errors were encountered: