Make resize! run faster #2828
Conversation
Your PR requires formatting changes to meet the project's style guidelines. The suggested changes:

diff --git a/src/array.jl b/src/array.jl
index 5d184179c..91c59e30a 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -62,7 +62,7 @@ function valid_type(@nospecialize(T))
end
@inline function check_eltype(name, T)
- if !valid_type(T)
+ return if !valid_type(T)
explanation = explain_eltype(T)
error("""
$name only supports element types that are allocated inline.
@@ -877,7 +877,7 @@ Base.unsafe_convert(::Type{CuPtr{T}}, A::PermutedDimsArray) where {T} =
## resizing
const RESIZE_THRESHOLD = 100 * 1024^2 # 100 MiB
-const RESIZE_INCREMENT = 32 * 1024^2 # 32 MiB
+const RESIZE_INCREMENT = 32 * 1024^2 # 32 MiB
"""
resize!(a::CuVector, n::Integer)
@@ -889,60 +889,60 @@ guaranteed to be initialized.
function Base.resize!(A::CuVector{T}, n::Integer) where T
n == length(A) && return A
- # only resize when the new length exceeds the capacity or is much smaller
- cap = A.maxsize ÷ aligned_sizeof(T)
- if n > cap || n < cap ÷ 4
- len = if n < cap
- # shrink to fit
- n
- elseif A.maxsize > RESIZE_THRESHOLD
- # large arrays grown by fixed increments
- max(n, cap + RESIZE_INCREMENT ÷ aligned_sizeof(T))
- else
- # small arrays are doubled in size
- max(n, 2 * length(A))
- end
+ # only resize when the new length exceeds the capacity or is much smaller
+ cap = A.maxsize ÷ aligned_sizeof(T)
+ if n > cap || n < cap ÷ 4
+ len = if n < cap
+ # shrink to fit
+ n
+ elseif A.maxsize > RESIZE_THRESHOLD
+ # large arrays grown by fixed increments
+ max(n, cap + RESIZE_INCREMENT ÷ aligned_sizeof(T))
+ else
+ # small arrays are doubled in size
+ max(n, 2 * length(A))
+ end
- # determine the new buffer size
- maxsize = len * aligned_sizeof(T)
- bufsize = if isbitstype(T)
- maxsize
- else
- # type tag array past the data
- maxsize + len
- end
+ # determine the new buffer size
+ maxsize = len * aligned_sizeof(T)
+ bufsize = if isbitstype(T)
+ maxsize
+ else
+ # type tag array past the data
+ maxsize + len
+ end
- # allocate new data
- old_data = A.data
- new_data = context!(context(A)) do
- mem = pool_alloc(memory_type(A), bufsize)
- ptr = convert(CuPtr{T}, mem)
- DataRef(pool_free, mem)
- end
+ # allocate new data
+ old_data = A.data
+ new_data = context!(context(A)) do
+ mem = pool_alloc(memory_type(A), bufsize)
+ ptr = convert(CuPtr{T}, mem)
+ DataRef(pool_free, mem)
+ end
- # replace the data with a new one. this 'unshares' the array.
- # as a result, we can safely support resizing unowned buffers.
- old_pointer = pointer(A)
- old_typetagdata = typetagdata(A)
- A.data = new_data
- A.maxsize = maxsize
- A.offset = 0
- new_pointer = pointer(A)
- new_typetagdata = typetagdata(A)
-
- # copy existing elements and type tags
+ # replace the data with a new one. this 'unshares' the array.
+ # as a result, we can safely support resizing unowned buffers.
+ old_pointer = pointer(A)
+ old_typetagdata = typetagdata(A)
+ A.data = new_data
+ A.maxsize = maxsize
+ A.offset = 0
+ new_pointer = pointer(A)
+ new_typetagdata = typetagdata(A)
+
+ # copy existing elements and type tags
m = min(length(A), n)
if m > 0
- context!(context(A)) do
- unsafe_copyto!(new_pointer, old_pointer, m; async=true)
- if Base.isbitsunion(T)
- unsafe_copyto!(new_typetagdata, old_typetagdata, m; async=true)
- end
- end
+ context!(context(A)) do
+ unsafe_copyto!(new_pointer, old_pointer, m; async = true)
+ if Base.isbitsunion(T)
+ unsafe_copyto!(new_typetagdata, old_typetagdata, m; async = true)
+ end
+ end
+ end
+ unsafe_free!(old_data)
end
- unsafe_free!(old_data)
- end
A.dims = (n,)
- return A
+ return A
end
diff --git a/test/base/array.jl b/test/base/array.jl
index 4c0ebca8d..eb641f2bc 100644
--- a/test/base/array.jl
+++ b/test/base/array.jl
@@ -550,41 +550,41 @@ end
end
@testset "resizing" begin
- for data in ([1, 2, 3], [1, nothing, 3])
- a = CuArray(data)
- initial_capacity = a.maxsize
- @test initial_capacity == sizeof(a)
-
- # resizing an array should increment the capacity
- CUDA.resize!(a, 4)
- @test length(a) == 4
- @test Array(a)[1:3] == data
- resized_capacity = a.maxsize
- @test resized_capacity > sizeof(a)
-
- # resizing again should use the existing capacity
- CUDA.resize!(a, 5)
+ for data in ([1, 2, 3], [1, nothing, 3])
+ a = CuArray(data)
+ initial_capacity = a.maxsize
+ @test initial_capacity == sizeof(a)
+
+ # resizing an array should increment the capacity
+ CUDA.resize!(a, 4)
+ @test length(a) == 4
+ @test Array(a)[1:3] == data
+ resized_capacity = a.maxsize
+ @test resized_capacity > sizeof(a)
+
+ # resizing again should use the existing capacity
+ CUDA.resize!(a, 5)
@test length(a) == 5
- @test a.maxsize == resized_capacity
-
- # resizing significantly should trigger an exact reallocation
- CUDA.resize!(a, 1000)
- @test length(a) == 1000
- @test Array(a)[1:3] == data
- resized_capacity = a.maxsize
- @test resized_capacity == sizeof(a)
-
- # shrinking back down shouldn't immediately reduce capacity
- CUDA.resize!(a, 999)
- @test length(a) == 999
- @test a.maxsize == resized_capacity
-
- # shrinking significantly should trigger an exact reallocation
- CUDA.resize!(a, 10)
- @test length(a) == 10
- @test Array(a)[1:3] == data
- @test a.maxsize == sizeof(a)
- end
+ @test a.maxsize == resized_capacity
+
+ # resizing significantly should trigger an exact reallocation
+ CUDA.resize!(a, 1000)
+ @test length(a) == 1000
+ @test Array(a)[1:3] == data
+ resized_capacity = a.maxsize
+ @test resized_capacity == sizeof(a)
+
+ # shrinking back down shouldn't immediately reduce capacity
+ CUDA.resize!(a, 999)
+ @test length(a) == 999
+ @test a.maxsize == resized_capacity
+
+ # shrinking significantly should trigger an exact reallocation
+ CUDA.resize!(a, 10)
+ @test length(a) == 10
+ @test Array(a)[1:3] == data
+ @test a.maxsize == sizeof(a)
+ end
end
@testset "aliasing" begin
@maleadt Please review. Thanks!
Can you modify resize! instead of adding a new_resize!?

The change should also make use of the maxsize property of CuArray, which is there to allow additional data to be allocated without the dimensions of the array having to match. resize! should be made aware of that, not doing anything when the needed size is smaller than maxsize.
In addition, IIUC you're using a growth factor of 2 now (ignoring resizes when shrinking by up to a half, resizing by 2 when requesting a larger array), which I think may be too aggressive for GPU arrays. For small arrays it's probably fine, but at some point (> 10MB?) we should probably use a fixed (1MB?) increment instead.
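For reference, a minimal sketch of the growth policy suggested here: reuse the existing buffer while the request fits within maxsize, double small arrays, and grow large arrays by a fixed increment. The constants and the target_capacity helper are illustrative placeholders, not the values or API the PR ultimately adopted.

```julia
# Illustrative sketch of the growth policy described above; GROW_THRESHOLD
# and GROW_INCREMENT are placeholder values, not the constants the PR
# finally settled on.
const GROW_THRESHOLD = 10 * 1024^2   # bytes: switch from doubling to fixed steps
const GROW_INCREMENT = 1 * 1024^2    # bytes: fixed growth step for large buffers

# Decide the new capacity (in elements) for a request of `n` elements,
# given the current capacity `cap` and the element size in bytes.
function target_capacity(cap::Int, n::Int, elsize::Int)
    n <= cap && return cap                            # fits: keep the buffer
    if cap * elsize > GROW_THRESHOLD
        return max(n, cap + GROW_INCREMENT ÷ elsize)  # large: fixed increment
    else
        return max(n, 2 * cap)                        # small: double
    end
end
```

Doubling keeps the amortized cost of repeated small growth constant, while the fixed increment caps how much extra GPU memory a large array can waste.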
> In addition, …

👍

👍, but why do we choose 10MB and 1MB?
Do you have any other comments?
Codecov Report

❌ Patch coverage is …

Additional details and impacted files:

@@            Coverage Diff             @@
##           master    #2828      +/-   ##
==========================================
- Coverage   89.45%   89.32%    -0.14%
==========================================
  Files         150      150
  Lines       13078    13093       +15
==========================================
- Hits        11699    11695        -4
- Misses       1379     1398       +19
Can you point me to the existing CI benchmark results, @maleadt? I could not find them. Thanks!
Please review, @maleadt. Thanks!
Sorry for the delay. I simplified the tests a little and added support for shrinking, as well as isbits-union arrays (although untested).
That was arbitrary. It would probably be useful to see what other projects do.
CUDA.jl Benchmarks
| Benchmark suite | Current: 17565a4 | Previous: 2130acf | Ratio |
|---|---|---|---|
| latency/precompile | 56738141787 ns | 56500883334.5 ns | 1.00 |
| latency/ttfp | 8429462389 ns | 8365271392 ns | 1.01 |
| latency/import | 4514032627 ns | 4507842458 ns | 1.00 |
| integration/volumerhs | 9628219 ns | 9611007.5 ns | 1.00 |
| integration/byval/slices=1 | 147198 ns | 146935 ns | 1.00 |
| integration/byval/slices=3 | 426477 ns | 425946 ns | 1.00 |
| integration/byval/reference | 145011 ns | 145067 ns | 1.00 |
| integration/byval/slices=2 | 286716 ns | 286530 ns | 1.00 |
| integration/cudadevrt | 103677 ns | 103628 ns | 1.00 |
| kernel/indexing | 14292 ns | 14200 ns | 1.01 |
| kernel/indexing_checked | 14954 ns | 15046.5 ns | 0.99 |
| kernel/occupancy | 707.3241379310344 ns | 677.6815286624204 ns | 1.04 |
| kernel/launch | 2156.1111111111113 ns | 2183.1111111111113 ns | 0.99 |
| kernel/rand | 18723 ns | 14941 ns | 1.25 |
| array/reverse/1d | 20126 ns | 20250 ns | 0.99 |
| array/reverse/2dL_inplace | 66949 ns | 67030 ns | 1.00 |
| array/reverse/1dL | 70325 ns | 70487 ns | 1.00 |
| array/reverse/2d | 22042 ns | 22100 ns | 1.00 |
| array/reverse/1d_inplace | 9673 ns | 9646 ns | 1.00 |
| array/reverse/2d_inplace | 13448 ns | 13444 ns | 1.00 |
| array/reverse/2dL | 74074 ns | 74138 ns | 1.00 |
| array/reverse/1dL_inplace | 66854 ns | 66810 ns | 1.00 |
| array/copy | 20997 ns | 20566 ns | 1.02 |
| array/iteration/findall/int | 158461 ns | 158051 ns | 1.00 |
| array/iteration/findall/bool | 140371 ns | 140105.5 ns | 1.00 |
| array/iteration/findfirst/int | 161326.5 ns | 161684.5 ns | 1.00 |
| array/iteration/findfirst/bool | 162046 ns | 162377 ns | 1.00 |
| array/iteration/scalar | 72273 ns | 73398 ns | 0.98 |
| array/iteration/logical | 217476 ns | 216289 ns | 1.01 |
| array/iteration/findmin/1d | 51081 ns | 50985 ns | 1.00 |
| array/iteration/findmin/2d | 97113 ns | 96912 ns | 1.00 |
| array/reductions/reduce/Int64/1d | 43245 ns | 43765 ns | 0.99 |
| array/reductions/reduce/Int64/dims=1 | 44805 ns | 45043 ns | 0.99 |
| array/reductions/reduce/Int64/dims=2 | 61576 ns | 61514 ns | 1.00 |
| array/reductions/reduce/Int64/dims=1L | 89140 ns | 89178.5 ns | 1.00 |
| array/reductions/reduce/Int64/dims=2L | 88283 ns | 88181 ns | 1.00 |
| array/reductions/reduce/Float32/1d | 37404 ns | 37076.5 ns | 1.01 |
| array/reductions/reduce/Float32/dims=1 | 41817 ns | 41802.5 ns | 1.00 |
| array/reductions/reduce/Float32/dims=2 | 60022 ns | 59975 ns | 1.00 |
| array/reductions/reduce/Float32/dims=1L | 52719 ns | 52588 ns | 1.00 |
| array/reductions/reduce/Float32/dims=2L | 72606 ns | 72470 ns | 1.00 |
| array/reductions/mapreduce/Int64/1d | 43496 ns | 43472 ns | 1.00 |
| array/reductions/mapreduce/Int64/dims=1 | 55559.5 ns | 44367 ns | 1.25 |
| array/reductions/mapreduce/Int64/dims=2 | 61790 ns | 61825 ns | 1.00 |
| array/reductions/mapreduce/Int64/dims=1L | 89096 ns | 89416 ns | 1.00 |
| array/reductions/mapreduce/Int64/dims=2L | 88662 ns | 88547 ns | 1.00 |
| array/reductions/mapreduce/Float32/1d | 38506 ns | 37303 ns | 1.03 |
| array/reductions/mapreduce/Float32/dims=1 | 42206.5 ns | 46174.5 ns | 0.91 |
| array/reductions/mapreduce/Float32/dims=2 | 60340 ns | 60053 ns | 1.00 |
| array/reductions/mapreduce/Float32/dims=1L | 53001 ns | 52864.5 ns | 1.00 |
| array/reductions/mapreduce/Float32/dims=2L | 72556 ns | 72451.5 ns | 1.00 |
| array/broadcast | 20671 ns | 20237.5 ns | 1.02 |
| array/copyto!/gpu_to_gpu | 11734 ns | 13049 ns | 0.90 |
| array/copyto!/cpu_to_gpu | 214209 ns | 215931 ns | 0.99 |
| array/copyto!/gpu_to_cpu | 284311 ns | 284076.5 ns | 1.00 |
| array/accumulate/Int64/1d | 124765 ns | 124697 ns | 1.00 |
| array/accumulate/Int64/dims=1 | 83540 ns | 83423 ns | 1.00 |
| array/accumulate/Int64/dims=2 | 157883 ns | 157799 ns | 1.00 |
| array/accumulate/Int64/dims=1L | 1709761 ns | 1710025 ns | 1.00 |
| array/accumulate/Int64/dims=2L | 966484 ns | 966307 ns | 1.00 |
| array/accumulate/Float32/1d | 109093 ns | 109616 ns | 1.00 |
| array/accumulate/Float32/dims=1 | 80727 ns | 80549 ns | 1.00 |
| array/accumulate/Float32/dims=2 | 147490.5 ns | 147930 ns | 1.00 |
| array/accumulate/Float32/dims=1L | 1618951.5 ns | 1618991 ns | 1.00 |
| array/accumulate/Float32/dims=2L | 698350 ns | 698663 ns | 1.00 |
| array/construct | 1258.8 ns | 1280.7 ns | 0.98 |
| array/random/randn/Float32 | 44815 ns | 44974 ns | 1.00 |
| array/random/randn!/Float32 | 24777 ns | 25380 ns | 0.98 |
| array/random/rand!/Int64 | 27309 ns | 27271 ns | 1.00 |
| array/random/rand!/Float32 | 8916.333333333334 ns | 9033.666666666666 ns | 0.99 |
| array/random/rand/Int64 | 29707 ns | 29955 ns | 0.99 |
| array/random/rand/Float32 | 13012 ns | 13512 ns | 0.96 |
| array/permutedims/4d | 60302 ns | 60442 ns | 1.00 |
| array/permutedims/2d | 54019 ns | 54237.5 ns | 1.00 |
| array/permutedims/3d | 55029.5 ns | 55099 ns | 1.00 |
| array/sorting/1d | 2757377 ns | 2757475 ns | 1.00 |
| array/sorting/by | 3368301.5 ns | 3344486 ns | 1.01 |
| array/sorting/2d | 1088432.5 ns | 1080698 ns | 1.01 |
| cuda/synchronization/stream/auto | 1065.1 ns | 1042.3 ns | 1.02 |
| cuda/synchronization/stream/nonblocking | 8403 ns | 7397.299999999999 ns | 1.14 |
| cuda/synchronization/stream/blocking | 836.468085106383 ns | 814.7333333333333 ns | 1.03 |
| cuda/synchronization/context/auto | 1202.5 ns | 1171.5 ns | 1.03 |
| cuda/synchronization/context/nonblocking | 7957.9 ns | 8941.8 ns | 0.89 |
| cuda/synchronization/context/blocking | 923.0076923076923 ns | 908.7894736842105 ns | 1.02 |
This comment was automatically generated by workflow using github-action-benchmark.
The CI failure is related.
I need the resize! function to run faster when it is called frequently. I'm not sure whether 2 is a good resize factor, but I think it is a reasonable number. I ran the benchmarks under the assumption that the resize length is uniformly distributed within a range.

Click for benchmark script

Click for benchmark result

But it is still not as fast as the CPU resize! function. Do you have any other suggestions to make it run faster (like a better resize factor)? And do we need to benchmark it under other assumptions (like the resize length growing or shrinking linearly over time)? I guess the performance will not be good in some corner cases (like when we keep expanding the GPU array to less than double its length).
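For illustration, a minimal sketch of the kind of benchmark described in this comment, with resize lengths drawn uniformly from a range. The bench_resize! helper and the specific range are hypothetical; this is not the collapsed script above.

```julia
# Hypothetical benchmark sketch: repeatedly resize! a vector to uniformly
# distributed target lengths and time the GPU path against a CPU baseline.
using CUDA, Random

function bench_resize!(a, lens)
    for n in lens
        resize!(a, n)    # grows or shrinks the vector to length n
    end
    return a
end

Random.seed!(0)
lens = rand(1:1_000_000, 10_000)   # uniformly distributed target lengths (illustrative range)

gpu = CUDA.zeros(Float32, 1)
cpu = zeros(Float32, 1)

CUDA.@time CUDA.@sync bench_resize!(gpu, lens)   # GPU resize! (this PR)
@time bench_resize!(cpu, lens)                   # CPU Base.resize! baseline
```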
I separated the new resize! and the old resize! temporarily for better comparison.