Skip to content

Commit 38bbee9

Browse files
committed
Reduce spurious allocations
Only store a Type{<:StaticArray} upon failed bounds checking of a StaticArray to prevent boxing (i.e., allocation). Add a new @test_noalloc macro that captures variables to help the compiler avoid additional spurious boxing/allocations.
1 parent e23a2f5 commit 38bbee9

File tree

7 files changed

+162
-40
lines changed

7 files changed

+162
-40
lines changed

src/MArray.jl

+2-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ end
2828
end
2929

3030
@propagate_inbounds function setindex!(v::MArray, val, i::Int)
31-
@boundscheck checkbounds(v,i)
31+
@inline
32+
@boundscheck checkbounds((v),i)
3233
T = eltype(v)
3334

3435
if isbitstype(T)

src/SArray.jl

+2-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ sacollect
6262
####################
6363

6464
@propagate_inbounds function getindex(v::SArray, i::Int)
65-
getfield(v,:data)[i]
65+
@boundscheck checkbounds(v, i)
66+
@inbounds getfield(v,:data)[i]
6667
end
6768

6869
@inline Base.Tuple(v::SArray) = getfield(v,:data)

src/SUnitRange.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ SUnitRange(a::Int, b::Int) = SUnitRange{a, max(0, b - a + 1)}()
2020

2121
@propagate_inbounds function getindex(x::SUnitRange{Start, L}, i::Int) where {Start, L}
2222
@boundscheck if i < 1 || i > L
23-
throw(BoundsError(x, i))
23+
Base.throw_boundserror(x, i)
2424
end
2525
return Start + i - 1
2626
end

src/deque.jl

+3-3
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ julia> insert(@SVector[6, 5, 4, 2, 1], 4, 3)
7575
return quote
7676
@_propagate_inbounds_meta
7777
@boundscheck if (index < 1 || index > $newlen)
78-
throw(BoundsError(vec, index))
78+
Base.throw_boundserror(vec, index)
7979
end
8080
@inbounds return similar_type(vec, Size($newlen))(tuple($(exprs...)))
8181
end
@@ -150,7 +150,7 @@ julia> deleteat(@SVector[6, 5, 4, 3, 2, 1], 2)
150150
return quote
151151
@_propagate_inbounds_meta
152152
@boundscheck if (index < 1 || index > $(s[1]))
153-
throw(BoundsError(vec, index))
153+
Base.throw_boundserror(vec, index)
154154
end
155155
@inbounds return similar_type(vec, Size($newlen))(tuple($(exprs...)))
156156
end
@@ -188,7 +188,7 @@ julia> setindex(@SMatrix[2 4; 6 8], 1, 2)
188188
return quote
189189
@_propagate_inbounds_meta
190190
@boundscheck if (index < 1 || index > $(L))
191-
throw(BoundsError(a, index))
191+
Base.throw_boundserror(a, index)
192192
end
193193
@inbounds return typeof(a)(tuple($(exprs...)))
194194
end

src/indexing.jl

+16
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,22 @@ setindex!(a::StaticArray, value, i::Int) = error("setindex!(::$(typeof(a)), valu
88

99
# Note: all indexing behavior defaults to dense, linear indexing
1010

11+
Base.summary(io::IO, T::Type{<:StaticVector}) =
12+
print(io, length(T), "-element ", T)
13+
14+
Base.summary(io::IO, T::Type{<:StaticArray}) =
15+
print(io, join(size(T), "x"), " ", T)
16+
17+
"""
18+
Only store the type upon failed bounds checking of a StaticArray to prevent
19+
boxing and the corresponding allocation. Boxing is otherwise needed when
20+
bounds checking is active to make it possible to potentially put the
21+
StaticArray in BoundsError.a::Any.
22+
"""
23+
Base.throw_boundserror(T::Type{<:StaticArray},I) = (@noinline;throw(BoundsError(T,I)))
24+
25+
Base.throw_boundserror(A::StaticArray,I) = (@inline;Base.throw_boundserror(typeof(A),I))
26+
1127
@propagate_inbounds function getindex(a::StaticArray, inds::Int...)
1228
@boundscheck checkbounds(a, inds...)
1329
_getindex_scalar(Size(a), a, inds...)

test/matrix_multiply_add.jl

+23-34
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,6 @@ using LinearAlgebra
33
using BenchmarkTools
44
using Test
55

6-
macro test_noalloc(ex)
7-
esc(quote
8-
$ex
9-
@test(@allocated($ex) == 0)
10-
end)
11-
end
12-
136
mul_add_wrappers = [
147
m -> m,
158
m -> Symmetric(m, :U),
@@ -94,22 +87,18 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
9487
mul!(b,At,c,1.0,2.0)
9588
@test b ≈ 5A'c
9689

97-
if !(ArrayType <: SizedArray)
98-
@test_noalloc mul!(c,A,b)
99-
else
100-
mul!(c,A,b)
101-
@test_broken(@allocated(mul!(c,A,b)) == 0)
102-
end
90+
@test_noalloc_barrier mul!(c,A,b)
91+
10392
expected_transpose_allocs = 0
10493
bmark = @benchmark mul!($c,$A,$b,$α,$β) samples=10 evals=10
10594
@test minimum(bmark).allocs == 0
106-
# @test_noalloc mul!(c, A, b, α, β) # records 32 bytes
95+
@test_noalloc_barrier mul!(c, A, b, α, β)
10796
bmark = @benchmark mul!($b,Transpose($A),$c) samples=10 evals=10
10897
@test minimum(bmark).allocs <= expected_transpose_allocs
109-
# @test_noalloc mul!(b, Transpose(A), c) # records 16 bytes
98+
@test_noalloc_barrier mul!(b, Transpose(A), c)
11099
bmark = @benchmark mul!($b,Transpose($A),$c,$α,$β) samples=10 evals=10
111100
@test minimum(bmark).allocs <= expected_transpose_allocs
112-
# @test_noalloc mul!(b, Transpose(A), c, α, β) # records 48 bytes
101+
@test_noalloc_barrier mul!(b, Transpose(A), c, α, β)
113102

114103
# outer product
115104
C = rand(Mat{N1,N2})
@@ -122,9 +111,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
122111
mul!(C,a,b',1.,1.)
123112
@test C ≈ 3a*b'
124113

125-
b = @benchmark mul!($C,$a,$(b')) samples=10 evals=10
126-
@test minimum(b).allocs <= expected_transpose_allocs
127-
# @test_noalloc mul!(C, a, b') # records 16 bytes
114+
bmark = @benchmark mul!($C,$a,$(b')) samples=10 evals=10
115+
@test minimum(bmark).allocs <= expected_transpose_allocs
116+
@test_noalloc_barrier mul!(C, a, b')
128117

129118
# A × B
130119
A = rand(Mat{N1,N2})
@@ -137,9 +126,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
137126
mul!(C,A,B,2.0,1.0)
138127
@test C ≈ 4A*B
139128

140-
b = @benchmark mul!($C,$A,$B,$α,$β) samples=10 evals=10
141-
@test minimum(b).allocs == 0
142-
# @test_noalloc mul!(C, A, B, α, β) # records 32 bytes
129+
bmark = @benchmark mul!($C,$A,$B,$α,$β) samples=10 evals=10
130+
@test minimum(bmark).allocs == 0
131+
@test_noalloc_barrier mul!(C, A, B, α, β)
143132

144133
# A'B
145134
At = Transpose(A)
@@ -150,9 +139,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
150139
mul!(B,At,C,2.0,1.0)
151140
@test B ≈ 4A'C
152141

153-
b = @benchmark mul!($B,Transpose($A),$C,$α,$β) samples=10 evals=10
154-
@test minimum(b).allocs <= expected_transpose_allocs
155-
# @test_noalloc mul!(B, Transpose(A), C, α, β) # records 48 bytes
142+
bmark = @benchmark mul!($B,Transpose($A),$C,$α,$β) samples=10 evals=10
143+
@test minimum(bmark).allocs <= expected_transpose_allocs
144+
@test_noalloc_barrier mul!(B, Transpose(A), C, α, β)
156145

157146
# A*B'
158147
Bt = Transpose(B)
@@ -163,9 +152,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
163152
mul!(C,A,Bt,2.0,1.0)
164153
@test C ≈ 4A*B'
165154

166-
b = @benchmark mul!($C,$A,Transpose($B),$α,$β) samples=10 evals=10
167-
@test minimum(b).allocs <= expected_transpose_allocs
168-
# @test_noalloc mul!(C, A, Transpose(B), α, β) # records 48 bytes
155+
bmark = @benchmark mul!($C,$A,Transpose($B),$α,$β) samples=10 evals=10
156+
@test minimum(bmark).allocs <= expected_transpose_allocs
157+
@test_noalloc_barrier mul!(C, A, Transpose(B), α, β)
169158

170159
# A'B'
171160
B = rand(Mat{N1,N1})
@@ -177,17 +166,17 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
177166
mul!(C,Transpose(A),Transpose(B),2.0,1.0)
178167
@test C ≈ 4A'B'
179168

180-
b = @benchmark mul!($C,Transpose($A),Transpose($B),$α,$β) samples=10 evals=10
181-
@test minimum(b).allocs <= 2*expected_transpose_allocs
182-
# @test_noalloc mul!(C, Transpose(A), Transpose(B), α, β) # records 64 bytes
169+
bmark = @benchmark mul!($C,Transpose($A),Transpose($B),$α,$β) samples=10 evals=10
170+
@test minimum(bmark).allocs <= 2*expected_transpose_allocs
171+
@test_noalloc_barrier mul!(C, Transpose(A), Transpose(B), α, β)
183172

184173
# Transpose Output
185174
C = rand(Mat{N1,N2})
186175
mul!(Transpose(C),Transpose(A),Transpose(B))
187176
@test C' ≈ A'B'
188-
b = @benchmark mul!(Transpose($C),Transpose($A),Transpose($B),$α,$β) samples=10 evals=10
189-
@test minimum(b).allocs <= expected_transpose_allocs*3
190-
# @test_noalloc mul!(Transpose(C), Transpose(A), Transpose(B), α, β) # records 80 bytes
177+
bmark = @benchmark mul!(Transpose($C),Transpose($A),Transpose($B),$α,$β) samples=10 evals=10
178+
@test minimum(bmark).allocs <= expected_transpose_allocs*3
179+
@test_noalloc_barrier mul!(Transpose(C), Transpose(A), Transpose(B), α, β)
191180
end
192181

193182
# Test the three different

test/testutil.jl

+115
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,103 @@ macro test_was_once_broken(good_version, ex)
104104
end)
105105
end
106106

107+
108+
r"""
109+
@allocated_barrier f(b, g(c))
110+
111+
Is effectively translated to:
112+
113+
(function(b, c)
114+
@allocated f(b, g(c))
115+
end)(b, c)
116+
117+
The function barrier improves type stability which helps the compiler
118+
avoid unccessary heap allocations.
119+
120+
Functions/functors are not captured as local variables by default but
121+
they can be wrapped by prepending each function call with a $ sign.
122+
Values can also be interpolated with a $:
123+
124+
@allocated_barrier $f(b, $(g(c)))
125+
126+
Which effectively translates to:
127+
128+
(function(f, b, gc)
129+
@allocated f(b, gc)
130+
end)(f, b, g(c))
131+
132+
This is useful if `f` is a local variable or if `g(..)` causes allocations
133+
that should be excluded.
134+
135+
Another approach to is to wrap each call to `@allocated` with `@eval`:
136+
137+
@eval @allocated f($a,g($b,$c))
138+
@eval @allocated $a .= $f.($b, $c)
139+
140+
The number of allocated bytes reported is similar to this macro.
141+
"""
142+
macro allocated_barrier(ex)
143+
captured = Dict{Any,Symbol}()
144+
145+
function capture(s::Symbol)
146+
if Base.isidentifier(s)
147+
get!(captured, s) do
148+
gensym(s)
149+
end
150+
else
151+
s
152+
end
153+
end
154+
155+
function capture(expr::Expr)
156+
if expr.head == :$
157+
get!(captured, expr.args[1]) do
158+
gensym(string(expr.args[1]))
159+
end
160+
elseif expr.head == :. && last(expr.args) isa QuoteNode
161+
get!(captured, expr) do
162+
gensym(join(expr.args, "."))
163+
end
164+
else
165+
# Expr(expr.head, capture.(expr.args)...)
166+
arg1 = popfirst!(expr.args)
167+
Expr(expr.head,
168+
expr.head == :call && !(arg1 isa Expr && arg1.head==:$) ? arg1 : capture(arg1),
169+
capture.(expr.args)...)
170+
end
171+
end
172+
173+
capture(x) = x
174+
175+
inner_ex = capture(ex)
176+
177+
quote
178+
(function($(values(captured)...))
179+
180+
f() = $inner_ex
181+
Base.precompile(f, ())
182+
@allocated f()
183+
184+
end)($(esc.(keys(captured))...))
185+
end
186+
end
187+
188+
189+
"""
    @test_noalloc expr

Expand to `@test 0 == @allocated_barrier(expr)`, with the `@test` attributed to
the source location where `@test_noalloc` was written.
"""
macro test_noalloc(ex)
    # Build the `@allocated_barrier(ex)` call by hand; `()` occupies the slot
    # normally holding the line-number node, which keeps printed output tidy.
    alloc_call = Expr(:macrocall, Symbol("@allocated_barrier"), (), ex)

    # Report failures at the caller's line rather than inside this macro.
    caller_line = LineNumberNode(__source__.line, __source__.file)
    comparison = Expr(:call, :(==), 0, alloc_call)
    check = Expr(:macrocall, Symbol("@test"), caller_line, comparison)

    return esc(check)
end
202+
203+
107204
@testset "test utils" begin
108205
@testset "@testinf" begin
109206
@testinf [1,2] == [1,2]
@@ -121,4 +218,22 @@ end
121218
end
122219
@test ts.errorcount == 0 && ts.failcount == 2 && ts.passcount == 0
123220
end
221+
222+
a = rand(3)
223+
z = ones(3)
224+
225+
@testset "@allocated_barrier" begin
226+
@test @allocated_barrier(z .= a .+ z) == 0
227+
@test z ≈ a .+ 1
228+
@test @allocated_barrier(z .= a + z) > 0
229+
@test @allocated_barrier(z .= $(a + z)) == 0
230+
231+
@test @allocated_barrier(z .= abs.(a + z)) > 0
232+
@test @allocated_barrier(z .= $(abs.(a + z))) == 0
233+
@test @allocated_barrier(z .= abs.(a .+ z)) == 0
234+
end
235+
236+
@testset "@test_noalloc" begin
237+
@test_noalloc z .= abs.(a .+ z)
238+
end
124239
end

0 commit comments

Comments
 (0)