Reduce spurious allocations

gustafsson · gustafsson · commit 38bbee9e12e7 · 2025-04-01T08:31:53.000+02:00
Only store a Type{<:StaticArray} upon failed bounds checking of a
StaticArray to prevent boxing (i.e., allocation).

Add a new @test_noalloc macro that captures variables to help the compiler
avoid additional spurious boxing/allocations.
diff --git a/src/MArray.jl b/src/MArray.jl
@@ -28,7 +28,8 @@ end
 end
 
 @propagate_inbounds function setindex!(v::MArray, val, i::Int)
-    @boundscheck checkbounds(v,i)
+    @inline
+    @boundscheck checkbounds((v),i)
     T = eltype(v)
 
     if isbitstype(T)
diff --git a/src/SArray.jl b/src/SArray.jl
@@ -62,7 +62,8 @@ sacollect
 ####################
 
 @propagate_inbounds function getindex(v::SArray, i::Int)
-    getfield(v,:data)[i]
+    @boundscheck checkbounds(v, i)
+    @inbounds getfield(v,:data)[i]
 end
 
 @inline Base.Tuple(v::SArray) = getfield(v,:data)
diff --git a/src/SUnitRange.jl b/src/SUnitRange.jl
@@ -20,7 +20,7 @@ SUnitRange(a::Int, b::Int) = SUnitRange{a, max(0, b - a + 1)}()
 
 @propagate_inbounds function getindex(x::SUnitRange{Start, L}, i::Int) where {Start, L}
     @boundscheck if i < 1 || i > L
-        throw(BoundsError(x, i))
+        Base.throw_boundserror(x, i)
     end
     return Start + i - 1
 end
diff --git a/src/deque.jl b/src/deque.jl
@@ -75,7 +75,7 @@ julia> insert(@SVector[6, 5, 4, 2, 1], 4, 3)
     return quote
         @_propagate_inbounds_meta
         @boundscheck if (index < 1 || index > $newlen)
-            throw(BoundsError(vec, index))
+            Base.throw_boundserror(vec, index)
         end
         @inbounds return similar_type(vec, Size($newlen))(tuple($(exprs...)))
     end
@@ -150,7 +150,7 @@ julia> deleteat(@SVector[6, 5, 4, 3, 2, 1], 2)
     return quote
         @_propagate_inbounds_meta
         @boundscheck if (index < 1 || index > $(s[1]))
-            throw(BoundsError(vec, index))
+            Base.throw_boundserror(vec, index)
         end
         @inbounds return similar_type(vec, Size($newlen))(tuple($(exprs...)))
     end
@@ -188,7 +188,7 @@ julia> setindex(@SMatrix[2 4; 6 8], 1, 2)
     return quote
         @_propagate_inbounds_meta
         @boundscheck if (index < 1 || index > $(L))
-            throw(BoundsError(a, index))
+            Base.throw_boundserror(a, index)
         end
         @inbounds return typeof(a)(tuple($(exprs...)))
     end
diff --git a/src/indexing.jl b/src/indexing.jl
@@ -8,6 +8,22 @@ setindex!(a::StaticArray, value, i::Int) = error("setindex!(::$(typeof(a)), valu
 
 # Note: all indexing behavior defaults to dense, linear indexing
 
+Base.summary(io::IO, T::Type{<:StaticVector}) =  # NOTE(review): presumably for BoundsError display — confirm
+    print(io, length(T), "-element ", T)
+# As above for a `Type{<:StaticArray}`: the `size(T)` entries joined by "x", a space, `T`.
+Base.summary(io::IO, T::Type{<:StaticArray}) =
+    print(io, join(size(T), "x"), " ", T)
+
+"""
+Throw `BoundsError(T, I)` holding only the array *type* `T` when bounds checking of a
+StaticArray fails. Storing the instance itself would require boxing it (and the
+corresponding allocation) so that it could be put in `BoundsError.a::Any`; storing the
+type avoids that. This method is marked `@noinline`.
+"""
+Base.throw_boundserror(T::Type{<:StaticArray},I) = (@noinline;throw(BoundsError(T,I)))
+# Instance entry point: marked `@inline`, forwards only `typeof(A)` to the method above.
+Base.throw_boundserror(A::StaticArray,I) = (@inline;Base.throw_boundserror(typeof(A),I))
+
 @propagate_inbounds function getindex(a::StaticArray, inds::Int...)
     @boundscheck checkbounds(a, inds...)
     _getindex_scalar(Size(a), a, inds...)
diff --git a/test/matrix_multiply_add.jl b/test/matrix_multiply_add.jl
@@ -3,13 +3,6 @@ using LinearAlgebra
 using BenchmarkTools
 using Test
 
-macro test_noalloc(ex)
-    esc(quote
-        $ex
-        @test(@allocated($ex) == 0)
-    end)
-end
-
 mul_add_wrappers = [
     m -> m,
     m -> Symmetric(m, :U),
@@ -94,22 +87,18 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
     mul!(b,At,c,1.0,2.0)
     @test b ≈ 5A'c
 
-    if !(ArrayType <: SizedArray)
-        @test_noalloc mul!(c,A,b)
-    else
-        mul!(c,A,b)
-        @test_broken(@allocated(mul!(c,A,b)) == 0)
-    end
+    @test_noalloc_barrier mul!(c,A,b)
+
     expected_transpose_allocs = 0
     bmark = @benchmark mul!($c,$A,$b,$α,$β) samples=10 evals=10
     @test minimum(bmark).allocs == 0
-    # @test_noalloc mul!(c, A, b, α, β)  # records 32 bytes
+    @test_noalloc_barrier mul!(c, A, b, α, β)
     bmark = @benchmark mul!($b,Transpose($A),$c) samples=10 evals=10
     @test minimum(bmark).allocs <= expected_transpose_allocs
-    # @test_noalloc mul!(b, Transpose(A), c)  # records 16 bytes
+    @test_noalloc_barrier mul!(b, Transpose(A), c)
     bmark = @benchmark mul!($b,Transpose($A),$c,$α,$β) samples=10 evals=10
     @test minimum(bmark).allocs <= expected_transpose_allocs
-    # @test_noalloc mul!(b, Transpose(A), c, α, β)  # records 48 bytes
+    @test_noalloc_barrier mul!(b, Transpose(A), c, α, β)
 
     # outer product
     C = rand(Mat{N1,N2})
@@ -122,9 +111,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
     mul!(C,a,b',1.,1.)
     @test C ≈ 3a*b'
 
-    b = @benchmark mul!($C,$a,$(b')) samples=10 evals=10
-    @test minimum(b).allocs <= expected_transpose_allocs
-    # @test_noalloc mul!(C, a, b')  # records 16 bytes
+    bmark = @benchmark mul!($C,$a,$(b')) samples=10 evals=10
+    @test minimum(bmark).allocs <= expected_transpose_allocs
+    @test_noalloc_barrier mul!(C, a, b')
 
     # A × B
     A = rand(Mat{N1,N2})
@@ -137,9 +126,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
     mul!(C,A,B,2.0,1.0)
     @test C ≈ 4A*B
 
-    b = @benchmark mul!($C,$A,$B,$α,$β) samples=10 evals=10
-    @test minimum(b).allocs == 0
-    # @test_noalloc mul!(C, A, B, α, β)  # records 32 bytes
+    bmark = @benchmark mul!($C,$A,$B,$α,$β) samples=10 evals=10
+    @test minimum(bmark).allocs == 0
+    @test_noalloc_barrier mul!(C, A, B, α, β)
 
     # A'B
     At = Transpose(A)
@@ -150,9 +139,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
     mul!(B,At,C,2.0,1.0)
     @test B ≈ 4A'C
 
-    b = @benchmark mul!($B,Transpose($A),$C,$α,$β) samples=10 evals=10
-    @test minimum(b).allocs <= expected_transpose_allocs
-    # @test_noalloc mul!(B, Transpose(A), C, α, β)  # records 48 bytes
+    bmark = @benchmark mul!($B,Transpose($A),$C,$α,$β) samples=10 evals=10
+    @test minimum(bmark).allocs <= expected_transpose_allocs
+    @test_noalloc_barrier mul!(B, Transpose(A), C, α, β)
 
     # A*B'
     Bt = Transpose(B)
@@ -163,9 +152,9 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
     mul!(C,A,Bt,2.0,1.0)
     @test C ≈ 4A*B'
 
-    b = @benchmark mul!($C,$A,Transpose($B),$α,$β) samples=10 evals=10
-    @test minimum(b).allocs <= expected_transpose_allocs
-    # @test_noalloc mul!(C, A, Transpose(B), α, β)  # records 48 bytes
+    bmark = @benchmark mul!($C,$A,Transpose($B),$α,$β) samples=10 evals=10
+    @test minimum(bmark).allocs <= expected_transpose_allocs
+    @test_noalloc_barrier mul!(C, A, Transpose(B), α, β)
 
     # A'B'
     B = rand(Mat{N1,N1})
@@ -177,17 +166,17 @@ function test_multiply_add(N1,N2,ArrayType=MArray)
     mul!(C,Transpose(A),Transpose(B),2.0,1.0)
     @test C ≈ 4A'B'
 
-    b = @benchmark mul!($C,Transpose($A),Transpose($B),$α,$β) samples=10 evals=10
-    @test minimum(b).allocs <= 2*expected_transpose_allocs
-    # @test_noalloc mul!(C, Transpose(A), Transpose(B), α, β)  # records 64 bytes
+    bmark = @benchmark mul!($C,Transpose($A),Transpose($B),$α,$β) samples=10 evals=10
+    @test minimum(bmark).allocs <= 2*expected_transpose_allocs
+    @test_noalloc_barrier mul!(C, Transpose(A), Transpose(B), α, β)
 
     # Transpose Output
     C = rand(Mat{N1,N2})
     mul!(Transpose(C),Transpose(A),Transpose(B))
     @test C' ≈ A'B'
-    b = @benchmark mul!(Transpose($C),Transpose($A),Transpose($B),$α,$β) samples=10 evals=10
-    @test minimum(b).allocs <= expected_transpose_allocs*3
-    # @test_noalloc mul!(Transpose(C), Transpose(A), Transpose(B), α, β)  # records 80 bytes
+    bmark = @benchmark mul!(Transpose($C),Transpose($A),Transpose($B),$α,$β) samples=10 evals=10
+    @test minimum(bmark).allocs <= expected_transpose_allocs*3
+    @test_noalloc_barrier mul!(Transpose(C), Transpose(A), Transpose(B), α, β)
 end
 
 # Test the three different
diff --git a/test/testutil.jl b/test/testutil.jl
@@ -104,6 +104,103 @@ macro test_was_once_broken(good_version, ex)
     end)
 end
 
+
+@doc raw"""
+    @allocated_barrier f(b, g(c))
+
+Is effectively translated to:
+
+    (function(b, c)
+        @allocated f(b, g(c))
+    end)(b, c)
+
+The function barrier improves type stability which helps the compiler
+avoid unnecessary heap allocations.
+
+Functions/functors are not captured as local variables by default but
+they can be wrapped by prepending each function call with a $ sign.
+Values can also be interpolated with a $:
+
+    @allocated_barrier $f(b, $(g(c)))
+
+Which effectively translates to:
+
+    (function(f, b, gc)
+        @allocated f(b, gc)
+    end)(f, b, g(c))
+
+This is useful if `f` is a local variable or if `g(..)` causes allocations
+that should be excluded.
+
+Another approach is to wrap each call to `@allocated` with `@eval`:
+
+    @eval @allocated f($a,g($b,$c))
+    @eval @allocated $a .= $f.($b, $c)
+
+The number of allocated bytes reported is similar to this macro.
+"""
+macro allocated_barrier(ex)
+    captured = Dict{Any,Symbol}()
+    # Identifiers become barrier arguments; other symbols (e.g. operators) are kept as-is.
+    function capture(s::Symbol)
+        Base.isidentifier(s) || return s
+        get!(captured, s) do
+            gensym(s)
+        end
+    end
+
+    # `$x` and `a.b` are captured whole. Anything else is rebuilt into a fresh `Expr`
+    # (the input is not mutated); a plain call keeps its callee unless it is `$`-wrapped.
+    function capture(expr::Expr)
+        if isempty(expr.args)
+            expr  # e.g. `()` or `[]`: nothing to capture
+        elseif expr.head == :$
+            get!(captured, expr.args[1]) do
+                gensym(string(expr.args[1]))
+            end
+        elseif expr.head == :. && last(expr.args) isa QuoteNode
+            get!(captured, expr) do
+                gensym(join(expr.args, "."))
+            end
+        else
+            callee = first(expr.args)
+            keep = expr.head == :call && !(callee isa Expr && callee.head == :$)
+            Expr(expr.head, keep ? callee : capture(callee),
+                map(capture, expr.args[2:end])...)
+        end
+    end
+
+    capture(x) = x
+
+    inner_ex = capture(ex)
+
+    quote
+        (function($(values(captured)...))
+            # presumably precompiled first so compilation is not counted by `@allocated`
+            f() = $inner_ex
+            Base.precompile(f, ())
+            @allocated f()
+
+        end)($(esc.(keys(captured))...))
+    end
+end
+
+
+# Test that evaluating `ex` through `@allocated_barrier` reports zero allocated bytes.
+macro test_noalloc(ex)
+    # Blank out the inner macro call's line node ("tidy output").
+    alloc_call = :(@allocated_barrier($ex))
+    alloc_call.args[2] = ()
+
+    # Point `@test` at the caller's source location instead of this macro body.
+    test_call = :(@test 0 == $alloc_call)
+    test_call.args[2] = LineNumberNode(__source__.line, __source__.file)
+
+    # Escaped as a whole, so both macros are resolved in the caller's module.
+    esc(test_call)
+end
+
+
 @testset "test utils" begin
     @testset "@testinf" begin
         @testinf [1,2] == [1,2]
@@ -121,4 +218,22 @@ end
         end
         @test ts.errorcount == 0 && ts.failcount == 2 && ts.passcount == 0
     end
+
+    a = rand(3)
+    z = ones(3)
+
+    @testset "@allocated_barrier" begin
+        @test @allocated_barrier(z .=        a .+ z)  == 0  # fully dotted broadcast into `z`
+        @test z ≈ a .+ 1  # `z` started as `ones(3)`: the expression was evaluated once
+        @test @allocated_barrier(z .=        a + z)   > 0   # `a + z` is expected to allocate
+        @test @allocated_barrier(z .=      $(a + z))  == 0  # `$(...)` is passed in as a value
+        # The same three variants with an `abs.` broadcast wrapped around the sum.
+        @test @allocated_barrier(z .=   abs.(a + z))  > 0
+        @test @allocated_barrier(z .= $(abs.(a + z))) == 0
+        @test @allocated_barrier(z .=   abs.(a .+ z)) == 0
+    end
+
+    @testset "@test_noalloc" begin
+        @test_noalloc z .= abs.(a .+ z)
+    end
 end