Skip to content

Commit ad5ddf8

Browse files
lpawelaamontoison
andauthored
Set default buffer size in CUSPARSE mm! functions (#2298)
This works around a bug in CUSPARSE. Co-authored-by: Alexis Montoison <[email protected]>
1 parent a50b5b8 commit ad5ddf8

File tree

3 files changed

+139
-4
lines changed

3 files changed

+139
-4
lines changed

lib/cusparse/generic.jl

+9-4
Original file line numberDiff line numberDiff line change
@@ -235,8 +235,10 @@ function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::Union{CuS
235235
# cusparseCsrSetStridedBatch(obj, batchsize, 0, nnz(A))
236236
# end
237237

238+
# Set default buffer for small matrices (10000 chosen arbitrarily)
239+
# Otherwise tries to allocate 120TB of memory (see #2296)
238240
function bufferSize()
239-
out = Ref{Csize_t}()
241+
out = Ref{Csize_t}(10000)
240242
cusparseSpMM_bufferSize(
241243
handle(), transa, transb, Ref{T}(alpha), descA, descB, Ref{T}(beta),
242244
descC, T, algo, out)
@@ -311,8 +313,10 @@ function bmm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparse
311313
strideC = stride(C, 3)
312314
cusparseDnMatSetStridedBatch(descC, b, strideC)
313315

316+
# Set default buffer for small matrices (10000 chosen arbitrarily)
317+
# Otherwise tries to allocate 120TB of memory (see #2296)
314318
function bufferSize()
315-
out = Ref{Csize_t}()
319+
out = Ref{Csize_t}(10000)
316320
cusparseSpMM_bufferSize(
317321
handle(), transa, transb, Ref{T}(alpha), descA, descB, Ref{T}(beta),
318322
descC, T, algo, out)
@@ -337,7 +341,6 @@ function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::DenseCuMa
337341
beta::Number, C::DenseCuMatrix{T}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where {T}
338342

339343
CUSPARSE.version() < v"11.7.4" && throw(ErrorException("This operation is not supported by the current CUDA version."))
340-
341344
# Support transa = 'C' and `transb = 'C' for real matrices
342345
transa = T <: Real && transa == 'C' ? 'T' : transa
343346
transb = T <: Real && transb == 'C' ? 'T' : transb
@@ -370,8 +373,10 @@ function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::DenseCuMa
370373
descB = CuSparseMatrixDescriptor(B, index, transposed=true)
371374
descC = CuDenseMatrixDescriptor(C, transposed=true)
372375

376+
# Set default buffer for small matrices (10000 chosen arbitrarily)
377+
# Otherwise tries to allocate 120TB of memory (see #2296)
373378
function bufferSize()
374-
out = Ref{Csize_t}()
379+
out = Ref{Csize_t}(10000)
375380
cusparseSpMM_bufferSize(
376381
handle(), transb, transa, Ref{T}(alpha), descB, descA, Ref{T}(beta),
377382
descC, T, algo, out)

test/libraries/cusparse/bmm.jl

+114
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,120 @@ if CUSPARSE.version() ≥ v"11.7.2"
7171
end
7272

7373

74+
@testset "C = αAᵀBᵀ + βC" begin
75+
A1 = CuSparseMatrixCSR{elty}(sprand(elty, k, m, p))
76+
A2 = copy(A1)
77+
A2.nzVal = CUDA.rand(elty, size(A2.nzVal)...)
78+
A = cat(A1, A2; dims=3)
79+
80+
B = CUDA.rand(elty, n, k, 2)
81+
C = CUDA.rand(elty, m, n, 2)
82+
D = copy(C)
83+
84+
CUSPARSE.bmm!('C', 'C', α, A, B, β, C, 'O')
85+
86+
D[:,:,1] = α * A1' * B[:,:,1]' + β * D[:,:,1]
87+
D[:,:,2] = α * A2' * B[:,:,2]' + β * D[:,:,2]
88+
89+
@test D ≈ C
90+
end
91+
92+
@testset "extended batch-dims" begin
93+
A1 = CuSparseMatrixCSR{elty}(sprand(elty, m, k, p))
94+
A2 = copy(A1)
95+
A2.nzVal = CUDA.rand(elty, size(A2.nzVal)...)
96+
A3 = cat(A1, A2; dims=3)
97+
98+
A4 = copy(A3)
99+
A4.nzVal = CUDA.rand(elty, size(A3.nzVal)...)
100+
101+
A5 = copy(A3)
102+
A5.nzVal = CUDA.rand(elty, size(A3.nzVal)...)
103+
104+
A = cat(A3, A4, A5; dims=4)
105+
106+
B = CUDA.rand(elty, k, n, 2, 3)
107+
C = CUDA.rand(elty, m, n, 2, 3)
108+
D = copy(C)
109+
110+
CUSPARSE.bmm!('N', 'N', α, A, B, β, C, 'O')
111+
112+
for c in CartesianIndices((2,3))
113+
CUDA.@allowscalar D[:,:,c] = α * A[:,:,c.I...] * B[:,:,c] + β*D[:,:,c]
114+
end
115+
116+
@test D ≈ C
117+
end
118+
end
119+
120+
m = 1
121+
n = 2
122+
# error when n == 1 and batchsize > 1 as cusparseSpMM falls back to cusparseSpMV, which doesn't do batched computations.
123+
# see https://docs.nvidia.com/cuda/cusparse/#cusparsespmm
124+
k = 1
125+
p = 1.
126+
127+
@testset "Sparse-Dense $elty bmm! for small matrices" for elty in (Float64, Float32, ComplexF64, ComplexF32)
128+
# check if #2296 returns
129+
α = rand(elty)
130+
β = rand(elty)
131+
132+
@testset "C = αAB + βC" begin
133+
A1 = CuSparseMatrixCSR{elty}(sprand(elty, m, k, p))
134+
A2 = copy(A1)
135+
A2.nzVal = CUDA.rand(elty, size(A2.nzVal)...)
136+
A = cat(A1, A2; dims=3)
137+
138+
B = CUDA.rand(elty, k, n, 2)
139+
C = CUDA.rand(elty, m, n, 2)
140+
D = copy(C)
141+
142+
CUSPARSE.bmm!('N', 'N', α, A, B, β, C, 'O')
143+
144+
D[:,:,1] = α * A1 * B[:,:,1] + β * D[:,:,1]
145+
D[:,:,2] = α * A2 * B[:,:,2] + β * D[:,:,2]
146+
147+
@test D ≈ C
148+
end
149+
150+
@testset "C = αAᵀB + βC" begin
151+
A1 = CuSparseMatrixCSR{elty}(sprand(elty, k, m, p))
152+
A2 = copy(A1)
153+
A2.nzVal = CUDA.rand(elty, size(A2.nzVal)...)
154+
A = cat(A1, A2; dims=3)
155+
156+
B = CUDA.rand(elty, k, n, 2)
157+
C = CUDA.rand(elty, m, n, 2)
158+
D = copy(C)
159+
160+
CUSPARSE.bmm!('C', 'N', α, A, B, β, C, 'O')
161+
162+
D[:,:,1] = α * A1' * B[:,:,1] + β * D[:,:,1]
163+
D[:,:,2] = α * A2' * B[:,:,2] + β * D[:,:,2]
164+
165+
@test D ≈ C
166+
end
167+
168+
169+
@testset "C = αABᵀ + βC" begin
170+
A1 = CuSparseMatrixCSR{elty}(sprand(elty, m, k, p))
171+
A2 = copy(A1)
172+
A2.nzVal = CUDA.rand(elty, size(A2.nzVal)...)
173+
A = cat(A1, A2; dims=3)
174+
175+
B = CUDA.rand(elty, n, k, 2)
176+
C = CUDA.rand(elty, m, n, 2)
177+
D = copy(C)
178+
179+
CUSPARSE.bmm!('N', 'C', α, A, B, β, C, 'O')
180+
181+
D[:,:,1] = α * A1 * B[:,:,1]' + β * D[:,:,1]
182+
D[:,:,2] = α * A2 * B[:,:,2]' + β * D[:,:,2]
183+
184+
@test D ≈ C
185+
end
186+
187+
74188
@testset "C = αAᵀBᵀ + βC" begin
75189
A1 = CuSparseMatrixCSR{elty}(sprand(elty, k, m, p))
76190
A2 = copy(A1)

test/libraries/cusparse/generic.jl

+16
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,22 @@ if CUSPARSE.version() >= v"11.7.4"
134134
beta = rand(T)
135135
mm!(transa, transb, alpha, dA, dB, beta, dC, 'O', algo)
136136
@test alpha * opa(A) * opb(B) + beta * C ≈ collect(dC)
137+
138+
# add tests for very small matrices (see #2296)
139+
# skip conjugate transpose - causes errors with 1x1 matrices
140+
# CUSPARSE_SPMM_CSR_ALG3 also fails
141+
(algo == CUSPARSE.CUSPARSE_SPMM_CSR_ALG3 || transa == 'C') && continue
142+
A = rand(T, 1, 1)
143+
B = sprand(T, 1, 1, 1.)
144+
C = rand(T, 1, 1)
145+
dA = CuArray(A)
146+
dB = SparseMatrixType(B)
147+
dC = CuArray(C)
148+
149+
alpha = rand(T)
150+
beta = rand(T)
151+
mm!(transa, transb, alpha, dA, dB, beta, dC, 'O', algo)
152+
@test alpha * opa(A) * opb(B) + beta * C ≈ collect(dC)
137153
end
138154
end
139155
end

0 commit comments

Comments
 (0)