From 75009a74f5f1a5578282e508e30751f62bfc6b5e Mon Sep 17 00:00:00 2001 From: Romeo Valentin Date: Sat, 26 Jul 2025 12:02:22 -0700 Subject: [PATCH 1/4] Remove use of `CPUSummary` The use of CPUInfo makes `--trim` difficult. Removing this dependency here would unlock a large amount of libraries which use the Polyester library to be trimmmable (notably e.g. almost everything in the SciML ecosystem). However, we might need a bit more discussion on the exact removal of this feature. --- Project.toml | 2 -- src/request.jl | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/Project.toml b/Project.toml index bcf68b7..e988bd6 100644 --- a/Project.toml +++ b/Project.toml @@ -5,14 +5,12 @@ version = "0.2.2" [deps] BitTwiddlingConvenienceFunctions = "62783981-4cbd-42fc-bca8-16325de8dc4b" -CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9" IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173" Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5" [compat] BitTwiddlingConvenienceFunctions = "0.1" -CPUSummary = "0.1.2, 0.2" IfElse = "0.1" Static = "0.3.1, 0.4, 0.5, 0.6, 0.7, 0.8, 1" ThreadingUtilities = "0.4.5, 0.5" diff --git a/src/request.jl b/src/request.jl index 1f8e190..368d08d 100644 --- a/src/request.jl +++ b/src/request.jl @@ -1,7 +1,5 @@ -import CPUSummary - function worker_bits() - wts = nextpow2(CPUSummary.sys_threads()) # Typically sys_threads (i.e. Sys.CPU_THREADS) does not change between runs, thus it will precompile well. + wts = nextpow2(Threads.nthreads()) # Typically sys_threads (i.e. Sys.CPU_THREADS) does not change between runs, thus it will precompile well. ws = static(8sizeof(UInt)) # For testing purposes it can be overridden by JULIA_CPU_THREADS, ifelse(Static.lt(wts, ws), ws, wts) end From 7297852e576fd1766a4b2d27736719b696c6a4fd Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 28 Jul 2025 23:12:49 -0400 Subject: [PATCH 2/4] Fix type instability with high thread counts in PR #28 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes a critical bug that occurs when using more than 64 threads. The change from CPUSummary.sys_threads() to Threads.nthreads() introduced a type instability where worker_bits() and worker_mask_count() would return regular Int instead of StaticInt types with high thread counts. Changes: - Modified worker_bits() to always return Int for consistency - Updated worker_mask_count() to use regular integer division - Added new _request_threads method that handles Int parameter - Added test for high thread count compatibility The fix maintains backward compatibility while ensuring the code works correctly with any number of threads. Fixes the MethodError: no method matching _request_threads(::UInt32, ::Ptr{UInt64}, ::Int64, ::Nothing) 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- src/request.jl | 29 +++++++++++++++++++++++++++-- test/test_high_thread_count.jl | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 test/test_high_thread_count.jl diff --git a/src/request.jl b/src/request.jl index 368d08d..87880fe 100644 --- a/src/request.jl +++ b/src/request.jl @@ -1,11 +1,12 @@ function worker_bits() wts = nextpow2(Threads.nthreads()) # Typically sys_threads (i.e. Sys.CPU_THREADS) does not change between runs, thus it will precompile well. ws = static(8sizeof(UInt)) # For testing purposes it can be overridden by JULIA_CPU_THREADS, - ifelse(Static.lt(wts, ws), ws, wts) + # Always return Int to avoid type instability with high thread counts + ifelse(wts < 64, 64, wts) end function worker_mask_count() bits = worker_bits() - (bits + StaticInt{63}()) ÷ StaticInt{64}() # cld not defined on `StaticInt` + cld(bits, 64) end worker_pointer() = Base.unsafe_convert(Ptr{UInt}, pointer_from_objref(WORKERS)) @@ -61,6 +62,30 @@ end (ui,), (ft,) end +# Handle regular Int for type stability with high thread counts +@inline function _request_threads( + num_requested::UInt32, + wp::Ptr, + N::Int, + threadmask +) + if N == 1 + ui, ft, num_requested, wp = + __request_threads(num_requested, wp, _first(threadmask)) + return (ui,), (ft,) + else + ui, ft, num_requested, wp = + __request_threads(num_requested, wp, _first(threadmask)) + uit, ftt = _request_threads( + num_requested, + wp, + N - 1, + _remaining(threadmask) + ) + return (ui, uit...), (ft, ftt...) + end +end + @inline function _exchange_mask!(wp, ::Nothing) all_threads = _atomic_xchg!(wp, zero(UInt)) all_threads, all_threads diff --git a/test/test_high_thread_count.jl b/test/test_high_thread_count.jl new file mode 100644 index 0000000..1236d3f --- /dev/null +++ b/test/test_high_thread_count.jl @@ -0,0 +1,33 @@ +using Test +using PolyesterWeave + +@testset "High thread count compatibility" begin + # Test worker_bits returns Int for all thread counts + @test isa(PolyesterWeave.worker_bits(), Int) + + # Test worker_mask_count returns Int + @test isa(PolyesterWeave.worker_mask_count(), Int) + + # Test that request_threads works with high thread counts + # This simulates the case where worker_mask_count() > 1 + if Threads.nthreads() > 64 + # With > 64 threads, worker_mask_count() should be 2 or more + @test PolyesterWeave.worker_mask_count() >= 2 + + # Test that request_threads doesn't throw + threads, torelease = PolyesterWeave.request_threads(10) + @test length(threads) >= 0 # May get 0 if no threads available + + # Free the threads + PolyesterWeave.free_threads!(torelease) + else + # With <= 64 threads, worker_mask_count() should be 1 + @test PolyesterWeave.worker_mask_count() == 1 + end + + # Test specific values + @test PolyesterWeave.worker_bits() == max(64, nextpow2(Threads.nthreads())) + @test PolyesterWeave.worker_mask_count() == cld(PolyesterWeave.worker_bits(), 64) +end + +println("All tests passed!") \ No newline at end of file From 3ccc2d5e5e5dca8c6023733a1d22b0864d41c177 Mon Sep 17 00:00:00 2001 From: Christopher Rackauckas Date: Tue, 29 Jul 2025 03:06:39 -0400 Subject: [PATCH 3/4] Update runtests.jl --- test/runtests.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/runtests.jl b/test/runtests.jl index d51a37a..37ed7c3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -28,3 +28,4 @@ using Test end end Aqua.test_all(PolyesterWeave) +include("test_high_thread_count.jl") From aa4ae0ce3dbad0d00ff84cd38e775d4373df5356 Mon Sep 17 00:00:00 2001 From: Christopher Rackauckas Date: Tue, 29 Jul 2025 03:11:40 -0400 Subject: [PATCH 4/4] Update test_high_thread_count.jl --- test/test_high_thread_count.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_high_thread_count.jl b/test/test_high_thread_count.jl index 1236d3f..2a60efc 100644 --- a/test/test_high_thread_count.jl +++ b/test/test_high_thread_count.jl @@ -1,5 +1,6 @@ using Test using PolyesterWeave +using BitTwiddlingConvenienceFunctions: nextpow2 @testset "High thread count compatibility" begin # Test worker_bits returns Int for all thread counts @@ -29,5 +30,3 @@ using PolyesterWeave @test PolyesterWeave.worker_bits() == max(64, nextpow2(Threads.nthreads())) @test PolyesterWeave.worker_mask_count() == cld(PolyesterWeave.worker_bits(), 64) end - -println("All tests passed!") \ No newline at end of file