pybind · swolchok · Sep 4, 2025 · Sep 4, 2025 · Sep 4, 2025 · Sep 4, 2025
@@ -180,6 +180,7 @@ if(PYBIND11_MASTER_PROJECT)
 endif()
 
 set(PYBIND11_HEADERS
+    include/pybind11/detail/argument_vector.h
     include/pybind11/detail/class.h
     include/pybind11/detail/common.h
     include/pybind11/detail/cpp_conduit.h

diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include "detail/argument_vector.h"
 #include "detail/common.h"
 #include "detail/descr.h"
 #include "detail/native_enum_data.h"
@@ -2037,6 +2038,8 @@ using is_pos_only = std::is_same<intrinsic_t<T>, pos_only>;
 // forward declaration (definition in attr.h)
 struct function_record;
 
+constexpr std::size_t arg_vector_small_size = 6; // inline size requested for function_call args
+
 /// Internal data associated with a single function call
 struct function_call {
     function_call(const function_record &f, handle p); // Implementation in attr.h
@@ -2045,10 +2048,12 @@ struct function_call {
     const function_record &func;
 
     /// Arguments passed to the function:
-    std::vector<handle> args;
+    /// (Inline size chosen mostly arbitrarily; 6 should pad function_call out to two cache lines
+    /// (16 pointers) in size.)
+    argument_vector<arg_vector_small_size> args;
 
     /// The `convert` value the arguments should be loaded with
-    std::vector<bool> args_convert;
+    args_convert_vector<arg_vector_small_size> args_convert;
 
     /// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if
     /// present, are also in `args` but without a reference).

diff --git a/include/pybind11/detail/argument_vector.h b/include/pybind11/detail/argument_vector.h
@@ -0,0 +1,315 @@
+/*
+    pybind11/detail/argument_vector.h: small_vector-like containers to
+    avoid heap allocation of arguments during function call dispatch.
+
+    Copyright (c) Meta Platforms, Inc. and affiliates.
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include <pybind11/pytypes.h>
+
+#include "common.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+PYBIND11_WARNING_DISABLE_MSVC(4127)
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// Shared implementation utility for our small_vector-like containers.
+// We support C++11 and C++14, so we cannot use
+// std::variant. Union with the tag packed next to the inline
+// array's size is smaller anyway, allowing 1 extra handle of
+// inline storage for free. Compare the layouts (1 line per
+// size_t/void*):
+// With variant, total is N + 2 for N >= 2:
+// - variant tag (cannot be packed with the array size)
+// - array size (or first pointer of 3 in std::vector)
+// - N pointers of inline storage (or 2 remaining pointers of std::vector)
+// Custom union, total is N + 1 for N >= 3:
+// - variant tag & array size if applicable
+// - N pointers of inline storage (or 3 pointers of std::vector)
+//
+// NOTE: this is a low-level representational convenience; the two
+// use cases of this union are materially different and in particular
+// have different semantics for inline_array::size. All that is being
+// shared is the memory management behavior.
+template <typename ArrayT, std::size_t InlineSize, typename VectorT = ArrayT>
+union inline_array_or_vector { // either InlineSize inline ArrayT slots or a std::vector<VectorT>
+    struct inline_array {
+        bool is_inline = true; // tag; first member of both structs, read back by is_inline()
+        std::uint32_t size = 0; // maintained by the owning container, not by this union
+        std::array<ArrayT, InlineSize> arr;
+    };
+    struct heap_vector {
+        bool is_inline = false; // tag; false marks the heap representation
+        std::vector<VectorT> vec;
+
+        heap_vector() = default;
+        heap_vector(std::size_t count, VectorT value) : vec(count, value) {} // count copies
+    };
+
+    inline_array array;
+    heap_vector vector;
+
+    static_assert(std::is_trivially_move_constructible<ArrayT>::value,
+                  "ArrayT must be trivially move constructible"); // moves memcpy `array`
+    static_assert(std::is_trivially_destructible<ArrayT>::value,
+                  "ArrayT must be trivially destructible"); // `array` is never destroyed
+
+    inline_array_or_vector() : array() {} // starts in the inline representation
+    ~inline_array_or_vector() {
+        if (!is_inline()) { // only the heap representation gets a destructor call
+            vector.~heap_vector();
+        }
+    }
+    inline_array_or_vector(const inline_array_or_vector &) = delete; // move-only
+    inline_array_or_vector &operator=(const inline_array_or_vector &) = delete;
+
+    inline_array_or_vector(inline_array_or_vector &&rhs) noexcept { // takes rhs's mode
+        if (rhs.is_inline()) {
+            std::memcpy(&array, &rhs.array, sizeof(array)); // bytewise copy, tag included
+        } else {
+            new (&vector) heap_vector(std::move(rhs.vector)); // rhs keeps a moved-from vec
+        }
+        assert(is_inline() == rhs.is_inline());
+    }
+
+    inline_array_or_vector &operator=(inline_array_or_vector &&rhs) noexcept {
+        if (this == &rhs) { // self-assignment is a no-op
+            return *this;
+        }
+
+        if (rhs.is_inline()) {
+            if (!is_inline()) {
+                vector.~heap_vector(); // release our heap vector before overwriting its bytes
+            }
+            std::memcpy(&array, &rhs.array, sizeof(array));
+        } else {
+            if (is_inline()) {
+                new (&vector) heap_vector(std::move(rhs.vector)); // inline -> heap
+            } else {
+                vector = std::move(rhs.vector); // heap -> heap
+            }
+        }
+        return *this;
+    }
+
+    bool is_inline() const {
+        // Returns the tag shared by both representations. Accessing the inactive
+        // member of a union directly is undefined behavior, but it is well-defined
+        // to reinterpret_cast any pointer into a pointer to char and examine it as
+        // bytes, so the tag is copied out of the leading sizeof(bool) bytes. See
+        // https://dev-discuss.pytorch.org/t/unionizing-for-profit-how-to-exploit-the-power-of-unions-in-c/444#the-memcpy-loophole-4
+        bool result = false;
+        std::memcpy(&result, reinterpret_cast<const char *>(this), sizeof(bool));
+        return result;
+    }
+};
+
+// Growable sequence of `handle`s that keeps up to N elements in inline
+// storage and only switches to a heap-allocated std::vector beyond that.
+template <std::size_t N>
+struct argument_vector {
+public:
+    argument_vector() = default;
+
+    // Move-only.
+    argument_vector(argument_vector &&) noexcept = default;
+    argument_vector &operator=(argument_vector &&) noexcept = default;
+    argument_vector(const argument_vector &) = delete;
+    argument_vector &operator=(const argument_vector &) = delete;
+
+    // Number of handles currently stored.
+    std::size_t size() const {
+        return is_inline() ? static_cast<std::size_t>(m_repr.array.size)
+                           : m_repr.vector.vec.size();
+    }
+
+    // Mutable access to the element at `idx`; `idx` must be < size().
+    handle &operator[](std::size_t idx) {
+        assert(idx < size());
+        return is_inline() ? m_repr.array.arr[idx] : m_repr.vector.vec[idx];
+    }
+
+    // Copy of the element at `idx`; `idx` must be < size().
+    handle operator[](std::size_t idx) const {
+        assert(idx < size());
+        return is_inline() ? m_repr.array.arr[idx] : m_repr.vector.vec[idx];
+    }
+
+    // Appends `x`, moving the contents to the heap once the inline storage is full.
+    void push_back(handle x) {
+        if (!is_inline()) {
+            m_repr.vector.vec.push_back(x);
+            return;
+        }
+        auto &inline_repr = m_repr.array;
+        if (inline_repr.size != N) {
+            inline_repr.arr[inline_repr.size++] = x;
+            return;
+        }
+        // Inline storage is exhausted: spill to a vector with room for one more element.
+        move_to_vector_with_reserved_size(N + 1);
+        m_repr.vector.vec.push_back(x);
+    }
+
+    // Appends a handle constructed from `x` (`x` is passed on as an lvalue, not forwarded).
+    template <typename Arg>
+    void emplace_back(Arg &&x) {
+        push_back(handle(x));
+    }
+
+    // Ensures capacity for `sz` elements; requests of at most N leave inline storage in place.
+    void reserve(std::size_t sz) {
+        if (!is_inline()) {
+            m_repr.vector.vec.reserve(sz);
+        } else if (sz > N) {
+            move_to_vector_with_reserved_size(sz);
+        }
+    }
+
+private:
+    using repr_type = inline_array_or_vector<handle, N>;
+    repr_type m_repr;
+
+    // Switches from the inline to the heap representation, keeping the current
+    // elements and reserving `reserved_size` slots. Must only be called while inline.
+    void move_to_vector_with_reserved_size(std::size_t reserved_size) {
+        assert(is_inline());
+        using heap_vector = typename repr_type::heap_vector;
+        auto &inline_repr = m_repr.array;
+        const auto first = inline_repr.arr.begin();
+        const auto last = first + inline_repr.size;
+        heap_vector spilled;
+        spilled.vec.reserve(reserved_size);
+        std::copy(first, last, std::back_inserter(spilled.vec));
+        // The inline array is trivially destructible, so it can simply be overwritten.
+        new (&m_repr.vector) heap_vector(std::move(spilled));
+    }
+
+    bool is_inline() const { return m_repr.is_inline(); }
+};
+
+// small_vector-like container of bool flags that avoids heap allocation
+// for small sizes. While inline, the flags are bit-packed into std::size_t
+// words, so the real inline capacity (kInlineSize) is kRequestedInlineSize
+// rounded up to a whole number of words. Larger sizes are stored in a
+// std::vector<bool>.
+template <std::size_t kRequestedInlineSize>
+struct args_convert_vector {
+public:
+    args_convert_vector() = default;
+
+    // Move-only.
+    args_convert_vector(const args_convert_vector &) = delete;
+    args_convert_vector &operator=(const args_convert_vector &) = delete;
+    args_convert_vector(args_convert_vector &&) noexcept = default;
+    args_convert_vector &operator=(args_convert_vector &&) noexcept = default;
+
+    // Constructs a vector holding `count` flags, all equal to `value`.
+    args_convert_vector(std::size_t count, bool value) {
+        if (count > kInlineSize) {
+            // m_repr starts out as the inline array; replace it with the heap representation.
+            new (&m_repr.vector) typename repr_type::heap_vector(count, value);
+        } else {
+            auto &inline_arr = m_repr.array;
+            // Set or clear every bit of every word; only the first `count` bits are in use.
+            inline_arr.arr.fill(value ? std::size_t(-1) : 0);
+            inline_arr.size = static_cast<decltype(inline_arr.size)>(count);
+        }
+    }
+
+    // Number of flags currently stored.
+    std::size_t size() const {
+        if (is_inline()) {
+            return m_repr.array.size;
+        }
+        return m_repr.vector.vec.size();
+    }
+
+    // Ensures capacity for `sz` flags; requests of at most kInlineSize keep inline storage.
+    void reserve(std::size_t sz) {
+        if (is_inline()) {
+            if (sz > kInlineSize) {
+                move_to_vector_with_reserved_size(sz);
+            }
+        } else {
+            m_repr.vector.vec.reserve(sz);
+        }
+    }
+
+    // Returns the flag at `idx`.
+    bool operator[](std::size_t idx) const {
+        if (is_inline()) {
+            return inline_index(idx);
+        }
+        assert(idx < m_repr.vector.vec.size());
+        return m_repr.vector.vec[idx];
+    }
+
+    // Appends `b`, moving the contents to the heap once all kInlineSize inline bits are used.
+    void push_back(bool b) {
+        if (is_inline()) {
+            auto &ha = m_repr.array;
+            if (ha.size == kInlineSize) {
+                move_to_vector_with_reserved_size(kInlineSize + 1);
+                m_repr.vector.vec.push_back(b);
+            } else {
+                assert(ha.size < kInlineSize);
+                const auto wbi = word_and_bit_index(ha.size++);
+                assert(wbi.word < kWords);
+                assert(wbi.bit < kBitsPerWord);
+                // Set or clear explicitly: the target bit may already be set, e.g. after
+                // construction with value == true.
+                if (b) {
+                    ha.arr[wbi.word] |= (std::size_t(1) << wbi.bit);
+                } else {
+                    ha.arr[wbi.word] &= ~(std::size_t(1) << wbi.bit);
+                }
+                assert(operator[](ha.size - 1) == b);
+            }
+        } else {
+            m_repr.vector.vec.push_back(b);
+        }
+    }
+
+    // Exchanges the contents of *this and rhs.
+    void swap(args_convert_vector &rhs) noexcept { std::swap(m_repr, rhs.m_repr); }
+
+private:
+    // Position of a flag inside the inline word array.
+    struct WordAndBitIndex {
+        std::size_t word;
+        std::size_t bit;
+    };
+
+    static WordAndBitIndex word_and_bit_index(std::size_t idx) {
+        return WordAndBitIndex{idx / kBitsPerWord, idx % kBitsPerWord};
+    }
+
+    // Reads flag `idx` from the inline words; only meaningful while inline.
+    bool inline_index(std::size_t idx) const {
+        const auto wbi = word_and_bit_index(idx);
+        assert(wbi.word < kWords);
+        assert(wbi.bit < kBitsPerWord);
+        // Compare explicitly instead of relying on an implicit std::size_t -> bool conversion.
+        return (m_repr.array.arr[wbi.word] & (std::size_t(1) << wbi.bit)) != 0;
+    }
+
+    // Switches from the inline to the heap representation, keeping the current
+    // flags and reserving `reserved_size` slots. Must only be called while inline.
+    void move_to_vector_with_reserved_size(std::size_t reserved_size) {
+        assert(is_inline());
+        auto &inline_arr = m_repr.array;
+        using heap_vector = typename repr_type::heap_vector;
+        heap_vector hv;
+        hv.vec.reserve(reserved_size);
+        for (std::size_t ii = 0; ii < inline_arr.size; ++ii) {
+            hv.vec.push_back(inline_index(ii));
+        }
+        // The inline array is trivially destructible, so it can simply be overwritten.
+        new (&m_repr.vector) heap_vector(std::move(hv));
+    }
+
+    static constexpr auto kBitsPerWord = 8 * sizeof(std::size_t);
+    static constexpr auto kWords = (kRequestedInlineSize + kBitsPerWord - 1) / kBitsPerWord;
+    static constexpr auto kInlineSize = kWords * kBitsPerWord;
+
+    using repr_type = inline_array_or_vector<std::size_t, kWords, bool>;
+    repr_type m_repr;
+
+    bool is_inline() const { return m_repr.is_inline(); }
+};
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h
@@ -1048,13 +1048,14 @@ class cpp_function : public function {
                 }
 #endif
 
-                std::vector<bool> second_pass_convert;
+                args_convert_vector<arg_vector_small_size> second_pass_convert;
                 if (overloaded) {
                     // We're in the first no-convert pass, so swap out the conversion flags for a
                     // set of all-false flags.  If the call fails, we'll swap the flags back in for
                     // the conversion-allowed call below.
-                    second_pass_convert.resize(func.nargs, false);
-                    call.args_convert.swap(second_pass_convert);
+                    second_pass_convert = std::move(call.args_convert);
+                    call.args_convert
+                        = args_convert_vector<arg_vector_small_size>(func.nargs, false);
                 }
 
                 // 6. Call the function.

@@ -647,8 +647,8 @@ if(NOT PYBIND11_CUDA_TESTS)
   # Test pure C++ code (not depending on Python). Provides the `test_pure_cpp` target.
   add_subdirectory(pure_cpp)
 
-  # Test embedding the interpreter. Provides the `cpptest` target.
-  add_subdirectory(test_embed)
+  # Test C++ code that depends on Python, such as embedding the interpreter. Provides the `cpptest` target.
+  add_subdirectory(test_with_catch)
 
   # Test CMake build using functions and targets from subdirectory or installed location
   add_subdirectory(test_cmake_build)

diff --git a/tests/extra_python_package/test_files.py b/tests/extra_python_package/test_files.py
@@ -76,6 +76,7 @@
 }
 
 detail_headers = {
+    "include/pybind11/detail/argument_vector.h",
     "include/pybind11/detail/class.h",
     "include/pybind11/detail/common.h",
     "include/pybind11/detail/cpp_conduit.h",