diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4fa0c445..44b8624d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -21,7 +21,7 @@ jobs:
       fail-fast: false
       matrix:
         os: ['ubuntu-latest', 'windows-2022', 'macos-13']
-        python: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13.0-rc.1', 'pypy3.9-v7.3.16', 'pypy3.10-v7.3.17']
+        python: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13.0-rc.2', 'pypy3.9-v7.3.16', 'pypy3.10-v7.3.17']
     name: "Python ${{ matrix.python }} / ${{ matrix.os }}"
     runs-on: ${{ matrix.os }}
@@ -49,7 +49,7 @@ jobs:
           python -m pip install pytest pytest-github-actions-annotate-failures typing_extensions

     - name: Install NumPy
-      if: ${{ !startsWith(matrix.python, 'pypy') && !contains(matrix.python, 'rc.1') }}
+      if: ${{ !startsWith(matrix.python, 'pypy') && !contains(matrix.python, 'rc.2') }}
       run: |
         python -m pip install numpy scipy
@@ -135,4 +135,39 @@ jobs:
       run: cmake --build build -j 2

     - name: Run tests
-      run: cd build && python3 -m pytest
+      run: >
+        cd build;
+        python -m pytest
+
+  free-threaded:
+    name: "Python 3.13-dev / ubuntu-latest [free-threaded]"
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - uses: deadsnakes/action@v3.1.0
+        with:
+          python-version: 3.13-dev
+          nogil: true
+
+      - name: Install the latest CMake
+        uses: lukka/get-cmake@latest
+
+      - name: Install PyTest
+        run: |
+          python -m pip install pytest pytest-github-actions-annotate-failures
+
+      - name: Configure
+        run: >
+          cmake -S . -B build -DNB_TEST_FREE_THREADED=ON
+
+      - name: Build C++
+        run: cmake --build build -j 2
+
+      - name: Run tests
+        run: >
+          cd build;
+          python -m pytest
diff --git a/.gitignore b/.gitignore
index 359fdefb..5ea58f3a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,11 +1,15 @@
 libnanobind-static.a
 libnanobind-static-abi3.a
-libnanobind.dylib
+libnanobind-static-ft.a
 libnanobind.so
-libnanobind-abi3.dylib
 libnanobind-abi3.so
+libnanobind-ft.so
+libnanobind.dylib
+libnanobind-abi3.dylib
+libnanobind-ft.dylib
 nanobind.dll
 nanobind-abi3.dll
+nanobind-ft.dll
 libinter_module.dylib
 libinter_module.so
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9ec06780..c47b471d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,13 +27,14 @@ endif()
 option(NB_CREATE_INSTALL_RULES "Create installation rules" ${NB_MASTER_PROJECT})
 option(NB_USE_SUBMODULE_DEPS "Use the nanobind dependencies shipped as a git submodule of this repository" ON)

-option(NB_TEST              "Compile nanobind tests?" ${NB_MASTER_PROJECT})
-option(NB_TEST_STABLE_ABI   "Test the stable ABI interface?" OFF)
-option(NB_TEST_SHARED_BUILD "Build a shared nanobind library for the test suite?" OFF)
-option(NB_TEST_CUDA         "Force the use of the CUDA/NVCC compiler for testing purposes" OFF)
+option(NB_TEST               "Compile nanobind tests?" ${NB_MASTER_PROJECT})
+option(NB_TEST_STABLE_ABI    "Test the stable ABI interface?" OFF)
+option(NB_TEST_SHARED_BUILD  "Build a shared nanobind library for the test suite?" OFF)
+option(NB_TEST_CUDA          "Force the use of the CUDA/NVCC compiler for testing purposes" OFF)
+option(NB_TEST_FREE_THREADED "Build free-threaded extensions for the test suite?" ON)

 if (NOT MSVC)
-  option(NB_TEST_SANITZE "Build tests with address and undefined behavior sanitizers?" OFF)
+  option(NB_TEST_SANITIZE "Build tests with address and undefined behavior sanitizers?" OFF)
 endif()

 # ---------------------------------------------------------------------------
diff --git a/cmake/darwin-ld-cpython.sym b/cmake/darwin-ld-cpython.sym
index cdd1cbe8..850dd384 100644
--- a/cmake/darwin-ld-cpython.sym
+++ b/cmake/darwin-ld-cpython.sym
@@ -908,6 +908,12 @@
 -U __Py_INCREF_IncRefTotal
 -U __PyObject_GetDictPtr
 -U _PyList_GetItemRef
+-U _PyDict_GetItemRef
+-U _PyDict_GetItemStringRef
+-U _PyDict_SetDefault
+-U _PyDict_SetDefaultRef
+-U _PyWeakref_GetRef
+-U _PyImport_AddModuleRef
 -U _PyUnstable_Module_SetGIL
 -U _PyMutex_Unlock
 -U _PyMutex_Lock
@@ -916,3 +922,4 @@
 -U _PyCriticalSection_Begin
 -U _PyCriticalSection_End
 -U _PyCriticalSection2_Begin
 -U _PyCriticalSection2_End
+-U _PyUnicode_AsUTF8
diff --git a/cmake/nanobind-config.cmake b/cmake/nanobind-config.cmake
index ec7ff583..3ef847e4 100644
--- a/cmake/nanobind-config.cmake
+++ b/cmake/nanobind-config.cmake
@@ -40,6 +40,9 @@ if(DEFINED NB_SOSABI)
   endif()
 endif()

+# Extract Python version and extensions (e.g. free-threaded build)
+string(REGEX REPLACE "[^-]*-([^-]*)-.*" "\\1" NB_ABI "${NB_SOABI}")
+
 # If either suffix is missing, call Python to compute it
 if(NOT DEFINED NB_SUFFIX OR NOT DEFINED NB_SUFFIX_S)
   # Query Python directly to get the right suffix.
@@ -72,6 +75,7 @@ endif()
 # Stash these for later use
 set(NB_SUFFIX ${NB_SUFFIX} CACHE INTERNAL "")
 set(NB_SUFFIX_S ${NB_SUFFIX_S} CACHE INTERNAL "")
+set(NB_ABI ${NB_ABI} CACHE INTERNAL "")

 get_filename_component(NB_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
 get_filename_component(NB_DIR "${NB_DIR}" PATH)
@@ -201,13 +205,17 @@ function (nanobind_build_library TARGET_NAME)
   endif()

   if (WIN32)
-    if (${TARGET_NAME} MATCHES "abi3")
+    if (${TARGET_NAME} MATCHES "-abi3")
       target_link_libraries(${TARGET_NAME} PUBLIC Python::SABIModule)
     else()
       target_link_libraries(${TARGET_NAME} PUBLIC Python::Module)
     endif()
   endif()

+  if (TARGET_NAME MATCHES "-ft")
+    target_compile_definitions(${TARGET_NAME} PUBLIC NB_FREE_THREADED)
+  endif()
+
   # Nanobind performs many assertion checks -- detailed error messages aren't
   # included in Release/MinSizeRel modes
   target_compile_definitions(${TARGET_NAME} PRIVATE
@@ -297,7 +305,7 @@ endfunction()

 function(nanobind_add_module name)
   cmake_parse_arguments(PARSE_ARGV 1 ARG
-    "STABLE_ABI;NB_STATIC;NB_SHARED;PROTECT_STACK;LTO;NOMINSIZE;NOSTRIP;MUSL_DYNAMIC_LIBCPP"
+    "STABLE_ABI;FREE_THREADED;NB_STATIC;NB_SHARED;PROTECT_STACK;LTO;NOMINSIZE;NOSTRIP;MUSL_DYNAMIC_LIBCPP"
     "NB_DOMAIN" "")

   add_library(${name} MODULE ${ARG_UNPARSED_ARGUMENTS})
@@ -319,6 +327,12 @@ function(nanobind_add_module name)
     set(ARG_STABLE_ABI FALSE)
   endif()

+  if (NB_ABI MATCHES "t")
+    set(ARG_STABLE_ABI FALSE)
+  elseif(ARG_STABLE_ABI)
+    set(ARG_FREE_THREADED FALSE)
+  endif()
+
   set(libname "nanobind")
   if (ARG_NB_STATIC)
     set(libname "${libname}-static")
@@ -328,6 +342,10 @@ function(nanobind_add_module name)
     set(libname "${libname}-abi3")
   endif()

+  if (ARG_FREE_THREADED)
+    set(libname "${libname}-ft")
+  endif()
+
   if (ARG_NB_DOMAIN AND ARG_NB_SHARED)
     set(libname ${libname}-${ARG_NB_DOMAIN})
   endif()
@@ -345,6 +363,10 @@ function(nanobind_add_module name)
     nanobind_extension(${name})
   endif()

+  if (ARG_FREE_THREADED)
+    target_compile_definitions(${name} PRIVATE NB_FREE_THREADED)
+  endif()
+
   target_link_libraries(${name} PRIVATE ${libname})

   if (NOT ARG_PROTECT_STACK)
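To illustrate what the new ``NB_ABI`` extraction above computes: CPython ``SOABI`` strings have the form ``cpython-313-x86_64-linux-gnu``, with a trailing ``t`` on the middle component (``313t``) for free-threaded builds. The regular expression keeps that middle component, which is why ``NB_ABI MATCHES "t"`` later identifies a free-threaded interpreter. A standalone sketch with an example SOABI value (not part of the patch):

.. code-block:: cmake

   # Example value only; in the real config, NB_SOABI is obtained from the
   # target Python interpreter (e.g., its SOABI configuration variable)
   set(NB_SOABI "cpython-313t-x86_64-linux-gnu")
   string(REGEX REPLACE "[^-]*-([^-]*)-.*" "\\1" NB_ABI "${NB_SOABI}")
   message(STATUS "NB_ABI=${NB_ABI}")   # prints: NB_ABI=313t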
diff --git a/docs/api_cmake.rst b/docs/api_cmake.rst
index c8718176..17a15b67 100644
--- a/docs/api_cmake.rst
+++ b/docs/api_cmake.rst
@@ -69,6 +69,11 @@ The high-level interface consists of just one CMake command:
        `__ build, making it possible to use a compiled
        extension across Python minor versions. The flag is ignored on Python
        versions older than 3.12.
+   * - ``FREE_THREADED``
+     - Compile a Python extension that opts into free-threaded (i.e.,
+       GIL-less) Python behavior, which requires a special free-threaded
+       build of Python 3.13 or newer. The flag is ignored on unsupported
+       Python versions.
    * - ``NB_STATIC``
      - Compile the core nanobind library as a static library. This
        simplifies redistribution but can increase the combined binary
@@ -270,6 +275,8 @@ The various commands are described below:
      - Perform a static library build (without this suffix, a shared build is used)
    * - ``-abi3``
      - Perform a stable ABI build targeting Python v3.12+.
+   * - ``-ft``
+     - Perform a build that opts into the Python 3.13+ free-threaded behavior.

 .. code-block:: cmake
diff --git a/docs/api_core.rst b/docs/api_core.rst
index 81f56a46..2a480dd5 100644
--- a/docs/api_core.rst
+++ b/docs/api_core.rst
@@ -790,6 +790,10 @@ Wrapper classes
       Return an item iterator that returns ``std::pair`` key-value pairs
       analogous to ``iter(dict.items())`` in Python.

+      In free-threaded Python, the :cpp:class:`detail::dict_iterator` class
+      acquires a lock on the underlying dictionary to enable the use of the
+      efficient but thread-unsafe ``PyDict_Next()`` C API traversal routine.
+
    .. cpp:function:: detail::dict_iterator end() const

       Return a sentinel that ends the iteration.
@@ -1616,7 +1620,9 @@ parameter of :cpp:func:`module_::def`, :cpp:func:`class_::def`,
    .. cpp:function:: template <typename T> arg_v operator=(T &&value) const

-      Assign a default value to the argument.
+      Return an argument annotation that is like this one but also assigns a
+      default value to the argument. The default will be converted into a Python
+      object immediately, so its bindings must have already been defined.

    .. cpp:function:: arg &none(bool value = true)
@@ -1638,6 +1644,12 @@ parameter of :cpp:func:`module_::def`, :cpp:func:`class_::def`,
       explain it in docstrings and stubs (``str(value)``) does not produce
       acceptable output.

+   .. cpp:function:: arg_locked lock()
+
+      Return an argument annotation that is like this one but also requests that
+      this argument be locked when dispatching a function call in free-threaded
+      Python extensions. It does nothing in regular GIL-protected extensions.
+
 .. cpp:struct:: is_method

    Indicate that the bound function is a method.
@@ -1656,6 +1668,12 @@ parameter of :cpp:func:`module_::def`, :cpp:func:`class_::def`,
    Indicate that the bound constructor can be used to perform implicit
    conversions.

+.. cpp:struct:: lock_self
+
+   Indicate that the implicit ``self`` argument of a method should be locked
+   when dispatching a call in a free-threaded extension. This annotation does
+   nothing in regular GIL-protected extensions.
+
 .. cpp:struct:: template <typename... Ts> call_guard

    Invoke the call guard(s) `Ts` when the bound function executes. The RAII
@@ -1875,10 +1893,8 @@ parameter of :cpp:func:`module_::def`, :cpp:func:`class_::def`,
 .. cpp:struct:: template <typename T> for_setter

    Analogous to :cpp:struct:`for_getter`, but for setters.

 .. _class_binding_annotations:

 Class binding annotations
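In connection with the :cpp:func:`arg::lock` and :cpp:struct:`lock_self` annotations documented above, a minimal sketch of how the pieces compose in a binding (the module and function names are hypothetical, not part of the patch):

.. code-block:: cpp

   #include <nanobind/nanobind.h>

   namespace nb = nanobind;

   NB_MODULE(demo_ext, m) {
       // 'data' and 'n' are locked while the call runs in free-threaded
       // extensions (at most two arguments may be locked); the annotations
       // are inert no-ops in regular GIL-protected builds. Since lock()
       // continues to support operator=, 'n' also carries a default value.
       m.def("process",
             [](nb::list data, int n) { /* ... */ },
             nb::arg("data").lock(),
             nb::arg("n").lock() = 1);
   }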
@@ -2552,10 +2568,103 @@ running it in parallel from multiple Python threads.

       Release the GIL (**must** be currently held)

+      In :ref:`free-threaded extensions <free-threaded>`, this operation also
+      temporarily releases all :ref:`argument locks <argument-locks>` held by
+      the current thread.
+
    .. cpp:function:: ~gil_scoped_release()

       Reacquire the GIL

+Free-threading
+--------------
+
+Nanobind provides abstractions to implement *additional* locking that is
+needed to ensure the correctness of free-threaded Python extensions.
+
+.. cpp:struct:: ft_mutex
+
+   Object-oriented wrapper representing a `PyMutex
+   `__. It can be
+   slightly more efficient than OS/language-provided primitives (e.g.,
+   ``std::mutex``, ``pthread_mutex_t``) and should generally be preferred when
+   adding critical sections to Python bindings.
+
+   In Python builds *without* free-threading, this class does nothing. It has
+   no attributes and the :cpp:func:`lock` and :cpp:func:`unlock` functions
+   return immediately.
+
+   .. cpp:function:: ft_mutex()
+
+      Create a new (unlocked) mutex.
+
+   .. cpp:function:: void lock()
+
+      Acquire the mutex.
+
+   .. cpp:function:: void unlock()
+
+      Release the mutex.
+
+.. cpp:struct:: ft_lock_guard
+
+   This class provides a RAII lock guard analogous to ``std::lock_guard`` and
+   ``std::unique_lock``.
+
+   .. cpp:function:: ft_lock_guard(ft_mutex &mutex)
+
+      Call :cpp:func:`mutex.lock() <ft_mutex::lock>` (no-op in non-free-threaded builds).
+
+   .. cpp:function:: ~ft_lock_guard()
+
+      Call :cpp:func:`mutex.unlock() <ft_mutex::unlock>` (no-op in non-free-threaded builds).
+
+.. cpp:struct:: ft_object_guard
+
+   This class provides a RAII guard that locks a single Python object within a
+   local scope (in contrast to :cpp:class:`ft_lock_guard`, which locks a
+   mutex).
+
+   It is a thin wrapper around the Python `critical section API
+   `__.
+   Please refer to the Python documentation for details on the semantics of
+   this relaxed form of critical section (in particular, Python critical
+   sections may release previously held locks).
+
+   In Python builds *without* free-threading, this class does nothing---the
+   constructor and destructor return immediately.
+
+   .. cpp:function:: ft_object_guard(handle h)
+
+      Lock the object ``h`` (no-op in non-free-threaded builds)
+
+   .. cpp:function:: ~ft_object_guard()
+
+      Unlock the object ``h`` (no-op in non-free-threaded builds)
+
+.. cpp:struct:: ft_object2_guard
+
+   This class provides a RAII guard that locks *two* Python objects within a
+   local scope (in contrast to :cpp:class:`ft_lock_guard`, which locks a
+   mutex).
+
+   It is a thin wrapper around the Python `critical section API
+   `__.
+   Please refer to the Python documentation for details on the semantics of
+   this relaxed form of critical section (in particular, Python critical
+   sections may release previously held locks).
+
+   In Python builds *without* free-threading, this class does nothing---the
+   constructor and destructor return immediately.
+
+   .. cpp:function:: ft_object2_guard(handle h1, handle h2)
+
+      Lock the objects ``h1`` and ``h2`` (no-op in non-free-threaded builds)
+
+   .. cpp:function:: ~ft_object2_guard()
+
+      Unlock the objects ``h1`` and ``h2`` (no-op in non-free-threaded builds)
+
 Low-level type and instance access
 ----------------------------------
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 28d80ad7..5df92acd 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -18,7 +18,16 @@ below inherit that of the preceding release.
 Version 2.2.0 (TBA)
 -------------------

-* nanobind has always used `PEP 590 vector calls
+- nanobind can now target `free-threaded Python
+  `__, which replaces the `Global
+  Interpreter Lock (GIL)
+  `__ with a
+  fine-grained locking scheme (see `PEP 703
+  `__) to better leverage multi-core
+  parallelism. A `separate documentation page `__ explains this in
+  detail.
+
+- nanobind has always used `PEP 590 vector calls
   `__ to efficiently dispatch
   calls to function and method bindings, but it lacked the ability to do so
   for constructors (e.g., ``MyType(arg1, arg2, ...)``).
@@ -46,8 +55,10 @@ Version 2.2.0 (TBA)

   - **Memory order**: :cpp:class:`c_contig`, :cpp:class:`f_contig`.
   - **Shape**: :cpp:class:`nb::shape\<3, 4, 5\> `, etc.
-  - **Device type**: :cpp:class:`nb::device::cpu `, :cpp:class:`nb::device::cuda `, etc.
-  - **Framework**: :cpp:class:`nb::numpy `, :cpp:class:`nb::pytorch `, etc.
+  - **Device type**: :cpp:class:`nb::device::cpu `,
+    :cpp:class:`nb::device::cuda `, etc.
+  - **Framework**: :cpp:class:`nb::numpy `,
+    :cpp:class:`nb::pytorch `, etc.
   - **Data type**: ``uint64_t``, ``std::complex``, etc.

   Previously, only the **framework** and **data type** annotations were
diff --git a/docs/free_threaded.rst b/docs/free_threaded.rst
new file mode 100644
index 00000000..af2bfa87
--- /dev/null
+++ b/docs/free_threaded.rst
@@ -0,0 +1,301 @@
+.. _free-threaded:
+
+.. cpp:namespace:: nanobind
+
+Free-threaded Python
+====================
+
+**Free-threading** is an experimental new Python feature that replaces the
+`Global Interpreter Lock (GIL)
+`__ with a fine-grained
+locking scheme to better leverage multi-core parallelism. The resulting
+benefits do not come for free: extensions must explicitly opt in and generally
+require careful modifications to ensure correctness.
+
+Nanobind has been able to target free-threaded Python since version 2.2.0.
+This page explains how to do so and discusses a few caveats. Besides this
+page, make sure to review `py-free-threading.github.io `__
+for a more comprehensive discussion of free-threaded Python. `PEP 703
+`__ explains the nitty-gritty details.
+
+Opting in
+---------
+
+To opt into free-threaded Python, pass the ``FREE_THREADED`` parameter to the
+:cmake:command:`nanobind_add_module()` CMake target command. For other build
+systems, refer to their respective documentation pages.
+
+.. code-block:: cmake
+
+   nanobind_add_module(
+     my_ext                   # Target name
+     FREE_THREADED            # Opt into free-threading
+     my_ext.h                 # Source code files below
+     my_ext.cpp)
+
+nanobind ignores the ``FREE_THREADED`` parameter when the detected Python
+version does not support free-threading.
+
+.. note::
+
+   **Stable ABI**: Note that there currently is no stable ABI for free-threaded
+   Python, hence the ``STABLE_ABI`` parameter will be ignored in free-threaded
+   extension builds. It is valid to combine the ``STABLE_ABI`` and
+   ``FREE_THREADED`` arguments: the build system will choose between the two
+   depending on the detected Python version.
+
+.. warning::
+
+   Loading a Python extension that does not support free-threading disables
+   free-threading globally. In larger binding projects with multiple
+   extensions, all of them must be adapted.
+
+If free-threading was requested and is available, the build system will set the
+``NB_FREE_THREADED`` preprocessor flag. This can be helpful to specialize
+binding code with ``#ifdef`` blocks, e.g.:
+
+.. code-block:: cpp
+
+   #if !defined(NB_FREE_THREADED)
+   ... // simple GIL-protected code
+   #else
+   ... // more complex thread-aware code
+   #endif
+
+Caveats
+-------
+
+Free-threading can violate implicit assumptions made by extension developers
+when previously serial operations suddenly run concurrently, producing
+undefined behavior (race conditions, crashes, etc.).
+
+Let's consider a concrete example: the binding code below defines a ``Counter``
+class with an increment operation.
+
+.. code-block:: cpp
+
+   struct Counter {
+       int value = 0;
+       void inc() { value++; }
+   };
+
+   nb::class_<Counter>(m, "Counter")
+       .def("inc", &Counter::inc)
+       .def_ro("value", &Counter::value);
+
+If multiple threads call the ``inc()`` method of a single ``Counter``, the
+final count will generally be incorrect, as the increment operation
+``value++`` does not execute atomically.
+
+To fix this, we could modify the C++ type so that it protects its ``value``
+member from concurrent modification, for example using an atomic number type
+(e.g., ``std::atomic<int>``) or a critical section (e.g., based on
+``std::mutex``).
+
+The race condition in the above example is relatively benign. However, in
+more complex projects, combinations of concurrency and unsafe memory accesses
+could introduce non-deterministic data corruption and crashes.
+
+Another common source of problems is *global variables* undergoing concurrent
+modification when no longer protected by the GIL. They will likewise require
+supplemental locking. The :ref:`next section <free-threaded-locks>` explains a
+Python-specific locking primitive that can be used in binding code besides
+the solutions mentioned above.
+
+.. _free-threaded-locks:
+
+Python locks
+------------
+
+Nanobind provides convenience functionality encapsulating the mutex
+implementation that is part of Python ("``PyMutex``"). It is slightly more
+efficient than OS/language-provided synchronization primitives and generally
+preferable within Python extensions.
+
+The class :cpp:class:`ft_mutex` is analogous to ``std::mutex``, and
+:cpp:class:`ft_lock_guard` is analogous to ``std::lock_guard``. Note that they
+only exist to add *supplemental* critical sections needed in free-threaded
+Python, while becoming inactive (no-ops) when targeting regular GIL-protected
+Python.
+
+With these abstractions, the previous ``Counter`` implementation could be
+rewritten as:
+
+.. code-block:: cpp
+   :emphasize-lines: 3,6
+
+   struct Counter {
+       int value = 0;
+       nb::ft_mutex mutex;
+
+       void inc() {
+           nb::ft_lock_guard guard(mutex);
+           value++;
+       }
+   };
+
+These locks are very compact (``sizeof(nb::ft_mutex) == 1``), though this is a
+Python implementation detail that could change in the future.
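The same primitives also work for the module-level state that the earlier caveats section called out as a common hazard. A hedged sketch (the registry below is hypothetical, not part of nanobind):

.. code-block:: cpp

   #include <nanobind/nanobind.h>
   #include <string>
   #include <unordered_map>

   namespace nb = nanobind;

   // Hypothetical global table; without the GIL, concurrent reads and
   // writes would race, so every access takes a supplemental lock that
   // compiles to a no-op in regular GIL-protected builds.
   nb::ft_mutex registry_mutex;
   std::unordered_map<std::string, int> registry;

   void register_id(const std::string &name, int id) {
       nb::ft_lock_guard guard(registry_mutex);
       registry[name] = id;
   }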
+.. _argument-locks:
+
+Argument locking
+----------------
+
+Modifying class and function definitions as shown above may not always be
+possible. As an alternative, nanobind also provides a way to *retrofit*
+supplemental locking onto existing code. The idea is to lock individual
+arguments of a function *before* being allowed to invoke it. A built-in mutex
+present in every Python object enables this.
+
+To do so, call the :cpp:func:`.lock() <arg::lock>` member of
+:cpp:class:`nb::arg() ` annotations to indicate that an
+argument must be locked, e.g.:
+
+- :cpp:func:`nb::arg("my_parameter").lock() <arg::lock>`
+- :cpp:func:`"my_parameter"_a.lock() <arg::lock>` (short-hand form)
+
+In method bindings, pass :cpp:struct:`nb::lock_self() <lock_self>` to lock
+the implicit ``self`` argument. Note that at most 2 arguments can be locked
+per function, which is a limitation of the `Python locking API
+`__.
+
+The example below shows how this functionality can be used to protect ``inc()``
+and a new ``merge()`` function that acquires two simultaneous locks.
+
+.. code-block:: cpp
+
+   struct Counter {
+       int value = 0;
+
+       void inc() { value++; }
+       void merge(Counter &other) {
+           value += other.value;
+           other.value = 0;
+       }
+   };
+
+   nb::class_<Counter>(m, "Counter")
+       .def("inc", &Counter::inc, nb::lock_self())
+       .def("merge", &Counter::merge, nb::lock_self(), "other"_a.lock())
+       .def_ro("value", &Counter::value);
+
+The above solution has an obvious drawback: it only protects *bindings* (i.e.,
+transitions from Python to C++). For example, if some other part of a C++
+codebase calls ``merge()`` directly, the binding layer won't be involved, and
+no locking takes place. If such behavior can introduce race conditions, a
+larger-scale redesign of your project may be in order.
+
+.. note::
+
+   Adding locking annotations indiscriminately is inadvisable because locked
+   calls are more costly than unlocked ones. The :cpp:func:`.lock()
+   <arg::lock>` and :cpp:struct:`nb::lock_self() <lock_self>` annotations are
+   ignored in GIL-protected builds, hence this added cost only applies to
+   free-threaded extensions.
+
+   Furthermore, when adding locking annotations to a function, consider keeping
+   the arguments *unnamed* (i.e., :cpp:func:`nb::arg().lock() <arg::lock>`
+   instead of :cpp:func:`nb::arg("name").lock() <arg::lock>`) if the function
+   will never be called with keyword arguments. Processing named arguments
+   causes small :ref:`binding overheads <binding-overheads>` that may be
+   undesirable if a function that does very little is called at a very high
+   rate.
+
+.. note::
+
+   **Python API and locking**: When the lock-protected function performs Python
+   API calls (e.g., using :ref:`wrappers ` like :cpp:class:`nb::dict
+   `), Python may temporarily release locks to avoid deadlocks. Here,
+   even basic reference counting, such as a :cpp:class:`nb::object
+   ` variable expiring at the end of a scope, counts as an API call.
+
+   These locks will be reacquired following the Python API call. This behavior
+   resembles ordinary (GIL-protected) Python code, where operations like
+   `Py_DECREF()
+   `__ can
+   cause arbitrary Python code to execute. The semantics of this kind of
+   relaxed critical section are described in the `Python documentation
+   `__.
+
+Miscellaneous notes
+-------------------
+
+API
+___
+
+The following API specific to free-threading has been added:
+
+- :cpp:class:`nb::ft_mutex <ft_mutex>`
+- :cpp:class:`nb::ft_lock_guard <ft_lock_guard>`
+- :cpp:class:`nb::ft_object_guard <ft_object_guard>`
+- :cpp:class:`nb::ft_object2_guard <ft_object2_guard>`
+- :cpp:func:`nb::arg::lock() <arg::lock>`
+
+API stability
+_____________
+
+The interface explained in this section is excluded from the project's
+semantic versioning policy. Free-threading is still experimental, and API
+breaks may be necessary based on future experience and changes in Python
+itself.
+
+Wrappers
+________
+
+:ref:`Wrapper types ` like :cpp:class:`nb::list ` may be used
+in multi-threaded code. Operations like :cpp:func:`nb::list::append()
+` internally acquire locks and behave just like their ordinary
+Python counterparts. This means that race conditions can still occur without
+larger-scale synchronization, but such races won't jeopardize the memory
+safety of the program.
+
+GIL scope guards
+________________
+
+Prior to free-threaded Python, the nanobind scope guards
+:cpp:struct:`gil_scoped_acquire` and :cpp:struct:`gil_scoped_release` would
+normally be used to acquire/release the GIL and enable parallel regions.
+
+These remain useful and should not be removed from existing code: while they
+are no longer blocking operations, they still set and unset the current
+Python thread context and inform the garbage collector.
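As a concrete illustration of such a parallel region, the binding below releases the Python thread state around a pure C++ computation; the module name and workload are hypothetical sketches, not part of the patch:

.. code-block:: cpp

   #include <nanobind/nanobind.h>
   #include <cstdint>

   namespace nb = nanobind;

   // Compute kernel that touches no Python objects
   static uint64_t collatz_steps(uint64_t n) {
       uint64_t steps = 0;
       while (n > 1) { n = (n & 1) ? 3 * n + 1 : n / 2; ++steps; }
       return steps;
   }

   NB_MODULE(compute_ext, m) {
       m.def("collatz_steps", [](uint64_t n) {
           // Lets other Python threads run during the computation; in
           // free-threaded builds this also temporarily releases any
           // argument locks held by the current thread (see below)
           nb::gil_scoped_release guard;
           return collatz_steps(n);
       });
   }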
+
+The :cpp:struct:`gil_scoped_release` RAII scope guard class plays a special
+role in free-threaded builds, since it releases all :ref:`argument locks
+<argument-locks>` held by the current thread.
+
+Immortalization
+_______________
+
+Python relies on a technique called *reference counting* to determine when an
+object is no longer needed. This approach can become a bottleneck in
+multi-threaded programs, since increasing and decreasing reference counts
+requires coordination among multiple processor cores. Python type and function
+objects are especially sensitive, since their reference counts change at a
+very high rate.
+
+Similar to free-threaded Python itself, nanobind avoids this bottleneck by
+*immortalizing* functions (``nanobind.nb_func``, ``nanobind.nb_method``) and
+type bindings. Immortal objects don't require reference counting and therefore
+cannot cause the bottleneck mentioned above. The main downside of this
+approach is that these objects leak when the interpreter shuts down.
+Free-threaded nanobind extensions disable the internal :ref:`leak checker `,
+since it would produce many warning messages caused by immortal objects.
+
+Internal data structures
+________________________
+
+Nanobind maintains various internal data structures that store information
+about instances and function/type bindings. These data structures also play an
+important role in exchanging type/instance data in larger projects that are
+split across several independent extension modules.
+
+The layout of these data structures differs between ordinary and free-threaded
+extensions, therefore nanobind isolates them from each other by assigning a
+different ABI version tag. This means that multi-module projects will need
+to consistently compile either free-threaded or non-free-threaded modules.
+
+Free-threaded nanobind uses thread-local and sharded data structures to avoid
+lock and atomic contention on the internal data structures, which would
+otherwise become a bottleneck in multi-threaded Python programs.
diff --git a/docs/functions.rst b/docs/functions.rst
index 8784ff47..8c364378 100644
--- a/docs/functions.rst
+++ b/docs/functions.rst
@@ -545,3 +545,58 @@ The following interactive session shows how to call them from Python.
    C++ libraries (e.g. GUI libraries, asynchronous networking libraries,
    etc.).

+.. _binding-overheads:
+
+Minimizing binding overheads
+----------------------------
+
+The code that dispatches function calls from Python to C++ is in general
+:ref:`highly optimized `. When it is important to further reduce
+binding overheads to an absolute minimum, consider removing annotations for
+:ref:`keyword and default arguments ` along with
+other advanced binding annotations.
+
+In the snippet below, ``f1`` has lower binding overheads compared to ``f2``.
+
+.. code-block:: cpp
+
+   NB_MODULE(my_ext, m) {
+       m.def("f1", [](int) { /* no-op */ });
+       m.def("f2", [](int) { /* no-op */ }, "arg"_a);
+   }
+
+This is because ``f1``:
+
+1. Does *not* use any of the following advanced argument annotation features:
+
+   - **Named function arguments**, e.g., :cpp:class:`nb::arg("name") ` or
+     ``"name"_a``.
+
+   - **Default argument values**, e.g., :cpp:func:`nb::arg() = 0 ` or
+     ``"name"_a = false``.
+
+   - **Nullability** or **implicit conversion** flags, e.g.,
+     :cpp:func:`nb::arg().none() ` or :cpp:func:`"name"_a.noconvert()
+     `.
+
+2. Has no :cpp:class:`nb::keep_alive\<Nurse, Patient\>() `
+   annotations.
+
+3. Takes no variable-length positional (:cpp:class:`nb::args `) or keyword
+   (:cpp:class:`nb::kwargs `) arguments.
+
+4. Has a total of **8 or fewer** function arguments.
+
+If all of the above conditions are satisfied, nanobind switches to a
+specialized dispatcher that is optimized to handle a small number of
+positional arguments. Otherwise, it uses the default dispatcher that works in
+any situation. It is also worth noting that functions with many overloads
+generally execute more slowly, since nanobind must first select a suitable
+one.
+
+These differences are mainly of interest when a function that does *very
+little* is called at a *very high rate*, in which case binding overheads can
+become noticeable.
+
+Regarding point 1 of the above list, note that **locking** is okay, as long
+as the annotation does not provide an argument name. In other words, a
+function binding with :cpp:func:`nb::arg().lock() <arg::lock>` annotations
+for some of its arguments stays on the fast path. This is mainly of interest
+for :ref:`free-threaded <free-threaded>` extensions.
diff --git a/docs/index.rst b/docs/index.rst
index e9b53e01..22133828 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -132,6 +132,7 @@ The nanobind logo was designed by `AndoTwin Studio
    :caption: Advanced
    :maxdepth: 1

+   free_threaded
    ownership_adv
    lowlevel
    typeslots
diff --git a/docs/typeslots.rst b/docs/typeslots.rst
index ff00b9c8..34f67df4 100644
--- a/docs/typeslots.rst
+++ b/docs/typeslots.rst
@@ -122,7 +122,7 @@ reference cycles:
        // If w->value has an associated CPython object, return it.
        // If not, value.ptr() will equal NULL, which is also fine.
-       nb::object value = nb::find(w->value);
+       nb::handle value = nb::find(w->value);

        // Inform the Python GC about the instance (if non-NULL)
        Py_VISIT(value.ptr());
@@ -149,6 +149,25 @@ reference cycles:

 The type ``visitproc`` and macro ``Py_VISIT()`` are part of the Python C API.

+.. note::
+
+   When targeting free-threaded Python, it is important that the
+   ``tp_traverse`` callback does not hold additional references to the
+   objects being traversed.
+
+   A previous version of this documentation page suggested the following:
+
+   .. code-block:: cpp
+
+      nb::object value = nb::find(w->value);
+      Py_VISIT(value.ptr());
+
+   However, this now has to change to:
+
+   .. code-block:: cpp
+
+      nb::handle value = nb::find(w->value);
+      Py_VISIT(value.ptr());
+
 The expression :cpp:func:`nb::inst_ptr\(self) ` efficiently
 returns the C++ instance associated with a Python object and is explained in
 the documentation about nanobind's :cpp:ref:`low level interface `.
diff --git a/docs/why.rst b/docs/why.rst
index e7819184..f20104e7 100644
--- a/docs/why.rst
+++ b/docs/why.rst
@@ -91,6 +91,13 @@ performance improvements:
   bottleneck. With nanobind's split into a precompiled library and minimal
   metatemplating, LTO is no longer crucial and can be skipped.

+- **Free-threading**: Python 3.13+ supports a free-threaded mode that removes
+  the *Global Interpreter Lock* (GIL). Both pybind11 and nanobind have
+  recently added support for free-threading. When comparing the two, nanobind
+  provides better multi-core scaling using a localized locking scheme. In
+  pybind11, lock contention on a central ``internals`` data structure used in
+  every binding operation becomes a bottleneck in practice.
+ - **Lifetime management**: nanobind maintains efficient internal data structures for lifetime management (needed for :cpp:class:`nb::keep_alive `, :cpp:enumerator:`nb::rv_policy::reference_internal diff --git a/include/nanobind/nanobind.h b/include/nanobind/nanobind.h index a138ec4f..71987856 100644 --- a/include/nanobind/nanobind.h +++ b/include/nanobind/nanobind.h @@ -48,10 +48,10 @@ #include "nb_error.h" #include "nb_attr.h" #include "nb_cast.h" +#include "nb_misc.h" #include "nb_call.h" #include "nb_func.h" #include "nb_class.h" -#include "nb_misc.h" #if defined(_MSC_VER) # pragma warning(pop) diff --git a/include/nanobind/nb_accessor.h b/include/nanobind/nb_accessor.h index f3a04d45..f690360d 100644 --- a/include/nanobind/nb_accessor.h +++ b/include/nanobind/nb_accessor.h @@ -133,21 +133,30 @@ struct num_item { }; struct num_item_list { - static constexpr bool cache_dec_ref = false; + #if defined(Py_GIL_DISABLED) + static constexpr bool cache_dec_ref = true; + #else + static constexpr bool cache_dec_ref = false; + #endif + using key_type = Py_ssize_t; NB_INLINE static void get(PyObject *obj, Py_ssize_t index, PyObject **cache) { - *cache = NB_LIST_GET_ITEM(obj, index); + #if defined(Py_GIL_DISABLED) + *cache = PyList_GetItemRef(obj, index); + #else + *cache = NB_LIST_GET_ITEM(obj, index); + #endif } NB_INLINE static void set(PyObject *obj, Py_ssize_t index, PyObject *v) { -#if !defined(Py_LIMITED_API) - // Handle differences between PyList_SetItem and PyList_SET_ITEM +#if defined(Py_LIMITED_API) || defined(NB_FREE_THREADED) + Py_INCREF(v); + PyList_SetItem(obj, index, v); +#else PyObject *old = NB_LIST_GET_ITEM(obj, index); -#endif Py_INCREF(v); NB_LIST_SET_ITEM(obj, index, v); -#if !defined(Py_LIMITED_API) Py_DECREF(old); #endif } diff --git a/include/nanobind/nb_attr.h b/include/nanobind/nb_attr.h index 72c71772..0476824e 100644 --- a/include/nanobind/nb_attr.h +++ b/include/nanobind/nb_attr.h @@ -20,9 +20,17 @@ struct name { }; struct arg_v; +struct arg_locked; +struct arg_locked_v; + +// Basic function argument descriptor (no default value, not locked) struct arg { NB_INLINE constexpr explicit arg(const char *name = nullptr) : name_(name), signature_(nullptr) { } + + // operator= can be used to provide a default value template NB_INLINE arg_v operator=(T &&value) const; + + // Mutators that don't change default value or locked state NB_INLINE arg &noconvert(bool value = true) { convert_ = !value; return *this; @@ -31,23 +39,75 @@ struct arg { none_ = value; return *this; } - NB_INLINE arg &sig(const char *value) { signature_ = value; return *this; } + // After lock(), this argument is locked + NB_INLINE arg_locked lock(); + const char *name_, *signature_; uint8_t convert_{ true }; bool none_{ false }; }; +// Function argument descriptor with default value (not locked) struct arg_v : arg { object value; NB_INLINE arg_v(const arg &base, object &&value) : arg(base), value(std::move(value)) {} + + private: + // Inherited mutators would slice off the default, and are not generally needed + using arg::noconvert; + using arg::none; + using arg::sig; + using arg::lock; +}; + +// Function argument descriptor that is locked (no default value) +struct arg_locked : arg { + NB_INLINE constexpr explicit arg_locked(const char *name = nullptr) : arg(name) { } + NB_INLINE constexpr explicit arg_locked(const arg &base) : arg(base) { } + + // operator= can be used to provide a default value + template NB_INLINE arg_locked_v operator=(T &&value) const; + + // Mutators must be respecified in order 
to not slice off the locked status + NB_INLINE arg_locked &noconvert(bool value = true) { + convert_ = !value; + return *this; + } + NB_INLINE arg_locked &none(bool value = true) { + none_ = value; + return *this; + } + NB_INLINE arg_locked &sig(const char *value) { + signature_ = value; + return *this; + } + + // Redundant extra lock() is allowed + NB_INLINE arg_locked &lock() { return *this; } }; +// Function argument descriptor that is potentially locked and has a default value +struct arg_locked_v : arg_locked { + object value; + NB_INLINE arg_locked_v(const arg_locked &base, object &&value) + : arg_locked(base), value(std::move(value)) {} + + private: + // Inherited mutators would slice off the default, and are not generally needed + using arg_locked::noconvert; + using arg_locked::none; + using arg_locked::sig; + using arg_locked::lock; +}; + +NB_INLINE arg_locked arg::lock() { return arg_locked{*this}; } + template struct call_guard { using type = detail::tuple; }; @@ -62,6 +122,7 @@ struct is_flag {}; struct is_final {}; struct is_generic {}; struct kw_only {}; +struct lock_self {}; template struct keep_alive {}; template struct supplement {}; @@ -129,13 +190,30 @@ enum class func_flags : uint32_t { has_keep_alive = (1 << 17) }; +enum cast_flags : uint8_t { + // Enable implicit conversions (code assumes this has value 1, don't reorder..) + convert = (1 << 0), + + // Passed to the 'self' argument in a constructor call (__init__) + construct = (1 << 1), + + // Indicates that the function dispatcher should accept 'None' arguments + accepts_none = (1 << 2), + + // Indicates that this cast is performed by nb::cast or nb::try_cast. + // This implies that objects added to the cleanup list may be + // released immediately after the caster's final output value is + // obtained, i.e., before it is used. 
+ manual = (1 << 3) +}; + + struct arg_data { const char *name; const char *signature; PyObject *name_py; PyObject *value; - bool convert; - bool none; + uint8_t flag; }; template struct func_data_prelim { @@ -266,27 +344,41 @@ NB_INLINE void func_extra_apply(F &, std::nullptr_t, size_t &) { } template NB_INLINE void func_extra_apply(F &f, const arg &a, size_t &index) { - arg_data &arg = f.args[index++]; + uint8_t flag = 0; + if (a.none_) + flag |= (uint8_t) cast_flags::accepts_none; + if (a.convert_) + flag |= (uint8_t) cast_flags::convert; + + arg_data &arg = f.args[index]; + arg.flag = flag; arg.name = a.name_; arg.signature = a.signature_; arg.value = nullptr; - arg.convert = a.convert_; - arg.none = a.none_; + index++; } +// arg_locked will select the arg overload; the locking is added statically +// in nb_func.h template NB_INLINE void func_extra_apply(F &f, const arg_v &a, size_t &index) { - arg_data &arg = f.args[index++]; - arg.name = a.name_; - arg.signature = a.signature_; - arg.value = a.value.ptr(); - arg.convert = a.convert_; - arg.none = a.none_; + arg_data &ad = f.args[index]; + func_extra_apply(f, (const arg &) a, index); + ad.value = a.value.ptr(); +} +template +NB_INLINE void func_extra_apply(F &f, const arg_locked_v &a, size_t &index) { + arg_data &ad = f.args[index]; + func_extra_apply(f, (const arg_locked &) a, index); + ad.value = a.value.ptr(); } template NB_INLINE void func_extra_apply(F &, kw_only, size_t &) {} +template +NB_INLINE void func_extra_apply(F &, lock_self, size_t &) {} + template NB_INLINE void func_extra_apply(F &, call_guard, size_t &) {} @@ -298,6 +390,7 @@ NB_INLINE void func_extra_apply(F &f, nanobind::keep_alive, size template struct func_extra_info { using call_guard = void; static constexpr bool keep_alive = false; + static constexpr size_t nargs_locked = 0; }; template struct func_extra_info @@ -315,6 +408,16 @@ struct func_extra_info, Ts...> : func_extra static constexpr bool keep_alive = true; }; +template +struct func_extra_info : func_extra_info { + static constexpr size_t nargs_locked = 1 + func_extra_info::nargs_locked; +}; + +template +struct func_extra_info : func_extra_info { + static constexpr size_t nargs_locked = 1 + func_extra_info::nargs_locked; +}; + template NB_INLINE void process_keep_alive(PyObject **, PyObject *, T *) { } diff --git a/include/nanobind/nb_call.h b/include/nanobind/nb_call.h index 25150b0b..dfbeb45e 100644 --- a/include/nanobind/nb_call.h +++ b/include/nanobind/nb_call.h @@ -35,6 +35,9 @@ args_proxy api::operator*() const { template NB_INLINE void call_analyze(size_t &nargs, size_t &nkwargs, const T &value) { using D = std::decay_t; + static_assert(!std::is_base_of_v, + "nb::arg().lock() may be used only when defining functions, " + "not when calling them"); if constexpr (std::is_same_v) nkwargs++; @@ -65,7 +68,7 @@ NB_INLINE void call_init(PyObject **args, PyObject *kwnames, size_t &nargs, } else if constexpr (std::is_same_v) { PyObject *key, *entry; Py_ssize_t pos = 0; - + ft_object_guard guard(value); while (PyDict_Next(value.ptr(), &pos, &key, &entry)) { Py_INCREF(key); Py_INCREF(entry); args[kwargs_offset + nkwargs] = entry; diff --git a/include/nanobind/nb_cast.h b/include/nanobind/nb_cast.h index ec73f051..74ad6cef 100644 --- a/include/nanobind/nb_cast.h +++ b/include/nanobind/nb_cast.h @@ -32,20 +32,6 @@ NAMESPACE_BEGIN(NB_NAMESPACE) NAMESPACE_BEGIN(detail) -enum cast_flags : uint8_t { - // Enable implicit conversions (impl. assumes this is 1, don't reorder..) 
- convert = (1 << 0), - - // Passed to the 'self' argument in a constructor call (__init__) - construct = (1 << 1), - - // Indicates that this cast is performed by nb::cast or nb::try_cast. - // This implies that objects added to the cleanup list may be - // released immediately after the caster's final output value is - // obtained, i.e., before it is used. - manual = (1 << 2), -}; - /** * Type casters expose a member 'Cast' which users of a type caster must * query to determine what the caster actually can (and prefers) to produce. @@ -648,6 +634,9 @@ tuple make_tuple(Args &&...args) { template arg_v arg::operator=(T &&value) const { return arg_v(*this, cast((detail::forward_t) value)); } +template arg_locked_v arg_locked::operator=(T &&value) const { + return arg_locked_v(*this, cast((detail::forward_t) value)); +} template template detail::accessor& detail::accessor::operator=(T &&value) { diff --git a/include/nanobind/nb_defs.h b/include/nanobind/nb_defs.h index c49b7bb0..071e41f9 100644 --- a/include/nanobind/nb_defs.h +++ b/include/nanobind/nb_defs.h @@ -147,6 +147,10 @@ # error "nanobind requires a newer PyPy version (>= 7.3.10)" #endif +#if defined(NB_FREE_THREADED) && !defined(Py_GIL_DISABLED) +# error "Free-threaded extensions require a free-threaded version of Python" +#endif + #if defined(NB_DOMAIN) # define NB_DOMAIN_STR NB_TOSTRING(NB_DOMAIN) #else @@ -169,6 +173,11 @@ # define NB_TYPE_GET_SLOT_IMPL 1 #endif +#define NB_NONCOPYABLE(X) \ + X(const X &) = delete; \ + X &operator=(const X &) = delete; + + #define NB_MODULE_IMPL(name) \ extern "C" [[maybe_unused]] NB_EXPORT PyObject *PyInit_##name(); \ extern "C" NB_EXPORT PyObject *PyInit_##name() diff --git a/include/nanobind/nb_func.h b/include/nanobind/nb_func.h index 22684050..1cd98a1b 100644 --- a/include/nanobind/nb_func.h +++ b/include/nanobind/nb_func.h @@ -32,9 +32,40 @@ bool from_python_keep_alive(Caster &c, PyObject **args, uint8_t *args_flags, template constexpr size_t count_args_before_index(std::index_sequence) { static_assert(sizeof...(Is) == sizeof...(Ts)); - return ((Is < I && (std::is_same_v || std::is_same_v)) + ... + 0); + return ((Is < I && std::is_base_of_v) + ... + 0); } +#if defined(NB_FREE_THREADED) +struct ft_args_collector { + PyObject **args; + handle h1; + handle h2; + size_t index = 0; + + NB_INLINE explicit ft_args_collector(PyObject **a) : args(a) {} + NB_INLINE void apply(arg_locked *) { + if (h1.ptr() == nullptr) + h1 = args[index]; + h2 = args[index]; + ++index; + } + NB_INLINE void apply(arg *) { ++index; } + NB_INLINE void apply(...) {} +}; + +struct ft_args_guard { + NB_INLINE void lock(const ft_args_collector& info) { + PyCriticalSection2_Begin(&cs, info.h1.ptr(), info.h2.ptr()); + } + ~ft_args_guard() { + PyCriticalSection2_End(&cs); + } + PyCriticalSection2 cs; +}; +#endif + +struct no_guard {}; + template NB_INLINE PyObject *func_create(Func &&func, Return (*)(Args...), @@ -66,13 +97,21 @@ NB_INLINE PyObject *func_create(Func &&func, Return (*)(Args...), // Determine the number of nb::arg/nb::arg_v annotations constexpr size_t nargs_provided = - ((std::is_same_v + std::is_same_v) + ... + 0); + (std::is_base_of_v + ... + 0); constexpr bool is_method_det = (std::is_same_v + ... + 0) != 0; constexpr bool is_getter_det = (std::is_same_v + ... + 0) != 0; constexpr bool has_arg_annotations = nargs_provided > 0 && !is_getter_det; + // Determine the number of potentially-locked function arguments + constexpr bool lock_self_det = + (std::is_same_v + ... 
+ 0) != 0; + static_assert(Info::nargs_locked <= 2, + "At most two function arguments can be locked"); + static_assert(!(lock_self_det && !is_method_det), + "The nb::lock_self() annotation only applies to methods"); + // Detect location of nb::kw_only annotation, if supplied. As with args/kwargs // we find the first and last location and later verify they match each other. // Note this is an index in Extra... while args/kwargs_pos_* are indices in @@ -187,6 +226,21 @@ NB_INLINE PyObject *func_create(Func &&func, Return (*)(Args...), tuple...> in; (void) in; +#if defined(NB_FREE_THREADED) + std::conditional_t guard; + if constexpr (Info::nargs_locked) { + ft_args_collector collector{args}; + if constexpr (is_method_det) { + if constexpr (lock_self_det) + collector.apply((arg_locked *) nullptr); + else + collector.apply((arg *) nullptr); + } + (collector.apply((Extra *) nullptr), ...); + guard.lock(collector); + } +#endif + if constexpr (Info::keep_alive) { if ((!from_python_keep_alive(in.template get(), args, args_flags, cleanup, Is) || ...)) diff --git a/include/nanobind/nb_lib.h b/include/nanobind/nb_lib.h index 358ebf83..3541bce8 100644 --- a/include/nanobind/nb_lib.h +++ b/include/nanobind/nb_lib.h @@ -555,6 +555,8 @@ NB_CORE bool is_alive() noexcept; NB_CORE void *type_get_slot(PyTypeObject *t, int slot_id); #endif +NB_CORE PyObject *dict_get_item_ref_or_fail(PyObject *d, PyObject *k); + NAMESPACE_END(detail) using detail::raise; diff --git a/include/nanobind/nb_misc.h b/include/nanobind/nb_misc.h index bd167d0b..b29d6b69 100644 --- a/include/nanobind/nb_misc.h +++ b/include/nanobind/nb_misc.h @@ -11,26 +11,76 @@ NAMESPACE_BEGIN(NB_NAMESPACE) struct gil_scoped_acquire { public: + NB_NONCOPYABLE(gil_scoped_acquire) gil_scoped_acquire() noexcept : state(PyGILState_Ensure()) { } ~gil_scoped_acquire() { PyGILState_Release(state); } - gil_scoped_acquire(const gil_scoped_acquire &) = delete; - gil_scoped_acquire& operator=(const gil_scoped_acquire &) = delete; private: const PyGILState_STATE state; }; -class gil_scoped_release { +struct gil_scoped_release { public: + NB_NONCOPYABLE(gil_scoped_release) gil_scoped_release() noexcept : state(PyEval_SaveThread()) { } ~gil_scoped_release() { PyEval_RestoreThread(state); } - gil_scoped_release(const gil_scoped_release &) = delete; - gil_scoped_release& operator=(const gil_scoped_release &) = delete; private: PyThreadState *state; }; +struct ft_mutex { +public: + NB_NONCOPYABLE(ft_mutex) + ft_mutex() = default; + +#if !defined(NB_FREE_THREADED) + void lock() { } + void unlock() { } +#else + void lock() { PyMutex_Lock(&mutex); } + void unlock() { PyMutex_Unlock(&mutex); } +private: + PyMutex mutex { 0 }; +#endif +}; + +struct ft_lock_guard { +public: + NB_NONCOPYABLE(ft_lock_guard) + ft_lock_guard(ft_mutex &m) : m(m) { m.lock(); } + ~ft_lock_guard() { m.unlock(); } +private: + ft_mutex &m; +}; + + +struct ft_object_guard { +public: + NB_NONCOPYABLE(ft_object_guard) +#if !defined(NB_FREE_THREADED) + ft_object_guard(handle) { } +#else + ft_object_guard(handle h) { PyCriticalSection_Begin(&cs, h.ptr()); } + ~ft_object_guard() { PyCriticalSection_End(&cs); } +private: + PyCriticalSection cs; +#endif +}; + +struct ft_object2_guard { +public: + NB_NONCOPYABLE(ft_object2_guard) +#if !defined(NB_FREE_THREADED) + ft_object2_guard(handle, handle) { } +#else + ft_object2_guard(handle h1, handle h2) { PyCriticalSection2_Begin(&cs, h1.ptr(), h2.ptr()); } + ~ft_object2_guard() { PyCriticalSection2_End(&cs); } +private: + PyCriticalSection2 cs; +#endif +}; + 
inline bool leak_warnings() noexcept { return detail::leak_warnings(); } diff --git a/include/nanobind/nb_types.h b/include/nanobind/nb_types.h index 68a16a2a..d2645593 100644 --- a/include/nanobind/nb_types.h +++ b/include/nanobind/nb_types.h @@ -819,24 +819,29 @@ struct fast_iterator { class dict_iterator { public: + NB_NONCOPYABLE(dict_iterator) + using value_type = std::pair; using reference = const value_type; - dict_iterator() : h(), pos(-1) { } - + dict_iterator() = default; dict_iterator(handle h) : h(h), pos(0) { +#if defined(NB_FREE_THREADED) + PyCriticalSection_Begin(&cs, h.ptr()); +#endif increment(); } - dict_iterator& operator++() { - increment(); - return *this; +#if defined(NB_FREE_THREADED) + ~dict_iterator() { + if (h.ptr()) + PyCriticalSection_End(&cs); } +#endif - dict_iterator operator++(int) { - dict_iterator rv = *this; + dict_iterator& operator++() { increment(); - return rv; + return *this; } void increment() { @@ -851,8 +856,12 @@ class dict_iterator { private: handle h; - Py_ssize_t pos; - PyObject *key = nullptr, *value = nullptr; + Py_ssize_t pos = -1; + PyObject *key = nullptr; + PyObject *value = nullptr; +#if defined(NB_FREE_THREADED) + PyCriticalSection cs { }; +#endif }; NB_IMPL_COMP(equal, Py_EQ) diff --git a/include/nanobind/stl/detail/nb_dict.h b/include/nanobind/stl/detail/nb_dict.h index eb0e2dea..24f77ea2 100644 --- a/include/nanobind/stl/detail/nb_dict.h +++ b/include/nanobind/stl/detail/nb_dict.h @@ -32,6 +32,8 @@ template struct dict_caster { return false; } + // 'items' is safe to access without locking and reference counting, it + // is unique to this thread Py_ssize_t size = NB_LIST_GET_SIZE(items); bool success = size >= 0; diff --git a/src/common.cpp b/src/common.cpp index 67db5a83..02bc02ad 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -143,6 +143,11 @@ PyObject *module_new(const char *name, PyModuleDef *def) noexcept { def->m_name = name; def->m_size = -1; PyObject *m = PyModule_Create(def); + + #ifdef NB_FREE_THREADED + PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED); + #endif + check(m, "nanobind::detail::module_new(): allocation failed!"); return m; } @@ -163,53 +168,46 @@ PyObject *module_import(PyObject *o) { PyObject *module_new_submodule(PyObject *base, const char *name, const char *doc) noexcept { - PyObject *name_py, *res; + const char *base_name, *tmp_str; + Py_ssize_t tmp_size = 0; + object tmp, res; -#if !defined(PYPY_VERSION) - PyObject *base_name = PyModule_GetNameObject(base); + base_name = PyModule_GetName(base); if (!base_name) goto fail; - name_py = PyUnicode_FromFormat("%U.%s", base_name, name); - Py_DECREF(base_name); -#else - const char *base_name = PyModule_GetName(base); - if (!base_name) + tmp = steal(PyUnicode_FromFormat("%s.%s", base_name, name)); + if (!tmp.is_valid()) goto fail; - name_py = PyUnicode_FromFormat("%s.%s", base_name, name); -#endif - if (!name_py) + tmp_str = PyUnicode_AsUTF8AndSize(tmp.ptr(), &tmp_size); + if (!tmp_str) goto fail; -#if !defined(PYPY_VERSION) - res = PyImport_AddModuleObject(name_py); +#if PY_VERSION_HEX < 0x030D00A0 || defined(Py_LIMITED_API) + res = borrow(PyImport_AddModule(tmp_str)); #else - res = PyImport_AddModule(PyUnicode_AsUTF8(name_py)); + res = steal(PyImport_AddModuleRef(tmp_str)); #endif - Py_DECREF(name_py); - if (!res) + + if (!res.is_valid()) goto fail; if (doc) { - PyObject *doc_py = PyUnicode_FromString(doc); - if (!doc_py) + tmp = steal(PyUnicode_FromString(doc)); + if (!tmp.is_valid()) goto fail; - int rv = PyObject_SetAttrString(res, "__doc__", doc_py); - 
Py_DECREF(doc_py); - if (rv) + if (PyObject_SetAttrString(res.ptr(), "__doc__", tmp.ptr())) goto fail; } - Py_INCREF(res); // extra reference for PyModule_AddObject - - if (PyModule_AddObject(base, name, res)) { // steals on success - Py_DECREF(res); + res.inc_ref(); // For PyModule_AddObject, which steals upon success + if (PyModule_AddObject(base, name, res.ptr())) { + res.dec_ref(); goto fail; } - Py_INCREF(res); // turned borrowed into new reference - return res; + return res.release().ptr(); fail: raise_python_error(); @@ -669,13 +667,15 @@ PyObject **seq_get(PyObject *seq, size_t *size_out, PyObject **temp_out) noexcep still trigger a segfault if dereferenced. */ if (size == 0) result = (PyObject **) 1; +# if !defined(NB_FREE_THREADED) // Require immutable holder in free-threaded mode } else if (PyList_CheckExact(seq)) { size = (size_t) PyList_GET_SIZE(seq); result = ((PyListObject *) seq)->ob_item; if (size == 0) // ditto result = (PyObject **) 1; +# endif } else if (PySequence_Check(seq)) { - temp = PySequence_Fast(seq, ""); + temp = PySequence_Tuple(seq); if (temp) result = seq_get(temp, &size, temp_out); @@ -689,8 +689,8 @@ PyObject **seq_get(PyObject *seq, size_t *size_out, PyObject **temp_out) noexcep Py_ssize_t size_seq = PySequence_Length(seq); if (size_seq >= 0) { - result = (PyObject **) PyObject_Malloc(sizeof(PyObject *) * - (size_seq + 1)); + result = (PyObject **) PyMem_Malloc(sizeof(PyObject *) * + (size_seq + 1)); if (result) { result[size_seq] = nullptr; @@ -704,7 +704,7 @@ PyObject **seq_get(PyObject *seq, size_t *size_out, PyObject **temp_out) noexcep for (Py_ssize_t j = 0; j < i; ++j) Py_DECREF(result[j]); - PyObject_Free(result); + PyMem_Free(result); result = nullptr; break; } @@ -716,7 +716,7 @@ PyObject **seq_get(PyObject *seq, size_t *size_out, PyObject **temp_out) noexcep PyObject **ptr = (PyObject **) PyCapsule_GetPointer(o, nullptr); for (size_t i = 0; ptr[i] != nullptr; ++i) Py_DECREF(ptr[i]); - PyObject_Free(ptr); + PyMem_Free(ptr); }); if (temp) { @@ -726,7 +726,7 @@ PyObject **seq_get(PyObject *seq, size_t *size_out, PyObject **temp_out) noexcep for (Py_ssize_t i = 0; i < size_seq; ++i) Py_DECREF(result[i]); - PyObject_Free(result); + PyMem_Free(result); result = nullptr; } } @@ -763,14 +763,16 @@ PyObject **seq_get_with_size(PyObject *seq, size_t size, if (size == 0) result = (PyObject **) 1; } +# if !defined(NB_FREE_THREADED) // Require immutable holder in free-threaded mode } else if (PyList_CheckExact(seq)) { if (size == (size_t) PyList_GET_SIZE(seq)) { result = ((PyListObject *) seq)->ob_item; if (size == 0) // ditto result = (PyObject **) 1; } +# endif } else if (PySequence_Check(seq)) { - temp = PySequence_Fast(seq, ""); + temp = PySequence_Tuple(seq); if (temp) result = seq_get_with_size(temp, size, temp_out); @@ -785,7 +787,7 @@ PyObject **seq_get_with_size(PyObject *seq, size_t size, if (size == (size_t) size_seq) { result = - (PyObject **) PyObject_Malloc(sizeof(PyObject *) * (size + 1)); + (PyObject **) PyMem_Malloc(sizeof(PyObject *) * (size + 1)); if (result) { result[size] = nullptr; @@ -799,7 +801,7 @@ PyObject **seq_get_with_size(PyObject *seq, size_t size, for (Py_ssize_t j = 0; j < i; ++j) Py_DECREF(result[j]); - PyObject_Free(result); + PyMem_Free(result); result = nullptr; break; } @@ -811,7 +813,7 @@ PyObject **seq_get_with_size(PyObject *seq, size_t size, PyObject **ptr = (PyObject **) PyCapsule_GetPointer(o, nullptr); for (size_t i = 0; ptr[i] != nullptr; ++i) Py_DECREF(ptr[i]); - PyObject_Free(ptr); + PyMem_Free(ptr); }); if 
(!temp) { @@ -819,7 +821,7 @@ PyObject **seq_get_with_size(PyObject *seq, size_t size, for (Py_ssize_t i = 0; i < size_seq; ++i) Py_DECREF(result[i]); - PyObject_Free(result); + PyMem_Free(result); result = nullptr; } } @@ -1165,5 +1167,40 @@ bool issubclass(PyObject *a, PyObject *b) { return bool(rv); } +// ======================================================================== + +/// Make an object immortal when targeting free-threaded Python +void maybe_make_immortal(PyObject *op) { +#ifdef NB_FREE_THREADED + // See CPython's Objects/object.c + if (PyObject_IS_GC(op)) + PyObject_GC_UnTrack(op); + op->ob_tid = _Py_UNOWNED_TID; + op->ob_ref_local = _Py_IMMORTAL_REFCNT_LOCAL; + op->ob_ref_shared = 0; +#else + (void) op; +#endif +} + +// ======================================================================== + +PyObject *dict_get_item_ref_or_fail(PyObject *d, PyObject *k) { + PyObject *value; + bool error = false; + +#if PY_VERSION_HEX < 0x030D00A1 || defined(Py_LIMITED_API) + value = PyDict_GetItemWithError(d, k); + if (value) + Py_INCREF(value); + else + error = PyErr_Occurred(); +#else + error = PyDict_GetItemRef(d, k, &value) == -1; +#endif + check(!error, "nanobind::detail::dict_get_item_ref_or_fail(): dictionary lookup failed!"); + return value; +} + NAMESPACE_END(detail) NAMESPACE_END(NB_NAMESPACE) diff --git a/src/error.cpp b/src/error.cpp index 1a36fe85..5d1d6666 100644 --- a/src/error.cpp +++ b/src/error.cpp @@ -15,6 +15,7 @@ NAMESPACE_BEGIN(NB_NAMESPACE) NAMESPACE_BEGIN(detail) +// Protected by internals->mutex in free-threaded builds Buffer buf(128); NAMESPACE_END(detail) @@ -116,13 +117,15 @@ void python_error::restore() noexcept { #endif const char *python_error::what() const noexcept { - using detail::buf; + using namespace nanobind::detail; // Return the existing error message if already computed once if (m_what) return m_what; gil_scoped_acquire acq; + // 'buf' is protected by internals->mutex in free-threaded builds + lock_internals guard(internals); // Try again with GIL held if (m_what) @@ -147,7 +150,7 @@ const char *python_error::what() const noexcept { #if defined(Py_LIMITED_API) || defined(PYPY_VERSION) object mod = module_::import_("traceback"), result = mod.attr("format_exception")(exc_type, exc_value, exc_traceback); - m_what = detail::strdup_check(borrow(str("\n").attr("join")(result)).c_str()); + m_what = strdup_check(borrow(str("\n").attr("join")(result)).c_str()); #else buf.clear(); if (exc_traceback.is_valid()) { @@ -160,7 +163,7 @@ const char *python_error::what() const noexcept { PyFrameObject *frame = to->tb_frame; Py_XINCREF(frame); - std::vector> frames; + std::vector> frames; while (frame) { frames.push_back(frame); diff --git a/src/implicit.cpp b/src/implicit.cpp index 721ddbc8..10702a06 100644 --- a/src/implicit.cpp +++ b/src/implicit.cpp @@ -15,10 +15,12 @@ NAMESPACE_BEGIN(detail) void implicitly_convertible(const std::type_info *src, const std::type_info *dst) noexcept { - type_data *t = nb_type_c2p(internals, dst); + nb_internals *internals_ = internals; + type_data *t = nb_type_c2p(internals_, dst); check(t, "nanobind::detail::implicitly_convertible(src=%s, dst=%s): " "destination type unknown!", type_name(src), type_name(dst)); + lock_internals guard(internals_); size_t size = 0; if (t->flags & (uint32_t) type_flags::has_implicit_conversions) { @@ -30,23 +32,25 @@ void implicitly_convertible(const std::type_info *src, t->flags |= (uint32_t) type_flags::has_implicit_conversions; } - void **data = (void **) malloc(sizeof(void *) * (size + 2)); + 
void **data = (void **) PyMem_Malloc(sizeof(void *) * (size + 2)); if (size) memcpy(data, t->implicit.cpp, size * sizeof(void *)); data[size] = (void *) src; data[size + 1] = nullptr; - free(t->implicit.cpp); + PyMem_Free(t->implicit.cpp); t->implicit.cpp = (decltype(t->implicit.cpp)) data; } void implicitly_convertible(bool (*predicate)(PyTypeObject *, PyObject *, cleanup_list *), const std::type_info *dst) noexcept { - type_data *t = nb_type_c2p(internals, dst); + nb_internals *internals_ = internals; + type_data *t = nb_type_c2p(internals_, dst); check(t, "nanobind::detail::implicitly_convertible(src=<predicate>, dst=%s): " "destination type unknown!", type_name(dst)); + lock_internals guard(internals_); size_t size = 0; if (t->flags & (uint32_t) type_flags::has_implicit_conversions) { @@ -58,12 +62,12 @@ void implicitly_convertible(bool (*predicate)(PyTypeObject *, PyObject *, t->flags |= (uint32_t) type_flags::has_implicit_conversions; } - void **data = (void **) malloc(sizeof(void *) * (size + 2)); + void **data = (void **) PyMem_Malloc(sizeof(void *) * (size + 2)); if (size) memcpy(data, t->implicit.py, size * sizeof(void *)); data[size] = (void *) predicate; data[size + 1] = nullptr; - free(t->implicit.py); + PyMem_Free(t->implicit.py); t->implicit.py = (decltype(t->implicit.py)) data; } diff --git a/src/nb_enum.cpp b/src/nb_enum.cpp index 47c344b8..425008c8 100644 --- a/src/nb_enum.cpp +++ b/src/nb_enum.cpp @@ -17,12 +17,21 @@ using enum_map = tsl::robin_map; PyObject *enum_create(enum_init_data *ed) noexcept { // Update hash table that maps from std::type_info to Python type - auto [it, success] = internals->type_c2p_slow.try_emplace(ed->type, nullptr); - if (!success) { - PyErr_WarnFormat(PyExc_RuntimeWarning, 1, "nanobind: type '%s' was already registered!\n", ed->name); - PyObject *tp = (PyObject *) it->second->type_py; - Py_INCREF(tp); - return tp; + nb_internals *internals_ = internals; + bool success; + nb_type_map_slow::iterator it; + + { + lock_internals guard(internals_); + std::tie(it, success) = internals_->type_c2p_slow.try_emplace(ed->type, nullptr); + if (!success) { + PyErr_WarnFormat(PyExc_RuntimeWarning, 1, + "nanobind: type '%s' was already registered!\n", + ed->name); + PyObject *tp = (PyObject *) it->second->type_py; + Py_INCREF(tp); + return tp; + } } handle scope(ed->scope); @@ -77,15 +86,23 @@ PyObject *enum_create(enum_init_data *ed) noexcept { it.value() = t; - internals->type_c2p_fast[ed->type] = t; - internals->type_c2p_slow[ed->type] = t; + { + lock_internals guard(internals_); + internals_->type_c2p_slow[ed->type] = t; + + #if !defined(NB_FREE_THREADED) + internals_->type_c2p_fast[ed->type] = t; + #endif + } + + maybe_make_immortal(result.ptr()); result.attr("__nb_enum__") = capsule(t, [](void *p) noexcept { type_init_data *t = (type_init_data *) p; delete (enum_map *) t->enum_tbl.fwd; delete (enum_map *) t->enum_tbl.rev; nb_type_unregister(t); - free((char*)t->name); + free((char*) t->name); delete t; }); diff --git a/src/nb_func.cpp b/src/nb_func.cpp index 0965e6a9..5c1820c1 100644 --- a/src/nb_func.cpp +++ b/src/nb_func.cpp @@ -82,11 +82,14 @@ void nb_func_dealloc(PyObject *self) { func_data *f = nb_func_data(self); // Delete from registered function list +#if !defined(NB_FREE_THREADED) size_t n_deleted = internals->funcs.erase(self); check(n_deleted == 1, "nanobind::detail::nb_func_dealloc(\"%s\"): function not found!", ((f->flags & (uint32_t) func_flags::has_name) ?
f->name - : "")); + : "")); +#endif + for (size_t i = 0; i < size; ++i) { if (f->flags & (uint32_t) func_flags::has_free) f->free_capture(f->capture); @@ -138,8 +141,8 @@ void nb_bound_method_dealloc(PyObject *self) { } static arg_data method_args[2] = { - { "self", nullptr, nullptr, nullptr, false, false }, - { nullptr, nullptr, nullptr, nullptr, false, false } + { "self", nullptr, nullptr, nullptr, 0 }, + { nullptr, nullptr, nullptr, nullptr, 0 } }; static bool set_builtin_exception_status(builtin_exception &e) { @@ -196,6 +199,8 @@ PyObject *nb_func_new(const void *in_) noexcept { bool has_scope = f->flags & (uint32_t) func_flags::has_scope, has_name = f->flags & (uint32_t) func_flags::has_name, has_args = f->flags & (uint32_t) func_flags::has_args, + has_var_args = f->flags & (uint32_t) func_flags::has_var_args, + has_var_kwargs = f->flags & (uint32_t) func_flags::has_var_kwargs, has_keep_alive = f->flags & (uint32_t) func_flags::has_keep_alive, has_doc = f->flags & (uint32_t) func_flags::has_doc, has_signature = f->flags & (uint32_t) func_flags::has_signature, @@ -219,14 +224,15 @@ PyObject *nb_func_new(const void *in_) noexcept { } // Check for previous overloads + nb_internals *internals_ = internals; if (has_scope && has_name) { name = PyUnicode_InternFromString(name_cstr); check(name, "nb::detail::nb_func_new(\"%s\"): invalid name.", name_cstr); func_prev = PyObject_GetAttr(f->scope, name); if (func_prev) { - if (Py_TYPE(func_prev) == internals->nb_func || - Py_TYPE(func_prev) == internals->nb_method) { + if (Py_TYPE(func_prev) == internals_->nb_func || + Py_TYPE(func_prev) == internals_->nb_method) { func_data *fp = nb_func_data(func_prev); check((fp->flags & (uint32_t) func_flags::is_method) == @@ -268,7 +274,7 @@ PyObject *nb_func_new(const void *in_) noexcept { if (is_constructor && f->nargs == 2 && f->descr_types[0] && f->descr_types[0] == f->descr_types[1]) { if (has_args) { - f->args[0].convert = false; + f->args[0].flag &= ~(uint8_t) cast_flags::convert; } else { args_in = method_args + 1; has_args = true; @@ -279,17 +285,26 @@ PyObject *nb_func_new(const void *in_) noexcept { // Create a new function and destroy the old one Py_ssize_t to_copy = func_prev ? Py_SIZE(func_prev) : 0; nb_func *func = (nb_func *) PyType_GenericAlloc( - is_method ? internals->nb_method : internals->nb_func, to_copy + 1); + is_method ? internals_->nb_method : internals_->nb_func, to_copy + 1); check(func, "nb::detail::nb_func_new(\"%s\"): alloc.
failed (1).", name_cstr); - func->max_nargs = f->nargs; - func->complex_call = f->nargs_pos < f->nargs || has_args || has_keep_alive; + maybe_make_immortal((PyObject *) func); + + // Check if the complex dispatch loop is needed + bool complex_call = has_keep_alive || has_var_kwargs || has_var_args || f->nargs >= NB_MAXARGS_SIMPLE; + if (has_args) { + for (size_t i = 0; i < f->nargs; ++i) { + arg_data &a = f->args[i]; + complex_call |= a.name != nullptr || a.value != nullptr || + a.flag != cast_flags::convert; + } + } + uint32_t max_nargs = f->nargs; if (func_prev) { - func->complex_call |= ((nb_func *) func_prev)->complex_call; - func->max_nargs = std::max(func->max_nargs, - ((nb_func *) func_prev)->max_nargs); + complex_call |= ((nb_func *) func_prev)->complex_call; + max_nargs = std::max(max_nargs, ((nb_func *) func_prev)->max_nargs); func_data *cur = nb_func_data(func), *prev = nb_func_data(func_prev); @@ -299,29 +314,32 @@ PyObject *nb_func_new(const void *in_) noexcept { ((PyVarObject *) func_prev)->ob_size = 0; - size_t n_deleted = internals->funcs.erase(func_prev); +#if !defined(NB_FREE_THREADED) + size_t n_deleted = internals_->funcs.erase(func_prev); check(n_deleted == 1, "nanobind::detail::nb_func_new(): internal update failed (1)!"); +#endif Py_CLEAR(func_prev); } - func->complex_call |= func->max_nargs >= NB_MAXARGS_SIMPLE; - - func->vectorcall = func->complex_call ? nb_func_vectorcall_complex - : nb_func_vectorcall_simple; + func->max_nargs = max_nargs; + func->complex_call = complex_call; + func->vectorcall = complex_call ? nb_func_vectorcall_complex + : nb_func_vectorcall_simple; +#if !defined(NB_FREE_THREADED) // Register the function - auto [it, success] = internals->funcs.try_emplace(func, nullptr); + auto [it, success] = internals_->funcs.try_emplace(func, nullptr); check(success, "nanobind::detail::nb_func_new(): internal update failed (2)!"); +#endif func_data *fc = nb_func_data(func) + to_copy; memcpy(fc, f, sizeof(func_data_prelim<0>)); if (has_doc) { - if (fc->doc[0] == '\n') { + if (fc->doc[0] == '\n') fc->doc++; - } fc->doc = strdup_check(fc->doc); } @@ -381,7 +399,8 @@ PyObject *nb_func_new(const void *in_) noexcept { } else { a.name_py = nullptr; } - a.none |= a.value == Py_None; + if (a.value == Py_None) + a.flag |= (uint8_t) cast_flags::accepts_none; a.signature = a.signature ? strdup_check(a.signature) : nullptr; Py_XINCREF(a.value); } @@ -427,6 +446,9 @@ nb_func_error_overload(PyObject *self, PyObject *const *args_in, if (f->flags & (uint32_t) func_flags::is_operator) return not_implemented().release().ptr(); + // The buffer 'buf' is protected by 'internals.mutex' + lock_internals guard(internals); + buf.clear(); buf.put_dstr(f->name); buf.put("(): incompatible function arguments. The following argument types " @@ -486,6 +508,10 @@ static NB_NOINLINE PyObject *nb_func_error_noconvert(PyObject *self, if (PyErr_Occurred()) return nullptr; func_data *f = nb_func_data(self); + + // The buffer 'buf' is protected by 'internals.mutex' + lock_internals guard(internals); + buf.clear(); buf.put("Unable to convert function return value to a Python " "type! The signature was\n "); @@ -619,7 +645,7 @@ static PyObject *nb_func_vectorcall_complex(PyObject *self, until we get a result other than NB_NEXT_OVERLOAD. */ - for (int pass = (count > 1) ? 0 : 1; pass < 2; ++pass) { + for (size_t pass = (count > 1) ? 
0 : 1; pass < 2; ++pass) { for (size_t k = 0; k < count; ++k) { const func_data *f = fr + k; @@ -655,8 +681,8 @@ static PyObject *nb_func_vectorcall_complex(PyObject *self, continue; // skip nb::args parameter, will be handled below PyObject *arg = nullptr; - bool arg_convert = pass == 1, - arg_none = false; + + uint8_t arg_flag = 1; // If i >= nargs_pos, then this is a keyword-only parameter. // (We skipped any *args parameter using the test above, @@ -690,16 +716,15 @@ static PyObject *nb_func_vectorcall_complex(PyObject *self, if (!arg) arg = ad.value; - - arg_convert &= ad.convert; - arg_none = ad.none; + arg_flag = ad.flag; } - if (!arg || (arg == Py_None && !arg_none)) + if (!arg || (arg == Py_None && (arg_flag & cast_flags::accepts_none) == 0)) break; + // Implicit conversion only active in the 2nd pass + args_flags[i] = arg_flag & ~uint8_t(pass == 0); args[i] = arg; - args_flags[i] = arg_convert ? (uint8_t) cast_flags::convert : (uint8_t) 0; } // Skip this overload if any arguments were unavailable @@ -742,37 +767,32 @@ static PyObject *nb_func_vectorcall_complex(PyObject *self, continue; } + if (is_constructor) - args_flags[0] = (uint8_t) cast_flags::construct; + args_flags[0] |= (uint8_t) cast_flags::construct; + + rv_policy policy = (rv_policy) (f->flags & 0b111); try { + result = nullptr; + // Found a suitable overload, let's try calling it result = f->impl((void *) f->capture, args, args_flags, - (rv_policy) (f->flags & 0b111), &cleanup); + policy, &cleanup); - if (NB_UNLIKELY(!result)) { + if (NB_UNLIKELY(!result)) error_handler = nb_func_error_noconvert; - goto done; - } } catch (builtin_exception &e) { - if (set_builtin_exception_status(e)) { - result = nullptr; - goto done; - } else { + if (!set_builtin_exception_status(e)) result = NB_NEXT_OVERLOAD; - } } catch (python_error &e) { e.restore(); - result = nullptr; - goto done; } catch (...) { nb_func_convert_cpp_exception(); - result = nullptr; - goto done; } if (result != NB_NEXT_OVERLOAD) { - if (is_constructor) { + if (is_constructor && result != nullptr) { nb_inst *self_arg_nb = (nb_inst *) self_arg; self_arg_nb->destruct = true; self_arg_nb->state = nb_inst::state_ready; @@ -832,7 +852,7 @@ static PyObject *nb_func_vectorcall_simple(PyObject *self, goto done; } - for (int pass = (count > 1) ? 0 : 1; pass < 2; ++pass) { + for (size_t pass = (count > 1) ? 0 : 1; pass < 2; ++pass) { for (int i = 0; i < NB_MAXARGS_SIMPLE; ++i) args_flags[i] = (uint8_t) pass; @@ -846,34 +866,26 @@ static PyObject *nb_func_vectorcall_simple(PyObject *self, continue; try { + result = nullptr; + // Found a suitable overload, let's try calling it result = f->impl((void *) f->capture, (PyObject **) args_in, args_flags, (rv_policy) (f->flags & 0b111), &cleanup); - if (NB_UNLIKELY(!result)) { + if (NB_UNLIKELY(!result)) error_handler = nb_func_error_noconvert; - goto done; - } } catch (builtin_exception &e) { - if (set_builtin_exception_status(e)) { - result = nullptr; - goto done; - } else { + if (!set_builtin_exception_status(e)) result = NB_NEXT_OVERLOAD; - } } catch (python_error &e) { e.restore(); - result = nullptr; - goto done; } catch (...) 
{ nb_func_convert_cpp_exception(); - result = nullptr; - goto done; } if (result != NB_NEXT_OVERLOAD) { - if (is_constructor) { + if (is_constructor && result != nullptr) { nb_inst *self_arg_nb = (nb_inst *) self_arg; self_arg_nb->destruct = true; self_arg_nb->state = nb_inst::state_ready; @@ -961,8 +973,8 @@ PyObject *nb_method_descr_get(PyObject *self, PyObject *inst, PyObject *) { } } - -/// Render the function signature of a single function +/// Render the function signature of a single function. Callers must hold the +/// 'internals' mutex. static uint32_t nb_func_render_signature(const func_data *f, bool nb_signature_mode) noexcept { const bool is_method = f->flags & (uint32_t) func_flags::is_method, @@ -971,6 +983,7 @@ static uint32_t nb_func_render_signature(const func_data *f, has_var_kwargs = f->flags & (uint32_t) func_flags::has_var_kwargs, has_signature = f->flags & (uint32_t) func_flags::has_signature; + nb_internals *internals_ = internals; if (has_signature) { const char *s = f->signature; @@ -1064,8 +1077,9 @@ static uint32_t nb_func_render_signature(const func_data *f, } buf.put(": "); - if (has_args && f->args[arg_index].none) { - #if PY_VERSION_HEX < 0x030A0000 + if (has_args && f->args[arg_index].flag & + (uint8_t) cast_flags::accepts_none) { +#if PY_VERSION_HEX < 0x030A0000 buf.put("typing.Optional["); #else // See below @@ -1077,7 +1091,7 @@ static uint32_t nb_func_render_signature(const func_data *f, case '}': // Default argument if (has_args) { - if (f->args[arg_index].none) { + if (f->args[arg_index].flag & (uint8_t) cast_flags::accepts_none) { #if PY_VERSION_HEX < 0x030A0000 buf.put(']'); #else @@ -1086,7 +1100,7 @@ static uint32_t nb_func_render_signature(const func_data *f, } if (f->args[arg_index].value) { - const arg_data &arg= f->args[arg_index]; + const arg_data &arg = f->args[arg_index]; if (nb_signature_mode) { buf.put(" = \\"); if (arg.signature) @@ -1096,8 +1110,12 @@ static uint32_t nb_func_render_signature(const func_data *f, buf.put(" = "); buf.put_dstr(arg.signature); } else { - PyObject *o = arg.value; - PyObject *str = PyObject_Repr(o); + PyObject *o = arg.value, *str; + + { + unlock_internals guard2(internals_); + str = PyObject_Repr(o); + } if (str) { Py_ssize_t size = 0; @@ -1129,14 +1147,17 @@ static uint32_t nb_func_render_signature(const func_data *f, "nb::detail::nb_func_render_signature(): missing type!"); if (!(is_method && arg_index == 0)) { - auto it = internals->type_c2p_slow.find(*descr_type); + bool found = false; + auto it = internals_->type_c2p_slow.find(*descr_type); - if (it != internals->type_c2p_slow.end()) { + if (it != internals_->type_c2p_slow.end()) { handle th((PyObject *) it->second->type_py); buf.put_dstr((borrow<str>(th.attr("__module__"))).c_str()); buf.put('.'); buf.put_dstr((borrow<str>(th.attr("__qualname__"))).c_str()); - } else { + found = true; + } + if (!found) { if (nb_signature_mode) buf.put('"'); char *name = type_name(*descr_type); @@ -1227,6 +1248,9 @@ PyObject *nb_func_get_nb_signature(PyObject *self, void *) { Py_INCREF(docstr); } + // The buffer 'buf' is protected by 'internals.mutex' + lock_internals guard(internals); + buf.clear(); uint32_t n_default_args = nb_func_render_signature(fi, true); @@ -1285,6 +1309,9 @@ PyObject *nb_func_get_doc(PyObject *self, void *) { func_data *f = nb_func_data(self); + // The buffer 'buf' is protected by 'internals.mutex' + lock_internals guard(internals); + buf.clear(); size_t doc_count = 0;
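The refactored dispatch path in nb_func.cpp above replaces the old per-argument 'convert'/'none' booleans with a packed flag byte that is masked once per dispatch pass. The following standalone sketch is not part of the patch and uses illustrative bit values (see nanobind's cast_flags enumeration for the authoritative ones); it only demonstrates how the `arg_flag & ~uint8_t(pass == 0)` expression disables implicit conversion during the first pass:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // Illustrative stand-ins for nanobind's cast_flags bits
        const uint8_t convert = 1, accepts_none = 4;

        // Flags for a hypothetical argument that accepts None and permits
        // implicit conversion
        uint8_t arg_flag = convert | accepts_none;

        for (size_t pass = 0; pass < 2; ++pass) {
            // '~uint8_t(pass == 0)' clears bit 0 in the first pass, so
            // implicit conversions are only attempted once no overload
            // matched exactly
            uint8_t effective = arg_flag & ~uint8_t(pass == 0);
            printf("pass %zu: flags = 0x%02x\n", pass, effective);
        }
        return 0;
    }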
diff --git a/src/nb_internals.cpp b/src/nb_internals.cpp index 6060a79f..b4a3f1d9 100644 --- a/src/nb_internals.cpp +++ b/src/nb_internals.cpp @@ -10,6 +10,7 @@ #include #include #include "nb_internals.h" +#include <thread> #if defined(__GNUC__) && !defined(__clang__) # pragma GCC diagnostic ignored "-Wmissing-field-initializers" @@ -72,6 +73,12 @@ # define NB_STABLE_ABI "" #endif +#if defined(NB_FREE_THREADED) +# define NB_FREE_THREADED_ABI "_ft" +#else +# define NB_FREE_THREADED_ABI "" +#endif + #if NB_VERSION_DEV > 0 #define NB_VERSION_DEV_STR "_dev" NB_TOSTRING(NB_VERSION_DEV) #else @@ -80,7 +87,8 @@ #define NB_INTERNALS_ID \ "v" NB_TOSTRING(NB_INTERNALS_VERSION) \ - NB_COMPILER_TYPE NB_STDLIB NB_VERSION_DEV_STR NB_BUILD_ABI NB_BUILD_TYPE NB_STABLE_ABI + NB_VERSION_DEV_STR NB_COMPILER_TYPE NB_STDLIB NB_BUILD_ABI \ + NB_BUILD_TYPE NB_STABLE_ABI NB_FREE_THREADED_ABI NAMESPACE_BEGIN(NB_NAMESPACE) NAMESPACE_BEGIN(detail) @@ -225,6 +233,7 @@ void default_exception_translator(const std::exception_ptr &p, void *) { } } +// Initialized once when the module is loaded, no locking needed nb_internals *internals = nullptr; PyTypeObject *nb_meta_cache = nullptr; @@ -233,62 +242,72 @@ static bool *is_alive_ptr = &is_alive_value; bool is_alive() noexcept { return *is_alive_ptr; } static void internals_cleanup() { - if (!internals) + nb_internals *p = internals; + if (!p) return; *is_alive_ptr = false; -#if !defined(PYPY_VERSION) +#if !defined(PYPY_VERSION) && !defined(NB_FREE_THREADED) /* The memory leak checker is unsupported on PyPy, see - see https://foss.heptapod.net/pypy/pypy/-/issues/3855 */ + see https://foss.heptapod.net/pypy/pypy/-/issues/3855. - bool leak = false, print_leak_warnings = internals->print_leak_warnings; + Leak reporting is explicitly disabled on free-threaded builds + for now because of the decision to immortalize function and + type objects. This may change in the future.
*/ - if (!internals->inst_c2p.empty()) { - if (print_leak_warnings) { - fprintf(stderr, "nanobind: leaked %zu instances!\n", - internals->inst_c2p.size()); - #if !defined(Py_LIMITED_API) - auto print_leak = [](void* k, PyObject* v) { - PyTypeObject *tp = Py_TYPE(v); - fprintf(stderr, " - leaked instance %p of type \"%s\"\n", k, tp->tp_name); - }; - - for (auto [k, v]: internals->inst_c2p) { - if (NB_UNLIKELY(nb_is_seq(v))) { - nb_inst_seq* seq = nb_get_seq(v); - for(; seq != nullptr; seq = seq->next) - print_leak(k, seq->inst); - } else { - print_leak(k, (PyObject*)v); - } - } - #endif - } - leak = true; + bool print_leak_warnings = p->print_leak_warnings; + + size_t inst_leaks = 0, keep_alive_leaks = 0; + + // Shard locking no longer needed, Py_AtExit is single-threaded + for (size_t i = 0; i < p->shard_count; ++i) { + nb_shard &s = p->shards[i]; + inst_leaks += s.inst_c2p.size(); + keep_alive_leaks += s.keep_alive.size(); } - if (!internals->keep_alive.empty()) { - if (print_leak_warnings) { - fprintf(stderr, "nanobind: leaked %zu keep_alive records!\n", - internals->keep_alive.size()); + bool leak = inst_leaks > 0 || keep_alive_leaks > 0; + + if (print_leak_warnings && inst_leaks > 0) { + fprintf(stderr, "nanobind: leaked %zu instances!\n", inst_leaks); + +#if !defined(Py_LIMITED_API) + auto print_leak = [](void* k, PyObject* v) { + type_data *tp = nb_type_data(Py_TYPE(v)); + fprintf(stderr, " - leaked instance %p of type \"%s\"\n", k, tp->name); + }; + + for (size_t i = 0; i < p->shard_count; ++i) { + for (auto [k, v]: p->shards[i].inst_c2p) { + if (NB_UNLIKELY(nb_is_seq(v))) { + nb_inst_seq* seq = nb_get_seq(v); + for(; seq != nullptr; seq = seq->next) + print_leak(k, seq->inst); + } else { + print_leak(k, (PyObject*)v); + } + } } - leak = true; +#endif } + if (print_leak_warnings && keep_alive_leaks > 0) + fprintf(stderr, "nanobind: leaked %zu keep_alive records!\n", + keep_alive_leaks); + // Only report function/type leaks if actual nanobind instances were leaked #if !defined(NB_ABORT_ON_LEAK) if (!leak) print_leak_warnings = false; #endif - if (!internals->type_c2p_slow.empty() || - !internals->type_c2p_fast.empty()) { + if (!p->type_c2p_slow.empty()) { if (print_leak_warnings) { fprintf(stderr, "nanobind: leaked %zu types!\n", - internals->type_c2p_slow.size()); + p->type_c2p_slow.size()); int ctr = 0; - for (const auto &kv : internals->type_c2p_slow) { + for (const auto &kv : p->type_c2p_slow) { fprintf(stderr, " - leaked type \"%s\"\n", kv.second->name); if (ctr++ == 10) { fprintf(stderr, " - ... skipped remainder\n"); @@ -299,12 +318,12 @@ static void internals_cleanup() { leak = true; } - if (!internals->funcs.empty()) { + if (!p->funcs.empty()) { if (print_leak_warnings) { fprintf(stderr, "nanobind: leaked %zu functions!\n", - internals->funcs.size()); + p->funcs.size()); int ctr = 0; - for (auto [f, p] : internals->funcs) { + for (auto [f, p2] : p->funcs) { fprintf(stderr, " - leaked function \"%s\"\n", nb_func_data(f)->name); if (ctr++ == 10) { @@ -317,13 +336,23 @@ static void internals_cleanup() { } if (!leak) { - nb_translator_seq* t = internals->translators.next; + nb_translator_seq* t = p->translators.next; while (t) { nb_translator_seq *next = t->next; delete t; t = next; } - delete internals; + +#if defined(NB_FREE_THREADED) + // This code won't run for now but is kept here for a time when + // immortalization isn't needed anymore. 
+ + PyThread_tss_delete(p->nb_static_property_disabled); + PyThread_tss_free(p->nb_static_property_disabled); + delete[] p->shards; +#endif + + delete p; internals = nullptr; nb_meta_cache = nullptr; } else { @@ -332,7 +361,7 @@ "counting issue in the binding code.\n"); } - #if defined(NB_ABORT_ON_LEAK) + #if defined(NB_ABORT_ON_LEAK) && !defined(NB_FREE_THREADED) abort(); // Extra-strict behavior for the CI server #endif } @@ -356,7 +385,7 @@ NB_NOINLINE void init(const char *name) { NB_INTERNALS_ID, name ? name : ""); check(key, "nanobind::detail::init(): could not create dictionary key!"); - PyObject *capsule = PyDict_GetItem(dict, key); + PyObject *capsule = dict_get_item_ref_or_fail(dict, key); if (capsule) { Py_DECREF(key); internals = (nb_internals *) PyCapsule_GetPointer(capsule, "nb_internals"); @@ -364,11 +393,23 @@ "nanobind::detail::internals_fetch(): capsule pointer is NULL!"); nb_meta_cache = internals->nb_meta; is_alive_ptr = internals->is_alive_ptr; + Py_DECREF(capsule); return; } nb_internals *p = new nb_internals(); + size_t shard_count = 1; +#if defined(NB_FREE_THREADED) + size_t hw_concurrency = std::thread::hardware_concurrency(); + while (shard_count < hw_concurrency) + shard_count *= 2; + shard_count *= 2; + p->shards = new nb_shard[shard_count]; + p->shard_mask = shard_count - 1; +#endif + p->shard_count = shard_count; + str nb_name("nanobind"); p->nb_module = PyModule_NewObject(nb_name.ptr()); @@ -378,8 +419,16 @@ p->nb_func = (PyTypeObject *) PyType_FromSpec(&nb_func_spec); p->nb_method = (PyTypeObject *) PyType_FromSpec(&nb_method_spec); p->nb_bound_method = (PyTypeObject *) PyType_FromSpec(&nb_bound_method_spec); - p->keep_alive.min_load_factor(.1f); - p->inst_c2p.min_load_factor(.1f); + +#if defined(NB_FREE_THREADED) + p->nb_static_property_disabled = PyThread_tss_alloc(); + PyThread_tss_create(p->nb_static_property_disabled); +#endif + + for (size_t i = 0; i < shard_count; ++i) { + p->shards[i].keep_alive.min_load_factor(.1f); + p->shards[i].inst_c2p.min_load_factor(.1f); + } check(p->nb_module && p->nb_meta && p->nb_type_dict && p->nb_func && p->nb_method && p->nb_bound_method, diff --git a/src/nb_internals.h b/src/nb_internals.h index 94693e8c..fbe346d0 100644 --- a/src/nb_internals.h +++ b/src/nb_internals.h @@ -202,6 +202,107 @@ struct nb_translator_seq { nb_translator_seq *next = nullptr; }; +#if defined(NB_FREE_THREADED) +# define NB_SHARD_ALIGNMENT alignas(64) +#else +# define NB_SHARD_ALIGNMENT +#endif + +/** + * The following data structure stores information associated with individual + * instances. In free-threaded builds, it is split into multiple shards to avoid + * lock contention. + */ +struct NB_SHARD_ALIGNMENT nb_shard { + /** + * C++ -> Python instance map + * + * This associative data structure maps a C++ instance pointer onto its + * associated PyObject* (if bit 0 of the map value is zero) or a linked + * list of type `nb_inst_seq*` (if bit 0 is set---it must be cleared before + * interpreting the pointer in this case). + * + * The latter case occurs when several distinct Python objects reference + * the same memory address (e.g. a struct and its first member).
+ */ + nb_ptr_map inst_c2p; + + /// Dictionary storing keep_alive references + nb_ptr_map keep_alive; + +#if defined(NB_FREE_THREADED) + PyMutex mutex { }; +#endif +}; + +/** + * `nb_internals` is the central data structure storing information related to + * function/type bindings and instances. Separate nanobind extensions within the + * same NB_DOMAIN furthermore share `nb_internals` to communicate with each + * other, hence any changes here generally require an ABI version bump. + * + * The GIL protects the elements of this data structure from concurrent + * modification. In free-threaded builds, a combination of locking schemes is + * needed to achieve good performance. + * + * In particular, `inst_c2p` and `type_c2p_fast` are very hot and potentially + * accessed several times while dispatching a single function call. The other + * elements are accessed much less frequently and easier to deal with. + * + * The following list clarifies locking semantics for each member. + * + * - `nb_module`, `nb_meta`, `nb_func`, `nb_method`, `nb_bound_method`, + * `*_Type_tp_*`, `shard_count`, `is_alive_ptr`: these are initialized when + * loading the first nanobind extension within a domain, which happens within + * a critical section. They do not require locking. + * + * - `nb_type_dict`: created when loading the first nanobind extension + * within a domain. While the dictionary itself is protected by its own + * lock, additional locking is needed to avoid races that create redundant + * entries. The `mutex` member is used for this. + * + * - `nb_static_property` and `nb_static_property_descr_set`: created only once + * on demand, protected by `mutex`. + * + * - `nb_static_property_disabled`: needed to correctly implement assignments to + * static properties. Free-threaded builds store this flag using TLS to avoid + * concurrent modification. + * + * - `nb_ndarray`: created only once on demand, protected by `mutex`. + * + * - `inst_c2p`: stores the C++ instance to Python object mapping. This + * data structure is *hot* and uses a sharded locking scheme to reduce + * lock contention. + * + * - `keep_alive`: stores lifetime dependencies (e.g., from the + * reference_internal return value policy). This data structure is + * potentially hot and shares the sharding scheme of `inst_c2p`. + * + * - `type_c2p_slow`: This is the ground-truth source of the `std::type_info` + * to `type_data *` mapping. Unrelated to free-threading, lookups into this + * data structure are generally costly because they use a string comparison on + * some platforms. Because it is only used as a fallback for 'type_c2p_fast', + * protecting this member via the global `mutex` is sufficient. + * + * - `type_c2p_fast`: this data structure is *hot* and mostly read. It maps + * `std::type_info` to `type_data *` but uses pointer-based comparisons. + * The implementation depends on the Python build. + * + * - `translators`: This is an append-to-front-only singly linked list traversed + * while raising exceptions. The main concern is losing elements during + * concurrent append operations. We assume that this data structure is only + * written during module initialization and don't use locking. + * + * - `funcs`: data structure for function leak tracking. Not used in + * free-threaded mode. + * + * - `print_leak_warnings`, `print_implicit_cast_warnings`: simple boolean + * flags.
No protection against concurrent conflicting updates. + */ + struct nb_internals { /// Internal nanobind module PyObject *nb_module; @@ -217,36 +318,46 @@ struct nb_internals { /// Property variant for static attributes (created on demand) PyTypeObject *nb_static_property = nullptr; - bool nb_static_property_enabled = true; descrsetfunc nb_static_property_descr_set = nullptr; +#if defined(NB_FREE_THREADED) + Py_tss_t *nb_static_property_disabled = nullptr; +#else + bool nb_static_property_disabled = false; +#endif + /// N-dimensional array wrapper (created on demand) PyTypeObject *nb_ndarray = nullptr; - /** - * C++ -> Python instance map - * - * This associative data structure maps a C++ instance pointer onto its - * associated PyObject* (if bit 0 of the map value is zero) or a linked - * list of type `nb_inst_seq*` (if bit 0 is set---it must be cleared before - * interpreting the pointer in this case). - * - * The latter case occurs when several distinct Python objects reference - * the same memory address (e.g. a struct and its first member). - */ - nb_ptr_map inst_c2p; +#if defined(NB_FREE_THREADED) + nb_shard *shards = nullptr; + size_t shard_mask = 0; + // Heuristic shard selection (from pybind11 PR #5148 by @colesbury), uses + // high pointer bits to group allocations by individual threads/cores. + inline nb_shard &shard(void *p) { + uintptr_t highbits = ((uintptr_t) p) >> 20; + size_t index = ((size_t) fmix64((uint64_t) highbits)) & shard_mask; + return shards[index]; + } +#else + nb_shard shards[1]; + inline nb_shard &shard(void *) { return shards[0]; } +#endif + +#if !defined(NB_FREE_THREADED) /// C++ -> Python type map -- fast version based on std::type_info pointer equality nb_type_map_fast type_c2p_fast; +#endif /// C++ -> Python type map -- slow fallback version based on hashed strings nb_type_map_slow type_c2p_slow; - /// Dictionary storing keep_alive references - nb_ptr_map keep_alive; - +#if !defined(NB_FREE_THREADED) /// nb_func/meth instance map for leak reporting (used as set, the value is unused) + /// In free-threaded mode, functions are immortal and don't require this data structure. nb_ptr_map funcs; +#endif /// Registered C++ -> Python exception translators nb_translator_seq translators; @@ -270,6 +381,13 @@ struct nb_internals { descrsetfunc PyProperty_Type_tp_descr_set; size_t type_data_offset; #endif + +#if defined(NB_FREE_THREADED) + PyMutex mutex { }; +#endif + + // Size of the 'shards' data structure. 
Only rarely accessed, hence at the end + size_t shard_count = 1; }; /// Convenience macro to potentially access cached functions @@ -338,10 +456,37 @@ template <typename T> struct scoped_pymalloc { T *ptr{ nullptr }; }; + +/// RAII lock/unlock guards for free-threaded builds +#if defined(NB_FREE_THREADED) +struct lock_shard { + nb_shard &s; + lock_shard(nb_shard &s) : s(s) { PyMutex_Lock(&s.mutex); } + ~lock_shard() { PyMutex_Unlock(&s.mutex); } +}; +struct lock_internals { + nb_internals *i; + lock_internals(nb_internals *i) : i(i) { PyMutex_Lock(&i->mutex); } + ~lock_internals() { PyMutex_Unlock(&i->mutex); } +}; +struct unlock_internals { + nb_internals *i; + unlock_internals(nb_internals *i) : i(i) { PyMutex_Unlock(&i->mutex); } + ~unlock_internals() { PyMutex_Lock(&i->mutex); } +}; +struct lock_obj { + // Critical-section guard for a single Python object + PyCriticalSection cs; + lock_obj(PyObject *o) { PyCriticalSection_Begin(&cs, o); } + ~lock_obj() { PyCriticalSection_End(&cs); } +}; +#else +struct lock_shard { lock_shard(nb_shard &) { } }; +struct lock_internals { lock_internals(nb_internals *) { } }; +struct unlock_internals { unlock_internals(nb_internals *) { } }; +struct lock_obj { lock_obj(PyObject *) { } }; +#endif + extern char *strdup_check(const char *); extern void *malloc_check(size_t size); +extern void maybe_make_immortal(PyObject *op); extern char *extract_name(const char *cmd, const char *prefix, const char *s); + NAMESPACE_END(detail) NAMESPACE_END(NB_NAMESPACE) diff --git a/src/nb_ndarray.cpp b/src/nb_ndarray.cpp index 2816c2cd..c8db618d 100644 --- a/src/nb_ndarray.cpp +++ b/src/nb_ndarray.cpp @@ -174,9 +174,15 @@ static PyMethodDef nb_ndarray_members[] = { }; static PyTypeObject *nd_ndarray_tp() noexcept { - PyTypeObject *tp = internals->nb_ndarray; + nb_internals *internals_ = internals; + PyTypeObject *tp = internals_->nb_ndarray; if (NB_UNLIKELY(!tp)) { + lock_internals guard(internals_); + tp = internals_->nb_ndarray; + if (tp) + return tp; + PyType_Slot slots[] = { { Py_tp_dealloc, (void *) nb_ndarray_dealloc }, { Py_tp_methods, (void *) nb_ndarray_members }, @@ -203,7 +209,7 @@ static PyTypeObject *nd_ndarray_tp() noexcept { tp->tp_as_buffer->bf_releasebuffer = nb_ndarray_releasebuffer; #endif - internals->nb_ndarray = tp; + internals_->nb_ndarray = tp; } return tp; diff --git a/src/nb_static_property.cpp b/src/nb_static_property.cpp index e8474c3a..0c8bc639 100644 --- a/src/nb_static_property.cpp +++ b/src/nb_static_property.cpp @@ -5,7 +5,16 @@ NAMESPACE_BEGIN(detail) /// `nb_static_property.__get__()`: Always pass the class instead of the instance.
static PyObject *nb_static_property_descr_get(PyObject *self, PyObject *, PyObject *cls) { - if (internals->nb_static_property_enabled) { + + // Flag to avoid infinite recursion during static attribute assignment + bool static_property_disabled; +#if defined(NB_FREE_THREADED) + static_property_disabled = (bool) PyThread_tss_get(internals->nb_static_property_disabled); +#else + static_property_disabled = internals->nb_static_property_disabled; +#endif + + if (!static_property_disabled) { return NB_SLOT(PyProperty_Type, tp_descr_get)(self, cls, cls); } else { Py_INCREF(self); @@ -20,9 +29,16 @@ static int nb_static_property_descr_set(PyObject *self, PyObject *obj, PyObject } PyTypeObject *nb_static_property_tp() noexcept { - PyTypeObject *tp = internals->nb_static_property; + nb_internals *internals_ = internals; + PyTypeObject *tp = internals_->nb_static_property; if (NB_UNLIKELY(!tp)) { + lock_internals guard(internals_); + + tp = internals_->nb_static_property; + if (tp) + return tp; + PyMemberDef *members; #if defined(Py_LIMITED_API) @@ -49,8 +65,8 @@ PyTypeObject *nb_static_property_tp() noexcept { tp = (PyTypeObject *) PyType_FromSpec(&spec); check(tp, "nb_static_property type creation failed!"); - internals->nb_static_property = tp; - internals->nb_static_property_descr_set = nb_static_property_descr_set; + internals_->nb_static_property = tp; + internals_->nb_static_property_descr_set = nb_static_property_descr_set; } return tp; diff --git a/src/nb_type.cpp b/src/nb_type.cpp index 0bce5c28..9ed76201 100644 --- a/src/nb_type.cpp +++ b/src/nb_type.cpp @@ -100,7 +100,9 @@ PyObject *inst_new_int(PyTypeObject *tp, PyObject * /* args */, self->unused = 0; // Update hash table that maps from C++ to Python instance - auto [it, success] = internals->inst_c2p.try_emplace((void *) payload, self); + nb_shard &shard = internals->shard((void *) payload); + lock_shard guard(shard); + auto [it, success] = shard.inst_c2p.try_emplace((void *) payload, self); check(success, "nanobind::detail::inst_new_int(): unexpected collision!"); } @@ -162,8 +164,11 @@ PyObject *inst_new_ext(PyTypeObject *tp, void *value) { self->intrusive = intrusive; self->unused = 0; + nb_shard &shard = internals->shard(value); + lock_shard guard(shard); + // Update hash table that maps from C++ to Python instance - auto [it, success] = internals->inst_c2p.try_emplace(value, self); + auto [it, success] = shard.inst_c2p.try_emplace(value, self); if (NB_UNLIKELY(!success)) { void *entry = it->second; @@ -243,73 +248,81 @@ static void inst_dealloc(PyObject *self) { operator delete(p, std::align_val_t(t->align)); } - if (NB_UNLIKELY(inst->clear_keep_alive)) { - size_t self_hash = ptr_hash()(self); - nb_ptr_map &keep_alive = internals->keep_alive; - nb_ptr_map::iterator it = keep_alive.find(self, self_hash); - check(it != keep_alive.end(), - "nanobind::detail::inst_dealloc(\"%s\"): inconsistent " - "keep_alive information", t->name); - - nb_weakref_seq *s = (nb_weakref_seq *) it->second; - keep_alive.erase_fast(it); + nb_weakref_seq *wr_seq = nullptr; - do { - nb_weakref_seq *c = s; - s = c->next; + { + // Enter critical section of shard + nb_shard &shard = internals->shard(p); + lock_shard guard(shard); - if (c->callback) - c->callback(c->payload); - else - Py_DECREF((PyObject *) c->payload); + if (NB_UNLIKELY(inst->clear_keep_alive)) { + size_t self_hash = ptr_hash()(self); + nb_ptr_map &keep_alive = shard.keep_alive; + nb_ptr_map::iterator it = keep_alive.find(self, self_hash); + check(it != keep_alive.end(), + 
"nanobind::detail::inst_dealloc(\"%s\"): inconsistent " + "keep_alive information", t->name); - PyObject_Free(c); - } while (s); - } + wr_seq = (nb_weakref_seq *) it->second; + keep_alive.erase_fast(it); + } - size_t p_hash = ptr_hash()(p); + size_t p_hash = ptr_hash()(p); - // Update hash table that maps from C++ to Python instance - nb_ptr_map &inst_c2p = internals->inst_c2p; - nb_ptr_map::iterator it = inst_c2p.find(p, p_hash); - bool found = false; + // Update hash table that maps from C++ to Python instance + nb_ptr_map &inst_c2p = shard.inst_c2p; + nb_ptr_map::iterator it = inst_c2p.find(p, p_hash); + bool found = false; - if (NB_LIKELY(it != inst_c2p.end())) { - void *entry = it->second; - if (NB_LIKELY(entry == inst)) { - found = true; - inst_c2p.erase_fast(it); - } else if (nb_is_seq(entry)) { - // Multiple objects are associated with this address. Find the right one! - nb_inst_seq *seq = nb_get_seq(entry), - *pred = nullptr; - - do { - if ((nb_inst *) seq->inst == inst) { - found = true; - - if (pred) { - pred->next = seq->next; - } else { - if (seq->next) - it.value() = nb_mark_seq(seq->next); - else - inst_c2p.erase_fast(it); + if (NB_LIKELY(it != inst_c2p.end())) { + void *entry = it->second; + if (NB_LIKELY(entry == inst)) { + found = true; + inst_c2p.erase_fast(it); + } else if (nb_is_seq(entry)) { + // Multiple objects are associated with this address. Find the right one! + nb_inst_seq *seq = nb_get_seq(entry), + *pred = nullptr; + + do { + if ((nb_inst *) seq->inst == inst) { + found = true; + + if (pred) { + pred->next = seq->next; + } else { + if (seq->next) + it.value() = nb_mark_seq(seq->next); + else + inst_c2p.erase_fast(it); + } + + PyMem_Free(seq); + break; } - PyMem_Free(seq); - break; - } - - pred = seq; - seq = seq->next; - } while (seq); + pred = seq; + seq = seq->next; + } while (seq); + } } + + check(found, + "nanobind::detail::inst_dealloc(\"%s\"): attempted to delete an " + "unknown instance (%p)!", t->name, p); } - check(found, - "nanobind::detail::inst_dealloc(\"%s\"): attempted to delete an " - "unknown instance (%p)!", t->name, p); + while (wr_seq) { + nb_weakref_seq *c = wr_seq; + wr_seq = c->next; + + if (c->callback) + c->callback(c->payload); + else + Py_DECREF((PyObject *) c->payload); + + PyMem_Free(c); + } if (NB_UNLIKELY(gc)) PyObject_GC_Del(self); @@ -319,24 +332,37 @@ static void inst_dealloc(PyObject *self) { Py_DECREF(tp); } + type_data *nb_type_c2p(nb_internals *internals_, const std::type_info *type) { +#if defined(NB_FREE_THREADED) + thread_local nb_type_map_fast type_c2p_fast; +#else nb_type_map_fast &type_c2p_fast = internals_->type_c2p_fast; - nb_type_map_slow &type_c2p_slow = internals_->type_c2p_slow; +#endif nb_type_map_fast::iterator it_fast = type_c2p_fast.find(type); if (it_fast != type_c2p_fast.end()) return it_fast->second; + lock_internals guard(internals_); + nb_type_map_slow &type_c2p_slow = internals_->type_c2p_slow; nb_type_map_slow::iterator it_slow = type_c2p_slow.find(type); if (it_slow != type_c2p_slow.end()) { type_data *d = it_slow->second; - nb_alias_chain *chain = (nb_alias_chain *) PyMem_Malloc(sizeof(nb_alias_chain)); +#if !defined(NB_FREE_THREADED) + // Maintain a linked list to clean up 'type_c2p_fast' when the type + // expires (see nb_type_unregister). In free-threaded mode, we leak + // these entries until the thread destructs. 
+ nb_alias_chain *chain = + (nb_alias_chain *) PyMem_Malloc(sizeof(nb_alias_chain)); check(chain, "Could not allocate nb_alias_chain entry!"); chain->next = d->alias_chain; chain->value = type; d->alias_chain = chain; +#endif + type_c2p_fast[type] = d; return d; } @@ -345,11 +371,35 @@ type_data *nb_type_c2p(nb_internals *internals_, } void nb_type_unregister(type_data *t) noexcept { - nb_type_map_slow &type_c2p_slow = internals->type_c2p_slow; - nb_type_map_fast &type_c2p_fast = internals->type_c2p_fast; + nb_internals *internals_ = internals; + nb_type_map_slow &type_c2p_slow = internals_->type_c2p_slow; - size_t n_del_slow = type_c2p_slow.erase(t->type), - n_del_fast = type_c2p_fast.erase(t->type); + lock_internals guard(internals_); + size_t n_del_slow = type_c2p_slow.erase(t->type); + +#if defined(NB_FREE_THREADED) + // In free-threaded mode, stale type information remains in the + // 'type_c2p_fast' TLS. This data structure is eventually deallocated + // when the thread terminates. + // + // In principle, this is dangerous because the user could delete a type + // binding from a module at runtime, causing the associated + // Python type object to be freed. If a function then attempts to return + // a value with such a de-registered type, nanobind should raise an + // exception, which requires knowing that the entry in 'type_c2p_fast' + // has become invalid in the meantime. + // + // Right now, this problem is avoided because we immortalize type objects in + // ``nb_type_new()`` and ``enum_create()``. However, we may not always + // want to stick with immortalization, which is just a workaround. + // + // In the future, a global version counter modified with acquire/release + // semantics (see https://github.com/wjakob/nanobind/pull/695#discussion_r1761600010) + // might prove to be a similarly efficient but more general solution. 
+ bool fail = n_del_slow != 1; +#else + nb_type_map_fast &type_c2p_fast = internals_->type_c2p_fast; + size_t n_del_fast = type_c2p_fast.erase(t->type); bool fail = n_del_fast != 1 || n_del_slow != 1; if (!fail) { @@ -365,6 +415,7 @@ void nb_type_unregister(type_data *t) noexcept { cur = next; } } +#endif check(!fail, "nanobind::detail::nb_type_unregister(\"%s\"): could not " @@ -378,8 +429,8 @@ static void nb_type_dealloc(PyObject *o) { nb_type_unregister(t); if (t->flags & (uint32_t) type_flags::has_implicit_conversions) { - free(t->implicit.cpp); - free(t->implicit.py); + PyMem_Free(t->implicit.cpp); + PyMem_Free(t->implicit.py); } free((char *) t->name); @@ -442,11 +493,23 @@ static int nb_type_init(PyObject *self, PyObject *args, PyObject *kwds) { } /// Special case to handle 'Class.property = value' assignments -static int nb_type_setattro(PyObject* obj, PyObject* name, PyObject* value) { +int nb_type_setattro(PyObject* obj, PyObject* name, PyObject* value) { nb_internals *int_p = internals; - int_p->nb_static_property_enabled = false; + + // Set a flag to avoid infinite recursion during static attribute assignment +#if defined(NB_FREE_THREADED) + PyThread_tss_set(int_p->nb_static_property_disabled, (void *) 1); +#else + int_p->nb_static_property_disabled = true; +#endif + PyObject *cur = PyObject_GetAttr(obj, name); - int_p->nb_static_property_enabled = true; + +#if defined(NB_FREE_THREADED) + PyThread_tss_set(int_p->nb_static_property_disabled, (void *) 0); +#else + int_p->nb_static_property_disabled = false; +#endif if (cur) { PyTypeObject *tp = int_p->nb_static_property; @@ -711,6 +774,8 @@ static PyObject *nb_type_from_metaclass(PyTypeObject *meta, PyObject *mod, if (doc && !fail) { size_t size = strlen(doc) + 1; + /// This code path is only used for Python 3.12, where + /// PyObject_Malloc is the right allocation routine for tp_doc char *target = (char *) PyObject_Malloc(size); if (!target) { PyErr_NoMemory(); @@ -756,14 +821,23 @@ static PyObject *nb_type_from_metaclass(PyTypeObject *meta, PyObject *mod, #endif } +extern int nb_type_setattro(PyObject* obj, PyObject* name, PyObject* value); + static PyTypeObject *nb_type_tp(size_t supplement) noexcept { - nb_internals *internals_ = internals; object key = steal(PyLong_FromSize_t(supplement)); + nb_internals *internals_ = internals; PyTypeObject *tp = - (PyTypeObject *) PyDict_GetItem(internals_->nb_type_dict, key.ptr()); + (PyTypeObject *) dict_get_item_ref_or_fail(internals_->nb_type_dict, key.ptr()); if (NB_UNLIKELY(!tp)) { + // Retry in critical section to avoid races that create the same nb_type + lock_internals guard(internals_); + + tp = (PyTypeObject *) dict_get_item_ref_or_fail(internals_->nb_type_dict, key.ptr()); + if (tp) + return tp; + #if defined(Py_LIMITED_API) PyMemberDef members[] = { { "__vectorcalloffset__", Py_T_PYSSIZET, 0, Py_READONLY, nullptr }, @@ -773,6 +847,7 @@ static PyTypeObject *nb_type_tp(size_t supplement) noexcept { // Workaround because __vectorcalloffset__ does not support Py_RELATIVE_OFFSET members[0].offset = internals_->type_data_offset + offsetof(type_data, vectorcall); #endif + PyType_Slot slots[] = { { Py_tp_base, &PyType_Type }, { Py_tp_dealloc, (void *) nb_type_dealloc }, @@ -810,14 +885,14 @@ static PyTypeObject *nb_type_tp(size_t supplement) noexcept { tp = (PyTypeObject *) nb_type_from_metaclass( internals_->nb_meta, internals_->nb_module, &spec); + maybe_make_immortal((PyObject *) tp); + handle(tp).attr("__module__") = "nanobind"; int rv = 1; if (tp) rv = 
PyDict_SetItem(internals_->nb_type_dict, key.ptr(), (PyObject *) tp); check(rv == 0, "nb_type type creation failed!"); - - Py_DECREF(tp); } return tp; @@ -980,14 +1055,23 @@ PyObject *nb_type_new(const type_init_data *t) noexcept { PyObject *mod = nullptr; // Update hash table that maps from std::type_info to Python type - auto [it, success] = internals->type_c2p_slow.try_emplace(t->type, nullptr); - if (!success) { - PyErr_WarnFormat(PyExc_RuntimeWarning, 1, "nanobind: type '%s' was already registered!\n", t_name); - PyObject *tp = (PyObject *) it->second->type_py; - Py_INCREF(tp); - if (has_signature) - free((char *) t_name); - return tp; + nb_type_map_slow::iterator it; + bool success; + nb_internals *internals_ = internals; + + { + lock_internals guard(internals_); + std::tie(it, success) = internals_->type_c2p_slow.try_emplace(t->type, nullptr); + if (!success) { + PyErr_WarnFormat(PyExc_RuntimeWarning, 1, + "nanobind: type '%s' was already registered!\n", + t_name); + PyObject *tp = (PyObject *) it->second->type_py; + Py_INCREF(tp); + if (has_signature) + free((char *) t_name); + return tp; + } } if (t->scope != nullptr) { @@ -1039,8 +1123,9 @@ PyObject *nb_type_new(const type_init_data *t) noexcept { "nanobind::detail::nb_type_new(\"%s\"): base type is not a " "nanobind type!", t_name); } else if (has_base) { - nb_type_map_slow::iterator it2 = internals->type_c2p_slow.find(t->base); - check(it2 != internals->type_c2p_slow.end(), + lock_internals guard(internals_); + nb_type_map_slow::iterator it2 = internals_->type_c2p_slow.find(t->base); + check(it2 != internals_->type_c2p_slow.end(), "nanobind::detail::nb_type_new(\"%s\"): base type \"%s\" not " "known to nanobind!", t_name, type_name(t->base)); base = (PyObject *) it2->second->type_py; @@ -1202,6 +1287,10 @@ PyObject *nb_type_new(const type_init_data *t) noexcept { "failed: %s!", t_name, err.what()); } + Py_DECREF(metaclass); + + maybe_make_immortal(result); + type_data *to = nb_type_data((PyTypeObject *) result); *to = *t; // note: slices off _init parts @@ -1251,8 +1340,14 @@ PyObject *nb_type_new(const type_init_data *t) noexcept { if (modname.is_valid()) setattr(result, "__module__", modname.ptr()); - internals->type_c2p_fast[t->type] = to; - internals->type_c2p_slow[t->type] = to; + { + lock_internals guard(internals_); + internals_->type_c2p_slow[t->type] = to; + + #if !defined(NB_FREE_THREADED) + internals_->type_c2p_fast[t->type] = to; + #endif + } if (has_signature) { setattr(result, "__nb_signature__", str(t->name)); @@ -1454,9 +1549,13 @@ void keep_alive(PyObject *nurse, PyObject *patient) { return; if (nb_type_check((PyObject *) Py_TYPE(nurse))) { - nb_weakref_seq **pp = - (nb_weakref_seq **) &internals->keep_alive[nurse]; +#if defined(NB_FREE_THREADED) + nb_shard &shard = internals->shard(inst_ptr((nb_inst *) nurse)); +#else + nb_shard &shard = internals->shards[0]; +#endif + nb_weakref_seq **pp = (nb_weakref_seq **) &shard.keep_alive[nurse]; do { nb_weakref_seq *p = *pp; if (!p) @@ -1467,7 +1566,7 @@ void keep_alive(PyObject *nurse, PyObject *patient) { } while (true); nb_weakref_seq *s = - (nb_weakref_seq *) PyObject_Malloc(sizeof(nb_weakref_seq)); + (nb_weakref_seq *) PyMem_Malloc(sizeof(nb_weakref_seq)); check(s, "nanobind::detail::keep_alive(): out of memory!"); s->payload = patient; @@ -1503,9 +1602,15 @@ void keep_alive(PyObject *nurse, void *payload, check(nurse, "nanobind::detail::keep_alive(): 'nurse' is undefined!"); if (nb_type_check((PyObject *) Py_TYPE(nurse))) { +#if defined(NB_FREE_THREADED) + 
nb_shard &shard = internals->shard(inst_ptr((nb_inst *) nurse)); +#else + nb_shard &shard = internals->shards[0]; +#endif + nb_weakref_seq - **pp = (nb_weakref_seq **) &internals->keep_alive[nurse], - *s = (nb_weakref_seq *) PyObject_Malloc(sizeof(nb_weakref_seq)); + **pp = (nb_weakref_seq **) &shard.keep_alive[nurse], + *s = (nb_weakref_seq *) PyMem_Malloc(sizeof(nb_weakref_seq)); check(s, "nanobind::detail::keep_alive(): out of memory!"); s->payload = payload; @@ -1620,7 +1725,6 @@ PyObject *nb_type_put(const std::type_info *cpp_type, } nb_internals *internals_ = internals; - nb_ptr_map &inst_c2p = internals_->inst_c2p; type_data *td = nullptr; auto lookup_type = [cpp_type, internals_, &td]() -> bool { @@ -1634,7 +1738,11 @@ PyObject *nb_type_put(const std::type_info *cpp_type, }; if (rvp != rv_policy::copy) { + nb_shard &shard = internals_->shard(value); + lock_shard guard(shard); + // Check if the instance is already registered with nanobind + nb_ptr_map &inst_c2p = shard.inst_c2p; nb_ptr_map::iterator it = inst_c2p.find(value); if (it != inst_c2p.end()) { @@ -1694,7 +1802,6 @@ PyObject *nb_type_put_p(const std::type_info *cpp_type, // Check if the instance is already registered with nanobind nb_internals *internals_ = internals; - nb_ptr_map &inst_c2p = internals_->inst_c2p; // Look up the corresponding Python type type_data *td = nullptr, @@ -1715,7 +1822,11 @@ PyObject *nb_type_put_p(const std::type_info *cpp_type, }; if (rvp != rv_policy::copy) { + nb_shard &shard = internals_->shard(value); + lock_shard guard(shard); + // Check if the instance is already registered with nanobind + nb_ptr_map &inst_c2p = shard.inst_c2p; nb_ptr_map::iterator it = inst_c2p.find(value); if (it != inst_c2p.end()) { diff --git a/src/trampoline.cpp b/src/trampoline.cpp index 4a71b04e..9671efd5 100644 --- a/src/trampoline.cpp +++ b/src/trampoline.cpp @@ -14,8 +14,13 @@ NAMESPACE_BEGIN(NB_NAMESPACE) NAMESPACE_BEGIN(detail) void trampoline_new(void **data, size_t size, void *ptr) noexcept { - // GIL is held when the trampoline constructor runs - nb_ptr_map &inst_c2p = internals->inst_c2p; + // GIL is held when the trampoline constructor runs. Lock the + // associated instance shard in GIL-less Python. 
+ + nb_shard &shard = internals->shard(ptr); + lock_shard lock(shard); + + nb_ptr_map &inst_c2p = shard.inst_c2p; nb_ptr_map::iterator it = inst_c2p.find(ptr); check(it != inst_c2p.end() && (((uintptr_t) it->second) & 1) == 0, "nanobind::detail::trampoline_new(): unique instance not found!"); @@ -36,6 +41,7 @@ static void trampoline_enter_internal(void **data, size_t size, PyGILState_STATE state{ }; const char *error = nullptr; PyObject *key = nullptr, *value = nullptr; + PyObject *self = (PyObject *) data[0]; PyTypeObject *value_tp = nullptr; size_t offset = 0; @@ -51,8 +57,7 @@ } else { if (pure) { error = "tried to call a pure virtual function"; - state = PyGILState_Ensure(); - goto fail; + break; } else { return; } @@ -62,6 +67,11 @@ // Nothing found -- retry, now with lock held state = PyGILState_Ensure(); + ft_object_guard guard(self); + + if (error) + goto fail; + for (size_t i = 0; i < size; i++) { void *d_name = data[2*i + 1], *d_value = data[2*i + 2]; @@ -101,7 +111,7 @@ goto fail; } - value = PyObject_GetAttr((PyObject *) data[0], key); + value = PyObject_GetAttr(self, key); if (!value) { error = "lookup failed"; goto fail; } @@ -136,14 +146,14 @@ } fail: - type_data *td = nb_type_data(Py_TYPE((PyObject *) data[0])); + type_data *td = nb_type_data(Py_TYPE(self)); PyGILState_Release(state); raise("nanobind::detail::get_trampoline('%s::%s()'): %s!", td->name, name, error); } -NB_THREAD_LOCAL ticket *current_ticket = nullptr; +static NB_THREAD_LOCAL ticket *current_ticket = nullptr; void trampoline_enter(void **data, size_t size, const char *name, bool pure, ticket *t) { trampoline_enter_internal(data, size, name, pure, t); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d6594729..f306cda7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,6 +5,10 @@ if (NB_TEST_STABLE_ABI) set(NB_EXTRA_ARGS ${NB_EXTRA_ARGS} STABLE_ABI) endif() +if (NB_TEST_FREE_THREADED) + set(NB_EXTRA_ARGS ${NB_EXTRA_ARGS} FREE_THREADED) +endif() + if (NB_TEST_SHARED_BUILD) set(NB_EXTRA_ARGS ${NB_EXTRA_ARGS} NB_SHARED) endif() @@ -49,6 +53,7 @@ set(TEST_NAMES typing issue intrusive + thread ) foreach (NAME ${TEST_NAMES}) @@ -126,6 +131,7 @@ set(TEST_FILES test_ndarray.py test_stubs.py test_typing.py + test_thread.py # Stub reference files test_classes_ext.pyi.ref diff --git a/tests/test_classes.cpp b/tests/test_classes.cpp index 22ff2ac8..a75da0fc 100644 --- a/tests/test_classes.cpp +++ b/tests/test_classes.cpp @@ -122,7 +122,7 @@ int wrapper_tp_traverse(PyObject *self, visitproc visit, void *arg) { Wrapper *w = nb::inst_ptr<Wrapper>(self); // If c->value corresponds to an associated CPython object, return it - nb::object value = nb::find(w->value); + nb::handle value = nb::find(w->value); // Inform the Python GC about it Py_VISIT(value.ptr()); diff --git a/tests/test_stl.cpp b/tests/test_stl.cpp index 9dd97a96..bd75db5c 100644 --- a/tests/test_stl.cpp +++ b/tests/test_stl.cpp @@ -67,7 +67,7 @@ struct FuncWrapper { int funcwrapper_tp_traverse(PyObject *self, visitproc visit, void *arg) { FuncWrapper *w = nb::inst_ptr<FuncWrapper>(self); - nb::object f = nb::cast(w->f, nb::rv_policy::none); + nb::handle f = nb::cast(w->f, nb::rv_policy::none); Py_VISIT(f.ptr()); #if PY_VERSION_HEX >= 0x03090000 diff --git a/tests/test_stl.py b/tests/test_stl.py index
008835cd..bb2d4e1e 100644 --- a/tests/test_stl.py +++ b/tests/test_stl.py @@ -306,6 +306,7 @@ def f(): # cpyext reference cycles are not supported, see https://foss.heptapod.net/pypy/pypy/-/issues/3849 @skip_on_pypy def test32_std_function_gc(): + # Test class -> function -> class cyclic reference class A(t.FuncWrapper): pass @@ -316,18 +317,29 @@ class A(t.FuncWrapper): collect() assert t.FuncWrapper.alive == 0 - # A class -> function -> class cyclic reference - class B(t.FuncWrapper): + # t.FuncWrapper is a C extension type with a custom property 'f', which + # can store Python function objects. It implements the tp_traverse + # callback so that reference cycles arising from this function object + # can be detected. + + class Test(t.FuncWrapper): def __init__(self): super().__init__() + # The constructor creates a closure, which references 'self' + # and assigns it to the 'self.f' member. + # This creates a cycle self -> self.f -> self def f(): print(self.f) self.f = f + + # The Test class declared above inherits from 'FuncWrapper'. + # This class keeps track of how many references are alive at + # any point to help track down leak issues. assert t.FuncWrapper.alive == 0 - b = B() + b = Test() assert t.FuncWrapper.alive == 1 del b collect() diff --git a/tests/test_thread.cpp b/tests/test_thread.cpp new file mode 100644 index 00000000..230e1285 --- /dev/null +++ b/tests/test_thread.cpp @@ -0,0 +1,37 @@ +#include <nanobind/nanobind.h> + +namespace nb = nanobind; +using namespace nb::literals; + +struct Counter { + size_t value = 0; + void inc() { value++; } + void merge(Counter &o) { + value += o.value; + o.value = 0; + } +}; + +nb::ft_mutex mutex; + +NB_MODULE(test_thread_ext, m) { + nb::class_<Counter>(m, "Counter") + .def(nb::init<>()) + .def_ro("value", &Counter::value) + .def("inc_unsafe", &Counter::inc) + .def("inc_safe", &Counter::inc, nb::lock_self()) + .def("merge_unsafe", &Counter::merge) + .def("merge_safe", &Counter::merge, nb::lock_self(), "o"_a.lock()); + + m.def("return_self", [](Counter *c) -> Counter * { return c; }); + + m.def("inc_safe", + [](Counter &c) { c.inc(); }, + nb::arg().lock()); + + m.def("inc_global", + [](Counter &c) { + nb::ft_lock_guard guard(mutex); + c.inc(); + }, "counter"); +}
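The test extension above doubles as a compact reference for the new locking annotations: nb::lock_self() serializes calls on the implicit self argument, nb::arg().lock() / "o"_a.lock() additionally lock selected arguments during dispatch, and nb::ft_mutex / nb::ft_lock_guard guard shared global state (they are designed to be inert on GIL-protected builds). Below is a minimal sketch of how a binding might combine these, using only constructs exercised by test_thread.cpp; the Registry class and registry_ext module name are hypothetical and exist purely for illustration:

    #include <nanobind/nanobind.h>
    #include <vector>

    namespace nb = nanobind;

    // Hypothetical bound type, used only to illustrate the annotations
    struct Registry {
        std::vector<int> entries;
        void add(int v) { entries.push_back(v); }
    };

    nb::ft_mutex registry_mutex;

    NB_MODULE(registry_ext, m) {
        nb::class_<Registry>(m, "Registry")
            .def(nb::init<>())
            // Serialize concurrent calls on the same Registry instance
            .def("add", &Registry::add, nb::lock_self());

        // Lock the 'Registry' argument for the duration of the call
        m.def("add_locked", [](Registry &r, int v) { r.add(v); },
              nb::arg().lock(), nb::arg());

        // Guard state shared by all instances with an explicit mutex
        m.def("add_global", [](Registry &r, int v) {
            nb::ft_lock_guard guard(registry_mutex);
            r.add(v);
        });
    }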
diff --git a/tests/test_thread.py b/tests/test_thread.py new file mode 100644 index 00000000..f9fd85f1 --- /dev/null +++ b/tests/test_thread.py @@ -0,0 +1,100 @@ +import test_thread_ext as t +from test_thread_ext import Counter + +import threading + +# Helper function to parallelize execution of a function. We intentionally +# don't use the Python thread pools here to have threads shut down / start +# between test cases. +def parallelize(func, n_threads): + barrier = threading.Barrier(n_threads) + result = [None]*n_threads + + def wrapper(i): + barrier.wait() + result[i] = func() + + workers = [] + for i in range(n_threads): + t = threading.Thread(target=wrapper, args=(i,)) + t.start() + workers.append(t) + + for worker in workers: + worker.join() + return result + + +def test01_object_creation(n_threads=8): + # This test hammers 'inst_c2p' from multiple threads, and + # checks that the locking of internal data structures works + + n = 100000 + def f(): + r = [None]*n + for i in range(n): + c = Counter() + c.inc_unsafe() + r[i] = c + for i in range(n): + assert t.return_self(r[i]) is r[i] + return r + + v = parallelize(f, n_threads=n_threads) + assert len(v) == n_threads + for v2 in v: + assert len(v2) == n + for v3 in v2: + assert v3.value == 1 + +def test02_global_lock(n_threads=8): + # Test that a global PyMutex protects the counter + n = 100000 + c = Counter() + def f(): + for i in range(n): + t.inc_global(c) + + parallelize(f, n_threads=n_threads) + assert c.value == n * n_threads + + +def test03_locked_method(n_threads=8): + # Checks that nb::lock_self() protects an internal counter + n = 100000 + c = Counter() + def f(): + for i in range(n): + c.inc_safe() + + parallelize(f, n_threads=n_threads) + assert c.value == n * n_threads + + +def test04_locked_function(n_threads=8): + # Checks that nb::arg().lock() protects an internal counter + n = 100000 + c = Counter() + def f(): + for i in range(n): + t.inc_safe(c) + + parallelize(f, n_threads=n_threads) + assert c.value == n * n_threads + + +def test05_locked_twoargs(n_threads=8): + # Check two-argument locking + n = 100000 + c = Counter() + def f(): + c2 = Counter() + for i in range(n): + c2.inc_unsafe() + if i & 1 == 0: + c2.merge_safe(c) + else: + c.merge_safe(c2) + + parallelize(f, n_threads=n_threads) + assert c.value == n * n_threads
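To close, here is a standalone illustration of the shard-selection heuristic introduced in src/nb_internals.h above (not part of the patch): the upper pointer bits are hashed with fmix64, the MurmurHash3 finalizer that nanobind also uses, and masked down to a shard index. Addresses from the same 1 MiB region map to the same shard, while allocations made by different threads, which typically come from different allocator arenas, tend to spread across shards and thus across locks. The shard count below is an arbitrary example value:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // MurmurHash3 finalizer, equivalent in role to the fmix64() helper that
    // nb_internals::shard() calls in the patch
    static uint64_t fmix64(uint64_t k) {
        k ^= k >> 33;
        k *= 0xff51afd7ed558ccdULL;
        k ^= k >> 33;
        k *= 0xc4ceb9fe1a85ec53ULL;
        k ^= k >> 33;
        return k;
    }

    int main() {
        const size_t shard_mask = 7; // example: 8 shards (a power of two)
        int x = 0, y = 0;            // two nearby stack addresses
        void *ptrs[] = { (void *) &x, (void *) &y };

        for (void *p : ptrs) {
            // '>> 20' groups addresses by 1 MiB region; nearby allocations
            // (such as these two locals) land in the same shard
            uintptr_t highbits = ((uintptr_t) p) >> 20;
            size_t index = ((size_t) fmix64((uint64_t) highbits)) & shard_mask;
            printf("%p -> shard %zu\n", p, index);
        }
        return 0;
    }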