diff --git a/python/rmm/CMakeLists.txt b/python/rmm/CMakeLists.txt index 6c2515102..4c4e603ef 100644 --- a/python/rmm/CMakeLists.txt +++ b/python/rmm/CMakeLists.txt @@ -30,4 +30,6 @@ rapids_cython_init() add_compile_definitions("SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}") add_subdirectory(rmm/_cuda) -add_subdirectory(rmm/_lib) +# add_subdirectory(rmm/_lib) +add_subdirectory(rmm/cpp) +add_subdirectory(rmm/python) diff --git a/python/rmm/rmm/cpp/CMakeLists.txt b/python/rmm/rmm/cpp/CMakeLists.txt new file mode 100644 index 000000000..8ac924df4 --- /dev/null +++ b/python/rmm/rmm/cpp/CMakeLists.txt @@ -0,0 +1,35 @@ +# ============================================================================= +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources cpp_logger.pyx cpp_memory_resource.pyx) +set(linked_libraries rmm::rmm) + +# Build all of the Cython targets +rapids_cython_create_modules(SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" + CXX) + +# mark all symbols in these Cython targets "hidden" by default, so they won't collide with symbols +# loaded from other DSOs +foreach(_cython_target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + set_target_properties(${_cython_target} PROPERTIES C_VISIBILITY_PRESET hidden + CXX_VISIBILITY_PRESET hidden) +endforeach() + +add_library(_torch_allocator SHARED _torch_allocator.cpp) +# Want the output to be called _torch_allocator.so +set_target_properties(_torch_allocator PROPERTIES PREFIX "" SUFFIX ".so") +target_link_libraries(_torch_allocator PRIVATE rmm::rmm) +cmake_path(RELATIVE_PATH CMAKE_CURRENT_SOURCE_DIR BASE_DIRECTORY "${PROJECT_SOURCE_DIR}" + OUTPUT_VARIABLE _torch_allocator_location) +install(TARGETS _torch_allocator DESTINATION "${_torch_allocator_location}") diff --git a/python/rmm/rmm/cpp/__init__.pxd b/python/rmm/rmm/cpp/__init__.pxd new file mode 100644 index 000000000..46753baa3 --- /dev/null +++ b/python/rmm/rmm/cpp/__init__.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/rmm/rmm/cpp/_torch_allocator.cpp b/python/rmm/rmm/cpp/_torch_allocator.cpp new file mode 100644 index 000000000..bfe94c2d0 --- /dev/null +++ b/python/rmm/rmm/cpp/_torch_allocator.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <rmm/cuda_device.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#include <cstddef>
+
+// These signatures must match those required by CUDAPluggableAllocator in
+// github.com/pytorch/pytorch/blob/main/torch/csrc/cuda/CUDAPluggableAllocator.h
+// Since the loading is done at runtime via dlopen, no error checking
+// can be performed for mismatching signatures.
+
+/**
+ * @brief Allocate memory of at least \p size bytes.
+ *
+ * @throws rmm::bad_alloc When the requested allocation cannot be satisfied.
+ *
+ * @param size The number of bytes to allocate
+ * @param device The device whose memory resource one should use
+ * @param stream CUDA stream to perform allocation on
+ * @return Pointer to the newly allocated memory
+ */
+extern "C" void* allocate(std::size_t size, int device, void* stream)
+{
+  rmm::cuda_device_id const device_id{device};
+  rmm::cuda_set_device_raii with_device{device_id};
+  auto mr = rmm::mr::get_per_device_resource_ref(device_id);
+  return mr.allocate_async(
+    size, rmm::CUDA_ALLOCATION_ALIGNMENT, rmm::cuda_stream_view{static_cast<cudaStream_t>(stream)});
+}
+
+/**
+ * @brief Deallocate memory pointed to by \p ptr.
+ *
+ * @param ptr Pointer to be deallocated
+ * @param size The number of bytes in the allocation
+ * @param device The device whose memory resource one should use
+ * @param stream CUDA stream to perform deallocation on
+ */
+extern "C" void deallocate(void* ptr, std::size_t size, int device, void* stream)
+{
+  rmm::cuda_device_id const device_id{device};
+  rmm::cuda_set_device_raii with_device{device_id};
+  auto mr = rmm::mr::get_per_device_resource_ref(device_id);
+  mr.deallocate_async(ptr,
+                      size,
+                      rmm::CUDA_ALLOCATION_ALIGNMENT,
+                      rmm::cuda_stream_view{static_cast<cudaStream_t>(stream)});
+}
diff --git a/python/rmm/rmm/cpp/cpp_cuda_stream.pxd b/python/rmm/rmm/cpp/cpp_cuda_stream.pxd
new file mode 100644
index 000000000..16b66ee2b
--- /dev/null
+++ b/python/rmm/rmm/cpp/cpp_cuda_stream.pxd
@@ -0,0 +1,28 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
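A note on how `_torch_allocator.so` above is meant to be consumed: PyTorch's pluggable-allocator API `dlopen`s the library and looks up the two `extern "C"` hooks by name. A minimal sketch, assuming PyTorch >= 2.0 and that the built library lands next to the `rmm.cpp` package per the `install(TARGETS ...)` rule above (the exact install location is an assumption):

```python
# Sketch: route PyTorch's CUDA allocations through the RMM hooks above.
import pathlib

import rmm.cpp
from torch.cuda.memory import CUDAPluggableAllocator, change_current_allocator

# Assumption: the .so is installed alongside the rmm.cpp package.
so_path = pathlib.Path(rmm.cpp.__file__).parent / "_torch_allocator.so"

allocator = CUDAPluggableAllocator(
    str(so_path),
    alloc_fn_name="allocate",   # matches extern "C" allocate above
    free_fn_name="deallocate",  # matches extern "C" deallocate above
)
change_current_allocator(allocator)  # must run before any CUDA allocation
```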
+ +from cuda.ccudart cimport cudaStream_t +from libcpp cimport bool + +from rmm.cpp.cpp_cuda_stream_view cimport cuda_stream_view + + +cdef extern from "rmm/cuda_stream.hpp" namespace "rmm" nogil: + cdef cppclass cuda_stream: + cuda_stream() except + + bool is_valid() except + + cudaStream_t value() except + + cuda_stream_view view() except + + void synchronize() except + + void synchronize_no_throw() diff --git a/python/rmm/rmm/cpp/cpp_cuda_stream_pool.pxd b/python/rmm/rmm/cpp/cpp_cuda_stream_pool.pxd new file mode 100644 index 000000000..553b38514 --- /dev/null +++ b/python/rmm/rmm/cpp/cpp_cuda_stream_pool.pxd @@ -0,0 +1,23 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.cpp.cpp_cuda_stream_view cimport cuda_stream_view + + +cdef extern from "rmm/cuda_stream_pool.hpp" namespace "rmm" nogil: + cdef cppclass cuda_stream_pool: + cuda_stream_pool(size_t pool_size) + cuda_stream_view get_stream() + cuda_stream_view get_stream(size_t stream_id) except + + size_t get_pool_size() diff --git a/python/rmm/rmm/cpp/cpp_cuda_stream_view.pxd b/python/rmm/rmm/cpp/cpp_cuda_stream_view.pxd new file mode 100644 index 000000000..bf0d33c24 --- /dev/null +++ b/python/rmm/rmm/cpp/cpp_cuda_stream_view.pxd @@ -0,0 +1,32 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cuda.ccudart cimport cudaStream_t +from libcpp cimport bool + + +cdef extern from "rmm/cuda_stream_view.hpp" namespace "rmm" nogil: + cdef cppclass cuda_stream_view: + cuda_stream_view() + cuda_stream_view(cudaStream_t) + cudaStream_t value() + bool is_default() + bool is_per_thread_default() + void synchronize() except + + + cdef bool operator==(cuda_stream_view const, cuda_stream_view const) + + const cuda_stream_view cuda_stream_default + const cuda_stream_view cuda_stream_legacy + const cuda_stream_view cuda_stream_per_thread diff --git a/python/rmm/rmm/cpp/cpp_device_buffer.pxd b/python/rmm/rmm/cpp/cpp_device_buffer.pxd new file mode 100644 index 000000000..1aa7634cf --- /dev/null +++ b/python/rmm/rmm/cpp/cpp_device_buffer.pxd @@ -0,0 +1,58 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.cpp.cpp_cuda_stream_view cimport cuda_stream_view +from rmm.cpp.cpp_memory_resource cimport device_memory_resource + + +cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: + cdef cppclass cuda_device_id: + ctypedef int value_type + cuda_device_id() + cuda_device_id(value_type id) + value_type value() + + cdef cuda_device_id get_current_cuda_device() + +cdef extern from "rmm/prefetch.hpp" namespace "rmm" nogil: + cdef void prefetch(const void* ptr, + size_t bytes, + cuda_device_id device, + cuda_stream_view stream) except + + +cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: + cdef cppclass device_buffer: + device_buffer() + device_buffer( + size_t size, + cuda_stream_view stream, + device_memory_resource * + ) except + + device_buffer( + const void* source_data, + size_t size, + cuda_stream_view stream, + device_memory_resource * + ) except + + device_buffer( + const device_buffer buf, + cuda_stream_view stream, + device_memory_resource * + ) except + + void reserve(size_t new_capacity, cuda_stream_view stream) except + + void resize(size_t new_size, cuda_stream_view stream) except + + void shrink_to_fit(cuda_stream_view stream) except + + void* data() + size_t size() + size_t capacity() diff --git a/python/rmm/rmm/cpp/cpp_device_uvector.pxd b/python/rmm/rmm/cpp/cpp_device_uvector.pxd new file mode 100644 index 000000000..2cb647e3c --- /dev/null +++ b/python/rmm/rmm/cpp/cpp_device_uvector.pxd @@ -0,0 +1,39 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
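The `device_buffer` and `prefetch` declarations above back the Python-level `DeviceBuffer` defined later in this diff. A usage sketch, assuming `DeviceBuffer` and the `rmm.mr` wrappers are re-exported at the package root as the docstrings below suggest:

```python
# Sketch: a DeviceBuffer backed by managed memory, then prefetched to the GPU.
import rmm

# prefetch() is only meaningful for CUDA managed (unified) memory;
# on ordinary device memory it is a no-op.
rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource())

buf = rmm.DeviceBuffer(size=1 << 20)  # 1 MiB, allocated on the default stream
buf.copy_from_host(b"abc")            # host -> device copy
buf.prefetch()                        # migrate pages to the current device
print(buf.size, buf.capacity())
```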
+
+from rmm.cpp.cpp_cuda_stream_view cimport cuda_stream_view
+from rmm.cpp.cpp_device_buffer cimport device_buffer
+from rmm.cpp.cpp_memory_resource cimport device_memory_resource
+
+
+cdef extern from "rmm/device_uvector.hpp" namespace "rmm" nogil:
+    cdef cppclass device_uvector[T]:
+        device_uvector(size_t size, cuda_stream_view stream) except +
+        T* element_ptr(size_t index)
+        void set_element(size_t element_index, const T& v, cuda_stream_view s)
+        void set_element_async(
+            size_t element_index,
+            const T& v,
+            cuda_stream_view s
+        ) except +
+        T front_element(cuda_stream_view s) except +
+        T back_element(cuda_stream_view s) except +
+        void reserve(size_t new_capacity, cuda_stream_view stream) except +
+        void resize(size_t new_size, cuda_stream_view stream) except +
+        void shrink_to_fit(cuda_stream_view stream) except +
+        device_buffer release()
+        size_t capacity()
+        T* data()
+        size_t size()
+        device_memory_resource* memory_resource()
diff --git a/python/rmm/rmm/cpp/cpp_logger.pyx b/python/rmm/rmm/cpp/cpp_logger.pyx
new file mode 100644
index 000000000..241a748c3
--- /dev/null
+++ b/python/rmm/rmm/cpp/cpp_logger.pyx
@@ -0,0 +1,66 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from libcpp cimport bool
+
+
+cdef extern from "spdlog/common.h" namespace "spdlog::level" nogil:
+    cpdef enum logging_level "spdlog::level::level_enum":
+        """
+        The debug logging level for RMM.
+
+        Debug logging prints messages to a log file. See
+        `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
+        for more information.
+
+        Valid levels, in decreasing order of verbosity, are TRACE, DEBUG,
+        INFO, WARN, ERR, CRITICAL, and OFF. Default is INFO.
+
+        Examples
+        --------
+        >>> import rmm
+        >>> rmm.logging_level.DEBUG
+        <logging_level.DEBUG: 1>
+        >>> rmm.logging_level.DEBUG.value
+        1
+        >>> rmm.logging_level.DEBUG.name
+        'DEBUG'
+
+        See Also
+        --------
+        set_logging_level : Set the debug logging level
+        get_logging_level : Get the current debug logging level
+        """
+        TRACE "spdlog::level::trace"
+        DEBUG "spdlog::level::debug"
+        INFO "spdlog::level::info"
+        WARN "spdlog::level::warn"
+        ERR "spdlog::level::err"
+        CRITICAL "spdlog::level::critical"
+        OFF "spdlog::level::off"
+
+
+cdef extern from "spdlog/spdlog.h" namespace "spdlog" nogil:
+    cdef cppclass spdlog_logger "spdlog::logger":
+        spdlog_logger() except +
+        void set_level(logging_level level)
+        logging_level level()
+        void flush() except +
+        void flush_on(logging_level level)
+        logging_level flush_level()
+        bool should_log(logging_level msg_level)
+
+
+cdef extern from "rmm/logger.hpp" namespace "rmm" nogil:
+    cdef spdlog_logger& logger() except +
diff --git a/python/rmm/rmm/cpp/cpp_memory_resource.pxd b/python/rmm/rmm/cpp/cpp_memory_resource.pxd
new file mode 100644
index 000000000..cef5d4737
--- /dev/null
+++ b/python/rmm/rmm/cpp/cpp_memory_resource.pxd
@@ -0,0 +1,34 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from libcpp.pair cimport pair
+
+from rmm.cpp.cpp_cuda_stream_view cimport cuda_stream_view
+
+
+cdef extern from "rmm/mr/device/device_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass device_memory_resource:
+        void* allocate(size_t bytes) except +
+        void* allocate(size_t bytes, cuda_stream_view stream) except +
+        void deallocate(void* ptr, size_t bytes) except +
+        void deallocate(
+            void* ptr,
+            size_t bytes,
+            cuda_stream_view stream
+        ) except +
+
+cdef extern from "rmm/cuda_device.hpp" namespace "rmm" nogil:
+    size_t percent_of_free_device_memory(int percent) except +
+    pair[size_t, size_t] available_device_memory() except +
diff --git a/python/rmm/rmm/cpp/cpp_memory_resource.pyx b/python/rmm/rmm/cpp/cpp_memory_resource.pyx
new file mode 100644
index 000000000..50e201517
--- /dev/null
+++ b/python/rmm/rmm/cpp/cpp_memory_resource.pyx
@@ -0,0 +1,215 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This import is needed for Cython typing in translate_python_except_to_cpp
+# See https://github.com/cython/cython/issues/5589
+from builtins import BaseException
+
+from libc.stddef cimport size_t
+from libc.stdint cimport int8_t, int64_t
+from libcpp cimport bool
+from libcpp.optional cimport optional
+from libcpp.pair cimport pair
+from libcpp.string cimport string
+
+from rmm.cpp.cpp_cuda_stream_view cimport cuda_stream_view
+from rmm.cpp.cpp_memory_resource cimport device_memory_resource
+
+# Transparent handle of a C++ exception
+ctypedef pair[int, string] CppExcept
+
+cdef CppExcept translate_python_except_to_cpp(err: BaseException) noexcept:
+    """Translate a Python exception into a C++ exception handle
+
+    The returned exception handle can then be thrown by `throw_cpp_except()`,
+    which MUST be done without holding the GIL.
+
+    This is useful when C++ calls a Python function and needs to catch or
+    propagate exceptions.
+    """
+    if isinstance(err, MemoryError):
+        return CppExcept(0, str.encode(str(err)))
+    return CppExcept(-1, str.encode(str(err)))
+
+# Implementation of `throw_cpp_except()`, which throws a given `CppExcept`.
+# This function MUST be called without the GIL, otherwise the thrown C++
+# exception is translated back into a Python exception.
+cdef extern from *:
+    """
+    #include <rmm/error.hpp>
+    #include <stdexcept>
+
+    void throw_cpp_except(std::pair<int, std::string> res) {
+        switch(res.first) {
+            case 0:
+                throw rmm::out_of_memory(res.second);
+            default:
+                throw std::runtime_error(res.second);
+        }
+    }
+    """
+    void throw_cpp_except(CppExcept) nogil
+
+
+# NOTE: Keep extern declarations in .pyx file as much as possible to avoid
+# leaking dependencies when importing RMM Cython .pxd files
+cdef extern from "rmm/mr/device/cuda_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass cuda_memory_resource(device_memory_resource):
+        cuda_memory_resource() except +
+
+cdef extern from "rmm/mr/device/managed_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass managed_memory_resource(device_memory_resource):
+        managed_memory_resource() except +
+
+cdef extern from "rmm/mr/device/system_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass system_memory_resource(device_memory_resource):
+        system_memory_resource() except +
+
+cdef extern from "rmm/mr/device/sam_headroom_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass sam_headroom_memory_resource(device_memory_resource):
+        sam_headroom_memory_resource(size_t headroom) except +
+
+cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+
+    cdef cppclass cuda_async_memory_resource(device_memory_resource):
+        cuda_async_memory_resource(
+            optional[size_t] initial_pool_size,
+            optional[size_t] release_threshold,
+            optional[allocation_handle_type] export_handle_type) except +
+
+# TODO: when we adopt Cython 3.0 use enum class
+cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
+        namespace \
+        "rmm::mr::cuda_async_memory_resource::allocation_handle_type" \
+        nogil:
+    enum allocation_handle_type \
+            "rmm::mr::cuda_async_memory_resource::allocation_handle_type":
+        none
+        posix_file_descriptor
+        win32
+        win32_kmt
+
+
+cdef extern from "rmm/mr/device/pool_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass pool_memory_resource[Upstream](device_memory_resource):
+        pool_memory_resource(
+            Upstream* upstream_mr,
+            size_t initial_pool_size,
+            optional[size_t] maximum_pool_size) except +
+        size_t pool_size()
+
+cdef extern from "rmm/mr/device/fixed_size_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass fixed_size_memory_resource[Upstream](device_memory_resource):
+        fixed_size_memory_resource(
+            Upstream* upstream_mr,
+            size_t block_size,
+            size_t blocks_to_preallocate) except +
+
+cdef extern from "rmm/mr/device/callback_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    ctypedef void* (*allocate_callback_t)(size_t, cuda_stream_view, void*)
+    ctypedef void (*deallocate_callback_t)(void*, size_t, cuda_stream_view, void*)
+
+    cdef cppclass callback_memory_resource(device_memory_resource):
+        callback_memory_resource(
+            allocate_callback_t allocate_callback,
+            deallocate_callback_t deallocate_callback,
+            void* allocate_callback_arg,
+            void* deallocate_callback_arg
+        ) except +
+
+cdef extern from "rmm/mr/device/binning_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass binning_memory_resource[Upstream](device_memory_resource):
+        binning_memory_resource(Upstream* upstream_mr) except +
+        binning_memory_resource(
+            Upstream* upstream_mr,
+            int8_t min_size_exponent,
+            int8_t max_size_exponent) except +
+
+        void add_bin(size_t allocation_size) except +
+        void add_bin(
+            size_t allocation_size,
+            device_memory_resource* bin_resource) except +
+
+cdef extern from "rmm/mr/device/limiting_resource_adaptor.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass limiting_resource_adaptor[Upstream](device_memory_resource):
+        limiting_resource_adaptor(
+            Upstream* upstream_mr,
+            size_t allocation_limit) except +
+
+        size_t get_allocated_bytes() except +
+        size_t get_allocation_limit() except +
+
+cdef extern from "rmm/mr/device/logging_resource_adaptor.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass logging_resource_adaptor[Upstream](device_memory_resource):
+        logging_resource_adaptor(
+            Upstream* upstream_mr,
+            string filename) except +
+
+        void flush() except +
+
+cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass statistics_resource_adaptor[Upstream](device_memory_resource):
+        struct counter:
+            counter()
+
+            int64_t value
+            int64_t peak
+            int64_t total
+
+        statistics_resource_adaptor(Upstream* upstream_mr) except +
+
+        counter get_bytes_counter() except +
+        counter get_allocations_counter() except +
+        pair[counter, counter] pop_counters() except +
+        pair[counter, counter] push_counters() except +
+
+cdef extern from "rmm/mr/device/tracking_resource_adaptor.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass tracking_resource_adaptor[Upstream](device_memory_resource):
+        tracking_resource_adaptor(
+            Upstream* upstream_mr,
+            bool capture_stacks) except +
+
+        size_t get_allocated_bytes() except +
+        string get_outstanding_allocations_str() except +
+        void log_outstanding_allocations() except +
+
+cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \
+        namespace "rmm::mr" nogil:
+    ctypedef bool (*failure_callback_t)(size_t, void*)
+    cdef cppclass failure_callback_resource_adaptor[Upstream](
+        device_memory_resource
+    ):
+        failure_callback_resource_adaptor(
+            Upstream* upstream_mr,
+            failure_callback_t callback,
+            void* callback_arg
+        ) except +
+
+cdef extern from "rmm/mr/device/prefetch_resource_adaptor.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass prefetch_resource_adaptor[Upstream](device_memory_resource):
+        prefetch_resource_adaptor(Upstream* upstream_mr) except +
diff --git a/python/rmm/rmm/cpp/cpp_per_device_resource.pxd b/python/rmm/rmm/cpp/cpp_per_device_resource.pxd
new file mode 100644
index 000000000..59f651687
--- /dev/null
+++ b/python/rmm/rmm/cpp/cpp_per_device_resource.pxd
@@ -0,0 +1,36 @@
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from rmm.cpp.cpp_memory_resource cimport device_memory_resource + + +cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: + cdef cppclass cuda_device_id: + ctypedef int value_type + + cuda_device_id(value_type id) + + value_type value() + +cdef extern from "rmm/mr/device/per_device_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef device_memory_resource* set_current_device_resource( + device_memory_resource* new_mr + ) + cdef device_memory_resource* get_current_device_resource() + cdef device_memory_resource* set_per_device_resource( + cuda_device_id id, device_memory_resource* new_mr + ) + cdef device_memory_resource* get_per_device_resource ( + cuda_device_id id + ) diff --git a/python/rmm/rmm/python/CMakeLists.txt b/python/rmm/rmm/python/CMakeLists.txt new file mode 100644 index 000000000..558c0ce9a --- /dev/null +++ b/python/rmm/rmm/python/CMakeLists.txt @@ -0,0 +1,28 @@ +# ============================================================================= +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources device_buffer.pyx lib.pyx logger.pyx memory_resource.pyx cuda_stream.pyx + helper.pyx) +set(linked_libraries rmm::rmm) + +# Build all of the Cython targets +rapids_cython_create_modules(SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" + CXX) + +# mark all symbols in these Cython targets "hidden" by default, so they won't collide with symbols +# loaded from other DSOs +foreach(_cython_target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + set_target_properties(${_cython_target} PROPERTIES C_VISIBILITY_PRESET hidden + CXX_VISIBILITY_PRESET hidden) +endforeach() diff --git a/python/rmm/rmm/python/__init__.pxd b/python/rmm/rmm/python/__init__.pxd new file mode 100644 index 000000000..46753baa3 --- /dev/null +++ b/python/rmm/rmm/python/__init__.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/rmm/rmm/python/__init__.py b/python/rmm/rmm/python/__init__.py new file mode 100644 index 000000000..0b8672ef6 --- /dev/null +++ b/python/rmm/rmm/python/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .device_buffer import DeviceBuffer diff --git a/python/rmm/rmm/python/cuda_stream.pyx b/python/rmm/rmm/python/cuda_stream.pyx new file mode 100644 index 000000000..f1529111d --- /dev/null +++ b/python/rmm/rmm/python/cuda_stream.pyx @@ -0,0 +1,36 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cimport cython +from cuda.ccudart cimport cudaStream_t +from libcpp cimport bool + +from rmm.cpp.cpp_cuda_stream cimport cuda_stream + + +@cython.final +cdef class CudaStream: + """ + Wrapper around a CUDA stream with RAII semantics. + When a CudaStream instance is GC'd, the underlying + CUDA stream is destroyed. + """ + def __cinit__(self): + self.c_obj.reset(new cuda_stream()) + + cdef cudaStream_t value(self) except * nogil: + return self.c_obj.get()[0].value() + + cdef bool is_valid(self) except * nogil: + return self.c_obj.get()[0].is_valid() diff --git a/python/rmm/rmm/python/device_buffer.pxd b/python/rmm/rmm/python/device_buffer.pxd new file mode 100644 index 000000000..d8b56d2fb --- /dev/null +++ b/python/rmm/rmm/python/device_buffer.pxd @@ -0,0 +1,71 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from libc.stdint cimport uintptr_t +from libcpp.memory cimport unique_ptr + +from rmm._cuda.stream cimport Stream +from rmm.cpp.cpp_device_buffer cimport device_buffer +from rmm.python.memory_resource cimport DeviceMemoryResource + + +cdef class DeviceBuffer: + cdef unique_ptr[device_buffer] c_obj + + # Holds a reference to the DeviceMemoryResource used for allocation. + # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is + # needed for deallocation + cdef DeviceMemoryResource mr + + # Holds a reference to the stream used by the underlying `device_buffer`. 
+    # Ensures the stream does not get destroyed before this DeviceBuffer
+    cdef Stream stream
+
+    @staticmethod
+    cdef DeviceBuffer c_from_unique_ptr(
+        unique_ptr[device_buffer] ptr,
+        Stream stream=*,
+        DeviceMemoryResource mr=*,
+    )
+
+    @staticmethod
+    cdef DeviceBuffer c_to_device(const unsigned char[::1] b,
+                                  Stream stream=*) except *
+    cpdef copy_to_host(self, ary=*, Stream stream=*)
+    cpdef copy_from_host(self, ary, Stream stream=*)
+    cpdef copy_from_device(self, cuda_ary, Stream stream=*)
+    cpdef bytes tobytes(self, Stream stream=*)
+
+    cdef size_t c_size(self) except *
+    cpdef void reserve(self, size_t new_capacity, Stream stream=*) except *
+    cpdef void resize(self, size_t new_size, Stream stream=*) except *
+    cpdef size_t capacity(self) except *
+    cdef void* c_data(self) except *
+
+    cdef device_buffer c_release(self) except *
+
+cpdef DeviceBuffer to_device(const unsigned char[::1] b,
+                             Stream stream=*)
+cpdef void copy_ptr_to_host(uintptr_t db,
+                            unsigned char[::1] hb,
+                            Stream stream=*) except *
+
+cpdef void copy_host_to_ptr(const unsigned char[::1] hb,
+                            uintptr_t db,
+                            Stream stream=*) except *
+
+cpdef void copy_device_to_ptr(uintptr_t d_src,
+                              uintptr_t d_dst,
+                              size_t count,
+                              Stream stream=*) except *
diff --git a/python/rmm/rmm/python/device_buffer.pyx b/python/rmm/rmm/python/device_buffer.pyx
new file mode 100644
index 000000000..1d564ff33
--- /dev/null
+++ b/python/rmm/rmm/python/device_buffer.pyx
@@ -0,0 +1,559 @@
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+
+cimport cython
+from cpython.bytes cimport PyBytes_FromStringAndSize
+from libc.stdint cimport uintptr_t
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from rmm._cuda.stream cimport Stream
+
+from rmm._cuda.stream import DEFAULT_STREAM
+
+cimport cuda.ccudart as ccudart
+from cuda.ccudart cimport (
+    cudaError,
+    cudaError_t,
+    cudaMemcpyAsync,
+    cudaMemcpyKind,
+    cudaStream_t,
+)
+
+from rmm.cpp.cpp_cuda_stream_view cimport cuda_stream_view
+from rmm.cpp.cpp_device_buffer cimport (
+    cuda_device_id,
+    device_buffer,
+    get_current_cuda_device,
+    prefetch,
+)
+from rmm.python.memory_resource cimport (
+    DeviceMemoryResource,
+    device_memory_resource,
+    get_current_device_resource,
+)
+
+
+# The DeviceMemoryResource attribute could be released prematurely
+# by the gc if the DeviceBuffer is in a reference cycle. Removing
+# the tp_clear function with the no_gc_clear decoration prevents that.
+# See https://github.com/rapidsai/rmm/pull/931 for details.
+@cython.no_gc_clear
+cdef class DeviceBuffer:
+
+    def __cinit__(self, *,
+                  uintptr_t ptr=0,
+                  size_t size=0,
+                  Stream stream=DEFAULT_STREAM,
+                  DeviceMemoryResource mr=None):
+        """Construct a ``DeviceBuffer`` with optional size and data pointer
+
+        Parameters
+        ----------
+        ptr : int
+            pointer to some data on host or device to copy over
+        size : int
+            size of the buffer to allocate
+            (and possibly size of data to copy)
+        stream : optional
+            CUDA stream to use for construction and/or copying,
+            defaults to the CUDA default stream. A reference to the
+            stream is stored internally to ensure it doesn't go out of
+            scope while the DeviceBuffer is in use. Destroying the
+            underlying stream while the DeviceBuffer is in use will
+            result in undefined behavior.
+        mr : optional
+            DeviceMemoryResource for the allocation, if not provided
+            defaults to the current device resource.
+
+        Note
+        ----
+        If the pointer passed is non-null and ``stream`` is the default stream,
+        it is synchronized after the copy. However if a non-default ``stream``
+        is provided, this function is fully asynchronous.
+
+        Examples
+        --------
+        >>> import rmm
+        >>> db = rmm.DeviceBuffer(size=5)
+        """
+        cdef const void* c_ptr
+        cdef device_memory_resource * mr_ptr
+        # Save a reference to the MR and stream used for allocation
+        self.mr = get_current_device_resource() if mr is None else mr
+        self.stream = stream
+
+        mr_ptr = self.mr.get_mr()
+        with nogil:
+            c_ptr = <const void*>ptr
+
+            if c_ptr == NULL or size == 0:
+                self.c_obj.reset(new device_buffer(size, stream.view(), mr_ptr))
+            else:
+                self.c_obj.reset(new device_buffer(c_ptr, size, stream.view(), mr_ptr))
+
+            if stream.c_is_default():
+                stream.c_synchronize()
+
+    def __len__(self):
+        return self.size
+
+    def __sizeof__(self):
+        return self.size
+
+    def __bytes__(self):
+        return self.tobytes()
+
+    @property
+    def nbytes(self):
+        """Gets the size of the buffer in bytes."""
+        return self.size
+
+    @property
+    def ptr(self):
+        """Gets a pointer to the underlying data."""
+        return int(self.c_data())
+
+    @property
+    def size(self):
+        """Gets the size of the buffer in bytes."""
+        return int(self.c_size())
+
+    def __reduce__(self):
+        return to_device, (self.copy_to_host(),)
+
+    @property
+    def __cuda_array_interface__(self):
+        cdef dict intf = {
+            "data": (self.ptr, False),
+            "shape": (self.size,),
+            "strides": None,
+            "typestr": "|u1",
+            "version": 0
+        }
+        return intf
+
+    def prefetch(self, device=None, stream=None):
+        """Prefetch buffer data to the specified device on the specified stream.
+
+        Assumes the storage for this DeviceBuffer is CUDA managed memory
+        (unified memory). If it is not, this function is a no-op.
+
+        Parameters
+        ----------
+        device : optional
+            The CUDA device to which to prefetch the memory for this buffer.
+            Defaults to the current CUDA device. To prefetch to the CPU, pass
+            :py:attr:`~cuda.cudart.cudaCpuDeviceId` as the device.
+        stream : optional
+            CUDA stream to use for prefetching. Defaults to self.stream
+        """
+        cdef cuda_device_id dev = (get_current_cuda_device()
+                                   if device is None
+                                   else cuda_device_id(device))
+        cdef Stream strm = self.stream if stream is None else stream
+        with nogil:
+            prefetch(self.c_obj.get()[0].data(),
+                     self.c_obj.get()[0].size(),
+                     dev,
+                     strm.view())
+
+    def copy(self):
+        """Returns a copy of DeviceBuffer.
+
+        Returns
+        -------
+        A deep copy of existing ``DeviceBuffer``
+
+        Examples
+        --------
+        >>> import rmm
+        >>> db = rmm.DeviceBuffer.to_device(b"abc")
+        >>> db_copy = db.copy()
+        >>> db.copy_to_host()
+        array([97, 98, 99], dtype=uint8)
+        >>> db_copy.copy_to_host()
+        array([97, 98, 99], dtype=uint8)
+        >>> assert db is not db_copy
+        >>> assert db.ptr != db_copy.ptr
+        """
+        ret = DeviceBuffer(ptr=self.ptr, size=self.size, stream=self.stream)
+        ret.mr = self.mr
+        return ret
+
+    def __copy__(self):
+        return self.copy()
+
+    @staticmethod
+    cdef DeviceBuffer c_from_unique_ptr(
+        unique_ptr[device_buffer] ptr,
+        Stream stream=DEFAULT_STREAM,
+        DeviceMemoryResource mr=None,
+    ):
+        cdef DeviceBuffer buf = DeviceBuffer.__new__(DeviceBuffer)
+        if stream.c_is_default():
+            stream.c_synchronize()
+        buf.c_obj = move(ptr)
+        buf.mr = get_current_device_resource() if mr is None else mr
+        buf.stream = stream
+        return buf
+
+    @staticmethod
+    cdef DeviceBuffer c_to_device(const unsigned char[::1] b,
+                                  Stream stream=DEFAULT_STREAM) except *:
+        """Calls ``to_device`` function on arguments provided"""
+        return to_device(b, stream)
+
+    @staticmethod
+    def to_device(const unsigned char[::1] b,
+                  Stream stream=DEFAULT_STREAM):
+        """Calls ``to_device`` function on arguments provided."""
+        return to_device(b, stream)
+
+    cpdef copy_to_host(self, ary=None, Stream stream=DEFAULT_STREAM):
+        """Copy from a ``DeviceBuffer`` to a buffer on host.
+
+        Parameters
+        ----------
+        ary : ``bytes``-like buffer to write into
+        stream : CUDA stream to use for copying, default the default stream
+
+        Examples
+        --------
+        >>> import rmm
+        >>> db = rmm.DeviceBuffer.to_device(b"abc")
+        >>> hb = bytearray(db.nbytes)
+        >>> db.copy_to_host(hb)
+        >>> print(hb)
+        bytearray(b'abc')
+        >>> hb = db.copy_to_host()
+        >>> print(hb)
+        array([97, 98, 99], dtype=uint8)
+        """
+        cdef const device_buffer* dbp = self.c_obj.get()
+        cdef size_t s = dbp.size()
+
+        cdef unsigned char[::1] hb = ary
+        if hb is None:
+            # NumPy leverages huge pages under-the-hood,
+            # which speeds up the copy from device to host.
+            hb = ary = np.empty((s,), dtype="u1")
+        elif len(hb) < s:
+            raise ValueError(
+                "Argument `ary` is too small. Need space for %i bytes." % s
+            )
+
+        copy_ptr_to_host(<uintptr_t>dbp.data(), hb[:s], stream)
+
+        return ary
+
+    cpdef copy_from_host(self, ary, Stream stream=DEFAULT_STREAM):
+        """Copy from a buffer on host to ``self``
+
+        Parameters
+        ----------
+        ary : ``bytes``-like buffer to copy from
+        stream : CUDA stream to use for copying, default the default stream
+
+        Examples
+        --------
+        >>> import rmm
+        >>> db = rmm.DeviceBuffer(size=10)
+        >>> hb = b"abcdef"
+        >>> db.copy_from_host(hb)
+        >>> hb = db.copy_to_host()
+        >>> print(hb)
+        array([ 97,  98,  99, 100, 101, 102,   0,   0,   0,   0], dtype=uint8)
+        """
+        cdef device_buffer* dbp = self.c_obj.get()
+
+        cdef const unsigned char[::1] hb = ary
+        cdef size_t s = len(hb)
+        if s > self.size:
+            raise ValueError(
+                "Argument `ary` is too large. Need space for %i bytes." % s
+            )
+
+        copy_host_to_ptr(hb[:s], <uintptr_t>dbp.data(), stream)
+
+    cpdef copy_from_device(self, cuda_ary,
+                           Stream stream=DEFAULT_STREAM):
+        """Copy from a buffer on device to ``self``
+
+        Parameters
+        ----------
+        cuda_ary : object to copy from that has ``__cuda_array_interface__``
+        stream : CUDA stream to use for copying, default the default stream
+
+        Examples
+        --------
+        >>> import rmm
+        >>> db = rmm.DeviceBuffer(size=5)
+        >>> db2 = rmm.DeviceBuffer.to_device(b"abc")
+        >>> db.copy_from_device(db2)
+        >>> hb = db.copy_to_host()
+        >>> print(hb)
+        array([97, 98, 99, 0, 0], dtype=uint8)
+        """
+        if not hasattr(cuda_ary, "__cuda_array_interface__"):
+            raise ValueError(
+                "Expected object to support `__cuda_array_interface__` "
+                "protocol"
+            )
+
+        cuda_ary_interface = cuda_ary.__cuda_array_interface__
+        shape = cuda_ary_interface["shape"]
+        strides = cuda_ary_interface.get("strides")
+        dtype = np.dtype(cuda_ary_interface["typestr"])
+
+        if len(shape) > 1:
+            raise ValueError(
+                "Only 1-D contiguous arrays are supported, got {}-D "
+                "array".format(str(len(shape)))
+            )
+
+        if strides is not None:
+            if strides[0] != dtype.itemsize:
+                raise ValueError(
+                    "Only 1-D contiguous arrays are supported, got a "
+                    "non-contiguous array"
+                )
+
+        cdef uintptr_t src_ptr = cuda_ary_interface["data"][0]
+        cdef size_t s = shape[0] * dtype.itemsize
+        if s > self.size:
+            raise ValueError(
+                "Argument `cuda_ary` is too large. Need space for %i bytes." % s
+            )
+
+        cdef device_buffer* dbp = self.c_obj.get()
+
+        copy_device_to_ptr(
+            src_ptr,
+            <uintptr_t>dbp.data(),
+            s,
+            stream
+        )
+
+    cpdef bytes tobytes(self, Stream stream=DEFAULT_STREAM):
+        cdef const device_buffer* dbp = self.c_obj.get()
+        cdef size_t s = dbp.size()
+
+        cdef bytes b = PyBytes_FromStringAndSize(NULL, s)
+        cdef unsigned char* p = <unsigned char*>b
+        cdef unsigned char[::1] mv = (<unsigned char[:(s + 1):1]>p)[:s]
+        self.copy_to_host(mv, stream)
+
+        return b
+
+    cdef size_t c_size(self) except *:
+        return self.c_obj.get()[0].size()
+
+    cpdef void reserve(self,
+                       size_t new_capacity,
+                       Stream stream=DEFAULT_STREAM) except *:
+        self.c_obj.get()[0].reserve(new_capacity, stream.view())
+
+    cpdef void resize(self,
+                      size_t new_size,
+                      Stream stream=DEFAULT_STREAM) except *:
+        self.c_obj.get()[0].resize(new_size, stream.view())
+
+    cpdef size_t capacity(self) except *:
+        return self.c_obj.get()[0].capacity()
+
+    cdef void* c_data(self) except *:
+        return self.c_obj.get()[0].data()
+
+    cdef device_buffer c_release(self) except *:
+        """
+        Releases ownership of the data held by this DeviceBuffer.
+        """
+        return move(cython.operator.dereference(self.c_obj))
+
+
+@cython.boundscheck(False)
+cpdef DeviceBuffer to_device(const unsigned char[::1] b,
+                             Stream stream=DEFAULT_STREAM):
+    """Return a new ``DeviceBuffer`` with a copy of the data.
+
+    Parameters
+    ----------
+    b : ``bytes``-like data on host to copy to device
+    stream : CUDA stream to use for copying, default the default stream
+
+    Returns
+    -------
+    ``DeviceBuffer`` with copy of data from host
+
+    Examples
+    --------
+    >>> import rmm
+    >>> db = rmm.python.device_buffer.to_device(b"abc")
+    >>> print(bytes(db))
+    b'abc'
+    """
+
+    if b is None:
+        raise TypeError(
+            "Argument 'b' has incorrect type"
+            " (expected bytes-like, got NoneType)"
+        )
+
+    cdef uintptr_t p = <uintptr_t>&b[0]
+    cdef size_t s = len(b)
+    return DeviceBuffer(ptr=p, size=s, stream=stream)
+
+
+@cython.boundscheck(False)
+cdef void _copy_async(const void* src,
+                      void* dst,
+                      size_t count,
+                      ccudart.cudaMemcpyKind kind,
+                      cuda_stream_view stream) except * nogil:
+    """
+    Asynchronously copy data between host and/or device pointers.
+
+    This is a convenience wrapper around cudaMemcpyAsync that
+    checks for errors. Only used for internal implementation.
+
+    Parameters
+    ----------
+    src : pointer to ``bytes``-like host buffer or device data to copy from
+    dst : pointer to ``bytes``-like host buffer or device data to copy into
+    count : the size in bytes to copy
+    kind : the kind of copy to perform
+    stream : CUDA stream to use for copying, default the default stream
+    """
+    cdef cudaError_t err = cudaMemcpyAsync(dst, src, count, kind,
+                                           <cudaStream_t>stream)
+
+    if err != cudaError.cudaSuccess:
+        raise RuntimeError(f"Memcpy failed with error: {err}")
+
+
+@cython.boundscheck(False)
+cpdef void copy_ptr_to_host(uintptr_t db,
+                            unsigned char[::1] hb,
+                            Stream stream=DEFAULT_STREAM) except *:
+    """Copy from a device pointer to a buffer on host
+
+    Parameters
+    ----------
+    db : pointer to data on device to copy
+    hb : ``bytes``-like buffer to write into
+    stream : CUDA stream to use for copying, default the default stream
+
+    Note
+    ----
+    If ``stream`` is the default stream, it is synchronized after the copy.
+    However if a non-default ``stream`` is provided, this function is fully
+    asynchronous.
+
+    Examples
+    --------
+    >>> import rmm
+    >>> db = rmm.DeviceBuffer.to_device(b"abc")
+    >>> hb = bytearray(db.nbytes)
+    >>> rmm.python.device_buffer.copy_ptr_to_host(db.ptr, hb)
+    >>> print(hb)
+    bytearray(b'abc')
+    """
+
+    if hb is None:
+        raise TypeError(
+            "Argument `hb` has incorrect type"
+            " (expected bytes-like, got NoneType)"
+        )
+
+    with nogil:
+        _copy_async(<const void*>db, <void*>&hb[0], len(hb),
+                    cudaMemcpyKind.cudaMemcpyDeviceToHost, stream.view())
+
+    if stream.c_is_default():
+        stream.c_synchronize()
+
+
+@cython.boundscheck(False)
+cpdef void copy_host_to_ptr(const unsigned char[::1] hb,
+                            uintptr_t db,
+                            Stream stream=DEFAULT_STREAM) except *:
+    """Copy from a host pointer to a device pointer
+
+    Parameters
+    ----------
+    hb : ``bytes``-like host buffer to copy
+    db : pointer to data on device to write into
+    stream : CUDA stream to use for copying, default the default stream
+
+    Note
+    ----
+    If ``stream`` is the default stream, it is synchronized after the copy.
+    However if a non-default ``stream`` is provided, this function is fully
+    asynchronous.
+
+    Examples
+    --------
+    >>> import rmm
+    >>> db = rmm.DeviceBuffer(size=10)
+    >>> hb = b"abc"
+    >>> rmm.python.device_buffer.copy_host_to_ptr(hb, db.ptr)
+    >>> hb = db.copy_to_host()
+    >>> print(hb)
+    array([97, 98, 99, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)
+    """
+
+    if hb is None:
+        raise TypeError(
+            "Argument `hb` has incorrect type"
+            " (expected bytes-like, got NoneType)"
+        )
+
+    with nogil:
+        _copy_async(<const void*>&hb[0], <void*>db, len(hb),
+                    cudaMemcpyKind.cudaMemcpyHostToDevice, stream.view())
+
+    if stream.c_is_default():
+        stream.c_synchronize()
+
+
+@cython.boundscheck(False)
+cpdef void copy_device_to_ptr(uintptr_t d_src,
+                              uintptr_t d_dst,
+                              size_t count,
+                              Stream stream=DEFAULT_STREAM) except *:
+    """Copy from a device pointer to a device pointer
+
+    Parameters
+    ----------
+    d_src : pointer to data on device to copy from
+    d_dst : pointer to data on device to write into
+    count : the size in bytes to copy
+    stream : CUDA stream to use for copying, default the default stream
+
+    Examples
+    --------
+    >>> import rmm
+    >>> db = rmm.DeviceBuffer(size=5)
+    >>> db2 = rmm.DeviceBuffer.to_device(b"abc")
+    >>> rmm.python.device_buffer.copy_device_to_ptr(db2.ptr, db.ptr, db2.size)
+    >>> hb = db.copy_to_host()
+    >>> hb
+    array([97, 98, 99, 0, 0], dtype=uint8)
+    """
+
+    with nogil:
+        _copy_async(<const void*>d_src, <void*>d_dst, count,
+                    cudaMemcpyKind.cudaMemcpyDeviceToDevice, stream.view())
diff --git a/python/rmm/rmm/python/helper.pxd b/python/rmm/rmm/python/helper.pxd
new file mode 100644
index 000000000..8ca151c00
--- /dev/null
+++ b/python/rmm/rmm/python/helper.pxd
@@ -0,0 +1,16 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+cdef object parse_bytes(object s) except *
diff --git a/python/rmm/rmm/python/helper.pyx b/python/rmm/rmm/python/helper.pyx
new file mode 100644
index 000000000..d442ee341
--- /dev/null
+++ b/python/rmm/rmm/python/helper.pyx
@@ -0,0 +1,78 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helper functions for rmm"""
+
+import re
+
+
+cdef dict BYTE_SIZES = {
+    'b': 1,
+    '': 1,
+    'kb': 1000,
+    'mb': 1000**2,
+    'gb': 1000**3,
+    'tb': 1000**4,
+    'pb': 1000**5,
+    'kib': 1024,
+    'mib': 1024**2,
+    'gib': 1024**3,
+    'tib': 1024**4,
+    'pib': 1024**5,
+}
+
+
+pattern = re.compile(r"^([0-9]+(?:\.[0-9]*)?)[\t ]*((?i:(?:[kmgtp]i?)?b))?$")
+
+cdef object parse_bytes(object s):
+    """Parse a string or integer into a number of bytes.
+
+    Parameters
+    ----------
+    s : int | str
+        Size in bytes. If an integer is provided, it is returned as-is.
+        A string is parsed as a floating point number with an (optional,
+        case-insensitive) byte specifier. Both SI prefixes (kb, mb, ..., pb)
+        and binary prefixes (kib, mib, ..., pib) are supported.
+
+    Returns
+    -------
+    Requested size in bytes as an integer.
+
+    Raises
+    ------
+    ValueError
+        If it is not possible to parse the input as a byte specification.
+    """
+    cdef str suffix
+    cdef double n
+    cdef long long multiplier
+
+    if isinstance(s, int):
+        return s
+
+    match = pattern.match(s)
+
+    if match is None:
+        raise ValueError(f"Could not parse {s} as a byte specification")
+
+    n = float(match.group(1))
+
+    suffix = match.group(2)
+    if suffix is None:
+        suffix = ""
+
+    multiplier = BYTE_SIZES[suffix.lower()]
+
+    return int(n*multiplier)
diff --git a/python/rmm/rmm/python/lib.pxd b/python/rmm/rmm/python/lib.pxd
new file mode 100644
index 000000000..b61e0d569
--- /dev/null
+++ b/python/rmm/rmm/python/lib.pxd
@@ -0,0 +1,17 @@
+# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from libcpp.utility cimport pair
+
+ctypedef pair[const char*, unsigned int] caller_pair
diff --git a/python/rmm/rmm/python/lib.pyx b/python/rmm/rmm/python/lib.pyx
new file mode 100644
index 000000000..46753baa3
--- /dev/null
+++ b/python/rmm/rmm/python/lib.pyx
@@ -0,0 +1,13 @@
+# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/rmm/rmm/python/logger.pyx b/python/rmm/rmm/python/logger.pyx
new file mode 100644
index 000000000..2997ff633
--- /dev/null
+++ b/python/rmm/rmm/python/logger.pyx
@@ -0,0 +1,208 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
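A quick illustration of the `parse_bytes` helper above, derived directly from its `BYTE_SIZES` table and regex (note the function is `cdef`, so it is callable only from other Cython modules, not from pure Python):

```python
# Behavior implied by BYTE_SIZES and the pattern above (illustrative only).
parse_bytes(42)         # -> 42 (ints pass through unchanged)
parse_bytes("100")      # -> 100 (bare number, multiplier 1)
parse_bytes("100MB")    # -> 100_000_000 (SI prefix, case-insensitive)
parse_bytes("2 KiB")    # -> 2048 (binary prefix; tabs/spaces allowed)
parse_bytes("1.5gib")   # -> 1610612736 (floating point accepted)
parse_bytes("10 foos")  # raises ValueError
```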
+
+import warnings
+from rmm.cpp.cpp_logger cimport logging_level, logger
+
+
+def _validate_level_type(level):
+    if not isinstance(level, logging_level):
+        raise TypeError("level must be an instance of the logging_level enum")
+
+
+def should_log(level):
+    """
+    Check if a message at the given level would be logged.
+
+    A message at the given level would be logged if the current debug logging
+    level is set to a level that is at least as verbose as the given level,
+    *and* the RMM module is compiled for a logging level at least as verbose.
+    If these conditions are not both met, this function will return False.
+
+    Debug logging prints messages to a log file. See
+    `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
+    for more information.
+
+    Parameters
+    ----------
+    level : logging_level
+        The debug logging level. Valid values are instances of the
+        ``logging_level`` enum.
+
+    Returns
+    -------
+    should_log : bool
+        True if a message at the given level would be logged, False otherwise.
+
+    Raises
+    ------
+    TypeError
+        If the logging level is not an instance of the ``logging_level`` enum.
+    """
+    _validate_level_type(level)
+    return logger().should_log(level)
+
+
+def set_logging_level(level):
+    """
+    Set the debug logging level.
+
+    Debug logging prints messages to a log file. See
+    `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
+    for more information.
+
+    Parameters
+    ----------
+    level : logging_level
+        The debug logging level. Valid values are instances of the
+        ``logging_level`` enum.
+
+    Raises
+    ------
+    TypeError
+        If the logging level is not an instance of the ``logging_level`` enum.
+
+    See Also
+    --------
+    get_logging_level : Get the current debug logging level.
+
+    Examples
+    --------
+    >>> import rmm
+    >>> rmm.set_logging_level(rmm.logging_level.WARN) # set logging level to warn
+    """
+    _validate_level_type(level)
+    logger().set_level(level)
+
+    if not should_log(level):
+        warnings.warn(f"RMM will not log logging_level.{level.name}. This "
+                      "may be because the C++ library is compiled for a "
+                      "less-verbose logging level.")
+
+
+def get_logging_level():
+    """
+    Get the current debug logging level.
+
+    Debug logging prints messages to a log file. See
+    `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
+    for more information.
+
+    Returns
+    -------
+    level : logging_level
+        The current debug logging level, an instance of the ``logging_level``
+        enum.
+
+    See Also
+    --------
+    set_logging_level : Set the debug logging level.
+
+    Examples
+    --------
+    >>> import rmm
+    >>> rmm.get_logging_level() # get current logging level
+    <logging_level.INFO: 2>
+    """
+    return logging_level(logger().level())
+
+
+def flush_logger():
+    """
+    Flush the debug logger. This will cause any buffered log messages to
+    be written to the log file.
+
+    Debug logging prints messages to a log file. See
+    `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
+    for more information.
+
+    See Also
+    --------
+    set_flush_level : Set the flush level for the debug logger.
+    get_flush_level : Get the current debug logging flush level.
+
+    Examples
+    --------
+    >>> import rmm
+    >>> rmm.flush_logger() # flush the logger
+    """
+    logger().flush()
+
+
+def set_flush_level(level):
+    """
+    Set the flush level for the debug logger. Messages of this level or higher
+    will automatically flush to the file.
+
+    Debug logging prints messages to a log file. See
+    `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
+    for more information.
+
+    Parameters
+    ----------
+    level : logging_level
+        The debug logging level. Valid values are instances of the
+        ``logging_level`` enum.
+
+    Raises
+    ------
+    TypeError
+        If the logging level is not an instance of the ``logging_level`` enum.
+
+
+def get_flush_level():
+    """
+    Get the current debug logging flush level for the RMM logger. Messages of
+    this level or higher will automatically flush to the file.
+
+    Debug logging prints messages to a log file. See
+    `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
+    for more information.
+
+    Returns
+    -------
+    logging_level
+        The current flush level, an instance of the ``logging_level``
+        enum.
+
+    See Also
+    --------
+    set_flush_level : Set the flush level for the logger.
+    flush_logger : Flush the logger.
+
+    Examples
+    --------
+    >>> import rmm
+    >>> rmm.get_flush_level() # get current flush level
+    <logging_level.INFO: 2>
+    """
+    return logging_level(logger().flush_level())
diff --git a/python/rmm/rmm/python/memory_resource.pxd b/python/rmm/rmm/python/memory_resource.pxd
new file mode 100644
index 000000000..0adda4eaf
--- /dev/null
+++ b/python/rmm/rmm/python/memory_resource.pxd
@@ -0,0 +1,83 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from libcpp.memory cimport shared_ptr
+
+from rmm.cpp.cpp_memory_resource cimport device_memory_resource
+
+
+cdef class DeviceMemoryResource:
+    cdef shared_ptr[device_memory_resource] c_obj
+    cdef device_memory_resource* get_mr(self) noexcept nogil
+
+cdef class UpstreamResourceAdaptor(DeviceMemoryResource):
+    cdef readonly DeviceMemoryResource upstream_mr
+
+    cpdef DeviceMemoryResource get_upstream(self)
+
+cdef class CudaMemoryResource(DeviceMemoryResource):
+    pass
+
+cdef class ManagedMemoryResource(DeviceMemoryResource):
+    pass
+
+cdef class SystemMemoryResource(DeviceMemoryResource):
+    pass
+
+cdef class SamHeadroomMemoryResource(DeviceMemoryResource):
+    pass
+
+cdef class CudaAsyncMemoryResource(DeviceMemoryResource):
+    pass
+
+cdef class PoolMemoryResource(UpstreamResourceAdaptor):
+    pass
+
+cdef class FixedSizeMemoryResource(UpstreamResourceAdaptor):
+    pass
+
+cdef class BinningMemoryResource(UpstreamResourceAdaptor):
+
+    cdef readonly list _bin_mrs
+
+    cpdef add_bin(
+        self,
+        size_t allocation_size,
+        DeviceMemoryResource bin_resource=*)
+
+cdef class CallbackMemoryResource(DeviceMemoryResource):
+    cdef object _allocate_func
+    cdef object _deallocate_func
+
+cdef class LimitingResourceAdaptor(UpstreamResourceAdaptor):
+    pass
+
+cdef class LoggingResourceAdaptor(UpstreamResourceAdaptor):
+    cdef object _log_file_name
+    cpdef get_file_name(self)
+    cpdef flush(self)
+
+cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
+    pass
+
+cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor):
+    pass
+
+cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor):
+    cdef object _callback
+
+cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor):
+    pass
+
+cpdef DeviceMemoryResource get_current_device_resource()
diff --git a/python/rmm/rmm/python/memory_resource.pyx b/python/rmm/rmm/python/memory_resource.pyx
new file mode 100644
index 000000000..46613667c
--- /dev/null
+++ b/python/rmm/rmm/python/memory_resource.pyx
@@ -0,0 +1,1170 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
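Before the implementation below, a sketch of how the adaptor hierarchy declared in memory_resource.pxd above is meant to compose. This uses only the public `rmm.mr` API that this file implements and requires a CUDA device:

```python
import rmm

# Build a chain: CUDA base -> pool suballocator -> statistics tracking.
base = rmm.mr.CudaMemoryResource()
pool = rmm.mr.PoolMemoryResource(base, initial_pool_size="1GiB")
stats = rmm.mr.StatisticsResourceAdaptor(pool)
rmm.mr.set_current_device_resource(stats)

# Every adaptor keeps its upstream alive and reachable:
assert stats.get_upstream() is pool
assert pool.get_upstream() is base
```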
+
+import os
+import warnings
+# This import is needed for Cython typing in translate_python_except_to_cpp
+# See https://github.com/cython/cython/issues/5589
+from builtins import BaseException
+from collections import defaultdict
+
+cimport cython
+from cython.operator cimport dereference as deref
+from libc.stddef cimport size_t
+from libc.stdint cimport int8_t, uintptr_t
+from libcpp cimport bool
+from libcpp.memory cimport make_unique, unique_ptr
+from libcpp.optional cimport optional
+from libcpp.pair cimport pair
+
+from cuda.cudart import cudaError_t
+
+from rmm._cuda.gpu import CUDARuntimeError, getDevice, setDevice
+
+from rmm._cuda.stream cimport Stream
+
+from rmm._cuda.stream import DEFAULT_STREAM
+
+from rmm.cpp.cpp_cuda_stream_view cimport cuda_stream_view
+from rmm.cpp.cpp_per_device_resource cimport (
+    cuda_device_id,
+    set_per_device_resource as cpp_set_per_device_resource,
+)
+from rmm.python.helper cimport parse_bytes
+from rmm.cpp.cpp_memory_resource cimport (
+    available_device_memory as c_available_device_memory,
+    percent_of_free_device_memory as c_percent_of_free_device_memory,
+)
+
+from rmm.statistics import Statistics
+
+from rmm.cpp.cpp_memory_resource cimport (
+    CppExcept,
+    allocate_callback_t,
+    allocation_handle_type,
+    binning_memory_resource,
+    callback_memory_resource,
+    cuda_async_memory_resource,
+    cuda_memory_resource,
+    deallocate_callback_t,
+    device_memory_resource,
+    failure_callback_resource_adaptor,
+    failure_callback_t,
+    fixed_size_memory_resource,
+    limiting_resource_adaptor,
+    logging_resource_adaptor,
+    managed_memory_resource,
+    pool_memory_resource,
+    posix_file_descriptor,
+    prefetch_resource_adaptor,
+    sam_headroom_memory_resource,
+    statistics_resource_adaptor,
+    system_memory_resource,
+    throw_cpp_except,
+    tracking_resource_adaptor,
+    translate_python_except_to_cpp,
+)
+
+
+cdef class DeviceMemoryResource:
+
+    cdef device_memory_resource* get_mr(self) noexcept nogil:
+        """Get the underlying C++ memory resource object."""
+        return self.c_obj.get()
+
+    def allocate(self, size_t nbytes, Stream stream=DEFAULT_STREAM):
+        """Allocate ``nbytes`` bytes of memory.
+
+        Parameters
+        ----------
+        nbytes : size_t
+            The size of the allocation in bytes
+        stream : Stream
+            Optional stream for the allocation
+        """
+        return <uintptr_t>self.c_obj.get().allocate(nbytes, stream.view())
+
+    def deallocate(self, uintptr_t ptr, size_t nbytes, Stream stream=DEFAULT_STREAM):
+        """Deallocate memory pointed to by ``ptr`` of size ``nbytes``.
+
+        Parameters
+        ----------
+        ptr : uintptr_t
+            Pointer to be deallocated
+        nbytes : size_t
+            Size of the allocation in bytes
+        stream : Stream
+            Optional stream for the deallocation
+        """
+        self.c_obj.get().deallocate(<void*>(ptr), nbytes, stream.view())
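The `allocate`/`deallocate` methods above traffic in raw pointers as Python integers. A minimal round trip, per the docstrings (requires a CUDA device, and the deallocation size must match the allocation):

```python
import rmm

mr = rmm.mr.get_current_device_resource()
ptr = mr.allocate(256)      # pointer returned as a Python integer
mr.deallocate(ptr, 256)     # caller passes back the same size
```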
+ """ + + def __cinit__(self, DeviceMemoryResource upstream_mr, *args, **kwargs): + + if (upstream_mr is None): + raise Exception("Argument `upstream_mr` must not be None") + + self.upstream_mr = upstream_mr + + def __dealloc__(self): + # Must cleanup the base MR before any upstream MR + self.c_obj.reset() + + cpdef DeviceMemoryResource get_upstream(self): + return self.upstream_mr + + +cdef class CudaMemoryResource(DeviceMemoryResource): + def __cinit__(self): + self.c_obj.reset( + new cuda_memory_resource() + ) + + def __init__(self): + """ + Memory resource that uses ``cudaMalloc``/``cudaFree`` for + allocation/deallocation. + """ + pass + + +cdef class CudaAsyncMemoryResource(DeviceMemoryResource): + """ + Memory resource that uses ``cudaMallocAsync``/``cudaFreeAsync`` for + allocation/deallocation. + + Parameters + ---------- + initial_pool_size : int | str, optional + Initial pool size in bytes. By default, half the available memory + on the device is used. A string argument is parsed using `parse_bytes`. + release_threshold: int, optional + Release threshold in bytes. If the pool size grows beyond this + value, unused memory held by the pool will be released at the + next synchronization point. + enable_ipc: bool, optional + If True, enables export of POSIX file descriptor handles for the memory + allocated by this resource so that it can be used with CUDA IPC. + """ + def __cinit__( + self, + initial_pool_size=None, + release_threshold=None, + enable_ipc=False + ): + cdef optional[size_t] c_initial_pool_size = ( + optional[size_t]() + if initial_pool_size is None + else optional[size_t]( parse_bytes(initial_pool_size)) + ) + + cdef optional[size_t] c_release_threshold = ( + optional[size_t]() + if release_threshold is None + else optional[size_t]( release_threshold) + ) + + # If IPC memory handles are not supported, the constructor below will + # raise an error from C++. + cdef optional[allocation_handle_type] c_export_handle_type = ( + optional[allocation_handle_type]( + posix_file_descriptor + ) + if enable_ipc + else optional[allocation_handle_type]() + ) + + self.c_obj.reset( + new cuda_async_memory_resource( + c_initial_pool_size, + c_release_threshold, + c_export_handle_type + ) + ) + + +cdef class ManagedMemoryResource(DeviceMemoryResource): + def __cinit__(self): + self.c_obj.reset( + new managed_memory_resource() + ) + + def __init__(self): + """ + Memory resource that uses ``cudaMallocManaged``/``cudaFree`` for + allocation/deallocation. + """ + pass + + +cdef class SystemMemoryResource(DeviceMemoryResource): + def __cinit__(self): + self.c_obj.reset( + new system_memory_resource() + ) + + def __init__(self): + """ + Memory resource that uses ``malloc``/``free`` for + allocation/deallocation. + """ + pass + + +cdef class SamHeadroomMemoryResource(DeviceMemoryResource): + def __cinit__( + self, + size_t headroom + ): + self.c_obj.reset(new sam_headroom_memory_resource(headroom)) + + def __init__( + self, + size_t headroom + ): + """ + Memory resource that uses ``malloc``/``free`` for + allocation/deallocation. 
+
+
+cdef class SamHeadroomMemoryResource(DeviceMemoryResource):
+    def __cinit__(
+        self,
+        size_t headroom
+    ):
+        self.c_obj.reset(new sam_headroom_memory_resource(headroom))
+
+    def __init__(
+        self,
+        size_t headroom
+    ):
+        """
+        Memory resource that uses ``malloc``/``free`` for
+        allocation/deallocation.
+
+        Parameters
+        ----------
+        headroom : size_t
+            Size of the reserved GPU memory as headroom
+        """
+        pass
+
+
+cdef class PoolMemoryResource(UpstreamResourceAdaptor):
+
+    def __cinit__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        initial_pool_size=None,
+        maximum_pool_size=None
+    ):
+        cdef size_t c_initial_pool_size
+        cdef optional[size_t] c_maximum_pool_size
+        c_initial_pool_size = (
+            c_percent_of_free_device_memory(50) if
+            initial_pool_size is None
+            else parse_bytes(initial_pool_size)
+        )
+        c_maximum_pool_size = (
+            optional[size_t]() if
+            maximum_pool_size is None
+            else optional[size_t](<size_t> parse_bytes(maximum_pool_size))
+        )
+        self.c_obj.reset(
+            new pool_memory_resource[device_memory_resource](
+                upstream_mr.get_mr(),
+                c_initial_pool_size,
+                c_maximum_pool_size
+            )
+        )
+
+    def __init__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        object initial_pool_size=None,
+        object maximum_pool_size=None
+    ):
+        """
+        Coalescing best-fit suballocator which uses a pool of memory allocated
+        from an upstream memory resource.
+
+        Parameters
+        ----------
+        upstream_mr : DeviceMemoryResource
+            The DeviceMemoryResource from which to allocate blocks for the
+            pool.
+        initial_pool_size : int | str, optional
+            Initial pool size in bytes. By default, half the available memory
+            on the device is used.
+        maximum_pool_size : int | str, optional
+            Maximum size in bytes that the pool can grow to.
+        """
+        pass
+
+    def pool_size(self):
+        cdef pool_memory_resource[device_memory_resource]* c_mr = (
+            <pool_memory_resource[device_memory_resource]*>(self.get_mr())
+        )
+        return c_mr.pool_size()
+
+
+cdef class FixedSizeMemoryResource(UpstreamResourceAdaptor):
+    def __cinit__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        size_t block_size=1<<20,
+        size_t blocks_to_preallocate=128
+    ):
+        self.c_obj.reset(
+            new fixed_size_memory_resource[device_memory_resource](
+                upstream_mr.get_mr(),
+                block_size,
+                blocks_to_preallocate
+            )
+        )
+
+    def __init__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        size_t block_size=1<<20,
+        size_t blocks_to_preallocate=128
+    ):
+        """
+        Memory resource which allocates memory blocks of a single fixed size.
+
+        Parameters
+        ----------
+        upstream_mr : DeviceMemoryResource
+            The DeviceMemoryResource from which to allocate blocks for the
+            pool.
+        block_size : int, optional
+            The size of blocks to allocate (default is 1MiB).
+        blocks_to_preallocate : int, optional
+            The number of blocks to allocate to initialize the pool.
+
+        Notes
+        -----
+        Supports only allocations of size smaller than the configured
+        block_size.
+        """
+        pass
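Exercising `PoolMemoryResource.pool_size()` from above (a sketch; sizes arbitrary, requires a CUDA device):

```python
import rmm

pool = rmm.mr.PoolMemoryResource(
    rmm.mr.CudaMemoryResource(),
    initial_pool_size="256MiB",
    maximum_pool_size="1GiB",
)
rmm.mr.set_current_device_resource(pool)
print(pool.pool_size())  # current pool size in bytes (initially ~256 MiB)
```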
+
+
+cdef class BinningMemoryResource(UpstreamResourceAdaptor):
+    def __cinit__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        int8_t min_size_exponent=-1,
+        int8_t max_size_exponent=-1,
+    ):
+
+        self._bin_mrs = []
+
+        if (min_size_exponent == -1 or max_size_exponent == -1):
+            self.c_obj.reset(
+                new binning_memory_resource[device_memory_resource](
+                    upstream_mr.get_mr()
+                )
+            )
+        else:
+            self.c_obj.reset(
+                new binning_memory_resource[device_memory_resource](
+                    upstream_mr.get_mr(),
+                    min_size_exponent,
+                    max_size_exponent
+                )
+            )
+
+    def __dealloc__(self):
+
+        # Must cleanup the base MR before any upstream or referenced Bins
+        self.c_obj.reset()
+
+    def __init__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        int8_t min_size_exponent=-1,
+        int8_t max_size_exponent=-1,
+    ):
+        """
+        Allocates memory from a set of specified "bin" sizes based on a
+        specified allocation size.
+
+        If min_size_exponent and max_size_exponent are specified, initializes
+        with one or more FixedSizeMemoryResource bins in the range
+        ``[2**min_size_exponent, 2**max_size_exponent]``.
+
+        Call :py:meth:`~.add_bin` to add additional bin allocators.
+
+        Parameters
+        ----------
+        upstream_mr : DeviceMemoryResource
+            The memory resource to use for allocations larger than any of the
+            bins.
+        min_size_exponent : size_t
+            The base-2 exponent of the minimum size FixedSizeMemoryResource
+            bin to create.
+        max_size_exponent : size_t
+            The base-2 exponent of the maximum size FixedSizeMemoryResource
+            bin to create.
+        """
+        pass
+
+    cpdef add_bin(
+        self,
+        size_t allocation_size,
+        DeviceMemoryResource bin_resource=None
+    ):
+        """
+        Adds a bin of the specified maximum allocation size to this memory
+        resource. If specified, uses bin_resource for allocation for this bin.
+        If not specified, creates and uses a FixedSizeMemoryResource for
+        allocation for this bin.
+
+        Allocations smaller than allocation_size and larger than the next
+        smaller bin size will use this fixed-size memory resource.
+
+        Parameters
+        ----------
+        allocation_size : size_t
+            The maximum allocation size in bytes for the created bin
+        bin_resource : DeviceMemoryResource
+            The resource to use for this bin (optional)
+        """
+        if bin_resource is None:
+            (<binning_memory_resource[device_memory_resource]*>(
+                self.c_obj.get()))[0].add_bin(allocation_size)
+        else:
+            # Save the ref to the new bin resource to ensure its lifetime
+            self._bin_mrs.append(bin_resource)
+
+            (<binning_memory_resource[device_memory_resource]*>(
+                self.c_obj.get()))[0].add_bin(
+                    allocation_size,
+                    bin_resource.get_mr())
+
+    @property
+    def bin_mrs(self) -> list:
+        """Get the list of binned memory resources."""
+        return self._bin_mrs
+
+
+cdef void* _allocate_callback_wrapper(
+    size_t nbytes,
+    cuda_stream_view stream,
+    void* ctx
+    # Note that this function is specifically designed to rethrow Python
+    # exceptions as C++ exceptions when called as a callback from C++, so it is
+    # noexcept from Cython's perspective.
+) noexcept nogil:
+    cdef CppExcept err
+    with gil:
+        try:
+            return <void*><uintptr_t>((<object>ctx)(
+                nbytes,
+                Stream._from_cudaStream_t(stream.value())
+            ))
+        except BaseException as e:
+            err = translate_python_except_to_cpp(e)
+            throw_cpp_except(err)
+
+cdef void _deallocate_callback_wrapper(
+    void* ptr,
+    size_t nbytes,
+    cuda_stream_view stream,
+    void* ctx
+) except * with gil:
+    (<object>ctx)(<uintptr_t>(ptr), nbytes, Stream._from_cudaStream_t(stream.value()))
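A sketch of the bin setup described in the `BinningMemoryResource` docstring above (the exponents and the extra bin size are arbitrary choices):

```python
import rmm

base = rmm.mr.CudaMemoryResource()
# Fixed-size bins from 2**8 to 2**22 bytes; larger requests go upstream.
mr = rmm.mr.BinningMemoryResource(base, min_size_exponent=8, max_size_exponent=22)
mr.add_bin(6 * 1024 * 1024)   # add one custom 6 MiB bin on top
rmm.mr.set_current_device_resource(mr)
```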
+
+
+cdef class CallbackMemoryResource(DeviceMemoryResource):
+    """
+    A memory resource that uses the user-provided callables to do
+    memory allocation and deallocation.
+
+    ``CallbackMemoryResource`` should really only be used for
+    debugging memory issues, as there is a significant performance
+    penalty associated with using a Python function for each memory
+    allocation and deallocation.
+
+    Parameters
+    ----------
+    allocate_func : callable
+        The allocation function must accept two arguments: an integer
+        representing the number of bytes to allocate and a Stream on
+        which to perform the allocation. It must return an integer
+        representing the pointer to the allocated memory.
+    deallocate_func : callable
+        The deallocation function must accept three arguments: an integer
+        representing the pointer to the memory to free, a second
+        integer representing the number of bytes to free, and a Stream
+        on which to perform the deallocation.
+
+    Examples
+    --------
+    >>> import rmm
+    >>> base_mr = rmm.mr.CudaMemoryResource()
+    >>> def allocate_func(size, stream):
+    ...     print(f"Allocating {size} bytes")
+    ...     return base_mr.allocate(size, stream)
+    ...
+    >>> def deallocate_func(ptr, size, stream):
+    ...     print(f"Deallocating {size} bytes")
+    ...     return base_mr.deallocate(ptr, size, stream)
+    ...
+    >>> rmm.mr.set_current_device_resource(
+    ...     rmm.mr.CallbackMemoryResource(allocate_func, deallocate_func)
+    ... )
+    >>> dbuf = rmm.DeviceBuffer(size=256)
+    Allocating 256 bytes
+    >>> del dbuf
+    Deallocating 256 bytes
+    """
+    def __init__(
+        self,
+        allocate_func,
+        deallocate_func,
+    ):
+        self._allocate_func = allocate_func
+        self._deallocate_func = deallocate_func
+        self.c_obj.reset(
+            new callback_memory_resource(
+                <allocate_callback_t>(_allocate_callback_wrapper),
+                <deallocate_callback_t>(_deallocate_callback_wrapper),
+                <void*>(allocate_func),
+                <void*>(deallocate_func)
+            )
+        )
+
+
+def _append_id(filename, id):
+    """
+    Append ".dev" onto a filename before the extension
+
+    Example: _append_id("hello.txt", 1) returns "hello.dev1.txt"
+
+    Parameters
+    ----------
+    filename : string
+        The filename, possibly with extension
+    id : int
+        The ID to append
+    """
+    name, ext = os.path.splitext(filename)
+    return f"{name}.dev{id}{ext}"
+
+
+cdef class LimitingResourceAdaptor(UpstreamResourceAdaptor):
+
+    def __cinit__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        size_t allocation_limit
+    ):
+        self.c_obj.reset(
+            new limiting_resource_adaptor[device_memory_resource](
+                upstream_mr.get_mr(),
+                allocation_limit
+            )
+        )
+
+    def __init__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        size_t allocation_limit
+    ):
+        """
+        Memory resource that limits the total amount of memory that may be
+        allocated from an upstream memory resource.
+
+        Parameters
+        ----------
+        upstream_mr : DeviceMemoryResource
+            The upstream memory resource.
+        allocation_limit : size_t
+            Maximum memory allowed for this allocator.
+        """
+        pass
+
+    def get_allocated_bytes(self) -> size_t:
+        """
+        Query the number of bytes that have been allocated. Note that this
+        cannot be used to determine how large an allocation is possible, due
+        to both fragmentation and internal page sizes and alignment that are
+        not tracked by this allocator.
+        """
+        return (<limiting_resource_adaptor[device_memory_resource]*>(
+            self.c_obj.get())
+        )[0].get_allocated_bytes()
+
+    def get_allocation_limit(self) -> size_t:
+        """
+        Query the maximum number of bytes that this allocator is allowed to
+        allocate. This is the limit on the allocator and not a representation
+        of the underlying device. The device may not be able to support this
+        limit.
+        """
+        return (<limiting_resource_adaptor[device_memory_resource]*>(
+            self.c_obj.get())
+        )[0].get_allocation_limit()
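Behavior sketch for the `LimitingResourceAdaptor` above. The over-limit failure is expected to surface as a ``MemoryError`` via RMM's usual ``bad_alloc`` translation; that mapping is an assumption here, not shown in this diff:

```python
import rmm

mr = rmm.mr.LimitingResourceAdaptor(
    rmm.mr.CudaMemoryResource(), allocation_limit=2**20  # 1 MiB cap
)
rmm.mr.set_current_device_resource(mr)

buf = rmm.DeviceBuffer(size=512 * 1024)                 # fits under the cap
print(mr.get_allocated_bytes(), mr.get_allocation_limit())
# rmm.DeviceBuffer(size=2**21)                          # would exceed the cap
```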
+
+
+cdef class LoggingResourceAdaptor(UpstreamResourceAdaptor):
+    def __cinit__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        object log_file_name=None
+    ):
+        if log_file_name is None:
+            log_file_name = os.getenv("RMM_LOG_FILE")
+            if not log_file_name:
+                raise ValueError(
+                    "RMM log file must be specified either using "
+                    "log_file_name= argument or RMM_LOG_FILE "
+                    "environment variable"
+                )
+
+        # Append the device ID before the file extension
+        log_file_name = _append_id(
+            log_file_name, getDevice()
+        )
+        log_file_name = os.path.abspath(log_file_name)
+        self._log_file_name = log_file_name
+
+        self.c_obj.reset(
+            new logging_resource_adaptor[device_memory_resource](
+                upstream_mr.get_mr(),
+                log_file_name.encode()
+            )
+        )
+
+    def __init__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        object log_file_name=None
+    ):
+        """
+        Memory resource that logs information about allocations/deallocations
+        performed by an upstream memory resource.
+
+        Parameters
+        ----------
+        upstream_mr : DeviceMemoryResource
+            The upstream memory resource.
+        log_file_name : str
+            Path to the file to which logs are written.
+        """
+        pass
+
+    cpdef flush(self):
+        (<logging_resource_adaptor[device_memory_resource]*>(
+            self.get_mr()))[0].flush()
+
+    cpdef get_file_name(self):
+        return self._log_file_name
+
+    def __dealloc__(self):
+        self.c_obj.reset()
+
+
+cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor):
+
+    def __cinit__(
+        self,
+        DeviceMemoryResource upstream_mr
+    ):
+        self.c_obj.reset(
+            new statistics_resource_adaptor[device_memory_resource](
+                upstream_mr.get_mr()
+            )
+        )
+
+    def __init__(
+        self,
+        DeviceMemoryResource upstream_mr
+    ):
+        """
+        Memory resource that tracks the current, peak and total
+        allocations/deallocations performed by an upstream memory resource.
+        Includes the ability to query these statistics at any time.
+
+        A stack of counters is maintained. Use :meth:`push_counters` and
+        :meth:`pop_counters` to track statistics at different nesting levels.
+
+        Parameters
+        ----------
+        upstream_mr : DeviceMemoryResource
+            The upstream memory resource.
+        """
+        pass
+
+    @property
+    def allocation_counts(self) -> Statistics:
+        """
+        Gets the current, peak, and total allocated bytes and number of
+        allocations.
+
+        The fields are ``current_bytes``, ``current_count``,
+        ``peak_bytes``, ``peak_count``, ``total_bytes``, and ``total_count``.
+
+        Returns
+        -------
+        Statistics
+            Object containing allocation counts and bytes.
+        """
+        cdef statistics_resource_adaptor[device_memory_resource]* mr = \
+            <statistics_resource_adaptor[device_memory_resource]*>self.c_obj.get()
+
+        counts = deref(mr).get_allocations_counter()
+        byte_counts = deref(mr).get_bytes_counter()
+        return Statistics(
+            current_bytes=byte_counts.value,
+            current_count=counts.value,
+            peak_bytes=byte_counts.peak,
+            peak_count=counts.peak,
+            total_bytes=byte_counts.total,
+            total_count=counts.total,
+        )
+
+    def pop_counters(self) -> Statistics:
+        """
+        Pop a counter pair (bytes and allocations) from the stack.
+
+        Returns
+        -------
+        The popped statistics
+        """
+        cdef statistics_resource_adaptor[device_memory_resource]* mr = \
+            <statistics_resource_adaptor[device_memory_resource]*>self.c_obj.get()
+
+        bytes_and_allocs = deref(mr).pop_counters()
+        return Statistics(
+            current_bytes=bytes_and_allocs.first.value,
+            current_count=bytes_and_allocs.second.value,
+            peak_bytes=bytes_and_allocs.first.peak,
+            peak_count=bytes_and_allocs.second.peak,
+            total_bytes=bytes_and_allocs.first.total,
+            total_count=bytes_and_allocs.second.total,
+        )
+
+    def push_counters(self) -> Statistics:
+        """
+        Push a new counter pair (bytes and allocations) onto the stack.
+
+        Returns
+        -------
+        The statistics _before_ the push
+        """
+        cdef statistics_resource_adaptor[device_memory_resource]* mr = \
+            <statistics_resource_adaptor[device_memory_resource]*>self.c_obj.get()
+
+        bytes_and_allocs = deref(mr).push_counters()
+        return Statistics(
+            current_bytes=bytes_and_allocs.first.value,
+            current_count=bytes_and_allocs.second.value,
+            peak_bytes=bytes_and_allocs.first.peak,
+            peak_count=bytes_and_allocs.second.peak,
+            total_bytes=bytes_and_allocs.first.total,
+            total_count=bytes_and_allocs.second.total,
+        )
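Scoped statistics with the counter stack above (field names match the `Statistics` construction in this file; requires a CUDA device):

```python
import rmm

mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(mr)

mr.push_counters()                 # open a fresh counting scope
buf = rmm.DeviceBuffer(size=1024)
del buf
stats = mr.pop_counters()          # Statistics for the closed scope
print(stats.peak_bytes, stats.total_count)
```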
+
+
+cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor):
+
+    def __cinit__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        bool capture_stacks=False
+    ):
+        self.c_obj.reset(
+            new tracking_resource_adaptor[device_memory_resource](
+                upstream_mr.get_mr(),
+                capture_stacks
+            )
+        )
+
+    def __init__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        bool capture_stacks=False
+    ):
+        """
+        Memory resource that tracks allocations/deallocations performed by
+        an upstream memory resource. Includes the ability to query all
+        outstanding allocations with the stack trace, if desired.
+
+        Parameters
+        ----------
+        upstream_mr : DeviceMemoryResource
+            The upstream memory resource.
+        capture_stacks : bool
+            Whether or not to capture the stack trace with each allocation.
+        """
+        pass
+
+    def get_allocated_bytes(self) -> size_t:
+        """
+        Query the number of bytes that have been allocated. Note that this
+        cannot be used to determine how large an allocation is possible, due
+        to both fragmentation and internal page sizes and alignment that are
+        not tracked by this allocator.
+        """
+        return (<tracking_resource_adaptor[device_memory_resource]*>(
+            self.c_obj.get())
+        )[0].get_allocated_bytes()
+
+    def get_outstanding_allocations_str(self) -> str:
+        """
+        Returns a string containing information about the current outstanding
+        allocations. For each allocation, the address, size and optional
+        stack trace are shown.
+        """
+
+        return (<tracking_resource_adaptor[device_memory_resource]*>(
+            self.c_obj.get())
+        )[0].get_outstanding_allocations_str().decode('UTF-8')
+
+    def log_outstanding_allocations(self):
+        """
+        Logs the output of `get_outstanding_allocations_str` to the current
+        RMM log file if enabled.
+        """
+
+        (<tracking_resource_adaptor[device_memory_resource]*>(
+            self.c_obj.get()))[0].log_outstanding_allocations()
+
+
+# Note that this function is specifically designed to rethrow Python exceptions
+# as C++ exceptions when called as a callback from C++, so it is noexcept from
+# Cython's perspective.
+cdef bool _oom_callback_function(size_t bytes, void *callback_arg) noexcept nogil:
+    cdef CppExcept err
+    with gil:
+        try:
+            return (<object>callback_arg)(bytes)
+        except BaseException as e:
+            err = translate_python_except_to_cpp(e)
+            throw_cpp_except(err)
+
+
+cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor):
+
+    def __cinit__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        object callback,
+    ):
+        self._callback = callback
+        self.c_obj.reset(
+            new failure_callback_resource_adaptor[device_memory_resource](
+                upstream_mr.get_mr(),
+                <failure_callback_t>_oom_callback_function,
+                <void*>callback
+            )
+        )
+
+    def __init__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        object callback,
+    ):
+        """
+        Memory resource that calls ``callback`` when memory allocation fails.
+
+        Parameters
+        ----------
+        upstream_mr : DeviceMemoryResource
+            The upstream memory resource.
+        callback : callable
+            Function called when memory allocation fails.
+        """
+        pass
+
+
+cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor):
+
+    def __cinit__(
+        self,
+        DeviceMemoryResource upstream_mr
+    ):
+        self.c_obj.reset(
+            new prefetch_resource_adaptor[device_memory_resource](
+                upstream_mr.get_mr()
+            )
+        )
+
+    def __init__(
+        self,
+        DeviceMemoryResource upstream_mr
+    ):
+        """
+        Memory resource that prefetches all allocations.
+
+        Parameters
+        ----------
+        upstream_mr : DeviceMemoryResource
+            The upstream memory resource.
+        """
+        pass
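Usage sketch for the `FailureCallbackResourceAdaptor` above. Per the C++ `failure_callback_resource_adaptor` contract, returning True asks RMM to retry the allocation while False lets the failure propagate:

```python
import rmm

def on_oom(nbytes: int) -> bool:
    print(f"allocation of {nbytes} bytes failed")
    return False   # do not retry; let the failure propagate

mr = rmm.mr.FailureCallbackResourceAdaptor(rmm.mr.CudaMemoryResource(), on_oom)
rmm.mr.set_current_device_resource(mr)
```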
+ """ + pass + + +# Global per-device memory resources; dict of int:DeviceMemoryResource +cdef _per_device_mrs = defaultdict(CudaMemoryResource) + + +cpdef void _initialize( + bool pool_allocator=False, + bool managed_memory=False, + object initial_pool_size=None, + object maximum_pool_size=None, + object devices=0, + bool logging=False, + object log_file_name=None, +) except *: + """ + Initializes RMM library using the options passed + """ + if managed_memory: + upstream = ManagedMemoryResource + else: + upstream = CudaMemoryResource + + if pool_allocator: + typ = PoolMemoryResource + args = (upstream(),) + kwargs = dict( + initial_pool_size=None if initial_pool_size is None + else parse_bytes(initial_pool_size), + maximum_pool_size=None if maximum_pool_size is None + else parse_bytes(maximum_pool_size) + ) + else: + typ = upstream + args = () + kwargs = {} + + cdef DeviceMemoryResource mr + cdef int original_device + + # Save the current device so we can reset it + try: + original_device = getDevice() + except CUDARuntimeError as e: + if e.status == cudaError_t.cudaErrorNoDevice: + warnings.warn(e.msg) + else: + raise e + else: + # reset any previously specified per device resources + global _per_device_mrs + _per_device_mrs.clear() + + if devices is None: + devices = [0] + elif isinstance(devices, int): + devices = [devices] + + # create a memory resource per specified device + for device in devices: + setDevice(device) + + if logging: + mr = LoggingResourceAdaptor( + typ(*args, **kwargs), + log_file_name + ) + else: + mr = typ(*args, **kwargs) + + set_per_device_resource(device, mr) + + # reset CUDA device to original + setDevice(original_device) + + +cpdef get_per_device_resource(int device): + """ + Get the default memory resource for the specified device. + + If the returned memory resource is used when a different device is the + active CUDA device, behavior is undefined. + + Parameters + ---------- + device : int + The ID of the device for which to get the memory resource. + """ + global _per_device_mrs + return _per_device_mrs[device] + + +cpdef set_per_device_resource(int device, DeviceMemoryResource mr): + """ + Set the default memory resource for the specified device. + + Parameters + ---------- + device : int + The ID of the device for which to get the memory resource. + mr : DeviceMemoryResource + The memory resource to set. Must have been created while device was + the active CUDA device. + """ + global _per_device_mrs + _per_device_mrs[device] = mr + + # Since cuda_device_id does not have a default constructor, it must be heap + # allocated + cdef unique_ptr[cuda_device_id] device_id = \ + make_unique[cuda_device_id](device) + + cpp_set_per_device_resource(deref(device_id), mr.get_mr()) + + +cpdef set_current_device_resource(DeviceMemoryResource mr): + """ + Set the default memory resource for the current device. + + Parameters + ---------- + mr : DeviceMemoryResource + The memory resource to set. Must have been created while the current + device is the active CUDA device. + """ + set_per_device_resource(getDevice(), mr) + + +cpdef get_per_device_resource_type(int device): + """ + Get the memory resource type used for RMM device allocations on the + specified device. + + Parameters + ---------- + device : int + The device ID + """ + return type(get_per_device_resource(device)) + + +cpdef DeviceMemoryResource get_current_device_resource(): + """ + Get the memory resource used for RMM device allocations on the current + device. 
+
+
+cpdef DeviceMemoryResource get_current_device_resource():
+    """
+    Get the memory resource used for RMM device allocations on the current
+    device.
+
+    If the returned memory resource is used when a different device is the
+    active CUDA device, behavior is undefined.
+    """
+    return get_per_device_resource(getDevice())
+
+
+cpdef get_current_device_resource_type():
+    """
+    Get the memory resource type used for RMM device allocations on the
+    current device.
+    """
+    return type(get_current_device_resource())
+
+
+cpdef is_initialized():
+    """
+    Check whether RMM is initialized.
+    """
+    global _per_device_mrs
+    cdef DeviceMemoryResource each_mr
+    return all(
+        [each_mr.get_mr() is not NULL
+         for each_mr in _per_device_mrs.values()]
+    )
+
+
+cpdef _flush_logs():
+    """
+    Flush the logs of all currently initialized LoggingResourceAdaptor
+    memory resources.
+    """
+    global _per_device_mrs
+    cdef DeviceMemoryResource each_mr
+    for each_mr in _per_device_mrs.values():
+        if isinstance(each_mr, LoggingResourceAdaptor):
+            each_mr.flush()
+
+
+def enable_logging(log_file_name=None):
+    """
+    Enable logging of run-time events for all devices.
+
+    Parameters
+    ----------
+    log_file_name : str, optional
+        Name of the log file. If not specified, the environment variable
+        RMM_LOG_FILE is used. A ValueError is thrown if neither is available.
+        A separate log file is produced for each device,
+        and the suffix `".dev{id}"` is automatically added to the log file
+        name.
+
+    Notes
+    -----
+    Note that if you use the environment variable CUDA_VISIBLE_DEVICES
+    with logging enabled, the suffix may not be what you expect. For
+    example, if you set CUDA_VISIBLE_DEVICES=1, the log file produced
+    will still have suffix `0`. Similarly, if you set
+    CUDA_VISIBLE_DEVICES=1,0 and use devices 0 and 1, the log file
+    with suffix `0` will correspond to the GPU with device ID `1`.
+    Use `rmm.get_log_filenames()` to get the log file names
+    corresponding to each device.
+    """
+    global _per_device_mrs
+
+    devices = [0] if not _per_device_mrs.keys() else _per_device_mrs.keys()
+
+    for device in devices:
+        each_mr = _per_device_mrs[device]
+        if not isinstance(each_mr, LoggingResourceAdaptor):
+            set_per_device_resource(
+                device,
+                LoggingResourceAdaptor(each_mr, log_file_name)
+            )
+
+
+def disable_logging():
+    """
+    Disable logging if it was enabled previously using
+    `rmm.initialize()` or `rmm.enable_logging()`.
+    """
+    global _per_device_mrs
+    for i, each_mr in _per_device_mrs.items():
+        if isinstance(each_mr, LoggingResourceAdaptor):
+            set_per_device_resource(i, each_mr.get_upstream())
+
+
+def get_log_filenames():
+    """
+    Returns the log filename (or `None` if not writing logs)
+    for each device in use.
+
+    Examples
+    --------
+    >>> import rmm
+    >>> rmm.reinitialize(devices=[0, 1], logging=True, log_file_name="rmm.log")
+    >>> rmm.get_log_filenames()
+    {0: '/home/user/workspace/rapids/rmm/python/rmm.dev0.log',
+     1: '/home/user/workspace/rapids/rmm/python/rmm.dev1.log'}
+    """
+    global _per_device_mrs
+
+    return {
+        i: each_mr.get_file_name()
+        if isinstance(each_mr, LoggingResourceAdaptor)
+        else None
+        for i, each_mr in _per_device_mrs.items()
+    }
+
+
+def available_device_memory():
+    """
+    Returns a tuple of free and total device memory.
+    """
+    cdef pair[size_t, size_t] res
+    res = c_available_device_memory()
+    return (res.first, res.second)
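End-to-end sketch of the logging helpers above (log paths depend on the working directory; requires a CUDA device):

```python
import rmm

rmm.enable_logging(log_file_name="rmm.log")  # wraps each MR in a LoggingResourceAdaptor
buf = rmm.DeviceBuffer(size=1024)
del buf
print(rmm.get_log_filenames())               # e.g. {0: '/abs/path/rmm.dev0.log'}
rmm.disable_logging()                        # unwrap back to the upstream resources
```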