Callback memory resource #980

Merged Apr 19, 2022 (39 commits)
Changes from 34 commits

Commits
ea6f54c
Draft of callback_memory_resource
shwina Nov 25, 2021
ba5a43d
building...
shwina Nov 25, 2021
46c6507
Some fixes
shwina Dec 1, 2021
eb8f9e1
Draft of callback_memory_resource
shwina Nov 25, 2021
8f1a62d
building...
shwina Nov 25, 2021
e7c8c88
Some fixes
shwina Dec 1, 2021
1e2a10c
Expose `allocate()` and `deallocate()`
shwina Apr 6, 2022
d2fca0d
Add first cpp test
shwina Apr 6, 2022
63b9690
Add another test
shwina Apr 6, 2022
cdc53e7
Use fmt instead
shwina Apr 8, 2022
e1014d7
C++ docs
shwina Apr 8, 2022
f53ccf1
Add python test
shwina Apr 8, 2022
de221d6
Merge branch 'callback-memory-resource' of github.com:shwina/rmm into…
shwina Apr 8, 2022
e9221fa
Remove files
shwina Apr 8, 2022
f00084c
Doc update
shwina Apr 13, 2022
ecad57f
Update tests/mr/device/callback_mr_tests.cpp
shwina Apr 13, 2022
01e109f
Add note about performance hit
shwina Apr 13, 2022
eacaa27
Merge branch 'callback-memory-resource' of github.com:shwina/rmm into…
shwina Apr 13, 2022
ad46071
More doc
shwina Apr 13, 2022
3cafa3a
Doc update
shwina Apr 13, 2022
4008617
Update include/rmm/mr/device/callback_memory_resource.hpp
shwina Apr 13, 2022
8dc9eca
Update include/rmm/mr/device/callback_memory_resource.hpp
shwina Apr 13, 2022
392572b
Merge branch 'branch-22.06' of https://github.com/rapidsai/rmm into c…
shwina Apr 13, 2022
a776631
Correctly pass base_mr using ctx
shwina Apr 13, 2022
169fa1f
Update include/rmm/mr/device/callback_memory_resource.hpp
shwina Apr 13, 2022
a647f99
Merge branch 'callback-memory-resource' of github.com:shwina/rmm into…
shwina Apr 13, 2022
fa22ecb
Address various review comments
shwina Apr 13, 2022
8d69937
Update include/rmm/mr/device/callback_memory_resource.hpp
shwina Apr 14, 2022
ba3c1bd
Update include/rmm/mr/device/callback_memory_resource.hpp
shwina Apr 14, 2022
c1ba187
Update python/rmm/_lib/memory_resource.pyx
shwina Apr 14, 2022
9d94861
Default to nullptr
shwina Apr 14, 2022
1933e9a
Merge branch 'callback-memory-resource' of github.com:shwina/rmm into…
shwina Apr 14, 2022
347dc9c
Move docs
shwina Apr 14, 2022
b32dfa2
Use mock class in test
shwina Apr 14, 2022
02a05d6
Add a test for allocate/deallocate
shwina Apr 19, 2022
f9e445b
Add header
shwina Apr 19, 2022
d080157
Add example
shwina Apr 19, 2022
5529ca1
Copyright
shwina Apr 19, 2022
3c73ca1
Copyright
shwina Apr 19, 2022
125 changes: 125 additions & 0 deletions include/rmm/mr/device/callback_memory_resource.hpp
@@ -0,0 +1,125 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <rmm/mr/device/device_memory_resource.hpp>

#include <cstddef>
#include <functional>
#include <utility>

namespace rmm::mr {

/**
* @brief Callback function type used by callback memory resource for allocation.
*
* The signature of the callback function is:
 * `void* allocate_callback_t(std::size_t bytes, cuda_stream_view stream, void* arg);`
*
* * Returns a pointer to an allocation of at least `bytes` usable immediately on
* `stream`. The stream-ordered behavior requirements are identical to
* `device_memory_resource::allocate`.
*
* * This signature is compatible with `do_allocate` but adds the extra function
* parameter `arg`. The `arg` is provided to the constructor of the
* `callback_memory_resource` and will be forwarded along to every invocation
* of the callback function.
*/
using allocate_callback_t = std::function<void*(std::size_t, cuda_stream_view, void*)>;

/**
* @brief Callback function type used by callback_memory_resource for deallocation.
*
* The signature of the callback function is:
 * `void deallocate_callback_t(void* ptr, std::size_t bytes, cuda_stream_view stream, void* arg);`
*
* * Deallocates memory pointed to by `ptr`. `bytes` specifies the size of the allocation
* in bytes, and must equal the value of `bytes` that was passed to the allocate callback
* function. The stream-ordered behavior requirements are identical to
* `device_memory_resource::deallocate`.
*
* * This signature is compatible with `do_deallocate` but adds the extra function
* parameter `arg`. The `arg` is provided to the constructor of the
* `callback_memory_resource` and will be forwarded along to every invocation
* of the callback function.
*/
using deallocate_callback_t = std::function<void(void*, std::size_t, cuda_stream_view, void*)>;

/**
* @brief A device memory resource that uses the provided callbacks for memory allocation
* and deallocation.
*/
class callback_memory_resource final : public device_memory_resource {
public:
/**
* @brief Construct a new callback memory resource.
*
* Constructs a callback memory resource that uses the user-provided callbacks
* `allocate_callback` for allocation and `deallocate_callback` for deallocation.
*
* @param allocate_callback The callback function used for allocation
* @param deallocate_callback The callback function used for deallocation
* @param allocate_callback_arg Additional context passed to `allocate_callback`.
* It is the caller's responsibility to maintain the lifetime of the pointed-to data
* for the duration of the lifetime of the `callback_memory_resource`.
* @param deallocate_callback_arg Additional context passed to `deallocate_callback`.
* It is the caller's responsibility to maintain the lifetime of the pointed-to data
* for the duration of the lifetime of the `callback_memory_resource`.
*/
callback_memory_resource(allocate_callback_t allocate_callback,
deallocate_callback_t deallocate_callback,
void* allocate_callback_arg = nullptr,
void* deallocate_callback_arg = nullptr) noexcept
: allocate_callback_(allocate_callback),
deallocate_callback_(deallocate_callback),
allocate_callback_arg_(allocate_callback_arg),
deallocate_callback_arg_(deallocate_callback_arg)
{
}

callback_memory_resource() = delete;
~callback_memory_resource() override = default;
callback_memory_resource(callback_memory_resource const&) = delete;
callback_memory_resource& operator=(callback_memory_resource const&) = delete;
callback_memory_resource(callback_memory_resource&&) noexcept = default;
callback_memory_resource& operator=(callback_memory_resource&&) noexcept = default;

private:
void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
{
return allocate_callback_(bytes, stream, allocate_callback_arg_);
}

void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
{
deallocate_callback_(ptr, bytes, stream, deallocate_callback_arg_);
}

[[nodiscard]] std::pair<std::size_t, std::size_t> do_get_mem_info(cuda_stream_view) const override
{
throw std::runtime_error("cannot get free / total memory");
}

[[nodiscard]] virtual bool supports_streams() const noexcept { return false; }
[[nodiscard]] virtual bool supports_get_mem_info() const noexcept { return false; }

allocate_callback_t allocate_callback_;
deallocate_callback_t deallocate_callback_;
void* allocate_callback_arg_;
void* deallocate_callback_arg_;
};

} // namespace rmm::mr
8 changes: 6 additions & 2 deletions python/rmm/_lib/memory_resource.pxd
@@ -21,11 +21,11 @@ from libcpp.vector cimport vector
cdef extern from "rmm/mr/device/device_memory_resource.hpp" \
namespace "rmm::mr" nogil:
cdef cppclass device_memory_resource:
pass
void* allocate(size_t bytes) except +
void deallocate(void* ptr, size_t bytes) except +
Comment on lines +24 to +25

Contributor: AFAICT these methods are being added just for the purpose of the test. I'm not opposed to exposing these functions in the Python API, but that seems like it merits discussion beyond this largely unrelated PR.

Contributor: I'm also fine if you want to push back here and just get this done, but I wanted to at least have that discussion recorded if so.

Contributor Author (@shwina, Apr 14, 2022): I think it's probably a good idea to expose these anyway for users who want to use RMM but not necessarily construct memory-owning DeviceBuffers. Also see my comment below.

Contributor: I'm fine with that. Could we add some explicit tests of these APIs for other memory resources then? It seems awkward that the allocate function of memory allocators (which sounds like a pretty core feature...) would only be tested in this one callback test, where we don't even actually validate the allocation.

Contributor Author: I added a test for allocate/deallocate.


cdef class DeviceMemoryResource:
cdef shared_ptr[device_memory_resource] c_obj

cdef device_memory_resource* get_mr(self)

cdef class UpstreamResourceAdaptor(DeviceMemoryResource):
@@ -57,6 +57,10 @@ cdef class BinningMemoryResource(UpstreamResourceAdaptor):
size_t allocation_size,
DeviceMemoryResource bin_resource=*)

cdef class CallbackMemoryResource(DeviceMemoryResource):
cdef object _allocate_func
cdef object _deallocate_func

cdef class LoggingResourceAdaptor(UpstreamResourceAdaptor):
cdef object _log_file_name
cpdef get_file_name(self)
76 changes: 75 additions & 1 deletion python/rmm/_lib/memory_resource.pyx
@@ -17,7 +17,7 @@ import warnings
from collections import defaultdict

from cython.operator cimport dereference as deref
from libc.stdint cimport int8_t, int64_t
from libc.stdint cimport int8_t, int64_t, uintptr_t
from libcpp cimport bool
from libcpp.cast cimport dynamic_cast
from libcpp.memory cimport make_shared, make_unique, shared_ptr, unique_ptr
@@ -26,6 +26,7 @@ from libcpp.string cimport string
from cuda.cudart import cudaError_t

from rmm._cuda.gpu import CUDARuntimeError, getDevice, setDevice
from rmm._lib.cuda_stream_view cimport cuda_stream_view


# NOTE: Keep extern declarations in .pyx file as much as possible to avoid
@@ -76,6 +77,19 @@ cdef extern from "rmm/mr/device/fixed_size_memory_resource.hpp" \
size_t block_size,
size_t block_to_preallocate) except +

cdef extern from "rmm/mr/device/callback_memory_resource.hpp" \
namespace "rmm::mr" nogil:
ctypedef void* (*allocate_callback_t)(size_t, void*)
ctypedef void (*deallocate_callback_t)(void*, size_t, void*)

cdef cppclass callback_memory_resource(device_memory_resource):
callback_memory_resource(
allocate_callback_t allocate_callback,
deallocate_callback_t deallocate_callback,
void* allocate_callback_arg,
void* deallocate_callback_arg
) except +

cdef extern from "rmm/mr/device/binning_memory_resource.hpp" \
namespace "rmm::mr" nogil:
cdef cppclass binning_memory_resource[Upstream](device_memory_resource):
@@ -168,6 +182,12 @@ cdef class DeviceMemoryResource:
cdef device_memory_resource* get_mr(self):
return self.c_obj.get()

def allocate(self, size_t nbytes):
return <uintptr_t>self.c_obj.get().allocate(nbytes)

def deallocate(self, uintptr_t ptr, size_t nbytes):
self.c_obj.get().deallocate(<void*>(ptr), nbytes)
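
For illustration, here is a minimal sketch of how these newly exposed methods can be called from Python (a hypothetical snippet, not part of the diff); the pointer is returned and accepted as a plain integer:

    import rmm

    mr = rmm.mr.CudaMemoryResource()
    ptr = mr.allocate(256)       # device pointer, returned as an integer
    mr.deallocate(ptr, 256)      # size must match the value passed to allocate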


cdef class UpstreamResourceAdaptor(DeviceMemoryResource):

@@ -444,6 +464,60 @@ cdef class BinningMemoryResource(UpstreamResourceAdaptor):
bin_resource.get_mr())


cdef void* _allocate_callback_wrapper(
size_t nbytes,
cuda_stream_view stream,
void* ctx
) with gil:
return <void*><uintptr_t>((<object>ctx)(nbytes))

cdef void _deallocate_callback_wrapper(
void* ptr,
size_t nbytes,
cuda_stream_view stream,
void* ctx
) with gil:
(<object>ctx)(<uintptr_t>(ptr), nbytes)


cdef class CallbackMemoryResource(DeviceMemoryResource):
"""
A memory resource that uses the user-provided callables to do
memory allocation and deallocation.

``CallbackMemoryResource`` should really only be used for
debugging memory issues, as there is a significant performance
penalty associated with using a Python function for each memory
allocation and deallocation.
Comment on lines +488 to +491

Contributor: Probably out of scope for this PR, but would it be possible to instead accept a cdef function as the allocator (as a void* pointer at that point) that wouldn't have these performance implications?

Contributor Author: That would preclude passing Python functions as the callbacks, which is the primary motivation for the CallbackMemoryResource.

Contributor: Correct. I'm not suggesting that we could do it with this same class or in this PR. I'm asking if this is a useful feature for future work and another class CythonCallbackMemoryResource (or if there's some way to make this signature polymorphic). Mostly asking if we should open a follow-up issue.


Parameters
----------
allocate_func: callable
The allocation function must accept a single integer argument,
representing the number of bytes to allocate, and return an
integer representing the pointer to the allocated memory.
deallocate_func: callable
The deallocation function must accept two arguments, an integer
representing the pointer to the memory to free, and a second
integer representing the number of bytes to free.
"""
def __init__(
self,
allocate_func,
deallocate_func,
):
self._allocate_func = allocate_func
self._deallocate_func = deallocate_func
self.c_obj.reset(
new callback_memory_resource(
<allocate_callback_t>(_allocate_callback_wrapper),
<deallocate_callback_t>(_deallocate_callback_wrapper),
<void*>(allocate_func),
<void*>(deallocate_func)
)
)
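
For reference, a usage sketch along the lines of the example the author mentions adding to the docstring (a hedged illustration; the exact example in the merged version may differ):

    import rmm

    base = rmm.mr.CudaMemoryResource()

    def allocate_func(nbytes):
        print(f"Allocating {nbytes} bytes")
        return base.allocate(nbytes)

    def deallocate_func(ptr, nbytes):
        print(f"Deallocating {nbytes} bytes")
        base.deallocate(ptr, nbytes)

    rmm.mr.set_current_device_resource(
        rmm.mr.CallbackMemoryResource(allocate_func, deallocate_func)
    )
    buf = rmm.DeviceBuffer(size=256)   # invokes allocate_func
    del buf                            # invokes deallocate_func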


def _append_id(filename, id):
"""
Append ".dev<ID>" onto a filename before the extension
2 changes: 2 additions & 0 deletions python/rmm/mr.py
@@ -13,6 +13,7 @@
# limitations under the License.
from rmm._lib.memory_resource import (
BinningMemoryResource,
CallbackMemoryResource,
CudaAsyncMemoryResource,
CudaMemoryResource,
DeviceMemoryResource,
@@ -39,6 +40,7 @@

__all__ = [
"BinningMemoryResource",
"CallbackMemoryResource",
"CudaAsyncMemoryResource",
"CudaMemoryResource",
"DeviceMemoryResource",
22 changes: 22 additions & 0 deletions python/rmm/tests/test_rmm.py
@@ -719,3 +719,25 @@ def test_dev_buf_circle_ref_dealloc():
# deallocate `dbuf1` (which needs the MR alive), a segfault occurs.

gc.collect()


def test_custom_mr(capsys):
base_mr = rmm.mr.CudaMemoryResource()

def allocate_func(size):
print(f"Allocating {size} bytes")
return base_mr.allocate(size)

def deallocate_func(ptr, size):
print(f"Deallocating {size} bytes")
return base_mr.deallocate(ptr, size)

rmm.mr.set_current_device_resource(
rmm.mr.CallbackMemoryResource(allocate_func, deallocate_func)
)

dbuf = rmm.DeviceBuffer(size=256)
del dbuf
Comment on lines +748 to +749

Contributor: Could we actually check the total memory allocation here instead of just looking for printed output?

Contributor Author (@shwina, Apr 14, 2022): Not for any arbitrary base_mr.

My thought here is that this test doesn't need to check that base_mr is behaving correctly, or really test what happens inside the callbacks. This test should just ensure that the callbacks are indeed invoked as expected.

Maybe there's a better way to do that I'm missing. Modifying a global is one approach I guess?

Contributor: I think that's a reasonable expectation for the test, but in that case why even have a base mr? The test could remove the actual allocation from the callback entirely. Is calling the allocate function really testing anything other than the fact that you can run arbitrary Python from the callback?

Setting a global would work, but I'm also OK with output capturing for this purpose.

Contributor Author: Mainly because it serves as a useful example of how to use CallbackMemoryResource, although I realized that probably belongs in the docstring, so I added it there.


captured = capsys.readouterr()
assert captured.out == "Allocating 256 bytes\nDeallocating 256 bytes\n"
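
As an aside, the "modify a global" alternative mentioned in the review thread above might look roughly like this (a hypothetical sketch, not part of the PR):

    def test_callback_invocation_counts():
        counts = {"allocate": 0, "deallocate": 0}
        base_mr = rmm.mr.CudaMemoryResource()

        def allocate_func(size):
            counts["allocate"] += 1
            return base_mr.allocate(size)

        def deallocate_func(ptr, size):
            counts["deallocate"] += 1
            base_mr.deallocate(ptr, size)

        rmm.mr.set_current_device_resource(
            rmm.mr.CallbackMemoryResource(allocate_func, deallocate_func)
        )
        dbuf = rmm.DeviceBuffer(size=256)
        del dbuf
        assert counts == {"allocate": 1, "deallocate": 1}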
3 changes: 3 additions & 0 deletions tests/CMakeLists.txt
@@ -150,3 +150,6 @@ ConfigureTest(ARENA_MR_TEST mr/device/arena_mr_tests.cpp)

# binning MR tests
ConfigureTest(BINNING_MR_TEST mr/device/binning_mr_tests.cpp)

# callback memory resource tests
ConfigureTest(CALLBACK_MR_TEST mr/device/callback_mr_tests.cpp)
17 changes: 17 additions & 0 deletions tests/mock_resource.hpp
@@ -0,0 +1,17 @@
#include <rmm/mr/device/device_memory_resource.hpp>

#include <gmock/gmock.h>

namespace rmm::test {

class mock_resource : public rmm::mr::device_memory_resource {
public:
MOCK_METHOD(bool, supports_streams, (), (const, override, noexcept));
MOCK_METHOD(bool, supports_get_mem_info, (), (const, override, noexcept));
MOCK_METHOD(void*, do_allocate, (std::size_t, cuda_stream_view), (override));
MOCK_METHOD(void, do_deallocate, (void*, std::size_t, cuda_stream_view), (override));
using size_pair = std::pair<std::size_t, std::size_t>;
MOCK_METHOD(size_pair, do_get_mem_info, (cuda_stream_view), (const, override));
};

} // namespace rmm::test
11 changes: 1 addition & 10 deletions tests/mr/device/aligned_mr_tests.cpp
@@ -14,6 +14,7 @@
* limitations under the License.
*/

#include "../../mock_resource.hpp"
#include <rmm/detail/aligned.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/mr/device/aligned_resource_adaptor.hpp>
@@ -28,16 +29,6 @@ namespace {

using ::testing::Return;

class mock_resource : public rmm::mr::device_memory_resource {
public:
MOCK_METHOD(bool, supports_streams, (), (const, override, noexcept));
MOCK_METHOD(bool, supports_get_mem_info, (), (const, override, noexcept));
MOCK_METHOD(void*, do_allocate, (std::size_t, cuda_stream_view), (override));
MOCK_METHOD(void, do_deallocate, (void*, std::size_t, cuda_stream_view), (override));
using size_pair = std::pair<std::size_t, std::size_t>;
MOCK_METHOD(size_pair, do_get_mem_info, (cuda_stream_view), (const, override));
};

using aligned_mock = rmm::mr::aligned_resource_adaptor<mock_resource>;
using aligned_real = rmm::mr::aligned_resource_adaptor<rmm::mr::device_memory_resource>;
