Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CUDA support for ImageBufAlgo (experimental and very incomplete) #1929

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,10 @@ ifneq (${BUILD_OIIOUTIL_ONLY},)
MY_CMAKE_FLAGS += -DBUILD_OIIOUTIL_ONLY:BOOL=${BUILD_OIIOUTIL_ONLY}
endif

ifneq (${USE_CUDA},)
MY_CMAKE_FLAGS += -DUSE_CUDA:BOOL=${USE_CUDA}
endif

ifdef DEBUG
MY_CMAKE_FLAGS += -DCMAKE_BUILD_TYPE:STRING=Debug
endif
Expand Down Expand Up @@ -510,6 +514,7 @@ help:
@echo " USE_OPENCV=0 Skip anything that needs OpenCV"
@echo " USE_PTEX=0 Skip anything that needs PTex"
@echo " USE_FREETYPE=0 Skip anything that needs Freetype"
@echo " USE_CUDA=1 Build NVIDIA CUDA support (if found)"
@echo " OIIO build-time options:"
@echo " INSTALL_PREFIX=path Set installation prefix (default: ./${INSTALL_PREFIX_BRIEF})"
@echo " NAMESPACE=name Override namespace base name (default: OpenImageIO)"
Expand Down
5 changes: 5 additions & 0 deletions site/spi/Makefile-bits-arnold
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,11 @@ ifeq (${SP_OS}, rhel7)
-DOPENEXR_CUSTOM_LIB_DIR=/usr/lib64/OpenEXR2
endif

# CUDA customizations
MY_CMAKE_FLAGS += \
-DCUDA_TOOLKIT_ROOT_DIR=/net/soft_scratch/apps/arnold/tools/nvidia/cuda9.1 \
-DCUDA_HOST_COMPILER=/shots/spi/home/lib/arnold/rhel7/llvm_4.0_final/bin/clang++

MY_CMAKE_FLAGS += \
-DOCIO_PATH=${OCIO_PATH} \
-DFIELD3D_HOME=${FIELD3D_HOME} \
Expand Down
2 changes: 2 additions & 0 deletions src/cmake/compiler.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ set (CLANG_TIDY_CHECKS "-*" CACHE STRING "clang-tidy checks to perform")
set (CLANG_TIDY_ARGS "" CACHE STRING "clang-tidy args")
option (CLANG_TIDY_FIX "Have clang-tidy fix source" OFF)
set (GLIBCXX_USE_CXX11_ABI "" CACHE STRING "For gcc, use the new C++11 library ABI (0|1)")
option (USE_CUDA "Use CUDA if found" OFF)
set (CUDA_TARGET_ARCH "sm_30" CACHE STRING "CUDA GPU architecture (e.g. sm_35)")

# Figure out which compiler we're using
if (CMAKE_COMPILER_IS_GNUCC)
Expand Down
24 changes: 24 additions & 0 deletions src/cmake/externalpackages.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# When not in VERBOSE mode, try to make things as quiet as possible
if (NOT VERBOSE)
set (Boost_FIND_QUIETLY true)
set (CUDA_FIND_QUIETLY true)
set (DCMTK_FIND_QUIETLY true)
set (FFmpeg_FIND_QUIETLY true)
set (Field3D_FIND_QUIETLY true)
Expand Down Expand Up @@ -624,4 +625,27 @@ endmacro()


###########################################################################
if (USE_CUDA)
if (NOT CUDA_TOOLKIT_ROOT_DIR AND NOT $ENV{CUDA_TOOLKIT_ROOT_DIR} STREQUAL "")
set (CUDA_TOOLKIT_ROOT_DIR $ENV{CUDA_TOOLKIT_ROOT_DIR})
endif ()
if (NOT CUDA_FIND_QUIETLY)
message (STATUS "CUDA_TOOLKIT_ROOT_DIR = ${CUDA_TOOLKIT_ROOT_DIR}")
endif ()
set (CUDA_PROPAGATE_HOST_FLAGS ON)
set (CUDA_VERBOSE_BUILD ${VERBOSE})
find_package (CUDA 7.0 REQUIRED)
list (APPEND CUDA_NVCC_FLAGS ${CSTD_FLAGS} -expt-relaxed-constexpr)
set (CUDA_INCLUDE_DIR ${CUDA_TOOLKIT_ROOT_DIR}/include)
message (STATUS "CUDA version = ${CUDA_VERSION}")
if (NOT CUDA_FIND_QUIETLY)
message (STATUS "CUDA includes = ${CUDA_INCLUDE_DIR}")
message (STATUS "CUDA libraries = ${CUDA_LIBRARIES}")
message (STATUS "CUDA host compiler = ${CUDA_HOST_COMPILER}")
message (STATUS "CUDA nvcc flags = ${CUDA_NVCC_FLAGS}")
endif ()
endif ()

# end Cuda
###########################################################################

3 changes: 3 additions & 0 deletions src/include/OpenImageIO/imagebuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,9 @@ class OIIO_API ImageBuf {
/// Which type of storage is being used for the pixels?
IBStorage storage () const;

/// Is the pixel memory of this ImageBuf visible to Cuda?
bool cuda_storage () const;

/// Is this ImageBuf object initialized?
bool initialized () const;

Expand Down
2 changes: 2 additions & 0 deletions src/include/OpenImageIO/platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,10 @@
// OIIO_HOSTDEVICE is used to supply the function decorators needed when
// compiling for CUDA devices.
#ifdef __CUDACC__
# define OIIO_HOST __host__
# define OIIO_HOSTDEVICE __host__ __device__
#else
# define OIIO_HOST
# define OIIO_HOSTDEVICE
#endif

Expand Down
26 changes: 22 additions & 4 deletions src/libOpenImageIO/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ endif ()
file (TO_NATIVE_PATH "${PLUGIN_SEARCH_PATH}" PLUGIN_SEARCH_PATH_NATIVE)
configure_file (imageio_pvt.h.in "${CMAKE_BINARY_DIR}/include/imageio_pvt.h" @ONLY)

if (BUILDSTATIC)
set (OIIO_LIB_TYPE "STATIC")
else ()
set (OIIO_LIB_TYPE "SHARED")
endif ()

file (GLOB libOpenImageIO_hdrs ../include/OpenImageIO/*.h)

if (NOT USE_EXTERNAL_PUGIXML)
Expand Down Expand Up @@ -47,6 +53,7 @@ set (libOpenImageIO_srcs
imageoutput.cpp iptc.cpp xmp.cpp
color_ocio.cpp
maketexture.cpp
imageio_cuda.cpp
../libutil/argparse.cpp
../libutil/benchmark.cpp
../libutil/errorhandler.cpp
Expand All @@ -73,6 +80,13 @@ set (libOpenImageIO_srcs
${libOpenImageIO_hdrs}
)

#set (cuda_using_srcs imagebufalgo_addsub.cpp)

if (USE_CUDA)
file (GLOB gpu_source_files "*.cu")
message (STATUS "Extra cuda files: ${gpu_source_files}")
list (APPEND libOpenImageIO_srcs ${gpu_source_files})
endif ()

# If the 'EMBEDPLUGINS' option is set, we want to compile the source for
# all the plugins into libOpenImageIO.
Expand Down Expand Up @@ -100,10 +114,12 @@ endif ()
source_group ("libutil" REGULAR_EXPRESSION ".+/libutil/.+")
source_group ("libtexture" REGULAR_EXPRESSION ".+/libtexture/.+")

if (BUILDSTATIC)
add_library (OpenImageIO STATIC ${libOpenImageIO_srcs})
if (USE_CUDA)
add_definitions ("-DOIIO_USE_CUDA=1")
cuda_add_library (OpenImageIO ${OIIO_LIB_TYPE} ${libOpenImageIO_srcs}
OPTIONS -arch ${CUDA_TARGET_ARCH})
else ()
add_library (OpenImageIO SHARED ${libOpenImageIO_srcs})
add_library (OpenImageIO ${OIIO_LIB_TYPE} ${libOpenImageIO_srcs})
endif ()

# if (SANITIZE AND ${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
Expand Down Expand Up @@ -206,7 +222,9 @@ if (EXTRA_DSO_LINK_ARGS)
set_target_properties (OpenImageIO PROPERTIES LINK_FLAGS ${EXTRA_DSO_LINK_ARGS})
endif()

oiio_install_targets (OpenImageIO)
oiio_install_targets (OpenImageIO
#${OIIO_CUDA_LIB}
)


# Testing
Expand Down
51 changes: 44 additions & 7 deletions src/libOpenImageIO/imagebuf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ class ImageBufImpl {
void append_error (const std::string& message) const;

ImageBuf::IBStorage storage () const { return m_storage; }
bool cuda_storage () const { return m_cuda_storage; }

TypeDesc pixeltype () const {
validate_spec ();
Expand Down Expand Up @@ -194,6 +195,7 @@ class ImageBufImpl {
m_current_subimage, m_current_miplevel);
}

// Make sure the pixels are ready to read with an iterator.
bool validate_pixels () const {
if (m_pixels_valid)
return true;
Expand Down Expand Up @@ -259,6 +261,7 @@ class ImageBufImpl {
mutable spin_mutex m_valid_mutex;
mutable bool m_spec_valid; ///< Is the spec valid
mutable bool m_pixels_valid; ///< Image is valid
mutable bool m_cuda_storage = false; ///< Is the pixel memory visible to Cuda?
bool m_badfile; ///< File not found
float m_pixelaspect; ///< Pixel aspect ratio of the image
size_t m_pixel_bytes;
Expand Down Expand Up @@ -373,6 +376,7 @@ ImageBufImpl::ImageBufImpl (const ImageBufImpl &src)
// Source had the image fully in memory (no cache)
if (m_storage == ImageBuf::APPBUFFER) {
// Source just wrapped the client app's pixels, we do the same
m_allocated_size = src.m_localpixels ? src.spec().image_bytes() : 0;
m_localpixels = src.m_localpixels;
} else {
// We own our pixels -- copy from source
Expand Down Expand Up @@ -498,7 +502,28 @@ ImageBufImpl::new_pixels (size_t size, const void *data)
if (m_allocated_size)
free_pixels();
m_allocated_size = size;
m_pixels.reset (size ? new char [size] : nullptr);
m_cuda_storage = false;
if (size) {
#ifdef OIIO_USE_CUDA
if (OIIO::get_int_attribute("cuda") && m_spec.format == TypeFloat) {
char *cudaptr = (char *)OIIO::pvt::cuda_malloc (m_allocated_size);
if (cudaptr) {
OIIO::debug ("IB Cuda allocated %p\n", (void*)cudaptr);
m_pixels.reset (cudaptr);
m_cuda_storage = true;
}
else {
OIIO::debug ("Requested cudaMallocManaged of %s FAILED\n",
m_allocated_size);
}
}
#endif
if (! m_pixels) {
m_pixels.reset (new char [size]);
}
} else {
m_pixels.reset ();
}
IB_local_mem_current += m_allocated_size;
if (data && size)
memcpy (m_pixels.get(), data, size);
Expand All @@ -515,12 +540,21 @@ void
ImageBufImpl::free_pixels ()
{
IB_local_mem_current -= m_allocated_size;
#if OIIO_USE_CUDA
if (m_cuda_storage) {
OIIO::debug ("IB Cuda free %p\n", (void*)m_pixels.get());
OIIO::pvt::cuda_free (m_pixels.release());
m_cuda_storage = false;
}
#endif
m_pixels.reset ();
if (m_allocated_size && pvt::oiio_print_debug > 1)
OIIO::debug ("IB freed %d MB, global IB memory now %d MB\n",
m_allocated_size>>20, IB_local_mem_current>>20);
m_allocated_size = 0;
m_storage = ImageBuf::UNINITIALIZED;
m_localpixels = nullptr;
m_pixels_valid = false;
}


Expand Down Expand Up @@ -577,6 +611,14 @@ ImageBuf::storage () const



bool
ImageBuf::cuda_storage () const
{
return impl()->cuda_storage ();
}



void
ImageBufImpl::clear ()
{
Expand All @@ -588,10 +630,8 @@ ImageBufImpl::clear ()
m_current_miplevel = -1;
m_spec = ImageSpec ();
m_nativespec = ImageSpec ();
m_pixels.reset ();
m_localpixels = NULL;
release_pixels ();
m_spec_valid = false;
m_pixels_valid = false;
m_badfile = false;
m_pixelaspect = 1;
m_pixel_bytes = 0;
Expand Down Expand Up @@ -700,8 +740,6 @@ ImageBufImpl::realloc ()
m_channel_bytes = m_spec.format.size();
m_blackpixel.resize (round_to_multiple (m_pixel_bytes, OIIO_SIMD_MAX_SIZE_BYTES), 0);
// NB make it big enough for SSE
if (m_allocated_size)
m_pixels_valid = true;
if (m_spec.deep) {
m_deepdata.init (m_spec);
m_storage = ImageBuf::LOCALBUFFER;
Expand Down Expand Up @@ -2440,5 +2478,4 @@ ImageBuf::retile (int x, int y, int z, ImageCache::Tile* &tile,
}



OIIO_NAMESPACE_END
18 changes: 18 additions & 0 deletions src/libOpenImageIO/imagebufalgo_addsub.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include <OpenImageIO/deepdata.h>
#include <OpenImageIO/dassert.h>
#include "imageio_pvt.h"
#include "imagebufalgo_cuda.h"



Expand Down Expand Up @@ -121,6 +122,15 @@ ImageBufAlgo::add (ImageBuf &dst, Image_or_Const A_, Image_or_Const B_,
return false;
ROI origroi = roi;
roi.chend = std::min (roi.chend, std::min (A.nchannels(), B.nchannels()));

#ifdef OIIO_USE_CUDA
if (dst.cuda_storage() && A.cuda_storage() && B.cuda_storage() &&
dst.roi() == roi && A.roi() == roi && B.roi() == roi) {
return pvt::add_impl_cuda (dst, A, B, dst.roi());
}
// make >/dev/null && OPENIMAGEIO_LOG_TIMES=2 oiiotool -cuda -frames 1-1 -pattern fill:topleft=0,0,0:topright=0.5,0,0:bottomleft=0,0.5,0:bottomright=0.5,0.5,0.5 1920x1080 3 -pattern fill:topleft=0.5,0,0:topright=0,0.5,0:bottomleft=0.5,0.5,0.5:bottomright=0,.5,.5 1920x1080 3 -add -o out.exr
#endif

bool ok;
OIIO_DISPATCH_COMMON_TYPES3 (ok, "add", add_impl, dst.spec().format,
A.spec().format, B.spec().format,
Expand Down Expand Up @@ -209,6 +219,14 @@ ImageBufAlgo::sub (ImageBuf &dst, Image_or_Const A_, Image_or_Const B_,
return false;
ROI origroi = roi;
roi.chend = std::min (roi.chend, std::min (A.nchannels(), B.nchannels()));

#ifdef OIIO_USE_CUDA
if (dst.cuda_storage() && A.cuda_storage() && B.cuda_storage() &&
dst.roi() == roi && A.roi() == roi && B.roi() == roi) {
return pvt::sub_impl_cuda (dst, A, B, dst.roi());
}
#endif

bool ok;
OIIO_DISPATCH_COMMON_TYPES3 (ok, "sub", sub_impl, dst.spec().format,
A.spec().format, B.spec().format,
Expand Down
Loading