diff --git a/.github/workflows/manifold.yml b/.github/workflows/manifold.yml index 2e7f069b4..953f1fd17 100644 --- a/.github/workflows/manifold.yml +++ b/.github/workflows/manifold.yml @@ -14,7 +14,8 @@ jobs: build: strategy: matrix: - backend: [CPP, OMP, TBB, CUDA] + cuda_support: [ON, OFF] + parallel_backend: [NONE, OMP, TBB] runs-on: ubuntu-20.04 container: image: docker://nvidia/cuda:11.6.0-devel-ubuntu20.04 @@ -33,9 +34,10 @@ jobs: patch third_party/assimp/code/CMakeLists.txt < assimp.diff mkdir build cd build - cmake -DCMAKE_BUILD_TYPE=Release -DTHRUST_BACKEND=${{matrix.backend}} .. && make - - name: Test ${{matrix.backend}} - if: matrix.backend != 'CUDA' + cmake -DCMAKE_BUILD_TYPE=Release -DMANIFOLD_PAR=${{matrix.parallel_backend}} -DMANIFOLD_USE_CUDA=${{matrix.cuda_support}} .. && make + - name: Test ${{matrix.parallel_backend}} with CUDA ${{matrix.cuda_support}} + # note that the test for CUDA backend does not really test CUDA, as we + # don't have CUDA GPU on GitHub Action run: | cd build/test ./manifold_test @@ -74,8 +76,8 @@ jobs: build_windows: strategy: matrix: - # backend: [CUDA, CPP] - backend: [CPP] + parallel_backend: [NONE] + cuda_support: [OFF] max-parallel: 1 runs-on: windows-2019 steps: @@ -107,11 +109,10 @@ jobs: - name: Build ${{matrix.backend}} shell: powershell run: | - cmake . -DCMAKE_BUILD_TYPE=Release -B build -DTHRUST_BACKEND=${{matrix.backend}} -A x64 + cmake . -DCMAKE_BUILD_TYPE=Release -B build -DMANIFOLD_PAR=${{matrix.parallel_backend}} -DMANIFOLD_USE_CUDA=${{matrix.cuda_support}} -A x64 cd build cmake --build . 
--target ALL_BUILD --config Release - - name: Test ${{matrix.backend}} - if: matrix.backend != 'CUDA' + - name: Test ${{matrix.parallel_backend}} with CUDA ${{matrix.cuda_support}} shell: bash run: | cd build/test @@ -120,7 +121,7 @@ jobs: build_nix: strategy: matrix: - variant: [cpp, omp, cuda, js, tbb] + variant: [none, omp, tbb, none-cuda, omp-cuda, tbb-cuda, js] runs-on: ubuntu-latest steps: - uses: actions/checkout@v2.4.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index b3843c437..d016e6e94 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,13 +9,14 @@ set(CMAKE_VERBOSE_MAKEFILE ON) # ) # endif() -set(THRUST_BACKEND "CUDA" CACHE STRING - "Thrust backend, either \"CUDA\", \"OMP\" or \"CPP\"(single-threaded)" -) +option(MANIFOLD_USE_CUDA "Build with CUDA support" ON) +set(MANIFOLD_PAR "TBB" CACHE STRING "Parallel backend, either \"TBB\" or \"OpenMP\" or \"NONE\"") + if(EMSCRIPTEN) message("Building for Emscripten") set(CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS} -sALLOW_MEMORY_GROWTH=1) - set(THRUST_BACKEND "CPP") + set(MANIFOLD_USE_CUDA OFF) + set(MANIFOLD_PAR "NONE") endif() option(BUILD_TEST_CGAL off) @@ -26,8 +27,6 @@ set(ASSIMP_INC_DIR ${CMAKE_BINARY_DIR}/third_party/assimp/include) set(GLM_INC_DIR ${PROJECT_SOURCE_DIR}/third_party/glm) -set_property(CACHE THRUST_BACKEND PROPERTY STRINGS CUDA OMP CPP) - option(ASSIMP_FAST_BUILD "build ASSIMP just for tests" ON) if(ASSIMP_FAST_BUILD) option(ASSIMP_INSTALL FALSE) @@ -39,39 +38,49 @@ if(ASSIMP_FAST_BUILD) endforeach() endif() -if(MSVC) - set(MANIFOLD_FLAGS) -else() - set(MANIFOLD_FLAGS -Werror -Wall -Wno-sign-compare -Wno-unused) -endif() set(MANIFOLD_NVCC_FLAGS -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored --extended-lambda) set(MANIFOLD_NVCC_RELEASE_FLAGS -O3) set(MANIFOLD_NVCC_DEBUG_FLAGS -G) set(THRUST_INC_DIR ${PROJECT_SOURCE_DIR}/third_party/thrust) -message("Using ${THRUST_BACKEND} backend") -if (THRUST_BACKEND STREQUAL "CUDA") - # we cannot set THRUST_INC_DIR when building with CUDA, otherwise the - # 
compiler will not use the system CUDA headers which causes incompatibility - enable_language(CUDA) - # clear THRUST_INC_DIR, we use the one from nvcc - set(THRUST_INC_DIR "") -elseif (THRUST_BACKEND STREQUAL "OMP") +message("CUDA Support: ${MANIFOLD_USE_CUDA}") +message("Parallel Backend: ${MANIFOLD_PAR}") + +if (MANIFOLD_PAR STREQUAL "OMP") find_package(OpenMP REQUIRED) set(MANIFOLD_INCLUDE OpenMP::OpenMP_CXX) - set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP -fopenmp) -elseif (THRUST_BACKEND STREQUAL "CPP") - set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP) -elseif (THRUST_BACKEND STREQUAL "TBB") + set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -fopenmp -DMANIFOLD_PAR='O') +elseif (MANIFOLD_PAR STREQUAL "TBB") find_package(PkgConfig REQUIRED) pkg_check_modules(TBB REQUIRED tbb) - set(MANIFOLD_INCLUDE ${TBB_LINK_LIBRARIES}) - set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_TBB) + set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -DMANIFOLD_PAR='T') +elseif (MANIFOLD_PAR STREQUAL "NONE") + set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS}) + set(MANIFOLD_PAR "CPP") else () - message(FATAL_ERROR "Invalid value for THRUST_BACKEND: ${THRUST_BACKEND}. " - "Should be one of \"CUDA\", \"OMP\", \"CPP\" or \"TBB\"") + message(FATAL_ERROR "Invalid value for MANIFOLD_PAR: ${MANIFOLD_PAR}. 
" + "Should be one of \"TBB\", \"OMP\", \"NONE\"") +endif() + +if (MANIFOLD_USE_CUDA) + enable_language(CUDA) + find_package(CUDA REQUIRED) + # we cannot set THRUST_INC_DIR when building with CUDA, otherwise the + # compiler will not use the system CUDA headers which causes incompatibility + # clear THRUST_INC_DIR, we use the one from nvcc + set(THRUST_INC_DIR "") + set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -DMANIFOLD_USE_CUDA) + set(MANIFOLD_NVCC_FLAGS ${MANIFOLD_NVCC_FLAGS} ${MANIFOLD_FLAGS}) +else () + set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_${MANIFOLD_PAR}) +endif() + +if(MSVC) + set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS}) +else() + set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -Werror -Wall -Wno-sign-compare -Wno-unused) endif() add_subdirectory(utilities) diff --git a/collider/CMakeLists.txt b/collider/CMakeLists.txt index d0799679b..c117c37ad 100644 --- a/collider/CMakeLists.txt +++ b/collider/CMakeLists.txt @@ -2,7 +2,7 @@ project (collider) add_library(${PROJECT_NAME} src/collider.cpp) -if(THRUST_BACKEND STREQUAL "CUDA") +if(MANIFOLD_USE_CUDA) set_source_files_properties(src/collider.cpp PROPERTIES LANGUAGE CUDA) set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 61) target_compile_options(${PROJECT_NAME} diff --git a/collider/src/collider.cpp b/collider/src/collider.cpp index f02227ad8..cfa987a45 100644 --- a/collider/src/collider.cpp +++ b/collider/src/collider.cpp @@ -14,6 +14,7 @@ #include "collider.h" #include "utils.h" +#include "par.h" #ifdef _MSC_VER #include @@ -252,7 +253,7 @@ Collider::Collider(const VecDH& leafBB, nodeParent_.resize(num_nodes, -1); internalChildren_.resize(leafBB.size() - 1, thrust::make_pair(-1, -1)); // organize tree - thrust::for_each_n(thrust::device, countAt(0), NumInternal(), + for_each_n(autoPolicy(NumInternal()), countAt(0), NumInternal(), CreateRadixTree({nodeParent_.ptrD(), internalChildren_.ptrD(), leafMorton})); UpdateBoxes(leafBB); @@ -273,8 +274,8 @@ SparseIndices 
Collider::Collisions(const VecDH& querriesIn) const { // scalar number of overlaps found VecDH nOverlapsD(1, 0); // calculate Bounding Box overlaps - thrust::for_each_n( - thrust::device, zip(querriesIn.cbegin(), countAt(0)), querriesIn.size(), + for_each_n( + autoPolicy(querriesIn.size()), zip(querriesIn.cbegin(), countAt(0)), querriesIn.size(), FindCollisions({querryTri.ptrDpq(), nOverlapsD.ptrD(), maxOverlaps, nodeBBox_.ptrD(), internalChildren_.ptrD()})); nOverlaps = nOverlapsD[0]; @@ -305,13 +306,13 @@ void Collider::UpdateBoxes(const VecDH& leafBB) { // copy in leaf node Boxs strided_range::Iter> leaves(nodeBBox_.begin(), nodeBBox_.end(), 2); - thrust::copy(thrust::device, leafBB.cbegin(), leafBB.cend(), leaves.begin()); + auto policy = autoPolicy(NumInternal()); + copy(policy, leafBB.cbegin(), leafBB.cend(), leaves.begin()); // create global counters - VecDH counter(NumInternal()); - thrust::fill(thrust::device, counter.begin(), counter.end(), 0); + VecDH counter(NumInternal(), 0); // kernel over leaves to save internal Boxs - thrust::for_each_n( - thrust::device, countAt(0), NumLeaves(), + for_each_n( + policy, countAt(0), NumLeaves(), BuildInternalBoxes({nodeBBox_.ptrD(), counter.ptrD(), nodeParent_.ptrD(), internalChildren_.ptrD()})); } @@ -330,7 +331,7 @@ bool Collider::Transform(glm::mat4x3 transform) { if (count != 2) axisAligned = false; } if (axisAligned) { - thrust::for_each(thrust::device, nodeBBox_.begin(), nodeBBox_.end(), + for_each(autoPolicy(nodeBBox_.size()), nodeBBox_.begin(), nodeBBox_.end(), TransformBox({transform})); } return axisAligned; diff --git a/flake.nix b/flake.nix index 8d917a196..08578a91a 100644 --- a/flake.nix +++ b/flake.nix @@ -10,33 +10,65 @@ inherit system; config.allowUnfree = true; }; - manifold = { backend ? "CPP", doCheck ? true, build-tools ? [ ], runtime ? 
[ ] }: pkgs.stdenv.mkDerivation { - inherit doCheck; - pname = "manifold-${backend}"; - version = "beta"; - src = self; - patches = [ ./assimp.diff ]; - nativeBuildInputs = (with pkgs; [ cmake python38 ]) ++ build-tools; - buildInputs = runtime; - cmakeFlags = [ "-DTHRUST_BACKEND=${backend}" ]; - checkPhase = '' - cd test - ./manifold_test - cd ../ - ''; - installPhase = '' - mkdir -p $out - cp manifold/libmanifold.a $out/ - cp meshIO/libmeshIO.a $out/ - cp tools/loadMesh $out - cp tools/perfTest $out - ''; - }; + manifold = + { parallel-backend ? "none" + , cuda-support ? false + , doCheck ? true + , build-tools ? [ ] + , ... + }: pkgs.stdenv.mkDerivation { + inherit doCheck; + pname = + if cuda-support then + "manifold-${parallel-backend}-cuda" + else + "manifold-${parallel-backend}"; + version = "beta"; + src = self; + patches = [ ./assimp.diff ]; + nativeBuildInputs = (with pkgs; [ cmake python38 ]) ++ build-tools ++ + (if cuda-support then [ pkgs.cudatoolkit_11_5 pkgs.addOpenGLRunpath ] else [ ]); + cmakeFlags = [ + "-DMANIFOLD_PAR=${pkgs.lib.strings.toUpper parallel-backend}" + "-DMANIFOLD_USE_CUDA=${if cuda-support then "ON" else "OFF"}" + ]; + checkPhase = '' + cd test + ./manifold_test + cd ../ + ''; + installPhase = '' + mkdir -p $out + cp manifold/libmanifold.a $out/ + cp meshIO/libmeshIO.a $out/ + cp tools/loadMesh $out + cp tools/perfTest $out + ''; + }; + parallelBackends = [ + { parallel-backend = "none"; } + { + parallel-backend = "omp"; + build-tools = [ pkgs.llvmPackages_13.openmp ]; + } + { + parallel-backend = "tbb"; + build-tools = with pkgs; [ tbb pkg-config ]; + } + ]; + buildMatrix = with pkgs; with lib; lists.flatten (map + (env: map + (x: x // env) + parallelBackends) [ + { cuda-support = false; } + { + cuda-support = true; + } + ]); devShell = { additional ? 
[ ] }: pkgs.mkShell { buildInputs = with pkgs; [ cmake - ccls - llvmPackages.openmp + llvmPackages_13.openmp clang-tools clang_13 emscripten @@ -45,48 +77,45 @@ }; in { - packages.manifold-cpp = manifold { }; - packages.manifold-omp = manifold { backend = "OMP"; runtime = [ pkgs.llvmPackages.openmp ]; }; - packages.manifold-tbb = manifold { backend = "TBB"; runtime = [ pkgs.tbb pkgs.pkg-config ]; }; - packages.manifold-cuda = manifold { - backend = "CUDA"; - runtime = [ - pkgs.cudaPackages.cudatoolkit_11 - ]; - doCheck = false; - }; - packages.manifold-js = pkgs.buildEmscriptenPackage { - name = "manifold-js"; - version = "beta"; - src = self; - patches = [ ./assimp.diff ]; - nativeBuildInputs = (with pkgs; [ cmake python38 ]); - buildInputs = [ pkgs.nodejs ]; - configurePhase = '' - mkdir build - cd build - emcmake cmake -DCMAKE_BUILD_TYPE=Release .. - ''; - buildPhase = '' - emmake make - ''; - checkPhase = '' - cd test - node manifold_test.js - cd ../ - ''; - installPhase = '' - mkdir -p $out - cd tools - cp *.js $out/ - cp *.wasm $out/ - ''; + packages = (builtins.listToAttrs + (map + (x: { + name = "manifold-" + x.parallel-backend + (if + x.cuda-support then "-cuda" else ""); + value = manifold x; + }) + buildMatrix)) // { + manifold-js = pkgs.buildEmscriptenPackage { + name = "manifold-js"; + version = "beta"; + src = self; + patches = [ ./assimp.diff ]; + nativeBuildInputs = (with pkgs; [ cmake python38 ]); + buildInputs = [ pkgs.nodejs ]; + configurePhase = '' + mkdir build + cd build + emcmake cmake -DCMAKE_BUILD_TYPE=Release .. 
+ ''; + buildPhase = '' + emmake make + ''; + checkPhase = '' + cd test + node manifold_test.js + cd ../ + ''; + installPhase = '' + mkdir -p $out + cd tools + cp *.js $out/ + cp *.wasm $out/ + ''; + }; }; devShell = devShell { }; devShells.cuda = devShell { - additional = [ - pkgs.cudaPackages.cudatoolkit_11 - ]; + additional = [ pkgs.cudatoolkit_11_5 ]; }; } ); diff --git a/manifold/CMakeLists.txt b/manifold/CMakeLists.txt index 7cc4cb4b4..482166f1e 100644 --- a/manifold/CMakeLists.txt +++ b/manifold/CMakeLists.txt @@ -5,7 +5,7 @@ add_library(${PROJECT_NAME} src/manifold.cpp src/constructors.cpp src/impl.cpp src/smoothing.cpp src/boolean3.cpp src/boolean_result.cpp ) -if(THRUST_BACKEND STREQUAL "CUDA") +if(MANIFOLD_USE_CUDA) set_source_files_properties( src/manifold.cpp src/constructors.cpp src/impl.cpp src/properties.cpp src/sort.cpp src/edge_op.cpp src/face_op.cpp diff --git a/manifold/src/boolean3.cpp b/manifold/src/boolean3.cpp index 5bd239332..0bfa89d61 100644 --- a/manifold/src/boolean3.cpp +++ b/manifold/src/boolean3.cpp @@ -13,6 +13,7 @@ // limitations under the License. 
#include "boolean3.h" +#include "par.h" #include // TODO: make this runtime configurable for quicker debug @@ -85,12 +86,13 @@ struct CopyFaceEdges { SparseIndices Filter11(const Manifold::Impl &inP, const Manifold::Impl &inQ, const SparseIndices &p1q2, const SparseIndices &p2q1) { SparseIndices p1q1(3 * p1q2.size() + 3 * p2q1.size()); - thrust::for_each_n(thrust::device, zip(countAt(0), p1q2.begin(0), p1q2.begin(1)), + auto policy = autoPolicy(p1q2.size()); + for_each_n(policy, zip(countAt(0), p1q2.begin(0), p1q2.begin(1)), p1q2.size(), CopyFaceEdges({p1q1.ptrDpq(), inQ.halfedge_.cptrD()})); p1q1.SwapPQ(); - thrust::for_each_n(thrust::device, zip(countAt(p1q2.size()), p2q1.begin(1), p2q1.begin(0)), + for_each_n(policy, zip(countAt(p1q2.size()), p2q1.begin(1), p2q1.begin(0)), p2q1.size(), CopyFaceEdges({p1q1.ptrDpq(), inP.halfedge_.cptrD()})); p1q1.SwapPQ(); @@ -245,8 +247,8 @@ std::tuple, VecDH> Shadow11(SparseIndices &p1q1, VecDH s11(p1q1.size()); VecDH xyzz11(p1q1.size()); - thrust::for_each_n( - thrust::device, zip(xyzz11.begin(), s11.begin(), p1q1.begin(0), p1q1.begin(1)), + for_each_n( + autoPolicy(p1q1.size()), zip(xyzz11.begin(), s11.begin(), p1q1.begin(0), p1q1.begin(1)), p1q1.size(), Kernel11({inP.vertPos_.cptrD(), inQ.vertPos_.cptrD(), inP.halfedge_.cptrD(), inQ.halfedge_.cptrD(), expandP, @@ -342,8 +344,8 @@ std::tuple, VecDH> Shadow02(const Manifold::Impl &inP, auto vertNormalP = forward ? 
inP.vertNormal_.cptrD() : inQ.vertNormal_.cptrD(); - thrust::for_each_n( - thrust::device, zip(s02.begin(), z02.begin(), p0q2.begin(!forward), + for_each_n( + autoPolicy(p0q2.size()), zip(s02.begin(), z02.begin(), p0q2.begin(!forward), p0q2.begin(forward)), p0q2.size(), Kernel02({inP.vertPos_.cptrD(), inQ.halfedge_.cptrD(), @@ -452,8 +454,8 @@ std::tuple, VecDH> Intersect12( VecDH x12(p1q2.size()); VecDH v12(p1q2.size()); - thrust::for_each_n( - thrust::device, zip(x12.begin(), v12.begin(), p1q2.begin(!forward), + for_each_n( + autoPolicy(p1q2.size()), zip(x12.begin(), v12.begin(), p1q2.begin(!forward), p1q2.begin(forward)), p1q2.size(), Kernel12({p0q2.ptrDpq(), s02.ptrD(), z02.cptrD(), p0q2.size(), @@ -471,19 +473,21 @@ VecDH Winding03(const Manifold::Impl &inP, SparseIndices &p0q2, // verts that are not shadowed (not in p0q2) have winding number zero. VecDH w03(inP.NumVert(), 0); - if (!thrust::is_sorted(thrust::device, p0q2.begin(reverse), p0q2.end(reverse))) - thrust::sort_by_key(thrust::device, p0q2.begin(reverse), p0q2.end(reverse), s02.begin()); + auto policy = autoPolicy(p0q2.size()); + if (!is_sorted(policy, p0q2.begin(reverse), p0q2.end(reverse))) + sort_by_key(policy, p0q2.begin(reverse), p0q2.end(reverse), s02.begin()); VecDH w03val(w03.size()); VecDH w03vert(w03.size()); // sum known s02 values into w03 (winding number) auto endPair = - thrust::reduce_by_key(thrust::device, p0q2.begin(reverse), p0q2.end(reverse), + reduce_by_key> + (policy, p0q2.begin(reverse), p0q2.end(reverse), s02.begin(), w03vert.begin(), w03val.begin()); - thrust::scatter(thrust::device, w03val.begin(), endPair.second, w03vert.begin(), + scatter(autoPolicy(endPair.second - w03val.begin()), w03val.begin(), endPair.second, w03vert.begin(), w03.begin()); if (reverse) - thrust::transform(thrust::device, w03.begin(), w03.end(), w03.begin(), + transform(autoPolicy(w03.size()), w03.begin(), w03.end(), w03.begin(), thrust::negate()); return w03; }; diff --git 
a/manifold/src/boolean_result.cpp b/manifold/src/boolean_result.cpp index 5c072d430..50ac45e00 100644 --- a/manifold/src/boolean_result.cpp +++ b/manifold/src/boolean_result.cpp @@ -17,6 +17,7 @@ #include "boolean3.h" #include "polygon.h" +#include "par.h" // TODO: make this runtime configurable for quicker debug constexpr bool kVerbose = false; @@ -83,31 +84,33 @@ std::tuple, VecDH> SizeOutput( const VecDH &i03, const VecDH &i30, const VecDH &i12, const VecDH &i21, const SparseIndices &p1q2, const SparseIndices &p2q1, bool invertQ) { - VecDH sidesPerFacePQ(inP.NumTri() + inQ.NumTri()); + VecDH sidesPerFacePQ(inP.NumTri() + inQ.NumTri(), 0); auto sidesPerFaceP = sidesPerFacePQ.ptrD(); auto sidesPerFaceQ = sidesPerFacePQ.ptrD() + inP.NumTri(); - thrust::for_each(thrust::device, inP.halfedge_.begin(), inP.halfedge_.end(), + auto policy = autoPolicy(std::max(inP.halfedge_.size(), inQ.halfedge_.size())); + + for_each(policy, inP.halfedge_.begin(), inP.halfedge_.end(), CountVerts({sidesPerFaceP, i03.cptrD()})); - thrust::for_each(thrust::device, inQ.halfedge_.begin(), inQ.halfedge_.end(), + for_each(policy, inQ.halfedge_.begin(), inQ.halfedge_.end(), CountVerts({sidesPerFaceQ, i30.cptrD()})); - thrust::for_each_n( - thrust::device, zip(p1q2.begin(0), p1q2.begin(1), i12.begin()), i12.size(), + for_each_n( + policy, zip(p1q2.begin(0), p1q2.begin(1), i12.begin()), i12.size(), CountNewVerts({sidesPerFaceP, sidesPerFaceQ, inP.halfedge_.cptrD()})); - thrust::for_each_n( - thrust::device, zip(p2q1.begin(1), p2q1.begin(0), i21.begin()), i21.size(), + for_each_n( + policy, zip(p2q1.begin(1), p2q1.begin(0), i21.begin()), i21.size(), CountNewVerts({sidesPerFaceQ, sidesPerFaceP, inQ.halfedge_.cptrD()})); - VecDH facePQ2R(inP.NumTri() + inQ.NumTri() + 1); + VecDH facePQ2R(inP.NumTri() + inQ.NumTri() + 1, 0); auto keepFace = thrust::make_transform_iterator(sidesPerFacePQ.begin(), NotZero()); - thrust::inclusive_scan(thrust::device, keepFace, keepFace + sidesPerFacePQ.size(), + 
inclusive_scan(policy, keepFace, keepFace + sidesPerFacePQ.size(), facePQ2R.begin() + 1); int numFaceR = facePQ2R.back(); facePQ2R.resize(inP.NumTri() + inQ.NumTri()); outR.faceNormal_.resize(numFaceR); - auto next = thrust::copy_if(thrust::device, inP.faceNormal_.begin(), inP.faceNormal_.end(), + auto next = copy_if(policy, inP.faceNormal_.begin(), inP.faceNormal_.end(), keepFace, outR.faceNormal_.begin(), thrust::identity()); if (invertQ) { @@ -115,17 +118,17 @@ std::tuple, VecDH> SizeOutput( thrust::negate()); auto end = thrust::make_transform_iterator(inQ.faceNormal_.end(), thrust::negate()); - thrust::copy_if(thrust::device, start, end, keepFace + inP.NumTri(), next, + copy_if(policy, start, end, keepFace + inP.NumTri(), next, thrust::identity()); } else { - thrust::copy_if(thrust::device, inQ.faceNormal_.begin(), inQ.faceNormal_.end(), + copy_if(policy, inQ.faceNormal_.begin(), inQ.faceNormal_.end(), keepFace + inP.NumTri(), next, thrust::identity()); } auto newEnd = - thrust::remove(thrust::device, sidesPerFacePQ.begin(), sidesPerFacePQ.end(), 0); - VecDH faceEdge(newEnd - sidesPerFacePQ.begin() + 1); - thrust::inclusive_scan(thrust::device, sidesPerFacePQ.begin(), newEnd, + remove(policy, sidesPerFacePQ.begin(), sidesPerFacePQ.end(), 0); + VecDH faceEdge(newEnd - sidesPerFacePQ.begin() + 1, 0); + inclusive_scan(policy, sidesPerFacePQ.begin(), newEnd, faceEdge.begin() + 1); outR.halfedge_.resize(faceEdge.back()); @@ -409,8 +412,8 @@ void AppendWholeEdges(Manifold::Impl &outR, VecDH &facePtrR, const VecDH wholeHalfedgeP, const VecDH &i03, const VecDH &vP2R, const int *faceP2R, bool forward) { - thrust::for_each_n( - thrust::device, zip(wholeHalfedgeP.begin(), inP.halfedge_.begin(), countAt(0)), + for_each_n( + autoPolicy(inP.halfedge_.size()), zip(wholeHalfedgeP.begin(), inP.halfedge_.begin(), countAt(0)), inP.halfedge_.size(), DuplicateHalfedges({outR.halfedge_.ptrD(), halfedgeRef.ptrD(), facePtrR.ptrD(), inP.halfedge_.cptrD(), i03.cptrD(), @@ -489,8 +492,8 
@@ std::pair, VecDH> CalculateMeshRelation( VecDH faceRef(numFaceR); VecDH halfedgeBary(halfedgeRef.size()); VecDH idx(1, 0); - thrust::for_each_n( - thrust::device, zip(halfedgeBary.begin(), halfedgeRef.begin(), + for_each_n( + autoPolicy(halfedgeRef.size()), zip(halfedgeBary.begin(), halfedgeRef.begin(), outR.halfedge_.cbegin()), halfedgeRef.size(), CreateBarycentric( @@ -558,25 +561,27 @@ Manifold::Impl Boolean3::Result(Manifold::OpType op) const { VecDH i21(x21_.size()); VecDH i03(w03_.size()); VecDH i30(w30_.size()); - thrust::transform(thrust::device, x12_.begin(), x12_.end(), i12.begin(), c3 * _1); - thrust::transform(thrust::device, x21_.begin(), x21_.end(), i21.begin(), c3 * _1); - thrust::transform(thrust::device, w03_.begin(), w03_.end(), i03.begin(), c1 + c3 * _1); - thrust::transform(thrust::device, w30_.begin(), w30_.end(), i30.begin(), c2 + c3 * _1); + auto policy = autoPolicy(std::max(std::max(x12_.size(), x21_.size()), std::max(w03_.size(), w30_.size()))); + + transform(policy, x12_.begin(), x12_.end(), i12.begin(), c3 * _1); + transform(policy, x21_.begin(), x21_.end(), i21.begin(), c3 * _1); + transform(policy, w03_.begin(), w03_.end(), i03.begin(), c1 + c3 * _1); + transform(policy, w30_.begin(), w30_.end(), i30.begin(), c2 + c3 * _1); VecDH vP2R(inP_.NumVert()); - thrust::exclusive_scan(thrust::device, i03.begin(), i03.end(), vP2R.begin(), 0, AbsSum()); + exclusive_scan(policy, i03.begin(), i03.end(), vP2R.begin(), 0, AbsSum()); int numVertR = AbsSum()(vP2R.back(), i03.back()); const int nPv = numVertR; VecDH vQ2R(inQ_.NumVert()); - thrust::exclusive_scan(thrust::device, i30.begin(), i30.end(), vQ2R.begin(), numVertR, + exclusive_scan(policy, i30.begin(), i30.end(), vQ2R.begin(), numVertR, AbsSum()); numVertR = AbsSum()(vQ2R.back(), i30.back()); const int nQv = numVertR - nPv; VecDH v12R(v12_.size()); if (v12_.size() > 0) { - thrust::exclusive_scan(thrust::device, i12.begin(), i12.end(), v12R.begin(), numVertR, + exclusive_scan(policy, 
i12.begin(), i12.end(), v12R.begin(), numVertR, AbsSum()); numVertR = AbsSum()(v12R.back(), i12.back()); } @@ -584,7 +589,7 @@ Manifold::Impl Boolean3::Result(Manifold::OpType op) const { VecDH v21R(v21_.size()); if (v21_.size() > 0) { - thrust::exclusive_scan(thrust::device, i21.begin(), i21.end(), v21R.begin(), numVertR, + exclusive_scan(policy, i21.begin(), i21.end(), v21R.begin(), numVertR, AbsSum()); numVertR = AbsSum()(v21R.back(), i21.back()); } @@ -600,14 +605,14 @@ Manifold::Impl Boolean3::Result(Manifold::OpType op) const { outR.vertPos_.resize(numVertR); // Add vertices, duplicating for inclusion numbers not in [-1, 1]. // Retained vertices from P and Q: - thrust::for_each_n(thrust::device, zip(i03.begin(), vP2R.begin(), inP_.vertPos_.begin()), + for_each_n(policy, zip(i03.begin(), vP2R.begin(), inP_.vertPos_.begin()), inP_.NumVert(), DuplicateVerts({outR.vertPos_.ptrD()})); - thrust::for_each_n(thrust::device, zip(i30.begin(), vQ2R.begin(), inQ_.vertPos_.begin()), + for_each_n(policy, zip(i30.begin(), vQ2R.begin(), inQ_.vertPos_.begin()), inQ_.NumVert(), DuplicateVerts({outR.vertPos_.ptrD()})); // New vertices created from intersections: - thrust::for_each_n(thrust::device, zip(i12.begin(), v12R.begin(), v12_.begin()), + for_each_n(policy, zip(i12.begin(), v12R.begin(), v12_.begin()), i12.size(), DuplicateVerts({outR.vertPos_.ptrD()})); - thrust::for_each_n(thrust::device, zip(i21.begin(), v21R.begin(), v21_.begin()), + for_each_n(policy, zip(i21.begin(), v21R.begin(), v21_.begin()), i21.size(), DuplicateVerts({outR.vertPos_.ptrD()})); if (kVerbose) { diff --git a/manifold/src/constructors.cpp b/manifold/src/constructors.cpp index 743f534d0..113b3ec3d 100644 --- a/manifold/src/constructors.cpp +++ b/manifold/src/constructors.cpp @@ -17,6 +17,7 @@ #include "graph.h" #include "impl.h" #include "polygon.h" +#include "par.h" namespace { using namespace manifold; @@ -183,7 +184,7 @@ Manifold Manifold::Sphere(float radius, int circularSegments) { Manifold 
sphere; sphere.pImpl_ = std::make_unique(Impl::Shape::OCTAHEDRON); sphere.pImpl_->Subdivide(n); - thrust::for_each_n(thrust::device, sphere.pImpl_->vertPos_.begin(), sphere.NumVert(), + for_each_n(autoPolicy(sphere.NumVert()), sphere.pImpl_->vertPos_.begin(), sphere.NumVert(), ToSphere({radius})); sphere.pImpl_->Finish(); // Ignore preceding octahedron. @@ -394,6 +395,7 @@ Manifold Manifold::Compose(const std::vector& manifolds) { combined.halfedgeTangent_.resize(2 * numEdge); combined.meshRelation_.barycentric.resize(numBary); combined.meshRelation_.triBary.resize(numTri); + auto policy = autoPolicy(numTri); int nextVert = 0; int nextEdge = 0; @@ -403,20 +405,20 @@ Manifold Manifold::Compose(const std::vector& manifolds) { const Impl& impl = *(manifold.pImpl_); impl.ApplyTransform(); - thrust::copy(thrust::device, impl.vertPos_.begin(), impl.vertPos_.end(), + copy(policy, impl.vertPos_.begin(), impl.vertPos_.end(), combined.vertPos_.begin() + nextVert); - thrust::copy(thrust::device, impl.faceNormal_.begin(), impl.faceNormal_.end(), + copy(policy, impl.faceNormal_.begin(), impl.faceNormal_.end(), combined.faceNormal_.begin() + nextTri); - thrust::copy(thrust::device, impl.halfedgeTangent_.begin(), impl.halfedgeTangent_.end(), + copy(policy, impl.halfedgeTangent_.begin(), impl.halfedgeTangent_.end(), combined.halfedgeTangent_.begin() + nextEdge); - thrust::copy(thrust::device, impl.meshRelation_.barycentric.begin(), + copy(policy, impl.meshRelation_.barycentric.begin(), impl.meshRelation_.barycentric.end(), combined.meshRelation_.barycentric.begin() + nextBary); - thrust::transform(thrust::device, impl.meshRelation_.triBary.begin(), + transform(policy, impl.meshRelation_.triBary.begin(), impl.meshRelation_.triBary.end(), combined.meshRelation_.triBary.begin() + nextTri, UpdateTriBary({nextBary})); - thrust::transform(thrust::device, impl.halfedge_.begin(), impl.halfedge_.end(), + transform(policy, impl.halfedge_.begin(), impl.halfedge_.end(), 
combined.halfedge_.begin() + nextEdge, UpdateHalfedge({nextVert, nextEdge, nextTri})); @@ -477,22 +479,24 @@ std::vector Manifold::Decompose() const { for (int i = 0; i < numLabel; ++i) { meshes[i].pImpl_->vertPos_.resize(NumVert()); VecDH vertNew2Old(NumVert()); + auto policy = autoPolicy(NumVert()); + auto start = zip(meshes[i].pImpl_->vertPos_.begin(), vertNew2Old.begin()); int nVert = - thrust::copy_if( - thrust::device, zip(pImpl_->vertPos_.begin(), countAt(0)), + copy_if( + policy, zip(pImpl_->vertPos_.begin(), countAt(0)), zip(pImpl_->vertPos_.end(), countAt(NumVert())), vertLabel.begin(), zip(meshes[i].pImpl_->vertPos_.begin(), vertNew2Old.begin()), Equals({i})) - - zip(meshes[i].pImpl_->vertPos_.begin(), countAt(0)); + start; meshes[i].pImpl_->vertPos_.resize(nVert); VecDH faceNew2Old(NumTri()); - thrust::sequence(thrust::device, faceNew2Old.begin(), faceNew2Old.end()); + sequence(policy, faceNew2Old.begin(), faceNew2Old.end()); int nFace = - thrust::remove_if( - thrust::device, faceNew2Old.begin(), faceNew2Old.end(), + remove_if( + policy, faceNew2Old.begin(), faceNew2Old.end(), RemoveFace({pImpl_->halfedge_.cptrD(), vertLabel.cptrD(), i})) - faceNew2Old.begin(); faceNew2Old.resize(nFace); diff --git a/manifold/src/edge_op.cpp b/manifold/src/edge_op.cpp index eecf01568..566985318 100644 --- a/manifold/src/edge_op.cpp +++ b/manifold/src/edge_op.cpp @@ -13,6 +13,7 @@ // limitations under the License. 
#include "impl.h" +#include "par.h" namespace { using namespace manifold; @@ -119,9 +120,10 @@ namespace manifold { */ void Manifold::Impl::SimplifyTopology() { VecDH flaggedEdges(halfedge_.size()); + auto policy = autoPolicy(halfedge_.size()); int numFlagged = - thrust::copy_if( - thrust::device, countAt(0), countAt(halfedge_.size()), flaggedEdges.begin(), + copy_if( + policy, countAt(0), countAt(halfedge_.size()), flaggedEdges.begin(), ShortEdge({halfedge_.cptrD(), vertPos_.cptrD(), precision_})) - flaggedEdges.begin(); flaggedEdges.resize(numFlagged); @@ -130,8 +132,8 @@ void Manifold::Impl::SimplifyTopology() { flaggedEdges.resize(halfedge_.size()); numFlagged = - thrust::copy_if( - thrust::device, countAt(0), countAt(halfedge_.size()), flaggedEdges.begin(), + copy_if( + policy, countAt(0), countAt(halfedge_.size()), flaggedEdges.begin(), FlagEdge({halfedge_.cptrD(), meshRelation_.triBary.cptrD()})) - flaggedEdges.begin(); flaggedEdges.resize(numFlagged); @@ -139,8 +141,8 @@ void Manifold::Impl::SimplifyTopology() { for (const int edge : flaggedEdges) CollapseEdge(edge); flaggedEdges.resize(halfedge_.size()); - numFlagged = thrust::copy_if( - thrust::device, countAt(0), countAt(halfedge_.size()), flaggedEdges.begin(), + numFlagged = copy_if( + policy, countAt(0), countAt(halfedge_.size()), flaggedEdges.begin(), SwappableEdge({halfedge_.cptrD(), vertPos_.cptrD(), faceNormal_.cptrD(), precision_})) - flaggedEdges.begin(); diff --git a/manifold/src/impl.cpp b/manifold/src/impl.cpp index 3ffd4e11c..909716f92 100644 --- a/manifold/src/impl.cpp +++ b/manifold/src/impl.cpp @@ -20,6 +20,7 @@ #include "graph.h" #include "impl.h" +#include "par.h" namespace { using namespace manifold; @@ -357,7 +358,7 @@ Manifold::Impl::Impl(Shape shape) { void Manifold::Impl::ReinitializeReference(int meshID) { // instead of storing the meshID, we store 0 and set the mapping to // 0 -> meshID, because the meshID after boolean operation also starts from 0. 
- thrust::for_each_n(thrust::device, zip(meshRelation_.triBary.begin(), countAt(0)), NumTri(), + for_each_n(autoPolicy(NumTri()), zip(meshRelation_.triBary.begin(), countAt(0)), NumTri(), InitializeBaryRef({0, halfedge_.cptrD()})); meshRelation_.originalID.clear(); meshRelation_.originalID[0] = meshID; @@ -386,7 +387,7 @@ int Manifold::Impl::InitializeNewReference( "propertyTolerance."); const int numSets = properties.size() / numProps; - ALWAYS_ASSERT(thrust::all_of(thrust::device, triPropertiesD.begin(), triPropertiesD.end(), + ALWAYS_ASSERT(all_of(autoPolicy(triProperties.size()), triPropertiesD.begin(), triPropertiesD.end(), CheckProperties({numSets})), userErr, "triProperties value is outside the properties range."); @@ -394,8 +395,8 @@ int Manifold::Impl::InitializeNewReference( VecDH> face2face(halfedge_.size(), {-1, -1}); VecDH triArea(NumTri()); - thrust::for_each_n( - thrust::device, zip(face2face.begin(), countAt(0)), halfedge_.size(), + for_each_n( + autoPolicy(halfedge_.size()), zip(face2face.begin(), countAt(0)), halfedge_.size(), CoplanarEdge({triArea.ptrD(), halfedge_.cptrD(), vertPos_.cptrD(), triPropertiesD.cptrD(), propertiesD.cptrD(), propertyToleranceD.cptrD(), numProps, precision_})); @@ -477,10 +478,11 @@ void Manifold::Impl::CreateHalfedges(const VecDH& triVerts) { const int numTri = triVerts.size(); halfedge_.resize(3 * numTri); VecDH edge(3 * numTri); - thrust::for_each_n(thrust::device, zip(countAt(0), triVerts.begin()), numTri, + auto policy = autoPolicy(numTri); + for_each_n(policy, zip(countAt(0), triVerts.begin()), numTri, Tri2Halfedges({halfedge_.ptrD(), edge.ptrD()})); - thrust::sort(thrust::device, edge.begin(), edge.end()); - thrust::for_each_n(thrust::device, countAt(0), halfedge_.size() / 2, + sort(policy, edge.begin(), edge.end()); + for_each_n(policy, countAt(0), halfedge_.size() / 2, LinkHalfedges({halfedge_.ptrD(), edge.cptrD()})); } @@ -495,7 +497,8 @@ void Manifold::Impl::CreateAndFixHalfedges(const VecDH& triVerts) { 
halfedge_.resize(0); halfedge_.resize(3 * numTri); VecDH edge(3 * numTri); - thrust::for_each_n(thrust::device, zip(countAt(0), triVerts.begin()), numTri, + auto policy = autoPolicy(numTri); + for_each_n(policy, zip(countAt(0), triVerts.begin()), numTri, Tri2Halfedges({halfedge_.ptrD(), edge.ptrD()})); // Stable sort is required here so that halfedges from the same face are // paired together (the triangles were created in face order). In some @@ -503,7 +506,7 @@ void Manifold::Impl::CreateAndFixHalfedges(const VecDH& triVerts) { // two different faces, causing this edge to not be 2-manifold. We detect this // and fix it by swapping one of the identical edges, so it is important that // we have the edges paired according to their face. - thrust::stable_sort(thrust::device, edge.begin(), edge.end()); + stable_sort(policy, edge.begin(), edge.end()); thrust::for_each_n(thrust::host, countAt(0), halfedge_.size() / 2, LinkHalfedges({halfedge_.ptrH(), edge.cptrH()})); thrust::for_each(thrust::host, countAt(1), countAt(halfedge_.size() / 2), @@ -535,14 +538,15 @@ void Manifold::Impl::ApplyTransform() const { */ void Manifold::Impl::ApplyTransform() { if (transform_ == glm::mat4x3(1.0f)) return; - thrust::for_each(thrust::device, vertPos_.begin(), vertPos_.end(), + auto policy = autoPolicy(vertPos_.size()); + for_each(policy, vertPos_.begin(), vertPos_.end(), Transform4x3({transform_})); glm::mat3 normalTransform = glm::inverse(glm::transpose(glm::mat3(transform_))); - thrust::for_each(thrust::device, faceNormal_.begin(), faceNormal_.end(), + for_each(policy, faceNormal_.begin(), faceNormal_.end(), TransformNormals({normalTransform})); - thrust::for_each(thrust::device, vertNormal_.begin(), vertNormal_.end(), + for_each(policy, vertNormal_.begin(), vertNormal_.end(), TransformNormals({normalTransform})); // This optimization does a cheap collider update if the transform is // axis-aligned. 
@@ -585,17 +589,18 @@ void Manifold::Impl::SetPrecision(float minPrecision) { */ void Manifold::Impl::CalculateNormals() { vertNormal_.resize(NumVert()); - thrust::fill(thrust::device, vertNormal_.begin(), vertNormal_.end(), glm::vec3(0)); + auto policy = autoPolicy(NumTri()); + fill(policy, vertNormal_.begin(), vertNormal_.end(), glm::vec3(0)); bool calculateTriNormal = false; if (faceNormal_.size() != NumTri()) { faceNormal_.resize(NumTri()); calculateTriNormal = true; } - thrust::for_each_n( - thrust::device, zip(faceNormal_.begin(), countAt(0)), NumTri(), + for_each_n( + policy, zip(faceNormal_.begin(), countAt(0)), NumTri(), AssignNormals({vertNormal_.ptrD(), vertPos_.cptrD(), halfedge_.cptrD(), precision_, calculateTriNormal})); - thrust::for_each(thrust::device, vertNormal_.begin(), vertNormal_.end(), Normalize()); + for_each(policy, vertNormal_.begin(), vertNormal_.end(), Normalize()); } /** @@ -617,28 +622,27 @@ void Manifold::Impl::UpdateMeshIDs(VecDH &meshIDs, VecDH &originalIDs, int startTri, int n, int startID) { if (n == -1) n = meshRelation_.triBary.size(); - thrust::sort_by_key(thrust::host, meshIDs.begin(), meshIDs.end(), - originalIDs.begin()); + sort_by_key(autoPolicy(n), meshIDs.begin(), meshIDs.end(), originalIDs.begin()); constexpr int kOccurred = 1 << 30; VecDH error(1, -1); const int numMesh = meshIDs.size(); const int *meshIDsPtr = meshIDs.cptrD(); int *originalPtr = originalIDs.ptrD(); int *errorPtr = error.ptrD(); - thrust::for_each(thrust::device, - meshRelation_.triBary.begin() + startTri, - meshRelation_.triBary.begin() + startTri + n, - [=] __host__ __device__(BaryRef & b) { - int index = - thrust::lower_bound(meshIDsPtr, meshIDsPtr + numMesh, - b.meshID) - - meshIDsPtr; - if (index >= numMesh || meshIDsPtr[index] != b.meshID) { - *errorPtr = b.meshID; - } - b.meshID = index + startID; - originalPtr[index] |= kOccurred; - }); + for_each(autoPolicy(n), + meshRelation_.triBary.begin() + startTri, + meshRelation_.triBary.begin() + 
startTri + n, + [=] __host__ __device__(BaryRef & b) { + int index = + thrust::lower_bound(meshIDsPtr, meshIDsPtr + numMesh, + b.meshID) - + meshIDsPtr; + if (index >= numMesh || meshIDsPtr[index] != b.meshID) { + *errorPtr = b.meshID; + } + b.meshID = index + startID; + originalPtr[index] |= kOccurred; + }); if (error[0] != -1) { std::stringstream ss; @@ -663,12 +667,13 @@ SparseIndices Manifold::Impl::EdgeCollisions(const Impl& Q) const { VecDH edges = CreateTmpEdges(Q.halfedge_); const int numEdge = edges.size(); VecDH QedgeBB(numEdge); - thrust::for_each_n(thrust::device, zip(QedgeBB.begin(), edges.cbegin()), numEdge, + auto policy = autoPolicy(numEdge); + for_each_n(policy, zip(QedgeBB.begin(), edges.cbegin()), numEdge, EdgeBox({Q.vertPos_.cptrD()})); SparseIndices q1p2 = collider_.Collisions(QedgeBB); - thrust::for_each(thrust::device, q1p2.begin(0), q1p2.end(0), ReindexEdge({edges.cptrD()})); + for_each(policy, q1p2.begin(0), q1p2.end(0), ReindexEdge({edges.cptrD()})); return q1p2; } diff --git a/manifold/src/manifold.cpp b/manifold/src/manifold.cpp index 04c02eeed..8e0a2edee 100644 --- a/manifold/src/manifold.cpp +++ b/manifold/src/manifold.cpp @@ -14,6 +14,7 @@ #include "boolean3.h" #include "impl.h" +#include "par.h" namespace { using namespace manifold; @@ -121,6 +122,7 @@ Mesh Manifold::GetMesh() const { pImpl_->halfedgeTangent_.end()); result.triVerts.resize(NumTri()); + // note that `triVerts` is `std::vector`, so we cannot use thrust::device thrust::for_each_n(thrust::host, zip(result.triVerts.begin(), countAt(0)), NumTri(), MakeTri({pImpl_->halfedge_.cptrH()})); diff --git a/manifold/src/properties.cpp b/manifold/src/properties.cpp index 3862fec2e..29b30e07a 100644 --- a/manifold/src/properties.cpp +++ b/manifold/src/properties.cpp @@ -18,6 +18,7 @@ #include #include "impl.h" +#include "par.h" namespace { using namespace manifold; @@ -214,12 +215,13 @@ namespace manifold { */ bool Manifold::Impl::IsManifold() const { if (halfedge_.size() == 0) 
return true; - bool isManifold = thrust::all_of(thrust::device, countAt(0), countAt(halfedge_.size()), + auto policy = autoPolicy(halfedge_.size()); + bool isManifold = all_of(policy, countAt(0), countAt(halfedge_.size()), CheckManifold({halfedge_.cptrD()})); VecDH halfedge(halfedge_); - thrust::sort(thrust::device, halfedge.begin(), halfedge.end()); - isManifold &= thrust::all_of(thrust::device, countAt(0), countAt(2 * NumEdge() - 1), + sort(policy, halfedge.begin(), halfedge.end()); + isManifold &= all_of(policy, countAt(0), countAt(2 * NumEdge() - 1), NoDuplicates({halfedge.cptrD()})); return isManifold; } @@ -229,7 +231,7 @@ bool Manifold::Impl::IsManifold() const { */ bool Manifold::Impl::MatchesTriNormals() const { if (halfedge_.size() == 0 || faceNormal_.size() != NumTri()) return true; - return thrust::all_of(thrust::device, countAt(0), countAt(NumTri()), + return all_of(autoPolicy(NumTri()), countAt(0), countAt(NumTri()), CheckCCW({halfedge_.cptrD(), vertPos_.cptrD(), faceNormal_.cptrD(), 2 * precision_})); } @@ -239,7 +241,7 @@ bool Manifold::Impl::MatchesTriNormals() const { */ int Manifold::Impl::NumDegenerateTris() const { if (halfedge_.size() == 0 || faceNormal_.size() != NumTri()) return true; - return thrust::count_if(thrust::device, countAt(0), countAt(NumTri()), + return count_if(autoPolicy(NumTri()), countAt(0), countAt(NumTri()), CheckCCW({halfedge_.cptrD(), vertPos_.cptrD(), faceNormal_.cptrD(), -1 * precision_ / 2})); } @@ -247,8 +249,8 @@ int Manifold::Impl::NumDegenerateTris() const { Properties Manifold::Impl::GetProperties() const { if (IsEmpty()) return {0, 0}; ApplyTransform(); - thrust::pair areaVolume = thrust::transform_reduce( - thrust::device, countAt(0), countAt(NumTri()), + auto areaVolume = transform_reduce>( + autoPolicy(NumTri()), countAt(0), countAt(NumTri()), FaceAreaVolume({halfedge_.cptrD(), vertPos_.cptrD(), precision_}), thrust::make_pair(0.0f, 0.0f), SumPair()); return {areaVolume.first, areaVolume.second}; @@ -262,26 
+264,27 @@ Curvature Manifold::Impl::GetCurvature() const { VecDH vertGaussianCurvature(NumVert(), glm::two_pi()); VecDH vertArea(NumVert(), 0); VecDH degree(NumVert(), 0); - thrust::for_each( - thrust::device, countAt(0), countAt(NumTri()), + auto policy = autoPolicy(NumTri()); + for_each( + policy, countAt(0), countAt(NumTri()), CurvatureAngles({vertMeanCurvature.ptrD(), vertGaussianCurvature.ptrD(), vertArea.ptrD(), degree.ptrD(), halfedge_.cptrD(), vertPos_.cptrD(), faceNormal_.cptrD()})); - thrust::for_each_n( - thrust::device, zip(vertMeanCurvature.begin(), vertGaussianCurvature.begin(), + for_each_n( + policy, zip(vertMeanCurvature.begin(), vertGaussianCurvature.begin(), vertArea.begin(), degree.begin()), NumVert(), NormalizeCurvature()); result.minMeanCurvature = - thrust::reduce(thrust::device, vertMeanCurvature.begin(), vertMeanCurvature.end(), + reduce(policy, vertMeanCurvature.begin(), vertMeanCurvature.end(), std::numeric_limits::infinity(), thrust::minimum()); result.maxMeanCurvature = - thrust::reduce(thrust::device, vertMeanCurvature.begin(), vertMeanCurvature.end(), + reduce(policy, vertMeanCurvature.begin(), vertMeanCurvature.end(), -std::numeric_limits::infinity(), thrust::maximum()); - result.minGaussianCurvature = thrust::reduce( - thrust::device, vertGaussianCurvature.begin(), vertGaussianCurvature.end(), std::numeric_limits::infinity(), + result.minGaussianCurvature = reduce( + policy, vertGaussianCurvature.begin(), vertGaussianCurvature.end(), std::numeric_limits::infinity(), thrust::minimum()); - result.maxGaussianCurvature = thrust::reduce( - thrust::device, vertGaussianCurvature.begin(), vertGaussianCurvature.end(), + result.maxGaussianCurvature = reduce( + policy, vertGaussianCurvature.begin(), vertGaussianCurvature.end(), -std::numeric_limits::infinity(), thrust::maximum()); result.vertMeanCurvature.insert(result.vertMeanCurvature.end(), vertMeanCurvature.begin(), @@ -298,9 +301,10 @@ Curvature Manifold::Impl::GetCurvature() const { * 
range for Morton code calculation. */ void Manifold::Impl::CalculateBBox() { - bBox_.min = thrust::reduce(thrust::device, vertPos_.begin(), vertPos_.end(), + auto policy = autoPolicy(NumVert()); + bBox_.min = reduce(policy, vertPos_.begin(), vertPos_.end(), glm::vec3(std::numeric_limits::infinity()), PosMin()); - bBox_.max = thrust::reduce(thrust::device, vertPos_.begin(), vertPos_.end(), + bBox_.max = reduce(policy, vertPos_.begin(), vertPos_.end(), glm::vec3(-std::numeric_limits::infinity()), PosMax()); } } // namespace manifold diff --git a/manifold/src/shared.h b/manifold/src/shared.h index f78b7ada8..9b21467af 100644 --- a/manifold/src/shared.h +++ b/manifold/src/shared.h @@ -18,6 +18,7 @@ #include #include "utils.h" #include "vec_dh.h" +#include "par.h" namespace manifold { @@ -149,9 +150,9 @@ struct TmpInvalid { VecDH inline CreateTmpEdges(const VecDH& halfedge) { VecDH edges(halfedge.size()); - thrust::for_each_n(thrust::device, zip(edges.begin(), halfedge.begin(), countAt(0)), + for_each_n(autoPolicy(edges.size()), zip(edges.begin(), halfedge.begin(), countAt(0)), edges.size(), Halfedge2Tmp()); - int numEdge = thrust::remove_if(thrust::device, edges.begin(), edges.end(), TmpInvalid()) - + int numEdge = remove_if(autoPolicy(edges.size()), edges.begin(), edges.end(), TmpInvalid()) - edges.begin(); ALWAYS_ASSERT(numEdge == halfedge.size() / 2, topologyErr, "Not oriented!"); edges.resize(numEdge); diff --git a/manifold/src/smoothing.cpp b/manifold/src/smoothing.cpp index d33d63be0..48bebfc67 100644 --- a/manifold/src/smoothing.cpp +++ b/manifold/src/smoothing.cpp @@ -15,6 +15,7 @@ #include #include "impl.h" +#include "par.h" namespace { using namespace manifold; @@ -356,7 +357,7 @@ void Manifold::Impl::CreateTangents( const int numHalfedge = halfedge_.size(); halfedgeTangent_.resize(numHalfedge); - thrust::for_each_n(thrust::device, zip(halfedgeTangent_.begin(), halfedge_.cbegin()), + for_each_n(autoPolicy(numHalfedge), zip(halfedgeTangent_.begin(), 
halfedge_.cbegin()), numHalfedge, SmoothBezier({vertPos_.cptrD(), faceNormal_.cptrD(), vertNormal_.cptrD(), halfedge_.cptrD()})); @@ -476,12 +477,13 @@ Manifold::Impl::MeshRelationD Manifold::Impl::Subdivide(int n) { VecDH edges = CreateTmpEdges(halfedge_); VecDH half2Edge(2 * numEdge); - thrust::for_each_n(thrust::device, zip(countAt(0), edges.begin()), numEdge, + auto policy = autoPolicy(numEdge); + for_each_n(policy, zip(countAt(0), edges.begin()), numEdge, ReindexHalfedge({half2Edge.ptrD()})); - thrust::for_each_n(thrust::device, zip(countAt(0), edges.begin()), numEdge, + for_each_n(policy, zip(countAt(0), edges.begin()), numEdge, EdgeVerts({vertPos_.ptrD(), numVert, n})); - thrust::for_each_n( - thrust::device, zip(countAt(0), oldMeshRelation.triBary.begin()), numTri, + for_each_n( + policy, zip(countAt(0), oldMeshRelation.triBary.begin()), numTri, InteriorVerts({vertPos_.ptrD(), relation.barycentric.ptrD(), relation.triBary.ptrD(), meshRelation_.barycentric.ptrD(), meshRelation_.triBary.ptrD(), @@ -489,7 +491,7 @@ Manifold::Impl::MeshRelationD Manifold::Impl::Subdivide(int n) { halfedge_.ptrD()})); // Create subtriangles VecDH triVerts(n * n * numTri); - thrust::for_each_n(thrust::device, countAt(0), numTri, + for_each_n(policy, countAt(0), numTri, SplitTris({triVerts.ptrD(), halfedge_.cptrD(), half2Edge.cptrD(), numVert, triVertStart, n})); CreateHalfedges(triVerts); @@ -503,13 +505,14 @@ void Manifold::Impl::Refine(int n) { if (old.halfedgeTangent_.size() == old.halfedge_.size()) { VecDH vertBary(NumVert()); VecDH lock(NumVert(), 0); - thrust::for_each_n( - thrust::device, zip(relation.triBary.begin(), countAt(0)), NumTri(), + auto policy = autoPolicy(NumTri()); + for_each_n( + policy, zip(relation.triBary.begin(), countAt(0)), NumTri(), TriBary2Vert({vertBary.ptrD(), lock.ptrD(), relation.barycentric.cptrD(), halfedge_.cptrD()})); - thrust::for_each_n( - thrust::device, zip(vertPos_.begin(), vertBary.begin()), NumVert(), + for_each_n( + policy, 
zip(vertPos_.begin(), vertBary.begin()), NumVert(), InterpTri({old.halfedge_.cptrD(), old.halfedgeTangent_.cptrD(), old.vertPos_.cptrD()})); } diff --git a/manifold/src/sort.cpp b/manifold/src/sort.cpp index 69541c60d..aba8f9e8b 100644 --- a/manifold/src/sort.cpp +++ b/manifold/src/sort.cpp @@ -15,6 +15,7 @@ #include #include "impl.h" +#include "par.h" namespace { using namespace manifold; @@ -120,9 +121,9 @@ struct Reindex { template void Permute(VecDH& inOut, const VecDH& new2Old) { - VecDH tmp(inOut); + VecDH tmp(std::move(inOut)); inOut.resize(new2Old.size()); - thrust::gather(new2Old.begin(), new2Old.end(), tmp.begin(), + gather(autoPolicy(new2Old.size()), new2Old.begin(), new2Old.end(), tmp.begin(), inOut.begin()); } @@ -187,7 +188,7 @@ void Manifold::Impl::Finish() { "Not an even number of faces after sorting faces!"); Halfedge extrema = {0, 0, 0, 0}; extrema = - thrust::reduce(thrust::device, halfedge_.begin(), halfedge_.end(), extrema, Extrema()); + reduce(autoPolicy(halfedge_.size()), halfedge_.begin(), halfedge_.end(), extrema, Extrema()); ALWAYS_ASSERT(extrema.startVert >= 0, topologyErr, "Vertex index is negative!"); @@ -210,12 +211,13 @@ void Manifold::Impl::Finish() { */ void Manifold::Impl::SortVerts() { VecDH vertMorton(NumVert()); - thrust::for_each_n(thrust::device, zip(vertMorton.begin(), vertPos_.cbegin()), NumVert(), + auto policy = autoPolicy(NumVert()); + for_each_n(policy, zip(vertMorton.begin(), vertPos_.cbegin()), NumVert(), Morton({bBox_})); VecDH vertNew2Old(NumVert()); - thrust::sequence(thrust::device, vertNew2Old.begin(), vertNew2Old.end()); - thrust::sort_by_key(thrust::device, vertMorton.begin(), vertMorton.end(), + sequence(policy, vertNew2Old.begin(), vertNew2Old.end()); + sort_by_key(policy, vertMorton.begin(), vertMorton.end(), zip(vertPos_.begin(), vertNew2Old.begin())); ReindexVerts(vertNew2Old, NumVert()); @@ -223,7 +225,7 @@ void Manifold::Impl::SortVerts() { // Verts were flagged for removal with NaNs and assigned kNoCode 
to sort them // to the end, which allows them to be removed. const int newNumVert = - thrust::find(thrust::device, vertMorton.begin(), vertMorton.end(), kNoCode) - + find(policy, vertMorton.begin(), vertMorton.end(), kNoCode) - vertMorton.begin(); vertPos_.resize(newNumVert); } @@ -236,9 +238,9 @@ void Manifold::Impl::SortVerts() { void Manifold::Impl::ReindexVerts(const VecDH& vertNew2Old, int oldNumVert) { VecDH vertOld2New(oldNumVert); - thrust::scatter(thrust::device, countAt(0), countAt(NumVert()), vertNew2Old.begin(), + scatter(autoPolicy(oldNumVert), countAt(0), countAt(NumVert()), vertNew2Old.begin(), vertOld2New.begin()); - thrust::for_each(thrust::device, halfedge_.begin(), halfedge_.end(), + for_each(autoPolicy(oldNumVert), halfedge_.begin(), halfedge_.end(), Reindex({vertOld2New.cptrD()})); } @@ -251,8 +253,8 @@ void Manifold::Impl::GetFaceBoxMorton(VecDH& faceBox, VecDH& faceMorton) const { faceBox.resize(NumTri()); faceMorton.resize(NumTri()); - thrust::for_each_n( - thrust::device, zip(faceMorton.begin(), faceBox.begin(), countAt(0)), NumTri(), + for_each_n( + autoPolicy(NumTri()), zip(faceMorton.begin(), faceBox.begin(), countAt(0)), NumTri(), FaceMortonBox({halfedge_.cptrD(), vertPos_.cptrD(), bBox_})); } @@ -263,15 +265,16 @@ void Manifold::Impl::GetFaceBoxMorton(VecDH& faceBox, void Manifold::Impl::SortFaces(VecDH& faceBox, VecDH& faceMorton) { VecDH faceNew2Old(NumTri()); - thrust::sequence(thrust::device, faceNew2Old.begin(), faceNew2Old.end()); + auto policy = autoPolicy(faceNew2Old.size()); + sequence(policy, faceNew2Old.begin(), faceNew2Old.end()); - thrust::sort_by_key(thrust::device, faceMorton.begin(), faceMorton.end(), + sort_by_key(policy, faceMorton.begin(), faceMorton.end(), zip(faceBox.begin(), faceNew2Old.begin())); // Tris were flagged for removal with pairedHalfedge = -1 and assigned kNoCode // to sort them to the end, which allows them to be removed. 
const int newNumTri = - thrust::find(thrust::device, faceMorton.begin(), faceMorton.end(), kNoCode) - + find(policy, faceMorton.begin(), faceMorton.end(), kNoCode) - faceMorton.begin(); faceBox.resize(newNumTri); faceMorton.resize(newNumTri); @@ -295,13 +298,14 @@ void Manifold::Impl::GatherFaces(const VecDH& faceNew2Old) { VecDH oldHalfedge(std::move(halfedge_)); VecDH oldHalfedgeTangent(std::move(halfedgeTangent_)); VecDH faceOld2New(oldHalfedge.size() / 3); - thrust::scatter(thrust::device, countAt(0), countAt(numTri), faceNew2Old.begin(), + auto policy = autoPolicy(numTri); + scatter(policy, countAt(0), countAt(numTri), faceNew2Old.begin(), faceOld2New.begin()); halfedge_.resize(3 * numTri); if (oldHalfedgeTangent.size() != 0) halfedgeTangent_.resize(3 * numTri); - thrust::for_each_n( - thrust::device, countAt(0), numTri, + for_each_n( + policy, countAt(0), numTri, ReindexFace({halfedge_.ptrD(), halfedgeTangent_.ptrD(), oldHalfedge.cptrD(), oldHalfedgeTangent.cptrD(), faceNew2Old.cptrD(), faceOld2New.cptrD()})); @@ -311,25 +315,26 @@ void Manifold::Impl::GatherFaces(const Impl& old, const VecDH& faceNew2Old) { const int numTri = faceNew2Old.size(); meshRelation_.triBary.resize(numTri); - thrust::gather(thrust::device, faceNew2Old.begin(), faceNew2Old.end(), + auto policy = autoPolicy(numTri); + gather(policy, faceNew2Old.begin(), faceNew2Old.end(), old.meshRelation_.triBary.begin(), meshRelation_.triBary.begin()); meshRelation_.barycentric = old.meshRelation_.barycentric; if (old.faceNormal_.size() == old.NumTri()) { faceNormal_.resize(numTri); - thrust::gather(thrust::device, faceNew2Old.begin(), faceNew2Old.end(), + gather(policy, faceNew2Old.begin(), faceNew2Old.end(), old.faceNormal_.begin(), faceNormal_.begin()); } VecDH faceOld2New(old.NumTri()); - thrust::scatter(thrust::device, countAt(0), countAt(numTri), faceNew2Old.begin(), + scatter(policy, countAt(0), countAt(numTri), faceNew2Old.begin(), faceOld2New.begin()); halfedge_.resize(3 * numTri); if 
(old.halfedgeTangent_.size() != 0) halfedgeTangent_.resize(3 * numTri); - thrust::for_each_n( - thrust::device, countAt(0), numTri, + for_each_n( + policy, countAt(0), numTri, ReindexFace({halfedge_.ptrD(), halfedgeTangent_.ptrD(), old.halfedge_.cptrD(), old.halfedgeTangent_.cptrD(), faceNew2Old.cptrD(), faceOld2New.cptrD()})); diff --git a/test/mesh_test.cpp b/test/mesh_test.cpp index 82134888c..e4efdb9c5 100644 --- a/test/mesh_test.cpp +++ b/test/mesh_test.cpp @@ -759,7 +759,7 @@ TEST(Boolean, Gyroid) { EXPECT_TRUE(result.IsManifold()); EXPECT_TRUE(result.MatchesTriNormals()); - EXPECT_LE(result.NumDegenerateTris(), 42); + EXPECT_LE(result.NumDegenerateTris(), 43); EXPECT_EQ(result.Decompose().size(), 1); auto prop = result.GetProperties(); EXPECT_NEAR(prop.volume, 7692, 1); diff --git a/utilities/CMakeLists.txt b/utilities/CMakeLists.txt index 458e65dc8..cd27325b6 100644 --- a/utilities/CMakeLists.txt +++ b/utilities/CMakeLists.txt @@ -1,9 +1,20 @@ project (utilities) -add_library(${PROJECT_NAME} INTERFACE) +add_library(${PROJECT_NAME} src/detect_cuda.cpp) -target_include_directories(${PROJECT_NAME} INTERFACE ${THRUST_INC_DIR} ${GLM_INC_DIR}) -target_include_directories(${PROJECT_NAME} - INTERFACE - ${PROJECT_SOURCE_DIR}/include -) +target_include_directories(${PROJECT_NAME} PUBLIC ${THRUST_INC_DIR} ${GLM_INC_DIR}) +if(MANIFOLD_USE_CUDA) + set_source_files_properties(src/detect_cuda.cpp PROPERTIES LANGUAGE CUDA) + set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 61) + target_compile_options(${PROJECT_NAME} + PRIVATE ${MANIFOLD_NVCC_FLAGS} + ) + target_compile_options(${PROJECT_NAME} + PRIVATE "$<$:${MANIFOLD_NVCC_RELEASE_FLAGS}>" "$<$:${MANIFOLD_NVCC_DEBUG_FLAGS}>" + ) +else() + target_compile_options(${PROJECT_NAME} + PRIVATE ${MANIFOLD_FLAGS} + ) +endif() +target_include_directories(${PROJECT_NAME} INTERFACE ${PROJECT_SOURCE_DIR}/include) diff --git a/utilities/include/par.h b/utilities/include/par.h new file mode 100644 index 
000000000..a101f54a0 --- /dev/null +++ b/utilities/include/par.h @@ -0,0 +1,154 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if MANIFOLD_PAR == 'O' +#include +#define MANIFOLD_PAR_NS omp +#elif MANIFOLD_PAR == 'T' +#include +#define MANIFOLD_PAR_NS tbb +#else +#define MANIFOLD_PAR_NS cpp +#endif + +#ifdef MANIFOLD_USE_CUDA +#include +#endif + +namespace manifold { + +void check_cuda_available(); +#ifdef MANIFOLD_USE_CUDA +extern int CUDA_ENABLED; +#else +constexpr int CUDA_ENABLED = 0; +#endif + +enum class ExecutionPolicy { + ParUnseq, + Par, + Seq, +}; + +constexpr ExecutionPolicy ParUnseq = ExecutionPolicy::ParUnseq; +constexpr ExecutionPolicy Par = ExecutionPolicy::Par; +constexpr ExecutionPolicy Seq = ExecutionPolicy::Seq; + +// ExecutionPolicy: +// - Sequential for small workload, +// - Parallel (CPU) for medium workload, +// - GPU for large workload if available. +inline ExecutionPolicy autoPolicy(int size) { + // some random numbers + if (size <= (1 << 12)) { + return Seq; + } + if (size <= (1 << 16) || CUDA_ENABLED != 1) { + return Par; + } + return ParUnseq; +} + +#ifdef MANIFOLD_USE_CUDA +#define THRUST_DYNAMIC_BACKEND_VOID(NAME) \ + template \ + void NAME(ExecutionPolicy policy, Args... args) { \ + switch (policy) { \ + case ExecutionPolicy::ParUnseq: \ + thrust::NAME(thrust::cuda::par, args...); \ + break; \ + case ExecutionPolicy::Par: \ + thrust::NAME(thrust::MANIFOLD_PAR_NS::par, args...); \ + break; \ + case ExecutionPolicy::Seq: \ + thrust::NAME(thrust::cpp::par, args...); \ + break; \ + } \ + } +#define THRUST_DYNAMIC_BACKEND(NAME, RET) \ + template \ + Ret NAME(ExecutionPolicy policy, Args... 
args) { \ + switch (policy) { \ + case ExecutionPolicy::ParUnseq: \ + return thrust::NAME(thrust::cuda::par, args...); \ + case ExecutionPolicy::Par: \ + return thrust::NAME(thrust::MANIFOLD_PAR_NS::par, args...); \ + case ExecutionPolicy::Seq: \ + break; \ + } \ + return thrust::NAME(thrust::cpp::par, args...); \ + } +#else +#define THRUST_DYNAMIC_BACKEND_VOID(NAME) \ + template \ + void NAME(ExecutionPolicy policy, Args... args) { \ + switch (policy) { \ + case ExecutionPolicy::ParUnseq: \ + case ExecutionPolicy::Par: \ + thrust::NAME(thrust::MANIFOLD_PAR_NS::par, args...); \ + break; \ + case ExecutionPolicy::Seq: \ + thrust::NAME(thrust::cpp::par, args...); \ + break; \ + } \ + } + +#define THRUST_DYNAMIC_BACKEND(NAME, RET) \ + template \ + Ret NAME(ExecutionPolicy policy, Args... args) { \ + switch (policy) { \ + case ExecutionPolicy::ParUnseq: \ + case ExecutionPolicy::Par: \ + return thrust::NAME(thrust::MANIFOLD_PAR_NS::par, args...); \ + case ExecutionPolicy::Seq: \ + break; \ + } \ + return thrust::NAME(thrust::cpp::par, args...); \ + } +#endif + +THRUST_DYNAMIC_BACKEND_VOID(gather) +THRUST_DYNAMIC_BACKEND_VOID(scatter) +THRUST_DYNAMIC_BACKEND_VOID(for_each) +THRUST_DYNAMIC_BACKEND_VOID(for_each_n) +THRUST_DYNAMIC_BACKEND_VOID(sort) +THRUST_DYNAMIC_BACKEND_VOID(stable_sort) +THRUST_DYNAMIC_BACKEND_VOID(fill) +THRUST_DYNAMIC_BACKEND_VOID(sequence) +THRUST_DYNAMIC_BACKEND_VOID(sort_by_key) +THRUST_DYNAMIC_BACKEND_VOID(copy) +THRUST_DYNAMIC_BACKEND_VOID(transform) +THRUST_DYNAMIC_BACKEND_VOID(inclusive_scan) +THRUST_DYNAMIC_BACKEND_VOID(exclusive_scan) +THRUST_DYNAMIC_BACKEND_VOID(uninitialized_fill) +THRUST_DYNAMIC_BACKEND_VOID(uninitialized_copy) + +THRUST_DYNAMIC_BACKEND(all_of, bool) +THRUST_DYNAMIC_BACKEND(is_sorted, bool) +THRUST_DYNAMIC_BACKEND(reduce, void) +THRUST_DYNAMIC_BACKEND(count_if, int) +THRUST_DYNAMIC_BACKEND(binary_search, bool) +// void implies that the user have to specify the return type in the template +// argument, as we are unable to 
deduce it +THRUST_DYNAMIC_BACKEND(remove, void) +THRUST_DYNAMIC_BACKEND(copy_if, void) +THRUST_DYNAMIC_BACKEND(remove_if, void) +THRUST_DYNAMIC_BACKEND(unique, void) +THRUST_DYNAMIC_BACKEND(find, void) +THRUST_DYNAMIC_BACKEND(reduce_by_key, void) +THRUST_DYNAMIC_BACKEND(transform_reduce, void) +THRUST_DYNAMIC_BACKEND(lower_bound, void) +THRUST_DYNAMIC_BACKEND(gather_if, void) + +} // namespace manifold diff --git a/utilities/include/sparse.h b/utilities/include/sparse.h index 9a7132e8b..ce1b21509 100644 --- a/utilities/include/sparse.h +++ b/utilities/include/sparse.h @@ -23,6 +23,7 @@ #include "structs.h" #include "utils.h" #include "vec_dh.h" +#include "par.h" namespace manifold { @@ -65,7 +66,7 @@ class SparseIndices { int size() const { return p.size(); } void SwapPQ() { p.swap(q); } - void Sort() { thrust::sort(beginPQ(), endPQ()); } + void Sort() { sort(autoPolicy(size()), beginPQ(), endPQ()); } void Resize(int size) { p.resize(size, -1); @@ -74,7 +75,7 @@ class SparseIndices { void Unique() { Sort(); - int newSize = thrust::unique(beginPQ(), endPQ()) - beginPQ(); + int newSize = unique(autoPolicy(size()), beginPQ(), endPQ()) - beginPQ(); Resize(newSize); } @@ -89,7 +90,7 @@ class SparseIndices { "Different number of values than indicies!"); auto zBegin = zip(S.begin(), begin(false), begin(true)); auto zEnd = zip(S.end(), end(false), end(true)); - size_t size = thrust::remove_if(zBegin, zEnd, firstZero()) - zBegin; + size_t size = remove_if(autoPolicy(S.size()), zBegin, zEnd, firstZero()) - zBegin; S.resize(size, -1); p.resize(size, -1); q.resize(size, -1); @@ -122,9 +123,7 @@ class SparseIndices { "Different number of values than indicies!"); auto zBegin = zip(v.begin(), x.begin(), begin(false), begin(true)); auto zEnd = zip(v.end(), x.end(), end(false), end(true)); - size_t size = - thrust::remove_if(thrust::device, zBegin, zEnd, firstNonFinite()) - - zBegin; + size_t size = remove_if(autoPolicy(v.size()), zBegin, zEnd, firstNonFinite()) - zBegin; 
v.resize(size); x.resize(size, -1); p.resize(size, -1); @@ -141,13 +140,12 @@ class SparseIndices { VecDH result(size); VecDH found(size); VecDH temp(size); - thrust::fill(thrust::device, result.begin(), result.end(), missingVal); - thrust::binary_search(thrust::device, beginPQ(), endPQ(), pqBegin, pqEnd, - found.begin()); - thrust::lower_bound(thrust::device, beginPQ(), endPQ(), pqBegin, pqEnd, - temp.begin()); - thrust::gather_if(thrust::device, temp.begin(), temp.end(), found.begin(), - val.begin(), result.begin()); + auto policy = autoPolicy(size); + fill(policy, result.begin(), result.end(), missingVal); + binary_search(policy, beginPQ(), endPQ(), pqBegin, pqEnd, found.begin()); + lower_bound(policy, beginPQ(), endPQ(), pqBegin, pqEnd, temp.begin()); + gather_if(policy, temp.begin(), temp.end(), found.begin(), val.begin(), + result.begin()); return result; } diff --git a/utilities/include/utils.h b/utilities/include/utils.h index c00a15115..d2a295d63 100644 --- a/utilities/include/utils.h +++ b/utilities/include/utils.h @@ -23,6 +23,8 @@ #include +#include "par.h" + namespace manifold { /** @defgroup Private @@ -40,8 +42,12 @@ inline void MemUsage() { inline void CheckDevice() { #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) throw std::runtime_error(cudaGetErrorString(error)); + if (CUDA_ENABLED == -1) + check_cuda_available(); + if (CUDA_ENABLED) { + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) throw std::runtime_error(cudaGetErrorString(error)); + } #endif } diff --git a/utilities/include/vec_dh.h b/utilities/include/vec_dh.h index f119fd57f..a13431a79 100644 --- a/utilities/include/vec_dh.h +++ b/utilities/include/vec_dh.h @@ -13,11 +13,286 @@ // limitations under the License. 
#pragma once -#include -#include +#include +#ifdef MANIFOLD_USE_CUDA +#include +#endif +#include + +#include "par.h" +#include "structs.h" namespace manifold { +// Vector implementation optimized for managed memory, will perform memory +// prefetching to minimize page faults and use parallel/GPU copy/fill depending +// on data size. This will also handle builds without CUDA or builds with CUDA +// but runned without CUDA GPU properly. +// +// Note that the constructor and resize function will not perform initialization +// if the parameter val is not set. Also, this implementation is a toy +// implementation that did not consider things like non-trivial +// constructor/destructor, please keep T trivial. +template class ManagedVec { +public: + typedef T *Iter; + typedef const T *IterC; + + ManagedVec() { + size_ = 0; + capacity_ = 0; + onHost = true; + } + + // note that we leave the memory uninitialized + ManagedVec(size_t n) { + size_ = n; + capacity_ = n; + onHost = autoPolicy(n) != ExecutionPolicy::ParUnseq; + if (n == 0) + return; + mallocManaged(&ptr_, size_ * sizeof(T)); + } + + ManagedVec(size_t n, const T &val) { + size_ = n; + capacity_ = n; + if (n == 0) + return; + auto policy = autoPolicy(n); + onHost = policy != ExecutionPolicy::ParUnseq; + mallocManaged(&ptr_, size_ * sizeof(T)); + prefetch(ptr_, size_ * sizeof(T), onHost); + uninitialized_fill_n(policy, ptr_, n, val); + } + + ~ManagedVec() { + if (ptr_ != nullptr) + freeManaged(ptr_); + ptr_ = nullptr; + size_ = 0; + capacity_ = 0; + } + + ManagedVec(const std::vector &vec) { + size_ = vec.size(); + capacity_ = size_; + auto policy = autoPolicy(size_); + onHost = policy != ExecutionPolicy::ParUnseq; + if (size_ != 0) { + mallocManaged(&ptr_, size_ * sizeof(T)); + fastUninitializedCopy(ptr_, vec.data(), size_, policy); + } + } + + ManagedVec(const ManagedVec &vec) { + size_ = vec.size_; + capacity_ = size_; + auto policy = autoPolicy(size_); + onHost = policy != ExecutionPolicy::ParUnseq; + if (size_ 
!= 0) { + mallocManaged(&ptr_, size_ * sizeof(T)); + prefetch(ptr_, size_ * sizeof(T), onHost); + uninitialized_copy(policy, vec.begin(), vec.end(), ptr_); + } + } + + ManagedVec(ManagedVec &&vec) { + ptr_ = vec.ptr_; + size_ = vec.size_; + capacity_ = vec.capacity_; + onHost = vec.onHost; + vec.ptr_ = nullptr; + vec.size_ = 0; + vec.capacity_ = 0; + } + + ManagedVec &operator=(const ManagedVec &vec) { + if (&vec == this) + return *this; + if (ptr_ != nullptr) + freeManaged(ptr_); + size_ = vec.size_; + capacity_ = vec.size_; + auto policy = autoPolicy(size_); + onHost = policy != ExecutionPolicy::ParUnseq; + if (size_ != 0) { + mallocManaged(&ptr_, size_ * sizeof(T)); + prefetch(ptr_, size_ * sizeof(T), onHost); + uninitialized_copy(policy, vec.begin(), vec.end(), ptr_); + } + return *this; + } + + ManagedVec &operator=(ManagedVec &&vec) { + if (&vec == this) + return *this; + if (ptr_ != nullptr) + freeManaged(ptr_); + onHost = vec.onHost; + size_ = vec.size_; + capacity_ = vec.capacity_; + ptr_ = vec.ptr_; + vec.ptr_ = nullptr; + vec.size_ = 0; + vec.capacity_ = 0; + return *this; + } + + void resize(size_t n) { + reserve(n); + size_ = n; + } + + void resize(size_t n, const T &val) { + reserve(n); + if (size_ < n) { + uninitialized_fill(autoPolicy(n - size_), ptr_ + size_, ptr_ + n, val); + } + size_ = n; + } + + void reserve(size_t n) { + if (n > capacity_) { + T *newBuffer; + mallocManaged(&newBuffer, n * sizeof(T)); + prefetch(newBuffer, size_ * sizeof(T), onHost); + if (size_ > 0) { + uninitialized_copy(autoPolicy(size_), ptr_, ptr_ + size_, newBuffer); + } + if (ptr_ != nullptr) + freeManaged(ptr_); + ptr_ = newBuffer; + capacity_ = n; + } + } + + void shrink_to_fit() { + T *newBuffer = nullptr; + if (size_ > 0) { + mallocManaged(&newBuffer, size_ * sizeof(T)); + prefetch(newBuffer, size_ * sizeof(T), onHost); + uninitialized_copy(autoPolicy(size_), ptr_, ptr_ + size_, newBuffer); + } + freeManaged(ptr_); + ptr_ = newBuffer; + capacity_ = size_; + } + + 
void push_back(const T &val) { + if (size_ >= capacity_) { + // avoid dangling pointer in case val is a reference of our array + T val_copy = val; + reserve(capacity_ == 0 ? 128 : capacity_ * 2); + onHost = true; + ptr_[size_++] = val_copy; + return; + } + ptr_[size_++] = val; + } + + void swap(ManagedVec &other) { + std::swap(ptr_, other.ptr_); + std::swap(size_, other.size_); + std::swap(capacity_, other.capacity_); + std::swap(onHost, other.onHost); + } + + void prefetch_to(bool toHost) const { + if (toHost != onHost) + prefetch(ptr_, size_ * sizeof(T), toHost); + onHost = toHost; + } + + IterC cbegin() const { return ptr_; } + + IterC cend() const { return ptr_ + size_; } + + Iter begin() { return ptr_; } + + Iter end() { return ptr_ + size_; } + + IterC begin() const { return cbegin(); } + + IterC end() const { return cend(); } + + T *data() { return ptr_; } + + const T *data() const { return ptr_; } + + size_t size() const { return size_; } + + size_t capacity() const { return capacity_; } + + T &front() { return *ptr_; } + + const T &front() const { return *ptr_; } + + T &back() { return *(ptr_ + size_ - 1); } + + const T &back() const { return *(ptr_ + size_ - 1); } + + T &operator[](size_t i) { return *(ptr_ + i); } + + const T &operator[](size_t i) const { return *(ptr_ + i); } + + bool empty() const { return size_ == 0; } + +private: + T *ptr_ = nullptr; + size_t size_ = 0; + size_t capacity_ = 0; + mutable bool onHost = true; + + static constexpr int DEVICE_MAX_BYTES = 1 << 16; + + static void mallocManaged(T **ptr, size_t bytes) { +#ifdef MANIFOLD_USE_CUDA + if (CUDA_ENABLED == -1) + check_cuda_available(); + if (CUDA_ENABLED) + cudaMallocManaged(reinterpret_cast<void **>(ptr), bytes); + else +#endif + *ptr = reinterpret_cast<T *>(malloc(bytes)); + } + + static void freeManaged(T *ptr) { +#ifdef MANIFOLD_USE_CUDA + if (CUDA_ENABLED) + cudaFree(ptr); + else +#endif + free(ptr); + } + + static void prefetch(T *ptr, int bytes, bool onHost) { +#ifdef MANIFOLD_USE_CUDA + 
if (bytes > 0 && CUDA_ENABLED) + cudaMemPrefetchAsync(ptr, std::min(bytes, DEVICE_MAX_BYTES), + onHost ? cudaCpuDeviceId : 0); +#endif + } + + // fast routine for memcpy from std::vector to ManagedVec + static void fastUninitializedCopy(T *dst, const T *src, int n, + ExecutionPolicy policy) { + prefetch(dst, n * sizeof(T), policy != ExecutionPolicy::ParUnseq); + switch (policy) { + case ExecutionPolicy::ParUnseq: +#ifdef MANIFOLD_USE_CUDA + cudaMemcpy(dst, src, n * sizeof(T), cudaMemcpyHostToDevice); + break; +#endif + case ExecutionPolicy::Par: + thrust::uninitialized_copy_n(thrust::MANIFOLD_PAR_NS::par, src, n, dst); + break; + case ExecutionPolicy::Seq: + thrust::uninitialized_copy_n(thrust::cpp::par, src, n, dst); + break; + } + } +}; + /* * Host and device vector implementation. This uses `thrust::universal_vector` * for storage, so data can be moved by the hardware on demand, allows using @@ -35,268 +310,136 @@ namespace manifold { * some device(host) modification, and then read the host(device) pointer again * (on the same vector). The memory will be inconsistent in that case. 
*/ -template <typename T> -class VecDH { - public: - VecDH() { impl_ = thrust::universal_vector<T>(); } - - VecDH(int size, T val = T()) { - impl_.resize(size, val); - implModified = true; - cacheModified = false; - } - - VecDH(const std::vector<T>& vec) { - cache = vec; - cacheModified = true; - implModified = false; - } - - VecDH(const VecDH& other) { - if (!other.cacheModified) { - if (other.impl_.size() > 0) - impl_ = other.impl_; - else - impl_.clear(); - implModified = true; - cacheModified = false; - } else { - cache = other.cache; - implModified = false; - cacheModified = true; - } - } +template <typename T> class VecDH { +public: + VecDH() {} - VecDH(VecDH&& other) { - if (!other.cacheModified) { - if (other.impl_.size() > 0) - impl_ = std::move(other.impl_); - else - impl_.clear(); - implModified = true; - cacheModified = false; - } else { - cache = std::move(other.cache); - cacheModified = true; - implModified = false; - } - } + + // Note that the vector constructed with this constructor will contain + // uninitialized memory. Please specify `val` if you need to make sure that + // the data is initialized. 
+ VecDH(int size) { impl_.resize(size); } - VecDH& operator=(const VecDH& other) { - if (!other.cacheModified) { - if (other.impl_.size() > 0) - impl_ = other.impl_; - else - impl_.clear(); - implModified = true; - cacheModified = false; - } else { - cache = other.cache; - cacheModified = true; - implModified = false; - } + VecDH(int size, T val) { impl_.resize(size, val); } + + VecDH(const std::vector<T> &vec) { impl_ = vec; } + + VecDH(const VecDH &other) { impl_ = other.impl_; } + + VecDH(VecDH &&other) { impl_ = std::move(other.impl_); } + + VecDH &operator=(const VecDH &other) { + impl_ = other.impl_; return *this; } - VecDH& operator=(VecDH&& other) { - if (!other.cacheModified) { - if (other.impl_.size() > 0) - impl_ = std::move(other.impl_); - else - impl_.clear(); - implModified = true; - cacheModified = false; - } else { - cache = std::move(other.cache); - cacheModified = true; - implModified = false; - } + VecDH &operator=(VecDH &&other) { + impl_ = std::move(other.impl_); return *this; } - int size() const { - if (!cacheModified) return impl_.size(); - return cache.size(); - } + int size() const { return impl_.size(); } void resize(int newSize, T val = T()) { bool shrink = size() > 2 * newSize; - if (cacheModified) { - cache.resize(newSize, val); - if (shrink) cache.shrink_to_fit(); - } else { - impl_.resize(newSize, val); - if (shrink) impl_.shrink_to_fit(); - implModified = true; - } + impl_.resize(newSize, val); + if (shrink) + impl_.shrink_to_fit(); } - void swap(VecDH& other) { - if (!cacheModified && !other.cacheModified) { - implModified = true; - other.implModified = true; - impl_.swap(other.impl_); - } else { - syncCache(); - other.syncCache(); - cacheModified = true; - implModified = false; - other.cacheModified = true; - other.implModified = false; - cache.swap(other.cache); - } - } + void swap(VecDH &other) { impl_.swap(other.impl_); } - using Iter = typename thrust::universal_vector<T>::iterator; - using IterC = typename 
thrust::universal_vector<T>::const_iterator; + using Iter = typename ManagedVec<T>::Iter; + using IterC = typename ManagedVec<T>::IterC; Iter begin() { - syncImpl(); - implModified = true; + impl_.prefetch_to(autoPolicy(size()) != ExecutionPolicy::ParUnseq); return impl_.begin(); } - Iter end() { - syncImpl(); - implModified = true; - return impl_.end(); - } + Iter end() { return impl_.end(); } IterC cbegin() const { - syncImpl(); + impl_.prefetch_to(autoPolicy(size()) != ExecutionPolicy::ParUnseq); return impl_.cbegin(); } - IterC cend() const { - syncImpl(); - return impl_.cend(); - } + IterC cend() const { return impl_.cend(); } IterC begin() const { return cbegin(); } IterC end() const { return cend(); } - T* ptrD() { - if (size() == 0) return nullptr; - syncImpl(); - implModified = true; - return impl_.data().get(); + T *ptrD() { + if (size() == 0) + return nullptr; + impl_.prefetch_to(autoPolicy(size()) != ExecutionPolicy::ParUnseq); + return impl_.data(); } - const T* cptrD() const { - if (size() == 0) return nullptr; - syncImpl(); - return impl_.data().get(); + const T *cptrD() const { + if (size() == 0) + return nullptr; + impl_.prefetch_to(autoPolicy(size()) != ExecutionPolicy::ParUnseq); + return impl_.data(); } - const T* ptrD() const { return cptrD(); } + const T *ptrD() const { return cptrD(); } - T* ptrH() { - if (size() == 0) return nullptr; - if (cacheModified) { - return cache.data(); - } else { - implModified = true; - return impl_.data().get(); - } + T *ptrH() { + if (size() == 0) + return nullptr; + impl_.prefetch_to(true); + return impl_.data(); } - const T* cptrH() const { - if (size() == 0) return nullptr; - if (cacheModified) { - return cache.data(); - } else { - return impl_.data().get(); - } + const T *cptrH() const { + if (size() == 0) + return nullptr; + impl_.prefetch_to(true); + return impl_.data(); } - const T* ptrH() const { return cptrH(); } + const T *ptrH() const { return cptrH(); } - T& operator[](int i) { - if (!cacheModified) { - 
implModified = true; - return impl_[i]; - } else { - cacheModified = true; - return cache[i]; - } + T &operator[](int i) { + impl_.prefetch_to(true); + return impl_[i]; } - const T& operator[](int i) const { - if (!cacheModified) { - return impl_[i]; - } else { - return cache[i]; - } + const T &operator[](int i) const { + impl_.prefetch_to(true); + return impl_[i]; } - T& back() { - if (!cacheModified) { - implModified = true; - return impl_.back(); - } else { - return cache.back(); - } - } + T &back() { return impl_.back(); } - const T& back() const { - if (!cacheModified) { - return impl_.back(); - } else { - return cache.back(); - } - } + const T &back() const { return impl_.back(); } - void push_back(const T& val) { - syncCache(); - cacheModified = true; - cache.push_back(val); - } + void push_back(const T &val) { impl_.push_back(val); } - void reserve(int n) { - syncCache(); - cacheModified = true; - cache.reserve(n); - } + void reserve(int n) { impl_.reserve(n); } void Dump() const { - syncCache(); - manifold::Dump(cache); - } - - private: - mutable thrust::universal_vector<T> impl_; - - mutable bool implModified = false; - mutable bool cacheModified = false; - mutable std::vector<T> cache; - - void syncImpl() const { - if (cacheModified) { - impl_ = cache; - cache.clear(); + std::cout << "VecDH = " << std::endl; + for (int i = 0; i < impl_.size(); ++i) { + std::cout << i << ", " << impl_[i] << ", " << std::endl; } - cacheModified = false; + std::cout << std::endl; } - void syncCache() const { - if (implModified || cache.empty()) { - cache = std::vector<T>(impl_.begin(), impl_.end()); - } - implModified = false; - } +private: + ManagedVec<T> impl_; }; -template <typename T> -class VecD { - public: - VecD(const VecDH<T>& vec) : ptr_(vec.ptrD()), size_(vec.size()) {} +template <typename T> class VecD { +public: + VecD(const VecDH<T> &vec) : ptr_(vec.ptrD()), size_(vec.size()) {} - __host__ __device__ const T& operator[](int i) const { return ptr_[i]; } + __host__ __device__ const T &operator[](int i) 
const { return ptr_[i]; } __host__ __device__ int size() const { return size_; } - private: - T const* const ptr_; +private: + T const *const ptr_; const int size_; }; /** @} */ -} // namespace manifold +} // namespace manifold diff --git a/utilities/src/detect_cuda.cpp b/utilities/src/detect_cuda.cpp new file mode 100644 index 000000000..e3aa80b8e --- /dev/null +++ b/utilities/src/detect_cuda.cpp @@ -0,0 +1,16 @@ +#ifdef MANIFOLD_USE_CUDA +#include <cuda_runtime.h> + +namespace manifold { +int CUDA_ENABLED = -1; +void check_cuda_available() { + int device_count = 0; + cudaError_t error = cudaGetDeviceCount(&device_count); + CUDA_ENABLED = device_count != 0; +} +} +#else +namespace manifold { +void check_cuda_available() {} +} +#endif